mm/filemap.c

  // SPDX-License-Identifier: GPL-2.0-only
  /*
   *	linux/mm/filemap.c
   *
   * Copyright (C) 1994-1999  Linus Torvalds
   */
  
  /*
   * This file handles the generic file mmap semantics used by
   * most "normal" filesystems (but you don't /have/ to use this:
   * the NFS filesystem used to do this differently, for example)
   */
  #include <linux/export.h>
  #include <linux/compiler.h>
  #include <linux/dax.h>
  #include <linux/fs.h>
  #include <linux/sched/signal.h>
  #include <linux/uaccess.h>
  #include <linux/capability.h>
  #include <linux/kernel_stat.h>
  #include <linux/gfp.h>
  #include <linux/mm.h>
  #include <linux/swap.h>
  #include <linux/mman.h>
  #include <linux/pagemap.h>
  #include <linux/file.h>
  #include <linux/uio.h>
  #include <linux/error-injection.h>
  #include <linux/hash.h>
  #include <linux/writeback.h>
  #include <linux/backing-dev.h>
  #include <linux/pagevec.h>
  #include <linux/blkdev.h>
  #include <linux/security.h>
  #include <linux/cpuset.h>
  #include <linux/hugetlb.h>
  #include <linux/memcontrol.h>
  #include <linux/cleancache.h>
  #include <linux/shmem_fs.h>
  #include <linux/rmap.h>
  #include <linux/delayacct.h>
  #include <linux/psi.h>
  #include <linux/ramfs.h>
  #include <linux/page_idle.h>
  #include "internal.h"
  #define CREATE_TRACE_POINTS
  #include <trace/events/filemap.h>
  /*
   * FIXME: remove all knowledge of the buffer layer from the core VM
   */
  #include <linux/buffer_head.h> /* for try_to_free_buffers */

  #include <asm/mman.h>
  
  /*
   * Shared mappings implemented 30.11.1994. It's not fully working yet,
   * though.
   *
   * Shared mappings now work. 15.8.1995  Bruno.
   *
   * finished 'unifying' the page and buffer cache and SMP-threaded the
   * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
   *
   * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
   */
  
  /*
   * Lock ordering:
   *
   *  ->i_mmap_rwsem		(truncate_pagecache)
   *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
   *      ->swap_lock		(exclusive_swap_page, others)
   *        ->i_pages lock
   *
   *  ->i_mutex
   *    ->i_mmap_rwsem		(truncate->unmap_mapping_range)
   *
   *  ->mmap_lock
   *    ->i_mmap_rwsem
   *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
   *        ->i_pages lock	(arch-dependent flush_dcache_mmap_lock)
   *
   *  ->mmap_lock
   *    ->lock_page		(access_process_vm)
   *
   *  ->i_mutex			(generic_perform_write)
   *    ->mmap_lock		(fault_in_pages_readable->do_page_fault)
   *
   *  bdi->wb.list_lock
   *    sb_lock			(fs/fs-writeback.c)
   *    ->i_pages lock		(__sync_single_inode)
   *
   *  ->i_mmap_rwsem
   *    ->anon_vma.lock		(vma_adjust)
   *
   *  ->anon_vma.lock
   *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
   *
   *  ->page_table_lock or pte_lock
   *    ->swap_lock		(try_to_unmap_one)
   *    ->private_lock		(try_to_unmap_one)
   *    ->i_pages lock		(try_to_unmap_one)
   *    ->pgdat->lru_lock		(follow_page->mark_page_accessed)
   *    ->pgdat->lru_lock		(check_pte_range->isolate_lru_page)
   *    ->private_lock		(page_remove_rmap->set_page_dirty)
   *    ->i_pages lock		(page_remove_rmap->set_page_dirty)
   *    bdi.wb->list_lock		(page_remove_rmap->set_page_dirty)
   *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
   *    ->memcg->move_lock	(page_remove_rmap->lock_page_memcg)
   *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
   *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
   *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
   *
   * ->i_mmap_rwsem
   *   ->tasklist_lock            (memory_failure, collect_procs_ao)
   */
  static void page_cache_delete(struct address_space *mapping,
  				   struct page *page, void *shadow)
  {
  	XA_STATE(xas, &mapping->i_pages, page->index);
  	unsigned int nr = 1;

  	mapping_set_update(&xas, mapping);

  	/* hugetlb pages are represented by a single entry in the xarray */
  	if (!PageHuge(page)) {
  		xas_set_order(&xas, page->index, compound_order(page));
  		nr = compound_nr(page);
  	}

  	VM_BUG_ON_PAGE(!PageLocked(page), page);
  	VM_BUG_ON_PAGE(PageTail(page), page);
  	VM_BUG_ON_PAGE(nr != 1 && shadow, page);

  	xas_store(&xas, shadow);
  	xas_init_marks(&xas);

  	page->mapping = NULL;
  	/* Leave page->index set: truncation lookup relies upon it */
  	if (shadow) {
  		mapping->nrexceptional += nr;
  		/*
  		 * Make sure the nrexceptional update is committed before
  		 * the nrpages update so that final truncate racing
  		 * with reclaim does not see both counters 0 at the
  		 * same time and miss a shadow entry.
  		 */
  		smp_wmb();
  	}
  	mapping->nrpages -= nr;
  }
  static void unaccount_page_cache_page(struct address_space *mapping,
  				      struct page *page)
  {
  	int nr;

  	/*
  	 * if we're uptodate, flush out into the cleancache, otherwise
  	 * invalidate any existing cleancache entries.  We can't leave
  	 * stale data around in the cleancache once our page is gone
  	 */
  	if (PageUptodate(page) && PageMappedToDisk(page))
  		cleancache_put_page(page);
  	else
  		cleancache_invalidate_page(mapping, page);

  	VM_BUG_ON_PAGE(PageTail(page), page);
  	VM_BUG_ON_PAGE(page_mapped(page), page);
  	if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) {
  		int mapcount;
  
  		pr_alert("BUG: Bad page cache in process %s  pfn:%05lx
  ",
  			 current->comm, page_to_pfn(page));
  		dump_page(page, "still mapped when deleted");
  		dump_stack();
  		add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
  
  		mapcount = page_mapcount(page);
  		if (mapping_exiting(mapping) &&
  		    page_count(page) >= mapcount + 2) {
  			/*
  			 * All vmas have already been torn down, so it's
  			 * a good bet that actually the page is unmapped,
  			 * and we'd prefer not to leak it: if we're wrong,
  			 * some other bad page check should catch it later.
  			 */
  			page_mapcount_reset(page);
  			page_ref_sub(page, mapcount);
  		}
  	}
  	/* hugetlb pages do not participate in page cache accounting. */
  	if (PageHuge(page))
  		return;

  	nr = thp_nr_pages(page);

  	__mod_lruvec_page_state(page, NR_FILE_PAGES, -nr);
  	if (PageSwapBacked(page)) {
  		__mod_lruvec_page_state(page, NR_SHMEM, -nr);
  		if (PageTransHuge(page))
  			__dec_node_page_state(page, NR_SHMEM_THPS);
  	} else if (PageTransHuge(page)) {
  		__dec_node_page_state(page, NR_FILE_THPS);
  		filemap_nr_thps_dec(mapping);
  	}
  
  	/*
  	 * At this point page must be either written or cleaned by
  	 * truncate.  Dirty page here signals a bug and loss of
  	 * unwritten data.
  	 *
  	 * This fixes dirty accounting after removing the page entirely
  	 * but leaves PageDirty set: it has no effect for truncated
  	 * page and anyway will be cleared before returning page into
  	 * buddy allocator.
  	 */
  	if (WARN_ON_ONCE(PageDirty(page)))
  		account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
  }
  
  /*
   * Delete a page from the page cache and free it. Caller has to make
   * sure the page is locked and that nobody else uses it - or that usage
   * is safe.  The caller must hold the i_pages lock.
   */
  void __delete_from_page_cache(struct page *page, void *shadow)
  {
  	struct address_space *mapping = page->mapping;
  
  	trace_mm_filemap_delete_from_page_cache(page);
  
  	unaccount_page_cache_page(mapping, page);
  	page_cache_delete(mapping, page, shadow);
  }
  static void page_cache_free_page(struct address_space *mapping,
  				struct page *page)
  {
  	void (*freepage)(struct page *);
  
  	freepage = mapping->a_ops->freepage;
  	if (freepage)
  		freepage(page);
  
  	if (PageTransHuge(page) && !PageHuge(page)) {
  		page_ref_sub(page, thp_nr_pages(page));
  		VM_BUG_ON_PAGE(page_count(page) <= 0, page);
  	} else {
  		put_page(page);
  	}
  }
  /**
   * delete_from_page_cache - delete page from page cache
   * @page: the page which the kernel is trying to remove from page cache
   *
   * This must be called only on pages that have been verified to be in the page
 * cache and locked.  It will never put the page into the free list; the
 * caller has a reference on the page.
   */
  void delete_from_page_cache(struct page *page)
  {
  	struct address_space *mapping = page_mapping(page);
  	unsigned long flags;

  	BUG_ON(!PageLocked(page));
  	xa_lock_irqsave(&mapping->i_pages, flags);
  	__delete_from_page_cache(page, NULL);
  	xa_unlock_irqrestore(&mapping->i_pages, flags);

  	page_cache_free_page(mapping, page);
  }
  EXPORT_SYMBOL(delete_from_page_cache);
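
/*
 * Editor's note: a minimal usage sketch (not from the original file).
 * Per the kernel-doc above, the caller must hold both the page lock and
 * its own reference, roughly:
 *
 *	lock_page(page);
 *	if (page->mapping)		(still in the cache?)
 *		delete_from_page_cache(page);
 *	unlock_page(page);
 *	put_page(page);
 */
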
  /*
   * page_cache_delete_batch - delete several pages from page cache
   * @mapping: the mapping to which pages belong
   * @pvec: pagevec with pages to delete
   *
   * The function walks over mapping->i_pages and removes pages passed in @pvec
   * from the mapping. The function expects @pvec to be sorted by page index
   * and is optimised for it to be dense.
   * It tolerates holes in @pvec (mapping entries at those indices are not
   * modified). The function expects only THP head pages to be present in the
   * @pvec.
   *
   * The function expects the i_pages lock to be held.
   */
  static void page_cache_delete_batch(struct address_space *mapping,
  			     struct pagevec *pvec)
  {
  	XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
  	int total_pages = 0;
  	int i = 0;
  	struct page *page;

  	mapping_set_update(&xas, mapping);
  	xas_for_each(&xas, page, ULONG_MAX) {
  		if (i >= pagevec_count(pvec))
  			break;
  
  		/* A swap/dax/shadow entry got inserted? Skip it. */
  		if (xa_is_value(page))
  			continue;
  		/*
  		 * A page got inserted in our range? Skip it. We have our
  		 * pages locked so they are protected from being removed.
  		 * If we see a page whose index is higher than ours, it
  		 * means our page has been removed, which shouldn't be
  		 * possible because we're holding the PageLock.
  		 */
  		if (page != pvec->pages[i]) {
  			VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index,
  					page);
  			continue;
  		}
  
  		WARN_ON_ONCE(!PageLocked(page));
  
  		if (page->index == xas.xa_index)
  			page->mapping = NULL;
  		/* Leave page->index set: truncation lookup relies on it */
  
  		/*
  		 * Move to the next page in the vector if this is a regular
  		 * page or the index is of the last sub-page of this compound
  		 * page.
  		 */
  		if (page->index + compound_nr(page) - 1 == xas.xa_index)
  			i++;
  		xas_store(&xas, NULL);
  		total_pages++;
  	}
  	mapping->nrpages -= total_pages;
  }
  
  void delete_from_page_cache_batch(struct address_space *mapping,
  				  struct pagevec *pvec)
  {
  	int i;
  	unsigned long flags;
  
  	if (!pagevec_count(pvec))
  		return;
  	xa_lock_irqsave(&mapping->i_pages, flags);
  	for (i = 0; i < pagevec_count(pvec); i++) {
  		trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);
  
  		unaccount_page_cache_page(mapping, pvec->pages[i]);
  	}
  	page_cache_delete_batch(mapping, pvec);
  	xa_unlock_irqrestore(&mapping->i_pages, flags);
  
  	for (i = 0; i < pagevec_count(pvec); i++)
  		page_cache_free_page(mapping, pvec->pages[i]);
  }
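
/*
 * Editor's note (illustrative sketch, not from the original file):
 * truncation is the kind of caller this batching targets -- it gathers
 * already-locked pages into a pagevec and removes them all under a
 * single i_pages lock acquisition:
 *
 *	pagevec_init(&pvec);
 *	while (pagevec_lookup(&pvec, mapping, &index)) {
 *		... lock each page ...
 *		delete_from_page_cache_batch(mapping, &pvec);
 *		... unlock and release the pages ...
 *	}
 */
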
  int filemap_check_errors(struct address_space *mapping)
  {
  	int ret = 0;
  	/* Check for outstanding write errors */
  	if (test_bit(AS_ENOSPC, &mapping->flags) &&
  	    test_and_clear_bit(AS_ENOSPC, &mapping->flags))
  		ret = -ENOSPC;
  	if (test_bit(AS_EIO, &mapping->flags) &&
  	    test_and_clear_bit(AS_EIO, &mapping->flags))
  		ret = -EIO;
  	return ret;
  }
  EXPORT_SYMBOL(filemap_check_errors);

  static int filemap_check_and_keep_errors(struct address_space *mapping)
  {
  	/* Check for outstanding write errors */
  	if (test_bit(AS_EIO, &mapping->flags))
  		return -EIO;
  	if (test_bit(AS_ENOSPC, &mapping->flags))
  		return -ENOSPC;
  	return 0;
  }
  /**
   * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
   * @mapping:	address space structure to write
   * @start:	offset in bytes where the range starts
   * @end:	offset in bytes where the range ends (inclusive)
   * @sync_mode:	enable synchronous operation
   *
   * Start writeback against all of a mapping's dirty pages that lie
   * within the byte offsets <start, end> inclusive.
   *
   * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
   * opposed to a regular memory cleansing writeback.  The difference between
   * these two operations is that if a dirty page/buffer is encountered, it must
   * be waited upon, and not just skipped over.
   *
   * Return: %0 on success, negative error code otherwise.
   */
  int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
  				loff_t end, int sync_mode)
  {
  	int ret;
  	struct writeback_control wbc = {
  		.sync_mode = sync_mode,
  		.nr_to_write = LONG_MAX,
  		.range_start = start,
  		.range_end = end,
  	};
  	if (!mapping_can_writeback(mapping) ||
  	    !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
  		return 0;
  	wbc_attach_fdatawrite_inode(&wbc, mapping->host);
  	ret = do_writepages(mapping, &wbc);
  	wbc_detach_inode(&wbc);
  	return ret;
  }
  
  static inline int __filemap_fdatawrite(struct address_space *mapping,
  	int sync_mode)
  {
  	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
  }
  
  int filemap_fdatawrite(struct address_space *mapping)
  {
  	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
  }
  EXPORT_SYMBOL(filemap_fdatawrite);
  int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
  				loff_t end)
  {
  	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
  }
  EXPORT_SYMBOL(filemap_fdatawrite_range);

  /**
   * filemap_flush - mostly a non-blocking flush
   * @mapping:	target address_space
   *
   * This is a mostly non-blocking flush.  Not suitable for data-integrity
   * purposes - I/O may not be started against all dirty pages.
   *
   * Return: %0 on success, negative error code otherwise.
   */
  int filemap_flush(struct address_space *mapping)
  {
  	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
  }
  EXPORT_SYMBOL(filemap_flush);
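
/*
 * Editor's note: the only difference from filemap_fdatawrite() above is
 * WB_SYNC_NONE -- in that mode the writeback code may skip pages that
 * are already under writeback rather than waiting for them, which is
 * why this is unsuitable for data integrity.
 */
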
  /**
   * filemap_range_has_page - check if a page exists in range.
   * @mapping:           address space within which to check
   * @start_byte:        offset in bytes where the range starts
   * @end_byte:          offset in bytes where the range ends (inclusive)
   *
   * Find at least one page in the range supplied, usually used to check if
   * direct writing in this range will trigger a writeback.
   *
   * Return: %true if at least one page exists in the specified range,
   * %false otherwise.
   */
  bool filemap_range_has_page(struct address_space *mapping,
  			   loff_t start_byte, loff_t end_byte)
  {
  	struct page *page;
  	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
  	pgoff_t max = end_byte >> PAGE_SHIFT;
  
  	if (end_byte < start_byte)
  		return false;
  	rcu_read_lock();
  	for (;;) {
  		page = xas_find(&xas, max);
  		if (xas_retry(&xas, page))
  			continue;
  		/* Shadow entries don't count */
  		if (xa_is_value(page))
  			continue;
  		/*
  		 * We don't need to try to pin this page; we're about to
  		 * release the RCU lock anyway.  It is enough to know that
  		 * there was a page here recently.
  		 */
  		break;
  	}
  	rcu_read_unlock();

  	return page != NULL;
  }
  EXPORT_SYMBOL(filemap_range_has_page);
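
/*
 * Editor's note: an illustrative caller (hypothetical, not from the
 * original file), in the spirit of the kernel-doc above -- a direct I/O
 * write can use this to decide whether cached pages in the target range
 * need to be flushed first:
 *
 *	if (filemap_range_has_page(mapping, pos, pos + count - 1))
 *		err = filemap_write_and_wait_range(mapping, pos,
 *						   pos + count - 1);
 */
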
  static void __filemap_fdatawait_range(struct address_space *mapping,
  				     loff_t start_byte, loff_t end_byte)
  {
  	pgoff_t index = start_byte >> PAGE_SHIFT;
  	pgoff_t end = end_byte >> PAGE_SHIFT;
  	struct pagevec pvec;
  	int nr_pages;

  	if (end_byte < start_byte)
  		return;

  	pagevec_init(&pvec);
  	while (index <= end) {
  		unsigned i;
  		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
  				end, PAGECACHE_TAG_WRITEBACK);
  		if (!nr_pages)
  			break;
  		for (i = 0; i < nr_pages; i++) {
  			struct page *page = pvec.pages[i];
  			wait_on_page_writeback(page);
  			ClearPageError(page);
  		}
  		pagevec_release(&pvec);
  		cond_resched();
  	}
  }
  
  /**
   * filemap_fdatawait_range - wait for writeback to complete
   * @mapping:		address space structure to wait for
   * @start_byte:		offset in bytes where the range starts
   * @end_byte:		offset in bytes where the range ends (inclusive)
   *
   * Walk the list of under-writeback pages of the given address space
   * in the given range and wait for all of them.  Check error status of
   * the address space and return it.
   *
   * Since the error status of the address space is cleared by this function,
   * callers are responsible for checking the return value and handling and/or
   * reporting the error.
   *
   * Return: error status of the address space.
   */
  int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
  			    loff_t end_byte)
  {
  	__filemap_fdatawait_range(mapping, start_byte, end_byte);
  	return filemap_check_errors(mapping);
  }
  EXPORT_SYMBOL(filemap_fdatawait_range);
  
  /**
   * filemap_fdatawait_range_keep_errors - wait for writeback to complete
   * @mapping:		address space structure to wait for
   * @start_byte:		offset in bytes where the range starts
   * @end_byte:		offset in bytes where the range ends (inclusive)
   *
   * Walk the list of under-writeback pages of the given address space in the
   * given range and wait for all of them.  Unlike filemap_fdatawait_range(),
   * this function does not clear error status of the address space.
   *
   * Use this function if callers don't handle errors themselves.  Expected
   * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
   * fsfreeze(8)
   */
  int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
  		loff_t start_byte, loff_t end_byte)
  {
  	__filemap_fdatawait_range(mapping, start_byte, end_byte);
  	return filemap_check_and_keep_errors(mapping);
  }
  EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);
  
  /**
   * file_fdatawait_range - wait for writeback to complete
   * @file:		file pointing to address space structure to wait for
   * @start_byte:		offset in bytes where the range starts
   * @end_byte:		offset in bytes where the range ends (inclusive)
   *
   * Walk the list of under-writeback pages of the address space that file
   * refers to, in the given range and wait for all of them.  Check error
   * status of the address space vs. the file->f_wb_err cursor and return it.
   *
   * Since the error status of the file is advanced by this function,
   * callers are responsible for checking the return value and handling and/or
   * reporting the error.
   *
   * Return: error status of the address space vs. the file->f_wb_err cursor.
   */
  int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
  {
  	struct address_space *mapping = file->f_mapping;
  
  	__filemap_fdatawait_range(mapping, start_byte, end_byte);
  	return file_check_and_advance_wb_err(file);
  }
  EXPORT_SYMBOL(file_fdatawait_range);
  
  /**
   * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
   * @mapping: address space structure to wait for
   *
   * Walk the list of under-writeback pages of the given address space
   * and wait for all of them.  Unlike filemap_fdatawait(), this function
   * does not clear error status of the address space.
   *
   * Use this function if callers don't handle errors themselves.  Expected
   * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
   * fsfreeze(8)
   *
   * Return: error status of the address space.
   */
  int filemap_fdatawait_keep_errors(struct address_space *mapping)
  {
  	__filemap_fdatawait_range(mapping, 0, LLONG_MAX);
  	return filemap_check_and_keep_errors(mapping);
  }
  EXPORT_SYMBOL(filemap_fdatawait_keep_errors);

  /* Returns true if writeback might be needed or already in progress. */
  static bool mapping_needs_writeback(struct address_space *mapping)
  {
  	if (dax_mapping(mapping))
  		return mapping->nrexceptional;
  
  	return mapping->nrpages;
  }

  /**
   * filemap_write_and_wait_range - write out & wait on a file range
   * @mapping:	the address_space for the pages
   * @lstart:	offset in bytes where the range starts
   * @lend:	offset in bytes where the range ends (inclusive)
   *
   * Write out and wait upon file offsets lstart->lend, inclusive.
   *
   * Note that @lend is inclusive (describes the last byte to be written) so
   * that this function can be used to write to the very end-of-file (end = -1).
   *
   * Return: error status of the address space.
   */
  int filemap_write_and_wait_range(struct address_space *mapping,
  				 loff_t lstart, loff_t lend)
  {
  	int err = 0;

  	if (mapping_needs_writeback(mapping)) {
  		err = __filemap_fdatawrite_range(mapping, lstart, lend,
  						 WB_SYNC_ALL);
  		/*
  		 * Even if the above returned error, the pages may be
  		 * written partially (e.g. -ENOSPC), so we wait for it.
  		 * But the -EIO is special case, it may indicate the worst
  		 * thing (e.g. bug) happened, so we avoid waiting for it.
  		 */
  		if (err != -EIO) {
  			int err2 = filemap_fdatawait_range(mapping,
  						lstart, lend);
  			if (!err)
  				err = err2;
  		} else {
  			/* Clear any previously stored errors */
  			filemap_check_errors(mapping);
  		}
  	} else {
  		err = filemap_check_errors(mapping);
  	}
  	return err;
  }
  EXPORT_SYMBOL(filemap_write_and_wait_range);
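
/*
 * Editor's note: a hedged sketch of the typical caller (the function
 * name below is illustrative, not from the original file). A simple
 * filesystem ->fsync() implementation often begins with exactly this
 * call:
 *
 *	static int simple_fsync(struct file *file, loff_t start, loff_t end,
 *				int datasync)
 *	{
 *		int err = filemap_write_and_wait_range(file->f_mapping,
 *						       start, end);
 *		if (err)
 *			return err;
 *		return sync_inode_metadata(file_inode(file), 1);
 *	}
 */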

  void __filemap_set_wb_err(struct address_space *mapping, int err)
  {
  	errseq_t eseq = errseq_set(&mapping->wb_err, err);
  
  	trace_filemap_set_wb_err(mapping, eseq);
  }
  EXPORT_SYMBOL(__filemap_set_wb_err);
  
  /**
 * file_check_and_advance_wb_err - report wb error (if any) that was previously
 * 				   recorded and advance wb_err to the current one
   * @file: struct file on which the error is being reported
   *
   * When userland calls fsync (or something like nfsd does the equivalent), we
   * want to report any writeback errors that occurred since the last fsync (or
   * since the file was opened if there haven't been any).
   *
   * Grab the wb_err from the mapping. If it matches what we have in the file,
   * then just quickly return 0. The file is all caught up.
   *
   * If it doesn't match, then take the mapping value, set the "seen" flag in
   * it and try to swap it into place. If it works, or another task beat us
   * to it with the new value, then update the f_wb_err and return the error
   * portion. The error at this point must be reported via proper channels
   * (a'la fsync, or NFS COMMIT operation, etc.).
   *
   * While we handle mapping->wb_err with atomic operations, the f_wb_err
   * value is protected by the f_lock since we must ensure that it reflects
   * the latest value swapped in for this file descriptor.
   *
   * Return: %0 on success, negative error code otherwise.
   */
  int file_check_and_advance_wb_err(struct file *file)
  {
  	int err = 0;
  	errseq_t old = READ_ONCE(file->f_wb_err);
  	struct address_space *mapping = file->f_mapping;
  
  	/* Locklessly handle the common case where nothing has changed */
  	if (errseq_check(&mapping->wb_err, old)) {
  		/* Something changed, must use slow path */
  		spin_lock(&file->f_lock);
  		old = file->f_wb_err;
  		err = errseq_check_and_advance(&mapping->wb_err,
  						&file->f_wb_err);
  		trace_file_check_and_advance_wb_err(file, old);
  		spin_unlock(&file->f_lock);
  	}
  
  	/*
  	 * We're mostly using this function as a drop in replacement for
  	 * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
  	 * that the legacy code would have had on these flags.
  	 */
  	clear_bit(AS_EIO, &mapping->flags);
  	clear_bit(AS_ENOSPC, &mapping->flags);
  	return err;
  }
  EXPORT_SYMBOL(file_check_and_advance_wb_err);
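
/*
 * Editor's note: an illustrative sketch of the errseq_t pattern the
 * function above implements. A caller that wants "errors since point X"
 * rather than "errors since the last fsync" samples the cursor first;
 * filemap_sample_wb_err()/filemap_check_wb_err() wrap the same
 * mechanism:
 *
 *	errseq_t since = filemap_sample_wb_err(mapping);
 *	... issue writeback and wait for it ...
 *	err = filemap_check_wb_err(mapping, since);
 */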
  
  /**
   * file_write_and_wait_range - write out & wait on a file range
   * @file:	file pointing to address_space with pages
   * @lstart:	offset in bytes where the range starts
   * @lend:	offset in bytes where the range ends (inclusive)
   *
   * Write out and wait upon file offsets lstart->lend, inclusive.
   *
   * Note that @lend is inclusive (describes the last byte to be written) so
   * that this function can be used to write to the very end-of-file (end = -1).
   *
   * After writing out and waiting on the data, we check and advance the
   * f_wb_err cursor to the latest value, and return any errors detected there.
   *
   * Return: %0 on success, negative error code otherwise.
   */
  int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
  {
  	int err = 0, err2;
  	struct address_space *mapping = file->f_mapping;
  	if (mapping_needs_writeback(mapping)) {
  		err = __filemap_fdatawrite_range(mapping, lstart, lend,
  						 WB_SYNC_ALL);
  		/* See comment of filemap_write_and_wait() */
  		if (err != -EIO)
  			__filemap_fdatawait_range(mapping, lstart, lend);
  	}
  	err2 = file_check_and_advance_wb_err(file);
  	if (!err)
  		err = err2;
  	return err;
  }
  EXPORT_SYMBOL(file_write_and_wait_range);
  /**
   * replace_page_cache_page - replace a pagecache page with a new one
   * @old:	page to be replaced
   * @new:	page to replace with
   * @gfp_mask:	allocation mode
   *
   * This function replaces a page in the pagecache with a new one.  On
   * success it acquires the pagecache reference for the new page and
   * drops it for the old page.  Both the old and new pages must be
   * locked.  This function does not add the new page to the LRU, the
   * caller must do that.
   *
   * The remove + add is atomic.  This function cannot fail.
   *
   * Return: %0
   */
  int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
  {
  	struct address_space *mapping = old->mapping;
  	void (*freepage)(struct page *) = mapping->a_ops->freepage;
  	pgoff_t offset = old->index;
  	XA_STATE(xas, &mapping->i_pages, offset);
  	unsigned long flags;

  	VM_BUG_ON_PAGE(!PageLocked(old), old);
  	VM_BUG_ON_PAGE(!PageLocked(new), new);
  	VM_BUG_ON_PAGE(new->mapping, new);

  	get_page(new);
  	new->mapping = mapping;
  	new->index = offset;

  	mem_cgroup_migrate(old, new);
  	xas_lock_irqsave(&xas, flags);
  	xas_store(&xas, new);

  	old->mapping = NULL;
  	/* hugetlb pages do not participate in page cache accounting. */
  	if (!PageHuge(old))
  		__dec_lruvec_page_state(old, NR_FILE_PAGES);
  	if (!PageHuge(new))
  		__inc_lruvec_page_state(new, NR_FILE_PAGES);
  	if (PageSwapBacked(old))
  		__dec_lruvec_page_state(old, NR_SHMEM);
  	if (PageSwapBacked(new))
  		__inc_lruvec_page_state(new, NR_SHMEM);
  	xas_unlock_irqrestore(&xas, flags);
  	if (freepage)
  		freepage(old);
  	put_page(old);

  	return 0;
  }
  EXPORT_SYMBOL_GPL(replace_page_cache_page);
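
/*
 * Editor's note (illustrative, not from the original file): the classic
 * user of this interface is fuse, which splices a freshly filled page
 * into the slot of an existing cache page; both pages are locked by the
 * caller beforehand, roughly:
 *
 *	lock_page(newpage);
 *	replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
 *	lru_cache_add(newpage);
 */
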
  noinline int __add_to_page_cache_locked(struct page *page,
  					struct address_space *mapping,
  					pgoff_t offset, gfp_t gfp,
  					void **shadowp)
  {
  	XA_STATE(xas, &mapping->i_pages, offset);
  	int huge = PageHuge(page);
  	int error;
  	VM_BUG_ON_PAGE(!PageLocked(page), page);
  	VM_BUG_ON_PAGE(PageSwapBacked(page), page);
  	mapping_set_update(&xas, mapping);

  	get_page(page);
  	page->mapping = mapping;
  	page->index = offset;
  	if (!huge) {
  		error = mem_cgroup_charge(page, current->mm, gfp);
  		if (error)
  			goto error;
  	}
  	gfp &= GFP_RECLAIM_MASK;
  	do {
  		unsigned int order = xa_get_order(xas.xa, xas.xa_index);
  		void *entry, *old = NULL;
  
  		if (order > thp_order(page))
  			xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
  					order, gfp);
  		xas_lock_irq(&xas);
  		xas_for_each_conflict(&xas, entry) {
  			old = entry;
  			if (!xa_is_value(entry)) {
  				xas_set_err(&xas, -EEXIST);
  				goto unlock;
  			}
  		}
  
  		if (old) {
  			if (shadowp)
  				*shadowp = old;
  			/* entry may have been split before we acquired lock */
  			order = xa_get_order(xas.xa, xas.xa_index);
  			if (order > thp_order(page)) {
  				xas_split(&xas, old, order);
  				xas_reset(&xas);
  			}
  		}
  		xas_store(&xas, page);
  		if (xas_error(&xas))
  			goto unlock;
  		if (old)
  			mapping->nrexceptional--;
  		mapping->nrpages++;
  
  		/* hugetlb pages do not participate in page cache accounting */
  		if (!huge)
  			__inc_lruvec_page_state(page, NR_FILE_PAGES);
  unlock:
  		xas_unlock_irq(&xas);
  	} while (xas_nomem(&xas, gfp));

  	if (xas_error(&xas)) {
  		error = xas_error(&xas);
  		goto error;
  	}

  	trace_mm_filemap_add_to_page_cache(page);
  	return 0;
  error:
  	page->mapping = NULL;
  	/* Leave page->index set: truncation relies upon it */
  	put_page(page);
  	return error;
  }
  ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
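
/*
 * Editor's note: with CONFIG_FUNCTION_ERROR_INJECTION, the annotation
 * above lets the fail_function fault-injection framework (or a BPF
 * override) force an errno return here, e.g. from userspace (debugfs
 * paths may vary by configuration):
 *
 *	echo __add_to_page_cache_locked > /sys/kernel/debug/fail_function/inject
 *	echo -12 > /sys/kernel/debug/fail_function/__add_to_page_cache_locked/retval
 */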
  
  /**
   * add_to_page_cache_locked - add a locked page to the pagecache
   * @page:	page to add
   * @mapping:	the page's address_space
   * @offset:	page index
   * @gfp_mask:	page allocation mode
   *
 * This function is used to add a page to the pagecache. The page must be locked.
   * This function does not add the page to the LRU.  The caller must do that.
   *
   * Return: %0 on success, negative error code otherwise.
   */
  int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
  		pgoff_t offset, gfp_t gfp_mask)
  {
  	return __add_to_page_cache_locked(page, mapping, offset,
  					  gfp_mask, NULL);
  }
  EXPORT_SYMBOL(add_to_page_cache_locked);
  
  int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
  				pgoff_t offset, gfp_t gfp_mask)
  {
  	void *shadow = NULL;
  	int ret;
  	__SetPageLocked(page);
  	ret = __add_to_page_cache_locked(page, mapping, offset,
  					 gfp_mask, &shadow);
  	if (unlikely(ret))
  		__ClearPageLocked(page);
  	else {
  		/*
  		 * The page might have been evicted from cache only
  		 * recently, in which case it should be activated like
  		 * any other repeatedly accessed page.
  		 * The exception is pages getting rewritten; evicting other
  		 * data from the working set, only to cache data that will
  		 * get overwritten with something else, is a waste of memory.
  		 */
  		WARN_ON_ONCE(PageActive(page));
  		if (!(gfp_mask & __GFP_WRITE) && shadow)
  			workingset_refault(page, shadow);
  		lru_cache_add(page);
  	}
  	return ret;
  }
  EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
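
/*
 * Editor's note: a hedged sketch of the common read-side usage (not
 * from the original file). A cache miss typically allocates a page,
 * inserts it with the helper above and then asks the filesystem to
 * fill it, roughly:
 *
 *	page = __page_cache_alloc(mapping_gfp_mask(mapping));
 *	if (!page)
 *		return -ENOMEM;
 *	err = add_to_page_cache_lru(page, mapping, index,
 *			mapping_gfp_constraint(mapping, GFP_KERNEL));
 *	if (!err)
 *		err = mapping->a_ops->readpage(file, page);
 *	put_page(page);
 */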

  #ifdef CONFIG_NUMA
  struct page *__page_cache_alloc(gfp_t gfp)
  {
  	int n;
  	struct page *page;
  	if (cpuset_do_page_mem_spread()) {
  		unsigned int cpuset_mems_cookie;
  		do {
  			cpuset_mems_cookie = read_mems_allowed_begin();
  			n = cpuset_mem_spread_node();
  			page = __alloc_pages_node(n, gfp, 0);
  		} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));

  		return page;
  	}
  	return alloc_pages(gfp, 0);
  }
  EXPORT_SYMBOL(__page_cache_alloc);
  #endif
  /*
   * In order to wait for pages to become available there must be
   * waitqueues associated with pages. By using a hash table of
   * waitqueues where the bucket discipline is to maintain all
   * waiters on the same queue and wake all when any of the pages
   * become available, and for the woken contexts to check to be
   * sure the appropriate page became available, this saves space
   * at a cost of "thundering herd" phenomena during rare hash
   * collisions.
   */
  #define PAGE_WAIT_TABLE_BITS 8
  #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
  static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
  
  static wait_queue_head_t *page_waitqueue(struct page *page)
  {
  	return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
  }
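
/*
 * Editor's note: with PAGE_WAIT_TABLE_BITS == 8 the table above has 256
 * buckets, so unrelated pages can hash to the same wait queue. That is
 * safe because wake_page_function() below re-checks the page and bit
 * number from the wake key; waiters for a different page simply stay
 * asleep.
 */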

  void __init pagecache_init(void)
  {
  	int i;

  	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
  		init_waitqueue_head(&page_wait_table[i]);
  
  	page_writeback_init();
  }

  /*
   * The page wait code treats the "wait->flags" somewhat unusually, because
   * we have multiple different kinds of waits, not just the usual "exclusive"
   * one.
   *
   * We have:
   *
   *  (a) no special bits set:
   *
   *	We're just waiting for the bit to be released, and when a waker
   *	calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
   *	and remove it from the wait queue.
   *
   *	Simple and straightforward.
   *
   *  (b) WQ_FLAG_EXCLUSIVE:
   *
   *	The waiter is waiting to get the lock, and only one waiter should
   *	be woken up to avoid any thundering herd behavior. We'll set the
   *	WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
   *
   *	This is the traditional exclusive wait.
   *
5868ec267   Linus Torvalds   mm: fix wake_page...
1001
   *  (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
5ef64cc89   Linus Torvalds   mm: allow a contr...
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
   *
   *	The waiter is waiting to get the bit, and additionally wants the
   *	lock to be transferred to it for fair lock behavior. If the lock
   *	cannot be taken, we stop walking the wait queue without waking
   *	the waiter.
   *
   *	This is the "fair lock handoff" case, and in addition to setting
   *	WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
   *	that it now has the lock.
   */
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
	unsigned int flags;
	struct wait_page_key *key = arg;
	struct wait_page_queue *wait_page
		= container_of(wait, struct wait_page_queue, wait);

	if (!wake_page_match(wait_page, key))
		return 0;

	/*
	 * If it's a lock handoff wait, we get the bit for it, and
	 * stop walking (and do not wake it up) if we can't.
	 */
	flags = wait->flags;
	if (flags & WQ_FLAG_EXCLUSIVE) {
		if (test_bit(key->bit_nr, &key->page->flags))
			return -1;
		if (flags & WQ_FLAG_CUSTOM) {
			if (test_and_set_bit(key->bit_nr, &key->page->flags))
				return -1;
			flags |= WQ_FLAG_DONE;
		}
	}

	/*
	 * We are holding the wait-queue lock, but the waiter that
	 * is waiting for this will be checking the flags without
	 * any locking.
	 *
	 * So update the flags atomically, and wake up the waiter
	 * afterwards to avoid any races. This store-release pairs
	 * with the load-acquire in wait_on_page_bit_common().
	 */
	smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
	wake_up_state(wait->private, mode);

	/*
	 * Ok, we have successfully done what we're waiting for,
	 * and we can unconditionally remove the wait entry.
	 *
	 * Note that this pairs with the "finish_wait()" in the
	 * waiter, and has to be the absolute last thing we do.
	 * After this list_del_init(&wait->entry) the wait entry
	 * might be de-allocated and the process might even have
	 * exited.
	 */
	list_del_init_careful(&wait->entry);
	return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}

static void wake_up_page_bit(struct page *page, int bit_nr)
{
	wait_queue_head_t *q = page_waitqueue(page);
	struct wait_page_key key;
	unsigned long flags;
	wait_queue_entry_t bookmark;

	key.page = page;
	key.bit_nr = bit_nr;
	key.page_match = 0;

	bookmark.flags = 0;
	bookmark.private = NULL;
	bookmark.func = NULL;
	INIT_LIST_HEAD(&bookmark.entry);

	spin_lock_irqsave(&q->lock, flags);
	__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);

	while (bookmark.flags & WQ_FLAG_BOOKMARK) {
		/*
		 * Take a breather from holding the lock: allow waiters
		 * that were woken asynchronously to acquire it and
		 * remove themselves from the wait queue.
		 */
		spin_unlock_irqrestore(&q->lock, flags);
		cpu_relax();
		spin_lock_irqsave(&q->lock, flags);
		__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
	}

	/*
	 * It is possible for other pages to have collided on the waitqueue
	 * hash, so in that case check for a page match. That prevents a long-
	 * term waiter on this page from having PG_waiters cleared under it
	 * while it is still queued.
	 *
	 * It is still possible to miss a case here, when we woke page waiters
	 * and removed them from the waitqueue, but there are still other
	 * page waiters.
	 */
	if (!waitqueue_active(q) || !key.page_match) {
		ClearPageWaiters(page);
		/*
		 * It's possible to miss clearing Waiters here, when we woke
		 * our page waiters, but the hashed waitqueue has waiters for
		 * other pages on it.
		 *
		 * That's okay, it's a rare case. The next waker will clear it.
		 */
	}
	spin_unlock_irqrestore(&q->lock, flags);
}

static void wake_up_page(struct page *page, int bit)
{
	if (!PageWaiters(page))
		return;
	wake_up_page_bit(page, bit);
}

/*
 * A choice of three behaviors for wait_on_page_bit_common():
 */
enum behavior {
	EXCLUSIVE,	/* Hold ref to page and take the bit when woken, like
			 * __lock_page() waiting on then setting PG_locked.
			 */
	SHARED,		/* Hold ref to page and check the bit when woken, like
			 * wait_on_page_writeback() waiting on PG_writeback.
			 */
	DROP,		/* Drop ref to page before wait, no check when woken,
			 * like put_and_wait_on_page_locked() on PG_locked.
			 */
};

/*
 * Attempt to check (or get) the page bit, and mark us done
 * if successful.
 */
static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
					struct wait_queue_entry *wait)
{
	if (wait->flags & WQ_FLAG_EXCLUSIVE) {
		if (test_and_set_bit(bit_nr, &page->flags))
			return false;
	} else if (test_bit(bit_nr, &page->flags))
		return false;

	wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
	return true;
}

/*
 * How many times do we accept lock stealing from under a waiter?
 * (Tunable via /proc/sys/vm/page_lock_unfairness.)
 */
int sysctl_page_lock_unfairness = 5;

static inline int wait_on_page_bit_common(wait_queue_head_t *q,
	struct page *page, int bit_nr, int state, enum behavior behavior)
{
	int unfairness = sysctl_page_lock_unfairness;
	struct wait_page_queue wait_page;
	wait_queue_entry_t *wait = &wait_page.wait;
	bool thrashing = false;
	bool delayacct = false;
	unsigned long pflags;

	if (bit_nr == PG_locked &&
	    !PageUptodate(page) && PageWorkingset(page)) {
		if (!PageSwapBacked(page)) {
			delayacct_thrashing_start();
			delayacct = true;
		}
		psi_memstall_enter(&pflags);
		thrashing = true;
	}

	init_wait(wait);
	wait->func = wake_page_function;
	wait_page.page = page;
	wait_page.bit_nr = bit_nr;

repeat:
	wait->flags = 0;
	if (behavior == EXCLUSIVE) {
		wait->flags = WQ_FLAG_EXCLUSIVE;
		if (--unfairness < 0)
			wait->flags |= WQ_FLAG_CUSTOM;
	}

	/*
	 * Do one last check whether we can get the
	 * page bit synchronously.
	 *
	 * Do the SetPageWaiters() marking before that
	 * to let any waker we _just_ missed know they
	 * need to wake us up (otherwise they'll never
	 * even go to the slow case that looks at the
	 * page queue), and add ourselves to the wait
	 * queue if we need to sleep.
	 *
	 * This part needs to be done under the queue
	 * lock to avoid races.
	 */
	spin_lock_irq(&q->lock);
	SetPageWaiters(page);
	if (!trylock_page_bit_common(page, bit_nr, wait))
		__add_wait_queue_entry_tail(q, wait);
	spin_unlock_irq(&q->lock);

	/*
	 * From now on, all the logic will be based on
	 * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flags, to
	 * see whether the page bit testing has already
	 * been done by the wake function.
	 *
	 * We can drop our reference to the page.
	 */
	if (behavior == DROP)
		put_page(page);

	/*
	 * Note that until the "finish_wait()", or until
	 * we see the WQ_FLAG_WOKEN flag, we need to
	 * be very careful with the 'wait->flags', because
	 * we may race with a waker that sets them.
	 */
	for (;;) {
		unsigned int flags;

		set_current_state(state);

		/* Loop until we've been woken or interrupted */
		flags = smp_load_acquire(&wait->flags);
		if (!(flags & WQ_FLAG_WOKEN)) {
			if (signal_pending_state(state, current))
				break;

			io_schedule();
			continue;
		}

		/* If we were non-exclusive, we're done */
		if (behavior != EXCLUSIVE)
			break;

		/* If the waker got the lock for us, we're done */
		if (flags & WQ_FLAG_DONE)
			break;

		/*
		 * Otherwise, if we're getting the lock, we need to
		 * try to get it ourselves.
		 *
		 * And if that fails, we'll have to retry this all.
		 */
		if (unlikely(test_and_set_bit(bit_nr, &page->flags)))
			goto repeat;

		wait->flags |= WQ_FLAG_DONE;
		break;
	}

	/*
	 * If a signal happened, this 'finish_wait()' may remove the last
	 * waiter from the wait-queues, but the PageWaiters bit will remain
	 * set. That's ok. The next wakeup will take care of it, and trying
	 * to do it here would be difficult and prone to races.
	 */
	finish_wait(q, wait);

	if (thrashing) {
		if (delayacct)
			delayacct_thrashing_end();
		psi_memstall_leave(&pflags);
	}

	/*
	 * NOTE! The wait->flags weren't stable until we've done the
	 * 'finish_wait()', and we could have exited the loop above due
	 * to a signal, and had a wakeup event happen after the signal
	 * test but before the 'finish_wait()'.
	 *
	 * So only after the finish_wait() can we reliably determine
	 * if we got woken up or not, so we can now figure out the final
	 * return value based on that state without races.
	 *
	 * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
	 * waiter, but an exclusive one requires WQ_FLAG_DONE.
	 */
	if (behavior == EXCLUSIVE)
		return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;

	return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}

void wait_on_page_bit(struct page *page, int bit_nr)
{
	wait_queue_head_t *q = page_waitqueue(page);
	wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
}
EXPORT_SYMBOL(wait_on_page_bit);

int wait_on_page_bit_killable(struct page *page, int bit_nr)
{
	wait_queue_head_t *q = page_waitqueue(page);
	return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED);
}
EXPORT_SYMBOL(wait_on_page_bit_killable);
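
/*
 * Example (illustrative): wait_on_page_writeback() in
 * include/linux/pagemap.h is built on this primitive, roughly:
 *
 *	if (PageWriteback(page))
 *		wait_on_page_bit(page, PG_writeback);
 */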

static int __wait_on_page_locked_async(struct page *page,
				       struct wait_page_queue *wait, bool set)
{
	struct wait_queue_head *q = page_waitqueue(page);
	int ret = 0;

	wait->page = page;
	wait->bit_nr = PG_locked;

	spin_lock_irq(&q->lock);
	__add_wait_queue_entry_tail(q, &wait->wait);
	SetPageWaiters(page);
	if (set)
		ret = !trylock_page(page);
	else
		ret = PageLocked(page);
	/*
	 * If we were successful now, we know we're still on the
	 * waitqueue as we're still under the lock. This means it's
	 * safe to remove and return success, we know the callback
	 * isn't going to trigger.
	 */
	if (!ret)
		__remove_wait_queue(q, &wait->wait);
	else
		ret = -EIOCBQUEUED;
	spin_unlock_irq(&q->lock);
	return ret;
}

static int wait_on_page_locked_async(struct page *page,
				     struct wait_page_queue *wait)
{
	if (!PageLocked(page))
		return 0;
	return __wait_on_page_locked_async(compound_head(page), wait, false);
}
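
/*
 * Example (an illustrative sketch of the async pattern, not the exact
 * io_uring call chain): an asynchronous reader supplies a wait_page_queue
 * whose wake function resubmits the request instead of sleeping, and
 * treats -EIOCBQUEUED as "the callback will retry":
 *
 *	ret = wait_on_page_locked_async(page, wait);
 *	if (ret == -EIOCBQUEUED)
 *		return ret;
 */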

/**
 * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
 * @page: The page to wait for.
 *
 * The caller should hold a reference on @page.  They expect the page to
 * become unlocked relatively soon, but do not wish to hold up migration
 * (for example) by holding the reference while waiting for the page to
 * come unlocked.  After this function returns, the caller should not
 * dereference @page.
 */
void put_and_wait_on_page_locked(struct page *page)
{
	wait_queue_head_t *q;

	page = compound_head(page);
	q = page_waitqueue(page);
	wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP);
}
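
/*
 * Example (sketch): a caller that merely needs the current lock holder
 * (e.g. migration) to finish takes a reference, hands it to this helper,
 * and must not touch the page afterwards:
 *
 *	get_page(page);
 *	put_and_wait_on_page_locked(page);
 */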
  
/**
 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
 * @page: Page defining the wait queue of interest
 * @waiter: Waiter to add to the queue
 *
 * Add an arbitrary @waiter to the wait queue for the nominated @page.
 */
void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
{
	wait_queue_head_t *q = page_waitqueue(page);
	unsigned long flags;

	spin_lock_irqsave(&q->lock, flags);
	__add_wait_queue_entry_tail(q, waiter);
	SetPageWaiters(page);
	spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL_GPL(add_page_wait_queue);
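
/*
 * Example (sketch; 'monitor' and 'my_wake_fn' are hypothetical): a caller
 * such as cachefiles can park a custom wait entry on a page's queue to be
 * notified when the page is unlocked:
 *
 *	init_waitqueue_func_entry(&monitor->entry, my_wake_fn);
 *	add_page_wait_queue(page, &monitor->entry);
 */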

#ifndef clear_bit_unlock_is_negative_byte

/*
 * PG_waiters is the high bit in the same byte as PG_locked.
 *
 * On x86 (and on many other architectures), we can clear PG_locked and
 * test the sign bit at the same time. But if the architecture does
 * not support that special operation, we just do this all by hand
 * instead.
 *
 * The read of PG_waiters has to be after (or concurrently with) PG_locked
 * being cleared, but a memory barrier should be unnecessary since it is
 * in the same byte as PG_locked.
 */
static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
{
	clear_bit_unlock(nr, mem);
	/* smp_mb__after_atomic(); */
	return test_bit(PG_waiters, mem);
}

#endif

/**
 * unlock_page - unlock a locked page
 * @page: the page
 *
 * Unlocks the page and wakes up sleepers in wait_on_page_locked().
 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
 * mechanism between PageLocked pages and PageWriteback pages is shared.
 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
 *
 * Note that this depends on PG_waiters being the sign bit in the byte
 * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
 * clear the PG_locked bit and test PG_waiters at the same time fairly
 * portably (architectures that do LL/SC can test any bit, while x86 can
 * test the sign bit).
 */
void unlock_page(struct page *page)
{
	BUILD_BUG_ON(PG_waiters != 7);
	page = compound_head(page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
		wake_up_page_bit(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);
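
/*
 * Example (sketch): the usual pairing around a page update:
 *
 *	lock_page(page);
 *	... modify the page while holding PG_locked ...
 *	unlock_page(page);
 */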

/**
 * end_page_writeback - end writeback against a page
 * @page: the page
 */
void end_page_writeback(struct page *page)
{
	/*
	 * TestClearPageReclaim could be used here but it is an atomic
	 * operation and overkill in this particular case. Failing to
	 * shuffle a page marked for immediate reclaim is too mild to
	 * justify taking an atomic operation penalty at the end of
	 * every page writeback.
	 */
	if (PageReclaim(page)) {
		ClearPageReclaim(page);
		rotate_reclaimable_page(page);
	}

	/*
	 * Writeback does not hold a page reference of its own, relying
	 * on truncation to wait for the clearing of PG_writeback.
	 * But here we must make sure that the page is not freed and
	 * reused before the wake_up_page().
	 */
	get_page(page);
	if (!test_clear_page_writeback(page))
		BUG();

	smp_mb__after_atomic();
	wake_up_page(page, PG_writeback);
	put_page(page);
}
EXPORT_SYMBOL(end_page_writeback);

/*
 * After completing I/O on a page, call this routine to update the page
 * flags appropriately.
 */
void page_endio(struct page *page, bool is_write, int err)
{
	if (!is_write) {
		if (!err) {
			SetPageUptodate(page);
		} else {
			ClearPageUptodate(page);
			SetPageError(page);
		}
		unlock_page(page);
	} else {
		if (err) {
			struct address_space *mapping;

			SetPageError(page);
			mapping = page_mapping(page);
			if (mapping)
				mapping_set_error(mapping, err);
		}
		end_page_writeback(page);
	}
}
EXPORT_SYMBOL_GPL(page_endio);
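
/*
 * Example (sketch; 'my_end_io' is a hypothetical completion handler,
 * modelled on fs/mpage.c): a bio end_io callback can funnel per-page
 * I/O status through page_endio():
 *
 *	static void my_end_io(struct bio *bio)
 *	{
 *		struct bio_vec *bv;
 *		struct bvec_iter_all iter_all;
 *
 *		bio_for_each_segment_all(bv, bio, iter_all)
 *			page_endio(bv->bv_page, op_is_write(bio_op(bio)),
 *				   blk_status_to_errno(bio->bi_status));
 *		bio_put(bio);
 *	}
 */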

/**
 * __lock_page - get a lock on the page, assuming we need to sleep to get it
 * @__page: the page to lock
 */
void __lock_page(struct page *__page)
{
	struct page *page = compound_head(__page);
	wait_queue_head_t *q = page_waitqueue(page);
	wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE,
				EXCLUSIVE);
}
EXPORT_SYMBOL(__lock_page);

int __lock_page_killable(struct page *__page)
{
	struct page *page = compound_head(__page);
	wait_queue_head_t *q = page_waitqueue(page);
	return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE,
					EXCLUSIVE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);

int __lock_page_async(struct page *page, struct wait_page_queue *wait)
{
	return __wait_on_page_locked_async(page, wait, true);
}

/*
 * Return values:
 * 1 - page is locked; mmap_lock is still held.
 * 0 - page is not locked.
 *     mmap_lock has been released (mmap_read_unlock()), unless flags had
 *     both FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
 *     which case mmap_lock is still held.
 *
 * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
 * with the page locked and the mmap_lock unperturbed.
 */
int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
			 unsigned int flags)
{
	if (fault_flag_allow_retry_first(flags)) {
		/*
		 * CAUTION! In this case, mmap_lock is not released
		 * even though we return 0.
		 */
		if (flags & FAULT_FLAG_RETRY_NOWAIT)
			return 0;

		mmap_read_unlock(mm);
		if (flags & FAULT_FLAG_KILLABLE)
			wait_on_page_locked_killable(page);
		else
			wait_on_page_locked(page);
		return 0;
	} else {
		if (flags & FAULT_FLAG_KILLABLE) {
			int ret;

			ret = __lock_page_killable(page);
			if (ret) {
				mmap_read_unlock(mm);
				return 0;
			}
		} else
			__lock_page(page);
		return 1;
	}
}
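
/*
 * Example (sketch): fault handlers reach this through lock_page_or_retry()
 * and bail out with VM_FAULT_RETRY when the mmap_lock was dropped, e.g.:
 *
 *	if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
 *		return VM_FAULT_RETRY;
 */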

/**
 * page_cache_next_miss() - Find the next gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
 * gap with the lowest index.
 *
 * This function may be called under the rcu_read_lock.  However, this will
 * not atomically search a snapshot of the cache at a single point in time.
 * For example, if a gap is created at index 5, then subsequently a gap is
 * created at index 10, page_cache_next_miss() covering both indices may
 * return 10 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'return - index >= max_scan' will be true).
 * In the rare case of index wrap-around, 0 will be returned.
 */
pgoff_t page_cache_next_miss(struct address_space *mapping,
			     pgoff_t index, unsigned long max_scan)
{
	XA_STATE(xas, &mapping->i_pages, index);

	while (max_scan--) {
		void *entry = xas_next(&xas);
		if (!entry || xa_is_value(entry))
			break;
		if (xas.xa_index == 0)
			break;
	}

	return xas.xa_index;
}
EXPORT_SYMBOL(page_cache_next_miss);
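
/*
 * Example (sketch): sizing a run of already-cached pages, e.g. for
 * readahead decisions:
 *
 *	pgoff_t gap = page_cache_next_miss(mapping, index, max);
 *	if (gap - index >= max)
 *		... every page in [index, index + max) is present ...
 *	else
 *		... 'gap' is the first hole at or after 'index' ...
 */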

/**
 * page_cache_prev_miss() - Find the previous gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [max(index - max_scan + 1, 0), index] for the
 * gap with the highest index.
 *
 * This function may be called under the rcu_read_lock.  However, this will
 * not atomically search a snapshot of the cache at a single point in time.
 * For example, if a gap is created at index 10, then subsequently a gap is
 * created at index 5, page_cache_prev_miss() covering both indices may
 * return 5 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'index - return >= max_scan' will be true).
 * In the rare case of wrap-around, ULONG_MAX will be returned.
 */
pgoff_t page_cache_prev_miss(struct address_space *mapping,
			     pgoff_t index, unsigned long max_scan)
{
	XA_STATE(xas, &mapping->i_pages, index);

	while (max_scan--) {
		void *entry = xas_prev(&xas);
		if (!entry || xa_is_value(entry))
			break;
		if (xas.xa_index == ULONG_MAX)
			break;
	}

	return xas.xa_index;
}
EXPORT_SYMBOL(page_cache_prev_miss);

/**
 * find_get_entry - find and get a page cache entry
 * @mapping: the address_space to search
 * @index: The page cache index.
 *
 * Looks up the page cache slot at @mapping & @index.  If there is a
 * page cache page, the head page is returned with an increased refcount.
 *
 * If the slot holds a shadow entry of a previously evicted page, or a
 * swap entry from shmem/tmpfs, it is returned.
 *
 * Return: The head page or shadow entry, %NULL if nothing is found.
 */
struct page *find_get_entry(struct address_space *mapping, pgoff_t index)
{
	XA_STATE(xas, &mapping->i_pages, index);
	struct page *page;

	rcu_read_lock();
repeat:
	xas_reset(&xas);
	page = xas_load(&xas);
	if (xas_retry(&xas, page))
		goto repeat;
	/*
	 * A shadow entry of a recently evicted page, or a swap entry from
	 * shmem/tmpfs.  Return it without attempting to raise page count.
	 */
	if (!page || xa_is_value(page))
		goto out;

	if (!page_cache_get_speculative(page))
		goto repeat;

	/*
	 * Has the page moved or been split?
	 * This is part of the lockless pagecache protocol. See
	 * include/linux/pagemap.h for details.
	 */
	if (unlikely(page != xas_reload(&xas))) {
		put_page(page);
		goto repeat;
	}
out:
	rcu_read_unlock();

	return page;
}

/**
 * find_lock_entry - Locate and lock a page cache entry.
 * @mapping: The address_space to search.
 * @index: The page cache index.
 *
 * Looks up the page at @mapping & @index.  If there is a page in the
 * cache, the head page is returned locked and with an increased refcount.
 *
 * If the slot holds a shadow entry of a previously evicted page, or a
 * swap entry from shmem/tmpfs, it is returned.
 *
 * Context: May sleep.
 * Return: The head page or shadow entry, %NULL if nothing is found.
 */
struct page *find_lock_entry(struct address_space *mapping, pgoff_t index)
{
	struct page *page;

repeat:
	page = find_get_entry(mapping, index);
	if (page && !xa_is_value(page)) {
		lock_page(page);
		/* Has the page been truncated? */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto repeat;
		}
		VM_BUG_ON_PAGE(!thp_contains(page, index), page);
	}
	return page;
}
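
/*
 * Example (sketch): shmem-style callers must be prepared for a value
 * entry instead of a page:
 *
 *	page = find_lock_entry(mapping, index);
 *	if (xa_is_value(page))
 *		... handle the swap/shadow entry, no unlock needed ...
 *	else if (page)
 *		... use the locked page, then unlock_page()/put_page() ...
 */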

/**
 * pagecache_get_page - Find and get a reference to a page.
 * @mapping: The address_space to search.
 * @index: The page index.
 * @fgp_flags: %FGP flags modify how the page is returned.
 * @gfp_mask: Memory allocation flags to use if %FGP_CREAT is specified.
 *
 * Looks up the page cache entry at @mapping & @index.
 *
 * @fgp_flags can be zero or more of these flags:
 *
 * * %FGP_ACCESSED - The page will be marked accessed.
 * * %FGP_LOCK - The page is returned locked.
 * * %FGP_HEAD - If the page is present and a THP, return the head page
 *   rather than the exact page specified by the index.
 * * %FGP_CREAT - If no page is present then a new page is allocated using
 *   @gfp_mask and added to the page cache and the VM's LRU list.
 *   The page is returned locked and with an increased refcount.
 * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
 *   page is already in cache.  If the page was allocated, unlock it before
 *   returning so the caller can do the same dance.
 * * %FGP_WRITE - The page will be written to.
 * * %FGP_NOFS - __GFP_FS will get cleared in the gfp mask.
 * * %FGP_NOWAIT - Don't block on the page lock.
 *
 * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
 * if the %GFP flags specified for %FGP_CREAT are atomic.
 *
 * If there is a page cache page, it is returned with an increased refcount.
 *
 * Return: The found page or %NULL otherwise.
 */
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
		int fgp_flags, gfp_t gfp_mask)
{
	struct page *page;

repeat:
	page = find_get_entry(mapping, index);
	if (xa_is_value(page))
		page = NULL;
	if (!page)
		goto no_page;

	if (fgp_flags & FGP_LOCK) {
		if (fgp_flags & FGP_NOWAIT) {
			if (!trylock_page(page)) {
				put_page(page);
				return NULL;
			}
		} else {
			lock_page(page);
		}

		/* Has the page been truncated? */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto repeat;
		}
		VM_BUG_ON_PAGE(!thp_contains(page, index), page);
	}

	if (fgp_flags & FGP_ACCESSED)
		mark_page_accessed(page);
	else if (fgp_flags & FGP_WRITE) {
		/* Clear idle flag for buffer write */
		if (page_is_idle(page))
			clear_page_idle(page);
	}
	if (!(fgp_flags & FGP_HEAD))
		page = find_subpage(page, index);

no_page:
	if (!page && (fgp_flags & FGP_CREAT)) {
		int err;
		if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
			gfp_mask |= __GFP_WRITE;
		if (fgp_flags & FGP_NOFS)
			gfp_mask &= ~__GFP_FS;

		page = __page_cache_alloc(gfp_mask);
		if (!page)
			return NULL;

		if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
			fgp_flags |= FGP_LOCK;

		/* Init accessed so we avoid an atomic mark_page_accessed() later */
		if (fgp_flags & FGP_ACCESSED)
			__SetPageReferenced(page);

		err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
		if (unlikely(err)) {
			put_page(page);
			page = NULL;
			if (err == -EEXIST)
				goto repeat;
		}

		/*
		 * add_to_page_cache_lru locks the page, and for mmap we expect
		 * an unlocked page.
		 */
		if (page && (fgp_flags & FGP_FOR_MMAP))
			unlock_page(page);
	}

	return page;
}
EXPORT_SYMBOL(pagecache_get_page);
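
/*
 * Example (sketch): write paths typically ask for a locked page, creating
 * it if necessary (roughly what grab_cache_page_write_begin() does):
 *
 *	page = pagecache_get_page(mapping, index,
 *				  FGP_LOCK | FGP_WRITE | FGP_CREAT,
 *				  mapping_gfp_mask(mapping));
 *	if (!page)
 *		return -ENOMEM;
 */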

/**
 * find_get_entries - gang pagecache lookup
 * @mapping:	The address_space to search
 * @start:	The starting page cache index
 * @nr_entries:	The maximum number of entries
 * @entries:	Where the resulting entries are placed
 * @indices:	The cache indices corresponding to the entries in @entries
 *
 * find_get_entries() will search for and return a group of up to
 * @nr_entries entries in the mapping.  The entries are placed at
 * @entries.  find_get_entries() takes a reference against any actual
 * pages it returns.
 *
 * The search returns a group of mapping-contiguous page cache entries
 * with ascending indexes.  There may be holes in the indices due to
 * not-present pages.
 *
 * Any shadow entries of evicted pages, or swap entries from
 * shmem/tmpfs, are included in the returned array.
 *
 * If it finds a Transparent Huge Page, head or tail, find_get_entries()
 * stops at that page: the caller is likely to have a better way to handle
 * the compound page as a whole, and then skip its extent, than repeatedly
 * calling find_get_entries() to return all its tails.
 *
 * Return: the number of pages and shadow entries which were found.
 */
unsigned find_get_entries(struct address_space *mapping,
			  pgoff_t start, unsigned int nr_entries,
			  struct page **entries, pgoff_t *indices)
{
	XA_STATE(xas, &mapping->i_pages, start);
	struct page *page;
	unsigned int ret = 0;

	if (!nr_entries)
		return 0;

	rcu_read_lock();
	xas_for_each(&xas, page, ULONG_MAX) {
		if (xas_retry(&xas, page))
			continue;
		/*
		 * A shadow entry of a recently evicted page, a swap
		 * entry from shmem/tmpfs or a DAX entry.  Return it
		 * without attempting to raise page count.
		 */
		if (xa_is_value(page))
			goto export;

		if (!page_cache_get_speculative(page))
			goto retry;

		/* Has the page moved or been split? */
		if (unlikely(page != xas_reload(&xas)))
			goto put_page;

		/*
		 * Terminate early on finding a THP, to allow the caller to
		 * handle it all at once; but continue if this is hugetlbfs.
		 */
		if (PageTransHuge(page) && !PageHuge(page)) {
			page = find_subpage(page, xas.xa_index);
			nr_entries = ret + 1;
		}
export:
		indices[ret] = xas.xa_index;
		entries[ret] = page;
		if (++ret == nr_entries)
			break;
		continue;
put_page:
		put_page(page);
retry:
		xas_reset(&xas);
	}
	rcu_read_unlock();
	return ret;
}
  
/**
 * find_get_pages_range - gang pagecache lookup
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @end:	The final page index (inclusive)
 * @nr_pages:	The maximum number of pages
 * @pages:	Where the resulting pages are placed
 *
 * find_get_pages_range() will search for and return a group of up to @nr_pages
 * pages in the mapping starting at index @start and up to index @end
 * (inclusive).  The pages are placed at @pages.  find_get_pages_range() takes
 * a reference against the returned pages.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 * We also update @start to index the next page for the traversal.
 *
 * Return: the number of pages which were found. If this number is
 * smaller than @nr_pages, the end of specified range has been
 * reached.
 */
unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
			      pgoff_t end, unsigned int nr_pages,
			      struct page **pages)
{
	XA_STATE(xas, &mapping->i_pages, *start);
	struct page *page;
	unsigned ret = 0;

	if (unlikely(!nr_pages))
		return 0;

	rcu_read_lock();
	xas_for_each(&xas, page, end) {
		if (xas_retry(&xas, page))
			continue;
		/* Skip over shadow, swap and DAX entries */
		if (xa_is_value(page))
			continue;

		if (!page_cache_get_speculative(page))
			goto retry;

		/* Has the page moved or been split? */
		if (unlikely(page != xas_reload(&xas)))
			goto put_page;

		pages[ret] = find_subpage(page, xas.xa_index);
		if (++ret == nr_pages) {
			*start = xas.xa_index + 1;
			goto out;
		}
		continue;
put_page:
		put_page(page);
retry:
		xas_reset(&xas);
	}

	/*
	 * We come here when there is no page beyond @end. We take care to not
	 * overflow the index @start as it confuses some of the callers. This
	 * breaks the iteration when there is a page at index -1 but that is
	 * already broken anyway.
	 */
	if (end == (pgoff_t)-1)
		*start = (pgoff_t)-1;
	else
		*start = end + 1;
out:
	rcu_read_unlock();

	return ret;
}
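
/*
 * Example (sketch): walking a range in fixed-size batches, dropping the
 * references this function took:
 *
 *	struct page *pages[16];
 *	pgoff_t index = start;
 *	unsigned int i, nr;
 *
 *	while ((nr = find_get_pages_range(mapping, &index, end,
 *					  16, pages)) != 0) {
 *		for (i = 0; i < nr; i++) {
 *			... process pages[i] ...
 *			put_page(pages[i]);
 *		}
 *	}
 */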

/**
 * find_get_pages_contig - gang contiguous pagecache lookup
 * @mapping:	The address_space to search
 * @index:	The starting page index
 * @nr_pages:	The maximum number of pages
 * @pages:	Where the resulting pages are placed
 *
 * find_get_pages_contig() works exactly like find_get_pages(), except
 * that the returned number of pages are guaranteed to be contiguous.
 *
 * Return: the number of pages which were found.
 */
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
			       unsigned int nr_pages, struct page **pages)
{
	XA_STATE(xas, &mapping->i_pages, index);
	struct page *page;
	unsigned int ret = 0;

	if (unlikely(!nr_pages))
		return 0;

	rcu_read_lock();
	for (page = xas_load(&xas); page; page = xas_next(&xas)) {
		if (xas_retry(&xas, page))
			continue;
		/*
		 * If the entry has been swapped out, we can stop looking.
		 * No current caller is looking for DAX entries.
		 */
		if (xa_is_value(page))
			break;

		if (!page_cache_get_speculative(page))
			goto retry;

		/* Has the page moved or been split? */
		if (unlikely(page != xas_reload(&xas)))
			goto put_page;

		pages[ret] = find_subpage(page, xas.xa_index);
		if (++ret == nr_pages)
			break;
		continue;
put_page:
		put_page(page);
retry:
		xas_reset(&xas);
	}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(find_get_pages_contig);
ebf43500e   Jens Axboe   [PATCH] Add find_...
2005
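
/*
 * Illustrative usage sketch (hypothetical caller, not part of this file):
 * each returned page carries a reference that the caller must drop with
 * put_page() once it is done with it.
 *
 *	struct page *pages[16];
 *	unsigned int i, nr;
 *
 *	nr = find_get_pages_contig(mapping, index, ARRAY_SIZE(pages), pages);
 *	for (i = 0; i < nr; i++) {
 *		...use pages[i], the page at index + i...
 *		put_page(pages[i]);
 *	}
 */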

/**
 * find_get_pages_range_tag - find and return pages in given range matching @tag
 * @mapping:	the address_space to search
 * @index:	the starting page index
 * @end:	The final page index (inclusive)
 * @tag:	the tag index
 * @nr_pages:	the maximum number of pages
 * @pages:	where the resulting pages are placed
 *
 * Like find_get_pages, except we only return pages which are tagged with
 * @tag. We update @index to index the next page for the traversal.
 *
 * Return: the number of pages which were found.
 */
unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
			pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
			struct page **pages)
{
	XA_STATE(xas, &mapping->i_pages, *index);
	struct page *page;
	unsigned ret = 0;

	if (unlikely(!nr_pages))
		return 0;

	rcu_read_lock();
	xas_for_each_marked(&xas, page, end, tag) {
		if (xas_retry(&xas, page))
			continue;
		/*
		 * Shadow entries should never be tagged, but this iteration
		 * is lockless so there is a window for page reclaim to evict
		 * a page we saw tagged.  Skip over it.
		 */
		if (xa_is_value(page))
			continue;

		if (!page_cache_get_speculative(page))
			goto retry;

		/* Has the page moved or been split? */
		if (unlikely(page != xas_reload(&xas)))
			goto put_page;

		pages[ret] = find_subpage(page, xas.xa_index);
		if (++ret == nr_pages) {
			*index = xas.xa_index + 1;
			goto out;
		}
		continue;
put_page:
		put_page(page);
retry:
		xas_reset(&xas);
	}

	/*
	 * We come here when we got to @end. We take care to not overflow the
	 * index @index as it confuses some of the callers. This breaks the
	 * iteration when there is a page at index -1 but that is already
	 * broken anyway.
	 */
	if (end == (pgoff_t)-1)
		*index = (pgoff_t)-1;
	else
		*index = end + 1;
out:
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL(find_get_pages_range_tag);
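
/*
 * Illustrative usage sketch (hypothetical caller, not part of this file):
 * a writeback-style loop typically walks the dirty pages of a mapping a
 * batch at a time, letting this helper advance @index across calls.
 *
 *	pgoff_t index = 0;
 *	struct page *pages[16];
 *	unsigned int i, nr;
 *
 *	while ((nr = find_get_pages_range_tag(mapping, &index, (pgoff_t)-1,
 *					      PAGECACHE_TAG_DIRTY,
 *					      ARRAY_SIZE(pages), pages))) {
 *		for (i = 0; i < nr; i++) {
 *			lock_page(pages[i]);
 *			...write the page back...
 *			unlock_page(pages[i]);
 *			put_page(pages[i]);
 *		}
 *	}
 */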

/*
 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
 * a _large_ part of the i/o request. Imagine the worst scenario:
 *
 *      ---R__________________________________________B__________
 *         ^ reading here                             ^ bad block(assume 4k)
 *
 * read(R) => miss => readahead(R...B) => media error => frustrating retries
 * => failing the whole request => read(R) => read(R+1) =>
 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
 *
 * It is going insane. Fix it by quickly scaling down the readahead size.
 */
static void shrink_readahead_size_eio(struct file_ra_state *ra)
{
	ra->ra_pages /= 4;
}
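
/*
 * Worked example (numbers illustrative): starting from a 128-page window,
 * repeated media errors shrink ra_pages to 32, then 8, then 2, then 0, at
 * which point readahead stays disabled for this struct file until it is
 * reopened and its file_ra_state is reinitialised.
 */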

/**
 * generic_file_buffered_read - generic file read routine
 * @iocb:	the iocb to read
 * @iter:	data destination
 * @written:	already copied
 *
 * This is a generic file read routine, and uses the
 * mapping->a_ops->readpage() function for the actual low-level stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 *
 * Return:
 * * total number of bytes copied, including those that were already @written
 * * negative error code if nothing was copied
 */
ssize_t generic_file_buffered_read(struct kiocb *iocb,
		struct iov_iter *iter, ssize_t written)
{
	struct file *filp = iocb->ki_filp;
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;
	struct file_ra_state *ra = &filp->f_ra;
	loff_t *ppos = &iocb->ki_pos;
	pgoff_t index;
	pgoff_t last_index;
	pgoff_t prev_index;
	unsigned long offset;      /* offset into pagecache page */
	unsigned int prev_offset;
	int error = 0;

	if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
		return 0;
	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);

	index = *ppos >> PAGE_SHIFT;
	prev_index = ra->prev_pos >> PAGE_SHIFT;
	prev_offset = ra->prev_pos & (PAGE_SIZE-1);
	last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
	offset = *ppos & ~PAGE_MASK;

	/*
	 * If we've already successfully copied some data, then we
	 * can no longer safely return -EIOCBQUEUED. Hence mark
	 * an async read NOWAIT at that point.
	 */
	if (written && (iocb->ki_flags & IOCB_WAITQ))
		iocb->ki_flags |= IOCB_NOWAIT;

	for (;;) {
		struct page *page;
		pgoff_t end_index;
		loff_t isize;
		unsigned long nr, ret;

		cond_resched();
find_page:
		if (fatal_signal_pending(current)) {
			error = -EINTR;
			goto out;
		}

		page = find_get_page(mapping, index);
		if (!page) {
			if (iocb->ki_flags & IOCB_NOIO)
				goto would_block;
			page_cache_sync_readahead(mapping,
					ra, filp,
					index, last_index - index);
			page = find_get_page(mapping, index);
			if (unlikely(page == NULL))
				goto no_cached_page;
		}
		if (PageReadahead(page)) {
			if (iocb->ki_flags & IOCB_NOIO) {
				put_page(page);
				goto out;
			}
			page_cache_async_readahead(mapping,
					ra, filp, page,
					index, last_index - index);
		}
		if (!PageUptodate(page)) {
			/*
			 * See comment in do_read_cache_page on why
			 * wait_on_page_locked is used to avoid unnecessary
			 * serialisation and why it's safe.
			 */
			if (iocb->ki_flags & IOCB_WAITQ) {
				if (written) {
					put_page(page);
					goto out;
				}
				error = wait_on_page_locked_async(page,
								iocb->ki_waitq);
			} else {
				if (iocb->ki_flags & IOCB_NOWAIT) {
					put_page(page);
					goto would_block;
				}
				error = wait_on_page_locked_killable(page);
			}
			if (unlikely(error))
				goto readpage_error;
			if (PageUptodate(page))
				goto page_ok;

			if (inode->i_blkbits == PAGE_SHIFT ||
					!mapping->a_ops->is_partially_uptodate)
				goto page_not_up_to_date;
			/* pipes can't handle partially uptodate pages */
			if (unlikely(iov_iter_is_pipe(iter)))
				goto page_not_up_to_date;
			if (!trylock_page(page))
				goto page_not_up_to_date;
			/* Did it get truncated before we got the lock? */
			if (!page->mapping)
				goto page_not_up_to_date_locked;
			if (!mapping->a_ops->is_partially_uptodate(page,
							offset, iter->count))
				goto page_not_up_to_date_locked;
			unlock_page(page);
		}
page_ok:
		/*
		 * i_size must be checked after we know the page is Uptodate.
		 *
		 * Checking i_size after the Uptodate check allows us to
		 * calculate the correct value for "nr", which means the
		 * zero-filled part of the page is not copied back to
		 * userspace (unless another truncate extends the file -
		 * this is desired though).
		 */

		isize = i_size_read(inode);
		end_index = (isize - 1) >> PAGE_SHIFT;
		if (unlikely(!isize || index > end_index)) {
			put_page(page);
			goto out;
		}

		/* nr is the maximum number of bytes to copy from this page */
		nr = PAGE_SIZE;
		if (index == end_index) {
			nr = ((isize - 1) & ~PAGE_MASK) + 1;
			if (nr <= offset) {
				put_page(page);
				goto out;
			}
		}
		nr = nr - offset;

		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		/*
		 * When a sequential read accesses a page several times,
		 * only mark it as accessed the first time.
		 */
		if (prev_index != index || offset != prev_offset)
			mark_page_accessed(page);
		prev_index = index;

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 */

		ret = copy_page_to_iter(page, offset, nr, iter);
		offset += ret;
		index += offset >> PAGE_SHIFT;
		offset &= ~PAGE_MASK;
		prev_offset = offset;

		put_page(page);
		written += ret;
		if (!iov_iter_count(iter))
			goto out;
		if (ret < nr) {
			error = -EFAULT;
			goto out;
		}
		continue;

page_not_up_to_date:
		/* Get exclusive access to the page ... */
		if (iocb->ki_flags & IOCB_WAITQ) {
			if (written) {
				put_page(page);
				goto out;
			}
			error = lock_page_async(page, iocb->ki_waitq);
		} else {
			error = lock_page_killable(page);
		}
		if (unlikely(error))
			goto readpage_error;

page_not_up_to_date_locked:
		/* Did it get truncated before we got the lock? */
		if (!page->mapping) {
			unlock_page(page);
			put_page(page);
			continue;
		}

		/* Did somebody else fill it already? */
		if (PageUptodate(page)) {
			unlock_page(page);
			goto page_ok;
		}

readpage:
		if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
			unlock_page(page);
			put_page(page);
			goto would_block;
		}
		/*
		 * A previous I/O error may have been due to temporary
		 * failures, eg. multipath errors.
		 * PG_error will be set again if readpage fails.
		 */
		ClearPageError(page);
		/* Start the actual read. The read will unlock the page. */
		error = mapping->a_ops->readpage(filp, page);

		if (unlikely(error)) {
			if (error == AOP_TRUNCATED_PAGE) {
				put_page(page);
				error = 0;
				goto find_page;
			}
			goto readpage_error;
		}

		if (!PageUptodate(page)) {
			if (iocb->ki_flags & IOCB_WAITQ) {
				if (written) {
					put_page(page);
					goto out;
				}
				error = lock_page_async(page, iocb->ki_waitq);
			} else {
				error = lock_page_killable(page);
			}

			if (unlikely(error))
				goto readpage_error;
			if (!PageUptodate(page)) {
				if (page->mapping == NULL) {
					/*
					 * invalidate_mapping_pages got it
					 */
					unlock_page(page);
					put_page(page);
					goto find_page;
				}
				unlock_page(page);
				shrink_readahead_size_eio(ra);
				error = -EIO;
				goto readpage_error;
			}
			unlock_page(page);
		}

		goto page_ok;

readpage_error:
		/* UHHUH! A synchronous read error occurred. Report it */
		put_page(page);
		goto out;

no_cached_page:
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 */
		page = page_cache_alloc(mapping);
		if (!page) {
			error = -ENOMEM;
			goto out;
		}
		error = add_to_page_cache_lru(page, mapping, index,
				mapping_gfp_constraint(mapping, GFP_KERNEL));
		if (error) {
			put_page(page);
			if (error == -EEXIST) {
				error = 0;
				goto find_page;
			}
			goto out;
		}
		goto readpage;
	}

would_block:
	error = -EAGAIN;
out:
	ra->prev_pos = prev_index;
	ra->prev_pos <<= PAGE_SHIFT;
	ra->prev_pos |= prev_offset;

	*ppos = ((loff_t)index << PAGE_SHIFT) + offset;
	file_accessed(filp);
	return written ? written : error;
}
EXPORT_SYMBOL_GPL(generic_file_buffered_read);
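
/*
 * Illustrative sketch (hypothetical filesystem, not part of this file):
 * a filesystem that implements its own O_DIRECT path can still fall back
 * to this helper for the buffered portion of a read.
 *
 *	static ssize_t myfs_file_read_iter(struct kiocb *iocb,
 *					   struct iov_iter *to)
 *	{
 *		if (iocb->ki_flags & IOCB_DIRECT)
 *			return myfs_direct_read(iocb, to);
 *		return generic_file_buffered_read(iocb, to, 0);
 *	}
 */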

/**
 * generic_file_read_iter - generic filesystem read routine
 * @iocb:	kernel I/O control block
 * @iter:	destination for the data read
 *
 * This is the "read_iter()" routine for all filesystems
 * that can use the page cache directly.
 *
 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
 * be returned when no data can be read without waiting for I/O requests
 * to complete; it doesn't prevent readahead.
 *
 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
 * requests shall be made for the read or for readahead.  When no data
 * can be read, -EAGAIN shall be returned.  When readahead would be
 * triggered, a partial, possibly empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	size_t count = iov_iter_count(iter);
	ssize_t retval = 0;

	if (!count)
		goto out; /* skip atime */

	if (iocb->ki_flags & IOCB_DIRECT) {
		struct file *file = iocb->ki_filp;
		struct address_space *mapping = file->f_mapping;
		struct inode *inode = mapping->host;
		loff_t size;

		size = i_size_read(inode);
		if (iocb->ki_flags & IOCB_NOWAIT) {
			if (filemap_range_has_page(mapping, iocb->ki_pos,
						   iocb->ki_pos + count - 1))
				return -EAGAIN;
		} else {
			retval = filemap_write_and_wait_range(mapping,
						iocb->ki_pos,
						iocb->ki_pos + count - 1);
			if (retval < 0)
				goto out;
		}

		file_accessed(file);
		retval = mapping->a_ops->direct_IO(iocb, iter);
		if (retval >= 0) {
			iocb->ki_pos += retval;
			count -= retval;
		}
		iov_iter_revert(iter, count - iov_iter_count(iter));

		/*
		 * Btrfs can have a short DIO read if we encounter
		 * compressed extents, so if there was an error, or if
		 * we've already read everything we wanted to, or if
		 * there was a short read because we hit EOF, go ahead
		 * and return.  Otherwise fallthrough to buffered io for
		 * the rest of the read.  Buffered reads will not work for
		 * DAX files, so don't bother trying.
		 */
		if (retval < 0 || !count || iocb->ki_pos >= size ||
		    IS_DAX(inode))
			goto out;
	}

	retval = generic_file_buffered_read(iocb, iter, retval);
out:
	return retval;
}
EXPORT_SYMBOL(generic_file_read_iter);
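
/*
 * Illustrative sketch (hypothetical filesystem, not part of this file):
 * for most disk-backed filesystems, wiring up the page cache read path
 * is a matter of pointing ->read_iter at this helper.
 *
 *	const struct file_operations myfs_file_operations = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= generic_file_read_iter,
 *		.mmap		= generic_file_mmap,
 *	};
 */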

#ifdef CONFIG_MMU
#define MMAP_LOTSAMISS  (100)
/*
 * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock
 * @vmf - the vm_fault for this fault.
 * @page - the page to lock.
 * @fpin - the pointer to the file we may pin (or is already pinned).
 *
 * This works similar to lock_page_or_retry in that it can drop the mmap_lock.
 * It differs in that it actually returns the page locked if it returns 1,
 * and 0 if it couldn't lock the page.  If we did have to drop the mmap_lock
 * then fpin will point to the pinned file and needs to be fput()'ed at a
 * later point.
 */
static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
				     struct file **fpin)
{
	if (trylock_page(page))
		return 1;

	/*
	 * NOTE! This will make us return with VM_FAULT_RETRY, but with
	 * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT
	 * is supposed to work. We have way too many special cases..
	 */
	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
		return 0;

	*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
	if (vmf->flags & FAULT_FLAG_KILLABLE) {
		if (__lock_page_killable(page)) {
			/*
			 * We didn't have the right flags to drop the mmap_lock,
			 * but all fault_handlers only check for fatal signals
			 * if we return VM_FAULT_RETRY, so we need to drop the
			 * mmap_lock here and return 0 if we don't have a fpin.
			 */
			if (*fpin == NULL)
				mmap_read_unlock(vmf->vma->vm_mm);
			return 0;
		}
	} else
		__lock_page(page);
	return 1;
}
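
/*
 * Caller pattern (see filemap_fault() below): on a 0 return the mmap_lock
 * may already have been dropped, so the caller must give up the fault and
 * report VM_FAULT_RETRY, releasing any page reference it still holds.
 *
 *	if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
 *		goto out_retry;
 */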

/*
 * Synchronous readahead happens when we don't even find a page in the page
 * cache at all.  We don't want to perform IO under the mmap_lock, so if we
 * have to drop the mmap_lock we return the file that was pinned in order for
 * us to do that.  If we didn't pin a file then we return NULL.  The file that
 * is returned needs to be fput()'ed when we're done with it.
 */
static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
{
	struct file *file = vmf->vma->vm_file;
	struct file_ra_state *ra = &file->f_ra;
	struct address_space *mapping = file->f_mapping;
	DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff);
	struct file *fpin = NULL;
	unsigned int mmap_miss;

	/* If we don't want any read-ahead, don't bother */
	if (vmf->vma->vm_flags & VM_RAND_READ)
		return fpin;
	if (!ra->ra_pages)
		return fpin;

	if (vmf->vma->vm_flags & VM_SEQ_READ) {
		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
		page_cache_sync_ra(&ractl, ra, ra->ra_pages);
		return fpin;
	}

	/* Avoid banging the cache line if not needed */
	mmap_miss = READ_ONCE(ra->mmap_miss);
	if (mmap_miss < MMAP_LOTSAMISS * 10)
		WRITE_ONCE(ra->mmap_miss, ++mmap_miss);

	/*
	 * Do we miss much more than hit in this file? If so,
	 * stop bothering with read-ahead. It will only hurt.
	 */
	if (mmap_miss > MMAP_LOTSAMISS)
		return fpin;

	/*
	 * mmap read-around
	 */
	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
	ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
	ra->size = ra->ra_pages;
	ra->async_size = ra->ra_pages / 4;
	ractl._index = ra->start;
	do_page_cache_ra(&ractl, ra->size, ra->async_size);
	return fpin;
}
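
/*
 * Worked example (numbers illustrative): with ra_pages = 32 and a fault at
 * pgoff 100, the read-around above becomes start = 100 - 32/2 = 84,
 * size = 32 and async_size = 8, i.e. pages 84-115 are read, with the
 * readahead mark placed 8 pages before the end of the window.
 */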
  
/*
 * Asynchronous readahead happens when we find the page and PG_readahead,
 * so we want to possibly extend the readahead further.  We return the file that
 * was pinned if we have to drop the mmap_lock in order to do IO.
 */
static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
					    struct page *page)
{
	struct file *file = vmf->vma->vm_file;
	struct file_ra_state *ra = &file->f_ra;
	struct address_space *mapping = file->f_mapping;
	struct file *fpin = NULL;
	unsigned int mmap_miss;
	pgoff_t offset = vmf->pgoff;

	/* If we don't want any read-ahead, don't bother */
	if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
		return fpin;
	mmap_miss = READ_ONCE(ra->mmap_miss);
	if (mmap_miss)
		WRITE_ONCE(ra->mmap_miss, --mmap_miss);
	if (PageReadahead(page)) {
		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
		page_cache_async_readahead(mapping, ra, file,
					   page, offset, ra->ra_pages);
	}
	return fpin;
}

/**
 * filemap_fault - read in file data for page fault handling
 * @vmf:	struct vm_fault containing details of the fault
 *
 * filemap_fault() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * vma->vm_mm->mmap_lock must be held on entry.
 *
 * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
 * may be dropped before doing I/O or by lock_page_maybe_drop_mmap().
 *
 * If our return value does not have VM_FAULT_RETRY set, the mmap_lock
 * has not been released.
 *
 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
 *
 * Return: bitwise-OR of %VM_FAULT_ codes.
 */
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
	int error;
	struct file *file = vmf->vma->vm_file;
	struct file *fpin = NULL;
	struct address_space *mapping = file->f_mapping;
	struct file_ra_state *ra = &file->f_ra;
	struct inode *inode = mapping->host;
	pgoff_t offset = vmf->pgoff;
	pgoff_t max_off;
	struct page *page;
	vm_fault_t ret = 0;

	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	if (unlikely(offset >= max_off))
		return VM_FAULT_SIGBUS;

	/*
	 * Do we have something in the page cache already?
	 */
	page = find_get_page(mapping, offset);
	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
		/*
		 * We found the page, so try async readahead before
		 * waiting for the lock.
		 */
		fpin = do_async_mmap_readahead(vmf, page);
	} else if (!page) {
		/* No page in the page cache at all */
		count_vm_event(PGMAJFAULT);
		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
		ret = VM_FAULT_MAJOR;
		fpin = do_sync_mmap_readahead(vmf);
retry_find:
		page = pagecache_get_page(mapping, offset,
					  FGP_CREAT|FGP_FOR_MMAP,
					  vmf->gfp_mask);
		if (!page) {
			if (fpin)
				goto out_retry;
			return VM_FAULT_OOM;
		}
	}

	if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
		goto out_retry;

	/* Did it get truncated? */
	if (unlikely(compound_head(page)->mapping != mapping)) {
		unlock_page(page);
		put_page(page);
		goto retry_find;
	}
	VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);

	/*
	 * We have a locked page in the page cache, now we need to check
	 * that it's up-to-date. If not, it is going to be due to an error.
	 */
	if (unlikely(!PageUptodate(page)))
		goto page_not_uptodate;

	/*
	 * We've made it this far and we had to drop our mmap_lock, now is the
	 * time to return to the upper layer and have it re-find the vma and
	 * redo the fault.
	 */
	if (fpin) {
		unlock_page(page);
		goto out_retry;
	}

	/*
	 * Found the page and have a reference on it.
	 * We must recheck i_size under page lock.
	 */
	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	if (unlikely(offset >= max_off)) {
		unlock_page(page);
		put_page(page);
		return VM_FAULT_SIGBUS;
	}

	vmf->page = page;
	return ret | VM_FAULT_LOCKED;

page_not_uptodate:
	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	ClearPageError(page);
	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
	error = mapping->a_ops->readpage(file, page);
	if (!error) {
		wait_on_page_locked(page);
		if (!PageUptodate(page))
			error = -EIO;
	}
	if (fpin)
		goto out_retry;
	put_page(page);

	if (!error || error == AOP_TRUNCATED_PAGE)
		goto retry_find;

	shrink_readahead_size_eio(ra);
	return VM_FAULT_SIGBUS;

out_retry:
	/*
	 * We dropped the mmap_lock, we need to return to the fault handler to
	 * re-find the vma and come back and find our hopefully still populated
	 * page.
	 */
	if (page)
		put_page(page);
	if (fpin)
		fput(fpin);
	return ret | VM_FAULT_RETRY;
}
EXPORT_SYMBOL(filemap_fault);
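
/*
 * Illustrative sketch (hypothetical filesystem, not part of this file):
 * filesystems that need their own serialisation around faults usually
 * wrap this helper from their ->fault method rather than reimplement it.
 *
 *	static vm_fault_t myfs_filemap_fault(struct vm_fault *vmf)
 *	{
 *		struct inode *inode = file_inode(vmf->vma->vm_file);
 *		vm_fault_t ret;
 *
 *		myfs_lock_fault(inode);
 *		ret = filemap_fault(vmf);
 *		myfs_unlock_fault(inode);
 *		return ret;
 *	}
 */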

void filemap_map_pages(struct vm_fault *vmf,
		pgoff_t start_pgoff, pgoff_t end_pgoff)
{
	struct file *file = vmf->vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	pgoff_t last_pgoff = start_pgoff;
	unsigned long max_idx;
	XA_STATE(xas, &mapping->i_pages, start_pgoff);
	struct page *head, *page;
	unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);

	rcu_read_lock();
	xas_for_each(&xas, head, end_pgoff) {
		if (xas_retry(&xas, head))
			continue;
		if (xa_is_value(head))
			goto next;

		/*
		 * Check for a locked page first, as a speculative
		 * reference may adversely influence page migration.
		 */
		if (PageLocked(head))
			goto next;
		if (!page_cache_get_speculative(head))
			goto next;

		/* Has the page moved or been split? */
		if (unlikely(head != xas_reload(&xas)))
			goto skip;
		page = find_subpage(head, xas.xa_index);

		if (!PageUptodate(head) ||
				PageReadahead(page) ||
				PageHWPoison(page))
			goto skip;
		if (!trylock_page(head))
			goto skip;
		if (head->mapping != mapping || !PageUptodate(head))
			goto unlock;
		max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
		if (xas.xa_index >= max_idx)
			goto unlock;

		if (mmap_miss > 0)
			mmap_miss--;

		vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
		if (vmf->pte)
			vmf->pte += xas.xa_index - last_pgoff;
		last_pgoff = xas.xa_index;
		if (alloc_set_pte(vmf, page))
			goto unlock;
		unlock_page(head);
		goto next;
unlock:
		unlock_page(head);
skip:
		put_page(head);
next:
		/* Huge page is mapped? No need to proceed. */
		if (pmd_trans_huge(*vmf->pmd))
			break;
	}
	rcu_read_unlock();
	WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
}
EXPORT_SYMBOL(filemap_map_pages);

vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct inode *inode = file_inode(vmf->vma->vm_file);
	vm_fault_t ret = VM_FAULT_LOCKED;

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);
	lock_page(page);
	if (page->mapping != inode->i_mapping) {
		unlock_page(page);
		ret = VM_FAULT_NOPAGE;
		goto out;
	}
	/*
	 * We mark the page dirty already here so that when freeze is in
	 * progress, we are guaranteed that writeback during freezing will
	 * see the dirty page and writeprotect it again.
	 */
	set_page_dirty(page);
	wait_for_stable_page(page);
out:
	sb_end_pagefault(inode->i_sb);
	return ret;
}

const struct vm_operations_struct generic_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= filemap_page_mkwrite,
};

/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct address_space *mapping = file->f_mapping;

	if (!mapping->a_ops->readpage)
		return -ENOEXEC;
	file_accessed(file);
	vma->vm_ops = &generic_file_vm_ops;
	return 0;
}

/*
 * This is for filesystems which do not implement ->writepage.
 */
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
		return -EINVAL;
	return generic_file_mmap(file, vma);
}
#else
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
	return VM_FAULT_SIGBUS;
}
int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	return -ENOSYS;
}
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
	return -ENOSYS;
}
#endif /* CONFIG_MMU */

EXPORT_SYMBOL(filemap_page_mkwrite);
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);
67f9fd91f   Sasha Levin   mm: remove read_c...
2883
2884
2885
2886
2887
  static struct page *wait_on_page_read(struct page *page)
  {
  	if (!IS_ERR(page)) {
  		wait_on_page_locked(page);
  		if (!PageUptodate(page)) {
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2888
  			put_page(page);
67f9fd91f   Sasha Levin   mm: remove read_c...
2889
2890
2891
2892
2893
  			page = ERR_PTR(-EIO);
  		}
  	}
  	return page;
  }

static struct page *do_read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data,
				gfp_t gfp)
{
	struct page *page;
	int err;
repeat:
	page = find_get_page(mapping, index);
	if (!page) {
		page = __page_cache_alloc(gfp);
		if (!page)
			return ERR_PTR(-ENOMEM);
		err = add_to_page_cache_lru(page, mapping, index, gfp);
		if (unlikely(err)) {
			put_page(page);
			if (err == -EEXIST)
				goto repeat;
			/* Presumably ENOMEM for xarray node */
			return ERR_PTR(err);
		}

filler:
		if (filler)
			err = filler(data, page);
		else
			err = mapping->a_ops->readpage(data, page);

		if (err < 0) {
			put_page(page);
			return ERR_PTR(err);
		}

		page = wait_on_page_read(page);
		if (IS_ERR(page))
			return page;
		goto out;
	}
	if (PageUptodate(page))
		goto out;

	/*
	 * Page is not up to date and may be locked due to one of the
	 * following cases:
	 * case a: Page is being filled and the page lock is held
	 * case b: Read/write error clearing the page uptodate status
	 * case c: Truncation in progress (page locked)
	 * case d: Reclaim in progress
	 *
	 * Case a, the page will be up to date when the page is unlocked.
	 *    There is no need to serialise on the page lock here as the page
	 *    is pinned so the lock gives no additional protection. Even if the
	 *    page is truncated, the data is still valid if PageUptodate as
	 *    it's a read vs truncate race.
	 * Case b, the page will not be up to date.
	 * Case c, the page may be truncated but in itself, the data may still
	 *    be valid after IO completes as it's a read vs truncate race. The
	 *    operation must restart if the page is not uptodate on unlock but
	 *    otherwise serialising on the page lock to stabilise the mapping
	 *    gives no additional guarantees to the caller as the page lock is
	 *    released before return.
	 * Case d, similar to truncation. If reclaim holds the page lock, it
	 *    will be a race with remove_mapping that determines if the mapping
	 *    is valid on unlock but otherwise the data is valid and there is
	 *    no need to serialise with the page lock.
	 *
	 * As the page lock gives no additional guarantee, we optimistically
	 * wait on the page to be unlocked and check if it's up to date and
	 * use the page if it is. Otherwise, the page lock is required to
	 * distinguish between the different cases. The motivation is that we
	 * avoid spurious serialisations and wakeups when multiple processes
	 * wait on the same page for IO to complete.
	 */
	wait_on_page_locked(page);
	if (PageUptodate(page))
		goto out;

	/* Distinguish between all the cases under the safety of the lock */
	lock_page(page);

	/* Case c or d, restart the operation */
	if (!page->mapping) {
		unlock_page(page);
		put_page(page);
		goto repeat;
	}

	/* Someone else locked and filled the page in a very small window */
	if (PageUptodate(page)) {
		unlock_page(page);
		goto out;
	}

	/*
	 * A previous I/O error may have been due to temporary failures.
	 * Clear the page error before the actual read; PG_error will be
	 * set again if readpage fails.
	 */
	ClearPageError(page);
	goto filler;

out:
	mark_page_accessed(page);
	return page;
}

/**
 * read_cache_page - read into page cache, fill it if needed
 * @mapping:	the page's address_space
 * @index:	the page index
 * @filler:	function to perform the read
 * @data:	first arg to filler(data, page) function, often left as NULL
 *
 * Read into the page cache. If a page already exists, and PageUptodate() is
 * not set, try to fill the page and wait for it to become unlocked.
 *
 * If the page does not get brought uptodate, return -EIO.
 *
 * Return: up to date page on success, ERR_PTR() on failure.
 */
struct page *read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data)
{
	return do_read_cache_page(mapping, index, filler, data,
			mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_page);
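
/*
 * Example (editor's sketch): a typical caller passes a NULL filler so that
 * ->readpage is used, checks for an ERR_PTR() and drops its page reference
 * when done; mapping and index are placeholders:
 *
 *	struct page *page = read_cache_page(mapping, index, NULL, NULL);
 *
 *	if (IS_ERR(page))
 *		return PTR_ERR(page);
 *	... page is uptodate here, use its contents ...
 *	put_page(page);
 */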

/**
 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
 * @mapping:	the page's address_space
 * @index:	the page index
 * @gfp:	the page allocator flags to use if allocating
 *
 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
 * any new page allocations done using the specified allocation flags.
 *
 * If the page does not get brought uptodate, return -EIO.
 *
 * Return: up to date page on success, ERR_PTR() on failure.
 */
struct page *read_cache_page_gfp(struct address_space *mapping,
				pgoff_t index,
				gfp_t gfp)
{
	return do_read_cache_page(mapping, index, NULL, NULL, gfp);
}
EXPORT_SYMBOL(read_cache_page_gfp);
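
/*
 * Example (editor's sketch): a caller that may already hold filesystem
 * locks can restrict reclaim recursion by passing GFP_NOFS rather than
 * the default mapping_gfp_mask():
 *
 *	struct page *page = read_cache_page_gfp(mapping, index, GFP_NOFS);
 */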

int pagecache_write_begin(struct file *file, struct address_space *mapping,
				loff_t pos, unsigned len, unsigned flags,
				struct page **pagep, void **fsdata)
{
	const struct address_space_operations *aops = mapping->a_ops;

	return aops->write_begin(file, mapping, pos, len, flags,
							pagep, fsdata);
}
EXPORT_SYMBOL(pagecache_write_begin);

int pagecache_write_end(struct file *file, struct address_space *mapping,
				loff_t pos, unsigned len, unsigned copied,
				struct page *page, void *fsdata)
{
	const struct address_space_operations *aops = mapping->a_ops;

	return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
}
EXPORT_SYMBOL(pagecache_write_end);
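
/*
 * Example (editor's sketch): a kernel-internal writer can funnel data
 * through the filesystem's ->write_begin/->write_end pair with these
 * wrappers; error handling is abbreviated and pos/len are placeholders:
 *
 *	struct page *page;
 *	void *fsdata;
 *	int err;
 *
 *	err = pagecache_write_begin(file, mapping, pos, len, 0,
 *				    &page, &fsdata);
 *	if (err)
 *		return err;
 *	... copy len bytes into the page at offset (pos & (PAGE_SIZE - 1)) ...
 *	err = pagecache_write_end(file, mapping, pos, len, len, page, fsdata);
 */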

/*
 * Warn about a page cache invalidation failure during a direct I/O write.
 */
void dio_warn_stale_pagecache(struct file *filp)
{
	static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
	char pathname[128];
	struct inode *inode = file_inode(filp);
	char *path;

	errseq_set(&inode->i_mapping->wb_err, -EIO);
	if (__ratelimit(&_rs)) {
		path = file_path(filp, pathname, sizeof(pathname));
		if (IS_ERR(path))
			path = "(unknown)";
		pr_crit("Page cache invalidation failure on direct I/O.  Possible data corruption due to collision with buffered I/O!\n");
		pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
			current->comm);
	}
}

ssize_t
generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file	*file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode	*inode = mapping->host;
	loff_t		pos = iocb->ki_pos;
	ssize_t		written;
	size_t		write_len;
	pgoff_t		end;

	write_len = iov_iter_count(from);
	end = (pos + write_len - 1) >> PAGE_SHIFT;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		/* If there are pages to writeback, return */
		if (filemap_range_has_page(inode->i_mapping, pos,
					   pos + write_len - 1))
			return -EAGAIN;
	} else {
		written = filemap_write_and_wait_range(mapping, pos,
							pos + write_len - 1);
		if (written)
			goto out;
	}

	/*
	 * After a write we want buffered reads to be sure to go to disk to get
	 * the new data.  We invalidate clean cached pages from the region we're
	 * about to write.  We do this *before* the write so that we can return
	 * without clobbering -EIOCBQUEUED from ->direct_IO().
	 */
	written = invalidate_inode_pages2_range(mapping,
					pos >> PAGE_SHIFT, end);
	/*
	 * If a page cannot be invalidated, return 0 to fall back
	 * to buffered write.
	 */
	if (written) {
		if (written == -EBUSY)
			return 0;
		goto out;
	}

	written = mapping->a_ops->direct_IO(iocb, from);

	/*
	 * Finally, try again to invalidate clean pages which might have been
	 * cached by non-direct readahead, or faulted in by get_user_pages()
	 * if the source of the write was an mmap'ed region of the file
	 * we're writing.  Either one is a pretty crazy thing to do,
	 * so we don't support it 100%.  If this invalidation
	 * fails, tough, the write still worked...
	 *
	 * Most of the time we do not need this since dio_complete() will do
	 * the invalidation for us. However there are some file systems that
	 * do not end up with dio_complete() being called, so let's not break
	 * them by removing it completely.
	 *
	 * A noticeable example is blkdev_direct_IO().
	 *
	 * Skip invalidation for async writes or if mapping has no pages.
	 */
	if (written > 0 && mapping->nrpages &&
	    invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end))
		dio_warn_stale_pagecache(file);

	if (written > 0) {
		pos += written;
		write_len -= written;
		if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
			i_size_write(inode, pos);
			mark_inode_dirty(inode);
		}
		iocb->ki_pos = pos;
	}
	iov_iter_revert(from, write_len - iov_iter_count(from));
out:
	return written;
}
EXPORT_SYMBOL(generic_file_direct_write);

/*
 * Find or create a page at the given pagecache position. Return the locked
 * page. This function is specifically for buffered writes.
 */
struct page *grab_cache_page_write_begin(struct address_space *mapping,
					pgoff_t index, unsigned flags)
{
	struct page *page;
	int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;

	if (flags & AOP_FLAG_NOFS)
		fgp_flags |= FGP_NOFS;

	page = pagecache_get_page(mapping, index, fgp_flags,
			mapping_gfp_mask(mapping));
	if (page)
		wait_for_stable_page(page);

	return page;
}
EXPORT_SYMBOL(grab_cache_page_write_begin);
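
/*
 * Example (editor's sketch): a minimal ->write_begin() can be little more
 * than this helper (compare simple_write_begin() in fs/libfs.c); the
 * "myfs" name is hypothetical:
 *
 *	static int myfs_write_begin(struct file *file,
 *			struct address_space *mapping, loff_t pos,
 *			unsigned len, unsigned flags,
 *			struct page **pagep, void **fsdata)
 *	{
 *		*pagep = grab_cache_page_write_begin(mapping,
 *					pos >> PAGE_SHIFT, flags);
 *		if (!*pagep)
 *			return -ENOMEM;
 *		return 0;
 *	}
 */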

ssize_t generic_perform_write(struct file *file,
				struct iov_iter *i, loff_t pos)
{
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	long status = 0;
	ssize_t written = 0;
	unsigned int flags = 0;

	do {
		struct page *page;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */
		void *fsdata;

		offset = (pos & (PAGE_SIZE - 1));
		bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_count(i));

again:
		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * Not only is this an optimisation, but it is also required
		 * to check that the address is actually valid, when atomic
		 * usercopies are used, below.
		 */
		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
			status = -EFAULT;
			break;
		}

		if (fatal_signal_pending(current)) {
			status = -EINTR;
			break;
		}

		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
						&page, &fsdata);
		if (unlikely(status < 0))
			break;

		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
		flush_dcache_page(page);

		status = a_ops->write_end(file, mapping, pos, bytes, copied,
						page, fsdata);
		if (unlikely(status < 0))
			break;
		copied = status;

		cond_resched();

		iov_iter_advance(i, copied);
		if (unlikely(copied == 0)) {
			/*
			 * If we were unable to copy any data at all, we must
			 * fall back to a single segment length write.
			 *
			 * If we didn't fallback here, we could livelock
			 * because not all segments in the iov can be copied at
			 * once without a pagefault.
			 */
			bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_single_seg_count(i));
			goto again;
		}
		pos += copied;
		written += copied;

		balance_dirty_pages_ratelimited(mapping);
	} while (iov_iter_count(i));

	return written ? written : status;
}
EXPORT_SYMBOL(generic_perform_write);
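
/*
 * Example (editor's sketch): a filesystem that does its own locking and
 * checks can call generic_perform_write() directly for buffered writes,
 * exactly as the buffered branch of __generic_file_write_iter() below does:
 *
 *	written = generic_perform_write(file, from, iocb->ki_pos);
 *	if (likely(written > 0))
 *		iocb->ki_pos += written;
 */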

/**
 * __generic_file_write_iter - write data to a file
 * @iocb:	IO state structure (file, offset, etc.)
 * @from:	iov_iter with data to write
 *
 * This function does all the work needed for actually writing data to a
 * file. It does all basic checks, removes SUID from the file, updates
 * modification times and calls proper subroutines depending on whether we
 * do direct IO or a standard buffered write.
 *
 * It expects i_mutex to be grabbed unless we work on a block device or similar
 * object which does not need locking at all.
 *
 * This function does *not* take care of syncing data in case of O_SYNC write.
 * A caller has to handle it. This is mainly due to the fact that we want to
 * avoid syncing under i_mutex.
 *
 * Return:
 * * number of bytes written, even for truncated writes
 * * negative error code if no data has been written at all
 */
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode	*inode = mapping->host;
	ssize_t		written = 0;
	ssize_t		err;
	ssize_t		status;

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = inode_to_bdi(inode);
	err = file_remove_privs(file);
	if (err)
		goto out;

	err = file_update_time(file);
	if (err)
		goto out;

	if (iocb->ki_flags & IOCB_DIRECT) {
		loff_t pos, endbyte;

		written = generic_file_direct_write(iocb, from);
		/*
		 * If the write stopped short of completing, fall back to
		 * buffered writes.  Some filesystems do this for writes to
		 * holes, for example.  For DAX files, a buffered write will
		 * not succeed (even if it did, DAX does not handle dirty
		 * page-cache pages correctly).
		 */
		if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
			goto out;
		status = generic_perform_write(file, from, pos = iocb->ki_pos);
		/*
		 * If generic_perform_write() returned a synchronous error
		 * then we want to return the number of bytes which were
		 * direct-written, or the error code if that was zero.  Note
		 * that this differs from normal direct-io semantics, which
		 * will return -EFOO even if some bytes were written.
		 */
		if (unlikely(status < 0)) {
			err = status;
			goto out;
		}
		/*
		 * We need to ensure that the page cache pages are written to
		 * disk and invalidated to preserve the expected O_DIRECT
		 * semantics.
		 */
		endbyte = pos + status - 1;
		err = filemap_write_and_wait_range(mapping, pos, endbyte);
		if (err == 0) {
			iocb->ki_pos = endbyte + 1;
			written += status;
			invalidate_mapping_pages(mapping,
						 pos >> PAGE_SHIFT,
						 endbyte >> PAGE_SHIFT);
		} else {
			/*
			 * We don't know how much we wrote, so just return
			 * the number of bytes which were direct-written
			 */
		}
	} else {
		written = generic_perform_write(file, from, iocb->ki_pos);
		if (likely(written > 0))
			iocb->ki_pos += written;
	}
out:
	current->backing_dev_info = NULL;
	return written ? written : err;
}
EXPORT_SYMBOL(__generic_file_write_iter);

/**
 * generic_file_write_iter - write data to a file
 * @iocb:	IO state structure
 * @from:	iov_iter with data to write
 *
 * This is a wrapper around __generic_file_write_iter() to be used by most
 * filesystems. It takes care of syncing the file in case of O_SYNC file
 * and acquires i_mutex as needed.
 * Return:
 * * negative error code if no data has been written at all or
 *   vfs_fsync_range() failed for a synchronous write
 * * number of bytes written, even for truncated writes
 */
ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = __generic_file_write_iter(iocb, from);
	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
EXPORT_SYMBOL(generic_file_write_iter);
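
/*
 * Example (editor's sketch): filesystems relying on the generic path
 * usually just point their file_operations at these helpers (compare the
 * hypothetical myfs_file_operations sketch earlier in this file):
 *
 *	.read_iter	= generic_file_read_iter,
 *	.write_iter	= generic_file_write_iter,
 *	.fsync		= generic_file_fsync,
 */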

/**
 * try_to_release_page() - release old fs-specific metadata on a page
 *
 * @page: the page which the kernel is trying to free
 * @gfp_mask: memory allocation flags (and I/O mode)
 *
 * The address_space is asked to try to release any data held against the
 * page (presumably at page->private).
 *
 * This may also be called if PG_fscache is set on a page, indicating that the
 * page is known to the local caching routines.
 *
 * The @gfp_mask argument specifies whether I/O may be performed to release
 * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS).
 *
 * Return: %1 if the release was successful, otherwise return zero.
 */
int try_to_release_page(struct page *page, gfp_t gfp_mask)
{
	struct address_space * const mapping = page->mapping;

	BUG_ON(!PageLocked(page));
	if (PageWriteback(page))
		return 0;

	if (mapping && mapping->a_ops->releasepage)
		return mapping->a_ops->releasepage(page, gfp_mask);
	return try_to_free_buffers(page);
}

EXPORT_SYMBOL(try_to_release_page);
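
/*
 * Example (editor's sketch): buffer_head based filesystems usually back
 * ->releasepage with try_to_free_buffers(), which is also the fallback
 * used above; fs-specific checks (e.g. journalled or pinned metadata)
 * would precede it. The "myfs" name is hypothetical:
 *
 *	static int myfs_releasepage(struct page *page, gfp_t gfp_mask)
 *	{
 *		... fs-specific checks, possibly returning 0 ...
 *		return try_to_free_buffers(page);
 *	}
 */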