Blame view

mm/filemap.c 95.8 KB
457c89965   Thomas Gleixner   treewide: Add SPD...
1
  // SPDX-License-Identifier: GPL-2.0-only
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2
3
4
5
6
7
8
9
10
11
12
  /*
   *	linux/mm/filemap.c
   *
   * Copyright (C) 1994-1999  Linus Torvalds
   */
  
  /*
   * This file handles the generic file mmap semantics used by
   * most "normal" filesystems (but you don't /have/ to use this:
   * the NFS filesystem used to do this differently, for example)
   */
b95f1b31b   Paul Gortmaker   mm: Map most file...
13
  #include <linux/export.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
14
  #include <linux/compiler.h>
f9fe48bec   Ross Zwisler   dax: support dirt...
15
  #include <linux/dax.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
16
  #include <linux/fs.h>
3f07c0144   Ingo Molnar   sched/headers: Pr...
17
  #include <linux/sched/signal.h>
c22ce143d   Hiro Yoshioka   [PATCH] x86: cach...
18
  #include <linux/uaccess.h>
c59ede7b7   Randy.Dunlap   [PATCH] move capa...
19
  #include <linux/capability.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
20
  #include <linux/kernel_stat.h>
5a0e3ad6a   Tejun Heo   include cleanup: ...
21
  #include <linux/gfp.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
22
23
24
25
26
27
  #include <linux/mm.h>
  #include <linux/swap.h>
  #include <linux/mman.h>
  #include <linux/pagemap.h>
  #include <linux/file.h>
  #include <linux/uio.h>
cfcbfb138   Josef Bacik   mm/filemap.c: ena...
28
  #include <linux/error-injection.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
29
30
  #include <linux/hash.h>
  #include <linux/writeback.h>
53253383f   Linus Torvalds   Include <linux/ba...
31
  #include <linux/backing-dev.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
32
33
34
  #include <linux/pagevec.h>
  #include <linux/blkdev.h>
  #include <linux/security.h>
44110fe38   Paul Jackson   [PATCH] cpuset me...
35
  #include <linux/cpuset.h>
00501b531   Johannes Weiner   mm: memcontrol: r...
36
  #include <linux/hugetlb.h>
8a9f3ccd2   Balbir Singh   Memory controller...
37
  #include <linux/memcontrol.h>
c515e1fd3   Dan Magenheimer   mm/fs: add hooks ...
38
  #include <linux/cleancache.h>
c7df8ad29   Mel Gorman   mm, truncate: do ...
39
  #include <linux/shmem_fs.h>
f1820361f   Kirill A. Shutemov   mm: implement ->m...
40
  #include <linux/rmap.h>
b1d29ba82   Johannes Weiner   delayacct: track ...
41
  #include <linux/delayacct.h>
eb414681d   Johannes Weiner   psi: pressure sta...
42
  #include <linux/psi.h>
d0e6a5821   Ben Dooks   mm/filemap.c: inc...
43
  #include <linux/ramfs.h>
0f8053a50   Nick Piggin   [PATCH] mm: make ...
44
  #include "internal.h"
fe0bfaaff   Robert Jarzmik   mm: trace filemap...
45
46
  #define CREATE_TRACE_POINTS
  #include <trace/events/filemap.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
47
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
48
49
   * FIXME: remove all knowledge of the buffer layer from the core VM
   */
148f948ba   Jan Kara   vfs: Introduce ne...
50
  #include <linux/buffer_head.h> /* for try_to_free_buffers */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
51

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
  #include <asm/mman.h>
  
  /*
   * Shared mappings implemented 30.11.1994. It's not fully working yet,
   * though.
   *
   * Shared mappings now work. 15.8.1995  Bruno.
   *
   * finished 'unifying' the page and buffer cache and SMP-threaded the
   * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
   *
   * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
   */
  
  /*
   * Lock ordering:
   *
c8c06efa8   Davidlohr Bueso   mm: convert i_mma...
69
   *  ->i_mmap_rwsem		(truncate_pagecache)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
70
   *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
5d337b919   Hugh Dickins   [PATCH] swap: swa...
71
   *      ->swap_lock		(exclusive_swap_page, others)
b93b01631   Matthew Wilcox   page cache: use x...
72
   *        ->i_pages lock
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
73
   *
1b1dcc1b5   Jes Sorensen   [PATCH] mutex sub...
74
   *  ->i_mutex
c8c06efa8   Davidlohr Bueso   mm: convert i_mma...
75
   *    ->i_mmap_rwsem		(truncate->unmap_mapping_range)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
76
77
   *
   *  ->mmap_sem
c8c06efa8   Davidlohr Bueso   mm: convert i_mma...
78
   *    ->i_mmap_rwsem
b8072f099   Hugh Dickins   [PATCH] mm: updat...
79
   *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
b93b01631   Matthew Wilcox   page cache: use x...
80
   *        ->i_pages lock	(arch-dependent flush_dcache_mmap_lock)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
81
82
83
84
   *
   *  ->mmap_sem
   *    ->lock_page		(access_process_vm)
   *
ccad23656   Al Viro   kill generic_file...
85
   *  ->i_mutex			(generic_perform_write)
82591e6ea   Nick Piggin   [PATCH] mm: more ...
86
   *    ->mmap_sem		(fault_in_pages_readable->do_page_fault)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
87
   *
f758eeabe   Christoph Hellwig   writeback: split ...
88
   *  bdi->wb.list_lock
a66979aba   Dave Chinner   fs: move i_wb_lis...
89
   *    sb_lock			(fs/fs-writeback.c)
b93b01631   Matthew Wilcox   page cache: use x...
90
   *    ->i_pages lock		(__sync_single_inode)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
91
   *
c8c06efa8   Davidlohr Bueso   mm: convert i_mma...
92
   *  ->i_mmap_rwsem
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
93
94
95
   *    ->anon_vma.lock		(vma_adjust)
   *
   *  ->anon_vma.lock
b8072f099   Hugh Dickins   [PATCH] mm: updat...
96
   *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
97
   *
b8072f099   Hugh Dickins   [PATCH] mm: updat...
98
   *  ->page_table_lock or pte_lock
5d337b919   Hugh Dickins   [PATCH] swap: swa...
99
   *    ->swap_lock		(try_to_unmap_one)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
100
   *    ->private_lock		(try_to_unmap_one)
b93b01631   Matthew Wilcox   page cache: use x...
101
   *    ->i_pages lock		(try_to_unmap_one)
f4b7e272b   Andrey Ryabinin   mm: remove zone_l...
102
103
   *    ->pgdat->lru_lock		(follow_page->mark_page_accessed)
   *    ->pgdat->lru_lock		(check_pte_range->isolate_lru_page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
104
   *    ->private_lock		(page_remove_rmap->set_page_dirty)
b93b01631   Matthew Wilcox   page cache: use x...
105
   *    ->i_pages lock		(page_remove_rmap->set_page_dirty)
f758eeabe   Christoph Hellwig   writeback: split ...
106
   *    bdi.wb->list_lock		(page_remove_rmap->set_page_dirty)
250df6ed2   Dave Chinner   fs: protect inode...
107
   *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
81f8c3a46   Johannes Weiner   mm: memcontrol: g...
108
   *    ->memcg->move_lock	(page_remove_rmap->lock_page_memcg)
f758eeabe   Christoph Hellwig   writeback: split ...
109
   *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
250df6ed2   Dave Chinner   fs: protect inode...
110
   *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
111
112
   *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
   *
c8c06efa8   Davidlohr Bueso   mm: convert i_mma...
113
   * ->i_mmap_rwsem
9a3c531df   Andi Kleen   mm: update stale ...
114
   *   ->tasklist_lock            (memory_failure, collect_procs_ao)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
115
   */
5c024e6a4   Matthew Wilcox   page cache: Conve...
116
/*
 * page_cache_delete - remove @page from @mapping's xarray, storing @shadow
 * (a shadow entry, or NULL) in its place.
 *
 * Handles only the xarray side of removal: updates nrpages/nrexceptional
 * but does no statistics accounting (see unaccount_page_cache_page()) and
 * drops no page references.  The caller must hold the i_pages lock (see
 * __delete_from_page_cache()).
 */
static void page_cache_delete(struct address_space *mapping,
				   struct page *page, void *shadow)
{
	XA_STATE(xas, &mapping->i_pages, page->index);
	unsigned int nr = 1;

	mapping_set_update(&xas, mapping);

	/* hugetlb pages are represented by a single entry in the xarray */
	if (!PageHuge(page)) {
		/* Compound pages occupy compound_nr() consecutive slots. */
		xas_set_order(&xas, page->index, compound_order(page));
		nr = compound_nr(page);
	}

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(PageTail(page), page);
	/* Shadow entries are only stored for single-page removals. */
	VM_BUG_ON_PAGE(nr != 1 && shadow, page);

	xas_store(&xas, shadow);
	/* Reset the xarray marks (dirty/writeback tags) for this entry. */
	xas_init_marks(&xas);

	page->mapping = NULL;
	/* Leave page->index set: truncation lookup relies upon it */
	if (shadow) {
		mapping->nrexceptional += nr;
		/*
		 * Make sure the nrexceptional update is committed before
		 * the nrpages update so that final truncate racing
		 * with reclaim does not see both counters 0 at the
		 * same time and miss a shadow entry.
		 */
		smp_wmb();
	}
	mapping->nrpages -= nr;
}
5ecc4d852   Jan Kara   mm: factor out ch...
151
152
/*
 * unaccount_page_cache_page - update statistics and sanity-check @page as
 * it leaves @mapping's page cache.
 *
 * Notifies cleancache, complains about (and attempts to repair) pages that
 * are unexpectedly still mapped, and adjusts the node's file/shmem/THP
 * counters.  hugetlb pages are skipped entirely: they do not participate
 * in page cache accounting.
 */
static void unaccount_page_cache_page(struct address_space *mapping,
				      struct page *page)
{
	int nr;

	/*
	 * if we're uptodate, flush out into the cleancache, otherwise
	 * invalidate any existing cleancache entries.  We can't leave
	 * stale data around in the cleancache once our page is gone
	 */
	if (PageUptodate(page) && PageMappedToDisk(page))
		cleancache_put_page(page);
	else
		cleancache_invalidate_page(mapping, page);

	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(page_mapped(page), page);
	/* Without CONFIG_DEBUG_VM the check above compiles out; do it here. */
	if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) {
		int mapcount;

		pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
			 current->comm, page_to_pfn(page));
		dump_page(page, "still mapped when deleted");
		dump_stack();
		add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);

		mapcount = page_mapcount(page);
		if (mapping_exiting(mapping) &&
		    page_count(page) >= mapcount + 2) {
			/*
			 * All vmas have already been torn down, so it's
			 * a good bet that actually the page is unmapped,
			 * and we'd prefer not to leak it: if we're wrong,
			 * some other bad page check should catch it later.
			 */
			page_mapcount_reset(page);
			page_ref_sub(page, mapcount);
		}
	}
	/* hugetlb pages do not participate in page cache accounting. */
	if (PageHuge(page))
		return;

	nr = hpage_nr_pages(page);

	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
	if (PageSwapBacked(page)) {
		__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
		if (PageTransHuge(page))
			__dec_node_page_state(page, NR_SHMEM_THPS);
	} else if (PageTransHuge(page)) {
		__dec_node_page_state(page, NR_FILE_THPS);
		filemap_nr_thps_dec(mapping);
	}

	/*
	 * At this point page must be either written or cleaned by
	 * truncate.  Dirty page here signals a bug and loss of
	 * unwritten data.
	 *
	 * This fixes dirty accounting after removing the page entirely
	 * but leaves PageDirty set: it has no effect for truncated
	 * page and anyway will be cleared before returning page into
	 * buddy allocator.
	 */
	if (WARN_ON_ONCE(PageDirty(page)))
		account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
}
  
  /*
   * Delete a page from the page cache and free it. Caller has to make
   * sure the page is locked and that nobody else uses it - or that usage
b93b01631   Matthew Wilcox   page cache: use x...
224
   * is safe.  The caller must hold the i_pages lock.
5ecc4d852   Jan Kara   mm: factor out ch...
225
226
227
228
229
230
231
232
   */
  void __delete_from_page_cache(struct page *page, void *shadow)
  {
  	struct address_space *mapping = page->mapping;
  
  	trace_mm_filemap_delete_from_page_cache(page);
  
  	unaccount_page_cache_page(mapping, page);
5c024e6a4   Matthew Wilcox   page cache: Conve...
233
  	page_cache_delete(mapping, page, shadow);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
234
  }
59c66c5f8   Jan Kara   mm: factor out pa...
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
  static void page_cache_free_page(struct address_space *mapping,
  				struct page *page)
  {
  	void (*freepage)(struct page *);
  
  	freepage = mapping->a_ops->freepage;
  	if (freepage)
  		freepage(page);
  
  	if (PageTransHuge(page) && !PageHuge(page)) {
  		page_ref_sub(page, HPAGE_PMD_NR);
  		VM_BUG_ON_PAGE(page_count(page) <= 0, page);
  	} else {
  		put_page(page);
  	}
  }
702cfbf93   Minchan Kim   mm: goodbye remov...
251
252
253
254
255
256
257
258
259
  /**
   * delete_from_page_cache - delete page from page cache
   * @page: the page which the kernel is trying to remove from page cache
   *
   * This must be called only on pages that have been verified to be in the page
   * cache and locked.  It will never put the page into the free list, the caller
   * has a reference on the page.
   */
  void delete_from_page_cache(struct page *page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
260
  {
83929372f   Kirill A. Shutemov   filemap: prepare ...
261
  	struct address_space *mapping = page_mapping(page);
c4843a759   Greg Thelen   memcg: add per cg...
262
  	unsigned long flags;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
263

cd7619d6b   Matt Mackall   [PATCH] Extermina...
264
  	BUG_ON(!PageLocked(page));
b93b01631   Matthew Wilcox   page cache: use x...
265
  	xa_lock_irqsave(&mapping->i_pages, flags);
62cccb8c8   Johannes Weiner   mm: simplify lock...
266
  	__delete_from_page_cache(page, NULL);
b93b01631   Matthew Wilcox   page cache: use x...
267
  	xa_unlock_irqrestore(&mapping->i_pages, flags);
6072d13c4   Linus Torvalds   Call the filesyst...
268

59c66c5f8   Jan Kara   mm: factor out pa...
269
  	page_cache_free_page(mapping, page);
97cecb5a2   Minchan Kim   mm: introduce del...
270
271
  }
  EXPORT_SYMBOL(delete_from_page_cache);
aa65c29ce   Jan Kara   mm: batch radix t...
272
/*
 * page_cache_delete_batch - delete several pages from page cache
 * @mapping: the mapping to which pages belong
 * @pvec: pagevec with pages to delete
 *
 * The function walks over mapping->i_pages and removes pages passed in @pvec
 * from the mapping. The function expects @pvec to be sorted by page index
 * and is optimised for it to be dense.
 * It tolerates holes in @pvec (mapping entries at those indices are not
 * modified). The function expects only THP head pages to be present in the
 * @pvec.
 *
 * The function expects the i_pages lock to be held.
 */
static void page_cache_delete_batch(struct address_space *mapping,
			     struct pagevec *pvec)
{
	XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
	/* Counts every xarray slot cleared, including a THP's tail slots. */
	int total_pages = 0;
	int i = 0;
	struct page *page;

	mapping_set_update(&xas, mapping);
	xas_for_each(&xas, page, ULONG_MAX) {
		if (i >= pagevec_count(pvec))
			break;

		/* A swap/dax/shadow entry got inserted? Skip it. */
		if (xa_is_value(page))
			continue;
		/*
		 * A page got inserted in our range? Skip it. We have our
		 * pages locked so they are protected from being removed.
		 * If we see a page whose index is higher than ours, it
		 * means our page has been removed, which shouldn't be
		 * possible because we're holding the PageLock.
		 */
		if (page != pvec->pages[i]) {
			VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index,
					page);
			continue;
		}

		WARN_ON_ONCE(!PageLocked(page));

		/* Clear ->mapping only when at the page's own (head) index. */
		if (page->index == xas.xa_index)
			page->mapping = NULL;
		/* Leave page->index set: truncation lookup relies on it */

		/*
		 * Move to the next page in the vector if this is a regular
		 * page or the index is of the last sub-page of this compound
		 * page.
		 */
		if (page->index + compound_nr(page) - 1 == xas.xa_index)
			i++;
		xas_store(&xas, NULL);
		total_pages++;
	}
	mapping->nrpages -= total_pages;
}
  
  void delete_from_page_cache_batch(struct address_space *mapping,
  				  struct pagevec *pvec)
  {
  	int i;
  	unsigned long flags;
  
  	if (!pagevec_count(pvec))
  		return;
b93b01631   Matthew Wilcox   page cache: use x...
342
  	xa_lock_irqsave(&mapping->i_pages, flags);
aa65c29ce   Jan Kara   mm: batch radix t...
343
344
345
346
347
  	for (i = 0; i < pagevec_count(pvec); i++) {
  		trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);
  
  		unaccount_page_cache_page(mapping, pvec->pages[i]);
  	}
ef8e5717d   Matthew Wilcox   page cache: Conve...
348
  	page_cache_delete_batch(mapping, pvec);
b93b01631   Matthew Wilcox   page cache: use x...
349
  	xa_unlock_irqrestore(&mapping->i_pages, flags);
aa65c29ce   Jan Kara   mm: batch radix t...
350
351
352
353
  
  	for (i = 0; i < pagevec_count(pvec); i++)
  		page_cache_free_page(mapping, pvec->pages[i]);
  }
d72d9e2a5   Miklos Szeredi   mm: export filema...
354
  int filemap_check_errors(struct address_space *mapping)
865ffef37   Dmitry Monakhov   fs: fix fsync() e...
355
356
357
  {
  	int ret = 0;
  	/* Check for outstanding write errors */
7fcbbaf18   Jens Axboe   mm/filemap.c: avo...
358
359
  	if (test_bit(AS_ENOSPC, &mapping->flags) &&
  	    test_and_clear_bit(AS_ENOSPC, &mapping->flags))
865ffef37   Dmitry Monakhov   fs: fix fsync() e...
360
  		ret = -ENOSPC;
7fcbbaf18   Jens Axboe   mm/filemap.c: avo...
361
362
  	if (test_bit(AS_EIO, &mapping->flags) &&
  	    test_and_clear_bit(AS_EIO, &mapping->flags))
865ffef37   Dmitry Monakhov   fs: fix fsync() e...
363
364
365
  		ret = -EIO;
  	return ret;
  }
d72d9e2a5   Miklos Szeredi   mm: export filema...
366
  EXPORT_SYMBOL(filemap_check_errors);
865ffef37   Dmitry Monakhov   fs: fix fsync() e...
367

76341cabb   Jeff Layton   jbd2: don't clear...
368
369
370
371
372
373
374
375
376
  static int filemap_check_and_keep_errors(struct address_space *mapping)
  {
  	/* Check for outstanding write errors */
  	if (test_bit(AS_EIO, &mapping->flags))
  		return -EIO;
  	if (test_bit(AS_ENOSPC, &mapping->flags))
  		return -ENOSPC;
  	return 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
377
  /**
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
378
   * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
67be2dd1b   Martin Waitz   [PATCH] DocBook: ...
379
380
   * @mapping:	address space structure to write
   * @start:	offset in bytes where the range starts
469eb4d03   Andrew Morton   [PATCH] filemap_f...
381
   * @end:	offset in bytes where the range ends (inclusive)
67be2dd1b   Martin Waitz   [PATCH] DocBook: ...
382
   * @sync_mode:	enable synchronous operation
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
383
   *
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
384
385
386
   * Start writeback against all of a mapping's dirty pages that lie
   * within the byte offsets <start, end> inclusive.
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
387
   * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
388
   * opposed to a regular memory cleansing writeback.  The difference between
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
389
390
   * these two operations is that if a dirty page/buffer is encountered, it must
   * be waited upon, and not just skipped over.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
391
392
   *
   * Return: %0 on success, negative error code otherwise.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
393
   */
ebcf28e1c   Andrew Morton   [PATCH] fadvise()...
394
395
  int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
  				loff_t end, int sync_mode)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
396
397
398
399
  {
  	int ret;
  	struct writeback_control wbc = {
  		.sync_mode = sync_mode,
05fe478dd   Nick Piggin   mm: write_cache_p...
400
  		.nr_to_write = LONG_MAX,
111ebb6e6   OGAWA Hirofumi   [PATCH] writeback...
401
402
  		.range_start = start,
  		.range_end = end,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
403
  	};
c3aab9a0b   Konstantin Khlebnikov   mm/filemap.c: don...
404
405
  	if (!mapping_cap_writeback_dirty(mapping) ||
  	    !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
406
  		return 0;
b16b1deb5   Tejun Heo   writeback: make w...
407
  	wbc_attach_fdatawrite_inode(&wbc, mapping->host);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
408
  	ret = do_writepages(mapping, &wbc);
b16b1deb5   Tejun Heo   writeback: make w...
409
  	wbc_detach_inode(&wbc);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
410
411
412
413
414
415
  	return ret;
  }
  
  static inline int __filemap_fdatawrite(struct address_space *mapping,
  	int sync_mode)
  {
111ebb6e6   OGAWA Hirofumi   [PATCH] writeback...
416
  	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
417
418
419
420
421
422
423
  }
  
  int filemap_fdatawrite(struct address_space *mapping)
  {
  	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
  }
  EXPORT_SYMBOL(filemap_fdatawrite);
f4c0a0fdf   Jan Kara   vfs: export filem...
424
  int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
ebcf28e1c   Andrew Morton   [PATCH] fadvise()...
425
  				loff_t end)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
426
427
428
  {
  	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
  }
f4c0a0fdf   Jan Kara   vfs: export filem...
429
  EXPORT_SYMBOL(filemap_fdatawrite_range);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
430

485bb99b4   Randy Dunlap   [PATCH] kernel-do...
431
432
433
434
  /**
   * filemap_flush - mostly a non-blocking flush
   * @mapping:	target address_space
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
435
436
   * This is a mostly non-blocking flush.  Not suitable for data-integrity
   * purposes - I/O may not be started against all dirty pages.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
437
438
   *
   * Return: %0 on success, negative error code otherwise.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
439
440
441
442
443
444
   */
  int filemap_flush(struct address_space *mapping)
  {
  	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
  }
  EXPORT_SYMBOL(filemap_flush);
7fc9e4722   Goldwyn Rodrigues   fs: Introduce fil...
445
446
447
448
449
450
451
452
/**
 * filemap_range_has_page - check if a page exists in range.
 * @mapping:           address space within which to check
 * @start_byte:        offset in bytes where the range starts
 * @end_byte:          offset in bytes where the range ends (inclusive)
 *
 * Find at least one page in the range supplied, usually used to check if
 * direct writing in this range will trigger a writeback.
 *
 * Return: %true if at least one page exists in the specified range,
 * %false otherwise.
 */
bool filemap_range_has_page(struct address_space *mapping,
			   loff_t start_byte, loff_t end_byte)
{
	struct page *page;
	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
	pgoff_t max = end_byte >> PAGE_SHIFT;

	if (end_byte < start_byte)
		return false;

	rcu_read_lock();
	for (;;) {
		/* NULL when the walk passes @max without finding an entry. */
		page = xas_find(&xas, max);
		/* Transient xarray state; restart this iteration. */
		if (xas_retry(&xas, page))
			continue;
		/* Shadow entries don't count */
		if (xa_is_value(page))
			continue;
		/*
		 * We don't need to try to pin this page; we're about to
		 * release the RCU lock anyway.  It is enough to know that
		 * there was a page here recently.
		 */
		break;
	}
	rcu_read_unlock();

	return page != NULL;
}
EXPORT_SYMBOL(filemap_range_has_page);
5e8fcc1a0   Jeff Layton   mm: don't TestCle...
486
/*
 * __filemap_fdatawait_range - wait for writeback to complete on all pages
 * of @mapping within byte range [@start_byte, @end_byte], clearing each
 * page's error flag as it goes.
 *
 * Does not touch the mapping-wide AS_EIO/AS_ENOSPC bits; callers combine
 * this with filemap_check_errors() or a keep-errors variant as appropriate.
 */
static void __filemap_fdatawait_range(struct address_space *mapping,
				     loff_t start_byte, loff_t end_byte)
{
	pgoff_t index = start_byte >> PAGE_SHIFT;
	pgoff_t end = end_byte >> PAGE_SHIFT;
	struct pagevec pvec;
	int nr_pages;

	if (end_byte < start_byte)
		return;

	pagevec_init(&pvec);
	while (index <= end) {
		unsigned i;

		/* Gather the next batch of pages tagged under-writeback. */
		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
				end, PAGECACHE_TAG_WRITEBACK);
		if (!nr_pages)
			break;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			wait_on_page_writeback(page);
			ClearPageError(page);
		}
		pagevec_release(&pvec);
		/* Batching bounds latency; yield between batches. */
		cond_resched();
	}
}
  
  /**
   * filemap_fdatawait_range - wait for writeback to complete
   * @mapping:		address space structure to wait for
   * @start_byte:		offset in bytes where the range starts
   * @end_byte:		offset in bytes where the range ends (inclusive)
   *
   * Walk the list of under-writeback pages of the given address space
   * in the given range and wait for all of them.  Check error status of
   * the address space and return it.
   *
   * Since the error status of the address space is cleared by this function,
   * callers are responsible for checking the return value and handling and/or
   * reporting the error.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
527
528
   *
   * Return: error status of the address space.
aa750fd71   Junichi Nomura   mm/filemap.c: mak...
529
530
531
532
   */
  int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
  			    loff_t end_byte)
  {
5e8fcc1a0   Jeff Layton   mm: don't TestCle...
533
534
  	__filemap_fdatawait_range(mapping, start_byte, end_byte);
  	return filemap_check_errors(mapping);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
535
  }
d3bccb6f4   Jan Kara   vfs: Introduce fi...
536
537
538
  EXPORT_SYMBOL(filemap_fdatawait_range);
  
  /**
aa0bfcd93   Ross Zwisler   mm: add filemap_f...
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
   * filemap_fdatawait_range_keep_errors - wait for writeback to complete
   * @mapping:		address space structure to wait for
   * @start_byte:		offset in bytes where the range starts
   * @end_byte:		offset in bytes where the range ends (inclusive)
   *
   * Walk the list of under-writeback pages of the given address space in the
   * given range and wait for all of them.  Unlike filemap_fdatawait_range(),
   * this function does not clear error status of the address space.
   *
   * Use this function if callers don't handle errors themselves.  Expected
   * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
   * fsfreeze(8)
   */
  int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
  		loff_t start_byte, loff_t end_byte)
  {
  	__filemap_fdatawait_range(mapping, start_byte, end_byte);
  	return filemap_check_and_keep_errors(mapping);
  }
  EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);
  
  /**
a823e4589   Jeff Layton   mm: add file_fdat...
561
562
563
564
565
566
567
568
569
570
571
572
   * file_fdatawait_range - wait for writeback to complete
   * @file:		file pointing to address space structure to wait for
   * @start_byte:		offset in bytes where the range starts
   * @end_byte:		offset in bytes where the range ends (inclusive)
   *
   * Walk the list of under-writeback pages of the address space that file
   * refers to, in the given range and wait for all of them.  Check error
   * status of the address space vs. the file->f_wb_err cursor and return it.
   *
   * Since the error status of the file is advanced by this function,
   * callers are responsible for checking the return value and handling and/or
   * reporting the error.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
573
574
   *
   * Return: error status of the address space vs. the file->f_wb_err cursor.
a823e4589   Jeff Layton   mm: add file_fdat...
575
576
577
578
579
580
581
582
583
   */
  int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
  {
  	struct address_space *mapping = file->f_mapping;
  
  	__filemap_fdatawait_range(mapping, start_byte, end_byte);
  	return file_check_and_advance_wb_err(file);
  }
  EXPORT_SYMBOL(file_fdatawait_range);
d3bccb6f4   Jan Kara   vfs: Introduce fi...
584
585
  
  /**
aa750fd71   Junichi Nomura   mm/filemap.c: mak...
586
587
588
589
590
591
592
593
594
595
   * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
   * @mapping: address space structure to wait for
   *
   * Walk the list of under-writeback pages of the given address space
   * and wait for all of them.  Unlike filemap_fdatawait(), this function
   * does not clear error status of the address space.
   *
   * Use this function if callers don't handle errors themselves.  Expected
   * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
   * fsfreeze(8)
a862f68a8   Mike Rapoport   docs/core-api/mm:...
596
597
   *
   * Return: error status of the address space.
aa750fd71   Junichi Nomura   mm/filemap.c: mak...
598
   */
76341cabb   Jeff Layton   jbd2: don't clear...
599
  int filemap_fdatawait_keep_errors(struct address_space *mapping)
aa750fd71   Junichi Nomura   mm/filemap.c: mak...
600
  {
ffb959bbd   Jeff Layton   mm: remove optimi...
601
  	__filemap_fdatawait_range(mapping, 0, LLONG_MAX);
76341cabb   Jeff Layton   jbd2: don't clear...
602
  	return filemap_check_and_keep_errors(mapping);
aa750fd71   Junichi Nomura   mm/filemap.c: mak...
603
  }
76341cabb   Jeff Layton   jbd2: don't clear...
604
  EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
aa750fd71   Junichi Nomura   mm/filemap.c: mak...
605

875d91b11   Konstantin Khlebnikov   mm/filemap.c: rew...
606
  /* Returns true if writeback might be needed or already in progress. */
9326c9b20   Jeff Layton   mm: consolidate d...
607
  static bool mapping_needs_writeback(struct address_space *mapping)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
608
  {
875d91b11   Konstantin Khlebnikov   mm/filemap.c: rew...
609
610
611
612
  	if (dax_mapping(mapping))
  		return mapping->nrexceptional;
  
  	return mapping->nrpages;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
613
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
614
615
616
  
/**
 * filemap_write_and_wait - write out and wait upon all pages of a mapping
 * @mapping:	the address_space to sync
 *
 * Start writeback on the whole mapping and wait for it to finish, then
 * report any writeback error stored on the mapping (clearing it).
 *
 * Return: error status of the address space.
 */
int filemap_write_and_wait(struct address_space *mapping)
{
	int err = 0;

	if (mapping_needs_writeback(mapping)) {
		err = filemap_fdatawrite(mapping);
		/*
		 * Even if the above returned error, the pages may be
		 * written partially (e.g. -ENOSPC), so we wait for it.
		 * But the -EIO is special case, it may indicate the worst
		 * thing (e.g. bug) happened, so we avoid waiting for it.
		 */
		if (err != -EIO) {
			int err2 = filemap_fdatawait(mapping);
			/* Prefer the write error over the wait error. */
			if (!err)
				err = err2;
		} else {
			/* Clear any previously stored errors */
			filemap_check_errors(mapping);
		}
	} else {
		/* Nothing to write back, but old errors must still surface. */
		err = filemap_check_errors(mapping);
	}
	return err;
}
EXPORT_SYMBOL(filemap_write_and_wait);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
641

485bb99b4   Randy Dunlap   [PATCH] kernel-do...
642
643
644
645
646
647
/**
 * filemap_write_and_wait_range - write out & wait on a file range
 * @mapping:	the address_space for the pages
 * @lstart:	offset in bytes where the range starts
 * @lend:	offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that @lend is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 *
 * Return: error status of the address space.
 */
int filemap_write_and_wait_range(struct address_space *mapping,
				 loff_t lstart, loff_t lend)
{
	int err = 0;

	if (mapping_needs_writeback(mapping)) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/* See comment of filemap_write_and_wait() */
		if (err != -EIO) {
			int err2 = filemap_fdatawait_range(mapping,
						lstart, lend);
			/* The write error, if any, takes precedence. */
			if (!err)
				err = err2;
		} else {
			/* Clear any previously stored errors */
			filemap_check_errors(mapping);
		}
	} else {
		/* No pages to flush; still report any stored error. */
		err = filemap_check_errors(mapping);
	}
	return err;
}
EXPORT_SYMBOL(filemap_write_and_wait_range);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
679

5660e13d2   Jeff Layton   fs: new infrastru...
680
681
/*
 * Record a writeback error in the mapping's errseq_t cursor and emit a
 * tracepoint.  Readers observe it via file_check_and_advance_wb_err().
 */
void __filemap_set_wb_err(struct address_space *mapping, int err)
{
	errseq_t eseq = errseq_set(&mapping->wb_err, err);

	trace_filemap_set_wb_err(mapping, eseq);
}
EXPORT_SYMBOL(__filemap_set_wb_err);
  
/**
 * file_check_and_advance_wb_err - report wb error (if any) that was previously
 * 				   and advance wb_err to current one
 * @file: struct file on which the error is being reported
 *
 * When userland calls fsync (or something like nfsd does the equivalent), we
 * want to report any writeback errors that occurred since the last fsync (or
 * since the file was opened if there haven't been any).
 *
 * Grab the wb_err from the mapping. If it matches what we have in the file,
 * then just quickly return 0. The file is all caught up.
 *
 * If it doesn't match, then take the mapping value, set the "seen" flag in
 * it and try to swap it into place. If it works, or another task beat us
 * to it with the new value, then update the f_wb_err and return the error
 * portion. The error at this point must be reported via proper channels
 * (a'la fsync, or NFS COMMIT operation, etc.).
 *
 * While we handle mapping->wb_err with atomic operations, the f_wb_err
 * value is protected by the f_lock since we must ensure that it reflects
 * the latest value swapped in for this file descriptor.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_check_and_advance_wb_err(struct file *file)
{
	int err = 0;
	errseq_t old = READ_ONCE(file->f_wb_err);
	struct address_space *mapping = file->f_mapping;

	/* Locklessly handle the common case where nothing has changed */
	if (errseq_check(&mapping->wb_err, old)) {
		/* Something changed, must use slow path */
		spin_lock(&file->f_lock);
		/* Re-read under f_lock: another task may have advanced it. */
		old = file->f_wb_err;
		err = errseq_check_and_advance(&mapping->wb_err,
						&file->f_wb_err);
		trace_file_check_and_advance_wb_err(file, old);
		spin_unlock(&file->f_lock);
	}

	/*
	 * We're mostly using this function as a drop in replacement for
	 * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
	 * that the legacy code would have had on these flags.
	 */
	clear_bit(AS_EIO, &mapping->flags);
	clear_bit(AS_ENOSPC, &mapping->flags);
	return err;
}
EXPORT_SYMBOL(file_check_and_advance_wb_err);
  
/**
 * file_write_and_wait_range - write out & wait on a file range
 * @file:	file pointing to address_space with pages
 * @lstart:	offset in bytes where the range starts
 * @lend:	offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that @lend is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 *
 * After writing out and waiting on the data, we check and advance the
 * f_wb_err cursor to the latest value, and return any errors detected there.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
{
	int err = 0, err2;
	struct address_space *mapping = file->f_mapping;

	if (mapping_needs_writeback(mapping)) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/* See comment of filemap_write_and_wait() */
		if (err != -EIO)
			__filemap_fdatawait_range(mapping, lstart, lend);
	}
	/* Errors are reported through the per-file cursor, always checked. */
	err2 = file_check_and_advance_wb_err(file);
	if (!err)
		err = err2;
	return err;
}
EXPORT_SYMBOL(file_write_and_wait_range);
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
773
/**
 * replace_page_cache_page - replace a pagecache page with a new one
 * @old:	page to be replaced
 * @new:	page to replace with
 * @gfp_mask:	allocation mode
 *
 * This function replaces a page in the pagecache with a new one.  On
 * success it acquires the pagecache reference for the new page and
 * drops it for the old page.  Both the old and new pages must be
 * locked.  This function does not add the new page to the LRU, the
 * caller must do that.
 *
 * The remove + add is atomic.  This function cannot fail.
 *
 * Return: %0
 */
int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
{
	struct address_space *mapping = old->mapping;
	void (*freepage)(struct page *) = mapping->a_ops->freepage;
	pgoff_t offset = old->index;
	XA_STATE(xas, &mapping->i_pages, offset);
	unsigned long flags;

	VM_BUG_ON_PAGE(!PageLocked(old), old);
	VM_BUG_ON_PAGE(!PageLocked(new), new);
	VM_BUG_ON_PAGE(new->mapping, new);

	/* Take the pagecache reference on @new before it becomes visible. */
	get_page(new);
	new->mapping = mapping;
	new->index = offset;

	/* Single xas_store() makes the swap atomic w.r.t. lookups. */
	xas_lock_irqsave(&xas, flags);
	xas_store(&xas, new);

	old->mapping = NULL;
	/* hugetlb pages do not participate in page cache accounting. */
	/*
	 * NOTE(review): the decrements for @old's state are applied against
	 * @new's node counters — looks intentional here (old->mapping was
	 * just cleared), but confirm old/new node equivalence with callers.
	 */
	if (!PageHuge(old))
		__dec_node_page_state(new, NR_FILE_PAGES);
	if (!PageHuge(new))
		__inc_node_page_state(new, NR_FILE_PAGES);
	if (PageSwapBacked(old))
		__dec_node_page_state(new, NR_SHMEM);
	if (PageSwapBacked(new))
		__inc_node_page_state(new, NR_SHMEM);
	xas_unlock_irqrestore(&xas, flags);
	mem_cgroup_migrate(old, new);
	if (freepage)
		freepage(old);
	/* Drop the pagecache reference that @old held. */
	put_page(old);

	return 0;
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);
a528910e1   Johannes Weiner   mm: thrash detect...
827
828
829
830
/*
 * Insert a locked page into the pagecache at @offset.
 *
 * Charges the page to the current memcg (except hugetlb pages), then
 * stores it in the XArray, retrying with freshly allocated nodes as
 * needed.  A shadow (exceptional) entry found at the slot is replaced
 * and optionally returned through @shadowp; a real page already present
 * yields -EEXIST.  On failure the page is uncharged and its reference
 * dropped, but page->index is left set (truncation relies upon it).
 */
static int __add_to_page_cache_locked(struct page *page,
				      struct address_space *mapping,
				      pgoff_t offset, gfp_t gfp_mask,
				      void **shadowp)
{
	XA_STATE(xas, &mapping->i_pages, offset);
	int huge = PageHuge(page);
	struct mem_cgroup *memcg;
	int error;
	void *old;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(PageSwapBacked(page), page);
	mapping_set_update(&xas, mapping);

	if (!huge) {
		error = mem_cgroup_try_charge(page, current->mm,
					      gfp_mask, &memcg, false);
		if (error)
			return error;
	}

	/* Pagecache's reference on the page. */
	get_page(page);
	page->mapping = mapping;
	page->index = offset;

	/* Retry loop: xas_nomem() allocates nodes outside the lock. */
	do {
		xas_lock_irq(&xas);
		old = xas_load(&xas);
		if (old && !xa_is_value(old))
			xas_set_err(&xas, -EEXIST);
		xas_store(&xas, page);
		if (xas_error(&xas))
			goto unlock;

		if (xa_is_value(old)) {
			/* Replaced a shadow entry; hand it to the caller. */
			mapping->nrexceptional--;
			if (shadowp)
				*shadowp = old;
		}
		mapping->nrpages++;

		/* hugetlb pages do not participate in page cache accounting */
		if (!huge)
			__inc_node_page_state(page, NR_FILE_PAGES);
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));

	if (xas_error(&xas))
		goto error;

	if (!huge)
		mem_cgroup_commit_charge(page, memcg, false, false);
	trace_mm_filemap_add_to_page_cache(page);
	return 0;
error:
	page->mapping = NULL;
	/* Leave page->index set: truncation relies upon it */
	if (!huge)
		mem_cgroup_cancel_charge(page, memcg, false);
	put_page(page);
	return xas_error(&xas);
}
ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
a528910e1   Johannes Weiner   mm: thrash detect...
891
892
893
894
895
896
897
898
899
900
  
/**
 * add_to_page_cache_locked - add a locked page to the pagecache
 * @page:	page to add
 * @mapping:	the page's address_space
 * @offset:	page index
 * @gfp_mask:	page allocation mode
 *
 * This function is used to add a page to the pagecache. It must be locked.
 * This function does not add the page to the LRU.  The caller must do that.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
		pgoff_t offset, gfp_t gfp_mask)
{
	/* No shadow entry wanted: pass NULL for shadowp. */
	return __add_to_page_cache_locked(page, mapping, offset,
					  gfp_mask, NULL);
}
EXPORT_SYMBOL(add_to_page_cache_locked);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
911
912
  
/*
 * Add a (not yet locked) page to the pagecache and the LRU.  The page is
 * locked here; on insertion failure the lock is dropped again.  A shadow
 * entry recovered from the pagecache drives workingset refault detection.
 */
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
				pgoff_t offset, gfp_t gfp_mask)
{
	void *shadow = NULL;
	int ret;

	__SetPageLocked(page);
	ret = __add_to_page_cache_locked(page, mapping, offset,
					 gfp_mask, &shadow);
	if (unlikely(ret))
		__ClearPageLocked(page);
	else {
		/*
		 * The page might have been evicted from cache only
		 * recently, in which case it should be activated like
		 * any other repeatedly accessed page.
		 * The exception is pages getting rewritten; evicting other
		 * data from the working set, only to cache data that will
		 * get overwritten with something else, is a waste of memory.
		 */
		WARN_ON_ONCE(PageActive(page));
		if (!(gfp_mask & __GFP_WRITE) && shadow)
			workingset_refault(page, shadow);
		lru_cache_add(page);
	}
	return ret;
}
EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
939

44110fe38   Paul Jackson   [PATCH] cpuset me...
940
  #ifdef CONFIG_NUMA
2ae88149a   Nick Piggin   [PATCH] mm: clean...
941
  struct page *__page_cache_alloc(gfp_t gfp)
44110fe38   Paul Jackson   [PATCH] cpuset me...
942
  {
c0ff7453b   Miao Xie   cpuset,mm: fix no...
943
944
  	int n;
  	struct page *page;
44110fe38   Paul Jackson   [PATCH] cpuset me...
945
  	if (cpuset_do_page_mem_spread()) {
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
946
947
  		unsigned int cpuset_mems_cookie;
  		do {
d26914d11   Mel Gorman   mm: optimize put_...
948
  			cpuset_mems_cookie = read_mems_allowed_begin();
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
949
  			n = cpuset_mem_spread_node();
96db800f5   Vlastimil Babka   mm: rename alloc_...
950
  			page = __alloc_pages_node(n, gfp, 0);
d26914d11   Mel Gorman   mm: optimize put_...
951
  		} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
952

c0ff7453b   Miao Xie   cpuset,mm: fix no...
953
  		return page;
44110fe38   Paul Jackson   [PATCH] cpuset me...
954
  	}
2ae88149a   Nick Piggin   [PATCH] mm: clean...
955
  	return alloc_pages(gfp, 0);
44110fe38   Paul Jackson   [PATCH] cpuset me...
956
  }
2ae88149a   Nick Piggin   [PATCH] mm: clean...
957
  EXPORT_SYMBOL(__page_cache_alloc);
44110fe38   Paul Jackson   [PATCH] cpuset me...
958
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
959
960
961
962
963
964
965
966
967
968
/*
 * In order to wait for pages to become available there must be
 * waitqueues associated with pages. By using a hash table of
 * waitqueues where the bucket discipline is to maintain all
 * waiters on the same queue and wake all when any of the pages
 * become available, and for the woken contexts to check to be
 * sure the appropriate page became available, this saves space
 * at a cost of "thundering herd" phenomena during rare hash
 * collisions.
 */
#define PAGE_WAIT_TABLE_BITS 8
#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;

/* Map a page to its (possibly shared) bucket in the wait table. */
static wait_queue_head_t *page_waitqueue(struct page *page)
{
	return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
977

629060270   Nicholas Piggin   mm: add PageWaite...
978
  void __init pagecache_init(void)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
979
  {
629060270   Nicholas Piggin   mm: add PageWaite...
980
  	int i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
981

629060270   Nicholas Piggin   mm: add PageWaite...
982
983
984
985
  	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
  		init_waitqueue_head(&page_wait_table[i]);
  
  	page_writeback_init();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
986
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
987

3510ca20e   Linus Torvalds   Minor page waitqu...
988
/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */
struct wait_page_key {
	struct page *page;	/* page being woken */
	int bit_nr;		/* page flag bit being waited on */
	int page_match;		/* set by the waker walk if any waiter matched */
};

/* Per-waiter entry queued on the hashed page waitqueue. */
struct wait_page_queue {
	struct page *page;
	int bit_nr;
	wait_queue_entry_t wait;
};
ac6424b98   Ingo Molnar   sched/wait: Renam...
1000
/*
 * Wake callback for page-bit waiters.  Returns 0 to keep walking the
 * (hash-shared) queue, -1 to stop walking, or the result of the default
 * autoremove wake for a matching, wakeable entry.
 */
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
	struct wait_page_key *key = arg;
	struct wait_page_queue *wait_page
		= container_of(wait, struct wait_page_queue, wait);

	/* Different page hashed to the same bucket: not for us. */
	if (wait_page->page != key->page)
	       return 0;
	key->page_match = 1;

	/* Same page but a different flag bit: keep walking. */
	if (wait_page->bit_nr != key->bit_nr)
		return 0;

	/*
	 * Stop walking if it's locked.
	 * Is this safe if put_and_wait_on_page_locked() is in use?
	 * Yes: the waker must hold a reference to this page, and if PG_locked
	 * has now already been set by another task, that task must also hold
	 * a reference to the *same usage* of this page; so there is no need
	 * to walk on to wake even the put_and_wait_on_page_locked() callers.
	 */
	if (test_bit(key->bit_nr, &key->page->flags))
		return -1;

	return autoremove_wake_function(wait, mode, sync, key);
}
74d81bfae   Nicholas Piggin   mm: un-export wak...
1026
/*
 * Wake all waiters for @bit_nr on @page.  Uses a bookmark entry so a very
 * long waiter list can be walked in bounded critical sections.
 */
static void wake_up_page_bit(struct page *page, int bit_nr)
{
	wait_queue_head_t *q = page_waitqueue(page);
	struct wait_page_key key;
	unsigned long flags;
	wait_queue_entry_t bookmark;

	key.page = page;
	key.bit_nr = bit_nr;
	key.page_match = 0;

	bookmark.flags = 0;
	bookmark.private = NULL;
	bookmark.func = NULL;
	INIT_LIST_HEAD(&bookmark.entry);

	spin_lock_irqsave(&q->lock, flags);
	__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);

	while (bookmark.flags & WQ_FLAG_BOOKMARK) {
		/*
		 * Take a breather from holding the lock,
		 * allow pages that finish wake up asynchronously
		 * to acquire the lock and remove themselves
		 * from wait queue
		 */
		spin_unlock_irqrestore(&q->lock, flags);
		cpu_relax();
		spin_lock_irqsave(&q->lock, flags);
		__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
	}

	/*
	 * It is possible for other pages to have collided on the waitqueue
	 * hash, so in that case check for a page match. That prevents a long-
	 * term waiter
	 *
	 * It is still possible to miss a case here, when we woke page waiters
	 * and removed them from the waitqueue, but there are still other
	 * page waiters.
	 */
	if (!waitqueue_active(q) || !key.page_match) {
		ClearPageWaiters(page);
		/*
		 * It's possible to miss clearing Waiters here, when we woke
		 * our page waiters, but the hashed waitqueue has waiters for
		 * other pages on it.
		 *
		 * That's okay, it's a rare case. The next waker will clear it.
		 */
	}
	spin_unlock_irqrestore(&q->lock, flags);
}

/* Cheap PageWaiters pre-check before taking the waitqueue lock. */
static void wake_up_page(struct page *page, int bit)
{
	if (!PageWaiters(page))
		return;
	wake_up_page_bit(page, bit);
}
629060270   Nicholas Piggin   mm: add PageWaite...
1083

9a1ea439b   Hugh Dickins   mm: put_and_wait_...
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
  /*
   * A choice of three behaviors for wait_on_page_bit_common():
   */
  enum behavior {
  	EXCLUSIVE,	/* Hold ref to page and take the bit when woken, like
  			 * __lock_page() waiting on then setting PG_locked.
  			 */
  	SHARED,		/* Hold ref to page and check the bit when woken, like
  			 * wait_on_page_writeback() waiting on PG_writeback.
  			 */
  	DROP,		/* Drop ref to page before wait, no check when woken,
  			 * like put_and_wait_on_page_locked() on PG_locked.
  			 */
  };
629060270   Nicholas Piggin   mm: add PageWaite...
1098
/*
 * Core wait loop for page flag bits.  Queues a wait_page_queue entry on
 * the hashed waitqueue, then sleeps until the bit is released (SHARED),
 * acquired by us (EXCLUSIVE), or one wakeup has happened (DROP).
 * Thrashing on a workingset page is accounted to delayacct/psi.
 * Returns 0, or -EINTR if @state is interruptible and a signal arrived.
 */
static inline int wait_on_page_bit_common(wait_queue_head_t *q,
	struct page *page, int bit_nr, int state, enum behavior behavior)
{
	struct wait_page_queue wait_page;
	wait_queue_entry_t *wait = &wait_page.wait;
	bool bit_is_set;
	bool thrashing = false;
	bool delayacct = false;
	unsigned long pflags;
	int ret = 0;

	/* Blocking on a locked, not-uptodate workingset page is thrashing. */
	if (bit_nr == PG_locked &&
	    !PageUptodate(page) && PageWorkingset(page)) {
		if (!PageSwapBacked(page)) {
			delayacct_thrashing_start();
			delayacct = true;
		}
		psi_memstall_enter(&pflags);
		thrashing = true;
	}

	init_wait(wait);
	wait->flags = behavior == EXCLUSIVE ? WQ_FLAG_EXCLUSIVE : 0;
	wait->func = wake_page_function;
	wait_page.page = page;
	wait_page.bit_nr = bit_nr;

	for (;;) {
		spin_lock_irq(&q->lock);

		if (likely(list_empty(&wait->entry))) {
			__add_wait_queue_entry_tail(q, wait);
			/* Must be set before the bit-holder checks it. */
			SetPageWaiters(page);
		}

		set_current_state(state);

		spin_unlock_irq(&q->lock);

		/*
		 * For DROP, sample the bit BEFORE dropping our reference:
		 * page->flags must not be touched after put_page().
		 */
		bit_is_set = test_bit(bit_nr, &page->flags);
		if (behavior == DROP)
			put_page(page);

		if (likely(bit_is_set))
			io_schedule();

		if (behavior == EXCLUSIVE) {
			if (!test_and_set_bit_lock(bit_nr, &page->flags))
				break;
		} else if (behavior == SHARED) {
			if (!test_bit(bit_nr, &page->flags))
				break;
		}

		if (signal_pending_state(state, current)) {
			ret = -EINTR;
			break;
		}

		if (behavior == DROP) {
			/*
			 * We can no longer safely access page->flags:
			 * even if CONFIG_MEMORY_HOTREMOVE is not enabled,
			 * there is a risk of waiting forever on a page reused
			 * for something that keeps it locked indefinitely.
			 * But best check for -EINTR above before breaking.
			 */
			break;
		}
	}

	finish_wait(q, wait);

	if (thrashing) {
		if (delayacct)
			delayacct_thrashing_end();
		psi_memstall_leave(&pflags);
	}

	/*
	 * A signal could leave PageWaiters set. Clearing it here if
	 * !waitqueue_active would be possible (by open-coding finish_wait),
	 * but still fail to catch it in the case of wait hash collision. We
	 * already can fail to clear wait hash collision cases, so don't
	 * bother with signals either.
	 */

	return ret;
}
  
  void wait_on_page_bit(struct page *page, int bit_nr)
  {
  	wait_queue_head_t *q = page_waitqueue(page);
9a1ea439b   Hugh Dickins   mm: put_and_wait_...
1186
  	wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
629060270   Nicholas Piggin   mm: add PageWaite...
1187
1188
1189
1190
1191
1192
  }
  EXPORT_SYMBOL(wait_on_page_bit);
  
  int wait_on_page_bit_killable(struct page *page, int bit_nr)
  {
  	wait_queue_head_t *q = page_waitqueue(page);
9a1ea439b   Hugh Dickins   mm: put_and_wait_...
1193
  	return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED);
cbbce8220   NeilBrown   SCHED: add some "...
1194
  }
4343d0087   David Howells   afs: Get rid of t...
1195
  EXPORT_SYMBOL(wait_on_page_bit_killable);
cbbce8220   NeilBrown   SCHED: add some "...
1196

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1197
  /**
9a1ea439b   Hugh Dickins   mm: put_and_wait_...
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
   * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
   * @page: The page to wait for.
   *
   * The caller should hold a reference on @page.  They expect the page to
   * become unlocked relatively soon, but do not wish to hold up migration
   * (for example) by holding the reference while waiting for the page to
   * come unlocked.  After this function returns, the caller should not
   * dereference @page.
   */
  void put_and_wait_on_page_locked(struct page *page)
  {
  	wait_queue_head_t *q;
  
  	page = compound_head(page);
  	q = page_waitqueue(page);
  	wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP);
  }
  
  /**
385e1ca5f   David Howells   CacheFiles: Permi...
1217
   * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
697f619fc   Randy Dunlap   filemap: fix kern...
1218
1219
   * @page: Page defining the wait queue of interest
   * @waiter: Waiter to add to the queue
385e1ca5f   David Howells   CacheFiles: Permi...
1220
1221
1222
   *
   * Add an arbitrary @waiter to the wait queue for the nominated @page.
   */
ac6424b98   Ingo Molnar   sched/wait: Renam...
1223
  void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
385e1ca5f   David Howells   CacheFiles: Permi...
1224
1225
1226
1227
1228
  {
  	wait_queue_head_t *q = page_waitqueue(page);
  	unsigned long flags;
  
  	spin_lock_irqsave(&q->lock, flags);
9c3a815f4   Linus Torvalds   page waitqueue: a...
1229
  	__add_wait_queue_entry_tail(q, waiter);
629060270   Nicholas Piggin   mm: add PageWaite...
1230
  	SetPageWaiters(page);
385e1ca5f   David Howells   CacheFiles: Permi...
1231
1232
1233
  	spin_unlock_irqrestore(&q->lock, flags);
  }
  EXPORT_SYMBOL_GPL(add_page_wait_queue);
b91e1302a   Linus Torvalds   mm: optimize Page...
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
#ifndef clear_bit_unlock_is_negative_byte

/*
 * PG_waiters is the high bit in the same byte as PG_lock.
 *
 * On x86 (and on many other architectures), we can clear PG_lock and
 * test the sign bit at the same time. But if the architecture does
 * not support that special operation, we just do this all by hand
 * instead.
 *
 * The read of PG_waiters has to be after (or concurrently with) PG_locked
 * being cleared, but a memory barrier should be unnecessary since it is
 * in the same byte as PG_locked.
 */
static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
{
	/* Release-order clear of the lock bit... */
	clear_bit_unlock(nr, mem);
	/* smp_mb__after_atomic(); */
	/* ...then report whether any waiter flagged itself. */
	return test_bit(PG_waiters, mem);
}

#endif
385e1ca5f   David Howells   CacheFiles: Permi...
1256
/**
 * unlock_page - unlock a locked page
 * @page: the page
 *
 * Unlocks the page and wakes up any sleepers waiting for PG_locked in
 * wait_on_page_bit_common().  Also wakes sleepers in
 * wait_on_page_writeback() because the wakeup mechanism between
 * PageLocked pages and PageWriteback pages is shared.
 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
 *
 * Note that this depends on PG_waiters being the sign bit in the byte
 * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
 * clear the PG_locked bit and test PG_waiters at the same time fairly
 * portably (architectures that do LL/SC can test any bit, while x86 can
 * test the sign bit).
 */
void unlock_page(struct page *page)
{
	BUILD_BUG_ON(PG_waiters != 7);
	/* The lock lives on the head page of a compound page. */
	page = compound_head(page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	/* Clear PG_locked and test PG_waiters in a single operation. */
	if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
		wake_up_page_bit(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
1280
1281
1282
/**
 * end_page_writeback - end writeback against a page
 * @page: the page
 *
 * Clears PG_writeback and wakes anyone sleeping on that bit.  If the
 * page was marked for immediate reclaim, rotate it to the tail of the
 * inactive list first so reclaim finds it quickly.
 */
void end_page_writeback(struct page *page)
{
	/*
	 * TestClearPageReclaim could be used here but it is an atomic
	 * operation and overkill in this particular case. Failing to
	 * shuffle a page marked for immediate reclaim is too mild to
	 * justify taking an atomic operation penalty at the end of
	 * every page writeback.
	 */
	if (PageReclaim(page)) {
		ClearPageReclaim(page);
		rotate_reclaimable_page(page);
	}

	/* Writeback must actually have been in progress. */
	if (!test_clear_page_writeback(page))
		BUG();
	/* Order the PG_writeback clear before waking the waiters below. */
	smp_mb__after_atomic();
	wake_up_page(page, PG_writeback);
}
EXPORT_SYMBOL(end_page_writeback);
57d998456   Matthew Wilcox   fs/mpage.c: facto...
1304
1305
1306
1307
  /*
   * After completing I/O on a page, call this routine to update the page
   * flags appropriately
   */
c11f0c0b5   Jens Axboe   block/mm: make bd...
1308
  void page_endio(struct page *page, bool is_write, int err)
57d998456   Matthew Wilcox   fs/mpage.c: facto...
1309
  {
c11f0c0b5   Jens Axboe   block/mm: make bd...
1310
  	if (!is_write) {
57d998456   Matthew Wilcox   fs/mpage.c: facto...
1311
1312
1313
1314
1315
1316
1317
  		if (!err) {
  			SetPageUptodate(page);
  		} else {
  			ClearPageUptodate(page);
  			SetPageError(page);
  		}
  		unlock_page(page);
abf545484   Mike Christie   mm/block: convert...
1318
  	} else {
57d998456   Matthew Wilcox   fs/mpage.c: facto...
1319
  		if (err) {
dd8416c47   Minchan Kim   mm: do not access...
1320
  			struct address_space *mapping;
57d998456   Matthew Wilcox   fs/mpage.c: facto...
1321
  			SetPageError(page);
dd8416c47   Minchan Kim   mm: do not access...
1322
1323
1324
  			mapping = page_mapping(page);
  			if (mapping)
  				mapping_set_error(mapping, err);
57d998456   Matthew Wilcox   fs/mpage.c: facto...
1325
1326
1327
1328
1329
  		}
  		end_page_writeback(page);
  	}
  }
  EXPORT_SYMBOL_GPL(page_endio);
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
1330
1331
  /**
   * __lock_page - get a lock on the page, assuming we need to sleep to get it
870667553   Randy Dunlap   mm: fix filemap.c...
1332
   * @__page: the page to lock
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1333
   */
629060270   Nicholas Piggin   mm: add PageWaite...
1334
  void __lock_page(struct page *__page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1335
  {
629060270   Nicholas Piggin   mm: add PageWaite...
1336
1337
  	struct page *page = compound_head(__page);
  	wait_queue_head_t *q = page_waitqueue(page);
9a1ea439b   Hugh Dickins   mm: put_and_wait_...
1338
1339
  	wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE,
  				EXCLUSIVE);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1340
1341
  }
  EXPORT_SYMBOL(__lock_page);
629060270   Nicholas Piggin   mm: add PageWaite...
1342
  int __lock_page_killable(struct page *__page)
2687a3569   Matthew Wilcox   Add lock_page_kil...
1343
  {
629060270   Nicholas Piggin   mm: add PageWaite...
1344
1345
  	struct page *page = compound_head(__page);
  	wait_queue_head_t *q = page_waitqueue(page);
9a1ea439b   Hugh Dickins   mm: put_and_wait_...
1346
1347
  	return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE,
  					EXCLUSIVE);
2687a3569   Matthew Wilcox   Add lock_page_kil...
1348
  }
18bc0bbd1   Evgeniy Polyakov   Staging: pohmelfs...
1349
  EXPORT_SYMBOL_GPL(__lock_page_killable);
2687a3569   Matthew Wilcox   Add lock_page_kil...
1350

9a95f3cf7   Paul Cassella   mm: describe mmap...
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
  /*
   * Return values:
   * 1 - page is locked; mmap_sem is still held.
   * 0 - page is not locked.
   *     mmap_sem has been released (up_read()), unless flags had both
   *     FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
   *     which case mmap_sem is still held.
   *
   * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
   * with the page locked and the mmap_sem unperturbed.
   */
d065bd810   Michel Lespinasse   mm: retry page fa...
1362
1363
1364
  int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
  			 unsigned int flags)
  {
37b23e052   KOSAKI Motohiro   x86,mm: make page...
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
  	if (flags & FAULT_FLAG_ALLOW_RETRY) {
  		/*
  		 * CAUTION! In this case, mmap_sem is not released
  		 * even though return 0.
  		 */
  		if (flags & FAULT_FLAG_RETRY_NOWAIT)
  			return 0;
  
  		up_read(&mm->mmap_sem);
  		if (flags & FAULT_FLAG_KILLABLE)
  			wait_on_page_locked_killable(page);
  		else
318b275fb   Gleb Natapov   mm: allow GUP to ...
1377
  			wait_on_page_locked(page);
d065bd810   Michel Lespinasse   mm: retry page fa...
1378
  		return 0;
37b23e052   KOSAKI Motohiro   x86,mm: make page...
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
  	} else {
  		if (flags & FAULT_FLAG_KILLABLE) {
  			int ret;
  
  			ret = __lock_page_killable(page);
  			if (ret) {
  				up_read(&mm->mmap_sem);
  				return 0;
  			}
  		} else
  			__lock_page(page);
  		return 1;
d065bd810   Michel Lespinasse   mm: retry page fa...
1391
1392
  	}
  }
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
1393
  /**
0d3f92966   Matthew Wilcox   page cache: Conve...
1394
1395
1396
1397
   * page_cache_next_miss() - Find the next gap in the page cache.
   * @mapping: Mapping.
   * @index: Index.
   * @max_scan: Maximum range to search.
e7b563bb2   Johannes Weiner   mm: filemap: move...
1398
   *
0d3f92966   Matthew Wilcox   page cache: Conve...
1399
1400
   * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
   * gap with the lowest index.
e7b563bb2   Johannes Weiner   mm: filemap: move...
1401
   *
0d3f92966   Matthew Wilcox   page cache: Conve...
1402
1403
1404
1405
1406
   * This function may be called under the rcu_read_lock.  However, this will
   * not atomically search a snapshot of the cache at a single point in time.
   * For example, if a gap is created at index 5, then subsequently a gap is
   * created at index 10, page_cache_next_miss covering both indices may
   * return 10 if called under the rcu_read_lock.
e7b563bb2   Johannes Weiner   mm: filemap: move...
1407
   *
0d3f92966   Matthew Wilcox   page cache: Conve...
1408
1409
1410
   * Return: The index of the gap if found, otherwise an index outside the
   * range specified (in which case 'return - index >= max_scan' will be true).
   * In the rare case of index wrap-around, 0 will be returned.
e7b563bb2   Johannes Weiner   mm: filemap: move...
1411
   */
0d3f92966   Matthew Wilcox   page cache: Conve...
1412
  pgoff_t page_cache_next_miss(struct address_space *mapping,
e7b563bb2   Johannes Weiner   mm: filemap: move...
1413
1414
  			     pgoff_t index, unsigned long max_scan)
  {
0d3f92966   Matthew Wilcox   page cache: Conve...
1415
  	XA_STATE(xas, &mapping->i_pages, index);
e7b563bb2   Johannes Weiner   mm: filemap: move...
1416

0d3f92966   Matthew Wilcox   page cache: Conve...
1417
1418
1419
  	while (max_scan--) {
  		void *entry = xas_next(&xas);
  		if (!entry || xa_is_value(entry))
e7b563bb2   Johannes Weiner   mm: filemap: move...
1420
  			break;
0d3f92966   Matthew Wilcox   page cache: Conve...
1421
  		if (xas.xa_index == 0)
e7b563bb2   Johannes Weiner   mm: filemap: move...
1422
1423
  			break;
  	}
0d3f92966   Matthew Wilcox   page cache: Conve...
1424
  	return xas.xa_index;
e7b563bb2   Johannes Weiner   mm: filemap: move...
1425
  }
0d3f92966   Matthew Wilcox   page cache: Conve...
1426
  EXPORT_SYMBOL(page_cache_next_miss);
e7b563bb2   Johannes Weiner   mm: filemap: move...
1427
1428
  
  /**
2346a5605   Laurent Dufour   mm/filemap.c: fix...
1429
   * page_cache_prev_miss() - Find the previous gap in the page cache.
0d3f92966   Matthew Wilcox   page cache: Conve...
1430
1431
1432
   * @mapping: Mapping.
   * @index: Index.
   * @max_scan: Maximum range to search.
e7b563bb2   Johannes Weiner   mm: filemap: move...
1433
   *
0d3f92966   Matthew Wilcox   page cache: Conve...
1434
1435
   * Search the range [max(index - max_scan + 1, 0), index] for the
   * gap with the highest index.
e7b563bb2   Johannes Weiner   mm: filemap: move...
1436
   *
0d3f92966   Matthew Wilcox   page cache: Conve...
1437
1438
1439
1440
1441
   * This function may be called under the rcu_read_lock.  However, this will
   * not atomically search a snapshot of the cache at a single point in time.
   * For example, if a gap is created at index 10, then subsequently a gap is
   * created at index 5, page_cache_prev_miss() covering both indices may
   * return 5 if called under the rcu_read_lock.
e7b563bb2   Johannes Weiner   mm: filemap: move...
1442
   *
0d3f92966   Matthew Wilcox   page cache: Conve...
1443
1444
1445
   * Return: The index of the gap if found, otherwise an index outside the
   * range specified (in which case 'index - return >= max_scan' will be true).
   * In the rare case of wrap-around, ULONG_MAX will be returned.
e7b563bb2   Johannes Weiner   mm: filemap: move...
1446
   */
0d3f92966   Matthew Wilcox   page cache: Conve...
1447
  pgoff_t page_cache_prev_miss(struct address_space *mapping,
e7b563bb2   Johannes Weiner   mm: filemap: move...
1448
1449
  			     pgoff_t index, unsigned long max_scan)
  {
0d3f92966   Matthew Wilcox   page cache: Conve...
1450
  	XA_STATE(xas, &mapping->i_pages, index);
e7b563bb2   Johannes Weiner   mm: filemap: move...
1451

0d3f92966   Matthew Wilcox   page cache: Conve...
1452
1453
1454
  	while (max_scan--) {
  		void *entry = xas_prev(&xas);
  		if (!entry || xa_is_value(entry))
e7b563bb2   Johannes Weiner   mm: filemap: move...
1455
  			break;
0d3f92966   Matthew Wilcox   page cache: Conve...
1456
  		if (xas.xa_index == ULONG_MAX)
e7b563bb2   Johannes Weiner   mm: filemap: move...
1457
1458
  			break;
  	}
0d3f92966   Matthew Wilcox   page cache: Conve...
1459
  	return xas.xa_index;
e7b563bb2   Johannes Weiner   mm: filemap: move...
1460
  }
0d3f92966   Matthew Wilcox   page cache: Conve...
1461
  EXPORT_SYMBOL(page_cache_prev_miss);
e7b563bb2   Johannes Weiner   mm: filemap: move...
1462
1463
  
/**
 * find_get_entry - find and get a page cache entry
 * @mapping: the address_space to search
 * @offset: the page cache index
 *
 * Looks up the page cache slot at @mapping & @offset.  If there is a
 * page cache page, it is returned with an increased refcount.
 *
 * If the slot holds a shadow entry of a previously evicted page, or a
 * swap entry from shmem/tmpfs, it is returned.
 *
 * Return: the found page or shadow entry, %NULL if nothing is found.
 */
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
{
	XA_STATE(xas, &mapping->i_pages, offset);
	struct page *page;

	rcu_read_lock();
repeat:
	xas_reset(&xas);
	page = xas_load(&xas);
	if (xas_retry(&xas, page))
		goto repeat;
	/*
	 * A shadow entry of a recently evicted page, or a swap entry from
	 * shmem/tmpfs.  Return it without attempting to raise page count.
	 */
	if (!page || xa_is_value(page))
		goto out;

	/* Speculative get can fail if the page is being freed; retry. */
	if (!page_cache_get_speculative(page))
		goto repeat;

	/*
	 * Has the page moved or been split?
	 * This is part of the lockless pagecache protocol. See
	 * include/linux/pagemap.h for details.
	 */
	if (unlikely(page != xas_reload(&xas))) {
		put_page(page);
		goto repeat;
	}
	/* For a compound page, return the subpage at @offset. */
	page = find_subpage(page, offset);
out:
	rcu_read_unlock();

	return page;
}
EXPORT_SYMBOL(find_get_entry);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1512

485bb99b4   Randy Dunlap   [PATCH] kernel-do...
1513
  /**
0cd6144aa   Johannes Weiner   mm + fs: prepare ...
1514
1515
1516
1517
1518
1519
1520
1521
   * find_lock_entry - locate, pin and lock a page cache entry
   * @mapping: the address_space to search
   * @offset: the page cache index
   *
   * Looks up the page cache slot at @mapping & @offset.  If there is a
   * page cache page, it is returned locked and with an increased
   * refcount.
   *
139b6a6fb   Johannes Weiner   mm: filemap: upda...
1522
1523
   * If the slot holds a shadow entry of a previously evicted page, or a
   * swap entry from shmem/tmpfs, it is returned.
0cd6144aa   Johannes Weiner   mm + fs: prepare ...
1524
   *
0cd6144aa   Johannes Weiner   mm + fs: prepare ...
1525
   * find_lock_entry() may sleep.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
1526
1527
   *
   * Return: the found page or shadow entry, %NULL if nothing is found.
0cd6144aa   Johannes Weiner   mm + fs: prepare ...
1528
1529
   */
  struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1530
1531
  {
  	struct page *page;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1532
  repeat:
0cd6144aa   Johannes Weiner   mm + fs: prepare ...
1533
  	page = find_get_entry(mapping, offset);
4c7472c0d   Matthew Wilcox   page cache: Conve...
1534
  	if (page && !xa_is_value(page)) {
a60637c85   Nick Piggin   mm: lockless page...
1535
1536
  		lock_page(page);
  		/* Has the page been truncated? */
83929372f   Kirill A. Shutemov   filemap: prepare ...
1537
  		if (unlikely(page_mapping(page) != mapping)) {
a60637c85   Nick Piggin   mm: lockless page...
1538
  			unlock_page(page);
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
1539
  			put_page(page);
a60637c85   Nick Piggin   mm: lockless page...
1540
  			goto repeat;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1541
  		}
83929372f   Kirill A. Shutemov   filemap: prepare ...
1542
  		VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1543
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1544
1545
  	return page;
  }
0cd6144aa   Johannes Weiner   mm + fs: prepare ...
1546
1547
1548
  EXPORT_SYMBOL(find_lock_entry);
  
/**
 * pagecache_get_page - find and get a page reference
 * @mapping: the address_space to search
 * @offset: the page index
 * @fgp_flags: PCG flags
 * @gfp_mask: gfp mask to use for the page cache data page allocation
 *
 * Looks up the page cache slot at @mapping & @offset.
 *
 * PCG flags modify how the page is returned.
 *
 * @fgp_flags can be:
 *
 * - FGP_ACCESSED: the page will be marked accessed
 * - FGP_LOCK: Page is returned locked
 * - FGP_NOWAIT: with FGP_LOCK, trylock instead of sleeping; returns
 *   %NULL if the page cannot be locked immediately
 * - FGP_NOFS: clear __GFP_FS from @gfp_mask for the allocation
 * - FGP_WRITE: add __GFP_WRITE to @gfp_mask if the mapping accounts
 *   for dirty pages
 * - FGP_CREAT: If page is not present then a new page is allocated using
 *   @gfp_mask and added to the page cache and the VM's LRU
 *   list. The page is returned locked and with an increased
 *   refcount.
 * - FGP_FOR_MMAP: Similar to FGP_CREAT, only we want to allow the caller to do
 *   its own locking dance if the page is already in cache, or unlock the page
 *   before returning if we had to add the page to pagecache.
 *
 * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
 * if the GFP flags specified for FGP_CREAT are atomic.
 *
 * If there is a page cache page, it is returned with an increased refcount.
 *
 * Return: the found page or %NULL otherwise.
 */
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
	int fgp_flags, gfp_t gfp_mask)
{
	struct page *page;

repeat:
	page = find_get_entry(mapping, offset);
	/* Shadow/swap entries are treated as "not present" here. */
	if (xa_is_value(page))
		page = NULL;
	if (!page)
		goto no_page;

	if (fgp_flags & FGP_LOCK) {
		if (fgp_flags & FGP_NOWAIT) {
			if (!trylock_page(page)) {
				put_page(page);
				return NULL;
			}
		} else {
			lock_page(page);
		}

		/* Has the page been truncated? */
		if (unlikely(compound_head(page)->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto repeat;
		}
		VM_BUG_ON_PAGE(page->index != offset, page);
	}

	if (fgp_flags & FGP_ACCESSED)
		mark_page_accessed(page);

no_page:
	if (!page && (fgp_flags & FGP_CREAT)) {
		int err;
		if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
			gfp_mask |= __GFP_WRITE;
		if (fgp_flags & FGP_NOFS)
			gfp_mask &= ~__GFP_FS;

		page = __page_cache_alloc(gfp_mask);
		if (!page)
			return NULL;

		/* Creation without FGP_LOCK or FGP_FOR_MMAP is a caller bug. */
		if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
			fgp_flags |= FGP_LOCK;

		/* Init accessed so avoid atomic mark_page_accessed later */
		if (fgp_flags & FGP_ACCESSED)
			__SetPageReferenced(page);

		err = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
		if (unlikely(err)) {
			put_page(page);
			page = NULL;
			/* -EEXIST: someone else added it first; look it up. */
			if (err == -EEXIST)
				goto repeat;
		}

		/*
		 * add_to_page_cache_lru locks the page, and for mmap we expect
		 * an unlocked page.
		 */
		if (page && (fgp_flags & FGP_FOR_MMAP))
			unlock_page(page);
	}

	return page;
}
EXPORT_SYMBOL(pagecache_get_page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1648
1649
  
/**
 * find_get_entries - gang pagecache lookup
 * @mapping:	The address_space to search
 * @start:	The starting page cache index
 * @nr_entries:	The maximum number of entries
 * @entries:	Where the resulting entries are placed
 * @indices:	The cache indices corresponding to the entries in @entries
 *
 * find_get_entries() will search for and return a group of up to
 * @nr_entries entries in the mapping.  The entries are placed at
 * @entries.  find_get_entries() takes a reference against any actual
 * pages it returns.
 *
 * The search returns a group of mapping-contiguous page cache entries
 * with ascending indexes.  There may be holes in the indices due to
 * not-present pages.
 *
 * Any shadow entries of evicted pages, or swap entries from
 * shmem/tmpfs, are included in the returned array.
 *
 * Return: the number of pages and shadow entries which were found.
 */
unsigned find_get_entries(struct address_space *mapping,
			  pgoff_t start, unsigned int nr_entries,
			  struct page **entries, pgoff_t *indices)
{
	XA_STATE(xas, &mapping->i_pages, start);
	struct page *page;
	unsigned int ret = 0;

	if (!nr_entries)
		return 0;

	rcu_read_lock();
	xas_for_each(&xas, page, ULONG_MAX) {
		if (xas_retry(&xas, page))
			continue;
		/*
		 * A shadow entry of a recently evicted page, a swap
		 * entry from shmem/tmpfs or a DAX entry.  Return it
		 * without attempting to raise page count.
		 */
		if (xa_is_value(page))
			goto export;

		/* Speculative get can fail if the page is being freed. */
		if (!page_cache_get_speculative(page))
			goto retry;

		/* Has the page moved or been split? */
		if (unlikely(page != xas_reload(&xas)))
			goto put_page;
		page = find_subpage(page, xas.xa_index);

export:
		indices[ret] = xas.xa_index;
		entries[ret] = page;
		if (++ret == nr_entries)
			break;
		continue;
put_page:
		put_page(page);
retry:
		/* Walk this slot again from scratch. */
		xas_reset(&xas);
	}
	rcu_read_unlock();
	return ret;
}
  
  /**
b947cee4b   Jan Kara   mm: implement fin...
1718
   * find_get_pages_range - gang pagecache lookup
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1719
1720
   * @mapping:	The address_space to search
   * @start:	The starting page index
b947cee4b   Jan Kara   mm: implement fin...
1721
   * @end:	The final page index (inclusive)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1722
1723
1724
   * @nr_pages:	The maximum number of pages
   * @pages:	Where the resulting pages are placed
   *
b947cee4b   Jan Kara   mm: implement fin...
1725
1726
1727
1728
   * find_get_pages_range() will search for and return a group of up to @nr_pages
   * pages in the mapping starting at index @start and up to index @end
   * (inclusive).  The pages are placed at @pages.  find_get_pages_range() takes
   * a reference against the returned pages.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1729
1730
1731
   *
   * The search returns a group of mapping-contiguous pages with ascending
   * indexes.  There may be holes in the indices due to not-present pages.
d72dc8a25   Jan Kara   mm: make pagevec_...
1732
   * We also update @start to index the next page for the traversal.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1733
   *
a862f68a8   Mike Rapoport   docs/core-api/mm:...
1734
1735
   * Return: the number of pages which were found. If this number is
   * smaller than @nr_pages, the end of specified range has been
b947cee4b   Jan Kara   mm: implement fin...
1736
   * reached.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1737
   */
b947cee4b   Jan Kara   mm: implement fin...
1738
1739
1740
  unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
  			      pgoff_t end, unsigned int nr_pages,
  			      struct page **pages)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1741
  {
fd1b3cee2   Matthew Wilcox   page cache: Conve...
1742
1743
  	XA_STATE(xas, &mapping->i_pages, *start);
  	struct page *page;
0fc9d1040   Konstantin Khlebnikov   radix-tree: use i...
1744
1745
1746
1747
  	unsigned ret = 0;
  
  	if (unlikely(!nr_pages))
  		return 0;
a60637c85   Nick Piggin   mm: lockless page...
1748
1749
  
  	rcu_read_lock();
fd1b3cee2   Matthew Wilcox   page cache: Conve...
1750
  	xas_for_each(&xas, page, end) {
fd1b3cee2   Matthew Wilcox   page cache: Conve...
1751
  		if (xas_retry(&xas, page))
a60637c85   Nick Piggin   mm: lockless page...
1752
  			continue;
fd1b3cee2   Matthew Wilcox   page cache: Conve...
1753
1754
  		/* Skip over shadow, swap and DAX entries */
  		if (xa_is_value(page))
8079b1c85   Hugh Dickins   mm: clarify the r...
1755
  			continue;
a60637c85   Nick Piggin   mm: lockless page...
1756

4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
1757
  		if (!page_cache_get_speculative(page))
fd1b3cee2   Matthew Wilcox   page cache: Conve...
1758
  			goto retry;
83929372f   Kirill A. Shutemov   filemap: prepare ...
1759

4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
1760
  		/* Has the page moved or been split? */
fd1b3cee2   Matthew Wilcox   page cache: Conve...
1761
1762
  		if (unlikely(page != xas_reload(&xas)))
  			goto put_page;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1763

4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
1764
  		pages[ret] = find_subpage(page, xas.xa_index);
b947cee4b   Jan Kara   mm: implement fin...
1765
  		if (++ret == nr_pages) {
5d3ee42f8   Yu Zhao   mm/shmem: make fi...
1766
  			*start = xas.xa_index + 1;
b947cee4b   Jan Kara   mm: implement fin...
1767
1768
  			goto out;
  		}
fd1b3cee2   Matthew Wilcox   page cache: Conve...
1769
1770
  		continue;
  put_page:
4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
1771
  		put_page(page);
fd1b3cee2   Matthew Wilcox   page cache: Conve...
1772
1773
  retry:
  		xas_reset(&xas);
a60637c85   Nick Piggin   mm: lockless page...
1774
  	}
5b280c0cc   Hugh Dickins   mm: don't return ...
1775

b947cee4b   Jan Kara   mm: implement fin...
1776
1777
1778
  	/*
  	 * We come here when there is no page beyond @end. We take care to not
  	 * overflow the index @start as it confuses some of the callers. This
fd1b3cee2   Matthew Wilcox   page cache: Conve...
1779
  	 * breaks the iteration when there is a page at index -1 but that is
b947cee4b   Jan Kara   mm: implement fin...
1780
1781
1782
1783
1784
1785
1786
  	 * already broken anyway.
  	 */
  	if (end == (pgoff_t)-1)
  		*start = (pgoff_t)-1;
  	else
  		*start = end + 1;
  out:
a60637c85   Nick Piggin   mm: lockless page...
1787
  	rcu_read_unlock();
d72dc8a25   Jan Kara   mm: make pagevec_...
1788

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1789
1790
  	return ret;
  }
ebf43500e   Jens Axboe   [PATCH] Add find_...
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
  /**
   * find_get_pages_contig - gang contiguous pagecache lookup
   * @mapping:	The address_space to search
   * @index:	The starting page index
   * @nr_pages:	The maximum number of pages
   * @pages:	Where the resulting pages are placed
   *
   * find_get_pages_contig() works exactly like find_get_pages(), except
   * that the returned number of pages are guaranteed to be contiguous.
   *
a862f68a8   Mike Rapoport   docs/core-api/mm:...
1801
   * Return: the number of pages which were found.
ebf43500e   Jens Axboe   [PATCH] Add find_...
1802
1803
1804
1805
   */
  unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
  			       unsigned int nr_pages, struct page **pages)
  {
3ece58a27   Matthew Wilcox   page cache: Conve...
1806
1807
  	XA_STATE(xas, &mapping->i_pages, index);
  	struct page *page;
0fc9d1040   Konstantin Khlebnikov   radix-tree: use i...
1808
1809
1810
1811
  	unsigned int ret = 0;
  
  	if (unlikely(!nr_pages))
  		return 0;
a60637c85   Nick Piggin   mm: lockless page...
1812
1813
  
  	rcu_read_lock();
3ece58a27   Matthew Wilcox   page cache: Conve...
1814
  	for (page = xas_load(&xas); page; page = xas_next(&xas)) {
3ece58a27   Matthew Wilcox   page cache: Conve...
1815
1816
1817
1818
1819
1820
1821
  		if (xas_retry(&xas, page))
  			continue;
  		/*
  		 * If the entry has been swapped out, we can stop looking.
  		 * No current caller is looking for DAX entries.
  		 */
  		if (xa_is_value(page))
8079b1c85   Hugh Dickins   mm: clarify the r...
1822
  			break;
ebf43500e   Jens Axboe   [PATCH] Add find_...
1823

4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
1824
  		if (!page_cache_get_speculative(page))
3ece58a27   Matthew Wilcox   page cache: Conve...
1825
  			goto retry;
83929372f   Kirill A. Shutemov   filemap: prepare ...
1826

4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
1827
  		/* Has the page moved or been split? */
3ece58a27   Matthew Wilcox   page cache: Conve...
1828
1829
  		if (unlikely(page != xas_reload(&xas)))
  			goto put_page;
a60637c85   Nick Piggin   mm: lockless page...
1830

4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
1831
  		pages[ret] = find_subpage(page, xas.xa_index);
0fc9d1040   Konstantin Khlebnikov   radix-tree: use i...
1832
1833
  		if (++ret == nr_pages)
  			break;
3ece58a27   Matthew Wilcox   page cache: Conve...
1834
1835
  		continue;
  put_page:
4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
1836
  		put_page(page);
3ece58a27   Matthew Wilcox   page cache: Conve...
1837
1838
  retry:
  		xas_reset(&xas);
ebf43500e   Jens Axboe   [PATCH] Add find_...
1839
  	}
a60637c85   Nick Piggin   mm: lockless page...
1840
1841
  	rcu_read_unlock();
  	return ret;
ebf43500e   Jens Axboe   [PATCH] Add find_...
1842
  }
ef71c15c4   David Howells   AFS: export a cou...
1843
  EXPORT_SYMBOL(find_get_pages_contig);
ebf43500e   Jens Axboe   [PATCH] Add find_...
1844

485bb99b4   Randy Dunlap   [PATCH] kernel-do...
1845
  /**
72b045aec   Jan Kara   mm: implement fin...
1846
   * find_get_pages_range_tag - find and return pages in given range matching @tag
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
1847
1848
   * @mapping:	the address_space to search
   * @index:	the starting page index
72b045aec   Jan Kara   mm: implement fin...
1849
   * @end:	The final page index (inclusive)
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
1850
1851
1852
1853
   * @tag:	the tag index
   * @nr_pages:	the maximum number of pages
   * @pages:	where the resulting pages are placed
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1854
   * Like find_get_pages, except we only return pages which are tagged with
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
1855
   * @tag.   We update @index to index the next page for the traversal.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
1856
1857
   *
   * Return: the number of pages which were found.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1858
   */
72b045aec   Jan Kara   mm: implement fin...
1859
  unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
a6906972f   Matthew Wilcox   page cache; Conve...
1860
  			pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
72b045aec   Jan Kara   mm: implement fin...
1861
  			struct page **pages)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1862
  {
a6906972f   Matthew Wilcox   page cache; Conve...
1863
1864
  	XA_STATE(xas, &mapping->i_pages, *index);
  	struct page *page;
0fc9d1040   Konstantin Khlebnikov   radix-tree: use i...
1865
1866
1867
1868
  	unsigned ret = 0;
  
  	if (unlikely(!nr_pages))
  		return 0;
a60637c85   Nick Piggin   mm: lockless page...
1869
1870
  
  	rcu_read_lock();
a6906972f   Matthew Wilcox   page cache; Conve...
1871
  	xas_for_each_marked(&xas, page, end, tag) {
a6906972f   Matthew Wilcox   page cache; Conve...
1872
  		if (xas_retry(&xas, page))
a60637c85   Nick Piggin   mm: lockless page...
1873
  			continue;
a6906972f   Matthew Wilcox   page cache; Conve...
1874
1875
1876
1877
1878
1879
  		/*
  		 * Shadow entries should never be tagged, but this iteration
  		 * is lockless so there is a window for page reclaim to evict
  		 * a page we saw tagged.  Skip over it.
  		 */
  		if (xa_is_value(page))
139b6a6fb   Johannes Weiner   mm: filemap: upda...
1880
  			continue;
a60637c85   Nick Piggin   mm: lockless page...
1881

4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
1882
  		if (!page_cache_get_speculative(page))
a6906972f   Matthew Wilcox   page cache; Conve...
1883
  			goto retry;
a60637c85   Nick Piggin   mm: lockless page...
1884

4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
1885
  		/* Has the page moved or been split? */
a6906972f   Matthew Wilcox   page cache; Conve...
1886
1887
  		if (unlikely(page != xas_reload(&xas)))
  			goto put_page;
a60637c85   Nick Piggin   mm: lockless page...
1888

4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
1889
  		pages[ret] = find_subpage(page, xas.xa_index);
72b045aec   Jan Kara   mm: implement fin...
1890
  		if (++ret == nr_pages) {
5d3ee42f8   Yu Zhao   mm/shmem: make fi...
1891
  			*index = xas.xa_index + 1;
72b045aec   Jan Kara   mm: implement fin...
1892
1893
  			goto out;
  		}
a6906972f   Matthew Wilcox   page cache; Conve...
1894
1895
  		continue;
  put_page:
4101196b1   Matthew Wilcox (Oracle)   mm: page cache: s...
1896
  		put_page(page);
a6906972f   Matthew Wilcox   page cache; Conve...
1897
1898
  retry:
  		xas_reset(&xas);
a60637c85   Nick Piggin   mm: lockless page...
1899
  	}
5b280c0cc   Hugh Dickins   mm: don't return ...
1900

72b045aec   Jan Kara   mm: implement fin...
1901
  	/*
a6906972f   Matthew Wilcox   page cache; Conve...
1902
  	 * We come here when we got to @end. We take care to not overflow the
72b045aec   Jan Kara   mm: implement fin...
1903
  	 * index @index as it confuses some of the callers. This breaks the
a6906972f   Matthew Wilcox   page cache; Conve...
1904
1905
  	 * iteration when there is a page at index -1 but that is already
  	 * broken anyway.
72b045aec   Jan Kara   mm: implement fin...
1906
1907
1908
1909
1910
1911
  	 */
  	if (end == (pgoff_t)-1)
  		*index = (pgoff_t)-1;
  	else
  		*index = end + 1;
  out:
a60637c85   Nick Piggin   mm: lockless page...
1912
  	rcu_read_unlock();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1913

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1914
1915
  	return ret;
  }
72b045aec   Jan Kara   mm: implement fin...
1916
  EXPORT_SYMBOL(find_get_pages_range_tag);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1917

76d42bd96   Wu Fengguang   [PATCH] readahead...
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
/*
 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
 * a _large_ part of the i/o request. Imagine the worst scenario:
 *
 *      ---R__________________________________________B__________
 *         ^ reading here                             ^ bad block(assume 4k)
 *
 * read(R) => miss => readahead(R...B) => media error => frustrating retries
 * => failing the whole request => read(R) => read(R+1) =>
 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
 *
 * It is going insane. Fix it by quickly scaling down the readahead size.
 */
static void shrink_readahead_size_eio(struct file *filp,
					struct file_ra_state *ra)
{
	/* Quarter the readahead window so repeated media errors hurt less. */
	ra->ra_pages /= 4;
}
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
1938
/**
 * generic_file_buffered_read - generic file read routine
 * @iocb:	the iocb to read
 * @iter:	data destination
 * @written:	already copied
 *
 * This is a generic file read routine, and uses the
 * mapping->a_ops->readpage() function for the actual low-level stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 *
 * Return:
 * * total number of bytes copied, including those the were already @written
 * * negative error code if nothing was copied
 */
static ssize_t generic_file_buffered_read(struct kiocb *iocb,
		struct iov_iter *iter, ssize_t written)
{
	struct file *filp = iocb->ki_filp;
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;
	struct file_ra_state *ra = &filp->f_ra;
	loff_t *ppos = &iocb->ki_pos;
	pgoff_t index;
	pgoff_t last_index;
	pgoff_t prev_index;
	unsigned long offset;      /* offset into pagecache page */
	unsigned int prev_offset;
	int error = 0;

	/* Reads entirely beyond the filesystem's size limit return 0 (EOF). */
	if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
		return 0;
	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);

	index = *ppos >> PAGE_SHIFT;
	prev_index = ra->prev_pos >> PAGE_SHIFT;
	prev_offset = ra->prev_pos & (PAGE_SIZE-1);
	last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
	offset = *ppos & ~PAGE_MASK;

	/*
	 * Main per-page loop: locate (or create and read) each page in
	 * turn and copy its uptodate contents into @iter.
	 */
	for (;;) {
		struct page *page;
		pgoff_t end_index;
		loff_t isize;
		unsigned long nr, ret;

		cond_resched();
find_page:
		if (fatal_signal_pending(current)) {
			error = -EINTR;
			goto out;
		}

		page = find_get_page(mapping, index);
		if (!page) {
			if (iocb->ki_flags & IOCB_NOWAIT)
				goto would_block;
			page_cache_sync_readahead(mapping,
					ra, filp,
					index, last_index - index);
			page = find_get_page(mapping, index);
			if (unlikely(page == NULL))
				goto no_cached_page;
		}
		if (PageReadahead(page)) {
			page_cache_async_readahead(mapping,
					ra, filp, page,
					index, last_index - index);
		}
		if (!PageUptodate(page)) {
			if (iocb->ki_flags & IOCB_NOWAIT) {
				put_page(page);
				goto would_block;
			}

			/*
			 * See comment in do_read_cache_page on why
			 * wait_on_page_locked is used to avoid unnecessarily
			 * serialisations and why it's safe.
			 */
			error = wait_on_page_locked_killable(page);
			if (unlikely(error))
				goto readpage_error;
			if (PageUptodate(page))
				goto page_ok;

			if (inode->i_blkbits == PAGE_SHIFT ||
					!mapping->a_ops->is_partially_uptodate)
				goto page_not_up_to_date;
			/* pipes can't handle partially uptodate pages */
			if (unlikely(iov_iter_is_pipe(iter)))
				goto page_not_up_to_date;
			if (!trylock_page(page))
				goto page_not_up_to_date;
			/* Did it get truncated before we got the lock? */
			if (!page->mapping)
				goto page_not_up_to_date_locked;
			if (!mapping->a_ops->is_partially_uptodate(page,
							offset, iter->count))
				goto page_not_up_to_date_locked;
			unlock_page(page);
		}
page_ok:
		/*
		 * i_size must be checked after we know the page is Uptodate.
		 *
		 * Checking i_size after the check allows us to calculate
		 * the correct value for "nr", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */

		isize = i_size_read(inode);
		end_index = (isize - 1) >> PAGE_SHIFT;
		if (unlikely(!isize || index > end_index)) {
			put_page(page);
			goto out;
		}

		/* nr is the maximum number of bytes to copy from this page */
		nr = PAGE_SIZE;
		if (index == end_index) {
			nr = ((isize - 1) & ~PAGE_MASK) + 1;
			if (nr <= offset) {
				put_page(page);
				goto out;
			}
		}
		nr = nr - offset;

		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		/*
		 * When a sequential read accesses a page several times,
		 * only mark it as accessed the first time.
		 */
		if (prev_index != index || offset != prev_offset)
			mark_page_accessed(page);
		prev_index = index;

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 */

		ret = copy_page_to_iter(page, offset, nr, iter);
		offset += ret;
		index += offset >> PAGE_SHIFT;
		offset &= ~PAGE_MASK;
		prev_offset = offset;

		put_page(page);
		written += ret;
		if (!iov_iter_count(iter))
			goto out;
		if (ret < nr) {
			/* Short copy: the iter's user buffer faulted. */
			error = -EFAULT;
			goto out;
		}
		continue;

page_not_up_to_date:
		/* Get exclusive access to the page ... */
		error = lock_page_killable(page);
		if (unlikely(error))
			goto readpage_error;

page_not_up_to_date_locked:
		/* Did it get truncated before we got the lock? */
		if (!page->mapping) {
			unlock_page(page);
			put_page(page);
			continue;
		}

		/* Did somebody else fill it already? */
		if (PageUptodate(page)) {
			unlock_page(page);
			goto page_ok;
		}

readpage:
		/*
		 * A previous I/O error may have been due to temporary
		 * failures, eg. multipath errors.
		 * PG_error will be set again if readpage fails.
		 */
		ClearPageError(page);
		/* Start the actual read. The read will unlock the page. */
		error = mapping->a_ops->readpage(filp, page);

		if (unlikely(error)) {
			if (error == AOP_TRUNCATED_PAGE) {
				/* Page was dropped by ->readpage; retry lookup. */
				put_page(page);
				error = 0;
				goto find_page;
			}
			goto readpage_error;
		}

		if (!PageUptodate(page)) {
			error = lock_page_killable(page);
			if (unlikely(error))
				goto readpage_error;
			if (!PageUptodate(page)) {
				if (page->mapping == NULL) {
					/*
					 * invalidate_mapping_pages got it
					 */
					unlock_page(page);
					put_page(page);
					goto find_page;
				}
				unlock_page(page);
				/* Persistent read failure: shrink readahead. */
				shrink_readahead_size_eio(filp, ra);
				error = -EIO;
				goto readpage_error;
			}
			unlock_page(page);
		}

		goto page_ok;

readpage_error:
		/* UHHUH! A synchronous read error occurred. Report it */
		put_page(page);
		goto out;

no_cached_page:
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 */
		page = page_cache_alloc(mapping);
		if (!page) {
			error = -ENOMEM;
			goto out;
		}
		error = add_to_page_cache_lru(page, mapping, index,
				mapping_gfp_constraint(mapping, GFP_KERNEL));
		if (error) {
			put_page(page);
			if (error == -EEXIST) {
				/* Someone else added the page; retry lookup. */
				error = 0;
				goto find_page;
			}
			goto out;
		}
		goto readpage;
	}

would_block:
	error = -EAGAIN;
out:
	/* Record the read position for the next readahead decision. */
	ra->prev_pos = prev_index;
	ra->prev_pos <<= PAGE_SHIFT;
	ra->prev_pos |= prev_offset;

	*ppos = ((loff_t)index << PAGE_SHIFT) + offset;
	file_accessed(filp);
	return written ? written : error;
}
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
2198
/**
 * generic_file_read_iter - generic filesystem read routine
 * @iocb:	kernel I/O control block
 * @iter:	destination for the data read
 *
 * This is the "read_iter()" routine for all filesystems
 * that can use the page cache directly.
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code if nothing was read
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	size_t count = iov_iter_count(iter);
	ssize_t retval = 0;

	if (!count)
		goto out; /* skip atime */

	if (iocb->ki_flags & IOCB_DIRECT) {
		struct file *file = iocb->ki_filp;
		struct address_space *mapping = file->f_mapping;
		struct inode *inode = mapping->host;
		loff_t size;

		size = i_size_read(inode);
		/*
		 * For O_DIRECT, dirty pagecache in the range must be written
		 * back first; with IOCB_NOWAIT we cannot wait for that, so
		 * bail out if any page is present in the range.
		 */
		if (iocb->ki_flags & IOCB_NOWAIT) {
			if (filemap_range_has_page(mapping, iocb->ki_pos,
						   iocb->ki_pos + count - 1))
				return -EAGAIN;
		} else {
			retval = filemap_write_and_wait_range(mapping,
						iocb->ki_pos,
					        iocb->ki_pos + count - 1);
			if (retval < 0)
				goto out;
		}

		file_accessed(file);
		retval = mapping->a_ops->direct_IO(iocb, iter);
		if (retval >= 0) {
			iocb->ki_pos += retval;
			count -= retval;
		}
		/* Undo any iter advance beyond what direct_IO consumed. */
		iov_iter_revert(iter, count - iov_iter_count(iter));

		/*
		 * Btrfs can have a short DIO read if we encounter
		 * compressed extents, so if there was an error, or if
		 * we've already read everything we wanted to, or if
		 * there was a short read because we hit EOF, go ahead
		 * and return.  Otherwise fallthrough to buffered io for
		 * the rest of the read.  Buffered reads will not work for
		 * DAX files, so don't bother trying.
		 */
		if (retval < 0 || !count || iocb->ki_pos >= size ||
		    IS_DAX(inode))
			goto out;
	}

	retval = generic_file_buffered_read(iocb, iter, retval);
out:
	return retval;
}
EXPORT_SYMBOL(generic_file_read_iter);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2263

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2264
  #ifdef CONFIG_MMU
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2265
  #define MMAP_LOTSAMISS  (100)
/*
 * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem
 * @vmf - the vm_fault for this fault.
 * @page - the page to lock.
 * @fpin - the pointer to the file we may pin (or is already pinned).
 *
 * This works similar to lock_page_or_retry in that it can drop the mmap_sem.
 * It differs in that it actually returns the page locked if it returns 1 and 0
 * if it couldn't lock the page.  If we did have to drop the mmap_sem then fpin
 * will point to the pinned file and needs to be fput()'ed at a later point.
 */
static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
				     struct file **fpin)
{
	/* Fast path: uncontended page lock, nothing to drop. */
	if (trylock_page(page))
		return 1;

	/*
	 * NOTE! This will make us return with VM_FAULT_RETRY, but with
	 * the mmap_sem still held. That's how FAULT_FLAG_RETRY_NOWAIT
	 * is supposed to work. We have way too many special cases..
	 */
	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
		return 0;

	/* Pin the file and (if the fault flags allow) release mmap_sem. */
	*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
	if (vmf->flags & FAULT_FLAG_KILLABLE) {
		if (__lock_page_killable(page)) {
			/*
			 * We didn't have the right flags to drop the mmap_sem,
			 * but all fault_handlers only check for fatal signals
			 * if we return VM_FAULT_RETRY, so we need to drop the
			 * mmap_sem here and return 0 if we don't have a fpin.
			 */
			if (*fpin == NULL)
				up_read(&vmf->vma->vm_mm->mmap_sem);
			return 0;
		}
	} else
		__lock_page(page);
	return 1;
}
/*
 * Synchronous readahead happens when we don't even find a page in the page
 * cache at all.  We don't want to perform IO under the mmap sem, so if we have
 * to drop the mmap sem we return the file that was pinned in order for us to do
 * that.  If we didn't pin a file then we return NULL.  The file that is
 * returned needs to be fput()'ed when we're done with it.
 */
static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
{
	struct file *file = vmf->vma->vm_file;
	struct file_ra_state *ra = &file->f_ra;
	struct address_space *mapping = file->f_mapping;
	struct file *fpin = NULL;
	pgoff_t offset = vmf->pgoff;

	/* If we don't want any read-ahead, don't bother */
	if (vmf->vma->vm_flags & VM_RAND_READ)
		return fpin;
	if (!ra->ra_pages)
		return fpin;

	/* Sequential hint: read a full window ahead of the faulting page. */
	if (vmf->vma->vm_flags & VM_SEQ_READ) {
		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
		page_cache_sync_readahead(mapping, ra, file, offset,
					  ra->ra_pages);
		return fpin;
	}

	/* Avoid banging the cache line if not needed */
	if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
		ra->mmap_miss++;

	/*
	 * Do we miss much more than hit in this file? If so,
	 * stop bothering with read-ahead. It will only hurt.
	 */
	if (ra->mmap_miss > MMAP_LOTSAMISS)
		return fpin;

	/*
	 * mmap read-around: center the window on the faulting offset,
	 * clamped at file offset 0.
	 */
	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
	ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
	ra->size = ra->ra_pages;
	ra->async_size = ra->ra_pages / 4;
	ra_submit(ra, mapping, file);
	return fpin;
}
  
/*
 * Asynchronous readahead happens when we find the page and PG_readahead,
 * so we want to possibly extend the readahead further.  We return the file that
 * was pinned if we have to drop the mmap_sem in order to do IO.
 */
static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
					    struct page *page)
{
	struct file *file = vmf->vma->vm_file;
	struct file_ra_state *ra = &file->f_ra;
	struct address_space *mapping = file->f_mapping;
	struct file *fpin = NULL;
	pgoff_t offset = vmf->pgoff;

	/* If we don't want any read-ahead, don't bother */
	if (vmf->vma->vm_flags & VM_RAND_READ)
		return fpin;
	/* A cache hit: decay the miss counter used by the sync path. */
	if (ra->mmap_miss > 0)
		ra->mmap_miss--;
	/* PG_readahead marks the trigger page for extending the window. */
	if (PageReadahead(page)) {
		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
		page_cache_async_readahead(mapping, ra, file,
					   page, offset, ra->ra_pages);
	}
	return fpin;
}
/**
 * filemap_fault - read in file data for page fault handling
 * @vmf:	struct vm_fault containing details of the fault
 *
 * filemap_fault() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * vma->vm_mm->mmap_sem must be held on entry.
 *
 * If our return value has VM_FAULT_RETRY set, it's because the mmap_sem
 * may be dropped before doing I/O or by lock_page_maybe_drop_mmap().
 *
 * If our return value does not have VM_FAULT_RETRY set, the mmap_sem
 * has not been released.
 *
 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
 *
 * Return: bitwise-OR of %VM_FAULT_ codes.
 */
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
	int error;
	struct file *file = vmf->vma->vm_file;
	struct file *fpin = NULL;
	struct address_space *mapping = file->f_mapping;
	struct file_ra_state *ra = &file->f_ra;
	struct inode *inode = mapping->host;
	pgoff_t offset = vmf->pgoff;
	pgoff_t max_off;
	struct page *page;
	vm_fault_t ret = 0;

	/* Faulting beyond EOF is a bus error, checked again under lock below. */
	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	if (unlikely(offset >= max_off))
		return VM_FAULT_SIGBUS;

	/*
	 * Do we have something in the page cache already?
	 */
	page = find_get_page(mapping, offset);
	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
		/*
		 * We found the page, so try async readahead before
		 * waiting for the lock.
		 */
		fpin = do_async_mmap_readahead(vmf, page);
	} else if (!page) {
		/* No page in the page cache at all */
		count_vm_event(PGMAJFAULT);
		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
		ret = VM_FAULT_MAJOR;
		fpin = do_sync_mmap_readahead(vmf);
retry_find:
		page = pagecache_get_page(mapping, offset,
					  FGP_CREAT|FGP_FOR_MMAP,
					  vmf->gfp_mask);
		if (!page) {
			/*
			 * If we already dropped the mmap_sem we must report
			 * RETRY rather than an allocation failure.
			 */
			if (fpin)
				goto out_retry;
			return vmf_error(-ENOMEM);
		}
	}

	if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
		goto out_retry;

	/* Did it get truncated? */
	if (unlikely(compound_head(page)->mapping != mapping)) {
		unlock_page(page);
		put_page(page);
		goto retry_find;
	}
	VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);

	/*
	 * We have a locked page in the page cache, now we need to check
	 * that it's up-to-date. If not, it is going to be due to an error.
	 */
	if (unlikely(!PageUptodate(page)))
		goto page_not_uptodate;

	/*
	 * We've made it this far and we had to drop our mmap_sem, now is the
	 * time to return to the upper layer and have it re-find the vma and
	 * redo the fault.
	 */
	if (fpin) {
		unlock_page(page);
		goto out_retry;
	}

	/*
	 * Found the page and have a reference on it.
	 * We must recheck i_size under page lock.
	 */
	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	if (unlikely(offset >= max_off)) {
		unlock_page(page);
		put_page(page);
		return VM_FAULT_SIGBUS;
	}

	/* Hand the locked, uptodate page to the fault core. */
	vmf->page = page;
	return ret | VM_FAULT_LOCKED;

page_not_uptodate:
	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	ClearPageError(page);
	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
	error = mapping->a_ops->readpage(file, page);
	if (!error) {
		wait_on_page_locked(page);
		if (!PageUptodate(page))
			error = -EIO;
	}
	if (fpin)
		goto out_retry;
	put_page(page);

	if (!error || error == AOP_TRUNCATED_PAGE)
		goto retry_find;

	/* Things didn't work out. Return zero to tell the mm layer so. */
	shrink_readahead_size_eio(file, ra);
	return VM_FAULT_SIGBUS;

out_retry:
	/*
	 * We dropped the mmap_sem, we need to return to the fault handler to
	 * re-find the vma and come back and find our hopefully still populated
	 * page.
	 */
	if (page)
		put_page(page);
	if (fpin)
		fput(fpin);
	return ret | VM_FAULT_RETRY;
}
EXPORT_SYMBOL(filemap_fault);
/*
 * filemap_map_pages - map a range of already-cached pages around a fault
 * @vmf:	the vm_fault, with ->address/->pte positioned at start_pgoff
 * @start_pgoff: first page offset to try to map
 * @end_pgoff:	last page offset (inclusive) to try to map
 *
 * Opportunistically sets up PTEs for pages already resident and uptodate in
 * the page cache, walking the XArray under RCU.  Pages that are locked,
 * stale, poisoned, or contended are simply skipped - faulting them in is
 * left to filemap_fault().
 */
void filemap_map_pages(struct vm_fault *vmf,
		pgoff_t start_pgoff, pgoff_t end_pgoff)
{
	struct file *file = vmf->vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	pgoff_t last_pgoff = start_pgoff;
	unsigned long max_idx;
	XA_STATE(xas, &mapping->i_pages, start_pgoff);
	struct page *page;

	rcu_read_lock();
	xas_for_each(&xas, page, end_pgoff) {
		if (xas_retry(&xas, page))
			continue;
		/* Value entries are not struct pages - nothing to map. */
		if (xa_is_value(page))
			goto next;

		/*
		 * Check for a locked page first, as a speculative
		 * reference may adversely influence page migration.
		 */
		if (PageLocked(page))
			goto next;
		if (!page_cache_get_speculative(page))
			goto next;

		/* Has the page moved or been split? */
		if (unlikely(page != xas_reload(&xas)))
			goto skip;
		page = find_subpage(page, xas.xa_index);

		if (!PageUptodate(page) ||
				PageReadahead(page) ||
				PageHWPoison(page))
			goto skip;
		if (!trylock_page(page))
			goto skip;

		/* Re-check under the page lock that nothing changed. */
		if (page->mapping != mapping || !PageUptodate(page))
			goto unlock;

		max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
		if (page->index >= max_idx)
			goto unlock;

		if (file->f_ra.mmap_miss > 0)
			file->f_ra.mmap_miss--;

		/* Advance address/pte by however far the XArray walk jumped. */
		vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
		if (vmf->pte)
			vmf->pte += xas.xa_index - last_pgoff;
		last_pgoff = xas.xa_index;
		if (alloc_set_pte(vmf, NULL, page))
			goto unlock;
		unlock_page(page);
		goto next;
unlock:
		unlock_page(page);
skip:
		put_page(page);
next:
		/* Huge page is mapped? No need to proceed. */
		if (pmd_trans_huge(*vmf->pmd))
			break;
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(filemap_map_pages);
/*
 * Default ->page_mkwrite: make a read-only page cache page writable,
 * dirtying it so writeback during a filesystem freeze cannot miss it.
 * Returns with the page locked (VM_FAULT_LOCKED) on success.
 */
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct inode *inode = file_inode(vmf->vma->vm_file);
	vm_fault_t ret = VM_FAULT_LOCKED;

	/* Pair with sb_end_pagefault(); serialises against fs freezing. */
	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);
	lock_page(page);
	if (page->mapping != inode->i_mapping) {
		/* Page was truncated out from under us - redo the fault. */
		unlock_page(page);
		ret = VM_FAULT_NOPAGE;
		goto out;
	}
	/*
	 * We mark the page dirty already here so that when freeze is in
	 * progress, we are guaranteed that writeback during freezing will
	 * see the dirty page and writeprotect it again.
	 */
	set_page_dirty(page);
	wait_for_stable_page(page);
out:
	sb_end_pagefault(inode->i_sb);
	return ret;
}
2619

f0f37e2f7   Alexey Dobriyan   const: mark struc...
2620
  const struct vm_operations_struct generic_file_vm_ops = {
54cb8821d   Nick Piggin   mm: merge populat...
2621
  	.fault		= filemap_fault,
f1820361f   Kirill A. Shutemov   mm: implement ->m...
2622
  	.map_pages	= filemap_map_pages,
4fcf1c620   Jan Kara   mm: Make default ...
2623
  	.page_mkwrite	= filemap_page_mkwrite,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
  };
  
  /* This is used for a general mmap of a disk file */
  
  int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
  {
  	struct address_space *mapping = file->f_mapping;
  
  	if (!mapping->a_ops->readpage)
  		return -ENOEXEC;
  	file_accessed(file);
  	vma->vm_ops = &generic_file_vm_ops;
  	return 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
  
  /*
   * This is for filesystems which do not implement ->writepage.
   */
  int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
  {
  	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
  		return -EINVAL;
  	return generic_file_mmap(file, vma);
  }
  #else
/* !CONFIG_MMU stub: write faults on file mappings cannot be serviced. */
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
	return VM_FAULT_SIGBUS;
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2653
2654
2655
2656
2657
2658
2659
2660
2661
/* !CONFIG_MMU stub: generic mmap of a disk file is not supported. */
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	return -ENOSYS;
}
/* !CONFIG_MMU stub: see generic_file_mmap() above. */
int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
{
	return -ENOSYS;
}
#endif /* CONFIG_MMU */

/* Exported once, covering both the MMU and !MMU definitions above. */
EXPORT_SYMBOL(filemap_page_mkwrite);
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);
67f9fd91f   Sasha Levin   mm: remove read_c...
2665
2666
2667
2668
2669
  static struct page *wait_on_page_read(struct page *page)
  {
  	if (!IS_ERR(page)) {
  		wait_on_page_locked(page);
  		if (!PageUptodate(page)) {
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2670
  			put_page(page);
67f9fd91f   Sasha Levin   mm: remove read_c...
2671
2672
2673
2674
2675
  			page = ERR_PTR(-EIO);
  		}
  	}
  	return page;
  }
/*
 * Common implementation for read_cache_page() and read_cache_page_gfp():
 * find the page at @index, allocating and reading it (via @filler, or the
 * mapping's ->readpage when @filler is NULL) if absent or not uptodate.
 * Returns the uptodate page with an elevated refcount, or an ERR_PTR().
 */
static struct page *do_read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data,
				gfp_t gfp)
{
	struct page *page;
	int err;
repeat:
	page = find_get_page(mapping, index);
	if (!page) {
		page = __page_cache_alloc(gfp);
		if (!page)
			return ERR_PTR(-ENOMEM);
		err = add_to_page_cache_lru(page, mapping, index, gfp);
		if (unlikely(err)) {
			put_page(page);
			/* -EEXIST: somebody else inserted it first - retry. */
			if (err == -EEXIST)
				goto repeat;
			/* Presumably ENOMEM for xarray node */
			return ERR_PTR(err);
		}

filler:
		if (filler)
			err = filler(data, page);
		else
			err = mapping->a_ops->readpage(data, page);

		if (err < 0) {
			put_page(page);
			return ERR_PTR(err);
		}

		page = wait_on_page_read(page);
		if (IS_ERR(page))
			return page;
		goto out;
	}
	if (PageUptodate(page))
		goto out;

	/*
	 * Page is not up to date and may be locked due one of the following
	 * case a: Page is being filled and the page lock is held
	 * case b: Read/write error clearing the page uptodate status
	 * case c: Truncation in progress (page locked)
	 * case d: Reclaim in progress
	 *
	 * Case a, the page will be up to date when the page is unlocked.
	 *    There is no need to serialise on the page lock here as the page
	 *    is pinned so the lock gives no additional protection. Even if the
	 *    the page is truncated, the data is still valid if PageUptodate as
	 *    it's a race vs truncate race.
	 * Case b, the page will not be up to date
	 * Case c, the page may be truncated but in itself, the data may still
	 *    be valid after IO completes as it's a read vs truncate race. The
	 *    operation must restart if the page is not uptodate on unlock but
	 *    otherwise serialising on page lock to stabilise the mapping gives
	 *    no additional guarantees to the caller as the page lock is
	 *    released before return.
	 * Case d, similar to truncation. If reclaim holds the page lock, it
	 *    will be a race with remove_mapping that determines if the mapping
	 *    is valid on unlock but otherwise the data is valid and there is
	 *    no need to serialise with page lock.
	 *
	 * As the page lock gives no additional guarantee, we optimistically
	 * wait on the page to be unlocked and check if it's up to date and
	 * use the page if it is. Otherwise, the page lock is required to
	 * distinguish between the different cases. The motivation is that we
	 * avoid spurious serialisations and wakeups when multiple processes
	 * wait on the same page for IO to complete.
	 */
	wait_on_page_locked(page);
	if (PageUptodate(page))
		goto out;

	/* Distinguish between all the cases under the safety of the lock */
	lock_page(page);

	/* Case c or d, restart the operation */
	if (!page->mapping) {
		unlock_page(page);
		put_page(page);
		goto repeat;
	}

	/* Someone else locked and filled the page in a very small window */
	if (PageUptodate(page)) {
		unlock_page(page);
		goto out;
	}
	goto filler;
out:
	mark_page_accessed(page);
	return page;
}
/**
 * read_cache_page - read into page cache, fill it if needed
 * @mapping:	the page's address_space
 * @index:	the page index
 * @filler:	function to perform the read
 * @data:	first arg to filler(data, page) function, often left as NULL
 *
 * Read into the page cache. If a page already exists, and PageUptodate() is
 * not set, try to fill the page and wait for it to become unlocked.
 *
 * If the page does not get brought uptodate, return -EIO.
 *
 * Return: up to date page on success, ERR_PTR() on failure.
 */
struct page *read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data)
{
	/* Allocations use the mapping's default GFP mask. */
	return do_read_cache_page(mapping, index, filler, data,
			mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_page);
0531b2aac   Linus Torvalds   mm: add new 'read...
2795
2796
2797
2798
2799
2800
2801
2802
  
  /**
   * read_cache_page_gfp - read into page cache, using specified page allocation flags.
   * @mapping:	the page's address_space
   * @index:	the page index
   * @gfp:	the page allocator flags to use if allocating
   *
   * This is the same as "read_mapping_page(mapping, index, NULL)", but with
e6f67b8c0   Dave Kleikamp   vfs: __read_cache...
2803
   * any new page allocations done using the specified allocation flags.
0531b2aac   Linus Torvalds   mm: add new 'read...
2804
2805
   *
   * If the page does not get brought uptodate, return -EIO.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
2806
2807
   *
   * Return: up to date page on success, ERR_PTR() on failure.
0531b2aac   Linus Torvalds   mm: add new 'read...
2808
2809
2810
2811
2812
   */
  struct page *read_cache_page_gfp(struct address_space *mapping,
  				pgoff_t index,
  				gfp_t gfp)
  {
6c45b4541   Christoph Hellwig   mm/filemap: don't...
2813
  	return do_read_cache_page(mapping, index, NULL, NULL, gfp);
0531b2aac   Linus Torvalds   mm: add new 'read...
2814
2815
  }
  EXPORT_SYMBOL(read_cache_page_gfp);
2f718ffc1   Nick Piggin   mm: buffered writ...
2816
  /*
9fd91a90c   Darrick J. Wong   vfs: strengthen c...
2817
2818
2819
2820
   * Don't operate on ranges the page cache doesn't support, and don't exceed the
   * LFS limits.  If pos is under the limit it becomes a short access.  If it
   * exceeds the limit we return -EFBIG.
   */
9fd91a90c   Darrick J. Wong   vfs: strengthen c...
2821
2822
2823
  static int generic_write_check_limits(struct file *file, loff_t pos,
  				      loff_t *count)
  {
646955cd5   Amir Goldstein   vfs: remove redun...
2824
2825
  	struct inode *inode = file->f_mapping->host;
  	loff_t max_size = inode->i_sb->s_maxbytes;
9fd91a90c   Darrick J. Wong   vfs: strengthen c...
2826
2827
2828
2829
2830
2831
2832
2833
2834
  	loff_t limit = rlimit(RLIMIT_FSIZE);
  
  	if (limit != RLIM_INFINITY) {
  		if (pos >= limit) {
  			send_sig(SIGXFSZ, current, 0);
  			return -EFBIG;
  		}
  		*count = min(*count, limit - pos);
  	}
646955cd5   Amir Goldstein   vfs: remove redun...
2835
2836
2837
2838
2839
2840
2841
2842
2843
  	if (!(file->f_flags & O_LARGEFILE))
  		max_size = MAX_NON_LFS;
  
  	if (unlikely(pos >= max_size))
  		return -EFBIG;
  
  	*count = min(*count, max_size - pos);
  
  	return 0;
9fd91a90c   Darrick J. Wong   vfs: strengthen c...
2844
2845
2846
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2847
2848
   * Performs necessary checks before doing a write
   *
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
2849
   * Can adjust writing position or amount of bytes to write.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2850
2851
2852
   * Returns appropriate error code that caller should return or
   * zero in case that write should be allowed.
   */
3309dd04c   Al Viro   switch generic_wr...
2853
  inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2854
  {
3309dd04c   Al Viro   switch generic_wr...
2855
  	struct file *file = iocb->ki_filp;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2856
  	struct inode *inode = file->f_mapping->host;
9fd91a90c   Darrick J. Wong   vfs: strengthen c...
2857
2858
  	loff_t count;
  	int ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2859

dc617f29d   Darrick J. Wong   vfs: don't allow ...
2860
2861
  	if (IS_SWAPFILE(inode))
  		return -ETXTBSY;
3309dd04c   Al Viro   switch generic_wr...
2862
2863
  	if (!iov_iter_count(from))
  		return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2864

0fa6b005a   Al Viro   generic_write_che...
2865
  	/* FIXME: this is for backwards compatibility with 2.4 */
2ba48ce51   Al Viro   mirror O_APPEND a...
2866
  	if (iocb->ki_flags & IOCB_APPEND)
3309dd04c   Al Viro   switch generic_wr...
2867
  		iocb->ki_pos = i_size_read(inode);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2868

6be96d3ad   Goldwyn Rodrigues   fs: return if dir...
2869
2870
  	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
  		return -EINVAL;
9fd91a90c   Darrick J. Wong   vfs: strengthen c...
2871
2872
2873
2874
  	count = iov_iter_count(from);
  	ret = generic_write_check_limits(file, iocb->ki_pos, &count);
  	if (ret)
  		return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2875

9fd91a90c   Darrick J. Wong   vfs: strengthen c...
2876
  	iov_iter_truncate(from, count);
3309dd04c   Al Viro   switch generic_wr...
2877
  	return iov_iter_count(from);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2878
2879
  }
  EXPORT_SYMBOL(generic_write_checks);
1383a7ed6   Darrick J. Wong   vfs: check file r...
2880
2881
2882
  /*
   * Performs necessary checks before doing a clone.
   *
646955cd5   Amir Goldstein   vfs: remove redun...
2883
   * Can adjust amount of bytes to clone via @req_count argument.
1383a7ed6   Darrick J. Wong   vfs: check file r...
2884
2885
2886
2887
2888
   * Returns appropriate error code that caller should return or
   * zero in case the clone should be allowed.
   */
  int generic_remap_checks(struct file *file_in, loff_t pos_in,
  			 struct file *file_out, loff_t pos_out,
42ec3d4c0   Darrick J. Wong   vfs: make remap_f...
2889
  			 loff_t *req_count, unsigned int remap_flags)
1383a7ed6   Darrick J. Wong   vfs: check file r...
2890
2891
2892
2893
2894
2895
2896
  {
  	struct inode *inode_in = file_in->f_mapping->host;
  	struct inode *inode_out = file_out->f_mapping->host;
  	uint64_t count = *req_count;
  	uint64_t bcount;
  	loff_t size_in, size_out;
  	loff_t bs = inode_out->i_sb->s_blocksize;
9fd91a90c   Darrick J. Wong   vfs: strengthen c...
2897
  	int ret;
1383a7ed6   Darrick J. Wong   vfs: check file r...
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
  
  	/* The start of both ranges must be aligned to an fs block. */
  	if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
  		return -EINVAL;
  
  	/* Ensure offsets don't wrap. */
  	if (pos_in + count < pos_in || pos_out + count < pos_out)
  		return -EINVAL;
  
  	size_in = i_size_read(inode_in);
  	size_out = i_size_read(inode_out);
  
  	/* Dedupe requires both ranges to be within EOF. */
3d28193e1   Darrick J. Wong   vfs: pass remap f...
2911
  	if ((remap_flags & REMAP_FILE_DEDUP) &&
1383a7ed6   Darrick J. Wong   vfs: check file r...
2912
2913
2914
2915
2916
2917
2918
2919
  	    (pos_in >= size_in || pos_in + count > size_in ||
  	     pos_out >= size_out || pos_out + count > size_out))
  		return -EINVAL;
  
  	/* Ensure the infile range is within the infile. */
  	if (pos_in >= size_in)
  		return -EINVAL;
  	count = min(count, size_in - (uint64_t)pos_in);
9fd91a90c   Darrick J. Wong   vfs: strengthen c...
2920
2921
2922
  	ret = generic_write_check_limits(file_out, pos_out, &count);
  	if (ret)
  		return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2923
2924
  
  	/*
1383a7ed6   Darrick J. Wong   vfs: check file r...
2925
2926
2927
2928
2929
  	 * If the user wanted us to link to the infile's EOF, round up to the
  	 * next block boundary for this check.
  	 *
  	 * Otherwise, make sure the count is also block-aligned, having
  	 * already confirmed the starting offsets' block alignment.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2930
  	 */
1383a7ed6   Darrick J. Wong   vfs: check file r...
2931
2932
2933
2934
  	if (pos_in + count == size_in) {
  		bcount = ALIGN(size_in, bs) - pos_in;
  	} else {
  		if (!IS_ALIGNED(count, bs))
eca3654e3   Darrick J. Wong   vfs: enable remap...
2935
  			count = ALIGN_DOWN(count, bs);
1383a7ed6   Darrick J. Wong   vfs: check file r...
2936
  		bcount = count;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2937
  	}
1383a7ed6   Darrick J. Wong   vfs: check file r...
2938
2939
2940
2941
2942
  	/* Don't allow overlapped cloning within the same file. */
  	if (inode_in == inode_out &&
  	    pos_out + bcount > pos_in &&
  	    pos_out < pos_in + bcount)
  		return -EINVAL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2943
  	/*
eca3654e3   Darrick J. Wong   vfs: enable remap...
2944
2945
  	 * We shortened the request but the caller can't deal with that, so
  	 * bounce the request back to userspace.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2946
  	 */
eca3654e3   Darrick J. Wong   vfs: enable remap...
2947
  	if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
1383a7ed6   Darrick J. Wong   vfs: check file r...
2948
  		return -EINVAL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2949

eca3654e3   Darrick J. Wong   vfs: enable remap...
2950
  	*req_count = count;
1383a7ed6   Darrick J. Wong   vfs: check file r...
2951
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2952
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2953

a31713517   Amir Goldstein   vfs: introduce ge...
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
  
  /*
   * Performs common checks before doing a file copy/clone
   * from @file_in to @file_out.
   */
  int generic_file_rw_checks(struct file *file_in, struct file *file_out)
  {
  	struct inode *inode_in = file_inode(file_in);
  	struct inode *inode_out = file_inode(file_out);
  
  	/* Don't copy dirs, pipes, sockets... */
  	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
  		return -EISDIR;
  	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
  		return -EINVAL;
  
  	if (!(file_in->f_mode & FMODE_READ) ||
  	    !(file_out->f_mode & FMODE_WRITE) ||
  	    (file_out->f_flags & O_APPEND))
  		return -EBADF;
  
  	return 0;
  }
96e6e8f4a   Amir Goldstein   vfs: add missing ...
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
  /*
   * Performs necessary checks before doing a file copy
   *
   * Can adjust amount of bytes to copy via @req_count argument.
   * Returns appropriate error code that caller should return or
   * zero in case the copy should be allowed.
   */
  int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
  			     struct file *file_out, loff_t pos_out,
  			     size_t *req_count, unsigned int flags)
  {
  	struct inode *inode_in = file_inode(file_in);
  	struct inode *inode_out = file_inode(file_out);
  	uint64_t count = *req_count;
  	loff_t size_in;
  	int ret;
  
  	ret = generic_file_rw_checks(file_in, file_out);
  	if (ret)
  		return ret;
  
  	/* Don't touch certain kinds of inodes */
  	if (IS_IMMUTABLE(inode_out))
  		return -EPERM;
  
  	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
  		return -ETXTBSY;
  
  	/* Ensure offsets don't wrap. */
  	if (pos_in + count < pos_in || pos_out + count < pos_out)
  		return -EOVERFLOW;
  
  	/* Shorten the copy to EOF */
  	size_in = i_size_read(inode_in);
  	if (pos_in >= size_in)
  		count = 0;
  	else
  		count = min(count, size_in - (uint64_t)pos_in);
  
  	ret = generic_write_check_limits(file_out, pos_out, &count);
  	if (ret)
  		return ret;
  
  	/* Don't allow overlapped copying within the same file. */
  	if (inode_in == inode_out &&
  	    pos_out + count > pos_in &&
  	    pos_out < pos_in + count)
  		return -EINVAL;
  
  	*req_count = count;
  	return 0;
  }
afddba49d   Nick Piggin   fs: introduce wri...
3029
3030
3031
3032
3033
  int pagecache_write_begin(struct file *file, struct address_space *mapping,
  				loff_t pos, unsigned len, unsigned flags,
  				struct page **pagep, void **fsdata)
  {
  	const struct address_space_operations *aops = mapping->a_ops;
4e02ed4b4   Nick Piggin   fs: remove prepar...
3034
  	return aops->write_begin(file, mapping, pos, len, flags,
afddba49d   Nick Piggin   fs: introduce wri...
3035
  							pagep, fsdata);
afddba49d   Nick Piggin   fs: introduce wri...
3036
3037
3038
3039
3040
3041
3042
3043
  }
  EXPORT_SYMBOL(pagecache_write_begin);
  
  int pagecache_write_end(struct file *file, struct address_space *mapping,
  				loff_t pos, unsigned len, unsigned copied,
  				struct page *page, void *fsdata)
  {
  	const struct address_space_operations *aops = mapping->a_ops;
afddba49d   Nick Piggin   fs: introduce wri...
3044

4e02ed4b4   Nick Piggin   fs: remove prepar...
3045
  	return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
afddba49d   Nick Piggin   fs: introduce wri...
3046
3047
  }
  EXPORT_SYMBOL(pagecache_write_end);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3048
  ssize_t
1af5bb491   Christoph Hellwig   filemap: remove t...
3049
  generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3050
3051
3052
3053
  {
  	struct file	*file = iocb->ki_filp;
  	struct address_space *mapping = file->f_mapping;
  	struct inode	*inode = mapping->host;
1af5bb491   Christoph Hellwig   filemap: remove t...
3054
  	loff_t		pos = iocb->ki_pos;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3055
  	ssize_t		written;
a969e903a   Christoph Hellwig   kill generic_file...
3056
3057
  	size_t		write_len;
  	pgoff_t		end;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3058

0c949334a   Al Viro   iov_iter_truncate()
3059
  	write_len = iov_iter_count(from);
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
3060
  	end = (pos + write_len - 1) >> PAGE_SHIFT;
a969e903a   Christoph Hellwig   kill generic_file...
3061

6be96d3ad   Goldwyn Rodrigues   fs: return if dir...
3062
3063
3064
  	if (iocb->ki_flags & IOCB_NOWAIT) {
  		/* If there are pages to writeback, return */
  		if (filemap_range_has_page(inode->i_mapping, pos,
35f12f0f5   zhengbin   mm/filemap: pass ...
3065
  					   pos + write_len - 1))
6be96d3ad   Goldwyn Rodrigues   fs: return if dir...
3066
3067
3068
3069
3070
3071
3072
  			return -EAGAIN;
  	} else {
  		written = filemap_write_and_wait_range(mapping, pos,
  							pos + write_len - 1);
  		if (written)
  			goto out;
  	}
a969e903a   Christoph Hellwig   kill generic_file...
3073
3074
3075
3076
3077
  
  	/*
  	 * After a write we want buffered reads to be sure to go to disk to get
  	 * the new data.  We invalidate clean cached page from the region we're
  	 * about to write.  We do this *before* the write so that we can return
6ccfa806a   Hisashi Hifumi   VFS: fix dio writ...
3078
  	 * without clobbering -EIOCBQUEUED from ->direct_IO().
a969e903a   Christoph Hellwig   kill generic_file...
3079
  	 */
55635ba76   Andrey Ryabinin   fs: fix data inva...
3080
  	written = invalidate_inode_pages2_range(mapping,
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
3081
  					pos >> PAGE_SHIFT, end);
55635ba76   Andrey Ryabinin   fs: fix data inva...
3082
3083
3084
3085
3086
3087
3088
3089
  	/*
  	 * If a page can not be invalidated, return 0 to fall back
  	 * to buffered write.
  	 */
  	if (written) {
  		if (written == -EBUSY)
  			return 0;
  		goto out;
a969e903a   Christoph Hellwig   kill generic_file...
3090
  	}
639a93a52   Al Viro   generic_file_dire...
3091
  	written = mapping->a_ops->direct_IO(iocb, from);
a969e903a   Christoph Hellwig   kill generic_file...
3092
3093
3094
3095
3096
3097
3098
3099
  
  	/*
  	 * Finally, try again to invalidate clean pages which might have been
  	 * cached by non-direct readahead, or faulted in by get_user_pages()
  	 * if the source of the write was an mmap'ed region of the file
  	 * we're writing.  Either one is a pretty crazy thing to do,
  	 * so we don't support it 100%.  If this invalidation
  	 * fails, tough, the write still worked...
332391a99   Lukas Czerner   fs: Fix page cach...
3100
3101
3102
3103
3104
  	 *
  	 * Most of the time we do not need this since dio_complete() will do
  	 * the invalidation for us. However there are some file systems that
  	 * do not end up with dio_complete() being called, so let's not break
  	 * them by removing it completely
a969e903a   Christoph Hellwig   kill generic_file...
3105
  	 */
332391a99   Lukas Czerner   fs: Fix page cach...
3106
3107
3108
  	if (mapping->nrpages)
  		invalidate_inode_pages2_range(mapping,
  					pos >> PAGE_SHIFT, end);
a969e903a   Christoph Hellwig   kill generic_file...
3109

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3110
  	if (written > 0) {
0116651c8   Namhyung Kim   mm: remove tempor...
3111
  		pos += written;
639a93a52   Al Viro   generic_file_dire...
3112
  		write_len -= written;
0116651c8   Namhyung Kim   mm: remove tempor...
3113
3114
  		if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
  			i_size_write(inode, pos);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3115
3116
  			mark_inode_dirty(inode);
  		}
5cb6c6c7e   Al Viro   generic_file_dire...
3117
  		iocb->ki_pos = pos;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3118
  	}
639a93a52   Al Viro   generic_file_dire...
3119
  	iov_iter_revert(from, write_len - iov_iter_count(from));
a969e903a   Christoph Hellwig   kill generic_file...
3120
  out:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3121
3122
3123
  	return written;
  }
  EXPORT_SYMBOL(generic_file_direct_write);
eb2be1893   Nick Piggin   mm: buffered writ...
3124
3125
3126
3127
  /*
   * Find or create a page at the given pagecache position. Return the locked
   * page. This function is specifically for buffered writes.
   */
54566b2c1   Nick Piggin   fs: symlink write...
3128
3129
  struct page *grab_cache_page_write_begin(struct address_space *mapping,
  					pgoff_t index, unsigned flags)
eb2be1893   Nick Piggin   mm: buffered writ...
3130
  {
eb2be1893   Nick Piggin   mm: buffered writ...
3131
  	struct page *page;
bbddabe2e   Johannes Weiner   mm: filemap: only...
3132
  	int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;
0faa70cb0   Johannes Weiner   mm: filemap: pass...
3133

54566b2c1   Nick Piggin   fs: symlink write...
3134
  	if (flags & AOP_FLAG_NOFS)
2457aec63   Mel Gorman   mm: non-atomicall...
3135
3136
3137
  		fgp_flags |= FGP_NOFS;
  
  	page = pagecache_get_page(mapping, index, fgp_flags,
45f87de57   Michal Hocko   mm: get rid of ra...
3138
  			mapping_gfp_mask(mapping));
c585a2678   Steven Rostedt   mm: remove likely...
3139
  	if (page)
2457aec63   Mel Gorman   mm: non-atomicall...
3140
  		wait_for_stable_page(page);
eb2be1893   Nick Piggin   mm: buffered writ...
3141

eb2be1893   Nick Piggin   mm: buffered writ...
3142
3143
  	return page;
  }
54566b2c1   Nick Piggin   fs: symlink write...
3144
  EXPORT_SYMBOL(grab_cache_page_write_begin);
eb2be1893   Nick Piggin   mm: buffered writ...
3145

3b93f911d   Al Viro   export generic_pe...
3146
  ssize_t generic_perform_write(struct file *file,
afddba49d   Nick Piggin   fs: introduce wri...
3147
3148
3149
3150
3151
3152
  				struct iov_iter *i, loff_t pos)
  {
  	struct address_space *mapping = file->f_mapping;
  	const struct address_space_operations *a_ops = mapping->a_ops;
  	long status = 0;
  	ssize_t written = 0;
674b892ed   Nick Piggin   mm: restore KERNE...
3153
  	unsigned int flags = 0;
afddba49d   Nick Piggin   fs: introduce wri...
3154
3155
  	do {
  		struct page *page;
afddba49d   Nick Piggin   fs: introduce wri...
3156
3157
3158
3159
  		unsigned long offset;	/* Offset into pagecache page */
  		unsigned long bytes;	/* Bytes to write to page */
  		size_t copied;		/* Bytes copied from user */
  		void *fsdata;
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
3160
3161
  		offset = (pos & (PAGE_SIZE - 1));
  		bytes = min_t(unsigned long, PAGE_SIZE - offset,
afddba49d   Nick Piggin   fs: introduce wri...
3162
3163
3164
  						iov_iter_count(i));
  
  again:
00a3d660c   Linus Torvalds   Revert "fs: do no...
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
  		/*
  		 * Bring in the user page that we will copy from _first_.
  		 * Otherwise there's a nasty deadlock on copying from the
  		 * same page as we're writing to, without it being marked
  		 * up-to-date.
  		 *
  		 * Not only is this an optimisation, but it is also required
  		 * to check that the address is actually valid, when atomic
  		 * usercopies are used, below.
  		 */
  		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
  			status = -EFAULT;
  			break;
  		}
296291cdd   Jan Kara   mm: make sendfile...
3179
3180
3181
3182
  		if (fatal_signal_pending(current)) {
  			status = -EINTR;
  			break;
  		}
674b892ed   Nick Piggin   mm: restore KERNE...
3183
  		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
afddba49d   Nick Piggin   fs: introduce wri...
3184
  						&page, &fsdata);
2457aec63   Mel Gorman   mm: non-atomicall...
3185
  		if (unlikely(status < 0))
afddba49d   Nick Piggin   fs: introduce wri...
3186
  			break;
931e80e4b   anfei zhou   mm: flush dcache ...
3187
3188
  		if (mapping_writably_mapped(mapping))
  			flush_dcache_page(page);
00a3d660c   Linus Torvalds   Revert "fs: do no...
3189

afddba49d   Nick Piggin   fs: introduce wri...
3190
  		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
afddba49d   Nick Piggin   fs: introduce wri...
3191
3192
3193
3194
3195
3196
3197
3198
3199
  		flush_dcache_page(page);
  
  		status = a_ops->write_end(file, mapping, pos, bytes, copied,
  						page, fsdata);
  		if (unlikely(status < 0))
  			break;
  		copied = status;
  
  		cond_resched();
124d3b704   Nick Piggin   fix writev regres...
3200
  		iov_iter_advance(i, copied);
afddba49d   Nick Piggin   fs: introduce wri...
3201
3202
3203
3204
3205
3206
3207
3208
3209
  		if (unlikely(copied == 0)) {
  			/*
  			 * If we were unable to copy any data at all, we must
  			 * fall back to a single segment length write.
  			 *
  			 * If we didn't fallback here, we could livelock
  			 * because not all segments in the iov can be copied at
  			 * once without a pagefault.
  			 */
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
3210
  			bytes = min_t(unsigned long, PAGE_SIZE - offset,
afddba49d   Nick Piggin   fs: introduce wri...
3211
3212
3213
  						iov_iter_single_seg_count(i));
  			goto again;
  		}
afddba49d   Nick Piggin   fs: introduce wri...
3214
3215
3216
3217
  		pos += copied;
  		written += copied;
  
  		balance_dirty_pages_ratelimited(mapping);
afddba49d   Nick Piggin   fs: introduce wri...
3218
3219
3220
3221
  	} while (iov_iter_count(i));
  
  	return written ? written : status;
  }
3b93f911d   Al Viro   export generic_pe...
3222
  EXPORT_SYMBOL(generic_perform_write);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3223

e4dd9de3c   Jan Kara   vfs: Export __gen...
3224
  /**
8174202b3   Al Viro   write_iter varian...
3225
   * __generic_file_write_iter - write data to a file
e4dd9de3c   Jan Kara   vfs: Export __gen...
3226
   * @iocb:	IO state structure (file, offset, etc.)
8174202b3   Al Viro   write_iter varian...
3227
   * @from:	iov_iter with data to write
e4dd9de3c   Jan Kara   vfs: Export __gen...
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
   *
   * This function does all the work needed for actually writing data to a
   * file. It does all basic checks, removes SUID from the file, updates
   * modification times and calls proper subroutines depending on whether we
   * do direct IO or a standard buffered write.
   *
   * It expects i_mutex to be grabbed unless we work on a block device or similar
   * object which does not need locking at all.
   *
   * This function does *not* take care of syncing data in case of O_SYNC write.
   * A caller has to handle it. This is mainly due to the fact that we want to
   * avoid syncing under i_mutex.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
3240
3241
3242
3243
   *
   * Return:
   * * number of bytes written, even for truncated writes
   * * negative error code if no data has been written at all
e4dd9de3c   Jan Kara   vfs: Export __gen...
3244
   */
8174202b3   Al Viro   write_iter varian...
3245
  ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3246
3247
  {
  	struct file *file = iocb->ki_filp;
fb5527e68   Jeff Moyer   [PATCH] direct-io...
3248
  	struct address_space * mapping = file->f_mapping;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3249
  	struct inode 	*inode = mapping->host;
3b93f911d   Al Viro   export generic_pe...
3250
  	ssize_t		written = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3251
  	ssize_t		err;
3b93f911d   Al Viro   export generic_pe...
3252
  	ssize_t		status;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3253

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3254
  	/* We can write back this queue in page reclaim */
de1414a65   Christoph Hellwig   fs: export inode_...
3255
  	current->backing_dev_info = inode_to_bdi(inode);
5fa8e0a1c   Jan Kara   fs: Rename file_r...
3256
  	err = file_remove_privs(file);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3257
3258
  	if (err)
  		goto out;
c3b2da314   Josef Bacik   fs: introduce ino...
3259
3260
3261
  	err = file_update_time(file);
  	if (err)
  		goto out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3262

2ba48ce51   Al Viro   mirror O_APPEND a...
3263
  	if (iocb->ki_flags & IOCB_DIRECT) {
0b8def9d6   Al Viro   __generic_file_wr...
3264
  		loff_t pos, endbyte;
fb5527e68   Jeff Moyer   [PATCH] direct-io...
3265

1af5bb491   Christoph Hellwig   filemap: remove t...
3266
  		written = generic_file_direct_write(iocb, from);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3267
  		/*
fbbbad4bc   Matthew Wilcox   vfs,ext2: introdu...
3268
3269
3270
3271
3272
  		 * If the write stopped short of completing, fall back to
  		 * buffered writes.  Some filesystems do this for writes to
  		 * holes, for example.  For DAX files, a buffered write will
  		 * not succeed (even if it did, DAX does not handle dirty
  		 * page-cache pages correctly).
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3273
  		 */
0b8def9d6   Al Viro   __generic_file_wr...
3274
  		if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
fbbbad4bc   Matthew Wilcox   vfs,ext2: introdu...
3275
  			goto out;
0b8def9d6   Al Viro   __generic_file_wr...
3276
  		status = generic_perform_write(file, from, pos = iocb->ki_pos);
fb5527e68   Jeff Moyer   [PATCH] direct-io...
3277
  		/*
3b93f911d   Al Viro   export generic_pe...
3278
  		 * If generic_perform_write() returned a synchronous error
fb5527e68   Jeff Moyer   [PATCH] direct-io...
3279
3280
3281
3282
3283
  		 * then we want to return the number of bytes which were
  		 * direct-written, or the error code if that was zero.  Note
  		 * that this differs from normal direct-io semantics, which
  		 * will return -EFOO even if some bytes were written.
  		 */
60bb45297   Al Viro   __generic_file_wr...
3284
  		if (unlikely(status < 0)) {
3b93f911d   Al Viro   export generic_pe...
3285
  			err = status;
fb5527e68   Jeff Moyer   [PATCH] direct-io...
3286
3287
  			goto out;
  		}
fb5527e68   Jeff Moyer   [PATCH] direct-io...
3288
3289
3290
3291
3292
  		/*
  		 * We need to ensure that the page cache pages are written to
  		 * disk and invalidated to preserve the expected O_DIRECT
  		 * semantics.
  		 */
3b93f911d   Al Viro   export generic_pe...
3293
  		endbyte = pos + status - 1;
0b8def9d6   Al Viro   __generic_file_wr...
3294
  		err = filemap_write_and_wait_range(mapping, pos, endbyte);
fb5527e68   Jeff Moyer   [PATCH] direct-io...
3295
  		if (err == 0) {
0b8def9d6   Al Viro   __generic_file_wr...
3296
  			iocb->ki_pos = endbyte + 1;
3b93f911d   Al Viro   export generic_pe...
3297
  			written += status;
fb5527e68   Jeff Moyer   [PATCH] direct-io...
3298
  			invalidate_mapping_pages(mapping,
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
3299
3300
  						 pos >> PAGE_SHIFT,
  						 endbyte >> PAGE_SHIFT);
fb5527e68   Jeff Moyer   [PATCH] direct-io...
3301
3302
3303
3304
3305
3306
3307
  		} else {
  			/*
  			 * We don't know how much we wrote, so just return
  			 * the number of bytes which were direct-written
  			 */
  		}
  	} else {
0b8def9d6   Al Viro   __generic_file_wr...
3308
3309
3310
  		written = generic_perform_write(file, from, iocb->ki_pos);
  		if (likely(written > 0))
  			iocb->ki_pos += written;
fb5527e68   Jeff Moyer   [PATCH] direct-io...
3311
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3312
3313
3314
3315
  out:
  	current->backing_dev_info = NULL;
  	return written ? written : err;
  }
8174202b3   Al Viro   write_iter varian...
3316
  EXPORT_SYMBOL(__generic_file_write_iter);
e4dd9de3c   Jan Kara   vfs: Export __gen...
3317

e4dd9de3c   Jan Kara   vfs: Export __gen...
3318
  /**
8174202b3   Al Viro   write_iter varian...
3319
   * generic_file_write_iter - write data to a file
e4dd9de3c   Jan Kara   vfs: Export __gen...
3320
   * @iocb:	IO state structure
8174202b3   Al Viro   write_iter varian...
3321
   * @from:	iov_iter with data to write
e4dd9de3c   Jan Kara   vfs: Export __gen...
3322
   *
8174202b3   Al Viro   write_iter varian...
3323
   * This is a wrapper around __generic_file_write_iter() to be used by most
e4dd9de3c   Jan Kara   vfs: Export __gen...
3324
3325
   * filesystems. It takes care of syncing the file in case of O_SYNC file
   * and acquires i_mutex as needed.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
3326
3327
3328
3329
   * Return:
   * * negative error code if no data has been written at all of
   *   vfs_fsync_range() failed for a synchronous write
   * * number of bytes written, even for truncated writes
e4dd9de3c   Jan Kara   vfs: Export __gen...
3330
   */
8174202b3   Al Viro   write_iter varian...
3331
  ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3332
3333
  {
  	struct file *file = iocb->ki_filp;
148f948ba   Jan Kara   vfs: Introduce ne...
3334
  	struct inode *inode = file->f_mapping->host;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3335
  	ssize_t ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3336

5955102c9   Al Viro   wrappers for ->i_...
3337
  	inode_lock(inode);
3309dd04c   Al Viro   switch generic_wr...
3338
3339
  	ret = generic_write_checks(iocb, from);
  	if (ret > 0)
5f380c7fa   Al Viro   lift generic_writ...
3340
  		ret = __generic_file_write_iter(iocb, from);
5955102c9   Al Viro   wrappers for ->i_...
3341
  	inode_unlock(inode);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3342

e25922176   Christoph Hellwig   fs: simplify the ...
3343
3344
  	if (ret > 0)
  		ret = generic_write_sync(iocb, ret);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3345
3346
  	return ret;
  }
8174202b3   Al Viro   write_iter varian...
3347
  EXPORT_SYMBOL(generic_file_write_iter);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3348

cf9a2ae8d   David Howells   [PATCH] BLOCK: Mo...
3349
3350
3351
3352
3353
3354
3355
  /**
   * try_to_release_page() - release old fs-specific metadata on a page
   *
   * @page: the page which the kernel is trying to free
   * @gfp_mask: memory allocation flags (and I/O mode)
   *
   * The address_space is to try to release any data against the page
a862f68a8   Mike Rapoport   docs/core-api/mm:...
3356
   * (presumably at page->private).
cf9a2ae8d   David Howells   [PATCH] BLOCK: Mo...
3357
   *
266cf658e   David Howells   FS-Cache: Recruit...
3358
3359
3360
   * This may also be called if PG_fscache is set on a page, indicating that the
   * page is known to the local caching routines.
   *
cf9a2ae8d   David Howells   [PATCH] BLOCK: Mo...
3361
   * The @gfp_mask argument specifies whether I/O may be performed to release
71baba4b9   Mel Gorman   mm, page_alloc: r...
3362
   * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS).
cf9a2ae8d   David Howells   [PATCH] BLOCK: Mo...
3363
   *
a862f68a8   Mike Rapoport   docs/core-api/mm:...
3364
   * Return: %1 if the release was successful, otherwise return zero.
cf9a2ae8d   David Howells   [PATCH] BLOCK: Mo...
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
   */
  int try_to_release_page(struct page *page, gfp_t gfp_mask)
  {
  	struct address_space * const mapping = page->mapping;
  
  	BUG_ON(!PageLocked(page));
  	if (PageWriteback(page))
  		return 0;
  
  	if (mapping && mapping->a_ops->releasepage)
  		return mapping->a_ops->releasepage(page, gfp_mask);
  	return try_to_free_buffers(page);
  }
  
  EXPORT_SYMBOL(try_to_release_page);