Blame view

mm/filemap.c 89.3 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
7
8
9
10
11
  /*
   *	linux/mm/filemap.c
   *
   * Copyright (C) 1994-1999  Linus Torvalds
   */
  
  /*
   * This file handles the generic file mmap semantics used by
   * most "normal" filesystems (but you don't /have/ to use this:
   * the NFS filesystem used to do this differently, for example)
   */
b95f1b31b   Paul Gortmaker   mm: Map most file...
12
  #include <linux/export.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
13
  #include <linux/compiler.h>
f9fe48bec   Ross Zwisler   dax: support dirt...
14
  #include <linux/dax.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
15
  #include <linux/fs.h>
3f07c0144   Ingo Molnar   sched/headers: Pr...
16
  #include <linux/sched/signal.h>
c22ce143d   Hiro Yoshioka   [PATCH] x86: cach...
17
  #include <linux/uaccess.h>
c59ede7b7   Randy.Dunlap   [PATCH] move capa...
18
  #include <linux/capability.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
19
  #include <linux/kernel_stat.h>
5a0e3ad6a   Tejun Heo   include cleanup: ...
20
  #include <linux/gfp.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
21
22
23
24
25
26
27
28
  #include <linux/mm.h>
  #include <linux/swap.h>
  #include <linux/mman.h>
  #include <linux/pagemap.h>
  #include <linux/file.h>
  #include <linux/uio.h>
  #include <linux/hash.h>
  #include <linux/writeback.h>
53253383f   Linus Torvalds   Include <linux/ba...
29
  #include <linux/backing-dev.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
30
31
32
  #include <linux/pagevec.h>
  #include <linux/blkdev.h>
  #include <linux/security.h>
44110fe38   Paul Jackson   [PATCH] cpuset me...
33
  #include <linux/cpuset.h>
00501b531   Johannes Weiner   mm: memcontrol: r...
34
  #include <linux/hugetlb.h>
8a9f3ccd2   Balbir Singh   Memory controller...
35
  #include <linux/memcontrol.h>
c515e1fd3   Dan Magenheimer   mm/fs: add hooks ...
36
  #include <linux/cleancache.h>
c7df8ad29   Mel Gorman   mm, truncate: do ...
37
  #include <linux/shmem_fs.h>
f1820361f   Kirill A. Shutemov   mm: implement ->m...
38
  #include <linux/rmap.h>
0f8053a50   Nick Piggin   [PATCH] mm: make ...
39
  #include "internal.h"
fe0bfaaff   Robert Jarzmik   mm: trace filemap...
40
41
  #define CREATE_TRACE_POINTS
  #include <trace/events/filemap.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
42
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
43
44
   * FIXME: remove all knowledge of the buffer layer from the core VM
   */
148f948ba   Jan Kara   vfs: Introduce ne...
45
  #include <linux/buffer_head.h> /* for try_to_free_buffers */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
46

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
  #include <asm/mman.h>
  
  /*
   * Shared mappings implemented 30.11.1994. It's not fully working yet,
   * though.
   *
   * Shared mappings now work. 15.8.1995  Bruno.
   *
   * finished 'unifying' the page and buffer cache and SMP-threaded the
   * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
   *
   * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
   */
  
/*
 * Lock ordering:
 *
 *  ->i_mmap_rwsem		(truncate_pagecache)
 *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
 *      ->swap_lock		(exclusive_swap_page, others)
 *        ->i_pages lock
 *
 *  ->i_mutex
 *    ->i_mmap_rwsem		(truncate->unmap_mapping_range)
 *
 *  ->mmap_sem
 *    ->i_mmap_rwsem
 *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
 *        ->i_pages lock	(arch-dependent flush_dcache_mmap_lock)
 *
 *  ->mmap_sem
 *    ->lock_page		(access_process_vm)
 *
 *  ->i_mutex			(generic_perform_write)
 *    ->mmap_sem		(fault_in_pages_readable->do_page_fault)
 *
 *  bdi->wb.list_lock
 *    sb_lock			(fs/fs-writeback.c)
 *    ->i_pages lock		(__sync_single_inode)
 *
 *  ->i_mmap_rwsem
 *    ->anon_vma.lock		(vma_adjust)
 *
 *  ->anon_vma.lock
 *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
 *
 *  ->page_table_lock or pte_lock
 *    ->swap_lock		(try_to_unmap_one)
 *    ->private_lock		(try_to_unmap_one)
 *    ->i_pages lock		(try_to_unmap_one)
 *    ->zone_lru_lock(zone)	(follow_page->mark_page_accessed)
 *    ->zone_lru_lock(zone)	(check_pte_range->isolate_lru_page)
 *    ->private_lock		(page_remove_rmap->set_page_dirty)
 *    ->i_pages lock		(page_remove_rmap->set_page_dirty)
 *    bdi.wb->list_lock		(page_remove_rmap->set_page_dirty)
 *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
 *    ->memcg->move_lock	(page_remove_rmap->lock_page_memcg)
 *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
 *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
 *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
 *
 * ->i_mmap_rwsem
 *   ->tasklist_lock            (memory_failure, collect_procs_ao)
 */
/*
 * page_cache_tree_insert - insert @page into @mapping's radix tree.
 *
 * Replaces any shadow (exceptional) entry found at page->index and hands
 * the old shadow back through @shadowp for workingset accounting.
 *
 * Returns 0 on success, -EEXIST if a real page already occupies the slot,
 * or the error from __radix_tree_create().
 *
 * Must be called with the i_pages lock held: the slot dereference below
 * passes &mapping->i_pages.xa_lock as the protecting lock.
 */
static int page_cache_tree_insert(struct address_space *mapping,
				  struct page *page, void **shadowp)
{
	struct radix_tree_node *node;
	void **slot;
	int error;

	error = __radix_tree_create(&mapping->i_pages, page->index, 0,
				    &node, &slot);
	if (error)
		return error;
	if (*slot) {
		void *p;

		p = radix_tree_deref_slot_protected(slot,
						    &mapping->i_pages.xa_lock);
		/* A non-exceptional entry is a real page: the slot is taken. */
		if (!radix_tree_exceptional_entry(p))
			return -EEXIST;

		/* Consuming a shadow entry: keep the counter in sync. */
		mapping->nrexceptional--;
		if (shadowp)
			*shadowp = p;
	}
	__radix_tree_replace(&mapping->i_pages, node, slot, page,
			     workingset_lookup_update(mapping));
	mapping->nrpages++;
	return 0;
}
/*
 * page_cache_tree_delete - replace @page's radix tree entries with @shadow.
 *
 * Clears the tags and stores @shadow (which may be NULL) in every slot the
 * page occupies — a THP covers several consecutive indices — then updates
 * the mapping's page/exceptional counters.  Caller holds the i_pages lock.
 */
static void page_cache_tree_delete(struct address_space *mapping,
				   struct page *page, void *shadow)
{
	int i, nr;

	/* hugetlb pages are represented by one entry in the radix tree */
	nr = PageHuge(page) ? 1 : hpage_nr_pages(page);

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(PageTail(page), page);
	/* Shadow entries are only stored for single-slot (order-0) pages. */
	VM_BUG_ON_PAGE(nr != 1 && shadow, page);

	for (i = 0; i < nr; i++) {
		struct radix_tree_node *node;
		void **slot;

		__radix_tree_lookup(&mapping->i_pages, page->index + i,
				    &node, &slot);

		VM_BUG_ON_PAGE(!node && nr != 1, page);

		radix_tree_clear_tags(&mapping->i_pages, node, slot);
		__radix_tree_replace(&mapping->i_pages, node, slot, shadow,
				workingset_lookup_update(mapping));
	}

	page->mapping = NULL;
	/* Leave page->index set: truncation lookup relies upon it */

	if (shadow) {
		mapping->nrexceptional += nr;
		/*
		 * Make sure the nrexceptional update is committed before
		 * the nrpages update so that final truncate racing
		 * with reclaim does not see both counters 0 at the
		 * same time and miss a shadow entry.
		 */
		smp_wmb();
	}
	mapping->nrpages -= nr;
}
/*
 * unaccount_page_cache_page - bookkeeping before @page leaves @mapping.
 *
 * Notifies cleancache, sanity-checks that the page is no longer mapped
 * (with a best-effort recovery path for production builds), and backs out
 * the NR_FILE_PAGES/NR_SHMEM/NR_SHMEM_THPS statistics.  hugetlb pages are
 * skipped entirely: they do not participate in page cache accounting.
 */
static void unaccount_page_cache_page(struct address_space *mapping,
				      struct page *page)
{
	int nr;

	/*
	 * if we're uptodate, flush out into the cleancache, otherwise
	 * invalidate any existing cleancache entries.  We can't leave
	 * stale data around in the cleancache once our page is gone
	 */
	if (PageUptodate(page) && PageMappedToDisk(page))
		cleancache_put_page(page);
	else
		cleancache_invalidate_page(mapping, page);

	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(page_mapped(page), page);
	/*
	 * With CONFIG_DEBUG_VM the VM_BUG_ON above already fired; this
	 * branch is the production-build fallback: log loudly, taint the
	 * kernel, and attempt to avoid leaking the page.
	 */
	if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) {
		int mapcount;

		pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
			 current->comm, page_to_pfn(page));
		dump_page(page, "still mapped when deleted");
		dump_stack();
		add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);

		mapcount = page_mapcount(page);
		if (mapping_exiting(mapping) &&
		    page_count(page) >= mapcount + 2) {
			/*
			 * All vmas have already been torn down, so it's
			 * a good bet that actually the page is unmapped,
			 * and we'd prefer not to leak it: if we're wrong,
			 * some other bad page check should catch it later.
			 */
			page_mapcount_reset(page);
			page_ref_sub(page, mapcount);
		}
	}

	/* hugetlb pages do not participate in page cache accounting. */
	if (PageHuge(page))
		return;

	nr = hpage_nr_pages(page);

	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
	if (PageSwapBacked(page)) {
		__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
		if (PageTransHuge(page))
			__dec_node_page_state(page, NR_SHMEM_THPS);
	} else {
		/* Non-shmem THPs are not expected in the page cache here. */
		VM_BUG_ON_PAGE(PageTransHuge(page), page);
	}

	/*
	 * At this point page must be either written or cleaned by
	 * truncate.  Dirty page here signals a bug and loss of
	 * unwritten data.
	 *
	 * This fixes dirty accounting after removing the page entirely
	 * but leaves PageDirty set: it has no effect for truncated
	 * page and anyway will be cleared before returning page into
	 * buddy allocator.
	 */
	if (WARN_ON_ONCE(PageDirty(page)))
		account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
}
  
/*
 * Delete a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.  The caller must hold the i_pages lock.
 *
 * @shadow is the value left behind in the radix tree slot(s): a workingset
 * shadow entry, or NULL.
 */
void __delete_from_page_cache(struct page *page, void *shadow)
{
	struct address_space *mapping = page->mapping;

	trace_mm_filemap_delete_from_page_cache(page);

	/* Fix statistics first, then drop the tree entries. */
	unaccount_page_cache_page(mapping, page);
	page_cache_tree_delete(mapping, page, shadow);
}
59c66c5f8   Jan Kara   mm: factor out pa...
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
  static void page_cache_free_page(struct address_space *mapping,
  				struct page *page)
  {
  	void (*freepage)(struct page *);
  
  	freepage = mapping->a_ops->freepage;
  	if (freepage)
  		freepage(page);
  
  	if (PageTransHuge(page) && !PageHuge(page)) {
  		page_ref_sub(page, HPAGE_PMD_NR);
  		VM_BUG_ON_PAGE(page_count(page) <= 0, page);
  	} else {
  		put_page(page);
  	}
  }
/**
 * delete_from_page_cache - delete page from page cache
 * @page: the page which the kernel is trying to remove from page cache
 *
 * This must be called only on pages that have been verified to be in the page
 * cache and locked.  It will never put the page into the free list, the caller
 * has a reference on the page.
 */
void delete_from_page_cache(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	unsigned long flags;

	BUG_ON(!PageLocked(page));
	xa_lock_irqsave(&mapping->i_pages, flags);
	__delete_from_page_cache(page, NULL);
	xa_unlock_irqrestore(&mapping->i_pages, flags);

	/* The cache's reference is dropped outside the i_pages lock. */
	page_cache_free_page(mapping, page);
}
EXPORT_SYMBOL(delete_from_page_cache);
/*
 * page_cache_tree_delete_batch - delete several pages from page cache
 * @mapping: the mapping to which pages belong
 * @pvec: pagevec with pages to delete
 *
 * The function walks over mapping->i_pages and removes pages passed in @pvec
 * from the mapping. The function expects @pvec to be sorted by page index.
 * It tolerates holes in @pvec (mapping entries at those indices are not
 * modified). The function expects only THP head pages to be present in the
 * @pvec and takes care to delete all corresponding tail pages from the
 * mapping as well.
 *
 * The function expects the i_pages lock to be held.
 */
static void
page_cache_tree_delete_batch(struct address_space *mapping,
			     struct pagevec *pvec)
{
	struct radix_tree_iter iter;
	void **slot;
	int total_pages = 0;
	int i = 0, tail_pages = 0;	/* tail_pages: THP tail slots still to clear */
	struct page *page;
	pgoff_t start;

	start = pvec->pages[0]->index;
	radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
		/* Done once every pvec entry and its tails are handled. */
		if (i >= pagevec_count(pvec) && !tail_pages)
			break;
		page = radix_tree_deref_slot_protected(slot,
						       &mapping->i_pages.xa_lock);
		/* Leave shadow (exceptional) entries alone. */
		if (radix_tree_exceptional_entry(page))
			continue;
		if (!tail_pages) {
			/*
			 * Some page got inserted in our range? Skip it. We
			 * have our pages locked so they are protected from
			 * being removed.
			 */
			if (page != pvec->pages[i])
				continue;
			WARN_ON_ONCE(!PageLocked(page));
			if (PageTransHuge(page) && !PageHuge(page))
				tail_pages = HPAGE_PMD_NR - 1;
			page->mapping = NULL;
			/*
			 * Leave page->index set: truncation lookup relies
			 * upon it
			 */
			i++;
		} else {
			tail_pages--;
		}
		radix_tree_clear_tags(&mapping->i_pages, iter.node, slot);
		__radix_tree_replace(&mapping->i_pages, iter.node, slot, NULL,
				workingset_lookup_update(mapping));
		total_pages++;
	}
	mapping->nrpages -= total_pages;
}
  
/**
 * delete_from_page_cache_batch - delete several pages from the page cache
 * @mapping: mapping the pages belong to
 * @pvec: pagevec of pages to delete (sorted by index; only THP head pages)
 *
 * Unaccounts and removes every page in @pvec under a single i_pages lock
 * round-trip, then drops the cache's references outside the lock.
 */
void delete_from_page_cache_batch(struct address_space *mapping,
				  struct pagevec *pvec)
{
	int i;
	unsigned long flags;

	if (!pagevec_count(pvec))
		return;
	xa_lock_irqsave(&mapping->i_pages, flags);
	for (i = 0; i < pagevec_count(pvec); i++) {
		trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);

		unaccount_page_cache_page(mapping, pvec->pages[i]);
	}
	page_cache_tree_delete_batch(mapping, pvec);
	xa_unlock_irqrestore(&mapping->i_pages, flags);

	/* References are dropped only after the tree entries are gone. */
	for (i = 0; i < pagevec_count(pvec); i++)
		page_cache_free_page(mapping, pvec->pages[i]);
}
/*
 * filemap_check_errors - report and clear pending AS_EIO/AS_ENOSPC bits.
 *
 * The plain test_bit() before the atomic test_and_clear_bit() appears to
 * keep the common error-free path cheap — TODO(review): confirm intent.
 * AS_EIO is checked last so it takes precedence when both bits are set.
 */
int filemap_check_errors(struct address_space *mapping)
{
	int ret = 0;
	/* Check for outstanding write errors */
	if (test_bit(AS_ENOSPC, &mapping->flags) &&
	    test_and_clear_bit(AS_ENOSPC, &mapping->flags))
		ret = -ENOSPC;
	if (test_bit(AS_EIO, &mapping->flags) &&
	    test_and_clear_bit(AS_EIO, &mapping->flags))
		ret = -EIO;
	return ret;
}
EXPORT_SYMBOL(filemap_check_errors);
865ffef37   Dmitry Monakhov   fs: fix fsync() e...
389

76341cabb   Jeff Layton   jbd2: don't clear...
390
391
392
393
394
395
396
397
398
  static int filemap_check_and_keep_errors(struct address_space *mapping)
  {
  	/* Check for outstanding write errors */
  	if (test_bit(AS_EIO, &mapping->flags))
  		return -EIO;
  	if (test_bit(AS_ENOSPC, &mapping->flags))
  		return -ENOSPC;
  	return 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
399
  /**
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
400
   * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
67be2dd1b   Martin Waitz   [PATCH] DocBook: ...
401
402
   * @mapping:	address space structure to write
   * @start:	offset in bytes where the range starts
469eb4d03   Andrew Morton   [PATCH] filemap_f...
403
   * @end:	offset in bytes where the range ends (inclusive)
67be2dd1b   Martin Waitz   [PATCH] DocBook: ...
404
   * @sync_mode:	enable synchronous operation
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
405
   *
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
406
407
408
   * Start writeback against all of a mapping's dirty pages that lie
   * within the byte offsets <start, end> inclusive.
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
409
   * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
410
   * opposed to a regular memory cleansing writeback.  The difference between
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
411
412
413
   * these two operations is that if a dirty page/buffer is encountered, it must
   * be waited upon, and not just skipped over.
   */
ebcf28e1c   Andrew Morton   [PATCH] fadvise()...
414
415
  int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
  				loff_t end, int sync_mode)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
416
417
418
419
  {
  	int ret;
  	struct writeback_control wbc = {
  		.sync_mode = sync_mode,
05fe478dd   Nick Piggin   mm: write_cache_p...
420
  		.nr_to_write = LONG_MAX,
111ebb6e6   OGAWA Hirofumi   [PATCH] writeback...
421
422
  		.range_start = start,
  		.range_end = end,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
423
424
425
426
  	};
  
  	if (!mapping_cap_writeback_dirty(mapping))
  		return 0;
b16b1deb5   Tejun Heo   writeback: make w...
427
  	wbc_attach_fdatawrite_inode(&wbc, mapping->host);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
428
  	ret = do_writepages(mapping, &wbc);
b16b1deb5   Tejun Heo   writeback: make w...
429
  	wbc_detach_inode(&wbc);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
430
431
432
433
434
435
  	return ret;
  }
  
/* Kick off writeback of the entire mapping with the given sync mode. */
static inline int __filemap_fdatawrite(struct address_space *mapping,
	int sync_mode)
{
	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
}

/**
 * filemap_fdatawrite - start writeback on all dirty pages of a mapping
 * @mapping: address space to write out
 *
 * Data-integrity writeback (WB_SYNC_ALL): dirty pages encountered are
 * waited upon rather than skipped.
 */
int filemap_fdatawrite(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite);
/**
 * filemap_fdatawrite_range - start writeback on a byte range of a mapping
 * @mapping: address space to write out
 * @start: first byte of the range
 * @end: last byte of the range (inclusive)
 *
 * Data-integrity (WB_SYNC_ALL) writeback of the given range.
 */
int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
				loff_t end)
{
	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite_range);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
450

/**
 * filemap_flush - mostly a non-blocking flush
 * @mapping:	target address_space
 *
 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 * purposes - I/O may not be started against all dirty pages.
 *
 * Uses WB_SYNC_NONE writeback, so busy pages may be skipped rather than
 * waited on.
 */
int filemap_flush(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
}
EXPORT_SYMBOL(filemap_flush);
7fc9e4722   Goldwyn Rodrigues   fs: Introduce fil...
463
464
465
466
467
468
469
470
471
472
473
474
475
476
  /**
   * filemap_range_has_page - check if a page exists in range.
   * @mapping:           address space within which to check
   * @start_byte:        offset in bytes where the range starts
   * @end_byte:          offset in bytes where the range ends (inclusive)
   *
   * Find at least one page in the range supplied, usually used to check if
   * direct writing in this range will trigger a writeback.
   */
  bool filemap_range_has_page(struct address_space *mapping,
  			   loff_t start_byte, loff_t end_byte)
  {
  	pgoff_t index = start_byte >> PAGE_SHIFT;
  	pgoff_t end = end_byte >> PAGE_SHIFT;
f7b680468   Jan Kara   mm: use find_get_...
477
  	struct page *page;
7fc9e4722   Goldwyn Rodrigues   fs: Introduce fil...
478
479
480
481
482
483
  
  	if (end_byte < start_byte)
  		return false;
  
  	if (mapping->nrpages == 0)
  		return false;
f7b680468   Jan Kara   mm: use find_get_...
484
  	if (!find_get_pages_range(mapping, &index, end, 1, &page))
7fc9e4722   Goldwyn Rodrigues   fs: Introduce fil...
485
  		return false;
f7b680468   Jan Kara   mm: use find_get_...
486
487
  	put_page(page);
  	return true;
7fc9e4722   Goldwyn Rodrigues   fs: Introduce fil...
488
489
  }
  EXPORT_SYMBOL(filemap_range_has_page);
/*
 * __filemap_fdatawait_range - wait for writeback to complete within a range.
 *
 * Walks pages tagged PAGECACHE_TAG_WRITEBACK in [start_byte, end_byte],
 * waits on each, and clears its PageError flag.  Any error state recorded
 * on the address_space is left for the callers to check and report.
 */
static void __filemap_fdatawait_range(struct address_space *mapping,
				     loff_t start_byte, loff_t end_byte)
{
	pgoff_t index = start_byte >> PAGE_SHIFT;
	pgoff_t end = end_byte >> PAGE_SHIFT;
	struct pagevec pvec;
	int nr_pages;

	if (end_byte < start_byte)
		return;

	pagevec_init(&pvec);
	while (index <= end) {
		unsigned i;

		/* pagevec_lookup_range_tag() advances index for us. */
		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
				end, PAGECACHE_TAG_WRITEBACK);
		if (!nr_pages)
			break;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			wait_on_page_writeback(page);
			ClearPageError(page);
		}
		pagevec_release(&pvec);
		cond_resched();
	}
}
  
/**
 * filemap_fdatawait_range - wait for writeback to complete
 * @mapping:		address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space
 * in the given range and wait for all of them.  Check error status of
 * the address space and return it.
 *
 * Since the error status of the address space is cleared by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 */
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
			    loff_t end_byte)
{
	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return filemap_check_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range);
  
/**
 * file_fdatawait_range - wait for writeback to complete
 * @file:		file pointing to address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the address space that file
 * refers to, in the given range and wait for all of them.  Check error
 * status of the address space vs. the file->f_wb_err cursor and return it.
 *
 * Since the error status of the file is advanced by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 */
int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
{
	struct address_space *mapping = file->f_mapping;

	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return file_check_and_advance_wb_err(file);
}
EXPORT_SYMBOL(file_fdatawait_range);
d3bccb6f4   Jan Kara   vfs: Introduce fi...
562
563
  
/**
 * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
 * @mapping: address space structure to wait for
 *
 * Walk the list of under-writeback pages of the given address space
 * and wait for all of them.  Unlike filemap_fdatawait(), this function
 * does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 */
int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
	__filemap_fdatawait_range(mapping, 0, LLONG_MAX);
	return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
aa750fd71   Junichi Nomura   mm/filemap.c: mak...
581

9326c9b20   Jeff Layton   mm: consolidate d...
582
  static bool mapping_needs_writeback(struct address_space *mapping)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
583
  {
9326c9b20   Jeff Layton   mm: consolidate d...
584
585
  	return (!dax_mapping(mapping) && mapping->nrpages) ||
  	    (dax_mapping(mapping) && mapping->nrexceptional);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
586
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
587
588
589
  
  int filemap_write_and_wait(struct address_space *mapping)
  {
28fd12982   OGAWA Hirofumi   [PATCH] Fix and a...
590
  	int err = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
591

9326c9b20   Jeff Layton   mm: consolidate d...
592
  	if (mapping_needs_writeback(mapping)) {
28fd12982   OGAWA Hirofumi   [PATCH] Fix and a...
593
594
595
596
597
598
599
600
601
602
603
  		err = filemap_fdatawrite(mapping);
  		/*
  		 * Even if the above returned error, the pages may be
  		 * written partially (e.g. -ENOSPC), so we wait for it.
  		 * But the -EIO is special case, it may indicate the worst
  		 * thing (e.g. bug) happened, so we avoid waiting for it.
  		 */
  		if (err != -EIO) {
  			int err2 = filemap_fdatawait(mapping);
  			if (!err)
  				err = err2;
cbeaf9510   Jeff Layton   mm: clear AS_EIO/...
604
605
606
  		} else {
  			/* Clear any previously stored errors */
  			filemap_check_errors(mapping);
28fd12982   OGAWA Hirofumi   [PATCH] Fix and a...
607
  		}
865ffef37   Dmitry Monakhov   fs: fix fsync() e...
608
609
  	} else {
  		err = filemap_check_errors(mapping);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
610
  	}
28fd12982   OGAWA Hirofumi   [PATCH] Fix and a...
611
  	return err;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
612
  }
28fd12982   OGAWA Hirofumi   [PATCH] Fix and a...
613
  EXPORT_SYMBOL(filemap_write_and_wait);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
614

485bb99b4   Randy Dunlap   [PATCH] kernel-do...
615
616
617
618
619
620
  /**
   * filemap_write_and_wait_range - write out & wait on a file range
   * @mapping:	the address_space for the pages
   * @lstart:	offset in bytes where the range starts
   * @lend:	offset in bytes where the range ends (inclusive)
   *
469eb4d03   Andrew Morton   [PATCH] filemap_f...
621
622
   * Write out and wait upon file offsets lstart->lend, inclusive.
   *
0e056eb55   mchehab@s-opensource.com   kernel-api.rst: f...
623
   * Note that @lend is inclusive (describes the last byte to be written) so
469eb4d03   Andrew Morton   [PATCH] filemap_f...
624
625
   * that this function can be used to write to the very end-of-file (end = -1).
   */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
626
627
628
  int filemap_write_and_wait_range(struct address_space *mapping,
  				 loff_t lstart, loff_t lend)
  {
28fd12982   OGAWA Hirofumi   [PATCH] Fix and a...
629
  	int err = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
630

9326c9b20   Jeff Layton   mm: consolidate d...
631
  	if (mapping_needs_writeback(mapping)) {
28fd12982   OGAWA Hirofumi   [PATCH] Fix and a...
632
633
634
635
  		err = __filemap_fdatawrite_range(mapping, lstart, lend,
  						 WB_SYNC_ALL);
  		/* See comment of filemap_write_and_wait() */
  		if (err != -EIO) {
94004ed72   Christoph Hellwig   kill wait_on_page...
636
637
  			int err2 = filemap_fdatawait_range(mapping,
  						lstart, lend);
28fd12982   OGAWA Hirofumi   [PATCH] Fix and a...
638
639
  			if (!err)
  				err = err2;
cbeaf9510   Jeff Layton   mm: clear AS_EIO/...
640
641
642
  		} else {
  			/* Clear any previously stored errors */
  			filemap_check_errors(mapping);
28fd12982   OGAWA Hirofumi   [PATCH] Fix and a...
643
  		}
865ffef37   Dmitry Monakhov   fs: fix fsync() e...
644
645
  	} else {
  		err = filemap_check_errors(mapping);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
646
  	}
28fd12982   OGAWA Hirofumi   [PATCH] Fix and a...
647
  	return err;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
648
  }
f69955855   Chris Mason   Export filemap_wr...
649
  EXPORT_SYMBOL(filemap_write_and_wait_range);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
650

5660e13d2   Jeff Layton   fs: new infrastru...
651
652
  void __filemap_set_wb_err(struct address_space *mapping, int err)
  {
3acdfd280   Jeff Layton   errseq: rename __...
653
  	errseq_t eseq = errseq_set(&mapping->wb_err, err);
5660e13d2   Jeff Layton   fs: new infrastru...
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
  
  	trace_filemap_set_wb_err(mapping, eseq);
  }
  EXPORT_SYMBOL(__filemap_set_wb_err);
  
  /**
   * file_check_and_advance_wb_err - report wb error (if any) that was previously
   * 				   and advance wb_err to current one
   * @file: struct file on which the error is being reported
   *
   * When userland calls fsync (or something like nfsd does the equivalent), we
   * want to report any writeback errors that occurred since the last fsync (or
   * since the file was opened if there haven't been any).
   *
   * Grab the wb_err from the mapping. If it matches what we have in the file,
   * then just quickly return 0. The file is all caught up.
   *
   * If it doesn't match, then take the mapping value, set the "seen" flag in
   * it and try to swap it into place. If it works, or another task beat us
   * to it with the new value, then update the f_wb_err and return the error
   * portion. The error at this point must be reported via proper channels
   * (a'la fsync, or NFS COMMIT operation, etc.).
   *
   * While we handle mapping->wb_err with atomic operations, the f_wb_err
   * value is protected by the f_lock since we must ensure that it reflects
   * the latest value swapped in for this file descriptor.
   */
  int file_check_and_advance_wb_err(struct file *file)
  {
  	int err = 0;
  	errseq_t old = READ_ONCE(file->f_wb_err);
  	struct address_space *mapping = file->f_mapping;
  
  	/* Locklessly handle the common case where nothing has changed */
  	if (errseq_check(&mapping->wb_err, old)) {
  		/* Something changed, must use slow path */
  		spin_lock(&file->f_lock);
  		old = file->f_wb_err;
  		err = errseq_check_and_advance(&mapping->wb_err,
  						&file->f_wb_err);
  		trace_file_check_and_advance_wb_err(file, old);
  		spin_unlock(&file->f_lock);
  	}
f4e222c56   Jeff Layton   mm: have filemap_...
697
698
699
700
701
702
703
704
  
  	/*
  	 * We're mostly using this function as a drop in replacement for
  	 * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
  	 * that the legacy code would have had on these flags.
  	 */
  	clear_bit(AS_EIO, &mapping->flags);
  	clear_bit(AS_ENOSPC, &mapping->flags);
5660e13d2   Jeff Layton   fs: new infrastru...
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
  	return err;
  }
  EXPORT_SYMBOL(file_check_and_advance_wb_err);
  
  /**
   * file_write_and_wait_range - write out & wait on a file range
   * @file:	file pointing to address_space with pages
   * @lstart:	offset in bytes where the range starts
   * @lend:	offset in bytes where the range ends (inclusive)
   *
   * Write out and wait upon file offsets lstart->lend, inclusive.
   *
   * Note that @lend is inclusive (describes the last byte to be written) so
   * that this function can be used to write to the very end-of-file (end = -1).
   *
   * After writing out and waiting on the data, we check and advance the
   * f_wb_err cursor to the latest value, and return any errors detected there.
   */
  int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
  {
  	int err = 0, err2;
  	struct address_space *mapping = file->f_mapping;
9326c9b20   Jeff Layton   mm: consolidate d...
727
  	if (mapping_needs_writeback(mapping)) {
5660e13d2   Jeff Layton   fs: new infrastru...
728
729
730
731
732
733
734
735
736
737
738
739
  		err = __filemap_fdatawrite_range(mapping, lstart, lend,
  						 WB_SYNC_ALL);
  		/* See comment of filemap_write_and_wait() */
  		if (err != -EIO)
  			__filemap_fdatawait_range(mapping, lstart, lend);
  	}
  	err2 = file_check_and_advance_wb_err(file);
  	if (!err)
  		err = err2;
  	return err;
  }
  EXPORT_SYMBOL(file_write_and_wait_range);
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
740
  /**
ef6a3c631   Miklos Szeredi   mm: add replace_p...
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
   * replace_page_cache_page - replace a pagecache page with a new one
   * @old:	page to be replaced
   * @new:	page to replace with
   * @gfp_mask:	allocation mode
   *
   * This function replaces a page in the pagecache with a new one.  On
   * success it acquires the pagecache reference for the new page and
   * drops it for the old page.  Both the old and new pages must be
   * locked.  This function does not add the new page to the LRU, the
   * caller must do that.
   *
   * The remove + add is atomic.  The only way this function can fail is
   * memory allocation failure.
   */
  int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
  {
  	int error;
ef6a3c631   Miklos Szeredi   mm: add replace_p...
758

309381fea   Sasha Levin   mm: dump page whe...
759
760
761
  	VM_BUG_ON_PAGE(!PageLocked(old), old);
  	VM_BUG_ON_PAGE(!PageLocked(new), new);
  	VM_BUG_ON_PAGE(new->mapping, new);
ef6a3c631   Miklos Szeredi   mm: add replace_p...
762

abc1be13f   Matthew Wilcox   mm/filemap.c: fix...
763
  	error = radix_tree_preload(gfp_mask & GFP_RECLAIM_MASK);
ef6a3c631   Miklos Szeredi   mm: add replace_p...
764
765
766
  	if (!error) {
  		struct address_space *mapping = old->mapping;
  		void (*freepage)(struct page *);
c4843a759   Greg Thelen   memcg: add per cg...
767
  		unsigned long flags;
ef6a3c631   Miklos Szeredi   mm: add replace_p...
768
769
770
  
  		pgoff_t offset = old->index;
  		freepage = mapping->a_ops->freepage;
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
771
  		get_page(new);
ef6a3c631   Miklos Szeredi   mm: add replace_p...
772
773
  		new->mapping = mapping;
  		new->index = offset;
b93b01631   Matthew Wilcox   page cache: use x...
774
  		xa_lock_irqsave(&mapping->i_pages, flags);
62cccb8c8   Johannes Weiner   mm: simplify lock...
775
  		__delete_from_page_cache(old, NULL);
22f2ac51b   Johannes Weiner   mm: workingset: f...
776
  		error = page_cache_tree_insert(mapping, new, NULL);
ef6a3c631   Miklos Szeredi   mm: add replace_p...
777
  		BUG_ON(error);
4165b9b46   Michal Hocko   hugetlb: do not a...
778
779
780
781
782
  
  		/*
  		 * hugetlb pages do not participate in page cache accounting.
  		 */
  		if (!PageHuge(new))
11fb99898   Mel Gorman   mm: move most fil...
783
  			__inc_node_page_state(new, NR_FILE_PAGES);
ef6a3c631   Miklos Szeredi   mm: add replace_p...
784
  		if (PageSwapBacked(new))
11fb99898   Mel Gorman   mm: move most fil...
785
  			__inc_node_page_state(new, NR_SHMEM);
b93b01631   Matthew Wilcox   page cache: use x...
786
  		xa_unlock_irqrestore(&mapping->i_pages, flags);
6a93ca8fd   Johannes Weiner   mm: migrate: do n...
787
  		mem_cgroup_migrate(old, new);
ef6a3c631   Miklos Szeredi   mm: add replace_p...
788
789
790
  		radix_tree_preload_end();
  		if (freepage)
  			freepage(old);
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
791
  		put_page(old);
ef6a3c631   Miklos Szeredi   mm: add replace_p...
792
793
794
795
796
  	}
  
  	return error;
  }
  EXPORT_SYMBOL_GPL(replace_page_cache_page);
a528910e1   Johannes Weiner   mm: thrash detect...
797
798
799
800
  static int __add_to_page_cache_locked(struct page *page,
  				      struct address_space *mapping,
  				      pgoff_t offset, gfp_t gfp_mask,
  				      void **shadowp)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
801
  {
00501b531   Johannes Weiner   mm: memcontrol: r...
802
803
  	int huge = PageHuge(page);
  	struct mem_cgroup *memcg;
e286781d5   Nick Piggin   mm: speculative p...
804
  	int error;
309381fea   Sasha Levin   mm: dump page whe...
805
806
  	VM_BUG_ON_PAGE(!PageLocked(page), page);
  	VM_BUG_ON_PAGE(PageSwapBacked(page), page);
e286781d5   Nick Piggin   mm: speculative p...
807

00501b531   Johannes Weiner   mm: memcontrol: r...
808
809
  	if (!huge) {
  		error = mem_cgroup_try_charge(page, current->mm,
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
810
  					      gfp_mask, &memcg, false);
00501b531   Johannes Weiner   mm: memcontrol: r...
811
812
813
  		if (error)
  			return error;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
814

abc1be13f   Matthew Wilcox   mm/filemap.c: fix...
815
  	error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK);
66a0c8ee3   Kirill A. Shutemov   mm: cleanup add_t...
816
  	if (error) {
00501b531   Johannes Weiner   mm: memcontrol: r...
817
  		if (!huge)
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
818
  			mem_cgroup_cancel_charge(page, memcg, false);
66a0c8ee3   Kirill A. Shutemov   mm: cleanup add_t...
819
820
  		return error;
  	}
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
821
  	get_page(page);
66a0c8ee3   Kirill A. Shutemov   mm: cleanup add_t...
822
823
  	page->mapping = mapping;
  	page->index = offset;
b93b01631   Matthew Wilcox   page cache: use x...
824
  	xa_lock_irq(&mapping->i_pages);
a528910e1   Johannes Weiner   mm: thrash detect...
825
  	error = page_cache_tree_insert(mapping, page, shadowp);
66a0c8ee3   Kirill A. Shutemov   mm: cleanup add_t...
826
827
828
  	radix_tree_preload_end();
  	if (unlikely(error))
  		goto err_insert;
4165b9b46   Michal Hocko   hugetlb: do not a...
829
830
831
  
  	/* hugetlb pages do not participate in page cache accounting. */
  	if (!huge)
11fb99898   Mel Gorman   mm: move most fil...
832
  		__inc_node_page_state(page, NR_FILE_PAGES);
b93b01631   Matthew Wilcox   page cache: use x...
833
  	xa_unlock_irq(&mapping->i_pages);
00501b531   Johannes Weiner   mm: memcontrol: r...
834
  	if (!huge)
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
835
  		mem_cgroup_commit_charge(page, memcg, false, false);
66a0c8ee3   Kirill A. Shutemov   mm: cleanup add_t...
836
837
838
839
840
  	trace_mm_filemap_add_to_page_cache(page);
  	return 0;
  err_insert:
  	page->mapping = NULL;
  	/* Leave page->index set: truncation relies upon it */
b93b01631   Matthew Wilcox   page cache: use x...
841
  	xa_unlock_irq(&mapping->i_pages);
00501b531   Johannes Weiner   mm: memcontrol: r...
842
  	if (!huge)
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
843
  		mem_cgroup_cancel_charge(page, memcg, false);
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
844
  	put_page(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
845
846
  	return error;
  }
a528910e1   Johannes Weiner   mm: thrash detect...
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
  
  /**
   * add_to_page_cache_locked - add a locked page to the pagecache
   * @page:	page to add
   * @mapping:	the page's address_space
   * @offset:	page index
   * @gfp_mask:	page allocation mode
   *
   * This function is used to add a page to the pagecache. It must be locked.
   * This function does not add the page to the LRU.  The caller must do that.
   */
  int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
  		pgoff_t offset, gfp_t gfp_mask)
  {
  	return __add_to_page_cache_locked(page, mapping, offset,
  					  gfp_mask, NULL);
  }
e286781d5   Nick Piggin   mm: speculative p...
864
  EXPORT_SYMBOL(add_to_page_cache_locked);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
865
866
  
  int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
6daa0e286   Al Viro   [PATCH] gfp_t: mm...
867
  				pgoff_t offset, gfp_t gfp_mask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
868
  {
a528910e1   Johannes Weiner   mm: thrash detect...
869
  	void *shadow = NULL;
4f98a2fee   Rik van Riel   vmscan: split LRU...
870
  	int ret;
48c935ad8   Kirill A. Shutemov   page-flags: defin...
871
  	__SetPageLocked(page);
a528910e1   Johannes Weiner   mm: thrash detect...
872
873
874
  	ret = __add_to_page_cache_locked(page, mapping, offset,
  					 gfp_mask, &shadow);
  	if (unlikely(ret))
48c935ad8   Kirill A. Shutemov   page-flags: defin...
875
  		__ClearPageLocked(page);
a528910e1   Johannes Weiner   mm: thrash detect...
876
877
878
879
880
  	else {
  		/*
  		 * The page might have been evicted from cache only
  		 * recently, in which case it should be activated like
  		 * any other repeatedly accessed page.
f0281a00f   Rik van Riel   mm: workingset: o...
881
882
883
  		 * The exception is pages getting rewritten; evicting other
  		 * data from the working set, only to cache data that will
  		 * get overwritten with something else, is a waste of memory.
a528910e1   Johannes Weiner   mm: thrash detect...
884
  		 */
f0281a00f   Rik van Riel   mm: workingset: o...
885
886
  		if (!(gfp_mask & __GFP_WRITE) &&
  		    shadow && workingset_refault(shadow)) {
a528910e1   Johannes Weiner   mm: thrash detect...
887
888
889
890
891
892
  			SetPageActive(page);
  			workingset_activation(page);
  		} else
  			ClearPageActive(page);
  		lru_cache_add(page);
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
893
894
  	return ret;
  }
18bc0bbd1   Evgeniy Polyakov   Staging: pohmelfs...
895
  EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
896

44110fe38   Paul Jackson   [PATCH] cpuset me...
897
  #ifdef CONFIG_NUMA
2ae88149a   Nick Piggin   [PATCH] mm: clean...
898
  struct page *__page_cache_alloc(gfp_t gfp)
44110fe38   Paul Jackson   [PATCH] cpuset me...
899
  {
c0ff7453b   Miao Xie   cpuset,mm: fix no...
900
901
  	int n;
  	struct page *page;
44110fe38   Paul Jackson   [PATCH] cpuset me...
902
  	if (cpuset_do_page_mem_spread()) {
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
903
904
  		unsigned int cpuset_mems_cookie;
  		do {
d26914d11   Mel Gorman   mm: optimize put_...
905
  			cpuset_mems_cookie = read_mems_allowed_begin();
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
906
  			n = cpuset_mem_spread_node();
96db800f5   Vlastimil Babka   mm: rename alloc_...
907
  			page = __alloc_pages_node(n, gfp, 0);
d26914d11   Mel Gorman   mm: optimize put_...
908
  		} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
909

c0ff7453b   Miao Xie   cpuset,mm: fix no...
910
  		return page;
44110fe38   Paul Jackson   [PATCH] cpuset me...
911
  	}
2ae88149a   Nick Piggin   [PATCH] mm: clean...
912
  	return alloc_pages(gfp, 0);
44110fe38   Paul Jackson   [PATCH] cpuset me...
913
  }
2ae88149a   Nick Piggin   [PATCH] mm: clean...
914
  EXPORT_SYMBOL(__page_cache_alloc);
44110fe38   Paul Jackson   [PATCH] cpuset me...
915
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
916
917
918
919
920
921
922
923
924
925
  /*
   * In order to wait for pages to become available there must be
   * waitqueues associated with pages. By using a hash table of
   * waitqueues where the bucket discipline is to maintain all
   * waiters on the same queue and wake all when any of the pages
   * become available, and for the woken contexts to check to be
   * sure the appropriate page became available, this saves space
   * at a cost of "thundering herd" phenomena during rare hash
   * collisions.
   */
629060270   Nicholas Piggin   mm: add PageWaite...
926
927
928
929
930
  #define PAGE_WAIT_TABLE_BITS 8
  #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
  static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
  
  static wait_queue_head_t *page_waitqueue(struct page *page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
931
  {
629060270   Nicholas Piggin   mm: add PageWaite...
932
  	return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
933
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
934

629060270   Nicholas Piggin   mm: add PageWaite...
935
  void __init pagecache_init(void)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
936
  {
629060270   Nicholas Piggin   mm: add PageWaite...
937
  	int i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
938

629060270   Nicholas Piggin   mm: add PageWaite...
939
940
941
942
  	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
  		init_waitqueue_head(&page_wait_table[i]);
  
  	page_writeback_init();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
943
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
944

3510ca20e   Linus Torvalds   Minor page waitqu...
945
  /* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */
629060270   Nicholas Piggin   mm: add PageWaite...
946
947
948
949
950
951
952
953
954
  struct wait_page_key {
  	struct page *page;
  	int bit_nr;
  	int page_match;
  };
  
  struct wait_page_queue {
  	struct page *page;
  	int bit_nr;
ac6424b98   Ingo Molnar   sched/wait: Renam...
955
  	wait_queue_entry_t wait;
629060270   Nicholas Piggin   mm: add PageWaite...
956
  };
ac6424b98   Ingo Molnar   sched/wait: Renam...
957
  static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
f62e00cc3   KOSAKI Motohiro   mm: introduce wai...
958
  {
629060270   Nicholas Piggin   mm: add PageWaite...
959
960
961
962
963
964
965
  	struct wait_page_key *key = arg;
  	struct wait_page_queue *wait_page
  		= container_of(wait, struct wait_page_queue, wait);
  
  	if (wait_page->page != key->page)
  	       return 0;
  	key->page_match = 1;
f62e00cc3   KOSAKI Motohiro   mm: introduce wai...
966

629060270   Nicholas Piggin   mm: add PageWaite...
967
968
  	if (wait_page->bit_nr != key->bit_nr)
  		return 0;
3510ca20e   Linus Torvalds   Minor page waitqu...
969
970
  
  	/* Stop walking if it's locked */
629060270   Nicholas Piggin   mm: add PageWaite...
971
  	if (test_bit(key->bit_nr, &key->page->flags))
3510ca20e   Linus Torvalds   Minor page waitqu...
972
  		return -1;
f62e00cc3   KOSAKI Motohiro   mm: introduce wai...
973

629060270   Nicholas Piggin   mm: add PageWaite...
974
  	return autoremove_wake_function(wait, mode, sync, key);
f62e00cc3   KOSAKI Motohiro   mm: introduce wai...
975
  }
74d81bfae   Nicholas Piggin   mm: un-export wak...
976
  static void wake_up_page_bit(struct page *page, int bit_nr)
cbbce8220   NeilBrown   SCHED: add some "...
977
  {
629060270   Nicholas Piggin   mm: add PageWaite...
978
979
980
  	wait_queue_head_t *q = page_waitqueue(page);
  	struct wait_page_key key;
  	unsigned long flags;
11a19c7b0   Tim Chen   sched/wait: Intro...
981
  	wait_queue_entry_t bookmark;
cbbce8220   NeilBrown   SCHED: add some "...
982

629060270   Nicholas Piggin   mm: add PageWaite...
983
984
985
  	key.page = page;
  	key.bit_nr = bit_nr;
  	key.page_match = 0;
11a19c7b0   Tim Chen   sched/wait: Intro...
986
987
988
989
  	bookmark.flags = 0;
  	bookmark.private = NULL;
  	bookmark.func = NULL;
  	INIT_LIST_HEAD(&bookmark.entry);
629060270   Nicholas Piggin   mm: add PageWaite...
990
  	spin_lock_irqsave(&q->lock, flags);
11a19c7b0   Tim Chen   sched/wait: Intro...
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
  	__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
  
  	while (bookmark.flags & WQ_FLAG_BOOKMARK) {
  		/*
  		 * Take a breather from holding the lock,
  		 * allow pages that finish wake up asynchronously
  		 * to acquire the lock and remove themselves
  		 * from wait queue
  		 */
  		spin_unlock_irqrestore(&q->lock, flags);
  		cpu_relax();
  		spin_lock_irqsave(&q->lock, flags);
  		__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
  	}
629060270   Nicholas Piggin   mm: add PageWaite...
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
  	/*
  	 * It is possible for other pages to have collided on the waitqueue
  	 * hash, so in that case check for a page match. That prevents a long-
  	 * term waiter
  	 *
  	 * It is still possible to miss a case here, when we woke page waiters
  	 * and removed them from the waitqueue, but there are still other
  	 * page waiters.
  	 */
  	if (!waitqueue_active(q) || !key.page_match) {
  		ClearPageWaiters(page);
  		/*
  		 * It's possible to miss clearing Waiters here, when we woke
  		 * our page waiters, but the hashed waitqueue has waiters for
  		 * other pages on it.
  		 *
  		 * That's okay, it's a rare case. The next waker will clear it.
  		 */
  	}
  	spin_unlock_irqrestore(&q->lock, flags);
  }
74d81bfae   Nicholas Piggin   mm: un-export wak...
1026
1027
1028
1029
1030
1031
1032
  
  static void wake_up_page(struct page *page, int bit)
  {
  	if (!PageWaiters(page))
  		return;
  	wake_up_page_bit(page, bit);
  }
629060270   Nicholas Piggin   mm: add PageWaite...
1033
1034
1035
1036
1037
  
  static inline int wait_on_page_bit_common(wait_queue_head_t *q,
  		struct page *page, int bit_nr, int state, bool lock)
  {
  	struct wait_page_queue wait_page;
ac6424b98   Ingo Molnar   sched/wait: Renam...
1038
  	wait_queue_entry_t *wait = &wait_page.wait;
629060270   Nicholas Piggin   mm: add PageWaite...
1039
1040
1041
  	int ret = 0;
  
  	init_wait(wait);
3510ca20e   Linus Torvalds   Minor page waitqu...
1042
  	wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0;
629060270   Nicholas Piggin   mm: add PageWaite...
1043
1044
1045
1046
1047
1048
  	wait->func = wake_page_function;
  	wait_page.page = page;
  	wait_page.bit_nr = bit_nr;
  
  	for (;;) {
  		spin_lock_irq(&q->lock);
2055da973   Ingo Molnar   sched/wait: Disam...
1049
  		if (likely(list_empty(&wait->entry))) {
3510ca20e   Linus Torvalds   Minor page waitqu...
1050
  			__add_wait_queue_entry_tail(q, wait);
629060270   Nicholas Piggin   mm: add PageWaite...
1051
1052
1053
1054
1055
1056
1057
1058
1059
  			SetPageWaiters(page);
  		}
  
  		set_current_state(state);
  
  		spin_unlock_irq(&q->lock);
  
  		if (likely(test_bit(bit_nr, &page->flags))) {
  			io_schedule();
629060270   Nicholas Piggin   mm: add PageWaite...
1060
1061
1062
1063
1064
1065
1066
1067
1068
  		}
  
  		if (lock) {
  			if (!test_and_set_bit_lock(bit_nr, &page->flags))
  				break;
  		} else {
  			if (!test_bit(bit_nr, &page->flags))
  				break;
  		}
a8b169afb   Linus Torvalds   Avoid page waitqu...
1069
1070
1071
1072
1073
  
  		if (unlikely(signal_pending_state(state, current))) {
  			ret = -EINTR;
  			break;
  		}
629060270   Nicholas Piggin   mm: add PageWaite...
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
  	}
  
  	finish_wait(q, wait);
  
  	/*
  	 * A signal could leave PageWaiters set. Clearing it here if
  	 * !waitqueue_active would be possible (by open-coding finish_wait),
  	 * but still fail to catch it in the case of wait hash collision. We
  	 * already can fail to clear wait hash collision cases, so don't
  	 * bother with signals either.
  	 */
  
  	return ret;
  }
  
  void wait_on_page_bit(struct page *page, int bit_nr)
  {
  	wait_queue_head_t *q = page_waitqueue(page);
  	wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false);
  }
  EXPORT_SYMBOL(wait_on_page_bit);
  
  int wait_on_page_bit_killable(struct page *page, int bit_nr)
  {
  	wait_queue_head_t *q = page_waitqueue(page);
  	return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false);
cbbce8220   NeilBrown   SCHED: add some "...
1100
  }
4343d0087   David Howells   afs: Get rid of t...
1101
  EXPORT_SYMBOL(wait_on_page_bit_killable);
cbbce8220   NeilBrown   SCHED: add some "...
1102

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1103
  /**
385e1ca5f   David Howells   CacheFiles: Permi...
1104
   * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
697f619fc   Randy Dunlap   filemap: fix kern...
1105
1106
   * @page: Page defining the wait queue of interest
   * @waiter: Waiter to add to the queue
385e1ca5f   David Howells   CacheFiles: Permi...
1107
1108
1109
   *
   * Add an arbitrary @waiter to the wait queue for the nominated @page.
   */
ac6424b98   Ingo Molnar   sched/wait: Renam...
1110
  void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
385e1ca5f   David Howells   CacheFiles: Permi...
1111
1112
1113
1114
1115
  {
  	wait_queue_head_t *q = page_waitqueue(page);
  	unsigned long flags;
  
  	spin_lock_irqsave(&q->lock, flags);
9c3a815f4   Linus Torvalds   page waitqueue: a...
1116
  	__add_wait_queue_entry_tail(q, waiter);
629060270   Nicholas Piggin   mm: add PageWaite...
1117
  	SetPageWaiters(page);
385e1ca5f   David Howells   CacheFiles: Permi...
1118
1119
1120
  	spin_unlock_irqrestore(&q->lock, flags);
  }
  EXPORT_SYMBOL_GPL(add_page_wait_queue);
b91e1302a   Linus Torvalds   mm: optimize Page...
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
  #ifndef clear_bit_unlock_is_negative_byte
  
  /*
   * PG_waiters is the high bit in the same byte as PG_lock.
   *
   * On x86 (and on many other architectures), we can clear PG_lock and
   * test the sign bit at the same time. But if the architecture does
   * not support that special operation, we just do this all by hand
   * instead.
   *
   * The read of PG_waiters has to be after (or concurrently with) PG_locked
   * being cleared, but a memory barrier should be unneccssary since it is
   * in the same byte as PG_locked.
   */
  static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
  {
  	clear_bit_unlock(nr, mem);
  	/* smp_mb__after_atomic(); */
98473f9f3   Olof Johansson   mm/filemap: fix p...
1139
  	return test_bit(PG_waiters, mem);
b91e1302a   Linus Torvalds   mm: optimize Page...
1140
1141
1142
  }
  
  #endif
385e1ca5f   David Howells   CacheFiles: Permi...
1143
  /**
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
1144
   * unlock_page - unlock a locked page
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1145
1146
1147
1148
   * @page: the page
   *
   * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
   * Also wakes sleepers in wait_on_page_writeback() because the wakeup
da3dae54e   Masanari Iida   Documentation: Do...
1149
   * mechanism between PageLocked pages and PageWriteback pages is shared.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1150
1151
   * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
   *
b91e1302a   Linus Torvalds   mm: optimize Page...
1152
1153
1154
1155
1156
   * Note that this depends on PG_waiters being the sign bit in the byte
   * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
   * clear the PG_locked bit and test PG_waiters at the same time fairly
   * portably (architectures that do LL/SC can test any bit, while x86 can
   * test the sign bit).
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1157
   */
920c7a5d0   Harvey Harrison   mm: remove fastca...
1158
  void unlock_page(struct page *page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1159
  {
b91e1302a   Linus Torvalds   mm: optimize Page...
1160
  	BUILD_BUG_ON(PG_waiters != 7);
48c935ad8   Kirill A. Shutemov   page-flags: defin...
1161
  	page = compound_head(page);
309381fea   Sasha Levin   mm: dump page whe...
1162
  	VM_BUG_ON_PAGE(!PageLocked(page), page);
b91e1302a   Linus Torvalds   mm: optimize Page...
1163
1164
  	if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
  		wake_up_page_bit(page, PG_locked);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1165
1166
  }
  EXPORT_SYMBOL(unlock_page);
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
1167
1168
1169
/**
 * end_page_writeback - end writeback against a page
 * @page: the page
 *
 * Clears PG_writeback and wakes up anybody waiting on it.  If the page
 * was marked for immediate reclaim, rotate it back onto the tail of the
 * inactive list first so reclaim can pick it up promptly.
 */
void end_page_writeback(struct page *page)
{
	/*
	 * TestClearPageReclaim could be used here but it is an atomic
	 * operation and overkill in this particular case. Failing to
	 * shuffle a page marked for immediate reclaim is too mild to
	 * justify taking an atomic operation penalty at the end of
	 * every page writeback.
	 */
	if (PageReclaim(page)) {
		ClearPageReclaim(page);
		rotate_reclaimable_page(page);
	}

	if (!test_clear_page_writeback(page))
		BUG();

	/*
	 * Order the PG_writeback clear above against the waitqueue check
	 * in wake_up_page() below (see 4e857c58e: smp_mb__after_atomic).
	 */
	smp_mb__after_atomic();
	wake_up_page(page, PG_writeback);
}
EXPORT_SYMBOL(end_page_writeback);
57d998456   Matthew Wilcox   fs/mpage.c: facto...
1191
1192
1193
1194
  /*
   * After completing I/O on a page, call this routine to update the page
   * flags appropriately
   */
c11f0c0b5   Jens Axboe   block/mm: make bd...
1195
  void page_endio(struct page *page, bool is_write, int err)
57d998456   Matthew Wilcox   fs/mpage.c: facto...
1196
  {
c11f0c0b5   Jens Axboe   block/mm: make bd...
1197
  	if (!is_write) {
57d998456   Matthew Wilcox   fs/mpage.c: facto...
1198
1199
1200
1201
1202
1203
1204
  		if (!err) {
  			SetPageUptodate(page);
  		} else {
  			ClearPageUptodate(page);
  			SetPageError(page);
  		}
  		unlock_page(page);
abf545484   Mike Christie   mm/block: convert...
1205
  	} else {
57d998456   Matthew Wilcox   fs/mpage.c: facto...
1206
  		if (err) {
dd8416c47   Minchan Kim   mm: do not access...
1207
  			struct address_space *mapping;
57d998456   Matthew Wilcox   fs/mpage.c: facto...
1208
  			SetPageError(page);
dd8416c47   Minchan Kim   mm: do not access...
1209
1210
1211
  			mapping = page_mapping(page);
  			if (mapping)
  				mapping_set_error(mapping, err);
57d998456   Matthew Wilcox   fs/mpage.c: facto...
1212
1213
1214
1215
1216
  		}
  		end_page_writeback(page);
  	}
  }
  EXPORT_SYMBOL_GPL(page_endio);
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
1217
1218
  /**
   * __lock_page - get a lock on the page, assuming we need to sleep to get it
870667553   Randy Dunlap   mm: fix filemap.c...
1219
   * @__page: the page to lock
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1220
   */
629060270   Nicholas Piggin   mm: add PageWaite...
1221
  void __lock_page(struct page *__page)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1222
  {
629060270   Nicholas Piggin   mm: add PageWaite...
1223
1224
1225
  	struct page *page = compound_head(__page);
  	wait_queue_head_t *q = page_waitqueue(page);
  	wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1226
1227
  }
  EXPORT_SYMBOL(__lock_page);
629060270   Nicholas Piggin   mm: add PageWaite...
1228
  int __lock_page_killable(struct page *__page)
2687a3569   Matthew Wilcox   Add lock_page_kil...
1229
  {
629060270   Nicholas Piggin   mm: add PageWaite...
1230
1231
1232
  	struct page *page = compound_head(__page);
  	wait_queue_head_t *q = page_waitqueue(page);
  	return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true);
2687a3569   Matthew Wilcox   Add lock_page_kil...
1233
  }
18bc0bbd1   Evgeniy Polyakov   Staging: pohmelfs...
1234
  EXPORT_SYMBOL_GPL(__lock_page_killable);
2687a3569   Matthew Wilcox   Add lock_page_kil...
1235

9a95f3cf7   Paul Cassella   mm: describe mmap...
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
  /*
   * Return values:
   * 1 - page is locked; mmap_sem is still held.
   * 0 - page is not locked.
   *     mmap_sem has been released (up_read()), unless flags had both
   *     FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
   *     which case mmap_sem is still held.
   *
   * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
   * with the page locked and the mmap_sem unperturbed.
   */
d065bd810   Michel Lespinasse   mm: retry page fa...
1247
1248
1249
  int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
  			 unsigned int flags)
  {
37b23e052   KOSAKI Motohiro   x86,mm: make page...
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
  	if (flags & FAULT_FLAG_ALLOW_RETRY) {
  		/*
  		 * CAUTION! In this case, mmap_sem is not released
  		 * even though return 0.
  		 */
  		if (flags & FAULT_FLAG_RETRY_NOWAIT)
  			return 0;
  
  		up_read(&mm->mmap_sem);
  		if (flags & FAULT_FLAG_KILLABLE)
  			wait_on_page_locked_killable(page);
  		else
318b275fb   Gleb Natapov   mm: allow GUP to ...
1262
  			wait_on_page_locked(page);
d065bd810   Michel Lespinasse   mm: retry page fa...
1263
  		return 0;
37b23e052   KOSAKI Motohiro   x86,mm: make page...
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
  	} else {
  		if (flags & FAULT_FLAG_KILLABLE) {
  			int ret;
  
  			ret = __lock_page_killable(page);
  			if (ret) {
  				up_read(&mm->mmap_sem);
  				return 0;
  			}
  		} else
  			__lock_page(page);
  		return 1;
d065bd810   Michel Lespinasse   mm: retry page fa...
1276
1277
  	}
  }
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
1278
  /**
e7b563bb2   Johannes Weiner   mm: filemap: move...
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
   * page_cache_next_hole - find the next hole (not-present entry)
   * @mapping: mapping
   * @index: index
   * @max_scan: maximum range to search
   *
   * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the
   * lowest indexed hole.
   *
   * Returns: the index of the hole if found, otherwise returns an index
   * outside of the set specified (in which case 'return - index >=
   * max_scan' will be true). In rare cases of index wrap-around, 0 will
   * be returned.
   *
   * page_cache_next_hole may be called under rcu_read_lock. However,
   * like radix_tree_gang_lookup, this will not atomically search a
   * snapshot of the tree at a single point in time. For example, if a
   * hole is created at index 5, then subsequently a hole is created at
   * index 10, page_cache_next_hole covering both indexes may return 10
   * if called under rcu_read_lock.
   */
  pgoff_t page_cache_next_hole(struct address_space *mapping,
  			     pgoff_t index, unsigned long max_scan)
  {
  	unsigned long i;
  
  	for (i = 0; i < max_scan; i++) {
0cd6144aa   Johannes Weiner   mm + fs: prepare ...
1305
  		struct page *page;
b93b01631   Matthew Wilcox   page cache: use x...
1306
  		page = radix_tree_lookup(&mapping->i_pages, index);
0cd6144aa   Johannes Weiner   mm + fs: prepare ...
1307
  		if (!page || radix_tree_exceptional_entry(page))
e7b563bb2   Johannes Weiner   mm: filemap: move...
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
  			break;
  		index++;
  		if (index == 0)
  			break;
  	}
  
  	return index;
  }
  EXPORT_SYMBOL(page_cache_next_hole);
  
  /**
   * page_cache_prev_hole - find the prev hole (not-present entry)
   * @mapping: mapping
   * @index: index
   * @max_scan: maximum range to search
   *
   * Search backwards in the range [max(index-max_scan+1, 0), index] for
   * the first hole.
   *
   * Returns: the index of the hole if found, otherwise returns an index
   * outside of the set specified (in which case 'index - return >=
   * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX
   * will be returned.
   *
   * page_cache_prev_hole may be called under rcu_read_lock. However,
   * like radix_tree_gang_lookup, this will not atomically search a
   * snapshot of the tree at a single point in time. For example, if a
   * hole is created at index 10, then subsequently a hole is created at
   * index 5, page_cache_prev_hole covering both indexes may return 5 if
   * called under rcu_read_lock.
   */
  pgoff_t page_cache_prev_hole(struct address_space *mapping,
  			     pgoff_t index, unsigned long max_scan)
  {
  	unsigned long i;
  
  	for (i = 0; i < max_scan; i++) {
0cd6144aa   Johannes Weiner   mm + fs: prepare ...
1345
  		struct page *page;
b93b01631   Matthew Wilcox   page cache: use x...
1346
  		page = radix_tree_lookup(&mapping->i_pages, index);
0cd6144aa   Johannes Weiner   mm + fs: prepare ...
1347
  		if (!page || radix_tree_exceptional_entry(page))
e7b563bb2   Johannes Weiner   mm: filemap: move...
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
  			break;
  		index--;
  		if (index == ULONG_MAX)
  			break;
  	}
  
  	return index;
  }
  EXPORT_SYMBOL(page_cache_prev_hole);
  
/**
 * find_get_entry - find and get a page cache entry
 * @mapping: the address_space to search
 * @offset: the page cache index
 *
 * Looks up the page cache slot at @mapping & @offset.  If there is a
 * page cache page, it is returned with an increased refcount.
 *
 * If the slot holds a shadow entry of a previously evicted page, or a
 * swap entry from shmem/tmpfs, it is returned.
 *
 * Otherwise, %NULL is returned.
 */
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
{
	void **pagep;
	struct page *head, *page;

	rcu_read_lock();
repeat:
	page = NULL;
	pagep = radix_tree_lookup_slot(&mapping->i_pages, offset);
	if (pagep) {
		page = radix_tree_deref_slot(pagep);
		if (unlikely(!page))
			goto out;
		if (radix_tree_exception(page)) {
			/* Retry if the slot was moved by a concurrent
			 * radix tree restructuring. */
			if (radix_tree_deref_retry(page))
				goto repeat;
			/*
			 * A shadow entry of a recently evicted page,
			 * or a swap entry from shmem/tmpfs.  Return
			 * it without attempting to raise page count.
			 */
			goto out;
		}

		/* Refcounts are taken on the head page of a compound page. */
		head = compound_head(page);
		if (!page_cache_get_speculative(head))
			goto repeat;

		/* The page was split under us? */
		if (compound_head(page) != head) {
			put_page(head);
			goto repeat;
		}

		/*
		 * Has the page moved?
		 * This is part of the lockless pagecache protocol. See
		 * include/linux/pagemap.h for details.
		 */
		if (unlikely(page != *pagep)) {
			put_page(head);
			goto repeat;
		}
	}
out:
	rcu_read_unlock();

	return page;
}
EXPORT_SYMBOL(find_get_entry);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1420

485bb99b4   Randy Dunlap   [PATCH] kernel-do...
1421
  /**
0cd6144aa   Johannes Weiner   mm + fs: prepare ...
1422
1423
1424
1425
1426
1427
1428
1429
   * find_lock_entry - locate, pin and lock a page cache entry
   * @mapping: the address_space to search
   * @offset: the page cache index
   *
   * Looks up the page cache slot at @mapping & @offset.  If there is a
   * page cache page, it is returned locked and with an increased
   * refcount.
   *
139b6a6fb   Johannes Weiner   mm: filemap: upda...
1430
1431
   * If the slot holds a shadow entry of a previously evicted page, or a
   * swap entry from shmem/tmpfs, it is returned.
0cd6144aa   Johannes Weiner   mm + fs: prepare ...
1432
1433
1434
1435
1436
1437
   *
   * Otherwise, %NULL is returned.
   *
   * find_lock_entry() may sleep.
   */
  struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1438
1439
  {
  	struct page *page;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1440
  repeat:
0cd6144aa   Johannes Weiner   mm + fs: prepare ...
1441
  	page = find_get_entry(mapping, offset);
a2c16d6cb   Hugh Dickins   mm: let swap use ...
1442
  	if (page && !radix_tree_exception(page)) {
a60637c85   Nick Piggin   mm: lockless page...
1443
1444
  		lock_page(page);
  		/* Has the page been truncated? */
83929372f   Kirill A. Shutemov   filemap: prepare ...
1445
  		if (unlikely(page_mapping(page) != mapping)) {
a60637c85   Nick Piggin   mm: lockless page...
1446
  			unlock_page(page);
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
1447
  			put_page(page);
a60637c85   Nick Piggin   mm: lockless page...
1448
  			goto repeat;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1449
  		}
83929372f   Kirill A. Shutemov   filemap: prepare ...
1450
  		VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1451
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1452
1453
  	return page;
  }
0cd6144aa   Johannes Weiner   mm + fs: prepare ...
1454
1455
1456
  EXPORT_SYMBOL(find_lock_entry);
  
/**
 * pagecache_get_page - find and get a page reference
 * @mapping: the address_space to search
 * @offset: the page index
 * @fgp_flags: PCG flags
 * @gfp_mask: gfp mask to use for the page cache data page allocation
 *
 * Looks up the page cache slot at @mapping & @offset.
 *
 * PCG flags modify how the page is returned.
 *
 * @fgp_flags can be:
 *
 * - FGP_ACCESSED: the page will be marked accessed
 * - FGP_LOCK: Page is return locked
 * - FGP_CREAT: If page is not present then a new page is allocated using
 *   @gfp_mask and added to the page cache and the VM's LRU
 *   list. The page is returned locked and with an increased
 *   refcount. Otherwise, NULL is returned.
 *
 * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
 * if the GFP flags specified for FGP_CREAT are atomic.
 *
 * If there is a page cache page, it is returned with an increased refcount.
 */
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
	int fgp_flags, gfp_t gfp_mask)
{
	struct page *page;

repeat:
	page = find_get_entry(mapping, offset);
	/* Shadow/swap entries are treated as "not present" here. */
	if (radix_tree_exceptional_entry(page))
		page = NULL;
	if (!page)
		goto no_page;

	if (fgp_flags & FGP_LOCK) {
		if (fgp_flags & FGP_NOWAIT) {
			/* Caller cannot sleep: give up if the lock is
			 * contended instead of blocking. */
			if (!trylock_page(page)) {
				put_page(page);
				return NULL;
			}
		} else {
			lock_page(page);
		}

		/* Has the page been truncated? */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto repeat;
		}
		VM_BUG_ON_PAGE(page->index != offset, page);
	}

	if (page && (fgp_flags & FGP_ACCESSED))
		mark_page_accessed(page);

no_page:
	if (!page && (fgp_flags & FGP_CREAT)) {
		int err;
		if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
			gfp_mask |= __GFP_WRITE;
		if (fgp_flags & FGP_NOFS)
			gfp_mask &= ~__GFP_FS;

		page = __page_cache_alloc(gfp_mask);
		if (!page)
			return NULL;

		/* Creating without FGP_LOCK makes no sense; force it on. */
		if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
			fgp_flags |= FGP_LOCK;

		/* Init accessed so avoid atomic mark_page_accessed later */
		if (fgp_flags & FGP_ACCESSED)
			__SetPageReferenced(page);

		err = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
		if (unlikely(err)) {
			put_page(page);
			page = NULL;
			/* Somebody else inserted the page first: look it
			 * up again rather than failing. */
			if (err == -EEXIST)
				goto repeat;
		}
	}

	return page;
}
EXPORT_SYMBOL(pagecache_get_page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1545
1546
  
/**
 * find_get_entries - gang pagecache lookup
 * @mapping:	The address_space to search
 * @start:	The starting page cache index
 * @nr_entries:	The maximum number of entries
 * @entries:	Where the resulting entries are placed
 * @indices:	The cache indices corresponding to the entries in @entries
 *
 * find_get_entries() will search for and return a group of up to
 * @nr_entries entries in the mapping.  The entries are placed at
 * @entries.  find_get_entries() takes a reference against any actual
 * pages it returns.
 *
 * The search returns a group of mapping-contiguous page cache entries
 * with ascending indexes.  There may be holes in the indices due to
 * not-present pages.
 *
 * Any shadow entries of evicted pages, or swap entries from
 * shmem/tmpfs, are included in the returned array.
 *
 * find_get_entries() returns the number of pages and shadow entries
 * which were found.
 */
unsigned find_get_entries(struct address_space *mapping,
			  pgoff_t start, unsigned int nr_entries,
			  struct page **entries, pgoff_t *indices)
{
	void **slot;
	unsigned int ret = 0;
	struct radix_tree_iter iter;

	if (!nr_entries)
		return 0;

	rcu_read_lock();
	radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
		struct page *head, *page;
repeat:
		page = radix_tree_deref_slot(slot);
		if (unlikely(!page))
			continue;
		if (radix_tree_exception(page)) {
			/* Restart this slot if the tree was restructured
			 * under us. */
			if (radix_tree_deref_retry(page)) {
				slot = radix_tree_iter_retry(&iter);
				continue;
			}
			/*
			 * A shadow entry of a recently evicted page, a swap
			 * entry from shmem/tmpfs or a DAX entry.  Return it
			 * without attempting to raise page count.
			 */
			goto export;
		}

		/* Refcounts are taken on the head page of a compound page. */
		head = compound_head(page);
		if (!page_cache_get_speculative(head))
			goto repeat;

		/* The page was split under us? */
		if (compound_head(page) != head) {
			put_page(head);
			goto repeat;
		}

		/* Has the page moved? */
		if (unlikely(page != *slot)) {
			put_page(head);
			goto repeat;
		}
export:
		indices[ret] = iter.index;
		entries[ret] = page;
		if (++ret == nr_entries)
			break;
	}
	rcu_read_unlock();
	return ret;
}
  
/**
 * find_get_pages_range - gang pagecache lookup
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @end:	The final page index (inclusive)
 * @nr_pages:	The maximum number of pages
 * @pages:	Where the resulting pages are placed
 *
 * find_get_pages_range() will search for and return a group of up to @nr_pages
 * pages in the mapping starting at index @start and up to index @end
 * (inclusive).  The pages are placed at @pages.  find_get_pages_range() takes
 * a reference against the returned pages.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 * We also update @start to index the next page for the traversal.
 *
 * find_get_pages_range() returns the number of pages which were found. If this
 * number is smaller than @nr_pages, the end of specified range has been
 * reached.
 */
unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
			      pgoff_t end, unsigned int nr_pages,
			      struct page **pages)
{
	struct radix_tree_iter iter;
	void **slot;
	unsigned ret = 0;

	if (unlikely(!nr_pages))
		return 0;

	rcu_read_lock();
	radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, *start) {
		struct page *head, *page;

		if (iter.index > end)
			break;
repeat:
		page = radix_tree_deref_slot(slot);
		if (unlikely(!page))
			continue;

		if (radix_tree_exception(page)) {
			/* Restart this slot if the tree was restructured
			 * under us. */
			if (radix_tree_deref_retry(page)) {
				slot = radix_tree_iter_retry(&iter);
				continue;
			}
			/*
			 * A shadow entry of a recently evicted page,
			 * or a swap entry from shmem/tmpfs.  Skip
			 * over it.
			 */
			continue;
		}

		/* Refcounts are taken on the head page of a compound page. */
		head = compound_head(page);
		if (!page_cache_get_speculative(head))
			goto repeat;

		/* The page was split under us? */
		if (compound_head(page) != head) {
			put_page(head);
			goto repeat;
		}

		/* Has the page moved? */
		if (unlikely(page != *slot)) {
			put_page(head);
			goto repeat;
		}

		pages[ret] = page;
		if (++ret == nr_pages) {
			/* Buffer full: resume after the last page found. */
			*start = pages[ret - 1]->index + 1;
			goto out;
		}
	}

	/*
	 * We come here when there is no page beyond @end. We take care to not
	 * overflow the index @start as it confuses some of the callers. This
	 * breaks the iteration when there is page at index -1 but that is
	 * already broken anyway.
	 */
	if (end == (pgoff_t)-1)
		*start = (pgoff_t)-1;
	else
		*start = end + 1;
out:
	rcu_read_unlock();

	return ret;
}
ebf43500e   Jens Axboe   [PATCH] Add find_...
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
/**
 * find_get_pages_contig - gang contiguous pagecache lookup
 * @mapping:	The address_space to search
 * @index:	The starting page index
 * @nr_pages:	The maximum number of pages
 * @pages:	Where the resulting pages are placed
 *
 * find_get_pages_contig() works exactly like find_get_pages(), except
 * that the returned number of pages are guaranteed to be contiguous.
 *
 * find_get_pages_contig() returns the number of pages which were found.
 */
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
			       unsigned int nr_pages, struct page **pages)
{
	struct radix_tree_iter iter;
	void **slot;
	unsigned int ret = 0;

	if (unlikely(!nr_pages))
		return 0;

	rcu_read_lock();
	radix_tree_for_each_contig(slot, &mapping->i_pages, &iter, index) {
		struct page *head, *page;
repeat:
		page = radix_tree_deref_slot(slot);
		/* The hole, there no reason to continue */
		if (unlikely(!page))
			break;

		if (radix_tree_exception(page)) {
			/* Restart this slot if the tree was restructured
			 * under us. */
			if (radix_tree_deref_retry(page)) {
				slot = radix_tree_iter_retry(&iter);
				continue;
			}
			/*
			 * A shadow entry of a recently evicted page,
			 * or a swap entry from shmem/tmpfs.  Stop
			 * looking for contiguous pages.
			 */
			break;
		}

		/* Refcounts are taken on the head page of a compound page. */
		head = compound_head(page);
		if (!page_cache_get_speculative(head))
			goto repeat;

		/* The page was split under us? */
		if (compound_head(page) != head) {
			put_page(head);
			goto repeat;
		}

		/* Has the page moved? */
		if (unlikely(page != *slot)) {
			put_page(head);
			goto repeat;
		}

		/*
		 * must check mapping and index after taking the ref.
		 * otherwise we can get both false positives and false
		 * negatives, which is just confusing to the caller.
		 */
		if (page->mapping == NULL || page_to_pgoff(page) != iter.index) {
			put_page(page);
			break;
		}

		pages[ret] = page;
		if (++ret == nr_pages)
			break;
	}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(find_get_pages_contig);
ebf43500e   Jens Axboe   [PATCH] Add find_...
1795

485bb99b4   Randy Dunlap   [PATCH] kernel-do...
1796
/**
 * find_get_pages_range_tag - find and return pages in given range matching @tag
 * @mapping:	the address_space to search
 * @index:	the starting page index
 * @end:	The final page index (inclusive)
 * @tag:	the tag index
 * @nr_pages:	the maximum number of pages
 * @pages:	where the resulting pages are placed
 *
 * Like find_get_pages, except we only return pages which are tagged with
 * @tag.   We update @index to index the next page for the traversal.
 *
 * Each returned page carries an elevated reference count that the caller
 * must drop with put_page().  Returns the number of pages stored in @pages.
 */
unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
			pgoff_t end, int tag, unsigned int nr_pages,
			struct page **pages)
{
	struct radix_tree_iter iter;
	void **slot;
	unsigned ret = 0;

	if (unlikely(!nr_pages))
		return 0;

	/*
	 * Lockless walk: RCU protects the radix tree nodes, but pages can
	 * still be freed, split or moved under us, hence the recheck-and-
	 * retry dance below.
	 */
	rcu_read_lock();
	radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, *index, tag) {
		struct page *head, *page;

		if (iter.index > end)
			break;
repeat:
		page = radix_tree_deref_slot(slot);
		if (unlikely(!page))
			continue;

		if (radix_tree_exception(page)) {
			if (radix_tree_deref_retry(page)) {
				/* Tree restructured — restart this iteration. */
				slot = radix_tree_iter_retry(&iter);
				continue;
			}
			/*
			 * A shadow entry of a recently evicted page.
			 *
			 * Those entries should never be tagged, but
			 * this tree walk is lockless and the tags are
			 * looked up in bulk, one radix tree node at a
			 * time, so there is a sizable window for page
			 * reclaim to evict a page we saw tagged.
			 *
			 * Skip over it.
			 */
			continue;
		}

		/* Pin via the compound head; retry if the page is being freed. */
		head = compound_head(page);
		if (!page_cache_get_speculative(head))
			goto repeat;

		/* The page was split under us? */
		if (compound_head(page) != head) {
			put_page(head);
			goto repeat;
		}

		/* Has the page moved? */
		if (unlikely(page != *slot)) {
			put_page(head);
			goto repeat;
		}

		pages[ret] = page;
		if (++ret == nr_pages) {
			/* Array full: resume traversal after the last page. */
			*index = pages[ret - 1]->index + 1;
			goto out;
		}
	}

	/*
	 * We come here when we got at @end. We take care to not overflow the
	 * index @index as it confuses some of the callers. This breaks the
	 * iteration when there is page at index -1 but that is already broken
	 * anyway.
	 */
	if (end == (pgoff_t)-1)
		*index = (pgoff_t)-1;
	else
		*index = end + 1;
out:
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL(find_get_pages_range_tag);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1886

7e7f77498   Ross Zwisler   mm: add find_get_...
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
/**
 * find_get_entries_tag - find and return entries that match @tag
 * @mapping:	the address_space to search
 * @start:	the starting page cache index
 * @tag:	the tag index
 * @nr_entries:	the maximum number of entries
 * @entries:	where the resulting entries are placed
 * @indices:	the cache indices corresponding to the entries in @entries
 *
 * Like find_get_entries, except we only return entries which are tagged with
 * @tag.
 *
 * Exceptional entries (shadow/swap/DAX) are returned without a reference;
 * regular pages are returned with an elevated refcount the caller must drop.
 */
unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
			int tag, unsigned int nr_entries,
			struct page **entries, pgoff_t *indices)
{
	void **slot;
	unsigned int ret = 0;
	struct radix_tree_iter iter;

	if (!nr_entries)
		return 0;

	/* Lockless walk under RCU; see retry logic below for racing updates. */
	rcu_read_lock();
	radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, tag) {
		struct page *head, *page;
repeat:
		page = radix_tree_deref_slot(slot);
		if (unlikely(!page))
			continue;
		if (radix_tree_exception(page)) {
			if (radix_tree_deref_retry(page)) {
				/* Tree restructured — restart this iteration. */
				slot = radix_tree_iter_retry(&iter);
				continue;
			}

			/*
			 * A shadow entry of a recently evicted page, a swap
			 * entry from shmem/tmpfs or a DAX entry.  Return it
			 * without attempting to raise page count.
			 */
			goto export;
		}

		/* Pin via the compound head; retry if the page is being freed. */
		head = compound_head(page);
		if (!page_cache_get_speculative(head))
			goto repeat;

		/* The page was split under us? */
		if (compound_head(page) != head) {
			put_page(head);
			goto repeat;
		}

		/* Has the page moved? */
		if (unlikely(page != *slot)) {
			put_page(head);
			goto repeat;
		}
export:
		indices[ret] = iter.index;
		entries[ret] = page;
		if (++ret == nr_entries)
			break;
	}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(find_get_entries_tag);
76d42bd96   Wu Fengguang   [PATCH] readahead...
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
  /*
   * CD/DVDs are error prone. When a medium error occurs, the driver may fail
   * a _large_ part of the i/o request. Imagine the worst scenario:
   *
   *      ---R__________________________________________B__________
   *         ^ reading here                             ^ bad block(assume 4k)
   *
   * read(R) => miss => readahead(R...B) => media error => frustrating retries
   * => failing the whole request => read(R) => read(R+1) =>
   * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
   * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
   * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
   *
   * It is going insane. Fix it by quickly scaling down the readahead size.
   */
  static void shrink_readahead_size_eio(struct file *filp,
  					struct file_ra_state *ra)
  {
76d42bd96   Wu Fengguang   [PATCH] readahead...
1972
  	ra->ra_pages /= 4;
76d42bd96   Wu Fengguang   [PATCH] readahead...
1973
  }
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
1974
/**
 * generic_file_buffered_read - generic file read routine
 * @iocb:	the iocb to read
 * @iter:	data destination
 * @written:	already copied
 *
 * This is a generic file read routine, and uses the
 * mapping->a_ops->readpage() function for the actual low-level stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 *
 * Returns the total number of bytes copied (including the @written bytes
 * already done by the caller), or a negative errno if nothing was copied.
 */
static ssize_t generic_file_buffered_read(struct kiocb *iocb,
		struct iov_iter *iter, ssize_t written)
{
	struct file *filp = iocb->ki_filp;
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;
	struct file_ra_state *ra = &filp->f_ra;
	loff_t *ppos = &iocb->ki_pos;
	pgoff_t index;
	pgoff_t last_index;
	pgoff_t prev_index;
	unsigned long offset;      /* offset into pagecache page */
	unsigned int prev_offset;
	int error = 0;

	/* Nothing readable at or beyond the filesystem's maximum file size. */
	if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
		return 0;
	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);

	index = *ppos >> PAGE_SHIFT;
	prev_index = ra->prev_pos >> PAGE_SHIFT;
	prev_offset = ra->prev_pos & (PAGE_SIZE-1);
	last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
	offset = *ppos & ~PAGE_MASK;

	/* One iteration per pagecache page copied to the iterator. */
	for (;;) {
		struct page *page;
		pgoff_t end_index;
		loff_t isize;
		unsigned long nr, ret;

		cond_resched();
find_page:
		if (fatal_signal_pending(current)) {
			error = -EINTR;
			goto out;
		}

		page = find_get_page(mapping, index);
		if (!page) {
			/* Nonblocking read must not start I/O. */
			if (iocb->ki_flags & IOCB_NOWAIT)
				goto would_block;
			page_cache_sync_readahead(mapping,
					ra, filp,
					index, last_index - index);
			page = find_get_page(mapping, index);
			if (unlikely(page == NULL))
				goto no_cached_page;
		}
		if (PageReadahead(page)) {
			/* Hit a readahead marker: extend the window async. */
			page_cache_async_readahead(mapping,
					ra, filp, page,
					index, last_index - index);
		}
		if (!PageUptodate(page)) {
			if (iocb->ki_flags & IOCB_NOWAIT) {
				put_page(page);
				goto would_block;
			}

			/*
			 * See comment in do_read_cache_page on why
			 * wait_on_page_locked is used to avoid unnecessarily
			 * serialisations and why it's safe.
			 */
			error = wait_on_page_locked_killable(page);
			if (unlikely(error))
				goto readpage_error;
			if (PageUptodate(page))
				goto page_ok;

			if (inode->i_blkbits == PAGE_SHIFT ||
					!mapping->a_ops->is_partially_uptodate)
				goto page_not_up_to_date;
			/* pipes can't handle partially uptodate pages */
			if (unlikely(iter->type & ITER_PIPE))
				goto page_not_up_to_date;
			if (!trylock_page(page))
				goto page_not_up_to_date;
			/* Did it get truncated before we got the lock? */
			if (!page->mapping)
				goto page_not_up_to_date_locked;
			if (!mapping->a_ops->is_partially_uptodate(page,
							offset, iter->count))
				goto page_not_up_to_date_locked;
			unlock_page(page);
		}
page_ok:
		/*
		 * i_size must be checked after we know the page is Uptodate.
		 *
		 * Checking i_size after the check allows us to calculate
		 * the correct value for "nr", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */

		isize = i_size_read(inode);
		end_index = (isize - 1) >> PAGE_SHIFT;
		if (unlikely(!isize || index > end_index)) {
			put_page(page);
			goto out;
		}

		/* nr is the maximum number of bytes to copy from this page */
		nr = PAGE_SIZE;
		if (index == end_index) {
			/* Last page: only copy up to EOF. */
			nr = ((isize - 1) & ~PAGE_MASK) + 1;
			if (nr <= offset) {
				put_page(page);
				goto out;
			}
		}
		nr = nr - offset;

		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		/*
		 * When a sequential read accesses a page several times,
		 * only mark it as accessed the first time.
		 */
		if (prev_index != index || offset != prev_offset)
			mark_page_accessed(page);
		prev_index = index;

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 */

		ret = copy_page_to_iter(page, offset, nr, iter);
		offset += ret;
		index += offset >> PAGE_SHIFT;
		offset &= ~PAGE_MASK;
		prev_offset = offset;

		put_page(page);
		written += ret;
		if (!iov_iter_count(iter))
			goto out;
		if (ret < nr) {
			/* Short copy: destination page faulted. */
			error = -EFAULT;
			goto out;
		}
		continue;

page_not_up_to_date:
		/* Get exclusive access to the page ... */
		error = lock_page_killable(page);
		if (unlikely(error))
			goto readpage_error;

page_not_up_to_date_locked:
		/* Did it get truncated before we got the lock? */
		if (!page->mapping) {
			unlock_page(page);
			put_page(page);
			continue;
		}

		/* Did somebody else fill it already? */
		if (PageUptodate(page)) {
			unlock_page(page);
			goto page_ok;
		}

readpage:
		/*
		 * A previous I/O error may have been due to temporary
		 * failures, eg. multipath errors.
		 * PG_error will be set again if readpage fails.
		 */
		ClearPageError(page);
		/* Start the actual read. The read will unlock the page. */
		error = mapping->a_ops->readpage(filp, page);

		if (unlikely(error)) {
			if (error == AOP_TRUNCATED_PAGE) {
				/* readpage dropped the page; look it up again. */
				put_page(page);
				error = 0;
				goto find_page;
			}
			goto readpage_error;
		}

		if (!PageUptodate(page)) {
			error = lock_page_killable(page);
			if (unlikely(error))
				goto readpage_error;
			if (!PageUptodate(page)) {
				if (page->mapping == NULL) {
					/*
					 * invalidate_mapping_pages got it
					 */
					unlock_page(page);
					put_page(page);
					goto find_page;
				}
				unlock_page(page);
				/* Read completed but page still not uptodate: I/O error. */
				shrink_readahead_size_eio(filp, ra);
				error = -EIO;
				goto readpage_error;
			}
			unlock_page(page);
		}

		goto page_ok;

readpage_error:
		/* UHHUH! A synchronous read error occurred. Report it */
		put_page(page);
		goto out;

no_cached_page:
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 */
		page = page_cache_alloc(mapping);
		if (!page) {
			error = -ENOMEM;
			goto out;
		}
		error = add_to_page_cache_lru(page, mapping, index,
				mapping_gfp_constraint(mapping, GFP_KERNEL));
		if (error) {
			put_page(page);
			if (error == -EEXIST) {
				/* Somebody else added it first — use theirs. */
				error = 0;
				goto find_page;
			}
			goto out;
		}
		goto readpage;
	}

would_block:
	error = -EAGAIN;
out:
	/* Remember position for the next read's readahead heuristics. */
	ra->prev_pos = prev_index;
	ra->prev_pos <<= PAGE_SHIFT;
	ra->prev_pos |= prev_offset;

	*ppos = ((loff_t)index << PAGE_SHIFT) + offset;
	file_accessed(filp);
	return written ? written : error;
}
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
2230
/**
 * generic_file_read_iter - generic filesystem read routine
 * @iocb:	kernel I/O control block
 * @iter:	destination for the data read
 *
 * This is the "read_iter()" routine for all filesystems
 * that can use the page cache directly.
 *
 * Handles IOCB_DIRECT by calling into ->direct_IO() first, falling back
 * to buffered reads for any remainder (except on DAX).  Returns bytes
 * read or a negative errno.
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	size_t count = iov_iter_count(iter);
	ssize_t retval = 0;

	if (!count)
		goto out; /* skip atime */

	if (iocb->ki_flags & IOCB_DIRECT) {
		struct file *file = iocb->ki_filp;
		struct address_space *mapping = file->f_mapping;
		struct inode *inode = mapping->host;
		loff_t size;

		size = i_size_read(inode);
		if (iocb->ki_flags & IOCB_NOWAIT) {
			/* Can't wait for writeback: bail if cache pages exist. */
			if (filemap_range_has_page(mapping, iocb->ki_pos,
						   iocb->ki_pos + count - 1))
				return -EAGAIN;
		} else {
			/* Flush dirty pagecache so direct I/O sees fresh data. */
			retval = filemap_write_and_wait_range(mapping,
						iocb->ki_pos,
					        iocb->ki_pos + count - 1);
			if (retval < 0)
				goto out;
		}

		file_accessed(file);
		retval = mapping->a_ops->direct_IO(iocb, iter);
		if (retval >= 0) {
			iocb->ki_pos += retval;
			count -= retval;
		}
		/* Undo iterator advance for any bytes direct I/O didn't read. */
		iov_iter_revert(iter, count - iov_iter_count(iter));

		/*
		 * Btrfs can have a short DIO read if we encounter
		 * compressed extents, so if there was an error, or if
		 * we've already read everything we wanted to, or if
		 * there was a short read because we hit EOF, go ahead
		 * and return.  Otherwise fallthrough to buffered io for
		 * the rest of the read.  Buffered reads will not work for
		 * DAX files, so don't bother trying.
		 */
		if (retval < 0 || !count || iocb->ki_pos >= size ||
		    IS_DAX(inode))
			goto out;
	}

	retval = generic_file_buffered_read(iocb, iter, retval);
out:
	return retval;
}
EXPORT_SYMBOL(generic_file_read_iter);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2292

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2293
  #ifdef CONFIG_MMU
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
2294
2295
2296
2297
  /**
   * page_cache_read - adds requested page to the page cache if not already there
   * @file:	file to read
   * @offset:	page index
62eb320ab   Randy Dunlap   mm: fix filemap.c...
2298
   * @gfp_mask:	memory allocation flags
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
2299
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2300
2301
2302
   * This adds the requested page to the page cache if it isn't already there,
   * and schedules an I/O to read in its contents from disk.
   */
c20cd45eb   Michal Hocko   mm: allow GFP_{FS...
2303
  static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2304
2305
  {
  	struct address_space *mapping = file->f_mapping;
99dadfdde   Paul McQuade   mm/filemap.c: rem...
2306
  	struct page *page;
994fc28c7   Zach Brown   [PATCH] add AOP_T...
2307
  	int ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2308

994fc28c7   Zach Brown   [PATCH] add AOP_T...
2309
  	do {
453f85d43   Mel Gorman   mm: remove __GFP_...
2310
  		page = __page_cache_alloc(gfp_mask);
994fc28c7   Zach Brown   [PATCH] add AOP_T...
2311
2312
  		if (!page)
  			return -ENOMEM;
abc1be13f   Matthew Wilcox   mm/filemap.c: fix...
2313
  		ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
994fc28c7   Zach Brown   [PATCH] add AOP_T...
2314
2315
2316
2317
  		if (ret == 0)
  			ret = mapping->a_ops->readpage(file, page);
  		else if (ret == -EEXIST)
  			ret = 0; /* losing race to add is OK */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2318

09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2319
  		put_page(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2320

994fc28c7   Zach Brown   [PATCH] add AOP_T...
2321
  	} while (ret == AOP_TRUNCATED_PAGE);
99dadfdde   Paul McQuade   mm/filemap.c: rem...
2322

994fc28c7   Zach Brown   [PATCH] add AOP_T...
2323
  	return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2324
2325
2326
  }
  
  #define MMAP_LOTSAMISS  (100)
ef00e08e2   Linus Torvalds   readahead: clean ...
2327
2328
2329
2330
2331
2332
2333
2334
2335
  /*
   * Synchronous readahead happens when we don't even find
   * a page in the page cache at all.
   */
  static void do_sync_mmap_readahead(struct vm_area_struct *vma,
  				   struct file_ra_state *ra,
  				   struct file *file,
  				   pgoff_t offset)
  {
ef00e08e2   Linus Torvalds   readahead: clean ...
2336
2337
2338
  	struct address_space *mapping = file->f_mapping;
  
  	/* If we don't want any read-ahead, don't bother */
64363aad5   Joe Perches   mm: remove unused...
2339
  	if (vma->vm_flags & VM_RAND_READ)
ef00e08e2   Linus Torvalds   readahead: clean ...
2340
  		return;
275b12bf5   Wu Fengguang   readahead: return...
2341
2342
  	if (!ra->ra_pages)
  		return;
ef00e08e2   Linus Torvalds   readahead: clean ...
2343

64363aad5   Joe Perches   mm: remove unused...
2344
  	if (vma->vm_flags & VM_SEQ_READ) {
7ffc59b4d   Wu Fengguang   readahead: enforc...
2345
2346
  		page_cache_sync_readahead(mapping, ra, file, offset,
  					  ra->ra_pages);
ef00e08e2   Linus Torvalds   readahead: clean ...
2347
2348
  		return;
  	}
207d04baa   Andi Kleen   readahead: reduce...
2349
2350
  	/* Avoid banging the cache line if not needed */
  	if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
ef00e08e2   Linus Torvalds   readahead: clean ...
2351
2352
2353
2354
2355
2356
2357
2358
  		ra->mmap_miss++;
  
  	/*
  	 * Do we miss much more than hit in this file? If so,
  	 * stop bothering with read-ahead. It will only hurt.
  	 */
  	if (ra->mmap_miss > MMAP_LOTSAMISS)
  		return;
d30a11004   Wu Fengguang   readahead: record...
2359
2360
2361
  	/*
  	 * mmap read-around
  	 */
600e19afc   Roman Gushchin   mm: use only per-...
2362
2363
2364
  	ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
  	ra->size = ra->ra_pages;
  	ra->async_size = ra->ra_pages / 4;
275b12bf5   Wu Fengguang   readahead: return...
2365
  	ra_submit(ra, mapping, file);
ef00e08e2   Linus Torvalds   readahead: clean ...
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
  }
  
/*
 * Asynchronous readahead happens when we find the page and PG_readahead,
 * so we want to possibly extend the readahead further..
 */
static void do_async_mmap_readahead(struct vm_area_struct *vma,
				    struct file_ra_state *ra,
				    struct file *file,
				    struct page *page,
				    pgoff_t offset)
{
	struct address_space *mapping = file->f_mapping;

	/* If we don't want any read-ahead, don't bother */
	if (vma->vm_flags & VM_RAND_READ)
		return;
	/* This fault found the page in cache, so decay the miss counter. */
	if (ra->mmap_miss > 0)
		ra->mmap_miss--;
	/*
	 * PG_readahead marks the trigger page of a previously submitted
	 * readahead window; hitting it means we should extend the window.
	 */
	if (PageReadahead(page))
		page_cache_async_readahead(mapping, ra, file,
					   page, offset, ra->ra_pages);
}
485bb99b4   Randy Dunlap   [PATCH] kernel-do...
2389
/**
 * filemap_fault - read in file data for page fault handling
 * @vmf:	struct vm_fault containing details of the fault
 *
 * filemap_fault() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * vma->vm_mm->mmap_sem must be held on entry.
 *
 * If our return value has VM_FAULT_RETRY set, it's because
 * lock_page_or_retry() returned 0.
 * The mmap_sem has usually been released in this case.
 * See __lock_page_or_retry() for the exception.
 *
 * If our return value does not have VM_FAULT_RETRY set, the mmap_sem
 * has not been released.
 *
 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
 */
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
	int error;
	struct file *file = vmf->vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct file_ra_state *ra = &file->f_ra;
	struct inode *inode = mapping->host;
	pgoff_t offset = vmf->pgoff;
	pgoff_t max_off;
	struct page *page;
	vm_fault_t ret = 0;

	/* Fault beyond EOF: SIGBUS (rechecked later under page lock). */
	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	if (unlikely(offset >= max_off))
		return VM_FAULT_SIGBUS;

	/*
	 * Do we have something in the page cache already?
	 */
	page = find_get_page(mapping, offset);
	/* On a retried fault (FAULT_FLAG_TRIED) skip readahead entirely. */
	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
		/*
		 * We found the page, so try async readahead before
		 * waiting for the lock.
		 */
		do_async_mmap_readahead(vmf->vma, ra, file, page, offset);
	} else if (!page) {
		/* No page in the page cache at all */
		do_sync_mmap_readahead(vmf->vma, ra, file, offset);
		count_vm_event(PGMAJFAULT);
		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
		ret = VM_FAULT_MAJOR;
retry_find:
		page = find_get_page(mapping, offset);
		if (!page)
			goto no_cached_page;
	}
	/*
	 * lock_page_or_retry() may drop mmap_sem; on failure we return
	 * VM_FAULT_RETRY after releasing our page reference.
	 */
	if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) {
		put_page(page);
		return ret | VM_FAULT_RETRY;
	}

	/* Did it get truncated? */
	if (unlikely(page->mapping != mapping)) {
		unlock_page(page);
		put_page(page);
		goto retry_find;
	}
	VM_BUG_ON_PAGE(page->index != offset, page);

	/*
	 * We have a locked page in the page cache, now we need to check
	 * that it's up-to-date. If not, it is going to be due to an error.
	 */
	if (unlikely(!PageUptodate(page)))
		goto page_not_uptodate;

	/*
	 * Found the page and have a reference on it.
	 * We must recheck i_size under page lock.
	 */
	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	if (unlikely(offset >= max_off)) {
		unlock_page(page);
		put_page(page);
		return VM_FAULT_SIGBUS;
	}
	/* Hand the locked, referenced page to the fault handler. */
	vmf->page = page;
	return ret | VM_FAULT_LOCKED;

no_cached_page:
	/*
	 * We're only likely to ever get here if MADV_RANDOM is in
	 * effect.
	 */
	error = page_cache_read(file, offset, vmf->gfp_mask);

	/*
	 * The page we want has now been added to the page cache.
	 * In the unlikely event that someone removed it in the
	 * meantime, we'll just come back here and read it again.
	 */
	if (error >= 0)
		goto retry_find;

	/*
	 * An error return from page_cache_read can result if the
	 * system is low on memory, or a problem occurs while trying
	 * to schedule I/O.
	 */
	if (error == -ENOMEM)
		return VM_FAULT_OOM;
	return VM_FAULT_SIGBUS;

page_not_uptodate:
	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	ClearPageError(page);
	error = mapping->a_ops->readpage(file, page);
	if (!error) {
		wait_on_page_locked(page);
		if (!PageUptodate(page))
			error = -EIO;
	}
	put_page(page);

	if (!error || error == AOP_TRUNCATED_PAGE)
		goto retry_find;

	/*
	 * Things didn't work out. Return zero to tell the mm layer so.
	 * NOTE(review): the sentence above is stale — we actually return
	 * VM_FAULT_SIGBUS here, after shrinking the readahead window.
	 */
	shrink_readahead_size_eio(file, ra);
	return VM_FAULT_SIGBUS;
}
EXPORT_SYMBOL(filemap_fault);
82b0f8c39   Jan Kara   mm: join struct f...
2529
/*
 * Map already-cached, uptodate pages in [start_pgoff, end_pgoff] into the
 * faulting VMA without blocking. Runs under RCU with only trylocks and
 * speculative page references, so any page that cannot be pinned or locked
 * cheaply is simply skipped — filemap_fault() handles it later.
 */
void filemap_map_pages(struct vm_fault *vmf,
		pgoff_t start_pgoff, pgoff_t end_pgoff)
{
	struct radix_tree_iter iter;
	void **slot;
	struct file *file = vmf->vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	pgoff_t last_pgoff = start_pgoff;
	unsigned long max_idx;
	struct page *head, *page;

	rcu_read_lock();
	radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start_pgoff) {
		if (iter.index > end_pgoff)
			break;
repeat:
		page = radix_tree_deref_slot(slot);
		if (unlikely(!page))
			goto next;
		/* Exceptional entry: shadow/DAX, or a moved slot to retry. */
		if (radix_tree_exception(page)) {
			if (radix_tree_deref_retry(page)) {
				slot = radix_tree_iter_retry(&iter);
				continue;
			}
			goto next;
		}

		/* Pin the compound head speculatively (may race with free). */
		head = compound_head(page);
		if (!page_cache_get_speculative(head))
			goto repeat;

		/* The page was split under us? */
		if (compound_head(page) != head) {
			put_page(head);
			goto repeat;
		}

		/* Has the page moved? */
		if (unlikely(page != *slot)) {
			put_page(head);
			goto repeat;
		}

		if (!PageUptodate(page) ||
				PageReadahead(page) ||
				PageHWPoison(page))
			goto skip;
		/* Never block here; fall back to the slow fault path. */
		if (!trylock_page(page))
			goto skip;

		/* Recheck under page lock: truncation or read error races. */
		if (page->mapping != mapping || !PageUptodate(page))
			goto unlock;

		max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
		if (page->index >= max_idx)
			goto unlock;

		if (file->f_ra.mmap_miss > 0)
			file->f_ra.mmap_miss--;

		/* Advance address/pte in lockstep with the page index. */
		vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT;
		if (vmf->pte)
			vmf->pte += iter.index - last_pgoff;
		last_pgoff = iter.index;
		if (alloc_set_pte(vmf, NULL, page))
			goto unlock;
		unlock_page(page);
		goto next;
unlock:
		unlock_page(page);
skip:
		put_page(page);
next:
		/* Huge page is mapped? No need to proceed. */
		if (pmd_trans_huge(*vmf->pmd))
			break;
		if (iter.index == end_pgoff)
			break;
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(filemap_map_pages);
2bcd6454b   Souptick Joarder   mm: use new retur...
2606
/*
 * Default ->page_mkwrite: make a read-only page cache page writable.
 * Returns with the page locked (VM_FAULT_LOCKED) on success, or
 * VM_FAULT_NOPAGE if the page was truncated away under us.
 */
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct inode *inode = file_inode(vmf->vma->vm_file);
	vm_fault_t ret = VM_FAULT_LOCKED;

	/* Block writes while the filesystem is being frozen. */
	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);
	lock_page(page);
	/* Truncated while we waited for the lock? */
	if (page->mapping != inode->i_mapping) {
		unlock_page(page);
		ret = VM_FAULT_NOPAGE;
		goto out;
	}
	/*
	 * We mark the page dirty already here so that when freeze is in
	 * progress, we are guaranteed that writeback during freezing will
	 * see the dirty page and writeprotect it again.
	 */
	set_page_dirty(page);
	wait_for_stable_page(page);
out:
	sb_end_pagefault(inode->i_sb);
	return ret;
}
4fcf1c620   Jan Kara   mm: Make default ...
2631

f0f37e2f7   Alexey Dobriyan   const: mark struc...
2632
/* Default vma operations for page-cache-backed file mappings. */
const struct vm_operations_struct generic_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= filemap_page_mkwrite,
};
  
  /* This is used for a general mmap of a disk file */
  
  int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
  {
  	struct address_space *mapping = file->f_mapping;
  
  	if (!mapping->a_ops->readpage)
  		return -ENOEXEC;
  	file_accessed(file);
  	vma->vm_ops = &generic_file_vm_ops;
  	return 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
  
/*
 * This is for filesystems which do not implement ->writepage.
 */
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* Reject shared mappings that could ever become writable. */
	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
		return -EINVAL;
	return generic_file_mmap(file, vma);
}
#else
/* !CONFIG_MMU: mmap of page-cache files is unsupported; stub everything. */
int filemap_page_mkwrite(struct vm_fault *vmf)
{
	return -ENOSYS;
}
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	return -ENOSYS;
}
int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
{
	return -ENOSYS;
}
#endif /* CONFIG_MMU */
EXPORT_SYMBOL(filemap_page_mkwrite);
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);
67f9fd91f   Sasha Levin   mm: remove read_c...
2677
2678
2679
2680
2681
  static struct page *wait_on_page_read(struct page *page)
  {
  	if (!IS_ERR(page)) {
  		wait_on_page_locked(page);
  		if (!PageUptodate(page)) {
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2682
  			put_page(page);
67f9fd91f   Sasha Levin   mm: remove read_c...
2683
2684
2685
2686
2687
  			page = ERR_PTR(-EIO);
  		}
  	}
  	return page;
  }
32b635298   Mel Gorman   mm: filemap: remo...
2688
/*
 * Common implementation for read_cache_page() and read_cache_page_gfp():
 * find the page at @index, allocating and filling it via @filler if needed,
 * and return it uptodate with a reference held (never locked), or an
 * ERR_PTR on failure. @gfp is used for any new page allocation.
 */
static struct page *do_read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data,
				gfp_t gfp)
{
	struct page *page;
	int err;
repeat:
	page = find_get_page(mapping, index);
	if (!page) {
		page = __page_cache_alloc(gfp);
		if (!page)
			return ERR_PTR(-ENOMEM);
		err = add_to_page_cache_lru(page, mapping, index, gfp);
		if (unlikely(err)) {
			put_page(page);
			/* -EEXIST: somebody else inserted it first; retry. */
			if (err == -EEXIST)
				goto repeat;
			/* Presumably ENOMEM for radix tree node */
			return ERR_PTR(err);
		}

filler:
		/* Page is locked (by add_to_page_cache_lru); start the read. */
		err = filler(data, page);
		if (err < 0) {
			put_page(page);
			return ERR_PTR(err);
		}

		page = wait_on_page_read(page);
		if (IS_ERR(page))
			return page;
		goto out;
	}
	if (PageUptodate(page))
		goto out;

	/*
	 * Page is not up to date and may be locked due one of the following
	 * case a: Page is being filled and the page lock is held
	 * case b: Read/write error clearing the page uptodate status
	 * case c: Truncation in progress (page locked)
	 * case d: Reclaim in progress
	 *
	 * Case a, the page will be up to date when the page is unlocked.
	 *    There is no need to serialise on the page lock here as the page
	 *    is pinned so the lock gives no additional protection. Even if the
	 *    the page is truncated, the data is still valid if PageUptodate as
	 *    it's a race vs truncate race.
	 * Case b, the page will not be up to date
	 * Case c, the page may be truncated but in itself, the data may still
	 *    be valid after IO completes as it's a read vs truncate race. The
	 *    operation must restart if the page is not uptodate on unlock but
	 *    otherwise serialising on page lock to stabilise the mapping gives
	 *    no additional guarantees to the caller as the page lock is
	 *    released before return.
	 * Case d, similar to truncation. If reclaim holds the page lock, it
	 *    will be a race with remove_mapping that determines if the mapping
	 *    is valid on unlock but otherwise the data is valid and there is
	 *    no need to serialise with page lock.
	 *
	 * As the page lock gives no additional guarantee, we optimistically
	 * wait on the page to be unlocked and check if it's up to date and
	 * use the page if it is. Otherwise, the page lock is required to
	 * distinguish between the different cases. The motivation is that we
	 * avoid spurious serialisations and wakeups when multiple processes
	 * wait on the same page for IO to complete.
	 */
	wait_on_page_locked(page);
	if (PageUptodate(page))
		goto out;

	/* Distinguish between all the cases under the safety of the lock */
	lock_page(page);

	/* Case c or d, restart the operation */
	if (!page->mapping) {
		unlock_page(page);
		put_page(page);
		goto repeat;
	}

	/* Someone else locked and filled the page in a very small window */
	if (PageUptodate(page)) {
		unlock_page(page);
		goto out;
	}
	/* Case b: we hold the lock, so re-run the filler ourselves. */
	goto filler;
out:
	mark_page_accessed(page);
	return page;
}
0531b2aac   Linus Torvalds   mm: add new 'read...
2780
2781
  
/**
 * read_cache_page - read into page cache, fill it if needed
 * @mapping:	the page's address_space
 * @index:	the page index
 * @filler:	function to perform the read
 * @data:	first arg to filler(data, page) function, often left as NULL
 *
 * Read into the page cache. If a page already exists, and PageUptodate() is
 * not set, try to fill the page and wait for it to become unlocked.
 *
 * If the page does not get brought uptodate, return -EIO.
 *
 * Return: the uptodate page with an elevated refcount, or an ERR_PTR()
 * on failure. New page allocations use the mapping's default gfp mask.
 */
struct page *read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data)
{
	return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_page);
0531b2aac   Linus Torvalds   mm: add new 'read...
2801
2802
2803
2804
2805
2806
2807
2808
  
/**
 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
 * @mapping:	the page's address_space
 * @index:	the page index
 * @gfp:	the page allocator flags to use if allocating
 *
 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
 * any new page allocations done using the specified allocation flags.
 *
 * If the page does not get brought uptodate, return -EIO.
 */
struct page *read_cache_page_gfp(struct address_space *mapping,
				pgoff_t index,
				gfp_t gfp)
{
	/*
	 * NOTE(review): ->readpage takes (struct file *, struct page *) but
	 * is called through filler_t, i.e. (void *, struct page *), with a
	 * NULL first argument. This relies on the two pointer types being
	 * call-compatible — a long-standing kernel assumption, but worth
	 * knowing before touching this path.
	 */
	filler_t *filler = (filler_t *)mapping->a_ops->readpage;
	return do_read_cache_page(mapping, index, filler, NULL, gfp);
}
EXPORT_SYMBOL(read_cache_page_gfp);
2f718ffc1   Nick Piggin   mm: buffered writ...
2821
/*
 * Performs necessary checks before doing a write
 *
 * Can adjust writing position or amount of bytes to write.
 * Return: the number of bytes that may be written (@from may have been
 * truncated to fit rlimit/LFS/s_maxbytes), 0 for an empty write, or a
 * negative error code. (The old claim that success returns zero is wrong:
 * the final statement returns iov_iter_count(from).)
 */
inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	unsigned long limit = rlimit(RLIMIT_FSIZE);
	loff_t pos;

	if (!iov_iter_count(from))
		return 0;

	/* FIXME: this is for backwards compatibility with 2.4 */
	if (iocb->ki_flags & IOCB_APPEND)
		iocb->ki_pos = i_size_read(inode);

	pos = iocb->ki_pos;

	/* IOCB_NOWAIT is only supported for direct I/O. */
	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
		return -EINVAL;

	/* RLIMIT_FSIZE: signal and fail at the limit, truncate up to it. */
	if (limit != RLIM_INFINITY) {
		if (iocb->ki_pos >= limit) {
			send_sig(SIGXFSZ, current, 0);
			return -EFBIG;
		}
		iov_iter_truncate(from, limit - (unsigned long)pos);
	}

	/*
	 * LFS rule
	 */
	if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS &&
				!(file->f_flags & O_LARGEFILE))) {
		if (pos >= MAX_NON_LFS)
			return -EFBIG;
		iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos);
	}

	/*
	 * Are we about to exceed the fs block limit ?
	 *
	 * If we have written data it becomes a short write.  If we have
	 * exceeded without writing data we send a signal and return EFBIG.
	 * Linus frestrict idea will clean these up nicely..
	 */
	if (unlikely(pos >= inode->i_sb->s_maxbytes))
		return -EFBIG;

	iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos);
	return iov_iter_count(from);
}
EXPORT_SYMBOL(generic_write_checks);
afddba49d   Nick Piggin   fs: introduce wri...
2878
2879
2880
2881
2882
  int pagecache_write_begin(struct file *file, struct address_space *mapping,
  				loff_t pos, unsigned len, unsigned flags,
  				struct page **pagep, void **fsdata)
  {
  	const struct address_space_operations *aops = mapping->a_ops;
4e02ed4b4   Nick Piggin   fs: remove prepar...
2883
  	return aops->write_begin(file, mapping, pos, len, flags,
afddba49d   Nick Piggin   fs: introduce wri...
2884
  							pagep, fsdata);
afddba49d   Nick Piggin   fs: introduce wri...
2885
2886
2887
2888
2889
2890
2891
2892
  }
  EXPORT_SYMBOL(pagecache_write_begin);
  
  int pagecache_write_end(struct file *file, struct address_space *mapping,
  				loff_t pos, unsigned len, unsigned copied,
  				struct page *page, void *fsdata)
  {
  	const struct address_space_operations *aops = mapping->a_ops;
afddba49d   Nick Piggin   fs: introduce wri...
2893

4e02ed4b4   Nick Piggin   fs: remove prepar...
2894
  	return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
afddba49d   Nick Piggin   fs: introduce wri...
2895
2896
  }
  EXPORT_SYMBOL(pagecache_write_end);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2897
/*
 * generic_file_direct_write - write data to a file, bypassing the page cache
 * @iocb: IO state (target file; starting offset in iocb->ki_pos)
 * @from: iov_iter describing the source data
 *
 * Writes back and invalidates any page-cache pages overlapping the write
 * range, hands the data to the mapping's ->direct_IO() method, then on
 * success advances iocb->ki_pos and, where the write extended the file,
 * i_size.
 *
 * Returns the number of bytes written, 0 to ask the caller to fall back
 * to a buffered write, or a negative errno.
 */
ssize_t
generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file	*file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode	*inode = mapping->host;
	loff_t		pos = iocb->ki_pos;
	ssize_t		written;
	size_t		write_len;
	pgoff_t		end;	/* index of the last page touched by the write */

	write_len = iov_iter_count(from);
	end = (pos + write_len - 1) >> PAGE_SHIFT;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		/* If there are pages to writeback, return */
		if (filemap_range_has_page(inode->i_mapping, pos,
					   pos + iov_iter_count(from)))
			return -EAGAIN;
	} else {
		/* Flush dirty page-cache pages overlapping the write range. */
		written = filemap_write_and_wait_range(mapping, pos,
							pos + write_len - 1);
		if (written)
			goto out;
	}

	/*
	 * After a write we want buffered reads to be sure to go to disk to get
	 * the new data.  We invalidate clean cached page from the region we're
	 * about to write.  We do this *before* the write so that we can return
	 * without clobbering -EIOCBQUEUED from ->direct_IO().
	 */
	written = invalidate_inode_pages2_range(mapping,
					pos >> PAGE_SHIFT, end);
	/*
	 * If a page can not be invalidated, return 0 to fall back
	 * to buffered write.
	 */
	if (written) {
		if (written == -EBUSY)
			return 0;
		goto out;
	}

	written = mapping->a_ops->direct_IO(iocb, from);

	/*
	 * Finally, try again to invalidate clean pages which might have been
	 * cached by non-direct readahead, or faulted in by get_user_pages()
	 * if the source of the write was an mmap'ed region of the file
	 * we're writing.  Either one is a pretty crazy thing to do,
	 * so we don't support it 100%.  If this invalidation
	 * fails, tough, the write still worked...
	 *
	 * Most of the time we do not need this since dio_complete() will do
	 * the invalidation for us. However there are some file systems that
	 * do not end up with dio_complete() being called, so let's not break
	 * them by removing it completely
	 */
	if (mapping->nrpages)
		invalidate_inode_pages2_range(mapping,
					pos >> PAGE_SHIFT, end);

	if (written > 0) {
		pos += written;
		write_len -= written;
		/* Extend i_size when writing past EOF (block devices excluded). */
		if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
			i_size_write(inode, pos);
			mark_inode_dirty(inode);
		}
		iocb->ki_pos = pos;
	}
	/* Roll the iterator back so it reflects only the bytes written. */
	iov_iter_revert(from, write_len - iov_iter_count(from));
out:
	return written;
}
EXPORT_SYMBOL(generic_file_direct_write);
eb2be1893   Nick Piggin   mm: buffered writ...
2973
2974
2975
2976
  /*
   * Find or create a page at the given pagecache position. Return the locked
   * page. This function is specifically for buffered writes.
   */
54566b2c1   Nick Piggin   fs: symlink write...
2977
2978
  struct page *grab_cache_page_write_begin(struct address_space *mapping,
  					pgoff_t index, unsigned flags)
eb2be1893   Nick Piggin   mm: buffered writ...
2979
  {
eb2be1893   Nick Piggin   mm: buffered writ...
2980
  	struct page *page;
bbddabe2e   Johannes Weiner   mm: filemap: only...
2981
  	int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;
0faa70cb0   Johannes Weiner   mm: filemap: pass...
2982

54566b2c1   Nick Piggin   fs: symlink write...
2983
  	if (flags & AOP_FLAG_NOFS)
2457aec63   Mel Gorman   mm: non-atomicall...
2984
2985
2986
  		fgp_flags |= FGP_NOFS;
  
  	page = pagecache_get_page(mapping, index, fgp_flags,
45f87de57   Michal Hocko   mm: get rid of ra...
2987
  			mapping_gfp_mask(mapping));
c585a2678   Steven Rostedt   mm: remove likely...
2988
  	if (page)
2457aec63   Mel Gorman   mm: non-atomicall...
2989
  		wait_for_stable_page(page);
eb2be1893   Nick Piggin   mm: buffered writ...
2990

eb2be1893   Nick Piggin   mm: buffered writ...
2991
2992
  	return page;
  }
54566b2c1   Nick Piggin   fs: symlink write...
2993
  EXPORT_SYMBOL(grab_cache_page_write_begin);
eb2be1893   Nick Piggin   mm: buffered writ...
2994

3b93f911d   Al Viro   export generic_pe...
2995
/*
 * generic_perform_write - buffered write loop via ->write_begin/->write_end
 * @file: file being written
 * @i:    iov_iter with the source data
 * @pos:  file offset to start writing at
 *
 * Copies user data into the page cache one page at a time, bracketing each
 * copy with the address_space's ->write_begin() and ->write_end() hooks.
 *
 * Returns the number of bytes written; if nothing at all was written,
 * returns the error status instead.
 */
ssize_t generic_perform_write(struct file *file,
				struct iov_iter *i, loff_t pos)
{
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	long status = 0;
	ssize_t written = 0;
	unsigned int flags = 0;

	do {
		struct page *page;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */
		void *fsdata;

		offset = (pos & (PAGE_SIZE - 1));
		bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_count(i));

again:
		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * Not only is this an optimisation, but it is also required
		 * to check that the address is actually valid, when atomic
		 * usercopies are used, below.
		 */
		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
			status = -EFAULT;
			break;
		}

		/* Don't keep looping on behalf of a task that is dying. */
		if (fatal_signal_pending(current)) {
			status = -EINTR;
			break;
		}

		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
						&page, &fsdata);
		if (unlikely(status < 0))
			break;

		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
		flush_dcache_page(page);

		status = a_ops->write_end(file, mapping, pos, bytes, copied,
						page, fsdata);
		if (unlikely(status < 0))
			break;
		/* ->write_end() returns the number of bytes it accepted. */
		copied = status;

		cond_resched();

		iov_iter_advance(i, copied);
		if (unlikely(copied == 0)) {
			/*
			 * If we were unable to copy any data at all, we must
			 * fall back to a single segment length write.
			 *
			 * If we didn't fallback here, we could livelock
			 * because not all segments in the iov can be copied at
			 * once without a pagefault.
			 */
			bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_single_seg_count(i));
			goto again;
		}
		pos += copied;
		written += copied;

		/* Throttle the writer when too many pages are dirty. */
		balance_dirty_pages_ratelimited(mapping);
	} while (iov_iter_count(i));

	return written ? written : status;
}
EXPORT_SYMBOL(generic_perform_write);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3072

e4dd9de3c   Jan Kara   vfs: Export __gen...
3073
/**
 * __generic_file_write_iter - write data to a file
 * @iocb:	IO state structure (file, offset, etc.)
 * @from:	iov_iter with data to write
 *
 * This function does all the work needed for actually writing data to a
 * file. It does all basic checks, removes SUID from the file, updates
 * modification times and calls proper subroutines depending on whether we
 * do direct IO or a standard buffered write.
 *
 * It expects i_mutex to be grabbed unless we work on a block device or similar
 * object which does not need locking at all.
 *
 * This function does *not* take care of syncing data in case of O_SYNC write.
 * A caller has to handle it. This is mainly due to the fact that we want to
 * avoid syncing under i_mutex.
 */
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct address_space * mapping = file->f_mapping;
	struct inode 	*inode = mapping->host;
	ssize_t		written = 0;
	ssize_t		err;
	ssize_t		status;

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = inode_to_bdi(inode);
	/* Strip SUID/privileges before the data is modified. */
	err = file_remove_privs(file);
	if (err)
		goto out;

	err = file_update_time(file);
	if (err)
		goto out;

	if (iocb->ki_flags & IOCB_DIRECT) {
		loff_t pos, endbyte;

		written = generic_file_direct_write(iocb, from);
		/*
		 * If the write stopped short of completing, fall back to
		 * buffered writes.  Some filesystems do this for writes to
		 * holes, for example.  For DAX files, a buffered write will
		 * not succeed (even if it did, DAX does not handle dirty
		 * page-cache pages correctly).
		 */
		if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
			goto out;

		/* Buffered fallback for the remainder of the request. */
		status = generic_perform_write(file, from, pos = iocb->ki_pos);
		/*
		 * If generic_perform_write() returned a synchronous error
		 * then we want to return the number of bytes which were
		 * direct-written, or the error code if that was zero.  Note
		 * that this differs from normal direct-io semantics, which
		 * will return -EFOO even if some bytes were written.
		 */
		if (unlikely(status < 0)) {
			err = status;
			goto out;
		}
		/*
		 * We need to ensure that the page cache pages are written to
		 * disk and invalidated to preserve the expected O_DIRECT
		 * semantics.
		 */
		endbyte = pos + status - 1;
		err = filemap_write_and_wait_range(mapping, pos, endbyte);
		if (err == 0) {
			iocb->ki_pos = endbyte + 1;
			written += status;
			invalidate_mapping_pages(mapping,
						 pos >> PAGE_SHIFT,
						 endbyte >> PAGE_SHIFT);
		} else {
			/*
			 * We don't know how much we wrote, so just return
			 * the number of bytes which were direct-written
			 */
		}
	} else {
		written = generic_perform_write(file, from, iocb->ki_pos);
		if (likely(written > 0))
			iocb->ki_pos += written;
	}
out:
	current->backing_dev_info = NULL;
	return written ? written : err;
}
EXPORT_SYMBOL(__generic_file_write_iter);
e4dd9de3c   Jan Kara   vfs: Export __gen...
3162

e4dd9de3c   Jan Kara   vfs: Export __gen...
3163
  /**
8174202b3   Al Viro   write_iter varian...
3164
   * generic_file_write_iter - write data to a file
e4dd9de3c   Jan Kara   vfs: Export __gen...
3165
   * @iocb:	IO state structure
8174202b3   Al Viro   write_iter varian...
3166
   * @from:	iov_iter with data to write
e4dd9de3c   Jan Kara   vfs: Export __gen...
3167
   *
8174202b3   Al Viro   write_iter varian...
3168
   * This is a wrapper around __generic_file_write_iter() to be used by most
e4dd9de3c   Jan Kara   vfs: Export __gen...
3169
3170
3171
   * filesystems. It takes care of syncing the file in case of O_SYNC file
   * and acquires i_mutex as needed.
   */
8174202b3   Al Viro   write_iter varian...
3172
  ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3173
3174
  {
  	struct file *file = iocb->ki_filp;
148f948ba   Jan Kara   vfs: Introduce ne...
3175
  	struct inode *inode = file->f_mapping->host;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3176
  	ssize_t ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3177

5955102c9   Al Viro   wrappers for ->i_...
3178
  	inode_lock(inode);
3309dd04c   Al Viro   switch generic_wr...
3179
3180
  	ret = generic_write_checks(iocb, from);
  	if (ret > 0)
5f380c7fa   Al Viro   lift generic_writ...
3181
  		ret = __generic_file_write_iter(iocb, from);
5955102c9   Al Viro   wrappers for ->i_...
3182
  	inode_unlock(inode);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3183

e25922176   Christoph Hellwig   fs: simplify the ...
3184
3185
  	if (ret > 0)
  		ret = generic_write_sync(iocb, ret);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3186
3187
  	return ret;
  }
8174202b3   Al Viro   write_iter varian...
3188
  EXPORT_SYMBOL(generic_file_write_iter);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3189

cf9a2ae8d   David Howells   [PATCH] BLOCK: Mo...
3190
3191
3192
3193
3194
3195
3196
  /**
   * try_to_release_page() - release old fs-specific metadata on a page
   *
   * @page: the page which the kernel is trying to free
   * @gfp_mask: memory allocation flags (and I/O mode)
   *
   * The address_space is to try to release any data against the page
0e056eb55   mchehab@s-opensource.com   kernel-api.rst: f...
3197
   * (presumably at page->private).  If the release was successful, return '1'.
cf9a2ae8d   David Howells   [PATCH] BLOCK: Mo...
3198
3199
   * Otherwise return zero.
   *
266cf658e   David Howells   FS-Cache: Recruit...
3200
3201
3202
   * This may also be called if PG_fscache is set on a page, indicating that the
   * page is known to the local caching routines.
   *
cf9a2ae8d   David Howells   [PATCH] BLOCK: Mo...
3203
   * The @gfp_mask argument specifies whether I/O may be performed to release
71baba4b9   Mel Gorman   mm, page_alloc: r...
3204
   * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS).
cf9a2ae8d   David Howells   [PATCH] BLOCK: Mo...
3205
   *
cf9a2ae8d   David Howells   [PATCH] BLOCK: Mo...
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
   */
  int try_to_release_page(struct page *page, gfp_t gfp_mask)
  {
  	struct address_space * const mapping = page->mapping;
  
  	BUG_ON(!PageLocked(page));
  	if (PageWriteback(page))
  		return 0;
  
  	if (mapping && mapping->a_ops->releasepage)
  		return mapping->a_ops->releasepage(page, gfp_mask);
  	return try_to_free_buffers(page);
  }
  
  EXPORT_SYMBOL(try_to_release_page);