fs/dax.c

  // SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
#include <asm/pgalloc.h>

#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>

  static inline unsigned int pe_order(enum page_entry_size pe_size)
  {
  	if (pe_size == PE_SIZE_PTE)
  		return PAGE_SHIFT - PAGE_SHIFT;
  	if (pe_size == PE_SIZE_PMD)
  		return PMD_SHIFT - PAGE_SHIFT;
  	if (pe_size == PE_SIZE_PUD)
  		return PUD_SHIFT - PAGE_SHIFT;
  	return ~0;
  }
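
/*
 * Editor's illustrative sketch -- not part of the original fs/dax.c.  With
 * the common x86-64 shift values (PAGE_SHIFT = 12, PMD_SHIFT = 21,
 * PUD_SHIFT = 30), pe_order() maps the three fault sizes to XArray orders
 * as shown below; the snippet is guarded out and only documents the
 * arithmetic.
 */
#if 0	/* example only; assumes x86-64 shift values */
	unsigned int pte_order = pe_order(PE_SIZE_PTE);	/* 12 - 12 = 0  ->   1 page  */
	unsigned int pmd_order = pe_order(PE_SIZE_PMD);	/* 21 - 12 = 9  -> 512 pages */
	unsigned int pud_order = pe_order(PE_SIZE_PUD);	/* 30 - 12 = 18 -> 262144 pages */
#endif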

/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

/* The 'colour' (ie low bits) within a PMD of a page offset.  */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
#define PG_PMD_NR	(PMD_SIZE >> PAGE_SHIFT)

/* The order of a PMD entry */
  #define PMD_ORDER	(PMD_SHIFT - PAGE_SHIFT)
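
/*
 * Editor's illustrative sketch -- not part of the original file.  With 4K
 * pages and 2M PMDs, PG_PMD_COLOUR is 511 and PG_PMD_NR is 512, so masking
 * with ~PG_PMD_COLOUR rounds a page index down to the first page of its PMD.
 */
#if 0	/* example only; assumes PAGE_SIZE = 4K, PMD_SIZE = 2M */
	pgoff_t index = 1000;
	pgoff_t pmd_start = index & ~PG_PMD_COLOUR;	/* 1000 & ~511 == 512 */
	unsigned long pages_per_pmd = PG_PMD_NR;	/* 512 */
#endif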

static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
  
  static int __init init_dax_wait_table(void)
  {
  	int i;
  
  	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
  		init_waitqueue_head(wait_table + i);
  	return 0;
  }
  fs_initcall(init_dax_wait_table);

/*
 * DAX pagecache entries use XArray value entries so they can't be mistaken
 * for pages.  We use one bit for locking, one bit for the entry size (PMD)
 * and two more to tell us if the entry is a zero page or an empty entry that
 * is just used for locking.  In total four special bits.
 *
 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
 * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
 * block allocation.
 */
#define DAX_SHIFT	(4)
#define DAX_LOCKED	(1UL << 0)
#define DAX_PMD		(1UL << 1)
#define DAX_ZERO_PAGE	(1UL << 2)
#define DAX_EMPTY	(1UL << 3)

static unsigned long dax_to_pfn(void *entry)
{
	return xa_to_value(entry) >> DAX_SHIFT;
}

static void *dax_make_entry(pfn_t pfn, unsigned long flags)
{
	return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
  }
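
/*
 * Editor's illustrative sketch -- not part of the original file.  A DAX
 * entry is an XArray value entry: the low DAX_SHIFT (4) bits hold the flags
 * above and the remaining bits hold the pfn, so the encoding round-trips.
 */
#if 0	/* example only; 0x1234 is an arbitrary pfn */
	void *entry = dax_make_entry(pfn_to_pfn_t(0x1234), DAX_PMD);

	WARN_ON(dax_to_pfn(entry) != 0x1234);		/* pfn from bits >= DAX_SHIFT */
	WARN_ON(!(xa_to_value(entry) & DAX_PMD));	/* size flag kept in low bits */
	WARN_ON(xa_to_value(entry) & DAX_LOCKED);	/* lock bit only set later */
#endif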

static bool dax_is_locked(void *entry)
{
	return xa_to_value(entry) & DAX_LOCKED;
}

static unsigned int dax_entry_order(void *entry)
{
	if (xa_to_value(entry) & DAX_PMD)
		return PMD_ORDER;
	return 0;
}

static unsigned long dax_is_pmd_entry(void *entry)
{
	return xa_to_value(entry) & DAX_PMD;
}

static bool dax_is_pte_entry(void *entry)
{
	return !(xa_to_value(entry) & DAX_PMD);
}

static int dax_is_zero_entry(void *entry)
{
	return xa_to_value(entry) & DAX_ZERO_PAGE;
}

static int dax_is_empty_entry(void *entry)
{
	return xa_to_value(entry) & DAX_EMPTY;
}

/*
   * true if the entry that was found is of a smaller order than the entry
   * we were looking for
   */
  static bool dax_is_conflict(void *entry)
  {
  	return entry == XA_RETRY_ENTRY;
  }
  
  /*
 * DAX page cache entry locking
 */
struct exceptional_entry_key {
	struct xarray *xa;
	pgoff_t entry_start;
};

struct wait_exceptional_entry_queue {
	wait_queue_entry_t wait;
	struct exceptional_entry_key key;
};

static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
		void *entry, struct exceptional_entry_key *key)
{
	unsigned long hash;
	unsigned long index = xas->xa_index;
  
  	/*
  	 * If 'entry' is a PMD, align the 'index' that we use for the wait
  	 * queue to the start of that PMD.  This ensures that all offsets in
  	 * the range covered by the PMD map to the same bit lock.
  	 */
	if (dax_is_pmd_entry(entry))
		index &= ~PG_PMD_COLOUR;
	key->xa = xas->xa;
	key->entry_start = index;
	hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
	return wait_table + hash;
}

static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
		unsigned int mode, int sync, void *keyp)
  {
  	struct exceptional_entry_key *key = keyp;
  	struct wait_exceptional_entry_queue *ewait =
  		container_of(wait, struct wait_exceptional_entry_queue, wait);

	if (key->xa != ewait->key.xa ||
	    key->entry_start != ewait->key.entry_start)
  		return 0;
  	return autoremove_wake_function(wait, mode, sync, NULL);
  }
  
  /*
 * @entry may no longer be the entry at the index in the mapping.
 * The important information it's conveying is whether the entry at
 * this index used to be a PMD entry.
 */
static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
{
	struct exceptional_entry_key key;
	wait_queue_head_t *wq;

	wq = dax_entry_waitqueue(xas, entry, &key);

	/*
	 * Checking for locked entry and prepare_to_wait_exclusive() happens
	 * under the i_pages lock, ditto for entry handling in our callers.
  	 * So at this point all tasks that could have seen our entry locked
  	 * must be in the waitqueue and the following check will see them.
  	 */
  	if (waitqueue_active(wq))
  		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
  }

/*
 * Look up entry in page cache, wait for it to become unlocked if it
 * is a DAX entry and return it.  The caller must subsequently call
 * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
 * if it did.  The entry returned may have a larger order than @order.
 * If @order is larger than the order of the entry found in i_pages, this
 * function returns a dax_is_conflict entry.
 *
 * Must be called with the i_pages lock held.
 */
static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
  {
  	void *entry;
  	struct wait_exceptional_entry_queue ewait;
  	wait_queue_head_t *wq;
  
  	init_wait(&ewait.wait);
  	ewait.wait.func = wake_exceptional_entry_func;
  
  	for (;;) {
		entry = xas_find_conflict(xas);
		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
			return entry;
		if (dax_entry_order(entry) < order)
			return XA_RETRY_ENTRY;
		if (!dax_is_locked(entry))
			return entry;

		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
  		prepare_to_wait_exclusive(wq, &ewait.wait,
  					  TASK_UNINTERRUPTIBLE);
  		xas_unlock_irq(xas);
  		xas_reset(xas);
  		schedule();
  		finish_wait(wq, &ewait.wait);
  		xas_lock_irq(xas);
  	}
  }
  /*
   * The only thing keeping the address space around is the i_pages lock
   * (it's cycled in clear_inode() after removing the entries from i_pages)
   * After we call xas_unlock_irq(), we cannot touch xas->xa.
   */
  static void wait_entry_unlocked(struct xa_state *xas, void *entry)
  {
  	struct wait_exceptional_entry_queue ewait;
  	wait_queue_head_t *wq;
  
  	init_wait(&ewait.wait);
  	ewait.wait.func = wake_exceptional_entry_func;
  
  	wq = dax_entry_waitqueue(xas, entry, &ewait.key);
  	/*
  	 * Unlike get_unlocked_entry() there is no guarantee that this
  	 * path ever successfully retrieves an unlocked entry before an
  	 * inode dies. Perform a non-exclusive wait in case this path
  	 * never successfully performs its own wake up.
  	 */
  	prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
	xas_unlock_irq(xas);
	schedule();
	finish_wait(wq, &ewait.wait);
}

  static void put_unlocked_entry(struct xa_state *xas, void *entry)
  {
  	/* If we were the only waiter woken, wake the next one */
	if (entry && !dax_is_conflict(entry))
  		dax_wake_entry(xas, entry, false);
  }
  
  /*
   * We used the xa_state to get the entry, but then we locked the entry and
   * dropped the xa_lock, so we know the xa_state is stale and must be reset
   * before use.
   */
  static void dax_unlock_entry(struct xa_state *xas, void *entry)
  {
  	void *old;

	BUG_ON(dax_is_locked(entry));
  	xas_reset(xas);
  	xas_lock_irq(xas);
  	old = xas_store(xas, entry);
  	xas_unlock_irq(xas);
  	BUG_ON(!dax_is_locked(old));
  	dax_wake_entry(xas, entry, false);
  }
  
  /*
   * Return: The entry stored at this location before it was locked.
   */
  static void *dax_lock_entry(struct xa_state *xas, void *entry)
  {
  	unsigned long v = xa_to_value(entry);
  	return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
  }
  static unsigned long dax_entry_size(void *entry)
  {
  	if (dax_is_zero_entry(entry))
  		return 0;
  	else if (dax_is_empty_entry(entry))
  		return 0;
  	else if (dax_is_pmd_entry(entry))
  		return PMD_SIZE;
  	else
  		return PAGE_SIZE;
  }

static unsigned long dax_end_pfn(void *entry)
{
	return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
  }
  
  /*
   * Iterate through all mapped pfns represented by an entry, i.e. skip
   * 'empty' and 'zero' entries.
   */
  #define for_each_mapped_pfn(entry, pfn) \
  	for (pfn = dax_to_pfn(entry); \
  			pfn < dax_end_pfn(entry); pfn++)
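
/*
 * Editor's illustrative sketch -- not part of the original file.  For a PTE
 * entry the iterator visits one pfn, for a PMD entry it visits PMD_SIZE /
 * PAGE_SIZE consecutive pfns, and zero-page or empty entries are skipped
 * entirely because dax_entry_size() returns 0 for them.
 */
#if 0	/* example only */
	unsigned long pfn, nr = 0;

	for_each_mapped_pfn(entry, pfn)
		nr++;	/* 0, 1 or PMD_SIZE / PAGE_SIZE depending on the entry */
#endif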
  /*
   * TODO: for reflink+dax we need a way to associate a single page with
   * multiple address_space instances at different linear_page_index()
   * offsets.
   */
  static void dax_associate_entry(void *entry, struct address_space *mapping,
  		struct vm_area_struct *vma, unsigned long address)
{
	unsigned long size = dax_entry_size(entry), pfn, index;
	int i = 0;

	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
		return;
	index = linear_page_index(vma, address & ~(size - 1));
  	for_each_mapped_pfn(entry, pfn) {
  		struct page *page = pfn_to_page(pfn);
  
  		WARN_ON_ONCE(page->mapping);
  		page->mapping = mapping;
		page->index = index + i++;
  	}
  }
  
  static void dax_disassociate_entry(void *entry, struct address_space *mapping,
  		bool trunc)
  {
  	unsigned long pfn;
  
  	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
  		return;
  
  	for_each_mapped_pfn(entry, pfn) {
  		struct page *page = pfn_to_page(pfn);
  
  		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
  		WARN_ON_ONCE(page->mapping && page->mapping != mapping);
  		page->mapping = NULL;
		page->index = 0;
	}
}

  static struct page *dax_busy_page(void *entry)
  {
  	unsigned long pfn;
  
  	for_each_mapped_pfn(entry, pfn) {
  		struct page *page = pfn_to_page(pfn);
  
  		if (page_ref_count(page) > 1)
  			return page;
  	}
  	return NULL;
  }

/*
 * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page
 * @page: The page whose entry we want to lock
 *
 * Context: Process context.
 * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
 * not be locked.
 */
dax_entry_t dax_lock_page(struct page *page)
{
	XA_STATE(xas, NULL, 0);
	void *entry;

	/* Ensure page->mapping isn't freed while we look at it */
	rcu_read_lock();
	for (;;) {
		struct address_space *mapping = READ_ONCE(page->mapping);

		entry = NULL;
		if (!mapping || !dax_mapping(mapping))
			break;
  
  		/*
  		 * In the device-dax case there's no need to lock, a
  		 * struct dev_pagemap pin is sufficient to keep the
  		 * inode alive, and we assume we have dev_pagemap pin
  		 * otherwise we would not have a valid pfn_to_page()
  		 * translation.
  		 */
		entry = (void *)~0UL;
		if (S_ISCHR(mapping->host->i_mode))
			break;

		xas.xa = &mapping->i_pages;
		xas_lock_irq(&xas);
		if (mapping != page->mapping) {
			xas_unlock_irq(&xas);
			continue;
		}
		xas_set(&xas, page->index);
		entry = xas_load(&xas);
		if (dax_is_locked(entry)) {
			rcu_read_unlock();
			wait_entry_unlocked(&xas, entry);
			rcu_read_lock();
			continue;
		}
		dax_lock_entry(&xas, entry);
		xas_unlock_irq(&xas);
		break;
	}
	rcu_read_unlock();
	return (dax_entry_t)entry;
}

void dax_unlock_page(struct page *page, dax_entry_t cookie)
{
	struct address_space *mapping = page->mapping;
	XA_STATE(xas, &mapping->i_pages, page->index);

	if (S_ISCHR(mapping->host->i_mode))
		return;
	dax_unlock_entry(&xas, (void *)cookie);
}

  /*
 * Find page cache entry at given index. If it is a DAX entry, return it
 * with the entry locked. If the page cache doesn't contain an entry at
 * that index, add a locked empty entry.
 *
 * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
 * either return that locked entry or will return VM_FAULT_FALLBACK.
 * This will happen if there are any PTE entries within the PMD range
 * that we are requesting.
 *
 * We always favor PTE entries over PMD entries. There isn't a flow where we
 * evict PTE entries in order to 'upgrade' them to a PMD entry.  A PMD
 * insertion will fail if it finds any PTE entries already in the tree, and a
 * PTE insertion will cause an existing PMD entry to be unmapped and
 * downgraded to PTE entries.  This happens for both PMD zero pages as
 * well as PMD empty entries.
 *
 * The exception to this downgrade path is for PMD entries that have
 * real storage backing them.  We will leave these real PMD entries in
 * the tree, and PTE writes will simply dirty the entire PMD entry.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
 * persistent memory the benefit is doubtful. We can add that later if we can
 * show it helps.
 *
 * On error, this function does not return an ERR_PTR.  Instead it returns
 * a VM_FAULT code, encoded as an xarray internal entry.  The ERR_PTR values
 * overlap with xarray value entries.
 */
static void *grab_mapping_entry(struct xa_state *xas,
		struct address_space *mapping, unsigned int order)
{
	unsigned long index = xas->xa_index;
	bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */
	void *entry;

retry:
	xas_lock_irq(xas);
	entry = get_unlocked_entry(xas, order);

	if (entry) {
		if (dax_is_conflict(entry))
			goto fallback;
		if (!xa_is_value(entry)) {
			xas_set_err(xas, -EIO);
			goto out_unlock;
		}
		if (order == 0) {
			if (dax_is_pmd_entry(entry) &&
  			    (dax_is_zero_entry(entry) ||
  			     dax_is_empty_entry(entry))) {
  				pmd_downgrade = true;
  			}
  		}
  	}

	if (pmd_downgrade) {
		/*
		 * Make sure 'entry' remains valid while we drop
		 * the i_pages lock.
		 */
		dax_lock_entry(xas, entry);

  		/*
  		 * Besides huge zero pages the only other thing that gets
  		 * downgraded are empty entries which don't need to be
  		 * unmapped.
  		 */
  		if (dax_is_zero_entry(entry)) {
  			xas_unlock_irq(xas);
  			unmap_mapping_pages(mapping,
  					xas->xa_index & ~PG_PMD_COLOUR,
  					PG_PMD_NR, false);
  			xas_reset(xas);
  			xas_lock_irq(xas);
		}

  		dax_disassociate_entry(entry, mapping, false);
  		xas_store(xas, NULL);	/* undo the PMD join */
  		dax_wake_entry(xas, entry, true);
  		mapping->nrexceptional--;
  		entry = NULL;
  		xas_set(xas, index);
  	}
  	if (entry) {
  		dax_lock_entry(xas, entry);
  	} else {
		unsigned long flags = DAX_EMPTY;

		if (order > 0)
			flags |= DAX_PMD;
		entry = dax_make_entry(pfn_to_pfn_t(0), flags);
		dax_lock_entry(xas, entry);
		if (xas_error(xas))
			goto out_unlock;
		mapping->nrexceptional++;
	}
  
  out_unlock:
  	xas_unlock_irq(xas);
  	if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
  		goto retry;
  	if (xas->xa_node == XA_ERROR(-ENOMEM))
  		return xa_mk_internal(VM_FAULT_OOM);
  	if (xas_error(xas))
  		return xa_mk_internal(VM_FAULT_SIGBUS);
	return entry;
fallback:
	xas_unlock_irq(xas);
	return xa_mk_internal(VM_FAULT_FALLBACK);
}

  /**
 * dax_layout_busy_page_range - find first pinned page in @mapping
 * @mapping: address space to scan for a page with ref count > 1
 * @start: Starting offset. Page containing 'start' is included.
 * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
 *       pages from 'start' till the end of file are included.
   *
   * DAX requires ZONE_DEVICE mapped pages. These pages are never
   * 'onlined' to the page allocator so they are considered idle when
   * page->count == 1. A filesystem uses this interface to determine if
   * any page in the mapping is busy, i.e. for DMA, or other
   * get_user_pages() usages.
   *
   * It is expected that the filesystem is holding locks to block the
   * establishment of new mappings in this address_space. I.e. it expects
   * to be able to run unmap_mapping_range() and subsequently not race
   * mapping_mapped() becoming true.
   */
struct page *dax_layout_busy_page_range(struct address_space *mapping,
					loff_t start, loff_t end)
{
	void *entry;
	unsigned int scanned = 0;
	struct page *page = NULL;
	pgoff_t start_idx = start >> PAGE_SHIFT;
	pgoff_t end_idx;
	XA_STATE(xas, &mapping->i_pages, start_idx);
  
  	/*
  	 * In the 'limited' case get_user_pages() for dax is disabled.
  	 */
  	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
  		return NULL;
  
  	if (!dax_mapping(mapping) || !mapping_mapped(mapping))
  		return NULL;
  	/* If end == LLONG_MAX, all pages from start to till end of file */
  	if (end == LLONG_MAX)
  		end_idx = ULONG_MAX;
  	else
  		end_idx = end >> PAGE_SHIFT;

	/*
	 * If we race get_user_pages_fast() here either we'll see the
	 * elevated page count in the iteration and wait, or
	 * get_user_pages_fast() will see that the page it took a reference
	 * against is no longer mapped in the page tables and bail to the
	 * get_user_pages() slow path.  The slow path is protected by
	 * pte_lock() and pmd_lock(). New references are not taken without
	 * holding those locks, and unmap_mapping_pages() will not zero the
	 * pte or pmd without holding the respective lock, so we are
	 * guaranteed to either see new references or prevent new
	 * references from being established.
	 */
	unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);

	xas_lock_irq(&xas);
	xas_for_each(&xas, entry, end_idx) {
		if (WARN_ON_ONCE(!xa_is_value(entry)))
			continue;
		if (unlikely(dax_is_locked(entry)))
			entry = get_unlocked_entry(&xas, 0);
		if (entry)
			page = dax_busy_page(entry);
		put_unlocked_entry(&xas, entry);
		if (page)
			break;
  		if (++scanned % XA_CHECK_SCHED)
  			continue;
  
  		xas_pause(&xas);
  		xas_unlock_irq(&xas);
  		cond_resched();
  		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);
	return page;
}
  EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);
  
  struct page *dax_layout_busy_page(struct address_space *mapping)
  {
  	return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
  }
EXPORT_SYMBOL_GPL(dax_layout_busy_page);

static int __dax_invalidate_entry(struct address_space *mapping,
					  pgoff_t index, bool trunc)
{
	XA_STATE(xas, &mapping->i_pages, index);
	int ret = 0;
	void *entry;

	xas_lock_irq(&xas);
	entry = get_unlocked_entry(&xas, 0);
	if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
		goto out;
	if (!trunc &&
	    (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
	     xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
		goto out;
	dax_disassociate_entry(entry, mapping, trunc);
	xas_store(&xas, NULL);
	mapping->nrexceptional--;
	ret = 1;
out:
	put_unlocked_entry(&xas, entry);
	xas_unlock_irq(&xas);
	return ret;
}

  /*
 * Delete DAX entry at @index from @mapping.  Wait for it
 * to be unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	int ret = __dax_invalidate_entry(mapping, index, true);

	/*
	 * This gets called from truncate / punch_hole path. As such, the caller
	 * must hold locks protecting against concurrent modifications of the
	 * page cache (usually fs-private i_mmap_sem for writing). Since the
	 * caller has seen a DAX entry for this index, we better find it
	 * at that index as well...
	 */
	WARN_ON_ONCE(!ret);
	return ret;
}

/*
 * Invalidate DAX entry if it is clean.
 */
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
				      pgoff_t index)
{
	return __dax_invalidate_entry(mapping, index, false);
}

  static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_dev,
  			     sector_t sector, struct page *to, unsigned long vaddr)
{
	void *vto, *kaddr;
	pgoff_t pgoff;
	long rc;
	int id;

	rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
	if (rc)
		return rc;

	id = dax_read_lock();
	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(PAGE_SIZE), &kaddr, NULL);
	if (rc < 0) {
		dax_read_unlock(id);
		return rc;
	}
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)kaddr, vaddr, to);
	kunmap_atomic(vto);
	dax_read_unlock(id);
	return 0;
}

  /*
   * By this point grab_mapping_entry() has ensured that we have a locked entry
   * of the appropriate size so we don't have to worry about downgrading PMDs to
   * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
   * already in the tree, we will skip the insertion and just dirty the PMD as
   * appropriate.
   */
static void *dax_insert_entry(struct xa_state *xas,
		struct address_space *mapping, struct vm_fault *vmf,
		void *entry, pfn_t pfn, unsigned long flags, bool dirty)
{
	void *new_entry = dax_make_entry(pfn, flags);

	if (dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
		unsigned long index = xas->xa_index;
		/* we are replacing a zero page with block mapping */
		if (dax_is_pmd_entry(entry))
			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
					PG_PMD_NR, false);
		else /* pte entry */
			unmap_mapping_pages(mapping, index, 1, false);
	}

  	xas_reset(xas);
  	xas_lock_irq(xas);
	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
		void *old;

		dax_disassociate_entry(entry, mapping, false);
		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
		/*
		 * Only swap our new entry into the page cache if the current
		 * entry is a zero page or an empty entry.  If a normal PTE or
		 * PMD entry is already in the cache, we leave it alone.  This
  		 * means that if we are trying to insert a PTE and the
  		 * existing entry is a PMD, we will just leave the PMD in the
  		 * tree and dirty it if necessary.
  		 */
		old = dax_lock_entry(xas, new_entry);
		WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
					DAX_LOCKED));
		entry = new_entry;
	} else {
		xas_load(xas);	/* Walk the xa_state */
	}

	if (dirty)
		xas_set_mark(xas, PAGECACHE_TAG_DIRTY);

	xas_unlock_irq(xas);
	return entry;
}

  static inline
  unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
  {
  	unsigned long address;
  
  	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
  	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
  	return address;
  }
  
  /* Walk all mappings of a given index of a file and writeprotect them */
static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
		unsigned long pfn)
{
	struct vm_area_struct *vma;
	pte_t pte, *ptep = NULL;
	pmd_t *pmdp = NULL;
	spinlock_t *ptl;

	i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
		struct mmu_notifier_range range;
		unsigned long address;

		cond_resched();

		if (!(vma->vm_flags & VM_SHARED))
			continue;

		address = pgoff_address(index, vma);
  
  		/*
		 * Note because we provide range to follow_pte_pmd it will
		 * call mmu_notifier_invalidate_range_start() on our behalf
		 * before taking any lock.
		 */
		if (follow_pte_pmd(vma->vm_mm, address, &range,
				   &ptep, &pmdp, &ptl))
			continue;

  		/*
  		 * No need to call mmu_notifier_invalidate_range() as we are
  		 * downgrading page table protection not changing it to point
  		 * to a new page.
  		 *
		 * See Documentation/vm/mmu_notifier.rst
		 */
  		if (pmdp) {
  #ifdef CONFIG_FS_DAX_PMD
  			pmd_t pmd;
  
  			if (pfn != pmd_pfn(*pmdp))
  				goto unlock_pmd;
			if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
				goto unlock_pmd;

			flush_cache_page(vma, address, pfn);
			pmd = pmdp_invalidate(vma, address, pmdp);
			pmd = pmd_wrprotect(pmd);
			pmd = pmd_mkclean(pmd);
			set_pmd_at(vma->vm_mm, address, pmdp, pmd);
unlock_pmd:
#endif
			spin_unlock(ptl);
  		} else {
  			if (pfn != pte_pfn(*ptep))
  				goto unlock_pte;
  			if (!pte_dirty(*ptep) && !pte_write(*ptep))
  				goto unlock_pte;
  
  			flush_cache_page(vma, address, pfn);
  			pte = ptep_clear_flush(vma, address, ptep);
  			pte = pte_wrprotect(pte);
  			pte = pte_mkclean(pte);
  			set_pte_at(vma->vm_mm, address, ptep, pte);
unlock_pte:
			pte_unmap_unlock(ptep, ptl);
		}

		mmu_notifier_invalidate_range_end(&range);
	}
	i_mmap_unlock_read(mapping);
}

  static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
  		struct address_space *mapping, void *entry)
{
	unsigned long pfn, index, count;
	long ret = 0;

	/*
	 * A page got tagged dirty in DAX mapping? Something is seriously
	 * wrong.
	 */
	if (WARN_ON(!xa_is_value(entry)))
		return -EIO;

	if (unlikely(dax_is_locked(entry))) {
		void *old_entry = entry;

		entry = get_unlocked_entry(xas, 0);
  
  		/* Entry got punched out / reallocated? */
  		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
  			goto put_unlocked;
  		/*
  		 * Entry got reallocated elsewhere? No need to writeback.
  		 * We have to compare pfns as we must not bail out due to
  		 * difference in lockbit or entry type.
  		 */
  		if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
  			goto put_unlocked;
  		if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
  					dax_is_zero_entry(entry))) {
  			ret = -EIO;
  			goto put_unlocked;
  		}
  
  		/* Another fsync thread may have already done this entry */
  		if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
  			goto put_unlocked;
	}

	/* Lock the entry to serialize with page faults */
	dax_lock_entry(xas, entry);

  	/*
  	 * We can clear the tag now but we have to be careful so that concurrent
  	 * dax_writeback_one() calls for the same index cannot finish before we
  	 * actually flush the caches. This is achieved as the calls will look
	 * at the entry only under the i_pages lock and once they do that
	 * they will see the entry locked and wait for it to unlock.
	 */
	xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
	xas_unlock_irq(xas);

	/*
	 * If dax_writeback_mapping_range() was given a wbc->range_start
	 * in the middle of a PMD, the 'index' we use needs to be
	 * aligned to the start of the PMD.
	 * This allows us to flush for PMD_SIZE and not have to worry about
	 * partial PMD writebacks.
	 */
	pfn = dax_to_pfn(entry);
	count = 1UL << dax_entry_order(entry);
	index = xas->xa_index & ~(count - 1);

	dax_entry_mkclean(mapping, index, pfn);
	dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
  	/*
  	 * After we have flushed the cache, we can clear the dirty tag. There
  	 * cannot be new dirty data in the pfn after the flush has completed as
  	 * the pfn mappings are writeprotected and fault waits for mapping
  	 * entry lock.
  	 */
	xas_reset(xas);
	xas_lock_irq(xas);
	xas_store(xas, entry);
	xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
	dax_wake_entry(xas, entry, false);
	trace_dax_writeback_one(mapping->host, index, count);
	return ret;

 put_unlocked:
	put_unlocked_entry(xas, entry);
  	return ret;
  }
  
  /*
   * Flush the mapping to the persistent domain within the byte range of [start,
   * end]. This is required by data integrity operations to ensure file data is
   * on persistent storage prior to completion of the operation.
   */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct dax_device *dax_dev, struct writeback_control *wbc)
{
	XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
	struct inode *inode = mapping->host;
	pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
	void *entry;
	int ret = 0;
	unsigned int scanned = 0;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	trace_dax_writeback_range(inode, xas.xa_index, end_index);

	tag_pages_for_writeback(mapping, xas.xa_index, end_index);

  	xas_lock_irq(&xas);
  	xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
  		ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
  		if (ret < 0) {
  			mapping_set_error(mapping, ret);
			break;
		}

  		if (++scanned % XA_CHECK_SCHED)
  			continue;
  
  		xas_pause(&xas);
  		xas_unlock_irq(&xas);
  		cond_resched();
  		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);
	trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
	return ret;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
{
	return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
  }
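
/*
 * Editor's illustrative sketch -- not part of the original file.  The iomap
 * describes an extent that starts at file position iomap->offset and lives
 * at byte address iomap->addr on the block device; the page-aligned distance
 * of 'pos' into the extent is converted to 512-byte sectors.  Hypothetical
 * numbers: offset = 1M, addr = 8M, pos = 1M + 6000.
 */
#if 0	/* example only */
	/* (8M + ((1M + 6000) & PAGE_MASK) - 1M) >> 9 == (8M + 4096) / 512 == 16392 */
	sector_t sector = dax_iomap_sector(iomap, (1UL << 20) + 6000);
#endif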

static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
			 pfn_t *pfnp)
{
	const sector_t sector = dax_iomap_sector(iomap, pos);
	pgoff_t pgoff;
	int id, rc;
	long length;

	rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
	if (rc)
		return rc;
	id = dax_read_lock();
	length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
				   NULL, pfnp);
	if (length < 0) {
		rc = length;
		goto out;
	}
  	rc = -EINVAL;
  	if (PFN_PHYS(length) < size)
  		goto out;
  	if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
  		goto out;
  	/* For larger pages we need devmap */
  	if (length > 1 && !pfn_t_devmap(*pfnp))
  		goto out;
  	rc = 0;
  out:
	dax_read_unlock(id);
	return rc;
}

  /*
 * The user has performed a load from a hole in the file.  Allocating a new
 * page in the file would cause excessive storage usage for workloads with
 * sparse files.  Instead we insert a read-only mapping of the 4k zero page.
 * If this page is ever written to we will re-fault and change the mapping to
 * point to real DAX storage instead.
 */
static vm_fault_t dax_load_hole(struct xa_state *xas,
		struct address_space *mapping, void **entry,
		struct vm_fault *vmf)
{
	struct inode *inode = mapping->host;
	unsigned long vaddr = vmf->address;
	pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
	vm_fault_t ret;

	*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
			DAX_ZERO_PAGE, false);
	ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
	trace_dax_load_hole(inode, vmf, ret);
	return ret;
}

  s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
679c8bd3b   Christoph Hellwig   dax: export a low...
1003
  {
4f3b4f161   Vivek Goyal   dax,iomap: Add he...
1004
  	sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
0a23f9ffa   Vivek Goyal   dax: Use new dax ...
1005
1006
1007
1008
  	pgoff_t pgoff;
  	long rc, id;
  	void *kaddr;
  	bool page_aligned = false;
81ee8e52a   Matthew Wilcox (Oracle)   iomap: Change cal...
1009
1010
  	unsigned offset = offset_in_page(pos);
  	unsigned size = min_t(u64, PAGE_SIZE - offset, length);
cccbce671   Dan Williams   filesystem-dax: c...
1011

0a23f9ffa   Vivek Goyal   dax: Use new dax ...
1012
  	if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) &&
81ee8e52a   Matthew Wilcox (Oracle)   iomap: Change cal...
1013
  	    (size == PAGE_SIZE))
0a23f9ffa   Vivek Goyal   dax: Use new dax ...
1014
  		page_aligned = true;
cccbce671   Dan Williams   filesystem-dax: c...
1015

4f3b4f161   Vivek Goyal   dax,iomap: Add he...
1016
  	rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff);
0a23f9ffa   Vivek Goyal   dax: Use new dax ...
1017
1018
1019
1020
1021
1022
  	if (rc)
  		return rc;
  
  	id = dax_read_lock();
  
  	if (page_aligned)
81ee8e52a   Matthew Wilcox (Oracle)   iomap: Change cal...
1023
  		rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
0a23f9ffa   Vivek Goyal   dax: Use new dax ...
1024
  	else
4f3b4f161   Vivek Goyal   dax,iomap: Add he...
1025
  		rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
0a23f9ffa   Vivek Goyal   dax: Use new dax ...
1026
1027
1028
1029
1030
1031
  	if (rc < 0) {
  		dax_read_unlock(id);
  		return rc;
  	}
  
  	if (!page_aligned) {
81f558701   Dan Williams   x86, dax: replace...
1032
  		memset(kaddr + offset, 0, size);
4f3b4f161   Vivek Goyal   dax,iomap: Add he...
1033
  		dax_flush(iomap->dax_dev, kaddr + offset, size);
4b0228fa1   Vishal Verma   dax: for truncate...
1034
  	}
0a23f9ffa   Vivek Goyal   dax: Use new dax ...
1035
  	dax_read_unlock(id);
81ee8e52a   Matthew Wilcox (Oracle)   iomap: Change cal...
1036
  	return size;
679c8bd3b   Christoph Hellwig   dax: export a low...
1037
  }
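  /*
   * A minimal caller sketch (hypothetical; example_dax_zero_range is not
   * part of this file): dax_iomap_zero() zeroes at most one page worth of
   * data per call, so a caller covering a longer range loops on the byte
   * count it returns.  "iomap" is assumed to already describe a written
   * extent covering "pos".
   */
  static int example_dax_zero_range(struct iomap *iomap, loff_t pos, u64 length)
  {
  	while (length > 0) {
  		s64 bytes = dax_iomap_zero(pos, length, iomap);
  
  		if (bytes < 0)
  			return bytes;	/* pgoff lookup or direct access failed */
  		pos += bytes;
  		length -= bytes;
  	}
  	return 0;
  }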
679c8bd3b   Christoph Hellwig   dax: export a low...
1038

a254e5681   Christoph Hellwig   dax: provide an i...
1039
  static loff_t
11c59c92f   Ross Zwisler   dax: correct dax ...
1040
  dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
c039b9979   Goldwyn Rodrigues   iomap: use a srcm...
1041
  		struct iomap *iomap, struct iomap *srcmap)
a254e5681   Christoph Hellwig   dax: provide an i...
1042
  {
cccbce671   Dan Williams   filesystem-dax: c...
1043
1044
  	struct block_device *bdev = iomap->bdev;
  	struct dax_device *dax_dev = iomap->dax_dev;
a254e5681   Christoph Hellwig   dax: provide an i...
1045
1046
1047
  	struct iov_iter *iter = data;
  	loff_t end = pos + length, done = 0;
  	ssize_t ret = 0;
a77d47864   Dan Williams   dax: Report bytes...
1048
  	size_t xfer;
cccbce671   Dan Williams   filesystem-dax: c...
1049
  	int id;
a254e5681   Christoph Hellwig   dax: provide an i...
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
  
  	if (iov_iter_rw(iter) == READ) {
  		end = min(end, i_size_read(inode));
  		if (pos >= end)
  			return 0;
  
  		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
  			return iov_iter_zero(min(length, end - pos), iter);
  	}
  
  	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
  		return -EIO;
e3fce68cd   Jan Kara   dax: Avoid page i...
1062
1063
1064
1065
1066
  	/*
  	 * A write can allocate a block for an area which has a hole page
  	 * mapped into the page tables. We have to tear down these mappings so
  	 * that data written by write(2) is visible in mmap.
  	 */
cd656375f   Jan Kara   mm: fix data corr...
1067
  	if (iomap->flags & IOMAP_F_NEW) {
e3fce68cd   Jan Kara   dax: Avoid page i...
1068
1069
1070
1071
  		invalidate_inode_pages2_range(inode->i_mapping,
  					      pos >> PAGE_SHIFT,
  					      (end - 1) >> PAGE_SHIFT);
  	}
cccbce671   Dan Williams   filesystem-dax: c...
1072
  	id = dax_read_lock();
a254e5681   Christoph Hellwig   dax: provide an i...
1073
1074
  	while (pos < end) {
  		unsigned offset = pos & (PAGE_SIZE - 1);
cccbce671   Dan Williams   filesystem-dax: c...
1075
1076
  		const size_t size = ALIGN(length + offset, PAGE_SIZE);
  		const sector_t sector = dax_iomap_sector(iomap, pos);
a254e5681   Christoph Hellwig   dax: provide an i...
1077
  		ssize_t map_len;
cccbce671   Dan Williams   filesystem-dax: c...
1078
1079
  		pgoff_t pgoff;
  		void *kaddr;
a254e5681   Christoph Hellwig   dax: provide an i...
1080

d1908f525   Michal Hocko   fs: break out of ...
1081
1082
1083
1084
  		if (fatal_signal_pending(current)) {
  			ret = -EINTR;
  			break;
  		}
cccbce671   Dan Williams   filesystem-dax: c...
1085
1086
1087
1088
1089
  		ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
  		if (ret)
  			break;
  
  		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
86ed913b0   Huaisheng Ye   filesystem-dax: D...
1090
  				&kaddr, NULL);
a254e5681   Christoph Hellwig   dax: provide an i...
1091
1092
1093
1094
  		if (map_len < 0) {
  			ret = map_len;
  			break;
  		}
cccbce671   Dan Williams   filesystem-dax: c...
1095
1096
  		map_len = PFN_PHYS(map_len);
  		kaddr += offset;
a254e5681   Christoph Hellwig   dax: provide an i...
1097
1098
1099
  		map_len -= offset;
  		if (map_len > end - pos)
  			map_len = end - pos;
a2e050f5a   Ross Zwisler   dax: explain how ...
1100
1101
1102
1103
1104
  		/*
  		 * The userspace address for the memory copy has already been
  		 * validated via access_ok() in either vfs_read() or
  		 * vfs_write(), depending on which operation we are doing.
  		 */
a254e5681   Christoph Hellwig   dax: provide an i...
1105
  		if (iov_iter_rw(iter) == WRITE)
a77d47864   Dan Williams   dax: Report bytes...
1106
  			xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
fec53774f   Dan Williams   filesystem-dax: c...
1107
  					map_len, iter);
a254e5681   Christoph Hellwig   dax: provide an i...
1108
  		else
a77d47864   Dan Williams   dax: Report bytes...
1109
  			xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
b3a9a0c36   Dan Williams   dax: Introduce a ...
1110
  					map_len, iter);
a254e5681   Christoph Hellwig   dax: provide an i...
1111

a77d47864   Dan Williams   dax: Report bytes...
1112
1113
1114
1115
1116
1117
1118
1119
  		pos += xfer;
  		length -= xfer;
  		done += xfer;
  
  		if (xfer == 0)
  			ret = -EFAULT;
  		if (xfer < map_len)
  			break;
a254e5681   Christoph Hellwig   dax: provide an i...
1120
  	}
cccbce671   Dan Williams   filesystem-dax: c...
1121
  	dax_read_unlock(id);
a254e5681   Christoph Hellwig   dax: provide an i...
1122
1123
1124
1125
1126
  
  	return done ? done : ret;
  }
  
  /**
11c59c92f   Ross Zwisler   dax: correct dax ...
1127
   * dax_iomap_rw - Perform I/O to a DAX file
a254e5681   Christoph Hellwig   dax: provide an i...
1128
1129
1130
1131
1132
1133
1134
1135
1136
   * @iocb:	The control block for this I/O
   * @iter:	The addresses to do I/O from or to
   * @ops:	iomap ops passed from the file system
   *
   * This function performs read and write operations to directly mapped
   * persistent memory.  The caller needs to take care of read/write exclusion
   * and evicting any page cache pages in the region under I/O.
   */
  ssize_t
11c59c92f   Ross Zwisler   dax: correct dax ...
1137
  dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
8ff6daa17   Christoph Hellwig   iomap: constify s...
1138
  		const struct iomap_ops *ops)
a254e5681   Christoph Hellwig   dax: provide an i...
1139
1140
1141
1142
1143
  {
  	struct address_space *mapping = iocb->ki_filp->f_mapping;
  	struct inode *inode = mapping->host;
  	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
  	unsigned flags = 0;
168316db3   Christoph Hellwig   dax: assert that ...
1144
  	if (iov_iter_rw(iter) == WRITE) {
9ffbe8ac0   Nikolay Borisov   locking/lockdep: ...
1145
  		lockdep_assert_held_write(&inode->i_rwsem);
a254e5681   Christoph Hellwig   dax: provide an i...
1146
  		flags |= IOMAP_WRITE;
168316db3   Christoph Hellwig   dax: assert that ...
1147
1148
1149
  	} else {
  		lockdep_assert_held(&inode->i_rwsem);
  	}
a254e5681   Christoph Hellwig   dax: provide an i...
1150

96222d538   Jeff Moyer   dax: pass NOWAIT ...
1151
1152
  	if (iocb->ki_flags & IOCB_NOWAIT)
  		flags |= IOMAP_NOWAIT;
a254e5681   Christoph Hellwig   dax: provide an i...
1153
1154
  	while (iov_iter_count(iter)) {
  		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
11c59c92f   Ross Zwisler   dax: correct dax ...
1155
  				iter, dax_iomap_actor);
a254e5681   Christoph Hellwig   dax: provide an i...
1156
1157
1158
1159
1160
1161
1162
1163
1164
  		if (ret <= 0)
  			break;
  		pos += ret;
  		done += ret;
  	}
  
  	iocb->ki_pos += done;
  	return done ? done : ret;
  }
11c59c92f   Ross Zwisler   dax: correct dax ...
1165
  EXPORT_SYMBOL_GPL(dax_iomap_rw);
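  /*
   * A caller sketch (hypothetical; example_dax_read_iter and
   * example_iomap_ops are placeholders, in the style of ext2/xfs): the
   * filesystem's ->read_iter() takes the inode lock itself, since
   * dax_iomap_rw() leaves read/write exclusion to its callers.
   */
  extern const struct iomap_ops example_iomap_ops;
  
  static ssize_t example_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
  {
  	struct inode *inode = file_inode(iocb->ki_filp);
  	ssize_t ret;
  
  	if (!iov_iter_count(to))
  		return 0; /* skip atime update for zero-length reads */
  
  	inode_lock_shared(inode);
  	ret = dax_iomap_rw(iocb, to, &example_iomap_ops);
  	inode_unlock_shared(inode);
  
  	file_accessed(iocb->ki_filp);
  	return ret;
  }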
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1166

ab77dab46   Souptick Joarder   fs/dax.c: use new...
1167
  static vm_fault_t dax_fault_return(int error)
9f141d6ef   Jan Kara   dax: Call ->iomap...
1168
1169
1170
  {
  	if (error == 0)
  		return VM_FAULT_NOPAGE;
c9aed74e6   Souptick Joarder   fs/dax: Convert t...
1171
  	return vmf_error(error);
9f141d6ef   Jan Kara   dax: Call ->iomap...
1172
  }
aaa422c4c   Dan Williams   fs, dax: unify IO...
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
  /*
   * MAP_SYNC on a dax mapping guarantees dirty metadata is
   * flushed on write-faults (non-cow), but not read-faults.
   */
  static bool dax_fault_is_synchronous(unsigned long flags,
  		struct vm_area_struct *vma, struct iomap *iomap)
  {
  	return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
  		&& (iomap->flags & IOMAP_F_DIRTY);
  }
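  /*
   * Userspace-side illustration (hypothetical helper, not part of this
   * file): VM_SYNC is set when the file is mapped with MAP_SYNC, which the
   * kernel only accepts together with MAP_SHARED_VALIDATE.  On older libcs
   * these flags may need <linux/mman.h> rather than <sys/mman.h>.
   */
  #define _GNU_SOURCE
  #include <sys/mman.h>
  
  void *map_pmem_sync(int fd, size_t len)
  {
  	return mmap(NULL, len, PROT_READ | PROT_WRITE,
  		    MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
  }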
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1183
  static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
c0b246259   Jan Kara   dax: pass detaile...
1184
  			       int *iomap_errp, const struct iomap_ops *ops)
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1185
  {
a0987ad5c   Jan Kara   dax: Create local...
1186
1187
  	struct vm_area_struct *vma = vmf->vma;
  	struct address_space *mapping = vma->vm_file->f_mapping;
b15cd8006   Matthew Wilcox   dax: Convert page...
1188
  	XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1189
  	struct inode *inode = mapping->host;
1a29d85eb   Jan Kara   mm: use vmf->addr...
1190
  	unsigned long vaddr = vmf->address;
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1191
  	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
c039b9979   Goldwyn Rodrigues   iomap: use a srcm...
1192
1193
  	struct iomap iomap = { .type = IOMAP_HOLE };
  	struct iomap srcmap = { .type = IOMAP_HOLE };
9484ab1bf   Jan Kara   dax: Introduce IO...
1194
  	unsigned flags = IOMAP_FAULT;
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1195
  	int error, major = 0;
d2c43ef13   Jan Kara   dax: Create local...
1196
  	bool write = vmf->flags & FAULT_FLAG_WRITE;
caa51d26f   Jan Kara   dax, iomap: Add s...
1197
  	bool sync;
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1198
  	vm_fault_t ret = 0;
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1199
  	void *entry;
1b5a1cb21   Jan Kara   dax: Inline dax_i...
1200
  	pfn_t pfn;
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1201

ab77dab46   Souptick Joarder   fs/dax.c: use new...
1202
  	trace_dax_pte_fault(inode, vmf, ret);
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1203
1204
1205
1206
1207
  	/*
  	 * Check whether offset isn't beyond end of file now. Caller is supposed
  	 * to hold locks serializing us with truncate / punch hole so this is
  	 * a reliable test.
  	 */
a9c42b33e   Ross Zwisler   dax: add tracepoi...
1208
  	if (pos >= i_size_read(inode)) {
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1209
  		ret = VM_FAULT_SIGBUS;
a9c42b33e   Ross Zwisler   dax: add tracepoi...
1210
1211
  		goto out;
  	}
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1212

d2c43ef13   Jan Kara   dax: Create local...
1213
  	if (write && !vmf->cow_page)
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1214
  		flags |= IOMAP_WRITE;
b15cd8006   Matthew Wilcox   dax: Convert page...
1215
1216
1217
  	entry = grab_mapping_entry(&xas, mapping, 0);
  	if (xa_is_internal(entry)) {
  		ret = xa_to_internal(entry);
13e451fdc   Jan Kara   dax: fix data cor...
1218
1219
  		goto out;
  	}
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1220
  	/*
e2093926a   Ross Zwisler   dax: fix race bet...
1221
1222
1223
1224
1225
1226
  	 * It is possible, particularly with mixed reads & writes to private
  	 * mappings, that we have raced with a PMD fault that overlaps with
  	 * the PTE we need to set up.  If so just return and the fault will be
  	 * retried.
  	 */
  	if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1227
  		ret = VM_FAULT_NOPAGE;
e2093926a   Ross Zwisler   dax: fix race bet...
1228
1229
1230
1231
  		goto unlock_entry;
  	}
  
  	/*
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1232
1233
1234
1235
  	 * Note that we don't bother to use iomap_apply here: DAX requires
  	 * the file system block size to be equal to the page size, which means
  	 * that we never have to deal with more than a single extent here.
  	 */
c039b9979   Goldwyn Rodrigues   iomap: use a srcm...
1236
  	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap);
c0b246259   Jan Kara   dax: pass detaile...
1237
1238
  	if (iomap_errp)
  		*iomap_errp = error;
a9c42b33e   Ross Zwisler   dax: add tracepoi...
1239
  	if (error) {
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1240
  		ret = dax_fault_return(error);
13e451fdc   Jan Kara   dax: fix data cor...
1241
  		goto unlock_entry;
a9c42b33e   Ross Zwisler   dax: add tracepoi...
1242
  	}
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1243
  	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
13e451fdc   Jan Kara   dax: fix data cor...
1244
1245
  		error = -EIO;	/* fs corruption? */
  		goto error_finish_iomap;
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1246
  	}
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1247
  	if (vmf->cow_page) {
31a6f1a6e   Jan Kara   dax: Simplify arg...
1248
  		sector_t sector = dax_iomap_sector(&iomap, pos);
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1249
1250
1251
1252
1253
1254
  		switch (iomap.type) {
  		case IOMAP_HOLE:
  		case IOMAP_UNWRITTEN:
  			clear_user_highpage(vmf->cow_page, vaddr);
  			break;
  		case IOMAP_MAPPED:
c7fe193f1   Ira Weiny   fs/dax: Remove un...
1255
1256
  			error = copy_cow_page_dax(iomap.bdev, iomap.dax_dev,
  						  sector, vmf->cow_page, vaddr);
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1257
1258
1259
1260
1261
1262
1263
1264
  			break;
  		default:
  			WARN_ON_ONCE(1);
  			error = -EIO;
  			break;
  		}
  
  		if (error)
13e451fdc   Jan Kara   dax: fix data cor...
1265
  			goto error_finish_iomap;
b1aa812b2   Jan Kara   mm: move handling...
1266
1267
  
  		__SetPageUptodate(vmf->cow_page);
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1268
1269
1270
  		ret = finish_fault(vmf);
  		if (!ret)
  			ret = VM_FAULT_DONE_COW;
13e451fdc   Jan Kara   dax: fix data cor...
1271
  		goto finish_iomap;
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1272
  	}
aaa422c4c   Dan Williams   fs, dax: unify IO...
1273
  	sync = dax_fault_is_synchronous(flags, vma, &iomap);
caa51d26f   Jan Kara   dax, iomap: Add s...
1274

a7d73fe6c   Christoph Hellwig   dax: provide an i...
1275
1276
1277
1278
  	switch (iomap.type) {
  	case IOMAP_MAPPED:
  		if (iomap.flags & IOMAP_F_NEW) {
  			count_vm_event(PGMAJFAULT);
a0987ad5c   Jan Kara   dax: Create local...
1279
  			count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1280
1281
  			major = VM_FAULT_MAJOR;
  		}
1b5a1cb21   Jan Kara   dax: Inline dax_i...
1282
1283
1284
  		error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
  		if (error < 0)
  			goto error_finish_iomap;
b15cd8006   Matthew Wilcox   dax: Convert page...
1285
  		entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
caa51d26f   Jan Kara   dax, iomap: Add s...
1286
  						 0, write && !sync);
1b5a1cb21   Jan Kara   dax: Inline dax_i...
1287

caa51d26f   Jan Kara   dax, iomap: Add s...
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
  		/*
  		 * If we are doing synchronous page fault and inode needs fsync,
  		 * we can insert PTE into page tables only after that happens.
  		 * Skip insertion for now and return the pfn so that caller can
  		 * insert it after fsync is done.
  		 */
  		if (sync) {
  			if (WARN_ON_ONCE(!pfnp)) {
  				error = -EIO;
  				goto error_finish_iomap;
  			}
  			*pfnp = pfn;
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1300
  			ret = VM_FAULT_NEEDDSYNC | major;
caa51d26f   Jan Kara   dax, iomap: Add s...
1301
1302
  			goto finish_iomap;
  		}
1b5a1cb21   Jan Kara   dax: Inline dax_i...
1303
1304
  		trace_dax_insert_mapping(inode, vmf, entry);
  		if (write)
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1305
  			ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn);
1b5a1cb21   Jan Kara   dax: Inline dax_i...
1306
  		else
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1307
  			ret = vmf_insert_mixed(vma, vaddr, pfn);
1b5a1cb21   Jan Kara   dax: Inline dax_i...
1308

ab77dab46   Souptick Joarder   fs/dax.c: use new...
1309
  		goto finish_iomap;
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1310
1311
  	case IOMAP_UNWRITTEN:
  	case IOMAP_HOLE:
d2c43ef13   Jan Kara   dax: Create local...
1312
  		if (!write) {
b15cd8006   Matthew Wilcox   dax: Convert page...
1313
  			ret = dax_load_hole(&xas, mapping, &entry, vmf);
13e451fdc   Jan Kara   dax: fix data cor...
1314
  			goto finish_iomap;
1550290b0   Ross Zwisler   dax: dax_iomap_fa...
1315
  		}
df561f668   Gustavo A. R. Silva   treewide: Use fal...
1316
  		fallthrough;
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1317
1318
1319
1320
1321
  	default:
  		WARN_ON_ONCE(1);
  		error = -EIO;
  		break;
  	}
13e451fdc   Jan Kara   dax: fix data cor...
1322
   error_finish_iomap:
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1323
  	ret = dax_fault_return(error);
9f141d6ef   Jan Kara   dax: Call ->iomap...
1324
1325
1326
   finish_iomap:
  	if (ops->iomap_end) {
  		int copied = PAGE_SIZE;
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1327
  		if (ret & VM_FAULT_ERROR)
9f141d6ef   Jan Kara   dax: Call ->iomap...
1328
1329
1330
1331
1332
1333
1334
1335
  			copied = 0;
  		/*
  		 * The fault is done by now and there's no way back (another
  		 * thread may already be happily using the PTE we have installed).
  		 * Just ignore error from ->iomap_end since we cannot do much
  		 * with it.
  		 */
  		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
1550290b0   Ross Zwisler   dax: dax_iomap_fa...
1336
  	}
13e451fdc   Jan Kara   dax: fix data cor...
1337
   unlock_entry:
b15cd8006   Matthew Wilcox   dax: Convert page...
1338
  	dax_unlock_entry(&xas, entry);
13e451fdc   Jan Kara   dax: fix data cor...
1339
   out:
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1340
1341
  	trace_dax_pte_fault_done(inode, vmf, ret);
  	return ret | major;
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1342
  }
642261ac9   Ross Zwisler   dax: add struct i...
1343
1344
  
  #ifdef CONFIG_FS_DAX_PMD
b15cd8006   Matthew Wilcox   dax: Convert page...
1345
1346
  static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
  		struct iomap *iomap, void **entry)
642261ac9   Ross Zwisler   dax: add struct i...
1347
  {
f42003917   Dave Jiang   mm, dax: change p...
1348
1349
  	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
  	unsigned long pmd_addr = vmf->address & PMD_MASK;
11cf9d863   Aneesh Kumar K.V   fs/dax: Deposit p...
1350
  	struct vm_area_struct *vma = vmf->vma;
653b2ea33   Ross Zwisler   dax: add tracepoi...
1351
  	struct inode *inode = mapping->host;
11cf9d863   Aneesh Kumar K.V   fs/dax: Deposit p...
1352
  	pgtable_t pgtable = NULL;
642261ac9   Ross Zwisler   dax: add struct i...
1353
1354
1355
  	struct page *zero_page;
  	spinlock_t *ptl;
  	pmd_t pmd_entry;
3fe0791c2   Dan Williams   dax: store pfns i...
1356
  	pfn_t pfn;
642261ac9   Ross Zwisler   dax: add struct i...
1357

f42003917   Dave Jiang   mm, dax: change p...
1358
  	zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
642261ac9   Ross Zwisler   dax: add struct i...
1359
1360
  
  	if (unlikely(!zero_page))
653b2ea33   Ross Zwisler   dax: add tracepoi...
1361
  		goto fallback;
642261ac9   Ross Zwisler   dax: add struct i...
1362

3fe0791c2   Dan Williams   dax: store pfns i...
1363
  	pfn = page_to_pfn_t(zero_page);
b15cd8006   Matthew Wilcox   dax: Convert page...
1364
  	*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
3159f943a   Matthew Wilcox   xarray: Replace e...
1365
  			DAX_PMD | DAX_ZERO_PAGE, false);
642261ac9   Ross Zwisler   dax: add struct i...
1366

11cf9d863   Aneesh Kumar K.V   fs/dax: Deposit p...
1367
1368
1369
1370
1371
  	if (arch_needs_pgtable_deposit()) {
  		pgtable = pte_alloc_one(vma->vm_mm);
  		if (!pgtable)
  			return VM_FAULT_OOM;
  	}
f42003917   Dave Jiang   mm, dax: change p...
1372
1373
  	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
  	if (!pmd_none(*(vmf->pmd))) {
642261ac9   Ross Zwisler   dax: add struct i...
1374
  		spin_unlock(ptl);
653b2ea33   Ross Zwisler   dax: add tracepoi...
1375
  		goto fallback;
642261ac9   Ross Zwisler   dax: add struct i...
1376
  	}
11cf9d863   Aneesh Kumar K.V   fs/dax: Deposit p...
1377
1378
1379
1380
  	if (pgtable) {
  		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
  		mm_inc_nr_ptes(vma->vm_mm);
  	}
f42003917   Dave Jiang   mm, dax: change p...
1381
  	pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
642261ac9   Ross Zwisler   dax: add struct i...
1382
  	pmd_entry = pmd_mkhuge(pmd_entry);
f42003917   Dave Jiang   mm, dax: change p...
1383
  	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
642261ac9   Ross Zwisler   dax: add struct i...
1384
  	spin_unlock(ptl);
b15cd8006   Matthew Wilcox   dax: Convert page...
1385
  	trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
642261ac9   Ross Zwisler   dax: add struct i...
1386
  	return VM_FAULT_NOPAGE;
653b2ea33   Ross Zwisler   dax: add tracepoi...
1387
1388
  
  fallback:
11cf9d863   Aneesh Kumar K.V   fs/dax: Deposit p...
1389
1390
  	if (pgtable)
  		pte_free(vma->vm_mm, pgtable);
b15cd8006   Matthew Wilcox   dax: Convert page...
1391
  	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
653b2ea33   Ross Zwisler   dax: add tracepoi...
1392
  	return VM_FAULT_FALLBACK;
642261ac9   Ross Zwisler   dax: add struct i...
1393
  }
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1394
  static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
a2d581675   Dave Jiang   mm,fs,dax: change...
1395
  			       const struct iomap_ops *ops)
642261ac9   Ross Zwisler   dax: add struct i...
1396
  {
f42003917   Dave Jiang   mm, dax: change p...
1397
  	struct vm_area_struct *vma = vmf->vma;
642261ac9   Ross Zwisler   dax: add struct i...
1398
  	struct address_space *mapping = vma->vm_file->f_mapping;
b15cd8006   Matthew Wilcox   dax: Convert page...
1399
  	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
d8a849e1b   Dave Jiang   mm, dax: make pmd...
1400
1401
  	unsigned long pmd_addr = vmf->address & PMD_MASK;
  	bool write = vmf->flags & FAULT_FLAG_WRITE;
caa51d26f   Jan Kara   dax, iomap: Add s...
1402
  	bool sync;
9484ab1bf   Jan Kara   dax: Introduce IO...
1403
  	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
642261ac9   Ross Zwisler   dax: add struct i...
1404
  	struct inode *inode = mapping->host;
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1405
  	vm_fault_t result = VM_FAULT_FALLBACK;
c039b9979   Goldwyn Rodrigues   iomap: use a srcm...
1406
1407
  	struct iomap iomap = { .type = IOMAP_HOLE };
  	struct iomap srcmap = { .type = IOMAP_HOLE };
b15cd8006   Matthew Wilcox   dax: Convert page...
1408
  	pgoff_t max_pgoff;
642261ac9   Ross Zwisler   dax: add struct i...
1409
1410
1411
  	void *entry;
  	loff_t pos;
  	int error;
302a5e312   Jan Kara   dax: Inline dax_p...
1412
  	pfn_t pfn;
642261ac9   Ross Zwisler   dax: add struct i...
1413

282a8e039   Ross Zwisler   dax: add tracepoi...
1414
1415
1416
1417
1418
  	/*
  	 * Check whether offset isn't beyond end of file now. Caller is
  	 * supposed to hold locks serializing us with truncate / punch hole so
  	 * this is a reliable test.
  	 */
957ac8c42   Jeff Moyer   dax: fix PMD faul...
1419
  	max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
282a8e039   Ross Zwisler   dax: add tracepoi...
1420

f42003917   Dave Jiang   mm, dax: change p...
1421
  	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
282a8e039   Ross Zwisler   dax: add tracepoi...
1422

fffa281b4   Ross Zwisler   dax: fix deadlock...
1423
1424
1425
1426
  	/*
  	 * Make sure that the faulting address's PMD offset (color) matches
  	 * the PMD offset from the start of the file.  This is necessary so
  	 * that a PMD range in the page table overlaps exactly with a PMD
a77d19f46   Matthew Wilcox   dax: Rename some ...
1427
  	 * range in the page cache.
fffa281b4   Ross Zwisler   dax: fix deadlock...
1428
1429
1430
1431
  	 */
  	if ((vmf->pgoff & PG_PMD_COLOUR) !=
  	    ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
  		goto fallback;
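  	/*
  	 * Illustrative check (hypothetical numbers, added for clarity): with
  	 * 4k pages and 2M PMDs, PG_PMD_COLOUR is 511.  A fault at
  	 * vmf->address 0x200000 has colour (0x200000 >> PAGE_SHIFT) & 511
  	 * == 0; if that address maps file offset pgoff 3 (colour 3), no
  	 * single PMD can cover both the virtual and file ranges, so the
  	 * check above falls back to PTEs.
  	 */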
642261ac9   Ross Zwisler   dax: add struct i...
1432
1433
1434
1435
1436
1437
1438
1439
1440
  	/* Fall back to PTEs if we're going to COW */
  	if (write && !(vma->vm_flags & VM_SHARED))
  		goto fallback;
  
  	/* If the PMD would extend outside the VMA */
  	if (pmd_addr < vma->vm_start)
  		goto fallback;
  	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
  		goto fallback;
b15cd8006   Matthew Wilcox   dax: Convert page...
1441
  	if (xas.xa_index >= max_pgoff) {
282a8e039   Ross Zwisler   dax: add tracepoi...
1442
1443
1444
  		result = VM_FAULT_SIGBUS;
  		goto out;
  	}
642261ac9   Ross Zwisler   dax: add struct i...
1445
1446
  
  	/* If the PMD would extend beyond the file size */
b15cd8006   Matthew Wilcox   dax: Convert page...
1447
  	if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff)
642261ac9   Ross Zwisler   dax: add struct i...
1448
1449
1450
  		goto fallback;
  
  	/*
b15cd8006   Matthew Wilcox   dax: Convert page...
1451
1452
1453
1454
  	 * grab_mapping_entry() will make sure we get an empty PMD entry,
  	 * a zero PMD entry or a DAX PMD.  If it can't (because a PTE
  	 * entry is already in the array, for instance), it will return
  	 * VM_FAULT_FALLBACK.
876f29460   Ross Zwisler   dax: fix PMD data...
1455
  	 */
23c84eb78   Matthew Wilcox (Oracle)   dax: Fix missed w...
1456
  	entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
b15cd8006   Matthew Wilcox   dax: Convert page...
1457
1458
  	if (xa_is_internal(entry)) {
  		result = xa_to_internal(entry);
876f29460   Ross Zwisler   dax: fix PMD data...
1459
  		goto fallback;
b15cd8006   Matthew Wilcox   dax: Convert page...
1460
  	}
876f29460   Ross Zwisler   dax: fix PMD data...
1461
1462
  
  	/*
e2093926a   Ross Zwisler   dax: fix race bet...
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
  	 * It is possible, particularly with mixed reads & writes to private
  	 * mappings, that we have raced with a PTE fault that overlaps with
  	 * the PMD we need to set up.  If so just return and the fault will be
  	 * retried.
  	 */
  	if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
  			!pmd_devmap(*vmf->pmd)) {
  		result = 0;
  		goto unlock_entry;
  	}
  
  	/*
642261ac9   Ross Zwisler   dax: add struct i...
1475
1476
1477
1478
  	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
  	 * setting up a mapping, so really we're using iomap_begin() as a way
  	 * to look up our filesystem block.
  	 */
b15cd8006   Matthew Wilcox   dax: Convert page...
1479
  	pos = (loff_t)xas.xa_index << PAGE_SHIFT;
c039b9979   Goldwyn Rodrigues   iomap: use a srcm...
1480
1481
  	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap,
  			&srcmap);
642261ac9   Ross Zwisler   dax: add struct i...
1482
  	if (error)
876f29460   Ross Zwisler   dax: fix PMD data...
1483
  		goto unlock_entry;
9f141d6ef   Jan Kara   dax: Call ->iomap...
1484

642261ac9   Ross Zwisler   dax: add struct i...
1485
1486
  	if (iomap.offset + iomap.length < pos + PMD_SIZE)
  		goto finish_iomap;
aaa422c4c   Dan Williams   fs, dax: unify IO...
1487
  	sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
caa51d26f   Jan Kara   dax, iomap: Add s...
1488

642261ac9   Ross Zwisler   dax: add struct i...
1489
1490
  	switch (iomap.type) {
  	case IOMAP_MAPPED:
302a5e312   Jan Kara   dax: Inline dax_p...
1491
1492
1493
  		error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
  		if (error < 0)
  			goto finish_iomap;
b15cd8006   Matthew Wilcox   dax: Convert page...
1494
  		entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
3159f943a   Matthew Wilcox   xarray: Replace e...
1495
  						DAX_PMD, write && !sync);
302a5e312   Jan Kara   dax: Inline dax_p...
1496

caa51d26f   Jan Kara   dax, iomap: Add s...
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
  		/*
  		 * If we are doing synchronous page fault and inode needs fsync,
  		 * we can insert PMD into page tables only after that happens.
  		 * Skip insertion for now and return the pfn so that caller can
  		 * insert it after fsync is done.
  		 */
  		if (sync) {
  			if (WARN_ON_ONCE(!pfnp))
  				goto finish_iomap;
  			*pfnp = pfn;
  			result = VM_FAULT_NEEDDSYNC;
  			goto finish_iomap;
  		}
302a5e312   Jan Kara   dax: Inline dax_p...
1510
  		trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
fce86ff58   Dan Williams   mm/huge_memory: f...
1511
  		result = vmf_insert_pfn_pmd(vmf, pfn, write);
642261ac9   Ross Zwisler   dax: add struct i...
1512
1513
1514
1515
  		break;
  	case IOMAP_UNWRITTEN:
  	case IOMAP_HOLE:
  		if (WARN_ON_ONCE(write))
876f29460   Ross Zwisler   dax: fix PMD data...
1516
  			break;
b15cd8006   Matthew Wilcox   dax: Convert page...
1517
  		result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry);
642261ac9   Ross Zwisler   dax: add struct i...
1518
1519
1520
1521
1522
1523
1524
1525
  		break;
  	default:
  		WARN_ON_ONCE(1);
  		break;
  	}
  
   finish_iomap:
  	if (ops->iomap_end) {
9f141d6ef   Jan Kara   dax: Call ->iomap...
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
  		int copied = PMD_SIZE;
  
  		if (result == VM_FAULT_FALLBACK)
  			copied = 0;
  		/*
  		 * The fault is done by now and there's no way back (another
  		 * thread may already be happily using the PMD we have installed).
  		 * Just ignore error from ->iomap_end since we cannot do much
  		 * with it.
  		 */
  		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
  				&iomap);
642261ac9   Ross Zwisler   dax: add struct i...
1538
  	}
876f29460   Ross Zwisler   dax: fix PMD data...
1539
   unlock_entry:
b15cd8006   Matthew Wilcox   dax: Convert page...
1540
  	dax_unlock_entry(&xas, entry);
642261ac9   Ross Zwisler   dax: add struct i...
1541
1542
   fallback:
  	if (result == VM_FAULT_FALLBACK) {
d8a849e1b   Dave Jiang   mm, dax: make pmd...
1543
  		split_huge_pmd(vma, vmf->pmd, vmf->address);
642261ac9   Ross Zwisler   dax: add struct i...
1544
1545
  		count_vm_event(THP_FAULT_FALLBACK);
  	}
282a8e039   Ross Zwisler   dax: add tracepoi...
1546
  out:
f42003917   Dave Jiang   mm, dax: change p...
1547
  	trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
642261ac9   Ross Zwisler   dax: add struct i...
1548
1549
  	return result;
  }
a2d581675   Dave Jiang   mm,fs,dax: change...
1550
  #else
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1551
  static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
01cddfe99   Arnd Bergmann   mm,fs,dax: mark d...
1552
  			       const struct iomap_ops *ops)
a2d581675   Dave Jiang   mm,fs,dax: change...
1553
1554
1555
  {
  	return VM_FAULT_FALLBACK;
  }
642261ac9   Ross Zwisler   dax: add struct i...
1556
  #endif /* CONFIG_FS_DAX_PMD */
a2d581675   Dave Jiang   mm,fs,dax: change...
1557
1558
1559
1560
  
  /**
   * dax_iomap_fault - handle a page fault on a DAX file
   * @vmf: The description of the fault
cec04e8c8   Jan Kara   dax: Fix comment ...
1561
   * @pe_size: Size of the page to fault in
9a0dd4225   Jan Kara   dax: Allow dax_io...
1562
   * @pfnp: PFN to insert for synchronous faults if fsync is required
c0b246259   Jan Kara   dax: pass detaile...
1563
   * @iomap_errp: Storage for detailed error code in case of error
cec04e8c8   Jan Kara   dax: Fix comment ...
1564
   * @ops: Iomap ops passed from the file system
a2d581675   Dave Jiang   mm,fs,dax: change...
1565
1566
1567
1568
1569
1570
   *
   * When a page fault occurs, filesystems may call this helper in
   * their fault handler for DAX files. dax_iomap_fault() assumes the caller
   * has done all the necessary locking for the page fault to proceed
   * successfully.
   */
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1571
  vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
c0b246259   Jan Kara   dax: pass detaile...
1572
  		    pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
a2d581675   Dave Jiang   mm,fs,dax: change...
1573
  {
c791ace1e   Dave Jiang   mm: replace FAULT...
1574
1575
  	switch (pe_size) {
  	case PE_SIZE_PTE:
c0b246259   Jan Kara   dax: pass detaile...
1576
  		return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
c791ace1e   Dave Jiang   mm: replace FAULT...
1577
  	case PE_SIZE_PMD:
9a0dd4225   Jan Kara   dax: Allow dax_io...
1578
  		return dax_iomap_pmd_fault(vmf, pfnp, ops);
a2d581675   Dave Jiang   mm,fs,dax: change...
1579
1580
1581
1582
1583
  	default:
  		return VM_FAULT_FALLBACK;
  	}
  }
  EXPORT_SYMBOL_GPL(dax_iomap_fault);
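  /*
   * A caller sketch (hypothetical; example_dax_huge_fault and
   * example_iomap_ops are placeholders, in the style of ext4/xfs): the
   * filesystem serializes the fault against truncate with its own lock
   * (i_rwsem stands in here) before calling dax_iomap_fault().  Handling
   * of VM_FAULT_NEEDDSYNC is sketched after dax_finish_sync_fault() below.
   */
  extern const struct iomap_ops example_iomap_ops;
  
  static vm_fault_t example_dax_huge_fault(struct vm_fault *vmf,
  		enum page_entry_size pe_size)
  {
  	struct inode *inode = file_inode(vmf->vma->vm_file);
  	bool write = vmf->flags & FAULT_FLAG_WRITE;
  	vm_fault_t ret;
  	pfn_t pfn;
  
  	if (write) {
  		sb_start_pagefault(inode->i_sb);
  		file_update_time(vmf->vma->vm_file);
  	}
  	down_read(&inode->i_rwsem);
  	ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &example_iomap_ops);
  	up_read(&inode->i_rwsem);
  	if (write)
  		sb_end_pagefault(inode->i_sb);
  	return ret;
  }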
71eab6dfd   Jan Kara   dax: Implement da...
1584

a77d19f46   Matthew Wilcox   dax: Rename some ...
1585
  /*
71eab6dfd   Jan Kara   dax: Implement da...
1586
1587
   * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
   * @vmf: The description of the fault
71eab6dfd   Jan Kara   dax: Implement da...
1588
   * @pfn: PFN to insert
cfc93c6c6   Matthew Wilcox   dax: Convert dax_...
1589
   * @order: Order of entry to insert.
71eab6dfd   Jan Kara   dax: Implement da...
1590
   *
a77d19f46   Matthew Wilcox   dax: Rename some ...
1591
1592
   * This function inserts a writeable PTE or PMD entry into the page tables
   * for an mmaped DAX file.  It also marks the page cache entry as dirty.
71eab6dfd   Jan Kara   dax: Implement da...
1593
   */
cfc93c6c6   Matthew Wilcox   dax: Convert dax_...
1594
1595
  static vm_fault_t
  dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
71eab6dfd   Jan Kara   dax: Implement da...
1596
1597
  {
  	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
cfc93c6c6   Matthew Wilcox   dax: Convert dax_...
1598
1599
  	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
  	void *entry;
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1600
  	vm_fault_t ret;
71eab6dfd   Jan Kara   dax: Implement da...
1601

cfc93c6c6   Matthew Wilcox   dax: Convert dax_...
1602
  	xas_lock_irq(&xas);
23c84eb78   Matthew Wilcox (Oracle)   dax: Fix missed w...
1603
  	entry = get_unlocked_entry(&xas, order);
71eab6dfd   Jan Kara   dax: Implement da...
1604
  	/* Did we race with someone splitting entry or so? */
23c84eb78   Matthew Wilcox (Oracle)   dax: Fix missed w...
1605
1606
  	if (!entry || dax_is_conflict(entry) ||
  	    (order == 0 && !dax_is_pte_entry(entry))) {
cfc93c6c6   Matthew Wilcox   dax: Convert dax_...
1607
1608
  		put_unlocked_entry(&xas, entry);
  		xas_unlock_irq(&xas);
71eab6dfd   Jan Kara   dax: Implement da...
1609
1610
1611
1612
  		trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
  						      VM_FAULT_NOPAGE);
  		return VM_FAULT_NOPAGE;
  	}
cfc93c6c6   Matthew Wilcox   dax: Convert dax_...
1613
1614
1615
1616
  	xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
  	dax_lock_entry(&xas, entry);
  	xas_unlock_irq(&xas);
  	if (order == 0)
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1617
  		ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
71eab6dfd   Jan Kara   dax: Implement da...
1618
  #ifdef CONFIG_FS_DAX_PMD
cfc93c6c6   Matthew Wilcox   dax: Convert dax_...
1619
  	else if (order == PMD_ORDER)
fce86ff58   Dan Williams   mm/huge_memory: f...
1620
  		ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
71eab6dfd   Jan Kara   dax: Implement da...
1621
  #endif
cfc93c6c6   Matthew Wilcox   dax: Convert dax_...
1622
  	else
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1623
  		ret = VM_FAULT_FALLBACK;
cfc93c6c6   Matthew Wilcox   dax: Convert dax_...
1624
  	dax_unlock_entry(&xas, entry);
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1625
1626
  	trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
  	return ret;
71eab6dfd   Jan Kara   dax: Implement da...
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
  }
  
  /**
   * dax_finish_sync_fault - finish synchronous page fault
   * @vmf: The description of the fault
   * @pe_size: Size of entry to be inserted
   * @pfn: PFN to insert
   *
   * This function ensures that the file range touched by the page fault is
   * stored persistently on the media and handles inserting of appropriate page
   * table entry.
   */
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1639
1640
  vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
  		enum page_entry_size pe_size, pfn_t pfn)
71eab6dfd   Jan Kara   dax: Implement da...
1641
1642
1643
  {
  	int err;
  	loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
cfc93c6c6   Matthew Wilcox   dax: Convert dax_...
1644
1645
  	unsigned int order = pe_order(pe_size);
  	size_t len = PAGE_SIZE << order;
71eab6dfd   Jan Kara   dax: Implement da...
1646

71eab6dfd   Jan Kara   dax: Implement da...
1647
1648
1649
  	err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
  	if (err)
  		return VM_FAULT_SIGBUS;
cfc93c6c6   Matthew Wilcox   dax: Convert dax_...
1650
  	return dax_insert_pfn_mkwrite(vmf, pfn, order);
71eab6dfd   Jan Kara   dax: Implement da...
1651
1652
  }
  EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
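  /*
   * A caller sketch (hypothetical; example_dax_sync_fault is a placeholder):
   * when dax_iomap_fault() returns VM_FAULT_NEEDDSYNC for a MAP_SYNC write
   * fault, the filesystem persists the metadata and installs the PTE/PMD by
   * calling dax_finish_sync_fault() with the pfn prepared earlier.
   */
  static vm_fault_t example_dax_sync_fault(struct vm_fault *vmf,
  		enum page_entry_size pe_size, const struct iomap_ops *ops)
  {
  	pfn_t pfn;
  	vm_fault_t ret;
  
  	ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, ops);
  	if (ret & VM_FAULT_NEEDDSYNC)
  		ret = dax_finish_sync_fault(vmf, pe_size, pfn);
  	return ret;
  }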