  /*
   * fs/dax.c - Direct Access filesystem code
   * Copyright (c) 2013-2014 Intel Corporation
   * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
   * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
   *
   * This program is free software; you can redistribute it and/or modify it
   * under the terms and conditions of the GNU General Public License,
   * version 2, as published by the Free Software Foundation.
   *
   * This program is distributed in the hope it will be useful, but WITHOUT
   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   * more details.
   */
  
  #include <linux/atomic.h>
  #include <linux/blkdev.h>
  #include <linux/buffer_head.h>
  #include <linux/dax.h>
  #include <linux/fs.h>
  #include <linux/genhd.h>
  #include <linux/highmem.h>
  #include <linux/memcontrol.h>
  #include <linux/mm.h>
  #include <linux/mutex.h>
  #include <linux/pagevec.h>
  #include <linux/sched.h>
  #include <linux/sched/signal.h>
  #include <linux/uio.h>
  #include <linux/vmstat.h>
  #include <linux/pfn_t.h>
  #include <linux/sizes.h>
  #include <linux/mmu_notifier.h>
  #include <linux/iomap.h>
  #include "internal.h"

  #define CREATE_TRACE_POINTS
  #include <trace/events/fs_dax.h>

  /* We choose 4096 entries - same as per-zone page wait tables */
  #define DAX_WAIT_TABLE_BITS 12
  #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

  /* The 'colour' (ie low bits) within a PMD of a page offset.  */
  #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
  #define PG_PMD_NR	(PMD_SIZE >> PAGE_SHIFT)

  static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
  
  static int __init init_dax_wait_table(void)
  {
  	int i;
  
  	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
  		init_waitqueue_head(wait_table + i);
  	return 0;
  }
  fs_initcall(init_dax_wait_table);

  /*
   * We use lowest available bit in exceptional entry for locking, one bit for
   * the entry size (PMD) and two more to tell us if the entry is a zero page or
   * an empty entry that is just used for locking.  In total four special bits.
   *
   * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
   * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
   * block allocation.
   */
  #define RADIX_DAX_SHIFT		(RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
  #define RADIX_DAX_ENTRY_LOCK	(1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
  #define RADIX_DAX_PMD		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
  #define RADIX_DAX_ZERO_PAGE	(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
  #define RADIX_DAX_EMPTY		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
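
  /*
   * A DAX radix tree entry stores the pfn in the bits above RADIX_DAX_SHIFT,
   * with the flag and lock bits defined above packed into the low bits.  The
   * helpers below encode and decode that layout.
   */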
  static unsigned long dax_radix_pfn(void *entry)
  {
  	return (unsigned long)entry >> RADIX_DAX_SHIFT;
  }

  static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags)
  {
  	return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
  			(pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK);
  }
  
  static unsigned int dax_radix_order(void *entry)
  {
  	if ((unsigned long)entry & RADIX_DAX_PMD)
  		return PMD_SHIFT - PAGE_SHIFT;
  	return 0;
  }

  static int dax_is_pmd_entry(void *entry)
  {
  	return (unsigned long)entry & RADIX_DAX_PMD;
  }

  static int dax_is_pte_entry(void *entry)
  {
  	return !((unsigned long)entry & RADIX_DAX_PMD);
  }

  static int dax_is_zero_entry(void *entry)
  {
  	return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
  }

  static int dax_is_empty_entry(void *entry)
  {
  	return (unsigned long)entry & RADIX_DAX_EMPTY;
  }

  /*
   * DAX radix tree locking
   */
  struct exceptional_entry_key {
  	struct address_space *mapping;
  	pgoff_t entry_start;
  };
  
  struct wait_exceptional_entry_queue {
  	wait_queue_entry_t wait;
  	struct exceptional_entry_key key;
  };

  static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
  		pgoff_t index, void *entry, struct exceptional_entry_key *key)
  {
  	unsigned long hash;
  
  	/*
  	 * If 'entry' is a PMD, align the 'index' that we use for the wait
  	 * queue to the start of that PMD.  This ensures that all offsets in
  	 * the range covered by the PMD map to the same bit lock.
  	 */
  	if (dax_is_pmd_entry(entry))
  		index &= ~PG_PMD_COLOUR;
  
  	key->mapping = mapping;
  	key->entry_start = index;
  
  	hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
  	return wait_table + hash;
  }
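
  /*
   * Wait queue callback: wake a waiter only if its key matches this mapping
   * and entry_start, since unrelated entries can hash to the same waitqueue.
   */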
  static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode,
  				       int sync, void *keyp)
  {
  	struct exceptional_entry_key *key = keyp;
  	struct wait_exceptional_entry_queue *ewait =
  		container_of(wait, struct wait_exceptional_entry_queue, wait);
  
  	if (key->mapping != ewait->key.mapping ||
  	    key->entry_start != ewait->key.entry_start)
  		return 0;
  	return autoremove_wake_function(wait, mode, sync, NULL);
  }
  
  /*
   * @entry may no longer be the entry at the index in the mapping.
   * The important information it's conveying is whether the entry at
   * this index used to be a PMD entry.
   */
  static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
  		pgoff_t index, void *entry, bool wake_all)
  {
  	struct exceptional_entry_key key;
  	wait_queue_head_t *wq;
  
  	wq = dax_entry_waitqueue(mapping, index, entry, &key);
  
  	/*
  	 * Checking for locked entry and prepare_to_wait_exclusive() happens
  	 * under the i_pages lock, ditto for entry handling in our callers.
  	 * So at this point all tasks that could have seen our entry locked
  	 * must be in the waitqueue and the following check will see them.
  	 */
  	if (waitqueue_active(wq))
  		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
  }
  
  /*
   * Check whether the given slot is locked.  Must be called with the i_pages
   * lock held.
   */
  static inline int slot_locked(struct address_space *mapping, void **slot)
  {
  	unsigned long entry = (unsigned long)
  		radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
  	return entry & RADIX_DAX_ENTRY_LOCK;
  }

  /*
   * Mark the given slot as locked.  Must be called with the i_pages lock held.
   */
  static inline void *lock_slot(struct address_space *mapping, void **slot)
  {
  	unsigned long entry = (unsigned long)
  		radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);

  	entry |= RADIX_DAX_ENTRY_LOCK;
  	radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
  	return (void *)entry;
  }

  /*
   * Mark the given slot as unlocked.  Must be called with the i_pages lock held.
   */
  static inline void *unlock_slot(struct address_space *mapping, void **slot)
  {
  	unsigned long entry = (unsigned long)
  		radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);

  	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
  	radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
  	return (void *)entry;
  }

  static void put_unlocked_mapping_entry(struct address_space *mapping,
  				       pgoff_t index, void *entry);

  /*
   * Lookup entry in radix tree, wait for it to become unlocked if it is
   * exceptional entry and return it. The caller must call
   * put_unlocked_mapping_entry() when it decides not to lock the entry or
   * put_locked_mapping_entry() when it has locked the entry and now wants to
   * unlock it.
   *
   * Must be called with the i_pages lock held.
   */
  static void *get_unlocked_mapping_entry(struct address_space *mapping,
  		pgoff_t index, void ***slotp)
  {
  	void *entry, **slot;
  	struct wait_exceptional_entry_queue ewait;
  	wait_queue_head_t *wq;

  	init_wait(&ewait.wait);
  	ewait.wait.func = wake_exceptional_entry_func;

  	for (;;) {
  		entry = __radix_tree_lookup(&mapping->i_pages, index, NULL,
  					  &slot);
  		if (!entry ||
  		    WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
  		    !slot_locked(mapping, slot)) {
  			if (slotp)
  				*slotp = slot;
  			return entry;
  		}

  		wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
  		prepare_to_wait_exclusive(wq, &ewait.wait,
  					  TASK_UNINTERRUPTIBLE);
  		xa_unlock_irq(&mapping->i_pages);
  		schedule();
  		finish_wait(wq, &ewait.wait);
  		xa_lock_irq(&mapping->i_pages);
  	}
  }

  /*
   * The only thing keeping the address space around is the i_pages lock
   * (it's cycled in clear_inode() after removing the entries from i_pages)
   * After we drop the i_pages lock, we cannot touch mapping->i_pages.
   */
  static void wait_entry_unlocked(struct address_space *mapping, pgoff_t index,
  		void ***slotp, void *entry)
  {
  	struct wait_exceptional_entry_queue ewait;
  	wait_queue_head_t *wq;

  	init_wait(&ewait.wait);
  	ewait.wait.func = wake_exceptional_entry_func;

  	wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
  	/*
  	 * Unlike get_unlocked_entry() there is no guarantee that this
  	 * path ever successfully retrieves an unlocked entry before an
  	 * inode dies. Perform a non-exclusive wait in case this path
  	 * never successfully performs its own wake up.
  	 */
  	prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
  	xa_unlock_irq(&mapping->i_pages);
  	schedule();
  	finish_wait(wq, &ewait.wait);
  }
  
  static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
  {
  	void *entry, **slot;
  	xa_lock_irq(&mapping->i_pages);
  	entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot);
  	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
  			 !slot_locked(mapping, slot))) {
  		xa_unlock_irq(&mapping->i_pages);
  		return;
  	}
  	unlock_slot(mapping, slot);
  	xa_unlock_irq(&mapping->i_pages);
  	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
  }

  static void put_locked_mapping_entry(struct address_space *mapping,
  		pgoff_t index)
  {
  	unlock_mapping_entry(mapping, index);
  }

  /*
   * Called when we are done with radix tree entry we looked up via
   * get_unlocked_mapping_entry() and which we didn't lock in the end.
   */
  static void put_unlocked_mapping_entry(struct address_space *mapping,
  				       pgoff_t index, void *entry)
  {
  	if (!entry)
  		return;

  	/* We have to wake up next waiter for the radix tree entry lock */
  	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
  }

  static unsigned long dax_entry_size(void *entry)
  {
  	if (dax_is_zero_entry(entry))
  		return 0;
  	else if (dax_is_empty_entry(entry))
  		return 0;
  	else if (dax_is_pmd_entry(entry))
  		return PMD_SIZE;
  	else
  		return PAGE_SIZE;
  }
  
  static unsigned long dax_radix_end_pfn(void *entry)
  {
  	return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
  }
  
  /*
   * Iterate through all mapped pfns represented by an entry, i.e. skip
   * 'empty' and 'zero' entries.
   */
  #define for_each_mapped_pfn(entry, pfn) \
  	for (pfn = dax_radix_pfn(entry); \
  			pfn < dax_radix_end_pfn(entry); pfn++)

  /*
   * TODO: for reflink+dax we need a way to associate a single page with
   * multiple address_space instances at different linear_page_index()
   * offsets.
   */
  static void dax_associate_entry(void *entry, struct address_space *mapping,
  		struct vm_area_struct *vma, unsigned long address)
  {
  	unsigned long size = dax_entry_size(entry), pfn, index;
  	int i = 0;

  	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
  		return;
  	index = linear_page_index(vma, address & ~(size - 1));
  	for_each_mapped_pfn(entry, pfn) {
  		struct page *page = pfn_to_page(pfn);

  		WARN_ON_ONCE(page->mapping);
  		page->mapping = mapping;
  		page->index = index + i++;
  	}
  }
  
  static void dax_disassociate_entry(void *entry, struct address_space *mapping,
  		bool trunc)
  {
  	unsigned long pfn;
  
  	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
  		return;
  
  	for_each_mapped_pfn(entry, pfn) {
  		struct page *page = pfn_to_page(pfn);
  
  		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
  		WARN_ON_ONCE(page->mapping && page->mapping != mapping);
  		page->mapping = NULL;
  		page->index = 0;
  	}
  }
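
  /*
   * Return the first page backing this entry whose reference count is
   * elevated (i.e. the page has DMA or get_user_pages() users), or NULL.
   */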
  static struct page *dax_busy_page(void *entry)
  {
  	unsigned long pfn;
  
  	for_each_mapped_pfn(entry, pfn) {
  		struct page *page = pfn_to_page(pfn);
  
  		if (page_ref_count(page) > 1)
  			return page;
  	}
  	return NULL;
  }
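
  /*
   * Take the radix tree entry lock for the entry that maps @page so that
   * page->mapping and page->index stay stable; device-dax inodes need no
   * entry lock.  Returns false if the page is no longer part of a DAX
   * mapping.
   */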
  bool dax_lock_mapping_entry(struct page *page)
  {
  	pgoff_t index;
  	struct inode *inode;
  	bool did_lock = false;
  	void *entry = NULL, **slot;
  	struct address_space *mapping;
  
  	rcu_read_lock();
  	for (;;) {
  		mapping = READ_ONCE(page->mapping);
  		if (!mapping || !dax_mapping(mapping))
  			break;
  
  		/*
  		 * In the device-dax case there's no need to lock, a
  		 * struct dev_pagemap pin is sufficient to keep the
  		 * inode alive, and we assume we have dev_pagemap pin
  		 * otherwise we would not have a valid pfn_to_page()
  		 * translation.
  		 */
  		inode = mapping->host;
  		if (S_ISCHR(inode->i_mode)) {
  			did_lock = true;
  			break;
  		}
  
  		xa_lock_irq(&mapping->i_pages);
  		if (mapping != page->mapping) {
  			xa_unlock_irq(&mapping->i_pages);
  			continue;
  		}
  		index = page->index;
  		entry = __radix_tree_lookup(&mapping->i_pages, index,
  						NULL, &slot);
  		if (!entry) {
  			xa_unlock_irq(&mapping->i_pages);
  			break;
  		} else if (slot_locked(mapping, slot)) {
  			rcu_read_unlock();
  			wait_entry_unlocked(mapping, index, &slot, entry);
  			rcu_read_lock();
  			continue;
  		}
  		lock_slot(mapping, slot);
  		did_lock = true;
  		xa_unlock_irq(&mapping->i_pages);
  		break;
  	}
  	rcu_read_unlock();
  
  	return did_lock;
  }
  
  void dax_unlock_mapping_entry(struct page *page)
  {
  	struct address_space *mapping = page->mapping;
  	struct inode *inode = mapping->host;
  
  	if (S_ISCHR(inode->i_mode))
  		return;
  
  	unlock_mapping_entry(mapping, page->index);
  }

  /*
   * Find radix tree entry at given index. If it points to an exceptional entry,
   * return it with the radix tree entry locked. If the radix tree doesn't
   * contain given index, create an empty exceptional entry for the index and
   * return with it locked.
   *
   * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
   * either return that locked entry or will return an error.  This error will
   * happen if there are any 4k entries within the 2MiB range that we are
   * requesting.
   *
   * We always favor 4k entries over 2MiB entries. There isn't a flow where we
   * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB
   * insertion will fail if it finds any 4k entries already in the tree, and a
   * 4k insertion will cause an existing 2MiB entry to be unmapped and
   * downgraded to 4k entries.  This happens for both 2MiB huge zero pages as
   * well as 2MiB empty entries.
   *
   * The exception to this downgrade path is for 2MiB DAX PMD entries that have
   * real storage backing them.  We will leave these real 2MiB DAX entries in
   * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
   *
   * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
   * persistent memory the benefit is doubtful. We can add that later if we can
   * show it helps.
   */
  static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
  		unsigned long size_flag)
  {
  	bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
  	void *entry, **slot;

  restart:
  	xa_lock_irq(&mapping->i_pages);
  	entry = get_unlocked_mapping_entry(mapping, index, &slot);

  	if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
  		entry = ERR_PTR(-EIO);
  		goto out_unlock;
  	}
  	if (entry) {
  		if (size_flag & RADIX_DAX_PMD) {
  			if (dax_is_pte_entry(entry)) {
  				put_unlocked_mapping_entry(mapping, index,
  						entry);
  				entry = ERR_PTR(-EEXIST);
  				goto out_unlock;
  			}
  		} else { /* trying to grab a PTE entry */
  			if (dax_is_pmd_entry(entry) &&
  			    (dax_is_zero_entry(entry) ||
  			     dax_is_empty_entry(entry))) {
  				pmd_downgrade = true;
  			}
  		}
  	}
  	/* No entry for given index? Make sure radix tree is big enough. */
  	if (!entry || pmd_downgrade) {
  		int err;
  		if (pmd_downgrade) {
  			/*
  			 * Make sure 'entry' remains valid while we drop
  			 * the i_pages lock.
  			 */
  			entry = lock_slot(mapping, slot);
  		}
  		xa_unlock_irq(&mapping->i_pages);
  		/*
  		 * Besides huge zero pages the only other thing that gets
  		 * downgraded are empty entries which don't need to be
  		 * unmapped.
  		 */
  		if (pmd_downgrade && dax_is_zero_entry(entry))
  			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
  							PG_PMD_NR, false);

  		err = radix_tree_preload(
  				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
  		if (err) {
  			if (pmd_downgrade)
  				put_locked_mapping_entry(mapping, index);
  			return ERR_PTR(err);
  		}
  		xa_lock_irq(&mapping->i_pages);

  		if (!entry) {
  			/*
  			 * We needed to drop the i_pages lock while calling
  			 * radix_tree_preload() and we didn't have an entry to
  			 * lock.  See if another thread inserted an entry at
  			 * our index during this time.
  			 */
  			entry = __radix_tree_lookup(&mapping->i_pages, index,
  					NULL, &slot);
  			if (entry) {
  				radix_tree_preload_end();
  				xa_unlock_irq(&mapping->i_pages);
  				goto restart;
  			}
  		}
  		if (pmd_downgrade) {
  			dax_disassociate_entry(entry, mapping, false);
  			radix_tree_delete(&mapping->i_pages, index);
  			mapping->nrexceptional--;
  			dax_wake_mapping_entry_waiter(mapping, index, entry,
  					true);
  		}
  
  		entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);
  		err = __radix_tree_insert(&mapping->i_pages, index,
  				dax_radix_order(entry), entry);
  		radix_tree_preload_end();
  		if (err) {
  			xa_unlock_irq(&mapping->i_pages);
  			/*
  			 * Our insertion of a DAX entry failed, most likely
  			 * because we were inserting a PMD entry and it
  			 * collided with a PTE sized entry at a different
  			 * index in the PMD range.  We haven't inserted
  			 * anything into the radix tree and have no waiters to
  			 * wake.
  			 */
  			return ERR_PTR(err);
  		}
  		/* Good, we have inserted empty locked entry into the tree. */
  		mapping->nrexceptional++;
  		xa_unlock_irq(&mapping->i_pages);
  		return entry;
  	}
  	entry = lock_slot(mapping, slot);
   out_unlock:
  	xa_unlock_irq(&mapping->i_pages);
  	return entry;
  }

  /**
   * dax_layout_busy_page - find first pinned page in @mapping
   * @mapping: address space to scan for a page with ref count > 1
   *
   * DAX requires ZONE_DEVICE mapped pages. These pages are never
   * 'onlined' to the page allocator so they are considered idle when
   * page->count == 1. A filesystem uses this interface to determine if
   * any page in the mapping is busy, i.e. for DMA, or other
   * get_user_pages() usages.
   *
   * It is expected that the filesystem is holding locks to block the
   * establishment of new mappings in this address_space. I.e. it expects
   * to be able to run unmap_mapping_range() and subsequently not race
   * mapping_mapped() becoming true.
   */
  struct page *dax_layout_busy_page(struct address_space *mapping)
  {
  	pgoff_t	indices[PAGEVEC_SIZE];
  	struct page *page = NULL;
  	struct pagevec pvec;
  	pgoff_t	index, end;
  	unsigned i;
  
  	/*
  	 * In the 'limited' case get_user_pages() for dax is disabled.
  	 */
  	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
  		return NULL;
  
  	if (!dax_mapping(mapping) || !mapping_mapped(mapping))
  		return NULL;
  
  	pagevec_init(&pvec);
  	index = 0;
  	end = -1;
  
  	/*
  	 * If we race get_user_pages_fast() here either we'll see the
  	 * elevated page count in the pagevec_lookup and wait, or
  	 * get_user_pages_fast() will see that the page it took a reference
  	 * against is no longer mapped in the page tables and bail to the
  	 * get_user_pages() slow path.  The slow path is protected by
  	 * pte_lock() and pmd_lock(). New references are not taken without
  	 * holding those locks, and unmap_mapping_range() will not zero the
  	 * pte or pmd without holding the respective lock, so we are
  	 * guaranteed to either see new references or prevent new
  	 * references from being established.
  	 */
  	unmap_mapping_range(mapping, 0, 0, 1);
  
  	while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
  				min(end - index, (pgoff_t)PAGEVEC_SIZE),
  				indices)) {
  		pgoff_t nr_pages = 1;
  		for (i = 0; i < pagevec_count(&pvec); i++) {
  			struct page *pvec_ent = pvec.pages[i];
  			void *entry;
  
  			index = indices[i];
  			if (index >= end)
  				break;
  			if (WARN_ON_ONCE(
  			     !radix_tree_exceptional_entry(pvec_ent)))
  				continue;

  			xa_lock_irq(&mapping->i_pages);
  			entry = get_unlocked_mapping_entry(mapping, index, NULL);
  			if (entry) {
  				page = dax_busy_page(entry);
  				/*
  				 * Account for multi-order entries at
  				 * the end of the pagevec.
  				 */
  				if (i + 1 >= pagevec_count(&pvec))
  					nr_pages = 1UL << dax_radix_order(entry);
  			}
  			put_unlocked_mapping_entry(mapping, index, entry);
  			xa_unlock_irq(&mapping->i_pages);
  			if (page)
  				break;
  		}
  
  		/*
  		 * We don't expect normal struct page entries to exist in our
  		 * tree, but we keep these pagevec calls so that this code is
  		 * consistent with the common pattern for handling pagevecs
  		 * throughout the kernel.
  		 */
  		pagevec_remove_exceptionals(&pvec);
  		pagevec_release(&pvec);
  		index += nr_pages;
  
  		if (page)
  			break;
  	}
  	return page;
  }
  EXPORT_SYMBOL_GPL(dax_layout_busy_page);
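
  /*
   * Remove the DAX entry at @index unless it is dirty or queued for
   * writeback (@trunc forces removal).  Returns 1 if an entry was removed.
   */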
  static int __dax_invalidate_mapping_entry(struct address_space *mapping,
  					  pgoff_t index, bool trunc)
  {
  	int ret = 0;
  	void *entry;
  	struct radix_tree_root *pages = &mapping->i_pages;

  	xa_lock_irq(pages);
  	entry = get_unlocked_mapping_entry(mapping, index, NULL);
  	if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
  		goto out;
  	if (!trunc &&
  	    (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) ||
  	     radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)))
  		goto out;
  	dax_disassociate_entry(entry, mapping, trunc);
  	radix_tree_delete(pages, index);
  	mapping->nrexceptional--;
  	ret = 1;
  out:
  	put_unlocked_mapping_entry(mapping, index, entry);
  	xa_unlock_irq(pages);
  	return ret;
  }

  /*
   * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
   * entry to get unlocked before deleting it.
   */
  int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
  {
  	int ret = __dax_invalidate_mapping_entry(mapping, index, true);

  	/*
  	 * This gets called from truncate / punch_hole path. As such, the caller
  	 * must hold locks protecting against concurrent modifications of the
  	 * radix tree (usually fs-private i_mmap_sem for writing). Since the
  	 * caller has seen exceptional entry for this index, we better find it
  	 * at that index as well...
  	 */
  	WARN_ON_ONCE(!ret);
  	return ret;
  }
  
  /*
   * Invalidate exceptional DAX entry if it is clean.
   */
  int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
  				      pgoff_t index)
  {
  	return __dax_invalidate_mapping_entry(mapping, index, false);
  }
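
  /* Copy @size bytes at @sector of the DAX device into the page @to. */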
  static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
  		sector_t sector, size_t size, struct page *to,
  		unsigned long vaddr)
  {
  	void *vto, *kaddr;
  	pgoff_t pgoff;
  	long rc;
  	int id;

  	rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
  	if (rc)
  		return rc;

  	id = dax_read_lock();
  	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, NULL);
  	if (rc < 0) {
  		dax_read_unlock(id);
  		return rc;
  	}
  	vto = kmap_atomic(to);
  	copy_user_page(vto, (void __force *)kaddr, vaddr, to);
  	kunmap_atomic(vto);
  	dax_read_unlock(id);
  	return 0;
  }

  /*
   * By this point grab_mapping_entry() has ensured that we have a locked entry
   * of the appropriate size so we don't have to worry about downgrading PMDs to
   * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
   * already in the tree, we will skip the insertion and just dirty the PMD as
   * appropriate.
   */
  static void *dax_insert_mapping_entry(struct address_space *mapping,
  				      struct vm_fault *vmf,
  				      void *entry, pfn_t pfn_t,
  				      unsigned long flags, bool dirty)
  {
  	struct radix_tree_root *pages = &mapping->i_pages;
  	unsigned long pfn = pfn_t_to_pfn(pfn_t);
  	pgoff_t index = vmf->pgoff;
  	void *new_entry;

  	if (dirty)
  		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

  	if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
  		/* we are replacing a zero page with block mapping */
  		if (dax_is_pmd_entry(entry))
  			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
  							PG_PMD_NR, false);
  		else /* pte entry */
  			unmap_mapping_pages(mapping, vmf->pgoff, 1, false);
  	}
  	xa_lock_irq(pages);
  	new_entry = dax_radix_locked_entry(pfn, flags);
  	if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
  		dax_disassociate_entry(entry, mapping, false);
  		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
  	}

  	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
  		/*
  		 * Only swap our new entry into the radix tree if the current
  		 * entry is a zero page or an empty entry.  If a normal PTE or
  		 * PMD entry is already in the tree, we leave it alone.  This
  		 * means that if we are trying to insert a PTE and the
  		 * existing entry is a PMD, we will just leave the PMD in the
  		 * tree and dirty it if necessary.
  		 */
  		struct radix_tree_node *node;
  		void **slot;
  		void *ret;

  		ret = __radix_tree_lookup(pages, index, &node, &slot);
  		WARN_ON_ONCE(ret != entry);
  		__radix_tree_replace(pages, node, slot,
  				     new_entry, NULL);
  		entry = new_entry;
  	}

  	if (dirty)
  		radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY);

  	xa_unlock_irq(pages);
  	return entry;
  }
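
  /* Return the user virtual address in @vma that maps file offset @pgoff. */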
  static inline unsigned long
  pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
  {
  	unsigned long address;
  
  	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
  	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
  	return address;
  }
  
  /* Walk all mappings of a given index of a file and writeprotect them */
  static void dax_mapping_entry_mkclean(struct address_space *mapping,
  				      pgoff_t index, unsigned long pfn)
  {
  	struct vm_area_struct *vma;
  	pte_t pte, *ptep = NULL;
  	pmd_t *pmdp = NULL;
  	spinlock_t *ptl;

  	i_mmap_lock_read(mapping);
  	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
  		unsigned long address, start, end;
  
  		cond_resched();
  
  		if (!(vma->vm_flags & VM_SHARED))
  			continue;
  
  		address = pgoff_address(index, vma);
  
  		/*
  		 * Note because we provide start/end to follow_pte_pmd it will
  		 * call mmu_notifier_invalidate_range_start() on our behalf
  		 * before taking any lock.
  		 */
  		if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl))
  			continue;

  		/*
  		 * No need to call mmu_notifier_invalidate_range() as we are
  		 * downgrading page table protection not changing it to point
  		 * to a new page.
  		 *
  		 * See Documentation/vm/mmu_notifier.rst
  		 */
  		if (pmdp) {
  #ifdef CONFIG_FS_DAX_PMD
  			pmd_t pmd;
  
  			if (pfn != pmd_pfn(*pmdp))
  				goto unlock_pmd;
  			if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
  				goto unlock_pmd;
  
  			flush_cache_page(vma, address, pfn);
  			pmd = pmdp_huge_clear_flush(vma, address, pmdp);
  			pmd = pmd_wrprotect(pmd);
  			pmd = pmd_mkclean(pmd);
  			set_pmd_at(vma->vm_mm, address, pmdp, pmd);
  unlock_pmd:
  #endif
  			spin_unlock(ptl);
  		} else {
  			if (pfn != pte_pfn(*ptep))
  				goto unlock_pte;
  			if (!pte_dirty(*ptep) && !pte_write(*ptep))
  				goto unlock_pte;
  
  			flush_cache_page(vma, address, pfn);
  			pte = ptep_clear_flush(vma, address, ptep);
  			pte = pte_wrprotect(pte);
  			pte = pte_mkclean(pte);
  			set_pte_at(vma->vm_mm, address, ptep, pte);
  unlock_pte:
  			pte_unmap_unlock(ptep, ptl);
  		}

  		mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
  	}
  	i_mmap_unlock_read(mapping);
  }
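
  /*
   * Flush everything backing a single radix tree entry to the persistent
   * domain: write-protect the userspace mappings, flush the CPU caches for
   * the pfn range, then clear the radix tree dirty tag.
   */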
  static int dax_writeback_one(struct dax_device *dax_dev,
  		struct address_space *mapping, pgoff_t index, void *entry)
  {
  	struct radix_tree_root *pages = &mapping->i_pages;
  	void *entry2, **slot;
  	unsigned long pfn;
  	long ret = 0;
  	size_t size;

  	/*
  	 * A page got tagged dirty in DAX mapping? Something is seriously
  	 * wrong.
  	 */
  	if (WARN_ON(!radix_tree_exceptional_entry(entry)))
  		return -EIO;

  	xa_lock_irq(pages);
  	entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
  	/* Entry got punched out / reallocated? */
  	if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
  		goto put_unlocked;
  	/*
  	 * Entry got reallocated elsewhere? No need to writeback. We have to
  	 * compare pfns as we must not bail out due to difference in lockbit
  	 * or entry type.
  	 */
  	if (dax_radix_pfn(entry2) != dax_radix_pfn(entry))
  		goto put_unlocked;
  	if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
  				dax_is_zero_entry(entry))) {
  		ret = -EIO;
  		goto put_unlocked;
  	}
  	/* Another fsync thread may have already written back this entry */
  	if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))
  		goto put_unlocked;
  	/* Lock the entry to serialize with page faults */
  	entry = lock_slot(mapping, slot);
  	/*
  	 * We can clear the tag now but we have to be careful so that concurrent
  	 * dax_writeback_one() calls for the same index cannot finish before we
  	 * actually flush the caches. This is achieved as the calls will look
  	 * at the entry only under the i_pages lock and once they do that
  	 * they will see the entry locked and wait for it to unlock.
  	 */
  	radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE);
  	xa_unlock_irq(pages);

  	/*
  	 * Even if dax_writeback_mapping_range() was given a wbc->range_start
  	 * in the middle of a PMD, the 'index' we are given will be aligned to
  	 * the start index of the PMD, as will the pfn we pull from 'entry'.
  	 * This allows us to flush for PMD_SIZE and not have to worry about
  	 * partial PMD writebacks.
  	 */
  	pfn = dax_radix_pfn(entry);
  	size = PAGE_SIZE << dax_radix_order(entry);
  	dax_mapping_entry_mkclean(mapping, index, pfn);
  	dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size);
  	/*
  	 * After we have flushed the cache, we can clear the dirty tag. There
  	 * cannot be new dirty data in the pfn after the flush has completed as
  	 * the pfn mappings are writeprotected and fault waits for mapping
  	 * entry lock.
  	 */
  	xa_lock_irq(pages);
  	radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY);
  	xa_unlock_irq(pages);
  	trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
  	put_locked_mapping_entry(mapping, index);
  	return ret;
   put_unlocked:
  	put_unlocked_mapping_entry(mapping, index, entry2);
  	xa_unlock_irq(pages);
  	return ret;
  }
  
  /*
   * Flush the mapping to the persistent domain within the byte range of [start,
   * end]. This is required by data integrity operations to ensure file data is
   * on persistent storage prior to completion of the operation.
   */
  int dax_writeback_mapping_range(struct address_space *mapping,
  		struct block_device *bdev, struct writeback_control *wbc)
  {
  	struct inode *inode = mapping->host;
  	pgoff_t start_index, end_index;
  	pgoff_t indices[PAGEVEC_SIZE];
  	struct dax_device *dax_dev;
  	struct pagevec pvec;
  	bool done = false;
  	int i, ret = 0;

  	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
  		return -EIO;
  	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
  		return 0;
  	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
  	if (!dax_dev)
  		return -EIO;
  	start_index = wbc->range_start >> PAGE_SHIFT;
  	end_index = wbc->range_end >> PAGE_SHIFT;

  	trace_dax_writeback_range(inode, start_index, end_index);
  	tag_pages_for_writeback(mapping, start_index, end_index);
  	pagevec_init(&pvec);
  	while (!done) {
  		pvec.nr = find_get_entries_tag(mapping, start_index,
  				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
  				pvec.pages, indices);
  
  		if (pvec.nr == 0)
  			break;
  
  		for (i = 0; i < pvec.nr; i++) {
  			if (indices[i] > end_index) {
  				done = true;
  				break;
  			}
  			ret = dax_writeback_one(dax_dev, mapping, indices[i],
  					pvec.pages[i]);
  			if (ret < 0) {
  				mapping_set_error(mapping, ret);
  				goto out;
  			}
  		}
  		start_index = indices[pvec.nr - 1] + 1;
  	}
  out:
  	put_dax(dax_dev);
  	trace_dax_writeback_range_done(inode, start_index, end_index);
  	return (ret < 0 ? ret : 0);
  }
  EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
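
  /*
   * Helpers to translate a position within an iomap extent into a disk
   * sector, and then into a pfn via dax_direct_access().
   */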
  static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
  {
  	return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
  }

  static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
  			 pfn_t *pfnp)
  {
  	const sector_t sector = dax_iomap_sector(iomap, pos);
  	pgoff_t pgoff;
  	int id, rc;
  	long length;

  	rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
  	if (rc)
  		return rc;
  	id = dax_read_lock();
  	length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
  				   NULL, pfnp);
  	if (length < 0) {
  		rc = length;
  		goto out;
  	}
  	rc = -EINVAL;
  	if (PFN_PHYS(length) < size)
  		goto out;
  	if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
  		goto out;
  	/* For larger pages we need devmap */
  	if (length > 1 && !pfn_t_devmap(*pfnp))
  		goto out;
  	rc = 0;
  out:
cccbce671   Dan Williams   filesystem-dax: c...
1064
  	dax_read_unlock(id);
5e161e406   Jan Kara   dax: Factor out g...
1065
  	return rc;
0e3b210ce   Boaz Harrosh   dax: use pfn_mkwr...
1066
  }
0e3b210ce   Boaz Harrosh   dax: use pfn_mkwr...
1067

e30331ff0   Ross Zwisler   dax: relocate som...
1068
  /*
91d25ba8a   Ross Zwisler   dax: use common 4...
1069
1070
1071
1072
1073
   * The user has performed a load from a hole in the file.  Allocating a new
   * page in the file would cause excessive storage usage for workloads with
   * sparse files.  Instead we insert a read-only mapping of the 4k zero page.
   * If this page is ever written to we will re-fault and change the mapping to
   * point to real DAX storage instead.
e30331ff0   Ross Zwisler   dax: relocate som...
1074
   */
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1075
  static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
e30331ff0   Ross Zwisler   dax: relocate som...
1076
1077
1078
  			 struct vm_fault *vmf)
  {
  	struct inode *inode = mapping->host;
91d25ba8a   Ross Zwisler   dax: use common 4...
1079
  	unsigned long vaddr = vmf->address;
b90ca5cc3   Matthew Wilcox   filesystem-dax: F...
1080
1081
  	pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
  	vm_fault_t ret;
e30331ff0   Ross Zwisler   dax: relocate som...
1082

cc4a90ac8   Matthew Wilcox   dax: dax_insert_m...
1083
1084
  	dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE,
  			false);
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1085
  	ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
e30331ff0   Ross Zwisler   dax: relocate som...
1086
1087
1088
  	trace_dax_load_hole(inode, vmf, ret);
  	return ret;
  }
4b0228fa1   Vishal Verma   dax: for truncate...
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
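  /*
   * Partial-page zeroing can be handed off to the block layer only when both
   * the offset and the length are multiples of the device's logical block
   * size.
   */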
  static bool dax_range_is_aligned(struct block_device *bdev,
  				 unsigned int offset, unsigned int length)
  {
  	unsigned short sector_size = bdev_logical_block_size(bdev);
  
  	if (!IS_ALIGNED(offset, sector_size))
  		return false;
  	if (!IS_ALIGNED(length, sector_size))
  		return false;
  
  	return true;
  }
cccbce671   Dan Williams   filesystem-dax: c...
1101
1102
1103
  int __dax_zero_page_range(struct block_device *bdev,
  		struct dax_device *dax_dev, sector_t sector,
  		unsigned int offset, unsigned int size)
679c8bd3b   Christoph Hellwig   dax: export a low...
1104
  {
cccbce671   Dan Williams   filesystem-dax: c...
1105
1106
  	if (dax_range_is_aligned(bdev, offset, size)) {
  		sector_t start_sector = sector + (offset >> 9);
4b0228fa1   Vishal Verma   dax: for truncate...
1107
1108
  
  		return blkdev_issue_zeroout(bdev, start_sector,
53ef7d0e2   Linus Torvalds   Merge tag 'libnvd...
1109
  				size >> 9, GFP_NOFS, 0);
4b0228fa1   Vishal Verma   dax: for truncate...
1110
  	} else {
cccbce671   Dan Williams   filesystem-dax: c...
1111
1112
1113
  		pgoff_t pgoff;
  		long rc, id;
  		void *kaddr;
cccbce671   Dan Williams   filesystem-dax: c...
1114

e84b83b9e   Dan Williams   filesystem-dax: f...
1115
  		rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
cccbce671   Dan Williams   filesystem-dax: c...
1116
1117
1118
1119
  		if (rc)
  			return rc;
  
  		id = dax_read_lock();
86ed913b0   Huaisheng Ye   filesystem-dax: D...
1120
  		rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
cccbce671   Dan Williams   filesystem-dax: c...
1121
1122
1123
1124
  		if (rc < 0) {
  			dax_read_unlock(id);
  			return rc;
  		}
81f558701   Dan Williams   x86, dax: replace...
1125
  		memset(kaddr + offset, 0, size);
c3ca015fa   Mikulas Patocka   dax: remove the p...
1126
  		dax_flush(dax_dev, kaddr + offset, size);
cccbce671   Dan Williams   filesystem-dax: c...
1127
  		dax_read_unlock(id);
4b0228fa1   Vishal Verma   dax: for truncate...
1128
  	}
679c8bd3b   Christoph Hellwig   dax: export a low...
1129
1130
1131
  	return 0;
  }
  EXPORT_SYMBOL_GPL(__dax_zero_page_range);
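  /*
   * Illustrative sketch, not part of fs/dax.c: a caller zeroing a sub-page
   * range of a DAX file derives the block device, DAX device and sector from
   * the extent's iomap.  foo_dax_zero is a hypothetical helper.
   *
   *	static int foo_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
   *			struct iomap *iomap)
   *	{
   *		return __dax_zero_page_range(iomap->bdev, iomap->dax_dev,
   *				dax_iomap_sector(iomap, pos), offset, bytes);
   *	}
   */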
a254e5681   Christoph Hellwig   dax: provide an i...
1132
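  /*
   * The actor passed to iomap_apply() by dax_iomap_rw(): copy data for one
   * iomap extent between the iov_iter and the DAX device, bypassing the page
   * cache.  Reads from holes or unwritten extents are satisfied by zeroing
   * the iterator.
   */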
  static loff_t
11c59c92f   Ross Zwisler   dax: correct dax ...
1133
  dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
a254e5681   Christoph Hellwig   dax: provide an i...
1134
1135
  		struct iomap *iomap)
  {
cccbce671   Dan Williams   filesystem-dax: c...
1136
1137
  	struct block_device *bdev = iomap->bdev;
  	struct dax_device *dax_dev = iomap->dax_dev;
a254e5681   Christoph Hellwig   dax: provide an i...
1138
1139
1140
  	struct iov_iter *iter = data;
  	loff_t end = pos + length, done = 0;
  	ssize_t ret = 0;
a77d47864   Dan Williams   dax: Report bytes...
1141
  	size_t xfer;
cccbce671   Dan Williams   filesystem-dax: c...
1142
  	int id;
a254e5681   Christoph Hellwig   dax: provide an i...
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
  
  	if (iov_iter_rw(iter) == READ) {
  		end = min(end, i_size_read(inode));
  		if (pos >= end)
  			return 0;
  
  		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
  			return iov_iter_zero(min(length, end - pos), iter);
  	}
  
  	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
  		return -EIO;
e3fce68cd   Jan Kara   dax: Avoid page i...
1155
1156
1157
1158
1159
  	/*
  	 * A write can allocate a block for an area which has a hole page
  	 * mapped into the page tables. We have to tear down these mappings so
  	 * that data written by write(2) is visible in mmap.
  	 */
cd656375f   Jan Kara   mm: fix data corr...
1160
  	if (iomap->flags & IOMAP_F_NEW) {
e3fce68cd   Jan Kara   dax: Avoid page i...
1161
1162
1163
1164
  		invalidate_inode_pages2_range(inode->i_mapping,
  					      pos >> PAGE_SHIFT,
  					      (end - 1) >> PAGE_SHIFT);
  	}
cccbce671   Dan Williams   filesystem-dax: c...
1165
  	id = dax_read_lock();
a254e5681   Christoph Hellwig   dax: provide an i...
1166
1167
  	while (pos < end) {
  		unsigned offset = pos & (PAGE_SIZE - 1);
cccbce671   Dan Williams   filesystem-dax: c...
1168
1169
  		const size_t size = ALIGN(length + offset, PAGE_SIZE);
  		const sector_t sector = dax_iomap_sector(iomap, pos);
a254e5681   Christoph Hellwig   dax: provide an i...
1170
  		ssize_t map_len;
cccbce671   Dan Williams   filesystem-dax: c...
1171
1172
  		pgoff_t pgoff;
  		void *kaddr;
a254e5681   Christoph Hellwig   dax: provide an i...
1173

d1908f525   Michal Hocko   fs: break out of ...
1174
1175
1176
1177
  		if (fatal_signal_pending(current)) {
  			ret = -EINTR;
  			break;
  		}
cccbce671   Dan Williams   filesystem-dax: c...
1178
1179
1180
1181
1182
  		ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
  		if (ret)
  			break;
  
  		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
86ed913b0   Huaisheng Ye   filesystem-dax: D...
1183
  				&kaddr, NULL);
a254e5681   Christoph Hellwig   dax: provide an i...
1184
1185
1186
1187
  		if (map_len < 0) {
  			ret = map_len;
  			break;
  		}
cccbce671   Dan Williams   filesystem-dax: c...
1188
1189
  		map_len = PFN_PHYS(map_len);
  		kaddr += offset;
a254e5681   Christoph Hellwig   dax: provide an i...
1190
1191
1192
  		map_len -= offset;
  		if (map_len > end - pos)
  			map_len = end - pos;
a2e050f5a   Ross Zwisler   dax: explain how ...
1193
1194
1195
1196
1197
  		/*
  		 * The userspace address for the memory copy has already been
  		 * validated via access_ok() in either vfs_read() or
  		 * vfs_write(), depending on which operation we are doing.
  		 */
a254e5681   Christoph Hellwig   dax: provide an i...
1198
  		if (iov_iter_rw(iter) == WRITE)
a77d47864   Dan Williams   dax: Report bytes...
1199
  			xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
fec53774f   Dan Williams   filesystem-dax: c...
1200
  					map_len, iter);
a254e5681   Christoph Hellwig   dax: provide an i...
1201
  		else
a77d47864   Dan Williams   dax: Report bytes...
1202
  			xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
b3a9a0c36   Dan Williams   dax: Introduce a ...
1203
  					map_len, iter);
a254e5681   Christoph Hellwig   dax: provide an i...
1204

a77d47864   Dan Williams   dax: Report bytes...
1205
1206
1207
1208
1209
1210
1211
1212
  		pos += xfer;
  		length -= xfer;
  		done += xfer;
  
  		if (xfer == 0)
  			ret = -EFAULT;
  		if (xfer < map_len)
  			break;
a254e5681   Christoph Hellwig   dax: provide an i...
1213
  	}
cccbce671   Dan Williams   filesystem-dax: c...
1214
  	dax_read_unlock(id);
a254e5681   Christoph Hellwig   dax: provide an i...
1215
1216
1217
1218
1219
  
  	return done ? done : ret;
  }
  
  /**
11c59c92f   Ross Zwisler   dax: correct dax ...
1220
   * dax_iomap_rw - Perform I/O to a DAX file
a254e5681   Christoph Hellwig   dax: provide an i...
1221
1222
1223
1224
1225
1226
1227
1228
1229
   * @iocb:	The control block for this I/O
   * @iter:	The addresses to do I/O from or to
   * @ops:	iomap ops passed from the file system
   *
   * This function performs read and write operations to directly mapped
   * persistent memory.  The caller needs to take care of read/write exclusion
   * and of evicting any page cache pages in the region under I/O.
   */
  ssize_t
11c59c92f   Ross Zwisler   dax: correct dax ...
1230
  dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
8ff6daa17   Christoph Hellwig   iomap: constify s...
1231
  		const struct iomap_ops *ops)
a254e5681   Christoph Hellwig   dax: provide an i...
1232
1233
1234
1235
1236
  {
  	struct address_space *mapping = iocb->ki_filp->f_mapping;
  	struct inode *inode = mapping->host;
  	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
  	unsigned flags = 0;
168316db3   Christoph Hellwig   dax: assert that ...
1237
1238
  	if (iov_iter_rw(iter) == WRITE) {
  		lockdep_assert_held_exclusive(&inode->i_rwsem);
a254e5681   Christoph Hellwig   dax: provide an i...
1239
  		flags |= IOMAP_WRITE;
168316db3   Christoph Hellwig   dax: assert that ...
1240
1241
1242
  	} else {
  		lockdep_assert_held(&inode->i_rwsem);
  	}
a254e5681   Christoph Hellwig   dax: provide an i...
1243

a254e5681   Christoph Hellwig   dax: provide an i...
1244
1245
  	while (iov_iter_count(iter)) {
  		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
11c59c92f   Ross Zwisler   dax: correct dax ...
1246
  				iter, dax_iomap_actor);
a254e5681   Christoph Hellwig   dax: provide an i...
1247
1248
1249
1250
1251
1252
1253
1254
1255
  		if (ret <= 0)
  			break;
  		pos += ret;
  		done += ret;
  	}
  
  	iocb->ki_pos += done;
  	return done ? done : ret;
  }
11c59c92f   Ross Zwisler   dax: correct dax ...
1256
  EXPORT_SYMBOL_GPL(dax_iomap_rw);
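  /*
   * Illustrative sketch, not part of fs/dax.c: since the caller provides
   * read/write exclusion (see above), a filesystem's ->read_iter for a DAX
   * file might look like the following.  foo_iomap_ops is a hypothetical
   * iomap_ops instance supplied by the filesystem.
   *
   *	static ssize_t foo_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
   *	{
   *		struct inode *inode = file_inode(iocb->ki_filp);
   *		ssize_t ret;
   *
   *		inode_lock_shared(inode);
   *		ret = dax_iomap_rw(iocb, to, &foo_iomap_ops);
   *		inode_unlock_shared(inode);
   *
   *		file_accessed(iocb->ki_filp);
   *		return ret;
   *	}
   */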
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1257

ab77dab46   Souptick Joarder   fs/dax.c: use new...
1258
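  /*
   * Convert a 0 or negative errno from the fault path into the vm_fault_t
   * code expected by the MM.
   */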
  static vm_fault_t dax_fault_return(int error)
9f141d6ef   Jan Kara   dax: Call ->iomap...
1259
1260
1261
1262
1263
1264
1265
  {
  	if (error == 0)
  		return VM_FAULT_NOPAGE;
  	if (error == -ENOMEM)
  		return VM_FAULT_OOM;
  	return VM_FAULT_SIGBUS;
  }
aaa422c4c   Dan Williams   fs, dax: unify IO...
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
  /*
   * MAP_SYNC on a dax mapping guarantees dirty metadata is
   * flushed on write-faults (non-cow), but not read-faults.
   */
  static bool dax_fault_is_synchronous(unsigned long flags,
  		struct vm_area_struct *vma, struct iomap *iomap)
  {
  	return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
  		&& (iomap->flags & IOMAP_F_DIRTY);
  }
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1276
  static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
c0b246259   Jan Kara   dax: pass detaile...
1277
  			       int *iomap_errp, const struct iomap_ops *ops)
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1278
  {
a0987ad5c   Jan Kara   dax: Create local...
1279
1280
  	struct vm_area_struct *vma = vmf->vma;
  	struct address_space *mapping = vma->vm_file->f_mapping;
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1281
  	struct inode *inode = mapping->host;
1a29d85eb   Jan Kara   mm: use vmf->addr...
1282
  	unsigned long vaddr = vmf->address;
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1283
  	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1284
  	struct iomap iomap = { 0 };
9484ab1bf   Jan Kara   dax: Introduce IO...
1285
  	unsigned flags = IOMAP_FAULT;
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1286
  	int error, major = 0;
d2c43ef13   Jan Kara   dax: Create local...
1287
  	bool write = vmf->flags & FAULT_FLAG_WRITE;
caa51d26f   Jan Kara   dax, iomap: Add s...
1288
  	bool sync;
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1289
  	vm_fault_t ret = 0;
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1290
  	void *entry;
1b5a1cb21   Jan Kara   dax: Inline dax_i...
1291
  	pfn_t pfn;
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1292

ab77dab46   Souptick Joarder   fs/dax.c: use new...
1293
  	trace_dax_pte_fault(inode, vmf, ret);
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1294
1295
1296
1297
1298
  	/*
  	 * Check now that the offset isn't beyond the end of the file.  The
  	 * caller is supposed to hold locks serializing us with truncate /
  	 * punch hole, so this is a reliable test.
  	 */
a9c42b33e   Ross Zwisler   dax: add tracepoi...
1299
  	if (pos >= i_size_read(inode)) {
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1300
  		ret = VM_FAULT_SIGBUS;
a9c42b33e   Ross Zwisler   dax: add tracepoi...
1301
1302
  		goto out;
  	}
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1303

d2c43ef13   Jan Kara   dax: Create local...
1304
  	if (write && !vmf->cow_page)
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1305
  		flags |= IOMAP_WRITE;
13e451fdc   Jan Kara   dax: fix data cor...
1306
1307
  	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
  	if (IS_ERR(entry)) {
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1308
  		ret = dax_fault_return(PTR_ERR(entry));
13e451fdc   Jan Kara   dax: fix data cor...
1309
1310
  		goto out;
  	}
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1311
  	/*
e2093926a   Ross Zwisler   dax: fix race bet...
1312
1313
1314
1315
1316
1317
  	 * It is possible, particularly with mixed reads & writes to private
  	 * mappings, that we have raced with a PMD fault that overlaps with
  	 * the PTE we need to set up.  If so just return and the fault will be
  	 * retried.
  	 */
  	if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1318
  		ret = VM_FAULT_NOPAGE;
e2093926a   Ross Zwisler   dax: fix race bet...
1319
1320
1321
1322
  		goto unlock_entry;
  	}
  
  	/*
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1323
1324
1325
1326
1327
  	 * Note that we don't bother to use iomap_apply here: DAX requires
  	 * the file system block size to be equal to the page size, which means
  	 * that we never have to deal with more than a single extent here.
  	 */
  	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
c0b246259   Jan Kara   dax: pass detaile...
1328
1329
  	if (iomap_errp)
  		*iomap_errp = error;
a9c42b33e   Ross Zwisler   dax: add tracepoi...
1330
  	if (error) {
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1331
  		ret = dax_fault_return(error);
13e451fdc   Jan Kara   dax: fix data cor...
1332
  		goto unlock_entry;
a9c42b33e   Ross Zwisler   dax: add tracepoi...
1333
  	}
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1334
  	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
13e451fdc   Jan Kara   dax: fix data cor...
1335
1336
  		error = -EIO;	/* fs corruption? */
  		goto error_finish_iomap;
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1337
  	}
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1338
  	if (vmf->cow_page) {
31a6f1a6e   Jan Kara   dax: Simplify arg...
1339
  		sector_t sector = dax_iomap_sector(&iomap, pos);
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1340
1341
1342
1343
1344
1345
  		switch (iomap.type) {
  		case IOMAP_HOLE:
  		case IOMAP_UNWRITTEN:
  			clear_user_highpage(vmf->cow_page, vaddr);
  			break;
  		case IOMAP_MAPPED:
cccbce671   Dan Williams   filesystem-dax: c...
1346
1347
  			error = copy_user_dax(iomap.bdev, iomap.dax_dev,
  					sector, PAGE_SIZE, vmf->cow_page, vaddr);
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1348
1349
1350
1351
1352
1353
1354
1355
  			break;
  		default:
  			WARN_ON_ONCE(1);
  			error = -EIO;
  			break;
  		}
  
  		if (error)
13e451fdc   Jan Kara   dax: fix data cor...
1356
  			goto error_finish_iomap;
b1aa812b2   Jan Kara   mm: move handling...
1357
1358
  
  		__SetPageUptodate(vmf->cow_page);
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1359
1360
1361
  		ret = finish_fault(vmf);
  		if (!ret)
  			ret = VM_FAULT_DONE_COW;
13e451fdc   Jan Kara   dax: fix data cor...
1362
  		goto finish_iomap;
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1363
  	}
aaa422c4c   Dan Williams   fs, dax: unify IO...
1364
  	sync = dax_fault_is_synchronous(flags, vma, &iomap);
caa51d26f   Jan Kara   dax, iomap: Add s...
1365

a7d73fe6c   Christoph Hellwig   dax: provide an i...
1366
1367
1368
1369
  	switch (iomap.type) {
  	case IOMAP_MAPPED:
  		if (iomap.flags & IOMAP_F_NEW) {
  			count_vm_event(PGMAJFAULT);
a0987ad5c   Jan Kara   dax: Create local...
1370
  			count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1371
1372
  			major = VM_FAULT_MAJOR;
  		}
1b5a1cb21   Jan Kara   dax: Inline dax_i...
1373
1374
1375
  		error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
  		if (error < 0)
  			goto error_finish_iomap;
3fe0791c2   Dan Williams   dax: store pfns i...
1376
  		entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
caa51d26f   Jan Kara   dax, iomap: Add s...
1377
  						 0, write && !sync);
1b5a1cb21   Jan Kara   dax: Inline dax_i...
1378

caa51d26f   Jan Kara   dax, iomap: Add s...
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
  		/*
  		 * If we are doing a synchronous page fault and the inode needs
  		 * fsync, we can insert the PTE into the page tables only after
  		 * that happens. Skip the insertion for now and return the pfn
  		 * so that the caller can insert it after fsync is done.
  		 */
  		if (sync) {
  			if (WARN_ON_ONCE(!pfnp)) {
  				error = -EIO;
  				goto error_finish_iomap;
  			}
  			*pfnp = pfn;
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1391
  			ret = VM_FAULT_NEEDDSYNC | major;
caa51d26f   Jan Kara   dax, iomap: Add s...
1392
1393
  			goto finish_iomap;
  		}
1b5a1cb21   Jan Kara   dax: Inline dax_i...
1394
1395
  		trace_dax_insert_mapping(inode, vmf, entry);
  		if (write)
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1396
  			ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn);
1b5a1cb21   Jan Kara   dax: Inline dax_i...
1397
  		else
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1398
  			ret = vmf_insert_mixed(vma, vaddr, pfn);
1b5a1cb21   Jan Kara   dax: Inline dax_i...
1399

ab77dab46   Souptick Joarder   fs/dax.c: use new...
1400
  		goto finish_iomap;
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1401
1402
  	case IOMAP_UNWRITTEN:
  	case IOMAP_HOLE:
d2c43ef13   Jan Kara   dax: Create local...
1403
  		if (!write) {
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1404
  			ret = dax_load_hole(mapping, entry, vmf);
13e451fdc   Jan Kara   dax: fix data cor...
1405
  			goto finish_iomap;
1550290b0   Ross Zwisler   dax: dax_iomap_fa...
1406
  		}
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1407
1408
1409
1410
1411
1412
  		/*FALLTHRU*/
  	default:
  		WARN_ON_ONCE(1);
  		error = -EIO;
  		break;
  	}
13e451fdc   Jan Kara   dax: fix data cor...
1413
   error_finish_iomap:
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1414
  	ret = dax_fault_return(error);
9f141d6ef   Jan Kara   dax: Call ->iomap...
1415
1416
1417
   finish_iomap:
  	if (ops->iomap_end) {
  		int copied = PAGE_SIZE;
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1418
  		if (ret & VM_FAULT_ERROR)
9f141d6ef   Jan Kara   dax: Call ->iomap...
1419
1420
1421
1422
1423
1424
1425
1426
  			copied = 0;
  		/*
  		 * The fault is done by now and there's no way back (another
  		 * thread may already be happily using the PTE we have
  		 * installed). Just ignore any error from ->iomap_end since we
  		 * cannot do much with it.
  		 */
  		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
1550290b0   Ross Zwisler   dax: dax_iomap_fa...
1427
  	}
13e451fdc   Jan Kara   dax: fix data cor...
1428
   unlock_entry:
91d25ba8a   Ross Zwisler   dax: use common 4...
1429
  	put_locked_mapping_entry(mapping, vmf->pgoff);
13e451fdc   Jan Kara   dax: fix data cor...
1430
   out:
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1431
1432
  	trace_dax_pte_fault_done(inode, vmf, ret);
  	return ret | major;
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1433
  }
642261ac9   Ross Zwisler   dax: add struct i...
1434
1435
  
  #ifdef CONFIG_FS_DAX_PMD
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1436
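  /*
   * Handle a read fault over a hole with a PMD-sized mapping of the huge
   * zero page.  Fall back to PTEs if the zero page cannot be obtained or if
   * the PMD is no longer empty by the time we take the page table lock.
   */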
  static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
91d25ba8a   Ross Zwisler   dax: use common 4...
1437
  		void *entry)
642261ac9   Ross Zwisler   dax: add struct i...
1438
  {
f42003917   Dave Jiang   mm, dax: change p...
1439
1440
  	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
  	unsigned long pmd_addr = vmf->address & PMD_MASK;
653b2ea33   Ross Zwisler   dax: add tracepoi...
1441
  	struct inode *inode = mapping->host;
642261ac9   Ross Zwisler   dax: add struct i...
1442
  	struct page *zero_page;
653b2ea33   Ross Zwisler   dax: add tracepoi...
1443
  	void *ret = NULL;
642261ac9   Ross Zwisler   dax: add struct i...
1444
1445
  	spinlock_t *ptl;
  	pmd_t pmd_entry;
3fe0791c2   Dan Williams   dax: store pfns i...
1446
  	pfn_t pfn;
642261ac9   Ross Zwisler   dax: add struct i...
1447

f42003917   Dave Jiang   mm, dax: change p...
1448
  	zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
642261ac9   Ross Zwisler   dax: add struct i...
1449
1450
  
  	if (unlikely(!zero_page))
653b2ea33   Ross Zwisler   dax: add tracepoi...
1451
  		goto fallback;
642261ac9   Ross Zwisler   dax: add struct i...
1452

3fe0791c2   Dan Williams   dax: store pfns i...
1453
1454
  	pfn = page_to_pfn_t(zero_page);
  	ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
f5b7b7487   Jan Kara   dax: Allow tuning...
1455
  			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
642261ac9   Ross Zwisler   dax: add struct i...
1456

f42003917   Dave Jiang   mm, dax: change p...
1457
1458
  	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
  	if (!pmd_none(*(vmf->pmd))) {
642261ac9   Ross Zwisler   dax: add struct i...
1459
  		spin_unlock(ptl);
653b2ea33   Ross Zwisler   dax: add tracepoi...
1460
  		goto fallback;
642261ac9   Ross Zwisler   dax: add struct i...
1461
  	}
f42003917   Dave Jiang   mm, dax: change p...
1462
  	pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
642261ac9   Ross Zwisler   dax: add struct i...
1463
  	pmd_entry = pmd_mkhuge(pmd_entry);
f42003917   Dave Jiang   mm, dax: change p...
1464
  	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
642261ac9   Ross Zwisler   dax: add struct i...
1465
  	spin_unlock(ptl);
f42003917   Dave Jiang   mm, dax: change p...
1466
  	trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
642261ac9   Ross Zwisler   dax: add struct i...
1467
  	return VM_FAULT_NOPAGE;
653b2ea33   Ross Zwisler   dax: add tracepoi...
1468
1469
  
  fallback:
f42003917   Dave Jiang   mm, dax: change p...
1470
  	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
653b2ea33   Ross Zwisler   dax: add tracepoi...
1471
  	return VM_FAULT_FALLBACK;
642261ac9   Ross Zwisler   dax: add struct i...
1472
  }
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1473
  static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
a2d581675   Dave Jiang   mm,fs,dax: change...
1474
  			       const struct iomap_ops *ops)
642261ac9   Ross Zwisler   dax: add struct i...
1475
  {
f42003917   Dave Jiang   mm, dax: change p...
1476
  	struct vm_area_struct *vma = vmf->vma;
642261ac9   Ross Zwisler   dax: add struct i...
1477
  	struct address_space *mapping = vma->vm_file->f_mapping;
d8a849e1b   Dave Jiang   mm, dax: make pmd...
1478
1479
  	unsigned long pmd_addr = vmf->address & PMD_MASK;
  	bool write = vmf->flags & FAULT_FLAG_WRITE;
caa51d26f   Jan Kara   dax, iomap: Add s...
1480
  	bool sync;
9484ab1bf   Jan Kara   dax: Introduce IO...
1481
  	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
642261ac9   Ross Zwisler   dax: add struct i...
1482
  	struct inode *inode = mapping->host;
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1483
  	vm_fault_t result = VM_FAULT_FALLBACK;
642261ac9   Ross Zwisler   dax: add struct i...
1484
1485
  	struct iomap iomap = { 0 };
  	pgoff_t max_pgoff, pgoff;
642261ac9   Ross Zwisler   dax: add struct i...
1486
1487
1488
  	void *entry;
  	loff_t pos;
  	int error;
302a5e312   Jan Kara   dax: Inline dax_p...
1489
  	pfn_t pfn;
642261ac9   Ross Zwisler   dax: add struct i...
1490

282a8e039   Ross Zwisler   dax: add tracepoi...
1491
1492
1493
1494
1495
1496
  	/*
  	 * Check now that the offset isn't beyond the end of the file.  The
  	 * caller is supposed to hold locks serializing us with truncate /
  	 * punch hole, so this is a reliable test.
  	 */
  	pgoff = linear_page_index(vma, pmd_addr);
957ac8c42   Jeff Moyer   dax: fix PMD faul...
1497
  	max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
282a8e039   Ross Zwisler   dax: add tracepoi...
1498

f42003917   Dave Jiang   mm, dax: change p...
1499
  	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
282a8e039   Ross Zwisler   dax: add tracepoi...
1500

fffa281b4   Ross Zwisler   dax: fix deadlock...
1501
1502
1503
1504
1505
1506
1507
1508
1509
  	/*
  	 * Make sure that the faulting address's PMD offset (color) matches
  	 * the PMD offset from the start of the file.  This is necessary so
  	 * that a PMD range in the page table overlaps exactly with a PMD
  	 * range in the radix tree.
  	 */
  	if ((vmf->pgoff & PG_PMD_COLOUR) !=
  	    ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
  		goto fallback;
642261ac9   Ross Zwisler   dax: add struct i...
1510
1511
1512
1513
1514
1515
1516
1517
1518
  	/* Fall back to PTEs if we're going to COW */
  	if (write && !(vma->vm_flags & VM_SHARED))
  		goto fallback;
  
  	/* If the PMD would extend outside the VMA */
  	if (pmd_addr < vma->vm_start)
  		goto fallback;
  	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
  		goto fallback;
957ac8c42   Jeff Moyer   dax: fix PMD faul...
1519
  	if (pgoff >= max_pgoff) {
282a8e039   Ross Zwisler   dax: add tracepoi...
1520
1521
1522
  		result = VM_FAULT_SIGBUS;
  		goto out;
  	}
642261ac9   Ross Zwisler   dax: add struct i...
1523
1524
  
  	/* If the PMD would extend beyond the file size */
957ac8c42   Jeff Moyer   dax: fix PMD faul...
1525
  	if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
642261ac9   Ross Zwisler   dax: add struct i...
1526
1527
1528
  		goto fallback;
  
  	/*
91d25ba8a   Ross Zwisler   dax: use common 4...
1529
1530
1531
1532
  	 * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
  	 * 2MiB zero page entry or a DAX PMD.  If it can't (because a 4k page
  	 * is already in the tree, for instance), it will return -EEXIST and
  	 * we just fall back to 4k entries.
876f29460   Ross Zwisler   dax: fix PMD data...
1533
1534
1535
1536
1537
1538
  	 */
  	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
  	if (IS_ERR(entry))
  		goto fallback;
  
  	/*
e2093926a   Ross Zwisler   dax: fix race bet...
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
  	 * It is possible, particularly with mixed reads & writes to private
  	 * mappings, that we have raced with a PTE fault that overlaps with
  	 * the PMD we need to set up.  If so just return and the fault will be
  	 * retried.
  	 */
  	if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
  			!pmd_devmap(*vmf->pmd)) {
  		result = 0;
  		goto unlock_entry;
  	}
  
  	/*
642261ac9   Ross Zwisler   dax: add struct i...
1551
1552
1553
1554
1555
1556
1557
  	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
  	 * setting up a mapping, so really we're using iomap_begin() as a way
  	 * to look up our filesystem block.
  	 */
  	pos = (loff_t)pgoff << PAGE_SHIFT;
  	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
  	if (error)
876f29460   Ross Zwisler   dax: fix PMD data...
1558
  		goto unlock_entry;
9f141d6ef   Jan Kara   dax: Call ->iomap...
1559

642261ac9   Ross Zwisler   dax: add struct i...
1560
1561
  	if (iomap.offset + iomap.length < pos + PMD_SIZE)
  		goto finish_iomap;
aaa422c4c   Dan Williams   fs, dax: unify IO...
1562
  	sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
caa51d26f   Jan Kara   dax, iomap: Add s...
1563

642261ac9   Ross Zwisler   dax: add struct i...
1564
1565
  	switch (iomap.type) {
  	case IOMAP_MAPPED:
302a5e312   Jan Kara   dax: Inline dax_p...
1566
1567
1568
  		error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
  		if (error < 0)
  			goto finish_iomap;
3fe0791c2   Dan Williams   dax: store pfns i...
1569
  		entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
caa51d26f   Jan Kara   dax, iomap: Add s...
1570
  						RADIX_DAX_PMD, write && !sync);
302a5e312   Jan Kara   dax: Inline dax_p...
1571

caa51d26f   Jan Kara   dax, iomap: Add s...
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
  		/*
  		 * If we are doing a synchronous page fault and the inode needs
  		 * fsync, we can insert the PMD into the page tables only after
  		 * that happens. Skip the insertion for now and return the pfn
  		 * so that the caller can insert it after fsync is done.
  		 */
  		if (sync) {
  			if (WARN_ON_ONCE(!pfnp))
  				goto finish_iomap;
  			*pfnp = pfn;
  			result = VM_FAULT_NEEDDSYNC;
  			goto finish_iomap;
  		}
302a5e312   Jan Kara   dax: Inline dax_p...
1585
1586
1587
  		trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
  		result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
  					    write);
642261ac9   Ross Zwisler   dax: add struct i...
1588
1589
1590
1591
  		break;
  	case IOMAP_UNWRITTEN:
  	case IOMAP_HOLE:
  		if (WARN_ON_ONCE(write))
876f29460   Ross Zwisler   dax: fix PMD data...
1592
  			break;
91d25ba8a   Ross Zwisler   dax: use common 4...
1593
  		result = dax_pmd_load_hole(vmf, &iomap, entry);
642261ac9   Ross Zwisler   dax: add struct i...
1594
1595
1596
1597
1598
1599
1600
1601
  		break;
  	default:
  		WARN_ON_ONCE(1);
  		break;
  	}
  
   finish_iomap:
  	if (ops->iomap_end) {
9f141d6ef   Jan Kara   dax: Call ->iomap...
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
  		int copied = PMD_SIZE;
  
  		if (result == VM_FAULT_FALLBACK)
  			copied = 0;
  		/*
  		 * The fault is done by now and there's no way back (another
  		 * thread may already be happily using the PMD we have
  		 * installed). Just ignore any error from ->iomap_end since we
  		 * cannot do much with it.
  		 */
  		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
  				&iomap);
642261ac9   Ross Zwisler   dax: add struct i...
1614
  	}
876f29460   Ross Zwisler   dax: fix PMD data...
1615
   unlock_entry:
91d25ba8a   Ross Zwisler   dax: use common 4...
1616
  	put_locked_mapping_entry(mapping, pgoff);
642261ac9   Ross Zwisler   dax: add struct i...
1617
1618
   fallback:
  	if (result == VM_FAULT_FALLBACK) {
d8a849e1b   Dave Jiang   mm, dax: make pmd...
1619
  		split_huge_pmd(vma, vmf->pmd, vmf->address);
642261ac9   Ross Zwisler   dax: add struct i...
1620
1621
  		count_vm_event(THP_FAULT_FALLBACK);
  	}
282a8e039   Ross Zwisler   dax: add tracepoi...
1622
  out:
f42003917   Dave Jiang   mm, dax: change p...
1623
  	trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
642261ac9   Ross Zwisler   dax: add struct i...
1624
1625
  	return result;
  }
a2d581675   Dave Jiang   mm,fs,dax: change...
1626
  #else
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1627
  static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
01cddfe99   Arnd Bergmann   mm,fs,dax: mark d...
1628
  			       const struct iomap_ops *ops)
a2d581675   Dave Jiang   mm,fs,dax: change...
1629
1630
1631
  {
  	return VM_FAULT_FALLBACK;
  }
642261ac9   Ross Zwisler   dax: add struct i...
1632
  #endif /* CONFIG_FS_DAX_PMD */
a2d581675   Dave Jiang   mm,fs,dax: change...
1633
1634
1635
1636
  
  /**
   * dax_iomap_fault - handle a page fault on a DAX file
   * @vmf: The description of the fault
cec04e8c8   Jan Kara   dax: Fix comment ...
1637
   * @pe_size: Size of the page to fault in
9a0dd4225   Jan Kara   dax: Allow dax_io...
1638
   * @pfnp: PFN to insert for synchronous faults if fsync is required
c0b246259   Jan Kara   dax: pass detaile...
1639
   * @iomap_errp: Storage for detailed error code in case of error
cec04e8c8   Jan Kara   dax: Fix comment ...
1640
   * @ops: Iomap ops passed from the file system
a2d581675   Dave Jiang   mm,fs,dax: change...
1641
1642
1643
1644
1645
1646
   *
   * When a page fault occurs, filesystems may call this helper in
   * their fault handler for DAX files. dax_iomap_fault() assumes the caller
   * has done all the necessary locking for the page fault to proceed
   * successfully.
   */
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1647
  vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
c0b246259   Jan Kara   dax: pass detaile...
1648
  		    pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
a2d581675   Dave Jiang   mm,fs,dax: change...
1649
  {
c791ace1e   Dave Jiang   mm: replace FAULT...
1650
1651
  	switch (pe_size) {
  	case PE_SIZE_PTE:
c0b246259   Jan Kara   dax: pass detaile...
1652
  		return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
c791ace1e   Dave Jiang   mm: replace FAULT...
1653
  	case PE_SIZE_PMD:
9a0dd4225   Jan Kara   dax: Allow dax_io...
1654
  		return dax_iomap_pmd_fault(vmf, pfnp, ops);
a2d581675   Dave Jiang   mm,fs,dax: change...
1655
1656
1657
1658
1659
  	default:
  		return VM_FAULT_FALLBACK;
  	}
  }
  EXPORT_SYMBOL_GPL(dax_iomap_fault);
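  /*
   * Illustrative sketch, not part of fs/dax.c: a minimal PTE fault handler
   * in a filesystem's vm_operations_struct can simply forward to
   * dax_iomap_fault().  Locking against truncate and foo_iomap_ops are left
   * to the filesystem and are assumptions of this example.
   *
   *	static vm_fault_t foo_dax_fault(struct vm_fault *vmf)
   *	{
   *		return dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL,
   *				&foo_iomap_ops);
   *	}
   */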
71eab6dfd   Jan Kara   dax: Implement da...
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
  
  /**
   * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
   * @vmf: The description of the fault
   * @pe_size: Size of entry to be inserted
   * @pfn: PFN to insert
   *
   * This function inserts a writeable PTE or PMD entry into the page tables
   * for an mmapped DAX file.  It also takes care of marking the corresponding
   * radix tree entry as dirty.
   */
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1671
  static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf,
71eab6dfd   Jan Kara   dax: Implement da...
1672
1673
1674
1675
1676
1677
  				  enum page_entry_size pe_size,
  				  pfn_t pfn)
  {
  	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
  	void *entry, **slot;
  	pgoff_t index = vmf->pgoff;
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1678
  	vm_fault_t ret;
71eab6dfd   Jan Kara   dax: Implement da...
1679

b93b01631   Matthew Wilcox   page cache: use x...
1680
  	xa_lock_irq(&mapping->i_pages);
71eab6dfd   Jan Kara   dax: Implement da...
1681
1682
1683
1684
1685
1686
  	entry = get_unlocked_mapping_entry(mapping, index, &slot);
  	/* Did we race with someone splitting entry or so? */
  	if (!entry ||
  	    (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
  	    (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
  		put_unlocked_mapping_entry(mapping, index, entry);
b93b01631   Matthew Wilcox   page cache: use x...
1687
  		xa_unlock_irq(&mapping->i_pages);
71eab6dfd   Jan Kara   dax: Implement da...
1688
1689
1690
1691
  		trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
  						      VM_FAULT_NOPAGE);
  		return VM_FAULT_NOPAGE;
  	}
b93b01631   Matthew Wilcox   page cache: use x...
1692
  	radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY);
71eab6dfd   Jan Kara   dax: Implement da...
1693
  	entry = lock_slot(mapping, slot);
b93b01631   Matthew Wilcox   page cache: use x...
1694
  	xa_unlock_irq(&mapping->i_pages);
71eab6dfd   Jan Kara   dax: Implement da...
1695
1696
  	switch (pe_size) {
  	case PE_SIZE_PTE:
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1697
  		ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
71eab6dfd   Jan Kara   dax: Implement da...
1698
1699
1700
  		break;
  #ifdef CONFIG_FS_DAX_PMD
  	case PE_SIZE_PMD:
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1701
  		ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
71eab6dfd   Jan Kara   dax: Implement da...
1702
1703
1704
1705
  			pfn, true);
  		break;
  #endif
  	default:
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1706
  		ret = VM_FAULT_FALLBACK;
71eab6dfd   Jan Kara   dax: Implement da...
1707
1708
  	}
  	put_locked_mapping_entry(mapping, index);
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1709
1710
  	trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
  	return ret;
71eab6dfd   Jan Kara   dax: Implement da...
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
  }
  
  /**
   * dax_finish_sync_fault - finish synchronous page fault
   * @vmf: The description of the fault
   * @pe_size: Size of entry to be inserted
   * @pfn: PFN to insert
   *
   * This function ensures that the file range touched by the page fault is
   * stored persistently on the media and then inserts the appropriate page
   * table entry.
   */
ab77dab46   Souptick Joarder   fs/dax.c: use new...
1723
1724
  vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
  		enum page_entry_size pe_size, pfn_t pfn)
71eab6dfd   Jan Kara   dax: Implement da...
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
  {
  	int err;
  	loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
  	size_t len = 0;
  
  	if (pe_size == PE_SIZE_PTE)
  		len = PAGE_SIZE;
  	else if (pe_size == PE_SIZE_PMD)
  		len = PMD_SIZE;
  	else
  		WARN_ON_ONCE(1);
  	err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
  	if (err)
  		return VM_FAULT_SIGBUS;
  	return dax_insert_pfn_mkwrite(vmf, pe_size, pfn);
  }
  EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
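  /*
   * Illustrative sketch, not part of fs/dax.c: for a MAP_SYNC mapping the
   * fault handler passes a pfn_t and, when dax_iomap_fault() requests a
   * deferred sync via VM_FAULT_NEEDDSYNC, completes the fault with
   * dax_finish_sync_fault().  foo_iomap_ops is hypothetical.
   *
   *	static vm_fault_t foo_dax_huge_fault(struct vm_fault *vmf,
   *			enum page_entry_size pe_size)
   *	{
   *		pfn_t pfn;
   *		vm_fault_t ret;
   *
   *		ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &foo_iomap_ops);
   *		if (ret & VM_FAULT_NEEDDSYNC)
   *			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
   *		return ret;
   *	}
   */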