  /*
   * fs/dax.c - Direct Access filesystem code
   * Copyright (c) 2013-2014 Intel Corporation
   * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
   * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
   *
   * This program is free software; you can redistribute it and/or modify it
   * under the terms and conditions of the GNU General Public License,
   * version 2, as published by the Free Software Foundation.
   *
   * This program is distributed in the hope it will be useful, but WITHOUT
   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   * more details.
   */
  
  #include <linux/atomic.h>
  #include <linux/blkdev.h>
  #include <linux/buffer_head.h>
  #include <linux/dax.h>
  #include <linux/fs.h>
  #include <linux/genhd.h>
  #include <linux/highmem.h>
  #include <linux/memcontrol.h>
  #include <linux/mm.h>
  #include <linux/mutex.h>
  #include <linux/pagevec.h>
  #include <linux/sched.h>
  #include <linux/sched/signal.h>
  #include <linux/uio.h>
  #include <linux/vmstat.h>
  #include <linux/pfn_t.h>
  #include <linux/sizes.h>
  #include <linux/mmu_notifier.h>
  #include <linux/iomap.h>
  #include "internal.h"

  #define CREATE_TRACE_POINTS
  #include <trace/events/fs_dax.h>
  /* We choose 4096 entries - same as per-zone page wait tables */
  #define DAX_WAIT_TABLE_BITS 12
  #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
  /* The 'colour' (ie low bits) within a PMD of a page offset.  */
  #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
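/*
 * Example: with 4KiB pages and 2MiB PMDs this evaluates to 0x1ff, so
 * "index & ~PG_PMD_COLOUR" rounds a page offset down to the start of
 * the covering PMD.
 */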
  static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
  
  static int __init init_dax_wait_table(void)
  {
  	int i;
  
  	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
  		init_waitqueue_head(wait_table + i);
  	return 0;
  }
  fs_initcall(init_dax_wait_table);
  /*
   * We use lowest available bit in exceptional entry for locking, one bit for
   * the entry size (PMD) and two more to tell us if the entry is a zero page or
   * an empty entry that is just used for locking.  In total four special bits.
   *
   * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
   * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
   * block allocation.
   */
  #define RADIX_DAX_SHIFT		(RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
  #define RADIX_DAX_ENTRY_LOCK	(1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
  #define RADIX_DAX_PMD		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
  #define RADIX_DAX_ZERO_PAGE	(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
  #define RADIX_DAX_EMPTY		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
  
  static unsigned long dax_radix_sector(void *entry)
  {
  	return (unsigned long)entry >> RADIX_DAX_SHIFT;
  }
  
  static void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
  {
  	return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
  			((unsigned long)sector << RADIX_DAX_SHIFT) |
  			RADIX_DAX_ENTRY_LOCK);
  }
  
  static unsigned int dax_radix_order(void *entry)
  {
  	if ((unsigned long)entry & RADIX_DAX_PMD)
  		return PMD_SHIFT - PAGE_SHIFT;
  	return 0;
  }
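/*
 * Example encoding (follows directly from the definitions above): a locked
 * PMD entry for sector S is
 *	(S << RADIX_DAX_SHIFT) | RADIX_TREE_EXCEPTIONAL_ENTRY |
 *		RADIX_DAX_PMD | RADIX_DAX_ENTRY_LOCK
 * dax_radix_sector() recovers S by shifting the flag bits back out, and
 * dax_radix_order() reports PMD_SHIFT - PAGE_SHIFT for such an entry.
 */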
  static int dax_is_pmd_entry(void *entry)
  {
  	return (unsigned long)entry & RADIX_DAX_PMD;
  }
  static int dax_is_pte_entry(void *entry)
  {
  	return !((unsigned long)entry & RADIX_DAX_PMD);
  }
  static int dax_is_zero_entry(void *entry)
  {
  	return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
  }
  static int dax_is_empty_entry(void *entry)
  {
  	return (unsigned long)entry & RADIX_DAX_EMPTY;
  }
  /*
   * DAX radix tree locking
   */
  struct exceptional_entry_key {
  	struct address_space *mapping;
  	pgoff_t entry_start;
  };
  
  struct wait_exceptional_entry_queue {
  	wait_queue_entry_t wait;
  	struct exceptional_entry_key key;
  };
  static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
  		pgoff_t index, void *entry, struct exceptional_entry_key *key)
  {
  	unsigned long hash;
  
  	/*
  	 * If 'entry' is a PMD, align the 'index' that we use for the wait
  	 * queue to the start of that PMD.  This ensures that all offsets in
  	 * the range covered by the PMD map to the same bit lock.
  	 */
  	if (dax_is_pmd_entry(entry))
  		index &= ~PG_PMD_COLOUR;
  
  	key->mapping = mapping;
  	key->entry_start = index;
  
  	hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
  	return wait_table + hash;
  }
  static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode,
  				       int sync, void *keyp)
  {
  	struct exceptional_entry_key *key = keyp;
  	struct wait_exceptional_entry_queue *ewait =
  		container_of(wait, struct wait_exceptional_entry_queue, wait);
  
  	if (key->mapping != ewait->key.mapping ||
  	    key->entry_start != ewait->key.entry_start)
  		return 0;
  	return autoremove_wake_function(wait, mode, sync, NULL);
  }
  
  /*
   * We do not necessarily hold the mapping->tree_lock when we call this
   * function so it is possible that 'entry' is no longer a valid item in the
   * radix tree.  This is okay because all we really need to do is to find the
   * correct waitqueue where tasks might be waiting for that old 'entry' and
   * wake them.
   */
  static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
  		pgoff_t index, void *entry, bool wake_all)
  {
  	struct exceptional_entry_key key;
  	wait_queue_head_t *wq;
  
  	wq = dax_entry_waitqueue(mapping, index, entry, &key);
  
  	/*
  	 * Checking for locked entry and prepare_to_wait_exclusive() happens
  	 * under mapping->tree_lock, ditto for entry handling in our callers.
  	 * So at this point all tasks that could have seen our entry locked
  	 * must be in the waitqueue and the following check will see them.
  	 */
  	if (waitqueue_active(wq))
  		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
  }
  
  /*
   * Check whether the given slot is locked. The function must be called with
   * mapping->tree_lock held
   */
  static inline int slot_locked(struct address_space *mapping, void **slot)
  {
  	unsigned long entry = (unsigned long)
  		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
  	return entry & RADIX_DAX_ENTRY_LOCK;
  }
  
  /*
   * Mark the given slot is locked. The function must be called with
   * mapping->tree_lock held
   */
  static inline void *lock_slot(struct address_space *mapping, void **slot)
  {
  	unsigned long entry = (unsigned long)
  		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
  
  	entry |= RADIX_DAX_ENTRY_LOCK;
  	radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
  	return (void *)entry;
  }
  
  /*
   * Mark the given slot is unlocked. The function must be called with
   * mapping->tree_lock held
   */
  static inline void *unlock_slot(struct address_space *mapping, void **slot)
  {
  	unsigned long entry = (unsigned long)
  		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
  
  	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
  	radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
  	return (void *)entry;
  }
  
  /*
   * Lookup entry in radix tree, wait for it to become unlocked if it is
   * exceptional entry and return it. The caller must call
   * put_unlocked_mapping_entry() when he decided not to lock the entry or
   * put_locked_mapping_entry() when he locked the entry and now wants to
   * unlock it.
   *
   * The function must be called with mapping->tree_lock held.
   */
  static void *get_unlocked_mapping_entry(struct address_space *mapping,
  					pgoff_t index, void ***slotp)
  {
  	void *entry, **slot;
  	struct wait_exceptional_entry_queue ewait;
  	wait_queue_head_t *wq;
  
  	init_wait(&ewait.wait);
  	ewait.wait.func = wake_exceptional_entry_func;
  
  	for (;;) {
  		entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
  					  &slot);
  		if (!entry ||
  		    WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
  		    !slot_locked(mapping, slot)) {
  			if (slotp)
  				*slotp = slot;
  			return entry;
  		}
  
  		wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
  		prepare_to_wait_exclusive(wq, &ewait.wait,
  					  TASK_UNINTERRUPTIBLE);
  		spin_unlock_irq(&mapping->tree_lock);
  		schedule();
  		finish_wait(wq, &ewait.wait);
  		spin_lock_irq(&mapping->tree_lock);
  	}
  }
  static void dax_unlock_mapping_entry(struct address_space *mapping,
  				     pgoff_t index)
  {
  	void *entry, **slot;
  
  	spin_lock_irq(&mapping->tree_lock);
  	entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
  	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
  			 !slot_locked(mapping, slot))) {
  		spin_unlock_irq(&mapping->tree_lock);
  		return;
  	}
  	unlock_slot(mapping, slot);
  	spin_unlock_irq(&mapping->tree_lock);
  	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
  }
  static void put_locked_mapping_entry(struct address_space *mapping,
  		pgoff_t index)
  {
  	dax_unlock_mapping_entry(mapping, index);
  }
  
  /*
   * Called when we are done with radix tree entry we looked up via
   * get_unlocked_mapping_entry() and which we didn't lock in the end.
   */
  static void put_unlocked_mapping_entry(struct address_space *mapping,
  				       pgoff_t index, void *entry)
  {
  	if (!entry)
  		return;
  
  	/* We have to wake up next waiter for the radix tree entry lock */
  	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
  }
  /*
   * Find radix tree entry at given index. If it points to an exceptional entry,
   * return it with the radix tree entry locked. If the radix tree doesn't
   * contain given index, create an empty exceptional entry for the index and
   * return with it locked.
   *
   * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
   * either return that locked entry or will return an error.  This error will
   * happen if there are any 4k entries within the 2MiB range that we are
   * requesting.
   *
   * We always favor 4k entries over 2MiB entries. There isn't a flow where we
   * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB
   * insertion will fail if it finds any 4k entries already in the tree, and a
   * 4k insertion will cause an existing 2MiB entry to be unmapped and
   * downgraded to 4k entries.  This happens for both 2MiB huge zero pages as
   * well as 2MiB empty entries.
   *
   * The exception to this downgrade path is for 2MiB DAX PMD entries that have
   * real storage backing them.  We will leave these real 2MiB DAX entries in
   * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
   *
   * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
   * persistent memory the benefit is doubtful. We can add that later if we can
   * show it helps.
   */
  static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
  		unsigned long size_flag)
  {
  	bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
  	void *entry, **slot;
  
  restart:
  	spin_lock_irq(&mapping->tree_lock);
  	entry = get_unlocked_mapping_entry(mapping, index, &slot);

  	if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
  		entry = ERR_PTR(-EIO);
  		goto out_unlock;
  	}
  	if (entry) {
  		if (size_flag & RADIX_DAX_PMD) {
  			if (dax_is_pte_entry(entry)) {
  				put_unlocked_mapping_entry(mapping, index,
  						entry);
  				entry = ERR_PTR(-EEXIST);
  				goto out_unlock;
  			}
  		} else { /* trying to grab a PTE entry */
  			if (dax_is_pmd_entry(entry) &&
  			    (dax_is_zero_entry(entry) ||
  			     dax_is_empty_entry(entry))) {
  				pmd_downgrade = true;
  			}
  		}
  	}
  	/* No entry for given index? Make sure radix tree is big enough. */
  	if (!entry || pmd_downgrade) {
  		int err;
  		if (pmd_downgrade) {
  			/*
  			 * Make sure 'entry' remains valid while we drop
  			 * mapping->tree_lock.
  			 */
  			entry = lock_slot(mapping, slot);
  		}
  		spin_unlock_irq(&mapping->tree_lock);
  		/*
  		 * Besides huge zero pages the only other thing that gets
  		 * downgraded are empty entries which don't need to be
  		 * unmapped.
  		 */
  		if (pmd_downgrade && dax_is_zero_entry(entry))
  			unmap_mapping_range(mapping,
  				(index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
  		err = radix_tree_preload(
  				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
  		if (err) {
  			if (pmd_downgrade)
  				put_locked_mapping_entry(mapping, index);
  			return ERR_PTR(err);
  		}
  		spin_lock_irq(&mapping->tree_lock);

  		if (!entry) {
  			/*
  			 * We needed to drop the page_tree lock while calling
  			 * radix_tree_preload() and we didn't have an entry to
  			 * lock.  See if another thread inserted an entry at
  			 * our index during this time.
  			 */
  			entry = __radix_tree_lookup(&mapping->page_tree, index,
  					NULL, &slot);
  			if (entry) {
  				radix_tree_preload_end();
  				spin_unlock_irq(&mapping->tree_lock);
  				goto restart;
  			}
  		}
  		if (pmd_downgrade) {
  			radix_tree_delete(&mapping->page_tree, index);
  			mapping->nrexceptional--;
  			dax_wake_mapping_entry_waiter(mapping, index, entry,
  					true);
  		}
  
  		entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);
  
  		err = __radix_tree_insert(&mapping->page_tree, index,
  				dax_radix_order(entry), entry);
  		radix_tree_preload_end();
  		if (err) {
  			spin_unlock_irq(&mapping->tree_lock);
  			/*
  			 * Our insertion of a DAX entry failed, most likely
  			 * because we were inserting a PMD entry and it
  			 * collided with a PTE sized entry at a different
  			 * index in the PMD range.  We haven't inserted
  			 * anything into the radix tree and have no waiters to
  			 * wake.
  			 */
  			return ERR_PTR(err);
  		}
  		/* Good, we have inserted empty locked entry into the tree. */
  		mapping->nrexceptional++;
  		spin_unlock_irq(&mapping->tree_lock);
  		return entry;
  	}
  	entry = lock_slot(mapping, slot);
   out_unlock:
  	spin_unlock_irq(&mapping->tree_lock);
  	return entry;
  }
  static int __dax_invalidate_mapping_entry(struct address_space *mapping,
  					  pgoff_t index, bool trunc)
  {
  	int ret = 0;
  	void *entry;
  	struct radix_tree_root *page_tree = &mapping->page_tree;
  
  	spin_lock_irq(&mapping->tree_lock);
  	entry = get_unlocked_mapping_entry(mapping, index, NULL);
  	if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
  		goto out;
  	if (!trunc &&
  	    (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
  	     radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
  		goto out;
  	radix_tree_delete(page_tree, index);
  	mapping->nrexceptional--;
  	ret = 1;
  out:
  	put_unlocked_mapping_entry(mapping, index, entry);
  	spin_unlock_irq(&mapping->tree_lock);
  	return ret;
  }
  /*
   * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
   * entry to get unlocked before deleting it.
   */
  int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
  {
  	int ret = __dax_invalidate_mapping_entry(mapping, index, true);

  	/*
  	 * This gets called from truncate / punch_hole path. As such, the caller
  	 * must hold locks protecting against concurrent modifications of the
  	 * radix tree (usually fs-private i_mmap_sem for writing). Since the
  	 * caller has seen exceptional entry for this index, we better find it
  	 * at that index as well...
  	 */
  	WARN_ON_ONCE(!ret);
  	return ret;
  }
  
  /*
   * Invalidate exceptional DAX entry if it is clean.
   */
  int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
  				      pgoff_t index)
  {
  	return __dax_invalidate_mapping_entry(mapping, index, false);
  }
  static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
  		sector_t sector, size_t size, struct page *to,
  		unsigned long vaddr)
  {
  	void *vto, *kaddr;
  	pgoff_t pgoff;
  	pfn_t pfn;
  	long rc;
  	int id;
  
  	rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
  	if (rc)
  		return rc;
  
  	id = dax_read_lock();
  	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
  	if (rc < 0) {
  		dax_read_unlock(id);
  		return rc;
  	}
  	vto = kmap_atomic(to);
  	copy_user_page(vto, (void __force *)kaddr, vaddr, to);
  	kunmap_atomic(vto);
  	dax_read_unlock(id);
  	return 0;
  }
  /*
   * By this point grab_mapping_entry() has ensured that we have a locked entry
   * of the appropriate size so we don't have to worry about downgrading PMDs to
   * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
   * already in the tree, we will skip the insertion and just dirty the PMD as
   * appropriate.
   */
  static void *dax_insert_mapping_entry(struct address_space *mapping,
  				      struct vm_fault *vmf,
  				      void *entry, sector_t sector,
  				      unsigned long flags)
  {
  	struct radix_tree_root *page_tree = &mapping->page_tree;
  	void *new_entry;
  	pgoff_t index = vmf->pgoff;

  	if (vmf->flags & FAULT_FLAG_WRITE)
  		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

  	if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
  		/* we are replacing a zero page with block mapping */
  		if (dax_is_pmd_entry(entry))
  			unmap_mapping_range(mapping,
  					(vmf->pgoff << PAGE_SHIFT) & PMD_MASK,
  					PMD_SIZE, 0);
  		else /* pte entry */
  			unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
  					PAGE_SIZE, 0);
  	}
  	spin_lock_irq(&mapping->tree_lock);
  	new_entry = dax_radix_locked_entry(sector, flags);
  	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
  		/*
  		 * Only swap our new entry into the radix tree if the current
  		 * entry is a zero page or an empty entry.  If a normal PTE or
  		 * PMD entry is already in the tree, we leave it alone.  This
  		 * means that if we are trying to insert a PTE and the
  		 * existing entry is a PMD, we will just leave the PMD in the
  		 * tree and dirty it if necessary.
  		 */
  		struct radix_tree_node *node;
  		void **slot;
  		void *ret;

  		ret = __radix_tree_lookup(page_tree, index, &node, &slot);
  		WARN_ON_ONCE(ret != entry);
  		__radix_tree_replace(page_tree, node, slot,
  				     new_entry, NULL, NULL);
  		entry = new_entry;
  	}

  	if (vmf->flags & FAULT_FLAG_WRITE)
  		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);

  	spin_unlock_irq(&mapping->tree_lock);
  	return entry;
  }
  static inline unsigned long
  pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
  {
  	unsigned long address;
  
  	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
  	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
  	return address;
  }
  
  /* Walk all mappings of a given index of a file and writeprotect them */
  static void dax_mapping_entry_mkclean(struct address_space *mapping,
  				      pgoff_t index, unsigned long pfn)
  {
  	struct vm_area_struct *vma;
  	pte_t pte, *ptep = NULL;
  	pmd_t *pmdp = NULL;
  	spinlock_t *ptl;
  
  	i_mmap_lock_read(mapping);
  	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
  		unsigned long address, start, end;
  
  		cond_resched();
  
  		if (!(vma->vm_flags & VM_SHARED))
  			continue;
  
  		address = pgoff_address(index, vma);
  
  		/*
  		 * Note because we provide start/end to follow_pte_pmd it will
  		 * call mmu_notifier_invalidate_range_start() on our behalf
  		 * before taking any lock.
  		 */
  		if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl))
  			continue;

  		if (pmdp) {
  #ifdef CONFIG_FS_DAX_PMD
  			pmd_t pmd;
  
  			if (pfn != pmd_pfn(*pmdp))
  				goto unlock_pmd;
  			if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
  				goto unlock_pmd;
  
  			flush_cache_page(vma, address, pfn);
  			pmd = pmdp_huge_clear_flush(vma, address, pmdp);
  			pmd = pmd_wrprotect(pmd);
  			pmd = pmd_mkclean(pmd);
  			set_pmd_at(vma->vm_mm, address, pmdp, pmd);
  			mmu_notifier_invalidate_range(vma->vm_mm, start, end);
  unlock_pmd:
  #endif
  			spin_unlock(ptl);
  		} else {
  			if (pfn != pte_pfn(*ptep))
  				goto unlock_pte;
  			if (!pte_dirty(*ptep) && !pte_write(*ptep))
  				goto unlock_pte;
  
  			flush_cache_page(vma, address, pfn);
  			pte = ptep_clear_flush(vma, address, ptep);
  			pte = pte_wrprotect(pte);
  			pte = pte_mkclean(pte);
  			set_pte_at(vma->vm_mm, address, ptep, pte);
  			mmu_notifier_invalidate_range(vma->vm_mm, start, end);
  unlock_pte:
  			pte_unmap_unlock(ptep, ptl);
  		}

  		mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
  	}
  	i_mmap_unlock_read(mapping);
  }
  static int dax_writeback_one(struct block_device *bdev,
  		struct dax_device *dax_dev, struct address_space *mapping,
  		pgoff_t index, void *entry)
  {
  	struct radix_tree_root *page_tree = &mapping->page_tree;
  	void *entry2, **slot, *kaddr;
  	long ret = 0, id;
  	sector_t sector;
  	pgoff_t pgoff;
  	size_t size;
  	pfn_t pfn;

  	/*
  	 * A page got tagged dirty in DAX mapping? Something is seriously
  	 * wrong.
  	 */
  	if (WARN_ON(!radix_tree_exceptional_entry(entry)))
  		return -EIO;

  	spin_lock_irq(&mapping->tree_lock);
  	entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
  	/* Entry got punched out / reallocated? */
  	if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
  		goto put_unlocked;
  	/*
  	 * Entry got reallocated elsewhere? No need to writeback. We have to
  	 * compare sectors as we must not bail out due to difference in lockbit
  	 * or entry type.
  	 */
  	if (dax_radix_sector(entry2) != dax_radix_sector(entry))
  		goto put_unlocked;
  	if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
  				dax_is_zero_entry(entry))) {
  		ret = -EIO;
  		goto put_unlocked;
  	}
  	/* Another fsync thread may have already written back this entry */
  	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
  		goto put_unlocked;
  	/* Lock the entry to serialize with page faults */
  	entry = lock_slot(mapping, slot);
  	/*
  	 * We can clear the tag now but we have to be careful so that concurrent
  	 * dax_writeback_one() calls for the same index cannot finish before we
  	 * actually flush the caches. This is achieved as the calls will look
  	 * at the entry only under tree_lock and once they do that they will
  	 * see the entry locked and wait for it to unlock.
  	 */
  	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
  	spin_unlock_irq(&mapping->tree_lock);
  	/*
  	 * Even if dax_writeback_mapping_range() was given a wbc->range_start
  	 * in the middle of a PMD, the 'index' we are given will be aligned to
  	 * the start index of the PMD, as will the sector we pull from
  	 * 'entry'.  This allows us to flush for PMD_SIZE and not have to
  	 * worry about partial PMD writebacks.
  	 */
  	sector = dax_radix_sector(entry);
  	size = PAGE_SIZE << dax_radix_order(entry);
  
  	id = dax_read_lock();
  	ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
  	if (ret)
  		goto dax_unlock;
  
  	/*
  	 * dax_direct_access() may sleep, so cannot hold tree_lock over
  	 * its invocation.
  	 */
  	ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn);
  	if (ret < 0)
  		goto dax_unlock;

  	if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) {
  		ret = -EIO;
  		goto dax_unlock;
  	}
  	dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn));
  	dax_flush(dax_dev, kaddr, size);
  	/*
  	 * After we have flushed the cache, we can clear the dirty tag. There
  	 * cannot be new dirty data in the pfn after the flush has completed as
  	 * the pfn mappings are writeprotected and fault waits for mapping
  	 * entry lock.
  	 */
  	spin_lock_irq(&mapping->tree_lock);
  	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
  	spin_unlock_irq(&mapping->tree_lock);
  	trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
   dax_unlock:
  	dax_read_unlock(id);
  	put_locked_mapping_entry(mapping, index);
  	return ret;
   put_unlocked:
  	put_unlocked_mapping_entry(mapping, index, entry2);
  	spin_unlock_irq(&mapping->tree_lock);
  	return ret;
  }
  
  /*
   * Flush the mapping to the persistent domain within the byte range of [start,
   * end]. This is required by data integrity operations to ensure file data is
   * on persistent storage prior to completion of the operation.
   */
  int dax_writeback_mapping_range(struct address_space *mapping,
  		struct block_device *bdev, struct writeback_control *wbc)
  {
  	struct inode *inode = mapping->host;
  	pgoff_t start_index, end_index;
  	pgoff_t indices[PAGEVEC_SIZE];
  	struct dax_device *dax_dev;
  	struct pagevec pvec;
  	bool done = false;
  	int i, ret = 0;
  
  	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
  		return -EIO;
  	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
  		return 0;
  	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
  	if (!dax_dev)
  		return -EIO;
  	start_index = wbc->range_start >> PAGE_SHIFT;
  	end_index = wbc->range_end >> PAGE_SHIFT;

  	trace_dax_writeback_range(inode, start_index, end_index);
  	tag_pages_for_writeback(mapping, start_index, end_index);
  
  	pagevec_init(&pvec, 0);
  	while (!done) {
  		pvec.nr = find_get_entries_tag(mapping, start_index,
  				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
  				pvec.pages, indices);
  
  		if (pvec.nr == 0)
  			break;
  
  		for (i = 0; i < pvec.nr; i++) {
  			if (indices[i] > end_index) {
  				done = true;
  				break;
  			}
  			ret = dax_writeback_one(bdev, dax_dev, mapping,
  					indices[i], pvec.pages[i]);
  			if (ret < 0) {
  				mapping_set_error(mapping, ret);
  				goto out;
  			}
  		}
  		start_index = indices[pvec.nr - 1] + 1;
  	}
  out:
  	put_dax(dax_dev);
  	trace_dax_writeback_range_done(inode, start_index, end_index);
  	return (ret < 0 ? ret : 0);
  }
  EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
  static int dax_insert_mapping(struct address_space *mapping,
  		struct block_device *bdev, struct dax_device *dax_dev,
  		sector_t sector, size_t size, void *entry,
  		struct vm_area_struct *vma, struct vm_fault *vmf)
  {
  	unsigned long vaddr = vmf->address;
  	void *ret, *kaddr;
  	pgoff_t pgoff;
  	int id, rc;
  	pfn_t pfn;

  	rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
  	if (rc)
  		return rc;

  	id = dax_read_lock();
  	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
  	if (rc < 0) {
  		dax_read_unlock(id);
  		return rc;
  	}
  	dax_read_unlock(id);
  
  	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
  	if (IS_ERR(ret))
  		return PTR_ERR(ret);

  	trace_dax_insert_mapping(mapping->host, vmf, ret);
  	if (vmf->flags & FAULT_FLAG_WRITE)
  		return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
  	else
  		return vm_insert_mixed(vma, vaddr, pfn);
  }

  /*
   * The user has performed a load from a hole in the file.  Allocating a new
   * page in the file would cause excessive storage usage for workloads with
   * sparse files.  Instead we insert a read-only mapping of the 4k zero page.
   * If this page is ever written to we will re-fault and change the mapping to
   * point to real DAX storage instead.
   */
  static int dax_load_hole(struct address_space *mapping, void *entry,
  			 struct vm_fault *vmf)
  {
  	struct inode *inode = mapping->host;
  	unsigned long vaddr = vmf->address;
  	int ret = VM_FAULT_NOPAGE;
  	struct page *zero_page;
  	void *entry2;

  	zero_page = ZERO_PAGE(0);
  	if (unlikely(!zero_page)) {
  		ret = VM_FAULT_OOM;
  		goto out;
  	}
  	entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
  			RADIX_DAX_ZERO_PAGE);
  	if (IS_ERR(entry2)) {
  		ret = VM_FAULT_SIGBUS;
  		goto out;
  	}
  
  	vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page));
  out:
  	trace_dax_load_hole(inode, vmf, ret);
  	return ret;
  }
  static bool dax_range_is_aligned(struct block_device *bdev,
  				 unsigned int offset, unsigned int length)
  {
  	unsigned short sector_size = bdev_logical_block_size(bdev);
  
  	if (!IS_ALIGNED(offset, sector_size))
  		return false;
  	if (!IS_ALIGNED(length, sector_size))
  		return false;
  
  	return true;
  }
  int __dax_zero_page_range(struct block_device *bdev,
  		struct dax_device *dax_dev, sector_t sector,
  		unsigned int offset, unsigned int size)
  {
  	if (dax_range_is_aligned(bdev, offset, size)) {
  		sector_t start_sector = sector + (offset >> 9);
  
  		return blkdev_issue_zeroout(bdev, start_sector,
  				size >> 9, GFP_NOFS, 0);
  	} else {
  		pgoff_t pgoff;
  		long rc, id;
  		void *kaddr;
  		pfn_t pfn;
  		rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
  		if (rc)
  			return rc;
  
  		id = dax_read_lock();
  		rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr,
  				&pfn);
  		if (rc < 0) {
  			dax_read_unlock(id);
  			return rc;
  		}
  		memset(kaddr + offset, 0, size);
  		dax_flush(dax_dev, kaddr + offset, size);
  		dax_read_unlock(id);
  	}
  	return 0;
  }
  EXPORT_SYMBOL_GPL(__dax_zero_page_range);
  static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
  {
  	return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
  }
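/*
 * Example: for an extent with iomap->offset == 0 and iomap->blkno == 0,
 * a fault at pos == 2 * PAGE_SIZE maps to the 512-byte sector at byte
 * offset 2 * PAGE_SIZE into the block device, i.e. (2 * PAGE_SIZE) >> 9.
 */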

  static loff_t
  dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
  		struct iomap *iomap)
  {
  	struct block_device *bdev = iomap->bdev;
  	struct dax_device *dax_dev = iomap->dax_dev;
  	struct iov_iter *iter = data;
  	loff_t end = pos + length, done = 0;
  	ssize_t ret = 0;
  	int id;
  
  	if (iov_iter_rw(iter) == READ) {
  		end = min(end, i_size_read(inode));
  		if (pos >= end)
  			return 0;
  
  		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
  			return iov_iter_zero(min(length, end - pos), iter);
  	}
  
  	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
  		return -EIO;
  	/*
  	 * Write can allocate block for an area which has a hole page mapped
  	 * into page tables. We have to tear down these mappings so that data
  	 * written by write(2) is visible in mmap.
  	 */
  	if (iomap->flags & IOMAP_F_NEW) {
  		invalidate_inode_pages2_range(inode->i_mapping,
  					      pos >> PAGE_SHIFT,
  					      (end - 1) >> PAGE_SHIFT);
  	}
  	id = dax_read_lock();
  	while (pos < end) {
  		unsigned offset = pos & (PAGE_SIZE - 1);
  		const size_t size = ALIGN(length + offset, PAGE_SIZE);
  		const sector_t sector = dax_iomap_sector(iomap, pos);
  		ssize_t map_len;
  		pgoff_t pgoff;
  		void *kaddr;
  		pfn_t pfn;

  		if (fatal_signal_pending(current)) {
  			ret = -EINTR;
  			break;
  		}
  		ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
  		if (ret)
  			break;
  
  		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
  				&kaddr, &pfn);
  		if (map_len < 0) {
  			ret = map_len;
  			break;
  		}
  		map_len = PFN_PHYS(map_len);
  		kaddr += offset;
  		map_len -= offset;
  		if (map_len > end - pos)
  			map_len = end - pos;
  		/*
  		 * The userspace address for the memory copy has already been
  		 * validated via access_ok() in either vfs_read() or
  		 * vfs_write(), depending on which operation we are doing.
  		 */
  		if (iov_iter_rw(iter) == WRITE)
  			map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
  					map_len, iter);
  		else
  			map_len = copy_to_iter(kaddr, map_len, iter);
  		if (map_len <= 0) {
  			ret = map_len ? map_len : -EFAULT;
  			break;
  		}
  
  		pos += map_len;
  		length -= map_len;
  		done += map_len;
  	}
  	dax_read_unlock(id);
  
  	return done ? done : ret;
  }
  
  /**
   * dax_iomap_rw - Perform I/O to a DAX file
   * @iocb:	The control block for this I/O
   * @iter:	The addresses to do I/O from or to
   * @ops:	iomap ops passed from the file system
   *
   * This function performs read and write operations to directly mapped
   * persistent memory.  The callers needs to take care of read/write exclusion
   * and evicting any page cache pages in the region under I/O.
   */
  ssize_t
  dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
  		const struct iomap_ops *ops)
  {
  	struct address_space *mapping = iocb->ki_filp->f_mapping;
  	struct inode *inode = mapping->host;
  	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
  	unsigned flags = 0;
  	if (iov_iter_rw(iter) == WRITE) {
  		lockdep_assert_held_exclusive(&inode->i_rwsem);
  		flags |= IOMAP_WRITE;
  	} else {
  		lockdep_assert_held(&inode->i_rwsem);
  	}

  	while (iov_iter_count(iter)) {
  		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
  				iter, dax_iomap_actor);
  		if (ret <= 0)
  			break;
  		pos += ret;
  		done += ret;
  	}
  
  	iocb->ki_pos += done;
  	return done ? done : ret;
  }
  EXPORT_SYMBOL_GPL(dax_iomap_rw);

  static int dax_fault_return(int error)
  {
  	if (error == 0)
  		return VM_FAULT_NOPAGE;
  	if (error == -ENOMEM)
  		return VM_FAULT_OOM;
  	return VM_FAULT_SIGBUS;
  }
  static int dax_iomap_pte_fault(struct vm_fault *vmf,
  			       const struct iomap_ops *ops)
  {
  	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
  	struct inode *inode = mapping->host;
  	unsigned long vaddr = vmf->address;
  	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
  	sector_t sector;
  	struct iomap iomap = { 0 };
  	unsigned flags = IOMAP_FAULT;
  	int error, major = 0;
  	int vmf_ret = 0;
  	void *entry;
  	trace_dax_pte_fault(inode, vmf, vmf_ret);
  	/*
  	 * Check whether offset isn't beyond end of file now. Caller is supposed
  	 * to hold locks serializing us with truncate / punch hole so this is
  	 * a reliable test.
  	 */
  	if (pos >= i_size_read(inode)) {
  		vmf_ret = VM_FAULT_SIGBUS;
  		goto out;
  	}

  	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
  		flags |= IOMAP_WRITE;
  	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
  	if (IS_ERR(entry)) {
  		vmf_ret = dax_fault_return(PTR_ERR(entry));
  		goto out;
  	}
  	/*
  	 * It is possible, particularly with mixed reads & writes to private
  	 * mappings, that we have raced with a PMD fault that overlaps with
  	 * the PTE we need to set up.  If so just return and the fault will be
  	 * retried.
  	 */
  	if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
  		vmf_ret = VM_FAULT_NOPAGE;
  		goto unlock_entry;
  	}
  
  	/*
  	 * Note that we don't bother to use iomap_apply here: DAX required
  	 * the file system block size to be equal the page size, which means
  	 * that we never have to deal with more than a single extent here.
  	 */
  	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
  	if (error) {
  		vmf_ret = dax_fault_return(error);
  		goto unlock_entry;
  	}
  	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
13e451fdc   Jan Kara   dax: fix data cor...
1087
1088
  		error = -EIO;	/* fs corruption? */
  		goto error_finish_iomap;
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1089
  	}
333ccc978   Ross Zwisler   dax: add dax_ioma...
1090
  	sector = dax_iomap_sector(&iomap, pos);
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1091
1092
1093
1094
1095
1096
1097
1098
  
  	if (vmf->cow_page) {
  		switch (iomap.type) {
  		case IOMAP_HOLE:
  		case IOMAP_UNWRITTEN:
  			clear_user_highpage(vmf->cow_page, vaddr);
  			break;
  		case IOMAP_MAPPED:
cccbce671   Dan Williams   filesystem-dax: c...
1099
1100
  			error = copy_user_dax(iomap.bdev, iomap.dax_dev,
  					sector, PAGE_SIZE, vmf->cow_page, vaddr);
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1101
1102
1103
1104
1105
1106
1107
1108
  			break;
  		default:
  			WARN_ON_ONCE(1);
  			error = -EIO;
  			break;
  		}
  
  		if (error)
13e451fdc   Jan Kara   dax: fix data cor...
1109
  			goto error_finish_iomap;
b1aa812b2   Jan Kara   mm: move handling...
1110
1111
1112
1113
1114
  
  		__SetPageUptodate(vmf->cow_page);
  		vmf_ret = finish_fault(vmf);
  		if (!vmf_ret)
  			vmf_ret = VM_FAULT_DONE_COW;
13e451fdc   Jan Kara   dax: fix data cor...
1115
  		goto finish_iomap;
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1116
1117
1118
1119
1120
1121
  	}
  
  	switch (iomap.type) {
  	case IOMAP_MAPPED:
  		if (iomap.flags & IOMAP_F_NEW) {
  			count_vm_event(PGMAJFAULT);
2262185c5   Roman Gushchin   mm: per-cgroup me...
1122
  			count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1123
1124
  			major = VM_FAULT_MAJOR;
  		}
cccbce671   Dan Williams   filesystem-dax: c...
1125
  		error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
91d25ba8a   Ross Zwisler   dax: use common 4...
1126
  				sector, PAGE_SIZE, entry, vmf->vma, vmf);
9f141d6ef   Jan Kara   dax: Call ->iomap...
1127
1128
1129
  		/* -EBUSY is fine, somebody else faulted on the same PTE */
  		if (error == -EBUSY)
  			error = 0;
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1130
1131
1132
  		break;
  	case IOMAP_UNWRITTEN:
  	case IOMAP_HOLE:
1550290b0   Ross Zwisler   dax: dax_iomap_fa...
1133
  		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
91d25ba8a   Ross Zwisler   dax: use common 4...
1134
  			vmf_ret = dax_load_hole(mapping, entry, vmf);
13e451fdc   Jan Kara   dax: fix data cor...
1135
  			goto finish_iomap;
1550290b0   Ross Zwisler   dax: dax_iomap_fa...
1136
  		}
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1137
1138
1139
1140
1141
1142
  		/*FALLTHRU*/
  	default:
  		WARN_ON_ONCE(1);
  		error = -EIO;
  		break;
  	}
13e451fdc   Jan Kara   dax: fix data cor...
1143
   error_finish_iomap:
9f141d6ef   Jan Kara   dax: Call ->iomap...
1144
  	vmf_ret = dax_fault_return(error) | major;
9f141d6ef   Jan Kara   dax: Call ->iomap...
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
   finish_iomap:
  	if (ops->iomap_end) {
  		int copied = PAGE_SIZE;
  
  		if (vmf_ret & VM_FAULT_ERROR)
  			copied = 0;
  		/*
  		 * The fault is done by now and there's no way back (other
  		 * thread may be already happily using PTE we have installed).
  		 * Just ignore error from ->iomap_end since we cannot do much
  		 * with it.
  		 */
  		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
1550290b0   Ross Zwisler   dax: dax_iomap_fa...
1158
  	}
13e451fdc   Jan Kara   dax: fix data cor...
1159
   unlock_entry:
91d25ba8a   Ross Zwisler   dax: use common 4...
1160
  	put_locked_mapping_entry(mapping, vmf->pgoff);
13e451fdc   Jan Kara   dax: fix data cor...
1161
   out:
a9c42b33e   Ross Zwisler   dax: add tracepoi...
1162
  	trace_dax_pte_fault_done(inode, vmf, vmf_ret);
9f141d6ef   Jan Kara   dax: Call ->iomap...
1163
  	return vmf_ret;
a7d73fe6c   Christoph Hellwig   dax: provide an i...
1164
  }

#ifdef CONFIG_FS_DAX_PMD
static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
		loff_t pos, void *entry)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	const sector_t sector = dax_iomap_sector(iomap, pos);
	struct dax_device *dax_dev = iomap->dax_dev;
	struct block_device *bdev = iomap->bdev;
	struct inode *inode = mapping->host;
	const size_t size = PMD_SIZE;
	void *ret = NULL, *kaddr;
	long length = 0;
	pgoff_t pgoff;
	pfn_t pfn = {};
	int id;

	if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
		goto fallback;

	id = dax_read_lock();
	length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
	if (length < 0)
		goto unlock_fallback;
	length = PFN_PHYS(length);

	if (length < size)
		goto unlock_fallback;
	if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR)
		goto unlock_fallback;
	if (!pfn_t_devmap(pfn))
		goto unlock_fallback;
	dax_read_unlock(id);

	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
			RADIX_DAX_PMD);
	if (IS_ERR(ret))
		goto fallback;

	trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
			pfn, vmf->flags & FAULT_FLAG_WRITE);

unlock_fallback:
	dax_read_unlock(id);
fallback:
	trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);
	return VM_FAULT_FALLBACK;
}

static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
		void *entry)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	struct inode *inode = mapping->host;
	struct page *zero_page;
	void *ret = NULL;
	spinlock_t *ptl;
	pmd_t pmd_entry;

	zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);

	if (unlikely(!zero_page))
		goto fallback;

	ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
	if (IS_ERR(ret))
		goto fallback;

	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
	if (!pmd_none(*(vmf->pmd))) {
		spin_unlock(ptl);
		goto fallback;
	}

	pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
	pmd_entry = pmd_mkhuge(pmd_entry);
	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
	spin_unlock(ptl);
	trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
	return VM_FAULT_NOPAGE;

fallback:
	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
	return VM_FAULT_FALLBACK;
}

static int dax_iomap_pmd_fault(struct vm_fault *vmf,
			       const struct iomap_ops *ops)
{
	struct vm_area_struct *vma = vmf->vma;
	struct address_space *mapping = vma->vm_file->f_mapping;
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
	struct inode *inode = mapping->host;
	int result = VM_FAULT_FALLBACK;
	struct iomap iomap = { 0 };
	pgoff_t max_pgoff, pgoff;
	void *entry;
	loff_t pos;
	int error;

	/*
	 * Check whether offset isn't beyond end of file now. Caller is
	 * supposed to hold locks serializing us with truncate / punch hole so
	 * this is a reliable test.
	 */
	pgoff = linear_page_index(vma, pmd_addr);
	max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);

	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);

	/*
	 * Make sure that the faulting address's PMD offset (color) matches
	 * the PMD offset from the start of the file.  This is necessary so
	 * that a PMD range in the page table overlaps exactly with a PMD
	 * range in the radix tree.
	 */
	if ((vmf->pgoff & PG_PMD_COLOUR) !=
	    ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
		goto fallback;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED))
		goto fallback;

	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start)
		goto fallback;
	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
		goto fallback;

	if (pgoff >= max_pgoff) {
		result = VM_FAULT_SIGBUS;
		goto out;
	}

	/* If the PMD would extend beyond the file size */
	if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
		goto fallback;

	/*
	 * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
	 * 2MiB zero page entry or a DAX PMD.  If it can't (because a 4k page
	 * is already in the tree, for instance), it will return -EEXIST and
	 * we just fall back to 4k entries.
	 */
	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
	if (IS_ERR(entry))
		goto fallback;

	/*
	 * It is possible, particularly with mixed reads & writes to private
	 * mappings, that we have raced with a PTE fault that overlaps with
	 * the PMD we need to set up.  If so just return and the fault will be
	 * retried.
	 */
	if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
			!pmd_devmap(*vmf->pmd)) {
		result = 0;
		goto unlock_entry;
	}

	/*
	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
	 * setting up a mapping, so really we're using iomap_begin() as a way
	 * to look up our filesystem block.
	 */
	pos = (loff_t)pgoff << PAGE_SHIFT;
	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
	if (error)
		goto unlock_entry;

	if (iomap.offset + iomap.length < pos + PMD_SIZE)
		goto finish_iomap;

	switch (iomap.type) {
	case IOMAP_MAPPED:
		result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
		break;
	case IOMAP_UNWRITTEN:
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(write))
			break;
		result = dax_pmd_load_hole(vmf, &iomap, entry);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

 finish_iomap:
	if (ops->iomap_end) {
		int copied = PMD_SIZE;

		if (result == VM_FAULT_FALLBACK)
			copied = 0;
		/*
		 * The fault is done by now and there's no way back (other
		 * thread may be already happily using PMD we have installed).
		 * Just ignore error from ->iomap_end since we cannot do much
		 * with it.
		 */
		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
				&iomap);
	}
 unlock_entry:
	put_locked_mapping_entry(mapping, pgoff);
 fallback:
	if (result == VM_FAULT_FALLBACK) {
		split_huge_pmd(vma, vmf->pmd, vmf->address);
		count_vm_event(THP_FAULT_FALLBACK);
	}
out:
	trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
	return result;
}
#else
static int dax_iomap_pmd_fault(struct vm_fault *vmf,
			       const struct iomap_ops *ops)
{
	return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */

/**
 * dax_iomap_fault - handle a page fault on a DAX file
 * @vmf: The description of the fault
 * @pe_size: Size of the page table entry to fault in (PTE or PMD)
 * @ops: iomap ops passed from the file system
 *
 * When a page fault occurs, filesystems may call this helper in
 * their fault handler for DAX files. dax_iomap_fault() assumes the caller
 * has done all the necessary locking for page fault to proceed
 * successfully.
 */
int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
		    const struct iomap_ops *ops)
{
	switch (pe_size) {
	case PE_SIZE_PTE:
		return dax_iomap_pte_fault(vmf, ops);
	case PE_SIZE_PMD:
		return dax_iomap_pmd_fault(vmf, ops);
	default:
		return VM_FAULT_FALLBACK;
	}
}
  EXPORT_SYMBOL_GPL(dax_iomap_fault);
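
To illustrate the calling convention described in the kernel-doc above, here is a minimal sketch of how a filesystem might wire dax_iomap_fault() into its vm_operations_struct, loosely modelled on what ext4 and XFS do against this version of the API. The identifiers example_iomap_ops, example_dax_huge_fault, example_dax_fault and example_dax_vm_ops are hypothetical stand-ins, not code from fs/dax.c.

/* Hypothetical filesystem glue -- an editor's illustration, not part of fs/dax.c. */
extern const struct iomap_ops example_iomap_ops;	/* the filesystem's own iomap ops */

static int example_dax_huge_fault(struct vm_fault *vmf,
		enum page_entry_size pe_size)
{
	/*
	 * A real filesystem takes the locks the kernel-doc above requires
	 * (and, for write faults, typically calls sb_start_pagefault())
	 * before handing the fault to the DAX code.
	 */
	return dax_iomap_fault(vmf, pe_size, &example_iomap_ops);
}

static int example_dax_fault(struct vm_fault *vmf)
{
	/* The plain ->fault path is just the PTE-sized case. */
	return example_dax_huge_fault(vmf, PE_SIZE_PTE);
}

static const struct vm_operations_struct example_dax_vm_ops = {
	.fault		= example_dax_fault,
	.huge_fault	= example_dax_huge_fault,
};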