  /*
   * fs/dax.c - Direct Access filesystem code
   * Copyright (c) 2013-2014 Intel Corporation
   * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
   * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
   *
   * This program is free software; you can redistribute it and/or modify it
   * under the terms and conditions of the GNU General Public License,
   * version 2, as published by the Free Software Foundation.
   *
   * This program is distributed in the hope it will be useful, but WITHOUT
   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   * more details.
   */
  
  #include <linux/atomic.h>
  #include <linux/blkdev.h>
  #include <linux/buffer_head.h>
  #include <linux/dax.h>
  #include <linux/fs.h>
  #include <linux/genhd.h>
  #include <linux/highmem.h>
  #include <linux/memcontrol.h>
  #include <linux/mm.h>
  #include <linux/mutex.h>
  #include <linux/pagevec.h>
  #include <linux/pmem.h>
  #include <linux/sched.h>
  #include <linux/uio.h>
  #include <linux/vmstat.h>
  #include <linux/pfn_t.h>
  #include <linux/sizes.h>
  #include <linux/iomap.h>
  #include "internal.h"

  /*
 * We use the lowest available bit in an exceptional entry for locking and the
 * other two bits to determine the entry type. In total there are 3 special bits.
   */
  #define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
  #define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
  #define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
  #define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
  #define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
  #define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
  #define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
  		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
  		RADIX_TREE_EXCEPTIONAL_ENTRY))
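
/*
 * Illustrative sketch (not used by the code in this file): how a sector and
 * an entry type are packed into / unpacked from a DAX radix tree entry with
 * the helpers above.  RADIX_DAX_ENTRY_LOCK, pulled in from a header, is the
 * remaining special bit used for locking by the code below.
 *
 *	void *entry = RADIX_DAX_ENTRY(sector, false);
 *
 *	WARN_ON(RADIX_DAX_TYPE(entry) != RADIX_DAX_PTE);
 *	WARN_ON(RADIX_DAX_SECTOR(entry) != sector);
 */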

  /* We choose 4096 entries - same as per-zone page wait tables */
  #define DAX_WAIT_TABLE_BITS 12
  #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
  
  wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
  
  static int __init init_dax_wait_table(void)
  {
  	int i;
  
  	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
  		init_waitqueue_head(wait_table + i);
  	return 0;
  }
  fs_initcall(init_dax_wait_table);
  
  static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
  					      pgoff_t index)
  {
  	unsigned long hash = hash_long((unsigned long)mapping ^ index,
  				       DAX_WAIT_TABLE_BITS);
  	return wait_table + hash;
  }

  static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
  {
  	struct request_queue *q = bdev->bd_queue;
  	long rc = -EIO;
  	dax->addr = ERR_PTR(-EIO);
  	if (blk_queue_enter(q, true) != 0)
  		return rc;
  
  	rc = bdev_direct_access(bdev, dax);
  	if (rc < 0) {
  		dax->addr = ERR_PTR(rc);
  		blk_queue_exit(q);
  		return rc;
  	}
  	return rc;
  }
  
  static void dax_unmap_atomic(struct block_device *bdev,
  		const struct blk_dax_ctl *dax)
  {
  	if (IS_ERR(dax->addr))
  		return;
  	blk_queue_exit(bdev->bd_queue);
  }
  struct page *read_dax_sector(struct block_device *bdev, sector_t n)
  {
  	struct page *page = alloc_pages(GFP_KERNEL, 0);
  	struct blk_dax_ctl dax = {
  		.size = PAGE_SIZE,
  		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
  	};
  	long rc;
  
  	if (!page)
  		return ERR_PTR(-ENOMEM);
  
  	rc = dax_map_atomic(bdev, &dax);
  	if (rc < 0)
  		return ERR_PTR(rc);
  	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
  	dax_unmap_atomic(bdev, &dax);
  	return page;
  }
  static bool buffer_written(struct buffer_head *bh)
  {
  	return buffer_mapped(bh) && !buffer_unwritten(bh);
  }
  
  /*
   * When ext4 encounters a hole, it returns without modifying the buffer_head
   * which means that we can't trust b_size.  To cope with this, we set b_state
   * to 0 before calling get_block and, if any bit is set, we know we can trust
   * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
   * and would save us time calling get_block repeatedly.
   */
  static bool buffer_size_valid(struct buffer_head *bh)
  {
  	return bh->b_state != 0;
  }
  
  static sector_t to_sector(const struct buffer_head *bh,
  		const struct inode *inode)
  {
  	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
  
  	return sector;
  }
  static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
  		      loff_t start, loff_t end, get_block_t get_block,
  		      struct buffer_head *bh)
  {
  	loff_t pos = start, max = start, bh_max = start;
  	bool hole = false;
  	struct block_device *bdev = NULL;
  	int rw = iov_iter_rw(iter), rc;
  	long map_len = 0;
  	struct blk_dax_ctl dax = {
  		.addr = ERR_PTR(-EIO),
  	};
  	unsigned blkbits = inode->i_blkbits;
  	sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
  								>> blkbits;
  
  	if (rw == READ)
  		end = min(end, i_size_read(inode));
  
  	while (pos < end) {
  		size_t len;
  		if (pos == max) {
  			long page = pos >> PAGE_SHIFT;
  			sector_t block = page << (PAGE_SHIFT - blkbits);
  			unsigned first = pos - (block << blkbits);
  			long size;
  
  			if (pos == bh_max) {
  				bh->b_size = PAGE_ALIGN(end - pos);
  				bh->b_state = 0;
  				rc = get_block(inode, block, bh, rw == WRITE);
  				if (rc)
  					break;
  				if (!buffer_size_valid(bh))
  					bh->b_size = 1 << blkbits;
  				bh_max = pos - first + bh->b_size;
  				bdev = bh->b_bdev;
  				/*
  				 * We allow uninitialized buffers for writes
  				 * beyond EOF as those cannot race with faults
  				 */
  				WARN_ON_ONCE(
  					(buffer_new(bh) && block < file_blks) ||
  					(rw == WRITE && buffer_unwritten(bh)));
  			} else {
  				unsigned done = bh->b_size -
  						(bh_max - (pos - first));
  				bh->b_blocknr += done >> blkbits;
  				bh->b_size -= done;
  			}
  			hole = rw == READ && !buffer_written(bh);
  			if (hole) {
  				size = bh->b_size - first;
  			} else {
  				dax_unmap_atomic(bdev, &dax);
  				dax.sector = to_sector(bh, inode);
  				dax.size = bh->b_size;
  				map_len = dax_map_atomic(bdev, &dax);
  				if (map_len < 0) {
  					rc = map_len;
  					break;
  				}
  				dax.addr += first;
  				size = map_len - first;
  			}
  			/*
  			 * pos + size is one past the last offset for IO,
  			 * so pos + size can overflow loff_t at extreme offsets.
  			 * Cast to u64 to catch this and get the true minimum.
  			 */
  			max = min_t(u64, pos + size, end);
  		}
  		if (iov_iter_rw(iter) == WRITE) {
  			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
  		} else if (!hole)
  			len = copy_to_iter((void __force *) dax.addr, max - pos,
  					iter);
  		else
  			len = iov_iter_zero(max - pos, iter);
  		if (!len) {
  			rc = -EFAULT;
  			break;
  		}
  
  		pos += len;
  		if (!IS_ERR(dax.addr))
  			dax.addr += len;
  	}
  	dax_unmap_atomic(bdev, &dax);

  	return (pos == start) ? rc : pos - start;
  }
  
  /**
   * dax_do_io - Perform I/O to a DAX file
   * @iocb: The control block for this I/O
   * @inode: The file which the I/O is directed at
   * @iter: The addresses to do I/O from or to
   * @get_block: The filesystem method used to translate file offsets to blocks
   * @end_io: A filesystem callback for I/O completion
   * @flags: See below
   *
   * This function uses the same locking scheme as do_blockdev_direct_IO:
   * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
   * caller for writes.  For reads, we take and release the i_mutex ourselves.
   * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
   * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
   * is in progress.
   */
  ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
  		  struct iov_iter *iter, get_block_t get_block,
  		  dio_iodone_t end_io, int flags)
  {
  	struct buffer_head bh;
  	ssize_t retval = -EINVAL;
  	loff_t pos = iocb->ki_pos;
  	loff_t end = pos + iov_iter_count(iter);
  
  	memset(&bh, 0, sizeof(bh));
  	bh.b_bdev = inode->i_sb->s_bdev;

  	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
  		inode_lock(inode);
  
  	/* Protects against truncate */
  	if (!(flags & DIO_SKIP_DIO_COUNT))
  		inode_dio_begin(inode);

  	retval = dax_io(inode, iter, pos, end, get_block, &bh);

  	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
  		inode_unlock(inode);

  	if (end_io) {
  		int err;
  
  		err = end_io(iocb, pos, retval, bh.b_private);
  		if (err)
  			retval = err;
  	}

  	if (!(flags & DIO_SKIP_DIO_COUNT))
  		inode_dio_end(inode);
  	return retval;
  }
  EXPORT_SYMBOL_GPL(dax_do_io);
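
/*
 * Illustrative sketch of a caller (not part of this file; names outside this
 * file are assumptions): a filesystem's ->direct_IO method can forward I/O on
 * a DAX inode to dax_do_io(), passing its own get_block callback:
 *
 *	static ssize_t example_dax_direct_IO(struct kiocb *iocb,
 *					     struct iov_iter *iter)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *
 *		return dax_do_io(iocb, inode, iter, example_get_block,
 *				NULL, DIO_LOCKING);
 *	}
 */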
  
  /*
   * DAX radix tree locking
   */
  struct exceptional_entry_key {
  	struct address_space *mapping;
  	unsigned long index;
  };
  
  struct wait_exceptional_entry_queue {
  	wait_queue_t wait;
  	struct exceptional_entry_key key;
  };
  
  static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
  				       int sync, void *keyp)
  {
  	struct exceptional_entry_key *key = keyp;
  	struct wait_exceptional_entry_queue *ewait =
  		container_of(wait, struct wait_exceptional_entry_queue, wait);
  
  	if (key->mapping != ewait->key.mapping ||
  	    key->index != ewait->key.index)
  		return 0;
  	return autoremove_wake_function(wait, mode, sync, NULL);
  }
  
  /*
   * Check whether the given slot is locked. The function must be called with
   * mapping->tree_lock held
   */
  static inline int slot_locked(struct address_space *mapping, void **slot)
  {
  	unsigned long entry = (unsigned long)
  		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
  	return entry & RADIX_DAX_ENTRY_LOCK;
  }
  
  /*
 * Mark the given slot as locked. The function must be called with
   * mapping->tree_lock held
   */
  static inline void *lock_slot(struct address_space *mapping, void **slot)
  {
  	unsigned long entry = (unsigned long)
  		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
  
  	entry |= RADIX_DAX_ENTRY_LOCK;
  	radix_tree_replace_slot(slot, (void *)entry);
  	return (void *)entry;
  }
  
  /*
 * Mark the given slot as unlocked. The function must be called with
   * mapping->tree_lock held
   */
  static inline void *unlock_slot(struct address_space *mapping, void **slot)
  {
  	unsigned long entry = (unsigned long)
  		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
  
  	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
  	radix_tree_replace_slot(slot, (void *)entry);
  	return (void *)entry;
  }
  
  /*
 * Look up an entry in the radix tree, wait for it to become unlocked if it
 * is an exceptional entry, and return it. The caller must call
 * put_unlocked_mapping_entry() if it decides not to lock the entry, or
 * put_locked_mapping_entry() once it has locked the entry and wants to
 * unlock it again.
   *
   * The function must be called with mapping->tree_lock held.
   */
  static void *get_unlocked_mapping_entry(struct address_space *mapping,
  					pgoff_t index, void ***slotp)
  {
  	void *ret, **slot;
  	struct wait_exceptional_entry_queue ewait;
  	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
  
  	init_wait(&ewait.wait);
  	ewait.wait.func = wake_exceptional_entry_func;
  	ewait.key.mapping = mapping;
  	ewait.key.index = index;
  
  	for (;;) {
  		ret = __radix_tree_lookup(&mapping->page_tree, index, NULL,
  					  &slot);
  		if (!ret || !radix_tree_exceptional_entry(ret) ||
  		    !slot_locked(mapping, slot)) {
  			if (slotp)
  				*slotp = slot;
  			return ret;
  		}
  		prepare_to_wait_exclusive(wq, &ewait.wait,
  					  TASK_UNINTERRUPTIBLE);
  		spin_unlock_irq(&mapping->tree_lock);
  		schedule();
  		finish_wait(wq, &ewait.wait);
  		spin_lock_irq(&mapping->tree_lock);
  	}
  }
  
  /*
 * Find the radix tree entry at the given index. If it points to a page, return
 * with the page locked. If it points to an exceptional entry, return with the
 * radix tree entry locked. If the radix tree doesn't contain the given index,
 * create an empty exceptional entry for the index and return with it locked.
   *
   * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
   * persistent memory the benefit is doubtful. We can add that later if we can
   * show it helps.
   */
  static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
  {
  	void *ret, **slot;
  
  restart:
  	spin_lock_irq(&mapping->tree_lock);
  	ret = get_unlocked_mapping_entry(mapping, index, &slot);
  	/* No entry for given index? Make sure radix tree is big enough. */
  	if (!ret) {
  		int err;
  
  		spin_unlock_irq(&mapping->tree_lock);
  		err = radix_tree_preload(
  				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
  		if (err)
  			return ERR_PTR(err);
  		ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
  			       RADIX_DAX_ENTRY_LOCK);
  		spin_lock_irq(&mapping->tree_lock);
  		err = radix_tree_insert(&mapping->page_tree, index, ret);
  		radix_tree_preload_end();
  		if (err) {
  			spin_unlock_irq(&mapping->tree_lock);
  			/* Someone already created the entry? */
  			if (err == -EEXIST)
  				goto restart;
  			return ERR_PTR(err);
  		}
  		/* Good, we have inserted empty locked entry into the tree. */
  		mapping->nrexceptional++;
  		spin_unlock_irq(&mapping->tree_lock);
  		return ret;
  	}
  	/* Normal page in radix tree? */
  	if (!radix_tree_exceptional_entry(ret)) {
  		struct page *page = ret;
  
  		get_page(page);
  		spin_unlock_irq(&mapping->tree_lock);
  		lock_page(page);
  		/* Page got truncated? Retry... */
  		if (unlikely(page->mapping != mapping)) {
  			unlock_page(page);
  			put_page(page);
  			goto restart;
  		}
  		return page;
  	}
  	ret = lock_slot(mapping, slot);
  	spin_unlock_irq(&mapping->tree_lock);
  	return ret;
  }
  
  void dax_wake_mapping_entry_waiter(struct address_space *mapping,
  				   pgoff_t index, bool wake_all)
  {
  	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
  
  	/*
  	 * Checking for locked entry and prepare_to_wait_exclusive() happens
  	 * under mapping->tree_lock, ditto for entry handling in our callers.
  	 * So at this point all tasks that could have seen our entry locked
  	 * must be in the waitqueue and the following check will see them.
  	 */
  	if (waitqueue_active(wq)) {
  		struct exceptional_entry_key key;
  
  		key.mapping = mapping;
  		key.index = index;
  		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
  	}
  }
  void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
  {
  	void *ret, **slot;
  
  	spin_lock_irq(&mapping->tree_lock);
  	ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
  	if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) ||
  			 !slot_locked(mapping, slot))) {
  		spin_unlock_irq(&mapping->tree_lock);
  		return;
  	}
  	unlock_slot(mapping, slot);
  	spin_unlock_irq(&mapping->tree_lock);
  	dax_wake_mapping_entry_waiter(mapping, index, false);
  }
  
  static void put_locked_mapping_entry(struct address_space *mapping,
  				     pgoff_t index, void *entry)
  {
  	if (!radix_tree_exceptional_entry(entry)) {
  		unlock_page(entry);
  		put_page(entry);
  	} else {
  		dax_unlock_mapping_entry(mapping, index);
  	}
  }
  
  /*
   * Called when we are done with radix tree entry we looked up via
   * get_unlocked_mapping_entry() and which we didn't lock in the end.
   */
  static void put_unlocked_mapping_entry(struct address_space *mapping,
  				       pgoff_t index, void *entry)
  {
  	if (!radix_tree_exceptional_entry(entry))
  		return;
  
  	/* We have to wake up next waiter for the radix tree entry lock */
  	dax_wake_mapping_entry_waiter(mapping, index, false);
  }
  
  /*
   * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
   * entry to get unlocked before deleting it.
   */
  int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
  {
  	void *entry;
  
  	spin_lock_irq(&mapping->tree_lock);
  	entry = get_unlocked_mapping_entry(mapping, index, NULL);
  	/*
  	 * This gets called from truncate / punch_hole path. As such, the caller
  	 * must hold locks protecting against concurrent modifications of the
  	 * radix tree (usually fs-private i_mmap_sem for writing). Since the
	 * caller has seen an exceptional entry for this index, we expect to find it
  	 * at that index as well...
  	 */
  	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
  		spin_unlock_irq(&mapping->tree_lock);
  		return 0;
  	}
  	radix_tree_delete(&mapping->page_tree, index);
  	mapping->nrexceptional--;
  	spin_unlock_irq(&mapping->tree_lock);
  	dax_wake_mapping_entry_waiter(mapping, index, true);
  
  	return 1;
  }
  
  /*
   * The user has performed a load from a hole in the file.  Allocating
   * a new page in the file would cause excessive storage usage for
   * workloads with sparse files.  We allocate a page cache page instead.
   * We'll kick it out of the page cache if it's ever written to,
   * otherwise it will simply fall out of the page cache under memory
   * pressure without ever having been dirtied.
   */
  static int dax_load_hole(struct address_space *mapping, void *entry,
  			 struct vm_fault *vmf)
  {
  	struct page *page;

  	/* Hole page already exists? Return it...  */
  	if (!radix_tree_exceptional_entry(entry)) {
  		vmf->page = entry;
  		return VM_FAULT_LOCKED;
  	}

  	/* This will replace locked radix tree entry with a hole page */
  	page = find_or_create_page(mapping, vmf->pgoff,
  				   vmf->gfp_mask | __GFP_ZERO);
  	if (!page) {
  		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  		return VM_FAULT_OOM;
  	}
  	vmf->page = page;
  	return VM_FAULT_LOCKED;
  }
  static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
  		struct page *to, unsigned long vaddr)
  {
  	struct blk_dax_ctl dax = {
  		.sector = sector,
  		.size = size,
  	};
  	void *vto;
  	if (dax_map_atomic(bdev, &dax) < 0)
  		return PTR_ERR(dax.addr);
  	vto = kmap_atomic(to);
  	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
  	kunmap_atomic(vto);
  	dax_unmap_atomic(bdev, &dax);
  	return 0;
  }
  #define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))

  static void *dax_insert_mapping_entry(struct address_space *mapping,
  				      struct vm_fault *vmf,
  				      void *entry, sector_t sector)
  {
  	struct radix_tree_root *page_tree = &mapping->page_tree;
  	int error = 0;
  	bool hole_fill = false;
  	void *new_entry;
  	pgoff_t index = vmf->pgoff;

  	if (vmf->flags & FAULT_FLAG_WRITE)
  		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

  	/* Replacing hole page with block mapping? */
  	if (!radix_tree_exceptional_entry(entry)) {
  		hole_fill = true;
  		/*
  		 * Unmap the page now before we remove it from page cache below.
  		 * The page is locked so it cannot be faulted in again.
  		 */
  		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
  				    PAGE_SIZE, 0);
  		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
  		if (error)
  			return ERR_PTR(error);
  	}
  	spin_lock_irq(&mapping->tree_lock);
  	new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
  		       RADIX_DAX_ENTRY_LOCK);
  	if (hole_fill) {
  		__delete_from_page_cache(entry, NULL);
  		/* Drop pagecache reference */
  		put_page(entry);
  		error = radix_tree_insert(page_tree, index, new_entry);
  		if (error) {
  			new_entry = ERR_PTR(error);
  			goto unlock;
  		}
  		mapping->nrexceptional++;
  	} else {
  		void **slot;
  		void *ret;

  		ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
  		WARN_ON_ONCE(ret != entry);
  		radix_tree_replace_slot(slot, new_entry);
  	}
  	if (vmf->flags & FAULT_FLAG_WRITE)
  		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
   unlock:
  	spin_unlock_irq(&mapping->tree_lock);
  	if (hole_fill) {
  		radix_tree_preload_end();
  		/*
		 * We don't need the hole page anymore, it has been replaced
		 * with a locked radix tree entry now.
  		 */
  		if (mapping->a_ops->freepage)
  			mapping->a_ops->freepage(entry);
  		unlock_page(entry);
  		put_page(entry);
  	}
  	return new_entry;
  }
  
  static int dax_writeback_one(struct block_device *bdev,
  		struct address_space *mapping, pgoff_t index, void *entry)
  {
  	struct radix_tree_root *page_tree = &mapping->page_tree;
  	int type = RADIX_DAX_TYPE(entry);
  	struct radix_tree_node *node;
  	struct blk_dax_ctl dax;
  	void **slot;
  	int ret = 0;
  
  	spin_lock_irq(&mapping->tree_lock);
  	/*
  	 * Regular page slots are stabilized by the page lock even
  	 * without the tree itself locked.  These unlocked entries
  	 * need verification under the tree lock.
  	 */
  	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
  		goto unlock;
  	if (*slot != entry)
  		goto unlock;
  
  	/* another fsync thread may have already written back this entry */
  	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
  		goto unlock;
  
  	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
  		ret = -EIO;
  		goto unlock;
  	}
  
  	dax.sector = RADIX_DAX_SECTOR(entry);
  	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
  	spin_unlock_irq(&mapping->tree_lock);
  
  	/*
  	 * We cannot hold tree_lock while calling dax_map_atomic() because it
  	 * eventually calls cond_resched().
  	 */
  	ret = dax_map_atomic(bdev, &dax);
  	if (ret < 0)
  		return ret;
  
  	if (WARN_ON_ONCE(ret < dax.size)) {
  		ret = -EIO;
  		goto unmap;
  	}
  
  	wb_cache_pmem(dax.addr, dax.size);
  
  	spin_lock_irq(&mapping->tree_lock);
  	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
  	spin_unlock_irq(&mapping->tree_lock);
   unmap:
  	dax_unmap_atomic(bdev, &dax);
  	return ret;
  
   unlock:
  	spin_unlock_irq(&mapping->tree_lock);
  	return ret;
  }
  
  /*
   * Flush the mapping to the persistent domain within the byte range of [start,
   * end]. This is required by data integrity operations to ensure file data is
   * on persistent storage prior to completion of the operation.
   */
  int dax_writeback_mapping_range(struct address_space *mapping,
  		struct block_device *bdev, struct writeback_control *wbc)
  {
  	struct inode *inode = mapping->host;
  	pgoff_t start_index, end_index, pmd_index;
  	pgoff_t indices[PAGEVEC_SIZE];
  	struct pagevec pvec;
  	bool done = false;
  	int i, ret = 0;
  	void *entry;
  
  	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
  		return -EIO;
  	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
  		return 0;
  	start_index = wbc->range_start >> PAGE_SHIFT;
  	end_index = wbc->range_end >> PAGE_SHIFT;
  	pmd_index = DAX_PMD_INDEX(start_index);
  
  	rcu_read_lock();
  	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
  	rcu_read_unlock();
  
  	/* see if the start of our range is covered by a PMD entry */
  	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
  		start_index = pmd_index;
  
  	tag_pages_for_writeback(mapping, start_index, end_index);
  
  	pagevec_init(&pvec, 0);
  	while (!done) {
  		pvec.nr = find_get_entries_tag(mapping, start_index,
  				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
  				pvec.pages, indices);
  
  		if (pvec.nr == 0)
  			break;
  
  		for (i = 0; i < pvec.nr; i++) {
  			if (indices[i] > end_index) {
  				done = true;
  				break;
  			}
  
  			ret = dax_writeback_one(bdev, mapping, indices[i],
  					pvec.pages[i]);
  			if (ret < 0)
  				return ret;
  		}
  	}
  	return 0;
  }
  EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
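
/*
 * Illustrative sketch of a caller (not part of this file; the function name
 * is an assumption): a filesystem's ->writepages method can flush dirty DAX
 * mappings to the persistent domain like this:
 *
 *	static int example_dax_writepages(struct address_space *mapping,
 *					  struct writeback_control *wbc)
 *	{
 *		return dax_writeback_mapping_range(mapping,
 *				mapping->host->i_sb->s_bdev, wbc);
 *	}
 */
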
  static int dax_insert_mapping(struct address_space *mapping,
  		struct block_device *bdev, sector_t sector, size_t size,
  		void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
  {
  	unsigned long vaddr = (unsigned long)vmf->virtual_address;
  	struct blk_dax_ctl dax = {
  		.sector = sector,
  		.size = size,
  	};
  	void *ret;
  	void *entry = *entryp;

  	if (dax_map_atomic(bdev, &dax) < 0)
  		return PTR_ERR(dax.addr);
  	dax_unmap_atomic(bdev, &dax);

  	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
  	if (IS_ERR(ret))
  		return PTR_ERR(ret);
  	*entryp = ret;

  	return vm_insert_mixed(vma, vaddr, dax.pfn);
  }
  /**
   * dax_fault - handle a page fault on a DAX file
   * @vma: The virtual memory area where the fault occurred
   * @vmf: The description of the fault
   * @get_block: The filesystem method used to translate file offsets to blocks
   *
   * When a page fault occurs, filesystems may call this helper in their
   * fault handler for DAX files. dax_fault() assumes the caller has done all
   * the necessary locking for the page fault to proceed successfully.
   */
  int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
  			get_block_t get_block)
  {
  	struct file *file = vma->vm_file;
  	struct address_space *mapping = file->f_mapping;
  	struct inode *inode = mapping->host;
  	void *entry;
  	struct buffer_head bh;
  	unsigned long vaddr = (unsigned long)vmf->virtual_address;
  	unsigned blkbits = inode->i_blkbits;
  	sector_t block;
  	pgoff_t size;
  	int error;
  	int major = 0;
  	/*
  	 * Check whether offset isn't beyond end of file now. Caller is supposed
  	 * to hold locks serializing us with truncate / punch hole so this is
  	 * a reliable test.
  	 */
  	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
  	if (vmf->pgoff >= size)
  		return VM_FAULT_SIGBUS;
  
  	memset(&bh, 0, sizeof(bh));
  	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
  	bh.b_bdev = inode->i_sb->s_bdev;
  	bh.b_size = PAGE_SIZE;
  	entry = grab_mapping_entry(mapping, vmf->pgoff);
  	if (IS_ERR(entry)) {
  		error = PTR_ERR(entry);
  		goto out;
  	}
  
  	error = get_block(inode, block, &bh, 0);
  	if (!error && (bh.b_size < PAGE_SIZE))
  		error = -EIO;		/* fs corruption? */
  	if (error)
  		goto unlock_entry;
  
  	if (vmf->cow_page) {
  		struct page *new_page = vmf->cow_page;
  		if (buffer_written(&bh))
  			error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
  					bh.b_size, new_page, vaddr);
  		else
  			clear_user_highpage(new_page, vaddr);
  		if (error)
  			goto unlock_entry;
  		if (!radix_tree_exceptional_entry(entry)) {
  			vmf->page = entry;
  			return VM_FAULT_LOCKED;
  		}
  		vmf->entry = entry;
  		return VM_FAULT_DAX_LOCKED;
  	}

  	if (!buffer_mapped(&bh)) {
  		if (vmf->flags & FAULT_FLAG_WRITE) {
  			error = get_block(inode, block, &bh, 1);
  			count_vm_event(PGMAJFAULT);
  			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
  			major = VM_FAULT_MAJOR;
  			if (!error && (bh.b_size < PAGE_SIZE))
  				error = -EIO;
  			if (error)
  				goto unlock_entry;
  		} else {
  			return dax_load_hole(mapping, entry, vmf);
  		}
  	}
  	/* Filesystem should not return unwritten buffers to us! */
  	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
  	error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
  			bh.b_size, &entry, vma, vmf);
   unlock_entry:
  	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
   out:
  	if (error == -ENOMEM)
  		return VM_FAULT_OOM | major;
  	/* -EBUSY is fine, somebody else faulted on the same PTE */
  	if ((error < 0) && (error != -EBUSY))
  		return VM_FAULT_SIGBUS | major;
  	return VM_FAULT_NOPAGE | major;
  }
  EXPORT_SYMBOL_GPL(dax_fault);
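
/*
 * Illustrative sketch of a caller (not part of this file; the example_*
 * names are assumptions): a filesystem's vm_operations_struct ->fault
 * handler typically takes a lock that serializes against truncate and then
 * calls dax_fault() with its own get_block callback:
 *
 *	static int example_dax_vm_fault(struct vm_area_struct *vma,
 *					struct vm_fault *vmf)
 *	{
 *		struct inode *inode = file_inode(vma->vm_file);
 *		int ret;
 *
 *		if (vmf->flags & FAULT_FLAG_WRITE)
 *			file_update_time(vma->vm_file);
 *		example_lock_against_truncate(inode);
 *		ret = dax_fault(vma, vmf, example_get_block);
 *		example_unlock_against_truncate(inode);
 *
 *		return ret;
 *	}
 */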

  #if defined(CONFIG_TRANSPARENT_HUGEPAGE)
  /*
   * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
   * more often than one might expect in the below function.
   */
  #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
  static void __dax_dbg(struct buffer_head *bh, unsigned long address,
  		const char *reason, const char *fn)
  {
  	if (bh) {
  		char bname[BDEVNAME_SIZE];
  		bdevname(bh->b_bdev, bname);
  		pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
  			"length %zd fallback: %s
  ", fn, current->comm,
  			address, bname, bh->b_state, (u64)bh->b_blocknr,
  			bh->b_size, reason);
  	} else {
  		pr_debug("%s: %s addr: %lx fallback: %s
  ", fn,
  			current->comm, address, reason);
  	}
  }
  
  #define dax_pmd_dbg(bh, address, reason)	__dax_dbg(bh, address, reason, "dax_pmd")
  /**
   * dax_pmd_fault - handle a PMD fault on a DAX file
   * @vma: The virtual memory area where the fault occurred
 * @address: The faulting virtual address
 * @pmd: Pointer to the PMD entry in the page tables for @address
 * @flags: Fault flags (FAULT_FLAG_*) describing the fault
   * @get_block: The filesystem method used to translate file offsets to blocks
   *
   * When a page fault occurs, filesystems may call this helper in their
   * pmd_fault handler for DAX files.
   */
  int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
  		pmd_t *pmd, unsigned int flags, get_block_t get_block)
  {
  	struct file *file = vma->vm_file;
  	struct address_space *mapping = file->f_mapping;
  	struct inode *inode = mapping->host;
  	struct buffer_head bh;
  	unsigned blkbits = inode->i_blkbits;
  	unsigned long pmd_addr = address & PMD_MASK;
  	bool write = flags & FAULT_FLAG_WRITE;
  	struct block_device *bdev;
  	pgoff_t size, pgoff;
  	sector_t block;
  	int result = 0;
  	bool alloc = false;

  	/* dax pmd mappings require pfn_t_devmap() */
  	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
  		return VM_FAULT_FALLBACK;
  	/* Fall back to PTEs if we're going to COW */
  	if (write && !(vma->vm_flags & VM_SHARED)) {
  		split_huge_pmd(vma, pmd, address);
  		dax_pmd_dbg(NULL, address, "cow write");
  		return VM_FAULT_FALLBACK;
  	}
  	/* If the PMD would extend outside the VMA */
  	if (pmd_addr < vma->vm_start) {
  		dax_pmd_dbg(NULL, address, "vma start unaligned");
  		return VM_FAULT_FALLBACK;
  	}
  	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
  		dax_pmd_dbg(NULL, address, "vma end unaligned");
  		return VM_FAULT_FALLBACK;
  	}

  	pgoff = linear_page_index(vma, pmd_addr);
  	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
  	if (pgoff >= size)
  		return VM_FAULT_SIGBUS;
  	/* If the PMD would cover blocks out of the file */
  	if ((pgoff | PG_PMD_COLOUR) >= size) {
  		dax_pmd_dbg(NULL, address,
  				"offset + huge page size > file size");
  		return VM_FAULT_FALLBACK;
  	}
  
  	memset(&bh, 0, sizeof(bh));
  	bh.b_bdev = inode->i_sb->s_bdev;
  	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
  
  	bh.b_size = PMD_SIZE;
  
  	if (get_block(inode, block, &bh, 0) != 0)
  		return VM_FAULT_SIGBUS;
  
  	if (!buffer_mapped(&bh) && write) {
  		if (get_block(inode, block, &bh, 1) != 0)
  			return VM_FAULT_SIGBUS;
  		alloc = true;
  		WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
  	}
  	bdev = bh.b_bdev;
  
  	/*
  	 * If the filesystem isn't willing to tell us the length of a hole,
  	 * just fall back to PTEs.  Calling get_block 512 times in a loop
  	 * would be silly.
  	 */
  	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
  		dax_pmd_dbg(&bh, address, "allocated block too small");
  		return VM_FAULT_FALLBACK;
  	}
  
  	/*
  	 * If we allocated new storage, make sure no process has any
  	 * zero pages covering this hole
  	 */
  	if (alloc) {
  		loff_t lstart = pgoff << PAGE_SHIFT;
  		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
  
  		truncate_pagecache_range(inode, lstart, lend);
  	}

  	if (!write && !buffer_mapped(&bh)) {
  		spinlock_t *ptl;
  		pmd_t entry;
  		struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm);

  		if (unlikely(!zero_page)) {
  			dax_pmd_dbg(&bh, address, "no zero page");
  			goto fallback;
  		}

  		ptl = pmd_lock(vma->vm_mm, pmd);
  		if (!pmd_none(*pmd)) {
  			spin_unlock(ptl);
  			dax_pmd_dbg(&bh, address, "pmd already present");
  			goto fallback;
  		}
  		dev_dbg(part_to_dev(bdev->bd_part),
  				"%s: %s addr: %lx pfn: <zero> sect: %llx
  ",
  				__func__, current->comm, address,
  				(unsigned long long) to_sector(&bh, inode));
  		entry = mk_pmd(zero_page, vma->vm_page_prot);
  		entry = pmd_mkhuge(entry);
  		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
  		result = VM_FAULT_NOPAGE;
  		spin_unlock(ptl);
  	} else {
  		struct blk_dax_ctl dax = {
  			.sector = to_sector(&bh, inode),
  			.size = PMD_SIZE,
  		};
  		long length = dax_map_atomic(bdev, &dax);
  		if (length < 0) {
  			dax_pmd_dbg(&bh, address, "dax-error fallback");
  			goto fallback;
  		}
  		if (length < PMD_SIZE) {
  			dax_pmd_dbg(&bh, address, "dax-length too small");
  			dax_unmap_atomic(bdev, &dax);
  			goto fallback;
  		}
  		if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
  			dax_pmd_dbg(&bh, address, "pfn unaligned");
  			dax_unmap_atomic(bdev, &dax);
  			goto fallback;
  		}

  		if (!pfn_t_devmap(dax.pfn)) {
  			dax_unmap_atomic(bdev, &dax);
  			dax_pmd_dbg(&bh, address, "pfn not in memmap");
  			goto fallback;
  		}
  		dax_unmap_atomic(bdev, &dax);

  		/*
  		 * For PTE faults we insert a radix tree entry for reads, and
  		 * leave it clean.  Then on the first write we dirty the radix
  		 * tree entry via the dax_pfn_mkwrite() path.  This sequence
  		 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
  		 * call into get_block() to translate the pgoff to a sector in
  		 * order to be able to create a new radix tree entry.
  		 *
  		 * The PMD path doesn't have an equivalent to
  		 * dax_pfn_mkwrite(), though, so for a read followed by a
  		 * write we traverse all the way through dax_pmd_fault()
  		 * twice.  This means we can just skip inserting a radix tree
  		 * entry completely on the initial read and just wait until
  		 * the write to insert a dirty entry.
  		 */
  		if (write) {
  			/*
  			 * We should insert radix-tree entry and dirty it here.
  			 * For now this is broken...
  			 */
  		}
  		dev_dbg(part_to_dev(bdev->bd_part),
  				"%s: %s addr: %lx pfn: %lx sect: %llx
  ",
  				__func__, current->comm, address,
  				pfn_t_to_pfn(dax.pfn),
  				(unsigned long long) dax.sector);
  		result |= vmf_insert_pfn_pmd(vma, address, pmd,
  				dax.pfn, write);
  	}
  
   out:
  	return result;
  
   fallback:
  	count_vm_event(THP_FAULT_FALLBACK);
  	result = VM_FAULT_FALLBACK;
  	goto out;
  }
  EXPORT_SYMBOL_GPL(dax_pmd_fault);
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */

  /**
   * dax_pfn_mkwrite - handle first write to DAX page
   * @vma: The virtual memory area where the fault occurred
   * @vmf: The description of the fault
   */
  int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
  {
  	struct file *file = vma->vm_file;
  	struct address_space *mapping = file->f_mapping;
  	void *entry;
  	pgoff_t index = vmf->pgoff;

  	spin_lock_irq(&mapping->tree_lock);
  	entry = get_unlocked_mapping_entry(mapping, index, NULL);
  	if (!entry || !radix_tree_exceptional_entry(entry))
  		goto out;
  	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
  	put_unlocked_mapping_entry(mapping, index, entry);
  out:
  	spin_unlock_irq(&mapping->tree_lock);
  	return VM_FAULT_NOPAGE;
  }
  EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
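
/*
 * Illustrative sketch (not part of this file; handler names are assumptions):
 * dax_fault() and dax_pfn_mkwrite() are normally wired into a DAX file's
 * vm_operations_struct together, so that the first write to a previously
 * read-mapped page dirties the radix tree entry via dax_pfn_mkwrite():
 *
 *	static const struct vm_operations_struct example_dax_vm_ops = {
 *		.fault		= example_dax_vm_fault,
 *		.page_mkwrite	= example_dax_vm_fault,
 *		.pfn_mkwrite	= example_dax_pfn_mkwrite,
 *	};
 */
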
  static bool dax_range_is_aligned(struct block_device *bdev,
  				 unsigned int offset, unsigned int length)
  {
  	unsigned short sector_size = bdev_logical_block_size(bdev);
  
  	if (!IS_ALIGNED(offset, sector_size))
  		return false;
  	if (!IS_ALIGNED(length, sector_size))
  		return false;
  
  	return true;
  }
  int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
  		unsigned int offset, unsigned int length)
  {
  	struct blk_dax_ctl dax = {
  		.sector		= sector,
  		.size		= PAGE_SIZE,
  	};
  	if (dax_range_is_aligned(bdev, offset, length)) {
  		sector_t start_sector = dax.sector + (offset >> 9);
  
  		return blkdev_issue_zeroout(bdev, start_sector,
  				length >> 9, GFP_NOFS, true);
  	} else {
  		if (dax_map_atomic(bdev, &dax) < 0)
  			return PTR_ERR(dax.addr);
  		clear_pmem(dax.addr + offset, length);
  		dax_unmap_atomic(bdev, &dax);
  	}
  	return 0;
  }
  EXPORT_SYMBOL_GPL(__dax_zero_page_range);
  /**
   * dax_zero_page_range - zero a range within a page of a DAX file
   * @inode: The file being truncated
   * @from: The file offset that is being truncated to
   * @length: The number of bytes to zero
   * @get_block: The filesystem method used to translate file offsets to blocks
   *
   * This function can be called by a filesystem when it is zeroing part of a
   * page in a DAX file.  This is intended for hole-punch operations.  If
   * you are truncating a file, the helper function dax_truncate_page() may be
   * more convenient.
   */
  int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
  							get_block_t get_block)
  {
  	struct buffer_head bh;
  	pgoff_t index = from >> PAGE_SHIFT;
  	unsigned offset = from & (PAGE_SIZE-1);
  	int err;
  
  	/* Block boundary? Nothing to do */
  	if (!length)
  		return 0;
  	BUG_ON((offset + length) > PAGE_SIZE);
  
  	memset(&bh, 0, sizeof(bh));
  	bh.b_bdev = inode->i_sb->s_bdev;
  	bh.b_size = PAGE_SIZE;
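  	/*
  	 * Look up the existing block mapping for this page (create == 0).
  	 * If the block was never written there is nothing to zero.
  	 */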
  	err = get_block(inode, index, &bh, 0);
  	if (err < 0 || !buffer_written(&bh))
  		return err;

  	return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
  			offset, length);
  }
  EXPORT_SYMBOL_GPL(dax_zero_page_range);
  
  /**
   * dax_truncate_page - handle a partial page being truncated in a DAX file
   * @inode: The file being truncated
   * @from: The file offset that is being truncated to
   * @get_block: The filesystem method used to translate file offsets to blocks
   *
   * Similar to block_truncate_page(), this function can be called by a
   * filesystem when it is truncating a DAX file to handle the partial page.
   */
  int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
  {
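  	/*
  	 * Zero from the new EOF to the end of its page.  If 'from' is
  	 * already page aligned the length is zero and dax_zero_page_range()
  	 * returns immediately.
  	 */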
  	unsigned length = PAGE_ALIGN(from) - from;
  	return dax_zero_page_range(inode, from, length, get_block);
  }
  EXPORT_SYMBOL_GPL(dax_truncate_page);
  
  #ifdef CONFIG_FS_IOMAP
  static loff_t
  iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
  		struct iomap *iomap)
  {
  	struct iov_iter *iter = data;
  	loff_t end = pos + length, done = 0;
  	ssize_t ret = 0;
  
  	if (iov_iter_rw(iter) == READ) {
  		end = min(end, i_size_read(inode));
  		if (pos >= end)
  			return 0;
  
  		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
  			return iov_iter_zero(min(length, end - pos), iter);
  	}
  
  	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
  		return -EIO;
  
  	while (pos < end) {
  		unsigned offset = pos & (PAGE_SIZE - 1);
  		struct blk_dax_ctl dax = { 0 };
  		ssize_t map_len;
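
  		/*
  		 * Copying a large request can take a while; give up early if
  		 * the task has a fatal signal pending so it can exit promptly.
  		 */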
  		if (fatal_signal_pending(current)) {
  			ret = -EINTR;
  			break;
  		}
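
  		/*
  		 * iomap->blkno is the first 512-byte sector of this extent.
  		 * Add the byte offset of the current page within the extent,
  		 * converted to sectors, and map the rest of the request
  		 * rounded up to a page boundary.
  		 */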
  		dax.sector = iomap->blkno +
  			(((pos & PAGE_MASK) - iomap->offset) >> 9);
  		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
  		map_len = dax_map_atomic(iomap->bdev, &dax);
  		if (map_len < 0) {
  			ret = map_len;
  			break;
  		}
  
  		dax.addr += offset;
  		map_len -= offset;
  		if (map_len > end - pos)
  			map_len = end - pos;
  
  		if (iov_iter_rw(iter) == WRITE)
  			map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
  		else
  			map_len = copy_to_iter(dax.addr, map_len, iter);
  		dax_unmap_atomic(iomap->bdev, &dax);
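  		/*
  		 * A zero-length copy means the user buffer faulted; report
  		 * -EFAULT unless an earlier iteration already made progress,
  		 * in which case the partial count is returned below.
  		 */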
  		if (map_len <= 0) {
  			ret = map_len ? map_len : -EFAULT;
  			break;
  		}
  
  		pos += map_len;
  		length -= map_len;
  		done += map_len;
  	}
  
  	return done ? done : ret;
  }
  
  /**
   * iomap_dax_rw - Perform I/O to a DAX file
   * @iocb:	The control block for this I/O
   * @iter:	The addresses to do I/O from or to
   * @ops:	iomap ops passed from the file system
   *
   * This function performs read and write operations to directly mapped
   * persistent memory.  The caller needs to take care of read/write exclusion
   * and evicting any page cache pages in the region under I/O.
   */
  ssize_t
  iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
  		struct iomap_ops *ops)
  {
  	struct address_space *mapping = iocb->ki_filp->f_mapping;
  	struct inode *inode = mapping->host;
  	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
  	unsigned flags = 0;
  
  	if (iov_iter_rw(iter) == WRITE)
  		flags |= IOMAP_WRITE;
  
  	/*
  	 * Yes, even DAX files can have page cache attached to them:  A zeroed
  	 * page is inserted into the pagecache when we have to serve a write
  	 * fault on a hole.  It should never be dirtied and can simply be
  	 * dropped from the pagecache once we get real data for the page.
  	 *
  	 * XXX: This is racy against mmap, and there's nothing we can do about
  	 * it. We'll eventually need to shift this down even further so that
  	 * we can check if we allocated blocks over a hole first.
  	 */
  	if (mapping->nrpages) {
  		ret = invalidate_inode_pages2_range(mapping,
  				pos >> PAGE_SHIFT,
  				(pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
  		WARN_ON_ONCE(ret);
  	}
  
  	while (iov_iter_count(iter)) {
  		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
  				iter, iomap_dax_actor);
  		if (ret <= 0)
  			break;
  		pos += ret;
  		done += ret;
  	}
  
  	iocb->ki_pos += done;
  	return done ? done : ret;
  }
  EXPORT_SYMBOL_GPL(iomap_dax_rw);
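
  /*
   * Example (a minimal sketch, not taken from any particular filesystem; the
   * name "fs_iomap_ops" is only a placeholder): a DAX-aware ->read_iter or
   * ->write_iter implementation provides its own read/write exclusion and
   * then hands the request to iomap_dax_rw(), roughly:
   *
   *	inode_lock_shared(inode);		(inode_lock() for writes)
   *	ret = iomap_dax_rw(iocb, iter, &fs_iomap_ops);
   *	inode_unlock_shared(inode);
   */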
  
  /**
   * iomap_dax_fault - handle a page fault on a DAX file
   * @vma: The virtual memory area where the fault occurred
   * @vmf: The description of the fault
   * @ops: iomap ops passed from the file system
   *
   * When a page fault occurs, filesystems may call this helper in their fault
   * or mkwrite handler for DAX files. Assumes the caller has done all the
   * necessary locking for the page fault to proceed successfully.
   */
  int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
  			struct iomap_ops *ops)
  {
  	struct address_space *mapping = vma->vm_file->f_mapping;
  	struct inode *inode = mapping->host;
  	unsigned long vaddr = (unsigned long)vmf->virtual_address;
  	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
  	sector_t sector;
  	struct iomap iomap = { 0 };
  	unsigned flags = 0;
  	int error, major = 0;
  	void *entry;
  
  	/*
  	 * Check whether the offset is beyond the end of the file now.  The
  	 * caller is supposed to hold locks serializing us with truncate /
  	 * punch hole, so this is a reliable test.
  	 */
  	if (pos >= i_size_read(inode))
  		return VM_FAULT_SIGBUS;
  
  	entry = grab_mapping_entry(mapping, vmf->pgoff);
  	if (IS_ERR(entry)) {
  		error = PTR_ERR(entry);
  		goto out;
  	}
  
  	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
  		flags |= IOMAP_WRITE;
  
  	/*
  	 * Note that we don't bother to use iomap_apply here: DAX requires
  	 * the file system block size to be equal to the page size, which means
  	 * that we never have to deal with more than a single extent here.
  	 */
  	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
  	if (error)
  		goto unlock_entry;
  	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
  		error = -EIO;		/* fs corruption? */
  		goto unlock_entry;
  	}
  
  	sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9);
  
  	if (vmf->cow_page) {
  		switch (iomap.type) {
  		case IOMAP_HOLE:
  		case IOMAP_UNWRITTEN:
  			clear_user_highpage(vmf->cow_page, vaddr);
  			break;
  		case IOMAP_MAPPED:
  			error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
  					vmf->cow_page, vaddr);
  			break;
  		default:
  			WARN_ON_ONCE(1);
  			error = -EIO;
  			break;
  		}
  
  		if (error)
  			goto unlock_entry;
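  		/*
  		 * If a zero page was previously inserted for a read over this
  		 * hole, hand it back locked as an ordinary page; otherwise
  		 * tell the core fault code that the DAX radix tree entry is
  		 * held locked so it can unlock it once the copy-on-write
  		 * completes.
  		 */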
  		if (!radix_tree_exceptional_entry(entry)) {
  			vmf->page = entry;
  			return VM_FAULT_LOCKED;
  		}
  		vmf->entry = entry;
  		return VM_FAULT_DAX_LOCKED;
  	}
  
  	switch (iomap.type) {
  	case IOMAP_MAPPED:
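  		/*
  		 * IOMAP_F_NEW means the filesystem had to allocate blocks to
  		 * satisfy this fault, so account it as a major fault.
  		 */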
  		if (iomap.flags & IOMAP_F_NEW) {
  			count_vm_event(PGMAJFAULT);
  			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
  			major = VM_FAULT_MAJOR;
  		}
  		error = dax_insert_mapping(mapping, iomap.bdev, sector,
  				PAGE_SIZE, &entry, vma, vmf);
  		break;
  	case IOMAP_UNWRITTEN:
  	case IOMAP_HOLE:
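  		/*
  		 * A read fault over a hole or unwritten extent is served from
  		 * the zero page.  A write fault should have had blocks
  		 * allocated by ->iomap_begin(), so falling through to the
  		 * default case treats it as a filesystem error.
  		 */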
  		if (!(vmf->flags & FAULT_FLAG_WRITE))
  			return dax_load_hole(mapping, entry, vmf);
  		/*FALLTHRU*/
  	default:
  		WARN_ON_ONCE(1);
  		error = -EIO;
  		break;
  	}
  
   unlock_entry:
  	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
   out:
  	if (error == -ENOMEM)
  		return VM_FAULT_OOM | major;
  	/* -EBUSY is fine, somebody else faulted on the same PTE */
  	if (error < 0 && error != -EBUSY)
  		return VM_FAULT_SIGBUS | major;
  	return VM_FAULT_NOPAGE | major;
  }
  EXPORT_SYMBOL_GPL(iomap_dax_fault);
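
  /*
   * Example (a minimal sketch; "fs_mmap_sem" stands in for whatever lock the
   * filesystem uses to serialise faults against truncate): a ->fault handler
   * for a DAX file typically wraps this helper like so:
   *
   *	if (vmf->flags & FAULT_FLAG_WRITE) {
   *		sb_start_pagefault(inode->i_sb);
   *		file_update_time(vma->vm_file);
   *	}
   *	down_read(&fs_mmap_sem);
   *	ret = iomap_dax_fault(vma, vmf, &fs_iomap_ops);
   *	up_read(&fs_mmap_sem);
   *	if (vmf->flags & FAULT_FLAG_WRITE)
   *		sb_end_pagefault(inode->i_sb);
   */
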
  #endif /* CONFIG_FS_IOMAP */