fs/iomap.c

  /*
   * Copyright (C) 2010 Red Hat, Inc.
   * Copyright (c) 2016 Christoph Hellwig.
   *
   * This program is free software; you can redistribute it and/or modify it
   * under the terms and conditions of the GNU General Public License,
   * version 2, as published by the Free Software Foundation.
   *
   * This program is distributed in the hope it will be useful, but WITHOUT
   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   * more details.
   */
  #include <linux/module.h>
  #include <linux/compiler.h>
  #include <linux/fs.h>
  #include <linux/iomap.h>
  #include <linux/uaccess.h>
  #include <linux/gfp.h>
  #include <linux/mm.h>
  #include <linux/swap.h>
  #include <linux/pagemap.h>
  #include <linux/pagevec.h>
  #include <linux/file.h>
  #include <linux/uio.h>
  #include <linux/backing-dev.h>
  #include <linux/buffer_head.h>
  #include <linux/task_io_accounting_ops.h>
  #include <linux/dax.h>
  #include <linux/sched/signal.h>
  #include <linux/swap.h>

  #include "internal.h"
  /*
   * Execute an iomap write on a segment of the mapping that spans a
   * contiguous range of pages that have identical block mapping state.
   *
   * This avoids the need to map pages individually, do individual allocations
   * for each page and, most importantly, avoids the need for filesystem specific
   * locking per page. Instead, all the operations are amortised over the entire
   * range of pages. It is assumed that the filesystems will lock whatever
   * resources they require in the iomap_begin call, and release them in the
   * iomap_end call.
   */
  loff_t
  iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
  		const struct iomap_ops *ops, void *data, iomap_actor_t actor)
  {
  	struct iomap iomap = { 0 };
  	loff_t written = 0, ret;
  
  	/*
  	 * Need to map a range from start position for length bytes. This can
  	 * span multiple pages - it is only guaranteed to return a range of a
  	 * single type of pages (e.g. all into a hole, all mapped or all
  	 * unwritten). Failure at this point has nothing to undo.
  	 *
  	 * If allocation is required for this range, reserve the space now so
  	 * that the allocation is guaranteed to succeed later on. Once we copy
  	 * the data into the page cache pages, then we cannot fail otherwise we
  	 * expose transient stale data. If the reserve fails, we can safely
  	 * back out at this point as there is nothing to undo.
  	 */
  	ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
  	if (ret)
  		return ret;
  	if (WARN_ON(iomap.offset > pos))
  		return -EIO;
  	if (WARN_ON(iomap.length == 0))
  		return -EIO;
  
  	/*
  	 * Cut down the length to the one actually provided by the filesystem,
  	 * as it might not be able to give us the whole size that we requested.
  	 */
  	if (iomap.offset + iomap.length < pos + length)
  		length = iomap.offset + iomap.length - pos;
  
  	/*
  	 * Now that we have guaranteed that the space allocation will succeed,
  	 * we can do the copy-in page by page without having to worry about
  	 * failures exposing transient data.
  	 */
  	written = actor(inode, pos, length, data, &iomap);
  
  	/*
  	 * Now the data has been copied, commit the range we've copied.  This
  	 * should not fail unless the filesystem has had a fatal error.
  	 */
  	if (ops->iomap_end) {
  		ret = ops->iomap_end(inode, pos, length,
  				     written > 0 ? written : 0,
  				     flags, &iomap);
  	}
  
  	return written ? written : ret;
  }
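
  /*
   * Translate a byte position in the file into a 512-byte sector number on
   * the backing device, based on the block mapping described by @iomap.
   */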
  static sector_t
  iomap_sector(struct iomap *iomap, loff_t pos)
  {
  	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
  }
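
  /*
   * A write failed: remove any page cache pages that were instantiated
   * beyond the old EOF for this write, so no stale pages are left behind.
   */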
  static void
  iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
  {
  	loff_t i_size = i_size_read(inode);
  
  	/*
  	 * Only truncate newly allocated pages beyond EOF, even if the
  	 * write started inside the existing inode size.
  	 */
  	if (pos + len > i_size)
  		truncate_pagecache_range(inode, max(pos, i_size), pos + len);
  }
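
  /*
   * Grab and lock the page cache page backing @pos and prepare it for a
   * write of @len bytes, reading in blocks the write only partially covers.
   */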
  
  static int
  iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
  		struct page **pagep, struct iomap *iomap)
  {
  	pgoff_t index = pos >> PAGE_SHIFT;
  	struct page *page;
  	int status = 0;
  
  	BUG_ON(pos + len > iomap->offset + iomap->length);
  	if (fatal_signal_pending(current))
  		return -EINTR;
  	page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
  	if (!page)
  		return -ENOMEM;
  
  	status = __block_write_begin_int(page, pos, len, NULL, iomap);
  	if (unlikely(status)) {
  		unlock_page(page);
  		put_page(page);
  		page = NULL;
  
  		iomap_write_failed(inode, pos, len);
  	}
  
  	*pagep = page;
  	return status;
  }
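
  /*
   * Commit the copied data: generic_write_end() marks the page up to date
   * and dirty, and a short copy trims back the page cache beyond EOF again.
   */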
  
  static int
  iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
  		unsigned copied, struct page *page)
  {
  	int ret;
  
  	ret = generic_write_end(NULL, inode->i_mapping, pos, len,
  			copied, page, NULL);
  	if (ret < len)
  		iomap_write_failed(inode, pos, len);
  	return ret;
  }
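
  /*
   * Copy data from the iov_iter into the page cache, one page at a time,
   * within the single mapping the filesystem returned for this range.
   */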
  
  static loff_t
  iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
  		struct iomap *iomap)
  {
  	struct iov_iter *i = data;
  	long status = 0;
  	ssize_t written = 0;
  	unsigned int flags = AOP_FLAG_NOFS;
  	do {
  		struct page *page;
  		unsigned long offset;	/* Offset into pagecache page */
  		unsigned long bytes;	/* Bytes to write to page */
  		size_t copied;		/* Bytes copied from user */
  
  		offset = (pos & (PAGE_SIZE - 1));
  		bytes = min_t(unsigned long, PAGE_SIZE - offset,
  						iov_iter_count(i));
  again:
  		if (bytes > length)
  			bytes = length;
  
  		/*
  		 * Bring in the user page that we will copy from _first_.
  		 * Otherwise there's a nasty deadlock on copying from the
  		 * same page as we're writing to, without it being marked
  		 * up-to-date.
  		 *
  		 * Not only is this an optimisation, but it is also required
  		 * to check that the address is actually valid, when atomic
  		 * usercopies are used, below.
  		 */
  		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
  			status = -EFAULT;
  			break;
  		}
  
  		status = iomap_write_begin(inode, pos, bytes, flags, &page,
  				iomap);
  		if (unlikely(status))
  			break;
  
  		if (mapping_writably_mapped(inode->i_mapping))
  			flush_dcache_page(page);
  		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
  
  		flush_dcache_page(page);
  
  		status = iomap_write_end(inode, pos, bytes, copied, page);
  		if (unlikely(status < 0))
  			break;
  		copied = status;
  
  		cond_resched();
  
  		iov_iter_advance(i, copied);
  		if (unlikely(copied == 0)) {
  			/*
  			 * If we were unable to copy any data at all, we must
  			 * fall back to a single segment length write.
  			 *
  			 * If we didn't fall back here, we could livelock
  			 * because not all segments in the iov can be copied at
  			 * once without a pagefault.
  			 */
  			bytes = min_t(unsigned long, PAGE_SIZE - offset,
  						iov_iter_single_seg_count(i));
  			goto again;
  		}
  		pos += copied;
  		written += copied;
  		length -= copied;
  
  		balance_dirty_pages_ratelimited(inode->i_mapping);
  	} while (iov_iter_count(i) && length);
  
  	return written ? written : status;
  }
  
  ssize_t
  iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
  		const struct iomap_ops *ops)
  {
  	struct inode *inode = iocb->ki_filp->f_mapping->host;
  	loff_t pos = iocb->ki_pos, ret = 0, written = 0;
  
  	while (iov_iter_count(iter)) {
  		ret = iomap_apply(inode, pos, iov_iter_count(iter),
  				IOMAP_WRITE, ops, iter, iomap_write_actor);
  		if (ret <= 0)
  			break;
  		pos += ret;
  		written += ret;
  	}
  
  	return written ? written : ret;
  }
  EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
  static struct page *
  __iomap_read_page(struct inode *inode, loff_t offset)
  {
  	struct address_space *mapping = inode->i_mapping;
  	struct page *page;
  
  	page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL);
  	if (IS_ERR(page))
  		return page;
  	if (!PageUptodate(page)) {
  		put_page(page);
  		return ERR_PTR(-EIO);
  	}
  	return page;
  }
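
  /*
   * Re-dirty a range of the file: read each page in through the page cache,
   * then run it through write_begin/write_end so it is dirtied against the
   * current mapping.
   */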
  
  static loff_t
  iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
  		struct iomap *iomap)
  {
  	long status = 0;
  	ssize_t written = 0;
  
  	do {
  		struct page *page, *rpage;
  		unsigned long offset;	/* Offset into pagecache page */
  		unsigned long bytes;	/* Bytes to write to page */
  
  		offset = (pos & (PAGE_SIZE - 1));
  		bytes = min_t(loff_t, PAGE_SIZE - offset, length);
  
  		rpage = __iomap_read_page(inode, pos);
  		if (IS_ERR(rpage))
  			return PTR_ERR(rpage);
  
  		status = iomap_write_begin(inode, pos, bytes,
  					   AOP_FLAG_NOFS, &page, iomap);
  		put_page(rpage);
  		if (unlikely(status))
  			return status;
  
  		WARN_ON_ONCE(!PageUptodate(page));
  
  		status = iomap_write_end(inode, pos, bytes, bytes, page);
  		if (unlikely(status <= 0)) {
  			if (WARN_ON_ONCE(status == 0))
  				return -EIO;
  			return status;
  		}
  
  		cond_resched();
  
  		pos += status;
  		written += status;
  		length -= status;
  
  		balance_dirty_pages_ratelimited(inode->i_mapping);
  	} while (length);
  
  	return written;
  }
  
  int
  iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
  		const struct iomap_ops *ops)
  {
  	loff_t ret;
  
  	while (len) {
  		ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
  				iomap_dirty_actor);
  		if (ret <= 0)
  			return ret;
  		pos += ret;
  		len -= ret;
  	}
  
  	return 0;
  }
  EXPORT_SYMBOL_GPL(iomap_file_dirty);
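
  /*
   * Zero a sub-page range through the page cache, using the regular
   * write_begin/write_end sequence on the page that covers it.
   */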
  static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
  		unsigned bytes, struct iomap *iomap)
  {
  	struct page *page;
  	int status;
  	status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page,
  				   iomap);
  	if (status)
  		return status;
  
  	zero_user(page, offset, bytes);
  	mark_page_accessed(page);
  
  	return iomap_write_end(inode, pos, bytes, bytes, page);
  }
  static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
  		struct iomap *iomap)
  {
  	return __dax_zero_page_range(iomap->bdev, iomap->dax_dev,
  			iomap_sector(iomap, pos & PAGE_MASK), offset, bytes);
  }
  static loff_t
  iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
  		void *data, struct iomap *iomap)
  {
  	bool *did_zero = data;
  	loff_t written = 0;
  	int status;
  
  	/* already zeroed?  we're done. */
  	if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
  		return count;
  
  	do {
  		unsigned offset, bytes;
  
  		offset = pos & (PAGE_SIZE - 1); /* Within page */
  		bytes = min_t(loff_t, PAGE_SIZE - offset, count);

  		if (IS_DAX(inode))
  			status = iomap_dax_zero(pos, offset, bytes, iomap);
  		else
  			status = iomap_zero(inode, pos, offset, bytes, iomap);
  		if (status < 0)
  			return status;
  
  		pos += bytes;
  		count -= bytes;
  		written += bytes;
  		if (did_zero)
  			*did_zero = true;
  	} while (count > 0);
  
  	return written;
  }
  
  int
  iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
  		const struct iomap_ops *ops)
  {
  	loff_t ret;
  
  	while (len > 0) {
  		ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
  				ops, did_zero, iomap_zero_range_actor);
  		if (ret <= 0)
  			return ret;
  
  		pos += ret;
  		len -= ret;
  	}
  
  	return 0;
  }
  EXPORT_SYMBOL_GPL(iomap_zero_range);
  
  int
  iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
  		const struct iomap_ops *ops)
  {
  	unsigned int blocksize = i_blocksize(inode);
  	unsigned int off = pos & (blocksize - 1);
  
  	/* Block boundary? Nothing to do */
  	if (!off)
  		return 0;
  	return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
  }
  EXPORT_SYMBOL_GPL(iomap_truncate_page);
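
  /*
   * Prepare the blocks backing the faulted page against the mapping the
   * filesystem returned, then mark them dirty with block_commit_write().
   */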
  
  static loff_t
  iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
  		void *data, struct iomap *iomap)
  {
  	struct page *page = data;
  	int ret;
  	ret = __block_write_begin_int(page, pos, length, NULL, iomap);
  	if (ret)
  		return ret;
  
  	block_commit_write(page, 0, length);
  	return length;
  }
  int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
  {
  	struct page *page = vmf->page;
  	struct inode *inode = file_inode(vmf->vma->vm_file);
  	unsigned long length;
  	loff_t offset, size;
  	ssize_t ret;
  
  	lock_page(page);
  	size = i_size_read(inode);
  	if ((page->mapping != inode->i_mapping) ||
  	    (page_offset(page) > size)) {
  		/* We overload EFAULT to mean page got truncated */
  		ret = -EFAULT;
  		goto out_unlock;
  	}
  
  	/* page is wholly or partially inside EOF */
  	if (((page->index + 1) << PAGE_SHIFT) > size)
  		length = size & ~PAGE_MASK;
  	else
  		length = PAGE_SIZE;
  
  	offset = page_offset(page);
  	while (length > 0) {
  		ret = iomap_apply(inode, offset, length,
  				IOMAP_WRITE | IOMAP_FAULT, ops, page,
  				iomap_page_mkwrite_actor);
  		if (unlikely(ret <= 0))
  			goto out_unlock;
  		offset += ret;
  		length -= ret;
  	}
  
  	set_page_dirty(page);
  	wait_for_stable_page(page);
  	return VM_FAULT_LOCKED;
  out_unlock:
  	unlock_page(page);
  	return block_page_mkwrite_return(ret);
  }
  EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
  
  struct fiemap_ctx {
  	struct fiemap_extent_info *fi;
  	struct iomap prev;
  };
  
  static int iomap_to_fiemap(struct fiemap_extent_info *fi,
  		struct iomap *iomap, u32 flags)
  {
  	switch (iomap->type) {
  	case IOMAP_HOLE:
  		/* skip holes */
  		return 0;
  	case IOMAP_DELALLOC:
  		flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
  		break;
  	case IOMAP_MAPPED:
  		break;
  	case IOMAP_UNWRITTEN:
  		flags |= FIEMAP_EXTENT_UNWRITTEN;
  		break;
  	case IOMAP_INLINE:
  		flags |= FIEMAP_EXTENT_DATA_INLINE;
  		break;
  	}

  	if (iomap->flags & IOMAP_F_MERGED)
  		flags |= FIEMAP_EXTENT_MERGED;
  	if (iomap->flags & IOMAP_F_SHARED)
  		flags |= FIEMAP_EXTENT_SHARED;

  	return fiemap_fill_next_extent(fi, iomap->offset,
  			iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0,
  			iomap->length, flags);
  }
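
  /*
   * Report the previously accumulated extent to fiemap and remember the
   * current one; holes are skipped entirely.
   */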
  
  static loff_t
  iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
  		struct iomap *iomap)
  {
  	struct fiemap_ctx *ctx = data;
  	loff_t ret = length;
  
  	if (iomap->type == IOMAP_HOLE)
  		return length;
  
  	ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
  	ctx->prev = *iomap;
  	switch (ret) {
  	case 0:		/* success */
  		return length;
  	case 1:		/* extent array full */
  		return 0;
  	default:
  		return ret;
  	}
  }
  
  int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
  		loff_t start, loff_t len, const struct iomap_ops *ops)
  {
  	struct fiemap_ctx ctx;
  	loff_t ret;
  
  	memset(&ctx, 0, sizeof(ctx));
  	ctx.fi = fi;
  	ctx.prev.type = IOMAP_HOLE;
  
  	ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
  	if (ret)
  		return ret;
  	if (fi->fi_flags & FIEMAP_FLAG_SYNC) {
  		ret = filemap_write_and_wait(inode->i_mapping);
  		if (ret)
  			return ret;
  	}
  
  	while (len > 0) {
  		ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx,
  				iomap_fiemap_actor);
  		/* inode with no (attribute) mapping will give ENOENT */
  		if (ret == -ENOENT)
  			break;
  		if (ret < 0)
  			return ret;
  		if (ret == 0)
  			break;
  
  		start += ret;
  		len -= ret;
  	}
  
  	if (ctx.prev.type != IOMAP_HOLE) {
  		ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
  		if (ret < 0)
  			return ret;
  	}
  
  	return 0;
  }
  EXPORT_SYMBOL_GPL(iomap_fiemap);

  /*
   * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
   * Returns true if found and updates @lastoff to the offset in the file.
   */
  static bool
  page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff,
  		int whence)
  {
  	const struct address_space_operations *ops = inode->i_mapping->a_ops;
  	unsigned int bsize = i_blocksize(inode), off;
  	bool seek_data = whence == SEEK_DATA;
  	loff_t poff = page_offset(page);

  	if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE))
  		return false;

  	if (*lastoff < poff) {
  		/*
  		 * Last offset smaller than the start of the page means we found
  		 * a hole:
  		 */
  		if (whence == SEEK_HOLE)
  			return true;
  		*lastoff = poff;
  	}

  	/*
  	 * Just check the page unless we can and should check block ranges:
  	 */
  	if (bsize == PAGE_SIZE || !ops->is_partially_uptodate)
  		return PageUptodate(page) == seek_data;
  
  	lock_page(page);
  	if (unlikely(page->mapping != inode->i_mapping))
  		goto out_unlock_not_found;
  
  	for (off = 0; off < PAGE_SIZE; off += bsize) {
  		if ((*lastoff & ~PAGE_MASK) >= off + bsize)
  			continue;
  		if (ops->is_partially_uptodate(page, off, bsize) == seek_data) {
  			unlock_page(page);
  			return true;
  		}
  		*lastoff = poff + off + bsize;
  	}
  
  out_unlock_not_found:
  	unlock_page(page);
  	return false;
  }
  
  /*
   * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
   *
   * Within unwritten extents, the page cache determines which parts are holes
   * and which are data: uptodate buffer heads count as data; everything else
   * counts as a hole.
   *
   * Returns the resulting offset on success, and -ENOENT otherwise.
   */
  static loff_t
  page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
  		int whence)
  {
  	pgoff_t index = offset >> PAGE_SHIFT;
  	pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE);
  	loff_t lastoff = offset;
  	struct pagevec pvec;
  
  	if (length <= 0)
  		return -ENOENT;
  
  	pagevec_init(&pvec);
  
  	do {
  		unsigned nr_pages, i;
  
  		nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
  						end - 1);
  		if (nr_pages == 0)
  			break;
  
  		for (i = 0; i < nr_pages; i++) {
  			struct page *page = pvec.pages[i];
  			if (page_seek_hole_data(inode, page, &lastoff, whence))
  				goto check_range;
  			lastoff = page_offset(page) + PAGE_SIZE;
  		}
  		pagevec_release(&pvec);
  	} while (index < end);
  
  	/* When there is no page at lastoff and we are not done, we found a hole. */
  	if (whence != SEEK_HOLE)
  		goto not_found;
  
  check_range:
  	if (lastoff < offset + length)
  		goto out;
  not_found:
  	lastoff = -ENOENT;
  out:
  	pagevec_release(&pvec);
  	return lastoff;
  }
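
  /*
   * Record the offset and stop the iteration once a hole is found; an
   * unwritten extent only counts as a hole where the page cache holds no
   * data over it.
   */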
  static loff_t
  iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length,
  		      void *data, struct iomap *iomap)
  {
  	switch (iomap->type) {
  	case IOMAP_UNWRITTEN:
  		offset = page_cache_seek_hole_data(inode, offset, length,
  						   SEEK_HOLE);
  		if (offset < 0)
  			return length;
  		/* fall through */
  	case IOMAP_HOLE:
  		*(loff_t *)data = offset;
  		return 0;
  	default:
  		return length;
  	}
  }
  
  loff_t
  iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
  {
  	loff_t size = i_size_read(inode);
  	loff_t length = size - offset;
  	loff_t ret;
  	/* Nothing to be found before or beyond the end of the file. */
  	if (offset < 0 || offset >= size)
  		return -ENXIO;
  
  	while (length > 0) {
  		ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
  				  &offset, iomap_seek_hole_actor);
  		if (ret < 0)
  			return ret;
  		if (ret == 0)
  			break;
  
  		offset += ret;
  		length -= ret;
  	}
  
  	return offset;
  }
  EXPORT_SYMBOL_GPL(iomap_seek_hole);
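
  /*
   * Counterpart to iomap_seek_hole_actor(): holes are skipped, mapped
   * extents are data, and unwritten extents count as data only where the
   * page cache holds something for them.
   */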
  
  static loff_t
  iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length,
  		      void *data, struct iomap *iomap)
  {
  	switch (iomap->type) {
  	case IOMAP_HOLE:
  		return length;
  	case IOMAP_UNWRITTEN:
  		offset = page_cache_seek_hole_data(inode, offset, length,
  						   SEEK_DATA);
  		if (offset < 0)
  			return length;
  		/*FALLTHRU*/
  	default:
  		*(loff_t *)data = offset;
  		return 0;
  	}
  }
  
  loff_t
  iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
  {
  	loff_t size = i_size_read(inode);
  	loff_t length = size - offset;
  	loff_t ret;
  	/* Nothing to be found before or beyond the end of the file. */
  	if (offset < 0 || offset >= size)
  		return -ENXIO;
  
  	while (length > 0) {
  		ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
  				  &offset, iomap_seek_data_actor);
  		if (ret < 0)
  			return ret;
  		if (ret == 0)
  			break;
  
  		offset += ret;
  		length -= ret;
  	}
  
  	if (length <= 0)
  		return -ENXIO;
  	return offset;
  }
  EXPORT_SYMBOL_GPL(iomap_seek_data);
  /*
   * Private flags for iomap_dio, must not overlap with the public ones in
   * iomap.h:
   */
  #define IOMAP_DIO_WRITE_FUA	(1 << 28)
  #define IOMAP_DIO_NEED_SYNC	(1 << 29)
  #define IOMAP_DIO_WRITE		(1 << 30)
  #define IOMAP_DIO_DIRTY		(1 << 31)
  
  struct iomap_dio {
  	struct kiocb		*iocb;
  	iomap_dio_end_io_t	*end_io;
  	loff_t			i_size;
  	loff_t			size;
  	atomic_t		ref;
  	unsigned		flags;
  	int			error;
  
  	union {
  		/* used during submission and for synchronous completion: */
  		struct {
  			struct iov_iter		*iter;
  			struct task_struct	*waiter;
  			struct request_queue	*last_queue;
  			blk_qc_t		cookie;
  		} submit;
  
  		/* used for aio completion: */
  		struct {
  			struct work_struct	work;
  		} aio;
  	};
  };
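
  /*
   * Final completion of a direct I/O request: call the filesystem ->end_io
   * handler, trim short reads at EOF, advance the file position, invalidate
   * stale page cache for writes and issue any still-required O_DSYNC sync.
   */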
  
  static ssize_t iomap_dio_complete(struct iomap_dio *dio)
  {
  	struct kiocb *iocb = dio->iocb;
  	struct inode *inode = file_inode(iocb->ki_filp);
  	loff_t offset = iocb->ki_pos;
  	ssize_t ret;
  
  	if (dio->end_io) {
  		ret = dio->end_io(iocb,
  				dio->error ? dio->error : dio->size,
  				dio->flags);
  	} else {
  		ret = dio->error;
  	}
  
  	if (likely(!ret)) {
  		ret = dio->size;
  		/* check for short read */
  		if (offset + ret > dio->i_size &&
  		    !(dio->flags & IOMAP_DIO_WRITE))
  			ret = dio->i_size - offset;
  		iocb->ki_pos += ret;
  	}
  	/*
  	 * Try again to invalidate clean pages which might have been cached by
  	 * non-direct readahead, or faulted in by get_user_pages() if the source
  	 * of the write was an mmap'ed region of the file we're writing.  Either
  	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
  	 * this invalidation fails, tough, the write still worked...
  	 *
  	 * And this page cache invalidation has to be after dio->end_io(), as
  	 * some filesystems convert unwritten extents to real allocations in
  	 * end_io() when necessary, otherwise a racing buffer read would cache
  	 * zeros from unwritten extents.
  	 */
  	if (!dio->error &&
  	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
  		int err;
  		err = invalidate_inode_pages2_range(inode->i_mapping,
  				offset >> PAGE_SHIFT,
  				(offset + dio->size - 1) >> PAGE_SHIFT);
  		if (err)
  			dio_warn_stale_pagecache(iocb->ki_filp);
  	}
  	/*
  	 * If this is a DSYNC write, make sure we push it to stable storage now
  	 * that we've written data.
  	 */
  	if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
  		ret = generic_write_sync(iocb, ret);
  	inode_dio_end(file_inode(iocb->ki_filp));
  	kfree(dio);
  
  	return ret;
  }
  
  static void iomap_dio_complete_work(struct work_struct *work)
  {
  	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
  	struct kiocb *iocb = dio->iocb;

  	iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
  }
  
  /*
   * Set an error in the dio if none is set yet.  We have to use cmpxchg
   * as the submission context and the completion context(s) can race to
   * update the error.
   */
  static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
  {
  	cmpxchg(&dio->error, 0, ret);
  }
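
  /*
   * Per-bio completion handler: record any error, and when the last bio of
   * the request finishes either wake the synchronous waiter, punt write
   * completion to the superblock's dio workqueue, or complete a read in place.
   */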
  
  static void iomap_dio_bio_end_io(struct bio *bio)
  {
  	struct iomap_dio *dio = bio->bi_private;
  	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
  	if (bio->bi_status)
  		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
  
  	if (atomic_dec_and_test(&dio->ref)) {
  		if (is_sync_kiocb(dio->iocb)) {
  			struct task_struct *waiter = dio->submit.waiter;
  
  			WRITE_ONCE(dio->submit.waiter, NULL);
  			wake_up_process(waiter);
  		} else if (dio->flags & IOMAP_DIO_WRITE) {
  			struct inode *inode = file_inode(dio->iocb->ki_filp);
  
  			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
  			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
  		} else {
  			iomap_dio_complete_work(&dio->aio.work);
  		}
  	}
  
  	if (should_dirty) {
  		bio_check_pages_dirty(bio);
  	} else {
  		struct bio_vec *bvec;
  		int i;
  
  		bio_for_each_segment_all(bvec, bio, i)
  			put_page(bvec->bv_page);
  		bio_put(bio);
  	}
  }
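
  /*
   * Submit a bio that writes zeroes from the shared zero page, used to pad
   * the unaligned head or tail of a newly allocated block.
   */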
  
  static blk_qc_t
  iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
  		unsigned len)
  {
  	struct page *page = ZERO_PAGE(0);
  	struct bio *bio;
  
  	bio = bio_alloc(GFP_KERNEL, 1);
  	bio_set_dev(bio, iomap->bdev);
  	bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
  	bio->bi_private = dio;
  	bio->bi_end_io = iomap_dio_bio_end_io;
  
  	get_page(page);
  	__bio_add_page(bio, page, len, 0);
  	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
  
  	atomic_inc(&dio->ref);
  	return submit_bio(bio);
  }
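
  /*
   * Build and submit bios for the single mapping returned by ->iomap_begin:
   * reads of holes and unwritten extents are simply zeroed, writes into new
   * blocks may need their sub-block head and tail zeroed, and FUA is used
   * when it lets us skip a cache flush at completion time.
   */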
  
  static loff_t
  iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
  		void *data, struct iomap *iomap)
  {
  	struct iomap_dio *dio = data;
  	unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
  	unsigned int fs_block_size = i_blocksize(inode), pad;
  	unsigned int align = iov_iter_alignment(dio->submit.iter);
  	struct iov_iter iter;
  	struct bio *bio;
  	bool need_zeroout = false;
  	bool use_fua = false;
  	int nr_pages, ret;
  	size_t copied = 0;
  
  	if ((pos | length | align) & ((1 << blkbits) - 1))
  		return -EINVAL;
  
  	switch (iomap->type) {
  	case IOMAP_HOLE:
  		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
  			return -EIO;
  		/*FALLTHRU*/
  	case IOMAP_UNWRITTEN:
  		if (!(dio->flags & IOMAP_DIO_WRITE)) {
  			length = iov_iter_zero(length, dio->submit.iter);
  			dio->size += length;
  			return length;
  		}
  		dio->flags |= IOMAP_DIO_UNWRITTEN;
  		need_zeroout = true;
  		break;
  	case IOMAP_MAPPED:
  		if (iomap->flags & IOMAP_F_SHARED)
  			dio->flags |= IOMAP_DIO_COW;
  		if (iomap->flags & IOMAP_F_NEW) {
  			need_zeroout = true;
  		} else {
  			/*
  			 * Use a FUA write if we need datasync semantics, this
  			 * is a pure data IO that doesn't require any metadata
  			 * updates and the underlying device supports FUA. This
  			 * allows us to avoid cache flushes on IO completion.
  			 */
  			if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
  			    (dio->flags & IOMAP_DIO_WRITE_FUA) &&
  			    blk_queue_fua(bdev_get_queue(iomap->bdev)))
  				use_fua = true;
  		}
  		break;
  	default:
  		WARN_ON_ONCE(1);
  		return -EIO;
  	}
  
  	/*
  	 * Operate on a partial iter trimmed to the extent we were called for.
  	 * We'll update the iter in the dio once we're done with this extent.
  	 */
  	iter = *dio->submit.iter;
  	iov_iter_truncate(&iter, length);
  
  	nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
  	if (nr_pages <= 0)
  		return nr_pages;
  
  	if (need_zeroout) {
  		/* zero out from the start of the block to the write offset */
  		pad = pos & (fs_block_size - 1);
  		if (pad)
  			iomap_dio_zero(dio, iomap, pos - pad, pad);
  	}
  
  	do {
  		size_t n;
  		if (dio->error) {
  			iov_iter_revert(dio->submit.iter, copied);
  			return 0;
  		}
  
  		bio = bio_alloc(GFP_KERNEL, nr_pages);
  		bio_set_dev(bio, iomap->bdev);
  		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
  		bio->bi_write_hint = dio->iocb->ki_hint;
  		bio->bi_ioprio = dio->iocb->ki_ioprio;
  		bio->bi_private = dio;
  		bio->bi_end_io = iomap_dio_bio_end_io;
  
  		ret = bio_iov_iter_get_pages(bio, &iter);
  		if (unlikely(ret)) {
  			bio_put(bio);
  			return copied ? copied : ret;
  		}
  		n = bio->bi_iter.bi_size;
  		if (dio->flags & IOMAP_DIO_WRITE) {
  			bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
  			if (use_fua)
  				bio->bi_opf |= REQ_FUA;
  			else
  				dio->flags &= ~IOMAP_DIO_WRITE_FUA;
  			task_io_account_write(n);
  		} else {
  			bio->bi_opf = REQ_OP_READ;
  			if (dio->flags & IOMAP_DIO_DIRTY)
  				bio_set_pages_dirty(bio);
  		}
  		iov_iter_advance(dio->submit.iter, n);
  
  		dio->size += n;
  		pos += n;
  		copied += n;
  
  		nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
  
  		atomic_inc(&dio->ref);
  
  		dio->submit.last_queue = bdev_get_queue(iomap->bdev);
  		dio->submit.cookie = submit_bio(bio);
  	} while (nr_pages);
  
  	if (need_zeroout) {
  		/* zero out from the end of the write to the end of the block */
  		pad = pos & (fs_block_size - 1);
  		if (pad)
  			iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
  	}
  	return copied;
  }
  /*
   * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
   * is being issued as AIO or not.  This allows us to optimise pure data writes
   * to use REQ_FUA rather than requiring generic_write_sync() to issue a
   * REQ_FLUSH post write. This is slightly tricky because a single request here
   * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
   * may be pure data writes. In that case, we still need to do a full data sync
   * completion.
   */
  ssize_t
  iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
  		const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
  {
  	struct address_space *mapping = iocb->ki_filp->f_mapping;
  	struct inode *inode = file_inode(iocb->ki_filp);
  	size_t count = iov_iter_count(iter);
  	loff_t pos = iocb->ki_pos, start = pos;
  	loff_t end = iocb->ki_pos + count - 1, ret = 0;
  	unsigned int flags = IOMAP_DIRECT;
  	struct blk_plug plug;
  	struct iomap_dio *dio;
  
  	lockdep_assert_held(&inode->i_rwsem);
  
  	if (!count)
  		return 0;
  
  	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
  	if (!dio)
  		return -ENOMEM;
  
  	dio->iocb = iocb;
  	atomic_set(&dio->ref, 1);
  	dio->size = 0;
  	dio->i_size = i_size_read(inode);
  	dio->end_io = end_io;
  	dio->error = 0;
  	dio->flags = 0;
  
  	dio->submit.iter = iter;
  	if (is_sync_kiocb(iocb)) {
  		dio->submit.waiter = current;
  		dio->submit.cookie = BLK_QC_T_NONE;
  		dio->submit.last_queue = NULL;
  	}
  
  	if (iov_iter_rw(iter) == READ) {
  		if (pos >= dio->i_size)
  			goto out_free_dio;
  
  		if (iter->type == ITER_IOVEC)
  			dio->flags |= IOMAP_DIO_DIRTY;
  	} else {
  		flags |= IOMAP_WRITE;
  		dio->flags |= IOMAP_DIO_WRITE;
  
  		/* for data sync or sync, we need sync completion processing */
  		if (iocb->ki_flags & IOCB_DSYNC)
  			dio->flags |= IOMAP_DIO_NEED_SYNC;
  
  		/*
  		 * For datasync only writes, we optimistically try using FUA for
  		 * this IO.  Any non-FUA write that occurs will clear this flag,
  		 * hence we know before completion whether a cache flush is
  		 * necessary.
  		 */
  		if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
  			dio->flags |= IOMAP_DIO_WRITE_FUA;
  	}
  	if (iocb->ki_flags & IOCB_NOWAIT) {
  		if (filemap_range_has_page(mapping, start, end)) {
  			ret = -EAGAIN;
  			goto out_free_dio;
  		}
  		flags |= IOMAP_NOWAIT;
  	}
  	ret = filemap_write_and_wait_range(mapping, start, end);
  	if (ret)
  		goto out_free_dio;

  	/*
  	 * Try to invalidate cache pages for the range we're direct
  	 * writing.  If this invalidation fails, tough, the write will
  	 * still work, but racing two incompatible write paths is a
  	 * pretty crazy thing to do, so we don't support it 100%.
  	 */
  	ret = invalidate_inode_pages2_range(mapping,
  			start >> PAGE_SHIFT, end >> PAGE_SHIFT);
  	if (ret)
  		dio_warn_stale_pagecache(iocb->ki_filp);
  	ret = 0;

  	if (iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
  	    !inode->i_sb->s_dio_done_wq) {
  		ret = sb_init_dio_done_wq(inode->i_sb);
  		if (ret < 0)
  			goto out_free_dio;
  	}
  	inode_dio_begin(inode);
  
  	blk_start_plug(&plug);
  	do {
  		ret = iomap_apply(inode, pos, count, flags, ops, dio,
  				iomap_dio_actor);
  		if (ret <= 0) {
  			/* magic error code to fall back to buffered I/O */
  			if (ret == -ENOTBLK)
  				ret = 0;
  			break;
  		}
  		pos += ret;
  
  		if (iov_iter_rw(iter) == READ && pos >= dio->i_size)
  			break;
  	} while ((count = iov_iter_count(iter)) > 0);
  	blk_finish_plug(&plug);
  
  	if (ret < 0)
  		iomap_dio_set_error(dio, ret);
  	/*
  	 * If all the writes we issued were FUA, we don't need to flush the
  	 * cache on IO completion. Clear the sync flag for this case.
  	 */
  	if (dio->flags & IOMAP_DIO_WRITE_FUA)
  		dio->flags &= ~IOMAP_DIO_NEED_SYNC;
  	if (!atomic_dec_and_test(&dio->ref)) {
  		if (!is_sync_kiocb(iocb))
  			return -EIOCBQUEUED;
  
  		for (;;) {
  			set_current_state(TASK_UNINTERRUPTIBLE);
  			if (!READ_ONCE(dio->submit.waiter))
  				break;
  
  			if (!(iocb->ki_flags & IOCB_HIPRI) ||
  			    !dio->submit.last_queue ||
  			    !blk_poll(dio->submit.last_queue,
  					 dio->submit.cookie))
  				io_schedule();
  		}
  		__set_current_state(TASK_RUNNING);
  	}
  	ret = iomap_dio_complete(dio);
  	return ret;
  
  out_free_dio:
  	kfree(dio);
  	return ret;
  }
  EXPORT_SYMBOL_GPL(iomap_dio_rw);
  
  /* Swapfile activation */
  
  #ifdef CONFIG_SWAP
  struct iomap_swapfile_info {
  	struct iomap iomap;		/* accumulated iomap */
  	struct swap_info_struct *sis;
  	uint64_t lowest_ppage;		/* lowest physical addr seen (pages) */
  	uint64_t highest_ppage;		/* highest physical addr seen (pages) */
  	unsigned long nr_pages;		/* number of pages collected */
  	int nr_extents;			/* extent count */
  };
  
  /*
   * Collect physical extents for this swap file.  Physical extents reported to
   * the swap code must be trimmed to align to a page boundary.  The logical
   * offset within the file is irrelevant since the swapfile code maps logical
   * page numbers of the swap device to the physical page-aligned extents.
   */
  static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
  {
  	struct iomap *iomap = &isi->iomap;
  	unsigned long nr_pages;
  	uint64_t first_ppage;
  	uint64_t first_ppage_reported;
  	uint64_t next_ppage;
  	int error;
  
  	/*
  	 * Round the start up and the end down so that the physical
  	 * extent aligns to a page boundary.
  	 */
  	first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT;
  	next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >>
  			PAGE_SHIFT;
  
  	/* Skip too-short physical extents. */
  	if (first_ppage >= next_ppage)
  		return 0;
  	nr_pages = next_ppage - first_ppage;
  
  	/*
  	 * Calculate how much swap space we're adding; the first page contains
  	 * the swap header and doesn't count.  The mm still wants that first
  	 * page fed to add_swap_extent, however.
  	 */
  	first_ppage_reported = first_ppage;
  	if (iomap->offset == 0)
  		first_ppage_reported++;
  	if (isi->lowest_ppage > first_ppage_reported)
  		isi->lowest_ppage = first_ppage_reported;
  	if (isi->highest_ppage < (next_ppage - 1))
  		isi->highest_ppage = next_ppage - 1;
  
  	/* Add extent, set up for the next call. */
  	error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage);
  	if (error < 0)
  		return error;
  	isi->nr_extents += error;
  	isi->nr_pages += nr_pages;
  	return 0;
  }
  
  /*
   * Accumulate iomaps for this swap file.  We have to accumulate iomaps because
   * swap only cares about contiguous page-aligned physical extents and makes no
   * distinction between written and unwritten extents.
   */
  static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
  		loff_t count, void *data, struct iomap *iomap)
  {
  	struct iomap_swapfile_info *isi = data;
  	int error;
  	switch (iomap->type) {
  	case IOMAP_MAPPED:
  	case IOMAP_UNWRITTEN:
  		/* Only real or unwritten extents. */
  		break;
  	case IOMAP_INLINE:
  		/* No inline data. */
  		pr_err("swapon: file is inline\n");
  		return -EINVAL;
  	default:
  		pr_err("swapon: file has unallocated extents\n");
  		return -EINVAL;
  	}

  	/* No uncommitted metadata or shared blocks. */
  	if (iomap->flags & IOMAP_F_DIRTY) {
  		pr_err("swapon: file is not committed\n");
  		return -EINVAL;
  	}
  	if (iomap->flags & IOMAP_F_SHARED) {
  		pr_err("swapon: file has shared extents\n");
  		return -EINVAL;
  	}

  	/* Only one bdev per swap file. */
  	if (iomap->bdev != isi->sis->bdev) {
  		pr_err("swapon: file is on multiple devices\n");
  		return -EINVAL;
  	}
  
  	if (isi->iomap.length == 0) {
  		/* No accumulated extent, so just store it. */
  		memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
  	} else if (isi->iomap.addr + isi->iomap.length == iomap->addr) {
  		/* Append this to the accumulated extent. */
  		isi->iomap.length += iomap->length;
  	} else {
  		/* Otherwise, add the retained iomap and store this one. */
  		error = iomap_swapfile_add_extent(isi);
  		if (error)
  			return error;
  		memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
  	}
  	return count;
  }
  
  /*
   * Iterate a swap file's iomaps to construct physical extents that can be
   * passed to the swapfile subsystem.
   */
  int iomap_swapfile_activate(struct swap_info_struct *sis,
  		struct file *swap_file, sector_t *pagespan,
  		const struct iomap_ops *ops)
  {
  	struct iomap_swapfile_info isi = {
  		.sis = sis,
  		.lowest_ppage = (sector_t)-1ULL,
  	};
  	struct address_space *mapping = swap_file->f_mapping;
  	struct inode *inode = mapping->host;
  	loff_t pos = 0;
  	loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE);
  	loff_t ret;
  
  	ret = filemap_write_and_wait(inode->i_mapping);
  	if (ret)
  		return ret;
  
  	while (len > 0) {
  		ret = iomap_apply(inode, pos, len, IOMAP_REPORT,
  				ops, &isi, iomap_swapfile_activate_actor);
  		if (ret <= 0)
  			return ret;
  
  		pos += ret;
  		len -= ret;
  	}
  
  	if (isi.iomap.length) {
  		ret = iomap_swapfile_add_extent(&isi);
  		if (ret)
  			return ret;
  	}
  
  	*pagespan = 1 + isi.highest_ppage - isi.lowest_ppage;
  	sis->max = isi.nr_pages;
  	sis->pages = isi.nr_pages - 1;
  	sis->highest_bit = isi.nr_pages - 1;
  	return isi.nr_extents;
  }
  EXPORT_SYMBOL_GPL(iomap_swapfile_activate);
  #endif /* CONFIG_SWAP */
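
  /*
   * Translate the mapping returned by ->iomap_begin into a block number for
   * ->bmap; only mapped extents are reported.
   */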
  
  static loff_t
  iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length,
  		void *data, struct iomap *iomap)
  {
  	sector_t *bno = data, addr;
  
  	if (iomap->type == IOMAP_MAPPED) {
  		addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits;
  		if (addr > INT_MAX)
  			WARN(1, "would truncate bmap result\n");
  		else
  			*bno = addr;
  	}
  	return 0;
  }
  
  /* legacy ->bmap interface.  0 is the error return (!) */
  sector_t
  iomap_bmap(struct address_space *mapping, sector_t bno,
  		const struct iomap_ops *ops)
  {
  	struct inode *inode = mapping->host;
  	loff_t pos = bno << inode->i_blkbits;
  	unsigned blocksize = i_blocksize(inode);
  
  	if (filemap_write_and_wait(mapping))
  		return 0;
  
  	bno = 0;
  	iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor);
  	return bno;
  }
  EXPORT_SYMBOL_GPL(iomap_bmap);