Blame view

fs/ceph/addr.c 31.5 KB
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
1
  #include <linux/ceph/ceph_debug.h>
1d3576fd1   Sage Weil   ceph: address spa...
2
3
4
5
6
7
  
  #include <linux/backing-dev.h>
  #include <linux/fs.h>
  #include <linux/mm.h>
  #include <linux/pagemap.h>
  #include <linux/writeback.h>	/* generic_writepages */
5a0e3ad6a   Tejun Heo   include cleanup: ...
8
  #include <linux/slab.h>
1d3576fd1   Sage Weil   ceph: address spa...
9
10
11
12
  #include <linux/pagevec.h>
  #include <linux/task_io_accounting_ops.h>
  
  #include "super.h"
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
13
14
  #include "mds_client.h"
  #include <linux/ceph/osd_client.h>
1d3576fd1   Sage Weil   ceph: address spa...
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
  
  /*
   * Ceph address space ops.
   *
   * There are a few funny things going on here.
   *
   * The page->private field is used to reference a struct
   * ceph_snap_context for _every_ dirty page.  This indicates which
   * snapshot the page was logically dirtied in, and thus which snap
   * context needs to be associated with the osd write during writeback.
   *
   * Similarly, struct ceph_inode_info maintains a set of counters to
   * count dirty pages on the inode.  In the absence of snapshots,
   * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
   *
   * When a snapshot is taken (that is, when the client receives
   * notification that a snapshot was taken), each inode with caps and
   * with dirty pages (dirty pages implies there is a cap) gets a new
   * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
   * order, new snaps go to the tail).  The i_wrbuffer_ref_head count is
   * moved to capsnap->dirty. (Unless a sync write is currently in
   * progress.  In that case, the capsnap is said to be "pending", new
   * writes cannot start, and the capsnap isn't "finalized" until the
   * write completes (or fails) and a final size/mtime for the inode for
   * that snap can be settled upon.)  i_wrbuffer_ref_head is reset to 0.
   *
   * On writeback, we must submit writes to the osd IN SNAP ORDER.  So,
   * we look for the first capsnap in i_cap_snaps and write out pages in
   * that snap context _only_.  Then we move on to the next capsnap,
   * eventually reaching the "live" or "head" context (i.e., pages that
   * are not yet snapped) and are writing the most recently dirtied
   * pages.
   *
   * Invalidate and so forth must take care to ensure the dirty page
   * accounting is preserved.
   */
2baba2501   Yehuda Sadeh   ceph: writeback c...
51
52
53
54
  /*
   * Writeback congestion thresholds, expressed in pages.
   *
   * CONGESTION_ON_THRESH converts a limit given in kilobytes into a page
   * count (shift right by PAGE_SHIFT - 10).  CONGESTION_OFF_THRESH is
   * that value minus one quarter of it, so congestion is cleared a bit
   * below the level at which it was set.
   *
   * The argument is parenthesized so that expressions with low-precedence
   * operators (e.g. "a | b") are shifted as a whole.
   */
  #define CONGESTION_ON_THRESH(congestion_kb)				\
  	((congestion_kb) >> (PAGE_SHIFT - 10))
  #define CONGESTION_OFF_THRESH(congestion_kb)				\
  	(CONGESTION_ON_THRESH(congestion_kb) -				\
  	 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
1d3576fd1   Sage Weil   ceph: address spa...
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
  
  /*
   * Dirty a page.  Optimistically adjust accounting, on the assumption
   * that we won't race with invalidate.  If we do, readjust.
   */
  static int ceph_set_page_dirty(struct page *page)
  {
  	struct address_space *mapping = page->mapping;
  	struct inode *inode;
  	struct ceph_inode_info *ci;
  	int undo = 0;
  	struct ceph_snap_context *snapc;
  
  	if (unlikely(!mapping))
  		return !TestSetPageDirty(page);
  
  	if (TestSetPageDirty(page)) {
  		dout("%p set_page_dirty %p idx %lu -- already dirty
  ",
  		     mapping->host, page, page->index);
  		return 0;
  	}
  
  	inode = mapping->host;
  	ci = ceph_inode(inode);
  
  	/*
  	 * Note that we're grabbing a snapc ref here without holding
  	 * any locks!
  	 */
  	snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
  
  	/* dirty the head */
  	spin_lock(&inode->i_lock);
7d8cb26d7   Sage Weil   ceph: maintain i_...
89
  	if (ci->i_head_snapc == NULL)
1d3576fd1   Sage Weil   ceph: address spa...
90
91
92
  		ci->i_head_snapc = ceph_get_snap_context(snapc);
  	++ci->i_wrbuffer_ref_head;
  	if (ci->i_wrbuffer_ref == 0)
0444d76ae   Dave Chinner   fs: don't use igr...
93
  		ihold(inode);
1d3576fd1   Sage Weil   ceph: address spa...
94
95
96
97
98
99
100
101
102
103
104
105
106
107
  	++ci->i_wrbuffer_ref;
  	dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
  	     "snapc %p seq %lld (%d snaps)
  ",
  	     mapping->host, page, page->index,
  	     ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
  	     ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
  	     snapc, snapc->seq, snapc->num_snaps);
  	spin_unlock(&inode->i_lock);
  
  	/* now adjust page */
  	spin_lock_irq(&mapping->tree_lock);
  	if (page->mapping) {	/* Race with truncate? */
  		WARN_ON_ONCE(!PageUptodate(page));
679ceace8   Michael Rubin   mm: exporting acc...
108
  		account_page_dirtied(page, page->mapping);
1d3576fd1   Sage Weil   ceph: address spa...
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
  		radix_tree_tag_set(&mapping->page_tree,
  				page_index(page), PAGECACHE_TAG_DIRTY);
  
  		/*
  		 * Reference snap context in page->private.  Also set
  		 * PagePrivate so that we get invalidatepage callback.
  		 */
  		page->private = (unsigned long)snapc;
  		SetPagePrivate(page);
  	} else {
  		dout("ANON set_page_dirty %p (raced truncate?)
  ", page);
  		undo = 1;
  	}
  
  	spin_unlock_irq(&mapping->tree_lock);
  
  	if (undo)
  		/* whoops, we failed to dirty the page */
  		ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
  
  	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
  
  	BUG_ON(!PageDirty(page));
  	return 1;
  }
  
  /*
   * If we are truncating the full page (i.e. offset == 0), adjust the
   * dirty page counters appropriately.  Only called if there is private
   * data on the page.
   *
   * For a partial invalidate (offset != 0) nothing is changed; the page
   * keeps its snap context reference.
   */
  static void ceph_invalidatepage(struct page *page, unsigned long offset)
  {
  	struct inode *inode;
  	struct ceph_inode_info *ci;
  	struct ceph_snap_context *snapc = (void *)page->private;
  
  	BUG_ON(!PageLocked(page));
  	BUG_ON(!page->private);
  	BUG_ON(!PagePrivate(page));
  	BUG_ON(!page->mapping);
  
  	/* only dereference the mapping after it has been checked above */
  	inode = page->mapping->host;
  
  	/*
  	 * We can get non-dirty pages here due to races between
  	 * set_page_dirty and truncate_complete_page; just spit out a
  	 * warning, in case we end up with accounting problems later.
  	 */
  	if (!PageDirty(page))
  		pr_err("%p invalidatepage %p page not dirty\n", inode, page);
  
  	if (offset == 0)
  		ClearPageChecked(page);
  
  	ci = ceph_inode(inode);
  	if (offset == 0) {
  		dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
  		     inode, page, page->index, offset);
  		/* drop the dirty accounting, then the page's snapc ref */
  		ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
  		ceph_put_snap_context(snapc);
  		page->private = 0;
  		ClearPagePrivate(page);
  	} else {
  		dout("%p invalidatepage %p idx %lu partial dirty page\n",
  		     inode, page, page->index);
  	}
  }
  
  /*
   * just a sanity check: warn if the page is still dirty or still
   * carries private data.  Always returns 0.
   */
  static int ceph_releasepage(struct page *page, gfp_t g)
  {
  	struct inode *inode = page->mapping ? page->mapping->host : NULL;
  	dout("%p releasepage %p idx %lu\n", inode, page, page->index);
  	WARN_ON(PageDirty(page));
  	WARN_ON(page->private);
  	WARN_ON(PagePrivate(page));
  	return 0;
  }
  
  /*
   * read a single page, without unlocking it.
   */
  static int readpage_nounlock(struct file *filp, struct page *page)
  {
  	struct inode *inode = filp->f_dentry->d_inode;
  	struct ceph_inode_info *ci = ceph_inode(inode);
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
199
200
  	struct ceph_osd_client *osdc = 
  		&ceph_inode_to_client(inode)->client->osdc;
1d3576fd1   Sage Weil   ceph: address spa...
201
202
203
204
205
206
207
208
209
  	int err = 0;
  	u64 len = PAGE_CACHE_SIZE;
  
  	dout("readpage inode %p file %p page %p index %lu
  ",
  	     inode, filp, page, page->index);
  	err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
  				  page->index << PAGE_CACHE_SHIFT, &len,
  				  ci->i_truncate_seq, ci->i_truncate_size,
b7495fc2f   Sage Weil   ceph: make page a...
210
  				  &page, 1, 0);
1d3576fd1   Sage Weil   ceph: address spa...
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
  	if (err == -ENOENT)
  		err = 0;
  	if (err < 0) {
  		SetPageError(page);
  		goto out;
  	} else if (err < PAGE_CACHE_SIZE) {
  		/* zero fill remainder of page */
  		zero_user_segment(page, err, PAGE_CACHE_SIZE);
  	}
  	SetPageUptodate(page);
  
  out:
  	return err < 0 ? err : 0;
  }
  
  /*
   * Read a single page and unlock it afterwards, whether or not the read
   * succeeded.  Returns whatever readpage_nounlock() returned.
   */
  static int ceph_readpage(struct file *filp, struct page *page)
  {
  	int ret;
  
  	ret = readpage_nounlock(filp, page);
  	unlock_page(page);
  
  	return ret;
  }
  
  /*
   * Build a vector of contiguous pages from the provided page list.
   */
  static struct page **page_vector_from_list(struct list_head *page_list,
  					   unsigned *nr_pages)
  {
  	struct page **pages;
  	struct page *page;
  	int next_index, contig_pages = 0;
  
  	/* build page vector */
  	pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
  	if (!pages)
  		return ERR_PTR(-ENOMEM);
  
  	BUG_ON(list_empty(page_list));
  	next_index = list_entry(page_list->prev, struct page, lru)->index;
  	list_for_each_entry_reverse(page, page_list, lru) {
  		if (page->index == next_index) {
  			dout("readpages page %d %p
  ", contig_pages, page);
  			pages[contig_pages] = page;
  			contig_pages++;
  			next_index++;
  		} else {
  			break;
  		}
  	}
  	*nr_pages = contig_pages;
  	return pages;
  }
  
  /*
   * Read multiple pages.  Leave pages we don't read + unlock in page_list;
   * the caller (VM) cleans them up.
   */
  static int ceph_readpages(struct file *file, struct address_space *mapping,
  			  struct list_head *page_list, unsigned nr_pages)
  {
  	struct inode *inode = file->f_dentry->d_inode;
  	struct ceph_inode_info *ci = ceph_inode(inode);
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
274
275
  	struct ceph_osd_client *osdc =
  		&ceph_inode_to_client(inode)->client->osdc;
1d3576fd1   Sage Weil   ceph: address spa...
276
277
  	int rc = 0;
  	struct page **pages;
1d3576fd1   Sage Weil   ceph: address spa...
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
  	loff_t offset;
  	u64 len;
  
  	dout("readpages %p file %p nr_pages %d
  ",
  	     inode, file, nr_pages);
  
  	pages = page_vector_from_list(page_list, &nr_pages);
  	if (IS_ERR(pages))
  		return PTR_ERR(pages);
  
  	/* guess read extent */
  	offset = pages[0]->index << PAGE_CACHE_SHIFT;
  	len = nr_pages << PAGE_CACHE_SHIFT;
  	rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
  				 offset, &len,
  				 ci->i_truncate_seq, ci->i_truncate_size,
b7495fc2f   Sage Weil   ceph: make page a...
295
  				 pages, nr_pages, 0);
1d3576fd1   Sage Weil   ceph: address spa...
296
297
298
299
  	if (rc == -ENOENT)
  		rc = 0;
  	if (rc < 0)
  		goto out;
1d3576fd1   Sage Weil   ceph: address spa...
300
301
302
303
304
305
306
307
308
309
310
311
  	for (; !list_empty(page_list) && len > 0;
  	     rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
  		struct page *page =
  			list_entry(page_list->prev, struct page, lru);
  
  		list_del(&page->lru);
  
  		if (rc < (int)PAGE_CACHE_SIZE) {
  			/* zero (remainder of) page */
  			int s = rc < 0 ? 0 : rc;
  			zero_user_segment(page, s, PAGE_CACHE_SIZE);
  		}
213c99ee0   Sage Weil   ceph: whitespace ...
312
313
  		if (add_to_page_cache_lru(page, mapping, page->index,
  					  GFP_NOFS)) {
1d3576fd1   Sage Weil   ceph: address spa...
314
315
316
317
318
319
320
321
322
323
324
325
  			page_cache_release(page);
  			dout("readpages %p add_to_page_cache failed %p
  ",
  			     inode, page);
  			continue;
  		}
  		dout("readpages %p adding %p idx %lu
  ", inode, page,
  		     page->index);
  		flush_dcache_page(page);
  		SetPageUptodate(page);
  		unlock_page(page);
31459fe4b   Yehuda Sadeh   ceph: use __page_...
326
  		page_cache_release(page);
1d3576fd1   Sage Weil   ceph: address spa...
327
  	}
1d3576fd1   Sage Weil   ceph: address spa...
328
329
330
331
332
333
334
335
336
337
  	rc = 0;
  
  out:
  	kfree(pages);
  	return rc;
  }
  
  /*
   * Get ref for the oldest snapc for an inode with dirty data... that is, the
   * only snap context we are allowed to write back.
1d3576fd1   Sage Weil   ceph: address spa...
338
   */
6298a3375   Sage Weil   ceph: fix snap co...
339
340
  static struct ceph_snap_context *get_oldest_context(struct inode *inode,
  						    u64 *snap_size)
1d3576fd1   Sage Weil   ceph: address spa...
341
342
343
344
  {
  	struct ceph_inode_info *ci = ceph_inode(inode);
  	struct ceph_snap_context *snapc = NULL;
  	struct ceph_cap_snap *capsnap = NULL;
6298a3375   Sage Weil   ceph: fix snap co...
345
  	spin_lock(&inode->i_lock);
1d3576fd1   Sage Weil   ceph: address spa...
346
347
348
349
350
351
352
353
354
355
356
  	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
  		dout(" cap_snap %p snapc %p has %d dirty pages
  ", capsnap,
  		     capsnap->context, capsnap->dirty_pages);
  		if (capsnap->dirty_pages) {
  			snapc = ceph_get_snap_context(capsnap->context);
  			if (snap_size)
  				*snap_size = capsnap->size;
  			break;
  		}
  	}
7d8cb26d7   Sage Weil   ceph: maintain i_...
357
  	if (!snapc && ci->i_wrbuffer_ref_head) {
80e755fed   Sage Weil   ceph: allow write...
358
  		snapc = ceph_get_snap_context(ci->i_head_snapc);
1d3576fd1   Sage Weil   ceph: address spa...
359
360
361
362
  		dout(" head snapc %p has %d dirty pages
  ",
  		     snapc, ci->i_wrbuffer_ref_head);
  	}
1d3576fd1   Sage Weil   ceph: address spa...
363
364
365
366
367
368
369
370
371
372
373
374
375
376
  	spin_unlock(&inode->i_lock);
  	return snapc;
  }
  
  /*
   * Write a single page, but leave the page locked.
   *
   * If we get a write error, set the page error bit, but still adjust the
   * dirty page accounting (i.e., page is no longer dirty).
   */
  static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
  {
  	struct inode *inode;
  	struct ceph_inode_info *ci;
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
377
  	struct ceph_fs_client *fsc;
1d3576fd1   Sage Weil   ceph: address spa...
378
379
380
381
382
  	struct ceph_osd_client *osdc;
  	loff_t page_off = page->index << PAGE_CACHE_SHIFT;
  	int len = PAGE_CACHE_SIZE;
  	loff_t i_size;
  	int err = 0;
6298a3375   Sage Weil   ceph: fix snap co...
383
  	struct ceph_snap_context *snapc, *oldest;
1d3576fd1   Sage Weil   ceph: address spa...
384
  	u64 snap_size = 0;
2baba2501   Yehuda Sadeh   ceph: writeback c...
385
  	long writeback_stat;
1d3576fd1   Sage Weil   ceph: address spa...
386
387
388
389
390
391
392
393
394
395
396
  
  	dout("writepage %p idx %lu
  ", page, page->index);
  
  	if (!page->mapping || !page->mapping->host) {
  		dout("writepage %p - no mapping
  ", page);
  		return -EFAULT;
  	}
  	inode = page->mapping->host;
  	ci = ceph_inode(inode);
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
397
398
  	fsc = ceph_inode_to_client(inode);
  	osdc = &fsc->client->osdc;
1d3576fd1   Sage Weil   ceph: address spa...
399
400
401
402
403
404
405
406
  
  	/* verify this is a writeable snap context */
  	snapc = (void *)page->private;
  	if (snapc == NULL) {
  		dout("writepage %p page %p not dirty?
  ", inode, page);
  		goto out;
  	}
6298a3375   Sage Weil   ceph: fix snap co...
407
408
  	oldest = get_oldest_context(inode, &snap_size);
  	if (snapc->seq > oldest->seq) {
1d3576fd1   Sage Weil   ceph: address spa...
409
410
411
412
413
  		dout("writepage %p page %p snapc %p not writeable - noop
  ",
  		     inode, page, (void *)page->private);
  		/* we should only noop if called by kswapd */
  		WARN_ON((current->flags & PF_MEMALLOC) == 0);
6298a3375   Sage Weil   ceph: fix snap co...
414
  		ceph_put_snap_context(oldest);
1d3576fd1   Sage Weil   ceph: address spa...
415
416
  		goto out;
  	}
6298a3375   Sage Weil   ceph: fix snap co...
417
  	ceph_put_snap_context(oldest);
1d3576fd1   Sage Weil   ceph: address spa...
418
419
420
421
422
423
424
425
  
  	/* is this a partial page at end of file? */
  	if (snap_size)
  		i_size = snap_size;
  	else
  		i_size = i_size_read(inode);
  	if (i_size < page_off + len)
  		len = i_size - page_off;
ae00d4f37   Sage Weil   ceph: fix cap_sna...
426
427
428
  	dout("writepage %p page %p index %lu on %llu~%u snapc %p
  ",
  	     inode, page, page->index, page_off, len, snapc);
1d3576fd1   Sage Weil   ceph: address spa...
429

3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
430
  	writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
2baba2501   Yehuda Sadeh   ceph: writeback c...
431
  	if (writeback_stat >
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
432
433
  	    CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
  		set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
2baba2501   Yehuda Sadeh   ceph: writeback c...
434

1d3576fd1   Sage Weil   ceph: address spa...
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
  	set_page_writeback(page);
  	err = ceph_osdc_writepages(osdc, ceph_vino(inode),
  				   &ci->i_layout, snapc,
  				   page_off, len,
  				   ci->i_truncate_seq, ci->i_truncate_size,
  				   &inode->i_mtime,
  				   &page, 1, 0, 0, true);
  	if (err < 0) {
  		dout("writepage setting page/mapping error %d %p
  ", err, page);
  		SetPageError(page);
  		mapping_set_error(&inode->i_data, err);
  		if (wbc)
  			wbc->pages_skipped++;
  	} else {
  		dout("writepage cleaned page %p
  ", page);
  		err = 0;  /* vfs expects us to return 0 */
  	}
  	page->private = 0;
  	ClearPagePrivate(page);
  	end_page_writeback(page);
  	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
6298a3375   Sage Weil   ceph: fix snap co...
458
  	ceph_put_snap_context(snapc);  /* page's reference */
1d3576fd1   Sage Weil   ceph: address spa...
459
460
461
462
463
464
  out:
  	return err;
  }
  
  /*
   * Write a single page and unlock it.  An inode reference is held across
   * the write and dropped afterwards.
   *
   * Returns whatever writepage_nounlock() returned.
   */
  static int ceph_writepage(struct page *page, struct writeback_control *wbc)
  {
  	int err;
  	struct inode *inode = page->mapping->host;
  	BUG_ON(!inode);
  	/*
  	 * ihold() rather than igrab(): the igrab() result was never
  	 * checked, yet iput() below is unconditional, so the get must
  	 * not be allowed to fail silently.
  	 */
  	ihold(inode);
  	err = writepage_nounlock(page, wbc);
  	unlock_page(page);
  	iput(inode);
  	return err;
  }
  
  
  /*
   * lame release_pages helper.  release_pages() isn't exported to
   * modules.
   */
  static void ceph_release_pages(struct page **pages, int num)
  {
  	struct pagevec pvec;
  	int i;
  
  	pagevec_init(&pvec, 0);
  	for (i = 0; i < num; i++) {
  		if (pagevec_add(&pvec, pages[i]) == 0)
  			pagevec_release(&pvec);
  	}
  	pagevec_release(&pvec);
  }
  
  
  /*
   * async writeback completion handler.
   *
   * If we get an error, set the mapping error bit, but not the individual
   * page error bits.
   */
  static void writepages_finish(struct ceph_osd_request *req,
  			      struct ceph_msg *msg)
  {
  	struct inode *inode = req->r_inode;
  	struct ceph_osd_reply_head *replyhead;
  	struct ceph_osd_op *op;
  	struct ceph_inode_info *ci = ceph_inode(inode);
  	unsigned wrote;
1d3576fd1   Sage Weil   ceph: address spa...
508
509
510
511
  	struct page *page;
  	int i;
  	struct ceph_snap_context *snapc = req->r_snapc;
  	struct address_space *mapping = inode->i_mapping;
1d3576fd1   Sage Weil   ceph: address spa...
512
513
  	__s32 rc = -EIO;
  	u64 bytes = 0;
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
514
  	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
2baba2501   Yehuda Sadeh   ceph: writeback c...
515
  	long writeback_stat;
7ff899da0   Sage Weil   ceph: fix lockles...
516
  	unsigned issued = ceph_caps_issued(ci);
1d3576fd1   Sage Weil   ceph: address spa...
517
518
519
520
521
522
523
524
525
  
  	/* parse reply */
  	replyhead = msg->front.iov_base;
  	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
  	op = (void *)(replyhead + 1);
  	rc = le32_to_cpu(replyhead->result);
  	bytes = le64_to_cpu(op->extent.length);
  
  	if (rc >= 0) {
79788c698   Sage Weil   ceph: release all...
526
527
528
529
530
531
532
  		/*
  		 * Assume we wrote the pages we originally sent.  The
  		 * osd might reply with fewer pages if our writeback
  		 * raced with a truncation and was adjusted at the osd,
  		 * so don't believe the reply.
  		 */
  		wrote = req->r_num_pages;
1d3576fd1   Sage Weil   ceph: address spa...
533
534
535
536
537
538
539
540
541
542
543
544
545
  	} else {
  		wrote = 0;
  		mapping_set_error(mapping, rc);
  	}
  	dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)
  ",
  	     inode, rc, bytes, wrote);
  
  	/* clean all pages */
  	for (i = 0; i < req->r_num_pages; i++) {
  		page = req->r_pages[i];
  		BUG_ON(!page);
  		WARN_ON(!PageUptodate(page));
2baba2501   Yehuda Sadeh   ceph: writeback c...
546
  		writeback_stat =
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
547
  			atomic_long_dec_return(&fsc->writeback_count);
2baba2501   Yehuda Sadeh   ceph: writeback c...
548
  		if (writeback_stat <
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
549
550
  		    CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
  			clear_bdi_congested(&fsc->backing_dev_info,
2baba2501   Yehuda Sadeh   ceph: writeback c...
551
  					    BLK_RW_ASYNC);
80e755fed   Sage Weil   ceph: allow write...
552
  		ceph_put_snap_context((void *)page->private);
1d3576fd1   Sage Weil   ceph: address spa...
553
554
  		page->private = 0;
  		ClearPagePrivate(page);
1d3576fd1   Sage Weil   ceph: address spa...
555
556
557
  		dout("unlocking %d %p
  ", i, page);
  		end_page_writeback(page);
e63dc5c78   Yehuda Sadeh   ceph: remove page...
558
559
560
561
562
563
564
  
  		/*
  		 * We lost the cache cap, need to truncate the page before
  		 * it is unlocked, otherwise we'd truncate it later in the
  		 * page truncation thread, possibly losing some data that
  		 * raced its way in
  		 */
2962507ca   Sage Weil   ceph: perform laz...
565
  		if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
e63dc5c78   Yehuda Sadeh   ceph: remove page...
566
  			generic_error_remove_page(inode->i_mapping, page);
1d3576fd1   Sage Weil   ceph: address spa...
567
568
569
570
571
572
573
574
575
  		unlock_page(page);
  	}
  	dout("%p wrote+cleaned %d pages
  ", inode, wrote);
  	ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
  
  	ceph_release_pages(req->r_pages, req->r_num_pages);
  	if (req->r_pages_from_pool)
  		mempool_free(req->r_pages,
640ef79d2   Cheng Renquan   ceph: use ceph_sb...
576
  			     ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
1d3576fd1   Sage Weil   ceph: address spa...
577
578
579
580
581
582
583
584
585
586
  	else
  		kfree(req->r_pages);
  	ceph_osdc_put_request(req);
  }
  
  /*
   * allocate a page vec, either directly, or if necessary, via a the
   * mempool.  we avoid the mempool if we can because req->r_num_pages
   * may be less than the maximum write size.
   */
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
587
  static void alloc_page_vec(struct ceph_fs_client *fsc,
1d3576fd1   Sage Weil   ceph: address spa...
588
589
590
591
592
  			   struct ceph_osd_request *req)
  {
  	req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
  			       GFP_NOFS);
  	if (!req->r_pages) {
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
593
  		req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
1d3576fd1   Sage Weil   ceph: address spa...
594
595
596
597
598
599
600
601
602
603
604
605
  		req->r_pages_from_pool = 1;
  		WARN_ON(!req->r_pages);
  	}
  }
  
  /*
   * initiate async writeback
   */
  static int ceph_writepages_start(struct address_space *mapping,
  				 struct writeback_control *wbc)
  {
  	struct inode *inode = mapping->host;
1d3576fd1   Sage Weil   ceph: address spa...
606
  	struct ceph_inode_info *ci = ceph_inode(inode);
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
607
  	struct ceph_fs_client *fsc;
1d3576fd1   Sage Weil   ceph: address spa...
608
609
610
611
  	pgoff_t index, start, end;
  	int range_whole = 0;
  	int should_loop = 1;
  	pgoff_t max_pages = 0, max_pages_ever = 0;
80e755fed   Sage Weil   ceph: allow write...
612
  	struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
1d3576fd1   Sage Weil   ceph: address spa...
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
  	struct pagevec pvec;
  	int done = 0;
  	int rc = 0;
  	unsigned wsize = 1 << inode->i_blkbits;
  	struct ceph_osd_request *req = NULL;
  	int do_sync;
  	u64 snap_size = 0;
  
  	/*
  	 * Include a 'sync' in the OSD request if this is a data
  	 * integrity write (e.g., O_SYNC write or fsync()), or if our
  	 * cap is being revoked.
  	 */
  	do_sync = wbc->sync_mode == WB_SYNC_ALL;
  	if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
  		do_sync = 1;
  	dout("writepages_start %p dosync=%d (mode=%s)
  ",
  	     inode, do_sync,
  	     wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
  	     (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
634
635
  	fsc = ceph_inode_to_client(inode);
  	if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
1d3576fd1   Sage Weil   ceph: address spa...
636
637
638
639
  		pr_warning("writepage_start %p on forced umount
  ", inode);
  		return -EIO; /* we're in a forced umount, don't write! */
  	}
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
640
641
  	if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
  		wsize = fsc->mount_options->wsize;
1d3576fd1   Sage Weil   ceph: address spa...
642
643
644
645
646
  	if (wsize < PAGE_CACHE_SIZE)
  		wsize = PAGE_CACHE_SIZE;
  	max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
  
  	pagevec_init(&pvec, 0);
1d3576fd1   Sage Weil   ceph: address spa...
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
  	/* where to start/end? */
  	if (wbc->range_cyclic) {
  		start = mapping->writeback_index; /* Start from prev offset */
  		end = -1;
  		dout(" cyclic, start at %lu
  ", start);
  	} else {
  		start = wbc->range_start >> PAGE_CACHE_SHIFT;
  		end = wbc->range_end >> PAGE_CACHE_SHIFT;
  		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
  			range_whole = 1;
  		should_loop = 0;
  		dout(" not cyclic, %lu to %lu
  ", start, end);
  	}
  	index = start;
  
  retry:
  	/* find oldest snap context with dirty data */
  	ceph_put_snap_context(snapc);
  	snapc = get_oldest_context(inode, &snap_size);
  	if (!snapc) {
  		/* hmm, why does writepages get called when there
  		   is no dirty data? */
  		dout(" no snap context with dirty data?
  ");
  		goto out;
  	}
  	dout(" oldest snapc is %p seq %lld (%d snaps)
  ",
  	     snapc, snapc->seq, snapc->num_snaps);
  	if (last_snapc && snapc != last_snapc) {
  		/* if we switched to a newer snapc, restart our scan at the
  		 * start of the original file range. */
  		dout("  snapc differs from last pass, restarting at %lu
  ",
  		     index);
  		index = start;
  	}
  	last_snapc = snapc;
  
  	while (!done && index <= end) {
  		unsigned i;
  		int first;
  		pgoff_t next;
  		int pvec_pages, locked_pages;
  		struct page *page;
  		int want;
  		u64 offset, len;
  		struct ceph_osd_request_head *reqhead;
  		struct ceph_osd_op *op;
2baba2501   Yehuda Sadeh   ceph: writeback c...
698
  		long writeback_stat;
1d3576fd1   Sage Weil   ceph: address spa...
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
  
  		next = 0;
  		locked_pages = 0;
  		max_pages = max_pages_ever;
  
  get_more_pages:
  		first = -1;
  		want = min(end - index,
  			   min((pgoff_t)PAGEVEC_SIZE,
  			       max_pages - (pgoff_t)locked_pages) - 1)
  			+ 1;
  		pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
  						PAGECACHE_TAG_DIRTY,
  						want);
  		dout("pagevec_lookup_tag got %d
  ", pvec_pages);
  		if (!pvec_pages && !locked_pages)
  			break;
  		for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
  			page = pvec.pages[i];
  			dout("? %p idx %lu
  ", page, page->index);
  			if (locked_pages == 0)
  				lock_page(page);  /* first page */
  			else if (!trylock_page(page))
  				break;
  
  			/* only dirty pages, or our accounting breaks */
  			if (unlikely(!PageDirty(page)) ||
  			    unlikely(page->mapping != mapping)) {
  				dout("!dirty or !mapping %p
  ", page);
  				unlock_page(page);
  				break;
  			}
  			if (!wbc->range_cyclic && page->index > end) {
  				dout("end of range %p
  ", page);
  				done = 1;
  				unlock_page(page);
  				break;
  			}
  			if (next && (page->index != next)) {
  				dout("not consecutive %p
  ", page);
  				unlock_page(page);
  				break;
  			}
  			if (wbc->sync_mode != WB_SYNC_NONE) {
  				dout("waiting on writeback %p
  ", page);
  				wait_on_page_writeback(page);
  			}
  			if ((snap_size && page_offset(page) > snap_size) ||
  			    (!snap_size &&
  			     page_offset(page) > i_size_read(inode))) {
  				dout("%p page eof %llu
  ", page, snap_size ?
  				     snap_size : i_size_read(inode));
  				done = 1;
  				unlock_page(page);
  				break;
  			}
  			if (PageWriteback(page)) {
  				dout("%p under writeback
  ", page);
  				unlock_page(page);
  				break;
  			}
  
  			/* only if matching snap context */
80e755fed   Sage Weil   ceph: allow write...
770
771
772
773
774
  			pgsnapc = (void *)page->private;
  			if (pgsnapc->seq > snapc->seq) {
  				dout("page snapc %p %lld > oldest %p %lld
  ",
  				     pgsnapc, pgsnapc->seq, snapc, snapc->seq);
1d3576fd1   Sage Weil   ceph: address spa...
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
  				unlock_page(page);
  				if (!locked_pages)
  					continue; /* keep looking for snap */
  				break;
  			}
  
  			if (!clear_page_dirty_for_io(page)) {
  				dout("%p !clear_page_dirty_for_io
  ", page);
  				unlock_page(page);
  				break;
  			}
  
  			/* ok */
  			if (locked_pages == 0) {
  				/* prepare async write request */
a77d9f7dc   Sage Weil   ceph: fix file of...
791
792
  				offset = (unsigned long long)page->index
  					<< PAGE_CACHE_SHIFT;
1d3576fd1   Sage Weil   ceph: address spa...
793
  				len = wsize;
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
794
  				req = ceph_osdc_new_request(&fsc->client->osdc,
1d3576fd1   Sage Weil   ceph: address spa...
795
796
797
798
799
800
801
802
803
  					    &ci->i_layout,
  					    ceph_vino(inode),
  					    offset, &len,
  					    CEPH_OSD_OP_WRITE,
  					    CEPH_OSD_FLAG_WRITE |
  						    CEPH_OSD_FLAG_ONDISK,
  					    snapc, do_sync,
  					    ci->i_truncate_seq,
  					    ci->i_truncate_size,
b7495fc2f   Sage Weil   ceph: make page a...
804
  					    &inode->i_mtime, true, 1, 0);
1d3576fd1   Sage Weil   ceph: address spa...
805
  				max_pages = req->r_num_pages;
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
806
  				alloc_page_vec(fsc, req);
1d3576fd1   Sage Weil   ceph: address spa...
807
808
  				req->r_callback = writepages_finish;
  				req->r_inode = inode;
1d3576fd1   Sage Weil   ceph: address spa...
809
810
811
812
813
814
815
816
  			}
  
  			/* note position of first page in pvec */
  			if (first < 0)
  				first = i;
  			dout("%p will write page %p idx %lu
  ",
  			     inode, page, page->index);
2baba2501   Yehuda Sadeh   ceph: writeback c...
817

213c99ee0   Sage Weil   ceph: whitespace ...
818
  			writeback_stat =
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
819
  			       atomic_long_inc_return(&fsc->writeback_count);
213c99ee0   Sage Weil   ceph: whitespace ...
820
  			if (writeback_stat > CONGESTION_ON_THRESH(
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
821
822
  				    fsc->mount_options->congestion_kb)) {
  				set_bdi_congested(&fsc->backing_dev_info,
213c99ee0   Sage Weil   ceph: whitespace ...
823
  						  BLK_RW_ASYNC);
2baba2501   Yehuda Sadeh   ceph: writeback c...
824
  			}
1d3576fd1   Sage Weil   ceph: address spa...
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
  			set_page_writeback(page);
  			req->r_pages[locked_pages] = page;
  			locked_pages++;
  			next = page->index + 1;
  		}
  
  		/* did we get anything? */
  		if (!locked_pages)
  			goto release_pvec_pages;
  		if (i) {
  			int j;
  			BUG_ON(!locked_pages || first < 0);
  
  			if (pvec_pages && i == pvec_pages &&
  			    locked_pages < max_pages) {
  				dout("reached end pvec, trying for more
  ");
  				pagevec_reinit(&pvec);
  				goto get_more_pages;
  			}
  
  			/* shift unused pages over in the pvec...  we
  			 * will need to release them below. */
  			for (j = i; j < pvec_pages; j++) {
  				dout(" pvec leftover page %p
  ",
  				     pvec.pages[j]);
  				pvec.pages[j-i+first] = pvec.pages[j];
  			}
  			pvec.nr -= i-first;
  		}
  
  		/* submit the write */
  		offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
  		len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
  			  (u64)locked_pages << PAGE_CACHE_SHIFT);
  		dout("writepages got %d pages at %llu~%llu
  ",
  		     locked_pages, offset, len);
  
  		/* revise final length, page count */
  		req->r_num_pages = locked_pages;
  		reqhead = req->r_request->front.iov_base;
  		op = (void *)(reqhead + 1);
  		op->extent.length = cpu_to_le64(len);
  		op->payload_len = cpu_to_le32(len);
  		req->r_request->hdr.data_len = cpu_to_le32(len);
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
872
  		ceph_osdc_start_request(&fsc->client->osdc, req, true);
1d3576fd1   Sage Weil   ceph: address spa...
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
  		req = NULL;
  
  		/* continue? */
  		index = next;
  		wbc->nr_to_write -= locked_pages;
  		if (wbc->nr_to_write <= 0)
  			done = 1;
  
  release_pvec_pages:
  		dout("pagevec_release on %d pages (%p)
  ", (int)pvec.nr,
  		     pvec.nr ? pvec.pages[0] : NULL);
  		pagevec_release(&pvec);
  
  		if (locked_pages && !done)
  			goto retry;
  	}
  
  	if (should_loop && !done) {
  		/* more to do; loop back to beginning of file */
  		dout("writepages looping back to beginning of file
  ");
  		should_loop = 0;
  		index = 0;
  		goto retry;
  	}
  
  	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
  		mapping->writeback_index = index;
  
  out:
  	if (req)
  		ceph_osdc_put_request(req);
  	if (rc > 0)
  		rc = 0;  /* vfs expects us to return 0 */
  	ceph_put_snap_context(snapc);
  	dout("writepages done, rc = %d
  ", rc);
1d3576fd1   Sage Weil   ceph: address spa...
911
912
913
914
915
916
917
918
919
920
921
922
  	return rc;
  }
  
  
  
  /*
   * See if a given @snapc is either writeable, or already written.
   */
  static int context_is_writeable_or_written(struct inode *inode,
  					   struct ceph_snap_context *snapc)
  {
  	struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
6298a3375   Sage Weil   ceph: fix snap co...
923
924
925
926
  	int ret = !oldest || snapc->seq <= oldest->seq;
  
  	ceph_put_snap_context(oldest);
  	return ret;
1d3576fd1   Sage Weil   ceph: address spa...
927
928
929
930
931
  }
  
  /*
   * We are only allowed to write into/dirty the page if the page is
   * clean, or already dirty within the same snap context.
8f883c24d   Sage Weil   ceph: make write_...
932
933
934
935
   *
   * called with page locked.
   * return success with page locked,
   * or any failure (incl -EAGAIN) with page unlocked.
1d3576fd1   Sage Weil   ceph: address spa...
936
   */
4af6b2257   Yehuda Sadeh   ceph: refactor ce...
937
938
939
  static int ceph_update_writeable_page(struct file *file,
  			    loff_t pos, unsigned len,
  			    struct page *page)
1d3576fd1   Sage Weil   ceph: address spa...
940
941
942
  {
  	struct inode *inode = file->f_dentry->d_inode;
  	struct ceph_inode_info *ci = ceph_inode(inode);
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
943
  	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1d3576fd1   Sage Weil   ceph: address spa...
944
945
946
947
  	loff_t page_off = pos & PAGE_CACHE_MASK;
  	int pos_in_page = pos & ~PAGE_CACHE_MASK;
  	int end_in_page = pos_in_page + len;
  	loff_t i_size;
1d3576fd1   Sage Weil   ceph: address spa...
948
  	int r;
80e755fed   Sage Weil   ceph: allow write...
949
  	struct ceph_snap_context *snapc, *oldest;
1d3576fd1   Sage Weil   ceph: address spa...
950

1d3576fd1   Sage Weil   ceph: address spa...
951
952
953
954
955
956
957
958
  retry_locked:
  	/* writepages currently holds page lock, but if we change that later, */
  	wait_on_page_writeback(page);
  
  	/* check snap context */
  	BUG_ON(!ci->i_snap_realm);
  	down_read(&mdsc->snap_rwsem);
  	BUG_ON(!ci->i_snap_realm->cached_context);
80e755fed   Sage Weil   ceph: allow write...
959
960
  	snapc = (void *)page->private;
  	if (snapc && snapc != ci->i_head_snapc) {
1d3576fd1   Sage Weil   ceph: address spa...
961
962
963
964
  		/*
  		 * this page is already dirty in another (older) snap
  		 * context!  is it writeable now?
  		 */
80e755fed   Sage Weil   ceph: allow write...
965
  		oldest = get_oldest_context(inode, NULL);
1d3576fd1   Sage Weil   ceph: address spa...
966
  		up_read(&mdsc->snap_rwsem);
80e755fed   Sage Weil   ceph: allow write...
967
  		if (snapc->seq > oldest->seq) {
6298a3375   Sage Weil   ceph: fix snap co...
968
  			ceph_put_snap_context(oldest);
1d3576fd1   Sage Weil   ceph: address spa...
969
970
  			dout(" page %p snapc %p not current or oldest
  ",
6298a3375   Sage Weil   ceph: fix snap co...
971
  			     page, snapc);
1d3576fd1   Sage Weil   ceph: address spa...
972
973
974
975
  			/*
  			 * queue for writeback, and wait for snapc to
  			 * be writeable or written
  			 */
6298a3375   Sage Weil   ceph: fix snap co...
976
  			snapc = ceph_get_snap_context(snapc);
1d3576fd1   Sage Weil   ceph: address spa...
977
  			unlock_page(page);
3c6f6b79a   Sage Weil   ceph: cleanup asy...
978
  			ceph_queue_writeback(inode);
8f883c24d   Sage Weil   ceph: make write_...
979
  			r = wait_event_interruptible(ci->i_cap_wq,
1d3576fd1   Sage Weil   ceph: address spa...
980
981
  			       context_is_writeable_or_written(inode, snapc));
  			ceph_put_snap_context(snapc);
8f883c24d   Sage Weil   ceph: make write_...
982
983
  			if (r == -ERESTARTSYS)
  				return r;
4af6b2257   Yehuda Sadeh   ceph: refactor ce...
984
  			return -EAGAIN;
1d3576fd1   Sage Weil   ceph: address spa...
985
  		}
6298a3375   Sage Weil   ceph: fix snap co...
986
  		ceph_put_snap_context(oldest);
1d3576fd1   Sage Weil   ceph: address spa...
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
  
  		/* yay, writeable, do it now (without dropping page lock) */
  		dout(" page %p snapc %p not current, but oldest
  ",
  		     page, snapc);
  		if (!clear_page_dirty_for_io(page))
  			goto retry_locked;
  		r = writepage_nounlock(page, NULL);
  		if (r < 0)
  			goto fail_nosnap;
  		goto retry_locked;
  	}
  
  	if (PageUptodate(page)) {
  		dout(" page %p already uptodate
  ", page);
  		return 0;
  	}
  
  	/* full page? */
  	if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
  		return 0;
  
  	/* past end of file? */
  	i_size = inode->i_size;   /* caller holds i_mutex */
  
  	if (i_size + len > inode->i_sb->s_maxbytes) {
  		/* file is too big */
  		r = -EINVAL;
  		goto fail;
  	}
  
  	if (page_off >= i_size ||
  	    (pos_in_page == 0 && (pos+len) >= i_size &&
  	     end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
  		dout(" zeroing %p 0 - %d and %d - %d
  ",
  		     page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
  		zero_user_segments(page,
  				   0, pos_in_page,
  				   end_in_page, PAGE_CACHE_SIZE);
  		return 0;
  	}
  
  	/* we need to read it. */
  	up_read(&mdsc->snap_rwsem);
  	r = readpage_nounlock(file, page);
  	if (r < 0)
  		goto fail_nosnap;
  	goto retry_locked;
  
  fail:
  	up_read(&mdsc->snap_rwsem);
  fail_nosnap:
  	unlock_page(page);
  	return r;
  }
  
  /*
   * We are only allowed to write into/dirty the page if the page is
   * clean, or already dirty within the same snap context.
   *
   * Grabs (and locks) the page cache page covering @pos and prepares it
   * via ceph_update_writeable_page(), retrying for as long as that
   * helper reports -EAGAIN.  On success returns 0 with the page stored
   * in *@pagep; on failure returns a negative errno, and *@pagep is left
   * untouched.
   */
  static int ceph_write_begin(struct file *file, struct address_space *mapping,
  			    loff_t pos, unsigned len, unsigned flags,
  			    struct page **pagep, void **fsdata)
  {
  	struct inode *inode = file->f_dentry->d_inode;
  	struct page *page;
  	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
  	int r;

  	do {
  		/* get a page */
  		page = grab_cache_page_write_begin(mapping, index, 0);
  		if (!page)
  			return -ENOMEM;

  		dout("write_begin file %p inode %p page %p %d~%d\n", file,
  		     inode, page, (int)pos, (int)len);

  		r = ceph_update_writeable_page(file, pos, len, page);
  		if (r < 0) {
  			/*
  			 * The helper already unlocked the page; drop the
  			 * reference we took above so it is not leaked on
  			 * retry or on error return.
  			 */
  			page_cache_release(page);
  		} else {
  			*pagep = page;
  		}
  	} while (r == -EAGAIN);

  	return r;
  }
  
  /*
   * we don't do anything in here that simple_write_end doesn't do
   * except adjust dirty page accounting and drop read lock on
   * mdsc->snap_rwsem.
   *
   * Called with the page locked and snap_rwsem held for read (as left
   * by a successful ceph_update_writeable_page()).  Unlocks the page,
   * releases snap_rwsem and the page reference, and returns the number
   * of bytes copied.
   */
  static int ceph_write_end(struct file *file, struct address_space *mapping,
  			  loff_t pos, unsigned len, unsigned copied,
  			  struct page *page, void *fsdata)
  {
  	struct inode *inode = file->f_dentry->d_inode;
  	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
  	struct ceph_mds_client *mdsc = fsc->mdsc;
  	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
  	int check_cap = 0;

  	dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
  	     inode, page, (int)pos, (int)copied, (int)len);

  	/*
  	 * zero the stale part of the page if we did a short copy.
  	 * zero_user_segment() takes in-page [start, end) offsets, so the
  	 * end of the intended write is from+len, not len.
  	 */
  	if (copied < len)
  		zero_user_segment(page, from+copied, from+len);

  	/* did file size increase? */
  	/* (no need for i_size_read(); the caller holds i_mutex) */
  	if (pos+copied > inode->i_size)
  		check_cap = ceph_inode_set_size(inode, pos+copied);

  	if (!PageUptodate(page))
  		SetPageUptodate(page);

  	set_page_dirty(page);

  	unlock_page(page);
  	up_read(&mdsc->snap_rwsem);
  	page_cache_release(page);

  	/* only after the page lock and snap_rwsem have been dropped */
  	if (check_cap)
  		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);

  	return copied;
  }
  
  /*
   * Placeholder .direct_IO method.  Its presence in ceph_aops advertises
   * that direct io is supported; O_DIRECT reads and writes are
   * intercepted early, so this should never actually be invoked.  If it
   * is, warn and fail with -EINVAL.
   */
  static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
  			      const struct iovec *iov,
  			      loff_t pos, unsigned long nr_segs)
  {
  	ssize_t ret = -EINVAL;

  	/* getting here means the early O_DIRECT interception was missed */
  	WARN_ON(1);
  	return ret;
  }
  
  /* address space operations for ceph file mappings */
  const struct address_space_operations ceph_aops = {
  	/* reading pages in */
  	.readpages = ceph_readpages,
  	.readpage = ceph_readpage,

  	/* writing dirty pages back */
  	.writepages = ceph_writepages_start,
  	.writepage = ceph_writepage,

  	/* buffered write path */
  	.write_begin = ceph_write_begin,
  	.write_end = ceph_write_end,

  	/* page state tracking */
  	.set_page_dirty = ceph_set_page_dirty,
  	.releasepage = ceph_releasepage,
  	.invalidatepage = ceph_invalidatepage,

  	/* stub; see ceph_direct_io() */
  	.direct_IO = ceph_direct_io,
  };
  
  
  /*
   * vm ops
   */
  
  /*
   * Reuse write_begin here for simplicity.
   */
  static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
  {
  	struct inode *inode = vma->vm_file->f_dentry->d_inode;
  	struct page *page = vmf->page;
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
1156
  	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1d3576fd1   Sage Weil   ceph: address spa...
1157
1158
  	loff_t off = page->index << PAGE_CACHE_SHIFT;
  	loff_t size, len;
1d3576fd1   Sage Weil   ceph: address spa...
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
  	int ret;
  
  	size = i_size_read(inode);
  	if (off + PAGE_CACHE_SIZE <= size)
  		len = PAGE_CACHE_SIZE;
  	else
  		len = size & ~PAGE_CACHE_MASK;
  
  	dout("page_mkwrite %p %llu~%llu page %p idx %lu
  ", inode,
  	     off, len, page, page->index);
4af6b2257   Yehuda Sadeh   ceph: refactor ce...
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
  
  	lock_page(page);
  
  	ret = VM_FAULT_NOPAGE;
  	if ((off > size) ||
  	    (page->mapping != inode->i_mapping))
  		goto out;
  
  	ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
  	if (ret == 0) {
  		/* success.  we'll keep the page locked. */
1d3576fd1   Sage Weil   ceph: address spa...
1181
1182
  		set_page_dirty(page);
  		up_read(&mdsc->snap_rwsem);
1d3576fd1   Sage Weil   ceph: address spa...
1183
1184
  		ret = VM_FAULT_LOCKED;
  	} else {
4af6b2257   Yehuda Sadeh   ceph: refactor ce...
1185
1186
1187
1188
  		if (ret == -ENOMEM)
  			ret = VM_FAULT_OOM;
  		else
  			ret = VM_FAULT_SIGBUS;
1d3576fd1   Sage Weil   ceph: address spa...
1189
  	}
4af6b2257   Yehuda Sadeh   ceph: refactor ce...
1190
  out:
1d3576fd1   Sage Weil   ceph: address spa...
1191
1192
  	dout("page_mkwrite %p %llu~%llu = %d
  ", inode, off, len, ret);
4af6b2257   Yehuda Sadeh   ceph: refactor ce...
1193
1194
  	if (ret != VM_FAULT_LOCKED)
  		unlock_page(page);
1d3576fd1   Sage Weil   ceph: address spa...
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
  	return ret;
  }
  
  /* vm operations installed on a vma by ceph_mmap() */
  static struct vm_operations_struct ceph_vmops = {
  	.page_mkwrite	= ceph_page_mkwrite,
  	.fault		= filemap_fault,
  };
  
  int ceph_mmap(struct file *file, struct vm_area_struct *vma)
  {
  	struct address_space *mapping = file->f_mapping;
  
  	if (!mapping->a_ops->readpage)
  		return -ENOEXEC;
  	file_accessed(file);
  	vma->vm_ops = &ceph_vmops;
  	vma->vm_flags |= VM_CAN_NONLINEAR;
  	return 0;
  }