Blame view

fs/ceph/addr.c 33.1 KB
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
1
  #include <linux/ceph/ceph_debug.h>
1d3576fd1   Sage Weil   ceph: address spa...
2
3
4
5
6
7
  
  #include <linux/backing-dev.h>
  #include <linux/fs.h>
  #include <linux/mm.h>
  #include <linux/pagemap.h>
  #include <linux/writeback.h>	/* generic_writepages */
5a0e3ad6a   Tejun Heo   include cleanup: ...
8
  #include <linux/slab.h>
1d3576fd1   Sage Weil   ceph: address spa...
9
10
11
12
  #include <linux/pagevec.h>
  #include <linux/task_io_accounting_ops.h>
  
  #include "super.h"
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
13
14
  #include "mds_client.h"
  #include <linux/ceph/osd_client.h>
1d3576fd1   Sage Weil   ceph: address spa...
15
16
17
18
19
20
21
22
23
24
25
26
  
  /*
   * Ceph address space ops.
   *
   * There are a few funny things going on here.
   *
   * The page->private field is used to reference a struct
   * ceph_snap_context for _every_ dirty page.  This indicates which
   * snapshot the page was logically dirtied in, and thus which snap
   * context needs to be associated with the osd write during writeback.
   *
   * Similarly, struct ceph_inode_info maintains a set of counters to
25985edce   Lucas De Marchi   Fix common misspe...
27
   * count dirty pages on the inode.  In the absence of snapshots,
1d3576fd1   Sage Weil   ceph: address spa...
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
   * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
   *
   * When a snapshot is taken (that is, when the client receives
   * notification that a snapshot was taken), each inode with caps and
   * with dirty pages (dirty pages implies there is a cap) gets a new
   * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
   * order, new snaps go to the tail).  The i_wrbuffer_ref_head count is
   * moved to capsnap->dirty. (Unless a sync write is currently in
   * progress.  In that case, the capsnap is said to be "pending", new
   * writes cannot start, and the capsnap isn't "finalized" until the
   * write completes (or fails) and a final size/mtime for the inode for
   * that snap can be settled upon.)  i_wrbuffer_ref_head is reset to 0.
   *
   * On writeback, we must submit writes to the osd IN SNAP ORDER.  So,
   * we look for the first capsnap in i_cap_snaps and write out pages in
   * that snap context _only_.  Then we move on to the next capsnap,
   * eventually reaching the "live" or "head" context (i.e., pages that
   * are not yet snapped) and are writing the most recently dirtied
   * pages.
   *
   * Invalidate and so forth must take care to ensure the dirty page
   * accounting is preserved.
   */
2baba2501   Yehuda Sadeh   ceph: writeback c...
51
52
53
54
  /*
   * Writeback congestion thresholds, expressed in pages.
   *
   * CONGESTION_ON_THRESH converts a limit given in kilobytes into a page
   * count; CONGESTION_OFF_THRESH is three quarters of that value.
   *
   * The argument is parenthesized so that an expression argument containing
   * operators of lower precedence than '>>' (e.g. 'a | b' or '?:') is
   * shifted as a whole.
   */
  #define CONGESTION_ON_THRESH(congestion_kb)				\
  	((congestion_kb) >> (PAGE_SHIFT-10))
  #define CONGESTION_OFF_THRESH(congestion_kb)				\
  	(CONGESTION_ON_THRESH(congestion_kb) -				\
  	 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
1d3576fd1   Sage Weil   ceph: address spa...
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
  
  /*
   * Dirty a page.  Optimistically adjust accounting, on the assumption
   * that we won't race with invalidate.  If we do, readjust.
   */
  static int ceph_set_page_dirty(struct page *page)
  {
  	struct address_space *mapping = page->mapping;
  	struct inode *inode;
  	struct ceph_inode_info *ci;
  	int undo = 0;
  	struct ceph_snap_context *snapc;
  
  	if (unlikely(!mapping))
  		return !TestSetPageDirty(page);
  
  	if (TestSetPageDirty(page)) {
  		dout("%p set_page_dirty %p idx %lu -- already dirty
  ",
  		     mapping->host, page, page->index);
  		return 0;
  	}
  
  	inode = mapping->host;
  	ci = ceph_inode(inode);
  
  	/*
  	 * Note that we're grabbing a snapc ref here without holding
  	 * any locks!
  	 */
  	snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
  
  	/* dirty the head */
be655596b   Sage Weil   ceph: use i_ceph_...
88
  	spin_lock(&ci->i_ceph_lock);
7d8cb26d7   Sage Weil   ceph: maintain i_...
89
  	if (ci->i_head_snapc == NULL)
1d3576fd1   Sage Weil   ceph: address spa...
90
91
92
  		ci->i_head_snapc = ceph_get_snap_context(snapc);
  	++ci->i_wrbuffer_ref_head;
  	if (ci->i_wrbuffer_ref == 0)
0444d76ae   Dave Chinner   fs: don't use igr...
93
  		ihold(inode);
1d3576fd1   Sage Weil   ceph: address spa...
94
95
96
97
98
99
100
101
  	++ci->i_wrbuffer_ref;
  	dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
  	     "snapc %p seq %lld (%d snaps)
  ",
  	     mapping->host, page, page->index,
  	     ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
  	     ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
  	     snapc, snapc->seq, snapc->num_snaps);
be655596b   Sage Weil   ceph: use i_ceph_...
102
  	spin_unlock(&ci->i_ceph_lock);
1d3576fd1   Sage Weil   ceph: address spa...
103
104
105
106
107
  
  	/* now adjust page */
  	spin_lock_irq(&mapping->tree_lock);
  	if (page->mapping) {	/* Race with truncate? */
  		WARN_ON_ONCE(!PageUptodate(page));
679ceace8   Michael Rubin   mm: exporting acc...
108
  		account_page_dirtied(page, page->mapping);
1d3576fd1   Sage Weil   ceph: address spa...
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
  		radix_tree_tag_set(&mapping->page_tree,
  				page_index(page), PAGECACHE_TAG_DIRTY);
  
  		/*
  		 * Reference snap context in page->private.  Also set
  		 * PagePrivate so that we get invalidatepage callback.
  		 */
  		page->private = (unsigned long)snapc;
  		SetPagePrivate(page);
  	} else {
  		dout("ANON set_page_dirty %p (raced truncate?)
  ", page);
  		undo = 1;
  	}
  
  	spin_unlock_irq(&mapping->tree_lock);
  
  	if (undo)
  		/* whoops, we failed to dirty the page */
  		ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
  
  	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
  
  	BUG_ON(!PageDirty(page));
  	return 1;
  }
  
  /*
   * Invalidate (part of) a page that carries private data.
   *
   * Only a full-page invalidation (offset == 0) changes anything: the
   * dirty page counters are dropped, the snap context reference held in
   * page->private is released and the private state is cleared.  A
   * partial invalidation is merely logged.
   */
  static void ceph_invalidatepage(struct page *page, unsigned long offset)
  {
  	struct ceph_snap_context *snapc = (void *)page->private;
  	struct ceph_inode_info *ci;
  	struct inode *inode;

  	BUG_ON(!PageLocked(page));
  	BUG_ON(!page->private);
  	BUG_ON(!PagePrivate(page));
  	BUG_ON(!page->mapping);

  	inode = page->mapping->host;

  	/*
  	 * set_page_dirty can race with truncate_complete_page, so a
  	 * clean page may show up here.  Complain loudly so that later
  	 * accounting problems can be traced back to this.
  	 */
  	if (!PageDirty(page))
  		pr_err("%p invalidatepage %p page not dirty\n", inode, page);

  	if (offset != 0) {
  		dout("%p invalidatepage %p idx %lu partial dirty page\n",
  		     inode, page, page->index);
  		return;
  	}

  	/* whole page is going away */
  	ClearPageChecked(page);
  	ci = ceph_inode(inode);
  	dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
  	     inode, page, page->index, offset);
  	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
  	ceph_put_snap_context(snapc);
  	page->private = 0;
  	ClearPagePrivate(page);
  }
  
  /*
   * Sanity check only: warn if the page is dirty or still carries private
   * state.  Always returns 0.
   */
  static int ceph_releasepage(struct page *page, gfp_t g)
  {
  	struct inode *inode = NULL;

  	if (page->mapping)
  		inode = page->mapping->host;

  	dout("%p releasepage %p idx %lu\n", inode, page, page->index);
  	WARN_ON(PageDirty(page));
  	WARN_ON(page->private);
  	WARN_ON(PagePrivate(page));
  	return 0;
  }
  
  /*
   * read a single page, without unlocking it.
   */
  static int readpage_nounlock(struct file *filp, struct page *page)
  {
  	struct inode *inode = filp->f_dentry->d_inode;
  	struct ceph_inode_info *ci = ceph_inode(inode);
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
199
200
  	struct ceph_osd_client *osdc = 
  		&ceph_inode_to_client(inode)->client->osdc;
1d3576fd1   Sage Weil   ceph: address spa...
201
202
203
204
205
206
207
208
209
  	int err = 0;
  	u64 len = PAGE_CACHE_SIZE;
  
  	dout("readpage inode %p file %p page %p index %lu
  ",
  	     inode, filp, page, page->index);
  	err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
  				  page->index << PAGE_CACHE_SHIFT, &len,
  				  ci->i_truncate_seq, ci->i_truncate_size,
b7495fc2f   Sage Weil   ceph: make page a...
210
  				  &page, 1, 0);
1d3576fd1   Sage Weil   ceph: address spa...
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
  	if (err == -ENOENT)
  		err = 0;
  	if (err < 0) {
  		SetPageError(page);
  		goto out;
  	} else if (err < PAGE_CACHE_SIZE) {
  		/* zero fill remainder of page */
  		zero_user_segment(page, err, PAGE_CACHE_SIZE);
  	}
  	SetPageUptodate(page);
  
  out:
  	return err < 0 ? err : 0;
  }
  
  /*
   * Read one page and unlock it, whether or not the read succeeded.
   * Returns the result of readpage_nounlock().
   */
  static int ceph_readpage(struct file *filp, struct page *page)
  {
  	int ret;

  	ret = readpage_nounlock(filp, page);
  	unlock_page(page);
  	return ret;
  }
  
  /*
7c272194e   Sage Weil   ceph: make readpa...
234
   * Finish an async read(ahead) op.
1d3576fd1   Sage Weil   ceph: address spa...
235
   */
7c272194e   Sage Weil   ceph: make readpa...
236
  static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
1d3576fd1   Sage Weil   ceph: address spa...
237
  {
7c272194e   Sage Weil   ceph: make readpa...
238
239
240
241
  	struct inode *inode = req->r_inode;
  	struct ceph_osd_reply_head *replyhead;
  	int rc, bytes;
  	int i;
1d3576fd1   Sage Weil   ceph: address spa...
242

7c272194e   Sage Weil   ceph: make readpa...
243
244
245
246
247
  	/* parse reply */
  	replyhead = msg->front.iov_base;
  	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
  	rc = le32_to_cpu(replyhead->result);
  	bytes = le32_to_cpu(msg->hdr.data_len);
1d3576fd1   Sage Weil   ceph: address spa...
248

7c272194e   Sage Weil   ceph: make readpa...
249
250
251
252
253
254
255
256
257
258
259
  	dout("finish_read %p req %p rc %d bytes %d
  ", inode, req, rc, bytes);
  
  	/* unlock all pages, zeroing any data we didn't read */
  	for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) {
  		struct page *page = req->r_pages[i];
  
  		if (bytes < (int)PAGE_CACHE_SIZE) {
  			/* zero (remainder of) page */
  			int s = bytes < 0 ? 0 : bytes;
  			zero_user_segment(page, s, PAGE_CACHE_SIZE);
1d3576fd1   Sage Weil   ceph: address spa...
260
  		}
7c272194e   Sage Weil   ceph: make readpa...
261
262
263
264
265
266
267
   		dout("finish_read %p uptodate %p idx %lu
  ", inode, page,
  		     page->index);
  		flush_dcache_page(page);
  		SetPageUptodate(page);
  		unlock_page(page);
  		page_cache_release(page);
1d3576fd1   Sage Weil   ceph: address spa...
268
  	}
7c272194e   Sage Weil   ceph: make readpa...
269
  	kfree(req->r_pages);
1d3576fd1   Sage Weil   ceph: address spa...
270
271
272
  }
  
  /*
7c272194e   Sage Weil   ceph: make readpa...
273
274
   * start an async read(ahead) operation.  return nr_pages we submitted
   * a read for on success, or negative error code.
1d3576fd1   Sage Weil   ceph: address spa...
275
   */
0d66a487c   Sage Weil   ceph: implement (...
276
  static int start_read(struct inode *inode, struct list_head *page_list, int max)
1d3576fd1   Sage Weil   ceph: address spa...
277
  {
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
278
279
  	struct ceph_osd_client *osdc =
  		&ceph_inode_to_client(inode)->client->osdc;
7c272194e   Sage Weil   ceph: make readpa...
280
281
282
283
  	struct ceph_inode_info *ci = ceph_inode(inode);
  	struct page *page = list_entry(page_list->prev, struct page, lru);
  	struct ceph_osd_request *req;
  	u64 off;
1d3576fd1   Sage Weil   ceph: address spa...
284
  	u64 len;
7c272194e   Sage Weil   ceph: make readpa...
285
286
287
288
289
  	int i;
  	struct page **pages;
  	pgoff_t next_index;
  	int nr_pages = 0;
  	int ret;
1d3576fd1   Sage Weil   ceph: address spa...
290

7c272194e   Sage Weil   ceph: make readpa...
291
  	off = page->index << PAGE_CACHE_SHIFT;
1d3576fd1   Sage Weil   ceph: address spa...
292

7c272194e   Sage Weil   ceph: make readpa...
293
294
295
296
297
298
299
  	/* count pages */
  	next_index = page->index;
  	list_for_each_entry_reverse(page, page_list, lru) {
  		if (page->index != next_index)
  			break;
  		nr_pages++;
  		next_index++;
0d66a487c   Sage Weil   ceph: implement (...
300
301
  		if (max && nr_pages == max)
  			break;
7c272194e   Sage Weil   ceph: make readpa...
302
  	}
1d3576fd1   Sage Weil   ceph: address spa...
303
  	len = nr_pages << PAGE_CACHE_SHIFT;
7c272194e   Sage Weil   ceph: make readpa...
304
305
306
307
308
309
310
311
312
313
314
315
  	dout("start_read %p nr_pages %d is %lld~%lld
  ", inode, nr_pages,
  	     off, len);
  
  	req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode),
  				    off, &len,
  				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
  				    NULL, 0,
  				    ci->i_truncate_seq, ci->i_truncate_size,
  				    NULL, false, 1, 0);
  	if (!req)
  		return -ENOMEM;
1d3576fd1   Sage Weil   ceph: address spa...
316

7c272194e   Sage Weil   ceph: make readpa...
317
318
319
320
321
322
323
324
325
  	/* build page vector */
  	nr_pages = len >> PAGE_CACHE_SHIFT;
  	pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
  	ret = -ENOMEM;
  	if (!pages)
  		goto out;
  	for (i = 0; i < nr_pages; ++i) {
  		page = list_entry(page_list->prev, struct page, lru);
  		BUG_ON(PageLocked(page));
1d3576fd1   Sage Weil   ceph: address spa...
326
  		list_del(&page->lru);
7c272194e   Sage Weil   ceph: make readpa...
327
328
329
330
331
  		
   		dout("start_read %p adding %p idx %lu
  ", inode, page,
  		     page->index);
  		if (add_to_page_cache_lru(page, &inode->i_data, page->index,
213c99ee0   Sage Weil   ceph: whitespace ...
332
  					  GFP_NOFS)) {
1d3576fd1   Sage Weil   ceph: address spa...
333
  			page_cache_release(page);
7c272194e   Sage Weil   ceph: make readpa...
334
335
  			dout("start_read %p add_to_page_cache failed %p
  ",
1d3576fd1   Sage Weil   ceph: address spa...
336
  			     inode, page);
7c272194e   Sage Weil   ceph: make readpa...
337
338
  			nr_pages = i;
  			goto out_pages;
1d3576fd1   Sage Weil   ceph: address spa...
339
  		}
7c272194e   Sage Weil   ceph: make readpa...
340
  		pages[i] = page;
1d3576fd1   Sage Weil   ceph: address spa...
341
  	}
7c272194e   Sage Weil   ceph: make readpa...
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
  	req->r_pages = pages;
  	req->r_num_pages = nr_pages;
  	req->r_callback = finish_read;
  	req->r_inode = inode;
  
  	dout("start_read %p starting %p %lld~%lld
  ", inode, req, off, len);
  	ret = ceph_osdc_start_request(osdc, req, false);
  	if (ret < 0)
  		goto out_pages;
  	ceph_osdc_put_request(req);
  	return nr_pages;
  
  out_pages:
  	ceph_release_page_vector(pages, nr_pages);
7c272194e   Sage Weil   ceph: make readpa...
357
358
359
360
  out:
  	ceph_osdc_put_request(req);
  	return ret;
  }
1d3576fd1   Sage Weil   ceph: address spa...
361

7c272194e   Sage Weil   ceph: make readpa...
362
363
364
365
366
367
368
369
370
  
  /*
   * Read multiple pages.  Leave pages we don't read + unlock in page_list;
   * the caller (VM) cleans them up.
   */
  static int ceph_readpages(struct file *file, struct address_space *mapping,
  			  struct list_head *page_list, unsigned nr_pages)
  {
  	struct inode *inode = file->f_dentry->d_inode;
0d66a487c   Sage Weil   ceph: implement (...
371
  	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
7c272194e   Sage Weil   ceph: make readpa...
372
  	int rc = 0;
0d66a487c   Sage Weil   ceph: implement (...
373
374
375
376
377
  	int max = 0;
  
  	if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
  		max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
  			>> PAGE_SHIFT;
7c272194e   Sage Weil   ceph: make readpa...
378

0d66a487c   Sage Weil   ceph: implement (...
379
380
381
  	dout("readpages %p file %p nr_pages %d max %d
  ", inode, file, nr_pages,
  	     max);
7c272194e   Sage Weil   ceph: make readpa...
382
  	while (!list_empty(page_list)) {
0d66a487c   Sage Weil   ceph: implement (...
383
  		rc = start_read(inode, page_list, max);
7c272194e   Sage Weil   ceph: make readpa...
384
385
386
387
  		if (rc < 0)
  			goto out;
  		BUG_ON(rc == 0);
  	}
1d3576fd1   Sage Weil   ceph: address spa...
388
  out:
7c272194e   Sage Weil   ceph: make readpa...
389
390
  	dout("readpages %p file %p ret %d
  ", inode, file, rc);
1d3576fd1   Sage Weil   ceph: address spa...
391
392
393
394
395
396
  	return rc;
  }
  
  /*
   * Get ref for the oldest snapc for an inode with dirty data... that is, the
   * only snap context we are allowed to write back.
1d3576fd1   Sage Weil   ceph: address spa...
397
   */
6298a3375   Sage Weil   ceph: fix snap co...
398
399
  static struct ceph_snap_context *get_oldest_context(struct inode *inode,
  						    u64 *snap_size)
1d3576fd1   Sage Weil   ceph: address spa...
400
401
402
403
  {
  	struct ceph_inode_info *ci = ceph_inode(inode);
  	struct ceph_snap_context *snapc = NULL;
  	struct ceph_cap_snap *capsnap = NULL;
be655596b   Sage Weil   ceph: use i_ceph_...
404
  	spin_lock(&ci->i_ceph_lock);
1d3576fd1   Sage Weil   ceph: address spa...
405
406
407
408
409
410
411
412
413
414
415
  	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
  		dout(" cap_snap %p snapc %p has %d dirty pages
  ", capsnap,
  		     capsnap->context, capsnap->dirty_pages);
  		if (capsnap->dirty_pages) {
  			snapc = ceph_get_snap_context(capsnap->context);
  			if (snap_size)
  				*snap_size = capsnap->size;
  			break;
  		}
  	}
7d8cb26d7   Sage Weil   ceph: maintain i_...
416
  	if (!snapc && ci->i_wrbuffer_ref_head) {
80e755fed   Sage Weil   ceph: allow write...
417
  		snapc = ceph_get_snap_context(ci->i_head_snapc);
1d3576fd1   Sage Weil   ceph: address spa...
418
419
420
421
  		dout(" head snapc %p has %d dirty pages
  ",
  		     snapc, ci->i_wrbuffer_ref_head);
  	}
be655596b   Sage Weil   ceph: use i_ceph_...
422
  	spin_unlock(&ci->i_ceph_lock);
1d3576fd1   Sage Weil   ceph: address spa...
423
424
425
426
427
428
429
430
431
432
433
434
435
  	return snapc;
  }
  
  /*
   * Write a single page, but leave the page locked.
   *
   * If we get a write error, set the page error bit, but still adjust the
   * dirty page accounting (i.e., page is no longer dirty).
   */
  static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
  {
  	struct inode *inode;
  	struct ceph_inode_info *ci;
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
436
  	struct ceph_fs_client *fsc;
1d3576fd1   Sage Weil   ceph: address spa...
437
438
439
440
441
  	struct ceph_osd_client *osdc;
  	loff_t page_off = page->index << PAGE_CACHE_SHIFT;
  	int len = PAGE_CACHE_SIZE;
  	loff_t i_size;
  	int err = 0;
6298a3375   Sage Weil   ceph: fix snap co...
442
  	struct ceph_snap_context *snapc, *oldest;
1d3576fd1   Sage Weil   ceph: address spa...
443
  	u64 snap_size = 0;
2baba2501   Yehuda Sadeh   ceph: writeback c...
444
  	long writeback_stat;
1d3576fd1   Sage Weil   ceph: address spa...
445
446
447
448
449
450
451
452
453
454
455
  
  	dout("writepage %p idx %lu
  ", page, page->index);
  
  	if (!page->mapping || !page->mapping->host) {
  		dout("writepage %p - no mapping
  ", page);
  		return -EFAULT;
  	}
  	inode = page->mapping->host;
  	ci = ceph_inode(inode);
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
456
457
  	fsc = ceph_inode_to_client(inode);
  	osdc = &fsc->client->osdc;
1d3576fd1   Sage Weil   ceph: address spa...
458
459
460
461
462
463
464
465
  
  	/* verify this is a writeable snap context */
  	snapc = (void *)page->private;
  	if (snapc == NULL) {
  		dout("writepage %p page %p not dirty?
  ", inode, page);
  		goto out;
  	}
6298a3375   Sage Weil   ceph: fix snap co...
466
467
  	oldest = get_oldest_context(inode, &snap_size);
  	if (snapc->seq > oldest->seq) {
1d3576fd1   Sage Weil   ceph: address spa...
468
469
470
471
472
  		dout("writepage %p page %p snapc %p not writeable - noop
  ",
  		     inode, page, (void *)page->private);
  		/* we should only noop if called by kswapd */
  		WARN_ON((current->flags & PF_MEMALLOC) == 0);
6298a3375   Sage Weil   ceph: fix snap co...
473
  		ceph_put_snap_context(oldest);
1d3576fd1   Sage Weil   ceph: address spa...
474
475
  		goto out;
  	}
6298a3375   Sage Weil   ceph: fix snap co...
476
  	ceph_put_snap_context(oldest);
1d3576fd1   Sage Weil   ceph: address spa...
477
478
479
480
481
482
483
484
  
  	/* is this a partial page at end of file? */
  	if (snap_size)
  		i_size = snap_size;
  	else
  		i_size = i_size_read(inode);
  	if (i_size < page_off + len)
  		len = i_size - page_off;
ae00d4f37   Sage Weil   ceph: fix cap_sna...
485
486
487
  	dout("writepage %p page %p index %lu on %llu~%u snapc %p
  ",
  	     inode, page, page->index, page_off, len, snapc);
1d3576fd1   Sage Weil   ceph: address spa...
488

3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
489
  	writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
2baba2501   Yehuda Sadeh   ceph: writeback c...
490
  	if (writeback_stat >
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
491
492
  	    CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
  		set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
2baba2501   Yehuda Sadeh   ceph: writeback c...
493

1d3576fd1   Sage Weil   ceph: address spa...
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
  	set_page_writeback(page);
  	err = ceph_osdc_writepages(osdc, ceph_vino(inode),
  				   &ci->i_layout, snapc,
  				   page_off, len,
  				   ci->i_truncate_seq, ci->i_truncate_size,
  				   &inode->i_mtime,
  				   &page, 1, 0, 0, true);
  	if (err < 0) {
  		dout("writepage setting page/mapping error %d %p
  ", err, page);
  		SetPageError(page);
  		mapping_set_error(&inode->i_data, err);
  		if (wbc)
  			wbc->pages_skipped++;
  	} else {
  		dout("writepage cleaned page %p
  ", page);
  		err = 0;  /* vfs expects us to return 0 */
  	}
  	page->private = 0;
  	ClearPagePrivate(page);
  	end_page_writeback(page);
  	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
6298a3375   Sage Weil   ceph: fix snap co...
517
  	ceph_put_snap_context(snapc);  /* page's reference */
1d3576fd1   Sage Weil   ceph: address spa...
518
519
520
521
522
523
  out:
  	return err;
  }
  
  /*
   * Write a single page and unlock it.  An inode reference is held
   * across the write and the unlock, and dropped afterwards.
   * Returns the result of writepage_nounlock().
   */
  static int ceph_writepage(struct page *page, struct writeback_control *wbc)
  {
  	struct inode *inode = page->mapping->host;
  	int ret;

  	BUG_ON(!inode);

  	ihold(inode);
  	ret = writepage_nounlock(page, wbc);
  	unlock_page(page);
  	iput(inode);

  	return ret;
  }
  
  
  /*
   * lame release_pages helper.  release_pages() isn't exported to
   * modules.
   */
  static void ceph_release_pages(struct page **pages, int num)
  {
  	struct pagevec pvec;
  	int i;
  
  	pagevec_init(&pvec, 0);
  	for (i = 0; i < num; i++) {
  		if (pagevec_add(&pvec, pages[i]) == 0)
  			pagevec_release(&pvec);
  	}
  	pagevec_release(&pvec);
  }
  
  
  /*
   * async writeback completion handler.
   *
   * If we get an error, set the mapping error bit, but not the individual
   * page error bits.
   */
  static void writepages_finish(struct ceph_osd_request *req,
  			      struct ceph_msg *msg)
  {
  	struct inode *inode = req->r_inode;
  	struct ceph_osd_reply_head *replyhead;
  	struct ceph_osd_op *op;
  	struct ceph_inode_info *ci = ceph_inode(inode);
  	unsigned wrote;
1d3576fd1   Sage Weil   ceph: address spa...
567
568
569
570
  	struct page *page;
  	int i;
  	struct ceph_snap_context *snapc = req->r_snapc;
  	struct address_space *mapping = inode->i_mapping;
1d3576fd1   Sage Weil   ceph: address spa...
571
572
  	__s32 rc = -EIO;
  	u64 bytes = 0;
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
573
  	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
2baba2501   Yehuda Sadeh   ceph: writeback c...
574
  	long writeback_stat;
7ff899da0   Sage Weil   ceph: fix lockles...
575
  	unsigned issued = ceph_caps_issued(ci);
1d3576fd1   Sage Weil   ceph: address spa...
576
577
578
579
580
581
582
583
584
  
  	/* parse reply */
  	replyhead = msg->front.iov_base;
  	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
  	op = (void *)(replyhead + 1);
  	rc = le32_to_cpu(replyhead->result);
  	bytes = le64_to_cpu(op->extent.length);
  
  	if (rc >= 0) {
79788c698   Sage Weil   ceph: release all...
585
586
587
588
589
590
591
  		/*
  		 * Assume we wrote the pages we originally sent.  The
  		 * osd might reply with fewer pages if our writeback
  		 * raced with a truncation and was adjusted at the osd,
  		 * so don't believe the reply.
  		 */
  		wrote = req->r_num_pages;
1d3576fd1   Sage Weil   ceph: address spa...
592
593
594
595
596
597
598
599
600
601
602
603
604
  	} else {
  		wrote = 0;
  		mapping_set_error(mapping, rc);
  	}
  	dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)
  ",
  	     inode, rc, bytes, wrote);
  
  	/* clean all pages */
  	for (i = 0; i < req->r_num_pages; i++) {
  		page = req->r_pages[i];
  		BUG_ON(!page);
  		WARN_ON(!PageUptodate(page));
2baba2501   Yehuda Sadeh   ceph: writeback c...
605
  		writeback_stat =
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
606
  			atomic_long_dec_return(&fsc->writeback_count);
2baba2501   Yehuda Sadeh   ceph: writeback c...
607
  		if (writeback_stat <
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
608
609
  		    CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
  			clear_bdi_congested(&fsc->backing_dev_info,
2baba2501   Yehuda Sadeh   ceph: writeback c...
610
  					    BLK_RW_ASYNC);
80e755fed   Sage Weil   ceph: allow write...
611
  		ceph_put_snap_context((void *)page->private);
1d3576fd1   Sage Weil   ceph: address spa...
612
613
  		page->private = 0;
  		ClearPagePrivate(page);
1d3576fd1   Sage Weil   ceph: address spa...
614
615
616
  		dout("unlocking %d %p
  ", i, page);
  		end_page_writeback(page);
e63dc5c78   Yehuda Sadeh   ceph: remove page...
617
618
619
620
621
622
623
  
  		/*
  		 * We lost the cache cap, need to truncate the page before
  		 * it is unlocked, otherwise we'd truncate it later in the
  		 * page truncation thread, possibly losing some data that
  		 * raced its way in
  		 */
2962507ca   Sage Weil   ceph: perform laz...
624
  		if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
e63dc5c78   Yehuda Sadeh   ceph: remove page...
625
  			generic_error_remove_page(inode->i_mapping, page);
1d3576fd1   Sage Weil   ceph: address spa...
626
627
628
629
630
631
632
633
634
  		unlock_page(page);
  	}
  	dout("%p wrote+cleaned %d pages
  ", inode, wrote);
  	ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
  
  	ceph_release_pages(req->r_pages, req->r_num_pages);
  	if (req->r_pages_from_pool)
  		mempool_free(req->r_pages,
640ef79d2   Cheng Renquan   ceph: use ceph_sb...
635
  			     ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
1d3576fd1   Sage Weil   ceph: address spa...
636
637
638
639
640
641
642
643
644
645
  	else
  		kfree(req->r_pages);
  	ceph_osdc_put_request(req);
  }
  
  /*
   * allocate a page vec, either directly, or if necessary, via a the
   * mempool.  we avoid the mempool if we can because req->r_num_pages
   * may be less than the maximum write size.
   */
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
646
  static void alloc_page_vec(struct ceph_fs_client *fsc,
1d3576fd1   Sage Weil   ceph: address spa...
647
648
649
650
651
  			   struct ceph_osd_request *req)
  {
  	req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
  			       GFP_NOFS);
  	if (!req->r_pages) {
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
652
  		req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
1d3576fd1   Sage Weil   ceph: address spa...
653
654
655
656
657
658
659
660
661
662
663
664
  		req->r_pages_from_pool = 1;
  		WARN_ON(!req->r_pages);
  	}
  }
  
  /*
   * initiate async writeback
   */
  static int ceph_writepages_start(struct address_space *mapping,
  				 struct writeback_control *wbc)
  {
  	struct inode *inode = mapping->host;
1d3576fd1   Sage Weil   ceph: address spa...
665
  	struct ceph_inode_info *ci = ceph_inode(inode);
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
666
  	struct ceph_fs_client *fsc;
1d3576fd1   Sage Weil   ceph: address spa...
667
668
669
670
  	pgoff_t index, start, end;
  	int range_whole = 0;
  	int should_loop = 1;
  	pgoff_t max_pages = 0, max_pages_ever = 0;
80e755fed   Sage Weil   ceph: allow write...
671
  	struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
1d3576fd1   Sage Weil   ceph: address spa...
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
  	struct pagevec pvec;
  	int done = 0;
  	int rc = 0;
  	unsigned wsize = 1 << inode->i_blkbits;
  	struct ceph_osd_request *req = NULL;
  	int do_sync;
  	u64 snap_size = 0;
  
  	/*
  	 * Include a 'sync' in the OSD request if this is a data
  	 * integrity write (e.g., O_SYNC write or fsync()), or if our
  	 * cap is being revoked.
  	 */
  	do_sync = wbc->sync_mode == WB_SYNC_ALL;
  	if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
  		do_sync = 1;
  	dout("writepages_start %p dosync=%d (mode=%s)
  ",
  	     inode, do_sync,
  	     wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
  	     (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
693
694
  	fsc = ceph_inode_to_client(inode);
  	if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
1d3576fd1   Sage Weil   ceph: address spa...
695
696
697
698
  		pr_warning("writepage_start %p on forced umount
  ", inode);
  		return -EIO; /* we're in a forced umount, don't write! */
  	}
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
699
700
  	if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
  		wsize = fsc->mount_options->wsize;
1d3576fd1   Sage Weil   ceph: address spa...
701
702
703
704
705
  	if (wsize < PAGE_CACHE_SIZE)
  		wsize = PAGE_CACHE_SIZE;
  	max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
  
  	pagevec_init(&pvec, 0);
1d3576fd1   Sage Weil   ceph: address spa...
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
  	/* where to start/end? */
  	if (wbc->range_cyclic) {
  		start = mapping->writeback_index; /* Start from prev offset */
  		end = -1;
  		dout(" cyclic, start at %lu
  ", start);
  	} else {
  		start = wbc->range_start >> PAGE_CACHE_SHIFT;
  		end = wbc->range_end >> PAGE_CACHE_SHIFT;
  		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
  			range_whole = 1;
  		should_loop = 0;
  		dout(" not cyclic, %lu to %lu
  ", start, end);
  	}
  	index = start;
  
  retry:
  	/* find oldest snap context with dirty data */
  	ceph_put_snap_context(snapc);
  	snapc = get_oldest_context(inode, &snap_size);
  	if (!snapc) {
  		/* hmm, why does writepages get called when there
  		   is no dirty data? */
  		dout(" no snap context with dirty data?
  ");
  		goto out;
  	}
  	dout(" oldest snapc is %p seq %lld (%d snaps)
  ",
  	     snapc, snapc->seq, snapc->num_snaps);
  	if (last_snapc && snapc != last_snapc) {
  		/* if we switched to a newer snapc, restart our scan at the
  		 * start of the original file range. */
  		dout("  snapc differs from last pass, restarting at %lu
  ",
  		     index);
  		index = start;
  	}
  	last_snapc = snapc;
  
  	while (!done && index <= end) {
  		unsigned i;
  		int first;
  		pgoff_t next;
  		int pvec_pages, locked_pages;
  		struct page *page;
  		int want;
  		u64 offset, len;
  		struct ceph_osd_request_head *reqhead;
  		struct ceph_osd_op *op;
2baba2501   Yehuda Sadeh   ceph: writeback c...
757
  		long writeback_stat;
1d3576fd1   Sage Weil   ceph: address spa...
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
  
  		next = 0;
  		locked_pages = 0;
  		max_pages = max_pages_ever;
  
  get_more_pages:
  		first = -1;
  		want = min(end - index,
  			   min((pgoff_t)PAGEVEC_SIZE,
  			       max_pages - (pgoff_t)locked_pages) - 1)
  			+ 1;
  		pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
  						PAGECACHE_TAG_DIRTY,
  						want);
  		dout("pagevec_lookup_tag got %d
  ", pvec_pages);
  		if (!pvec_pages && !locked_pages)
  			break;
  		for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
  			page = pvec.pages[i];
  			dout("? %p idx %lu
  ", page, page->index);
  			if (locked_pages == 0)
  				lock_page(page);  /* first page */
  			else if (!trylock_page(page))
  				break;
  
  			/* only dirty pages, or our accounting breaks */
  			if (unlikely(!PageDirty(page)) ||
  			    unlikely(page->mapping != mapping)) {
  				dout("!dirty or !mapping %p
  ", page);
  				unlock_page(page);
  				break;
  			}
  			if (!wbc->range_cyclic && page->index > end) {
  				dout("end of range %p
  ", page);
  				done = 1;
  				unlock_page(page);
  				break;
  			}
  			if (next && (page->index != next)) {
  				dout("not consecutive %p
  ", page);
  				unlock_page(page);
  				break;
  			}
  			if (wbc->sync_mode != WB_SYNC_NONE) {
  				dout("waiting on writeback %p
  ", page);
  				wait_on_page_writeback(page);
  			}
  			if ((snap_size && page_offset(page) > snap_size) ||
  			    (!snap_size &&
  			     page_offset(page) > i_size_read(inode))) {
  				dout("%p page eof %llu
  ", page, snap_size ?
  				     snap_size : i_size_read(inode));
  				done = 1;
  				unlock_page(page);
  				break;
  			}
  			if (PageWriteback(page)) {
  				dout("%p under writeback
  ", page);
  				unlock_page(page);
  				break;
  			}
  
  			/* only if matching snap context */
80e755fed   Sage Weil   ceph: allow write...
829
830
831
832
833
  			pgsnapc = (void *)page->private;
  			if (pgsnapc->seq > snapc->seq) {
  				dout("page snapc %p %lld > oldest %p %lld
  ",
  				     pgsnapc, pgsnapc->seq, snapc, snapc->seq);
1d3576fd1   Sage Weil   ceph: address spa...
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
  				unlock_page(page);
  				if (!locked_pages)
  					continue; /* keep looking for snap */
  				break;
  			}
  
  			if (!clear_page_dirty_for_io(page)) {
  				dout("%p !clear_page_dirty_for_io
  ", page);
  				unlock_page(page);
  				break;
  			}
  
  			/* ok */
  			if (locked_pages == 0) {
  				/* prepare async write request */
a77d9f7dc   Sage Weil   ceph: fix file of...
850
851
  				offset = (unsigned long long)page->index
  					<< PAGE_CACHE_SHIFT;
1d3576fd1   Sage Weil   ceph: address spa...
852
  				len = wsize;
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
853
  				req = ceph_osdc_new_request(&fsc->client->osdc,
1d3576fd1   Sage Weil   ceph: address spa...
854
855
856
857
858
859
860
861
862
  					    &ci->i_layout,
  					    ceph_vino(inode),
  					    offset, &len,
  					    CEPH_OSD_OP_WRITE,
  					    CEPH_OSD_FLAG_WRITE |
  						    CEPH_OSD_FLAG_ONDISK,
  					    snapc, do_sync,
  					    ci->i_truncate_seq,
  					    ci->i_truncate_size,
b7495fc2f   Sage Weil   ceph: make page a...
863
  					    &inode->i_mtime, true, 1, 0);
8c71897be   Henry C Chang   ceph: handle ceph...
864
865
866
867
868
869
  
  				if (!req) {
  					rc = -ENOMEM;
  					unlock_page(page);
  					break;
  				}
1d3576fd1   Sage Weil   ceph: address spa...
870
  				max_pages = req->r_num_pages;
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
871
  				alloc_page_vec(fsc, req);
1d3576fd1   Sage Weil   ceph: address spa...
872
873
  				req->r_callback = writepages_finish;
  				req->r_inode = inode;
1d3576fd1   Sage Weil   ceph: address spa...
874
875
876
877
878
879
880
881
  			}
  
  			/* note position of first page in pvec */
  			if (first < 0)
  				first = i;
  			dout("%p will write page %p idx %lu
  ",
  			     inode, page, page->index);
2baba2501   Yehuda Sadeh   ceph: writeback c...
882

213c99ee0   Sage Weil   ceph: whitespace ...
883
  			writeback_stat =
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
884
  			       atomic_long_inc_return(&fsc->writeback_count);
213c99ee0   Sage Weil   ceph: whitespace ...
885
  			if (writeback_stat > CONGESTION_ON_THRESH(
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
886
887
  				    fsc->mount_options->congestion_kb)) {
  				set_bdi_congested(&fsc->backing_dev_info,
213c99ee0   Sage Weil   ceph: whitespace ...
888
  						  BLK_RW_ASYNC);
2baba2501   Yehuda Sadeh   ceph: writeback c...
889
  			}
1d3576fd1   Sage Weil   ceph: address spa...
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
  			set_page_writeback(page);
  			req->r_pages[locked_pages] = page;
  			locked_pages++;
  			next = page->index + 1;
  		}
  
  		/* did we get anything? */
  		if (!locked_pages)
  			goto release_pvec_pages;
  		if (i) {
  			int j;
  			BUG_ON(!locked_pages || first < 0);
  
  			if (pvec_pages && i == pvec_pages &&
  			    locked_pages < max_pages) {
  				dout("reached end pvec, trying for more
  ");
  				pagevec_reinit(&pvec);
  				goto get_more_pages;
  			}
  
  			/* shift unused pages over in the pvec...  we
  			 * will need to release them below. */
  			for (j = i; j < pvec_pages; j++) {
  				dout(" pvec leftover page %p
  ",
  				     pvec.pages[j]);
  				pvec.pages[j-i+first] = pvec.pages[j];
  			}
  			pvec.nr -= i-first;
  		}
  
  		/* submit the write */
  		offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
  		len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
  			  (u64)locked_pages << PAGE_CACHE_SHIFT);
  		dout("writepages got %d pages at %llu~%llu
  ",
  		     locked_pages, offset, len);
  
  		/* revise final length, page count */
  		req->r_num_pages = locked_pages;
  		reqhead = req->r_request->front.iov_base;
  		op = (void *)(reqhead + 1);
  		op->extent.length = cpu_to_le64(len);
  		op->payload_len = cpu_to_le32(len);
  		req->r_request->hdr.data_len = cpu_to_le32(len);
9d6fcb081   Sage Weil   ceph: check retur...
937
938
  		rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
  		BUG_ON(rc);
1d3576fd1   Sage Weil   ceph: address spa...
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
  		req = NULL;
  
  		/* continue? */
  		index = next;
  		wbc->nr_to_write -= locked_pages;
  		if (wbc->nr_to_write <= 0)
  			done = 1;
  
  release_pvec_pages:
  		dout("pagevec_release on %d pages (%p)
  ", (int)pvec.nr,
  		     pvec.nr ? pvec.pages[0] : NULL);
  		pagevec_release(&pvec);
  
  		if (locked_pages && !done)
  			goto retry;
  	}
  
  	if (should_loop && !done) {
  		/* more to do; loop back to beginning of file */
  		dout("writepages looping back to beginning of file
  ");
  		should_loop = 0;
  		index = 0;
  		goto retry;
  	}
  
  	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
  		mapping->writeback_index = index;
  
  out:
  	if (req)
  		ceph_osdc_put_request(req);
1d3576fd1   Sage Weil   ceph: address spa...
972
973
974
  	ceph_put_snap_context(snapc);
  	dout("writepages done, rc = %d
  ", rc);
1d3576fd1   Sage Weil   ceph: address spa...
975
976
977
978
979
980
981
982
983
984
985
986
  	return rc;
  }
  
  
  
  /*
   * See if a given @snapc is either writeable, or already written.
   */
  static int context_is_writeable_or_written(struct inode *inode,
  					   struct ceph_snap_context *snapc)
  {
  	struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
6298a3375   Sage Weil   ceph: fix snap co...
987
988
989
990
  	int ret = !oldest || snapc->seq <= oldest->seq;
  
  	ceph_put_snap_context(oldest);
  	return ret;
1d3576fd1   Sage Weil   ceph: address spa...
991
992
993
994
995
  }
  
  /*
   * We are only allowed to write into/dirty the page if the page is
   * clean, or already dirty within the same snap context.
8f883c24d   Sage Weil   ceph: make write_...
996
997
998
999
   *
   * called with page locked.
   * return success with page locked,
   * or any failure (incl -EAGAIN) with page unlocked.
1d3576fd1   Sage Weil   ceph: address spa...
1000
   */
4af6b2257   Yehuda Sadeh   ceph: refactor ce...
1001
1002
1003
  static int ceph_update_writeable_page(struct file *file,
  			    loff_t pos, unsigned len,
  			    struct page *page)
1d3576fd1   Sage Weil   ceph: address spa...
1004
1005
1006
  {
  	struct inode *inode = file->f_dentry->d_inode;
  	struct ceph_inode_info *ci = ceph_inode(inode);
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
1007
  	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1d3576fd1   Sage Weil   ceph: address spa...
1008
1009
1010
1011
  	loff_t page_off = pos & PAGE_CACHE_MASK;
  	int pos_in_page = pos & ~PAGE_CACHE_MASK;
  	int end_in_page = pos_in_page + len;
  	loff_t i_size;
1d3576fd1   Sage Weil   ceph: address spa...
1012
  	int r;
80e755fed   Sage Weil   ceph: allow write...
1013
  	struct ceph_snap_context *snapc, *oldest;
1d3576fd1   Sage Weil   ceph: address spa...
1014

1d3576fd1   Sage Weil   ceph: address spa...
1015
1016
1017
1018
1019
1020
1021
1022
  retry_locked:
  	/* writepages currently holds page lock, but if we change that later, */
  	wait_on_page_writeback(page);
  
  	/* check snap context */
  	BUG_ON(!ci->i_snap_realm);
  	down_read(&mdsc->snap_rwsem);
  	BUG_ON(!ci->i_snap_realm->cached_context);
80e755fed   Sage Weil   ceph: allow write...
1023
1024
  	snapc = (void *)page->private;
  	if (snapc && snapc != ci->i_head_snapc) {
1d3576fd1   Sage Weil   ceph: address spa...
1025
1026
1027
1028
  		/*
  		 * this page is already dirty in another (older) snap
  		 * context!  is it writeable now?
  		 */
80e755fed   Sage Weil   ceph: allow write...
1029
  		oldest = get_oldest_context(inode, NULL);
1d3576fd1   Sage Weil   ceph: address spa...
1030
  		up_read(&mdsc->snap_rwsem);
80e755fed   Sage Weil   ceph: allow write...
1031
  		if (snapc->seq > oldest->seq) {
6298a3375   Sage Weil   ceph: fix snap co...
1032
  			ceph_put_snap_context(oldest);
1d3576fd1   Sage Weil   ceph: address spa...
1033
1034
  			dout(" page %p snapc %p not current or oldest
  ",
6298a3375   Sage Weil   ceph: fix snap co...
1035
  			     page, snapc);
1d3576fd1   Sage Weil   ceph: address spa...
1036
1037
1038
1039
  			/*
  			 * queue for writeback, and wait for snapc to
  			 * be writeable or written
  			 */
6298a3375   Sage Weil   ceph: fix snap co...
1040
  			snapc = ceph_get_snap_context(snapc);
1d3576fd1   Sage Weil   ceph: address spa...
1041
  			unlock_page(page);
3c6f6b79a   Sage Weil   ceph: cleanup asy...
1042
  			ceph_queue_writeback(inode);
8f883c24d   Sage Weil   ceph: make write_...
1043
  			r = wait_event_interruptible(ci->i_cap_wq,
1d3576fd1   Sage Weil   ceph: address spa...
1044
1045
  			       context_is_writeable_or_written(inode, snapc));
  			ceph_put_snap_context(snapc);
8f883c24d   Sage Weil   ceph: make write_...
1046
1047
  			if (r == -ERESTARTSYS)
  				return r;
4af6b2257   Yehuda Sadeh   ceph: refactor ce...
1048
  			return -EAGAIN;
1d3576fd1   Sage Weil   ceph: address spa...
1049
  		}
6298a3375   Sage Weil   ceph: fix snap co...
1050
  		ceph_put_snap_context(oldest);
1d3576fd1   Sage Weil   ceph: address spa...
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
  
  		/* yay, writeable, do it now (without dropping page lock) */
  		dout(" page %p snapc %p not current, but oldest
  ",
  		     page, snapc);
  		if (!clear_page_dirty_for_io(page))
  			goto retry_locked;
  		r = writepage_nounlock(page, NULL);
  		if (r < 0)
  			goto fail_nosnap;
  		goto retry_locked;
  	}
  
  	if (PageUptodate(page)) {
  		dout(" page %p already uptodate
  ", page);
  		return 0;
  	}
  
  	/* full page? */
  	if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
  		return 0;
  
  	/* past end of file? */
  	i_size = inode->i_size;   /* caller holds i_mutex */
  
  	if (i_size + len > inode->i_sb->s_maxbytes) {
  		/* file is too big */
  		r = -EINVAL;
  		goto fail;
  	}
  
  	if (page_off >= i_size ||
  	    (pos_in_page == 0 && (pos+len) >= i_size &&
  	     end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
  		dout(" zeroing %p 0 - %d and %d - %d
  ",
  		     page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
  		zero_user_segments(page,
  				   0, pos_in_page,
  				   end_in_page, PAGE_CACHE_SIZE);
  		return 0;
  	}
  
  	/* we need to read it. */
  	up_read(&mdsc->snap_rwsem);
  	r = readpage_nounlock(file, page);
  	if (r < 0)
  		goto fail_nosnap;
  	goto retry_locked;
  
  fail:
  	up_read(&mdsc->snap_rwsem);
  fail_nosnap:
  	unlock_page(page);
  	return r;
  }
  
  /*
   * We are only allowed to write into/dirty the page if the page is
   * clean, or already dirty within the same snap context.
   *
   * Grab the page cache page covering @pos and prepare it with
   * ceph_update_writeable_page(), starting over with a fresh lookup for
   * as long as that returns -EAGAIN.
   *
   * Returns 0 with *pagep pointing at the prepared page (which the
   * caller later hands to ceph_write_end()), -ENOMEM if no page could
   * be grabbed, or the negative error from ceph_update_writeable_page().
   * On failure the page reference is dropped here and *pagep is left
   * untouched.
   */
  static int ceph_write_begin(struct file *file, struct address_space *mapping,
  			    loff_t pos, unsigned len, unsigned flags,
  			    struct page **pagep, void **fsdata)
  {
  	struct inode *inode = file->f_dentry->d_inode;
  	struct page *page;
  	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
  	int r;

  	do {
  		/* get a page */
  		page = grab_cache_page_write_begin(mapping, index, 0);
  		if (!page)
  			return -ENOMEM;

  		dout("write_begin file %p inode %p page %p %d~%d\n", file,
  		     inode, page, (int)pos, (int)len);

  		r = ceph_update_writeable_page(file, pos, len, page);
  		if (r < 0) {
  			/*
  			 * The helper only unlocked the page; the reference
  			 * we took above is still ours to drop, otherwise it
  			 * leaks on every error and on every -EAGAIN retry.
  			 */
  			page_cache_release(page);
  		} else {
  			*pagep = page;
  		}
  	} while (r == -EAGAIN);

  	return r;
  }
  
  /*
   * we don't do anything in here that simple_write_end doesn't do
   * except adjust dirty page accounting and drop read lock on
   * mdsc->snap_rwsem.
   *
   * @pos/@len describe the range that was prepared by write_begin,
   * @copied is how much of it was actually filled in.  The page is
   * marked uptodate and dirty, unlocked and released, and the
   * snap_rwsem read lock left held by the write_begin path is dropped.
   *
   * Returns @copied.
   */
  static int ceph_write_end(struct file *file, struct address_space *mapping,
  			  loff_t pos, unsigned len, unsigned copied,
  			  struct page *page, void *fsdata)
  {
  	struct inode *inode = file->f_dentry->d_inode;
  	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
  	struct ceph_mds_client *mdsc = fsc->mdsc;
  	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
  	int check_cap = 0;

  	dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
  	     inode, page, (int)pos, (int)copied, (int)len);

  	/*
  	 * zero the stale part of the page if we did a short copy.
  	 * zero_user_segment() takes start and end offsets within the page,
  	 * so the end of the prepared range is from+len, not len.
  	 */
  	if (copied < len)
  		zero_user_segment(page, from+copied, from+len);

  	/* did file size increase? */
  	/* (no need for i_size_read(); our caller holds i_mutex) */
  	if (pos+copied > inode->i_size)
  		check_cap = ceph_inode_set_size(inode, pos+copied);

  	if (!PageUptodate(page))
  		SetPageUptodate(page);

  	set_page_dirty(page);

  	unlock_page(page);
  	up_read(&mdsc->snap_rwsem);
  	page_cache_release(page);

  	/* ceph_inode_set_size() asked for a cap check after the size change */
  	if (check_cap)
  		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);

  	return copied;
  }
  
  /*
   * .direct_IO is set only to advertise that direct io is supported.
   * O_DIRECT reads and writes are intercepted early, so this stub should
   * never run; if it does, warn and refuse the request with -EINVAL.
   */
  static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
  			      const struct iovec *iov,
  			      loff_t pos, unsigned long nr_segs)
  {
  	const ssize_t unsupported = -EINVAL;

  	/* getting here means a request slipped past the early intercept */
  	WARN_ON(1);
  	return unsupported;
  }
  
  /*
   * Address space operations table for ceph: maps each page cache
   * callback to its ceph implementation.
   */
  const struct address_space_operations ceph_aops = {
  	.readpage = ceph_readpage,
  	.readpages = ceph_readpages,
  	.writepage = ceph_writepage,
  	.writepages = ceph_writepages_start,
  	.write_begin = ceph_write_begin,
  	.write_end = ceph_write_end,
  	.set_page_dirty = ceph_set_page_dirty,
  	.invalidatepage = ceph_invalidatepage,
  	.releasepage = ceph_releasepage,
  	/* stub that warns and returns -EINVAL; see ceph_direct_io() */
  	.direct_IO = ceph_direct_io,
  };
  
  
  /*
   * vm ops
   */
  
  /*
   * Reuse write_begin here for simplicity.
   */
  static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
  {
  	struct inode *inode = vma->vm_file->f_dentry->d_inode;
  	struct page *page = vmf->page;
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
1220
  	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1d3576fd1   Sage Weil   ceph: address spa...
1221
1222
  	loff_t off = page->index << PAGE_CACHE_SHIFT;
  	loff_t size, len;
1d3576fd1   Sage Weil   ceph: address spa...
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
  	int ret;
  
  	size = i_size_read(inode);
  	if (off + PAGE_CACHE_SIZE <= size)
  		len = PAGE_CACHE_SIZE;
  	else
  		len = size & ~PAGE_CACHE_MASK;
  
  	dout("page_mkwrite %p %llu~%llu page %p idx %lu
  ", inode,
  	     off, len, page, page->index);
4af6b2257   Yehuda Sadeh   ceph: refactor ce...
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
  
  	lock_page(page);
  
  	ret = VM_FAULT_NOPAGE;
  	if ((off > size) ||
  	    (page->mapping != inode->i_mapping))
  		goto out;
  
  	ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
  	if (ret == 0) {
  		/* success.  we'll keep the page locked. */
1d3576fd1   Sage Weil   ceph: address spa...
1245
1246
  		set_page_dirty(page);
  		up_read(&mdsc->snap_rwsem);
1d3576fd1   Sage Weil   ceph: address spa...
1247
1248
  		ret = VM_FAULT_LOCKED;
  	} else {
4af6b2257   Yehuda Sadeh   ceph: refactor ce...
1249
1250
1251
1252
  		if (ret == -ENOMEM)
  			ret = VM_FAULT_OOM;
  		else
  			ret = VM_FAULT_SIGBUS;
1d3576fd1   Sage Weil   ceph: address spa...
1253
  	}
4af6b2257   Yehuda Sadeh   ceph: refactor ce...
1254
  out:
1d3576fd1   Sage Weil   ceph: address spa...
1255
1256
  	dout("page_mkwrite %p %llu~%llu = %d
  ", inode, off, len, ret);
4af6b2257   Yehuda Sadeh   ceph: refactor ce...
1257
1258
  	if (ret != VM_FAULT_LOCKED)
  		unlock_page(page);
1d3576fd1   Sage Weil   ceph: address spa...
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
  	return ret;
  }
  
  /*
   * vm operations installed on mappings by ceph_mmap(): page faults use
   * the generic filemap_fault, while making a page writeable goes
   * through ceph_page_mkwrite.
   */
  static struct vm_operations_struct ceph_vmops = {
  	.fault		= filemap_fault,
  	.page_mkwrite	= ceph_page_mkwrite,
  };
  
  int ceph_mmap(struct file *file, struct vm_area_struct *vma)
  {
  	struct address_space *mapping = file->f_mapping;
  
  	if (!mapping->a_ops->readpage)
  		return -ENOEXEC;
  	file_accessed(file);
  	vma->vm_ops = &ceph_vmops;
  	vma->vm_flags |= VM_CAN_NONLINEAR;
  	return 0;
  }