Blame view

fs/ceph/file.c 44.6 KB
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
1
  #include <linux/ceph/ceph_debug.h>
124e68e74   Sage Weil   ceph: file operat...
2

3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
3
  #include <linux/module.h>
124e68e74   Sage Weil   ceph: file operat...
4
  #include <linux/sched.h>
5a0e3ad6a   Tejun Heo   include cleanup: ...
5
  #include <linux/slab.h>
124e68e74   Sage Weil   ceph: file operat...
6
  #include <linux/file.h>
5ef50c3be   Sage Weil   ceph: simplify+fi...
7
  #include <linux/mount.h>
124e68e74   Sage Weil   ceph: file operat...
8
9
  #include <linux/namei.h>
  #include <linux/writeback.h>
ad7a60de8   Li Wang   ceph: punch hole ...
10
  #include <linux/falloc.h>
124e68e74   Sage Weil   ceph: file operat...
11
12
13
  
  #include "super.h"
  #include "mds_client.h"
99ccbd229   Milosz Tanski   ceph: use fscache...
14
  #include "cache.h"
124e68e74   Sage Weil   ceph: file operat...
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
  
  /*
   * Ceph file operations
   *
   * Implement basic open/close functionality, and implement
   * read/write.
   *
   * We implement three modes of file I/O:
   *  - buffered uses the generic_file_aio_{read,write} helpers
   *
   *  - synchronous is used when there is multi-client read/write
   *    sharing, avoids the page cache, and synchronously waits for an
   *    ack from the OSD.
   *
   *  - direct io takes the variant of the sync path that references
   *    user pages directly.
   *
   * fsync() flushes and waits on dirty pages, but just queues metadata
   * for writeback: since the MDS can recover size and mtime there is no
   * need to wait for MDS acknowledgement.
   */
b5b98989d   Zhu, Caifeng   ceph: combine as ...
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
  /*
   * Calculate the length sum of direct io vectors that can
   * be combined into one page vector.
   *
   * The count starts with what is left of the current segment
   * (iov_len minus it->iov_offset).  Each following segment is added
   * in full for as long as the current segment ends on a page boundary
   * and the next segment starts on one.
   *
   * Returns the accumulated length in bytes.
   */
  static size_t dio_get_pagev_size(const struct iov_iter *it)
  {
      const struct iovec *iov = it->iov;
      const struct iovec *iovend = iov + it->nr_segs;
      size_t size;

      size = iov->iov_len - it->iov_offset;
      /*
       * An iov can be page vectored when both the current tail
       * and the next base are page aligned.
       */
      while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
             (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
          size += iov->iov_len;
      }
      dout("dio_get_pagevlen len = %zu\n", size);
      return size;
  }
  
  /*
   * Allocate a page vector based on (@it, @nbytes).
   * The return value is the tuple describing a page vector,
   * that is (@pages, @page_align, @num_pages).
   *
   * The iterator itself is not advanced; a local copy is walked instead.
   * The pointer array comes from kmalloc(), falling back to vmalloc().
   *
   * Returns the page array on success, ERR_PTR(-ENOMEM) if the array
   * cannot be allocated, or ERR_PTR() of the iov_iter_get_pages() error
   * after the pages obtained so far have been put.
   */
  static struct page **
  dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
  		    size_t *page_align, int *num_pages)
  {
  	struct iov_iter tmp_it = *it;
  	size_t align;
  	struct page **pages;
  	int ret = 0, idx, npages;

  	/* offset of the first byte within its page */
  	align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
  		(PAGE_SIZE - 1);
  	npages = calc_pages_for(align, nbytes);
  	pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
  	if (!pages) {
  		pages = vmalloc(sizeof(*pages) * npages);
  		if (!pages)
  			return ERR_PTR(-ENOMEM);
  	}

  	for (idx = 0; idx < npages; ) {
  		size_t start;
  		ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes,
  					 npages - idx, &start);
  		if (ret < 0)
  			goto fail;

  		iov_iter_advance(&tmp_it, ret);
  		nbytes -= ret;
  		/* number of pages covered by the @ret bytes just obtained */
  		idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE;
  	}

  	BUG_ON(nbytes != 0);
  	*num_pages = npages;
  	*page_align = align;
  	dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align);
  	return pages;
  fail:
  	/* only the first @idx entries have been filled in */
  	ceph_put_page_vector(pages, idx, false);
  	return ERR_PTR(ret);
  }
124e68e74   Sage Weil   ceph: file operat...
106
107
108
109
110
111
112
113
  
  /*
   * Prepare an open request.  Preallocate ceph_cap to avoid an
   * inopportune ENOMEM later.
   */
  static struct ceph_mds_request *
  prepare_open_request(struct super_block *sb, int flags, int create_mode)
  {
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
114
115
  	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
  	struct ceph_mds_client *mdsc = fsc->mdsc;
124e68e74   Sage Weil   ceph: file operat...
116
117
118
119
120
121
122
123
124
125
126
127
128
  	struct ceph_mds_request *req;
  	int want_auth = USE_ANY_MDS;
  	int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
  
  	if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
  		want_auth = USE_AUTH_MDS;
  
  	req = ceph_mdsc_create_request(mdsc, op, want_auth);
  	if (IS_ERR(req))
  		goto out;
  	req->r_fmode = ceph_flags_to_mode(flags);
  	req->r_args.open.flags = cpu_to_le32(flags);
  	req->r_args.open.mode = cpu_to_le32(create_mode);
124e68e74   Sage Weil   ceph: file operat...
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
  out:
  	return req;
  }
  
  /*
   * initialize private struct file data.
   * if we fail, clean up by dropping fmode reference on the ceph_inode
   */
  static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
  {
  	struct ceph_file_info *cf;
  	int ret = 0;
  
  	switch (inode->i_mode & S_IFMT) {
  	case S_IFREG:
46b59b2be   Yan, Zheng   ceph: disable fsc...
144
145
  		ceph_fscache_register_inode_cookie(inode);
  		ceph_fscache_file_set_cookie(inode, file);
124e68e74   Sage Weil   ceph: file operat...
146
147
148
149
  	case S_IFDIR:
  		dout("init_file %p %p 0%o (regular)
  ", inode, file,
  		     inode->i_mode);
99ec26977   Geliang Tang   ceph: use kmem_ca...
150
  		cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
124e68e74   Sage Weil   ceph: file operat...
151
152
153
154
155
156
  		if (cf == NULL) {
  			ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
  			return -ENOMEM;
  		}
  		cf->fmode = fmode;
  		cf->next_offset = 2;
fdd4e1583   Yan, Zheng   ceph: rework dcac...
157
  		cf->readdir_cache_idx = -1;
124e68e74   Sage Weil   ceph: file operat...
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
  		file->private_data = cf;
  		BUG_ON(inode->i_fop->release != ceph_release);
  		break;
  
  	case S_IFLNK:
  		dout("init_file %p %p 0%o (symlink)
  ", inode, file,
  		     inode->i_mode);
  		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
  		break;
  
  	default:
  		dout("init_file %p %p 0%o (special)
  ", inode, file,
  		     inode->i_mode);
  		/*
  		 * we need to drop the open ref now, since we don't
  		 * have .release set to ceph_release.
  		 */
  		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
  		BUG_ON(inode->i_fop->release == ceph_release);
  
  		/* call the proper open fop */
  		ret = inode->i_fop->open(inode, file);
  	}
  	return ret;
  }
  
  /*
   * try renew caps after session gets killed.
   *
   * If the inode still holds real caps and either no write caps are
   * wanted or an auth cap is present, ceph_check_caps() is called and
   * 0 is returned.  Otherwise an open request is built from the file
   * caps currently wanted and sent to the MDS.
   *
   * Returns 0 on success or a negative errno.
   */
  int ceph_renew_caps(struct inode *inode)
  {
  	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
  	struct ceph_inode_info *ci = ceph_inode(inode);
  	struct ceph_mds_request *req;
  	int err, flags, wanted;

  	spin_lock(&ci->i_ceph_lock);
  	wanted = __ceph_caps_file_wanted(ci);
  	/*
  	 * Same rule as the fast path in ceph_open(): any real cap is
  	 * enough when no write caps are wanted; write caps need the
  	 * auth cap.  (The former "!(...) == 0" test was inverted.)
  	 */
  	if (__ceph_is_any_real_caps(ci) &&
  	    (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
  		int issued = __ceph_caps_issued(ci, NULL);
  		spin_unlock(&ci->i_ceph_lock);
  		dout("renew caps %p want %s issued %s updating mds_wanted\n",
  		     inode, ceph_cap_string(wanted), ceph_cap_string(issued));
  		ceph_check_caps(ci, 0, NULL);
  		return 0;
  	}
  	spin_unlock(&ci->i_ceph_lock);

  	/* translate the wanted file caps back into open(2) flags */
  	flags = 0;
  	if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
  		flags = O_RDWR;
  	else if (wanted & CEPH_CAP_FILE_RD)
  		flags = O_RDONLY;
  	else if (wanted & CEPH_CAP_FILE_WR)
  		flags = O_WRONLY;
  #ifdef O_LAZY
  	if (wanted & CEPH_CAP_FILE_LAZYIO)
  		flags |= O_LAZY;
  #endif

  	req = prepare_open_request(inode->i_sb, flags, 0);
  	if (IS_ERR(req)) {
  		err = PTR_ERR(req);
  		goto out;
  	}

  	req->r_inode = inode;
  	ihold(inode);
  	req->r_num_caps = 1;
  	/* overrides the fmode that prepare_open_request() derived from flags */
  	req->r_fmode = -1;

  	err = ceph_mdsc_do_request(mdsc, NULL, req);
  	ceph_mdsc_put_request(req);
  out:
  	dout("renew caps %p open result=%d\n", inode, err);
  	return err < 0 ? err : 0;
  }
  
  /*
124e68e74   Sage Weil   ceph: file operat...
242
243
244
245
246
247
248
249
   * If we already have the requisite capabilities, we can satisfy
   * the open request locally (no need to request new caps from the
   * MDS).  We do, however, need to inform the MDS (asynchronously)
   * if our wanted caps set expands.
   */
  int ceph_open(struct inode *inode, struct file *file)
  {
  	struct ceph_inode_info *ci = ceph_inode(inode);
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
250
251
  	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
  	struct ceph_mds_client *mdsc = fsc->mdsc;
124e68e74   Sage Weil   ceph: file operat...
252
253
  	struct ceph_mds_request *req;
  	struct ceph_file_info *cf = file->private_data;
124e68e74   Sage Weil   ceph: file operat...
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
  	int err;
  	int flags, fmode, wanted;
  
  	if (cf) {
  		dout("open file %p is already opened
  ", file);
  		return 0;
  	}
  
  	/* filter out O_CREAT|O_EXCL; vfs did that already.  yuck. */
  	flags = file->f_flags & ~(O_CREAT|O_EXCL);
  	if (S_ISDIR(inode->i_mode))
  		flags = O_DIRECTORY;  /* mds likes to know */
  
  	dout("open inode %p ino %llx.%llx file %p flags %d (%d)
  ", inode,
  	     ceph_vinop(inode), file, flags, file->f_flags);
  	fmode = ceph_flags_to_mode(flags);
  	wanted = ceph_caps_for_mode(fmode);
  
  	/* snapped files are read-only */
  	if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
  		return -EROFS;
  
  	/* trivially open snapdir */
  	if (ceph_snap(inode) == CEPH_SNAPDIR) {
be655596b   Sage Weil   ceph: use i_ceph_...
280
  		spin_lock(&ci->i_ceph_lock);
124e68e74   Sage Weil   ceph: file operat...
281
  		__ceph_get_fmode(ci, fmode);
be655596b   Sage Weil   ceph: use i_ceph_...
282
  		spin_unlock(&ci->i_ceph_lock);
124e68e74   Sage Weil   ceph: file operat...
283
284
285
286
  		return ceph_init_file(inode, file, fmode);
  	}
  
  	/*
7421ab804   Sage Weil   ceph: fix open fo...
287
288
  	 * No need to block if we have caps on the auth MDS (for
  	 * write) or any MDS (for read).  Update wanted set
124e68e74   Sage Weil   ceph: file operat...
289
290
  	 * asynchronously.
  	 */
be655596b   Sage Weil   ceph: use i_ceph_...
291
  	spin_lock(&ci->i_ceph_lock);
7421ab804   Sage Weil   ceph: fix open fo...
292
293
  	if (__ceph_is_any_real_caps(ci) &&
  	    (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
124e68e74   Sage Weil   ceph: file operat...
294
295
296
297
298
299
300
301
  		int mds_wanted = __ceph_caps_mds_wanted(ci);
  		int issued = __ceph_caps_issued(ci, NULL);
  
  		dout("open %p fmode %d want %s issued %s using existing
  ",
  		     inode, fmode, ceph_cap_string(wanted),
  		     ceph_cap_string(issued));
  		__ceph_get_fmode(ci, fmode);
be655596b   Sage Weil   ceph: use i_ceph_...
302
  		spin_unlock(&ci->i_ceph_lock);
124e68e74   Sage Weil   ceph: file operat...
303
304
305
306
307
308
309
310
311
312
313
  
  		/* adjust wanted? */
  		if ((issued & wanted) != wanted &&
  		    (mds_wanted & wanted) != wanted &&
  		    ceph_snap(inode) != CEPH_SNAPDIR)
  			ceph_check_caps(ci, 0, NULL);
  
  		return ceph_init_file(inode, file, fmode);
  	} else if (ceph_snap(inode) != CEPH_NOSNAP &&
  		   (ci->i_snap_caps & wanted) == wanted) {
  		__ceph_get_fmode(ci, fmode);
be655596b   Sage Weil   ceph: use i_ceph_...
314
  		spin_unlock(&ci->i_ceph_lock);
124e68e74   Sage Weil   ceph: file operat...
315
316
  		return ceph_init_file(inode, file, fmode);
  	}
99ccbd229   Milosz Tanski   ceph: use fscache...
317

be655596b   Sage Weil   ceph: use i_ceph_...
318
  	spin_unlock(&ci->i_ceph_lock);
124e68e74   Sage Weil   ceph: file operat...
319
320
321
322
323
324
325
326
  
  	dout("open fmode %d wants %s
  ", fmode, ceph_cap_string(wanted));
  	req = prepare_open_request(inode->i_sb, flags, 0);
  	if (IS_ERR(req)) {
  		err = PTR_ERR(req);
  		goto out;
  	}
70b666c3b   Sage Weil   ceph: use ihold w...
327
328
  	req->r_inode = inode;
  	ihold(inode);
99ccbd229   Milosz Tanski   ceph: use fscache...
329

124e68e74   Sage Weil   ceph: file operat...
330
  	req->r_num_caps = 1;
e36d571d7   Jianpeng Ma   ceph: no need to ...
331
  	err = ceph_mdsc_do_request(mdsc, NULL, req);
124e68e74   Sage Weil   ceph: file operat...
332
333
334
335
336
337
338
339
340
341
342
  	if (!err)
  		err = ceph_init_file(inode, file, req->r_fmode);
  	ceph_mdsc_put_request(req);
  	dout("open result=%d on %llx.%llx
  ", err, ceph_vinop(inode));
  out:
  	return err;
  }
  
  
  /*
5ef50c3be   Sage Weil   ceph: simplify+fi...
343
344
   * Do a lookup + open with a single request.  If we get a non-existent
   * file or symlink, return 1 so the VFS can retry.
124e68e74   Sage Weil   ceph: file operat...
345
   */
5ef50c3be   Sage Weil   ceph: simplify+fi...
346
  int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
30d904947   Al Viro   kill struct opendata
347
  		     struct file *file, unsigned flags, umode_t mode,
d95852777   Al Viro   make ->atomic_ope...
348
  		     int *opened)
124e68e74   Sage Weil   ceph: file operat...
349
  {
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
350
351
  	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
  	struct ceph_mds_client *mdsc = fsc->mdsc;
124e68e74   Sage Weil   ceph: file operat...
352
  	struct ceph_mds_request *req;
5ef50c3be   Sage Weil   ceph: simplify+fi...
353
  	struct dentry *dn;
b1ee94aa5   Yan, Zheng   ceph: include the...
354
  	struct ceph_acls_info acls = {};
315f24088   Yan, Zheng   ceph: fix securit...
355
         int mask;
124e68e74   Sage Weil   ceph: file operat...
356
  	int err;
124e68e74   Sage Weil   ceph: file operat...
357

a455589f1   Al Viro   assorted conversi...
358
359
360
  	dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o
  ",
  	     dir, dentry, dentry,
5ef50c3be   Sage Weil   ceph: simplify+fi...
361
362
363
364
365
366
367
368
  	     d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);
  
  	if (dentry->d_name.len > NAME_MAX)
  		return -ENAMETOOLONG;
  
  	err = ceph_init_dentry(dentry);
  	if (err < 0)
  		return err;
124e68e74   Sage Weil   ceph: file operat...
369

b1ee94aa5   Yan, Zheng   ceph: include the...
370
371
372
373
374
  	if (flags & O_CREAT) {
  		err = ceph_pre_init_acls(dir, &mode, &acls);
  		if (err < 0)
  			return err;
  	}
124e68e74   Sage Weil   ceph: file operat...
375
376
  	/* do the open */
  	req = prepare_open_request(dir->i_sb, flags, mode);
b1ee94aa5   Yan, Zheng   ceph: include the...
377
378
379
380
  	if (IS_ERR(req)) {
  		err = PTR_ERR(req);
  		goto out_acl;
  	}
124e68e74   Sage Weil   ceph: file operat...
381
382
383
384
385
  	req->r_dentry = dget(dentry);
  	req->r_num_caps = 2;
  	if (flags & O_CREAT) {
  		req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
  		req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
b1ee94aa5   Yan, Zheng   ceph: include the...
386
387
388
389
  		if (acls.pagelist) {
  			req->r_pagelist = acls.pagelist;
  			acls.pagelist = NULL;
  		}
124e68e74   Sage Weil   ceph: file operat...
390
  	}
315f24088   Yan, Zheng   ceph: fix securit...
391
392
393
394
395
  
         mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
         if (ceph_security_xattr_wanted(dir))
                 mask |= CEPH_CAP_XATTR_SHARED;
         req->r_args.open.mask = cpu_to_le32(mask);
124e68e74   Sage Weil   ceph: file operat...
396
  	req->r_locked_dir = dir;           /* caller holds dir->i_mutex */
acda76578   Sage Weil   ceph: fix bad par...
397
398
399
  	err = ceph_mdsc_do_request(mdsc,
  				   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
  				   req);
bf91c3150   Yan, Zheng   ceph: fix atomic_...
400
  	err = ceph_handle_snapdir(req, dentry, err);
79aec9844   Sam Lang   ceph: Check for e...
401
  	if (err)
b1ee94aa5   Yan, Zheng   ceph: include the...
402
  		goto out_req;
79aec9844   Sam Lang   ceph: Check for e...
403

a43137f7b   Jianpeng Ma   ceph: remove the ...
404
  	if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
124e68e74   Sage Weil   ceph: file operat...
405
  		err = ceph_handle_notrace_create(dir, dentry);
2d83bde9a   Miklos Szeredi   ceph: implement i...
406

00699ad85   Al Viro   Use the right pre...
407
  	if (d_in_lookup(dentry)) {
5ef50c3be   Sage Weil   ceph: simplify+fi...
408
409
410
411
412
413
414
415
  		dn = ceph_finish_lookup(req, dentry, err);
  		if (IS_ERR(dn))
  			err = PTR_ERR(dn);
  	} else {
  		/* we were given a hashed negative dentry */
  		dn = NULL;
  	}
  	if (err)
b1ee94aa5   Yan, Zheng   ceph: include the...
416
  		goto out_req;
2b0143b5c   David Howells   VFS: normal files...
417
  	if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) {
5ef50c3be   Sage Weil   ceph: simplify+fi...
418
419
420
421
422
423
424
  		/* make vfs retry on splice, ENOENT, or symlink */
  		dout("atomic_open finish_no_open on dn %p
  ", dn);
  		err = finish_no_open(file, dn);
  	} else {
  		dout("atomic_open finish_open on dn %p
  ", dn);
6e8575faa   Sam Lang   ceph: Check for c...
425
  		if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
2b0143b5c   David Howells   VFS: normal files...
426
  			ceph_init_inode_acls(d_inode(dentry), &acls);
6e8575faa   Sam Lang   ceph: Check for c...
427
428
  			*opened |= FILE_CREATED;
  		}
5ef50c3be   Sage Weil   ceph: simplify+fi...
429
430
  		err = finish_open(file, dentry, ceph_open, opened);
  	}
b1ee94aa5   Yan, Zheng   ceph: include the...
431
  out_req:
ab866549b   Yan, Zheng   ceph: drop extra ...
432
433
  	if (!req->r_err && req->r_target_inode)
  		ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
5ef50c3be   Sage Weil   ceph: simplify+fi...
434
  	ceph_mdsc_put_request(req);
b1ee94aa5   Yan, Zheng   ceph: include the...
435
436
  out_acl:
  	ceph_release_acls_info(&acls);
5ef50c3be   Sage Weil   ceph: simplify+fi...
437
438
  	dout("atomic_open result=%d
  ", err);
d95852777   Al Viro   make ->atomic_ope...
439
  	return err;
124e68e74   Sage Weil   ceph: file operat...
440
441
442
443
444
445
446
447
448
449
450
451
452
453
  }
  
  int ceph_release(struct inode *inode, struct file *file)
  {
  	struct ceph_inode_info *ci = ceph_inode(inode);
  	struct ceph_file_info *cf = file->private_data;
  
  	dout("release inode %p file %p
  ", inode, file);
  	ceph_put_fmode(ci, cf->fmode);
  	if (cf->last_readdir)
  		ceph_mdsc_put_request(cf->last_readdir);
  	kfree(cf->last_name);
  	kfree(cf->dir_info);
124e68e74   Sage Weil   ceph: file operat...
454
  	kmem_cache_free(ceph_file_cachep, cf);
195d3ce2c   Sage Weil   ceph: return EBAD...
455
456
  
  	/* wake up anyone waiting for caps on this inode */
03066f234   Yehuda Sadeh   ceph: use complet...
457
  	wake_up_all(&ci->i_cap_wq);
124e68e74   Sage Weil   ceph: file operat...
458
459
  	return 0;
  }
83701246a   Yan, Zheng   ceph: sync read i...
460
  /*
   * Values stored through the "checkeof" out-parameter of the sync read
   * helpers.
   */
  enum {
  	/* NOTE(review): presumably "a retry was already made" - confirm at the caller */
  	HAVE_RETRIED = 1,
  	/* set by striped_read() when the requested range extends past i_size */
  	CHECK_EOF = 2,
  	/* NOTE(review): presumably "read from inline data" - no user visible here */
  	READ_INLINE = 3,
  };
124e68e74   Sage Weil   ceph: file operat...
465
  /*
   * Read a range of bytes striped over one or more objects.  Iterate over
   * objects we stripe over.  (That's not atomic, but good enough for now.)
   *
   * If we get a short result from the OSD, check against i_size; we need to
   * only return a short read to the caller if we hit EOF.
   *
   * @off/@len:  byte range to read
   * @pages:     page vector receiving the data; @num_pages entries
   * @checkeof:  set to CHECK_EOF when data was read and the remaining
   *             range extends past i_size
   *
   * Returns the number of bytes read (including zero-filled gaps) when
   * that is positive, otherwise the result of the last OSD read, with
   * -ENOENT mapped to 0.
   */
  static int striped_read(struct inode *inode,
  			u64 off, u64 len,
  			struct page **pages, int num_pages,
  			int *checkeof)
  {
  	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
  	struct ceph_inode_info *ci = ceph_inode(inode);
  	u64 pos, this_len, left;
  	loff_t i_size;
  	int page_align, pages_left;
  	int read, ret;
  	struct page **page_pos;
  	bool hit_stripe, was_short;

  	/*
  	 * we may need to do multiple reads.  not atomic, unfortunately.
  	 */
  	pos = off;
  	left = len;
  	page_pos = pages;
  	pages_left = num_pages;
  	read = 0;

  more:
  	page_align = pos & ~PAGE_MASK;
  	this_len = left;
  	ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
  				  &ci->i_layout, pos, &this_len,
  				  ci->i_truncate_seq,
  				  ci->i_truncate_size,
  				  page_pos, pages_left, page_align);
  	if (ret == -ENOENT)
  		ret = 0;
  	/* the call shortened this_len below what was asked for */
  	hit_stripe = this_len < left;
  	was_short = ret >= 0 && ret < this_len;
  	dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read,
  	     ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");

  	i_size = i_size_read(inode);
  	if (ret >= 0) {
  		int didpages;
  		if (was_short && (pos + ret < i_size)) {
  			/* short read before EOF: zero-fill the gap */
  			int zlen = min(this_len - ret, i_size - pos - ret);
  			int zoff = (off & ~PAGE_MASK) + read + ret;
  			dout(" zero gap %llu to %llu\n",
  				pos + ret, pos + ret + zlen);
  			ceph_zero_page_vector_range(zoff, zlen, pages);
  			ret += zlen;
  		}

  		didpages = (page_align + ret) >> PAGE_SHIFT;
  		pos += ret;
  		read = pos - off;
  		left -= ret;
  		page_pos += didpages;
  		pages_left -= didpages;

  		/* hit stripe and need continue*/
  		if (left && hit_stripe && pos < i_size)
  			goto more;
  	}

  	if (read > 0) {
  		ret = read;
  		/* did we bounce off eof? */
  		if (pos + left > i_size)
  			*checkeof = CHECK_EOF;
  	}

  	dout("striped_read returns %d\n", ret);
  	return ret;
  }
  
  /*
   * Completely synchronous read and write methods.  Direct from __user
   * buffer to osd, or directly to user pages (if O_DIRECT).
   *
   * If the read spans object boundary, just do multiple reads.
   */
8eb4efb09   majianpeng   ceph: implement r...
550
551
  static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
  				int *checkeof)
124e68e74   Sage Weil   ceph: file operat...
552
  {
8eb4efb09   majianpeng   ceph: implement r...
553
  	struct file *file = iocb->ki_filp;
496ad9aa8   Al Viro   new helper: file_...
554
  	struct inode *inode = file_inode(file);
124e68e74   Sage Weil   ceph: file operat...
555
  	struct page **pages;
8eb4efb09   majianpeng   ceph: implement r...
556
  	u64 off = iocb->ki_pos;
ab226e21a   Henry C Chang   ceph: fix direct-...
557
  	int num_pages, ret;
2b777c9dd   Al Viro   ceph_sync_read: s...
558
  	size_t len = iov_iter_count(i);
124e68e74   Sage Weil   ceph: file operat...
559

8eb4efb09   majianpeng   ceph: implement r...
560
561
562
  	dout("sync_read on file %p %llu~%u %s
  ", file, off,
  	     (unsigned)len,
124e68e74   Sage Weil   ceph: file operat...
563
  	     (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
d0d0db226   Yan, Zheng   ceph: check zero ...
564
565
566
  
  	if (!len)
  		return 0;
e98b6fed8   Sage Weil   ceph: fix comment...
567
568
569
570
571
572
  	/*
  	 * flush any page cache pages in this range.  this
  	 * will make concurrent normal and sync io slow,
  	 * but it will at least behave sensibly when they are
  	 * in sequence.
  	 */
8eb4efb09   majianpeng   ceph: implement r...
573
574
  	ret = filemap_write_and_wait_range(inode->i_mapping, off,
  						off + len);
29065a513   Yehuda Sadeh   ceph: sync read/w...
575
  	if (ret < 0)
8eb4efb09   majianpeng   ceph: implement r...
576
  		return ret;
29065a513   Yehuda Sadeh   ceph: sync read/w...
577

c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
  	num_pages = calc_pages_for(off, len);
  	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
  	if (IS_ERR(pages))
  		return PTR_ERR(pages);
  	ret = striped_read(inode, off, len, pages,
  				num_pages, checkeof);
  	if (ret > 0) {
  		int l, k = 0;
  		size_t left = ret;
  
  		while (left) {
  			size_t page_off = off & ~PAGE_MASK;
  			size_t copy = min_t(size_t, left,
  					    PAGE_SIZE - page_off);
  			l = copy_page_to_iter(pages[k++], page_off, copy, i);
  			off += l;
  			left -= l;
  			if (l < copy)
8eb4efb09   majianpeng   ceph: implement r...
596
597
  				break;
  		}
8eb4efb09   majianpeng   ceph: implement r...
598
  	}
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
599
  	ceph_release_page_vector(pages, num_pages);
124e68e74   Sage Weil   ceph: file operat...
600

8eb4efb09   majianpeng   ceph: implement r...
601
602
603
604
  	if (off > iocb->ki_pos) {
  		ret = off - iocb->ki_pos;
  		iocb->ki_pos = off;
  	}
124e68e74   Sage Weil   ceph: file operat...
605

124e68e74   Sage Weil   ceph: file operat...
606
607
608
609
  	dout("sync_read result %d
  ", ret);
  	return ret;
  }
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
610
611
612
613
614
615
616
617
  struct ceph_aio_request {
  	struct kiocb *iocb;
  	size_t total_len;
  	int write;
  	int error;
  	struct list_head osd_reqs;
  	unsigned num_reqs;
  	atomic_t pending_reqs;
5be0389da   Yan, Zheng   ceph: re-send AIO...
618
  	struct timespec mtime;
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
619
620
  	struct ceph_cap_flush *prealloc_cf;
  };
5be0389da   Yan, Zheng   ceph: re-send AIO...
621
622
623
624
625
626
  /*
   * Work item used to re-send an AIO write: ceph_aio_complete_req()
   * queues one with ceph_aio_retry_work() as its handler when an OSD
   * request finishes with -EOLDSNAPC.
   */
  struct ceph_aio_work {
  	struct work_struct work;
  	/* the OSD request that is to be retried */
  	struct ceph_osd_request *req;
  };
  
  /* forward declaration: referenced by ceph_aio_complete_req() before its definition */
  static void ceph_aio_retry_work(struct work_struct *work);
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
  static void ceph_aio_complete(struct inode *inode,
  			      struct ceph_aio_request *aio_req)
  {
  	struct ceph_inode_info *ci = ceph_inode(inode);
  	int ret;
  
  	if (!atomic_dec_and_test(&aio_req->pending_reqs))
  		return;
  
  	ret = aio_req->error;
  	if (!ret)
  		ret = aio_req->total_len;
  
  	dout("ceph_aio_complete %p rc %d
  ", inode, ret);
  
  	if (ret >= 0 && aio_req->write) {
  		int dirty;
  
  		loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len;
  		if (endoff > i_size_read(inode)) {
  			if (ceph_inode_set_size(inode, endoff))
  				ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
  		}
  
  		spin_lock(&ci->i_ceph_lock);
  		ci->i_inline_version = CEPH_INLINE_NONE;
  		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
  					       &aio_req->prealloc_cf);
  		spin_unlock(&ci->i_ceph_lock);
  		if (dirty)
  			__mark_inode_dirty(inode, dirty);
  
  	}
  
  	ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR :
  						CEPH_CAP_FILE_RD));
  
  	aio_req->iocb->ki_complete(aio_req->iocb, ret, 0);
  
  	ceph_free_cap_flush(aio_req->prealloc_cf);
  	kfree(aio_req);
  }
85e084feb   Ilya Dryomov   libceph: drop msg...
670
  static void ceph_aio_complete_req(struct ceph_osd_request *req)
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
671
672
673
674
675
676
677
678
679
680
681
682
683
  {
  	int rc = req->r_result;
  	struct inode *inode = req->r_inode;
  	struct ceph_aio_request *aio_req = req->r_priv;
  	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
  	int num_pages = calc_pages_for((u64)osd_data->alignment,
  				       osd_data->length);
  
  	dout("ceph_aio_complete_req %p rc %d bytes %llu
  ",
  	     inode, rc, osd_data->length);
  
  	if (rc == -EOLDSNAPC) {
5be0389da   Yan, Zheng   ceph: re-send AIO...
684
685
686
687
688
689
690
691
692
693
694
695
696
  		struct ceph_aio_work *aio_work;
  		BUG_ON(!aio_req->write);
  
  		aio_work = kmalloc(sizeof(*aio_work), GFP_NOFS);
  		if (aio_work) {
  			INIT_WORK(&aio_work->work, ceph_aio_retry_work);
  			aio_work->req = req;
  			queue_work(ceph_inode_to_client(inode)->wb_wq,
  				   &aio_work->work);
  			return;
  		}
  		rc = -ENOMEM;
  	} else if (!aio_req->write) {
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
  		if (rc == -ENOENT)
  			rc = 0;
  		if (rc >= 0 && osd_data->length > rc) {
  			int zoff = osd_data->alignment + rc;
  			int zlen = osd_data->length - rc;
  			/*
  			 * If read is satisfied by single OSD request,
  			 * it can pass EOF. Otherwise read is within
  			 * i_size.
  			 */
  			if (aio_req->num_reqs == 1) {
  				loff_t i_size = i_size_read(inode);
  				loff_t endoff = aio_req->iocb->ki_pos + rc;
  				if (endoff < i_size)
  					zlen = min_t(size_t, zlen,
  						     i_size - endoff);
  				aio_req->total_len = rc + zlen;
  			}
  
  			if (zlen > 0)
  				ceph_zero_page_vector_range(zoff, zlen,
  							    osd_data->pages);
  		}
  	}
a22bd5ffa   Yan, Zheng   ceph: set user pa...
721
  	ceph_put_page_vector(osd_data->pages, num_pages, !aio_req->write);
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
722
723
724
725
726
727
728
729
  	ceph_osdc_put_request(req);
  
  	if (rc < 0)
  		cmpxchg(&aio_req->error, 0, rc);
  
  	ceph_aio_complete(inode, aio_req);
  	return;
  }
5be0389da   Yan, Zheng   ceph: re-send AIO...
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
  static void ceph_aio_retry_work(struct work_struct *work)
  {
  	struct ceph_aio_work *aio_work =
  		container_of(work, struct ceph_aio_work, work);
  	struct ceph_osd_request *orig_req = aio_work->req;
  	struct ceph_aio_request *aio_req = orig_req->r_priv;
  	struct inode *inode = orig_req->r_inode;
  	struct ceph_inode_info *ci = ceph_inode(inode);
  	struct ceph_snap_context *snapc;
  	struct ceph_osd_request *req;
  	int ret;
  
  	spin_lock(&ci->i_ceph_lock);
  	if (__ceph_have_pending_cap_snap(ci)) {
  		struct ceph_cap_snap *capsnap =
  			list_last_entry(&ci->i_cap_snaps,
  					struct ceph_cap_snap,
  					ci_item);
  		snapc = ceph_get_snap_context(capsnap->context);
  	} else {
  		BUG_ON(!ci->i_head_snapc);
  		snapc = ceph_get_snap_context(ci->i_head_snapc);
  	}
  	spin_unlock(&ci->i_ceph_lock);
  
  	req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2,
  			false, GFP_NOFS);
1418bf076   Dan Carpenter   ceph: checking fo...
757
758
  	if (!req) {
  		ret = -ENOMEM;
5be0389da   Yan, Zheng   ceph: re-send AIO...
759
760
761
762
763
764
765
  		req = orig_req;
  		goto out;
  	}
  
  	req->r_flags =	CEPH_OSD_FLAG_ORDERSNAP |
  			CEPH_OSD_FLAG_ONDISK |
  			CEPH_OSD_FLAG_WRITE;
63244fa12   Ilya Dryomov   libceph: introduc...
766
  	ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
d30291b98   Ilya Dryomov   libceph: variable...
767
  	ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
5be0389da   Yan, Zheng   ceph: re-send AIO...
768

13d1ad16d   Ilya Dryomov   libceph: move mes...
769
770
771
772
773
774
  	ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
  	if (ret) {
  		ceph_osdc_put_request(req);
  		req = orig_req;
  		goto out;
  	}
5be0389da   Yan, Zheng   ceph: re-send AIO...
775
776
777
  
  	req->r_ops[0] = orig_req->r_ops[0];
  	osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
bb873b539   Ilya Dryomov   libceph: switch t...
778
779
  	req->r_mtime = aio_req->mtime;
  	req->r_data_offset = req->r_ops[0].extent.offset;
5be0389da   Yan, Zheng   ceph: re-send AIO...
780

5be0389da   Yan, Zheng   ceph: re-send AIO...
781
782
783
784
785
786
787
788
789
  	ceph_osdc_put_request(orig_req);
  
  	req->r_callback = ceph_aio_complete_req;
  	req->r_inode = inode;
  	req->r_priv = aio_req;
  
  	ret = ceph_osdc_start_request(req->r_osdc, req, false);
  out:
  	if (ret < 0) {
5be0389da   Yan, Zheng   ceph: re-send AIO...
790
  		req->r_result = ret;
85e084feb   Ilya Dryomov   libceph: drop msg...
791
  		ceph_aio_complete_req(req);
5be0389da   Yan, Zheng   ceph: re-send AIO...
792
  	}
db6aed702   Yan, Zheng   ceph: fix snap co...
793
  	ceph_put_snap_context(snapc);
5be0389da   Yan, Zheng   ceph: re-send AIO...
794
795
  	kfree(aio_work);
  }
124e68e74   Sage Weil   ceph: file operat...
796
  /*
26be88087   Alex Elder   libceph: change h...
797
798
799
800
801
802
803
804
805
806
   * Write commit request unsafe callback, called to tell us when a
   * request is unsafe (that is, in flight--has been handed to the
   * messenger to send to its target osd).  It is called again when
   * we've received a response message indicating the request is
   * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request
   * is completed early (and unsuccessfully) due to a timeout or
   * interrupt.
   *
   * This is used if we requested both an ACK and ONDISK commit reply
   * from the OSD.
124e68e74   Sage Weil   ceph: file operat...
807
   */
26be88087   Alex Elder   libceph: change h...
808
  static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
124e68e74   Sage Weil   ceph: file operat...
809
810
  {
  	struct ceph_inode_info *ci = ceph_inode(req->r_inode);
26be88087   Alex Elder   libceph: change h...
811
812
813
814
815
816
817
818
819
  	dout("%s %p tid %llu %ssafe
  ", __func__, req, req->r_tid,
  		unsafe ? "un" : "");
  	if (unsafe) {
  		ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
  		spin_lock(&ci->i_unsafe_lock);
  		list_add_tail(&req->r_unsafe_item,
  			      &ci->i_unsafe_writes);
  		spin_unlock(&ci->i_unsafe_lock);
fe5da05e9   Ilya Dryomov   libceph: redo cal...
820
821
  
  		complete_all(&req->r_completion);
26be88087   Alex Elder   libceph: change h...
822
823
824
825
826
827
  	} else {
  		spin_lock(&ci->i_unsafe_lock);
  		list_del_init(&req->r_unsafe_item);
  		spin_unlock(&ci->i_unsafe_lock);
  		ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
  	}
124e68e74   Sage Weil   ceph: file operat...
828
  }
9a5530c63   Yan, Zheng   ceph: wait unsafe...
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
  /*
   * Wait on any unsafe replies for the given inode.  First wait on the
   * newest request, and make that the upper bound.  Then, if there are
   * more requests, keep waiting on the oldest as long as it is still older
   * than the original request.
   *
   * Does nothing for inodes that are not regular files or whose
   * i_unsafe_writes list is empty.  i_unsafe_lock is dropped around each
   * wait; a reference on the request is held across the wait so it stays
   * valid while unlocked.
   */
  void ceph_sync_write_wait(struct inode *inode)
  {
  	struct ceph_inode_info *ci = ceph_inode(inode);
  	struct list_head *head = &ci->i_unsafe_writes;
  	struct ceph_osd_request *req;
  	u64 last_tid;
  
  	if (!S_ISREG(inode->i_mode))
  		return;
  
  	spin_lock(&ci->i_unsafe_lock);
  	if (list_empty(head))
  		goto out;
  
  	/* set upper bound as _last_ entry in chain */
  
  	req = list_last_entry(head, struct ceph_osd_request,
  			      r_unsafe_item);
  	last_tid = req->r_tid;
  
  	do {
  		/* pin the request before letting go of the list lock */
  		ceph_osdc_get_request(req);
  		spin_unlock(&ci->i_unsafe_lock);
  
  		dout("sync_write_wait on tid %llu (until %llu)\n",
  		     req->r_tid, last_tid);
  		wait_for_completion(&req->r_safe_completion);
  		ceph_osdc_put_request(req);
  
  		spin_lock(&ci->i_unsafe_lock);
  		/*
  		 * from here on look at first entry in chain, since we
  		 * only want to wait for anything older than last_tid
  		 */
  		if (list_empty(head))
  			break;
  		req = list_first_entry(head, struct ceph_osd_request,
  				       r_unsafe_item);
  	} while (req->r_tid < last_tid);
  out:
  	spin_unlock(&ci->i_unsafe_lock);
  }
e8344e668   majianpeng   ceph: Implement w...
878

e8344e668   majianpeng   ceph: Implement w...
879
  static ssize_t
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
880
881
882
  ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
  		       struct ceph_snap_context *snapc,
  		       struct ceph_cap_flush **pcf)
124e68e74   Sage Weil   ceph: file operat...
883
  {
e8344e668   majianpeng   ceph: Implement w...
884
  	struct file *file = iocb->ki_filp;
496ad9aa8   Al Viro   new helper: file_...
885
  	struct inode *inode = file_inode(file);
124e68e74   Sage Weil   ceph: file operat...
886
  	struct ceph_inode_info *ci = ceph_inode(inode);
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
887
  	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
acead002b   Alex Elder   libceph: don't bu...
888
  	struct ceph_vino vino;
124e68e74   Sage Weil   ceph: file operat...
889
890
  	struct ceph_osd_request *req;
  	struct page **pages;
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
891
892
  	struct ceph_aio_request *aio_req = NULL;
  	int num_pages = 0;
124e68e74   Sage Weil   ceph: file operat...
893
  	int flags;
124e68e74   Sage Weil   ceph: file operat...
894
  	int ret;
c2050a454   Deepa Dinamani   fs: Replace curre...
895
  	struct timespec mtime = current_time(inode);
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
896
897
898
  	size_t count = iov_iter_count(iter);
  	loff_t pos = iocb->ki_pos;
  	bool write = iov_iter_rw(iter) == WRITE;
124e68e74   Sage Weil   ceph: file operat...
899

c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
900
  	if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
124e68e74   Sage Weil   ceph: file operat...
901
  		return -EROFS;
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
902
903
904
  	dout("sync_direct_read_write (%s) on file %p %lld~%u
  ",
  	     (write ? "write" : "read"), file, pos, (unsigned)count);
124e68e74   Sage Weil   ceph: file operat...
905

e8344e668   majianpeng   ceph: Implement w...
906
  	ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
29065a513   Yehuda Sadeh   ceph: sync read/w...
907
908
  	if (ret < 0)
  		return ret;
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
909
  	if (write) {
5d7eb1a32   NeilBrown   ceph: ignore erro...
910
  		int ret2 = invalidate_inode_pages2_range(inode->i_mapping,
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
911
912
  					pos >> PAGE_SHIFT,
  					(pos + count) >> PAGE_SHIFT);
5d7eb1a32   NeilBrown   ceph: ignore erro...
913
  		if (ret2 < 0)
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
914
915
  			dout("invalidate_inode_pages2_range returned %d
  ", ret);
29065a513   Yehuda Sadeh   ceph: sync read/w...
916

c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
917
918
919
920
921
922
  		flags = CEPH_OSD_FLAG_ORDERSNAP |
  			CEPH_OSD_FLAG_ONDISK |
  			CEPH_OSD_FLAG_WRITE;
  	} else {
  		flags = CEPH_OSD_FLAG_READ;
  	}
124e68e74   Sage Weil   ceph: file operat...
923

c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
924
925
926
927
  	while (iov_iter_count(iter) > 0) {
  		u64 size = dio_get_pagev_size(iter);
  		size_t start = 0;
  		ssize_t len;
e8344e668   majianpeng   ceph: Implement w...
928

e8344e668   majianpeng   ceph: Implement w...
929
930
  		vino = ceph_vino(inode);
  		req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
931
932
933
934
935
936
  					    vino, pos, &size, 0,
  					    /*include a 'startsync' command*/
  					    write ? 2 : 1,
  					    write ? CEPH_OSD_OP_WRITE :
  						    CEPH_OSD_OP_READ,
  					    flags, snapc,
e8344e668   majianpeng   ceph: Implement w...
937
938
939
940
941
  					    ci->i_truncate_seq,
  					    ci->i_truncate_size,
  					    false);
  		if (IS_ERR(req)) {
  			ret = PTR_ERR(req);
eab87235c   Al Viro   ceph_sync_{,direc...
942
  			break;
e8344e668   majianpeng   ceph: Implement w...
943
  		}
124e68e74   Sage Weil   ceph: file operat...
944

c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
945
946
  		len = size;
  		pages = dio_get_pages_alloc(iter, len, &start, &num_pages);
b5b98989d   Zhu, Caifeng   ceph: combine as ...
947
  		if (IS_ERR(pages)) {
64c313116   Al Viro   ceph_sync_direct_...
948
  			ceph_osdc_put_request(req);
b5b98989d   Zhu, Caifeng   ceph: combine as ...
949
  			ret = PTR_ERR(pages);
64c313116   Al Viro   ceph_sync_direct_...
950
  			break;
124e68e74   Sage Weil   ceph: file operat...
951
952
953
  		}
  
  		/*
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
954
955
  		 * To simplify error handling, allow AIO when IO within i_size
  		 * or IO can be satisfied by single OSD request.
124e68e74   Sage Weil   ceph: file operat...
956
  		 */
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
957
958
959
960
961
962
963
964
  		if (pos == iocb->ki_pos && !is_sync_kiocb(iocb) &&
  		    (len == count || pos + count <= i_size_read(inode))) {
  			aio_req = kzalloc(sizeof(*aio_req), GFP_KERNEL);
  			if (aio_req) {
  				aio_req->iocb = iocb;
  				aio_req->write = write;
  				INIT_LIST_HEAD(&aio_req->osd_reqs);
  				if (write) {
5be0389da   Yan, Zheng   ceph: re-send AIO...
965
  					aio_req->mtime = mtime;
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
966
967
968
969
970
971
972
973
974
975
976
977
  					swap(aio_req->prealloc_cf, *pcf);
  				}
  			}
  			/* ignore error */
  		}
  
  		if (write) {
  			/*
  			 * throw out any page cache pages in this range. this
  			 * may block.
  			 */
  			truncate_inode_pages_range(inode->i_mapping, pos,
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
978
  					(pos+len) | (PAGE_SIZE - 1));
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
979
980
  
  			osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
bb873b539   Ilya Dryomov   libceph: switch t...
981
  			req->r_mtime = mtime;
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
982
  		}
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
983
984
  		osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
  						 false, false);
e8344e668   majianpeng   ceph: Implement w...
985

c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
  		if (aio_req) {
  			aio_req->total_len += len;
  			aio_req->num_reqs++;
  			atomic_inc(&aio_req->pending_reqs);
  
  			req->r_callback = ceph_aio_complete_req;
  			req->r_inode = inode;
  			req->r_priv = aio_req;
  			list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs);
  
  			pos += len;
  			iov_iter_advance(iter, len);
  			continue;
  		}
  
  		ret = ceph_osdc_start_request(req->r_osdc, req, false);
e8344e668   majianpeng   ceph: Implement w...
1002
1003
  		if (!ret)
  			ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
  		size = i_size_read(inode);
  		if (!write) {
  			if (ret == -ENOENT)
  				ret = 0;
  			if (ret >= 0 && ret < len && pos + ret < size) {
  				int zlen = min_t(size_t, len - ret,
  						 size - pos - ret);
  				ceph_zero_page_vector_range(start + ret, zlen,
  							    pages);
  				ret += zlen;
  			}
  			if (ret >= 0)
  				len = ret;
  		}
a22bd5ffa   Yan, Zheng   ceph: set user pa...
1018
  		ceph_put_page_vector(pages, num_pages, !write);
e8344e668   majianpeng   ceph: Implement w...
1019

e8344e668   majianpeng   ceph: Implement w...
1020
  		ceph_osdc_put_request(req);
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
1021
  		if (ret < 0)
e8344e668   majianpeng   ceph: Implement w...
1022
  			break;
64c313116   Al Viro   ceph_sync_direct_...
1023

c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
1024
1025
1026
1027
  		pos += len;
  		iov_iter_advance(iter, len);
  
  		if (!write && pos >= size)
e8344e668   majianpeng   ceph: Implement w...
1028
  			break;
64c313116   Al Viro   ceph_sync_direct_...
1029

c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
1030
1031
  		if (write && pos > size) {
  			if (ceph_inode_set_size(inode, pos))
64c313116   Al Viro   ceph_sync_direct_...
1032
1033
1034
1035
  				ceph_check_caps(ceph_inode(inode),
  						CHECK_CAPS_AUTHONLY,
  						NULL);
  		}
e8344e668   majianpeng   ceph: Implement w...
1036
  	}
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
1037
  	if (aio_req) {
fc8c3892f   Yan, Zheng   ceph: fix use-aft...
1038
  		LIST_HEAD(osd_reqs);
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
1039
1040
1041
1042
1043
1044
1045
  		if (aio_req->num_reqs == 0) {
  			kfree(aio_req);
  			return ret;
  		}
  
  		ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR :
  					      CEPH_CAP_FILE_RD);
fc8c3892f   Yan, Zheng   ceph: fix use-aft...
1046
1047
1048
  		list_splice(&aio_req->osd_reqs, &osd_reqs);
  		while (!list_empty(&osd_reqs)) {
  			req = list_first_entry(&osd_reqs,
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
1049
1050
1051
1052
1053
1054
1055
1056
  					       struct ceph_osd_request,
  					       r_unsafe_item);
  			list_del_init(&req->r_unsafe_item);
  			if (ret >= 0)
  				ret = ceph_osdc_start_request(req->r_osdc,
  							      req, false);
  			if (ret < 0) {
  				req->r_result = ret;
85e084feb   Ilya Dryomov   libceph: drop msg...
1057
  				ceph_aio_complete_req(req);
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
1058
1059
1060
1061
1062
1063
1064
  			}
  		}
  		return -EIOCBQUEUED;
  	}
  
  	if (ret != -EOLDSNAPC && pos > iocb->ki_pos) {
  		ret = pos - iocb->ki_pos;
e8344e668   majianpeng   ceph: Implement w...
1065
  		iocb->ki_pos = pos;
e8344e668   majianpeng   ceph: Implement w...
1066
1067
1068
  	}
  	return ret;
  }
e8344e668   majianpeng   ceph: Implement w...
1069
1070
1071
1072
1073
1074
1075
  /*
   * Synchronous write, straight from __user pointer or user pages.
   *
   * If write spans object boundary, just do multiple writes.  (For a
   * correct atomic write, we should e.g. take write locks on all
   * objects, rollback on failure, etc.)
   */
06fee30f6   Yan, Zheng   ceph: fix append ...
1076
  static ssize_t
5dda377cf   Yan, Zheng   ceph: set i_head_...
1077
1078
  ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
  		struct ceph_snap_context *snapc)
e8344e668   majianpeng   ceph: Implement w...
1079
1080
1081
1082
1083
  {
  	struct file *file = iocb->ki_filp;
  	struct inode *inode = file_inode(file);
  	struct ceph_inode_info *ci = ceph_inode(inode);
  	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
e8344e668   majianpeng   ceph: Implement w...
1084
1085
1086
1087
1088
1089
1090
1091
1092
  	struct ceph_vino vino;
  	struct ceph_osd_request *req;
  	struct page **pages;
  	u64 len;
  	int num_pages;
  	int written = 0;
  	int flags;
  	int check_caps = 0;
  	int ret;
c2050a454   Deepa Dinamani   fs: Replace curre...
1093
  	struct timespec mtime = current_time(inode);
4908b822b   Al Viro   ceph: switch to -...
1094
  	size_t count = iov_iter_count(from);
e8344e668   majianpeng   ceph: Implement w...
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
  
  	if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
  		return -EROFS;
  
  	dout("sync_write on file %p %lld~%u
  ", file, pos, (unsigned)count);
  
  	ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
  	if (ret < 0)
  		return ret;
  
  	ret = invalidate_inode_pages2_range(inode->i_mapping,
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
1107
1108
  					    pos >> PAGE_SHIFT,
  					    (pos + count) >> PAGE_SHIFT);
e8344e668   majianpeng   ceph: Implement w...
1109
1110
1111
1112
1113
1114
1115
1116
  	if (ret < 0)
  		dout("invalidate_inode_pages2_range returned %d
  ", ret);
  
  	flags = CEPH_OSD_FLAG_ORDERSNAP |
  		CEPH_OSD_FLAG_ONDISK |
  		CEPH_OSD_FLAG_WRITE |
  		CEPH_OSD_FLAG_ACK;
4908b822b   Al Viro   ceph: switch to -...
1117
  	while ((len = iov_iter_count(from)) > 0) {
e8344e668   majianpeng   ceph: Implement w...
1118
1119
  		size_t left;
  		int n;
e8344e668   majianpeng   ceph: Implement w...
1120
1121
  		vino = ceph_vino(inode);
  		req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
715e4cd40   Yan, Zheng   libceph: specify ...
1122
  					    vino, pos, &len, 0, 1,
e8344e668   majianpeng   ceph: Implement w...
1123
1124
1125
1126
1127
1128
  					    CEPH_OSD_OP_WRITE, flags, snapc,
  					    ci->i_truncate_seq,
  					    ci->i_truncate_size,
  					    false);
  		if (IS_ERR(req)) {
  			ret = PTR_ERR(req);
eab87235c   Al Viro   ceph_sync_{,direc...
1129
  			break;
e8344e668   majianpeng   ceph: Implement w...
1130
1131
1132
1133
1134
1135
  		}
  
  		/*
  		 * write from beginning of first page,
  		 * regardless of io alignment
  		 */
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
1136
  		num_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
e8344e668   majianpeng   ceph: Implement w...
1137

687265e5a   Yan, Zheng   ceph: switch some...
1138
  		pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
124e68e74   Sage Weil   ceph: file operat...
1139
1140
1141
1142
  		if (IS_ERR(pages)) {
  			ret = PTR_ERR(pages);
  			goto out;
  		}
e8344e668   majianpeng   ceph: Implement w...
1143
1144
1145
  
  		left = len;
  		for (n = 0; n < num_pages; n++) {
125d725c9   Ilya Dryomov   ceph: cast PAGE_S...
1146
  			size_t plen = min_t(size_t, left, PAGE_SIZE);
4908b822b   Al Viro   ceph: switch to -...
1147
  			ret = copy_page_from_iter(pages[n], 0, plen, from);
e8344e668   majianpeng   ceph: Implement w...
1148
1149
1150
1151
1152
  			if (ret != plen) {
  				ret = -EFAULT;
  				break;
  			}
  			left -= ret;
e8344e668   majianpeng   ceph: Implement w...
1153
  		}
124e68e74   Sage Weil   ceph: file operat...
1154
1155
1156
1157
  		if (ret < 0) {
  			ceph_release_page_vector(pages, num_pages);
  			goto out;
  		}
e8344e668   majianpeng   ceph: Implement w...
1158
1159
1160
  		/* get a second commit callback */
  		req->r_unsafe_callback = ceph_sync_write_unsafe;
  		req->r_inode = inode;
124e68e74   Sage Weil   ceph: file operat...
1161

e8344e668   majianpeng   ceph: Implement w...
1162
1163
  		osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
  						false, true);
02ee07d30   Alex Elder   libceph: hold off...
1164

bb873b539   Ilya Dryomov   libceph: switch t...
1165
  		req->r_mtime = mtime;
e8344e668   majianpeng   ceph: Implement w...
1166
1167
1168
  		ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
  		if (!ret)
  			ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
124e68e74   Sage Weil   ceph: file operat...
1169
1170
  
  out:
e8344e668   majianpeng   ceph: Implement w...
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
  		ceph_osdc_put_request(req);
  		if (ret == 0) {
  			pos += len;
  			written += len;
  
  			if (pos > i_size_read(inode)) {
  				check_caps = ceph_inode_set_size(inode, pos);
  				if (check_caps)
  					ceph_check_caps(ceph_inode(inode),
  							CHECK_CAPS_AUTHONLY,
  							NULL);
  			}
  		} else
  			break;
  	}
124e68e74   Sage Weil   ceph: file operat...
1186

e8344e668   majianpeng   ceph: Implement w...
1187
  	if (ret != -EOLDSNAPC && written > 0) {
124e68e74   Sage Weil   ceph: file operat...
1188
  		ret = written;
e8344e668   majianpeng   ceph: Implement w...
1189
  		iocb->ki_pos = pos;
124e68e74   Sage Weil   ceph: file operat...
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
  	}
  	return ret;
  }
  
  /*
   * Wrap generic_file_aio_read with checks for cap bits on the inode.
   * Atomically grab references, so that those bits are not released
   * back to the MDS mid-read.
   *
   * Hmm, the sync read case isn't actually async... should it be?
   */
3644424dc   Al Viro   ceph: switch to -...
1201
  static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
124e68e74   Sage Weil   ceph: file operat...
1202
1203
  {
  	struct file *filp = iocb->ki_filp;
2962507ca   Sage Weil   ceph: perform laz...
1204
  	struct ceph_file_info *fi = filp->private_data;
66ee59af6   Christoph Hellwig   fs: remove ki_nbytes
1205
  	size_t len = iov_iter_count(to);
496ad9aa8   Al Viro   new helper: file_...
1206
  	struct inode *inode = file_inode(filp);
124e68e74   Sage Weil   ceph: file operat...
1207
  	struct ceph_inode_info *ci = ceph_inode(inode);
3738daa68   Yan, Zheng   ceph: fetch inlin...
1208
  	struct page *pinned_page = NULL;
124e68e74   Sage Weil   ceph: file operat...
1209
  	ssize_t ret;
2962507ca   Sage Weil   ceph: perform laz...
1210
  	int want, got = 0;
83701246a   Yan, Zheng   ceph: sync read i...
1211
  	int retry_op = 0, read = 0;
124e68e74   Sage Weil   ceph: file operat...
1212

6a026589b   Sage Weil   ceph: fix sync re...
1213
  again:
8eb4efb09   majianpeng   ceph: implement r...
1214
1215
1216
  	dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p
  ",
  	     inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
2962507ca   Sage Weil   ceph: perform laz...
1217
1218
1219
1220
  	if (fi->fmode & CEPH_FILE_MODE_LAZY)
  		want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  	else
  		want = CEPH_CAP_FILE_CACHE;
3738daa68   Yan, Zheng   ceph: fetch inlin...
1221
  	ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
124e68e74   Sage Weil   ceph: file operat...
1222
  	if (ret < 0)
8eb4efb09   majianpeng   ceph: implement r...
1223
  		return ret;
124e68e74   Sage Weil   ceph: file operat...
1224

2962507ca   Sage Weil   ceph: perform laz...
1225
  	if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
2ba48ce51   Al Viro   mirror O_APPEND a...
1226
  	    (iocb->ki_flags & IOCB_DIRECT) ||
8eb4efb09   majianpeng   ceph: implement r...
1227
  	    (fi->flags & CEPH_F_SYNC)) {
8eb4efb09   majianpeng   ceph: implement r...
1228
1229
1230
1231
1232
  
  		dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s
  ",
  		     inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
  		     ceph_cap_string(got));
83701246a   Yan, Zheng   ceph: sync read i...
1233
  		if (ci->i_inline_version == CEPH_INLINE_NONE) {
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
1234
1235
1236
1237
1238
1239
1240
1241
  			if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
  				ret = ceph_direct_read_write(iocb, to,
  							     NULL, NULL);
  				if (ret >= 0 && ret < len)
  					retry_op = CHECK_EOF;
  			} else {
  				ret = ceph_sync_read(iocb, to, &retry_op);
  			}
83701246a   Yan, Zheng   ceph: sync read i...
1242
1243
1244
  		} else {
  			retry_op = READ_INLINE;
  		}
8eb4efb09   majianpeng   ceph: implement r...
1245
  	} else {
8eb4efb09   majianpeng   ceph: implement r...
1246
1247
  		dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s
  ",
3644424dc   Al Viro   ceph: switch to -...
1248
  		     inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
8eb4efb09   majianpeng   ceph: implement r...
1249
  		     ceph_cap_string(got));
124e68e74   Sage Weil   ceph: file operat...
1250

3644424dc   Al Viro   ceph: switch to -...
1251
  		ret = generic_file_read_iter(iocb, to);
8eb4efb09   majianpeng   ceph: implement r...
1252
  	}
124e68e74   Sage Weil   ceph: file operat...
1253
1254
1255
  	dout("aio_read %p %llx.%llx dropping cap refs on %s = %d
  ",
  	     inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
3738daa68   Yan, Zheng   ceph: fetch inlin...
1256
  	if (pinned_page) {
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
1257
  		put_page(pinned_page);
3738daa68   Yan, Zheng   ceph: fetch inlin...
1258
1259
  		pinned_page = NULL;
  	}
124e68e74   Sage Weil   ceph: file operat...
1260
  	ceph_put_cap_refs(ci, got);
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
1261
  	if (retry_op > HAVE_RETRIED && ret >= 0) {
83701246a   Yan, Zheng   ceph: sync read i...
1262
1263
1264
1265
  		int statret;
  		struct page *page = NULL;
  		loff_t i_size;
  		if (retry_op == READ_INLINE) {
687265e5a   Yan, Zheng   ceph: switch some...
1266
  			page = __page_cache_alloc(GFP_KERNEL);
83701246a   Yan, Zheng   ceph: sync read i...
1267
1268
1269
  			if (!page)
  				return -ENOMEM;
  		}
6a026589b   Sage Weil   ceph: fix sync re...
1270

83701246a   Yan, Zheng   ceph: sync read i...
1271
1272
1273
  		statret = __ceph_do_getattr(inode, page,
  					    CEPH_STAT_CAP_INLINE_DATA, !!page);
  		if (statret < 0) {
0d7718f66   Nikolay Borisov   ceph: fix error h...
1274
1275
  			if (page)
  				__free_page(page);
83701246a   Yan, Zheng   ceph: sync read i...
1276
1277
1278
1279
1280
1281
  			if (statret == -ENODATA) {
  				BUG_ON(retry_op != READ_INLINE);
  				goto again;
  			}
  			return statret;
  		}
6a026589b   Sage Weil   ceph: fix sync re...
1282

83701246a   Yan, Zheng   ceph: sync read i...
1283
1284
  		i_size = i_size_read(inode);
  		if (retry_op == READ_INLINE) {
fcc02d2a0   Yan, Zheng   ceph: fix reading...
1285
1286
  			BUG_ON(ret > 0 || read > 0);
  			if (iocb->ki_pos < i_size &&
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
1287
  			    iocb->ki_pos < PAGE_SIZE) {
83701246a   Yan, Zheng   ceph: sync read i...
1288
1289
  				loff_t end = min_t(loff_t, i_size,
  						   iocb->ki_pos + len);
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
1290
  				end = min_t(loff_t, end, PAGE_SIZE);
83701246a   Yan, Zheng   ceph: sync read i...
1291
1292
1293
1294
1295
1296
  				if (statret < end)
  					zero_user_segment(page, statret, end);
  				ret = copy_page_to_iter(page,
  						iocb->ki_pos & ~PAGE_MASK,
  						end - iocb->ki_pos, to);
  				iocb->ki_pos += ret;
fcc02d2a0   Yan, Zheng   ceph: fix reading...
1297
1298
1299
1300
1301
1302
1303
1304
  				read += ret;
  			}
  			if (iocb->ki_pos < i_size && read < len) {
  				size_t zlen = min_t(size_t, len - read,
  						    i_size - iocb->ki_pos);
  				ret = iov_iter_zero(zlen, to);
  				iocb->ki_pos += ret;
  				read += ret;
83701246a   Yan, Zheng   ceph: sync read i...
1305
1306
  			}
  			__free_pages(page, 0);
fcc02d2a0   Yan, Zheng   ceph: fix reading...
1307
  			return read;
83701246a   Yan, Zheng   ceph: sync read i...
1308
  		}
6a026589b   Sage Weil   ceph: fix sync re...
1309
1310
  
  		/* hit EOF or hole? */
83701246a   Yan, Zheng   ceph: sync read i...
1311
  		if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
fcc02d2a0   Yan, Zheng   ceph: fix reading...
1312
  		    ret < len) {
8eb4efb09   majianpeng   ceph: implement r...
1313
  			dout("sync_read hit hole, ppos %lld < size %lld"
99c88e690   Yan, Zheng   ceph: use i_size_...
1314
1315
  			     ", reading more
  ", iocb->ki_pos, i_size);
8eb4efb09   majianpeng   ceph: implement r...
1316

6a026589b   Sage Weil   ceph: fix sync re...
1317
  			read += ret;
6a026589b   Sage Weil   ceph: fix sync re...
1318
  			len -= ret;
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
1319
  			retry_op = HAVE_RETRIED;
6a026589b   Sage Weil   ceph: fix sync re...
1320
1321
1322
  			goto again;
  		}
  	}
8eb4efb09   majianpeng   ceph: implement r...
1323

6a026589b   Sage Weil   ceph: fix sync re...
1324
1325
  	if (ret >= 0)
  		ret += read;
124e68e74   Sage Weil   ceph: file operat...
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
  	return ret;
  }
  
  /*
   * Take cap references to avoid releasing caps to MDS mid-write.
   *
   * If we are synchronous, and write with an old snap context, the OSD
   * may return EOLDSNAPC.  In that case, retry the write.. _after_
   * dropping our cap refs and allowing the pending snap to logically
   * complete _before_ this write occurs.
   *
   * If we are near ENOSPC, write synchronously.
   */
4908b822b   Al Viro   ceph: switch to -...
1339
  static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
124e68e74   Sage Weil   ceph: file operat...
1340
1341
  {
  	struct file *file = iocb->ki_filp;
33caad324   Sage Weil   ceph: perform laz...
1342
  	struct ceph_file_info *fi = file->private_data;
496ad9aa8   Al Viro   new helper: file_...
1343
  	struct inode *inode = file_inode(file);
124e68e74   Sage Weil   ceph: file operat...
1344
  	struct ceph_inode_info *ci = ceph_inode(inode);
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
1345
1346
  	struct ceph_osd_client *osdc =
  		&ceph_sb_to_client(inode->i_sb)->client->osdc;
f66fd9f09   Yan, Zheng   ceph: pre-allocat...
1347
  	struct ceph_cap_flush *prealloc_cf;
3309dd04c   Al Viro   switch generic_wr...
1348
  	ssize_t count, written = 0;
03d254ede   Yan, Zheng   ceph: apply write...
1349
  	int err, want, got;
3309dd04c   Al Viro   switch generic_wr...
1350
  	loff_t pos;
124e68e74   Sage Weil   ceph: file operat...
1351
1352
1353
  
  	if (ceph_snap(inode) != CEPH_NOSNAP)
  		return -EROFS;
f66fd9f09   Yan, Zheng   ceph: pre-allocat...
1354
1355
1356
  	prealloc_cf = ceph_alloc_cap_flush();
  	if (!prealloc_cf)
  		return -ENOMEM;
5955102c9   Al Viro   wrappers for ->i_...
1357
  	inode_lock(inode);
03d254ede   Yan, Zheng   ceph: apply write...
1358

03d254ede   Yan, Zheng   ceph: apply write...
1359
  	/* We can write back this queue in page reclaim */
de1414a65   Christoph Hellwig   fs: export inode_...
1360
  	current->backing_dev_info = inode_to_bdi(inode);
03d254ede   Yan, Zheng   ceph: apply write...
1361

55b0b31cb   Yan, Zheng   ceph: get inode s...
1362
1363
1364
1365
1366
  	if (iocb->ki_flags & IOCB_APPEND) {
  		err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
  		if (err < 0)
  			goto out;
  	}
3309dd04c   Al Viro   switch generic_wr...
1367
1368
  	err = generic_write_checks(iocb, from);
  	if (err <= 0)
03d254ede   Yan, Zheng   ceph: apply write...
1369
  		goto out;
3309dd04c   Al Viro   switch generic_wr...
1370
1371
  	pos = iocb->ki_pos;
  	count = iov_iter_count(from);
5fa8e0a1c   Jan Kara   fs: Rename file_r...
1372
  	err = file_remove_privs(file);
03d254ede   Yan, Zheng   ceph: apply write...
1373
1374
1375
1376
1377
1378
  	if (err)
  		goto out;
  
  	err = file_update_time(file);
  	if (err)
  		goto out;
28127bdd2   Yan, Zheng   ceph: convert inl...
1379
1380
1381
1382
1383
  	if (ci->i_inline_version != CEPH_INLINE_NONE) {
  		err = ceph_uninline_data(file, NULL);
  		if (err < 0)
  			goto out;
  	}
124e68e74   Sage Weil   ceph: file operat...
1384
  retry_snap:
b7ec35b30   Ilya Dryomov   libceph: change c...
1385
  	if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) {
03d254ede   Yan, Zheng   ceph: apply write...
1386
  		err = -ENOSPC;
6070e0c1e   Yan, Zheng   ceph: don't early...
1387
1388
  		goto out;
  	}
03d254ede   Yan, Zheng   ceph: apply write...
1389

ac7f29bf2   Randy Dunlap   ceph: fix printk ...
1390
1391
  	dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu
  ",
99c88e690   Yan, Zheng   ceph: use i_size_...
1392
  	     inode, ceph_vinop(inode), pos, count, i_size_read(inode));
7971bd92b   Sage Weil   ceph: revert comm...
1393
1394
1395
1396
  	if (fi->fmode & CEPH_FILE_MODE_LAZY)
  		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
  	else
  		want = CEPH_CAP_FILE_BUFFER;
03d254ede   Yan, Zheng   ceph: apply write...
1397
  	got = 0;
3738daa68   Yan, Zheng   ceph: fetch inlin...
1398
1399
  	err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count,
  			    &got, NULL);
03d254ede   Yan, Zheng   ceph: apply write...
1400
  	if (err < 0)
37505d576   Yan, Zheng   ceph: take i_mute...
1401
  		goto out;
124e68e74   Sage Weil   ceph: file operat...
1402

ac7f29bf2   Randy Dunlap   ceph: fix printk ...
1403
1404
  	dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s
  ",
03d254ede   Yan, Zheng   ceph: apply write...
1405
  	     inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
7971bd92b   Sage Weil   ceph: revert comm...
1406
1407
  
  	if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
2ba48ce51   Al Viro   mirror O_APPEND a...
1408
  	    (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
5dda377cf   Yan, Zheng   ceph: set i_head_...
1409
  		struct ceph_snap_context *snapc;
4908b822b   Al Viro   ceph: switch to -...
1410
  		struct iov_iter data;
5955102c9   Al Viro   wrappers for ->i_...
1411
  		inode_unlock(inode);
5dda377cf   Yan, Zheng   ceph: set i_head_...
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
  
  		spin_lock(&ci->i_ceph_lock);
  		if (__ceph_have_pending_cap_snap(ci)) {
  			struct ceph_cap_snap *capsnap =
  					list_last_entry(&ci->i_cap_snaps,
  							struct ceph_cap_snap,
  							ci_item);
  			snapc = ceph_get_snap_context(capsnap->context);
  		} else {
  			BUG_ON(!ci->i_head_snapc);
  			snapc = ceph_get_snap_context(ci->i_head_snapc);
  		}
  		spin_unlock(&ci->i_ceph_lock);
4908b822b   Al Viro   ceph: switch to -...
1425
1426
  		/* we might need to revert back to that point */
  		data = *from;
2ba48ce51   Al Viro   mirror O_APPEND a...
1427
  		if (iocb->ki_flags & IOCB_DIRECT)
c8fe9b17d   Yan, Zheng   ceph: Asynchronou...
1428
1429
  			written = ceph_direct_read_write(iocb, &data, snapc,
  							 &prealloc_cf);
e8344e668   majianpeng   ceph: Implement w...
1430
  		else
5dda377cf   Yan, Zheng   ceph: set i_head_...
1431
  			written = ceph_sync_write(iocb, &data, pos, snapc);
0e5dd45ce   majianpeng   ceph: Move the pl...
1432
1433
1434
1435
1436
  		if (written == -EOLDSNAPC) {
  			dout("aio_write %p %llx.%llx %llu~%u"
  				"got EOLDSNAPC, retrying
  ",
  				inode, ceph_vinop(inode),
4908b822b   Al Viro   ceph: switch to -...
1437
  				pos, (unsigned)count);
5955102c9   Al Viro   wrappers for ->i_...
1438
  			inode_lock(inode);
0e5dd45ce   majianpeng   ceph: Move the pl...
1439
1440
  			goto retry_snap;
  		}
4908b822b   Al Viro   ceph: switch to -...
1441
1442
  		if (written > 0)
  			iov_iter_advance(from, written);
5dda377cf   Yan, Zheng   ceph: set i_head_...
1443
  		ceph_put_snap_context(snapc);
7971bd92b   Sage Weil   ceph: revert comm...
1444
  	} else {
b0d7c2231   Yan, Zheng   ceph: introduce i...
1445
1446
1447
1448
1449
1450
1451
  		/*
  		 * No need to acquire the i_truncate_mutex. Because
  		 * the MDS revokes Fwb caps before sending truncate
  		 * message to us. We can't get Fwb cap while there
  		 * are pending vmtruncate. So write and vmtruncate
  		 * can not run at the same time
  		 */
4908b822b   Al Viro   ceph: switch to -...
1452
  		written = generic_perform_write(file, from, pos);
aec605f42   Al Viro   ceph_aio_write():...
1453
1454
  		if (likely(written >= 0))
  			iocb->ki_pos = pos + written;
5955102c9   Al Viro   wrappers for ->i_...
1455
  		inode_unlock(inode);
7971bd92b   Sage Weil   ceph: revert comm...
1456
  	}
d8de9ab63   Sage Weil   ceph: avoid carry...
1457

03d254ede   Yan, Zheng   ceph: apply write...
1458
  	if (written >= 0) {
fca65b4ad   Sage Weil   ceph: do not call...
1459
  		int dirty;
be655596b   Sage Weil   ceph: use i_ceph_...
1460
  		spin_lock(&ci->i_ceph_lock);
28127bdd2   Yan, Zheng   ceph: convert inl...
1461
  		ci->i_inline_version = CEPH_INLINE_NONE;
f66fd9f09   Yan, Zheng   ceph: pre-allocat...
1462
1463
  		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
  					       &prealloc_cf);
be655596b   Sage Weil   ceph: use i_ceph_...
1464
  		spin_unlock(&ci->i_ceph_lock);
fca65b4ad   Sage Weil   ceph: do not call...
1465
1466
  		if (dirty)
  			__mark_inode_dirty(inode, dirty);
124e68e74   Sage Weil   ceph: file operat...
1467
  	}
7971bd92b   Sage Weil   ceph: revert comm...
1468

124e68e74   Sage Weil   ceph: file operat...
1469
1470
  	dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s
  ",
4908b822b   Al Viro   ceph: switch to -...
1471
  	     inode, ceph_vinop(inode), pos, (unsigned)count,
7971bd92b   Sage Weil   ceph: revert comm...
1472
  	     ceph_cap_string(got));
124e68e74   Sage Weil   ceph: file operat...
1473
  	ceph_put_cap_refs(ci, got);
7971bd92b   Sage Weil   ceph: revert comm...
1474

6aa657c85   Christoph Hellwig   ceph: use generic...
1475
  	if (written >= 0) {
b7ec35b30   Ilya Dryomov   libceph: change c...
1476
  		if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_NEARFULL))
6aa657c85   Christoph Hellwig   ceph: use generic...
1477
1478
1479
  			iocb->ki_flags |= IOCB_DSYNC;
  
  		written = generic_write_sync(iocb, written);
6070e0c1e   Yan, Zheng   ceph: don't early...
1480
  	}
03d254ede   Yan, Zheng   ceph: apply write...
1481

2f75e9e17   Sage Weil   ceph: replace hol...
1482
  	goto out_unlocked;
03d254ede   Yan, Zheng   ceph: apply write...
1483
  out:
5955102c9   Al Viro   wrappers for ->i_...
1484
  	inode_unlock(inode);
2f75e9e17   Sage Weil   ceph: replace hol...
1485
  out_unlocked:
f66fd9f09   Yan, Zheng   ceph: pre-allocat...
1486
  	ceph_free_cap_flush(prealloc_cf);
03d254ede   Yan, Zheng   ceph: apply write...
1487
  	current->backing_dev_info = NULL;
03d254ede   Yan, Zheng   ceph: apply write...
1488
  	return written ? written : err;
124e68e74   Sage Weil   ceph: file operat...
1489
1490
1491
1492
1493
  }
  
  /*
   * llseek.  be sure to verify file size on SEEK_END.
   */
965c8e59c   Andrew Morton   lseek: the "whenc...
1494
  static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
124e68e74   Sage Weil   ceph: file operat...
1495
1496
  {
  	struct inode *inode = file->f_mapping->host;
99c88e690   Yan, Zheng   ceph: use i_size_...
1497
  	loff_t i_size;
955818cd5   Phil Turnbull   ceph: Correctly r...
1498
  	loff_t ret;
124e68e74   Sage Weil   ceph: file operat...
1499

5955102c9   Al Viro   wrappers for ->i_...
1500
  	inode_lock(inode);
6a82c47aa   Sage Weil   ceph: fix SEEK_CU...
1501

965c8e59c   Andrew Morton   lseek: the "whenc...
1502
  	if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
508b32d86   Yan, Zheng   ceph: request xat...
1503
  		ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
955818cd5   Phil Turnbull   ceph: Correctly r...
1504
  		if (ret < 0)
124e68e74   Sage Weil   ceph: file operat...
1505
  			goto out;
06222e491   Josef Bacik   fs: handle SEEK_H...
1506
  	}
99c88e690   Yan, Zheng   ceph: use i_size_...
1507
  	i_size = i_size_read(inode);
965c8e59c   Andrew Morton   lseek: the "whenc...
1508
  	switch (whence) {
06222e491   Josef Bacik   fs: handle SEEK_H...
1509
  	case SEEK_END:
99c88e690   Yan, Zheng   ceph: use i_size_...
1510
  		offset += i_size;
124e68e74   Sage Weil   ceph: file operat...
1511
1512
1513
1514
1515
1516
1517
1518
1519
  		break;
  	case SEEK_CUR:
  		/*
  		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
  		 * position-querying operation.  Avoid rewriting the "same"
  		 * f_pos value back to the file because a concurrent read(),
  		 * write() or lseek() might have altered it
  		 */
  		if (offset == 0) {
955818cd5   Phil Turnbull   ceph: Correctly r...
1520
  			ret = file->f_pos;
124e68e74   Sage Weil   ceph: file operat...
1521
1522
1523
1524
  			goto out;
  		}
  		offset += file->f_pos;
  		break;
06222e491   Josef Bacik   fs: handle SEEK_H...
1525
  	case SEEK_DATA:
99c88e690   Yan, Zheng   ceph: use i_size_...
1526
  		if (offset >= i_size) {
06222e491   Josef Bacik   fs: handle SEEK_H...
1527
1528
1529
1530
1531
  			ret = -ENXIO;
  			goto out;
  		}
  		break;
  	case SEEK_HOLE:
99c88e690   Yan, Zheng   ceph: use i_size_...
1532
  		if (offset >= i_size) {
06222e491   Josef Bacik   fs: handle SEEK_H...
1533
1534
1535
  			ret = -ENXIO;
  			goto out;
  		}
99c88e690   Yan, Zheng   ceph: use i_size_...
1536
  		offset = i_size;
06222e491   Josef Bacik   fs: handle SEEK_H...
1537
  		break;
124e68e74   Sage Weil   ceph: file operat...
1538
  	}
955818cd5   Phil Turnbull   ceph: Correctly r...
1539
  	ret = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
124e68e74   Sage Weil   ceph: file operat...
1540
1541
  
  out:
5955102c9   Al Viro   wrappers for ->i_...
1542
  	inode_unlock(inode);
955818cd5   Phil Turnbull   ceph: Correctly r...
1543
  	return ret;
124e68e74   Sage Weil   ceph: file operat...
1544
  }
ad7a60de8   Li Wang   ceph: punch hole ...
1545
1546
1547
1548
  /*
   * Zero @size bytes of the page-cache page that contains @offset, starting
   * at @offset's position inside that page.
   *
   * If find_lock_page() does not return a page, nothing is done.  Otherwise
   * any writeback on the page is waited for before zeroing, and the page is
   * unlocked and released afterwards.
   */
  static inline void ceph_zero_partial_page(
  	struct inode *inode, loff_t offset, unsigned size)
  {
  	pgoff_t pg_index = offset >> PAGE_SHIFT;
  	struct page *pg = find_lock_page(inode->i_mapping, pg_index);

  	if (!pg)
  		return;

  	wait_on_page_writeback(pg);
  	zero_user(pg, offset & (PAGE_SIZE - 1), size);
  	unlock_page(pg);
  	put_page(pg);
  }
  
  /*
   * Clear the page-cache contents for the byte range
   * [@offset, @offset + @length) in three steps:
   *
   *  - head:   partial page up to the next page boundary is zeroed in place
   *  - middle: whole pages are dropped with truncate_pagecache_range()
   *  - tail:   whatever remains (less than a page) is zeroed in place
   */
  static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
  				      loff_t length)
  {
  	loff_t boundary = round_up(offset, PAGE_SIZE);
  	loff_t chunk;

  	/* head: @offset is not page aligned */
  	if (offset < boundary) {
  		chunk = boundary - offset;
  		if (length < chunk)
  			chunk = length;
  		ceph_zero_partial_page(inode, offset, chunk);
  		offset += chunk;
  		length -= chunk;
  	}

  	/* middle: as many complete pages as fit in what is left */
  	if (length >= PAGE_SIZE) {
  		chunk = round_down(length, PAGE_SIZE);
  		truncate_pagecache_range(inode, offset, offset + chunk - 1);
  		offset += chunk;
  		length -= chunk;
  	}

  	/* tail: remaining bytes at the start of the last page */
  	if (length)
  		ceph_zero_partial_page(inode, offset, length);
  }
  
  /*
   * Send one synchronous OSD request that clears data at @offset.
   *
   * With a non-NULL @length the request is a CEPH_OSD_OP_ZERO over *@length
   * bytes.  With @length == NULL the request uses a zero length and the
   * opcode depends on @offset: CEPH_OSD_OP_DELETE when @offset is non-zero,
   * CEPH_OSD_OP_TRUNCATE when it is zero.
   *
   * @length is handed to ceph_osdc_new_request() by pointer, and the callers
   * in this file read it back afterwards, so it is presumably trimmed to
   * what a single request covers -- confirm against libceph.
   *
   * Returns 0 on success or a negative errno.  -ENOENT from waiting on the
   * request is treated as success.
   */
  static int ceph_zero_partial_object(struct inode *inode,
  				    loff_t offset, loff_t *length)
  {
  	struct ceph_inode_info *ci = ceph_inode(inode);
  	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
  	struct ceph_osd_request *req;
  	loff_t no_len = 0;
  	int opcode = CEPH_OSD_OP_ZERO;
  	int rc;

  	if (!length) {
  		opcode = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE;
  		length = &no_len;
  	}

  	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
  				    ceph_vino(inode),
  				    offset, length,
  				    0, 1, opcode,
  				    CEPH_OSD_FLAG_WRITE |
  				    CEPH_OSD_FLAG_ONDISK,
  				    NULL, 0, 0, false);
  	if (IS_ERR(req))
  		return PTR_ERR(req);

  	req->r_mtime = inode->i_mtime;

  	rc = ceph_osdc_start_request(&fsc->client->osdc, req, false);
  	if (rc == 0) {
  		rc = ceph_osdc_wait_request(&fsc->client->osdc, req);
  		/* nothing stored at that location: already "zero" */
  		if (rc == -ENOENT)
  			rc = 0;
  	}
  	ceph_osdc_put_request(req);

  	return rc;
  }
  
  /*
   * Zero the byte range [@offset, @offset + @length) in the backing objects.
   *
   * The range is processed in three phases relative to the object-set size
   * (object_size * stripe_count from the inode layout):
   *
   *  - bytes before the next object-set boundary are zeroed piecewise
   *  - every complete object set issues one whole-object request per
   *    stripe (NULL length, see ceph_zero_partial_object())
   *  - the remaining tail is zeroed piecewise
   *
   * Returns 0 on success or the first negative errno reported by
   * ceph_zero_partial_object().
   */
  static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
  {
  	int ret = 0;
  	struct ceph_inode_info *ci = ceph_inode(inode);
  	s32 stripe_unit = ci->i_layout.stripe_unit;
  	s32 stripe_count = ci->i_layout.stripe_count;
  	s32 object_size = ci->i_layout.object_size;
  	/*
  	 * Widen before multiplying: both factors are s32, so the product
  	 * would otherwise be computed (and could overflow) in 32 bits
  	 * before being stored in the u64.
  	 */
  	u64 object_set_size = (u64)object_size * stripe_count;
  	u64 nearly, t;

  	/* round offset up to next period boundary */
  	nearly = offset + object_set_size - 1;
  	t = nearly;
  	nearly -= do_div(t, object_set_size);

  	/* head: up to the first object-set boundary */
  	while (length && offset < nearly) {
  		loff_t size = length;
  		ret = ceph_zero_partial_object(inode, offset, &size);
  		if (ret < 0)
  			return ret;
  		offset += size;
  		length -= size;
  	}
  	/* middle: whole object sets, one request per stripe */
  	while (length >= object_set_size) {
  		int i;
  		loff_t pos = offset;
  		for (i = 0; i < stripe_count; ++i) {
  			ret = ceph_zero_partial_object(inode, pos, NULL);
  			if (ret < 0)
  				return ret;
  			pos += stripe_unit;
  		}
  		offset += object_set_size;
  		length -= object_set_size;
  	}
  	/* tail: whatever is left after the last full object set */
  	while (length) {
  		loff_t size = length;
  		ret = ceph_zero_partial_object(inode, offset, &size);
  		if (ret < 0)
  			return ret;
  		offset += size;
  		length -= size;
  	}
  	return ret;
  }
  
  /*
   * fallocate.  Only FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE are
   * accepted, and only on regular files; anything else is -EOPNOTSUPP.
   *
   * Under the inode lock:
   *  - snapshot inodes are rejected with -EROFS
   *  - a FULL osdmap yields -ENOSPC unless a hole is being punched
   *  - inline data is uninlined first
   *  - write caps are taken (buffer, plus lazyio for lazy file mode)
   *
   * Punching a hole zeroes the page cache (when the range starts inside
   * the current size) and then the backing objects.  Otherwise, if the
   * requested end lies beyond the current size, the inode size is extended.
   *
   * On success the FILE_WR cap is marked dirty.  Returns 0 or a negative
   * errno.
   */
  static long ceph_fallocate(struct file *file, int mode,
  				loff_t offset, loff_t length)
  {
  	struct ceph_file_info *fi = file->private_data;
  	struct inode *inode = file_inode(file);
  	struct ceph_inode_info *ci = ceph_inode(inode);
  	struct ceph_osd_client *osdc =
  		&ceph_inode_to_client(inode)->client->osdc;
  	struct ceph_cap_flush *prealloc_cf;
  	loff_t cur_size;
  	loff_t new_end = 0;
  	int wanted;
  	int held = 0;
  	int rc = 0;

  	if ((mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) ||
  	    !S_ISREG(inode->i_mode))
  		return -EOPNOTSUPP;

  	prealloc_cf = ceph_alloc_cap_flush();
  	if (!prealloc_cf)
  		return -ENOMEM;

  	inode_lock(inode);

  	if (ceph_snap(inode) != CEPH_NOSNAP) {
  		rc = -EROFS;
  		goto unlock;
  	}

  	if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) &&
  	    !(mode & FALLOC_FL_PUNCH_HOLE)) {
  		rc = -ENOSPC;
  		goto unlock;
  	}

  	if (ci->i_inline_version != CEPH_INLINE_NONE) {
  		rc = ceph_uninline_data(file, NULL);
  		if (rc < 0)
  			goto unlock;
  	}

  	cur_size = i_size_read(inode);
  	/* with KEEP_SIZE the end offset passed to the cap request stays 0 */
  	if (!(mode & FALLOC_FL_KEEP_SIZE))
  		new_end = offset + length;

  	wanted = CEPH_CAP_FILE_BUFFER;
  	if (fi->fmode & CEPH_FILE_MODE_LAZY)
  		wanted |= CEPH_CAP_FILE_LAZYIO;

  	rc = ceph_get_caps(ci, CEPH_CAP_FILE_WR, wanted, new_end, &held, NULL);
  	if (rc < 0)
  		goto unlock;

  	if (mode & FALLOC_FL_PUNCH_HOLE) {
  		if (offset < cur_size)
  			ceph_zero_pagecache_range(inode, offset, length);
  		rc = ceph_zero_objects(inode, offset, length);
  	} else if (new_end > cur_size) {
  		truncate_pagecache_range(inode, cur_size, -1);
  		if (ceph_inode_set_size(inode, new_end))
  			ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
  	}

  	if (!rc) {
  		int dirty;

  		spin_lock(&ci->i_ceph_lock);
  		ci->i_inline_version = CEPH_INLINE_NONE;
  		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
  					       &prealloc_cf);
  		spin_unlock(&ci->i_ceph_lock);
  		if (dirty)
  			__mark_inode_dirty(inode, dirty);
  	}

  	ceph_put_cap_refs(ci, held);
  unlock:
  	inode_unlock(inode);
  	ceph_free_cap_flush(prealloc_cf);
  	return rc;
  }
124e68e74   Sage Weil   ceph: file operat...
1745
1746
1747
1748
  const struct file_operations ceph_file_fops = {
  	.open = ceph_open,
  	.release = ceph_release,
  	.llseek = ceph_llseek,
3644424dc   Al Viro   ceph: switch to -...
1749
  	.read_iter = ceph_read_iter,
4908b822b   Al Viro   ceph: switch to -...
1750
  	.write_iter = ceph_write_iter,
124e68e74   Sage Weil   ceph: file operat...
1751
1752
  	.mmap = ceph_mmap,
  	.fsync = ceph_fsync,
40819f6fb   Greg Farnum   ceph: add flock/f...
1753
1754
  	.lock = ceph_lock,
  	.flock = ceph_flock,
3551dd79a   Al Viro   ceph: switch to i...
1755
  	.splice_write = iter_file_splice_write,
124e68e74   Sage Weil   ceph: file operat...
1756
1757
  	.unlocked_ioctl = ceph_ioctl,
  	.compat_ioctl	= ceph_ioctl,
ad7a60de8   Li Wang   ceph: punch hole ...
1758
  	.fallocate	= ceph_fallocate,
124e68e74   Sage Weil   ceph: file operat...
1759
  };