Blame view

fs/ceph/mds_client.c 132 KB
b24413180   Greg Kroah-Hartman   License cleanup: ...
1
  // SPDX-License-Identifier: GPL-2.0
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
2
  #include <linux/ceph/ceph_debug.h>
2f2dc0534   Sage Weil   ceph: MDS client
3

496e59553   Sage Weil   ceph: switch from...
4
  #include <linux/fs.h>
2f2dc0534   Sage Weil   ceph: MDS client
5
  #include <linux/wait.h>
5a0e3ad6a   Tejun Heo   include cleanup: ...
6
  #include <linux/slab.h>
54008399d   Yan, Zheng   ceph: preallocate...
7
  #include <linux/gfp.h>
2f2dc0534   Sage Weil   ceph: MDS client
8
  #include <linux/sched.h>
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
9
10
  #include <linux/debugfs.h>
  #include <linux/seq_file.h>
3e0708b99   Yan, Zheng   ceph: ratelimit w...
11
  #include <linux/ratelimit.h>
9ba1e2245   Xiubo Li   ceph: allocate th...
12
  #include <linux/bits.h>
70c948206   Xiubo Li   ceph: add metadat...
13
  #include <linux/ktime.h>
2f2dc0534   Sage Weil   ceph: MDS client
14

2f2dc0534   Sage Weil   ceph: MDS client
15
  #include "super.h"
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
16
  #include "mds_client.h"
1fe60e51a   Sage Weil   libceph: move fea...
17
  #include <linux/ceph/ceph_features.h>
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
18
19
20
21
22
  #include <linux/ceph/messenger.h>
  #include <linux/ceph/decode.h>
  #include <linux/ceph/pagelist.h>
  #include <linux/ceph/auth.h>
  #include <linux/ceph/debugfs.h>
2f2dc0534   Sage Weil   ceph: MDS client
23

81c5a1487   Yan, Zheng   ceph: split large...
24
  #define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE)
2f2dc0534   Sage Weil   ceph: MDS client
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
  /*
   * A cluster of MDS (metadata server) daemons is responsible for
   * managing the file system namespace (the directory hierarchy and
   * inodes) and for coordinating shared access to storage.  Metadata is
   * partitioned hierarchically across a number of servers, and that
   * partition varies over time as the cluster adjusts the distribution
   * in order to balance load.
   *
   * The MDS client is primarily responsible for managing synchronous
   * metadata requests for operations like open, unlink, and so forth.
   * If there is a MDS failure, we find out about it when we (possibly
   * request and) receive a new MDS map, and can resubmit affected
   * requests.
   *
   * For the most part, though, we take advantage of a lossless
   * communications channel to the MDS, and do not need to worry about
   * timing out or resubmitting requests.
   *
   * We maintain a stateful "session" with each MDS we interact with.
   * Within each session, we send periodic heartbeat messages to ensure
   * any capabilities or leases we have been issued remain valid.  If
   * the session times out and goes stale, our leases and capabilities
   * are no longer valid.
   */
20cb34ae9   Sage Weil   ceph: support v2 ...
49
  struct ceph_reconnect_state {
81c5a1487   Yan, Zheng   ceph: split large...
50
51
  	struct ceph_mds_session *session;
  	int nr_caps, nr_realms;
20cb34ae9   Sage Weil   ceph: support v2 ...
52
  	struct ceph_pagelist *pagelist;
121f22a19   Yan, Zheng   ceph: update cap ...
53
  	unsigned msg_version;
81c5a1487   Yan, Zheng   ceph: split large...
54
  	bool allow_multi;
20cb34ae9   Sage Weil   ceph: support v2 ...
55
  };
2f2dc0534   Sage Weil   ceph: MDS client
56
57
  static void __wake_requests(struct ceph_mds_client *mdsc,
  			    struct list_head *head);
e3ec8d689   Yan, Zheng   ceph: send cap re...
58
  static void ceph_cap_release_work(struct work_struct *work);
37c4efc1d   Yan, Zheng   ceph: periodicall...
59
  static void ceph_cap_reclaim_work(struct work_struct *work);
2f2dc0534   Sage Weil   ceph: MDS client
60

9e32789f6   Tobias Klauser   ceph: Storage cla...
61
  static const struct ceph_connection_operations mds_con_ops;
2f2dc0534   Sage Weil   ceph: MDS client
62
63
64
65
66
  
  
  /*
   * mds reply parsing
   */
b37fe1f92   Yan, Zheng   ceph: support ver...
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
  /*
   * Decode the versioned quota sub-structure into info->max_bytes and
   * info->max_files.
   *
   * On success *p is left at the end of the encoded structure, so any
   * trailing fields this code does not know about are stepped over.
   * Returns 0 on success or -EIO if the encoding is short or uses an
   * unsupported version/compat pair.
   */
  static int parse_reply_info_quota(void **p, void *end,
  				  struct ceph_mds_reply_info_in *info)
  {
  	u32 payload_len;
  	u8 version, compat;

  	ceph_decode_8_safe(p, end, version, bad);
  	ceph_decode_8_safe(p, end, compat, bad);
  	/* Only a non-zero version with compat level 1 is understood. */
  	if (version == 0 || compat != 1)
  		goto bad;

  	ceph_decode_32_safe(p, end, payload_len, bad);
  	ceph_decode_need(p, end, payload_len, bad);
  	/* From here on, decoding is bounded by the encoded length. */
  	end = *p + payload_len;

  	ceph_decode_64_safe(p, end, info->max_bytes, bad);
  	ceph_decode_64_safe(p, end, info->max_files, bad);

  	*p = end;
  	return 0;

  bad:
  	return -EIO;
  }
2f2dc0534   Sage Weil   ceph: MDS client
89
90
91
92
  /*
   * parse individual inode info
   */
  static int parse_reply_info_in(void **p, void *end,
14303d20f   Sage Weil   ceph: implement D...
93
  			       struct ceph_mds_reply_info_in *info,
12b4629a9   Ilya Dryomov   libceph: all feat...
94
  			       u64 features)
2f2dc0534   Sage Weil   ceph: MDS client
95
  {
b37fe1f92   Yan, Zheng   ceph: support ver...
96
97
  	int err = 0;
  	u8 struct_v = 0;
2f2dc0534   Sage Weil   ceph: MDS client
98

b37fe1f92   Yan, Zheng   ceph: support ver...
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
  	if (features == (u64)-1) {
  		u32 struct_len;
  		u8 struct_compat;
  		ceph_decode_8_safe(p, end, struct_v, bad);
  		ceph_decode_8_safe(p, end, struct_compat, bad);
  		/* struct_v is expected to be >= 1. we only understand
  		 * encoding with struct_compat == 1. */
  		if (!struct_v || struct_compat != 1)
  			goto bad;
  		ceph_decode_32_safe(p, end, struct_len, bad);
  		ceph_decode_need(p, end, struct_len, bad);
  		end = *p + struct_len;
  	}
  
  	ceph_decode_need(p, end, sizeof(struct ceph_mds_reply_inode), bad);
2f2dc0534   Sage Weil   ceph: MDS client
114
115
116
117
118
119
120
121
122
  	info->in = *p;
  	*p += sizeof(struct ceph_mds_reply_inode) +
  		sizeof(*info->in->fragtree.splits) *
  		le32_to_cpu(info->in->fragtree.nsplits);
  
  	ceph_decode_32_safe(p, end, info->symlink_len, bad);
  	ceph_decode_need(p, end, info->symlink_len, bad);
  	info->symlink = *p;
  	*p += info->symlink_len;
23c625ce3   Ilya Dryomov   libceph: assume a...
123
124
  	ceph_decode_copy_safe(p, end, &info->dir_layout,
  			      sizeof(info->dir_layout), bad);
2f2dc0534   Sage Weil   ceph: MDS client
125
126
127
128
  	ceph_decode_32_safe(p, end, info->xattr_len, bad);
  	ceph_decode_need(p, end, info->xattr_len, bad);
  	info->xattr_data = *p;
  	*p += info->xattr_len;
fb01d1f8b   Yan, Zheng   ceph: parse inlin...
129

b37fe1f92   Yan, Zheng   ceph: support ver...
130
131
  	if (features == (u64)-1) {
  		/* inline data */
fb01d1f8b   Yan, Zheng   ceph: parse inlin...
132
133
134
135
136
  		ceph_decode_64_safe(p, end, info->inline_version, bad);
  		ceph_decode_32_safe(p, end, info->inline_len, bad);
  		ceph_decode_need(p, end, info->inline_len, bad);
  		info->inline_data = *p;
  		*p += info->inline_len;
b37fe1f92   Yan, Zheng   ceph: support ver...
137
138
139
140
141
142
143
144
145
146
147
  		/* quota */
  		err = parse_reply_info_quota(p, end, info);
  		if (err < 0)
  			goto out_bad;
  		/* pool namespace */
  		ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
  		if (info->pool_ns_len > 0) {
  			ceph_decode_need(p, end, info->pool_ns_len, bad);
  			info->pool_ns_data = *p;
  			*p += info->pool_ns_len;
  		}
245ce991c   Jeff Layton   ceph: add btime f...
148
149
150
151
152
153
  
  		/* btime */
  		ceph_decode_need(p, end, sizeof(info->btime), bad);
  		ceph_decode_copy(p, &info->btime, sizeof(info->btime));
  
  		/* change attribute */
a35ead314   Jeff Layton   ceph: add change_...
154
  		ceph_decode_64_safe(p, end, info->change_attr, bad);
fb01d1f8b   Yan, Zheng   ceph: parse inlin...
155

08796873a   Yan, Zheng   ceph: support get...
156
157
158
159
160
161
  		/* dir pin */
  		if (struct_v >= 2) {
  			ceph_decode_32_safe(p, end, info->dir_pin, bad);
  		} else {
  			info->dir_pin = -ENODATA;
  		}
193e7b376   David Disseldorp   ceph: carry snaps...
162
163
164
165
166
167
168
169
  		/* snapshot birth time, remains zero for v<=2 */
  		if (struct_v >= 3) {
  			ceph_decode_need(p, end, sizeof(info->snap_btime), bad);
  			ceph_decode_copy(p, &info->snap_btime,
  					 sizeof(info->snap_btime));
  		} else {
  			memset(&info->snap_btime, 0, sizeof(info->snap_btime));
  		}
b37fe1f92   Yan, Zheng   ceph: support ver...
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
  		*p = end;
  	} else {
  		if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
  			ceph_decode_64_safe(p, end, info->inline_version, bad);
  			ceph_decode_32_safe(p, end, info->inline_len, bad);
  			ceph_decode_need(p, end, info->inline_len, bad);
  			info->inline_data = *p;
  			*p += info->inline_len;
  		} else
  			info->inline_version = CEPH_INLINE_NONE;
  
  		if (features & CEPH_FEATURE_MDS_QUOTA) {
  			err = parse_reply_info_quota(p, end, info);
  			if (err < 0)
  				goto out_bad;
  		} else {
  			info->max_bytes = 0;
  			info->max_files = 0;
  		}
  
  		info->pool_ns_len = 0;
  		info->pool_ns_data = NULL;
  		if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
  			ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
  			if (info->pool_ns_len > 0) {
  				ceph_decode_need(p, end, info->pool_ns_len, bad);
  				info->pool_ns_data = *p;
  				*p += info->pool_ns_len;
  			}
  		}
08796873a   Yan, Zheng   ceph: support get...
200

245ce991c   Jeff Layton   ceph: add btime f...
201
202
203
  		if (features & CEPH_FEATURE_FS_BTIME) {
  			ceph_decode_need(p, end, sizeof(info->btime), bad);
  			ceph_decode_copy(p, &info->btime, sizeof(info->btime));
a35ead314   Jeff Layton   ceph: add change_...
204
  			ceph_decode_64_safe(p, end, info->change_attr, bad);
245ce991c   Jeff Layton   ceph: add btime f...
205
  		}
08796873a   Yan, Zheng   ceph: support get...
206
  		info->dir_pin = -ENODATA;
193e7b376   David Disseldorp   ceph: carry snaps...
207
  		/* info->snap_btime remains zero */
b37fe1f92   Yan, Zheng   ceph: support ver...
208
209
210
211
212
213
214
215
216
217
218
219
220
  	}
  	return 0;
  bad:
  	err = -EIO;
  out_bad:
  	return err;
  }
  
  /*
   * Locate a dirfrag record in the reply and point *dirfrag at it.
   *
   * With features == (u64)-1 the record is wrapped in a versioned header
   * (struct_v, struct_compat, struct_len); decoding is then bounded by
   * struct_len and *p ends up at the end of the wrapped structure.
   * Otherwise *p is advanced past the fixed part plus ndist u32 entries.
   *
   * Returns 0 on success, -EIO on a short or unsupported encoding.
   */
  static int parse_reply_info_dir(void **p, void *end,
  				struct ceph_mds_reply_dirfrag **dirfrag,
  				u64 features)
  {
  	if (features == (u64)-1) {
  		u8 struct_v, struct_compat;
  		u32 struct_len;
  		ceph_decode_8_safe(p, end, struct_v, bad);
  		ceph_decode_8_safe(p, end, struct_compat, bad);
  		/* struct_v is expected to be >= 1. we only understand
  		 * encoding whose struct_compat == 1. */
  		if (!struct_v || struct_compat != 1)
  			goto bad;
  		ceph_decode_32_safe(p, end, struct_len, bad);
  		ceph_decode_need(p, end, struct_len, bad);
  		end = *p + struct_len;
  	}

  	ceph_decode_need(p, end, sizeof(**dirfrag), bad);
  	*dirfrag = *p;
  	*p += sizeof(**dirfrag) + sizeof(u32) * le32_to_cpu((*dirfrag)->ndist);
  	/* the variable-length dist[] tail must also fit */
  	if (unlikely(*p > end))
  		goto bad;
  	if (features == (u64)-1)
  		*p = end;
  	return 0;
  bad:
  	return -EIO;
  }
  
  /*
   * Locate a dentry lease record in the reply and point *lease at it.
   *
   * With features == (u64)-1 the record carries a versioned header and
   * *p is left at the end of the wrapped structure; otherwise *p is
   * advanced by sizeof(**lease).
   *
   * Returns 0 on success, -EIO on a short or unsupported encoding.
   */
  static int parse_reply_info_lease(void **p, void *end,
  				  struct ceph_mds_reply_lease **lease,
  				  u64 features)
  {
  	if (features == (u64)-1) {
  		u8 struct_v, struct_compat;
  		u32 struct_len;
  		ceph_decode_8_safe(p, end, struct_v, bad);
  		ceph_decode_8_safe(p, end, struct_compat, bad);
  		/* struct_v is expected to be >= 1. we only understand
  		 * encoding whose struct_compat == 1. */
  		if (!struct_v || struct_compat != 1)
  			goto bad;
  		ceph_decode_32_safe(p, end, struct_len, bad);
  		ceph_decode_need(p, end, struct_len, bad);
  		end = *p + struct_len;
  	}

  	ceph_decode_need(p, end, sizeof(**lease), bad);
  	*lease = *p;
  	*p += sizeof(**lease);
  	if (features == (u64)-1)
  		*p = end;
  	return 0;
  bad:
  	return -EIO;
  }
  
  /*
   * parse a normal reply, which may contain a (dir+)dentry and/or a
   * target inode.
   */
  static int parse_reply_info_trace(void **p, void *end,
14303d20f   Sage Weil   ceph: implement D...
277
  				  struct ceph_mds_reply_info_parsed *info,
12b4629a9   Ilya Dryomov   libceph: all feat...
278
  				  u64 features)
2f2dc0534   Sage Weil   ceph: MDS client
279
280
281
282
  {
  	int err;
  
  	if (info->head->is_dentry) {
14303d20f   Sage Weil   ceph: implement D...
283
  		err = parse_reply_info_in(p, end, &info->diri, features);
2f2dc0534   Sage Weil   ceph: MDS client
284
285
  		if (err < 0)
  			goto out_bad;
b37fe1f92   Yan, Zheng   ceph: support ver...
286
287
288
  		err = parse_reply_info_dir(p, end, &info->dirfrag, features);
  		if (err < 0)
  			goto out_bad;
2f2dc0534   Sage Weil   ceph: MDS client
289
290
291
292
293
  
  		ceph_decode_32_safe(p, end, info->dname_len, bad);
  		ceph_decode_need(p, end, info->dname_len, bad);
  		info->dname = *p;
  		*p += info->dname_len;
b37fe1f92   Yan, Zheng   ceph: support ver...
294
295
296
297
  
  		err = parse_reply_info_lease(p, end, &info->dlease, features);
  		if (err < 0)
  			goto out_bad;
2f2dc0534   Sage Weil   ceph: MDS client
298
299
300
  	}
  
  	if (info->head->is_target) {
14303d20f   Sage Weil   ceph: implement D...
301
  		err = parse_reply_info_in(p, end, &info->targeti, features);
2f2dc0534   Sage Weil   ceph: MDS client
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
  		if (err < 0)
  			goto out_bad;
  	}
  
  	if (unlikely(*p != end))
  		goto bad;
  	return 0;
  
  bad:
  	err = -EIO;
  out_bad:
  	pr_err("problem parsing mds trace %d
  ", err);
  	return err;
  }
  
  /*
   * parse readdir results
   */
b37fe1f92   Yan, Zheng   ceph: support ver...
321
  static int parse_reply_info_readdir(void **p, void *end,
14303d20f   Sage Weil   ceph: implement D...
322
  				struct ceph_mds_reply_info_parsed *info,
12b4629a9   Ilya Dryomov   libceph: all feat...
323
  				u64 features)
2f2dc0534   Sage Weil   ceph: MDS client
324
325
326
  {
  	u32 num, i = 0;
  	int err;
b37fe1f92   Yan, Zheng   ceph: support ver...
327
328
329
  	err = parse_reply_info_dir(p, end, &info->dir_dir, features);
  	if (err < 0)
  		goto out_bad;
2f2dc0534   Sage Weil   ceph: MDS client
330
331
  
  	ceph_decode_need(p, end, sizeof(num) + 2, bad);
c89136ea4   Sage Weil   ceph: convert enc...
332
  	num = ceph_decode_32(p);
956d39d63   Yan, Zheng   ceph: define 'end...
333
334
335
336
  	{
  		u16 flags = ceph_decode_16(p);
  		info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
  		info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
f3c4ebe65   Yan, Zheng   ceph: using hash ...
337
  		info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
79162547b   Yan, Zheng   ceph: make seeky ...
338
  		info->offset_hash = !!(flags & CEPH_READDIR_OFFSET_HASH);
956d39d63   Yan, Zheng   ceph: define 'end...
339
  	}
2f2dc0534   Sage Weil   ceph: MDS client
340
341
  	if (num == 0)
  		goto done;
2a5beea3f   Yan, Zheng   ceph: define stru...
342
343
344
  	BUG_ON(!info->dir_entries);
  	if ((unsigned long)(info->dir_entries + num) >
  	    (unsigned long)info->dir_entries + info->dir_buf_size) {
54008399d   Yan, Zheng   ceph: preallocate...
345
346
347
348
349
  		pr_err("dir contents are larger than expected
  ");
  		WARN_ON(1);
  		goto bad;
  	}
2f2dc0534   Sage Weil   ceph: MDS client
350

54008399d   Yan, Zheng   ceph: preallocate...
351
  	info->dir_nr = num;
2f2dc0534   Sage Weil   ceph: MDS client
352
  	while (num) {
2a5beea3f   Yan, Zheng   ceph: define stru...
353
  		struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
2f2dc0534   Sage Weil   ceph: MDS client
354
  		/* dentry */
b37fe1f92   Yan, Zheng   ceph: support ver...
355
  		ceph_decode_32_safe(p, end, rde->name_len, bad);
2a5beea3f   Yan, Zheng   ceph: define stru...
356
357
358
359
360
  		ceph_decode_need(p, end, rde->name_len, bad);
  		rde->name = *p;
  		*p += rde->name_len;
  		dout("parsed dir dname '%.*s'
  ", rde->name_len, rde->name);
2f2dc0534   Sage Weil   ceph: MDS client
361

b37fe1f92   Yan, Zheng   ceph: support ver...
362
363
364
365
  		/* dentry lease */
  		err = parse_reply_info_lease(p, end, &rde->lease, features);
  		if (err)
  			goto out_bad;
2f2dc0534   Sage Weil   ceph: MDS client
366
  		/* inode */
2a5beea3f   Yan, Zheng   ceph: define stru...
367
  		err = parse_reply_info_in(p, end, &rde->inode, features);
2f2dc0534   Sage Weil   ceph: MDS client
368
369
  		if (err < 0)
  			goto out_bad;
8974eebd3   Yan, Zheng   ceph: record 'off...
370
371
  		/* ceph_readdir_prepopulate() will update it */
  		rde->offset = 0;
2f2dc0534   Sage Weil   ceph: MDS client
372
373
374
375
376
  		i++;
  		num--;
  	}
  
  done:
1d3f87233   Jeff Layton   ceph: just skip u...
377
378
  	/* Skip over any unrecognized fields */
  	*p = end;
2f2dc0534   Sage Weil   ceph: MDS client
379
380
381
382
383
384
385
386
387
388
389
  	return 0;
  
  bad:
  	err = -EIO;
  out_bad:
  	pr_err("problem parsing dir contents %d
  ", err);
  	return err;
  }
  
  /*
25933abdd   Herb Shiu   ceph: Handle file...
390
391
392
   * parse fcntl F_GETLK results
   */
  static int parse_reply_info_filelock(void **p, void *end,
14303d20f   Sage Weil   ceph: implement D...
393
  				     struct ceph_mds_reply_info_parsed *info,
12b4629a9   Ilya Dryomov   libceph: all feat...
394
  				     u64 features)
25933abdd   Herb Shiu   ceph: Handle file...
395
396
397
398
399
  {
  	if (*p + sizeof(*info->filelock_reply) > end)
  		goto bad;
  
  	info->filelock_reply = *p;
25933abdd   Herb Shiu   ceph: Handle file...
400

1d3f87233   Jeff Layton   ceph: just skip u...
401
402
  	/* Skip over any unrecognized fields */
  	*p = end;
25933abdd   Herb Shiu   ceph: Handle file...
403
  	return 0;
25933abdd   Herb Shiu   ceph: Handle file...
404
405
406
  bad:
  	return -EIO;
  }
d48464878   Jeff Layton   ceph: decode inte...
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
  
  #if BITS_PER_LONG == 64
  
  #define DELEGATED_INO_AVAILABLE		xa_mk_value(1)
  
  static int ceph_parse_deleg_inos(void **p, void *end,
  				 struct ceph_mds_session *s)
  {
  	u32 sets;
  
  	ceph_decode_32_safe(p, end, sets, bad);
  	dout("got %u sets of delegated inodes
  ", sets);
  	while (sets--) {
  		u64 start, len, ino;
  
  		ceph_decode_64_safe(p, end, start, bad);
  		ceph_decode_64_safe(p, end, len, bad);
  		while (len--) {
  			int err = xa_insert(&s->s_delegated_inos, ino = start++,
  					    DELEGATED_INO_AVAILABLE,
  					    GFP_KERNEL);
  			if (!err) {
  				dout("added delegated inode 0x%llx
  ",
  				     start - 1);
  			} else if (err == -EBUSY) {
  				pr_warn("ceph: MDS delegated inode 0x%llx more than once.
  ",
  					start - 1);
  			} else {
  				return err;
  			}
  		}
  	}
  	return 0;
  bad:
  	return -EIO;
  }
  
  /*
   * Take one delegated inode number from the session.
   *
   * Walks s->s_delegated_inos, erasing each entry it visits, and returns
   * the index of the first entry whose erased value was
   * DELEGATED_INO_AVAILABLE.  Returns 0 when no such entry is found.
   */
  u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
  {
  	unsigned long ino;
  	void *entry;

  	xa_for_each(&s->s_delegated_inos, ino, entry) {
  		void *erased = xa_erase(&s->s_delegated_inos, ino);

  		if (erased != DELEGATED_INO_AVAILABLE)
  			continue;
  		return ino;
  	}

  	return 0;
  }
  
  /*
   * Put inode number @ino back into the session's delegated set, marked
   * DELEGATED_INO_AVAILABLE.  Returns whatever xa_insert() returns.
   */
  int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
  {
  	int err;

  	err = xa_insert(&s->s_delegated_inos, ino,
  			DELEGATED_INO_AVAILABLE, GFP_KERNEL);
  	return err;
  }
  #else /* BITS_PER_LONG == 64 */
  /*
   * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just
   * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top
   * and bottom words?
   */
  /*
   * 32-bit variant: delegated inodes are not tracked, so the encoded
   * (start, len) u64 pairs are only validated for length and skipped.
   *
   * Returns 0 on success, -EIO if the buffer cannot hold the advertised
   * number of sets.
   */
  static int ceph_parse_deleg_inos(void **p, void *end,
  				 struct ceph_mds_session *s)
  {
  	const size_t set_size = 2 * sizeof(__le64);
  	u32 sets;

  	ceph_decode_32_safe(p, end, sets, bad);
  	/*
  	 * This variant is only built when long is not 64 bits wide, where
  	 * "sets * set_size" can wrap and make a truncated message look
  	 * long enough.  Compare by division instead so the check itself
  	 * cannot overflow.
  	 */
  	if (sets > (size_t)(end - *p) / set_size)
  		goto bad;
  	if (sets)
  		ceph_decode_skip_n(p, end, sets * set_size, bad);
  	return 0;
  bad:
  	return -EIO;
  }
  
  /*
   * 32-bit stub: no delegated inodes are tracked in this configuration,
   * so there is never one to hand out.  Always returns 0.
   */
  u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
  {
  	return 0;
  }
  
  /*
   * 32-bit stub: nothing is tracked, so restoring is a no-op that always
   * reports success (0).
   */
  int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
  {
  	return 0;
  }
  #endif /* BITS_PER_LONG == 64 */
25933abdd   Herb Shiu   ceph: Handle file...
494
  /*
6e8575faa   Sam Lang   ceph: Check for c...
495
496
497
498
   * parse create results
   */
  static int parse_reply_info_create(void **p, void *end,
  				  struct ceph_mds_reply_info_parsed *info,
d48464878   Jeff Layton   ceph: decode inte...
499
  				  u64 features, struct ceph_mds_session *s)
6e8575faa   Sam Lang   ceph: Check for c...
500
  {
d48464878   Jeff Layton   ceph: decode inte...
501
  	int ret;
b37fe1f92   Yan, Zheng   ceph: support ver...
502
503
  	if (features == (u64)-1 ||
  	    (features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
6e8575faa   Sam Lang   ceph: Check for c...
504
  		if (*p == end) {
d48464878   Jeff Layton   ceph: decode inte...
505
  			/* Malformed reply? */
6e8575faa   Sam Lang   ceph: Check for c...
506
  			info->has_create_ino = false;
d48464878   Jeff Layton   ceph: decode inte...
507
508
509
  		} else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) {
  			u8 struct_v, struct_compat;
  			u32 len;
6e8575faa   Sam Lang   ceph: Check for c...
510
  			info->has_create_ino = true;
d48464878   Jeff Layton   ceph: decode inte...
511
512
513
514
515
516
517
518
519
  			ceph_decode_8_safe(p, end, struct_v, bad);
  			ceph_decode_8_safe(p, end, struct_compat, bad);
  			ceph_decode_32_safe(p, end, len, bad);
  			ceph_decode_64_safe(p, end, info->ino, bad);
  			ret = ceph_parse_deleg_inos(p, end, s);
  			if (ret)
  				return ret;
  		} else {
  			/* legacy */
1d3f87233   Jeff Layton   ceph: just skip u...
520
  			ceph_decode_64_safe(p, end, info->ino, bad);
d48464878   Jeff Layton   ceph: decode inte...
521
  			info->has_create_ino = true;
6e8575faa   Sam Lang   ceph: Check for c...
522
  		}
1d3f87233   Jeff Layton   ceph: just skip u...
523
524
525
  	} else {
  		if (*p != end)
  			goto bad;
6e8575faa   Sam Lang   ceph: Check for c...
526
  	}
1d3f87233   Jeff Layton   ceph: just skip u...
527
528
  	/* Skip over any unrecognized fields */
  	*p = end;
6e8575faa   Sam Lang   ceph: Check for c...
529
  	return 0;
6e8575faa   Sam Lang   ceph: Check for c...
530
531
532
533
534
  bad:
  	return -EIO;
  }
  
  /*
25933abdd   Herb Shiu   ceph: Handle file...
535
536
537
   * parse extra results
   */
  static int parse_reply_info_extra(void **p, void *end,
14303d20f   Sage Weil   ceph: implement D...
538
  				  struct ceph_mds_reply_info_parsed *info,
d48464878   Jeff Layton   ceph: decode inte...
539
  				  u64 features, struct ceph_mds_session *s)
25933abdd   Herb Shiu   ceph: Handle file...
540
  {
6df8c9d80   Jeff Layton   ceph: fix bad end...
541
542
543
  	u32 op = le32_to_cpu(info->head->op);
  
  	if (op == CEPH_MDS_OP_GETFILELOCK)
14303d20f   Sage Weil   ceph: implement D...
544
  		return parse_reply_info_filelock(p, end, info, features);
6df8c9d80   Jeff Layton   ceph: fix bad end...
545
  	else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
b37fe1f92   Yan, Zheng   ceph: support ver...
546
  		return parse_reply_info_readdir(p, end, info, features);
6df8c9d80   Jeff Layton   ceph: fix bad end...
547
  	else if (op == CEPH_MDS_OP_CREATE)
d48464878   Jeff Layton   ceph: decode inte...
548
  		return parse_reply_info_create(p, end, info, features, s);
6e8575faa   Sam Lang   ceph: Check for c...
549
550
  	else
  		return -EIO;
25933abdd   Herb Shiu   ceph: Handle file...
551
552
553
  }
  
  /*
2f2dc0534   Sage Weil   ceph: MDS client
554
555
   * parse entire mds reply
   */
d48464878   Jeff Layton   ceph: decode inte...
556
  static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
14303d20f   Sage Weil   ceph: implement D...
557
  			    struct ceph_mds_reply_info_parsed *info,
12b4629a9   Ilya Dryomov   libceph: all feat...
558
  			    u64 features)
2f2dc0534   Sage Weil   ceph: MDS client
559
560
561
562
563
564
565
566
567
568
569
570
  {
  	void *p, *end;
  	u32 len;
  	int err;
  
  	info->head = msg->front.iov_base;
  	p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
  	end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
  
  	/* trace */
  	ceph_decode_32_safe(&p, end, len, bad);
  	if (len > 0) {
32852a81b   Xi Wang   ceph: fix length ...
571
  		ceph_decode_need(&p, end, len, bad);
14303d20f   Sage Weil   ceph: implement D...
572
  		err = parse_reply_info_trace(&p, p+len, info, features);
2f2dc0534   Sage Weil   ceph: MDS client
573
574
575
  		if (err < 0)
  			goto out_bad;
  	}
25933abdd   Herb Shiu   ceph: Handle file...
576
  	/* extra */
2f2dc0534   Sage Weil   ceph: MDS client
577
578
  	ceph_decode_32_safe(&p, end, len, bad);
  	if (len > 0) {
32852a81b   Xi Wang   ceph: fix length ...
579
  		ceph_decode_need(&p, end, len, bad);
d48464878   Jeff Layton   ceph: decode inte...
580
  		err = parse_reply_info_extra(&p, p+len, info, features, s);
2f2dc0534   Sage Weil   ceph: MDS client
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
  		if (err < 0)
  			goto out_bad;
  	}
  
  	/* snap blob */
  	ceph_decode_32_safe(&p, end, len, bad);
  	info->snapblob_len = len;
  	info->snapblob = p;
  	p += len;
  
  	if (p != end)
  		goto bad;
  	return 0;
  
  bad:
  	err = -EIO;
  out_bad:
  	pr_err("mds parse_reply err %d
  ", err);
  	return err;
  }
  
  static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
  {
2a5beea3f   Yan, Zheng   ceph: define stru...
605
  	if (!info->dir_entries)
54008399d   Yan, Zheng   ceph: preallocate...
606
  		return;
2a5beea3f   Yan, Zheng   ceph: define stru...
607
  	free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
2f2dc0534   Sage Weil   ceph: MDS client
608
609
610
611
612
613
  }
  
  
  /*
   * sessions
   */
a687ecaf5   John Spray   ceph: export ceph...
614
  const char *ceph_session_state_name(int s)
2f2dc0534   Sage Weil   ceph: MDS client
615
616
617
618
619
620
621
  {
  	switch (s) {
  	case CEPH_MDS_SESSION_NEW: return "new";
  	case CEPH_MDS_SESSION_OPENING: return "opening";
  	case CEPH_MDS_SESSION_OPEN: return "open";
  	case CEPH_MDS_SESSION_HUNG: return "hung";
  	case CEPH_MDS_SESSION_CLOSING: return "closing";
4d681c2f9   Xiubo Li   ceph: keep the se...
622
  	case CEPH_MDS_SESSION_CLOSED: return "closed";
44ca18f26   Sage Weil   ceph: use rbtree ...
623
  	case CEPH_MDS_SESSION_RESTARTING: return "restarting";
2f2dc0534   Sage Weil   ceph: MDS client
624
  	case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
fcff415c9   Yan, Zheng   ceph: handle CEPH...
625
  	case CEPH_MDS_SESSION_REJECTED: return "rejected";
2f2dc0534   Sage Weil   ceph: MDS client
626
627
628
  	default: return "???";
  	}
  }
5b3248c67   Xiubo Li   ceph: rename get_...
629
  struct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s)
2f2dc0534   Sage Weil   ceph: MDS client
630
  {
3997c01d2   Elena Reshetova   ceph: convert cep...
631
  	if (refcount_inc_not_zero(&s->s_ref)) {
2f2dc0534   Sage Weil   ceph: MDS client
632
633
  		dout("mdsc get_session %p %d -> %d
  ", s,
3997c01d2   Elena Reshetova   ceph: convert cep...
634
  		     refcount_read(&s->s_ref)-1, refcount_read(&s->s_ref));
2f2dc0534   Sage Weil   ceph: MDS client
635
636
  		return s;
  	} else {
4c069a582   Chengguang Xu   ceph: add newline...
637
638
  		dout("mdsc get_session %p 0 -- FAIL
  ", s);
2f2dc0534   Sage Weil   ceph: MDS client
639
640
641
642
643
644
645
646
  		return NULL;
  	}
  }
  
  /*
   * Drop a reference on session @s.
   *
   * When the last reference goes away the session is torn down: its
   * authorizer (if any) is destroyed, a warning fires if s_mutex is still
   * held, the delegated-inode xarray is destroyed, and @s is freed.
   */
  void ceph_put_mds_session(struct ceph_mds_session *s)
  {
  	dout("mdsc put_session %p %d -> %d\n", s,
  	     refcount_read(&s->s_ref), refcount_read(&s->s_ref)-1);
  	if (refcount_dec_and_test(&s->s_ref)) {
  		if (s->s_auth.authorizer)
  			ceph_auth_destroy_authorizer(s->s_auth.authorizer);
  		WARN_ON(mutex_is_locked(&s->s_mutex));
  		xa_destroy(&s->s_delegated_inos);
  		kfree(s);
  	}
  }
  
  /*
   * called under mdsc->mutex
   */
  struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
  						   int mds)
  {
d37b1d994   Markus Elfring   ceph: adjust 36 c...
663
  	if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])
2f2dc0534   Sage Weil   ceph: MDS client
664
  		return NULL;
5b3248c67   Xiubo Li   ceph: rename get_...
665
  	return ceph_get_mds_session(mdsc->sessions[mds]);
2f2dc0534   Sage Weil   ceph: MDS client
666
667
668
669
  }
  
  static bool __have_session(struct ceph_mds_client *mdsc, int mds)
  {
98cfda810   Chengguang Xu   ceph: return prop...
670
  	if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])
2f2dc0534   Sage Weil   ceph: MDS client
671
  		return false;
98cfda810   Chengguang Xu   ceph: return prop...
672
673
  	else
  		return true;
2f2dc0534   Sage Weil   ceph: MDS client
674
  }
2600d2dd5   Sage Weil   ceph: drop messag...
675
676
677
678
679
680
681
682
  /*
   * Check that @s is the session currently stored in mdsc->sessions[]
   * under its own rank.  Returns 0 if so, -ENOENT otherwise.
   */
  static int __verify_registered_session(struct ceph_mds_client *mdsc,
  				       struct ceph_mds_session *s)
  {
  	/* the range test comes first so the table is never indexed out of bounds */
  	if (s->s_mds < mdsc->max_sessions &&
  	    mdsc->sessions[s->s_mds] == s)
  		return 0;

  	return -ENOENT;
  }
2f2dc0534   Sage Weil   ceph: MDS client
683
684
685
686
687
688
689
690
  /*
   * create+register a new session for given mds.
   * called under mdsc->mutex.
   */
  static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
  						 int mds)
  {
  	struct ceph_mds_session *s;
b38c9eb47   Xiubo Li   ceph: add possibl...
691
  	if (mds >= mdsc->mdsmap->possible_max_rank)
c338c07c5   Nathaniel Yazdani   ceph: fix null po...
692
  		return ERR_PTR(-EINVAL);
2f2dc0534   Sage Weil   ceph: MDS client
693
  	s = kzalloc(sizeof(*s), GFP_NOFS);
4736b009b   Dan Carpenter   ceph: handle kmal...
694
695
  	if (!s)
  		return ERR_PTR(-ENOMEM);
47474d0b0   Chengguang Xu   ceph: optimize md...
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
  
  	if (mds >= mdsc->max_sessions) {
  		int newmax = 1 << get_count_order(mds + 1);
  		struct ceph_mds_session **sa;
  
  		dout("%s: realloc to %d
  ", __func__, newmax);
  		sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
  		if (!sa)
  			goto fail_realloc;
  		if (mdsc->sessions) {
  			memcpy(sa, mdsc->sessions,
  			       mdsc->max_sessions * sizeof(void *));
  			kfree(mdsc->sessions);
  		}
  		mdsc->sessions = sa;
  		mdsc->max_sessions = newmax;
  	}
  
  	dout("%s: mds%d
  ", __func__, mds);
2f2dc0534   Sage Weil   ceph: MDS client
717
718
719
720
721
722
  	s->s_mdsc = mdsc;
  	s->s_mds = mds;
  	s->s_state = CEPH_MDS_SESSION_NEW;
  	s->s_ttl = 0;
  	s->s_seq = 0;
  	mutex_init(&s->s_mutex);
b7a9e5dd4   Sage Weil   libceph: set peer...
723
  	ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
2f2dc0534   Sage Weil   ceph: MDS client
724

d8fb02abd   Alex Elder   ceph: create a ne...
725
  	spin_lock_init(&s->s_gen_ttl_lock);
1e9c2eb68   Yan, Zheng   ceph: delete stal...
726
  	s->s_cap_gen = 1;
1ce208a6c   Alex Elder   ceph: don't reset...
727
  	s->s_cap_ttl = jiffies - 1;
d8fb02abd   Alex Elder   ceph: create a ne...
728
729
  
  	spin_lock_init(&s->s_cap_lock);
2f2dc0534   Sage Weil   ceph: MDS client
730
731
732
733
  	s->s_renew_requested = 0;
  	s->s_renew_seq = 0;
  	INIT_LIST_HEAD(&s->s_caps);
  	s->s_nr_caps = 0;
3997c01d2   Elena Reshetova   ceph: convert cep...
734
  	refcount_set(&s->s_ref, 1);
2f2dc0534   Sage Weil   ceph: MDS client
735
736
  	INIT_LIST_HEAD(&s->s_waiting);
  	INIT_LIST_HEAD(&s->s_unsafe);
d48464878   Jeff Layton   ceph: decode inte...
737
  	xa_init(&s->s_delegated_inos);
2f2dc0534   Sage Weil   ceph: MDS client
738
  	s->s_num_cap_releases = 0;
99a9c273b   Yan, Zheng   ceph: handle race...
739
  	s->s_cap_reconnect = 0;
7c1332b8c   Sage Weil   ceph: fix iterate...
740
  	s->s_cap_iterator = NULL;
2f2dc0534   Sage Weil   ceph: MDS client
741
  	INIT_LIST_HEAD(&s->s_cap_releases);
e3ec8d689   Yan, Zheng   ceph: send cap re...
742
  	INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work);
1cf03a68e   Jeff Layton   ceph: convert mds...
743
  	INIT_LIST_HEAD(&s->s_cap_dirty);
2f2dc0534   Sage Weil   ceph: MDS client
744
  	INIT_LIST_HEAD(&s->s_cap_flushing);
2f2dc0534   Sage Weil   ceph: MDS client
745

2f2dc0534   Sage Weil   ceph: MDS client
746
  	mdsc->sessions[mds] = s;
86d8f67b2   Yan, Zheng   ceph: avoid block...
747
  	atomic_inc(&mdsc->num_sessions);
3997c01d2   Elena Reshetova   ceph: convert cep...
748
  	refcount_inc(&s->s_ref);  /* one ref to sessions[], one to caller */
42ce56e50   Sage Weil   ceph: remove bad ...
749

b7a9e5dd4   Sage Weil   libceph: set peer...
750
751
  	ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
  		      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
42ce56e50   Sage Weil   ceph: remove bad ...
752

2f2dc0534   Sage Weil   ceph: MDS client
753
  	return s;
42ce56e50   Sage Weil   ceph: remove bad ...
754
755
756
757
  
  fail_realloc:
  	kfree(s);
  	return ERR_PTR(-ENOMEM);
2f2dc0534   Sage Weil   ceph: MDS client
758
759
760
761
762
  }
  
  /*
   * called under mdsc->mutex
   */
2600d2dd5   Sage Weil   ceph: drop messag...
763
  static void __unregister_session(struct ceph_mds_client *mdsc,
42ce56e50   Sage Weil   ceph: remove bad ...
764
  			       struct ceph_mds_session *s)
2f2dc0534   Sage Weil   ceph: MDS client
765
  {
2600d2dd5   Sage Weil   ceph: drop messag...
766
767
768
  	dout("__unregister_session mds%d %p
  ", s->s_mds, s);
  	BUG_ON(mdsc->sessions[s->s_mds] != s);
42ce56e50   Sage Weil   ceph: remove bad ...
769
770
771
  	mdsc->sessions[s->s_mds] = NULL;
  	ceph_con_close(&s->s_con);
  	ceph_put_mds_session(s);
86d8f67b2   Yan, Zheng   ceph: avoid block...
772
  	atomic_dec(&mdsc->num_sessions);
2f2dc0534   Sage Weil   ceph: MDS client
773
774
775
776
777
778
779
780
781
782
783
784
785
786
  }
  
  /*
   * Drop the session reference held by a request, if it has one, and
   * clear req->r_session.
   *
   * The caller should hold the last request ref, or hold mdsc->mutex.
   */
  static void put_request_session(struct ceph_mds_request *req)
  {
  	if (!req->r_session)
  		return;

  	ceph_put_mds_session(req->r_session);
  	req->r_session = NULL;
  }
153c8e6bf   Sage Weil   ceph: use kref fo...
787
  void ceph_mdsc_release_request(struct kref *kref)
2f2dc0534   Sage Weil   ceph: MDS client
788
  {
153c8e6bf   Sage Weil   ceph: use kref fo...
789
790
791
  	struct ceph_mds_request *req = container_of(kref,
  						    struct ceph_mds_request,
  						    r_kref);
e64f44a88   Xiubo Li   ceph: skip checki...
792
  	ceph_mdsc_release_dir_caps_no_check(req);
54008399d   Yan, Zheng   ceph: preallocate...
793
  	destroy_reply_info(&req->r_reply_info);
153c8e6bf   Sage Weil   ceph: use kref fo...
794
795
  	if (req->r_request)
  		ceph_msg_put(req->r_request);
54008399d   Yan, Zheng   ceph: preallocate...
796
  	if (req->r_reply)
153c8e6bf   Sage Weil   ceph: use kref fo...
797
  		ceph_msg_put(req->r_reply);
153c8e6bf   Sage Weil   ceph: use kref fo...
798
  	if (req->r_inode) {
41b02e1f9   Sage Weil   ceph: explicitly ...
799
  		ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
3e1d0452e   Yan, Zheng   ceph: avoid iput_...
800
801
  		/* avoid calling iput_final() in mds dispatch threads */
  		ceph_async_iput(req->r_inode);
153c8e6bf   Sage Weil   ceph: use kref fo...
802
  	}
9c1c2b35f   Jeff Layton   ceph: hold extra ...
803
  	if (req->r_parent) {
3dd69aabc   Jeff Layton   ceph: add a new f...
804
  		ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
9c1c2b35f   Jeff Layton   ceph: hold extra ...
805
806
  		ceph_async_iput(req->r_parent);
  	}
3e1d0452e   Yan, Zheng   ceph: avoid iput_...
807
  	ceph_async_iput(req->r_target_inode);
153c8e6bf   Sage Weil   ceph: use kref fo...
808
809
  	if (req->r_dentry)
  		dput(req->r_dentry);
844d87c33   Sage Weil   ceph: do not assu...
810
811
812
  	if (req->r_old_dentry)
  		dput(req->r_old_dentry);
  	if (req->r_old_dentry_dir) {
41b02e1f9   Sage Weil   ceph: explicitly ...
813
814
815
816
817
818
819
820
  		/*
  		 * track (and drop pins for) r_old_dentry_dir
  		 * separately, since r_old_dentry's d_parent may have
  		 * changed between the dir mutex being dropped and
  		 * this request being freed.
  		 */
  		ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
  				  CEPH_CAP_PIN);
3e1d0452e   Yan, Zheng   ceph: avoid iput_...
821
  		ceph_async_iput(req->r_old_dentry_dir);
2f2dc0534   Sage Weil   ceph: MDS client
822
  	}
153c8e6bf   Sage Weil   ceph: use kref fo...
823
824
  	kfree(req->r_path1);
  	kfree(req->r_path2);
25e6bae35   Yan, Zheng   ceph: use pagelis...
825
826
  	if (req->r_pagelist)
  		ceph_pagelist_release(req->r_pagelist);
153c8e6bf   Sage Weil   ceph: use kref fo...
827
  	put_request_session(req);
37151668b   Yehuda Sadeh   ceph: do caps acc...
828
  	ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
428138c98   Yan, Zheng   ceph: remove requ...
829
  	WARN_ON_ONCE(!list_empty(&req->r_wait));
058daab79   Jeff Layton   ceph: move to a d...
830
  	kmem_cache_free(ceph_mds_request_cachep, req);
2f2dc0534   Sage Weil   ceph: MDS client
831
  }
fcd00b68b   Ilya Dryomov   libceph: DEFINE_R...
832
  /*
   * Instantiate rbtree helpers for struct ceph_mds_request, keyed on
   * r_tid and linked through r_node.  Presumably this is what provides
   * the insert_request(), erase_request() and lookup_request() helpers
   * used in this file -- confirm against the DEFINE_RB_FUNCS definition.
   */
  DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
2f2dc0534   Sage Weil   ceph: MDS client
833
834
835
836
837
  /*
   * lookup session, bump ref if found.
   *
   * called under mdsc->mutex.
   */
fcd00b68b   Ilya Dryomov   libceph: DEFINE_R...
838
839
  static struct ceph_mds_request *
  lookup_get_request(struct ceph_mds_client *mdsc, u64 tid)
2f2dc0534   Sage Weil   ceph: MDS client
840
841
  {
  	struct ceph_mds_request *req;
44ca18f26   Sage Weil   ceph: use rbtree ...
842

fcd00b68b   Ilya Dryomov   libceph: DEFINE_R...
843
844
845
  	req = lookup_request(&mdsc->request_tree, tid);
  	if (req)
  		ceph_mdsc_get_request(req);
44ca18f26   Sage Weil   ceph: use rbtree ...
846

fcd00b68b   Ilya Dryomov   libceph: DEFINE_R...
847
  	return req;
2f2dc0534   Sage Weil   ceph: MDS client
848
849
850
851
852
853
854
855
856
857
858
859
  }
  
  /*
   * Register an in-flight request, and assign a tid.  Link to directory
   * are modifying (if any).
   *
   * Called under mdsc->mutex.
   */
  static void __register_request(struct ceph_mds_client *mdsc,
  			       struct ceph_mds_request *req,
  			       struct inode *dir)
  {
e30ee5812   Zhi Zhang   ceph: try to allo...
860
  	int ret = 0;
2f2dc0534   Sage Weil   ceph: MDS client
861
  	req->r_tid = ++mdsc->last_tid;
e30ee5812   Zhi Zhang   ceph: try to allo...
862
863
864
865
866
867
868
869
870
871
872
873
  	if (req->r_num_caps) {
  		ret = ceph_reserve_caps(mdsc, &req->r_caps_reservation,
  					req->r_num_caps);
  		if (ret < 0) {
  			pr_err("__register_request %p "
  			       "failed to reserve caps: %d
  ", req, ret);
  			/* set req->r_err to fail early from __do_request */
  			req->r_err = ret;
  			return;
  		}
  	}
2f2dc0534   Sage Weil   ceph: MDS client
874
875
876
  	dout("__register_request %p tid %lld
  ", req, req->r_tid);
  	ceph_mdsc_get_request(req);
fcd00b68b   Ilya Dryomov   libceph: DEFINE_R...
877
  	insert_request(&mdsc->request_tree, req);
2f2dc0534   Sage Weil   ceph: MDS client
878

cb4276cca   Sage Weil   ceph: fix uid/gid...
879
880
  	req->r_uid = current_fsuid();
  	req->r_gid = current_fsgid();
e8a7b8b12   Yan, Zheng   ceph: exclude set...
881
882
  	if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK)
  		mdsc->oldest_tid = req->r_tid;
2f2dc0534   Sage Weil   ceph: MDS client
883
  	if (dir) {
3db0a2fc5   Jeff Layton   ceph: register MD...
884
  		struct ceph_inode_info *ci = ceph_inode(dir);
3b6637803   Sage Weil   ceph: take refere...
885
  		ihold(dir);
2f2dc0534   Sage Weil   ceph: MDS client
886
  		req->r_unsafe_dir = dir;
3db0a2fc5   Jeff Layton   ceph: register MD...
887
888
889
  		spin_lock(&ci->i_unsafe_lock);
  		list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
  		spin_unlock(&ci->i_unsafe_lock);
2f2dc0534   Sage Weil   ceph: MDS client
890
891
892
893
894
895
896
897
  	}
  }
  
  static void __unregister_request(struct ceph_mds_client *mdsc,
  				 struct ceph_mds_request *req)
  {
  	dout("__unregister_request %p tid %lld
  ", req, req->r_tid);
e8a7b8b12   Yan, Zheng   ceph: exclude set...
898

df963ea8a   Jeff Layton   ceph: remove req ...
899
900
  	/* Never leave an unregistered request on an unsafe list! */
  	list_del_init(&req->r_unsafe_item);
e8a7b8b12   Yan, Zheng   ceph: exclude set...
901
902
903
904
905
906
907
908
909
910
911
912
913
  	if (req->r_tid == mdsc->oldest_tid) {
  		struct rb_node *p = rb_next(&req->r_node);
  		mdsc->oldest_tid = 0;
  		while (p) {
  			struct ceph_mds_request *next_req =
  				rb_entry(p, struct ceph_mds_request, r_node);
  			if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) {
  				mdsc->oldest_tid = next_req->r_tid;
  				break;
  			}
  			p = rb_next(p);
  		}
  	}
fcd00b68b   Ilya Dryomov   libceph: DEFINE_R...
914
  	erase_request(&mdsc->request_tree, req);
2f2dc0534   Sage Weil   ceph: MDS client
915

3db0a2fc5   Jeff Layton   ceph: register MD...
916
  	if (req->r_unsafe_dir) {
2f2dc0534   Sage Weil   ceph: MDS client
917
  		struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
2f2dc0534   Sage Weil   ceph: MDS client
918
919
920
  		spin_lock(&ci->i_unsafe_lock);
  		list_del_init(&req->r_unsafe_dir_item);
  		spin_unlock(&ci->i_unsafe_lock);
4c06ace81   Yan, Zheng   ceph: add request...
921
  	}
bc2de10dc   Jeff Layton   ceph: convert boo...
922
923
  	if (req->r_target_inode &&
  	    test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
68cd5b4b7   Yan, Zheng   ceph: make fsync(...
924
925
926
927
928
  		struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
  		spin_lock(&ci->i_unsafe_lock);
  		list_del_init(&req->r_unsafe_target_item);
  		spin_unlock(&ci->i_unsafe_lock);
  	}
3b6637803   Sage Weil   ceph: take refere...
929

4c06ace81   Yan, Zheng   ceph: add request...
930
  	if (req->r_unsafe_dir) {
3e1d0452e   Yan, Zheng   ceph: avoid iput_...
931
932
  		/* avoid calling iput_final() in mds dispatch threads */
  		ceph_async_iput(req->r_unsafe_dir);
3b6637803   Sage Weil   ceph: take refere...
933
  		req->r_unsafe_dir = NULL;
2f2dc0534   Sage Weil   ceph: MDS client
934
  	}
94aa8ae13   Sage Weil   ceph: fix use aft...
935

fc55d2c94   Yan, Zheng   ceph: wake up 'sa...
936
  	complete_all(&req->r_safe_completion);
94aa8ae13   Sage Weil   ceph: fix use aft...
937
  	ceph_mdsc_put_request(req);
2f2dc0534   Sage Weil   ceph: MDS client
938
939
940
  }
  
  /*
30c71233a   Jeff Layton   ceph: clean up un...
941
942
943
944
945
946
   * Walk back up the dentry tree until we hit a dentry representing a
   * non-snapshot inode. We do this using the rcu_read_lock (which must be held
   * when calling this) to ensure that the objects won't disappear while we're
   * working with them. Once we hit a candidate dentry, we attempt to take a
   * reference to it, and return that as the result.
   */
f10754803   Dan Carpenter   ceph: tidy some w...
947
948
949
  static struct inode *get_nonsnap_parent(struct dentry *dentry)
  {
  	struct inode *inode = NULL;
30c71233a   Jeff Layton   ceph: clean up un...
950
951
952
953
954
955
956
957
958
959
960
961
962
  
  	while (dentry && !IS_ROOT(dentry)) {
  		inode = d_inode_rcu(dentry);
  		if (!inode || ceph_snap(inode) == CEPH_NOSNAP)
  			break;
  		dentry = dentry->d_parent;
  	}
  	if (inode)
  		inode = igrab(inode);
  	return inode;
  }
  
  /*
2f2dc0534   Sage Weil   ceph: MDS client
963
964
965
966
967
968
969
970
   * Choose mds to send request to next.  If there is a hint set in the
   * request (e.g., due to a prior forward hint from the mds), use that.
   * Otherwise, consult frag tree and/or caps to identify the
   * appropriate mds.  If all else fails, choose randomly.
   *
   * Called under mdsc->mutex.
   */
  static int __choose_mds(struct ceph_mds_client *mdsc,
c4853e977   Xiubo Li   ceph: retry the s...
971
972
  			struct ceph_mds_request *req,
  			bool *random)
2f2dc0534   Sage Weil   ceph: MDS client
973
974
975
976
977
978
979
  {
  	struct inode *inode;
  	struct ceph_inode_info *ci;
  	struct ceph_cap *cap;
  	int mode = req->r_direct_mode;
  	int mds = -1;
  	u32 hash = req->r_direct_hash;
bc2de10dc   Jeff Layton   ceph: convert boo...
980
  	bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
2f2dc0534   Sage Weil   ceph: MDS client
981

c4853e977   Xiubo Li   ceph: retry the s...
982
983
  	if (random)
  		*random = false;
2f2dc0534   Sage Weil   ceph: MDS client
984
985
986
987
988
989
990
  	/*
  	 * is there a specific mds we should try?  ignore hint if we have
  	 * no session and the mds is not up (active or recovering).
  	 */
  	if (req->r_resend_mds >= 0 &&
  	    (__have_session(mdsc, req->r_resend_mds) ||
  	     ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
3c802092d   Xiubo Li   ceph: print r_dir...
991
992
  		dout("%s using resend_mds mds%d
  ", __func__,
2f2dc0534   Sage Weil   ceph: MDS client
993
994
995
996
997
998
999
1000
1001
  		     req->r_resend_mds);
  		return req->r_resend_mds;
  	}
  
  	if (mode == USE_RANDOM_MDS)
  		goto random;
  
  	inode = NULL;
  	if (req->r_inode) {
5d37ca148   Yan, Zheng   ceph: send LSSNAP...
1002
1003
1004
1005
  		if (ceph_snap(req->r_inode) != CEPH_SNAPDIR) {
  			inode = req->r_inode;
  			ihold(inode);
  		} else {
38f340ccd   Yan, Zheng   ceph: fix __choos...
1006
1007
1008
1009
  			/* req->r_dentry is non-null for LSSNAP request */
  			rcu_read_lock();
  			inode = get_nonsnap_parent(req->r_dentry);
  			rcu_read_unlock();
3c802092d   Xiubo Li   ceph: print r_dir...
1010
1011
  			dout("%s using snapdir's parent %p
  ", __func__, inode);
5d37ca148   Yan, Zheng   ceph: send LSSNAP...
1012
  		}
38f340ccd   Yan, Zheng   ceph: fix __choos...
1013
  	} else if (req->r_dentry) {
d79698da3   Sage Weil   ceph: document un...
1014
  		/* ignore race with rename; old or new d_parent is okay */
30c71233a   Jeff Layton   ceph: clean up un...
1015
1016
1017
1018
  		struct dentry *parent;
  		struct inode *dir;
  
  		rcu_read_lock();
41883ba8e   Yan, Zheng   ceph: use READ_ON...
1019
  		parent = READ_ONCE(req->r_dentry->d_parent);
3dd69aabc   Jeff Layton   ceph: add a new f...
1020
  		dir = req->r_parent ? : d_inode_rcu(parent);
eb6bb1c5b   Sage Weil   ceph: direct requ...
1021

30c71233a   Jeff Layton   ceph: clean up un...
1022
1023
  		if (!dir || dir->i_sb != mdsc->fsc->sb) {
  			/*  not this fs or parent went negative */
2b0143b5c   David Howells   VFS: normal files...
1024
  			inode = d_inode(req->r_dentry);
30c71233a   Jeff Layton   ceph: clean up un...
1025
1026
  			if (inode)
  				ihold(inode);
eb6bb1c5b   Sage Weil   ceph: direct requ...
1027
1028
1029
  		} else if (ceph_snap(dir) != CEPH_NOSNAP) {
  			/* direct snapped/virtual snapdir requests
  			 * based on parent dir inode */
30c71233a   Jeff Layton   ceph: clean up un...
1030
  			inode = get_nonsnap_parent(parent);
3c802092d   Xiubo Li   ceph: print r_dir...
1031
1032
  			dout("%s using nonsnap parent %p
  ", __func__, inode);
ca18bede0   Yan, Zheng   ceph: handle -EST...
1033
  		} else {
eb6bb1c5b   Sage Weil   ceph: direct requ...
1034
  			/* dentry target */
2b0143b5c   David Howells   VFS: normal files...
1035
  			inode = d_inode(req->r_dentry);
ca18bede0   Yan, Zheng   ceph: handle -EST...
1036
1037
  			if (!inode || mode == USE_AUTH_MDS) {
  				/* dir + name */
30c71233a   Jeff Layton   ceph: clean up un...
1038
  				inode = igrab(dir);
ca18bede0   Yan, Zheng   ceph: handle -EST...
1039
1040
  				hash = ceph_dentry_hash(dir, req->r_dentry);
  				is_hash = true;
30c71233a   Jeff Layton   ceph: clean up un...
1041
1042
  			} else {
  				ihold(inode);
ca18bede0   Yan, Zheng   ceph: handle -EST...
1043
  			}
2f2dc0534   Sage Weil   ceph: MDS client
1044
  		}
30c71233a   Jeff Layton   ceph: clean up un...
1045
  		rcu_read_unlock();
2f2dc0534   Sage Weil   ceph: MDS client
1046
  	}
eb6bb1c5b   Sage Weil   ceph: direct requ...
1047

3c802092d   Xiubo Li   ceph: print r_dir...
1048
1049
1050
  	dout("%s %p is_hash=%d (0x%x) mode %d
  ", __func__, inode, (int)is_hash,
  	     hash, mode);
2f2dc0534   Sage Weil   ceph: MDS client
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
  	if (!inode)
  		goto random;
  	ci = ceph_inode(inode);
  
  	if (is_hash && S_ISDIR(inode->i_mode)) {
  		struct ceph_inode_frag frag;
  		int found;
  
  		ceph_choose_frag(ci, hash, &frag, &found);
  		if (found) {
  			if (mode == USE_ANY_MDS && frag.ndist > 0) {
  				u8 r;
  
  				/* choose a random replica */
  				get_random_bytes(&r, 1);
  				r %= frag.ndist;
  				mds = frag.dist[r];
3c802092d   Xiubo Li   ceph: print r_dir...
1068
1069
1070
1071
  				dout("%s %p %llx.%llx frag %u mds%d (%d/%d)
  ",
  				     __func__, inode, ceph_vinop(inode),
  				     frag.frag, mds, (int)r, frag.ndist);
d66bbd441   Sage Weil   ceph: avoid picki...
1072
  				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
5d47648fe   Xiubo Li   ceph: only choose...
1073
1074
  				    CEPH_MDS_STATE_ACTIVE &&
  				    !ceph_mdsmap_is_laggy(mdsc->mdsmap, mds))
30c71233a   Jeff Layton   ceph: clean up un...
1075
  					goto out;
2f2dc0534   Sage Weil   ceph: MDS client
1076
1077
1078
1079
1080
  			}
  
  			/* since this file/dir wasn't known to be
  			 * replicated, then we want to look for the
  			 * authoritative mds. */
2f2dc0534   Sage Weil   ceph: MDS client
1081
1082
1083
  			if (frag.mds >= 0) {
  				/* choose auth mds */
  				mds = frag.mds;
3c802092d   Xiubo Li   ceph: print r_dir...
1084
1085
1086
1087
  				dout("%s %p %llx.%llx frag %u mds%d (auth)
  ",
  				     __func__, inode, ceph_vinop(inode),
  				     frag.frag, mds);
d66bbd441   Sage Weil   ceph: avoid picki...
1088
  				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
5d47648fe   Xiubo Li   ceph: only choose...
1089
  				    CEPH_MDS_STATE_ACTIVE) {
224c7b677   Yanhu Cao   ceph: use frag's ...
1090
  					if (!ceph_mdsmap_is_laggy(mdsc->mdsmap,
5d47648fe   Xiubo Li   ceph: only choose...
1091
1092
1093
  								  mds))
  						goto out;
  				}
2f2dc0534   Sage Weil   ceph: MDS client
1094
  			}
5d47648fe   Xiubo Li   ceph: only choose...
1095
  			mode = USE_AUTH_MDS;
2f2dc0534   Sage Weil   ceph: MDS client
1096
1097
  		}
  	}
be655596b   Sage Weil   ceph: use i_ceph_...
1098
  	spin_lock(&ci->i_ceph_lock);
2f2dc0534   Sage Weil   ceph: MDS client
1099
1100
1101
1102
1103
1104
  	cap = NULL;
  	if (mode == USE_AUTH_MDS)
  		cap = ci->i_auth_cap;
  	if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
  		cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
  	if (!cap) {
be655596b   Sage Weil   ceph: use i_ceph_...
1105
  		spin_unlock(&ci->i_ceph_lock);
3e1d0452e   Yan, Zheng   ceph: avoid iput_...
1106
  		ceph_async_iput(inode);
2f2dc0534   Sage Weil   ceph: MDS client
1107
1108
1109
  		goto random;
  	}
  	mds = cap->session->s_mds;
3c802092d   Xiubo Li   ceph: print r_dir...
1110
1111
  	dout("%s %p %llx.%llx mds%d (%scap %p)
  ", __func__,
2f2dc0534   Sage Weil   ceph: MDS client
1112
1113
  	     inode, ceph_vinop(inode), mds,
  	     cap == ci->i_auth_cap ? "auth " : "", cap);
be655596b   Sage Weil   ceph: use i_ceph_...
1114
  	spin_unlock(&ci->i_ceph_lock);
30c71233a   Jeff Layton   ceph: clean up un...
1115
  out:
3e1d0452e   Yan, Zheng   ceph: avoid iput_...
1116
1117
1118
  	/* avoid calling iput_final() while holding mdsc->mutex or
  	 * in mds dispatch threads */
  	ceph_async_iput(inode);
2f2dc0534   Sage Weil   ceph: MDS client
1119
1120
1121
  	return mds;
  
  random:
c4853e977   Xiubo Li   ceph: retry the s...
1122
1123
  	if (random)
  		*random = true;
2f2dc0534   Sage Weil   ceph: MDS client
1124
  	mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
3c802092d   Xiubo Li   ceph: print r_dir...
1125
1126
  	dout("%s chose random mds%d
  ", __func__, mds);
2f2dc0534   Sage Weil   ceph: MDS client
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
  	return mds;
  }
  
  
  /*
   * session messages
   */
  /*
   * Build a CEPH_MSG_CLIENT_SESSION message whose front holds only a
   * ceph_mds_session_head carrying @op and @seq in little-endian form.
   *
   * Returns the message, or NULL if it could not be allocated.
   */
  static struct ceph_msg *create_session_msg(u32 op, u64 seq)
  {
  	struct ceph_mds_session_head *head;
  	struct ceph_msg *msg;

  	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*head), GFP_NOFS,
  			   false);
  	if (!msg) {
  		pr_err("create_session_msg ENOMEM creating msg\n");
  		return NULL;
  	}

  	head = msg->front.iov_base;
  	head->op = cpu_to_le32(op);
  	head->seq = cpu_to_le64(seq);

  	return msg;
  }
9ba1e2245   Xiubo Li   ceph: allocate th...
1151
1152
  static const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED;
  #define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8)
b682c6d41   Xiubo Li   ceph: switch to W...
1153
  static int encode_supported_features(void **p, void *end)
342ce1823   Yan, Zheng   ceph: support cep...
1154
  {
9ba1e2245   Xiubo Li   ceph: allocate th...
1155
  	static const size_t count = ARRAY_SIZE(feature_bits);
342ce1823   Yan, Zheng   ceph: support cep...
1156
1157
1158
  
  	if (count > 0) {
  		size_t i;
9ba1e2245   Xiubo Li   ceph: allocate th...
1159
  		size_t size = FEATURE_BYTES(count);
342ce1823   Yan, Zheng   ceph: support cep...
1160

b682c6d41   Xiubo Li   ceph: switch to W...
1161
1162
  		if (WARN_ON_ONCE(*p + 4 + size > end))
  			return -ERANGE;
342ce1823   Yan, Zheng   ceph: support cep...
1163
1164
1165
  		ceph_encode_32(p, size);
  		memset(*p, 0, size);
  		for (i = 0; i < count; i++)
9ba1e2245   Xiubo Li   ceph: allocate th...
1166
  			((unsigned char*)(*p))[i / 8] |= BIT(feature_bits[i] % 8);
342ce1823   Yan, Zheng   ceph: support cep...
1167
1168
  		*p += size;
  	} else {
b682c6d41   Xiubo Li   ceph: switch to W...
1169
1170
  		if (WARN_ON_ONCE(*p + 4 > end))
  			return -ERANGE;
342ce1823   Yan, Zheng   ceph: support cep...
1171
1172
  		ceph_encode_32(p, 0);
  	}
b682c6d41   Xiubo Li   ceph: switch to W...
1173
1174
  
  	return 0;
342ce1823   Yan, Zheng   ceph: support cep...
1175
  }
3b4168dd8   Xiubo Li   ceph: send client...
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
  /*
   * Metric bit numbers this client advertises.  METRIC_BYTES() sizes the
   * bitmap from the LAST entry, so the list is expected to be in
   * ascending order; the size is rounded up to a multiple of 8 bytes.
   */
  static const unsigned char metric_bits[] = CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED;
  #define METRIC_BYTES(cnt) (DIV_ROUND_UP((size_t)metric_bits[(cnt) - 1] + 1, 64) * 8)

  /*
   * Encode the metric spec at *p: a two-byte header (version 1,
   * compat 1), a 32-bit length of the spec info, then the spec itself as
   * a 32-bit byte length followed by the metric bitmap.  *p is advanced
   * past what was written.
   *
   * Returns 0 on success, or -ERANGE (with a one-time warning) when the
   * encoding would run past @end.
   */
  static int encode_metric_spec(void **p, void *end)
  {
  	static const size_t count = ARRAY_SIZE(metric_bits);

  	/* header */
  	if (WARN_ON_ONCE(*p + 2 > end))
  		return -ERANGE;

  	ceph_encode_8(p, 1); /* version */
  	ceph_encode_8(p, 1); /* compat */

  	if (count > 0) {
  		size_t i;
  		size_t size = METRIC_BYTES(count);

  		if (WARN_ON_ONCE(*p + 4 + 4 + size > end))
  			return -ERANGE;

  		/* metric spec info length */
  		ceph_encode_32(p, 4 + size);

  		/* metric spec */
  		ceph_encode_32(p, size);
  		memset(*p, 0, size);
  		for (i = 0; i < count; i++) {
  			/*
  			 * Select byte and bit from the metric bit NUMBER,
  			 * not from the entry's position in metric_bits[].
  			 */
  			unsigned int bit = metric_bits[i];

  			((unsigned char *)(*p))[bit / 8] |= BIT(bit % 8);
  		}
  		*p += size;
  	} else {
  		if (WARN_ON_ONCE(*p + 4 + 4 > end))
  			return -ERANGE;

  		/* metric spec info length */
  		ceph_encode_32(p, 4);
  		/* metric spec */
  		ceph_encode_32(p, 0);
  	}

  	return 0;
  }
dbd0c8bf7   John Spray   ceph: send client...
1217
1218
1219
1220
1221
1222
1223
1224
1225
  /*
   * Build the CEPH_SESSION_REQUEST_OPEN session message.  Besides the
   * session head it carries, in this order:
   *   - client metadata as a map<string, string> (hostname,
   *     kernel_version, entity_id, root),
   *   - the supported-features bitmap,
   *   - the metric spec.
   *
   * The front buffer is sized up front from the same quantities that are
   * encoded afterwards, so the metadata copy is not bounds-checked
   * separately; the two encode_*() helpers verify against the buffer end.
   *
   * Returns the message, ERR_PTR(-ENOMEM) if it cannot be allocated, or
   * the ERR_PTR-wrapped error of a failing encode helper.
   */
  static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u64 seq)
  {
  	struct ceph_msg *msg;
  	struct ceph_mds_session_head *h;
  	int i;
  	int extra_bytes = 0;
  	int metadata_key_count = 0;
  	struct ceph_options *opt = mdsc->fsc->client->options;
  	struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
  	size_t size, count;
  	void *p, *end;
  	int ret;

  	const char *metadata[][2] = {
  		{"hostname", mdsc->nodename},
  		{"kernel_version", init_utsname()->release},
  		{"entity_id", opt->name ? : ""},
  		{"root", fsopt->server_path ? : "/"},
  		{NULL, NULL}
  	};

  	/* Calculate serialized length of metadata */
  	extra_bytes = 4;  /* map length */
  	for (i = 0; metadata[i][0]; ++i) {
  		/* two 32-bit length prefixes plus the key and value bytes */
  		extra_bytes += 8 + strlen(metadata[i][0]) +
  			strlen(metadata[i][1]);
  		metadata_key_count++;
  	}

  	/* supported feature */
  	size = 0;
  	count = ARRAY_SIZE(feature_bits);
  	if (count > 0)
  		size = FEATURE_BYTES(count);
  	extra_bytes += 4 + size;

  	/* metric spec */
  	size = 0;
  	count = ARRAY_SIZE(metric_bits);
  	if (count > 0)
  		size = METRIC_BYTES(count);
  	extra_bytes += 2 + 4 + 4 + size;

  	/* Allocate the message */
  	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
  			   GFP_NOFS, false);
  	if (!msg) {
  		/* name this function, not create_session_msg() */
  		pr_err("create_session_open_msg ENOMEM creating msg\n");
  		return ERR_PTR(-ENOMEM);
  	}
  	p = msg->front.iov_base;
  	end = p + msg->front.iov_len;

  	h = p;
  	h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN);
  	h->seq = cpu_to_le64(seq);

  	/*
  	 * Serialize client metadata into waiting buffer space, using
  	 * the format that userspace expects for map<string, string>
  	 *
  	 * ClientSession messages with metadata are v4
  	 */
  	msg->hdr.version = cpu_to_le16(4);
  	msg->hdr.compat_version = cpu_to_le16(1);

  	/* The write pointer, following the session_head structure */
  	p += sizeof(*h);

  	/* Number of entries in the map */
  	ceph_encode_32(&p, metadata_key_count);

  	/* Two length-prefixed strings for each entry in the map */
  	for (i = 0; metadata[i][0]; ++i) {
  		size_t const key_len = strlen(metadata[i][0]);
  		size_t const val_len = strlen(metadata[i][1]);

  		ceph_encode_32(&p, key_len);
  		memcpy(p, metadata[i][0], key_len);
  		p += key_len;
  		ceph_encode_32(&p, val_len);
  		memcpy(p, metadata[i][1], val_len);
  		p += val_len;
  	}

  	ret = encode_supported_features(&p, end);
  	if (ret) {
  		pr_err("encode_supported_features failed!\n");
  		ceph_msg_put(msg);
  		return ERR_PTR(ret);
  	}

  	ret = encode_metric_spec(&p, end);
  	if (ret) {
  		pr_err("encode_metric_spec failed!\n");
  		ceph_msg_put(msg);
  		return ERR_PTR(ret);
  	}

  	/* trim the front to what was actually written */
  	msg->front.iov_len = p - msg->front.iov_base;
  	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);

  	return msg;
  }
  
  /*
   * send session open request.
   *
   * called under mdsc->mutex
   */
  static int __open_session(struct ceph_mds_client *mdsc,
  			  struct ceph_mds_session *session)
  {
  	struct ceph_msg *msg;
  	int mstate;
  	int mds = session->s_mds;
2f2dc0534   Sage Weil   ceph: MDS client
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
  
  	/* wait for mds to go active? */
  	mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
  	dout("open_session to mds%d (%s)
  ", mds,
  	     ceph_mds_state_name(mstate));
  	session->s_state = CEPH_MDS_SESSION_OPENING;
  	session->s_renew_requested = jiffies;
  
  	/* send connect message */
dbd0c8bf7   John Spray   ceph: send client...
1345
  	msg = create_session_open_msg(mdsc, session->s_seq);
b682c6d41   Xiubo Li   ceph: switch to W...
1346
1347
  	if (IS_ERR(msg))
  		return PTR_ERR(msg);
2f2dc0534   Sage Weil   ceph: MDS client
1348
  	ceph_con_send(&session->s_con, msg);
2f2dc0534   Sage Weil   ceph: MDS client
1349
1350
1351
1352
  	return 0;
  }
  
  /*
ed0552a1a   Sage Weil   ceph: introduce h...
1353
1354
1355
1356
   * open sessions for any export targets for the given mds
   *
   * called under mdsc->mutex
   */
5d72d13c4   Yan, Zheng   ceph: add open ex...
1357
1358
1359
1360
  static struct ceph_mds_session *
  __open_export_target_session(struct ceph_mds_client *mdsc, int target)
  {
  	struct ceph_mds_session *session;
b682c6d41   Xiubo Li   ceph: switch to W...
1361
  	int ret;
5d72d13c4   Yan, Zheng   ceph: add open ex...
1362
1363
1364
1365
1366
1367
1368
1369
  
  	session = __ceph_lookup_mds_session(mdsc, target);
  	if (!session) {
  		session = register_session(mdsc, target);
  		if (IS_ERR(session))
  			return session;
  	}
  	if (session->s_state == CEPH_MDS_SESSION_NEW ||
b682c6d41   Xiubo Li   ceph: switch to W...
1370
1371
1372
1373
1374
  	    session->s_state == CEPH_MDS_SESSION_CLOSING) {
  		ret = __open_session(mdsc, session);
  		if (ret)
  			return ERR_PTR(ret);
  	}
5d72d13c4   Yan, Zheng   ceph: add open ex...
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
  
  	return session;
  }
  
  /*
   * Locked wrapper: take mdsc->mutex around __open_export_target_session().
   * Returns whatever the helper returns (a session or an ERR_PTR).
   */
  struct ceph_mds_session *
  ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target)
  {
  	struct ceph_mds_session *result;
  	struct mutex *lock = &mdsc->mutex;
  
  	dout("open_export_target_session to mds%d\n", target);
  
  	mutex_lock(lock);
  	result = __open_export_target_session(mdsc, target);
  	mutex_unlock(lock);
  
  	return result;
  }
ed0552a1a   Sage Weil   ceph: introduce h...
1393
1394
1395
1396
1397
1398
  static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
  					  struct ceph_mds_session *session)
  {
  	struct ceph_mds_info *mi;
  	struct ceph_mds_session *ts;
  	int i, mds = session->s_mds;
ed0552a1a   Sage Weil   ceph: introduce h...
1399

b38c9eb47   Xiubo Li   ceph: add possibl...
1400
  	if (mds >= mdsc->mdsmap->possible_max_rank)
ed0552a1a   Sage Weil   ceph: introduce h...
1401
  		return;
5d72d13c4   Yan, Zheng   ceph: add open ex...
1402

ed0552a1a   Sage Weil   ceph: introduce h...
1403
1404
1405
1406
1407
1408
  	mi = &mdsc->mdsmap->m_info[mds];
  	dout("open_export_target_sessions for mds%d (%d targets)
  ",
  	     session->s_mds, mi->num_export_targets);
  
  	for (i = 0; i < mi->num_export_targets; i++) {
5d72d13c4   Yan, Zheng   ceph: add open ex...
1409
1410
1411
  		ts = __open_export_target_session(mdsc, mi->export_targets[i]);
  		if (!IS_ERR(ts))
  			ceph_put_mds_session(ts);
ed0552a1a   Sage Weil   ceph: introduce h...
1412
1413
  	}
  }
154f42c2c   Sage Weil   ceph: connect to ...
1414
1415
1416
1417
1418
1419
1420
  /*
   * Locked wrapper: run __open_export_target_sessions() with mdsc->mutex
   * held.
   */
  void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
  					   struct ceph_mds_session *session)
  {
  	struct mutex *lock = &mdsc->mutex;
  
  	mutex_lock(lock);
  	__open_export_target_sessions(mdsc, session);
  	mutex_unlock(lock);
  }
ed0552a1a   Sage Weil   ceph: introduce h...
1421
  /*
2f2dc0534   Sage Weil   ceph: MDS client
1422
1423
   * session caps
   */
c8a96a31c   Jeff Layton   ceph: clean up sp...
1424
1425
  static void detach_cap_releases(struct ceph_mds_session *session,
  				struct list_head *target)
2f2dc0534   Sage Weil   ceph: MDS client
1426
  {
c8a96a31c   Jeff Layton   ceph: clean up sp...
1427
1428
1429
  	lockdep_assert_held(&session->s_cap_lock);
  
  	list_splice_init(&session->s_cap_releases, target);
745a8e3bc   Yan, Zheng   ceph: don't pre-a...
1430
  	session->s_num_cap_releases = 0;
c8a96a31c   Jeff Layton   ceph: clean up sp...
1431
1432
1433
  	dout("dispose_cap_releases mds%d
  ", session->s_mds);
  }
2f2dc0534   Sage Weil   ceph: MDS client
1434

c8a96a31c   Jeff Layton   ceph: clean up sp...
1435
1436
1437
1438
  static void dispose_cap_releases(struct ceph_mds_client *mdsc,
  				 struct list_head *dispose)
  {
  	while (!list_empty(dispose)) {
745a8e3bc   Yan, Zheng   ceph: don't pre-a...
1439
1440
  		struct ceph_cap *cap;
  		/* zero out the in-progress message */
c8a96a31c   Jeff Layton   ceph: clean up sp...
1441
  		cap = list_first_entry(dispose, struct ceph_cap, session_caps);
745a8e3bc   Yan, Zheng   ceph: don't pre-a...
1442
1443
  		list_del(&cap->session_caps);
  		ceph_put_cap(mdsc, cap);
2f2dc0534   Sage Weil   ceph: MDS client
1444
  	}
2f2dc0534   Sage Weil   ceph: MDS client
1445
  }
1c841a96b   Yan, Zheng   ceph: cleanup uns...
1446
1447
1448
1449
1450
  static void cleanup_session_requests(struct ceph_mds_client *mdsc,
  				     struct ceph_mds_session *session)
  {
  	struct ceph_mds_request *req;
  	struct rb_node *p;
f4b978662   Yan, Zheng   ceph: track and r...
1451
  	struct ceph_inode_info *ci;
1c841a96b   Yan, Zheng   ceph: cleanup uns...
1452
1453
1454
1455
1456
1457
1458
  
  	dout("cleanup_session_requests mds%d
  ", session->s_mds);
  	mutex_lock(&mdsc->mutex);
  	while (!list_empty(&session->s_unsafe)) {
  		req = list_first_entry(&session->s_unsafe,
  				       struct ceph_mds_request, r_unsafe_item);
3e0708b99   Yan, Zheng   ceph: ratelimit w...
1459
1460
1461
  		pr_warn_ratelimited(" dropping unsafe request %llu
  ",
  				    req->r_tid);
f4b978662   Yan, Zheng   ceph: track and r...
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
  		if (req->r_target_inode) {
  			/* dropping unsafe change of inode's attributes */
  			ci = ceph_inode(req->r_target_inode);
  			errseq_set(&ci->i_meta_err, -EIO);
  		}
  		if (req->r_unsafe_dir) {
  			/* dropping unsafe directory operation */
  			ci = ceph_inode(req->r_unsafe_dir);
  			errseq_set(&ci->i_meta_err, -EIO);
  		}
1c841a96b   Yan, Zheng   ceph: cleanup uns...
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
  		__unregister_request(mdsc, req);
  	}
  	/* zero r_attempts, so kick_requests() will re-send requests */
  	p = rb_first(&mdsc->request_tree);
  	while (p) {
  		req = rb_entry(p, struct ceph_mds_request, r_node);
  		p = rb_next(p);
  		if (req->r_session &&
  		    req->r_session->s_mds == session->s_mds)
  			req->r_attempts = 0;
  	}
  	mutex_unlock(&mdsc->mutex);
  }
2f2dc0534   Sage Weil   ceph: MDS client
1485
  /*
f818a7367   Sage Weil   ceph: fix cap rem...
1486
1487
   * Helper to safely iterate over all caps associated with a session, with
   * special care taken to handle a racing __ceph_remove_cap().
2f2dc0534   Sage Weil   ceph: MDS client
1488
   *
f818a7367   Sage Weil   ceph: fix cap rem...
1489
   * Caller must hold session s_mutex.
2f2dc0534   Sage Weil   ceph: MDS client
1490
   */
f5d772690   Jeff Layton   ceph: make iterat...
1491
1492
1493
  int ceph_iterate_session_caps(struct ceph_mds_session *session,
  			      int (*cb)(struct inode *, struct ceph_cap *,
  					void *), void *arg)
2f2dc0534   Sage Weil   ceph: MDS client
1494
  {
7c1332b8c   Sage Weil   ceph: fix iterate...
1495
1496
1497
1498
  	struct list_head *p;
  	struct ceph_cap *cap;
  	struct inode *inode, *last_inode = NULL;
  	struct ceph_cap *old_cap = NULL;
2f2dc0534   Sage Weil   ceph: MDS client
1499
1500
1501
1502
1503
  	int ret;
  
  	dout("iterate_session_caps %p mds%d
  ", session, session->s_mds);
  	spin_lock(&session->s_cap_lock);
7c1332b8c   Sage Weil   ceph: fix iterate...
1504
1505
1506
  	p = session->s_caps.next;
  	while (p != &session->s_caps) {
  		cap = list_entry(p, struct ceph_cap, session_caps);
2f2dc0534   Sage Weil   ceph: MDS client
1507
  		inode = igrab(&cap->ci->vfs_inode);
7c1332b8c   Sage Weil   ceph: fix iterate...
1508
1509
  		if (!inode) {
  			p = p->next;
2f2dc0534   Sage Weil   ceph: MDS client
1510
  			continue;
7c1332b8c   Sage Weil   ceph: fix iterate...
1511
1512
  		}
  		session->s_cap_iterator = cap;
2f2dc0534   Sage Weil   ceph: MDS client
1513
  		spin_unlock(&session->s_cap_lock);
7c1332b8c   Sage Weil   ceph: fix iterate...
1514
1515
  
  		if (last_inode) {
3e1d0452e   Yan, Zheng   ceph: avoid iput_...
1516
1517
1518
  			/* avoid calling iput_final() while holding
  			 * s_mutex or in mds dispatch threads */
  			ceph_async_iput(last_inode);
7c1332b8c   Sage Weil   ceph: fix iterate...
1519
1520
1521
  			last_inode = NULL;
  		}
  		if (old_cap) {
37151668b   Yehuda Sadeh   ceph: do caps acc...
1522
  			ceph_put_cap(session->s_mdsc, old_cap);
7c1332b8c   Sage Weil   ceph: fix iterate...
1523
1524
  			old_cap = NULL;
  		}
2f2dc0534   Sage Weil   ceph: MDS client
1525
  		ret = cb(inode, cap, arg);
7c1332b8c   Sage Weil   ceph: fix iterate...
1526
  		last_inode = inode;
2f2dc0534   Sage Weil   ceph: MDS client
1527
  		spin_lock(&session->s_cap_lock);
7c1332b8c   Sage Weil   ceph: fix iterate...
1528
  		p = p->next;
d37b1d994   Markus Elfring   ceph: adjust 36 c...
1529
  		if (!cap->ci) {
7c1332b8c   Sage Weil   ceph: fix iterate...
1530
1531
1532
1533
  			dout("iterate_session_caps  finishing cap %p removal
  ",
  			     cap);
  			BUG_ON(cap->session != session);
745a8e3bc   Yan, Zheng   ceph: don't pre-a...
1534
  			cap->session = NULL;
7c1332b8c   Sage Weil   ceph: fix iterate...
1535
1536
  			list_del_init(&cap->session_caps);
  			session->s_nr_caps--;
4f1d756de   Xiubo Li   ceph: add global ...
1537
  			atomic64_dec(&session->s_mdsc->metric.total_caps);
e3ec8d689   Yan, Zheng   ceph: send cap re...
1538
1539
1540
  			if (cap->queue_release)
  				__ceph_queue_cap_release(session, cap);
  			else
745a8e3bc   Yan, Zheng   ceph: don't pre-a...
1541
  				old_cap = cap;  /* put_cap it w/o locks held */
7c1332b8c   Sage Weil   ceph: fix iterate...
1542
  		}
5dacf0912   Sage Weil   ceph: do not touc...
1543
1544
  		if (ret < 0)
  			goto out;
2f2dc0534   Sage Weil   ceph: MDS client
1545
  	}
5dacf0912   Sage Weil   ceph: do not touc...
1546
1547
  	ret = 0;
  out:
7c1332b8c   Sage Weil   ceph: fix iterate...
1548
  	session->s_cap_iterator = NULL;
2f2dc0534   Sage Weil   ceph: MDS client
1549
  	spin_unlock(&session->s_cap_lock);
7c1332b8c   Sage Weil   ceph: fix iterate...
1550

3e1d0452e   Yan, Zheng   ceph: avoid iput_...
1551
  	ceph_async_iput(last_inode);
7c1332b8c   Sage Weil   ceph: fix iterate...
1552
  	if (old_cap)
37151668b   Yehuda Sadeh   ceph: do caps acc...
1553
  		ceph_put_cap(session->s_mdsc, old_cap);
7c1332b8c   Sage Weil   ceph: fix iterate...
1554

5dacf0912   Sage Weil   ceph: do not touc...
1555
  	return ret;
2f2dc0534   Sage Weil   ceph: MDS client
1556
1557
1558
  }
  
  static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
6c99f2545   Sage Weil   ceph: throw out d...
1559
  				  void *arg)
2f2dc0534   Sage Weil   ceph: MDS client
1560
  {
6c93df5db   Yan, Zheng   ceph: don't call ...
1561
  	struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
2f2dc0534   Sage Weil   ceph: MDS client
1562
  	struct ceph_inode_info *ci = ceph_inode(inode);
553adfd94   Yan, Zheng   ceph: track pendi...
1563
  	LIST_HEAD(to_remove);
f4b978662   Yan, Zheng   ceph: track and r...
1564
  	bool dirty_dropped = false;
6c93df5db   Yan, Zheng   ceph: don't call ...
1565
  	bool invalidate = false;
6c99f2545   Sage Weil   ceph: throw out d...
1566

2f2dc0534   Sage Weil   ceph: MDS client
1567
1568
1569
  	dout("removing cap %p, ci is %p, inode is %p
  ",
  	     cap, ci, &ci->vfs_inode);
be655596b   Sage Weil   ceph: use i_ceph_...
1570
  	spin_lock(&ci->i_ceph_lock);
a096b09ae   Yan, Zheng   ceph: queue cap r...
1571
  	__ceph_remove_cap(cap, false);
571ade336   Yan, Zheng   ceph: don't mark ...
1572
  	if (!ci->i_auth_cap) {
553adfd94   Yan, Zheng   ceph: track pendi...
1573
  		struct ceph_cap_flush *cf;
6c93df5db   Yan, Zheng   ceph: don't call ...
1574
  		struct ceph_mds_client *mdsc = fsc->mdsc;
6c99f2545   Sage Weil   ceph: throw out d...
1575

d468e729b   Yan, Zheng   ceph: add helper ...
1576
1577
1578
1579
1580
1581
  		if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
  			if (inode->i_data.nrpages > 0)
  				invalidate = true;
  			if (ci->i_wrbuffer_ref > 0)
  				mapping_set_error(&inode->i_data, -EIO);
  		}
6c93df5db   Yan, Zheng   ceph: don't call ...
1582

e4500b5e3   Yan, Zheng   ceph: use list in...
1583
1584
1585
  		while (!list_empty(&ci->i_cap_flush_list)) {
  			cf = list_first_entry(&ci->i_cap_flush_list,
  					      struct ceph_cap_flush, i_list);
8cdcc07dd   Wei Yongjun   ceph: use list_mo...
1586
  			list_move(&cf->i_list, &to_remove);
553adfd94   Yan, Zheng   ceph: track pendi...
1587
  		}
6c99f2545   Sage Weil   ceph: throw out d...
1588
  		spin_lock(&mdsc->cap_dirty_lock);
8310b0891   Yan, Zheng   ceph: track pendi...
1589

e4500b5e3   Yan, Zheng   ceph: use list in...
1590
1591
  		list_for_each_entry(cf, &to_remove, i_list)
  			list_del(&cf->g_list);
8310b0891   Yan, Zheng   ceph: track pendi...
1592

6c99f2545   Sage Weil   ceph: throw out d...
1593
  		if (!list_empty(&ci->i_dirty_item)) {
3e0708b99   Yan, Zheng   ceph: ratelimit w...
1594
1595
1596
  			pr_warn_ratelimited(
  				" dropping dirty %s state for %p %lld
  ",
6c99f2545   Sage Weil   ceph: throw out d...
1597
1598
1599
1600
  				ceph_cap_string(ci->i_dirty_caps),
  				inode, ceph_ino(inode));
  			ci->i_dirty_caps = 0;
  			list_del_init(&ci->i_dirty_item);
f4b978662   Yan, Zheng   ceph: track and r...
1601
  			dirty_dropped = true;
6c99f2545   Sage Weil   ceph: throw out d...
1602
1603
  		}
  		if (!list_empty(&ci->i_flushing_item)) {
3e0708b99   Yan, Zheng   ceph: ratelimit w...
1604
1605
1606
  			pr_warn_ratelimited(
  				" dropping dirty+flushing %s state for %p %lld
  ",
6c99f2545   Sage Weil   ceph: throw out d...
1607
1608
1609
1610
1611
  				ceph_cap_string(ci->i_flushing_caps),
  				inode, ceph_ino(inode));
  			ci->i_flushing_caps = 0;
  			list_del_init(&ci->i_flushing_item);
  			mdsc->num_cap_flushing--;
f4b978662   Yan, Zheng   ceph: track and r...
1612
  			dirty_dropped = true;
6c99f2545   Sage Weil   ceph: throw out d...
1613
  		}
6c99f2545   Sage Weil   ceph: throw out d...
1614
  		spin_unlock(&mdsc->cap_dirty_lock);
553adfd94   Yan, Zheng   ceph: track pendi...
1615

f4b978662   Yan, Zheng   ceph: track and r...
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
  		if (dirty_dropped) {
  			errseq_set(&ci->i_meta_err, -EIO);
  
  			if (ci->i_wrbuffer_ref_head == 0 &&
  			    ci->i_wr_ref == 0 &&
  			    ci->i_dirty_caps == 0 &&
  			    ci->i_flushing_caps == 0) {
  				ceph_put_snap_context(ci->i_head_snapc);
  				ci->i_head_snapc = NULL;
  			}
  		}
b3f8d68f3   Yan, Zheng   ceph: handle 'ses...
1627
1628
1629
1630
1631
1632
1633
  		if (atomic_read(&ci->i_filelock_ref) > 0) {
  			/* make further file lock syscall return -EIO */
  			ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK;
  			pr_warn_ratelimited(" dropping file locks for %p %lld
  ",
  					    inode, ceph_ino(inode));
  		}
f66fd9f09   Yan, Zheng   ceph: pre-allocat...
1634
  		if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
e4500b5e3   Yan, Zheng   ceph: use list in...
1635
  			list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
f66fd9f09   Yan, Zheng   ceph: pre-allocat...
1636
1637
  			ci->i_prealloc_cap_flush = NULL;
  		}
6c99f2545   Sage Weil   ceph: throw out d...
1638
  	}
be655596b   Sage Weil   ceph: use i_ceph_...
1639
  	spin_unlock(&ci->i_ceph_lock);
553adfd94   Yan, Zheng   ceph: track pendi...
1640
1641
1642
  	while (!list_empty(&to_remove)) {
  		struct ceph_cap_flush *cf;
  		cf = list_first_entry(&to_remove,
e4500b5e3   Yan, Zheng   ceph: use list in...
1643
1644
  				      struct ceph_cap_flush, i_list);
  		list_del(&cf->i_list);
f66fd9f09   Yan, Zheng   ceph: pre-allocat...
1645
  		ceph_free_cap_flush(cf);
553adfd94   Yan, Zheng   ceph: track pendi...
1646
  	}
77310320c   Yan, Zheng   ceph: renew caps ...
1647
1648
  
  	wake_up_all(&ci->i_cap_wq);
6c93df5db   Yan, Zheng   ceph: don't call ...
1649
1650
  	if (invalidate)
  		ceph_queue_invalidate(inode);
f4b978662   Yan, Zheng   ceph: track and r...
1651
  	if (dirty_dropped)
6c99f2545   Sage Weil   ceph: throw out d...
1652
  		iput(inode);
2f2dc0534   Sage Weil   ceph: MDS client
1653
1654
1655
1656
1657
1658
1659
1660
  	return 0;
  }
  
  /*
   * caller must hold session s_mutex
   */
  static void remove_session_caps(struct ceph_mds_session *session)
  {
6c93df5db   Yan, Zheng   ceph: don't call ...
1661
1662
  	struct ceph_fs_client *fsc = session->s_mdsc->fsc;
  	struct super_block *sb = fsc->sb;
c8a96a31c   Jeff Layton   ceph: clean up sp...
1663
  	LIST_HEAD(dispose);
2f2dc0534   Sage Weil   ceph: MDS client
1664
1665
  	dout("remove_session_caps on %p
  ", session);
f5d772690   Jeff Layton   ceph: make iterat...
1666
  	ceph_iterate_session_caps(session, remove_session_caps_cb, fsc);
6f60f8894   Yan, Zheng   ceph: fix freeing...
1667

c8799fc46   Yan, Zheng   ceph: optimize ca...
1668
  	wake_up_all(&fsc->mdsc->cap_flushing_wq);
6f60f8894   Yan, Zheng   ceph: fix freeing...
1669
1670
  	spin_lock(&session->s_cap_lock);
  	if (session->s_nr_caps > 0) {
6f60f8894   Yan, Zheng   ceph: fix freeing...
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
  		struct inode *inode;
  		struct ceph_cap *cap, *prev = NULL;
  		struct ceph_vino vino;
  		/*
  		 * iterate_session_caps() skips inodes that are being
  		 * deleted, we need to wait until deletions are complete.
  		 * __wait_on_freeing_inode() is designed for the job,
  		 * but it is not exported, so use lookup inode function
  		 * to access it.
  		 */
  		while (!list_empty(&session->s_caps)) {
  			cap = list_entry(session->s_caps.next,
  					 struct ceph_cap, session_caps);
  			if (cap == prev)
  				break;
  			prev = cap;
  			vino = cap->ci->i_vino;
  			spin_unlock(&session->s_cap_lock);
ed284c49f   Yan, Zheng   ceph: remove ceph...
1689
  			inode = ceph_find_inode(sb, vino);
3e1d0452e   Yan, Zheng   ceph: avoid iput_...
1690
1691
  			 /* avoid calling iput_final() while holding s_mutex */
  			ceph_async_iput(inode);
6f60f8894   Yan, Zheng   ceph: fix freeing...
1692
1693
1694
1695
  
  			spin_lock(&session->s_cap_lock);
  		}
  	}
745a8e3bc   Yan, Zheng   ceph: don't pre-a...
1696
1697
  
  	// drop cap expires and unlock s_cap_lock
c8a96a31c   Jeff Layton   ceph: clean up sp...
1698
  	detach_cap_releases(session, &dispose);
6f60f8894   Yan, Zheng   ceph: fix freeing...
1699

2f2dc0534   Sage Weil   ceph: MDS client
1700
  	BUG_ON(session->s_nr_caps > 0);
6c99f2545   Sage Weil   ceph: throw out d...
1701
  	BUG_ON(!list_empty(&session->s_cap_flushing));
c8a96a31c   Jeff Layton   ceph: clean up sp...
1702
1703
  	spin_unlock(&session->s_cap_lock);
  	dispose_cap_releases(session->s_mdsc, &dispose);
2f2dc0534   Sage Weil   ceph: MDS client
1704
  }
d2f8bb27c   Yan, Zheng   ceph: update want...
1705
1706
1707
1708
1709
  /* event codes passed (cast through void *) to wake_up_session_cb() */
  enum {
  	RECONNECT = 0,	/* reset the inode's wanted/requested max_size */
  	RENEWCAPS = 1,	/* downgrade caps the mds did not re-issue */
  	FORCE_RO  = 2,	/* only wakes up waiters */
  };
2f2dc0534   Sage Weil   ceph: MDS client
1710
1711
1712
1713
1714
1715
1716
1717
1718
  /*
   * wake up any threads waiting on this session's caps.  if the cap is
   * old (didn't get renewed on the client reconnect), remove it now.
   *
   * caller must hold s_mutex.
   */
  static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
  			      void *arg)
  {
0dc2570fa   Sage Weil   ceph: reset reque...
1719
  	struct ceph_inode_info *ci = ceph_inode(inode);
d2f8bb27c   Yan, Zheng   ceph: update want...
1720
  	unsigned long ev = (unsigned long)arg;
0dc2570fa   Sage Weil   ceph: reset reque...
1721

d2f8bb27c   Yan, Zheng   ceph: update want...
1722
  	if (ev == RECONNECT) {
be655596b   Sage Weil   ceph: use i_ceph_...
1723
  		spin_lock(&ci->i_ceph_lock);
0dc2570fa   Sage Weil   ceph: reset reque...
1724
1725
  		ci->i_wanted_max_size = 0;
  		ci->i_requested_max_size = 0;
be655596b   Sage Weil   ceph: use i_ceph_...
1726
  		spin_unlock(&ci->i_ceph_lock);
d2f8bb27c   Yan, Zheng   ceph: update want...
1727
1728
1729
1730
1731
  	} else if (ev == RENEWCAPS) {
  		if (cap->cap_gen < cap->session->s_cap_gen) {
  			/* mds did not re-issue stale cap */
  			spin_lock(&ci->i_ceph_lock);
  			cap->issued = cap->implemented = CEPH_CAP_PIN;
d2f8bb27c   Yan, Zheng   ceph: update want...
1732
1733
1734
  			spin_unlock(&ci->i_ceph_lock);
  		}
  	} else if (ev == FORCE_RO) {
0dc2570fa   Sage Weil   ceph: reset reque...
1735
  	}
e53603093   Yan, Zheng   ceph: fix wake_up...
1736
  	wake_up_all(&ci->i_cap_wq);
2f2dc0534   Sage Weil   ceph: MDS client
1737
1738
  	return 0;
  }
d2f8bb27c   Yan, Zheng   ceph: update want...
1739
  static void wake_up_session_caps(struct ceph_mds_session *session, int ev)
2f2dc0534   Sage Weil   ceph: MDS client
1740
1741
1742
  {
  	dout("wake_up_session_caps %p mds%d
  ", session, session->s_mds);
f5d772690   Jeff Layton   ceph: make iterat...
1743
1744
  	ceph_iterate_session_caps(session, wake_up_session_cb,
  				  (void *)(unsigned long)ev);
2f2dc0534   Sage Weil   ceph: MDS client
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
  }
  
  /*
   * Send periodic message to MDS renewing all currently held caps.  The
   * ack will reset the expiration for all caps from this session.
   *
   * caller holds s_mutex
   */
  static int send_renew_caps(struct ceph_mds_client *mdsc,
  			   struct ceph_mds_session *session)
  {
  	struct ceph_msg *msg;
  	int state;
  
  	if (time_after_eq(jiffies, session->s_cap_ttl) &&
  	    time_after_eq(session->s_cap_ttl, session->s_renew_requested))
  		pr_info("mds%d caps stale
  ", session->s_mds);
e4cb4cb8a   Sage Weil   ceph: prevent dup...
1763
  	session->s_renew_requested = jiffies;
2f2dc0534   Sage Weil   ceph: MDS client
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
  
  	/* do not try to renew caps until a recovering mds has reconnected
  	 * with its clients. */
  	state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
  	if (state < CEPH_MDS_STATE_RECONNECT) {
  		dout("send_renew_caps ignoring mds%d (%s)
  ",
  		     session->s_mds, ceph_mds_state_name(state));
  		return 0;
  	}
  
  	dout("send_renew_caps to mds%d (%s)
  ", session->s_mds,
  		ceph_mds_state_name(state));
2f2dc0534   Sage Weil   ceph: MDS client
1778
1779
  	msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
  				 ++session->s_renew_seq);
a79832f26   Sage Weil   ceph: make ceph_m...
1780
1781
  	if (!msg)
  		return -ENOMEM;
2f2dc0534   Sage Weil   ceph: MDS client
1782
1783
1784
  	ceph_con_send(&session->s_con, msg);
  	return 0;
  }
186e4f7a4   Yan, Zheng   ceph: handle sess...
1785
1786
1787
1788
1789
1790
1791
  /*
   * Send a CEPH_SESSION_FLUSHMSG_ACK carrying @seq on the session's
   * connection.  Returns 0 on success or -ENOMEM if no message could be
   * allocated.
   */
  static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
  			     struct ceph_mds_session *session, u64 seq)
  {
  	struct ceph_msg *ack;
  
  	dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
  	     session->s_mds, ceph_session_state_name(session->s_state), seq);
  
  	ack = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
  	if (ack) {
  		ceph_con_send(&session->s_con, ack);
  		return 0;
  	}
  	return -ENOMEM;
  }
2f2dc0534   Sage Weil   ceph: MDS client
1799
1800
  /*
   * Note new cap ttl, and any transition from stale -> not stale (fresh?).
0dc2570fa   Sage Weil   ceph: reset reque...
1801
1802
   *
   * Called under session->s_mutex
2f2dc0534   Sage Weil   ceph: MDS client
1803
1804
1805
1806
1807
1808
1809
1810
   */
  static void renewed_caps(struct ceph_mds_client *mdsc,
  			 struct ceph_mds_session *session, int is_renew)
  {
  	int was_stale;
  	int wake = 0;
  
  	spin_lock(&session->s_cap_lock);
1ce208a6c   Alex Elder   ceph: don't reset...
1811
  	was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl);
2f2dc0534   Sage Weil   ceph: MDS client
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
  
  	session->s_cap_ttl = session->s_renew_requested +
  		mdsc->mdsmap->m_session_timeout*HZ;
  
  	if (was_stale) {
  		if (time_before(jiffies, session->s_cap_ttl)) {
  			pr_info("mds%d caps renewed
  ", session->s_mds);
  			wake = 1;
  		} else {
  			pr_info("mds%d caps still stale
  ", session->s_mds);
  		}
  	}
  	dout("renewed_caps mds%d ttl now %lu, was %s, now %s
  ",
  	     session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
  	     time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
  	spin_unlock(&session->s_cap_lock);
  
  	if (wake)
d2f8bb27c   Yan, Zheng   ceph: update want...
1833
  		wake_up_session_caps(session, RENEWCAPS);
2f2dc0534   Sage Weil   ceph: MDS client
1834
1835
1836
1837
1838
  }
  
  /*
   * send a session close request
   */
3e699bd86   Xiubo Li   ceph: add check_s...
1839
  static int request_close_session(struct ceph_mds_session *session)
2f2dc0534   Sage Weil   ceph: MDS client
1840
1841
  {
  	struct ceph_msg *msg;
2f2dc0534   Sage Weil   ceph: MDS client
1842
1843
1844
  
  	dout("request_close_session mds%d state %s seq %lld
  ",
a687ecaf5   John Spray   ceph: export ceph...
1845
  	     session->s_mds, ceph_session_state_name(session->s_state),
2f2dc0534   Sage Weil   ceph: MDS client
1846
1847
  	     session->s_seq);
  	msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
a79832f26   Sage Weil   ceph: make ceph_m...
1848
1849
1850
  	if (!msg)
  		return -ENOMEM;
  	ceph_con_send(&session->s_con, msg);
fcff415c9   Yan, Zheng   ceph: handle CEPH...
1851
  	return 1;
2f2dc0534   Sage Weil   ceph: MDS client
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
  }
  
  /*
   * Called with s_mutex held.
   */
  /*
   * Move the session to CLOSING and send the close request, unless it is
   * already CLOSING or in a later state (then 0 is returned and nothing
   * is sent).  Otherwise returns request_close_session()'s result.
   */
  static int __close_session(struct ceph_mds_client *mdsc,
  			 struct ceph_mds_session *session)
  {
  	if (session->s_state < CEPH_MDS_SESSION_CLOSING) {
  		session->s_state = CEPH_MDS_SESSION_CLOSING;
  		return request_close_session(session);
  	}
  	return 0;
  }
040d78603   Yan, Zheng   ceph: drop negati...
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
  static bool drop_negative_children(struct dentry *dentry)
  {
  	struct dentry *child;
  	bool all_negative = true;
  
  	if (!d_is_dir(dentry))
  		goto out;
  
  	spin_lock(&dentry->d_lock);
  	list_for_each_entry(child, &dentry->d_subdirs, d_child) {
  		if (d_really_is_positive(child)) {
  			all_negative = false;
  			break;
  		}
  	}
  	spin_unlock(&dentry->d_lock);
  
  	if (all_negative)
  		shrink_dcache_parent(dentry);
  out:
  	return all_negative;
  }
2f2dc0534   Sage Weil   ceph: MDS client
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
  /*
   * Trim old(er) caps.
   *
   * Because we can't cache an inode without one or more caps, we do
   * this indirectly: if a cap is unused, we prune its aliases, at which
   * point the inode will hopefully get dropped too.
   *
   * Yes, this is a bit sloppy.  Our only real goal here is to respond to
   * memory pressure from the MDS, though, so it needn't be perfect.
   */
  static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
  {
533a2818d   Jeff Layton   ceph: eliminate s...
1899
  	int *remaining = arg;
2f2dc0534   Sage Weil   ceph: MDS client
1900
  	struct ceph_inode_info *ci = ceph_inode(inode);
979abfdd5   Yan, Zheng   ceph: fix trim caps
1901
  	int used, wanted, oissued, mine;
2f2dc0534   Sage Weil   ceph: MDS client
1902

533a2818d   Jeff Layton   ceph: eliminate s...
1903
  	if (*remaining <= 0)
2f2dc0534   Sage Weil   ceph: MDS client
1904
  		return -1;
be655596b   Sage Weil   ceph: use i_ceph_...
1905
  	spin_lock(&ci->i_ceph_lock);
2f2dc0534   Sage Weil   ceph: MDS client
1906
1907
  	mine = cap->issued | cap->implemented;
  	used = __ceph_caps_used(ci);
979abfdd5   Yan, Zheng   ceph: fix trim caps
1908
  	wanted = __ceph_caps_file_wanted(ci);
2f2dc0534   Sage Weil   ceph: MDS client
1909
  	oissued = __ceph_caps_issued_other(ci, cap);
979abfdd5   Yan, Zheng   ceph: fix trim caps
1910
1911
  	dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s
  ",
2f2dc0534   Sage Weil   ceph: MDS client
1912
  	     inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
979abfdd5   Yan, Zheng   ceph: fix trim caps
1913
1914
  	     ceph_cap_string(used), ceph_cap_string(wanted));
  	if (cap == ci->i_auth_cap) {
622f3e250   Yan, Zheng   ceph: don't trim ...
1915
1916
  		if (ci->i_dirty_caps || ci->i_flushing_caps ||
  		    !list_empty(&ci->i_cap_snaps))
979abfdd5   Yan, Zheng   ceph: fix trim caps
1917
1918
1919
  			goto out;
  		if ((used | wanted) & CEPH_CAP_ANY_WR)
  			goto out;
89aa59301   Yan, Zheng   ceph: keep auth c...
1920
1921
1922
1923
1924
  		/* Note: it's possible that i_filelock_ref becomes non-zero
  		 * after dropping auth caps. It doesn't hurt because reply
  		 * of lock mds request will re-add auth caps. */
  		if (atomic_read(&ci->i_filelock_ref) > 0)
  			goto out;
979abfdd5   Yan, Zheng   ceph: fix trim caps
1925
  	}
5e804ac48   Yan, Zheng   ceph: don't inval...
1926
1927
  	/* The inode has cached pages, but it's no longer used.
  	 * we can safely drop it */
525d15e8e   Yan, Zheng   ceph: check inode...
1928
1929
  	if (S_ISREG(inode->i_mode) &&
  	    wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
5e804ac48   Yan, Zheng   ceph: don't inval...
1930
1931
1932
1933
  	    !(oissued & CEPH_CAP_FILE_CACHE)) {
  	  used = 0;
  	  oissued = 0;
  	}
979abfdd5   Yan, Zheng   ceph: fix trim caps
1934
  	if ((used | wanted) & ~oissued & mine)
2f2dc0534   Sage Weil   ceph: MDS client
1935
  		goto out;   /* we need these caps */
2f2dc0534   Sage Weil   ceph: MDS client
1936
1937
  	if (oissued) {
  		/* we aren't the only cap.. just remove us */
a096b09ae   Yan, Zheng   ceph: queue cap r...
1938
  		__ceph_remove_cap(cap, true);
533a2818d   Jeff Layton   ceph: eliminate s...
1939
  		(*remaining)--;
2f2dc0534   Sage Weil   ceph: MDS client
1940
  	} else {
040d78603   Yan, Zheng   ceph: drop negati...
1941
  		struct dentry *dentry;
5e804ac48   Yan, Zheng   ceph: don't inval...
1942
  		/* try dropping referring dentries */
be655596b   Sage Weil   ceph: use i_ceph_...
1943
  		spin_unlock(&ci->i_ceph_lock);
040d78603   Yan, Zheng   ceph: drop negati...
1944
1945
1946
1947
1948
1949
1950
  		dentry = d_find_any_alias(inode);
  		if (dentry && drop_negative_children(dentry)) {
  			int count;
  			dput(dentry);
  			d_prune_aliases(inode);
  			count = atomic_read(&inode->i_count);
  			if (count == 1)
533a2818d   Jeff Layton   ceph: eliminate s...
1951
  				(*remaining)--;
040d78603   Yan, Zheng   ceph: drop negati...
1952
1953
1954
1955
1956
1957
  			dout("trim_caps_cb %p cap %p pruned, count now %d
  ",
  			     inode, cap, count);
  		} else {
  			dput(dentry);
  		}
2f2dc0534   Sage Weil   ceph: MDS client
1958
1959
1960
1961
  		return 0;
  	}
  
  out:
be655596b   Sage Weil   ceph: use i_ceph_...
1962
  	spin_unlock(&ci->i_ceph_lock);
2f2dc0534   Sage Weil   ceph: MDS client
1963
1964
1965
1966
1967
1968
  	return 0;
  }
  
  /*
   * Trim session cap count down to some max number.
   */
e30ee5812   Zhi Zhang   ceph: try to allo...
1969
1970
1971
  int ceph_trim_caps(struct ceph_mds_client *mdsc,
  		   struct ceph_mds_session *session,
  		   int max_caps)
2f2dc0534   Sage Weil   ceph: MDS client
1972
1973
1974
1975
1976
1977
1978
  {
  	int trim_caps = session->s_nr_caps - max_caps;
  
  	dout("trim_caps mds%d start: %d / %d, trim %d
  ",
  	     session->s_mds, session->s_nr_caps, max_caps, trim_caps);
  	if (trim_caps > 0) {
533a2818d   Jeff Layton   ceph: eliminate s...
1979
1980
1981
  		int remaining = trim_caps;
  
  		ceph_iterate_session_caps(session, trim_caps_cb, &remaining);
2f2dc0534   Sage Weil   ceph: MDS client
1982
1983
1984
  		dout("trim_caps mds%d done: %d / %d, trimmed %d
  ",
  		     session->s_mds, session->s_nr_caps, max_caps,
533a2818d   Jeff Layton   ceph: eliminate s...
1985
  			trim_caps - remaining);
2f2dc0534   Sage Weil   ceph: MDS client
1986
  	}
a56371d9d   Yan, Zheng   ceph: flush cap r...
1987

e3ec8d689   Yan, Zheng   ceph: send cap re...
1988
  	ceph_flush_cap_releases(mdsc, session);
2f2dc0534   Sage Weil   ceph: MDS client
1989
1990
  	return 0;
  }
8310b0891   Yan, Zheng   ceph: track pendi...
1991
1992
1993
  /*
   * Returns 0 while the first entry of mdsc->cap_flush_list has a tid at or
   * below @want_flush_tid, 1 otherwise (including when the list is empty).
   */
  static int check_caps_flush(struct ceph_mds_client *mdsc,
  			    u64 want_flush_tid)
  {
  	struct ceph_cap_flush *first;
  	int done = 1;
  
  	spin_lock(&mdsc->cap_dirty_lock);
  	first = list_first_entry_or_null(&mdsc->cap_flush_list,
  					 struct ceph_cap_flush, g_list);
  	if (first && first->tid <= want_flush_tid) {
  		dout("check_caps_flush still flushing tid "
  		     "%llu <= %llu\n", first->tid, want_flush_tid);
  		done = 0;
  	}
  	spin_unlock(&mdsc->cap_dirty_lock);
  
  	return done;
  }
2f2dc0534   Sage Weil   ceph: MDS client
2011
2012
2013
  /*
   * flush all dirty inode data to disk.
   *
8310b0891   Yan, Zheng   ceph: track pendi...
2014
   * returns true if we've flushed through want_flush_tid
2f2dc0534   Sage Weil   ceph: MDS client
2015
   */
affbc19a6   Yan, Zheng   ceph: make sure s...
2016
  static void wait_caps_flush(struct ceph_mds_client *mdsc,
0e2943878   Yan, Zheng   ceph: unify cap f...
2017
  			    u64 want_flush_tid)
2f2dc0534   Sage Weil   ceph: MDS client
2018
  {
0e2943878   Yan, Zheng   ceph: unify cap f...
2019
2020
  	dout("check_caps_flush want %llu
  ", want_flush_tid);
8310b0891   Yan, Zheng   ceph: track pendi...
2021
2022
2023
2024
2025
2026
  
  	wait_event(mdsc->cap_flushing_wq,
  		   check_caps_flush(mdsc, want_flush_tid));
  
  	dout("check_caps_flush ok, flushed thru %llu
  ", want_flush_tid);
2f2dc0534   Sage Weil   ceph: MDS client
2027
2028
2029
2030
2031
  }
  
  /*
   * called under s_mutex
   */
e3ec8d689   Yan, Zheng   ceph: send cap re...
2032
2033
  static void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
  				   struct ceph_mds_session *session)
2f2dc0534   Sage Weil   ceph: MDS client
2034
  {
745a8e3bc   Yan, Zheng   ceph: don't pre-a...
2035
2036
2037
  	struct ceph_msg *msg = NULL;
  	struct ceph_mds_cap_release *head;
  	struct ceph_mds_cap_item *item;
92475f05b   Jeff Layton   ceph: handle epoc...
2038
  	struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
745a8e3bc   Yan, Zheng   ceph: don't pre-a...
2039
2040
2041
  	struct ceph_cap *cap;
  	LIST_HEAD(tmp_list);
  	int num_cap_releases;
92475f05b   Jeff Layton   ceph: handle epoc...
2042
2043
2044
2045
2046
  	__le32	barrier, *cap_barrier;
  
  	down_read(&osdc->lock);
  	barrier = cpu_to_le32(osdc->epoch_barrier);
  	up_read(&osdc->lock);
2f2dc0534   Sage Weil   ceph: MDS client
2047

0f8605f2b   Sage Weil   ceph: clean up ca...
2048
  	spin_lock(&session->s_cap_lock);
745a8e3bc   Yan, Zheng   ceph: don't pre-a...
2049
2050
2051
2052
  again:
  	list_splice_init(&session->s_cap_releases, &tmp_list);
  	num_cap_releases = session->s_num_cap_releases;
  	session->s_num_cap_releases = 0;
2f2dc0534   Sage Weil   ceph: MDS client
2053
  	spin_unlock(&session->s_cap_lock);
e01a59464   Sage Weil   ceph: dicard cap ...
2054

745a8e3bc   Yan, Zheng   ceph: don't pre-a...
2055
2056
2057
  	while (!list_empty(&tmp_list)) {
  		if (!msg) {
  			msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE,
09cbfeaf1   Kirill A. Shutemov   mm, fs: get rid o...
2058
  					PAGE_SIZE, GFP_NOFS, false);
745a8e3bc   Yan, Zheng   ceph: don't pre-a...
2059
2060
2061
2062
2063
  			if (!msg)
  				goto out_err;
  			head = msg->front.iov_base;
  			head->num = cpu_to_le32(0);
  			msg->front.iov_len = sizeof(*head);
92475f05b   Jeff Layton   ceph: handle epoc...
2064
2065
2066
  
  			msg->hdr.version = cpu_to_le16(2);
  			msg->hdr.compat_version = cpu_to_le16(1);
745a8e3bc   Yan, Zheng   ceph: don't pre-a...
2067
  		}
92475f05b   Jeff Layton   ceph: handle epoc...
2068

745a8e3bc   Yan, Zheng   ceph: don't pre-a...
2069
2070
2071
2072
  		cap = list_first_entry(&tmp_list, struct ceph_cap,
  					session_caps);
  		list_del(&cap->session_caps);
  		num_cap_releases--;
e01a59464   Sage Weil   ceph: dicard cap ...
2073

00bd8edb8   Yan, Zheng   ceph: fix null po...
2074
  		head = msg->front.iov_base;
4198aba4f   Jeff Layton   ceph: fix unalign...
2075
2076
  		put_unaligned_le32(get_unaligned_le32(&head->num) + 1,
  				   &head->num);
745a8e3bc   Yan, Zheng   ceph: don't pre-a...
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
  		item = msg->front.iov_base + msg->front.iov_len;
  		item->ino = cpu_to_le64(cap->cap_ino);
  		item->cap_id = cpu_to_le64(cap->cap_id);
  		item->migrate_seq = cpu_to_le32(cap->mseq);
  		item->seq = cpu_to_le32(cap->issue_seq);
  		msg->front.iov_len += sizeof(*item);
  
  		ceph_put_cap(mdsc, cap);
  
  		if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
92475f05b   Jeff Layton   ceph: handle epoc...
2087
2088
2089
2090
  			// Append cap_barrier field
  			cap_barrier = msg->front.iov_base + msg->front.iov_len;
  			*cap_barrier = barrier;
  			msg->front.iov_len += sizeof(*cap_barrier);
745a8e3bc   Yan, Zheng   ceph: don't pre-a...
2091
2092
2093
2094
2095
2096
  			msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
  			dout("send_cap_releases mds%d %p
  ", session->s_mds, msg);
  			ceph_con_send(&session->s_con, msg);
  			msg = NULL;
  		}
00bd8edb8   Yan, Zheng   ceph: fix null po...
2097
  	}
e01a59464   Sage Weil   ceph: dicard cap ...
2098

745a8e3bc   Yan, Zheng   ceph: don't pre-a...
2099
  	BUG_ON(num_cap_releases != 0);
e01a59464   Sage Weil   ceph: dicard cap ...
2100

745a8e3bc   Yan, Zheng   ceph: don't pre-a...
2101
2102
2103
2104
2105
2106
  	spin_lock(&session->s_cap_lock);
  	if (!list_empty(&session->s_cap_releases))
  		goto again;
  	spin_unlock(&session->s_cap_lock);
  
  	if (msg) {
92475f05b   Jeff Layton   ceph: handle epoc...
2107
2108
2109
2110
  		// Append cap_barrier field
  		cap_barrier = msg->front.iov_base + msg->front.iov_len;
  		*cap_barrier = barrier;
  		msg->front.iov_len += sizeof(*cap_barrier);
745a8e3bc   Yan, Zheng   ceph: don't pre-a...
2111
2112
2113
2114
  		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
  		dout("send_cap_releases mds%d %p
  ", session->s_mds, msg);
  		ceph_con_send(&session->s_con, msg);
e01a59464   Sage Weil   ceph: dicard cap ...
2115
  	}
745a8e3bc   Yan, Zheng   ceph: don't pre-a...
2116
2117
2118
2119
2120
2121
2122
2123
2124
  	return;
  out_err:
  	pr_err("send_cap_releases mds%d, failed to allocate message
  ",
  		session->s_mds);
  	spin_lock(&session->s_cap_lock);
  	list_splice(&tmp_list, &session->s_cap_releases);
  	session->s_num_cap_releases += num_cap_releases;
  	spin_unlock(&session->s_cap_lock);
e01a59464   Sage Weil   ceph: dicard cap ...
2125
  }
e3ec8d689   Yan, Zheng   ceph: send cap re...
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
  /*
   * Work item behind session->s_cap_release_work.
   *
   * Sends the session's queued cap releases, under s_mutex, when the
   * session is in the OPEN or HUNG state.  In every case it then drops
   * one session reference with ceph_put_mds_session().
   */
  static void ceph_cap_release_work(struct work_struct *work)
  {
  	struct ceph_mds_session *s =
  		container_of(work, struct ceph_mds_session, s_cap_release_work);
  
  	mutex_lock(&s->s_mutex);
  	switch (s->s_state) {
  	case CEPH_MDS_SESSION_OPEN:
  	case CEPH_MDS_SESSION_HUNG:
  		ceph_send_cap_releases(s->s_mdsc, s);
  		break;
  	default:
  		break;
  	}
  	mutex_unlock(&s->s_mutex);
  
  	ceph_put_mds_session(s);
  }
  
  void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
  		             struct ceph_mds_session *session)
  {
  	if (mdsc->stopping)
  		return;
5b3248c67   Xiubo Li   ceph: rename get_...
2144
  	ceph_get_mds_session(session);
e3ec8d689   Yan, Zheng   ceph: send cap re...
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
  	if (queue_work(mdsc->fsc->cap_wq,
  		       &session->s_cap_release_work)) {
  		dout("cap release work queued
  ");
  	} else {
  		ceph_put_mds_session(session);
  		dout("failed to queue cap release work
  ");
  	}
  }
  
  /*
   * Append a cap to the session's pending release list and bump the
   * pending counter.  Each time the counter reaches a multiple of
   * CEPH_CAPS_PER_RELEASE, ceph_flush_cap_releases() is called.
   *
   * caller holds session->s_cap_lock
   */
  void __ceph_queue_cap_release(struct ceph_mds_session *session,
  			      struct ceph_cap *cap)
  {
  	list_add_tail(&cap->session_caps, &session->s_cap_releases);
  	session->s_num_cap_releases++;
  
  	if (session->s_num_cap_releases % CEPH_CAPS_PER_RELEASE)
  		return;
  
  	ceph_flush_cap_releases(session->s_mdsc, session);
  }
37c4efc1d   Yan, Zheng   ceph: periodicall...
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
  /*
   * Work item behind mdsc->cap_reclaim_work: trim dentries and, if
   * ceph_trim_dentries() returns -EAGAIN, queue the work again.
   */
  static void ceph_cap_reclaim_work(struct work_struct *work)
  {
  	struct ceph_mds_client *mdsc =
  		container_of(work, struct ceph_mds_client, cap_reclaim_work);
  
  	if (ceph_trim_dentries(mdsc) == -EAGAIN)
  		ceph_queue_cap_reclaim_work(mdsc);
  }
  
  /*
   * Queue mdsc->cap_reclaim_work on the fs client's cap_wq.
   *
   * Does nothing once mdsc->stopping is set.  The outcome of queue_work()
   * is only logged; a false return is reported as a failure to queue.
   */
  void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc)
  {
  	if (mdsc->stopping)
  		return;
  
  	if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_reclaim_work)) {
  		dout("caps reclaim work queued\n");
  	} else {
  		dout("failed to queue caps reclaim work\n");
  	}
  }
fe33032da   Yan, Zheng   ceph: add mount o...
2190
2191
2192
2193
2194
2195
  void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr)
  {
  	int val;
  	if (!nr)
  		return;
  	val = atomic_add_return(nr, &mdsc->cap_reclaim_pending);
bba1560bd   Xiubo Li   ceph: trigger the...
2196
  	if ((val % CEPH_CAPS_PER_RELEASE) < nr) {
fe33032da   Yan, Zheng   ceph: add mount o...
2197
2198
2199
2200
  		atomic_set(&mdsc->cap_reclaim_pending, 0);
  		ceph_queue_cap_reclaim_work(mdsc);
  	}
  }
2f2dc0534   Sage Weil   ceph: MDS client
2201
2202
2203
  /*
   * requests
   */
54008399d   Yan, Zheng   ceph: preallocate...
2204
2205
2206
2207
2208
2209
  int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
  				    struct inode *dir)
  {
  	struct ceph_inode_info *ci = ceph_inode(dir);
  	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
  	struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
2a5beea3f   Yan, Zheng   ceph: define stru...
2210
  	size_t size = sizeof(struct ceph_mds_reply_dir_entry);
ad8c28a9e   Jeff Layton   ceph: convert int...
2211
2212
  	unsigned int num_entries;
  	int order;
54008399d   Yan, Zheng   ceph: preallocate...
2213
2214
2215
2216
  
  	spin_lock(&ci->i_ceph_lock);
  	num_entries = ci->i_files + ci->i_subdirs;
  	spin_unlock(&ci->i_ceph_lock);
ad8c28a9e   Jeff Layton   ceph: convert int...
2217
  	num_entries = max(num_entries, 1U);
54008399d   Yan, Zheng   ceph: preallocate...
2218
2219
2220
2221
  	num_entries = min(num_entries, opt->max_readdir);
  
  	order = get_order(size * num_entries);
  	while (order >= 0) {
2a5beea3f   Yan, Zheng   ceph: define stru...
2222
2223
2224
2225
  		rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
  							     __GFP_NOWARN,
  							     order);
  		if (rinfo->dir_entries)
54008399d   Yan, Zheng   ceph: preallocate...
2226
2227
2228
  			break;
  		order--;
  	}
2a5beea3f   Yan, Zheng   ceph: define stru...
2229
  	if (!rinfo->dir_entries)
54008399d   Yan, Zheng   ceph: preallocate...
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
  		return -ENOMEM;
  
  	num_entries = (PAGE_SIZE << order) / size;
  	num_entries = min(num_entries, opt->max_readdir);
  
  	rinfo->dir_buf_size = PAGE_SIZE << order;
  	req->r_num_caps = num_entries + 1;
  	req->r_args.readdir.max_entries = cpu_to_le32(num_entries);
  	req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes);
  	return 0;
  }
2f2dc0534   Sage Weil   ceph: MDS client
2241
2242
2243
2244
2245
2246
  /*
   * Create an mds request.
   */
  struct ceph_mds_request *
  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
  {
058daab79   Jeff Layton   ceph: move to a d...
2247
  	struct ceph_mds_request *req;
2f2dc0534   Sage Weil   ceph: MDS client
2248

058daab79   Jeff Layton   ceph: move to a d...
2249
  	req = kmem_cache_zalloc(ceph_mds_request_cachep, GFP_NOFS);
2f2dc0534   Sage Weil   ceph: MDS client
2250
2251
  	if (!req)
  		return ERR_PTR(-ENOMEM);
b4556396f   Sage Weil   ceph: fix race be...
2252
  	mutex_init(&req->r_fill_mutex);
37151668b   Yehuda Sadeh   ceph: do caps acc...
2253
  	req->r_mdsc = mdsc;
2f2dc0534   Sage Weil   ceph: MDS client
2254
  	req->r_started = jiffies;
70c948206   Xiubo Li   ceph: add metadat...
2255
  	req->r_start_latency = ktime_get();
2f2dc0534   Sage Weil   ceph: MDS client
2256
2257
  	req->r_resend_mds = -1;
  	INIT_LIST_HEAD(&req->r_unsafe_dir_item);
68cd5b4b7   Yan, Zheng   ceph: make fsync(...
2258
  	INIT_LIST_HEAD(&req->r_unsafe_target_item);
2f2dc0534   Sage Weil   ceph: MDS client
2259
  	req->r_fmode = -1;
153c8e6bf   Sage Weil   ceph: use kref fo...
2260
  	kref_init(&req->r_kref);
fcd00b68b   Ilya Dryomov   libceph: DEFINE_R...
2261
  	RB_CLEAR_NODE(&req->r_node);
2f2dc0534   Sage Weil   ceph: MDS client
2262
2263
2264
2265
  	INIT_LIST_HEAD(&req->r_wait);
  	init_completion(&req->r_completion);
  	init_completion(&req->r_safe_completion);
  	INIT_LIST_HEAD(&req->r_unsafe_item);
668c9a61e   Deepa Dinamani   fs: ceph: Delete ...
2266
  	ktime_get_coarse_real_ts64(&req->r_stamp);
b8e69066d   Sage Weil   ceph: include tim...
2267

2f2dc0534   Sage Weil   ceph: MDS client
2268
2269
2270
2271
2272
2273
  	req->r_op = op;
  	req->r_direct_mode = mode;
  	return req;
  }
  
  /*
44ca18f26   Sage Weil   ceph: use rbtree ...
2274
   * return oldest (lowest) request, tid in request tree, 0 if none.
2f2dc0534   Sage Weil   ceph: MDS client
2275
2276
2277
   *
   * called under mdsc->mutex.
   */
44ca18f26   Sage Weil   ceph: use rbtree ...
2278
2279
2280
2281
2282
2283
2284
  static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
  {
  	if (RB_EMPTY_ROOT(&mdsc->request_tree))
  		return NULL;
  	return rb_entry(rb_first(&mdsc->request_tree),
  			struct ceph_mds_request, r_node);
  }
e8a7b8b12   Yan, Zheng   ceph: exclude set...
2285
  static inline  u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
2f2dc0534   Sage Weil   ceph: MDS client
2286
  {
e8a7b8b12   Yan, Zheng   ceph: exclude set...
2287
  	return mdsc->oldest_tid;
2f2dc0534   Sage Weil   ceph: MDS client
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
  }
  
  /*
   * Build a dentry's path.  Allocate on heap; caller must kfree.  Based
   * on build_path_from_dentry in fs/cifs/dir.c.
   *
   * If @stop_on_nosnap, generate path relative to the first non-snapped
   * inode.
   *
   * Encode hidden .snap dirs as a double /, i.e.
   *   foo/.snap/bar -> foo//bar
   */
69a10fb3f   Jeff Layton   ceph: fix potenti...
2300
  char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *pbase,
2f2dc0534   Sage Weil   ceph: MDS client
2301
2302
2303
2304
  			   int stop_on_nosnap)
  {
  	struct dentry *temp;
  	char *path;
f77f21bb2   Jeff Layton   ceph: use __getna...
2305
  	int pos;
1b71fe2ef   Al Viro   ceph analog of ci...
2306
  	unsigned seq;
69a10fb3f   Jeff Layton   ceph: fix potenti...
2307
  	u64 base;
2f2dc0534   Sage Weil   ceph: MDS client
2308

d37b1d994   Markus Elfring   ceph: adjust 36 c...
2309
  	if (!dentry)
2f2dc0534   Sage Weil   ceph: MDS client
2310
  		return ERR_PTR(-EINVAL);
f77f21bb2   Jeff Layton   ceph: use __getna...
2311
  	path = __getname();
d37b1d994   Markus Elfring   ceph: adjust 36 c...
2312
  	if (!path)
2f2dc0534   Sage Weil   ceph: MDS client
2313
  		return ERR_PTR(-ENOMEM);
f77f21bb2   Jeff Layton   ceph: use __getna...
2314
2315
2316
2317
2318
  retry:
  	pos = PATH_MAX - 1;
  	path[pos] = '\0';
  
  	seq = read_seqbegin(&rename_lock);
1b71fe2ef   Al Viro   ceph analog of ci...
2319
  	rcu_read_lock();
f77f21bb2   Jeff Layton   ceph: use __getna...
2320
2321
  	temp = dentry;
  	for (;;) {
1b71fe2ef   Al Viro   ceph analog of ci...
2322
  		struct inode *inode;
2f2dc0534   Sage Weil   ceph: MDS client
2323

1b71fe2ef   Al Viro   ceph analog of ci...
2324
  		spin_lock(&temp->d_lock);
2b0143b5c   David Howells   VFS: normal files...
2325
  		inode = d_inode(temp);
2f2dc0534   Sage Weil   ceph: MDS client
2326
  		if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
104648ad3   Sage Weil   ceph: reduce buil...
2327
2328
  			dout("build_path path+%d: %p SNAPDIR
  ",
2f2dc0534   Sage Weil   ceph: MDS client
2329
  			     pos, temp);
d6b8bd679   Jeff Layton   ceph: fix ceph_md...
2330
  		} else if (stop_on_nosnap && inode && dentry != temp &&
2f2dc0534   Sage Weil   ceph: MDS client
2331
  			   ceph_snap(inode) == CEPH_NOSNAP) {
9d5a09e65   Yehuda Sadeh   ceph: add missing...
2332
  			spin_unlock(&temp->d_lock);
d6b8bd679   Jeff Layton   ceph: fix ceph_md...
2333
  			pos++; /* get rid of any prepended '/' */
2f2dc0534   Sage Weil   ceph: MDS client
2334
2335
2336
  			break;
  		} else {
  			pos -= temp->d_name.len;
1b71fe2ef   Al Viro   ceph analog of ci...
2337
2338
  			if (pos < 0) {
  				spin_unlock(&temp->d_lock);
2f2dc0534   Sage Weil   ceph: MDS client
2339
  				break;
1b71fe2ef   Al Viro   ceph analog of ci...
2340
  			}
f77f21bb2   Jeff Layton   ceph: use __getna...
2341
  			memcpy(path + pos, temp->d_name.name, temp->d_name.len);
2f2dc0534   Sage Weil   ceph: MDS client
2342
  		}
1b71fe2ef   Al Viro   ceph analog of ci...
2343
  		spin_unlock(&temp->d_lock);
41883ba8e   Yan, Zheng   ceph: use READ_ON...
2344
  		temp = READ_ONCE(temp->d_parent);
f77f21bb2   Jeff Layton   ceph: use __getna...
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
  
  		/* Are we at the root? */
  		if (IS_ROOT(temp))
  			break;
  
  		/* Are we out of buffer? */
  		if (--pos < 0)
  			break;
  
  		path[pos] = '/';
2f2dc0534   Sage Weil   ceph: MDS client
2355
  	}
69a10fb3f   Jeff Layton   ceph: fix potenti...
2356
  	base = ceph_ino(d_inode(temp));
1b71fe2ef   Al Viro   ceph analog of ci...
2357
  	rcu_read_unlock();
f5946bcc5   Jeff Layton   ceph: tone down l...
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
  
  	if (read_seqretry(&rename_lock, seq))
  		goto retry;
  
  	if (pos < 0) {
  		/*
  		 * A rename didn't occur, but somehow we didn't end up where
  		 * we thought we would. Throw a warning and try again.
  		 */
  		pr_warn("build_path did not end path lookup where "
  			"expected, pos is %d
  ", pos);
2f2dc0534   Sage Weil   ceph: MDS client
2370
2371
  		goto retry;
  	}
69a10fb3f   Jeff Layton   ceph: fix potenti...
2372
  	*pbase = base;
f77f21bb2   Jeff Layton   ceph: use __getna...
2373
  	*plen = PATH_MAX - 1 - pos;
104648ad3   Sage Weil   ceph: reduce buil...
2374
2375
  	dout("build_path on %p %d built %llx '%.*s'
  ",
f77f21bb2   Jeff Layton   ceph: use __getna...
2376
2377
  	     dentry, d_count(dentry), base, *plen, path + pos);
  	return path + pos;
2f2dc0534   Sage Weil   ceph: MDS client
2378
  }
fd36a7176   Jeff Layton   ceph: pass parent...
2379
  /*
   * Describe a dentry as an (ino, path) pair for an MDS request.
   *
   * @dir is the parent inode; when NULL it is looked up from
   * dentry->d_parent under RCU.  If the parent is known, the caller says
   * it is locked (@parent_locked) and it is a CEPH_NOSNAP inode, the pair
   * is simply the parent's ino plus the dentry's own name, and nothing is
   * allocated.  Otherwise the path is built with ceph_mdsc_build_path()
   * and *pfreepath is set so the caller knows to free it.
   *
   * Returns 0 on success or the error from ceph_mdsc_build_path().
   */
  static int build_dentry_path(struct dentry *dentry, struct inode *dir,
  			     const char **ppath, int *ppathlen, u64 *pino,
  			     bool *pfreepath, bool parent_locked)
  {
  	bool name_only;
  	char *built;
  
  	rcu_read_lock();
  	if (!dir)
  		dir = d_inode_rcu(dentry->d_parent);
  	name_only = dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP;
  	if (name_only)
  		*pino = ceph_ino(dir);
  	rcu_read_unlock();
  
  	if (name_only) {
  		*ppath = dentry->d_name.name;
  		*ppathlen = dentry->d_name.len;
  		return 0;
  	}
  
  	built = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
  	if (IS_ERR(built))
  		return PTR_ERR(built);
  
  	*ppath = built;
  	*pfreepath = true;
  	return 0;
  }
  
  static int build_inode_path(struct inode *inode,
  			    const char **ppath, int *ppathlen, u64 *pino,
1bcb34408   Jeff Layton   ceph: only use d_...
2405
  			    bool *pfreepath)
2f2dc0534   Sage Weil   ceph: MDS client
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
  {
  	struct dentry *dentry;
  	char *path;
  
  	if (ceph_snap(inode) == CEPH_NOSNAP) {
  		*pino = ceph_ino(inode);
  		*ppathlen = 0;
  		return 0;
  	}
  	dentry = d_find_alias(inode);
  	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
  	dput(dentry);
  	if (IS_ERR(path))
  		return PTR_ERR(path);
  	*ppath = path;
1bcb34408   Jeff Layton   ceph: only use d_...
2421
  	*pfreepath = true;
2f2dc0534   Sage Weil   ceph: MDS client
2422
2423
2424
2425
2426
2427
2428
2429
  	return 0;
  }
  
  /*
   * Fill in one (ino, path) argument of an MDS request.
   *
   * request arguments may be specified via an inode *, a dentry *, or
   * an explicit ino+path; the first of those that is set wins, in that
   * order.  If none is set the outputs are left untouched.
   *
   * Returns 0, or the error from the inode/dentry path builder.
   */
  static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
  				  struct inode *rdiri, const char *rpath,
  				  u64 rino, const char **ppath, int *pathlen,
  				  u64 *ino, bool *freepath, bool parent_locked)
  {
  	int err;
  
  	if (rinode) {
  		err = build_inode_path(rinode, ppath, pathlen, ino, freepath);
  		dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
  		     ceph_snap(rinode));
  		return err;
  	}
  
  	if (rdentry) {
  		err = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
  					freepath, parent_locked);
  		dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
  		     *ppath);
  		return err;
  	}
  
  	if (rpath || rino) {
  		*ino = rino;
  		*ppath = rpath;
  		*pathlen = rpath ? strlen(rpath) : 0;
  		dout(" path %.*s\n", *pathlen, rpath);
  	}
  
  	return 0;
  }
  
  /*
   * Build the CEPH_MSG_CLIENT_REQUEST message for a request.
   *
   * The front holds, in order: the request head, the two encoded
   * filepaths, the cap/dentry releases and a time stamp.  The offset at
   * which the releases start is saved in req->r_request_release_offset.
   * With @drop_cap_releases the encoded releases are discarded again and
   * the time stamp is written at that offset instead.
   *
   * Returns the message, or an ERR_PTR: the error from path setup,
   * -ENOMEM if the message cannot be allocated, or -ERANGE if encoding
   * ran past the end of the front buffer.
   *
   * called under mdsc->mutex
   */
  static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
  					       struct ceph_mds_request *req,
  					       int mds, bool drop_cap_releases)
  {
  	struct ceph_mds_request_head *head;
  	struct ceph_timespec ts;
  	struct ceph_msg *msg;
  	const char *path1 = NULL, *path2 = NULL;
  	bool freepath1 = false, freepath2 = false;
  	int pathlen1 = 0, pathlen2 = 0;
  	u64 ino1 = 0, ino2 = 0;
  	void *front, *p, *end;
  	int release_slots;
  	u16 releases = 0;
  	int len;
  	int ret;
  
  	ret = set_request_path_attr(req->r_inode, req->r_dentry,
  				    req->r_parent, req->r_path1,
  				    req->r_ino1.ino,
  				    &path1, &pathlen1, &ino1, &freepath1,
  				    test_bit(CEPH_MDS_R_PARENT_LOCKED,
  					     &req->r_req_flags));
  	if (ret < 0) {
  		msg = ERR_PTR(ret);
  		goto out;
  	}
  
  	/* If r_old_dentry is set, then assume that its parent is locked */
  	ret = set_request_path_attr(NULL, req->r_old_dentry,
  				    req->r_old_dentry_dir,
  				    req->r_path2, req->r_ino2.ino,
  				    &path2, &pathlen2, &ino2, &freepath2,
  				    true);
  	if (ret < 0) {
  		msg = ERR_PTR(ret);
  		goto out_free1;
  	}
  
  	/* head, two filepaths and the time stamp */
  	len = sizeof(*head) +
  		pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
  		sizeof(struct ceph_timespec);
  
  	/* calculate (max) length for cap releases */
  	release_slots = !!req->r_inode_drop + !!req->r_dentry_drop +
  			!!req->r_old_inode_drop + !!req->r_old_dentry_drop;
  	len += sizeof(struct ceph_mds_request_release) * release_slots;
  	if (req->r_dentry_drop)
  		len += pathlen1;
  	if (req->r_old_dentry_drop)
  		len += pathlen2;
  
  	msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false);
  	if (!msg) {
  		msg = ERR_PTR(-ENOMEM);
  		goto out_free2;
  	}
  
  	msg->hdr.version = cpu_to_le16(2);
  	msg->hdr.tid = cpu_to_le64(req->r_tid);
  
  	front = msg->front.iov_base;
  	head = front;
  	p = front + sizeof(*head);
  	end = front + msg->front.iov_len;
  
  	head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
  	head->op = cpu_to_le32(req->r_op);
  	head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid));
  	head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid));
  	head->ino = cpu_to_le64(req->r_deleg_ino);
  	head->args = req->r_args;
  
  	ceph_encode_filepath(&p, end, ino1, path1);
  	ceph_encode_filepath(&p, end, ino2, path2);
  
  	/* make note of release offset, in case we need to replay */
  	req->r_request_release_offset = p - front;
  
  	/* cap releases */
  	if (req->r_inode_drop)
  		releases += ceph_encode_inode_release(&p,
  		      req->r_inode ? req->r_inode : d_inode(req->r_dentry),
  		      mds, req->r_inode_drop, req->r_inode_unless,
  		      req->r_op == CEPH_MDS_OP_READDIR);
  	if (req->r_dentry_drop)
  		releases += ceph_encode_dentry_release(&p, req->r_dentry,
  				req->r_parent, mds, req->r_dentry_drop,
  				req->r_dentry_unless);
  	if (req->r_old_dentry_drop)
  		releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
  				req->r_old_dentry_dir, mds,
  				req->r_old_dentry_drop,
  				req->r_old_dentry_unless);
  	if (req->r_old_inode_drop)
  		releases += ceph_encode_inode_release(&p,
  		      d_inode(req->r_old_dentry),
  		      mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
  
  	if (drop_cap_releases) {
  		/* rewind over whatever releases were just encoded */
  		releases = 0;
  		p = front + req->r_request_release_offset;
  	}
  
  	head->num_releases = cpu_to_le16(releases);
  
  	/* time stamp */
  	ceph_encode_timespec64(&ts, &req->r_stamp);
  	ceph_encode_copy(&p, &ts, sizeof(ts));
  
  	if (WARN_ON_ONCE(p > end)) {
  		ceph_msg_put(msg);
  		msg = ERR_PTR(-ERANGE);
  		goto out_free2;
  	}
  
  	msg->front.iov_len = p - front;
  	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
  
  	if (!req->r_pagelist) {
  		msg->hdr.data_len = 0;
  	} else {
  		struct ceph_pagelist *pagelist = req->r_pagelist;
  
  		ceph_msg_data_add_pagelist(msg, pagelist);
  		msg->hdr.data_len = cpu_to_le32(pagelist->length);
  	}
  
  	msg->hdr.data_off = cpu_to_le16(0);
  
  out_free2:
  	if (freepath2)
  		ceph_mdsc_free_path((char *)path2, pathlen2);
  out_free1:
  	if (freepath1)
  		ceph_mdsc_free_path((char *)path1, pathlen1);
  out:
  	return msg;
  }
  
  /*
   * called under mdsc->mutex if error, under no mutex if
   * success.
   */
  static void complete_request(struct ceph_mds_client *mdsc,
  			     struct ceph_mds_request *req)
  {
70c948206   Xiubo Li   ceph: add metadat...
2599
  	req->r_end_latency = ktime_get();
2f2dc0534   Sage Weil   ceph: MDS client
2600
2601
  	if (req->r_callback)
  		req->r_callback(mdsc, req);
111c70810   Jeff Layton   ceph: after an MD...
2602
  	complete_all(&req->r_completion);
2f2dc0534   Sage Weil   ceph: MDS client
2603
2604
2605
2606
2607
2608
2609
  }
  
  /*
   * Prepare the message for @req that is about to be sent to @mds.
   *
   * Increments req->r_attempts and, when the request has an r_inode, records
   * in req->r_sent_on_mseq the mseq of the cap we hold from @mds (or -1 if
   * there is no such cap).
   *
   * If the request already got an unsafe reply, the original message is
   * reused as a replay: the REPLAY flag is set, cap/dentry releases are
   * dropped and the message front is truncated right after the time stamp.
   * Otherwise any previous message is released and a fresh one is built with
   * create_request_message().
   *
   * Returns 0 on success.  If the message cannot be created, the error is
   * stored in req->r_err and returned.
   *
   * called under mdsc->mutex
   */
  static int __prepare_send_request(struct ceph_mds_client *mdsc,
  				  struct ceph_mds_request *req,
  				  int mds, bool drop_cap_releases)
  {
  	struct ceph_mds_request_head *rhead;
  	struct ceph_msg *msg;
  	int flags = 0;

  	req->r_attempts++;
  	if (req->r_inode) {
  		struct ceph_cap *cap =
  			ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds);

  		if (cap)
  			req->r_sent_on_mseq = cap->mseq;
  		else
  			req->r_sent_on_mseq = -1;
  	}
  	dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
  	     req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);

  	if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
  		void *p;

  		/*
  		 * Replay.  Do not regenerate message (and rebuild
  		 * paths, etc.); just use the original message.
  		 * Rebuilding paths will break for renames because
  		 * d_move mangles the src name.
  		 */
  		msg = req->r_request;
  		rhead = msg->front.iov_base;

  		flags = le32_to_cpu(rhead->flags);
  		flags |= CEPH_MDS_FLAG_REPLAY;
  		rhead->flags = cpu_to_le32(flags);

  		if (req->r_target_inode)
  			rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));

  		rhead->num_retry = req->r_attempts - 1;

  		/* remove cap/dentry releases from message */
  		rhead->num_releases = 0;

  		/* time stamp */
  		p = msg->front.iov_base + req->r_request_release_offset;
  		{
  			struct ceph_timespec ts;

  			ceph_encode_timespec64(&ts, &req->r_stamp);
  			ceph_encode_copy(&p, &ts, sizeof(ts));
  		}

  		/* the front now ends right after the time stamp */
  		msg->front.iov_len = p - msg->front.iov_base;
  		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
  		return 0;
  	}

  	/* not a replay: drop any previously built message and rebuild */
  	if (req->r_request) {
  		ceph_msg_put(req->r_request);
  		req->r_request = NULL;
  	}
  	msg = create_request_message(mdsc, req, mds, drop_cap_releases);
  	if (IS_ERR(msg)) {
  		req->r_err = PTR_ERR(msg);
  		return PTR_ERR(msg);
  	}
  	req->r_request = msg;

  	rhead = msg->front.iov_base;
  	rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
  	/*
  	 * NOTE(review): the GOT_UNSAFE case already returned above, so this
  	 * test looks unreachable here; kept as-is, confirm before removing.
  	 */
  	if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
  		flags |= CEPH_MDS_FLAG_REPLAY;
  	if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))
  		flags |= CEPH_MDS_FLAG_ASYNC;
  	if (req->r_parent)
  		flags |= CEPH_MDS_FLAG_WANT_DENTRY;
  	rhead->flags = cpu_to_le32(flags);
  	rhead->num_fwd = req->r_num_fwd;
  	rhead->num_retry = req->r_attempts - 1;

  	dout(" r_parent = %p\n", req->r_parent);
  	return 0;
  }
  
  /*
   * Prepare @req for the mds behind @session and, if that succeeds, queue
   * the request message on the session's connection.  An extra reference
   * on the message is taken before it is handed to ceph_con_send().
   *
   * Returns 0 on success or the error from __prepare_send_request().
   *
   * called under mdsc->mutex
   */
  static int __send_request(struct ceph_mds_client *mdsc,
  			  struct ceph_mds_session *session,
  			  struct ceph_mds_request *req,
  			  bool drop_cap_releases)
  {
  	int ret = __prepare_send_request(mdsc, req, session->s_mds,
  					 drop_cap_releases);

  	if (ret)
  		return ret;

  	ceph_msg_get(req->r_request);
  	ceph_con_send(&session->s_con, req->r_request);
  	return 0;
  }
  
  /*
2f2dc0534   Sage Weil   ceph: MDS client
2711
2712
   * send request, or put it on the appropriate wait list.
   */
d55484929   Chengguang Xu   ceph: change to v...
2713
  static void __do_request(struct ceph_mds_client *mdsc,
2f2dc0534   Sage Weil   ceph: MDS client
2714
2715
2716
2717
  			struct ceph_mds_request *req)
  {
  	struct ceph_mds_session *session = NULL;
  	int mds = -1;
48fec5d0a   Yan, Zheng   ceph: EIO all ope...
2718
  	int err = 0;
c4853e977   Xiubo Li   ceph: retry the s...
2719
  	bool random;
2f2dc0534   Sage Weil   ceph: MDS client
2720

bc2de10dc   Jeff Layton   ceph: convert boo...
2721
2722
  	if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
  		if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
eb1b8af33   Yan, Zheng   ceph: cleanup abo...
2723
  			__unregister_request(mdsc, req);
d55484929   Chengguang Xu   ceph: change to v...
2724
  		return;
eb1b8af33   Yan, Zheng   ceph: cleanup abo...
2725
  	}
2f2dc0534   Sage Weil   ceph: MDS client
2726
2727
2728
2729
2730
  
  	if (req->r_timeout &&
  	    time_after_eq(jiffies, req->r_started + req->r_timeout)) {
  		dout("do_request timed out
  ");
8ccf7fcce   Xiubo Li   ceph: return ETIM...
2731
  		err = -ETIMEDOUT;
2f2dc0534   Sage Weil   ceph: MDS client
2732
2733
  		goto finish;
  	}
52953d559   Seraphime Kirkovski   ceph: cleanup ACC...
2734
  	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
48fec5d0a   Yan, Zheng   ceph: EIO all ope...
2735
2736
2737
2738
2739
  		dout("do_request forced umount
  ");
  		err = -EIO;
  		goto finish;
  	}
52953d559   Seraphime Kirkovski   ceph: cleanup ACC...
2740
  	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) {
e9e427f0a   Yan, Zheng   ceph: check avail...
2741
2742
2743
2744
2745
2746
  		if (mdsc->mdsmap_err) {
  			err = mdsc->mdsmap_err;
  			dout("do_request mdsmap err %d
  ", err);
  			goto finish;
  		}
cc8e83429   Yan, Zheng   ceph: fix mds clu...
2747
2748
2749
2750
  		if (mdsc->mdsmap->m_epoch == 0) {
  			dout("do_request no mdsmap, waiting for map
  ");
  			list_add(&req->r_wait, &mdsc->waiting_for_map);
d55484929   Chengguang Xu   ceph: change to v...
2751
  			return;
cc8e83429   Yan, Zheng   ceph: fix mds clu...
2752
  		}
e9e427f0a   Yan, Zheng   ceph: check avail...
2753
2754
2755
  		if (!(mdsc->fsc->mount_options->flags &
  		      CEPH_MOUNT_OPT_MOUNTWAIT) &&
  		    !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) {
97820058f   Xiubo Li   ceph: check avail...
2756
  			err = -EHOSTUNREACH;
e9e427f0a   Yan, Zheng   ceph: check avail...
2757
2758
2759
  			goto finish;
  		}
  	}
2f2dc0534   Sage Weil   ceph: MDS client
2760

dc69e2e9f   Sage Weil   ceph: associate r...
2761
  	put_request_session(req);
c4853e977   Xiubo Li   ceph: retry the s...
2762
  	mds = __choose_mds(mdsc, req, &random);
2f2dc0534   Sage Weil   ceph: MDS client
2763
2764
  	if (mds < 0 ||
  	    ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
3bb48b414   Jeff Layton   ceph: add flag to...
2765
2766
2767
2768
  		if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
  			err = -EJUKEBOX;
  			goto finish;
  		}
2f2dc0534   Sage Weil   ceph: MDS client
2769
2770
2771
  		dout("do_request no mds or not active, waiting for map
  ");
  		list_add(&req->r_wait, &mdsc->waiting_for_map);
d55484929   Chengguang Xu   ceph: change to v...
2772
  		return;
2f2dc0534   Sage Weil   ceph: MDS client
2773
2774
2775
2776
  	}
  
  	/* get, open session */
  	session = __ceph_lookup_mds_session(mdsc, mds);
9c423956b   Sage Weil   ceph: propagate m...
2777
  	if (!session) {
2f2dc0534   Sage Weil   ceph: MDS client
2778
  		session = register_session(mdsc, mds);
9c423956b   Sage Weil   ceph: propagate m...
2779
2780
2781
2782
2783
  		if (IS_ERR(session)) {
  			err = PTR_ERR(session);
  			goto finish;
  		}
  	}
5b3248c67   Xiubo Li   ceph: rename get_...
2784
  	req->r_session = ceph_get_mds_session(session);
dc69e2e9f   Sage Weil   ceph: associate r...
2785

2f2dc0534   Sage Weil   ceph: MDS client
2786
2787
  	dout("do_request mds%d session %p state %s
  ", mds, session,
a687ecaf5   John Spray   ceph: export ceph...
2788
  	     ceph_session_state_name(session->s_state));
2f2dc0534   Sage Weil   ceph: MDS client
2789
2790
  	if (session->s_state != CEPH_MDS_SESSION_OPEN &&
  	    session->s_state != CEPH_MDS_SESSION_HUNG) {
fcff415c9   Yan, Zheng   ceph: handle CEPH...
2791
2792
2793
2794
  		if (session->s_state == CEPH_MDS_SESSION_REJECTED) {
  			err = -EACCES;
  			goto out_session;
  		}
3bb48b414   Jeff Layton   ceph: add flag to...
2795
2796
2797
2798
2799
2800
2801
2802
2803
  		/*
  		 * We cannot queue async requests since the caps and delegated
  		 * inodes are bound to the session. Just return -EJUKEBOX and
  		 * let the caller retry a sync request in that case.
  		 */
  		if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
  			err = -EJUKEBOX;
  			goto out_session;
  		}
2f2dc0534   Sage Weil   ceph: MDS client
2804
  		if (session->s_state == CEPH_MDS_SESSION_NEW ||
c4853e977   Xiubo Li   ceph: retry the s...
2805
  		    session->s_state == CEPH_MDS_SESSION_CLOSING) {
b682c6d41   Xiubo Li   ceph: switch to W...
2806
2807
2808
  			err = __open_session(mdsc, session);
  			if (err)
  				goto out_session;
c4853e977   Xiubo Li   ceph: retry the s...
2809
2810
2811
2812
  			/* retry the same mds later */
  			if (random)
  				req->r_resend_mds = mds;
  		}
2f2dc0534   Sage Weil   ceph: MDS client
2813
2814
2815
2816
2817
  		list_add(&req->r_wait, &session->s_waiting);
  		goto out_session;
  	}
  
  	/* send request */
2f2dc0534   Sage Weil   ceph: MDS client
2818
2819
2820
2821
  	req->r_resend_mds = -1;   /* forget any previous mds hint */
  
  	if (req->r_request_started == 0)   /* note request start time */
  		req->r_request_started = jiffies;
9cf54563b   Xiubo Li   ceph: add __send_...
2822
  	err = __send_request(mdsc, session, req, false);
2f2dc0534   Sage Weil   ceph: MDS client
2823
2824
2825
  
  out_session:
  	ceph_put_mds_session(session);
48fec5d0a   Yan, Zheng   ceph: EIO all ope...
2826
2827
2828
2829
2830
2831
2832
2833
  finish:
  	if (err) {
  		dout("__do_request early error %d
  ", err);
  		req->r_err = err;
  		complete_request(mdsc, req);
  		__unregister_request(mdsc, req);
  	}
d55484929   Chengguang Xu   ceph: change to v...
2834
  	return;
2f2dc0534   Sage Weil   ceph: MDS client
2835
2836
2837
2838
2839
2840
2841
2842
  }
  
  /*
   * called under mdsc->mutex
   */
  static void __wake_requests(struct ceph_mds_client *mdsc,
  			    struct list_head *head)
  {
ed75ec2cd   Yan, Zheng   ceph: Fix infinit...
2843
2844
2845
2846
  	struct ceph_mds_request *req;
  	LIST_HEAD(tmp_list);
  
  	list_splice_init(head, &tmp_list);
2f2dc0534   Sage Weil   ceph: MDS client
2847

ed75ec2cd   Yan, Zheng   ceph: Fix infinit...
2848
2849
2850
  	while (!list_empty(&tmp_list)) {
  		req = list_entry(tmp_list.next,
  				 struct ceph_mds_request, r_wait);
2f2dc0534   Sage Weil   ceph: MDS client
2851
  		list_del_init(&req->r_wait);
7971bd92b   Sage Weil   ceph: revert comm...
2852
2853
  		dout(" wake request %p tid %llu
  ", req, req->r_tid);
2f2dc0534   Sage Weil   ceph: MDS client
2854
2855
2856
2857
2858
2859
  		__do_request(mdsc, req);
  	}
  }
  
  /*
   * Wake up threads with requests pending for @mds, so that they can
29790f26a   Sage Weil   ceph: wait for md...
2860
   * resubmit their requests to a possibly different mds.
2f2dc0534   Sage Weil   ceph: MDS client
2861
   */
29790f26a   Sage Weil   ceph: wait for md...
2862
  static void kick_requests(struct ceph_mds_client *mdsc, int mds)
2f2dc0534   Sage Weil   ceph: MDS client
2863
  {
44ca18f26   Sage Weil   ceph: use rbtree ...
2864
  	struct ceph_mds_request *req;
282c10522   Yan, Zheng   ceph: fix kick_re...
2865
  	struct rb_node *p = rb_first(&mdsc->request_tree);
2f2dc0534   Sage Weil   ceph: MDS client
2866
2867
2868
  
  	dout("kick_requests mds%d
  ", mds);
282c10522   Yan, Zheng   ceph: fix kick_re...
2869
  	while (p) {
44ca18f26   Sage Weil   ceph: use rbtree ...
2870
  		req = rb_entry(p, struct ceph_mds_request, r_node);
282c10522   Yan, Zheng   ceph: fix kick_re...
2871
  		p = rb_next(p);
bc2de10dc   Jeff Layton   ceph: convert boo...
2872
  		if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
44ca18f26   Sage Weil   ceph: use rbtree ...
2873
  			continue;
3de22be67   Yan, Zheng   ceph: re-send req...
2874
2875
  		if (req->r_attempts > 0)
  			continue; /* only new requests */
44ca18f26   Sage Weil   ceph: use rbtree ...
2876
2877
2878
2879
  		if (req->r_session &&
  		    req->r_session->s_mds == mds) {
  			dout(" kicking tid %llu
  ", req->r_tid);
03974e817   Yan, Zheng   ceph: make sure r...
2880
  			list_del_init(&req->r_wait);
44ca18f26   Sage Weil   ceph: use rbtree ...
2881
  			__do_request(mdsc, req);
2f2dc0534   Sage Weil   ceph: MDS client
2882
2883
2884
  		}
  	}
  }
86bda539f   Jeff Layton   ceph: have ceph_m...
2885
  int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
2f2dc0534   Sage Weil   ceph: MDS client
2886
2887
  			      struct ceph_mds_request *req)
  {
891f3f5a6   Jeff Layton   ceph: add infrast...
2888
  	int err = 0;
86bda539f   Jeff Layton   ceph: have ceph_m...
2889
2890
2891
2892
  
  	/* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
  	if (req->r_inode)
  		ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
9c1c2b35f   Jeff Layton   ceph: hold extra ...
2893
  	if (req->r_parent) {
719a2514e   Yan, Zheng   ceph: consider in...
2894
2895
2896
2897
2898
2899
2900
  		struct ceph_inode_info *ci = ceph_inode(req->r_parent);
  		int fmode = (req->r_op & CEPH_MDS_OP_WRITE) ?
  			    CEPH_FILE_MODE_WR : CEPH_FILE_MODE_RD;
  		spin_lock(&ci->i_ceph_lock);
  		ceph_take_cap_refs(ci, CEPH_CAP_PIN, false);
  		__ceph_touch_fmode(ci, mdsc, fmode);
  		spin_unlock(&ci->i_ceph_lock);
9c1c2b35f   Jeff Layton   ceph: hold extra ...
2901
2902
  		ihold(req->r_parent);
  	}
86bda539f   Jeff Layton   ceph: have ceph_m...
2903
2904
2905
  	if (req->r_old_dentry_dir)
  		ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
  				  CEPH_CAP_PIN);
891f3f5a6   Jeff Layton   ceph: add infrast...
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
  	if (req->r_inode) {
  		err = ceph_wait_on_async_create(req->r_inode);
  		if (err) {
  			dout("%s: wait for async create returned: %d
  ",
  			     __func__, err);
  			return err;
  		}
  	}
  
  	if (!err && req->r_old_inode) {
  		err = ceph_wait_on_async_create(req->r_old_inode);
  		if (err) {
  			dout("%s: wait for async create returned: %d
  ",
  			     __func__, err);
  			return err;
  		}
  	}
86bda539f   Jeff Layton   ceph: have ceph_m...
2925
2926
  	dout("submit_request on %p for inode %p
  ", req, dir);
2f2dc0534   Sage Weil   ceph: MDS client
2927
  	mutex_lock(&mdsc->mutex);
86bda539f   Jeff Layton   ceph: have ceph_m...
2928
  	__register_request(mdsc, req, dir);
2f2dc0534   Sage Weil   ceph: MDS client
2929
  	__do_request(mdsc, req);
86bda539f   Jeff Layton   ceph: have ceph_m...
2930
  	err = req->r_err;
2f2dc0534   Sage Weil   ceph: MDS client
2931
  	mutex_unlock(&mdsc->mutex);
86bda539f   Jeff Layton   ceph: have ceph_m...
2932
  	return err;
2f2dc0534   Sage Weil   ceph: MDS client
2933
  }
8340f22ce   Jeff Layton   ceph: move wait f...
2934
2935
  static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
  				  struct ceph_mds_request *req)
2f2dc0534   Sage Weil   ceph: MDS client
2936
2937
  {
  	int err;
e1518c7c0   Sage Weil   ceph: clean up md...
2938
  	/* wait */
e1518c7c0   Sage Weil   ceph: clean up md...
2939
2940
  	dout("do_request waiting
  ");
5be730347   Ilya Dryomov   ceph: simplify tw...
2941
  	if (!req->r_timeout && req->r_wait_for_completion) {
9280be24d   Yan, Zheng   ceph: fix file lo...
2942
  		err = req->r_wait_for_completion(mdsc, req);
e1518c7c0   Sage Weil   ceph: clean up md...
2943
  	} else {
5be730347   Ilya Dryomov   ceph: simplify tw...
2944
2945
2946
2947
2948
2949
  		long timeleft = wait_for_completion_killable_timeout(
  					&req->r_completion,
  					ceph_timeout_jiffies(req->r_timeout));
  		if (timeleft > 0)
  			err = 0;
  		else if (!timeleft)
8ccf7fcce   Xiubo Li   ceph: return ETIM...
2950
  			err = -ETIMEDOUT;  /* timed out */
5be730347   Ilya Dryomov   ceph: simplify tw...
2951
2952
  		else
  			err = timeleft;  /* killed */
e1518c7c0   Sage Weil   ceph: clean up md...
2953
2954
2955
2956
  	}
  	dout("do_request waited, got %d
  ", err);
  	mutex_lock(&mdsc->mutex);
5b1daecd5   Sage Weil   ceph: properly ha...
2957

e1518c7c0   Sage Weil   ceph: clean up md...
2958
  	/* only abort if we didn't race with a real reply */
bc2de10dc   Jeff Layton   ceph: convert boo...
2959
  	if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
e1518c7c0   Sage Weil   ceph: clean up md...
2960
2961
2962
2963
  		err = le32_to_cpu(req->r_reply_info.head->result);
  	} else if (err < 0) {
  		dout("aborted request %lld with %d
  ", req->r_tid, err);
b4556396f   Sage Weil   ceph: fix race be...
2964
2965
2966
2967
2968
2969
2970
  
  		/*
  		 * ensure we aren't running concurrently with
  		 * ceph_fill_trace or ceph_readdir_prepopulate, which
  		 * rely on locks (dir mutex) held by our caller.
  		 */
  		mutex_lock(&req->r_fill_mutex);
e1518c7c0   Sage Weil   ceph: clean up md...
2971
  		req->r_err = err;
bc2de10dc   Jeff Layton   ceph: convert boo...
2972
  		set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
b4556396f   Sage Weil   ceph: fix race be...
2973
  		mutex_unlock(&req->r_fill_mutex);
5b1daecd5   Sage Weil   ceph: properly ha...
2974

3dd69aabc   Jeff Layton   ceph: add a new f...
2975
  		if (req->r_parent &&
167c9e352   Sage Weil   ceph: use common ...
2976
2977
  		    (req->r_op & CEPH_MDS_OP_WRITE))
  			ceph_invalidate_dir_request(req);
2f2dc0534   Sage Weil   ceph: MDS client
2978
  	} else {
e1518c7c0   Sage Weil   ceph: clean up md...
2979
  		err = req->r_err;
2f2dc0534   Sage Weil   ceph: MDS client
2980
  	}
2f2dc0534   Sage Weil   ceph: MDS client
2981

e1518c7c0   Sage Weil   ceph: clean up md...
2982
  	mutex_unlock(&mdsc->mutex);
8340f22ce   Jeff Layton   ceph: move wait f...
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
  	return err;
  }
  
  /*
   * Synchronously perform an mds request.  Take care of all of the
   * session setup, forwarding, retry details.
   *
   * Submits @req and, if submission reports no error, waits for it.
   * Returns the error from submission, or otherwise the result of the wait.
   */
  int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
  			 struct inode *dir,
  			 struct ceph_mds_request *req)
  {
  	int err;

  	dout("do_request on %p\n", req);

  	/* issue */
  	err = ceph_mdsc_submit_request(mdsc, dir, req);
  	if (!err)
  		err = ceph_mdsc_wait_request(mdsc, req);
  	dout("do_request %p done, result %d\n", req, err);
  	return err;
  }
  
  /*
   * Invalidate dir's completeness, dentry lease state on an aborted MDS
   * namespace request.
   *
   * Clears the "complete" state of req->r_parent and, if set, of
   * req->r_old_dentry_dir, and invalidates the leases of req->r_dentry and
   * req->r_old_dentry when they are set.  req->r_parent is used without a
   * NULL check.
   */
  void ceph_invalidate_dir_request(struct ceph_mds_request *req)
  {
  	struct inode *dir = req->r_parent;
  	struct inode *old_dir = req->r_old_dentry_dir;

  	dout("invalidate_dir_request %p %p (complete, lease(s))\n", dir, old_dir);

  	ceph_dir_clear_complete(dir);
  	if (old_dir)
  		ceph_dir_clear_complete(old_dir);
  	if (req->r_dentry)
  		ceph_invalidate_dentry_lease(req->r_dentry);
  	if (req->r_old_dentry)
  		ceph_invalidate_dentry_lease(req->r_old_dentry);
  }
  
  /*
2f2dc0534   Sage Weil   ceph: MDS client
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
   * Handle mds reply.
   *
   * We take the session mutex and parse and process the reply immediately.
   * This preserves the logical ordering of replies, capabilities, etc., sent
   * by the MDS as they are applied to our local cache.
   */
  static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
  {
  	struct ceph_mds_client *mdsc = session->s_mdsc;
  	struct ceph_mds_request *req;
  	struct ceph_mds_reply_head *head = msg->front.iov_base;
  	struct ceph_mds_reply_info_parsed *rinfo;  /* parsed reply info */
982d6011b   Yan, Zheng   ceph: improve ref...
3042
  	struct ceph_snap_realm *realm;
2f2dc0534   Sage Weil   ceph: MDS client
3043
3044
  	u64 tid;
  	int err, result;
2600d2dd5   Sage Weil   ceph: drop messag...
3045
  	int mds = session->s_mds;
2f2dc0534   Sage Weil   ceph: MDS client
3046

2f2dc0534   Sage Weil   ceph: MDS client
3047
3048
3049
  	if (msg->front.iov_len < sizeof(*head)) {
  		pr_err("mdsc_handle_reply got corrupt (short) reply
  ");
9ec7cab14   Sage Weil   ceph: hex dump co...
3050
  		ceph_msg_dump(msg);
2f2dc0534   Sage Weil   ceph: MDS client
3051
3052
3053
3054
  		return;
  	}
  
  	/* get request, session */
6df058c02   Sage Weil   ceph: include tra...
3055
  	tid = le64_to_cpu(msg->hdr.tid);
2f2dc0534   Sage Weil   ceph: MDS client
3056
  	mutex_lock(&mdsc->mutex);
fcd00b68b   Ilya Dryomov   libceph: DEFINE_R...
3057
  	req = lookup_get_request(mdsc, tid);
2f2dc0534   Sage Weil   ceph: MDS client
3058
3059
3060
3061
3062
3063
3064
3065
  	if (!req) {
  		dout("handle_reply on unknown tid %llu
  ", tid);
  		mutex_unlock(&mdsc->mutex);
  		return;
  	}
  	dout("handle_reply %p
  ", req);
2f2dc0534   Sage Weil   ceph: MDS client
3066
3067
  
  	/* correct session? */
d96d60498   Sage Weil   ceph: fix session...
3068
  	if (req->r_session != session) {
2f2dc0534   Sage Weil   ceph: MDS client
3069
3070
3071
3072
3073
3074
3075
3076
3077
  		pr_err("mdsc_handle_reply got %llu on session mds%d"
  		       " not mds%d
  ", tid, session->s_mds,
  		       req->r_session ? req->r_session->s_mds : -1);
  		mutex_unlock(&mdsc->mutex);
  		goto out;
  	}
  
  	/* dup? */
bc2de10dc   Jeff Layton   ceph: convert boo...
3078
3079
  	if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) ||
  	    (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) {
f3ae1b97b   Fabian Frederick   fs/ceph: replace ...
3080
3081
  		pr_warn("got a dup %s reply on %llu from mds%d
  ",
2f2dc0534   Sage Weil   ceph: MDS client
3082
3083
3084
3085
  			   head->safe ? "safe" : "unsafe", tid, mds);
  		mutex_unlock(&mdsc->mutex);
  		goto out;
  	}
bc2de10dc   Jeff Layton   ceph: convert boo...
3086
  	if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) {
f3ae1b97b   Fabian Frederick   fs/ceph: replace ...
3087
3088
  		pr_warn("got unsafe after safe on %llu from mds%d
  ",
85792d0dd   Sage Weil   ceph: cope with o...
3089
3090
3091
3092
  			   tid, mds);
  		mutex_unlock(&mdsc->mutex);
  		goto out;
  	}
2f2dc0534   Sage Weil   ceph: MDS client
3093
3094
3095
3096
  
  	result = le32_to_cpu(head->result);
  
  	/*
e55b71f80   Greg Farnum   ceph: handle ESTA...
3097
3098
3099
3100
3101
  	 * Handle an ESTALE
  	 * if we're not talking to the authority, send to them
  	 * if the authority has changed while we weren't looking,
  	 * send to new authority
  	 * Otherwise we just have to return an ESTALE
2f2dc0534   Sage Weil   ceph: MDS client
3102
3103
  	 */
  	if (result == -ESTALE) {
4c069a582   Chengguang Xu   ceph: add newline...
3104
3105
  		dout("got ESTALE on request %llu
  ", req->r_tid);
51da8e8c6   Yan, Zheng   ceph: reset r_res...
3106
  		req->r_resend_mds = -1;
ca18bede0   Yan, Zheng   ceph: handle -EST...
3107
  		if (req->r_direct_mode != USE_AUTH_MDS) {
4c069a582   Chengguang Xu   ceph: add newline...
3108
3109
  			dout("not using auth, setting for that now
  ");
e55b71f80   Greg Farnum   ceph: handle ESTA...
3110
  			req->r_direct_mode = USE_AUTH_MDS;
2f2dc0534   Sage Weil   ceph: MDS client
3111
3112
3113
  			__do_request(mdsc, req);
  			mutex_unlock(&mdsc->mutex);
  			goto out;
e55b71f80   Greg Farnum   ceph: handle ESTA...
3114
  		} else  {
c4853e977   Xiubo Li   ceph: retry the s...
3115
  			int mds = __choose_mds(mdsc, req, NULL);
ca18bede0   Yan, Zheng   ceph: handle -EST...
3116
  			if (mds >= 0 && mds != req->r_session->s_mds) {
4c069a582   Chengguang Xu   ceph: add newline...
3117
3118
  				dout("but auth changed, so resending
  ");
e55b71f80   Greg Farnum   ceph: handle ESTA...
3119
3120
3121
3122
  				__do_request(mdsc, req);
  				mutex_unlock(&mdsc->mutex);
  				goto out;
  			}
2f2dc0534   Sage Weil   ceph: MDS client
3123
  		}
4c069a582   Chengguang Xu   ceph: add newline...
3124
3125
  		dout("have to return ESTALE on request %llu
  ", req->r_tid);
2f2dc0534   Sage Weil   ceph: MDS client
3126
  	}
e55b71f80   Greg Farnum   ceph: handle ESTA...
3127

2f2dc0534   Sage Weil   ceph: MDS client
3128
  	if (head->safe) {
bc2de10dc   Jeff Layton   ceph: convert boo...
3129
  		set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);
2f2dc0534   Sage Weil   ceph: MDS client
3130
  		__unregister_request(mdsc, req);
2f2dc0534   Sage Weil   ceph: MDS client
3131

07edc0571   Xiubo Li   ceph: fix possibl...
3132
3133
3134
  		/* last request during umount? */
  		if (mdsc->stopping && !__get_oldest_req(mdsc))
  			complete_all(&mdsc->safe_umount_waiters);
bc2de10dc   Jeff Layton   ceph: convert boo...
3135
  		if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
2f2dc0534   Sage Weil   ceph: MDS client
3136
3137
3138
3139
3140
3141
3142
3143
3144
  			/*
  			 * We already handled the unsafe response, now do the
  			 * cleanup.  No need to examine the response; the MDS
  			 * doesn't include any result info in the safe
  			 * response.  And even if it did, there is nothing
  			 * useful we could do with a revised return value.
  			 */
  			dout("got safe reply %llu, mds%d
  ", tid, mds);
2f2dc0534   Sage Weil   ceph: MDS client
3145

2f2dc0534   Sage Weil   ceph: MDS client
3146
3147
3148
  			mutex_unlock(&mdsc->mutex);
  			goto out;
  		}
e1518c7c0   Sage Weil   ceph: clean up md...
3149
  	} else {
bc2de10dc   Jeff Layton   ceph: convert boo...
3150
  		set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags);
2f2dc0534   Sage Weil   ceph: MDS client
3151
3152
3153
3154
3155
3156
  		list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
  	}
  
  	dout("handle_reply tid %lld result %d
  ", tid, result);
  	rinfo = &req->r_reply_info;
b37fe1f92   Yan, Zheng   ceph: support ver...
3157
  	if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
d48464878   Jeff Layton   ceph: decode inte...
3158
  		err = parse_reply_info(session, msg, rinfo, (u64)-1);
b37fe1f92   Yan, Zheng   ceph: support ver...
3159
  	else
d48464878   Jeff Layton   ceph: decode inte...
3160
  		err = parse_reply_info(session, msg, rinfo, session->s_con.peer_features);
2f2dc0534   Sage Weil   ceph: MDS client
3161
3162
3163
3164
  	mutex_unlock(&mdsc->mutex);
  
  	mutex_lock(&session->s_mutex);
  	if (err < 0) {
25933abdd   Herb Shiu   ceph: Handle file...
3165
3166
  		pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)
  ", mds, tid);
9ec7cab14   Sage Weil   ceph: hex dump co...
3167
  		ceph_msg_dump(msg);
2f2dc0534   Sage Weil   ceph: MDS client
3168
3169
3170
3171
  		goto out_err;
  	}
  
  	/* snap trace */
982d6011b   Yan, Zheng   ceph: improve ref...
3172
  	realm = NULL;
2f2dc0534   Sage Weil   ceph: MDS client
3173
3174
3175
  	if (rinfo->snapblob_len) {
  		down_write(&mdsc->snap_rwsem);
  		ceph_update_snap_trace(mdsc, rinfo->snapblob,
982d6011b   Yan, Zheng   ceph: improve ref...
3176
3177
3178
  				rinfo->snapblob + rinfo->snapblob_len,
  				le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP,
  				&realm);
2f2dc0534   Sage Weil   ceph: MDS client
3179
3180
3181
3182
3183
3184
  		downgrade_write(&mdsc->snap_rwsem);
  	} else {
  		down_read(&mdsc->snap_rwsem);
  	}
  
  	/* insert trace into our cache */
b4556396f   Sage Weil   ceph: fix race be...
3185
  	mutex_lock(&req->r_fill_mutex);
315f24088   Yan, Zheng   ceph: fix securit...
3186
  	current->journal_info = req;
f5a03b080   Jeff Layton   ceph: drop sessio...
3187
  	err = ceph_fill_trace(mdsc->fsc->sb, req);
2f2dc0534   Sage Weil   ceph: MDS client
3188
  	if (err == 0) {
6e8575faa   Sam Lang   ceph: Check for c...
3189
  		if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
81c6aea52   Yan, Zheng   ceph: handle frag...
3190
  				    req->r_op == CEPH_MDS_OP_LSSNAP))
2f2dc0534   Sage Weil   ceph: MDS client
3191
  			ceph_readdir_prepopulate(req, req->r_session);
2f2dc0534   Sage Weil   ceph: MDS client
3192
  	}
315f24088   Yan, Zheng   ceph: fix securit...
3193
  	current->journal_info = NULL;
b4556396f   Sage Weil   ceph: fix race be...
3194
  	mutex_unlock(&req->r_fill_mutex);
2f2dc0534   Sage Weil   ceph: MDS client
3195
3196
  
  	up_read(&mdsc->snap_rwsem);
982d6011b   Yan, Zheng   ceph: improve ref...
3197
3198
  	if (realm)
  		ceph_put_snap_realm(mdsc, realm);
68cd5b4b7   Yan, Zheng   ceph: make fsync(...
3199

fe33032da   Yan, Zheng   ceph: add mount o...
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
  	if (err == 0) {
  		if (req->r_target_inode &&
  		    test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
  			struct ceph_inode_info *ci =
  				ceph_inode(req->r_target_inode);
  			spin_lock(&ci->i_unsafe_lock);
  			list_add_tail(&req->r_unsafe_target_item,
  				      &ci->i_unsafe_iops);
  			spin_unlock(&ci->i_unsafe_lock);
  		}
  
  		ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
68cd5b4b7   Yan, Zheng   ceph: make fsync(...
3212
  	}
2f2dc0534   Sage Weil   ceph: MDS client
3213
  out_err:
e1518c7c0   Sage Weil   ceph: clean up md...
3214
  	mutex_lock(&mdsc->mutex);
bc2de10dc   Jeff Layton   ceph: convert boo...
3215
  	if (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
e1518c7c0   Sage Weil   ceph: clean up md...
3216
3217
3218
  		if (err) {
  			req->r_err = err;
  		} else {
5fdb1389e   Jianpeng Ma   ceph: cleanup use...
3219
  			req->r_reply =  ceph_msg_get(msg);
bc2de10dc   Jeff Layton   ceph: convert boo...
3220
  			set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags);
e1518c7c0   Sage Weil   ceph: clean up md...
3221
  		}
2f2dc0534   Sage Weil   ceph: MDS client
3222
  	} else {
e1518c7c0   Sage Weil   ceph: clean up md...
3223
3224
  		dout("reply arrived after request %lld was aborted
  ", tid);
2f2dc0534   Sage Weil   ceph: MDS client
3225
  	}
e1518c7c0   Sage Weil   ceph: clean up md...
3226
  	mutex_unlock(&mdsc->mutex);
2f2dc0534   Sage Weil   ceph: MDS client
3227

2f2dc0534   Sage Weil   ceph: MDS client
3228
3229
3230
3231
  	mutex_unlock(&session->s_mutex);
  
  	/* kick calling process */
  	complete_request(mdsc, req);
70c948206   Xiubo Li   ceph: add metadat...
3232
3233
3234
  
  	ceph_update_metadata_latency(&mdsc->metric, req->r_start_latency,
  				     req->r_end_latency, err);
2f2dc0534   Sage Weil   ceph: MDS client
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
  out:
  	ceph_mdsc_put_request(req);
  	return;
  }
  
  
  
  /*
   * handle mds notification that our request has been forwarded.
   */
2600d2dd5   Sage Weil   ceph: drop messag...
3245
3246
3247
  static void handle_forward(struct ceph_mds_client *mdsc,
  			   struct ceph_mds_session *session,
  			   struct ceph_msg *msg)
2f2dc0534   Sage Weil   ceph: MDS client
3248
3249
  {
  	struct ceph_mds_request *req;
a1ea787c7   Sage Weil   ceph: fix client_...
3250
  	u64 tid = le64_to_cpu(msg->hdr.tid);
2f2dc0534   Sage Weil   ceph: MDS client
3251
3252
  	u32 next_mds;
  	u32 fwd_seq;
2f2dc0534   Sage Weil   ceph: MDS client
3253
3254
3255
  	int err = -EINVAL;
  	void *p = msg->front.iov_base;
  	void *end = p + msg->front.iov_len;
2f2dc0534   Sage Weil   ceph: MDS client
3256

a1ea787c7   Sage Weil   ceph: fix client_...
3257
  	ceph_decode_need(&p, end, 2*sizeof(u32), bad);
c89136ea4   Sage Weil   ceph: convert enc...
3258
3259
  	next_mds = ceph_decode_32(&p);
  	fwd_seq = ceph_decode_32(&p);
2f2dc0534   Sage Weil   ceph: MDS client
3260
3261
  
  	mutex_lock(&mdsc->mutex);
fcd00b68b   Ilya Dryomov   libceph: DEFINE_R...
3262
  	req = lookup_get_request(mdsc, tid);
2f2dc0534   Sage Weil   ceph: MDS client
3263
  	if (!req) {
2a8e5e363   Sage Weil   ceph: clean up on...
3264
3265
  		dout("forward tid %llu to mds%d - req dne
  ", tid, next_mds);
2f2dc0534   Sage Weil   ceph: MDS client
3266
3267
  		goto out;  /* dup reply? */
  	}
bc2de10dc   Jeff Layton   ceph: convert boo...
3268
  	if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
2a8e5e363   Sage Weil   ceph: clean up on...
3269
3270
3271
3272
3273
3274
  		dout("forward tid %llu aborted, unregistering
  ", tid);
  		__unregister_request(mdsc, req);
  	} else if (fwd_seq <= req->r_num_fwd) {
  		dout("forward tid %llu to mds%d - old seq %d <= %d
  ",
2f2dc0534   Sage Weil   ceph: MDS client
3275
3276
3277
  		     tid, next_mds, req->r_num_fwd, fwd_seq);
  	} else {
  		/* resend. forward race not possible; mds would drop */
2a8e5e363   Sage Weil   ceph: clean up on...
3278
3279
3280
  		dout("forward tid %llu to mds%d (we resend)
  ", tid, next_mds);
  		BUG_ON(req->r_err);
bc2de10dc   Jeff Layton   ceph: convert boo...
3281
  		BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags));
3de22be67   Yan, Zheng   ceph: re-send req...
3282
  		req->r_attempts = 0;
2f2dc0534   Sage Weil   ceph: MDS client
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
  		req->r_num_fwd = fwd_seq;
  		req->r_resend_mds = next_mds;
  		put_request_session(req);
  		__do_request(mdsc, req);
  	}
  	ceph_mdsc_put_request(req);
  out:
  	mutex_unlock(&mdsc->mutex);
  	return;
  
  bad:
  	pr_err("mdsc_handle_forward decode error err=%d
  ", err);
  }
131d7eb4f   Yan, Zheng   ceph: auto reconn...
3297
  static int __decode_session_metadata(void **p, void *end,
0b98acd61   Ilya Dryomov   libceph, rbd, cep...
3298
  				     bool *blocklisted)
84bf39509   Yan, Zheng   ceph: decode feat...
3299
3300
3301
  {
  	/* map<string,string> */
  	u32 n;
131d7eb4f   Yan, Zheng   ceph: auto reconn...
3302
  	bool err_str;
84bf39509   Yan, Zheng   ceph: decode feat...
3303
3304
3305
3306
3307
  	ceph_decode_32_safe(p, end, n, bad);
  	while (n-- > 0) {
  		u32 len;
  		ceph_decode_32_safe(p, end, len, bad);
  		ceph_decode_need(p, end, len, bad);
131d7eb4f   Yan, Zheng   ceph: auto reconn...
3308
  		err_str = !strncmp(*p, "error_string", len);
84bf39509   Yan, Zheng   ceph: decode feat...
3309
3310
3311
  		*p += len;
  		ceph_decode_32_safe(p, end, len, bad);
  		ceph_decode_need(p, end, len, bad);
4bb926e83   Ilya Dryomov   ceph: add a note ...
3312
3313
3314
3315
  		/*
  		 * Match "blocklisted (blacklisted)" from newer MDSes,
  		 * or "blacklisted" from older MDSes.
  		 */
131d7eb4f   Yan, Zheng   ceph: auto reconn...
3316
  		if (err_str && strnstr(*p, "blacklisted", len))
0b98acd61   Ilya Dryomov   libceph, rbd, cep...
3317
  			*blocklisted = true;
84bf39509   Yan, Zheng   ceph: decode feat...
3318
3319
3320
3321
3322
3323
  		*p += len;
  	}
  	return 0;
  bad:
  	return -1;
  }
2f2dc0534   Sage Weil   ceph: MDS client
3324
3325
3326
3327
3328
3329
3330
  /*
   * handle a mds session control message
   */
  static void handle_session(struct ceph_mds_session *session,
  			   struct ceph_msg *msg)
  {
  	struct ceph_mds_client *mdsc = session->s_mdsc;
84bf39509   Yan, Zheng   ceph: decode feat...
3331
3332
3333
3334
3335
  	int mds = session->s_mds;
  	int msg_version = le16_to_cpu(msg->hdr.version);
  	void *p = msg->front.iov_base;
  	void *end = p + msg->front.iov_len;
  	struct ceph_mds_session_head *h;
2f2dc0534   Sage Weil   ceph: MDS client
3336
  	u32 op;
0fa826336   Jeff Layton   ceph: fix endiann...
3337
  	u64 seq, features = 0;
2f2dc0534   Sage Weil   ceph: MDS client
3338
  	int wake = 0;
0b98acd61   Ilya Dryomov   libceph, rbd, cep...
3339
  	bool blocklisted = false;
2f2dc0534   Sage Weil   ceph: MDS client
3340

2f2dc0534   Sage Weil   ceph: MDS client
3341
  	/* decode */
84bf39509   Yan, Zheng   ceph: decode feat...
3342
3343
3344
  	ceph_decode_need(&p, end, sizeof(*h), bad);
  	h = p;
  	p += sizeof(*h);
2f2dc0534   Sage Weil   ceph: MDS client
3345
3346
  	op = le32_to_cpu(h->op);
  	seq = le64_to_cpu(h->seq);
84bf39509   Yan, Zheng   ceph: decode feat...
3347
3348
3349
  	if (msg_version >= 3) {
  		u32 len;
  		/* version >= 2, metadata */
0b98acd61   Ilya Dryomov   libceph, rbd, cep...
3350
  		if (__decode_session_metadata(&p, end, &blocklisted) < 0)
84bf39509   Yan, Zheng   ceph: decode feat...
3351
3352
3353
  			goto bad;
  		/* version >= 3, feature bits */
  		ceph_decode_32_safe(&p, end, len, bad);
02e37571f   Jeff Layton   ceph: handle zero...
3354
3355
3356
3357
  		if (len) {
  			ceph_decode_64_safe(&p, end, features, bad);
  			p += len - sizeof(features);
  		}
84bf39509   Yan, Zheng   ceph: decode feat...
3358
  	}
2f2dc0534   Sage Weil   ceph: MDS client
3359
  	mutex_lock(&mdsc->mutex);
0a07fc8cd   Yan, Zheng   ceph: fix potenti...
3360
  	if (op == CEPH_SESSION_CLOSE) {
5b3248c67   Xiubo Li   ceph: rename get_...
3361
  		ceph_get_mds_session(session);
2600d2dd5   Sage Weil   ceph: drop messag...
3362
  		__unregister_session(mdsc, session);
0a07fc8cd   Yan, Zheng   ceph: fix potenti...
3363
  	}
2f2dc0534   Sage Weil   ceph: MDS client
3364
3365
3366
3367
3368
3369
3370
3371
3372
  	/* FIXME: this ttl calculation is generous */
  	session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
  	mutex_unlock(&mdsc->mutex);
  
  	mutex_lock(&session->s_mutex);
  
  	dout("handle_session mds%d %s %p state %s seq %llu
  ",
  	     mds, ceph_session_op_name(op), session,
a687ecaf5   John Spray   ceph: export ceph...
3373
  	     ceph_session_state_name(session->s_state), seq);
2f2dc0534   Sage Weil   ceph: MDS client
3374
3375
3376
3377
3378
3379
3380
3381
3382
  
  	if (session->s_state == CEPH_MDS_SESSION_HUNG) {
  		session->s_state = CEPH_MDS_SESSION_OPEN;
  		pr_info("mds%d came back
  ", session->s_mds);
  	}
  
  	switch (op) {
  	case CEPH_SESSION_OPEN:
29790f26a   Sage Weil   ceph: wait for md...
3383
3384
3385
  		if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
  			pr_info("mds%d reconnect success
  ", session->s_mds);
2f2dc0534   Sage Weil   ceph: MDS client
3386
  		session->s_state = CEPH_MDS_SESSION_OPEN;
84bf39509   Yan, Zheng   ceph: decode feat...
3387
  		session->s_features = features;
2f2dc0534   Sage Weil   ceph: MDS client
3388
  		renewed_caps(mdsc, session, 0);
18f473b38   Xiubo Li   ceph: periodicall...
3389
3390
  		if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &session->s_features))
  			metric_schedule_delayed(&mdsc->metric);
2f2dc0534   Sage Weil   ceph: MDS client
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
  		wake = 1;
  		if (mdsc->stopping)
  			__close_session(mdsc, session);
  		break;
  
  	case CEPH_SESSION_RENEWCAPS:
  		if (session->s_renew_seq == seq)
  			renewed_caps(mdsc, session, 1);
  		break;
  
  	case CEPH_SESSION_CLOSE:
29790f26a   Sage Weil   ceph: wait for md...
3402
3403
3404
  		if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
  			pr_info("mds%d reconnect denied
  ", session->s_mds);
4d681c2f9   Xiubo Li   ceph: keep the se...
3405
  		session->s_state = CEPH_MDS_SESSION_CLOSED;
1c841a96b   Yan, Zheng   ceph: cleanup uns...
3406
  		cleanup_session_requests(mdsc, session);
2f2dc0534   Sage Weil   ceph: MDS client
3407
  		remove_session_caps(session);
656e43829   Yan, Zheng   ceph: protect kic...
3408
  		wake = 2; /* for good measure */
f3c60c591   Sage Weil   ceph: fix multipl...
3409
  		wake_up_all(&mdsc->session_close_wq);
2f2dc0534   Sage Weil   ceph: MDS client
3410
3411
3412
3413
3414
3415
  		break;
  
  	case CEPH_SESSION_STALE:
  		pr_info("mds%d caps went stale, renewing
  ",
  			session->s_mds);
d8fb02abd   Alex Elder   ceph: create a ne...
3416
  		spin_lock(&session->s_gen_ttl_lock);
2f2dc0534   Sage Weil   ceph: MDS client
3417
  		session->s_cap_gen++;
1ce208a6c   Alex Elder   ceph: don't reset...
3418
  		session->s_cap_ttl = jiffies - 1;
d8fb02abd   Alex Elder   ceph: create a ne...
3419
  		spin_unlock(&session->s_gen_ttl_lock);
2f2dc0534   Sage Weil   ceph: MDS client
3420
3421
3422
3423
  		send_renew_caps(mdsc, session);
  		break;
  
  	case CEPH_SESSION_RECALL_STATE:
e30ee5812   Zhi Zhang   ceph: try to allo...
3424
  		ceph_trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
2f2dc0534   Sage Weil   ceph: MDS client
3425
  		break;
186e4f7a4   Yan, Zheng   ceph: handle sess...
3426
3427
3428
  	case CEPH_SESSION_FLUSHMSG:
  		send_flushmsg_ack(mdsc, session, seq);
  		break;
03f4fcb02   Yan, Zheng   ceph: handle SESS...
3429
3430
3431
3432
3433
3434
  	case CEPH_SESSION_FORCE_RO:
  		dout("force_session_readonly %p
  ", session);
  		spin_lock(&session->s_cap_lock);
  		session->s_readonly = true;
  		spin_unlock(&session->s_cap_lock);
d2f8bb27c   Yan, Zheng   ceph: update want...
3435
  		wake_up_session_caps(session, FORCE_RO);
03f4fcb02   Yan, Zheng   ceph: handle SESS...
3436
  		break;
fcff415c9   Yan, Zheng   ceph: handle CEPH...
3437
3438
3439
3440
3441
3442
3443
  	case CEPH_SESSION_REJECT:
  		WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING);
  		pr_info("mds%d rejected session
  ", session->s_mds);
  		session->s_state = CEPH_MDS_SESSION_REJECTED;
  		cleanup_session_requests(mdsc, session);
  		remove_session_caps(session);
0b98acd61   Ilya Dryomov   libceph, rbd, cep...
3444
3445
  		if (blocklisted)
  			mdsc->fsc->blocklisted = true;
fcff415c9   Yan, Zheng   ceph: handle CEPH...
3446
3447
  		wake = 2; /* for good measure */
  		break;
2f2dc0534   Sage Weil   ceph: MDS client
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
  	default:
  		pr_err("mdsc_handle_session bad op %d mds%d
  ", op, mds);
  		WARN_ON(1);
  	}
  
  	mutex_unlock(&session->s_mutex);
  	if (wake) {
  		mutex_lock(&mdsc->mutex);
  		__wake_requests(mdsc, &session->s_waiting);
656e43829   Yan, Zheng   ceph: protect kic...
3458
3459
  		if (wake == 2)
  			kick_requests(mdsc, mds);
2f2dc0534   Sage Weil   ceph: MDS client
3460
3461
  		mutex_unlock(&mdsc->mutex);
  	}
0a07fc8cd   Yan, Zheng   ceph: fix potenti...
3462
3463
  	if (op == CEPH_SESSION_CLOSE)
  		ceph_put_mds_session(session);
2f2dc0534   Sage Weil   ceph: MDS client
3464
3465
3466
3467
3468
3469
  	return;
  
  bad:
  	pr_err("mdsc_handle_session corrupt message mds%d len %d
  ", mds,
  	       (int)msg->front.iov_len);
9ec7cab14   Sage Weil   ceph: hex dump co...
3470
  	ceph_msg_dump(msg);
2f2dc0534   Sage Weil   ceph: MDS client
3471
3472
  	return;
  }
a25949b99   Jeff Layton   ceph: cap trackin...
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
  /*
   * Release the directory caps recorded in @req, if any.
   *
   * r_dir_caps is atomically swapped with 0; when the previous value was
   * non-zero, those cap references are put on the request's parent inode
   * via ceph_put_cap_refs().
   */
  void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req)
  {
  	int held = xchg(&req->r_dir_caps, 0);

  	if (!held)
  		return;

  	dout("releasing r_dir_caps=%s\n", ceph_cap_string(held));
  	ceph_put_cap_refs(ceph_inode(req->r_parent), held);
  }
e64f44a88   Xiubo Li   ceph: skip checki...
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
  /*
   * Release the directory caps recorded in @req, if any.
   *
   * r_dir_caps is atomically swapped with 0; when the previous value was
   * non-zero, those cap references are put on the request's parent inode
   * via ceph_put_cap_refs_no_check_caps().
   */
  void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req)
  {
  	int held = xchg(&req->r_dir_caps, 0);

  	if (!held)
  		return;

  	dout("releasing r_dir_caps=%s\n", ceph_cap_string(held));
  	ceph_put_cap_refs_no_check_caps(ceph_inode(req->r_parent), held);
  }
2f2dc0534   Sage Weil   ceph: MDS client
3496
3497
3498
3499
3500
3501
3502
  /*
   * called under session->mutex.
   */
  static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
  				   struct ceph_mds_session *session)
  {
  	struct ceph_mds_request *req, *nreq;
3de22be67   Yan, Zheng   ceph: re-send req...
3503
  	struct rb_node *p;
2f2dc0534   Sage Weil   ceph: MDS client
3504
3505
3506
3507
3508
  
  	dout("replay_unsafe_requests mds%d
  ", session->s_mds);
  
  	mutex_lock(&mdsc->mutex);
9cf54563b   Xiubo Li   ceph: add __send_...
3509
3510
  	list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item)
  		__send_request(mdsc, session, req, true);
3de22be67   Yan, Zheng   ceph: re-send req...
3511
3512
3513
3514
3515
3516
3517
3518
3519
  
  	/*
  	 * also re-send old requests when MDS enters reconnect stage. So that MDS
  	 * can process completed request in clientreplay stage.
  	 */
  	p = rb_first(&mdsc->request_tree);
  	while (p) {
  		req = rb_entry(p, struct ceph_mds_request, r_node);
  		p = rb_next(p);
bc2de10dc   Jeff Layton   ceph: convert boo...
3520
  		if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
3de22be67   Yan, Zheng   ceph: re-send req...
3521
3522
3523
  			continue;
  		if (req->r_attempts == 0)
  			continue; /* only old requests */
a25949b99   Jeff Layton   ceph: cap trackin...
3524
3525
3526
3527
  		if (!req->r_session)
  			continue;
  		if (req->r_session->s_mds != session->s_mds)
  			continue;
e64f44a88   Xiubo Li   ceph: skip checki...
3528
  		ceph_mdsc_release_dir_caps_no_check(req);
a25949b99   Jeff Layton   ceph: cap trackin...
3529
3530
  
  		__send_request(mdsc, session, req, true);
3de22be67   Yan, Zheng   ceph: re-send req...
3531
  	}
2f2dc0534   Sage Weil   ceph: MDS client
3532
3533
  	mutex_unlock(&mdsc->mutex);
  }
81c5a1487   Yan, Zheng   ceph: split large...
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
  static int send_reconnect_partial(struct ceph_reconnect_state *recon_state)
  {
  	struct ceph_msg *reply;
  	struct ceph_pagelist *_pagelist;
  	struct page *page;
  	__le32 *addr;
  	int err = -ENOMEM;
  
  	if (!recon_state->allow_multi)
  		return -ENOSPC;
  
  	/* can't handle message that contains both caps and realm */
  	BUG_ON(!recon_state->nr_caps == !recon_state->nr_realms);
  
  	/* pre-allocate new pagelist */
  	_pagelist = ceph_pagelist_alloc(GFP_NOFS);
  	if (!_pagelist)
  		return -ENOMEM;
  
  	reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
  	if (!reply)
  		goto fail_msg;
  
  	/* placeholder for nr_caps */
  	err = ceph_pagelist_encode_32(_pagelist, 0);
  	if (err < 0)
  		goto fail;
  
  	if (recon_state->nr_caps) {
  		/* currently encoding caps */
  		err = ceph_pagelist_encode_32(recon_state->pagelist, 0);
  		if (err)
  			goto fail;
  	} else {
  		/* placeholder for nr_realms (currently encoding relams) */
  		err = ceph_pagelist_encode_32(_pagelist, 0);
  		if (err < 0)
  			goto fail;
  	}
  
  	err = ceph_pagelist_encode_8(recon_state->pagelist, 1);
  	if (err)
  		goto fail;
  
  	page = list_first_entry(&recon_state->pagelist->head, struct page, lru);
  	addr = kmap_atomic(page);
  	if (recon_state->nr_caps) {
  		/* currently encoding caps */
  		*addr = cpu_to_le32(recon_state->nr_caps);
  	} else {
  		/* currently encoding relams */
  		*(addr + 1) = cpu_to_le32(recon_state->nr_realms);
  	}
  	kunmap_atomic(addr);
  
  	reply->hdr.version = cpu_to_le16(5);
  	reply->hdr.compat_version = cpu_to_le16(4);
  
  	reply->hdr.data_len = cpu_to_le32(recon_state->pagelist->length);
  	ceph_msg_data_add_pagelist(reply, recon_state->pagelist);
  
  	ceph_con_send(&recon_state->session->s_con, reply);
  	ceph_pagelist_release(recon_state->pagelist);
  
  	recon_state->pagelist = _pagelist;
  	recon_state->nr_caps = 0;
  	recon_state->nr_realms = 0;
  	recon_state->msg_version = 5;
  	return 0;
  fail:
  	ceph_msg_put(reply);
  fail_msg:
  	ceph_pagelist_release(_pagelist);
  	return err;
  }
a33f6432b   Yan, Zheng   ceph: encode inod...
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
  /*
   * Find the dentry for the primary link of @inode.
   *
   * For a directory the first alias is used unless it is a root dentry.
   * For other inodes the aliases are scanned for the first one that is
   * hashed and flagged CEPH_DENTRY_PRIMARY_LINK.
   *
   * Returns the dentry obtained through dget()/dget_dlock(), or NULL if
   * there is no suitable alias.
   */
  static struct dentry *d_find_primary(struct inode *inode)
  {
  	struct dentry *cur, *found = NULL;

  	/* cheap check without the lock; repeated below under i_lock */
  	if (hlist_empty(&inode->i_dentry))
  		return NULL;

  	spin_lock(&inode->i_lock);
  	if (!hlist_empty(&inode->i_dentry)) {
  		if (S_ISDIR(inode->i_mode)) {
  			cur = hlist_entry(inode->i_dentry.first,
  					  struct dentry, d_u.d_alias);
  			if (!IS_ROOT(cur))
  				found = dget(cur);
  		} else {
  			hlist_for_each_entry(cur, &inode->i_dentry, d_u.d_alias) {
  				spin_lock(&cur->d_lock);
  				if (!d_unhashed(cur) &&
  				    (ceph_dentry(cur)->flags & CEPH_DENTRY_PRIMARY_LINK))
  					found = dget_dlock(cur);
  				spin_unlock(&cur->d_lock);
  				if (found)
  					break;
  			}
  		}
  	}
  	spin_unlock(&inode->i_lock);

  	return found;
  }
2f2dc0534   Sage Weil   ceph: MDS client
3641
3642
3643
  /*
   * Encode information about a cap for a reconnect with the MDS.
   */
a25949b99   Jeff Layton   ceph: cap trackin...
3644
  static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
2f2dc0534   Sage Weil   ceph: MDS client
3645
3646
  			  void *arg)
  {
20cb34ae9   Sage Weil   ceph: support v2 ...
3647
3648
3649
3650
  	union {
  		struct ceph_mds_cap_reconnect v2;
  		struct ceph_mds_cap_reconnect_v1 v1;
  	} rec;
b3f8d68f3   Yan, Zheng   ceph: handle 'ses...
3651
  	struct ceph_inode_info *ci = cap->ci;
20cb34ae9   Sage Weil   ceph: support v2 ...
3652
3653
  	struct ceph_reconnect_state *recon_state = arg;
  	struct ceph_pagelist *pagelist = recon_state->pagelist;
a33f6432b   Yan, Zheng   ceph: encode inod...
3654
3655
3656
3657
  	struct dentry *dentry;
  	char *path;
  	int pathlen, err;
  	u64 pathbase;
3469ed0d1   Yan, Zheng   ceph: include 'fo...
3658
  	u64 snap_follows;
2f2dc0534   Sage Weil   ceph: MDS client
3659

2f2dc0534   Sage Weil   ceph: MDS client
3660
3661
3662
3663
  	dout(" adding %p ino %llx.%llx cap %p %lld %s
  ",
  	     inode, ceph_vinop(inode), cap, cap->cap_id,
  	     ceph_cap_string(cap->issued));
2f2dc0534   Sage Weil   ceph: MDS client
3664

a33f6432b   Yan, Zheng   ceph: encode inod...
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
  	dentry = d_find_primary(inode);
  	if (dentry) {
  		/* set pathbase to parent dir when msg_version >= 2 */
  		path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase,
  					    recon_state->msg_version >= 2);
  		dput(dentry);
  		if (IS_ERR(path)) {
  			err = PTR_ERR(path);
  			goto out_err;
  		}
  	} else {
  		path = NULL;
  		pathlen = 0;
  		pathbase = 0;
  	}
be655596b   Sage Weil   ceph: use i_ceph_...
3680
  	spin_lock(&ci->i_ceph_lock);
2f2dc0534   Sage Weil   ceph: MDS client
3681
3682
  	cap->seq = 0;        /* reset cap seq */
  	cap->issue_seq = 0;  /* and issue_seq */
667ca05cd   Yan, Zheng   ceph: clear migra...
3683
  	cap->mseq = 0;       /* and migrate_seq */
99a9c273b   Yan, Zheng   ceph: handle race...
3684
  	cap->cap_gen = cap->session->s_cap_gen;
20cb34ae9   Sage Weil   ceph: support v2 ...
3685

a25949b99   Jeff Layton   ceph: cap trackin...
3686
  	/* These are lost when the session goes away */
785892fe8   Jeff Layton   ceph: cache layou...
3687
3688
3689
3690
3691
  	if (S_ISDIR(inode->i_mode)) {
  		if (cap->issued & CEPH_CAP_DIR_CREATE) {
  			ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
  			memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
  		}
a25949b99   Jeff Layton   ceph: cap trackin...
3692
  		cap->issued &= ~CEPH_CAP_ANY_DIR_OPS;
785892fe8   Jeff Layton   ceph: cache layou...
3693
  	}
a25949b99   Jeff Layton   ceph: cap trackin...
3694

121f22a19   Yan, Zheng   ceph: update cap ...
3695
  	if (recon_state->msg_version >= 2) {
20cb34ae9   Sage Weil   ceph: support v2 ...
3696
3697
3698
3699
  		rec.v2.cap_id = cpu_to_le64(cap->cap_id);
  		rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
  		rec.v2.issued = cpu_to_le32(cap->issued);
  		rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
a33f6432b   Yan, Zheng   ceph: encode inod...
3700
  		rec.v2.pathbase = cpu_to_le64(pathbase);
ec1dff25b   Jeff Layton   ceph: silence spa...
3701
3702
  		rec.v2.flock_len = (__force __le32)
  			((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
20cb34ae9   Sage Weil   ceph: support v2 ...
3703
3704
3705
3706
3707
  	} else {
  		rec.v1.cap_id = cpu_to_le64(cap->cap_id);
  		rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
  		rec.v1.issued = cpu_to_le32(cap->issued);
  		rec.v1.size = cpu_to_le64(inode->i_size);
9bbeab41c   Arnd Bergmann   ceph: use timespe...
3708
3709
  		ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
  		ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
20cb34ae9   Sage Weil   ceph: support v2 ...
3710
  		rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
a33f6432b   Yan, Zheng   ceph: encode inod...
3711
  		rec.v1.pathbase = cpu_to_le64(pathbase);
20cb34ae9   Sage Weil   ceph: support v2 ...
3712
  	}
3469ed0d1   Yan, Zheng   ceph: include 'fo...
3713
3714
  
  	if (list_empty(&ci->i_cap_snaps)) {
92776fd2c   Yan, Zheng   ceph: properly se...
3715
  		snap_follows = ci->i_head_snapc ? ci->i_head_snapc->seq : 0;
3469ed0d1   Yan, Zheng   ceph: include 'fo...
3716
3717
3718
3719
3720
  	} else {
  		struct ceph_cap_snap *capsnap =
  			list_first_entry(&ci->i_cap_snaps,
  					 struct ceph_cap_snap, ci_item);
  		snap_follows = capsnap->follows;
20cb34ae9   Sage Weil   ceph: support v2 ...
3721
  	}
be655596b   Sage Weil   ceph: use i_ceph_...
3722
  	spin_unlock(&ci->i_ceph_lock);
2f2dc0534   Sage Weil   ceph: MDS client
3723

121f22a19   Yan, Zheng   ceph: update cap ...
3724
  	if (recon_state->msg_version >= 2) {
40819f6fb   Greg Farnum   ceph: add flock/f...
3725
  		int num_fcntl_locks, num_flock_locks;
4deb14a25   Yan, Zheng   ceph: optimize fl...
3726
  		struct ceph_filelock *flocks = NULL;
81c5a1487   Yan, Zheng   ceph: split large...
3727
  		size_t struct_len, total_len = sizeof(u64);
121f22a19   Yan, Zheng   ceph: update cap ...
3728
  		u8 struct_v = 0;
39be95e9c   Jim Schutt   ceph: ceph_pageli...
3729
3730
  
  encode_again:
b3f8d68f3   Yan, Zheng   ceph: handle 'ses...
3731
3732
3733
3734
3735
3736
  		if (rec.v2.flock_len) {
  			ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
  		} else {
  			num_fcntl_locks = 0;
  			num_flock_locks = 0;
  		}
4deb14a25   Yan, Zheng   ceph: optimize fl...
3737
  		if (num_fcntl_locks + num_flock_locks > 0) {
6da2ec560   Kees Cook   treewide: kmalloc...
3738
3739
3740
  			flocks = kmalloc_array(num_fcntl_locks + num_flock_locks,
  					       sizeof(struct ceph_filelock),
  					       GFP_NOFS);
4deb14a25   Yan, Zheng   ceph: optimize fl...
3741
3742
  			if (!flocks) {
  				err = -ENOMEM;
5ccedf1cc   Yan, Zheng   ceph: don't encod...
3743
  				goto out_err;
4deb14a25   Yan, Zheng   ceph: optimize fl...
3744
3745
3746
3747
3748
3749
3750
3751
3752
  			}
  			err = ceph_encode_locks_to_buffer(inode, flocks,
  							  num_fcntl_locks,
  							  num_flock_locks);
  			if (err) {
  				kfree(flocks);
  				flocks = NULL;
  				if (err == -ENOSPC)
  					goto encode_again;
5ccedf1cc   Yan, Zheng   ceph: don't encod...
3753
  				goto out_err;
4deb14a25   Yan, Zheng   ceph: optimize fl...
3754
3755
  			}
  		} else {
39be95e9c   Jim Schutt   ceph: ceph_pageli...
3756
  			kfree(flocks);
4deb14a25   Yan, Zheng   ceph: optimize fl...
3757
  			flocks = NULL;
39be95e9c   Jim Schutt   ceph: ceph_pageli...
3758
  		}
121f22a19   Yan, Zheng   ceph: update cap ...
3759
3760
3761
  
  		if (recon_state->msg_version >= 3) {
  			/* version, compat_version and struct_len */
81c5a1487   Yan, Zheng   ceph: split large...
3762
  			total_len += 2 * sizeof(u8) + sizeof(u32);
3469ed0d1   Yan, Zheng   ceph: include 'fo...
3763
  			struct_v = 2;
121f22a19   Yan, Zheng   ceph: update cap ...
3764
  		}
39be95e9c   Jim Schutt   ceph: ceph_pageli...
3765
3766
3767
  		/*
  		 * number of encoded locks is stable, so copy to pagelist
  		 */
121f22a19   Yan, Zheng   ceph: update cap ...
3768
3769
3770
3771
  		struct_len = 2 * sizeof(u32) +
  			    (num_fcntl_locks + num_flock_locks) *
  			    sizeof(struct ceph_filelock);
  		rec.v2.flock_len = cpu_to_le32(struct_len);
a33f6432b   Yan, Zheng   ceph: encode inod...
3772
  		struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
121f22a19   Yan, Zheng   ceph: update cap ...
3773

3469ed0d1   Yan, Zheng   ceph: include 'fo...
3774
3775
  		if (struct_v >= 2)
  			struct_len += sizeof(u64); /* snap_follows */
121f22a19   Yan, Zheng   ceph: update cap ...
3776
  		total_len += struct_len;
81c5a1487   Yan, Zheng   ceph: split large...
3777
3778
3779
3780
3781
3782
  
  		if (pagelist->length + total_len > RECONNECT_MAX_SIZE) {
  			err = send_reconnect_partial(recon_state);
  			if (err)
  				goto out_freeflocks;
  			pagelist = recon_state->pagelist;
5ccedf1cc   Yan, Zheng   ceph: don't encod...
3783
  		}
121f22a19   Yan, Zheng   ceph: update cap ...
3784

81c5a1487   Yan, Zheng   ceph: split large...
3785
3786
3787
3788
3789
  		err = ceph_pagelist_reserve(pagelist, total_len);
  		if (err)
  			goto out_freeflocks;
  
  		ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
5ccedf1cc   Yan, Zheng   ceph: don't encod...
3790
3791
3792
3793
  		if (recon_state->msg_version >= 3) {
  			ceph_pagelist_encode_8(pagelist, struct_v);
  			ceph_pagelist_encode_8(pagelist, 1);
  			ceph_pagelist_encode_32(pagelist, struct_len);
121f22a19   Yan, Zheng   ceph: update cap ...
3794
  		}
a33f6432b   Yan, Zheng   ceph: encode inod...
3795
  		ceph_pagelist_encode_string(pagelist, path, pathlen);
5ccedf1cc   Yan, Zheng   ceph: don't encod...
3796
3797
3798
3799
3800
  		ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
  		ceph_locks_to_pagelist(flocks, pagelist,
  				       num_fcntl_locks, num_flock_locks);
  		if (struct_v >= 2)
  			ceph_pagelist_encode_64(pagelist, snap_follows);
81c5a1487   Yan, Zheng   ceph: split large...
3801
  out_freeflocks:
39be95e9c   Jim Schutt   ceph: ceph_pageli...
3802
  		kfree(flocks);
3612abbd5   Sage Weil   ceph: fix reconne...
3803
  	} else {
5ccedf1cc   Yan, Zheng   ceph: don't encod...
3804
  		err = ceph_pagelist_reserve(pagelist,
81c5a1487   Yan, Zheng   ceph: split large...
3805
3806
  					    sizeof(u64) + sizeof(u32) +
  					    pathlen + sizeof(rec.v1));
a33f6432b   Yan, Zheng   ceph: encode inod...
3807
3808
  		if (err)
  			goto out_err;
5ccedf1cc   Yan, Zheng   ceph: don't encod...
3809

81c5a1487   Yan, Zheng   ceph: split large...
3810
  		ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
5ccedf1cc   Yan, Zheng   ceph: don't encod...
3811
3812
  		ceph_pagelist_encode_string(pagelist, path, pathlen);
  		ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
40819f6fb   Greg Farnum   ceph: add flock/f...
3813
  	}
44c99757f   Yan, Zheng   ceph: set caps co...
3814

5ccedf1cc   Yan, Zheng   ceph: don't encod...
3815
  out_err:
a33f6432b   Yan, Zheng   ceph: encode inod...
3816
3817
  	ceph_mdsc_free_path(path, pathlen);
  	if (!err)
81c5a1487   Yan, Zheng   ceph: split large...
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
  		recon_state->nr_caps++;
  	return err;
  }
  
  /*
   * Append a reconnect record for every snap realm in mdsc->snap_realms to
   * the reconnect pagelist.
   *
   * For msg_version >= 4 the realm count is encoded up front and each record
   * is preceded by a small header (two bytes of 1 and the record size); if a
   * record would push the pagelist past RECONNECT_MAX_SIZE, the data gathered
   * so far is sent first via send_reconnect_partial().
   *
   * Returns 0 on success or the error from the failing pagelist/send call;
   * recon_state->nr_realms is incremented once per realm encoded.
   */
  static int encode_snap_realms(struct ceph_mds_client *mdsc,
  			      struct ceph_reconnect_state *recon_state)
  {
  	struct ceph_pagelist *pagelist = recon_state->pagelist;
  	struct rb_node *node;
  	int err = 0;

  	if (recon_state->msg_version >= 4) {
  		err = ceph_pagelist_encode_32(pagelist, mdsc->num_snap_realms);
  		if (err < 0)
  			return err;
  	}

  	/*
  	 * snaprealms.  we provide mds with the ino, seq (version), and
  	 * parent for all of our realms.  If the mds has any newer info,
  	 * it will tell us.
  	 */
  	for (node = rb_first(&mdsc->snap_realms); node; node = rb_next(node)) {
  		struct ceph_snap_realm *realm =
  		       rb_entry(node, struct ceph_snap_realm, node);
  		struct ceph_mds_snaprealm_reconnect sr_rec;

  		if (recon_state->msg_version >= 4) {
  			/* header (v, compat, len) plus the record itself */
  			size_t need = sizeof(u8) * 2 + sizeof(u32) +
  				      sizeof(sr_rec);

  			if (pagelist->length + need > RECONNECT_MAX_SIZE) {
  				err = send_reconnect_partial(recon_state);
  				if (err)
  					return err;
  				pagelist = recon_state->pagelist;
  			}

  			err = ceph_pagelist_reserve(pagelist, need);
  			if (err)
  				return err;

  			ceph_pagelist_encode_8(pagelist, 1);
  			ceph_pagelist_encode_8(pagelist, 1);
  			ceph_pagelist_encode_32(pagelist, sizeof(sr_rec));
  		}

  		dout(" adding snap realm %llx seq %lld parent %llx\n",
  		     realm->ino, realm->seq, realm->parent_ino);
  		sr_rec.ino = cpu_to_le64(realm->ino);
  		sr_rec.seq = cpu_to_le64(realm->seq);
  		sr_rec.parent = cpu_to_le64(realm->parent_ino);

  		err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
  		if (err)
  			return err;

  		recon_state->nr_realms++;
  	}

  	return err;
  }
  
  
  /*
   * If an MDS fails and recovers, clients need to reconnect in order to
   * reestablish shared state.  This includes all caps issued through
   * this session _and_ the snap_realm hierarchy.  Because it's not
   * clear which snap realms the mds cares about, we send everything we
   * know about.. that ensures we'll then get any new info the
   * recovering MDS might have.
   *
   * This is a relatively heavyweight operation, but it's rare.
2f2dc0534   Sage Weil   ceph: MDS client
3892
   */
34b6c855f   Sage Weil   ceph: clean up se...
3893
3894
  static void send_mds_reconnect(struct ceph_mds_client *mdsc,
  			       struct ceph_mds_session *session)
2f2dc0534   Sage Weil   ceph: MDS client
3895
  {
2f2dc0534   Sage Weil   ceph: MDS client
3896
  	struct ceph_msg *reply;
34b6c855f   Sage Weil   ceph: clean up se...
3897
  	int mds = session->s_mds;
9abf82b8b   Sage Weil   ceph: fix locking...
3898
  	int err = -ENOMEM;
81c5a1487   Yan, Zheng   ceph: split large...
3899
3900
3901
  	struct ceph_reconnect_state recon_state = {
  		.session = session,
  	};
c8a96a31c   Jeff Layton   ceph: clean up sp...
3902
  	LIST_HEAD(dispose);
2f2dc0534   Sage Weil   ceph: MDS client
3903

34b6c855f   Sage Weil   ceph: clean up se...
3904
3905
  	pr_info("mds%d reconnect start
  ", mds);
2f2dc0534   Sage Weil   ceph: MDS client
3906

81c5a1487   Yan, Zheng   ceph: split large...
3907
3908
  	recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS);
  	if (!recon_state.pagelist)
93cea5beb   Sage Weil   ceph: use ceph_pa...
3909
  		goto fail_nopagelist;
93cea5beb   Sage Weil   ceph: use ceph_pa...
3910

0d9c1ab3b   Ilya Dryomov   libceph: prealloc...
3911
  	reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
a79832f26   Sage Weil   ceph: make ceph_m...
3912
  	if (!reply)
93cea5beb   Sage Weil   ceph: use ceph_pa...
3913
  		goto fail_nomsg;
93cea5beb   Sage Weil   ceph: use ceph_pa...
3914

d48464878   Jeff Layton   ceph: decode inte...
3915
  	xa_destroy(&session->s_delegated_inos);
34b6c855f   Sage Weil   ceph: clean up se...
3916
3917
3918
  	mutex_lock(&session->s_mutex);
  	session->s_state = CEPH_MDS_SESSION_RECONNECTING;
  	session->s_seq = 0;
2f2dc0534   Sage Weil   ceph: MDS client
3919

2f2dc0534   Sage Weil   ceph: MDS client
3920
3921
  	dout("session %p state %s
  ", session,
a687ecaf5   John Spray   ceph: export ceph...
3922
  	     ceph_session_state_name(session->s_state));
2f2dc0534   Sage Weil   ceph: MDS client
3923

99a9c273b   Yan, Zheng   ceph: handle race...
3924
3925
3926
3927
3928
  	spin_lock(&session->s_gen_ttl_lock);
  	session->s_cap_gen++;
  	spin_unlock(&session->s_gen_ttl_lock);
  
  	spin_lock(&session->s_cap_lock);
03f4fcb02   Yan, Zheng   ceph: handle SESS...
3929
3930
  	/* don't know if session is readonly */
  	session->s_readonly = 0;
99a9c273b   Yan, Zheng   ceph: handle race...
3931
3932
3933
3934
3935
3936
  	/*
  	 * notify __ceph_remove_cap() that we are composing cap reconnect.
  	 * If a cap get released before being added to the cap reconnect,
  	 * __ceph_remove_cap() should skip queuing cap release.
  	 */
  	session->s_cap_reconnect = 1;
e01a59464   Sage Weil   ceph: dicard cap ...
3937
  	/* drop old cap expires; we're about to reestablish that state */
c8a96a31c   Jeff Layton   ceph: clean up sp...
3938
3939
3940
  	detach_cap_releases(session, &dispose);
  	spin_unlock(&session->s_cap_lock);
  	dispose_cap_releases(mdsc, &dispose);
e01a59464   Sage Weil   ceph: dicard cap ...
3941

5d23371fd   Yan, Zheng   ceph: trim unused...
3942
  	/* trim unused caps to reduce MDS's cache rejoin time */
c0bd50e2e   Yan, Zheng   ceph: fix null po...
3943
3944
  	if (mdsc->fsc->sb->s_root)
  		shrink_dcache_parent(mdsc->fsc->sb->s_root);
5d23371fd   Yan, Zheng   ceph: trim unused...
3945
3946
3947
3948
3949
3950
3951
3952
  
  	ceph_con_close(&session->s_con);
  	ceph_con_open(&session->s_con,
  		      CEPH_ENTITY_TYPE_MDS, mds,
  		      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
  
  	/* replay unsafe requests */
  	replay_unsafe_requests(mdsc, session);
81c5a1487   Yan, Zheng   ceph: split large...
3953
  	ceph_early_kick_flushing_caps(mdsc, session);
5d23371fd   Yan, Zheng   ceph: trim unused...
3954
  	down_read(&mdsc->snap_rwsem);
81c5a1487   Yan, Zheng   ceph: split large...
3955
3956
  	/* placeholder for nr_caps */
  	err = ceph_pagelist_encode_32(recon_state.pagelist, 0);
93cea5beb   Sage Weil   ceph: use ceph_pa...
3957
3958
  	if (err)
  		goto fail;
20cb34ae9   Sage Weil   ceph: support v2 ...
3959

81c5a1487   Yan, Zheng   ceph: split large...
3960
  	if (test_bit(CEPHFS_FEATURE_MULTI_RECONNECT, &session->s_features)) {
121f22a19   Yan, Zheng   ceph: update cap ...
3961
  		recon_state.msg_version = 3;
81c5a1487   Yan, Zheng   ceph: split large...
3962
3963
3964
3965
  		recon_state.allow_multi = true;
  	} else if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) {
  		recon_state.msg_version = 3;
  	} else {
23c625ce3   Ilya Dryomov   libceph: assume a...
3966
  		recon_state.msg_version = 2;
81c5a1487   Yan, Zheng   ceph: split large...
3967
3968
  	}
  	/* traverse this session's caps */
a25949b99   Jeff Layton   ceph: cap trackin...
3969
  	err = ceph_iterate_session_caps(session, reconnect_caps_cb, &recon_state);
2f2dc0534   Sage Weil   ceph: MDS client
3970

99a9c273b   Yan, Zheng   ceph: handle race...
3971
3972
3973
  	spin_lock(&session->s_cap_lock);
  	session->s_cap_reconnect = 0;
  	spin_unlock(&session->s_cap_lock);
81c5a1487   Yan, Zheng   ceph: split large...
3974
3975
  	if (err < 0)
  		goto fail;
2f2dc0534   Sage Weil   ceph: MDS client
3976

81c5a1487   Yan, Zheng   ceph: split large...
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
  	/* check if all realms can be encoded into current message */
  	if (mdsc->num_snap_realms) {
  		size_t total_len =
  			recon_state.pagelist->length +
  			mdsc->num_snap_realms *
  			sizeof(struct ceph_mds_snaprealm_reconnect);
  		if (recon_state.msg_version >= 4) {
  			/* number of realms */
  			total_len += sizeof(u32);
  			/* version, compat_version and struct_len */
  			total_len += mdsc->num_snap_realms *
  				     (2 * sizeof(u8) + sizeof(u32));
  		}
  		if (total_len > RECONNECT_MAX_SIZE) {
  			if (!recon_state.allow_multi) {
  				err = -ENOSPC;
  				goto fail;
  			}
  			if (recon_state.nr_caps) {
  				err = send_reconnect_partial(&recon_state);
  				if (err)
  					goto fail;
  			}
  			recon_state.msg_version = 5;
  		}
2f2dc0534   Sage Weil   ceph: MDS client
4002
  	}
2f2dc0534   Sage Weil   ceph: MDS client
4003

81c5a1487   Yan, Zheng   ceph: split large...
4004
4005
4006
4007
4008
4009
4010
4011
4012
  	err = encode_snap_realms(mdsc, &recon_state);
  	if (err < 0)
  		goto fail;
  
  	if (recon_state.msg_version >= 5) {
  		err = ceph_pagelist_encode_8(recon_state.pagelist, 0);
  		if (err < 0)
  			goto fail;
  	}
44c99757f   Yan, Zheng   ceph: set caps co...
4013

81c5a1487   Yan, Zheng   ceph: split large...
4014
4015
4016
4017
  	if (recon_state.nr_caps || recon_state.nr_realms) {
  		struct page *page =
  			list_first_entry(&recon_state.pagelist->head,
  					struct page, lru);
44c99757f   Yan, Zheng   ceph: set caps co...
4018
  		__le32 *addr = kmap_atomic(page);
81c5a1487   Yan, Zheng   ceph: split large...
4019
4020
4021
4022
4023
4024
  		if (recon_state.nr_caps) {
  			WARN_ON(recon_state.nr_realms != mdsc->num_snap_realms);
  			*addr = cpu_to_le32(recon_state.nr_caps);
  		} else if (recon_state.msg_version >= 4) {
  			*(addr + 1) = cpu_to_le32(recon_state.nr_realms);
  		}
44c99757f   Yan, Zheng   ceph: set caps co...
4025
  		kunmap_atomic(addr);
ebf18f470   Alex Elder   ceph: only set me...
4026
  	}
44c99757f   Yan, Zheng   ceph: set caps co...
4027

81c5a1487   Yan, Zheng   ceph: split large...
4028
4029
4030
  	reply->hdr.version = cpu_to_le16(recon_state.msg_version);
  	if (recon_state.msg_version >= 4)
  		reply->hdr.compat_version = cpu_to_le16(4);
e548e9b93   Yan, Zheng   ceph: re-send flu...
4031

81c5a1487   Yan, Zheng   ceph: split large...
4032
4033
  	reply->hdr.data_len = cpu_to_le32(recon_state.pagelist->length);
  	ceph_msg_data_add_pagelist(reply, recon_state.pagelist);
e548e9b93   Yan, Zheng   ceph: re-send flu...
4034

2f2dc0534   Sage Weil   ceph: MDS client
4035
  	ceph_con_send(&session->s_con, reply);
9abf82b8b   Sage Weil   ceph: fix locking...
4036
4037
4038
4039
4040
  	mutex_unlock(&session->s_mutex);
  
  	mutex_lock(&mdsc->mutex);
  	__wake_requests(mdsc, &session->s_waiting);
  	mutex_unlock(&mdsc->mutex);
2f2dc0534   Sage Weil   ceph: MDS client
4041
  	up_read(&mdsc->snap_rwsem);
81c5a1487   Yan, Zheng   ceph: split large...
4042
  	ceph_pagelist_release(recon_state.pagelist);
2f2dc0534   Sage Weil   ceph: MDS client
4043
  	return;
93cea5beb   Sage Weil   ceph: use ceph_pa...
4044
  fail:
2f2dc0534   Sage Weil   ceph: MDS client
4045
  	ceph_msg_put(reply);
9abf82b8b   Sage Weil   ceph: fix locking...
4046
4047
  	up_read(&mdsc->snap_rwsem);
  	mutex_unlock(&session->s_mutex);
93cea5beb   Sage Weil   ceph: use ceph_pa...
4048
  fail_nomsg:
81c5a1487   Yan, Zheng   ceph: split large...
4049
  	ceph_pagelist_release(recon_state.pagelist);
93cea5beb   Sage Weil   ceph: use ceph_pa...
4050
  fail_nopagelist:
9abf82b8b   Sage Weil   ceph: fix locking...
4051
4052
  	pr_err("error %d preparing reconnect for mds%d
  ", err, mds);
9abf82b8b   Sage Weil   ceph: fix locking...
4053
  	return;
2f2dc0534   Sage Weil   ceph: MDS client
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
  }
  
  
  /*
   * compare old and new mdsmaps, kicking requests
   * and closing out old connections as necessary
   *
   * called under mdsc->mutex.
   */
  static void check_new_map(struct ceph_mds_client *mdsc,
  			  struct ceph_mdsmap *newmap,
  			  struct ceph_mdsmap *oldmap)
  {
  	int i;
  	int oldstate, newstate;
  	struct ceph_mds_session *s;
  
  	dout("check_new_map new %u old %u
  ",
  	     newmap->m_epoch, oldmap->m_epoch);
b38c9eb47   Xiubo Li   ceph: add possibl...
4074
  	for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) {
d37b1d994   Markus Elfring   ceph: adjust 36 c...
4075
  		if (!mdsc->sessions[i])
2f2dc0534   Sage Weil   ceph: MDS client
4076
4077
4078
4079
  			continue;
  		s = mdsc->sessions[i];
  		oldstate = ceph_mdsmap_get_state(oldmap, i);
  		newstate = ceph_mdsmap_get_state(newmap, i);
0deb01c99   Sage Weil   ceph: track laggy...
4080
4081
  		dout("check_new_map mds%d state %s%s -> %s%s (session %s)
  ",
2f2dc0534   Sage Weil   ceph: MDS client
4082
  		     i, ceph_mds_state_name(oldstate),
0deb01c99   Sage Weil   ceph: track laggy...
4083
  		     ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
2f2dc0534   Sage Weil   ceph: MDS client
4084
  		     ceph_mds_state_name(newstate),
0deb01c99   Sage Weil   ceph: track laggy...
4085
  		     ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
a687ecaf5   John Spray   ceph: export ceph...
4086
  		     ceph_session_state_name(s->s_state));
2f2dc0534   Sage Weil   ceph: MDS client
4087

b38c9eb47   Xiubo Li   ceph: add possibl...
4088
  		if (i >= newmap->possible_max_rank) {
6f0f597b5   Yan, Zheng   ceph: don't blind...
4089
  			/* force close session for stopped mds */
5b3248c67   Xiubo Li   ceph: rename get_...
4090
  			ceph_get_mds_session(s);
6f0f597b5   Yan, Zheng   ceph: don't blind...
4091
4092
4093
  			__unregister_session(mdsc, s);
  			__wake_requests(mdsc, &s->s_waiting);
  			mutex_unlock(&mdsc->mutex);
2827528da   Yan, Zheng   ceph: close stopp...
4094

6f0f597b5   Yan, Zheng   ceph: don't blind...
4095
4096
4097
4098
  			mutex_lock(&s->s_mutex);
  			cleanup_session_requests(mdsc, s);
  			remove_session_caps(s);
  			mutex_unlock(&s->s_mutex);
2827528da   Yan, Zheng   ceph: close stopp...
4099

6f0f597b5   Yan, Zheng   ceph: don't blind...
4100
  			ceph_put_mds_session(s);
2827528da   Yan, Zheng   ceph: close stopp...
4101

6f0f597b5   Yan, Zheng   ceph: don't blind...
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
  			mutex_lock(&mdsc->mutex);
  			kick_requests(mdsc, i);
  			continue;
  		}
  
  		if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
  			   ceph_mdsmap_get_addr(newmap, i),
  			   sizeof(struct ceph_entity_addr))) {
  			/* just close it */
  			mutex_unlock(&mdsc->mutex);
  			mutex_lock(&s->s_mutex);
  			mutex_lock(&mdsc->mutex);
  			ceph_con_close(&s->s_con);
  			mutex_unlock(&s->s_mutex);
  			s->s_state = CEPH_MDS_SESSION_RESTARTING;
2f2dc0534   Sage Weil   ceph: MDS client
4117
4118
4119
4120
4121
4122
4123
4124
  		} else if (oldstate == newstate) {
  			continue;  /* nothing new with this mds */
  		}
  
  		/*
  		 * send reconnect?
  		 */
  		if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
34b6c855f   Sage Weil   ceph: clean up se...
4125
4126
4127
4128
4129
  		    newstate >= CEPH_MDS_STATE_RECONNECT) {
  			mutex_unlock(&mdsc->mutex);
  			send_mds_reconnect(mdsc, s);
  			mutex_lock(&mdsc->mutex);
  		}
2f2dc0534   Sage Weil   ceph: MDS client
4130
4131
  
  		/*
29790f26a   Sage Weil   ceph: wait for md...
4132
  		 * kick request on any mds that has gone active.
2f2dc0534   Sage Weil   ceph: MDS client
4133
4134
4135
  		 */
  		if (oldstate < CEPH_MDS_STATE_ACTIVE &&
  		    newstate >= CEPH_MDS_STATE_ACTIVE) {
29790f26a   Sage Weil   ceph: wait for md...
4136
4137
4138
4139
4140
  			if (oldstate != CEPH_MDS_STATE_CREATING &&
  			    oldstate != CEPH_MDS_STATE_STARTING)
  				pr_info("mds%d recovery completed
  ", s->s_mds);
  			kick_requests(mdsc, i);
ea8412b28   Xiubo Li   ceph: make sure m...
4141
  			mutex_unlock(&mdsc->mutex);
829ad4db9   Jeff Layton   ceph: ceph_kick_f...
4142
  			mutex_lock(&s->s_mutex);
ea8412b28   Xiubo Li   ceph: make sure m...
4143
  			mutex_lock(&mdsc->mutex);
2f2dc0534   Sage Weil   ceph: MDS client
4144
  			ceph_kick_flushing_caps(mdsc, s);
829ad4db9   Jeff Layton   ceph: ceph_kick_f...
4145
  			mutex_unlock(&s->s_mutex);
d2f8bb27c   Yan, Zheng   ceph: update want...
4146
  			wake_up_session_caps(s, RECONNECT);
2f2dc0534   Sage Weil   ceph: MDS client
4147
4148
  		}
  	}
cb170a221   Sage Weil   ceph: connect to ...
4149

b38c9eb47   Xiubo Li   ceph: add possibl...
4150
  	for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) {
cb170a221   Sage Weil   ceph: connect to ...
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
  		s = mdsc->sessions[i];
  		if (!s)
  			continue;
  		if (!ceph_mdsmap_is_laggy(newmap, i))
  			continue;
  		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
  		    s->s_state == CEPH_MDS_SESSION_HUNG ||
  		    s->s_state == CEPH_MDS_SESSION_CLOSING) {
  			dout(" connecting to export targets of laggy mds%d
  ",
  			     i);
  			__open_export_target_sessions(mdsc, s);
  		}
  	}
2f2dc0534   Sage Weil   ceph: MDS client
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
  }
  
  
  
  /*
   * leases
   */
  
  /*
   * Drop the session reference recorded in the dentry's lease info and
   * clear the pointer.
   *
   * caller must hold session s_mutex, dentry->d_lock
   */
  void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
  {
  	struct ceph_dentry_info *info = ceph_dentry(dentry);
  	struct ceph_mds_session *held = info->lease_session;
  
  	ceph_put_mds_session(held);
  	info->lease_session = NULL;
  }
2600d2dd5   Sage Weil   ceph: drop messag...
4183
4184
4185
  static void handle_lease(struct ceph_mds_client *mdsc,
  			 struct ceph_mds_session *session,
  			 struct ceph_msg *msg)
2f2dc0534   Sage Weil   ceph: MDS client
4186
  {
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
4187
  	struct super_block *sb = mdsc->fsc->sb;
2f2dc0534   Sage Weil   ceph: MDS client
4188
  	struct inode *inode;
2f2dc0534   Sage Weil   ceph: MDS client
4189
4190
  	struct dentry *parent, *dentry;
  	struct ceph_dentry_info *di;
2600d2dd5   Sage Weil   ceph: drop messag...
4191
  	int mds = session->s_mds;
2f2dc0534   Sage Weil   ceph: MDS client
4192
  	struct ceph_mds_lease *h = msg->front.iov_base;
1e5ea23df   Sage Weil   ceph: fix lease r...
4193
  	u32 seq;
2f2dc0534   Sage Weil   ceph: MDS client
4194
  	struct ceph_vino vino;
2f2dc0534   Sage Weil   ceph: MDS client
4195
4196
  	struct qstr dname;
  	int release = 0;
2f2dc0534   Sage Weil   ceph: MDS client
4197
4198
4199
4200
4201
4202
4203
4204
  	dout("handle_lease from mds%d
  ", mds);
  
  	/* decode */
  	if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
  		goto bad;
  	vino.ino = le64_to_cpu(h->ino);
  	vino.snap = CEPH_NOSNAP;
1e5ea23df   Sage Weil   ceph: fix lease r...
4205
  	seq = le32_to_cpu(h->seq);
0fcf6c02b   Yan, Zheng   ceph: don't drop ...
4206
4207
  	dname.len = get_unaligned_le32(h + 1);
  	if (msg->front.iov_len < sizeof(*h) + sizeof(u32) + dname.len)
2f2dc0534   Sage Weil   ceph: MDS client
4208
  		goto bad;
0fcf6c02b   Yan, Zheng   ceph: don't drop ...
4209
  	dname.name = (void *)(h + 1) + sizeof(u32);
2f2dc0534   Sage Weil   ceph: MDS client
4210

2f2dc0534   Sage Weil   ceph: MDS client
4211
4212
  	/* lookup inode */
  	inode = ceph_find_inode(sb, vino);
2f90b852e   Sage Weil   ceph: ignore leas...
4213
4214
4215
  	dout("handle_lease %s, ino %llx %p %.*s
  ",
  	     ceph_lease_op_name(h->action), vino.ino, inode,
1e5ea23df   Sage Weil   ceph: fix lease r...
4216
  	     dname.len, dname.name);
6cd3bcad0   Yan, Zheng   ceph: move ceph_f...
4217
4218
  
  	mutex_lock(&session->s_mutex);
62575e270   Jeff Layton   ceph: check sessi...
4219
  	inc_session_sequence(session);
6cd3bcad0   Yan, Zheng   ceph: move ceph_f...
4220

d37b1d994   Markus Elfring   ceph: adjust 36 c...
4221
  	if (!inode) {
2f2dc0534   Sage Weil   ceph: MDS client
4222
4223
4224
4225
  		dout("handle_lease no inode %llx
  ", vino.ino);
  		goto release;
  	}
2f2dc0534   Sage Weil   ceph: MDS client
4226
4227
4228
4229
4230
4231
4232
4233
4234
  
  	/* dentry */
  	parent = d_find_alias(inode);
  	if (!parent) {
  		dout("no parent dentry on inode %p
  ", inode);
  		WARN_ON(1);
  		goto release;  /* hrm... */
  	}
8387ff257   Linus Torvalds   vfs: make the str...
4235
  	dname.hash = full_name_hash(parent, dname.name, dname.len);
2f2dc0534   Sage Weil   ceph: MDS client
4236
4237
4238
4239
4240
4241
4242
4243
4244
  	dentry = d_lookup(parent, &dname);
  	dput(parent);
  	if (!dentry)
  		goto release;
  
  	spin_lock(&dentry->d_lock);
  	di = ceph_dentry(dentry);
  	switch (h->action) {
  	case CEPH_MDS_LEASE_REVOKE:
3d8eb7a94   Sage Weil   ceph: remove unne...
4245
  		if (di->lease_session == session) {
1e5ea23df   Sage Weil   ceph: fix lease r...
4246
4247
  			if (ceph_seq_cmp(di->lease_seq, seq) > 0)
  				h->seq = cpu_to_le32(di->lease_seq);
2f2dc0534   Sage Weil   ceph: MDS client
4248
4249
4250
4251
4252
4253
  			__ceph_mdsc_drop_dentry_lease(dentry);
  		}
  		release = 1;
  		break;
  
  	case CEPH_MDS_LEASE_RENEW:
3d8eb7a94   Sage Weil   ceph: remove unne...
4254
  		if (di->lease_session == session &&
2f2dc0534   Sage Weil   ceph: MDS client
4255
4256
4257
4258
  		    di->lease_gen == session->s_cap_gen &&
  		    di->lease_renew_from &&
  		    di->lease_renew_after == 0) {
  			unsigned long duration =
3563dbdd9   Nicholas Mc Guire   ceph: use msecs_t...
4259
  				msecs_to_jiffies(le32_to_cpu(h->duration_ms));
2f2dc0534   Sage Weil   ceph: MDS client
4260

1e5ea23df   Sage Weil   ceph: fix lease r...
4261
  			di->lease_seq = seq;
9b16f03c4   Miklos Szeredi   ceph: don't use -...
4262
  			di->time = di->lease_renew_from + duration;
2f2dc0534   Sage Weil   ceph: MDS client
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
  			di->lease_renew_after = di->lease_renew_from +
  				(duration >> 1);
  			di->lease_renew_from = 0;
  		}
  		break;
  	}
  	spin_unlock(&dentry->d_lock);
  	dput(dentry);
  
  	if (!release)
  		goto out;
  
  release:
  	/* let's just reuse the same message */
  	h->action = CEPH_MDS_LEASE_REVOKE_ACK;
  	ceph_msg_get(msg);
  	ceph_con_send(&session->s_con, msg);
  
  out:
2f2dc0534   Sage Weil   ceph: MDS client
4282
  	mutex_unlock(&session->s_mutex);
3e1d0452e   Yan, Zheng   ceph: avoid iput_...
4283
4284
  	/* avoid calling iput_final() in mds dispatch threads */
  	ceph_async_iput(inode);
2f2dc0534   Sage Weil   ceph: MDS client
4285
4286
4287
4288
4289
  	return;
  
  bad:
  	pr_err("corrupt lease message
  ");
9ec7cab14   Sage Weil   ceph: hex dump co...
4290
  	ceph_msg_dump(msg);
2f2dc0534   Sage Weil   ceph: MDS client
4291
4292
4293
  }
  
  /*
   * Build and send a CEPH_MSG_CLIENT_LEASE message for @dentry to the MDS
   * behind @session.
   *
   * @action and @seq are copied into the lease header.  The parent
   * directory's ino and snap, and the dentry name, are sampled under
   * dentry->d_lock.  The message front is sized for a name of up to
   * NAME_MAX bytes.
   *
   * Returns silently, sending nothing, if the message cannot be allocated.
   */
  void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
  			      struct dentry *dentry, char action,
  			      u32 seq)
  {
  	struct ceph_msg *msg;
  	struct ceph_mds_lease *lease;
  	struct inode *dir;
  	int len = sizeof(*lease) + sizeof(u32) + NAME_MAX;
  
  	dout("lease_send_msg identry %p %s to mds%d\n",
  	     dentry, ceph_lease_op_name(action), session->s_mds);
  
  	msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
  	if (!msg)
  		return;
  	lease = msg->front.iov_base;
  	lease->action = action;
  	lease->seq = cpu_to_le32(seq);
  
  	spin_lock(&dentry->d_lock);
  	dir = d_inode(dentry->d_parent);
  	lease->ino = cpu_to_le64(ceph_ino(dir));
  	lease->first = lease->last = cpu_to_le64(ceph_snap(dir));
  
  	/* 32-bit name length, then the name bytes, right after the header */
  	put_unaligned_le32(dentry->d_name.len, lease + 1);
  	memcpy((void *)(lease + 1) + 4,
  	       dentry->d_name.name, dentry->d_name.len);
  	spin_unlock(&dentry->d_lock);
  	/*
  	 * if this is a preemptive lease RELEASE, no need to
  	 * flush request stream, since the actual request will
  	 * soon follow.
  	 */
  	msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
  
  	ceph_con_send(&session->s_con, msg);
  }
  
  /*
7aac453a0   Yan, Zheng   ceph: rename func...
4333
   * lock unlock sessions, to wait ongoing session activities
2f2dc0534   Sage Weil   ceph: MDS client
4334
   */
7aac453a0   Yan, Zheng   ceph: rename func...
4335
  static void lock_unlock_sessions(struct ceph_mds_client *mdsc)
2f2dc0534   Sage Weil   ceph: MDS client
4336
4337
  {
  	int i;
2f2dc0534   Sage Weil   ceph: MDS client
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
  	mutex_lock(&mdsc->mutex);
  	for (i = 0; i < mdsc->max_sessions; i++) {
  		struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
  		if (!s)
  			continue;
  		mutex_unlock(&mdsc->mutex);
  		mutex_lock(&s->s_mutex);
  		mutex_unlock(&s->s_mutex);
  		ceph_put_mds_session(s);
  		mutex_lock(&mdsc->mutex);
  	}
  	mutex_unlock(&mdsc->mutex);
  }
131d7eb4f   Yan, Zheng   ceph: auto reconn...
4351
4352
4353
4354
4355
4356
  static void maybe_recover_session(struct ceph_mds_client *mdsc)
  {
  	struct ceph_fs_client *fsc = mdsc->fsc;
  
  	if (!ceph_test_mount_opt(fsc, CLEANRECOVER))
  		return;
2f2dc0534   Sage Weil   ceph: MDS client
4357

131d7eb4f   Yan, Zheng   ceph: auto reconn...
4358
4359
  	if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED)
  		return;
0b98acd61   Ilya Dryomov   libceph, rbd, cep...
4360
  	if (!READ_ONCE(fsc->blocklisted))
131d7eb4f   Yan, Zheng   ceph: auto reconn...
4361
4362
4363
4364
4365
  		return;
  
  	if (fsc->last_auto_reconnect &&
  	    time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30))
  		return;
0b98acd61   Ilya Dryomov   libceph, rbd, cep...
4366
4367
  	pr_info("auto reconnect after blocklisted
  ");
131d7eb4f   Yan, Zheng   ceph: auto reconn...
4368
4369
4370
  	fsc->last_auto_reconnect = jiffies;
  	ceph_force_reconnect(fsc->sb);
  }
2f2dc0534   Sage Weil   ceph: MDS client
4371

3e699bd86   Xiubo Li   ceph: add check_s...
4372
4373
  bool check_session_state(struct ceph_mds_session *s)
  {
62575e270   Jeff Layton   ceph: check sessi...
4374
4375
4376
  	switch (s->s_state) {
  	case CEPH_MDS_SESSION_OPEN:
  		if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
3e699bd86   Xiubo Li   ceph: add check_s...
4377
4378
4379
4380
  			s->s_state = CEPH_MDS_SESSION_HUNG;
  			pr_info("mds%d hung
  ", s->s_mds);
  		}
62575e270   Jeff Layton   ceph: check sessi...
4381
4382
4383
4384
4385
4386
4387
4388
4389
  		break;
  	case CEPH_MDS_SESSION_CLOSING:
  		/* Should never reach this when we're unmounting */
  		WARN_ON_ONCE(true);
  		fallthrough;
  	case CEPH_MDS_SESSION_NEW:
  	case CEPH_MDS_SESSION_RESTARTING:
  	case CEPH_MDS_SESSION_CLOSED:
  	case CEPH_MDS_SESSION_REJECTED:
3e699bd86   Xiubo Li   ceph: add check_s...
4390
  		return false;
62575e270   Jeff Layton   ceph: check sessi...
4391
  	}
3e699bd86   Xiubo Li   ceph: add check_s...
4392
4393
4394
  
  	return true;
  }
2f2dc0534   Sage Weil   ceph: MDS client
4395
  /*
   * Increment the session's sequence number.
   *
   * If the sequence is incremented while we're waiting on a REQUEST_CLOSE reply,
   * then we need to retransmit that request.  A failure to resend is only
   * logged.
   *
   * Caller must hold s->s_mutex.
   */
  void inc_session_sequence(struct ceph_mds_session *s)
  {
  	lockdep_assert_held(&s->s_mutex);
  
  	s->s_seq++;
  
  	if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
  		int ret;
  
  		dout("resending session close request for mds%d\n", s->s_mds);
  		ret = request_close_session(s);
  		if (ret < 0)
  			pr_err("unable to close session to mds%d: %d\n",
  			       s->s_mds, ret);
  	}
  }
  
  /*
2f2dc0534   Sage Weil   ceph: MDS client
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
   * delayed work -- periodically trim expired leases, renew caps with mds
   */
  static void schedule_delayed(struct ceph_mds_client *mdsc)
  {
  	int delay = 5;
  	unsigned hz = round_jiffies_relative(HZ * delay);
  	schedule_delayed_work(&mdsc->delayed_work, hz);
  }
  
  static void delayed_work(struct work_struct *work)
  {
  	int i;
  	struct ceph_mds_client *mdsc =
  		container_of(work, struct ceph_mds_client, delayed_work.work);
  	int renew_interval;
  	int renew_caps;
  
  	dout("mdsc delayed_work
  ");
75c9627ef   Yan, Zheng   ceph: map snapid ...
4438

fa9967734   Xiubo Li   ceph: fix potenti...
4439
4440
  	if (mdsc->stopping)
  		return;
2f2dc0534   Sage Weil   ceph: MDS client
4441
4442
4443
4444
4445
4446
4447
4448
4449
  	mutex_lock(&mdsc->mutex);
  	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
  	renew_caps = time_after_eq(jiffies, HZ*renew_interval +
  				   mdsc->last_renew_caps);
  	if (renew_caps)
  		mdsc->last_renew_caps = jiffies;
  
  	for (i = 0; i < mdsc->max_sessions; i++) {
  		struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
d37b1d994   Markus Elfring   ceph: adjust 36 c...
4450
  		if (!s)
2f2dc0534   Sage Weil   ceph: MDS client
4451
  			continue;
3e699bd86   Xiubo Li   ceph: add check_s...
4452
4453
  
  		if (!check_session_state(s)) {
2f2dc0534   Sage Weil   ceph: MDS client
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
  			ceph_put_mds_session(s);
  			continue;
  		}
  		mutex_unlock(&mdsc->mutex);
  
  		mutex_lock(&s->s_mutex);
  		if (renew_caps)
  			send_renew_caps(mdsc, s);
  		else
  			ceph_con_keepalive(&s->s_con);
aab53dd9e   Sage Weil   ceph: only send c...
4464
4465
  		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
  		    s->s_state == CEPH_MDS_SESSION_HUNG)
3d7ded4d8   Sage Weil   ceph: release cap...
4466
  			ceph_send_cap_releases(mdsc, s);
2f2dc0534   Sage Weil   ceph: MDS client
4467
4468
4469
4470
4471
4472
  		mutex_unlock(&s->s_mutex);
  		ceph_put_mds_session(s);
  
  		mutex_lock(&mdsc->mutex);
  	}
  	mutex_unlock(&mdsc->mutex);
37c4efc1d   Yan, Zheng   ceph: periodicall...
4473
4474
4475
4476
4477
  	ceph_check_delayed_caps(mdsc);
  
  	ceph_queue_cap_reclaim_work(mdsc);
  
  	ceph_trim_snapid_map(mdsc);
131d7eb4f   Yan, Zheng   ceph: auto reconn...
4478
  	maybe_recover_session(mdsc);
2f2dc0534   Sage Weil   ceph: MDS client
4479
4480
  	schedule_delayed(mdsc);
  }
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
4481
  int ceph_mdsc_init(struct ceph_fs_client *fsc)
2f2dc0534   Sage Weil   ceph: MDS client
4482

2f2dc0534   Sage Weil   ceph: MDS client
4483
  {
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
4484
  	struct ceph_mds_client *mdsc;
f9009efac   Xiubo Li   ceph: add dentry ...
4485
  	int err;
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
4486
4487
4488
4489
4490
  
  	mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
  	if (!mdsc)
  		return -ENOMEM;
  	mdsc->fsc = fsc;
2f2dc0534   Sage Weil   ceph: MDS client
4491
4492
  	mutex_init(&mdsc->mutex);
  	mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
d37b1d994   Markus Elfring   ceph: adjust 36 c...
4493
  	if (!mdsc->mdsmap) {
f9009efac   Xiubo Li   ceph: add dentry ...
4494
4495
  		err = -ENOMEM;
  		goto err_mdsc;
fb3101b6f   majianpeng   ceph: Free mdsc i...
4496
  	}
2d06eeb87   Cheng Renquan   ceph: handle kzal...
4497

2f2dc0534   Sage Weil   ceph: MDS client
4498
  	init_completion(&mdsc->safe_umount_waiters);
f3c60c591   Sage Weil   ceph: fix multipl...
4499
  	init_waitqueue_head(&mdsc->session_close_wq);
2f2dc0534   Sage Weil   ceph: MDS client
4500
4501
  	INIT_LIST_HEAD(&mdsc->waiting_for_map);
  	mdsc->sessions = NULL;
86d8f67b2   Yan, Zheng   ceph: avoid block...
4502
  	atomic_set(&mdsc->num_sessions, 0);
2f2dc0534   Sage Weil   ceph: MDS client
4503
4504
  	mdsc->max_sessions = 0;
  	mdsc->stopping = 0;
d557c48db   Luis Henriques   ceph: quota: add ...
4505
  	atomic64_set(&mdsc->quotarealms_count, 0);
0c44a8e0f   Luis Henriques   ceph: quota: fix ...
4506
4507
  	mdsc->quotarealms_inodes = RB_ROOT;
  	mutex_init(&mdsc->quotarealms_inodes_mutex);
affbc19a6   Yan, Zheng   ceph: make sure s...
4508
  	mdsc->last_snap_seq = 0;
2f2dc0534   Sage Weil   ceph: MDS client
4509
  	init_rwsem(&mdsc->snap_rwsem);
a105f00cf   Sage Weil   ceph: use rbtree ...
4510
  	mdsc->snap_realms = RB_ROOT;
2f2dc0534   Sage Weil   ceph: MDS client
4511
  	INIT_LIST_HEAD(&mdsc->snap_empty);
81c5a1487   Yan, Zheng   ceph: split large...
4512
  	mdsc->num_snap_realms = 0;
2f2dc0534   Sage Weil   ceph: MDS client
4513
4514
  	spin_lock_init(&mdsc->snap_empty_lock);
  	mdsc->last_tid = 0;
e8a7b8b12   Yan, Zheng   ceph: exclude set...
4515
  	mdsc->oldest_tid = 0;
44ca18f26   Sage Weil   ceph: use rbtree ...
4516
  	mdsc->request_tree = RB_ROOT;
2f2dc0534   Sage Weil   ceph: MDS client
4517
4518
4519
  	INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
  	mdsc->last_renew_caps = jiffies;
  	INIT_LIST_HEAD(&mdsc->cap_delay_list);
3a3430aff   Jeff Layton   ceph: show tasks ...
4520
  	INIT_LIST_HEAD(&mdsc->cap_wait_list);
2f2dc0534   Sage Weil   ceph: MDS client
4521
4522
4523
  	spin_lock_init(&mdsc->cap_delay_lock);
  	INIT_LIST_HEAD(&mdsc->snap_flush_list);
  	spin_lock_init(&mdsc->snap_flush_lock);
553adfd94   Yan, Zheng   ceph: track pendi...
4524
  	mdsc->last_cap_flush_tid = 1;
e4500b5e3   Yan, Zheng   ceph: use list in...
4525
  	INIT_LIST_HEAD(&mdsc->cap_flush_list);
db3540522   Sage Weil   ceph: fix cap flu...
4526
  	INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
2f2dc0534   Sage Weil   ceph: MDS client
4527
4528
4529
  	mdsc->num_cap_flushing = 0;
  	spin_lock_init(&mdsc->cap_dirty_lock);
  	init_waitqueue_head(&mdsc->cap_flushing_wq);
37c4efc1d   Yan, Zheng   ceph: periodicall...
4530
  	INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
fe33032da   Yan, Zheng   ceph: add mount o...
4531
  	atomic_set(&mdsc->cap_reclaim_pending, 0);
f9009efac   Xiubo Li   ceph: add dentry ...
4532
4533
4534
  	err = ceph_metric_init(&mdsc->metric);
  	if (err)
  		goto err_mdsmap;
37c4efc1d   Yan, Zheng   ceph: periodicall...
4535
4536
4537
4538
  
  	spin_lock_init(&mdsc->dentry_list_lock);
  	INIT_LIST_HEAD(&mdsc->dentry_leases);
  	INIT_LIST_HEAD(&mdsc->dentry_dir_leases);
2d06eeb87   Cheng Renquan   ceph: handle kzal...
4539

37151668b   Yehuda Sadeh   ceph: do caps acc...
4540
  	ceph_caps_init(mdsc);
fe33032da   Yan, Zheng   ceph: add mount o...
4541
  	ceph_adjust_caps_max_min(mdsc, fsc->mount_options);
37151668b   Yehuda Sadeh   ceph: do caps acc...
4542

75c9627ef   Yan, Zheng   ceph: map snapid ...
4543
4544
4545
  	spin_lock_init(&mdsc->snapid_map_lock);
  	mdsc->snapid_map_tree = RB_ROOT;
  	INIT_LIST_HEAD(&mdsc->snapid_map_lru);
10183a695   Yan, Zheng   ceph: check OSD c...
4546
4547
  	init_rwsem(&mdsc->pool_perm_rwsem);
  	mdsc->pool_perm_tree = RB_ROOT;
dfeb84d4a   Yan, Zheng   ceph: fix incorre...
4548
4549
  	strscpy(mdsc->nodename, utsname()->nodename,
  		sizeof(mdsc->nodename));
a7caa88f8   Xiubo Li   ceph: fix use-aft...
4550
4551
  
  	fsc->mdsc = mdsc;
5f44f1426   Sage Weil   ceph: handle erro...
4552
  	return 0;
f9009efac   Xiubo Li   ceph: add dentry ...
4553
4554
4555
4556
4557
4558
  
  err_mdsmap:
  	kfree(mdsc->mdsmap);
  err_mdsc:
  	kfree(mdsc);
  	return err;
2f2dc0534   Sage Weil   ceph: MDS client
4559
4560
4561
4562
4563
4564
4565
4566
  }
  
  /*
   * Wait for safe replies on open mds requests.  If we time out, drop
   * all requests from the tree to avoid dangling dentry refs.
   */
  static void wait_requests(struct ceph_mds_client *mdsc)
  {
a319bf56a   Ilya Dryomov   libceph: store ti...
4567
  	struct ceph_options *opts = mdsc->fsc->client->options;
2f2dc0534   Sage Weil   ceph: MDS client
4568
  	struct ceph_mds_request *req;
2f2dc0534   Sage Weil   ceph: MDS client
4569
4570
  
  	mutex_lock(&mdsc->mutex);
44ca18f26   Sage Weil   ceph: use rbtree ...
4571
  	if (__get_oldest_req(mdsc)) {
2f2dc0534   Sage Weil   ceph: MDS client
4572
  		mutex_unlock(&mdsc->mutex);
44ca18f26   Sage Weil   ceph: use rbtree ...
4573

2f2dc0534   Sage Weil   ceph: MDS client
4574
4575
4576
  		dout("wait_requests waiting for requests
  ");
  		wait_for_completion_timeout(&mdsc->safe_umount_waiters,
a319bf56a   Ilya Dryomov   libceph: store ti...
4577
  				    ceph_timeout_jiffies(opts->mount_timeout));
2f2dc0534   Sage Weil   ceph: MDS client
4578
4579
  
  		/* tear down remaining requests */
44ca18f26   Sage Weil   ceph: use rbtree ...
4580
4581
  		mutex_lock(&mdsc->mutex);
  		while ((req = __get_oldest_req(mdsc))) {
2f2dc0534   Sage Weil   ceph: MDS client
4582
4583
4584
  			dout("wait_requests timed out on tid %llu
  ",
  			     req->r_tid);
428138c98   Yan, Zheng   ceph: remove requ...
4585
  			list_del_init(&req->r_wait);
44ca18f26   Sage Weil   ceph: use rbtree ...
4586
  			__unregister_request(mdsc, req);
2f2dc0534   Sage Weil   ceph: MDS client
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
  		}
  	}
  	mutex_unlock(&mdsc->mutex);
  	dout("wait_requests done
  ");
  }
  
  /*
   * called before mount is ro, and before dentries are torn down.
   * (hmm, does this still race with new lookups?)
   *
   * Sets mdsc->stopping, cycles every session's s_mutex, flushes dirty
   * caps, waits for outstanding requests, flushes the messenger and
   * finally cleans up the quota realm inodes.
   */
  void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
  {
  	dout("pre_umount\n");
  	mdsc->stopping = 1;
  
  	lock_unlock_sessions(mdsc);
  	ceph_flush_dirty_caps(mdsc);
  	wait_requests(mdsc);
  
  	/*
  	 * wait for reply handlers to drop their request refs and
  	 * their inode/dcache refs
  	 */
  	ceph_msgr_flush();
  
  	ceph_cleanup_quotarealms_inodes(mdsc);
  }
  
  /*
   * Wait for all write mds requests with a tid <= @want_tid to flush.
   *
   * Walks the request tree from the oldest request.  For every write
   * operation (CEPH_MDS_OP_SETFILELOCK excluded) the mutex is dropped
   * while we sleep on that request's r_safe_completion.  References are
   * held on both the current and the next request across the sleep; if
   * the next request was removed from the tree in the meantime the walk
   * restarts from the oldest request.
   */
  static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
  {
  	struct ceph_mds_request *req, *nextreq;
  	struct rb_node *n;

  	mutex_lock(&mdsc->mutex);
  	/* want_tid is a u64: print it unsigned, like the dout below */
  	dout("wait_unsafe_requests want %llu\n", want_tid);
  restart:
  	req = __get_oldest_req(mdsc);
  	while (req && req->r_tid <= want_tid) {
  		/* find next request */
  		n = rb_next(&req->r_node);
  		nextreq = n ? rb_entry(n, struct ceph_mds_request, r_node) :
  			      NULL;

  		if (req->r_op != CEPH_MDS_OP_SETFILELOCK &&
  		    (req->r_op & CEPH_MDS_OP_WRITE)) {
  			/* write op: pin req and nextreq across the sleep */
  			ceph_mdsc_get_request(req);
  			if (nextreq)
  				ceph_mdsc_get_request(nextreq);
  			mutex_unlock(&mdsc->mutex);
  			dout("wait_unsafe_requests  wait on %llu (want %llu)\n",
  			     req->r_tid, want_tid);
  			wait_for_completion(&req->r_safe_completion);
  			mutex_lock(&mdsc->mutex);
  			ceph_mdsc_put_request(req);
  			if (!nextreq)
  				break;  /* next dne before, so we're done! */
  			if (RB_EMPTY_NODE(&nextreq->r_node)) {
  				/* next request was removed from tree */
  				ceph_mdsc_put_request(nextreq);
  				goto restart;
  			}
  			ceph_mdsc_put_request(nextreq);  /* won't go away */
  		}
  		req = nextreq;
  	}
  	mutex_unlock(&mdsc->mutex);
  	dout("wait_unsafe_requests done\n");
  }
  
  void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
  {
0e2943878   Yan, Zheng   ceph: unify cap f...
4667
  	u64 want_tid, want_flush;
2f2dc0534   Sage Weil   ceph: MDS client
4668

52953d559   Seraphime Kirkovski   ceph: cleanup ACC...
4669
  	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
56b7cf958   Sage Weil   ceph: skip mds sy...
4670
  		return;
2f2dc0534   Sage Weil   ceph: MDS client
4671
4672
4673
4674
  	dout("sync
  ");
  	mutex_lock(&mdsc->mutex);
  	want_tid = mdsc->last_tid;
2f2dc0534   Sage Weil   ceph: MDS client
4675
  	mutex_unlock(&mdsc->mutex);
2f2dc0534   Sage Weil   ceph: MDS client
4676

afcdaea3f   Sage Weil   ceph: flush dirty...
4677
  	ceph_flush_dirty_caps(mdsc);
d3383a8e3   Yan, Zheng   ceph: avoid block...
4678
  	spin_lock(&mdsc->cap_dirty_lock);
8310b0891   Yan, Zheng   ceph: track pendi...
4679
  	want_flush = mdsc->last_cap_flush_tid;
c8799fc46   Yan, Zheng   ceph: optimize ca...
4680
4681
4682
4683
4684
4685
  	if (!list_empty(&mdsc->cap_flush_list)) {
  		struct ceph_cap_flush *cf =
  			list_last_entry(&mdsc->cap_flush_list,
  					struct ceph_cap_flush, g_list);
  		cf->wake = true;
  	}
d3383a8e3   Yan, Zheng   ceph: avoid block...
4686
  	spin_unlock(&mdsc->cap_dirty_lock);
0e2943878   Yan, Zheng   ceph: unify cap f...
4687
4688
4689
  	dout("sync want tid %lld flush_seq %lld
  ",
  	     want_tid, want_flush);
2f2dc0534   Sage Weil   ceph: MDS client
4690
4691
  
  	wait_unsafe_requests(mdsc, want_tid);
0e2943878   Yan, Zheng   ceph: unify cap f...
4692
  	wait_caps_flush(mdsc, want_flush);
2f2dc0534   Sage Weil   ceph: MDS client
4693
  }
f3c60c591   Sage Weil   ceph: fix multipl...
4694
4695
4696
  /*
   * true if all sessions are closed, or we force unmount
   */
fcff415c9   Yan, Zheng   ceph: handle CEPH...
4697
  static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
f3c60c591   Sage Weil   ceph: fix multipl...
4698
  {
52953d559   Seraphime Kirkovski   ceph: cleanup ACC...
4699
  	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
f3c60c591   Sage Weil   ceph: fix multipl...
4700
  		return true;
fcff415c9   Yan, Zheng   ceph: handle CEPH...
4701
  	return atomic_read(&mdsc->num_sessions) <= skipped;
f3c60c591   Sage Weil   ceph: fix multipl...
4702
  }
2f2dc0534   Sage Weil   ceph: MDS client
4703
4704
4705
4706
4707
4708
  
  /*
   * called after sb is ro.
   */
  void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
  {
a319bf56a   Ilya Dryomov   libceph: store ti...
4709
  	struct ceph_options *opts = mdsc->fsc->client->options;
2f2dc0534   Sage Weil   ceph: MDS client
4710
4711
  	struct ceph_mds_session *session;
  	int i;
fcff415c9   Yan, Zheng   ceph: handle CEPH...
4712
  	int skipped = 0;
2f2dc0534   Sage Weil   ceph: MDS client
4713
4714
4715
  
  	dout("close_sessions
  ");
2f2dc0534   Sage Weil   ceph: MDS client
4716
  	/* close sessions */
f3c60c591   Sage Weil   ceph: fix multipl...
4717
4718
4719
4720
4721
  	mutex_lock(&mdsc->mutex);
  	for (i = 0; i < mdsc->max_sessions; i++) {
  		session = __ceph_lookup_mds_session(mdsc, i);
  		if (!session)
  			continue;
2f2dc0534   Sage Weil   ceph: MDS client
4722
  		mutex_unlock(&mdsc->mutex);
f3c60c591   Sage Weil   ceph: fix multipl...
4723
  		mutex_lock(&session->s_mutex);
fcff415c9   Yan, Zheng   ceph: handle CEPH...
4724
4725
  		if (__close_session(mdsc, session) <= 0)
  			skipped++;
f3c60c591   Sage Weil   ceph: fix multipl...
4726
4727
  		mutex_unlock(&session->s_mutex);
  		ceph_put_mds_session(session);
2f2dc0534   Sage Weil   ceph: MDS client
4728
4729
  		mutex_lock(&mdsc->mutex);
  	}
f3c60c591   Sage Weil   ceph: fix multipl...
4730
4731
4732
4733
  	mutex_unlock(&mdsc->mutex);
  
  	dout("waiting for sessions to close
  ");
fcff415c9   Yan, Zheng   ceph: handle CEPH...
4734
4735
  	wait_event_timeout(mdsc->session_close_wq,
  			   done_closing_sessions(mdsc, skipped),
a319bf56a   Ilya Dryomov   libceph: store ti...
4736
  			   ceph_timeout_jiffies(opts->mount_timeout));
2f2dc0534   Sage Weil   ceph: MDS client
4737
4738
  
  	/* tear down remaining sessions */
f3c60c591   Sage Weil   ceph: fix multipl...
4739
  	mutex_lock(&mdsc->mutex);
2f2dc0534   Sage Weil   ceph: MDS client
4740
4741
  	for (i = 0; i < mdsc->max_sessions; i++) {
  		if (mdsc->sessions[i]) {
5b3248c67   Xiubo Li   ceph: rename get_...
4742
  			session = ceph_get_mds_session(mdsc->sessions[i]);
2600d2dd5   Sage Weil   ceph: drop messag...
4743
  			__unregister_session(mdsc, session);
2f2dc0534   Sage Weil   ceph: MDS client
4744
4745
4746
4747
4748
4749
4750
4751
  			mutex_unlock(&mdsc->mutex);
  			mutex_lock(&session->s_mutex);
  			remove_session_caps(session);
  			mutex_unlock(&session->s_mutex);
  			ceph_put_mds_session(session);
  			mutex_lock(&mdsc->mutex);
  		}
  	}
2f2dc0534   Sage Weil   ceph: MDS client
4752
  	WARN_ON(!list_empty(&mdsc->cap_delay_list));
2f2dc0534   Sage Weil   ceph: MDS client
4753
  	mutex_unlock(&mdsc->mutex);
75c9627ef   Yan, Zheng   ceph: map snapid ...
4754
  	ceph_cleanup_snapid_map(mdsc);
2f2dc0534   Sage Weil   ceph: MDS client
4755
  	ceph_cleanup_empty_realms(mdsc);
37c4efc1d   Yan, Zheng   ceph: periodicall...
4756
  	cancel_work_sync(&mdsc->cap_reclaim_work);
2f2dc0534   Sage Weil   ceph: MDS client
4757
4758
4759
4760
4761
  	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
  
  	dout("stopped
  ");
  }
48fec5d0a   Yan, Zheng   ceph: EIO all ope...
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
  void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
  {
  	struct ceph_mds_session *session;
  	int mds;
  
  	dout("force umount
  ");
  
  	mutex_lock(&mdsc->mutex);
  	for (mds = 0; mds < mdsc->max_sessions; mds++) {
  		session = __ceph_lookup_mds_session(mdsc, mds);
  		if (!session)
  			continue;
d468e729b   Yan, Zheng   ceph: add helper ...
4775
4776
4777
4778
  
  		if (session->s_state == CEPH_MDS_SESSION_REJECTED)
  			__unregister_session(mdsc, session);
  		__wake_requests(mdsc, &session->s_waiting);
48fec5d0a   Yan, Zheng   ceph: EIO all ope...
4779
  		mutex_unlock(&mdsc->mutex);
d468e729b   Yan, Zheng   ceph: add helper ...
4780

48fec5d0a   Yan, Zheng   ceph: EIO all ope...
4781
4782
4783
4784
4785
4786
4787
4788
  		mutex_lock(&session->s_mutex);
  		__close_session(mdsc, session);
  		if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
  			cleanup_session_requests(mdsc, session);
  			remove_session_caps(session);
  		}
  		mutex_unlock(&session->s_mutex);
  		ceph_put_mds_session(session);
d468e729b   Yan, Zheng   ceph: add helper ...
4789

48fec5d0a   Yan, Zheng   ceph: EIO all ope...
4790
4791
4792
4793
4794
4795
  		mutex_lock(&mdsc->mutex);
  		kick_requests(mdsc, mds);
  	}
  	__wake_requests(mdsc, &mdsc->waiting_for_map);
  	mutex_unlock(&mdsc->mutex);
  }
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
4796
  static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
2f2dc0534   Sage Weil   ceph: MDS client
4797
4798
4799
  {
  	dout("stop
  ");
fa9967734   Xiubo Li   ceph: fix potenti...
4800
4801
4802
4803
4804
4805
4806
4807
4808
  	/*
  	 * Make sure the delayed work stopped before releasing
  	 * the resources.
  	 *
  	 * Because the cancel_delayed_work_sync() will only
  	 * guarantee that the work finishes executing. But the
  	 * delayed work will re-arm itself again after that.
  	 */
  	flush_delayed_work(&mdsc->delayed_work);
2f2dc0534   Sage Weil   ceph: MDS client
4809
4810
4811
  	if (mdsc->mdsmap)
  		ceph_mdsmap_destroy(mdsc->mdsmap);
  	kfree(mdsc->sessions);
37151668b   Yehuda Sadeh   ceph: do caps acc...
4812
  	ceph_caps_finalize(mdsc);
10183a695   Yan, Zheng   ceph: check OSD c...
4813
  	ceph_pool_perm_destroy(mdsc);
2f2dc0534   Sage Weil   ceph: MDS client
4814
  }
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
4815
4816
4817
  void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
  {
  	struct ceph_mds_client *mdsc = fsc->mdsc;
ef550f6f4   Sage Weil   ceph: flush msgr_...
4818
4819
  	dout("mdsc_destroy %p
  ", mdsc);
ef550f6f4   Sage Weil   ceph: flush msgr_...
4820

50c55aeca   Chengguang Xu   ceph: fix invalid...
4821
4822
  	if (!mdsc)
  		return;
ef550f6f4   Sage Weil   ceph: flush msgr_...
4823
4824
  	/* flush out any connection work with references to us */
  	ceph_msgr_flush();
62a65f36d   Yan, Zheng   ceph: avoid inval...
4825
  	ceph_mdsc_stop(mdsc);
f9009efac   Xiubo Li   ceph: add dentry ...
4826
  	ceph_metric_destroy(&mdsc->metric);
18f473b38   Xiubo Li   ceph: periodicall...
4827
  	flush_delayed_work(&mdsc->metric.delayed_work);
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
4828
4829
  	fsc->mdsc = NULL;
  	kfree(mdsc);
ef550f6f4   Sage Weil   ceph: flush msgr_...
4830
4831
  	dout("mdsc_destroy %p done
  ", mdsc);
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
4832
  }
430afbadd   Yan, Zheng   ceph: mount non-d...
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
  void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
  {
  	struct ceph_fs_client *fsc = mdsc->fsc;
  	const char *mds_namespace = fsc->mount_options->mds_namespace;
  	void *p = msg->front.iov_base;
  	void *end = p + msg->front.iov_len;
  	u32 epoch;
  	u32 map_len;
  	u32 num_fs;
  	u32 mount_fscid = (u32)-1;
  	u8 struct_v, struct_cv;
  	int err = -EINVAL;
  
  	ceph_decode_need(&p, end, sizeof(u32), bad);
  	epoch = ceph_decode_32(&p);
  
  	dout("handle_fsmap epoch %u
  ", epoch);
  
  	ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
  	struct_v = ceph_decode_8(&p);
  	struct_cv = ceph_decode_8(&p);
  	map_len = ceph_decode_32(&p);
  
  	ceph_decode_need(&p, end, sizeof(u32) * 3, bad);
  	p += sizeof(u32) * 2; /* skip epoch and legacy_client_fscid */
  
  	num_fs = ceph_decode_32(&p);
  	while (num_fs-- > 0) {
  		void *info_p, *info_end;
  		u32 info_len;
  		u8 info_v, info_cv;
  		u32 fscid, namelen;
  
  		ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
  		info_v = ceph_decode_8(&p);
  		info_cv = ceph_decode_8(&p);
  		info_len = ceph_decode_32(&p);
  		ceph_decode_need(&p, end, info_len, bad);
  		info_p = p;
  		info_end = p + info_len;
  		p = info_end;
  
  		ceph_decode_need(&info_p, info_end, sizeof(u32) * 2, bad);
  		fscid = ceph_decode_32(&info_p);
  		namelen = ceph_decode_32(&info_p);
  		ceph_decode_need(&info_p, info_end, namelen, bad);
  
  		if (mds_namespace &&
  		    strlen(mds_namespace) == namelen &&
  		    !strncmp(mds_namespace, (char *)info_p, namelen)) {
  			mount_fscid = fscid;
  			break;
  		}
  	}
  
  	ceph_monc_got_map(&fsc->client->monc, CEPH_SUB_FSMAP, epoch);
  	if (mount_fscid != (u32)-1) {
  		fsc->client->monc.fs_cluster_id = mount_fscid;
  		ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
  				   0, true);
  		ceph_monc_renew_subs(&fsc->client->monc);
  	} else {
  		err = -ENOENT;
  		goto err_out;
  	}
  	return;
76bd6ec49   Ilya Dryomov   ceph: -EINVAL on ...
4900

430afbadd   Yan, Zheng   ceph: mount non-d...
4901
4902
4903
4904
4905
  bad:
  	pr_err("error decoding fsmap
  ");
  err_out:
  	mutex_lock(&mdsc->mutex);
76bd6ec49   Ilya Dryomov   ceph: -EINVAL on ...
4906
  	mdsc->mdsmap_err = err;
430afbadd   Yan, Zheng   ceph: mount non-d...
4907
4908
  	__wake_requests(mdsc, &mdsc->waiting_for_map);
  	mutex_unlock(&mdsc->mutex);
430afbadd   Yan, Zheng   ceph: mount non-d...
4909
  }
2f2dc0534   Sage Weil   ceph: MDS client
4910
4911
4912
4913
  
  /*
   * handle mds map update.
   */
430afbadd   Yan, Zheng   ceph: mount non-d...
4914
  void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
2f2dc0534   Sage Weil   ceph: MDS client
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
  {
  	u32 epoch;
  	u32 maplen;
  	void *p = msg->front.iov_base;
  	void *end = p + msg->front.iov_len;
  	struct ceph_mdsmap *newmap, *oldmap;
  	struct ceph_fsid fsid;
  	int err = -EINVAL;
  
  	ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
  	ceph_decode_copy(&p, &fsid, sizeof(fsid));
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
4926
  	if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
0743304d8   Sage Weil   ceph: fix debugfs...
4927
  		return;
c89136ea4   Sage Weil   ceph: convert enc...
4928
4929
  	epoch = ceph_decode_32(&p);
  	maplen = ceph_decode_32(&p);
2f2dc0534   Sage Weil   ceph: MDS client
4930
4931
4932
4933
  	dout("handle_map epoch %u len %d
  ", epoch, (int)maplen);
  
  	/* do we need it? */
2f2dc0534   Sage Weil   ceph: MDS client
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
  	mutex_lock(&mdsc->mutex);
  	if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
  		dout("handle_map epoch %u <= our %u
  ",
  		     epoch, mdsc->mdsmap->m_epoch);
  		mutex_unlock(&mdsc->mutex);
  		return;
  	}
  
  	newmap = ceph_mdsmap_decode(&p, end);
  	if (IS_ERR(newmap)) {
  		err = PTR_ERR(newmap);
  		goto bad_unlock;
  	}
  
  	/* swap into place */
  	if (mdsc->mdsmap) {
  		oldmap = mdsc->mdsmap;
  		mdsc->mdsmap = newmap;
  		check_new_map(mdsc, newmap, oldmap);
  		ceph_mdsmap_destroy(oldmap);
  	} else {
  		mdsc->mdsmap = newmap;  /* first mds map */
  	}
719784ba7   Chengguang Xu   ceph: add new fie...
4958
4959
  	mdsc->fsc->max_file_size = min((loff_t)mdsc->mdsmap->m_max_file_size,
  					MAX_LFS_FILESIZE);
2f2dc0534   Sage Weil   ceph: MDS client
4960
4961
  
  	__wake_requests(mdsc, &mdsc->waiting_for_map);
82dcabad7   Ilya Dryomov   libceph: revamp s...
4962
4963
  	ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP,
  			  mdsc->mdsmap->m_epoch);
2f2dc0534   Sage Weil   ceph: MDS client
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
  
  	mutex_unlock(&mdsc->mutex);
  	schedule_delayed(mdsc);
  	return;
  
  bad_unlock:
  	mutex_unlock(&mdsc->mutex);
  bad:
  	pr_err("error decoding mdsmap %d
  ", err);
  	return;
  }
  
  static struct ceph_connection *con_get(struct ceph_connection *con)
  {
  	struct ceph_mds_session *s = con->private;
5b3248c67   Xiubo Li   ceph: rename get_...
4980
  	if (ceph_get_mds_session(s))
2f2dc0534   Sage Weil   ceph: MDS client
4981
  		return con;
2f2dc0534   Sage Weil   ceph: MDS client
4982
4983
4984
4985
4986
4987
  	return NULL;
  }
  
  /*
   * Connection "put" op: drop a reference on the mds session stored in
   * con->private.
   */
  static void con_put(struct ceph_connection *con)
  {
  	struct ceph_mds_session *session = con->private;

  	ceph_put_mds_session(session);
  }
  
  /*
   * if the client is unresponsive for long enough, the mds will kill
   * the session entirely.
   */
  static void peer_reset(struct ceph_connection *con)
  {
  	struct ceph_mds_session *s = con->private;
7e70f0ed9   Sage Weil   ceph: attempt mds...
4998
  	struct ceph_mds_client *mdsc = s->s_mdsc;
2f2dc0534   Sage Weil   ceph: MDS client
4999

f3ae1b97b   Fabian Frederick   fs/ceph: replace ...
5000
5001
  	pr_warn("mds%d closed our session
  ", s->s_mds);
7e70f0ed9   Sage Weil   ceph: attempt mds...
5002
  	send_mds_reconnect(mdsc, s);
2f2dc0534   Sage Weil   ceph: MDS client
5003
5004
5005
5006
5007
5008
5009
  }
  
  static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
  {
  	struct ceph_mds_session *s = con->private;
  	struct ceph_mds_client *mdsc = s->s_mdsc;
  	int type = le16_to_cpu(msg->hdr.type);
2600d2dd5   Sage Weil   ceph: drop messag...
5010
5011
5012
5013
5014
5015
  	mutex_lock(&mdsc->mutex);
  	if (__verify_registered_session(mdsc, s) < 0) {
  		mutex_unlock(&mdsc->mutex);
  		goto out;
  	}
  	mutex_unlock(&mdsc->mutex);
2f2dc0534   Sage Weil   ceph: MDS client
5016
5017
  	switch (type) {
  	case CEPH_MSG_MDS_MAP:
430afbadd   Yan, Zheng   ceph: mount non-d...
5018
5019
5020
5021
  		ceph_mdsc_handle_mdsmap(mdsc, msg);
  		break;
  	case CEPH_MSG_FS_MAP_USER:
  		ceph_mdsc_handle_fsmap(mdsc, msg);
2f2dc0534   Sage Weil   ceph: MDS client
5022
5023
5024
5025
5026
5027
5028
5029
  		break;
  	case CEPH_MSG_CLIENT_SESSION:
  		handle_session(s, msg);
  		break;
  	case CEPH_MSG_CLIENT_REPLY:
  		handle_reply(s, msg);
  		break;
  	case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2600d2dd5   Sage Weil   ceph: drop messag...
5030
  		handle_forward(mdsc, s, msg);
2f2dc0534   Sage Weil   ceph: MDS client
5031
5032
5033
5034
5035
  		break;
  	case CEPH_MSG_CLIENT_CAPS:
  		ceph_handle_caps(s, msg);
  		break;
  	case CEPH_MSG_CLIENT_SNAP:
2600d2dd5   Sage Weil   ceph: drop messag...
5036
  		ceph_handle_snap(mdsc, s, msg);
2f2dc0534   Sage Weil   ceph: MDS client
5037
5038
  		break;
  	case CEPH_MSG_CLIENT_LEASE:
2600d2dd5   Sage Weil   ceph: drop messag...
5039
  		handle_lease(mdsc, s, msg);
2f2dc0534   Sage Weil   ceph: MDS client
5040
  		break;
fb18a5756   Luis Henriques   ceph: quota: add ...
5041
5042
5043
  	case CEPH_MSG_CLIENT_QUOTA:
  		ceph_handle_quota(mdsc, s, msg);
  		break;
2f2dc0534   Sage Weil   ceph: MDS client
5044
5045
5046
5047
5048
5049
  
  	default:
  		pr_err("received unknown message type %d %s
  ", type,
  		       ceph_msg_type_name(type));
  	}
2600d2dd5   Sage Weil   ceph: drop messag...
5050
  out:
2f2dc0534   Sage Weil   ceph: MDS client
5051
5052
  	ceph_msg_put(msg);
  }
4e7a5dcd1   Sage Weil   ceph: negotiate a...
5053
5054
5055
  /*
   * authentication
   */
a3530df33   Alex Elder   ceph: have get_au...
5056
5057
5058
5059
5060
5061
  
  /*
   * Note: returned pointer is the address of a structure that's
   * managed separately.  Caller must *not* attempt to free it.
   */
  static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
8f43fb538   Alex Elder   ceph: use info re...
5062
  					int *proto, int force_new)
4e7a5dcd1   Sage Weil   ceph: negotiate a...
5063
5064
5065
  {
  	struct ceph_mds_session *s = con->private;
  	struct ceph_mds_client *mdsc = s->s_mdsc;
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
5066
  	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
74f1869f7   Alex Elder   ceph: messenger: ...
5067
  	struct ceph_auth_handshake *auth = &s->s_auth;
4e7a5dcd1   Sage Weil   ceph: negotiate a...
5068

74f1869f7   Alex Elder   ceph: messenger: ...
5069
  	if (force_new && auth->authorizer) {
6c1ea260f   Ilya Dryomov   libceph: make aut...
5070
  		ceph_auth_destroy_authorizer(auth->authorizer);
74f1869f7   Alex Elder   ceph: messenger: ...
5071
  		auth->authorizer = NULL;
4e7a5dcd1   Sage Weil   ceph: negotiate a...
5072
  	}
27859f977   Sage Weil   libceph: wrap aut...
5073
5074
5075
  	if (!auth->authorizer) {
  		int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
  						      auth);
0bed9b5c5   Sage Weil   libceph: add upda...
5076
5077
  		if (ret)
  			return ERR_PTR(ret);
27859f977   Sage Weil   libceph: wrap aut...
5078
5079
5080
  	} else {
  		int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
  						      auth);
a255651d4   Alex Elder   ceph: ensure auth...
5081
  		if (ret)
a3530df33   Alex Elder   ceph: have get_au...
5082
  			return ERR_PTR(ret);
4e7a5dcd1   Sage Weil   ceph: negotiate a...
5083
  	}
4e7a5dcd1   Sage Weil   ceph: negotiate a...
5084
  	*proto = ac->protocol;
74f1869f7   Alex Elder   ceph: messenger: ...
5085

a3530df33   Alex Elder   ceph: have get_au...
5086
  	return auth;
4e7a5dcd1   Sage Weil   ceph: negotiate a...
5087
  }
6daca13d2   Ilya Dryomov   libceph: add auth...
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
  /*
   * Connection "add_authorizer_challenge" op: hand the challenge buffer
   * to the auth client for this session's authorizer and return its
   * result.
   */
  static int add_authorizer_challenge(struct ceph_connection *con,
  				    void *challenge_buf, int challenge_buf_len)
  {
  	struct ceph_mds_session *s = con->private;
  	struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth;
  	int ret;

  	ret = ceph_auth_add_authorizer_challenge(ac, s->s_auth.authorizer,
  						 challenge_buf,
  						 challenge_buf_len);
  	return ret;
  }
4e7a5dcd1   Sage Weil   ceph: negotiate a...
5098

0dde58488   Ilya Dryomov   libceph: drop len...
5099
  static int verify_authorizer_reply(struct ceph_connection *con)
4e7a5dcd1   Sage Weil   ceph: negotiate a...
5100
5101
5102
  {
  	struct ceph_mds_session *s = con->private;
  	struct ceph_mds_client *mdsc = s->s_mdsc;
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
5103
  	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
4e7a5dcd1   Sage Weil   ceph: negotiate a...
5104

0dde58488   Ilya Dryomov   libceph: drop len...
5105
  	return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer);
4e7a5dcd1   Sage Weil   ceph: negotiate a...
5106
  }
9bd2e6f8b   Sage Weil   ceph: allow renew...
5107
5108
5109
5110
  static int invalidate_authorizer(struct ceph_connection *con)
  {
  	struct ceph_mds_session *s = con->private;
  	struct ceph_mds_client *mdsc = s->s_mdsc;
3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
5111
  	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
9bd2e6f8b   Sage Weil   ceph: allow renew...
5112

27859f977   Sage Weil   libceph: wrap aut...
5113
  	ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
9bd2e6f8b   Sage Weil   ceph: allow renew...
5114

3d14c5d2b   Yehuda Sadeh   ceph: factor out ...
5115
  	return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
9bd2e6f8b   Sage Weil   ceph: allow renew...
5116
  }
53ded495c   Alex Elder   libceph: define m...
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
  /*
   * Connection "alloc_msg" op: provide the message that an incoming
   * frame described by @hdr will be read into.
   *
   * Returns con->in_msg if one is already set, otherwise a newly
   * allocated message sized for the header's front length, or NULL if
   * that allocation fails.  *@skip is always set to 0: this op never
   * asks for the message to be skipped.
   */
  static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
  				struct ceph_msg_header *hdr, int *skip)
  {
  	struct ceph_msg *msg;
  	int type = (int) le16_to_cpu(hdr->type);
  	int front_len = (int) le32_to_cpu(hdr->front_len);

  	/* initialise the out-parameter before any return path */
  	*skip = 0;

  	if (con->in_msg)
  		return con->in_msg;

  	msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
  	if (!msg) {
  		pr_err("unable to allocate msg type %d len %d\n",
  		       type, front_len);
  		return NULL;
  	}

  	return msg;
  }
79dbd1baa   Ilya Dryomov   libceph: msg sign...
5138
  static int mds_sign_message(struct ceph_msg *msg)
33d073379   Yan, Zheng   libceph: message ...
5139
  {
79dbd1baa   Ilya Dryomov   libceph: msg sign...
5140
         struct ceph_mds_session *s = msg->con->private;
33d073379   Yan, Zheng   libceph: message ...
5141
         struct ceph_auth_handshake *auth = &s->s_auth;
79dbd1baa   Ilya Dryomov   libceph: msg sign...
5142

33d073379   Yan, Zheng   libceph: message ...
5143
5144
         return ceph_auth_sign_message(auth, msg);
  }
79dbd1baa   Ilya Dryomov   libceph: msg sign...
5145
  static int mds_check_message_signature(struct ceph_msg *msg)
33d073379   Yan, Zheng   libceph: message ...
5146
  {
79dbd1baa   Ilya Dryomov   libceph: msg sign...
5147
         struct ceph_mds_session *s = msg->con->private;
33d073379   Yan, Zheng   libceph: message ...
5148
         struct ceph_auth_handshake *auth = &s->s_auth;
79dbd1baa   Ilya Dryomov   libceph: msg sign...
5149

33d073379   Yan, Zheng   libceph: message ...
5150
5151
         return ceph_auth_check_message_signature(auth, msg);
  }
9e32789f6   Tobias Klauser   ceph: Storage cla...
5152
  static const struct ceph_connection_operations mds_con_ops = {
2f2dc0534   Sage Weil   ceph: MDS client
5153
5154
5155
  	.get = con_get,
  	.put = con_put,
  	.dispatch = dispatch,
4e7a5dcd1   Sage Weil   ceph: negotiate a...
5156
  	.get_authorizer = get_authorizer,
6daca13d2   Ilya Dryomov   libceph: add auth...
5157
  	.add_authorizer_challenge = add_authorizer_challenge,
4e7a5dcd1   Sage Weil   ceph: negotiate a...
5158
  	.verify_authorizer_reply = verify_authorizer_reply,
9bd2e6f8b   Sage Weil   ceph: allow renew...
5159
  	.invalidate_authorizer = invalidate_authorizer,
2f2dc0534   Sage Weil   ceph: MDS client
5160
  	.peer_reset = peer_reset,
53ded495c   Alex Elder   libceph: define m...
5161
  	.alloc_msg = mds_alloc_msg,
79dbd1baa   Ilya Dryomov   libceph: msg sign...
5162
5163
  	.sign_message = mds_sign_message,
  	.check_message_signature = mds_check_message_signature,
2f2dc0534   Sage Weil   ceph: MDS client
5164
  };
2f2dc0534   Sage Weil   ceph: MDS client
5165
  /* eof */