Commit 6c073a7ee250118b8be3a2379c96fd7f78382b06

Authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client:
  rbd: fix safety of rbd_put_client()
  rbd: fix a memory leak in rbd_get_client()
  ceph: create a new session lock to avoid lock inversion
  ceph: fix length validation in parse_reply_info()
  ceph: initialize client debugfs outside of monc->mutex
  ceph: change "ceph.layout" xattr to be "ceph.file.layout"

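Two of the rbd fixes are visible in the drivers/block/rbd.c diff below. The memory-leak fix has rbd_get_client() free the rbd_opts it just allocated when it ends up reusing an already-registered client, and the rbd_put_client() safety fix moves the final kref_put() under node_lock, so the release callback's list_del() can no longer race with __rbd_client_find() walking the client list. What follows is a minimal userspace sketch of that locking pattern, with illustrative names (client_get/client_put and a plain refcount stand in for the kernel's kref and list machinery); it is an analogy, not the kernel code:

#include <pthread.h>
#include <stdlib.h>

struct client {
        int refcount;                   /* plays the role of struct kref */
};

static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;
static struct client *shared_client;   /* stands in for rbd_client_list */

/* Lookup and ref-get happen together under node_lock, as in rbd_get_client(). */
static struct client *client_get(void)
{
        struct client *c;

        pthread_mutex_lock(&node_lock);
        c = shared_client;
        if (c)
                c->refcount++;
        pthread_mutex_unlock(&node_lock);
        return c;
}

/*
 * The put must run under the same lock: if the last reference were dropped
 * outside it, client_get() could observe the client between the refcount
 * reaching zero and the unlink/free, and hand out a dangling pointer.
 */
static void client_put(struct client *c)
{
        pthread_mutex_lock(&node_lock);
        if (--c->refcount == 0) {
                shared_client = NULL;   /* the list_del() step */
                free(c);
        }
        pthread_mutex_unlock(&node_lock);
}
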
Showing 8 changed files. Inline diff of drivers/block/rbd.c below; added lines are marked "+", removed lines are marked "-".

1 /* 1 /*
2 rbd.c -- Export ceph rados objects as a Linux block device 2 rbd.c -- Export ceph rados objects as a Linux block device
3 3
4 4
5 based on drivers/block/osdblk.c: 5 based on drivers/block/osdblk.c:
6 6
7 Copyright 2009 Red Hat, Inc. 7 Copyright 2009 Red Hat, Inc.
8 8
9 This program is free software; you can redistribute it and/or modify 9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by 10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation. 11 the Free Software Foundation.
12 12
13 This program is distributed in the hope that it will be useful, 13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details. 16 GNU General Public License for more details.
17 17
18 You should have received a copy of the GNU General Public License 18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to 19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21 21
22 22
23 23
24 For usage instructions, please refer to: 24 For usage instructions, please refer to:
25 25
26 Documentation/ABI/testing/sysfs-bus-rbd 26 Documentation/ABI/testing/sysfs-bus-rbd
27 27
28 */ 28 */
29 29
30 #include <linux/ceph/libceph.h> 30 #include <linux/ceph/libceph.h>
31 #include <linux/ceph/osd_client.h> 31 #include <linux/ceph/osd_client.h>
32 #include <linux/ceph/mon_client.h> 32 #include <linux/ceph/mon_client.h>
33 #include <linux/ceph/decode.h> 33 #include <linux/ceph/decode.h>
34 #include <linux/parser.h> 34 #include <linux/parser.h>
35 35
36 #include <linux/kernel.h> 36 #include <linux/kernel.h>
37 #include <linux/device.h> 37 #include <linux/device.h>
38 #include <linux/module.h> 38 #include <linux/module.h>
39 #include <linux/fs.h> 39 #include <linux/fs.h>
40 #include <linux/blkdev.h> 40 #include <linux/blkdev.h>
41 41
42 #include "rbd_types.h" 42 #include "rbd_types.h"
43 43
44 #define DRV_NAME "rbd" 44 #define DRV_NAME "rbd"
45 #define DRV_NAME_LONG "rbd (rados block device)" 45 #define DRV_NAME_LONG "rbd (rados block device)"
46 46
47 #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ 47 #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
48 48
49 #define RBD_MAX_MD_NAME_LEN (96 + sizeof(RBD_SUFFIX)) 49 #define RBD_MAX_MD_NAME_LEN (96 + sizeof(RBD_SUFFIX))
50 #define RBD_MAX_POOL_NAME_LEN 64 50 #define RBD_MAX_POOL_NAME_LEN 64
51 #define RBD_MAX_SNAP_NAME_LEN 32 51 #define RBD_MAX_SNAP_NAME_LEN 32
52 #define RBD_MAX_OPT_LEN 1024 52 #define RBD_MAX_OPT_LEN 1024
53 53
54 #define RBD_SNAP_HEAD_NAME "-" 54 #define RBD_SNAP_HEAD_NAME "-"
55 55
56 #define DEV_NAME_LEN 32 56 #define DEV_NAME_LEN 32
57 57
58 #define RBD_NOTIFY_TIMEOUT_DEFAULT 10 58 #define RBD_NOTIFY_TIMEOUT_DEFAULT 10
59 59
60 /* 60 /*
61 * block device image metadata (in-memory version) 61 * block device image metadata (in-memory version)
62 */ 62 */
63 struct rbd_image_header { 63 struct rbd_image_header {
64 u64 image_size; 64 u64 image_size;
65 char block_name[32]; 65 char block_name[32];
66 __u8 obj_order; 66 __u8 obj_order;
67 __u8 crypt_type; 67 __u8 crypt_type;
68 __u8 comp_type; 68 __u8 comp_type;
69 struct rw_semaphore snap_rwsem; 69 struct rw_semaphore snap_rwsem;
70 struct ceph_snap_context *snapc; 70 struct ceph_snap_context *snapc;
71 size_t snap_names_len; 71 size_t snap_names_len;
72 u64 snap_seq; 72 u64 snap_seq;
73 u32 total_snaps; 73 u32 total_snaps;
74 74
75 char *snap_names; 75 char *snap_names;
76 u64 *snap_sizes; 76 u64 *snap_sizes;
77 77
78 u64 obj_version; 78 u64 obj_version;
79 }; 79 };
80 80
81 struct rbd_options { 81 struct rbd_options {
82 int notify_timeout; 82 int notify_timeout;
83 }; 83 };
84 84
85 /* 85 /*
86 * an instance of the client. multiple devices may share a client. 86 * an instance of the client. multiple devices may share a client.
87 */ 87 */
88 struct rbd_client { 88 struct rbd_client {
89 struct ceph_client *client; 89 struct ceph_client *client;
90 struct rbd_options *rbd_opts; 90 struct rbd_options *rbd_opts;
91 struct kref kref; 91 struct kref kref;
92 struct list_head node; 92 struct list_head node;
93 }; 93 };
94 94
95 struct rbd_req_coll; 95 struct rbd_req_coll;
96 96
97 /* 97 /*
98 * a single io request 98 * a single io request
99 */ 99 */
100 struct rbd_request { 100 struct rbd_request {
101 struct request *rq; /* blk layer request */ 101 struct request *rq; /* blk layer request */
102 struct bio *bio; /* cloned bio */ 102 struct bio *bio; /* cloned bio */
103 struct page **pages; /* list of used pages */ 103 struct page **pages; /* list of used pages */
104 u64 len; 104 u64 len;
105 int coll_index; 105 int coll_index;
106 struct rbd_req_coll *coll; 106 struct rbd_req_coll *coll;
107 }; 107 };
108 108
109 struct rbd_req_status { 109 struct rbd_req_status {
110 int done; 110 int done;
111 int rc; 111 int rc;
112 u64 bytes; 112 u64 bytes;
113 }; 113 };
114 114
115 /* 115 /*
116 * a collection of requests 116 * a collection of requests
117 */ 117 */
118 struct rbd_req_coll { 118 struct rbd_req_coll {
119 int total; 119 int total;
120 int num_done; 120 int num_done;
121 struct kref kref; 121 struct kref kref;
122 struct rbd_req_status status[0]; 122 struct rbd_req_status status[0];
123 }; 123 };
124 124
125 struct rbd_snap { 125 struct rbd_snap {
126 struct device dev; 126 struct device dev;
127 const char *name; 127 const char *name;
128 size_t size; 128 size_t size;
129 struct list_head node; 129 struct list_head node;
130 u64 id; 130 u64 id;
131 }; 131 };
132 132
133 /* 133 /*
134 * a single device 134 * a single device
135 */ 135 */
136 struct rbd_device { 136 struct rbd_device {
137 int id; /* blkdev unique id */ 137 int id; /* blkdev unique id */
138 138
139 int major; /* blkdev assigned major */ 139 int major; /* blkdev assigned major */
140 struct gendisk *disk; /* blkdev's gendisk and rq */ 140 struct gendisk *disk; /* blkdev's gendisk and rq */
141 struct request_queue *q; 141 struct request_queue *q;
142 142
143 struct ceph_client *client; 143 struct ceph_client *client;
144 struct rbd_client *rbd_client; 144 struct rbd_client *rbd_client;
145 145
146 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 146 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
147 147
148 spinlock_t lock; /* queue lock */ 148 spinlock_t lock; /* queue lock */
149 149
150 struct rbd_image_header header; 150 struct rbd_image_header header;
151 char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */ 151 char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
152 int obj_len; 152 int obj_len;
153 char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */ 153 char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
154 char pool_name[RBD_MAX_POOL_NAME_LEN]; 154 char pool_name[RBD_MAX_POOL_NAME_LEN];
155 int poolid; 155 int poolid;
156 156
157 struct ceph_osd_event *watch_event; 157 struct ceph_osd_event *watch_event;
158 struct ceph_osd_request *watch_request; 158 struct ceph_osd_request *watch_request;
159 159
160 char snap_name[RBD_MAX_SNAP_NAME_LEN]; 160 char snap_name[RBD_MAX_SNAP_NAME_LEN];
161 u32 cur_snap; /* index+1 of current snapshot within snap context 161 u32 cur_snap; /* index+1 of current snapshot within snap context
162 0 - for the head */ 162 0 - for the head */
163 int read_only; 163 int read_only;
164 164
165 struct list_head node; 165 struct list_head node;
166 166
167 /* list of snapshots */ 167 /* list of snapshots */
168 struct list_head snaps; 168 struct list_head snaps;
169 169
170 /* sysfs related */ 170 /* sysfs related */
171 struct device dev; 171 struct device dev;
172 }; 172 };
173 173
174 static struct bus_type rbd_bus_type = { 174 static struct bus_type rbd_bus_type = {
175 .name = "rbd", 175 .name = "rbd",
176 }; 176 };
177 177
178 static spinlock_t node_lock; /* protects client get/put */ 178 static spinlock_t node_lock; /* protects client get/put */
179 179
180 static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ 180 static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
181 static LIST_HEAD(rbd_dev_list); /* devices */ 181 static LIST_HEAD(rbd_dev_list); /* devices */
182 static LIST_HEAD(rbd_client_list); /* clients */ 182 static LIST_HEAD(rbd_client_list); /* clients */
183 183
184 static int __rbd_init_snaps_header(struct rbd_device *rbd_dev); 184 static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
185 static void rbd_dev_release(struct device *dev); 185 static void rbd_dev_release(struct device *dev);
186 static ssize_t rbd_snap_add(struct device *dev, 186 static ssize_t rbd_snap_add(struct device *dev,
187 struct device_attribute *attr, 187 struct device_attribute *attr,
188 const char *buf, 188 const char *buf,
189 size_t count); 189 size_t count);
190 static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev, 190 static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
191 struct rbd_snap *snap); 191 struct rbd_snap *snap);
192 192
193 193
194 static struct rbd_device *dev_to_rbd(struct device *dev) 194 static struct rbd_device *dev_to_rbd(struct device *dev)
195 { 195 {
196 return container_of(dev, struct rbd_device, dev); 196 return container_of(dev, struct rbd_device, dev);
197 } 197 }
198 198
199 static struct device *rbd_get_dev(struct rbd_device *rbd_dev) 199 static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
200 { 200 {
201 return get_device(&rbd_dev->dev); 201 return get_device(&rbd_dev->dev);
202 } 202 }
203 203
204 static void rbd_put_dev(struct rbd_device *rbd_dev) 204 static void rbd_put_dev(struct rbd_device *rbd_dev)
205 { 205 {
206 put_device(&rbd_dev->dev); 206 put_device(&rbd_dev->dev);
207 } 207 }
208 208
209 static int __rbd_update_snaps(struct rbd_device *rbd_dev); 209 static int __rbd_update_snaps(struct rbd_device *rbd_dev);
210 210
211 static int rbd_open(struct block_device *bdev, fmode_t mode) 211 static int rbd_open(struct block_device *bdev, fmode_t mode)
212 { 212 {
213 struct gendisk *disk = bdev->bd_disk; 213 struct gendisk *disk = bdev->bd_disk;
214 struct rbd_device *rbd_dev = disk->private_data; 214 struct rbd_device *rbd_dev = disk->private_data;
215 215
216 rbd_get_dev(rbd_dev); 216 rbd_get_dev(rbd_dev);
217 217
218 set_device_ro(bdev, rbd_dev->read_only); 218 set_device_ro(bdev, rbd_dev->read_only);
219 219
220 if ((mode & FMODE_WRITE) && rbd_dev->read_only) 220 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
221 return -EROFS; 221 return -EROFS;
222 222
223 return 0; 223 return 0;
224 } 224 }
225 225
226 static int rbd_release(struct gendisk *disk, fmode_t mode) 226 static int rbd_release(struct gendisk *disk, fmode_t mode)
227 { 227 {
228 struct rbd_device *rbd_dev = disk->private_data; 228 struct rbd_device *rbd_dev = disk->private_data;
229 229
230 rbd_put_dev(rbd_dev); 230 rbd_put_dev(rbd_dev);
231 231
232 return 0; 232 return 0;
233 } 233 }
234 234
235 static const struct block_device_operations rbd_bd_ops = { 235 static const struct block_device_operations rbd_bd_ops = {
236 .owner = THIS_MODULE, 236 .owner = THIS_MODULE,
237 .open = rbd_open, 237 .open = rbd_open,
238 .release = rbd_release, 238 .release = rbd_release,
239 }; 239 };
240 240
241 /* 241 /*
242 * Initialize an rbd client instance. 242 * Initialize an rbd client instance.
243 * We own *opt. 243 * We own *opt.
244 */ 244 */
245 static struct rbd_client *rbd_client_create(struct ceph_options *opt, 245 static struct rbd_client *rbd_client_create(struct ceph_options *opt,
246 struct rbd_options *rbd_opts) 246 struct rbd_options *rbd_opts)
247 { 247 {
248 struct rbd_client *rbdc; 248 struct rbd_client *rbdc;
249 int ret = -ENOMEM; 249 int ret = -ENOMEM;
250 250
251 dout("rbd_client_create\n"); 251 dout("rbd_client_create\n");
252 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 252 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
253 if (!rbdc) 253 if (!rbdc)
254 goto out_opt; 254 goto out_opt;
255 255
256 kref_init(&rbdc->kref); 256 kref_init(&rbdc->kref);
257 INIT_LIST_HEAD(&rbdc->node); 257 INIT_LIST_HEAD(&rbdc->node);
258 258
259 rbdc->client = ceph_create_client(opt, rbdc, 0, 0); 259 rbdc->client = ceph_create_client(opt, rbdc, 0, 0);
260 if (IS_ERR(rbdc->client)) 260 if (IS_ERR(rbdc->client))
261 goto out_rbdc; 261 goto out_rbdc;
262 opt = NULL; /* Now rbdc->client is responsible for opt */ 262 opt = NULL; /* Now rbdc->client is responsible for opt */
263 263
264 ret = ceph_open_session(rbdc->client); 264 ret = ceph_open_session(rbdc->client);
265 if (ret < 0) 265 if (ret < 0)
266 goto out_err; 266 goto out_err;
267 267
268 rbdc->rbd_opts = rbd_opts; 268 rbdc->rbd_opts = rbd_opts;
269 269
270 spin_lock(&node_lock); 270 spin_lock(&node_lock);
271 list_add_tail(&rbdc->node, &rbd_client_list); 271 list_add_tail(&rbdc->node, &rbd_client_list);
272 spin_unlock(&node_lock); 272 spin_unlock(&node_lock);
273 273
274 dout("rbd_client_create created %p\n", rbdc); 274 dout("rbd_client_create created %p\n", rbdc);
275 return rbdc; 275 return rbdc;
276 276
277 out_err: 277 out_err:
278 ceph_destroy_client(rbdc->client); 278 ceph_destroy_client(rbdc->client);
279 out_rbdc: 279 out_rbdc:
280 kfree(rbdc); 280 kfree(rbdc);
281 out_opt: 281 out_opt:
282 if (opt) 282 if (opt)
283 ceph_destroy_options(opt); 283 ceph_destroy_options(opt);
284 return ERR_PTR(ret); 284 return ERR_PTR(ret);
285 } 285 }
286 286
287 /* 287 /*
288 * Find a ceph client with specific addr and configuration. 288 * Find a ceph client with specific addr and configuration.
289 */ 289 */
290 static struct rbd_client *__rbd_client_find(struct ceph_options *opt) 290 static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
291 { 291 {
292 struct rbd_client *client_node; 292 struct rbd_client *client_node;
293 293
294 if (opt->flags & CEPH_OPT_NOSHARE) 294 if (opt->flags & CEPH_OPT_NOSHARE)
295 return NULL; 295 return NULL;
296 296
297 list_for_each_entry(client_node, &rbd_client_list, node) 297 list_for_each_entry(client_node, &rbd_client_list, node)
298 if (ceph_compare_options(opt, client_node->client) == 0) 298 if (ceph_compare_options(opt, client_node->client) == 0)
299 return client_node; 299 return client_node;
300 return NULL; 300 return NULL;
301 } 301 }
302 302
303 /* 303 /*
304 * mount options 304 * mount options
305 */ 305 */
306 enum { 306 enum {
307 Opt_notify_timeout, 307 Opt_notify_timeout,
308 Opt_last_int, 308 Opt_last_int,
309 /* int args above */ 309 /* int args above */
310 Opt_last_string, 310 Opt_last_string,
311 /* string args above */ 311 /* string args above */
312 }; 312 };
313 313
314 static match_table_t rbdopt_tokens = { 314 static match_table_t rbdopt_tokens = {
315 {Opt_notify_timeout, "notify_timeout=%d"}, 315 {Opt_notify_timeout, "notify_timeout=%d"},
316 /* int args above */ 316 /* int args above */
317 /* string args above */ 317 /* string args above */
318 {-1, NULL} 318 {-1, NULL}
319 }; 319 };
320 320
321 static int parse_rbd_opts_token(char *c, void *private) 321 static int parse_rbd_opts_token(char *c, void *private)
322 { 322 {
323 struct rbd_options *rbdopt = private; 323 struct rbd_options *rbdopt = private;
324 substring_t argstr[MAX_OPT_ARGS]; 324 substring_t argstr[MAX_OPT_ARGS];
325 int token, intval, ret; 325 int token, intval, ret;
326 326
327 token = match_token((char *)c, rbdopt_tokens, argstr); 327 token = match_token((char *)c, rbdopt_tokens, argstr);
328 if (token < 0) 328 if (token < 0)
329 return -EINVAL; 329 return -EINVAL;
330 330
331 if (token < Opt_last_int) { 331 if (token < Opt_last_int) {
332 ret = match_int(&argstr[0], &intval); 332 ret = match_int(&argstr[0], &intval);
333 if (ret < 0) { 333 if (ret < 0) {
334 pr_err("bad mount option arg (not int) " 334 pr_err("bad mount option arg (not int) "
335 "at '%s'\n", c); 335 "at '%s'\n", c);
336 return ret; 336 return ret;
337 } 337 }
338 dout("got int token %d val %d\n", token, intval); 338 dout("got int token %d val %d\n", token, intval);
339 } else if (token > Opt_last_int && token < Opt_last_string) { 339 } else if (token > Opt_last_int && token < Opt_last_string) {
340 dout("got string token %d val %s\n", token, 340 dout("got string token %d val %s\n", token,
341 argstr[0].from); 341 argstr[0].from);
342 } else { 342 } else {
343 dout("got token %d\n", token); 343 dout("got token %d\n", token);
344 } 344 }
345 345
346 switch (token) { 346 switch (token) {
347 case Opt_notify_timeout: 347 case Opt_notify_timeout:
348 rbdopt->notify_timeout = intval; 348 rbdopt->notify_timeout = intval;
349 break; 349 break;
350 default: 350 default:
351 BUG_ON(token); 351 BUG_ON(token);
352 } 352 }
353 return 0; 353 return 0;
354 } 354 }
355 355
356 /* 356 /*
357 * Get a ceph client with specific addr and configuration, if one does 357 * Get a ceph client with specific addr and configuration, if one does
358 * not exist create it. 358 * not exist create it.
359 */ 359 */
360 static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, 360 static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
361 char *options) 361 char *options)
362 { 362 {
363 struct rbd_client *rbdc; 363 struct rbd_client *rbdc;
364 struct ceph_options *opt; 364 struct ceph_options *opt;
365 int ret; 365 int ret;
366 struct rbd_options *rbd_opts; 366 struct rbd_options *rbd_opts;
367 367
368 rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL); 368 rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
369 if (!rbd_opts) 369 if (!rbd_opts)
370 return -ENOMEM; 370 return -ENOMEM;
371 371
372 rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; 372 rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
373 373
374 ret = ceph_parse_options(&opt, options, mon_addr, 374 ret = ceph_parse_options(&opt, options, mon_addr,
375 mon_addr + strlen(mon_addr), parse_rbd_opts_token, rbd_opts); 375 mon_addr + strlen(mon_addr), parse_rbd_opts_token, rbd_opts);
376 if (ret < 0) 376 if (ret < 0)
377 goto done_err; 377 goto done_err;
378 378
379 spin_lock(&node_lock); 379 spin_lock(&node_lock);
380 rbdc = __rbd_client_find(opt); 380 rbdc = __rbd_client_find(opt);
381 if (rbdc) { 381 if (rbdc) {
382 ceph_destroy_options(opt); 382 ceph_destroy_options(opt);
383 kfree(rbd_opts);
383 384
384 /* using an existing client */ 385 /* using an existing client */
385 kref_get(&rbdc->kref); 386 kref_get(&rbdc->kref);
386 rbd_dev->rbd_client = rbdc; 387 rbd_dev->rbd_client = rbdc;
387 rbd_dev->client = rbdc->client; 388 rbd_dev->client = rbdc->client;
388 spin_unlock(&node_lock); 389 spin_unlock(&node_lock);
389 return 0; 390 return 0;
390 } 391 }
391 spin_unlock(&node_lock); 392 spin_unlock(&node_lock);
392 393
393 rbdc = rbd_client_create(opt, rbd_opts); 394 rbdc = rbd_client_create(opt, rbd_opts);
394 if (IS_ERR(rbdc)) { 395 if (IS_ERR(rbdc)) {
395 ret = PTR_ERR(rbdc); 396 ret = PTR_ERR(rbdc);
396 goto done_err; 397 goto done_err;
397 } 398 }
398 399
399 rbd_dev->rbd_client = rbdc; 400 rbd_dev->rbd_client = rbdc;
400 rbd_dev->client = rbdc->client; 401 rbd_dev->client = rbdc->client;
401 return 0; 402 return 0;
402 done_err: 403 done_err:
403 kfree(rbd_opts); 404 kfree(rbd_opts);
404 return ret; 405 return ret;
405 } 406 }
406 407
407 /* 408 /*
408 * Destroy ceph client 409 * Destroy ceph client
410 *
411 * Caller must hold node_lock.
409 */ 412 */
410 static void rbd_client_release(struct kref *kref) 413 static void rbd_client_release(struct kref *kref)
411 { 414 {
412 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 415 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
413 416
414 dout("rbd_release_client %p\n", rbdc); 417 dout("rbd_release_client %p\n", rbdc);
415 spin_lock(&node_lock);
416 list_del(&rbdc->node); 418 list_del(&rbdc->node);
417 spin_unlock(&node_lock);
418 419
419 ceph_destroy_client(rbdc->client); 420 ceph_destroy_client(rbdc->client);
420 kfree(rbdc->rbd_opts); 421 kfree(rbdc->rbd_opts);
421 kfree(rbdc); 422 kfree(rbdc);
422 } 423 }
423 424
424 /* 425 /*
425 * Drop reference to ceph client node. If it's not referenced anymore, release 426 * Drop reference to ceph client node. If it's not referenced anymore, release
426 * it. 427 * it.
427 */ 428 */
428 static void rbd_put_client(struct rbd_device *rbd_dev) 429 static void rbd_put_client(struct rbd_device *rbd_dev)
429 { 430 {
431 spin_lock(&node_lock);
430 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release); 432 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
433 spin_unlock(&node_lock);
431 rbd_dev->rbd_client = NULL; 434 rbd_dev->rbd_client = NULL;
432 rbd_dev->client = NULL; 435 rbd_dev->client = NULL;
433 } 436 }
434 437
435 /* 438 /*
436 * Destroy requests collection 439 * Destroy requests collection
437 */ 440 */
438 static void rbd_coll_release(struct kref *kref) 441 static void rbd_coll_release(struct kref *kref)
439 { 442 {
440 struct rbd_req_coll *coll = 443 struct rbd_req_coll *coll =
441 container_of(kref, struct rbd_req_coll, kref); 444 container_of(kref, struct rbd_req_coll, kref);
442 445
443 dout("rbd_coll_release %p\n", coll); 446 dout("rbd_coll_release %p\n", coll);
444 kfree(coll); 447 kfree(coll);
445 } 448 }
446 449
447 /* 450 /*
448 * Create a new header structure, translate header format from the on-disk 451 * Create a new header structure, translate header format from the on-disk
449 * header. 452 * header.
450 */ 453 */
451 static int rbd_header_from_disk(struct rbd_image_header *header, 454 static int rbd_header_from_disk(struct rbd_image_header *header,
452 struct rbd_image_header_ondisk *ondisk, 455 struct rbd_image_header_ondisk *ondisk,
453 int allocated_snaps, 456 int allocated_snaps,
454 gfp_t gfp_flags) 457 gfp_t gfp_flags)
455 { 458 {
456 int i; 459 int i;
457 u32 snap_count = le32_to_cpu(ondisk->snap_count); 460 u32 snap_count = le32_to_cpu(ondisk->snap_count);
458 int ret = -ENOMEM; 461 int ret = -ENOMEM;
459 462
460 if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT))) { 463 if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT))) {
461 return -ENXIO; 464 return -ENXIO;
462 } 465 }
463 466
464 init_rwsem(&header->snap_rwsem); 467 init_rwsem(&header->snap_rwsem);
465 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); 468 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
466 header->snapc = kmalloc(sizeof(struct ceph_snap_context) + 469 header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
467 snap_count * 470 snap_count *
468 sizeof(struct rbd_image_snap_ondisk), 471 sizeof(struct rbd_image_snap_ondisk),
469 gfp_flags); 472 gfp_flags);
470 if (!header->snapc) 473 if (!header->snapc)
471 return -ENOMEM; 474 return -ENOMEM;
472 if (snap_count) { 475 if (snap_count) {
473 header->snap_names = kmalloc(header->snap_names_len, 476 header->snap_names = kmalloc(header->snap_names_len,
474 GFP_KERNEL); 477 GFP_KERNEL);
475 if (!header->snap_names) 478 if (!header->snap_names)
476 goto err_snapc; 479 goto err_snapc;
477 header->snap_sizes = kmalloc(snap_count * sizeof(u64), 480 header->snap_sizes = kmalloc(snap_count * sizeof(u64),
478 GFP_KERNEL); 481 GFP_KERNEL);
479 if (!header->snap_sizes) 482 if (!header->snap_sizes)
480 goto err_names; 483 goto err_names;
481 } else { 484 } else {
482 header->snap_names = NULL; 485 header->snap_names = NULL;
483 header->snap_sizes = NULL; 486 header->snap_sizes = NULL;
484 } 487 }
485 memcpy(header->block_name, ondisk->block_name, 488 memcpy(header->block_name, ondisk->block_name,
486 sizeof(ondisk->block_name)); 489 sizeof(ondisk->block_name));
487 490
488 header->image_size = le64_to_cpu(ondisk->image_size); 491 header->image_size = le64_to_cpu(ondisk->image_size);
489 header->obj_order = ondisk->options.order; 492 header->obj_order = ondisk->options.order;
490 header->crypt_type = ondisk->options.crypt_type; 493 header->crypt_type = ondisk->options.crypt_type;
491 header->comp_type = ondisk->options.comp_type; 494 header->comp_type = ondisk->options.comp_type;
492 495
493 atomic_set(&header->snapc->nref, 1); 496 atomic_set(&header->snapc->nref, 1);
494 header->snap_seq = le64_to_cpu(ondisk->snap_seq); 497 header->snap_seq = le64_to_cpu(ondisk->snap_seq);
495 header->snapc->num_snaps = snap_count; 498 header->snapc->num_snaps = snap_count;
496 header->total_snaps = snap_count; 499 header->total_snaps = snap_count;
497 500
498 if (snap_count && 501 if (snap_count &&
499 allocated_snaps == snap_count) { 502 allocated_snaps == snap_count) {
500 for (i = 0; i < snap_count; i++) { 503 for (i = 0; i < snap_count; i++) {
501 header->snapc->snaps[i] = 504 header->snapc->snaps[i] =
502 le64_to_cpu(ondisk->snaps[i].id); 505 le64_to_cpu(ondisk->snaps[i].id);
503 header->snap_sizes[i] = 506 header->snap_sizes[i] =
504 le64_to_cpu(ondisk->snaps[i].image_size); 507 le64_to_cpu(ondisk->snaps[i].image_size);
505 } 508 }
506 509
507 /* copy snapshot names */ 510 /* copy snapshot names */
508 memcpy(header->snap_names, &ondisk->snaps[i], 511 memcpy(header->snap_names, &ondisk->snaps[i],
509 header->snap_names_len); 512 header->snap_names_len);
510 } 513 }
511 514
512 return 0; 515 return 0;
513 516
514 err_names: 517 err_names:
515 kfree(header->snap_names); 518 kfree(header->snap_names);
516 err_snapc: 519 err_snapc:
517 kfree(header->snapc); 520 kfree(header->snapc);
518 return ret; 521 return ret;
519 } 522 }
520 523
521 static int snap_index(struct rbd_image_header *header, int snap_num) 524 static int snap_index(struct rbd_image_header *header, int snap_num)
522 { 525 {
523 return header->total_snaps - snap_num; 526 return header->total_snaps - snap_num;
524 } 527 }
525 528
526 static u64 cur_snap_id(struct rbd_device *rbd_dev) 529 static u64 cur_snap_id(struct rbd_device *rbd_dev)
527 { 530 {
528 struct rbd_image_header *header = &rbd_dev->header; 531 struct rbd_image_header *header = &rbd_dev->header;
529 532
530 if (!rbd_dev->cur_snap) 533 if (!rbd_dev->cur_snap)
531 return 0; 534 return 0;
532 535
533 return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)]; 536 return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
534 } 537 }
535 538
536 static int snap_by_name(struct rbd_image_header *header, const char *snap_name, 539 static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
537 u64 *seq, u64 *size) 540 u64 *seq, u64 *size)
538 { 541 {
539 int i; 542 int i;
540 char *p = header->snap_names; 543 char *p = header->snap_names;
541 544
542 for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) { 545 for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
543 if (strcmp(snap_name, p) == 0) 546 if (strcmp(snap_name, p) == 0)
544 break; 547 break;
545 } 548 }
546 if (i == header->total_snaps) 549 if (i == header->total_snaps)
547 return -ENOENT; 550 return -ENOENT;
548 if (seq) 551 if (seq)
549 *seq = header->snapc->snaps[i]; 552 *seq = header->snapc->snaps[i];
550 553
551 if (size) 554 if (size)
552 *size = header->snap_sizes[i]; 555 *size = header->snap_sizes[i];
553 556
554 return i; 557 return i;
555 } 558 }
556 559
557 static int rbd_header_set_snap(struct rbd_device *dev, 560 static int rbd_header_set_snap(struct rbd_device *dev,
558 const char *snap_name, 561 const char *snap_name,
559 u64 *size) 562 u64 *size)
560 { 563 {
561 struct rbd_image_header *header = &dev->header; 564 struct rbd_image_header *header = &dev->header;
562 struct ceph_snap_context *snapc = header->snapc; 565 struct ceph_snap_context *snapc = header->snapc;
563 int ret = -ENOENT; 566 int ret = -ENOENT;
564 567
565 down_write(&header->snap_rwsem); 568 down_write(&header->snap_rwsem);
566 569
567 if (!snap_name || 570 if (!snap_name ||
568 !*snap_name || 571 !*snap_name ||
569 strcmp(snap_name, "-") == 0 || 572 strcmp(snap_name, "-") == 0 ||
570 strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) { 573 strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) {
571 if (header->total_snaps) 574 if (header->total_snaps)
572 snapc->seq = header->snap_seq; 575 snapc->seq = header->snap_seq;
573 else 576 else
574 snapc->seq = 0; 577 snapc->seq = 0;
575 dev->cur_snap = 0; 578 dev->cur_snap = 0;
576 dev->read_only = 0; 579 dev->read_only = 0;
577 if (size) 580 if (size)
578 *size = header->image_size; 581 *size = header->image_size;
579 } else { 582 } else {
580 ret = snap_by_name(header, snap_name, &snapc->seq, size); 583 ret = snap_by_name(header, snap_name, &snapc->seq, size);
581 if (ret < 0) 584 if (ret < 0)
582 goto done; 585 goto done;
583 586
584 dev->cur_snap = header->total_snaps - ret; 587 dev->cur_snap = header->total_snaps - ret;
585 dev->read_only = 1; 588 dev->read_only = 1;
586 } 589 }
587 590
588 ret = 0; 591 ret = 0;
589 done: 592 done:
590 up_write(&header->snap_rwsem); 593 up_write(&header->snap_rwsem);
591 return ret; 594 return ret;
592 } 595 }
593 596
594 static void rbd_header_free(struct rbd_image_header *header) 597 static void rbd_header_free(struct rbd_image_header *header)
595 { 598 {
596 kfree(header->snapc); 599 kfree(header->snapc);
597 kfree(header->snap_names); 600 kfree(header->snap_names);
598 kfree(header->snap_sizes); 601 kfree(header->snap_sizes);
599 } 602 }
600 603
601 /* 604 /*
602 * get the actual striped segment name, offset and length 605 * get the actual striped segment name, offset and length
603 */ 606 */
604 static u64 rbd_get_segment(struct rbd_image_header *header, 607 static u64 rbd_get_segment(struct rbd_image_header *header,
605 const char *block_name, 608 const char *block_name,
606 u64 ofs, u64 len, 609 u64 ofs, u64 len,
607 char *seg_name, u64 *segofs) 610 char *seg_name, u64 *segofs)
608 { 611 {
609 u64 seg = ofs >> header->obj_order; 612 u64 seg = ofs >> header->obj_order;
610 613
611 if (seg_name) 614 if (seg_name)
612 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN, 615 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
613 "%s.%012llx", block_name, seg); 616 "%s.%012llx", block_name, seg);
614 617
615 ofs = ofs & ((1 << header->obj_order) - 1); 618 ofs = ofs & ((1 << header->obj_order) - 1);
616 len = min_t(u64, len, (1 << header->obj_order) - ofs); 619 len = min_t(u64, len, (1 << header->obj_order) - ofs);
617 620
618 if (segofs) 621 if (segofs)
619 *segofs = ofs; 622 *segofs = ofs;
620 623
621 return len; 624 return len;
622 } 625 }
623 626
624 static int rbd_get_num_segments(struct rbd_image_header *header, 627 static int rbd_get_num_segments(struct rbd_image_header *header,
625 u64 ofs, u64 len) 628 u64 ofs, u64 len)
626 { 629 {
627 u64 start_seg = ofs >> header->obj_order; 630 u64 start_seg = ofs >> header->obj_order;
628 u64 end_seg = (ofs + len - 1) >> header->obj_order; 631 u64 end_seg = (ofs + len - 1) >> header->obj_order;
629 return end_seg - start_seg + 1; 632 return end_seg - start_seg + 1;
630 } 633 }
631 634
632 /* 635 /*
633 * returns the size of an object in the image 636 * returns the size of an object in the image
634 */ 637 */
635 static u64 rbd_obj_bytes(struct rbd_image_header *header) 638 static u64 rbd_obj_bytes(struct rbd_image_header *header)
636 { 639 {
637 return 1 << header->obj_order; 640 return 1 << header->obj_order;
638 } 641 }
639 642
640 /* 643 /*
641 * bio helpers 644 * bio helpers
642 */ 645 */
643 646
644 static void bio_chain_put(struct bio *chain) 647 static void bio_chain_put(struct bio *chain)
645 { 648 {
646 struct bio *tmp; 649 struct bio *tmp;
647 650
648 while (chain) { 651 while (chain) {
649 tmp = chain; 652 tmp = chain;
650 chain = chain->bi_next; 653 chain = chain->bi_next;
651 bio_put(tmp); 654 bio_put(tmp);
652 } 655 }
653 } 656 }
654 657
655 /* 658 /*
656 * zeros a bio chain, starting at specific offset 659 * zeros a bio chain, starting at specific offset
657 */ 660 */
658 static void zero_bio_chain(struct bio *chain, int start_ofs) 661 static void zero_bio_chain(struct bio *chain, int start_ofs)
659 { 662 {
660 struct bio_vec *bv; 663 struct bio_vec *bv;
661 unsigned long flags; 664 unsigned long flags;
662 void *buf; 665 void *buf;
663 int i; 666 int i;
664 int pos = 0; 667 int pos = 0;
665 668
666 while (chain) { 669 while (chain) {
667 bio_for_each_segment(bv, chain, i) { 670 bio_for_each_segment(bv, chain, i) {
668 if (pos + bv->bv_len > start_ofs) { 671 if (pos + bv->bv_len > start_ofs) {
669 int remainder = max(start_ofs - pos, 0); 672 int remainder = max(start_ofs - pos, 0);
670 buf = bvec_kmap_irq(bv, &flags); 673 buf = bvec_kmap_irq(bv, &flags);
671 memset(buf + remainder, 0, 674 memset(buf + remainder, 0,
672 bv->bv_len - remainder); 675 bv->bv_len - remainder);
673 bvec_kunmap_irq(buf, &flags); 676 bvec_kunmap_irq(buf, &flags);
674 } 677 }
675 pos += bv->bv_len; 678 pos += bv->bv_len;
676 } 679 }
677 680
678 chain = chain->bi_next; 681 chain = chain->bi_next;
679 } 682 }
680 } 683 }
681 684
682 /* 685 /*
683 * bio_chain_clone - clone a chain of bios up to a certain length. 686 * bio_chain_clone - clone a chain of bios up to a certain length.
684 * might return a bio_pair that will need to be released. 687 * might return a bio_pair that will need to be released.
685 */ 688 */
686 static struct bio *bio_chain_clone(struct bio **old, struct bio **next, 689 static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
687 struct bio_pair **bp, 690 struct bio_pair **bp,
688 int len, gfp_t gfpmask) 691 int len, gfp_t gfpmask)
689 { 692 {
690 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL; 693 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
691 int total = 0; 694 int total = 0;
692 695
693 if (*bp) { 696 if (*bp) {
694 bio_pair_release(*bp); 697 bio_pair_release(*bp);
695 *bp = NULL; 698 *bp = NULL;
696 } 699 }
697 700
698 while (old_chain && (total < len)) { 701 while (old_chain && (total < len)) {
699 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs); 702 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
700 if (!tmp) 703 if (!tmp)
701 goto err_out; 704 goto err_out;
702 705
703 if (total + old_chain->bi_size > len) { 706 if (total + old_chain->bi_size > len) {
704 struct bio_pair *bp; 707 struct bio_pair *bp;
705 708
706 /* 709 /*
707 * this split can only happen with a single paged bio, 710 * this split can only happen with a single paged bio,
708 * split_bio will BUG_ON if this is not the case 711 * split_bio will BUG_ON if this is not the case
709 */ 712 */
710 dout("bio_chain_clone split! total=%d remaining=%d" 713 dout("bio_chain_clone split! total=%d remaining=%d"
711 "bi_size=%d\n", 714 "bi_size=%d\n",
712 (int)total, (int)len-total, 715 (int)total, (int)len-total,
713 (int)old_chain->bi_size); 716 (int)old_chain->bi_size);
714 717
715 /* split the bio. We'll release it either in the next 718 /* split the bio. We'll release it either in the next
716 call, or it will have to be released outside */ 719 call, or it will have to be released outside */
717 bp = bio_split(old_chain, (len - total) / 512ULL); 720 bp = bio_split(old_chain, (len - total) / 512ULL);
718 if (!bp) 721 if (!bp)
719 goto err_out; 722 goto err_out;
720 723
721 __bio_clone(tmp, &bp->bio1); 724 __bio_clone(tmp, &bp->bio1);
722 725
723 *next = &bp->bio2; 726 *next = &bp->bio2;
724 } else { 727 } else {
725 __bio_clone(tmp, old_chain); 728 __bio_clone(tmp, old_chain);
726 *next = old_chain->bi_next; 729 *next = old_chain->bi_next;
727 } 730 }
728 731
729 tmp->bi_bdev = NULL; 732 tmp->bi_bdev = NULL;
730 gfpmask &= ~__GFP_WAIT; 733 gfpmask &= ~__GFP_WAIT;
731 tmp->bi_next = NULL; 734 tmp->bi_next = NULL;
732 735
733 if (!new_chain) { 736 if (!new_chain) {
734 new_chain = tail = tmp; 737 new_chain = tail = tmp;
735 } else { 738 } else {
736 tail->bi_next = tmp; 739 tail->bi_next = tmp;
737 tail = tmp; 740 tail = tmp;
738 } 741 }
739 old_chain = old_chain->bi_next; 742 old_chain = old_chain->bi_next;
740 743
741 total += tmp->bi_size; 744 total += tmp->bi_size;
742 } 745 }
743 746
744 BUG_ON(total < len); 747 BUG_ON(total < len);
745 748
746 if (tail) 749 if (tail)
747 tail->bi_next = NULL; 750 tail->bi_next = NULL;
748 751
749 *old = old_chain; 752 *old = old_chain;
750 753
751 return new_chain; 754 return new_chain;
752 755
753 err_out: 756 err_out:
754 dout("bio_chain_clone with err\n"); 757 dout("bio_chain_clone with err\n");
755 bio_chain_put(new_chain); 758 bio_chain_put(new_chain);
756 return NULL; 759 return NULL;
757 } 760 }
758 761
759 /* 762 /*
760 * helpers for osd request op vectors. 763 * helpers for osd request op vectors.
761 */ 764 */
762 static int rbd_create_rw_ops(struct ceph_osd_req_op **ops, 765 static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
763 int num_ops, 766 int num_ops,
764 int opcode, 767 int opcode,
765 u32 payload_len) 768 u32 payload_len)
766 { 769 {
767 *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1), 770 *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
768 GFP_NOIO); 771 GFP_NOIO);
769 if (!*ops) 772 if (!*ops)
770 return -ENOMEM; 773 return -ENOMEM;
771 (*ops)[0].op = opcode; 774 (*ops)[0].op = opcode;
772 /* 775 /*
773 * op extent offset and length will be set later on 776 * op extent offset and length will be set later on
774 * in calc_raw_layout() 777 * in calc_raw_layout()
775 */ 778 */
776 (*ops)[0].payload_len = payload_len; 779 (*ops)[0].payload_len = payload_len;
777 return 0; 780 return 0;
778 } 781 }
779 782
780 static void rbd_destroy_ops(struct ceph_osd_req_op *ops) 783 static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
781 { 784 {
782 kfree(ops); 785 kfree(ops);
783 } 786 }
784 787
785 static void rbd_coll_end_req_index(struct request *rq, 788 static void rbd_coll_end_req_index(struct request *rq,
786 struct rbd_req_coll *coll, 789 struct rbd_req_coll *coll,
787 int index, 790 int index,
788 int ret, u64 len) 791 int ret, u64 len)
789 { 792 {
790 struct request_queue *q; 793 struct request_queue *q;
791 int min, max, i; 794 int min, max, i;
792 795
793 dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n", 796 dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
794 coll, index, ret, len); 797 coll, index, ret, len);
795 798
796 if (!rq) 799 if (!rq)
797 return; 800 return;
798 801
799 if (!coll) { 802 if (!coll) {
800 blk_end_request(rq, ret, len); 803 blk_end_request(rq, ret, len);
801 return; 804 return;
802 } 805 }
803 806
804 q = rq->q; 807 q = rq->q;
805 808
806 spin_lock_irq(q->queue_lock); 809 spin_lock_irq(q->queue_lock);
807 coll->status[index].done = 1; 810 coll->status[index].done = 1;
808 coll->status[index].rc = ret; 811 coll->status[index].rc = ret;
809 coll->status[index].bytes = len; 812 coll->status[index].bytes = len;
810 max = min = coll->num_done; 813 max = min = coll->num_done;
811 while (max < coll->total && coll->status[max].done) 814 while (max < coll->total && coll->status[max].done)
812 max++; 815 max++;
813 816
814 for (i = min; i<max; i++) { 817 for (i = min; i<max; i++) {
815 __blk_end_request(rq, coll->status[i].rc, 818 __blk_end_request(rq, coll->status[i].rc,
816 coll->status[i].bytes); 819 coll->status[i].bytes);
817 coll->num_done++; 820 coll->num_done++;
818 kref_put(&coll->kref, rbd_coll_release); 821 kref_put(&coll->kref, rbd_coll_release);
819 } 822 }
820 spin_unlock_irq(q->queue_lock); 823 spin_unlock_irq(q->queue_lock);
821 } 824 }
822 825
823 static void rbd_coll_end_req(struct rbd_request *req, 826 static void rbd_coll_end_req(struct rbd_request *req,
824 int ret, u64 len) 827 int ret, u64 len)
825 { 828 {
826 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len); 829 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
827 } 830 }
828 831
829 /* 832 /*
830 * Send ceph osd request 833 * Send ceph osd request
831 */ 834 */
832 static int rbd_do_request(struct request *rq, 835 static int rbd_do_request(struct request *rq,
833 struct rbd_device *dev, 836 struct rbd_device *dev,
834 struct ceph_snap_context *snapc, 837 struct ceph_snap_context *snapc,
835 u64 snapid, 838 u64 snapid,
836 const char *obj, u64 ofs, u64 len, 839 const char *obj, u64 ofs, u64 len,
837 struct bio *bio, 840 struct bio *bio,
838 struct page **pages, 841 struct page **pages,
839 int num_pages, 842 int num_pages,
840 int flags, 843 int flags,
841 struct ceph_osd_req_op *ops, 844 struct ceph_osd_req_op *ops,
842 int num_reply, 845 int num_reply,
843 struct rbd_req_coll *coll, 846 struct rbd_req_coll *coll,
844 int coll_index, 847 int coll_index,
845 void (*rbd_cb)(struct ceph_osd_request *req, 848 void (*rbd_cb)(struct ceph_osd_request *req,
846 struct ceph_msg *msg), 849 struct ceph_msg *msg),
847 struct ceph_osd_request **linger_req, 850 struct ceph_osd_request **linger_req,
848 u64 *ver) 851 u64 *ver)
849 { 852 {
850 struct ceph_osd_request *req; 853 struct ceph_osd_request *req;
851 struct ceph_file_layout *layout; 854 struct ceph_file_layout *layout;
852 int ret; 855 int ret;
853 u64 bno; 856 u64 bno;
854 struct timespec mtime = CURRENT_TIME; 857 struct timespec mtime = CURRENT_TIME;
855 struct rbd_request *req_data; 858 struct rbd_request *req_data;
856 struct ceph_osd_request_head *reqhead; 859 struct ceph_osd_request_head *reqhead;
857 struct rbd_image_header *header = &dev->header; 860 struct rbd_image_header *header = &dev->header;
858 861
859 req_data = kzalloc(sizeof(*req_data), GFP_NOIO); 862 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
860 if (!req_data) { 863 if (!req_data) {
861 if (coll) 864 if (coll)
862 rbd_coll_end_req_index(rq, coll, coll_index, 865 rbd_coll_end_req_index(rq, coll, coll_index,
863 -ENOMEM, len); 866 -ENOMEM, len);
864 return -ENOMEM; 867 return -ENOMEM;
865 } 868 }
866 869
867 if (coll) { 870 if (coll) {
868 req_data->coll = coll; 871 req_data->coll = coll;
869 req_data->coll_index = coll_index; 872 req_data->coll_index = coll_index;
870 } 873 }
871 874
872 dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs); 875 dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs);
873 876
874 down_read(&header->snap_rwsem); 877 down_read(&header->snap_rwsem);
875 878
876 req = ceph_osdc_alloc_request(&dev->client->osdc, flags, 879 req = ceph_osdc_alloc_request(&dev->client->osdc, flags,
877 snapc, 880 snapc,
878 ops, 881 ops,
879 false, 882 false,
880 GFP_NOIO, pages, bio); 883 GFP_NOIO, pages, bio);
881 if (!req) { 884 if (!req) {
882 up_read(&header->snap_rwsem); 885 up_read(&header->snap_rwsem);
883 ret = -ENOMEM; 886 ret = -ENOMEM;
884 goto done_pages; 887 goto done_pages;
885 } 888 }
886 889
887 req->r_callback = rbd_cb; 890 req->r_callback = rbd_cb;
888 891
889 req_data->rq = rq; 892 req_data->rq = rq;
890 req_data->bio = bio; 893 req_data->bio = bio;
891 req_data->pages = pages; 894 req_data->pages = pages;
892 req_data->len = len; 895 req_data->len = len;
893 896
894 req->r_priv = req_data; 897 req->r_priv = req_data;
895 898
896 reqhead = req->r_request->front.iov_base; 899 reqhead = req->r_request->front.iov_base;
897 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP); 900 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
898 901
899 strncpy(req->r_oid, obj, sizeof(req->r_oid)); 902 strncpy(req->r_oid, obj, sizeof(req->r_oid));
900 req->r_oid_len = strlen(req->r_oid); 903 req->r_oid_len = strlen(req->r_oid);
901 904
902 layout = &req->r_file_layout; 905 layout = &req->r_file_layout;
903 memset(layout, 0, sizeof(*layout)); 906 memset(layout, 0, sizeof(*layout));
904 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 907 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
905 layout->fl_stripe_count = cpu_to_le32(1); 908 layout->fl_stripe_count = cpu_to_le32(1);
906 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 909 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
907 layout->fl_pg_preferred = cpu_to_le32(-1); 910 layout->fl_pg_preferred = cpu_to_le32(-1);
908 layout->fl_pg_pool = cpu_to_le32(dev->poolid); 911 layout->fl_pg_pool = cpu_to_le32(dev->poolid);
909 ceph_calc_raw_layout(&dev->client->osdc, layout, snapid, 912 ceph_calc_raw_layout(&dev->client->osdc, layout, snapid,
910 ofs, &len, &bno, req, ops); 913 ofs, &len, &bno, req, ops);
911 914
912 ceph_osdc_build_request(req, ofs, &len, 915 ceph_osdc_build_request(req, ofs, &len,
913 ops, 916 ops,
914 snapc, 917 snapc,
915 &mtime, 918 &mtime,
916 req->r_oid, req->r_oid_len); 919 req->r_oid, req->r_oid_len);
917 up_read(&header->snap_rwsem); 920 up_read(&header->snap_rwsem);
918 921
919 if (linger_req) { 922 if (linger_req) {
920 ceph_osdc_set_request_linger(&dev->client->osdc, req); 923 ceph_osdc_set_request_linger(&dev->client->osdc, req);
921 *linger_req = req; 924 *linger_req = req;
922 } 925 }
923 926
924 ret = ceph_osdc_start_request(&dev->client->osdc, req, false); 927 ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
925 if (ret < 0) 928 if (ret < 0)
926 goto done_err; 929 goto done_err;
927 930
928 if (!rbd_cb) { 931 if (!rbd_cb) {
929 ret = ceph_osdc_wait_request(&dev->client->osdc, req); 932 ret = ceph_osdc_wait_request(&dev->client->osdc, req);
930 if (ver) 933 if (ver)
931 *ver = le64_to_cpu(req->r_reassert_version.version); 934 *ver = le64_to_cpu(req->r_reassert_version.version);
932 dout("reassert_ver=%lld\n", 935 dout("reassert_ver=%lld\n",
933 le64_to_cpu(req->r_reassert_version.version)); 936 le64_to_cpu(req->r_reassert_version.version));
934 ceph_osdc_put_request(req); 937 ceph_osdc_put_request(req);
935 } 938 }
936 return ret; 939 return ret;
937 940
938 done_err: 941 done_err:
939 bio_chain_put(req_data->bio); 942 bio_chain_put(req_data->bio);
940 ceph_osdc_put_request(req); 943 ceph_osdc_put_request(req);
941 done_pages: 944 done_pages:
942 rbd_coll_end_req(req_data, ret, len); 945 rbd_coll_end_req(req_data, ret, len);
943 kfree(req_data); 946 kfree(req_data);
944 return ret; 947 return ret;
945 } 948 }
946 949
947 /* 950 /*
948 * Ceph osd op callback 951 * Ceph osd op callback
949 */ 952 */
950 static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) 953 static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
951 { 954 {
952 struct rbd_request *req_data = req->r_priv; 955 struct rbd_request *req_data = req->r_priv;
953 struct ceph_osd_reply_head *replyhead; 956 struct ceph_osd_reply_head *replyhead;
954 struct ceph_osd_op *op; 957 struct ceph_osd_op *op;
955 __s32 rc; 958 __s32 rc;
956 u64 bytes; 959 u64 bytes;
957 int read_op; 960 int read_op;
958 961
959 /* parse reply */ 962 /* parse reply */
960 replyhead = msg->front.iov_base; 963 replyhead = msg->front.iov_base;
961 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); 964 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
962 op = (void *)(replyhead + 1); 965 op = (void *)(replyhead + 1);
963 rc = le32_to_cpu(replyhead->result); 966 rc = le32_to_cpu(replyhead->result);
964 bytes = le64_to_cpu(op->extent.length); 967 bytes = le64_to_cpu(op->extent.length);
965 read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ); 968 read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);
966 969
967 dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc); 970 dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
968 971
969 if (rc == -ENOENT && read_op) { 972 if (rc == -ENOENT && read_op) {
970 zero_bio_chain(req_data->bio, 0); 973 zero_bio_chain(req_data->bio, 0);
971 rc = 0; 974 rc = 0;
972 } else if (rc == 0 && read_op && bytes < req_data->len) { 975 } else if (rc == 0 && read_op && bytes < req_data->len) {
973 zero_bio_chain(req_data->bio, bytes); 976 zero_bio_chain(req_data->bio, bytes);
974 bytes = req_data->len; 977 bytes = req_data->len;
975 } 978 }
976 979
977 rbd_coll_end_req(req_data, rc, bytes); 980 rbd_coll_end_req(req_data, rc, bytes);
978 981
979 if (req_data->bio) 982 if (req_data->bio)
980 bio_chain_put(req_data->bio); 983 bio_chain_put(req_data->bio);
981 984
982 ceph_osdc_put_request(req); 985 ceph_osdc_put_request(req);
983 kfree(req_data); 986 kfree(req_data);
984 } 987 }
985 988
986 static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) 989 static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
987 { 990 {
988 ceph_osdc_put_request(req); 991 ceph_osdc_put_request(req);
989 } 992 }
990 993
991 /* 994 /*
992 * Do a synchronous ceph osd operation 995 * Do a synchronous ceph osd operation
993 */ 996 */
994 static int rbd_req_sync_op(struct rbd_device *dev, 997 static int rbd_req_sync_op(struct rbd_device *dev,
995 struct ceph_snap_context *snapc, 998 struct ceph_snap_context *snapc,
996 u64 snapid, 999 u64 snapid,
997 int opcode, 1000 int opcode,
998 int flags, 1001 int flags,
999 struct ceph_osd_req_op *orig_ops, 1002 struct ceph_osd_req_op *orig_ops,
1000 int num_reply, 1003 int num_reply,
1001 const char *obj, 1004 const char *obj,
1002 u64 ofs, u64 len, 1005 u64 ofs, u64 len,
1003 char *buf, 1006 char *buf,
1004 struct ceph_osd_request **linger_req, 1007 struct ceph_osd_request **linger_req,
1005 u64 *ver) 1008 u64 *ver)
1006 { 1009 {
1007 int ret; 1010 int ret;
1008 struct page **pages; 1011 struct page **pages;
1009 int num_pages; 1012 int num_pages;
1010 struct ceph_osd_req_op *ops = orig_ops; 1013 struct ceph_osd_req_op *ops = orig_ops;
1011 u32 payload_len; 1014 u32 payload_len;
1012 1015
1013 num_pages = calc_pages_for(ofs , len); 1016 num_pages = calc_pages_for(ofs , len);
1014 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 1017 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1015 if (IS_ERR(pages)) 1018 if (IS_ERR(pages))
1016 return PTR_ERR(pages); 1019 return PTR_ERR(pages);
1017 1020
1018 if (!orig_ops) { 1021 if (!orig_ops) {
1019 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0); 1022 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
1020 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len); 1023 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1021 if (ret < 0) 1024 if (ret < 0)
1022 goto done; 1025 goto done;
1023 1026
1024 if ((flags & CEPH_OSD_FLAG_WRITE) && buf) { 1027 if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
1025 ret = ceph_copy_to_page_vector(pages, buf, ofs, len); 1028 ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
1026 if (ret < 0) 1029 if (ret < 0)
1027 goto done_ops; 1030 goto done_ops;
1028 } 1031 }
1029 } 1032 }
1030 1033
1031 ret = rbd_do_request(NULL, dev, snapc, snapid, 1034 ret = rbd_do_request(NULL, dev, snapc, snapid,
1032 obj, ofs, len, NULL, 1035 obj, ofs, len, NULL,
1033 pages, num_pages, 1036 pages, num_pages,
1034 flags, 1037 flags,
1035 ops, 1038 ops,
1036 2, 1039 2,
1037 NULL, 0, 1040 NULL, 0,
1038 NULL, 1041 NULL,
1039 linger_req, ver); 1042 linger_req, ver);
1040 if (ret < 0) 1043 if (ret < 0)
1041 goto done_ops; 1044 goto done_ops;
1042 1045
1043 if ((flags & CEPH_OSD_FLAG_READ) && buf) 1046 if ((flags & CEPH_OSD_FLAG_READ) && buf)
1044 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret); 1047 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1045 1048
1046 done_ops: 1049 done_ops:
1047 if (!orig_ops) 1050 if (!orig_ops)
1048 rbd_destroy_ops(ops); 1051 rbd_destroy_ops(ops);
1049 done: 1052 done:
1050 ceph_release_page_vector(pages, num_pages); 1053 ceph_release_page_vector(pages, num_pages);
1051 return ret; 1054 return ret;
1052 } 1055 }
1053 1056
1054 /* 1057 /*
1055 * Do an asynchronous ceph osd operation 1058 * Do an asynchronous ceph osd operation
1056 */ 1059 */
1057 static int rbd_do_op(struct request *rq, 1060 static int rbd_do_op(struct request *rq,
1058 struct rbd_device *rbd_dev , 1061 struct rbd_device *rbd_dev ,
1059 struct ceph_snap_context *snapc, 1062 struct ceph_snap_context *snapc,
1060 u64 snapid, 1063 u64 snapid,
1061 int opcode, int flags, int num_reply, 1064 int opcode, int flags, int num_reply,
1062 u64 ofs, u64 len, 1065 u64 ofs, u64 len,
1063 struct bio *bio, 1066 struct bio *bio,
1064 struct rbd_req_coll *coll, 1067 struct rbd_req_coll *coll,
1065 int coll_index) 1068 int coll_index)
1066 { 1069 {
1067 char *seg_name; 1070 char *seg_name;
1068 u64 seg_ofs; 1071 u64 seg_ofs;
1069 u64 seg_len; 1072 u64 seg_len;
1070 int ret; 1073 int ret;
1071 struct ceph_osd_req_op *ops; 1074 struct ceph_osd_req_op *ops;
1072 u32 payload_len; 1075 u32 payload_len;
1073 1076
1074 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); 1077 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
1075 if (!seg_name) 1078 if (!seg_name)
1076 return -ENOMEM; 1079 return -ENOMEM;
1077 1080
1078 seg_len = rbd_get_segment(&rbd_dev->header, 1081 seg_len = rbd_get_segment(&rbd_dev->header,
1079 rbd_dev->header.block_name, 1082 rbd_dev->header.block_name,
1080 ofs, len, 1083 ofs, len,
1081 seg_name, &seg_ofs); 1084 seg_name, &seg_ofs);
1082 1085
1083 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0); 1086 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1084 1087
1085 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len); 1088 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1086 if (ret < 0) 1089 if (ret < 0)
1087 goto done; 1090 goto done;
1088 1091
1089 /* we've taken care of segment sizes earlier when we 1092 /* we've taken care of segment sizes earlier when we
1090 cloned the bios. We should never have a segment 1093 cloned the bios. We should never have a segment
1091 truncated at this point */ 1094 truncated at this point */
1092 BUG_ON(seg_len < len); 1095 BUG_ON(seg_len < len);
1093 1096
1094 ret = rbd_do_request(rq, rbd_dev, snapc, snapid, 1097 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1095 seg_name, seg_ofs, seg_len, 1098 seg_name, seg_ofs, seg_len,
1096 bio, 1099 bio,
1097 NULL, 0, 1100 NULL, 0,
1098 flags, 1101 flags,
1099 ops, 1102 ops,
1100 num_reply, 1103 num_reply,
1101 coll, coll_index, 1104 coll, coll_index,
1102 rbd_req_cb, 0, NULL); 1105 rbd_req_cb, 0, NULL);
1103 1106
1104 rbd_destroy_ops(ops); 1107 rbd_destroy_ops(ops);
1105 done: 1108 done:
1106 kfree(seg_name); 1109 kfree(seg_name);
1107 return ret; 1110 return ret;
1108 } 1111 }

/*
 * Request async osd write
 */
static int rbd_req_write(struct request *rq,
                         struct rbd_device *rbd_dev,
                         struct ceph_snap_context *snapc,
                         u64 ofs, u64 len,
                         struct bio *bio,
                         struct rbd_req_coll *coll,
                         int coll_index)
{
        return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
                         CEPH_OSD_OP_WRITE,
                         CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
                         2,
                         ofs, len, bio, coll, coll_index);
}

/*
 * Request async osd read
 */
static int rbd_req_read(struct request *rq,
                        struct rbd_device *rbd_dev,
                        u64 snapid,
                        u64 ofs, u64 len,
                        struct bio *bio,
                        struct rbd_req_coll *coll,
                        int coll_index)
{
        return rbd_do_op(rq, rbd_dev, NULL,
                         (snapid ? snapid : CEPH_NOSNAP),
                         CEPH_OSD_OP_READ,
                         CEPH_OSD_FLAG_READ,
                         2,
                         ofs, len, bio, coll, coll_index);
}

/*
 * Request sync osd read
 */
static int rbd_req_sync_read(struct rbd_device *dev,
                             struct ceph_snap_context *snapc,
                             u64 snapid,
                             const char *obj,
                             u64 ofs, u64 len,
                             char *buf,
                             u64 *ver)
{
        return rbd_req_sync_op(dev, NULL,
                               (snapid ? snapid : CEPH_NOSNAP),
                               CEPH_OSD_OP_READ,
                               CEPH_OSD_FLAG_READ,
                               NULL,
                               1, obj, ofs, len, buf, NULL, ver);
}

/*
 * Request sync osd notify_ack (acknowledge a watch notification)
 */
static int rbd_req_sync_notify_ack(struct rbd_device *dev,
                                   u64 ver,
                                   u64 notify_id,
                                   const char *obj)
{
        struct ceph_osd_req_op *ops;
        struct page **pages = NULL;
        int ret;

        ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
        if (ret < 0)
                return ret;

        ops[0].watch.ver = cpu_to_le64(dev->header.obj_version);
        ops[0].watch.cookie = notify_id;
        ops[0].watch.flag = 0;

        ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP,
                             obj, 0, 0, NULL,
                             pages, 0,
                             CEPH_OSD_FLAG_READ,
                             ops,
                             1,
                             NULL, 0,
                             rbd_simple_req_cb, 0, NULL);

        rbd_destroy_ops(ops);
        return ret;
}

static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
        struct rbd_device *dev = (struct rbd_device *)data;
        int rc;

        if (!dev)
                return;

        dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
                notify_id, (int)opcode);
        mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
        rc = __rbd_update_snaps(dev);
        mutex_unlock(&ctl_mutex);
        if (rc)
                pr_warning(DRV_NAME "%d got notification but failed to update"
                           " snaps: %d\n", dev->major, rc);

        rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name);
}

/*
 * Request sync osd watch
 */
static int rbd_req_sync_watch(struct rbd_device *dev,
                              const char *obj,
                              u64 ver)
{
        struct ceph_osd_req_op *ops;
        struct ceph_osd_client *osdc = &dev->client->osdc;

        int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
        if (ret < 0)
                return ret;

        ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
                                     (void *)dev, &dev->watch_event);
        if (ret < 0)
                goto fail;

        ops[0].watch.ver = cpu_to_le64(ver);
        ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
        ops[0].watch.flag = 1;

        ret = rbd_req_sync_op(dev, NULL,
                              CEPH_NOSNAP,
                              0,
                              CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
                              ops,
                              1, obj, 0, 0, NULL,
                              &dev->watch_request, NULL);

        if (ret < 0)
                goto fail_event;

        rbd_destroy_ops(ops);
        return 0;

fail_event:
        ceph_osdc_cancel_event(dev->watch_event);
        dev->watch_event = NULL;
fail:
        rbd_destroy_ops(ops);
        return ret;
}

/*
 * Request sync osd unwatch
 */
static int rbd_req_sync_unwatch(struct rbd_device *dev,
                                const char *obj)
{
        struct ceph_osd_req_op *ops;

        int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
        if (ret < 0)
                return ret;

        ops[0].watch.ver = 0;
        ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
        ops[0].watch.flag = 0;

        ret = rbd_req_sync_op(dev, NULL,
                              CEPH_NOSNAP,
                              0,
                              CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
                              ops,
                              1, obj, 0, 0, NULL, NULL, NULL);

        rbd_destroy_ops(ops);
        ceph_osdc_cancel_event(dev->watch_event);
        dev->watch_event = NULL;
        return ret;
}

struct rbd_notify_info {
        struct rbd_device *dev;
};

static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
        struct rbd_device *dev = (struct rbd_device *)data;
        if (!dev)
                return;

        dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
                notify_id, (int)opcode);
}

/*
 * Request sync osd notify
 */
static int rbd_req_sync_notify(struct rbd_device *dev,
                               const char *obj)
{
        struct ceph_osd_req_op *ops;
        struct ceph_osd_client *osdc = &dev->client->osdc;
        struct ceph_osd_event *event;
        struct rbd_notify_info info;
        int payload_len = sizeof(u32) + sizeof(u32);
        int ret;

        ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
        if (ret < 0)
                return ret;

        info.dev = dev;

        ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
                                     (void *)&info, &event);
        if (ret < 0)
                goto fail;

        ops[0].watch.ver = 1;
        ops[0].watch.flag = 1;
        ops[0].watch.cookie = event->cookie;
        ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
        ops[0].watch.timeout = 12;

        ret = rbd_req_sync_op(dev, NULL,
                              CEPH_NOSNAP,
                              0,
                              CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
                              ops,
                              1, obj, 0, 0, NULL, NULL, NULL);
        if (ret < 0)
                goto fail_event;

        ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
        dout("ceph_osdc_wait_event returned %d\n", ret);
        rbd_destroy_ops(ops);
        return 0;

fail_event:
        ceph_osdc_cancel_event(event);
fail:
        rbd_destroy_ops(ops);
        return ret;
}

/*
 * Request sync osd class method call (exec)
 */
static int rbd_req_sync_exec(struct rbd_device *dev,
                             const char *obj,
                             const char *cls,
                             const char *method,
                             const char *data,
                             int len,
                             u64 *ver)
{
        struct ceph_osd_req_op *ops;
        int cls_len = strlen(cls);
        int method_len = strlen(method);
        int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
                                    cls_len + method_len + len);
        if (ret < 0)
                return ret;

        ops[0].cls.class_name = cls;
        ops[0].cls.class_len = (__u8)cls_len;
        ops[0].cls.method_name = method;
        ops[0].cls.method_len = (__u8)method_len;
        ops[0].cls.argc = 0;
        ops[0].cls.indata = data;
        ops[0].cls.indata_len = len;

        ret = rbd_req_sync_op(dev, NULL,
                              CEPH_NOSNAP,
                              0,
                              CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
                              ops,
                              1, obj, 0, 0, NULL, NULL, ver);

        rbd_destroy_ops(ops);

        dout("cls_exec returned %d\n", ret);
        return ret;
}

static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
{
        struct rbd_req_coll *coll =
                        kzalloc(sizeof(struct rbd_req_coll) +
                                sizeof(struct rbd_req_status) * num_reqs,
                                GFP_ATOMIC);

        if (!coll)
                return NULL;
        coll->total = num_reqs;
        kref_init(&coll->kref);
        return coll;
}
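
/*
 * [illustrative aside] The kzalloc() above carves a single block holding
 * the bookkeeping fields plus num_reqs per-segment status slots.  Below is
 * a hedged sketch of that layout; the real struct rbd_req_coll is defined
 * earlier in this file, and everything here beyond 'total' and 'kref'
 * (which the function fills in) is an assumption for illustration only.
 */
#if 0   /* layout sketch, not compiled with the driver */
struct coll_layout_sketch {
        int total;                              /* set to num_reqs */
        struct kref kref;                       /* set up by kref_init() */
        struct rbd_req_status status[];         /* one slot per segment */
};
/* sizeof() of a struct with a flexible array member excludes the array,
 * which is exactly why the allocation above adds
 * sizeof(struct rbd_req_status) * num_reqs on top. */
#endif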

/*
 * block device queue callback
 */
static void rbd_rq_fn(struct request_queue *q)
{
        struct rbd_device *rbd_dev = q->queuedata;
        struct request *rq;
        struct bio_pair *bp = NULL;

        rq = blk_fetch_request(q);

        while (1) {
                struct bio *bio;
                struct bio *rq_bio, *next_bio = NULL;
                bool do_write;
                int size, op_size = 0;
                u64 ofs;
                int num_segs, cur_seg = 0;
                struct rbd_req_coll *coll;

                /* peek at request from block layer */
                if (!rq)
                        break;

                dout("fetched request\n");

                /* filter out block requests we don't understand */
                if ((rq->cmd_type != REQ_TYPE_FS)) {
                        __blk_end_request_all(rq, 0);
                        goto next;
                }

                /* deduce our operation (read, write) */
                do_write = (rq_data_dir(rq) == WRITE);

                size = blk_rq_bytes(rq);
                ofs = blk_rq_pos(rq) * 512ULL;
                rq_bio = rq->bio;
                if (do_write && rbd_dev->read_only) {
                        __blk_end_request_all(rq, -EROFS);
                        goto next;
                }

                spin_unlock_irq(q->queue_lock);

                dout("%s 0x%x bytes at 0x%llx\n",
                     do_write ? "write" : "read",
                     size, blk_rq_pos(rq) * 512ULL);

                num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
                coll = rbd_alloc_coll(num_segs);
                if (!coll) {
                        spin_lock_irq(q->queue_lock);
                        __blk_end_request_all(rq, -ENOMEM);
                        goto next;
                }

                do {
                        /* a bio clone to be passed down to OSD req */
                        dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
                        op_size = rbd_get_segment(&rbd_dev->header,
                                                  rbd_dev->header.block_name,
                                                  ofs, size,
                                                  NULL, NULL);
                        kref_get(&coll->kref);
                        bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
                                              op_size, GFP_ATOMIC);
                        if (!bio) {
                                rbd_coll_end_req_index(rq, coll, cur_seg,
                                                       -ENOMEM, op_size);
                                goto next_seg;
                        }


                        /* init OSD command: write or read */
                        if (do_write)
                                rbd_req_write(rq, rbd_dev,
                                              rbd_dev->header.snapc,
                                              ofs,
                                              op_size, bio,
                                              coll, cur_seg);
                        else
                                rbd_req_read(rq, rbd_dev,
                                             cur_snap_id(rbd_dev),
                                             ofs,
                                             op_size, bio,
                                             coll, cur_seg);

next_seg:
                        size -= op_size;
                        ofs += op_size;

                        cur_seg++;
                        rq_bio = next_bio;
                } while (size > 0);
                kref_put(&coll->kref, rbd_coll_release);

                if (bp)
                        bio_pair_release(bp);
                spin_lock_irq(q->queue_lock);
next:
                rq = blk_fetch_request(q);
        }
}
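
/*
 * [illustrative aside] The do/while loop above carves each block-layer
 * request into per-object chunks via rbd_get_segment().  Below is a
 * hedged, standalone rerun of just the offset arithmetic, assuming 4 MiB
 * objects; the computation mimics what the driver does for each segment
 * but is not the driver's helper.
 */
#if 0   /* userspace demo, not compiled with the driver */
#include <stdio.h>

int main(void)
{
        unsigned long long obj_bytes = 1ULL << 22;      /* 4 MiB objects */
        unsigned long long ofs = 3ULL << 20;            /* request start: 3 MiB */
        unsigned long long size = 6ULL << 20;           /* request length: 6 MiB */

        while (size > 0) {
                /* bytes left in the object that contains 'ofs' */
                unsigned long long seg = obj_bytes - (ofs & (obj_bytes - 1));

                if (seg > size)
                        seg = size;
                printf("object %llu: ofs=%llu len=%llu\n",
                       ofs / obj_bytes, ofs & (obj_bytes - 1), seg);
                ofs += seg;
                size -= seg;
        }
        return 0;       /* prints three segments: 1 MiB, 4 MiB, 1 MiB */
}
#endif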

/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
                          struct bio_vec *bvec)
{
        struct rbd_device *rbd_dev = q->queuedata;
        unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
        sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
        unsigned int bio_sectors = bmd->bi_size >> 9;
        int max;

        max = (chunk_sectors - ((sector & (chunk_sectors - 1))
                                + bio_sectors)) << 9;
        if (max < 0)
                max = 0; /* bio_add cannot handle a negative return */
        if (max <= bvec->bv_len && bio_sectors == 0)
                return bvec->bv_len;
        return max;
}
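
/*
 * [illustrative aside] A hedged numeric walk of the boundary check above,
 * with made-up values: obj_order = 22 (4 MiB objects), so chunk_sectors
 * comes out to 8192.
 */
#if 0   /* userspace demo, not compiled with the driver */
#include <stdio.h>

int main(void)
{
        unsigned int chunk_sectors = 1 << (22 - 9);     /* 8192 sectors */
        unsigned long long sector = 8000;               /* bio start sector */
        unsigned int bio_sectors = 128;                 /* already in the bio */
        int max;

        /* same formula as rbd_merge_bvec(): sectors of room left before
         * the object boundary, converted back to bytes */
        max = (int)(chunk_sectors - ((sector & (chunk_sectors - 1))
                                     + bio_sectors)) << 9;
        if (max < 0)
                max = 0;
        printf("max = %d\n", max);      /* 64 sectors -> 32768 bytes */
        return 0;
}
#endif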

static void rbd_free_disk(struct rbd_device *rbd_dev)
{
        struct gendisk *disk = rbd_dev->disk;

        if (!disk)
                return;

        rbd_header_free(&rbd_dev->header);

        if (disk->flags & GENHD_FL_UP)
                del_gendisk(disk);
        if (disk->queue)
                blk_cleanup_queue(disk->queue);
        put_disk(disk);
}

/*
 * reload the on-disk header
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
                           struct rbd_image_header *header)
{
        ssize_t rc;
        struct rbd_image_header_ondisk *dh;
        int snap_count = 0;
        u64 snap_names_len = 0;
        u64 ver;

        while (1) {
                int len = sizeof(*dh) +
                          snap_count * sizeof(struct rbd_image_snap_ondisk) +
                          snap_names_len;

                rc = -ENOMEM;
                dh = kmalloc(len, GFP_KERNEL);
                if (!dh)
                        return -ENOMEM;

                rc = rbd_req_sync_read(rbd_dev,
                                       NULL, CEPH_NOSNAP,
                                       rbd_dev->obj_md_name,
                                       0, len,
                                       (char *)dh, &ver);
                if (rc < 0)
                        goto out_dh;

                rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
                if (rc < 0) {
                        if (rc == -ENXIO) {
                                pr_warning("unrecognized header format"
                                           " for image %s", rbd_dev->obj);
                        }
                        goto out_dh;
                }

                if (snap_count != header->total_snaps) {
                        snap_count = header->total_snaps;
                        snap_names_len = header->snap_names_len;
                        rbd_header_free(header);
                        kfree(dh);
                        continue;
                }
                break;
        }
        header->obj_version = ver;

out_dh:
        kfree(dh);
        return rc;
}

/*
 * create a snapshot
 */
static int rbd_header_add_snap(struct rbd_device *dev,
                               const char *snap_name,
                               gfp_t gfp_flags)
{
        int name_len = strlen(snap_name);
        u64 new_snapid;
        int ret;
        void *data, *p, *e;
        u64 ver;

        /* we should create a snapshot only if we're pointing at the head */
        if (dev->cur_snap)
                return -EINVAL;

        ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid,
                                      &new_snapid);
        dout("created snapid=%lld\n", new_snapid);
        if (ret < 0)
                return ret;

        data = kmalloc(name_len + 16, gfp_flags);
        if (!data)
                return -ENOMEM;

        p = data;
        e = data + name_len + 16;

        ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
        ceph_encode_64_safe(&p, e, new_snapid, bad);

        ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
                                data, p - data, &ver);

        kfree(data);

        if (ret < 0)
                return ret;

        dev->header.snapc->seq = new_snapid;

        return 0;
bad:
        return -ERANGE;
}

static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
{
        struct rbd_snap *snap;

        while (!list_empty(&rbd_dev->snaps)) {
                snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
                __rbd_remove_snap_dev(rbd_dev, snap);
        }
}

/*
 * re-read the on-disk header and bring the in-memory snapshot
 * context in line with it
 */
static int __rbd_update_snaps(struct rbd_device *rbd_dev)
{
        int ret;
        struct rbd_image_header h;
        u64 snap_seq;
        int follow_seq = 0;

        ret = rbd_read_header(rbd_dev, &h);
        if (ret < 0)
                return ret;

        /* resized? */
        set_capacity(rbd_dev->disk, h.image_size / 512ULL);

        down_write(&rbd_dev->header.snap_rwsem);

        snap_seq = rbd_dev->header.snapc->seq;
        if (rbd_dev->header.total_snaps &&
            rbd_dev->header.snapc->snaps[0] == snap_seq)
                /* pointing at the head, will need to follow that
                   if head moves */
                follow_seq = 1;

        kfree(rbd_dev->header.snapc);
        kfree(rbd_dev->header.snap_names);
        kfree(rbd_dev->header.snap_sizes);

        rbd_dev->header.total_snaps = h.total_snaps;
        rbd_dev->header.snapc = h.snapc;
        rbd_dev->header.snap_names = h.snap_names;
        rbd_dev->header.snap_names_len = h.snap_names_len;
        rbd_dev->header.snap_sizes = h.snap_sizes;
        if (follow_seq)
                rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
        else
                rbd_dev->header.snapc->seq = snap_seq;

        ret = __rbd_init_snaps_header(rbd_dev);

        up_write(&rbd_dev->header.snap_rwsem);

        return ret;
}

static int rbd_init_disk(struct rbd_device *rbd_dev)
{
        struct gendisk *disk;
        struct request_queue *q;
        int rc;
        u64 total_size = 0;

        /* contact OSD, request size info about the object being mapped */
        rc = rbd_read_header(rbd_dev, &rbd_dev->header);
        if (rc)
                return rc;

        /* no need to lock here, as rbd_dev is not registered yet */
        rc = __rbd_init_snaps_header(rbd_dev);
        if (rc)
                return rc;

        rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
        if (rc)
                return rc;

        /* create gendisk info */
        rc = -ENOMEM;
        disk = alloc_disk(RBD_MINORS_PER_MAJOR);
        if (!disk)
                goto out;

        snprintf(disk->disk_name, sizeof(disk->disk_name), DRV_NAME "%d",
                 rbd_dev->id);
        disk->major = rbd_dev->major;
        disk->first_minor = 0;
        disk->fops = &rbd_bd_ops;
        disk->private_data = rbd_dev;

        /* init rq */
        rc = -ENOMEM;
        q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
        if (!q)
                goto out_disk;

        /* set io sizes to object size */
        blk_queue_max_hw_sectors(q, rbd_obj_bytes(&rbd_dev->header) / 512ULL);
        blk_queue_max_segment_size(q, rbd_obj_bytes(&rbd_dev->header));
        blk_queue_io_min(q, rbd_obj_bytes(&rbd_dev->header));
        blk_queue_io_opt(q, rbd_obj_bytes(&rbd_dev->header));

        blk_queue_merge_bvec(q, rbd_merge_bvec);
        disk->queue = q;

        q->queuedata = rbd_dev;

        rbd_dev->disk = disk;
        rbd_dev->q = q;

        /* finally, announce the disk to the world */
        set_capacity(disk, total_size / 512ULL);
        add_disk(disk);

        pr_info("%s: added with size 0x%llx\n",
                disk->disk_name, (unsigned long long)total_size);
        return 0;

out_disk:
        put_disk(disk);
out:
        return rc;
}

/*
   sysfs
*/

static ssize_t rbd_size_show(struct device *dev,
                             struct device_attribute *attr, char *buf)
{
        struct rbd_device *rbd_dev = dev_to_rbd(dev);

        return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
}

static ssize_t rbd_major_show(struct device *dev,
                              struct device_attribute *attr, char *buf)
{
        struct rbd_device *rbd_dev = dev_to_rbd(dev);

        return sprintf(buf, "%d\n", rbd_dev->major);
}

static ssize_t rbd_client_id_show(struct device *dev,
                                  struct device_attribute *attr, char *buf)
{
        struct rbd_device *rbd_dev = dev_to_rbd(dev);

        return sprintf(buf, "client%lld\n", ceph_client_id(rbd_dev->client));
}

static ssize_t rbd_pool_show(struct device *dev,
                             struct device_attribute *attr, char *buf)
{
        struct rbd_device *rbd_dev = dev_to_rbd(dev);

        return sprintf(buf, "%s\n", rbd_dev->pool_name);
}

static ssize_t rbd_name_show(struct device *dev,
                             struct device_attribute *attr, char *buf)
{
        struct rbd_device *rbd_dev = dev_to_rbd(dev);

        return sprintf(buf, "%s\n", rbd_dev->obj);
}

static ssize_t rbd_snap_show(struct device *dev,
                             struct device_attribute *attr,
                             char *buf)
{
        struct rbd_device *rbd_dev = dev_to_rbd(dev);

        return sprintf(buf, "%s\n", rbd_dev->snap_name);
}

static ssize_t rbd_image_refresh(struct device *dev,
                                 struct device_attribute *attr,
                                 const char *buf,
                                 size_t size)
{
        struct rbd_device *rbd_dev = dev_to_rbd(dev);
        int rc;
        int ret = size;

        mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

        rc = __rbd_update_snaps(rbd_dev);
        if (rc < 0)
                ret = rc;

        mutex_unlock(&ctl_mutex);
        return ret;
}

static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);

static struct attribute *rbd_attrs[] = {
        &dev_attr_size.attr,
        &dev_attr_major.attr,
        &dev_attr_client_id.attr,
        &dev_attr_pool.attr,
        &dev_attr_name.attr,
        &dev_attr_current_snap.attr,
        &dev_attr_refresh.attr,
        &dev_attr_create_snap.attr,
        NULL
};

static struct attribute_group rbd_attr_group = {
        .attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
        &rbd_attr_group,
        NULL
};

static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
        .name           = "rbd",
        .groups         = rbd_attr_groups,
        .release        = rbd_sysfs_dev_release,
};

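/*
 * [illustrative aside] The attribute group above is what surfaces each
 * mapped image under sysfs.  A hedged userspace sketch of reading one of
 * those attributes follows; the /sys/bus/rbd/devices/<id>/ path is an
 * assumption based on the bus and device registration in this file, and
 * device id 0 is a placeholder.
 */
#if 0   /* userspace demo, not compiled with the driver */
#include <stdio.h>

int main(void)
{
        char buf[64];
        FILE *f = fopen("/sys/bus/rbd/devices/0/size", "r");

        if (!f)
                return 1;
        if (fgets(buf, sizeof(buf), f))
                printf("image size: %s", buf);  /* bytes, from rbd_size_show() */
        fclose(f);
        return 0;
}
#endif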

/*
  sysfs - snapshots
*/

static ssize_t rbd_snap_size_show(struct device *dev,
                                  struct device_attribute *attr,
                                  char *buf)
{
        struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

        return sprintf(buf, "%lld\n", (long long)snap->size);
}

static ssize_t rbd_snap_id_show(struct device *dev,
                                struct device_attribute *attr,
                                char *buf)
{
        struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

        return sprintf(buf, "%lld\n", (long long)snap->id);
}

static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
        &dev_attr_snap_size.attr,
        &dev_attr_snap_id.attr,
        NULL,
};

static struct attribute_group rbd_snap_attr_group = {
        .attrs = rbd_snap_attrs,
};

static void rbd_snap_dev_release(struct device *dev)
{
        struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
        kfree(snap->name);
        kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
        &rbd_snap_attr_group,
        NULL
};

static struct device_type rbd_snap_device_type = {
        .groups         = rbd_snap_attr_groups,
        .release        = rbd_snap_dev_release,
};

static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
                                  struct rbd_snap *snap)
{
        list_del(&snap->node);
        device_unregister(&snap->dev);
}

static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
                                 struct rbd_snap *snap,
                                 struct device *parent)
{
        struct device *dev = &snap->dev;
        int ret;

        dev->type = &rbd_snap_device_type;
        dev->parent = parent;
        dev->release = rbd_snap_dev_release;
        dev_set_name(dev, "snap_%s", snap->name);
        ret = device_register(dev);

        return ret;
}

static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
                              int i, const char *name,
                              struct rbd_snap **snapp)
{
        int ret;
        struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
        if (!snap)
                return -ENOMEM;
        snap->name = kstrdup(name, GFP_KERNEL);
        snap->size = rbd_dev->header.snap_sizes[i];
        snap->id = rbd_dev->header.snapc->snaps[i];
        if (device_is_registered(&rbd_dev->dev)) {
                ret = rbd_register_snap_dev(rbd_dev, snap,
                                            &rbd_dev->dev);
                if (ret < 0)
                        goto err;
        }
        *snapp = snap;
        return 0;
err:
        kfree(snap->name);
        kfree(snap);
        return ret;
}

/*
 * search for the previous snap in a null delimited string list
 */
const char *rbd_prev_snap_name(const char *name, const char *start)
{
        if (name < start + 2)
                return NULL;

        name -= 2;
        while (*name) {
                if (name == start)
                        return start;
                name--;
        }
        return name + 1;
}
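
/*
 * [illustrative aside] A hedged, standalone walk of the helper above on
 * made-up data, showing how the backwards scan lands on each preceding
 * name in the NUL-delimited buffer.
 */
#if 0   /* userspace demo, not compiled with the driver */
#include <stdio.h>

static const char *prev_name(const char *name, const char *start)
{
        /* verbatim logic of rbd_prev_snap_name() above */
        if (name < start + 2)
                return NULL;
        name -= 2;
        while (*name) {
                if (name == start)
                        return start;
                name--;
        }
        return name + 1;
}

int main(void)
{
        static const char names[] = "alpha\0beta";      /* "alpha\0beta\0" */
        const char *end = names + sizeof(names);        /* one past last NUL */
        const char *p = prev_name(end, names);

        printf("%s\n", p);                      /* beta */
        p = prev_name(p, names);
        printf("%s\n", p);                      /* alpha */
        p = prev_name(p, names);
        printf("%s\n", p ? p : "(none)");       /* (none) */
        return 0;
}
#endif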

/*
 * compare the old list of snapshots that we have to what's in the header
 * and update it accordingly. Note that the header holds the snapshots
 * in reverse order (from newest to oldest) and we need to go from
 * oldest to newest so that we don't get a duplicate snap name while
 * doing so (e.g., a snapshot that was removed and then recreated
 * with the same name).
 */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
{
        const char *name, *first_name;
        int i = rbd_dev->header.total_snaps;
        struct rbd_snap *snap, *old_snap = NULL;
        int ret;
        struct list_head *p, *n;

        first_name = rbd_dev->header.snap_names;
        name = first_name + rbd_dev->header.snap_names_len;

        list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
                u64 cur_id;

                old_snap = list_entry(p, struct rbd_snap, node);

                if (i)
                        cur_id = rbd_dev->header.snapc->snaps[i - 1];

                if (!i || old_snap->id < cur_id) {
                        /* old_snap->id was skipped, thus was removed */
                        __rbd_remove_snap_dev(rbd_dev, old_snap);
                        continue;
                }
                if (old_snap->id == cur_id) {
                        /* we have this snapshot already */
                        i--;
                        name = rbd_prev_snap_name(name, first_name);
                        continue;
                }
                for (; i > 0;
                     i--, name = rbd_prev_snap_name(name, first_name)) {
                        if (!name) {
                                WARN_ON(1);
                                return -EINVAL;
                        }
                        cur_id = rbd_dev->header.snapc->snaps[i];
                        /* snapshot removal? handle it above */
                        if (cur_id >= old_snap->id)
                                break;
                        /* a new snapshot */
                        ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
                        if (ret < 0)
                                return ret;

                        /* note that we add it backward so using n and not p */
                        list_add(&snap->node, n);
                        p = &snap->node;
                }
        }
        /* we're done going over the old snap list, just add what's left */
        for (; i > 0; i--) {
                name = rbd_prev_snap_name(name, first_name);
                if (!name) {
                        WARN_ON(1);
                        return -EINVAL;
                }
                ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
                if (ret < 0)
                        return ret;
                list_add(&snap->node, &rbd_dev->snaps);
        }

        return 0;
}
2087 2090
2088 2091
2089 static void rbd_root_dev_release(struct device *dev) 2092 static void rbd_root_dev_release(struct device *dev)
2090 { 2093 {
2091 } 2094 }
2092 2095
2093 static struct device rbd_root_dev = { 2096 static struct device rbd_root_dev = {
2094 .init_name = "rbd", 2097 .init_name = "rbd",
2095 .release = rbd_root_dev_release, 2098 .release = rbd_root_dev_release,
2096 }; 2099 };
2097 2100
2098 static int rbd_bus_add_dev(struct rbd_device *rbd_dev) 2101 static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2099 { 2102 {
2100 int ret = -ENOMEM; 2103 int ret = -ENOMEM;
2101 struct device *dev; 2104 struct device *dev;
2102 struct rbd_snap *snap; 2105 struct rbd_snap *snap;
2103 2106
2104 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2107 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2105 dev = &rbd_dev->dev; 2108 dev = &rbd_dev->dev;
2106 2109
2107 dev->bus = &rbd_bus_type; 2110 dev->bus = &rbd_bus_type;
2108 dev->type = &rbd_device_type; 2111 dev->type = &rbd_device_type;
2109 dev->parent = &rbd_root_dev; 2112 dev->parent = &rbd_root_dev;
2110 dev->release = rbd_dev_release; 2113 dev->release = rbd_dev_release;
2111 dev_set_name(dev, "%d", rbd_dev->id); 2114 dev_set_name(dev, "%d", rbd_dev->id);
2112 ret = device_register(dev); 2115 ret = device_register(dev);
2113 if (ret < 0) 2116 if (ret < 0)
2114 goto done_free; 2117 goto done_free;
2115 2118
2116 list_for_each_entry(snap, &rbd_dev->snaps, node) { 2119 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2117 ret = rbd_register_snap_dev(rbd_dev, snap, 2120 ret = rbd_register_snap_dev(rbd_dev, snap,
2118 &rbd_dev->dev); 2121 &rbd_dev->dev);
2119 if (ret < 0) 2122 if (ret < 0)
2120 break; 2123 break;
2121 } 2124 }
2122 2125
2123 mutex_unlock(&ctl_mutex); 2126 mutex_unlock(&ctl_mutex);
2124 return 0; 2127 return 0;
2125 done_free: 2128 done_free:
2126 mutex_unlock(&ctl_mutex); 2129 mutex_unlock(&ctl_mutex);
2127 return ret; 2130 return ret;
2128 } 2131 }
2129 2132
2130 static void rbd_bus_del_dev(struct rbd_device *rbd_dev) 2133 static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2131 { 2134 {
2132 device_unregister(&rbd_dev->dev); 2135 device_unregister(&rbd_dev->dev);
2133 } 2136 }
2134 2137
2135 static int rbd_init_watch_dev(struct rbd_device *rbd_dev) 2138 static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2136 { 2139 {
2137 int ret, rc; 2140 int ret, rc;
2138 2141
2139 do { 2142 do {
2140 ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name, 2143 ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name,
2141 rbd_dev->header.obj_version); 2144 rbd_dev->header.obj_version);
2142 if (ret == -ERANGE) { 2145 if (ret == -ERANGE) {
2143 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2146 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2144 rc = __rbd_update_snaps(rbd_dev); 2147 rc = __rbd_update_snaps(rbd_dev);
2145 mutex_unlock(&ctl_mutex); 2148 mutex_unlock(&ctl_mutex);
2146 if (rc < 0) 2149 if (rc < 0)
2147 return rc; 2150 return rc;
2148 } 2151 }
2149 } while (ret == -ERANGE); 2152 } while (ret == -ERANGE);
2150 2153
2151 return ret; 2154 return ret;
2152 } 2155 }
2153 2156
2154 static ssize_t rbd_add(struct bus_type *bus, 2157 static ssize_t rbd_add(struct bus_type *bus,
2155 const char *buf, 2158 const char *buf,
2156 size_t count) 2159 size_t count)
2157 { 2160 {
2158 struct ceph_osd_client *osdc; 2161 struct ceph_osd_client *osdc;
2159 struct rbd_device *rbd_dev; 2162 struct rbd_device *rbd_dev;
2160 ssize_t rc = -ENOMEM; 2163 ssize_t rc = -ENOMEM;
2161 int irc, new_id = 0; 2164 int irc, new_id = 0;
2162 struct list_head *tmp; 2165 struct list_head *tmp;
2163 char *mon_dev_name; 2166 char *mon_dev_name;
2164 char *options; 2167 char *options;
2165 2168
2166 if (!try_module_get(THIS_MODULE)) 2169 if (!try_module_get(THIS_MODULE))
2167 return -ENODEV; 2170 return -ENODEV;
2168 2171
2169 mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL); 2172 mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
2170 if (!mon_dev_name) 2173 if (!mon_dev_name)
2171 goto err_out_mod; 2174 goto err_out_mod;
2172 2175
2173 options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL); 2176 options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
2174 if (!options) 2177 if (!options)
2175 goto err_mon_dev; 2178 goto err_mon_dev;
2176 2179
2177 /* new rbd_device object */ 2180 /* new rbd_device object */
2178 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); 2181 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2179 if (!rbd_dev) 2182 if (!rbd_dev)
2180 goto err_out_opt; 2183 goto err_out_opt;
2181 2184
2182 /* static rbd_device initialization */ 2185 /* static rbd_device initialization */
2183 spin_lock_init(&rbd_dev->lock); 2186 spin_lock_init(&rbd_dev->lock);
2184 INIT_LIST_HEAD(&rbd_dev->node); 2187 INIT_LIST_HEAD(&rbd_dev->node);
2185 INIT_LIST_HEAD(&rbd_dev->snaps); 2188 INIT_LIST_HEAD(&rbd_dev->snaps);
2186 2189
2187 init_rwsem(&rbd_dev->header.snap_rwsem); 2190 init_rwsem(&rbd_dev->header.snap_rwsem);
2188 2191
2189 /* generate unique id: find highest unique id, add one */ 2192 /* generate unique id: find highest unique id, add one */
2190 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2193 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2191 2194
2192 list_for_each(tmp, &rbd_dev_list) { 2195 list_for_each(tmp, &rbd_dev_list) {
2193 struct rbd_device *rbd_dev; 2196 struct rbd_device *rbd_dev;
2194 2197
2195 rbd_dev = list_entry(tmp, struct rbd_device, node); 2198 rbd_dev = list_entry(tmp, struct rbd_device, node);
2196 if (rbd_dev->id >= new_id) 2199 if (rbd_dev->id >= new_id)
2197 new_id = rbd_dev->id + 1; 2200 new_id = rbd_dev->id + 1;
2198 } 2201 }
2199 2202
2200 rbd_dev->id = new_id; 2203 rbd_dev->id = new_id;
2201 2204
2202 /* add to global list */ 2205 /* add to global list */
2203 list_add_tail(&rbd_dev->node, &rbd_dev_list); 2206 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2204 2207
2205 /* parse add command */ 2208 /* parse add command */
2206 if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s " 2209 if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
2207 "%" __stringify(RBD_MAX_OPT_LEN) "s " 2210 "%" __stringify(RBD_MAX_OPT_LEN) "s "
2208 "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s " 2211 "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
2209 "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s" 2212 "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
2210 "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s", 2213 "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
2211 mon_dev_name, options, rbd_dev->pool_name, 2214 mon_dev_name, options, rbd_dev->pool_name,
2212 rbd_dev->obj, rbd_dev->snap_name) < 4) { 2215 rbd_dev->obj, rbd_dev->snap_name) < 4) {
2213 rc = -EINVAL; 2216 rc = -EINVAL;
2214 goto err_out_slot; 2217 goto err_out_slot;
2215 } 2218 }
2216 2219
2217 if (rbd_dev->snap_name[0] == 0) 2220 if (rbd_dev->snap_name[0] == 0)
2218 rbd_dev->snap_name[0] = '-'; 2221 rbd_dev->snap_name[0] = '-';
2219 2222
2220 rbd_dev->obj_len = strlen(rbd_dev->obj); 2223 rbd_dev->obj_len = strlen(rbd_dev->obj);
2221 snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s", 2224 snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
2222 rbd_dev->obj, RBD_SUFFIX); 2225 rbd_dev->obj, RBD_SUFFIX);
2223 2226
2224 /* initialize rest of new object */ 2227 /* initialize rest of new object */
2225 snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id); 2228 snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id);
2226 rc = rbd_get_client(rbd_dev, mon_dev_name, options); 2229 rc = rbd_get_client(rbd_dev, mon_dev_name, options);
2227 if (rc < 0) 2230 if (rc < 0)
2228 goto err_out_slot; 2231 goto err_out_slot;
2229 2232
2230 mutex_unlock(&ctl_mutex); 2233 mutex_unlock(&ctl_mutex);
2231 2234
2232 /* pick the pool */ 2235 /* pick the pool */
2233 osdc = &rbd_dev->client->osdc; 2236 osdc = &rbd_dev->client->osdc;
2234 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name); 2237 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2235 if (rc < 0) 2238 if (rc < 0)
2236 goto err_out_client; 2239 goto err_out_client;
2237 rbd_dev->poolid = rc; 2240 rbd_dev->poolid = rc;
2238 2241
2239 /* register our block device */ 2242 /* register our block device */
2240 irc = register_blkdev(0, rbd_dev->name); 2243 irc = register_blkdev(0, rbd_dev->name);
2241 if (irc < 0) { 2244 if (irc < 0) {
2242 rc = irc; 2245 rc = irc;
2243 goto err_out_client; 2246 goto err_out_client;
2244 } 2247 }
2245 rbd_dev->major = irc; 2248 rbd_dev->major = irc;
2246 2249
2247 rc = rbd_bus_add_dev(rbd_dev); 2250 rc = rbd_bus_add_dev(rbd_dev);
2248 if (rc) 2251 if (rc)
2249 goto err_out_blkdev; 2252 goto err_out_blkdev;
2250 2253
2251 /* set up and announce blkdev mapping */ 2254 /* set up and announce blkdev mapping */
2252 rc = rbd_init_disk(rbd_dev); 2255 rc = rbd_init_disk(rbd_dev);
2253 if (rc) 2256 if (rc)
2254 goto err_out_bus; 2257 goto err_out_bus;
2255 2258
2256 rc = rbd_init_watch_dev(rbd_dev); 2259 rc = rbd_init_watch_dev(rbd_dev);
2257 if (rc) 2260 if (rc)
2258 goto err_out_bus; 2261 goto err_out_bus;
2259 2262
2260 return count; 2263 return count;
2261 2264
2262 err_out_bus: 2265 err_out_bus:
2263 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2266 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2264 list_del_init(&rbd_dev->node); 2267 list_del_init(&rbd_dev->node);
2265 mutex_unlock(&ctl_mutex); 2268 mutex_unlock(&ctl_mutex);
2266 2269
2267 /* this will also clean up rest of rbd_dev stuff */ 2270 /* this will also clean up rest of rbd_dev stuff */
2268 2271
2269 rbd_bus_del_dev(rbd_dev); 2272 rbd_bus_del_dev(rbd_dev);
2270 kfree(options); 2273 kfree(options);
2271 kfree(mon_dev_name); 2274 kfree(mon_dev_name);
2272 return rc; 2275 return rc;
2273 2276
2274 err_out_blkdev: 2277 err_out_blkdev:
2275 unregister_blkdev(rbd_dev->major, rbd_dev->name); 2278 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2276 err_out_client: 2279 err_out_client:
2277 rbd_put_client(rbd_dev); 2280 rbd_put_client(rbd_dev);
2278 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2281 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2279 err_out_slot: 2282 err_out_slot:
2280 list_del_init(&rbd_dev->node); 2283 list_del_init(&rbd_dev->node);
2281 mutex_unlock(&ctl_mutex); 2284 mutex_unlock(&ctl_mutex);
2282 2285
2283 kfree(rbd_dev); 2286 kfree(rbd_dev);
2284 err_out_opt: 2287 err_out_opt:
2285 kfree(options); 2288 kfree(options);
2286 err_mon_dev: 2289 err_mon_dev:
2287 kfree(mon_dev_name); 2290 kfree(mon_dev_name);
2288 err_out_mod: 2291 err_out_mod:
2289 dout("Error adding device %s\n", buf); 2292 dout("Error adding device %s\n", buf);
2290 module_put(THIS_MODULE); 2293 module_put(THIS_MODULE);
2291 return rc; 2294 return rc;
2292 } 2295 }
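
rbd_add() unwinds a partially constructed device through a ladder of goto labels: each label releases exactly the resources acquired before the failing step, in reverse order of acquisition. A minimal userspace sketch of the same idiom follows; the two "resources" and all names are illustrative, not taken from rbd.c.

#include <stdlib.h>

/* Sketch of the goto-unwind idiom: on failure, jump to the label
 * that frees everything acquired so far, in reverse order. */
static int setup_device(void)
{
    char *name, *options;
    int rc = 0;

    name = malloc(32);
    if (!name)
        return -1;

    options = malloc(64);
    if (!options) {
        rc = -1;
        goto err_free_name;
    }

    /* further acquisition steps would jump to deeper labels */

    free(options);
    free(name);
    return rc;

err_free_name:
    free(name);
    return rc;
}

int main(void)
{
    return setup_device() ? 1 : 0;
}
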
2293 2296
2294 static struct rbd_device *__rbd_get_dev(unsigned long id) 2297 static struct rbd_device *__rbd_get_dev(unsigned long id)
2295 { 2298 {
2296 struct list_head *tmp; 2299 struct list_head *tmp;
2297 struct rbd_device *rbd_dev; 2300 struct rbd_device *rbd_dev;
2298 2301
2299 list_for_each(tmp, &rbd_dev_list) { 2302 list_for_each(tmp, &rbd_dev_list) {
2300 rbd_dev = list_entry(tmp, struct rbd_device, node); 2303 rbd_dev = list_entry(tmp, struct rbd_device, node);
2301 if (rbd_dev->id == id) 2304 if (rbd_dev->id == id)
2302 return rbd_dev; 2305 return rbd_dev;
2303 } 2306 }
2304 return NULL; 2307 return NULL;
2305 } 2308 }
2306 2309
2307 static void rbd_dev_release(struct device *dev) 2310 static void rbd_dev_release(struct device *dev)
2308 { 2311 {
2309 struct rbd_device *rbd_dev = 2312 struct rbd_device *rbd_dev =
2310 container_of(dev, struct rbd_device, dev); 2313 container_of(dev, struct rbd_device, dev);
2311 2314
2312 if (rbd_dev->watch_request) 2315 if (rbd_dev->watch_request)
2313 ceph_osdc_unregister_linger_request(&rbd_dev->client->osdc, 2316 ceph_osdc_unregister_linger_request(&rbd_dev->client->osdc,
2314 rbd_dev->watch_request); 2317 rbd_dev->watch_request);
2315 if (rbd_dev->watch_event) 2318 if (rbd_dev->watch_event)
2316 rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name); 2319 rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name);
2317 2320
2318 rbd_put_client(rbd_dev); 2321 rbd_put_client(rbd_dev);
2319 2322
2320 /* clean up and free blkdev */ 2323 /* clean up and free blkdev */
2321 rbd_free_disk(rbd_dev); 2324 rbd_free_disk(rbd_dev);
2322 unregister_blkdev(rbd_dev->major, rbd_dev->name); 2325 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2323 kfree(rbd_dev); 2326 kfree(rbd_dev);
2324 2327
2325 /* release module ref */ 2328 /* release module ref */
2326 module_put(THIS_MODULE); 2329 module_put(THIS_MODULE);
2327 } 2330 }
2328 2331
2329 static ssize_t rbd_remove(struct bus_type *bus, 2332 static ssize_t rbd_remove(struct bus_type *bus,
2330 const char *buf, 2333 const char *buf,
2331 size_t count) 2334 size_t count)
2332 { 2335 {
2333 struct rbd_device *rbd_dev = NULL; 2336 struct rbd_device *rbd_dev = NULL;
2334 int target_id, rc; 2337 int target_id, rc;
2335 unsigned long ul; 2338 unsigned long ul;
2336 int ret = count; 2339 int ret = count;
2337 2340
2338 rc = strict_strtoul(buf, 10, &ul); 2341 rc = strict_strtoul(buf, 10, &ul);
2339 if (rc) 2342 if (rc)
2340 return rc; 2343 return rc;
2341 2344
2342 /* convert to int; abort if we lost anything in the conversion */ 2345 /* convert to int; abort if we lost anything in the conversion */
2343 target_id = (int) ul; 2346 target_id = (int) ul;
2344 if (target_id != ul) 2347 if (target_id != ul)
2345 return -EINVAL; 2348 return -EINVAL;
2346 2349
2347 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2350 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2348 2351
2349 rbd_dev = __rbd_get_dev(target_id); 2352 rbd_dev = __rbd_get_dev(target_id);
2350 if (!rbd_dev) { 2353 if (!rbd_dev) {
2351 ret = -ENOENT; 2354 ret = -ENOENT;
2352 goto done; 2355 goto done;
2353 } 2356 }
2354 2357
2355 list_del_init(&rbd_dev->node); 2358 list_del_init(&rbd_dev->node);
2356 2359
2357 __rbd_remove_all_snaps(rbd_dev); 2360 __rbd_remove_all_snaps(rbd_dev);
2358 rbd_bus_del_dev(rbd_dev); 2361 rbd_bus_del_dev(rbd_dev);
2359 2362
2360 done: 2363 done:
2361 mutex_unlock(&ctl_mutex); 2364 mutex_unlock(&ctl_mutex);
2362 return ret; 2365 return ret;
2363 } 2366 }
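
rbd_remove() parses the id with strict_strtoul() into an unsigned long, narrows it to an int, and rejects the input if the round trip changed the value. A userspace sketch of the same overflow check, substituting strtoul() for the kernel helper:

#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    unsigned long ul;
    int target_id;

    if (argc < 2)
        return 1;
    ul = strtoul(argv[1], NULL, 10);

    /* convert to int; abort if the narrowing cast lost anything */
    target_id = (int)ul;
    if (target_id != ul) {
        fprintf(stderr, "id out of range\n");
        return 1;
    }
    printf("target_id = %d\n", target_id);
    return 0;
}
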
2364 2367
2365 static ssize_t rbd_snap_add(struct device *dev, 2368 static ssize_t rbd_snap_add(struct device *dev,
2366 struct device_attribute *attr, 2369 struct device_attribute *attr,
2367 const char *buf, 2370 const char *buf,
2368 size_t count) 2371 size_t count)
2369 { 2372 {
2370 struct rbd_device *rbd_dev = dev_to_rbd(dev); 2373 struct rbd_device *rbd_dev = dev_to_rbd(dev);
2371 int ret; 2374 int ret;
2372 char *name = kmalloc(count + 1, GFP_KERNEL); 2375 char *name = kmalloc(count + 1, GFP_KERNEL);
2373 if (!name) 2376 if (!name)
2374 return -ENOMEM; 2377 return -ENOMEM;
2375 2378
2376 snprintf(name, count, "%s", buf); 2379 snprintf(name, count, "%s", buf);
2377 2380
2378 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2381 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2379 2382
2380 ret = rbd_header_add_snap(rbd_dev, 2383 ret = rbd_header_add_snap(rbd_dev,
2381 name, GFP_KERNEL); 2384 name, GFP_KERNEL);
2382 if (ret < 0) 2385 if (ret < 0)
2383 goto err_unlock; 2386 goto err_unlock;
2384 2387
2385 ret = __rbd_update_snaps(rbd_dev); 2388 ret = __rbd_update_snaps(rbd_dev);
2386 if (ret < 0) 2389 if (ret < 0)
2387 goto err_unlock; 2390 goto err_unlock;
2388 2391
2389 /* shouldn't hold ctl_mutex when notifying; a notify might 2392 /* shouldn't hold ctl_mutex when notifying; a notify might
2390 trigger a watch callback that would need to get that mutex */ 2393 trigger a watch callback that would need to get that mutex */
2391 mutex_unlock(&ctl_mutex); 2394 mutex_unlock(&ctl_mutex);
2392 2395
2393 /* make a best effort, don't error if failed */ 2396 /* make a best effort, don't error if failed */
2394 rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name); 2397 rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name);
2395 2398
2396 ret = count; 2399 ret = count;
2397 kfree(name); 2400 kfree(name);
2398 return ret; 2401 return ret;
2399 2402
2400 err_unlock: 2403 err_unlock:
2401 mutex_unlock(&ctl_mutex); 2404 mutex_unlock(&ctl_mutex);
2402 kfree(name); 2405 kfree(name);
2403 return ret; 2406 return ret;
2404 } 2407 }
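
The comment in rbd_snap_add() captures a general deadlock rule: the notification can fire a watch callback that itself wants ctl_mutex, so the mutex must be dropped before notifying. A userspace pthread sketch of that ordering, with illustrative names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ctl = PTHREAD_MUTEX_INITIALIZER;

/* stands in for the watch callback the notify may trigger */
static void watch_callback(void)
{
    /* would self-deadlock if the notifier still held ctl */
    pthread_mutex_lock(&ctl);
    /* ... react to the change ... */
    pthread_mutex_unlock(&ctl);
}

static void add_snapshot(void)
{
    pthread_mutex_lock(&ctl);
    /* ... update shared snapshot state under the lock ... */
    pthread_mutex_unlock(&ctl);

    watch_callback();    /* notify only after unlocking */
}

int main(void)
{
    add_snapshot();
    puts("ok");
    return 0;
}
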
2405 2408
2406 static struct bus_attribute rbd_bus_attrs[] = { 2409 static struct bus_attribute rbd_bus_attrs[] = {
2407 __ATTR(add, S_IWUSR, NULL, rbd_add), 2410 __ATTR(add, S_IWUSR, NULL, rbd_add),
2408 __ATTR(remove, S_IWUSR, NULL, rbd_remove), 2411 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
2409 __ATTR_NULL 2412 __ATTR_NULL
2410 }; 2413 };
2411 2414
2412 /* 2415 /*
2413 * create control files in sysfs 2416 * create control files in sysfs
2414 * /sys/bus/rbd/... 2417 * /sys/bus/rbd/...
2415 */ 2418 */
2416 static int rbd_sysfs_init(void) 2419 static int rbd_sysfs_init(void)
2417 { 2420 {
2418 int ret; 2421 int ret;
2419 2422
2420 rbd_bus_type.bus_attrs = rbd_bus_attrs; 2423 rbd_bus_type.bus_attrs = rbd_bus_attrs;
2421 2424
2422 ret = bus_register(&rbd_bus_type); 2425 ret = bus_register(&rbd_bus_type);
2423 if (ret < 0) 2426 if (ret < 0)
2424 return ret; 2427 return ret;
2425 2428
2426 ret = device_register(&rbd_root_dev); 2429 ret = device_register(&rbd_root_dev);
2427 2430
2428 return ret; 2431 return ret;
2429 } 2432 }
2430 2433
2431 static void rbd_sysfs_cleanup(void) 2434 static void rbd_sysfs_cleanup(void)
2432 { 2435 {
2433 device_unregister(&rbd_root_dev); 2436 device_unregister(&rbd_root_dev);
2434 bus_unregister(&rbd_bus_type); 2437 bus_unregister(&rbd_bus_type);
2435 } 2438 }
2436 2439
2437 int __init rbd_init(void) 2440 int __init rbd_init(void)
2438 { 2441 {
2439 int rc; 2442 int rc;
2440 2443
2441 rc = rbd_sysfs_init(); 2444 rc = rbd_sysfs_init();
2442 if (rc) 2445 if (rc)
2443 return rc; 2446 return rc;
2444 spin_lock_init(&node_lock); 2447 spin_lock_init(&node_lock);
2445 pr_info("loaded " DRV_NAME_LONG "\n"); 2448 pr_info("loaded " DRV_NAME_LONG "\n");
2446 return 0; 2449 return 0;
2447 } 2450 }
2448 2451
2449 void __exit rbd_exit(void) 2452 void __exit rbd_exit(void)
2450 { 2453 {
2451 rbd_sysfs_cleanup(); 2454 rbd_sysfs_cleanup();
2452 } 2455 }
2453 2456
2454 module_init(rbd_init); 2457 module_init(rbd_init);
2455 module_exit(rbd_exit); 2458 module_exit(rbd_exit);
2456 2459
2457 MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 2460 MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2458 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 2461 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2459 MODULE_DESCRIPTION("rados block device"); 2462 MODULE_DESCRIPTION("rados block device");
2460 2463
2461 /* following authorship retained from original osdblk.c */ 2464 /* following authorship retained from original osdblk.c */
2462 MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 2465 MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2463 2466
fs/ceph/caps.c

1 #include <linux/ceph/ceph_debug.h> 1 #include <linux/ceph/ceph_debug.h>
2 2
3 #include <linux/fs.h> 3 #include <linux/fs.h>
4 #include <linux/kernel.h> 4 #include <linux/kernel.h>
5 #include <linux/sched.h> 5 #include <linux/sched.h>
6 #include <linux/slab.h> 6 #include <linux/slab.h>
7 #include <linux/vmalloc.h> 7 #include <linux/vmalloc.h>
8 #include <linux/wait.h> 8 #include <linux/wait.h>
9 #include <linux/writeback.h> 9 #include <linux/writeback.h>
10 10
11 #include "super.h" 11 #include "super.h"
12 #include "mds_client.h" 12 #include "mds_client.h"
13 #include <linux/ceph/decode.h> 13 #include <linux/ceph/decode.h>
14 #include <linux/ceph/messenger.h> 14 #include <linux/ceph/messenger.h>
15 15
16 /* 16 /*
17 * Capability management 17 * Capability management
18 * 18 *
19 * The Ceph metadata servers control client access to inode metadata 19 * The Ceph metadata servers control client access to inode metadata
20 * and file data by issuing capabilities, granting clients permission 20 * and file data by issuing capabilities, granting clients permission
21 * to read and/or write both inode fields and file data to OSDs 21 * to read and/or write both inode fields and file data to OSDs
22 * (storage nodes). Each capability consists of a set of bits 22 * (storage nodes). Each capability consists of a set of bits
23 * indicating which operations are allowed. 23 * indicating which operations are allowed.
24 * 24 *
25 * If the client holds a *_SHARED cap, the client has a coherent value 25 * If the client holds a *_SHARED cap, the client has a coherent value
26 * that can be safely read from the cached inode. 26 * that can be safely read from the cached inode.
27 * 27 *
28 * In the case of *_EXCL (exclusive) or FILE_WR capabilities, the 28 * In the case of *_EXCL (exclusive) or FILE_WR capabilities, the
29 * client is allowed to change inode attributes (e.g., file size, 29 * client is allowed to change inode attributes (e.g., file size,
30 * mtime), note its dirty state in the ceph_cap, and asynchronously 30 * mtime), note its dirty state in the ceph_cap, and asynchronously
31 * flush that metadata change to the MDS. 31 * flush that metadata change to the MDS.
32 * 32 *
33 * In the event of a conflicting operation (perhaps by another 33 * In the event of a conflicting operation (perhaps by another
34 * client), the MDS will revoke the conflicting client capabilities. 34 * client), the MDS will revoke the conflicting client capabilities.
35 * 35 *
36 * In order for a client to cache an inode, it must hold a capability 36 * In order for a client to cache an inode, it must hold a capability
37 * from at least one MDS server. When inodes are released, release 37 * from at least one MDS server. When inodes are released, release
38 * notifications are batched and periodically sent en masse to the MDS 38 * notifications are batched and periodically sent en masse to the MDS
39 * cluster to release server state. 39 * cluster to release server state.
40 */ 40 */
41 41
42 42
43 /* 43 /*
44 * Generate readable cap strings for debugging output. 44 * Generate readable cap strings for debugging output.
45 */ 45 */
46 #define MAX_CAP_STR 20 46 #define MAX_CAP_STR 20
47 static char cap_str[MAX_CAP_STR][40]; 47 static char cap_str[MAX_CAP_STR][40];
48 static DEFINE_SPINLOCK(cap_str_lock); 48 static DEFINE_SPINLOCK(cap_str_lock);
49 static int last_cap_str; 49 static int last_cap_str;
50 50
51 static char *gcap_string(char *s, int c) 51 static char *gcap_string(char *s, int c)
52 { 52 {
53 if (c & CEPH_CAP_GSHARED) 53 if (c & CEPH_CAP_GSHARED)
54 *s++ = 's'; 54 *s++ = 's';
55 if (c & CEPH_CAP_GEXCL) 55 if (c & CEPH_CAP_GEXCL)
56 *s++ = 'x'; 56 *s++ = 'x';
57 if (c & CEPH_CAP_GCACHE) 57 if (c & CEPH_CAP_GCACHE)
58 *s++ = 'c'; 58 *s++ = 'c';
59 if (c & CEPH_CAP_GRD) 59 if (c & CEPH_CAP_GRD)
60 *s++ = 'r'; 60 *s++ = 'r';
61 if (c & CEPH_CAP_GWR) 61 if (c & CEPH_CAP_GWR)
62 *s++ = 'w'; 62 *s++ = 'w';
63 if (c & CEPH_CAP_GBUFFER) 63 if (c & CEPH_CAP_GBUFFER)
64 *s++ = 'b'; 64 *s++ = 'b';
65 if (c & CEPH_CAP_GLAZYIO) 65 if (c & CEPH_CAP_GLAZYIO)
66 *s++ = 'l'; 66 *s++ = 'l';
67 return s; 67 return s;
68 } 68 }
69 69
70 const char *ceph_cap_string(int caps) 70 const char *ceph_cap_string(int caps)
71 { 71 {
72 int i; 72 int i;
73 char *s; 73 char *s;
74 int c; 74 int c;
75 75
76 spin_lock(&cap_str_lock); 76 spin_lock(&cap_str_lock);
77 i = last_cap_str++; 77 i = last_cap_str++;
78 if (last_cap_str == MAX_CAP_STR) 78 if (last_cap_str == MAX_CAP_STR)
79 last_cap_str = 0; 79 last_cap_str = 0;
80 spin_unlock(&cap_str_lock); 80 spin_unlock(&cap_str_lock);
81 81
82 s = cap_str[i]; 82 s = cap_str[i];
83 83
84 if (caps & CEPH_CAP_PIN) 84 if (caps & CEPH_CAP_PIN)
85 *s++ = 'p'; 85 *s++ = 'p';
86 86
87 c = (caps >> CEPH_CAP_SAUTH) & 3; 87 c = (caps >> CEPH_CAP_SAUTH) & 3;
88 if (c) { 88 if (c) {
89 *s++ = 'A'; 89 *s++ = 'A';
90 s = gcap_string(s, c); 90 s = gcap_string(s, c);
91 } 91 }
92 92
93 c = (caps >> CEPH_CAP_SLINK) & 3; 93 c = (caps >> CEPH_CAP_SLINK) & 3;
94 if (c) { 94 if (c) {
95 *s++ = 'L'; 95 *s++ = 'L';
96 s = gcap_string(s, c); 96 s = gcap_string(s, c);
97 } 97 }
98 98
99 c = (caps >> CEPH_CAP_SXATTR) & 3; 99 c = (caps >> CEPH_CAP_SXATTR) & 3;
100 if (c) { 100 if (c) {
101 *s++ = 'X'; 101 *s++ = 'X';
102 s = gcap_string(s, c); 102 s = gcap_string(s, c);
103 } 103 }
104 104
105 c = caps >> CEPH_CAP_SFILE; 105 c = caps >> CEPH_CAP_SFILE;
106 if (c) { 106 if (c) {
107 *s++ = 'F'; 107 *s++ = 'F';
108 s = gcap_string(s, c); 108 s = gcap_string(s, c);
109 } 109 }
110 110
111 if (s == cap_str[i]) 111 if (s == cap_str[i])
112 *s++ = '-'; 112 *s++ = '-';
113 *s = 0; 113 *s = 0;
114 return cap_str[i]; 114 return cap_str[i];
115 } 115 }
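
ceph_cap_string() combines two tricks: it decodes a per-component capability bitfield into letters, and it hands out slots from a ring of MAX_CAP_STR static buffers (index guarded by cap_str_lock) so callers can embed several results in one printk without allocating or freeing. A single-threaded userspace sketch with illustrative flag names:

#include <stdio.h>

#define MAX_STR 4
static char ring[MAX_STR][16];
static int last;    /* the kernel guards this index with a spinlock */

#define G_SHARED 1
#define G_EXCL   2

static const char *flag_string(unsigned flags)
{
    char *s, *p;
    int i = last++;

    if (last == MAX_STR)
        last = 0;    /* wrap: the oldest strings get reused */
    s = p = ring[i];

    if (flags & G_SHARED)
        *p++ = 's';
    if (flags & G_EXCL)
        *p++ = 'x';
    if (p == s)
        *p++ = '-';    /* no bits set */
    *p = '\0';
    return s;
}

int main(void)
{
    /* two calls in one printf are safe: each uses its own slot */
    printf("a=%s b=%s\n", flag_string(G_SHARED),
           flag_string(G_SHARED | G_EXCL));
    return 0;
}
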
116 116
117 void ceph_caps_init(struct ceph_mds_client *mdsc) 117 void ceph_caps_init(struct ceph_mds_client *mdsc)
118 { 118 {
119 INIT_LIST_HEAD(&mdsc->caps_list); 119 INIT_LIST_HEAD(&mdsc->caps_list);
120 spin_lock_init(&mdsc->caps_list_lock); 120 spin_lock_init(&mdsc->caps_list_lock);
121 } 121 }
122 122
123 void ceph_caps_finalize(struct ceph_mds_client *mdsc) 123 void ceph_caps_finalize(struct ceph_mds_client *mdsc)
124 { 124 {
125 struct ceph_cap *cap; 125 struct ceph_cap *cap;
126 126
127 spin_lock(&mdsc->caps_list_lock); 127 spin_lock(&mdsc->caps_list_lock);
128 while (!list_empty(&mdsc->caps_list)) { 128 while (!list_empty(&mdsc->caps_list)) {
129 cap = list_first_entry(&mdsc->caps_list, 129 cap = list_first_entry(&mdsc->caps_list,
130 struct ceph_cap, caps_item); 130 struct ceph_cap, caps_item);
131 list_del(&cap->caps_item); 131 list_del(&cap->caps_item);
132 kmem_cache_free(ceph_cap_cachep, cap); 132 kmem_cache_free(ceph_cap_cachep, cap);
133 } 133 }
134 mdsc->caps_total_count = 0; 134 mdsc->caps_total_count = 0;
135 mdsc->caps_avail_count = 0; 135 mdsc->caps_avail_count = 0;
136 mdsc->caps_use_count = 0; 136 mdsc->caps_use_count = 0;
137 mdsc->caps_reserve_count = 0; 137 mdsc->caps_reserve_count = 0;
138 mdsc->caps_min_count = 0; 138 mdsc->caps_min_count = 0;
139 spin_unlock(&mdsc->caps_list_lock); 139 spin_unlock(&mdsc->caps_list_lock);
140 } 140 }
141 141
142 void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) 142 void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
143 { 143 {
144 spin_lock(&mdsc->caps_list_lock); 144 spin_lock(&mdsc->caps_list_lock);
145 mdsc->caps_min_count += delta; 145 mdsc->caps_min_count += delta;
146 BUG_ON(mdsc->caps_min_count < 0); 146 BUG_ON(mdsc->caps_min_count < 0);
147 spin_unlock(&mdsc->caps_list_lock); 147 spin_unlock(&mdsc->caps_list_lock);
148 } 148 }
149 149
150 int ceph_reserve_caps(struct ceph_mds_client *mdsc, 150 int ceph_reserve_caps(struct ceph_mds_client *mdsc,
151 struct ceph_cap_reservation *ctx, int need) 151 struct ceph_cap_reservation *ctx, int need)
152 { 152 {
153 int i; 153 int i;
154 struct ceph_cap *cap; 154 struct ceph_cap *cap;
155 int have; 155 int have;
156 int alloc = 0; 156 int alloc = 0;
157 LIST_HEAD(newcaps); 157 LIST_HEAD(newcaps);
158 int ret = 0; 158 int ret = 0;
159 159
160 dout("reserve caps ctx=%p need=%d\n", ctx, need); 160 dout("reserve caps ctx=%p need=%d\n", ctx, need);
161 161
162 /* first reserve any caps that are already allocated */ 162 /* first reserve any caps that are already allocated */
163 spin_lock(&mdsc->caps_list_lock); 163 spin_lock(&mdsc->caps_list_lock);
164 if (mdsc->caps_avail_count >= need) 164 if (mdsc->caps_avail_count >= need)
165 have = need; 165 have = need;
166 else 166 else
167 have = mdsc->caps_avail_count; 167 have = mdsc->caps_avail_count;
168 mdsc->caps_avail_count -= have; 168 mdsc->caps_avail_count -= have;
169 mdsc->caps_reserve_count += have; 169 mdsc->caps_reserve_count += have;
170 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + 170 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
171 mdsc->caps_reserve_count + 171 mdsc->caps_reserve_count +
172 mdsc->caps_avail_count); 172 mdsc->caps_avail_count);
173 spin_unlock(&mdsc->caps_list_lock); 173 spin_unlock(&mdsc->caps_list_lock);
174 174
175 for (i = have; i < need; i++) { 175 for (i = have; i < need; i++) {
176 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); 176 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
177 if (!cap) { 177 if (!cap) {
178 ret = -ENOMEM; 178 ret = -ENOMEM;
179 goto out_alloc_count; 179 goto out_alloc_count;
180 } 180 }
181 list_add(&cap->caps_item, &newcaps); 181 list_add(&cap->caps_item, &newcaps);
182 alloc++; 182 alloc++;
183 } 183 }
184 BUG_ON(have + alloc != need); 184 BUG_ON(have + alloc != need);
185 185
186 spin_lock(&mdsc->caps_list_lock); 186 spin_lock(&mdsc->caps_list_lock);
187 mdsc->caps_total_count += alloc; 187 mdsc->caps_total_count += alloc;
188 mdsc->caps_reserve_count += alloc; 188 mdsc->caps_reserve_count += alloc;
189 list_splice(&newcaps, &mdsc->caps_list); 189 list_splice(&newcaps, &mdsc->caps_list);
190 190
191 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + 191 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
192 mdsc->caps_reserve_count + 192 mdsc->caps_reserve_count +
193 mdsc->caps_avail_count); 193 mdsc->caps_avail_count);
194 spin_unlock(&mdsc->caps_list_lock); 194 spin_unlock(&mdsc->caps_list_lock);
195 195
196 ctx->count = need; 196 ctx->count = need;
197 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", 197 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
198 ctx, mdsc->caps_total_count, mdsc->caps_use_count, 198 ctx, mdsc->caps_total_count, mdsc->caps_use_count,
199 mdsc->caps_reserve_count, mdsc->caps_avail_count); 199 mdsc->caps_reserve_count, mdsc->caps_avail_count);
200 return 0; 200 return 0;
201 201
202 out_alloc_count: 202 out_alloc_count:
203 /* we didn't manage to reserve as much as we needed */ 203 /* we didn't manage to reserve as much as we needed */
204 pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n", 204 pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
205 ctx, need, have); 205 ctx, need, have);
206 return ret; 206 return ret;
207 } 207 }
208 208
209 int ceph_unreserve_caps(struct ceph_mds_client *mdsc, 209 int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
210 struct ceph_cap_reservation *ctx) 210 struct ceph_cap_reservation *ctx)
211 { 211 {
212 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); 212 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
213 if (ctx->count) { 213 if (ctx->count) {
214 spin_lock(&mdsc->caps_list_lock); 214 spin_lock(&mdsc->caps_list_lock);
215 BUG_ON(mdsc->caps_reserve_count < ctx->count); 215 BUG_ON(mdsc->caps_reserve_count < ctx->count);
216 mdsc->caps_reserve_count -= ctx->count; 216 mdsc->caps_reserve_count -= ctx->count;
217 mdsc->caps_avail_count += ctx->count; 217 mdsc->caps_avail_count += ctx->count;
218 ctx->count = 0; 218 ctx->count = 0;
219 dout("unreserve caps %d = %d used + %d resv + %d avail\n", 219 dout("unreserve caps %d = %d used + %d resv + %d avail\n",
220 mdsc->caps_total_count, mdsc->caps_use_count, 220 mdsc->caps_total_count, mdsc->caps_use_count,
221 mdsc->caps_reserve_count, mdsc->caps_avail_count); 221 mdsc->caps_reserve_count, mdsc->caps_avail_count);
222 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + 222 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
223 mdsc->caps_reserve_count + 223 mdsc->caps_reserve_count +
224 mdsc->caps_avail_count); 224 mdsc->caps_avail_count);
225 spin_unlock(&mdsc->caps_list_lock); 225 spin_unlock(&mdsc->caps_list_lock);
226 } 226 }
227 return 0; 227 return 0;
228 } 228 }
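
ceph_reserve_caps() and ceph_unreserve_caps() maintain one invariant across every transition, asserted repeatedly with BUG_ON: caps_total_count == caps_use_count + caps_reserve_count + caps_avail_count. A userspace sketch of the accounting, with field names that loosely mirror the mdsc counters:

#include <assert.h>
#include <stdio.h>

struct pool {
    int total, used, reserved, avail;
};

static void check(struct pool *p)
{
    /* the invariant the kernel asserts under caps_list_lock */
    assert(p->total == p->used + p->reserved + p->avail);
}

static void reserve(struct pool *p, int need)
{
    int have = need < p->avail ? need : p->avail;

    p->avail -= have;            /* first take what is free */
    p->reserved += have;
    p->total += need - have;     /* then allocate the shortfall */
    p->reserved += need - have;
    check(p);
}

static void unreserve(struct pool *p, int count)
{
    p->reserved -= count;
    p->avail += count;
    check(p);
}

int main(void)
{
    struct pool p = { .total = 2, .avail = 2 };

    reserve(&p, 5);    /* 2 from avail, 3 freshly allocated */
    unreserve(&p, 5);
    printf("total=%d avail=%d\n", p.total, p.avail);
    return 0;
}
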
229 229
230 static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc, 230 static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
231 struct ceph_cap_reservation *ctx) 231 struct ceph_cap_reservation *ctx)
232 { 232 {
233 struct ceph_cap *cap = NULL; 233 struct ceph_cap *cap = NULL;
234 234
235 /* temporary, until we do something about cap import/export */ 235 /* temporary, until we do something about cap import/export */
236 if (!ctx) { 236 if (!ctx) {
237 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); 237 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
238 if (cap) { 238 if (cap) {
239 mdsc->caps_use_count++; 239 mdsc->caps_use_count++;
240 mdsc->caps_total_count++; 240 mdsc->caps_total_count++;
241 } 241 }
242 return cap; 242 return cap;
243 } 243 }
244 244
245 spin_lock(&mdsc->caps_list_lock); 245 spin_lock(&mdsc->caps_list_lock);
246 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", 246 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
247 ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count, 247 ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
248 mdsc->caps_reserve_count, mdsc->caps_avail_count); 248 mdsc->caps_reserve_count, mdsc->caps_avail_count);
249 BUG_ON(!ctx->count); 249 BUG_ON(!ctx->count);
250 BUG_ON(ctx->count > mdsc->caps_reserve_count); 250 BUG_ON(ctx->count > mdsc->caps_reserve_count);
251 BUG_ON(list_empty(&mdsc->caps_list)); 251 BUG_ON(list_empty(&mdsc->caps_list));
252 252
253 ctx->count--; 253 ctx->count--;
254 mdsc->caps_reserve_count--; 254 mdsc->caps_reserve_count--;
255 mdsc->caps_use_count++; 255 mdsc->caps_use_count++;
256 256
257 cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); 257 cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item);
258 list_del(&cap->caps_item); 258 list_del(&cap->caps_item);
259 259
260 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + 260 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
261 mdsc->caps_reserve_count + mdsc->caps_avail_count); 261 mdsc->caps_reserve_count + mdsc->caps_avail_count);
262 spin_unlock(&mdsc->caps_list_lock); 262 spin_unlock(&mdsc->caps_list_lock);
263 return cap; 263 return cap;
264 } 264 }
265 265
266 void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap) 266 void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
267 { 267 {
268 spin_lock(&mdsc->caps_list_lock); 268 spin_lock(&mdsc->caps_list_lock);
269 dout("put_cap %p %d = %d used + %d resv + %d avail\n", 269 dout("put_cap %p %d = %d used + %d resv + %d avail\n",
270 cap, mdsc->caps_total_count, mdsc->caps_use_count, 270 cap, mdsc->caps_total_count, mdsc->caps_use_count,
271 mdsc->caps_reserve_count, mdsc->caps_avail_count); 271 mdsc->caps_reserve_count, mdsc->caps_avail_count);
272 mdsc->caps_use_count--; 272 mdsc->caps_use_count--;
273 /* 273 /*
274 * Keep some preallocated caps around (caps_min_count), to 274 * Keep some preallocated caps around (caps_min_count), to
275 * avoid lots of free/alloc churn. 275 * avoid lots of free/alloc churn.
276 */ 276 */
277 if (mdsc->caps_avail_count >= mdsc->caps_reserve_count + 277 if (mdsc->caps_avail_count >= mdsc->caps_reserve_count +
278 mdsc->caps_min_count) { 278 mdsc->caps_min_count) {
279 mdsc->caps_total_count--; 279 mdsc->caps_total_count--;
280 kmem_cache_free(ceph_cap_cachep, cap); 280 kmem_cache_free(ceph_cap_cachep, cap);
281 } else { 281 } else {
282 mdsc->caps_avail_count++; 282 mdsc->caps_avail_count++;
283 list_add(&cap->caps_item, &mdsc->caps_list); 283 list_add(&cap->caps_item, &mdsc->caps_list);
284 } 284 }
285 285
286 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + 286 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
287 mdsc->caps_reserve_count + mdsc->caps_avail_count); 287 mdsc->caps_reserve_count + mdsc->caps_avail_count);
288 spin_unlock(&mdsc->caps_list_lock); 288 spin_unlock(&mdsc->caps_list_lock);
289 } 289 }
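
ceph_put_cap() frees a cap outright only when the pool already holds caps_reserve_count + caps_min_count spares; otherwise the cap goes back on caps_list for reuse. That bounds idle memory while avoiding alloc/free churn. A small sketch of the policy, with loosely mirrored field names:

#include <stdio.h>

struct pool { int total, used, reserved, avail, min; };

static void put(struct pool *p)
{
    p->used--;
    if (p->avail >= p->reserved + p->min)
        p->total--;    /* enough spares cached: really free it */
    else
        p->avail++;    /* keep it for reuse */
}

int main(void)
{
    struct pool p = { .total = 3, .used = 3, .min = 1 };

    put(&p);
    put(&p);
    put(&p);
    printf("total=%d avail=%d\n", p.total, p.avail);    /* 1 and 1 */
    return 0;
}
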
290 290
291 void ceph_reservation_status(struct ceph_fs_client *fsc, 291 void ceph_reservation_status(struct ceph_fs_client *fsc,
292 int *total, int *avail, int *used, int *reserved, 292 int *total, int *avail, int *used, int *reserved,
293 int *min) 293 int *min)
294 { 294 {
295 struct ceph_mds_client *mdsc = fsc->mdsc; 295 struct ceph_mds_client *mdsc = fsc->mdsc;
296 296
297 if (total) 297 if (total)
298 *total = mdsc->caps_total_count; 298 *total = mdsc->caps_total_count;
299 if (avail) 299 if (avail)
300 *avail = mdsc->caps_avail_count; 300 *avail = mdsc->caps_avail_count;
301 if (used) 301 if (used)
302 *used = mdsc->caps_use_count; 302 *used = mdsc->caps_use_count;
303 if (reserved) 303 if (reserved)
304 *reserved = mdsc->caps_reserve_count; 304 *reserved = mdsc->caps_reserve_count;
305 if (min) 305 if (min)
306 *min = mdsc->caps_min_count; 306 *min = mdsc->caps_min_count;
307 } 307 }
308 308
309 /* 309 /*
310 * Find ceph_cap for given mds, if any. 310 * Find ceph_cap for given mds, if any.
311 * 311 *
312 * Called with i_ceph_lock held. 312 * Called with i_ceph_lock held.
313 */ 313 */
314 static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) 314 static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
315 { 315 {
316 struct ceph_cap *cap; 316 struct ceph_cap *cap;
317 struct rb_node *n = ci->i_caps.rb_node; 317 struct rb_node *n = ci->i_caps.rb_node;
318 318
319 while (n) { 319 while (n) {
320 cap = rb_entry(n, struct ceph_cap, ci_node); 320 cap = rb_entry(n, struct ceph_cap, ci_node);
321 if (mds < cap->mds) 321 if (mds < cap->mds)
322 n = n->rb_left; 322 n = n->rb_left;
323 else if (mds > cap->mds) 323 else if (mds > cap->mds)
324 n = n->rb_right; 324 n = n->rb_right;
325 else 325 else
326 return cap; 326 return cap;
327 } 327 }
328 return NULL; 328 return NULL;
329 } 329 }
330 330
331 struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) 331 struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
332 { 332 {
333 struct ceph_cap *cap; 333 struct ceph_cap *cap;
334 334
335 spin_lock(&ci->i_ceph_lock); 335 spin_lock(&ci->i_ceph_lock);
336 cap = __get_cap_for_mds(ci, mds); 336 cap = __get_cap_for_mds(ci, mds);
337 spin_unlock(&ci->i_ceph_lock); 337 spin_unlock(&ci->i_ceph_lock);
338 return cap; 338 return cap;
339 } 339 }
340 340
341 /* 341 /*
342 * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1. 342 * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1.
343 */ 343 */
344 static int __ceph_get_cap_mds(struct ceph_inode_info *ci) 344 static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
345 { 345 {
346 struct ceph_cap *cap; 346 struct ceph_cap *cap;
347 int mds = -1; 347 int mds = -1;
348 struct rb_node *p; 348 struct rb_node *p;
349 349
350 /* prefer mds with WR|BUFFER|EXCL caps */ 350 /* prefer mds with WR|BUFFER|EXCL caps */
351 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 351 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
352 cap = rb_entry(p, struct ceph_cap, ci_node); 352 cap = rb_entry(p, struct ceph_cap, ci_node);
353 mds = cap->mds; 353 mds = cap->mds;
354 if (cap->issued & (CEPH_CAP_FILE_WR | 354 if (cap->issued & (CEPH_CAP_FILE_WR |
355 CEPH_CAP_FILE_BUFFER | 355 CEPH_CAP_FILE_BUFFER |
356 CEPH_CAP_FILE_EXCL)) 356 CEPH_CAP_FILE_EXCL))
357 break; 357 break;
358 } 358 }
359 return mds; 359 return mds;
360 } 360 }
361 361
362 int ceph_get_cap_mds(struct inode *inode) 362 int ceph_get_cap_mds(struct inode *inode)
363 { 363 {
364 struct ceph_inode_info *ci = ceph_inode(inode); 364 struct ceph_inode_info *ci = ceph_inode(inode);
365 int mds; 365 int mds;
366 spin_lock(&ci->i_ceph_lock); 366 spin_lock(&ci->i_ceph_lock);
367 mds = __ceph_get_cap_mds(ceph_inode(inode)); 367 mds = __ceph_get_cap_mds(ceph_inode(inode));
368 spin_unlock(&ci->i_ceph_lock); 368 spin_unlock(&ci->i_ceph_lock);
369 return mds; 369 return mds;
370 } 370 }
371 371
372 /* 372 /*
373 * Called under i_ceph_lock. 373 * Called under i_ceph_lock.
374 */ 374 */
375 static void __insert_cap_node(struct ceph_inode_info *ci, 375 static void __insert_cap_node(struct ceph_inode_info *ci,
376 struct ceph_cap *new) 376 struct ceph_cap *new)
377 { 377 {
378 struct rb_node **p = &ci->i_caps.rb_node; 378 struct rb_node **p = &ci->i_caps.rb_node;
379 struct rb_node *parent = NULL; 379 struct rb_node *parent = NULL;
380 struct ceph_cap *cap = NULL; 380 struct ceph_cap *cap = NULL;
381 381
382 while (*p) { 382 while (*p) {
383 parent = *p; 383 parent = *p;
384 cap = rb_entry(parent, struct ceph_cap, ci_node); 384 cap = rb_entry(parent, struct ceph_cap, ci_node);
385 if (new->mds < cap->mds) 385 if (new->mds < cap->mds)
386 p = &(*p)->rb_left; 386 p = &(*p)->rb_left;
387 else if (new->mds > cap->mds) 387 else if (new->mds > cap->mds)
388 p = &(*p)->rb_right; 388 p = &(*p)->rb_right;
389 else 389 else
390 BUG(); 390 BUG();
391 } 391 }
392 392
393 rb_link_node(&new->ci_node, parent, p); 393 rb_link_node(&new->ci_node, parent, p);
394 rb_insert_color(&new->ci_node, &ci->i_caps); 394 rb_insert_color(&new->ci_node, &ci->i_caps);
395 } 395 }
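
__get_cap_for_mds() and __insert_cap_node() are the standard descend-by-key walk over an rbtree keyed by mds id. The sketch below shows the same logic on a plain (unbalanced) binary search tree; the kernel version differs only in using rb_node links and rebalancing via rb_insert_color():

#include <stdio.h>
#include <stdlib.h>

struct cap {
    int mds;
    struct cap *left, *right;
};

static struct cap *find(struct cap *n, int mds)
{
    while (n) {
        if (mds < n->mds)
            n = n->left;
        else if (mds > n->mds)
            n = n->right;
        else
            return n;
    }
    return NULL;
}

static void insert(struct cap **p, struct cap *new)
{
    while (*p) {
        if (new->mds < (*p)->mds)
            p = &(*p)->left;
        else
            p = &(*p)->right;    /* equal keys would BUG() in the kernel */
    }
    *p = new;
}

int main(void)
{
    struct cap a = { .mds = 1 }, b = { .mds = 0 }, *root = NULL;

    insert(&root, &a);
    insert(&root, &b);
    printf("found mds0: %s\n", find(root, 0) ? "yes" : "no");
    return 0;
}
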
396 396
397 /* 397 /*
398 * (re)set cap hold timeouts, which control the delayed release 398 * (re)set cap hold timeouts, which control the delayed release
399 * of unused caps back to the MDS. Should be called on cap use. 399 * of unused caps back to the MDS. Should be called on cap use.
400 */ 400 */
401 static void __cap_set_timeouts(struct ceph_mds_client *mdsc, 401 static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
402 struct ceph_inode_info *ci) 402 struct ceph_inode_info *ci)
403 { 403 {
404 struct ceph_mount_options *ma = mdsc->fsc->mount_options; 404 struct ceph_mount_options *ma = mdsc->fsc->mount_options;
405 405
406 ci->i_hold_caps_min = round_jiffies(jiffies + 406 ci->i_hold_caps_min = round_jiffies(jiffies +
407 ma->caps_wanted_delay_min * HZ); 407 ma->caps_wanted_delay_min * HZ);
408 ci->i_hold_caps_max = round_jiffies(jiffies + 408 ci->i_hold_caps_max = round_jiffies(jiffies +
409 ma->caps_wanted_delay_max * HZ); 409 ma->caps_wanted_delay_max * HZ);
410 dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode, 410 dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
411 ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies); 411 ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
412 } 412 }
413 413
414 /* 414 /*
415 * (Re)queue cap at the end of the delayed cap release list. 415 * (Re)queue cap at the end of the delayed cap release list.
416 * 416 *
417 * If I_FLUSH is set, leave the inode at the front of the list. 417 * If I_FLUSH is set, leave the inode at the front of the list.
418 * 418 *
419 * Caller holds i_ceph_lock 419 * Caller holds i_ceph_lock
420 * -> we take mdsc->cap_delay_lock 420 * -> we take mdsc->cap_delay_lock
421 */ 421 */
422 static void __cap_delay_requeue(struct ceph_mds_client *mdsc, 422 static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
423 struct ceph_inode_info *ci) 423 struct ceph_inode_info *ci)
424 { 424 {
425 __cap_set_timeouts(mdsc, ci); 425 __cap_set_timeouts(mdsc, ci);
426 dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode, 426 dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
427 ci->i_ceph_flags, ci->i_hold_caps_max); 427 ci->i_ceph_flags, ci->i_hold_caps_max);
428 if (!mdsc->stopping) { 428 if (!mdsc->stopping) {
429 spin_lock(&mdsc->cap_delay_lock); 429 spin_lock(&mdsc->cap_delay_lock);
430 if (!list_empty(&ci->i_cap_delay_list)) { 430 if (!list_empty(&ci->i_cap_delay_list)) {
431 if (ci->i_ceph_flags & CEPH_I_FLUSH) 431 if (ci->i_ceph_flags & CEPH_I_FLUSH)
432 goto no_change; 432 goto no_change;
433 list_del_init(&ci->i_cap_delay_list); 433 list_del_init(&ci->i_cap_delay_list);
434 } 434 }
435 list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); 435 list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
436 no_change: 436 no_change:
437 spin_unlock(&mdsc->cap_delay_lock); 437 spin_unlock(&mdsc->cap_delay_lock);
438 } 438 }
439 } 439 }
440 440
441 /* 441 /*
442 * Queue an inode for immediate writeback. Mark inode with I_FLUSH, 442 * Queue an inode for immediate writeback. Mark inode with I_FLUSH,
443 * indicating we should send a cap message to flush dirty metadata 443 * indicating we should send a cap message to flush dirty metadata
444 * asap, and move to the front of the delayed cap list. 444 * asap, and move to the front of the delayed cap list.
445 */ 445 */
446 static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, 446 static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
447 struct ceph_inode_info *ci) 447 struct ceph_inode_info *ci)
448 { 448 {
449 dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode); 449 dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
450 spin_lock(&mdsc->cap_delay_lock); 450 spin_lock(&mdsc->cap_delay_lock);
451 ci->i_ceph_flags |= CEPH_I_FLUSH; 451 ci->i_ceph_flags |= CEPH_I_FLUSH;
452 if (!list_empty(&ci->i_cap_delay_list)) 452 if (!list_empty(&ci->i_cap_delay_list))
453 list_del_init(&ci->i_cap_delay_list); 453 list_del_init(&ci->i_cap_delay_list);
454 list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list); 454 list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
455 spin_unlock(&mdsc->cap_delay_lock); 455 spin_unlock(&mdsc->cap_delay_lock);
456 } 456 }
457 457
458 /* 458 /*
459 * Cancel delayed work on cap. 459 * Cancel delayed work on cap.
460 * 460 *
461 * Caller must hold i_ceph_lock. 461 * Caller must hold i_ceph_lock.
462 */ 462 */
463 static void __cap_delay_cancel(struct ceph_mds_client *mdsc, 463 static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
464 struct ceph_inode_info *ci) 464 struct ceph_inode_info *ci)
465 { 465 {
466 dout("__cap_delay_cancel %p\n", &ci->vfs_inode); 466 dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
467 if (list_empty(&ci->i_cap_delay_list)) 467 if (list_empty(&ci->i_cap_delay_list))
468 return; 468 return;
469 spin_lock(&mdsc->cap_delay_lock); 469 spin_lock(&mdsc->cap_delay_lock);
470 list_del_init(&ci->i_cap_delay_list); 470 list_del_init(&ci->i_cap_delay_list);
471 spin_unlock(&mdsc->cap_delay_lock); 471 spin_unlock(&mdsc->cap_delay_lock);
472 } 472 }
473 473
474 /* 474 /*
475 * Common issue checks for add_cap, handle_cap_grant. 475 * Common issue checks for add_cap, handle_cap_grant.
476 */ 476 */
477 static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, 477 static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
478 unsigned issued) 478 unsigned issued)
479 { 479 {
480 unsigned had = __ceph_caps_issued(ci, NULL); 480 unsigned had = __ceph_caps_issued(ci, NULL);
481 481
482 /* 482 /*
483 * Each time we receive FILE_CACHE anew, we increment 483 * Each time we receive FILE_CACHE anew, we increment
484 * i_rdcache_gen. 484 * i_rdcache_gen.
485 */ 485 */
486 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && 486 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
487 (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) 487 (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
488 ci->i_rdcache_gen++; 488 ci->i_rdcache_gen++;
489 489
490 /* 490 /*
491 * if we are newly issued FILE_SHARED, clear D_COMPLETE; we 491 * if we are newly issued FILE_SHARED, clear D_COMPLETE; we
492 * don't know what happened to this directory while we didn't 492 * don't know what happened to this directory while we didn't
493 * have the cap. 493 * have the cap.
494 */ 494 */
495 if ((issued & CEPH_CAP_FILE_SHARED) && 495 if ((issued & CEPH_CAP_FILE_SHARED) &&
496 (had & CEPH_CAP_FILE_SHARED) == 0) { 496 (had & CEPH_CAP_FILE_SHARED) == 0) {
497 ci->i_shared_gen++; 497 ci->i_shared_gen++;
498 if (S_ISDIR(ci->vfs_inode.i_mode)) 498 if (S_ISDIR(ci->vfs_inode.i_mode))
499 ceph_dir_clear_complete(&ci->vfs_inode); 499 ceph_dir_clear_complete(&ci->vfs_inode);
500 } 500 }
501 } 501 }
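
__check_cap_issue() uses generation counters for cheap bulk invalidation: when FILE_CACHE (or FILE_SHARED) is granted anew rather than held continuously, bumping i_rdcache_gen (or i_shared_gen) marks everything tagged with the old generation as stale. A sketch of the idiom, with an illustrative flag value:

#include <stdio.h>

#define FILE_CACHE 0x1

static unsigned rdcache_gen;

/* bump the generation only on a fresh grant: the bit is present in
 * "issued" but was absent from "had" */
static void check_issue(unsigned issued, unsigned had)
{
    if ((issued & FILE_CACHE) && !(had & FILE_CACHE))
        rdcache_gen++;
}

int main(void)
{
    check_issue(FILE_CACHE, 0);              /* newly issued: bumps */
    check_issue(FILE_CACHE, FILE_CACHE);     /* still held: no bump */
    printf("rdcache_gen=%u\n", rdcache_gen); /* prints 1 */
    return 0;
}
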
502 502
503 /* 503 /*
504 * Add a capability under the given MDS session. 504 * Add a capability under the given MDS session.
505 * 505 *
506 * Caller should hold session snap_rwsem (read) and s_mutex. 506 * Caller should hold session snap_rwsem (read) and s_mutex.
507 * 507 *
508 * @fmode is the open file mode, if we are opening a file, otherwise 508 * @fmode is the open file mode, if we are opening a file, otherwise
509 * it is < 0. (This is so we can atomically add the cap and add an 509 * it is < 0. (This is so we can atomically add the cap and add an
510 * open file reference to it.) 510 * open file reference to it.)
511 */ 511 */
512 int ceph_add_cap(struct inode *inode, 512 int ceph_add_cap(struct inode *inode,
513 struct ceph_mds_session *session, u64 cap_id, 513 struct ceph_mds_session *session, u64 cap_id,
514 int fmode, unsigned issued, unsigned wanted, 514 int fmode, unsigned issued, unsigned wanted,
515 unsigned seq, unsigned mseq, u64 realmino, int flags, 515 unsigned seq, unsigned mseq, u64 realmino, int flags,
516 struct ceph_cap_reservation *caps_reservation) 516 struct ceph_cap_reservation *caps_reservation)
517 { 517 {
518 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 518 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
519 struct ceph_inode_info *ci = ceph_inode(inode); 519 struct ceph_inode_info *ci = ceph_inode(inode);
520 struct ceph_cap *new_cap = NULL; 520 struct ceph_cap *new_cap = NULL;
521 struct ceph_cap *cap; 521 struct ceph_cap *cap;
522 int mds = session->s_mds; 522 int mds = session->s_mds;
523 int actual_wanted; 523 int actual_wanted;
524 524
525 dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, 525 dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
526 session->s_mds, cap_id, ceph_cap_string(issued), seq); 526 session->s_mds, cap_id, ceph_cap_string(issued), seq);
527 527
528 /* 528 /*
529 * If we are opening the file, include file mode wanted bits 529 * If we are opening the file, include file mode wanted bits
530 * in wanted. 530 * in wanted.
531 */ 531 */
532 if (fmode >= 0) 532 if (fmode >= 0)
533 wanted |= ceph_caps_for_mode(fmode); 533 wanted |= ceph_caps_for_mode(fmode);
534 534
535 retry: 535 retry:
536 spin_lock(&ci->i_ceph_lock); 536 spin_lock(&ci->i_ceph_lock);
537 cap = __get_cap_for_mds(ci, mds); 537 cap = __get_cap_for_mds(ci, mds);
538 if (!cap) { 538 if (!cap) {
539 if (new_cap) { 539 if (new_cap) {
540 cap = new_cap; 540 cap = new_cap;
541 new_cap = NULL; 541 new_cap = NULL;
542 } else { 542 } else {
543 spin_unlock(&ci->i_ceph_lock); 543 spin_unlock(&ci->i_ceph_lock);
544 new_cap = get_cap(mdsc, caps_reservation); 544 new_cap = get_cap(mdsc, caps_reservation);
545 if (new_cap == NULL) 545 if (new_cap == NULL)
546 return -ENOMEM; 546 return -ENOMEM;
547 goto retry; 547 goto retry;
548 } 548 }
549 549
550 cap->issued = 0; 550 cap->issued = 0;
551 cap->implemented = 0; 551 cap->implemented = 0;
552 cap->mds = mds; 552 cap->mds = mds;
553 cap->mds_wanted = 0; 553 cap->mds_wanted = 0;
554 554
555 cap->ci = ci; 555 cap->ci = ci;
556 __insert_cap_node(ci, cap); 556 __insert_cap_node(ci, cap);
557 557
558 /* clear out old exporting info? (i.e. on cap import) */ 558 /* clear out old exporting info? (i.e. on cap import) */
559 if (ci->i_cap_exporting_mds == mds) { 559 if (ci->i_cap_exporting_mds == mds) {
560 ci->i_cap_exporting_issued = 0; 560 ci->i_cap_exporting_issued = 0;
561 ci->i_cap_exporting_mseq = 0; 561 ci->i_cap_exporting_mseq = 0;
562 ci->i_cap_exporting_mds = -1; 562 ci->i_cap_exporting_mds = -1;
563 } 563 }
564 564
565 /* add to session cap list */ 565 /* add to session cap list */
566 cap->session = session; 566 cap->session = session;
567 spin_lock(&session->s_cap_lock); 567 spin_lock(&session->s_cap_lock);
568 list_add_tail(&cap->session_caps, &session->s_caps); 568 list_add_tail(&cap->session_caps, &session->s_caps);
569 session->s_nr_caps++; 569 session->s_nr_caps++;
570 spin_unlock(&session->s_cap_lock); 570 spin_unlock(&session->s_cap_lock);
571 } else if (new_cap) 571 } else if (new_cap)
572 ceph_put_cap(mdsc, new_cap); 572 ceph_put_cap(mdsc, new_cap);
573 573
574 if (!ci->i_snap_realm) { 574 if (!ci->i_snap_realm) {
575 /* 575 /*
576 * add this inode to the appropriate snap realm 576 * add this inode to the appropriate snap realm
577 */ 577 */
578 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, 578 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
579 realmino); 579 realmino);
580 if (realm) { 580 if (realm) {
581 ceph_get_snap_realm(mdsc, realm); 581 ceph_get_snap_realm(mdsc, realm);
582 spin_lock(&realm->inodes_with_caps_lock); 582 spin_lock(&realm->inodes_with_caps_lock);
583 ci->i_snap_realm = realm; 583 ci->i_snap_realm = realm;
584 list_add(&ci->i_snap_realm_item, 584 list_add(&ci->i_snap_realm_item,
585 &realm->inodes_with_caps); 585 &realm->inodes_with_caps);
586 spin_unlock(&realm->inodes_with_caps_lock); 586 spin_unlock(&realm->inodes_with_caps_lock);
587 } else { 587 } else {
588 pr_err("ceph_add_cap: couldn't find snap realm %llx\n", 588 pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
589 realmino); 589 realmino);
590 WARN_ON(!realm); 590 WARN_ON(!realm);
591 } 591 }
592 } 592 }
593 593
594 __check_cap_issue(ci, cap, issued); 594 __check_cap_issue(ci, cap, issued);
595 595
596 /* 596 /*
597 * If we are issued caps we don't want, or the mds' wanted 597 * If we are issued caps we don't want, or the mds' wanted
598 * value appears to be off, queue a check so we'll release 598 * value appears to be off, queue a check so we'll release
599 * later and/or update the mds wanted value. 599 * later and/or update the mds wanted value.
600 */ 600 */
601 actual_wanted = __ceph_caps_wanted(ci); 601 actual_wanted = __ceph_caps_wanted(ci);
602 if ((wanted & ~actual_wanted) || 602 if ((wanted & ~actual_wanted) ||
603 (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) { 603 (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
604 dout(" issued %s, mds wanted %s, actual %s, queueing\n", 604 dout(" issued %s, mds wanted %s, actual %s, queueing\n",
605 ceph_cap_string(issued), ceph_cap_string(wanted), 605 ceph_cap_string(issued), ceph_cap_string(wanted),
606 ceph_cap_string(actual_wanted)); 606 ceph_cap_string(actual_wanted));
607 __cap_delay_requeue(mdsc, ci); 607 __cap_delay_requeue(mdsc, ci);
608 } 608 }
609 609
610 if (flags & CEPH_CAP_FLAG_AUTH) 610 if (flags & CEPH_CAP_FLAG_AUTH)
611 ci->i_auth_cap = cap; 611 ci->i_auth_cap = cap;
612 else if (ci->i_auth_cap == cap) 612 else if (ci->i_auth_cap == cap)
613 ci->i_auth_cap = NULL; 613 ci->i_auth_cap = NULL;
614 614
615 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", 615 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
616 inode, ceph_vinop(inode), cap, ceph_cap_string(issued), 616 inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
617 ceph_cap_string(issued|cap->issued), seq, mds); 617 ceph_cap_string(issued|cap->issued), seq, mds);
618 cap->cap_id = cap_id; 618 cap->cap_id = cap_id;
619 cap->issued = issued; 619 cap->issued = issued;
620 cap->implemented |= issued; 620 cap->implemented |= issued;
621 cap->mds_wanted |= wanted; 621 cap->mds_wanted |= wanted;
622 cap->seq = seq; 622 cap->seq = seq;
623 cap->issue_seq = seq; 623 cap->issue_seq = seq;
624 cap->mseq = mseq; 624 cap->mseq = mseq;
625 cap->cap_gen = session->s_cap_gen; 625 cap->cap_gen = session->s_cap_gen;
626 626
627 if (fmode >= 0) 627 if (fmode >= 0)
628 __ceph_get_fmode(ci, fmode); 628 __ceph_get_fmode(ci, fmode);
629 spin_unlock(&ci->i_ceph_lock); 629 spin_unlock(&ci->i_ceph_lock);
630 wake_up_all(&ci->i_cap_wq); 630 wake_up_all(&ci->i_cap_wq);
631 return 0; 631 return 0;
632 } 632 }
633 633
634 /* 634 /*
635 * Return true if cap has not timed out and belongs to the current 635 * Return true if cap has not timed out and belongs to the current
636 * generation of the MDS session (i.e. has not gone 'stale' due to 636 * generation of the MDS session (i.e. has not gone 'stale' due to
637 * us losing touch with the mds). 637 * us losing touch with the mds).
638 */ 638 */
639 static int __cap_is_valid(struct ceph_cap *cap) 639 static int __cap_is_valid(struct ceph_cap *cap)
640 { 640 {
641 unsigned long ttl; 641 unsigned long ttl;
642 u32 gen; 642 u32 gen;
643 643
644 spin_lock(&cap->session->s_cap_lock); 644 spin_lock(&cap->session->s_gen_ttl_lock);
645 gen = cap->session->s_cap_gen; 645 gen = cap->session->s_cap_gen;
646 ttl = cap->session->s_cap_ttl; 646 ttl = cap->session->s_cap_ttl;
647 spin_unlock(&cap->session->s_cap_lock); 647 spin_unlock(&cap->session->s_gen_ttl_lock);
648 648
649 if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { 649 if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
650 dout("__cap_is_valid %p cap %p issued %s " 650 dout("__cap_is_valid %p cap %p issued %s "
651 "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode, 651 "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
652 cap, ceph_cap_string(cap->issued), cap->cap_gen, gen); 652 cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
653 return 0; 653 return 0;
654 } 654 }
655 655
656 return 1; 656 return 1;
657 } 657 }
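
This hunk is the "create a new session lock to avoid lock inversion" change from this merge: the (s_cap_gen, s_cap_ttl) pair moves from s_cap_lock to a dedicated s_gen_ttl_lock, so __cap_is_valid() no longer contends with the session cap-list lock. The pattern is snapshot-then-test: copy the pair under the lock, then judge the cap lock-free. A userspace sketch:

#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t gen_ttl_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned session_gen = 2;
static time_t session_ttl;    /* absolute expiry time */

static int cap_is_valid(unsigned cap_gen)
{
    unsigned gen;
    time_t ttl;

    /* snapshot the pair under its dedicated lock ... */
    pthread_mutex_lock(&gen_ttl_lock);
    gen = session_gen;
    ttl = session_ttl;
    pthread_mutex_unlock(&gen_ttl_lock);

    /* ... then test with no lock held: stale if issued under an
     * older generation or if the session lease has expired */
    return cap_gen >= gen && time(NULL) < ttl;
}

int main(void)
{
    session_ttl = time(NULL) + 60;
    printf("gen1 valid: %d\n", cap_is_valid(1));    /* 0: stale */
    printf("gen2 valid: %d\n", cap_is_valid(2));    /* 1 */
    return 0;
}
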
658 658
659 /* 659 /*
660 * Return set of valid cap bits issued to us. Note that caps time 660 * Return set of valid cap bits issued to us. Note that caps time
661 * out, and may be invalidated in bulk if the client session times out 661 * out, and may be invalidated in bulk if the client session times out
662 * and session->s_cap_gen is bumped. 662 * and session->s_cap_gen is bumped.
663 */ 663 */
664 int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) 664 int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
665 { 665 {
666 int have = ci->i_snap_caps | ci->i_cap_exporting_issued; 666 int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
667 struct ceph_cap *cap; 667 struct ceph_cap *cap;
668 struct rb_node *p; 668 struct rb_node *p;
669 669
670 if (implemented) 670 if (implemented)
671 *implemented = 0; 671 *implemented = 0;
672 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 672 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
673 cap = rb_entry(p, struct ceph_cap, ci_node); 673 cap = rb_entry(p, struct ceph_cap, ci_node);
674 if (!__cap_is_valid(cap)) 674 if (!__cap_is_valid(cap))
675 continue; 675 continue;
676 dout("__ceph_caps_issued %p cap %p issued %s\n", 676 dout("__ceph_caps_issued %p cap %p issued %s\n",
677 &ci->vfs_inode, cap, ceph_cap_string(cap->issued)); 677 &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
678 have |= cap->issued; 678 have |= cap->issued;
679 if (implemented) 679 if (implemented)
680 *implemented |= cap->implemented; 680 *implemented |= cap->implemented;
681 } 681 }
682 return have; 682 return have;
683 } 683 }
684 684
685 /* 685 /*
686 * Get cap bits issued by caps other than @ocap 686 * Get cap bits issued by caps other than @ocap
687 */ 687 */
688 int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap) 688 int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
689 { 689 {
690 int have = ci->i_snap_caps; 690 int have = ci->i_snap_caps;
691 struct ceph_cap *cap; 691 struct ceph_cap *cap;
692 struct rb_node *p; 692 struct rb_node *p;
693 693
694 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 694 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
695 cap = rb_entry(p, struct ceph_cap, ci_node); 695 cap = rb_entry(p, struct ceph_cap, ci_node);
696 if (cap == ocap) 696 if (cap == ocap)
697 continue; 697 continue;
698 if (!__cap_is_valid(cap)) 698 if (!__cap_is_valid(cap))
699 continue; 699 continue;
700 have |= cap->issued; 700 have |= cap->issued;
701 } 701 }
702 return have; 702 return have;
703 } 703 }
704 704
705 /* 705 /*
706 * Move a cap to the end of the LRU (oldest caps at list head, newest 706 * Move a cap to the end of the LRU (oldest caps at list head, newest
707 * at list tail). 707 * at list tail).
708 */ 708 */
709 static void __touch_cap(struct ceph_cap *cap) 709 static void __touch_cap(struct ceph_cap *cap)
710 { 710 {
711 struct ceph_mds_session *s = cap->session; 711 struct ceph_mds_session *s = cap->session;
712 712
713 spin_lock(&s->s_cap_lock); 713 spin_lock(&s->s_cap_lock);
714 if (s->s_cap_iterator == NULL) { 714 if (s->s_cap_iterator == NULL) {
715 dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap, 715 dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
716 s->s_mds); 716 s->s_mds);
717 list_move_tail(&cap->session_caps, &s->s_caps); 717 list_move_tail(&cap->session_caps, &s->s_caps);
718 } else { 718 } else {
719 dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n", 719 dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
720 &cap->ci->vfs_inode, cap, s->s_mds); 720 &cap->ci->vfs_inode, cap, s->s_mds);
721 } 721 }
722 spin_unlock(&s->s_cap_lock); 722 spin_unlock(&s->s_cap_lock);
723 } 723 }
724 724
725 /* 725 /*
726 * Check if we hold the given mask. If so, move the cap(s) to the 726 * Check if we hold the given mask. If so, move the cap(s) to the
727 * front of their respective LRUs. (This is the preferred way for 727 * front of their respective LRUs. (This is the preferred way for
728 * callers to check for caps they want.) 728 * callers to check for caps they want.)
729 */ 729 */
730 int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) 730 int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
731 { 731 {
732 struct ceph_cap *cap; 732 struct ceph_cap *cap;
733 struct rb_node *p; 733 struct rb_node *p;
734 int have = ci->i_snap_caps; 734 int have = ci->i_snap_caps;
735 735
736 if ((have & mask) == mask) { 736 if ((have & mask) == mask) {
737 dout("__ceph_caps_issued_mask %p snap issued %s" 737 dout("__ceph_caps_issued_mask %p snap issued %s"
738 " (mask %s)\n", &ci->vfs_inode, 738 " (mask %s)\n", &ci->vfs_inode,
739 ceph_cap_string(have), 739 ceph_cap_string(have),
740 ceph_cap_string(mask)); 740 ceph_cap_string(mask));
741 return 1; 741 return 1;
742 } 742 }
743 743
744 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 744 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
745 cap = rb_entry(p, struct ceph_cap, ci_node); 745 cap = rb_entry(p, struct ceph_cap, ci_node);
746 if (!__cap_is_valid(cap)) 746 if (!__cap_is_valid(cap))
747 continue; 747 continue;
748 if ((cap->issued & mask) == mask) { 748 if ((cap->issued & mask) == mask) {
749 dout("__ceph_caps_issued_mask %p cap %p issued %s" 749 dout("__ceph_caps_issued_mask %p cap %p issued %s"
750 " (mask %s)\n", &ci->vfs_inode, cap, 750 " (mask %s)\n", &ci->vfs_inode, cap,
751 ceph_cap_string(cap->issued), 751 ceph_cap_string(cap->issued),
752 ceph_cap_string(mask)); 752 ceph_cap_string(mask));
753 if (touch) 753 if (touch)
754 __touch_cap(cap); 754 __touch_cap(cap);
755 return 1; 755 return 1;
756 } 756 }
757 757
758 /* does a combination of caps satisfy mask? */ 758 /* does a combination of caps satisfy mask? */
759 have |= cap->issued; 759 have |= cap->issued;
760 if ((have & mask) == mask) { 760 if ((have & mask) == mask) {
761 dout("__ceph_caps_issued_mask %p combo issued %s" 761 dout("__ceph_caps_issued_mask %p combo issued %s"
762 " (mask %s)\n", &ci->vfs_inode, 762 " (mask %s)\n", &ci->vfs_inode,
763 ceph_cap_string(cap->issued), 763 ceph_cap_string(cap->issued),
764 ceph_cap_string(mask)); 764 ceph_cap_string(mask));
765 if (touch) { 765 if (touch) {
766 struct rb_node *q; 766 struct rb_node *q;
767 767
768 /* touch this + preceding caps */ 768 /* touch this + preceding caps */
769 __touch_cap(cap); 769 __touch_cap(cap);
770 for (q = rb_first(&ci->i_caps); q != p; 770 for (q = rb_first(&ci->i_caps); q != p;
771 q = rb_next(q)) { 771 q = rb_next(q)) {
772 cap = rb_entry(q, struct ceph_cap, 772 cap = rb_entry(q, struct ceph_cap,
773 ci_node); 773 ci_node);
774 if (!__cap_is_valid(cap)) 774 if (!__cap_is_valid(cap))
775 continue; 775 continue;
776 __touch_cap(cap); 776 __touch_cap(cap);
777 } 777 }
778 } 778 }
779 return 1; 779 return 1;
780 } 780 }
781 } 781 }
782 782
783 return 0; 783 return 0;
784 } 784 }
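
__ceph_caps_issued_mask() answers "do we hold all bits in mask?" in two steps: first it checks whether any single cap covers the whole mask, then it falls back to the running union of every valid cap seen so far. A sketch of that two-step test, with illustrative bit values:

#include <stdio.h>

int main(void)
{
    unsigned caps[] = { 0x1, 0x4 };    /* no single cap has both bits */
    unsigned mask = 0x5, have = 0;
    int i, ok = 0;

    for (i = 0; i < 2; i++) {
        if ((caps[i] & mask) == mask) {
            ok = 1;    /* one cap covers the whole mask */
            break;
        }
        have |= caps[i];    /* does a combination satisfy it? */
        if ((have & mask) == mask) {
            ok = 1;
            break;
        }
    }
    printf("mask satisfied: %s\n", ok ? "yes" : "no");    /* yes */
    return 0;
}
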
785 785
786 /* 786 /*
787 * Return true if mask caps are currently being revoked by an MDS. 787 * Return true if mask caps are currently being revoked by an MDS.
788 */ 788 */
789 int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) 789 int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
790 { 790 {
791 struct inode *inode = &ci->vfs_inode; 791 struct inode *inode = &ci->vfs_inode;
792 struct ceph_cap *cap; 792 struct ceph_cap *cap;
793 struct rb_node *p; 793 struct rb_node *p;
794 int ret = 0; 794 int ret = 0;
795 795
796 spin_lock(&ci->i_ceph_lock); 796 spin_lock(&ci->i_ceph_lock);
797 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 797 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
798 cap = rb_entry(p, struct ceph_cap, ci_node); 798 cap = rb_entry(p, struct ceph_cap, ci_node);
799 if (__cap_is_valid(cap) && 799 if (__cap_is_valid(cap) &&
800 (cap->implemented & ~cap->issued & mask)) { 800 (cap->implemented & ~cap->issued & mask)) {
801 ret = 1; 801 ret = 1;
802 break; 802 break;
803 } 803 }
804 } 804 }
805 spin_unlock(&ci->i_ceph_lock); 805 spin_unlock(&ci->i_ceph_lock);
806 dout("ceph_caps_revoking %p %s = %d\n", inode, 806 dout("ceph_caps_revoking %p %s = %d\n", inode,
807 ceph_cap_string(mask), ret); 807 ceph_cap_string(mask), ret);
808 return ret; 808 return ret;
809 } 809 }
810 810
811 int __ceph_caps_used(struct ceph_inode_info *ci) 811 int __ceph_caps_used(struct ceph_inode_info *ci)
812 { 812 {
813 int used = 0; 813 int used = 0;
814 if (ci->i_pin_ref) 814 if (ci->i_pin_ref)
815 used |= CEPH_CAP_PIN; 815 used |= CEPH_CAP_PIN;
816 if (ci->i_rd_ref) 816 if (ci->i_rd_ref)
817 used |= CEPH_CAP_FILE_RD; 817 used |= CEPH_CAP_FILE_RD;
818 if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages) 818 if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages)
819 used |= CEPH_CAP_FILE_CACHE; 819 used |= CEPH_CAP_FILE_CACHE;
820 if (ci->i_wr_ref) 820 if (ci->i_wr_ref)
821 used |= CEPH_CAP_FILE_WR; 821 used |= CEPH_CAP_FILE_WR;
822 if (ci->i_wb_ref || ci->i_wrbuffer_ref) 822 if (ci->i_wb_ref || ci->i_wrbuffer_ref)
823 used |= CEPH_CAP_FILE_BUFFER; 823 used |= CEPH_CAP_FILE_BUFFER;
824 return used; 824 return used;
825 } 825 }
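Each of the reference counters checked above pins exactly one capability bit. As a standalone illustration (placeholder constants and hypothetical names, not the kernel's definitions), the mapping boils down to:

	#include <stdio.h>

	/* Bit values here are illustrative placeholders, not the kernel's. */
	#define CAP_PIN		0x01
	#define CAP_FILE_RD	0x02
	#define CAP_FILE_CACHE	0x04
	#define CAP_FILE_WR	0x08
	#define CAP_FILE_BUFFER	0x10

	struct model_inode {
		int pin_ref, rd_ref, rdcache_ref, wr_ref, wb_ref;
		unsigned long nrpages;
	};

	/* Mirrors the mapping above: any live reference keeps its bit "used". */
	static int model_caps_used(const struct model_inode *ci)
	{
		int used = 0;

		if (ci->pin_ref)
			used |= CAP_PIN;
		if (ci->rd_ref)
			used |= CAP_FILE_RD;
		if (ci->rdcache_ref || ci->nrpages)
			used |= CAP_FILE_CACHE;	/* cached pages alone pin FILE_CACHE */
		if (ci->wr_ref)
			used |= CAP_FILE_WR;
		if (ci->wb_ref)
			used |= CAP_FILE_BUFFER;
		return used;
	}

	int main(void)
	{
		struct model_inode ci = { .rd_ref = 1, .nrpages = 4 };

		printf("used = 0x%x\n", model_caps_used(&ci));	/* RD | CACHE */
		return 0;
	}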
826 826
827 /* 827 /*
828 * wanted, by virtue of open file modes 828 * wanted, by virtue of open file modes
829 */ 829 */
830 int __ceph_caps_file_wanted(struct ceph_inode_info *ci) 830 int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
831 { 831 {
832 int want = 0; 832 int want = 0;
833 int mode; 833 int mode;
834 for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++) 834 for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++)
835 if (ci->i_nr_by_mode[mode]) 835 if (ci->i_nr_by_mode[mode])
836 want |= ceph_caps_for_mode(mode); 836 want |= ceph_caps_for_mode(mode);
837 return want; 837 return want;
838 } 838 }
839 839
840 /* 840 /*
841 * Return caps we have registered with the MDS(s) as 'wanted'. 841 * Return caps we have registered with the MDS(s) as 'wanted'.
842 */ 842 */
843 int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) 843 int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
844 { 844 {
845 struct ceph_cap *cap; 845 struct ceph_cap *cap;
846 struct rb_node *p; 846 struct rb_node *p;
847 int mds_wanted = 0; 847 int mds_wanted = 0;
848 848
849 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 849 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
850 cap = rb_entry(p, struct ceph_cap, ci_node); 850 cap = rb_entry(p, struct ceph_cap, ci_node);
851 if (!__cap_is_valid(cap)) 851 if (!__cap_is_valid(cap))
852 continue; 852 continue;
853 mds_wanted |= cap->mds_wanted; 853 mds_wanted |= cap->mds_wanted;
854 } 854 }
855 return mds_wanted; 855 return mds_wanted;
856 } 856 }
857 857
858 /* 858 /*
859 * called under i_ceph_lock 859 * called under i_ceph_lock
860 */ 860 */
861 static int __ceph_is_any_caps(struct ceph_inode_info *ci) 861 static int __ceph_is_any_caps(struct ceph_inode_info *ci)
862 { 862 {
863 return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; 863 return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
864 } 864 }
865 865
866 /* 866 /*
867 * Remove a cap. Take steps to deal with a racing iterate_session_caps. 867 * Remove a cap. Take steps to deal with a racing iterate_session_caps.
868 * 868 *
869 * caller should hold i_ceph_lock. 869 * caller should hold i_ceph_lock.
870 * caller will not hold session s_mutex if called from destroy_inode. 870 * caller will not hold session s_mutex if called from destroy_inode.
871 */ 871 */
872 void __ceph_remove_cap(struct ceph_cap *cap) 872 void __ceph_remove_cap(struct ceph_cap *cap)
873 { 873 {
874 struct ceph_mds_session *session = cap->session; 874 struct ceph_mds_session *session = cap->session;
875 struct ceph_inode_info *ci = cap->ci; 875 struct ceph_inode_info *ci = cap->ci;
876 struct ceph_mds_client *mdsc = 876 struct ceph_mds_client *mdsc =
877 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; 877 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
878 int removed = 0; 878 int removed = 0;
879 879
880 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); 880 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
881 881
882 /* remove from session list */ 882 /* remove from session list */
883 spin_lock(&session->s_cap_lock); 883 spin_lock(&session->s_cap_lock);
884 if (session->s_cap_iterator == cap) { 884 if (session->s_cap_iterator == cap) {
885 /* not yet, we are iterating over this very cap */ 885 /* not yet, we are iterating over this very cap */
886 dout("__ceph_remove_cap delaying %p removal from session %p\n", 886 dout("__ceph_remove_cap delaying %p removal from session %p\n",
887 cap, cap->session); 887 cap, cap->session);
888 } else { 888 } else {
889 list_del_init(&cap->session_caps); 889 list_del_init(&cap->session_caps);
890 session->s_nr_caps--; 890 session->s_nr_caps--;
891 cap->session = NULL; 891 cap->session = NULL;
892 removed = 1; 892 removed = 1;
893 } 893 }
894 /* protect backpointer with s_cap_lock: see iterate_session_caps */ 894 /* protect backpointer with s_cap_lock: see iterate_session_caps */
895 cap->ci = NULL; 895 cap->ci = NULL;
896 spin_unlock(&session->s_cap_lock); 896 spin_unlock(&session->s_cap_lock);
897 897
898 /* remove from inode list */ 898 /* remove from inode list */
899 rb_erase(&cap->ci_node, &ci->i_caps); 899 rb_erase(&cap->ci_node, &ci->i_caps);
900 if (ci->i_auth_cap == cap) 900 if (ci->i_auth_cap == cap)
901 ci->i_auth_cap = NULL; 901 ci->i_auth_cap = NULL;
902 902
903 if (removed) 903 if (removed)
904 ceph_put_cap(mdsc, cap); 904 ceph_put_cap(mdsc, cap);
905 905
906 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) { 906 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
907 struct ceph_snap_realm *realm = ci->i_snap_realm; 907 struct ceph_snap_realm *realm = ci->i_snap_realm;
908 spin_lock(&realm->inodes_with_caps_lock); 908 spin_lock(&realm->inodes_with_caps_lock);
909 list_del_init(&ci->i_snap_realm_item); 909 list_del_init(&ci->i_snap_realm_item);
910 ci->i_snap_realm_counter++; 910 ci->i_snap_realm_counter++;
911 ci->i_snap_realm = NULL; 911 ci->i_snap_realm = NULL;
912 spin_unlock(&realm->inodes_with_caps_lock); 912 spin_unlock(&realm->inodes_with_caps_lock);
913 ceph_put_snap_realm(mdsc, realm); 913 ceph_put_snap_realm(mdsc, realm);
914 } 914 }
915 if (!__ceph_is_any_real_caps(ci)) 915 if (!__ceph_is_any_real_caps(ci))
916 __cap_delay_cancel(mdsc, ci); 916 __cap_delay_cancel(mdsc, ci);
917 } 917 }
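The s_cap_iterator test above is an instance of a general deferred-removal pattern: if a concurrent iterator is parked on the element, only mark it and let the iterator complete the unlink. A minimal single-threaded sketch of that pattern, with hypothetical names and locking elided:

	struct node {
		struct node *next;
		int removed;		/* deferred-removal flag */
	};

	struct iter_ctx {
		struct node *cursor;	/* node a concurrent iterator is parked on */
	};

	/* Remove n from a singly linked list headed at *headp, unless an
	 * iterator is parked on it; in that case only mark it and let the
	 * iterator finish the unlink when it moves on. */
	static int remove_or_defer(struct node **headp, struct node *n,
				   struct iter_ctx *it)
	{
		struct node **pp;

		if (it->cursor == n) {
			n->removed = 1;
			return 0;	/* deferred */
		}
		for (pp = headp; *pp; pp = &(*pp)->next) {
			if (*pp == n) {
				*pp = n->next;
				break;
			}
		}
		return 1;		/* unlinked now */
	}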
918 918
919 /* 919 /*
920 * Build and send a cap message to the given MDS. 920 * Build and send a cap message to the given MDS.
921 * 921 *
922 * Caller should be holding s_mutex. 922 * Caller should be holding s_mutex.
923 */ 923 */
924 static int send_cap_msg(struct ceph_mds_session *session, 924 static int send_cap_msg(struct ceph_mds_session *session,
925 u64 ino, u64 cid, int op, 925 u64 ino, u64 cid, int op,
926 int caps, int wanted, int dirty, 926 int caps, int wanted, int dirty,
927 u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq, 927 u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
928 u64 size, u64 max_size, 928 u64 size, u64 max_size,
929 struct timespec *mtime, struct timespec *atime, 929 struct timespec *mtime, struct timespec *atime,
930 u64 time_warp_seq, 930 u64 time_warp_seq,
931 uid_t uid, gid_t gid, umode_t mode, 931 uid_t uid, gid_t gid, umode_t mode,
932 u64 xattr_version, 932 u64 xattr_version,
933 struct ceph_buffer *xattrs_buf, 933 struct ceph_buffer *xattrs_buf,
934 u64 follows) 934 u64 follows)
935 { 935 {
936 struct ceph_mds_caps *fc; 936 struct ceph_mds_caps *fc;
937 struct ceph_msg *msg; 937 struct ceph_msg *msg;
938 938
939 dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" 939 dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
940 " seq %u/%u mseq %u follows %lld size %llu/%llu" 940 " seq %u/%u mseq %u follows %lld size %llu/%llu"
941 " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op), 941 " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
942 cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted), 942 cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
943 ceph_cap_string(dirty), 943 ceph_cap_string(dirty),
944 seq, issue_seq, mseq, follows, size, max_size, 944 seq, issue_seq, mseq, follows, size, max_size,
945 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); 945 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
946 946
947 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false); 947 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false);
948 if (!msg) 948 if (!msg)
949 return -ENOMEM; 949 return -ENOMEM;
950 950
951 msg->hdr.tid = cpu_to_le64(flush_tid); 951 msg->hdr.tid = cpu_to_le64(flush_tid);
952 952
953 fc = msg->front.iov_base; 953 fc = msg->front.iov_base;
954 memset(fc, 0, sizeof(*fc)); 954 memset(fc, 0, sizeof(*fc));
955 955
956 fc->cap_id = cpu_to_le64(cid); 956 fc->cap_id = cpu_to_le64(cid);
957 fc->op = cpu_to_le32(op); 957 fc->op = cpu_to_le32(op);
958 fc->seq = cpu_to_le32(seq); 958 fc->seq = cpu_to_le32(seq);
959 fc->issue_seq = cpu_to_le32(issue_seq); 959 fc->issue_seq = cpu_to_le32(issue_seq);
960 fc->migrate_seq = cpu_to_le32(mseq); 960 fc->migrate_seq = cpu_to_le32(mseq);
961 fc->caps = cpu_to_le32(caps); 961 fc->caps = cpu_to_le32(caps);
962 fc->wanted = cpu_to_le32(wanted); 962 fc->wanted = cpu_to_le32(wanted);
963 fc->dirty = cpu_to_le32(dirty); 963 fc->dirty = cpu_to_le32(dirty);
964 fc->ino = cpu_to_le64(ino); 964 fc->ino = cpu_to_le64(ino);
965 fc->snap_follows = cpu_to_le64(follows); 965 fc->snap_follows = cpu_to_le64(follows);
966 966
967 fc->size = cpu_to_le64(size); 967 fc->size = cpu_to_le64(size);
968 fc->max_size = cpu_to_le64(max_size); 968 fc->max_size = cpu_to_le64(max_size);
969 if (mtime) 969 if (mtime)
970 ceph_encode_timespec(&fc->mtime, mtime); 970 ceph_encode_timespec(&fc->mtime, mtime);
971 if (atime) 971 if (atime)
972 ceph_encode_timespec(&fc->atime, atime); 972 ceph_encode_timespec(&fc->atime, atime);
973 fc->time_warp_seq = cpu_to_le32(time_warp_seq); 973 fc->time_warp_seq = cpu_to_le32(time_warp_seq);
974 974
975 fc->uid = cpu_to_le32(uid); 975 fc->uid = cpu_to_le32(uid);
976 fc->gid = cpu_to_le32(gid); 976 fc->gid = cpu_to_le32(gid);
977 fc->mode = cpu_to_le32(mode); 977 fc->mode = cpu_to_le32(mode);
978 978
979 fc->xattr_version = cpu_to_le64(xattr_version); 979 fc->xattr_version = cpu_to_le64(xattr_version);
980 if (xattrs_buf) { 980 if (xattrs_buf) {
981 msg->middle = ceph_buffer_get(xattrs_buf); 981 msg->middle = ceph_buffer_get(xattrs_buf);
982 fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len); 982 fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
983 msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len); 983 msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
984 } 984 }
985 985
986 ceph_con_send(&session->s_con, msg); 986 ceph_con_send(&session->s_con, msg);
987 return 0; 987 return 0;
988 } 988 }
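All multi-byte fields above pass through cpu_to_le32()/cpu_to_le64() because the wire format is little-endian regardless of host byte order. For readers unfamiliar with those helpers, this standalone function models the effect for the 32-bit case (an illustration, not the kernel implementation):

	#include <stdint.h>

	/* What cpu_to_le32() guarantees: the value is laid out
	 * least-significant byte first on every host. */
	static void put_le32(uint8_t *buf, uint32_t v)
	{
		buf[0] = (uint8_t)(v & 0xff);
		buf[1] = (uint8_t)((v >> 8) & 0xff);
		buf[2] = (uint8_t)((v >> 16) & 0xff);
		buf[3] = (uint8_t)((v >> 24) & 0xff);
	}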
989 989
990 static void __queue_cap_release(struct ceph_mds_session *session, 990 static void __queue_cap_release(struct ceph_mds_session *session,
991 u64 ino, u64 cap_id, u32 migrate_seq, 991 u64 ino, u64 cap_id, u32 migrate_seq,
992 u32 issue_seq) 992 u32 issue_seq)
993 { 993 {
994 struct ceph_msg *msg; 994 struct ceph_msg *msg;
995 struct ceph_mds_cap_release *head; 995 struct ceph_mds_cap_release *head;
996 struct ceph_mds_cap_item *item; 996 struct ceph_mds_cap_item *item;
997 997
998 spin_lock(&session->s_cap_lock); 998 spin_lock(&session->s_cap_lock);
999 BUG_ON(!session->s_num_cap_releases); 999 BUG_ON(!session->s_num_cap_releases);
1000 msg = list_first_entry(&session->s_cap_releases, 1000 msg = list_first_entry(&session->s_cap_releases,
1001 struct ceph_msg, list_head); 1001 struct ceph_msg, list_head);
1002 1002
1003 dout(" adding %llx release to mds%d msg %p (%d left)\n", 1003 dout(" adding %llx release to mds%d msg %p (%d left)\n",
1004 ino, session->s_mds, msg, session->s_num_cap_releases); 1004 ino, session->s_mds, msg, session->s_num_cap_releases);
1005 1005
1006 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); 1006 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1007 head = msg->front.iov_base; 1007 head = msg->front.iov_base;
1008 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1); 1008 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1009 item = msg->front.iov_base + msg->front.iov_len; 1009 item = msg->front.iov_base + msg->front.iov_len;
1010 item->ino = cpu_to_le64(ino); 1010 item->ino = cpu_to_le64(ino);
1011 item->cap_id = cpu_to_le64(cap_id); 1011 item->cap_id = cpu_to_le64(cap_id);
1012 item->migrate_seq = cpu_to_le32(migrate_seq); 1012 item->migrate_seq = cpu_to_le32(migrate_seq);
1013 item->seq = cpu_to_le32(issue_seq); 1013 item->seq = cpu_to_le32(issue_seq);
1014 1014
1015 session->s_num_cap_releases--; 1015 session->s_num_cap_releases--;
1016 1016
1017 msg->front.iov_len += sizeof(*item); 1017 msg->front.iov_len += sizeof(*item);
1018 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { 1018 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1019 dout(" release msg %p full\n", msg); 1019 dout(" release msg %p full\n", msg);
1020 list_move_tail(&msg->list_head, &session->s_cap_releases_done); 1020 list_move_tail(&msg->list_head, &session->s_cap_releases_done);
1021 } else { 1021 } else {
1022 dout(" release msg %p at %d/%d (%d)\n", msg, 1022 dout(" release msg %p at %d/%d (%d)\n", msg,
1023 (int)le32_to_cpu(head->num), 1023 (int)le32_to_cpu(head->num),
1024 (int)CEPH_CAPS_PER_RELEASE, 1024 (int)CEPH_CAPS_PER_RELEASE,
1025 (int)msg->front.iov_len); 1025 (int)msg->front.iov_len);
1026 } 1026 }
1027 spin_unlock(&session->s_cap_lock); 1027 spin_unlock(&session->s_cap_lock);
1028 } 1028 }
1029 1029
1030 /* 1030 /*
1031 * Queue cap releases when an inode is dropped from our cache. Since 1031 * Queue cap releases when an inode is dropped from our cache. Since
1032 * the inode is about to be destroyed, there is no need for i_ceph_lock. 1032 * the inode is about to be destroyed, there is no need for i_ceph_lock.
1033 */ 1033 */
1034 void ceph_queue_caps_release(struct inode *inode) 1034 void ceph_queue_caps_release(struct inode *inode)
1035 { 1035 {
1036 struct ceph_inode_info *ci = ceph_inode(inode); 1036 struct ceph_inode_info *ci = ceph_inode(inode);
1037 struct rb_node *p; 1037 struct rb_node *p;
1038 1038
1039 p = rb_first(&ci->i_caps); 1039 p = rb_first(&ci->i_caps);
1040 while (p) { 1040 while (p) {
1041 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); 1041 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
1042 struct ceph_mds_session *session = cap->session; 1042 struct ceph_mds_session *session = cap->session;
1043 1043
1044 __queue_cap_release(session, ceph_ino(inode), cap->cap_id, 1044 __queue_cap_release(session, ceph_ino(inode), cap->cap_id,
1045 cap->mseq, cap->issue_seq); 1045 cap->mseq, cap->issue_seq);
1046 p = rb_next(p); 1046 p = rb_next(p);
1047 __ceph_remove_cap(cap); 1047 __ceph_remove_cap(cap);
1048 } 1048 }
1049 } 1049 }
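__queue_cap_release() above appends fixed-size records to the head message and rotates it onto the done list once CEPH_CAPS_PER_RELEASE records fit. Stripped to its core, the batching decision is just this (hypothetical names, illustrative capacity):

	#define ITEMS_PER_MSG	32	/* stand-in for CEPH_CAPS_PER_RELEASE */

	struct rel_msg {
		int num;		/* records appended so far */
	};

	/* Append one release record; nonzero means the message just filled
	 * and should rotate to the "done" list, as list_move_tail() does
	 * above. */
	static int rel_msg_add(struct rel_msg *m)
	{
		return ++m->num == ITEMS_PER_MSG;
	}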
1050 1050
1051 /* 1051 /*
1052 * Send a cap msg on the given inode. Update our caps state, then 1052 * Send a cap msg on the given inode. Update our caps state, then
1053 * drop i_ceph_lock and send the message. 1053 * drop i_ceph_lock and send the message.
1054 * 1054 *
1055 * Make note of max_size reported/requested from the mds, and of revoked caps 1055 * Make note of max_size reported/requested from the mds, and of revoked caps
1056 * that have now been implemented. 1056 * that have now been implemented.
1057 * 1057 *
1058 * Make a half-hearted attempt to invalidate the page cache if we are 1058 * Make a half-hearted attempt to invalidate the page cache if we are
1059 * dropping RDCACHE. Note that this will leave behind locked pages 1059 * dropping RDCACHE. Note that this will leave behind locked pages
1060 * that we'll then need to deal with elsewhere. 1060 * that we'll then need to deal with elsewhere.
1061 * 1061 *
1062 * Return non-zero if delayed release, or we experienced an error 1062 * Return non-zero if delayed release, or we experienced an error
1063 * such that the caller should requeue + retry later. 1063 * such that the caller should requeue + retry later.
1064 * 1064 *
1065 * called with i_ceph_lock, then drops it. 1065 * called with i_ceph_lock, then drops it.
1066 * caller should hold snap_rwsem (read), s_mutex. 1066 * caller should hold snap_rwsem (read), s_mutex.
1067 */ 1067 */
1068 static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, 1068 static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1069 int op, int used, int want, int retain, int flushing, 1069 int op, int used, int want, int retain, int flushing,
1070 unsigned *pflush_tid) 1070 unsigned *pflush_tid)
1071 __releases(cap->ci->i_ceph_lock) 1071 __releases(cap->ci->i_ceph_lock)
1072 { 1072 {
1073 struct ceph_inode_info *ci = cap->ci; 1073 struct ceph_inode_info *ci = cap->ci;
1074 struct inode *inode = &ci->vfs_inode; 1074 struct inode *inode = &ci->vfs_inode;
1075 u64 cap_id = cap->cap_id; 1075 u64 cap_id = cap->cap_id;
1076 int held, revoking, dropping, keep; 1076 int held, revoking, dropping, keep;
1077 u64 seq, issue_seq, mseq, time_warp_seq, follows; 1077 u64 seq, issue_seq, mseq, time_warp_seq, follows;
1078 u64 size, max_size; 1078 u64 size, max_size;
1079 struct timespec mtime, atime; 1079 struct timespec mtime, atime;
1080 int wake = 0; 1080 int wake = 0;
1081 umode_t mode; 1081 umode_t mode;
1082 uid_t uid; 1082 uid_t uid;
1083 gid_t gid; 1083 gid_t gid;
1084 struct ceph_mds_session *session; 1084 struct ceph_mds_session *session;
1085 u64 xattr_version = 0; 1085 u64 xattr_version = 0;
1086 struct ceph_buffer *xattr_blob = NULL; 1086 struct ceph_buffer *xattr_blob = NULL;
1087 int delayed = 0; 1087 int delayed = 0;
1088 u64 flush_tid = 0; 1088 u64 flush_tid = 0;
1089 int i; 1089 int i;
1090 int ret; 1090 int ret;
1091 1091
1092 held = cap->issued | cap->implemented; 1092 held = cap->issued | cap->implemented;
1093 revoking = cap->implemented & ~cap->issued; 1093 revoking = cap->implemented & ~cap->issued;
1094 retain &= ~revoking; 1094 retain &= ~revoking;
1095 dropping = cap->issued & ~retain; 1095 dropping = cap->issued & ~retain;
1096 1096
1097 dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n", 1097 dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
1098 inode, cap, cap->session, 1098 inode, cap, cap->session,
1099 ceph_cap_string(held), ceph_cap_string(held & retain), 1099 ceph_cap_string(held), ceph_cap_string(held & retain),
1100 ceph_cap_string(revoking)); 1100 ceph_cap_string(revoking));
1101 BUG_ON((retain & CEPH_CAP_PIN) == 0); 1101 BUG_ON((retain & CEPH_CAP_PIN) == 0);
1102 1102
1103 session = cap->session; 1103 session = cap->session;
1104 1104
1105 /* don't release wanted unless we've waited a bit. */ 1105 /* don't release wanted unless we've waited a bit. */
1106 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && 1106 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1107 time_before(jiffies, ci->i_hold_caps_min)) { 1107 time_before(jiffies, ci->i_hold_caps_min)) {
1108 dout(" delaying issued %s -> %s, wanted %s -> %s on send\n", 1108 dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
1109 ceph_cap_string(cap->issued), 1109 ceph_cap_string(cap->issued),
1110 ceph_cap_string(cap->issued & retain), 1110 ceph_cap_string(cap->issued & retain),
1111 ceph_cap_string(cap->mds_wanted), 1111 ceph_cap_string(cap->mds_wanted),
1112 ceph_cap_string(want)); 1112 ceph_cap_string(want));
1113 want |= cap->mds_wanted; 1113 want |= cap->mds_wanted;
1114 retain |= cap->issued; 1114 retain |= cap->issued;
1115 delayed = 1; 1115 delayed = 1;
1116 } 1116 }
1117 ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH); 1117 ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
1118 1118
1119 cap->issued &= retain; /* drop bits we don't want */ 1119 cap->issued &= retain; /* drop bits we don't want */
1120 if (cap->implemented & ~cap->issued) { 1120 if (cap->implemented & ~cap->issued) {
1121 /* 1121 /*
1122 * Wake up any waiters on wanted -> needed transition. 1122 * Wake up any waiters on wanted -> needed transition.
1123 * This is due to the weird transition from buffered 1123 * This is due to the weird transition from buffered
1124 * to sync IO... we need to flush dirty pages _before_ 1124 * to sync IO... we need to flush dirty pages _before_
1125 * allowing sync writes to avoid reordering. 1125 * allowing sync writes to avoid reordering.
1126 */ 1126 */
1127 wake = 1; 1127 wake = 1;
1128 } 1128 }
1129 cap->implemented &= cap->issued | used; 1129 cap->implemented &= cap->issued | used;
1130 cap->mds_wanted = want; 1130 cap->mds_wanted = want;
1131 1131
1132 if (flushing) { 1132 if (flushing) {
1133 /* 1133 /*
1134 * assign a tid for flush operations so we can avoid 1134 * assign a tid for flush operations so we can avoid
1135 * flush1 -> dirty1 -> flush2 -> flushack1 -> mark 1135 * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
1136 * clean type races. track latest tid for every bit 1136 * clean type races. track latest tid for every bit
1137 * so we can handle flush AxFw, flush Fw, and have the 1137 * so we can handle flush AxFw, flush Fw, and have the
1138 * first ack clean Ax. 1138 * first ack clean Ax.
1139 */ 1139 */
1140 flush_tid = ++ci->i_cap_flush_last_tid; 1140 flush_tid = ++ci->i_cap_flush_last_tid;
1141 if (pflush_tid) 1141 if (pflush_tid)
1142 *pflush_tid = flush_tid; 1142 *pflush_tid = flush_tid;
1143 dout(" cap_flush_tid %d\n", (int)flush_tid); 1143 dout(" cap_flush_tid %d\n", (int)flush_tid);
1144 for (i = 0; i < CEPH_CAP_BITS; i++) 1144 for (i = 0; i < CEPH_CAP_BITS; i++)
1145 if (flushing & (1 << i)) 1145 if (flushing & (1 << i))
1146 ci->i_cap_flush_tid[i] = flush_tid; 1146 ci->i_cap_flush_tid[i] = flush_tid;
1147 1147
1148 follows = ci->i_head_snapc->seq; 1148 follows = ci->i_head_snapc->seq;
1149 } else { 1149 } else {
1150 follows = 0; 1150 follows = 0;
1151 } 1151 }
1152 1152
1153 keep = cap->implemented; 1153 keep = cap->implemented;
1154 seq = cap->seq; 1154 seq = cap->seq;
1155 issue_seq = cap->issue_seq; 1155 issue_seq = cap->issue_seq;
1156 mseq = cap->mseq; 1156 mseq = cap->mseq;
1157 size = inode->i_size; 1157 size = inode->i_size;
1158 ci->i_reported_size = size; 1158 ci->i_reported_size = size;
1159 max_size = ci->i_wanted_max_size; 1159 max_size = ci->i_wanted_max_size;
1160 ci->i_requested_max_size = max_size; 1160 ci->i_requested_max_size = max_size;
1161 mtime = inode->i_mtime; 1161 mtime = inode->i_mtime;
1162 atime = inode->i_atime; 1162 atime = inode->i_atime;
1163 time_warp_seq = ci->i_time_warp_seq; 1163 time_warp_seq = ci->i_time_warp_seq;
1164 uid = inode->i_uid; 1164 uid = inode->i_uid;
1165 gid = inode->i_gid; 1165 gid = inode->i_gid;
1166 mode = inode->i_mode; 1166 mode = inode->i_mode;
1167 1167
1168 if (flushing & CEPH_CAP_XATTR_EXCL) { 1168 if (flushing & CEPH_CAP_XATTR_EXCL) {
1169 __ceph_build_xattrs_blob(ci); 1169 __ceph_build_xattrs_blob(ci);
1170 xattr_blob = ci->i_xattrs.blob; 1170 xattr_blob = ci->i_xattrs.blob;
1171 xattr_version = ci->i_xattrs.version; 1171 xattr_version = ci->i_xattrs.version;
1172 } 1172 }
1173 1173
1174 spin_unlock(&ci->i_ceph_lock); 1174 spin_unlock(&ci->i_ceph_lock);
1175 1175
1176 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, 1176 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
1177 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, 1177 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
1178 size, max_size, &mtime, &atime, time_warp_seq, 1178 size, max_size, &mtime, &atime, time_warp_seq,
1179 uid, gid, mode, xattr_version, xattr_blob, 1179 uid, gid, mode, xattr_version, xattr_blob,
1180 follows); 1180 follows);
1181 if (ret < 0) { 1181 if (ret < 0) {
1182 dout("error sending cap msg, must requeue %p\n", inode); 1182 dout("error sending cap msg, must requeue %p\n", inode);
1183 delayed = 1; 1183 delayed = 1;
1184 } 1184 }
1185 1185
1186 if (wake) 1186 if (wake)
1187 wake_up_all(&ci->i_cap_wq); 1187 wake_up_all(&ci->i_cap_wq);
1188 1188
1189 return delayed; 1189 return delayed;
1190 } 1190 }
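The per-bit tid bookkeeping in __send_cap() is what later lets an ack (and caps_are_flushed(), further below) decide which bits a given reply actually covers. A compact standalone model of both halves, assuming 32 cap bits and using hypothetical names:

	#include <stdint.h>

	#define CAP_BITS	32	/* stands in for CEPH_CAP_BITS */

	struct flush_state {
		uint64_t last_tid;		/* i_cap_flush_last_tid analogue */
		uint64_t bit_tid[CAP_BITS];	/* latest flush tid per cap bit */
		uint32_t flushing;		/* bits currently in flight */
	};

	/* Starting a flush stamps every bit being flushed with one fresh tid. */
	static uint64_t start_flush(struct flush_state *fs, uint32_t bits)
	{
		uint64_t tid = ++fs->last_tid;
		int i;

		for (i = 0; i < CAP_BITS; i++)
			if (bits & (1u << i))
				fs->bit_tid[i] = tid;
		fs->flushing |= bits;
		return tid;
	}

	/* An ack for tid t cleans only bits whose newest flush is <= t, so a
	 * stale ack can never clear a bit re-flushed under a later tid. */
	static void ack_flush(struct flush_state *fs, uint64_t t)
	{
		int i;

		for (i = 0; i < CAP_BITS; i++)
			if ((fs->flushing & (1u << i)) && fs->bit_tid[i] <= t)
				fs->flushing &= ~(1u << i);
	}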
1191 1191
1192 /* 1192 /*
1193 * When a snapshot is taken, clients accumulate dirty metadata on 1193 * When a snapshot is taken, clients accumulate dirty metadata on
1194 * inodes with capabilities in ceph_cap_snaps to describe the file 1194 * inodes with capabilities in ceph_cap_snaps to describe the file
1195 * state at the time the snapshot was taken. This must be flushed 1195 * state at the time the snapshot was taken. This must be flushed
1196 * asynchronously back to the MDS once sync writes complete and dirty 1196 * asynchronously back to the MDS once sync writes complete and dirty
1197 * data is written out. 1197 * data is written out.
1198 * 1198 *
1199 * Unless @again is true, skip cap_snaps that were already sent to 1199 * Unless @again is true, skip cap_snaps that were already sent to
1200 * the MDS (i.e., during this session). 1200 * the MDS (i.e., during this session).
1201 * 1201 *
1202 * Called under i_ceph_lock. Takes s_mutex as needed. 1202 * Called under i_ceph_lock. Takes s_mutex as needed.
1203 */ 1203 */
1204 void __ceph_flush_snaps(struct ceph_inode_info *ci, 1204 void __ceph_flush_snaps(struct ceph_inode_info *ci,
1205 struct ceph_mds_session **psession, 1205 struct ceph_mds_session **psession,
1206 int again) 1206 int again)
1207 __releases(ci->i_ceph_lock) 1207 __releases(ci->i_ceph_lock)
1208 __acquires(ci->i_ceph_lock) 1208 __acquires(ci->i_ceph_lock)
1209 { 1209 {
1210 struct inode *inode = &ci->vfs_inode; 1210 struct inode *inode = &ci->vfs_inode;
1211 int mds; 1211 int mds;
1212 struct ceph_cap_snap *capsnap; 1212 struct ceph_cap_snap *capsnap;
1213 u32 mseq; 1213 u32 mseq;
1214 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 1214 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1215 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold 1215 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
1216 session->s_mutex */ 1216 session->s_mutex */
1217 u64 next_follows = 0; /* keep track of how far we've gotten through the 1217 u64 next_follows = 0; /* keep track of how far we've gotten through the
1218 i_cap_snaps list, and skip these entries next time 1218 i_cap_snaps list, and skip these entries next time
1219 around to avoid an infinite loop */ 1219 around to avoid an infinite loop */
1220 1220
1221 if (psession) 1221 if (psession)
1222 session = *psession; 1222 session = *psession;
1223 1223
1224 dout("__flush_snaps %p\n", inode); 1224 dout("__flush_snaps %p\n", inode);
1225 retry: 1225 retry:
1226 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 1226 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
1227 /* avoid an infinite loop after retry */ 1227 /* avoid an infinite loop after retry */
1228 if (capsnap->follows < next_follows) 1228 if (capsnap->follows < next_follows)
1229 continue; 1229 continue;
1230 /* 1230 /*
1231 * we need to wait for sync writes to complete and for dirty 1231 * we need to wait for sync writes to complete and for dirty
1232 * pages to be written out. 1232 * pages to be written out.
1233 */ 1233 */
1234 if (capsnap->dirty_pages || capsnap->writing) 1234 if (capsnap->dirty_pages || capsnap->writing)
1235 break; 1235 break;
1236 1236
1237 /* 1237 /*
1238 * if cap writeback already occurred, we should have dropped 1238 * if cap writeback already occurred, we should have dropped
1239 * the capsnap in ceph_put_wrbuffer_cap_refs. 1239 * the capsnap in ceph_put_wrbuffer_cap_refs.
1240 */ 1240 */
1241 BUG_ON(capsnap->dirty == 0); 1241 BUG_ON(capsnap->dirty == 0);
1242 1242
1243 /* pick mds, take s_mutex */ 1243 /* pick mds, take s_mutex */
1244 if (ci->i_auth_cap == NULL) { 1244 if (ci->i_auth_cap == NULL) {
1245 dout("no auth cap (migrating?), doing nothing\n"); 1245 dout("no auth cap (migrating?), doing nothing\n");
1246 goto out; 1246 goto out;
1247 } 1247 }
1248 1248
1249 /* only flush each capsnap once */ 1249 /* only flush each capsnap once */
1250 if (!again && !list_empty(&capsnap->flushing_item)) { 1250 if (!again && !list_empty(&capsnap->flushing_item)) {
1251 dout("already flushed %p, skipping\n", capsnap); 1251 dout("already flushed %p, skipping\n", capsnap);
1252 continue; 1252 continue;
1253 } 1253 }
1254 1254
1255 mds = ci->i_auth_cap->session->s_mds; 1255 mds = ci->i_auth_cap->session->s_mds;
1256 mseq = ci->i_auth_cap->mseq; 1256 mseq = ci->i_auth_cap->mseq;
1257 1257
1258 if (session && session->s_mds != mds) { 1258 if (session && session->s_mds != mds) {
1259 dout("oops, wrong session %p mutex\n", session); 1259 dout("oops, wrong session %p mutex\n", session);
1260 mutex_unlock(&session->s_mutex); 1260 mutex_unlock(&session->s_mutex);
1261 ceph_put_mds_session(session); 1261 ceph_put_mds_session(session);
1262 session = NULL; 1262 session = NULL;
1263 } 1263 }
1264 if (!session) { 1264 if (!session) {
1265 spin_unlock(&ci->i_ceph_lock); 1265 spin_unlock(&ci->i_ceph_lock);
1266 mutex_lock(&mdsc->mutex); 1266 mutex_lock(&mdsc->mutex);
1267 session = __ceph_lookup_mds_session(mdsc, mds); 1267 session = __ceph_lookup_mds_session(mdsc, mds);
1268 mutex_unlock(&mdsc->mutex); 1268 mutex_unlock(&mdsc->mutex);
1269 if (session) { 1269 if (session) {
1270 dout("inverting session/ino locks on %p\n", 1270 dout("inverting session/ino locks on %p\n",
1271 session); 1271 session);
1272 mutex_lock(&session->s_mutex); 1272 mutex_lock(&session->s_mutex);
1273 } 1273 }
1274 /* 1274 /*
1275 * if session == NULL, we raced against a cap 1275 * if session == NULL, we raced against a cap
1276 * deletion or migration. retry, and we'll 1276 * deletion or migration. retry, and we'll
1277 * get a better @mds value next time. 1277 * get a better @mds value next time.
1278 */ 1278 */
1279 spin_lock(&ci->i_ceph_lock); 1279 spin_lock(&ci->i_ceph_lock);
1280 goto retry; 1280 goto retry;
1281 } 1281 }
1282 1282
1283 capsnap->flush_tid = ++ci->i_cap_flush_last_tid; 1283 capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
1284 atomic_inc(&capsnap->nref); 1284 atomic_inc(&capsnap->nref);
1285 if (!list_empty(&capsnap->flushing_item)) 1285 if (!list_empty(&capsnap->flushing_item))
1286 list_del_init(&capsnap->flushing_item); 1286 list_del_init(&capsnap->flushing_item);
1287 list_add_tail(&capsnap->flushing_item, 1287 list_add_tail(&capsnap->flushing_item,
1288 &session->s_cap_snaps_flushing); 1288 &session->s_cap_snaps_flushing);
1289 spin_unlock(&ci->i_ceph_lock); 1289 spin_unlock(&ci->i_ceph_lock);
1290 1290
1291 dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", 1291 dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
1292 inode, capsnap, capsnap->follows, capsnap->flush_tid); 1292 inode, capsnap, capsnap->follows, capsnap->flush_tid);
1293 send_cap_msg(session, ceph_vino(inode).ino, 0, 1293 send_cap_msg(session, ceph_vino(inode).ino, 0,
1294 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, 1294 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
1295 capsnap->dirty, 0, capsnap->flush_tid, 0, mseq, 1295 capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
1296 capsnap->size, 0, 1296 capsnap->size, 0,
1297 &capsnap->mtime, &capsnap->atime, 1297 &capsnap->mtime, &capsnap->atime,
1298 capsnap->time_warp_seq, 1298 capsnap->time_warp_seq,
1299 capsnap->uid, capsnap->gid, capsnap->mode, 1299 capsnap->uid, capsnap->gid, capsnap->mode,
1300 capsnap->xattr_version, capsnap->xattr_blob, 1300 capsnap->xattr_version, capsnap->xattr_blob,
1301 capsnap->follows); 1301 capsnap->follows);
1302 1302
1303 next_follows = capsnap->follows + 1; 1303 next_follows = capsnap->follows + 1;
1304 ceph_put_cap_snap(capsnap); 1304 ceph_put_cap_snap(capsnap);
1305 1305
1306 spin_lock(&ci->i_ceph_lock); 1306 spin_lock(&ci->i_ceph_lock);
1307 goto retry; 1307 goto retry;
1308 } 1308 }
1309 1309
1310 /* we flushed them all; remove this inode from the queue */ 1310 /* we flushed them all; remove this inode from the queue */
1311 spin_lock(&mdsc->snap_flush_lock); 1311 spin_lock(&mdsc->snap_flush_lock);
1312 list_del_init(&ci->i_snap_flush_item); 1312 list_del_init(&ci->i_snap_flush_item);
1313 spin_unlock(&mdsc->snap_flush_lock); 1313 spin_unlock(&mdsc->snap_flush_lock);
1314 1314
1315 out: 1315 out:
1316 if (psession) 1316 if (psession)
1317 *psession = session; 1317 *psession = session;
1318 else if (session) { 1318 else if (session) {
1319 mutex_unlock(&session->s_mutex); 1319 mutex_unlock(&session->s_mutex);
1320 ceph_put_mds_session(session); 1320 ceph_put_mds_session(session);
1321 } 1321 }
1322 } 1322 }
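The next_follows cursor above implements a resumable scan: the lock is dropped for every send, so the loop restarts from the head of the list but skips anything below the cursor. In isolation the pattern looks like this (hypothetical types, list sorted by ->follows assumed):

	#include <stdint.h>
	#include <stddef.h>

	struct snap {
		uint64_t follows;
		struct snap *next;
	};

	/* Visit each entry once even though the scan restarts from the head
	 * after every flush (the lock is dropped there, so the list may have
	 * changed); next_follows is the advancing cursor that prevents an
	 * infinite loop. */
	static void scan_snaps(struct snap *head, void (*flush)(struct snap *))
	{
		uint64_t next_follows = 0;
		struct snap *s;

	retry:
		for (s = head; s; s = s->next) {
			if (s->follows < next_follows)
				continue;	/* handled on an earlier pass */
			flush(s);		/* real code: unlock, send, relock */
			next_follows = s->follows + 1;
			goto retry;
		}
	}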
1323 1323
1324 static void ceph_flush_snaps(struct ceph_inode_info *ci) 1324 static void ceph_flush_snaps(struct ceph_inode_info *ci)
1325 { 1325 {
1326 spin_lock(&ci->i_ceph_lock); 1326 spin_lock(&ci->i_ceph_lock);
1327 __ceph_flush_snaps(ci, NULL, 0); 1327 __ceph_flush_snaps(ci, NULL, 0);
1328 spin_unlock(&ci->i_ceph_lock); 1328 spin_unlock(&ci->i_ceph_lock);
1329 } 1329 }
1330 1330
1331 /* 1331 /*
1332 * Mark caps dirty. If inode is newly dirty, return the dirty flags. 1332 * Mark caps dirty. If inode is newly dirty, return the dirty flags.
1333 * Caller is then responsible for calling __mark_inode_dirty with the 1333 * Caller is then responsible for calling __mark_inode_dirty with the
1334 * returned flags value. 1334 * returned flags value.
1335 */ 1335 */
1336 int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) 1336 int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1337 { 1337 {
1338 struct ceph_mds_client *mdsc = 1338 struct ceph_mds_client *mdsc =
1339 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; 1339 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
1340 struct inode *inode = &ci->vfs_inode; 1340 struct inode *inode = &ci->vfs_inode;
1341 int was = ci->i_dirty_caps; 1341 int was = ci->i_dirty_caps;
1342 int dirty = 0; 1342 int dirty = 0;
1343 1343
1344 dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode, 1344 dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
1345 ceph_cap_string(mask), ceph_cap_string(was), 1345 ceph_cap_string(mask), ceph_cap_string(was),
1346 ceph_cap_string(was | mask)); 1346 ceph_cap_string(was | mask));
1347 ci->i_dirty_caps |= mask; 1347 ci->i_dirty_caps |= mask;
1348 if (was == 0) { 1348 if (was == 0) {
1349 if (!ci->i_head_snapc) 1349 if (!ci->i_head_snapc)
1350 ci->i_head_snapc = ceph_get_snap_context( 1350 ci->i_head_snapc = ceph_get_snap_context(
1351 ci->i_snap_realm->cached_context); 1351 ci->i_snap_realm->cached_context);
1352 dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode, 1352 dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode,
1353 ci->i_head_snapc); 1353 ci->i_head_snapc);
1354 BUG_ON(!list_empty(&ci->i_dirty_item)); 1354 BUG_ON(!list_empty(&ci->i_dirty_item));
1355 spin_lock(&mdsc->cap_dirty_lock); 1355 spin_lock(&mdsc->cap_dirty_lock);
1356 list_add(&ci->i_dirty_item, &mdsc->cap_dirty); 1356 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1357 spin_unlock(&mdsc->cap_dirty_lock); 1357 spin_unlock(&mdsc->cap_dirty_lock);
1358 if (ci->i_flushing_caps == 0) { 1358 if (ci->i_flushing_caps == 0) {
1359 ihold(inode); 1359 ihold(inode);
1360 dirty |= I_DIRTY_SYNC; 1360 dirty |= I_DIRTY_SYNC;
1361 } 1361 }
1362 } 1362 }
1363 BUG_ON(list_empty(&ci->i_dirty_item)); 1363 BUG_ON(list_empty(&ci->i_dirty_item));
1364 if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && 1364 if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
1365 (mask & CEPH_CAP_FILE_BUFFER)) 1365 (mask & CEPH_CAP_FILE_BUFFER))
1366 dirty |= I_DIRTY_DATASYNC; 1366 dirty |= I_DIRTY_DATASYNC;
1367 __cap_delay_requeue(mdsc, ci); 1367 __cap_delay_requeue(mdsc, ci);
1368 return dirty; 1368 return dirty;
1369 } 1369 }
1370 1370
1371 /* 1371 /*
1372 * Add the dirty inode to the flushing list. Assign a seq number so we 1372 * Add the dirty inode to the flushing list. Assign a seq number so we
1373 * can wait for caps to flush without starving. 1373 * can wait for caps to flush without starving.
1374 * 1374 *
1375 * Called under i_ceph_lock. 1375 * Called under i_ceph_lock.
1376 */ 1376 */
1377 static int __mark_caps_flushing(struct inode *inode, 1377 static int __mark_caps_flushing(struct inode *inode,
1378 struct ceph_mds_session *session) 1378 struct ceph_mds_session *session)
1379 { 1379 {
1380 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 1380 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
1381 struct ceph_inode_info *ci = ceph_inode(inode); 1381 struct ceph_inode_info *ci = ceph_inode(inode);
1382 int flushing; 1382 int flushing;
1383 1383
1384 BUG_ON(ci->i_dirty_caps == 0); 1384 BUG_ON(ci->i_dirty_caps == 0);
1385 BUG_ON(list_empty(&ci->i_dirty_item)); 1385 BUG_ON(list_empty(&ci->i_dirty_item));
1386 1386
1387 flushing = ci->i_dirty_caps; 1387 flushing = ci->i_dirty_caps;
1388 dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n", 1388 dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
1389 ceph_cap_string(flushing), 1389 ceph_cap_string(flushing),
1390 ceph_cap_string(ci->i_flushing_caps), 1390 ceph_cap_string(ci->i_flushing_caps),
1391 ceph_cap_string(ci->i_flushing_caps | flushing)); 1391 ceph_cap_string(ci->i_flushing_caps | flushing));
1392 ci->i_flushing_caps |= flushing; 1392 ci->i_flushing_caps |= flushing;
1393 ci->i_dirty_caps = 0; 1393 ci->i_dirty_caps = 0;
1394 dout(" inode %p now !dirty\n", inode); 1394 dout(" inode %p now !dirty\n", inode);
1395 1395
1396 spin_lock(&mdsc->cap_dirty_lock); 1396 spin_lock(&mdsc->cap_dirty_lock);
1397 list_del_init(&ci->i_dirty_item); 1397 list_del_init(&ci->i_dirty_item);
1398 1398
1399 ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; 1399 ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
1400 if (list_empty(&ci->i_flushing_item)) { 1400 if (list_empty(&ci->i_flushing_item)) {
1401 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); 1401 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1402 mdsc->num_cap_flushing++; 1402 mdsc->num_cap_flushing++;
1403 dout(" inode %p now flushing seq %lld\n", inode, 1403 dout(" inode %p now flushing seq %lld\n", inode,
1404 ci->i_cap_flush_seq); 1404 ci->i_cap_flush_seq);
1405 } else { 1405 } else {
1406 list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); 1406 list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1407 dout(" inode %p now flushing (more) seq %lld\n", inode, 1407 dout(" inode %p now flushing (more) seq %lld\n", inode,
1408 ci->i_cap_flush_seq); 1408 ci->i_cap_flush_seq);
1409 } 1409 }
1410 spin_unlock(&mdsc->cap_dirty_lock); 1410 spin_unlock(&mdsc->cap_dirty_lock);
1411 1411
1412 return flushing; 1412 return flushing;
1413 } 1413 }
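Together, __ceph_mark_dirty_caps() and __mark_caps_flushing() form a two-stage dirty -> flushing lifecycle, with one sequence number stamped per flush so waiters can be serviced in order. A reduced model (hypothetical names; list and lock handling elided):

	#include <stdint.h>

	struct cap_state {
		uint32_t dirty;		/* changed locally, not yet sent */
		uint32_t flushing;	/* sent to the auth MDS, awaiting ack */
		uint64_t flush_seq;
	};

	/* Stage 1: a metadata change only sets dirty bits. */
	static void mark_dirty(struct cap_state *cs, uint32_t mask)
	{
		cs->dirty |= mask;
	}

	/* Stage 2: a flush moves the whole dirty set into flushing under one
	 * sequence number, so waiters can be woken in order without
	 * starving. */
	static uint32_t mark_flushing(struct cap_state *cs, uint64_t *global_seq)
	{
		uint32_t f = cs->dirty;

		cs->flushing |= f;
		cs->dirty = 0;
		cs->flush_seq = ++*global_seq;
		return f;
	}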
1414 1414
1415 /* 1415 /*
1416 * try to invalidate mapping pages without blocking. 1416 * try to invalidate mapping pages without blocking.
1417 */ 1417 */
1418 static int try_nonblocking_invalidate(struct inode *inode) 1418 static int try_nonblocking_invalidate(struct inode *inode)
1419 { 1419 {
1420 struct ceph_inode_info *ci = ceph_inode(inode); 1420 struct ceph_inode_info *ci = ceph_inode(inode);
1421 u32 invalidating_gen = ci->i_rdcache_gen; 1421 u32 invalidating_gen = ci->i_rdcache_gen;
1422 1422
1423 spin_unlock(&ci->i_ceph_lock); 1423 spin_unlock(&ci->i_ceph_lock);
1424 invalidate_mapping_pages(&inode->i_data, 0, -1); 1424 invalidate_mapping_pages(&inode->i_data, 0, -1);
1425 spin_lock(&ci->i_ceph_lock); 1425 spin_lock(&ci->i_ceph_lock);
1426 1426
1427 if (inode->i_data.nrpages == 0 && 1427 if (inode->i_data.nrpages == 0 &&
1428 invalidating_gen == ci->i_rdcache_gen) { 1428 invalidating_gen == ci->i_rdcache_gen) {
1429 /* success. */ 1429 /* success. */
1430 dout("try_nonblocking_invalidate %p success\n", inode); 1430 dout("try_nonblocking_invalidate %p success\n", inode);
1431 /* save any racing async invalidate some trouble */ 1431 /* save any racing async invalidate some trouble */
1432 ci->i_rdcache_revoking = ci->i_rdcache_gen - 1; 1432 ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
1433 return 0; 1433 return 0;
1434 } 1434 }
1435 dout("try_nonblocking_invalidate %p failed\n", inode); 1435 dout("try_nonblocking_invalidate %p failed\n", inode);
1436 return -1; 1436 return -1;
1437 } 1437 }
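The generation check above is an optimistic-concurrency pattern: snapshot i_rdcache_gen under the lock, do the non-blocking work unlocked, then declare success only if no read raced in. The pattern in isolation (stub locking, hypothetical names):

	#include <stddef.h>

	struct pagecache {
		unsigned gen;		/* bumped whenever new pages are read in */
		size_t nrpages;
	};

	static void cache_lock(struct pagecache *c)	{ (void)c; }
	static void cache_unlock(struct pagecache *c)	{ (void)c; }
	static void drop_clean_pages(struct pagecache *c) { c->nrpages = 0; }

	/* Optimistic invalidate: snapshot the generation under the lock, do
	 * the work unlocked, then succeed only if nothing raced in. */
	static int try_invalidate(struct pagecache *c)
	{
		unsigned gen = c->gen;

		cache_unlock(c);
		drop_clean_pages(c);
		cache_lock(c);

		if (c->nrpages == 0 && gen == c->gen)
			return 0;	/* success */
		return -1;		/* a read raced in; retry later */
	}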
1438 1438
1439 /* 1439 /*
1440 * Swiss army knife function to examine currently used and wanted 1440 * Swiss army knife function to examine currently used and wanted
1441 * versus held caps. Release, flush, ack revoked caps to mds as 1441 * versus held caps. Release, flush, ack revoked caps to mds as
1442 * appropriate. 1442 * appropriate.
1443 * 1443 *
1444 * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay 1444 * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
1445 * cap release further. 1445 * cap release further.
1446 * CHECK_CAPS_AUTHONLY - we should only check the auth cap 1446 * CHECK_CAPS_AUTHONLY - we should only check the auth cap
1447 * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without 1447 * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
1448 * further delay. 1448 * further delay.
1449 */ 1449 */
1450 void ceph_check_caps(struct ceph_inode_info *ci, int flags, 1450 void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1451 struct ceph_mds_session *session) 1451 struct ceph_mds_session *session)
1452 { 1452 {
1453 struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); 1453 struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
1454 struct ceph_mds_client *mdsc = fsc->mdsc; 1454 struct ceph_mds_client *mdsc = fsc->mdsc;
1455 struct inode *inode = &ci->vfs_inode; 1455 struct inode *inode = &ci->vfs_inode;
1456 struct ceph_cap *cap; 1456 struct ceph_cap *cap;
1457 int file_wanted, used; 1457 int file_wanted, used;
1458 int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ 1458 int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
1459 int issued, implemented, want, retain, revoking, flushing = 0; 1459 int issued, implemented, want, retain, revoking, flushing = 0;
1460 int mds = -1; /* keep track of how far we've gone through i_caps list 1460 int mds = -1; /* keep track of how far we've gone through i_caps list
1461 to avoid an infinite loop on retry */ 1461 to avoid an infinite loop on retry */
1462 struct rb_node *p; 1462 struct rb_node *p;
1463 int tried_invalidate = 0; 1463 int tried_invalidate = 0;
1464 int delayed = 0, sent = 0, force_requeue = 0, num; 1464 int delayed = 0, sent = 0, force_requeue = 0, num;
1465 int queue_invalidate = 0; 1465 int queue_invalidate = 0;
1466 int is_delayed = flags & CHECK_CAPS_NODELAY; 1466 int is_delayed = flags & CHECK_CAPS_NODELAY;
1467 1467
1468 /* if we are unmounting, flush any unused caps immediately. */ 1468 /* if we are unmounting, flush any unused caps immediately. */
1469 if (mdsc->stopping) 1469 if (mdsc->stopping)
1470 is_delayed = 1; 1470 is_delayed = 1;
1471 1471
1472 spin_lock(&ci->i_ceph_lock); 1472 spin_lock(&ci->i_ceph_lock);
1473 1473
1474 if (ci->i_ceph_flags & CEPH_I_FLUSH) 1474 if (ci->i_ceph_flags & CEPH_I_FLUSH)
1475 flags |= CHECK_CAPS_FLUSH; 1475 flags |= CHECK_CAPS_FLUSH;
1476 1476
1477 /* flush snaps first time around only */ 1477 /* flush snaps first time around only */
1478 if (!list_empty(&ci->i_cap_snaps)) 1478 if (!list_empty(&ci->i_cap_snaps))
1479 __ceph_flush_snaps(ci, &session, 0); 1479 __ceph_flush_snaps(ci, &session, 0);
1480 goto retry_locked; 1480 goto retry_locked;
1481 retry: 1481 retry:
1482 spin_lock(&ci->i_ceph_lock); 1482 spin_lock(&ci->i_ceph_lock);
1483 retry_locked: 1483 retry_locked:
1484 file_wanted = __ceph_caps_file_wanted(ci); 1484 file_wanted = __ceph_caps_file_wanted(ci);
1485 used = __ceph_caps_used(ci); 1485 used = __ceph_caps_used(ci);
1486 want = file_wanted | used; 1486 want = file_wanted | used;
1487 issued = __ceph_caps_issued(ci, &implemented); 1487 issued = __ceph_caps_issued(ci, &implemented);
1488 revoking = implemented & ~issued; 1488 revoking = implemented & ~issued;
1489 1489
1490 retain = want | CEPH_CAP_PIN; 1490 retain = want | CEPH_CAP_PIN;
1491 if (!mdsc->stopping && inode->i_nlink > 0) { 1491 if (!mdsc->stopping && inode->i_nlink > 0) {
1492 if (want) { 1492 if (want) {
1493 retain |= CEPH_CAP_ANY; /* be greedy */ 1493 retain |= CEPH_CAP_ANY; /* be greedy */
1494 } else { 1494 } else {
1495 retain |= CEPH_CAP_ANY_SHARED; 1495 retain |= CEPH_CAP_ANY_SHARED;
1496 /* 1496 /*
1497 * keep RD only if we didn't have the file open RW, 1497 * keep RD only if we didn't have the file open RW,
1498 * because then the mds would revoke it anyway to 1498 * because then the mds would revoke it anyway to
1499 * journal max_size=0. 1499 * journal max_size=0.
1500 */ 1500 */
1501 if (ci->i_max_size == 0) 1501 if (ci->i_max_size == 0)
1502 retain |= CEPH_CAP_ANY_RD; 1502 retain |= CEPH_CAP_ANY_RD;
1503 } 1503 }
1504 } 1504 }
1505 1505
1506 dout("check_caps %p file_want %s used %s dirty %s flushing %s" 1506 dout("check_caps %p file_want %s used %s dirty %s flushing %s"
1507 " issued %s revoking %s retain %s %s%s%s\n", inode, 1507 " issued %s revoking %s retain %s %s%s%s\n", inode,
1508 ceph_cap_string(file_wanted), 1508 ceph_cap_string(file_wanted),
1509 ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), 1509 ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
1510 ceph_cap_string(ci->i_flushing_caps), 1510 ceph_cap_string(ci->i_flushing_caps),
1511 ceph_cap_string(issued), ceph_cap_string(revoking), 1511 ceph_cap_string(issued), ceph_cap_string(revoking),
1512 ceph_cap_string(retain), 1512 ceph_cap_string(retain),
1513 (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", 1513 (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
1514 (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "", 1514 (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
1515 (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : ""); 1515 (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
1516 1516
1517 /* 1517 /*
1518 * If we no longer need to hold onto our old caps, and we may 1518 * If we no longer need to hold onto our old caps, and we may
1519 * have cached pages, but don't want them, then try to invalidate. 1519 * have cached pages, but don't want them, then try to invalidate.
1520 * If we fail, it's because pages are locked.... try again later. 1520 * If we fail, it's because pages are locked.... try again later.
1521 */ 1521 */
1522 if ((!is_delayed || mdsc->stopping) && 1522 if ((!is_delayed || mdsc->stopping) &&
1523 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ 1523 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
1524 inode->i_data.nrpages && /* have cached pages */ 1524 inode->i_data.nrpages && /* have cached pages */
1525 (file_wanted == 0 || /* no open files */ 1525 (file_wanted == 0 || /* no open files */
1526 (revoking & (CEPH_CAP_FILE_CACHE| 1526 (revoking & (CEPH_CAP_FILE_CACHE|
1527 CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ 1527 CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
1528 !tried_invalidate) { 1528 !tried_invalidate) {
1529 dout("check_caps trying to invalidate on %p\n", inode); 1529 dout("check_caps trying to invalidate on %p\n", inode);
1530 if (try_nonblocking_invalidate(inode) < 0) { 1530 if (try_nonblocking_invalidate(inode) < 0) {
1531 if (revoking & (CEPH_CAP_FILE_CACHE| 1531 if (revoking & (CEPH_CAP_FILE_CACHE|
1532 CEPH_CAP_FILE_LAZYIO)) { 1532 CEPH_CAP_FILE_LAZYIO)) {
1533 dout("check_caps queuing invalidate\n"); 1533 dout("check_caps queuing invalidate\n");
1534 queue_invalidate = 1; 1534 queue_invalidate = 1;
1535 ci->i_rdcache_revoking = ci->i_rdcache_gen; 1535 ci->i_rdcache_revoking = ci->i_rdcache_gen;
1536 } else { 1536 } else {
1537 dout("check_caps failed to invalidate pages\n"); 1537 dout("check_caps failed to invalidate pages\n");
1538 /* we failed to invalidate pages. check these 1538 /* we failed to invalidate pages. check these
1539 caps again later. */ 1539 caps again later. */
1540 force_requeue = 1; 1540 force_requeue = 1;
1541 __cap_set_timeouts(mdsc, ci); 1541 __cap_set_timeouts(mdsc, ci);
1542 } 1542 }
1543 } 1543 }
1544 tried_invalidate = 1; 1544 tried_invalidate = 1;
1545 goto retry_locked; 1545 goto retry_locked;
1546 } 1546 }
1547 1547
1548 num = 0; 1548 num = 0;
1549 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 1549 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
1550 cap = rb_entry(p, struct ceph_cap, ci_node); 1550 cap = rb_entry(p, struct ceph_cap, ci_node);
1551 num++; 1551 num++;
1552 1552
1553 /* avoid looping forever */ 1553 /* avoid looping forever */
1554 if (mds >= cap->mds || 1554 if (mds >= cap->mds ||
1555 ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap)) 1555 ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
1556 continue; 1556 continue;
1557 1557
1558 /* NOTE: no side-effects allowed, until we take s_mutex */ 1558 /* NOTE: no side-effects allowed, until we take s_mutex */
1559 1559
1560 revoking = cap->implemented & ~cap->issued; 1560 revoking = cap->implemented & ~cap->issued;
1561 dout(" mds%d cap %p issued %s implemented %s revoking %s\n", 1561 dout(" mds%d cap %p issued %s implemented %s revoking %s\n",
1562 cap->mds, cap, ceph_cap_string(cap->issued), 1562 cap->mds, cap, ceph_cap_string(cap->issued),
1563 ceph_cap_string(cap->implemented), 1563 ceph_cap_string(cap->implemented),
1564 ceph_cap_string(revoking)); 1564 ceph_cap_string(revoking));
1565 1565
1566 if (cap == ci->i_auth_cap && 1566 if (cap == ci->i_auth_cap &&
1567 (cap->issued & CEPH_CAP_FILE_WR)) { 1567 (cap->issued & CEPH_CAP_FILE_WR)) {
1568 /* request larger max_size from MDS? */ 1568 /* request larger max_size from MDS? */
1569 if (ci->i_wanted_max_size > ci->i_max_size && 1569 if (ci->i_wanted_max_size > ci->i_max_size &&
1570 ci->i_wanted_max_size > ci->i_requested_max_size) { 1570 ci->i_wanted_max_size > ci->i_requested_max_size) {
1571 dout("requesting new max_size\n"); 1571 dout("requesting new max_size\n");
1572 goto ack; 1572 goto ack;
1573 } 1573 }
1574 1574
1575 /* approaching file_max? */ 1575 /* approaching file_max? */
1576 if ((inode->i_size << 1) >= ci->i_max_size && 1576 if ((inode->i_size << 1) >= ci->i_max_size &&
1577 (ci->i_reported_size << 1) < ci->i_max_size) { 1577 (ci->i_reported_size << 1) < ci->i_max_size) {
1578 dout("i_size approaching max_size\n"); 1578 dout("i_size approaching max_size\n");
1579 goto ack; 1579 goto ack;
1580 } 1580 }
1581 } 1581 }
1582 /* flush anything dirty? */ 1582 /* flush anything dirty? */
1583 if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) && 1583 if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
1584 ci->i_dirty_caps) { 1584 ci->i_dirty_caps) {
1585 dout("flushing dirty caps\n"); 1585 dout("flushing dirty caps\n");
1586 goto ack; 1586 goto ack;
1587 } 1587 }
1588 1588
1589 /* completed revocation? going down and there are no caps? */ 1589 /* completed revocation? going down and there are no caps? */
1590 if (revoking && (revoking & used) == 0) { 1590 if (revoking && (revoking & used) == 0) {
1591 dout("completed revocation of %s\n", 1591 dout("completed revocation of %s\n",
1592 ceph_cap_string(cap->implemented & ~cap->issued)); 1592 ceph_cap_string(cap->implemented & ~cap->issued));
1593 goto ack; 1593 goto ack;
1594 } 1594 }
1595 1595
1596 /* want more caps from mds? */ 1596 /* want more caps from mds? */
1597 if (want & ~(cap->mds_wanted | cap->issued)) 1597 if (want & ~(cap->mds_wanted | cap->issued))
1598 goto ack; 1598 goto ack;
1599 1599
1600 /* things we might delay */ 1600 /* things we might delay */
1601 if ((cap->issued & ~retain) == 0 && 1601 if ((cap->issued & ~retain) == 0 &&
1602 cap->mds_wanted == want) 1602 cap->mds_wanted == want)
1603 continue; /* nope, all good */ 1603 continue; /* nope, all good */
1604 1604
1605 if (is_delayed) 1605 if (is_delayed)
1606 goto ack; 1606 goto ack;
1607 1607
1608 /* delay? */ 1608 /* delay? */
1609 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && 1609 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1610 time_before(jiffies, ci->i_hold_caps_max)) { 1610 time_before(jiffies, ci->i_hold_caps_max)) {
1611 dout(" delaying issued %s -> %s, wanted %s -> %s\n", 1611 dout(" delaying issued %s -> %s, wanted %s -> %s\n",
1612 ceph_cap_string(cap->issued), 1612 ceph_cap_string(cap->issued),
1613 ceph_cap_string(cap->issued & retain), 1613 ceph_cap_string(cap->issued & retain),
1614 ceph_cap_string(cap->mds_wanted), 1614 ceph_cap_string(cap->mds_wanted),
1615 ceph_cap_string(want)); 1615 ceph_cap_string(want));
1616 delayed++; 1616 delayed++;
1617 continue; 1617 continue;
1618 } 1618 }
1619 1619
1620 ack: 1620 ack:
1621 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { 1621 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1622 dout(" skipping %p I_NOFLUSH set\n", inode); 1622 dout(" skipping %p I_NOFLUSH set\n", inode);
1623 continue; 1623 continue;
1624 } 1624 }
1625 1625
1626 if (session && session != cap->session) { 1626 if (session && session != cap->session) {
1627 dout("oops, wrong session %p mutex\n", session); 1627 dout("oops, wrong session %p mutex\n", session);
1628 mutex_unlock(&session->s_mutex); 1628 mutex_unlock(&session->s_mutex);
1629 session = NULL; 1629 session = NULL;
1630 } 1630 }
1631 if (!session) { 1631 if (!session) {
1632 session = cap->session; 1632 session = cap->session;
1633 if (mutex_trylock(&session->s_mutex) == 0) { 1633 if (mutex_trylock(&session->s_mutex) == 0) {
1634 dout("inverting session/ino locks on %p\n", 1634 dout("inverting session/ino locks on %p\n",
1635 session); 1635 session);
1636 spin_unlock(&ci->i_ceph_lock); 1636 spin_unlock(&ci->i_ceph_lock);
1637 if (took_snap_rwsem) { 1637 if (took_snap_rwsem) {
1638 up_read(&mdsc->snap_rwsem); 1638 up_read(&mdsc->snap_rwsem);
1639 took_snap_rwsem = 0; 1639 took_snap_rwsem = 0;
1640 } 1640 }
1641 mutex_lock(&session->s_mutex); 1641 mutex_lock(&session->s_mutex);
1642 goto retry; 1642 goto retry;
1643 } 1643 }
1644 } 1644 }
1645 /* take snap_rwsem after session mutex */ 1645 /* take snap_rwsem after session mutex */
1646 if (!took_snap_rwsem) { 1646 if (!took_snap_rwsem) {
1647 if (down_read_trylock(&mdsc->snap_rwsem) == 0) { 1647 if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
1648 dout("inverting snap/in locks on %p\n", 1648 dout("inverting snap/in locks on %p\n",
1649 inode); 1649 inode);
1650 spin_unlock(&ci->i_ceph_lock); 1650 spin_unlock(&ci->i_ceph_lock);
1651 down_read(&mdsc->snap_rwsem); 1651 down_read(&mdsc->snap_rwsem);
1652 took_snap_rwsem = 1; 1652 took_snap_rwsem = 1;
1653 goto retry; 1653 goto retry;
1654 } 1654 }
1655 took_snap_rwsem = 1; 1655 took_snap_rwsem = 1;
1656 } 1656 }
1657 1657
1658 if (cap == ci->i_auth_cap && ci->i_dirty_caps) 1658 if (cap == ci->i_auth_cap && ci->i_dirty_caps)
1659 flushing = __mark_caps_flushing(inode, session); 1659 flushing = __mark_caps_flushing(inode, session);
1660 else 1660 else
1661 flushing = 0; 1661 flushing = 0;
1662 1662
1663 mds = cap->mds; /* remember mds, so we don't repeat */ 1663 mds = cap->mds; /* remember mds, so we don't repeat */
1664 sent++; 1664 sent++;
1665 1665
1666 /* __send_cap drops i_ceph_lock */ 1666 /* __send_cap drops i_ceph_lock */
1667 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, 1667 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
1668 retain, flushing, NULL); 1668 retain, flushing, NULL);
1669 goto retry; /* retake i_ceph_lock and restart our cap scan. */ 1669 goto retry; /* retake i_ceph_lock and restart our cap scan. */
1670 } 1670 }
1671 1671
1672 /* 1672 /*
1673 * Reschedule delayed caps release if we delayed anything, 1673 * Reschedule delayed caps release if we delayed anything,
1674 * otherwise cancel. 1674 * otherwise cancel.
1675 */ 1675 */
1676 if (delayed && is_delayed) 1676 if (delayed && is_delayed)
1677 force_requeue = 1; /* __send_cap delayed release; requeue */ 1677 force_requeue = 1; /* __send_cap delayed release; requeue */
1678 if (!delayed && !is_delayed) 1678 if (!delayed && !is_delayed)
1679 __cap_delay_cancel(mdsc, ci); 1679 __cap_delay_cancel(mdsc, ci);
1680 else if (!is_delayed || force_requeue) 1680 else if (!is_delayed || force_requeue)
1681 __cap_delay_requeue(mdsc, ci); 1681 __cap_delay_requeue(mdsc, ci);
1682 1682
1683 spin_unlock(&ci->i_ceph_lock); 1683 spin_unlock(&ci->i_ceph_lock);
1684 1684
1685 if (queue_invalidate) 1685 if (queue_invalidate)
1686 ceph_queue_invalidate(inode); 1686 ceph_queue_invalidate(inode);
1687 1687
1688 if (session) 1688 if (session)
1689 mutex_unlock(&session->s_mutex); 1689 mutex_unlock(&session->s_mutex);
1690 if (took_snap_rwsem) 1690 if (took_snap_rwsem)
1691 up_read(&mdsc->snap_rwsem); 1691 up_read(&mdsc->snap_rwsem);
1692 } 1692 }
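Both __ceph_flush_snaps() and ceph_check_caps() handle the session/inode lock ordering the same way: trylock the outer lock, and on failure drop the inner lock, acquire in canonical order, and re-validate via goto retry. Reduced to two plain mutexes (a pthreads sketch, not the kernel primitives):

	#include <pthread.h>

	/* Acquire 'outer' while already holding 'inner' without inverting the
	 * outer-before-inner lock order: try it, or back off and re-acquire
	 * in the canonical order. The caller must then re-validate whatever
	 * it read under 'inner' -- that is the "goto retry" above. */
	static void lock_outer_holding_inner(pthread_mutex_t *outer,
					     pthread_mutex_t *inner)
	{
		if (pthread_mutex_trylock(outer) == 0)
			return;			/* fast path: no contention */
		pthread_mutex_unlock(inner);
		pthread_mutex_lock(outer);
		pthread_mutex_lock(inner);
	}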
1693 1693
1694 /* 1694 /*
1695 * Try to flush dirty caps back to the auth mds. 1695 * Try to flush dirty caps back to the auth mds.
1696 */ 1696 */
1697 static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, 1697 static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1698 unsigned *flush_tid) 1698 unsigned *flush_tid)
1699 { 1699 {
1700 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 1700 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
1701 struct ceph_inode_info *ci = ceph_inode(inode); 1701 struct ceph_inode_info *ci = ceph_inode(inode);
1702 int unlock_session = session ? 0 : 1; 1702 int unlock_session = session ? 0 : 1;
1703 int flushing = 0; 1703 int flushing = 0;
1704 1704
1705 retry: 1705 retry:
1706 spin_lock(&ci->i_ceph_lock); 1706 spin_lock(&ci->i_ceph_lock);
1707 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { 1707 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1708 dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); 1708 dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
1709 goto out; 1709 goto out;
1710 } 1710 }
1711 if (ci->i_dirty_caps && ci->i_auth_cap) { 1711 if (ci->i_dirty_caps && ci->i_auth_cap) {
1712 struct ceph_cap *cap = ci->i_auth_cap; 1712 struct ceph_cap *cap = ci->i_auth_cap;
1713 int used = __ceph_caps_used(ci); 1713 int used = __ceph_caps_used(ci);
1714 int want = __ceph_caps_wanted(ci); 1714 int want = __ceph_caps_wanted(ci);
1715 int delayed; 1715 int delayed;
1716 1716
1717 if (!session) { 1717 if (!session) {
1718 spin_unlock(&ci->i_ceph_lock); 1718 spin_unlock(&ci->i_ceph_lock);
1719 session = cap->session; 1719 session = cap->session;
1720 mutex_lock(&session->s_mutex); 1720 mutex_lock(&session->s_mutex);
1721 goto retry; 1721 goto retry;
1722 } 1722 }
1723 BUG_ON(session != cap->session); 1723 BUG_ON(session != cap->session);
1724 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) 1724 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
1725 goto out; 1725 goto out;
1726 1726
1727 flushing = __mark_caps_flushing(inode, session); 1727 flushing = __mark_caps_flushing(inode, session);
1728 1728
1729 /* __send_cap drops i_ceph_lock */ 1729 /* __send_cap drops i_ceph_lock */
1730 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, 1730 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
1731 cap->issued | cap->implemented, flushing, 1731 cap->issued | cap->implemented, flushing,
1732 flush_tid); 1732 flush_tid);
1733 if (!delayed) 1733 if (!delayed)
1734 goto out_unlocked; 1734 goto out_unlocked;
1735 1735
1736 spin_lock(&ci->i_ceph_lock); 1736 spin_lock(&ci->i_ceph_lock);
1737 __cap_delay_requeue(mdsc, ci); 1737 __cap_delay_requeue(mdsc, ci);
1738 } 1738 }
1739 out: 1739 out:
1740 spin_unlock(&ci->i_ceph_lock); 1740 spin_unlock(&ci->i_ceph_lock);
1741 out_unlocked: 1741 out_unlocked:
1742 if (session && unlock_session) 1742 if (session && unlock_session)
1743 mutex_unlock(&session->s_mutex); 1743 mutex_unlock(&session->s_mutex);
1744 return flushing; 1744 return flushing;
1745 } 1745 }
1746 1746
1747 /* 1747 /*
1748 * Return true if we've flushed caps through the given flush_tid. 1748 * Return true if we've flushed caps through the given flush_tid.
1749 */ 1749 */
1750 static int caps_are_flushed(struct inode *inode, unsigned tid) 1750 static int caps_are_flushed(struct inode *inode, unsigned tid)
1751 { 1751 {
1752 struct ceph_inode_info *ci = ceph_inode(inode); 1752 struct ceph_inode_info *ci = ceph_inode(inode);
1753 int i, ret = 1; 1753 int i, ret = 1;
1754 1754
1755 spin_lock(&ci->i_ceph_lock); 1755 spin_lock(&ci->i_ceph_lock);
1756 for (i = 0; i < CEPH_CAP_BITS; i++) 1756 for (i = 0; i < CEPH_CAP_BITS; i++)
1757 if ((ci->i_flushing_caps & (1 << i)) && 1757 if ((ci->i_flushing_caps & (1 << i)) &&
1758 ci->i_cap_flush_tid[i] <= tid) { 1758 ci->i_cap_flush_tid[i] <= tid) {
1759 /* still flushing this bit */ 1759 /* still flushing this bit */
1760 ret = 0; 1760 ret = 0;
1761 break; 1761 break;
1762 } 1762 }
1763 spin_unlock(&ci->i_ceph_lock); 1763 spin_unlock(&ci->i_ceph_lock);
1764 return ret; 1764 return ret;
1765 } 1765 }
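The scan in caps_are_flushed() is worth pulling out: each dirty-cap bit records the tid of the flush currently covering it, and a flush "through tid" is complete only when no still-flushing bit carries a tid at or below it. A minimal user-space sketch of the same bit-scan (constants and types here are illustrative, not the kernel's):

#include <stdio.h>

#define CAP_BITS 16

struct inode_model {
    unsigned flushing_caps;               /* bitmask of caps still flushing */
    unsigned long long cap_flush_tid[CAP_BITS]; /* tid covering each bit */
};

static int caps_are_flushed_model(const struct inode_model *ci,
                                  unsigned long long tid)
{
    for (int i = 0; i < CAP_BITS; i++)
        if ((ci->flushing_caps & (1u << i)) && ci->cap_flush_tid[i] <= tid)
            return 0;   /* a flush at or before tid is still in flight */
    return 1;
}

int main(void)
{
    struct inode_model ci = { .flushing_caps = 1u << 2 };
    ci.cap_flush_tid[2] = 7;   /* bit 2 is flushing under tid 7 */
    printf("through tid 6? %d\n", caps_are_flushed_model(&ci, 6)); /* 1 */
    printf("through tid 7? %d\n", caps_are_flushed_model(&ci, 7)); /* 0 */
    return 0;
}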
1766 1766
1767 /* 1767 /*
1768 * Wait on any unsafe replies for the given inode. First wait on the 1768 * Wait on any unsafe replies for the given inode. First wait on the
1769 * newest request, and make that the upper bound. Then, if there are 1769 * newest request, and make that the upper bound. Then, if there are
1770 * more requests, keep waiting on the oldest as long as it is still older 1770 * more requests, keep waiting on the oldest as long as it is still older
1771 * than that upper bound. 1771 * than that upper bound.
1772 */ 1772 */
1773 static void sync_write_wait(struct inode *inode) 1773 static void sync_write_wait(struct inode *inode)
1774 { 1774 {
1775 struct ceph_inode_info *ci = ceph_inode(inode); 1775 struct ceph_inode_info *ci = ceph_inode(inode);
1776 struct list_head *head = &ci->i_unsafe_writes; 1776 struct list_head *head = &ci->i_unsafe_writes;
1777 struct ceph_osd_request *req; 1777 struct ceph_osd_request *req;
1778 u64 last_tid; 1778 u64 last_tid;
1779 1779
1780 spin_lock(&ci->i_unsafe_lock); 1780 spin_lock(&ci->i_unsafe_lock);
1781 if (list_empty(head)) 1781 if (list_empty(head))
1782 goto out; 1782 goto out;
1783 1783
1784 /* set upper bound as _last_ entry in chain */ 1784 /* set upper bound as _last_ entry in chain */
1785 req = list_entry(head->prev, struct ceph_osd_request, 1785 req = list_entry(head->prev, struct ceph_osd_request,
1786 r_unsafe_item); 1786 r_unsafe_item);
1787 last_tid = req->r_tid; 1787 last_tid = req->r_tid;
1788 1788
1789 do { 1789 do {
1790 ceph_osdc_get_request(req); 1790 ceph_osdc_get_request(req);
1791 spin_unlock(&ci->i_unsafe_lock); 1791 spin_unlock(&ci->i_unsafe_lock);
1792 dout("sync_write_wait on tid %llu (until %llu)\n", 1792 dout("sync_write_wait on tid %llu (until %llu)\n",
1793 req->r_tid, last_tid); 1793 req->r_tid, last_tid);
1794 wait_for_completion(&req->r_safe_completion); 1794 wait_for_completion(&req->r_safe_completion);
1795 spin_lock(&ci->i_unsafe_lock); 1795 spin_lock(&ci->i_unsafe_lock);
1796 ceph_osdc_put_request(req); 1796 ceph_osdc_put_request(req);
1797 1797
1798 /* 1798 /*
1799 * from here on look at first entry in chain, since we 1799 * from here on look at first entry in chain, since we
1800 * only want to wait for anything older than last_tid 1800 * only want to wait for anything older than last_tid
1801 */ 1801 */
1802 if (list_empty(head)) 1802 if (list_empty(head))
1803 break; 1803 break;
1804 req = list_entry(head->next, struct ceph_osd_request, 1804 req = list_entry(head->next, struct ceph_osd_request,
1805 r_unsafe_item); 1805 r_unsafe_item);
1806 } while (req->r_tid < last_tid); 1806 } while (req->r_tid < last_tid);
1807 out: 1807 out:
1808 spin_unlock(&ci->i_unsafe_lock); 1808 spin_unlock(&ci->i_unsafe_lock);
1809 } 1809 }
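The loop in sync_write_wait() bounds the wait deliberately: it latches the newest unsafe tid as an upper bound, waits on that request first, then sweeps the list oldest-first but only while the head is still older than the bound, so writes issued after this point are never waited on. A hedged user-space model of the ordering (wait_done() stands in for wait_for_completion(); the list is a plain sorted array):

#include <stdio.h>

static void wait_done(unsigned long long tid)
{
    /* stands in for wait_for_completion(&req->r_safe_completion) */
    printf("waiting on tid %llu\n", tid);
}

static void sync_write_wait_model(const unsigned long long *tids, int n)
{
    if (n == 0)
        return;
    unsigned long long last_tid = tids[n - 1];  /* newest = upper bound */
    wait_done(last_tid);                        /* wait on newest first */
    /* then the older entries, oldest-first, never past the bound */
    for (int i = 0; i + 1 < n && tids[i] < last_tid; i++)
        wait_done(tids[i]);
}

int main(void)
{
    unsigned long long tids[] = { 3, 5, 9 };
    sync_write_wait_model(tids, 3);
    return 0;
}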
1810 1810
1811 int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) 1811 int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
1812 { 1812 {
1813 struct inode *inode = file->f_mapping->host; 1813 struct inode *inode = file->f_mapping->host;
1814 struct ceph_inode_info *ci = ceph_inode(inode); 1814 struct ceph_inode_info *ci = ceph_inode(inode);
1815 unsigned flush_tid; 1815 unsigned flush_tid;
1816 int ret; 1816 int ret;
1817 int dirty; 1817 int dirty;
1818 1818
1819 dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); 1819 dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
1820 sync_write_wait(inode); 1820 sync_write_wait(inode);
1821 1821
1822 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 1822 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
1823 if (ret < 0) 1823 if (ret < 0)
1824 return ret; 1824 return ret;
1825 mutex_lock(&inode->i_mutex); 1825 mutex_lock(&inode->i_mutex);
1826 1826
1827 dirty = try_flush_caps(inode, NULL, &flush_tid); 1827 dirty = try_flush_caps(inode, NULL, &flush_tid);
1828 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); 1828 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
1829 1829
1830 /* 1830 /*
1831 * only wait on non-file metadata writeback (the mds 1831 * only wait on non-file metadata writeback (the mds
1832 * can recover size and mtime, so we don't need to 1832 * can recover size and mtime, so we don't need to
1833 * wait for that) 1833 * wait for that)
1834 */ 1834 */
1835 if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { 1835 if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
1836 dout("fsync waiting for flush_tid %u\n", flush_tid); 1836 dout("fsync waiting for flush_tid %u\n", flush_tid);
1837 ret = wait_event_interruptible(ci->i_cap_wq, 1837 ret = wait_event_interruptible(ci->i_cap_wq,
1838 caps_are_flushed(inode, flush_tid)); 1838 caps_are_flushed(inode, flush_tid));
1839 } 1839 }
1840 1840
1841 dout("fsync %p%s done\n", inode, datasync ? " datasync" : ""); 1841 dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
1842 mutex_unlock(&inode->i_mutex); 1842 mutex_unlock(&inode->i_mutex);
1843 return ret; 1843 return ret;
1844 } 1844 }
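Whether ceph_fsync() must block on the MDS reduces to one mask test: with datasync set, or with only file-data caps dirty, the MDS can recover size and mtime on its own and no wait is needed; only dirty non-file metadata forces a wait on the flush_tid. A small sketch of that test, with illustrative bit values rather than the kernel's CEPH_CAP_* constants:

#include <stdio.h>

#define CAP_FILE_WR      0x1
#define CAP_FILE_BUFFER  0x2
#define CAP_ANY_FILE_WR  (CAP_FILE_WR | CAP_FILE_BUFFER)
#define CAP_AUTH_EXCL    0x4   /* e.g. mode/owner metadata */

/* Mirrors the fsync test above: wait only for non-file metadata. */
static int fsync_must_wait(int dirty, int datasync)
{
    return !datasync && (dirty & ~CAP_ANY_FILE_WR) != 0;
}

int main(void)
{
    printf("%d\n", fsync_must_wait(CAP_FILE_WR, 0));                 /* 0 */
    printf("%d\n", fsync_must_wait(CAP_FILE_WR | CAP_AUTH_EXCL, 0)); /* 1 */
    printf("%d\n", fsync_must_wait(CAP_AUTH_EXCL, 1));               /* 0 */
    return 0;
}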
1845 1845
1846 /* 1846 /*
1847 * Flush any dirty caps back to the mds. If we aren't asked to wait, 1847 * Flush any dirty caps back to the mds. If we aren't asked to wait,
1848 * queue inode for flush but don't do so immediately, because we can 1848 * queue inode for flush but don't do so immediately, because we can
1849 * get by with fewer MDS messages if we wait for data writeback to 1849 * get by with fewer MDS messages if we wait for data writeback to
1850 * complete first. 1850 * complete first.
1851 */ 1851 */
1852 int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) 1852 int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1853 { 1853 {
1854 struct ceph_inode_info *ci = ceph_inode(inode); 1854 struct ceph_inode_info *ci = ceph_inode(inode);
1855 unsigned flush_tid; 1855 unsigned flush_tid;
1856 int err = 0; 1856 int err = 0;
1857 int dirty; 1857 int dirty;
1858 int wait = wbc->sync_mode == WB_SYNC_ALL; 1858 int wait = wbc->sync_mode == WB_SYNC_ALL;
1859 1859
1860 dout("write_inode %p wait=%d\n", inode, wait); 1860 dout("write_inode %p wait=%d\n", inode, wait);
1861 if (wait) { 1861 if (wait) {
1862 dirty = try_flush_caps(inode, NULL, &flush_tid); 1862 dirty = try_flush_caps(inode, NULL, &flush_tid);
1863 if (dirty) 1863 if (dirty)
1864 err = wait_event_interruptible(ci->i_cap_wq, 1864 err = wait_event_interruptible(ci->i_cap_wq,
1865 caps_are_flushed(inode, flush_tid)); 1865 caps_are_flushed(inode, flush_tid));
1866 } else { 1866 } else {
1867 struct ceph_mds_client *mdsc = 1867 struct ceph_mds_client *mdsc =
1868 ceph_sb_to_client(inode->i_sb)->mdsc; 1868 ceph_sb_to_client(inode->i_sb)->mdsc;
1869 1869
1870 spin_lock(&ci->i_ceph_lock); 1870 spin_lock(&ci->i_ceph_lock);
1871 if (__ceph_caps_dirty(ci)) 1871 if (__ceph_caps_dirty(ci))
1872 __cap_delay_requeue_front(mdsc, ci); 1872 __cap_delay_requeue_front(mdsc, ci);
1873 spin_unlock(&ci->i_ceph_lock); 1873 spin_unlock(&ci->i_ceph_lock);
1874 } 1874 }
1875 return err; 1875 return err;
1876 } 1876 }
1877 1877
1878 /* 1878 /*
1879 * After a recovering MDS goes active, we need to resend any caps 1879 * After a recovering MDS goes active, we need to resend any caps
1880 * we were flushing. 1880 * we were flushing.
1881 * 1881 *
1882 * Caller holds session->s_mutex. 1882 * Caller holds session->s_mutex.
1883 */ 1883 */
1884 static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, 1884 static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
1885 struct ceph_mds_session *session) 1885 struct ceph_mds_session *session)
1886 { 1886 {
1887 struct ceph_cap_snap *capsnap; 1887 struct ceph_cap_snap *capsnap;
1888 1888
1889 dout("kick_flushing_capsnaps mds%d\n", session->s_mds); 1889 dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
1890 list_for_each_entry(capsnap, &session->s_cap_snaps_flushing, 1890 list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
1891 flushing_item) { 1891 flushing_item) {
1892 struct ceph_inode_info *ci = capsnap->ci; 1892 struct ceph_inode_info *ci = capsnap->ci;
1893 struct inode *inode = &ci->vfs_inode; 1893 struct inode *inode = &ci->vfs_inode;
1894 struct ceph_cap *cap; 1894 struct ceph_cap *cap;
1895 1895
1896 spin_lock(&ci->i_ceph_lock); 1896 spin_lock(&ci->i_ceph_lock);
1897 cap = ci->i_auth_cap; 1897 cap = ci->i_auth_cap;
1898 if (cap && cap->session == session) { 1898 if (cap && cap->session == session) {
1899 dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, 1899 dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
1900 cap, capsnap); 1900 cap, capsnap);
1901 __ceph_flush_snaps(ci, &session, 1); 1901 __ceph_flush_snaps(ci, &session, 1);
1902 } else { 1902 } else {
1903 pr_err("%p auth cap %p not mds%d ???\n", inode, 1903 pr_err("%p auth cap %p not mds%d ???\n", inode,
1904 cap, session->s_mds); 1904 cap, session->s_mds);
1905 } 1905 }
1906 spin_unlock(&ci->i_ceph_lock); 1906 spin_unlock(&ci->i_ceph_lock);
1907 } 1907 }
1908 } 1908 }
1909 1909
1910 void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, 1910 void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1911 struct ceph_mds_session *session) 1911 struct ceph_mds_session *session)
1912 { 1912 {
1913 struct ceph_inode_info *ci; 1913 struct ceph_inode_info *ci;
1914 1914
1915 kick_flushing_capsnaps(mdsc, session); 1915 kick_flushing_capsnaps(mdsc, session);
1916 1916
1917 dout("kick_flushing_caps mds%d\n", session->s_mds); 1917 dout("kick_flushing_caps mds%d\n", session->s_mds);
1918 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { 1918 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
1919 struct inode *inode = &ci->vfs_inode; 1919 struct inode *inode = &ci->vfs_inode;
1920 struct ceph_cap *cap; 1920 struct ceph_cap *cap;
1921 int delayed = 0; 1921 int delayed = 0;
1922 1922
1923 spin_lock(&ci->i_ceph_lock); 1923 spin_lock(&ci->i_ceph_lock);
1924 cap = ci->i_auth_cap; 1924 cap = ci->i_auth_cap;
1925 if (cap && cap->session == session) { 1925 if (cap && cap->session == session) {
1926 dout("kick_flushing_caps %p cap %p %s\n", inode, 1926 dout("kick_flushing_caps %p cap %p %s\n", inode,
1927 cap, ceph_cap_string(ci->i_flushing_caps)); 1927 cap, ceph_cap_string(ci->i_flushing_caps));
1928 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, 1928 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1929 __ceph_caps_used(ci), 1929 __ceph_caps_used(ci),
1930 __ceph_caps_wanted(ci), 1930 __ceph_caps_wanted(ci),
1931 cap->issued | cap->implemented, 1931 cap->issued | cap->implemented,
1932 ci->i_flushing_caps, NULL); 1932 ci->i_flushing_caps, NULL);
1933 if (delayed) { 1933 if (delayed) {
1934 spin_lock(&ci->i_ceph_lock); 1934 spin_lock(&ci->i_ceph_lock);
1935 __cap_delay_requeue(mdsc, ci); 1935 __cap_delay_requeue(mdsc, ci);
1936 spin_unlock(&ci->i_ceph_lock); 1936 spin_unlock(&ci->i_ceph_lock);
1937 } 1937 }
1938 } else { 1938 } else {
1939 pr_err("%p auth cap %p not mds%d ???\n", inode, 1939 pr_err("%p auth cap %p not mds%d ???\n", inode,
1940 cap, session->s_mds); 1940 cap, session->s_mds);
1941 spin_unlock(&ci->i_ceph_lock); 1941 spin_unlock(&ci->i_ceph_lock);
1942 } 1942 }
1943 } 1943 }
1944 } 1944 }
1945 1945
1946 static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, 1946 static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
1947 struct ceph_mds_session *session, 1947 struct ceph_mds_session *session,
1948 struct inode *inode) 1948 struct inode *inode)
1949 { 1949 {
1950 struct ceph_inode_info *ci = ceph_inode(inode); 1950 struct ceph_inode_info *ci = ceph_inode(inode);
1951 struct ceph_cap *cap; 1951 struct ceph_cap *cap;
1952 int delayed = 0; 1952 int delayed = 0;
1953 1953
1954 spin_lock(&ci->i_ceph_lock); 1954 spin_lock(&ci->i_ceph_lock);
1955 cap = ci->i_auth_cap; 1955 cap = ci->i_auth_cap;
1956 dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, 1956 dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
1957 ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); 1957 ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
1958 __ceph_flush_snaps(ci, &session, 1); 1958 __ceph_flush_snaps(ci, &session, 1);
1959 if (ci->i_flushing_caps) { 1959 if (ci->i_flushing_caps) {
1960 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, 1960 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1961 __ceph_caps_used(ci), 1961 __ceph_caps_used(ci),
1962 __ceph_caps_wanted(ci), 1962 __ceph_caps_wanted(ci),
1963 cap->issued | cap->implemented, 1963 cap->issued | cap->implemented,
1964 ci->i_flushing_caps, NULL); 1964 ci->i_flushing_caps, NULL);
1965 if (delayed) { 1965 if (delayed) {
1966 spin_lock(&ci->i_ceph_lock); 1966 spin_lock(&ci->i_ceph_lock);
1967 __cap_delay_requeue(mdsc, ci); 1967 __cap_delay_requeue(mdsc, ci);
1968 spin_unlock(&ci->i_ceph_lock); 1968 spin_unlock(&ci->i_ceph_lock);
1969 } 1969 }
1970 } else { 1970 } else {
1971 spin_unlock(&ci->i_ceph_lock); 1971 spin_unlock(&ci->i_ceph_lock);
1972 } 1972 }
1973 } 1973 }
1974 1974
1975 1975
1976 /* 1976 /*
1977 * Take references to capabilities we hold, so that we don't release 1977 * Take references to capabilities we hold, so that we don't release
1978 * them to the MDS prematurely. 1978 * them to the MDS prematurely.
1979 * 1979 *
1980 * Protected by i_ceph_lock. 1980 * Protected by i_ceph_lock.
1981 */ 1981 */
1982 static void __take_cap_refs(struct ceph_inode_info *ci, int got) 1982 static void __take_cap_refs(struct ceph_inode_info *ci, int got)
1983 { 1983 {
1984 if (got & CEPH_CAP_PIN) 1984 if (got & CEPH_CAP_PIN)
1985 ci->i_pin_ref++; 1985 ci->i_pin_ref++;
1986 if (got & CEPH_CAP_FILE_RD) 1986 if (got & CEPH_CAP_FILE_RD)
1987 ci->i_rd_ref++; 1987 ci->i_rd_ref++;
1988 if (got & CEPH_CAP_FILE_CACHE) 1988 if (got & CEPH_CAP_FILE_CACHE)
1989 ci->i_rdcache_ref++; 1989 ci->i_rdcache_ref++;
1990 if (got & CEPH_CAP_FILE_WR) 1990 if (got & CEPH_CAP_FILE_WR)
1991 ci->i_wr_ref++; 1991 ci->i_wr_ref++;
1992 if (got & CEPH_CAP_FILE_BUFFER) { 1992 if (got & CEPH_CAP_FILE_BUFFER) {
1993 if (ci->i_wb_ref == 0) 1993 if (ci->i_wb_ref == 0)
1994 ihold(&ci->vfs_inode); 1994 ihold(&ci->vfs_inode);
1995 ci->i_wb_ref++; 1995 ci->i_wb_ref++;
1996 dout("__take_cap_refs %p wb %d -> %d (?)\n", 1996 dout("__take_cap_refs %p wb %d -> %d (?)\n",
1997 &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref); 1997 &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
1998 } 1998 }
1999 } 1999 }
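The one subtlety in __take_cap_refs() is the FILE_BUFFER case: the 0 -> 1 transition of the writeback refcount also pins the inode via ihold(), so it cannot be evicted while buffered writeback is outstanding. A minimal model of the counters (bit values illustrative, inode_pinned stands in for ihold()/iput()):

#include <stdio.h>

#define CAP_PIN         0x01
#define CAP_FILE_RD     0x02
#define CAP_FILE_CACHE  0x04
#define CAP_FILE_WR     0x08
#define CAP_FILE_BUFFER 0x10

struct ci_model {
    int pin_ref, rd_ref, rdcache_ref, wr_ref, wb_ref;
    int inode_pinned;
};

static void take_cap_refs(struct ci_model *ci, int got)
{
    if (got & CAP_PIN)         ci->pin_ref++;
    if (got & CAP_FILE_RD)     ci->rd_ref++;
    if (got & CAP_FILE_CACHE)  ci->rdcache_ref++;
    if (got & CAP_FILE_WR)     ci->wr_ref++;
    if (got & CAP_FILE_BUFFER) {
        if (ci->wb_ref == 0)
            ci->inode_pinned = 1;   /* ihold() on the 0 -> 1 edge */
        ci->wb_ref++;
    }
}

int main(void)
{
    struct ci_model ci = {0};
    take_cap_refs(&ci, CAP_FILE_WR | CAP_FILE_BUFFER);
    printf("wb_ref=%d pinned=%d\n", ci.wb_ref, ci.inode_pinned); /* 1 1 */
    return 0;
}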
2000 2000
2001 /* 2001 /*
2002 * Try to grab cap references. Specify those refs we @want, and the 2002 * Try to grab cap references. Specify those refs we @want, and the
2003 * minimal set we @need. Also include the larger offset we are writing 2003 * minimal set we @need. Also include the larger offset we are writing
2004 * to (when applicable), and check against max_size here as well. 2004 * to (when applicable), and check against max_size here as well.
2005 * Note that caller is responsible for ensuring max_size increases are 2005 * Note that caller is responsible for ensuring max_size increases are
2006 * requested from the MDS. 2006 * requested from the MDS.
2007 */ 2007 */
2008 static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, 2008 static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
2009 int *got, loff_t endoff, int *check_max, int *err) 2009 int *got, loff_t endoff, int *check_max, int *err)
2010 { 2010 {
2011 struct inode *inode = &ci->vfs_inode; 2011 struct inode *inode = &ci->vfs_inode;
2012 int ret = 0; 2012 int ret = 0;
2013 int have, implemented; 2013 int have, implemented;
2014 int file_wanted; 2014 int file_wanted;
2015 2015
2016 dout("get_cap_refs %p need %s want %s\n", inode, 2016 dout("get_cap_refs %p need %s want %s\n", inode,
2017 ceph_cap_string(need), ceph_cap_string(want)); 2017 ceph_cap_string(need), ceph_cap_string(want));
2018 spin_lock(&ci->i_ceph_lock); 2018 spin_lock(&ci->i_ceph_lock);
2019 2019
2020 /* make sure file is actually open */ 2020 /* make sure file is actually open */
2021 file_wanted = __ceph_caps_file_wanted(ci); 2021 file_wanted = __ceph_caps_file_wanted(ci);
2022 if ((file_wanted & need) == 0) { 2022 if ((file_wanted & need) == 0) {
2023 dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", 2023 dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
2024 ceph_cap_string(need), ceph_cap_string(file_wanted)); 2024 ceph_cap_string(need), ceph_cap_string(file_wanted));
2025 *err = -EBADF; 2025 *err = -EBADF;
2026 ret = 1; 2026 ret = 1;
2027 goto out; 2027 goto out;
2028 } 2028 }
2029 2029
2030 if (need & CEPH_CAP_FILE_WR) { 2030 if (need & CEPH_CAP_FILE_WR) {
2031 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { 2031 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
2032 dout("get_cap_refs %p endoff %llu > maxsize %llu\n", 2032 dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
2033 inode, endoff, ci->i_max_size); 2033 inode, endoff, ci->i_max_size);
2034 if (endoff > ci->i_wanted_max_size) { 2034 if (endoff > ci->i_wanted_max_size) {
2035 *check_max = 1; 2035 *check_max = 1;
2036 ret = 1; 2036 ret = 1;
2037 } 2037 }
2038 goto out; 2038 goto out;
2039 } 2039 }
2040 /* 2040 /*
2041 * If a sync write is in progress, we must wait, so that we 2041 * If a sync write is in progress, we must wait, so that we
2042 * can get a final snapshot value for size+mtime. 2042 * can get a final snapshot value for size+mtime.
2043 */ 2043 */
2044 if (__ceph_have_pending_cap_snap(ci)) { 2044 if (__ceph_have_pending_cap_snap(ci)) {
2045 dout("get_cap_refs %p cap_snap_pending\n", inode); 2045 dout("get_cap_refs %p cap_snap_pending\n", inode);
2046 goto out; 2046 goto out;
2047 } 2047 }
2048 } 2048 }
2049 have = __ceph_caps_issued(ci, &implemented); 2049 have = __ceph_caps_issued(ci, &implemented);
2050 2050
2051 /* 2051 /*
2052 * disallow writes while a truncate is pending 2052 * disallow writes while a truncate is pending
2053 */ 2053 */
2054 if (ci->i_truncate_pending) 2054 if (ci->i_truncate_pending)
2055 have &= ~CEPH_CAP_FILE_WR; 2055 have &= ~CEPH_CAP_FILE_WR;
2056 2056
2057 if ((have & need) == need) { 2057 if ((have & need) == need) {
2058 /* 2058 /*
2059 * Look at (implemented & ~have & not) so that we keep waiting 2059 * Look at (implemented & ~have & not) so that we keep waiting
2060 * on transition from wanted -> needed caps. This is needed 2060 * on transition from wanted -> needed caps. This is needed
2061 * for WRBUFFER|WR -> WR to avoid a new WR sync write from 2061 * for WRBUFFER|WR -> WR to avoid a new WR sync write from
2062 * going before a prior buffered writeback happens. 2062 * going before a prior buffered writeback happens.
2063 */ 2063 */
2064 int not = want & ~(have & need); 2064 int not = want & ~(have & need);
2065 int revoking = implemented & ~have; 2065 int revoking = implemented & ~have;
2066 dout("get_cap_refs %p have %s but not %s (revoking %s)\n", 2066 dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
2067 inode, ceph_cap_string(have), ceph_cap_string(not), 2067 inode, ceph_cap_string(have), ceph_cap_string(not),
2068 ceph_cap_string(revoking)); 2068 ceph_cap_string(revoking));
2069 if ((revoking & not) == 0) { 2069 if ((revoking & not) == 0) {
2070 *got = need | (have & want); 2070 *got = need | (have & want);
2071 __take_cap_refs(ci, *got); 2071 __take_cap_refs(ci, *got);
2072 ret = 1; 2072 ret = 1;
2073 } 2073 }
2074 } else { 2074 } else {
2075 dout("get_cap_refs %p have %s needed %s\n", inode, 2075 dout("get_cap_refs %p have %s needed %s\n", inode,
2076 ceph_cap_string(have), ceph_cap_string(need)); 2076 ceph_cap_string(have), ceph_cap_string(need));
2077 } 2077 }
2078 out: 2078 out:
2079 spin_unlock(&ci->i_ceph_lock); 2079 spin_unlock(&ci->i_ceph_lock);
2080 dout("get_cap_refs %p ret %d got %s\n", inode, 2080 dout("get_cap_refs %p ret %d got %s\n", inode,
2081 ret, ceph_cap_string(*got)); 2081 ret, ceph_cap_string(*got));
2082 return ret; 2082 return ret;
2083 } 2083 }
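The grant decision in try_get_cap_refs() is pure bit arithmetic: the request succeeds when every needed bit is issued and none of the extra wanted bits is mid-revocation. Here not = want & ~(have & need) is the wanted-but-not-guaranteed set, revoking = implemented & ~have the in-flight revocations, and the refs actually taken are need | (have & want). A worked sketch with illustrative bit names:

#include <stdio.h>

#define CAP_FILE_RD     0x1
#define CAP_FILE_WR     0x2
#define CAP_FILE_BUFFER 0x4

/* Mirrors the core test above; returns the granted set in *got,
 * or 0 if the caller must keep waiting. Illustrative only. */
static int cap_grant_model(int have, int implemented,
                           int need, int want, int *got)
{
    if ((have & need) != need)
        return 0;                       /* missing a needed bit */
    int not = want & ~(have & need);    /* wanted but not pinned */
    int revoking = implemented & ~have; /* being clawed back */
    if (revoking & not)
        return 0;                       /* would race a revocation */
    *got = need | (have & want);
    return 1;
}

int main(void)
{
    int got = 0;
    /* BUFFER is implemented but no longer issued: a revocation is in
     * flight, so a writer wanting BUFFER must wait (the WRBUFFER|WR
     * -> WR case the comment above describes). */
    int ok = cap_grant_model(CAP_FILE_WR,
                             CAP_FILE_WR | CAP_FILE_BUFFER,
                             CAP_FILE_WR,
                             CAP_FILE_WR | CAP_FILE_BUFFER, &got);
    printf("ok=%d got=%#x\n", ok, got);  /* ok=0 */
    return 0;
}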
2084 2084
2085 /* 2085 /*
2086 * Check the offset we are writing up to against our current 2086 * Check the offset we are writing up to against our current
2087 * max_size. If necessary, tell the MDS we want to write to 2087 * max_size. If necessary, tell the MDS we want to write to
2088 * a larger offset. 2088 * a larger offset.
2089 */ 2089 */
2090 static void check_max_size(struct inode *inode, loff_t endoff) 2090 static void check_max_size(struct inode *inode, loff_t endoff)
2091 { 2091 {
2092 struct ceph_inode_info *ci = ceph_inode(inode); 2092 struct ceph_inode_info *ci = ceph_inode(inode);
2093 int check = 0; 2093 int check = 0;
2094 2094
2095 /* do we need to explicitly request a larger max_size? */ 2095 /* do we need to explicitly request a larger max_size? */
2096 spin_lock(&ci->i_ceph_lock); 2096 spin_lock(&ci->i_ceph_lock);
2097 if ((endoff >= ci->i_max_size || 2097 if ((endoff >= ci->i_max_size ||
2098 endoff > (inode->i_size << 1)) && 2098 endoff > (inode->i_size << 1)) &&
2099 endoff > ci->i_wanted_max_size) { 2099 endoff > ci->i_wanted_max_size) {
2100 dout("write %p at large endoff %llu, req max_size\n", 2100 dout("write %p at large endoff %llu, req max_size\n",
2101 inode, endoff); 2101 inode, endoff);
2102 ci->i_wanted_max_size = endoff; 2102 ci->i_wanted_max_size = endoff;
2103 check = 1; 2103 check = 1;
2104 } 2104 }
2105 spin_unlock(&ci->i_ceph_lock); 2105 spin_unlock(&ci->i_ceph_lock);
2106 if (check) 2106 if (check)
2107 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); 2107 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2108 } 2108 }
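check_max_size() fires a request when the write endpoint reaches the current limit or more than doubles i_size, and only if we have not already asked that high. The trigger condition in isolation, as a sketch:

#include <stdio.h>

static int need_max_size_request(long long endoff, long long max_size,
                                 long long i_size, long long wanted_max)
{
    return (endoff >= max_size || endoff > (i_size << 1)) &&
           endoff > wanted_max;
}

int main(void)
{
    /* 4 MiB limit, 1 MiB file: a write ending at 5 MiB needs a request */
    printf("%d\n", need_max_size_request(5 << 20, 4 << 20, 1 << 20, 0));
    return 0;
}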
2109 2109
2110 /* 2110 /*
2111 * Wait for caps, and take cap references. If we can't get a WR cap 2111 * Wait for caps, and take cap references. If we can't get a WR cap
2112 * due to a small max_size, make sure we check_max_size (and possibly 2112 * due to a small max_size, make sure we check_max_size (and possibly
2113 * ask the mds) so we don't get hung up indefinitely. 2113 * ask the mds) so we don't get hung up indefinitely.
2114 */ 2114 */
2115 int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got, 2115 int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
2116 loff_t endoff) 2116 loff_t endoff)
2117 { 2117 {
2118 int check_max, ret, err; 2118 int check_max, ret, err;
2119 2119
2120 retry: 2120 retry:
2121 if (endoff > 0) 2121 if (endoff > 0)
2122 check_max_size(&ci->vfs_inode, endoff); 2122 check_max_size(&ci->vfs_inode, endoff);
2123 check_max = 0; 2123 check_max = 0;
2124 err = 0; 2124 err = 0;
2125 ret = wait_event_interruptible(ci->i_cap_wq, 2125 ret = wait_event_interruptible(ci->i_cap_wq,
2126 try_get_cap_refs(ci, need, want, 2126 try_get_cap_refs(ci, need, want,
2127 got, endoff, 2127 got, endoff,
2128 &check_max, &err)); 2128 &check_max, &err));
2129 if (err) 2129 if (err)
2130 ret = err; 2130 ret = err;
2131 if (check_max) 2131 if (check_max)
2132 goto retry; 2132 goto retry;
2133 return ret; 2133 return ret;
2134 } 2134 }
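ceph_get_caps() is a wait-with-side-channel loop: the wait predicate itself reports, through check_max and err, why it returned true, and the caller either propagates the error or re-runs the max_size check and waits again. A control-flow sketch under those assumptions (try_refs() is a hypothetical stand-in for try_get_cap_refs()):

#include <stdio.h>

static int try_refs(int pass, int *check_max, int *err)
{
    (void)err;
    if (pass == 0) {
        *check_max = 1;   /* endoff beyond wanted max_size: retry */
        return 1;         /* end the wait, but without caps */
    }
    return 1;             /* caps granted on the second pass */
}

int main(void)
{
    int pass = 0;
    for (;;) {
        int check_max = 0, err = 0;
        /* wait_event_interruptible(...) in the real code */
        while (!try_refs(pass, &check_max, &err))
            ;
        if (err)
            return err;
        if (!check_max)
            break;        /* got the refs */
        pass++;           /* ask the MDS for a larger max_size, retry */
    }
    printf("got caps after %d pass(es) of retry\n", pass);
    return 0;
}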
2135 2135
2136 /* 2136 /*
2137 * Take cap refs. Caller must already know we hold at least one ref 2137 * Take cap refs. Caller must already know we hold at least one ref
2138 * on the caps in question or we don't know this is safe. 2138 * on the caps in question or we don't know this is safe.
2139 */ 2139 */
2140 void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) 2140 void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
2141 { 2141 {
2142 spin_lock(&ci->i_ceph_lock); 2142 spin_lock(&ci->i_ceph_lock);
2143 __take_cap_refs(ci, caps); 2143 __take_cap_refs(ci, caps);
2144 spin_unlock(&ci->i_ceph_lock); 2144 spin_unlock(&ci->i_ceph_lock);
2145 } 2145 }
2146 2146
2147 /* 2147 /*
2148 * Release cap refs. 2148 * Release cap refs.
2149 * 2149 *
2150 * If we released the last ref on any given cap, call ceph_check_caps 2150 * If we released the last ref on any given cap, call ceph_check_caps
2151 * to release (or schedule a release). 2151 * to release (or schedule a release).
2152 * 2152 *
2153 * If we are releasing a WR cap (from a sync write), finalize any affected 2153 * If we are releasing a WR cap (from a sync write), finalize any affected
2154 * cap_snap, and wake up any waiters. 2154 * cap_snap, and wake up any waiters.
2155 */ 2155 */
2156 void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) 2156 void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
2157 { 2157 {
2158 struct inode *inode = &ci->vfs_inode; 2158 struct inode *inode = &ci->vfs_inode;
2159 int last = 0, put = 0, flushsnaps = 0, wake = 0; 2159 int last = 0, put = 0, flushsnaps = 0, wake = 0;
2160 struct ceph_cap_snap *capsnap; 2160 struct ceph_cap_snap *capsnap;
2161 2161
2162 spin_lock(&ci->i_ceph_lock); 2162 spin_lock(&ci->i_ceph_lock);
2163 if (had & CEPH_CAP_PIN) 2163 if (had & CEPH_CAP_PIN)
2164 --ci->i_pin_ref; 2164 --ci->i_pin_ref;
2165 if (had & CEPH_CAP_FILE_RD) 2165 if (had & CEPH_CAP_FILE_RD)
2166 if (--ci->i_rd_ref == 0) 2166 if (--ci->i_rd_ref == 0)
2167 last++; 2167 last++;
2168 if (had & CEPH_CAP_FILE_CACHE) 2168 if (had & CEPH_CAP_FILE_CACHE)
2169 if (--ci->i_rdcache_ref == 0) 2169 if (--ci->i_rdcache_ref == 0)
2170 last++; 2170 last++;
2171 if (had & CEPH_CAP_FILE_BUFFER) { 2171 if (had & CEPH_CAP_FILE_BUFFER) {
2172 if (--ci->i_wb_ref == 0) { 2172 if (--ci->i_wb_ref == 0) {
2173 last++; 2173 last++;
2174 put++; 2174 put++;
2175 } 2175 }
2176 dout("put_cap_refs %p wb %d -> %d (?)\n", 2176 dout("put_cap_refs %p wb %d -> %d (?)\n",
2177 inode, ci->i_wb_ref+1, ci->i_wb_ref); 2177 inode, ci->i_wb_ref+1, ci->i_wb_ref);
2178 } 2178 }
2179 if (had & CEPH_CAP_FILE_WR) 2179 if (had & CEPH_CAP_FILE_WR)
2180 if (--ci->i_wr_ref == 0) { 2180 if (--ci->i_wr_ref == 0) {
2181 last++; 2181 last++;
2182 if (!list_empty(&ci->i_cap_snaps)) { 2182 if (!list_empty(&ci->i_cap_snaps)) {
2183 capsnap = list_first_entry(&ci->i_cap_snaps, 2183 capsnap = list_first_entry(&ci->i_cap_snaps,
2184 struct ceph_cap_snap, 2184 struct ceph_cap_snap,
2185 ci_item); 2185 ci_item);
2186 if (capsnap->writing) { 2186 if (capsnap->writing) {
2187 capsnap->writing = 0; 2187 capsnap->writing = 0;
2188 flushsnaps = 2188 flushsnaps =
2189 __ceph_finish_cap_snap(ci, 2189 __ceph_finish_cap_snap(ci,
2190 capsnap); 2190 capsnap);
2191 wake = 1; 2191 wake = 1;
2192 } 2192 }
2193 } 2193 }
2194 } 2194 }
2195 spin_unlock(&ci->i_ceph_lock); 2195 spin_unlock(&ci->i_ceph_lock);
2196 2196
2197 dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), 2197 dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
2198 last ? " last" : "", put ? " put" : ""); 2198 last ? " last" : "", put ? " put" : "");
2199 2199
2200 if (last && !flushsnaps) 2200 if (last && !flushsnaps)
2201 ceph_check_caps(ci, 0, NULL); 2201 ceph_check_caps(ci, 0, NULL);
2202 else if (flushsnaps) 2202 else if (flushsnaps)
2203 ceph_flush_snaps(ci); 2203 ceph_flush_snaps(ci);
2204 if (wake) 2204 if (wake)
2205 wake_up_all(&ci->i_cap_wq); 2205 wake_up_all(&ci->i_cap_wq);
2206 if (put) 2206 if (put)
2207 iput(inode); 2207 iput(inode);
2208 } 2208 }
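The drop side mirrors the take side: the 1 -> 0 edge on the FILE_BUFFER ref both counts as a "last" ref (so caps get re-checked or a cap_snap finalized) and releases the inode pin taken on the 0 -> 1 edge. A minimal counterpart to the earlier model, illustrative only:

#include <stdio.h>

#define CAP_FILE_BUFFER 0x10

struct ci_model { int wb_ref; int inode_pinned; };

static void put_cap_refs_model(struct ci_model *ci, int had,
                               int *last, int *put)
{
    *last = *put = 0;
    if ((had & CAP_FILE_BUFFER) && --ci->wb_ref == 0) {
        (*last)++;          /* caller will run ceph_check_caps() */
        (*put)++;           /* caller will iput() the pinned inode */
        ci->inode_pinned = 0;
    }
}

int main(void)
{
    struct ci_model ci = { .wb_ref = 1, .inode_pinned = 1 };
    int last, put;
    put_cap_refs_model(&ci, CAP_FILE_BUFFER, &last, &put);
    printf("last=%d put=%d pinned=%d\n", last, put, ci.inode_pinned);
    return 0;
}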
2209 2209
2210 /* 2210 /*
2211 * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap 2211 * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
2212 * context. Adjust per-snap dirty page accounting as appropriate. 2212 * context. Adjust per-snap dirty page accounting as appropriate.
2213 * Once all dirty data for a cap_snap is flushed, flush snapped file 2213 * Once all dirty data for a cap_snap is flushed, flush snapped file
2214 * metadata back to the MDS. If we dropped the last ref, call 2214 * metadata back to the MDS. If we dropped the last ref, call
2215 * ceph_check_caps. 2215 * ceph_check_caps.
2216 */ 2216 */
2217 void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, 2217 void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2218 struct ceph_snap_context *snapc) 2218 struct ceph_snap_context *snapc)
2219 { 2219 {
2220 struct inode *inode = &ci->vfs_inode; 2220 struct inode *inode = &ci->vfs_inode;
2221 int last = 0; 2221 int last = 0;
2222 int complete_capsnap = 0; 2222 int complete_capsnap = 0;
2223 int drop_capsnap = 0; 2223 int drop_capsnap = 0;
2224 int found = 0; 2224 int found = 0;
2225 struct ceph_cap_snap *capsnap = NULL; 2225 struct ceph_cap_snap *capsnap = NULL;
2226 2226
2227 spin_lock(&ci->i_ceph_lock); 2227 spin_lock(&ci->i_ceph_lock);
2228 ci->i_wrbuffer_ref -= nr; 2228 ci->i_wrbuffer_ref -= nr;
2229 last = !ci->i_wrbuffer_ref; 2229 last = !ci->i_wrbuffer_ref;
2230 2230
2231 if (ci->i_head_snapc == snapc) { 2231 if (ci->i_head_snapc == snapc) {
2232 ci->i_wrbuffer_ref_head -= nr; 2232 ci->i_wrbuffer_ref_head -= nr;
2233 if (ci->i_wrbuffer_ref_head == 0 && 2233 if (ci->i_wrbuffer_ref_head == 0 &&
2234 ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { 2234 ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) {
2235 BUG_ON(!ci->i_head_snapc); 2235 BUG_ON(!ci->i_head_snapc);
2236 ceph_put_snap_context(ci->i_head_snapc); 2236 ceph_put_snap_context(ci->i_head_snapc);
2237 ci->i_head_snapc = NULL; 2237 ci->i_head_snapc = NULL;
2238 } 2238 }
2239 dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n", 2239 dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
2240 inode, 2240 inode,
2241 ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr, 2241 ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
2242 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, 2242 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
2243 last ? " LAST" : ""); 2243 last ? " LAST" : "");
2244 } else { 2244 } else {
2245 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 2245 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2246 if (capsnap->context == snapc) { 2246 if (capsnap->context == snapc) {
2247 found = 1; 2247 found = 1;
2248 break; 2248 break;
2249 } 2249 }
2250 } 2250 }
2251 BUG_ON(!found); 2251 BUG_ON(!found);
2252 capsnap->dirty_pages -= nr; 2252 capsnap->dirty_pages -= nr;
2253 if (capsnap->dirty_pages == 0) { 2253 if (capsnap->dirty_pages == 0) {
2254 complete_capsnap = 1; 2254 complete_capsnap = 1;
2255 if (capsnap->dirty == 0) 2255 if (capsnap->dirty == 0)
2256 /* cap writeback completed before we created 2256 /* cap writeback completed before we created
2257 * the cap_snap; no FLUSHSNAP is needed */ 2257 * the cap_snap; no FLUSHSNAP is needed */
2258 drop_capsnap = 1; 2258 drop_capsnap = 1;
2259 } 2259 }
2260 dout("put_wrbuffer_cap_refs on %p cap_snap %p " 2260 dout("put_wrbuffer_cap_refs on %p cap_snap %p "
2261 " snap %lld %d/%d -> %d/%d %s%s%s\n", 2261 " snap %lld %d/%d -> %d/%d %s%s%s\n",
2262 inode, capsnap, capsnap->context->seq, 2262 inode, capsnap, capsnap->context->seq,
2263 ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, 2263 ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
2264 ci->i_wrbuffer_ref, capsnap->dirty_pages, 2264 ci->i_wrbuffer_ref, capsnap->dirty_pages,
2265 last ? " (wrbuffer last)" : "", 2265 last ? " (wrbuffer last)" : "",
2266 complete_capsnap ? " (complete capsnap)" : "", 2266 complete_capsnap ? " (complete capsnap)" : "",
2267 drop_capsnap ? " (drop capsnap)" : ""); 2267 drop_capsnap ? " (drop capsnap)" : "");
2268 if (drop_capsnap) { 2268 if (drop_capsnap) {
2269 ceph_put_snap_context(capsnap->context); 2269 ceph_put_snap_context(capsnap->context);
2270 list_del(&capsnap->ci_item); 2270 list_del(&capsnap->ci_item);
2271 list_del(&capsnap->flushing_item); 2271 list_del(&capsnap->flushing_item);
2272 ceph_put_cap_snap(capsnap); 2272 ceph_put_cap_snap(capsnap);
2273 } 2273 }
2274 } 2274 }
2275 2275
2276 spin_unlock(&ci->i_ceph_lock); 2276 spin_unlock(&ci->i_ceph_lock);
2277 2277
2278 if (last) { 2278 if (last) {
2279 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); 2279 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2280 iput(inode); 2280 iput(inode);
2281 } else if (complete_capsnap) { 2281 } else if (complete_capsnap) {
2282 ceph_flush_snaps(ci); 2282 ceph_flush_snaps(ci);
2283 wake_up_all(&ci->i_cap_wq); 2283 wake_up_all(&ci->i_cap_wq);
2284 } 2284 }
2285 if (drop_capsnap) 2285 if (drop_capsnap)
2286 iput(inode); 2286 iput(inode);
2287 } 2287 }
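The accounting above keeps dirty pages under the live ("head") snap context separate from pages pinned by an older cap_snap: a cap_snap becomes flushable only when its last dirty page is written back, and it can be dropped outright when no snapped metadata needs a FLUSHSNAP either. A sketch of just the cap_snap branch, with hypothetical minimal types:

#include <stdio.h>

struct capsnap_model { int dirty_pages; int dirty_meta; };

/* Returns 1 when the cap_snap is complete; *drop is set when it can
 * be discarded without telling the MDS anything. */
static int put_snap_pages(struct capsnap_model *cs, int nr, int *drop)
{
    cs->dirty_pages -= nr;
    if (cs->dirty_pages)
        return 0;
    *drop = (cs->dirty_meta == 0);  /* no FLUSHSNAP needed */
    return 1;
}

int main(void)
{
    struct capsnap_model cs = { .dirty_pages = 2, .dirty_meta = 1 };
    int drop = 0;
    put_snap_pages(&cs, 1, &drop);                          /* 2 -> 1 */
    printf("complete=%d\n", put_snap_pages(&cs, 1, &drop)); /* 1 */
    printf("drop=%d\n", drop);                 /* 0: FLUSHSNAP needed */
    return 0;
}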
2288 2288
2289 /* 2289 /*
2290 * Handle a cap GRANT message from the MDS. (Note that a GRANT may 2290 * Handle a cap GRANT message from the MDS. (Note that a GRANT may
2291 * actually be a revocation if it specifies a smaller cap set.) 2291 * actually be a revocation if it specifies a smaller cap set.)
2292 * 2292 *
2293 * caller holds s_mutex and i_ceph_lock, we drop both. 2293 * caller holds s_mutex and i_ceph_lock, we drop both.
2294 * 2294 *
2295 * return value: 2295 * return value:
2296 * 0 - ok 2296 * 0 - ok
2297 * 1 - check_caps on auth cap only (writeback) 2297 * 1 - check_caps on auth cap only (writeback)
2298 * 2 - check_caps (ack revoke) 2298 * 2 - check_caps (ack revoke)
2299 */ 2299 */
2300 static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, 2300 static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2301 struct ceph_mds_session *session, 2301 struct ceph_mds_session *session,
2302 struct ceph_cap *cap, 2302 struct ceph_cap *cap,
2303 struct ceph_buffer *xattr_buf) 2303 struct ceph_buffer *xattr_buf)
2304 __releases(ci->i_ceph_lock) 2304 __releases(ci->i_ceph_lock)
2305 { 2305 {
2306 struct ceph_inode_info *ci = ceph_inode(inode); 2306 struct ceph_inode_info *ci = ceph_inode(inode);
2307 int mds = session->s_mds; 2307 int mds = session->s_mds;
2308 int seq = le32_to_cpu(grant->seq); 2308 int seq = le32_to_cpu(grant->seq);
2309 int newcaps = le32_to_cpu(grant->caps); 2309 int newcaps = le32_to_cpu(grant->caps);
2310 int issued, implemented, used, wanted, dirty; 2310 int issued, implemented, used, wanted, dirty;
2311 u64 size = le64_to_cpu(grant->size); 2311 u64 size = le64_to_cpu(grant->size);
2312 u64 max_size = le64_to_cpu(grant->max_size); 2312 u64 max_size = le64_to_cpu(grant->max_size);
2313 struct timespec mtime, atime, ctime; 2313 struct timespec mtime, atime, ctime;
2314 int check_caps = 0; 2314 int check_caps = 0;
2315 int wake = 0; 2315 int wake = 0;
2316 int writeback = 0; 2316 int writeback = 0;
2317 int revoked_rdcache = 0; 2317 int revoked_rdcache = 0;
2318 int queue_invalidate = 0; 2318 int queue_invalidate = 0;
2319 2319
2320 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", 2320 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2321 inode, cap, mds, seq, ceph_cap_string(newcaps)); 2321 inode, cap, mds, seq, ceph_cap_string(newcaps));
2322 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, 2322 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2323 inode->i_size); 2323 inode->i_size);
2324 2324
2325 /* 2325 /*
2326 * If CACHE is being revoked, and we have no dirty buffers, 2326 * If CACHE is being revoked, and we have no dirty buffers,
2327 * try to invalidate (once). (If there are dirty buffers, we 2327 * try to invalidate (once). (If there are dirty buffers, we
2328 * will invalidate _after_ writeback.) 2328 * will invalidate _after_ writeback.)
2329 */ 2329 */
2330 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && 2330 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
2331 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && 2331 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
2332 !ci->i_wrbuffer_ref) { 2332 !ci->i_wrbuffer_ref) {
2333 if (try_nonblocking_invalidate(inode) == 0) { 2333 if (try_nonblocking_invalidate(inode) == 0) {
2334 revoked_rdcache = 1; 2334 revoked_rdcache = 1;
2335 } else { 2335 } else {
2336 /* there were locked pages... invalidate later 2336 /* there were locked pages... invalidate later
2337 in a separate thread. */ 2337 in a separate thread. */
2338 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { 2338 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
2339 queue_invalidate = 1; 2339 queue_invalidate = 1;
2340 ci->i_rdcache_revoking = ci->i_rdcache_gen; 2340 ci->i_rdcache_revoking = ci->i_rdcache_gen;
2341 } 2341 }
2342 } 2342 }
2343 } 2343 }
2344 2344
2345 /* side effects now are allowed */ 2345 /* side effects now are allowed */
2346 2346
2347 issued = __ceph_caps_issued(ci, &implemented); 2347 issued = __ceph_caps_issued(ci, &implemented);
2348 issued |= implemented | __ceph_caps_dirty(ci); 2348 issued |= implemented | __ceph_caps_dirty(ci);
2349 2349
2350 cap->cap_gen = session->s_cap_gen; 2350 cap->cap_gen = session->s_cap_gen;
2351 2351
2352 __check_cap_issue(ci, cap, newcaps); 2352 __check_cap_issue(ci, cap, newcaps);
2353 2353
2354 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { 2354 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
2355 inode->i_mode = le32_to_cpu(grant->mode); 2355 inode->i_mode = le32_to_cpu(grant->mode);
2356 inode->i_uid = le32_to_cpu(grant->uid); 2356 inode->i_uid = le32_to_cpu(grant->uid);
2357 inode->i_gid = le32_to_cpu(grant->gid); 2357 inode->i_gid = le32_to_cpu(grant->gid);
2358 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, 2358 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
2359 inode->i_uid, inode->i_gid); 2359 inode->i_uid, inode->i_gid);
2360 } 2360 }
2361 2361
2362 if ((issued & CEPH_CAP_LINK_EXCL) == 0) 2362 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
2363 set_nlink(inode, le32_to_cpu(grant->nlink)); 2363 set_nlink(inode, le32_to_cpu(grant->nlink));
2364 2364
2365 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { 2365 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
2366 int len = le32_to_cpu(grant->xattr_len); 2366 int len = le32_to_cpu(grant->xattr_len);
2367 u64 version = le64_to_cpu(grant->xattr_version); 2367 u64 version = le64_to_cpu(grant->xattr_version);
2368 2368
2369 if (version > ci->i_xattrs.version) { 2369 if (version > ci->i_xattrs.version) {
2370 dout(" got new xattrs v%llu on %p len %d\n", 2370 dout(" got new xattrs v%llu on %p len %d\n",
2371 version, inode, len); 2371 version, inode, len);
2372 if (ci->i_xattrs.blob) 2372 if (ci->i_xattrs.blob)
2373 ceph_buffer_put(ci->i_xattrs.blob); 2373 ceph_buffer_put(ci->i_xattrs.blob);
2374 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); 2374 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
2375 ci->i_xattrs.version = version; 2375 ci->i_xattrs.version = version;
2376 } 2376 }
2377 } 2377 }
2378 2378
2379 /* size/ctime/mtime/atime? */ 2379 /* size/ctime/mtime/atime? */
2380 ceph_fill_file_size(inode, issued, 2380 ceph_fill_file_size(inode, issued,
2381 le32_to_cpu(grant->truncate_seq), 2381 le32_to_cpu(grant->truncate_seq),
2382 le64_to_cpu(grant->truncate_size), size); 2382 le64_to_cpu(grant->truncate_size), size);
2383 ceph_decode_timespec(&mtime, &grant->mtime); 2383 ceph_decode_timespec(&mtime, &grant->mtime);
2384 ceph_decode_timespec(&atime, &grant->atime); 2384 ceph_decode_timespec(&atime, &grant->atime);
2385 ceph_decode_timespec(&ctime, &grant->ctime); 2385 ceph_decode_timespec(&ctime, &grant->ctime);
2386 ceph_fill_file_time(inode, issued, 2386 ceph_fill_file_time(inode, issued,
2387 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, 2387 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
2388 &atime); 2388 &atime);
2389 2389
2390 /* max size increase? */ 2390 /* max size increase? */
2391 if (max_size != ci->i_max_size) { 2391 if (max_size != ci->i_max_size) {
2392 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); 2392 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
2393 ci->i_max_size = max_size; 2393 ci->i_max_size = max_size;
2394 if (max_size >= ci->i_wanted_max_size) { 2394 if (max_size >= ci->i_wanted_max_size) {
2395 ci->i_wanted_max_size = 0; /* reset */ 2395 ci->i_wanted_max_size = 0; /* reset */
2396 ci->i_requested_max_size = 0; 2396 ci->i_requested_max_size = 0;
2397 } 2397 }
2398 wake = 1; 2398 wake = 1;
2399 } 2399 }
2400 2400
2401 /* check cap bits */ 2401 /* check cap bits */
2402 wanted = __ceph_caps_wanted(ci); 2402 wanted = __ceph_caps_wanted(ci);
2403 used = __ceph_caps_used(ci); 2403 used = __ceph_caps_used(ci);
2404 dirty = __ceph_caps_dirty(ci); 2404 dirty = __ceph_caps_dirty(ci);
2405 dout(" my wanted = %s, used = %s, dirty %s\n", 2405 dout(" my wanted = %s, used = %s, dirty %s\n",
2406 ceph_cap_string(wanted), 2406 ceph_cap_string(wanted),
2407 ceph_cap_string(used), 2407 ceph_cap_string(used),
2408 ceph_cap_string(dirty)); 2408 ceph_cap_string(dirty));
2409 if (wanted != le32_to_cpu(grant->wanted)) { 2409 if (wanted != le32_to_cpu(grant->wanted)) {
2410 dout("mds wanted %s -> %s\n", 2410 dout("mds wanted %s -> %s\n",
2411 ceph_cap_string(le32_to_cpu(grant->wanted)), 2411 ceph_cap_string(le32_to_cpu(grant->wanted)),
2412 ceph_cap_string(wanted)); 2412 ceph_cap_string(wanted));
2413 grant->wanted = cpu_to_le32(wanted); 2413 grant->wanted = cpu_to_le32(wanted);
2414 } 2414 }
2415 2415
2416 cap->seq = seq; 2416 cap->seq = seq;
2417 2417
2418 /* file layout may have changed */ 2418 /* file layout may have changed */
2419 ci->i_layout = grant->layout; 2419 ci->i_layout = grant->layout;
2420 2420
2421 /* revocation, grant, or no-op? */ 2421 /* revocation, grant, or no-op? */
2422 if (cap->issued & ~newcaps) { 2422 if (cap->issued & ~newcaps) {
2423 int revoking = cap->issued & ~newcaps; 2423 int revoking = cap->issued & ~newcaps;
2424 2424
2425 dout("revocation: %s -> %s (revoking %s)\n", 2425 dout("revocation: %s -> %s (revoking %s)\n",
2426 ceph_cap_string(cap->issued), 2426 ceph_cap_string(cap->issued),
2427 ceph_cap_string(newcaps), 2427 ceph_cap_string(newcaps),
2428 ceph_cap_string(revoking)); 2428 ceph_cap_string(revoking));
2429 if (revoking & used & CEPH_CAP_FILE_BUFFER) 2429 if (revoking & used & CEPH_CAP_FILE_BUFFER)
2430 writeback = 1; /* initiate writeback; will delay ack */ 2430 writeback = 1; /* initiate writeback; will delay ack */
2431 else if (revoking == CEPH_CAP_FILE_CACHE && 2431 else if (revoking == CEPH_CAP_FILE_CACHE &&
2432 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && 2432 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
2433 queue_invalidate) 2433 queue_invalidate)
2434 ; /* do nothing yet, invalidation will be queued */ 2434 ; /* do nothing yet, invalidation will be queued */
2435 else if (cap == ci->i_auth_cap) 2435 else if (cap == ci->i_auth_cap)
2436 check_caps = 1; /* check auth cap only */ 2436 check_caps = 1; /* check auth cap only */
2437 else 2437 else
2438 check_caps = 2; /* check all caps */ 2438 check_caps = 2; /* check all caps */
2439 cap->issued = newcaps; 2439 cap->issued = newcaps;
2440 cap->implemented |= newcaps; 2440 cap->implemented |= newcaps;
2441 } else if (cap->issued == newcaps) { 2441 } else if (cap->issued == newcaps) {
2442 dout("caps unchanged: %s -> %s\n", 2442 dout("caps unchanged: %s -> %s\n",
2443 ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); 2443 ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
2444 } else { 2444 } else {
2445 dout("grant: %s -> %s\n", ceph_cap_string(cap->issued), 2445 dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
2446 ceph_cap_string(newcaps)); 2446 ceph_cap_string(newcaps));
2447 cap->issued = newcaps; 2447 cap->issued = newcaps;
2448 cap->implemented |= newcaps; /* add bits only, to 2448 cap->implemented |= newcaps; /* add bits only, to
2449 * avoid stepping on a 2449 * avoid stepping on a
2450 * pending revocation */ 2450 * pending revocation */
2451 wake = 1; 2451 wake = 1;
2452 } 2452 }
2453 BUG_ON(cap->issued & ~cap->implemented); 2453 BUG_ON(cap->issued & ~cap->implemented);
2454 2454
2455 spin_unlock(&ci->i_ceph_lock); 2455 spin_unlock(&ci->i_ceph_lock);
2456 if (writeback) 2456 if (writeback)
2457 /* 2457 /*
2458 * queue inode for writeback: we can't actually call 2458 * queue inode for writeback: we can't actually call
2459 * filemap_write_and_wait, etc. from message handler 2459 * filemap_write_and_wait, etc. from message handler
2460 * context. 2460 * context.
2461 */ 2461 */
2462 ceph_queue_writeback(inode); 2462 ceph_queue_writeback(inode);
2463 if (queue_invalidate) 2463 if (queue_invalidate)
2464 ceph_queue_invalidate(inode); 2464 ceph_queue_invalidate(inode);
2465 if (wake) 2465 if (wake)
2466 wake_up_all(&ci->i_cap_wq); 2466 wake_up_all(&ci->i_cap_wq);
2467 2467
2468 if (check_caps == 1) 2468 if (check_caps == 1)
2469 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY, 2469 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
2470 session); 2470 session);
2471 else if (check_caps == 2) 2471 else if (check_caps == 2)
2472 ceph_check_caps(ci, CHECK_CAPS_NODELAY, session); 2472 ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
2473 else 2473 else
2474 mutex_unlock(&session->s_mutex); 2474 mutex_unlock(&session->s_mutex);
2475 } 2475 }
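The revocation branch in handle_cap_grant() orders its responses by cost: dirty buffers under a revoked FILE_BUFFER force writeback first (the ack is delayed), a pure FILE_CACHE revocation can ride an already-queued invalidation, and anything else just schedules a cap check, auth-only when the revoked cap is the auth cap. A decision-table sketch of that triage (bit values illustrative; the kernel's extra LAZYIO test is omitted here):

#include <stdio.h>

#define CAP_FILE_CACHE  0x1
#define CAP_FILE_BUFFER 0x2

enum action { ACT_NONE, ACT_WRITEBACK, ACT_CHECK_AUTH, ACT_CHECK_ALL };

static enum action revoke_action(int issued, int newcaps, int used,
                                 int invalidate_queued, int is_auth)
{
    int revoking = issued & ~newcaps;
    if (!revoking)
        return ACT_NONE;                 /* plain grant or no-op */
    if (revoking & used & CAP_FILE_BUFFER)
        return ACT_WRITEBACK;            /* flush dirty data, delay ack */
    if (revoking == CAP_FILE_CACHE && invalidate_queued)
        return ACT_NONE;                 /* invalidation already queued */
    return is_auth ? ACT_CHECK_AUTH : ACT_CHECK_ALL;
}

int main(void)
{
    printf("%d\n", revoke_action(CAP_FILE_BUFFER, 0,
                                 CAP_FILE_BUFFER, 0, 1));
    /* prints 1: ACT_WRITEBACK */
    return 0;
}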
2476 2476
2477 /* 2477 /*
2478 * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the 2478 * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
2479 * MDS has been safely committed. 2479 * MDS has been safely committed.
2480 */ 2480 */
2481 static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, 2481 static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2482 struct ceph_mds_caps *m, 2482 struct ceph_mds_caps *m,
2483 struct ceph_mds_session *session, 2483 struct ceph_mds_session *session,
2484 struct ceph_cap *cap) 2484 struct ceph_cap *cap)
2485 __releases(ci->i_ceph_lock) 2485 __releases(ci->i_ceph_lock)
2486 { 2486 {
2487 struct ceph_inode_info *ci = ceph_inode(inode); 2487 struct ceph_inode_info *ci = ceph_inode(inode);
2488 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 2488 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
2489 unsigned seq = le32_to_cpu(m->seq); 2489 unsigned seq = le32_to_cpu(m->seq);
2490 int dirty = le32_to_cpu(m->dirty); 2490 int dirty = le32_to_cpu(m->dirty);
2491 int cleaned = 0; 2491 int cleaned = 0;
2492 int drop = 0; 2492 int drop = 0;
2493 int i; 2493 int i;
2494 2494
2495 for (i = 0; i < CEPH_CAP_BITS; i++) 2495 for (i = 0; i < CEPH_CAP_BITS; i++)
2496 if ((dirty & (1 << i)) && 2496 if ((dirty & (1 << i)) &&
2497 flush_tid == ci->i_cap_flush_tid[i]) 2497 flush_tid == ci->i_cap_flush_tid[i])
2498 cleaned |= 1 << i; 2498 cleaned |= 1 << i;
2499 2499
2500 dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s," 2500 dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
2501 " flushing %s -> %s\n", 2501 " flushing %s -> %s\n",
2502 inode, session->s_mds, seq, ceph_cap_string(dirty), 2502 inode, session->s_mds, seq, ceph_cap_string(dirty),
2503 ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), 2503 ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
2504 ceph_cap_string(ci->i_flushing_caps & ~cleaned)); 2504 ceph_cap_string(ci->i_flushing_caps & ~cleaned));
2505 2505
2506 if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned)) 2506 if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
2507 goto out; 2507 goto out;
2508 2508
2509 ci->i_flushing_caps &= ~cleaned; 2509 ci->i_flushing_caps &= ~cleaned;
2510 2510
2511 spin_lock(&mdsc->cap_dirty_lock); 2511 spin_lock(&mdsc->cap_dirty_lock);
2512 if (ci->i_flushing_caps == 0) { 2512 if (ci->i_flushing_caps == 0) {
2513 list_del_init(&ci->i_flushing_item); 2513 list_del_init(&ci->i_flushing_item);
2514 if (!list_empty(&session->s_cap_flushing)) 2514 if (!list_empty(&session->s_cap_flushing))
2515 dout(" mds%d still flushing cap on %p\n", 2515 dout(" mds%d still flushing cap on %p\n",
2516 session->s_mds, 2516 session->s_mds,
2517 &list_entry(session->s_cap_flushing.next, 2517 &list_entry(session->s_cap_flushing.next,
2518 struct ceph_inode_info, 2518 struct ceph_inode_info,
2519 i_flushing_item)->vfs_inode); 2519 i_flushing_item)->vfs_inode);
2520 mdsc->num_cap_flushing--; 2520 mdsc->num_cap_flushing--;
2521 wake_up_all(&mdsc->cap_flushing_wq); 2521 wake_up_all(&mdsc->cap_flushing_wq);
2522 dout(" inode %p now !flushing\n", inode); 2522 dout(" inode %p now !flushing\n", inode);
2523 2523
2524 if (ci->i_dirty_caps == 0) { 2524 if (ci->i_dirty_caps == 0) {
2525 dout(" inode %p now clean\n", inode); 2525 dout(" inode %p now clean\n", inode);
2526 BUG_ON(!list_empty(&ci->i_dirty_item)); 2526 BUG_ON(!list_empty(&ci->i_dirty_item));
2527 drop = 1; 2527 drop = 1;
2528 if (ci->i_wrbuffer_ref_head == 0) { 2528 if (ci->i_wrbuffer_ref_head == 0) {
2529 BUG_ON(!ci->i_head_snapc); 2529 BUG_ON(!ci->i_head_snapc);
2530 ceph_put_snap_context(ci->i_head_snapc); 2530 ceph_put_snap_context(ci->i_head_snapc);
2531 ci->i_head_snapc = NULL; 2531 ci->i_head_snapc = NULL;
2532 } 2532 }
2533 } else { 2533 } else {
2534 BUG_ON(list_empty(&ci->i_dirty_item)); 2534 BUG_ON(list_empty(&ci->i_dirty_item));
2535 } 2535 }
2536 } 2536 }
2537 spin_unlock(&mdsc->cap_dirty_lock); 2537 spin_unlock(&mdsc->cap_dirty_lock);
2538 wake_up_all(&ci->i_cap_wq); 2538 wake_up_all(&ci->i_cap_wq);
2539 2539
2540 out: 2540 out:
2541 spin_unlock(&ci->i_ceph_lock); 2541 spin_unlock(&ci->i_ceph_lock);
2542 if (drop) 2542 if (drop)
2543 iput(inode); 2543 iput(inode);
2544 } 2544 }
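The "cleaned" mask computed at the top of handle_cap_flush_ack() is the dual of the caps_are_flushed() scan: a FLUSH_ACK only clears the dirty bits whose recorded flush tid exactly matches the acked tid, so a newer flush of the same bit is never acknowledged early. In isolation (sizes and types illustrative):

#include <stdio.h>

#define CAP_BITS 16

static unsigned cleaned_mask(unsigned dirty,
                             const unsigned long long tid_of[CAP_BITS],
                             unsigned long long acked_tid)
{
    unsigned cleaned = 0;
    for (int i = 0; i < CAP_BITS; i++)
        if ((dirty & (1u << i)) && tid_of[i] == acked_tid)
            cleaned |= 1u << i;
    return cleaned;
}

int main(void)
{
    unsigned long long tid_of[CAP_BITS] = { [0] = 5, [1] = 6 };
    /* bits 0 and 1 are dirty; only bit 0 was flushed under tid 5 */
    printf("%#x\n", cleaned_mask(0x3, tid_of, 5));   /* 0x1 */
    return 0;
}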
2545 2545
2546 /* 2546 /*
2547 * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can 2547 * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
2548 * throw away our cap_snap. 2548 * throw away our cap_snap.
2549 * 2549 *
2550 * Caller holds s_mutex. 2550 * Caller holds s_mutex.
2551 */ 2551 */
2552 static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, 2552 static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
2553 struct ceph_mds_caps *m, 2553 struct ceph_mds_caps *m,
2554 struct ceph_mds_session *session) 2554 struct ceph_mds_session *session)
2555 { 2555 {
2556 struct ceph_inode_info *ci = ceph_inode(inode); 2556 struct ceph_inode_info *ci = ceph_inode(inode);
2557 u64 follows = le64_to_cpu(m->snap_follows); 2557 u64 follows = le64_to_cpu(m->snap_follows);
2558 struct ceph_cap_snap *capsnap; 2558 struct ceph_cap_snap *capsnap;
2559 int drop = 0; 2559 int drop = 0;
2560 2560
2561 dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", 2561 dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
2562 inode, ci, session->s_mds, follows); 2562 inode, ci, session->s_mds, follows);
2563 2563
2564 spin_lock(&ci->i_ceph_lock); 2564 spin_lock(&ci->i_ceph_lock);
2565 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 2565 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2566 if (capsnap->follows == follows) { 2566 if (capsnap->follows == follows) {
2567 if (capsnap->flush_tid != flush_tid) { 2567 if (capsnap->flush_tid != flush_tid) {
2568 dout(" cap_snap %p follows %lld tid %lld !=" 2568 dout(" cap_snap %p follows %lld tid %lld !="
2569 " %lld\n", capsnap, follows, 2569 " %lld\n", capsnap, follows,
2570 flush_tid, capsnap->flush_tid); 2570 flush_tid, capsnap->flush_tid);
2571 break; 2571 break;
2572 } 2572 }
2573 WARN_ON(capsnap->dirty_pages || capsnap->writing); 2573 WARN_ON(capsnap->dirty_pages || capsnap->writing);
2574 dout(" removing %p cap_snap %p follows %lld\n", 2574 dout(" removing %p cap_snap %p follows %lld\n",
2575 inode, capsnap, follows); 2575 inode, capsnap, follows);
2576 ceph_put_snap_context(capsnap->context); 2576 ceph_put_snap_context(capsnap->context);
2577 list_del(&capsnap->ci_item); 2577 list_del(&capsnap->ci_item);
2578 list_del(&capsnap->flushing_item); 2578 list_del(&capsnap->flushing_item);
2579 ceph_put_cap_snap(capsnap); 2579 ceph_put_cap_snap(capsnap);
2580 drop = 1; 2580 drop = 1;
2581 break; 2581 break;
2582 } else { 2582 } else {
2583 dout(" skipping cap_snap %p follows %lld\n", 2583 dout(" skipping cap_snap %p follows %lld\n",
2584 capsnap, capsnap->follows); 2584 capsnap, capsnap->follows);
2585 } 2585 }
2586 } 2586 }
2587 spin_unlock(&ci->i_ceph_lock); 2587 spin_unlock(&ci->i_ceph_lock);
2588 if (drop) 2588 if (drop)
2589 iput(inode); 2589 iput(inode);
2590 } 2590 }
2591 2591
2592 /* 2592 /*
2593 * Handle TRUNC from MDS, indicating file truncation. 2593 * Handle TRUNC from MDS, indicating file truncation.
2594 * 2594 *
2595 * caller holds s_mutex. 2595 * caller holds s_mutex.
2596 */ 2596 */
2597 static void handle_cap_trunc(struct inode *inode, 2597 static void handle_cap_trunc(struct inode *inode,
2598 struct ceph_mds_caps *trunc, 2598 struct ceph_mds_caps *trunc,
2599 struct ceph_mds_session *session) 2599 struct ceph_mds_session *session)
2600 __releases(ci->i_ceph_lock) 2600 __releases(ci->i_ceph_lock)
2601 { 2601 {
2602 struct ceph_inode_info *ci = ceph_inode(inode); 2602 struct ceph_inode_info *ci = ceph_inode(inode);
2603 int mds = session->s_mds; 2603 int mds = session->s_mds;
2604 int seq = le32_to_cpu(trunc->seq); 2604 int seq = le32_to_cpu(trunc->seq);
2605 u32 truncate_seq = le32_to_cpu(trunc->truncate_seq); 2605 u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
2606 u64 truncate_size = le64_to_cpu(trunc->truncate_size); 2606 u64 truncate_size = le64_to_cpu(trunc->truncate_size);
2607 u64 size = le64_to_cpu(trunc->size); 2607 u64 size = le64_to_cpu(trunc->size);
2608 int implemented = 0; 2608 int implemented = 0;
2609 int dirty = __ceph_caps_dirty(ci); 2609 int dirty = __ceph_caps_dirty(ci);
2610 int issued = __ceph_caps_issued(ceph_inode(inode), &implemented); 2610 int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
2611 int queue_trunc = 0; 2611 int queue_trunc = 0;
2612 2612
2613 issued |= implemented | dirty; 2613 issued |= implemented | dirty;
2614 2614
2615 dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n", 2615 dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
2616 inode, mds, seq, truncate_size, truncate_seq); 2616 inode, mds, seq, truncate_size, truncate_seq);
2617 queue_trunc = ceph_fill_file_size(inode, issued, 2617 queue_trunc = ceph_fill_file_size(inode, issued,
2618 truncate_seq, truncate_size, size); 2618 truncate_seq, truncate_size, size);
2619 spin_unlock(&ci->i_ceph_lock); 2619 spin_unlock(&ci->i_ceph_lock);
2620 2620
2621 if (queue_trunc) 2621 if (queue_trunc)
2622 ceph_queue_vmtruncate(inode); 2622 ceph_queue_vmtruncate(inode);
2623 } 2623 }
2624 2624
2625 /* 2625 /*
2626 * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a 2626 * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
2627 * different one. If this is the most recent migration we've seen (as 2627 * different one. If this is the most recent migration we've seen (as
2628 * indicated by mseq), make note of the migrating cap bits for the 2628 * indicated by mseq), make note of the migrating cap bits for the
2629 * duration (until we see the corresponding IMPORT). 2629 * duration (until we see the corresponding IMPORT).
2630 * 2630 *
2631 * caller holds s_mutex 2631 * caller holds s_mutex
2632 */ 2632 */
2633 static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, 2633 static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2634 struct ceph_mds_session *session, 2634 struct ceph_mds_session *session,
2635 int *open_target_sessions) 2635 int *open_target_sessions)
2636 { 2636 {
2637 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 2637 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
2638 struct ceph_inode_info *ci = ceph_inode(inode); 2638 struct ceph_inode_info *ci = ceph_inode(inode);
2639 int mds = session->s_mds; 2639 int mds = session->s_mds;
2640 unsigned mseq = le32_to_cpu(ex->migrate_seq); 2640 unsigned mseq = le32_to_cpu(ex->migrate_seq);
2641 struct ceph_cap *cap = NULL, *t; 2641 struct ceph_cap *cap = NULL, *t;
2642 struct rb_node *p; 2642 struct rb_node *p;
2643 int remember = 1; 2643 int remember = 1;
2644 2644
2645 dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", 2645 dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
2646 inode, ci, mds, mseq); 2646 inode, ci, mds, mseq);
2647 2647
2648 spin_lock(&ci->i_ceph_lock); 2648 spin_lock(&ci->i_ceph_lock);
2649 2649
2650 /* make sure we haven't seen a higher mseq */ 2650 /* make sure we haven't seen a higher mseq */
2651 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 2651 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
2652 t = rb_entry(p, struct ceph_cap, ci_node); 2652 t = rb_entry(p, struct ceph_cap, ci_node);
2653 if (ceph_seq_cmp(t->mseq, mseq) > 0) { 2653 if (ceph_seq_cmp(t->mseq, mseq) > 0) {
2654 dout(" higher mseq on cap from mds%d\n", 2654 dout(" higher mseq on cap from mds%d\n",
2655 t->session->s_mds); 2655 t->session->s_mds);
2656 remember = 0; 2656 remember = 0;
2657 } 2657 }
2658 if (t->session->s_mds == mds) 2658 if (t->session->s_mds == mds)
2659 cap = t; 2659 cap = t;
2660 } 2660 }
2661 2661
2662 if (cap) { 2662 if (cap) {
2663 if (remember) { 2663 if (remember) {
2664 /* make note */ 2664 /* make note */
2665 ci->i_cap_exporting_mds = mds; 2665 ci->i_cap_exporting_mds = mds;
2666 ci->i_cap_exporting_mseq = mseq; 2666 ci->i_cap_exporting_mseq = mseq;
2667 ci->i_cap_exporting_issued = cap->issued; 2667 ci->i_cap_exporting_issued = cap->issued;
2668 2668
2669 /* 2669 /*
2670 * make sure we have open sessions with all possible 2670 * make sure we have open sessions with all possible
2671 * export targets, so that we get the matching IMPORT 2671 * export targets, so that we get the matching IMPORT
2672 */ 2672 */
2673 *open_target_sessions = 1; 2673 *open_target_sessions = 1;
2674 2674
2675 /* 2675 /*
2676 * we can't flush dirty caps that we've seen the 2676 * we can't flush dirty caps that we've seen the
2677 * EXPORT but no IMPORT for 2677 * EXPORT but no IMPORT for
2678 */ 2678 */
2679 spin_lock(&mdsc->cap_dirty_lock); 2679 spin_lock(&mdsc->cap_dirty_lock);
2680 if (!list_empty(&ci->i_dirty_item)) { 2680 if (!list_empty(&ci->i_dirty_item)) {
2681 dout(" moving %p to cap_dirty_migrating\n", 2681 dout(" moving %p to cap_dirty_migrating\n",
2682 inode); 2682 inode);
2683 list_move(&ci->i_dirty_item, 2683 list_move(&ci->i_dirty_item,
2684 &mdsc->cap_dirty_migrating); 2684 &mdsc->cap_dirty_migrating);
2685 } 2685 }
2686 spin_unlock(&mdsc->cap_dirty_lock); 2686 spin_unlock(&mdsc->cap_dirty_lock);
2687 } 2687 }
2688 __ceph_remove_cap(cap); 2688 __ceph_remove_cap(cap);
2689 } 2689 }
2690 /* else, we already released it */ 2690 /* else, we already released it */
2691 2691
2692 spin_unlock(&ci->i_ceph_lock); 2692 spin_unlock(&ci->i_ceph_lock);
2693 } 2693 }
2694 2694
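The mseq comparison above relies on ceph_seq_cmp(), a wraparound-safe sequence compare. A self-contained sketch of the idiom, assuming the two sequence numbers are less than 2^31 apart:

#include <stdint.h>

/* Returns >0 if a is newer than b, <0 if older, 0 if equal, and
 * stays correct across u32 wraparound (serial-number arithmetic).
 * e.g. seq_cmp(1, 0xffffffff) > 0: seq 1 is newer than seq 2^32-1. */
static int seq_cmp(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b);
}
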
2695 /* 2695 /*
2696 * Handle cap IMPORT. If there are temp bits from an older EXPORT, 2696 * Handle cap IMPORT. If there are temp bits from an older EXPORT,
2697 * clean them up. 2697 * clean them up.
2698 * 2698 *
2699 * caller holds s_mutex. 2699 * caller holds s_mutex.
2700 */ 2700 */
2701 static void handle_cap_import(struct ceph_mds_client *mdsc, 2701 static void handle_cap_import(struct ceph_mds_client *mdsc,
2702 struct inode *inode, struct ceph_mds_caps *im, 2702 struct inode *inode, struct ceph_mds_caps *im,
2703 struct ceph_mds_session *session, 2703 struct ceph_mds_session *session,
2704 void *snaptrace, int snaptrace_len) 2704 void *snaptrace, int snaptrace_len)
2705 { 2705 {
2706 struct ceph_inode_info *ci = ceph_inode(inode); 2706 struct ceph_inode_info *ci = ceph_inode(inode);
2707 int mds = session->s_mds; 2707 int mds = session->s_mds;
2708 unsigned issued = le32_to_cpu(im->caps); 2708 unsigned issued = le32_to_cpu(im->caps);
2709 unsigned wanted = le32_to_cpu(im->wanted); 2709 unsigned wanted = le32_to_cpu(im->wanted);
2710 unsigned seq = le32_to_cpu(im->seq); 2710 unsigned seq = le32_to_cpu(im->seq);
2711 unsigned mseq = le32_to_cpu(im->migrate_seq); 2711 unsigned mseq = le32_to_cpu(im->migrate_seq);
2712 u64 realmino = le64_to_cpu(im->realm); 2712 u64 realmino = le64_to_cpu(im->realm);
2713 u64 cap_id = le64_to_cpu(im->cap_id); 2713 u64 cap_id = le64_to_cpu(im->cap_id);
2714 2714
2715 if (ci->i_cap_exporting_mds >= 0 && 2715 if (ci->i_cap_exporting_mds >= 0 &&
2716 ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) { 2716 ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
2717 dout("handle_cap_import inode %p ci %p mds%d mseq %d" 2717 dout("handle_cap_import inode %p ci %p mds%d mseq %d"
2718 " - cleared exporting from mds%d\n", 2718 " - cleared exporting from mds%d\n",
2719 inode, ci, mds, mseq, 2719 inode, ci, mds, mseq,
2720 ci->i_cap_exporting_mds); 2720 ci->i_cap_exporting_mds);
2721 ci->i_cap_exporting_issued = 0; 2721 ci->i_cap_exporting_issued = 0;
2722 ci->i_cap_exporting_mseq = 0; 2722 ci->i_cap_exporting_mseq = 0;
2723 ci->i_cap_exporting_mds = -1; 2723 ci->i_cap_exporting_mds = -1;
2724 2724
2725 spin_lock(&mdsc->cap_dirty_lock); 2725 spin_lock(&mdsc->cap_dirty_lock);
2726 if (!list_empty(&ci->i_dirty_item)) { 2726 if (!list_empty(&ci->i_dirty_item)) {
2727 dout(" moving %p back to cap_dirty\n", inode); 2727 dout(" moving %p back to cap_dirty\n", inode);
2728 list_move(&ci->i_dirty_item, &mdsc->cap_dirty); 2728 list_move(&ci->i_dirty_item, &mdsc->cap_dirty);
2729 } 2729 }
2730 spin_unlock(&mdsc->cap_dirty_lock); 2730 spin_unlock(&mdsc->cap_dirty_lock);
2731 } else { 2731 } else {
2732 dout("handle_cap_import inode %p ci %p mds%d mseq %d\n", 2732 dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
2733 inode, ci, mds, mseq); 2733 inode, ci, mds, mseq);
2734 } 2734 }
2735 2735
2736 down_write(&mdsc->snap_rwsem); 2736 down_write(&mdsc->snap_rwsem);
2737 ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len, 2737 ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
2738 false); 2738 false);
2739 downgrade_write(&mdsc->snap_rwsem); 2739 downgrade_write(&mdsc->snap_rwsem);
2740 ceph_add_cap(inode, session, cap_id, -1, 2740 ceph_add_cap(inode, session, cap_id, -1,
2741 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, 2741 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
2742 NULL /* no caps context */); 2742 NULL /* no caps context */);
2743 kick_flushing_inode_caps(mdsc, session, inode); 2743 kick_flushing_inode_caps(mdsc, session, inode);
2744 up_read(&mdsc->snap_rwsem); 2744 up_read(&mdsc->snap_rwsem);
2745 2745
2746 /* make sure we re-request max_size, if necessary */ 2746 /* make sure we re-request max_size, if necessary */
2747 spin_lock(&ci->i_ceph_lock); 2747 spin_lock(&ci->i_ceph_lock);
2748 ci->i_requested_max_size = 0; 2748 ci->i_requested_max_size = 0;
2749 spin_unlock(&ci->i_ceph_lock); 2749 spin_unlock(&ci->i_ceph_lock);
2750 } 2750 }
2751 2751
2752 /* 2752 /*
2753 * Handle a caps message from the MDS. 2753 * Handle a caps message from the MDS.
2754 * 2754 *
2755 * Identify the appropriate session, inode, and call the right handler 2755 * Identify the appropriate session, inode, and call the right handler
2756 * based on the cap op. 2756 * based on the cap op.
2757 */ 2757 */
2758 void ceph_handle_caps(struct ceph_mds_session *session, 2758 void ceph_handle_caps(struct ceph_mds_session *session,
2759 struct ceph_msg *msg) 2759 struct ceph_msg *msg)
2760 { 2760 {
2761 struct ceph_mds_client *mdsc = session->s_mdsc; 2761 struct ceph_mds_client *mdsc = session->s_mdsc;
2762 struct super_block *sb = mdsc->fsc->sb; 2762 struct super_block *sb = mdsc->fsc->sb;
2763 struct inode *inode; 2763 struct inode *inode;
2764 struct ceph_inode_info *ci; 2764 struct ceph_inode_info *ci;
2765 struct ceph_cap *cap; 2765 struct ceph_cap *cap;
2766 struct ceph_mds_caps *h; 2766 struct ceph_mds_caps *h;
2767 int mds = session->s_mds; 2767 int mds = session->s_mds;
2768 int op; 2768 int op;
2769 u32 seq, mseq; 2769 u32 seq, mseq;
2770 struct ceph_vino vino; 2770 struct ceph_vino vino;
2771 u64 cap_id; 2771 u64 cap_id;
2772 u64 size, max_size; 2772 u64 size, max_size;
2773 u64 tid; 2773 u64 tid;
2774 void *snaptrace; 2774 void *snaptrace;
2775 size_t snaptrace_len; 2775 size_t snaptrace_len;
2776 void *flock; 2776 void *flock;
2777 u32 flock_len; 2777 u32 flock_len;
2778 int open_target_sessions = 0; 2778 int open_target_sessions = 0;
2779 2779
2780 dout("handle_caps from mds%d\n", mds); 2780 dout("handle_caps from mds%d\n", mds);
2781 2781
2782 /* decode */ 2782 /* decode */
2783 tid = le64_to_cpu(msg->hdr.tid); 2783 tid = le64_to_cpu(msg->hdr.tid);
2784 if (msg->front.iov_len < sizeof(*h)) 2784 if (msg->front.iov_len < sizeof(*h))
2785 goto bad; 2785 goto bad;
2786 h = msg->front.iov_base; 2786 h = msg->front.iov_base;
2787 op = le32_to_cpu(h->op); 2787 op = le32_to_cpu(h->op);
2788 vino.ino = le64_to_cpu(h->ino); 2788 vino.ino = le64_to_cpu(h->ino);
2789 vino.snap = CEPH_NOSNAP; 2789 vino.snap = CEPH_NOSNAP;
2790 cap_id = le64_to_cpu(h->cap_id); 2790 cap_id = le64_to_cpu(h->cap_id);
2791 seq = le32_to_cpu(h->seq); 2791 seq = le32_to_cpu(h->seq);
2792 mseq = le32_to_cpu(h->migrate_seq); 2792 mseq = le32_to_cpu(h->migrate_seq);
2793 size = le64_to_cpu(h->size); 2793 size = le64_to_cpu(h->size);
2794 max_size = le64_to_cpu(h->max_size); 2794 max_size = le64_to_cpu(h->max_size);
2795 2795
2796 snaptrace = h + 1; 2796 snaptrace = h + 1;
2797 snaptrace_len = le32_to_cpu(h->snap_trace_len); 2797 snaptrace_len = le32_to_cpu(h->snap_trace_len);
2798 2798
2799 if (le16_to_cpu(msg->hdr.version) >= 2) { 2799 if (le16_to_cpu(msg->hdr.version) >= 2) {
2800 void *p, *end; 2800 void *p, *end;
2801 2801
2802 p = snaptrace + snaptrace_len; 2802 p = snaptrace + snaptrace_len;
2803 end = msg->front.iov_base + msg->front.iov_len; 2803 end = msg->front.iov_base + msg->front.iov_len;
2804 ceph_decode_32_safe(&p, end, flock_len, bad); 2804 ceph_decode_32_safe(&p, end, flock_len, bad);
2805 flock = p; 2805 flock = p;
2806 } else { 2806 } else {
2807 flock = NULL; 2807 flock = NULL;
2808 flock_len = 0; 2808 flock_len = 0;
2809 } 2809 }
2810 2810
2811 mutex_lock(&session->s_mutex); 2811 mutex_lock(&session->s_mutex);
2812 session->s_seq++; 2812 session->s_seq++;
2813 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, 2813 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
2814 (unsigned)seq); 2814 (unsigned)seq);
2815 2815
2816 /* lookup ino */ 2816 /* lookup ino */
2817 inode = ceph_find_inode(sb, vino); 2817 inode = ceph_find_inode(sb, vino);
2818 ci = ceph_inode(inode); 2818 ci = ceph_inode(inode);
2819 dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, 2819 dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
2820 vino.snap, inode); 2820 vino.snap, inode);
2821 if (!inode) { 2821 if (!inode) {
2822 dout(" i don't have ino %llx\n", vino.ino); 2822 dout(" i don't have ino %llx\n", vino.ino);
2823 2823
2824 if (op == CEPH_CAP_OP_IMPORT) 2824 if (op == CEPH_CAP_OP_IMPORT)
2825 __queue_cap_release(session, vino.ino, cap_id, 2825 __queue_cap_release(session, vino.ino, cap_id,
2826 mseq, seq); 2826 mseq, seq);
2827 goto flush_cap_releases; 2827 goto flush_cap_releases;
2828 } 2828 }
2829 2829
2830 /* these will work even if we don't have a cap yet */ 2830 /* these will work even if we don't have a cap yet */
2831 switch (op) { 2831 switch (op) {
2832 case CEPH_CAP_OP_FLUSHSNAP_ACK: 2832 case CEPH_CAP_OP_FLUSHSNAP_ACK:
2833 handle_cap_flushsnap_ack(inode, tid, h, session); 2833 handle_cap_flushsnap_ack(inode, tid, h, session);
2834 goto done; 2834 goto done;
2835 2835
2836 case CEPH_CAP_OP_EXPORT: 2836 case CEPH_CAP_OP_EXPORT:
2837 handle_cap_export(inode, h, session, &open_target_sessions); 2837 handle_cap_export(inode, h, session, &open_target_sessions);
2838 goto done; 2838 goto done;
2839 2839
2840 case CEPH_CAP_OP_IMPORT: 2840 case CEPH_CAP_OP_IMPORT:
2841 handle_cap_import(mdsc, inode, h, session, 2841 handle_cap_import(mdsc, inode, h, session,
2842 snaptrace, snaptrace_len); 2842 snaptrace, snaptrace_len);
2843 ceph_check_caps(ceph_inode(inode), 0, session); 2843 ceph_check_caps(ceph_inode(inode), 0, session);
2844 goto done_unlocked; 2844 goto done_unlocked;
2845 } 2845 }
2846 2846
2847 /* the rest require a cap */ 2847 /* the rest require a cap */
2848 spin_lock(&ci->i_ceph_lock); 2848 spin_lock(&ci->i_ceph_lock);
2849 cap = __get_cap_for_mds(ceph_inode(inode), mds); 2849 cap = __get_cap_for_mds(ceph_inode(inode), mds);
2850 if (!cap) { 2850 if (!cap) {
2851 dout(" no cap on %p ino %llx.%llx from mds%d\n", 2851 dout(" no cap on %p ino %llx.%llx from mds%d\n",
2852 inode, ceph_ino(inode), ceph_snap(inode), mds); 2852 inode, ceph_ino(inode), ceph_snap(inode), mds);
2853 spin_unlock(&ci->i_ceph_lock); 2853 spin_unlock(&ci->i_ceph_lock);
2854 goto flush_cap_releases; 2854 goto flush_cap_releases;
2855 } 2855 }
2856 2856
2857 /* note that each of these drops i_ceph_lock for us */ 2857 /* note that each of these drops i_ceph_lock for us */
2858 switch (op) { 2858 switch (op) {
2859 case CEPH_CAP_OP_REVOKE: 2859 case CEPH_CAP_OP_REVOKE:
2860 case CEPH_CAP_OP_GRANT: 2860 case CEPH_CAP_OP_GRANT:
2861 handle_cap_grant(inode, h, session, cap, msg->middle); 2861 handle_cap_grant(inode, h, session, cap, msg->middle);
2862 goto done_unlocked; 2862 goto done_unlocked;
2863 2863
2864 case CEPH_CAP_OP_FLUSH_ACK: 2864 case CEPH_CAP_OP_FLUSH_ACK:
2865 handle_cap_flush_ack(inode, tid, h, session, cap); 2865 handle_cap_flush_ack(inode, tid, h, session, cap);
2866 break; 2866 break;
2867 2867
2868 case CEPH_CAP_OP_TRUNC: 2868 case CEPH_CAP_OP_TRUNC:
2869 handle_cap_trunc(inode, h, session); 2869 handle_cap_trunc(inode, h, session);
2870 break; 2870 break;
2871 2871
2872 default: 2872 default:
2873 spin_unlock(&ci->i_ceph_lock); 2873 spin_unlock(&ci->i_ceph_lock);
2874 pr_err("ceph_handle_caps: unknown cap op %d %s\n", op, 2874 pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
2875 ceph_cap_op_name(op)); 2875 ceph_cap_op_name(op));
2876 } 2876 }
2877 2877
2878 goto done; 2878 goto done;
2879 2879
2880 flush_cap_releases: 2880 flush_cap_releases:
2881 /* 2881 /*
2882 * send any full release message to try to move things 2882 * send any full release message to try to move things
2883 * along for the mds (who clearly thinks we still have this 2883 * along for the mds (who clearly thinks we still have this
2884 * cap). 2884 * cap).
2885 */ 2885 */
2886 ceph_add_cap_releases(mdsc, session); 2886 ceph_add_cap_releases(mdsc, session);
2887 ceph_send_cap_releases(mdsc, session); 2887 ceph_send_cap_releases(mdsc, session);
2888 2888
2889 done: 2889 done:
2890 mutex_unlock(&session->s_mutex); 2890 mutex_unlock(&session->s_mutex);
2891 done_unlocked: 2891 done_unlocked:
2892 if (inode) 2892 if (inode)
2893 iput(inode); 2893 iput(inode);
2894 if (open_target_sessions) 2894 if (open_target_sessions)
2895 ceph_mdsc_open_export_target_sessions(mdsc, session); 2895 ceph_mdsc_open_export_target_sessions(mdsc, session);
2896 return; 2896 return;
2897 2897
2898 bad: 2898 bad:
2899 pr_err("ceph_handle_caps: corrupt message\n"); 2899 pr_err("ceph_handle_caps: corrupt message\n");
2900 ceph_msg_dump(msg); 2900 ceph_msg_dump(msg);
2901 return; 2901 return;
2902 } 2902 }
2903 2903
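Note how the decode path never trusts the message length: the fixed header is checked against iov_len up front, and the optional flock section is read via ceph_decode_32_safe(), which jumps to the bad label on truncation. A userspace sketch of that bounds-checked decode pattern (an illustration, not the kernel macro):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Read a 32-bit field only if the buffer still holds one; a failure
 * here corresponds to the "goto bad" corrupt-message path above.
 * (Endian conversion is omitted; the kernel applies le32_to_cpu.) */
static int decode_u32_safe(const uint8_t **p, const uint8_t *end,
                           uint32_t *v)
{
        if (end - *p < (ptrdiff_t)sizeof(*v))
                return -1;      /* truncated message */
        memcpy(v, *p, sizeof(*v));
        *p += sizeof(*v);
        return 0;
}
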
2904 /* 2904 /*
2905 * Delayed work handler to process end of delayed cap release LRU list. 2905 * Delayed work handler to process end of delayed cap release LRU list.
2906 */ 2906 */
2907 void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) 2907 void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
2908 { 2908 {
2909 struct ceph_inode_info *ci; 2909 struct ceph_inode_info *ci;
2910 int flags = CHECK_CAPS_NODELAY; 2910 int flags = CHECK_CAPS_NODELAY;
2911 2911
2912 dout("check_delayed_caps\n"); 2912 dout("check_delayed_caps\n");
2913 while (1) { 2913 while (1) {
2914 spin_lock(&mdsc->cap_delay_lock); 2914 spin_lock(&mdsc->cap_delay_lock);
2915 if (list_empty(&mdsc->cap_delay_list)) 2915 if (list_empty(&mdsc->cap_delay_list))
2916 break; 2916 break;
2917 ci = list_first_entry(&mdsc->cap_delay_list, 2917 ci = list_first_entry(&mdsc->cap_delay_list,
2918 struct ceph_inode_info, 2918 struct ceph_inode_info,
2919 i_cap_delay_list); 2919 i_cap_delay_list);
2920 if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 && 2920 if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
2921 time_before(jiffies, ci->i_hold_caps_max)) 2921 time_before(jiffies, ci->i_hold_caps_max))
2922 break; 2922 break;
2923 list_del_init(&ci->i_cap_delay_list); 2923 list_del_init(&ci->i_cap_delay_list);
2924 spin_unlock(&mdsc->cap_delay_lock); 2924 spin_unlock(&mdsc->cap_delay_lock);
2925 dout("check_delayed_caps on %p\n", &ci->vfs_inode); 2925 dout("check_delayed_caps on %p\n", &ci->vfs_inode);
2926 ceph_check_caps(ci, flags, NULL); 2926 ceph_check_caps(ci, flags, NULL);
2927 } 2927 }
2928 spin_unlock(&mdsc->cap_delay_lock); 2928 spin_unlock(&mdsc->cap_delay_lock);
2929 } 2929 }
2930 2930
2931 /* 2931 /*
2932 * Flush all dirty caps to the mds 2932 * Flush all dirty caps to the mds
2933 */ 2933 */
2934 void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) 2934 void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
2935 { 2935 {
2936 struct ceph_inode_info *ci; 2936 struct ceph_inode_info *ci;
2937 struct inode *inode; 2937 struct inode *inode;
2938 2938
2939 dout("flush_dirty_caps\n"); 2939 dout("flush_dirty_caps\n");
2940 spin_lock(&mdsc->cap_dirty_lock); 2940 spin_lock(&mdsc->cap_dirty_lock);
2941 while (!list_empty(&mdsc->cap_dirty)) { 2941 while (!list_empty(&mdsc->cap_dirty)) {
2942 ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, 2942 ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info,
2943 i_dirty_item); 2943 i_dirty_item);
2944 inode = &ci->vfs_inode; 2944 inode = &ci->vfs_inode;
2945 ihold(inode); 2945 ihold(inode);
2946 dout("flush_dirty_caps %p\n", inode); 2946 dout("flush_dirty_caps %p\n", inode);
2947 spin_unlock(&mdsc->cap_dirty_lock); 2947 spin_unlock(&mdsc->cap_dirty_lock);
2948 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL); 2948 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL);
2949 iput(inode); 2949 iput(inode);
2950 spin_lock(&mdsc->cap_dirty_lock); 2950 spin_lock(&mdsc->cap_dirty_lock);
2951 } 2951 }
2952 spin_unlock(&mdsc->cap_dirty_lock); 2952 spin_unlock(&mdsc->cap_dirty_lock);
2953 dout("flush_dirty_caps done\n"); 2953 dout("flush_dirty_caps done\n");
2954 } 2954 }
2955 2955
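Both loops above follow the same drain discipline: take the spinlock, detach or pin the first entry, drop the lock, run ceph_check_caps() unlocked, then retake the lock before looking at the list again. A hypothetical userspace sketch of the pattern:

#include <pthread.h>
#include <stdio.h>

struct item {
        struct item *next;
        int id;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *head;

static void expensive_work(struct item *it)     /* stands in for ceph_check_caps() */
{
        printf("processing %d\n", it->id);
}

static void drain(void)
{
        pthread_mutex_lock(&list_lock);
        while (head) {
                struct item *it = head;
                head = it->next;                        /* unlink under the lock */
                pthread_mutex_unlock(&list_lock);       /* never do the work locked */
                expensive_work(it);
                pthread_mutex_lock(&list_lock);         /* retake before re-checking */
        }
        pthread_mutex_unlock(&list_lock);
}

int main(void)
{
        struct item a = { NULL, 1 }, b = { &a, 2 };

        head = &b;
        drain();        /* prints 2 then 1 */
        return 0;
}
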
2956 /* 2956 /*
2957 * Drop open file reference. If we were the last open file, 2957 * Drop open file reference. If we were the last open file,
2958 * we may need to release capabilities to the MDS (or schedule 2958 * we may need to release capabilities to the MDS (or schedule
2959 * their delayed release). 2959 * their delayed release).
2960 */ 2960 */
2961 void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) 2961 void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
2962 { 2962 {
2963 struct inode *inode = &ci->vfs_inode; 2963 struct inode *inode = &ci->vfs_inode;
2964 int last = 0; 2964 int last = 0;
2965 2965
2966 spin_lock(&ci->i_ceph_lock); 2966 spin_lock(&ci->i_ceph_lock);
2967 dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode, 2967 dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
2968 ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1); 2968 ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
2969 BUG_ON(ci->i_nr_by_mode[fmode] == 0); 2969 BUG_ON(ci->i_nr_by_mode[fmode] == 0);
2970 if (--ci->i_nr_by_mode[fmode] == 0) 2970 if (--ci->i_nr_by_mode[fmode] == 0)
2971 last++; 2971 last++;
2972 spin_unlock(&ci->i_ceph_lock); 2972 spin_unlock(&ci->i_ceph_lock);
2973 2973
2974 if (last && ci->i_vino.snap == CEPH_NOSNAP) 2974 if (last && ci->i_vino.snap == CEPH_NOSNAP)
2975 ceph_check_caps(ci, 0, NULL); 2975 ceph_check_caps(ci, 0, NULL);
2976 } 2976 }
2977 2977
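ceph_put_fmode() shows the companion idiom: the zero-crossing is detected while the lock is held, but the heavyweight follow-up runs after the unlock. A small sketch (names are illustrative):

#include <pthread.h>

static pthread_mutex_t ref_lock = PTHREAD_MUTEX_INITIALIZER;
static int nr_open;

static void last_close_work(void)       /* stands in for ceph_check_caps() */
{
}

static void put_open_ref(void)
{
        int last = 0;

        pthread_mutex_lock(&ref_lock);
        if (--nr_open == 0)
                last = 1;               /* remember: we were the last */
        pthread_mutex_unlock(&ref_lock);
        if (last)
                last_close_work();      /* heavy work, lock not held */
}
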
2978 /* 2978 /*
2979 * Helpers for embedding cap and dentry lease releases into mds 2979 * Helpers for embedding cap and dentry lease releases into mds
2980 * requests. 2980 * requests.
2981 * 2981 *
2982 * @force is used by dentry_release (below) to force inclusion of a 2982 * @force is used by dentry_release (below) to force inclusion of a
2983 * record for the directory inode, even when there aren't any caps to 2983 * record for the directory inode, even when there aren't any caps to
2984 * drop. 2984 * drop.
2985 */ 2985 */
2986 int ceph_encode_inode_release(void **p, struct inode *inode, 2986 int ceph_encode_inode_release(void **p, struct inode *inode,
2987 int mds, int drop, int unless, int force) 2987 int mds, int drop, int unless, int force)
2988 { 2988 {
2989 struct ceph_inode_info *ci = ceph_inode(inode); 2989 struct ceph_inode_info *ci = ceph_inode(inode);
2990 struct ceph_cap *cap; 2990 struct ceph_cap *cap;
2991 struct ceph_mds_request_release *rel = *p; 2991 struct ceph_mds_request_release *rel = *p;
2992 int used, dirty; 2992 int used, dirty;
2993 int ret = 0; 2993 int ret = 0;
2994 2994
2995 spin_lock(&ci->i_ceph_lock); 2995 spin_lock(&ci->i_ceph_lock);
2996 used = __ceph_caps_used(ci); 2996 used = __ceph_caps_used(ci);
2997 dirty = __ceph_caps_dirty(ci); 2997 dirty = __ceph_caps_dirty(ci);
2998 2998
2999 dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n", 2999 dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n",
3000 inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop), 3000 inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop),
3001 ceph_cap_string(unless)); 3001 ceph_cap_string(unless));
3002 3002
3003 /* only drop unused, clean caps */ 3003 /* only drop unused, clean caps */
3004 drop &= ~(used | dirty); 3004 drop &= ~(used | dirty);
3005 3005
3006 cap = __get_cap_for_mds(ci, mds); 3006 cap = __get_cap_for_mds(ci, mds);
3007 if (cap && __cap_is_valid(cap)) { 3007 if (cap && __cap_is_valid(cap)) {
3008 if (force || 3008 if (force ||
3009 ((cap->issued & drop) && 3009 ((cap->issued & drop) &&
3010 (cap->issued & unless) == 0)) { 3010 (cap->issued & unless) == 0)) {
3011 if ((cap->issued & drop) && 3011 if ((cap->issued & drop) &&
3012 (cap->issued & unless) == 0) { 3012 (cap->issued & unless) == 0) {
3013 dout("encode_inode_release %p cap %p %s -> " 3013 dout("encode_inode_release %p cap %p %s -> "
3014 "%s\n", inode, cap, 3014 "%s\n", inode, cap,
3015 ceph_cap_string(cap->issued), 3015 ceph_cap_string(cap->issued),
3016 ceph_cap_string(cap->issued & ~drop)); 3016 ceph_cap_string(cap->issued & ~drop));
3017 cap->issued &= ~drop; 3017 cap->issued &= ~drop;
3018 cap->implemented &= ~drop; 3018 cap->implemented &= ~drop;
3019 if (ci->i_ceph_flags & CEPH_I_NODELAY) { 3019 if (ci->i_ceph_flags & CEPH_I_NODELAY) {
3020 int wanted = __ceph_caps_wanted(ci); 3020 int wanted = __ceph_caps_wanted(ci);
3021 dout(" wanted %s -> %s (act %s)\n", 3021 dout(" wanted %s -> %s (act %s)\n",
3022 ceph_cap_string(cap->mds_wanted), 3022 ceph_cap_string(cap->mds_wanted),
3023 ceph_cap_string(cap->mds_wanted & 3023 ceph_cap_string(cap->mds_wanted &
3024 ~wanted), 3024 ~wanted),
3025 ceph_cap_string(wanted)); 3025 ceph_cap_string(wanted));
3026 cap->mds_wanted &= wanted; 3026 cap->mds_wanted &= wanted;
3027 } 3027 }
3028 } else { 3028 } else {
3029 dout("encode_inode_release %p cap %p %s" 3029 dout("encode_inode_release %p cap %p %s"
3030 " (force)\n", inode, cap, 3030 " (force)\n", inode, cap,
3031 ceph_cap_string(cap->issued)); 3031 ceph_cap_string(cap->issued));
3032 } 3032 }
3033 3033
3034 rel->ino = cpu_to_le64(ceph_ino(inode)); 3034 rel->ino = cpu_to_le64(ceph_ino(inode));
3035 rel->cap_id = cpu_to_le64(cap->cap_id); 3035 rel->cap_id = cpu_to_le64(cap->cap_id);
3036 rel->seq = cpu_to_le32(cap->seq); 3036 rel->seq = cpu_to_le32(cap->seq);
3037 rel->issue_seq = cpu_to_le32(cap->issue_seq); 3037 rel->issue_seq = cpu_to_le32(cap->issue_seq);
3038 rel->mseq = cpu_to_le32(cap->mseq); 3038 rel->mseq = cpu_to_le32(cap->mseq);
3039 rel->caps = cpu_to_le32(cap->issued); 3039 rel->caps = cpu_to_le32(cap->issued);
3040 rel->wanted = cpu_to_le32(cap->mds_wanted); 3040 rel->wanted = cpu_to_le32(cap->mds_wanted);
3041 rel->dname_len = 0; 3041 rel->dname_len = 0;
3042 rel->dname_seq = 0; 3042 rel->dname_seq = 0;
3043 *p += sizeof(*rel); 3043 *p += sizeof(*rel);
3044 ret = 1; 3044 ret = 1;
3045 } else { 3045 } else {
3046 dout("encode_inode_release %p cap %p %s\n", 3046 dout("encode_inode_release %p cap %p %s\n",
3047 inode, cap, ceph_cap_string(cap->issued)); 3047 inode, cap, ceph_cap_string(cap->issued));
3048 } 3048 }
3049 } 3049 }
3050 spin_unlock(&ci->i_ceph_lock); 3050 spin_unlock(&ci->i_ceph_lock);
3051 return ret; 3051 return ret;
3052 } 3052 }
3053 3053
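The line drop &= ~(used | dirty) is what makes the release safe: a cap bit makes it into the release record only if no open file is using it and no dirty data depends on it. A toy illustration with made-up cap bit values:

#include <stdio.h>

/* Hypothetical cap bits, for illustration only. */
#define CAP_RD  0x1
#define CAP_WR  0x2
#define CAP_BUF 0x4

int main(void)
{
        int issued = CAP_RD | CAP_WR | CAP_BUF;
        int used = CAP_RD;              /* an open reader */
        int dirty = CAP_BUF;            /* unflushed buffered data */
        int drop = issued;              /* request: drop everything */

        drop &= ~(used | dirty);        /* only unused, clean caps */
        printf("issued %#x, dropped %#x, kept %#x\n",
               issued, drop, issued & ~drop);   /* keeps RD and BUF */
        return 0;
}
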
3054 int ceph_encode_dentry_release(void **p, struct dentry *dentry, 3054 int ceph_encode_dentry_release(void **p, struct dentry *dentry,
3055 int mds, int drop, int unless) 3055 int mds, int drop, int unless)
3056 { 3056 {
3057 struct inode *dir = dentry->d_parent->d_inode; 3057 struct inode *dir = dentry->d_parent->d_inode;
3058 struct ceph_mds_request_release *rel = *p; 3058 struct ceph_mds_request_release *rel = *p;
3059 struct ceph_dentry_info *di = ceph_dentry(dentry); 3059 struct ceph_dentry_info *di = ceph_dentry(dentry);
3060 int force = 0; 3060 int force = 0;
3061 int ret; 3061 int ret;
3062 3062
3063 /* 3063 /*
3064 * force a record for the directory caps if we have a dentry lease. 3064 * force a record for the directory caps if we have a dentry lease.
3065 * this is racy (can't take i_ceph_lock and d_lock together), but it 3065 * this is racy (can't take i_ceph_lock and d_lock together), but it
3066 * doesn't have to be perfect; the mds will revoke anything we don't 3066 * doesn't have to be perfect; the mds will revoke anything we don't
3067 * release. 3067 * release.
3068 */ 3068 */
3069 spin_lock(&dentry->d_lock); 3069 spin_lock(&dentry->d_lock);
3070 if (di->lease_session && di->lease_session->s_mds == mds) 3070 if (di->lease_session && di->lease_session->s_mds == mds)
3071 force = 1; 3071 force = 1;
3072 spin_unlock(&dentry->d_lock); 3072 spin_unlock(&dentry->d_lock);
3073 3073
3074 ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); 3074 ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
3075 3075
3076 spin_lock(&dentry->d_lock); 3076 spin_lock(&dentry->d_lock);
3077 if (ret && di->lease_session && di->lease_session->s_mds == mds) { 3077 if (ret && di->lease_session && di->lease_session->s_mds == mds) {
3078 dout("encode_dentry_release %p mds%d seq %d\n", 3078 dout("encode_dentry_release %p mds%d seq %d\n",
3079 dentry, mds, (int)di->lease_seq); 3079 dentry, mds, (int)di->lease_seq);
3080 rel->dname_len = cpu_to_le32(dentry->d_name.len); 3080 rel->dname_len = cpu_to_le32(dentry->d_name.len);
3081 memcpy(*p, dentry->d_name.name, dentry->d_name.len); 3081 memcpy(*p, dentry->d_name.name, dentry->d_name.len);
3082 *p += dentry->d_name.len; 3082 *p += dentry->d_name.len;
3083 rel->dname_seq = cpu_to_le32(di->lease_seq); 3083 rel->dname_seq = cpu_to_le32(di->lease_seq);
3084 __ceph_mdsc_drop_dentry_lease(dentry); 3084 __ceph_mdsc_drop_dentry_lease(dentry);
3085 } 3085 }
3086 spin_unlock(&dentry->d_lock); 3086 spin_unlock(&dentry->d_lock);
3087 return ret; 3087 return ret;
3088 } 3088 }
3089 3089
fs/ceph/dir.c

1 #include <linux/ceph/ceph_debug.h> 1 #include <linux/ceph/ceph_debug.h>
2 2
3 #include <linux/spinlock.h> 3 #include <linux/spinlock.h>
4 #include <linux/fs_struct.h> 4 #include <linux/fs_struct.h>
5 #include <linux/namei.h> 5 #include <linux/namei.h>
6 #include <linux/slab.h> 6 #include <linux/slab.h>
7 #include <linux/sched.h> 7 #include <linux/sched.h>
8 8
9 #include "super.h" 9 #include "super.h"
10 #include "mds_client.h" 10 #include "mds_client.h"
11 11
12 /* 12 /*
13 * Directory operations: readdir, lookup, create, link, unlink, 13 * Directory operations: readdir, lookup, create, link, unlink,
14 * rename, etc. 14 * rename, etc.
15 */ 15 */
16 16
17 /* 17 /*
18 * Ceph MDS operations are specified in terms of a base ino and 18 * Ceph MDS operations are specified in terms of a base ino and
19 * relative path. Thus, the client can specify an operation on a 19 * relative path. Thus, the client can specify an operation on a
20 * specific inode (e.g., a getattr due to fstat(2)), or as a path 20 * specific inode (e.g., a getattr due to fstat(2)), or as a path
21 * relative to, say, the root directory. 21 * relative to, say, the root directory.
22 * 22 *
23 * Normally, we limit ourselves to strict inode ops (no path component) 23 * Normally, we limit ourselves to strict inode ops (no path component)
24 * or dentry operations (a single path component relative to an ino). The 24 * or dentry operations (a single path component relative to an ino). The
25 * exception to this is open_root_dentry(), which will open the mount 25 * exception to this is open_root_dentry(), which will open the mount
26 * point by name. 26 * point by name.
27 */ 27 */
28 28
29 const struct inode_operations ceph_dir_iops; 29 const struct inode_operations ceph_dir_iops;
30 const struct file_operations ceph_dir_fops; 30 const struct file_operations ceph_dir_fops;
31 const struct dentry_operations ceph_dentry_ops; 31 const struct dentry_operations ceph_dentry_ops;
32 32
33 /* 33 /*
34 * Initialize ceph dentry state. 34 * Initialize ceph dentry state.
35 */ 35 */
36 int ceph_init_dentry(struct dentry *dentry) 36 int ceph_init_dentry(struct dentry *dentry)
37 { 37 {
38 struct ceph_dentry_info *di; 38 struct ceph_dentry_info *di;
39 39
40 if (dentry->d_fsdata) 40 if (dentry->d_fsdata)
41 return 0; 41 return 0;
42 42
43 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); 43 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
44 if (!di) 44 if (!di)
45 return -ENOMEM; /* oh well */ 45 return -ENOMEM; /* oh well */
46 46
47 spin_lock(&dentry->d_lock); 47 spin_lock(&dentry->d_lock);
48 if (dentry->d_fsdata) { 48 if (dentry->d_fsdata) {
49 /* lost a race */ 49 /* lost a race */
50 kmem_cache_free(ceph_dentry_cachep, di); 50 kmem_cache_free(ceph_dentry_cachep, di);
51 goto out_unlock; 51 goto out_unlock;
52 } 52 }
53 53
54 if (dentry->d_parent == NULL || /* nfs fh_to_dentry */ 54 if (dentry->d_parent == NULL || /* nfs fh_to_dentry */
55 ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) 55 ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
56 d_set_d_op(dentry, &ceph_dentry_ops); 56 d_set_d_op(dentry, &ceph_dentry_ops);
57 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) 57 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
58 d_set_d_op(dentry, &ceph_snapdir_dentry_ops); 58 d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
59 else 59 else
60 d_set_d_op(dentry, &ceph_snap_dentry_ops); 60 d_set_d_op(dentry, &ceph_snap_dentry_ops);
61 61
62 di->dentry = dentry; 62 di->dentry = dentry;
63 di->lease_session = NULL; 63 di->lease_session = NULL;
64 dentry->d_time = jiffies; 64 dentry->d_time = jiffies;
65 /* avoid reordering d_fsdata setup so that the check above is safe */ 65 /* avoid reordering d_fsdata setup so that the check above is safe */
66 smp_mb(); 66 smp_mb();
67 dentry->d_fsdata = di; 67 dentry->d_fsdata = di;
68 ceph_dentry_lru_add(dentry); 68 ceph_dentry_lru_add(dentry);
69 out_unlock: 69 out_unlock:
70 spin_unlock(&dentry->d_lock); 70 spin_unlock(&dentry->d_lock);
71 return 0; 71 return 0;
72 } 72 }
73 73
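ceph_init_dentry() allocates optimistically outside d_lock, re-checks d_fsdata under the lock, and frees its allocation if another thread won the race; the smp_mb() then orders the field stores before d_fsdata is published to lockless readers. A simplified userspace sketch of the lost-race half (the unlocked fast-path read mirrors the lockless d_fsdata check; production code would use an atomic load there, which is the concern the barrier addresses):

#include <pthread.h>
#include <stdlib.h>

struct priv { int initialized; };

static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
static struct priv *shared;

static int init_shared(void)
{
        struct priv *p;

        if (shared)
                return 0;               /* fast path: already set up */
        p = calloc(1, sizeof(*p));      /* allocate outside the lock */
        if (!p)
                return -1;
        pthread_mutex_lock(&init_lock);
        if (shared)
                free(p);                /* lost a race: discard ours */
        else
                shared = p;             /* publish */
        pthread_mutex_unlock(&init_lock);
        return 0;
}
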
74 struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) 74 struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry)
75 { 75 {
76 struct inode *inode = NULL; 76 struct inode *inode = NULL;
77 77
78 if (!dentry) 78 if (!dentry)
79 return NULL; 79 return NULL;
80 80
81 spin_lock(&dentry->d_lock); 81 spin_lock(&dentry->d_lock);
82 if (dentry->d_parent) { 82 if (dentry->d_parent) {
83 inode = dentry->d_parent->d_inode; 83 inode = dentry->d_parent->d_inode;
84 ihold(inode); 84 ihold(inode);
85 } 85 }
86 spin_unlock(&dentry->d_lock); 86 spin_unlock(&dentry->d_lock);
87 return inode; 87 return inode;
88 } 88 }
89 89
90 90
91 /* 91 /*
92 * for readdir, we encode the directory frag and offset within that 92 * for readdir, we encode the directory frag and offset within that
93 * frag into f_pos. 93 * frag into f_pos.
94 */ 94 */
95 static unsigned fpos_frag(loff_t p) 95 static unsigned fpos_frag(loff_t p)
96 { 96 {
97 return p >> 32; 97 return p >> 32;
98 } 98 }
99 static unsigned fpos_off(loff_t p) 99 static unsigned fpos_off(loff_t p)
100 { 100 {
101 return p & 0xffffffff; 101 return p & 0xffffffff;
102 } 102 }
103 103
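So a directory f_pos is simply (frag << 32) | off. The make_fpos() below is a stand-in for ceph_make_fpos(), which is used later in this file and builds the encoding these two helpers take apart; a quick round-trip check:

#include <assert.h>
#include <stdint.h>

static uint64_t make_fpos(unsigned frag, unsigned off)
{
        return ((uint64_t)frag << 32) | off;
}

int main(void)
{
        uint64_t pos = make_fpos(0xdeadbeef, 42);

        assert((unsigned)(pos >> 32) == 0xdeadbeef);    /* fpos_frag() */
        assert((unsigned)(pos & 0xffffffff) == 42);     /* fpos_off()  */
        return 0;
}
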
104 /* 104 /*
105 * When possible, we try to satisfy a readdir by peeking at the 105 * When possible, we try to satisfy a readdir by peeking at the
106 * dcache. We make this work by carefully ordering dentries on 106 * dcache. We make this work by carefully ordering dentries on
107 * d_u.d_child when we initially get results back from the MDS, and 107 * d_u.d_child when we initially get results back from the MDS, and
108 * falling back to a "normal" sync readdir if any dentries in the dir 108 * falling back to a "normal" sync readdir if any dentries in the dir
109 * are dropped. 109 * are dropped.
110 * 110 *
111 * D_COMPLETE indicates we have all dentries in the dir. It is 111 * D_COMPLETE indicates we have all dentries in the dir. It is
112 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by 112 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
113 * the MDS if/when the directory is modified). 113 * the MDS if/when the directory is modified).
114 */ 114 */
115 static int __dcache_readdir(struct file *filp, 115 static int __dcache_readdir(struct file *filp,
116 void *dirent, filldir_t filldir) 116 void *dirent, filldir_t filldir)
117 { 117 {
118 struct ceph_file_info *fi = filp->private_data; 118 struct ceph_file_info *fi = filp->private_data;
119 struct dentry *parent = filp->f_dentry; 119 struct dentry *parent = filp->f_dentry;
120 struct inode *dir = parent->d_inode; 120 struct inode *dir = parent->d_inode;
121 struct list_head *p; 121 struct list_head *p;
122 struct dentry *dentry, *last; 122 struct dentry *dentry, *last;
123 struct ceph_dentry_info *di; 123 struct ceph_dentry_info *di;
124 int err = 0; 124 int err = 0;
125 125
126 /* claim ref on last dentry we returned */ 126 /* claim ref on last dentry we returned */
127 last = fi->dentry; 127 last = fi->dentry;
128 fi->dentry = NULL; 128 fi->dentry = NULL;
129 129
130 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos, 130 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
131 last); 131 last);
132 132
133 spin_lock(&parent->d_lock); 133 spin_lock(&parent->d_lock);
134 134
135 /* start at beginning? */ 135 /* start at beginning? */
136 if (filp->f_pos == 2 || last == NULL || 136 if (filp->f_pos == 2 || last == NULL ||
137 filp->f_pos < ceph_dentry(last)->offset) { 137 filp->f_pos < ceph_dentry(last)->offset) {
138 if (list_empty(&parent->d_subdirs)) 138 if (list_empty(&parent->d_subdirs))
139 goto out_unlock; 139 goto out_unlock;
140 p = parent->d_subdirs.prev; 140 p = parent->d_subdirs.prev;
141 dout(" initial p %p/%p\n", p->prev, p->next); 141 dout(" initial p %p/%p\n", p->prev, p->next);
142 } else { 142 } else {
143 p = last->d_u.d_child.prev; 143 p = last->d_u.d_child.prev;
144 } 144 }
145 145
146 more: 146 more:
147 dentry = list_entry(p, struct dentry, d_u.d_child); 147 dentry = list_entry(p, struct dentry, d_u.d_child);
148 di = ceph_dentry(dentry); 148 di = ceph_dentry(dentry);
149 while (1) { 149 while (1) {
150 dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next, 150 dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
151 d_unhashed(dentry) ? "!hashed" : "hashed", 151 d_unhashed(dentry) ? "!hashed" : "hashed",
152 parent->d_subdirs.prev, parent->d_subdirs.next); 152 parent->d_subdirs.prev, parent->d_subdirs.next);
153 if (p == &parent->d_subdirs) { 153 if (p == &parent->d_subdirs) {
154 fi->flags |= CEPH_F_ATEND; 154 fi->flags |= CEPH_F_ATEND;
155 goto out_unlock; 155 goto out_unlock;
156 } 156 }
157 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); 157 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
158 if (!d_unhashed(dentry) && dentry->d_inode && 158 if (!d_unhashed(dentry) && dentry->d_inode &&
159 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && 159 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
160 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && 160 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
161 filp->f_pos <= di->offset) 161 filp->f_pos <= di->offset)
162 break; 162 break;
163 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry, 163 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
164 dentry->d_name.len, dentry->d_name.name, di->offset, 164 dentry->d_name.len, dentry->d_name.name, di->offset,
165 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "", 165 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
166 !dentry->d_inode ? " null" : ""); 166 !dentry->d_inode ? " null" : "");
167 spin_unlock(&dentry->d_lock); 167 spin_unlock(&dentry->d_lock);
168 p = p->prev; 168 p = p->prev;
169 dentry = list_entry(p, struct dentry, d_u.d_child); 169 dentry = list_entry(p, struct dentry, d_u.d_child);
170 di = ceph_dentry(dentry); 170 di = ceph_dentry(dentry);
171 } 171 }
172 172
173 dget_dlock(dentry); 173 dget_dlock(dentry);
174 spin_unlock(&dentry->d_lock); 174 spin_unlock(&dentry->d_lock);
175 spin_unlock(&parent->d_lock); 175 spin_unlock(&parent->d_lock);
176 176
177 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, 177 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
178 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 178 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
179 filp->f_pos = di->offset; 179 filp->f_pos = di->offset;
180 err = filldir(dirent, dentry->d_name.name, 180 err = filldir(dirent, dentry->d_name.name,
181 dentry->d_name.len, di->offset, 181 dentry->d_name.len, di->offset,
182 ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), 182 ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
183 dentry->d_inode->i_mode >> 12); 183 dentry->d_inode->i_mode >> 12);
184 184
185 if (last) { 185 if (last) {
186 if (err < 0) { 186 if (err < 0) {
187 /* remember our position */ 187 /* remember our position */
188 fi->dentry = last; 188 fi->dentry = last;
189 fi->next_offset = di->offset; 189 fi->next_offset = di->offset;
190 } else { 190 } else {
191 dput(last); 191 dput(last);
192 } 192 }
193 } 193 }
194 last = dentry; 194 last = dentry;
195 195
196 if (err < 0) 196 if (err < 0)
197 goto out; 197 goto out;
198 198
199 filp->f_pos++; 199 filp->f_pos++;
200 200
201 /* make sure a dentry wasn't dropped while we didn't have parent lock */ 201 /* make sure a dentry wasn't dropped while we didn't have parent lock */
202 if (!ceph_dir_test_complete(dir)) { 202 if (!ceph_dir_test_complete(dir)) {
203 dout(" lost D_COMPLETE on %p; falling back to mds\n", dir); 203 dout(" lost D_COMPLETE on %p; falling back to mds\n", dir);
204 err = -EAGAIN; 204 err = -EAGAIN;
205 goto out; 205 goto out;
206 } 206 }
207 207
208 spin_lock(&parent->d_lock); 208 spin_lock(&parent->d_lock);
209 p = p->prev; /* advance to next dentry */ 209 p = p->prev; /* advance to next dentry */
210 goto more; 210 goto more;
211 211
212 out_unlock: 212 out_unlock:
213 spin_unlock(&parent->d_lock); 213 spin_unlock(&parent->d_lock);
214 out: 214 out:
215 if (last) 215 if (last)
216 dput(last); 216 dput(last);
217 return err; 217 return err;
218 } 218 }
219 219
220 /* 220 /*
221 * make note of the last dentry we read, so we can 221 * make note of the last dentry we read, so we can
222 * continue at the same lexicographical point, 222 * continue at the same lexicographical point,
223 * regardless of what dir changes take place on the 223 * regardless of what dir changes take place on the
224 * server. 224 * server.
225 */ 225 */
226 static int note_last_dentry(struct ceph_file_info *fi, const char *name, 226 static int note_last_dentry(struct ceph_file_info *fi, const char *name,
227 int len) 227 int len)
228 { 228 {
229 kfree(fi->last_name); 229 kfree(fi->last_name);
230 fi->last_name = kmalloc(len+1, GFP_NOFS); 230 fi->last_name = kmalloc(len+1, GFP_NOFS);
231 if (!fi->last_name) 231 if (!fi->last_name)
232 return -ENOMEM; 232 return -ENOMEM;
233 memcpy(fi->last_name, name, len); 233 memcpy(fi->last_name, name, len);
234 fi->last_name[len] = 0; 234 fi->last_name[len] = 0;
235 dout("note_last_dentry '%s'\n", fi->last_name); 235 dout("note_last_dentry '%s'\n", fi->last_name);
236 return 0; 236 return 0;
237 } 237 }
238 238
239 static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) 239 static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
240 { 240 {
241 struct ceph_file_info *fi = filp->private_data; 241 struct ceph_file_info *fi = filp->private_data;
242 struct inode *inode = filp->f_dentry->d_inode; 242 struct inode *inode = filp->f_dentry->d_inode;
243 struct ceph_inode_info *ci = ceph_inode(inode); 243 struct ceph_inode_info *ci = ceph_inode(inode);
244 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 244 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
245 struct ceph_mds_client *mdsc = fsc->mdsc; 245 struct ceph_mds_client *mdsc = fsc->mdsc;
246 unsigned frag = fpos_frag(filp->f_pos); 246 unsigned frag = fpos_frag(filp->f_pos);
247 int off = fpos_off(filp->f_pos); 247 int off = fpos_off(filp->f_pos);
248 int err; 248 int err;
249 u32 ftype; 249 u32 ftype;
250 struct ceph_mds_reply_info_parsed *rinfo; 250 struct ceph_mds_reply_info_parsed *rinfo;
251 const int max_entries = fsc->mount_options->max_readdir; 251 const int max_entries = fsc->mount_options->max_readdir;
252 const int max_bytes = fsc->mount_options->max_readdir_bytes; 252 const int max_bytes = fsc->mount_options->max_readdir_bytes;
253 253
254 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); 254 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
255 if (fi->flags & CEPH_F_ATEND) 255 if (fi->flags & CEPH_F_ATEND)
256 return 0; 256 return 0;
257 257
258 /* always start with . and .. */ 258 /* always start with . and .. */
259 if (filp->f_pos == 0) { 259 if (filp->f_pos == 0) {
260 /* note dir version at start of readdir so we can tell 260 /* note dir version at start of readdir so we can tell
261 * if any dentries get dropped */ 261 * if any dentries get dropped */
262 fi->dir_release_count = ci->i_release_count; 262 fi->dir_release_count = ci->i_release_count;
263 263
264 dout("readdir off 0 -> '.'\n"); 264 dout("readdir off 0 -> '.'\n");
265 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), 265 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
266 ceph_translate_ino(inode->i_sb, inode->i_ino), 266 ceph_translate_ino(inode->i_sb, inode->i_ino),
267 inode->i_mode >> 12) < 0) 267 inode->i_mode >> 12) < 0)
268 return 0; 268 return 0;
269 filp->f_pos = 1; 269 filp->f_pos = 1;
270 off = 1; 270 off = 1;
271 } 271 }
272 if (filp->f_pos == 1) { 272 if (filp->f_pos == 1) {
273 ino_t ino = parent_ino(filp->f_dentry); 273 ino_t ino = parent_ino(filp->f_dentry);
274 dout("readdir off 1 -> '..'\n"); 274 dout("readdir off 1 -> '..'\n");
275 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), 275 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
276 ceph_translate_ino(inode->i_sb, ino), 276 ceph_translate_ino(inode->i_sb, ino),
277 inode->i_mode >> 12) < 0) 277 inode->i_mode >> 12) < 0)
278 return 0; 278 return 0;
279 filp->f_pos = 2; 279 filp->f_pos = 2;
280 off = 2; 280 off = 2;
281 } 281 }
282 282
283 /* can we use the dcache? */ 283 /* can we use the dcache? */
284 spin_lock(&ci->i_ceph_lock); 284 spin_lock(&ci->i_ceph_lock);
285 if ((filp->f_pos == 2 || fi->dentry) && 285 if ((filp->f_pos == 2 || fi->dentry) &&
286 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && 286 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
287 ceph_snap(inode) != CEPH_SNAPDIR && 287 ceph_snap(inode) != CEPH_SNAPDIR &&
288 ceph_dir_test_complete(inode) && 288 ceph_dir_test_complete(inode) &&
289 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 289 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
290 spin_unlock(&ci->i_ceph_lock); 290 spin_unlock(&ci->i_ceph_lock);
291 err = __dcache_readdir(filp, dirent, filldir); 291 err = __dcache_readdir(filp, dirent, filldir);
292 if (err != -EAGAIN) 292 if (err != -EAGAIN)
293 return err; 293 return err;
294 } else { 294 } else {
295 spin_unlock(&ci->i_ceph_lock); 295 spin_unlock(&ci->i_ceph_lock);
296 } 296 }
297 if (fi->dentry) { 297 if (fi->dentry) {
298 err = note_last_dentry(fi, fi->dentry->d_name.name, 298 err = note_last_dentry(fi, fi->dentry->d_name.name,
299 fi->dentry->d_name.len); 299 fi->dentry->d_name.len);
300 if (err) 300 if (err)
301 return err; 301 return err;
302 dput(fi->dentry); 302 dput(fi->dentry);
303 fi->dentry = NULL; 303 fi->dentry = NULL;
304 } 304 }
305 305
306 /* proceed with a normal readdir */ 306 /* proceed with a normal readdir */
307 307
308 more: 308 more:
309 /* do we have the correct frag content buffered? */ 309 /* do we have the correct frag content buffered? */
310 if (fi->frag != frag || fi->last_readdir == NULL) { 310 if (fi->frag != frag || fi->last_readdir == NULL) {
311 struct ceph_mds_request *req; 311 struct ceph_mds_request *req;
312 int op = ceph_snap(inode) == CEPH_SNAPDIR ? 312 int op = ceph_snap(inode) == CEPH_SNAPDIR ?
313 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; 313 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
314 314
315 /* discard old result, if any */ 315 /* discard old result, if any */
316 if (fi->last_readdir) { 316 if (fi->last_readdir) {
317 ceph_mdsc_put_request(fi->last_readdir); 317 ceph_mdsc_put_request(fi->last_readdir);
318 fi->last_readdir = NULL; 318 fi->last_readdir = NULL;
319 } 319 }
320 320
321 /* requery frag tree, as the frag topology may have changed */ 321 /* requery frag tree, as the frag topology may have changed */
322 frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL); 322 frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
323 323
324 dout("readdir fetching %llx.%llx frag %x offset '%s'\n", 324 dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
325 ceph_vinop(inode), frag, fi->last_name); 325 ceph_vinop(inode), frag, fi->last_name);
326 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 326 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
327 if (IS_ERR(req)) 327 if (IS_ERR(req))
328 return PTR_ERR(req); 328 return PTR_ERR(req);
329 req->r_inode = inode; 329 req->r_inode = inode;
330 ihold(inode); 330 ihold(inode);
331 req->r_dentry = dget(filp->f_dentry); 331 req->r_dentry = dget(filp->f_dentry);
332 /* hints to request -> mds selection code */ 332 /* hints to request -> mds selection code */
333 req->r_direct_mode = USE_AUTH_MDS; 333 req->r_direct_mode = USE_AUTH_MDS;
334 req->r_direct_hash = ceph_frag_value(frag); 334 req->r_direct_hash = ceph_frag_value(frag);
335 req->r_direct_is_hash = true; 335 req->r_direct_is_hash = true;
336 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); 336 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
337 req->r_readdir_offset = fi->next_offset; 337 req->r_readdir_offset = fi->next_offset;
338 req->r_args.readdir.frag = cpu_to_le32(frag); 338 req->r_args.readdir.frag = cpu_to_le32(frag);
339 req->r_args.readdir.max_entries = cpu_to_le32(max_entries); 339 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
340 req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes); 340 req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
341 req->r_num_caps = max_entries + 1; 341 req->r_num_caps = max_entries + 1;
342 err = ceph_mdsc_do_request(mdsc, NULL, req); 342 err = ceph_mdsc_do_request(mdsc, NULL, req);
343 if (err < 0) { 343 if (err < 0) {
344 ceph_mdsc_put_request(req); 344 ceph_mdsc_put_request(req);
345 return err; 345 return err;
346 } 346 }
347 dout("readdir got and parsed readdir result=%d" 347 dout("readdir got and parsed readdir result=%d"
348 " on frag %x, end=%d, complete=%d\n", err, frag, 348 " on frag %x, end=%d, complete=%d\n", err, frag,
349 (int)req->r_reply_info.dir_end, 349 (int)req->r_reply_info.dir_end,
350 (int)req->r_reply_info.dir_complete); 350 (int)req->r_reply_info.dir_complete);
351 351
352 if (!req->r_did_prepopulate) { 352 if (!req->r_did_prepopulate) {
353 dout("readdir !did_prepopulate"); 353 dout("readdir !did_prepopulate");
354 fi->dir_release_count--; /* preclude D_COMPLETE */ 354 fi->dir_release_count--; /* preclude D_COMPLETE */
355 } 355 }
356 356
357 /* note next offset and last dentry name */ 357 /* note next offset and last dentry name */
358 fi->offset = fi->next_offset; 358 fi->offset = fi->next_offset;
359 fi->last_readdir = req; 359 fi->last_readdir = req;
360 360
361 if (req->r_reply_info.dir_end) { 361 if (req->r_reply_info.dir_end) {
362 kfree(fi->last_name); 362 kfree(fi->last_name);
363 fi->last_name = NULL; 363 fi->last_name = NULL;
364 if (ceph_frag_is_rightmost(frag)) 364 if (ceph_frag_is_rightmost(frag))
365 fi->next_offset = 2; 365 fi->next_offset = 2;
366 else 366 else
367 fi->next_offset = 0; 367 fi->next_offset = 0;
368 } else { 368 } else {
369 rinfo = &req->r_reply_info; 369 rinfo = &req->r_reply_info;
370 err = note_last_dentry(fi, 370 err = note_last_dentry(fi,
371 rinfo->dir_dname[rinfo->dir_nr-1], 371 rinfo->dir_dname[rinfo->dir_nr-1],
372 rinfo->dir_dname_len[rinfo->dir_nr-1]); 372 rinfo->dir_dname_len[rinfo->dir_nr-1]);
373 if (err) 373 if (err)
374 return err; 374 return err;
375 fi->next_offset += rinfo->dir_nr; 375 fi->next_offset += rinfo->dir_nr;
376 } 376 }
377 } 377 }
378 378
379 rinfo = &fi->last_readdir->r_reply_info; 379 rinfo = &fi->last_readdir->r_reply_info;
380 dout("readdir frag %x num %d off %d chunkoff %d\n", frag, 380 dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
381 rinfo->dir_nr, off, fi->offset); 381 rinfo->dir_nr, off, fi->offset);
382 while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { 382 while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
383 u64 pos = ceph_make_fpos(frag, off); 383 u64 pos = ceph_make_fpos(frag, off);
384 struct ceph_mds_reply_inode *in = 384 struct ceph_mds_reply_inode *in =
385 rinfo->dir_in[off - fi->offset].in; 385 rinfo->dir_in[off - fi->offset].in;
386 struct ceph_vino vino; 386 struct ceph_vino vino;
387 ino_t ino; 387 ino_t ino;
388 388
389 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", 389 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
390 off, off - fi->offset, rinfo->dir_nr, pos, 390 off, off - fi->offset, rinfo->dir_nr, pos,
391 rinfo->dir_dname_len[off - fi->offset], 391 rinfo->dir_dname_len[off - fi->offset],
392 rinfo->dir_dname[off - fi->offset], in); 392 rinfo->dir_dname[off - fi->offset], in);
393 BUG_ON(!in); 393 BUG_ON(!in);
394 ftype = le32_to_cpu(in->mode) >> 12; 394 ftype = le32_to_cpu(in->mode) >> 12;
395 vino.ino = le64_to_cpu(in->ino); 395 vino.ino = le64_to_cpu(in->ino);
396 vino.snap = le64_to_cpu(in->snapid); 396 vino.snap = le64_to_cpu(in->snapid);
397 ino = ceph_vino_to_ino(vino); 397 ino = ceph_vino_to_ino(vino);
398 if (filldir(dirent, 398 if (filldir(dirent,
399 rinfo->dir_dname[off - fi->offset], 399 rinfo->dir_dname[off - fi->offset],
400 rinfo->dir_dname_len[off - fi->offset], 400 rinfo->dir_dname_len[off - fi->offset],
401 pos, 401 pos,
402 ceph_translate_ino(inode->i_sb, ino), ftype) < 0) { 402 ceph_translate_ino(inode->i_sb, ino), ftype) < 0) {
403 dout("filldir stopping us...\n"); 403 dout("filldir stopping us...\n");
404 return 0; 404 return 0;
405 } 405 }
406 off++; 406 off++;
407 filp->f_pos = pos + 1; 407 filp->f_pos = pos + 1;
408 } 408 }
409 409
410 if (fi->last_name) { 410 if (fi->last_name) {
411 ceph_mdsc_put_request(fi->last_readdir); 411 ceph_mdsc_put_request(fi->last_readdir);
412 fi->last_readdir = NULL; 412 fi->last_readdir = NULL;
413 goto more; 413 goto more;
414 } 414 }
415 415
416 /* more frags? */ 416 /* more frags? */
417 if (!ceph_frag_is_rightmost(frag)) { 417 if (!ceph_frag_is_rightmost(frag)) {
418 frag = ceph_frag_next(frag); 418 frag = ceph_frag_next(frag);
419 off = 0; 419 off = 0;
420 filp->f_pos = ceph_make_fpos(frag, off); 420 filp->f_pos = ceph_make_fpos(frag, off);
421 dout("readdir next frag is %x\n", frag); 421 dout("readdir next frag is %x\n", frag);
422 goto more; 422 goto more;
423 } 423 }
424 fi->flags |= CEPH_F_ATEND; 424 fi->flags |= CEPH_F_ATEND;
425 425
426 /* 426 /*
427 * if dir_release_count still matches the dir, no dentries 427 * if dir_release_count still matches the dir, no dentries
428 * were released during the whole readdir, and we should have 428 * were released during the whole readdir, and we should have
429 * the complete dir contents in our cache. 429 * the complete dir contents in our cache.
430 */ 430 */
431 spin_lock(&ci->i_ceph_lock); 431 spin_lock(&ci->i_ceph_lock);
432 if (ci->i_release_count == fi->dir_release_count) { 432 if (ci->i_release_count == fi->dir_release_count) {
433 ceph_dir_set_complete(inode); 433 ceph_dir_set_complete(inode);
434 ci->i_max_offset = filp->f_pos; 434 ci->i_max_offset = filp->f_pos;
435 } 435 }
436 spin_unlock(&ci->i_ceph_lock); 436 spin_unlock(&ci->i_ceph_lock);
437 437
438 dout("readdir %p filp %p done.\n", inode, filp); 438 dout("readdir %p filp %p done.\n", inode, filp);
439 return 0; 439 return 0;
440 } 440 }
441 441
442 static void reset_readdir(struct ceph_file_info *fi) 442 static void reset_readdir(struct ceph_file_info *fi)
443 { 443 {
444 if (fi->last_readdir) { 444 if (fi->last_readdir) {
445 ceph_mdsc_put_request(fi->last_readdir); 445 ceph_mdsc_put_request(fi->last_readdir);
446 fi->last_readdir = NULL; 446 fi->last_readdir = NULL;
447 } 447 }
448 kfree(fi->last_name); 448 kfree(fi->last_name);
449 fi->last_name = NULL; 449 fi->last_name = NULL;
450 fi->next_offset = 2; /* compensate for . and .. */ 450 fi->next_offset = 2; /* compensate for . and .. */
451 if (fi->dentry) { 451 if (fi->dentry) {
452 dput(fi->dentry); 452 dput(fi->dentry);
453 fi->dentry = NULL; 453 fi->dentry = NULL;
454 } 454 }
455 fi->flags &= ~CEPH_F_ATEND; 455 fi->flags &= ~CEPH_F_ATEND;
456 } 456 }
457 457
458 static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) 458 static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
459 { 459 {
460 struct ceph_file_info *fi = file->private_data; 460 struct ceph_file_info *fi = file->private_data;
461 struct inode *inode = file->f_mapping->host; 461 struct inode *inode = file->f_mapping->host;
462 loff_t old_offset = offset; 462 loff_t old_offset = offset;
463 loff_t retval; 463 loff_t retval;
464 464
465 mutex_lock(&inode->i_mutex); 465 mutex_lock(&inode->i_mutex);
466 retval = -EINVAL; 466 retval = -EINVAL;
467 switch (origin) { 467 switch (origin) {
468 case SEEK_END: 468 case SEEK_END:
469 offset += inode->i_size + 2; /* FIXME */ 469 offset += inode->i_size + 2; /* FIXME */
470 break; 470 break;
471 case SEEK_CUR: 471 case SEEK_CUR:
472 offset += file->f_pos; 472 offset += file->f_pos;
473 case SEEK_SET: 473 case SEEK_SET:
474 break; 474 break;
475 default: 475 default:
476 goto out; 476 goto out;
477 } 477 }
478 478
479 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { 479 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
480 if (offset != file->f_pos) { 480 if (offset != file->f_pos) {
481 file->f_pos = offset; 481 file->f_pos = offset;
482 file->f_version = 0; 482 file->f_version = 0;
483 fi->flags &= ~CEPH_F_ATEND; 483 fi->flags &= ~CEPH_F_ATEND;
484 } 484 }
485 retval = offset; 485 retval = offset;
486 486
487 /* 487 /*
488 * discard buffered readdir content on seekdir(0), or 488 * discard buffered readdir content on seekdir(0), or
489 * seek to new frag, or seek prior to current chunk. 489 * seek to new frag, or seek prior to current chunk.
490 */ 490 */
491 if (offset == 0 || 491 if (offset == 0 ||
492 fpos_frag(offset) != fpos_frag(old_offset) || 492 fpos_frag(offset) != fpos_frag(old_offset) ||
493 fpos_off(offset) < fi->offset) { 493 fpos_off(offset) < fi->offset) {
494 dout("dir_llseek dropping %p content\n", file); 494 dout("dir_llseek dropping %p content\n", file);
495 reset_readdir(fi); 495 reset_readdir(fi);
496 } 496 }
497 497
498 /* invalidate dir_release_count if we did a forward seek */ 498 /* invalidate dir_release_count if we did a forward seek */
499 if (offset > old_offset) 499 if (offset > old_offset)
500 fi->dir_release_count--; 500 fi->dir_release_count--;
501 } 501 }
502 out: 502 out:
503 mutex_unlock(&inode->i_mutex); 503 mutex_unlock(&inode->i_mutex);
504 return retval; 504 return retval;
505 } 505 }
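
As a point of reference for the fpos_frag()/fpos_off() calls above (these helpers live in fs/ceph/super.h, not in this diff): a ceph directory f_pos packs the fragment id into the high 32 bits and the offset within that fragment into the low 32 bits, roughly:

	static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
	{
		return ((loff_t)frag << 32) | (loff_t)off; /* frag high, offset low */
	}

	static inline unsigned fpos_frag(loff_t p)
	{
		return p >> 32;			/* which directory fragment */
	}

	static inline unsigned fpos_off(loff_t p)
	{
		return p & 0xffffffff;		/* position within that fragment */
	}

This is why a seek that changes fpos_frag() must drop the buffered readdir state: the cached chunk belongs to a single fragment.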
506 506
507 /* 507 /*
508 * Handle lookups for the hidden .snap directory. 508 * Handle lookups for the hidden .snap directory.
509 */ 509 */
510 int ceph_handle_snapdir(struct ceph_mds_request *req, 510 int ceph_handle_snapdir(struct ceph_mds_request *req,
511 struct dentry *dentry, int err) 511 struct dentry *dentry, int err)
512 { 512 {
513 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 513 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
514 struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */ 514 struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */
515 515
516 /* .snap dir? */ 516 /* .snap dir? */
517 if (err == -ENOENT && 517 if (err == -ENOENT &&
518 ceph_snap(parent) == CEPH_NOSNAP && 518 ceph_snap(parent) == CEPH_NOSNAP &&
519 strcmp(dentry->d_name.name, 519 strcmp(dentry->d_name.name,
520 fsc->mount_options->snapdir_name) == 0) { 520 fsc->mount_options->snapdir_name) == 0) {
521 struct inode *inode = ceph_get_snapdir(parent); 521 struct inode *inode = ceph_get_snapdir(parent);
522 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", 522 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
523 dentry, dentry->d_name.len, dentry->d_name.name, inode); 523 dentry, dentry->d_name.len, dentry->d_name.name, inode);
524 BUG_ON(!d_unhashed(dentry)); 524 BUG_ON(!d_unhashed(dentry));
525 d_add(dentry, inode); 525 d_add(dentry, inode);
526 err = 0; 526 err = 0;
527 } 527 }
528 return err; 528 return err;
529 } 529 }
530 530
531 /* 531 /*
532 * Figure out final result of a lookup/open request. 532 * Figure out final result of a lookup/open request.
533 * 533 *
534 * Mainly, make sure we return the final req->r_dentry (if it already 534 * Mainly, make sure we return the final req->r_dentry (if it already
535 * existed) in place of the original VFS-provided dentry when they 535 * existed) in place of the original VFS-provided dentry when they
536 * differ. 536 * differ.
537 * 537 *
538 * Gracefully handle the case where the MDS replies with -ENOENT and 538 * Gracefully handle the case where the MDS replies with -ENOENT and
539 * no trace (which it may do, at its discretion, e.g., if it doesn't 539 * no trace (which it may do, at its discretion, e.g., if it doesn't
540 * care to issue a lease on the negative dentry). 540 * care to issue a lease on the negative dentry).
541 */ 541 */
542 struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, 542 struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
543 struct dentry *dentry, int err) 543 struct dentry *dentry, int err)
544 { 544 {
545 if (err == -ENOENT) { 545 if (err == -ENOENT) {
546 /* no trace? */ 546 /* no trace? */
547 err = 0; 547 err = 0;
548 if (!req->r_reply_info.head->is_dentry) { 548 if (!req->r_reply_info.head->is_dentry) {
549 dout("ENOENT and no trace, dentry %p inode %p\n", 549 dout("ENOENT and no trace, dentry %p inode %p\n",
550 dentry, dentry->d_inode); 550 dentry, dentry->d_inode);
551 if (dentry->d_inode) { 551 if (dentry->d_inode) {
552 d_drop(dentry); 552 d_drop(dentry);
553 err = -ENOENT; 553 err = -ENOENT;
554 } else { 554 } else {
555 d_add(dentry, NULL); 555 d_add(dentry, NULL);
556 } 556 }
557 } 557 }
558 } 558 }
559 if (err) 559 if (err)
560 dentry = ERR_PTR(err); 560 dentry = ERR_PTR(err);
561 else if (dentry != req->r_dentry) 561 else if (dentry != req->r_dentry)
562 dentry = dget(req->r_dentry); /* we got spliced */ 562 dentry = dget(req->r_dentry); /* we got spliced */
563 else 563 else
564 dentry = NULL; 564 dentry = NULL;
565 return dentry; 565 return dentry;
566 } 566 }
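
For orientation, the three-way return above follows the VFS ->lookup() contract. A hedged sketch of how the result is consumed (illustrative only, not code from this diff):

	struct dentry *res = ceph_finish_lookup(req, dentry, err);

	if (IS_ERR(res))
		return res;	/* hard error, propagated as an ERR_PTR */
	if (res)
		dentry = res;	/* spliced: a pre-existing dentry was found */
	/* res == NULL: the dentry we passed in was used as-is */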
567 567
568 static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) 568 static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
569 { 569 {
570 return ceph_ino(inode) == CEPH_INO_ROOT && 570 return ceph_ino(inode) == CEPH_INO_ROOT &&
571 strncmp(dentry->d_name.name, ".ceph", 5) == 0; 571 strncmp(dentry->d_name.name, ".ceph", 5) == 0;
572 } 572 }
573 573
574 /* 574 /*
575 * Look up a single dir entry. If there is a lookup intent, inform 575 * Look up a single dir entry. If there is a lookup intent, inform
576 * the MDS so that it gets our 'caps wanted' value in a single op. 576 * the MDS so that it gets our 'caps wanted' value in a single op.
577 */ 577 */
578 static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, 578 static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
579 struct nameidata *nd) 579 struct nameidata *nd)
580 { 580 {
581 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 581 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
582 struct ceph_mds_client *mdsc = fsc->mdsc; 582 struct ceph_mds_client *mdsc = fsc->mdsc;
583 struct ceph_mds_request *req; 583 struct ceph_mds_request *req;
584 int op; 584 int op;
585 int err; 585 int err;
586 586
587 dout("lookup %p dentry %p '%.*s'\n", 587 dout("lookup %p dentry %p '%.*s'\n",
588 dir, dentry, dentry->d_name.len, dentry->d_name.name); 588 dir, dentry, dentry->d_name.len, dentry->d_name.name);
589 589
590 if (dentry->d_name.len > NAME_MAX) 590 if (dentry->d_name.len > NAME_MAX)
591 return ERR_PTR(-ENAMETOOLONG); 591 return ERR_PTR(-ENAMETOOLONG);
592 592
593 err = ceph_init_dentry(dentry); 593 err = ceph_init_dentry(dentry);
594 if (err < 0) 594 if (err < 0)
595 return ERR_PTR(err); 595 return ERR_PTR(err);
596 596
597 /* open (but not create!) intent? */ 597 /* open (but not create!) intent? */
598 if (nd && 598 if (nd &&
599 (nd->flags & LOOKUP_OPEN) && 599 (nd->flags & LOOKUP_OPEN) &&
600 !(nd->intent.open.flags & O_CREAT)) { 600 !(nd->intent.open.flags & O_CREAT)) {
601 int mode = nd->intent.open.create_mode & ~current->fs->umask; 601 int mode = nd->intent.open.create_mode & ~current->fs->umask;
602 return ceph_lookup_open(dir, dentry, nd, mode, 1); 602 return ceph_lookup_open(dir, dentry, nd, mode, 1);
603 } 603 }
604 604
605 /* can we conclude ENOENT locally? */ 605 /* can we conclude ENOENT locally? */
606 if (dentry->d_inode == NULL) { 606 if (dentry->d_inode == NULL) {
607 struct ceph_inode_info *ci = ceph_inode(dir); 607 struct ceph_inode_info *ci = ceph_inode(dir);
608 struct ceph_dentry_info *di = ceph_dentry(dentry); 608 struct ceph_dentry_info *di = ceph_dentry(dentry);
609 609
610 spin_lock(&ci->i_ceph_lock); 610 spin_lock(&ci->i_ceph_lock);
611 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); 611 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
612 if (strncmp(dentry->d_name.name, 612 if (strncmp(dentry->d_name.name,
613 fsc->mount_options->snapdir_name, 613 fsc->mount_options->snapdir_name,
614 dentry->d_name.len) && 614 dentry->d_name.len) &&
615 !is_root_ceph_dentry(dir, dentry) && 615 !is_root_ceph_dentry(dir, dentry) &&
616 ceph_dir_test_complete(dir) && 616 ceph_dir_test_complete(dir) &&
617 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { 617 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
618 spin_unlock(&ci->i_ceph_lock); 618 spin_unlock(&ci->i_ceph_lock);
619 dout(" dir %p complete, -ENOENT\n", dir); 619 dout(" dir %p complete, -ENOENT\n", dir);
620 d_add(dentry, NULL); 620 d_add(dentry, NULL);
621 di->lease_shared_gen = ci->i_shared_gen; 621 di->lease_shared_gen = ci->i_shared_gen;
622 return NULL; 622 return NULL;
623 } 623 }
624 spin_unlock(&ci->i_ceph_lock); 624 spin_unlock(&ci->i_ceph_lock);
625 } 625 }
626 626
627 op = ceph_snap(dir) == CEPH_SNAPDIR ? 627 op = ceph_snap(dir) == CEPH_SNAPDIR ?
628 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; 628 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
629 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); 629 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
630 if (IS_ERR(req)) 630 if (IS_ERR(req))
631 return ERR_CAST(req); 631 return ERR_CAST(req);
632 req->r_dentry = dget(dentry); 632 req->r_dentry = dget(dentry);
633 req->r_num_caps = 2; 633 req->r_num_caps = 2;
634 /* we only need inode linkage */ 634 /* we only need inode linkage */
635 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); 635 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
636 req->r_locked_dir = dir; 636 req->r_locked_dir = dir;
637 err = ceph_mdsc_do_request(mdsc, NULL, req); 637 err = ceph_mdsc_do_request(mdsc, NULL, req);
638 err = ceph_handle_snapdir(req, dentry, err); 638 err = ceph_handle_snapdir(req, dentry, err);
639 dentry = ceph_finish_lookup(req, dentry, err); 639 dentry = ceph_finish_lookup(req, dentry, err);
640 ceph_mdsc_put_request(req); /* will dput(dentry) */ 640 ceph_mdsc_put_request(req); /* will dput(dentry) */
641 dout("lookup result=%p\n", dentry); 641 dout("lookup result=%p\n", dentry);
642 return dentry; 642 return dentry;
643 } 643 }
644 644
645 /* 645 /*
646 * If we do a create but get no trace back from the MDS, follow up with 646 * If we do a create but get no trace back from the MDS, follow up with
647 * a lookup (the VFS expects us to link up the provided dentry). 647 * a lookup (the VFS expects us to link up the provided dentry).
648 */ 648 */
649 int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) 649 int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
650 { 650 {
651 struct dentry *result = ceph_lookup(dir, dentry, NULL); 651 struct dentry *result = ceph_lookup(dir, dentry, NULL);
652 652
653 if (result && !IS_ERR(result)) { 653 if (result && !IS_ERR(result)) {
654 /* 654 /*
655 * We created the item, then did a lookup, and found 655 * We created the item, then did a lookup, and found
656 * it was already linked to another inode we already 656 * it was already linked to another inode we already
657 * had in our cache (and thus got spliced). Link our 657 * had in our cache (and thus got spliced). Link our
658 * dentry to that inode, but don't hash it, just in 658 * dentry to that inode, but don't hash it, just in
659 * case the VFS wants to dereference it. 659 * case the VFS wants to dereference it.
660 */ 660 */
661 BUG_ON(!result->d_inode); 661 BUG_ON(!result->d_inode);
662 d_instantiate(dentry, result->d_inode); 662 d_instantiate(dentry, result->d_inode);
663 return 0; 663 return 0;
664 } 664 }
665 return PTR_ERR(result); 665 return PTR_ERR(result);
666 } 666 }
667 667
668 static int ceph_mknod(struct inode *dir, struct dentry *dentry, 668 static int ceph_mknod(struct inode *dir, struct dentry *dentry,
669 umode_t mode, dev_t rdev) 669 umode_t mode, dev_t rdev)
670 { 670 {
671 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 671 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
672 struct ceph_mds_client *mdsc = fsc->mdsc; 672 struct ceph_mds_client *mdsc = fsc->mdsc;
673 struct ceph_mds_request *req; 673 struct ceph_mds_request *req;
674 int err; 674 int err;
675 675
676 if (ceph_snap(dir) != CEPH_NOSNAP) 676 if (ceph_snap(dir) != CEPH_NOSNAP)
677 return -EROFS; 677 return -EROFS;
678 678
679 dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n", 679 dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
680 dir, dentry, mode, rdev); 680 dir, dentry, mode, rdev);
681 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS); 681 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
682 if (IS_ERR(req)) { 682 if (IS_ERR(req)) {
683 d_drop(dentry); 683 d_drop(dentry);
684 return PTR_ERR(req); 684 return PTR_ERR(req);
685 } 685 }
686 req->r_dentry = dget(dentry); 686 req->r_dentry = dget(dentry);
687 req->r_num_caps = 2; 687 req->r_num_caps = 2;
688 req->r_locked_dir = dir; 688 req->r_locked_dir = dir;
689 req->r_args.mknod.mode = cpu_to_le32(mode); 689 req->r_args.mknod.mode = cpu_to_le32(mode);
690 req->r_args.mknod.rdev = cpu_to_le32(rdev); 690 req->r_args.mknod.rdev = cpu_to_le32(rdev);
691 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 691 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
692 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 692 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
693 err = ceph_mdsc_do_request(mdsc, dir, req); 693 err = ceph_mdsc_do_request(mdsc, dir, req);
694 if (!err && !req->r_reply_info.head->is_dentry) 694 if (!err && !req->r_reply_info.head->is_dentry)
695 err = ceph_handle_notrace_create(dir, dentry); 695 err = ceph_handle_notrace_create(dir, dentry);
696 ceph_mdsc_put_request(req); 696 ceph_mdsc_put_request(req);
697 if (err) 697 if (err)
698 d_drop(dentry); 698 d_drop(dentry);
699 return err; 699 return err;
700 } 700 }
701 701
702 static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode, 702 static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
703 struct nameidata *nd) 703 struct nameidata *nd)
704 { 704 {
705 dout("create in dir %p dentry %p name '%.*s'\n", 705 dout("create in dir %p dentry %p name '%.*s'\n",
706 dir, dentry, dentry->d_name.len, dentry->d_name.name); 706 dir, dentry, dentry->d_name.len, dentry->d_name.name);
707 707
708 if (ceph_snap(dir) != CEPH_NOSNAP) 708 if (ceph_snap(dir) != CEPH_NOSNAP)
709 return -EROFS; 709 return -EROFS;
710 710
711 if (nd) { 711 if (nd) {
712 BUG_ON((nd->flags & LOOKUP_OPEN) == 0); 712 BUG_ON((nd->flags & LOOKUP_OPEN) == 0);
713 dentry = ceph_lookup_open(dir, dentry, nd, mode, 0); 713 dentry = ceph_lookup_open(dir, dentry, nd, mode, 0);
714 /* hrm, what should i do here if we get aliased? */ 714 /* hrm, what should i do here if we get aliased? */
715 if (IS_ERR(dentry)) 715 if (IS_ERR(dentry))
716 return PTR_ERR(dentry); 716 return PTR_ERR(dentry);
717 return 0; 717 return 0;
718 } 718 }
719 719
720 /* fall back to mknod */ 720 /* fall back to mknod */
721 return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0); 721 return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0);
722 } 722 }
723 723
724 static int ceph_symlink(struct inode *dir, struct dentry *dentry, 724 static int ceph_symlink(struct inode *dir, struct dentry *dentry,
725 const char *dest) 725 const char *dest)
726 { 726 {
727 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 727 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
728 struct ceph_mds_client *mdsc = fsc->mdsc; 728 struct ceph_mds_client *mdsc = fsc->mdsc;
729 struct ceph_mds_request *req; 729 struct ceph_mds_request *req;
730 int err; 730 int err;
731 731
732 if (ceph_snap(dir) != CEPH_NOSNAP) 732 if (ceph_snap(dir) != CEPH_NOSNAP)
733 return -EROFS; 733 return -EROFS;
734 734
735 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); 735 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
736 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); 736 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
737 if (IS_ERR(req)) { 737 if (IS_ERR(req)) {
738 d_drop(dentry); 738 d_drop(dentry);
739 return PTR_ERR(req); 739 return PTR_ERR(req);
740 } 740 }
741 req->r_dentry = dget(dentry); 741 req->r_dentry = dget(dentry);
742 req->r_num_caps = 2; 742 req->r_num_caps = 2;
743 req->r_path2 = kstrdup(dest, GFP_NOFS); 743 req->r_path2 = kstrdup(dest, GFP_NOFS);
744 req->r_locked_dir = dir; 744 req->r_locked_dir = dir;
745 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 745 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
746 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 746 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
747 err = ceph_mdsc_do_request(mdsc, dir, req); 747 err = ceph_mdsc_do_request(mdsc, dir, req);
748 if (!err && !req->r_reply_info.head->is_dentry) 748 if (!err && !req->r_reply_info.head->is_dentry)
749 err = ceph_handle_notrace_create(dir, dentry); 749 err = ceph_handle_notrace_create(dir, dentry);
750 ceph_mdsc_put_request(req); 750 ceph_mdsc_put_request(req);
751 if (err) 751 if (err)
752 d_drop(dentry); 752 d_drop(dentry);
753 return err; 753 return err;
754 } 754 }
755 755
756 static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 756 static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
757 { 757 {
758 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 758 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
759 struct ceph_mds_client *mdsc = fsc->mdsc; 759 struct ceph_mds_client *mdsc = fsc->mdsc;
760 struct ceph_mds_request *req; 760 struct ceph_mds_request *req;
761 int err = -EROFS; 761 int err = -EROFS;
762 int op; 762 int op;
763 763
764 if (ceph_snap(dir) == CEPH_SNAPDIR) { 764 if (ceph_snap(dir) == CEPH_SNAPDIR) {
765 /* mkdir .snap/foo is a MKSNAP */ 765 /* mkdir .snap/foo is a MKSNAP */
766 op = CEPH_MDS_OP_MKSNAP; 766 op = CEPH_MDS_OP_MKSNAP;
767 dout("mksnap dir %p snap '%.*s' dn %p\n", dir, 767 dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
768 dentry->d_name.len, dentry->d_name.name, dentry); 768 dentry->d_name.len, dentry->d_name.name, dentry);
769 } else if (ceph_snap(dir) == CEPH_NOSNAP) { 769 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
770 dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode); 770 dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
771 op = CEPH_MDS_OP_MKDIR; 771 op = CEPH_MDS_OP_MKDIR;
772 } else { 772 } else {
773 goto out; 773 goto out;
774 } 774 }
775 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 775 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
776 if (IS_ERR(req)) { 776 if (IS_ERR(req)) {
777 err = PTR_ERR(req); 777 err = PTR_ERR(req);
778 goto out; 778 goto out;
779 } 779 }
780 780
781 req->r_dentry = dget(dentry); 781 req->r_dentry = dget(dentry);
782 req->r_num_caps = 2; 782 req->r_num_caps = 2;
783 req->r_locked_dir = dir; 783 req->r_locked_dir = dir;
784 req->r_args.mkdir.mode = cpu_to_le32(mode); 784 req->r_args.mkdir.mode = cpu_to_le32(mode);
785 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 785 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
786 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 786 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
787 err = ceph_mdsc_do_request(mdsc, dir, req); 787 err = ceph_mdsc_do_request(mdsc, dir, req);
788 if (!err && !req->r_reply_info.head->is_dentry) 788 if (!err && !req->r_reply_info.head->is_dentry)
789 err = ceph_handle_notrace_create(dir, dentry); 789 err = ceph_handle_notrace_create(dir, dentry);
790 ceph_mdsc_put_request(req); 790 ceph_mdsc_put_request(req);
791 out: 791 out:
792 if (err < 0) 792 if (err < 0)
793 d_drop(dentry); 793 d_drop(dentry);
794 return err; 794 return err;
795 } 795 }
796 796
797 static int ceph_link(struct dentry *old_dentry, struct inode *dir, 797 static int ceph_link(struct dentry *old_dentry, struct inode *dir,
798 struct dentry *dentry) 798 struct dentry *dentry)
799 { 799 {
800 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 800 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
801 struct ceph_mds_client *mdsc = fsc->mdsc; 801 struct ceph_mds_client *mdsc = fsc->mdsc;
802 struct ceph_mds_request *req; 802 struct ceph_mds_request *req;
803 int err; 803 int err;
804 804
805 if (ceph_snap(dir) != CEPH_NOSNAP) 805 if (ceph_snap(dir) != CEPH_NOSNAP)
806 return -EROFS; 806 return -EROFS;
807 807
808 dout("link in dir %p old_dentry %p dentry %p\n", dir, 808 dout("link in dir %p old_dentry %p dentry %p\n", dir,
809 old_dentry, dentry); 809 old_dentry, dentry);
810 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS); 810 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
811 if (IS_ERR(req)) { 811 if (IS_ERR(req)) {
812 d_drop(dentry); 812 d_drop(dentry);
813 return PTR_ERR(req); 813 return PTR_ERR(req);
814 } 814 }
815 req->r_dentry = dget(dentry); 815 req->r_dentry = dget(dentry);
816 req->r_num_caps = 2; 816 req->r_num_caps = 2;
817 req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */ 817 req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
818 req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); 818 req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
819 req->r_locked_dir = dir; 819 req->r_locked_dir = dir;
820 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 820 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
821 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 821 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
822 err = ceph_mdsc_do_request(mdsc, dir, req); 822 err = ceph_mdsc_do_request(mdsc, dir, req);
823 if (err) { 823 if (err) {
824 d_drop(dentry); 824 d_drop(dentry);
825 } else if (!req->r_reply_info.head->is_dentry) { 825 } else if (!req->r_reply_info.head->is_dentry) {
826 ihold(old_dentry->d_inode); 826 ihold(old_dentry->d_inode);
827 d_instantiate(dentry, old_dentry->d_inode); 827 d_instantiate(dentry, old_dentry->d_inode);
828 } 828 }
829 ceph_mdsc_put_request(req); 829 ceph_mdsc_put_request(req);
830 return err; 830 return err;
831 } 831 }
832 832
833 /* 833 /*
834 * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it 834 * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
835 * looks like the link count will hit 0, drop any other caps (other 835 * looks like the link count will hit 0, drop any other caps (other
836 * than PIN) we don't specifically want (due to the file still being 836 * than PIN) we don't specifically want (due to the file still being
837 * open). 837 * open).
838 */ 838 */
839 static int drop_caps_for_unlink(struct inode *inode) 839 static int drop_caps_for_unlink(struct inode *inode)
840 { 840 {
841 struct ceph_inode_info *ci = ceph_inode(inode); 841 struct ceph_inode_info *ci = ceph_inode(inode);
842 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; 842 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
843 843
844 spin_lock(&ci->i_ceph_lock); 844 spin_lock(&ci->i_ceph_lock);
845 if (inode->i_nlink == 1) { 845 if (inode->i_nlink == 1) {
846 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); 846 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
847 ci->i_ceph_flags |= CEPH_I_NODELAY; 847 ci->i_ceph_flags |= CEPH_I_NODELAY;
848 } 848 }
849 spin_unlock(&ci->i_ceph_lock); 849 spin_unlock(&ci->i_ceph_lock);
850 return drop; 850 return drop;
851 } 851 }
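
The mask arithmetic above is terse; a small worked example (bit values invented for illustration, not the real CEPH_CAP_* assignments) shows what lands in drop when the link count is about to hit zero:

	/* illustrative bit values only */
	enum { PIN = 0x1, LINK_SHARED = 0x2, LINK_EXCL = 0x4, FILE_CACHE = 0x100 };

	int wanted = FILE_CACHE;		/* file still open for read */
	int drop = LINK_SHARED | LINK_EXCL;	/* always dropped on unlink */

	drop |= ~(wanted | PIN);		/* the i_nlink == 1 case */
	/* drop now has every bit set except FILE_CACHE and PIN */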
852 852
853 /* 853 /*
854 * rmdir and unlink differ only by the metadata op code 854 * rmdir and unlink differ only by the metadata op code
855 */ 855 */
856 static int ceph_unlink(struct inode *dir, struct dentry *dentry) 856 static int ceph_unlink(struct inode *dir, struct dentry *dentry)
857 { 857 {
858 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 858 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
859 struct ceph_mds_client *mdsc = fsc->mdsc; 859 struct ceph_mds_client *mdsc = fsc->mdsc;
860 struct inode *inode = dentry->d_inode; 860 struct inode *inode = dentry->d_inode;
861 struct ceph_mds_request *req; 861 struct ceph_mds_request *req;
862 int err = -EROFS; 862 int err = -EROFS;
863 int op; 863 int op;
864 864
865 if (ceph_snap(dir) == CEPH_SNAPDIR) { 865 if (ceph_snap(dir) == CEPH_SNAPDIR) {
866 /* rmdir .snap/foo is RMSNAP */ 866 /* rmdir .snap/foo is RMSNAP */
867 dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len, 867 dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
868 dentry->d_name.name, dentry); 868 dentry->d_name.name, dentry);
869 op = CEPH_MDS_OP_RMSNAP; 869 op = CEPH_MDS_OP_RMSNAP;
870 } else if (ceph_snap(dir) == CEPH_NOSNAP) { 870 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
871 dout("unlink/rmdir dir %p dn %p inode %p\n", 871 dout("unlink/rmdir dir %p dn %p inode %p\n",
872 dir, dentry, inode); 872 dir, dentry, inode);
873 op = S_ISDIR(dentry->d_inode->i_mode) ? 873 op = S_ISDIR(dentry->d_inode->i_mode) ?
874 CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK; 874 CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
875 } else 875 } else
876 goto out; 876 goto out;
877 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 877 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
878 if (IS_ERR(req)) { 878 if (IS_ERR(req)) {
879 err = PTR_ERR(req); 879 err = PTR_ERR(req);
880 goto out; 880 goto out;
881 } 881 }
882 req->r_dentry = dget(dentry); 882 req->r_dentry = dget(dentry);
883 req->r_num_caps = 2; 883 req->r_num_caps = 2;
884 req->r_locked_dir = dir; 884 req->r_locked_dir = dir;
885 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 885 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
886 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 886 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
887 req->r_inode_drop = drop_caps_for_unlink(inode); 887 req->r_inode_drop = drop_caps_for_unlink(inode);
888 err = ceph_mdsc_do_request(mdsc, dir, req); 888 err = ceph_mdsc_do_request(mdsc, dir, req);
889 if (!err && !req->r_reply_info.head->is_dentry) 889 if (!err && !req->r_reply_info.head->is_dentry)
890 d_delete(dentry); 890 d_delete(dentry);
891 ceph_mdsc_put_request(req); 891 ceph_mdsc_put_request(req);
892 out: 892 out:
893 return err; 893 return err;
894 } 894 }
895 895
896 static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, 896 static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
897 struct inode *new_dir, struct dentry *new_dentry) 897 struct inode *new_dir, struct dentry *new_dentry)
898 { 898 {
899 struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb); 899 struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
900 struct ceph_mds_client *mdsc = fsc->mdsc; 900 struct ceph_mds_client *mdsc = fsc->mdsc;
901 struct ceph_mds_request *req; 901 struct ceph_mds_request *req;
902 int err; 902 int err;
903 903
904 if (ceph_snap(old_dir) != ceph_snap(new_dir)) 904 if (ceph_snap(old_dir) != ceph_snap(new_dir))
905 return -EXDEV; 905 return -EXDEV;
906 if (ceph_snap(old_dir) != CEPH_NOSNAP || 906 if (ceph_snap(old_dir) != CEPH_NOSNAP ||
907 ceph_snap(new_dir) != CEPH_NOSNAP) 907 ceph_snap(new_dir) != CEPH_NOSNAP)
908 return -EROFS; 908 return -EROFS;
909 dout("rename dir %p dentry %p to dir %p dentry %p\n", 909 dout("rename dir %p dentry %p to dir %p dentry %p\n",
910 old_dir, old_dentry, new_dir, new_dentry); 910 old_dir, old_dentry, new_dir, new_dentry);
911 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS); 911 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
912 if (IS_ERR(req)) 912 if (IS_ERR(req))
913 return PTR_ERR(req); 913 return PTR_ERR(req);
914 req->r_dentry = dget(new_dentry); 914 req->r_dentry = dget(new_dentry);
915 req->r_num_caps = 2; 915 req->r_num_caps = 2;
916 req->r_old_dentry = dget(old_dentry); 916 req->r_old_dentry = dget(old_dentry);
917 req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); 917 req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
918 req->r_locked_dir = new_dir; 918 req->r_locked_dir = new_dir;
919 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; 919 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
920 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; 920 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
921 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 921 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
922 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 922 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
923 /* release LINK_RDCACHE on source inode (mds will lock it) */ 923 /* release LINK_RDCACHE on source inode (mds will lock it) */
924 req->r_old_inode_drop = CEPH_CAP_LINK_SHARED; 924 req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
925 if (new_dentry->d_inode) 925 if (new_dentry->d_inode)
926 req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode); 926 req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
927 err = ceph_mdsc_do_request(mdsc, old_dir, req); 927 err = ceph_mdsc_do_request(mdsc, old_dir, req);
928 if (!err && !req->r_reply_info.head->is_dentry) { 928 if (!err && !req->r_reply_info.head->is_dentry) {
929 /* 929 /*
930 * Normally d_move() is done by fill_trace (called by 930 * Normally d_move() is done by fill_trace (called by
931 * do_request, above). If there is no trace, we need 931 * do_request, above). If there is no trace, we need
932 * to do it here. 932 * to do it here.
933 */ 933 */
934 934
935 /* d_move screws up d_subdirs order */ 935 /* d_move screws up d_subdirs order */
936 ceph_dir_clear_complete(new_dir); 936 ceph_dir_clear_complete(new_dir);
937 937
938 d_move(old_dentry, new_dentry); 938 d_move(old_dentry, new_dentry);
939 939
940 /* ensure target dentry is invalidated, despite 940 /* ensure target dentry is invalidated, despite
941 rehashing bug in vfs_rename_dir */ 941 rehashing bug in vfs_rename_dir */
942 ceph_invalidate_dentry_lease(new_dentry); 942 ceph_invalidate_dentry_lease(new_dentry);
943 } 943 }
944 ceph_mdsc_put_request(req); 944 ceph_mdsc_put_request(req);
945 return err; 945 return err;
946 } 946 }
947 947
948 /* 948 /*
949 * Ensure a dentry lease will no longer revalidate. 949 * Ensure a dentry lease will no longer revalidate.
950 */ 950 */
951 void ceph_invalidate_dentry_lease(struct dentry *dentry) 951 void ceph_invalidate_dentry_lease(struct dentry *dentry)
952 { 952 {
953 spin_lock(&dentry->d_lock); 953 spin_lock(&dentry->d_lock);
954 dentry->d_time = jiffies; 954 dentry->d_time = jiffies;
955 ceph_dentry(dentry)->lease_shared_gen = 0; 955 ceph_dentry(dentry)->lease_shared_gen = 0;
956 spin_unlock(&dentry->d_lock); 956 spin_unlock(&dentry->d_lock);
957 } 957 }
958 958
959 /* 959 /*
960 * Check if dentry lease is valid. If not, delete the lease. Try to 960 * Check if dentry lease is valid. If not, delete the lease. Try to
961 * renew if the lease is more than half up. 961 * renew if the lease is more than half up.
962 */ 962 */
963 static int dentry_lease_is_valid(struct dentry *dentry) 963 static int dentry_lease_is_valid(struct dentry *dentry)
964 { 964 {
965 struct ceph_dentry_info *di; 965 struct ceph_dentry_info *di;
966 struct ceph_mds_session *s; 966 struct ceph_mds_session *s;
967 int valid = 0; 967 int valid = 0;
968 u32 gen; 968 u32 gen;
969 unsigned long ttl; 969 unsigned long ttl;
970 struct ceph_mds_session *session = NULL; 970 struct ceph_mds_session *session = NULL;
971 struct inode *dir = NULL; 971 struct inode *dir = NULL;
972 u32 seq = 0; 972 u32 seq = 0;
973 973
974 spin_lock(&dentry->d_lock); 974 spin_lock(&dentry->d_lock);
975 di = ceph_dentry(dentry); 975 di = ceph_dentry(dentry);
976 if (di->lease_session) { 976 if (di->lease_session) {
977 s = di->lease_session; 977 s = di->lease_session;
978 spin_lock(&s->s_cap_lock); 978 spin_lock(&s->s_gen_ttl_lock);
979 gen = s->s_cap_gen; 979 gen = s->s_cap_gen;
980 ttl = s->s_cap_ttl; 980 ttl = s->s_cap_ttl;
981 spin_unlock(&s->s_cap_lock); 981 spin_unlock(&s->s_gen_ttl_lock);
982 982
983 if (di->lease_gen == gen && 983 if (di->lease_gen == gen &&
984 time_before(jiffies, dentry->d_time) && 984 time_before(jiffies, dentry->d_time) &&
985 time_before(jiffies, ttl)) { 985 time_before(jiffies, ttl)) {
986 valid = 1; 986 valid = 1;
987 if (di->lease_renew_after && 987 if (di->lease_renew_after &&
988 time_after(jiffies, di->lease_renew_after)) { 988 time_after(jiffies, di->lease_renew_after)) {
989 /* we should renew */ 989 /* we should renew */
990 dir = dentry->d_parent->d_inode; 990 dir = dentry->d_parent->d_inode;
991 session = ceph_get_mds_session(s); 991 session = ceph_get_mds_session(s);
992 seq = di->lease_seq; 992 seq = di->lease_seq;
993 di->lease_renew_after = 0; 993 di->lease_renew_after = 0;
994 di->lease_renew_from = jiffies; 994 di->lease_renew_from = jiffies;
995 } 995 }
996 } 996 }
997 } 997 }
998 spin_unlock(&dentry->d_lock); 998 spin_unlock(&dentry->d_lock);
999 999
1000 if (session) { 1000 if (session) {
1001 ceph_mdsc_lease_send_msg(session, dir, dentry, 1001 ceph_mdsc_lease_send_msg(session, dir, dentry,
1002 CEPH_MDS_LEASE_RENEW, seq); 1002 CEPH_MDS_LEASE_RENEW, seq);
1003 ceph_put_mds_session(session); 1003 ceph_put_mds_session(session);
1004 } 1004 }
1005 dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid); 1005 dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
1006 return valid; 1006 return valid;
1007 } 1007 }
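
The renewal trigger above depends on di->lease_renew_after, which is set in the lease-grant path (not shown in this hunk). A hedged sketch of how those fields are presumably initialized, consistent with the "more than half up" comment; duration_ms stands in for the MDS-supplied lease duration:

	unsigned long duration_j = msecs_to_jiffies(duration_ms);

	dentry->d_time = jiffies + duration_j;		  /* hard expiry, checked
							     via time_before() */
	di->lease_renew_after = jiffies + duration_j / 2; /* start renewing once
							     half the lease is up */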
1008 1008
1009 /* 1009 /*
1010 * Check if directory-wide content lease/cap is valid. 1010 * Check if directory-wide content lease/cap is valid.
1011 */ 1011 */
1012 static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) 1012 static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
1013 { 1013 {
1014 struct ceph_inode_info *ci = ceph_inode(dir); 1014 struct ceph_inode_info *ci = ceph_inode(dir);
1015 struct ceph_dentry_info *di = ceph_dentry(dentry); 1015 struct ceph_dentry_info *di = ceph_dentry(dentry);
1016 int valid = 0; 1016 int valid = 0;
1017 1017
1018 spin_lock(&ci->i_ceph_lock); 1018 spin_lock(&ci->i_ceph_lock);
1019 if (ci->i_shared_gen == di->lease_shared_gen) 1019 if (ci->i_shared_gen == di->lease_shared_gen)
1020 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); 1020 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
1021 spin_unlock(&ci->i_ceph_lock); 1021 spin_unlock(&ci->i_ceph_lock);
1022 dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n", 1022 dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
1023 dir, (unsigned)ci->i_shared_gen, dentry, 1023 dir, (unsigned)ci->i_shared_gen, dentry,
1024 (unsigned)di->lease_shared_gen, valid); 1024 (unsigned)di->lease_shared_gen, valid);
1025 return valid; 1025 return valid;
1026 } 1026 }
1027 1027
1028 /* 1028 /*
1029 * Check if cached dentry can be trusted. 1029 * Check if cached dentry can be trusted.
1030 */ 1030 */
1031 static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) 1031 static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
1032 { 1032 {
1033 int valid = 0; 1033 int valid = 0;
1034 struct inode *dir; 1034 struct inode *dir;
1035 1035
1036 if (nd && nd->flags & LOOKUP_RCU) 1036 if (nd && nd->flags & LOOKUP_RCU)
1037 return -ECHILD; 1037 return -ECHILD;
1038 1038
1039 dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, 1039 dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
1040 dentry->d_name.len, dentry->d_name.name, dentry->d_inode, 1040 dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
1041 ceph_dentry(dentry)->offset); 1041 ceph_dentry(dentry)->offset);
1042 1042
1043 dir = ceph_get_dentry_parent_inode(dentry); 1043 dir = ceph_get_dentry_parent_inode(dentry);
1044 1044
1045 /* always trust cached snapped dentries, snapdir dentry */ 1045 /* always trust cached snapped dentries, snapdir dentry */
1046 if (ceph_snap(dir) != CEPH_NOSNAP) { 1046 if (ceph_snap(dir) != CEPH_NOSNAP) {
1047 dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry, 1047 dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
1048 dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 1048 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
1049 valid = 1; 1049 valid = 1;
1050 } else if (dentry->d_inode && 1050 } else if (dentry->d_inode &&
1051 ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) { 1051 ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) {
1052 valid = 1; 1052 valid = 1;
1053 } else if (dentry_lease_is_valid(dentry) || 1053 } else if (dentry_lease_is_valid(dentry) ||
1054 dir_lease_is_valid(dir, dentry)) { 1054 dir_lease_is_valid(dir, dentry)) {
1055 valid = 1; 1055 valid = 1;
1056 } 1056 }
1057 1057
1058 dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); 1058 dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
1059 if (valid) 1059 if (valid)
1060 ceph_dentry_lru_touch(dentry); 1060 ceph_dentry_lru_touch(dentry);
1061 else 1061 else
1062 d_drop(dentry); 1062 d_drop(dentry);
1063 iput(dir); 1063 iput(dir);
1064 return valid; 1064 return valid;
1065 } 1065 }
1066 1066
1067 /* 1067 /*
1068 * Release our ceph_dentry_info. 1068 * Release our ceph_dentry_info.
1069 */ 1069 */
1070 static void ceph_d_release(struct dentry *dentry) 1070 static void ceph_d_release(struct dentry *dentry)
1071 { 1071 {
1072 struct ceph_dentry_info *di = ceph_dentry(dentry); 1072 struct ceph_dentry_info *di = ceph_dentry(dentry);
1073 1073
1074 dout("d_release %p\n", dentry); 1074 dout("d_release %p\n", dentry);
1075 ceph_dentry_lru_del(dentry); 1075 ceph_dentry_lru_del(dentry);
1076 if (di->lease_session) 1076 if (di->lease_session)
1077 ceph_put_mds_session(di->lease_session); 1077 ceph_put_mds_session(di->lease_session);
1078 kmem_cache_free(ceph_dentry_cachep, di); 1078 kmem_cache_free(ceph_dentry_cachep, di);
1079 dentry->d_fsdata = NULL; 1079 dentry->d_fsdata = NULL;
1080 } 1080 }
1081 1081
1082 static int ceph_snapdir_d_revalidate(struct dentry *dentry, 1082 static int ceph_snapdir_d_revalidate(struct dentry *dentry,
1083 struct nameidata *nd) 1083 struct nameidata *nd)
1084 { 1084 {
1085 /* 1085 /*
1086 * Eventually, we'll want to revalidate snapped metadata 1086 * Eventually, we'll want to revalidate snapped metadata
1087 * too... probably... 1087 * too... probably...
1088 */ 1088 */
1089 return 1; 1089 return 1;
1090 } 1090 }
1091 1091
1092 /* 1092 /*
1093 * Set/clear/test dir complete flag on the dir's dentry. 1093 * Set/clear/test dir complete flag on the dir's dentry.
1094 */ 1094 */
1095 void ceph_dir_set_complete(struct inode *inode) 1095 void ceph_dir_set_complete(struct inode *inode)
1096 { 1096 {
1097 struct dentry *dentry = d_find_any_alias(inode); 1097 struct dentry *dentry = d_find_any_alias(inode);
1098 1098
1099 if (dentry && ceph_dentry(dentry) && 1099 if (dentry && ceph_dentry(dentry) &&
1100 ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) { 1100 ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) {
1101 dout(" marking %p (%p) complete\n", inode, dentry); 1101 dout(" marking %p (%p) complete\n", inode, dentry);
1102 set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); 1102 set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1103 } 1103 }
1104 dput(dentry); 1104 dput(dentry);
1105 } 1105 }
1106 1106
1107 void ceph_dir_clear_complete(struct inode *inode) 1107 void ceph_dir_clear_complete(struct inode *inode)
1108 { 1108 {
1109 struct dentry *dentry = d_find_any_alias(inode); 1109 struct dentry *dentry = d_find_any_alias(inode);
1110 1110
1111 if (dentry && ceph_dentry(dentry)) { 1111 if (dentry && ceph_dentry(dentry)) {
1112 dout(" marking %p (%p) NOT complete\n", inode, dentry); 1112 dout(" marking %p (%p) NOT complete\n", inode, dentry);
1113 clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); 1113 clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1114 } 1114 }
1115 dput(dentry); 1115 dput(dentry);
1116 } 1116 }
1117 1117
1118 bool ceph_dir_test_complete(struct inode *inode) 1118 bool ceph_dir_test_complete(struct inode *inode)
1119 { 1119 {
1120 struct dentry *dentry = d_find_any_alias(inode); 1120 struct dentry *dentry = d_find_any_alias(inode);
1121 bool complete = false; 1121 bool complete = false;
1122 1122
1123 if (dentry && ceph_dentry(dentry)) 1123 if (dentry && ceph_dentry(dentry))
1124 complete = test_bit(CEPH_D_COMPLETE, 1124 complete = test_bit(CEPH_D_COMPLETE,
1125 &ceph_dentry(dentry)->flags); 1125 &ceph_dentry(dentry)->flags);
1126 dput(dentry); 1126 dput(dentry);
1127 return complete; 1127 return complete;
1128 } 1128 }
1129 1129
1130 /* 1130 /*
1131 * When the VFS prunes a dentry from the cache, we need to clear the 1131 * When the VFS prunes a dentry from the cache, we need to clear the
1132 * complete flag on the parent directory. 1132 * complete flag on the parent directory.
1133 * 1133 *
1134 * Called under dentry->d_lock. 1134 * Called under dentry->d_lock.
1135 */ 1135 */
1136 static void ceph_d_prune(struct dentry *dentry) 1136 static void ceph_d_prune(struct dentry *dentry)
1137 { 1137 {
1138 struct ceph_dentry_info *di; 1138 struct ceph_dentry_info *di;
1139 1139
1140 dout("ceph_d_prune %p\n", dentry); 1140 dout("ceph_d_prune %p\n", dentry);
1141 1141
1142 /* do we have a valid parent? */ 1142 /* do we have a valid parent? */
1143 if (!dentry->d_parent || IS_ROOT(dentry)) 1143 if (!dentry->d_parent || IS_ROOT(dentry))
1144 return; 1144 return;
1145 1145
1146 /* if we are not hashed, we don't affect D_COMPLETE */ 1146 /* if we are not hashed, we don't affect D_COMPLETE */
1147 if (d_unhashed(dentry)) 1147 if (d_unhashed(dentry))
1148 return; 1148 return;
1149 1149
1150 /* 1150 /*
1151 * we hold d_lock, so d_parent is stable, and d_fsdata is never 1151 * we hold d_lock, so d_parent is stable, and d_fsdata is never
1152 * cleared until d_release 1152 * cleared until d_release
1153 */ 1153 */
1154 di = ceph_dentry(dentry->d_parent); 1154 di = ceph_dentry(dentry->d_parent);
1155 clear_bit(CEPH_D_COMPLETE, &di->flags); 1155 clear_bit(CEPH_D_COMPLETE, &di->flags);
1156 } 1156 }
1157 1157
1158 /* 1158 /*
1159 * read() on a dir. This weird interface hack only works if mounted 1159 * read() on a dir. This weird interface hack only works if mounted
1160 * with '-o dirstat'. 1160 * with '-o dirstat'.
1161 */ 1161 */
1162 static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, 1162 static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1163 loff_t *ppos) 1163 loff_t *ppos)
1164 { 1164 {
1165 struct ceph_file_info *cf = file->private_data; 1165 struct ceph_file_info *cf = file->private_data;
1166 struct inode *inode = file->f_dentry->d_inode; 1166 struct inode *inode = file->f_dentry->d_inode;
1167 struct ceph_inode_info *ci = ceph_inode(inode); 1167 struct ceph_inode_info *ci = ceph_inode(inode);
1168 int left; 1168 int left;
1169 const int bufsize = 1024; 1169 const int bufsize = 1024;
1170 1170
1171 if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) 1171 if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
1172 return -EISDIR; 1172 return -EISDIR;
1173 1173
1174 if (!cf->dir_info) { 1174 if (!cf->dir_info) {
1175 cf->dir_info = kmalloc(bufsize, GFP_NOFS); 1175 cf->dir_info = kmalloc(bufsize, GFP_NOFS);
1176 if (!cf->dir_info) 1176 if (!cf->dir_info)
1177 return -ENOMEM; 1177 return -ENOMEM;
1178 cf->dir_info_len = 1178 cf->dir_info_len =
1179 snprintf(cf->dir_info, bufsize, 1179 snprintf(cf->dir_info, bufsize,
1180 "entries: %20lld\n" 1180 "entries: %20lld\n"
1181 " files: %20lld\n" 1181 " files: %20lld\n"
1182 " subdirs: %20lld\n" 1182 " subdirs: %20lld\n"
1183 "rentries: %20lld\n" 1183 "rentries: %20lld\n"
1184 " rfiles: %20lld\n" 1184 " rfiles: %20lld\n"
1185 " rsubdirs: %20lld\n" 1185 " rsubdirs: %20lld\n"
1186 "rbytes: %20lld\n" 1186 "rbytes: %20lld\n"
1187 "rctime: %10ld.%09ld\n", 1187 "rctime: %10ld.%09ld\n",
1188 ci->i_files + ci->i_subdirs, 1188 ci->i_files + ci->i_subdirs,
1189 ci->i_files, 1189 ci->i_files,
1190 ci->i_subdirs, 1190 ci->i_subdirs,
1191 ci->i_rfiles + ci->i_rsubdirs, 1191 ci->i_rfiles + ci->i_rsubdirs,
1192 ci->i_rfiles, 1192 ci->i_rfiles,
1193 ci->i_rsubdirs, 1193 ci->i_rsubdirs,
1194 ci->i_rbytes, 1194 ci->i_rbytes,
1195 (long)ci->i_rctime.tv_sec, 1195 (long)ci->i_rctime.tv_sec,
1196 (long)ci->i_rctime.tv_nsec); 1196 (long)ci->i_rctime.tv_nsec);
1197 } 1197 }
1198 1198
1199 if (*ppos >= cf->dir_info_len) 1199 if (*ppos >= cf->dir_info_len)
1200 return 0; 1200 return 0;
1201 size = min_t(unsigned, size, cf->dir_info_len-*ppos); 1201 size = min_t(unsigned, size, cf->dir_info_len-*ppos);
1202 left = copy_to_user(buf, cf->dir_info + *ppos, size); 1202 left = copy_to_user(buf, cf->dir_info + *ppos, size);
1203 if (left == size) 1203 if (left == size)
1204 return -EFAULT; 1204 return -EFAULT;
1205 *ppos += (size - left); 1205 *ppos += (size - left);
1206 return size - left; 1206 return size - left;
1207 } 1207 }
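
From userspace, with the filesystem mounted with '-o dirstat', the hack above makes a plain read(2) on a directory fd return the formatted stats. A minimal illustration (mount point hypothetical):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[1024];
		int fd = open("/mnt/ceph/somedir", O_RDONLY | O_DIRECTORY);
		ssize_t n;

		if (fd < 0)
			return 1;
		n = read(fd, buf, sizeof(buf) - 1); /* -EISDIR without -o dirstat */
		if (n > 0) {
			buf[n] = '\0';
			fputs(buf, stdout);	/* entries/files/subdirs/r* lines */
		}
		close(fd);
		return 0;
	}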
1208 1208
1209 /* 1209 /*
1210 * an fsync() on a dir will wait for any uncommitted directory 1210 * an fsync() on a dir will wait for any uncommitted directory
1211 * operations to commit. 1211 * operations to commit.
1212 */ 1212 */
1213 static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, 1213 static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
1214 int datasync) 1214 int datasync)
1215 { 1215 {
1216 struct inode *inode = file->f_path.dentry->d_inode; 1216 struct inode *inode = file->f_path.dentry->d_inode;
1217 struct ceph_inode_info *ci = ceph_inode(inode); 1217 struct ceph_inode_info *ci = ceph_inode(inode);
1218 struct list_head *head = &ci->i_unsafe_dirops; 1218 struct list_head *head = &ci->i_unsafe_dirops;
1219 struct ceph_mds_request *req; 1219 struct ceph_mds_request *req;
1220 u64 last_tid; 1220 u64 last_tid;
1221 int ret = 0; 1221 int ret = 0;
1222 1222
1223 dout("dir_fsync %p\n", inode); 1223 dout("dir_fsync %p\n", inode);
1224 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 1224 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
1225 if (ret) 1225 if (ret)
1226 return ret; 1226 return ret;
1227 mutex_lock(&inode->i_mutex); 1227 mutex_lock(&inode->i_mutex);
1228 1228
1229 spin_lock(&ci->i_unsafe_lock); 1229 spin_lock(&ci->i_unsafe_lock);
1230 if (list_empty(head)) 1230 if (list_empty(head))
1231 goto out; 1231 goto out;
1232 1232
1233 req = list_entry(head->prev, 1233 req = list_entry(head->prev,
1234 struct ceph_mds_request, r_unsafe_dir_item); 1234 struct ceph_mds_request, r_unsafe_dir_item);
1235 last_tid = req->r_tid; 1235 last_tid = req->r_tid;
1236 1236
1237 do { 1237 do {
1238 ceph_mdsc_get_request(req); 1238 ceph_mdsc_get_request(req);
1239 spin_unlock(&ci->i_unsafe_lock); 1239 spin_unlock(&ci->i_unsafe_lock);
1240 1240
1241 dout("dir_fsync %p wait on tid %llu (until %llu)\n", 1241 dout("dir_fsync %p wait on tid %llu (until %llu)\n",
1242 inode, req->r_tid, last_tid); 1242 inode, req->r_tid, last_tid);
1243 if (req->r_timeout) { 1243 if (req->r_timeout) {
1244 ret = wait_for_completion_timeout( 1244 ret = wait_for_completion_timeout(
1245 &req->r_safe_completion, req->r_timeout); 1245 &req->r_safe_completion, req->r_timeout);
1246 if (ret > 0) 1246 if (ret > 0)
1247 ret = 0; 1247 ret = 0;
1248 else if (ret == 0) 1248 else if (ret == 0)
1249 ret = -EIO; /* timed out */ 1249 ret = -EIO; /* timed out */
1250 } else { 1250 } else {
1251 wait_for_completion(&req->r_safe_completion); 1251 wait_for_completion(&req->r_safe_completion);
1252 } 1252 }
1253 ceph_mdsc_put_request(req); 1253 ceph_mdsc_put_request(req);
1254 1254
1255 spin_lock(&ci->i_unsafe_lock); 1255 spin_lock(&ci->i_unsafe_lock);
1256 if (ret || list_empty(head)) 1256 if (ret || list_empty(head))
1257 break; 1257 break;
1258 req = list_entry(head->next, 1258 req = list_entry(head->next,
1259 struct ceph_mds_request, r_unsafe_dir_item); 1259 struct ceph_mds_request, r_unsafe_dir_item);
1260 } while (req->r_tid < last_tid); 1260 } while (req->r_tid < last_tid);
1261 out: 1261 out:
1262 spin_unlock(&ci->i_unsafe_lock); 1262 spin_unlock(&ci->i_unsafe_lock);
1263 mutex_unlock(&inode->i_mutex); 1263 mutex_unlock(&inode->i_mutex);
1264 1264
1265 return ret; 1265 return ret;
1266 } 1266 }
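
The practical upshot: an application that needs a directory operation to be durable can fsync() the directory fd, which blocks in the loop above until the MDS has committed every outstanding unsafe op. A userspace illustration (paths hypothetical):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int dfd = open("/mnt/ceph/somedir", O_RDONLY | O_DIRECTORY);

		if (dfd < 0)
			return 1;
		if (renameat(dfd, "tmpname", dfd, "realname") < 0)
			return 1;
		if (fsync(dfd) < 0)	/* returns only once the MDS has
					   safely committed the rename */
			perror("fsync");
		close(dfd);
		return 0;
	}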
1267 1267
1268 /* 1268 /*
1269 * We maintain a private dentry LRU. 1269 * We maintain a private dentry LRU.
1270 * 1270 *
1271 * FIXME: this needs to be changed to a per-mds lru to be useful. 1271 * FIXME: this needs to be changed to a per-mds lru to be useful.
1272 */ 1272 */
1273 void ceph_dentry_lru_add(struct dentry *dn) 1273 void ceph_dentry_lru_add(struct dentry *dn)
1274 { 1274 {
1275 struct ceph_dentry_info *di = ceph_dentry(dn); 1275 struct ceph_dentry_info *di = ceph_dentry(dn);
1276 struct ceph_mds_client *mdsc; 1276 struct ceph_mds_client *mdsc;
1277 1277
1278 dout("dentry_lru_add %p %p '%.*s'\n", di, dn, 1278 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1279 dn->d_name.len, dn->d_name.name); 1279 dn->d_name.len, dn->d_name.name);
1280 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; 1280 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1281 spin_lock(&mdsc->dentry_lru_lock); 1281 spin_lock(&mdsc->dentry_lru_lock);
1282 list_add_tail(&di->lru, &mdsc->dentry_lru); 1282 list_add_tail(&di->lru, &mdsc->dentry_lru);
1283 mdsc->num_dentry++; 1283 mdsc->num_dentry++;
1284 spin_unlock(&mdsc->dentry_lru_lock); 1284 spin_unlock(&mdsc->dentry_lru_lock);
1285 } 1285 }
1286 1286
1287 void ceph_dentry_lru_touch(struct dentry *dn) 1287 void ceph_dentry_lru_touch(struct dentry *dn)
1288 { 1288 {
1289 struct ceph_dentry_info *di = ceph_dentry(dn); 1289 struct ceph_dentry_info *di = ceph_dentry(dn);
1290 struct ceph_mds_client *mdsc; 1290 struct ceph_mds_client *mdsc;
1291 1291
1292 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, 1292 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
1293 dn->d_name.len, dn->d_name.name, di->offset); 1293 dn->d_name.len, dn->d_name.name, di->offset);
1294 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; 1294 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1295 spin_lock(&mdsc->dentry_lru_lock); 1295 spin_lock(&mdsc->dentry_lru_lock);
1296 list_move_tail(&di->lru, &mdsc->dentry_lru); 1296 list_move_tail(&di->lru, &mdsc->dentry_lru);
1297 spin_unlock(&mdsc->dentry_lru_lock); 1297 spin_unlock(&mdsc->dentry_lru_lock);
1298 } 1298 }
1299 1299
1300 void ceph_dentry_lru_del(struct dentry *dn) 1300 void ceph_dentry_lru_del(struct dentry *dn)
1301 { 1301 {
1302 struct ceph_dentry_info *di = ceph_dentry(dn); 1302 struct ceph_dentry_info *di = ceph_dentry(dn);
1303 struct ceph_mds_client *mdsc; 1303 struct ceph_mds_client *mdsc;
1304 1304
1305 dout("dentry_lru_del %p %p '%.*s'\n", di, dn, 1305 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1306 dn->d_name.len, dn->d_name.name); 1306 dn->d_name.len, dn->d_name.name);
1307 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; 1307 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1308 spin_lock(&mdsc->dentry_lru_lock); 1308 spin_lock(&mdsc->dentry_lru_lock);
1309 list_del_init(&di->lru); 1309 list_del_init(&di->lru);
1310 mdsc->num_dentry--; 1310 mdsc->num_dentry--;
1311 spin_unlock(&mdsc->dentry_lru_lock); 1311 spin_unlock(&mdsc->dentry_lru_lock);
1312 } 1312 }
1313 1313
1314 /* 1314 /*
1315 * Return name hash for a given dentry. This is dependent on 1315 * Return name hash for a given dentry. This is dependent on
1316 * the parent directory's hash function. 1316 * the parent directory's hash function.
1317 */ 1317 */
1318 unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) 1318 unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
1319 { 1319 {
1320 struct ceph_inode_info *dci = ceph_inode(dir); 1320 struct ceph_inode_info *dci = ceph_inode(dir);
1321 1321
1322 switch (dci->i_dir_layout.dl_dir_hash) { 1322 switch (dci->i_dir_layout.dl_dir_hash) {
1323 case 0: /* for backward compat */ 1323 case 0: /* for backward compat */
1324 case CEPH_STR_HASH_LINUX: 1324 case CEPH_STR_HASH_LINUX:
1325 return dn->d_name.hash; 1325 return dn->d_name.hash;
1326 1326
1327 default: 1327 default:
1328 return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, 1328 return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
1329 dn->d_name.name, dn->d_name.len); 1329 dn->d_name.name, dn->d_name.len);
1330 } 1330 }
1331 } 1331 }
1332 1332
1333 const struct file_operations ceph_dir_fops = { 1333 const struct file_operations ceph_dir_fops = {
1334 .read = ceph_read_dir, 1334 .read = ceph_read_dir,
1335 .readdir = ceph_readdir, 1335 .readdir = ceph_readdir,
1336 .llseek = ceph_dir_llseek, 1336 .llseek = ceph_dir_llseek,
1337 .open = ceph_open, 1337 .open = ceph_open,
1338 .release = ceph_release, 1338 .release = ceph_release,
1339 .unlocked_ioctl = ceph_ioctl, 1339 .unlocked_ioctl = ceph_ioctl,
1340 .fsync = ceph_dir_fsync, 1340 .fsync = ceph_dir_fsync,
1341 }; 1341 };
1342 1342
1343 const struct inode_operations ceph_dir_iops = { 1343 const struct inode_operations ceph_dir_iops = {
1344 .lookup = ceph_lookup, 1344 .lookup = ceph_lookup,
1345 .permission = ceph_permission, 1345 .permission = ceph_permission,
1346 .getattr = ceph_getattr, 1346 .getattr = ceph_getattr,
1347 .setattr = ceph_setattr, 1347 .setattr = ceph_setattr,
1348 .setxattr = ceph_setxattr, 1348 .setxattr = ceph_setxattr,
1349 .getxattr = ceph_getxattr, 1349 .getxattr = ceph_getxattr,
1350 .listxattr = ceph_listxattr, 1350 .listxattr = ceph_listxattr,
1351 .removexattr = ceph_removexattr, 1351 .removexattr = ceph_removexattr,
1352 .mknod = ceph_mknod, 1352 .mknod = ceph_mknod,
1353 .symlink = ceph_symlink, 1353 .symlink = ceph_symlink,
1354 .mkdir = ceph_mkdir, 1354 .mkdir = ceph_mkdir,
1355 .link = ceph_link, 1355 .link = ceph_link,
1356 .unlink = ceph_unlink, 1356 .unlink = ceph_unlink,
1357 .rmdir = ceph_unlink, 1357 .rmdir = ceph_unlink,
1358 .rename = ceph_rename, 1358 .rename = ceph_rename,
1359 .create = ceph_create, 1359 .create = ceph_create,
1360 }; 1360 };
1361 1361
1362 const struct dentry_operations ceph_dentry_ops = { 1362 const struct dentry_operations ceph_dentry_ops = {
1363 .d_revalidate = ceph_d_revalidate, 1363 .d_revalidate = ceph_d_revalidate,
1364 .d_release = ceph_d_release, 1364 .d_release = ceph_d_release,
1365 .d_prune = ceph_d_prune, 1365 .d_prune = ceph_d_prune,
1366 }; 1366 };
1367 1367
1368 const struct dentry_operations ceph_snapdir_dentry_ops = { 1368 const struct dentry_operations ceph_snapdir_dentry_ops = {
1369 .d_revalidate = ceph_snapdir_d_revalidate, 1369 .d_revalidate = ceph_snapdir_d_revalidate,
1370 .d_release = ceph_d_release, 1370 .d_release = ceph_d_release,
1371 }; 1371 };
1372 1372
1373 const struct dentry_operations ceph_snap_dentry_ops = { 1373 const struct dentry_operations ceph_snap_dentry_ops = {
1374 .d_release = ceph_d_release, 1374 .d_release = ceph_d_release,
1375 .d_prune = ceph_d_prune, 1375 .d_prune = ceph_d_prune,
1376 }; 1376 };
1377 1377
fs/ceph/mds_client.c
1 #include <linux/ceph/ceph_debug.h> 1 #include <linux/ceph/ceph_debug.h>
2 2
3 #include <linux/fs.h> 3 #include <linux/fs.h>
4 #include <linux/wait.h> 4 #include <linux/wait.h>
5 #include <linux/slab.h> 5 #include <linux/slab.h>
6 #include <linux/sched.h> 6 #include <linux/sched.h>
7 #include <linux/debugfs.h> 7 #include <linux/debugfs.h>
8 #include <linux/seq_file.h> 8 #include <linux/seq_file.h>
9 9
10 #include "super.h" 10 #include "super.h"
11 #include "mds_client.h" 11 #include "mds_client.h"
12 12
13 #include <linux/ceph/messenger.h> 13 #include <linux/ceph/messenger.h>
14 #include <linux/ceph/decode.h> 14 #include <linux/ceph/decode.h>
15 #include <linux/ceph/pagelist.h> 15 #include <linux/ceph/pagelist.h>
16 #include <linux/ceph/auth.h> 16 #include <linux/ceph/auth.h>
17 #include <linux/ceph/debugfs.h> 17 #include <linux/ceph/debugfs.h>
18 18
19 /* 19 /*
20 * A cluster of MDS (metadata server) daemons is responsible for 20 * A cluster of MDS (metadata server) daemons is responsible for
21 * managing the file system namespace (the directory hierarchy and 21 * managing the file system namespace (the directory hierarchy and
22 * inodes) and for coordinating shared access to storage. Metadata is 22 * inodes) and for coordinating shared access to storage. Metadata is
23 * partitioned hierarchically across a number of servers, and that 23 * partitioned hierarchically across a number of servers, and that
24 * partition varies over time as the cluster adjusts the distribution 24 * partition varies over time as the cluster adjusts the distribution
25 * in order to balance load. 25 * in order to balance load.
26 * 26 *
27 * The MDS client is primarily responsible for managing synchronous 27 * The MDS client is primarily responsible for managing synchronous
28 * metadata requests for operations like open, unlink, and so forth. 28 * metadata requests for operations like open, unlink, and so forth.
29 * If there is an MDS failure, we find out about it when we (possibly 29 * If there is an MDS failure, we find out about it when we (possibly
30 * request and) receive a new MDS map, and can resubmit affected 30 * request and) receive a new MDS map, and can resubmit affected
31 * requests. 31 * requests.
32 * 32 *
33 * For the most part, though, we take advantage of a lossless 33 * For the most part, though, we take advantage of a lossless
34 * communications channel to the MDS, and do not need to worry about 34 * communications channel to the MDS, and do not need to worry about
35 * timing out or resubmitting requests. 35 * timing out or resubmitting requests.
36 * 36 *
37 * We maintain a stateful "session" with each MDS we interact with. 37 * We maintain a stateful "session" with each MDS we interact with.
38 * Within each session, we send periodic heartbeat messages to ensure 38 * Within each session, we send periodic heartbeat messages to ensure
39 * any capabilities or leases we have been issued remain valid. If 39 * any capabilities or leases we have been issued remain valid. If
40 * the session times out and goes stale, our leases and capabilities 40 * the session times out and goes stale, our leases and capabilities
41 * are no longer valid. 41 * are no longer valid.
42 */ 42 */
43 43
44 struct ceph_reconnect_state { 44 struct ceph_reconnect_state {
45 struct ceph_pagelist *pagelist; 45 struct ceph_pagelist *pagelist;
46 bool flock; 46 bool flock;
47 }; 47 };
48 48
49 static void __wake_requests(struct ceph_mds_client *mdsc, 49 static void __wake_requests(struct ceph_mds_client *mdsc,
50 struct list_head *head); 50 struct list_head *head);
51 51
52 static const struct ceph_connection_operations mds_con_ops; 52 static const struct ceph_connection_operations mds_con_ops;
53 53
54 54
55 /* 55 /*
56 * mds reply parsing 56 * mds reply parsing
57 */ 57 */
58 58
59 /* 59 /*
60 * parse individual inode info 60 * parse individual inode info
61 */ 61 */
62 static int parse_reply_info_in(void **p, void *end, 62 static int parse_reply_info_in(void **p, void *end,
63 struct ceph_mds_reply_info_in *info, 63 struct ceph_mds_reply_info_in *info,
64 int features) 64 int features)
65 { 65 {
66 int err = -EIO; 66 int err = -EIO;
67 67
68 info->in = *p; 68 info->in = *p;
69 *p += sizeof(struct ceph_mds_reply_inode) + 69 *p += sizeof(struct ceph_mds_reply_inode) +
70 sizeof(*info->in->fragtree.splits) * 70 sizeof(*info->in->fragtree.splits) *
71 le32_to_cpu(info->in->fragtree.nsplits); 71 le32_to_cpu(info->in->fragtree.nsplits);
72 72
73 ceph_decode_32_safe(p, end, info->symlink_len, bad); 73 ceph_decode_32_safe(p, end, info->symlink_len, bad);
74 ceph_decode_need(p, end, info->symlink_len, bad); 74 ceph_decode_need(p, end, info->symlink_len, bad);
75 info->symlink = *p; 75 info->symlink = *p;
76 *p += info->symlink_len; 76 *p += info->symlink_len;
77 77
78 if (features & CEPH_FEATURE_DIRLAYOUTHASH) 78 if (features & CEPH_FEATURE_DIRLAYOUTHASH)
79 ceph_decode_copy_safe(p, end, &info->dir_layout, 79 ceph_decode_copy_safe(p, end, &info->dir_layout,
80 sizeof(info->dir_layout), bad); 80 sizeof(info->dir_layout), bad);
81 else 81 else
82 memset(&info->dir_layout, 0, sizeof(info->dir_layout)); 82 memset(&info->dir_layout, 0, sizeof(info->dir_layout));
83 83
84 ceph_decode_32_safe(p, end, info->xattr_len, bad); 84 ceph_decode_32_safe(p, end, info->xattr_len, bad);
85 ceph_decode_need(p, end, info->xattr_len, bad); 85 ceph_decode_need(p, end, info->xattr_len, bad);
86 info->xattr_data = *p; 86 info->xattr_data = *p;
87 *p += info->xattr_len; 87 *p += info->xattr_len;
88 return 0; 88 return 0;
89 bad: 89 bad:
90 return err; 90 return err;
91 } 91 }
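/* [editor's sketch, not part of the diff] parse_reply_info_in() above
 * shows the decode idiom used throughout this file: each
 * ceph_decode_32_safe()/ceph_decode_need() verifies that enough bytes
 * remain before *p is advanced, jumping to the `bad` label otherwise.
 * A minimal, self-contained userspace analogue of that idiom
 * (buf_need/buf_get32 are hypothetical names, not ceph API; endian
 * conversion is omitted, so this assumes a little-endian host):
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int buf_need(void **p, void *end, size_t n)
{
	return (size_t)((char *)end - (char *)*p) >= n;   /* bytes left? */
}

static int buf_get32(void **p, void *end, uint32_t *v)
{
	if (!buf_need(p, end, sizeof(*v)))
		return -1;                  /* would read past `end` */
	memcpy(v, *p, sizeof(*v));
	*p = (char *)*p + sizeof(*v);       /* advance the cursor */
	return 0;
}

int main(void)
{
	unsigned char msg[] = { 3, 0, 0, 0, 'a', 'b', 'c' };  /* len+payload */
	void *p = msg, *end = msg + sizeof(msg);
	uint32_t len;

	if (buf_get32(&p, end, &len) || !buf_need(&p, end, len))
		return 1;                   /* truncated or over-long claim */
	printf("payload '%.*s'\n", (int)len, (char *)p);
	return 0;
}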
92 92
93 /* 93 /*
94 * parse a normal reply, which may contain a (dir+)dentry and/or a 94 * parse a normal reply, which may contain a (dir+)dentry and/or a
95 * target inode. 95 * target inode.
96 */ 96 */
97 static int parse_reply_info_trace(void **p, void *end, 97 static int parse_reply_info_trace(void **p, void *end,
98 struct ceph_mds_reply_info_parsed *info, 98 struct ceph_mds_reply_info_parsed *info,
99 int features) 99 int features)
100 { 100 {
101 int err; 101 int err;
102 102
103 if (info->head->is_dentry) { 103 if (info->head->is_dentry) {
104 err = parse_reply_info_in(p, end, &info->diri, features); 104 err = parse_reply_info_in(p, end, &info->diri, features);
105 if (err < 0) 105 if (err < 0)
106 goto out_bad; 106 goto out_bad;
107 107
108 if (unlikely(*p + sizeof(*info->dirfrag) > end)) 108 if (unlikely(*p + sizeof(*info->dirfrag) > end))
109 goto bad; 109 goto bad;
110 info->dirfrag = *p; 110 info->dirfrag = *p;
111 *p += sizeof(*info->dirfrag) + 111 *p += sizeof(*info->dirfrag) +
112 sizeof(u32)*le32_to_cpu(info->dirfrag->ndist); 112 sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
113 if (unlikely(*p > end)) 113 if (unlikely(*p > end))
114 goto bad; 114 goto bad;
115 115
116 ceph_decode_32_safe(p, end, info->dname_len, bad); 116 ceph_decode_32_safe(p, end, info->dname_len, bad);
117 ceph_decode_need(p, end, info->dname_len, bad); 117 ceph_decode_need(p, end, info->dname_len, bad);
118 info->dname = *p; 118 info->dname = *p;
119 *p += info->dname_len; 119 *p += info->dname_len;
120 info->dlease = *p; 120 info->dlease = *p;
121 *p += sizeof(*info->dlease); 121 *p += sizeof(*info->dlease);
122 } 122 }
123 123
124 if (info->head->is_target) { 124 if (info->head->is_target) {
125 err = parse_reply_info_in(p, end, &info->targeti, features); 125 err = parse_reply_info_in(p, end, &info->targeti, features);
126 if (err < 0) 126 if (err < 0)
127 goto out_bad; 127 goto out_bad;
128 } 128 }
129 129
130 if (unlikely(*p != end)) 130 if (unlikely(*p != end))
131 goto bad; 131 goto bad;
132 return 0; 132 return 0;
133 133
134 bad: 134 bad:
135 err = -EIO; 135 err = -EIO;
136 out_bad: 136 out_bad:
137 pr_err("problem parsing mds trace %d\n", err); 137 pr_err("problem parsing mds trace %d\n", err);
138 return err; 138 return err;
139 } 139 }
140 140
141 /* 141 /*
142 * parse readdir results 142 * parse readdir results
143 */ 143 */
144 static int parse_reply_info_dir(void **p, void *end, 144 static int parse_reply_info_dir(void **p, void *end,
145 struct ceph_mds_reply_info_parsed *info, 145 struct ceph_mds_reply_info_parsed *info,
146 int features) 146 int features)
147 { 147 {
148 u32 num, i = 0; 148 u32 num, i = 0;
149 int err; 149 int err;
150 150
151 info->dir_dir = *p; 151 info->dir_dir = *p;
152 if (*p + sizeof(*info->dir_dir) > end) 152 if (*p + sizeof(*info->dir_dir) > end)
153 goto bad; 153 goto bad;
154 *p += sizeof(*info->dir_dir) + 154 *p += sizeof(*info->dir_dir) +
155 sizeof(u32)*le32_to_cpu(info->dir_dir->ndist); 155 sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
156 if (*p > end) 156 if (*p > end)
157 goto bad; 157 goto bad;
158 158
159 ceph_decode_need(p, end, sizeof(num) + 2, bad); 159 ceph_decode_need(p, end, sizeof(num) + 2, bad);
160 num = ceph_decode_32(p); 160 num = ceph_decode_32(p);
161 info->dir_end = ceph_decode_8(p); 161 info->dir_end = ceph_decode_8(p);
162 info->dir_complete = ceph_decode_8(p); 162 info->dir_complete = ceph_decode_8(p);
163 if (num == 0) 163 if (num == 0)
164 goto done; 164 goto done;
165 165
166 /* alloc large array */ 166 /* alloc large array */
167 info->dir_nr = num; 167 info->dir_nr = num;
168 info->dir_in = kcalloc(num, sizeof(*info->dir_in) + 168 info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
169 sizeof(*info->dir_dname) + 169 sizeof(*info->dir_dname) +
170 sizeof(*info->dir_dname_len) + 170 sizeof(*info->dir_dname_len) +
171 sizeof(*info->dir_dlease), 171 sizeof(*info->dir_dlease),
172 GFP_NOFS); 172 GFP_NOFS);
173 if (info->dir_in == NULL) { 173 if (info->dir_in == NULL) {
174 err = -ENOMEM; 174 err = -ENOMEM;
175 goto out_bad; 175 goto out_bad;
176 } 176 }
177 info->dir_dname = (void *)(info->dir_in + num); 177 info->dir_dname = (void *)(info->dir_in + num);
178 info->dir_dname_len = (void *)(info->dir_dname + num); 178 info->dir_dname_len = (void *)(info->dir_dname + num);
179 info->dir_dlease = (void *)(info->dir_dname_len + num); 179 info->dir_dlease = (void *)(info->dir_dname_len + num);
180 180
181 while (num) { 181 while (num) {
182 /* dentry */ 182 /* dentry */
183 ceph_decode_need(p, end, sizeof(u32)*2, bad); 183 ceph_decode_need(p, end, sizeof(u32)*2, bad);
184 info->dir_dname_len[i] = ceph_decode_32(p); 184 info->dir_dname_len[i] = ceph_decode_32(p);
185 ceph_decode_need(p, end, info->dir_dname_len[i], bad); 185 ceph_decode_need(p, end, info->dir_dname_len[i], bad);
186 info->dir_dname[i] = *p; 186 info->dir_dname[i] = *p;
187 *p += info->dir_dname_len[i]; 187 *p += info->dir_dname_len[i];
188 dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i], 188 dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
189 info->dir_dname[i]); 189 info->dir_dname[i]);
190 info->dir_dlease[i] = *p; 190 info->dir_dlease[i] = *p;
191 *p += sizeof(struct ceph_mds_reply_lease); 191 *p += sizeof(struct ceph_mds_reply_lease);
192 192
193 /* inode */ 193 /* inode */
194 err = parse_reply_info_in(p, end, &info->dir_in[i], features); 194 err = parse_reply_info_in(p, end, &info->dir_in[i], features);
195 if (err < 0) 195 if (err < 0)
196 goto out_bad; 196 goto out_bad;
197 i++; 197 i++;
198 num--; 198 num--;
199 } 199 }
200 200
201 done: 201 done:
202 if (*p != end) 202 if (*p != end)
203 goto bad; 203 goto bad;
204 return 0; 204 return 0;
205 205
206 bad: 206 bad:
207 err = -EIO; 207 err = -EIO;
208 out_bad: 208 out_bad:
209 pr_err("problem parsing dir contents %d\n", err); 209 pr_err("problem parsing dir contents %d\n", err);
210 return err; 210 return err;
211 } 211 }
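/* [editor's sketch, not part of the diff] parse_reply_info_dir() sizes
 * one kcalloc() for four parallel per-entry arrays and carves it up
 * with pointer arithmetic (dir_dname = dir_in + num, ...), so a single
 * allocation, and the single kfree() in destroy_reply_info(), covers
 * all of them. Userspace shape of that carve-up, with hypothetical
 * element types:
 */
#include <stdlib.h>

struct entry { long a; };               /* stand-in for the dirent info */

int main(void)
{
	size_t num = 8;
	struct entry *in;
	char **names;
	unsigned int *name_lens;

	/* one block: num entries, then num name pointers, then num lengths;
	 * ordering from most- to least-aligned keeps the carve-up safe */
	in = calloc(num, sizeof(*in) + sizeof(*names) + sizeof(*name_lens));
	if (!in)
		return 1;
	names = (char **)(in + num);            /* second array starts here */
	name_lens = (unsigned int *)(names + num);
	name_lens[num - 1] = 42;                /* all within the one block */
	free(in);                               /* one free releases all three */
	return 0;
}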
212 212
213 /* 213 /*
214 * parse fcntl F_GETLK results 214 * parse fcntl F_GETLK results
215 */ 215 */
216 static int parse_reply_info_filelock(void **p, void *end, 216 static int parse_reply_info_filelock(void **p, void *end,
217 struct ceph_mds_reply_info_parsed *info, 217 struct ceph_mds_reply_info_parsed *info,
218 int features) 218 int features)
219 { 219 {
220 if (*p + sizeof(*info->filelock_reply) > end) 220 if (*p + sizeof(*info->filelock_reply) > end)
221 goto bad; 221 goto bad;
222 222
223 info->filelock_reply = *p; 223 info->filelock_reply = *p;
224 *p += sizeof(*info->filelock_reply); 224 *p += sizeof(*info->filelock_reply);
225 225
226 if (unlikely(*p != end)) 226 if (unlikely(*p != end))
227 goto bad; 227 goto bad;
228 return 0; 228 return 0;
229 229
230 bad: 230 bad:
231 return -EIO; 231 return -EIO;
232 } 232 }
233 233
234 /* 234 /*
235 * parse extra results 235 * parse extra results
236 */ 236 */
237 static int parse_reply_info_extra(void **p, void *end, 237 static int parse_reply_info_extra(void **p, void *end,
238 struct ceph_mds_reply_info_parsed *info, 238 struct ceph_mds_reply_info_parsed *info,
239 int features) 239 int features)
240 { 240 {
241 if (info->head->op == CEPH_MDS_OP_GETFILELOCK) 241 if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
242 return parse_reply_info_filelock(p, end, info, features); 242 return parse_reply_info_filelock(p, end, info, features);
243 else 243 else
244 return parse_reply_info_dir(p, end, info, features); 244 return parse_reply_info_dir(p, end, info, features);
245 } 245 }
246 246
247 /* 247 /*
248 * parse entire mds reply 248 * parse entire mds reply
249 */ 249 */
250 static int parse_reply_info(struct ceph_msg *msg, 250 static int parse_reply_info(struct ceph_msg *msg,
251 struct ceph_mds_reply_info_parsed *info, 251 struct ceph_mds_reply_info_parsed *info,
252 int features) 252 int features)
253 { 253 {
254 void *p, *end; 254 void *p, *end;
255 u32 len; 255 u32 len;
256 int err; 256 int err;
257 257
258 info->head = msg->front.iov_base; 258 info->head = msg->front.iov_base;
259 p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head); 259 p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
260 end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head); 260 end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
261 261
262 /* trace */ 262 /* trace */
263 ceph_decode_32_safe(&p, end, len, bad); 263 ceph_decode_32_safe(&p, end, len, bad);
264 if (len > 0) { 264 if (len > 0) {
265 ceph_decode_need(&p, end, len, bad);
265 err = parse_reply_info_trace(&p, p+len, info, features); 266 err = parse_reply_info_trace(&p, p+len, info, features);
266 if (err < 0) 267 if (err < 0)
267 goto out_bad; 268 goto out_bad;
268 } 269 }
269 270
270 /* extra */ 271 /* extra */
271 ceph_decode_32_safe(&p, end, len, bad); 272 ceph_decode_32_safe(&p, end, len, bad);
272 if (len > 0) { 273 if (len > 0) {
274 ceph_decode_need(&p, end, len, bad);
273 err = parse_reply_info_extra(&p, p+len, info, features); 275 err = parse_reply_info_extra(&p, p+len, info, features);
274 if (err < 0) 276 if (err < 0)
275 goto out_bad; 277 goto out_bad;
276 } 278 }
277 279
278 /* snap blob */ 280 /* snap blob */
279 ceph_decode_32_safe(&p, end, len, bad); 281 ceph_decode_32_safe(&p, end, len, bad);
280 info->snapblob_len = len; 282 info->snapblob_len = len;
281 info->snapblob = p; 283 info->snapblob = p;
282 p += len; 284 p += len;
283 285
284 if (p != end) 286 if (p != end)
285 goto bad; 287 goto bad;
286 return 0; 288 return 0;
287 289
288 bad: 290 bad:
289 err = -EIO; 291 err = -EIO;
290 out_bad: 292 out_bad:
291 pr_err("mds parse_reply err %d\n", err); 293 pr_err("mds parse_reply err %d\n", err);
292 return err; 294 return err;
293 } 295 }
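/* [editor's note + sketch, not part of the diff] The two lines added
 * above (new lines 265 and 274) bound the declared length of the
 * trace/extra section against the real end of the message before
 * `p + len` is handed to the sub-parser as its `end`. Without that
 * check a corrupt or hostile `len` would place the sub-parser's bound
 * past the buffer. The guard in isolation (section_fits is a
 * hypothetical name):
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* equivalent of ceph_decode_need(): does a claimed `len` fit before `end`? */
static int section_fits(const void *p, const void *end, uint32_t len)
{
	return (size_t)((const char *)end - (const char *)p) >= len;
}

int main(void)
{
	char msg[16];
	const void *p = msg, *end = msg + sizeof(msg);

	assert(section_fits(p, end, 16));   /* exactly fills the message */
	assert(!section_fits(p, end, 17));  /* claims more than remains */
	return 0;
}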
294 296
295 static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) 297 static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
296 { 298 {
297 kfree(info->dir_in); 299 kfree(info->dir_in);
298 } 300 }
299 301
300 302
301 /* 303 /*
302 * sessions 304 * sessions
303 */ 305 */
304 static const char *session_state_name(int s) 306 static const char *session_state_name(int s)
305 { 307 {
306 switch (s) { 308 switch (s) {
307 case CEPH_MDS_SESSION_NEW: return "new"; 309 case CEPH_MDS_SESSION_NEW: return "new";
308 case CEPH_MDS_SESSION_OPENING: return "opening"; 310 case CEPH_MDS_SESSION_OPENING: return "opening";
309 case CEPH_MDS_SESSION_OPEN: return "open"; 311 case CEPH_MDS_SESSION_OPEN: return "open";
310 case CEPH_MDS_SESSION_HUNG: return "hung"; 312 case CEPH_MDS_SESSION_HUNG: return "hung";
311 case CEPH_MDS_SESSION_CLOSING: return "closing"; 313 case CEPH_MDS_SESSION_CLOSING: return "closing";
312 case CEPH_MDS_SESSION_RESTARTING: return "restarting"; 314 case CEPH_MDS_SESSION_RESTARTING: return "restarting";
313 case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting"; 315 case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
314 default: return "???"; 316 default: return "???";
315 } 317 }
316 } 318 }
317 319
318 static struct ceph_mds_session *get_session(struct ceph_mds_session *s) 320 static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
319 { 321 {
320 if (atomic_inc_not_zero(&s->s_ref)) { 322 if (atomic_inc_not_zero(&s->s_ref)) {
321 dout("mdsc get_session %p %d -> %d\n", s, 323 dout("mdsc get_session %p %d -> %d\n", s,
322 atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref)); 324 atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
323 return s; 325 return s;
324 } else { 326 } else {
325 dout("mdsc get_session %p 0 -- FAIL", s); 327 dout("mdsc get_session %p 0 -- FAIL", s);
326 return NULL; 328 return NULL;
327 } 329 }
328 } 330 }
329 331
330 void ceph_put_mds_session(struct ceph_mds_session *s) 332 void ceph_put_mds_session(struct ceph_mds_session *s)
331 { 333 {
332 dout("mdsc put_session %p %d -> %d\n", s, 334 dout("mdsc put_session %p %d -> %d\n", s,
333 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); 335 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
334 if (atomic_dec_and_test(&s->s_ref)) { 336 if (atomic_dec_and_test(&s->s_ref)) {
335 if (s->s_authorizer) 337 if (s->s_authorizer)
336 s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer( 338 s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
337 s->s_mdsc->fsc->client->monc.auth, 339 s->s_mdsc->fsc->client->monc.auth,
338 s->s_authorizer); 340 s->s_authorizer);
339 kfree(s); 341 kfree(s);
340 } 342 }
341 } 343 }
342 344
343 /* 345 /*
344 * called under mdsc->mutex 346 * called under mdsc->mutex
345 */ 347 */
346 struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, 348 struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
347 int mds) 349 int mds)
348 { 350 {
349 struct ceph_mds_session *session; 351 struct ceph_mds_session *session;
350 352
351 if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL) 353 if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
352 return NULL; 354 return NULL;
353 session = mdsc->sessions[mds]; 355 session = mdsc->sessions[mds];
354 dout("lookup_mds_session %p %d\n", session, 356 dout("lookup_mds_session %p %d\n", session,
355 atomic_read(&session->s_ref)); 357 atomic_read(&session->s_ref));
356 get_session(session); 358 get_session(session);
357 return session; 359 return session;
358 } 360 }
359 361
360 static bool __have_session(struct ceph_mds_client *mdsc, int mds) 362 static bool __have_session(struct ceph_mds_client *mdsc, int mds)
361 { 363 {
362 if (mds >= mdsc->max_sessions) 364 if (mds >= mdsc->max_sessions)
363 return false; 365 return false;
364 return mdsc->sessions[mds]; 366 return mdsc->sessions[mds];
365 } 367 }
366 368
367 static int __verify_registered_session(struct ceph_mds_client *mdsc, 369 static int __verify_registered_session(struct ceph_mds_client *mdsc,
368 struct ceph_mds_session *s) 370 struct ceph_mds_session *s)
369 { 371 {
370 if (s->s_mds >= mdsc->max_sessions || 372 if (s->s_mds >= mdsc->max_sessions ||
371 mdsc->sessions[s->s_mds] != s) 373 mdsc->sessions[s->s_mds] != s)
372 return -ENOENT; 374 return -ENOENT;
373 return 0; 375 return 0;
374 } 376 }
375 377
376 /* 378 /*
377 * create+register a new session for given mds. 379 * create+register a new session for given mds.
378 * called under mdsc->mutex. 380 * called under mdsc->mutex.
379 */ 381 */
380 static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, 382 static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
381 int mds) 383 int mds)
382 { 384 {
383 struct ceph_mds_session *s; 385 struct ceph_mds_session *s;
384 386
385 s = kzalloc(sizeof(*s), GFP_NOFS); 387 s = kzalloc(sizeof(*s), GFP_NOFS);
386 if (!s) 388 if (!s)
387 return ERR_PTR(-ENOMEM); 389 return ERR_PTR(-ENOMEM);
388 s->s_mdsc = mdsc; 390 s->s_mdsc = mdsc;
389 s->s_mds = mds; 391 s->s_mds = mds;
390 s->s_state = CEPH_MDS_SESSION_NEW; 392 s->s_state = CEPH_MDS_SESSION_NEW;
391 s->s_ttl = 0; 393 s->s_ttl = 0;
392 s->s_seq = 0; 394 s->s_seq = 0;
393 mutex_init(&s->s_mutex); 395 mutex_init(&s->s_mutex);
394 396
395 ceph_con_init(mdsc->fsc->client->msgr, &s->s_con); 397 ceph_con_init(mdsc->fsc->client->msgr, &s->s_con);
396 s->s_con.private = s; 398 s->s_con.private = s;
397 s->s_con.ops = &mds_con_ops; 399 s->s_con.ops = &mds_con_ops;
398 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; 400 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
399 s->s_con.peer_name.num = cpu_to_le64(mds); 401 s->s_con.peer_name.num = cpu_to_le64(mds);
400 402
401 spin_lock_init(&s->s_cap_lock); 403 spin_lock_init(&s->s_gen_ttl_lock);
402 s->s_cap_gen = 0; 404 s->s_cap_gen = 0;
403 s->s_cap_ttl = 0; 405 s->s_cap_ttl = 0;
406
407 spin_lock_init(&s->s_cap_lock);
404 s->s_renew_requested = 0; 408 s->s_renew_requested = 0;
405 s->s_renew_seq = 0; 409 s->s_renew_seq = 0;
406 INIT_LIST_HEAD(&s->s_caps); 410 INIT_LIST_HEAD(&s->s_caps);
407 s->s_nr_caps = 0; 411 s->s_nr_caps = 0;
408 s->s_trim_caps = 0; 412 s->s_trim_caps = 0;
409 atomic_set(&s->s_ref, 1); 413 atomic_set(&s->s_ref, 1);
410 INIT_LIST_HEAD(&s->s_waiting); 414 INIT_LIST_HEAD(&s->s_waiting);
411 INIT_LIST_HEAD(&s->s_unsafe); 415 INIT_LIST_HEAD(&s->s_unsafe);
412 s->s_num_cap_releases = 0; 416 s->s_num_cap_releases = 0;
413 s->s_cap_iterator = NULL; 417 s->s_cap_iterator = NULL;
414 INIT_LIST_HEAD(&s->s_cap_releases); 418 INIT_LIST_HEAD(&s->s_cap_releases);
415 INIT_LIST_HEAD(&s->s_cap_releases_done); 419 INIT_LIST_HEAD(&s->s_cap_releases_done);
416 INIT_LIST_HEAD(&s->s_cap_flushing); 420 INIT_LIST_HEAD(&s->s_cap_flushing);
417 INIT_LIST_HEAD(&s->s_cap_snaps_flushing); 421 INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
418 422
419 dout("register_session mds%d\n", mds); 423 dout("register_session mds%d\n", mds);
420 if (mds >= mdsc->max_sessions) { 424 if (mds >= mdsc->max_sessions) {
421 int newmax = 1 << get_count_order(mds+1); 425 int newmax = 1 << get_count_order(mds+1);
422 struct ceph_mds_session **sa; 426 struct ceph_mds_session **sa;
423 427
424 dout("register_session realloc to %d\n", newmax); 428 dout("register_session realloc to %d\n", newmax);
425 sa = kcalloc(newmax, sizeof(void *), GFP_NOFS); 429 sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
426 if (sa == NULL) 430 if (sa == NULL)
427 goto fail_realloc; 431 goto fail_realloc;
428 if (mdsc->sessions) { 432 if (mdsc->sessions) {
429 memcpy(sa, mdsc->sessions, 433 memcpy(sa, mdsc->sessions,
430 mdsc->max_sessions * sizeof(void *)); 434 mdsc->max_sessions * sizeof(void *));
431 kfree(mdsc->sessions); 435 kfree(mdsc->sessions);
432 } 436 }
433 mdsc->sessions = sa; 437 mdsc->sessions = sa;
434 mdsc->max_sessions = newmax; 438 mdsc->max_sessions = newmax;
435 } 439 }
436 mdsc->sessions[mds] = s; 440 mdsc->sessions[mds] = s;
437 atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ 441 atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
438 442
439 ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); 443 ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
440 444
441 return s; 445 return s;
442 446
443 fail_realloc: 447 fail_realloc:
444 kfree(s); 448 kfree(s);
445 return ERR_PTR(-ENOMEM); 449 return ERR_PTR(-ENOMEM);
446 } 450 }
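/* [editor's sketch, not part of the diff] The realloc above rounds the
 * session array up to the next power of two: newmax =
 * 1 << get_count_order(mds + 1), i.e. 2^ceil(log2(mds + 1)), so
 * repeated registrations trigger O(log n) reallocations rather than
 * one per new mds. A portable stand-in for get_count_order()
 * (count_order is a hypothetical name, valid for n >= 1):
 */
#include <assert.h>

static int count_order(unsigned int n)
{
	int order = 0;

	while ((1u << order) < n)       /* smallest order with 2^order >= n */
		order++;
	return order;
}

int main(void)
{
	assert((1u << count_order(1)) == 1);
	assert((1u << count_order(5)) == 8);   /* mds 4 -> array of 8 slots */
	assert((1u << count_order(8)) == 8);
	return 0;
}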
447 451
448 /* 452 /*
449 * called under mdsc->mutex 453 * called under mdsc->mutex
450 */ 454 */
451 static void __unregister_session(struct ceph_mds_client *mdsc, 455 static void __unregister_session(struct ceph_mds_client *mdsc,
452 struct ceph_mds_session *s) 456 struct ceph_mds_session *s)
453 { 457 {
454 dout("__unregister_session mds%d %p\n", s->s_mds, s); 458 dout("__unregister_session mds%d %p\n", s->s_mds, s);
455 BUG_ON(mdsc->sessions[s->s_mds] != s); 459 BUG_ON(mdsc->sessions[s->s_mds] != s);
456 mdsc->sessions[s->s_mds] = NULL; 460 mdsc->sessions[s->s_mds] = NULL;
457 ceph_con_close(&s->s_con); 461 ceph_con_close(&s->s_con);
458 ceph_put_mds_session(s); 462 ceph_put_mds_session(s);
459 } 463 }
460 464
461 /* 465 /*
462 * drop session refs in request. 466 * drop session refs in request.
463 * 467 *
464 * should be last request ref, or hold mdsc->mutex 468 * should be last request ref, or hold mdsc->mutex
465 */ 469 */
466 static void put_request_session(struct ceph_mds_request *req) 470 static void put_request_session(struct ceph_mds_request *req)
467 { 471 {
468 if (req->r_session) { 472 if (req->r_session) {
469 ceph_put_mds_session(req->r_session); 473 ceph_put_mds_session(req->r_session);
470 req->r_session = NULL; 474 req->r_session = NULL;
471 } 475 }
472 } 476 }
473 477
474 void ceph_mdsc_release_request(struct kref *kref) 478 void ceph_mdsc_release_request(struct kref *kref)
475 { 479 {
476 struct ceph_mds_request *req = container_of(kref, 480 struct ceph_mds_request *req = container_of(kref,
477 struct ceph_mds_request, 481 struct ceph_mds_request,
478 r_kref); 482 r_kref);
479 if (req->r_request) 483 if (req->r_request)
480 ceph_msg_put(req->r_request); 484 ceph_msg_put(req->r_request);
481 if (req->r_reply) { 485 if (req->r_reply) {
482 ceph_msg_put(req->r_reply); 486 ceph_msg_put(req->r_reply);
483 destroy_reply_info(&req->r_reply_info); 487 destroy_reply_info(&req->r_reply_info);
484 } 488 }
485 if (req->r_inode) { 489 if (req->r_inode) {
486 ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); 490 ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
487 iput(req->r_inode); 491 iput(req->r_inode);
488 } 492 }
489 if (req->r_locked_dir) 493 if (req->r_locked_dir)
490 ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); 494 ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
491 if (req->r_target_inode) 495 if (req->r_target_inode)
492 iput(req->r_target_inode); 496 iput(req->r_target_inode);
493 if (req->r_dentry) 497 if (req->r_dentry)
494 dput(req->r_dentry); 498 dput(req->r_dentry);
495 if (req->r_old_dentry) { 499 if (req->r_old_dentry) {
496 /* 500 /*
497 * track (and drop pins for) r_old_dentry_dir 501 * track (and drop pins for) r_old_dentry_dir
498 * separately, since r_old_dentry's d_parent may have 502 * separately, since r_old_dentry's d_parent may have
499 * changed between the dir mutex being dropped and 503 * changed between the dir mutex being dropped and
500 * this request being freed. 504 * this request being freed.
501 */ 505 */
502 ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), 506 ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
503 CEPH_CAP_PIN); 507 CEPH_CAP_PIN);
504 dput(req->r_old_dentry); 508 dput(req->r_old_dentry);
505 iput(req->r_old_dentry_dir); 509 iput(req->r_old_dentry_dir);
506 } 510 }
507 kfree(req->r_path1); 511 kfree(req->r_path1);
508 kfree(req->r_path2); 512 kfree(req->r_path2);
509 put_request_session(req); 513 put_request_session(req);
510 ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation); 514 ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
511 kfree(req); 515 kfree(req);
512 } 516 }
513 517
514 /* 518 /*
515 * lookup request, bump ref if found. 519 * lookup request, bump ref if found.
516 * 520 *
517 * called under mdsc->mutex. 521 * called under mdsc->mutex.
518 */ 522 */
519 static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, 523 static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
520 u64 tid) 524 u64 tid)
521 { 525 {
522 struct ceph_mds_request *req; 526 struct ceph_mds_request *req;
523 struct rb_node *n = mdsc->request_tree.rb_node; 527 struct rb_node *n = mdsc->request_tree.rb_node;
524 528
525 while (n) { 529 while (n) {
526 req = rb_entry(n, struct ceph_mds_request, r_node); 530 req = rb_entry(n, struct ceph_mds_request, r_node);
527 if (tid < req->r_tid) 531 if (tid < req->r_tid)
528 n = n->rb_left; 532 n = n->rb_left;
529 else if (tid > req->r_tid) 533 else if (tid > req->r_tid)
530 n = n->rb_right; 534 n = n->rb_right;
531 else { 535 else {
532 ceph_mdsc_get_request(req); 536 ceph_mdsc_get_request(req);
533 return req; 537 return req;
534 } 538 }
535 } 539 }
536 return NULL; 540 return NULL;
537 } 541 }
538 542
539 static void __insert_request(struct ceph_mds_client *mdsc, 543 static void __insert_request(struct ceph_mds_client *mdsc,
540 struct ceph_mds_request *new) 544 struct ceph_mds_request *new)
541 { 545 {
542 struct rb_node **p = &mdsc->request_tree.rb_node; 546 struct rb_node **p = &mdsc->request_tree.rb_node;
543 struct rb_node *parent = NULL; 547 struct rb_node *parent = NULL;
544 struct ceph_mds_request *req = NULL; 548 struct ceph_mds_request *req = NULL;
545 549
546 while (*p) { 550 while (*p) {
547 parent = *p; 551 parent = *p;
548 req = rb_entry(parent, struct ceph_mds_request, r_node); 552 req = rb_entry(parent, struct ceph_mds_request, r_node);
549 if (new->r_tid < req->r_tid) 553 if (new->r_tid < req->r_tid)
550 p = &(*p)->rb_left; 554 p = &(*p)->rb_left;
551 else if (new->r_tid > req->r_tid) 555 else if (new->r_tid > req->r_tid)
552 p = &(*p)->rb_right; 556 p = &(*p)->rb_right;
553 else 557 else
554 BUG(); 558 BUG();
555 } 559 }
556 560
557 rb_link_node(&new->r_node, parent, p); 561 rb_link_node(&new->r_node, parent, p);
558 rb_insert_color(&new->r_node, &mdsc->request_tree); 562 rb_insert_color(&new->r_node, &mdsc->request_tree);
559 } 563 }
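/* [editor's sketch, not part of the diff] __lookup_request() and
 * __insert_request() keep in-flight requests in an rbtree ordered by
 * tid; the walk below is the same descend-left/descend-right logic on
 * a plain (unbalanced) BST, which is what rb_link_node() +
 * rb_insert_color() perform before rebalancing. Hypothetical
 * standalone code, not the kernel rbtree:
 */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

struct req {
	uint64_t tid;
	struct req *left, *right;
};

static void insert_req(struct req **root, struct req *new)
{
	struct req **p = root;

	while (*p) {
		if (new->tid < (*p)->tid)
			p = &(*p)->left;
		else if (new->tid > (*p)->tid)
			p = &(*p)->right;
		else
			abort();        /* duplicate tid: caller bug (BUG()) */
	}
	*p = new;                       /* link into the empty slot */
}

static struct req *lookup_req(struct req *n, uint64_t tid)
{
	while (n && n->tid != tid)
		n = tid < n->tid ? n->left : n->right;
	return n;
}

int main(void)
{
	struct req a = { .tid = 2 }, b = { .tid = 1 }, c = { .tid = 3 };
	struct req *root = NULL;

	insert_req(&root, &a);
	insert_req(&root, &b);
	insert_req(&root, &c);
	assert(lookup_req(root, 3) == &c);
	assert(lookup_req(root, 9) == NULL);
	return 0;
}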
560 564
561 /* 565 /*
562 * Register an in-flight request, and assign a tid. Link to directory 566 * Register an in-flight request, and assign a tid. Link to directory
563 * are modifying (if any). 567 * are modifying (if any).
564 * 568 *
565 * Called under mdsc->mutex. 569 * Called under mdsc->mutex.
566 */ 570 */
567 static void __register_request(struct ceph_mds_client *mdsc, 571 static void __register_request(struct ceph_mds_client *mdsc,
568 struct ceph_mds_request *req, 572 struct ceph_mds_request *req,
569 struct inode *dir) 573 struct inode *dir)
570 { 574 {
571 req->r_tid = ++mdsc->last_tid; 575 req->r_tid = ++mdsc->last_tid;
572 if (req->r_num_caps) 576 if (req->r_num_caps)
573 ceph_reserve_caps(mdsc, &req->r_caps_reservation, 577 ceph_reserve_caps(mdsc, &req->r_caps_reservation,
574 req->r_num_caps); 578 req->r_num_caps);
575 dout("__register_request %p tid %lld\n", req, req->r_tid); 579 dout("__register_request %p tid %lld\n", req, req->r_tid);
576 ceph_mdsc_get_request(req); 580 ceph_mdsc_get_request(req);
577 __insert_request(mdsc, req); 581 __insert_request(mdsc, req);
578 582
579 req->r_uid = current_fsuid(); 583 req->r_uid = current_fsuid();
580 req->r_gid = current_fsgid(); 584 req->r_gid = current_fsgid();
581 585
582 if (dir) { 586 if (dir) {
583 struct ceph_inode_info *ci = ceph_inode(dir); 587 struct ceph_inode_info *ci = ceph_inode(dir);
584 588
585 ihold(dir); 589 ihold(dir);
586 spin_lock(&ci->i_unsafe_lock); 590 spin_lock(&ci->i_unsafe_lock);
587 req->r_unsafe_dir = dir; 591 req->r_unsafe_dir = dir;
588 list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops); 592 list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
589 spin_unlock(&ci->i_unsafe_lock); 593 spin_unlock(&ci->i_unsafe_lock);
590 } 594 }
591 } 595 }
592 596
593 static void __unregister_request(struct ceph_mds_client *mdsc, 597 static void __unregister_request(struct ceph_mds_client *mdsc,
594 struct ceph_mds_request *req) 598 struct ceph_mds_request *req)
595 { 599 {
596 dout("__unregister_request %p tid %lld\n", req, req->r_tid); 600 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
597 rb_erase(&req->r_node, &mdsc->request_tree); 601 rb_erase(&req->r_node, &mdsc->request_tree);
598 RB_CLEAR_NODE(&req->r_node); 602 RB_CLEAR_NODE(&req->r_node);
599 603
600 if (req->r_unsafe_dir) { 604 if (req->r_unsafe_dir) {
601 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); 605 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
602 606
603 spin_lock(&ci->i_unsafe_lock); 607 spin_lock(&ci->i_unsafe_lock);
604 list_del_init(&req->r_unsafe_dir_item); 608 list_del_init(&req->r_unsafe_dir_item);
605 spin_unlock(&ci->i_unsafe_lock); 609 spin_unlock(&ci->i_unsafe_lock);
606 610
607 iput(req->r_unsafe_dir); 611 iput(req->r_unsafe_dir);
608 req->r_unsafe_dir = NULL; 612 req->r_unsafe_dir = NULL;
609 } 613 }
610 614
611 ceph_mdsc_put_request(req); 615 ceph_mdsc_put_request(req);
612 } 616 }
613 617
614 /* 618 /*
615 * Choose mds to send request to next. If there is a hint set in the 619 * Choose mds to send request to next. If there is a hint set in the
616 * request (e.g., due to a prior forward hint from the mds), use that. 620 * request (e.g., due to a prior forward hint from the mds), use that.
617 * Otherwise, consult frag tree and/or caps to identify the 621 * Otherwise, consult frag tree and/or caps to identify the
618 * appropriate mds. If all else fails, choose randomly. 622 * appropriate mds. If all else fails, choose randomly.
619 * 623 *
620 * Called under mdsc->mutex. 624 * Called under mdsc->mutex.
621 */ 625 */
622 static struct dentry *get_nonsnap_parent(struct dentry *dentry) 626 static struct dentry *get_nonsnap_parent(struct dentry *dentry)
623 { 627 {
624 /* 628 /*
625 * we don't need to worry about protecting the d_parent access 629 * we don't need to worry about protecting the d_parent access
626 * here because we never rename inside the snapped namespace 630 * here because we never rename inside the snapped namespace
627 * except to resplice to another snapdir, and either the old or new 631 * except to resplice to another snapdir, and either the old or new
628 * result is a valid result. 632 * result is a valid result.
629 */ 633 */
630 while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP) 634 while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
631 dentry = dentry->d_parent; 635 dentry = dentry->d_parent;
632 return dentry; 636 return dentry;
633 } 637 }
634 638
635 static int __choose_mds(struct ceph_mds_client *mdsc, 639 static int __choose_mds(struct ceph_mds_client *mdsc,
636 struct ceph_mds_request *req) 640 struct ceph_mds_request *req)
637 { 641 {
638 struct inode *inode; 642 struct inode *inode;
639 struct ceph_inode_info *ci; 643 struct ceph_inode_info *ci;
640 struct ceph_cap *cap; 644 struct ceph_cap *cap;
641 int mode = req->r_direct_mode; 645 int mode = req->r_direct_mode;
642 int mds = -1; 646 int mds = -1;
643 u32 hash = req->r_direct_hash; 647 u32 hash = req->r_direct_hash;
644 bool is_hash = req->r_direct_is_hash; 648 bool is_hash = req->r_direct_is_hash;
645 649
646 /* 650 /*
647 * is there a specific mds we should try? ignore hint if we have 651 * is there a specific mds we should try? ignore hint if we have
648 * no session and the mds is not up (active or recovering). 652 * no session and the mds is not up (active or recovering).
649 */ 653 */
650 if (req->r_resend_mds >= 0 && 654 if (req->r_resend_mds >= 0 &&
651 (__have_session(mdsc, req->r_resend_mds) || 655 (__have_session(mdsc, req->r_resend_mds) ||
652 ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) { 656 ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
653 dout("choose_mds using resend_mds mds%d\n", 657 dout("choose_mds using resend_mds mds%d\n",
654 req->r_resend_mds); 658 req->r_resend_mds);
655 return req->r_resend_mds; 659 return req->r_resend_mds;
656 } 660 }
657 661
658 if (mode == USE_RANDOM_MDS) 662 if (mode == USE_RANDOM_MDS)
659 goto random; 663 goto random;
660 664
661 inode = NULL; 665 inode = NULL;
662 if (req->r_inode) { 666 if (req->r_inode) {
663 inode = req->r_inode; 667 inode = req->r_inode;
664 } else if (req->r_dentry) { 668 } else if (req->r_dentry) {
665 /* ignore race with rename; old or new d_parent is okay */ 669 /* ignore race with rename; old or new d_parent is okay */
666 struct dentry *parent = req->r_dentry->d_parent; 670 struct dentry *parent = req->r_dentry->d_parent;
667 struct inode *dir = parent->d_inode; 671 struct inode *dir = parent->d_inode;
668 672
669 if (dir->i_sb != mdsc->fsc->sb) { 673 if (dir->i_sb != mdsc->fsc->sb) {
670 /* not this fs! */ 674 /* not this fs! */
671 inode = req->r_dentry->d_inode; 675 inode = req->r_dentry->d_inode;
672 } else if (ceph_snap(dir) != CEPH_NOSNAP) { 676 } else if (ceph_snap(dir) != CEPH_NOSNAP) {
673 /* direct snapped/virtual snapdir requests 677 /* direct snapped/virtual snapdir requests
674 * based on parent dir inode */ 678 * based on parent dir inode */
675 struct dentry *dn = get_nonsnap_parent(parent); 679 struct dentry *dn = get_nonsnap_parent(parent);
676 inode = dn->d_inode; 680 inode = dn->d_inode;
677 dout("__choose_mds using nonsnap parent %p\n", inode); 681 dout("__choose_mds using nonsnap parent %p\n", inode);
678 } else if (req->r_dentry->d_inode) { 682 } else if (req->r_dentry->d_inode) {
679 /* dentry target */ 683 /* dentry target */
680 inode = req->r_dentry->d_inode; 684 inode = req->r_dentry->d_inode;
681 } else { 685 } else {
682 /* dir + name */ 686 /* dir + name */
683 inode = dir; 687 inode = dir;
684 hash = ceph_dentry_hash(dir, req->r_dentry); 688 hash = ceph_dentry_hash(dir, req->r_dentry);
685 is_hash = true; 689 is_hash = true;
686 } 690 }
687 } 691 }
688 692
689 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, 693 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
690 (int)hash, mode); 694 (int)hash, mode);
691 if (!inode) 695 if (!inode)
692 goto random; 696 goto random;
693 ci = ceph_inode(inode); 697 ci = ceph_inode(inode);
694 698
695 if (is_hash && S_ISDIR(inode->i_mode)) { 699 if (is_hash && S_ISDIR(inode->i_mode)) {
696 struct ceph_inode_frag frag; 700 struct ceph_inode_frag frag;
697 int found; 701 int found;
698 702
699 ceph_choose_frag(ci, hash, &frag, &found); 703 ceph_choose_frag(ci, hash, &frag, &found);
700 if (found) { 704 if (found) {
701 if (mode == USE_ANY_MDS && frag.ndist > 0) { 705 if (mode == USE_ANY_MDS && frag.ndist > 0) {
702 u8 r; 706 u8 r;
703 707
704 /* choose a random replica */ 708 /* choose a random replica */
705 get_random_bytes(&r, 1); 709 get_random_bytes(&r, 1);
706 r %= frag.ndist; 710 r %= frag.ndist;
707 mds = frag.dist[r]; 711 mds = frag.dist[r];
708 dout("choose_mds %p %llx.%llx " 712 dout("choose_mds %p %llx.%llx "
709 "frag %u mds%d (%d/%d)\n", 713 "frag %u mds%d (%d/%d)\n",
710 inode, ceph_vinop(inode), 714 inode, ceph_vinop(inode),
711 frag.frag, mds, 715 frag.frag, mds,
712 (int)r, frag.ndist); 716 (int)r, frag.ndist);
713 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= 717 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
714 CEPH_MDS_STATE_ACTIVE) 718 CEPH_MDS_STATE_ACTIVE)
715 return mds; 719 return mds;
716 } 720 }
717 721
718 /* since this file/dir wasn't known to be 722 /* since this file/dir wasn't known to be
719 * replicated, we want to look for the 723 * replicated, we want to look for the
720 * authoritative mds. */ 724 * authoritative mds. */
721 mode = USE_AUTH_MDS; 725 mode = USE_AUTH_MDS;
722 if (frag.mds >= 0) { 726 if (frag.mds >= 0) {
723 /* choose auth mds */ 727 /* choose auth mds */
724 mds = frag.mds; 728 mds = frag.mds;
725 dout("choose_mds %p %llx.%llx " 729 dout("choose_mds %p %llx.%llx "
726 "frag %u mds%d (auth)\n", 730 "frag %u mds%d (auth)\n",
727 inode, ceph_vinop(inode), frag.frag, mds); 731 inode, ceph_vinop(inode), frag.frag, mds);
728 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= 732 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
729 CEPH_MDS_STATE_ACTIVE) 733 CEPH_MDS_STATE_ACTIVE)
730 return mds; 734 return mds;
731 } 735 }
732 } 736 }
733 } 737 }
734 738
735 spin_lock(&ci->i_ceph_lock); 739 spin_lock(&ci->i_ceph_lock);
736 cap = NULL; 740 cap = NULL;
737 if (mode == USE_AUTH_MDS) 741 if (mode == USE_AUTH_MDS)
738 cap = ci->i_auth_cap; 742 cap = ci->i_auth_cap;
739 if (!cap && !RB_EMPTY_ROOT(&ci->i_caps)) 743 if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
740 cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); 744 cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
741 if (!cap) { 745 if (!cap) {
742 spin_unlock(&ci->i_ceph_lock); 746 spin_unlock(&ci->i_ceph_lock);
743 goto random; 747 goto random;
744 } 748 }
745 mds = cap->session->s_mds; 749 mds = cap->session->s_mds;
746 dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n", 750 dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
747 inode, ceph_vinop(inode), mds, 751 inode, ceph_vinop(inode), mds,
748 cap == ci->i_auth_cap ? "auth " : "", cap); 752 cap == ci->i_auth_cap ? "auth " : "", cap);
749 spin_unlock(&ci->i_ceph_lock); 753 spin_unlock(&ci->i_ceph_lock);
750 return mds; 754 return mds;
751 755
752 random: 756 random:
753 mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap); 757 mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
754 dout("choose_mds chose random mds%d\n", mds); 758 dout("choose_mds chose random mds%d\n", mds);
755 return mds; 759 return mds;
756 } 760 }
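/* [editor's sketch, not part of the diff] With a replicated dir
 * fragment and USE_ANY_MDS, __choose_mds() spreads load by reducing
 * one random byte mod ndist, and only accepts the pick if that mds is
 * active in the mdsmap. Userspace shape of the pick, with rand()
 * standing in for get_random_bytes(); the slight modulo bias is
 * harmless for load spreading:
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main(void)
{
	int dist[] = { 4, 7, 9 };       /* mds ranks holding this replica */
	int ndist = sizeof(dist) / sizeof(dist[0]);
	unsigned char r;

	srand((unsigned)time(NULL));
	r = (unsigned char)rand();      /* one random byte */
	printf("chose mds%d\n", dist[r % ndist]);
	return 0;
}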
757 761
758 762
759 /* 763 /*
760 * session messages 764 * session messages
761 */ 765 */
762 static struct ceph_msg *create_session_msg(u32 op, u64 seq) 766 static struct ceph_msg *create_session_msg(u32 op, u64 seq)
763 { 767 {
764 struct ceph_msg *msg; 768 struct ceph_msg *msg;
765 struct ceph_mds_session_head *h; 769 struct ceph_mds_session_head *h;
766 770
767 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS, 771 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
768 false); 772 false);
769 if (!msg) { 773 if (!msg) {
770 pr_err("create_session_msg ENOMEM creating msg\n"); 774 pr_err("create_session_msg ENOMEM creating msg\n");
771 return NULL; 775 return NULL;
772 } 776 }
773 h = msg->front.iov_base; 777 h = msg->front.iov_base;
774 h->op = cpu_to_le32(op); 778 h->op = cpu_to_le32(op);
775 h->seq = cpu_to_le64(seq); 779 h->seq = cpu_to_le64(seq);
776 return msg; 780 return msg;
777 } 781 }
778 782
779 /* 783 /*
780 * send session open request. 784 * send session open request.
781 * 785 *
782 * called under mdsc->mutex 786 * called under mdsc->mutex
783 */ 787 */
784 static int __open_session(struct ceph_mds_client *mdsc, 788 static int __open_session(struct ceph_mds_client *mdsc,
785 struct ceph_mds_session *session) 789 struct ceph_mds_session *session)
786 { 790 {
787 struct ceph_msg *msg; 791 struct ceph_msg *msg;
788 int mstate; 792 int mstate;
789 int mds = session->s_mds; 793 int mds = session->s_mds;
790 794
791 /* wait for mds to go active? */ 795 /* wait for mds to go active? */
792 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); 796 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
793 dout("open_session to mds%d (%s)\n", mds, 797 dout("open_session to mds%d (%s)\n", mds,
794 ceph_mds_state_name(mstate)); 798 ceph_mds_state_name(mstate));
795 session->s_state = CEPH_MDS_SESSION_OPENING; 799 session->s_state = CEPH_MDS_SESSION_OPENING;
796 session->s_renew_requested = jiffies; 800 session->s_renew_requested = jiffies;
797 801
798 /* send connect message */ 802 /* send connect message */
799 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); 803 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
800 if (!msg) 804 if (!msg)
801 return -ENOMEM; 805 return -ENOMEM;
802 ceph_con_send(&session->s_con, msg); 806 ceph_con_send(&session->s_con, msg);
803 return 0; 807 return 0;
804 } 808 }
805 809
806 /* 810 /*
807 * open sessions for any export targets for the given mds 811 * open sessions for any export targets for the given mds
808 * 812 *
809 * called under mdsc->mutex 813 * called under mdsc->mutex
810 */ 814 */
811 static void __open_export_target_sessions(struct ceph_mds_client *mdsc, 815 static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
812 struct ceph_mds_session *session) 816 struct ceph_mds_session *session)
813 { 817 {
814 struct ceph_mds_info *mi; 818 struct ceph_mds_info *mi;
815 struct ceph_mds_session *ts; 819 struct ceph_mds_session *ts;
816 int i, mds = session->s_mds; 820 int i, mds = session->s_mds;
817 int target; 821 int target;
818 822
819 if (mds >= mdsc->mdsmap->m_max_mds) 823 if (mds >= mdsc->mdsmap->m_max_mds)
820 return; 824 return;
821 mi = &mdsc->mdsmap->m_info[mds]; 825 mi = &mdsc->mdsmap->m_info[mds];
822 dout("open_export_target_sessions for mds%d (%d targets)\n", 826 dout("open_export_target_sessions for mds%d (%d targets)\n",
823 session->s_mds, mi->num_export_targets); 827 session->s_mds, mi->num_export_targets);
824 828
825 for (i = 0; i < mi->num_export_targets; i++) { 829 for (i = 0; i < mi->num_export_targets; i++) {
826 target = mi->export_targets[i]; 830 target = mi->export_targets[i];
827 ts = __ceph_lookup_mds_session(mdsc, target); 831 ts = __ceph_lookup_mds_session(mdsc, target);
828 if (!ts) { 832 if (!ts) {
829 ts = register_session(mdsc, target); 833 ts = register_session(mdsc, target);
830 if (IS_ERR(ts)) 834 if (IS_ERR(ts))
831 return; 835 return;
832 } 836 }
833 if (session->s_state == CEPH_MDS_SESSION_NEW || 837 if (session->s_state == CEPH_MDS_SESSION_NEW ||
834 session->s_state == CEPH_MDS_SESSION_CLOSING) 838 session->s_state == CEPH_MDS_SESSION_CLOSING)
835 __open_session(mdsc, session); 839 __open_session(mdsc, session);
836 else 840 else
837 dout(" mds%d target mds%d %p is %s\n", session->s_mds, 841 dout(" mds%d target mds%d %p is %s\n", session->s_mds,
838 i, ts, session_state_name(ts->s_state)); 842 i, ts, session_state_name(ts->s_state));
839 ceph_put_mds_session(ts); 843 ceph_put_mds_session(ts);
840 } 844 }
841 } 845 }
842 846
843 void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, 847 void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
844 struct ceph_mds_session *session) 848 struct ceph_mds_session *session)
845 { 849 {
846 mutex_lock(&mdsc->mutex); 850 mutex_lock(&mdsc->mutex);
847 __open_export_target_sessions(mdsc, session); 851 __open_export_target_sessions(mdsc, session);
848 mutex_unlock(&mdsc->mutex); 852 mutex_unlock(&mdsc->mutex);
849 } 853 }
850 854
851 /* 855 /*
852 * session caps 856 * session caps
853 */ 857 */
854 858
855 /* 859 /*
856 * Free preallocated cap messages assigned to this session 860 * Free preallocated cap messages assigned to this session
857 */ 861 */
858 static void cleanup_cap_releases(struct ceph_mds_session *session) 862 static void cleanup_cap_releases(struct ceph_mds_session *session)
859 { 863 {
860 struct ceph_msg *msg; 864 struct ceph_msg *msg;
861 865
862 spin_lock(&session->s_cap_lock); 866 spin_lock(&session->s_cap_lock);
863 while (!list_empty(&session->s_cap_releases)) { 867 while (!list_empty(&session->s_cap_releases)) {
864 msg = list_first_entry(&session->s_cap_releases, 868 msg = list_first_entry(&session->s_cap_releases,
865 struct ceph_msg, list_head); 869 struct ceph_msg, list_head);
866 list_del_init(&msg->list_head); 870 list_del_init(&msg->list_head);
867 ceph_msg_put(msg); 871 ceph_msg_put(msg);
868 } 872 }
869 while (!list_empty(&session->s_cap_releases_done)) { 873 while (!list_empty(&session->s_cap_releases_done)) {
870 msg = list_first_entry(&session->s_cap_releases_done, 874 msg = list_first_entry(&session->s_cap_releases_done,
871 struct ceph_msg, list_head); 875 struct ceph_msg, list_head);
872 list_del_init(&msg->list_head); 876 list_del_init(&msg->list_head);
873 ceph_msg_put(msg); 877 ceph_msg_put(msg);
874 } 878 }
875 spin_unlock(&session->s_cap_lock); 879 spin_unlock(&session->s_cap_lock);
876 } 880 }
877 881
878 /* 882 /*
879 * Helper to safely iterate over all caps associated with a session, with 883 * Helper to safely iterate over all caps associated with a session, with
880 * special care taken to handle a racing __ceph_remove_cap(). 884 * special care taken to handle a racing __ceph_remove_cap().
881 * 885 *
882 * Caller must hold session s_mutex. 886 * Caller must hold session s_mutex.
883 */ 887 */
884 static int iterate_session_caps(struct ceph_mds_session *session, 888 static int iterate_session_caps(struct ceph_mds_session *session,
885 int (*cb)(struct inode *, struct ceph_cap *, 889 int (*cb)(struct inode *, struct ceph_cap *,
886 void *), void *arg) 890 void *), void *arg)
887 { 891 {
888 struct list_head *p; 892 struct list_head *p;
889 struct ceph_cap *cap; 893 struct ceph_cap *cap;
890 struct inode *inode, *last_inode = NULL; 894 struct inode *inode, *last_inode = NULL;
891 struct ceph_cap *old_cap = NULL; 895 struct ceph_cap *old_cap = NULL;
892 int ret; 896 int ret;
893 897
894 dout("iterate_session_caps %p mds%d\n", session, session->s_mds); 898 dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
895 spin_lock(&session->s_cap_lock); 899 spin_lock(&session->s_cap_lock);
896 p = session->s_caps.next; 900 p = session->s_caps.next;
897 while (p != &session->s_caps) { 901 while (p != &session->s_caps) {
898 cap = list_entry(p, struct ceph_cap, session_caps); 902 cap = list_entry(p, struct ceph_cap, session_caps);
899 inode = igrab(&cap->ci->vfs_inode); 903 inode = igrab(&cap->ci->vfs_inode);
900 if (!inode) { 904 if (!inode) {
901 p = p->next; 905 p = p->next;
902 continue; 906 continue;
903 } 907 }
904 session->s_cap_iterator = cap; 908 session->s_cap_iterator = cap;
905 spin_unlock(&session->s_cap_lock); 909 spin_unlock(&session->s_cap_lock);
906 910
907 if (last_inode) { 911 if (last_inode) {
908 iput(last_inode); 912 iput(last_inode);
909 last_inode = NULL; 913 last_inode = NULL;
910 } 914 }
911 if (old_cap) { 915 if (old_cap) {
912 ceph_put_cap(session->s_mdsc, old_cap); 916 ceph_put_cap(session->s_mdsc, old_cap);
913 old_cap = NULL; 917 old_cap = NULL;
914 } 918 }
915 919
916 ret = cb(inode, cap, arg); 920 ret = cb(inode, cap, arg);
917 last_inode = inode; 921 last_inode = inode;
918 922
919 spin_lock(&session->s_cap_lock); 923 spin_lock(&session->s_cap_lock);
920 p = p->next; 924 p = p->next;
921 if (cap->ci == NULL) { 925 if (cap->ci == NULL) {
922 dout("iterate_session_caps finishing cap %p removal\n", 926 dout("iterate_session_caps finishing cap %p removal\n",
923 cap); 927 cap);
924 BUG_ON(cap->session != session); 928 BUG_ON(cap->session != session);
925 list_del_init(&cap->session_caps); 929 list_del_init(&cap->session_caps);
926 session->s_nr_caps--; 930 session->s_nr_caps--;
927 cap->session = NULL; 931 cap->session = NULL;
928 old_cap = cap; /* put_cap it w/o locks held */ 932 old_cap = cap; /* put_cap it w/o locks held */
929 } 933 }
930 if (ret < 0) 934 if (ret < 0)
931 goto out; 935 goto out;
932 } 936 }
933 ret = 0; 937 ret = 0;
934 out: 938 out:
935 session->s_cap_iterator = NULL; 939 session->s_cap_iterator = NULL;
936 spin_unlock(&session->s_cap_lock); 940 spin_unlock(&session->s_cap_lock);
937 941
938 if (last_inode) 942 if (last_inode)
939 iput(last_inode); 943 iput(last_inode);
940 if (old_cap) 944 if (old_cap)
941 ceph_put_cap(session->s_mdsc, old_cap); 945 ceph_put_cap(session->s_mdsc, old_cap);
942 946
943 return ret; 947 return ret;
944 } 948 }
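/* [editor's sketch, not part of the diff] iterate_session_caps()
 * cannot run the callback (or iput()/ceph_put_cap()) with s_cap_lock
 * held, so it pins the current entry, drops the lock around the
 * callback, and defers each release until the lock is retaken on the
 * next step. The skeleton of that drop-the-lock-per-item walk, with a
 * pthread mutex standing in for the spinlock (all names hypothetical):
 */
#include <pthread.h>
#include <stdio.h>

#define NITEMS 3

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static int items[NITEMS] = { 10, 20, 30 };

static void visit(int v)
{
	printf("visit %d\n", v);        /* may sleep: no list_lock here */
}

int main(void)
{
	int i = 0;

	pthread_mutex_lock(&list_lock);
	while (i < NITEMS) {
		int v = items[i];       /* "pin" the entry under the lock */

		pthread_mutex_unlock(&list_lock);
		visit(v);               /* callback runs with lock dropped */
		pthread_mutex_lock(&list_lock);
		i++;                    /* advance only after re-locking */
	}
	pthread_mutex_unlock(&list_lock);
	return 0;
}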
945 949
946 static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, 950 static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
947 void *arg) 951 void *arg)
948 { 952 {
949 struct ceph_inode_info *ci = ceph_inode(inode); 953 struct ceph_inode_info *ci = ceph_inode(inode);
950 int drop = 0; 954 int drop = 0;
951 955
952 dout("removing cap %p, ci is %p, inode is %p\n", 956 dout("removing cap %p, ci is %p, inode is %p\n",
953 cap, ci, &ci->vfs_inode); 957 cap, ci, &ci->vfs_inode);
954 spin_lock(&ci->i_ceph_lock); 958 spin_lock(&ci->i_ceph_lock);
955 __ceph_remove_cap(cap); 959 __ceph_remove_cap(cap);
956 if (!__ceph_is_any_real_caps(ci)) { 960 if (!__ceph_is_any_real_caps(ci)) {
957 struct ceph_mds_client *mdsc = 961 struct ceph_mds_client *mdsc =
958 ceph_sb_to_client(inode->i_sb)->mdsc; 962 ceph_sb_to_client(inode->i_sb)->mdsc;
959 963
960 spin_lock(&mdsc->cap_dirty_lock); 964 spin_lock(&mdsc->cap_dirty_lock);
961 if (!list_empty(&ci->i_dirty_item)) { 965 if (!list_empty(&ci->i_dirty_item)) {
962 pr_info(" dropping dirty %s state for %p %lld\n", 966 pr_info(" dropping dirty %s state for %p %lld\n",
963 ceph_cap_string(ci->i_dirty_caps), 967 ceph_cap_string(ci->i_dirty_caps),
964 inode, ceph_ino(inode)); 968 inode, ceph_ino(inode));
965 ci->i_dirty_caps = 0; 969 ci->i_dirty_caps = 0;
966 list_del_init(&ci->i_dirty_item); 970 list_del_init(&ci->i_dirty_item);
967 drop = 1; 971 drop = 1;
968 } 972 }
969 if (!list_empty(&ci->i_flushing_item)) { 973 if (!list_empty(&ci->i_flushing_item)) {
970 pr_info(" dropping dirty+flushing %s state for %p %lld\n", 974 pr_info(" dropping dirty+flushing %s state for %p %lld\n",
971 ceph_cap_string(ci->i_flushing_caps), 975 ceph_cap_string(ci->i_flushing_caps),
972 inode, ceph_ino(inode)); 976 inode, ceph_ino(inode));
973 ci->i_flushing_caps = 0; 977 ci->i_flushing_caps = 0;
974 list_del_init(&ci->i_flushing_item); 978 list_del_init(&ci->i_flushing_item);
975 mdsc->num_cap_flushing--; 979 mdsc->num_cap_flushing--;
976 drop = 1; 980 drop = 1;
977 } 981 }
978 if (drop && ci->i_wrbuffer_ref) { 982 if (drop && ci->i_wrbuffer_ref) {
979 pr_info(" dropping dirty data for %p %lld\n", 983 pr_info(" dropping dirty data for %p %lld\n",
980 inode, ceph_ino(inode)); 984 inode, ceph_ino(inode));
981 ci->i_wrbuffer_ref = 0; 985 ci->i_wrbuffer_ref = 0;
982 ci->i_wrbuffer_ref_head = 0; 986 ci->i_wrbuffer_ref_head = 0;
983 drop++; 987 drop++;
984 } 988 }
985 spin_unlock(&mdsc->cap_dirty_lock); 989 spin_unlock(&mdsc->cap_dirty_lock);
986 } 990 }
987 spin_unlock(&ci->i_ceph_lock); 991 spin_unlock(&ci->i_ceph_lock);
988 while (drop--) 992 while (drop--)
989 iput(inode); 993 iput(inode);
990 return 0; 994 return 0;
991 } 995 }
992 996
993 /* 997 /*
994 * caller must hold session s_mutex 998 * caller must hold session s_mutex
995 */ 999 */
996 static void remove_session_caps(struct ceph_mds_session *session) 1000 static void remove_session_caps(struct ceph_mds_session *session)
997 { 1001 {
998 dout("remove_session_caps on %p\n", session); 1002 dout("remove_session_caps on %p\n", session);
999 iterate_session_caps(session, remove_session_caps_cb, NULL); 1003 iterate_session_caps(session, remove_session_caps_cb, NULL);
1000 BUG_ON(session->s_nr_caps > 0); 1004 BUG_ON(session->s_nr_caps > 0);
1001 BUG_ON(!list_empty(&session->s_cap_flushing)); 1005 BUG_ON(!list_empty(&session->s_cap_flushing));
1002 cleanup_cap_releases(session); 1006 cleanup_cap_releases(session);
1003 } 1007 }
1004 1008
1005 /* 1009 /*
1006 * wake up any threads waiting on this session's caps. if the cap is 1010 * wake up any threads waiting on this session's caps. if the cap is
1007 * old (didn't get renewed on the client reconnect), remove it now. 1011 * old (didn't get renewed on the client reconnect), remove it now.
1008 * 1012 *
1009 * caller must hold s_mutex. 1013 * caller must hold s_mutex.
1010 */ 1014 */
1011 static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, 1015 static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
1012 void *arg) 1016 void *arg)
1013 { 1017 {
1014 struct ceph_inode_info *ci = ceph_inode(inode); 1018 struct ceph_inode_info *ci = ceph_inode(inode);
1015 1019
1016 wake_up_all(&ci->i_cap_wq); 1020 wake_up_all(&ci->i_cap_wq);
1017 if (arg) { 1021 if (arg) {
1018 spin_lock(&ci->i_ceph_lock); 1022 spin_lock(&ci->i_ceph_lock);
1019 ci->i_wanted_max_size = 0; 1023 ci->i_wanted_max_size = 0;
1020 ci->i_requested_max_size = 0; 1024 ci->i_requested_max_size = 0;
1021 spin_unlock(&ci->i_ceph_lock); 1025 spin_unlock(&ci->i_ceph_lock);
1022 } 1026 }
1023 return 0; 1027 return 0;
1024 } 1028 }
1025 1029
1026 static void wake_up_session_caps(struct ceph_mds_session *session, 1030 static void wake_up_session_caps(struct ceph_mds_session *session,
1027 int reconnect) 1031 int reconnect)
1028 { 1032 {
1029 dout("wake_up_session_caps %p mds%d\n", session, session->s_mds); 1033 dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
1030 iterate_session_caps(session, wake_up_session_cb, 1034 iterate_session_caps(session, wake_up_session_cb,
1031 (void *)(unsigned long)reconnect); 1035 (void *)(unsigned long)reconnect);
1032 } 1036 }
1033 1037
1034 /* 1038 /*
1035 * Send periodic message to MDS renewing all currently held caps. The 1039 * Send periodic message to MDS renewing all currently held caps. The
 * ack will reset the expiration for all caps from this session.
 *
 * caller holds s_mutex
 */
static int send_renew_caps(struct ceph_mds_client *mdsc,
			   struct ceph_mds_session *session)
{
	struct ceph_msg *msg;
	int state;

	if (time_after_eq(jiffies, session->s_cap_ttl) &&
	    time_after_eq(session->s_cap_ttl, session->s_renew_requested))
		pr_info("mds%d caps stale\n", session->s_mds);
	session->s_renew_requested = jiffies;

	/* do not try to renew caps until a recovering mds has reconnected
	 * with its clients. */
	state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
	if (state < CEPH_MDS_STATE_RECONNECT) {
		dout("send_renew_caps ignoring mds%d (%s)\n",
		     session->s_mds, ceph_mds_state_name(state));
		return 0;
	}

	dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
	     ceph_mds_state_name(state));
	msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
				 ++session->s_renew_seq);
	if (!msg)
		return -ENOMEM;
	ceph_con_send(&session->s_con, msg);
	return 0;
}

/*
 * Note new cap ttl, and any transition from stale -> not stale (fresh?).
 *
 * Called under session->s_mutex
 */
static void renewed_caps(struct ceph_mds_client *mdsc,
			 struct ceph_mds_session *session, int is_renew)
{
	int was_stale;
	int wake = 0;

	spin_lock(&session->s_cap_lock);
	was_stale = is_renew && (session->s_cap_ttl == 0 ||
				 time_after_eq(jiffies, session->s_cap_ttl));

	session->s_cap_ttl = session->s_renew_requested +
		mdsc->mdsmap->m_session_timeout*HZ;

	if (was_stale) {
		if (time_before(jiffies, session->s_cap_ttl)) {
			pr_info("mds%d caps renewed\n", session->s_mds);
			wake = 1;
		} else {
			pr_info("mds%d caps still stale\n", session->s_mds);
		}
	}
	dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
	     session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
	     time_before(jiffies, session->s_cap_ttl) ? "fresh" : "stale");
	spin_unlock(&session->s_cap_lock);

	if (wake)
		wake_up_session_caps(session, 0);
}
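
Both functions above lean on the kernel's wraparound-safe jiffies comparisons. As a standalone illustration (plain C; tick_after_eq() is a made-up stand-in for time_after_eq(), not the kernel macro itself), the signed-difference trick keeps working even when the tick counter wraps past zero:

#include <stdio.h>

typedef unsigned long tick_t;

/* analogous to the kernel's time_after_eq() */
static int tick_after_eq(tick_t a, tick_t b)
{
	return (long)(a - b) >= 0;
}

int main(void)
{
	tick_t ttl = (tick_t)-10;	/* cap TTL set just before wraparound */
	tick_t now = 5;			/* counter has since wrapped past 0 */

	/* a naive "now >= ttl" would claim the caps are still fresh... */
	printf("naive compare: %s\n", now >= ttl ? "stale" : "fresh");
	/* ...but the signed-difference test correctly reports stale */
	printf("wrap-safe compare: %s\n",
	       tick_after_eq(now, ttl) ? "stale" : "fresh");
	return 0;
}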

/*
 * send a session close request
 */
static int request_close_session(struct ceph_mds_client *mdsc,
				 struct ceph_mds_session *session)
{
	struct ceph_msg *msg;

	dout("request_close_session mds%d state %s seq %lld\n",
	     session->s_mds, session_state_name(session->s_state),
	     session->s_seq);
	msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
	if (!msg)
		return -ENOMEM;
	ceph_con_send(&session->s_con, msg);
	return 0;
}

/*
 * Called with s_mutex held.
 */
static int __close_session(struct ceph_mds_client *mdsc,
			   struct ceph_mds_session *session)
{
	if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
		return 0;
	session->s_state = CEPH_MDS_SESSION_CLOSING;
	return request_close_session(mdsc, session);
}

/*
 * Trim old(er) caps.
 *
 * Because we can't cache an inode without one or more caps, we do
 * this indirectly: if a cap is unused, we prune its aliases, at which
 * point the inode will hopefully get dropped too.
 *
 * Yes, this is a bit sloppy. Our only real goal here is to respond to
 * memory pressure from the MDS, though, so it needn't be perfect.
 */
static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
{
	struct ceph_mds_session *session = arg;
	struct ceph_inode_info *ci = ceph_inode(inode);
	int used, oissued, mine;

	if (session->s_trim_caps <= 0)
		return -1;

	spin_lock(&ci->i_ceph_lock);
	mine = cap->issued | cap->implemented;
	used = __ceph_caps_used(ci);
	oissued = __ceph_caps_issued_other(ci, cap);

	dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
	     inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
	     ceph_cap_string(used));
	if (ci->i_dirty_caps)
		goto out;	/* dirty caps */
	if ((used & ~oissued) & mine)
		goto out;	/* we need these caps */

	session->s_trim_caps--;
	if (oissued) {
		/* we aren't the only cap.. just remove us */
		__ceph_remove_cap(cap);
	} else {
		/* try to drop referring dentries */
		spin_unlock(&ci->i_ceph_lock);
		d_prune_aliases(inode);
		dout("trim_caps_cb %p cap %p pruned, count now %d\n",
		     inode, cap, atomic_read(&inode->i_count));
		return 0;
	}

out:
	spin_unlock(&ci->i_ceph_lock);
	return 0;
}

/*
 * Trim session cap count down to some max number.
 */
static int trim_caps(struct ceph_mds_client *mdsc,
		     struct ceph_mds_session *session,
		     int max_caps)
{
	int trim_caps = session->s_nr_caps - max_caps;

	dout("trim_caps mds%d start: %d / %d, trim %d\n",
	     session->s_mds, session->s_nr_caps, max_caps, trim_caps);
	if (trim_caps > 0) {
		session->s_trim_caps = trim_caps;
		iterate_session_caps(session, trim_caps_cb, session);
		dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
		     session->s_mds, session->s_nr_caps, max_caps,
		     trim_caps - session->s_trim_caps);
		session->s_trim_caps = 0;
	}
	return 0;
}
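
The trim logic above is an instance of a common iterate-with-callback pattern: the walk hands each item to a callback, the callback spends a shared budget, and a negative return aborts the walk early, exactly as trim_caps_cb does. A minimal userspace sketch (all names here are illustrative, not kernel API):

#include <stdio.h>

struct trim_state { int budget; };

static int trim_cb(int item, void *arg)
{
	struct trim_state *st = arg;

	if (st->budget <= 0)
		return -1;		/* budget exhausted: abort iteration */
	printf("trimming item %d\n", item);
	st->budget--;
	return 0;
}

static void iterate(const int *items, int n,
		    int (*cb)(int, void *), void *arg)
{
	for (int i = 0; i < n; i++)
		if (cb(items[i], arg) < 0)
			break;		/* honor the callback's stop request */
}

int main(void)
{
	int items[] = { 1, 2, 3, 4, 5 };
	struct trim_state st = { .budget = 3 };	/* like nr_caps - max_caps */

	iterate(items, 5, trim_cb, &st);
	return 0;
}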

/*
 * Allocate cap_release messages. If there is a partially full message
 * in the queue, try to allocate enough to cover its remainder, so that
 * we can send it immediately.
 *
 * Called under s_mutex.
 */
int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
			  struct ceph_mds_session *session)
{
	struct ceph_msg *msg, *partial = NULL;
	struct ceph_mds_cap_release *head;
	int err = -ENOMEM;
	int extra = mdsc->fsc->mount_options->cap_release_safety;
	int num;

	dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
	     extra);

	spin_lock(&session->s_cap_lock);

	if (!list_empty(&session->s_cap_releases)) {
		msg = list_first_entry(&session->s_cap_releases,
				       struct ceph_msg,
				       list_head);
		head = msg->front.iov_base;
		num = le32_to_cpu(head->num);
		if (num) {
			dout(" partial %p with (%d/%d)\n", msg, num,
			     (int)CEPH_CAPS_PER_RELEASE);
			extra += CEPH_CAPS_PER_RELEASE - num;
			partial = msg;
		}
	}
	while (session->s_num_cap_releases < session->s_nr_caps + extra) {
		spin_unlock(&session->s_cap_lock);
		msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
				   GFP_NOFS, false);
		if (!msg)
			goto out_unlocked;
		dout("add_cap_releases %p msg %p now %d\n", session, msg,
		     (int)msg->front.iov_len);
		head = msg->front.iov_base;
		head->num = cpu_to_le32(0);
		msg->front.iov_len = sizeof(*head);
		spin_lock(&session->s_cap_lock);
		list_add(&msg->list_head, &session->s_cap_releases);
		session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
	}

	if (partial) {
		head = partial->front.iov_base;
		num = le32_to_cpu(head->num);
		dout(" queueing partial %p with %d/%d\n", partial, num,
		     (int)CEPH_CAPS_PER_RELEASE);
		list_move_tail(&partial->list_head,
			       &session->s_cap_releases_done);
		session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num;
	}
	err = 0;
	spin_unlock(&session->s_cap_lock);
out_unlocked:
	return err;
}
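
The sizing arithmetic is worth spelling out: the target is enough spare release slots for every cap the session holds plus the cap_release_safety margin, and if the head message is already partly filled, the margin grows by its unused remainder so that partial message can be topped up and sent at once. A toy calculation of the same loop (constants invented for illustration, not the real values):

#include <stdio.h>

#define CAPS_PER_RELEASE 26	/* illustrative; not the real constant */

int main(void)
{
	int safety = 16;	/* cap_release_safety mount option */
	int nr_caps = 100;	/* caps currently held on the session */
	int partial = 10;	/* entries already in the head message */
	int extra = safety + (CAPS_PER_RELEASE - partial);
	int msgs = 0, slots = 0;

	/* allocate whole messages until the target headroom is met */
	while (slots < nr_caps + extra) {
		slots += CAPS_PER_RELEASE;
		msgs++;
	}
	printf("extra=%d -> %d messages, %d slots\n", extra, msgs, slots);
	return 0;
}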

/*
 * flush all dirty inode data to disk.
 *
 * returns true if we've flushed through want_flush_seq
 */
static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
{
	int mds, ret = 1;

	dout("check_cap_flush want %lld\n", want_flush_seq);
	mutex_lock(&mdsc->mutex);
	for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
		struct ceph_mds_session *session = mdsc->sessions[mds];

		if (!session)
			continue;
		get_session(session);
		mutex_unlock(&mdsc->mutex);

		mutex_lock(&session->s_mutex);
		if (!list_empty(&session->s_cap_flushing)) {
			struct ceph_inode_info *ci =
				list_entry(session->s_cap_flushing.next,
					   struct ceph_inode_info,
					   i_flushing_item);
			struct inode *inode = &ci->vfs_inode;

			spin_lock(&ci->i_ceph_lock);
			if (ci->i_cap_flush_seq <= want_flush_seq) {
				dout("check_cap_flush still flushing %p "
				     "seq %lld <= %lld to mds%d\n", inode,
				     ci->i_cap_flush_seq, want_flush_seq,
				     session->s_mds);
				ret = 0;
			}
			spin_unlock(&ci->i_ceph_lock);
		}
		mutex_unlock(&session->s_mutex);
		ceph_put_mds_session(session);

		if (!ret)
			return ret;
		mutex_lock(&mdsc->mutex);
	}

	mutex_unlock(&mdsc->mutex);
	dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
	return ret;
}
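
Stripped of the locking choreography, the test above reduces to: succeed only if no session is still flushing a sequence number at or below the one being waited for. A compact userspace sketch of just that predicate (hypothetical names; 0 meaning nothing in flight):

#include <stdio.h>

static int check_flush(const long *flushing_seq, int nsessions, long want)
{
	for (int i = 0; i < nsessions; i++)
		if (flushing_seq[i] != 0 && flushing_seq[i] <= want)
			return 0;	/* session i still flushing */
	return 1;			/* flushed through want */
}

int main(void)
{
	long seqs[] = { 0, 12, 0 };	/* 0 == nothing in flight */

	printf("want 10: %s\n", check_flush(seqs, 3, 10) ? "done" : "waiting");
	printf("want 15: %s\n", check_flush(seqs, 3, 15) ? "done" : "waiting");
	return 0;
}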

/*
 * called under s_mutex
 */
void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
			    struct ceph_mds_session *session)
{
	struct ceph_msg *msg;

	dout("send_cap_releases mds%d\n", session->s_mds);
	spin_lock(&session->s_cap_lock);
	while (!list_empty(&session->s_cap_releases_done)) {
		msg = list_first_entry(&session->s_cap_releases_done,
				       struct ceph_msg, list_head);
		list_del_init(&msg->list_head);
		spin_unlock(&session->s_cap_lock);
		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
		dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
		ceph_con_send(&session->s_con, msg);
		spin_lock(&session->s_cap_lock);
	}
	spin_unlock(&session->s_cap_lock);
}
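
The send loop above uses the classic detach-then-drop-the-lock drain: each message is unlinked from the shared list while s_cap_lock is held, and the (potentially slow) send happens unlocked. A minimal pthread sketch of the same shape (illustrative, not the kernel locking API; build with -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int queue[] = { 1, 2, 3, 4 };
static int head;			/* index of next message to send */

static void drain(void)
{
	pthread_mutex_lock(&lock);
	while (head < 4) {
		int item = queue[head++];	/* detach under the lock */

		pthread_mutex_unlock(&lock);
		printf("sending %d\n", item);	/* slow work, unlocked */
		pthread_mutex_lock(&lock);
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	drain();
	return 0;
}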

static void discard_cap_releases(struct ceph_mds_client *mdsc,
				 struct ceph_mds_session *session)
{
	struct ceph_msg *msg;
	struct ceph_mds_cap_release *head;
	unsigned num;

	dout("discard_cap_releases mds%d\n", session->s_mds);
	spin_lock(&session->s_cap_lock);

	/* zero out the in-progress message */
	msg = list_first_entry(&session->s_cap_releases,
			       struct ceph_msg, list_head);
	head = msg->front.iov_base;
	num = le32_to_cpu(head->num);
	dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
	head->num = cpu_to_le32(0);
	session->s_num_cap_releases += num;

	/* requeue completed messages */
	while (!list_empty(&session->s_cap_releases_done)) {
		msg = list_first_entry(&session->s_cap_releases_done,
				       struct ceph_msg, list_head);
		list_del_init(&msg->list_head);

		head = msg->front.iov_base;
		num = le32_to_cpu(head->num);
		dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
		     num);
		session->s_num_cap_releases += num;
		head->num = cpu_to_le32(0);
		msg->front.iov_len = sizeof(*head);
		list_add(&msg->list_head, &session->s_cap_releases);
	}

	spin_unlock(&session->s_cap_lock);
}

/*
 * requests
 */

/*
 * Create an mds request.
 */
struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
{
	struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);

	if (!req)
		return ERR_PTR(-ENOMEM);

	mutex_init(&req->r_fill_mutex);
	req->r_mdsc = mdsc;
	req->r_started = jiffies;
	req->r_resend_mds = -1;
	INIT_LIST_HEAD(&req->r_unsafe_dir_item);
	req->r_fmode = -1;
	kref_init(&req->r_kref);
	INIT_LIST_HEAD(&req->r_wait);
	init_completion(&req->r_completion);
	init_completion(&req->r_safe_completion);
	INIT_LIST_HEAD(&req->r_unsafe_item);

	req->r_op = op;
	req->r_direct_mode = mode;
	return req;
}
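
Request lifetime is managed by the kref initialized above: the creator holds one reference, other holders take and drop their own, and the final put releases the object. A self-contained userspace sketch of that pattern (req_get()/req_put() are illustrative stand-ins for kref_get()/kref_put(), and this toy version is not thread-safe):

#include <stdio.h>
#include <stdlib.h>

struct req {
	int refcount;
};

static struct req *req_create(void)
{
	struct req *r = calloc(1, sizeof(*r));

	if (!r)
		return NULL;
	r->refcount = 1;		/* like kref_init(&req->r_kref) */
	return r;
}

static void req_get(struct req *r) { r->refcount++; }

static void req_put(struct req *r)
{
	if (--r->refcount == 0) {	/* last reference drops: release */
		printf("freeing request\n");
		free(r);
	}
}

int main(void)
{
	struct req *r = req_create();

	if (!r)
		return 1;
	req_get(r);			/* e.g. an in-flight message's ref */
	req_put(r);			/* message done */
	req_put(r);			/* creator's ref: frees the request */
	return 0;
}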

/*
 * return the oldest (lowest-tid) request in the request tree, or 0 if none.
 *
 * called under mdsc->mutex.
 */
static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
{
	if (RB_EMPTY_ROOT(&mdsc->request_tree))
		return NULL;
	return rb_entry(rb_first(&mdsc->request_tree),
			struct ceph_mds_request, r_node);
}

static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
{
	struct ceph_mds_request *req = __get_oldest_req(mdsc);

	if (req)
		return req->r_tid;
	return 0;
}

/*
 * Build a dentry's path. Allocate on heap; caller must kfree. Based
 * on build_path_from_dentry in fs/cifs/dir.c.
 *
 * If @stop_on_nosnap, generate path relative to the first non-snapped
 * inode.
 *
 * Encode hidden .snap dirs as a double /, i.e.
 *   foo/.snap/bar -> foo//bar
 */
char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
			   int stop_on_nosnap)
{
	struct dentry *temp;
	char *path;
	int len, pos;
	unsigned seq;

	if (dentry == NULL)
		return ERR_PTR(-EINVAL);

retry:
	len = 0;
	seq = read_seqbegin(&rename_lock);
	rcu_read_lock();
	for (temp = dentry; !IS_ROOT(temp);) {
		struct inode *inode = temp->d_inode;
		if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
			len++;	/* slash only */
		else if (stop_on_nosnap && inode &&
			 ceph_snap(inode) == CEPH_NOSNAP)
			break;
		else
			len += 1 + temp->d_name.len;
		temp = temp->d_parent;
		if (temp == NULL) {
			rcu_read_unlock();
			pr_err("build_path corrupt dentry %p\n", dentry);
			return ERR_PTR(-EINVAL);
		}
	}
	rcu_read_unlock();
	if (len)
		len--;	/* no leading '/' */

	path = kmalloc(len+1, GFP_NOFS);
	if (path == NULL)
		return ERR_PTR(-ENOMEM);
	pos = len;
	path[pos] = 0;	/* trailing null */
	rcu_read_lock();
	for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
		struct inode *inode;

		spin_lock(&temp->d_lock);
		inode = temp->d_inode;
		if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
			dout("build_path path+%d: %p SNAPDIR\n",
			     pos, temp);
		} else if (stop_on_nosnap && inode &&
			   ceph_snap(inode) == CEPH_NOSNAP) {
			spin_unlock(&temp->d_lock);
			break;
		} else {
			pos -= temp->d_name.len;
			if (pos < 0) {
				spin_unlock(&temp->d_lock);
				break;
			}
			strncpy(path + pos, temp->d_name.name,
				temp->d_name.len);
		}
		spin_unlock(&temp->d_lock);
		if (pos)
			path[--pos] = '/';
		temp = temp->d_parent;
		if (temp == NULL) {
			rcu_read_unlock();
			pr_err("build_path corrupt dentry\n");
			kfree(path);
			return ERR_PTR(-EINVAL);
		}
	}
	rcu_read_unlock();
	if (pos != 0 || read_seqretry(&rename_lock, seq)) {
		pr_err("build_path did not end path lookup where "
		       "expected, namelen is %d, pos is %d\n", len, pos);
		/* presumably this is only possible if racing with a
		   rename of one of the parent directories (we can not
		   lock the dentries above us to prevent this, but
		   retrying should be harmless) */
		kfree(path);
		goto retry;
	}

	*base = ceph_ino(temp->d_inode);
	*plen = len;
	dout("build_path on %p %d built %llx '%.*s'\n",
	     dentry, dentry->d_count, *base, len, path);
	return path;
}
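
The core of the path builder, minus the RCU/seqlock retry dance, is a two-pass right-to-left assembly: measure the components first, then fill the buffer from the end, prepending '/' between names. A standalone sketch under those simplifying assumptions (struct node is a made-up stand-in for dentries):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct node {
	const char *name;
	struct node *parent;		/* NULL at the root */
};

static char *build_path(struct node *leaf)
{
	size_t len = 0, pos;
	struct node *n;
	char *path;

	for (n = leaf; n->parent; n = n->parent)
		len += 1 + strlen(n->name);	/* '/' + component */
	if (len)
		len--;				/* no leading '/' */

	path = malloc(len + 1);
	if (!path)
		return NULL;
	pos = len;
	path[pos] = '\0';			/* trailing null */
	for (n = leaf; n->parent; n = n->parent) {
		pos -= strlen(n->name);
		memcpy(path + pos, n->name, strlen(n->name));
		if (pos)
			path[--pos] = '/';
	}
	return path;
}

int main(void)
{
	struct node root = { "", NULL };
	struct node a = { "a", &root }, b = { "b", &a }, c = { "c", &b };
	char *p = build_path(&c);

	if (p)
		printf("%s\n", p);		/* prints a/b/c */
	free(p);
	return 0;
}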

static int build_dentry_path(struct dentry *dentry,
			     const char **ppath, int *ppathlen, u64 *pino,
			     int *pfreepath)
{
	char *path;

	if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) {
		*pino = ceph_ino(dentry->d_parent->d_inode);
		*ppath = dentry->d_name.name;
		*ppathlen = dentry->d_name.len;
		return 0;
	}
	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
	if (IS_ERR(path))
		return PTR_ERR(path);
	*ppath = path;
	*pfreepath = 1;
	return 0;
}

static int build_inode_path(struct inode *inode,
			    const char **ppath, int *ppathlen, u64 *pino,
			    int *pfreepath)
{
	struct dentry *dentry;
	char *path;

	if (ceph_snap(inode) == CEPH_NOSNAP) {
		*pino = ceph_ino(inode);
		*ppathlen = 0;
		return 0;
	}
	dentry = d_find_alias(inode);
	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
	dput(dentry);
	if (IS_ERR(path))
		return PTR_ERR(path);
	*ppath = path;
	*pfreepath = 1;
	return 0;
}

/*
 * request arguments may be specified via an inode *, a dentry *, or
 * an explicit ino+path.
 */
static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
				 const char *rpath, u64 rino,
				 const char **ppath, int *pathlen,
				 u64 *ino, int *freepath)
{
	int r = 0;

	if (rinode) {
		r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
		dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
		     ceph_snap(rinode));
	} else if (rdentry) {
		r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
		dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
		     *ppath);
	} else if (rpath || rino) {
		*ino = rino;
		*ppath = rpath;
		*pathlen = strlen(rpath);
		dout(" path %.*s\n", *pathlen, rpath);
	}

	return r;
}

/*
 * called under mdsc->mutex
 */
static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
					       struct ceph_mds_request *req,
					       int mds)
{
	struct ceph_msg *msg;
	struct ceph_mds_request_head *head;
	const char *path1 = NULL;
	const char *path2 = NULL;
	u64 ino1 = 0, ino2 = 0;
	int pathlen1 = 0, pathlen2 = 0;
	int freepath1 = 0, freepath2 = 0;
	int len;
	u16 releases;
	void *p, *end;
	int ret;

	ret = set_request_path_attr(req->r_inode, req->r_dentry,
				    req->r_path1, req->r_ino1.ino,
				    &path1, &pathlen1, &ino1, &freepath1);
	if (ret < 0) {
		msg = ERR_PTR(ret);
		goto out;
	}

	ret = set_request_path_attr(NULL, req->r_old_dentry,
				    req->r_path2, req->r_ino2.ino,
				    &path2, &pathlen2, &ino2, &freepath2);
	if (ret < 0) {
		msg = ERR_PTR(ret);
		goto out_free1;
	}

	len = sizeof(*head) +
		pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));

	/* calculate (max) length for cap releases */
	len += sizeof(struct ceph_mds_request_release) *
		(!!req->r_inode_drop + !!req->r_dentry_drop +
		 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
	if (req->r_dentry_drop)
		len += req->r_dentry->d_name.len;
	if (req->r_old_dentry_drop)
		len += req->r_old_dentry->d_name.len;

	msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false);
	if (!msg) {
		msg = ERR_PTR(-ENOMEM);
		goto out_free2;
	}

	msg->hdr.tid = cpu_to_le64(req->r_tid);

	head = msg->front.iov_base;
	p = msg->front.iov_base + sizeof(*head);
	end = msg->front.iov_base + msg->front.iov_len;

	head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
	head->op = cpu_to_le32(req->r_op);
	head->caller_uid = cpu_to_le32(req->r_uid);
	head->caller_gid = cpu_to_le32(req->r_gid);
	head->args = req->r_args;

	ceph_encode_filepath(&p, end, ino1, path1);
	ceph_encode_filepath(&p, end, ino2, path2);

	/* make note of release offset, in case we need to replay */
	req->r_request_release_offset = p - msg->front.iov_base;

	/* cap releases */
	releases = 0;
	if (req->r_inode_drop)
		releases += ceph_encode_inode_release(&p,
		      req->r_inode ? req->r_inode : req->r_dentry->d_inode,
		      mds, req->r_inode_drop, req->r_inode_unless, 0);
	if (req->r_dentry_drop)
		releases += ceph_encode_dentry_release(&p, req->r_dentry,
		       mds, req->r_dentry_drop, req->r_dentry_unless);
	if (req->r_old_dentry_drop)
		releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
		       mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
	if (req->r_old_inode_drop)
		releases += ceph_encode_inode_release(&p,
		      req->r_old_dentry->d_inode,
		      mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
	head->num_releases = cpu_to_le16(releases);

	BUG_ON(p > end);
	msg->front.iov_len = p - msg->front.iov_base;
	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);

	msg->pages = req->r_pages;
	msg->nr_pages = req->r_num_pages;
	msg->hdr.data_len = cpu_to_le32(req->r_data_len);
	msg->hdr.data_off = cpu_to_le16(0);

out_free2:
	if (freepath2)
		kfree((char *)path2);
out_free1:
	if (freepath1)
		kfree((char *)path1);
out:
	return msg;
}
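
The message construction above follows a compute-worst-case-then-encode discipline: allocate once for the maximum possible front length, advance a cursor while encoding, and assert the cursor never passes the end (the BUG_ON). A self-contained sketch of that pattern (encode_filepath() is an illustrative stand-in for ceph_encode_filepath(), and host byte order is assumed for brevity where the real encoder uses little-endian):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* made-up encoder: u64 ino, u32 length, then the path bytes */
static void encode_filepath(unsigned char **p, unsigned char *end,
			    uint64_t ino, const char *path)
{
	uint32_t len = (uint32_t)strlen(path);

	assert(*p + 8 + 4 + len <= end);	/* mirrors the BUG_ON */
	memcpy(*p, &ino, 8);	*p += 8;
	memcpy(*p, &len, 4);	*p += 4;
	memcpy(*p, path, len);	*p += len;
}

int main(void)
{
	const char *path = "dir/file";
	size_t max = 8 + 4 + strlen(path);	/* worst-case sizing */
	unsigned char *buf = malloc(max);
	unsigned char *p = buf;

	if (!buf)
		return 1;
	encode_filepath(&p, buf + max, 0x10000000001ULL, path);
	printf("encoded %zu of %zu bytes\n", (size_t)(p - buf), max);
	free(buf);
	return 0;
}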

/*
 * called under mdsc->mutex if error, under no mutex if
 * success.
 */
static void complete_request(struct ceph_mds_client *mdsc,
			     struct ceph_mds_request *req)
{
	if (req->r_callback)
		req->r_callback(mdsc, req);
	else
		complete_all(&req->r_completion);
}

/*
 * called under mdsc->mutex
 */
static int __prepare_send_request(struct ceph_mds_client *mdsc,
				  struct ceph_mds_request *req,
				  int mds)
{
	struct ceph_mds_request_head *rhead;
	struct ceph_msg *msg;
	int flags = 0;

	req->r_attempts++;
	if (req->r_inode) {
		struct ceph_cap *cap =
			ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds);

		if (cap)
			req->r_sent_on_mseq = cap->mseq;
		else
			req->r_sent_on_mseq = -1;
	}
	dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
	     req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);

	if (req->r_got_unsafe) {
		/*
		 * Replay. Do not regenerate message (and rebuild
		 * paths, etc.); just use the original message.
		 * Rebuilding paths will break for renames because
		 * d_move mangles the src name.
		 */
		msg = req->r_request;
		rhead = msg->front.iov_base;

		flags = le32_to_cpu(rhead->flags);
		flags |= CEPH_MDS_FLAG_REPLAY;
		rhead->flags = cpu_to_le32(flags);

		if (req->r_target_inode)
			rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));

		rhead->num_retry = req->r_attempts - 1;

		/* remove cap/dentry releases from message */
		rhead->num_releases = 0;
		msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset);
		msg->front.iov_len = req->r_request_release_offset;
		return 0;
	}

	if (req->r_request) {
		ceph_msg_put(req->r_request);
		req->r_request = NULL;
	}
	msg = create_request_message(mdsc, req, mds);
	if (IS_ERR(msg)) {
		req->r_err = PTR_ERR(msg);
		complete_request(mdsc, req);
		return PTR_ERR(msg);
	}
	req->r_request = msg;

	rhead = msg->front.iov_base;
	rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
	if (req->r_got_unsafe)
		flags |= CEPH_MDS_FLAG_REPLAY;
	if (req->r_locked_dir)
		flags |= CEPH_MDS_FLAG_WANT_DENTRY;
	rhead->flags = cpu_to_le32(flags);
	rhead->num_fwd = req->r_num_fwd;
	rhead->num_retry = req->r_attempts - 1;
	rhead->ino = 0;

	dout(" r_locked_dir = %p\n", req->r_locked_dir);
	return 0;
}
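
The replay branch works because create_request_message() recorded where the cap releases start: on resend, the message is simply truncated back to that offset instead of being re-encoded (which would break for renames). A toy sketch of save-an-offset-then-truncate (illustrative, using a plain string in place of a wire message):

#include <stdio.h>

int main(void)
{
	char msg[64];
	size_t len = 0, release_offset;

	len += (size_t)snprintf(msg + len, sizeof(msg) - len, "head+paths");
	release_offset = len;	/* like r_request_release_offset */
	len += (size_t)snprintf(msg + len, sizeof(msg) - len, "+releases");

	printf("first send: %.*s\n", (int)len, msg);
	len = release_offset;	/* replay: drop the releases tail */
	printf("replay:     %.*s\n", (int)len, msg);
	return 0;
}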

/*
 * send request, or put it on the appropriate wait list.
 */
static int __do_request(struct ceph_mds_client *mdsc,
			struct ceph_mds_request *req)
{
	struct ceph_mds_session *session = NULL;
	int mds = -1;
	int err = -EAGAIN;

	if (req->r_err || req->r_got_result)
		goto out;

	if (req->r_timeout &&
	    time_after_eq(jiffies, req->r_started + req->r_timeout)) {
		dout("do_request timed out\n");
		err = -EIO;
		goto finish;
	}

	put_request_session(req);

	mds = __choose_mds(mdsc, req);
	if (mds < 0 ||
	    ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
		dout("do_request no mds or not active, waiting for map\n");
		list_add(&req->r_wait, &mdsc->waiting_for_map);
		goto out;
	}

	/* get, open session */
	session = __ceph_lookup_mds_session(mdsc, mds);
	if (!session) {
		session = register_session(mdsc, mds);
		if (IS_ERR(session)) {
			err = PTR_ERR(session);
			goto finish;
		}
	}
	req->r_session = get_session(session);

	dout("do_request mds%d session %p state %s\n", mds, session,
	     session_state_name(session->s_state));
	if (session->s_state != CEPH_MDS_SESSION_OPEN &&
	    session->s_state != CEPH_MDS_SESSION_HUNG) {
		if (session->s_state == CEPH_MDS_SESSION_NEW ||
		    session->s_state == CEPH_MDS_SESSION_CLOSING)
			__open_session(mdsc, session);
		list_add(&req->r_wait, &session->s_waiting);
		goto out_session;
	}

	/* send request */
	req->r_resend_mds = -1;	/* forget any previous mds hint */

	if (req->r_request_started == 0)	/* note request start time */
		req->r_request_started = jiffies;

	err = __prepare_send_request(mdsc, req, mds);
	if (!err) {
		ceph_msg_get(req->r_request);
		ceph_con_send(&session->s_con, req->r_request);
	}

out_session:
	ceph_put_mds_session(session);
out:
	return err;

finish:
	req->r_err = err;
	complete_request(mdsc, req);
	goto out;
}

/*
 * called under mdsc->mutex
 */
static void __wake_requests(struct ceph_mds_client *mdsc,
			    struct list_head *head)
{
	struct ceph_mds_request *req, *nreq;

	list_for_each_entry_safe(req, nreq, head, r_wait) {
		list_del_init(&req->r_wait);
		__do_request(mdsc, req);
	}
}

/*
 * Wake up threads with requests pending for @mds, so that they can
 * resubmit their requests to a possibly different mds.
 */
static void kick_requests(struct ceph_mds_client *mdsc, int mds)
{
	struct ceph_mds_request *req;
	struct rb_node *p;

	dout("kick_requests mds%d\n", mds);
	for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) {
		req = rb_entry(p, struct ceph_mds_request, r_node);
		if (req->r_got_unsafe)
			continue;
		if (req->r_session &&
		    req->r_session->s_mds == mds) {
			dout(" kicking tid %llu\n", req->r_tid);
			__do_request(mdsc, req);
		}
	}
}

void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
			      struct ceph_mds_request *req)
{
	dout("submit_request on %p\n", req);
	mutex_lock(&mdsc->mutex);
	__register_request(mdsc, req, NULL);
	__do_request(mdsc, req);
	mutex_unlock(&mdsc->mutex);
}

/*
 * Synchronously perform an mds request. Take care of all of the
 * session setup, forwarding, retry details.
 */
int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
			 struct inode *dir,
			 struct ceph_mds_request *req)
{
	int err;

	dout("do_request on %p\n", req);

	/* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
	if (req->r_inode)
		ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
	if (req->r_locked_dir)
		ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
	if (req->r_old_dentry)
		ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
				  CEPH_CAP_PIN);

	/* issue */
	mutex_lock(&mdsc->mutex);
	__register_request(mdsc, req, dir);
	__do_request(mdsc, req);

	if (req->r_err) {
		err = req->r_err;
		__unregister_request(mdsc, req);
		dout("do_request early error %d\n", err);
		goto out;
	}

	/* wait */
	mutex_unlock(&mdsc->mutex);
	dout("do_request waiting\n");
	if (req->r_timeout) {
		err = (long)wait_for_completion_killable_timeout(
			&req->r_completion, req->r_timeout);
		if (err == 0)
			err = -EIO;
	} else {
		err = wait_for_completion_killable(&req->r_completion);
	}
	dout("do_request waited, got %d\n", err);
	mutex_lock(&mdsc->mutex);

	/* only abort if we didn't race with a real reply */
	if (req->r_got_result) {
		err = le32_to_cpu(req->r_reply_info.head->result);
	} else if (err < 0) {
		dout("aborted request %lld with %d\n", req->r_tid, err);

		/*
		 * ensure we aren't running concurrently with
		 * ceph_fill_trace or ceph_readdir_prepopulate, which
		 * rely on locks (dir mutex) held by our caller.
		 */
		mutex_lock(&req->r_fill_mutex);
		req->r_err = err;
		req->r_aborted = true;
		mutex_unlock(&req->r_fill_mutex);

		if (req->r_locked_dir &&
		    (req->r_op & CEPH_MDS_OP_WRITE))
			ceph_invalidate_dir_request(req);
	} else {
		err = req->r_err;
	}

out:
	mutex_unlock(&mdsc->mutex);
	dout("do_request %p done, result %d\n", req, err);
	return err;
}
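
The result handling after the wait is easy to misread: the timed variant returns 0 on timeout (which the code maps to -EIO), a negative value when interrupted, and a positive remaining-time value on success. A small sketch of just that mapping (map_wait_result() is a made-up helper, with wait_ret standing in for the completion API's return value):

#include <errno.h>
#include <stdio.h>

/* timed==0 models the untimed killable wait */
static int map_wait_result(long wait_ret, int timed)
{
	if (timed && wait_ret == 0)
		return -EIO;		/* timed out */
	if (wait_ret < 0)
		return (int)wait_ret;	/* interrupted by a signal */
	return 0;			/* request completed */
}

int main(void)
{
	printf("%d\n", map_wait_result(0, 1));	/* timeout -> -EIO (-5) */
	printf("%d\n", map_wait_result(25, 1));	/* done with time to spare */
	printf("%d\n", map_wait_result(-4, 0));	/* interrupted */
	return 0;
}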

/*
 * Invalidate dir D_COMPLETE, dentry lease state on an aborted MDS
 * namespace request.
 */
void ceph_invalidate_dir_request(struct ceph_mds_request *req)
{
	struct inode *inode = req->r_locked_dir;
	struct ceph_inode_info *ci = ceph_inode(inode);

	dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode);
	spin_lock(&ci->i_ceph_lock);
	ceph_dir_clear_complete(inode);
	ci->i_release_count++;
	spin_unlock(&ci->i_ceph_lock);

	if (req->r_dentry)
		ceph_invalidate_dentry_lease(req->r_dentry);
	if (req->r_old_dentry)
		ceph_invalidate_dentry_lease(req->r_old_dentry);
}
2025 2029
2026 /* 2030 /*
2027 * Handle mds reply. 2031 * Handle mds reply.
2028 * 2032 *
2029 * We take the session mutex and parse and process the reply immediately. 2033 * We take the session mutex and parse and process the reply immediately.
2030 * This preserves the logical ordering of replies, capabilities, etc., sent 2034 * This preserves the logical ordering of replies, capabilities, etc., sent
2031 * by the MDS as they are applied to our local cache. 2035 * by the MDS as they are applied to our local cache.
2032 */ 2036 */
2033 static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) 2037 static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2034 { 2038 {
2035 struct ceph_mds_client *mdsc = session->s_mdsc; 2039 struct ceph_mds_client *mdsc = session->s_mdsc;
2036 struct ceph_mds_request *req; 2040 struct ceph_mds_request *req;
2037 struct ceph_mds_reply_head *head = msg->front.iov_base; 2041 struct ceph_mds_reply_head *head = msg->front.iov_base;
2038 struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */ 2042 struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
2039 u64 tid; 2043 u64 tid;
2040 int err, result; 2044 int err, result;
2041 int mds = session->s_mds; 2045 int mds = session->s_mds;
2042 2046
2043 if (msg->front.iov_len < sizeof(*head)) { 2047 if (msg->front.iov_len < sizeof(*head)) {
2044 pr_err("mdsc_handle_reply got corrupt (short) reply\n"); 2048 pr_err("mdsc_handle_reply got corrupt (short) reply\n");
2045 ceph_msg_dump(msg); 2049 ceph_msg_dump(msg);
2046 return; 2050 return;
2047 } 2051 }
2048 2052
2049 /* get request, session */ 2053 /* get request, session */
2050 tid = le64_to_cpu(msg->hdr.tid); 2054 tid = le64_to_cpu(msg->hdr.tid);
2051 mutex_lock(&mdsc->mutex); 2055 mutex_lock(&mdsc->mutex);
2052 req = __lookup_request(mdsc, tid); 2056 req = __lookup_request(mdsc, tid);
2053 if (!req) { 2057 if (!req) {
2054 dout("handle_reply on unknown tid %llu\n", tid); 2058 dout("handle_reply on unknown tid %llu\n", tid);
2055 mutex_unlock(&mdsc->mutex); 2059 mutex_unlock(&mdsc->mutex);
2056 return; 2060 return;
2057 } 2061 }
2058 dout("handle_reply %p\n", req); 2062 dout("handle_reply %p\n", req);
2059 2063
2060 /* correct session? */ 2064 /* correct session? */
2061 if (req->r_session != session) { 2065 if (req->r_session != session) {
2062 pr_err("mdsc_handle_reply got %llu on session mds%d" 2066 pr_err("mdsc_handle_reply got %llu on session mds%d"
2063 " not mds%d\n", tid, session->s_mds, 2067 " not mds%d\n", tid, session->s_mds,
2064 req->r_session ? req->r_session->s_mds : -1); 2068 req->r_session ? req->r_session->s_mds : -1);
2065 mutex_unlock(&mdsc->mutex); 2069 mutex_unlock(&mdsc->mutex);
2066 goto out; 2070 goto out;
2067 } 2071 }
2068 2072
2069 /* dup? */ 2073 /* dup? */
2070 if ((req->r_got_unsafe && !head->safe) || 2074 if ((req->r_got_unsafe && !head->safe) ||
2071 (req->r_got_safe && head->safe)) { 2075 (req->r_got_safe && head->safe)) {
2072 pr_warning("got a dup %s reply on %llu from mds%d\n", 2076 pr_warning("got a dup %s reply on %llu from mds%d\n",
2073 head->safe ? "safe" : "unsafe", tid, mds); 2077 head->safe ? "safe" : "unsafe", tid, mds);
2074 mutex_unlock(&mdsc->mutex); 2078 mutex_unlock(&mdsc->mutex);
2075 goto out; 2079 goto out;
2076 } 2080 }
2077 if (req->r_got_safe && !head->safe) { 2081 if (req->r_got_safe && !head->safe) {
2078 pr_warning("got unsafe after safe on %llu from mds%d\n", 2082 pr_warning("got unsafe after safe on %llu from mds%d\n",
2079 tid, mds); 2083 tid, mds);
2080 mutex_unlock(&mdsc->mutex); 2084 mutex_unlock(&mdsc->mutex);
2081 goto out; 2085 goto out;
2082 } 2086 }
2083 2087
2084 result = le32_to_cpu(head->result); 2088 result = le32_to_cpu(head->result);
2085 2089
2086 /* 2090 /*
2087 * Handle an ESTALE 2091 * Handle an ESTALE
2088 * if we're not talking to the authority, send to them 2092 * if we're not talking to the authority, send to them
2089 * if the authority has changed while we weren't looking, 2093 * if the authority has changed while we weren't looking,
2090 * send to new authority 2094 * send to new authority
2091 * Otherwise we just have to return an ESTALE 2095 * Otherwise we just have to return an ESTALE
2092 */ 2096 */
2093 if (result == -ESTALE) { 2097 if (result == -ESTALE) {
2094 dout("got ESTALE on request %llu", req->r_tid); 2098 dout("got ESTALE on request %llu", req->r_tid);
2095 if (!req->r_inode) { 2099 if (!req->r_inode) {
2096 /* do nothing; not an authority problem */ 2100 /* do nothing; not an authority problem */
2097 } else if (req->r_direct_mode != USE_AUTH_MDS) { 2101 } else if (req->r_direct_mode != USE_AUTH_MDS) {
2098 dout("not using auth, setting for that now"); 2102 dout("not using auth, setting for that now");
2099 req->r_direct_mode = USE_AUTH_MDS; 2103 req->r_direct_mode = USE_AUTH_MDS;
2100 __do_request(mdsc, req); 2104 __do_request(mdsc, req);
2101 mutex_unlock(&mdsc->mutex); 2105 mutex_unlock(&mdsc->mutex);
2102 goto out; 2106 goto out;
2103 } else { 2107 } else {
2104 struct ceph_inode_info *ci = ceph_inode(req->r_inode); 2108 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
2105 struct ceph_cap *cap = NULL; 2109 struct ceph_cap *cap = NULL;
2106 2110
2107 if (req->r_session) 2111 if (req->r_session)
2108 cap = ceph_get_cap_for_mds(ci, 2112 cap = ceph_get_cap_for_mds(ci,
2109 req->r_session->s_mds); 2113 req->r_session->s_mds);
2110 2114
2111 dout("already using auth"); 2115 dout("already using auth");
2112 if ((!cap || cap != ci->i_auth_cap) || 2116 if ((!cap || cap != ci->i_auth_cap) ||
2113 (cap->mseq != req->r_sent_on_mseq)) { 2117 (cap->mseq != req->r_sent_on_mseq)) {
2114 dout("but cap changed, so resending"); 2118 dout("but cap changed, so resending");
2115 __do_request(mdsc, req); 2119 __do_request(mdsc, req);
2116 mutex_unlock(&mdsc->mutex); 2120 mutex_unlock(&mdsc->mutex);
2117 goto out; 2121 goto out;
2118 } 2122 }
2119 } 2123 }
2120 dout("have to return ESTALE on request %llu", req->r_tid); 2124 dout("have to return ESTALE on request %llu", req->r_tid);
2121 } 2125 }
2122 2126
2123 2127
2124 if (head->safe) { 2128 if (head->safe) {
2125 req->r_got_safe = true; 2129 req->r_got_safe = true;
2126 __unregister_request(mdsc, req); 2130 __unregister_request(mdsc, req);
2127 complete_all(&req->r_safe_completion); 2131 complete_all(&req->r_safe_completion);
2128 2132
2129 if (req->r_got_unsafe) { 2133 if (req->r_got_unsafe) {
2130 /* 2134 /*
2131 * We already handled the unsafe response, now do the 2135 * We already handled the unsafe response, now do the
2132 * cleanup. No need to examine the response; the MDS 2136 * cleanup. No need to examine the response; the MDS
2133 * doesn't include any result info in the safe 2137 * doesn't include any result info in the safe
2134 * response. And even if it did, there is nothing 2138 * response. And even if it did, there is nothing
2135 * useful we could do with a revised return value. 2139 * useful we could do with a revised return value.
2136 */ 2140 */
2137 dout("got safe reply %llu, mds%d\n", tid, mds); 2141 dout("got safe reply %llu, mds%d\n", tid, mds);
2138 list_del_init(&req->r_unsafe_item); 2142 list_del_init(&req->r_unsafe_item);
2139 2143
2140 /* last unsafe request during umount? */ 2144 /* last unsafe request during umount? */
2141 if (mdsc->stopping && !__get_oldest_req(mdsc)) 2145 if (mdsc->stopping && !__get_oldest_req(mdsc))
2142 complete_all(&mdsc->safe_umount_waiters); 2146 complete_all(&mdsc->safe_umount_waiters);
2143 mutex_unlock(&mdsc->mutex); 2147 mutex_unlock(&mdsc->mutex);
2144 goto out; 2148 goto out;
2145 } 2149 }
2146 } else { 2150 } else {
2147 req->r_got_unsafe = true; 2151 req->r_got_unsafe = true;
2148 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); 2152 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
2149 } 2153 }
2150 2154
2151 dout("handle_reply tid %lld result %d\n", tid, result); 2155 dout("handle_reply tid %lld result %d\n", tid, result);
2152 rinfo = &req->r_reply_info; 2156 rinfo = &req->r_reply_info;
2153 err = parse_reply_info(msg, rinfo, session->s_con.peer_features); 2157 err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
2154 mutex_unlock(&mdsc->mutex); 2158 mutex_unlock(&mdsc->mutex);
2155 2159
2156 mutex_lock(&session->s_mutex); 2160 mutex_lock(&session->s_mutex);
2157 if (err < 0) { 2161 if (err < 0) {
2158 pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid); 2162 pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
2159 ceph_msg_dump(msg); 2163 ceph_msg_dump(msg);
2160 goto out_err; 2164 goto out_err;
2161 } 2165 }
2162 2166
2163 /* snap trace */ 2167 /* snap trace */
2164 if (rinfo->snapblob_len) { 2168 if (rinfo->snapblob_len) {
2165 down_write(&mdsc->snap_rwsem); 2169 down_write(&mdsc->snap_rwsem);
2166 ceph_update_snap_trace(mdsc, rinfo->snapblob, 2170 ceph_update_snap_trace(mdsc, rinfo->snapblob,
2167 rinfo->snapblob + rinfo->snapblob_len, 2171 rinfo->snapblob + rinfo->snapblob_len,
2168 le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP); 2172 le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
2169 downgrade_write(&mdsc->snap_rwsem); 2173 downgrade_write(&mdsc->snap_rwsem);
2170 } else { 2174 } else {
2171 down_read(&mdsc->snap_rwsem); 2175 down_read(&mdsc->snap_rwsem);
2172 } 2176 }
2173 2177
2174 /* insert trace into our cache */ 2178 /* insert trace into our cache */
2175 mutex_lock(&req->r_fill_mutex); 2179 mutex_lock(&req->r_fill_mutex);
2176 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); 2180 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
2177 if (err == 0) { 2181 if (err == 0) {
2178 if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK && 2182 if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK &&
2179 rinfo->dir_nr) 2183 rinfo->dir_nr)
2180 ceph_readdir_prepopulate(req, req->r_session); 2184 ceph_readdir_prepopulate(req, req->r_session);
2181 ceph_unreserve_caps(mdsc, &req->r_caps_reservation); 2185 ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
2182 } 2186 }
2183 mutex_unlock(&req->r_fill_mutex); 2187 mutex_unlock(&req->r_fill_mutex);
2184 2188
2185 up_read(&mdsc->snap_rwsem); 2189 up_read(&mdsc->snap_rwsem);
2186 out_err: 2190 out_err:
2187 mutex_lock(&mdsc->mutex); 2191 mutex_lock(&mdsc->mutex);
2188 if (!req->r_aborted) { 2192 if (!req->r_aborted) {
2189 if (err) { 2193 if (err) {
2190 req->r_err = err; 2194 req->r_err = err;
2191 } else { 2195 } else {
2192 req->r_reply = msg; 2196 req->r_reply = msg;
2193 ceph_msg_get(msg); 2197 ceph_msg_get(msg);
2194 req->r_got_result = true; 2198 req->r_got_result = true;
2195 } 2199 }
2196 } else { 2200 } else {
2197 dout("reply arrived after request %lld was aborted\n", tid); 2201 dout("reply arrived after request %lld was aborted\n", tid);
2198 } 2202 }
2199 mutex_unlock(&mdsc->mutex); 2203 mutex_unlock(&mdsc->mutex);
2200 2204
2201 ceph_add_cap_releases(mdsc, req->r_session); 2205 ceph_add_cap_releases(mdsc, req->r_session);
2202 mutex_unlock(&session->s_mutex); 2206 mutex_unlock(&session->s_mutex);
2203 2207
2204 /* kick calling process */ 2208 /* kick calling process */
2205 complete_request(mdsc, req); 2209 complete_request(mdsc, req);
2206 out: 2210 out:
2207 ceph_mdsc_put_request(req); 2211 ceph_mdsc_put_request(req);
2208 return; 2212 return;
2209 } 2213 }
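
The dup/ordering checks near the top of handle_reply() implement a small two-phase protocol: each request may see at most one "unsafe" (applied) reply followed by at most one "safe" (committed) reply, never the reverse. A hedged userspace sketch of just that bookkeeping; `struct request` here is a stand-in, not the kernel's `ceph_mds_request`:

```c
#include <stdbool.h>
#include <stdio.h>

struct request {
    bool got_unsafe;   /* applied by the MDS, not yet durable */
    bool got_safe;     /* committed; the request can be unregistered */
};

/* Returns 0 if the reply should be processed, -1 if it must be dropped. */
static int check_reply(struct request *req, bool safe)
{
    if ((req->got_unsafe && !safe) || (req->got_safe && safe))
        return -1;                 /* duplicate reply */
    if (req->got_safe && !safe)
        return -1;                 /* unsafe after safe: out of order */
    if (safe)
        req->got_safe = true;
    else
        req->got_unsafe = true;
    return 0;
}

int main(void)
{
    struct request req = { false, false };
    printf("unsafe: %d\n", check_reply(&req, false)); /* 0: accepted */
    printf("safe:   %d\n", check_reply(&req, true));  /* 0: accepted */
    printf("dup:    %d\n", check_reply(&req, true));  /* -1: dropped */
    return 0;
}
```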
2210 2214
2211 2215
2212 2216
2213 /* 2217 /*
2214 * handle mds notification that our request has been forwarded. 2218 * handle mds notification that our request has been forwarded.
2215 */ 2219 */
2216 static void handle_forward(struct ceph_mds_client *mdsc, 2220 static void handle_forward(struct ceph_mds_client *mdsc,
2217 struct ceph_mds_session *session, 2221 struct ceph_mds_session *session,
2218 struct ceph_msg *msg) 2222 struct ceph_msg *msg)
2219 { 2223 {
2220 struct ceph_mds_request *req; 2224 struct ceph_mds_request *req;
2221 u64 tid = le64_to_cpu(msg->hdr.tid); 2225 u64 tid = le64_to_cpu(msg->hdr.tid);
2222 u32 next_mds; 2226 u32 next_mds;
2223 u32 fwd_seq; 2227 u32 fwd_seq;
2224 int err = -EINVAL; 2228 int err = -EINVAL;
2225 void *p = msg->front.iov_base; 2229 void *p = msg->front.iov_base;
2226 void *end = p + msg->front.iov_len; 2230 void *end = p + msg->front.iov_len;
2227 2231
2228 ceph_decode_need(&p, end, 2*sizeof(u32), bad); 2232 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
2229 next_mds = ceph_decode_32(&p); 2233 next_mds = ceph_decode_32(&p);
2230 fwd_seq = ceph_decode_32(&p); 2234 fwd_seq = ceph_decode_32(&p);
2231 2235
2232 mutex_lock(&mdsc->mutex); 2236 mutex_lock(&mdsc->mutex);
2233 req = __lookup_request(mdsc, tid); 2237 req = __lookup_request(mdsc, tid);
2234 if (!req) { 2238 if (!req) {
2235 dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); 2239 dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
2236 goto out; /* dup reply? */ 2240 goto out; /* dup reply? */
2237 } 2241 }
2238 2242
2239 if (req->r_aborted) { 2243 if (req->r_aborted) {
2240 dout("forward tid %llu aborted, unregistering\n", tid); 2244 dout("forward tid %llu aborted, unregistering\n", tid);
2241 __unregister_request(mdsc, req); 2245 __unregister_request(mdsc, req);
2242 } else if (fwd_seq <= req->r_num_fwd) { 2246 } else if (fwd_seq <= req->r_num_fwd) {
2243 dout("forward tid %llu to mds%d - old seq %d <= %d\n", 2247 dout("forward tid %llu to mds%d - old seq %d <= %d\n",
2244 tid, next_mds, fwd_seq, req->r_num_fwd); 2248 tid, next_mds, fwd_seq, req->r_num_fwd);
2245 } else { 2249 } else {
2246 /* resend. forward race not possible; mds would drop */ 2250 /* resend. forward race not possible; mds would drop */
2247 dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); 2251 dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
2248 BUG_ON(req->r_err); 2252 BUG_ON(req->r_err);
2249 BUG_ON(req->r_got_result); 2253 BUG_ON(req->r_got_result);
2250 req->r_num_fwd = fwd_seq; 2254 req->r_num_fwd = fwd_seq;
2251 req->r_resend_mds = next_mds; 2255 req->r_resend_mds = next_mds;
2252 put_request_session(req); 2256 put_request_session(req);
2253 __do_request(mdsc, req); 2257 __do_request(mdsc, req);
2254 } 2258 }
2255 ceph_mdsc_put_request(req); 2259 ceph_mdsc_put_request(req);
2256 out: 2260 out:
2257 mutex_unlock(&mdsc->mutex); 2261 mutex_unlock(&mdsc->mutex);
2258 return; 2262 return;
2259 2263
2260 bad: 2264 bad:
2261 pr_err("mdsc_handle_forward decode error err=%d\n", err); 2265 pr_err("mdsc_handle_forward decode error err=%d\n", err);
2262 } 2266 }
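
The load-bearing check in handle_forward() is the sequence comparison: a forward notification is acted on only if fwd_seq is strictly newer than the last forward recorded on the request, so a replayed or reordered notification is ignored rather than triggering a spurious resend. A stand-alone sketch (field names mirror the kernel's; the types are invented):

```c
#include <stdint.h>
#include <stdio.h>

struct request {
    uint32_t num_fwd;    /* highest forward seq accepted so far */
    int resend_mds;      /* where to resend next; -1 = unset */
};

static int handle_forward(struct request *req, uint32_t fwd_seq, int next_mds)
{
    if (fwd_seq <= req->num_fwd)
        return -1;               /* old or duplicate forward: drop it */
    req->num_fwd = fwd_seq;      /* remember how far we have been bounced */
    req->resend_mds = next_mds;  /* resend to the MDS the server named */
    return 0;
}

int main(void)
{
    struct request req = { 0, -1 };
    printf("%d\n", handle_forward(&req, 1, 3));  /* 0: accepted */
    printf("%d\n", handle_forward(&req, 1, 5));  /* -1: replay ignored */
    return 0;
}
```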
2263 2267
2264 /* 2268 /*
2265 * handle a mds session control message 2269 * handle a mds session control message
2266 */ 2270 */
2267 static void handle_session(struct ceph_mds_session *session, 2271 static void handle_session(struct ceph_mds_session *session,
2268 struct ceph_msg *msg) 2272 struct ceph_msg *msg)
2269 { 2273 {
2270 struct ceph_mds_client *mdsc = session->s_mdsc; 2274 struct ceph_mds_client *mdsc = session->s_mdsc;
2271 u32 op; 2275 u32 op;
2272 u64 seq; 2276 u64 seq;
2273 int mds = session->s_mds; 2277 int mds = session->s_mds;
2274 struct ceph_mds_session_head *h = msg->front.iov_base; 2278 struct ceph_mds_session_head *h = msg->front.iov_base;
2275 int wake = 0; 2279 int wake = 0;
2276 2280
2277 /* decode */ 2281 /* decode */
2278 if (msg->front.iov_len != sizeof(*h)) 2282 if (msg->front.iov_len != sizeof(*h))
2279 goto bad; 2283 goto bad;
2280 op = le32_to_cpu(h->op); 2284 op = le32_to_cpu(h->op);
2281 seq = le64_to_cpu(h->seq); 2285 seq = le64_to_cpu(h->seq);
2282 2286
2283 mutex_lock(&mdsc->mutex); 2287 mutex_lock(&mdsc->mutex);
2284 if (op == CEPH_SESSION_CLOSE) 2288 if (op == CEPH_SESSION_CLOSE)
2285 __unregister_session(mdsc, session); 2289 __unregister_session(mdsc, session);
2286 /* FIXME: this ttl calculation is generous */ 2290 /* FIXME: this ttl calculation is generous */
2287 session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose; 2291 session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
2288 mutex_unlock(&mdsc->mutex); 2292 mutex_unlock(&mdsc->mutex);
2289 2293
2290 mutex_lock(&session->s_mutex); 2294 mutex_lock(&session->s_mutex);
2291 2295
2292 dout("handle_session mds%d %s %p state %s seq %llu\n", 2296 dout("handle_session mds%d %s %p state %s seq %llu\n",
2293 mds, ceph_session_op_name(op), session, 2297 mds, ceph_session_op_name(op), session,
2294 session_state_name(session->s_state), seq); 2298 session_state_name(session->s_state), seq);
2295 2299
2296 if (session->s_state == CEPH_MDS_SESSION_HUNG) { 2300 if (session->s_state == CEPH_MDS_SESSION_HUNG) {
2297 session->s_state = CEPH_MDS_SESSION_OPEN; 2301 session->s_state = CEPH_MDS_SESSION_OPEN;
2298 pr_info("mds%d came back\n", session->s_mds); 2302 pr_info("mds%d came back\n", session->s_mds);
2299 } 2303 }
2300 2304
2301 switch (op) { 2305 switch (op) {
2302 case CEPH_SESSION_OPEN: 2306 case CEPH_SESSION_OPEN:
2303 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) 2307 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2304 pr_info("mds%d reconnect success\n", session->s_mds); 2308 pr_info("mds%d reconnect success\n", session->s_mds);
2305 session->s_state = CEPH_MDS_SESSION_OPEN; 2309 session->s_state = CEPH_MDS_SESSION_OPEN;
2306 renewed_caps(mdsc, session, 0); 2310 renewed_caps(mdsc, session, 0);
2307 wake = 1; 2311 wake = 1;
2308 if (mdsc->stopping) 2312 if (mdsc->stopping)
2309 __close_session(mdsc, session); 2313 __close_session(mdsc, session);
2310 break; 2314 break;
2311 2315
2312 case CEPH_SESSION_RENEWCAPS: 2316 case CEPH_SESSION_RENEWCAPS:
2313 if (session->s_renew_seq == seq) 2317 if (session->s_renew_seq == seq)
2314 renewed_caps(mdsc, session, 1); 2318 renewed_caps(mdsc, session, 1);
2315 break; 2319 break;
2316 2320
2317 case CEPH_SESSION_CLOSE: 2321 case CEPH_SESSION_CLOSE:
2318 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) 2322 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2319 pr_info("mds%d reconnect denied\n", session->s_mds); 2323 pr_info("mds%d reconnect denied\n", session->s_mds);
2320 remove_session_caps(session); 2324 remove_session_caps(session);
2321 wake = 1; /* for good measure */ 2325 wake = 1; /* for good measure */
2322 wake_up_all(&mdsc->session_close_wq); 2326 wake_up_all(&mdsc->session_close_wq);
2323 kick_requests(mdsc, mds); 2327 kick_requests(mdsc, mds);
2324 break; 2328 break;
2325 2329
2326 case CEPH_SESSION_STALE: 2330 case CEPH_SESSION_STALE:
2327 pr_info("mds%d caps went stale, renewing\n", 2331 pr_info("mds%d caps went stale, renewing\n",
2328 session->s_mds); 2332 session->s_mds);
2329 spin_lock(&session->s_cap_lock); 2333 spin_lock(&session->s_gen_ttl_lock);
2330 session->s_cap_gen++; 2334 session->s_cap_gen++;
2331 session->s_cap_ttl = 0; 2335 session->s_cap_ttl = 0;
2332 spin_unlock(&session->s_cap_lock); 2336 spin_unlock(&session->s_gen_ttl_lock);
2333 send_renew_caps(mdsc, session); 2337 send_renew_caps(mdsc, session);
2334 break; 2338 break;
2335 2339
2336 case CEPH_SESSION_RECALL_STATE: 2340 case CEPH_SESSION_RECALL_STATE:
2337 trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); 2341 trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
2338 break; 2342 break;
2339 2343
2340 default: 2344 default:
2341 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); 2345 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
2342 WARN_ON(1); 2346 WARN_ON(1);
2343 } 2347 }
2344 2348
2345 mutex_unlock(&session->s_mutex); 2349 mutex_unlock(&session->s_mutex);
2346 if (wake) { 2350 if (wake) {
2347 mutex_lock(&mdsc->mutex); 2351 mutex_lock(&mdsc->mutex);
2348 __wake_requests(mdsc, &session->s_waiting); 2352 __wake_requests(mdsc, &session->s_waiting);
2349 mutex_unlock(&mdsc->mutex); 2353 mutex_unlock(&mdsc->mutex);
2350 } 2354 }
2351 return; 2355 return;
2352 2356
2353 bad: 2357 bad:
2354 pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds, 2358 pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
2355 (int)msg->front.iov_len); 2359 (int)msg->front.iov_len);
2356 ceph_msg_dump(msg); 2360 ceph_msg_dump(msg);
2357 return; 2361 return;
2358 } 2362 }
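
The CEPH_SESSION_STALE hunk above is the visible edge of this diff's lock split: the cap generation/TTL pair moves from s_cap_lock to a dedicated s_gen_ttl_lock, so paths that only need to check cap validity never nest inside the heavier cap lock. A userspace sketch of that split, with a mutex standing in for the spinlock and the surrounding types invented for illustration:

```c
#include <pthread.h>
#include <time.h>

struct session {
    pthread_mutex_t gen_ttl_lock;  /* guards only the two fields below */
    unsigned cap_gen;
    time_t   cap_ttl;

    pthread_mutex_t cap_lock;      /* guards the (elided) cap lists */
};

/* STALE handling: bump the generation and kill the TTL, as above. */
static void caps_went_stale(struct session *s)
{
    pthread_mutex_lock(&s->gen_ttl_lock);
    s->cap_gen++;
    s->cap_ttl = 0;
    pthread_mutex_unlock(&s->gen_ttl_lock);
}

/* A validity check now runs without ever touching cap_lock. */
static int caps_valid(struct session *s, unsigned gen_seen)
{
    pthread_mutex_lock(&s->gen_ttl_lock);
    int ok = (gen_seen == s->cap_gen) && (s->cap_ttl > time(NULL));
    pthread_mutex_unlock(&s->gen_ttl_lock);
    return ok;
}

int main(void)
{
    struct session s = {
        .gen_ttl_lock = PTHREAD_MUTEX_INITIALIZER,
        .cap_lock     = PTHREAD_MUTEX_INITIALIZER,
    };
    caps_went_stale(&s);       /* gen 0 -> 1, ttl -> 0 */
    return caps_valid(&s, 0);  /* exits 0: the old generation is stale */
}
```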
2359 2363
2360 2364
2361 /* 2365 /*
2362 * called under session->mutex. 2366 * called under session->mutex.
2363 */ 2367 */
2364 static void replay_unsafe_requests(struct ceph_mds_client *mdsc, 2368 static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2365 struct ceph_mds_session *session) 2369 struct ceph_mds_session *session)
2366 { 2370 {
2367 struct ceph_mds_request *req, *nreq; 2371 struct ceph_mds_request *req, *nreq;
2368 int err; 2372 int err;
2369 2373
2370 dout("replay_unsafe_requests mds%d\n", session->s_mds); 2374 dout("replay_unsafe_requests mds%d\n", session->s_mds);
2371 2375
2372 mutex_lock(&mdsc->mutex); 2376 mutex_lock(&mdsc->mutex);
2373 list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) { 2377 list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
2374 err = __prepare_send_request(mdsc, req, session->s_mds); 2378 err = __prepare_send_request(mdsc, req, session->s_mds);
2375 if (!err) { 2379 if (!err) {
2376 ceph_msg_get(req->r_request); 2380 ceph_msg_get(req->r_request);
2377 ceph_con_send(&session->s_con, req->r_request); 2381 ceph_con_send(&session->s_con, req->r_request);
2378 } 2382 }
2379 } 2383 }
2380 mutex_unlock(&mdsc->mutex); 2384 mutex_unlock(&mdsc->mutex);
2381 } 2385 }
2382 2386
2383 /* 2387 /*
2384 * Encode information about a cap for a reconnect with the MDS. 2388 * Encode information about a cap for a reconnect with the MDS.
2385 */ 2389 */
2386 static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, 2390 static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2387 void *arg) 2391 void *arg)
2388 { 2392 {
2389 union { 2393 union {
2390 struct ceph_mds_cap_reconnect v2; 2394 struct ceph_mds_cap_reconnect v2;
2391 struct ceph_mds_cap_reconnect_v1 v1; 2395 struct ceph_mds_cap_reconnect_v1 v1;
2392 } rec; 2396 } rec;
2393 size_t reclen; 2397 size_t reclen;
2394 struct ceph_inode_info *ci; 2398 struct ceph_inode_info *ci;
2395 struct ceph_reconnect_state *recon_state = arg; 2399 struct ceph_reconnect_state *recon_state = arg;
2396 struct ceph_pagelist *pagelist = recon_state->pagelist; 2400 struct ceph_pagelist *pagelist = recon_state->pagelist;
2397 char *path; 2401 char *path;
2398 int pathlen, err; 2402 int pathlen, err;
2399 u64 pathbase; 2403 u64 pathbase;
2400 struct dentry *dentry; 2404 struct dentry *dentry;
2401 2405
2402 ci = cap->ci; 2406 ci = cap->ci;
2403 2407
2404 dout(" adding %p ino %llx.%llx cap %p %lld %s\n", 2408 dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
2405 inode, ceph_vinop(inode), cap, cap->cap_id, 2409 inode, ceph_vinop(inode), cap, cap->cap_id,
2406 ceph_cap_string(cap->issued)); 2410 ceph_cap_string(cap->issued));
2407 err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); 2411 err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
2408 if (err) 2412 if (err)
2409 return err; 2413 return err;
2410 2414
2411 dentry = d_find_alias(inode); 2415 dentry = d_find_alias(inode);
2412 if (dentry) { 2416 if (dentry) {
2413 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0); 2417 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
2414 if (IS_ERR(path)) { 2418 if (IS_ERR(path)) {
2415 err = PTR_ERR(path); 2419 err = PTR_ERR(path);
2416 goto out_dput; 2420 goto out_dput;
2417 } 2421 }
2418 } else { 2422 } else {
2419 path = NULL; 2423 path = NULL;
2420 pathlen = 0; 2424 pathlen = 0;
2421 } 2425 }
2422 err = ceph_pagelist_encode_string(pagelist, path, pathlen); 2426 err = ceph_pagelist_encode_string(pagelist, path, pathlen);
2423 if (err) 2427 if (err)
2424 goto out_free; 2428 goto out_free;
2425 2429
2426 spin_lock(&ci->i_ceph_lock); 2430 spin_lock(&ci->i_ceph_lock);
2427 cap->seq = 0; /* reset cap seq */ 2431 cap->seq = 0; /* reset cap seq */
2428 cap->issue_seq = 0; /* and issue_seq */ 2432 cap->issue_seq = 0; /* and issue_seq */
2429 2433
2430 if (recon_state->flock) { 2434 if (recon_state->flock) {
2431 rec.v2.cap_id = cpu_to_le64(cap->cap_id); 2435 rec.v2.cap_id = cpu_to_le64(cap->cap_id);
2432 rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); 2436 rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2433 rec.v2.issued = cpu_to_le32(cap->issued); 2437 rec.v2.issued = cpu_to_le32(cap->issued);
2434 rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); 2438 rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2435 rec.v2.pathbase = cpu_to_le64(pathbase); 2439 rec.v2.pathbase = cpu_to_le64(pathbase);
2436 rec.v2.flock_len = 0; 2440 rec.v2.flock_len = 0;
2437 reclen = sizeof(rec.v2); 2441 reclen = sizeof(rec.v2);
2438 } else { 2442 } else {
2439 rec.v1.cap_id = cpu_to_le64(cap->cap_id); 2443 rec.v1.cap_id = cpu_to_le64(cap->cap_id);
2440 rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); 2444 rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2441 rec.v1.issued = cpu_to_le32(cap->issued); 2445 rec.v1.issued = cpu_to_le32(cap->issued);
2442 rec.v1.size = cpu_to_le64(inode->i_size); 2446 rec.v1.size = cpu_to_le64(inode->i_size);
2443 ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime); 2447 ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime);
2444 ceph_encode_timespec(&rec.v1.atime, &inode->i_atime); 2448 ceph_encode_timespec(&rec.v1.atime, &inode->i_atime);
2445 rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); 2449 rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2446 rec.v1.pathbase = cpu_to_le64(pathbase); 2450 rec.v1.pathbase = cpu_to_le64(pathbase);
2447 reclen = sizeof(rec.v1); 2451 reclen = sizeof(rec.v1);
2448 } 2452 }
2449 spin_unlock(&ci->i_ceph_lock); 2453 spin_unlock(&ci->i_ceph_lock);
2450 2454
2451 if (recon_state->flock) { 2455 if (recon_state->flock) {
2452 int num_fcntl_locks, num_flock_locks; 2456 int num_fcntl_locks, num_flock_locks;
2453 struct ceph_pagelist_cursor trunc_point; 2457 struct ceph_pagelist_cursor trunc_point;
2454 2458
2455 ceph_pagelist_set_cursor(pagelist, &trunc_point); 2459 ceph_pagelist_set_cursor(pagelist, &trunc_point);
2456 do { 2460 do {
2457 lock_flocks(); 2461 lock_flocks();
2458 ceph_count_locks(inode, &num_fcntl_locks, 2462 ceph_count_locks(inode, &num_fcntl_locks,
2459 &num_flock_locks); 2463 &num_flock_locks);
2460 rec.v2.flock_len = (2*sizeof(u32) + 2464 rec.v2.flock_len = (2*sizeof(u32) +
2461 (num_fcntl_locks+num_flock_locks) * 2465 (num_fcntl_locks+num_flock_locks) *
2462 sizeof(struct ceph_filelock)); 2466 sizeof(struct ceph_filelock));
2463 unlock_flocks(); 2467 unlock_flocks();
2464 2468
2465 /* pre-alloc pagelist */ 2469 /* pre-alloc pagelist */
2466 ceph_pagelist_truncate(pagelist, &trunc_point); 2470 ceph_pagelist_truncate(pagelist, &trunc_point);
2467 err = ceph_pagelist_append(pagelist, &rec, reclen); 2471 err = ceph_pagelist_append(pagelist, &rec, reclen);
2468 if (!err) 2472 if (!err)
2469 err = ceph_pagelist_reserve(pagelist, 2473 err = ceph_pagelist_reserve(pagelist,
2470 rec.v2.flock_len); 2474 rec.v2.flock_len);
2471 2475
2472 /* encode locks */ 2476 /* encode locks */
2473 if (!err) { 2477 if (!err) {
2474 lock_flocks(); 2478 lock_flocks();
2475 err = ceph_encode_locks(inode, 2479 err = ceph_encode_locks(inode,
2476 pagelist, 2480 pagelist,
2477 num_fcntl_locks, 2481 num_fcntl_locks,
2478 num_flock_locks); 2482 num_flock_locks);
2479 unlock_flocks(); 2483 unlock_flocks();
2480 } 2484 }
2481 } while (err == -ENOSPC); 2485 } while (err == -ENOSPC);
2482 } else { 2486 } else {
2483 err = ceph_pagelist_append(pagelist, &rec, reclen); 2487 err = ceph_pagelist_append(pagelist, &rec, reclen);
2484 } 2488 }
2485 2489
2486 out_free: 2490 out_free:
2487 kfree(path); 2491 kfree(path);
2488 out_dput: 2492 out_dput:
2489 dput(dentry); 2493 dput(dentry);
2490 return err; 2494 return err;
2491 } 2495 }
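
The flock branch of encode_caps_cb() has to cope with the lock table changing between the sizing pass and the encoding pass, so it saves a pagelist cursor, reserves space from the measured size, and on -ENOSPC truncates back to the cursor and tries again. A self-contained sketch of that measure/reserve/encode retry against a plain fixed-size buffer; every name is invented (the kernel works on a ceph_pagelist, not this `struct buf`):

```c
#include <errno.h>
#include <stdio.h>
#include <string.h>

#define CAP 64

struct buf { char data[CAP]; size_t len; };

static size_t cursor(struct buf *b)                { return b->len; }
static void   truncate_to(struct buf *b, size_t c) { b->len = c; }

static int append(struct buf *b, const void *p, size_t n)
{
    if (b->len + n > CAP)
        return -ENOSPC;        /* mirrors the pagelist running out */
    memcpy(b->data + b->len, p, n);
    b->len += n;
    return 0;
}

/* Pretend lock table; the kernel reads it under lock_flocks(). */
static int count_locks(void) { return 3; }

static int encode_locks(struct buf *b, int n)
{
    for (int i = 0; i < n; i++) {
        int rc = append(b, &i, sizeof(i));
        if (rc)
            return rc;
    }
    return 0;
}

int main(void)
{
    struct buf b = { .len = 0 };
    size_t trunc_point = cursor(&b);   /* like ceph_pagelist_set_cursor() */
    int err;
    do {
        int n = count_locks();         /* measure (may change next round) */
        truncate_to(&b, trunc_point);  /* discard any partial attempt */
        err = encode_locks(&b, n);     /* encode; may hit -ENOSPC */
    } while (err == -ENOSPC);
    printf("encoded %zu bytes, err=%d\n", b.len, err);
    return 0;
}
```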
2492 2496
2493 2497
2494 /* 2498 /*
2495 * If an MDS fails and recovers, clients need to reconnect in order to 2499 * If an MDS fails and recovers, clients need to reconnect in order to
2496 * reestablish shared state. This includes all caps issued through 2500 * reestablish shared state. This includes all caps issued through
2497 * this session _and_ the snap_realm hierarchy. Because it's not 2501 * this session _and_ the snap_realm hierarchy. Because it's not
2498 * clear which snap realms the mds cares about, we send everything we 2502 * clear which snap realms the mds cares about, we send everything we
2499 * know about... that ensures we'll then get any new info the 2503 * know about... that ensures we'll then get any new info the
2500 * recovering MDS might have. 2504 * recovering MDS might have.
2501 * 2505 *
2502 * This is a relatively heavyweight operation, but it's rare. 2506 * This is a relatively heavyweight operation, but it's rare.
2503 * 2507 *
2504 * called with mdsc->mutex held. 2508 * called with mdsc->mutex held.
2505 */ 2509 */
2506 static void send_mds_reconnect(struct ceph_mds_client *mdsc, 2510 static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2507 struct ceph_mds_session *session) 2511 struct ceph_mds_session *session)
2508 { 2512 {
2509 struct ceph_msg *reply; 2513 struct ceph_msg *reply;
2510 struct rb_node *p; 2514 struct rb_node *p;
2511 int mds = session->s_mds; 2515 int mds = session->s_mds;
2512 int err = -ENOMEM; 2516 int err = -ENOMEM;
2513 struct ceph_pagelist *pagelist; 2517 struct ceph_pagelist *pagelist;
2514 struct ceph_reconnect_state recon_state; 2518 struct ceph_reconnect_state recon_state;
2515 2519
2516 pr_info("mds%d reconnect start\n", mds); 2520 pr_info("mds%d reconnect start\n", mds);
2517 2521
2518 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); 2522 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2519 if (!pagelist) 2523 if (!pagelist)
2520 goto fail_nopagelist; 2524 goto fail_nopagelist;
2521 ceph_pagelist_init(pagelist); 2525 ceph_pagelist_init(pagelist);
2522 2526
2523 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false); 2527 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false);
2524 if (!reply) 2528 if (!reply)
2525 goto fail_nomsg; 2529 goto fail_nomsg;
2526 2530
2527 mutex_lock(&session->s_mutex); 2531 mutex_lock(&session->s_mutex);
2528 session->s_state = CEPH_MDS_SESSION_RECONNECTING; 2532 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2529 session->s_seq = 0; 2533 session->s_seq = 0;
2530 2534
2531 ceph_con_open(&session->s_con, 2535 ceph_con_open(&session->s_con,
2532 ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); 2536 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2533 2537
2534 /* replay unsafe requests */ 2538 /* replay unsafe requests */
2535 replay_unsafe_requests(mdsc, session); 2539 replay_unsafe_requests(mdsc, session);
2536 2540
2537 down_read(&mdsc->snap_rwsem); 2541 down_read(&mdsc->snap_rwsem);
2538 2542
2539 dout("session %p state %s\n", session, 2543 dout("session %p state %s\n", session,
2540 session_state_name(session->s_state)); 2544 session_state_name(session->s_state));
2541 2545
2542 /* drop old cap expires; we're about to reestablish that state */ 2546 /* drop old cap expires; we're about to reestablish that state */
2543 discard_cap_releases(mdsc, session); 2547 discard_cap_releases(mdsc, session);
2544 2548
2545 /* traverse this session's caps */ 2549 /* traverse this session's caps */
2546 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); 2550 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2547 if (err) 2551 if (err)
2548 goto fail; 2552 goto fail;
2549 2553
2550 recon_state.pagelist = pagelist; 2554 recon_state.pagelist = pagelist;
2551 recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK; 2555 recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
2552 err = iterate_session_caps(session, encode_caps_cb, &recon_state); 2556 err = iterate_session_caps(session, encode_caps_cb, &recon_state);
2553 if (err < 0) 2557 if (err < 0)
2554 goto fail; 2558 goto fail;
2555 2559
2556 /* 2560 /*
2557 * snaprealms. we provide mds with the ino, seq (version), and 2561 * snaprealms. we provide mds with the ino, seq (version), and
2558 * parent for all of our realms. If the mds has any newer info, 2562 * parent for all of our realms. If the mds has any newer info,
2559 * it will tell us. 2563 * it will tell us.
2560 */ 2564 */
2561 for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) { 2565 for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
2562 struct ceph_snap_realm *realm = 2566 struct ceph_snap_realm *realm =
2563 rb_entry(p, struct ceph_snap_realm, node); 2567 rb_entry(p, struct ceph_snap_realm, node);
2564 struct ceph_mds_snaprealm_reconnect sr_rec; 2568 struct ceph_mds_snaprealm_reconnect sr_rec;
2565 2569
2566 dout(" adding snap realm %llx seq %lld parent %llx\n", 2570 dout(" adding snap realm %llx seq %lld parent %llx\n",
2567 realm->ino, realm->seq, realm->parent_ino); 2571 realm->ino, realm->seq, realm->parent_ino);
2568 sr_rec.ino = cpu_to_le64(realm->ino); 2572 sr_rec.ino = cpu_to_le64(realm->ino);
2569 sr_rec.seq = cpu_to_le64(realm->seq); 2573 sr_rec.seq = cpu_to_le64(realm->seq);
2570 sr_rec.parent = cpu_to_le64(realm->parent_ino); 2574 sr_rec.parent = cpu_to_le64(realm->parent_ino);
2571 err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec)); 2575 err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
2572 if (err) 2576 if (err)
2573 goto fail; 2577 goto fail;
2574 } 2578 }
2575 2579
2576 reply->pagelist = pagelist; 2580 reply->pagelist = pagelist;
2577 if (recon_state.flock) 2581 if (recon_state.flock)
2578 reply->hdr.version = cpu_to_le16(2); 2582 reply->hdr.version = cpu_to_le16(2);
2579 reply->hdr.data_len = cpu_to_le32(pagelist->length); 2583 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2580 reply->nr_pages = calc_pages_for(0, pagelist->length); 2584 reply->nr_pages = calc_pages_for(0, pagelist->length);
2581 ceph_con_send(&session->s_con, reply); 2585 ceph_con_send(&session->s_con, reply);
2582 2586
2583 mutex_unlock(&session->s_mutex); 2587 mutex_unlock(&session->s_mutex);
2584 2588
2585 mutex_lock(&mdsc->mutex); 2589 mutex_lock(&mdsc->mutex);
2586 __wake_requests(mdsc, &session->s_waiting); 2590 __wake_requests(mdsc, &session->s_waiting);
2587 mutex_unlock(&mdsc->mutex); 2591 mutex_unlock(&mdsc->mutex);
2588 2592
2589 up_read(&mdsc->snap_rwsem); 2593 up_read(&mdsc->snap_rwsem);
2590 return; 2594 return;
2591 2595
2592 fail: 2596 fail:
2593 ceph_msg_put(reply); 2597 ceph_msg_put(reply);
2594 up_read(&mdsc->snap_rwsem); 2598 up_read(&mdsc->snap_rwsem);
2595 mutex_unlock(&session->s_mutex); 2599 mutex_unlock(&session->s_mutex);
2596 fail_nomsg: 2600 fail_nomsg:
2597 ceph_pagelist_release(pagelist); 2601 ceph_pagelist_release(pagelist);
2598 kfree(pagelist); 2602 kfree(pagelist);
2599 fail_nopagelist: 2603 fail_nopagelist:
2600 pr_err("error %d preparing reconnect for mds%d\n", err, mds); 2604 pr_err("error %d preparing reconnect for mds%d\n", err, mds);
2601 return; 2605 return;
2602 } 2606 }
2603 2607
2604 2608
2605 /* 2609 /*
2606 * compare old and new mdsmaps, kicking requests 2610 * compare old and new mdsmaps, kicking requests
2607 * and closing out old connections as necessary 2611 * and closing out old connections as necessary
2608 * 2612 *
2609 * called under mdsc->mutex. 2613 * called under mdsc->mutex.
2610 */ 2614 */
2611 static void check_new_map(struct ceph_mds_client *mdsc, 2615 static void check_new_map(struct ceph_mds_client *mdsc,
2612 struct ceph_mdsmap *newmap, 2616 struct ceph_mdsmap *newmap,
2613 struct ceph_mdsmap *oldmap) 2617 struct ceph_mdsmap *oldmap)
2614 { 2618 {
2615 int i; 2619 int i;
2616 int oldstate, newstate; 2620 int oldstate, newstate;
2617 struct ceph_mds_session *s; 2621 struct ceph_mds_session *s;
2618 2622
2619 dout("check_new_map new %u old %u\n", 2623 dout("check_new_map new %u old %u\n",
2620 newmap->m_epoch, oldmap->m_epoch); 2624 newmap->m_epoch, oldmap->m_epoch);
2621 2625
2622 for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) { 2626 for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
2623 if (mdsc->sessions[i] == NULL) 2627 if (mdsc->sessions[i] == NULL)
2624 continue; 2628 continue;
2625 s = mdsc->sessions[i]; 2629 s = mdsc->sessions[i];
2626 oldstate = ceph_mdsmap_get_state(oldmap, i); 2630 oldstate = ceph_mdsmap_get_state(oldmap, i);
2627 newstate = ceph_mdsmap_get_state(newmap, i); 2631 newstate = ceph_mdsmap_get_state(newmap, i);
2628 2632
2629 dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n", 2633 dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
2630 i, ceph_mds_state_name(oldstate), 2634 i, ceph_mds_state_name(oldstate),
2631 ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "", 2635 ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
2632 ceph_mds_state_name(newstate), 2636 ceph_mds_state_name(newstate),
2633 ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", 2637 ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
2634 session_state_name(s->s_state)); 2638 session_state_name(s->s_state));
2635 2639
2636 if (memcmp(ceph_mdsmap_get_addr(oldmap, i), 2640 if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
2637 ceph_mdsmap_get_addr(newmap, i), 2641 ceph_mdsmap_get_addr(newmap, i),
2638 sizeof(struct ceph_entity_addr))) { 2642 sizeof(struct ceph_entity_addr))) {
2639 if (s->s_state == CEPH_MDS_SESSION_OPENING) { 2643 if (s->s_state == CEPH_MDS_SESSION_OPENING) {
2640 /* the session never opened, just close it 2644 /* the session never opened, just close it
2641 * out now */ 2645 * out now */
2642 __wake_requests(mdsc, &s->s_waiting); 2646 __wake_requests(mdsc, &s->s_waiting);
2643 __unregister_session(mdsc, s); 2647 __unregister_session(mdsc, s);
2644 } else { 2648 } else {
2645 /* just close it */ 2649 /* just close it */
2646 mutex_unlock(&mdsc->mutex); 2650 mutex_unlock(&mdsc->mutex);
2647 mutex_lock(&s->s_mutex); 2651 mutex_lock(&s->s_mutex);
2648 mutex_lock(&mdsc->mutex); 2652 mutex_lock(&mdsc->mutex);
2649 ceph_con_close(&s->s_con); 2653 ceph_con_close(&s->s_con);
2650 mutex_unlock(&s->s_mutex); 2654 mutex_unlock(&s->s_mutex);
2651 s->s_state = CEPH_MDS_SESSION_RESTARTING; 2655 s->s_state = CEPH_MDS_SESSION_RESTARTING;
2652 } 2656 }
2653 2657
2654 /* kick any requests waiting on the recovering mds */ 2658 /* kick any requests waiting on the recovering mds */
2655 kick_requests(mdsc, i); 2659 kick_requests(mdsc, i);
2656 } else if (oldstate == newstate) { 2660 } else if (oldstate == newstate) {
2657 continue; /* nothing new with this mds */ 2661 continue; /* nothing new with this mds */
2658 } 2662 }
2659 2663
2660 /* 2664 /*
2661 * send reconnect? 2665 * send reconnect?
2662 */ 2666 */
2663 if (s->s_state == CEPH_MDS_SESSION_RESTARTING && 2667 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2664 newstate >= CEPH_MDS_STATE_RECONNECT) { 2668 newstate >= CEPH_MDS_STATE_RECONNECT) {
2665 mutex_unlock(&mdsc->mutex); 2669 mutex_unlock(&mdsc->mutex);
2666 send_mds_reconnect(mdsc, s); 2670 send_mds_reconnect(mdsc, s);
2667 mutex_lock(&mdsc->mutex); 2671 mutex_lock(&mdsc->mutex);
2668 } 2672 }
2669 2673
2670 /* 2674 /*
2671 * kick request on any mds that has gone active. 2675 * kick request on any mds that has gone active.
2672 */ 2676 */
2673 if (oldstate < CEPH_MDS_STATE_ACTIVE && 2677 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2674 newstate >= CEPH_MDS_STATE_ACTIVE) { 2678 newstate >= CEPH_MDS_STATE_ACTIVE) {
2675 if (oldstate != CEPH_MDS_STATE_CREATING && 2679 if (oldstate != CEPH_MDS_STATE_CREATING &&
2676 oldstate != CEPH_MDS_STATE_STARTING) 2680 oldstate != CEPH_MDS_STATE_STARTING)
2677 pr_info("mds%d recovery completed\n", s->s_mds); 2681 pr_info("mds%d recovery completed\n", s->s_mds);
2678 kick_requests(mdsc, i); 2682 kick_requests(mdsc, i);
2679 ceph_kick_flushing_caps(mdsc, s); 2683 ceph_kick_flushing_caps(mdsc, s);
2680 wake_up_session_caps(s, 1); 2684 wake_up_session_caps(s, 1);
2681 } 2685 }
2682 } 2686 }
2683 2687
2684 for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) { 2688 for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) {
2685 s = mdsc->sessions[i]; 2689 s = mdsc->sessions[i];
2686 if (!s) 2690 if (!s)
2687 continue; 2691 continue;
2688 if (!ceph_mdsmap_is_laggy(newmap, i)) 2692 if (!ceph_mdsmap_is_laggy(newmap, i))
2689 continue; 2693 continue;
2690 if (s->s_state == CEPH_MDS_SESSION_OPEN || 2694 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2691 s->s_state == CEPH_MDS_SESSION_HUNG || 2695 s->s_state == CEPH_MDS_SESSION_HUNG ||
2692 s->s_state == CEPH_MDS_SESSION_CLOSING) { 2696 s->s_state == CEPH_MDS_SESSION_CLOSING) {
2693 dout(" connecting to export targets of laggy mds%d\n", 2697 dout(" connecting to export targets of laggy mds%d\n",
2694 i); 2698 i);
2695 __open_export_target_sessions(mdsc, s); 2699 __open_export_target_sessions(mdsc, s);
2696 } 2700 }
2697 } 2701 }
2698 } 2702 }
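
Stripped of the session plumbing, check_new_map() applies two transition rules per MDS rank: a restarting session reconnects once the rank reaches the reconnect state, and waiting requests are kicked when a rank goes active (with a "recovery completed" note unless the rank is brand new). A compact sketch of just those rules; the enums are invented stand-ins with the same ordering assumptions as the kernel's state values:

```c
#include <stdio.h>

enum mds_state  { STATE_CREATING, STATE_STARTING, STATE_RECONNECT, STATE_ACTIVE };
enum sess_state { SESS_OPEN, SESS_RESTARTING };

static void check_rank(enum sess_state sess, int oldstate, int newstate)
{
    if (sess == SESS_RESTARTING && newstate >= STATE_RECONNECT)
        printf("send reconnect\n");

    if (oldstate < STATE_ACTIVE && newstate >= STATE_ACTIVE) {
        if (oldstate != STATE_CREATING && oldstate != STATE_STARTING)
            printf("recovery completed\n");
        printf("kick requests\n");    /* plus flushing caps, wakeups */
    }
}

int main(void)
{
    /* a rank we lost came back: reconnect, then kick when it goes active */
    check_rank(SESS_RESTARTING, STATE_RECONNECT, STATE_ACTIVE);
    return 0;
}
```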
2699 2703
2700 2704
2701 2705
2702 /* 2706 /*
2703 * leases 2707 * leases
2704 */ 2708 */
2705 2709
2706 /* 2710 /*
2707 * caller must hold session s_mutex, dentry->d_lock 2711 * caller must hold session s_mutex, dentry->d_lock
2708 */ 2712 */
2709 void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry) 2713 void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
2710 { 2714 {
2711 struct ceph_dentry_info *di = ceph_dentry(dentry); 2715 struct ceph_dentry_info *di = ceph_dentry(dentry);
2712 2716
2713 ceph_put_mds_session(di->lease_session); 2717 ceph_put_mds_session(di->lease_session);
2714 di->lease_session = NULL; 2718 di->lease_session = NULL;
2715 } 2719 }
2716 2720
2717 static void handle_lease(struct ceph_mds_client *mdsc, 2721 static void handle_lease(struct ceph_mds_client *mdsc,
2718 struct ceph_mds_session *session, 2722 struct ceph_mds_session *session,
2719 struct ceph_msg *msg) 2723 struct ceph_msg *msg)
2720 { 2724 {
2721 struct super_block *sb = mdsc->fsc->sb; 2725 struct super_block *sb = mdsc->fsc->sb;
2722 struct inode *inode; 2726 struct inode *inode;
2723 struct dentry *parent, *dentry; 2727 struct dentry *parent, *dentry;
2724 struct ceph_dentry_info *di; 2728 struct ceph_dentry_info *di;
2725 int mds = session->s_mds; 2729 int mds = session->s_mds;
2726 struct ceph_mds_lease *h = msg->front.iov_base; 2730 struct ceph_mds_lease *h = msg->front.iov_base;
2727 u32 seq; 2731 u32 seq;
2728 struct ceph_vino vino; 2732 struct ceph_vino vino;
2729 struct qstr dname; 2733 struct qstr dname;
2730 int release = 0; 2734 int release = 0;
2731 2735
2732 dout("handle_lease from mds%d\n", mds); 2736 dout("handle_lease from mds%d\n", mds);
2733 2737
2734 /* decode */ 2738 /* decode */
2735 if (msg->front.iov_len < sizeof(*h) + sizeof(u32)) 2739 if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
2736 goto bad; 2740 goto bad;
2737 vino.ino = le64_to_cpu(h->ino); 2741 vino.ino = le64_to_cpu(h->ino);
2738 vino.snap = CEPH_NOSNAP; 2742 vino.snap = CEPH_NOSNAP;
2739 seq = le32_to_cpu(h->seq); 2743 seq = le32_to_cpu(h->seq);
2740 dname.name = (void *)h + sizeof(*h) + sizeof(u32); 2744 dname.name = (void *)h + sizeof(*h) + sizeof(u32);
2741 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32); 2745 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
2742 if (dname.len != get_unaligned_le32(h+1)) 2746 if (dname.len != get_unaligned_le32(h+1))
2743 goto bad; 2747 goto bad;
2744 2748
2745 mutex_lock(&session->s_mutex); 2749 mutex_lock(&session->s_mutex);
2746 session->s_seq++; 2750 session->s_seq++;
2747 2751
2748 /* lookup inode */ 2752 /* lookup inode */
2749 inode = ceph_find_inode(sb, vino); 2753 inode = ceph_find_inode(sb, vino);
2750 dout("handle_lease %s, ino %llx %p %.*s\n", 2754 dout("handle_lease %s, ino %llx %p %.*s\n",
2751 ceph_lease_op_name(h->action), vino.ino, inode, 2755 ceph_lease_op_name(h->action), vino.ino, inode,
2752 dname.len, dname.name); 2756 dname.len, dname.name);
2753 if (inode == NULL) { 2757 if (inode == NULL) {
2754 dout("handle_lease no inode %llx\n", vino.ino); 2758 dout("handle_lease no inode %llx\n", vino.ino);
2755 goto release; 2759 goto release;
2756 } 2760 }
2757 2761
2758 /* dentry */ 2762 /* dentry */
2759 parent = d_find_alias(inode); 2763 parent = d_find_alias(inode);
2760 if (!parent) { 2764 if (!parent) {
2761 dout("no parent dentry on inode %p\n", inode); 2765 dout("no parent dentry on inode %p\n", inode);
2762 WARN_ON(1); 2766 WARN_ON(1);
2763 goto release; /* hrm... */ 2767 goto release; /* hrm... */
2764 } 2768 }
2765 dname.hash = full_name_hash(dname.name, dname.len); 2769 dname.hash = full_name_hash(dname.name, dname.len);
2766 dentry = d_lookup(parent, &dname); 2770 dentry = d_lookup(parent, &dname);
2767 dput(parent); 2771 dput(parent);
2768 if (!dentry) 2772 if (!dentry)
2769 goto release; 2773 goto release;
2770 2774
2771 spin_lock(&dentry->d_lock); 2775 spin_lock(&dentry->d_lock);
2772 di = ceph_dentry(dentry); 2776 di = ceph_dentry(dentry);
2773 switch (h->action) { 2777 switch (h->action) {
2774 case CEPH_MDS_LEASE_REVOKE: 2778 case CEPH_MDS_LEASE_REVOKE:
2775 if (di->lease_session == session) { 2779 if (di->lease_session == session) {
2776 if (ceph_seq_cmp(di->lease_seq, seq) > 0) 2780 if (ceph_seq_cmp(di->lease_seq, seq) > 0)
2777 h->seq = cpu_to_le32(di->lease_seq); 2781 h->seq = cpu_to_le32(di->lease_seq);
2778 __ceph_mdsc_drop_dentry_lease(dentry); 2782 __ceph_mdsc_drop_dentry_lease(dentry);
2779 } 2783 }
2780 release = 1; 2784 release = 1;
2781 break; 2785 break;
2782 2786
2783 case CEPH_MDS_LEASE_RENEW: 2787 case CEPH_MDS_LEASE_RENEW:
2784 if (di->lease_session == session && 2788 if (di->lease_session == session &&
2785 di->lease_gen == session->s_cap_gen && 2789 di->lease_gen == session->s_cap_gen &&
2786 di->lease_renew_from && 2790 di->lease_renew_from &&
2787 di->lease_renew_after == 0) { 2791 di->lease_renew_after == 0) {
2788 unsigned long duration = 2792 unsigned long duration =
2789 le32_to_cpu(h->duration_ms) * HZ / 1000; 2793 le32_to_cpu(h->duration_ms) * HZ / 1000;
2790 2794
2791 di->lease_seq = seq; 2795 di->lease_seq = seq;
2792 dentry->d_time = di->lease_renew_from + duration; 2796 dentry->d_time = di->lease_renew_from + duration;
2793 di->lease_renew_after = di->lease_renew_from + 2797 di->lease_renew_after = di->lease_renew_from +
2794 (duration >> 1); 2798 (duration >> 1);
2795 di->lease_renew_from = 0; 2799 di->lease_renew_from = 0;
2796 } 2800 }
2797 break; 2801 break;
2798 } 2802 }
2799 spin_unlock(&dentry->d_lock); 2803 spin_unlock(&dentry->d_lock);
2800 dput(dentry); 2804 dput(dentry);
2801 2805
2802 if (!release) 2806 if (!release)
2803 goto out; 2807 goto out;
2804 2808
2805 release: 2809 release:
2806 /* let's just reuse the same message */ 2810 /* let's just reuse the same message */
2807 h->action = CEPH_MDS_LEASE_REVOKE_ACK; 2811 h->action = CEPH_MDS_LEASE_REVOKE_ACK;
2808 ceph_msg_get(msg); 2812 ceph_msg_get(msg);
2809 ceph_con_send(&session->s_con, msg); 2813 ceph_con_send(&session->s_con, msg);
2810 2814
2811 out: 2815 out:
2812 iput(inode); 2816 iput(inode);
2813 mutex_unlock(&session->s_mutex); 2817 mutex_unlock(&session->s_mutex);
2814 return; 2818 return;
2815 2819
2816 bad: 2820 bad:
2817 pr_err("corrupt lease message\n"); 2821 pr_err("corrupt lease message\n");
2818 ceph_msg_dump(msg); 2822 ceph_msg_dump(msg);
2819 } 2823 }
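
The REVOKE branch above compares lease sequence numbers with ceph_seq_cmp() so that a seq which wrapped around 2^32 still orders correctly. Reconstructed from its use here (so treat the exact kernel definition as an assumption), the helper is the classic serial-number comparison: take the unsigned difference and reinterpret it as signed, which is correct while the two values stay within 2^31 of each other.

```c
#include <stdint.h>
#include <stdio.h>

/* >0: a is newer, <0: b is newer, 0: equal. */
static int seq_cmp(uint32_t a, uint32_t b)
{
    return (int32_t)(a - b);
}

int main(void)
{
    printf("%d\n", seq_cmp(5, 3) > 0);           /* 1: plainly newer */
    printf("%d\n", seq_cmp(2, UINT32_MAX) > 0);  /* 1: newer across the wrap */
    return 0;
}
```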
2820 2824
2821 void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, 2825 void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2822 struct inode *inode, 2826 struct inode *inode,
2823 struct dentry *dentry, char action, 2827 struct dentry *dentry, char action,
2824 u32 seq) 2828 u32 seq)
2825 { 2829 {
2826 struct ceph_msg *msg; 2830 struct ceph_msg *msg;
2827 struct ceph_mds_lease *lease; 2831 struct ceph_mds_lease *lease;
2828 int len = sizeof(*lease) + sizeof(u32); 2832 int len = sizeof(*lease) + sizeof(u32);
2829 int dnamelen = 0; 2833 int dnamelen = 0;
2830 2834
2831 dout("lease_send_msg inode %p dentry %p %s to mds%d\n", 2835 dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
2832 inode, dentry, ceph_lease_op_name(action), session->s_mds); 2836 inode, dentry, ceph_lease_op_name(action), session->s_mds);
2833 dnamelen = dentry->d_name.len; 2837 dnamelen = dentry->d_name.len;
2834 len += dnamelen; 2838 len += dnamelen;
2835 2839
2836 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false); 2840 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
2837 if (!msg) 2841 if (!msg)
2838 return; 2842 return;
2839 lease = msg->front.iov_base; 2843 lease = msg->front.iov_base;
2840 lease->action = action; 2844 lease->action = action;
2841 lease->ino = cpu_to_le64(ceph_vino(inode).ino); 2845 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2842 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); 2846 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2843 lease->seq = cpu_to_le32(seq); 2847 lease->seq = cpu_to_le32(seq);
2844 put_unaligned_le32(dnamelen, lease + 1); 2848 put_unaligned_le32(dnamelen, lease + 1);
2845 memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen); 2849 memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
2846 2850
2847 /* 2851 /*
2848 * if this is a preemptive lease RELEASE, no need to 2852 * if this is a preemptive lease RELEASE, no need to
2849 * flush request stream, since the actual request will 2853 * flush request stream, since the actual request will
2850 * soon follow. 2854 * soon follow.
2851 */ 2855 */
2852 msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE); 2856 msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
2853 2857
2854 ceph_con_send(&session->s_con, msg); 2858 ceph_con_send(&session->s_con, msg);
2855 } 2859 }
2856 2860
2857 /* 2861 /*
2858 * Preemptively release a lease we expect to invalidate anyway. 2862 * Preemptively release a lease we expect to invalidate anyway.
2859 * Pass @inode always, @dentry is optional. 2863 * Pass @inode always, @dentry is optional.
2860 */ 2864 */
2861 void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, 2865 void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2862 struct dentry *dentry) 2866 struct dentry *dentry)
2863 { 2867 {
2864 struct ceph_dentry_info *di; 2868 struct ceph_dentry_info *di;
2865 struct ceph_mds_session *session; 2869 struct ceph_mds_session *session;
2866 u32 seq; 2870 u32 seq;
2867 2871
2868 BUG_ON(inode == NULL); 2872 BUG_ON(inode == NULL);
2869 BUG_ON(dentry == NULL); 2873 BUG_ON(dentry == NULL);
2870 2874
2871 /* is dentry lease valid? */ 2875 /* is dentry lease valid? */
2872 spin_lock(&dentry->d_lock); 2876 spin_lock(&dentry->d_lock);
2873 di = ceph_dentry(dentry); 2877 di = ceph_dentry(dentry);
2874 if (!di || !di->lease_session || 2878 if (!di || !di->lease_session ||
2875 di->lease_session->s_mds < 0 || 2879 di->lease_session->s_mds < 0 ||
2876 di->lease_gen != di->lease_session->s_cap_gen || 2880 di->lease_gen != di->lease_session->s_cap_gen ||
2877 !time_before(jiffies, dentry->d_time)) { 2881 !time_before(jiffies, dentry->d_time)) {
2878 dout("lease_release inode %p dentry %p -- " 2882 dout("lease_release inode %p dentry %p -- "
2879 "no lease\n", 2883 "no lease\n",
2880 inode, dentry); 2884 inode, dentry);
2881 spin_unlock(&dentry->d_lock); 2885 spin_unlock(&dentry->d_lock);
2882 return; 2886 return;
2883 } 2887 }
2884 2888
2885 /* we do have a lease on this dentry; note mds and seq */ 2889 /* we do have a lease on this dentry; note mds and seq */
2886 session = ceph_get_mds_session(di->lease_session); 2890 session = ceph_get_mds_session(di->lease_session);
2887 seq = di->lease_seq; 2891 seq = di->lease_seq;
2888 __ceph_mdsc_drop_dentry_lease(dentry); 2892 __ceph_mdsc_drop_dentry_lease(dentry);
2889 spin_unlock(&dentry->d_lock); 2893 spin_unlock(&dentry->d_lock);
2890 2894
2891 dout("lease_release inode %p dentry %p to mds%d\n", 2895 dout("lease_release inode %p dentry %p to mds%d\n",
2892 inode, dentry, session->s_mds); 2896 inode, dentry, session->s_mds);
2893 ceph_mdsc_lease_send_msg(session, inode, dentry, 2897 ceph_mdsc_lease_send_msg(session, inode, dentry,
2894 CEPH_MDS_LEASE_RELEASE, seq); 2898 CEPH_MDS_LEASE_RELEASE, seq);
2895 ceph_put_mds_session(session); 2899 ceph_put_mds_session(session);
2896 } 2900 }
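
ceph_mdsc_lease_release() is a clean example of the snapshot-then-unlock discipline: the lease is validated and torn down under d_lock, but the session reference and seq are copied out first so the potentially blocking message send runs with no locks held. A toy version of the same shape, with invented types:

```c
#include <pthread.h>
#include <stdio.h>

struct lease {
    pthread_mutex_t lock;   /* stands in for dentry->d_lock */
    int session;            /* -1 = no lease; the kernel holds a refcounted ptr */
    unsigned seq;
};

static void send_release(int session, unsigned seq)
{
    printf("release seq %u to mds%d\n", seq, session);  /* may block */
}

static void lease_release(struct lease *l)
{
    pthread_mutex_lock(&l->lock);
    if (l->session < 0) {           /* no valid lease: nothing to do */
        pthread_mutex_unlock(&l->lock);
        return;
    }
    int session = l->session;       /* snapshot what the send needs */
    unsigned seq = l->seq;
    l->session = -1;                /* drop the lease while still locked */
    pthread_mutex_unlock(&l->lock);

    send_release(session, seq);     /* blocking work outside the lock */
}

int main(void)
{
    struct lease l = { PTHREAD_MUTEX_INITIALIZER, 0, 7 };
    lease_release(&l);
    return 0;
}
```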
2897 2901
2898 /* 2902 /*
2899 * drop all leases (and dentry refs) in preparation for umount 2903 * drop all leases (and dentry refs) in preparation for umount
2900 */ 2904 */
2901 static void drop_leases(struct ceph_mds_client *mdsc) 2905 static void drop_leases(struct ceph_mds_client *mdsc)
2902 { 2906 {
2903 int i; 2907 int i;
2904 2908
2905 dout("drop_leases\n"); 2909 dout("drop_leases\n");
2906 mutex_lock(&mdsc->mutex); 2910 mutex_lock(&mdsc->mutex);
2907 for (i = 0; i < mdsc->max_sessions; i++) { 2911 for (i = 0; i < mdsc->max_sessions; i++) {
2908 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); 2912 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2909 if (!s) 2913 if (!s)
2910 continue; 2914 continue;
2911 mutex_unlock(&mdsc->mutex); 2915 mutex_unlock(&mdsc->mutex);
2912 mutex_lock(&s->s_mutex); 2916 mutex_lock(&s->s_mutex);
2913 mutex_unlock(&s->s_mutex); 2917 mutex_unlock(&s->s_mutex);
2914 ceph_put_mds_session(s); 2918 ceph_put_mds_session(s);
2915 mutex_lock(&mdsc->mutex); 2919 mutex_lock(&mdsc->mutex);
2916 } 2920 }
2917 mutex_unlock(&mdsc->mutex); 2921 mutex_unlock(&mdsc->mutex);
2918 } 2922 }
2919 2923
2920 2924
2921 2925
2922 /* 2926 /*
2923 * delayed work -- periodically trim expired leases, renew caps with mds 2927 * delayed work -- periodically trim expired leases, renew caps with mds
2924 */ 2928 */
2925 static void schedule_delayed(struct ceph_mds_client *mdsc) 2929 static void schedule_delayed(struct ceph_mds_client *mdsc)
2926 { 2930 {
2927 int delay = 5; 2931 int delay = 5;
2928 unsigned hz = round_jiffies_relative(HZ * delay); 2932 unsigned hz = round_jiffies_relative(HZ * delay);
2929 schedule_delayed_work(&mdsc->delayed_work, hz); 2933 schedule_delayed_work(&mdsc->delayed_work, hz);
2930 } 2934 }
2931 2935
2932 static void delayed_work(struct work_struct *work) 2936 static void delayed_work(struct work_struct *work)
2933 { 2937 {
2934 int i; 2938 int i;
2935 struct ceph_mds_client *mdsc = 2939 struct ceph_mds_client *mdsc =
2936 container_of(work, struct ceph_mds_client, delayed_work.work); 2940 container_of(work, struct ceph_mds_client, delayed_work.work);
2937 int renew_interval; 2941 int renew_interval;
2938 int renew_caps; 2942 int renew_caps;
2939 2943
2940 dout("mdsc delayed_work\n"); 2944 dout("mdsc delayed_work\n");
2941 ceph_check_delayed_caps(mdsc); 2945 ceph_check_delayed_caps(mdsc);
2942 2946
2943 mutex_lock(&mdsc->mutex); 2947 mutex_lock(&mdsc->mutex);
2944 renew_interval = mdsc->mdsmap->m_session_timeout >> 2; 2948 renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
2945 renew_caps = time_after_eq(jiffies, HZ*renew_interval + 2949 renew_caps = time_after_eq(jiffies, HZ*renew_interval +
2946 mdsc->last_renew_caps); 2950 mdsc->last_renew_caps);
2947 if (renew_caps) 2951 if (renew_caps)
2948 mdsc->last_renew_caps = jiffies; 2952 mdsc->last_renew_caps = jiffies;
2949 2953
2950 for (i = 0; i < mdsc->max_sessions; i++) { 2954 for (i = 0; i < mdsc->max_sessions; i++) {
2951 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); 2955 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2952 if (s == NULL) 2956 if (s == NULL)
2953 continue; 2957 continue;
2954 if (s->s_state == CEPH_MDS_SESSION_CLOSING) { 2958 if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
2955 dout("resending session close request for mds%d\n", 2959 dout("resending session close request for mds%d\n",
2956 s->s_mds); 2960 s->s_mds);
2957 request_close_session(mdsc, s); 2961 request_close_session(mdsc, s);
2958 ceph_put_mds_session(s); 2962 ceph_put_mds_session(s);
2959 continue; 2963 continue;
2960 } 2964 }
2961 if (s->s_ttl && time_after(jiffies, s->s_ttl)) { 2965 if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
2962 if (s->s_state == CEPH_MDS_SESSION_OPEN) { 2966 if (s->s_state == CEPH_MDS_SESSION_OPEN) {
2963 s->s_state = CEPH_MDS_SESSION_HUNG; 2967 s->s_state = CEPH_MDS_SESSION_HUNG;
2964 pr_info("mds%d hung\n", s->s_mds); 2968 pr_info("mds%d hung\n", s->s_mds);
2965 } 2969 }
2966 } 2970 }
2967 if (s->s_state < CEPH_MDS_SESSION_OPEN) { 2971 if (s->s_state < CEPH_MDS_SESSION_OPEN) {
2968 /* this mds is failed or recovering, just wait */ 2972 /* this mds is failed or recovering, just wait */
2969 ceph_put_mds_session(s); 2973 ceph_put_mds_session(s);
2970 continue; 2974 continue;
2971 } 2975 }
2972 mutex_unlock(&mdsc->mutex); 2976 mutex_unlock(&mdsc->mutex);
2973 2977
2974 mutex_lock(&s->s_mutex); 2978 mutex_lock(&s->s_mutex);
2975 if (renew_caps) 2979 if (renew_caps)
2976 send_renew_caps(mdsc, s); 2980 send_renew_caps(mdsc, s);
2977 else 2981 else
2978 ceph_con_keepalive(&s->s_con); 2982 ceph_con_keepalive(&s->s_con);
2979 ceph_add_cap_releases(mdsc, s); 2983 ceph_add_cap_releases(mdsc, s);
2980 if (s->s_state == CEPH_MDS_SESSION_OPEN || 2984 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2981 s->s_state == CEPH_MDS_SESSION_HUNG) 2985 s->s_state == CEPH_MDS_SESSION_HUNG)
2982 ceph_send_cap_releases(mdsc, s); 2986 ceph_send_cap_releases(mdsc, s);
2983 mutex_unlock(&s->s_mutex); 2987 mutex_unlock(&s->s_mutex);
2984 ceph_put_mds_session(s); 2988 ceph_put_mds_session(s);
2985 2989
2986 mutex_lock(&mdsc->mutex); 2990 mutex_lock(&mdsc->mutex);
2987 } 2991 }
2988 mutex_unlock(&mdsc->mutex); 2992 mutex_unlock(&mdsc->mutex);
2989 2993
2990 schedule_delayed(mdsc); 2994 schedule_delayed(mdsc);
2991 } 2995 }
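
Note that delayed_work() re-arms itself via schedule_delayed() as its last act, so a slow housekeeping pass simply pushes the next one out instead of letting runs overlap. The same shape in a trivial userspace loop (the delay is shortened here; the kernel rounds a ~5 s delay to a jiffy boundary with round_jiffies_relative()):

```c
#include <stdio.h>
#include <unistd.h>

#define DELAY_SECS 1   /* demo value; the code above uses about 5 s */

static void do_housekeeping(int round)
{
    printf("round %d: check delayed caps, renew or keepalive sessions\n",
           round);
}

int main(void)
{
    for (int round = 0; round < 3; round++) {
        do_housekeeping(round);
        sleep(DELAY_SECS);   /* stands in for schedule_delayed_work():
                              * the next round is armed only after this
                              * one finishes, so runs never overlap */
    }
    return 0;
}
```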
2992 2996
2993 int ceph_mdsc_init(struct ceph_fs_client *fsc) 2997 int ceph_mdsc_init(struct ceph_fs_client *fsc)
2994 2998
2995 { 2999 {
2996 struct ceph_mds_client *mdsc; 3000 struct ceph_mds_client *mdsc;
2997 3001
2998 mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS); 3002 mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
2999 if (!mdsc) 3003 if (!mdsc)
3000 return -ENOMEM; 3004 return -ENOMEM;
3001 mdsc->fsc = fsc; 3005 mdsc->fsc = fsc;
3002 fsc->mdsc = mdsc; 3006 fsc->mdsc = mdsc;
3003 mutex_init(&mdsc->mutex); 3007 mutex_init(&mdsc->mutex);
3004 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 3008 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
3005 if (mdsc->mdsmap == NULL) 3009 if (mdsc->mdsmap == NULL)
3006 return -ENOMEM; 3010 return -ENOMEM;
3007 3011
3008 init_completion(&mdsc->safe_umount_waiters); 3012 init_completion(&mdsc->safe_umount_waiters);
3009 init_waitqueue_head(&mdsc->session_close_wq); 3013 init_waitqueue_head(&mdsc->session_close_wq);
3010 INIT_LIST_HEAD(&mdsc->waiting_for_map); 3014 INIT_LIST_HEAD(&mdsc->waiting_for_map);
3011 mdsc->sessions = NULL; 3015 mdsc->sessions = NULL;
3012 mdsc->max_sessions = 0; 3016 mdsc->max_sessions = 0;
3013 mdsc->stopping = 0; 3017 mdsc->stopping = 0;
3014 init_rwsem(&mdsc->snap_rwsem); 3018 init_rwsem(&mdsc->snap_rwsem);
3015 mdsc->snap_realms = RB_ROOT; 3019 mdsc->snap_realms = RB_ROOT;
3016 INIT_LIST_HEAD(&mdsc->snap_empty); 3020 INIT_LIST_HEAD(&mdsc->snap_empty);
3017 spin_lock_init(&mdsc->snap_empty_lock); 3021 spin_lock_init(&mdsc->snap_empty_lock);
3018 mdsc->last_tid = 0; 3022 mdsc->last_tid = 0;
3019 mdsc->request_tree = RB_ROOT; 3023 mdsc->request_tree = RB_ROOT;
3020 INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); 3024 INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
3021 mdsc->last_renew_caps = jiffies; 3025 mdsc->last_renew_caps = jiffies;
3022 INIT_LIST_HEAD(&mdsc->cap_delay_list); 3026 INIT_LIST_HEAD(&mdsc->cap_delay_list);
3023 spin_lock_init(&mdsc->cap_delay_lock); 3027 spin_lock_init(&mdsc->cap_delay_lock);
3024 INIT_LIST_HEAD(&mdsc->snap_flush_list); 3028 INIT_LIST_HEAD(&mdsc->snap_flush_list);
3025 spin_lock_init(&mdsc->snap_flush_lock); 3029 spin_lock_init(&mdsc->snap_flush_lock);
3026 mdsc->cap_flush_seq = 0; 3030 mdsc->cap_flush_seq = 0;
3027 INIT_LIST_HEAD(&mdsc->cap_dirty); 3031 INIT_LIST_HEAD(&mdsc->cap_dirty);
3028 INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); 3032 INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
3029 mdsc->num_cap_flushing = 0; 3033 mdsc->num_cap_flushing = 0;
3030 spin_lock_init(&mdsc->cap_dirty_lock); 3034 spin_lock_init(&mdsc->cap_dirty_lock);
3031 init_waitqueue_head(&mdsc->cap_flushing_wq); 3035 init_waitqueue_head(&mdsc->cap_flushing_wq);
3032 spin_lock_init(&mdsc->dentry_lru_lock); 3036 spin_lock_init(&mdsc->dentry_lru_lock);
3033 INIT_LIST_HEAD(&mdsc->dentry_lru); 3037 INIT_LIST_HEAD(&mdsc->dentry_lru);
3034 3038
3035 ceph_caps_init(mdsc); 3039 ceph_caps_init(mdsc);
3036 ceph_adjust_min_caps(mdsc, fsc->min_caps); 3040 ceph_adjust_min_caps(mdsc, fsc->min_caps);
3037 3041
3038 return 0; 3042 return 0;
3039 } 3043 }
3040 3044
3041 /* 3045 /*
3042 * Wait for safe replies on open mds requests. If we time out, drop 3046 * Wait for safe replies on open mds requests. If we time out, drop
3043 * all requests from the tree to avoid dangling dentry refs. 3047 * all requests from the tree to avoid dangling dentry refs.
3044 */ 3048 */
3045 static void wait_requests(struct ceph_mds_client *mdsc) 3049 static void wait_requests(struct ceph_mds_client *mdsc)
3046 { 3050 {
3047 struct ceph_mds_request *req; 3051 struct ceph_mds_request *req;
3048 struct ceph_fs_client *fsc = mdsc->fsc; 3052 struct ceph_fs_client *fsc = mdsc->fsc;
3049 3053
3050 mutex_lock(&mdsc->mutex); 3054 mutex_lock(&mdsc->mutex);
3051 if (__get_oldest_req(mdsc)) { 3055 if (__get_oldest_req(mdsc)) {
3052 mutex_unlock(&mdsc->mutex); 3056 mutex_unlock(&mdsc->mutex);
3053 3057
3054 dout("wait_requests waiting for requests\n"); 3058 dout("wait_requests waiting for requests\n");
3055 wait_for_completion_timeout(&mdsc->safe_umount_waiters, 3059 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
3056 fsc->client->options->mount_timeout * HZ); 3060 fsc->client->options->mount_timeout * HZ);
3057 3061
3058 /* tear down remaining requests */ 3062 /* tear down remaining requests */
3059 mutex_lock(&mdsc->mutex); 3063 mutex_lock(&mdsc->mutex);
3060 while ((req = __get_oldest_req(mdsc))) { 3064 while ((req = __get_oldest_req(mdsc))) {
3061 dout("wait_requests timed out on tid %llu\n", 3065 dout("wait_requests timed out on tid %llu\n",
3062 req->r_tid); 3066 req->r_tid);
3063 __unregister_request(mdsc, req); 3067 __unregister_request(mdsc, req);
3064 } 3068 }
3065 } 3069 }
3066 mutex_unlock(&mdsc->mutex); 3070 mutex_unlock(&mdsc->mutex);
3067 dout("wait_requests done\n"); 3071 dout("wait_requests done\n");
3068 } 3072 }
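
Note on the wait above: wait_for_completion_timeout() returns the jiffies remaining on success and 0 on timeout. wait_requests() deliberately ignores the return value and simply re-walks the request tree under mdsc->mutex, so the teardown loop covers both the timed-out and the already-empty case. A minimal sketch of the return-value form of the same pattern (the 30-second timeout here is illustrative, not from this commit):

	if (!wait_for_completion_timeout(&mdsc->safe_umount_waiters, 30 * HZ))
		pr_warn("timed out waiting for mds requests\n");
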
3069 3073
3070 /* 3074 /*
3071 * called before mount is ro, and before dentries are torn down. 3075 * called before mount is ro, and before dentries are torn down.
3072 * (hmm, does this still race with new lookups?) 3076 * (hmm, does this still race with new lookups?)
3073 */ 3077 */
3074 void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) 3078 void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
3075 { 3079 {
3076 dout("pre_umount\n"); 3080 dout("pre_umount\n");
3077 mdsc->stopping = 1; 3081 mdsc->stopping = 1;
3078 3082
3079 drop_leases(mdsc); 3083 drop_leases(mdsc);
3080 ceph_flush_dirty_caps(mdsc); 3084 ceph_flush_dirty_caps(mdsc);
3081 wait_requests(mdsc); 3085 wait_requests(mdsc);
3082 3086
3083 /* 3087 /*
3084 * wait for reply handlers to drop their request refs and 3088 * wait for reply handlers to drop their request refs and
3085 * their inode/dcache refs 3089 * their inode/dcache refs
3086 */ 3090 */
3087 ceph_msgr_flush(); 3091 ceph_msgr_flush();
3088 } 3092 }
3089 3093
3090 /* 3094 /*
3091 * wait for all write mds requests to flush. 3095 * wait for all write mds requests to flush.
3092 */ 3096 */
3093 static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) 3097 static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
3094 { 3098 {
3095 struct ceph_mds_request *req = NULL, *nextreq; 3099 struct ceph_mds_request *req = NULL, *nextreq;
3096 struct rb_node *n; 3100 struct rb_node *n;
3097 3101
3098 mutex_lock(&mdsc->mutex); 3102 mutex_lock(&mdsc->mutex);
3099 dout("wait_unsafe_requests want %lld\n", want_tid); 3103 dout("wait_unsafe_requests want %lld\n", want_tid);
3100 restart: 3104 restart:
3101 req = __get_oldest_req(mdsc); 3105 req = __get_oldest_req(mdsc);
3102 while (req && req->r_tid <= want_tid) { 3106 while (req && req->r_tid <= want_tid) {
3103 /* find next request */ 3107 /* find next request */
3104 n = rb_next(&req->r_node); 3108 n = rb_next(&req->r_node);
3105 if (n) 3109 if (n)
3106 nextreq = rb_entry(n, struct ceph_mds_request, r_node); 3110 nextreq = rb_entry(n, struct ceph_mds_request, r_node);
3107 else 3111 else
3108 nextreq = NULL; 3112 nextreq = NULL;
3109 if ((req->r_op & CEPH_MDS_OP_WRITE)) { 3113 if ((req->r_op & CEPH_MDS_OP_WRITE)) {
3110 /* write op */ 3114 /* write op */
3111 ceph_mdsc_get_request(req); 3115 ceph_mdsc_get_request(req);
3112 if (nextreq) 3116 if (nextreq)
3113 ceph_mdsc_get_request(nextreq); 3117 ceph_mdsc_get_request(nextreq);
3114 mutex_unlock(&mdsc->mutex); 3118 mutex_unlock(&mdsc->mutex);
3115 dout("wait_unsafe_requests wait on %llu (want %llu)\n", 3119 dout("wait_unsafe_requests wait on %llu (want %llu)\n",
3116 req->r_tid, want_tid); 3120 req->r_tid, want_tid);
3117 wait_for_completion(&req->r_safe_completion); 3121 wait_for_completion(&req->r_safe_completion);
3118 mutex_lock(&mdsc->mutex); 3122 mutex_lock(&mdsc->mutex);
3119 ceph_mdsc_put_request(req); 3123 ceph_mdsc_put_request(req);
3120 if (!nextreq) 3124 if (!nextreq)
3121 break; /* next dne before, so we're done! */ 3125 break; /* next dne before, so we're done! */
3122 if (RB_EMPTY_NODE(&nextreq->r_node)) { 3126 if (RB_EMPTY_NODE(&nextreq->r_node)) {
3123 /* next request was removed from tree */ 3127 /* next request was removed from tree */
3124 ceph_mdsc_put_request(nextreq); 3128 ceph_mdsc_put_request(nextreq);
3125 goto restart; 3129 goto restart;
3126 } 3130 }
3127 ceph_mdsc_put_request(nextreq); /* won't go away */ 3131 ceph_mdsc_put_request(nextreq); /* won't go away */
3128 } 3132 }
3129 req = nextreq; 3133 req = nextreq;
3130 } 3134 }
3131 mutex_unlock(&mdsc->mutex); 3135 mutex_unlock(&mdsc->mutex);
3132 dout("wait_unsafe_requests done\n"); 3136 dout("wait_unsafe_requests done\n");
3133 } 3137 }
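
The loop above uses a pin/drop/revalidate idiom: it takes a reference on the next tree node while mdsc->mutex is held, sleeps with the mutex dropped, then checks RB_EMPTY_NODE() to detect a node that was erased (and cleared) while the lock was out. A condensed sketch of that idiom, using the same helpers (not a drop-in replacement for the code above):

	struct rb_node *n = rb_next(&req->r_node);
	struct ceph_mds_request *next =
		n ? rb_entry(n, struct ceph_mds_request, r_node) : NULL;

	if (next)
		ceph_mdsc_get_request(next);	/* pin before dropping the lock */
	mutex_unlock(&mdsc->mutex);
	wait_for_completion(&req->r_safe_completion);
	mutex_lock(&mdsc->mutex);
	if (next && RB_EMPTY_NODE(&next->r_node))
		goto restart;			/* unregistered meanwhile; rescan from oldest */
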
3134 3138
3135 void ceph_mdsc_sync(struct ceph_mds_client *mdsc) 3139 void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3136 { 3140 {
3137 u64 want_tid, want_flush; 3141 u64 want_tid, want_flush;
3138 3142
3139 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) 3143 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
3140 return; 3144 return;
3141 3145
3142 dout("sync\n"); 3146 dout("sync\n");
3143 mutex_lock(&mdsc->mutex); 3147 mutex_lock(&mdsc->mutex);
3144 want_tid = mdsc->last_tid; 3148 want_tid = mdsc->last_tid;
3145 want_flush = mdsc->cap_flush_seq; 3149 want_flush = mdsc->cap_flush_seq;
3146 mutex_unlock(&mdsc->mutex); 3150 mutex_unlock(&mdsc->mutex);
3147 dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); 3151 dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
3148 3152
3149 ceph_flush_dirty_caps(mdsc); 3153 ceph_flush_dirty_caps(mdsc);
3150 3154
3151 wait_unsafe_requests(mdsc, want_tid); 3155 wait_unsafe_requests(mdsc, want_tid);
3152 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); 3156 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
3153 } 3157 }
3154 3158
3155 /* 3159 /*
3156 * true if all sessions are closed, or we force unmount 3160 * true if all sessions are closed, or we force unmount
3157 */ 3161 */
3158 static bool done_closing_sessions(struct ceph_mds_client *mdsc) 3162 static bool done_closing_sessions(struct ceph_mds_client *mdsc)
3159 { 3163 {
3160 int i, n = 0; 3164 int i, n = 0;
3161 3165
3162 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) 3166 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
3163 return true; 3167 return true;
3164 3168
3165 mutex_lock(&mdsc->mutex); 3169 mutex_lock(&mdsc->mutex);
3166 for (i = 0; i < mdsc->max_sessions; i++) 3170 for (i = 0; i < mdsc->max_sessions; i++)
3167 if (mdsc->sessions[i]) 3171 if (mdsc->sessions[i])
3168 n++; 3172 n++;
3169 mutex_unlock(&mdsc->mutex); 3173 mutex_unlock(&mdsc->mutex);
3170 return n == 0; 3174 return n == 0;
3171 } 3175 }
3172 3176
3173 /* 3177 /*
3174 * called after sb is ro. 3178 * called after sb is ro.
3175 */ 3179 */
3176 void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) 3180 void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
3177 { 3181 {
3178 struct ceph_mds_session *session; 3182 struct ceph_mds_session *session;
3179 int i; 3183 int i;
3180 struct ceph_fs_client *fsc = mdsc->fsc; 3184 struct ceph_fs_client *fsc = mdsc->fsc;
3181 unsigned long timeout = fsc->client->options->mount_timeout * HZ; 3185 unsigned long timeout = fsc->client->options->mount_timeout * HZ;
3182 3186
3183 dout("close_sessions\n"); 3187 dout("close_sessions\n");
3184 3188
3185 /* close sessions */ 3189 /* close sessions */
3186 mutex_lock(&mdsc->mutex); 3190 mutex_lock(&mdsc->mutex);
3187 for (i = 0; i < mdsc->max_sessions; i++) { 3191 for (i = 0; i < mdsc->max_sessions; i++) {
3188 session = __ceph_lookup_mds_session(mdsc, i); 3192 session = __ceph_lookup_mds_session(mdsc, i);
3189 if (!session) 3193 if (!session)
3190 continue; 3194 continue;
3191 mutex_unlock(&mdsc->mutex); 3195 mutex_unlock(&mdsc->mutex);
3192 mutex_lock(&session->s_mutex); 3196 mutex_lock(&session->s_mutex);
3193 __close_session(mdsc, session); 3197 __close_session(mdsc, session);
3194 mutex_unlock(&session->s_mutex); 3198 mutex_unlock(&session->s_mutex);
3195 ceph_put_mds_session(session); 3199 ceph_put_mds_session(session);
3196 mutex_lock(&mdsc->mutex); 3200 mutex_lock(&mdsc->mutex);
3197 } 3201 }
3198 mutex_unlock(&mdsc->mutex); 3202 mutex_unlock(&mdsc->mutex);
3199 3203
3200 dout("waiting for sessions to close\n"); 3204 dout("waiting for sessions to close\n");
3201 wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc), 3205 wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
3202 timeout); 3206 timeout);
3203 3207
3204 /* tear down remaining sessions */ 3208 /* tear down remaining sessions */
3205 mutex_lock(&mdsc->mutex); 3209 mutex_lock(&mdsc->mutex);
3206 for (i = 0; i < mdsc->max_sessions; i++) { 3210 for (i = 0; i < mdsc->max_sessions; i++) {
3207 if (mdsc->sessions[i]) { 3211 if (mdsc->sessions[i]) {
3208 session = get_session(mdsc->sessions[i]); 3212 session = get_session(mdsc->sessions[i]);
3209 __unregister_session(mdsc, session); 3213 __unregister_session(mdsc, session);
3210 mutex_unlock(&mdsc->mutex); 3214 mutex_unlock(&mdsc->mutex);
3211 mutex_lock(&session->s_mutex); 3215 mutex_lock(&session->s_mutex);
3212 remove_session_caps(session); 3216 remove_session_caps(session);
3213 mutex_unlock(&session->s_mutex); 3217 mutex_unlock(&session->s_mutex);
3214 ceph_put_mds_session(session); 3218 ceph_put_mds_session(session);
3215 mutex_lock(&mdsc->mutex); 3219 mutex_lock(&mdsc->mutex);
3216 } 3220 }
3217 } 3221 }
3218 WARN_ON(!list_empty(&mdsc->cap_delay_list)); 3222 WARN_ON(!list_empty(&mdsc->cap_delay_list));
3219 mutex_unlock(&mdsc->mutex); 3223 mutex_unlock(&mdsc->mutex);
3220 3224
3221 ceph_cleanup_empty_realms(mdsc); 3225 ceph_cleanup_empty_realms(mdsc);
3222 3226
3223 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ 3227 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
3224 3228
3225 dout("stopped\n"); 3229 dout("stopped\n");
3226 } 3230 }
3227 3231
3228 static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) 3232 static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
3229 { 3233 {
3230 dout("stop\n"); 3234 dout("stop\n");
3231 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ 3235 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
3232 if (mdsc->mdsmap) 3236 if (mdsc->mdsmap)
3233 ceph_mdsmap_destroy(mdsc->mdsmap); 3237 ceph_mdsmap_destroy(mdsc->mdsmap);
3234 kfree(mdsc->sessions); 3238 kfree(mdsc->sessions);
3235 ceph_caps_finalize(mdsc); 3239 ceph_caps_finalize(mdsc);
3236 } 3240 }
3237 3241
3238 void ceph_mdsc_destroy(struct ceph_fs_client *fsc) 3242 void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
3239 { 3243 {
3240 struct ceph_mds_client *mdsc = fsc->mdsc; 3244 struct ceph_mds_client *mdsc = fsc->mdsc;
3241 3245
3242 dout("mdsc_destroy %p\n", mdsc); 3246 dout("mdsc_destroy %p\n", mdsc);
3243 ceph_mdsc_stop(mdsc); 3247 ceph_mdsc_stop(mdsc);
3244 3248
3245 /* flush out any connection work with references to us */ 3249 /* flush out any connection work with references to us */
3246 ceph_msgr_flush(); 3250 ceph_msgr_flush();
3247 3251
3248 fsc->mdsc = NULL; 3252 fsc->mdsc = NULL;
3249 kfree(mdsc); 3253 kfree(mdsc);
3250 dout("mdsc_destroy %p done\n", mdsc); 3254 dout("mdsc_destroy %p done\n", mdsc);
3251 } 3255 }
3252 3256
3253 3257
3254 /* 3258 /*
3255 * handle mds map update. 3259 * handle mds map update.
3256 */ 3260 */
3257 void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) 3261 void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
3258 { 3262 {
3259 u32 epoch; 3263 u32 epoch;
3260 u32 maplen; 3264 u32 maplen;
3261 void *p = msg->front.iov_base; 3265 void *p = msg->front.iov_base;
3262 void *end = p + msg->front.iov_len; 3266 void *end = p + msg->front.iov_len;
3263 struct ceph_mdsmap *newmap, *oldmap; 3267 struct ceph_mdsmap *newmap, *oldmap;
3264 struct ceph_fsid fsid; 3268 struct ceph_fsid fsid;
3265 int err = -EINVAL; 3269 int err = -EINVAL;
3266 3270
3267 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); 3271 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
3268 ceph_decode_copy(&p, &fsid, sizeof(fsid)); 3272 ceph_decode_copy(&p, &fsid, sizeof(fsid));
3269 if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0) 3273 if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
3270 return; 3274 return;
3271 epoch = ceph_decode_32(&p); 3275 epoch = ceph_decode_32(&p);
3272 maplen = ceph_decode_32(&p); 3276 maplen = ceph_decode_32(&p);
3273 dout("handle_map epoch %u len %d\n", epoch, (int)maplen); 3277 dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
3274 3278
3275 /* do we need it? */ 3279 /* do we need it? */
3276 ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch); 3280 ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
3277 mutex_lock(&mdsc->mutex); 3281 mutex_lock(&mdsc->mutex);
3278 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { 3282 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
3279 dout("handle_map epoch %u <= our %u\n", 3283 dout("handle_map epoch %u <= our %u\n",
3280 epoch, mdsc->mdsmap->m_epoch); 3284 epoch, mdsc->mdsmap->m_epoch);
3281 mutex_unlock(&mdsc->mutex); 3285 mutex_unlock(&mdsc->mutex);
3282 return; 3286 return;
3283 } 3287 }
3284 3288
3285 newmap = ceph_mdsmap_decode(&p, end); 3289 newmap = ceph_mdsmap_decode(&p, end);
3286 if (IS_ERR(newmap)) { 3290 if (IS_ERR(newmap)) {
3287 err = PTR_ERR(newmap); 3291 err = PTR_ERR(newmap);
3288 goto bad_unlock; 3292 goto bad_unlock;
3289 } 3293 }
3290 3294
3291 /* swap into place */ 3295 /* swap into place */
3292 if (mdsc->mdsmap) { 3296 if (mdsc->mdsmap) {
3293 oldmap = mdsc->mdsmap; 3297 oldmap = mdsc->mdsmap;
3294 mdsc->mdsmap = newmap; 3298 mdsc->mdsmap = newmap;
3295 check_new_map(mdsc, newmap, oldmap); 3299 check_new_map(mdsc, newmap, oldmap);
3296 ceph_mdsmap_destroy(oldmap); 3300 ceph_mdsmap_destroy(oldmap);
3297 } else { 3301 } else {
3298 mdsc->mdsmap = newmap; /* first mds map */ 3302 mdsc->mdsmap = newmap; /* first mds map */
3299 } 3303 }
3300 mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; 3304 mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
3301 3305
3302 __wake_requests(mdsc, &mdsc->waiting_for_map); 3306 __wake_requests(mdsc, &mdsc->waiting_for_map);
3303 3307
3304 mutex_unlock(&mdsc->mutex); 3308 mutex_unlock(&mdsc->mutex);
3305 schedule_delayed(mdsc); 3309 schedule_delayed(mdsc);
3306 return; 3310 return;
3307 3311
3308 bad_unlock: 3312 bad_unlock:
3309 mutex_unlock(&mdsc->mutex); 3313 mutex_unlock(&mdsc->mutex);
3310 bad: 3314 bad:
3311 pr_err("error decoding mdsmap %d\n", err); 3315 pr_err("error decoding mdsmap %d\n", err);
3312 return; 3316 return;
3313 } 3317 }
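
The decode prologue above follows the usual libceph convention: ceph_decode_need() jumps to the supplied label when fewer bytes remain than requested, so every fixed-size read is bounds-checked up front (the "fix length validation in parse_reply_info()" patch in this merge tightens exactly this kind of check elsewhere). A minimal sketch of the idiom:

	void *p = msg->front.iov_base;
	void *end = p + msg->front.iov_len;
	u32 epoch, maplen;

	ceph_decode_need(&p, end, 2 * sizeof(u32), bad);	/* verify length first... */
	epoch = ceph_decode_32(&p);				/* ...then read */
	maplen = ceph_decode_32(&p);
	return;
bad:
	pr_err("short or corrupt message\n");
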
3314 3318
3315 static struct ceph_connection *con_get(struct ceph_connection *con) 3319 static struct ceph_connection *con_get(struct ceph_connection *con)
3316 { 3320 {
3317 struct ceph_mds_session *s = con->private; 3321 struct ceph_mds_session *s = con->private;
3318 3322
3319 if (get_session(s)) { 3323 if (get_session(s)) {
3320 dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref)); 3324 dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
3321 return con; 3325 return con;
3322 } 3326 }
3323 dout("mdsc con_get %p FAIL\n", s); 3327 dout("mdsc con_get %p FAIL\n", s);
3324 return NULL; 3328 return NULL;
3325 } 3329 }
3326 3330
3327 static void con_put(struct ceph_connection *con) 3331 static void con_put(struct ceph_connection *con)
3328 { 3332 {
3329 struct ceph_mds_session *s = con->private; 3333 struct ceph_mds_session *s = con->private;
3330 3334
3331 dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1); 3335 dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1);
3332 ceph_put_mds_session(s); 3336 ceph_put_mds_session(s);
3333 } 3337 }
3334 3338
3335 /* 3339 /*
3336 * if the client is unresponsive for long enough, the mds will kill 3340 * if the client is unresponsive for long enough, the mds will kill
3337 * the session entirely. 3341 * the session entirely.
3338 */ 3342 */
3339 static void peer_reset(struct ceph_connection *con) 3343 static void peer_reset(struct ceph_connection *con)
3340 { 3344 {
3341 struct ceph_mds_session *s = con->private; 3345 struct ceph_mds_session *s = con->private;
3342 struct ceph_mds_client *mdsc = s->s_mdsc; 3346 struct ceph_mds_client *mdsc = s->s_mdsc;
3343 3347
3344 pr_warning("mds%d closed our session\n", s->s_mds); 3348 pr_warning("mds%d closed our session\n", s->s_mds);
3345 send_mds_reconnect(mdsc, s); 3349 send_mds_reconnect(mdsc, s);
3346 } 3350 }
3347 3351
3348 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) 3352 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
3349 { 3353 {
3350 struct ceph_mds_session *s = con->private; 3354 struct ceph_mds_session *s = con->private;
3351 struct ceph_mds_client *mdsc = s->s_mdsc; 3355 struct ceph_mds_client *mdsc = s->s_mdsc;
3352 int type = le16_to_cpu(msg->hdr.type); 3356 int type = le16_to_cpu(msg->hdr.type);
3353 3357
3354 mutex_lock(&mdsc->mutex); 3358 mutex_lock(&mdsc->mutex);
3355 if (__verify_registered_session(mdsc, s) < 0) { 3359 if (__verify_registered_session(mdsc, s) < 0) {
3356 mutex_unlock(&mdsc->mutex); 3360 mutex_unlock(&mdsc->mutex);
3357 goto out; 3361 goto out;
3358 } 3362 }
3359 mutex_unlock(&mdsc->mutex); 3363 mutex_unlock(&mdsc->mutex);
3360 3364
3361 switch (type) { 3365 switch (type) {
3362 case CEPH_MSG_MDS_MAP: 3366 case CEPH_MSG_MDS_MAP:
3363 ceph_mdsc_handle_map(mdsc, msg); 3367 ceph_mdsc_handle_map(mdsc, msg);
3364 break; 3368 break;
3365 case CEPH_MSG_CLIENT_SESSION: 3369 case CEPH_MSG_CLIENT_SESSION:
3366 handle_session(s, msg); 3370 handle_session(s, msg);
3367 break; 3371 break;
3368 case CEPH_MSG_CLIENT_REPLY: 3372 case CEPH_MSG_CLIENT_REPLY:
3369 handle_reply(s, msg); 3373 handle_reply(s, msg);
3370 break; 3374 break;
3371 case CEPH_MSG_CLIENT_REQUEST_FORWARD: 3375 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
3372 handle_forward(mdsc, s, msg); 3376 handle_forward(mdsc, s, msg);
3373 break; 3377 break;
3374 case CEPH_MSG_CLIENT_CAPS: 3378 case CEPH_MSG_CLIENT_CAPS:
3375 ceph_handle_caps(s, msg); 3379 ceph_handle_caps(s, msg);
3376 break; 3380 break;
3377 case CEPH_MSG_CLIENT_SNAP: 3381 case CEPH_MSG_CLIENT_SNAP:
3378 ceph_handle_snap(mdsc, s, msg); 3382 ceph_handle_snap(mdsc, s, msg);
3379 break; 3383 break;
3380 case CEPH_MSG_CLIENT_LEASE: 3384 case CEPH_MSG_CLIENT_LEASE:
3381 handle_lease(mdsc, s, msg); 3385 handle_lease(mdsc, s, msg);
3382 break; 3386 break;
3383 3387
3384 default: 3388 default:
3385 pr_err("received unknown message type %d %s\n", type, 3389 pr_err("received unknown message type %d %s\n", type,
3386 ceph_msg_type_name(type)); 3390 ceph_msg_type_name(type));
3387 } 3391 }
3388 out: 3392 out:
3389 ceph_msg_put(msg); 3393 ceph_msg_put(msg);
3390 } 3394 }
3391 3395
3392 /* 3396 /*
3393 * authentication 3397 * authentication
3394 */ 3398 */
3395 static int get_authorizer(struct ceph_connection *con, 3399 static int get_authorizer(struct ceph_connection *con,
3396 void **buf, int *len, int *proto, 3400 void **buf, int *len, int *proto,
3397 void **reply_buf, int *reply_len, int force_new) 3401 void **reply_buf, int *reply_len, int force_new)
3398 { 3402 {
3399 struct ceph_mds_session *s = con->private; 3403 struct ceph_mds_session *s = con->private;
3400 struct ceph_mds_client *mdsc = s->s_mdsc; 3404 struct ceph_mds_client *mdsc = s->s_mdsc;
3401 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; 3405 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3402 int ret = 0; 3406 int ret = 0;
3403 3407
3404 if (force_new && s->s_authorizer) { 3408 if (force_new && s->s_authorizer) {
3405 ac->ops->destroy_authorizer(ac, s->s_authorizer); 3409 ac->ops->destroy_authorizer(ac, s->s_authorizer);
3406 s->s_authorizer = NULL; 3410 s->s_authorizer = NULL;
3407 } 3411 }
3408 if (s->s_authorizer == NULL) { 3412 if (s->s_authorizer == NULL) {
3409 if (ac->ops->create_authorizer) { 3413 if (ac->ops->create_authorizer) {
3410 ret = ac->ops->create_authorizer( 3414 ret = ac->ops->create_authorizer(
3411 ac, CEPH_ENTITY_TYPE_MDS, 3415 ac, CEPH_ENTITY_TYPE_MDS,
3412 &s->s_authorizer, 3416 &s->s_authorizer,
3413 &s->s_authorizer_buf, 3417 &s->s_authorizer_buf,
3414 &s->s_authorizer_buf_len, 3418 &s->s_authorizer_buf_len,
3415 &s->s_authorizer_reply_buf, 3419 &s->s_authorizer_reply_buf,
3416 &s->s_authorizer_reply_buf_len); 3420 &s->s_authorizer_reply_buf_len);
3417 if (ret) 3421 if (ret)
3418 return ret; 3422 return ret;
3419 } 3423 }
3420 } 3424 }
3421 3425
3422 *proto = ac->protocol; 3426 *proto = ac->protocol;
3423 *buf = s->s_authorizer_buf; 3427 *buf = s->s_authorizer_buf;
3424 *len = s->s_authorizer_buf_len; 3428 *len = s->s_authorizer_buf_len;
3425 *reply_buf = s->s_authorizer_reply_buf; 3429 *reply_buf = s->s_authorizer_reply_buf;
3426 *reply_len = s->s_authorizer_reply_buf_len; 3430 *reply_len = s->s_authorizer_reply_buf_len;
3427 return 0; 3431 return 0;
3428 } 3432 }
3429 3433
3430 3434
3431 static int verify_authorizer_reply(struct ceph_connection *con, int len) 3435 static int verify_authorizer_reply(struct ceph_connection *con, int len)
3432 { 3436 {
3433 struct ceph_mds_session *s = con->private; 3437 struct ceph_mds_session *s = con->private;
3434 struct ceph_mds_client *mdsc = s->s_mdsc; 3438 struct ceph_mds_client *mdsc = s->s_mdsc;
3435 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; 3439 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3436 3440
3437 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len); 3441 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
3438 } 3442 }
3439 3443
3440 static int invalidate_authorizer(struct ceph_connection *con) 3444 static int invalidate_authorizer(struct ceph_connection *con)
3441 { 3445 {
3442 struct ceph_mds_session *s = con->private; 3446 struct ceph_mds_session *s = con->private;
3443 struct ceph_mds_client *mdsc = s->s_mdsc; 3447 struct ceph_mds_client *mdsc = s->s_mdsc;
3444 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; 3448 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3445 3449
3446 if (ac->ops->invalidate_authorizer) 3450 if (ac->ops->invalidate_authorizer)
3447 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); 3451 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3448 3452
3449 return ceph_monc_validate_auth(&mdsc->fsc->client->monc); 3453 return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
3450 } 3454 }
3451 3455
3452 static const struct ceph_connection_operations mds_con_ops = { 3456 static const struct ceph_connection_operations mds_con_ops = {
3453 .get = con_get, 3457 .get = con_get,
3454 .put = con_put, 3458 .put = con_put,
3455 .dispatch = dispatch, 3459 .dispatch = dispatch,
3456 .get_authorizer = get_authorizer, 3460 .get_authorizer = get_authorizer,
3457 .verify_authorizer_reply = verify_authorizer_reply, 3461 .verify_authorizer_reply = verify_authorizer_reply,
3458 .invalidate_authorizer = invalidate_authorizer, 3462 .invalidate_authorizer = invalidate_authorizer,
3459 .peer_reset = peer_reset, 3463 .peer_reset = peer_reset,
3460 }; 3464 };
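
The messenger layer never calls these handlers by name; it pins the session and dispatches through this vtable. Roughly (a sketch of the calling convention, not actual messenger source):

	if (con->ops->get(con)) {		/* con_get(): may fail if session is dying */
		con->ops->dispatch(con, msg);	/* dispatch(): route by message type */
		con->ops->put(con);		/* con_put(): drop the session ref */
	}
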
3461 3465
3462 /* eof */ 3466 /* eof */
3463 3467
fs/ceph/mds_client.h
1 #ifndef _FS_CEPH_MDS_CLIENT_H 1 #ifndef _FS_CEPH_MDS_CLIENT_H
2 #define _FS_CEPH_MDS_CLIENT_H 2 #define _FS_CEPH_MDS_CLIENT_H
3 3
4 #include <linux/completion.h> 4 #include <linux/completion.h>
5 #include <linux/kref.h> 5 #include <linux/kref.h>
6 #include <linux/list.h> 6 #include <linux/list.h>
7 #include <linux/mutex.h> 7 #include <linux/mutex.h>
8 #include <linux/rbtree.h> 8 #include <linux/rbtree.h>
9 #include <linux/spinlock.h> 9 #include <linux/spinlock.h>
10 10
11 #include <linux/ceph/types.h> 11 #include <linux/ceph/types.h>
12 #include <linux/ceph/messenger.h> 12 #include <linux/ceph/messenger.h>
13 #include <linux/ceph/mdsmap.h> 13 #include <linux/ceph/mdsmap.h>
14 14
15 /* 15 /*
16 * Some lock dependencies: 16 * Some lock dependencies:
17 * 17 *
18 * session->s_mutex 18 * session->s_mutex
19 * mdsc->mutex 19 * mdsc->mutex
20 * 20 *
21 * mdsc->snap_rwsem 21 * mdsc->snap_rwsem
22 * 22 *
23 * ci->i_ceph_lock 23 * ci->i_ceph_lock
24 * mdsc->snap_flush_lock 24 * mdsc->snap_flush_lock
25 * mdsc->cap_delay_lock 25 * mdsc->cap_delay_lock
26 * 26 *
27 */ 27 */
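
Concretely, the ordering listed above means any path that needs both locks must take s_mutex first; code that already holds mdsc->mutex (like the delayed_work session scan in mds_client.c) drops it before taking s_mutex. A sketch of the legal nesting:

	mutex_lock(&session->s_mutex);	/* outer */
	mutex_lock(&mdsc->mutex);	/* inner */
	/* ... touch both session and client state ... */
	mutex_unlock(&mdsc->mutex);
	mutex_unlock(&session->s_mutex);
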
28 28
29 struct ceph_fs_client; 29 struct ceph_fs_client;
30 struct ceph_cap; 30 struct ceph_cap;
31 31
32 /* 32 /*
33 * parsed info about a single inode. pointers are into the encoded 33 * parsed info about a single inode. pointers are into the encoded
34 * on-wire structures within the mds reply message payload. 34 * on-wire structures within the mds reply message payload.
35 */ 35 */
36 struct ceph_mds_reply_info_in { 36 struct ceph_mds_reply_info_in {
37 struct ceph_mds_reply_inode *in; 37 struct ceph_mds_reply_inode *in;
38 struct ceph_dir_layout dir_layout; 38 struct ceph_dir_layout dir_layout;
39 u32 symlink_len; 39 u32 symlink_len;
40 char *symlink; 40 char *symlink;
41 u32 xattr_len; 41 u32 xattr_len;
42 char *xattr_data; 42 char *xattr_data;
43 }; 43 };
44 44
45 /* 45 /*
46 * parsed info about an mds reply, including information about 46 * parsed info about an mds reply, including information about
47 * either: 1) the target inode and/or its parent directory and dentry, 47 * either: 1) the target inode and/or its parent directory and dentry,
48 * and directory contents (for readdir results), or 48 * and directory contents (for readdir results), or
49 * 2) the file range lock info (for fcntl F_GETLK results). 49 * 2) the file range lock info (for fcntl F_GETLK results).
50 */ 50 */
51 struct ceph_mds_reply_info_parsed { 51 struct ceph_mds_reply_info_parsed {
52 struct ceph_mds_reply_head *head; 52 struct ceph_mds_reply_head *head;
53 53
54 /* trace */ 54 /* trace */
55 struct ceph_mds_reply_info_in diri, targeti; 55 struct ceph_mds_reply_info_in diri, targeti;
56 struct ceph_mds_reply_dirfrag *dirfrag; 56 struct ceph_mds_reply_dirfrag *dirfrag;
57 char *dname; 57 char *dname;
58 u32 dname_len; 58 u32 dname_len;
59 struct ceph_mds_reply_lease *dlease; 59 struct ceph_mds_reply_lease *dlease;
60 60
61 /* extra */ 61 /* extra */
62 union { 62 union {
63 /* for fcntl F_GETLK results */ 63 /* for fcntl F_GETLK results */
64 struct ceph_filelock *filelock_reply; 64 struct ceph_filelock *filelock_reply;
65 65
66 /* for readdir results */ 66 /* for readdir results */
67 struct { 67 struct {
68 struct ceph_mds_reply_dirfrag *dir_dir; 68 struct ceph_mds_reply_dirfrag *dir_dir;
69 int dir_nr; 69 int dir_nr;
70 char **dir_dname; 70 char **dir_dname;
71 u32 *dir_dname_len; 71 u32 *dir_dname_len;
72 struct ceph_mds_reply_lease **dir_dlease; 72 struct ceph_mds_reply_lease **dir_dlease;
73 struct ceph_mds_reply_info_in *dir_in; 73 struct ceph_mds_reply_info_in *dir_in;
74 u8 dir_complete, dir_end; 74 u8 dir_complete, dir_end;
75 }; 75 };
76 }; 76 };
77 77
78 /* encoded blob describing snapshot contexts for certain 78 /* encoded blob describing snapshot contexts for certain
79 operations (e.g., open) */ 79 operations (e.g., open) */
80 void *snapblob; 80 void *snapblob;
81 int snapblob_len; 81 int snapblob_len;
82 }; 82 };
83 83
84 84
85 /* 85 /*
86 * cap releases are batched and sent to the MDS en masse. 86 * cap releases are batched and sent to the MDS en masse.
87 */ 87 */
88 #define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \ 88 #define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \
89 sizeof(struct ceph_mds_cap_release)) / \ 89 sizeof(struct ceph_mds_cap_release)) / \
90 sizeof(struct ceph_mds_cap_item)) 90 sizeof(struct ceph_mds_cap_item))
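
To put a number on the batching: assuming a 4 KB PAGE_CACHE_SIZE and, purely for illustration, a 16-byte ceph_mds_cap_release header and 24-byte ceph_mds_cap_item (the real sizes come from the wire-protocol headers), the macro works out to:

	CEPH_CAPS_PER_RELEASE = (PAGE_CACHE_SIZE - sizeof(release head)) / sizeof(cap item)
	                      = (4096 - 16) / 24
	                      = 170 cap releases per message   (illustrative sizes)
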
91 91
92 92
93 /* 93 /*
94 * state associated with each MDS<->client session 94 * state associated with each MDS<->client session
95 */ 95 */
96 enum { 96 enum {
97 CEPH_MDS_SESSION_NEW = 1, 97 CEPH_MDS_SESSION_NEW = 1,
98 CEPH_MDS_SESSION_OPENING = 2, 98 CEPH_MDS_SESSION_OPENING = 2,
99 CEPH_MDS_SESSION_OPEN = 3, 99 CEPH_MDS_SESSION_OPEN = 3,
100 CEPH_MDS_SESSION_HUNG = 4, 100 CEPH_MDS_SESSION_HUNG = 4,
101 CEPH_MDS_SESSION_CLOSING = 5, 101 CEPH_MDS_SESSION_CLOSING = 5,
102 CEPH_MDS_SESSION_RESTARTING = 6, 102 CEPH_MDS_SESSION_RESTARTING = 6,
103 CEPH_MDS_SESSION_RECONNECTING = 7, 103 CEPH_MDS_SESSION_RECONNECTING = 7,
104 }; 104 };
105 105
106 struct ceph_mds_session { 106 struct ceph_mds_session {
107 struct ceph_mds_client *s_mdsc; 107 struct ceph_mds_client *s_mdsc;
108 int s_mds; 108 int s_mds;
109 int s_state; 109 int s_state;
110 unsigned long s_ttl; /* time until mds kills us */ 110 unsigned long s_ttl; /* time until mds kills us */
111 u64 s_seq; /* incoming msg seq # */ 111 u64 s_seq; /* incoming msg seq # */
112 struct mutex s_mutex; /* serialize session messages */ 112 struct mutex s_mutex; /* serialize session messages */
113 113
114 struct ceph_connection s_con; 114 struct ceph_connection s_con;
115 115
116 struct ceph_authorizer *s_authorizer; 116 struct ceph_authorizer *s_authorizer;
117 void *s_authorizer_buf, *s_authorizer_reply_buf; 117 void *s_authorizer_buf, *s_authorizer_reply_buf;
118 size_t s_authorizer_buf_len, s_authorizer_reply_buf_len; 118 size_t s_authorizer_buf_len, s_authorizer_reply_buf_len;
119 119
120 /* protected by s_cap_lock */ 120 /* protected by s_gen_ttl_lock */
121 spinlock_t s_cap_lock; 121 spinlock_t s_gen_ttl_lock;
122 u32 s_cap_gen; /* inc each time we get mds stale msg */ 122 u32 s_cap_gen; /* inc each time we get mds stale msg */
123 unsigned long s_cap_ttl; /* when session caps expire */ 123 unsigned long s_cap_ttl; /* when session caps expire */
124
125 /* protected by s_cap_lock */
126 spinlock_t s_cap_lock;
124 struct list_head s_caps; /* all caps issued by this session */ 127 struct list_head s_caps; /* all caps issued by this session */
125 int s_nr_caps, s_trim_caps; 128 int s_nr_caps, s_trim_caps;
126 int s_num_cap_releases; 129 int s_num_cap_releases;
127 struct list_head s_cap_releases; /* waiting cap_release messages */ 130 struct list_head s_cap_releases; /* waiting cap_release messages */
128 struct list_head s_cap_releases_done; /* ready to send */ 131 struct list_head s_cap_releases_done; /* ready to send */
129 struct ceph_cap *s_cap_iterator; 132 struct ceph_cap *s_cap_iterator;
130 133
131 /* protected by mutex */ 134 /* protected by mutex */
132 struct list_head s_cap_flushing; /* inodes w/ flushing caps */ 135 struct list_head s_cap_flushing; /* inodes w/ flushing caps */
133 struct list_head s_cap_snaps_flushing; 136 struct list_head s_cap_snaps_flushing;
134 unsigned long s_renew_requested; /* last time we sent a renew req */ 137 unsigned long s_renew_requested; /* last time we sent a renew req */
135 u64 s_renew_seq; 138 u64 s_renew_seq;
136 139
137 atomic_t s_ref; 140 atomic_t s_ref;
138 struct list_head s_waiting; /* waiting requests */ 141 struct list_head s_waiting; /* waiting requests */
139 struct list_head s_unsafe; /* unsafe requests */ 142 struct list_head s_unsafe; /* unsafe requests */
140 }; 143 };
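
This hunk is the header side of the lock-inversion fix in this merge: s_cap_gen and s_cap_ttl move out from under s_cap_lock to a dedicated s_gen_ttl_lock, so stale-session checks no longer need to touch the cap-list lock. A sketch of a reader under the new lock (variable names are illustrative):

	u32 gen;
	unsigned long ttl;

	spin_lock(&session->s_gen_ttl_lock);
	gen = session->s_cap_gen;
	ttl = session->s_cap_ttl;
	spin_unlock(&session->s_gen_ttl_lock);
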
141 144
142 /* 145 /*
143 * modes of choosing which MDS to send a request to 146 * modes of choosing which MDS to send a request to
144 */ 147 */
145 enum { 148 enum {
146 USE_ANY_MDS, 149 USE_ANY_MDS,
147 USE_RANDOM_MDS, 150 USE_RANDOM_MDS,
148 USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */ 151 USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */
149 }; 152 };
150 153
151 struct ceph_mds_request; 154 struct ceph_mds_request;
152 struct ceph_mds_client; 155 struct ceph_mds_client;
153 156
154 /* 157 /*
155 * request completion callback 158 * request completion callback
156 */ 159 */
157 typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, 160 typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
158 struct ceph_mds_request *req); 161 struct ceph_mds_request *req);
159 162
160 /* 163 /*
161 * an in-flight mds request 164 * an in-flight mds request
162 */ 165 */
163 struct ceph_mds_request { 166 struct ceph_mds_request {
164 u64 r_tid; /* transaction id */ 167 u64 r_tid; /* transaction id */
165 struct rb_node r_node; 168 struct rb_node r_node;
166 struct ceph_mds_client *r_mdsc; 169 struct ceph_mds_client *r_mdsc;
167 170
168 int r_op; /* mds op code */ 171 int r_op; /* mds op code */
169 172
170 /* operation on what? */ 173 /* operation on what? */
171 struct inode *r_inode; /* arg1 */ 174 struct inode *r_inode; /* arg1 */
172 struct dentry *r_dentry; /* arg1 */ 175 struct dentry *r_dentry; /* arg1 */
173 struct dentry *r_old_dentry; /* arg2: rename from or link from */ 176 struct dentry *r_old_dentry; /* arg2: rename from or link from */
174 struct inode *r_old_dentry_dir; /* arg2: old dentry's parent dir */ 177 struct inode *r_old_dentry_dir; /* arg2: old dentry's parent dir */
175 char *r_path1, *r_path2; 178 char *r_path1, *r_path2;
176 struct ceph_vino r_ino1, r_ino2; 179 struct ceph_vino r_ino1, r_ino2;
177 180
178 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ 181 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
179 struct inode *r_target_inode; /* resulting inode */ 182 struct inode *r_target_inode; /* resulting inode */
180 183
181 struct mutex r_fill_mutex; 184 struct mutex r_fill_mutex;
182 185
183 union ceph_mds_request_args r_args; 186 union ceph_mds_request_args r_args;
184 int r_fmode; /* file mode, if expecting cap */ 187 int r_fmode; /* file mode, if expecting cap */
185 uid_t r_uid; 188 uid_t r_uid;
186 gid_t r_gid; 189 gid_t r_gid;
187 190
188 /* for choosing which mds to send this request to */ 191 /* for choosing which mds to send this request to */
189 int r_direct_mode; 192 int r_direct_mode;
190 u32 r_direct_hash; /* choose dir frag based on this dentry hash */ 193 u32 r_direct_hash; /* choose dir frag based on this dentry hash */
191 bool r_direct_is_hash; /* true if r_direct_hash is valid */ 194 bool r_direct_is_hash; /* true if r_direct_hash is valid */
192 195
193 /* data payload is used for xattr ops */ 196 /* data payload is used for xattr ops */
194 struct page **r_pages; 197 struct page **r_pages;
195 int r_num_pages; 198 int r_num_pages;
196 int r_data_len; 199 int r_data_len;
197 200
198 /* what caps shall we drop? */ 201 /* what caps shall we drop? */
199 int r_inode_drop, r_inode_unless; 202 int r_inode_drop, r_inode_unless;
200 int r_dentry_drop, r_dentry_unless; 203 int r_dentry_drop, r_dentry_unless;
201 int r_old_dentry_drop, r_old_dentry_unless; 204 int r_old_dentry_drop, r_old_dentry_unless;
202 struct inode *r_old_inode; 205 struct inode *r_old_inode;
203 int r_old_inode_drop, r_old_inode_unless; 206 int r_old_inode_drop, r_old_inode_unless;
204 207
205 struct ceph_msg *r_request; /* original request */ 208 struct ceph_msg *r_request; /* original request */
206 int r_request_release_offset; 209 int r_request_release_offset;
207 struct ceph_msg *r_reply; 210 struct ceph_msg *r_reply;
208 struct ceph_mds_reply_info_parsed r_reply_info; 211 struct ceph_mds_reply_info_parsed r_reply_info;
209 int r_err; 212 int r_err;
210 bool r_aborted; 213 bool r_aborted;
211 214
212 unsigned long r_timeout; /* optional. jiffies */ 215 unsigned long r_timeout; /* optional. jiffies */
213 unsigned long r_started; /* start time to measure timeout against */ 216 unsigned long r_started; /* start time to measure timeout against */
214 unsigned long r_request_started; /* start time for mds request only, 217 unsigned long r_request_started; /* start time for mds request only,
215 used to measure lease durations */ 218 used to measure lease durations */
216 219
217 /* link unsafe requests to parent directory, for fsync */ 220 /* link unsafe requests to parent directory, for fsync */
218 struct inode *r_unsafe_dir; 221 struct inode *r_unsafe_dir;
219 struct list_head r_unsafe_dir_item; 222 struct list_head r_unsafe_dir_item;
220 223
221 struct ceph_mds_session *r_session; 224 struct ceph_mds_session *r_session;
222 225
223 int r_attempts; /* resend attempts */ 226 int r_attempts; /* resend attempts */
224 int r_num_fwd; /* number of forward attempts */ 227 int r_num_fwd; /* number of forward attempts */
225 int r_resend_mds; /* mds to resend to next, if any*/ 228 int r_resend_mds; /* mds to resend to next, if any*/
226 u32 r_sent_on_mseq; /* cap mseq request was sent at*/ 229 u32 r_sent_on_mseq; /* cap mseq request was sent at*/
227 230
228 struct kref r_kref; 231 struct kref r_kref;
229 struct list_head r_wait; 232 struct list_head r_wait;
230 struct completion r_completion; 233 struct completion r_completion;
231 struct completion r_safe_completion; 234 struct completion r_safe_completion;
232 ceph_mds_request_callback_t r_callback; 235 ceph_mds_request_callback_t r_callback;
233 struct list_head r_unsafe_item; /* per-session unsafe list item */ 236 struct list_head r_unsafe_item; /* per-session unsafe list item */
234 bool r_got_unsafe, r_got_safe, r_got_result; 237 bool r_got_unsafe, r_got_safe, r_got_result;
235 238
236 bool r_did_prepopulate; 239 bool r_did_prepopulate;
237 u32 r_readdir_offset; 240 u32 r_readdir_offset;
238 241
239 struct ceph_cap_reservation r_caps_reservation; 242 struct ceph_cap_reservation r_caps_reservation;
240 int r_num_caps; 243 int r_num_caps;
241 }; 244 };
242 245
243 /* 246 /*
244 * mds client state 247 * mds client state
245 */ 248 */
246 struct ceph_mds_client { 249 struct ceph_mds_client {
247 struct ceph_fs_client *fsc; 250 struct ceph_fs_client *fsc;
248 struct mutex mutex; /* all nested structures */ 251 struct mutex mutex; /* all nested structures */
249 252
250 struct ceph_mdsmap *mdsmap; 253 struct ceph_mdsmap *mdsmap;
251 struct completion safe_umount_waiters; 254 struct completion safe_umount_waiters;
252 wait_queue_head_t session_close_wq; 255 wait_queue_head_t session_close_wq;
253 struct list_head waiting_for_map; 256 struct list_head waiting_for_map;
254 257
255 struct ceph_mds_session **sessions; /* NULL for mds if no session */ 258 struct ceph_mds_session **sessions; /* NULL for mds if no session */
256 int max_sessions; /* len of s_mds_sessions */ 259 int max_sessions; /* len of s_mds_sessions */
257 int stopping; /* true if shutting down */ 260 int stopping; /* true if shutting down */
258 261
259 /* 262 /*
260 * snap_rwsem will cover cap linkage into snaprealms, and 263 * snap_rwsem will cover cap linkage into snaprealms, and
261 * realm snap contexts. (later, we can do per-realm snap 264 * realm snap contexts. (later, we can do per-realm snap
262 * contexts locks..) the empty list contains realms with no 265 * contexts locks..) the empty list contains realms with no
263 * references (implying they contain no inodes with caps) that 266 * references (implying they contain no inodes with caps) that
264 * should be destroyed. 267 * should be destroyed.
265 */ 268 */
266 struct rw_semaphore snap_rwsem; 269 struct rw_semaphore snap_rwsem;
267 struct rb_root snap_realms; 270 struct rb_root snap_realms;
268 struct list_head snap_empty; 271 struct list_head snap_empty;
269 spinlock_t snap_empty_lock; /* protect snap_empty */ 272 spinlock_t snap_empty_lock; /* protect snap_empty */
270 273
271 u64 last_tid; /* most recent mds request */ 274 u64 last_tid; /* most recent mds request */
272 struct rb_root request_tree; /* pending mds requests */ 275 struct rb_root request_tree; /* pending mds requests */
273 struct delayed_work delayed_work; /* delayed work */ 276 struct delayed_work delayed_work; /* delayed work */
274 unsigned long last_renew_caps; /* last time we renewed our caps */ 277 unsigned long last_renew_caps; /* last time we renewed our caps */
275 struct list_head cap_delay_list; /* caps with delayed release */ 278 struct list_head cap_delay_list; /* caps with delayed release */
276 spinlock_t cap_delay_lock; /* protects cap_delay_list */ 279 spinlock_t cap_delay_lock; /* protects cap_delay_list */
277 struct list_head snap_flush_list; /* cap_snaps ready to flush */ 280 struct list_head snap_flush_list; /* cap_snaps ready to flush */
278 spinlock_t snap_flush_lock; 281 spinlock_t snap_flush_lock;
279 282
280 u64 cap_flush_seq; 283 u64 cap_flush_seq;
281 struct list_head cap_dirty; /* inodes with dirty caps */ 284 struct list_head cap_dirty; /* inodes with dirty caps */
282 struct list_head cap_dirty_migrating; /* ...that are migration... */ 285 struct list_head cap_dirty_migrating; /* ...that are migration... */
283 int num_cap_flushing; /* # caps we are flushing */ 286 int num_cap_flushing; /* # caps we are flushing */
284 spinlock_t cap_dirty_lock; /* protects above items */ 287 spinlock_t cap_dirty_lock; /* protects above items */
285 wait_queue_head_t cap_flushing_wq; 288 wait_queue_head_t cap_flushing_wq;
286 289
287 /* 290 /*
288 * Cap reservations 291 * Cap reservations
289 * 292 *
290 * Maintain a global pool of preallocated struct ceph_caps, referenced 293 * Maintain a global pool of preallocated struct ceph_caps, referenced
291 * by struct ceph_caps_reservations. This ensures that we preallocate 294 * by struct ceph_caps_reservations. This ensures that we preallocate
292 * memory needed to successfully process an MDS response. (If an MDS 295 * memory needed to successfully process an MDS response. (If an MDS
293 * sends us cap information and we fail to process it, we will have 296 * sends us cap information and we fail to process it, we will have
294 * problems due to the client and MDS being out of sync.) 297 * problems due to the client and MDS being out of sync.)
295 * 298 *
296 * Reservations are 'owned' by a ceph_cap_reservation context. 299 * Reservations are 'owned' by a ceph_cap_reservation context.
297 */ 300 */
298 spinlock_t caps_list_lock; 301 spinlock_t caps_list_lock;
299 struct list_head caps_list; /* unused (reserved or 302 struct list_head caps_list; /* unused (reserved or
300 unreserved) */ 303 unreserved) */
301 int caps_total_count; /* total caps allocated */ 304 int caps_total_count; /* total caps allocated */
302 int caps_use_count; /* in use */ 305 int caps_use_count; /* in use */
303 int caps_reserve_count; /* unused, reserved */ 306 int caps_reserve_count; /* unused, reserved */
304 int caps_avail_count; /* unused, unreserved */ 307 int caps_avail_count; /* unused, unreserved */
305 int caps_min_count; /* keep at least this many 308 int caps_min_count; /* keep at least this many
306 (unreserved) */ 309 (unreserved) */
307 spinlock_t dentry_lru_lock; 310 spinlock_t dentry_lru_lock;
308 struct list_head dentry_lru; 311 struct list_head dentry_lru;
309 int num_dentry; 312 int num_dentry;
310 }; 313 };
311 314
312 extern const char *ceph_mds_op_name(int op); 315 extern const char *ceph_mds_op_name(int op);
313 316
314 extern struct ceph_mds_session * 317 extern struct ceph_mds_session *
315 __ceph_lookup_mds_session(struct ceph_mds_client *, int mds); 318 __ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
316 319
317 static inline struct ceph_mds_session * 320 static inline struct ceph_mds_session *
318 ceph_get_mds_session(struct ceph_mds_session *s) 321 ceph_get_mds_session(struct ceph_mds_session *s)
319 { 322 {
320 atomic_inc(&s->s_ref); 323 atomic_inc(&s->s_ref);
321 return s; 324 return s;
322 } 325 }
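
Note the asymmetry with the connection layer: ceph_get_mds_session() unconditionally bumps s_ref, so it is only safe when the caller already holds a reference, whereas con_get() in mds_client.c goes through get_session() and can return NULL for a session whose refcount is gone. Typical paired usage when a reference is already held:

	struct ceph_mds_session *s = ceph_get_mds_session(session);

	/* ... use s ... */
	ceph_put_mds_session(s);
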
323 326
324 extern void ceph_put_mds_session(struct ceph_mds_session *s); 327 extern void ceph_put_mds_session(struct ceph_mds_session *s);
325 328
326 extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, 329 extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
327 struct ceph_msg *msg, int mds); 330 struct ceph_msg *msg, int mds);
328 331
329 extern int ceph_mdsc_init(struct ceph_fs_client *fsc); 332 extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
330 extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); 333 extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
331 extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc); 334 extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
332 335
333 extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); 336 extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
334 337
335 extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, 338 extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
336 struct inode *inode, 339 struct inode *inode,
337 struct dentry *dn); 340 struct dentry *dn);
338 341
339 extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); 342 extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
340 343
341 extern struct ceph_mds_request * 344 extern struct ceph_mds_request *
342 ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); 345 ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
343 extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, 346 extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
344 struct ceph_mds_request *req); 347 struct ceph_mds_request *req);
345 extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, 348 extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
346 struct inode *dir, 349 struct inode *dir,
347 struct ceph_mds_request *req); 350 struct ceph_mds_request *req);
348 static inline void ceph_mdsc_get_request(struct ceph_mds_request *req) 351 static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
349 { 352 {
350 kref_get(&req->r_kref); 353 kref_get(&req->r_kref);
351 } 354 }
352 extern void ceph_mdsc_release_request(struct kref *kref); 355 extern void ceph_mdsc_release_request(struct kref *kref);
353 static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) 356 static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
354 { 357 {
355 kref_put(&req->r_kref, ceph_mdsc_release_request); 358 kref_put(&req->r_kref, ceph_mdsc_release_request);
356 } 359 }
357 360
358 extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc, 361 extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
359 struct ceph_mds_session *session); 362 struct ceph_mds_session *session);
360 extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, 363 extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
361 struct ceph_mds_session *session); 364 struct ceph_mds_session *session);
362 365
363 extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); 366 extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
364 367
365 extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, 368 extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
366 int stop_on_nosnap); 369 int stop_on_nosnap);
367 370
368 extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry); 371 extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
369 extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, 372 extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
370 struct inode *inode, 373 struct inode *inode,
371 struct dentry *dentry, char action, 374 struct dentry *dentry, char action,
372 u32 seq); 375 u32 seq);
373 376
374 extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, 377 extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
375 struct ceph_msg *msg); 378 struct ceph_msg *msg);
376 379
377 extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, 380 extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
378 struct ceph_mds_session *session); 381 struct ceph_mds_session *session);
379 382
380 #endif 383 #endif
381 384
fs/ceph/xattr.c
1 #include <linux/ceph/ceph_debug.h> 1 #include <linux/ceph/ceph_debug.h>
2 2
3 #include "super.h" 3 #include "super.h"
4 #include "mds_client.h" 4 #include "mds_client.h"
5 5
6 #include <linux/ceph/decode.h> 6 #include <linux/ceph/decode.h>
7 7
8 #include <linux/xattr.h> 8 #include <linux/xattr.h>
9 #include <linux/slab.h> 9 #include <linux/slab.h>
10 10
11 static bool ceph_is_valid_xattr(const char *name) 11 static bool ceph_is_valid_xattr(const char *name)
12 { 12 {
13 return !strncmp(name, "ceph.", 5) || 13 return !strncmp(name, "ceph.", 5) ||
14 !strncmp(name, XATTR_SECURITY_PREFIX, 14 !strncmp(name, XATTR_SECURITY_PREFIX,
15 XATTR_SECURITY_PREFIX_LEN) || 15 XATTR_SECURITY_PREFIX_LEN) ||
16 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 16 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
17 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); 17 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
18 } 18 }
19 19
20 /* 20 /*
21 * These define virtual xattrs exposing the recursive directory 21 * These define virtual xattrs exposing the recursive directory
22 * statistics and layout metadata. 22 * statistics and layout metadata.
23 */ 23 */
24 struct ceph_vxattr_cb { 24 struct ceph_vxattr_cb {
25 bool readonly; 25 bool readonly;
26 char *name; 26 char *name;
27 size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, 27 size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
28 size_t size); 28 size_t size);
29 }; 29 };
30 30
31 /* directories */ 31 /* directories */
32 32
33 static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val, 33 static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
34 size_t size) 34 size_t size)
35 { 35 {
36 return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs); 36 return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
37 } 37 }
38 38
39 static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val, 39 static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
40 size_t size) 40 size_t size)
41 { 41 {
42 return snprintf(val, size, "%lld", ci->i_files); 42 return snprintf(val, size, "%lld", ci->i_files);
43 } 43 }
44 44
45 static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val, 45 static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
46 size_t size) 46 size_t size)
47 { 47 {
48 return snprintf(val, size, "%lld", ci->i_subdirs); 48 return snprintf(val, size, "%lld", ci->i_subdirs);
49 } 49 }
50 50
51 static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val, 51 static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
52 size_t size) 52 size_t size)
53 { 53 {
54 return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs); 54 return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
55 } 55 }
56 56
57 static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val, 57 static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
58 size_t size) 58 size_t size)
59 { 59 {
60 return snprintf(val, size, "%lld", ci->i_rfiles); 60 return snprintf(val, size, "%lld", ci->i_rfiles);
61 } 61 }
62 62
63 static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val, 63 static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
64 size_t size) 64 size_t size)
65 { 65 {
66 return snprintf(val, size, "%lld", ci->i_rsubdirs); 66 return snprintf(val, size, "%lld", ci->i_rsubdirs);
67 } 67 }
68 68
69 static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val, 69 static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
70 size_t size) 70 size_t size)
71 { 71 {
72 return snprintf(val, size, "%lld", ci->i_rbytes); 72 return snprintf(val, size, "%lld", ci->i_rbytes);
73 } 73 }
74 74
75 static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val, 75 static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
76 size_t size) 76 size_t size)
77 { 77 {
78 return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec, 78 return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
79 (long)ci->i_rctime.tv_nsec); 79 (long)ci->i_rctime.tv_nsec);
80 } 80 }
81 81
82 static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { 82 static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
83 { true, "ceph.dir.entries", ceph_vxattrcb_entries}, 83 { true, "ceph.dir.entries", ceph_vxattrcb_entries},
84 { true, "ceph.dir.files", ceph_vxattrcb_files}, 84 { true, "ceph.dir.files", ceph_vxattrcb_files},
85 { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs}, 85 { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
86 { true, "ceph.dir.rentries", ceph_vxattrcb_rentries}, 86 { true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
87 { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles}, 87 { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
88 { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, 88 { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
89 { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes}, 89 { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
90 { true, "ceph.dir.rctime", ceph_vxattrcb_rctime}, 90 { true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
91 { true, NULL, NULL } 91 { true, NULL, NULL }
92 }; 92 };
93 93
94 /* files */ 94 /* files */
95 95
96 static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, 96 static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
97 size_t size) 97 size_t size)
98 { 98 {
99 int ret; 99 int ret;
100 100
101 ret = snprintf(val, size, 101 ret = snprintf(val, size,
102 "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n", 102 "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
103 (unsigned long long)ceph_file_layout_su(ci->i_layout), 103 (unsigned long long)ceph_file_layout_su(ci->i_layout),
104 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), 104 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
105 (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); 105 (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
106 if (ceph_file_layout_pg_preferred(ci->i_layout)) 106 if (ceph_file_layout_pg_preferred(ci->i_layout))
107 ret += snprintf(val + ret, size, "preferred_osd=%lld\n", 107 ret += snprintf(val + ret, size, "preferred_osd=%lld\n",
108 (unsigned long long)ceph_file_layout_pg_preferred( 108 (unsigned long long)ceph_file_layout_pg_preferred(
109 ci->i_layout)); 109 ci->i_layout));
110 return ret; 110 return ret;
111 } 111 }
112 112
113 static struct ceph_vxattr_cb ceph_file_vxattrs[] = { 113 static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
114 { true, "ceph.file.layout", ceph_vxattrcb_layout},
115 /* The following extended attribute name is deprecated */
114 { true, "ceph.layout", ceph_vxattrcb_layout}, 116 { true, "ceph.layout", ceph_vxattrcb_layout},
115 { NULL, NULL } 117 { true, NULL, NULL }
116 }; 118 };
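/*
 * Aside (illustrative, not part of this diff): with the rename above,
 * userspace reads the layout through "ceph.file.layout"; the old
 * "ceph.layout" name is kept only for compatibility. A minimal sketch
 * using the standard getxattr(2) call; the mount path is a placeholder.
 */
#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(void)
{
	char buf[256];
	ssize_t len = getxattr("/mnt/ceph/file", "ceph.file.layout",
			       buf, sizeof(buf) - 1);

	if (len < 0) {
		perror("getxattr");
		return 1;
	}
	buf[len] = '\0';
	printf("%s", buf);	/* chunk_bytes=..., stripe_count=..., ... */
	return 0;
}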
117 119
118 static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) 120 static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
119 { 121 {
120 if (S_ISDIR(inode->i_mode)) 122 if (S_ISDIR(inode->i_mode))
121 return ceph_dir_vxattrs; 123 return ceph_dir_vxattrs;
122 else if (S_ISREG(inode->i_mode)) 124 else if (S_ISREG(inode->i_mode))
123 return ceph_file_vxattrs; 125 return ceph_file_vxattrs;
124 return NULL; 126 return NULL;
125 } 127 }
126 128
127 static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr, 129 static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
128 const char *name) 130 const char *name)
129 { 131 {
130 do { 132 do {
131 if (strcmp(vxattr->name, name) == 0) 133 if (strcmp(vxattr->name, name) == 0)
132 return vxattr; 134 return vxattr;
133 vxattr++; 135 vxattr++;
134 } while (vxattr->name); 136 } while (vxattr->name);
135 return NULL; 137 return NULL;
136 } 138 }
137 139
138 static int __set_xattr(struct ceph_inode_info *ci, 140 static int __set_xattr(struct ceph_inode_info *ci,
139 const char *name, int name_len, 141 const char *name, int name_len,
140 const char *val, int val_len, 142 const char *val, int val_len,
141 int dirty, 143 int dirty,
142 int should_free_name, int should_free_val, 144 int should_free_name, int should_free_val,
143 struct ceph_inode_xattr **newxattr) 145 struct ceph_inode_xattr **newxattr)
144 { 146 {
145 struct rb_node **p; 147 struct rb_node **p;
146 struct rb_node *parent = NULL; 148 struct rb_node *parent = NULL;
147 struct ceph_inode_xattr *xattr = NULL; 149 struct ceph_inode_xattr *xattr = NULL;
148 int c; 150 int c;
149 int new = 0; 151 int new = 0;
150 152
151 p = &ci->i_xattrs.index.rb_node; 153 p = &ci->i_xattrs.index.rb_node;
152 while (*p) { 154 while (*p) {
153 parent = *p; 155 parent = *p;
154 xattr = rb_entry(parent, struct ceph_inode_xattr, node); 156 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
155 c = strncmp(name, xattr->name, min(name_len, xattr->name_len)); 157 c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
156 if (c < 0) 158 if (c < 0)
157 p = &(*p)->rb_left; 159 p = &(*p)->rb_left;
158 else if (c > 0) 160 else if (c > 0)
159 p = &(*p)->rb_right; 161 p = &(*p)->rb_right;
160 else { 162 else {
161 if (name_len == xattr->name_len) 163 if (name_len == xattr->name_len)
162 break; 164 break;
163 else if (name_len < xattr->name_len) 165 else if (name_len < xattr->name_len)
164 p = &(*p)->rb_left; 166 p = &(*p)->rb_left;
165 else 167 else
166 p = &(*p)->rb_right; 168 p = &(*p)->rb_right;
167 } 169 }
168 xattr = NULL; 170 xattr = NULL;
169 } 171 }
170 172
171 if (!xattr) { 173 if (!xattr) {
172 new = 1; 174 new = 1;
173 xattr = *newxattr; 175 xattr = *newxattr;
174 xattr->name = name; 176 xattr->name = name;
175 xattr->name_len = name_len; 177 xattr->name_len = name_len;
176 xattr->should_free_name = should_free_name; 178 xattr->should_free_name = should_free_name;
177 179
178 ci->i_xattrs.count++; 180 ci->i_xattrs.count++;
179 dout("__set_xattr count=%d\n", ci->i_xattrs.count); 181 dout("__set_xattr count=%d\n", ci->i_xattrs.count);
180 } else { 182 } else {
181 kfree(*newxattr); 183 kfree(*newxattr);
182 *newxattr = NULL; 184 *newxattr = NULL;
183 if (xattr->should_free_val) 185 if (xattr->should_free_val)
184 kfree((void *)xattr->val); 186 kfree((void *)xattr->val);
185 187
186 if (should_free_name) { 188 if (should_free_name) {
187 kfree((void *)name); 189 kfree((void *)name);
188 name = xattr->name; 190 name = xattr->name;
189 } 191 }
190 ci->i_xattrs.names_size -= xattr->name_len; 192 ci->i_xattrs.names_size -= xattr->name_len;
191 ci->i_xattrs.vals_size -= xattr->val_len; 193 ci->i_xattrs.vals_size -= xattr->val_len;
192 } 194 }
193 ci->i_xattrs.names_size += name_len; 195 ci->i_xattrs.names_size += name_len;
194 ci->i_xattrs.vals_size += val_len; 196 ci->i_xattrs.vals_size += val_len;
195 if (val) 197 if (val)
196 xattr->val = val; 198 xattr->val = val;
197 else 199 else
198 xattr->val = ""; 200 xattr->val = "";
199 201
200 xattr->val_len = val_len; 202 xattr->val_len = val_len;
201 xattr->dirty = dirty; 203 xattr->dirty = dirty;
202 xattr->should_free_val = (val && should_free_val); 204 xattr->should_free_val = (val && should_free_val);
203 205
204 if (new) { 206 if (new) {
205 rb_link_node(&xattr->node, parent, p); 207 rb_link_node(&xattr->node, parent, p);
206 rb_insert_color(&xattr->node, &ci->i_xattrs.index); 208 rb_insert_color(&xattr->node, &ci->i_xattrs.index);
207 dout("__set_xattr_val p=%p\n", p); 209 dout("__set_xattr_val p=%p\n", p);
208 } 210 }
209 211
210 dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n", 212 dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
211 ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val); 213 ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
212 214
213 return 0; 215 return 0;
214 } 216 }
215 217
216 static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci, 218 static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
217 const char *name) 219 const char *name)
218 { 220 {
219 struct rb_node **p; 221 struct rb_node **p;
220 struct rb_node *parent = NULL; 222 struct rb_node *parent = NULL;
221 struct ceph_inode_xattr *xattr = NULL; 223 struct ceph_inode_xattr *xattr = NULL;
222 int name_len = strlen(name); 224 int name_len = strlen(name);
223 int c; 225 int c;
224 226
225 p = &ci->i_xattrs.index.rb_node; 227 p = &ci->i_xattrs.index.rb_node;
226 while (*p) { 228 while (*p) {
227 parent = *p; 229 parent = *p;
228 xattr = rb_entry(parent, struct ceph_inode_xattr, node); 230 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
229 c = strncmp(name, xattr->name, xattr->name_len); 231 c = strncmp(name, xattr->name, xattr->name_len);
230 if (c == 0 && name_len > xattr->name_len) 232 if (c == 0 && name_len > xattr->name_len)
231 c = 1; 233 c = 1;
232 if (c < 0) 234 if (c < 0)
233 p = &(*p)->rb_left; 235 p = &(*p)->rb_left;
234 else if (c > 0) 236 else if (c > 0)
235 p = &(*p)->rb_right; 237 p = &(*p)->rb_right;
236 else { 238 else {
237 dout("__get_xattr %s: found %.*s\n", name, 239 dout("__get_xattr %s: found %.*s\n", name,
238 xattr->val_len, xattr->val); 240 xattr->val_len, xattr->val);
239 return xattr; 241 return xattr;
240 } 242 }
241 } 243 }
242 244
243 dout("__get_xattr %s: not found\n", name); 245 dout("__get_xattr %s: not found\n", name);
244 246
245 return NULL; 247 return NULL;
246 } 248 }
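/*
 * Aside (illustrative sketch, not in the diff): both tree walks above
 * order names by their common prefix first and break ties on length,
 * so "user.a" sorts before "user.ab". Standalone, that ordering is:
 */
#include <string.h>

static int xattr_name_cmp(const char *a, size_t a_len,
			  const char *b, size_t b_len)
{
	size_t n = a_len < b_len ? a_len : b_len;
	int c = strncmp(a, b, n);

	if (c)
		return c;
	if (a_len == b_len)
		return 0;
	return a_len < b_len ? -1 : 1;	/* shorter name sorts first */
}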
247 249
248 static void __free_xattr(struct ceph_inode_xattr *xattr) 250 static void __free_xattr(struct ceph_inode_xattr *xattr)
249 { 251 {
250 BUG_ON(!xattr); 252 BUG_ON(!xattr);
251 253
252 if (xattr->should_free_name) 254 if (xattr->should_free_name)
253 kfree((void *)xattr->name); 255 kfree((void *)xattr->name);
254 if (xattr->should_free_val) 256 if (xattr->should_free_val)
255 kfree((void *)xattr->val); 257 kfree((void *)xattr->val);
256 258
257 kfree(xattr); 259 kfree(xattr);
258 } 260 }
259 261
260 static int __remove_xattr(struct ceph_inode_info *ci, 262 static int __remove_xattr(struct ceph_inode_info *ci,
261 struct ceph_inode_xattr *xattr) 263 struct ceph_inode_xattr *xattr)
262 { 264 {
263 if (!xattr) 265 if (!xattr)
264 return -EOPNOTSUPP; 266 return -EOPNOTSUPP;
265 267
266 rb_erase(&xattr->node, &ci->i_xattrs.index); 268 rb_erase(&xattr->node, &ci->i_xattrs.index);
267 269
268 if (xattr->should_free_name) 270 if (xattr->should_free_name)
269 kfree((void *)xattr->name); 271 kfree((void *)xattr->name);
270 if (xattr->should_free_val) 272 if (xattr->should_free_val)
271 kfree((void *)xattr->val); 273 kfree((void *)xattr->val);
272 274
273 ci->i_xattrs.names_size -= xattr->name_len; 275 ci->i_xattrs.names_size -= xattr->name_len;
274 ci->i_xattrs.vals_size -= xattr->val_len; 276 ci->i_xattrs.vals_size -= xattr->val_len;
275 ci->i_xattrs.count--; 277 ci->i_xattrs.count--;
276 kfree(xattr); 278 kfree(xattr);
277 279
278 return 0; 280 return 0;
279 } 281 }
280 282
281 static int __remove_xattr_by_name(struct ceph_inode_info *ci, 283 static int __remove_xattr_by_name(struct ceph_inode_info *ci,
282 const char *name) 284 const char *name)
283 { 285 {
284 struct rb_node **p; 286 struct rb_node **p;
285 struct ceph_inode_xattr *xattr; 287 struct ceph_inode_xattr *xattr;
286 int err; 288 int err;
287 289
288 p = &ci->i_xattrs.index.rb_node; 290 p = &ci->i_xattrs.index.rb_node;
289 xattr = __get_xattr(ci, name); 291 xattr = __get_xattr(ci, name);
290 err = __remove_xattr(ci, xattr); 292 err = __remove_xattr(ci, xattr);
291 return err; 293 return err;
292 } 294 }
293 295
294 static char *__copy_xattr_names(struct ceph_inode_info *ci, 296 static char *__copy_xattr_names(struct ceph_inode_info *ci,
295 char *dest) 297 char *dest)
296 { 298 {
297 struct rb_node *p; 299 struct rb_node *p;
298 struct ceph_inode_xattr *xattr = NULL; 300 struct ceph_inode_xattr *xattr = NULL;
299 301
300 p = rb_first(&ci->i_xattrs.index); 302 p = rb_first(&ci->i_xattrs.index);
301 dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count); 303 dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
302 304
303 while (p) { 305 while (p) {
304 xattr = rb_entry(p, struct ceph_inode_xattr, node); 306 xattr = rb_entry(p, struct ceph_inode_xattr, node);
305 memcpy(dest, xattr->name, xattr->name_len); 307 memcpy(dest, xattr->name, xattr->name_len);
306 dest[xattr->name_len] = '\0'; 308 dest[xattr->name_len] = '\0';
307 309
308 dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name, 310 dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
309 xattr->name_len, ci->i_xattrs.names_size); 311 xattr->name_len, ci->i_xattrs.names_size);
310 312
311 dest += xattr->name_len + 1; 313 dest += xattr->name_len + 1;
312 p = rb_next(p); 314 p = rb_next(p);
313 } 315 }
314 316
315 return dest; 317 return dest;
316 } 318 }
317 319
318 void __ceph_destroy_xattrs(struct ceph_inode_info *ci) 320 void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
319 { 321 {
320 struct rb_node *p, *tmp; 322 struct rb_node *p, *tmp;
321 struct ceph_inode_xattr *xattr = NULL; 323 struct ceph_inode_xattr *xattr = NULL;
322 324
323 p = rb_first(&ci->i_xattrs.index); 325 p = rb_first(&ci->i_xattrs.index);
324 326
325 dout("__ceph_destroy_xattrs p=%p\n", p); 327 dout("__ceph_destroy_xattrs p=%p\n", p);
326 328
327 while (p) { 329 while (p) {
328 xattr = rb_entry(p, struct ceph_inode_xattr, node); 330 xattr = rb_entry(p, struct ceph_inode_xattr, node);
329 tmp = p; 331 tmp = p;
330 p = rb_next(tmp); 332 p = rb_next(tmp);
331 dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p, 333 dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
332 xattr->name_len, xattr->name); 334 xattr->name_len, xattr->name);
333 rb_erase(tmp, &ci->i_xattrs.index); 335 rb_erase(tmp, &ci->i_xattrs.index);
334 336
335 __free_xattr(xattr); 337 __free_xattr(xattr);
336 } 338 }
337 339
338 ci->i_xattrs.names_size = 0; 340 ci->i_xattrs.names_size = 0;
339 ci->i_xattrs.vals_size = 0; 341 ci->i_xattrs.vals_size = 0;
340 ci->i_xattrs.index_version = 0; 342 ci->i_xattrs.index_version = 0;
341 ci->i_xattrs.count = 0; 343 ci->i_xattrs.count = 0;
342 ci->i_xattrs.index = RB_ROOT; 344 ci->i_xattrs.index = RB_ROOT;
343 } 345 }
344 346
345 static int __build_xattrs(struct inode *inode) 347 static int __build_xattrs(struct inode *inode)
346 __releases(ci->i_ceph_lock) 348 __releases(ci->i_ceph_lock)
347 __acquires(ci->i_ceph_lock) 349 __acquires(ci->i_ceph_lock)
348 { 350 {
349 u32 namelen; 351 u32 namelen;
350 u32 numattr = 0; 352 u32 numattr = 0;
351 void *p, *end; 353 void *p, *end;
352 u32 len; 354 u32 len;
353 const char *name, *val; 355 const char *name, *val;
354 struct ceph_inode_info *ci = ceph_inode(inode); 356 struct ceph_inode_info *ci = ceph_inode(inode);
355 int xattr_version; 357 int xattr_version;
356 struct ceph_inode_xattr **xattrs = NULL; 358 struct ceph_inode_xattr **xattrs = NULL;
357 int err = 0; 359 int err = 0;
358 int i; 360 int i;
359 361
360 dout("__build_xattrs() len=%d\n", 362 dout("__build_xattrs() len=%d\n",
361 ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0); 363 ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
362 364
363 if (ci->i_xattrs.index_version >= ci->i_xattrs.version) 365 if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
364 return 0; /* already built */ 366 return 0; /* already built */
365 367
366 __ceph_destroy_xattrs(ci); 368 __ceph_destroy_xattrs(ci);
367 369
368 start: 370 start:
369 /* update the internal xattr rb tree */ 371 /* update the internal xattr rb tree */
370 if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) { 372 if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) {
371 p = ci->i_xattrs.blob->vec.iov_base; 373 p = ci->i_xattrs.blob->vec.iov_base;
372 end = p + ci->i_xattrs.blob->vec.iov_len; 374 end = p + ci->i_xattrs.blob->vec.iov_len;
373 ceph_decode_32_safe(&p, end, numattr, bad); 375 ceph_decode_32_safe(&p, end, numattr, bad);
374 xattr_version = ci->i_xattrs.version; 376 xattr_version = ci->i_xattrs.version;
375 spin_unlock(&ci->i_ceph_lock); 377 spin_unlock(&ci->i_ceph_lock);
376 378
377 xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *), 379 xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *),
378 GFP_NOFS); 380 GFP_NOFS);
379 err = -ENOMEM; 381 err = -ENOMEM;
380 if (!xattrs) 382 if (!xattrs)
381 goto bad_lock; 383 goto bad_lock;
382 memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *)); 384 memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *));
383 for (i = 0; i < numattr; i++) { 385 for (i = 0; i < numattr; i++) {
384 xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr), 386 xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
385 GFP_NOFS); 387 GFP_NOFS);
386 if (!xattrs[i]) 388 if (!xattrs[i])
387 goto bad_lock; 389 goto bad_lock;
388 } 390 }
389 391
390 spin_lock(&ci->i_ceph_lock); 392 spin_lock(&ci->i_ceph_lock);
391 if (ci->i_xattrs.version != xattr_version) { 393 if (ci->i_xattrs.version != xattr_version) {
392 /* lost a race, retry */ 394 /* lost a race, retry */
393 for (i = 0; i < numattr; i++) 395 for (i = 0; i < numattr; i++)
394 kfree(xattrs[i]); 396 kfree(xattrs[i]);
395 kfree(xattrs); 397 kfree(xattrs);
396 goto start; 398 goto start;
397 } 399 }
398 err = -EIO; 400 err = -EIO;
399 while (numattr--) { 401 while (numattr--) {
400 ceph_decode_32_safe(&p, end, len, bad); 402 ceph_decode_32_safe(&p, end, len, bad);
401 namelen = len; 403 namelen = len;
402 name = p; 404 name = p;
403 p += len; 405 p += len;
404 ceph_decode_32_safe(&p, end, len, bad); 406 ceph_decode_32_safe(&p, end, len, bad);
405 val = p; 407 val = p;
406 p += len; 408 p += len;
407 409
408 err = __set_xattr(ci, name, namelen, val, len, 410 err = __set_xattr(ci, name, namelen, val, len,
409 0, 0, 0, &xattrs[numattr]); 411 0, 0, 0, &xattrs[numattr]);
410 412
411 if (err < 0) 413 if (err < 0)
412 goto bad; 414 goto bad;
413 } 415 }
414 kfree(xattrs); 416 kfree(xattrs);
415 } 417 }
416 ci->i_xattrs.index_version = ci->i_xattrs.version; 418 ci->i_xattrs.index_version = ci->i_xattrs.version;
417 ci->i_xattrs.dirty = false; 419 ci->i_xattrs.dirty = false;
418 420
419 return err; 421 return err;
420 bad_lock: 422 bad_lock:
421 spin_lock(&ci->i_ceph_lock); 423 spin_lock(&ci->i_ceph_lock);
422 bad: 424 bad:
423 if (xattrs) { 425 if (xattrs) {
424 for (i = 0; i < numattr; i++) 426 for (i = 0; i < numattr; i++)
425 kfree(xattrs[i]); 427 kfree(xattrs[i]);
426 kfree(xattrs); 428 kfree(xattrs);
427 } 429 }
428 ci->i_xattrs.names_size = 0; 430 ci->i_xattrs.names_size = 0;
429 return err; 431 return err;
430 } 432 }
431 433
432 static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size, 434 static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
433 int val_size) 435 int val_size)
434 { 436 {
435 /* 437 /*
436 * 4 bytes for the xattr count, plus 4 bytes per xattr name 438 * 4 bytes for the xattr count, plus 4 bytes per xattr name
437 * and 4 bytes per value 439 * and 4 bytes per value
438 */ 440 */
439 int size = 4 + ci->i_xattrs.count*(4 + 4) + 441 int size = 4 + ci->i_xattrs.count*(4 + 4) +
440 ci->i_xattrs.names_size + 442 ci->i_xattrs.names_size +
441 ci->i_xattrs.vals_size; 443 ci->i_xattrs.vals_size;
442 dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n", 444 dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
443 ci->i_xattrs.count, ci->i_xattrs.names_size, 445 ci->i_xattrs.count, ci->i_xattrs.names_size,
444 ci->i_xattrs.vals_size); 446 ci->i_xattrs.vals_size);
445 447
446 if (name_size) 448 if (name_size)
447 size += 4 + 4 + name_size + val_size; 449 size += 4 + 4 + name_size + val_size;
448 450
449 return size; 451 return size;
450 } 452 }
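/*
 * Worked example (illustrative, not part of the diff): with two cached
 * xattrs "user.a"="x" and "user.bc"="yz", count = 2,
 * names_size = 6 + 7 = 13 and vals_size = 1 + 2 = 3, so
 *
 *	size = 4 + 2 * (4 + 4) + 13 + 3 = 36 bytes
 *
 * before any additional name/value being set is accounted for.
 */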
451 453
452 /* 454 /*
453 * If there are dirty xattrs, reencode xattrs into the prealloc_blob 455 * If there are dirty xattrs, reencode xattrs into the prealloc_blob
454 * and swap into place. 456 * and swap into place.
455 */ 457 */
456 void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) 458 void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
457 { 459 {
458 struct rb_node *p; 460 struct rb_node *p;
459 struct ceph_inode_xattr *xattr = NULL; 461 struct ceph_inode_xattr *xattr = NULL;
460 void *dest; 462 void *dest;
461 463
462 dout("__build_xattrs_blob %p\n", &ci->vfs_inode); 464 dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
463 if (ci->i_xattrs.dirty) { 465 if (ci->i_xattrs.dirty) {
464 int need = __get_required_blob_size(ci, 0, 0); 466 int need = __get_required_blob_size(ci, 0, 0);
465 467
466 BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len); 468 BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);
467 469
468 p = rb_first(&ci->i_xattrs.index); 470 p = rb_first(&ci->i_xattrs.index);
469 dest = ci->i_xattrs.prealloc_blob->vec.iov_base; 471 dest = ci->i_xattrs.prealloc_blob->vec.iov_base;
470 472
471 ceph_encode_32(&dest, ci->i_xattrs.count); 473 ceph_encode_32(&dest, ci->i_xattrs.count);
472 while (p) { 474 while (p) {
473 xattr = rb_entry(p, struct ceph_inode_xattr, node); 475 xattr = rb_entry(p, struct ceph_inode_xattr, node);
474 476
475 ceph_encode_32(&dest, xattr->name_len); 477 ceph_encode_32(&dest, xattr->name_len);
476 memcpy(dest, xattr->name, xattr->name_len); 478 memcpy(dest, xattr->name, xattr->name_len);
477 dest += xattr->name_len; 479 dest += xattr->name_len;
478 ceph_encode_32(&dest, xattr->val_len); 480 ceph_encode_32(&dest, xattr->val_len);
479 memcpy(dest, xattr->val, xattr->val_len); 481 memcpy(dest, xattr->val, xattr->val_len);
480 dest += xattr->val_len; 482 dest += xattr->val_len;
481 483
482 p = rb_next(p); 484 p = rb_next(p);
483 } 485 }
484 486
485 /* adjust buffer len; it may be larger than we need */ 487 /* adjust buffer len; it may be larger than we need */
486 ci->i_xattrs.prealloc_blob->vec.iov_len = 488 ci->i_xattrs.prealloc_blob->vec.iov_len =
487 dest - ci->i_xattrs.prealloc_blob->vec.iov_base; 489 dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
488 490
489 if (ci->i_xattrs.blob) 491 if (ci->i_xattrs.blob)
490 ceph_buffer_put(ci->i_xattrs.blob); 492 ceph_buffer_put(ci->i_xattrs.blob);
491 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob; 493 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
492 ci->i_xattrs.prealloc_blob = NULL; 494 ci->i_xattrs.prealloc_blob = NULL;
493 ci->i_xattrs.dirty = false; 495 ci->i_xattrs.dirty = false;
494 ci->i_xattrs.version++; 496 ci->i_xattrs.version++;
495 } 497 }
496 } 498 }
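/*
 * Aside (illustrative sketch, not in the diff): the blob built above is
 * simply a 32-bit count followed by (32-bit length, bytes) pairs for
 * each name and value. A minimal decoder under those assumptions,
 * taking a little-endian host for brevity:
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t get_u32(const unsigned char **p)
{
	uint32_t v;

	memcpy(&v, *p, 4);
	*p += 4;
	return v;
}

static void dump_xattr_blob(const unsigned char *p, size_t blob_len)
{
	const unsigned char *end = p + blob_len;
	uint32_t count = get_u32(&p);

	while (count-- && p < end) {
		uint32_t name_len = get_u32(&p);
		const unsigned char *name = p;

		p += name_len;
		uint32_t val_len = get_u32(&p);

		printf("%.*s=%.*s\n", (int)name_len, (const char *)name,
		       (int)val_len, (const char *)p);
		p += val_len;
	}
}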
497 499
498 ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, 500 ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
499 size_t size) 501 size_t size)
500 { 502 {
501 struct inode *inode = dentry->d_inode; 503 struct inode *inode = dentry->d_inode;
502 struct ceph_inode_info *ci = ceph_inode(inode); 504 struct ceph_inode_info *ci = ceph_inode(inode);
503 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); 505 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
504 int err; 506 int err;
505 struct ceph_inode_xattr *xattr; 507 struct ceph_inode_xattr *xattr;
506 struct ceph_vxattr_cb *vxattr = NULL; 508 struct ceph_vxattr_cb *vxattr = NULL;
507 509
508 if (!ceph_is_valid_xattr(name)) 510 if (!ceph_is_valid_xattr(name))
509 return -ENODATA; 511 return -ENODATA;
510 512
511 /* let's see if a virtual xattr was requested */ 513 /* let's see if a virtual xattr was requested */
512 if (vxattrs) 514 if (vxattrs)
513 vxattr = ceph_match_vxattr(vxattrs, name); 515 vxattr = ceph_match_vxattr(vxattrs, name);
514 516
515 spin_lock(&ci->i_ceph_lock); 517 spin_lock(&ci->i_ceph_lock);
516 dout("getxattr %p ver=%lld index_ver=%lld\n", inode, 518 dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
517 ci->i_xattrs.version, ci->i_xattrs.index_version); 519 ci->i_xattrs.version, ci->i_xattrs.index_version);
518 520
519 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && 521 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
520 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { 522 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
521 goto get_xattr; 523 goto get_xattr;
522 } else { 524 } else {
523 spin_unlock(&ci->i_ceph_lock); 525 spin_unlock(&ci->i_ceph_lock);
524 /* get xattrs from mds (if we don't already have them) */ 526 /* get xattrs from mds (if we don't already have them) */
525 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); 527 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
526 if (err) 528 if (err)
527 return err; 529 return err;
528 } 530 }
529 531
530 spin_lock(&ci->i_ceph_lock); 532 spin_lock(&ci->i_ceph_lock);
531 533
532 if (vxattr && vxattr->readonly) { 534 if (vxattr && vxattr->readonly) {
533 err = vxattr->getxattr_cb(ci, value, size); 535 err = vxattr->getxattr_cb(ci, value, size);
534 goto out; 536 goto out;
535 } 537 }
536 538
537 err = __build_xattrs(inode); 539 err = __build_xattrs(inode);
538 if (err < 0) 540 if (err < 0)
539 goto out; 541 goto out;
540 542
541 get_xattr: 543 get_xattr:
542 err = -ENODATA; /* == ENOATTR */ 544 err = -ENODATA; /* == ENOATTR */
543 xattr = __get_xattr(ci, name); 545 xattr = __get_xattr(ci, name);
544 if (!xattr) { 546 if (!xattr) {
545 if (vxattr) 547 if (vxattr)
546 err = vxattr->getxattr_cb(ci, value, size); 548 err = vxattr->getxattr_cb(ci, value, size);
547 goto out; 549 goto out;
548 } 550 }
549 551
550 err = -ERANGE; 552 err = -ERANGE;
551 if (size && size < xattr->val_len) 553 if (size && size < xattr->val_len)
552 goto out; 554 goto out;
553 555
554 err = xattr->val_len; 556 err = xattr->val_len;
555 if (size == 0) 557 if (size == 0)
556 goto out; 558 goto out;
557 559
558 memcpy(value, xattr->val, xattr->val_len); 560 memcpy(value, xattr->val, xattr->val_len);
559 561
560 out: 562 out:
561 spin_unlock(&ci->i_ceph_lock); 563 spin_unlock(&ci->i_ceph_lock);
562 return err; 564 return err;
563 } 565 }
564 566
565 ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) 567 ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
566 { 568 {
567 struct inode *inode = dentry->d_inode; 569 struct inode *inode = dentry->d_inode;
568 struct ceph_inode_info *ci = ceph_inode(inode); 570 struct ceph_inode_info *ci = ceph_inode(inode);
569 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); 571 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
570 u32 vir_namelen = 0; 572 u32 vir_namelen = 0;
571 u32 namelen; 573 u32 namelen;
572 int err; 574 int err;
573 u32 len; 575 u32 len;
574 int i; 576 int i;
575 577
576 spin_lock(&ci->i_ceph_lock); 578 spin_lock(&ci->i_ceph_lock);
577 dout("listxattr %p ver=%lld index_ver=%lld\n", inode, 579 dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
578 ci->i_xattrs.version, ci->i_xattrs.index_version); 580 ci->i_xattrs.version, ci->i_xattrs.index_version);
579 581
580 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && 582 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
581 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { 583 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
582 goto list_xattr; 584 goto list_xattr;
583 } else { 585 } else {
584 spin_unlock(&ci->i_ceph_lock); 586 spin_unlock(&ci->i_ceph_lock);
585 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); 587 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
586 if (err) 588 if (err)
587 return err; 589 return err;
588 } 590 }
589 591
590 spin_lock(&ci->i_ceph_lock); 592 spin_lock(&ci->i_ceph_lock);
591 593
592 err = __build_xattrs(inode); 594 err = __build_xattrs(inode);
593 if (err < 0) 595 if (err < 0)
594 goto out; 596 goto out;
595 597
596 list_xattr: 598 list_xattr:
597 vir_namelen = 0; 599 vir_namelen = 0;
598 /* include virtual dir xattrs */ 600 /* include virtual dir xattrs */
599 if (vxattrs) 601 if (vxattrs)
600 for (i = 0; vxattrs[i].name; i++) 602 for (i = 0; vxattrs[i].name; i++)
601 vir_namelen += strlen(vxattrs[i].name) + 1; 603 vir_namelen += strlen(vxattrs[i].name) + 1;
602 /* add 1 byte per name for its null termination */ 604 /* add 1 byte per name for its null termination */
603 namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count; 605 namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
604 err = -ERANGE; 606 err = -ERANGE;
605 if (size && namelen > size) 607 if (size && namelen > size)
606 goto out; 608 goto out;
607 609
608 err = namelen; 610 err = namelen;
609 if (size == 0) 611 if (size == 0)
610 goto out; 612 goto out;
611 613
612 names = __copy_xattr_names(ci, names); 614 names = __copy_xattr_names(ci, names);
613 615
614 /* virtual xattr names, too */ 616 /* virtual xattr names, too */
615 if (vxattrs) 617 if (vxattrs)
616 for (i = 0; vxattrs[i].name; i++) { 618 for (i = 0; vxattrs[i].name; i++) {
617 len = sprintf(names, "%s", vxattrs[i].name); 619 len = sprintf(names, "%s", vxattrs[i].name);
618 names += len + 1; 620 names += len + 1;
619 } 621 }
620 622
621 out: 623 out:
622 spin_unlock(&ci->i_ceph_lock); 624 spin_unlock(&ci->i_ceph_lock);
623 return err; 625 return err;
624 } 626 }
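/*
 * Aside (illustrative, not in the diff): as in other filesystems, the
 * size == 0 path above lets callers probe the required buffer length
 * first. A typical userspace pattern (the path is a placeholder):
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

static void list_names(const char *path)
{
	ssize_t len = listxattr(path, NULL, 0);	/* probe length only */

	if (len <= 0)
		return;

	char *names = malloc(len);
	if (!names)
		return;

	len = listxattr(path, names, len);
	/* names is a sequence of NUL-terminated strings */
	for (ssize_t off = 0; off < len; off += strlen(names + off) + 1)
		puts(names + off);

	free(names);
}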
625 627
626 static int ceph_sync_setxattr(struct dentry *dentry, const char *name, 628 static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
627 const char *value, size_t size, int flags) 629 const char *value, size_t size, int flags)
628 { 630 {
629 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 631 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
630 struct inode *inode = dentry->d_inode; 632 struct inode *inode = dentry->d_inode;
631 struct ceph_inode_info *ci = ceph_inode(inode); 633 struct ceph_inode_info *ci = ceph_inode(inode);
632 struct inode *parent_inode; 634 struct inode *parent_inode;
633 struct ceph_mds_request *req; 635 struct ceph_mds_request *req;
634 struct ceph_mds_client *mdsc = fsc->mdsc; 636 struct ceph_mds_client *mdsc = fsc->mdsc;
635 int err; 637 int err;
636 int i, nr_pages; 638 int i, nr_pages;
637 struct page **pages = NULL; 639 struct page **pages = NULL;
638 void *kaddr; 640 void *kaddr;
639 641
640 /* copy value into some pages */ 642 /* copy value into some pages */
641 nr_pages = calc_pages_for(0, size); 643 nr_pages = calc_pages_for(0, size);
642 if (nr_pages) { 644 if (nr_pages) {
643 pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS); 645 pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS);
644 if (!pages) 646 if (!pages)
645 return -ENOMEM; 647 return -ENOMEM;
646 err = -ENOMEM; 648 err = -ENOMEM;
647 for (i = 0; i < nr_pages; i++) { 649 for (i = 0; i < nr_pages; i++) {
648 pages[i] = __page_cache_alloc(GFP_NOFS); 650 pages[i] = __page_cache_alloc(GFP_NOFS);
649 if (!pages[i]) { 651 if (!pages[i]) {
650 nr_pages = i; 652 nr_pages = i;
651 goto out; 653 goto out;
652 } 654 }
653 kaddr = kmap(pages[i]); 655 kaddr = kmap(pages[i]);
654 memcpy(kaddr, value + i*PAGE_CACHE_SIZE, 656 memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
655 min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE)); 657 min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
656 } 658 }
657 } 659 }
658 660
659 dout("setxattr value=%.*s\n", (int)size, value); 661 dout("setxattr value=%.*s\n", (int)size, value);
660 662
661 /* do request */ 663 /* do request */
662 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, 664 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
663 USE_AUTH_MDS); 665 USE_AUTH_MDS);
664 if (IS_ERR(req)) { 666 if (IS_ERR(req)) {
665 err = PTR_ERR(req); 667 err = PTR_ERR(req);
666 goto out; 668 goto out;
667 } 669 }
668 req->r_inode = inode; 670 req->r_inode = inode;
669 ihold(inode); 671 ihold(inode);
670 req->r_inode_drop = CEPH_CAP_XATTR_SHARED; 672 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
671 req->r_num_caps = 1; 673 req->r_num_caps = 1;
672 req->r_args.setxattr.flags = cpu_to_le32(flags); 674 req->r_args.setxattr.flags = cpu_to_le32(flags);
673 req->r_path2 = kstrdup(name, GFP_NOFS); 675 req->r_path2 = kstrdup(name, GFP_NOFS);
674 676
675 req->r_pages = pages; 677 req->r_pages = pages;
676 req->r_num_pages = nr_pages; 678 req->r_num_pages = nr_pages;
677 req->r_data_len = size; 679 req->r_data_len = size;
678 680
679 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); 681 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
680 parent_inode = ceph_get_dentry_parent_inode(dentry); 682 parent_inode = ceph_get_dentry_parent_inode(dentry);
681 err = ceph_mdsc_do_request(mdsc, parent_inode, req); 683 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
682 iput(parent_inode); 684 iput(parent_inode);
683 ceph_mdsc_put_request(req); 685 ceph_mdsc_put_request(req);
684 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); 686 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
685 687
686 out: 688 out:
687 if (pages) { 689 if (pages) {
688 for (i = 0; i < nr_pages; i++) 690 for (i = 0; i < nr_pages; i++)
689 __free_page(pages[i]); 691 __free_page(pages[i]);
690 kfree(pages); 692 kfree(pages);
691 } 693 }
692 return err; 694 return err;
693 } 695 }
694 696
695 int ceph_setxattr(struct dentry *dentry, const char *name, 697 int ceph_setxattr(struct dentry *dentry, const char *name,
696 const void *value, size_t size, int flags) 698 const void *value, size_t size, int flags)
697 { 699 {
698 struct inode *inode = dentry->d_inode; 700 struct inode *inode = dentry->d_inode;
699 struct ceph_inode_info *ci = ceph_inode(inode); 701 struct ceph_inode_info *ci = ceph_inode(inode);
700 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); 702 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
701 int err; 703 int err;
702 int name_len = strlen(name); 704 int name_len = strlen(name);
703 int val_len = size; 705 int val_len = size;
704 char *newname = NULL; 706 char *newname = NULL;
705 char *newval = NULL; 707 char *newval = NULL;
706 struct ceph_inode_xattr *xattr = NULL; 708 struct ceph_inode_xattr *xattr = NULL;
707 int issued; 709 int issued;
708 int required_blob_size; 710 int required_blob_size;
709 int dirty; 711 int dirty;
710 712
711 if (ceph_snap(inode) != CEPH_NOSNAP) 713 if (ceph_snap(inode) != CEPH_NOSNAP)
712 return -EROFS; 714 return -EROFS;
713 715
714 if (!ceph_is_valid_xattr(name)) 716 if (!ceph_is_valid_xattr(name))
715 return -EOPNOTSUPP; 717 return -EOPNOTSUPP;
716 718
717 if (vxattrs) { 719 if (vxattrs) {
718 struct ceph_vxattr_cb *vxattr = 720 struct ceph_vxattr_cb *vxattr =
719 ceph_match_vxattr(vxattrs, name); 721 ceph_match_vxattr(vxattrs, name);
720 if (vxattr && vxattr->readonly) 722 if (vxattr && vxattr->readonly)
721 return -EOPNOTSUPP; 723 return -EOPNOTSUPP;
722 } 724 }
723 725
724 /* preallocate memory for xattr name, value, index node */ 726 /* preallocate memory for xattr name, value, index node */
725 err = -ENOMEM; 727 err = -ENOMEM;
726 newname = kmemdup(name, name_len + 1, GFP_NOFS); 728 newname = kmemdup(name, name_len + 1, GFP_NOFS);
727 if (!newname) 729 if (!newname)
728 goto out; 730 goto out;
729 731
730 if (val_len) { 732 if (val_len) {
731 newval = kmalloc(val_len + 1, GFP_NOFS); 733 newval = kmalloc(val_len + 1, GFP_NOFS);
732 if (!newval) 734 if (!newval)
733 goto out; 735 goto out;
734 memcpy(newval, value, val_len); 736 memcpy(newval, value, val_len);
735 newval[val_len] = '\0'; 737 newval[val_len] = '\0';
736 } 738 }
737 739
738 xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS); 740 xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
739 if (!xattr) 741 if (!xattr)
740 goto out; 742 goto out;
741 743
742 spin_lock(&ci->i_ceph_lock); 744 spin_lock(&ci->i_ceph_lock);
743 retry: 745 retry:
744 issued = __ceph_caps_issued(ci, NULL); 746 issued = __ceph_caps_issued(ci, NULL);
745 if (!(issued & CEPH_CAP_XATTR_EXCL)) 747 if (!(issued & CEPH_CAP_XATTR_EXCL))
746 goto do_sync; 748 goto do_sync;
747 __build_xattrs(inode); 749 __build_xattrs(inode);
748 750
749 required_blob_size = __get_required_blob_size(ci, name_len, val_len); 751 required_blob_size = __get_required_blob_size(ci, name_len, val_len);
750 752
751 if (!ci->i_xattrs.prealloc_blob || 753 if (!ci->i_xattrs.prealloc_blob ||
752 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { 754 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
753 struct ceph_buffer *blob = NULL; 755 struct ceph_buffer *blob = NULL;
754 756
755 spin_unlock(&ci->i_ceph_lock); 757 spin_unlock(&ci->i_ceph_lock);
756 dout(" preaallocating new blob size=%d\n", required_blob_size); 758 dout(" preaallocating new blob size=%d\n", required_blob_size);
757 blob = ceph_buffer_new(required_blob_size, GFP_NOFS); 759 blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
758 if (!blob) 760 if (!blob)
759 goto out; 761 goto out;
760 spin_lock(&ci->i_ceph_lock); 762 spin_lock(&ci->i_ceph_lock);
761 if (ci->i_xattrs.prealloc_blob) 763 if (ci->i_xattrs.prealloc_blob)
762 ceph_buffer_put(ci->i_xattrs.prealloc_blob); 764 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
763 ci->i_xattrs.prealloc_blob = blob; 765 ci->i_xattrs.prealloc_blob = blob;
764 goto retry; 766 goto retry;
765 } 767 }
766 768
767 dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); 769 dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
768 err = __set_xattr(ci, newname, name_len, newval, 770 err = __set_xattr(ci, newname, name_len, newval,
769 val_len, 1, 1, 1, &xattr); 771 val_len, 1, 1, 1, &xattr);
770 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); 772 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
771 ci->i_xattrs.dirty = true; 773 ci->i_xattrs.dirty = true;
772 inode->i_ctime = CURRENT_TIME; 774 inode->i_ctime = CURRENT_TIME;
773 spin_unlock(&ci->i_ceph_lock); 775 spin_unlock(&ci->i_ceph_lock);
774 if (dirty) 776 if (dirty)
775 __mark_inode_dirty(inode, dirty); 777 __mark_inode_dirty(inode, dirty);
776 return err; 778 return err;
777 779
778 do_sync: 780 do_sync:
779 spin_unlock(&ci->i_ceph_lock); 781 spin_unlock(&ci->i_ceph_lock);
780 err = ceph_sync_setxattr(dentry, name, value, size, flags); 782 err = ceph_sync_setxattr(dentry, name, value, size, flags);
781 out: 783 out:
782 kfree(newname); 784 kfree(newname);
783 kfree(newval); 785 kfree(newval);
784 kfree(xattr); 786 kfree(xattr);
785 return err; 787 return err;
786 } 788 }
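/*
 * Aside (illustrative, not in the diff): the flags forwarded to the MDS
 * by ceph_sync_setxattr() are the standard setxattr(2) flags. A sketch
 * with a placeholder path:
 */
#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
	/* XATTR_CREATE: fail with EEXIST if the attribute already exists */
	if (setxattr("/mnt/ceph/file", "user.comment", "hello", 5,
		     XATTR_CREATE) < 0)
		perror("setxattr");
	return 0;
}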
787 789
788 static int ceph_send_removexattr(struct dentry *dentry, const char *name) 790 static int ceph_send_removexattr(struct dentry *dentry, const char *name)
789 { 791 {
790 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 792 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
791 struct ceph_mds_client *mdsc = fsc->mdsc; 793 struct ceph_mds_client *mdsc = fsc->mdsc;
792 struct inode *inode = dentry->d_inode; 794 struct inode *inode = dentry->d_inode;
793 struct inode *parent_inode; 795 struct inode *parent_inode;
794 struct ceph_mds_request *req; 796 struct ceph_mds_request *req;
795 int err; 797 int err;
796 798
797 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR, 799 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR,
798 USE_AUTH_MDS); 800 USE_AUTH_MDS);
799 if (IS_ERR(req)) 801 if (IS_ERR(req))
800 return PTR_ERR(req); 802 return PTR_ERR(req);
801 req->r_inode = inode; 803 req->r_inode = inode;
802 ihold(inode); 804 ihold(inode);
803 req->r_inode_drop = CEPH_CAP_XATTR_SHARED; 805 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
804 req->r_num_caps = 1; 806 req->r_num_caps = 1;
805 req->r_path2 = kstrdup(name, GFP_NOFS); 807 req->r_path2 = kstrdup(name, GFP_NOFS);
806 808
807 parent_inode = ceph_get_dentry_parent_inode(dentry); 809 parent_inode = ceph_get_dentry_parent_inode(dentry);
808 err = ceph_mdsc_do_request(mdsc, parent_inode, req); 810 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
809 iput(parent_inode); 811 iput(parent_inode);
810 ceph_mdsc_put_request(req); 812 ceph_mdsc_put_request(req);
811 return err; 813 return err;
812 } 814 }
813 815
814 int ceph_removexattr(struct dentry *dentry, const char *name) 816 int ceph_removexattr(struct dentry *dentry, const char *name)
815 { 817 {
816 struct inode *inode = dentry->d_inode; 818 struct inode *inode = dentry->d_inode;
817 struct ceph_inode_info *ci = ceph_inode(inode); 819 struct ceph_inode_info *ci = ceph_inode(inode);
818 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); 820 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
819 int issued; 821 int issued;
820 int err; 822 int err;
821 int required_blob_size; 823 int required_blob_size;
822 int dirty; 824 int dirty;
823 825
824 if (ceph_snap(inode) != CEPH_NOSNAP) 826 if (ceph_snap(inode) != CEPH_NOSNAP)
825 return -EROFS; 827 return -EROFS;
826 828
827 if (!ceph_is_valid_xattr(name)) 829 if (!ceph_is_valid_xattr(name))
828 return -EOPNOTSUPP; 830 return -EOPNOTSUPP;
829 831
830 if (vxattrs) { 832 if (vxattrs) {
831 struct ceph_vxattr_cb *vxattr = 833 struct ceph_vxattr_cb *vxattr =
832 ceph_match_vxattr(vxattrs, name); 834 ceph_match_vxattr(vxattrs, name);
833 if (vxattr && vxattr->readonly) 835 if (vxattr && vxattr->readonly)
834 return -EOPNOTSUPP; 836 return -EOPNOTSUPP;
835 } 837 }
836 838
837 err = -ENOMEM; 839 err = -ENOMEM;
838 spin_lock(&ci->i_ceph_lock); 840 spin_lock(&ci->i_ceph_lock);
839 __build_xattrs(inode); 841 __build_xattrs(inode);
840 retry: 842 retry:
841 issued = __ceph_caps_issued(ci, NULL); 843 issued = __ceph_caps_issued(ci, NULL);
842 dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); 844 dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
843 845
844 if (!(issued & CEPH_CAP_XATTR_EXCL)) 846 if (!(issued & CEPH_CAP_XATTR_EXCL))
845 goto do_sync; 847 goto do_sync;
846 848
847 required_blob_size = __get_required_blob_size(ci, 0, 0); 849 required_blob_size = __get_required_blob_size(ci, 0, 0);
848 850
849 if (!ci->i_xattrs.prealloc_blob || 851 if (!ci->i_xattrs.prealloc_blob ||
850 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { 852 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
851 struct ceph_buffer *blob; 853 struct ceph_buffer *blob;
852 854
853 spin_unlock(&ci->i_ceph_lock); 855 spin_unlock(&ci->i_ceph_lock);
854 dout(" preaallocating new blob size=%d\n", required_blob_size); 856 dout(" preaallocating new blob size=%d\n", required_blob_size);
855 blob = ceph_buffer_new(required_blob_size, GFP_NOFS); 857 blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
856 if (!blob) 858 if (!blob)
857 goto out; 859 goto out;
858 spin_lock(&ci->i_ceph_lock); 860 spin_lock(&ci->i_ceph_lock);
859 if (ci->i_xattrs.prealloc_blob) 861 if (ci->i_xattrs.prealloc_blob)
860 ceph_buffer_put(ci->i_xattrs.prealloc_blob); 862 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
861 ci->i_xattrs.prealloc_blob = blob; 863 ci->i_xattrs.prealloc_blob = blob;
862 goto retry; 864 goto retry;
863 } 865 }
864 866
865 err = __remove_xattr_by_name(ceph_inode(inode), name); 867 err = __remove_xattr_by_name(ceph_inode(inode), name);
866 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); 868 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
867 ci->i_xattrs.dirty = true; 869 ci->i_xattrs.dirty = true;
868 inode->i_ctime = CURRENT_TIME; 870 inode->i_ctime = CURRENT_TIME;
869 871
870 spin_unlock(&ci->i_ceph_lock); 872 spin_unlock(&ci->i_ceph_lock);
871 if (dirty) 873 if (dirty)
872 __mark_inode_dirty(inode, dirty); 874 __mark_inode_dirty(inode, dirty);
873 return err; 875 return err;
874 do_sync: 876 do_sync:
875 spin_unlock(&ci->i_ceph_lock); 877 spin_unlock(&ci->i_ceph_lock);
876 err = ceph_send_removexattr(dentry, name); 878 err = ceph_send_removexattr(dentry, name);
877 out: 879 out:
878 return err; 880 return err;
879 } 881 }
880 882
881 883
net/ceph/ceph_common.c
1 1
2 #include <linux/ceph/ceph_debug.h> 2 #include <linux/ceph/ceph_debug.h>
3 #include <linux/backing-dev.h> 3 #include <linux/backing-dev.h>
4 #include <linux/ctype.h> 4 #include <linux/ctype.h>
5 #include <linux/fs.h> 5 #include <linux/fs.h>
6 #include <linux/inet.h> 6 #include <linux/inet.h>
7 #include <linux/in6.h> 7 #include <linux/in6.h>
8 #include <linux/key.h> 8 #include <linux/key.h>
9 #include <keys/ceph-type.h> 9 #include <keys/ceph-type.h>
10 #include <linux/module.h> 10 #include <linux/module.h>
11 #include <linux/mount.h> 11 #include <linux/mount.h>
12 #include <linux/parser.h> 12 #include <linux/parser.h>
13 #include <linux/sched.h> 13 #include <linux/sched.h>
14 #include <linux/seq_file.h> 14 #include <linux/seq_file.h>
15 #include <linux/slab.h> 15 #include <linux/slab.h>
16 #include <linux/statfs.h> 16 #include <linux/statfs.h>
17 #include <linux/string.h> 17 #include <linux/string.h>
18 18
19 19
20 #include <linux/ceph/libceph.h> 20 #include <linux/ceph/libceph.h>
21 #include <linux/ceph/debugfs.h> 21 #include <linux/ceph/debugfs.h>
22 #include <linux/ceph/decode.h> 22 #include <linux/ceph/decode.h>
23 #include <linux/ceph/mon_client.h> 23 #include <linux/ceph/mon_client.h>
24 #include <linux/ceph/auth.h> 24 #include <linux/ceph/auth.h>
25 #include "crypto.h" 25 #include "crypto.h"
26 26
27 27
28 28
29 /* 29 /*
30 * find filename portion of a path (/foo/bar/baz -> baz) 30 * find filename portion of a path (/foo/bar/baz -> baz)
31 */ 31 */
32 const char *ceph_file_part(const char *s, int len) 32 const char *ceph_file_part(const char *s, int len)
33 { 33 {
34 const char *e = s + len; 34 const char *e = s + len;
35 35
36 while (e != s && *(e-1) != '/') 36 while (e != s && *(e-1) != '/')
37 e--; 37 e--;
38 return e; 38 return e;
39 } 39 }
40 EXPORT_SYMBOL(ceph_file_part); 40 EXPORT_SYMBOL(ceph_file_part);
41 41
42 const char *ceph_msg_type_name(int type) 42 const char *ceph_msg_type_name(int type)
43 { 43 {
44 switch (type) { 44 switch (type) {
45 case CEPH_MSG_SHUTDOWN: return "shutdown"; 45 case CEPH_MSG_SHUTDOWN: return "shutdown";
46 case CEPH_MSG_PING: return "ping"; 46 case CEPH_MSG_PING: return "ping";
47 case CEPH_MSG_AUTH: return "auth"; 47 case CEPH_MSG_AUTH: return "auth";
48 case CEPH_MSG_AUTH_REPLY: return "auth_reply"; 48 case CEPH_MSG_AUTH_REPLY: return "auth_reply";
49 case CEPH_MSG_MON_MAP: return "mon_map"; 49 case CEPH_MSG_MON_MAP: return "mon_map";
50 case CEPH_MSG_MON_GET_MAP: return "mon_get_map"; 50 case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
51 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe"; 51 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
52 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack"; 52 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
53 case CEPH_MSG_STATFS: return "statfs"; 53 case CEPH_MSG_STATFS: return "statfs";
54 case CEPH_MSG_STATFS_REPLY: return "statfs_reply"; 54 case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
55 case CEPH_MSG_MDS_MAP: return "mds_map"; 55 case CEPH_MSG_MDS_MAP: return "mds_map";
56 case CEPH_MSG_CLIENT_SESSION: return "client_session"; 56 case CEPH_MSG_CLIENT_SESSION: return "client_session";
57 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect"; 57 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
58 case CEPH_MSG_CLIENT_REQUEST: return "client_request"; 58 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
59 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward"; 59 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
60 case CEPH_MSG_CLIENT_REPLY: return "client_reply"; 60 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
61 case CEPH_MSG_CLIENT_CAPS: return "client_caps"; 61 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
62 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release"; 62 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
63 case CEPH_MSG_CLIENT_SNAP: return "client_snap"; 63 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
64 case CEPH_MSG_CLIENT_LEASE: return "client_lease"; 64 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
65 case CEPH_MSG_OSD_MAP: return "osd_map"; 65 case CEPH_MSG_OSD_MAP: return "osd_map";
66 case CEPH_MSG_OSD_OP: return "osd_op"; 66 case CEPH_MSG_OSD_OP: return "osd_op";
67 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; 67 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
68 case CEPH_MSG_WATCH_NOTIFY: return "watch_notify"; 68 case CEPH_MSG_WATCH_NOTIFY: return "watch_notify";
69 default: return "unknown"; 69 default: return "unknown";
70 } 70 }
71 } 71 }
72 EXPORT_SYMBOL(ceph_msg_type_name); 72 EXPORT_SYMBOL(ceph_msg_type_name);
73 73
74 /* 74 /*
75 * Initially learn our fsid, or verify an fsid matches. 75 * Initially learn our fsid, or verify an fsid matches.
76 */ 76 */
77 int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) 77 int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
78 { 78 {
79 if (client->have_fsid) { 79 if (client->have_fsid) {
80 if (ceph_fsid_compare(&client->fsid, fsid)) { 80 if (ceph_fsid_compare(&client->fsid, fsid)) {
81 pr_err("bad fsid, had %pU got %pU", 81 pr_err("bad fsid, had %pU got %pU",
82 &client->fsid, fsid); 82 &client->fsid, fsid);
83 return -1; 83 return -1;
84 } 84 }
85 } else { 85 } else {
86 pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid); 86 pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid);
87 memcpy(&client->fsid, fsid, sizeof(*fsid)); 87 memcpy(&client->fsid, fsid, sizeof(*fsid));
88 ceph_debugfs_client_init(client);
89 client->have_fsid = true;
90 } 88 }
91 return 0; 89 return 0;
92 } 90 }
93 EXPORT_SYMBOL(ceph_check_fsid); 91 EXPORT_SYMBOL(ceph_check_fsid);
94 92
95 static int strcmp_null(const char *s1, const char *s2) 93 static int strcmp_null(const char *s1, const char *s2)
96 { 94 {
97 if (!s1 && !s2) 95 if (!s1 && !s2)
98 return 0; 96 return 0;
99 if (s1 && !s2) 97 if (s1 && !s2)
100 return -1; 98 return -1;
101 if (!s1 && s2) 99 if (!s1 && s2)
102 return 1; 100 return 1;
103 return strcmp(s1, s2); 101 return strcmp(s1, s2);
104 } 102 }
105 103
106 int ceph_compare_options(struct ceph_options *new_opt, 104 int ceph_compare_options(struct ceph_options *new_opt,
107 struct ceph_client *client) 105 struct ceph_client *client)
108 { 106 {
109 struct ceph_options *opt1 = new_opt; 107 struct ceph_options *opt1 = new_opt;
110 struct ceph_options *opt2 = client->options; 108 struct ceph_options *opt2 = client->options;
111 int ofs = offsetof(struct ceph_options, mon_addr); 109 int ofs = offsetof(struct ceph_options, mon_addr);
112 int i; 110 int i;
113 int ret; 111 int ret;
114 112
115 ret = memcmp(opt1, opt2, ofs); 113 ret = memcmp(opt1, opt2, ofs);
116 if (ret) 114 if (ret)
117 return ret; 115 return ret;
118 116
119 ret = strcmp_null(opt1->name, opt2->name); 117 ret = strcmp_null(opt1->name, opt2->name);
120 if (ret) 118 if (ret)
121 return ret; 119 return ret;
122 120
123 if (opt1->key && !opt2->key) 121 if (opt1->key && !opt2->key)
124 return -1; 122 return -1;
125 if (!opt1->key && opt2->key) 123 if (!opt1->key && opt2->key)
126 return 1; 124 return 1;
127 if (opt1->key && opt2->key) { 125 if (opt1->key && opt2->key) {
128 if (opt1->key->type != opt2->key->type) 126 if (opt1->key->type != opt2->key->type)
129 return -1; 127 return -1;
130 if (opt1->key->created.tv_sec != opt2->key->created.tv_sec) 128 if (opt1->key->created.tv_sec != opt2->key->created.tv_sec)
131 return -1; 129 return -1;
132 if (opt1->key->created.tv_nsec != opt2->key->created.tv_nsec) 130 if (opt1->key->created.tv_nsec != opt2->key->created.tv_nsec)
133 return -1; 131 return -1;
134 if (opt1->key->len != opt2->key->len) 132 if (opt1->key->len != opt2->key->len)
135 return -1; 133 return -1;
136 if (opt1->key->key && !opt2->key->key) 134 if (opt1->key->key && !opt2->key->key)
137 return -1; 135 return -1;
138 if (!opt1->key->key && opt2->key->key) 136 if (!opt1->key->key && opt2->key->key)
139 return 1; 137 return 1;
140 if (opt1->key->key && opt2->key->key) { 138 if (opt1->key->key && opt2->key->key) {
141 ret = memcmp(opt1->key->key, opt2->key->key, opt1->key->len); 139 ret = memcmp(opt1->key->key, opt2->key->key, opt1->key->len);
142 if (ret) 140 if (ret)
143 return ret; 141 return ret;
144 } 142 }
145 } 143 }
146 144
147 /* any matching mon ip implies a match */ 145 /* any matching mon ip implies a match */
148 for (i = 0; i < opt1->num_mon; i++) { 146 for (i = 0; i < opt1->num_mon; i++) {
149 if (ceph_monmap_contains(client->monc.monmap, 147 if (ceph_monmap_contains(client->monc.monmap,
150 &opt1->mon_addr[i])) 148 &opt1->mon_addr[i]))
151 return 0; 149 return 0;
152 } 150 }
153 return -1; 151 return -1;
154 } 152 }
155 EXPORT_SYMBOL(ceph_compare_options); 153 EXPORT_SYMBOL(ceph_compare_options);
156 154
157 155
158 static int parse_fsid(const char *str, struct ceph_fsid *fsid) 156 static int parse_fsid(const char *str, struct ceph_fsid *fsid)
159 { 157 {
160 int i = 0; 158 int i = 0;
161 char tmp[3]; 159 char tmp[3];
162 int err = -EINVAL; 160 int err = -EINVAL;
163 int d; 161 int d;
164 162
165 dout("parse_fsid '%s'\n", str); 163 dout("parse_fsid '%s'\n", str);
166 tmp[2] = 0; 164 tmp[2] = 0;
167 while (*str && i < 16) { 165 while (*str && i < 16) {
168 if (ispunct(*str)) { 166 if (ispunct(*str)) {
169 str++; 167 str++;
170 continue; 168 continue;
171 } 169 }
172 if (!isxdigit(str[0]) || !isxdigit(str[1])) 170 if (!isxdigit(str[0]) || !isxdigit(str[1]))
173 break; 171 break;
174 tmp[0] = str[0]; 172 tmp[0] = str[0];
175 tmp[1] = str[1]; 173 tmp[1] = str[1];
176 if (sscanf(tmp, "%x", &d) < 1) 174 if (sscanf(tmp, "%x", &d) < 1)
177 break; 175 break;
178 fsid->fsid[i] = d & 0xff; 176 fsid->fsid[i] = d & 0xff;
179 i++; 177 i++;
180 str += 2; 178 str += 2;
181 } 179 }
182 180
183 if (i == 16) 181 if (i == 16)
184 err = 0; 182 err = 0;
185 dout("parse_fsid ret %d got fsid %pU", err, fsid); 183 dout("parse_fsid ret %d got fsid %pU", err, fsid);
186 return err; 184 return err;
187 } 185 }
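/*
 * Example (illustrative, not in the diff): punctuation is skipped, so
 * the canonical UUID form "fsid=12345678-89ab-cdef-0123-456789abcdef"
 * and the same 32 hex digits without dashes both decode to the same
 * 16 raw bytes; anything short of 16 parsed bytes returns -EINVAL.
 */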
188 186
189 /* 187 /*
190 * ceph options 188 * ceph options
191 */ 189 */
192 enum { 190 enum {
193 Opt_osdtimeout, 191 Opt_osdtimeout,
194 Opt_osdkeepalivetimeout, 192 Opt_osdkeepalivetimeout,
195 Opt_mount_timeout, 193 Opt_mount_timeout,
196 Opt_osd_idle_ttl, 194 Opt_osd_idle_ttl,
197 Opt_last_int, 195 Opt_last_int,
198 /* int args above */ 196 /* int args above */
199 Opt_fsid, 197 Opt_fsid,
200 Opt_name, 198 Opt_name,
201 Opt_secret, 199 Opt_secret,
202 Opt_key, 200 Opt_key,
203 Opt_ip, 201 Opt_ip,
204 Opt_last_string, 202 Opt_last_string,
205 /* string args above */ 203 /* string args above */
206 Opt_noshare, 204 Opt_noshare,
207 Opt_nocrc, 205 Opt_nocrc,
208 }; 206 };
209 207
210 static match_table_t opt_tokens = { 208 static match_table_t opt_tokens = {
211 {Opt_osdtimeout, "osdtimeout=%d"}, 209 {Opt_osdtimeout, "osdtimeout=%d"},
212 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"}, 210 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
213 {Opt_mount_timeout, "mount_timeout=%d"}, 211 {Opt_mount_timeout, "mount_timeout=%d"},
214 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, 212 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
215 /* int args above */ 213 /* int args above */
216 {Opt_fsid, "fsid=%s"}, 214 {Opt_fsid, "fsid=%s"},
217 {Opt_name, "name=%s"}, 215 {Opt_name, "name=%s"},
218 {Opt_secret, "secret=%s"}, 216 {Opt_secret, "secret=%s"},
219 {Opt_key, "key=%s"}, 217 {Opt_key, "key=%s"},
220 {Opt_ip, "ip=%s"}, 218 {Opt_ip, "ip=%s"},
221 /* string args above */ 219 /* string args above */
222 {Opt_noshare, "noshare"}, 220 {Opt_noshare, "noshare"},
223 {Opt_nocrc, "nocrc"}, 221 {Opt_nocrc, "nocrc"},
224 {-1, NULL} 222 {-1, NULL}
225 }; 223 };
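/*
 * Example (illustrative): given the token table above, an option string
 * such as
 *
 *	"name=admin,osdtimeout=60,noshare"
 *
 * is split on ',' and matched token by token; tokens the table does not
 * recognize fall through to the caller's parse_extra_token() hook.
 */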
226 224
227 void ceph_destroy_options(struct ceph_options *opt) 225 void ceph_destroy_options(struct ceph_options *opt)
228 { 226 {
229 dout("destroy_options %p\n", opt); 227 dout("destroy_options %p\n", opt);
230 kfree(opt->name); 228 kfree(opt->name);
231 if (opt->key) { 229 if (opt->key) {
232 ceph_crypto_key_destroy(opt->key); 230 ceph_crypto_key_destroy(opt->key);
233 kfree(opt->key); 231 kfree(opt->key);
234 } 232 }
235 kfree(opt->mon_addr); 233 kfree(opt->mon_addr);
236 kfree(opt); 234 kfree(opt);
237 } 235 }
238 EXPORT_SYMBOL(ceph_destroy_options); 236 EXPORT_SYMBOL(ceph_destroy_options);
239 237
240 /* get secret from key store */ 238 /* get secret from key store */
241 static int get_secret(struct ceph_crypto_key *dst, const char *name) { 239 static int get_secret(struct ceph_crypto_key *dst, const char *name) {
242 struct key *ukey; 240 struct key *ukey;
243 int key_err; 241 int key_err;
244 int err = 0; 242 int err = 0;
245 struct ceph_crypto_key *ckey; 243 struct ceph_crypto_key *ckey;
246 244
247 ukey = request_key(&key_type_ceph, name, NULL); 245 ukey = request_key(&key_type_ceph, name, NULL);
248 if (!ukey || IS_ERR(ukey)) { 246 if (!ukey || IS_ERR(ukey)) {
249 /* request_key errors don't map nicely to mount(2) 247 /* request_key errors don't map nicely to mount(2)
250 errors; don't even try, but still printk */ 248 errors; don't even try, but still printk */
251 key_err = PTR_ERR(ukey); 249 key_err = PTR_ERR(ukey);
252 switch (key_err) { 250 switch (key_err) {
253 case -ENOKEY: 251 case -ENOKEY:
254 pr_warning("ceph: Mount failed due to key not found: %s\n", name); 252 pr_warning("ceph: Mount failed due to key not found: %s\n", name);
255 break; 253 break;
256 case -EKEYEXPIRED: 254 case -EKEYEXPIRED:
257 pr_warning("ceph: Mount failed due to expired key: %s\n", name); 255 pr_warning("ceph: Mount failed due to expired key: %s\n", name);
258 break; 256 break;
259 case -EKEYREVOKED: 257 case -EKEYREVOKED:
260 pr_warning("ceph: Mount failed due to revoked key: %s\n", name); 258 pr_warning("ceph: Mount failed due to revoked key: %s\n", name);
261 break; 259 break;
262 default: 260 default:
263 pr_warning("ceph: Mount failed due to unknown key error" 261 pr_warning("ceph: Mount failed due to unknown key error"
264 " %d: %s\n", key_err, name); 262 " %d: %s\n", key_err, name);
265 } 263 }
266 err = -EPERM; 264 err = -EPERM;
267 goto out; 265 goto out;
268 } 266 }
269 267
270 ckey = ukey->payload.data; 268 ckey = ukey->payload.data;
271 err = ceph_crypto_key_clone(dst, ckey); 269 err = ceph_crypto_key_clone(dst, ckey);
272 if (err) 270 if (err)
273 goto out_key; 271 goto out_key;
274 /* pass through, err is 0 */ 272 /* pass through, err is 0 */
275 273
276 out_key: 274 out_key:
277 key_put(ukey); 275 key_put(ukey);
278 out: 276 out:
279 return err; 277 return err;
280 } 278 }
281 279
282 int ceph_parse_options(struct ceph_options **popt, char *options, 280 int ceph_parse_options(struct ceph_options **popt, char *options,
283 const char *dev_name, const char *dev_name_end, 281 const char *dev_name, const char *dev_name_end,
284 int (*parse_extra_token)(char *c, void *private), 282 int (*parse_extra_token)(char *c, void *private),
285 void *private) 283 void *private)
286 { 284 {
287 struct ceph_options *opt; 285 struct ceph_options *opt;
288 const char *c; 286 const char *c;
289 int err = -ENOMEM; 287 int err = -ENOMEM;
290 substring_t argstr[MAX_OPT_ARGS]; 288 substring_t argstr[MAX_OPT_ARGS];
291 289
292 opt = kzalloc(sizeof(*opt), GFP_KERNEL); 290 opt = kzalloc(sizeof(*opt), GFP_KERNEL);
293 if (!opt) 291 if (!opt)
294 return err; 292 return err;
295 opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), 293 opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
296 GFP_KERNEL); 294 GFP_KERNEL);
297 if (!opt->mon_addr) 295 if (!opt->mon_addr)
298 goto out; 296 goto out;
299 297
300 dout("parse_options %p options '%s' dev_name '%s'\n", opt, options, 298 dout("parse_options %p options '%s' dev_name '%s'\n", opt, options,
301 dev_name); 299 dev_name);
302 300
303 /* start with defaults */ 301 /* start with defaults */
304 opt->flags = CEPH_OPT_DEFAULT; 302 opt->flags = CEPH_OPT_DEFAULT;
305 opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; 303 opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
306 opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; 304 opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
307 opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ 305 opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
308 opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ 306 opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
309 307
310 /* get mon ip(s) */ 308 /* get mon ip(s) */
311 /* ip1[:port1][,ip2[:port2]...] */ 309 /* ip1[:port1][,ip2[:port2]...] */
312 err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr, 310 err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr,
313 CEPH_MAX_MON, &opt->num_mon); 311 CEPH_MAX_MON, &opt->num_mon);
314 if (err < 0) 312 if (err < 0)
315 goto out; 313 goto out;
316 314
317 /* parse mount options */ 315 /* parse mount options */
318 while ((c = strsep(&options, ",")) != NULL) { 316 while ((c = strsep(&options, ",")) != NULL) {
319 int token, intval, ret; 317 int token, intval, ret;
320 if (!*c) 318 if (!*c)
321 continue; 319 continue;
322 err = -EINVAL; 320 err = -EINVAL;
323 token = match_token((char *)c, opt_tokens, argstr); 321 token = match_token((char *)c, opt_tokens, argstr);
324 if (token < 0 && parse_extra_token) { 322 if (token < 0 && parse_extra_token) {
325 /* extra? */ 323 /* extra? */
326 err = parse_extra_token((char *)c, private); 324 err = parse_extra_token((char *)c, private);
327 if (err < 0) { 325 if (err < 0) {
328 pr_err("bad option at '%s'\n", c); 326 pr_err("bad option at '%s'\n", c);
329 goto out; 327 goto out;
330 } 328 }
331 continue; 329 continue;
332 } 330 }
333 if (token < Opt_last_int) { 331 if (token < Opt_last_int) {
334 ret = match_int(&argstr[0], &intval); 332 ret = match_int(&argstr[0], &intval);
335 if (ret < 0) { 333 if (ret < 0) {
336 pr_err("bad mount option arg (not int) " 334 pr_err("bad mount option arg (not int) "
337 "at '%s'\n", c); 335 "at '%s'\n", c);
338 continue; 336 continue;
339 } 337 }
340 dout("got int token %d val %d\n", token, intval); 338 dout("got int token %d val %d\n", token, intval);
341 } else if (token > Opt_last_int && token < Opt_last_string) { 339 } else if (token > Opt_last_int && token < Opt_last_string) {
342 dout("got string token %d val %s\n", token, 340 dout("got string token %d val %s\n", token,
343 argstr[0].from); 341 argstr[0].from);
344 } else { 342 } else {
345 dout("got token %d\n", token); 343 dout("got token %d\n", token);
346 } 344 }
347 switch (token) { 345 switch (token) {
348 case Opt_ip: 346 case Opt_ip:
349 err = ceph_parse_ips(argstr[0].from, 347 err = ceph_parse_ips(argstr[0].from,
350 argstr[0].to, 348 argstr[0].to,
351 &opt->my_addr, 349 &opt->my_addr,
352 1, NULL); 350 1, NULL);
353 if (err < 0) 351 if (err < 0)
354 goto out; 352 goto out;
355 opt->flags |= CEPH_OPT_MYIP; 353 opt->flags |= CEPH_OPT_MYIP;
356 break; 354 break;
357 355
358 case Opt_fsid: 356 case Opt_fsid:
359 err = parse_fsid(argstr[0].from, &opt->fsid); 357 err = parse_fsid(argstr[0].from, &opt->fsid);
360 if (err == 0) 358 if (err == 0)
361 opt->flags |= CEPH_OPT_FSID; 359 opt->flags |= CEPH_OPT_FSID;
362 break; 360 break;
363 case Opt_name: 361 case Opt_name:
364 opt->name = kstrndup(argstr[0].from, 362 opt->name = kstrndup(argstr[0].from,
365 argstr[0].to-argstr[0].from, 363 argstr[0].to-argstr[0].from,
366 GFP_KERNEL); 364 GFP_KERNEL);
367 break; 365 break;
368 case Opt_secret: 366 case Opt_secret:
369 opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); 367 opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL);
370 if (!opt->key) { 368 if (!opt->key) {
371 err = -ENOMEM; 369 err = -ENOMEM;
372 goto out; 370 goto out;
373 } 371 }
374 err = ceph_crypto_key_unarmor(opt->key, argstr[0].from); 372 err = ceph_crypto_key_unarmor(opt->key, argstr[0].from);
375 if (err < 0) 373 if (err < 0)
376 goto out; 374 goto out;
377 break; 375 break;
378 case Opt_key: 376 case Opt_key:
379 opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); 377 opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL);
380 if (!opt->key) { 378 if (!opt->key) {
381 err = -ENOMEM; 379 err = -ENOMEM;
382 goto out; 380 goto out;
383 } 381 }
384 err = get_secret(opt->key, argstr[0].from); 382 err = get_secret(opt->key, argstr[0].from);
385 if (err < 0) 383 if (err < 0)
386 goto out; 384 goto out;
387 break; 385 break;
388 386
389 /* misc */ 387 /* misc */
390 case Opt_osdtimeout: 388 case Opt_osdtimeout:
391 opt->osd_timeout = intval; 389 opt->osd_timeout = intval;
392 break; 390 break;
393 case Opt_osdkeepalivetimeout: 391 case Opt_osdkeepalivetimeout:
394 opt->osd_keepalive_timeout = intval; 392 opt->osd_keepalive_timeout = intval;
395 break; 393 break;
396 case Opt_osd_idle_ttl: 394 case Opt_osd_idle_ttl:
397 opt->osd_idle_ttl = intval; 395 opt->osd_idle_ttl = intval;
398 break; 396 break;
399 case Opt_mount_timeout: 397 case Opt_mount_timeout:
400 opt->mount_timeout = intval; 398 opt->mount_timeout = intval;
401 break; 399 break;
402 400
403 case Opt_noshare: 401 case Opt_noshare:
404 opt->flags |= CEPH_OPT_NOSHARE; 402 opt->flags |= CEPH_OPT_NOSHARE;
405 break; 403 break;
406 404
407 case Opt_nocrc: 405 case Opt_nocrc:
408 opt->flags |= CEPH_OPT_NOCRC; 406 opt->flags |= CEPH_OPT_NOCRC;
409 break; 407 break;
410 408
411 default: 409 default:
412 BUG_ON(token); 410 BUG_ON(token);
413 } 411 }
414 } 412 }
415 413
416 /* success */ 414 /* success */
417 *popt = opt; 415 *popt = opt;
418 return 0; 416 return 0;
419 417
420 out: 418 out:
421 ceph_destroy_options(opt); 419 ceph_destroy_options(opt);
422 return err; 420 return err;
423 } 421 }
424 EXPORT_SYMBOL(ceph_parse_options); 422 EXPORT_SYMBOL(ceph_parse_options);
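/*
 * Editor's note: a minimal, hypothetical sketch of driving the parser
 * above. The monitor address and option string are illustrative only,
 * and no extra-token callback is registered. On success the caller owns
 * the returned ceph_options and must release it with
 * ceph_destroy_options().
 */
#include <linux/string.h>
#include <linux/ceph/libceph.h>

static int example_parse_options(void)
{
	struct ceph_options *opt;
	char options[] = "name=admin,osdtimeout=60";	/* mutable: strsep() writes to it */
	const char *mons = "192.168.0.1:6789";		/* placeholder monitor address */
	int err;

	err = ceph_parse_options(&opt, options, mons, mons + strlen(mons),
				 NULL, NULL);		/* no extra-token hook */
	if (err < 0)
		return err;

	ceph_destroy_options(opt);
	return 0;
}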
425 423
426 u64 ceph_client_id(struct ceph_client *client) 424 u64 ceph_client_id(struct ceph_client *client)
427 { 425 {
428 return client->monc.auth->global_id; 426 return client->monc.auth->global_id;
429 } 427 }
430 EXPORT_SYMBOL(ceph_client_id); 428 EXPORT_SYMBOL(ceph_client_id);
431 429
432 /* 430 /*
433 * create a fresh client instance 431 * create a fresh client instance
434 */ 432 */
435 struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, 433 struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
436 unsigned supported_features, 434 unsigned supported_features,
437 unsigned required_features) 435 unsigned required_features)
438 { 436 {
439 struct ceph_client *client; 437 struct ceph_client *client;
440 struct ceph_entity_addr *myaddr = NULL; 438 struct ceph_entity_addr *myaddr = NULL;
441 int err = -ENOMEM; 439 int err = -ENOMEM;
442 440
443 client = kzalloc(sizeof(*client), GFP_KERNEL); 441 client = kzalloc(sizeof(*client), GFP_KERNEL);
444 if (client == NULL) 442 if (client == NULL)
445 return ERR_PTR(-ENOMEM); 443 return ERR_PTR(-ENOMEM);
446 444
447 client->private = private; 445 client->private = private;
448 client->options = opt; 446 client->options = opt;
449 447
450 mutex_init(&client->mount_mutex); 448 mutex_init(&client->mount_mutex);
451 init_waitqueue_head(&client->auth_wq); 449 init_waitqueue_head(&client->auth_wq);
452 client->auth_err = 0; 450 client->auth_err = 0;
453 451
454 client->extra_mon_dispatch = NULL; 452 client->extra_mon_dispatch = NULL;
455 client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT | 453 client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT |
456 supported_features; 454 supported_features;
457 client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT | 455 client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT |
458 required_features; 456 required_features;
459 457
460 /* msgr */ 458 /* msgr */
461 if (ceph_test_opt(client, MYIP)) 459 if (ceph_test_opt(client, MYIP))
462 myaddr = &client->options->my_addr; 460 myaddr = &client->options->my_addr;
463 client->msgr = ceph_messenger_create(myaddr, 461 client->msgr = ceph_messenger_create(myaddr,
464 client->supported_features, 462 client->supported_features,
465 client->required_features); 463 client->required_features);
466 if (IS_ERR(client->msgr)) { 464 if (IS_ERR(client->msgr)) {
467 err = PTR_ERR(client->msgr); 465 err = PTR_ERR(client->msgr);
468 goto fail; 466 goto fail;
469 } 467 }
470 client->msgr->nocrc = ceph_test_opt(client, NOCRC); 468 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
471 469
472 /* subsystems */ 470 /* subsystems */
473 err = ceph_monc_init(&client->monc, client); 471 err = ceph_monc_init(&client->monc, client);
474 if (err < 0) 472 if (err < 0)
475 goto fail_msgr; 473 goto fail_msgr;
476 err = ceph_osdc_init(&client->osdc, client); 474 err = ceph_osdc_init(&client->osdc, client);
477 if (err < 0) 475 if (err < 0)
478 goto fail_monc; 476 goto fail_monc;
479 477
480 return client; 478 return client;
481 479
482 fail_monc: 480 fail_monc:
483 ceph_monc_stop(&client->monc); 481 ceph_monc_stop(&client->monc);
484 fail_msgr: 482 fail_msgr:
485 ceph_messenger_destroy(client->msgr); 483 ceph_messenger_destroy(client->msgr);
486 fail: 484 fail:
487 kfree(client); 485 kfree(client);
488 return ERR_PTR(err); 486 return ERR_PTR(err);
489 } 487 }
490 EXPORT_SYMBOL(ceph_create_client); 488 EXPORT_SYMBOL(ceph_create_client);
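/*
 * Editor's note: hedged sketch of the ownership contract implied by the
 * error paths above. ceph_create_client() adopts 'opt' only on success
 * (ceph_destroy_client() frees it later); on failure the options remain
 * the caller's to destroy. The zero feature masks are placeholders.
 */
static struct ceph_client *example_create(struct ceph_options *opt)
{
	struct ceph_client *client;

	client = ceph_create_client(opt, NULL /* private */, 0, 0);
	if (IS_ERR(client))
		ceph_destroy_options(opt);	/* not adopted on error */
	return client;
}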
491 489
492 void ceph_destroy_client(struct ceph_client *client) 490 void ceph_destroy_client(struct ceph_client *client)
493 { 491 {
494 dout("destroy_client %p\n", client); 492 dout("destroy_client %p\n", client);
495 493
496 /* unmount */ 494 /* unmount */
497 ceph_osdc_stop(&client->osdc); 495 ceph_osdc_stop(&client->osdc);
498 496
499 /* 497 /*
500 * make sure osd connections close out before destroying the 498 * make sure osd connections close out before destroying the
501 * auth module, which is needed to free those connections' 499 * auth module, which is needed to free those connections'
502 * ceph_authorizers. 500 * ceph_authorizers.
503 */ 501 */
504 ceph_msgr_flush(); 502 ceph_msgr_flush();
505 503
506 ceph_monc_stop(&client->monc); 504 ceph_monc_stop(&client->monc);
507 505
508 ceph_debugfs_client_cleanup(client); 506 ceph_debugfs_client_cleanup(client);
509 507
510 ceph_messenger_destroy(client->msgr); 508 ceph_messenger_destroy(client->msgr);
511 509
512 ceph_destroy_options(client->options); 510 ceph_destroy_options(client->options);
513 511
514 kfree(client); 512 kfree(client);
515 dout("destroy_client %p done\n", client); 513 dout("destroy_client %p done\n", client);
516 } 514 }
517 EXPORT_SYMBOL(ceph_destroy_client); 515 EXPORT_SYMBOL(ceph_destroy_client);
518 516
519 /* 517 /*
520 * true if we have the mon and osd maps, and have thus joined the cluster 518 * true if we have the mon and osd maps, and have thus joined the cluster
521 */ 519 */
522 static int have_mon_and_osd_map(struct ceph_client *client) 520 static int have_mon_and_osd_map(struct ceph_client *client)
523 { 521 {
524 return client->monc.monmap && client->monc.monmap->epoch && 522 return client->monc.monmap && client->monc.monmap->epoch &&
525 client->osdc.osdmap && client->osdc.osdmap->epoch; 523 client->osdc.osdmap && client->osdc.osdmap->epoch;
526 } 524 }
527 525
528 /* 526 /*
529 * mount: join the ceph cluster, and open root directory. 527 * mount: join the ceph cluster, and open root directory.
530 */ 528 */
531 int __ceph_open_session(struct ceph_client *client, unsigned long started) 529 int __ceph_open_session(struct ceph_client *client, unsigned long started)
532 { 530 {
533 int err; 531 int err;
534 unsigned long timeout = client->options->mount_timeout * HZ; 532 unsigned long timeout = client->options->mount_timeout * HZ;
535 533
536 /* open session, and wait for mon and osd maps */ 534 /* open session, and wait for mon and osd maps */
537 err = ceph_monc_open_session(&client->monc); 535 err = ceph_monc_open_session(&client->monc);
538 if (err < 0) 536 if (err < 0)
539 return err; 537 return err;
540 538
541 while (!have_mon_and_osd_map(client)) { 539 while (!have_mon_and_osd_map(client)) {
542 err = -EIO; 540 err = -EIO;
543 if (timeout && time_after_eq(jiffies, started + timeout)) 541 if (timeout && time_after_eq(jiffies, started + timeout))
544 return err; 542 return err;
545 543
546 /* wait */ 544 /* wait */
547 dout("mount waiting for mon_map\n"); 545 dout("mount waiting for mon_map\n");
548 err = wait_event_interruptible_timeout(client->auth_wq, 546 err = wait_event_interruptible_timeout(client->auth_wq,
549 have_mon_and_osd_map(client) || (client->auth_err < 0), 547 have_mon_and_osd_map(client) || (client->auth_err < 0),
550 timeout); 548 timeout);
551 if (err == -EINTR || err == -ERESTARTSYS) 549 if (err == -EINTR || err == -ERESTARTSYS)
552 return err; 550 return err;
553 if (client->auth_err < 0) 551 if (client->auth_err < 0)
554 return client->auth_err; 552 return client->auth_err;
555 } 553 }
556 554
557 return 0; 555 return 0;
558 } 556 }
559 EXPORT_SYMBOL(__ceph_open_session); 557 EXPORT_SYMBOL(__ceph_open_session);
560 558
561 559
562 int ceph_open_session(struct ceph_client *client) 560 int ceph_open_session(struct ceph_client *client)
563 { 561 {
564 int ret; 562 int ret;
565 unsigned long started = jiffies; /* note the start time */ 563 unsigned long started = jiffies; /* note the start time */
566 564
567 dout("open_session start\n"); 565 dout("open_session start\n");
568 mutex_lock(&client->mount_mutex); 566 mutex_lock(&client->mount_mutex);
569 567
570 ret = __ceph_open_session(client, started); 568 ret = __ceph_open_session(client, started);
571 569
572 mutex_unlock(&client->mount_mutex); 570 mutex_unlock(&client->mount_mutex);
573 return ret; 571 return ret;
574 } 572 }
575 EXPORT_SYMBOL(ceph_open_session); 573 EXPORT_SYMBOL(ceph_open_session);
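/*
 * Editor's note: illustrative call sequence only. ceph_open_session()
 * blocks until both the mon and osd maps arrive, authentication fails,
 * or the mount_timeout from the options elapses; per the loop in
 * __ceph_open_session() above, the timeout case returns -EIO.
 */
static int example_mount(struct ceph_client *client)
{
	int err = ceph_open_session(client);

	if (err < 0)
		ceph_destroy_client(client);
	return err;
}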
576 574
577 575
578 static int __init init_ceph_lib(void) 576 static int __init init_ceph_lib(void)
579 { 577 {
580 int ret = 0; 578 int ret = 0;
581 579
582 ret = ceph_debugfs_init(); 580 ret = ceph_debugfs_init();
583 if (ret < 0) 581 if (ret < 0)
584 goto out; 582 goto out;
585 583
586 ret = ceph_crypto_init(); 584 ret = ceph_crypto_init();
587 if (ret < 0) 585 if (ret < 0)
588 goto out_debugfs; 586 goto out_debugfs;
589 587
590 ret = ceph_msgr_init(); 588 ret = ceph_msgr_init();
591 if (ret < 0) 589 if (ret < 0)
592 goto out_crypto; 590 goto out_crypto;
593 591
594 pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n", 592 pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n",
595 CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL, 593 CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL,
596 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT, 594 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
597 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT); 595 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
598 596
599 return 0; 597 return 0;
600 598
601 out_crypto: 599 out_crypto:
602 ceph_crypto_shutdown(); 600 ceph_crypto_shutdown();
603 out_debugfs: 601 out_debugfs:
604 ceph_debugfs_cleanup(); 602 ceph_debugfs_cleanup();
605 out: 603 out:
606 return ret; 604 return ret;
607 } 605 }
608 606
609 static void __exit exit_ceph_lib(void) 607 static void __exit exit_ceph_lib(void)
610 { 608 {
611 dout("exit_ceph_lib\n"); 609 dout("exit_ceph_lib\n");
612 ceph_msgr_exit(); 610 ceph_msgr_exit();
613 ceph_crypto_shutdown(); 611 ceph_crypto_shutdown();
614 ceph_debugfs_cleanup(); 612 ceph_debugfs_cleanup();
615 } 613 }
616 614
617 module_init(init_ceph_lib); 615 module_init(init_ceph_lib);
618 module_exit(exit_ceph_lib); 616 module_exit(exit_ceph_lib);
619 617
620 MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 618 MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
621 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 619 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
622 MODULE_AUTHOR("Patience Warnick <patience@newdream.net>"); 620 MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
623 MODULE_DESCRIPTION("Ceph filesystem for Linux"); 621 MODULE_DESCRIPTION("Ceph filesystem for Linux");
624 MODULE_LICENSE("GPL"); 622 MODULE_LICENSE("GPL");
625 623
net/ceph/mon_client.c
1 #include <linux/ceph/ceph_debug.h> 1 #include <linux/ceph/ceph_debug.h>
2 2
3 #include <linux/module.h> 3 #include <linux/module.h>
4 #include <linux/types.h> 4 #include <linux/types.h>
5 #include <linux/slab.h> 5 #include <linux/slab.h>
6 #include <linux/random.h> 6 #include <linux/random.h>
7 #include <linux/sched.h> 7 #include <linux/sched.h>
8 8
9 #include <linux/ceph/mon_client.h> 9 #include <linux/ceph/mon_client.h>
10 #include <linux/ceph/libceph.h> 10 #include <linux/ceph/libceph.h>
11 #include <linux/ceph/debugfs.h>
11 #include <linux/ceph/decode.h> 12 #include <linux/ceph/decode.h>
12
13 #include <linux/ceph/auth.h> 13 #include <linux/ceph/auth.h>
14 14
15 /* 15 /*
16 * Interact with Ceph monitor cluster. Handle requests for new map 16 * Interact with Ceph monitor cluster. Handle requests for new map
17 * versions, and periodically resend as needed. Also implement 17 * versions, and periodically resend as needed. Also implement
18 * statfs() and umount(). 18 * statfs() and umount().
19 * 19 *
20 * A small cluster of Ceph "monitors" is responsible for managing critical 20 * A small cluster of Ceph "monitors" is responsible for managing critical
21 * cluster configuration and state information. An odd number (e.g., 3, 5) 21 * cluster configuration and state information. An odd number (e.g., 3, 5)
22 * of cmon daemons use a modified version of the Paxos part-time parliament 22 * of cmon daemons use a modified version of the Paxos part-time parliament
23 * algorithm to manage the MDS map (mds cluster membership), OSD map, and 23 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
24 * list of clients who have mounted the file system. 24 * list of clients who have mounted the file system.
25 * 25 *
26 * We maintain an open, active session with a monitor at all times in order to 26 * We maintain an open, active session with a monitor at all times in order to
27 * receive timely MDSMap updates. We periodically send a keepalive byte on the 27 * receive timely MDSMap updates. We periodically send a keepalive byte on the
28 * TCP socket to ensure we detect a failure. If the connection does break, we 28 * TCP socket to ensure we detect a failure. If the connection does break, we
29 * randomly hunt for a new monitor. Once the connection is reestablished, we 29 * randomly hunt for a new monitor. Once the connection is reestablished, we
30 * resend any outstanding requests. 30 * resend any outstanding requests.
31 */ 31 */
32 32
33 static const struct ceph_connection_operations mon_con_ops; 33 static const struct ceph_connection_operations mon_con_ops;
34 34
35 static int __validate_auth(struct ceph_mon_client *monc); 35 static int __validate_auth(struct ceph_mon_client *monc);
36 36
37 /* 37 /*
38 * Decode a monmap blob (e.g., during mount). 38 * Decode a monmap blob (e.g., during mount).
39 */ 39 */
40 struct ceph_monmap *ceph_monmap_decode(void *p, void *end) 40 struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
41 { 41 {
42 struct ceph_monmap *m = NULL; 42 struct ceph_monmap *m = NULL;
43 int i, err = -EINVAL; 43 int i, err = -EINVAL;
44 struct ceph_fsid fsid; 44 struct ceph_fsid fsid;
45 u32 epoch, num_mon; 45 u32 epoch, num_mon;
46 u16 version; 46 u16 version;
47 u32 len; 47 u32 len;
48 48
49 ceph_decode_32_safe(&p, end, len, bad); 49 ceph_decode_32_safe(&p, end, len, bad);
50 ceph_decode_need(&p, end, len, bad); 50 ceph_decode_need(&p, end, len, bad);
51 51
52 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p)); 52 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
53 53
54 ceph_decode_16_safe(&p, end, version, bad); 54 ceph_decode_16_safe(&p, end, version, bad);
55 55
56 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad); 56 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
57 ceph_decode_copy(&p, &fsid, sizeof(fsid)); 57 ceph_decode_copy(&p, &fsid, sizeof(fsid));
58 epoch = ceph_decode_32(&p); 58 epoch = ceph_decode_32(&p);
59 59
60 num_mon = ceph_decode_32(&p); 60 num_mon = ceph_decode_32(&p);
61 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad); 61 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
62 62
63 if (num_mon >= CEPH_MAX_MON) 63 if (num_mon >= CEPH_MAX_MON)
64 goto bad; 64 goto bad;
65 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS); 65 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
66 if (m == NULL) 66 if (m == NULL)
67 return ERR_PTR(-ENOMEM); 67 return ERR_PTR(-ENOMEM);
68 m->fsid = fsid; 68 m->fsid = fsid;
69 m->epoch = epoch; 69 m->epoch = epoch;
70 m->num_mon = num_mon; 70 m->num_mon = num_mon;
71 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0])); 71 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
72 for (i = 0; i < num_mon; i++) 72 for (i = 0; i < num_mon; i++)
73 ceph_decode_addr(&m->mon_inst[i].addr); 73 ceph_decode_addr(&m->mon_inst[i].addr);
74 74
75 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch, 75 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
76 m->num_mon); 76 m->num_mon);
77 for (i = 0; i < m->num_mon; i++) 77 for (i = 0; i < m->num_mon; i++)
78 dout("monmap_decode mon%d is %s\n", i, 78 dout("monmap_decode mon%d is %s\n", i,
79 ceph_pr_addr(&m->mon_inst[i].addr.in_addr)); 79 ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
80 return m; 80 return m;
81 81
82 bad: 82 bad:
83 dout("monmap_decode failed with %d\n", err); 83 dout("monmap_decode failed with %d\n", err);
84 kfree(m); 84 kfree(m);
85 return ERR_PTR(err); 85 return ERR_PTR(err);
86 } 86 }
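/*
 * Editor's note: the monmap blob layout, inferred purely from the
 * ceph_decode_* calls above (this file carries no separate spec):
 *
 *	__le32 len;                          bytes that follow
 *	__le16 version;
 *	struct ceph_fsid fsid;
 *	__le32 epoch;
 *	__le32 num_mon;                      must be < CEPH_MAX_MON
 *	struct ceph_entity_inst mon_inst[];  num_mon entries; addresses
 *	                                     fixed up by ceph_decode_addr()
 */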
87 87
88 /* 88 /*
89 * return true if *addr is included in the monmap. 89 * return true if *addr is included in the monmap.
90 */ 90 */
91 int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr) 91 int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
92 { 92 {
93 int i; 93 int i;
94 94
95 for (i = 0; i < m->num_mon; i++) 95 for (i = 0; i < m->num_mon; i++)
96 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0) 96 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
97 return 1; 97 return 1;
98 return 0; 98 return 0;
99 } 99 }
100 100
101 /* 101 /*
102 * Send an auth request. 102 * Send an auth request.
103 */ 103 */
104 static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) 104 static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
105 { 105 {
106 monc->pending_auth = 1; 106 monc->pending_auth = 1;
107 monc->m_auth->front.iov_len = len; 107 monc->m_auth->front.iov_len = len;
108 monc->m_auth->hdr.front_len = cpu_to_le32(len); 108 monc->m_auth->hdr.front_len = cpu_to_le32(len);
109 ceph_con_revoke(monc->con, monc->m_auth); 109 ceph_con_revoke(monc->con, monc->m_auth);
110 ceph_msg_get(monc->m_auth); /* keep our ref */ 110 ceph_msg_get(monc->m_auth); /* keep our ref */
111 ceph_con_send(monc->con, monc->m_auth); 111 ceph_con_send(monc->con, monc->m_auth);
112 } 112 }
113 113
114 /* 114 /*
115 * Close monitor session, if any. 115 * Close monitor session, if any.
116 */ 116 */
117 static void __close_session(struct ceph_mon_client *monc) 117 static void __close_session(struct ceph_mon_client *monc)
118 { 118 {
119 dout("__close_session closing mon%d\n", monc->cur_mon); 119 dout("__close_session closing mon%d\n", monc->cur_mon);
120 ceph_con_revoke(monc->con, monc->m_auth); 120 ceph_con_revoke(monc->con, monc->m_auth);
121 ceph_con_close(monc->con); 121 ceph_con_close(monc->con);
122 monc->cur_mon = -1; 122 monc->cur_mon = -1;
123 monc->pending_auth = 0; 123 monc->pending_auth = 0;
124 ceph_auth_reset(monc->auth); 124 ceph_auth_reset(monc->auth);
125 } 125 }
126 126
127 /* 127 /*
128 * Open a session with a (new) monitor. 128 * Open a session with a (new) monitor.
129 */ 129 */
130 static int __open_session(struct ceph_mon_client *monc) 130 static int __open_session(struct ceph_mon_client *monc)
131 { 131 {
132 unsigned char r; 132 unsigned char r;
133 int ret; 133 int ret;
134 134
135 if (monc->cur_mon < 0) { 135 if (monc->cur_mon < 0) {
136 get_random_bytes(&r, 1); 136 get_random_bytes(&r, 1);
137 monc->cur_mon = r % monc->monmap->num_mon; 137 monc->cur_mon = r % monc->monmap->num_mon;
138 dout("open_session num=%d r=%d -> mon%d\n", 138 dout("open_session num=%d r=%d -> mon%d\n",
139 monc->monmap->num_mon, r, monc->cur_mon); 139 monc->monmap->num_mon, r, monc->cur_mon);
140 monc->sub_sent = 0; 140 monc->sub_sent = 0;
141 monc->sub_renew_after = jiffies; /* i.e., expired */ 141 monc->sub_renew_after = jiffies; /* i.e., expired */
142 monc->want_next_osdmap = !!monc->want_next_osdmap; 142 monc->want_next_osdmap = !!monc->want_next_osdmap;
143 143
144 dout("open_session mon%d opening\n", monc->cur_mon); 144 dout("open_session mon%d opening\n", monc->cur_mon);
145 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON; 145 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
146 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon); 146 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
147 ceph_con_open(monc->con, 147 ceph_con_open(monc->con,
148 &monc->monmap->mon_inst[monc->cur_mon].addr); 148 &monc->monmap->mon_inst[monc->cur_mon].addr);
149 149
150 /* initiate authentication handshake */ 150 /* initiate authentication handshake */
151 ret = ceph_auth_build_hello(monc->auth, 151 ret = ceph_auth_build_hello(monc->auth,
152 monc->m_auth->front.iov_base, 152 monc->m_auth->front.iov_base,
153 monc->m_auth->front_max); 153 monc->m_auth->front_max);
154 __send_prepared_auth_request(monc, ret); 154 __send_prepared_auth_request(monc, ret);
155 } else { 155 } else {
156 dout("open_session mon%d already open\n", monc->cur_mon); 156 dout("open_session mon%d already open\n", monc->cur_mon);
157 } 157 }
158 return 0; 158 return 0;
159 } 159 }
160 160
161 static bool __sub_expired(struct ceph_mon_client *monc) 161 static bool __sub_expired(struct ceph_mon_client *monc)
162 { 162 {
163 return time_after_eq(jiffies, monc->sub_renew_after); 163 return time_after_eq(jiffies, monc->sub_renew_after);
164 } 164 }
165 165
166 /* 166 /*
167 * Reschedule delayed work timer. 167 * Reschedule delayed work timer.
168 */ 168 */
169 static void __schedule_delayed(struct ceph_mon_client *monc) 169 static void __schedule_delayed(struct ceph_mon_client *monc)
170 { 170 {
171 unsigned delay; 171 unsigned delay;
172 172
173 if (monc->cur_mon < 0 || __sub_expired(monc)) 173 if (monc->cur_mon < 0 || __sub_expired(monc))
174 delay = 10 * HZ; 174 delay = 10 * HZ;
175 else 175 else
176 delay = 20 * HZ; 176 delay = 20 * HZ;
177 dout("__schedule_delayed after %u\n", delay); 177 dout("__schedule_delayed after %u\n", delay);
178 schedule_delayed_work(&monc->delayed_work, delay); 178 schedule_delayed_work(&monc->delayed_work, delay);
179 } 179 }
180 180
181 /* 181 /*
182 * Send subscribe request for mdsmap and/or osdmap. 182 * Send subscribe request for mdsmap and/or osdmap.
183 */ 183 */
184 static void __send_subscribe(struct ceph_mon_client *monc) 184 static void __send_subscribe(struct ceph_mon_client *monc)
185 { 185 {
186 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n", 186 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
187 (unsigned)monc->sub_sent, __sub_expired(monc), 187 (unsigned)monc->sub_sent, __sub_expired(monc),
188 monc->want_next_osdmap); 188 monc->want_next_osdmap);
189 if ((__sub_expired(monc) && !monc->sub_sent) || 189 if ((__sub_expired(monc) && !monc->sub_sent) ||
190 monc->want_next_osdmap == 1) { 190 monc->want_next_osdmap == 1) {
191 struct ceph_msg *msg = monc->m_subscribe; 191 struct ceph_msg *msg = monc->m_subscribe;
192 struct ceph_mon_subscribe_item *i; 192 struct ceph_mon_subscribe_item *i;
193 void *p, *end; 193 void *p, *end;
194 int num; 194 int num;
195 195
196 p = msg->front.iov_base; 196 p = msg->front.iov_base;
197 end = p + msg->front_max; 197 end = p + msg->front_max;
198 198
199 num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap; 199 num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
200 ceph_encode_32(&p, num); 200 ceph_encode_32(&p, num);
201 201
202 if (monc->want_next_osdmap) { 202 if (monc->want_next_osdmap) {
203 dout("__send_subscribe to 'osdmap' %u\n", 203 dout("__send_subscribe to 'osdmap' %u\n",
204 (unsigned)monc->have_osdmap); 204 (unsigned)monc->have_osdmap);
205 ceph_encode_string(&p, end, "osdmap", 6); 205 ceph_encode_string(&p, end, "osdmap", 6);
206 i = p; 206 i = p;
207 i->have = cpu_to_le64(monc->have_osdmap); 207 i->have = cpu_to_le64(monc->have_osdmap);
208 i->onetime = 1; 208 i->onetime = 1;
209 p += sizeof(*i); 209 p += sizeof(*i);
210 monc->want_next_osdmap = 2; /* requested */ 210 monc->want_next_osdmap = 2; /* requested */
211 } 211 }
212 if (monc->want_mdsmap) { 212 if (monc->want_mdsmap) {
213 dout("__send_subscribe to 'mdsmap' %u+\n", 213 dout("__send_subscribe to 'mdsmap' %u+\n",
214 (unsigned)monc->have_mdsmap); 214 (unsigned)monc->have_mdsmap);
215 ceph_encode_string(&p, end, "mdsmap", 6); 215 ceph_encode_string(&p, end, "mdsmap", 6);
216 i = p; 216 i = p;
217 i->have = cpu_to_le64(monc->have_mdsmap); 217 i->have = cpu_to_le64(monc->have_mdsmap);
218 i->onetime = 0; 218 i->onetime = 0;
219 p += sizeof(*i); 219 p += sizeof(*i);
220 } 220 }
221 ceph_encode_string(&p, end, "monmap", 6); 221 ceph_encode_string(&p, end, "monmap", 6);
222 i = p; 222 i = p;
223 i->have = 0; 223 i->have = 0;
224 i->onetime = 0; 224 i->onetime = 0;
225 p += sizeof(*i); 225 p += sizeof(*i);
226 226
227 msg->front.iov_len = p - msg->front.iov_base; 227 msg->front.iov_len = p - msg->front.iov_base;
228 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 228 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
229 ceph_con_revoke(monc->con, msg); 229 ceph_con_revoke(monc->con, msg);
230 ceph_con_send(monc->con, ceph_msg_get(msg)); 230 ceph_con_send(monc->con, ceph_msg_get(msg));
231 231
232 monc->sub_sent = jiffies | 1; /* never 0 */ 232 monc->sub_sent = jiffies | 1; /* never 0 */
233 } 233 }
234 } 234 }
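/*
 * Editor's note: the subscribe payload assembled above is, informally
 * (inferred from the encoding calls, not from a protocol document):
 *
 *	__le32 num_entries;
 *	repeated {
 *		string name;                    "osdmap" / "mdsmap" / "monmap"
 *		struct ceph_mon_subscribe_item; { have, onetime }
 *	}
 *
 * 'have' is the epoch already held, so the monitor sends only newer
 * maps; 'onetime' asks for a single update instead of a standing
 * subscription.
 */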
235 235
236 static void handle_subscribe_ack(struct ceph_mon_client *monc, 236 static void handle_subscribe_ack(struct ceph_mon_client *monc,
237 struct ceph_msg *msg) 237 struct ceph_msg *msg)
238 { 238 {
239 unsigned seconds; 239 unsigned seconds;
240 struct ceph_mon_subscribe_ack *h = msg->front.iov_base; 240 struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
241 241
242 if (msg->front.iov_len < sizeof(*h)) 242 if (msg->front.iov_len < sizeof(*h))
243 goto bad; 243 goto bad;
244 seconds = le32_to_cpu(h->duration); 244 seconds = le32_to_cpu(h->duration);
245 245
246 mutex_lock(&monc->mutex); 246 mutex_lock(&monc->mutex);
247 if (monc->hunting) { 247 if (monc->hunting) {
248 pr_info("mon%d %s session established\n", 248 pr_info("mon%d %s session established\n",
249 monc->cur_mon, 249 monc->cur_mon,
250 ceph_pr_addr(&monc->con->peer_addr.in_addr)); 250 ceph_pr_addr(&monc->con->peer_addr.in_addr));
251 monc->hunting = false; 251 monc->hunting = false;
252 } 252 }
253 dout("handle_subscribe_ack after %d seconds\n", seconds); 253 dout("handle_subscribe_ack after %d seconds\n", seconds);
254 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1; 254 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
255 monc->sub_sent = 0; 255 monc->sub_sent = 0;
256 mutex_unlock(&monc->mutex); 256 mutex_unlock(&monc->mutex);
257 return; 257 return;
258 bad: 258 bad:
259 pr_err("got corrupt subscribe-ack msg\n"); 259 pr_err("got corrupt subscribe-ack msg\n");
260 ceph_msg_dump(msg); 260 ceph_msg_dump(msg);
261 } 261 }
262 262
263 /* 263 /*
264 * Keep track of which maps we have 264 * Keep track of which maps we have
265 */ 265 */
266 int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got) 266 int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
267 { 267 {
268 mutex_lock(&monc->mutex); 268 mutex_lock(&monc->mutex);
269 monc->have_mdsmap = got; 269 monc->have_mdsmap = got;
270 mutex_unlock(&monc->mutex); 270 mutex_unlock(&monc->mutex);
271 return 0; 271 return 0;
272 } 272 }
273 EXPORT_SYMBOL(ceph_monc_got_mdsmap); 273 EXPORT_SYMBOL(ceph_monc_got_mdsmap);
274 274
275 int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got) 275 int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
276 { 276 {
277 mutex_lock(&monc->mutex); 277 mutex_lock(&monc->mutex);
278 monc->have_osdmap = got; 278 monc->have_osdmap = got;
279 monc->want_next_osdmap = 0; 279 monc->want_next_osdmap = 0;
280 mutex_unlock(&monc->mutex); 280 mutex_unlock(&monc->mutex);
281 return 0; 281 return 0;
282 } 282 }
283 283
284 /* 284 /*
285 * Register interest in the next osdmap 285 * Register interest in the next osdmap
286 */ 286 */
287 void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) 287 void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
288 { 288 {
289 dout("request_next_osdmap have %u\n", monc->have_osdmap); 289 dout("request_next_osdmap have %u\n", monc->have_osdmap);
290 mutex_lock(&monc->mutex); 290 mutex_lock(&monc->mutex);
291 if (!monc->want_next_osdmap) 291 if (!monc->want_next_osdmap)
292 monc->want_next_osdmap = 1; 292 monc->want_next_osdmap = 1;
293 if (monc->want_next_osdmap < 2) 293 if (monc->want_next_osdmap < 2)
294 __send_subscribe(monc); 294 __send_subscribe(monc);
295 mutex_unlock(&monc->mutex); 295 mutex_unlock(&monc->mutex);
296 } 296 }
297 297
298 /* 298 /*
299 * Open a session with a monitor, scheduling the periodic keepalive. 299 * Open a session with a monitor, scheduling the periodic keepalive.
300 */ 300 */
301 int ceph_monc_open_session(struct ceph_mon_client *monc) 301 int ceph_monc_open_session(struct ceph_mon_client *monc)
302 { 302 {
303 mutex_lock(&monc->mutex); 303 mutex_lock(&monc->mutex);
304 __open_session(monc); 304 __open_session(monc);
305 __schedule_delayed(monc); 305 __schedule_delayed(monc);
306 mutex_unlock(&monc->mutex); 306 mutex_unlock(&monc->mutex);
307 return 0; 307 return 0;
308 } 308 }
309 EXPORT_SYMBOL(ceph_monc_open_session); 309 EXPORT_SYMBOL(ceph_monc_open_session);
310 310
311 /* 311 /*
312 * The monitor responds with a mount ack to indicate mount success. The 312 * The monitor responds with a mount ack to indicate mount success. The
313 * included client ticket allows the client to talk to MDSs and OSDs. 313 * included client ticket allows the client to talk to MDSs and OSDs.
314 */ 314 */
315 static void ceph_monc_handle_map(struct ceph_mon_client *monc, 315 static void ceph_monc_handle_map(struct ceph_mon_client *monc,
316 struct ceph_msg *msg) 316 struct ceph_msg *msg)
317 { 317 {
318 struct ceph_client *client = monc->client; 318 struct ceph_client *client = monc->client;
319 struct ceph_monmap *monmap = NULL, *old = monc->monmap; 319 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
320 void *p, *end; 320 void *p, *end;
321 321
322 mutex_lock(&monc->mutex); 322 mutex_lock(&monc->mutex);
323 323
324 dout("handle_monmap\n"); 324 dout("handle_monmap\n");
325 p = msg->front.iov_base; 325 p = msg->front.iov_base;
326 end = p + msg->front.iov_len; 326 end = p + msg->front.iov_len;
327 327
328 monmap = ceph_monmap_decode(p, end); 328 monmap = ceph_monmap_decode(p, end);
329 if (IS_ERR(monmap)) { 329 if (IS_ERR(monmap)) {
330 pr_err("problem decoding monmap, %d\n", 330 pr_err("problem decoding monmap, %d\n",
331 (int)PTR_ERR(monmap)); 331 (int)PTR_ERR(monmap));
332 goto out; 332 goto out;
333 } 333 }
334 334
335 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) { 335 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
336 kfree(monmap); 336 kfree(monmap);
337 goto out; 337 goto out;
338 } 338 }
339 339
340 client->monc.monmap = monmap; 340 client->monc.monmap = monmap;
341 kfree(old); 341 kfree(old);
342 342
343 if (!client->have_fsid) {
344 client->have_fsid = true;
345 mutex_unlock(&monc->mutex);
346 /*
347 * do debugfs initialization without mutex to avoid
348 * creating a locking dependency
349 */
350 ceph_debugfs_client_init(client);
351 goto out_unlocked;
352 }
343 out: 353 out:
344 mutex_unlock(&monc->mutex); 354 mutex_unlock(&monc->mutex);
355 out_unlocked:
345 wake_up_all(&client->auth_wq); 356 wake_up_all(&client->auth_wq);
346 } 357 }
347 358
348 /* 359 /*
349 * generic requests (e.g., statfs, poolop) 360 * generic requests (e.g., statfs, poolop)
350 */ 361 */
351 static struct ceph_mon_generic_request *__lookup_generic_req( 362 static struct ceph_mon_generic_request *__lookup_generic_req(
352 struct ceph_mon_client *monc, u64 tid) 363 struct ceph_mon_client *monc, u64 tid)
353 { 364 {
354 struct ceph_mon_generic_request *req; 365 struct ceph_mon_generic_request *req;
355 struct rb_node *n = monc->generic_request_tree.rb_node; 366 struct rb_node *n = monc->generic_request_tree.rb_node;
356 367
357 while (n) { 368 while (n) {
358 req = rb_entry(n, struct ceph_mon_generic_request, node); 369 req = rb_entry(n, struct ceph_mon_generic_request, node);
359 if (tid < req->tid) 370 if (tid < req->tid)
360 n = n->rb_left; 371 n = n->rb_left;
361 else if (tid > req->tid) 372 else if (tid > req->tid)
362 n = n->rb_right; 373 n = n->rb_right;
363 else 374 else
364 return req; 375 return req;
365 } 376 }
366 return NULL; 377 return NULL;
367 } 378 }
368 379
369 static void __insert_generic_request(struct ceph_mon_client *monc, 380 static void __insert_generic_request(struct ceph_mon_client *monc,
370 struct ceph_mon_generic_request *new) 381 struct ceph_mon_generic_request *new)
371 { 382 {
372 struct rb_node **p = &monc->generic_request_tree.rb_node; 383 struct rb_node **p = &monc->generic_request_tree.rb_node;
373 struct rb_node *parent = NULL; 384 struct rb_node *parent = NULL;
374 struct ceph_mon_generic_request *req = NULL; 385 struct ceph_mon_generic_request *req = NULL;
375 386
376 while (*p) { 387 while (*p) {
377 parent = *p; 388 parent = *p;
378 req = rb_entry(parent, struct ceph_mon_generic_request, node); 389 req = rb_entry(parent, struct ceph_mon_generic_request, node);
379 if (new->tid < req->tid) 390 if (new->tid < req->tid)
380 p = &(*p)->rb_left; 391 p = &(*p)->rb_left;
381 else if (new->tid > req->tid) 392 else if (new->tid > req->tid)
382 p = &(*p)->rb_right; 393 p = &(*p)->rb_right;
383 else 394 else
384 BUG(); 395 BUG();
385 } 396 }
386 397
387 rb_link_node(&new->node, parent, p); 398 rb_link_node(&new->node, parent, p);
388 rb_insert_color(&new->node, &monc->generic_request_tree); 399 rb_insert_color(&new->node, &monc->generic_request_tree);
389 } 400 }
390 401
391 static void release_generic_request(struct kref *kref) 402 static void release_generic_request(struct kref *kref)
392 { 403 {
393 struct ceph_mon_generic_request *req = 404 struct ceph_mon_generic_request *req =
394 container_of(kref, struct ceph_mon_generic_request, kref); 405 container_of(kref, struct ceph_mon_generic_request, kref);
395 406
396 if (req->reply) 407 if (req->reply)
397 ceph_msg_put(req->reply); 408 ceph_msg_put(req->reply);
398 if (req->request) 409 if (req->request)
399 ceph_msg_put(req->request); 410 ceph_msg_put(req->request);
400 411
401 kfree(req); 412 kfree(req);
402 } 413 }
403 414
404 static void put_generic_request(struct ceph_mon_generic_request *req) 415 static void put_generic_request(struct ceph_mon_generic_request *req)
405 { 416 {
406 kref_put(&req->kref, release_generic_request); 417 kref_put(&req->kref, release_generic_request);
407 } 418 }
408 419
409 static void get_generic_request(struct ceph_mon_generic_request *req) 420 static void get_generic_request(struct ceph_mon_generic_request *req)
410 { 421 {
411 kref_get(&req->kref); 422 kref_get(&req->kref);
412 } 423 }
413 424
414 static struct ceph_msg *get_generic_reply(struct ceph_connection *con, 425 static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
415 struct ceph_msg_header *hdr, 426 struct ceph_msg_header *hdr,
416 int *skip) 427 int *skip)
417 { 428 {
418 struct ceph_mon_client *monc = con->private; 429 struct ceph_mon_client *monc = con->private;
419 struct ceph_mon_generic_request *req; 430 struct ceph_mon_generic_request *req;
420 u64 tid = le64_to_cpu(hdr->tid); 431 u64 tid = le64_to_cpu(hdr->tid);
421 struct ceph_msg *m; 432 struct ceph_msg *m;
422 433
423 mutex_lock(&monc->mutex); 434 mutex_lock(&monc->mutex);
424 req = __lookup_generic_req(monc, tid); 435 req = __lookup_generic_req(monc, tid);
425 if (!req) { 436 if (!req) {
426 dout("get_generic_reply %lld dne\n", tid); 437 dout("get_generic_reply %lld dne\n", tid);
427 *skip = 1; 438 *skip = 1;
428 m = NULL; 439 m = NULL;
429 } else { 440 } else {
430 dout("get_generic_reply %lld got %p\n", tid, req->reply); 441 dout("get_generic_reply %lld got %p\n", tid, req->reply);
431 m = ceph_msg_get(req->reply); 442 m = ceph_msg_get(req->reply);
432 /* 443 /*
433 * we don't need to track the connection reading into 444 * we don't need to track the connection reading into
434 * this reply because we only have one open connection 445 * this reply because we only have one open connection
435 * at a time, ever. 446 * at a time, ever.
436 */ 447 */
437 } 448 }
438 mutex_unlock(&monc->mutex); 449 mutex_unlock(&monc->mutex);
439 return m; 450 return m;
440 } 451 }
441 452
442 static int do_generic_request(struct ceph_mon_client *monc, 453 static int do_generic_request(struct ceph_mon_client *monc,
443 struct ceph_mon_generic_request *req) 454 struct ceph_mon_generic_request *req)
444 { 455 {
445 int err; 456 int err;
446 457
447 /* register request */ 458 /* register request */
448 mutex_lock(&monc->mutex); 459 mutex_lock(&monc->mutex);
449 req->tid = ++monc->last_tid; 460 req->tid = ++monc->last_tid;
450 req->request->hdr.tid = cpu_to_le64(req->tid); 461 req->request->hdr.tid = cpu_to_le64(req->tid);
451 __insert_generic_request(monc, req); 462 __insert_generic_request(monc, req);
452 monc->num_generic_requests++; 463 monc->num_generic_requests++;
453 ceph_con_send(monc->con, ceph_msg_get(req->request)); 464 ceph_con_send(monc->con, ceph_msg_get(req->request));
454 mutex_unlock(&monc->mutex); 465 mutex_unlock(&monc->mutex);
455 466
456 err = wait_for_completion_interruptible(&req->completion); 467 err = wait_for_completion_interruptible(&req->completion);
457 468
458 mutex_lock(&monc->mutex); 469 mutex_lock(&monc->mutex);
459 rb_erase(&req->node, &monc->generic_request_tree); 470 rb_erase(&req->node, &monc->generic_request_tree);
460 monc->num_generic_requests--; 471 monc->num_generic_requests--;
461 mutex_unlock(&monc->mutex); 472 mutex_unlock(&monc->mutex);
462 473
463 if (!err) 474 if (!err)
464 err = req->result; 475 err = req->result;
465 return err; 476 return err;
466 } 477 }
467 478
468 /* 479 /*
469 * statfs 480 * statfs
470 */ 481 */
471 static void handle_statfs_reply(struct ceph_mon_client *monc, 482 static void handle_statfs_reply(struct ceph_mon_client *monc,
472 struct ceph_msg *msg) 483 struct ceph_msg *msg)
473 { 484 {
474 struct ceph_mon_generic_request *req; 485 struct ceph_mon_generic_request *req;
475 struct ceph_mon_statfs_reply *reply = msg->front.iov_base; 486 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
476 u64 tid = le64_to_cpu(msg->hdr.tid); 487 u64 tid = le64_to_cpu(msg->hdr.tid);
477 488
478 if (msg->front.iov_len != sizeof(*reply)) 489 if (msg->front.iov_len != sizeof(*reply))
479 goto bad; 490 goto bad;
480 dout("handle_statfs_reply %p tid %llu\n", msg, tid); 491 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
481 492
482 mutex_lock(&monc->mutex); 493 mutex_lock(&monc->mutex);
483 req = __lookup_generic_req(monc, tid); 494 req = __lookup_generic_req(monc, tid);
484 if (req) { 495 if (req) {
485 *(struct ceph_statfs *)req->buf = reply->st; 496 *(struct ceph_statfs *)req->buf = reply->st;
486 req->result = 0; 497 req->result = 0;
487 get_generic_request(req); 498 get_generic_request(req);
488 } 499 }
489 mutex_unlock(&monc->mutex); 500 mutex_unlock(&monc->mutex);
490 if (req) { 501 if (req) {
491 complete_all(&req->completion); 502 complete_all(&req->completion);
492 put_generic_request(req); 503 put_generic_request(req);
493 } 504 }
494 return; 505 return;
495 506
496 bad: 507 bad:
497 pr_err("corrupt generic reply, tid %llu\n", tid); 508 pr_err("corrupt generic reply, tid %llu\n", tid);
498 ceph_msg_dump(msg); 509 ceph_msg_dump(msg);
499 } 510 }
500 511
501 /* 512 /*
502 * Do a synchronous statfs(). 513 * Do a synchronous statfs().
503 */ 514 */
504 int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) 515 int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
505 { 516 {
506 struct ceph_mon_generic_request *req; 517 struct ceph_mon_generic_request *req;
507 struct ceph_mon_statfs *h; 518 struct ceph_mon_statfs *h;
508 int err; 519 int err;
509 520
510 req = kzalloc(sizeof(*req), GFP_NOFS); 521 req = kzalloc(sizeof(*req), GFP_NOFS);
511 if (!req) 522 if (!req)
512 return -ENOMEM; 523 return -ENOMEM;
513 524
514 kref_init(&req->kref); 525 kref_init(&req->kref);
515 req->buf = buf; 526 req->buf = buf;
516 req->buf_len = sizeof(*buf); 527 req->buf_len = sizeof(*buf);
517 init_completion(&req->completion); 528 init_completion(&req->completion);
518 529
519 err = -ENOMEM; 530 err = -ENOMEM;
520 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS, 531 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS,
521 true); 532 true);
522 if (!req->request) 533 if (!req->request)
523 goto out; 534 goto out;
524 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS, 535 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS,
525 true); 536 true);
526 if (!req->reply) 537 if (!req->reply)
527 goto out; 538 goto out;
528 539
529 /* fill out request */ 540 /* fill out request */
530 h = req->request->front.iov_base; 541 h = req->request->front.iov_base;
531 h->monhdr.have_version = 0; 542 h->monhdr.have_version = 0;
532 h->monhdr.session_mon = cpu_to_le16(-1); 543 h->monhdr.session_mon = cpu_to_le16(-1);
533 h->monhdr.session_mon_tid = 0; 544 h->monhdr.session_mon_tid = 0;
534 h->fsid = monc->monmap->fsid; 545 h->fsid = monc->monmap->fsid;
535 546
536 err = do_generic_request(monc, req); 547 err = do_generic_request(monc, req);
537 548
538 out: 549 out:
539 kref_put(&req->kref, release_generic_request); 550 kref_put(&req->kref, release_generic_request);
540 return err; 551 return err;
541 } 552 }
542 EXPORT_SYMBOL(ceph_monc_do_statfs); 553 EXPORT_SYMBOL(ceph_monc_do_statfs);
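/*
 * Editor's note: hedged usage sketch. The field names follow this era's
 * struct ceph_statfs (little-endian counters in KB), and the client is
 * assumed to already have an open session.
 */
static int example_statfs(struct ceph_client *client)
{
	struct ceph_statfs st;
	int err = ceph_monc_do_statfs(&client->monc, &st);

	if (!err)
		pr_info("cluster: %llu KB total, %llu KB avail\n",
			le64_to_cpu(st.kb), le64_to_cpu(st.kb_avail));
	return err;
}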
543 554
544 /* 555 /*
545 * pool ops 556 * pool ops
546 */ 557 */
547 static int get_poolop_reply_buf(const char *src, size_t src_len, 558 static int get_poolop_reply_buf(const char *src, size_t src_len,
548 char *dst, size_t dst_len) 559 char *dst, size_t dst_len)
549 { 560 {
550 u32 buf_len; 561 u32 buf_len;
551 562
552 if (src_len != sizeof(u32) + dst_len) 563 if (src_len != sizeof(u32) + dst_len)
553 return -EINVAL; 564 return -EINVAL;
554 565
555 buf_len = le32_to_cpu(*(u32 *)src); 566 buf_len = le32_to_cpu(*(u32 *)src);
556 if (buf_len != dst_len) 567 if (buf_len != dst_len)
557 return -EINVAL; 568 return -EINVAL;
558 569
559 memcpy(dst, src + sizeof(u32), dst_len); 570 memcpy(dst, src + sizeof(u32), dst_len);
560 return 0; 571 return 0;
561 } 572 }
562 573
563 static void handle_poolop_reply(struct ceph_mon_client *monc, 574 static void handle_poolop_reply(struct ceph_mon_client *monc,
564 struct ceph_msg *msg) 575 struct ceph_msg *msg)
565 { 576 {
566 struct ceph_mon_generic_request *req; 577 struct ceph_mon_generic_request *req;
567 struct ceph_mon_poolop_reply *reply = msg->front.iov_base; 578 struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
568 u64 tid = le64_to_cpu(msg->hdr.tid); 579 u64 tid = le64_to_cpu(msg->hdr.tid);
569 580
570 if (msg->front.iov_len < sizeof(*reply)) 581 if (msg->front.iov_len < sizeof(*reply))
571 goto bad; 582 goto bad;
572 dout("handle_poolop_reply %p tid %llu\n", msg, tid); 583 dout("handle_poolop_reply %p tid %llu\n", msg, tid);
573 584
574 mutex_lock(&monc->mutex); 585 mutex_lock(&monc->mutex);
575 req = __lookup_generic_req(monc, tid); 586 req = __lookup_generic_req(monc, tid);
576 if (req) { 587 if (req) {
577 if (req->buf_len && 588 if (req->buf_len &&
578 get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply), 589 get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
579 msg->front.iov_len - sizeof(*reply), 590 msg->front.iov_len - sizeof(*reply),
580 req->buf, req->buf_len) < 0) { 591 req->buf, req->buf_len) < 0) {
581 mutex_unlock(&monc->mutex); 592 mutex_unlock(&monc->mutex);
582 goto bad; 593 goto bad;
583 } 594 }
584 req->result = le32_to_cpu(reply->reply_code); 595 req->result = le32_to_cpu(reply->reply_code);
585 get_generic_request(req); 596 get_generic_request(req);
586 } 597 }
587 mutex_unlock(&monc->mutex); 598 mutex_unlock(&monc->mutex);
588 if (req) { 599 if (req) {
589 complete(&req->completion); 600 complete(&req->completion);
590 put_generic_request(req); 601 put_generic_request(req);
591 } 602 }
592 return; 603 return;
593 604
594 bad: 605 bad:
595 pr_err("corrupt generic reply, tid %llu\n", tid); 606 pr_err("corrupt generic reply, tid %llu\n", tid);
596 ceph_msg_dump(msg); 607 ceph_msg_dump(msg);
597 } 608 }
598 609
599 /* 610 /*
600 * Do a synchronous pool op. 611 * Do a synchronous pool op.
601 */ 612 */
602 int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op, 613 int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
603 u32 pool, u64 snapid, 614 u32 pool, u64 snapid,
604 char *buf, int len) 615 char *buf, int len)
605 { 616 {
606 struct ceph_mon_generic_request *req; 617 struct ceph_mon_generic_request *req;
607 struct ceph_mon_poolop *h; 618 struct ceph_mon_poolop *h;
608 int err; 619 int err;
609 620
610 req = kzalloc(sizeof(*req), GFP_NOFS); 621 req = kzalloc(sizeof(*req), GFP_NOFS);
611 if (!req) 622 if (!req)
612 return -ENOMEM; 623 return -ENOMEM;
613 624
614 kref_init(&req->kref); 625 kref_init(&req->kref);
615 req->buf = buf; 626 req->buf = buf;
616 req->buf_len = len; 627 req->buf_len = len;
617 init_completion(&req->completion); 628 init_completion(&req->completion);
618 629
619 err = -ENOMEM; 630 err = -ENOMEM;
620 req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS, 631 req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS,
621 true); 632 true);
622 if (!req->request) 633 if (!req->request)
623 goto out; 634 goto out;
624 req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS, 635 req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS,
625 true); 636 true);
626 if (!req->reply) 637 if (!req->reply)
627 goto out; 638 goto out;
628 639
629 /* fill out request */ 640 /* fill out request */
630 req->request->hdr.version = cpu_to_le16(2); 641 req->request->hdr.version = cpu_to_le16(2);
631 h = req->request->front.iov_base; 642 h = req->request->front.iov_base;
632 h->monhdr.have_version = 0; 643 h->monhdr.have_version = 0;
633 h->monhdr.session_mon = cpu_to_le16(-1); 644 h->monhdr.session_mon = cpu_to_le16(-1);
634 h->monhdr.session_mon_tid = 0; 645 h->monhdr.session_mon_tid = 0;
635 h->fsid = monc->monmap->fsid; 646 h->fsid = monc->monmap->fsid;
636 h->pool = cpu_to_le32(pool); 647 h->pool = cpu_to_le32(pool);
637 h->op = cpu_to_le32(op); 648 h->op = cpu_to_le32(op);
638 h->auid = 0; 649 h->auid = 0;
639 h->snapid = cpu_to_le64(snapid); 650 h->snapid = cpu_to_le64(snapid);
640 h->name_len = 0; 651 h->name_len = 0;
641 652
642 err = do_generic_request(monc, req); 653 err = do_generic_request(monc, req);
643 654
644 out: 655 out:
645 kref_put(&req->kref, release_generic_request); 656 kref_put(&req->kref, release_generic_request);
646 return err; 657 return err;
647 } 658 }
648 659
649 int ceph_monc_create_snapid(struct ceph_mon_client *monc, 660 int ceph_monc_create_snapid(struct ceph_mon_client *monc,
650 u32 pool, u64 *snapid) 661 u32 pool, u64 *snapid)
651 { 662 {
652 return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, 663 return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
653 pool, 0, (char *)snapid, sizeof(*snapid)); 664 pool, 0, (char *)snapid, sizeof(*snapid));
654 665
655 } 666 }
656 EXPORT_SYMBOL(ceph_monc_create_snapid); 667 EXPORT_SYMBOL(ceph_monc_create_snapid);
657 668
658 int ceph_monc_delete_snapid(struct ceph_mon_client *monc, 669 int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
659 u32 pool, u64 snapid) 670 u32 pool, u64 snapid)
660 { 671 {
661 return ceph_monc_do_poolop(monc, POOL_OP_DELETE_UNMANAGED_SNAP, 672 return ceph_monc_do_poolop(monc, POOL_OP_DELETE_UNMANAGED_SNAP,
662 pool, snapid, NULL, 0); 673 pool, snapid, NULL, 0);
663 674
664 } 675 }
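/*
 * Editor's note: illustrative pairing of the two snapid helpers above;
 * the pool id is a placeholder and errors are simply passed through.
 */
static int example_snap_roundtrip(struct ceph_mon_client *monc, u32 pool)
{
	u64 snapid;
	int err = ceph_monc_create_snapid(monc, pool, &snapid);

	if (!err)
		err = ceph_monc_delete_snapid(monc, pool, snapid);
	return err;
}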
665 676
666 /* 677 /*
667 * Resend pending generic requests. 678 * Resend pending generic requests.
668 */ 679 */
669 static void __resend_generic_request(struct ceph_mon_client *monc) 680 static void __resend_generic_request(struct ceph_mon_client *monc)
670 { 681 {
671 struct ceph_mon_generic_request *req; 682 struct ceph_mon_generic_request *req;
672 struct rb_node *p; 683 struct rb_node *p;
673 684
674 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) { 685 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
675 req = rb_entry(p, struct ceph_mon_generic_request, node); 686 req = rb_entry(p, struct ceph_mon_generic_request, node);
676 ceph_con_revoke(monc->con, req->request); 687 ceph_con_revoke(monc->con, req->request);
677 ceph_con_send(monc->con, ceph_msg_get(req->request)); 688 ceph_con_send(monc->con, ceph_msg_get(req->request));
678 } 689 }
679 } 690 }
680 691
681 /* 692 /*
682 * Delayed work. If we haven't mounted yet, retry. Otherwise, 693 * Delayed work. If we haven't mounted yet, retry. Otherwise,
683 * renew/retry subscription as needed (in case it is timing out, or we 694 * renew/retry subscription as needed (in case it is timing out, or we
684 * got an ENOMEM). And keep the monitor connection alive. 695 * got an ENOMEM). And keep the monitor connection alive.
685 */ 696 */
686 static void delayed_work(struct work_struct *work) 697 static void delayed_work(struct work_struct *work)
687 { 698 {
688 struct ceph_mon_client *monc = 699 struct ceph_mon_client *monc =
689 container_of(work, struct ceph_mon_client, delayed_work.work); 700 container_of(work, struct ceph_mon_client, delayed_work.work);
690 701
691 dout("monc delayed_work\n"); 702 dout("monc delayed_work\n");
692 mutex_lock(&monc->mutex); 703 mutex_lock(&monc->mutex);
693 if (monc->hunting) { 704 if (monc->hunting) {
694 __close_session(monc); 705 __close_session(monc);
695 __open_session(monc); /* continue hunting */ 706 __open_session(monc); /* continue hunting */
696 } else { 707 } else {
697 ceph_con_keepalive(monc->con); 708 ceph_con_keepalive(monc->con);
698 709
699 __validate_auth(monc); 710 __validate_auth(monc);
700 711
701 if (monc->auth->ops->is_authenticated(monc->auth)) 712 if (monc->auth->ops->is_authenticated(monc->auth))
702 __send_subscribe(monc); 713 __send_subscribe(monc);
703 } 714 }
704 __schedule_delayed(monc); 715 __schedule_delayed(monc);
705 mutex_unlock(&monc->mutex); 716 mutex_unlock(&monc->mutex);
706 } 717 }
707 718
708 /* 719 /*
709 * On startup, we build a temporary monmap populated with the IPs 720 * On startup, we build a temporary monmap populated with the IPs
710 * provided by mount(2). 721 * provided by mount(2).
711 */ 722 */
712 static int build_initial_monmap(struct ceph_mon_client *monc) 723 static int build_initial_monmap(struct ceph_mon_client *monc)
713 { 724 {
714 struct ceph_options *opt = monc->client->options; 725 struct ceph_options *opt = monc->client->options;
715 struct ceph_entity_addr *mon_addr = opt->mon_addr; 726 struct ceph_entity_addr *mon_addr = opt->mon_addr;
716 int num_mon = opt->num_mon; 727 int num_mon = opt->num_mon;
717 int i; 728 int i;
718 729
719 /* build initial monmap */ 730 /* build initial monmap */
720 monc->monmap = kzalloc(sizeof(*monc->monmap) + 731 monc->monmap = kzalloc(sizeof(*monc->monmap) +
721 num_mon*sizeof(monc->monmap->mon_inst[0]), 732 num_mon*sizeof(monc->monmap->mon_inst[0]),
722 GFP_KERNEL); 733 GFP_KERNEL);
723 if (!monc->monmap) 734 if (!monc->monmap)
724 return -ENOMEM; 735 return -ENOMEM;
725 for (i = 0; i < num_mon; i++) { 736 for (i = 0; i < num_mon; i++) {
726 monc->monmap->mon_inst[i].addr = mon_addr[i]; 737 monc->monmap->mon_inst[i].addr = mon_addr[i];
727 monc->monmap->mon_inst[i].addr.nonce = 0; 738 monc->monmap->mon_inst[i].addr.nonce = 0;
728 monc->monmap->mon_inst[i].name.type = 739 monc->monmap->mon_inst[i].name.type =
729 CEPH_ENTITY_TYPE_MON; 740 CEPH_ENTITY_TYPE_MON;
730 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); 741 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
731 } 742 }
732 monc->monmap->num_mon = num_mon; 743 monc->monmap->num_mon = num_mon;
733 monc->have_fsid = false; 744 monc->have_fsid = false;
734 return 0; 745 return 0;
735 } 746 }
736 747
int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
{
	int err = 0;

	dout("init\n");
	memset(monc, 0, sizeof(*monc));
	monc->client = cl;
	monc->monmap = NULL;
	mutex_init(&monc->mutex);

	err = build_initial_monmap(monc);
	if (err)
		goto out;

	/* connection */
	err = -ENOMEM;	/* don't return 0 if the allocation fails */
	monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
	if (!monc->con)
		goto out_monmap;
	ceph_con_init(monc->client->msgr, monc->con);
	monc->con->private = monc;
	monc->con->ops = &mon_con_ops;

	/* authentication */
	monc->auth = ceph_auth_init(cl->options->name,
				    cl->options->key);
	if (IS_ERR(monc->auth)) {
		err = PTR_ERR(monc->auth);
		goto out_con;
	}
	monc->auth->want_keys =
		CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
		CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;

	/* msgs */
	err = -ENOMEM;
	monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
				     sizeof(struct ceph_mon_subscribe_ack),
				     GFP_NOFS, true);
	if (!monc->m_subscribe_ack)
		goto out_auth;

	monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS,
					 true);
	if (!monc->m_subscribe)
		goto out_subscribe_ack;

	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS,
					  true);
	if (!monc->m_auth_reply)
		goto out_subscribe;

	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS, true);
	monc->pending_auth = 0;
	if (!monc->m_auth)
		goto out_auth_reply;

	monc->cur_mon = -1;
	monc->hunting = true;
	monc->sub_renew_after = jiffies;
	monc->sub_sent = 0;

	INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
	monc->generic_request_tree = RB_ROOT;
	monc->num_generic_requests = 0;
	monc->last_tid = 0;

	monc->have_mdsmap = 0;
	monc->have_osdmap = 0;
	monc->want_next_osdmap = 1;
	return 0;

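	/* error paths: undo the setup above in reverse order */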
out_auth_reply:
	ceph_msg_put(monc->m_auth_reply);
out_subscribe:
	ceph_msg_put(monc->m_subscribe);
out_subscribe_ack:
	ceph_msg_put(monc->m_subscribe_ack);
out_auth:
	ceph_auth_destroy(monc->auth);
out_con:
	monc->con->ops->put(monc->con);
out_monmap:
	kfree(monc->monmap);
out:
	return err;
}
EXPORT_SYMBOL(ceph_monc_init);
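
/*
 * A minimal usage sketch (assuming ceph_monc_open_session(), defined
 * elsewhere in this file, is used to kick off the first session):
 *
 *	err = ceph_monc_init(&client->monc, client);
 *	if (err)
 *		return err;
 *	err = ceph_monc_open_session(&client->monc);
 *	if (err)
 *		ceph_monc_stop(&client->monc);
 */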
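/*
 * Tear down a mon_client: cancel the periodic work, close any open
 * session, drop the connection, and release the auth state and the
 * messages preallocated by ceph_monc_init().
 */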
void ceph_monc_stop(struct ceph_mon_client *monc)
{
	dout("stop\n");
	cancel_delayed_work_sync(&monc->delayed_work);

	mutex_lock(&monc->mutex);
	__close_session(monc);

	monc->con->private = NULL;
	monc->con->ops->put(monc->con);
	monc->con = NULL;

	mutex_unlock(&monc->mutex);

	ceph_auth_destroy(monc->auth);

	ceph_msg_put(monc->m_auth);
	ceph_msg_put(monc->m_auth_reply);
	ceph_msg_put(monc->m_subscribe);
	ceph_msg_put(monc->m_subscribe_ack);

	kfree(monc->monmap);
}
EXPORT_SYMBOL(ceph_monc_stop);

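/*
 * Process an authentication reply from the monitor.  A negative
 * result is fatal and wakes anyone waiting on auth_wq; a positive
 * result means another request/reply round trip is needed; on the
 * transition into the authenticated state we adopt our global_id as
 * the client entity name and start the session proper.
 */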
static void handle_auth_reply(struct ceph_mon_client *monc,
			      struct ceph_msg *msg)
{
	int ret;
	int was_auth = 0;

	mutex_lock(&monc->mutex);
	if (monc->auth->ops)
		was_auth = monc->auth->ops->is_authenticated(monc->auth);
	monc->pending_auth = 0;
	ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
				     msg->front.iov_len,
				     monc->m_auth->front.iov_base,
				     monc->m_auth->front_max);
	if (ret < 0) {
		monc->client->auth_err = ret;
		wake_up_all(&monc->client->auth_wq);
	} else if (ret > 0) {
		__send_prepared_auth_request(monc, ret);
	} else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
		dout("authenticated, starting session\n");

		monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
		monc->client->msgr->inst.name.num =
					cpu_to_le64(monc->auth->global_id);

		__send_subscribe(monc);
		__resend_generic_request(monc);
	}
	mutex_unlock(&monc->mutex);
}

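/*
 * (Re)send an auth request if the handshake needs another round trip
 * and none is already pending.  Caller must hold monc->mutex.
 */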
static int __validate_auth(struct ceph_mon_client *monc)
{
	int ret;

	if (monc->pending_auth)
		return 0;

	ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
			      monc->m_auth->front_max);
	if (ret <= 0)
		return ret; /* either an error, or no need to authenticate */
	__send_prepared_auth_request(monc, ret);
	return 0;
}

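/* locked wrapper so other subsystems can revalidate authentication */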
int ceph_monc_validate_auth(struct ceph_mon_client *monc)
{
	int ret;

	mutex_lock(&monc->mutex);
	ret = __validate_auth(monc);
	mutex_unlock(&monc->mutex);
	return ret;
}
EXPORT_SYMBOL(ceph_monc_validate_auth);

/*
 * Handle an incoming message from the monitor: route it by type,
 * giving the chained dispatcher (client->extra_mon_dispatch, e.g. the
 * ceph filesystem's MDS map handler) a chance at anything mon_client
 * does not recognize itself.
 */
static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
	struct ceph_mon_client *monc = con->private;
	int type = le16_to_cpu(msg->hdr.type);

	if (!monc)
		return;

	switch (type) {
	case CEPH_MSG_AUTH_REPLY:
		handle_auth_reply(monc, msg);
		break;

	case CEPH_MSG_MON_SUBSCRIBE_ACK:
		handle_subscribe_ack(monc, msg);
		break;

	case CEPH_MSG_STATFS_REPLY:
		handle_statfs_reply(monc, msg);
		break;

	case CEPH_MSG_POOLOP_REPLY:
		handle_poolop_reply(monc, msg);
		break;

	case CEPH_MSG_MON_MAP:
		ceph_monc_handle_map(monc, msg);
		break;

	case CEPH_MSG_OSD_MAP:
		ceph_osdc_handle_map(&monc->client->osdc, msg);
		break;

	default:
		/* can the chained handler handle it? */
		if (monc->client->extra_mon_dispatch &&
		    monc->client->extra_mon_dispatch(monc->client, msg) == 0)
			break;

		pr_err("received unknown message type %d %s\n", type,
		       ceph_msg_type_name(type));
	}
	ceph_msg_put(msg);
}

/*
 * Allocate memory for an incoming message.  Fixed-size replies reuse
 * the messages preallocated in ceph_monc_init(), statfs/poolop
 * replies are matched to their pending generic request, maps are
 * allocated fresh, and unrecognized types are skipped.
 */
static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
				      struct ceph_msg_header *hdr,
				      int *skip)
{
	struct ceph_mon_client *monc = con->private;
	int type = le16_to_cpu(hdr->type);
	int front_len = le32_to_cpu(hdr->front_len);
	struct ceph_msg *m = NULL;

	*skip = 0;

	switch (type) {
	case CEPH_MSG_MON_SUBSCRIBE_ACK:
		m = ceph_msg_get(monc->m_subscribe_ack);
		break;
	case CEPH_MSG_POOLOP_REPLY:
	case CEPH_MSG_STATFS_REPLY:
		return get_generic_reply(con, hdr, skip);
	case CEPH_MSG_AUTH_REPLY:
		m = ceph_msg_get(monc->m_auth_reply);
		break;
	case CEPH_MSG_MON_MAP:
	case CEPH_MSG_MDS_MAP:
	case CEPH_MSG_OSD_MAP:
		m = ceph_msg_new(type, front_len, GFP_NOFS, false);
		break;
	}

	if (!m) {
		pr_info("alloc_msg unknown type %d\n", type);
		*skip = 1;
	}
	return m;
}

/*
 * If the monitor connection resets, pick a new monitor and resubmit
 * any pending requests.
 */
static void mon_fault(struct ceph_connection *con)
{
	struct ceph_mon_client *monc = con->private;

	if (!monc)
		return;

	dout("mon_fault\n");
	mutex_lock(&monc->mutex);
	if (!con->private)
		goto out;	/* raced with ceph_monc_stop() tearing us down */

	if (!monc->hunting)
		pr_info("mon%d %s session lost, hunting for new mon\n",
			monc->cur_mon,
			ceph_pr_addr(&monc->con->peer_addr.in_addr));

	__close_session(monc);
	if (!monc->hunting) {
		/* start hunting */
		monc->hunting = true;
		__open_session(monc);
	} else {
		/* already hunting, let's wait a bit */
		__schedule_delayed(monc);
	}
out:
	mutex_unlock(&monc->mutex);
}

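/* callbacks the messenger invokes on the monitor connection */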
static const struct ceph_connection_operations mon_con_ops = {
	.get = ceph_con_get,
	.put = ceph_con_put,
	.dispatch = dispatch,
	.fault = mon_fault,
	.alloc_msg = mon_alloc_msg,
};