Commit 6c073a7ee250118b8be3a2379c96fd7f78382b06

Authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client:
  rbd: fix safety of rbd_put_client()
  rbd: fix a memory leak in rbd_get_client()
  ceph: create a new session lock to avoid lock inversion
  ceph: fix length validation in parse_reply_info()
  ceph: initialize client debugfs outside of monc->mutex
  ceph: change "ceph.layout" xattr to be "ceph.file.layout"

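Two of the rbd fixes are visible in the drivers/block/rbd.c diff below. The memory-leak fix has rbd_get_client() free the rbd_opts it just allocated when it ends up reusing an already-registered client, and the rbd_put_client() safety fix moves the final kref_put() under node_lock, so the release callback's list_del() can no longer race with __rbd_client_find() walking the client list. What follows is a minimal userspace sketch of that locking pattern, with illustrative names (client_get/client_put and a plain refcount stand in for the kernel's kref and list machinery); it is an analogy, not the kernel code:

#include <pthread.h>
#include <stdlib.h>

struct client {
        int refcount;                   /* plays the role of struct kref */
};

static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;
static struct client *shared_client;   /* stands in for rbd_client_list */

/* Lookup and ref-get happen together under node_lock, as in rbd_get_client(). */
static struct client *client_get(void)
{
        struct client *c;

        pthread_mutex_lock(&node_lock);
        c = shared_client;
        if (c)
                c->refcount++;
        pthread_mutex_unlock(&node_lock);
        return c;
}

/*
 * The put must run under the same lock: if the last reference were dropped
 * outside it, client_get() could observe the client between the refcount
 * reaching zero and the unlink/free, and hand out a dangling pointer.
 */
static void client_put(struct client *c)
{
        pthread_mutex_lock(&node_lock);
        if (--c->refcount == 0) {
                shared_client = NULL;   /* the list_del() step */
                free(c);
        }
        pthread_mutex_unlock(&node_lock);
}
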
Showing 8 changed files. Inline diff of drivers/block/rbd.c below; added lines are marked "+", removed lines are marked "-".

1 /* 1 /*
2 rbd.c -- Export ceph rados objects as a Linux block device 2 rbd.c -- Export ceph rados objects as a Linux block device
3 3
4 4
5 based on drivers/block/osdblk.c: 5 based on drivers/block/osdblk.c:
6 6
7 Copyright 2009 Red Hat, Inc. 7 Copyright 2009 Red Hat, Inc.
8 8
9 This program is free software; you can redistribute it and/or modify 9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by 10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation. 11 the Free Software Foundation.
12 12
13 This program is distributed in the hope that it will be useful, 13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details. 16 GNU General Public License for more details.
17 17
18 You should have received a copy of the GNU General Public License 18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to 19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21 21
22 22
23 23
24 For usage instructions, please refer to: 24 For usage instructions, please refer to:
25 25
26 Documentation/ABI/testing/sysfs-bus-rbd 26 Documentation/ABI/testing/sysfs-bus-rbd
27 27
28 */ 28 */
29 29
30 #include <linux/ceph/libceph.h> 30 #include <linux/ceph/libceph.h>
31 #include <linux/ceph/osd_client.h> 31 #include <linux/ceph/osd_client.h>
32 #include <linux/ceph/mon_client.h> 32 #include <linux/ceph/mon_client.h>
33 #include <linux/ceph/decode.h> 33 #include <linux/ceph/decode.h>
34 #include <linux/parser.h> 34 #include <linux/parser.h>
35 35
36 #include <linux/kernel.h> 36 #include <linux/kernel.h>
37 #include <linux/device.h> 37 #include <linux/device.h>
38 #include <linux/module.h> 38 #include <linux/module.h>
39 #include <linux/fs.h> 39 #include <linux/fs.h>
40 #include <linux/blkdev.h> 40 #include <linux/blkdev.h>
41 41
42 #include "rbd_types.h" 42 #include "rbd_types.h"
43 43
44 #define DRV_NAME "rbd" 44 #define DRV_NAME "rbd"
45 #define DRV_NAME_LONG "rbd (rados block device)" 45 #define DRV_NAME_LONG "rbd (rados block device)"
46 46
47 #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ 47 #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
48 48
49 #define RBD_MAX_MD_NAME_LEN (96 + sizeof(RBD_SUFFIX)) 49 #define RBD_MAX_MD_NAME_LEN (96 + sizeof(RBD_SUFFIX))
50 #define RBD_MAX_POOL_NAME_LEN 64 50 #define RBD_MAX_POOL_NAME_LEN 64
51 #define RBD_MAX_SNAP_NAME_LEN 32 51 #define RBD_MAX_SNAP_NAME_LEN 32
52 #define RBD_MAX_OPT_LEN 1024 52 #define RBD_MAX_OPT_LEN 1024
53 53
54 #define RBD_SNAP_HEAD_NAME "-" 54 #define RBD_SNAP_HEAD_NAME "-"
55 55
56 #define DEV_NAME_LEN 32 56 #define DEV_NAME_LEN 32
57 57
58 #define RBD_NOTIFY_TIMEOUT_DEFAULT 10 58 #define RBD_NOTIFY_TIMEOUT_DEFAULT 10
59 59
60 /* 60 /*
61 * block device image metadata (in-memory version) 61 * block device image metadata (in-memory version)
62 */ 62 */
63 struct rbd_image_header { 63 struct rbd_image_header {
64 u64 image_size; 64 u64 image_size;
65 char block_name[32]; 65 char block_name[32];
66 __u8 obj_order; 66 __u8 obj_order;
67 __u8 crypt_type; 67 __u8 crypt_type;
68 __u8 comp_type; 68 __u8 comp_type;
69 struct rw_semaphore snap_rwsem; 69 struct rw_semaphore snap_rwsem;
70 struct ceph_snap_context *snapc; 70 struct ceph_snap_context *snapc;
71 size_t snap_names_len; 71 size_t snap_names_len;
72 u64 snap_seq; 72 u64 snap_seq;
73 u32 total_snaps; 73 u32 total_snaps;
74 74
75 char *snap_names; 75 char *snap_names;
76 u64 *snap_sizes; 76 u64 *snap_sizes;
77 77
78 u64 obj_version; 78 u64 obj_version;
79 }; 79 };
80 80
81 struct rbd_options { 81 struct rbd_options {
82 int notify_timeout; 82 int notify_timeout;
83 }; 83 };
84 84
85 /* 85 /*
86 * an instance of the client. multiple devices may share a client. 86 * an instance of the client. multiple devices may share a client.
87 */ 87 */
88 struct rbd_client { 88 struct rbd_client {
89 struct ceph_client *client; 89 struct ceph_client *client;
90 struct rbd_options *rbd_opts; 90 struct rbd_options *rbd_opts;
91 struct kref kref; 91 struct kref kref;
92 struct list_head node; 92 struct list_head node;
93 }; 93 };
94 94
95 struct rbd_req_coll; 95 struct rbd_req_coll;
96 96
97 /* 97 /*
98 * a single io request 98 * a single io request
99 */ 99 */
100 struct rbd_request { 100 struct rbd_request {
101 struct request *rq; /* blk layer request */ 101 struct request *rq; /* blk layer request */
102 struct bio *bio; /* cloned bio */ 102 struct bio *bio; /* cloned bio */
103 struct page **pages; /* list of used pages */ 103 struct page **pages; /* list of used pages */
104 u64 len; 104 u64 len;
105 int coll_index; 105 int coll_index;
106 struct rbd_req_coll *coll; 106 struct rbd_req_coll *coll;
107 }; 107 };
108 108
109 struct rbd_req_status { 109 struct rbd_req_status {
110 int done; 110 int done;
111 int rc; 111 int rc;
112 u64 bytes; 112 u64 bytes;
113 }; 113 };
114 114
115 /* 115 /*
116 * a collection of requests 116 * a collection of requests
117 */ 117 */
118 struct rbd_req_coll { 118 struct rbd_req_coll {
119 int total; 119 int total;
120 int num_done; 120 int num_done;
121 struct kref kref; 121 struct kref kref;
122 struct rbd_req_status status[0]; 122 struct rbd_req_status status[0];
123 }; 123 };
124 124
125 struct rbd_snap { 125 struct rbd_snap {
126 struct device dev; 126 struct device dev;
127 const char *name; 127 const char *name;
128 size_t size; 128 size_t size;
129 struct list_head node; 129 struct list_head node;
130 u64 id; 130 u64 id;
131 }; 131 };
132 132
133 /* 133 /*
134 * a single device 134 * a single device
135 */ 135 */
136 struct rbd_device { 136 struct rbd_device {
137 int id; /* blkdev unique id */ 137 int id; /* blkdev unique id */
138 138
139 int major; /* blkdev assigned major */ 139 int major; /* blkdev assigned major */
140 struct gendisk *disk; /* blkdev's gendisk and rq */ 140 struct gendisk *disk; /* blkdev's gendisk and rq */
141 struct request_queue *q; 141 struct request_queue *q;
142 142
143 struct ceph_client *client; 143 struct ceph_client *client;
144 struct rbd_client *rbd_client; 144 struct rbd_client *rbd_client;
145 145
146 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 146 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
147 147
148 spinlock_t lock; /* queue lock */ 148 spinlock_t lock; /* queue lock */
149 149
150 struct rbd_image_header header; 150 struct rbd_image_header header;
151 char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */ 151 char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
152 int obj_len; 152 int obj_len;
153 char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */ 153 char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
154 char pool_name[RBD_MAX_POOL_NAME_LEN]; 154 char pool_name[RBD_MAX_POOL_NAME_LEN];
155 int poolid; 155 int poolid;
156 156
157 struct ceph_osd_event *watch_event; 157 struct ceph_osd_event *watch_event;
158 struct ceph_osd_request *watch_request; 158 struct ceph_osd_request *watch_request;
159 159
160 char snap_name[RBD_MAX_SNAP_NAME_LEN]; 160 char snap_name[RBD_MAX_SNAP_NAME_LEN];
161 u32 cur_snap; /* index+1 of current snapshot within snap context 161 u32 cur_snap; /* index+1 of current snapshot within snap context
162 0 - for the head */ 162 0 - for the head */
163 int read_only; 163 int read_only;
164 164
165 struct list_head node; 165 struct list_head node;
166 166
167 /* list of snapshots */ 167 /* list of snapshots */
168 struct list_head snaps; 168 struct list_head snaps;
169 169
170 /* sysfs related */ 170 /* sysfs related */
171 struct device dev; 171 struct device dev;
172 }; 172 };
173 173
174 static struct bus_type rbd_bus_type = { 174 static struct bus_type rbd_bus_type = {
175 .name = "rbd", 175 .name = "rbd",
176 }; 176 };
177 177
178 static spinlock_t node_lock; /* protects client get/put */ 178 static spinlock_t node_lock; /* protects client get/put */
179 179
180 static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ 180 static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
181 static LIST_HEAD(rbd_dev_list); /* devices */ 181 static LIST_HEAD(rbd_dev_list); /* devices */
182 static LIST_HEAD(rbd_client_list); /* clients */ 182 static LIST_HEAD(rbd_client_list); /* clients */
183 183
184 static int __rbd_init_snaps_header(struct rbd_device *rbd_dev); 184 static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
185 static void rbd_dev_release(struct device *dev); 185 static void rbd_dev_release(struct device *dev);
186 static ssize_t rbd_snap_add(struct device *dev, 186 static ssize_t rbd_snap_add(struct device *dev,
187 struct device_attribute *attr, 187 struct device_attribute *attr,
188 const char *buf, 188 const char *buf,
189 size_t count); 189 size_t count);
190 static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev, 190 static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
191 struct rbd_snap *snap); 191 struct rbd_snap *snap);
192 192
193 193
194 static struct rbd_device *dev_to_rbd(struct device *dev) 194 static struct rbd_device *dev_to_rbd(struct device *dev)
195 { 195 {
196 return container_of(dev, struct rbd_device, dev); 196 return container_of(dev, struct rbd_device, dev);
197 } 197 }
198 198
199 static struct device *rbd_get_dev(struct rbd_device *rbd_dev) 199 static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
200 { 200 {
201 return get_device(&rbd_dev->dev); 201 return get_device(&rbd_dev->dev);
202 } 202 }
203 203
204 static void rbd_put_dev(struct rbd_device *rbd_dev) 204 static void rbd_put_dev(struct rbd_device *rbd_dev)
205 { 205 {
206 put_device(&rbd_dev->dev); 206 put_device(&rbd_dev->dev);
207 } 207 }
208 208
209 static int __rbd_update_snaps(struct rbd_device *rbd_dev); 209 static int __rbd_update_snaps(struct rbd_device *rbd_dev);
210 210
211 static int rbd_open(struct block_device *bdev, fmode_t mode) 211 static int rbd_open(struct block_device *bdev, fmode_t mode)
212 { 212 {
213 struct gendisk *disk = bdev->bd_disk; 213 struct gendisk *disk = bdev->bd_disk;
214 struct rbd_device *rbd_dev = disk->private_data; 214 struct rbd_device *rbd_dev = disk->private_data;
215 215
216 rbd_get_dev(rbd_dev); 216 rbd_get_dev(rbd_dev);
217 217
218 set_device_ro(bdev, rbd_dev->read_only); 218 set_device_ro(bdev, rbd_dev->read_only);
219 219
220 if ((mode & FMODE_WRITE) && rbd_dev->read_only) 220 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
221 return -EROFS; 221 return -EROFS;
222 222
223 return 0; 223 return 0;
224 } 224 }
225 225
226 static int rbd_release(struct gendisk *disk, fmode_t mode) 226 static int rbd_release(struct gendisk *disk, fmode_t mode)
227 { 227 {
228 struct rbd_device *rbd_dev = disk->private_data; 228 struct rbd_device *rbd_dev = disk->private_data;
229 229
230 rbd_put_dev(rbd_dev); 230 rbd_put_dev(rbd_dev);
231 231
232 return 0; 232 return 0;
233 } 233 }
234 234
235 static const struct block_device_operations rbd_bd_ops = { 235 static const struct block_device_operations rbd_bd_ops = {
236 .owner = THIS_MODULE, 236 .owner = THIS_MODULE,
237 .open = rbd_open, 237 .open = rbd_open,
238 .release = rbd_release, 238 .release = rbd_release,
239 }; 239 };
240 240
241 /* 241 /*
242 * Initialize an rbd client instance. 242 * Initialize an rbd client instance.
243 * We own *opt. 243 * We own *opt.
244 */ 244 */
245 static struct rbd_client *rbd_client_create(struct ceph_options *opt, 245 static struct rbd_client *rbd_client_create(struct ceph_options *opt,
246 struct rbd_options *rbd_opts) 246 struct rbd_options *rbd_opts)
247 { 247 {
248 struct rbd_client *rbdc; 248 struct rbd_client *rbdc;
249 int ret = -ENOMEM; 249 int ret = -ENOMEM;
250 250
251 dout("rbd_client_create\n"); 251 dout("rbd_client_create\n");
252 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 252 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
253 if (!rbdc) 253 if (!rbdc)
254 goto out_opt; 254 goto out_opt;
255 255
256 kref_init(&rbdc->kref); 256 kref_init(&rbdc->kref);
257 INIT_LIST_HEAD(&rbdc->node); 257 INIT_LIST_HEAD(&rbdc->node);
258 258
259 rbdc->client = ceph_create_client(opt, rbdc, 0, 0); 259 rbdc->client = ceph_create_client(opt, rbdc, 0, 0);
260 if (IS_ERR(rbdc->client)) 260 if (IS_ERR(rbdc->client))
261 goto out_rbdc; 261 goto out_rbdc;
262 opt = NULL; /* Now rbdc->client is responsible for opt */ 262 opt = NULL; /* Now rbdc->client is responsible for opt */
263 263
264 ret = ceph_open_session(rbdc->client); 264 ret = ceph_open_session(rbdc->client);
265 if (ret < 0) 265 if (ret < 0)
266 goto out_err; 266 goto out_err;
267 267
268 rbdc->rbd_opts = rbd_opts; 268 rbdc->rbd_opts = rbd_opts;
269 269
270 spin_lock(&node_lock); 270 spin_lock(&node_lock);
271 list_add_tail(&rbdc->node, &rbd_client_list); 271 list_add_tail(&rbdc->node, &rbd_client_list);
272 spin_unlock(&node_lock); 272 spin_unlock(&node_lock);
273 273
274 dout("rbd_client_create created %p\n", rbdc); 274 dout("rbd_client_create created %p\n", rbdc);
275 return rbdc; 275 return rbdc;
276 276
277 out_err: 277 out_err:
278 ceph_destroy_client(rbdc->client); 278 ceph_destroy_client(rbdc->client);
279 out_rbdc: 279 out_rbdc:
280 kfree(rbdc); 280 kfree(rbdc);
281 out_opt: 281 out_opt:
282 if (opt) 282 if (opt)
283 ceph_destroy_options(opt); 283 ceph_destroy_options(opt);
284 return ERR_PTR(ret); 284 return ERR_PTR(ret);
285 } 285 }
286 286
287 /* 287 /*
288 * Find a ceph client with specific addr and configuration. 288 * Find a ceph client with specific addr and configuration.
289 */ 289 */
290 static struct rbd_client *__rbd_client_find(struct ceph_options *opt) 290 static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
291 { 291 {
292 struct rbd_client *client_node; 292 struct rbd_client *client_node;
293 293
294 if (opt->flags & CEPH_OPT_NOSHARE) 294 if (opt->flags & CEPH_OPT_NOSHARE)
295 return NULL; 295 return NULL;
296 296
297 list_for_each_entry(client_node, &rbd_client_list, node) 297 list_for_each_entry(client_node, &rbd_client_list, node)
298 if (ceph_compare_options(opt, client_node->client) == 0) 298 if (ceph_compare_options(opt, client_node->client) == 0)
299 return client_node; 299 return client_node;
300 return NULL; 300 return NULL;
301 } 301 }
302 302
303 /* 303 /*
304 * mount options 304 * mount options
305 */ 305 */
306 enum { 306 enum {
307 Opt_notify_timeout, 307 Opt_notify_timeout,
308 Opt_last_int, 308 Opt_last_int,
309 /* int args above */ 309 /* int args above */
310 Opt_last_string, 310 Opt_last_string,
311 /* string args above */ 311 /* string args above */
312 }; 312 };
313 313
314 static match_table_t rbdopt_tokens = { 314 static match_table_t rbdopt_tokens = {
315 {Opt_notify_timeout, "notify_timeout=%d"}, 315 {Opt_notify_timeout, "notify_timeout=%d"},
316 /* int args above */ 316 /* int args above */
317 /* string args above */ 317 /* string args above */
318 {-1, NULL} 318 {-1, NULL}
319 }; 319 };
320 320
321 static int parse_rbd_opts_token(char *c, void *private) 321 static int parse_rbd_opts_token(char *c, void *private)
322 { 322 {
323 struct rbd_options *rbdopt = private; 323 struct rbd_options *rbdopt = private;
324 substring_t argstr[MAX_OPT_ARGS]; 324 substring_t argstr[MAX_OPT_ARGS];
325 int token, intval, ret; 325 int token, intval, ret;
326 326
327 token = match_token((char *)c, rbdopt_tokens, argstr); 327 token = match_token((char *)c, rbdopt_tokens, argstr);
328 if (token < 0) 328 if (token < 0)
329 return -EINVAL; 329 return -EINVAL;
330 330
331 if (token < Opt_last_int) { 331 if (token < Opt_last_int) {
332 ret = match_int(&argstr[0], &intval); 332 ret = match_int(&argstr[0], &intval);
333 if (ret < 0) { 333 if (ret < 0) {
334 pr_err("bad mount option arg (not int) " 334 pr_err("bad mount option arg (not int) "
335 "at '%s'\n", c); 335 "at '%s'\n", c);
336 return ret; 336 return ret;
337 } 337 }
338 dout("got int token %d val %d\n", token, intval); 338 dout("got int token %d val %d\n", token, intval);
339 } else if (token > Opt_last_int && token < Opt_last_string) { 339 } else if (token > Opt_last_int && token < Opt_last_string) {
340 dout("got string token %d val %s\n", token, 340 dout("got string token %d val %s\n", token,
341 argstr[0].from); 341 argstr[0].from);
342 } else { 342 } else {
343 dout("got token %d\n", token); 343 dout("got token %d\n", token);
344 } 344 }
345 345
346 switch (token) { 346 switch (token) {
347 case Opt_notify_timeout: 347 case Opt_notify_timeout:
348 rbdopt->notify_timeout = intval; 348 rbdopt->notify_timeout = intval;
349 break; 349 break;
350 default: 350 default:
351 BUG_ON(token); 351 BUG_ON(token);
352 } 352 }
353 return 0; 353 return 0;
354 } 354 }
355 355
356 /* 356 /*
357 * Get a ceph client with specific addr and configuration, if one does 357 * Get a ceph client with specific addr and configuration, if one does
358 * not exist create it. 358 * not exist create it.
359 */ 359 */
360 static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, 360 static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
361 char *options) 361 char *options)
362 { 362 {
363 struct rbd_client *rbdc; 363 struct rbd_client *rbdc;
364 struct ceph_options *opt; 364 struct ceph_options *opt;
365 int ret; 365 int ret;
366 struct rbd_options *rbd_opts; 366 struct rbd_options *rbd_opts;
367 367
368 rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL); 368 rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
369 if (!rbd_opts) 369 if (!rbd_opts)
370 return -ENOMEM; 370 return -ENOMEM;
371 371
372 rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; 372 rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
373 373
374 ret = ceph_parse_options(&opt, options, mon_addr, 374 ret = ceph_parse_options(&opt, options, mon_addr,
375 mon_addr + strlen(mon_addr), parse_rbd_opts_token, rbd_opts); 375 mon_addr + strlen(mon_addr), parse_rbd_opts_token, rbd_opts);
376 if (ret < 0) 376 if (ret < 0)
377 goto done_err; 377 goto done_err;
378 378
379 spin_lock(&node_lock); 379 spin_lock(&node_lock);
380 rbdc = __rbd_client_find(opt); 380 rbdc = __rbd_client_find(opt);
381 if (rbdc) { 381 if (rbdc) {
382 ceph_destroy_options(opt); 382 ceph_destroy_options(opt);
383 kfree(rbd_opts);
383 384
384 /* using an existing client */ 385 /* using an existing client */
385 kref_get(&rbdc->kref); 386 kref_get(&rbdc->kref);
386 rbd_dev->rbd_client = rbdc; 387 rbd_dev->rbd_client = rbdc;
387 rbd_dev->client = rbdc->client; 388 rbd_dev->client = rbdc->client;
388 spin_unlock(&node_lock); 389 spin_unlock(&node_lock);
389 return 0; 390 return 0;
390 } 391 }
391 spin_unlock(&node_lock); 392 spin_unlock(&node_lock);
392 393
393 rbdc = rbd_client_create(opt, rbd_opts); 394 rbdc = rbd_client_create(opt, rbd_opts);
394 if (IS_ERR(rbdc)) { 395 if (IS_ERR(rbdc)) {
395 ret = PTR_ERR(rbdc); 396 ret = PTR_ERR(rbdc);
396 goto done_err; 397 goto done_err;
397 } 398 }
398 399
399 rbd_dev->rbd_client = rbdc; 400 rbd_dev->rbd_client = rbdc;
400 rbd_dev->client = rbdc->client; 401 rbd_dev->client = rbdc->client;
401 return 0; 402 return 0;
402 done_err: 403 done_err:
403 kfree(rbd_opts); 404 kfree(rbd_opts);
404 return ret; 405 return ret;
405 } 406 }
406 407
407 /* 408 /*
408 * Destroy ceph client 409 * Destroy ceph client
410 *
411 * Caller must hold node_lock.
409 */ 412 */
410 static void rbd_client_release(struct kref *kref) 413 static void rbd_client_release(struct kref *kref)
411 { 414 {
412 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 415 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
413 416
414 dout("rbd_release_client %p\n", rbdc); 417 dout("rbd_release_client %p\n", rbdc);
415 spin_lock(&node_lock);
416 list_del(&rbdc->node); 418 list_del(&rbdc->node);
417 spin_unlock(&node_lock);
418 419
419 ceph_destroy_client(rbdc->client); 420 ceph_destroy_client(rbdc->client);
420 kfree(rbdc->rbd_opts); 421 kfree(rbdc->rbd_opts);
421 kfree(rbdc); 422 kfree(rbdc);
422 } 423 }
423 424
424 /* 425 /*
425 * Drop reference to ceph client node. If it's not referenced anymore, release 426 * Drop reference to ceph client node. If it's not referenced anymore, release
426 * it. 427 * it.
427 */ 428 */
428 static void rbd_put_client(struct rbd_device *rbd_dev) 429 static void rbd_put_client(struct rbd_device *rbd_dev)
429 { 430 {
431 spin_lock(&node_lock);
430 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release); 432 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
433 spin_unlock(&node_lock);
431 rbd_dev->rbd_client = NULL; 434 rbd_dev->rbd_client = NULL;
432 rbd_dev->client = NULL; 435 rbd_dev->client = NULL;
433 } 436 }
434 437
435 /* 438 /*
436 * Destroy requests collection 439 * Destroy requests collection
437 */ 440 */
438 static void rbd_coll_release(struct kref *kref) 441 static void rbd_coll_release(struct kref *kref)
439 { 442 {
440 struct rbd_req_coll *coll = 443 struct rbd_req_coll *coll =
441 container_of(kref, struct rbd_req_coll, kref); 444 container_of(kref, struct rbd_req_coll, kref);
442 445
443 dout("rbd_coll_release %p\n", coll); 446 dout("rbd_coll_release %p\n", coll);
444 kfree(coll); 447 kfree(coll);
445 } 448 }
446 449
447 /* 450 /*
448 * Create a new header structure, translate header format from the on-disk 451 * Create a new header structure, translate header format from the on-disk
449 * header. 452 * header.
450 */ 453 */
451 static int rbd_header_from_disk(struct rbd_image_header *header, 454 static int rbd_header_from_disk(struct rbd_image_header *header,
452 struct rbd_image_header_ondisk *ondisk, 455 struct rbd_image_header_ondisk *ondisk,
453 int allocated_snaps, 456 int allocated_snaps,
454 gfp_t gfp_flags) 457 gfp_t gfp_flags)
455 { 458 {
456 int i; 459 int i;
457 u32 snap_count = le32_to_cpu(ondisk->snap_count); 460 u32 snap_count = le32_to_cpu(ondisk->snap_count);
458 int ret = -ENOMEM; 461 int ret = -ENOMEM;
459 462
460 if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT))) { 463 if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT))) {
461 return -ENXIO; 464 return -ENXIO;
462 } 465 }
463 466
464 init_rwsem(&header->snap_rwsem); 467 init_rwsem(&header->snap_rwsem);
465 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); 468 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
466 header->snapc = kmalloc(sizeof(struct ceph_snap_context) + 469 header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
467 snap_count * 470 snap_count *
468 sizeof(struct rbd_image_snap_ondisk), 471 sizeof(struct rbd_image_snap_ondisk),
469 gfp_flags); 472 gfp_flags);
470 if (!header->snapc) 473 if (!header->snapc)
471 return -ENOMEM; 474 return -ENOMEM;
472 if (snap_count) { 475 if (snap_count) {
473 header->snap_names = kmalloc(header->snap_names_len, 476 header->snap_names = kmalloc(header->snap_names_len,
474 GFP_KERNEL); 477 GFP_KERNEL);
475 if (!header->snap_names) 478 if (!header->snap_names)
476 goto err_snapc; 479 goto err_snapc;
477 header->snap_sizes = kmalloc(snap_count * sizeof(u64), 480 header->snap_sizes = kmalloc(snap_count * sizeof(u64),
478 GFP_KERNEL); 481 GFP_KERNEL);
479 if (!header->snap_sizes) 482 if (!header->snap_sizes)
480 goto err_names; 483 goto err_names;
481 } else { 484 } else {
482 header->snap_names = NULL; 485 header->snap_names = NULL;
483 header->snap_sizes = NULL; 486 header->snap_sizes = NULL;
484 } 487 }
485 memcpy(header->block_name, ondisk->block_name, 488 memcpy(header->block_name, ondisk->block_name,
486 sizeof(ondisk->block_name)); 489 sizeof(ondisk->block_name));
487 490
488 header->image_size = le64_to_cpu(ondisk->image_size); 491 header->image_size = le64_to_cpu(ondisk->image_size);
489 header->obj_order = ondisk->options.order; 492 header->obj_order = ondisk->options.order;
490 header->crypt_type = ondisk->options.crypt_type; 493 header->crypt_type = ondisk->options.crypt_type;
491 header->comp_type = ondisk->options.comp_type; 494 header->comp_type = ondisk->options.comp_type;
492 495
493 atomic_set(&header->snapc->nref, 1); 496 atomic_set(&header->snapc->nref, 1);
494 header->snap_seq = le64_to_cpu(ondisk->snap_seq); 497 header->snap_seq = le64_to_cpu(ondisk->snap_seq);
495 header->snapc->num_snaps = snap_count; 498 header->snapc->num_snaps = snap_count;
496 header->total_snaps = snap_count; 499 header->total_snaps = snap_count;
497 500
498 if (snap_count && 501 if (snap_count &&
499 allocated_snaps == snap_count) { 502 allocated_snaps == snap_count) {
500 for (i = 0; i < snap_count; i++) { 503 for (i = 0; i < snap_count; i++) {
501 header->snapc->snaps[i] = 504 header->snapc->snaps[i] =
502 le64_to_cpu(ondisk->snaps[i].id); 505 le64_to_cpu(ondisk->snaps[i].id);
503 header->snap_sizes[i] = 506 header->snap_sizes[i] =
504 le64_to_cpu(ondisk->snaps[i].image_size); 507 le64_to_cpu(ondisk->snaps[i].image_size);
505 } 508 }
506 509
507 /* copy snapshot names */ 510 /* copy snapshot names */
508 memcpy(header->snap_names, &ondisk->snaps[i], 511 memcpy(header->snap_names, &ondisk->snaps[i],
509 header->snap_names_len); 512 header->snap_names_len);
510 } 513 }
511 514
512 return 0; 515 return 0;
513 516
514 err_names: 517 err_names:
515 kfree(header->snap_names); 518 kfree(header->snap_names);
516 err_snapc: 519 err_snapc:
517 kfree(header->snapc); 520 kfree(header->snapc);
518 return ret; 521 return ret;
519 } 522 }
520 523
521 static int snap_index(struct rbd_image_header *header, int snap_num) 524 static int snap_index(struct rbd_image_header *header, int snap_num)
522 { 525 {
523 return header->total_snaps - snap_num; 526 return header->total_snaps - snap_num;
524 } 527 }
525 528
526 static u64 cur_snap_id(struct rbd_device *rbd_dev) 529 static u64 cur_snap_id(struct rbd_device *rbd_dev)
527 { 530 {
528 struct rbd_image_header *header = &rbd_dev->header; 531 struct rbd_image_header *header = &rbd_dev->header;
529 532
530 if (!rbd_dev->cur_snap) 533 if (!rbd_dev->cur_snap)
531 return 0; 534 return 0;
532 535
533 return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)]; 536 return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
534 } 537 }
535 538
536 static int snap_by_name(struct rbd_image_header *header, const char *snap_name, 539 static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
537 u64 *seq, u64 *size) 540 u64 *seq, u64 *size)
538 { 541 {
539 int i; 542 int i;
540 char *p = header->snap_names; 543 char *p = header->snap_names;
541 544
542 for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) { 545 for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
543 if (strcmp(snap_name, p) == 0) 546 if (strcmp(snap_name, p) == 0)
544 break; 547 break;
545 } 548 }
546 if (i == header->total_snaps) 549 if (i == header->total_snaps)
547 return -ENOENT; 550 return -ENOENT;
548 if (seq) 551 if (seq)
549 *seq = header->snapc->snaps[i]; 552 *seq = header->snapc->snaps[i];
550 553
551 if (size) 554 if (size)
552 *size = header->snap_sizes[i]; 555 *size = header->snap_sizes[i];
553 556
554 return i; 557 return i;
555 } 558 }
556 559
557 static int rbd_header_set_snap(struct rbd_device *dev, 560 static int rbd_header_set_snap(struct rbd_device *dev,
558 const char *snap_name, 561 const char *snap_name,
559 u64 *size) 562 u64 *size)
560 { 563 {
561 struct rbd_image_header *header = &dev->header; 564 struct rbd_image_header *header = &dev->header;
562 struct ceph_snap_context *snapc = header->snapc; 565 struct ceph_snap_context *snapc = header->snapc;
563 int ret = -ENOENT; 566 int ret = -ENOENT;
564 567
565 down_write(&header->snap_rwsem); 568 down_write(&header->snap_rwsem);
566 569
567 if (!snap_name || 570 if (!snap_name ||
568 !*snap_name || 571 !*snap_name ||
569 strcmp(snap_name, "-") == 0 || 572 strcmp(snap_name, "-") == 0 ||
570 strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) { 573 strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) {
571 if (header->total_snaps) 574 if (header->total_snaps)
572 snapc->seq = header->snap_seq; 575 snapc->seq = header->snap_seq;
573 else 576 else
574 snapc->seq = 0; 577 snapc->seq = 0;
575 dev->cur_snap = 0; 578 dev->cur_snap = 0;
576 dev->read_only = 0; 579 dev->read_only = 0;
577 if (size) 580 if (size)
578 *size = header->image_size; 581 *size = header->image_size;
579 } else { 582 } else {
580 ret = snap_by_name(header, snap_name, &snapc->seq, size); 583 ret = snap_by_name(header, snap_name, &snapc->seq, size);
581 if (ret < 0) 584 if (ret < 0)
582 goto done; 585 goto done;
583 586
584 dev->cur_snap = header->total_snaps - ret; 587 dev->cur_snap = header->total_snaps - ret;
585 dev->read_only = 1; 588 dev->read_only = 1;
586 } 589 }
587 590
588 ret = 0; 591 ret = 0;
589 done: 592 done:
590 up_write(&header->snap_rwsem); 593 up_write(&header->snap_rwsem);
591 return ret; 594 return ret;
592 } 595 }
593 596
594 static void rbd_header_free(struct rbd_image_header *header) 597 static void rbd_header_free(struct rbd_image_header *header)
595 { 598 {
596 kfree(header->snapc); 599 kfree(header->snapc);
597 kfree(header->snap_names); 600 kfree(header->snap_names);
598 kfree(header->snap_sizes); 601 kfree(header->snap_sizes);
599 } 602 }
600 603
601 /* 604 /*
602 * get the actual striped segment name, offset and length 605 * get the actual striped segment name, offset and length
603 */ 606 */
604 static u64 rbd_get_segment(struct rbd_image_header *header, 607 static u64 rbd_get_segment(struct rbd_image_header *header,
605 const char *block_name, 608 const char *block_name,
606 u64 ofs, u64 len, 609 u64 ofs, u64 len,
607 char *seg_name, u64 *segofs) 610 char *seg_name, u64 *segofs)
608 { 611 {
609 u64 seg = ofs >> header->obj_order; 612 u64 seg = ofs >> header->obj_order;
610 613
611 if (seg_name) 614 if (seg_name)
612 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN, 615 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
613 "%s.%012llx", block_name, seg); 616 "%s.%012llx", block_name, seg);
614 617
615 ofs = ofs & ((1 << header->obj_order) - 1); 618 ofs = ofs & ((1 << header->obj_order) - 1);
616 len = min_t(u64, len, (1 << header->obj_order) - ofs); 619 len = min_t(u64, len, (1 << header->obj_order) - ofs);
617 620
618 if (segofs) 621 if (segofs)
619 *segofs = ofs; 622 *segofs = ofs;
620 623
621 return len; 624 return len;
622 } 625 }
623 626
624 static int rbd_get_num_segments(struct rbd_image_header *header, 627 static int rbd_get_num_segments(struct rbd_image_header *header,
625 u64 ofs, u64 len) 628 u64 ofs, u64 len)
626 { 629 {
627 u64 start_seg = ofs >> header->obj_order; 630 u64 start_seg = ofs >> header->obj_order;
628 u64 end_seg = (ofs + len - 1) >> header->obj_order; 631 u64 end_seg = (ofs + len - 1) >> header->obj_order;
629 return end_seg - start_seg + 1; 632 return end_seg - start_seg + 1;
630 } 633 }
631 634
632 /* 635 /*
633 * returns the size of an object in the image 636 * returns the size of an object in the image
634 */ 637 */
635 static u64 rbd_obj_bytes(struct rbd_image_header *header) 638 static u64 rbd_obj_bytes(struct rbd_image_header *header)
636 { 639 {
637 return 1 << header->obj_order; 640 return 1 << header->obj_order;
638 } 641 }
639 642
640 /* 643 /*
641 * bio helpers 644 * bio helpers
642 */ 645 */
643 646
644 static void bio_chain_put(struct bio *chain) 647 static void bio_chain_put(struct bio *chain)
645 { 648 {
646 struct bio *tmp; 649 struct bio *tmp;
647 650
648 while (chain) { 651 while (chain) {
649 tmp = chain; 652 tmp = chain;
650 chain = chain->bi_next; 653 chain = chain->bi_next;
651 bio_put(tmp); 654 bio_put(tmp);
652 } 655 }
653 } 656 }
654 657
655 /* 658 /*
656 * zeros a bio chain, starting at specific offset 659 * zeros a bio chain, starting at specific offset
657 */ 660 */
658 static void zero_bio_chain(struct bio *chain, int start_ofs) 661 static void zero_bio_chain(struct bio *chain, int start_ofs)
659 { 662 {
660 struct bio_vec *bv; 663 struct bio_vec *bv;
661 unsigned long flags; 664 unsigned long flags;
662 void *buf; 665 void *buf;
663 int i; 666 int i;
664 int pos = 0; 667 int pos = 0;
665 668
666 while (chain) { 669 while (chain) {
667 bio_for_each_segment(bv, chain, i) { 670 bio_for_each_segment(bv, chain, i) {
668 if (pos + bv->bv_len > start_ofs) { 671 if (pos + bv->bv_len > start_ofs) {
669 int remainder = max(start_ofs - pos, 0); 672 int remainder = max(start_ofs - pos, 0);
670 buf = bvec_kmap_irq(bv, &flags); 673 buf = bvec_kmap_irq(bv, &flags);
671 memset(buf + remainder, 0, 674 memset(buf + remainder, 0,
672 bv->bv_len - remainder); 675 bv->bv_len - remainder);
673 bvec_kunmap_irq(buf, &flags); 676 bvec_kunmap_irq(buf, &flags);
674 } 677 }
675 pos += bv->bv_len; 678 pos += bv->bv_len;
676 } 679 }
677 680
678 chain = chain->bi_next; 681 chain = chain->bi_next;
679 } 682 }
680 } 683 }
681 684
682 /* 685 /*
683 * bio_chain_clone - clone a chain of bios up to a certain length. 686 * bio_chain_clone - clone a chain of bios up to a certain length.
684 * might return a bio_pair that will need to be released. 687 * might return a bio_pair that will need to be released.
685 */ 688 */
686 static struct bio *bio_chain_clone(struct bio **old, struct bio **next, 689 static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
687 struct bio_pair **bp, 690 struct bio_pair **bp,
688 int len, gfp_t gfpmask) 691 int len, gfp_t gfpmask)
689 { 692 {
690 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL; 693 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
691 int total = 0; 694 int total = 0;
692 695
693 if (*bp) { 696 if (*bp) {
694 bio_pair_release(*bp); 697 bio_pair_release(*bp);
695 *bp = NULL; 698 *bp = NULL;
696 } 699 }
697 700
698 while (old_chain && (total < len)) { 701 while (old_chain && (total < len)) {
699 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs); 702 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
700 if (!tmp) 703 if (!tmp)
701 goto err_out; 704 goto err_out;
702 705
703 if (total + old_chain->bi_size > len) { 706 if (total + old_chain->bi_size > len) {
704 struct bio_pair *bp; 707 struct bio_pair *bp;
705 708
706 /* 709 /*
707 * this split can only happen with a single paged bio, 710 * this split can only happen with a single paged bio,
708 * split_bio will BUG_ON if this is not the case 711 * split_bio will BUG_ON if this is not the case
709 */ 712 */
710 dout("bio_chain_clone split! total=%d remaining=%d" 713 dout("bio_chain_clone split! total=%d remaining=%d"
711 "bi_size=%d\n", 714 "bi_size=%d\n",
712 (int)total, (int)len-total, 715 (int)total, (int)len-total,
713 (int)old_chain->bi_size); 716 (int)old_chain->bi_size);
714 717
715 /* split the bio. We'll release it either in the next 718 /* split the bio. We'll release it either in the next
716 call, or it will have to be released outside */ 719 call, or it will have to be released outside */
717 bp = bio_split(old_chain, (len - total) / 512ULL); 720 bp = bio_split(old_chain, (len - total) / 512ULL);
718 if (!bp) 721 if (!bp)
719 goto err_out; 722 goto err_out;
720 723
721 __bio_clone(tmp, &bp->bio1); 724 __bio_clone(tmp, &bp->bio1);
722 725
723 *next = &bp->bio2; 726 *next = &bp->bio2;
724 } else { 727 } else {
725 __bio_clone(tmp, old_chain); 728 __bio_clone(tmp, old_chain);
726 *next = old_chain->bi_next; 729 *next = old_chain->bi_next;
727 } 730 }
728 731
729 tmp->bi_bdev = NULL; 732 tmp->bi_bdev = NULL;
730 gfpmask &= ~__GFP_WAIT; 733 gfpmask &= ~__GFP_WAIT;
731 tmp->bi_next = NULL; 734 tmp->bi_next = NULL;
732 735
733 if (!new_chain) { 736 if (!new_chain) {
734 new_chain = tail = tmp; 737 new_chain = tail = tmp;
735 } else { 738 } else {
736 tail->bi_next = tmp; 739 tail->bi_next = tmp;
737 tail = tmp; 740 tail = tmp;
738 } 741 }
739 old_chain = old_chain->bi_next; 742 old_chain = old_chain->bi_next;
740 743
741 total += tmp->bi_size; 744 total += tmp->bi_size;
742 } 745 }
743 746
744 BUG_ON(total < len); 747 BUG_ON(total < len);
745 748
746 if (tail) 749 if (tail)
747 tail->bi_next = NULL; 750 tail->bi_next = NULL;
748 751
749 *old = old_chain; 752 *old = old_chain;
750 753
751 return new_chain; 754 return new_chain;
752 755
753 err_out: 756 err_out:
754 dout("bio_chain_clone with err\n"); 757 dout("bio_chain_clone with err\n");
755 bio_chain_put(new_chain); 758 bio_chain_put(new_chain);
756 return NULL; 759 return NULL;
757 } 760 }
758 761
759 /* 762 /*
760 * helpers for osd request op vectors. 763 * helpers for osd request op vectors.
761 */ 764 */
762 static int rbd_create_rw_ops(struct ceph_osd_req_op **ops, 765 static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
763 int num_ops, 766 int num_ops,
764 int opcode, 767 int opcode,
765 u32 payload_len) 768 u32 payload_len)
766 { 769 {
767 *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1), 770 *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
768 GFP_NOIO); 771 GFP_NOIO);
769 if (!*ops) 772 if (!*ops)
770 return -ENOMEM; 773 return -ENOMEM;
771 (*ops)[0].op = opcode; 774 (*ops)[0].op = opcode;
772 /* 775 /*
773 * op extent offset and length will be set later on 776 * op extent offset and length will be set later on
774 * in calc_raw_layout() 777 * in calc_raw_layout()
775 */ 778 */
776 (*ops)[0].payload_len = payload_len; 779 (*ops)[0].payload_len = payload_len;
777 return 0; 780 return 0;
778 } 781 }
779 782
780 static void rbd_destroy_ops(struct ceph_osd_req_op *ops) 783 static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
781 { 784 {
782 kfree(ops); 785 kfree(ops);
783 } 786 }
784 787
785 static void rbd_coll_end_req_index(struct request *rq, 788 static void rbd_coll_end_req_index(struct request *rq,
786 struct rbd_req_coll *coll, 789 struct rbd_req_coll *coll,
787 int index, 790 int index,
788 int ret, u64 len) 791 int ret, u64 len)
789 { 792 {
790 struct request_queue *q; 793 struct request_queue *q;
791 int min, max, i; 794 int min, max, i;
792 795
793 dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n", 796 dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
794 coll, index, ret, len); 797 coll, index, ret, len);
795 798
796 if (!rq) 799 if (!rq)
797 return; 800 return;
798 801
799 if (!coll) { 802 if (!coll) {
800 blk_end_request(rq, ret, len); 803 blk_end_request(rq, ret, len);
801 return; 804 return;
802 } 805 }
803 806
804 q = rq->q; 807 q = rq->q;
805 808
806 spin_lock_irq(q->queue_lock); 809 spin_lock_irq(q->queue_lock);
807 coll->status[index].done = 1; 810 coll->status[index].done = 1;
808 coll->status[index].rc = ret; 811 coll->status[index].rc = ret;
809 coll->status[index].bytes = len; 812 coll->status[index].bytes = len;
810 max = min = coll->num_done; 813 max = min = coll->num_done;
811 while (max < coll->total && coll->status[max].done) 814 while (max < coll->total && coll->status[max].done)
812 max++; 815 max++;
813 816
814 for (i = min; i<max; i++) { 817 for (i = min; i<max; i++) {
815 __blk_end_request(rq, coll->status[i].rc, 818 __blk_end_request(rq, coll->status[i].rc,
816 coll->status[i].bytes); 819 coll->status[i].bytes);
817 coll->num_done++; 820 coll->num_done++;
818 kref_put(&coll->kref, rbd_coll_release); 821 kref_put(&coll->kref, rbd_coll_release);
819 } 822 }
820 spin_unlock_irq(q->queue_lock); 823 spin_unlock_irq(q->queue_lock);
821 } 824 }
822 825
823 static void rbd_coll_end_req(struct rbd_request *req, 826 static void rbd_coll_end_req(struct rbd_request *req,
824 int ret, u64 len) 827 int ret, u64 len)
825 { 828 {
826 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len); 829 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
827 } 830 }
828 831
829 /* 832 /*
830 * Send ceph osd request 833 * Send ceph osd request
831 */ 834 */
832 static int rbd_do_request(struct request *rq, 835 static int rbd_do_request(struct request *rq,
833 struct rbd_device *dev, 836 struct rbd_device *dev,
834 struct ceph_snap_context *snapc, 837 struct ceph_snap_context *snapc,
835 u64 snapid, 838 u64 snapid,
836 const char *obj, u64 ofs, u64 len, 839 const char *obj, u64 ofs, u64 len,
837 struct bio *bio, 840 struct bio *bio,
838 struct page **pages, 841 struct page **pages,
839 int num_pages, 842 int num_pages,
840 int flags, 843 int flags,
841 struct ceph_osd_req_op *ops, 844 struct ceph_osd_req_op *ops,
842 int num_reply, 845 int num_reply,
843 struct rbd_req_coll *coll, 846 struct rbd_req_coll *coll,
844 int coll_index, 847 int coll_index,
845 void (*rbd_cb)(struct ceph_osd_request *req, 848 void (*rbd_cb)(struct ceph_osd_request *req,
846 struct ceph_msg *msg), 849 struct ceph_msg *msg),
847 struct ceph_osd_request **linger_req, 850 struct ceph_osd_request **linger_req,
848 u64 *ver) 851 u64 *ver)
849 { 852 {
850 struct ceph_osd_request *req; 853 struct ceph_osd_request *req;
851 struct ceph_file_layout *layout; 854 struct ceph_file_layout *layout;
852 int ret; 855 int ret;
853 u64 bno; 856 u64 bno;
854 struct timespec mtime = CURRENT_TIME; 857 struct timespec mtime = CURRENT_TIME;
855 struct rbd_request *req_data; 858 struct rbd_request *req_data;
856 struct ceph_osd_request_head *reqhead; 859 struct ceph_osd_request_head *reqhead;
857 struct rbd_image_header *header = &dev->header; 860 struct rbd_image_header *header = &dev->header;
858 861
859 req_data = kzalloc(sizeof(*req_data), GFP_NOIO); 862 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
860 if (!req_data) { 863 if (!req_data) {
861 if (coll) 864 if (coll)
862 rbd_coll_end_req_index(rq, coll, coll_index, 865 rbd_coll_end_req_index(rq, coll, coll_index,
863 -ENOMEM, len); 866 -ENOMEM, len);
864 return -ENOMEM; 867 return -ENOMEM;
865 } 868 }
866 869
867 if (coll) { 870 if (coll) {
868 req_data->coll = coll; 871 req_data->coll = coll;
869 req_data->coll_index = coll_index; 872 req_data->coll_index = coll_index;
870 } 873 }
871 874
872 dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs); 875 dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs);
873 876
874 down_read(&header->snap_rwsem); 877 down_read(&header->snap_rwsem);
875 878
876 req = ceph_osdc_alloc_request(&dev->client->osdc, flags, 879 req = ceph_osdc_alloc_request(&dev->client->osdc, flags,
877 snapc, 880 snapc,
878 ops, 881 ops,
879 false, 882 false,
880 GFP_NOIO, pages, bio); 883 GFP_NOIO, pages, bio);
881 if (!req) { 884 if (!req) {
882 up_read(&header->snap_rwsem); 885 up_read(&header->snap_rwsem);
883 ret = -ENOMEM; 886 ret = -ENOMEM;
884 goto done_pages; 887 goto done_pages;
885 } 888 }
886 889
887 req->r_callback = rbd_cb; 890 req->r_callback = rbd_cb;
888 891
889 req_data->rq = rq; 892 req_data->rq = rq;
890 req_data->bio = bio; 893 req_data->bio = bio;
891 req_data->pages = pages; 894 req_data->pages = pages;
892 req_data->len = len; 895 req_data->len = len;
893 896
894 req->r_priv = req_data; 897 req->r_priv = req_data;
895 898
896 reqhead = req->r_request->front.iov_base; 899 reqhead = req->r_request->front.iov_base;
897 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP); 900 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
898 901
899 strncpy(req->r_oid, obj, sizeof(req->r_oid)); 902 strncpy(req->r_oid, obj, sizeof(req->r_oid));
900 req->r_oid_len = strlen(req->r_oid); 903 req->r_oid_len = strlen(req->r_oid);
901 904
902 layout = &req->r_file_layout; 905 layout = &req->r_file_layout;
903 memset(layout, 0, sizeof(*layout)); 906 memset(layout, 0, sizeof(*layout));
904 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 907 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
905 layout->fl_stripe_count = cpu_to_le32(1); 908 layout->fl_stripe_count = cpu_to_le32(1);
906 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 909 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
907 layout->fl_pg_preferred = cpu_to_le32(-1); 910 layout->fl_pg_preferred = cpu_to_le32(-1);
908 layout->fl_pg_pool = cpu_to_le32(dev->poolid); 911 layout->fl_pg_pool = cpu_to_le32(dev->poolid);
909 ceph_calc_raw_layout(&dev->client->osdc, layout, snapid, 912 ceph_calc_raw_layout(&dev->client->osdc, layout, snapid,
910 ofs, &len, &bno, req, ops); 913 ofs, &len, &bno, req, ops);
911 914
912 ceph_osdc_build_request(req, ofs, &len, 915 ceph_osdc_build_request(req, ofs, &len,
913 ops, 916 ops,
914 snapc, 917 snapc,
915 &mtime, 918 &mtime,
916 req->r_oid, req->r_oid_len); 919 req->r_oid, req->r_oid_len);
917 up_read(&header->snap_rwsem); 920 up_read(&header->snap_rwsem);
918 921
919 if (linger_req) { 922 if (linger_req) {
920 ceph_osdc_set_request_linger(&dev->client->osdc, req); 923 ceph_osdc_set_request_linger(&dev->client->osdc, req);
921 *linger_req = req; 924 *linger_req = req;
922 } 925 }
923 926
924 ret = ceph_osdc_start_request(&dev->client->osdc, req, false); 927 ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
925 if (ret < 0) 928 if (ret < 0)
926 goto done_err; 929 goto done_err;
927 930
928 if (!rbd_cb) { 931 if (!rbd_cb) {
929 ret = ceph_osdc_wait_request(&dev->client->osdc, req); 932 ret = ceph_osdc_wait_request(&dev->client->osdc, req);
930 if (ver) 933 if (ver)
931 *ver = le64_to_cpu(req->r_reassert_version.version); 934 *ver = le64_to_cpu(req->r_reassert_version.version);
932 dout("reassert_ver=%lld\n", 935 dout("reassert_ver=%lld\n",
933 le64_to_cpu(req->r_reassert_version.version)); 936 le64_to_cpu(req->r_reassert_version.version));
934 ceph_osdc_put_request(req); 937 ceph_osdc_put_request(req);
935 } 938 }
936 return ret; 939 return ret;
937 940
938 done_err: 941 done_err:
939 bio_chain_put(req_data->bio); 942 bio_chain_put(req_data->bio);
940 ceph_osdc_put_request(req); 943 ceph_osdc_put_request(req);
941 done_pages: 944 done_pages:
942 rbd_coll_end_req(req_data, ret, len); 945 rbd_coll_end_req(req_data, ret, len);
943 kfree(req_data); 946 kfree(req_data);
944 return ret; 947 return ret;
945 } 948 }
946 949
947 /* 950 /*
948 * Ceph osd op callback 951 * Ceph osd op callback
949 */ 952 */
950 static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) 953 static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
951 { 954 {
952 struct rbd_request *req_data = req->r_priv; 955 struct rbd_request *req_data = req->r_priv;
953 struct ceph_osd_reply_head *replyhead; 956 struct ceph_osd_reply_head *replyhead;
954 struct ceph_osd_op *op; 957 struct ceph_osd_op *op;
955 __s32 rc; 958 __s32 rc;
956 u64 bytes; 959 u64 bytes;
957 int read_op; 960 int read_op;
958 961
959 /* parse reply */ 962 /* parse reply */
960 replyhead = msg->front.iov_base; 963 replyhead = msg->front.iov_base;
961 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); 964 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
962 op = (void *)(replyhead + 1); 965 op = (void *)(replyhead + 1);
963 rc = le32_to_cpu(replyhead->result); 966 rc = le32_to_cpu(replyhead->result);
964 bytes = le64_to_cpu(op->extent.length); 967 bytes = le64_to_cpu(op->extent.length);
965 read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ); 968 read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);
966 969
967 dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc); 970 dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
968 971
969 if (rc == -ENOENT && read_op) { 972 if (rc == -ENOENT && read_op) {
970 zero_bio_chain(req_data->bio, 0); 973 zero_bio_chain(req_data->bio, 0);
971 rc = 0; 974 rc = 0;
972 } else if (rc == 0 && read_op && bytes < req_data->len) { 975 } else if (rc == 0 && read_op && bytes < req_data->len) {
973 zero_bio_chain(req_data->bio, bytes); 976 zero_bio_chain(req_data->bio, bytes);
974 bytes = req_data->len; 977 bytes = req_data->len;
975 } 978 }
976 979
977 rbd_coll_end_req(req_data, rc, bytes); 980 rbd_coll_end_req(req_data, rc, bytes);
978 981
979 if (req_data->bio) 982 if (req_data->bio)
980 bio_chain_put(req_data->bio); 983 bio_chain_put(req_data->bio);
981 984
982 ceph_osdc_put_request(req); 985 ceph_osdc_put_request(req);
983 kfree(req_data); 986 kfree(req_data);
984 } 987 }
985 988
986 static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) 989 static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
987 { 990 {
988 ceph_osdc_put_request(req); 991 ceph_osdc_put_request(req);
989 } 992 }
990 993
991 /* 994 /*
992 * Do a synchronous ceph osd operation 995 * Do a synchronous ceph osd operation
993 */ 996 */
994 static int rbd_req_sync_op(struct rbd_device *dev, 997 static int rbd_req_sync_op(struct rbd_device *dev,
995 struct ceph_snap_context *snapc, 998 struct ceph_snap_context *snapc,
996 u64 snapid, 999 u64 snapid,
997 int opcode, 1000 int opcode,
998 int flags, 1001 int flags,
999 struct ceph_osd_req_op *orig_ops, 1002 struct ceph_osd_req_op *orig_ops,
1000 int num_reply, 1003 int num_reply,
1001 const char *obj, 1004 const char *obj,
1002 u64 ofs, u64 len, 1005 u64 ofs, u64 len,
1003 char *buf, 1006 char *buf,
1004 struct ceph_osd_request **linger_req, 1007 struct ceph_osd_request **linger_req,
1005 u64 *ver) 1008 u64 *ver)
1006 { 1009 {
1007 int ret; 1010 int ret;
1008 struct page **pages; 1011 struct page **pages;
1009 int num_pages; 1012 int num_pages;
1010 struct ceph_osd_req_op *ops = orig_ops; 1013 struct ceph_osd_req_op *ops = orig_ops;
1011 u32 payload_len; 1014 u32 payload_len;
1012 1015
1013 num_pages = calc_pages_for(ofs , len); 1016 num_pages = calc_pages_for(ofs , len);
1014 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 1017 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1015 if (IS_ERR(pages)) 1018 if (IS_ERR(pages))
1016 return PTR_ERR(pages); 1019 return PTR_ERR(pages);
1017 1020
1018 if (!orig_ops) { 1021 if (!orig_ops) {
1019 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0); 1022 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
1020 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len); 1023 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1021 if (ret < 0) 1024 if (ret < 0)
1022 goto done; 1025 goto done;
1023 1026
1024 if ((flags & CEPH_OSD_FLAG_WRITE) && buf) { 1027 if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
1025 ret = ceph_copy_to_page_vector(pages, buf, ofs, len); 1028 ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
1026 if (ret < 0) 1029 if (ret < 0)
1027 goto done_ops; 1030 goto done_ops;
1028 } 1031 }
1029 } 1032 }
1030 1033
1031 ret = rbd_do_request(NULL, dev, snapc, snapid, 1034 ret = rbd_do_request(NULL, dev, snapc, snapid,
1032 obj, ofs, len, NULL, 1035 obj, ofs, len, NULL,
1033 pages, num_pages, 1036 pages, num_pages,
1034 flags, 1037 flags,
1035 ops, 1038 ops,
1036 2, 1039 2,
1037 NULL, 0, 1040 NULL, 0,
1038 NULL, 1041 NULL,
1039 linger_req, ver); 1042 linger_req, ver);
1040 if (ret < 0) 1043 if (ret < 0)
1041 goto done_ops; 1044 goto done_ops;
1042 1045
1043 if ((flags & CEPH_OSD_FLAG_READ) && buf) 1046 if ((flags & CEPH_OSD_FLAG_READ) && buf)
1044 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret); 1047 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1045 1048
1046 done_ops: 1049 done_ops:
1047 if (!orig_ops) 1050 if (!orig_ops)
1048 rbd_destroy_ops(ops); 1051 rbd_destroy_ops(ops);
1049 done: 1052 done:
1050 ceph_release_page_vector(pages, num_pages); 1053 ceph_release_page_vector(pages, num_pages);
1051 return ret; 1054 return ret;
1052 } 1055 }
1053 1056
1054 /* 1057 /*
1055 * Do an asynchronous ceph osd operation 1058 * Do an asynchronous ceph osd operation
1056 */ 1059 */
1057 static int rbd_do_op(struct request *rq, 1060 static int rbd_do_op(struct request *rq,
1058 struct rbd_device *rbd_dev , 1061 struct rbd_device *rbd_dev ,
1059 struct ceph_snap_context *snapc, 1062 struct ceph_snap_context *snapc,
1060 u64 snapid, 1063 u64 snapid,
1061 int opcode, int flags, int num_reply, 1064 int opcode, int flags, int num_reply,
1062 u64 ofs, u64 len, 1065 u64 ofs, u64 len,
1063 struct bio *bio, 1066 struct bio *bio,
1064 struct rbd_req_coll *coll, 1067 struct rbd_req_coll *coll,
1065 int coll_index) 1068 int coll_index)
1066 { 1069 {
1067 char *seg_name; 1070 char *seg_name;
1068 u64 seg_ofs; 1071 u64 seg_ofs;
1069 u64 seg_len; 1072 u64 seg_len;
1070 int ret; 1073 int ret;
1071 struct ceph_osd_req_op *ops; 1074 struct ceph_osd_req_op *ops;
1072 u32 payload_len; 1075 u32 payload_len;
1073 1076
1074 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); 1077 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
1075 if (!seg_name) 1078 if (!seg_name)
1076 return -ENOMEM; 1079 return -ENOMEM;
1077 1080
1078 seg_len = rbd_get_segment(&rbd_dev->header, 1081 seg_len = rbd_get_segment(&rbd_dev->header,
1079 rbd_dev->header.block_name, 1082 rbd_dev->header.block_name,
1080 ofs, len, 1083 ofs, len,
1081 seg_name, &seg_ofs); 1084 seg_name, &seg_ofs);
1082 1085
1083 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0); 1086 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1084 1087
1085 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len); 1088 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1086 if (ret < 0) 1089 if (ret < 0)
1087 goto done; 1090 goto done;
1088 1091
1089 /* we've taken care of segment sizes earlier when we 1092 /* we've taken care of segment sizes earlier when we
1090 cloned the bios. We should never have a segment 1093 cloned the bios. We should never have a segment
1091 truncated at this point */ 1094 truncated at this point */
1092 BUG_ON(seg_len < len); 1095 BUG_ON(seg_len < len);
1093 1096
1094 ret = rbd_do_request(rq, rbd_dev, snapc, snapid, 1097 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1095 seg_name, seg_ofs, seg_len, 1098 seg_name, seg_ofs, seg_len,
1096 bio, 1099 bio,
1097 NULL, 0, 1100 NULL, 0,
1098 flags, 1101 flags,
1099 ops, 1102 ops,
1100 num_reply, 1103 num_reply,
1101 coll, coll_index, 1104 coll, coll_index,
1102 rbd_req_cb, 0, NULL); 1105 rbd_req_cb, 0, NULL);
1103 1106
1104 rbd_destroy_ops(ops); 1107 rbd_destroy_ops(ops);
1105 done: 1108 done:
1106 kfree(seg_name); 1109 kfree(seg_name);
1107 return ret; 1110 return ret;
1108 } 1111 }

/*
 * Request async osd write
 */
static int rbd_req_write(struct request *rq,
                         struct rbd_device *rbd_dev,
                         struct ceph_snap_context *snapc,
                         u64 ofs, u64 len,
                         struct bio *bio,
                         struct rbd_req_coll *coll,
                         int coll_index)
{
        return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
                         CEPH_OSD_OP_WRITE,
                         CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
                         2,
                         ofs, len, bio, coll, coll_index);
}

/*
 * Request async osd read
 */
static int rbd_req_read(struct request *rq,
                        struct rbd_device *rbd_dev,
                        u64 snapid,
                        u64 ofs, u64 len,
                        struct bio *bio,
                        struct rbd_req_coll *coll,
                        int coll_index)
{
        return rbd_do_op(rq, rbd_dev, NULL,
                         (snapid ? snapid : CEPH_NOSNAP),
                         CEPH_OSD_OP_READ,
                         CEPH_OSD_FLAG_READ,
                         2,
                         ofs, len, bio, coll, coll_index);
}

/*
 * Request sync osd read
 */
static int rbd_req_sync_read(struct rbd_device *dev,
                             struct ceph_snap_context *snapc,
                             u64 snapid,
                             const char *obj,
                             u64 ofs, u64 len,
                             char *buf,
                             u64 *ver)
{
        return rbd_req_sync_op(dev, NULL,
                               (snapid ? snapid : CEPH_NOSNAP),
                               CEPH_OSD_OP_READ,
                               CEPH_OSD_FLAG_READ,
                               NULL,
                               1, obj, ofs, len, buf, NULL, ver);
}

/*
 * Request sync osd notify_ack (acknowledge a watch notification)
 */
static int rbd_req_sync_notify_ack(struct rbd_device *dev,
                                   u64 ver,
                                   u64 notify_id,
                                   const char *obj)
{
        struct ceph_osd_req_op *ops;
        struct page **pages = NULL;
        int ret;

        ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
        if (ret < 0)
                return ret;

        ops[0].watch.ver = cpu_to_le64(dev->header.obj_version);
        ops[0].watch.cookie = notify_id;
        ops[0].watch.flag = 0;

        ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP,
                             obj, 0, 0, NULL,
                             pages, 0,
                             CEPH_OSD_FLAG_READ,
                             ops,
                             1,
                             NULL, 0,
                             rbd_simple_req_cb, 0, NULL);

        rbd_destroy_ops(ops);
        return ret;
}

static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
        struct rbd_device *dev = (struct rbd_device *)data;
        int rc;

        if (!dev)
                return;

        dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
                notify_id, (int)opcode);
        mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
        rc = __rbd_update_snaps(dev);
        mutex_unlock(&ctl_mutex);
        if (rc)
                pr_warning(DRV_NAME "%d got notification but failed to update"
                           " snaps: %d\n", dev->major, rc);

        rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name);
}

/*
 * Request sync osd watch
 */
static int rbd_req_sync_watch(struct rbd_device *dev,
                              const char *obj,
                              u64 ver)
{
        struct ceph_osd_req_op *ops;
        struct ceph_osd_client *osdc = &dev->client->osdc;

        int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
        if (ret < 0)
                return ret;

        ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
                                     (void *)dev, &dev->watch_event);
        if (ret < 0)
                goto fail;

        ops[0].watch.ver = cpu_to_le64(ver);
        ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
        ops[0].watch.flag = 1;

        ret = rbd_req_sync_op(dev, NULL,
                              CEPH_NOSNAP,
                              0,
                              CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
                              ops,
                              1, obj, 0, 0, NULL,
                              &dev->watch_request, NULL);

        if (ret < 0)
                goto fail_event;

        rbd_destroy_ops(ops);
        return 0;

fail_event:
        ceph_osdc_cancel_event(dev->watch_event);
        dev->watch_event = NULL;
fail:
        rbd_destroy_ops(ops);
        return ret;
}

/*
 * Request sync osd unwatch
 */
static int rbd_req_sync_unwatch(struct rbd_device *dev,
                                const char *obj)
{
        struct ceph_osd_req_op *ops;

        int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
        if (ret < 0)
                return ret;

        ops[0].watch.ver = 0;
        ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
        ops[0].watch.flag = 0;

        ret = rbd_req_sync_op(dev, NULL,
                              CEPH_NOSNAP,
                              0,
                              CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
                              ops,
                              1, obj, 0, 0, NULL, NULL, NULL);

        rbd_destroy_ops(ops);
        ceph_osdc_cancel_event(dev->watch_event);
        dev->watch_event = NULL;
        return ret;
}

struct rbd_notify_info {
        struct rbd_device *dev;
};

static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
        struct rbd_device *dev = (struct rbd_device *)data;
        if (!dev)
                return;

        dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
                notify_id, (int)opcode);
}

/*
 * Request sync osd notify
 */
static int rbd_req_sync_notify(struct rbd_device *dev,
                               const char *obj)
{
        struct ceph_osd_req_op *ops;
        struct ceph_osd_client *osdc = &dev->client->osdc;
        struct ceph_osd_event *event;
        struct rbd_notify_info info;
        int payload_len = sizeof(u32) + sizeof(u32);
        int ret;

        ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
        if (ret < 0)
                return ret;

        info.dev = dev;

        ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
                                     (void *)&info, &event);
        if (ret < 0)
                goto fail;

        ops[0].watch.ver = 1;
        ops[0].watch.flag = 1;
        ops[0].watch.cookie = event->cookie;
        ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
        ops[0].watch.timeout = 12;

        ret = rbd_req_sync_op(dev, NULL,
                              CEPH_NOSNAP,
                              0,
                              CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
                              ops,
                              1, obj, 0, 0, NULL, NULL, NULL);
        if (ret < 0)
                goto fail_event;

        ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
        dout("ceph_osdc_wait_event returned %d\n", ret);
        rbd_destroy_ops(ops);
        return 0;

fail_event:
        ceph_osdc_cancel_event(event);
fail:
        rbd_destroy_ops(ops);
        return ret;
}

/*
 * Request sync osd class method call (exec)
 */
static int rbd_req_sync_exec(struct rbd_device *dev,
                             const char *obj,
                             const char *cls,
                             const char *method,
                             const char *data,
                             int len,
                             u64 *ver)
{
        struct ceph_osd_req_op *ops;
        int cls_len = strlen(cls);
        int method_len = strlen(method);
        int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
                                    cls_len + method_len + len);
        if (ret < 0)
                return ret;

        ops[0].cls.class_name = cls;
        ops[0].cls.class_len = (__u8)cls_len;
        ops[0].cls.method_name = method;
        ops[0].cls.method_len = (__u8)method_len;
        ops[0].cls.argc = 0;
        ops[0].cls.indata = data;
        ops[0].cls.indata_len = len;

        ret = rbd_req_sync_op(dev, NULL,
                              CEPH_NOSNAP,
                              0,
                              CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
                              ops,
                              1, obj, 0, 0, NULL, NULL, ver);

        rbd_destroy_ops(ops);

        dout("cls_exec returned %d\n", ret);
        return ret;
}

static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
{
        struct rbd_req_coll *coll =
                        kzalloc(sizeof(struct rbd_req_coll) +
                                sizeof(struct rbd_req_status) * num_reqs,
                                GFP_ATOMIC);

        if (!coll)
                return NULL;
        coll->total = num_reqs;
        kref_init(&coll->kref);
        return coll;
}
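
/*
 * [illustrative aside] The kzalloc() above carves a single block holding
 * the bookkeeping fields plus num_reqs per-segment status slots.  Below is
 * a hedged sketch of that layout; the real struct rbd_req_coll is defined
 * earlier in this file, and everything here beyond 'total' and 'kref'
 * (which the function fills in) is an assumption for illustration only.
 */
#if 0   /* layout sketch, not compiled with the driver */
struct coll_layout_sketch {
        int total;                              /* set to num_reqs */
        struct kref kref;                       /* set up by kref_init() */
        struct rbd_req_status status[];         /* one slot per segment */
};
/* sizeof() of a struct with a flexible array member excludes the array,
 * which is exactly why the allocation above adds
 * sizeof(struct rbd_req_status) * num_reqs on top. */
#endif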

/*
 * block device queue callback
 */
static void rbd_rq_fn(struct request_queue *q)
{
        struct rbd_device *rbd_dev = q->queuedata;
        struct request *rq;
        struct bio_pair *bp = NULL;

        rq = blk_fetch_request(q);

        while (1) {
                struct bio *bio;
                struct bio *rq_bio, *next_bio = NULL;
                bool do_write;
                int size, op_size = 0;
                u64 ofs;
                int num_segs, cur_seg = 0;
                struct rbd_req_coll *coll;

                /* peek at request from block layer */
                if (!rq)
                        break;

                dout("fetched request\n");

                /* filter out block requests we don't understand */
                if ((rq->cmd_type != REQ_TYPE_FS)) {
                        __blk_end_request_all(rq, 0);
                        goto next;
                }

                /* deduce our operation (read, write) */
                do_write = (rq_data_dir(rq) == WRITE);

                size = blk_rq_bytes(rq);
                ofs = blk_rq_pos(rq) * 512ULL;
                rq_bio = rq->bio;
                if (do_write && rbd_dev->read_only) {
                        __blk_end_request_all(rq, -EROFS);
                        goto next;
                }

                spin_unlock_irq(q->queue_lock);

                dout("%s 0x%x bytes at 0x%llx\n",
                     do_write ? "write" : "read",
                     size, blk_rq_pos(rq) * 512ULL);

                num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
                coll = rbd_alloc_coll(num_segs);
                if (!coll) {
                        spin_lock_irq(q->queue_lock);
                        __blk_end_request_all(rq, -ENOMEM);
                        goto next;
                }

                do {
                        /* a bio clone to be passed down to OSD req */
                        dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
                        op_size = rbd_get_segment(&rbd_dev->header,
                                                  rbd_dev->header.block_name,
                                                  ofs, size,
                                                  NULL, NULL);
                        kref_get(&coll->kref);
                        bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
                                              op_size, GFP_ATOMIC);
                        if (!bio) {
                                rbd_coll_end_req_index(rq, coll, cur_seg,
                                                       -ENOMEM, op_size);
                                goto next_seg;
                        }


                        /* init OSD command: write or read */
                        if (do_write)
                                rbd_req_write(rq, rbd_dev,
                                              rbd_dev->header.snapc,
                                              ofs,
                                              op_size, bio,
                                              coll, cur_seg);
                        else
                                rbd_req_read(rq, rbd_dev,
                                             cur_snap_id(rbd_dev),
                                             ofs,
                                             op_size, bio,
                                             coll, cur_seg);

next_seg:
                        size -= op_size;
                        ofs += op_size;

                        cur_seg++;
                        rq_bio = next_bio;
                } while (size > 0);
                kref_put(&coll->kref, rbd_coll_release);

                if (bp)
                        bio_pair_release(bp);
                spin_lock_irq(q->queue_lock);
next:
                rq = blk_fetch_request(q);
        }
}
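
/*
 * [illustrative aside] The do/while loop above carves each block-layer
 * request into per-object chunks via rbd_get_segment().  Below is a
 * hedged, standalone rerun of just the offset arithmetic, assuming 4 MiB
 * objects; the computation mimics what the driver does for each segment
 * but is not the driver's helper.
 */
#if 0   /* userspace demo, not compiled with the driver */
#include <stdio.h>

int main(void)
{
        unsigned long long obj_bytes = 1ULL << 22;      /* 4 MiB objects */
        unsigned long long ofs = 3ULL << 20;            /* request start: 3 MiB */
        unsigned long long size = 6ULL << 20;           /* request length: 6 MiB */

        while (size > 0) {
                /* bytes left in the object that contains 'ofs' */
                unsigned long long seg = obj_bytes - (ofs & (obj_bytes - 1));

                if (seg > size)
                        seg = size;
                printf("object %llu: ofs=%llu len=%llu\n",
                       ofs / obj_bytes, ofs & (obj_bytes - 1), seg);
                ofs += seg;
                size -= seg;
        }
        return 0;       /* prints three segments: 1 MiB, 4 MiB, 1 MiB */
}
#endif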

/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
                          struct bio_vec *bvec)
{
        struct rbd_device *rbd_dev = q->queuedata;
        unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
        sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
        unsigned int bio_sectors = bmd->bi_size >> 9;
        int max;

        max = (chunk_sectors - ((sector & (chunk_sectors - 1))
                                + bio_sectors)) << 9;
        if (max < 0)
                max = 0; /* bio_add cannot handle a negative return */
        if (max <= bvec->bv_len && bio_sectors == 0)
                return bvec->bv_len;
        return max;
}
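
/*
 * [illustrative aside] A hedged numeric walk of the boundary check above,
 * with made-up values: obj_order = 22 (4 MiB objects), so chunk_sectors
 * comes out to 8192.
 */
#if 0   /* userspace demo, not compiled with the driver */
#include <stdio.h>

int main(void)
{
        unsigned int chunk_sectors = 1 << (22 - 9);     /* 8192 sectors */
        unsigned long long sector = 8000;               /* bio start sector */
        unsigned int bio_sectors = 128;                 /* already in the bio */
        int max;

        /* same formula as rbd_merge_bvec(): sectors of room left before
         * the object boundary, converted back to bytes */
        max = (int)(chunk_sectors - ((sector & (chunk_sectors - 1))
                                     + bio_sectors)) << 9;
        if (max < 0)
                max = 0;
        printf("max = %d\n", max);      /* 64 sectors -> 32768 bytes */
        return 0;
}
#endif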

static void rbd_free_disk(struct rbd_device *rbd_dev)
{
        struct gendisk *disk = rbd_dev->disk;

        if (!disk)
                return;

        rbd_header_free(&rbd_dev->header);

        if (disk->flags & GENHD_FL_UP)
                del_gendisk(disk);
        if (disk->queue)
                blk_cleanup_queue(disk->queue);
        put_disk(disk);
}

/*
 * reload the on-disk header
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
                           struct rbd_image_header *header)
{
        ssize_t rc;
        struct rbd_image_header_ondisk *dh;
        int snap_count = 0;
        u64 snap_names_len = 0;
        u64 ver;

        while (1) {
                int len = sizeof(*dh) +
                          snap_count * sizeof(struct rbd_image_snap_ondisk) +
                          snap_names_len;

                rc = -ENOMEM;
                dh = kmalloc(len, GFP_KERNEL);
                if (!dh)
                        return -ENOMEM;

                rc = rbd_req_sync_read(rbd_dev,
                                       NULL, CEPH_NOSNAP,
                                       rbd_dev->obj_md_name,
                                       0, len,
                                       (char *)dh, &ver);
                if (rc < 0)
                        goto out_dh;

                rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
                if (rc < 0) {
                        if (rc == -ENXIO) {
                                pr_warning("unrecognized header format"
                                           " for image %s", rbd_dev->obj);
                        }
                        goto out_dh;
                }

                if (snap_count != header->total_snaps) {
                        snap_count = header->total_snaps;
                        snap_names_len = header->snap_names_len;
                        rbd_header_free(header);
                        kfree(dh);
                        continue;
                }
                break;
        }
        header->obj_version = ver;

out_dh:
        kfree(dh);
        return rc;
}

/*
 * create a snapshot
 */
static int rbd_header_add_snap(struct rbd_device *dev,
                               const char *snap_name,
                               gfp_t gfp_flags)
{
        int name_len = strlen(snap_name);
        u64 new_snapid;
        int ret;
        void *data, *p, *e;
        u64 ver;

        /* we should create a snapshot only if we're pointing at the head */
        if (dev->cur_snap)
                return -EINVAL;

        ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid,
                                      &new_snapid);
        dout("created snapid=%lld\n", new_snapid);
        if (ret < 0)
                return ret;

        data = kmalloc(name_len + 16, gfp_flags);
        if (!data)
                return -ENOMEM;

        p = data;
        e = data + name_len + 16;

        ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
        ceph_encode_64_safe(&p, e, new_snapid, bad);

        ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
                                data, p - data, &ver);

        kfree(data);

        if (ret < 0)
                return ret;

        dev->header.snapc->seq = new_snapid;

        return 0;
bad:
        return -ERANGE;
}

static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
{
        struct rbd_snap *snap;

        while (!list_empty(&rbd_dev->snaps)) {
                snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
                __rbd_remove_snap_dev(rbd_dev, snap);
        }
}

/*
 * re-read the on-disk header and bring the in-memory snapshot
 * context in line with it
 */
static int __rbd_update_snaps(struct rbd_device *rbd_dev)
{
        int ret;
        struct rbd_image_header h;
        u64 snap_seq;
        int follow_seq = 0;

        ret = rbd_read_header(rbd_dev, &h);
        if (ret < 0)
                return ret;

        /* resized? */
        set_capacity(rbd_dev->disk, h.image_size / 512ULL);

        down_write(&rbd_dev->header.snap_rwsem);

        snap_seq = rbd_dev->header.snapc->seq;
        if (rbd_dev->header.total_snaps &&
            rbd_dev->header.snapc->snaps[0] == snap_seq)
                /* pointing at the head, will need to follow that
                   if head moves */
                follow_seq = 1;

        kfree(rbd_dev->header.snapc);
        kfree(rbd_dev->header.snap_names);
        kfree(rbd_dev->header.snap_sizes);

        rbd_dev->header.total_snaps = h.total_snaps;
        rbd_dev->header.snapc = h.snapc;
        rbd_dev->header.snap_names = h.snap_names;
        rbd_dev->header.snap_names_len = h.snap_names_len;
        rbd_dev->header.snap_sizes = h.snap_sizes;
        if (follow_seq)
                rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
        else
                rbd_dev->header.snapc->seq = snap_seq;

        ret = __rbd_init_snaps_header(rbd_dev);

        up_write(&rbd_dev->header.snap_rwsem);

        return ret;
}

static int rbd_init_disk(struct rbd_device *rbd_dev)
{
        struct gendisk *disk;
        struct request_queue *q;
        int rc;
        u64 total_size = 0;

        /* contact OSD, request size info about the object being mapped */
        rc = rbd_read_header(rbd_dev, &rbd_dev->header);
        if (rc)
                return rc;

        /* no need to lock here, as rbd_dev is not registered yet */
        rc = __rbd_init_snaps_header(rbd_dev);
        if (rc)
                return rc;

        rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
        if (rc)
                return rc;

        /* create gendisk info */
        rc = -ENOMEM;
        disk = alloc_disk(RBD_MINORS_PER_MAJOR);
        if (!disk)
                goto out;

        snprintf(disk->disk_name, sizeof(disk->disk_name), DRV_NAME "%d",
                 rbd_dev->id);
        disk->major = rbd_dev->major;
        disk->first_minor = 0;
        disk->fops = &rbd_bd_ops;
        disk->private_data = rbd_dev;

        /* init rq */
        rc = -ENOMEM;
        q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
        if (!q)
                goto out_disk;

        /* set io sizes to object size */
        blk_queue_max_hw_sectors(q, rbd_obj_bytes(&rbd_dev->header) / 512ULL);
        blk_queue_max_segment_size(q, rbd_obj_bytes(&rbd_dev->header));
        blk_queue_io_min(q, rbd_obj_bytes(&rbd_dev->header));
        blk_queue_io_opt(q, rbd_obj_bytes(&rbd_dev->header));

        blk_queue_merge_bvec(q, rbd_merge_bvec);
        disk->queue = q;

        q->queuedata = rbd_dev;

        rbd_dev->disk = disk;
        rbd_dev->q = q;

        /* finally, announce the disk to the world */
        set_capacity(disk, total_size / 512ULL);
        add_disk(disk);

        pr_info("%s: added with size 0x%llx\n",
                disk->disk_name, (unsigned long long)total_size);
        return 0;

out_disk:
        put_disk(disk);
out:
        return rc;
}

/*
   sysfs
*/

static ssize_t rbd_size_show(struct device *dev,
                             struct device_attribute *attr, char *buf)
{
        struct rbd_device *rbd_dev = dev_to_rbd(dev);

        return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
}

static ssize_t rbd_major_show(struct device *dev,
                              struct device_attribute *attr, char *buf)
{
        struct rbd_device *rbd_dev = dev_to_rbd(dev);

        return sprintf(buf, "%d\n", rbd_dev->major);
}

static ssize_t rbd_client_id_show(struct device *dev,
                                  struct device_attribute *attr, char *buf)
{
        struct rbd_device *rbd_dev = dev_to_rbd(dev);

        return sprintf(buf, "client%lld\n", ceph_client_id(rbd_dev->client));
}

static ssize_t rbd_pool_show(struct device *dev,
                             struct device_attribute *attr, char *buf)
{
        struct rbd_device *rbd_dev = dev_to_rbd(dev);

        return sprintf(buf, "%s\n", rbd_dev->pool_name);
}

static ssize_t rbd_name_show(struct device *dev,
                             struct device_attribute *attr, char *buf)
{
        struct rbd_device *rbd_dev = dev_to_rbd(dev);

        return sprintf(buf, "%s\n", rbd_dev->obj);
}

static ssize_t rbd_snap_show(struct device *dev,
                             struct device_attribute *attr,
                             char *buf)
{
        struct rbd_device *rbd_dev = dev_to_rbd(dev);

        return sprintf(buf, "%s\n", rbd_dev->snap_name);
}

static ssize_t rbd_image_refresh(struct device *dev,
                                 struct device_attribute *attr,
                                 const char *buf,
                                 size_t size)
{
        struct rbd_device *rbd_dev = dev_to_rbd(dev);
        int rc;
        int ret = size;

        mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

        rc = __rbd_update_snaps(rbd_dev);
        if (rc < 0)
                ret = rc;

        mutex_unlock(&ctl_mutex);
        return ret;
}

static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);

static struct attribute *rbd_attrs[] = {
        &dev_attr_size.attr,
        &dev_attr_major.attr,
        &dev_attr_client_id.attr,
        &dev_attr_pool.attr,
        &dev_attr_name.attr,
        &dev_attr_current_snap.attr,
        &dev_attr_refresh.attr,
        &dev_attr_create_snap.attr,
        NULL
};

static struct attribute_group rbd_attr_group = {
        .attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
        &rbd_attr_group,
        NULL
};

static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
        .name           = "rbd",
        .groups         = rbd_attr_groups,
        .release        = rbd_sysfs_dev_release,
};

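/*
 * [illustrative aside] The attribute group above is what surfaces each
 * mapped image under sysfs.  A hedged userspace sketch of reading one of
 * those attributes follows; the /sys/bus/rbd/devices/<id>/ path is an
 * assumption based on the bus and device registration in this file, and
 * device id 0 is a placeholder.
 */
#if 0   /* userspace demo, not compiled with the driver */
#include <stdio.h>

int main(void)
{
        char buf[64];
        FILE *f = fopen("/sys/bus/rbd/devices/0/size", "r");

        if (!f)
                return 1;
        if (fgets(buf, sizeof(buf), f))
                printf("image size: %s", buf);  /* bytes, from rbd_size_show() */
        fclose(f);
        return 0;
}
#endif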

/*
  sysfs - snapshots
*/

static ssize_t rbd_snap_size_show(struct device *dev,
                                  struct device_attribute *attr,
                                  char *buf)
{
        struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

        return sprintf(buf, "%lld\n", (long long)snap->size);
}

static ssize_t rbd_snap_id_show(struct device *dev,
                                struct device_attribute *attr,
                                char *buf)
{
        struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

        return sprintf(buf, "%lld\n", (long long)snap->id);
}

static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
        &dev_attr_snap_size.attr,
        &dev_attr_snap_id.attr,
        NULL,
};

static struct attribute_group rbd_snap_attr_group = {
        .attrs = rbd_snap_attrs,
};

static void rbd_snap_dev_release(struct device *dev)
{
        struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
        kfree(snap->name);
        kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
        &rbd_snap_attr_group,
        NULL
};

static struct device_type rbd_snap_device_type = {
        .groups         = rbd_snap_attr_groups,
        .release        = rbd_snap_dev_release,
};

static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
                                  struct rbd_snap *snap)
{
        list_del(&snap->node);
        device_unregister(&snap->dev);
}

static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
                                 struct rbd_snap *snap,
                                 struct device *parent)
{
        struct device *dev = &snap->dev;
        int ret;

        dev->type = &rbd_snap_device_type;
        dev->parent = parent;
        dev->release = rbd_snap_dev_release;
        dev_set_name(dev, "snap_%s", snap->name);
        ret = device_register(dev);

        return ret;
}

static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
                              int i, const char *name,
                              struct rbd_snap **snapp)
{
        int ret;
        struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
        if (!snap)
                return -ENOMEM;
        snap->name = kstrdup(name, GFP_KERNEL);
        snap->size = rbd_dev->header.snap_sizes[i];
        snap->id = rbd_dev->header.snapc->snaps[i];
        if (device_is_registered(&rbd_dev->dev)) {
                ret = rbd_register_snap_dev(rbd_dev, snap,
                                            &rbd_dev->dev);
                if (ret < 0)
                        goto err;
        }
        *snapp = snap;
        return 0;
err:
        kfree(snap->name);
        kfree(snap);
        return ret;
}

/*
 * search for the previous snap in a null delimited string list
 */
const char *rbd_prev_snap_name(const char *name, const char *start)
{
        if (name < start + 2)
                return NULL;

        name -= 2;
        while (*name) {
                if (name == start)
                        return start;
                name--;
        }
        return name + 1;
}
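
/*
 * [illustrative aside] A hedged, standalone walk of the helper above on
 * made-up data, showing how the backwards scan lands on each preceding
 * name in the NUL-delimited buffer.
 */
#if 0   /* userspace demo, not compiled with the driver */
#include <stdio.h>

static const char *prev_name(const char *name, const char *start)
{
        /* verbatim logic of rbd_prev_snap_name() above */
        if (name < start + 2)
                return NULL;
        name -= 2;
        while (*name) {
                if (name == start)
                        return start;
                name--;
        }
        return name + 1;
}

int main(void)
{
        static const char names[] = "alpha\0beta";      /* "alpha\0beta\0" */
        const char *end = names + sizeof(names);        /* one past last NUL */
        const char *p = prev_name(end, names);

        printf("%s\n", p);                      /* beta */
        p = prev_name(p, names);
        printf("%s\n", p);                      /* alpha */
        p = prev_name(p, names);
        printf("%s\n", p ? p : "(none)");       /* (none) */
        return 0;
}
#endif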

/*
 * compare the old list of snapshots that we have to what's in the header
 * and update it accordingly. Note that the header holds the snapshots
 * in reverse order (from newest to oldest) and we need to go from
 * oldest to newest so that we don't get a duplicate snap name while
 * doing so (e.g., a snapshot that was removed and then recreated
 * with the same name).
 */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
{
        const char *name, *first_name;
        int i = rbd_dev->header.total_snaps;
        struct rbd_snap *snap, *old_snap = NULL;
        int ret;
        struct list_head *p, *n;

        first_name = rbd_dev->header.snap_names;
        name = first_name + rbd_dev->header.snap_names_len;

        list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
                u64 cur_id;

                old_snap = list_entry(p, struct rbd_snap, node);

                if (i)
                        cur_id = rbd_dev->header.snapc->snaps[i - 1];

                if (!i || old_snap->id < cur_id) {
                        /* old_snap->id was skipped, thus was removed */
                        __rbd_remove_snap_dev(rbd_dev, old_snap);
                        continue;
                }
                if (old_snap->id == cur_id) {
                        /* we have this snapshot already */
                        i--;
                        name = rbd_prev_snap_name(name, first_name);
                        continue;
                }
                for (; i > 0;
                     i--, name = rbd_prev_snap_name(name, first_name)) {
                        if (!name) {
                                WARN_ON(1);
                                return -EINVAL;
                        }
                        cur_id = rbd_dev->header.snapc->snaps[i];
                        /* snapshot removal? handle it above */
                        if (cur_id >= old_snap->id)
                                break;
                        /* a new snapshot */
                        ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
                        if (ret < 0)
                                return ret;

                        /* note that we add it backward so using n and not p */
                        list_add(&snap->node, n);
                        p = &snap->node;
                }
        }
        /* we're done going over the old snap list, just add what's left */
        for (; i > 0; i--) {
                name = rbd_prev_snap_name(name, first_name);
                if (!name) {
                        WARN_ON(1);
                        return -EINVAL;
                }
                ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
                if (ret < 0)
                        return ret;
                list_add(&snap->node, &rbd_dev->snaps);
        }

        return 0;
}
2087 2090
2088 2091
2089 static void rbd_root_dev_release(struct device *dev) 2092 static void rbd_root_dev_release(struct device *dev)
2090 { 2093 {
2091 } 2094 }
2092 2095
2093 static struct device rbd_root_dev = { 2096 static struct device rbd_root_dev = {
2094 .init_name = "rbd", 2097 .init_name = "rbd",
2095 .release = rbd_root_dev_release, 2098 .release = rbd_root_dev_release,
2096 }; 2099 };
2097 2100
2098 static int rbd_bus_add_dev(struct rbd_device *rbd_dev) 2101 static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2099 { 2102 {
2100 int ret = -ENOMEM; 2103 int ret = -ENOMEM;
2101 struct device *dev; 2104 struct device *dev;
2102 struct rbd_snap *snap; 2105 struct rbd_snap *snap;
2103 2106
2104 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2107 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2105 dev = &rbd_dev->dev; 2108 dev = &rbd_dev->dev;
2106 2109
2107 dev->bus = &rbd_bus_type; 2110 dev->bus = &rbd_bus_type;
2108 dev->type = &rbd_device_type; 2111 dev->type = &rbd_device_type;
2109 dev->parent = &rbd_root_dev; 2112 dev->parent = &rbd_root_dev;
2110 dev->release = rbd_dev_release; 2113 dev->release = rbd_dev_release;
2111 dev_set_name(dev, "%d", rbd_dev->id); 2114 dev_set_name(dev, "%d", rbd_dev->id);
2112 ret = device_register(dev); 2115 ret = device_register(dev);
2113 if (ret < 0) 2116 if (ret < 0)
2114 goto done_free; 2117 goto done_free;
2115 2118
2116 list_for_each_entry(snap, &rbd_dev->snaps, node) { 2119 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2117 ret = rbd_register_snap_dev(rbd_dev, snap, 2120 ret = rbd_register_snap_dev(rbd_dev, snap,
2118 &rbd_dev->dev); 2121 &rbd_dev->dev);
2119 if (ret < 0) 2122 if (ret < 0)
2120 break; 2123 break;
2121 } 2124 }
2122 2125
2123 mutex_unlock(&ctl_mutex); 2126 mutex_unlock(&ctl_mutex);
2124 return 0; 2127 return 0;
2125 done_free: 2128 done_free:
2126 mutex_unlock(&ctl_mutex); 2129 mutex_unlock(&ctl_mutex);
2127 return ret; 2130 return ret;
2128 } 2131 }
2129 2132
2130 static void rbd_bus_del_dev(struct rbd_device *rbd_dev) 2133 static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2131 { 2134 {
2132 device_unregister(&rbd_dev->dev); 2135 device_unregister(&rbd_dev->dev);
2133 } 2136 }
2134 2137
2135 static int rbd_init_watch_dev(struct rbd_device *rbd_dev) 2138 static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2136 { 2139 {
2137 int ret, rc; 2140 int ret, rc;
2138 2141
2139 do { 2142 do {
2140 ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name, 2143 ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name,
2141 rbd_dev->header.obj_version); 2144 rbd_dev->header.obj_version);
2142 if (ret == -ERANGE) { 2145 if (ret == -ERANGE) {
2143 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2146 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2144 rc = __rbd_update_snaps(rbd_dev); 2147 rc = __rbd_update_snaps(rbd_dev);
2145 mutex_unlock(&ctl_mutex); 2148 mutex_unlock(&ctl_mutex);
2146 if (rc < 0) 2149 if (rc < 0)
2147 return rc; 2150 return rc;
2148 } 2151 }
2149 } while (ret == -ERANGE); 2152 } while (ret == -ERANGE);
2150 2153
2151 return ret; 2154 return ret;
2152 } 2155 }
2153 2156
2154 static ssize_t rbd_add(struct bus_type *bus, 2157 static ssize_t rbd_add(struct bus_type *bus,
2155 const char *buf, 2158 const char *buf,
2156 size_t count) 2159 size_t count)
2157 { 2160 {
2158 struct ceph_osd_client *osdc; 2161 struct ceph_osd_client *osdc;
2159 struct rbd_device *rbd_dev; 2162 struct rbd_device *rbd_dev;
2160 ssize_t rc = -ENOMEM; 2163 ssize_t rc = -ENOMEM;
2161 int irc, new_id = 0; 2164 int irc, new_id = 0;
2162 struct list_head *tmp; 2165 struct list_head *tmp;
2163 char *mon_dev_name; 2166 char *mon_dev_name;
2164 char *options; 2167 char *options;
2165 2168
2166 if (!try_module_get(THIS_MODULE)) 2169 if (!try_module_get(THIS_MODULE))
2167 return -ENODEV; 2170 return -ENODEV;
2168 2171
2169 mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL); 2172 mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
2170 if (!mon_dev_name) 2173 if (!mon_dev_name)
2171 goto err_out_mod; 2174 goto err_out_mod;
2172 2175
2173 options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL); 2176 options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
2174 if (!options) 2177 if (!options)
2175 goto err_mon_dev; 2178 goto err_mon_dev;
2176 2179
2177 /* new rbd_device object */ 2180 /* new rbd_device object */
2178 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); 2181 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2179 if (!rbd_dev) 2182 if (!rbd_dev)
2180 goto err_out_opt; 2183 goto err_out_opt;
2181 2184
2182 /* static rbd_device initialization */ 2185 /* static rbd_device initialization */
2183 spin_lock_init(&rbd_dev->lock); 2186 spin_lock_init(&rbd_dev->lock);
2184 INIT_LIST_HEAD(&rbd_dev->node); 2187 INIT_LIST_HEAD(&rbd_dev->node);
2185 INIT_LIST_HEAD(&rbd_dev->snaps); 2188 INIT_LIST_HEAD(&rbd_dev->snaps);
2186 2189
2187 init_rwsem(&rbd_dev->header.snap_rwsem); 2190 init_rwsem(&rbd_dev->header.snap_rwsem);
2188 2191
2189 /* generate unique id: find highest unique id, add one */ 2192 /* generate unique id: find highest unique id, add one */
2190 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2193 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2191 2194
2192 list_for_each(tmp, &rbd_dev_list) { 2195 list_for_each(tmp, &rbd_dev_list) {
2193 struct rbd_device *rbd_dev; 2196 struct rbd_device *rbd_dev;
2194 2197
2195 rbd_dev = list_entry(tmp, struct rbd_device, node); 2198 rbd_dev = list_entry(tmp, struct rbd_device, node);
2196 if (rbd_dev->id >= new_id) 2199 if (rbd_dev->id >= new_id)
2197 new_id = rbd_dev->id + 1; 2200 new_id = rbd_dev->id + 1;
2198 } 2201 }
2199 2202
2200 rbd_dev->id = new_id; 2203 rbd_dev->id = new_id;
2201 2204
2202 /* add to global list */ 2205 /* add to global list */
2203 list_add_tail(&rbd_dev->node, &rbd_dev_list); 2206 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2204 2207
2205 /* parse add command */ 2208 /* parse add command */
2206 if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s " 2209 if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
2207 "%" __stringify(RBD_MAX_OPT_LEN) "s " 2210 "%" __stringify(RBD_MAX_OPT_LEN) "s "
2208 "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s " 2211 "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
2209 "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s" 2212 "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
2210 "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s", 2213 "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
2211 mon_dev_name, options, rbd_dev->pool_name, 2214 mon_dev_name, options, rbd_dev->pool_name,
2212 rbd_dev->obj, rbd_dev->snap_name) < 4) { 2215 rbd_dev->obj, rbd_dev->snap_name) < 4) {
2213 rc = -EINVAL; 2216 rc = -EINVAL;
2214 goto err_out_slot; 2217 goto err_out_slot;
2215 } 2218 }
2216 2219
2217 if (rbd_dev->snap_name[0] == 0) 2220 if (rbd_dev->snap_name[0] == 0)
2218 rbd_dev->snap_name[0] = '-'; 2221 rbd_dev->snap_name[0] = '-';
2219 2222
2220 rbd_dev->obj_len = strlen(rbd_dev->obj); 2223 rbd_dev->obj_len = strlen(rbd_dev->obj);
2221 snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s", 2224 snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
2222 rbd_dev->obj, RBD_SUFFIX); 2225 rbd_dev->obj, RBD_SUFFIX);
2223 2226
2224 /* initialize rest of new object */ 2227 /* initialize rest of new object */
2225 snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id); 2228 snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id);
2226 rc = rbd_get_client(rbd_dev, mon_dev_name, options); 2229 rc = rbd_get_client(rbd_dev, mon_dev_name, options);
2227 if (rc < 0) 2230 if (rc < 0)
2228 goto err_out_slot; 2231 goto err_out_slot;
2229 2232
2230 mutex_unlock(&ctl_mutex); 2233 mutex_unlock(&ctl_mutex);
2231 2234
2232 /* pick the pool */ 2235 /* pick the pool */
2233 osdc = &rbd_dev->client->osdc; 2236 osdc = &rbd_dev->client->osdc;
2234 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name); 2237 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2235 if (rc < 0) 2238 if (rc < 0)
2236 goto err_out_client; 2239 goto err_out_client;
2237 rbd_dev->poolid = rc; 2240 rbd_dev->poolid = rc;
2238 2241
2239 /* register our block device */ 2242 /* register our block device */
2240 irc = register_blkdev(0, rbd_dev->name); 2243 irc = register_blkdev(0, rbd_dev->name);
2241 if (irc < 0) { 2244 if (irc < 0) {
2242 rc = irc; 2245 rc = irc;
2243 goto err_out_client; 2246 goto err_out_client;
2244 } 2247 }
2245 rbd_dev->major = irc; 2248 rbd_dev->major = irc;
2246 2249
2247 rc = rbd_bus_add_dev(rbd_dev); 2250 rc = rbd_bus_add_dev(rbd_dev);
2248 if (rc) 2251 if (rc)
2249 goto err_out_blkdev; 2252 goto err_out_blkdev;
2250 2253
2251 /* set up and announce blkdev mapping */ 2254 /* set up and announce blkdev mapping */
2252 rc = rbd_init_disk(rbd_dev); 2255 rc = rbd_init_disk(rbd_dev);
2253 if (rc) 2256 if (rc)
2254 goto err_out_bus; 2257 goto err_out_bus;
2255 2258
2256 rc = rbd_init_watch_dev(rbd_dev); 2259 rc = rbd_init_watch_dev(rbd_dev);
2257 if (rc) 2260 if (rc)
2258 goto err_out_bus; 2261 goto err_out_bus;
2259 2262
2260 return count; 2263 return count;
2261 2264
2262 err_out_bus: 2265 err_out_bus:
2263 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2266 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2264 list_del_init(&rbd_dev->node); 2267 list_del_init(&rbd_dev->node);
2265 mutex_unlock(&ctl_mutex); 2268 mutex_unlock(&ctl_mutex);
2266 2269
2267 /* this will also clean up rest of rbd_dev stuff */ 2270 /* this will also clean up rest of rbd_dev stuff */
2268 2271
2269 rbd_bus_del_dev(rbd_dev); 2272 rbd_bus_del_dev(rbd_dev);
2270 kfree(options); 2273 kfree(options);
2271 kfree(mon_dev_name); 2274 kfree(mon_dev_name);
2272 return rc; 2275 return rc;
2273 2276
2274 err_out_blkdev: 2277 err_out_blkdev:
2275 unregister_blkdev(rbd_dev->major, rbd_dev->name); 2278 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2276 err_out_client: 2279 err_out_client:
2277 rbd_put_client(rbd_dev); 2280 rbd_put_client(rbd_dev);
2278 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2281 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2279 err_out_slot: 2282 err_out_slot:
2280 list_del_init(&rbd_dev->node); 2283 list_del_init(&rbd_dev->node);
2281 mutex_unlock(&ctl_mutex); 2284 mutex_unlock(&ctl_mutex);
2282 2285
2283 kfree(rbd_dev); 2286 kfree(rbd_dev);
2284 err_out_opt: 2287 err_out_opt:
2285 kfree(options); 2288 kfree(options);
2286 err_mon_dev: 2289 err_mon_dev:
2287 kfree(mon_dev_name); 2290 kfree(mon_dev_name);
2288 err_out_mod: 2291 err_out_mod:
2289 dout("Error adding device %s\n", buf); 2292 dout("Error adding device %s\n", buf);
2290 module_put(THIS_MODULE); 2293 module_put(THIS_MODULE);
2291 return rc; 2294 return rc;
2292 } 2295 }
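
rbd_add() unwinds a partially constructed device through a ladder of goto labels: each label releases exactly the resources acquired before the failing step, in reverse order of acquisition. A minimal userspace sketch of the same idiom follows; the two "resources" and all names are illustrative, not taken from rbd.c.

#include <stdlib.h>

/* Sketch of the goto-unwind idiom: on failure, jump to the label
 * that frees everything acquired so far, in reverse order. */
static int setup_device(void)
{
    char *name, *options;
    int rc = 0;

    name = malloc(32);
    if (!name)
        return -1;

    options = malloc(64);
    if (!options) {
        rc = -1;
        goto err_free_name;
    }

    /* further acquisition steps would jump to deeper labels */

    free(options);
    free(name);
    return rc;

err_free_name:
    free(name);
    return rc;
}

int main(void)
{
    return setup_device() ? 1 : 0;
}
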
2293 2296
2294 static struct rbd_device *__rbd_get_dev(unsigned long id) 2297 static struct rbd_device *__rbd_get_dev(unsigned long id)
2295 { 2298 {
2296 struct list_head *tmp; 2299 struct list_head *tmp;
2297 struct rbd_device *rbd_dev; 2300 struct rbd_device *rbd_dev;
2298 2301
2299 list_for_each(tmp, &rbd_dev_list) { 2302 list_for_each(tmp, &rbd_dev_list) {
2300 rbd_dev = list_entry(tmp, struct rbd_device, node); 2303 rbd_dev = list_entry(tmp, struct rbd_device, node);
2301 if (rbd_dev->id == id) 2304 if (rbd_dev->id == id)
2302 return rbd_dev; 2305 return rbd_dev;
2303 } 2306 }
2304 return NULL; 2307 return NULL;
2305 } 2308 }
2306 2309
2307 static void rbd_dev_release(struct device *dev) 2310 static void rbd_dev_release(struct device *dev)
2308 { 2311 {
2309 struct rbd_device *rbd_dev = 2312 struct rbd_device *rbd_dev =
2310 container_of(dev, struct rbd_device, dev); 2313 container_of(dev, struct rbd_device, dev);
2311 2314
2312 if (rbd_dev->watch_request) 2315 if (rbd_dev->watch_request)
2313 ceph_osdc_unregister_linger_request(&rbd_dev->client->osdc, 2316 ceph_osdc_unregister_linger_request(&rbd_dev->client->osdc,
2314 rbd_dev->watch_request); 2317 rbd_dev->watch_request);
2315 if (rbd_dev->watch_event) 2318 if (rbd_dev->watch_event)
2316 rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name); 2319 rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name);
2317 2320
2318 rbd_put_client(rbd_dev); 2321 rbd_put_client(rbd_dev);
2319 2322
2320 /* clean up and free blkdev */ 2323 /* clean up and free blkdev */
2321 rbd_free_disk(rbd_dev); 2324 rbd_free_disk(rbd_dev);
2322 unregister_blkdev(rbd_dev->major, rbd_dev->name); 2325 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2323 kfree(rbd_dev); 2326 kfree(rbd_dev);
2324 2327
2325 /* release module ref */ 2328 /* release module ref */
2326 module_put(THIS_MODULE); 2329 module_put(THIS_MODULE);
2327 } 2330 }
2328 2331
2329 static ssize_t rbd_remove(struct bus_type *bus, 2332 static ssize_t rbd_remove(struct bus_type *bus,
2330 const char *buf, 2333 const char *buf,
2331 size_t count) 2334 size_t count)
2332 { 2335 {
2333 struct rbd_device *rbd_dev = NULL; 2336 struct rbd_device *rbd_dev = NULL;
2334 int target_id, rc; 2337 int target_id, rc;
2335 unsigned long ul; 2338 unsigned long ul;
2336 int ret = count; 2339 int ret = count;
2337 2340
2338 rc = strict_strtoul(buf, 10, &ul); 2341 rc = strict_strtoul(buf, 10, &ul);
2339 if (rc) 2342 if (rc)
2340 return rc; 2343 return rc;
2341 2344
2342 /* convert to int; abort if we lost anything in the conversion */ 2345 /* convert to int; abort if we lost anything in the conversion */
2343 target_id = (int) ul; 2346 target_id = (int) ul;
2344 if (target_id != ul) 2347 if (target_id != ul)
2345 return -EINVAL; 2348 return -EINVAL;
2346 2349
2347 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2350 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2348 2351
2349 rbd_dev = __rbd_get_dev(target_id); 2352 rbd_dev = __rbd_get_dev(target_id);
2350 if (!rbd_dev) { 2353 if (!rbd_dev) {
2351 ret = -ENOENT; 2354 ret = -ENOENT;
2352 goto done; 2355 goto done;
2353 } 2356 }
2354 2357
2355 list_del_init(&rbd_dev->node); 2358 list_del_init(&rbd_dev->node);
2356 2359
2357 __rbd_remove_all_snaps(rbd_dev); 2360 __rbd_remove_all_snaps(rbd_dev);
2358 rbd_bus_del_dev(rbd_dev); 2361 rbd_bus_del_dev(rbd_dev);
2359 2362
2360 done: 2363 done:
2361 mutex_unlock(&ctl_mutex); 2364 mutex_unlock(&ctl_mutex);
2362 return ret; 2365 return ret;
2363 } 2366 }
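
rbd_remove() parses the id with strict_strtoul() into an unsigned long, narrows it to an int, and rejects the input if the round trip changed the value. A userspace sketch of the same overflow check, substituting strtoul() for the kernel helper:

#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    unsigned long ul;
    int target_id;

    if (argc < 2)
        return 1;
    ul = strtoul(argv[1], NULL, 10);

    /* convert to int; abort if the narrowing cast lost anything */
    target_id = (int)ul;
    if (target_id != ul) {
        fprintf(stderr, "id out of range\n");
        return 1;
    }
    printf("target_id = %d\n", target_id);
    return 0;
}
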
2364 2367
2365 static ssize_t rbd_snap_add(struct device *dev, 2368 static ssize_t rbd_snap_add(struct device *dev,
2366 struct device_attribute *attr, 2369 struct device_attribute *attr,
2367 const char *buf, 2370 const char *buf,
2368 size_t count) 2371 size_t count)
2369 { 2372 {
2370 struct rbd_device *rbd_dev = dev_to_rbd(dev); 2373 struct rbd_device *rbd_dev = dev_to_rbd(dev);
2371 int ret; 2374 int ret;
2372 char *name = kmalloc(count + 1, GFP_KERNEL); 2375 char *name = kmalloc(count + 1, GFP_KERNEL);
2373 if (!name) 2376 if (!name)
2374 return -ENOMEM; 2377 return -ENOMEM;
2375 2378
2376 snprintf(name, count, "%s", buf); 2379 snprintf(name, count, "%s", buf);
2377 2380
2378 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2381 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2379 2382
2380 ret = rbd_header_add_snap(rbd_dev, 2383 ret = rbd_header_add_snap(rbd_dev,
2381 name, GFP_KERNEL); 2384 name, GFP_KERNEL);
2382 if (ret < 0) 2385 if (ret < 0)
2383 goto err_unlock; 2386 goto err_unlock;
2384 2387
2385 ret = __rbd_update_snaps(rbd_dev); 2388 ret = __rbd_update_snaps(rbd_dev);
2386 if (ret < 0) 2389 if (ret < 0)
2387 goto err_unlock; 2390 goto err_unlock;
2388 2391
2389 /* shouldn't hold ctl_mutex when notifying; a notify might 2392 /* shouldn't hold ctl_mutex when notifying; a notify might
2390 trigger a watch callback that would need to get that mutex */ 2393 trigger a watch callback that would need to get that mutex */
2391 mutex_unlock(&ctl_mutex); 2394 mutex_unlock(&ctl_mutex);
2392 2395
2393 /* make a best effort, don't error if failed */ 2396 /* make a best effort, don't error if failed */
2394 rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name); 2397 rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name);
2395 2398
2396 ret = count; 2399 ret = count;
2397 kfree(name); 2400 kfree(name);
2398 return ret; 2401 return ret;
2399 2402
2400 err_unlock: 2403 err_unlock:
2401 mutex_unlock(&ctl_mutex); 2404 mutex_unlock(&ctl_mutex);
2402 kfree(name); 2405 kfree(name);
2403 return ret; 2406 return ret;
2404 } 2407 }
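
The comment in rbd_snap_add() captures a general deadlock rule: the notification can fire a watch callback that itself wants ctl_mutex, so the mutex must be dropped before notifying. A userspace pthread sketch of that ordering, with illustrative names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ctl = PTHREAD_MUTEX_INITIALIZER;

/* stands in for the watch callback the notify may trigger */
static void watch_callback(void)
{
    /* would self-deadlock if the notifier still held ctl */
    pthread_mutex_lock(&ctl);
    /* ... react to the change ... */
    pthread_mutex_unlock(&ctl);
}

static void add_snapshot(void)
{
    pthread_mutex_lock(&ctl);
    /* ... update shared snapshot state under the lock ... */
    pthread_mutex_unlock(&ctl);

    watch_callback();    /* notify only after unlocking */
}

int main(void)
{
    add_snapshot();
    puts("ok");
    return 0;
}
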
2405 2408
2406 static struct bus_attribute rbd_bus_attrs[] = { 2409 static struct bus_attribute rbd_bus_attrs[] = {
2407 __ATTR(add, S_IWUSR, NULL, rbd_add), 2410 __ATTR(add, S_IWUSR, NULL, rbd_add),
2408 __ATTR(remove, S_IWUSR, NULL, rbd_remove), 2411 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
2409 __ATTR_NULL 2412 __ATTR_NULL
2410 }; 2413 };
2411 2414
2412 /* 2415 /*
2413 * create control files in sysfs 2416 * create control files in sysfs
2414 * /sys/bus/rbd/... 2417 * /sys/bus/rbd/...
2415 */ 2418 */
2416 static int rbd_sysfs_init(void) 2419 static int rbd_sysfs_init(void)
2417 { 2420 {
2418 int ret; 2421 int ret;
2419 2422
2420 rbd_bus_type.bus_attrs = rbd_bus_attrs; 2423 rbd_bus_type.bus_attrs = rbd_bus_attrs;
2421 2424
2422 ret = bus_register(&rbd_bus_type); 2425 ret = bus_register(&rbd_bus_type);
2423 if (ret < 0) 2426 if (ret < 0)
2424 return ret; 2427 return ret;
2425 2428
2426 ret = device_register(&rbd_root_dev); 2429 ret = device_register(&rbd_root_dev);
2427 2430
2428 return ret; 2431 return ret;
2429 } 2432 }
2430 2433
2431 static void rbd_sysfs_cleanup(void) 2434 static void rbd_sysfs_cleanup(void)
2432 { 2435 {
2433 device_unregister(&rbd_root_dev); 2436 device_unregister(&rbd_root_dev);
2434 bus_unregister(&rbd_bus_type); 2437 bus_unregister(&rbd_bus_type);
2435 } 2438 }
2436 2439
2437 int __init rbd_init(void) 2440 int __init rbd_init(void)
2438 { 2441 {
2439 int rc; 2442 int rc;
2440 2443
2441 rc = rbd_sysfs_init(); 2444 rc = rbd_sysfs_init();
2442 if (rc) 2445 if (rc)
2443 return rc; 2446 return rc;
2444 spin_lock_init(&node_lock); 2447 spin_lock_init(&node_lock);
2445 pr_info("loaded " DRV_NAME_LONG "\n"); 2448 pr_info("loaded " DRV_NAME_LONG "\n");
2446 return 0; 2449 return 0;
2447 } 2450 }
2448 2451
2449 void __exit rbd_exit(void) 2452 void __exit rbd_exit(void)
2450 { 2453 {
2451 rbd_sysfs_cleanup(); 2454 rbd_sysfs_cleanup();
2452 } 2455 }
2453 2456
2454 module_init(rbd_init); 2457 module_init(rbd_init);
2455 module_exit(rbd_exit); 2458 module_exit(rbd_exit);
2456 2459
2457 MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 2460 MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2458 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 2461 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2459 MODULE_DESCRIPTION("rados block device"); 2462 MODULE_DESCRIPTION("rados block device");
2460 2463
2461 /* following authorship retained from original osdblk.c */ 2464 /* following authorship retained from original osdblk.c */
2462 MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 2465 MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2463 2466
fs/ceph/caps.c

1 #include <linux/ceph/ceph_debug.h> 1 #include <linux/ceph/ceph_debug.h>
2 2
3 #include <linux/fs.h> 3 #include <linux/fs.h>
4 #include <linux/kernel.h> 4 #include <linux/kernel.h>
5 #include <linux/sched.h> 5 #include <linux/sched.h>
6 #include <linux/slab.h> 6 #include <linux/slab.h>
7 #include <linux/vmalloc.h> 7 #include <linux/vmalloc.h>
8 #include <linux/wait.h> 8 #include <linux/wait.h>
9 #include <linux/writeback.h> 9 #include <linux/writeback.h>
10 10
11 #include "super.h" 11 #include "super.h"
12 #include "mds_client.h" 12 #include "mds_client.h"
13 #include <linux/ceph/decode.h> 13 #include <linux/ceph/decode.h>
14 #include <linux/ceph/messenger.h> 14 #include <linux/ceph/messenger.h>
15 15
16 /* 16 /*
17 * Capability management 17 * Capability management
18 * 18 *
19 * The Ceph metadata servers control client access to inode metadata 19 * The Ceph metadata servers control client access to inode metadata
20 * and file data by issuing capabilities, granting clients permission 20 * and file data by issuing capabilities, granting clients permission
21 * to read and/or write both inode fields and file data to OSDs 21 * to read and/or write both inode fields and file data to OSDs
22 * (storage nodes). Each capability consists of a set of bits 22 * (storage nodes). Each capability consists of a set of bits
23 * indicating which operations are allowed. 23 * indicating which operations are allowed.
24 * 24 *
25 * If the client holds a *_SHARED cap, the client has a coherent value 25 * If the client holds a *_SHARED cap, the client has a coherent value
26 * that can be safely read from the cached inode. 26 * that can be safely read from the cached inode.
27 * 27 *
28 * In the case of *_EXCL (exclusive) or FILE_WR capabilities, the 28 * In the case of *_EXCL (exclusive) or FILE_WR capabilities, the
29 * client is allowed to change inode attributes (e.g., file size, 29 * client is allowed to change inode attributes (e.g., file size,
30 * mtime), note its dirty state in the ceph_cap, and asynchronously 30 * mtime), note its dirty state in the ceph_cap, and asynchronously
31 * flush that metadata change to the MDS. 31 * flush that metadata change to the MDS.
32 * 32 *
33 * In the event of a conflicting operation (perhaps by another 33 * In the event of a conflicting operation (perhaps by another
34 * client), the MDS will revoke the conflicting client capabilities. 34 * client), the MDS will revoke the conflicting client capabilities.
35 * 35 *
36 * In order for a client to cache an inode, it must hold a capability 36 * In order for a client to cache an inode, it must hold a capability
37 * from at least one MDS server. When inodes are released, release 37 * from at least one MDS server. When inodes are released, release
38 * notifications are batched and periodically sent en masse to the MDS 38 * notifications are batched and periodically sent en masse to the MDS
39 * cluster to release server state. 39 * cluster to release server state.
40 */ 40 */
41 41
42 42
43 /* 43 /*
44 * Generate readable cap strings for debugging output. 44 * Generate readable cap strings for debugging output.
45 */ 45 */
46 #define MAX_CAP_STR 20 46 #define MAX_CAP_STR 20
47 static char cap_str[MAX_CAP_STR][40]; 47 static char cap_str[MAX_CAP_STR][40];
48 static DEFINE_SPINLOCK(cap_str_lock); 48 static DEFINE_SPINLOCK(cap_str_lock);
49 static int last_cap_str; 49 static int last_cap_str;
50 50
51 static char *gcap_string(char *s, int c) 51 static char *gcap_string(char *s, int c)
52 { 52 {
53 if (c & CEPH_CAP_GSHARED) 53 if (c & CEPH_CAP_GSHARED)
54 *s++ = 's'; 54 *s++ = 's';
55 if (c & CEPH_CAP_GEXCL) 55 if (c & CEPH_CAP_GEXCL)
56 *s++ = 'x'; 56 *s++ = 'x';
57 if (c & CEPH_CAP_GCACHE) 57 if (c & CEPH_CAP_GCACHE)
58 *s++ = 'c'; 58 *s++ = 'c';
59 if (c & CEPH_CAP_GRD) 59 if (c & CEPH_CAP_GRD)
60 *s++ = 'r'; 60 *s++ = 'r';
61 if (c & CEPH_CAP_GWR) 61 if (c & CEPH_CAP_GWR)
62 *s++ = 'w'; 62 *s++ = 'w';
63 if (c & CEPH_CAP_GBUFFER) 63 if (c & CEPH_CAP_GBUFFER)
64 *s++ = 'b'; 64 *s++ = 'b';
65 if (c & CEPH_CAP_GLAZYIO) 65 if (c & CEPH_CAP_GLAZYIO)
66 *s++ = 'l'; 66 *s++ = 'l';
67 return s; 67 return s;
68 } 68 }
69 69
70 const char *ceph_cap_string(int caps) 70 const char *ceph_cap_string(int caps)
71 { 71 {
72 int i; 72 int i;
73 char *s; 73 char *s;
74 int c; 74 int c;
75 75
76 spin_lock(&cap_str_lock); 76 spin_lock(&cap_str_lock);
77 i = last_cap_str++; 77 i = last_cap_str++;
78 if (last_cap_str == MAX_CAP_STR) 78 if (last_cap_str == MAX_CAP_STR)
79 last_cap_str = 0; 79 last_cap_str = 0;
80 spin_unlock(&cap_str_lock); 80 spin_unlock(&cap_str_lock);
81 81
82 s = cap_str[i]; 82 s = cap_str[i];
83 83
84 if (caps & CEPH_CAP_PIN) 84 if (caps & CEPH_CAP_PIN)
85 *s++ = 'p'; 85 *s++ = 'p';
86 86
87 c = (caps >> CEPH_CAP_SAUTH) & 3; 87 c = (caps >> CEPH_CAP_SAUTH) & 3;
88 if (c) { 88 if (c) {
89 *s++ = 'A'; 89 *s++ = 'A';
90 s = gcap_string(s, c); 90 s = gcap_string(s, c);
91 } 91 }
92 92
93 c = (caps >> CEPH_CAP_SLINK) & 3; 93 c = (caps >> CEPH_CAP_SLINK) & 3;
94 if (c) { 94 if (c) {
95 *s++ = 'L'; 95 *s++ = 'L';
96 s = gcap_string(s, c); 96 s = gcap_string(s, c);
97 } 97 }
98 98
99 c = (caps >> CEPH_CAP_SXATTR) & 3; 99 c = (caps >> CEPH_CAP_SXATTR) & 3;
100 if (c) { 100 if (c) {
101 *s++ = 'X'; 101 *s++ = 'X';
102 s = gcap_string(s, c); 102 s = gcap_string(s, c);
103 } 103 }
104 104
105 c = caps >> CEPH_CAP_SFILE; 105 c = caps >> CEPH_CAP_SFILE;
106 if (c) { 106 if (c) {
107 *s++ = 'F'; 107 *s++ = 'F';
108 s = gcap_string(s, c); 108 s = gcap_string(s, c);
109 } 109 }
110 110
111 if (s == cap_str[i]) 111 if (s == cap_str[i])
112 *s++ = '-'; 112 *s++ = '-';
113 *s = 0; 113 *s = 0;
114 return cap_str[i]; 114 return cap_str[i];
115 } 115 }
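
ceph_cap_string() combines two tricks: it decodes a per-component capability bitfield into letters, and it hands out slots from a ring of MAX_CAP_STR static buffers (index guarded by cap_str_lock) so callers can embed several results in one printk without allocating or freeing. A single-threaded userspace sketch with illustrative flag names:

#include <stdio.h>

#define MAX_STR 4
static char ring[MAX_STR][16];
static int last;    /* the kernel guards this index with a spinlock */

#define G_SHARED 1
#define G_EXCL   2

static const char *flag_string(unsigned flags)
{
    char *s, *p;
    int i = last++;

    if (last == MAX_STR)
        last = 0;    /* wrap: the oldest strings get reused */
    s = p = ring[i];

    if (flags & G_SHARED)
        *p++ = 's';
    if (flags & G_EXCL)
        *p++ = 'x';
    if (p == s)
        *p++ = '-';    /* no bits set */
    *p = '\0';
    return s;
}

int main(void)
{
    /* two calls in one printf are safe: each uses its own slot */
    printf("a=%s b=%s\n", flag_string(G_SHARED),
           flag_string(G_SHARED | G_EXCL));
    return 0;
}
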
116 116
117 void ceph_caps_init(struct ceph_mds_client *mdsc) 117 void ceph_caps_init(struct ceph_mds_client *mdsc)
118 { 118 {
119 INIT_LIST_HEAD(&mdsc->caps_list); 119 INIT_LIST_HEAD(&mdsc->caps_list);
120 spin_lock_init(&mdsc->caps_list_lock); 120 spin_lock_init(&mdsc->caps_list_lock);
121 } 121 }
122 122
123 void ceph_caps_finalize(struct ceph_mds_client *mdsc) 123 void ceph_caps_finalize(struct ceph_mds_client *mdsc)
124 { 124 {
125 struct ceph_cap *cap; 125 struct ceph_cap *cap;
126 126
127 spin_lock(&mdsc->caps_list_lock); 127 spin_lock(&mdsc->caps_list_lock);
128 while (!list_empty(&mdsc->caps_list)) { 128 while (!list_empty(&mdsc->caps_list)) {
129 cap = list_first_entry(&mdsc->caps_list, 129 cap = list_first_entry(&mdsc->caps_list,
130 struct ceph_cap, caps_item); 130 struct ceph_cap, caps_item);
131 list_del(&cap->caps_item); 131 list_del(&cap->caps_item);
132 kmem_cache_free(ceph_cap_cachep, cap); 132 kmem_cache_free(ceph_cap_cachep, cap);
133 } 133 }
134 mdsc->caps_total_count = 0; 134 mdsc->caps_total_count = 0;
135 mdsc->caps_avail_count = 0; 135 mdsc->caps_avail_count = 0;
136 mdsc->caps_use_count = 0; 136 mdsc->caps_use_count = 0;
137 mdsc->caps_reserve_count = 0; 137 mdsc->caps_reserve_count = 0;
138 mdsc->caps_min_count = 0; 138 mdsc->caps_min_count = 0;
139 spin_unlock(&mdsc->caps_list_lock); 139 spin_unlock(&mdsc->caps_list_lock);
140 } 140 }
141 141
142 void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) 142 void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
143 { 143 {
144 spin_lock(&mdsc->caps_list_lock); 144 spin_lock(&mdsc->caps_list_lock);
145 mdsc->caps_min_count += delta; 145 mdsc->caps_min_count += delta;
146 BUG_ON(mdsc->caps_min_count < 0); 146 BUG_ON(mdsc->caps_min_count < 0);
147 spin_unlock(&mdsc->caps_list_lock); 147 spin_unlock(&mdsc->caps_list_lock);
148 } 148 }
149 149
150 int ceph_reserve_caps(struct ceph_mds_client *mdsc, 150 int ceph_reserve_caps(struct ceph_mds_client *mdsc,
151 struct ceph_cap_reservation *ctx, int need) 151 struct ceph_cap_reservation *ctx, int need)
152 { 152 {
153 int i; 153 int i;
154 struct ceph_cap *cap; 154 struct ceph_cap *cap;
155 int have; 155 int have;
156 int alloc = 0; 156 int alloc = 0;
157 LIST_HEAD(newcaps); 157 LIST_HEAD(newcaps);
158 int ret = 0; 158 int ret = 0;
159 159
160 dout("reserve caps ctx=%p need=%d\n", ctx, need); 160 dout("reserve caps ctx=%p need=%d\n", ctx, need);
161 161
162 /* first reserve any caps that are already allocated */ 162 /* first reserve any caps that are already allocated */
163 spin_lock(&mdsc->caps_list_lock); 163 spin_lock(&mdsc->caps_list_lock);
164 if (mdsc->caps_avail_count >= need) 164 if (mdsc->caps_avail_count >= need)
165 have = need; 165 have = need;
166 else 166 else
167 have = mdsc->caps_avail_count; 167 have = mdsc->caps_avail_count;
168 mdsc->caps_avail_count -= have; 168 mdsc->caps_avail_count -= have;
169 mdsc->caps_reserve_count += have; 169 mdsc->caps_reserve_count += have;
170 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + 170 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
171 mdsc->caps_reserve_count + 171 mdsc->caps_reserve_count +
172 mdsc->caps_avail_count); 172 mdsc->caps_avail_count);
173 spin_unlock(&mdsc->caps_list_lock); 173 spin_unlock(&mdsc->caps_list_lock);
174 174
175 for (i = have; i < need; i++) { 175 for (i = have; i < need; i++) {
176 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); 176 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
177 if (!cap) { 177 if (!cap) {
178 ret = -ENOMEM; 178 ret = -ENOMEM;
179 goto out_alloc_count; 179 goto out_alloc_count;
180 } 180 }
181 list_add(&cap->caps_item, &newcaps); 181 list_add(&cap->caps_item, &newcaps);
182 alloc++; 182 alloc++;
183 } 183 }
184 BUG_ON(have + alloc != need); 184 BUG_ON(have + alloc != need);
185 185
186 spin_lock(&mdsc->caps_list_lock); 186 spin_lock(&mdsc->caps_list_lock);
187 mdsc->caps_total_count += alloc; 187 mdsc->caps_total_count += alloc;
188 mdsc->caps_reserve_count += alloc; 188 mdsc->caps_reserve_count += alloc;
189 list_splice(&newcaps, &mdsc->caps_list); 189 list_splice(&newcaps, &mdsc->caps_list);
190 190
191 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + 191 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
192 mdsc->caps_reserve_count + 192 mdsc->caps_reserve_count +
193 mdsc->caps_avail_count); 193 mdsc->caps_avail_count);
194 spin_unlock(&mdsc->caps_list_lock); 194 spin_unlock(&mdsc->caps_list_lock);
195 195
196 ctx->count = need; 196 ctx->count = need;
197 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", 197 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
198 ctx, mdsc->caps_total_count, mdsc->caps_use_count, 198 ctx, mdsc->caps_total_count, mdsc->caps_use_count,
199 mdsc->caps_reserve_count, mdsc->caps_avail_count); 199 mdsc->caps_reserve_count, mdsc->caps_avail_count);
200 return 0; 200 return 0;
201 201
202 out_alloc_count: 202 out_alloc_count:
203 /* we didn't manage to reserve as much as we needed */ 203 /* we didn't manage to reserve as much as we needed */
204 pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n", 204 pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
205 ctx, need, have); 205 ctx, need, have);
206 return ret; 206 return ret;
207 } 207 }
208 208
209 int ceph_unreserve_caps(struct ceph_mds_client *mdsc, 209 int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
210 struct ceph_cap_reservation *ctx) 210 struct ceph_cap_reservation *ctx)
211 { 211 {
212 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); 212 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
213 if (ctx->count) { 213 if (ctx->count) {
214 spin_lock(&mdsc->caps_list_lock); 214 spin_lock(&mdsc->caps_list_lock);
215 BUG_ON(mdsc->caps_reserve_count < ctx->count); 215 BUG_ON(mdsc->caps_reserve_count < ctx->count);
216 mdsc->caps_reserve_count -= ctx->count; 216 mdsc->caps_reserve_count -= ctx->count;
217 mdsc->caps_avail_count += ctx->count; 217 mdsc->caps_avail_count += ctx->count;
218 ctx->count = 0; 218 ctx->count = 0;
219 dout("unreserve caps %d = %d used + %d resv + %d avail\n", 219 dout("unreserve caps %d = %d used + %d resv + %d avail\n",
220 mdsc->caps_total_count, mdsc->caps_use_count, 220 mdsc->caps_total_count, mdsc->caps_use_count,
221 mdsc->caps_reserve_count, mdsc->caps_avail_count); 221 mdsc->caps_reserve_count, mdsc->caps_avail_count);
222 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + 222 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
223 mdsc->caps_reserve_count + 223 mdsc->caps_reserve_count +
224 mdsc->caps_avail_count); 224 mdsc->caps_avail_count);
225 spin_unlock(&mdsc->caps_list_lock); 225 spin_unlock(&mdsc->caps_list_lock);
226 } 226 }
227 return 0; 227 return 0;
228 } 228 }
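
ceph_reserve_caps() and ceph_unreserve_caps() maintain one invariant across every transition, asserted repeatedly with BUG_ON: caps_total_count == caps_use_count + caps_reserve_count + caps_avail_count. A userspace sketch of the accounting, with field names that loosely mirror the mdsc counters:

#include <assert.h>
#include <stdio.h>

struct pool {
    int total, used, reserved, avail;
};

static void check(struct pool *p)
{
    /* the invariant the kernel asserts under caps_list_lock */
    assert(p->total == p->used + p->reserved + p->avail);
}

static void reserve(struct pool *p, int need)
{
    int have = need < p->avail ? need : p->avail;

    p->avail -= have;            /* first take what is free */
    p->reserved += have;
    p->total += need - have;     /* then allocate the shortfall */
    p->reserved += need - have;
    check(p);
}

static void unreserve(struct pool *p, int count)
{
    p->reserved -= count;
    p->avail += count;
    check(p);
}

int main(void)
{
    struct pool p = { .total = 2, .avail = 2 };

    reserve(&p, 5);    /* 2 from avail, 3 freshly allocated */
    unreserve(&p, 5);
    printf("total=%d avail=%d\n", p.total, p.avail);
    return 0;
}
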
229 229
230 static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc, 230 static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
231 struct ceph_cap_reservation *ctx) 231 struct ceph_cap_reservation *ctx)
232 { 232 {
233 struct ceph_cap *cap = NULL; 233 struct ceph_cap *cap = NULL;
234 234
235 /* temporary, until we do something about cap import/export */ 235 /* temporary, until we do something about cap import/export */
236 if (!ctx) { 236 if (!ctx) {
237 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); 237 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
238 if (cap) { 238 if (cap) {
239 mdsc->caps_use_count++; 239 mdsc->caps_use_count++;
240 mdsc->caps_total_count++; 240 mdsc->caps_total_count++;
241 } 241 }
242 return cap; 242 return cap;
243 } 243 }
244 244
245 spin_lock(&mdsc->caps_list_lock); 245 spin_lock(&mdsc->caps_list_lock);
246 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", 246 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
247 ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count, 247 ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
248 mdsc->caps_reserve_count, mdsc->caps_avail_count); 248 mdsc->caps_reserve_count, mdsc->caps_avail_count);
249 BUG_ON(!ctx->count); 249 BUG_ON(!ctx->count);
250 BUG_ON(ctx->count > mdsc->caps_reserve_count); 250 BUG_ON(ctx->count > mdsc->caps_reserve_count);
251 BUG_ON(list_empty(&mdsc->caps_list)); 251 BUG_ON(list_empty(&mdsc->caps_list));
252 252
253 ctx->count--; 253 ctx->count--;
254 mdsc->caps_reserve_count--; 254 mdsc->caps_reserve_count--;
255 mdsc->caps_use_count++; 255 mdsc->caps_use_count++;
256 256
257 cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); 257 cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item);
258 list_del(&cap->caps_item); 258 list_del(&cap->caps_item);
259 259
260 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + 260 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
261 mdsc->caps_reserve_count + mdsc->caps_avail_count); 261 mdsc->caps_reserve_count + mdsc->caps_avail_count);
262 spin_unlock(&mdsc->caps_list_lock); 262 spin_unlock(&mdsc->caps_list_lock);
263 return cap; 263 return cap;
264 } 264 }
265 265
266 void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap) 266 void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
267 { 267 {
268 spin_lock(&mdsc->caps_list_lock); 268 spin_lock(&mdsc->caps_list_lock);
269 dout("put_cap %p %d = %d used + %d resv + %d avail\n", 269 dout("put_cap %p %d = %d used + %d resv + %d avail\n",
270 cap, mdsc->caps_total_count, mdsc->caps_use_count, 270 cap, mdsc->caps_total_count, mdsc->caps_use_count,
271 mdsc->caps_reserve_count, mdsc->caps_avail_count); 271 mdsc->caps_reserve_count, mdsc->caps_avail_count);
272 mdsc->caps_use_count--; 272 mdsc->caps_use_count--;
273 /* 273 /*
274 * Keep some preallocated caps around (caps_min_count), to 274 * Keep some preallocated caps around (caps_min_count), to
275 * avoid lots of free/alloc churn. 275 * avoid lots of free/alloc churn.
276 */ 276 */
277 if (mdsc->caps_avail_count >= mdsc->caps_reserve_count + 277 if (mdsc->caps_avail_count >= mdsc->caps_reserve_count +
278 mdsc->caps_min_count) { 278 mdsc->caps_min_count) {
279 mdsc->caps_total_count--; 279 mdsc->caps_total_count--;
280 kmem_cache_free(ceph_cap_cachep, cap); 280 kmem_cache_free(ceph_cap_cachep, cap);
281 } else { 281 } else {
282 mdsc->caps_avail_count++; 282 mdsc->caps_avail_count++;
283 list_add(&cap->caps_item, &mdsc->caps_list); 283 list_add(&cap->caps_item, &mdsc->caps_list);
284 } 284 }
285 285
286 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + 286 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
287 mdsc->caps_reserve_count + mdsc->caps_avail_count); 287 mdsc->caps_reserve_count + mdsc->caps_avail_count);
288 spin_unlock(&mdsc->caps_list_lock); 288 spin_unlock(&mdsc->caps_list_lock);
289 } 289 }
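
ceph_put_cap() frees a cap outright only when the pool already holds caps_reserve_count + caps_min_count spares; otherwise the cap goes back on caps_list for reuse. That bounds idle memory while avoiding alloc/free churn. A small sketch of the policy, with loosely mirrored field names:

#include <stdio.h>

struct pool { int total, used, reserved, avail, min; };

static void put(struct pool *p)
{
    p->used--;
    if (p->avail >= p->reserved + p->min)
        p->total--;    /* enough spares cached: really free it */
    else
        p->avail++;    /* keep it for reuse */
}

int main(void)
{
    struct pool p = { .total = 3, .used = 3, .min = 1 };

    put(&p);
    put(&p);
    put(&p);
    printf("total=%d avail=%d\n", p.total, p.avail);    /* 1 and 1 */
    return 0;
}
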
290 290
291 void ceph_reservation_status(struct ceph_fs_client *fsc, 291 void ceph_reservation_status(struct ceph_fs_client *fsc,
292 int *total, int *avail, int *used, int *reserved, 292 int *total, int *avail, int *used, int *reserved,
293 int *min) 293 int *min)
294 { 294 {
295 struct ceph_mds_client *mdsc = fsc->mdsc; 295 struct ceph_mds_client *mdsc = fsc->mdsc;
296 296
297 if (total) 297 if (total)
298 *total = mdsc->caps_total_count; 298 *total = mdsc->caps_total_count;
299 if (avail) 299 if (avail)
300 *avail = mdsc->caps_avail_count; 300 *avail = mdsc->caps_avail_count;
301 if (used) 301 if (used)
302 *used = mdsc->caps_use_count; 302 *used = mdsc->caps_use_count;
303 if (reserved) 303 if (reserved)
304 *reserved = mdsc->caps_reserve_count; 304 *reserved = mdsc->caps_reserve_count;
305 if (min) 305 if (min)
306 *min = mdsc->caps_min_count; 306 *min = mdsc->caps_min_count;
307 } 307 }
308 308
309 /* 309 /*
310 * Find ceph_cap for given mds, if any. 310 * Find ceph_cap for given mds, if any.
311 * 311 *
312 * Called with i_ceph_lock held. 312 * Called with i_ceph_lock held.
313 */ 313 */
314 static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) 314 static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
315 { 315 {
316 struct ceph_cap *cap; 316 struct ceph_cap *cap;
317 struct rb_node *n = ci->i_caps.rb_node; 317 struct rb_node *n = ci->i_caps.rb_node;
318 318
319 while (n) { 319 while (n) {
320 cap = rb_entry(n, struct ceph_cap, ci_node); 320 cap = rb_entry(n, struct ceph_cap, ci_node);
321 if (mds < cap->mds) 321 if (mds < cap->mds)
322 n = n->rb_left; 322 n = n->rb_left;
323 else if (mds > cap->mds) 323 else if (mds > cap->mds)
324 n = n->rb_right; 324 n = n->rb_right;
325 else 325 else
326 return cap; 326 return cap;
327 } 327 }
328 return NULL; 328 return NULL;
329 } 329 }
330 330
331 struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) 331 struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
332 { 332 {
333 struct ceph_cap *cap; 333 struct ceph_cap *cap;
334 334
335 spin_lock(&ci->i_ceph_lock); 335 spin_lock(&ci->i_ceph_lock);
336 cap = __get_cap_for_mds(ci, mds); 336 cap = __get_cap_for_mds(ci, mds);
337 spin_unlock(&ci->i_ceph_lock); 337 spin_unlock(&ci->i_ceph_lock);
338 return cap; 338 return cap;
339 } 339 }
340 340
341 /* 341 /*
342 * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1. 342 * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1.
343 */ 343 */
344 static int __ceph_get_cap_mds(struct ceph_inode_info *ci) 344 static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
345 { 345 {
346 struct ceph_cap *cap; 346 struct ceph_cap *cap;
347 int mds = -1; 347 int mds = -1;
348 struct rb_node *p; 348 struct rb_node *p;
349 349
350 /* prefer mds with WR|BUFFER|EXCL caps */ 350 /* prefer mds with WR|BUFFER|EXCL caps */
351 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 351 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
352 cap = rb_entry(p, struct ceph_cap, ci_node); 352 cap = rb_entry(p, struct ceph_cap, ci_node);
353 mds = cap->mds; 353 mds = cap->mds;
354 if (cap->issued & (CEPH_CAP_FILE_WR | 354 if (cap->issued & (CEPH_CAP_FILE_WR |
355 CEPH_CAP_FILE_BUFFER | 355 CEPH_CAP_FILE_BUFFER |
356 CEPH_CAP_FILE_EXCL)) 356 CEPH_CAP_FILE_EXCL))
357 break; 357 break;
358 } 358 }
359 return mds; 359 return mds;
360 } 360 }
361 361
362 int ceph_get_cap_mds(struct inode *inode) 362 int ceph_get_cap_mds(struct inode *inode)
363 { 363 {
364 struct ceph_inode_info *ci = ceph_inode(inode); 364 struct ceph_inode_info *ci = ceph_inode(inode);
365 int mds; 365 int mds;
366 spin_lock(&ci->i_ceph_lock); 366 spin_lock(&ci->i_ceph_lock);
367 mds = __ceph_get_cap_mds(ceph_inode(inode)); 367 mds = __ceph_get_cap_mds(ceph_inode(inode));
368 spin_unlock(&ci->i_ceph_lock); 368 spin_unlock(&ci->i_ceph_lock);
369 return mds; 369 return mds;
370 } 370 }
371 371
372 /* 372 /*
373 * Called under i_ceph_lock. 373 * Called under i_ceph_lock.
374 */ 374 */
375 static void __insert_cap_node(struct ceph_inode_info *ci, 375 static void __insert_cap_node(struct ceph_inode_info *ci,
376 struct ceph_cap *new) 376 struct ceph_cap *new)
377 { 377 {
378 struct rb_node **p = &ci->i_caps.rb_node; 378 struct rb_node **p = &ci->i_caps.rb_node;
379 struct rb_node *parent = NULL; 379 struct rb_node *parent = NULL;
380 struct ceph_cap *cap = NULL; 380 struct ceph_cap *cap = NULL;
381 381
382 while (*p) { 382 while (*p) {
383 parent = *p; 383 parent = *p;
384 cap = rb_entry(parent, struct ceph_cap, ci_node); 384 cap = rb_entry(parent, struct ceph_cap, ci_node);
385 if (new->mds < cap->mds) 385 if (new->mds < cap->mds)
386 p = &(*p)->rb_left; 386 p = &(*p)->rb_left;
387 else if (new->mds > cap->mds) 387 else if (new->mds > cap->mds)
388 p = &(*p)->rb_right; 388 p = &(*p)->rb_right;
389 else 389 else
390 BUG(); 390 BUG();
391 } 391 }
392 392
393 rb_link_node(&new->ci_node, parent, p); 393 rb_link_node(&new->ci_node, parent, p);
394 rb_insert_color(&new->ci_node, &ci->i_caps); 394 rb_insert_color(&new->ci_node, &ci->i_caps);
395 } 395 }
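
__get_cap_for_mds() and __insert_cap_node() are the standard descend-by-key walk over an rbtree keyed by mds id. The sketch below shows the same logic on a plain (unbalanced) binary search tree; the kernel version differs only in using rb_node links and rebalancing via rb_insert_color():

#include <stdio.h>
#include <stdlib.h>

struct cap {
    int mds;
    struct cap *left, *right;
};

static struct cap *find(struct cap *n, int mds)
{
    while (n) {
        if (mds < n->mds)
            n = n->left;
        else if (mds > n->mds)
            n = n->right;
        else
            return n;
    }
    return NULL;
}

static void insert(struct cap **p, struct cap *new)
{
    while (*p) {
        if (new->mds < (*p)->mds)
            p = &(*p)->left;
        else
            p = &(*p)->right;    /* equal keys would BUG() in the kernel */
    }
    *p = new;
}

int main(void)
{
    struct cap a = { .mds = 1 }, b = { .mds = 0 }, *root = NULL;

    insert(&root, &a);
    insert(&root, &b);
    printf("found mds0: %s\n", find(root, 0) ? "yes" : "no");
    return 0;
}
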
396 396
397 /* 397 /*
398 * (re)set cap hold timeouts, which control the delayed release 398 * (re)set cap hold timeouts, which control the delayed release
399 * of unused caps back to the MDS. Should be called on cap use. 399 * of unused caps back to the MDS. Should be called on cap use.
400 */ 400 */
401 static void __cap_set_timeouts(struct ceph_mds_client *mdsc, 401 static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
402 struct ceph_inode_info *ci) 402 struct ceph_inode_info *ci)
403 { 403 {
404 struct ceph_mount_options *ma = mdsc->fsc->mount_options; 404 struct ceph_mount_options *ma = mdsc->fsc->mount_options;
405 405
406 ci->i_hold_caps_min = round_jiffies(jiffies + 406 ci->i_hold_caps_min = round_jiffies(jiffies +
407 ma->caps_wanted_delay_min * HZ); 407 ma->caps_wanted_delay_min * HZ);
408 ci->i_hold_caps_max = round_jiffies(jiffies + 408 ci->i_hold_caps_max = round_jiffies(jiffies +
409 ma->caps_wanted_delay_max * HZ); 409 ma->caps_wanted_delay_max * HZ);
410 dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode, 410 dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
411 ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies); 411 ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
412 } 412 }
413 413
414 /* 414 /*
415 * (Re)queue cap at the end of the delayed cap release list. 415 * (Re)queue cap at the end of the delayed cap release list.
416 * 416 *
417 * If I_FLUSH is set, leave the inode at the front of the list. 417 * If I_FLUSH is set, leave the inode at the front of the list.
418 * 418 *
419 * Caller holds i_ceph_lock 419 * Caller holds i_ceph_lock
420 * -> we take mdsc->cap_delay_lock 420 * -> we take mdsc->cap_delay_lock
421 */ 421 */
422 static void __cap_delay_requeue(struct ceph_mds_client *mdsc, 422 static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
423 struct ceph_inode_info *ci) 423 struct ceph_inode_info *ci)
424 { 424 {
425 __cap_set_timeouts(mdsc, ci); 425 __cap_set_timeouts(mdsc, ci);
426 dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode, 426 dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
427 ci->i_ceph_flags, ci->i_hold_caps_max); 427 ci->i_ceph_flags, ci->i_hold_caps_max);
428 if (!mdsc->stopping) { 428 if (!mdsc->stopping) {
429 spin_lock(&mdsc->cap_delay_lock); 429 spin_lock(&mdsc->cap_delay_lock);
430 if (!list_empty(&ci->i_cap_delay_list)) { 430 if (!list_empty(&ci->i_cap_delay_list)) {
431 if (ci->i_ceph_flags & CEPH_I_FLUSH) 431 if (ci->i_ceph_flags & CEPH_I_FLUSH)
432 goto no_change; 432 goto no_change;
433 list_del_init(&ci->i_cap_delay_list); 433 list_del_init(&ci->i_cap_delay_list);
434 } 434 }
435 list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); 435 list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
436 no_change: 436 no_change:
437 spin_unlock(&mdsc->cap_delay_lock); 437 spin_unlock(&mdsc->cap_delay_lock);
438 } 438 }
439 } 439 }
440 440
441 /* 441 /*
442 * Queue an inode for immediate writeback. Mark inode with I_FLUSH, 442 * Queue an inode for immediate writeback. Mark inode with I_FLUSH,
443 * indicating we should send a cap message to flush dirty metadata 443 * indicating we should send a cap message to flush dirty metadata
444 * asap, and move to the front of the delayed cap list. 444 * asap, and move to the front of the delayed cap list.
445 */ 445 */
446 static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, 446 static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
447 struct ceph_inode_info *ci) 447 struct ceph_inode_info *ci)
448 { 448 {
449 dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode); 449 dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
450 spin_lock(&mdsc->cap_delay_lock); 450 spin_lock(&mdsc->cap_delay_lock);
451 ci->i_ceph_flags |= CEPH_I_FLUSH; 451 ci->i_ceph_flags |= CEPH_I_FLUSH;
452 if (!list_empty(&ci->i_cap_delay_list)) 452 if (!list_empty(&ci->i_cap_delay_list))
453 list_del_init(&ci->i_cap_delay_list); 453 list_del_init(&ci->i_cap_delay_list);
454 list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list); 454 list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
455 spin_unlock(&mdsc->cap_delay_lock); 455 spin_unlock(&mdsc->cap_delay_lock);
456 } 456 }
457 457
458 /* 458 /*
459 * Cancel delayed work on cap. 459 * Cancel delayed work on cap.
460 * 460 *
461 * Caller must hold i_ceph_lock. 461 * Caller must hold i_ceph_lock.
462 */ 462 */
463 static void __cap_delay_cancel(struct ceph_mds_client *mdsc, 463 static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
464 struct ceph_inode_info *ci) 464 struct ceph_inode_info *ci)
465 { 465 {
466 dout("__cap_delay_cancel %p\n", &ci->vfs_inode); 466 dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
467 if (list_empty(&ci->i_cap_delay_list)) 467 if (list_empty(&ci->i_cap_delay_list))
468 return; 468 return;
469 spin_lock(&mdsc->cap_delay_lock); 469 spin_lock(&mdsc->cap_delay_lock);
470 list_del_init(&ci->i_cap_delay_list); 470 list_del_init(&ci->i_cap_delay_list);
471 spin_unlock(&mdsc->cap_delay_lock); 471 spin_unlock(&mdsc->cap_delay_lock);
472 } 472 }
473 473
474 /* 474 /*
475 * Common issue checks for add_cap, handle_cap_grant. 475 * Common issue checks for add_cap, handle_cap_grant.
476 */ 476 */
477 static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, 477 static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
478 unsigned issued) 478 unsigned issued)
479 { 479 {
480 unsigned had = __ceph_caps_issued(ci, NULL); 480 unsigned had = __ceph_caps_issued(ci, NULL);
481 481
482 /* 482 /*
483 * Each time we receive FILE_CACHE anew, we increment 483 * Each time we receive FILE_CACHE anew, we increment
484 * i_rdcache_gen. 484 * i_rdcache_gen.
485 */ 485 */
486 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && 486 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
487 (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) 487 (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
488 ci->i_rdcache_gen++; 488 ci->i_rdcache_gen++;
489 489
490 /* 490 /*
491 * if we are newly issued FILE_SHARED, clear D_COMPLETE; we 491 * if we are newly issued FILE_SHARED, clear D_COMPLETE; we
492 * don't know what happened to this directory while we didn't 492 * don't know what happened to this directory while we didn't
493 * have the cap. 493 * have the cap.
494 */ 494 */
495 if ((issued & CEPH_CAP_FILE_SHARED) && 495 if ((issued & CEPH_CAP_FILE_SHARED) &&
496 (had & CEPH_CAP_FILE_SHARED) == 0) { 496 (had & CEPH_CAP_FILE_SHARED) == 0) {
497 ci->i_shared_gen++; 497 ci->i_shared_gen++;
498 if (S_ISDIR(ci->vfs_inode.i_mode)) 498 if (S_ISDIR(ci->vfs_inode.i_mode))
499 ceph_dir_clear_complete(&ci->vfs_inode); 499 ceph_dir_clear_complete(&ci->vfs_inode);
500 } 500 }
501 } 501 }
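
__check_cap_issue() uses generation counters for cheap bulk invalidation: when FILE_CACHE (or FILE_SHARED) is granted anew rather than held continuously, bumping i_rdcache_gen (or i_shared_gen) marks everything tagged with the old generation as stale. A sketch of the idiom, with an illustrative flag value:

#include <stdio.h>

#define FILE_CACHE 0x1

static unsigned rdcache_gen;

/* bump the generation only on a fresh grant: the bit is present in
 * "issued" but was absent from "had" */
static void check_issue(unsigned issued, unsigned had)
{
    if ((issued & FILE_CACHE) && !(had & FILE_CACHE))
        rdcache_gen++;
}

int main(void)
{
    check_issue(FILE_CACHE, 0);              /* newly issued: bumps */
    check_issue(FILE_CACHE, FILE_CACHE);     /* still held: no bump */
    printf("rdcache_gen=%u\n", rdcache_gen); /* prints 1 */
    return 0;
}
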
502 502
503 /* 503 /*
504 * Add a capability under the given MDS session. 504 * Add a capability under the given MDS session.
505 * 505 *
506 * Caller should hold session snap_rwsem (read) and s_mutex. 506 * Caller should hold session snap_rwsem (read) and s_mutex.
507 * 507 *
508 * @fmode is the open file mode, if we are opening a file, otherwise 508 * @fmode is the open file mode, if we are opening a file, otherwise
509 * it is < 0. (This is so we can atomically add the cap and add an 509 * it is < 0. (This is so we can atomically add the cap and add an
510 * open file reference to it.) 510 * open file reference to it.)
511 */ 511 */
512 int ceph_add_cap(struct inode *inode, 512 int ceph_add_cap(struct inode *inode,
513 struct ceph_mds_session *session, u64 cap_id, 513 struct ceph_mds_session *session, u64 cap_id,
514 int fmode, unsigned issued, unsigned wanted, 514 int fmode, unsigned issued, unsigned wanted,
515 unsigned seq, unsigned mseq, u64 realmino, int flags, 515 unsigned seq, unsigned mseq, u64 realmino, int flags,
516 struct ceph_cap_reservation *caps_reservation) 516 struct ceph_cap_reservation *caps_reservation)
517 { 517 {
518 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 518 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
519 struct ceph_inode_info *ci = ceph_inode(inode); 519 struct ceph_inode_info *ci = ceph_inode(inode);
520 struct ceph_cap *new_cap = NULL; 520 struct ceph_cap *new_cap = NULL;
521 struct ceph_cap *cap; 521 struct ceph_cap *cap;
522 int mds = session->s_mds; 522 int mds = session->s_mds;
523 int actual_wanted; 523 int actual_wanted;
524 524
525 dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, 525 dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
526 session->s_mds, cap_id, ceph_cap_string(issued), seq); 526 session->s_mds, cap_id, ceph_cap_string(issued), seq);
527 527
528 /* 528 /*
529 * If we are opening the file, include file mode wanted bits 529 * If we are opening the file, include file mode wanted bits
530 * in wanted. 530 * in wanted.
531 */ 531 */
532 if (fmode >= 0) 532 if (fmode >= 0)
533 wanted |= ceph_caps_for_mode(fmode); 533 wanted |= ceph_caps_for_mode(fmode);
534 534
535 retry: 535 retry:
536 spin_lock(&ci->i_ceph_lock); 536 spin_lock(&ci->i_ceph_lock);
537 cap = __get_cap_for_mds(ci, mds); 537 cap = __get_cap_for_mds(ci, mds);
538 if (!cap) { 538 if (!cap) {
539 if (new_cap) { 539 if (new_cap) {
540 cap = new_cap; 540 cap = new_cap;
541 new_cap = NULL; 541 new_cap = NULL;
542 } else { 542 } else {
543 spin_unlock(&ci->i_ceph_lock); 543 spin_unlock(&ci->i_ceph_lock);
544 new_cap = get_cap(mdsc, caps_reservation); 544 new_cap = get_cap(mdsc, caps_reservation);
545 if (new_cap == NULL) 545 if (new_cap == NULL)
546 return -ENOMEM; 546 return -ENOMEM;
547 goto retry; 547 goto retry;
548 } 548 }
549 549
550 cap->issued = 0; 550 cap->issued = 0;
551 cap->implemented = 0; 551 cap->implemented = 0;
552 cap->mds = mds; 552 cap->mds = mds;
553 cap->mds_wanted = 0; 553 cap->mds_wanted = 0;
554 554
555 cap->ci = ci; 555 cap->ci = ci;
556 __insert_cap_node(ci, cap); 556 __insert_cap_node(ci, cap);
557 557
558 /* clear out old exporting info? (i.e. on cap import) */ 558 /* clear out old exporting info? (i.e. on cap import) */
559 if (ci->i_cap_exporting_mds == mds) { 559 if (ci->i_cap_exporting_mds == mds) {
560 ci->i_cap_exporting_issued = 0; 560 ci->i_cap_exporting_issued = 0;
561 ci->i_cap_exporting_mseq = 0; 561 ci->i_cap_exporting_mseq = 0;
562 ci->i_cap_exporting_mds = -1; 562 ci->i_cap_exporting_mds = -1;
563 } 563 }
564 564
565 /* add to session cap list */ 565 /* add to session cap list */
566 cap->session = session; 566 cap->session = session;
567 spin_lock(&session->s_cap_lock); 567 spin_lock(&session->s_cap_lock);
568 list_add_tail(&cap->session_caps, &session->s_caps); 568 list_add_tail(&cap->session_caps, &session->s_caps);
569 session->s_nr_caps++; 569 session->s_nr_caps++;
570 spin_unlock(&session->s_cap_lock); 570 spin_unlock(&session->s_cap_lock);
571 } else if (new_cap) 571 } else if (new_cap)
572 ceph_put_cap(mdsc, new_cap); 572 ceph_put_cap(mdsc, new_cap);
573 573
574 if (!ci->i_snap_realm) { 574 if (!ci->i_snap_realm) {
575 /* 575 /*
576 * add this inode to the appropriate snap realm 576 * add this inode to the appropriate snap realm
577 */ 577 */
578 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, 578 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
579 realmino); 579 realmino);
580 if (realm) { 580 if (realm) {
581 ceph_get_snap_realm(mdsc, realm); 581 ceph_get_snap_realm(mdsc, realm);
582 spin_lock(&realm->inodes_with_caps_lock); 582 spin_lock(&realm->inodes_with_caps_lock);
583 ci->i_snap_realm = realm; 583 ci->i_snap_realm = realm;
584 list_add(&ci->i_snap_realm_item, 584 list_add(&ci->i_snap_realm_item,
585 &realm->inodes_with_caps); 585 &realm->inodes_with_caps);
586 spin_unlock(&realm->inodes_with_caps_lock); 586 spin_unlock(&realm->inodes_with_caps_lock);
587 } else { 587 } else {
588 pr_err("ceph_add_cap: couldn't find snap realm %llx\n", 588 pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
589 realmino); 589 realmino);
590 WARN_ON(!realm); 590 WARN_ON(!realm);
591 } 591 }
592 } 592 }
593 593
594 __check_cap_issue(ci, cap, issued); 594 __check_cap_issue(ci, cap, issued);
595 595
596 /* 596 /*
597 * If we are issued caps we don't want, or the mds' wanted 597 * If we are issued caps we don't want, or the mds' wanted
598 * value appears to be off, queue a check so we'll release 598 * value appears to be off, queue a check so we'll release
599 * later and/or update the mds wanted value. 599 * later and/or update the mds wanted value.
600 */ 600 */
601 actual_wanted = __ceph_caps_wanted(ci); 601 actual_wanted = __ceph_caps_wanted(ci);
602 if ((wanted & ~actual_wanted) || 602 if ((wanted & ~actual_wanted) ||
603 (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) { 603 (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
604 dout(" issued %s, mds wanted %s, actual %s, queueing\n", 604 dout(" issued %s, mds wanted %s, actual %s, queueing\n",
605 ceph_cap_string(issued), ceph_cap_string(wanted), 605 ceph_cap_string(issued), ceph_cap_string(wanted),
606 ceph_cap_string(actual_wanted)); 606 ceph_cap_string(actual_wanted));
607 __cap_delay_requeue(mdsc, ci); 607 __cap_delay_requeue(mdsc, ci);
608 } 608 }
609 609
610 if (flags & CEPH_CAP_FLAG_AUTH) 610 if (flags & CEPH_CAP_FLAG_AUTH)
611 ci->i_auth_cap = cap; 611 ci->i_auth_cap = cap;
612 else if (ci->i_auth_cap == cap) 612 else if (ci->i_auth_cap == cap)
613 ci->i_auth_cap = NULL; 613 ci->i_auth_cap = NULL;
614 614
615 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", 615 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
616 inode, ceph_vinop(inode), cap, ceph_cap_string(issued), 616 inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
617 ceph_cap_string(issued|cap->issued), seq, mds); 617 ceph_cap_string(issued|cap->issued), seq, mds);
618 cap->cap_id = cap_id; 618 cap->cap_id = cap_id;
619 cap->issued = issued; 619 cap->issued = issued;
620 cap->implemented |= issued; 620 cap->implemented |= issued;
621 cap->mds_wanted |= wanted; 621 cap->mds_wanted |= wanted;
622 cap->seq = seq; 622 cap->seq = seq;
623 cap->issue_seq = seq; 623 cap->issue_seq = seq;
624 cap->mseq = mseq; 624 cap->mseq = mseq;
625 cap->cap_gen = session->s_cap_gen; 625 cap->cap_gen = session->s_cap_gen;
626 626
627 if (fmode >= 0) 627 if (fmode >= 0)
628 __ceph_get_fmode(ci, fmode); 628 __ceph_get_fmode(ci, fmode);
629 spin_unlock(&ci->i_ceph_lock); 629 spin_unlock(&ci->i_ceph_lock);
630 wake_up_all(&ci->i_cap_wq); 630 wake_up_all(&ci->i_cap_wq);
631 return 0; 631 return 0;
632 } 632 }
633 633
634 /* 634 /*
635 * Return true if cap has not timed out and belongs to the current 635 * Return true if cap has not timed out and belongs to the current
636 * generation of the MDS session (i.e. has not gone 'stale' due to 636 * generation of the MDS session (i.e. has not gone 'stale' due to
637 * us losing touch with the mds). 637 * us losing touch with the mds).
638 */ 638 */
639 static int __cap_is_valid(struct ceph_cap *cap) 639 static int __cap_is_valid(struct ceph_cap *cap)
640 { 640 {
641 unsigned long ttl; 641 unsigned long ttl;
642 u32 gen; 642 u32 gen;
643 643
644 spin_lock(&cap->session->s_cap_lock); 644 spin_lock(&cap->session->s_gen_ttl_lock);
645 gen = cap->session->s_cap_gen; 645 gen = cap->session->s_cap_gen;
646 ttl = cap->session->s_cap_ttl; 646 ttl = cap->session->s_cap_ttl;
647 spin_unlock(&cap->session->s_cap_lock); 647 spin_unlock(&cap->session->s_gen_ttl_lock);
648 648
649 if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { 649 if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
650 dout("__cap_is_valid %p cap %p issued %s " 650 dout("__cap_is_valid %p cap %p issued %s "
651 "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode, 651 "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
652 cap, ceph_cap_string(cap->issued), cap->cap_gen, gen); 652 cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
653 return 0; 653 return 0;
654 } 654 }
655 655
656 return 1; 656 return 1;
657 } 657 }
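
This hunk is the "create a new session lock to avoid lock inversion" change from this merge: the (s_cap_gen, s_cap_ttl) pair moves from s_cap_lock to a dedicated s_gen_ttl_lock, so __cap_is_valid() no longer contends with the session cap-list lock. The pattern is snapshot-then-test: copy the pair under the lock, then judge the cap lock-free. A userspace sketch:

#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t gen_ttl_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned session_gen = 2;
static time_t session_ttl;    /* absolute expiry time */

static int cap_is_valid(unsigned cap_gen)
{
    unsigned gen;
    time_t ttl;

    /* snapshot the pair under its dedicated lock ... */
    pthread_mutex_lock(&gen_ttl_lock);
    gen = session_gen;
    ttl = session_ttl;
    pthread_mutex_unlock(&gen_ttl_lock);

    /* ... then test with no lock held: stale if issued under an
     * older generation or if the session lease has expired */
    return cap_gen >= gen && time(NULL) < ttl;
}

int main(void)
{
    session_ttl = time(NULL) + 60;
    printf("gen1 valid: %d\n", cap_is_valid(1));    /* 0: stale */
    printf("gen2 valid: %d\n", cap_is_valid(2));    /* 1 */
    return 0;
}
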
658 658
659 /* 659 /*
660 * Return set of valid cap bits issued to us. Note that caps time 660 * Return set of valid cap bits issued to us. Note that caps time
661 * out, and may be invalidated in bulk if the client session times out 661 * out, and may be invalidated in bulk if the client session times out
662 * and session->s_cap_gen is bumped. 662 * and session->s_cap_gen is bumped.
663 */ 663 */
664 int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) 664 int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
665 { 665 {
666 int have = ci->i_snap_caps | ci->i_cap_exporting_issued; 666 int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
667 struct ceph_cap *cap; 667 struct ceph_cap *cap;
668 struct rb_node *p; 668 struct rb_node *p;
669 669
670 if (implemented) 670 if (implemented)
671 *implemented = 0; 671 *implemented = 0;
672 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 672 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
673 cap = rb_entry(p, struct ceph_cap, ci_node); 673 cap = rb_entry(p, struct ceph_cap, ci_node);
674 if (!__cap_is_valid(cap)) 674 if (!__cap_is_valid(cap))
675 continue; 675 continue;
676 dout("__ceph_caps_issued %p cap %p issued %s\n", 676 dout("__ceph_caps_issued %p cap %p issued %s\n",
677 &ci->vfs_inode, cap, ceph_cap_string(cap->issued)); 677 &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
678 have |= cap->issued; 678 have |= cap->issued;
679 if (implemented) 679 if (implemented)
680 *implemented |= cap->implemented; 680 *implemented |= cap->implemented;
681 } 681 }
682 return have; 682 return have;
683 } 683 }
684 684
685 /* 685 /*
686 * Get cap bits issued by caps other than @ocap 686 * Get cap bits issued by caps other than @ocap
687 */ 687 */
688 int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap) 688 int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
689 { 689 {
690 int have = ci->i_snap_caps; 690 int have = ci->i_snap_caps;
691 struct ceph_cap *cap; 691 struct ceph_cap *cap;
692 struct rb_node *p; 692 struct rb_node *p;
693 693
694 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 694 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
695 cap = rb_entry(p, struct ceph_cap, ci_node); 695 cap = rb_entry(p, struct ceph_cap, ci_node);
696 if (cap == ocap) 696 if (cap == ocap)
697 continue; 697 continue;
698 if (!__cap_is_valid(cap)) 698 if (!__cap_is_valid(cap))
699 continue; 699 continue;
700 have |= cap->issued; 700 have |= cap->issued;
701 } 701 }
702 return have; 702 return have;
703 } 703 }
704 704
705 /* 705 /*
706 * Move a cap to the end of the LRU (oldest caps at list head, newest 706 * Move a cap to the end of the LRU (oldest caps at list head, newest
707 * at list tail). 707 * at list tail).
708 */ 708 */
709 static void __touch_cap(struct ceph_cap *cap) 709 static void __touch_cap(struct ceph_cap *cap)
710 { 710 {
711 struct ceph_mds_session *s = cap->session; 711 struct ceph_mds_session *s = cap->session;
712 712
713 spin_lock(&s->s_cap_lock); 713 spin_lock(&s->s_cap_lock);
714 if (s->s_cap_iterator == NULL) { 714 if (s->s_cap_iterator == NULL) {
715 dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap, 715 dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
716 s->s_mds); 716 s->s_mds);
717 list_move_tail(&cap->session_caps, &s->s_caps); 717 list_move_tail(&cap->session_caps, &s->s_caps);
718 } else { 718 } else {
719 dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n", 719 dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
720 &cap->ci->vfs_inode, cap, s->s_mds); 720 &cap->ci->vfs_inode, cap, s->s_mds);
721 } 721 }
722 spin_unlock(&s->s_cap_lock); 722 spin_unlock(&s->s_cap_lock);
723 } 723 }
724 724
725 /* 725 /*
726 * Check if we hold the given mask. If so, move the cap(s) to the 726 * Check if we hold the given mask. If so, move the cap(s) to the
727 * front of their respective LRUs. (This is the preferred way for 727 * front of their respective LRUs. (This is the preferred way for
728 * callers to check for caps they want.) 728 * callers to check for caps they want.)
729 */ 729 */
730 int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) 730 int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
731 { 731 {
732 struct ceph_cap *cap; 732 struct ceph_cap *cap;
733 struct rb_node *p; 733 struct rb_node *p;
734 int have = ci->i_snap_caps; 734 int have = ci->i_snap_caps;
735 735
736 if ((have & mask) == mask) { 736 if ((have & mask) == mask) {
737 dout("__ceph_caps_issued_mask %p snap issued %s" 737 dout("__ceph_caps_issued_mask %p snap issued %s"
738 " (mask %s)\n", &ci->vfs_inode, 738 " (mask %s)\n", &ci->vfs_inode,
739 ceph_cap_string(have), 739 ceph_cap_string(have),
740 ceph_cap_string(mask)); 740 ceph_cap_string(mask));
741 return 1; 741 return 1;
742 } 742 }
743 743
744 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 744 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
745 cap = rb_entry(p, struct ceph_cap, ci_node); 745 cap = rb_entry(p, struct ceph_cap, ci_node);
746 if (!__cap_is_valid(cap)) 746 if (!__cap_is_valid(cap))
747 continue; 747 continue;
748 if ((cap->issued & mask) == mask) { 748 if ((cap->issued & mask) == mask) {
749 dout("__ceph_caps_issued_mask %p cap %p issued %s" 749 dout("__ceph_caps_issued_mask %p cap %p issued %s"
750 " (mask %s)\n", &ci->vfs_inode, cap, 750 " (mask %s)\n", &ci->vfs_inode, cap,
751 ceph_cap_string(cap->issued), 751 ceph_cap_string(cap->issued),
752 ceph_cap_string(mask)); 752 ceph_cap_string(mask));
753 if (touch) 753 if (touch)
754 __touch_cap(cap); 754 __touch_cap(cap);
755 return 1; 755 return 1;
756 } 756 }
757 757
758 /* does a combination of caps satisfy mask? */ 758 /* does a combination of caps satisfy mask? */
759 have |= cap->issued; 759 have |= cap->issued;
760 if ((have & mask) == mask) { 760 if ((have & mask) == mask) {
761 dout("__ceph_caps_issued_mask %p combo issued %s" 761 dout("__ceph_caps_issued_mask %p combo issued %s"
762 " (mask %s)\n", &ci->vfs_inode, 762 " (mask %s)\n", &ci->vfs_inode,
763 ceph_cap_string(cap->issued), 763 ceph_cap_string(cap->issued),
764 ceph_cap_string(mask)); 764 ceph_cap_string(mask));
765 if (touch) { 765 if (touch) {
766 struct rb_node *q; 766 struct rb_node *q;
767 767
768 /* touch this + preceding caps */ 768 /* touch this + preceding caps */
769 __touch_cap(cap); 769 __touch_cap(cap);
770 for (q = rb_first(&ci->i_caps); q != p; 770 for (q = rb_first(&ci->i_caps); q != p;
771 q = rb_next(q)) { 771 q = rb_next(q)) {
772 cap = rb_entry(q, struct ceph_cap, 772 cap = rb_entry(q, struct ceph_cap,
773 ci_node); 773 ci_node);
774 if (!__cap_is_valid(cap)) 774 if (!__cap_is_valid(cap))
775 continue; 775 continue;
776 __touch_cap(cap); 776 __touch_cap(cap);
777 } 777 }
778 } 778 }
779 return 1; 779 return 1;
780 } 780 }
781 } 781 }
782 782
783 return 0; 783 return 0;
784 } 784 }
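
__ceph_caps_issued_mask() answers "do we hold all bits in mask?" in two steps: first it checks whether any single cap covers the whole mask, then it falls back to the running union of every valid cap seen so far. A sketch of that two-step test, with illustrative bit values:

#include <stdio.h>

int main(void)
{
    unsigned caps[] = { 0x1, 0x4 };    /* no single cap has both bits */
    unsigned mask = 0x5, have = 0;
    int i, ok = 0;

    for (i = 0; i < 2; i++) {
        if ((caps[i] & mask) == mask) {
            ok = 1;    /* one cap covers the whole mask */
            break;
        }
        have |= caps[i];    /* does a combination satisfy it? */
        if ((have & mask) == mask) {
            ok = 1;
            break;
        }
    }
    printf("mask satisfied: %s\n", ok ? "yes" : "no");    /* yes */
    return 0;
}
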
785 785
786 /* 786 /*
787 * Return true if mask caps are currently being revoked by an MDS. 787 * Return true if mask caps are currently being revoked by an MDS.
788 */ 788 */
789 int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) 789 int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
790 { 790 {
791 struct inode *inode = &ci->vfs_inode; 791 struct inode *inode = &ci->vfs_inode;
792 struct ceph_cap *cap; 792 struct ceph_cap *cap;
793 struct rb_node *p; 793 struct rb_node *p;
794 int ret = 0; 794 int ret = 0;
795 795
796 spin_lock(&ci->i_ceph_lock); 796 spin_lock(&ci->i_ceph_lock);
797 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 797 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
798 cap = rb_entry(p, struct ceph_cap, ci_node); 798 cap = rb_entry(p, struct ceph_cap, ci_node);
799 if (__cap_is_valid(cap) && 799 if (__cap_is_valid(cap) &&
800 (cap->implemented & ~cap->issued & mask)) { 800 (cap->implemented & ~cap->issued & mask)) {
801 ret = 1; 801 ret = 1;
802 break; 802 break;
803 } 803 }
804 } 804 }
805 spin_unlock(&ci->i_ceph_lock); 805 spin_unlock(&ci->i_ceph_lock);
806 dout("ceph_caps_revoking %p %s = %d\n", inode, 806 dout("ceph_caps_revoking %p %s = %d\n", inode,
807 ceph_cap_string(mask), ret); 807 ceph_cap_string(mask), ret);
808 return ret; 808 return ret;
809 } 809 }
810 810
811 int __ceph_caps_used(struct ceph_inode_info *ci) 811 int __ceph_caps_used(struct ceph_inode_info *ci)
812 { 812 {
813 int used = 0; 813 int used = 0;
814 if (ci->i_pin_ref) 814 if (ci->i_pin_ref)
815 used |= CEPH_CAP_PIN; 815 used |= CEPH_CAP_PIN;
816 if (ci->i_rd_ref) 816 if (ci->i_rd_ref)
817 used |= CEPH_CAP_FILE_RD; 817 used |= CEPH_CAP_FILE_RD;
818 if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages) 818 if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages)
819 used |= CEPH_CAP_FILE_CACHE; 819 used |= CEPH_CAP_FILE_CACHE;
820 if (ci->i_wr_ref) 820 if (ci->i_wr_ref)
821 used |= CEPH_CAP_FILE_WR; 821 used |= CEPH_CAP_FILE_WR;
822 if (ci->i_wb_ref || ci->i_wrbuffer_ref) 822 if (ci->i_wb_ref || ci->i_wrbuffer_ref)
823 used |= CEPH_CAP_FILE_BUFFER; 823 used |= CEPH_CAP_FILE_BUFFER;
824 return used; 824 return used;
825 } 825 }
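Each of the reference counters checked above pins exactly one capability bit. As a standalone illustration (placeholder constants and hypothetical names, not the kernel's definitions), the mapping boils down to:

	#include <stdio.h>

	/* Bit values here are illustrative placeholders, not the kernel's. */
	#define CAP_PIN		0x01
	#define CAP_FILE_RD	0x02
	#define CAP_FILE_CACHE	0x04
	#define CAP_FILE_WR	0x08
	#define CAP_FILE_BUFFER	0x10

	struct model_inode {
		int pin_ref, rd_ref, rdcache_ref, wr_ref, wb_ref;
		unsigned long nrpages;
	};

	/* Mirrors the mapping above: any live reference keeps its bit "used". */
	static int model_caps_used(const struct model_inode *ci)
	{
		int used = 0;

		if (ci->pin_ref)
			used |= CAP_PIN;
		if (ci->rd_ref)
			used |= CAP_FILE_RD;
		if (ci->rdcache_ref || ci->nrpages)
			used |= CAP_FILE_CACHE;	/* cached pages alone pin FILE_CACHE */
		if (ci->wr_ref)
			used |= CAP_FILE_WR;
		if (ci->wb_ref)
			used |= CAP_FILE_BUFFER;
		return used;
	}

	int main(void)
	{
		struct model_inode ci = { .rd_ref = 1, .nrpages = 4 };

		printf("used = 0x%x\n", model_caps_used(&ci));	/* RD | CACHE */
		return 0;
	}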
826 826
827 /* 827 /*
828 * wanted, by virtue of open file modes 828 * wanted, by virtue of open file modes
829 */ 829 */
830 int __ceph_caps_file_wanted(struct ceph_inode_info *ci) 830 int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
831 { 831 {
832 int want = 0; 832 int want = 0;
833 int mode; 833 int mode;
834 for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++) 834 for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++)
835 if (ci->i_nr_by_mode[mode]) 835 if (ci->i_nr_by_mode[mode])
836 want |= ceph_caps_for_mode(mode); 836 want |= ceph_caps_for_mode(mode);
837 return want; 837 return want;
838 } 838 }
839 839
840 /* 840 /*
841 * Return caps we have registered with the MDS(s) as 'wanted'. 841 * Return caps we have registered with the MDS(s) as 'wanted'.
842 */ 842 */
843 int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) 843 int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
844 { 844 {
845 struct ceph_cap *cap; 845 struct ceph_cap *cap;
846 struct rb_node *p; 846 struct rb_node *p;
847 int mds_wanted = 0; 847 int mds_wanted = 0;
848 848
849 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 849 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
850 cap = rb_entry(p, struct ceph_cap, ci_node); 850 cap = rb_entry(p, struct ceph_cap, ci_node);
851 if (!__cap_is_valid(cap)) 851 if (!__cap_is_valid(cap))
852 continue; 852 continue;
853 mds_wanted |= cap->mds_wanted; 853 mds_wanted |= cap->mds_wanted;
854 } 854 }
855 return mds_wanted; 855 return mds_wanted;
856 } 856 }
857 857
858 /* 858 /*
859 * called under i_ceph_lock 859 * called under i_ceph_lock
860 */ 860 */
861 static int __ceph_is_any_caps(struct ceph_inode_info *ci) 861 static int __ceph_is_any_caps(struct ceph_inode_info *ci)
862 { 862 {
863 return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; 863 return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
864 } 864 }
865 865
866 /* 866 /*
867 * Remove a cap. Take steps to deal with a racing iterate_session_caps. 867 * Remove a cap. Take steps to deal with a racing iterate_session_caps.
868 * 868 *
869 * caller should hold i_ceph_lock. 869 * caller should hold i_ceph_lock.
870 * caller will not hold session s_mutex if called from destroy_inode. 870 * caller will not hold session s_mutex if called from destroy_inode.
871 */ 871 */
872 void __ceph_remove_cap(struct ceph_cap *cap) 872 void __ceph_remove_cap(struct ceph_cap *cap)
873 { 873 {
874 struct ceph_mds_session *session = cap->session; 874 struct ceph_mds_session *session = cap->session;
875 struct ceph_inode_info *ci = cap->ci; 875 struct ceph_inode_info *ci = cap->ci;
876 struct ceph_mds_client *mdsc = 876 struct ceph_mds_client *mdsc =
877 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; 877 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
878 int removed = 0; 878 int removed = 0;
879 879
880 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); 880 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
881 881
882 /* remove from session list */ 882 /* remove from session list */
883 spin_lock(&session->s_cap_lock); 883 spin_lock(&session->s_cap_lock);
884 if (session->s_cap_iterator == cap) { 884 if (session->s_cap_iterator == cap) {
885 /* not yet, we are iterating over this very cap */ 885 /* not yet, we are iterating over this very cap */
886 dout("__ceph_remove_cap delaying %p removal from session %p\n", 886 dout("__ceph_remove_cap delaying %p removal from session %p\n",
887 cap, cap->session); 887 cap, cap->session);
888 } else { 888 } else {
889 list_del_init(&cap->session_caps); 889 list_del_init(&cap->session_caps);
890 session->s_nr_caps--; 890 session->s_nr_caps--;
891 cap->session = NULL; 891 cap->session = NULL;
892 removed = 1; 892 removed = 1;
893 } 893 }
894 /* protect backpointer with s_cap_lock: see iterate_session_caps */ 894 /* protect backpointer with s_cap_lock: see iterate_session_caps */
895 cap->ci = NULL; 895 cap->ci = NULL;
896 spin_unlock(&session->s_cap_lock); 896 spin_unlock(&session->s_cap_lock);
897 897
898 /* remove from inode list */ 898 /* remove from inode list */
899 rb_erase(&cap->ci_node, &ci->i_caps); 899 rb_erase(&cap->ci_node, &ci->i_caps);
900 if (ci->i_auth_cap == cap) 900 if (ci->i_auth_cap == cap)
901 ci->i_auth_cap = NULL; 901 ci->i_auth_cap = NULL;
902 902
903 if (removed) 903 if (removed)
904 ceph_put_cap(mdsc, cap); 904 ceph_put_cap(mdsc, cap);
905 905
906 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) { 906 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
907 struct ceph_snap_realm *realm = ci->i_snap_realm; 907 struct ceph_snap_realm *realm = ci->i_snap_realm;
908 spin_lock(&realm->inodes_with_caps_lock); 908 spin_lock(&realm->inodes_with_caps_lock);
909 list_del_init(&ci->i_snap_realm_item); 909 list_del_init(&ci->i_snap_realm_item);
910 ci->i_snap_realm_counter++; 910 ci->i_snap_realm_counter++;
911 ci->i_snap_realm = NULL; 911 ci->i_snap_realm = NULL;
912 spin_unlock(&realm->inodes_with_caps_lock); 912 spin_unlock(&realm->inodes_with_caps_lock);
913 ceph_put_snap_realm(mdsc, realm); 913 ceph_put_snap_realm(mdsc, realm);
914 } 914 }
915 if (!__ceph_is_any_real_caps(ci)) 915 if (!__ceph_is_any_real_caps(ci))
916 __cap_delay_cancel(mdsc, ci); 916 __cap_delay_cancel(mdsc, ci);
917 } 917 }
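The s_cap_iterator test above is an instance of a general deferred-removal pattern: if a concurrent iterator is parked on the element, only mark it and let the iterator complete the unlink. A minimal single-threaded sketch of that pattern, with hypothetical names and locking elided:

	struct node {
		struct node *next;
		int removed;		/* deferred-removal flag */
	};

	struct iter_ctx {
		struct node *cursor;	/* node a concurrent iterator is parked on */
	};

	/* Remove n from a singly linked list headed at *headp, unless an
	 * iterator is parked on it; in that case only mark it and let the
	 * iterator finish the unlink when it moves on. */
	static int remove_or_defer(struct node **headp, struct node *n,
				   struct iter_ctx *it)
	{
		struct node **pp;

		if (it->cursor == n) {
			n->removed = 1;
			return 0;	/* deferred */
		}
		for (pp = headp; *pp; pp = &(*pp)->next) {
			if (*pp == n) {
				*pp = n->next;
				break;
			}
		}
		return 1;		/* unlinked now */
	}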
918 918
919 /* 919 /*
920 * Build and send a cap message to the given MDS. 920 * Build and send a cap message to the given MDS.
921 * 921 *
922 * Caller should be holding s_mutex. 922 * Caller should be holding s_mutex.
923 */ 923 */
924 static int send_cap_msg(struct ceph_mds_session *session, 924 static int send_cap_msg(struct ceph_mds_session *session,
925 u64 ino, u64 cid, int op, 925 u64 ino, u64 cid, int op,
926 int caps, int wanted, int dirty, 926 int caps, int wanted, int dirty,
927 u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq, 927 u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
928 u64 size, u64 max_size, 928 u64 size, u64 max_size,
929 struct timespec *mtime, struct timespec *atime, 929 struct timespec *mtime, struct timespec *atime,
930 u64 time_warp_seq, 930 u64 time_warp_seq,
931 uid_t uid, gid_t gid, umode_t mode, 931 uid_t uid, gid_t gid, umode_t mode,
932 u64 xattr_version, 932 u64 xattr_version,
933 struct ceph_buffer *xattrs_buf, 933 struct ceph_buffer *xattrs_buf,
934 u64 follows) 934 u64 follows)
935 { 935 {
936 struct ceph_mds_caps *fc; 936 struct ceph_mds_caps *fc;
937 struct ceph_msg *msg; 937 struct ceph_msg *msg;
938 938
939 dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" 939 dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
940 " seq %u/%u mseq %u follows %lld size %llu/%llu" 940 " seq %u/%u mseq %u follows %lld size %llu/%llu"
941 " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op), 941 " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
942 cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted), 942 cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
943 ceph_cap_string(dirty), 943 ceph_cap_string(dirty),
944 seq, issue_seq, mseq, follows, size, max_size, 944 seq, issue_seq, mseq, follows, size, max_size,
945 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); 945 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
946 946
947 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false); 947 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false);
948 if (!msg) 948 if (!msg)
949 return -ENOMEM; 949 return -ENOMEM;
950 950
951 msg->hdr.tid = cpu_to_le64(flush_tid); 951 msg->hdr.tid = cpu_to_le64(flush_tid);
952 952
953 fc = msg->front.iov_base; 953 fc = msg->front.iov_base;
954 memset(fc, 0, sizeof(*fc)); 954 memset(fc, 0, sizeof(*fc));
955 955
956 fc->cap_id = cpu_to_le64(cid); 956 fc->cap_id = cpu_to_le64(cid);
957 fc->op = cpu_to_le32(op); 957 fc->op = cpu_to_le32(op);
958 fc->seq = cpu_to_le32(seq); 958 fc->seq = cpu_to_le32(seq);
959 fc->issue_seq = cpu_to_le32(issue_seq); 959 fc->issue_seq = cpu_to_le32(issue_seq);
960 fc->migrate_seq = cpu_to_le32(mseq); 960 fc->migrate_seq = cpu_to_le32(mseq);
961 fc->caps = cpu_to_le32(caps); 961 fc->caps = cpu_to_le32(caps);
962 fc->wanted = cpu_to_le32(wanted); 962 fc->wanted = cpu_to_le32(wanted);
963 fc->dirty = cpu_to_le32(dirty); 963 fc->dirty = cpu_to_le32(dirty);
964 fc->ino = cpu_to_le64(ino); 964 fc->ino = cpu_to_le64(ino);
965 fc->snap_follows = cpu_to_le64(follows); 965 fc->snap_follows = cpu_to_le64(follows);
966 966
967 fc->size = cpu_to_le64(size); 967 fc->size = cpu_to_le64(size);
968 fc->max_size = cpu_to_le64(max_size); 968 fc->max_size = cpu_to_le64(max_size);
969 if (mtime) 969 if (mtime)
970 ceph_encode_timespec(&fc->mtime, mtime); 970 ceph_encode_timespec(&fc->mtime, mtime);
971 if (atime) 971 if (atime)
972 ceph_encode_timespec(&fc->atime, atime); 972 ceph_encode_timespec(&fc->atime, atime);
973 fc->time_warp_seq = cpu_to_le32(time_warp_seq); 973 fc->time_warp_seq = cpu_to_le32(time_warp_seq);
974 974
975 fc->uid = cpu_to_le32(uid); 975 fc->uid = cpu_to_le32(uid);
976 fc->gid = cpu_to_le32(gid); 976 fc->gid = cpu_to_le32(gid);
977 fc->mode = cpu_to_le32(mode); 977 fc->mode = cpu_to_le32(mode);
978 978
979 fc->xattr_version = cpu_to_le64(xattr_version); 979 fc->xattr_version = cpu_to_le64(xattr_version);
980 if (xattrs_buf) { 980 if (xattrs_buf) {
981 msg->middle = ceph_buffer_get(xattrs_buf); 981 msg->middle = ceph_buffer_get(xattrs_buf);
982 fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len); 982 fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
983 msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len); 983 msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
984 } 984 }
985 985
986 ceph_con_send(&session->s_con, msg); 986 ceph_con_send(&session->s_con, msg);
987 return 0; 987 return 0;
988 } 988 }
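All multi-byte fields above pass through cpu_to_le32()/cpu_to_le64() because the wire format is little-endian regardless of host byte order. For readers unfamiliar with those helpers, this standalone function models the effect for the 32-bit case (an illustration, not the kernel implementation):

	#include <stdint.h>

	/* What cpu_to_le32() guarantees: the value is laid out
	 * least-significant byte first on every host. */
	static void put_le32(uint8_t *buf, uint32_t v)
	{
		buf[0] = (uint8_t)(v & 0xff);
		buf[1] = (uint8_t)((v >> 8) & 0xff);
		buf[2] = (uint8_t)((v >> 16) & 0xff);
		buf[3] = (uint8_t)((v >> 24) & 0xff);
	}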
989 989
990 static void __queue_cap_release(struct ceph_mds_session *session, 990 static void __queue_cap_release(struct ceph_mds_session *session,
991 u64 ino, u64 cap_id, u32 migrate_seq, 991 u64 ino, u64 cap_id, u32 migrate_seq,
992 u32 issue_seq) 992 u32 issue_seq)
993 { 993 {
994 struct ceph_msg *msg; 994 struct ceph_msg *msg;
995 struct ceph_mds_cap_release *head; 995 struct ceph_mds_cap_release *head;
996 struct ceph_mds_cap_item *item; 996 struct ceph_mds_cap_item *item;
997 997
998 spin_lock(&session->s_cap_lock); 998 spin_lock(&session->s_cap_lock);
999 BUG_ON(!session->s_num_cap_releases); 999 BUG_ON(!session->s_num_cap_releases);
1000 msg = list_first_entry(&session->s_cap_releases, 1000 msg = list_first_entry(&session->s_cap_releases,
1001 struct ceph_msg, list_head); 1001 struct ceph_msg, list_head);
1002 1002
1003 dout(" adding %llx release to mds%d msg %p (%d left)\n", 1003 dout(" adding %llx release to mds%d msg %p (%d left)\n",
1004 ino, session->s_mds, msg, session->s_num_cap_releases); 1004 ino, session->s_mds, msg, session->s_num_cap_releases);
1005 1005
1006 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); 1006 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1007 head = msg->front.iov_base; 1007 head = msg->front.iov_base;
1008 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1); 1008 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1009 item = msg->front.iov_base + msg->front.iov_len; 1009 item = msg->front.iov_base + msg->front.iov_len;
1010 item->ino = cpu_to_le64(ino); 1010 item->ino = cpu_to_le64(ino);
1011 item->cap_id = cpu_to_le64(cap_id); 1011 item->cap_id = cpu_to_le64(cap_id);
1012 item->migrate_seq = cpu_to_le32(migrate_seq); 1012 item->migrate_seq = cpu_to_le32(migrate_seq);
1013 item->seq = cpu_to_le32(issue_seq); 1013 item->seq = cpu_to_le32(issue_seq);
1014 1014
1015 session->s_num_cap_releases--; 1015 session->s_num_cap_releases--;
1016 1016
1017 msg->front.iov_len += sizeof(*item); 1017 msg->front.iov_len += sizeof(*item);
1018 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { 1018 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1019 dout(" release msg %p full\n", msg); 1019 dout(" release msg %p full\n", msg);
1020 list_move_tail(&msg->list_head, &session->s_cap_releases_done); 1020 list_move_tail(&msg->list_head, &session->s_cap_releases_done);
1021 } else { 1021 } else {
1022 dout(" release msg %p at %d/%d (%d)\n", msg, 1022 dout(" release msg %p at %d/%d (%d)\n", msg,
1023 (int)le32_to_cpu(head->num), 1023 (int)le32_to_cpu(head->num),
1024 (int)CEPH_CAPS_PER_RELEASE, 1024 (int)CEPH_CAPS_PER_RELEASE,
1025 (int)msg->front.iov_len); 1025 (int)msg->front.iov_len);
1026 } 1026 }
1027 spin_unlock(&session->s_cap_lock); 1027 spin_unlock(&session->s_cap_lock);
1028 } 1028 }
1029 1029
1030 /* 1030 /*
1031 * Queue cap releases when an inode is dropped from our cache. Since 1031 * Queue cap releases when an inode is dropped from our cache. Since
1032 * the inode is about to be destroyed, there is no need for i_ceph_lock. 1032 * the inode is about to be destroyed, there is no need for i_ceph_lock.
1033 */ 1033 */
1034 void ceph_queue_caps_release(struct inode *inode) 1034 void ceph_queue_caps_release(struct inode *inode)
1035 { 1035 {
1036 struct ceph_inode_info *ci = ceph_inode(inode); 1036 struct ceph_inode_info *ci = ceph_inode(inode);
1037 struct rb_node *p; 1037 struct rb_node *p;
1038 1038
1039 p = rb_first(&ci->i_caps); 1039 p = rb_first(&ci->i_caps);
1040 while (p) { 1040 while (p) {
1041 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); 1041 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
1042 struct ceph_mds_session *session = cap->session; 1042 struct ceph_mds_session *session = cap->session;
1043 1043
1044 __queue_cap_release(session, ceph_ino(inode), cap->cap_id, 1044 __queue_cap_release(session, ceph_ino(inode), cap->cap_id,
1045 cap->mseq, cap->issue_seq); 1045 cap->mseq, cap->issue_seq);
1046 p = rb_next(p); 1046 p = rb_next(p);
1047 __ceph_remove_cap(cap); 1047 __ceph_remove_cap(cap);
1048 } 1048 }
1049 } 1049 }
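__queue_cap_release() above appends fixed-size records to the head message and rotates it onto the done list once CEPH_CAPS_PER_RELEASE records fit. Stripped to its core, the batching decision is just this (hypothetical names, illustrative capacity):

	#define ITEMS_PER_MSG	32	/* stand-in for CEPH_CAPS_PER_RELEASE */

	struct rel_msg {
		int num;		/* records appended so far */
	};

	/* Append one release record; nonzero means the message just filled
	 * and should rotate to the "done" list, as list_move_tail() does
	 * above. */
	static int rel_msg_add(struct rel_msg *m)
	{
		return ++m->num == ITEMS_PER_MSG;
	}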
1050 1050
1051 /* 1051 /*
1052 * Send a cap msg on the given inode. Update our caps state, then 1052 * Send a cap msg on the given inode. Update our caps state, then
1053 * drop i_ceph_lock and send the message. 1053 * drop i_ceph_lock and send the message.
1054 * 1054 *
1055 * Make note of max_size reported/requested from the mds, and of revoked caps 1055 * Make note of max_size reported/requested from the mds, and of revoked caps
1056 * that have now been implemented. 1056 * that have now been implemented.
1057 * 1057 *
1058 * Make a half-hearted attempt to invalidate the page cache if we are 1058 * Make a half-hearted attempt to invalidate the page cache if we are
1059 * dropping RDCACHE. Note that this will leave behind locked pages 1059 * dropping RDCACHE. Note that this will leave behind locked pages
1060 * that we'll then need to deal with elsewhere. 1060 * that we'll then need to deal with elsewhere.
1061 * 1061 *
1062 * Return non-zero if delayed release, or we experienced an error 1062 * Return non-zero if delayed release, or we experienced an error
1063 * such that the caller should requeue + retry later. 1063 * such that the caller should requeue + retry later.
1064 * 1064 *
1065 * called with i_ceph_lock, then drops it. 1065 * called with i_ceph_lock, then drops it.
1066 * caller should hold snap_rwsem (read), s_mutex. 1066 * caller should hold snap_rwsem (read), s_mutex.
1067 */ 1067 */
1068 static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, 1068 static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1069 int op, int used, int want, int retain, int flushing, 1069 int op, int used, int want, int retain, int flushing,
1070 unsigned *pflush_tid) 1070 unsigned *pflush_tid)
1071 __releases(cap->ci->i_ceph_lock) 1071 __releases(cap->ci->i_ceph_lock)
1072 { 1072 {
1073 struct ceph_inode_info *ci = cap->ci; 1073 struct ceph_inode_info *ci = cap->ci;
1074 struct inode *inode = &ci->vfs_inode; 1074 struct inode *inode = &ci->vfs_inode;
1075 u64 cap_id = cap->cap_id; 1075 u64 cap_id = cap->cap_id;
1076 int held, revoking, dropping, keep; 1076 int held, revoking, dropping, keep;
1077 u64 seq, issue_seq, mseq, time_warp_seq, follows; 1077 u64 seq, issue_seq, mseq, time_warp_seq, follows;
1078 u64 size, max_size; 1078 u64 size, max_size;
1079 struct timespec mtime, atime; 1079 struct timespec mtime, atime;
1080 int wake = 0; 1080 int wake = 0;
1081 umode_t mode; 1081 umode_t mode;
1082 uid_t uid; 1082 uid_t uid;
1083 gid_t gid; 1083 gid_t gid;
1084 struct ceph_mds_session *session; 1084 struct ceph_mds_session *session;
1085 u64 xattr_version = 0; 1085 u64 xattr_version = 0;
1086 struct ceph_buffer *xattr_blob = NULL; 1086 struct ceph_buffer *xattr_blob = NULL;
1087 int delayed = 0; 1087 int delayed = 0;
1088 u64 flush_tid = 0; 1088 u64 flush_tid = 0;
1089 int i; 1089 int i;
1090 int ret; 1090 int ret;
1091 1091
1092 held = cap->issued | cap->implemented; 1092 held = cap->issued | cap->implemented;
1093 revoking = cap->implemented & ~cap->issued; 1093 revoking = cap->implemented & ~cap->issued;
1094 retain &= ~revoking; 1094 retain &= ~revoking;
1095 dropping = cap->issued & ~retain; 1095 dropping = cap->issued & ~retain;
1096 1096
1097 dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n", 1097 dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
1098 inode, cap, cap->session, 1098 inode, cap, cap->session,
1099 ceph_cap_string(held), ceph_cap_string(held & retain), 1099 ceph_cap_string(held), ceph_cap_string(held & retain),
1100 ceph_cap_string(revoking)); 1100 ceph_cap_string(revoking));
1101 BUG_ON((retain & CEPH_CAP_PIN) == 0); 1101 BUG_ON((retain & CEPH_CAP_PIN) == 0);
1102 1102
1103 session = cap->session; 1103 session = cap->session;
1104 1104
1105 /* don't release wanted unless we've waited a bit. */ 1105 /* don't release wanted unless we've waited a bit. */
1106 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && 1106 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1107 time_before(jiffies, ci->i_hold_caps_min)) { 1107 time_before(jiffies, ci->i_hold_caps_min)) {
1108 dout(" delaying issued %s -> %s, wanted %s -> %s on send\n", 1108 dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
1109 ceph_cap_string(cap->issued), 1109 ceph_cap_string(cap->issued),
1110 ceph_cap_string(cap->issued & retain), 1110 ceph_cap_string(cap->issued & retain),
1111 ceph_cap_string(cap->mds_wanted), 1111 ceph_cap_string(cap->mds_wanted),
1112 ceph_cap_string(want)); 1112 ceph_cap_string(want));
1113 want |= cap->mds_wanted; 1113 want |= cap->mds_wanted;
1114 retain |= cap->issued; 1114 retain |= cap->issued;
1115 delayed = 1; 1115 delayed = 1;
1116 } 1116 }
1117 ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH); 1117 ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
1118 1118
1119 cap->issued &= retain; /* drop bits we don't want */ 1119 cap->issued &= retain; /* drop bits we don't want */
1120 if (cap->implemented & ~cap->issued) { 1120 if (cap->implemented & ~cap->issued) {
1121 /* 1121 /*
1122 * Wake up any waiters on wanted -> needed transition. 1122 * Wake up any waiters on wanted -> needed transition.
1123 * This is due to the weird transition from buffered 1123 * This is due to the weird transition from buffered
1124 * to sync IO... we need to flush dirty pages _before_ 1124 * to sync IO... we need to flush dirty pages _before_
1125 * allowing sync writes to avoid reordering. 1125 * allowing sync writes to avoid reordering.
1126 */ 1126 */
1127 wake = 1; 1127 wake = 1;
1128 } 1128 }
1129 cap->implemented &= cap->issued | used; 1129 cap->implemented &= cap->issued | used;
1130 cap->mds_wanted = want; 1130 cap->mds_wanted = want;
1131 1131
1132 if (flushing) { 1132 if (flushing) {
1133 /* 1133 /*
1134 * assign a tid for flush operations so we can avoid 1134 * assign a tid for flush operations so we can avoid
1135 * flush1 -> dirty1 -> flush2 -> flushack1 -> mark 1135 * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
1136 * clean type races. track latest tid for every bit 1136 * clean type races. track latest tid for every bit
1137 * so we can handle flush AxFw, flush Fw, and have the 1137 * so we can handle flush AxFw, flush Fw, and have the
1138 * first ack clean Ax. 1138 * first ack clean Ax.
1139 */ 1139 */
1140 flush_tid = ++ci->i_cap_flush_last_tid; 1140 flush_tid = ++ci->i_cap_flush_last_tid;
1141 if (pflush_tid) 1141 if (pflush_tid)
1142 *pflush_tid = flush_tid; 1142 *pflush_tid = flush_tid;
1143 dout(" cap_flush_tid %d\n", (int)flush_tid); 1143 dout(" cap_flush_tid %d\n", (int)flush_tid);
1144 for (i = 0; i < CEPH_CAP_BITS; i++) 1144 for (i = 0; i < CEPH_CAP_BITS; i++)
1145 if (flushing & (1 << i)) 1145 if (flushing & (1 << i))
1146 ci->i_cap_flush_tid[i] = flush_tid; 1146 ci->i_cap_flush_tid[i] = flush_tid;
1147 1147
1148 follows = ci->i_head_snapc->seq; 1148 follows = ci->i_head_snapc->seq;
1149 } else { 1149 } else {
1150 follows = 0; 1150 follows = 0;
1151 } 1151 }
1152 1152
1153 keep = cap->implemented; 1153 keep = cap->implemented;
1154 seq = cap->seq; 1154 seq = cap->seq;
1155 issue_seq = cap->issue_seq; 1155 issue_seq = cap->issue_seq;
1156 mseq = cap->mseq; 1156 mseq = cap->mseq;
1157 size = inode->i_size; 1157 size = inode->i_size;
1158 ci->i_reported_size = size; 1158 ci->i_reported_size = size;
1159 max_size = ci->i_wanted_max_size; 1159 max_size = ci->i_wanted_max_size;
1160 ci->i_requested_max_size = max_size; 1160 ci->i_requested_max_size = max_size;
1161 mtime = inode->i_mtime; 1161 mtime = inode->i_mtime;
1162 atime = inode->i_atime; 1162 atime = inode->i_atime;
1163 time_warp_seq = ci->i_time_warp_seq; 1163 time_warp_seq = ci->i_time_warp_seq;
1164 uid = inode->i_uid; 1164 uid = inode->i_uid;
1165 gid = inode->i_gid; 1165 gid = inode->i_gid;
1166 mode = inode->i_mode; 1166 mode = inode->i_mode;
1167 1167
1168 if (flushing & CEPH_CAP_XATTR_EXCL) { 1168 if (flushing & CEPH_CAP_XATTR_EXCL) {
1169 __ceph_build_xattrs_blob(ci); 1169 __ceph_build_xattrs_blob(ci);
1170 xattr_blob = ci->i_xattrs.blob; 1170 xattr_blob = ci->i_xattrs.blob;
1171 xattr_version = ci->i_xattrs.version; 1171 xattr_version = ci->i_xattrs.version;
1172 } 1172 }
1173 1173
1174 spin_unlock(&ci->i_ceph_lock); 1174 spin_unlock(&ci->i_ceph_lock);
1175 1175
1176 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, 1176 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
1177 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, 1177 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
1178 size, max_size, &mtime, &atime, time_warp_seq, 1178 size, max_size, &mtime, &atime, time_warp_seq,
1179 uid, gid, mode, xattr_version, xattr_blob, 1179 uid, gid, mode, xattr_version, xattr_blob,
1180 follows); 1180 follows);
1181 if (ret < 0) { 1181 if (ret < 0) {
1182 dout("error sending cap msg, must requeue %p\n", inode); 1182 dout("error sending cap msg, must requeue %p\n", inode);
1183 delayed = 1; 1183 delayed = 1;
1184 } 1184 }
1185 1185
1186 if (wake) 1186 if (wake)
1187 wake_up_all(&ci->i_cap_wq); 1187 wake_up_all(&ci->i_cap_wq);
1188 1188
1189 return delayed; 1189 return delayed;
1190 } 1190 }
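The per-bit tid bookkeeping in __send_cap() is what later lets an ack (and caps_are_flushed(), further below) decide which bits a given reply actually covers. A compact standalone model of both halves, assuming 32 cap bits and using hypothetical names:

	#include <stdint.h>

	#define CAP_BITS	32	/* stands in for CEPH_CAP_BITS */

	struct flush_state {
		uint64_t last_tid;		/* i_cap_flush_last_tid analogue */
		uint64_t bit_tid[CAP_BITS];	/* latest flush tid per cap bit */
		uint32_t flushing;		/* bits currently in flight */
	};

	/* Starting a flush stamps every bit being flushed with one fresh tid. */
	static uint64_t start_flush(struct flush_state *fs, uint32_t bits)
	{
		uint64_t tid = ++fs->last_tid;
		int i;

		for (i = 0; i < CAP_BITS; i++)
			if (bits & (1u << i))
				fs->bit_tid[i] = tid;
		fs->flushing |= bits;
		return tid;
	}

	/* An ack for tid t cleans only bits whose newest flush is <= t, so a
	 * stale ack can never clear a bit re-flushed under a later tid. */
	static void ack_flush(struct flush_state *fs, uint64_t t)
	{
		int i;

		for (i = 0; i < CAP_BITS; i++)
			if ((fs->flushing & (1u << i)) && fs->bit_tid[i] <= t)
				fs->flushing &= ~(1u << i);
	}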
1191 1191
1192 /* 1192 /*
1193 * When a snapshot is taken, clients accumulate dirty metadata on 1193 * When a snapshot is taken, clients accumulate dirty metadata on
1194 * inodes with capabilities in ceph_cap_snaps to describe the file 1194 * inodes with capabilities in ceph_cap_snaps to describe the file
1195 * state at the time the snapshot was taken. This must be flushed 1195 * state at the time the snapshot was taken. This must be flushed
1196 * asynchronously back to the MDS once sync writes complete and dirty 1196 * asynchronously back to the MDS once sync writes complete and dirty
1197 * data is written out. 1197 * data is written out.
1198 * 1198 *
1199 * Unless @again is true, skip cap_snaps that were already sent to 1199 * Unless @again is true, skip cap_snaps that were already sent to
1200 * the MDS (i.e., during this session). 1200 * the MDS (i.e., during this session).
1201 * 1201 *
1202 * Called under i_ceph_lock. Takes s_mutex as needed. 1202 * Called under i_ceph_lock. Takes s_mutex as needed.
1203 */ 1203 */
1204 void __ceph_flush_snaps(struct ceph_inode_info *ci, 1204 void __ceph_flush_snaps(struct ceph_inode_info *ci,
1205 struct ceph_mds_session **psession, 1205 struct ceph_mds_session **psession,
1206 int again) 1206 int again)
1207 __releases(ci->i_ceph_lock) 1207 __releases(ci->i_ceph_lock)
1208 __acquires(ci->i_ceph_lock) 1208 __acquires(ci->i_ceph_lock)
1209 { 1209 {
1210 struct inode *inode = &ci->vfs_inode; 1210 struct inode *inode = &ci->vfs_inode;
1211 int mds; 1211 int mds;
1212 struct ceph_cap_snap *capsnap; 1212 struct ceph_cap_snap *capsnap;
1213 u32 mseq; 1213 u32 mseq;
1214 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 1214 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1215 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold 1215 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
1216 session->s_mutex */ 1216 session->s_mutex */
1217 u64 next_follows = 0; /* keep track of how far we've gotten through the 1217 u64 next_follows = 0; /* keep track of how far we've gotten through the
1218 i_cap_snaps list, and skip these entries next time 1218 i_cap_snaps list, and skip these entries next time
1219 around to avoid an infinite loop */ 1219 around to avoid an infinite loop */
1220 1220
1221 if (psession) 1221 if (psession)
1222 session = *psession; 1222 session = *psession;
1223 1223
1224 dout("__flush_snaps %p\n", inode); 1224 dout("__flush_snaps %p\n", inode);
1225 retry: 1225 retry:
1226 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 1226 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
1227 /* avoid an infinite loop after retry */ 1227 /* avoid an infinite loop after retry */
1228 if (capsnap->follows < next_follows) 1228 if (capsnap->follows < next_follows)
1229 continue; 1229 continue;
1230 /* 1230 /*
1231 * we need to wait for sync writes to complete and for dirty 1231 * we need to wait for sync writes to complete and for dirty
1232 * pages to be written out. 1232 * pages to be written out.
1233 */ 1233 */
1234 if (capsnap->dirty_pages || capsnap->writing) 1234 if (capsnap->dirty_pages || capsnap->writing)
1235 break; 1235 break;
1236 1236
1237 /* 1237 /*
1238 * if cap writeback already occurred, we should have dropped 1238 * if cap writeback already occurred, we should have dropped
1239 * the capsnap in ceph_put_wrbuffer_cap_refs. 1239 * the capsnap in ceph_put_wrbuffer_cap_refs.
1240 */ 1240 */
1241 BUG_ON(capsnap->dirty == 0); 1241 BUG_ON(capsnap->dirty == 0);
1242 1242
1243 /* pick mds, take s_mutex */ 1243 /* pick mds, take s_mutex */
1244 if (ci->i_auth_cap == NULL) { 1244 if (ci->i_auth_cap == NULL) {
1245 dout("no auth cap (migrating?), doing nothing\n"); 1245 dout("no auth cap (migrating?), doing nothing\n");
1246 goto out; 1246 goto out;
1247 } 1247 }
1248 1248
1249 /* only flush each capsnap once */ 1249 /* only flush each capsnap once */
1250 if (!again && !list_empty(&capsnap->flushing_item)) { 1250 if (!again && !list_empty(&capsnap->flushing_item)) {
1251 dout("already flushed %p, skipping\n", capsnap); 1251 dout("already flushed %p, skipping\n", capsnap);
1252 continue; 1252 continue;
1253 } 1253 }
1254 1254
1255 mds = ci->i_auth_cap->session->s_mds; 1255 mds = ci->i_auth_cap->session->s_mds;
1256 mseq = ci->i_auth_cap->mseq; 1256 mseq = ci->i_auth_cap->mseq;
1257 1257
1258 if (session && session->s_mds != mds) { 1258 if (session && session->s_mds != mds) {
1259 dout("oops, wrong session %p mutex\n", session); 1259 dout("oops, wrong session %p mutex\n", session);
1260 mutex_unlock(&session->s_mutex); 1260 mutex_unlock(&session->s_mutex);
1261 ceph_put_mds_session(session); 1261 ceph_put_mds_session(session);
1262 session = NULL; 1262 session = NULL;
1263 } 1263 }
1264 if (!session) { 1264 if (!session) {
1265 spin_unlock(&ci->i_ceph_lock); 1265 spin_unlock(&ci->i_ceph_lock);
1266 mutex_lock(&mdsc->mutex); 1266 mutex_lock(&mdsc->mutex);
1267 session = __ceph_lookup_mds_session(mdsc, mds); 1267 session = __ceph_lookup_mds_session(mdsc, mds);
1268 mutex_unlock(&mdsc->mutex); 1268 mutex_unlock(&mdsc->mutex);
1269 if (session) { 1269 if (session) {
1270 dout("inverting session/ino locks on %p\n", 1270 dout("inverting session/ino locks on %p\n",
1271 session); 1271 session);
1272 mutex_lock(&session->s_mutex); 1272 mutex_lock(&session->s_mutex);
1273 } 1273 }
1274 /* 1274 /*
1275 * if session == NULL, we raced against a cap 1275 * if session == NULL, we raced against a cap
1276 * deletion or migration. retry, and we'll 1276 * deletion or migration. retry, and we'll
1277 * get a better @mds value next time. 1277 * get a better @mds value next time.
1278 */ 1278 */
1279 spin_lock(&ci->i_ceph_lock); 1279 spin_lock(&ci->i_ceph_lock);
1280 goto retry; 1280 goto retry;
1281 } 1281 }
1282 1282
1283 capsnap->flush_tid = ++ci->i_cap_flush_last_tid; 1283 capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
1284 atomic_inc(&capsnap->nref); 1284 atomic_inc(&capsnap->nref);
1285 if (!list_empty(&capsnap->flushing_item)) 1285 if (!list_empty(&capsnap->flushing_item))
1286 list_del_init(&capsnap->flushing_item); 1286 list_del_init(&capsnap->flushing_item);
1287 list_add_tail(&capsnap->flushing_item, 1287 list_add_tail(&capsnap->flushing_item,
1288 &session->s_cap_snaps_flushing); 1288 &session->s_cap_snaps_flushing);
1289 spin_unlock(&ci->i_ceph_lock); 1289 spin_unlock(&ci->i_ceph_lock);
1290 1290
1291 dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", 1291 dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
1292 inode, capsnap, capsnap->follows, capsnap->flush_tid); 1292 inode, capsnap, capsnap->follows, capsnap->flush_tid);
1293 send_cap_msg(session, ceph_vino(inode).ino, 0, 1293 send_cap_msg(session, ceph_vino(inode).ino, 0,
1294 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, 1294 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
1295 capsnap->dirty, 0, capsnap->flush_tid, 0, mseq, 1295 capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
1296 capsnap->size, 0, 1296 capsnap->size, 0,
1297 &capsnap->mtime, &capsnap->atime, 1297 &capsnap->mtime, &capsnap->atime,
1298 capsnap->time_warp_seq, 1298 capsnap->time_warp_seq,
1299 capsnap->uid, capsnap->gid, capsnap->mode, 1299 capsnap->uid, capsnap->gid, capsnap->mode,
1300 capsnap->xattr_version, capsnap->xattr_blob, 1300 capsnap->xattr_version, capsnap->xattr_blob,
1301 capsnap->follows); 1301 capsnap->follows);
1302 1302
1303 next_follows = capsnap->follows + 1; 1303 next_follows = capsnap->follows + 1;
1304 ceph_put_cap_snap(capsnap); 1304 ceph_put_cap_snap(capsnap);
1305 1305
1306 spin_lock(&ci->i_ceph_lock); 1306 spin_lock(&ci->i_ceph_lock);
1307 goto retry; 1307 goto retry;
1308 } 1308 }
1309 1309
1310 /* we flushed them all; remove this inode from the queue */ 1310 /* we flushed them all; remove this inode from the queue */
1311 spin_lock(&mdsc->snap_flush_lock); 1311 spin_lock(&mdsc->snap_flush_lock);
1312 list_del_init(&ci->i_snap_flush_item); 1312 list_del_init(&ci->i_snap_flush_item);
1313 spin_unlock(&mdsc->snap_flush_lock); 1313 spin_unlock(&mdsc->snap_flush_lock);
1314 1314
1315 out: 1315 out:
1316 if (psession) 1316 if (psession)
1317 *psession = session; 1317 *psession = session;
1318 else if (session) { 1318 else if (session) {
1319 mutex_unlock(&session->s_mutex); 1319 mutex_unlock(&session->s_mutex);
1320 ceph_put_mds_session(session); 1320 ceph_put_mds_session(session);
1321 } 1321 }
1322 } 1322 }
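The next_follows cursor above implements a resumable scan: the lock is dropped for every send, so the loop restarts from the head of the list but skips anything below the cursor. In isolation the pattern looks like this (hypothetical types, list sorted by ->follows assumed):

	#include <stdint.h>
	#include <stddef.h>

	struct snap {
		uint64_t follows;
		struct snap *next;
	};

	/* Visit each entry once even though the scan restarts from the head
	 * after every flush (the lock is dropped there, so the list may have
	 * changed); next_follows is the advancing cursor that prevents an
	 * infinite loop. */
	static void scan_snaps(struct snap *head, void (*flush)(struct snap *))
	{
		uint64_t next_follows = 0;
		struct snap *s;

	retry:
		for (s = head; s; s = s->next) {
			if (s->follows < next_follows)
				continue;	/* handled on an earlier pass */
			flush(s);		/* real code: unlock, send, relock */
			next_follows = s->follows + 1;
			goto retry;
		}
	}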
1323 1323
1324 static void ceph_flush_snaps(struct ceph_inode_info *ci) 1324 static void ceph_flush_snaps(struct ceph_inode_info *ci)
1325 { 1325 {
1326 spin_lock(&ci->i_ceph_lock); 1326 spin_lock(&ci->i_ceph_lock);
1327 __ceph_flush_snaps(ci, NULL, 0); 1327 __ceph_flush_snaps(ci, NULL, 0);
1328 spin_unlock(&ci->i_ceph_lock); 1328 spin_unlock(&ci->i_ceph_lock);
1329 } 1329 }
1330 1330
1331 /* 1331 /*
1332 * Mark caps dirty. If inode is newly dirty, return the dirty flags. 1332 * Mark caps dirty. If inode is newly dirty, return the dirty flags.
1333 * Caller is then responsible for calling __mark_inode_dirty with the 1333 * Caller is then responsible for calling __mark_inode_dirty with the
1334 * returned flags value. 1334 * returned flags value.
1335 */ 1335 */
1336 int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) 1336 int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1337 { 1337 {
1338 struct ceph_mds_client *mdsc = 1338 struct ceph_mds_client *mdsc =
1339 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; 1339 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
1340 struct inode *inode = &ci->vfs_inode; 1340 struct inode *inode = &ci->vfs_inode;
1341 int was = ci->i_dirty_caps; 1341 int was = ci->i_dirty_caps;
1342 int dirty = 0; 1342 int dirty = 0;
1343 1343
1344 dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode, 1344 dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
1345 ceph_cap_string(mask), ceph_cap_string(was), 1345 ceph_cap_string(mask), ceph_cap_string(was),
1346 ceph_cap_string(was | mask)); 1346 ceph_cap_string(was | mask));
1347 ci->i_dirty_caps |= mask; 1347 ci->i_dirty_caps |= mask;
1348 if (was == 0) { 1348 if (was == 0) {
1349 if (!ci->i_head_snapc) 1349 if (!ci->i_head_snapc)
1350 ci->i_head_snapc = ceph_get_snap_context( 1350 ci->i_head_snapc = ceph_get_snap_context(
1351 ci->i_snap_realm->cached_context); 1351 ci->i_snap_realm->cached_context);
1352 dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode, 1352 dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode,
1353 ci->i_head_snapc); 1353 ci->i_head_snapc);
1354 BUG_ON(!list_empty(&ci->i_dirty_item)); 1354 BUG_ON(!list_empty(&ci->i_dirty_item));
1355 spin_lock(&mdsc->cap_dirty_lock); 1355 spin_lock(&mdsc->cap_dirty_lock);
1356 list_add(&ci->i_dirty_item, &mdsc->cap_dirty); 1356 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1357 spin_unlock(&mdsc->cap_dirty_lock); 1357 spin_unlock(&mdsc->cap_dirty_lock);
1358 if (ci->i_flushing_caps == 0) { 1358 if (ci->i_flushing_caps == 0) {
1359 ihold(inode); 1359 ihold(inode);
1360 dirty |= I_DIRTY_SYNC; 1360 dirty |= I_DIRTY_SYNC;
1361 } 1361 }
1362 } 1362 }
1363 BUG_ON(list_empty(&ci->i_dirty_item)); 1363 BUG_ON(list_empty(&ci->i_dirty_item));
1364 if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && 1364 if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
1365 (mask & CEPH_CAP_FILE_BUFFER)) 1365 (mask & CEPH_CAP_FILE_BUFFER))
1366 dirty |= I_DIRTY_DATASYNC; 1366 dirty |= I_DIRTY_DATASYNC;
1367 __cap_delay_requeue(mdsc, ci); 1367 __cap_delay_requeue(mdsc, ci);
1368 return dirty; 1368 return dirty;
1369 } 1369 }
1370 1370
1371 /* 1371 /*
1372 * Add the dirty inode to the flushing list. Assign a seq number so we 1372 * Add the dirty inode to the flushing list. Assign a seq number so we
1373 * can wait for caps to flush without starving. 1373 * can wait for caps to flush without starving.
1374 * 1374 *
1375 * Called under i_ceph_lock. 1375 * Called under i_ceph_lock.
1376 */ 1376 */
1377 static int __mark_caps_flushing(struct inode *inode, 1377 static int __mark_caps_flushing(struct inode *inode,
1378 struct ceph_mds_session *session) 1378 struct ceph_mds_session *session)
1379 { 1379 {
1380 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 1380 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
1381 struct ceph_inode_info *ci = ceph_inode(inode); 1381 struct ceph_inode_info *ci = ceph_inode(inode);
1382 int flushing; 1382 int flushing;
1383 1383
1384 BUG_ON(ci->i_dirty_caps == 0); 1384 BUG_ON(ci->i_dirty_caps == 0);
1385 BUG_ON(list_empty(&ci->i_dirty_item)); 1385 BUG_ON(list_empty(&ci->i_dirty_item));
1386 1386
1387 flushing = ci->i_dirty_caps; 1387 flushing = ci->i_dirty_caps;
1388 dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n", 1388 dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
1389 ceph_cap_string(flushing), 1389 ceph_cap_string(flushing),
1390 ceph_cap_string(ci->i_flushing_caps), 1390 ceph_cap_string(ci->i_flushing_caps),
1391 ceph_cap_string(ci->i_flushing_caps | flushing)); 1391 ceph_cap_string(ci->i_flushing_caps | flushing));
1392 ci->i_flushing_caps |= flushing; 1392 ci->i_flushing_caps |= flushing;
1393 ci->i_dirty_caps = 0; 1393 ci->i_dirty_caps = 0;
1394 dout(" inode %p now !dirty\n", inode); 1394 dout(" inode %p now !dirty\n", inode);
1395 1395
1396 spin_lock(&mdsc->cap_dirty_lock); 1396 spin_lock(&mdsc->cap_dirty_lock);
1397 list_del_init(&ci->i_dirty_item); 1397 list_del_init(&ci->i_dirty_item);
1398 1398
1399 ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; 1399 ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
1400 if (list_empty(&ci->i_flushing_item)) { 1400 if (list_empty(&ci->i_flushing_item)) {
1401 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); 1401 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1402 mdsc->num_cap_flushing++; 1402 mdsc->num_cap_flushing++;
1403 dout(" inode %p now flushing seq %lld\n", inode, 1403 dout(" inode %p now flushing seq %lld\n", inode,
1404 ci->i_cap_flush_seq); 1404 ci->i_cap_flush_seq);
1405 } else { 1405 } else {
1406 list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); 1406 list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1407 dout(" inode %p now flushing (more) seq %lld\n", inode, 1407 dout(" inode %p now flushing (more) seq %lld\n", inode,
1408 ci->i_cap_flush_seq); 1408 ci->i_cap_flush_seq);
1409 } 1409 }
1410 spin_unlock(&mdsc->cap_dirty_lock); 1410 spin_unlock(&mdsc->cap_dirty_lock);
1411 1411
1412 return flushing; 1412 return flushing;
1413 } 1413 }
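Together, __ceph_mark_dirty_caps() and __mark_caps_flushing() form a two-stage dirty -> flushing lifecycle, with one sequence number stamped per flush so waiters can be serviced in order. A reduced model (hypothetical names; list and lock handling elided):

	#include <stdint.h>

	struct cap_state {
		uint32_t dirty;		/* changed locally, not yet sent */
		uint32_t flushing;	/* sent to the auth MDS, awaiting ack */
		uint64_t flush_seq;
	};

	/* Stage 1: a metadata change only sets dirty bits. */
	static void mark_dirty(struct cap_state *cs, uint32_t mask)
	{
		cs->dirty |= mask;
	}

	/* Stage 2: a flush moves the whole dirty set into flushing under one
	 * sequence number, so waiters can be woken in order without
	 * starving. */
	static uint32_t mark_flushing(struct cap_state *cs, uint64_t *global_seq)
	{
		uint32_t f = cs->dirty;

		cs->flushing |= f;
		cs->dirty = 0;
		cs->flush_seq = ++*global_seq;
		return f;
	}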
1414 1414
1415 /* 1415 /*
1416 * try to invalidate mapping pages without blocking. 1416 * try to invalidate mapping pages without blocking.
1417 */ 1417 */
1418 static int try_nonblocking_invalidate(struct inode *inode) 1418 static int try_nonblocking_invalidate(struct inode *inode)
1419 { 1419 {
1420 struct ceph_inode_info *ci = ceph_inode(inode); 1420 struct ceph_inode_info *ci = ceph_inode(inode);
1421 u32 invalidating_gen = ci->i_rdcache_gen; 1421 u32 invalidating_gen = ci->i_rdcache_gen;
1422 1422
1423 spin_unlock(&ci->i_ceph_lock); 1423 spin_unlock(&ci->i_ceph_lock);
1424 invalidate_mapping_pages(&inode->i_data, 0, -1); 1424 invalidate_mapping_pages(&inode->i_data, 0, -1);
1425 spin_lock(&ci->i_ceph_lock); 1425 spin_lock(&ci->i_ceph_lock);
1426 1426
1427 if (inode->i_data.nrpages == 0 && 1427 if (inode->i_data.nrpages == 0 &&
1428 invalidating_gen == ci->i_rdcache_gen) { 1428 invalidating_gen == ci->i_rdcache_gen) {
1429 /* success. */ 1429 /* success. */
1430 dout("try_nonblocking_invalidate %p success\n", inode); 1430 dout("try_nonblocking_invalidate %p success\n", inode);
1431 /* save any racing async invalidate some trouble */ 1431 /* save any racing async invalidate some trouble */
1432 ci->i_rdcache_revoking = ci->i_rdcache_gen - 1; 1432 ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
1433 return 0; 1433 return 0;
1434 } 1434 }
1435 dout("try_nonblocking_invalidate %p failed\n", inode); 1435 dout("try_nonblocking_invalidate %p failed\n", inode);
1436 return -1; 1436 return -1;
1437 } 1437 }
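The generation check above is an optimistic-concurrency pattern: snapshot i_rdcache_gen under the lock, do the non-blocking work unlocked, then declare success only if no read raced in. The pattern in isolation (stub locking, hypothetical names):

	#include <stddef.h>

	struct pagecache {
		unsigned gen;		/* bumped whenever new pages are read in */
		size_t nrpages;
	};

	static void cache_lock(struct pagecache *c)	{ (void)c; }
	static void cache_unlock(struct pagecache *c)	{ (void)c; }
	static void drop_clean_pages(struct pagecache *c) { c->nrpages = 0; }

	/* Optimistic invalidate: snapshot the generation under the lock, do
	 * the work unlocked, then succeed only if nothing raced in. */
	static int try_invalidate(struct pagecache *c)
	{
		unsigned gen = c->gen;

		cache_unlock(c);
		drop_clean_pages(c);
		cache_lock(c);

		if (c->nrpages == 0 && gen == c->gen)
			return 0;	/* success */
		return -1;		/* a read raced in; retry later */
	}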
1438 1438
1439 /* 1439 /*
1440 * Swiss army knife function to examine currently used and wanted 1440 * Swiss army knife function to examine currently used and wanted
1441 * versus held caps. Release, flush, ack revoked caps to mds as 1441 * versus held caps. Release, flush, ack revoked caps to mds as
1442 * appropriate. 1442 * appropriate.
1443 * 1443 *
1444 * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay 1444 * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
1445 * cap release further. 1445 * cap release further.
1446 * CHECK_CAPS_AUTHONLY - we should only check the auth cap 1446 * CHECK_CAPS_AUTHONLY - we should only check the auth cap
1447 * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without 1447 * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
1448 * further delay. 1448 * further delay.
1449 */ 1449 */
1450 void ceph_check_caps(struct ceph_inode_info *ci, int flags, 1450 void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1451 struct ceph_mds_session *session) 1451 struct ceph_mds_session *session)
1452 { 1452 {
1453 struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); 1453 struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
1454 struct ceph_mds_client *mdsc = fsc->mdsc; 1454 struct ceph_mds_client *mdsc = fsc->mdsc;
1455 struct inode *inode = &ci->vfs_inode; 1455 struct inode *inode = &ci->vfs_inode;
1456 struct ceph_cap *cap; 1456 struct ceph_cap *cap;
1457 int file_wanted, used; 1457 int file_wanted, used;
1458 int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ 1458 int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
1459 int issued, implemented, want, retain, revoking, flushing = 0; 1459 int issued, implemented, want, retain, revoking, flushing = 0;
1460 int mds = -1; /* keep track of how far we've gone through i_caps list 1460 int mds = -1; /* keep track of how far we've gone through i_caps list
1461 to avoid an infinite loop on retry */ 1461 to avoid an infinite loop on retry */
1462 struct rb_node *p; 1462 struct rb_node *p;
1463 int tried_invalidate = 0; 1463 int tried_invalidate = 0;
1464 int delayed = 0, sent = 0, force_requeue = 0, num; 1464 int delayed = 0, sent = 0, force_requeue = 0, num;
1465 int queue_invalidate = 0; 1465 int queue_invalidate = 0;
1466 int is_delayed = flags & CHECK_CAPS_NODELAY; 1466 int is_delayed = flags & CHECK_CAPS_NODELAY;
1467 1467
1468 /* if we are unmounting, flush any unused caps immediately. */ 1468 /* if we are unmounting, flush any unused caps immediately. */
1469 if (mdsc->stopping) 1469 if (mdsc->stopping)
1470 is_delayed = 1; 1470 is_delayed = 1;
1471 1471
1472 spin_lock(&ci->i_ceph_lock); 1472 spin_lock(&ci->i_ceph_lock);
1473 1473
1474 if (ci->i_ceph_flags & CEPH_I_FLUSH) 1474 if (ci->i_ceph_flags & CEPH_I_FLUSH)
1475 flags |= CHECK_CAPS_FLUSH; 1475 flags |= CHECK_CAPS_FLUSH;
1476 1476
1477 /* flush snaps first time around only */ 1477 /* flush snaps first time around only */
1478 if (!list_empty(&ci->i_cap_snaps)) 1478 if (!list_empty(&ci->i_cap_snaps))
1479 __ceph_flush_snaps(ci, &session, 0); 1479 __ceph_flush_snaps(ci, &session, 0);
1480 goto retry_locked; 1480 goto retry_locked;
1481 retry: 1481 retry:
1482 spin_lock(&ci->i_ceph_lock); 1482 spin_lock(&ci->i_ceph_lock);
1483 retry_locked: 1483 retry_locked:
1484 file_wanted = __ceph_caps_file_wanted(ci); 1484 file_wanted = __ceph_caps_file_wanted(ci);
1485 used = __ceph_caps_used(ci); 1485 used = __ceph_caps_used(ci);
1486 want = file_wanted | used; 1486 want = file_wanted | used;
1487 issued = __ceph_caps_issued(ci, &implemented); 1487 issued = __ceph_caps_issued(ci, &implemented);
1488 revoking = implemented & ~issued; 1488 revoking = implemented & ~issued;
1489 1489
1490 retain = want | CEPH_CAP_PIN; 1490 retain = want | CEPH_CAP_PIN;
1491 if (!mdsc->stopping && inode->i_nlink > 0) { 1491 if (!mdsc->stopping && inode->i_nlink > 0) {
1492 if (want) { 1492 if (want) {
1493 retain |= CEPH_CAP_ANY; /* be greedy */ 1493 retain |= CEPH_CAP_ANY; /* be greedy */
1494 } else { 1494 } else {
1495 retain |= CEPH_CAP_ANY_SHARED; 1495 retain |= CEPH_CAP_ANY_SHARED;
1496 /* 1496 /*
1497 * keep RD only if we didn't have the file open RW, 1497 * keep RD only if we didn't have the file open RW,
1498 * because then the mds would revoke it anyway to 1498 * because then the mds would revoke it anyway to
1499 * journal max_size=0. 1499 * journal max_size=0.
1500 */ 1500 */
1501 if (ci->i_max_size == 0) 1501 if (ci->i_max_size == 0)
1502 retain |= CEPH_CAP_ANY_RD; 1502 retain |= CEPH_CAP_ANY_RD;
1503 } 1503 }
1504 } 1504 }
1505 1505
1506 dout("check_caps %p file_want %s used %s dirty %s flushing %s" 1506 dout("check_caps %p file_want %s used %s dirty %s flushing %s"
1507 " issued %s revoking %s retain %s %s%s%s\n", inode, 1507 " issued %s revoking %s retain %s %s%s%s\n", inode,
1508 ceph_cap_string(file_wanted), 1508 ceph_cap_string(file_wanted),
1509 ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), 1509 ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
1510 ceph_cap_string(ci->i_flushing_caps), 1510 ceph_cap_string(ci->i_flushing_caps),
1511 ceph_cap_string(issued), ceph_cap_string(revoking), 1511 ceph_cap_string(issued), ceph_cap_string(revoking),
1512 ceph_cap_string(retain), 1512 ceph_cap_string(retain),
1513 (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", 1513 (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
1514 (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "", 1514 (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
1515 (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : ""); 1515 (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
1516 1516
1517 /* 1517 /*
1518 * If we no longer need to hold onto our old caps, and we may 1518 * If we no longer need to hold onto our old caps, and we may
1519 * have cached pages, but don't want them, then try to invalidate. 1519 * have cached pages, but don't want them, then try to invalidate.
1520 * If we fail, it's because pages are locked.... try again later. 1520 * If we fail, it's because pages are locked.... try again later.
1521 */ 1521 */
1522 if ((!is_delayed || mdsc->stopping) && 1522 if ((!is_delayed || mdsc->stopping) &&
1523 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ 1523 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
1524 inode->i_data.nrpages && /* have cached pages */ 1524 inode->i_data.nrpages && /* have cached pages */
1525 (file_wanted == 0 || /* no open files */ 1525 (file_wanted == 0 || /* no open files */
1526 (revoking & (CEPH_CAP_FILE_CACHE| 1526 (revoking & (CEPH_CAP_FILE_CACHE|
1527 CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ 1527 CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
1528 !tried_invalidate) { 1528 !tried_invalidate) {
1529 dout("check_caps trying to invalidate on %p\n", inode); 1529 dout("check_caps trying to invalidate on %p\n", inode);
1530 if (try_nonblocking_invalidate(inode) < 0) { 1530 if (try_nonblocking_invalidate(inode) < 0) {
1531 if (revoking & (CEPH_CAP_FILE_CACHE| 1531 if (revoking & (CEPH_CAP_FILE_CACHE|
1532 CEPH_CAP_FILE_LAZYIO)) { 1532 CEPH_CAP_FILE_LAZYIO)) {
1533 dout("check_caps queuing invalidate\n"); 1533 dout("check_caps queuing invalidate\n");
1534 queue_invalidate = 1; 1534 queue_invalidate = 1;
1535 ci->i_rdcache_revoking = ci->i_rdcache_gen; 1535 ci->i_rdcache_revoking = ci->i_rdcache_gen;
1536 } else { 1536 } else {
1537 dout("check_caps failed to invalidate pages\n"); 1537 dout("check_caps failed to invalidate pages\n");
1538 /* we failed to invalidate pages. check these 1538 /* we failed to invalidate pages. check these
1539 caps again later. */ 1539 caps again later. */
1540 force_requeue = 1; 1540 force_requeue = 1;
1541 __cap_set_timeouts(mdsc, ci); 1541 __cap_set_timeouts(mdsc, ci);
1542 } 1542 }
1543 } 1543 }
1544 tried_invalidate = 1; 1544 tried_invalidate = 1;
1545 goto retry_locked; 1545 goto retry_locked;
1546 } 1546 }
1547 1547
1548 num = 0; 1548 num = 0;
1549 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 1549 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
1550 cap = rb_entry(p, struct ceph_cap, ci_node); 1550 cap = rb_entry(p, struct ceph_cap, ci_node);
1551 num++; 1551 num++;
1552 1552
1553 /* avoid looping forever */ 1553 /* avoid looping forever */
1554 if (mds >= cap->mds || 1554 if (mds >= cap->mds ||
1555 ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap)) 1555 ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
1556 continue; 1556 continue;
1557 1557
1558 /* NOTE: no side-effects allowed, until we take s_mutex */ 1558 /* NOTE: no side-effects allowed, until we take s_mutex */
1559 1559
1560 revoking = cap->implemented & ~cap->issued; 1560 revoking = cap->implemented & ~cap->issued;
1561 dout(" mds%d cap %p issued %s implemented %s revoking %s\n", 1561 dout(" mds%d cap %p issued %s implemented %s revoking %s\n",
1562 cap->mds, cap, ceph_cap_string(cap->issued), 1562 cap->mds, cap, ceph_cap_string(cap->issued),
1563 ceph_cap_string(cap->implemented), 1563 ceph_cap_string(cap->implemented),
1564 ceph_cap_string(revoking)); 1564 ceph_cap_string(revoking));
1565 1565
1566 if (cap == ci->i_auth_cap && 1566 if (cap == ci->i_auth_cap &&
1567 (cap->issued & CEPH_CAP_FILE_WR)) { 1567 (cap->issued & CEPH_CAP_FILE_WR)) {
1568 /* request larger max_size from MDS? */ 1568 /* request larger max_size from MDS? */
1569 if (ci->i_wanted_max_size > ci->i_max_size && 1569 if (ci->i_wanted_max_size > ci->i_max_size &&
1570 ci->i_wanted_max_size > ci->i_requested_max_size) { 1570 ci->i_wanted_max_size > ci->i_requested_max_size) {
1571 dout("requesting new max_size\n"); 1571 dout("requesting new max_size\n");
1572 goto ack; 1572 goto ack;
1573 } 1573 }
1574 1574
1575 /* approaching file_max? */ 1575 /* approaching file_max? */
1576 if ((inode->i_size << 1) >= ci->i_max_size && 1576 if ((inode->i_size << 1) >= ci->i_max_size &&
1577 (ci->i_reported_size << 1) < ci->i_max_size) { 1577 (ci->i_reported_size << 1) < ci->i_max_size) {
1578 dout("i_size approaching max_size\n"); 1578 dout("i_size approaching max_size\n");
1579 goto ack; 1579 goto ack;
1580 } 1580 }
1581 } 1581 }
1582 /* flush anything dirty? */ 1582 /* flush anything dirty? */
1583 if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) && 1583 if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
1584 ci->i_dirty_caps) { 1584 ci->i_dirty_caps) {
1585 dout("flushing dirty caps\n"); 1585 dout("flushing dirty caps\n");
1586 goto ack; 1586 goto ack;
1587 } 1587 }
1588 1588
1589 /* completed revocation? going down and there are no caps? */ 1589 /* completed revocation? going down and there are no caps? */
1590 if (revoking && (revoking & used) == 0) { 1590 if (revoking && (revoking & used) == 0) {
1591 dout("completed revocation of %s\n", 1591 dout("completed revocation of %s\n",
1592 ceph_cap_string(cap->implemented & ~cap->issued)); 1592 ceph_cap_string(cap->implemented & ~cap->issued));
1593 goto ack; 1593 goto ack;
1594 } 1594 }
1595 1595
1596 /* want more caps from mds? */ 1596 /* want more caps from mds? */
1597 if (want & ~(cap->mds_wanted | cap->issued)) 1597 if (want & ~(cap->mds_wanted | cap->issued))
1598 goto ack; 1598 goto ack;
1599 1599
1600 /* things we might delay */ 1600 /* things we might delay */
1601 if ((cap->issued & ~retain) == 0 && 1601 if ((cap->issued & ~retain) == 0 &&
1602 cap->mds_wanted == want) 1602 cap->mds_wanted == want)
1603 continue; /* nope, all good */ 1603 continue; /* nope, all good */
1604 1604
1605 if (is_delayed) 1605 if (is_delayed)
1606 goto ack; 1606 goto ack;
1607 1607
1608 /* delay? */ 1608 /* delay? */
1609 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && 1609 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1610 time_before(jiffies, ci->i_hold_caps_max)) { 1610 time_before(jiffies, ci->i_hold_caps_max)) {
1611 dout(" delaying issued %s -> %s, wanted %s -> %s\n", 1611 dout(" delaying issued %s -> %s, wanted %s -> %s\n",
1612 ceph_cap_string(cap->issued), 1612 ceph_cap_string(cap->issued),
1613 ceph_cap_string(cap->issued & retain), 1613 ceph_cap_string(cap->issued & retain),
1614 ceph_cap_string(cap->mds_wanted), 1614 ceph_cap_string(cap->mds_wanted),
1615 ceph_cap_string(want)); 1615 ceph_cap_string(want));
1616 delayed++; 1616 delayed++;
1617 continue; 1617 continue;
1618 } 1618 }
1619 1619
1620 ack: 1620 ack:
1621 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { 1621 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1622 dout(" skipping %p I_NOFLUSH set\n", inode); 1622 dout(" skipping %p I_NOFLUSH set\n", inode);
1623 continue; 1623 continue;
1624 } 1624 }
1625 1625
1626 if (session && session != cap->session) { 1626 if (session && session != cap->session) {
1627 dout("oops, wrong session %p mutex\n", session); 1627 dout("oops, wrong session %p mutex\n", session);
1628 mutex_unlock(&session->s_mutex); 1628 mutex_unlock(&session->s_mutex);
1629 session = NULL; 1629 session = NULL;
1630 } 1630 }
1631 if (!session) { 1631 if (!session) {
1632 session = cap->session; 1632 session = cap->session;
1633 if (mutex_trylock(&session->s_mutex) == 0) { 1633 if (mutex_trylock(&session->s_mutex) == 0) {
1634 dout("inverting session/ino locks on %p\n", 1634 dout("inverting session/ino locks on %p\n",
1635 session); 1635 session);
1636 spin_unlock(&ci->i_ceph_lock); 1636 spin_unlock(&ci->i_ceph_lock);
1637 if (took_snap_rwsem) { 1637 if (took_snap_rwsem) {
1638 up_read(&mdsc->snap_rwsem); 1638 up_read(&mdsc->snap_rwsem);
1639 took_snap_rwsem = 0; 1639 took_snap_rwsem = 0;
1640 } 1640 }
1641 mutex_lock(&session->s_mutex); 1641 mutex_lock(&session->s_mutex);
1642 goto retry; 1642 goto retry;
1643 } 1643 }
1644 } 1644 }
1645 /* take snap_rwsem after session mutex */ 1645 /* take snap_rwsem after session mutex */
1646 if (!took_snap_rwsem) { 1646 if (!took_snap_rwsem) {
1647 if (down_read_trylock(&mdsc->snap_rwsem) == 0) { 1647 if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
1648 dout("inverting snap/in locks on %p\n", 1648 dout("inverting snap/in locks on %p\n",
1649 inode); 1649 inode);
1650 spin_unlock(&ci->i_ceph_lock); 1650 spin_unlock(&ci->i_ceph_lock);
1651 down_read(&mdsc->snap_rwsem); 1651 down_read(&mdsc->snap_rwsem);
1652 took_snap_rwsem = 1; 1652 took_snap_rwsem = 1;
1653 goto retry; 1653 goto retry;
1654 } 1654 }
1655 took_snap_rwsem = 1; 1655 took_snap_rwsem = 1;
1656 } 1656 }
1657 1657
1658 if (cap == ci->i_auth_cap && ci->i_dirty_caps) 1658 if (cap == ci->i_auth_cap && ci->i_dirty_caps)
1659 flushing = __mark_caps_flushing(inode, session); 1659 flushing = __mark_caps_flushing(inode, session);
1660 else 1660 else
1661 flushing = 0; 1661 flushing = 0;
1662 1662
1663 mds = cap->mds; /* remember mds, so we don't repeat */ 1663 mds = cap->mds; /* remember mds, so we don't repeat */
1664 sent++; 1664 sent++;
1665 1665
1666 /* __send_cap drops i_ceph_lock */ 1666 /* __send_cap drops i_ceph_lock */
1667 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, 1667 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
1668 retain, flushing, NULL); 1668 retain, flushing, NULL);
1669 goto retry; /* retake i_ceph_lock and restart our cap scan. */ 1669 goto retry; /* retake i_ceph_lock and restart our cap scan. */
1670 } 1670 }
1671 1671
1672 /* 1672 /*
1673 * Reschedule delayed caps release if we delayed anything, 1673 * Reschedule delayed caps release if we delayed anything,
1674 * otherwise cancel. 1674 * otherwise cancel.
1675 */ 1675 */
1676 if (delayed && is_delayed) 1676 if (delayed && is_delayed)
1677 force_requeue = 1; /* __send_cap delayed release; requeue */ 1677 force_requeue = 1; /* __send_cap delayed release; requeue */
1678 if (!delayed && !is_delayed) 1678 if (!delayed && !is_delayed)
1679 __cap_delay_cancel(mdsc, ci); 1679 __cap_delay_cancel(mdsc, ci);
1680 else if (!is_delayed || force_requeue) 1680 else if (!is_delayed || force_requeue)
1681 __cap_delay_requeue(mdsc, ci); 1681 __cap_delay_requeue(mdsc, ci);
1682 1682
1683 spin_unlock(&ci->i_ceph_lock); 1683 spin_unlock(&ci->i_ceph_lock);
1684 1684
1685 if (queue_invalidate) 1685 if (queue_invalidate)
1686 ceph_queue_invalidate(inode); 1686 ceph_queue_invalidate(inode);
1687 1687
1688 if (session) 1688 if (session)
1689 mutex_unlock(&session->s_mutex); 1689 mutex_unlock(&session->s_mutex);
1690 if (took_snap_rwsem) 1690 if (took_snap_rwsem)
1691 up_read(&mdsc->snap_rwsem); 1691 up_read(&mdsc->snap_rwsem);
1692 } 1692 }
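Both __ceph_flush_snaps() and ceph_check_caps() handle the session/inode lock ordering the same way: trylock the outer lock, and on failure drop the inner lock, acquire in canonical order, and re-validate via goto retry. Reduced to two plain mutexes (a pthreads sketch, not the kernel primitives):

	#include <pthread.h>

	/* Acquire 'outer' while already holding 'inner' without inverting the
	 * outer-before-inner lock order: try it, or back off and re-acquire
	 * in the canonical order. The caller must then re-validate whatever
	 * it read under 'inner' -- that is the "goto retry" above. */
	static void lock_outer_holding_inner(pthread_mutex_t *outer,
					     pthread_mutex_t *inner)
	{
		if (pthread_mutex_trylock(outer) == 0)
			return;			/* fast path: no contention */
		pthread_mutex_unlock(inner);
		pthread_mutex_lock(outer);
		pthread_mutex_lock(inner);
	}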
1693 1693
1694 /* 1694 /*
1695 * Try to flush dirty caps back to the auth mds. 1695 * Try to flush dirty caps back to the auth mds.
1696 */ 1696 */
1697 static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, 1697 static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1698 unsigned *flush_tid) 1698 unsigned *flush_tid)
1699 { 1699 {
1700 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 1700 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
1701 struct ceph_inode_info *ci = ceph_inode(inode); 1701 struct ceph_inode_info *ci = ceph_inode(inode);
1702 int unlock_session = session ? 0 : 1; 1702 int unlock_session = session ? 0 : 1;
1703 int flushing = 0; 1703 int flushing = 0;
1704 1704
1705 retry: 1705 retry:
1706 spin_lock(&ci->i_ceph_lock); 1706 spin_lock(&ci->i_ceph_lock);
1707 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { 1707 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1708 dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); 1708 dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
1709 goto out; 1709 goto out;
1710 } 1710 }
1711 if (ci->i_dirty_caps && ci->i_auth_cap) { 1711 if (ci->i_dirty_caps && ci->i_auth_cap) {
1712 struct ceph_cap *cap = ci->i_auth_cap; 1712 struct ceph_cap *cap = ci->i_auth_cap;
1713 int used = __ceph_caps_used(ci); 1713 int used = __ceph_caps_used(ci);
1714 int want = __ceph_caps_wanted(ci); 1714 int want = __ceph_caps_wanted(ci);
1715 int delayed; 1715 int delayed;
1716 1716
1717 if (!session) { 1717 if (!session) {
1718 spin_unlock(&ci->i_ceph_lock); 1718 spin_unlock(&ci->i_ceph_lock);
1719 session = cap->session; 1719 session = cap->session;
1720 mutex_lock(&session->s_mutex); 1720 mutex_lock(&session->s_mutex);
1721 goto retry; 1721 goto retry;
1722 } 1722 }
1723 BUG_ON(session != cap->session); 1723 BUG_ON(session != cap->session);
1724 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) 1724 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
1725 goto out; 1725 goto out;
1726 1726
1727 flushing = __mark_caps_flushing(inode, session); 1727 flushing = __mark_caps_flushing(inode, session);
1728 1728
1729 /* __send_cap drops i_ceph_lock */ 1729 /* __send_cap drops i_ceph_lock */
1730 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, 1730 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
1731 cap->issued | cap->implemented, flushing, 1731 cap->issued | cap->implemented, flushing,
1732 flush_tid); 1732 flush_tid);
1733 if (!delayed) 1733 if (!delayed)
1734 goto out_unlocked; 1734 goto out_unlocked;
1735 1735
1736 spin_lock(&ci->i_ceph_lock); 1736 spin_lock(&ci->i_ceph_lock);
1737 __cap_delay_requeue(mdsc, ci); 1737 __cap_delay_requeue(mdsc, ci);
1738 } 1738 }
1739 out: 1739 out:
1740 spin_unlock(&ci->i_ceph_lock); 1740 spin_unlock(&ci->i_ceph_lock);
1741 out_unlocked: 1741 out_unlocked:
1742 if (session && unlock_session) 1742 if (session && unlock_session)
1743 mutex_unlock(&session->s_mutex); 1743 mutex_unlock(&session->s_mutex);
1744 return flushing; 1744 return flushing;
1745 } 1745 }
1746 1746
1747 /* 1747 /*
1748 * Return true if we've flushed caps through the given flush_tid. 1748 * Return true if we've flushed caps through the given flush_tid.
1749 */ 1749 */
1750 static int caps_are_flushed(struct inode *inode, unsigned tid) 1750 static int caps_are_flushed(struct inode *inode, unsigned tid)
1751 { 1751 {
1752 struct ceph_inode_info *ci = ceph_inode(inode); 1752 struct ceph_inode_info *ci = ceph_inode(inode);
1753 int i, ret = 1; 1753 int i, ret = 1;
1754 1754
1755 spin_lock(&ci->i_ceph_lock); 1755 spin_lock(&ci->i_ceph_lock);
1756 for (i = 0; i < CEPH_CAP_BITS; i++) 1756 for (i = 0; i < CEPH_CAP_BITS; i++)
1757 if ((ci->i_flushing_caps & (1 << i)) && 1757 if ((ci->i_flushing_caps & (1 << i)) &&
1758 ci->i_cap_flush_tid[i] <= tid) { 1758 ci->i_cap_flush_tid[i] <= tid) {
1759 /* still flushing this bit */ 1759 /* still flushing this bit */
1760 ret = 0; 1760 ret = 0;
1761 break; 1761 break;
1762 } 1762 }
1763 spin_unlock(&ci->i_ceph_lock); 1763 spin_unlock(&ci->i_ceph_lock);
1764 return ret; 1764 return ret;
1765 } 1765 }
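The scan in caps_are_flushed() is worth pulling out: each dirty-cap bit records the tid of the flush currently covering it, and a flush "through tid" is complete only when no still-flushing bit carries a tid at or below it. A minimal user-space sketch of the same bit-scan (constants and types here are illustrative, not the kernel's):

#include <stdio.h>

#define CAP_BITS 16

struct inode_model {
    unsigned flushing_caps;               /* bitmask of caps still flushing */
    unsigned long long cap_flush_tid[CAP_BITS]; /* tid covering each bit */
};

static int caps_are_flushed_model(const struct inode_model *ci,
                                  unsigned long long tid)
{
    for (int i = 0; i < CAP_BITS; i++)
        if ((ci->flushing_caps & (1u << i)) && ci->cap_flush_tid[i] <= tid)
            return 0;   /* a flush at or before tid is still in flight */
    return 1;
}

int main(void)
{
    struct inode_model ci = { .flushing_caps = 1u << 2 };
    ci.cap_flush_tid[2] = 7;   /* bit 2 is flushing under tid 7 */
    printf("through tid 6? %d\n", caps_are_flushed_model(&ci, 6)); /* 1 */
    printf("through tid 7? %d\n", caps_are_flushed_model(&ci, 7)); /* 0 */
    return 0;
}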
1766 1766
1767 /* 1767 /*
1768 * Wait on any unsafe replies for the given inode. First wait on the 1768 * Wait on any unsafe replies for the given inode. First wait on the
1769 * newest request, and make that the upper bound. Then, if there are 1769 * newest request, and make that the upper bound. Then, if there are
1770 * more requests, keep waiting on the oldest as long as it is still older 1770 * more requests, keep waiting on the oldest as long as it is still older
1771 * than that upper bound. 1771 * than that upper bound.
1772 */ 1772 */
1773 static void sync_write_wait(struct inode *inode) 1773 static void sync_write_wait(struct inode *inode)
1774 { 1774 {
1775 struct ceph_inode_info *ci = ceph_inode(inode); 1775 struct ceph_inode_info *ci = ceph_inode(inode);
1776 struct list_head *head = &ci->i_unsafe_writes; 1776 struct list_head *head = &ci->i_unsafe_writes;
1777 struct ceph_osd_request *req; 1777 struct ceph_osd_request *req;
1778 u64 last_tid; 1778 u64 last_tid;
1779 1779
1780 spin_lock(&ci->i_unsafe_lock); 1780 spin_lock(&ci->i_unsafe_lock);
1781 if (list_empty(head)) 1781 if (list_empty(head))
1782 goto out; 1782 goto out;
1783 1783
1784 /* set upper bound as _last_ entry in chain */ 1784 /* set upper bound as _last_ entry in chain */
1785 req = list_entry(head->prev, struct ceph_osd_request, 1785 req = list_entry(head->prev, struct ceph_osd_request,
1786 r_unsafe_item); 1786 r_unsafe_item);
1787 last_tid = req->r_tid; 1787 last_tid = req->r_tid;
1788 1788
1789 do { 1789 do {
1790 ceph_osdc_get_request(req); 1790 ceph_osdc_get_request(req);
1791 spin_unlock(&ci->i_unsafe_lock); 1791 spin_unlock(&ci->i_unsafe_lock);
1792 dout("sync_write_wait on tid %llu (until %llu)\n", 1792 dout("sync_write_wait on tid %llu (until %llu)\n",
1793 req->r_tid, last_tid); 1793 req->r_tid, last_tid);
1794 wait_for_completion(&req->r_safe_completion); 1794 wait_for_completion(&req->r_safe_completion);
1795 spin_lock(&ci->i_unsafe_lock); 1795 spin_lock(&ci->i_unsafe_lock);
1796 ceph_osdc_put_request(req); 1796 ceph_osdc_put_request(req);
1797 1797
1798 /* 1798 /*
1799 * from here on look at first entry in chain, since we 1799 * from here on look at first entry in chain, since we
1800 * only want to wait for anything older than last_tid 1800 * only want to wait for anything older than last_tid
1801 */ 1801 */
1802 if (list_empty(head)) 1802 if (list_empty(head))
1803 break; 1803 break;
1804 req = list_entry(head->next, struct ceph_osd_request, 1804 req = list_entry(head->next, struct ceph_osd_request,
1805 r_unsafe_item); 1805 r_unsafe_item);
1806 } while (req->r_tid < last_tid); 1806 } while (req->r_tid < last_tid);
1807 out: 1807 out:
1808 spin_unlock(&ci->i_unsafe_lock); 1808 spin_unlock(&ci->i_unsafe_lock);
1809 } 1809 }
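The loop in sync_write_wait() bounds the wait deliberately: it latches the newest unsafe tid as an upper bound, waits on that request first, then sweeps the list oldest-first but only while the head is still older than the bound, so writes issued after this point are never waited on. A hedged user-space model of the ordering (wait_done() stands in for wait_for_completion(); the list is a plain sorted array):

#include <stdio.h>

static void wait_done(unsigned long long tid)
{
    /* stands in for wait_for_completion(&req->r_safe_completion) */
    printf("waiting on tid %llu\n", tid);
}

static void sync_write_wait_model(const unsigned long long *tids, int n)
{
    if (n == 0)
        return;
    unsigned long long last_tid = tids[n - 1];  /* newest = upper bound */
    wait_done(last_tid);                        /* wait on newest first */
    /* then the older entries, oldest-first, never past the bound */
    for (int i = 0; i + 1 < n && tids[i] < last_tid; i++)
        wait_done(tids[i]);
}

int main(void)
{
    unsigned long long tids[] = { 3, 5, 9 };
    sync_write_wait_model(tids, 3);
    return 0;
}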
1810 1810
1811 int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) 1811 int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
1812 { 1812 {
1813 struct inode *inode = file->f_mapping->host; 1813 struct inode *inode = file->f_mapping->host;
1814 struct ceph_inode_info *ci = ceph_inode(inode); 1814 struct ceph_inode_info *ci = ceph_inode(inode);
1815 unsigned flush_tid; 1815 unsigned flush_tid;
1816 int ret; 1816 int ret;
1817 int dirty; 1817 int dirty;
1818 1818
1819 dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); 1819 dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
1820 sync_write_wait(inode); 1820 sync_write_wait(inode);
1821 1821
1822 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 1822 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
1823 if (ret < 0) 1823 if (ret < 0)
1824 return ret; 1824 return ret;
1825 mutex_lock(&inode->i_mutex); 1825 mutex_lock(&inode->i_mutex);
1826 1826
1827 dirty = try_flush_caps(inode, NULL, &flush_tid); 1827 dirty = try_flush_caps(inode, NULL, &flush_tid);
1828 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); 1828 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
1829 1829
1830 /* 1830 /*
1831 * only wait on non-file metadata writeback (the mds 1831 * only wait on non-file metadata writeback (the mds
1832 * can recover size and mtime, so we don't need to 1832 * can recover size and mtime, so we don't need to
1833 * wait for that) 1833 * wait for that)
1834 */ 1834 */
1835 if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { 1835 if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
1836 dout("fsync waiting for flush_tid %u\n", flush_tid); 1836 dout("fsync waiting for flush_tid %u\n", flush_tid);
1837 ret = wait_event_interruptible(ci->i_cap_wq, 1837 ret = wait_event_interruptible(ci->i_cap_wq,
1838 caps_are_flushed(inode, flush_tid)); 1838 caps_are_flushed(inode, flush_tid));
1839 } 1839 }
1840 1840
1841 dout("fsync %p%s done\n", inode, datasync ? " datasync" : ""); 1841 dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
1842 mutex_unlock(&inode->i_mutex); 1842 mutex_unlock(&inode->i_mutex);
1843 return ret; 1843 return ret;
1844 } 1844 }
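Whether ceph_fsync() must block on the MDS reduces to one mask test: with datasync set, or with only file-data caps dirty, the MDS can recover size and mtime on its own and no wait is needed; only dirty non-file metadata forces a wait on the flush_tid. A small sketch of that test, with illustrative bit values rather than the kernel's CEPH_CAP_* constants:

#include <stdio.h>

#define CAP_FILE_WR      0x1
#define CAP_FILE_BUFFER  0x2
#define CAP_ANY_FILE_WR  (CAP_FILE_WR | CAP_FILE_BUFFER)
#define CAP_AUTH_EXCL    0x4   /* e.g. mode/owner metadata */

/* Mirrors the fsync test above: wait only for non-file metadata. */
static int fsync_must_wait(int dirty, int datasync)
{
    return !datasync && (dirty & ~CAP_ANY_FILE_WR) != 0;
}

int main(void)
{
    printf("%d\n", fsync_must_wait(CAP_FILE_WR, 0));                 /* 0 */
    printf("%d\n", fsync_must_wait(CAP_FILE_WR | CAP_AUTH_EXCL, 0)); /* 1 */
    printf("%d\n", fsync_must_wait(CAP_AUTH_EXCL, 1));               /* 0 */
    return 0;
}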
1845 1845
1846 /* 1846 /*
1847 * Flush any dirty caps back to the mds. If we aren't asked to wait, 1847 * Flush any dirty caps back to the mds. If we aren't asked to wait,
1848 * queue inode for flush but don't do so immediately, because we can 1848 * queue inode for flush but don't do so immediately, because we can
1849 * get by with fewer MDS messages if we wait for data writeback to 1849 * get by with fewer MDS messages if we wait for data writeback to
1850 * complete first. 1850 * complete first.
1851 */ 1851 */
1852 int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) 1852 int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1853 { 1853 {
1854 struct ceph_inode_info *ci = ceph_inode(inode); 1854 struct ceph_inode_info *ci = ceph_inode(inode);
1855 unsigned flush_tid; 1855 unsigned flush_tid;
1856 int err = 0; 1856 int err = 0;
1857 int dirty; 1857 int dirty;
1858 int wait = wbc->sync_mode == WB_SYNC_ALL; 1858 int wait = wbc->sync_mode == WB_SYNC_ALL;
1859 1859
1860 dout("write_inode %p wait=%d\n", inode, wait); 1860 dout("write_inode %p wait=%d\n", inode, wait);
1861 if (wait) { 1861 if (wait) {
1862 dirty = try_flush_caps(inode, NULL, &flush_tid); 1862 dirty = try_flush_caps(inode, NULL, &flush_tid);
1863 if (dirty) 1863 if (dirty)
1864 err = wait_event_interruptible(ci->i_cap_wq, 1864 err = wait_event_interruptible(ci->i_cap_wq,
1865 caps_are_flushed(inode, flush_tid)); 1865 caps_are_flushed(inode, flush_tid));
1866 } else { 1866 } else {
1867 struct ceph_mds_client *mdsc = 1867 struct ceph_mds_client *mdsc =
1868 ceph_sb_to_client(inode->i_sb)->mdsc; 1868 ceph_sb_to_client(inode->i_sb)->mdsc;
1869 1869
1870 spin_lock(&ci->i_ceph_lock); 1870 spin_lock(&ci->i_ceph_lock);
1871 if (__ceph_caps_dirty(ci)) 1871 if (__ceph_caps_dirty(ci))
1872 __cap_delay_requeue_front(mdsc, ci); 1872 __cap_delay_requeue_front(mdsc, ci);
1873 spin_unlock(&ci->i_ceph_lock); 1873 spin_unlock(&ci->i_ceph_lock);
1874 } 1874 }
1875 return err; 1875 return err;
1876 } 1876 }
1877 1877
1878 /* 1878 /*
1879 * After a recovering MDS goes active, we need to resend any caps 1879 * After a recovering MDS goes active, we need to resend any caps
1880 * we were flushing. 1880 * we were flushing.
1881 * 1881 *
1882 * Caller holds session->s_mutex. 1882 * Caller holds session->s_mutex.
1883 */ 1883 */
1884 static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, 1884 static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
1885 struct ceph_mds_session *session) 1885 struct ceph_mds_session *session)
1886 { 1886 {
1887 struct ceph_cap_snap *capsnap; 1887 struct ceph_cap_snap *capsnap;
1888 1888
1889 dout("kick_flushing_capsnaps mds%d\n", session->s_mds); 1889 dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
1890 list_for_each_entry(capsnap, &session->s_cap_snaps_flushing, 1890 list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
1891 flushing_item) { 1891 flushing_item) {
1892 struct ceph_inode_info *ci = capsnap->ci; 1892 struct ceph_inode_info *ci = capsnap->ci;
1893 struct inode *inode = &ci->vfs_inode; 1893 struct inode *inode = &ci->vfs_inode;
1894 struct ceph_cap *cap; 1894 struct ceph_cap *cap;
1895 1895
1896 spin_lock(&ci->i_ceph_lock); 1896 spin_lock(&ci->i_ceph_lock);
1897 cap = ci->i_auth_cap; 1897 cap = ci->i_auth_cap;
1898 if (cap && cap->session == session) { 1898 if (cap && cap->session == session) {
1899 dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, 1899 dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
1900 cap, capsnap); 1900 cap, capsnap);
1901 __ceph_flush_snaps(ci, &session, 1); 1901 __ceph_flush_snaps(ci, &session, 1);
1902 } else { 1902 } else {
1903 pr_err("%p auth cap %p not mds%d ???\n", inode, 1903 pr_err("%p auth cap %p not mds%d ???\n", inode,
1904 cap, session->s_mds); 1904 cap, session->s_mds);
1905 } 1905 }
1906 spin_unlock(&ci->i_ceph_lock); 1906 spin_unlock(&ci->i_ceph_lock);
1907 } 1907 }
1908 } 1908 }
1909 1909
1910 void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, 1910 void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1911 struct ceph_mds_session *session) 1911 struct ceph_mds_session *session)
1912 { 1912 {
1913 struct ceph_inode_info *ci; 1913 struct ceph_inode_info *ci;
1914 1914
1915 kick_flushing_capsnaps(mdsc, session); 1915 kick_flushing_capsnaps(mdsc, session);
1916 1916
1917 dout("kick_flushing_caps mds%d\n", session->s_mds); 1917 dout("kick_flushing_caps mds%d\n", session->s_mds);
1918 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { 1918 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
1919 struct inode *inode = &ci->vfs_inode; 1919 struct inode *inode = &ci->vfs_inode;
1920 struct ceph_cap *cap; 1920 struct ceph_cap *cap;
1921 int delayed = 0; 1921 int delayed = 0;
1922 1922
1923 spin_lock(&ci->i_ceph_lock); 1923 spin_lock(&ci->i_ceph_lock);
1924 cap = ci->i_auth_cap; 1924 cap = ci->i_auth_cap;
1925 if (cap && cap->session == session) { 1925 if (cap && cap->session == session) {
1926 dout("kick_flushing_caps %p cap %p %s\n", inode, 1926 dout("kick_flushing_caps %p cap %p %s\n", inode,
1927 cap, ceph_cap_string(ci->i_flushing_caps)); 1927 cap, ceph_cap_string(ci->i_flushing_caps));
1928 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, 1928 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1929 __ceph_caps_used(ci), 1929 __ceph_caps_used(ci),
1930 __ceph_caps_wanted(ci), 1930 __ceph_caps_wanted(ci),
1931 cap->issued | cap->implemented, 1931 cap->issued | cap->implemented,
1932 ci->i_flushing_caps, NULL); 1932 ci->i_flushing_caps, NULL);
1933 if (delayed) { 1933 if (delayed) {
1934 spin_lock(&ci->i_ceph_lock); 1934 spin_lock(&ci->i_ceph_lock);
1935 __cap_delay_requeue(mdsc, ci); 1935 __cap_delay_requeue(mdsc, ci);
1936 spin_unlock(&ci->i_ceph_lock); 1936 spin_unlock(&ci->i_ceph_lock);
1937 } 1937 }
1938 } else { 1938 } else {
1939 pr_err("%p auth cap %p not mds%d ???\n", inode, 1939 pr_err("%p auth cap %p not mds%d ???\n", inode,
1940 cap, session->s_mds); 1940 cap, session->s_mds);
1941 spin_unlock(&ci->i_ceph_lock); 1941 spin_unlock(&ci->i_ceph_lock);
1942 } 1942 }
1943 } 1943 }
1944 } 1944 }
1945 1945
1946 static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, 1946 static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
1947 struct ceph_mds_session *session, 1947 struct ceph_mds_session *session,
1948 struct inode *inode) 1948 struct inode *inode)
1949 { 1949 {
1950 struct ceph_inode_info *ci = ceph_inode(inode); 1950 struct ceph_inode_info *ci = ceph_inode(inode);
1951 struct ceph_cap *cap; 1951 struct ceph_cap *cap;
1952 int delayed = 0; 1952 int delayed = 0;
1953 1953
1954 spin_lock(&ci->i_ceph_lock); 1954 spin_lock(&ci->i_ceph_lock);
1955 cap = ci->i_auth_cap; 1955 cap = ci->i_auth_cap;
1956 dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, 1956 dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
1957 ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); 1957 ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
1958 __ceph_flush_snaps(ci, &session, 1); 1958 __ceph_flush_snaps(ci, &session, 1);
1959 if (ci->i_flushing_caps) { 1959 if (ci->i_flushing_caps) {
1960 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, 1960 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1961 __ceph_caps_used(ci), 1961 __ceph_caps_used(ci),
1962 __ceph_caps_wanted(ci), 1962 __ceph_caps_wanted(ci),
1963 cap->issued | cap->implemented, 1963 cap->issued | cap->implemented,
1964 ci->i_flushing_caps, NULL); 1964 ci->i_flushing_caps, NULL);
1965 if (delayed) { 1965 if (delayed) {
1966 spin_lock(&ci->i_ceph_lock); 1966 spin_lock(&ci->i_ceph_lock);
1967 __cap_delay_requeue(mdsc, ci); 1967 __cap_delay_requeue(mdsc, ci);
1968 spin_unlock(&ci->i_ceph_lock); 1968 spin_unlock(&ci->i_ceph_lock);
1969 } 1969 }
1970 } else { 1970 } else {
1971 spin_unlock(&ci->i_ceph_lock); 1971 spin_unlock(&ci->i_ceph_lock);
1972 } 1972 }
1973 } 1973 }
1974 1974
1975 1975
1976 /* 1976 /*
1977 * Take references to capabilities we hold, so that we don't release 1977 * Take references to capabilities we hold, so that we don't release
1978 * them to the MDS prematurely. 1978 * them to the MDS prematurely.
1979 * 1979 *
1980 * Protected by i_ceph_lock. 1980 * Protected by i_ceph_lock.
1981 */ 1981 */
1982 static void __take_cap_refs(struct ceph_inode_info *ci, int got) 1982 static void __take_cap_refs(struct ceph_inode_info *ci, int got)
1983 { 1983 {
1984 if (got & CEPH_CAP_PIN) 1984 if (got & CEPH_CAP_PIN)
1985 ci->i_pin_ref++; 1985 ci->i_pin_ref++;
1986 if (got & CEPH_CAP_FILE_RD) 1986 if (got & CEPH_CAP_FILE_RD)
1987 ci->i_rd_ref++; 1987 ci->i_rd_ref++;
1988 if (got & CEPH_CAP_FILE_CACHE) 1988 if (got & CEPH_CAP_FILE_CACHE)
1989 ci->i_rdcache_ref++; 1989 ci->i_rdcache_ref++;
1990 if (got & CEPH_CAP_FILE_WR) 1990 if (got & CEPH_CAP_FILE_WR)
1991 ci->i_wr_ref++; 1991 ci->i_wr_ref++;
1992 if (got & CEPH_CAP_FILE_BUFFER) { 1992 if (got & CEPH_CAP_FILE_BUFFER) {
1993 if (ci->i_wb_ref == 0) 1993 if (ci->i_wb_ref == 0)
1994 ihold(&ci->vfs_inode); 1994 ihold(&ci->vfs_inode);
1995 ci->i_wb_ref++; 1995 ci->i_wb_ref++;
1996 dout("__take_cap_refs %p wb %d -> %d (?)\n", 1996 dout("__take_cap_refs %p wb %d -> %d (?)\n",
1997 &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref); 1997 &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
1998 } 1998 }
1999 } 1999 }
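The one subtlety in __take_cap_refs() is the FILE_BUFFER case: the 0 -> 1 transition of the writeback refcount also pins the inode via ihold(), so it cannot be evicted while buffered writeback is outstanding. A minimal model of the counters (bit values illustrative, inode_pinned stands in for ihold()/iput()):

#include <stdio.h>

#define CAP_PIN         0x01
#define CAP_FILE_RD     0x02
#define CAP_FILE_CACHE  0x04
#define CAP_FILE_WR     0x08
#define CAP_FILE_BUFFER 0x10

struct ci_model {
    int pin_ref, rd_ref, rdcache_ref, wr_ref, wb_ref;
    int inode_pinned;
};

static void take_cap_refs(struct ci_model *ci, int got)
{
    if (got & CAP_PIN)         ci->pin_ref++;
    if (got & CAP_FILE_RD)     ci->rd_ref++;
    if (got & CAP_FILE_CACHE)  ci->rdcache_ref++;
    if (got & CAP_FILE_WR)     ci->wr_ref++;
    if (got & CAP_FILE_BUFFER) {
        if (ci->wb_ref == 0)
            ci->inode_pinned = 1;   /* ihold() on the 0 -> 1 edge */
        ci->wb_ref++;
    }
}

int main(void)
{
    struct ci_model ci = {0};
    take_cap_refs(&ci, CAP_FILE_WR | CAP_FILE_BUFFER);
    printf("wb_ref=%d pinned=%d\n", ci.wb_ref, ci.inode_pinned); /* 1 1 */
    return 0;
}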
2000 2000
2001 /* 2001 /*
2002 * Try to grab cap references. Specify those refs we @want, and the 2002 * Try to grab cap references. Specify those refs we @want, and the
2003 * minimal set we @need. Also include the larger offset we are writing 2003 * minimal set we @need. Also include the larger offset we are writing
2004 * to (when applicable), and check against max_size here as well. 2004 * to (when applicable), and check against max_size here as well.
2005 * Note that caller is responsible for ensuring max_size increases are 2005 * Note that caller is responsible for ensuring max_size increases are
2006 * requested from the MDS. 2006 * requested from the MDS.
2007 */ 2007 */
2008 static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, 2008 static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
2009 int *got, loff_t endoff, int *check_max, int *err) 2009 int *got, loff_t endoff, int *check_max, int *err)
2010 { 2010 {
2011 struct inode *inode = &ci->vfs_inode; 2011 struct inode *inode = &ci->vfs_inode;
2012 int ret = 0; 2012 int ret = 0;
2013 int have, implemented; 2013 int have, implemented;
2014 int file_wanted; 2014 int file_wanted;
2015 2015
2016 dout("get_cap_refs %p need %s want %s\n", inode, 2016 dout("get_cap_refs %p need %s want %s\n", inode,
2017 ceph_cap_string(need), ceph_cap_string(want)); 2017 ceph_cap_string(need), ceph_cap_string(want));
2018 spin_lock(&ci->i_ceph_lock); 2018 spin_lock(&ci->i_ceph_lock);
2019 2019
2020 /* make sure file is actually open */ 2020 /* make sure file is actually open */
2021 file_wanted = __ceph_caps_file_wanted(ci); 2021 file_wanted = __ceph_caps_file_wanted(ci);
2022 if ((file_wanted & need) == 0) { 2022 if ((file_wanted & need) == 0) {
2023 dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", 2023 dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
2024 ceph_cap_string(need), ceph_cap_string(file_wanted)); 2024 ceph_cap_string(need), ceph_cap_string(file_wanted));
2025 *err = -EBADF; 2025 *err = -EBADF;
2026 ret = 1; 2026 ret = 1;
2027 goto out; 2027 goto out;
2028 } 2028 }
2029 2029
2030 if (need & CEPH_CAP_FILE_WR) { 2030 if (need & CEPH_CAP_FILE_WR) {
2031 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { 2031 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
2032 dout("get_cap_refs %p endoff %llu > maxsize %llu\n", 2032 dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
2033 inode, endoff, ci->i_max_size); 2033 inode, endoff, ci->i_max_size);
2034 if (endoff > ci->i_wanted_max_size) { 2034 if (endoff > ci->i_wanted_max_size) {
2035 *check_max = 1; 2035 *check_max = 1;
2036 ret = 1; 2036 ret = 1;
2037 } 2037 }
2038 goto out; 2038 goto out;
2039 } 2039 }
2040 /* 2040 /*
2041 * If a sync write is in progress, we must wait, so that we 2041 * If a sync write is in progress, we must wait, so that we
2042 * can get a final snapshot value for size+mtime. 2042 * can get a final snapshot value for size+mtime.
2043 */ 2043 */
2044 if (__ceph_have_pending_cap_snap(ci)) { 2044 if (__ceph_have_pending_cap_snap(ci)) {
2045 dout("get_cap_refs %p cap_snap_pending\n", inode); 2045 dout("get_cap_refs %p cap_snap_pending\n", inode);
2046 goto out; 2046 goto out;
2047 } 2047 }
2048 } 2048 }
2049 have = __ceph_caps_issued(ci, &implemented); 2049 have = __ceph_caps_issued(ci, &implemented);
2050 2050
2051 /* 2051 /*
2052 * disallow writes while a truncate is pending 2052 * disallow writes while a truncate is pending
2053 */ 2053 */
2054 if (ci->i_truncate_pending) 2054 if (ci->i_truncate_pending)
2055 have &= ~CEPH_CAP_FILE_WR; 2055 have &= ~CEPH_CAP_FILE_WR;
2056 2056
2057 if ((have & need) == need) { 2057 if ((have & need) == need) {
2058 /* 2058 /*
2059 * Look at (implemented & ~have & not) so that we keep waiting 2059 * Look at (implemented & ~have & not) so that we keep waiting
2060 * on transition from wanted -> needed caps. This is needed 2060 * on transition from wanted -> needed caps. This is needed
2061 * for WRBUFFER|WR -> WR to avoid a new WR sync write from 2061 * for WRBUFFER|WR -> WR to avoid a new WR sync write from
2062 * going before a prior buffered writeback happens. 2062 * going before a prior buffered writeback happens.
2063 */ 2063 */
2064 int not = want & ~(have & need); 2064 int not = want & ~(have & need);
2065 int revoking = implemented & ~have; 2065 int revoking = implemented & ~have;
2066 dout("get_cap_refs %p have %s but not %s (revoking %s)\n", 2066 dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
2067 inode, ceph_cap_string(have), ceph_cap_string(not), 2067 inode, ceph_cap_string(have), ceph_cap_string(not),
2068 ceph_cap_string(revoking)); 2068 ceph_cap_string(revoking));
2069 if ((revoking & not) == 0) { 2069 if ((revoking & not) == 0) {
2070 *got = need | (have & want); 2070 *got = need | (have & want);
2071 __take_cap_refs(ci, *got); 2071 __take_cap_refs(ci, *got);
2072 ret = 1; 2072 ret = 1;
2073 } 2073 }
2074 } else { 2074 } else {
2075 dout("get_cap_refs %p have %s needed %s\n", inode, 2075 dout("get_cap_refs %p have %s needed %s\n", inode,
2076 ceph_cap_string(have), ceph_cap_string(need)); 2076 ceph_cap_string(have), ceph_cap_string(need));
2077 } 2077 }
2078 out: 2078 out:
2079 spin_unlock(&ci->i_ceph_lock); 2079 spin_unlock(&ci->i_ceph_lock);
2080 dout("get_cap_refs %p ret %d got %s\n", inode, 2080 dout("get_cap_refs %p ret %d got %s\n", inode,
2081 ret, ceph_cap_string(*got)); 2081 ret, ceph_cap_string(*got));
2082 return ret; 2082 return ret;
2083 } 2083 }
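The grant decision in try_get_cap_refs() is pure bit arithmetic: the request succeeds when every needed bit is issued and none of the extra wanted bits is mid-revocation. Here not = want & ~(have & need) is the wanted-but-not-guaranteed set, revoking = implemented & ~have the in-flight revocations, and the refs actually taken are need | (have & want). A worked sketch with illustrative bit names:

#include <stdio.h>

#define CAP_FILE_RD     0x1
#define CAP_FILE_WR     0x2
#define CAP_FILE_BUFFER 0x4

/* Mirrors the core test above; returns the granted set in *got,
 * or 0 if the caller must keep waiting. Illustrative only. */
static int cap_grant_model(int have, int implemented,
                           int need, int want, int *got)
{
    if ((have & need) != need)
        return 0;                       /* missing a needed bit */
    int not = want & ~(have & need);    /* wanted but not pinned */
    int revoking = implemented & ~have; /* being clawed back */
    if (revoking & not)
        return 0;                       /* would race a revocation */
    *got = need | (have & want);
    return 1;
}

int main(void)
{
    int got = 0;
    /* BUFFER is implemented but no longer issued: a revocation is in
     * flight, so a writer wanting BUFFER must wait (the WRBUFFER|WR
     * -> WR case the comment above describes). */
    int ok = cap_grant_model(CAP_FILE_WR,
                             CAP_FILE_WR | CAP_FILE_BUFFER,
                             CAP_FILE_WR,
                             CAP_FILE_WR | CAP_FILE_BUFFER, &got);
    printf("ok=%d got=%#x\n", ok, got);  /* ok=0 */
    return 0;
}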
2084 2084
2085 /* 2085 /*
2086 * Check the offset we are writing up to against our current 2086 * Check the offset we are writing up to against our current
2087 * max_size. If necessary, tell the MDS we want to write to 2087 * max_size. If necessary, tell the MDS we want to write to
2088 * a larger offset. 2088 * a larger offset.
2089 */ 2089 */
2090 static void check_max_size(struct inode *inode, loff_t endoff) 2090 static void check_max_size(struct inode *inode, loff_t endoff)
2091 { 2091 {
2092 struct ceph_inode_info *ci = ceph_inode(inode); 2092 struct ceph_inode_info *ci = ceph_inode(inode);
2093 int check = 0; 2093 int check = 0;
2094 2094
2095 /* do we need to explicitly request a larger max_size? */ 2095 /* do we need to explicitly request a larger max_size? */
2096 spin_lock(&ci->i_ceph_lock); 2096 spin_lock(&ci->i_ceph_lock);
2097 if ((endoff >= ci->i_max_size || 2097 if ((endoff >= ci->i_max_size ||
2098 endoff > (inode->i_size << 1)) && 2098 endoff > (inode->i_size << 1)) &&
2099 endoff > ci->i_wanted_max_size) { 2099 endoff > ci->i_wanted_max_size) {
2100 dout("write %p at large endoff %llu, req max_size\n", 2100 dout("write %p at large endoff %llu, req max_size\n",
2101 inode, endoff); 2101 inode, endoff);
2102 ci->i_wanted_max_size = endoff; 2102 ci->i_wanted_max_size = endoff;
2103 check = 1; 2103 check = 1;
2104 } 2104 }
2105 spin_unlock(&ci->i_ceph_lock); 2105 spin_unlock(&ci->i_ceph_lock);
2106 if (check) 2106 if (check)
2107 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); 2107 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2108 } 2108 }
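check_max_size() fires a request when the write endpoint reaches the current limit or more than doubles i_size, and only if we have not already asked that high. The trigger condition in isolation, as a sketch:

#include <stdio.h>

static int need_max_size_request(long long endoff, long long max_size,
                                 long long i_size, long long wanted_max)
{
    return (endoff >= max_size || endoff > (i_size << 1)) &&
           endoff > wanted_max;
}

int main(void)
{
    /* 4 MiB limit, 1 MiB file: a write ending at 5 MiB needs a request */
    printf("%d\n", need_max_size_request(5 << 20, 4 << 20, 1 << 20, 0));
    return 0;
}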
2109 2109
2110 /* 2110 /*
2111 * Wait for caps, and take cap references. If we can't get a WR cap 2111 * Wait for caps, and take cap references. If we can't get a WR cap
2112 * due to a small max_size, make sure we check_max_size (and possibly 2112 * due to a small max_size, make sure we check_max_size (and possibly
2113 * ask the mds) so we don't get hung up indefinitely. 2113 * ask the mds) so we don't get hung up indefinitely.
2114 */ 2114 */
2115 int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got, 2115 int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
2116 loff_t endoff) 2116 loff_t endoff)
2117 { 2117 {
2118 int check_max, ret, err; 2118 int check_max, ret, err;
2119 2119
2120 retry: 2120 retry:
2121 if (endoff > 0) 2121 if (endoff > 0)
2122 check_max_size(&ci->vfs_inode, endoff); 2122 check_max_size(&ci->vfs_inode, endoff);
2123 check_max = 0; 2123 check_max = 0;
2124 err = 0; 2124 err = 0;
2125 ret = wait_event_interruptible(ci->i_cap_wq, 2125 ret = wait_event_interruptible(ci->i_cap_wq,
2126 try_get_cap_refs(ci, need, want, 2126 try_get_cap_refs(ci, need, want,
2127 got, endoff, 2127 got, endoff,
2128 &check_max, &err)); 2128 &check_max, &err));
2129 if (err) 2129 if (err)
2130 ret = err; 2130 ret = err;
2131 if (check_max) 2131 if (check_max)
2132 goto retry; 2132 goto retry;
2133 return ret; 2133 return ret;
2134 } 2134 }
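ceph_get_caps() is a wait-with-side-channel loop: the wait predicate itself reports, through check_max and err, why it returned true, and the caller either propagates the error or re-runs the max_size check and waits again. A control-flow sketch under those assumptions (try_refs() is a hypothetical stand-in for try_get_cap_refs()):

#include <stdio.h>

static int try_refs(int pass, int *check_max, int *err)
{
    (void)err;
    if (pass == 0) {
        *check_max = 1;   /* endoff beyond wanted max_size: retry */
        return 1;         /* end the wait, but without caps */
    }
    return 1;             /* caps granted on the second pass */
}

int main(void)
{
    int pass = 0;
    for (;;) {
        int check_max = 0, err = 0;
        /* wait_event_interruptible(...) in the real code */
        while (!try_refs(pass, &check_max, &err))
            ;
        if (err)
            return err;
        if (!check_max)
            break;        /* got the refs */
        pass++;           /* ask the MDS for a larger max_size, retry */
    }
    printf("got caps after %d pass(es) of retry\n", pass);
    return 0;
}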
2135 2135
2136 /* 2136 /*
2137 * Take cap refs. Caller must already know we hold at least one ref 2137 * Take cap refs. Caller must already know we hold at least one ref
2138 * on the caps in question or we don't know this is safe. 2138 * on the caps in question or we don't know this is safe.
2139 */ 2139 */
2140 void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) 2140 void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
2141 { 2141 {
2142 spin_lock(&ci->i_ceph_lock); 2142 spin_lock(&ci->i_ceph_lock);
2143 __take_cap_refs(ci, caps); 2143 __take_cap_refs(ci, caps);
2144 spin_unlock(&ci->i_ceph_lock); 2144 spin_unlock(&ci->i_ceph_lock);
2145 } 2145 }
2146 2146
2147 /* 2147 /*
2148 * Release cap refs. 2148 * Release cap refs.
2149 * 2149 *
2150 * If we released the last ref on any given cap, call ceph_check_caps 2150 * If we released the last ref on any given cap, call ceph_check_caps
2151 * to release (or schedule a release). 2151 * to release (or schedule a release).
2152 * 2152 *
2153 * If we are releasing a WR cap (from a sync write), finalize any affected 2153 * If we are releasing a WR cap (from a sync write), finalize any affected
2154 * cap_snap, and wake up any waiters. 2154 * cap_snap, and wake up any waiters.
2155 */ 2155 */
2156 void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) 2156 void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
2157 { 2157 {
2158 struct inode *inode = &ci->vfs_inode; 2158 struct inode *inode = &ci->vfs_inode;
2159 int last = 0, put = 0, flushsnaps = 0, wake = 0; 2159 int last = 0, put = 0, flushsnaps = 0, wake = 0;
2160 struct ceph_cap_snap *capsnap; 2160 struct ceph_cap_snap *capsnap;
2161 2161
2162 spin_lock(&ci->i_ceph_lock); 2162 spin_lock(&ci->i_ceph_lock);
2163 if (had & CEPH_CAP_PIN) 2163 if (had & CEPH_CAP_PIN)
2164 --ci->i_pin_ref; 2164 --ci->i_pin_ref;
2165 if (had & CEPH_CAP_FILE_RD) 2165 if (had & CEPH_CAP_FILE_RD)
2166 if (--ci->i_rd_ref == 0) 2166 if (--ci->i_rd_ref == 0)
2167 last++; 2167 last++;
2168 if (had & CEPH_CAP_FILE_CACHE) 2168 if (had & CEPH_CAP_FILE_CACHE)
2169 if (--ci->i_rdcache_ref == 0) 2169 if (--ci->i_rdcache_ref == 0)
2170 last++; 2170 last++;
2171 if (had & CEPH_CAP_FILE_BUFFER) { 2171 if (had & CEPH_CAP_FILE_BUFFER) {
2172 if (--ci->i_wb_ref == 0) { 2172 if (--ci->i_wb_ref == 0) {
2173 last++; 2173 last++;
2174 put++; 2174 put++;
2175 } 2175 }
2176 dout("put_cap_refs %p wb %d -> %d (?)\n", 2176 dout("put_cap_refs %p wb %d -> %d (?)\n",
2177 inode, ci->i_wb_ref+1, ci->i_wb_ref); 2177 inode, ci->i_wb_ref+1, ci->i_wb_ref);
2178 } 2178 }
2179 if (had & CEPH_CAP_FILE_WR) 2179 if (had & CEPH_CAP_FILE_WR)
2180 if (--ci->i_wr_ref == 0) { 2180 if (--ci->i_wr_ref == 0) {
2181 last++; 2181 last++;
2182 if (!list_empty(&ci->i_cap_snaps)) { 2182 if (!list_empty(&ci->i_cap_snaps)) {
2183 capsnap = list_first_entry(&ci->i_cap_snaps, 2183 capsnap = list_first_entry(&ci->i_cap_snaps,
2184 struct ceph_cap_snap, 2184 struct ceph_cap_snap,
2185 ci_item); 2185 ci_item);
2186 if (capsnap->writing) { 2186 if (capsnap->writing) {
2187 capsnap->writing = 0; 2187 capsnap->writing = 0;
2188 flushsnaps = 2188 flushsnaps =
2189 __ceph_finish_cap_snap(ci, 2189 __ceph_finish_cap_snap(ci,
2190 capsnap); 2190 capsnap);
2191 wake = 1; 2191 wake = 1;
2192 } 2192 }
2193 } 2193 }
2194 } 2194 }
2195 spin_unlock(&ci->i_ceph_lock); 2195 spin_unlock(&ci->i_ceph_lock);
2196 2196
2197 dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), 2197 dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
2198 last ? " last" : "", put ? " put" : ""); 2198 last ? " last" : "", put ? " put" : "");
2199 2199
2200 if (last && !flushsnaps) 2200 if (last && !flushsnaps)
2201 ceph_check_caps(ci, 0, NULL); 2201 ceph_check_caps(ci, 0, NULL);
2202 else if (flushsnaps) 2202 else if (flushsnaps)
2203 ceph_flush_snaps(ci); 2203 ceph_flush_snaps(ci);
2204 if (wake) 2204 if (wake)
2205 wake_up_all(&ci->i_cap_wq); 2205 wake_up_all(&ci->i_cap_wq);
2206 if (put) 2206 if (put)
2207 iput(inode); 2207 iput(inode);
2208 } 2208 }
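The drop side mirrors the take side: the 1 -> 0 edge on the FILE_BUFFER ref both counts as a "last" ref (so caps get re-checked or a cap_snap finalized) and releases the inode pin taken on the 0 -> 1 edge. A minimal counterpart to the earlier model, illustrative only:

#include <stdio.h>

#define CAP_FILE_BUFFER 0x10

struct ci_model { int wb_ref; int inode_pinned; };

static void put_cap_refs_model(struct ci_model *ci, int had,
                               int *last, int *put)
{
    *last = *put = 0;
    if ((had & CAP_FILE_BUFFER) && --ci->wb_ref == 0) {
        (*last)++;          /* caller will run ceph_check_caps() */
        (*put)++;           /* caller will iput() the pinned inode */
        ci->inode_pinned = 0;
    }
}

int main(void)
{
    struct ci_model ci = { .wb_ref = 1, .inode_pinned = 1 };
    int last, put;
    put_cap_refs_model(&ci, CAP_FILE_BUFFER, &last, &put);
    printf("last=%d put=%d pinned=%d\n", last, put, ci.inode_pinned);
    return 0;
}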
2209 2209
2210 /* 2210 /*
2211 * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap 2211 * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
2212 * context. Adjust per-snap dirty page accounting as appropriate. 2212 * context. Adjust per-snap dirty page accounting as appropriate.
2213 * Once all dirty data for a cap_snap is flushed, flush snapped file 2213 * Once all dirty data for a cap_snap is flushed, flush snapped file
2214 * metadata back to the MDS. If we dropped the last ref, call 2214 * metadata back to the MDS. If we dropped the last ref, call
2215 * ceph_check_caps. 2215 * ceph_check_caps.
2216 */ 2216 */
2217 void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, 2217 void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2218 struct ceph_snap_context *snapc) 2218 struct ceph_snap_context *snapc)
2219 { 2219 {
2220 struct inode *inode = &ci->vfs_inode; 2220 struct inode *inode = &ci->vfs_inode;
2221 int last = 0; 2221 int last = 0;
2222 int complete_capsnap = 0; 2222 int complete_capsnap = 0;
2223 int drop_capsnap = 0; 2223 int drop_capsnap = 0;
2224 int found = 0; 2224 int found = 0;
2225 struct ceph_cap_snap *capsnap = NULL; 2225 struct ceph_cap_snap *capsnap = NULL;
2226 2226
2227 spin_lock(&ci->i_ceph_lock); 2227 spin_lock(&ci->i_ceph_lock);
2228 ci->i_wrbuffer_ref -= nr; 2228 ci->i_wrbuffer_ref -= nr;
2229 last = !ci->i_wrbuffer_ref; 2229 last = !ci->i_wrbuffer_ref;
2230 2230
2231 if (ci->i_head_snapc == snapc) { 2231 if (ci->i_head_snapc == snapc) {
2232 ci->i_wrbuffer_ref_head -= nr; 2232 ci->i_wrbuffer_ref_head -= nr;
2233 if (ci->i_wrbuffer_ref_head == 0 && 2233 if (ci->i_wrbuffer_ref_head == 0 &&
2234 ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { 2234 ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) {
2235 BUG_ON(!ci->i_head_snapc); 2235 BUG_ON(!ci->i_head_snapc);
2236 ceph_put_snap_context(ci->i_head_snapc); 2236 ceph_put_snap_context(ci->i_head_snapc);
2237 ci->i_head_snapc = NULL; 2237 ci->i_head_snapc = NULL;
2238 } 2238 }
2239 dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n", 2239 dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
2240 inode, 2240 inode,
2241 ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr, 2241 ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
2242 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, 2242 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
2243 last ? " LAST" : ""); 2243 last ? " LAST" : "");
2244 } else { 2244 } else {
2245 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 2245 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2246 if (capsnap->context == snapc) { 2246 if (capsnap->context == snapc) {
2247 found = 1; 2247 found = 1;
2248 break; 2248 break;
2249 } 2249 }
2250 } 2250 }
2251 BUG_ON(!found); 2251 BUG_ON(!found);
2252 capsnap->dirty_pages -= nr; 2252 capsnap->dirty_pages -= nr;
2253 if (capsnap->dirty_pages == 0) { 2253 if (capsnap->dirty_pages == 0) {
2254 complete_capsnap = 1; 2254 complete_capsnap = 1;
2255 if (capsnap->dirty == 0) 2255 if (capsnap->dirty == 0)
2256 /* cap writeback completed before we created 2256 /* cap writeback completed before we created
2257 * the cap_snap; no FLUSHSNAP is needed */ 2257 * the cap_snap; no FLUSHSNAP is needed */
2258 drop_capsnap = 1; 2258 drop_capsnap = 1;
2259 } 2259 }
2260 dout("put_wrbuffer_cap_refs on %p cap_snap %p " 2260 dout("put_wrbuffer_cap_refs on %p cap_snap %p "
2261 " snap %lld %d/%d -> %d/%d %s%s%s\n", 2261 " snap %lld %d/%d -> %d/%d %s%s%s\n",
2262 inode, capsnap, capsnap->context->seq, 2262 inode, capsnap, capsnap->context->seq,
2263 ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, 2263 ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
2264 ci->i_wrbuffer_ref, capsnap->dirty_pages, 2264 ci->i_wrbuffer_ref, capsnap->dirty_pages,
2265 last ? " (wrbuffer last)" : "", 2265 last ? " (wrbuffer last)" : "",
2266 complete_capsnap ? " (complete capsnap)" : "", 2266 complete_capsnap ? " (complete capsnap)" : "",
2267 drop_capsnap ? " (drop capsnap)" : ""); 2267 drop_capsnap ? " (drop capsnap)" : "");
2268 if (drop_capsnap) { 2268 if (drop_capsnap) {
2269 ceph_put_snap_context(capsnap->context); 2269 ceph_put_snap_context(capsnap->context);
2270 list_del(&capsnap->ci_item); 2270 list_del(&capsnap->ci_item);
2271 list_del(&capsnap->flushing_item); 2271 list_del(&capsnap->flushing_item);
2272 ceph_put_cap_snap(capsnap); 2272 ceph_put_cap_snap(capsnap);
2273 } 2273 }
2274 } 2274 }
2275 2275
2276 spin_unlock(&ci->i_ceph_lock); 2276 spin_unlock(&ci->i_ceph_lock);
2277 2277
2278 if (last) { 2278 if (last) {
2279 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); 2279 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2280 iput(inode); 2280 iput(inode);
2281 } else if (complete_capsnap) { 2281 } else if (complete_capsnap) {
2282 ceph_flush_snaps(ci); 2282 ceph_flush_snaps(ci);
2283 wake_up_all(&ci->i_cap_wq); 2283 wake_up_all(&ci->i_cap_wq);
2284 } 2284 }
2285 if (drop_capsnap) 2285 if (drop_capsnap)
2286 iput(inode); 2286 iput(inode);
2287 } 2287 }
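The accounting above keeps dirty pages under the live ("head") snap context separate from pages pinned by an older cap_snap: a cap_snap becomes flushable only when its last dirty page is written back, and it can be dropped outright when no snapped metadata needs a FLUSHSNAP either. A sketch of just the cap_snap branch, with hypothetical minimal types:

#include <stdio.h>

struct capsnap_model { int dirty_pages; int dirty_meta; };

/* Returns 1 when the cap_snap is complete; *drop is set when it can
 * be discarded without telling the MDS anything. */
static int put_snap_pages(struct capsnap_model *cs, int nr, int *drop)
{
    cs->dirty_pages -= nr;
    if (cs->dirty_pages)
        return 0;
    *drop = (cs->dirty_meta == 0);  /* no FLUSHSNAP needed */
    return 1;
}

int main(void)
{
    struct capsnap_model cs = { .dirty_pages = 2, .dirty_meta = 1 };
    int drop = 0;
    put_snap_pages(&cs, 1, &drop);                          /* 2 -> 1 */
    printf("complete=%d\n", put_snap_pages(&cs, 1, &drop)); /* 1 */
    printf("drop=%d\n", drop);                 /* 0: FLUSHSNAP needed */
    return 0;
}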
2288 2288
2289 /* 2289 /*
2290 * Handle a cap GRANT message from the MDS. (Note that a GRANT may 2290 * Handle a cap GRANT message from the MDS. (Note that a GRANT may
2291 * actually be a revocation if it specifies a smaller cap set.) 2291 * actually be a revocation if it specifies a smaller cap set.)
2292 * 2292 *
2293 * caller holds s_mutex and i_ceph_lock, we drop both. 2293 * caller holds s_mutex and i_ceph_lock, we drop both.
2294 * 2294 *
2295 * return value: 2295 * return value:
2296 * 0 - ok 2296 * 0 - ok
2297 * 1 - check_caps on auth cap only (writeback) 2297 * 1 - check_caps on auth cap only (writeback)
2298 * 2 - check_caps (ack revoke) 2298 * 2 - check_caps (ack revoke)
2299 */ 2299 */
2300 static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, 2300 static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2301 struct ceph_mds_session *session, 2301 struct ceph_mds_session *session,
2302 struct ceph_cap *cap, 2302 struct ceph_cap *cap,
2303 struct ceph_buffer *xattr_buf) 2303 struct ceph_buffer *xattr_buf)
2304 __releases(ci->i_ceph_lock) 2304 __releases(ci->i_ceph_lock)
2305 { 2305 {
2306 struct ceph_inode_info *ci = ceph_inode(inode); 2306 struct ceph_inode_info *ci = ceph_inode(inode);
2307 int mds = session->s_mds; 2307 int mds = session->s_mds;
2308 int seq = le32_to_cpu(grant->seq); 2308 int seq = le32_to_cpu(grant->seq);
2309 int newcaps = le32_to_cpu(grant->caps); 2309 int newcaps = le32_to_cpu(grant->caps);
2310 int issued, implemented, used, wanted, dirty; 2310 int issued, implemented, used, wanted, dirty;
2311 u64 size = le64_to_cpu(grant->size); 2311 u64 size = le64_to_cpu(grant->size);
2312 u64 max_size = le64_to_cpu(grant->max_size); 2312 u64 max_size = le64_to_cpu(grant->max_size);
2313 struct timespec mtime, atime, ctime; 2313 struct timespec mtime, atime, ctime;
2314 int check_caps = 0; 2314 int check_caps = 0;
2315 int wake = 0; 2315 int wake = 0;
2316 int writeback = 0; 2316 int writeback = 0;
2317 int revoked_rdcache = 0; 2317 int revoked_rdcache = 0;
2318 int queue_invalidate = 0; 2318 int queue_invalidate = 0;
2319 2319
2320 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", 2320 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2321 inode, cap, mds, seq, ceph_cap_string(newcaps)); 2321 inode, cap, mds, seq, ceph_cap_string(newcaps));
2322 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, 2322 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2323 inode->i_size); 2323 inode->i_size);
2324 2324
2325 /* 2325 /*
2326 * If CACHE is being revoked, and we have no dirty buffers, 2326 * If CACHE is being revoked, and we have no dirty buffers,
2327 * try to invalidate (once). (If there are dirty buffers, we 2327 * try to invalidate (once). (If there are dirty buffers, we
2328 * will invalidate _after_ writeback.) 2328 * will invalidate _after_ writeback.)
2329 */ 2329 */
2330 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && 2330 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
2331 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && 2331 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
2332 !ci->i_wrbuffer_ref) { 2332 !ci->i_wrbuffer_ref) {
2333 if (try_nonblocking_invalidate(inode) == 0) { 2333 if (try_nonblocking_invalidate(inode) == 0) {
2334 revoked_rdcache = 1; 2334 revoked_rdcache = 1;
2335 } else { 2335 } else {
2336 /* there were locked pages... invalidate later 2336 /* there were locked pages... invalidate later
2337 in a separate thread. */ 2337 in a separate thread. */
2338 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { 2338 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
2339 queue_invalidate = 1; 2339 queue_invalidate = 1;
2340 ci->i_rdcache_revoking = ci->i_rdcache_gen; 2340 ci->i_rdcache_revoking = ci->i_rdcache_gen;
2341 } 2341 }
2342 } 2342 }
2343 } 2343 }
2344 2344
2345 /* side effects now are allowed */ 2345 /* side effects now are allowed */
2346 2346
2347 issued = __ceph_caps_issued(ci, &implemented); 2347 issued = __ceph_caps_issued(ci, &implemented);
2348 issued |= implemented | __ceph_caps_dirty(ci); 2348 issued |= implemented | __ceph_caps_dirty(ci);
2349 2349
2350 cap->cap_gen = session->s_cap_gen; 2350 cap->cap_gen = session->s_cap_gen;
2351 2351
2352 __check_cap_issue(ci, cap, newcaps); 2352 __check_cap_issue(ci, cap, newcaps);
2353 2353
2354 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { 2354 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
2355 inode->i_mode = le32_to_cpu(grant->mode); 2355 inode->i_mode = le32_to_cpu(grant->mode);
2356 inode->i_uid = le32_to_cpu(grant->uid); 2356 inode->i_uid = le32_to_cpu(grant->uid);
2357 inode->i_gid = le32_to_cpu(grant->gid); 2357 inode->i_gid = le32_to_cpu(grant->gid);
2358 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, 2358 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
2359 inode->i_uid, inode->i_gid); 2359 inode->i_uid, inode->i_gid);
2360 } 2360 }
2361 2361
2362 if ((issued & CEPH_CAP_LINK_EXCL) == 0) 2362 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
2363 set_nlink(inode, le32_to_cpu(grant->nlink)); 2363 set_nlink(inode, le32_to_cpu(grant->nlink));
2364 2364
2365 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { 2365 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
2366 int len = le32_to_cpu(grant->xattr_len); 2366 int len = le32_to_cpu(grant->xattr_len);
2367 u64 version = le64_to_cpu(grant->xattr_version); 2367 u64 version = le64_to_cpu(grant->xattr_version);
2368 2368
2369 if (version > ci->i_xattrs.version) { 2369 if (version > ci->i_xattrs.version) {
2370 dout(" got new xattrs v%llu on %p len %d\n", 2370 dout(" got new xattrs v%llu on %p len %d\n",
2371 version, inode, len); 2371 version, inode, len);
2372 if (ci->i_xattrs.blob) 2372 if (ci->i_xattrs.blob)
2373 ceph_buffer_put(ci->i_xattrs.blob); 2373 ceph_buffer_put(ci->i_xattrs.blob);
2374 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); 2374 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
2375 ci->i_xattrs.version = version; 2375 ci->i_xattrs.version = version;
2376 } 2376 }
2377 } 2377 }
2378 2378
2379 /* size/ctime/mtime/atime? */ 2379 /* size/ctime/mtime/atime? */
2380 ceph_fill_file_size(inode, issued, 2380 ceph_fill_file_size(inode, issued,
2381 le32_to_cpu(grant->truncate_seq), 2381 le32_to_cpu(grant->truncate_seq),
2382 le64_to_cpu(grant->truncate_size), size); 2382 le64_to_cpu(grant->truncate_size), size);
2383 ceph_decode_timespec(&mtime, &grant->mtime); 2383 ceph_decode_timespec(&mtime, &grant->mtime);
2384 ceph_decode_timespec(&atime, &grant->atime); 2384 ceph_decode_timespec(&atime, &grant->atime);
2385 ceph_decode_timespec(&ctime, &grant->ctime); 2385 ceph_decode_timespec(&ctime, &grant->ctime);
2386 ceph_fill_file_time(inode, issued, 2386 ceph_fill_file_time(inode, issued,
2387 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, 2387 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
2388 &atime); 2388 &atime);
2389 2389
2390 /* max size increase? */ 2390 /* max size increase? */
2391 if (max_size != ci->i_max_size) { 2391 if (max_size != ci->i_max_size) {
2392 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); 2392 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
2393 ci->i_max_size = max_size; 2393 ci->i_max_size = max_size;
2394 if (max_size >= ci->i_wanted_max_size) { 2394 if (max_size >= ci->i_wanted_max_size) {
2395 ci->i_wanted_max_size = 0; /* reset */ 2395 ci->i_wanted_max_size = 0; /* reset */
2396 ci->i_requested_max_size = 0; 2396 ci->i_requested_max_size = 0;
2397 } 2397 }
2398 wake = 1; 2398 wake = 1;
2399 } 2399 }
2400 2400
2401 /* check cap bits */ 2401 /* check cap bits */
2402 wanted = __ceph_caps_wanted(ci); 2402 wanted = __ceph_caps_wanted(ci);
2403 used = __ceph_caps_used(ci); 2403 used = __ceph_caps_used(ci);
2404 dirty = __ceph_caps_dirty(ci); 2404 dirty = __ceph_caps_dirty(ci);
2405 dout(" my wanted = %s, used = %s, dirty %s\n", 2405 dout(" my wanted = %s, used = %s, dirty %s\n",
2406 ceph_cap_string(wanted), 2406 ceph_cap_string(wanted),
2407 ceph_cap_string(used), 2407 ceph_cap_string(used),
2408 ceph_cap_string(dirty)); 2408 ceph_cap_string(dirty));
2409 if (wanted != le32_to_cpu(grant->wanted)) { 2409 if (wanted != le32_to_cpu(grant->wanted)) {
2410 dout("mds wanted %s -> %s\n", 2410 dout("mds wanted %s -> %s\n",
2411 ceph_cap_string(le32_to_cpu(grant->wanted)), 2411 ceph_cap_string(le32_to_cpu(grant->wanted)),
2412 ceph_cap_string(wanted)); 2412 ceph_cap_string(wanted));
2413 grant->wanted = cpu_to_le32(wanted); 2413 grant->wanted = cpu_to_le32(wanted);
2414 } 2414 }
2415 2415
2416 cap->seq = seq; 2416 cap->seq = seq;
2417 2417
2418 /* file layout may have changed */ 2418 /* file layout may have changed */
2419 ci->i_layout = grant->layout; 2419 ci->i_layout = grant->layout;
2420 2420
2421 /* revocation, grant, or no-op? */ 2421 /* revocation, grant, or no-op? */
2422 if (cap->issued & ~newcaps) { 2422 if (cap->issued & ~newcaps) {
2423 int revoking = cap->issued & ~newcaps; 2423 int revoking = cap->issued & ~newcaps;
2424 2424
2425 dout("revocation: %s -> %s (revoking %s)\n", 2425 dout("revocation: %s -> %s (revoking %s)\n",
2426 ceph_cap_string(cap->issued), 2426 ceph_cap_string(cap->issued),
2427 ceph_cap_string(newcaps), 2427 ceph_cap_string(newcaps),
2428 ceph_cap_string(revoking)); 2428 ceph_cap_string(revoking));
2429 if (revoking & used & CEPH_CAP_FILE_BUFFER) 2429 if (revoking & used & CEPH_CAP_FILE_BUFFER)
2430 writeback = 1; /* initiate writeback; will delay ack */ 2430 writeback = 1; /* initiate writeback; will delay ack */
2431 else if (revoking == CEPH_CAP_FILE_CACHE && 2431 else if (revoking == CEPH_CAP_FILE_CACHE &&
2432 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && 2432 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
2433 queue_invalidate) 2433 queue_invalidate)
2434 ; /* do nothing yet, invalidation will be queued */ 2434 ; /* do nothing yet, invalidation will be queued */
2435 else if (cap == ci->i_auth_cap) 2435 else if (cap == ci->i_auth_cap)
2436 check_caps = 1; /* check auth cap only */ 2436 check_caps = 1; /* check auth cap only */
2437 else 2437 else
2438 check_caps = 2; /* check all caps */ 2438 check_caps = 2; /* check all caps */
2439 cap->issued = newcaps; 2439 cap->issued = newcaps;
2440 cap->implemented |= newcaps; 2440 cap->implemented |= newcaps;
2441 } else if (cap->issued == newcaps) { 2441 } else if (cap->issued == newcaps) {
2442 dout("caps unchanged: %s -> %s\n", 2442 dout("caps unchanged: %s -> %s\n",
2443 ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); 2443 ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
2444 } else { 2444 } else {
2445 dout("grant: %s -> %s\n", ceph_cap_string(cap->issued), 2445 dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
2446 ceph_cap_string(newcaps)); 2446 ceph_cap_string(newcaps));
2447 cap->issued = newcaps; 2447 cap->issued = newcaps;
2448 cap->implemented |= newcaps; /* add bits only, to 2448 cap->implemented |= newcaps; /* add bits only, to
2449 * avoid stepping on a 2449 * avoid stepping on a
2450 * pending revocation */ 2450 * pending revocation */
2451 wake = 1; 2451 wake = 1;
2452 } 2452 }
2453 BUG_ON(cap->issued & ~cap->implemented); 2453 BUG_ON(cap->issued & ~cap->implemented);
2454 2454
2455 spin_unlock(&ci->i_ceph_lock); 2455 spin_unlock(&ci->i_ceph_lock);
2456 if (writeback) 2456 if (writeback)
2457 /* 2457 /*
2458 * queue inode for writeback: we can't actually call 2458 * queue inode for writeback: we can't actually call
2459 * filemap_write_and_wait, etc. from message handler 2459 * filemap_write_and_wait, etc. from message handler
2460 * context. 2460 * context.
2461 */ 2461 */
2462 ceph_queue_writeback(inode); 2462 ceph_queue_writeback(inode);
2463 if (queue_invalidate) 2463 if (queue_invalidate)
2464 ceph_queue_invalidate(inode); 2464 ceph_queue_invalidate(inode);
2465 if (wake) 2465 if (wake)
2466 wake_up_all(&ci->i_cap_wq); 2466 wake_up_all(&ci->i_cap_wq);
2467 2467
2468 if (check_caps == 1) 2468 if (check_caps == 1)
2469 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY, 2469 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
2470 session); 2470 session);
2471 else if (check_caps == 2) 2471 else if (check_caps == 2)
2472 ceph_check_caps(ci, CHECK_CAPS_NODELAY, session); 2472 ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
2473 else 2473 else
2474 mutex_unlock(&session->s_mutex); 2474 mutex_unlock(&session->s_mutex);
2475 } 2475 }
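The revocation branch in handle_cap_grant() orders its responses by cost: dirty buffers under a revoked FILE_BUFFER force writeback first (the ack is delayed), a pure FILE_CACHE revocation can ride an already-queued invalidation, and anything else just schedules a cap check, auth-only when the revoked cap is the auth cap. A decision-table sketch of that triage (bit values illustrative; the kernel's extra LAZYIO test is omitted here):

#include <stdio.h>

#define CAP_FILE_CACHE  0x1
#define CAP_FILE_BUFFER 0x2

enum action { ACT_NONE, ACT_WRITEBACK, ACT_CHECK_AUTH, ACT_CHECK_ALL };

static enum action revoke_action(int issued, int newcaps, int used,
                                 int invalidate_queued, int is_auth)
{
    int revoking = issued & ~newcaps;
    if (!revoking)
        return ACT_NONE;                 /* plain grant or no-op */
    if (revoking & used & CAP_FILE_BUFFER)
        return ACT_WRITEBACK;            /* flush dirty data, delay ack */
    if (revoking == CAP_FILE_CACHE && invalidate_queued)
        return ACT_NONE;                 /* invalidation already queued */
    return is_auth ? ACT_CHECK_AUTH : ACT_CHECK_ALL;
}

int main(void)
{
    printf("%d\n", revoke_action(CAP_FILE_BUFFER, 0,
                                 CAP_FILE_BUFFER, 0, 1));
    /* prints 1: ACT_WRITEBACK */
    return 0;
}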
2476 2476
2477 /* 2477 /*
2478 * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the 2478 * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
2479 * MDS has been safely committed. 2479 * MDS has been safely committed.
2480 */ 2480 */
2481 static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, 2481 static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2482 struct ceph_mds_caps *m, 2482 struct ceph_mds_caps *m,
2483 struct ceph_mds_session *session, 2483 struct ceph_mds_session *session,
2484 struct ceph_cap *cap) 2484 struct ceph_cap *cap)
2485 __releases(ci->i_ceph_lock) 2485 __releases(ci->i_ceph_lock)
2486 { 2486 {
2487 struct ceph_inode_info *ci = ceph_inode(inode); 2487 struct ceph_inode_info *ci = ceph_inode(inode);
2488 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 2488 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
2489 unsigned seq = le32_to_cpu(m->seq); 2489 unsigned seq = le32_to_cpu(m->seq);
2490 int dirty = le32_to_cpu(m->dirty); 2490 int dirty = le32_to_cpu(m->dirty);
2491 int cleaned = 0; 2491 int cleaned = 0;
2492 int drop = 0; 2492 int drop = 0;
2493 int i; 2493 int i;
2494 2494
2495 for (i = 0; i < CEPH_CAP_BITS; i++) 2495 for (i = 0; i < CEPH_CAP_BITS; i++)
2496 if ((dirty & (1 << i)) && 2496 if ((dirty & (1 << i)) &&
2497 flush_tid == ci->i_cap_flush_tid[i]) 2497 flush_tid == ci->i_cap_flush_tid[i])
2498 cleaned |= 1 << i; 2498 cleaned |= 1 << i;
2499 2499
2500 dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s," 2500 dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
2501 " flushing %s -> %s\n", 2501 " flushing %s -> %s\n",
2502 inode, session->s_mds, seq, ceph_cap_string(dirty), 2502 inode, session->s_mds, seq, ceph_cap_string(dirty),
2503 ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), 2503 ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
2504 ceph_cap_string(ci->i_flushing_caps & ~cleaned)); 2504 ceph_cap_string(ci->i_flushing_caps & ~cleaned));
2505 2505
2506 if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned)) 2506 if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
2507 goto out; 2507 goto out;
2508 2508
2509 ci->i_flushing_caps &= ~cleaned; 2509 ci->i_flushing_caps &= ~cleaned;
2510 2510
2511 spin_lock(&mdsc->cap_dirty_lock); 2511 spin_lock(&mdsc->cap_dirty_lock);
2512 if (ci->i_flushing_caps == 0) { 2512 if (ci->i_flushing_caps == 0) {
2513 list_del_init(&ci->i_flushing_item); 2513 list_del_init(&ci->i_flushing_item);
2514 if (!list_empty(&session->s_cap_flushing)) 2514 if (!list_empty(&session->s_cap_flushing))
2515 dout(" mds%d still flushing cap on %p\n", 2515 dout(" mds%d still flushing cap on %p\n",
2516 session->s_mds, 2516 session->s_mds,
2517 &list_entry(session->s_cap_flushing.next, 2517 &list_entry(session->s_cap_flushing.next,
2518 struct ceph_inode_info, 2518 struct ceph_inode_info,
2519 i_flushing_item)->vfs_inode); 2519 i_flushing_item)->vfs_inode);
2520 mdsc->num_cap_flushing--; 2520 mdsc->num_cap_flushing--;
2521 wake_up_all(&mdsc->cap_flushing_wq); 2521 wake_up_all(&mdsc->cap_flushing_wq);
2522 dout(" inode %p now !flushing\n", inode); 2522 dout(" inode %p now !flushing\n", inode);
2523 2523
2524 if (ci->i_dirty_caps == 0) { 2524 if (ci->i_dirty_caps == 0) {
2525 dout(" inode %p now clean\n", inode); 2525 dout(" inode %p now clean\n", inode);
2526 BUG_ON(!list_empty(&ci->i_dirty_item)); 2526 BUG_ON(!list_empty(&ci->i_dirty_item));
2527 drop = 1; 2527 drop = 1;
2528 if (ci->i_wrbuffer_ref_head == 0) { 2528 if (ci->i_wrbuffer_ref_head == 0) {
2529 BUG_ON(!ci->i_head_snapc); 2529 BUG_ON(!ci->i_head_snapc);
2530 ceph_put_snap_context(ci->i_head_snapc); 2530 ceph_put_snap_context(ci->i_head_snapc);
2531 ci->i_head_snapc = NULL; 2531 ci->i_head_snapc = NULL;
2532 } 2532 }
2533 } else { 2533 } else {
2534 BUG_ON(list_empty(&ci->i_dirty_item)); 2534 BUG_ON(list_empty(&ci->i_dirty_item));
2535 } 2535 }
2536 } 2536 }
2537 spin_unlock(&mdsc->cap_dirty_lock); 2537 spin_unlock(&mdsc->cap_dirty_lock);
2538 wake_up_all(&ci->i_cap_wq); 2538 wake_up_all(&ci->i_cap_wq);
2539 2539
2540 out: 2540 out:
2541 spin_unlock(&ci->i_ceph_lock); 2541 spin_unlock(&ci->i_ceph_lock);
2542 if (drop) 2542 if (drop)
2543 iput(inode); 2543 iput(inode);
2544 } 2544 }
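The "cleaned" mask computed at the top of handle_cap_flush_ack() is the dual of the caps_are_flushed() scan: a FLUSH_ACK only clears the dirty bits whose recorded flush tid exactly matches the acked tid, so a newer flush of the same bit is never acknowledged early. In isolation (sizes and types illustrative):

#include <stdio.h>

#define CAP_BITS 16

static unsigned cleaned_mask(unsigned dirty,
                             const unsigned long long tid_of[CAP_BITS],
                             unsigned long long acked_tid)
{
    unsigned cleaned = 0;
    for (int i = 0; i < CAP_BITS; i++)
        if ((dirty & (1u << i)) && tid_of[i] == acked_tid)
            cleaned |= 1u << i;
    return cleaned;
}

int main(void)
{
    unsigned long long tid_of[CAP_BITS] = { [0] = 5, [1] = 6 };
    /* bits 0 and 1 are dirty; only bit 0 was flushed under tid 5 */
    printf("%#x\n", cleaned_mask(0x3, tid_of, 5));   /* 0x1 */
    return 0;
}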
2545 2545
2546 /* 2546 /*
2547 * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can 2547 * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
2548 * throw away our cap_snap. 2548 * throw away our cap_snap.
2549 * 2549 *
2550 * Caller holds s_mutex. 2550 * Caller holds s_mutex.
2551 */ 2551 */
2552 static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, 2552 static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
2553 struct ceph_mds_caps *m, 2553 struct ceph_mds_caps *m,
2554 struct ceph_mds_session *session) 2554 struct ceph_mds_session *session)
2555 { 2555 {
2556 struct ceph_inode_info *ci = ceph_inode(inode); 2556 struct ceph_inode_info *ci = ceph_inode(inode);
2557 u64 follows = le64_to_cpu(m->snap_follows); 2557 u64 follows = le64_to_cpu(m->snap_follows);
2558 struct ceph_cap_snap *capsnap; 2558 struct ceph_cap_snap *capsnap;
2559 int drop = 0; 2559 int drop = 0;
2560 2560
2561 dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", 2561 dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
2562 inode, ci, session->s_mds, follows); 2562 inode, ci, session->s_mds, follows);
2563 2563
2564 spin_lock(&ci->i_ceph_lock); 2564 spin_lock(&ci->i_ceph_lock);
2565 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 2565 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2566 if (capsnap->follows == follows) { 2566 if (capsnap->follows == follows) {
2567 if (capsnap->flush_tid != flush_tid) { 2567 if (capsnap->flush_tid != flush_tid) {
2568 dout(" cap_snap %p follows %lld tid %lld !=" 2568 dout(" cap_snap %p follows %lld tid %lld !="
2569 " %lld\n", capsnap, follows, 2569 " %lld\n", capsnap, follows,
2570 flush_tid, capsnap->flush_tid); 2570 flush_tid, capsnap->flush_tid);
2571 break; 2571 break;
2572 } 2572 }
2573 WARN_ON(capsnap->dirty_pages || capsnap->writing); 2573 WARN_ON(capsnap->dirty_pages || capsnap->writing);
2574 dout(" removing %p cap_snap %p follows %lld\n", 2574 dout(" removing %p cap_snap %p follows %lld\n",
2575 inode, capsnap, follows); 2575 inode, capsnap, follows);
2576 ceph_put_snap_context(capsnap->context); 2576 ceph_put_snap_context(capsnap->context);
2577 list_del(&capsnap->ci_item); 2577 list_del(&capsnap->ci_item);
2578 list_del(&capsnap->flushing_item); 2578 list_del(&capsnap->flushing_item);
2579 ceph_put_cap_snap(capsnap); 2579 ceph_put_cap_snap(capsnap);
2580 drop = 1; 2580 drop = 1;
2581 break; 2581 break;
2582 } else { 2582 } else {
2583 dout(" skipping cap_snap %p follows %lld\n", 2583 dout(" skipping cap_snap %p follows %lld\n",
2584 capsnap, capsnap->follows); 2584 capsnap, capsnap->follows);
2585 } 2585 }
2586 } 2586 }
2587 spin_unlock(&ci->i_ceph_lock); 2587 spin_unlock(&ci->i_ceph_lock);
2588 if (drop) 2588 if (drop)
2589 iput(inode); 2589 iput(inode);
2590 } 2590 }
2591 2591
2592 /* 2592 /*
2593 * Handle TRUNC from MDS, indicating file truncation. 2593 * Handle TRUNC from MDS, indicating file truncation.
2594 * 2594 *
2595 * caller holds s_mutex. 2595 * caller holds s_mutex.
2596 */ 2596 */
2597 static void handle_cap_trunc(struct inode *inode, 2597 static void handle_cap_trunc(struct inode *inode,
2598 struct ceph_mds_caps *trunc, 2598 struct ceph_mds_caps *trunc,
2599 struct ceph_mds_session *session) 2599 struct ceph_mds_session *session)
2600 __releases(ci->i_ceph_lock) 2600 __releases(ci->i_ceph_lock)
2601 { 2601 {
2602 struct ceph_inode_info *ci = ceph_inode(inode); 2602 struct ceph_inode_info *ci = ceph_inode(inode);
2603 int mds = session->s_mds; 2603 int mds = session->s_mds;
2604 int seq = le32_to_cpu(trunc->seq); 2604 int seq = le32_to_cpu(trunc->seq);
2605 u32 truncate_seq = le32_to_cpu(trunc->truncate_seq); 2605 u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
2606 u64 truncate_size = le64_to_cpu(trunc->truncate_size); 2606 u64 truncate_size = le64_to_cpu(trunc->truncate_size);
2607 u64 size = le64_to_cpu(trunc->size); 2607 u64 size = le64_to_cpu(trunc->size);
2608 int implemented = 0; 2608 int implemented = 0;
2609 int dirty = __ceph_caps_dirty(ci); 2609 int dirty = __ceph_caps_dirty(ci);
2610 int issued = __ceph_caps_issued(ceph_inode(inode), &implemented); 2610 int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
2611 int queue_trunc = 0; 2611 int queue_trunc = 0;
2612 2612
2613 issued |= implemented | dirty; 2613 issued |= implemented | dirty;
2614 2614
2615 dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n", 2615 dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
2616 inode, mds, seq, truncate_size, truncate_seq); 2616 inode, mds, seq, truncate_size, truncate_seq);
2617 queue_trunc = ceph_fill_file_size(inode, issued, 2617 queue_trunc = ceph_fill_file_size(inode, issued,
2618 truncate_seq, truncate_size, size); 2618 truncate_seq, truncate_size, size);
2619 spin_unlock(&ci->i_ceph_lock); 2619 spin_unlock(&ci->i_ceph_lock);
2620 2620
2621 if (queue_trunc) 2621 if (queue_trunc)
2622 ceph_queue_vmtruncate(inode); 2622 ceph_queue_vmtruncate(inode);
2623 } 2623 }
2624 2624
2625 /* 2625 /*
2626 * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a 2626 * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
2627 * different one. If this is the most recent migration we've seen (as 2627 * different one. If this is the most recent migration we've seen (as
2628 * indicated by mseq), make note of the migrating cap bits for the 2628 * indicated by mseq), make note of the migrating cap bits for the
2629 * duration (until we see the corresponding IMPORT). 2629 * duration (until we see the corresponding IMPORT).
2630 * 2630 *
2631 * caller holds s_mutex 2631 * caller holds s_mutex
2632 */ 2632 */
2633 static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, 2633 static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2634 struct ceph_mds_session *session, 2634 struct ceph_mds_session *session,
2635 int *open_target_sessions) 2635 int *open_target_sessions)
2636 { 2636 {
2637 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 2637 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
2638 struct ceph_inode_info *ci = ceph_inode(inode); 2638 struct ceph_inode_info *ci = ceph_inode(inode);
2639 int mds = session->s_mds; 2639 int mds = session->s_mds;
2640 unsigned mseq = le32_to_cpu(ex->migrate_seq); 2640 unsigned mseq = le32_to_cpu(ex->migrate_seq);
2641 struct ceph_cap *cap = NULL, *t; 2641 struct ceph_cap *cap = NULL, *t;
2642 struct rb_node *p; 2642 struct rb_node *p;
2643 int remember = 1; 2643 int remember = 1;
2644 2644
2645 dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", 2645 dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
2646 inode, ci, mds, mseq); 2646 inode, ci, mds, mseq);
2647 2647
2648 spin_lock(&ci->i_ceph_lock); 2648 spin_lock(&ci->i_ceph_lock);
2649 2649
2650 /* make sure we haven't seen a higher mseq */ 2650 /* make sure we haven't seen a higher mseq */
2651 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 2651 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
2652 t = rb_entry(p, struct ceph_cap, ci_node); 2652 t = rb_entry(p, struct ceph_cap, ci_node);
2653 if (ceph_seq_cmp(t->mseq, mseq) > 0) { 2653 if (ceph_seq_cmp(t->mseq, mseq) > 0) {
2654 dout(" higher mseq on cap from mds%d\n", 2654 dout(" higher mseq on cap from mds%d\n",
2655 t->session->s_mds); 2655 t->session->s_mds);
2656 remember = 0; 2656 remember = 0;
2657 } 2657 }
2658 if (t->session->s_mds == mds) 2658 if (t->session->s_mds == mds)
2659 cap = t; 2659 cap = t;
2660 } 2660 }
2661 2661
2662 if (cap) { 2662 if (cap) {
2663 if (remember) { 2663 if (remember) {
2664 /* make note */ 2664 /* make note */
2665 ci->i_cap_exporting_mds = mds; 2665 ci->i_cap_exporting_mds = mds;
2666 ci->i_cap_exporting_mseq = mseq; 2666 ci->i_cap_exporting_mseq = mseq;
2667 ci->i_cap_exporting_issued = cap->issued; 2667 ci->i_cap_exporting_issued = cap->issued;
2668 2668
2669 /* 2669 /*
2670 * make sure we have open sessions with all possible 2670 * make sure we have open sessions with all possible
2671 * export targets, so that we get the matching IMPORT 2671 * export targets, so that we get the matching IMPORT
2672 */ 2672 */
2673 *open_target_sessions = 1; 2673 *open_target_sessions = 1;
2674 2674
2675 /* 2675 /*
2676 * we can't flush dirty caps that we've seen the 2676 * we can't flush dirty caps that we've seen the
2677 * EXPORT but no IMPORT for 2677 * EXPORT but no IMPORT for
2678 */ 2678 */
2679 spin_lock(&mdsc->cap_dirty_lock); 2679 spin_lock(&mdsc->cap_dirty_lock);
2680 if (!list_empty(&ci->i_dirty_item)) { 2680 if (!list_empty(&ci->i_dirty_item)) {
2681 dout(" moving %p to cap_dirty_migrating\n", 2681 dout(" moving %p to cap_dirty_migrating\n",
2682 inode); 2682 inode);
2683 list_move(&ci->i_dirty_item, 2683 list_move(&ci->i_dirty_item,
2684 &mdsc->cap_dirty_migrating); 2684 &mdsc->cap_dirty_migrating);
2685 } 2685 }
2686 spin_unlock(&mdsc->cap_dirty_lock); 2686 spin_unlock(&mdsc->cap_dirty_lock);
2687 } 2687 }
2688 __ceph_remove_cap(cap); 2688 __ceph_remove_cap(cap);
2689 } 2689 }
2690 /* else, we already released it */ 2690 /* else, we already released it */
2691 2691
2692 spin_unlock(&ci->i_ceph_lock); 2692 spin_unlock(&ci->i_ceph_lock);
2693 } 2693 }
2694 2694
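The mseq comparison above relies on ceph_seq_cmp(), a wraparound-safe sequence compare. A self-contained sketch of the idiom, assuming the two sequence numbers are less than 2^31 apart:

#include <stdint.h>

/* Returns >0 if a is newer than b, <0 if older, 0 if equal, and
 * stays correct across u32 wraparound (serial-number arithmetic).
 * e.g. seq_cmp(1, 0xffffffff) > 0: seq 1 is newer than seq 2^32-1. */
static int seq_cmp(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b);
}
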
2695 /* 2695 /*
2696 * Handle cap IMPORT. If there are temp bits from an older EXPORT, 2696 * Handle cap IMPORT. If there are temp bits from an older EXPORT,
2697 * clean them up. 2697 * clean them up.
2698 * 2698 *
2699 * caller holds s_mutex. 2699 * caller holds s_mutex.
2700 */ 2700 */
2701 static void handle_cap_import(struct ceph_mds_client *mdsc, 2701 static void handle_cap_import(struct ceph_mds_client *mdsc,
2702 struct inode *inode, struct ceph_mds_caps *im, 2702 struct inode *inode, struct ceph_mds_caps *im,
2703 struct ceph_mds_session *session, 2703 struct ceph_mds_session *session,
2704 void *snaptrace, int snaptrace_len) 2704 void *snaptrace, int snaptrace_len)
2705 { 2705 {
2706 struct ceph_inode_info *ci = ceph_inode(inode); 2706 struct ceph_inode_info *ci = ceph_inode(inode);
2707 int mds = session->s_mds; 2707 int mds = session->s_mds;
2708 unsigned issued = le32_to_cpu(im->caps); 2708 unsigned issued = le32_to_cpu(im->caps);
2709 unsigned wanted = le32_to_cpu(im->wanted); 2709 unsigned wanted = le32_to_cpu(im->wanted);
2710 unsigned seq = le32_to_cpu(im->seq); 2710 unsigned seq = le32_to_cpu(im->seq);
2711 unsigned mseq = le32_to_cpu(im->migrate_seq); 2711 unsigned mseq = le32_to_cpu(im->migrate_seq);
2712 u64 realmino = le64_to_cpu(im->realm); 2712 u64 realmino = le64_to_cpu(im->realm);
2713 u64 cap_id = le64_to_cpu(im->cap_id); 2713 u64 cap_id = le64_to_cpu(im->cap_id);
2714 2714
2715 if (ci->i_cap_exporting_mds >= 0 && 2715 if (ci->i_cap_exporting_mds >= 0 &&
2716 ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) { 2716 ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
2717 dout("handle_cap_import inode %p ci %p mds%d mseq %d" 2717 dout("handle_cap_import inode %p ci %p mds%d mseq %d"
2718 " - cleared exporting from mds%d\n", 2718 " - cleared exporting from mds%d\n",
2719 inode, ci, mds, mseq, 2719 inode, ci, mds, mseq,
2720 ci->i_cap_exporting_mds); 2720 ci->i_cap_exporting_mds);
2721 ci->i_cap_exporting_issued = 0; 2721 ci->i_cap_exporting_issued = 0;
2722 ci->i_cap_exporting_mseq = 0; 2722 ci->i_cap_exporting_mseq = 0;
2723 ci->i_cap_exporting_mds = -1; 2723 ci->i_cap_exporting_mds = -1;
2724 2724
2725 spin_lock(&mdsc->cap_dirty_lock); 2725 spin_lock(&mdsc->cap_dirty_lock);
2726 if (!list_empty(&ci->i_dirty_item)) { 2726 if (!list_empty(&ci->i_dirty_item)) {
2727 dout(" moving %p back to cap_dirty\n", inode); 2727 dout(" moving %p back to cap_dirty\n", inode);
2728 list_move(&ci->i_dirty_item, &mdsc->cap_dirty); 2728 list_move(&ci->i_dirty_item, &mdsc->cap_dirty);
2729 } 2729 }
2730 spin_unlock(&mdsc->cap_dirty_lock); 2730 spin_unlock(&mdsc->cap_dirty_lock);
2731 } else { 2731 } else {
2732 dout("handle_cap_import inode %p ci %p mds%d mseq %d\n", 2732 dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
2733 inode, ci, mds, mseq); 2733 inode, ci, mds, mseq);
2734 } 2734 }
2735 2735
2736 down_write(&mdsc->snap_rwsem); 2736 down_write(&mdsc->snap_rwsem);
2737 ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len, 2737 ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
2738 false); 2738 false);
2739 downgrade_write(&mdsc->snap_rwsem); 2739 downgrade_write(&mdsc->snap_rwsem);
2740 ceph_add_cap(inode, session, cap_id, -1, 2740 ceph_add_cap(inode, session, cap_id, -1,
2741 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, 2741 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
2742 NULL /* no caps context */); 2742 NULL /* no caps context */);
2743 kick_flushing_inode_caps(mdsc, session, inode); 2743 kick_flushing_inode_caps(mdsc, session, inode);
2744 up_read(&mdsc->snap_rwsem); 2744 up_read(&mdsc->snap_rwsem);
2745 2745
2746 /* make sure we re-request max_size, if necessary */ 2746 /* make sure we re-request max_size, if necessary */
2747 spin_lock(&ci->i_ceph_lock); 2747 spin_lock(&ci->i_ceph_lock);
2748 ci->i_requested_max_size = 0; 2748 ci->i_requested_max_size = 0;
2749 spin_unlock(&ci->i_ceph_lock); 2749 spin_unlock(&ci->i_ceph_lock);
2750 } 2750 }
2751 2751
2752 /* 2752 /*
2753 * Handle a caps message from the MDS. 2753 * Handle a caps message from the MDS.
2754 * 2754 *
2755 * Identify the appropriate session, inode, and call the right handler 2755 * Identify the appropriate session, inode, and call the right handler
2756 * based on the cap op. 2756 * based on the cap op.
2757 */ 2757 */
2758 void ceph_handle_caps(struct ceph_mds_session *session, 2758 void ceph_handle_caps(struct ceph_mds_session *session,
2759 struct ceph_msg *msg) 2759 struct ceph_msg *msg)
2760 { 2760 {
2761 struct ceph_mds_client *mdsc = session->s_mdsc; 2761 struct ceph_mds_client *mdsc = session->s_mdsc;
2762 struct super_block *sb = mdsc->fsc->sb; 2762 struct super_block *sb = mdsc->fsc->sb;
2763 struct inode *inode; 2763 struct inode *inode;
2764 struct ceph_inode_info *ci; 2764 struct ceph_inode_info *ci;
2765 struct ceph_cap *cap; 2765 struct ceph_cap *cap;
2766 struct ceph_mds_caps *h; 2766 struct ceph_mds_caps *h;
2767 int mds = session->s_mds; 2767 int mds = session->s_mds;
2768 int op; 2768 int op;
2769 u32 seq, mseq; 2769 u32 seq, mseq;
2770 struct ceph_vino vino; 2770 struct ceph_vino vino;
2771 u64 cap_id; 2771 u64 cap_id;
2772 u64 size, max_size; 2772 u64 size, max_size;
2773 u64 tid; 2773 u64 tid;
2774 void *snaptrace; 2774 void *snaptrace;
2775 size_t snaptrace_len; 2775 size_t snaptrace_len;
2776 void *flock; 2776 void *flock;
2777 u32 flock_len; 2777 u32 flock_len;
2778 int open_target_sessions = 0; 2778 int open_target_sessions = 0;
2779 2779
2780 dout("handle_caps from mds%d\n", mds); 2780 dout("handle_caps from mds%d\n", mds);
2781 2781
2782 /* decode */ 2782 /* decode */
2783 tid = le64_to_cpu(msg->hdr.tid); 2783 tid = le64_to_cpu(msg->hdr.tid);
2784 if (msg->front.iov_len < sizeof(*h)) 2784 if (msg->front.iov_len < sizeof(*h))
2785 goto bad; 2785 goto bad;
2786 h = msg->front.iov_base; 2786 h = msg->front.iov_base;
2787 op = le32_to_cpu(h->op); 2787 op = le32_to_cpu(h->op);
2788 vino.ino = le64_to_cpu(h->ino); 2788 vino.ino = le64_to_cpu(h->ino);
2789 vino.snap = CEPH_NOSNAP; 2789 vino.snap = CEPH_NOSNAP;
2790 cap_id = le64_to_cpu(h->cap_id); 2790 cap_id = le64_to_cpu(h->cap_id);
2791 seq = le32_to_cpu(h->seq); 2791 seq = le32_to_cpu(h->seq);
2792 mseq = le32_to_cpu(h->migrate_seq); 2792 mseq = le32_to_cpu(h->migrate_seq);
2793 size = le64_to_cpu(h->size); 2793 size = le64_to_cpu(h->size);
2794 max_size = le64_to_cpu(h->max_size); 2794 max_size = le64_to_cpu(h->max_size);
2795 2795
2796 snaptrace = h + 1; 2796 snaptrace = h + 1;
2797 snaptrace_len = le32_to_cpu(h->snap_trace_len); 2797 snaptrace_len = le32_to_cpu(h->snap_trace_len);
2798 2798
2799 if (le16_to_cpu(msg->hdr.version) >= 2) { 2799 if (le16_to_cpu(msg->hdr.version) >= 2) {
2800 void *p, *end; 2800 void *p, *end;
2801 2801
2802 p = snaptrace + snaptrace_len; 2802 p = snaptrace + snaptrace_len;
2803 end = msg->front.iov_base + msg->front.iov_len; 2803 end = msg->front.iov_base + msg->front.iov_len;
2804 ceph_decode_32_safe(&p, end, flock_len, bad); 2804 ceph_decode_32_safe(&p, end, flock_len, bad);
2805 flock = p; 2805 flock = p;
2806 } else { 2806 } else {
2807 flock = NULL; 2807 flock = NULL;
2808 flock_len = 0; 2808 flock_len = 0;
2809 } 2809 }
2810 2810
2811 mutex_lock(&session->s_mutex); 2811 mutex_lock(&session->s_mutex);
2812 session->s_seq++; 2812 session->s_seq++;
2813 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, 2813 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
2814 (unsigned)seq); 2814 (unsigned)seq);
2815 2815
2816 /* lookup ino */ 2816 /* lookup ino */
2817 inode = ceph_find_inode(sb, vino); 2817 inode = ceph_find_inode(sb, vino);
2818 ci = ceph_inode(inode); 2818 ci = ceph_inode(inode);
2819 dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, 2819 dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
2820 vino.snap, inode); 2820 vino.snap, inode);
2821 if (!inode) { 2821 if (!inode) {
2822 dout(" i don't have ino %llx\n", vino.ino); 2822 dout(" i don't have ino %llx\n", vino.ino);
2823 2823
2824 if (op == CEPH_CAP_OP_IMPORT) 2824 if (op == CEPH_CAP_OP_IMPORT)
2825 __queue_cap_release(session, vino.ino, cap_id, 2825 __queue_cap_release(session, vino.ino, cap_id,
2826 mseq, seq); 2826 mseq, seq);
2827 goto flush_cap_releases; 2827 goto flush_cap_releases;
2828 } 2828 }
2829 2829
2830 /* these will work even if we don't have a cap yet */ 2830 /* these will work even if we don't have a cap yet */
2831 switch (op) { 2831 switch (op) {
2832 case CEPH_CAP_OP_FLUSHSNAP_ACK: 2832 case CEPH_CAP_OP_FLUSHSNAP_ACK:
2833 handle_cap_flushsnap_ack(inode, tid, h, session); 2833 handle_cap_flushsnap_ack(inode, tid, h, session);
2834 goto done; 2834 goto done;
2835 2835
2836 case CEPH_CAP_OP_EXPORT: 2836 case CEPH_CAP_OP_EXPORT:
2837 handle_cap_export(inode, h, session, &open_target_sessions); 2837 handle_cap_export(inode, h, session, &open_target_sessions);
2838 goto done; 2838 goto done;
2839 2839
2840 case CEPH_CAP_OP_IMPORT: 2840 case CEPH_CAP_OP_IMPORT:
2841 handle_cap_import(mdsc, inode, h, session, 2841 handle_cap_import(mdsc, inode, h, session,
2842 snaptrace, snaptrace_len); 2842 snaptrace, snaptrace_len);
2843 ceph_check_caps(ceph_inode(inode), 0, session); 2843 ceph_check_caps(ceph_inode(inode), 0, session);
2844 goto done_unlocked; 2844 goto done_unlocked;
2845 } 2845 }
2846 2846
2847 /* the rest require a cap */ 2847 /* the rest require a cap */
2848 spin_lock(&ci->i_ceph_lock); 2848 spin_lock(&ci->i_ceph_lock);
2849 cap = __get_cap_for_mds(ceph_inode(inode), mds); 2849 cap = __get_cap_for_mds(ceph_inode(inode), mds);
2850 if (!cap) { 2850 if (!cap) {
2851 dout(" no cap on %p ino %llx.%llx from mds%d\n", 2851 dout(" no cap on %p ino %llx.%llx from mds%d\n",
2852 inode, ceph_ino(inode), ceph_snap(inode), mds); 2852 inode, ceph_ino(inode), ceph_snap(inode), mds);
2853 spin_unlock(&ci->i_ceph_lock); 2853 spin_unlock(&ci->i_ceph_lock);
2854 goto flush_cap_releases; 2854 goto flush_cap_releases;
2855 } 2855 }
2856 2856
2857 /* note that each of these drops i_ceph_lock for us */ 2857 /* note that each of these drops i_ceph_lock for us */
2858 switch (op) { 2858 switch (op) {
2859 case CEPH_CAP_OP_REVOKE: 2859 case CEPH_CAP_OP_REVOKE:
2860 case CEPH_CAP_OP_GRANT: 2860 case CEPH_CAP_OP_GRANT:
2861 handle_cap_grant(inode, h, session, cap, msg->middle); 2861 handle_cap_grant(inode, h, session, cap, msg->middle);
2862 goto done_unlocked; 2862 goto done_unlocked;
2863 2863
2864 case CEPH_CAP_OP_FLUSH_ACK: 2864 case CEPH_CAP_OP_FLUSH_ACK:
2865 handle_cap_flush_ack(inode, tid, h, session, cap); 2865 handle_cap_flush_ack(inode, tid, h, session, cap);
2866 break; 2866 break;
2867 2867
2868 case CEPH_CAP_OP_TRUNC: 2868 case CEPH_CAP_OP_TRUNC:
2869 handle_cap_trunc(inode, h, session); 2869 handle_cap_trunc(inode, h, session);
2870 break; 2870 break;
2871 2871
2872 default: 2872 default:
2873 spin_unlock(&ci->i_ceph_lock); 2873 spin_unlock(&ci->i_ceph_lock);
2874 pr_err("ceph_handle_caps: unknown cap op %d %s\n", op, 2874 pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
2875 ceph_cap_op_name(op)); 2875 ceph_cap_op_name(op));
2876 } 2876 }
2877 2877
2878 goto done; 2878 goto done;
2879 2879
2880 flush_cap_releases: 2880 flush_cap_releases:
2881 /* 2881 /*
2882 * send any full release message to try to move things 2882 * send any full release message to try to move things
2883 * along for the mds (who clearly thinks we still have this 2883 * along for the mds (who clearly thinks we still have this
2884 * cap). 2884 * cap).
2885 */ 2885 */
2886 ceph_add_cap_releases(mdsc, session); 2886 ceph_add_cap_releases(mdsc, session);
2887 ceph_send_cap_releases(mdsc, session); 2887 ceph_send_cap_releases(mdsc, session);
2888 2888
2889 done: 2889 done:
2890 mutex_unlock(&session->s_mutex); 2890 mutex_unlock(&session->s_mutex);
2891 done_unlocked: 2891 done_unlocked:
2892 if (inode) 2892 if (inode)
2893 iput(inode); 2893 iput(inode);
2894 if (open_target_sessions) 2894 if (open_target_sessions)
2895 ceph_mdsc_open_export_target_sessions(mdsc, session); 2895 ceph_mdsc_open_export_target_sessions(mdsc, session);
2896 return; 2896 return;
2897 2897
2898 bad: 2898 bad:
2899 pr_err("ceph_handle_caps: corrupt message\n"); 2899 pr_err("ceph_handle_caps: corrupt message\n");
2900 ceph_msg_dump(msg); 2900 ceph_msg_dump(msg);
2901 return; 2901 return;
2902 } 2902 }
2903 2903
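Note how the decode path never trusts the message length: the fixed header is checked against iov_len up front, and the optional flock section is read via ceph_decode_32_safe(), which jumps to the bad label on truncation. A userspace sketch of that bounds-checked decode pattern (an illustration, not the kernel macro):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Read a 32-bit field only if the buffer still holds one; a failure
 * here corresponds to the "goto bad" corrupt-message path above.
 * (Endian conversion is omitted; the kernel applies le32_to_cpu.) */
static int decode_u32_safe(const uint8_t **p, const uint8_t *end,
                           uint32_t *v)
{
        if (end - *p < (ptrdiff_t)sizeof(*v))
                return -1;      /* truncated message */
        memcpy(v, *p, sizeof(*v));
        *p += sizeof(*v);
        return 0;
}
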
2904 /* 2904 /*
2905 * Delayed work handler to process end of delayed cap release LRU list. 2905 * Delayed work handler to process end of delayed cap release LRU list.
2906 */ 2906 */
2907 void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) 2907 void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
2908 { 2908 {
2909 struct ceph_inode_info *ci; 2909 struct ceph_inode_info *ci;
2910 int flags = CHECK_CAPS_NODELAY; 2910 int flags = CHECK_CAPS_NODELAY;
2911 2911
2912 dout("check_delayed_caps\n"); 2912 dout("check_delayed_caps\n");
2913 while (1) { 2913 while (1) {
2914 spin_lock(&mdsc->cap_delay_lock); 2914 spin_lock(&mdsc->cap_delay_lock);
2915 if (list_empty(&mdsc->cap_delay_list)) 2915 if (list_empty(&mdsc->cap_delay_list))
2916 break; 2916 break;
2917 ci = list_first_entry(&mdsc->cap_delay_list, 2917 ci = list_first_entry(&mdsc->cap_delay_list,
2918 struct ceph_inode_info, 2918 struct ceph_inode_info,
2919 i_cap_delay_list); 2919 i_cap_delay_list);
2920 if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 && 2920 if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
2921 time_before(jiffies, ci->i_hold_caps_max)) 2921 time_before(jiffies, ci->i_hold_caps_max))
2922 break; 2922 break;
2923 list_del_init(&ci->i_cap_delay_list); 2923 list_del_init(&ci->i_cap_delay_list);
2924 spin_unlock(&mdsc->cap_delay_lock); 2924 spin_unlock(&mdsc->cap_delay_lock);
2925 dout("check_delayed_caps on %p\n", &ci->vfs_inode); 2925 dout("check_delayed_caps on %p\n", &ci->vfs_inode);
2926 ceph_check_caps(ci, flags, NULL); 2926 ceph_check_caps(ci, flags, NULL);
2927 } 2927 }
2928 spin_unlock(&mdsc->cap_delay_lock); 2928 spin_unlock(&mdsc->cap_delay_lock);
2929 } 2929 }
2930 2930
2931 /* 2931 /*
2932 * Flush all dirty caps to the mds 2932 * Flush all dirty caps to the mds
2933 */ 2933 */
2934 void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) 2934 void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
2935 { 2935 {
2936 struct ceph_inode_info *ci; 2936 struct ceph_inode_info *ci;
2937 struct inode *inode; 2937 struct inode *inode;
2938 2938
2939 dout("flush_dirty_caps\n"); 2939 dout("flush_dirty_caps\n");
2940 spin_lock(&mdsc->cap_dirty_lock); 2940 spin_lock(&mdsc->cap_dirty_lock);
2941 while (!list_empty(&mdsc->cap_dirty)) { 2941 while (!list_empty(&mdsc->cap_dirty)) {
2942 ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, 2942 ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info,
2943 i_dirty_item); 2943 i_dirty_item);
2944 inode = &ci->vfs_inode; 2944 inode = &ci->vfs_inode;
2945 ihold(inode); 2945 ihold(inode);
2946 dout("flush_dirty_caps %p\n", inode); 2946 dout("flush_dirty_caps %p\n", inode);
2947 spin_unlock(&mdsc->cap_dirty_lock); 2947 spin_unlock(&mdsc->cap_dirty_lock);
2948 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL); 2948 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL);
2949 iput(inode); 2949 iput(inode);
2950 spin_lock(&mdsc->cap_dirty_lock); 2950 spin_lock(&mdsc->cap_dirty_lock);
2951 } 2951 }
2952 spin_unlock(&mdsc->cap_dirty_lock); 2952 spin_unlock(&mdsc->cap_dirty_lock);
2953 dout("flush_dirty_caps done\n"); 2953 dout("flush_dirty_caps done\n");
2954 } 2954 }
2955 2955
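Both loops above follow the same drain discipline: take the spinlock, detach or pin the first entry, drop the lock, run ceph_check_caps() unlocked, then retake the lock before looking at the list again. A hypothetical userspace sketch of the pattern:

#include <pthread.h>
#include <stdio.h>

struct item {
        struct item *next;
        int id;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *head;

static void expensive_work(struct item *it)     /* stands in for ceph_check_caps() */
{
        printf("processing %d\n", it->id);
}

static void drain(void)
{
        pthread_mutex_lock(&list_lock);
        while (head) {
                struct item *it = head;
                head = it->next;                        /* unlink under the lock */
                pthread_mutex_unlock(&list_lock);       /* never do the work locked */
                expensive_work(it);
                pthread_mutex_lock(&list_lock);         /* retake before re-checking */
        }
        pthread_mutex_unlock(&list_lock);
}

int main(void)
{
        struct item a = { NULL, 1 }, b = { &a, 2 };

        head = &b;
        drain();        /* prints 2 then 1 */
        return 0;
}
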
2956 /* 2956 /*
2957 * Drop open file reference. If we were the last open file, 2957 * Drop open file reference. If we were the last open file,
2958 * we may need to release capabilities to the MDS (or schedule 2958 * we may need to release capabilities to the MDS (or schedule
2959 * their delayed release). 2959 * their delayed release).
2960 */ 2960 */
2961 void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) 2961 void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
2962 { 2962 {
2963 struct inode *inode = &ci->vfs_inode; 2963 struct inode *inode = &ci->vfs_inode;
2964 int last = 0; 2964 int last = 0;
2965 2965
2966 spin_lock(&ci->i_ceph_lock); 2966 spin_lock(&ci->i_ceph_lock);
2967 dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode, 2967 dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
2968 ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1); 2968 ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
2969 BUG_ON(ci->i_nr_by_mode[fmode] == 0); 2969 BUG_ON(ci->i_nr_by_mode[fmode] == 0);
2970 if (--ci->i_nr_by_mode[fmode] == 0) 2970 if (--ci->i_nr_by_mode[fmode] == 0)
2971 last++; 2971 last++;
2972 spin_unlock(&ci->i_ceph_lock); 2972 spin_unlock(&ci->i_ceph_lock);
2973 2973
2974 if (last && ci->i_vino.snap == CEPH_NOSNAP) 2974 if (last && ci->i_vino.snap == CEPH_NOSNAP)
2975 ceph_check_caps(ci, 0, NULL); 2975 ceph_check_caps(ci, 0, NULL);
2976 } 2976 }
2977 2977
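ceph_put_fmode() shows the companion idiom: the zero-crossing is detected while the lock is held, but the heavyweight follow-up runs after the unlock. A small sketch (names are illustrative):

#include <pthread.h>

static pthread_mutex_t ref_lock = PTHREAD_MUTEX_INITIALIZER;
static int nr_open;

static void last_close_work(void)       /* stands in for ceph_check_caps() */
{
}

static void put_open_ref(void)
{
        int last = 0;

        pthread_mutex_lock(&ref_lock);
        if (--nr_open == 0)
                last = 1;               /* remember: we were the last */
        pthread_mutex_unlock(&ref_lock);
        if (last)
                last_close_work();      /* heavy work, lock not held */
}
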
2978 /* 2978 /*
2979 * Helpers for embedding cap and dentry lease releases into mds 2979 * Helpers for embedding cap and dentry lease releases into mds
2980 * requests. 2980 * requests.
2981 * 2981 *
2982 * @force is used by dentry_release (below) to force inclusion of a 2982 * @force is used by dentry_release (below) to force inclusion of a
2983 * record for the directory inode, even when there aren't any caps to 2983 * record for the directory inode, even when there aren't any caps to
2984 * drop. 2984 * drop.
2985 */ 2985 */
2986 int ceph_encode_inode_release(void **p, struct inode *inode, 2986 int ceph_encode_inode_release(void **p, struct inode *inode,
2987 int mds, int drop, int unless, int force) 2987 int mds, int drop, int unless, int force)
2988 { 2988 {
2989 struct ceph_inode_info *ci = ceph_inode(inode); 2989 struct ceph_inode_info *ci = ceph_inode(inode);
2990 struct ceph_cap *cap; 2990 struct ceph_cap *cap;
2991 struct ceph_mds_request_release *rel = *p; 2991 struct ceph_mds_request_release *rel = *p;
2992 int used, dirty; 2992 int used, dirty;
2993 int ret = 0; 2993 int ret = 0;
2994 2994
2995 spin_lock(&ci->i_ceph_lock); 2995 spin_lock(&ci->i_ceph_lock);
2996 used = __ceph_caps_used(ci); 2996 used = __ceph_caps_used(ci);
2997 dirty = __ceph_caps_dirty(ci); 2997 dirty = __ceph_caps_dirty(ci);
2998 2998
2999 dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n", 2999 dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n",
3000 inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop), 3000 inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop),
3001 ceph_cap_string(unless)); 3001 ceph_cap_string(unless));
3002 3002
3003 /* only drop unused, clean caps */ 3003 /* only drop unused, clean caps */
3004 drop &= ~(used | dirty); 3004 drop &= ~(used | dirty);
3005 3005
3006 cap = __get_cap_for_mds(ci, mds); 3006 cap = __get_cap_for_mds(ci, mds);
3007 if (cap && __cap_is_valid(cap)) { 3007 if (cap && __cap_is_valid(cap)) {
3008 if (force || 3008 if (force ||
3009 ((cap->issued & drop) && 3009 ((cap->issued & drop) &&
3010 (cap->issued & unless) == 0)) { 3010 (cap->issued & unless) == 0)) {
3011 if ((cap->issued & drop) && 3011 if ((cap->issued & drop) &&
3012 (cap->issued & unless) == 0) { 3012 (cap->issued & unless) == 0) {
3013 dout("encode_inode_release %p cap %p %s -> " 3013 dout("encode_inode_release %p cap %p %s -> "
3014 "%s\n", inode, cap, 3014 "%s\n", inode, cap,
3015 ceph_cap_string(cap->issued), 3015 ceph_cap_string(cap->issued),
3016 ceph_cap_string(cap->issued & ~drop)); 3016 ceph_cap_string(cap->issued & ~drop));
3017 cap->issued &= ~drop; 3017 cap->issued &= ~drop;
3018 cap->implemented &= ~drop; 3018 cap->implemented &= ~drop;
3019 if (ci->i_ceph_flags & CEPH_I_NODELAY) { 3019 if (ci->i_ceph_flags & CEPH_I_NODELAY) {
3020 int wanted = __ceph_caps_wanted(ci); 3020 int wanted = __ceph_caps_wanted(ci);
3021 dout(" wanted %s -> %s (act %s)\n", 3021 dout(" wanted %s -> %s (act %s)\n",
3022 ceph_cap_string(cap->mds_wanted), 3022 ceph_cap_string(cap->mds_wanted),
3023 ceph_cap_string(cap->mds_wanted & 3023 ceph_cap_string(cap->mds_wanted &
3024 ~wanted), 3024 ~wanted),
3025 ceph_cap_string(wanted)); 3025 ceph_cap_string(wanted));
3026 cap->mds_wanted &= wanted; 3026 cap->mds_wanted &= wanted;
3027 } 3027 }
3028 } else { 3028 } else {
3029 dout("encode_inode_release %p cap %p %s" 3029 dout("encode_inode_release %p cap %p %s"
3030 " (force)\n", inode, cap, 3030 " (force)\n", inode, cap,
3031 ceph_cap_string(cap->issued)); 3031 ceph_cap_string(cap->issued));
3032 } 3032 }
3033 3033
3034 rel->ino = cpu_to_le64(ceph_ino(inode)); 3034 rel->ino = cpu_to_le64(ceph_ino(inode));
3035 rel->cap_id = cpu_to_le64(cap->cap_id); 3035 rel->cap_id = cpu_to_le64(cap->cap_id);
3036 rel->seq = cpu_to_le32(cap->seq); 3036 rel->seq = cpu_to_le32(cap->seq);
3037 rel->issue_seq = cpu_to_le32(cap->issue_seq); 3037 rel->issue_seq = cpu_to_le32(cap->issue_seq);
3038 rel->mseq = cpu_to_le32(cap->mseq); 3038 rel->mseq = cpu_to_le32(cap->mseq);
3039 rel->caps = cpu_to_le32(cap->issued); 3039 rel->caps = cpu_to_le32(cap->issued);
3040 rel->wanted = cpu_to_le32(cap->mds_wanted); 3040 rel->wanted = cpu_to_le32(cap->mds_wanted);
3041 rel->dname_len = 0; 3041 rel->dname_len = 0;
3042 rel->dname_seq = 0; 3042 rel->dname_seq = 0;
3043 *p += sizeof(*rel); 3043 *p += sizeof(*rel);
3044 ret = 1; 3044 ret = 1;
3045 } else { 3045 } else {
3046 dout("encode_inode_release %p cap %p %s\n", 3046 dout("encode_inode_release %p cap %p %s\n",
3047 inode, cap, ceph_cap_string(cap->issued)); 3047 inode, cap, ceph_cap_string(cap->issued));
3048 } 3048 }
3049 } 3049 }
3050 spin_unlock(&ci->i_ceph_lock); 3050 spin_unlock(&ci->i_ceph_lock);
3051 return ret; 3051 return ret;
3052 } 3052 }
3053 3053
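The line drop &= ~(used | dirty) is what makes the release safe: a cap bit makes it into the release record only if no open file is using it and no dirty data depends on it. A toy illustration with made-up cap bit values:

#include <stdio.h>

/* Hypothetical cap bits, for illustration only. */
#define CAP_RD  0x1
#define CAP_WR  0x2
#define CAP_BUF 0x4

int main(void)
{
        int issued = CAP_RD | CAP_WR | CAP_BUF;
        int used = CAP_RD;              /* an open reader */
        int dirty = CAP_BUF;            /* unflushed buffered data */
        int drop = issued;              /* request: drop everything */

        drop &= ~(used | dirty);        /* only unused, clean caps */
        printf("issued %#x, dropped %#x, kept %#x\n",
               issued, drop, issued & ~drop);   /* keeps RD and BUF */
        return 0;
}
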
3054 int ceph_encode_dentry_release(void **p, struct dentry *dentry, 3054 int ceph_encode_dentry_release(void **p, struct dentry *dentry,
3055 int mds, int drop, int unless) 3055 int mds, int drop, int unless)
3056 { 3056 {
3057 struct inode *dir = dentry->d_parent->d_inode; 3057 struct inode *dir = dentry->d_parent->d_inode;
3058 struct ceph_mds_request_release *rel = *p; 3058 struct ceph_mds_request_release *rel = *p;
3059 struct ceph_dentry_info *di = ceph_dentry(dentry); 3059 struct ceph_dentry_info *di = ceph_dentry(dentry);
3060 int force = 0; 3060 int force = 0;
3061 int ret; 3061 int ret;
3062 3062
3063 /* 3063 /*
3064 * force a record for the directory caps if we have a dentry lease. 3064 * force a record for the directory caps if we have a dentry lease.
3065 * this is racy (can't take i_ceph_lock and d_lock together), but it 3065 * this is racy (can't take i_ceph_lock and d_lock together), but it
3066 * doesn't have to be perfect; the mds will revoke anything we don't 3066 * doesn't have to be perfect; the mds will revoke anything we don't
3067 * release. 3067 * release.
3068 */ 3068 */
3069 spin_lock(&dentry->d_lock); 3069 spin_lock(&dentry->d_lock);
3070 if (di->lease_session && di->lease_session->s_mds == mds) 3070 if (di->lease_session && di->lease_session->s_mds == mds)
3071 force = 1; 3071 force = 1;
3072 spin_unlock(&dentry->d_lock); 3072 spin_unlock(&dentry->d_lock);
3073 3073
3074 ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); 3074 ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
3075 3075
3076 spin_lock(&dentry->d_lock); 3076 spin_lock(&dentry->d_lock);
3077 if (ret && di->lease_session && di->lease_session->s_mds == mds) { 3077 if (ret && di->lease_session && di->lease_session->s_mds == mds) {
3078 dout("encode_dentry_release %p mds%d seq %d\n", 3078 dout("encode_dentry_release %p mds%d seq %d\n",
3079 dentry, mds, (int)di->lease_seq); 3079 dentry, mds, (int)di->lease_seq);
3080 rel->dname_len = cpu_to_le32(dentry->d_name.len); 3080 rel->dname_len = cpu_to_le32(dentry->d_name.len);
3081 memcpy(*p, dentry->d_name.name, dentry->d_name.len); 3081 memcpy(*p, dentry->d_name.name, dentry->d_name.len);
3082 *p += dentry->d_name.len; 3082 *p += dentry->d_name.len;
3083 rel->dname_seq = cpu_to_le32(di->lease_seq); 3083 rel->dname_seq = cpu_to_le32(di->lease_seq);
3084 __ceph_mdsc_drop_dentry_lease(dentry); 3084 __ceph_mdsc_drop_dentry_lease(dentry);
3085 } 3085 }
3086 spin_unlock(&dentry->d_lock); 3086 spin_unlock(&dentry->d_lock);
3087 return ret; 3087 return ret;
3088 } 3088 }
3089 3089
fs/ceph/dir.c

1 #include <linux/ceph/ceph_debug.h> 1 #include <linux/ceph/ceph_debug.h>
2 2
3 #include <linux/spinlock.h> 3 #include <linux/spinlock.h>
4 #include <linux/fs_struct.h> 4 #include <linux/fs_struct.h>
5 #include <linux/namei.h> 5 #include <linux/namei.h>
6 #include <linux/slab.h> 6 #include <linux/slab.h>
7 #include <linux/sched.h> 7 #include <linux/sched.h>
8 8
9 #include "super.h" 9 #include "super.h"
10 #include "mds_client.h" 10 #include "mds_client.h"
11 11
12 /* 12 /*
13 * Directory operations: readdir, lookup, create, link, unlink, 13 * Directory operations: readdir, lookup, create, link, unlink,
14 * rename, etc. 14 * rename, etc.
15 */ 15 */
16 16
17 /* 17 /*
18 * Ceph MDS operations are specified in terms of a base ino and 18 * Ceph MDS operations are specified in terms of a base ino and
19 * relative path. Thus, the client can specify an operation on a 19 * relative path. Thus, the client can specify an operation on a
20 * specific inode (e.g., a getattr due to fstat(2)), or as a path 20 * specific inode (e.g., a getattr due to fstat(2)), or as a path
21 * relative to, say, the root directory. 21 * relative to, say, the root directory.
22 * 22 *
23 * Normally, we limit ourselves to strict inode ops (no path component) 23 * Normally, we limit ourselves to strict inode ops (no path component)
24 * or dentry operations (a single path component relative to an ino). The 24 * or dentry operations (a single path component relative to an ino). The
25 * exception to this is open_root_dentry(), which will open the mount 25 * exception to this is open_root_dentry(), which will open the mount
26 * point by name. 26 * point by name.
27 */ 27 */
28 28
29 const struct inode_operations ceph_dir_iops; 29 const struct inode_operations ceph_dir_iops;
30 const struct file_operations ceph_dir_fops; 30 const struct file_operations ceph_dir_fops;
31 const struct dentry_operations ceph_dentry_ops; 31 const struct dentry_operations ceph_dentry_ops;
32 32
33 /* 33 /*
34 * Initialize ceph dentry state. 34 * Initialize ceph dentry state.
35 */ 35 */
36 int ceph_init_dentry(struct dentry *dentry) 36 int ceph_init_dentry(struct dentry *dentry)
37 { 37 {
38 struct ceph_dentry_info *di; 38 struct ceph_dentry_info *di;
39 39
40 if (dentry->d_fsdata) 40 if (dentry->d_fsdata)
41 return 0; 41 return 0;
42 42
43 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); 43 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
44 if (!di) 44 if (!di)
45 return -ENOMEM; /* oh well */ 45 return -ENOMEM; /* oh well */
46 46
47 spin_lock(&dentry->d_lock); 47 spin_lock(&dentry->d_lock);
48 if (dentry->d_fsdata) { 48 if (dentry->d_fsdata) {
49 /* lost a race */ 49 /* lost a race */
50 kmem_cache_free(ceph_dentry_cachep, di); 50 kmem_cache_free(ceph_dentry_cachep, di);
51 goto out_unlock; 51 goto out_unlock;
52 } 52 }
53 53
54 if (dentry->d_parent == NULL || /* nfs fh_to_dentry */ 54 if (dentry->d_parent == NULL || /* nfs fh_to_dentry */
55 ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) 55 ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
56 d_set_d_op(dentry, &ceph_dentry_ops); 56 d_set_d_op(dentry, &ceph_dentry_ops);
57 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) 57 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
58 d_set_d_op(dentry, &ceph_snapdir_dentry_ops); 58 d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
59 else 59 else
60 d_set_d_op(dentry, &ceph_snap_dentry_ops); 60 d_set_d_op(dentry, &ceph_snap_dentry_ops);
61 61
62 di->dentry = dentry; 62 di->dentry = dentry;
63 di->lease_session = NULL; 63 di->lease_session = NULL;
64 dentry->d_time = jiffies; 64 dentry->d_time = jiffies;
65 /* avoid reordering d_fsdata setup so that the check above is safe */ 65 /* avoid reordering d_fsdata setup so that the check above is safe */
66 smp_mb(); 66 smp_mb();
67 dentry->d_fsdata = di; 67 dentry->d_fsdata = di;
68 ceph_dentry_lru_add(dentry); 68 ceph_dentry_lru_add(dentry);
69 out_unlock: 69 out_unlock:
70 spin_unlock(&dentry->d_lock); 70 spin_unlock(&dentry->d_lock);
71 return 0; 71 return 0;
72 } 72 }
73 73
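ceph_init_dentry() allocates optimistically outside d_lock, re-checks d_fsdata under the lock, and frees its allocation if another thread won the race; the smp_mb() then orders the field stores before d_fsdata is published to lockless readers. A simplified userspace sketch of the lost-race half (the unlocked fast-path read mirrors the lockless d_fsdata check; production code would use an atomic load there, which is the concern the barrier addresses):

#include <pthread.h>
#include <stdlib.h>

struct priv { int initialized; };

static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
static struct priv *shared;

static int init_shared(void)
{
        struct priv *p;

        if (shared)
                return 0;               /* fast path: already set up */
        p = calloc(1, sizeof(*p));      /* allocate outside the lock */
        if (!p)
                return -1;
        pthread_mutex_lock(&init_lock);
        if (shared)
                free(p);                /* lost a race: discard ours */
        else
                shared = p;             /* publish */
        pthread_mutex_unlock(&init_lock);
        return 0;
}
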
74 struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) 74 struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry)
75 { 75 {
76 struct inode *inode = NULL; 76 struct inode *inode = NULL;
77 77
78 if (!dentry) 78 if (!dentry)
79 return NULL; 79 return NULL;
80 80
81 spin_lock(&dentry->d_lock); 81 spin_lock(&dentry->d_lock);
82 if (dentry->d_parent) { 82 if (dentry->d_parent) {
83 inode = dentry->d_parent->d_inode; 83 inode = dentry->d_parent->d_inode;
84 ihold(inode); 84 ihold(inode);
85 } 85 }
86 spin_unlock(&dentry->d_lock); 86 spin_unlock(&dentry->d_lock);
87 return inode; 87 return inode;
88 } 88 }
89 89
90 90
91 /* 91 /*
92 * for readdir, we encode the directory frag and offset within that 92 * for readdir, we encode the directory frag and offset within that
93 * frag into f_pos. 93 * frag into f_pos.
94 */ 94 */
95 static unsigned fpos_frag(loff_t p) 95 static unsigned fpos_frag(loff_t p)
96 { 96 {
97 return p >> 32; 97 return p >> 32;
98 } 98 }
99 static unsigned fpos_off(loff_t p) 99 static unsigned fpos_off(loff_t p)
100 { 100 {
101 return p & 0xffffffff; 101 return p & 0xffffffff;
102 } 102 }
103 103
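So a directory f_pos is simply (frag << 32) | off. The make_fpos() below is a stand-in for ceph_make_fpos(), which is used later in this file and builds the encoding these two helpers take apart; a quick round-trip check:

#include <assert.h>
#include <stdint.h>

static uint64_t make_fpos(unsigned frag, unsigned off)
{
        return ((uint64_t)frag << 32) | off;
}

int main(void)
{
        uint64_t pos = make_fpos(0xdeadbeef, 42);

        assert((unsigned)(pos >> 32) == 0xdeadbeef);    /* fpos_frag() */
        assert((unsigned)(pos & 0xffffffff) == 42);     /* fpos_off()  */
        return 0;
}
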
104 /* 104 /*
105 * When possible, we try to satisfy a readdir by peeking at the 105 * When possible, we try to satisfy a readdir by peeking at the
106 * dcache. We make this work by carefully ordering dentries on 106 * dcache. We make this work by carefully ordering dentries on
107 * d_u.d_child when we initially get results back from the MDS, and 107 * d_u.d_child when we initially get results back from the MDS, and
108 * falling back to a "normal" sync readdir if any dentries in the dir 108 * falling back to a "normal" sync readdir if any dentries in the dir
109 * are dropped. 109 * are dropped.
110 * 110 *
111 * D_COMPLETE indicates we have all dentries in the dir. It is 111 * D_COMPLETE indicates we have all dentries in the dir. It is
112 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by 112 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
113 * the MDS if/when the directory is modified). 113 * the MDS if/when the directory is modified).
114 */ 114 */
115 static int __dcache_readdir(struct file *filp, 115 static int __dcache_readdir(struct file *filp,
116 void *dirent, filldir_t filldir) 116 void *dirent, filldir_t filldir)
117 { 117 {
118 struct ceph_file_info *fi = filp->private_data; 118 struct ceph_file_info *fi = filp->private_data;
119 struct dentry *parent = filp->f_dentry; 119 struct dentry *parent = filp->f_dentry;
120 struct inode *dir = parent->d_inode; 120 struct inode *dir = parent->d_inode;
121 struct list_head *p; 121 struct list_head *p;
122 struct dentry *dentry, *last; 122 struct dentry *dentry, *last;
123 struct ceph_dentry_info *di; 123 struct ceph_dentry_info *di;
124 int err = 0; 124 int err = 0;
125 125
126 /* claim ref on last dentry we returned */ 126 /* claim ref on last dentry we returned */
127 last = fi->dentry; 127 last = fi->dentry;
128 fi->dentry = NULL; 128 fi->dentry = NULL;
129 129
130 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos, 130 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
131 last); 131 last);
132 132
133 spin_lock(&parent->d_lock); 133 spin_lock(&parent->d_lock);
134 134
135 /* start at beginning? */ 135 /* start at beginning? */
136 if (filp->f_pos == 2 || last == NULL || 136 if (filp->f_pos == 2 || last == NULL ||
137 filp->f_pos < ceph_dentry(last)->offset) { 137 filp->f_pos < ceph_dentry(last)->offset) {
138 if (list_empty(&parent->d_subdirs)) 138 if (list_empty(&parent->d_subdirs))
139 goto out_unlock; 139 goto out_unlock;
140 p = parent->d_subdirs.prev; 140 p = parent->d_subdirs.prev;
141 dout(" initial p %p/%p\n", p->prev, p->next); 141 dout(" initial p %p/%p\n", p->prev, p->next);
142 } else { 142 } else {
143 p = last->d_u.d_child.prev; 143 p = last->d_u.d_child.prev;
144 } 144 }
145 145
146 more: 146 more:
147 dentry = list_entry(p, struct dentry, d_u.d_child); 147 dentry = list_entry(p, struct dentry, d_u.d_child);
148 di = ceph_dentry(dentry); 148 di = ceph_dentry(dentry);
149 while (1) { 149 while (1) {
150 dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next, 150 dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
151 d_unhashed(dentry) ? "!hashed" : "hashed", 151 d_unhashed(dentry) ? "!hashed" : "hashed",
152 parent->d_subdirs.prev, parent->d_subdirs.next); 152 parent->d_subdirs.prev, parent->d_subdirs.next);
153 if (p == &parent->d_subdirs) { 153 if (p == &parent->d_subdirs) {
154 fi->flags |= CEPH_F_ATEND; 154 fi->flags |= CEPH_F_ATEND;
155 goto out_unlock; 155 goto out_unlock;
156 } 156 }
157 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); 157 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
158 if (!d_unhashed(dentry) && dentry->d_inode && 158 if (!d_unhashed(dentry) && dentry->d_inode &&
159 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && 159 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
160 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && 160 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
161 filp->f_pos <= di->offset) 161 filp->f_pos <= di->offset)
162 break; 162 break;
163 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry, 163 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
164 dentry->d_name.len, dentry->d_name.name, di->offset, 164 dentry->d_name.len, dentry->d_name.name, di->offset,
165 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "", 165 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
166 !dentry->d_inode ? " null" : ""); 166 !dentry->d_inode ? " null" : "");
167 spin_unlock(&dentry->d_lock); 167 spin_unlock(&dentry->d_lock);
168 p = p->prev; 168 p = p->prev;
169 dentry = list_entry(p, struct dentry, d_u.d_child); 169 dentry = list_entry(p, struct dentry, d_u.d_child);
170 di = ceph_dentry(dentry); 170 di = ceph_dentry(dentry);
171 } 171 }
172 172
173 dget_dlock(dentry); 173 dget_dlock(dentry);
174 spin_unlock(&dentry->d_lock); 174 spin_unlock(&dentry->d_lock);
175 spin_unlock(&parent->d_lock); 175 spin_unlock(&parent->d_lock);
176 176
177 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, 177 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
178 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 178 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
179 filp->f_pos = di->offset; 179 filp->f_pos = di->offset;
180 err = filldir(dirent, dentry->d_name.name, 180 err = filldir(dirent, dentry->d_name.name,
181 dentry->d_name.len, di->offset, 181 dentry->d_name.len, di->offset,
182 ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), 182 ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
183 dentry->d_inode->i_mode >> 12); 183 dentry->d_inode->i_mode >> 12);
184 184
185 if (last) { 185 if (last) {
186 if (err < 0) { 186 if (err < 0) {
187 /* remember our position */ 187 /* remember our position */
188 fi->dentry = last; 188 fi->dentry = last;
189 fi->next_offset = di->offset; 189 fi->next_offset = di->offset;
190 } else { 190 } else {
191 dput(last); 191 dput(last);
192 } 192 }
193 } 193 }
194 last = dentry; 194 last = dentry;
195 195
196 if (err < 0) 196 if (err < 0)
197 goto out; 197 goto out;
198 198
199 filp->f_pos++; 199 filp->f_pos++;
200 200
201 /* make sure a dentry wasn't dropped while we didn't have parent lock */ 201 /* make sure a dentry wasn't dropped while we didn't have parent lock */
202 if (!ceph_dir_test_complete(dir)) { 202 if (!ceph_dir_test_complete(dir)) {
203 dout(" lost D_COMPLETE on %p; falling back to mds\n", dir); 203 dout(" lost D_COMPLETE on %p; falling back to mds\n", dir);
204 err = -EAGAIN; 204 err = -EAGAIN;
205 goto out; 205 goto out;
206 } 206 }
207 207
208 spin_lock(&parent->d_lock); 208 spin_lock(&parent->d_lock);
209 p = p->prev; /* advance to next dentry */ 209 p = p->prev; /* advance to next dentry */
210 goto more; 210 goto more;
211 211
212 out_unlock: 212 out_unlock:
213 spin_unlock(&parent->d_lock); 213 spin_unlock(&parent->d_lock);
214 out: 214 out:
215 if (last) 215 if (last)
216 dput(last); 216 dput(last);
217 return err; 217 return err;
218 } 218 }
219 219
220 /* 220 /*
221 * make note of the last dentry we read, so we can 221 * make note of the last dentry we read, so we can
222 * continue at the same lexicographical point, 222 * continue at the same lexicographical point,
223 * regardless of what dir changes take place on the 223 * regardless of what dir changes take place on the
224 * server. 224 * server.
225 */ 225 */
226 static int note_last_dentry(struct ceph_file_info *fi, const char *name, 226 static int note_last_dentry(struct ceph_file_info *fi, const char *name,
227 int len) 227 int len)
228 { 228 {
229 kfree(fi->last_name); 229 kfree(fi->last_name);
230 fi->last_name = kmalloc(len+1, GFP_NOFS); 230 fi->last_name = kmalloc(len+1, GFP_NOFS);
231 if (!fi->last_name) 231 if (!fi->last_name)
232 return -ENOMEM; 232 return -ENOMEM;
233 memcpy(fi->last_name, name, len); 233 memcpy(fi->last_name, name, len);
234 fi->last_name[len] = 0; 234 fi->last_name[len] = 0;
235 dout("note_last_dentry '%s'\n", fi->last_name); 235 dout("note_last_dentry '%s'\n", fi->last_name);
236 return 0; 236 return 0;
237 } 237 }
238 238
239 static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) 239 static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
240 { 240 {
241 struct ceph_file_info *fi = filp->private_data; 241 struct ceph_file_info *fi = filp->private_data;
242 struct inode *inode = filp->f_dentry->d_inode; 242 struct inode *inode = filp->f_dentry->d_inode;
243 struct ceph_inode_info *ci = ceph_inode(inode); 243 struct ceph_inode_info *ci = ceph_inode(inode);
244 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 244 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
245 struct ceph_mds_client *mdsc = fsc->mdsc; 245 struct ceph_mds_client *mdsc = fsc->mdsc;
246 unsigned frag = fpos_frag(filp->f_pos); 246 unsigned frag = fpos_frag(filp->f_pos);
247 int off = fpos_off(filp->f_pos); 247 int off = fpos_off(filp->f_pos);
248 int err; 248 int err;
249 u32 ftype; 249 u32 ftype;
250 struct ceph_mds_reply_info_parsed *rinfo; 250 struct ceph_mds_reply_info_parsed *rinfo;
251 const int max_entries = fsc->mount_options->max_readdir; 251 const int max_entries = fsc->mount_options->max_readdir;
252 const int max_bytes = fsc->mount_options->max_readdir_bytes; 252 const int max_bytes = fsc->mount_options->max_readdir_bytes;
253 253
254 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); 254 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
255 if (fi->flags & CEPH_F_ATEND) 255 if (fi->flags & CEPH_F_ATEND)
256 return 0; 256 return 0;
257 257
258 /* always start with . and .. */ 258 /* always start with . and .. */
259 if (filp->f_pos == 0) { 259 if (filp->f_pos == 0) {
260 /* note dir version at start of readdir so we can tell 260 /* note dir version at start of readdir so we can tell
261 * if any dentries get dropped */ 261 * if any dentries get dropped */
262 fi->dir_release_count = ci->i_release_count; 262 fi->dir_release_count = ci->i_release_count;
263 263
264 dout("readdir off 0 -> '.'\n"); 264 dout("readdir off 0 -> '.'\n");
265 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), 265 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
266 ceph_translate_ino(inode->i_sb, inode->i_ino), 266 ceph_translate_ino(inode->i_sb, inode->i_ino),
267 inode->i_mode >> 12) < 0) 267 inode->i_mode >> 12) < 0)
268 return 0; 268 return 0;
269 filp->f_pos = 1; 269 filp->f_pos = 1;
270 off = 1; 270 off = 1;
271 } 271 }
272 if (filp->f_pos == 1) { 272 if (filp->f_pos == 1) {
273 ino_t ino = parent_ino(filp->f_dentry); 273 ino_t ino = parent_ino(filp->f_dentry);
274 dout("readdir off 1 -> '..'\n"); 274 dout("readdir off 1 -> '..'\n");
275 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), 275 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
276 ceph_translate_ino(inode->i_sb, ino), 276 ceph_translate_ino(inode->i_sb, ino),
277 inode->i_mode >> 12) < 0) 277 inode->i_mode >> 12) < 0)
278 return 0; 278 return 0;
279 filp->f_pos = 2; 279 filp->f_pos = 2;
280 off = 2; 280 off = 2;
281 } 281 }
282 282
283 /* can we use the dcache? */ 283 /* can we use the dcache? */
284 spin_lock(&ci->i_ceph_lock); 284 spin_lock(&ci->i_ceph_lock);
285 if ((filp->f_pos == 2 || fi->dentry) && 285 if ((filp->f_pos == 2 || fi->dentry) &&
286 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && 286 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
287 ceph_snap(inode) != CEPH_SNAPDIR && 287 ceph_snap(inode) != CEPH_SNAPDIR &&
288 ceph_dir_test_complete(inode) && 288 ceph_dir_test_complete(inode) &&
289 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 289 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
290 spin_unlock(&ci->i_ceph_lock); 290 spin_unlock(&ci->i_ceph_lock);
291 err = __dcache_readdir(filp, dirent, filldir); 291 err = __dcache_readdir(filp, dirent, filldir);
292 if (err != -EAGAIN) 292 if (err != -EAGAIN)
293 return err; 293 return err;
294 } else { 294 } else {
295 spin_unlock(&ci->i_ceph_lock); 295 spin_unlock(&ci->i_ceph_lock);
296 } 296 }
297 if (fi->dentry) { 297 if (fi->dentry) {
298 err = note_last_dentry(fi, fi->dentry->d_name.name, 298 err = note_last_dentry(fi, fi->dentry->d_name.name,
299 fi->dentry->d_name.len); 299 fi->dentry->d_name.len);
300 if (err) 300 if (err)
301 return err; 301 return err;
302 dput(fi->dentry); 302 dput(fi->dentry);
303 fi->dentry = NULL; 303 fi->dentry = NULL;
304 } 304 }
305 305
306 /* proceed with a normal readdir */ 306 /* proceed with a normal readdir */
307 307
308 more: 308 more:
309 /* do we have the correct frag content buffered? */ 309 /* do we have the correct frag content buffered? */
310 if (fi->frag != frag || fi->last_readdir == NULL) { 310 if (fi->frag != frag || fi->last_readdir == NULL) {
311 struct ceph_mds_request *req; 311 struct ceph_mds_request *req;
312 int op = ceph_snap(inode) == CEPH_SNAPDIR ? 312 int op = ceph_snap(inode) == CEPH_SNAPDIR ?
313 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; 313 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
314 314
315 /* discard old result, if any */ 315 /* discard old result, if any */
316 if (fi->last_readdir) { 316 if (fi->last_readdir) {
317 ceph_mdsc_put_request(fi->last_readdir); 317 ceph_mdsc_put_request(fi->last_readdir);
318 fi->last_readdir = NULL; 318 fi->last_readdir = NULL;
319 } 319 }
320 320
321 /* requery frag tree, as the frag topology may have changed */ 321 /* requery frag tree, as the frag topology may have changed */
322 frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL); 322 frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
323 323
324 dout("readdir fetching %llx.%llx frag %x offset '%s'\n", 324 dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
325 ceph_vinop(inode), frag, fi->last_name); 325 ceph_vinop(inode), frag, fi->last_name);
326 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 326 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
327 if (IS_ERR(req)) 327 if (IS_ERR(req))
328 return PTR_ERR(req); 328 return PTR_ERR(req);
329 req->r_inode = inode; 329 req->r_inode = inode;
330 ihold(inode); 330 ihold(inode);
331 req->r_dentry = dget(filp->f_dentry); 331 req->r_dentry = dget(filp->f_dentry);
332 /* hints to request -> mds selection code */ 332 /* hints to request -> mds selection code */
333 req->r_direct_mode = USE_AUTH_MDS; 333 req->r_direct_mode = USE_AUTH_MDS;
334 req->r_direct_hash = ceph_frag_value(frag); 334 req->r_direct_hash = ceph_frag_value(frag);
335 req->r_direct_is_hash = true; 335 req->r_direct_is_hash = true;
336 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); 336 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
337 req->r_readdir_offset = fi->next_offset; 337 req->r_readdir_offset = fi->next_offset;
338 req->r_args.readdir.frag = cpu_to_le32(frag); 338 req->r_args.readdir.frag = cpu_to_le32(frag);
339 req->r_args.readdir.max_entries = cpu_to_le32(max_entries); 339 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
340 req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes); 340 req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
341 req->r_num_caps = max_entries + 1; 341 req->r_num_caps = max_entries + 1;
342 err = ceph_mdsc_do_request(mdsc, NULL, req); 342 err = ceph_mdsc_do_request(mdsc, NULL, req);
343 if (err < 0) { 343 if (err < 0) {
344 ceph_mdsc_put_request(req); 344 ceph_mdsc_put_request(req);
345 return err; 345 return err;
346 } 346 }
347 dout("readdir got and parsed readdir result=%d" 347 dout("readdir got and parsed readdir result=%d"
348 " on frag %x, end=%d, complete=%d\n", err, frag, 348 " on frag %x, end=%d, complete=%d\n", err, frag,
349 (int)req->r_reply_info.dir_end, 349 (int)req->r_reply_info.dir_end,
350 (int)req->r_reply_info.dir_complete); 350 (int)req->r_reply_info.dir_complete);
351 351
352 if (!req->r_did_prepopulate) { 352 if (!req->r_did_prepopulate) {
353 dout("readdir !did_prepopulate"); 353 dout("readdir !did_prepopulate");
354 fi->dir_release_count--; /* preclude D_COMPLETE */ 354 fi->dir_release_count--; /* preclude D_COMPLETE */
355 } 355 }
356 356
357 /* note next offset and last dentry name */ 357 /* note next offset and last dentry name */
358 fi->offset = fi->next_offset; 358 fi->offset = fi->next_offset;
359 fi->last_readdir = req; 359 fi->last_readdir = req;
360 360
361 if (req->r_reply_info.dir_end) { 361 if (req->r_reply_info.dir_end) {
362 kfree(fi->last_name); 362 kfree(fi->last_name);
363 fi->last_name = NULL; 363 fi->last_name = NULL;
364 if (ceph_frag_is_rightmost(frag)) 364 if (ceph_frag_is_rightmost(frag))
365 fi->next_offset = 2; 365 fi->next_offset = 2;
366 else 366 else
367 fi->next_offset = 0; 367 fi->next_offset = 0;
368 } else { 368 } else {
369 rinfo = &req->r_reply_info; 369 rinfo = &req->r_reply_info;
370 err = note_last_dentry(fi, 370 err = note_last_dentry(fi,
371 rinfo->dir_dname[rinfo->dir_nr-1], 371 rinfo->dir_dname[rinfo->dir_nr-1],
372 rinfo->dir_dname_len[rinfo->dir_nr-1]); 372 rinfo->dir_dname_len[rinfo->dir_nr-1]);
373 if (err) 373 if (err)
374 return err; 374 return err;
375 fi->next_offset += rinfo->dir_nr; 375 fi->next_offset += rinfo->dir_nr;
376 } 376 }
377 } 377 }
378 378
379 rinfo = &fi->last_readdir->r_reply_info; 379 rinfo = &fi->last_readdir->r_reply_info;
380 dout("readdir frag %x num %d off %d chunkoff %d\n", frag, 380 dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
381 rinfo->dir_nr, off, fi->offset); 381 rinfo->dir_nr, off, fi->offset);
382 while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { 382 while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
383 u64 pos = ceph_make_fpos(frag, off); 383 u64 pos = ceph_make_fpos(frag, off);
384 struct ceph_mds_reply_inode *in = 384 struct ceph_mds_reply_inode *in =
385 rinfo->dir_in[off - fi->offset].in; 385 rinfo->dir_in[off - fi->offset].in;
386 struct ceph_vino vino; 386 struct ceph_vino vino;
387 ino_t ino; 387 ino_t ino;
388 388
389 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", 389 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
390 off, off - fi->offset, rinfo->dir_nr, pos, 390 off, off - fi->offset, rinfo->dir_nr, pos,
391 rinfo->dir_dname_len[off - fi->offset], 391 rinfo->dir_dname_len[off - fi->offset],
392 rinfo->dir_dname[off - fi->offset], in); 392 rinfo->dir_dname[off - fi->offset], in);
393 BUG_ON(!in); 393 BUG_ON(!in);
394 ftype = le32_to_cpu(in->mode) >> 12; 394 ftype = le32_to_cpu(in->mode) >> 12;
395 vino.ino = le64_to_cpu(in->ino); 395 vino.ino = le64_to_cpu(in->ino);
396 vino.snap = le64_to_cpu(in->snapid); 396 vino.snap = le64_to_cpu(in->snapid);
397 ino = ceph_vino_to_ino(vino); 397 ino = ceph_vino_to_ino(vino);
398 if (filldir(dirent, 398 if (filldir(dirent,
399 rinfo->dir_dname[off - fi->offset], 399 rinfo->dir_dname[off - fi->offset],
400 rinfo->dir_dname_len[off - fi->offset], 400 rinfo->dir_dname_len[off - fi->offset],
401 pos, 401 pos,
402 ceph_translate_ino(inode->i_sb, ino), ftype) < 0) { 402 ceph_translate_ino(inode->i_sb, ino), ftype) < 0) {
403 dout("filldir stopping us...\n"); 403 dout("filldir stopping us...\n");
404 return 0; 404 return 0;
405 } 405 }
406 off++; 406 off++;
407 filp->f_pos = pos + 1; 407 filp->f_pos = pos + 1;
408 } 408 }
409 409
410 if (fi->last_name) { 410 if (fi->last_name) {
411 ceph_mdsc_put_request(fi->last_readdir); 411 ceph_mdsc_put_request(fi->last_readdir);
412 fi->last_readdir = NULL; 412 fi->last_readdir = NULL;
413 goto more; 413 goto more;
414 } 414 }
415 415
416 /* more frags? */ 416 /* more frags? */
417 if (!ceph_frag_is_rightmost(frag)) { 417 if (!ceph_frag_is_rightmost(frag)) {
418 frag = ceph_frag_next(frag); 418 frag = ceph_frag_next(frag);
419 off = 0; 419 off = 0;
420 filp->f_pos = ceph_make_fpos(frag, off); 420 filp->f_pos = ceph_make_fpos(frag, off);
421 dout("readdir next frag is %x\n", frag); 421 dout("readdir next frag is %x\n", frag);
422 goto more; 422 goto more;
423 } 423 }
424 fi->flags |= CEPH_F_ATEND; 424 fi->flags |= CEPH_F_ATEND;
425 425
426 /* 426 /*
427 * if dir_release_count still matches the dir, no dentries 427 * if dir_release_count still matches the dir, no dentries
428 * were released during the whole readdir, and we should have 428 * were released during the whole readdir, and we should have
429 * the complete dir contents in our cache. 429 * the complete dir contents in our cache.
430 */ 430 */
431 spin_lock(&ci->i_ceph_lock); 431 spin_lock(&ci->i_ceph_lock);
432 if (ci->i_release_count == fi->dir_release_count) { 432 if (ci->i_release_count == fi->dir_release_count) {
433 ceph_dir_set_complete(inode); 433 ceph_dir_set_complete(inode);
434 ci->i_max_offset = filp->f_pos; 434 ci->i_max_offset = filp->f_pos;
435 } 435 }
436 spin_unlock(&ci->i_ceph_lock); 436 spin_unlock(&ci->i_ceph_lock);
437 437
438 dout("readdir %p filp %p done.\n", inode, filp); 438 dout("readdir %p filp %p done.\n", inode, filp);
439 return 0; 439 return 0;
440 } 440 }
441 441
442 static void reset_readdir(struct ceph_file_info *fi) 442 static void reset_readdir(struct ceph_file_info *fi)
443 { 443 {
444 if (fi->last_readdir) { 444 if (fi->last_readdir) {
445 ceph_mdsc_put_request(fi->last_readdir); 445 ceph_mdsc_put_request(fi->last_readdir);
446 fi->last_readdir = NULL; 446 fi->last_readdir = NULL;
447 } 447 }
448 kfree(fi->last_name); 448 kfree(fi->last_name);
449 fi->last_name = NULL; 449 fi->last_name = NULL;
450 fi->next_offset = 2; /* compensate for . and .. */ 450 fi->next_offset = 2; /* compensate for . and .. */
451 if (fi->dentry) { 451 if (fi->dentry) {
452 dput(fi->dentry); 452 dput(fi->dentry);
453 fi->dentry = NULL; 453 fi->dentry = NULL;
454 } 454 }
455 fi->flags &= ~CEPH_F_ATEND; 455 fi->flags &= ~CEPH_F_ATEND;
456 } 456 }
457 457
458 static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) 458 static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
459 { 459 {
460 struct ceph_file_info *fi = file->private_data; 460 struct ceph_file_info *fi = file->private_data;
461 struct inode *inode = file->f_mapping->host; 461 struct inode *inode = file->f_mapping->host;
462 loff_t old_offset = offset; 462 loff_t old_offset = offset;
463 loff_t retval; 463 loff_t retval;
464 464
465 mutex_lock(&inode->i_mutex); 465 mutex_lock(&inode->i_mutex);
466 retval = -EINVAL; 466 retval = -EINVAL;
467 switch (origin) { 467 switch (origin) {
468 case SEEK_END: 468 case SEEK_END:
469 offset += inode->i_size + 2; /* FIXME */ 469 offset += inode->i_size + 2; /* FIXME */
470 break; 470 break;
471 case SEEK_CUR: 471 case SEEK_CUR:
472 offset += file->f_pos; 472 offset += file->f_pos;
473 case SEEK_SET: 473 case SEEK_SET:
474 break; 474 break;
475 default: 475 default:
476 goto out; 476 goto out;
477 } 477 }
478 478
479 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { 479 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
480 if (offset != file->f_pos) { 480 if (offset != file->f_pos) {
481 file->f_pos = offset; 481 file->f_pos = offset;
482 file->f_version = 0; 482 file->f_version = 0;
483 fi->flags &= ~CEPH_F_ATEND; 483 fi->flags &= ~CEPH_F_ATEND;
484 } 484 }
485 retval = offset; 485 retval = offset;
486 486
487 /* 487 /*
488 * discard buffered readdir content on seekdir(0), or 488 * discard buffered readdir content on seekdir(0), or
489 * seek to new frag, or seek prior to current chunk. 489 * seek to new frag, or seek prior to current chunk.
490 */ 490 */
491 if (offset == 0 || 491 if (offset == 0 ||
492 fpos_frag(offset) != fpos_frag(old_offset) || 492 fpos_frag(offset) != fpos_frag(old_offset) ||
493 fpos_off(offset) < fi->offset) { 493 fpos_off(offset) < fi->offset) {
494 dout("dir_llseek dropping %p content\n", file); 494 dout("dir_llseek dropping %p content\n", file);
495 reset_readdir(fi); 495 reset_readdir(fi);
496 } 496 }
497 497
498 /* invalidate dir_release_count if we did a forward seek */ 498 /* invalidate dir_release_count if we did a forward seek */
499 if (offset > old_offset) 499 if (offset > old_offset)
500 fi->dir_release_count--; 500 fi->dir_release_count--;
501 } 501 }
502 out: 502 out:
503 mutex_unlock(&inode->i_mutex); 503 mutex_unlock(&inode->i_mutex);
504 return retval; 504 return retval;
505 } 505 }
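
As a point of reference for the fpos_frag()/fpos_off() calls above (these helpers live in fs/ceph/super.h, not in this diff): a ceph directory f_pos packs the fragment id into the high 32 bits and the offset within that fragment into the low 32 bits, roughly:

	static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
	{
		return ((loff_t)frag << 32) | (loff_t)off; /* frag high, offset low */
	}

	static inline unsigned fpos_frag(loff_t p)
	{
		return p >> 32;			/* which directory fragment */
	}

	static inline unsigned fpos_off(loff_t p)
	{
		return p & 0xffffffff;		/* position within that fragment */
	}

This is why a seek that changes fpos_frag() must drop the buffered readdir state: the cached chunk belongs to a single fragment.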
506 506
507 /* 507 /*
508 * Handle lookups for the hidden .snap directory. 508 * Handle lookups for the hidden .snap directory.
509 */ 509 */
510 int ceph_handle_snapdir(struct ceph_mds_request *req, 510 int ceph_handle_snapdir(struct ceph_mds_request *req,
511 struct dentry *dentry, int err) 511 struct dentry *dentry, int err)
512 { 512 {
513 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 513 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
514 struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */ 514 struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */
515 515
516 /* .snap dir? */ 516 /* .snap dir? */
517 if (err == -ENOENT && 517 if (err == -ENOENT &&
518 ceph_snap(parent) == CEPH_NOSNAP && 518 ceph_snap(parent) == CEPH_NOSNAP &&
519 strcmp(dentry->d_name.name, 519 strcmp(dentry->d_name.name,
520 fsc->mount_options->snapdir_name) == 0) { 520 fsc->mount_options->snapdir_name) == 0) {
521 struct inode *inode = ceph_get_snapdir(parent); 521 struct inode *inode = ceph_get_snapdir(parent);
522 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", 522 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
523 dentry, dentry->d_name.len, dentry->d_name.name, inode); 523 dentry, dentry->d_name.len, dentry->d_name.name, inode);
524 BUG_ON(!d_unhashed(dentry)); 524 BUG_ON(!d_unhashed(dentry));
525 d_add(dentry, inode); 525 d_add(dentry, inode);
526 err = 0; 526 err = 0;
527 } 527 }
528 return err; 528 return err;
529 } 529 }
530 530
531 /* 531 /*
532 * Figure out final result of a lookup/open request. 532 * Figure out final result of a lookup/open request.
533 * 533 *
534 * Mainly, make sure we return the final req->r_dentry (if it already 534 * Mainly, make sure we return the final req->r_dentry (if it already
535 * existed) in place of the original VFS-provided dentry when they 535 * existed) in place of the original VFS-provided dentry when they
536 * differ. 536 * differ.
537 * 537 *
538 * Gracefully handle the case where the MDS replies with -ENOENT and 538 * Gracefully handle the case where the MDS replies with -ENOENT and
539 * no trace (which it may do, at its discretion, e.g., if it doesn't 539 * no trace (which it may do, at its discretion, e.g., if it doesn't
540 * care to issue a lease on the negative dentry). 540 * care to issue a lease on the negative dentry).
541 */ 541 */
542 struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, 542 struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
543 struct dentry *dentry, int err) 543 struct dentry *dentry, int err)
544 { 544 {
545 if (err == -ENOENT) { 545 if (err == -ENOENT) {
546 /* no trace? */ 546 /* no trace? */
547 err = 0; 547 err = 0;
548 if (!req->r_reply_info.head->is_dentry) { 548 if (!req->r_reply_info.head->is_dentry) {
549 dout("ENOENT and no trace, dentry %p inode %p\n", 549 dout("ENOENT and no trace, dentry %p inode %p\n",
550 dentry, dentry->d_inode); 550 dentry, dentry->d_inode);
551 if (dentry->d_inode) { 551 if (dentry->d_inode) {
552 d_drop(dentry); 552 d_drop(dentry);
553 err = -ENOENT; 553 err = -ENOENT;
554 } else { 554 } else {
555 d_add(dentry, NULL); 555 d_add(dentry, NULL);
556 } 556 }
557 } 557 }
558 } 558 }
559 if (err) 559 if (err)
560 dentry = ERR_PTR(err); 560 dentry = ERR_PTR(err);
561 else if (dentry != req->r_dentry) 561 else if (dentry != req->r_dentry)
562 dentry = dget(req->r_dentry); /* we got spliced */ 562 dentry = dget(req->r_dentry); /* we got spliced */
563 else 563 else
564 dentry = NULL; 564 dentry = NULL;
565 return dentry; 565 return dentry;
566 } 566 }
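
For orientation, the three-way return above follows the VFS ->lookup() contract. A hedged sketch of how the result is consumed (illustrative only, not code from this diff):

	struct dentry *res = ceph_finish_lookup(req, dentry, err);

	if (IS_ERR(res))
		return res;	/* hard error, propagated as an ERR_PTR */
	if (res)
		dentry = res;	/* spliced: a pre-existing dentry was found */
	/* res == NULL: the dentry we passed in was used as-is */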
567 567
568 static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) 568 static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
569 { 569 {
570 return ceph_ino(inode) == CEPH_INO_ROOT && 570 return ceph_ino(inode) == CEPH_INO_ROOT &&
571 strncmp(dentry->d_name.name, ".ceph", 5) == 0; 571 strncmp(dentry->d_name.name, ".ceph", 5) == 0;
572 } 572 }
573 573
574 /* 574 /*
575 * Look up a single dir entry. If there is a lookup intent, inform 575 * Look up a single dir entry. If there is a lookup intent, inform
576 * the MDS so that it gets our 'caps wanted' value in a single op. 576 * the MDS so that it gets our 'caps wanted' value in a single op.
577 */ 577 */
578 static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, 578 static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
579 struct nameidata *nd) 579 struct nameidata *nd)
580 { 580 {
581 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 581 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
582 struct ceph_mds_client *mdsc = fsc->mdsc; 582 struct ceph_mds_client *mdsc = fsc->mdsc;
583 struct ceph_mds_request *req; 583 struct ceph_mds_request *req;
584 int op; 584 int op;
585 int err; 585 int err;
586 586
587 dout("lookup %p dentry %p '%.*s'\n", 587 dout("lookup %p dentry %p '%.*s'\n",
588 dir, dentry, dentry->d_name.len, dentry->d_name.name); 588 dir, dentry, dentry->d_name.len, dentry->d_name.name);
589 589
590 if (dentry->d_name.len > NAME_MAX) 590 if (dentry->d_name.len > NAME_MAX)
591 return ERR_PTR(-ENAMETOOLONG); 591 return ERR_PTR(-ENAMETOOLONG);
592 592
593 err = ceph_init_dentry(dentry); 593 err = ceph_init_dentry(dentry);
594 if (err < 0) 594 if (err < 0)
595 return ERR_PTR(err); 595 return ERR_PTR(err);
596 596
597 /* open (but not create!) intent? */ 597 /* open (but not create!) intent? */
598 if (nd && 598 if (nd &&
599 (nd->flags & LOOKUP_OPEN) && 599 (nd->flags & LOOKUP_OPEN) &&
600 !(nd->intent.open.flags & O_CREAT)) { 600 !(nd->intent.open.flags & O_CREAT)) {
601 int mode = nd->intent.open.create_mode & ~current->fs->umask; 601 int mode = nd->intent.open.create_mode & ~current->fs->umask;
602 return ceph_lookup_open(dir, dentry, nd, mode, 1); 602 return ceph_lookup_open(dir, dentry, nd, mode, 1);
603 } 603 }
604 604
605 /* can we conclude ENOENT locally? */ 605 /* can we conclude ENOENT locally? */
606 if (dentry->d_inode == NULL) { 606 if (dentry->d_inode == NULL) {
607 struct ceph_inode_info *ci = ceph_inode(dir); 607 struct ceph_inode_info *ci = ceph_inode(dir);
608 struct ceph_dentry_info *di = ceph_dentry(dentry); 608 struct ceph_dentry_info *di = ceph_dentry(dentry);
609 609
610 spin_lock(&ci->i_ceph_lock); 610 spin_lock(&ci->i_ceph_lock);
611 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); 611 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
612 if (strncmp(dentry->d_name.name, 612 if (strncmp(dentry->d_name.name,
613 fsc->mount_options->snapdir_name, 613 fsc->mount_options->snapdir_name,
614 dentry->d_name.len) && 614 dentry->d_name.len) &&
615 !is_root_ceph_dentry(dir, dentry) && 615 !is_root_ceph_dentry(dir, dentry) &&
616 ceph_dir_test_complete(dir) && 616 ceph_dir_test_complete(dir) &&
617 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { 617 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
618 spin_unlock(&ci->i_ceph_lock); 618 spin_unlock(&ci->i_ceph_lock);
619 dout(" dir %p complete, -ENOENT\n", dir); 619 dout(" dir %p complete, -ENOENT\n", dir);
620 d_add(dentry, NULL); 620 d_add(dentry, NULL);
621 di->lease_shared_gen = ci->i_shared_gen; 621 di->lease_shared_gen = ci->i_shared_gen;
622 return NULL; 622 return NULL;
623 } 623 }
624 spin_unlock(&ci->i_ceph_lock); 624 spin_unlock(&ci->i_ceph_lock);
625 } 625 }
626 626
627 op = ceph_snap(dir) == CEPH_SNAPDIR ? 627 op = ceph_snap(dir) == CEPH_SNAPDIR ?
628 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; 628 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
629 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); 629 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
630 if (IS_ERR(req)) 630 if (IS_ERR(req))
631 return ERR_CAST(req); 631 return ERR_CAST(req);
632 req->r_dentry = dget(dentry); 632 req->r_dentry = dget(dentry);
633 req->r_num_caps = 2; 633 req->r_num_caps = 2;
634 /* we only need inode linkage */ 634 /* we only need inode linkage */
635 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); 635 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
636 req->r_locked_dir = dir; 636 req->r_locked_dir = dir;
637 err = ceph_mdsc_do_request(mdsc, NULL, req); 637 err = ceph_mdsc_do_request(mdsc, NULL, req);
638 err = ceph_handle_snapdir(req, dentry, err); 638 err = ceph_handle_snapdir(req, dentry, err);
639 dentry = ceph_finish_lookup(req, dentry, err); 639 dentry = ceph_finish_lookup(req, dentry, err);
640 ceph_mdsc_put_request(req); /* will dput(dentry) */ 640 ceph_mdsc_put_request(req); /* will dput(dentry) */
641 dout("lookup result=%p\n", dentry); 641 dout("lookup result=%p\n", dentry);
642 return dentry; 642 return dentry;
643 } 643 }
644 644
645 /* 645 /*
646 * If we do a create but get no trace back from the MDS, follow up with 646 * If we do a create but get no trace back from the MDS, follow up with
647 * a lookup (the VFS expects us to link up the provided dentry). 647 * a lookup (the VFS expects us to link up the provided dentry).
648 */ 648 */
649 int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) 649 int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
650 { 650 {
651 struct dentry *result = ceph_lookup(dir, dentry, NULL); 651 struct dentry *result = ceph_lookup(dir, dentry, NULL);
652 652
653 if (result && !IS_ERR(result)) { 653 if (result && !IS_ERR(result)) {
654 /* 654 /*
655 * We created the item, then did a lookup, and found 655 * We created the item, then did a lookup, and found
656 * it was already linked to another inode we already 656 * it was already linked to another inode we already
657 * had in our cache (and thus got spliced). Link our 657 * had in our cache (and thus got spliced). Link our
658 * dentry to that inode, but don't hash it, just in 658 * dentry to that inode, but don't hash it, just in
659 * case the VFS wants to dereference it. 659 * case the VFS wants to dereference it.
660 */ 660 */
661 BUG_ON(!result->d_inode); 661 BUG_ON(!result->d_inode);
662 d_instantiate(dentry, result->d_inode); 662 d_instantiate(dentry, result->d_inode);
663 return 0; 663 return 0;
664 } 664 }
665 return PTR_ERR(result); 665 return PTR_ERR(result);
666 } 666 }
667 667
668 static int ceph_mknod(struct inode *dir, struct dentry *dentry, 668 static int ceph_mknod(struct inode *dir, struct dentry *dentry,
669 umode_t mode, dev_t rdev) 669 umode_t mode, dev_t rdev)
670 { 670 {
671 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 671 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
672 struct ceph_mds_client *mdsc = fsc->mdsc; 672 struct ceph_mds_client *mdsc = fsc->mdsc;
673 struct ceph_mds_request *req; 673 struct ceph_mds_request *req;
674 int err; 674 int err;
675 675
676 if (ceph_snap(dir) != CEPH_NOSNAP) 676 if (ceph_snap(dir) != CEPH_NOSNAP)
677 return -EROFS; 677 return -EROFS;
678 678
679 dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n", 679 dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
680 dir, dentry, mode, rdev); 680 dir, dentry, mode, rdev);
681 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS); 681 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
682 if (IS_ERR(req)) { 682 if (IS_ERR(req)) {
683 d_drop(dentry); 683 d_drop(dentry);
684 return PTR_ERR(req); 684 return PTR_ERR(req);
685 } 685 }
686 req->r_dentry = dget(dentry); 686 req->r_dentry = dget(dentry);
687 req->r_num_caps = 2; 687 req->r_num_caps = 2;
688 req->r_locked_dir = dir; 688 req->r_locked_dir = dir;
689 req->r_args.mknod.mode = cpu_to_le32(mode); 689 req->r_args.mknod.mode = cpu_to_le32(mode);
690 req->r_args.mknod.rdev = cpu_to_le32(rdev); 690 req->r_args.mknod.rdev = cpu_to_le32(rdev);
691 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 691 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
692 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 692 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
693 err = ceph_mdsc_do_request(mdsc, dir, req); 693 err = ceph_mdsc_do_request(mdsc, dir, req);
694 if (!err && !req->r_reply_info.head->is_dentry) 694 if (!err && !req->r_reply_info.head->is_dentry)
695 err = ceph_handle_notrace_create(dir, dentry); 695 err = ceph_handle_notrace_create(dir, dentry);
696 ceph_mdsc_put_request(req); 696 ceph_mdsc_put_request(req);
697 if (err) 697 if (err)
698 d_drop(dentry); 698 d_drop(dentry);
699 return err; 699 return err;
700 } 700 }
701 701
702 static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode, 702 static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
703 struct nameidata *nd) 703 struct nameidata *nd)
704 { 704 {
705 dout("create in dir %p dentry %p name '%.*s'\n", 705 dout("create in dir %p dentry %p name '%.*s'\n",
706 dir, dentry, dentry->d_name.len, dentry->d_name.name); 706 dir, dentry, dentry->d_name.len, dentry->d_name.name);
707 707
708 if (ceph_snap(dir) != CEPH_NOSNAP) 708 if (ceph_snap(dir) != CEPH_NOSNAP)
709 return -EROFS; 709 return -EROFS;
710 710
711 if (nd) { 711 if (nd) {
712 BUG_ON((nd->flags & LOOKUP_OPEN) == 0); 712 BUG_ON((nd->flags & LOOKUP_OPEN) == 0);
713 dentry = ceph_lookup_open(dir, dentry, nd, mode, 0); 713 dentry = ceph_lookup_open(dir, dentry, nd, mode, 0);
714 /* hrm, what should i do here if we get aliased? */ 714 /* hrm, what should i do here if we get aliased? */
715 if (IS_ERR(dentry)) 715 if (IS_ERR(dentry))
716 return PTR_ERR(dentry); 716 return PTR_ERR(dentry);
717 return 0; 717 return 0;
718 } 718 }
719 719
720 /* fall back to mknod */ 720 /* fall back to mknod */
721 return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0); 721 return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0);
722 } 722 }
723 723
724 static int ceph_symlink(struct inode *dir, struct dentry *dentry, 724 static int ceph_symlink(struct inode *dir, struct dentry *dentry,
725 const char *dest) 725 const char *dest)
726 { 726 {
727 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 727 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
728 struct ceph_mds_client *mdsc = fsc->mdsc; 728 struct ceph_mds_client *mdsc = fsc->mdsc;
729 struct ceph_mds_request *req; 729 struct ceph_mds_request *req;
730 int err; 730 int err;
731 731
732 if (ceph_snap(dir) != CEPH_NOSNAP) 732 if (ceph_snap(dir) != CEPH_NOSNAP)
733 return -EROFS; 733 return -EROFS;
734 734
735 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); 735 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
736 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); 736 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
737 if (IS_ERR(req)) { 737 if (IS_ERR(req)) {
738 d_drop(dentry); 738 d_drop(dentry);
739 return PTR_ERR(req); 739 return PTR_ERR(req);
740 } 740 }
741 req->r_dentry = dget(dentry); 741 req->r_dentry = dget(dentry);
742 req->r_num_caps = 2; 742 req->r_num_caps = 2;
743 req->r_path2 = kstrdup(dest, GFP_NOFS); 743 req->r_path2 = kstrdup(dest, GFP_NOFS);
744 req->r_locked_dir = dir; 744 req->r_locked_dir = dir;
745 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 745 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
746 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 746 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
747 err = ceph_mdsc_do_request(mdsc, dir, req); 747 err = ceph_mdsc_do_request(mdsc, dir, req);
748 if (!err && !req->r_reply_info.head->is_dentry) 748 if (!err && !req->r_reply_info.head->is_dentry)
749 err = ceph_handle_notrace_create(dir, dentry); 749 err = ceph_handle_notrace_create(dir, dentry);
750 ceph_mdsc_put_request(req); 750 ceph_mdsc_put_request(req);
751 if (err) 751 if (err)
752 d_drop(dentry); 752 d_drop(dentry);
753 return err; 753 return err;
754 } 754 }
755 755
756 static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 756 static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
757 { 757 {
758 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 758 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
759 struct ceph_mds_client *mdsc = fsc->mdsc; 759 struct ceph_mds_client *mdsc = fsc->mdsc;
760 struct ceph_mds_request *req; 760 struct ceph_mds_request *req;
761 int err = -EROFS; 761 int err = -EROFS;
762 int op; 762 int op;
763 763
764 if (ceph_snap(dir) == CEPH_SNAPDIR) { 764 if (ceph_snap(dir) == CEPH_SNAPDIR) {
765 /* mkdir .snap/foo is a MKSNAP */ 765 /* mkdir .snap/foo is a MKSNAP */
766 op = CEPH_MDS_OP_MKSNAP; 766 op = CEPH_MDS_OP_MKSNAP;
767 dout("mksnap dir %p snap '%.*s' dn %p\n", dir, 767 dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
768 dentry->d_name.len, dentry->d_name.name, dentry); 768 dentry->d_name.len, dentry->d_name.name, dentry);
769 } else if (ceph_snap(dir) == CEPH_NOSNAP) { 769 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
770 dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode); 770 dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
771 op = CEPH_MDS_OP_MKDIR; 771 op = CEPH_MDS_OP_MKDIR;
772 } else { 772 } else {
773 goto out; 773 goto out;
774 } 774 }
775 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 775 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
776 if (IS_ERR(req)) { 776 if (IS_ERR(req)) {
777 err = PTR_ERR(req); 777 err = PTR_ERR(req);
778 goto out; 778 goto out;
779 } 779 }
780 780
781 req->r_dentry = dget(dentry); 781 req->r_dentry = dget(dentry);
782 req->r_num_caps = 2; 782 req->r_num_caps = 2;
783 req->r_locked_dir = dir; 783 req->r_locked_dir = dir;
784 req->r_args.mkdir.mode = cpu_to_le32(mode); 784 req->r_args.mkdir.mode = cpu_to_le32(mode);
785 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 785 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
786 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 786 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
787 err = ceph_mdsc_do_request(mdsc, dir, req); 787 err = ceph_mdsc_do_request(mdsc, dir, req);
788 if (!err && !req->r_reply_info.head->is_dentry) 788 if (!err && !req->r_reply_info.head->is_dentry)
789 err = ceph_handle_notrace_create(dir, dentry); 789 err = ceph_handle_notrace_create(dir, dentry);
790 ceph_mdsc_put_request(req); 790 ceph_mdsc_put_request(req);
791 out: 791 out:
792 if (err < 0) 792 if (err < 0)
793 d_drop(dentry); 793 d_drop(dentry);
794 return err; 794 return err;
795 } 795 }
796 796
797 static int ceph_link(struct dentry *old_dentry, struct inode *dir, 797 static int ceph_link(struct dentry *old_dentry, struct inode *dir,
798 struct dentry *dentry) 798 struct dentry *dentry)
799 { 799 {
800 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 800 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
801 struct ceph_mds_client *mdsc = fsc->mdsc; 801 struct ceph_mds_client *mdsc = fsc->mdsc;
802 struct ceph_mds_request *req; 802 struct ceph_mds_request *req;
803 int err; 803 int err;
804 804
805 if (ceph_snap(dir) != CEPH_NOSNAP) 805 if (ceph_snap(dir) != CEPH_NOSNAP)
806 return -EROFS; 806 return -EROFS;
807 807
808 dout("link in dir %p old_dentry %p dentry %p\n", dir, 808 dout("link in dir %p old_dentry %p dentry %p\n", dir,
809 old_dentry, dentry); 809 old_dentry, dentry);
810 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS); 810 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
811 if (IS_ERR(req)) { 811 if (IS_ERR(req)) {
812 d_drop(dentry); 812 d_drop(dentry);
813 return PTR_ERR(req); 813 return PTR_ERR(req);
814 } 814 }
815 req->r_dentry = dget(dentry); 815 req->r_dentry = dget(dentry);
816 req->r_num_caps = 2; 816 req->r_num_caps = 2;
817 req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */ 817 req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
818 req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); 818 req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
819 req->r_locked_dir = dir; 819 req->r_locked_dir = dir;
820 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 820 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
821 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 821 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
822 err = ceph_mdsc_do_request(mdsc, dir, req); 822 err = ceph_mdsc_do_request(mdsc, dir, req);
823 if (err) { 823 if (err) {
824 d_drop(dentry); 824 d_drop(dentry);
825 } else if (!req->r_reply_info.head->is_dentry) { 825 } else if (!req->r_reply_info.head->is_dentry) {
826 ihold(old_dentry->d_inode); 826 ihold(old_dentry->d_inode);
827 d_instantiate(dentry, old_dentry->d_inode); 827 d_instantiate(dentry, old_dentry->d_inode);
828 } 828 }
829 ceph_mdsc_put_request(req); 829 ceph_mdsc_put_request(req);
830 return err; 830 return err;
831 } 831 }
832 832
833 /* 833 /*
834 * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it 834 * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
835 * looks like the link count will hit 0, drop any other caps (other 835 * looks like the link count will hit 0, drop any other caps (other
836 * than PIN) we don't specifically want (due to the file still being 836 * than PIN) we don't specifically want (due to the file still being
837 * open). 837 * open).
838 */ 838 */
839 static int drop_caps_for_unlink(struct inode *inode) 839 static int drop_caps_for_unlink(struct inode *inode)
840 { 840 {
841 struct ceph_inode_info *ci = ceph_inode(inode); 841 struct ceph_inode_info *ci = ceph_inode(inode);
842 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; 842 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
843 843
844 spin_lock(&ci->i_ceph_lock); 844 spin_lock(&ci->i_ceph_lock);
845 if (inode->i_nlink == 1) { 845 if (inode->i_nlink == 1) {
846 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); 846 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
847 ci->i_ceph_flags |= CEPH_I_NODELAY; 847 ci->i_ceph_flags |= CEPH_I_NODELAY;
848 } 848 }
849 spin_unlock(&ci->i_ceph_lock); 849 spin_unlock(&ci->i_ceph_lock);
850 return drop; 850 return drop;
851 } 851 }
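
The mask arithmetic above is terse; a small worked example (bit values invented for illustration, not the real CEPH_CAP_* assignments) shows what lands in drop when the link count is about to hit zero:

	/* illustrative bit values only */
	enum { PIN = 0x1, LINK_SHARED = 0x2, LINK_EXCL = 0x4, FILE_CACHE = 0x100 };

	int wanted = FILE_CACHE;		/* file still open for read */
	int drop = LINK_SHARED | LINK_EXCL;	/* always dropped on unlink */

	drop |= ~(wanted | PIN);		/* the i_nlink == 1 case */
	/* drop now has every bit set except FILE_CACHE and PIN */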
852 852
853 /* 853 /*
854 * rmdir and unlink differ only by the metadata op code 854 * rmdir and unlink differ only by the metadata op code
855 */ 855 */
856 static int ceph_unlink(struct inode *dir, struct dentry *dentry) 856 static int ceph_unlink(struct inode *dir, struct dentry *dentry)
857 { 857 {
858 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 858 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
859 struct ceph_mds_client *mdsc = fsc->mdsc; 859 struct ceph_mds_client *mdsc = fsc->mdsc;
860 struct inode *inode = dentry->d_inode; 860 struct inode *inode = dentry->d_inode;
861 struct ceph_mds_request *req; 861 struct ceph_mds_request *req;
862 int err = -EROFS; 862 int err = -EROFS;
863 int op; 863 int op;
864 864
865 if (ceph_snap(dir) == CEPH_SNAPDIR) { 865 if (ceph_snap(dir) == CEPH_SNAPDIR) {
866 /* rmdir .snap/foo is RMSNAP */ 866 /* rmdir .snap/foo is RMSNAP */
867 dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len, 867 dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
868 dentry->d_name.name, dentry); 868 dentry->d_name.name, dentry);
869 op = CEPH_MDS_OP_RMSNAP; 869 op = CEPH_MDS_OP_RMSNAP;
870 } else if (ceph_snap(dir) == CEPH_NOSNAP) { 870 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
871 dout("unlink/rmdir dir %p dn %p inode %p\n", 871 dout("unlink/rmdir dir %p dn %p inode %p\n",
872 dir, dentry, inode); 872 dir, dentry, inode);
873 op = S_ISDIR(dentry->d_inode->i_mode) ? 873 op = S_ISDIR(dentry->d_inode->i_mode) ?
874 CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK; 874 CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
875 } else 875 } else
876 goto out; 876 goto out;
877 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 877 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
878 if (IS_ERR(req)) { 878 if (IS_ERR(req)) {
879 err = PTR_ERR(req); 879 err = PTR_ERR(req);
880 goto out; 880 goto out;
881 } 881 }
882 req->r_dentry = dget(dentry); 882 req->r_dentry = dget(dentry);
883 req->r_num_caps = 2; 883 req->r_num_caps = 2;
884 req->r_locked_dir = dir; 884 req->r_locked_dir = dir;
885 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 885 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
886 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 886 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
887 req->r_inode_drop = drop_caps_for_unlink(inode); 887 req->r_inode_drop = drop_caps_for_unlink(inode);
888 err = ceph_mdsc_do_request(mdsc, dir, req); 888 err = ceph_mdsc_do_request(mdsc, dir, req);
889 if (!err && !req->r_reply_info.head->is_dentry) 889 if (!err && !req->r_reply_info.head->is_dentry)
890 d_delete(dentry); 890 d_delete(dentry);
891 ceph_mdsc_put_request(req); 891 ceph_mdsc_put_request(req);
892 out: 892 out:
893 return err; 893 return err;
894 } 894 }
895 895
896 static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, 896 static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
897 struct inode *new_dir, struct dentry *new_dentry) 897 struct inode *new_dir, struct dentry *new_dentry)
898 { 898 {
899 struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb); 899 struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
900 struct ceph_mds_client *mdsc = fsc->mdsc; 900 struct ceph_mds_client *mdsc = fsc->mdsc;
901 struct ceph_mds_request *req; 901 struct ceph_mds_request *req;
902 int err; 902 int err;
903 903
904 if (ceph_snap(old_dir) != ceph_snap(new_dir)) 904 if (ceph_snap(old_dir) != ceph_snap(new_dir))
905 return -EXDEV; 905 return -EXDEV;
906 if (ceph_snap(old_dir) != CEPH_NOSNAP || 906 if (ceph_snap(old_dir) != CEPH_NOSNAP ||
907 ceph_snap(new_dir) != CEPH_NOSNAP) 907 ceph_snap(new_dir) != CEPH_NOSNAP)
908 return -EROFS; 908 return -EROFS;
909 dout("rename dir %p dentry %p to dir %p dentry %p\n", 909 dout("rename dir %p dentry %p to dir %p dentry %p\n",
910 old_dir, old_dentry, new_dir, new_dentry); 910 old_dir, old_dentry, new_dir, new_dentry);
911 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS); 911 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
912 if (IS_ERR(req)) 912 if (IS_ERR(req))
913 return PTR_ERR(req); 913 return PTR_ERR(req);
914 req->r_dentry = dget(new_dentry); 914 req->r_dentry = dget(new_dentry);
915 req->r_num_caps = 2; 915 req->r_num_caps = 2;
916 req->r_old_dentry = dget(old_dentry); 916 req->r_old_dentry = dget(old_dentry);
917 req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); 917 req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
918 req->r_locked_dir = new_dir; 918 req->r_locked_dir = new_dir;
919 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; 919 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
920 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; 920 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
921 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 921 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
922 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 922 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
923 /* release LINK_RDCACHE on source inode (mds will lock it) */ 923 /* release LINK_RDCACHE on source inode (mds will lock it) */
924 req->r_old_inode_drop = CEPH_CAP_LINK_SHARED; 924 req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
925 if (new_dentry->d_inode) 925 if (new_dentry->d_inode)
926 req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode); 926 req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
927 err = ceph_mdsc_do_request(mdsc, old_dir, req); 927 err = ceph_mdsc_do_request(mdsc, old_dir, req);
928 if (!err && !req->r_reply_info.head->is_dentry) { 928 if (!err && !req->r_reply_info.head->is_dentry) {
929 /* 929 /*
930 * Normally d_move() is done by fill_trace (called by 930 * Normally d_move() is done by fill_trace (called by
931 * do_request, above). If there is no trace, we need 931 * do_request, above). If there is no trace, we need
932 * to do it here. 932 * to do it here.
933 */ 933 */
934 934
935 /* d_move screws up d_subdirs order */ 935 /* d_move screws up d_subdirs order */
936 ceph_dir_clear_complete(new_dir); 936 ceph_dir_clear_complete(new_dir);
937 937
938 d_move(old_dentry, new_dentry); 938 d_move(old_dentry, new_dentry);
939 939
940 /* ensure target dentry is invalidated, despite 940 /* ensure target dentry is invalidated, despite
941 rehashing bug in vfs_rename_dir */ 941 rehashing bug in vfs_rename_dir */
942 ceph_invalidate_dentry_lease(new_dentry); 942 ceph_invalidate_dentry_lease(new_dentry);
943 } 943 }
944 ceph_mdsc_put_request(req); 944 ceph_mdsc_put_request(req);
945 return err; 945 return err;
946 } 946 }
947 947
948 /* 948 /*
949 * Ensure a dentry lease will no longer revalidate. 949 * Ensure a dentry lease will no longer revalidate.
950 */ 950 */
951 void ceph_invalidate_dentry_lease(struct dentry *dentry) 951 void ceph_invalidate_dentry_lease(struct dentry *dentry)
952 { 952 {
953 spin_lock(&dentry->d_lock); 953 spin_lock(&dentry->d_lock);
954 dentry->d_time = jiffies; 954 dentry->d_time = jiffies;
955 ceph_dentry(dentry)->lease_shared_gen = 0; 955 ceph_dentry(dentry)->lease_shared_gen = 0;
956 spin_unlock(&dentry->d_lock); 956 spin_unlock(&dentry->d_lock);
957 } 957 }
958 958
959 /* 959 /*
960 * Check if dentry lease is valid. If not, delete the lease. Try to 960 * Check if dentry lease is valid. If not, delete the lease. Try to
961 * renew if the lease is more than half up. 961 * renew if the lease is more than half up.
962 */ 962 */
963 static int dentry_lease_is_valid(struct dentry *dentry) 963 static int dentry_lease_is_valid(struct dentry *dentry)
964 { 964 {
965 struct ceph_dentry_info *di; 965 struct ceph_dentry_info *di;
966 struct ceph_mds_session *s; 966 struct ceph_mds_session *s;
967 int valid = 0; 967 int valid = 0;
968 u32 gen; 968 u32 gen;
969 unsigned long ttl; 969 unsigned long ttl;
970 struct ceph_mds_session *session = NULL; 970 struct ceph_mds_session *session = NULL;
971 struct inode *dir = NULL; 971 struct inode *dir = NULL;
972 u32 seq = 0; 972 u32 seq = 0;
973 973
974 spin_lock(&dentry->d_lock); 974 spin_lock(&dentry->d_lock);
975 di = ceph_dentry(dentry); 975 di = ceph_dentry(dentry);
976 if (di->lease_session) { 976 if (di->lease_session) {
977 s = di->lease_session; 977 s = di->lease_session;
978 spin_lock(&s->s_cap_lock); 978 spin_lock(&s->s_gen_ttl_lock);
979 gen = s->s_cap_gen; 979 gen = s->s_cap_gen;
980 ttl = s->s_cap_ttl; 980 ttl = s->s_cap_ttl;
981 spin_unlock(&s->s_cap_lock); 981 spin_unlock(&s->s_gen_ttl_lock);
982 982
983 if (di->lease_gen == gen && 983 if (di->lease_gen == gen &&
984 time_before(jiffies, dentry->d_time) && 984 time_before(jiffies, dentry->d_time) &&
985 time_before(jiffies, ttl)) { 985 time_before(jiffies, ttl)) {
986 valid = 1; 986 valid = 1;
987 if (di->lease_renew_after && 987 if (di->lease_renew_after &&
988 time_after(jiffies, di->lease_renew_after)) { 988 time_after(jiffies, di->lease_renew_after)) {
989 /* we should renew */ 989 /* we should renew */
990 dir = dentry->d_parent->d_inode; 990 dir = dentry->d_parent->d_inode;
991 session = ceph_get_mds_session(s); 991 session = ceph_get_mds_session(s);
992 seq = di->lease_seq; 992 seq = di->lease_seq;
993 di->lease_renew_after = 0; 993 di->lease_renew_after = 0;
994 di->lease_renew_from = jiffies; 994 di->lease_renew_from = jiffies;
995 } 995 }
996 } 996 }
997 } 997 }
998 spin_unlock(&dentry->d_lock); 998 spin_unlock(&dentry->d_lock);
999 999
1000 if (session) { 1000 if (session) {
1001 ceph_mdsc_lease_send_msg(session, dir, dentry, 1001 ceph_mdsc_lease_send_msg(session, dir, dentry,
1002 CEPH_MDS_LEASE_RENEW, seq); 1002 CEPH_MDS_LEASE_RENEW, seq);
1003 ceph_put_mds_session(session); 1003 ceph_put_mds_session(session);
1004 } 1004 }
1005 dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid); 1005 dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
1006 return valid; 1006 return valid;
1007 } 1007 }
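
The renewal trigger above depends on di->lease_renew_after, which is set in the lease-grant path (not shown in this hunk). A hedged sketch of how those fields are presumably initialized, consistent with the "more than half up" comment; duration_ms stands in for the MDS-supplied lease duration:

	unsigned long duration_j = msecs_to_jiffies(duration_ms);

	dentry->d_time = jiffies + duration_j;		  /* hard expiry, checked
							     via time_before() */
	di->lease_renew_after = jiffies + duration_j / 2; /* start renewing once
							     half the lease is up */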
1008 1008
1009 /* 1009 /*
1010 * Check if directory-wide content lease/cap is valid. 1010 * Check if directory-wide content lease/cap is valid.
1011 */ 1011 */
1012 static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) 1012 static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
1013 { 1013 {
1014 struct ceph_inode_info *ci = ceph_inode(dir); 1014 struct ceph_inode_info *ci = ceph_inode(dir);
1015 struct ceph_dentry_info *di = ceph_dentry(dentry); 1015 struct ceph_dentry_info *di = ceph_dentry(dentry);
1016 int valid = 0; 1016 int valid = 0;
1017 1017
1018 spin_lock(&ci->i_ceph_lock); 1018 spin_lock(&ci->i_ceph_lock);
1019 if (ci->i_shared_gen == di->lease_shared_gen) 1019 if (ci->i_shared_gen == di->lease_shared_gen)
1020 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); 1020 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
1021 spin_unlock(&ci->i_ceph_lock); 1021 spin_unlock(&ci->i_ceph_lock);
1022 dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n", 1022 dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
1023 dir, (unsigned)ci->i_shared_gen, dentry, 1023 dir, (unsigned)ci->i_shared_gen, dentry,
1024 (unsigned)di->lease_shared_gen, valid); 1024 (unsigned)di->lease_shared_gen, valid);
1025 return valid; 1025 return valid;
1026 } 1026 }
1027 1027
1028 /* 1028 /*
1029 * Check if cached dentry can be trusted. 1029 * Check if cached dentry can be trusted.
1030 */ 1030 */
1031 static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) 1031 static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
1032 { 1032 {
1033 int valid = 0; 1033 int valid = 0;
1034 struct inode *dir; 1034 struct inode *dir;
1035 1035
1036 if (nd && nd->flags & LOOKUP_RCU) 1036 if (nd && nd->flags & LOOKUP_RCU)
1037 return -ECHILD; 1037 return -ECHILD;
1038 1038
1039 dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, 1039 dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
1040 dentry->d_name.len, dentry->d_name.name, dentry->d_inode, 1040 dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
1041 ceph_dentry(dentry)->offset); 1041 ceph_dentry(dentry)->offset);
1042 1042
1043 dir = ceph_get_dentry_parent_inode(dentry); 1043 dir = ceph_get_dentry_parent_inode(dentry);
1044 1044
1045 /* always trust cached snapped dentries, snapdir dentry */ 1045 /* always trust cached snapped dentries, snapdir dentry */
1046 if (ceph_snap(dir) != CEPH_NOSNAP) { 1046 if (ceph_snap(dir) != CEPH_NOSNAP) {
1047 dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry, 1047 dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
1048 dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 1048 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
1049 valid = 1; 1049 valid = 1;
1050 } else if (dentry->d_inode && 1050 } else if (dentry->d_inode &&
1051 ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) { 1051 ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) {
1052 valid = 1; 1052 valid = 1;
1053 } else if (dentry_lease_is_valid(dentry) || 1053 } else if (dentry_lease_is_valid(dentry) ||
1054 dir_lease_is_valid(dir, dentry)) { 1054 dir_lease_is_valid(dir, dentry)) {
1055 valid = 1; 1055 valid = 1;
1056 } 1056 }
1057 1057
1058 dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); 1058 dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
1059 if (valid) 1059 if (valid)
1060 ceph_dentry_lru_touch(dentry); 1060 ceph_dentry_lru_touch(dentry);
1061 else 1061 else
1062 d_drop(dentry); 1062 d_drop(dentry);
1063 iput(dir); 1063 iput(dir);
1064 return valid; 1064 return valid;
1065 } 1065 }
1066 1066
1067 /* 1067 /*
1068 * Release our ceph_dentry_info. 1068 * Release our ceph_dentry_info.
1069 */ 1069 */
1070 static void ceph_d_release(struct dentry *dentry) 1070 static void ceph_d_release(struct dentry *dentry)
1071 { 1071 {
1072 struct ceph_dentry_info *di = ceph_dentry(dentry); 1072 struct ceph_dentry_info *di = ceph_dentry(dentry);
1073 1073
1074 dout("d_release %p\n", dentry); 1074 dout("d_release %p\n", dentry);
1075 ceph_dentry_lru_del(dentry); 1075 ceph_dentry_lru_del(dentry);
1076 if (di->lease_session) 1076 if (di->lease_session)
1077 ceph_put_mds_session(di->lease_session); 1077 ceph_put_mds_session(di->lease_session);
1078 kmem_cache_free(ceph_dentry_cachep, di); 1078 kmem_cache_free(ceph_dentry_cachep, di);
1079 dentry->d_fsdata = NULL; 1079 dentry->d_fsdata = NULL;
1080 } 1080 }
1081 1081
1082 static int ceph_snapdir_d_revalidate(struct dentry *dentry, 1082 static int ceph_snapdir_d_revalidate(struct dentry *dentry,
1083 struct nameidata *nd) 1083 struct nameidata *nd)
1084 { 1084 {
1085 /* 1085 /*
1086 * Eventually, we'll want to revalidate snapped metadata 1086 * Eventually, we'll want to revalidate snapped metadata
1087 * too... probably... 1087 * too... probably...
1088 */ 1088 */
1089 return 1; 1089 return 1;
1090 } 1090 }
1091 1091
1092 /* 1092 /*
1093 * Set/clear/test dir complete flag on the dir's dentry. 1093 * Set/clear/test dir complete flag on the dir's dentry.
1094 */ 1094 */
1095 void ceph_dir_set_complete(struct inode *inode) 1095 void ceph_dir_set_complete(struct inode *inode)
1096 { 1096 {
1097 struct dentry *dentry = d_find_any_alias(inode); 1097 struct dentry *dentry = d_find_any_alias(inode);
1098 1098
1099 if (dentry && ceph_dentry(dentry) && 1099 if (dentry && ceph_dentry(dentry) &&
1100 ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) { 1100 ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) {
1101 dout(" marking %p (%p) complete\n", inode, dentry); 1101 dout(" marking %p (%p) complete\n", inode, dentry);
1102 set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); 1102 set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1103 } 1103 }
1104 dput(dentry); 1104 dput(dentry);
1105 } 1105 }
1106 1106
1107 void ceph_dir_clear_complete(struct inode *inode) 1107 void ceph_dir_clear_complete(struct inode *inode)
1108 { 1108 {
1109 struct dentry *dentry = d_find_any_alias(inode); 1109 struct dentry *dentry = d_find_any_alias(inode);
1110 1110
1111 if (dentry && ceph_dentry(dentry)) { 1111 if (dentry && ceph_dentry(dentry)) {
1112 dout(" marking %p (%p) NOT complete\n", inode, dentry); 1112 dout(" marking %p (%p) NOT complete\n", inode, dentry);
1113 clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); 1113 clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1114 } 1114 }
1115 dput(dentry); 1115 dput(dentry);
1116 } 1116 }
1117 1117
1118 bool ceph_dir_test_complete(struct inode *inode) 1118 bool ceph_dir_test_complete(struct inode *inode)
1119 { 1119 {
1120 struct dentry *dentry = d_find_any_alias(inode); 1120 struct dentry *dentry = d_find_any_alias(inode);
1121 bool complete = false; 1121 bool complete = false;
1122 1122
1123 if (dentry && ceph_dentry(dentry)) 1123 if (dentry && ceph_dentry(dentry))
1124 complete = test_bit(CEPH_D_COMPLETE, 1124 complete = test_bit(CEPH_D_COMPLETE,
1125 &ceph_dentry(dentry)->flags); 1125 &ceph_dentry(dentry)->flags);
1126 dput(dentry); 1126 dput(dentry);
1127 return complete; 1127 return complete;
1128 } 1128 }
1129 1129
1130 /* 1130 /*
1131 * When the VFS prunes a dentry from the cache, we need to clear the 1131 * When the VFS prunes a dentry from the cache, we need to clear the
1132 * complete flag on the parent directory. 1132 * complete flag on the parent directory.
1133 * 1133 *
1134 * Called under dentry->d_lock. 1134 * Called under dentry->d_lock.
1135 */ 1135 */
1136 static void ceph_d_prune(struct dentry *dentry) 1136 static void ceph_d_prune(struct dentry *dentry)
1137 { 1137 {
1138 struct ceph_dentry_info *di; 1138 struct ceph_dentry_info *di;
1139 1139
1140 dout("ceph_d_prune %p\n", dentry); 1140 dout("ceph_d_prune %p\n", dentry);
1141 1141
1142 /* do we have a valid parent? */ 1142 /* do we have a valid parent? */
1143 if (!dentry->d_parent || IS_ROOT(dentry)) 1143 if (!dentry->d_parent || IS_ROOT(dentry))
1144 return; 1144 return;
1145 1145
1146 /* if we are not hashed, we don't affect D_COMPLETE */ 1146 /* if we are not hashed, we don't affect D_COMPLETE */
1147 if (d_unhashed(dentry)) 1147 if (d_unhashed(dentry))
1148 return; 1148 return;
1149 1149
1150 /* 1150 /*
1151 * we hold d_lock, so d_parent is stable, and d_fsdata is never 1151 * we hold d_lock, so d_parent is stable, and d_fsdata is never
1152 * cleared until d_release 1152 * cleared until d_release
1153 */ 1153 */
1154 di = ceph_dentry(dentry->d_parent); 1154 di = ceph_dentry(dentry->d_parent);
1155 clear_bit(CEPH_D_COMPLETE, &di->flags); 1155 clear_bit(CEPH_D_COMPLETE, &di->flags);
1156 } 1156 }
1157 1157
1158 /* 1158 /*
1159 * read() on a dir. This weird interface hack only works if mounted 1159 * read() on a dir. This weird interface hack only works if mounted
1160 * with '-o dirstat'. 1160 * with '-o dirstat'.
1161 */ 1161 */
1162 static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, 1162 static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1163 loff_t *ppos) 1163 loff_t *ppos)
1164 { 1164 {
1165 struct ceph_file_info *cf = file->private_data; 1165 struct ceph_file_info *cf = file->private_data;
1166 struct inode *inode = file->f_dentry->d_inode; 1166 struct inode *inode = file->f_dentry->d_inode;
1167 struct ceph_inode_info *ci = ceph_inode(inode); 1167 struct ceph_inode_info *ci = ceph_inode(inode);
1168 int left; 1168 int left;
1169 const int bufsize = 1024; 1169 const int bufsize = 1024;
1170 1170
1171 if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) 1171 if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
1172 return -EISDIR; 1172 return -EISDIR;
1173 1173
1174 if (!cf->dir_info) { 1174 if (!cf->dir_info) {
1175 cf->dir_info = kmalloc(bufsize, GFP_NOFS); 1175 cf->dir_info = kmalloc(bufsize, GFP_NOFS);
1176 if (!cf->dir_info) 1176 if (!cf->dir_info)
1177 return -ENOMEM; 1177 return -ENOMEM;
1178 cf->dir_info_len = 1178 cf->dir_info_len =
1179 snprintf(cf->dir_info, bufsize, 1179 snprintf(cf->dir_info, bufsize,
1180 "entries: %20lld\n" 1180 "entries: %20lld\n"
1181 " files: %20lld\n" 1181 " files: %20lld\n"
1182 " subdirs: %20lld\n" 1182 " subdirs: %20lld\n"
1183 "rentries: %20lld\n" 1183 "rentries: %20lld\n"
1184 " rfiles: %20lld\n" 1184 " rfiles: %20lld\n"
1185 " rsubdirs: %20lld\n" 1185 " rsubdirs: %20lld\n"
1186 "rbytes: %20lld\n" 1186 "rbytes: %20lld\n"
1187 "rctime: %10ld.%09ld\n", 1187 "rctime: %10ld.%09ld\n",
1188 ci->i_files + ci->i_subdirs, 1188 ci->i_files + ci->i_subdirs,
1189 ci->i_files, 1189 ci->i_files,
1190 ci->i_subdirs, 1190 ci->i_subdirs,
1191 ci->i_rfiles + ci->i_rsubdirs, 1191 ci->i_rfiles + ci->i_rsubdirs,
1192 ci->i_rfiles, 1192 ci->i_rfiles,
1193 ci->i_rsubdirs, 1193 ci->i_rsubdirs,
1194 ci->i_rbytes, 1194 ci->i_rbytes,
1195 (long)ci->i_rctime.tv_sec, 1195 (long)ci->i_rctime.tv_sec,
1196 (long)ci->i_rctime.tv_nsec); 1196 (long)ci->i_rctime.tv_nsec);
1197 } 1197 }
1198 1198
1199 if (*ppos >= cf->dir_info_len) 1199 if (*ppos >= cf->dir_info_len)
1200 return 0; 1200 return 0;
1201 size = min_t(unsigned, size, cf->dir_info_len-*ppos); 1201 size = min_t(unsigned, size, cf->dir_info_len-*ppos);
1202 left = copy_to_user(buf, cf->dir_info + *ppos, size); 1202 left = copy_to_user(buf, cf->dir_info + *ppos, size);
1203 if (left == size) 1203 if (left == size)
1204 return -EFAULT; 1204 return -EFAULT;
1205 *ppos += (size - left); 1205 *ppos += (size - left);
1206 return size - left; 1206 return size - left;
1207 } 1207 }
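
From userspace, with the filesystem mounted with '-o dirstat', the hack above makes a plain read(2) on a directory fd return the formatted stats. A minimal illustration (mount point hypothetical):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[1024];
		int fd = open("/mnt/ceph/somedir", O_RDONLY | O_DIRECTORY);
		ssize_t n;

		if (fd < 0)
			return 1;
		n = read(fd, buf, sizeof(buf) - 1); /* -EISDIR without -o dirstat */
		if (n > 0) {
			buf[n] = '\0';
			fputs(buf, stdout);	/* entries/files/subdirs/r* lines */
		}
		close(fd);
		return 0;
	}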
1208 1208
1209 /* 1209 /*
1210 * an fsync() on a dir will wait for any uncommitted directory 1210 * an fsync() on a dir will wait for any uncommitted directory
1211 * operations to commit. 1211 * operations to commit.
1212 */ 1212 */
1213 static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, 1213 static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
1214 int datasync) 1214 int datasync)
1215 { 1215 {
1216 struct inode *inode = file->f_path.dentry->d_inode; 1216 struct inode *inode = file->f_path.dentry->d_inode;
1217 struct ceph_inode_info *ci = ceph_inode(inode); 1217 struct ceph_inode_info *ci = ceph_inode(inode);
1218 struct list_head *head = &ci->i_unsafe_dirops; 1218 struct list_head *head = &ci->i_unsafe_dirops;
1219 struct ceph_mds_request *req; 1219 struct ceph_mds_request *req;
1220 u64 last_tid; 1220 u64 last_tid;
1221 int ret = 0; 1221 int ret = 0;
1222 1222
1223 dout("dir_fsync %p\n", inode); 1223 dout("dir_fsync %p\n", inode);
1224 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 1224 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
1225 if (ret) 1225 if (ret)
1226 return ret; 1226 return ret;
1227 mutex_lock(&inode->i_mutex); 1227 mutex_lock(&inode->i_mutex);
1228 1228
1229 spin_lock(&ci->i_unsafe_lock); 1229 spin_lock(&ci->i_unsafe_lock);
1230 if (list_empty(head)) 1230 if (list_empty(head))
1231 goto out; 1231 goto out;
1232 1232
1233 req = list_entry(head->prev, 1233 req = list_entry(head->prev,
1234 struct ceph_mds_request, r_unsafe_dir_item); 1234 struct ceph_mds_request, r_unsafe_dir_item);
1235 last_tid = req->r_tid; 1235 last_tid = req->r_tid;
1236 1236
1237 do { 1237 do {
1238 ceph_mdsc_get_request(req); 1238 ceph_mdsc_get_request(req);
1239 spin_unlock(&ci->i_unsafe_lock); 1239 spin_unlock(&ci->i_unsafe_lock);
1240 1240
1241 dout("dir_fsync %p wait on tid %llu (until %llu)\n", 1241 dout("dir_fsync %p wait on tid %llu (until %llu)\n",
1242 inode, req->r_tid, last_tid); 1242 inode, req->r_tid, last_tid);
1243 if (req->r_timeout) { 1243 if (req->r_timeout) {
1244 ret = wait_for_completion_timeout( 1244 ret = wait_for_completion_timeout(
1245 &req->r_safe_completion, req->r_timeout); 1245 &req->r_safe_completion, req->r_timeout);
1246 if (ret > 0) 1246 if (ret > 0)
1247 ret = 0; 1247 ret = 0;
1248 else if (ret == 0) 1248 else if (ret == 0)
1249 ret = -EIO; /* timed out */ 1249 ret = -EIO; /* timed out */
1250 } else { 1250 } else {
1251 wait_for_completion(&req->r_safe_completion); 1251 wait_for_completion(&req->r_safe_completion);
1252 } 1252 }
1253 ceph_mdsc_put_request(req); 1253 ceph_mdsc_put_request(req);
1254 1254
1255 spin_lock(&ci->i_unsafe_lock); 1255 spin_lock(&ci->i_unsafe_lock);
1256 if (ret || list_empty(head)) 1256 if (ret || list_empty(head))
1257 break; 1257 break;
1258 req = list_entry(head->next, 1258 req = list_entry(head->next,
1259 struct ceph_mds_request, r_unsafe_dir_item); 1259 struct ceph_mds_request, r_unsafe_dir_item);
1260 } while (req->r_tid < last_tid); 1260 } while (req->r_tid < last_tid);
1261 out: 1261 out:
1262 spin_unlock(&ci->i_unsafe_lock); 1262 spin_unlock(&ci->i_unsafe_lock);
1263 mutex_unlock(&inode->i_mutex); 1263 mutex_unlock(&inode->i_mutex);
1264 1264
1265 return ret; 1265 return ret;
1266 } 1266 }
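
The practical upshot: an application that needs a directory operation to be durable can fsync() the directory fd, which blocks in the loop above until the MDS has committed every outstanding unsafe op. A userspace illustration (paths hypothetical):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int dfd = open("/mnt/ceph/somedir", O_RDONLY | O_DIRECTORY);

		if (dfd < 0)
			return 1;
		if (renameat(dfd, "tmpname", dfd, "realname") < 0)
			return 1;
		if (fsync(dfd) < 0)	/* returns only once the MDS has
					   safely committed the rename */
			perror("fsync");
		close(dfd);
		return 0;
	}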
1267 1267
1268 /* 1268 /*
1269 * We maintain a private dentry LRU. 1269 * We maintain a private dentry LRU.
1270 * 1270 *
1271 * FIXME: this needs to be changed to a per-mds lru to be useful. 1271 * FIXME: this needs to be changed to a per-mds lru to be useful.
1272 */ 1272 */
1273 void ceph_dentry_lru_add(struct dentry *dn) 1273 void ceph_dentry_lru_add(struct dentry *dn)
1274 { 1274 {
1275 struct ceph_dentry_info *di = ceph_dentry(dn); 1275 struct ceph_dentry_info *di = ceph_dentry(dn);
1276 struct ceph_mds_client *mdsc; 1276 struct ceph_mds_client *mdsc;
1277 1277
1278 dout("dentry_lru_add %p %p '%.*s'\n", di, dn, 1278 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1279 dn->d_name.len, dn->d_name.name); 1279 dn->d_name.len, dn->d_name.name);
1280 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; 1280 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1281 spin_lock(&mdsc->dentry_lru_lock); 1281 spin_lock(&mdsc->dentry_lru_lock);
1282 list_add_tail(&di->lru, &mdsc->dentry_lru); 1282 list_add_tail(&di->lru, &mdsc->dentry_lru);
1283 mdsc->num_dentry++; 1283 mdsc->num_dentry++;
1284 spin_unlock(&mdsc->dentry_lru_lock); 1284 spin_unlock(&mdsc->dentry_lru_lock);
1285 } 1285 }
1286 1286
1287 void ceph_dentry_lru_touch(struct dentry *dn) 1287 void ceph_dentry_lru_touch(struct dentry *dn)
1288 { 1288 {
1289 struct ceph_dentry_info *di = ceph_dentry(dn); 1289 struct ceph_dentry_info *di = ceph_dentry(dn);
1290 struct ceph_mds_client *mdsc; 1290 struct ceph_mds_client *mdsc;
1291 1291
1292 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, 1292 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
1293 dn->d_name.len, dn->d_name.name, di->offset); 1293 dn->d_name.len, dn->d_name.name, di->offset);
1294 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; 1294 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1295 spin_lock(&mdsc->dentry_lru_lock); 1295 spin_lock(&mdsc->dentry_lru_lock);
1296 list_move_tail(&di->lru, &mdsc->dentry_lru); 1296 list_move_tail(&di->lru, &mdsc->dentry_lru);
1297 spin_unlock(&mdsc->dentry_lru_lock); 1297 spin_unlock(&mdsc->dentry_lru_lock);
1298 } 1298 }
1299 1299
1300 void ceph_dentry_lru_del(struct dentry *dn) 1300 void ceph_dentry_lru_del(struct dentry *dn)
1301 { 1301 {
1302 struct ceph_dentry_info *di = ceph_dentry(dn); 1302 struct ceph_dentry_info *di = ceph_dentry(dn);
1303 struct ceph_mds_client *mdsc; 1303 struct ceph_mds_client *mdsc;
1304 1304
1305 dout("dentry_lru_del %p %p '%.*s'\n", di, dn, 1305 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1306 dn->d_name.len, dn->d_name.name); 1306 dn->d_name.len, dn->d_name.name);
1307 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; 1307 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1308 spin_lock(&mdsc->dentry_lru_lock); 1308 spin_lock(&mdsc->dentry_lru_lock);
1309 list_del_init(&di->lru); 1309 list_del_init(&di->lru);
1310 mdsc->num_dentry--; 1310 mdsc->num_dentry--;
1311 spin_unlock(&mdsc->dentry_lru_lock); 1311 spin_unlock(&mdsc->dentry_lru_lock);
1312 } 1312 }
1313 1313
1314 /* 1314 /*
1315 * Return name hash for a given dentry. This is dependent on 1315 * Return name hash for a given dentry. This is dependent on
1316 * the parent directory's hash function. 1316 * the parent directory's hash function.
1317 */ 1317 */
1318 unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) 1318 unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
1319 { 1319 {
1320 struct ceph_inode_info *dci = ceph_inode(dir); 1320 struct ceph_inode_info *dci = ceph_inode(dir);
1321 1321
1322 switch (dci->i_dir_layout.dl_dir_hash) { 1322 switch (dci->i_dir_layout.dl_dir_hash) {
1323 case 0: /* for backward compat */ 1323 case 0: /* for backward compat */
1324 case CEPH_STR_HASH_LINUX: 1324 case CEPH_STR_HASH_LINUX:
1325 return dn->d_name.hash; 1325 return dn->d_name.hash;
1326 1326
1327 default: 1327 default:
1328 return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, 1328 return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
1329 dn->d_name.name, dn->d_name.len); 1329 dn->d_name.name, dn->d_name.len);
1330 } 1330 }
1331 } 1331 }
1332 1332
1333 const struct file_operations ceph_dir_fops = { 1333 const struct file_operations ceph_dir_fops = {
1334 .read = ceph_read_dir, 1334 .read = ceph_read_dir,
1335 .readdir = ceph_readdir, 1335 .readdir = ceph_readdir,
1336 .llseek = ceph_dir_llseek, 1336 .llseek = ceph_dir_llseek,
1337 .open = ceph_open, 1337 .open = ceph_open,
1338 .release = ceph_release, 1338 .release = ceph_release,
1339 .unlocked_ioctl = ceph_ioctl, 1339 .unlocked_ioctl = ceph_ioctl,
1340 .fsync = ceph_dir_fsync, 1340 .fsync = ceph_dir_fsync,
1341 }; 1341 };
1342 1342
1343 const struct inode_operations ceph_dir_iops = { 1343 const struct inode_operations ceph_dir_iops = {
1344 .lookup = ceph_lookup, 1344 .lookup = ceph_lookup,
1345 .permission = ceph_permission, 1345 .permission = ceph_permission,
1346 .getattr = ceph_getattr, 1346 .getattr = ceph_getattr,
1347 .setattr = ceph_setattr, 1347 .setattr = ceph_setattr,
1348 .setxattr = ceph_setxattr, 1348 .setxattr = ceph_setxattr,
1349 .getxattr = ceph_getxattr, 1349 .getxattr = ceph_getxattr,
1350 .listxattr = ceph_listxattr, 1350 .listxattr = ceph_listxattr,
1351 .removexattr = ceph_removexattr, 1351 .removexattr = ceph_removexattr,
1352 .mknod = ceph_mknod, 1352 .mknod = ceph_mknod,
1353 .symlink = ceph_symlink, 1353 .symlink = ceph_symlink,
1354 .mkdir = ceph_mkdir, 1354 .mkdir = ceph_mkdir,
1355 .link = ceph_link, 1355 .link = ceph_link,
1356 .unlink = ceph_unlink, 1356 .unlink = ceph_unlink,
1357 .rmdir = ceph_unlink, 1357 .rmdir = ceph_unlink,
1358 .rename = ceph_rename, 1358 .rename = ceph_rename,
1359 .create = ceph_create, 1359 .create = ceph_create,
1360 }; 1360 };
1361 1361
1362 const struct dentry_operations ceph_dentry_ops = { 1362 const struct dentry_operations ceph_dentry_ops = {
1363 .d_revalidate = ceph_d_revalidate, 1363 .d_revalidate = ceph_d_revalidate,
1364 .d_release = ceph_d_release, 1364 .d_release = ceph_d_release,
1365 .d_prune = ceph_d_prune, 1365 .d_prune = ceph_d_prune,
1366 }; 1366 };
1367 1367
1368 const struct dentry_operations ceph_snapdir_dentry_ops = { 1368 const struct dentry_operations ceph_snapdir_dentry_ops = {
1369 .d_revalidate = ceph_snapdir_d_revalidate, 1369 .d_revalidate = ceph_snapdir_d_revalidate,
1370 .d_release = ceph_d_release, 1370 .d_release = ceph_d_release,
1371 }; 1371 };
1372 1372
1373 const struct dentry_operations ceph_snap_dentry_ops = { 1373 const struct dentry_operations ceph_snap_dentry_ops = {
1374 .d_release = ceph_d_release, 1374 .d_release = ceph_d_release,
1375 .d_prune = ceph_d_prune, 1375 .d_prune = ceph_d_prune,
1376 }; 1376 };
1377 1377
fs/ceph/mds_client.c
1 #include <linux/ceph/ceph_debug.h> 1 #include <linux/ceph/ceph_debug.h>
2 2
3 #include <linux/fs.h> 3 #include <linux/fs.h>
4 #include <linux/wait.h> 4 #include <linux/wait.h>
5 #include <linux/slab.h> 5 #include <linux/slab.h>
6 #include <linux/sched.h> 6 #include <linux/sched.h>
7 #include <linux/debugfs.h> 7 #include <linux/debugfs.h>
8 #include <linux/seq_file.h> 8 #include <linux/seq_file.h>
9 9
10 #include "super.h" 10 #include "super.h"
11 #include "mds_client.h" 11 #include "mds_client.h"
12 12
13 #include <linux/ceph/messenger.h> 13 #include <linux/ceph/messenger.h>
14 #include <linux/ceph/decode.h> 14 #include <linux/ceph/decode.h>
15 #include <linux/ceph/pagelist.h> 15 #include <linux/ceph/pagelist.h>
16 #include <linux/ceph/auth.h> 16 #include <linux/ceph/auth.h>
17 #include <linux/ceph/debugfs.h> 17 #include <linux/ceph/debugfs.h>
18 18
19 /* 19 /*
20 * A cluster of MDS (metadata server) daemons is responsible for 20 * A cluster of MDS (metadata server) daemons is responsible for
21 * managing the file system namespace (the directory hierarchy and 21 * managing the file system namespace (the directory hierarchy and
22 * inodes) and for coordinating shared access to storage. Metadata is 22 * inodes) and for coordinating shared access to storage. Metadata is
23 * partitioned hierarchically across a number of servers, and that 23 * partitioned hierarchically across a number of servers, and that
24 * partition varies over time as the cluster adjusts the distribution 24 * partition varies over time as the cluster adjusts the distribution
25 * in order to balance load. 25 * in order to balance load.
26 * 26 *
27 * The MDS client is primarily responsible for managing synchronous 27 * The MDS client is primarily responsible for managing synchronous
28 * metadata requests for operations like open, unlink, and so forth. 28 * metadata requests for operations like open, unlink, and so forth.
29 * If there is an MDS failure, we find out about it when we (possibly 29 * If there is an MDS failure, we find out about it when we (possibly
30 * request and) receive a new MDS map, and can resubmit affected 30 * request and) receive a new MDS map, and can resubmit affected
31 * requests. 31 * requests.
32 * 32 *
33 * For the most part, though, we take advantage of a lossless 33 * For the most part, though, we take advantage of a lossless
34 * communications channel to the MDS, and do not need to worry about 34 * communications channel to the MDS, and do not need to worry about
35 * timing out or resubmitting requests. 35 * timing out or resubmitting requests.
36 * 36 *
37 * We maintain a stateful "session" with each MDS we interact with. 37 * We maintain a stateful "session" with each MDS we interact with.
38 * Within each session, we send periodic heartbeat messages to ensure 38 * Within each session, we send periodic heartbeat messages to ensure
39 * any capabilities or leases we have been issued remain valid. If 39 * any capabilities or leases we have been issued remain valid. If
40 * the session times out and goes stale, our leases and capabilities 40 * the session times out and goes stale, our leases and capabilities
41 * are no longer valid. 41 * are no longer valid.
42 */ 42 */
43 43
44 struct ceph_reconnect_state { 44 struct ceph_reconnect_state {
45 struct ceph_pagelist *pagelist; 45 struct ceph_pagelist *pagelist;
46 bool flock; 46 bool flock;
47 }; 47 };
48 48
49 static void __wake_requests(struct ceph_mds_client *mdsc, 49 static void __wake_requests(struct ceph_mds_client *mdsc,
50 struct list_head *head); 50 struct list_head *head);
51 51
52 static const struct ceph_connection_operations mds_con_ops; 52 static const struct ceph_connection_operations mds_con_ops;
53 53
54 54
55 /* 55 /*
56 * mds reply parsing 56 * mds reply parsing
57 */ 57 */
58 58
59 /* 59 /*
60 * parse individual inode info 60 * parse individual inode info
61 */ 61 */
62 static int parse_reply_info_in(void **p, void *end, 62 static int parse_reply_info_in(void **p, void *end,
63 struct ceph_mds_reply_info_in *info, 63 struct ceph_mds_reply_info_in *info,
64 int features) 64 int features)
65 { 65 {
66 int err = -EIO; 66 int err = -EIO;
67 67
68 info->in = *p; 68 info->in = *p;
69 *p += sizeof(struct ceph_mds_reply_inode) + 69 *p += sizeof(struct ceph_mds_reply_inode) +
70 sizeof(*info->in->fragtree.splits) * 70 sizeof(*info->in->fragtree.splits) *
71 le32_to_cpu(info->in->fragtree.nsplits); 71 le32_to_cpu(info->in->fragtree.nsplits);
72 72
73 ceph_decode_32_safe(p, end, info->symlink_len, bad); 73 ceph_decode_32_safe(p, end, info->symlink_len, bad);
74 ceph_decode_need(p, end, info->symlink_len, bad); 74 ceph_decode_need(p, end, info->symlink_len, bad);
75 info->symlink = *p; 75 info->symlink = *p;
76 *p += info->symlink_len; 76 *p += info->symlink_len;
77 77
78 if (features & CEPH_FEATURE_DIRLAYOUTHASH) 78 if (features & CEPH_FEATURE_DIRLAYOUTHASH)
79 ceph_decode_copy_safe(p, end, &info->dir_layout, 79 ceph_decode_copy_safe(p, end, &info->dir_layout,
80 sizeof(info->dir_layout), bad); 80 sizeof(info->dir_layout), bad);
81 else 81 else
82 memset(&info->dir_layout, 0, sizeof(info->dir_layout)); 82 memset(&info->dir_layout, 0, sizeof(info->dir_layout));
83 83
84 ceph_decode_32_safe(p, end, info->xattr_len, bad); 84 ceph_decode_32_safe(p, end, info->xattr_len, bad);
85 ceph_decode_need(p, end, info->xattr_len, bad); 85 ceph_decode_need(p, end, info->xattr_len, bad);
86 info->xattr_data = *p; 86 info->xattr_data = *p;
87 *p += info->xattr_len; 87 *p += info->xattr_len;
88 return 0; 88 return 0;
89 bad: 89 bad:
90 return err; 90 return err;
91 } 91 }
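/* [editor's sketch, not part of the diff] parse_reply_info_in() above
 * shows the decode idiom used throughout this file: each
 * ceph_decode_32_safe()/ceph_decode_need() verifies that enough bytes
 * remain before *p is advanced, jumping to the `bad` label otherwise.
 * A minimal, self-contained userspace analogue of that idiom
 * (buf_need/buf_get32 are hypothetical names, not ceph API; endian
 * conversion is omitted, so this assumes a little-endian host):
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int buf_need(void **p, void *end, size_t n)
{
	return (size_t)((char *)end - (char *)*p) >= n;   /* bytes left? */
}

static int buf_get32(void **p, void *end, uint32_t *v)
{
	if (!buf_need(p, end, sizeof(*v)))
		return -1;                  /* would read past `end` */
	memcpy(v, *p, sizeof(*v));
	*p = (char *)*p + sizeof(*v);       /* advance the cursor */
	return 0;
}

int main(void)
{
	unsigned char msg[] = { 3, 0, 0, 0, 'a', 'b', 'c' };  /* len+payload */
	void *p = msg, *end = msg + sizeof(msg);
	uint32_t len;

	if (buf_get32(&p, end, &len) || !buf_need(&p, end, len))
		return 1;                   /* truncated or over-long claim */
	printf("payload '%.*s'\n", (int)len, (char *)p);
	return 0;
}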
92 92
93 /* 93 /*
94 * parse a normal reply, which may contain a (dir+)dentry and/or a 94 * parse a normal reply, which may contain a (dir+)dentry and/or a
95 * target inode. 95 * target inode.
96 */ 96 */
97 static int parse_reply_info_trace(void **p, void *end, 97 static int parse_reply_info_trace(void **p, void *end,
98 struct ceph_mds_reply_info_parsed *info, 98 struct ceph_mds_reply_info_parsed *info,
99 int features) 99 int features)
100 { 100 {
101 int err; 101 int err;
102 102
103 if (info->head->is_dentry) { 103 if (info->head->is_dentry) {
104 err = parse_reply_info_in(p, end, &info->diri, features); 104 err = parse_reply_info_in(p, end, &info->diri, features);
105 if (err < 0) 105 if (err < 0)
106 goto out_bad; 106 goto out_bad;
107 107
108 if (unlikely(*p + sizeof(*info->dirfrag) > end)) 108 if (unlikely(*p + sizeof(*info->dirfrag) > end))
109 goto bad; 109 goto bad;
110 info->dirfrag = *p; 110 info->dirfrag = *p;
111 *p += sizeof(*info->dirfrag) + 111 *p += sizeof(*info->dirfrag) +
112 sizeof(u32)*le32_to_cpu(info->dirfrag->ndist); 112 sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
113 if (unlikely(*p > end)) 113 if (unlikely(*p > end))
114 goto bad; 114 goto bad;
115 115
116 ceph_decode_32_safe(p, end, info->dname_len, bad); 116 ceph_decode_32_safe(p, end, info->dname_len, bad);
117 ceph_decode_need(p, end, info->dname_len, bad); 117 ceph_decode_need(p, end, info->dname_len, bad);
118 info->dname = *p; 118 info->dname = *p;
119 *p += info->dname_len; 119 *p += info->dname_len;
120 info->dlease = *p; 120 info->dlease = *p;
121 *p += sizeof(*info->dlease); 121 *p += sizeof(*info->dlease);
122 } 122 }
123 123
124 if (info->head->is_target) { 124 if (info->head->is_target) {
125 err = parse_reply_info_in(p, end, &info->targeti, features); 125 err = parse_reply_info_in(p, end, &info->targeti, features);
126 if (err < 0) 126 if (err < 0)
127 goto out_bad; 127 goto out_bad;
128 } 128 }
129 129
130 if (unlikely(*p != end)) 130 if (unlikely(*p != end))
131 goto bad; 131 goto bad;
132 return 0; 132 return 0;
133 133
134 bad: 134 bad:
135 err = -EIO; 135 err = -EIO;
136 out_bad: 136 out_bad:
137 pr_err("problem parsing mds trace %d\n", err); 137 pr_err("problem parsing mds trace %d\n", err);
138 return err; 138 return err;
139 } 139 }
140 140
141 /* 141 /*
142 * parse readdir results 142 * parse readdir results
143 */ 143 */
144 static int parse_reply_info_dir(void **p, void *end, 144 static int parse_reply_info_dir(void **p, void *end,
145 struct ceph_mds_reply_info_parsed *info, 145 struct ceph_mds_reply_info_parsed *info,
146 int features) 146 int features)
147 { 147 {
148 u32 num, i = 0; 148 u32 num, i = 0;
149 int err; 149 int err;
150 150
151 info->dir_dir = *p; 151 info->dir_dir = *p;
152 if (*p + sizeof(*info->dir_dir) > end) 152 if (*p + sizeof(*info->dir_dir) > end)
153 goto bad; 153 goto bad;
154 *p += sizeof(*info->dir_dir) + 154 *p += sizeof(*info->dir_dir) +
155 sizeof(u32)*le32_to_cpu(info->dir_dir->ndist); 155 sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
156 if (*p > end) 156 if (*p > end)
157 goto bad; 157 goto bad;
158 158
159 ceph_decode_need(p, end, sizeof(num) + 2, bad); 159 ceph_decode_need(p, end, sizeof(num) + 2, bad);
160 num = ceph_decode_32(p); 160 num = ceph_decode_32(p);
161 info->dir_end = ceph_decode_8(p); 161 info->dir_end = ceph_decode_8(p);
162 info->dir_complete = ceph_decode_8(p); 162 info->dir_complete = ceph_decode_8(p);
163 if (num == 0) 163 if (num == 0)
164 goto done; 164 goto done;
165 165
166 /* alloc large array */ 166 /* alloc large array */
167 info->dir_nr = num; 167 info->dir_nr = num;
168 info->dir_in = kcalloc(num, sizeof(*info->dir_in) + 168 info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
169 sizeof(*info->dir_dname) + 169 sizeof(*info->dir_dname) +
170 sizeof(*info->dir_dname_len) + 170 sizeof(*info->dir_dname_len) +
171 sizeof(*info->dir_dlease), 171 sizeof(*info->dir_dlease),
172 GFP_NOFS); 172 GFP_NOFS);
173 if (info->dir_in == NULL) { 173 if (info->dir_in == NULL) {
174 err = -ENOMEM; 174 err = -ENOMEM;
175 goto out_bad; 175 goto out_bad;
176 } 176 }
177 info->dir_dname = (void *)(info->dir_in + num); 177 info->dir_dname = (void *)(info->dir_in + num);
178 info->dir_dname_len = (void *)(info->dir_dname + num); 178 info->dir_dname_len = (void *)(info->dir_dname + num);
179 info->dir_dlease = (void *)(info->dir_dname_len + num); 179 info->dir_dlease = (void *)(info->dir_dname_len + num);
180 180
181 while (num) { 181 while (num) {
182 /* dentry */ 182 /* dentry */
183 ceph_decode_need(p, end, sizeof(u32)*2, bad); 183 ceph_decode_need(p, end, sizeof(u32)*2, bad);
184 info->dir_dname_len[i] = ceph_decode_32(p); 184 info->dir_dname_len[i] = ceph_decode_32(p);
185 ceph_decode_need(p, end, info->dir_dname_len[i], bad); 185 ceph_decode_need(p, end, info->dir_dname_len[i], bad);
186 info->dir_dname[i] = *p; 186 info->dir_dname[i] = *p;
187 *p += info->dir_dname_len[i]; 187 *p += info->dir_dname_len[i];
188 dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i], 188 dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
189 info->dir_dname[i]); 189 info->dir_dname[i]);
190 info->dir_dlease[i] = *p; 190 info->dir_dlease[i] = *p;
191 *p += sizeof(struct ceph_mds_reply_lease); 191 *p += sizeof(struct ceph_mds_reply_lease);
192 192
193 /* inode */ 193 /* inode */
194 err = parse_reply_info_in(p, end, &info->dir_in[i], features); 194 err = parse_reply_info_in(p, end, &info->dir_in[i], features);
195 if (err < 0) 195 if (err < 0)
196 goto out_bad; 196 goto out_bad;
197 i++; 197 i++;
198 num--; 198 num--;
199 } 199 }
200 200
201 done: 201 done:
202 if (*p != end) 202 if (*p != end)
203 goto bad; 203 goto bad;
204 return 0; 204 return 0;
205 205
206 bad: 206 bad:
207 err = -EIO; 207 err = -EIO;
208 out_bad: 208 out_bad:
209 pr_err("problem parsing dir contents %d\n", err); 209 pr_err("problem parsing dir contents %d\n", err);
210 return err; 210 return err;
211 } 211 }
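/* [editor's sketch, not part of the diff] parse_reply_info_dir() sizes
 * one kcalloc() for four parallel per-entry arrays and carves it up
 * with pointer arithmetic (dir_dname = dir_in + num, ...), so a single
 * allocation, and the single kfree() in destroy_reply_info(), covers
 * all of them. Userspace shape of that carve-up, with hypothetical
 * element types:
 */
#include <stdlib.h>

struct entry { long a; };               /* stand-in for the dirent info */

int main(void)
{
	size_t num = 8;
	struct entry *in;
	char **names;
	unsigned int *name_lens;

	/* one block: num entries, then num name pointers, then num lengths;
	 * ordering from most- to least-aligned keeps the carve-up safe */
	in = calloc(num, sizeof(*in) + sizeof(*names) + sizeof(*name_lens));
	if (!in)
		return 1;
	names = (char **)(in + num);            /* second array starts here */
	name_lens = (unsigned int *)(names + num);
	name_lens[num - 1] = 42;                /* all within the one block */
	free(in);                               /* one free releases all three */
	return 0;
}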
212 212
213 /* 213 /*
214 * parse fcntl F_GETLK results 214 * parse fcntl F_GETLK results
215 */ 215 */
216 static int parse_reply_info_filelock(void **p, void *end, 216 static int parse_reply_info_filelock(void **p, void *end,
217 struct ceph_mds_reply_info_parsed *info, 217 struct ceph_mds_reply_info_parsed *info,
218 int features) 218 int features)
219 { 219 {
220 if (*p + sizeof(*info->filelock_reply) > end) 220 if (*p + sizeof(*info->filelock_reply) > end)
221 goto bad; 221 goto bad;
222 222
223 info->filelock_reply = *p; 223 info->filelock_reply = *p;
224 *p += sizeof(*info->filelock_reply); 224 *p += sizeof(*info->filelock_reply);
225 225
226 if (unlikely(*p != end)) 226 if (unlikely(*p != end))
227 goto bad; 227 goto bad;
228 return 0; 228 return 0;
229 229
230 bad: 230 bad:
231 return -EIO; 231 return -EIO;
232 } 232 }
233 233
234 /* 234 /*
235 * parse extra results 235 * parse extra results
236 */ 236 */
237 static int parse_reply_info_extra(void **p, void *end, 237 static int parse_reply_info_extra(void **p, void *end,
238 struct ceph_mds_reply_info_parsed *info, 238 struct ceph_mds_reply_info_parsed *info,
239 int features) 239 int features)
240 { 240 {
241 if (info->head->op == CEPH_MDS_OP_GETFILELOCK) 241 if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
242 return parse_reply_info_filelock(p, end, info, features); 242 return parse_reply_info_filelock(p, end, info, features);
243 else 243 else
244 return parse_reply_info_dir(p, end, info, features); 244 return parse_reply_info_dir(p, end, info, features);
245 } 245 }
246 246
247 /* 247 /*
248 * parse entire mds reply 248 * parse entire mds reply
249 */ 249 */
250 static int parse_reply_info(struct ceph_msg *msg, 250 static int parse_reply_info(struct ceph_msg *msg,
251 struct ceph_mds_reply_info_parsed *info, 251 struct ceph_mds_reply_info_parsed *info,
252 int features) 252 int features)
253 { 253 {
254 void *p, *end; 254 void *p, *end;
255 u32 len; 255 u32 len;
256 int err; 256 int err;
257 257
258 info->head = msg->front.iov_base; 258 info->head = msg->front.iov_base;
259 p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head); 259 p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
260 end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head); 260 end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
261 261
262 /* trace */ 262 /* trace */
263 ceph_decode_32_safe(&p, end, len, bad); 263 ceph_decode_32_safe(&p, end, len, bad);
264 if (len > 0) { 264 if (len > 0) {
265 ceph_decode_need(&p, end, len, bad);
265 err = parse_reply_info_trace(&p, p+len, info, features); 266 err = parse_reply_info_trace(&p, p+len, info, features);
266 if (err < 0) 267 if (err < 0)
267 goto out_bad; 268 goto out_bad;
268 } 269 }
269 270
270 /* extra */ 271 /* extra */
271 ceph_decode_32_safe(&p, end, len, bad); 272 ceph_decode_32_safe(&p, end, len, bad);
272 if (len > 0) { 273 if (len > 0) {
274 ceph_decode_need(&p, end, len, bad);
273 err = parse_reply_info_extra(&p, p+len, info, features); 275 err = parse_reply_info_extra(&p, p+len, info, features);
274 if (err < 0) 276 if (err < 0)
275 goto out_bad; 277 goto out_bad;
276 } 278 }
277 279
278 /* snap blob */ 280 /* snap blob */
279 ceph_decode_32_safe(&p, end, len, bad); 281 ceph_decode_32_safe(&p, end, len, bad);
280 info->snapblob_len = len; 282 info->snapblob_len = len;
281 info->snapblob = p; 283 info->snapblob = p;
282 p += len; 284 p += len;
283 285
284 if (p != end) 286 if (p != end)
285 goto bad; 287 goto bad;
286 return 0; 288 return 0;
287 289
288 bad: 290 bad:
289 err = -EIO; 291 err = -EIO;
290 out_bad: 292 out_bad:
291 pr_err("mds parse_reply err %d\n", err); 293 pr_err("mds parse_reply err %d\n", err);
292 return err; 294 return err;
293 } 295 }
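/* [editor's note + sketch, not part of the diff] The two lines added
 * above (new lines 265 and 274) bound the declared length of the
 * trace/extra section against the real end of the message before
 * `p + len` is handed to the sub-parser as its `end`. Without that
 * check a corrupt or hostile `len` would place the sub-parser's bound
 * past the buffer. The guard in isolation (section_fits is a
 * hypothetical name):
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* equivalent of ceph_decode_need(): does a claimed `len` fit before `end`? */
static int section_fits(const void *p, const void *end, uint32_t len)
{
	return (size_t)((const char *)end - (const char *)p) >= len;
}

int main(void)
{
	char msg[16];
	const void *p = msg, *end = msg + sizeof(msg);

	assert(section_fits(p, end, 16));   /* exactly fills the message */
	assert(!section_fits(p, end, 17));  /* claims more than remains */
	return 0;
}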
294 296
295 static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) 297 static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
296 { 298 {
297 kfree(info->dir_in); 299 kfree(info->dir_in);
298 } 300 }
299 301
300 302
301 /* 303 /*
302 * sessions 304 * sessions
303 */ 305 */
304 static const char *session_state_name(int s) 306 static const char *session_state_name(int s)
305 { 307 {
306 switch (s) { 308 switch (s) {
307 case CEPH_MDS_SESSION_NEW: return "new"; 309 case CEPH_MDS_SESSION_NEW: return "new";
308 case CEPH_MDS_SESSION_OPENING: return "opening"; 310 case CEPH_MDS_SESSION_OPENING: return "opening";
309 case CEPH_MDS_SESSION_OPEN: return "open"; 311 case CEPH_MDS_SESSION_OPEN: return "open";
310 case CEPH_MDS_SESSION_HUNG: return "hung"; 312 case CEPH_MDS_SESSION_HUNG: return "hung";
311 case CEPH_MDS_SESSION_CLOSING: return "closing"; 313 case CEPH_MDS_SESSION_CLOSING: return "closing";
312 case CEPH_MDS_SESSION_RESTARTING: return "restarting"; 314 case CEPH_MDS_SESSION_RESTARTING: return "restarting";
313 case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting"; 315 case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
314 default: return "???"; 316 default: return "???";
315 } 317 }
316 } 318 }
317 319
318 static struct ceph_mds_session *get_session(struct ceph_mds_session *s) 320 static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
319 { 321 {
320 if (atomic_inc_not_zero(&s->s_ref)) { 322 if (atomic_inc_not_zero(&s->s_ref)) {
321 dout("mdsc get_session %p %d -> %d\n", s, 323 dout("mdsc get_session %p %d -> %d\n", s,
322 atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref)); 324 atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
323 return s; 325 return s;
324 } else { 326 } else {
325 dout("mdsc get_session %p 0 -- FAIL", s); 327 dout("mdsc get_session %p 0 -- FAIL", s);
326 return NULL; 328 return NULL;
327 } 329 }
328 } 330 }
329 331
330 void ceph_put_mds_session(struct ceph_mds_session *s) 332 void ceph_put_mds_session(struct ceph_mds_session *s)
331 { 333 {
332 dout("mdsc put_session %p %d -> %d\n", s, 334 dout("mdsc put_session %p %d -> %d\n", s,
333 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); 335 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
334 if (atomic_dec_and_test(&s->s_ref)) { 336 if (atomic_dec_and_test(&s->s_ref)) {
335 if (s->s_authorizer) 337 if (s->s_authorizer)
336 s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer( 338 s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
337 s->s_mdsc->fsc->client->monc.auth, 339 s->s_mdsc->fsc->client->monc.auth,
338 s->s_authorizer); 340 s->s_authorizer);
339 kfree(s); 341 kfree(s);
340 } 342 }
341 } 343 }
342 344
343 /* 345 /*
344 * called under mdsc->mutex 346 * called under mdsc->mutex
345 */ 347 */
346 struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, 348 struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
347 int mds) 349 int mds)
348 { 350 {
349 struct ceph_mds_session *session; 351 struct ceph_mds_session *session;
350 352
351 if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL) 353 if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
352 return NULL; 354 return NULL;
353 session = mdsc->sessions[mds]; 355 session = mdsc->sessions[mds];
354 dout("lookup_mds_session %p %d\n", session, 356 dout("lookup_mds_session %p %d\n", session,
355 atomic_read(&session->s_ref)); 357 atomic_read(&session->s_ref));
356 get_session(session); 358 get_session(session);
357 return session; 359 return session;
358 } 360 }
359 361
360 static bool __have_session(struct ceph_mds_client *mdsc, int mds) 362 static bool __have_session(struct ceph_mds_client *mdsc, int mds)
361 { 363 {
362 if (mds >= mdsc->max_sessions) 364 if (mds >= mdsc->max_sessions)
363 return false; 365 return false;
364 return mdsc->sessions[mds]; 366 return mdsc->sessions[mds];
365 } 367 }
366 368
367 static int __verify_registered_session(struct ceph_mds_client *mdsc, 369 static int __verify_registered_session(struct ceph_mds_client *mdsc,
368 struct ceph_mds_session *s) 370 struct ceph_mds_session *s)
369 { 371 {
370 if (s->s_mds >= mdsc->max_sessions || 372 if (s->s_mds >= mdsc->max_sessions ||
371 mdsc->sessions[s->s_mds] != s) 373 mdsc->sessions[s->s_mds] != s)
372 return -ENOENT; 374 return -ENOENT;
373 return 0; 375 return 0;
374 } 376 }
375 377
376 /* 378 /*
377 * create+register a new session for given mds. 379 * create+register a new session for given mds.
378 * called under mdsc->mutex. 380 * called under mdsc->mutex.
379 */ 381 */
380 static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, 382 static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
381 int mds) 383 int mds)
382 { 384 {
383 struct ceph_mds_session *s; 385 struct ceph_mds_session *s;
384 386
385 s = kzalloc(sizeof(*s), GFP_NOFS); 387 s = kzalloc(sizeof(*s), GFP_NOFS);
386 if (!s) 388 if (!s)
387 return ERR_PTR(-ENOMEM); 389 return ERR_PTR(-ENOMEM);
388 s->s_mdsc = mdsc; 390 s->s_mdsc = mdsc;
389 s->s_mds = mds; 391 s->s_mds = mds;
390 s->s_state = CEPH_MDS_SESSION_NEW; 392 s->s_state = CEPH_MDS_SESSION_NEW;
391 s->s_ttl = 0; 393 s->s_ttl = 0;
392 s->s_seq = 0; 394 s->s_seq = 0;
393 mutex_init(&s->s_mutex); 395 mutex_init(&s->s_mutex);
394 396
395 ceph_con_init(mdsc->fsc->client->msgr, &s->s_con); 397 ceph_con_init(mdsc->fsc->client->msgr, &s->s_con);
396 s->s_con.private = s; 398 s->s_con.private = s;
397 s->s_con.ops = &mds_con_ops; 399 s->s_con.ops = &mds_con_ops;
398 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; 400 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
399 s->s_con.peer_name.num = cpu_to_le64(mds); 401 s->s_con.peer_name.num = cpu_to_le64(mds);
400 402
401 spin_lock_init(&s->s_cap_lock); 403 spin_lock_init(&s->s_gen_ttl_lock);
402 s->s_cap_gen = 0; 404 s->s_cap_gen = 0;
403 s->s_cap_ttl = 0; 405 s->s_cap_ttl = 0;
406
407 spin_lock_init(&s->s_cap_lock);
404 s->s_renew_requested = 0; 408 s->s_renew_requested = 0;
405 s->s_renew_seq = 0; 409 s->s_renew_seq = 0;
406 INIT_LIST_HEAD(&s->s_caps); 410 INIT_LIST_HEAD(&s->s_caps);
407 s->s_nr_caps = 0; 411 s->s_nr_caps = 0;
408 s->s_trim_caps = 0; 412 s->s_trim_caps = 0;
409 atomic_set(&s->s_ref, 1); 413 atomic_set(&s->s_ref, 1);
410 INIT_LIST_HEAD(&s->s_waiting); 414 INIT_LIST_HEAD(&s->s_waiting);
411 INIT_LIST_HEAD(&s->s_unsafe); 415 INIT_LIST_HEAD(&s->s_unsafe);
412 s->s_num_cap_releases = 0; 416 s->s_num_cap_releases = 0;
413 s->s_cap_iterator = NULL; 417 s->s_cap_iterator = NULL;
414 INIT_LIST_HEAD(&s->s_cap_releases); 418 INIT_LIST_HEAD(&s->s_cap_releases);
415 INIT_LIST_HEAD(&s->s_cap_releases_done); 419 INIT_LIST_HEAD(&s->s_cap_releases_done);
416 INIT_LIST_HEAD(&s->s_cap_flushing); 420 INIT_LIST_HEAD(&s->s_cap_flushing);
417 INIT_LIST_HEAD(&s->s_cap_snaps_flushing); 421 INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
418 422
419 dout("register_session mds%d\n", mds); 423 dout("register_session mds%d\n", mds);
420 if (mds >= mdsc->max_sessions) { 424 if (mds >= mdsc->max_sessions) {
421 int newmax = 1 << get_count_order(mds+1); 425 int newmax = 1 << get_count_order(mds+1);
422 struct ceph_mds_session **sa; 426 struct ceph_mds_session **sa;
423 427
424 dout("register_session realloc to %d\n", newmax); 428 dout("register_session realloc to %d\n", newmax);
425 sa = kcalloc(newmax, sizeof(void *), GFP_NOFS); 429 sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
426 if (sa == NULL) 430 if (sa == NULL)
427 goto fail_realloc; 431 goto fail_realloc;
428 if (mdsc->sessions) { 432 if (mdsc->sessions) {
429 memcpy(sa, mdsc->sessions, 433 memcpy(sa, mdsc->sessions,
430 mdsc->max_sessions * sizeof(void *)); 434 mdsc->max_sessions * sizeof(void *));
431 kfree(mdsc->sessions); 435 kfree(mdsc->sessions);
432 } 436 }
433 mdsc->sessions = sa; 437 mdsc->sessions = sa;
434 mdsc->max_sessions = newmax; 438 mdsc->max_sessions = newmax;
435 } 439 }
436 mdsc->sessions[mds] = s; 440 mdsc->sessions[mds] = s;
437 atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ 441 atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
438 442
439 ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); 443 ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
440 444
441 return s; 445 return s;
442 446
443 fail_realloc: 447 fail_realloc:
444 kfree(s); 448 kfree(s);
445 return ERR_PTR(-ENOMEM); 449 return ERR_PTR(-ENOMEM);
446 } 450 }
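/* [editor's sketch, not part of the diff] The realloc above rounds the
 * session array up to the next power of two: newmax =
 * 1 << get_count_order(mds + 1), i.e. 2^ceil(log2(mds + 1)), so
 * repeated registrations trigger O(log n) reallocations rather than
 * one per new mds. A portable stand-in for get_count_order()
 * (count_order is a hypothetical name, valid for n >= 1):
 */
#include <assert.h>

static int count_order(unsigned int n)
{
	int order = 0;

	while ((1u << order) < n)       /* smallest order with 2^order >= n */
		order++;
	return order;
}

int main(void)
{
	assert((1u << count_order(1)) == 1);
	assert((1u << count_order(5)) == 8);   /* mds 4 -> array of 8 slots */
	assert((1u << count_order(8)) == 8);
	return 0;
}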
447 451
448 /* 452 /*
449 * called under mdsc->mutex 453 * called under mdsc->mutex
450 */ 454 */
451 static void __unregister_session(struct ceph_mds_client *mdsc, 455 static void __unregister_session(struct ceph_mds_client *mdsc,
452 struct ceph_mds_session *s) 456 struct ceph_mds_session *s)
453 { 457 {
454 dout("__unregister_session mds%d %p\n", s->s_mds, s); 458 dout("__unregister_session mds%d %p\n", s->s_mds, s);
455 BUG_ON(mdsc->sessions[s->s_mds] != s); 459 BUG_ON(mdsc->sessions[s->s_mds] != s);
456 mdsc->sessions[s->s_mds] = NULL; 460 mdsc->sessions[s->s_mds] = NULL;
457 ceph_con_close(&s->s_con); 461 ceph_con_close(&s->s_con);
458 ceph_put_mds_session(s); 462 ceph_put_mds_session(s);
459 } 463 }
460 464
461 /* 465 /*
462 * drop session refs in request. 466 * drop session refs in request.
463 * 467 *
464 * should be last request ref, or hold mdsc->mutex 468 * should be last request ref, or hold mdsc->mutex
465 */ 469 */
466 static void put_request_session(struct ceph_mds_request *req) 470 static void put_request_session(struct ceph_mds_request *req)
467 { 471 {
468 if (req->r_session) { 472 if (req->r_session) {
469 ceph_put_mds_session(req->r_session); 473 ceph_put_mds_session(req->r_session);
470 req->r_session = NULL; 474 req->r_session = NULL;
471 } 475 }
472 } 476 }
473 477
474 void ceph_mdsc_release_request(struct kref *kref) 478 void ceph_mdsc_release_request(struct kref *kref)
475 { 479 {
476 struct ceph_mds_request *req = container_of(kref, 480 struct ceph_mds_request *req = container_of(kref,
477 struct ceph_mds_request, 481 struct ceph_mds_request,
478 r_kref); 482 r_kref);
479 if (req->r_request) 483 if (req->r_request)
480 ceph_msg_put(req->r_request); 484 ceph_msg_put(req->r_request);
481 if (req->r_reply) { 485 if (req->r_reply) {
482 ceph_msg_put(req->r_reply); 486 ceph_msg_put(req->r_reply);
483 destroy_reply_info(&req->r_reply_info); 487 destroy_reply_info(&req->r_reply_info);
484 } 488 }
485 if (req->r_inode) { 489 if (req->r_inode) {
486 ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); 490 ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
487 iput(req->r_inode); 491 iput(req->r_inode);
488 } 492 }
489 if (req->r_locked_dir) 493 if (req->r_locked_dir)
490 ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); 494 ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
491 if (req->r_target_inode) 495 if (req->r_target_inode)
492 iput(req->r_target_inode); 496 iput(req->r_target_inode);
493 if (req->r_dentry) 497 if (req->r_dentry)
494 dput(req->r_dentry); 498 dput(req->r_dentry);
495 if (req->r_old_dentry) { 499 if (req->r_old_dentry) {
496 /* 500 /*
497 * track (and drop pins for) r_old_dentry_dir 501 * track (and drop pins for) r_old_dentry_dir
498 * separately, since r_old_dentry's d_parent may have 502 * separately, since r_old_dentry's d_parent may have
499 * changed between the dir mutex being dropped and 503 * changed between the dir mutex being dropped and
500 * this request being freed. 504 * this request being freed.
501 */ 505 */
502 ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), 506 ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
503 CEPH_CAP_PIN); 507 CEPH_CAP_PIN);
504 dput(req->r_old_dentry); 508 dput(req->r_old_dentry);
505 iput(req->r_old_dentry_dir); 509 iput(req->r_old_dentry_dir);
506 } 510 }
507 kfree(req->r_path1); 511 kfree(req->r_path1);
508 kfree(req->r_path2); 512 kfree(req->r_path2);
509 put_request_session(req); 513 put_request_session(req);
510 ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation); 514 ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
511 kfree(req); 515 kfree(req);
512 } 516 }
513 517
514 /* 518 /*
515 * lookup request, bump ref if found. 519 * lookup request, bump ref if found.
516 * 520 *
517 * called under mdsc->mutex. 521 * called under mdsc->mutex.
518 */ 522 */
519 static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, 523 static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
520 u64 tid) 524 u64 tid)
521 { 525 {
522 struct ceph_mds_request *req; 526 struct ceph_mds_request *req;
523 struct rb_node *n = mdsc->request_tree.rb_node; 527 struct rb_node *n = mdsc->request_tree.rb_node;
524 528
525 while (n) { 529 while (n) {
526 req = rb_entry(n, struct ceph_mds_request, r_node); 530 req = rb_entry(n, struct ceph_mds_request, r_node);
527 if (tid < req->r_tid) 531 if (tid < req->r_tid)
528 n = n->rb_left; 532 n = n->rb_left;
529 else if (tid > req->r_tid) 533 else if (tid > req->r_tid)
530 n = n->rb_right; 534 n = n->rb_right;
531 else { 535 else {
532 ceph_mdsc_get_request(req); 536 ceph_mdsc_get_request(req);
533 return req; 537 return req;
534 } 538 }
535 } 539 }
536 return NULL; 540 return NULL;
537 } 541 }
538 542
539 static void __insert_request(struct ceph_mds_client *mdsc, 543 static void __insert_request(struct ceph_mds_client *mdsc,
540 struct ceph_mds_request *new) 544 struct ceph_mds_request *new)
541 { 545 {
542 struct rb_node **p = &mdsc->request_tree.rb_node; 546 struct rb_node **p = &mdsc->request_tree.rb_node;
543 struct rb_node *parent = NULL; 547 struct rb_node *parent = NULL;
544 struct ceph_mds_request *req = NULL; 548 struct ceph_mds_request *req = NULL;
545 549
546 while (*p) { 550 while (*p) {
547 parent = *p; 551 parent = *p;
548 req = rb_entry(parent, struct ceph_mds_request, r_node); 552 req = rb_entry(parent, struct ceph_mds_request, r_node);
549 if (new->r_tid < req->r_tid) 553 if (new->r_tid < req->r_tid)
550 p = &(*p)->rb_left; 554 p = &(*p)->rb_left;
551 else if (new->r_tid > req->r_tid) 555 else if (new->r_tid > req->r_tid)
552 p = &(*p)->rb_right; 556 p = &(*p)->rb_right;
553 else 557 else
554 BUG(); 558 BUG();
555 } 559 }
556 560
557 rb_link_node(&new->r_node, parent, p); 561 rb_link_node(&new->r_node, parent, p);
558 rb_insert_color(&new->r_node, &mdsc->request_tree); 562 rb_insert_color(&new->r_node, &mdsc->request_tree);
559 } 563 }
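/* [editor's sketch, not part of the diff] __lookup_request() and
 * __insert_request() keep in-flight requests in an rbtree ordered by
 * tid; the walk below is the same descend-left/descend-right logic on
 * a plain (unbalanced) BST, which is what rb_link_node() +
 * rb_insert_color() perform before rebalancing. Hypothetical
 * standalone code, not the kernel rbtree:
 */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

struct req {
	uint64_t tid;
	struct req *left, *right;
};

static void insert_req(struct req **root, struct req *new)
{
	struct req **p = root;

	while (*p) {
		if (new->tid < (*p)->tid)
			p = &(*p)->left;
		else if (new->tid > (*p)->tid)
			p = &(*p)->right;
		else
			abort();        /* duplicate tid: caller bug (BUG()) */
	}
	*p = new;                       /* link into the empty slot */
}

static struct req *lookup_req(struct req *n, uint64_t tid)
{
	while (n && n->tid != tid)
		n = tid < n->tid ? n->left : n->right;
	return n;
}

int main(void)
{
	struct req a = { .tid = 2 }, b = { .tid = 1 }, c = { .tid = 3 };
	struct req *root = NULL;

	insert_req(&root, &a);
	insert_req(&root, &b);
	insert_req(&root, &c);
	assert(lookup_req(root, 3) == &c);
	assert(lookup_req(root, 9) == NULL);
	return 0;
}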
560 564
561 /* 565 /*
562 * Register an in-flight request, and assign a tid. Link to directory 566 * Register an in-flight request, and assign a tid. Link to directory
563 * are modifying (if any). 567 * are modifying (if any).
564 * 568 *
565 * Called under mdsc->mutex. 569 * Called under mdsc->mutex.
566 */ 570 */
567 static void __register_request(struct ceph_mds_client *mdsc, 571 static void __register_request(struct ceph_mds_client *mdsc,
568 struct ceph_mds_request *req, 572 struct ceph_mds_request *req,
569 struct inode *dir) 573 struct inode *dir)
570 { 574 {
571 req->r_tid = ++mdsc->last_tid; 575 req->r_tid = ++mdsc->last_tid;
572 if (req->r_num_caps) 576 if (req->r_num_caps)
573 ceph_reserve_caps(mdsc, &req->r_caps_reservation, 577 ceph_reserve_caps(mdsc, &req->r_caps_reservation,
574 req->r_num_caps); 578 req->r_num_caps);
575 dout("__register_request %p tid %lld\n", req, req->r_tid); 579 dout("__register_request %p tid %lld\n", req, req->r_tid);
576 ceph_mdsc_get_request(req); 580 ceph_mdsc_get_request(req);
577 __insert_request(mdsc, req); 581 __insert_request(mdsc, req);
578 582
579 req->r_uid = current_fsuid(); 583 req->r_uid = current_fsuid();
580 req->r_gid = current_fsgid(); 584 req->r_gid = current_fsgid();
581 585
582 if (dir) { 586 if (dir) {
583 struct ceph_inode_info *ci = ceph_inode(dir); 587 struct ceph_inode_info *ci = ceph_inode(dir);
584 588
585 ihold(dir); 589 ihold(dir);
586 spin_lock(&ci->i_unsafe_lock); 590 spin_lock(&ci->i_unsafe_lock);
587 req->r_unsafe_dir = dir; 591 req->r_unsafe_dir = dir;
588 list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops); 592 list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
589 spin_unlock(&ci->i_unsafe_lock); 593 spin_unlock(&ci->i_unsafe_lock);
590 } 594 }
591 } 595 }
592 596
593 static void __unregister_request(struct ceph_mds_client *mdsc, 597 static void __unregister_request(struct ceph_mds_client *mdsc,
594 struct ceph_mds_request *req) 598 struct ceph_mds_request *req)
595 { 599 {
596 dout("__unregister_request %p tid %lld\n", req, req->r_tid); 600 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
597 rb_erase(&req->r_node, &mdsc->request_tree); 601 rb_erase(&req->r_node, &mdsc->request_tree);
598 RB_CLEAR_NODE(&req->r_node); 602 RB_CLEAR_NODE(&req->r_node);
599 603
600 if (req->r_unsafe_dir) { 604 if (req->r_unsafe_dir) {
601 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); 605 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
602 606
603 spin_lock(&ci->i_unsafe_lock); 607 spin_lock(&ci->i_unsafe_lock);
604 list_del_init(&req->r_unsafe_dir_item); 608 list_del_init(&req->r_unsafe_dir_item);
605 spin_unlock(&ci->i_unsafe_lock); 609 spin_unlock(&ci->i_unsafe_lock);
606 610
607 iput(req->r_unsafe_dir); 611 iput(req->r_unsafe_dir);
608 req->r_unsafe_dir = NULL; 612 req->r_unsafe_dir = NULL;
609 } 613 }
610 614
611 ceph_mdsc_put_request(req); 615 ceph_mdsc_put_request(req);
612 } 616 }
613 617
614 /* 618 /*
615 * Choose mds to send request to next. If there is a hint set in the 619 * Choose mds to send request to next. If there is a hint set in the
616 * request (e.g., due to a prior forward hint from the mds), use that. 620 * request (e.g., due to a prior forward hint from the mds), use that.
617 * Otherwise, consult frag tree and/or caps to identify the 621 * Otherwise, consult frag tree and/or caps to identify the
618 * appropriate mds. If all else fails, choose randomly. 622 * appropriate mds. If all else fails, choose randomly.
619 * 623 *
620 * Called under mdsc->mutex. 624 * Called under mdsc->mutex.
621 */ 625 */
622 static struct dentry *get_nonsnap_parent(struct dentry *dentry) 626 static struct dentry *get_nonsnap_parent(struct dentry *dentry)
623 { 627 {
624 /* 628 /*
625 * we don't need to worry about protecting the d_parent access 629 * we don't need to worry about protecting the d_parent access
626 * here because we never rename inside the snapped namespace 630 * here because we never rename inside the snapped namespace
627 * except to resplice to another snapdir, and either the old or new 631 * except to resplice to another snapdir, and either the old or new
628 * result is a valid result. 632 * result is a valid result.
629 */ 633 */
630 while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP) 634 while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
631 dentry = dentry->d_parent; 635 dentry = dentry->d_parent;
632 return dentry; 636 return dentry;
633 } 637 }
634 638
635 static int __choose_mds(struct ceph_mds_client *mdsc, 639 static int __choose_mds(struct ceph_mds_client *mdsc,
636 struct ceph_mds_request *req) 640 struct ceph_mds_request *req)
637 { 641 {
638 struct inode *inode; 642 struct inode *inode;
639 struct ceph_inode_info *ci; 643 struct ceph_inode_info *ci;
640 struct ceph_cap *cap; 644 struct ceph_cap *cap;
641 int mode = req->r_direct_mode; 645 int mode = req->r_direct_mode;
642 int mds = -1; 646 int mds = -1;
643 u32 hash = req->r_direct_hash; 647 u32 hash = req->r_direct_hash;
644 bool is_hash = req->r_direct_is_hash; 648 bool is_hash = req->r_direct_is_hash;
645 649
646 /* 650 /*
647 * is there a specific mds we should try? ignore hint if we have 651 * is there a specific mds we should try? ignore hint if we have
648 * no session and the mds is not up (active or recovering). 652 * no session and the mds is not up (active or recovering).
649 */ 653 */
650 if (req->r_resend_mds >= 0 && 654 if (req->r_resend_mds >= 0 &&
651 (__have_session(mdsc, req->r_resend_mds) || 655 (__have_session(mdsc, req->r_resend_mds) ||
652 ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) { 656 ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
653 dout("choose_mds using resend_mds mds%d\n", 657 dout("choose_mds using resend_mds mds%d\n",
654 req->r_resend_mds); 658 req->r_resend_mds);
655 return req->r_resend_mds; 659 return req->r_resend_mds;
656 } 660 }
657 661
658 if (mode == USE_RANDOM_MDS) 662 if (mode == USE_RANDOM_MDS)
659 goto random; 663 goto random;
660 664
661 inode = NULL; 665 inode = NULL;
662 if (req->r_inode) { 666 if (req->r_inode) {
663 inode = req->r_inode; 667 inode = req->r_inode;
664 } else if (req->r_dentry) { 668 } else if (req->r_dentry) {
665 /* ignore race with rename; old or new d_parent is okay */ 669 /* ignore race with rename; old or new d_parent is okay */
666 struct dentry *parent = req->r_dentry->d_parent; 670 struct dentry *parent = req->r_dentry->d_parent;
667 struct inode *dir = parent->d_inode; 671 struct inode *dir = parent->d_inode;
668 672
669 if (dir->i_sb != mdsc->fsc->sb) { 673 if (dir->i_sb != mdsc->fsc->sb) {
670 /* not this fs! */ 674 /* not this fs! */
671 inode = req->r_dentry->d_inode; 675 inode = req->r_dentry->d_inode;
672 } else if (ceph_snap(dir) != CEPH_NOSNAP) { 676 } else if (ceph_snap(dir) != CEPH_NOSNAP) {
673 /* direct snapped/virtual snapdir requests 677 /* direct snapped/virtual snapdir requests
674 * based on parent dir inode */ 678 * based on parent dir inode */
675 struct dentry *dn = get_nonsnap_parent(parent); 679 struct dentry *dn = get_nonsnap_parent(parent);
676 inode = dn->d_inode; 680 inode = dn->d_inode;
677 dout("__choose_mds using nonsnap parent %p\n", inode); 681 dout("__choose_mds using nonsnap parent %p\n", inode);
678 } else if (req->r_dentry->d_inode) { 682 } else if (req->r_dentry->d_inode) {
679 /* dentry target */ 683 /* dentry target */
680 inode = req->r_dentry->d_inode; 684 inode = req->r_dentry->d_inode;
681 } else { 685 } else {
682 /* dir + name */ 686 /* dir + name */
683 inode = dir; 687 inode = dir;
684 hash = ceph_dentry_hash(dir, req->r_dentry); 688 hash = ceph_dentry_hash(dir, req->r_dentry);
685 is_hash = true; 689 is_hash = true;
686 } 690 }
687 } 691 }
688 692
689 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, 693 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
690 (int)hash, mode); 694 (int)hash, mode);
691 if (!inode) 695 if (!inode)
692 goto random; 696 goto random;
693 ci = ceph_inode(inode); 697 ci = ceph_inode(inode);
694 698
695 if (is_hash && S_ISDIR(inode->i_mode)) { 699 if (is_hash && S_ISDIR(inode->i_mode)) {
696 struct ceph_inode_frag frag; 700 struct ceph_inode_frag frag;
697 int found; 701 int found;
698 702
699 ceph_choose_frag(ci, hash, &frag, &found); 703 ceph_choose_frag(ci, hash, &frag, &found);
700 if (found) { 704 if (found) {
701 if (mode == USE_ANY_MDS && frag.ndist > 0) { 705 if (mode == USE_ANY_MDS && frag.ndist > 0) {
702 u8 r; 706 u8 r;
703 707
704 /* choose a random replica */ 708 /* choose a random replica */
705 get_random_bytes(&r, 1); 709 get_random_bytes(&r, 1);
706 r %= frag.ndist; 710 r %= frag.ndist;
707 mds = frag.dist[r]; 711 mds = frag.dist[r];
708 dout("choose_mds %p %llx.%llx " 712 dout("choose_mds %p %llx.%llx "
709 "frag %u mds%d (%d/%d)\n", 713 "frag %u mds%d (%d/%d)\n",
710 inode, ceph_vinop(inode), 714 inode, ceph_vinop(inode),
711 frag.frag, mds, 715 frag.frag, mds,
712 (int)r, frag.ndist); 716 (int)r, frag.ndist);
713 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= 717 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
714 CEPH_MDS_STATE_ACTIVE) 718 CEPH_MDS_STATE_ACTIVE)
715 return mds; 719 return mds;
716 } 720 }
717 721
718 /* since this file/dir wasn't known to be 722 /* since this file/dir wasn't known to be
719 * replicated, we want to look for the 723 * replicated, we want to look for the
720 * authoritative mds. */ 724 * authoritative mds. */
721 mode = USE_AUTH_MDS; 725 mode = USE_AUTH_MDS;
722 if (frag.mds >= 0) { 726 if (frag.mds >= 0) {
723 /* choose auth mds */ 727 /* choose auth mds */
724 mds = frag.mds; 728 mds = frag.mds;
725 dout("choose_mds %p %llx.%llx " 729 dout("choose_mds %p %llx.%llx "
726 "frag %u mds%d (auth)\n", 730 "frag %u mds%d (auth)\n",
727 inode, ceph_vinop(inode), frag.frag, mds); 731 inode, ceph_vinop(inode), frag.frag, mds);
728 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= 732 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
729 CEPH_MDS_STATE_ACTIVE) 733 CEPH_MDS_STATE_ACTIVE)
730 return mds; 734 return mds;
731 } 735 }
732 } 736 }
733 } 737 }
734 738
735 spin_lock(&ci->i_ceph_lock); 739 spin_lock(&ci->i_ceph_lock);
736 cap = NULL; 740 cap = NULL;
737 if (mode == USE_AUTH_MDS) 741 if (mode == USE_AUTH_MDS)
738 cap = ci->i_auth_cap; 742 cap = ci->i_auth_cap;
739 if (!cap && !RB_EMPTY_ROOT(&ci->i_caps)) 743 if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
740 cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); 744 cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
741 if (!cap) { 745 if (!cap) {
742 spin_unlock(&ci->i_ceph_lock); 746 spin_unlock(&ci->i_ceph_lock);
743 goto random; 747 goto random;
744 } 748 }
745 mds = cap->session->s_mds; 749 mds = cap->session->s_mds;
746 dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n", 750 dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
747 inode, ceph_vinop(inode), mds, 751 inode, ceph_vinop(inode), mds,
748 cap == ci->i_auth_cap ? "auth " : "", cap); 752 cap == ci->i_auth_cap ? "auth " : "", cap);
749 spin_unlock(&ci->i_ceph_lock); 753 spin_unlock(&ci->i_ceph_lock);
750 return mds; 754 return mds;
751 755
752 random: 756 random:
753 mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap); 757 mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
754 dout("choose_mds chose random mds%d\n", mds); 758 dout("choose_mds chose random mds%d\n", mds);
755 return mds; 759 return mds;
756 } 760 }
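/* [editor's sketch, not part of the diff] With a replicated dir
 * fragment and USE_ANY_MDS, __choose_mds() spreads load by reducing
 * one random byte mod ndist, and only accepts the pick if that mds is
 * active in the mdsmap. Userspace shape of the pick, with rand()
 * standing in for get_random_bytes(); the slight modulo bias is
 * harmless for load spreading:
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main(void)
{
	int dist[] = { 4, 7, 9 };       /* mds ranks holding this replica */
	int ndist = sizeof(dist) / sizeof(dist[0]);
	unsigned char r;

	srand((unsigned)time(NULL));
	r = (unsigned char)rand();      /* one random byte */
	printf("chose mds%d\n", dist[r % ndist]);
	return 0;
}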
757 761
758 762
759 /* 763 /*
760 * session messages 764 * session messages
761 */ 765 */
762 static struct ceph_msg *create_session_msg(u32 op, u64 seq) 766 static struct ceph_msg *create_session_msg(u32 op, u64 seq)
763 { 767 {
764 struct ceph_msg *msg; 768 struct ceph_msg *msg;
765 struct ceph_mds_session_head *h; 769 struct ceph_mds_session_head *h;
766 770
767 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS, 771 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
768 false); 772 false);
769 if (!msg) { 773 if (!msg) {
770 pr_err("create_session_msg ENOMEM creating msg\n"); 774 pr_err("create_session_msg ENOMEM creating msg\n");
771 return NULL; 775 return NULL;
772 } 776 }
773 h = msg->front.iov_base; 777 h = msg->front.iov_base;
774 h->op = cpu_to_le32(op); 778 h->op = cpu_to_le32(op);
775 h->seq = cpu_to_le64(seq); 779 h->seq = cpu_to_le64(seq);
776 return msg; 780 return msg;
777 } 781 }
778 782
779 /* 783 /*
780 * send session open request. 784 * send session open request.
781 * 785 *
782 * called under mdsc->mutex 786 * called under mdsc->mutex
783 */ 787 */
784 static int __open_session(struct ceph_mds_client *mdsc, 788 static int __open_session(struct ceph_mds_client *mdsc,
785 struct ceph_mds_session *session) 789 struct ceph_mds_session *session)
786 { 790 {
787 struct ceph_msg *msg; 791 struct ceph_msg *msg;
788 int mstate; 792 int mstate;
789 int mds = session->s_mds; 793 int mds = session->s_mds;
790 794
791 /* wait for mds to go active? */ 795 /* wait for mds to go active? */
792 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); 796 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
793 dout("open_session to mds%d (%s)\n", mds, 797 dout("open_session to mds%d (%s)\n", mds,
794 ceph_mds_state_name(mstate)); 798 ceph_mds_state_name(mstate));
795 session->s_state = CEPH_MDS_SESSION_OPENING; 799 session->s_state = CEPH_MDS_SESSION_OPENING;
796 session->s_renew_requested = jiffies; 800 session->s_renew_requested = jiffies;
797 801
798 /* send connect message */ 802 /* send connect message */
799 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); 803 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
800 if (!msg) 804 if (!msg)
801 return -ENOMEM; 805 return -ENOMEM;
802 ceph_con_send(&session->s_con, msg); 806 ceph_con_send(&session->s_con, msg);
803 return 0; 807 return 0;
804 } 808 }
805 809
806 /* 810 /*
807 * open sessions for any export targets for the given mds 811 * open sessions for any export targets for the given mds
808 * 812 *
809 * called under mdsc->mutex 813 * called under mdsc->mutex
810 */ 814 */
811 static void __open_export_target_sessions(struct ceph_mds_client *mdsc, 815 static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
812 struct ceph_mds_session *session) 816 struct ceph_mds_session *session)
813 { 817 {
814 struct ceph_mds_info *mi; 818 struct ceph_mds_info *mi;
815 struct ceph_mds_session *ts; 819 struct ceph_mds_session *ts;
816 int i, mds = session->s_mds; 820 int i, mds = session->s_mds;
817 int target; 821 int target;
818 822
819 if (mds >= mdsc->mdsmap->m_max_mds) 823 if (mds >= mdsc->mdsmap->m_max_mds)
820 return; 824 return;
821 mi = &mdsc->mdsmap->m_info[mds]; 825 mi = &mdsc->mdsmap->m_info[mds];
822 dout("open_export_target_sessions for mds%d (%d targets)\n", 826 dout("open_export_target_sessions for mds%d (%d targets)\n",
823 session->s_mds, mi->num_export_targets); 827 session->s_mds, mi->num_export_targets);
824 828
825 for (i = 0; i < mi->num_export_targets; i++) { 829 for (i = 0; i < mi->num_export_targets; i++) {
826 target = mi->export_targets[i]; 830 target = mi->export_targets[i];
827 ts = __ceph_lookup_mds_session(mdsc, target); 831 ts = __ceph_lookup_mds_session(mdsc, target);
828 if (!ts) { 832 if (!ts) {
829 ts = register_session(mdsc, target); 833 ts = register_session(mdsc, target);
830 if (IS_ERR(ts)) 834 if (IS_ERR(ts))
831 return; 835 return;
832 } 836 }
833 if (session->s_state == CEPH_MDS_SESSION_NEW || 837 if (session->s_state == CEPH_MDS_SESSION_NEW ||
834 session->s_state == CEPH_MDS_SESSION_CLOSING) 838 session->s_state == CEPH_MDS_SESSION_CLOSING)
835 __open_session(mdsc, session); 839 __open_session(mdsc, session);
836 else 840 else
837 dout(" mds%d target mds%d %p is %s\n", session->s_mds, 841 dout(" mds%d target mds%d %p is %s\n", session->s_mds,
838 i, ts, session_state_name(ts->s_state)); 842 i, ts, session_state_name(ts->s_state));
839 ceph_put_mds_session(ts); 843 ceph_put_mds_session(ts);
840 } 844 }
841 } 845 }
842 846
843 void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, 847 void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
844 struct ceph_mds_session *session) 848 struct ceph_mds_session *session)
845 { 849 {
846 mutex_lock(&mdsc->mutex); 850 mutex_lock(&mdsc->mutex);
847 __open_export_target_sessions(mdsc, session); 851 __open_export_target_sessions(mdsc, session);
848 mutex_unlock(&mdsc->mutex); 852 mutex_unlock(&mdsc->mutex);
849 } 853 }
850 854
851 /* 855 /*
852 * session caps 856 * session caps
853 */ 857 */
854 858
855 /* 859 /*
856 * Free preallocated cap messages assigned to this session 860 * Free preallocated cap messages assigned to this session
857 */ 861 */
858 static void cleanup_cap_releases(struct ceph_mds_session *session) 862 static void cleanup_cap_releases(struct ceph_mds_session *session)
859 { 863 {
860 struct ceph_msg *msg; 864 struct ceph_msg *msg;
861 865
862 spin_lock(&session->s_cap_lock); 866 spin_lock(&session->s_cap_lock);
863 while (!list_empty(&session->s_cap_releases)) { 867 while (!list_empty(&session->s_cap_releases)) {
864 msg = list_first_entry(&session->s_cap_releases, 868 msg = list_first_entry(&session->s_cap_releases,
865 struct ceph_msg, list_head); 869 struct ceph_msg, list_head);
866 list_del_init(&msg->list_head); 870 list_del_init(&msg->list_head);
867 ceph_msg_put(msg); 871 ceph_msg_put(msg);
868 } 872 }
869 while (!list_empty(&session->s_cap_releases_done)) { 873 while (!list_empty(&session->s_cap_releases_done)) {
870 msg = list_first_entry(&session->s_cap_releases_done, 874 msg = list_first_entry(&session->s_cap_releases_done,
871 struct ceph_msg, list_head); 875 struct ceph_msg, list_head);
872 list_del_init(&msg->list_head); 876 list_del_init(&msg->list_head);
873 ceph_msg_put(msg); 877 ceph_msg_put(msg);
874 } 878 }
875 spin_unlock(&session->s_cap_lock); 879 spin_unlock(&session->s_cap_lock);
876 } 880 }
877 881
878 /* 882 /*
879 * Helper to safely iterate over all caps associated with a session, with 883 * Helper to safely iterate over all caps associated with a session, with
880 * special care taken to handle a racing __ceph_remove_cap(). 884 * special care taken to handle a racing __ceph_remove_cap().
881 * 885 *
882 * Caller must hold session s_mutex. 886 * Caller must hold session s_mutex.
883 */ 887 */
884 static int iterate_session_caps(struct ceph_mds_session *session, 888 static int iterate_session_caps(struct ceph_mds_session *session,
885 int (*cb)(struct inode *, struct ceph_cap *, 889 int (*cb)(struct inode *, struct ceph_cap *,
886 void *), void *arg) 890 void *), void *arg)
887 { 891 {
888 struct list_head *p; 892 struct list_head *p;
889 struct ceph_cap *cap; 893 struct ceph_cap *cap;
890 struct inode *inode, *last_inode = NULL; 894 struct inode *inode, *last_inode = NULL;
891 struct ceph_cap *old_cap = NULL; 895 struct ceph_cap *old_cap = NULL;
892 int ret; 896 int ret;
893 897
894 dout("iterate_session_caps %p mds%d\n", session, session->s_mds); 898 dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
895 spin_lock(&session->s_cap_lock); 899 spin_lock(&session->s_cap_lock);
896 p = session->s_caps.next; 900 p = session->s_caps.next;
897 while (p != &session->s_caps) { 901 while (p != &session->s_caps) {
898 cap = list_entry(p, struct ceph_cap, session_caps); 902 cap = list_entry(p, struct ceph_cap, session_caps);
899 inode = igrab(&cap->ci->vfs_inode); 903 inode = igrab(&cap->ci->vfs_inode);
900 if (!inode) { 904 if (!inode) {
901 p = p->next; 905 p = p->next;
902 continue; 906 continue;
903 } 907 }
904 session->s_cap_iterator = cap; 908 session->s_cap_iterator = cap;
905 spin_unlock(&session->s_cap_lock); 909 spin_unlock(&session->s_cap_lock);
906 910
907 if (last_inode) { 911 if (last_inode) {
908 iput(last_inode); 912 iput(last_inode);
909 last_inode = NULL; 913 last_inode = NULL;
910 } 914 }
911 if (old_cap) { 915 if (old_cap) {
912 ceph_put_cap(session->s_mdsc, old_cap); 916 ceph_put_cap(session->s_mdsc, old_cap);
913 old_cap = NULL; 917 old_cap = NULL;
914 } 918 }
915 919
916 ret = cb(inode, cap, arg); 920 ret = cb(inode, cap, arg);
917 last_inode = inode; 921 last_inode = inode;
918 922
919 spin_lock(&session->s_cap_lock); 923 spin_lock(&session->s_cap_lock);
920 p = p->next; 924 p = p->next;
921 if (cap->ci == NULL) { 925 if (cap->ci == NULL) {
922 dout("iterate_session_caps finishing cap %p removal\n", 926 dout("iterate_session_caps finishing cap %p removal\n",
923 cap); 927 cap);
924 BUG_ON(cap->session != session); 928 BUG_ON(cap->session != session);
925 list_del_init(&cap->session_caps); 929 list_del_init(&cap->session_caps);
926 session->s_nr_caps--; 930 session->s_nr_caps--;
927 cap->session = NULL; 931 cap->session = NULL;
928 old_cap = cap; /* put_cap it w/o locks held */ 932 old_cap = cap; /* put_cap it w/o locks held */
929 } 933 }
930 if (ret < 0) 934 if (ret < 0)
931 goto out; 935 goto out;
932 } 936 }
933 ret = 0; 937 ret = 0;
934 out: 938 out:
935 session->s_cap_iterator = NULL; 939 session->s_cap_iterator = NULL;
936 spin_unlock(&session->s_cap_lock); 940 spin_unlock(&session->s_cap_lock);
937 941
938 if (last_inode) 942 if (last_inode)
939 iput(last_inode); 943 iput(last_inode);
940 if (old_cap) 944 if (old_cap)
941 ceph_put_cap(session->s_mdsc, old_cap); 945 ceph_put_cap(session->s_mdsc, old_cap);
942 946
943 return ret; 947 return ret;
944 } 948 }
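/* [editor's sketch, not part of the diff] iterate_session_caps()
 * cannot run the callback (or iput()/ceph_put_cap()) with s_cap_lock
 * held, so it pins the current entry, drops the lock around the
 * callback, and defers each release until the lock is retaken on the
 * next step. The skeleton of that drop-the-lock-per-item walk, with a
 * pthread mutex standing in for the spinlock (all names hypothetical):
 */
#include <pthread.h>
#include <stdio.h>

#define NITEMS 3

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static int items[NITEMS] = { 10, 20, 30 };

static void visit(int v)
{
	printf("visit %d\n", v);        /* may sleep: no list_lock here */
}

int main(void)
{
	int i = 0;

	pthread_mutex_lock(&list_lock);
	while (i < NITEMS) {
		int v = items[i];       /* "pin" the entry under the lock */

		pthread_mutex_unlock(&list_lock);
		visit(v);               /* callback runs with lock dropped */
		pthread_mutex_lock(&list_lock);
		i++;                    /* advance only after re-locking */
	}
	pthread_mutex_unlock(&list_lock);
	return 0;
}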
945 949
946 static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, 950 static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
947 void *arg) 951 void *arg)
948 { 952 {
949 struct ceph_inode_info *ci = ceph_inode(inode); 953 struct ceph_inode_info *ci = ceph_inode(inode);
950 int drop = 0; 954 int drop = 0;
951 955
952 dout("removing cap %p, ci is %p, inode is %p\n", 956 dout("removing cap %p, ci is %p, inode is %p\n",
953 cap, ci, &ci->vfs_inode); 957 cap, ci, &ci->vfs_inode);
954 spin_lock(&ci->i_ceph_lock); 958 spin_lock(&ci->i_ceph_lock);
955 __ceph_remove_cap(cap); 959 __ceph_remove_cap(cap);
956 if (!__ceph_is_any_real_caps(ci)) { 960 if (!__ceph_is_any_real_caps(ci)) {
957 struct ceph_mds_client *mdsc = 961 struct ceph_mds_client *mdsc =
958 ceph_sb_to_client(inode->i_sb)->mdsc; 962 ceph_sb_to_client(inode->i_sb)->mdsc;
959 963
960 spin_lock(&mdsc->cap_dirty_lock); 964 spin_lock(&mdsc->cap_dirty_lock);
961 if (!list_empty(&ci->i_dirty_item)) { 965 if (!list_empty(&ci->i_dirty_item)) {
962 pr_info(" dropping dirty %s state for %p %lld\n", 966 pr_info(" dropping dirty %s state for %p %lld\n",
963 ceph_cap_string(ci->i_dirty_caps), 967 ceph_cap_string(ci->i_dirty_caps),
964 inode, ceph_ino(inode)); 968 inode, ceph_ino(inode));
965 ci->i_dirty_caps = 0; 969 ci->i_dirty_caps = 0;
966 list_del_init(&ci->i_dirty_item); 970 list_del_init(&ci->i_dirty_item);
967 drop = 1; 971 drop = 1;
968 } 972 }
969 if (!list_empty(&ci->i_flushing_item)) { 973 if (!list_empty(&ci->i_flushing_item)) {
970 pr_info(" dropping dirty+flushing %s state for %p %lld\n", 974 pr_info(" dropping dirty+flushing %s state for %p %lld\n",
971 ceph_cap_string(ci->i_flushing_caps), 975 ceph_cap_string(ci->i_flushing_caps),
972 inode, ceph_ino(inode)); 976 inode, ceph_ino(inode));
973 ci->i_flushing_caps = 0; 977 ci->i_flushing_caps = 0;
974 list_del_init(&ci->i_flushing_item); 978 list_del_init(&ci->i_flushing_item);
975 mdsc->num_cap_flushing--; 979 mdsc->num_cap_flushing--;
976 drop = 1; 980 drop = 1;
977 } 981 }
978 if (drop && ci->i_wrbuffer_ref) { 982 if (drop && ci->i_wrbuffer_ref) {
979 pr_info(" dropping dirty data for %p %lld\n", 983 pr_info(" dropping dirty data for %p %lld\n",
980 inode, ceph_ino(inode)); 984 inode, ceph_ino(inode));
981 ci->i_wrbuffer_ref = 0; 985 ci->i_wrbuffer_ref = 0;
982 ci->i_wrbuffer_ref_head = 0; 986 ci->i_wrbuffer_ref_head = 0;
983 drop++; 987 drop++;
984 } 988 }
985 spin_unlock(&mdsc->cap_dirty_lock); 989 spin_unlock(&mdsc->cap_dirty_lock);
986 } 990 }
987 spin_unlock(&ci->i_ceph_lock); 991 spin_unlock(&ci->i_ceph_lock);
988 while (drop--) 992 while (drop--)
989 iput(inode); 993 iput(inode);
990 return 0; 994 return 0;
991 } 995 }
992 996
993 /* 997 /*
994 * caller must hold session s_mutex 998 * caller must hold session s_mutex
995 */ 999 */
996 static void remove_session_caps(struct ceph_mds_session *session) 1000 static void remove_session_caps(struct ceph_mds_session *session)
997 { 1001 {
998 dout("remove_session_caps on %p\n", session); 1002 dout("remove_session_caps on %p\n", session);
999 iterate_session_caps(session, remove_session_caps_cb, NULL); 1003 iterate_session_caps(session, remove_session_caps_cb, NULL);
1000 BUG_ON(session->s_nr_caps > 0); 1004 BUG_ON(session->s_nr_caps > 0);
1001 BUG_ON(!list_empty(&session->s_cap_flushing)); 1005 BUG_ON(!list_empty(&session->s_cap_flushing));
1002 cleanup_cap_releases(session); 1006 cleanup_cap_releases(session);
1003 } 1007 }
1004 1008
1005 /* 1009 /*
1006 * wake up any threads waiting on this session's caps. if the cap is 1010 * wake up any threads waiting on this session's caps. if the cap is
1007 * old (didn't get renewed on the client reconnect), remove it now. 1011 * old (didn't get renewed on the client reconnect), remove it now.
1008 * 1012 *
1009 * caller must hold s_mutex. 1013 * caller must hold s_mutex.
1010 */ 1014 */
1011 static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, 1015 static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
1012 void *arg) 1016 void *arg)
1013 { 1017 {
1014 struct ceph_inode_info *ci = ceph_inode(inode); 1018 struct ceph_inode_info *ci = ceph_inode(inode);
1015 1019
1016 wake_up_all(&ci->i_cap_wq); 1020 wake_up_all(&ci->i_cap_wq);
1017 if (arg) { 1021 if (arg) {
1018 spin_lock(&ci->i_ceph_lock); 1022 spin_lock(&ci->i_ceph_lock);
1019 ci->i_wanted_max_size = 0; 1023 ci->i_wanted_max_size = 0;
1020 ci->i_requested_max_size = 0; 1024 ci->i_requested_max_size = 0;
1021 spin_unlock(&ci->i_ceph_lock); 1025 spin_unlock(&ci->i_ceph_lock);
1022 } 1026 }
1023 return 0; 1027 return 0;
1024 } 1028 }
1025 1029
1026 static void wake_up_session_caps(struct ceph_mds_session *session, 1030 static void wake_up_session_caps(struct ceph_mds_session *session,
1027 int reconnect) 1031 int reconnect)
1028 { 1032 {
1029 dout("wake_up_session_caps %p mds%d\n", session, session->s_mds); 1033 dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
1030 iterate_session_caps(session, wake_up_session_cb, 1034 iterate_session_caps(session, wake_up_session_cb,
1031 (void *)(unsigned long)reconnect); 1035 (void *)(unsigned long)reconnect);
1032 } 1036 }
1033 1037
1034 /* 1038 /*
1035 * Send periodic message to MDS renewing all currently held caps. The 1039 * Send periodic message to MDS renewing all currently held caps. The
 * ack will reset the expiration for all caps from this session.
 *
 * caller holds s_mutex
 */
static int send_renew_caps(struct ceph_mds_client *mdsc,
			   struct ceph_mds_session *session)
{
	struct ceph_msg *msg;
	int state;

	if (time_after_eq(jiffies, session->s_cap_ttl) &&
	    time_after_eq(session->s_cap_ttl, session->s_renew_requested))
		pr_info("mds%d caps stale\n", session->s_mds);
	session->s_renew_requested = jiffies;

	/* do not try to renew caps until a recovering mds has reconnected
	 * with its clients. */
	state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
	if (state < CEPH_MDS_STATE_RECONNECT) {
		dout("send_renew_caps ignoring mds%d (%s)\n",
		     session->s_mds, ceph_mds_state_name(state));
		return 0;
	}

	dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
	     ceph_mds_state_name(state));
	msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
				 ++session->s_renew_seq);
	if (!msg)
		return -ENOMEM;
	ceph_con_send(&session->s_con, msg);
	return 0;
}

/*
 * Note new cap ttl, and any transition from stale -> not stale (fresh?).
 *
 * Called under session->s_mutex
 */
static void renewed_caps(struct ceph_mds_client *mdsc,
			 struct ceph_mds_session *session, int is_renew)
{
	int was_stale;
	int wake = 0;

	spin_lock(&session->s_cap_lock);
	was_stale = is_renew && (session->s_cap_ttl == 0 ||
				 time_after_eq(jiffies, session->s_cap_ttl));

	session->s_cap_ttl = session->s_renew_requested +
		mdsc->mdsmap->m_session_timeout*HZ;

	if (was_stale) {
		if (time_before(jiffies, session->s_cap_ttl)) {
			pr_info("mds%d caps renewed\n", session->s_mds);
			wake = 1;
		} else {
			pr_info("mds%d caps still stale\n", session->s_mds);
		}
	}
	dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
	     session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
	     time_before(jiffies, session->s_cap_ttl) ? "fresh" : "stale");
	spin_unlock(&session->s_cap_lock);

	if (wake)
		wake_up_session_caps(session, 0);
}
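
Both functions above lean on the kernel's wraparound-safe jiffies comparisons. As a standalone illustration (plain C; tick_after_eq() is a made-up stand-in for time_after_eq(), not the kernel macro itself), the signed-difference trick keeps working even when the tick counter wraps past zero:

#include <stdio.h>

typedef unsigned long tick_t;

/* analogous to the kernel's time_after_eq() */
static int tick_after_eq(tick_t a, tick_t b)
{
	return (long)(a - b) >= 0;
}

int main(void)
{
	tick_t ttl = (tick_t)-10;	/* cap TTL set just before wraparound */
	tick_t now = 5;			/* counter has since wrapped past 0 */

	/* a naive "now >= ttl" would claim the caps are still fresh... */
	printf("naive compare: %s\n", now >= ttl ? "stale" : "fresh");
	/* ...but the signed-difference test correctly reports stale */
	printf("wrap-safe compare: %s\n",
	       tick_after_eq(now, ttl) ? "stale" : "fresh");
	return 0;
}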

/*
 * send a session close request
 */
static int request_close_session(struct ceph_mds_client *mdsc,
				 struct ceph_mds_session *session)
{
	struct ceph_msg *msg;

	dout("request_close_session mds%d state %s seq %lld\n",
	     session->s_mds, session_state_name(session->s_state),
	     session->s_seq);
	msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
	if (!msg)
		return -ENOMEM;
	ceph_con_send(&session->s_con, msg);
	return 0;
}

/*
 * Called with s_mutex held.
 */
static int __close_session(struct ceph_mds_client *mdsc,
			   struct ceph_mds_session *session)
{
	if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
		return 0;
	session->s_state = CEPH_MDS_SESSION_CLOSING;
	return request_close_session(mdsc, session);
}

/*
 * Trim old(er) caps.
 *
 * Because we can't cache an inode without one or more caps, we do
 * this indirectly: if a cap is unused, we prune its aliases, at which
 * point the inode will hopefully get dropped too.
 *
 * Yes, this is a bit sloppy. Our only real goal here is to respond to
 * memory pressure from the MDS, though, so it needn't be perfect.
 */
static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
{
	struct ceph_mds_session *session = arg;
	struct ceph_inode_info *ci = ceph_inode(inode);
	int used, oissued, mine;

	if (session->s_trim_caps <= 0)
		return -1;

	spin_lock(&ci->i_ceph_lock);
	mine = cap->issued | cap->implemented;
	used = __ceph_caps_used(ci);
	oissued = __ceph_caps_issued_other(ci, cap);

	dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
	     inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
	     ceph_cap_string(used));
	if (ci->i_dirty_caps)
		goto out;	/* dirty caps */
	if ((used & ~oissued) & mine)
		goto out;	/* we need these caps */

	session->s_trim_caps--;
	if (oissued) {
		/* we aren't the only cap.. just remove us */
		__ceph_remove_cap(cap);
	} else {
		/* try to drop referring dentries */
		spin_unlock(&ci->i_ceph_lock);
		d_prune_aliases(inode);
		dout("trim_caps_cb %p cap %p pruned, count now %d\n",
		     inode, cap, atomic_read(&inode->i_count));
		return 0;
	}

out:
	spin_unlock(&ci->i_ceph_lock);
	return 0;
}

/*
 * Trim session cap count down to some max number.
 */
static int trim_caps(struct ceph_mds_client *mdsc,
		     struct ceph_mds_session *session,
		     int max_caps)
{
	int trim_caps = session->s_nr_caps - max_caps;

	dout("trim_caps mds%d start: %d / %d, trim %d\n",
	     session->s_mds, session->s_nr_caps, max_caps, trim_caps);
	if (trim_caps > 0) {
		session->s_trim_caps = trim_caps;
		iterate_session_caps(session, trim_caps_cb, session);
		dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
		     session->s_mds, session->s_nr_caps, max_caps,
		     trim_caps - session->s_trim_caps);
		session->s_trim_caps = 0;
	}
	return 0;
}
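
The trim logic above is an instance of a common iterate-with-callback pattern: the walk hands each item to a callback, the callback spends a shared budget, and a negative return aborts the walk early, exactly as trim_caps_cb does. A minimal userspace sketch (all names here are illustrative, not kernel API):

#include <stdio.h>

struct trim_state { int budget; };

static int trim_cb(int item, void *arg)
{
	struct trim_state *st = arg;

	if (st->budget <= 0)
		return -1;		/* budget exhausted: abort iteration */
	printf("trimming item %d\n", item);
	st->budget--;
	return 0;
}

static void iterate(const int *items, int n,
		    int (*cb)(int, void *), void *arg)
{
	for (int i = 0; i < n; i++)
		if (cb(items[i], arg) < 0)
			break;		/* honor the callback's stop request */
}

int main(void)
{
	int items[] = { 1, 2, 3, 4, 5 };
	struct trim_state st = { .budget = 3 };	/* like nr_caps - max_caps */

	iterate(items, 5, trim_cb, &st);
	return 0;
}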

/*
 * Allocate cap_release messages. If there is a partially full message
 * in the queue, try to allocate enough to cover its remainder, so that
 * we can send it immediately.
 *
 * Called under s_mutex.
 */
int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
			  struct ceph_mds_session *session)
{
	struct ceph_msg *msg, *partial = NULL;
	struct ceph_mds_cap_release *head;
	int err = -ENOMEM;
	int extra = mdsc->fsc->mount_options->cap_release_safety;
	int num;

	dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
	     extra);

	spin_lock(&session->s_cap_lock);

	if (!list_empty(&session->s_cap_releases)) {
		msg = list_first_entry(&session->s_cap_releases,
				       struct ceph_msg,
				       list_head);
		head = msg->front.iov_base;
		num = le32_to_cpu(head->num);
		if (num) {
			dout(" partial %p with (%d/%d)\n", msg, num,
			     (int)CEPH_CAPS_PER_RELEASE);
			extra += CEPH_CAPS_PER_RELEASE - num;
			partial = msg;
		}
	}
	while (session->s_num_cap_releases < session->s_nr_caps + extra) {
		spin_unlock(&session->s_cap_lock);
		msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
				   GFP_NOFS, false);
		if (!msg)
			goto out_unlocked;
		dout("add_cap_releases %p msg %p now %d\n", session, msg,
		     (int)msg->front.iov_len);
		head = msg->front.iov_base;
		head->num = cpu_to_le32(0);
		msg->front.iov_len = sizeof(*head);
		spin_lock(&session->s_cap_lock);
		list_add(&msg->list_head, &session->s_cap_releases);
		session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
	}

	if (partial) {
		head = partial->front.iov_base;
		num = le32_to_cpu(head->num);
		dout(" queueing partial %p with %d/%d\n", partial, num,
		     (int)CEPH_CAPS_PER_RELEASE);
		list_move_tail(&partial->list_head,
			       &session->s_cap_releases_done);
		session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num;
	}
	err = 0;
	spin_unlock(&session->s_cap_lock);
out_unlocked:
	return err;
}
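
The sizing arithmetic is worth spelling out: the target is enough spare release slots for every cap the session holds plus the cap_release_safety margin, and if the head message is already partly filled, the margin grows by its unused remainder so that partial message can be topped up and sent at once. A toy calculation of the same loop (constants invented for illustration, not the real values):

#include <stdio.h>

#define CAPS_PER_RELEASE 26	/* illustrative; not the real constant */

int main(void)
{
	int safety = 16;	/* cap_release_safety mount option */
	int nr_caps = 100;	/* caps currently held on the session */
	int partial = 10;	/* entries already in the head message */
	int extra = safety + (CAPS_PER_RELEASE - partial);
	int msgs = 0, slots = 0;

	/* allocate whole messages until the target headroom is met */
	while (slots < nr_caps + extra) {
		slots += CAPS_PER_RELEASE;
		msgs++;
	}
	printf("extra=%d -> %d messages, %d slots\n", extra, msgs, slots);
	return 0;
}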

/*
 * flush all dirty inode data to disk.
 *
 * returns true if we've flushed through want_flush_seq
 */
static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
{
	int mds, ret = 1;

	dout("check_cap_flush want %lld\n", want_flush_seq);
	mutex_lock(&mdsc->mutex);
	for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
		struct ceph_mds_session *session = mdsc->sessions[mds];

		if (!session)
			continue;
		get_session(session);
		mutex_unlock(&mdsc->mutex);

		mutex_lock(&session->s_mutex);
		if (!list_empty(&session->s_cap_flushing)) {
			struct ceph_inode_info *ci =
				list_entry(session->s_cap_flushing.next,
					   struct ceph_inode_info,
					   i_flushing_item);
			struct inode *inode = &ci->vfs_inode;

			spin_lock(&ci->i_ceph_lock);
			if (ci->i_cap_flush_seq <= want_flush_seq) {
				dout("check_cap_flush still flushing %p "
				     "seq %lld <= %lld to mds%d\n", inode,
				     ci->i_cap_flush_seq, want_flush_seq,
				     session->s_mds);
				ret = 0;
			}
			spin_unlock(&ci->i_ceph_lock);
		}
		mutex_unlock(&session->s_mutex);
		ceph_put_mds_session(session);

		if (!ret)
			return ret;
		mutex_lock(&mdsc->mutex);
	}

	mutex_unlock(&mdsc->mutex);
	dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
	return ret;
}
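
Stripped of the locking choreography, the test above reduces to: succeed only if no session is still flushing a sequence number at or below the one being waited for. A compact userspace sketch of just that predicate (hypothetical names; 0 meaning nothing in flight):

#include <stdio.h>

static int check_flush(const long *flushing_seq, int nsessions, long want)
{
	for (int i = 0; i < nsessions; i++)
		if (flushing_seq[i] != 0 && flushing_seq[i] <= want)
			return 0;	/* session i still flushing */
	return 1;			/* flushed through want */
}

int main(void)
{
	long seqs[] = { 0, 12, 0 };	/* 0 == nothing in flight */

	printf("want 10: %s\n", check_flush(seqs, 3, 10) ? "done" : "waiting");
	printf("want 15: %s\n", check_flush(seqs, 3, 15) ? "done" : "waiting");
	return 0;
}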

/*
 * called under s_mutex
 */
void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
			    struct ceph_mds_session *session)
{
	struct ceph_msg *msg;

	dout("send_cap_releases mds%d\n", session->s_mds);
	spin_lock(&session->s_cap_lock);
	while (!list_empty(&session->s_cap_releases_done)) {
		msg = list_first_entry(&session->s_cap_releases_done,
				       struct ceph_msg, list_head);
		list_del_init(&msg->list_head);
		spin_unlock(&session->s_cap_lock);
		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
		dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
		ceph_con_send(&session->s_con, msg);
		spin_lock(&session->s_cap_lock);
	}
	spin_unlock(&session->s_cap_lock);
}
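
The send loop above uses the classic detach-then-drop-the-lock drain: each message is unlinked from the shared list while s_cap_lock is held, and the (potentially slow) send happens unlocked. A minimal pthread sketch of the same shape (illustrative, not the kernel locking API; build with -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int queue[] = { 1, 2, 3, 4 };
static int head;			/* index of next message to send */

static void drain(void)
{
	pthread_mutex_lock(&lock);
	while (head < 4) {
		int item = queue[head++];	/* detach under the lock */

		pthread_mutex_unlock(&lock);
		printf("sending %d\n", item);	/* slow work, unlocked */
		pthread_mutex_lock(&lock);
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	drain();
	return 0;
}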

static void discard_cap_releases(struct ceph_mds_client *mdsc,
				 struct ceph_mds_session *session)
{
	struct ceph_msg *msg;
	struct ceph_mds_cap_release *head;
	unsigned num;

	dout("discard_cap_releases mds%d\n", session->s_mds);
	spin_lock(&session->s_cap_lock);

	/* zero out the in-progress message */
	msg = list_first_entry(&session->s_cap_releases,
			       struct ceph_msg, list_head);
	head = msg->front.iov_base;
	num = le32_to_cpu(head->num);
	dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
	head->num = cpu_to_le32(0);
	session->s_num_cap_releases += num;

	/* requeue completed messages */
	while (!list_empty(&session->s_cap_releases_done)) {
		msg = list_first_entry(&session->s_cap_releases_done,
				       struct ceph_msg, list_head);
		list_del_init(&msg->list_head);

		head = msg->front.iov_base;
		num = le32_to_cpu(head->num);
		dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
		     num);
		session->s_num_cap_releases += num;
		head->num = cpu_to_le32(0);
		msg->front.iov_len = sizeof(*head);
		list_add(&msg->list_head, &session->s_cap_releases);
	}

	spin_unlock(&session->s_cap_lock);
}

/*
 * requests
 */

/*
 * Create an mds request.
 */
struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
{
	struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);

	if (!req)
		return ERR_PTR(-ENOMEM);

	mutex_init(&req->r_fill_mutex);
	req->r_mdsc = mdsc;
	req->r_started = jiffies;
	req->r_resend_mds = -1;
	INIT_LIST_HEAD(&req->r_unsafe_dir_item);
	req->r_fmode = -1;
	kref_init(&req->r_kref);
	INIT_LIST_HEAD(&req->r_wait);
	init_completion(&req->r_completion);
	init_completion(&req->r_safe_completion);
	INIT_LIST_HEAD(&req->r_unsafe_item);

	req->r_op = op;
	req->r_direct_mode = mode;
	return req;
}
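
Request lifetime is managed by the kref initialized above: the creator holds one reference, other holders take and drop their own, and the final put releases the object. A self-contained userspace sketch of that pattern (req_get()/req_put() are illustrative stand-ins for kref_get()/kref_put(), and this toy version is not thread-safe):

#include <stdio.h>
#include <stdlib.h>

struct req {
	int refcount;
};

static struct req *req_create(void)
{
	struct req *r = calloc(1, sizeof(*r));

	if (!r)
		return NULL;
	r->refcount = 1;		/* like kref_init(&req->r_kref) */
	return r;
}

static void req_get(struct req *r) { r->refcount++; }

static void req_put(struct req *r)
{
	if (--r->refcount == 0) {	/* last reference drops: release */
		printf("freeing request\n");
		free(r);
	}
}

int main(void)
{
	struct req *r = req_create();

	if (!r)
		return 1;
	req_get(r);			/* e.g. an in-flight message's ref */
	req_put(r);			/* message done */
	req_put(r);			/* creator's ref: frees the request */
	return 0;
}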

/*
 * return the oldest (lowest-tid) request in the request tree, or 0 if none.
 *
 * called under mdsc->mutex.
 */
static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
{
	if (RB_EMPTY_ROOT(&mdsc->request_tree))
		return NULL;
	return rb_entry(rb_first(&mdsc->request_tree),
			struct ceph_mds_request, r_node);
}

static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
{
	struct ceph_mds_request *req = __get_oldest_req(mdsc);

	if (req)
		return req->r_tid;
	return 0;
}

/*
 * Build a dentry's path. Allocate on heap; caller must kfree. Based
 * on build_path_from_dentry in fs/cifs/dir.c.
 *
 * If @stop_on_nosnap, generate path relative to the first non-snapped
 * inode.
 *
 * Encode hidden .snap dirs as a double /, i.e.
 *   foo/.snap/bar -> foo//bar
 */
char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
			   int stop_on_nosnap)
{
	struct dentry *temp;
	char *path;
	int len, pos;
	unsigned seq;

	if (dentry == NULL)
		return ERR_PTR(-EINVAL);

retry:
	len = 0;
	seq = read_seqbegin(&rename_lock);
	rcu_read_lock();
	for (temp = dentry; !IS_ROOT(temp);) {
		struct inode *inode = temp->d_inode;
		if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
			len++;	/* slash only */
		else if (stop_on_nosnap && inode &&
			 ceph_snap(inode) == CEPH_NOSNAP)
			break;
		else
			len += 1 + temp->d_name.len;
		temp = temp->d_parent;
		if (temp == NULL) {
			rcu_read_unlock();
			pr_err("build_path corrupt dentry %p\n", dentry);
			return ERR_PTR(-EINVAL);
		}
	}
	rcu_read_unlock();
	if (len)
		len--;	/* no leading '/' */

	path = kmalloc(len+1, GFP_NOFS);
	if (path == NULL)
		return ERR_PTR(-ENOMEM);
	pos = len;
	path[pos] = 0;	/* trailing null */
	rcu_read_lock();
	for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
		struct inode *inode;

		spin_lock(&temp->d_lock);
		inode = temp->d_inode;
		if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
			dout("build_path path+%d: %p SNAPDIR\n",
			     pos, temp);
		} else if (stop_on_nosnap && inode &&
			   ceph_snap(inode) == CEPH_NOSNAP) {
			spin_unlock(&temp->d_lock);
			break;
		} else {
			pos -= temp->d_name.len;
			if (pos < 0) {
				spin_unlock(&temp->d_lock);
				break;
			}
			strncpy(path + pos, temp->d_name.name,
				temp->d_name.len);
		}
		spin_unlock(&temp->d_lock);
		if (pos)
			path[--pos] = '/';
		temp = temp->d_parent;
		if (temp == NULL) {
			rcu_read_unlock();
			pr_err("build_path corrupt dentry\n");
			kfree(path);
			return ERR_PTR(-EINVAL);
		}
	}
	rcu_read_unlock();
	if (pos != 0 || read_seqretry(&rename_lock, seq)) {
		pr_err("build_path did not end path lookup where "
		       "expected, namelen is %d, pos is %d\n", len, pos);
		/* presumably this is only possible if racing with a
		   rename of one of the parent directories (we can not
		   lock the dentries above us to prevent this, but
		   retrying should be harmless) */
		kfree(path);
		goto retry;
	}

	*base = ceph_ino(temp->d_inode);
	*plen = len;
	dout("build_path on %p %d built %llx '%.*s'\n",
	     dentry, dentry->d_count, *base, len, path);
	return path;
}
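
The core of the path builder, minus the RCU/seqlock retry dance, is a two-pass right-to-left assembly: measure the components first, then fill the buffer from the end, prepending '/' between names. A standalone sketch under those simplifying assumptions (struct node is a made-up stand-in for dentries):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct node {
	const char *name;
	struct node *parent;		/* NULL at the root */
};

static char *build_path(struct node *leaf)
{
	size_t len = 0, pos;
	struct node *n;
	char *path;

	for (n = leaf; n->parent; n = n->parent)
		len += 1 + strlen(n->name);	/* '/' + component */
	if (len)
		len--;				/* no leading '/' */

	path = malloc(len + 1);
	if (!path)
		return NULL;
	pos = len;
	path[pos] = '\0';			/* trailing null */
	for (n = leaf; n->parent; n = n->parent) {
		pos -= strlen(n->name);
		memcpy(path + pos, n->name, strlen(n->name));
		if (pos)
			path[--pos] = '/';
	}
	return path;
}

int main(void)
{
	struct node root = { "", NULL };
	struct node a = { "a", &root }, b = { "b", &a }, c = { "c", &b };
	char *p = build_path(&c);

	if (p)
		printf("%s\n", p);		/* prints a/b/c */
	free(p);
	return 0;
}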

static int build_dentry_path(struct dentry *dentry,
			     const char **ppath, int *ppathlen, u64 *pino,
			     int *pfreepath)
{
	char *path;

	if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) {
		*pino = ceph_ino(dentry->d_parent->d_inode);
		*ppath = dentry->d_name.name;
		*ppathlen = dentry->d_name.len;
		return 0;
	}
	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
	if (IS_ERR(path))
		return PTR_ERR(path);
	*ppath = path;
	*pfreepath = 1;
	return 0;
}

static int build_inode_path(struct inode *inode,
			    const char **ppath, int *ppathlen, u64 *pino,
			    int *pfreepath)
{
	struct dentry *dentry;
	char *path;

	if (ceph_snap(inode) == CEPH_NOSNAP) {
		*pino = ceph_ino(inode);
		*ppathlen = 0;
		return 0;
	}
	dentry = d_find_alias(inode);
	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
	dput(dentry);
	if (IS_ERR(path))
		return PTR_ERR(path);
	*ppath = path;
	*pfreepath = 1;
	return 0;
}

/*
 * request arguments may be specified via an inode *, a dentry *, or
 * an explicit ino+path.
 */
static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
				 const char *rpath, u64 rino,
				 const char **ppath, int *pathlen,
				 u64 *ino, int *freepath)
{
	int r = 0;

	if (rinode) {
		r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
		dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
		     ceph_snap(rinode));
	} else if (rdentry) {
		r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
		dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
		     *ppath);
	} else if (rpath || rino) {
		*ino = rino;
		*ppath = rpath;
		*pathlen = strlen(rpath);
		dout(" path %.*s\n", *pathlen, rpath);
	}

	return r;
}

/*
 * called under mdsc->mutex
 */
static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
					       struct ceph_mds_request *req,
					       int mds)
{
	struct ceph_msg *msg;
	struct ceph_mds_request_head *head;
	const char *path1 = NULL;
	const char *path2 = NULL;
	u64 ino1 = 0, ino2 = 0;
	int pathlen1 = 0, pathlen2 = 0;
	int freepath1 = 0, freepath2 = 0;
	int len;
	u16 releases;
	void *p, *end;
	int ret;

	ret = set_request_path_attr(req->r_inode, req->r_dentry,
				    req->r_path1, req->r_ino1.ino,
				    &path1, &pathlen1, &ino1, &freepath1);
	if (ret < 0) {
		msg = ERR_PTR(ret);
		goto out;
	}

	ret = set_request_path_attr(NULL, req->r_old_dentry,
				    req->r_path2, req->r_ino2.ino,
				    &path2, &pathlen2, &ino2, &freepath2);
	if (ret < 0) {
		msg = ERR_PTR(ret);
		goto out_free1;
	}

	len = sizeof(*head) +
		pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));

	/* calculate (max) length for cap releases */
	len += sizeof(struct ceph_mds_request_release) *
		(!!req->r_inode_drop + !!req->r_dentry_drop +
		 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
	if (req->r_dentry_drop)
		len += req->r_dentry->d_name.len;
	if (req->r_old_dentry_drop)
		len += req->r_old_dentry->d_name.len;

	msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false);
	if (!msg) {
		msg = ERR_PTR(-ENOMEM);
		goto out_free2;
	}

	msg->hdr.tid = cpu_to_le64(req->r_tid);

	head = msg->front.iov_base;
	p = msg->front.iov_base + sizeof(*head);
	end = msg->front.iov_base + msg->front.iov_len;

	head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
	head->op = cpu_to_le32(req->r_op);
	head->caller_uid = cpu_to_le32(req->r_uid);
	head->caller_gid = cpu_to_le32(req->r_gid);
	head->args = req->r_args;

	ceph_encode_filepath(&p, end, ino1, path1);
	ceph_encode_filepath(&p, end, ino2, path2);

	/* make note of release offset, in case we need to replay */
	req->r_request_release_offset = p - msg->front.iov_base;

	/* cap releases */
	releases = 0;
	if (req->r_inode_drop)
		releases += ceph_encode_inode_release(&p,
		      req->r_inode ? req->r_inode : req->r_dentry->d_inode,
		      mds, req->r_inode_drop, req->r_inode_unless, 0);
	if (req->r_dentry_drop)
		releases += ceph_encode_dentry_release(&p, req->r_dentry,
		       mds, req->r_dentry_drop, req->r_dentry_unless);
	if (req->r_old_dentry_drop)
		releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
		       mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
	if (req->r_old_inode_drop)
		releases += ceph_encode_inode_release(&p,
		      req->r_old_dentry->d_inode,
		      mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
	head->num_releases = cpu_to_le16(releases);

	BUG_ON(p > end);
	msg->front.iov_len = p - msg->front.iov_base;
	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);

	msg->pages = req->r_pages;
	msg->nr_pages = req->r_num_pages;
	msg->hdr.data_len = cpu_to_le32(req->r_data_len);
	msg->hdr.data_off = cpu_to_le16(0);

out_free2:
	if (freepath2)
		kfree((char *)path2);
out_free1:
	if (freepath1)
		kfree((char *)path1);
out:
	return msg;
}
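
The message construction above follows a compute-worst-case-then-encode discipline: allocate once for the maximum possible front length, advance a cursor while encoding, and assert the cursor never passes the end (the BUG_ON). A self-contained sketch of that pattern (encode_filepath() is an illustrative stand-in for ceph_encode_filepath(), and host byte order is assumed for brevity where the real encoder uses little-endian):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* made-up encoder: u64 ino, u32 length, then the path bytes */
static void encode_filepath(unsigned char **p, unsigned char *end,
			    uint64_t ino, const char *path)
{
	uint32_t len = (uint32_t)strlen(path);

	assert(*p + 8 + 4 + len <= end);	/* mirrors the BUG_ON */
	memcpy(*p, &ino, 8);	*p += 8;
	memcpy(*p, &len, 4);	*p += 4;
	memcpy(*p, path, len);	*p += len;
}

int main(void)
{
	const char *path = "dir/file";
	size_t max = 8 + 4 + strlen(path);	/* worst-case sizing */
	unsigned char *buf = malloc(max);
	unsigned char *p = buf;

	if (!buf)
		return 1;
	encode_filepath(&p, buf + max, 0x10000000001ULL, path);
	printf("encoded %zu of %zu bytes\n", (size_t)(p - buf), max);
	free(buf);
	return 0;
}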

/*
 * called under mdsc->mutex if error, under no mutex if
 * success.
 */
static void complete_request(struct ceph_mds_client *mdsc,
			     struct ceph_mds_request *req)
{
	if (req->r_callback)
		req->r_callback(mdsc, req);
	else
		complete_all(&req->r_completion);
}

/*
 * called under mdsc->mutex
 */
static int __prepare_send_request(struct ceph_mds_client *mdsc,
				  struct ceph_mds_request *req,
				  int mds)
{
	struct ceph_mds_request_head *rhead;
	struct ceph_msg *msg;
	int flags = 0;

	req->r_attempts++;
	if (req->r_inode) {
		struct ceph_cap *cap =
			ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds);

		if (cap)
			req->r_sent_on_mseq = cap->mseq;
		else
			req->r_sent_on_mseq = -1;
	}
	dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
	     req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);

	if (req->r_got_unsafe) {
		/*
		 * Replay. Do not regenerate message (and rebuild
		 * paths, etc.); just use the original message.
		 * Rebuilding paths will break for renames because
		 * d_move mangles the src name.
		 */
		msg = req->r_request;
		rhead = msg->front.iov_base;

		flags = le32_to_cpu(rhead->flags);
		flags |= CEPH_MDS_FLAG_REPLAY;
		rhead->flags = cpu_to_le32(flags);

		if (req->r_target_inode)
			rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));

		rhead->num_retry = req->r_attempts - 1;

		/* remove cap/dentry releases from message */
		rhead->num_releases = 0;
		msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset);
		msg->front.iov_len = req->r_request_release_offset;
		return 0;
	}

	if (req->r_request) {
		ceph_msg_put(req->r_request);
		req->r_request = NULL;
	}
	msg = create_request_message(mdsc, req, mds);
	if (IS_ERR(msg)) {
		req->r_err = PTR_ERR(msg);
		complete_request(mdsc, req);
		return PTR_ERR(msg);
	}
	req->r_request = msg;

	rhead = msg->front.iov_base;
	rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
	if (req->r_got_unsafe)
		flags |= CEPH_MDS_FLAG_REPLAY;
	if (req->r_locked_dir)
		flags |= CEPH_MDS_FLAG_WANT_DENTRY;
	rhead->flags = cpu_to_le32(flags);
	rhead->num_fwd = req->r_num_fwd;
	rhead->num_retry = req->r_attempts - 1;
	rhead->ino = 0;

	dout(" r_locked_dir = %p\n", req->r_locked_dir);
	return 0;
}
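
The replay branch works because create_request_message() recorded where the cap releases start: on resend, the message is simply truncated back to that offset instead of being re-encoded (which would break for renames). A toy sketch of save-an-offset-then-truncate (illustrative, using a plain string in place of a wire message):

#include <stdio.h>

int main(void)
{
	char msg[64];
	size_t len = 0, release_offset;

	len += (size_t)snprintf(msg + len, sizeof(msg) - len, "head+paths");
	release_offset = len;	/* like r_request_release_offset */
	len += (size_t)snprintf(msg + len, sizeof(msg) - len, "+releases");

	printf("first send: %.*s\n", (int)len, msg);
	len = release_offset;	/* replay: drop the releases tail */
	printf("replay:     %.*s\n", (int)len, msg);
	return 0;
}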

/*
 * send request, or put it on the appropriate wait list.
 */
static int __do_request(struct ceph_mds_client *mdsc,
			struct ceph_mds_request *req)
{
	struct ceph_mds_session *session = NULL;
	int mds = -1;
	int err = -EAGAIN;

	if (req->r_err || req->r_got_result)
		goto out;

	if (req->r_timeout &&
	    time_after_eq(jiffies, req->r_started + req->r_timeout)) {
		dout("do_request timed out\n");
		err = -EIO;
		goto finish;
	}

	put_request_session(req);

	mds = __choose_mds(mdsc, req);
	if (mds < 0 ||
	    ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
		dout("do_request no mds or not active, waiting for map\n");
		list_add(&req->r_wait, &mdsc->waiting_for_map);
		goto out;
	}

	/* get, open session */
	session = __ceph_lookup_mds_session(mdsc, mds);
	if (!session) {
		session = register_session(mdsc, mds);
		if (IS_ERR(session)) {
			err = PTR_ERR(session);
			goto finish;
		}
	}
	req->r_session = get_session(session);

	dout("do_request mds%d session %p state %s\n", mds, session,
	     session_state_name(session->s_state));
	if (session->s_state != CEPH_MDS_SESSION_OPEN &&
	    session->s_state != CEPH_MDS_SESSION_HUNG) {
		if (session->s_state == CEPH_MDS_SESSION_NEW ||
		    session->s_state == CEPH_MDS_SESSION_CLOSING)
			__open_session(mdsc, session);
		list_add(&req->r_wait, &session->s_waiting);
		goto out_session;
	}

	/* send request */
	req->r_resend_mds = -1;	/* forget any previous mds hint */

	if (req->r_request_started == 0)	/* note request start time */
		req->r_request_started = jiffies;

	err = __prepare_send_request(mdsc, req, mds);
	if (!err) {
		ceph_msg_get(req->r_request);
		ceph_con_send(&session->s_con, req->r_request);
	}

out_session:
	ceph_put_mds_session(session);
out:
	return err;

finish:
	req->r_err = err;
	complete_request(mdsc, req);
	goto out;
}

/*
 * called under mdsc->mutex
 */
static void __wake_requests(struct ceph_mds_client *mdsc,
			    struct list_head *head)
{
	struct ceph_mds_request *req, *nreq;

	list_for_each_entry_safe(req, nreq, head, r_wait) {
		list_del_init(&req->r_wait);
		__do_request(mdsc, req);
	}
}

/*
 * Wake up threads with requests pending for @mds, so that they can
 * resubmit their requests to a possibly different mds.
 */
static void kick_requests(struct ceph_mds_client *mdsc, int mds)
{
	struct ceph_mds_request *req;
	struct rb_node *p;

	dout("kick_requests mds%d\n", mds);
	for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) {
		req = rb_entry(p, struct ceph_mds_request, r_node);
		if (req->r_got_unsafe)
			continue;
		if (req->r_session &&
		    req->r_session->s_mds == mds) {
			dout(" kicking tid %llu\n", req->r_tid);
			__do_request(mdsc, req);
		}
	}
}

void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
			      struct ceph_mds_request *req)
{
	dout("submit_request on %p\n", req);
	mutex_lock(&mdsc->mutex);
	__register_request(mdsc, req, NULL);
	__do_request(mdsc, req);
	mutex_unlock(&mdsc->mutex);
}

/*
 * Synchronously perform an mds request. Take care of all of the
 * session setup, forwarding, retry details.
 */
int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
			 struct inode *dir,
			 struct ceph_mds_request *req)
{
	int err;

	dout("do_request on %p\n", req);

	/* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
	if (req->r_inode)
		ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
	if (req->r_locked_dir)
		ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
	if (req->r_old_dentry)
		ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
				  CEPH_CAP_PIN);

	/* issue */
	mutex_lock(&mdsc->mutex);
	__register_request(mdsc, req, dir);
	__do_request(mdsc, req);

	if (req->r_err) {
		err = req->r_err;
		__unregister_request(mdsc, req);
		dout("do_request early error %d\n", err);
		goto out;
	}

	/* wait */
	mutex_unlock(&mdsc->mutex);
	dout("do_request waiting\n");
	if (req->r_timeout) {
		err = (long)wait_for_completion_killable_timeout(
			&req->r_completion, req->r_timeout);
		if (err == 0)
			err = -EIO;
	} else {
		err = wait_for_completion_killable(&req->r_completion);
	}
	dout("do_request waited, got %d\n", err);
	mutex_lock(&mdsc->mutex);

	/* only abort if we didn't race with a real reply */
	if (req->r_got_result) {
		err = le32_to_cpu(req->r_reply_info.head->result);
	} else if (err < 0) {
		dout("aborted request %lld with %d\n", req->r_tid, err);

		/*
		 * ensure we aren't running concurrently with
		 * ceph_fill_trace or ceph_readdir_prepopulate, which
		 * rely on locks (dir mutex) held by our caller.
		 */
		mutex_lock(&req->r_fill_mutex);
		req->r_err = err;
		req->r_aborted = true;
		mutex_unlock(&req->r_fill_mutex);

		if (req->r_locked_dir &&
		    (req->r_op & CEPH_MDS_OP_WRITE))
			ceph_invalidate_dir_request(req);
	} else {
		err = req->r_err;
	}

out:
	mutex_unlock(&mdsc->mutex);
	dout("do_request %p done, result %d\n", req, err);
	return err;
}
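
The result handling after the wait is easy to misread: the timed variant returns 0 on timeout (which the code maps to -EIO), a negative value when interrupted, and a positive remaining-time value on success. A small sketch of just that mapping (map_wait_result() is a made-up helper, with wait_ret standing in for the completion API's return value):

#include <errno.h>
#include <stdio.h>

/* timed==0 models the untimed killable wait */
static int map_wait_result(long wait_ret, int timed)
{
	if (timed && wait_ret == 0)
		return -EIO;		/* timed out */
	if (wait_ret < 0)
		return (int)wait_ret;	/* interrupted by a signal */
	return 0;			/* request completed */
}

int main(void)
{
	printf("%d\n", map_wait_result(0, 1));	/* timeout -> -EIO (-5) */
	printf("%d\n", map_wait_result(25, 1));	/* done with time to spare */
	printf("%d\n", map_wait_result(-4, 0));	/* interrupted */
	return 0;
}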

/*
 * Invalidate dir D_COMPLETE, dentry lease state on an aborted MDS
 * namespace request.
 */
void ceph_invalidate_dir_request(struct ceph_mds_request *req)
{
	struct inode *inode = req->r_locked_dir;
	struct ceph_inode_info *ci = ceph_inode(inode);

	dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode);
	spin_lock(&ci->i_ceph_lock);
	ceph_dir_clear_complete(inode);
	ci->i_release_count++;
	spin_unlock(&ci->i_ceph_lock);

	if (req->r_dentry)
		ceph_invalidate_dentry_lease(req->r_dentry);
	if (req->r_old_dentry)
		ceph_invalidate_dentry_lease(req->r_old_dentry);
}
2025 2029
2026 /* 2030 /*
2027 * Handle mds reply. 2031 * Handle mds reply.
2028 * 2032 *
2029 * We take the session mutex and parse and process the reply immediately. 2033 * We take the session mutex and parse and process the reply immediately.
2030 * This preserves the logical ordering of replies, capabilities, etc., sent 2034 * This preserves the logical ordering of replies, capabilities, etc., sent
2031 * by the MDS as they are applied to our local cache. 2035 * by the MDS as they are applied to our local cache.
2032 */ 2036 */
2033 static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) 2037 static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2034 { 2038 {
2035 struct ceph_mds_client *mdsc = session->s_mdsc; 2039 struct ceph_mds_client *mdsc = session->s_mdsc;
2036 struct ceph_mds_request *req; 2040 struct ceph_mds_request *req;
2037 struct ceph_mds_reply_head *head = msg->front.iov_base; 2041 struct ceph_mds_reply_head *head = msg->front.iov_base;
2038 struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */ 2042 struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
2039 u64 tid; 2043 u64 tid;
2040 int err, result; 2044 int err, result;
2041 int mds = session->s_mds; 2045 int mds = session->s_mds;
2042 2046
2043 if (msg->front.iov_len < sizeof(*head)) { 2047 if (msg->front.iov_len < sizeof(*head)) {
2044 pr_err("mdsc_handle_reply got corrupt (short) reply\n"); 2048 pr_err("mdsc_handle_reply got corrupt (short) reply\n");
2045 ceph_msg_dump(msg); 2049 ceph_msg_dump(msg);
2046 return; 2050 return;
2047 } 2051 }
2048 2052
2049 /* get request, session */ 2053 /* get request, session */
2050 tid = le64_to_cpu(msg->hdr.tid); 2054 tid = le64_to_cpu(msg->hdr.tid);
2051 mutex_lock(&mdsc->mutex); 2055 mutex_lock(&mdsc->mutex);
2052 req = __lookup_request(mdsc, tid); 2056 req = __lookup_request(mdsc, tid);
2053 if (!req) { 2057 if (!req) {
2054 dout("handle_reply on unknown tid %llu\n", tid); 2058 dout("handle_reply on unknown tid %llu\n", tid);
2055 mutex_unlock(&mdsc->mutex); 2059 mutex_unlock(&mdsc->mutex);
2056 return; 2060 return;
2057 } 2061 }
2058 dout("handle_reply %p\n", req); 2062 dout("handle_reply %p\n", req);
2059 2063
2060 /* correct session? */ 2064 /* correct session? */
2061 if (req->r_session != session) { 2065 if (req->r_session != session) {
2062 pr_err("mdsc_handle_reply got %llu on session mds%d" 2066 pr_err("mdsc_handle_reply got %llu on session mds%d"
2063 " not mds%d\n", tid, session->s_mds, 2067 " not mds%d\n", tid, session->s_mds,
2064 req->r_session ? req->r_session->s_mds : -1); 2068 req->r_session ? req->r_session->s_mds : -1);
2065 mutex_unlock(&mdsc->mutex); 2069 mutex_unlock(&mdsc->mutex);
2066 goto out; 2070 goto out;
2067 } 2071 }
2068 2072
2069 /* dup? */ 2073 /* dup? */
2070 if ((req->r_got_unsafe && !head->safe) || 2074 if ((req->r_got_unsafe && !head->safe) ||
2071 (req->r_got_safe && head->safe)) { 2075 (req->r_got_safe && head->safe)) {
2072 pr_warning("got a dup %s reply on %llu from mds%d\n", 2076 pr_warning("got a dup %s reply on %llu from mds%d\n",
2073 head->safe ? "safe" : "unsafe", tid, mds); 2077 head->safe ? "safe" : "unsafe", tid, mds);
2074 mutex_unlock(&mdsc->mutex); 2078 mutex_unlock(&mdsc->mutex);
2075 goto out; 2079 goto out;
2076 } 2080 }
2077 if (req->r_got_safe && !head->safe) { 2081 if (req->r_got_safe && !head->safe) {
2078 pr_warning("got unsafe after safe on %llu from mds%d\n", 2082 pr_warning("got unsafe after safe on %llu from mds%d\n",
2079 tid, mds); 2083 tid, mds);
2080 mutex_unlock(&mdsc->mutex); 2084 mutex_unlock(&mdsc->mutex);
2081 goto out; 2085 goto out;
2082 } 2086 }
2083 2087
2084 result = le32_to_cpu(head->result); 2088 result = le32_to_cpu(head->result);
2085 2089
2086 /* 2090 /*
2087 * Handle an ESTALE 2091 * Handle an ESTALE
2088 * if we're not talking to the authority, send to them 2092 * if we're not talking to the authority, send to them
2089 * if the authority has changed while we weren't looking, 2093 * if the authority has changed while we weren't looking,
2090 * send to new authority 2094 * send to new authority
2091 * Otherwise we just have to return an ESTALE 2095 * Otherwise we just have to return an ESTALE
2092 */ 2096 */
2093 if (result == -ESTALE) { 2097 if (result == -ESTALE) {
2094 dout("got ESTALE on request %llu", req->r_tid); 2098 dout("got ESTALE on request %llu", req->r_tid);
2095 if (!req->r_inode) { 2099 if (!req->r_inode) {
2096 /* do nothing; not an authority problem */ 2100 /* do nothing; not an authority problem */
2097 } else if (req->r_direct_mode != USE_AUTH_MDS) { 2101 } else if (req->r_direct_mode != USE_AUTH_MDS) {
2098 dout("not using auth, setting for that now"); 2102 dout("not using auth, setting for that now");
2099 req->r_direct_mode = USE_AUTH_MDS; 2103 req->r_direct_mode = USE_AUTH_MDS;
2100 __do_request(mdsc, req); 2104 __do_request(mdsc, req);
2101 mutex_unlock(&mdsc->mutex); 2105 mutex_unlock(&mdsc->mutex);
2102 goto out; 2106 goto out;
2103 } else { 2107 } else {
2104 struct ceph_inode_info *ci = ceph_inode(req->r_inode); 2108 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
2105 struct ceph_cap *cap = NULL; 2109 struct ceph_cap *cap = NULL;
2106 2110
2107 if (req->r_session) 2111 if (req->r_session)
2108 cap = ceph_get_cap_for_mds(ci, 2112 cap = ceph_get_cap_for_mds(ci,
2109 req->r_session->s_mds); 2113 req->r_session->s_mds);
2110 2114
2111 dout("already using auth"); 2115 dout("already using auth");
2112 if ((!cap || cap != ci->i_auth_cap) || 2116 if ((!cap || cap != ci->i_auth_cap) ||
2113 (cap->mseq != req->r_sent_on_mseq)) { 2117 (cap->mseq != req->r_sent_on_mseq)) {
2114 dout("but cap changed, so resending"); 2118 dout("but cap changed, so resending");
2115 __do_request(mdsc, req); 2119 __do_request(mdsc, req);
2116 mutex_unlock(&mdsc->mutex); 2120 mutex_unlock(&mdsc->mutex);
2117 goto out; 2121 goto out;
2118 } 2122 }
2119 } 2123 }
2120 dout("have to return ESTALE on request %llu", req->r_tid); 2124 dout("have to return ESTALE on request %llu", req->r_tid);
2121 } 2125 }
2122 2126
2123 2127
2124 if (head->safe) { 2128 if (head->safe) {
2125 req->r_got_safe = true; 2129 req->r_got_safe = true;
2126 __unregister_request(mdsc, req); 2130 __unregister_request(mdsc, req);
2127 complete_all(&req->r_safe_completion); 2131 complete_all(&req->r_safe_completion);
2128 2132
2129 if (req->r_got_unsafe) { 2133 if (req->r_got_unsafe) {
2130 /* 2134 /*
2131 * We already handled the unsafe response, now do the 2135 * We already handled the unsafe response, now do the
2132 * cleanup. No need to examine the response; the MDS 2136 * cleanup. No need to examine the response; the MDS
2133 * doesn't include any result info in the safe 2137 * doesn't include any result info in the safe
2134 * response. And even if it did, there is nothing 2138 * response. And even if it did, there is nothing
2135 * useful we could do with a revised return value. 2139 * useful we could do with a revised return value.
2136 */ 2140 */
2137 dout("got safe reply %llu, mds%d\n", tid, mds); 2141 dout("got safe reply %llu, mds%d\n", tid, mds);
2138 list_del_init(&req->r_unsafe_item); 2142 list_del_init(&req->r_unsafe_item);
2139 2143
2140 /* last unsafe request during umount? */ 2144 /* last unsafe request during umount? */
2141 if (mdsc->stopping && !__get_oldest_req(mdsc)) 2145 if (mdsc->stopping && !__get_oldest_req(mdsc))
2142 complete_all(&mdsc->safe_umount_waiters); 2146 complete_all(&mdsc->safe_umount_waiters);
2143 mutex_unlock(&mdsc->mutex); 2147 mutex_unlock(&mdsc->mutex);
2144 goto out; 2148 goto out;
2145 } 2149 }
2146 } else { 2150 } else {
2147 req->r_got_unsafe = true; 2151 req->r_got_unsafe = true;
2148 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); 2152 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
2149 } 2153 }
2150 2154
2151 dout("handle_reply tid %lld result %d\n", tid, result); 2155 dout("handle_reply tid %lld result %d\n", tid, result);
2152 rinfo = &req->r_reply_info; 2156 rinfo = &req->r_reply_info;
2153 err = parse_reply_info(msg, rinfo, session->s_con.peer_features); 2157 err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
2154 mutex_unlock(&mdsc->mutex); 2158 mutex_unlock(&mdsc->mutex);
2155 2159
2156 mutex_lock(&session->s_mutex); 2160 mutex_lock(&session->s_mutex);
2157 if (err < 0) { 2161 if (err < 0) {
2158 pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid); 2162 pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
2159 ceph_msg_dump(msg); 2163 ceph_msg_dump(msg);
2160 goto out_err; 2164 goto out_err;
2161 } 2165 }
2162 2166
2163 /* snap trace */ 2167 /* snap trace */
2164 if (rinfo->snapblob_len) { 2168 if (rinfo->snapblob_len) {
2165 down_write(&mdsc->snap_rwsem); 2169 down_write(&mdsc->snap_rwsem);
2166 ceph_update_snap_trace(mdsc, rinfo->snapblob, 2170 ceph_update_snap_trace(mdsc, rinfo->snapblob,
2167 rinfo->snapblob + rinfo->snapblob_len, 2171 rinfo->snapblob + rinfo->snapblob_len,
2168 le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP); 2172 le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
2169 downgrade_write(&mdsc->snap_rwsem); 2173 downgrade_write(&mdsc->snap_rwsem);
2170 } else { 2174 } else {
2171 down_read(&mdsc->snap_rwsem); 2175 down_read(&mdsc->snap_rwsem);
2172 } 2176 }
2173 2177
2174 /* insert trace into our cache */ 2178 /* insert trace into our cache */
2175 mutex_lock(&req->r_fill_mutex); 2179 mutex_lock(&req->r_fill_mutex);
2176 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); 2180 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
2177 if (err == 0) { 2181 if (err == 0) {
2178 if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK && 2182 if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK &&
2179 rinfo->dir_nr) 2183 rinfo->dir_nr)
2180 ceph_readdir_prepopulate(req, req->r_session); 2184 ceph_readdir_prepopulate(req, req->r_session);
2181 ceph_unreserve_caps(mdsc, &req->r_caps_reservation); 2185 ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
2182 } 2186 }
2183 mutex_unlock(&req->r_fill_mutex); 2187 mutex_unlock(&req->r_fill_mutex);
2184 2188
2185 up_read(&mdsc->snap_rwsem); 2189 up_read(&mdsc->snap_rwsem);
2186 out_err: 2190 out_err:
2187 mutex_lock(&mdsc->mutex); 2191 mutex_lock(&mdsc->mutex);
2188 if (!req->r_aborted) { 2192 if (!req->r_aborted) {
2189 if (err) { 2193 if (err) {
2190 req->r_err = err; 2194 req->r_err = err;
2191 } else { 2195 } else {
2192 req->r_reply = msg; 2196 req->r_reply = msg;
2193 ceph_msg_get(msg); 2197 ceph_msg_get(msg);
2194 req->r_got_result = true; 2198 req->r_got_result = true;
2195 } 2199 }
2196 } else { 2200 } else {
2197 dout("reply arrived after request %lld was aborted\n", tid); 2201 dout("reply arrived after request %lld was aborted\n", tid);
2198 } 2202 }
2199 mutex_unlock(&mdsc->mutex); 2203 mutex_unlock(&mdsc->mutex);
2200 2204
2201 ceph_add_cap_releases(mdsc, req->r_session); 2205 ceph_add_cap_releases(mdsc, req->r_session);
2202 mutex_unlock(&session->s_mutex); 2206 mutex_unlock(&session->s_mutex);
2203 2207
2204 /* kick calling process */ 2208 /* kick calling process */
2205 complete_request(mdsc, req); 2209 complete_request(mdsc, req);
2206 out: 2210 out:
2207 ceph_mdsc_put_request(req); 2211 ceph_mdsc_put_request(req);
2208 return; 2212 return;
2209 } 2213 }
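
The dup/ordering checks near the top of handle_reply() implement a small two-phase protocol: each request may see at most one "unsafe" (applied) reply followed by at most one "safe" (committed) reply, never the reverse. A hedged userspace sketch of just that bookkeeping; `struct request` here is a stand-in, not the kernel's `ceph_mds_request`:

```c
#include <stdbool.h>
#include <stdio.h>

struct request {
    bool got_unsafe;   /* applied by the MDS, not yet durable */
    bool got_safe;     /* committed; the request can be unregistered */
};

/* Returns 0 if the reply should be processed, -1 if it must be dropped. */
static int check_reply(struct request *req, bool safe)
{
    if ((req->got_unsafe && !safe) || (req->got_safe && safe))
        return -1;                 /* duplicate reply */
    if (req->got_safe && !safe)
        return -1;                 /* unsafe after safe: out of order */
    if (safe)
        req->got_safe = true;
    else
        req->got_unsafe = true;
    return 0;
}

int main(void)
{
    struct request req = { false, false };
    printf("unsafe: %d\n", check_reply(&req, false)); /* 0: accepted */
    printf("safe:   %d\n", check_reply(&req, true));  /* 0: accepted */
    printf("dup:    %d\n", check_reply(&req, true));  /* -1: dropped */
    return 0;
}
```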
2210 2214
2211 2215
2212 2216
2213 /* 2217 /*
2214 * handle mds notification that our request has been forwarded. 2218 * handle mds notification that our request has been forwarded.
2215 */ 2219 */
2216 static void handle_forward(struct ceph_mds_client *mdsc, 2220 static void handle_forward(struct ceph_mds_client *mdsc,
2217 struct ceph_mds_session *session, 2221 struct ceph_mds_session *session,
2218 struct ceph_msg *msg) 2222 struct ceph_msg *msg)
2219 { 2223 {
2220 struct ceph_mds_request *req; 2224 struct ceph_mds_request *req;
2221 u64 tid = le64_to_cpu(msg->hdr.tid); 2225 u64 tid = le64_to_cpu(msg->hdr.tid);
2222 u32 next_mds; 2226 u32 next_mds;
2223 u32 fwd_seq; 2227 u32 fwd_seq;
2224 int err = -EINVAL; 2228 int err = -EINVAL;
2225 void *p = msg->front.iov_base; 2229 void *p = msg->front.iov_base;
2226 void *end = p + msg->front.iov_len; 2230 void *end = p + msg->front.iov_len;
2227 2231
2228 ceph_decode_need(&p, end, 2*sizeof(u32), bad); 2232 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
2229 next_mds = ceph_decode_32(&p); 2233 next_mds = ceph_decode_32(&p);
2230 fwd_seq = ceph_decode_32(&p); 2234 fwd_seq = ceph_decode_32(&p);
2231 2235
2232 mutex_lock(&mdsc->mutex); 2236 mutex_lock(&mdsc->mutex);
2233 req = __lookup_request(mdsc, tid); 2237 req = __lookup_request(mdsc, tid);
2234 if (!req) { 2238 if (!req) {
2235 dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); 2239 dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
2236 goto out; /* dup reply? */ 2240 goto out; /* dup reply? */
2237 } 2241 }
2238 2242
2239 if (req->r_aborted) { 2243 if (req->r_aborted) {
2240 dout("forward tid %llu aborted, unregistering\n", tid); 2244 dout("forward tid %llu aborted, unregistering\n", tid);
2241 __unregister_request(mdsc, req); 2245 __unregister_request(mdsc, req);
2242 } else if (fwd_seq <= req->r_num_fwd) { 2246 } else if (fwd_seq <= req->r_num_fwd) {
2243 dout("forward tid %llu to mds%d - old seq %d <= %d\n", 2247 dout("forward tid %llu to mds%d - old seq %d <= %d\n",
2244 tid, next_mds, fwd_seq, req->r_num_fwd); 2248 tid, next_mds, fwd_seq, req->r_num_fwd);
2245 } else { 2249 } else {
2246 /* resend. forward race not possible; mds would drop */ 2250 /* resend. forward race not possible; mds would drop */
2247 dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); 2251 dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
2248 BUG_ON(req->r_err); 2252 BUG_ON(req->r_err);
2249 BUG_ON(req->r_got_result); 2253 BUG_ON(req->r_got_result);
2250 req->r_num_fwd = fwd_seq; 2254 req->r_num_fwd = fwd_seq;
2251 req->r_resend_mds = next_mds; 2255 req->r_resend_mds = next_mds;
2252 put_request_session(req); 2256 put_request_session(req);
2253 __do_request(mdsc, req); 2257 __do_request(mdsc, req);
2254 } 2258 }
2255 ceph_mdsc_put_request(req); 2259 ceph_mdsc_put_request(req);
2256 out: 2260 out:
2257 mutex_unlock(&mdsc->mutex); 2261 mutex_unlock(&mdsc->mutex);
2258 return; 2262 return;
2259 2263
2260 bad: 2264 bad:
2261 pr_err("mdsc_handle_forward decode error err=%d\n", err); 2265 pr_err("mdsc_handle_forward decode error err=%d\n", err);
2262 } 2266 }
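
The load-bearing check in handle_forward() is the sequence comparison: a forward notification is acted on only if fwd_seq is strictly newer than the last forward recorded on the request, so a replayed or reordered notification is ignored rather than triggering a spurious resend. A stand-alone sketch (field names mirror the kernel's; the types are invented):

```c
#include <stdint.h>
#include <stdio.h>

struct request {
    uint32_t num_fwd;    /* highest forward seq accepted so far */
    int resend_mds;      /* where to resend next; -1 = unset */
};

static int handle_forward(struct request *req, uint32_t fwd_seq, int next_mds)
{
    if (fwd_seq <= req->num_fwd)
        return -1;               /* old or duplicate forward: drop it */
    req->num_fwd = fwd_seq;      /* remember how far we have been bounced */
    req->resend_mds = next_mds;  /* resend to the MDS the server named */
    return 0;
}

int main(void)
{
    struct request req = { 0, -1 };
    printf("%d\n", handle_forward(&req, 1, 3));  /* 0: accepted */
    printf("%d\n", handle_forward(&req, 1, 5));  /* -1: replay ignored */
    return 0;
}
```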
2263 2267
2264 /* 2268 /*
2265 * handle a mds session control message 2269 * handle a mds session control message
2266 */ 2270 */
2267 static void handle_session(struct ceph_mds_session *session, 2271 static void handle_session(struct ceph_mds_session *session,
2268 struct ceph_msg *msg) 2272 struct ceph_msg *msg)
2269 { 2273 {
2270 struct ceph_mds_client *mdsc = session->s_mdsc; 2274 struct ceph_mds_client *mdsc = session->s_mdsc;
2271 u32 op; 2275 u32 op;
2272 u64 seq; 2276 u64 seq;
2273 int mds = session->s_mds; 2277 int mds = session->s_mds;
2274 struct ceph_mds_session_head *h = msg->front.iov_base; 2278 struct ceph_mds_session_head *h = msg->front.iov_base;
2275 int wake = 0; 2279 int wake = 0;
2276 2280
2277 /* decode */ 2281 /* decode */
2278 if (msg->front.iov_len != sizeof(*h)) 2282 if (msg->front.iov_len != sizeof(*h))
2279 goto bad; 2283 goto bad;
2280 op = le32_to_cpu(h->op); 2284 op = le32_to_cpu(h->op);
2281 seq = le64_to_cpu(h->seq); 2285 seq = le64_to_cpu(h->seq);
2282 2286
2283 mutex_lock(&mdsc->mutex); 2287 mutex_lock(&mdsc->mutex);
2284 if (op == CEPH_SESSION_CLOSE) 2288 if (op == CEPH_SESSION_CLOSE)
2285 __unregister_session(mdsc, session); 2289 __unregister_session(mdsc, session);
2286 /* FIXME: this ttl calculation is generous */ 2290 /* FIXME: this ttl calculation is generous */
2287 session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose; 2291 session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
2288 mutex_unlock(&mdsc->mutex); 2292 mutex_unlock(&mdsc->mutex);
2289 2293
2290 mutex_lock(&session->s_mutex); 2294 mutex_lock(&session->s_mutex);
2291 2295
2292 dout("handle_session mds%d %s %p state %s seq %llu\n", 2296 dout("handle_session mds%d %s %p state %s seq %llu\n",
2293 mds, ceph_session_op_name(op), session, 2297 mds, ceph_session_op_name(op), session,
2294 session_state_name(session->s_state), seq); 2298 session_state_name(session->s_state), seq);
2295 2299
2296 if (session->s_state == CEPH_MDS_SESSION_HUNG) { 2300 if (session->s_state == CEPH_MDS_SESSION_HUNG) {
2297 session->s_state = CEPH_MDS_SESSION_OPEN; 2301 session->s_state = CEPH_MDS_SESSION_OPEN;
2298 pr_info("mds%d came back\n", session->s_mds); 2302 pr_info("mds%d came back\n", session->s_mds);
2299 } 2303 }
2300 2304
2301 switch (op) { 2305 switch (op) {
2302 case CEPH_SESSION_OPEN: 2306 case CEPH_SESSION_OPEN:
2303 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) 2307 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2304 pr_info("mds%d reconnect success\n", session->s_mds); 2308 pr_info("mds%d reconnect success\n", session->s_mds);
2305 session->s_state = CEPH_MDS_SESSION_OPEN; 2309 session->s_state = CEPH_MDS_SESSION_OPEN;
2306 renewed_caps(mdsc, session, 0); 2310 renewed_caps(mdsc, session, 0);
2307 wake = 1; 2311 wake = 1;
2308 if (mdsc->stopping) 2312 if (mdsc->stopping)
2309 __close_session(mdsc, session); 2313 __close_session(mdsc, session);
2310 break; 2314 break;
2311 2315
2312 case CEPH_SESSION_RENEWCAPS: 2316 case CEPH_SESSION_RENEWCAPS:
2313 if (session->s_renew_seq == seq) 2317 if (session->s_renew_seq == seq)
2314 renewed_caps(mdsc, session, 1); 2318 renewed_caps(mdsc, session, 1);
2315 break; 2319 break;
2316 2320
2317 case CEPH_SESSION_CLOSE: 2321 case CEPH_SESSION_CLOSE:
2318 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) 2322 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2319 pr_info("mds%d reconnect denied\n", session->s_mds); 2323 pr_info("mds%d reconnect denied\n", session->s_mds);
2320 remove_session_caps(session); 2324 remove_session_caps(session);
2321 wake = 1; /* for good measure */ 2325 wake = 1; /* for good measure */
2322 wake_up_all(&mdsc->session_close_wq); 2326 wake_up_all(&mdsc->session_close_wq);
2323 kick_requests(mdsc, mds); 2327 kick_requests(mdsc, mds);
2324 break; 2328 break;
2325 2329
2326 case CEPH_SESSION_STALE: 2330 case CEPH_SESSION_STALE:
2327 pr_info("mds%d caps went stale, renewing\n", 2331 pr_info("mds%d caps went stale, renewing\n",
2328 session->s_mds); 2332 session->s_mds);
2329 spin_lock(&session->s_cap_lock); 2333 spin_lock(&session->s_gen_ttl_lock);
2330 session->s_cap_gen++; 2334 session->s_cap_gen++;
2331 session->s_cap_ttl = 0; 2335 session->s_cap_ttl = 0;
2332 spin_unlock(&session->s_cap_lock); 2336 spin_unlock(&session->s_gen_ttl_lock);
2333 send_renew_caps(mdsc, session); 2337 send_renew_caps(mdsc, session);
2334 break; 2338 break;
2335 2339
2336 case CEPH_SESSION_RECALL_STATE: 2340 case CEPH_SESSION_RECALL_STATE:
2337 trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); 2341 trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
2338 break; 2342 break;
2339 2343
2340 default: 2344 default:
2341 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); 2345 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
2342 WARN_ON(1); 2346 WARN_ON(1);
2343 } 2347 }
2344 2348
2345 mutex_unlock(&session->s_mutex); 2349 mutex_unlock(&session->s_mutex);
2346 if (wake) { 2350 if (wake) {
2347 mutex_lock(&mdsc->mutex); 2351 mutex_lock(&mdsc->mutex);
2348 __wake_requests(mdsc, &session->s_waiting); 2352 __wake_requests(mdsc, &session->s_waiting);
2349 mutex_unlock(&mdsc->mutex); 2353 mutex_unlock(&mdsc->mutex);
2350 } 2354 }
2351 return; 2355 return;
2352 2356
2353 bad: 2357 bad:
2354 pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds, 2358 pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
2355 (int)msg->front.iov_len); 2359 (int)msg->front.iov_len);
2356 ceph_msg_dump(msg); 2360 ceph_msg_dump(msg);
2357 return; 2361 return;
2358 } 2362 }
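
The CEPH_SESSION_STALE hunk above is the visible edge of this diff's lock split: the cap generation/TTL pair moves from s_cap_lock to a dedicated s_gen_ttl_lock, so paths that only need to check cap validity never nest inside the heavier cap lock. A userspace sketch of that split, with a mutex standing in for the spinlock and the surrounding types invented for illustration:

```c
#include <pthread.h>
#include <time.h>

struct session {
    pthread_mutex_t gen_ttl_lock;  /* guards only the two fields below */
    unsigned cap_gen;
    time_t   cap_ttl;

    pthread_mutex_t cap_lock;      /* guards the (elided) cap lists */
};

/* STALE handling: bump the generation and kill the TTL, as above. */
static void caps_went_stale(struct session *s)
{
    pthread_mutex_lock(&s->gen_ttl_lock);
    s->cap_gen++;
    s->cap_ttl = 0;
    pthread_mutex_unlock(&s->gen_ttl_lock);
}

/* A validity check now runs without ever touching cap_lock. */
static int caps_valid(struct session *s, unsigned gen_seen)
{
    pthread_mutex_lock(&s->gen_ttl_lock);
    int ok = (gen_seen == s->cap_gen) && (s->cap_ttl > time(NULL));
    pthread_mutex_unlock(&s->gen_ttl_lock);
    return ok;
}

int main(void)
{
    struct session s = {
        .gen_ttl_lock = PTHREAD_MUTEX_INITIALIZER,
        .cap_lock     = PTHREAD_MUTEX_INITIALIZER,
    };
    caps_went_stale(&s);       /* gen 0 -> 1, ttl -> 0 */
    return caps_valid(&s, 0);  /* exits 0: the old generation is stale */
}
```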
2359 2363
2360 2364
2361 /* 2365 /*
2362 * called under session->mutex. 2366 * called under session->mutex.
2363 */ 2367 */
2364 static void replay_unsafe_requests(struct ceph_mds_client *mdsc, 2368 static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2365 struct ceph_mds_session *session) 2369 struct ceph_mds_session *session)
2366 { 2370 {
2367 struct ceph_mds_request *req, *nreq; 2371 struct ceph_mds_request *req, *nreq;
2368 int err; 2372 int err;
2369 2373
2370 dout("replay_unsafe_requests mds%d\n", session->s_mds); 2374 dout("replay_unsafe_requests mds%d\n", session->s_mds);
2371 2375
2372 mutex_lock(&mdsc->mutex); 2376 mutex_lock(&mdsc->mutex);
2373 list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) { 2377 list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
2374 err = __prepare_send_request(mdsc, req, session->s_mds); 2378 err = __prepare_send_request(mdsc, req, session->s_mds);
2375 if (!err) { 2379 if (!err) {
2376 ceph_msg_get(req->r_request); 2380 ceph_msg_get(req->r_request);
2377 ceph_con_send(&session->s_con, req->r_request); 2381 ceph_con_send(&session->s_con, req->r_request);
2378 } 2382 }
2379 } 2383 }
2380 mutex_unlock(&mdsc->mutex); 2384 mutex_unlock(&mdsc->mutex);
2381 } 2385 }
2382 2386
2383 /* 2387 /*
2384 * Encode information about a cap for a reconnect with the MDS. 2388 * Encode information about a cap for a reconnect with the MDS.
2385 */ 2389 */
2386 static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, 2390 static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2387 void *arg) 2391 void *arg)
2388 { 2392 {
2389 union { 2393 union {
2390 struct ceph_mds_cap_reconnect v2; 2394 struct ceph_mds_cap_reconnect v2;
2391 struct ceph_mds_cap_reconnect_v1 v1; 2395 struct ceph_mds_cap_reconnect_v1 v1;
2392 } rec; 2396 } rec;
2393 size_t reclen; 2397 size_t reclen;
2394 struct ceph_inode_info *ci; 2398 struct ceph_inode_info *ci;
2395 struct ceph_reconnect_state *recon_state = arg; 2399 struct ceph_reconnect_state *recon_state = arg;
2396 struct ceph_pagelist *pagelist = recon_state->pagelist; 2400 struct ceph_pagelist *pagelist = recon_state->pagelist;
2397 char *path; 2401 char *path;
2398 int pathlen, err; 2402 int pathlen, err;
2399 u64 pathbase; 2403 u64 pathbase;
2400 struct dentry *dentry; 2404 struct dentry *dentry;
2401 2405
2402 ci = cap->ci; 2406 ci = cap->ci;
2403 2407
2404 dout(" adding %p ino %llx.%llx cap %p %lld %s\n", 2408 dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
2405 inode, ceph_vinop(inode), cap, cap->cap_id, 2409 inode, ceph_vinop(inode), cap, cap->cap_id,
2406 ceph_cap_string(cap->issued)); 2410 ceph_cap_string(cap->issued));
2407 err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); 2411 err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
2408 if (err) 2412 if (err)
2409 return err; 2413 return err;
2410 2414
2411 dentry = d_find_alias(inode); 2415 dentry = d_find_alias(inode);
2412 if (dentry) { 2416 if (dentry) {
2413 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0); 2417 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
2414 if (IS_ERR(path)) { 2418 if (IS_ERR(path)) {
2415 err = PTR_ERR(path); 2419 err = PTR_ERR(path);
2416 goto out_dput; 2420 goto out_dput;
2417 } 2421 }
2418 } else { 2422 } else {
2419 path = NULL; 2423 path = NULL;
2420 pathlen = 0; 2424 pathlen = 0;
2421 } 2425 }
2422 err = ceph_pagelist_encode_string(pagelist, path, pathlen); 2426 err = ceph_pagelist_encode_string(pagelist, path, pathlen);
2423 if (err) 2427 if (err)
2424 goto out_free; 2428 goto out_free;
2425 2429
2426 spin_lock(&ci->i_ceph_lock); 2430 spin_lock(&ci->i_ceph_lock);
2427 cap->seq = 0; /* reset cap seq */ 2431 cap->seq = 0; /* reset cap seq */
2428 cap->issue_seq = 0; /* and issue_seq */ 2432 cap->issue_seq = 0; /* and issue_seq */
2429 2433
2430 if (recon_state->flock) { 2434 if (recon_state->flock) {
2431 rec.v2.cap_id = cpu_to_le64(cap->cap_id); 2435 rec.v2.cap_id = cpu_to_le64(cap->cap_id);
2432 rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); 2436 rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2433 rec.v2.issued = cpu_to_le32(cap->issued); 2437 rec.v2.issued = cpu_to_le32(cap->issued);
2434 rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); 2438 rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2435 rec.v2.pathbase = cpu_to_le64(pathbase); 2439 rec.v2.pathbase = cpu_to_le64(pathbase);
2436 rec.v2.flock_len = 0; 2440 rec.v2.flock_len = 0;
2437 reclen = sizeof(rec.v2); 2441 reclen = sizeof(rec.v2);
2438 } else { 2442 } else {
2439 rec.v1.cap_id = cpu_to_le64(cap->cap_id); 2443 rec.v1.cap_id = cpu_to_le64(cap->cap_id);
2440 rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); 2444 rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2441 rec.v1.issued = cpu_to_le32(cap->issued); 2445 rec.v1.issued = cpu_to_le32(cap->issued);
2442 rec.v1.size = cpu_to_le64(inode->i_size); 2446 rec.v1.size = cpu_to_le64(inode->i_size);
2443 ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime); 2447 ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime);
2444 ceph_encode_timespec(&rec.v1.atime, &inode->i_atime); 2448 ceph_encode_timespec(&rec.v1.atime, &inode->i_atime);
2445 rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); 2449 rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2446 rec.v1.pathbase = cpu_to_le64(pathbase); 2450 rec.v1.pathbase = cpu_to_le64(pathbase);
2447 reclen = sizeof(rec.v1); 2451 reclen = sizeof(rec.v1);
2448 } 2452 }
2449 spin_unlock(&ci->i_ceph_lock); 2453 spin_unlock(&ci->i_ceph_lock);
2450 2454
2451 if (recon_state->flock) { 2455 if (recon_state->flock) {
2452 int num_fcntl_locks, num_flock_locks; 2456 int num_fcntl_locks, num_flock_locks;
2453 struct ceph_pagelist_cursor trunc_point; 2457 struct ceph_pagelist_cursor trunc_point;
2454 2458
2455 ceph_pagelist_set_cursor(pagelist, &trunc_point); 2459 ceph_pagelist_set_cursor(pagelist, &trunc_point);
2456 do { 2460 do {
2457 lock_flocks(); 2461 lock_flocks();
2458 ceph_count_locks(inode, &num_fcntl_locks, 2462 ceph_count_locks(inode, &num_fcntl_locks,
2459 &num_flock_locks); 2463 &num_flock_locks);
2460 rec.v2.flock_len = (2*sizeof(u32) + 2464 rec.v2.flock_len = (2*sizeof(u32) +
2461 (num_fcntl_locks+num_flock_locks) * 2465 (num_fcntl_locks+num_flock_locks) *
2462 sizeof(struct ceph_filelock)); 2466 sizeof(struct ceph_filelock));
2463 unlock_flocks(); 2467 unlock_flocks();
2464 2468
2465 /* pre-alloc pagelist */ 2469 /* pre-alloc pagelist */
2466 ceph_pagelist_truncate(pagelist, &trunc_point); 2470 ceph_pagelist_truncate(pagelist, &trunc_point);
2467 err = ceph_pagelist_append(pagelist, &rec, reclen); 2471 err = ceph_pagelist_append(pagelist, &rec, reclen);
2468 if (!err) 2472 if (!err)
2469 err = ceph_pagelist_reserve(pagelist, 2473 err = ceph_pagelist_reserve(pagelist,
2470 rec.v2.flock_len); 2474 rec.v2.flock_len);
2471 2475
2472 /* encode locks */ 2476 /* encode locks */
2473 if (!err) { 2477 if (!err) {
2474 lock_flocks(); 2478 lock_flocks();
2475 err = ceph_encode_locks(inode, 2479 err = ceph_encode_locks(inode,
2476 pagelist, 2480 pagelist,
2477 num_fcntl_locks, 2481 num_fcntl_locks,
2478 num_flock_locks); 2482 num_flock_locks);
2479 unlock_flocks(); 2483 unlock_flocks();
2480 } 2484 }
2481 } while (err == -ENOSPC); 2485 } while (err == -ENOSPC);
2482 } else { 2486 } else {
2483 err = ceph_pagelist_append(pagelist, &rec, reclen); 2487 err = ceph_pagelist_append(pagelist, &rec, reclen);
2484 } 2488 }
2485 2489
2486 out_free: 2490 out_free:
2487 kfree(path); 2491 kfree(path);
2488 out_dput: 2492 out_dput:
2489 dput(dentry); 2493 dput(dentry);
2490 return err; 2494 return err;
2491 } 2495 }
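
The flock branch of encode_caps_cb() has to cope with the lock table changing between the sizing pass and the encoding pass, so it saves a pagelist cursor, reserves space from the measured size, and on -ENOSPC truncates back to the cursor and tries again. A self-contained sketch of that measure/reserve/encode retry against a plain fixed-size buffer; every name is invented (the kernel works on a ceph_pagelist, not this `struct buf`):

```c
#include <errno.h>
#include <stdio.h>
#include <string.h>

#define CAP 64

struct buf { char data[CAP]; size_t len; };

static size_t cursor(struct buf *b)                { return b->len; }
static void   truncate_to(struct buf *b, size_t c) { b->len = c; }

static int append(struct buf *b, const void *p, size_t n)
{
    if (b->len + n > CAP)
        return -ENOSPC;        /* mirrors the pagelist running out */
    memcpy(b->data + b->len, p, n);
    b->len += n;
    return 0;
}

/* Pretend lock table; the kernel reads it under lock_flocks(). */
static int count_locks(void) { return 3; }

static int encode_locks(struct buf *b, int n)
{
    for (int i = 0; i < n; i++) {
        int rc = append(b, &i, sizeof(i));
        if (rc)
            return rc;
    }
    return 0;
}

int main(void)
{
    struct buf b = { .len = 0 };
    size_t trunc_point = cursor(&b);   /* like ceph_pagelist_set_cursor() */
    int err;
    do {
        int n = count_locks();         /* measure (may change next round) */
        truncate_to(&b, trunc_point);  /* discard any partial attempt */
        err = encode_locks(&b, n);     /* encode; may hit -ENOSPC */
    } while (err == -ENOSPC);
    printf("encoded %zu bytes, err=%d\n", b.len, err);
    return 0;
}
```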
2492 2496
2493 2497
2494 /* 2498 /*
2495 * If an MDS fails and recovers, clients need to reconnect in order to 2499 * If an MDS fails and recovers, clients need to reconnect in order to
2496 * reestablish shared state. This includes all caps issued through 2500 * reestablish shared state. This includes all caps issued through
2497 * this session _and_ the snap_realm hierarchy. Because it's not 2501 * this session _and_ the snap_realm hierarchy. Because it's not
2498 * clear which snap realms the mds cares about, we send everything we 2502 * clear which snap realms the mds cares about, we send everything we
2499 * know about... that ensures we'll then get any new info the 2503 * know about... that ensures we'll then get any new info the
2500 * recovering MDS might have. 2504 * recovering MDS might have.
2501 * 2505 *
2502 * This is a relatively heavyweight operation, but it's rare. 2506 * This is a relatively heavyweight operation, but it's rare.
2503 * 2507 *
2504 * called with mdsc->mutex held. 2508 * called with mdsc->mutex held.
2505 */ 2509 */
2506 static void send_mds_reconnect(struct ceph_mds_client *mdsc, 2510 static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2507 struct ceph_mds_session *session) 2511 struct ceph_mds_session *session)
2508 { 2512 {
2509 struct ceph_msg *reply; 2513 struct ceph_msg *reply;
2510 struct rb_node *p; 2514 struct rb_node *p;
2511 int mds = session->s_mds; 2515 int mds = session->s_mds;
2512 int err = -ENOMEM; 2516 int err = -ENOMEM;
2513 struct ceph_pagelist *pagelist; 2517 struct ceph_pagelist *pagelist;
2514 struct ceph_reconnect_state recon_state; 2518 struct ceph_reconnect_state recon_state;
2515 2519
2516 pr_info("mds%d reconnect start\n", mds); 2520 pr_info("mds%d reconnect start\n", mds);
2517 2521
2518 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); 2522 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2519 if (!pagelist) 2523 if (!pagelist)
2520 goto fail_nopagelist; 2524 goto fail_nopagelist;
2521 ceph_pagelist_init(pagelist); 2525 ceph_pagelist_init(pagelist);
2522 2526
2523 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false); 2527 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false);
2524 if (!reply) 2528 if (!reply)
2525 goto fail_nomsg; 2529 goto fail_nomsg;
2526 2530
2527 mutex_lock(&session->s_mutex); 2531 mutex_lock(&session->s_mutex);
2528 session->s_state = CEPH_MDS_SESSION_RECONNECTING; 2532 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2529 session->s_seq = 0; 2533 session->s_seq = 0;
2530 2534
2531 ceph_con_open(&session->s_con, 2535 ceph_con_open(&session->s_con,
2532 ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); 2536 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2533 2537
2534 /* replay unsafe requests */ 2538 /* replay unsafe requests */
2535 replay_unsafe_requests(mdsc, session); 2539 replay_unsafe_requests(mdsc, session);
2536 2540
2537 down_read(&mdsc->snap_rwsem); 2541 down_read(&mdsc->snap_rwsem);
2538 2542
2539 dout("session %p state %s\n", session, 2543 dout("session %p state %s\n", session,
2540 session_state_name(session->s_state)); 2544 session_state_name(session->s_state));
2541 2545
2542 /* drop old cap expires; we're about to reestablish that state */ 2546 /* drop old cap expires; we're about to reestablish that state */
2543 discard_cap_releases(mdsc, session); 2547 discard_cap_releases(mdsc, session);
2544 2548
2545 /* traverse this session's caps */ 2549 /* traverse this session's caps */
2546 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); 2550 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2547 if (err) 2551 if (err)
2548 goto fail; 2552 goto fail;
2549 2553
2550 recon_state.pagelist = pagelist; 2554 recon_state.pagelist = pagelist;
2551 recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK; 2555 recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
2552 err = iterate_session_caps(session, encode_caps_cb, &recon_state); 2556 err = iterate_session_caps(session, encode_caps_cb, &recon_state);
2553 if (err < 0) 2557 if (err < 0)
2554 goto fail; 2558 goto fail;
2555 2559
2556 /* 2560 /*
2557 * snaprealms. we provide mds with the ino, seq (version), and 2561 * snaprealms. we provide mds with the ino, seq (version), and
2558 * parent for all of our realms. If the mds has any newer info, 2562 * parent for all of our realms. If the mds has any newer info,
2559 * it will tell us. 2563 * it will tell us.
2560 */ 2564 */
2561 for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) { 2565 for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
2562 struct ceph_snap_realm *realm = 2566 struct ceph_snap_realm *realm =
2563 rb_entry(p, struct ceph_snap_realm, node); 2567 rb_entry(p, struct ceph_snap_realm, node);
2564 struct ceph_mds_snaprealm_reconnect sr_rec; 2568 struct ceph_mds_snaprealm_reconnect sr_rec;
2565 2569
2566 dout(" adding snap realm %llx seq %lld parent %llx\n", 2570 dout(" adding snap realm %llx seq %lld parent %llx\n",
2567 realm->ino, realm->seq, realm->parent_ino); 2571 realm->ino, realm->seq, realm->parent_ino);
2568 sr_rec.ino = cpu_to_le64(realm->ino); 2572 sr_rec.ino = cpu_to_le64(realm->ino);
2569 sr_rec.seq = cpu_to_le64(realm->seq); 2573 sr_rec.seq = cpu_to_le64(realm->seq);
2570 sr_rec.parent = cpu_to_le64(realm->parent_ino); 2574 sr_rec.parent = cpu_to_le64(realm->parent_ino);
2571 err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec)); 2575 err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
2572 if (err) 2576 if (err)
2573 goto fail; 2577 goto fail;
2574 } 2578 }
2575 2579
2576 reply->pagelist = pagelist; 2580 reply->pagelist = pagelist;
2577 if (recon_state.flock) 2581 if (recon_state.flock)
2578 reply->hdr.version = cpu_to_le16(2); 2582 reply->hdr.version = cpu_to_le16(2);
2579 reply->hdr.data_len = cpu_to_le32(pagelist->length); 2583 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2580 reply->nr_pages = calc_pages_for(0, pagelist->length); 2584 reply->nr_pages = calc_pages_for(0, pagelist->length);
2581 ceph_con_send(&session->s_con, reply); 2585 ceph_con_send(&session->s_con, reply);
2582 2586
2583 mutex_unlock(&session->s_mutex); 2587 mutex_unlock(&session->s_mutex);
2584 2588
2585 mutex_lock(&mdsc->mutex); 2589 mutex_lock(&mdsc->mutex);
2586 __wake_requests(mdsc, &session->s_waiting); 2590 __wake_requests(mdsc, &session->s_waiting);
2587 mutex_unlock(&mdsc->mutex); 2591 mutex_unlock(&mdsc->mutex);
2588 2592
2589 up_read(&mdsc->snap_rwsem); 2593 up_read(&mdsc->snap_rwsem);
2590 return; 2594 return;
2591 2595
2592 fail: 2596 fail:
2593 ceph_msg_put(reply); 2597 ceph_msg_put(reply);
2594 up_read(&mdsc->snap_rwsem); 2598 up_read(&mdsc->snap_rwsem);
2595 mutex_unlock(&session->s_mutex); 2599 mutex_unlock(&session->s_mutex);
2596 fail_nomsg: 2600 fail_nomsg:
2597 ceph_pagelist_release(pagelist); 2601 ceph_pagelist_release(pagelist);
2598 kfree(pagelist); 2602 kfree(pagelist);
2599 fail_nopagelist: 2603 fail_nopagelist:
2600 pr_err("error %d preparing reconnect for mds%d\n", err, mds); 2604 pr_err("error %d preparing reconnect for mds%d\n", err, mds);
2601 return; 2605 return;
2602 } 2606 }
2603 2607
2604 2608
2605 /* 2609 /*
2606 * compare old and new mdsmaps, kicking requests 2610 * compare old and new mdsmaps, kicking requests
2607 * and closing out old connections as necessary 2611 * and closing out old connections as necessary
2608 * 2612 *
2609 * called under mdsc->mutex. 2613 * called under mdsc->mutex.
2610 */ 2614 */
2611 static void check_new_map(struct ceph_mds_client *mdsc, 2615 static void check_new_map(struct ceph_mds_client *mdsc,
2612 struct ceph_mdsmap *newmap, 2616 struct ceph_mdsmap *newmap,
2613 struct ceph_mdsmap *oldmap) 2617 struct ceph_mdsmap *oldmap)
2614 { 2618 {
2615 int i; 2619 int i;
2616 int oldstate, newstate; 2620 int oldstate, newstate;
2617 struct ceph_mds_session *s; 2621 struct ceph_mds_session *s;
2618 2622
2619 dout("check_new_map new %u old %u\n", 2623 dout("check_new_map new %u old %u\n",
2620 newmap->m_epoch, oldmap->m_epoch); 2624 newmap->m_epoch, oldmap->m_epoch);
2621 2625
2622 for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) { 2626 for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
2623 if (mdsc->sessions[i] == NULL) 2627 if (mdsc->sessions[i] == NULL)
2624 continue; 2628 continue;
2625 s = mdsc->sessions[i]; 2629 s = mdsc->sessions[i];
2626 oldstate = ceph_mdsmap_get_state(oldmap, i); 2630 oldstate = ceph_mdsmap_get_state(oldmap, i);
2627 newstate = ceph_mdsmap_get_state(newmap, i); 2631 newstate = ceph_mdsmap_get_state(newmap, i);
2628 2632
2629 dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n", 2633 dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
2630 i, ceph_mds_state_name(oldstate), 2634 i, ceph_mds_state_name(oldstate),
2631 ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "", 2635 ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
2632 ceph_mds_state_name(newstate), 2636 ceph_mds_state_name(newstate),
2633 ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", 2637 ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
2634 session_state_name(s->s_state)); 2638 session_state_name(s->s_state));
2635 2639
2636 if (memcmp(ceph_mdsmap_get_addr(oldmap, i), 2640 if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
2637 ceph_mdsmap_get_addr(newmap, i), 2641 ceph_mdsmap_get_addr(newmap, i),
2638 sizeof(struct ceph_entity_addr))) { 2642 sizeof(struct ceph_entity_addr))) {
2639 if (s->s_state == CEPH_MDS_SESSION_OPENING) { 2643 if (s->s_state == CEPH_MDS_SESSION_OPENING) {
2640 /* the session never opened, just close it 2644 /* the session never opened, just close it
2641 * out now */ 2645 * out now */
2642 __wake_requests(mdsc, &s->s_waiting); 2646 __wake_requests(mdsc, &s->s_waiting);
2643 __unregister_session(mdsc, s); 2647 __unregister_session(mdsc, s);
2644 } else { 2648 } else {
2645 /* just close it */ 2649 /* just close it */
2646 mutex_unlock(&mdsc->mutex); 2650 mutex_unlock(&mdsc->mutex);
2647 mutex_lock(&s->s_mutex); 2651 mutex_lock(&s->s_mutex);
2648 mutex_lock(&mdsc->mutex); 2652 mutex_lock(&mdsc->mutex);
2649 ceph_con_close(&s->s_con); 2653 ceph_con_close(&s->s_con);
2650 mutex_unlock(&s->s_mutex); 2654 mutex_unlock(&s->s_mutex);
2651 s->s_state = CEPH_MDS_SESSION_RESTARTING; 2655 s->s_state = CEPH_MDS_SESSION_RESTARTING;
2652 } 2656 }
2653 2657
2654 /* kick any requests waiting on the recovering mds */ 2658 /* kick any requests waiting on the recovering mds */
2655 kick_requests(mdsc, i); 2659 kick_requests(mdsc, i);
2656 } else if (oldstate == newstate) { 2660 } else if (oldstate == newstate) {
2657 continue; /* nothing new with this mds */ 2661 continue; /* nothing new with this mds */
2658 } 2662 }
2659 2663
2660 /* 2664 /*
2661 * send reconnect? 2665 * send reconnect?
2662 */ 2666 */
2663 if (s->s_state == CEPH_MDS_SESSION_RESTARTING && 2667 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2664 newstate >= CEPH_MDS_STATE_RECONNECT) { 2668 newstate >= CEPH_MDS_STATE_RECONNECT) {
2665 mutex_unlock(&mdsc->mutex); 2669 mutex_unlock(&mdsc->mutex);
2666 send_mds_reconnect(mdsc, s); 2670 send_mds_reconnect(mdsc, s);
2667 mutex_lock(&mdsc->mutex); 2671 mutex_lock(&mdsc->mutex);
2668 } 2672 }
2669 2673
2670 /* 2674 /*
2671 * kick request on any mds that has gone active. 2675 * kick request on any mds that has gone active.
2672 */ 2676 */
2673 if (oldstate < CEPH_MDS_STATE_ACTIVE && 2677 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2674 newstate >= CEPH_MDS_STATE_ACTIVE) { 2678 newstate >= CEPH_MDS_STATE_ACTIVE) {
2675 if (oldstate != CEPH_MDS_STATE_CREATING && 2679 if (oldstate != CEPH_MDS_STATE_CREATING &&
2676 oldstate != CEPH_MDS_STATE_STARTING) 2680 oldstate != CEPH_MDS_STATE_STARTING)
2677 pr_info("mds%d recovery completed\n", s->s_mds); 2681 pr_info("mds%d recovery completed\n", s->s_mds);
2678 kick_requests(mdsc, i); 2682 kick_requests(mdsc, i);
2679 ceph_kick_flushing_caps(mdsc, s); 2683 ceph_kick_flushing_caps(mdsc, s);
2680 wake_up_session_caps(s, 1); 2684 wake_up_session_caps(s, 1);
2681 } 2685 }
2682 } 2686 }
2683 2687
2684 for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) { 2688 for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) {
2685 s = mdsc->sessions[i]; 2689 s = mdsc->sessions[i];
2686 if (!s) 2690 if (!s)
2687 continue; 2691 continue;
2688 if (!ceph_mdsmap_is_laggy(newmap, i)) 2692 if (!ceph_mdsmap_is_laggy(newmap, i))
2689 continue; 2693 continue;
2690 if (s->s_state == CEPH_MDS_SESSION_OPEN || 2694 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2691 s->s_state == CEPH_MDS_SESSION_HUNG || 2695 s->s_state == CEPH_MDS_SESSION_HUNG ||
2692 s->s_state == CEPH_MDS_SESSION_CLOSING) { 2696 s->s_state == CEPH_MDS_SESSION_CLOSING) {
2693 dout(" connecting to export targets of laggy mds%d\n", 2697 dout(" connecting to export targets of laggy mds%d\n",
2694 i); 2698 i);
2695 __open_export_target_sessions(mdsc, s); 2699 __open_export_target_sessions(mdsc, s);
2696 } 2700 }
2697 } 2701 }
2698 } 2702 }
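
Stripped of the session plumbing, check_new_map() applies two transition rules per MDS rank: a restarting session reconnects once the rank reaches the reconnect state, and waiting requests are kicked when a rank goes active (with a "recovery completed" note unless the rank is brand new). A compact sketch of just those rules; the enums are invented stand-ins with the same ordering assumptions as the kernel's state values:

```c
#include <stdio.h>

enum mds_state  { STATE_CREATING, STATE_STARTING, STATE_RECONNECT, STATE_ACTIVE };
enum sess_state { SESS_OPEN, SESS_RESTARTING };

static void check_rank(enum sess_state sess, int oldstate, int newstate)
{
    if (sess == SESS_RESTARTING && newstate >= STATE_RECONNECT)
        printf("send reconnect\n");

    if (oldstate < STATE_ACTIVE && newstate >= STATE_ACTIVE) {
        if (oldstate != STATE_CREATING && oldstate != STATE_STARTING)
            printf("recovery completed\n");
        printf("kick requests\n");    /* plus flushing caps, wakeups */
    }
}

int main(void)
{
    /* a rank we lost came back: reconnect, then kick when it goes active */
    check_rank(SESS_RESTARTING, STATE_RECONNECT, STATE_ACTIVE);
    return 0;
}
```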
2699 2703
2700 2704
2701 2705
2702 /* 2706 /*
2703 * leases 2707 * leases
2704 */ 2708 */
2705 2709
2706 /* 2710 /*
2707 * caller must hold session s_mutex, dentry->d_lock 2711 * caller must hold session s_mutex, dentry->d_lock
2708 */ 2712 */
2709 void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry) 2713 void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
2710 { 2714 {
2711 struct ceph_dentry_info *di = ceph_dentry(dentry); 2715 struct ceph_dentry_info *di = ceph_dentry(dentry);
2712 2716
2713 ceph_put_mds_session(di->lease_session); 2717 ceph_put_mds_session(di->lease_session);
2714 di->lease_session = NULL; 2718 di->lease_session = NULL;
2715 } 2719 }
2716 2720
2717 static void handle_lease(struct ceph_mds_client *mdsc, 2721 static void handle_lease(struct ceph_mds_client *mdsc,
2718 struct ceph_mds_session *session, 2722 struct ceph_mds_session *session,
2719 struct ceph_msg *msg) 2723 struct ceph_msg *msg)
2720 { 2724 {
2721 struct super_block *sb = mdsc->fsc->sb; 2725 struct super_block *sb = mdsc->fsc->sb;
2722 struct inode *inode; 2726 struct inode *inode;
2723 struct dentry *parent, *dentry; 2727 struct dentry *parent, *dentry;
2724 struct ceph_dentry_info *di; 2728 struct ceph_dentry_info *di;
2725 int mds = session->s_mds; 2729 int mds = session->s_mds;
2726 struct ceph_mds_lease *h = msg->front.iov_base; 2730 struct ceph_mds_lease *h = msg->front.iov_base;
2727 u32 seq; 2731 u32 seq;
2728 struct ceph_vino vino; 2732 struct ceph_vino vino;
2729 struct qstr dname; 2733 struct qstr dname;
2730 int release = 0; 2734 int release = 0;
2731 2735
2732 dout("handle_lease from mds%d\n", mds); 2736 dout("handle_lease from mds%d\n", mds);
2733 2737
2734 /* decode */ 2738 /* decode */
2735 if (msg->front.iov_len < sizeof(*h) + sizeof(u32)) 2739 if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
2736 goto bad; 2740 goto bad;
2737 vino.ino = le64_to_cpu(h->ino); 2741 vino.ino = le64_to_cpu(h->ino);
2738 vino.snap = CEPH_NOSNAP; 2742 vino.snap = CEPH_NOSNAP;
2739 seq = le32_to_cpu(h->seq); 2743 seq = le32_to_cpu(h->seq);
2740 dname.name = (void *)h + sizeof(*h) + sizeof(u32); 2744 dname.name = (void *)h + sizeof(*h) + sizeof(u32);
2741 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32); 2745 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
2742 if (dname.len != get_unaligned_le32(h+1)) 2746 if (dname.len != get_unaligned_le32(h+1))
2743 goto bad; 2747 goto bad;
2744 2748
2745 mutex_lock(&session->s_mutex); 2749 mutex_lock(&session->s_mutex);
2746 session->s_seq++; 2750 session->s_seq++;
2747 2751
2748 /* lookup inode */ 2752 /* lookup inode */
2749 inode = ceph_find_inode(sb, vino); 2753 inode = ceph_find_inode(sb, vino);
2750 dout("handle_lease %s, ino %llx %p %.*s\n", 2754 dout("handle_lease %s, ino %llx %p %.*s\n",
2751 ceph_lease_op_name(h->action), vino.ino, inode, 2755 ceph_lease_op_name(h->action), vino.ino, inode,
2752 dname.len, dname.name); 2756 dname.len, dname.name);
2753 if (inode == NULL) { 2757 if (inode == NULL) {
2754 dout("handle_lease no inode %llx\n", vino.ino); 2758 dout("handle_lease no inode %llx\n", vino.ino);
2755 goto release; 2759 goto release;
2756 } 2760 }
2757 2761
2758 /* dentry */ 2762 /* dentry */
2759 parent = d_find_alias(inode); 2763 parent = d_find_alias(inode);
2760 if (!parent) { 2764 if (!parent) {
2761 dout("no parent dentry on inode %p\n", inode); 2765 dout("no parent dentry on inode %p\n", inode);
2762 WARN_ON(1); 2766 WARN_ON(1);
2763 goto release; /* hrm... */ 2767 goto release; /* hrm... */
2764 } 2768 }
2765 dname.hash = full_name_hash(dname.name, dname.len); 2769 dname.hash = full_name_hash(dname.name, dname.len);
2766 dentry = d_lookup(parent, &dname); 2770 dentry = d_lookup(parent, &dname);
2767 dput(parent); 2771 dput(parent);
2768 if (!dentry) 2772 if (!dentry)
2769 goto release; 2773 goto release;
2770 2774
2771 spin_lock(&dentry->d_lock); 2775 spin_lock(&dentry->d_lock);
2772 di = ceph_dentry(dentry); 2776 di = ceph_dentry(dentry);
2773 switch (h->action) { 2777 switch (h->action) {
2774 case CEPH_MDS_LEASE_REVOKE: 2778 case CEPH_MDS_LEASE_REVOKE:
2775 if (di->lease_session == session) { 2779 if (di->lease_session == session) {
2776 if (ceph_seq_cmp(di->lease_seq, seq) > 0) 2780 if (ceph_seq_cmp(di->lease_seq, seq) > 0)
2777 h->seq = cpu_to_le32(di->lease_seq); 2781 h->seq = cpu_to_le32(di->lease_seq);
2778 __ceph_mdsc_drop_dentry_lease(dentry); 2782 __ceph_mdsc_drop_dentry_lease(dentry);
2779 } 2783 }
2780 release = 1; 2784 release = 1;
2781 break; 2785 break;
2782 2786
2783 case CEPH_MDS_LEASE_RENEW: 2787 case CEPH_MDS_LEASE_RENEW:
2784 if (di->lease_session == session && 2788 if (di->lease_session == session &&
2785 di->lease_gen == session->s_cap_gen && 2789 di->lease_gen == session->s_cap_gen &&
2786 di->lease_renew_from && 2790 di->lease_renew_from &&
2787 di->lease_renew_after == 0) { 2791 di->lease_renew_after == 0) {
2788 unsigned long duration = 2792 unsigned long duration =
2789 le32_to_cpu(h->duration_ms) * HZ / 1000; 2793 le32_to_cpu(h->duration_ms) * HZ / 1000;
2790 2794
2791 di->lease_seq = seq; 2795 di->lease_seq = seq;
2792 dentry->d_time = di->lease_renew_from + duration; 2796 dentry->d_time = di->lease_renew_from + duration;
2793 di->lease_renew_after = di->lease_renew_from + 2797 di->lease_renew_after = di->lease_renew_from +
2794 (duration >> 1); 2798 (duration >> 1);
2795 di->lease_renew_from = 0; 2799 di->lease_renew_from = 0;
2796 } 2800 }
2797 break; 2801 break;
2798 } 2802 }
2799 spin_unlock(&dentry->d_lock); 2803 spin_unlock(&dentry->d_lock);
2800 dput(dentry); 2804 dput(dentry);
2801 2805
2802 if (!release) 2806 if (!release)
2803 goto out; 2807 goto out;
2804 2808
2805 release: 2809 release:
2806 /* let's just reuse the same message */ 2810 /* let's just reuse the same message */
2807 h->action = CEPH_MDS_LEASE_REVOKE_ACK; 2811 h->action = CEPH_MDS_LEASE_REVOKE_ACK;
2808 ceph_msg_get(msg); 2812 ceph_msg_get(msg);
2809 ceph_con_send(&session->s_con, msg); 2813 ceph_con_send(&session->s_con, msg);
2810 2814
2811 out: 2815 out:
2812 iput(inode); 2816 iput(inode);
2813 mutex_unlock(&session->s_mutex); 2817 mutex_unlock(&session->s_mutex);
2814 return; 2818 return;
2815 2819
2816 bad: 2820 bad:
2817 pr_err("corrupt lease message\n"); 2821 pr_err("corrupt lease message\n");
2818 ceph_msg_dump(msg); 2822 ceph_msg_dump(msg);
2819 } 2823 }
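
The REVOKE branch above compares lease sequence numbers with ceph_seq_cmp() so that a seq which wrapped around 2^32 still orders correctly. Reconstructed from its use here (so treat the exact kernel definition as an assumption), the helper is the classic serial-number comparison: take the unsigned difference and reinterpret it as signed, which is correct while the two values stay within 2^31 of each other.

```c
#include <stdint.h>
#include <stdio.h>

/* >0: a is newer, <0: b is newer, 0: equal. */
static int seq_cmp(uint32_t a, uint32_t b)
{
    return (int32_t)(a - b);
}

int main(void)
{
    printf("%d\n", seq_cmp(5, 3) > 0);           /* 1: plainly newer */
    printf("%d\n", seq_cmp(2, UINT32_MAX) > 0);  /* 1: newer across the wrap */
    return 0;
}
```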
2820 2824
2821 void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, 2825 void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2822 struct inode *inode, 2826 struct inode *inode,
2823 struct dentry *dentry, char action, 2827 struct dentry *dentry, char action,
2824 u32 seq) 2828 u32 seq)
2825 { 2829 {
2826 struct ceph_msg *msg; 2830 struct ceph_msg *msg;
2827 struct ceph_mds_lease *lease; 2831 struct ceph_mds_lease *lease;
2828 int len = sizeof(*lease) + sizeof(u32); 2832 int len = sizeof(*lease) + sizeof(u32);
2829 int dnamelen = 0; 2833 int dnamelen = 0;
2830 2834
2831 dout("lease_send_msg inode %p dentry %p %s to mds%d\n", 2835 dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
2832 inode, dentry, ceph_lease_op_name(action), session->s_mds); 2836 inode, dentry, ceph_lease_op_name(action), session->s_mds);
2833 dnamelen = dentry->d_name.len; 2837 dnamelen = dentry->d_name.len;
2834 len += dnamelen; 2838 len += dnamelen;
2835 2839
2836 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false); 2840 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
2837 if (!msg) 2841 if (!msg)
2838 return; 2842 return;
2839 lease = msg->front.iov_base; 2843 lease = msg->front.iov_base;
2840 lease->action = action; 2844 lease->action = action;
2841 lease->ino = cpu_to_le64(ceph_vino(inode).ino); 2845 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2842 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); 2846 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2843 lease->seq = cpu_to_le32(seq); 2847 lease->seq = cpu_to_le32(seq);
2844 put_unaligned_le32(dnamelen, lease + 1); 2848 put_unaligned_le32(dnamelen, lease + 1);
2845 memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen); 2849 memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
2846 2850
2847 /* 2851 /*
2848 * if this is a preemptive lease RELEASE, no need to 2852 * if this is a preemptive lease RELEASE, no need to
2849 * flush request stream, since the actual request will 2853 * flush request stream, since the actual request will
2850 * soon follow. 2854 * soon follow.
2851 */ 2855 */
2852 msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE); 2856 msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
2853 2857
2854 ceph_con_send(&session->s_con, msg); 2858 ceph_con_send(&session->s_con, msg);
2855 } 2859 }
2856 2860
2857 /* 2861 /*
2858 * Preemptively release a lease we expect to invalidate anyway. 2862 * Preemptively release a lease we expect to invalidate anyway.
2859 * Pass @inode always, @dentry is optional. 2863 * Pass @inode always, @dentry is optional.
2860 */ 2864 */
2861 void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, 2865 void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2862 struct dentry *dentry) 2866 struct dentry *dentry)
2863 { 2867 {
2864 struct ceph_dentry_info *di; 2868 struct ceph_dentry_info *di;
2865 struct ceph_mds_session *session; 2869 struct ceph_mds_session *session;
2866 u32 seq; 2870 u32 seq;
2867 2871
2868 BUG_ON(inode == NULL); 2872 BUG_ON(inode == NULL);
2869 BUG_ON(dentry == NULL); 2873 BUG_ON(dentry == NULL);
2870 2874
2871 /* is dentry lease valid? */ 2875 /* is dentry lease valid? */
2872 spin_lock(&dentry->d_lock); 2876 spin_lock(&dentry->d_lock);
2873 di = ceph_dentry(dentry); 2877 di = ceph_dentry(dentry);
2874 if (!di || !di->lease_session || 2878 if (!di || !di->lease_session ||
2875 di->lease_session->s_mds < 0 || 2879 di->lease_session->s_mds < 0 ||
2876 di->lease_gen != di->lease_session->s_cap_gen || 2880 di->lease_gen != di->lease_session->s_cap_gen ||
2877 !time_before(jiffies, dentry->d_time)) { 2881 !time_before(jiffies, dentry->d_time)) {
2878 dout("lease_release inode %p dentry %p -- " 2882 dout("lease_release inode %p dentry %p -- "
2879 "no lease\n", 2883 "no lease\n",
2880 inode, dentry); 2884 inode, dentry);
2881 spin_unlock(&dentry->d_lock); 2885 spin_unlock(&dentry->d_lock);
2882 return; 2886 return;
2883 } 2887 }
2884 2888
2885 /* we do have a lease on this dentry; note mds and seq */ 2889 /* we do have a lease on this dentry; note mds and seq */
2886 session = ceph_get_mds_session(di->lease_session); 2890 session = ceph_get_mds_session(di->lease_session);
2887 seq = di->lease_seq; 2891 seq = di->lease_seq;
2888 __ceph_mdsc_drop_dentry_lease(dentry); 2892 __ceph_mdsc_drop_dentry_lease(dentry);
2889 spin_unlock(&dentry->d_lock); 2893 spin_unlock(&dentry->d_lock);
2890 2894
2891 dout("lease_release inode %p dentry %p to mds%d\n", 2895 dout("lease_release inode %p dentry %p to mds%d\n",
2892 inode, dentry, session->s_mds); 2896 inode, dentry, session->s_mds);
2893 ceph_mdsc_lease_send_msg(session, inode, dentry, 2897 ceph_mdsc_lease_send_msg(session, inode, dentry,
2894 CEPH_MDS_LEASE_RELEASE, seq); 2898 CEPH_MDS_LEASE_RELEASE, seq);
2895 ceph_put_mds_session(session); 2899 ceph_put_mds_session(session);
2896 } 2900 }
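
ceph_mdsc_lease_release() is a clean example of the snapshot-then-unlock discipline: the lease is validated and torn down under d_lock, but the session reference and seq are copied out first so the potentially blocking message send runs with no locks held. A toy version of the same shape, with invented types:

```c
#include <pthread.h>
#include <stdio.h>

struct lease {
    pthread_mutex_t lock;   /* stands in for dentry->d_lock */
    int session;            /* -1 = no lease; the kernel holds a refcounted ptr */
    unsigned seq;
};

static void send_release(int session, unsigned seq)
{
    printf("release seq %u to mds%d\n", seq, session);  /* may block */
}

static void lease_release(struct lease *l)
{
    pthread_mutex_lock(&l->lock);
    if (l->session < 0) {           /* no valid lease: nothing to do */
        pthread_mutex_unlock(&l->lock);
        return;
    }
    int session = l->session;       /* snapshot what the send needs */
    unsigned seq = l->seq;
    l->session = -1;                /* drop the lease while still locked */
    pthread_mutex_unlock(&l->lock);

    send_release(session, seq);     /* blocking work outside the lock */
}

int main(void)
{
    struct lease l = { PTHREAD_MUTEX_INITIALIZER, 0, 7 };
    lease_release(&l);
    return 0;
}
```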
2897 2901
2898 /* 2902 /*
2899 * drop all leases (and dentry refs) in preparation for umount 2903 * drop all leases (and dentry refs) in preparation for umount
2900 */ 2904 */
2901 static void drop_leases(struct ceph_mds_client *mdsc) 2905 static void drop_leases(struct ceph_mds_client *mdsc)
2902 { 2906 {
2903 int i; 2907 int i;
2904 2908
2905 dout("drop_leases\n"); 2909 dout("drop_leases\n");
2906 mutex_lock(&mdsc->mutex); 2910 mutex_lock(&mdsc->mutex);
2907 for (i = 0; i < mdsc->max_sessions; i++) { 2911 for (i = 0; i < mdsc->max_sessions; i++) {
2908 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); 2912 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2909 if (!s) 2913 if (!s)
2910 continue; 2914 continue;
2911 mutex_unlock(&mdsc->mutex); 2915 mutex_unlock(&mdsc->mutex);
2912 mutex_lock(&s->s_mutex); 2916 mutex_lock(&s->s_mutex);
2913 mutex_unlock(&s->s_mutex); 2917 mutex_unlock(&s->s_mutex);
2914 ceph_put_mds_session(s); 2918 ceph_put_mds_session(s);
2915 mutex_lock(&mdsc->mutex); 2919 mutex_lock(&mdsc->mutex);
2916 } 2920 }
2917 mutex_unlock(&mdsc->mutex); 2921 mutex_unlock(&mdsc->mutex);
2918 } 2922 }
2919 2923
2920 2924
2921 2925
2922 /* 2926 /*
2923 * delayed work -- periodically trim expired leases, renew caps with mds 2927 * delayed work -- periodically trim expired leases, renew caps with mds
2924 */ 2928 */
2925 static void schedule_delayed(struct ceph_mds_client *mdsc) 2929 static void schedule_delayed(struct ceph_mds_client *mdsc)
2926 { 2930 {
2927 int delay = 5; 2931 int delay = 5;
2928 unsigned hz = round_jiffies_relative(HZ * delay); 2932 unsigned hz = round_jiffies_relative(HZ * delay);
2929 schedule_delayed_work(&mdsc->delayed_work, hz); 2933 schedule_delayed_work(&mdsc->delayed_work, hz);
2930 } 2934 }
2931 2935
2932 static void delayed_work(struct work_struct *work) 2936 static void delayed_work(struct work_struct *work)
2933 { 2937 {
2934 int i; 2938 int i;
2935 struct ceph_mds_client *mdsc = 2939 struct ceph_mds_client *mdsc =
2936 container_of(work, struct ceph_mds_client, delayed_work.work); 2940 container_of(work, struct ceph_mds_client, delayed_work.work);
2937 int renew_interval; 2941 int renew_interval;
2938 int renew_caps; 2942 int renew_caps;
2939 2943
2940 dout("mdsc delayed_work\n"); 2944 dout("mdsc delayed_work\n");
2941 ceph_check_delayed_caps(mdsc); 2945 ceph_check_delayed_caps(mdsc);
2942 2946
2943 mutex_lock(&mdsc->mutex); 2947 mutex_lock(&mdsc->mutex);
2944 renew_interval = mdsc->mdsmap->m_session_timeout >> 2; 2948 renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
2945 renew_caps = time_after_eq(jiffies, HZ*renew_interval + 2949 renew_caps = time_after_eq(jiffies, HZ*renew_interval +
2946 mdsc->last_renew_caps); 2950 mdsc->last_renew_caps);
2947 if (renew_caps) 2951 if (renew_caps)
2948 mdsc->last_renew_caps = jiffies; 2952 mdsc->last_renew_caps = jiffies;
2949 2953
2950 for (i = 0; i < mdsc->max_sessions; i++) { 2954 for (i = 0; i < mdsc->max_sessions; i++) {
2951 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); 2955 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2952 if (s == NULL) 2956 if (s == NULL)
2953 continue; 2957 continue;
2954 if (s->s_state == CEPH_MDS_SESSION_CLOSING) { 2958 if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
2955 dout("resending session close request for mds%d\n", 2959 dout("resending session close request for mds%d\n",
2956 s->s_mds); 2960 s->s_mds);
2957 request_close_session(mdsc, s); 2961 request_close_session(mdsc, s);
2958 ceph_put_mds_session(s); 2962 ceph_put_mds_session(s);
2959 continue; 2963 continue;
2960 } 2964 }
2961 if (s->s_ttl && time_after(jiffies, s->s_ttl)) { 2965 if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
2962 if (s->s_state == CEPH_MDS_SESSION_OPEN) { 2966 if (s->s_state == CEPH_MDS_SESSION_OPEN) {
2963 s->s_state = CEPH_MDS_SESSION_HUNG; 2967 s->s_state = CEPH_MDS_SESSION_HUNG;
2964 pr_info("mds%d hung\n", s->s_mds); 2968 pr_info("mds%d hung\n", s->s_mds);
2965 } 2969 }
2966 } 2970 }
2967 if (s->s_state < CEPH_MDS_SESSION_OPEN) { 2971 if (s->s_state < CEPH_MDS_SESSION_OPEN) {
2968 /* this mds is failed or recovering, just wait */ 2972 /* this mds is failed or recovering, just wait */
2969 ceph_put_mds_session(s); 2973 ceph_put_mds_session(s);
2970 continue; 2974 continue;
2971 } 2975 }
2972 mutex_unlock(&mdsc->mutex); 2976 mutex_unlock(&mdsc->mutex);
2973 2977
2974 mutex_lock(&s->s_mutex); 2978 mutex_lock(&s->s_mutex);
2975 if (renew_caps) 2979 if (renew_caps)
2976 send_renew_caps(mdsc, s); 2980 send_renew_caps(mdsc, s);
2977 else 2981 else
2978 ceph_con_keepalive(&s->s_con); 2982 ceph_con_keepalive(&s->s_con);
2979 ceph_add_cap_releases(mdsc, s); 2983 ceph_add_cap_releases(mdsc, s);
2980 if (s->s_state == CEPH_MDS_SESSION_OPEN || 2984 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2981 s->s_state == CEPH_MDS_SESSION_HUNG) 2985 s->s_state == CEPH_MDS_SESSION_HUNG)
2982 ceph_send_cap_releases(mdsc, s); 2986 ceph_send_cap_releases(mdsc, s);
2983 mutex_unlock(&s->s_mutex); 2987 mutex_unlock(&s->s_mutex);
2984 ceph_put_mds_session(s); 2988 ceph_put_mds_session(s);
2985 2989
2986 mutex_lock(&mdsc->mutex); 2990 mutex_lock(&mdsc->mutex);
2987 } 2991 }
2988 mutex_unlock(&mdsc->mutex); 2992 mutex_unlock(&mdsc->mutex);
2989 2993
2990 schedule_delayed(mdsc); 2994 schedule_delayed(mdsc);
2991 } 2995 }
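
Note that delayed_work() re-arms itself via schedule_delayed() as its last act, so a slow housekeeping pass simply pushes the next one out instead of letting runs overlap. The same shape in a trivial userspace loop (the delay is shortened here; the kernel rounds a ~5 s delay to a jiffy boundary with round_jiffies_relative()):

```c
#include <stdio.h>
#include <unistd.h>

#define DELAY_SECS 1   /* demo value; the code above uses about 5 s */

static void do_housekeeping(int round)
{
    printf("round %d: check delayed caps, renew or keepalive sessions\n",
           round);
}

int main(void)
{
    for (int round = 0; round < 3; round++) {
        do_housekeeping(round);
        sleep(DELAY_SECS);   /* stands in for schedule_delayed_work():
                              * the next round is armed only after this
                              * one finishes, so runs never overlap */
    }
    return 0;
}
```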
2992 2996
2993 int ceph_mdsc_init(struct ceph_fs_client *fsc) 2997 int ceph_mdsc_init(struct ceph_fs_client *fsc)
2994 2998
2995 { 2999 {
2996 struct ceph_mds_client *mdsc; 3000 struct ceph_mds_client *mdsc;
2997 3001
2998 mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS); 3002 mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
2999 if (!mdsc) 3003 if (!mdsc)
3000 return -ENOMEM; 3004 return -ENOMEM;
3001 mdsc->fsc = fsc; 3005 mdsc->fsc = fsc;
3002 fsc->mdsc = mdsc; 3006 fsc->mdsc = mdsc;
3003 mutex_init(&mdsc->mutex); 3007 mutex_init(&mdsc->mutex);
3004 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 3008 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
3005 if (mdsc->mdsmap == NULL) 3009 if (mdsc->mdsmap == NULL)
3006 return -ENOMEM; 3010 return -ENOMEM;
3007 3011
3008 init_completion(&mdsc->safe_umount_waiters); 3012 init_completion(&mdsc->safe_umount_waiters);
3009 init_waitqueue_head(&mdsc->session_close_wq); 3013 init_waitqueue_head(&mdsc->session_close_wq);
3010 INIT_LIST_HEAD(&mdsc->waiting_for_map); 3014 INIT_LIST_HEAD(&mdsc->waiting_for_map);
3011 mdsc->sessions = NULL; 3015 mdsc->sessions = NULL;
3012 mdsc->max_sessions = 0; 3016 mdsc->max_sessions = 0;
3013 mdsc->stopping = 0; 3017 mdsc->stopping = 0;
3014 init_rwsem(&mdsc->snap_rwsem); 3018 init_rwsem(&mdsc->snap_rwsem);
3015 mdsc->snap_realms = RB_ROOT; 3019 mdsc->snap_realms = RB_ROOT;
3016 INIT_LIST_HEAD(&mdsc->snap_empty); 3020 INIT_LIST_HEAD(&mdsc->snap_empty);
3017 spin_lock_init(&mdsc->snap_empty_lock); 3021 spin_lock_init(&mdsc->snap_empty_lock);
3018 mdsc->last_tid = 0; 3022 mdsc->last_tid = 0;
3019 mdsc->request_tree = RB_ROOT; 3023 mdsc->request_tree = RB_ROOT;
3020 INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); 3024 INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
3021 mdsc->last_renew_caps = jiffies; 3025 mdsc->last_renew_caps = jiffies;
3022 INIT_LIST_HEAD(&mdsc->cap_delay_list); 3026 INIT_LIST_HEAD(&mdsc->cap_delay_list);
3023 spin_lock_init(&mdsc->cap_delay_lock); 3027 spin_lock_init(&mdsc->cap_delay_lock);
3024 INIT_LIST_HEAD(&mdsc->snap_flush_list); 3028 INIT_LIST_HEAD(&mdsc->snap_flush_list);
3025 spin_lock_init(&mdsc->snap_flush_lock); 3029 spin_lock_init(&mdsc->snap_flush_lock);
3026 mdsc->cap_flush_seq = 0; 3030 mdsc->cap_flush_seq = 0;
3027 INIT_LIST_HEAD(&mdsc->cap_dirty); 3031 INIT_LIST_HEAD(&mdsc->cap_dirty);
3028 INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); 3032 INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
3029 mdsc->num_cap_flushing = 0; 3033 mdsc->num_cap_flushing = 0;
3030 spin_lock_init(&mdsc->cap_dirty_lock); 3034 spin_lock_init(&mdsc->cap_dirty_lock);
3031 init_waitqueue_head(&mdsc->cap_flushing_wq); 3035 init_waitqueue_head(&mdsc->cap_flushing_wq);
3032 spin_lock_init(&mdsc->dentry_lru_lock); 3036 spin_lock_init(&mdsc->dentry_lru_lock);
3033 INIT_LIST_HEAD(&mdsc->dentry_lru); 3037 INIT_LIST_HEAD(&mdsc->dentry_lru);
3034 3038
3035 ceph_caps_init(mdsc); 3039 ceph_caps_init(mdsc);
3036 ceph_adjust_min_caps(mdsc, fsc->min_caps); 3040 ceph_adjust_min_caps(mdsc, fsc->min_caps);
3037 3041
3038 return 0; 3042 return 0;
3039 } 3043 }
3040 3044
3041 /* 3045 /*
3042 * Wait for safe replies on open mds requests. If we time out, drop 3046 * Wait for safe replies on open mds requests. If we time out, drop
3043 * all requests from the tree to avoid dangling dentry refs. 3047 * all requests from the tree to avoid dangling dentry refs.
3044 */ 3048 */
3045 static void wait_requests(struct ceph_mds_client *mdsc) 3049 static void wait_requests(struct ceph_mds_client *mdsc)
3046 { 3050 {
3047 struct ceph_mds_request *req; 3051 struct ceph_mds_request *req;
3048 struct ceph_fs_client *fsc = mdsc->fsc; 3052 struct ceph_fs_client *fsc = mdsc->fsc;
3049 3053
3050 mutex_lock(&mdsc->mutex); 3054 mutex_lock(&mdsc->mutex);
3051 if (__get_oldest_req(mdsc)) { 3055 if (__get_oldest_req(mdsc)) {
3052 mutex_unlock(&mdsc->mutex); 3056 mutex_unlock(&mdsc->mutex);
3053 3057
3054 dout("wait_requests waiting for requests\n"); 3058 dout("wait_requests waiting for requests\n");
3055 wait_for_completion_timeout(&mdsc->safe_umount_waiters, 3059 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
3056 fsc->client->options->mount_timeout * HZ); 3060 fsc->client->options->mount_timeout * HZ);
3057 3061
3058 /* tear down remaining requests */ 3062 /* tear down remaining requests */
3059 mutex_lock(&mdsc->mutex); 3063 mutex_lock(&mdsc->mutex);
3060 while ((req = __get_oldest_req(mdsc))) { 3064 while ((req = __get_oldest_req(mdsc))) {
3061 dout("wait_requests timed out on tid %llu\n", 3065 dout("wait_requests timed out on tid %llu\n",
3062 req->r_tid); 3066 req->r_tid);
3063 __unregister_request(mdsc, req); 3067 __unregister_request(mdsc, req);
3064 } 3068 }
3065 } 3069 }
3066 mutex_unlock(&mdsc->mutex); 3070 mutex_unlock(&mdsc->mutex);
3067 dout("wait_requests done\n"); 3071 dout("wait_requests done\n");
3068 } 3072 }
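
Note on the wait above: wait_for_completion_timeout() returns the jiffies remaining on success and 0 on timeout. wait_requests() deliberately ignores the return value and simply re-walks the request tree under mdsc->mutex, so the teardown loop covers both the timed-out and the already-empty case. A minimal sketch of the return-value form of the same pattern (the 30-second timeout here is illustrative, not from this commit):

	if (!wait_for_completion_timeout(&mdsc->safe_umount_waiters, 30 * HZ))
		pr_warn("timed out waiting for mds requests\n");
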
3069 3073
3070 /* 3074 /*
3071 * called before mount is ro, and before dentries are torn down. 3075 * called before mount is ro, and before dentries are torn down.
3072 * (hmm, does this still race with new lookups?) 3076 * (hmm, does this still race with new lookups?)
3073 */ 3077 */
3074 void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) 3078 void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
3075 { 3079 {
3076 dout("pre_umount\n"); 3080 dout("pre_umount\n");
3077 mdsc->stopping = 1; 3081 mdsc->stopping = 1;
3078 3082
3079 drop_leases(mdsc); 3083 drop_leases(mdsc);
3080 ceph_flush_dirty_caps(mdsc); 3084 ceph_flush_dirty_caps(mdsc);
3081 wait_requests(mdsc); 3085 wait_requests(mdsc);
3082 3086
3083 /* 3087 /*
3084 * wait for reply handlers to drop their request refs and 3088 * wait for reply handlers to drop their request refs and
3085 * their inode/dcache refs 3089 * their inode/dcache refs
3086 */ 3090 */
3087 ceph_msgr_flush(); 3091 ceph_msgr_flush();
3088 } 3092 }
3089 3093
3090 /* 3094 /*
3091 * wait for all write mds requests to flush. 3095 * wait for all write mds requests to flush.
3092 */ 3096 */
3093 static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) 3097 static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
3094 { 3098 {
3095 struct ceph_mds_request *req = NULL, *nextreq; 3099 struct ceph_mds_request *req = NULL, *nextreq;
3096 struct rb_node *n; 3100 struct rb_node *n;
3097 3101
3098 mutex_lock(&mdsc->mutex); 3102 mutex_lock(&mdsc->mutex);
3099 dout("wait_unsafe_requests want %lld\n", want_tid); 3103 dout("wait_unsafe_requests want %lld\n", want_tid);
3100 restart: 3104 restart:
3101 req = __get_oldest_req(mdsc); 3105 req = __get_oldest_req(mdsc);
3102 while (req && req->r_tid <= want_tid) { 3106 while (req && req->r_tid <= want_tid) {
3103 /* find next request */ 3107 /* find next request */
3104 n = rb_next(&req->r_node); 3108 n = rb_next(&req->r_node);
3105 if (n) 3109 if (n)
3106 nextreq = rb_entry(n, struct ceph_mds_request, r_node); 3110 nextreq = rb_entry(n, struct ceph_mds_request, r_node);
3107 else 3111 else
3108 nextreq = NULL; 3112 nextreq = NULL;
3109 if ((req->r_op & CEPH_MDS_OP_WRITE)) { 3113 if ((req->r_op & CEPH_MDS_OP_WRITE)) {
3110 /* write op */ 3114 /* write op */
3111 ceph_mdsc_get_request(req); 3115 ceph_mdsc_get_request(req);
3112 if (nextreq) 3116 if (nextreq)
3113 ceph_mdsc_get_request(nextreq); 3117 ceph_mdsc_get_request(nextreq);
3114 mutex_unlock(&mdsc->mutex); 3118 mutex_unlock(&mdsc->mutex);
3115 dout("wait_unsafe_requests wait on %llu (want %llu)\n", 3119 dout("wait_unsafe_requests wait on %llu (want %llu)\n",
3116 req->r_tid, want_tid); 3120 req->r_tid, want_tid);
3117 wait_for_completion(&req->r_safe_completion); 3121 wait_for_completion(&req->r_safe_completion);
3118 mutex_lock(&mdsc->mutex); 3122 mutex_lock(&mdsc->mutex);
3119 ceph_mdsc_put_request(req); 3123 ceph_mdsc_put_request(req);
3120 if (!nextreq) 3124 if (!nextreq)
3121 break; /* next dne before, so we're done! */ 3125 break; /* next dne before, so we're done! */
3122 if (RB_EMPTY_NODE(&nextreq->r_node)) { 3126 if (RB_EMPTY_NODE(&nextreq->r_node)) {
3123 /* next request was removed from tree */ 3127 /* next request was removed from tree */
3124 ceph_mdsc_put_request(nextreq); 3128 ceph_mdsc_put_request(nextreq);
3125 goto restart; 3129 goto restart;
3126 } 3130 }
3127 ceph_mdsc_put_request(nextreq); /* won't go away */ 3131 ceph_mdsc_put_request(nextreq); /* won't go away */
3128 } 3132 }
3129 req = nextreq; 3133 req = nextreq;
3130 } 3134 }
3131 mutex_unlock(&mdsc->mutex); 3135 mutex_unlock(&mdsc->mutex);
3132 dout("wait_unsafe_requests done\n"); 3136 dout("wait_unsafe_requests done\n");
3133 } 3137 }
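
The loop above uses a pin/drop/revalidate idiom: it takes a reference on the next tree node while mdsc->mutex is held, sleeps with the mutex dropped, then checks RB_EMPTY_NODE() to detect a node that was erased (and cleared) while the lock was out. A condensed sketch of that idiom, using the same helpers (not a drop-in replacement for the code above):

	struct rb_node *n = rb_next(&req->r_node);
	struct ceph_mds_request *next =
		n ? rb_entry(n, struct ceph_mds_request, r_node) : NULL;

	if (next)
		ceph_mdsc_get_request(next);	/* pin before dropping the lock */
	mutex_unlock(&mdsc->mutex);
	wait_for_completion(&req->r_safe_completion);
	mutex_lock(&mdsc->mutex);
	if (next && RB_EMPTY_NODE(&next->r_node))
		goto restart;			/* unregistered meanwhile; rescan from oldest */
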
3134 3138
3135 void ceph_mdsc_sync(struct ceph_mds_client *mdsc) 3139 void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3136 { 3140 {
3137 u64 want_tid, want_flush; 3141 u64 want_tid, want_flush;
3138 3142
3139 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) 3143 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
3140 return; 3144 return;
3141 3145
3142 dout("sync\n"); 3146 dout("sync\n");
3143 mutex_lock(&mdsc->mutex); 3147 mutex_lock(&mdsc->mutex);
3144 want_tid = mdsc->last_tid; 3148 want_tid = mdsc->last_tid;
3145 want_flush = mdsc->cap_flush_seq; 3149 want_flush = mdsc->cap_flush_seq;
3146 mutex_unlock(&mdsc->mutex); 3150 mutex_unlock(&mdsc->mutex);
3147 dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); 3151 dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
3148 3152
3149 ceph_flush_dirty_caps(mdsc); 3153 ceph_flush_dirty_caps(mdsc);
3150 3154
3151 wait_unsafe_requests(mdsc, want_tid); 3155 wait_unsafe_requests(mdsc, want_tid);
3152 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); 3156 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
3153 } 3157 }
3154 3158
3155 /* 3159 /*
3156 * true if all sessions are closed, or we force unmount 3160 * true if all sessions are closed, or we force unmount
3157 */ 3161 */
3158 static bool done_closing_sessions(struct ceph_mds_client *mdsc) 3162 static bool done_closing_sessions(struct ceph_mds_client *mdsc)
3159 { 3163 {
3160 int i, n = 0; 3164 int i, n = 0;
3161 3165
3162 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) 3166 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
3163 return true; 3167 return true;
3164 3168
3165 mutex_lock(&mdsc->mutex); 3169 mutex_lock(&mdsc->mutex);
3166 for (i = 0; i < mdsc->max_sessions; i++) 3170 for (i = 0; i < mdsc->max_sessions; i++)
3167 if (mdsc->sessions[i]) 3171 if (mdsc->sessions[i])
3168 n++; 3172 n++;
3169 mutex_unlock(&mdsc->mutex); 3173 mutex_unlock(&mdsc->mutex);
3170 return n == 0; 3174 return n == 0;
3171 } 3175 }
3172 3176
3173 /* 3177 /*
3174 * called after sb is ro. 3178 * called after sb is ro.
3175 */ 3179 */
3176 void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) 3180 void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
3177 { 3181 {
3178 struct ceph_mds_session *session; 3182 struct ceph_mds_session *session;
3179 int i; 3183 int i;
3180 struct ceph_fs_client *fsc = mdsc->fsc; 3184 struct ceph_fs_client *fsc = mdsc->fsc;
3181 unsigned long timeout = fsc->client->options->mount_timeout * HZ; 3185 unsigned long timeout = fsc->client->options->mount_timeout * HZ;
3182 3186
3183 dout("close_sessions\n"); 3187 dout("close_sessions\n");
3184 3188
3185 /* close sessions */ 3189 /* close sessions */
3186 mutex_lock(&mdsc->mutex); 3190 mutex_lock(&mdsc->mutex);
3187 for (i = 0; i < mdsc->max_sessions; i++) { 3191 for (i = 0; i < mdsc->max_sessions; i++) {
3188 session = __ceph_lookup_mds_session(mdsc, i); 3192 session = __ceph_lookup_mds_session(mdsc, i);
3189 if (!session) 3193 if (!session)
3190 continue; 3194 continue;
3191 mutex_unlock(&mdsc->mutex); 3195 mutex_unlock(&mdsc->mutex);
3192 mutex_lock(&session->s_mutex); 3196 mutex_lock(&session->s_mutex);
3193 __close_session(mdsc, session); 3197 __close_session(mdsc, session);
3194 mutex_unlock(&session->s_mutex); 3198 mutex_unlock(&session->s_mutex);
3195 ceph_put_mds_session(session); 3199 ceph_put_mds_session(session);
3196 mutex_lock(&mdsc->mutex); 3200 mutex_lock(&mdsc->mutex);
3197 } 3201 }
3198 mutex_unlock(&mdsc->mutex); 3202 mutex_unlock(&mdsc->mutex);
3199 3203
3200 dout("waiting for sessions to close\n"); 3204 dout("waiting for sessions to close\n");
3201 wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc), 3205 wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
3202 timeout); 3206 timeout);
3203 3207
3204 /* tear down remaining sessions */ 3208 /* tear down remaining sessions */
3205 mutex_lock(&mdsc->mutex); 3209 mutex_lock(&mdsc->mutex);
3206 for (i = 0; i < mdsc->max_sessions; i++) { 3210 for (i = 0; i < mdsc->max_sessions; i++) {
3207 if (mdsc->sessions[i]) { 3211 if (mdsc->sessions[i]) {
3208 session = get_session(mdsc->sessions[i]); 3212 session = get_session(mdsc->sessions[i]);
3209 __unregister_session(mdsc, session); 3213 __unregister_session(mdsc, session);
3210 mutex_unlock(&mdsc->mutex); 3214 mutex_unlock(&mdsc->mutex);
3211 mutex_lock(&session->s_mutex); 3215 mutex_lock(&session->s_mutex);
3212 remove_session_caps(session); 3216 remove_session_caps(session);
3213 mutex_unlock(&session->s_mutex); 3217 mutex_unlock(&session->s_mutex);
3214 ceph_put_mds_session(session); 3218 ceph_put_mds_session(session);
3215 mutex_lock(&mdsc->mutex); 3219 mutex_lock(&mdsc->mutex);
3216 } 3220 }
3217 } 3221 }
3218 WARN_ON(!list_empty(&mdsc->cap_delay_list)); 3222 WARN_ON(!list_empty(&mdsc->cap_delay_list));
3219 mutex_unlock(&mdsc->mutex); 3223 mutex_unlock(&mdsc->mutex);
3220 3224
3221 ceph_cleanup_empty_realms(mdsc); 3225 ceph_cleanup_empty_realms(mdsc);
3222 3226
3223 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ 3227 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
3224 3228
3225 dout("stopped\n"); 3229 dout("stopped\n");
3226 } 3230 }
3227 3231
3228 static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) 3232 static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
3229 { 3233 {
3230 dout("stop\n"); 3234 dout("stop\n");
3231 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ 3235 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
3232 if (mdsc->mdsmap) 3236 if (mdsc->mdsmap)
3233 ceph_mdsmap_destroy(mdsc->mdsmap); 3237 ceph_mdsmap_destroy(mdsc->mdsmap);
3234 kfree(mdsc->sessions); 3238 kfree(mdsc->sessions);
3235 ceph_caps_finalize(mdsc); 3239 ceph_caps_finalize(mdsc);
3236 } 3240 }
3237 3241
3238 void ceph_mdsc_destroy(struct ceph_fs_client *fsc) 3242 void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
3239 { 3243 {
3240 struct ceph_mds_client *mdsc = fsc->mdsc; 3244 struct ceph_mds_client *mdsc = fsc->mdsc;
3241 3245
3242 dout("mdsc_destroy %p\n", mdsc); 3246 dout("mdsc_destroy %p\n", mdsc);
3243 ceph_mdsc_stop(mdsc); 3247 ceph_mdsc_stop(mdsc);
3244 3248
3245 /* flush out any connection work with references to us */ 3249 /* flush out any connection work with references to us */
3246 ceph_msgr_flush(); 3250 ceph_msgr_flush();
3247 3251
3248 fsc->mdsc = NULL; 3252 fsc->mdsc = NULL;
3249 kfree(mdsc); 3253 kfree(mdsc);
3250 dout("mdsc_destroy %p done\n", mdsc); 3254 dout("mdsc_destroy %p done\n", mdsc);
3251 } 3255 }
3252 3256
3253 3257
3254 /* 3258 /*
3255 * handle mds map update. 3259 * handle mds map update.
3256 */ 3260 */
3257 void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) 3261 void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
3258 { 3262 {
3259 u32 epoch; 3263 u32 epoch;
3260 u32 maplen; 3264 u32 maplen;
3261 void *p = msg->front.iov_base; 3265 void *p = msg->front.iov_base;
3262 void *end = p + msg->front.iov_len; 3266 void *end = p + msg->front.iov_len;
3263 struct ceph_mdsmap *newmap, *oldmap; 3267 struct ceph_mdsmap *newmap, *oldmap;
3264 struct ceph_fsid fsid; 3268 struct ceph_fsid fsid;
3265 int err = -EINVAL; 3269 int err = -EINVAL;
3266 3270
3267 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); 3271 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
3268 ceph_decode_copy(&p, &fsid, sizeof(fsid)); 3272 ceph_decode_copy(&p, &fsid, sizeof(fsid));
3269 if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0) 3273 if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
3270 return; 3274 return;
3271 epoch = ceph_decode_32(&p); 3275 epoch = ceph_decode_32(&p);
3272 maplen = ceph_decode_32(&p); 3276 maplen = ceph_decode_32(&p);
3273 dout("handle_map epoch %u len %d\n", epoch, (int)maplen); 3277 dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
3274 3278
3275 /* do we need it? */ 3279 /* do we need it? */
3276 ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch); 3280 ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
3277 mutex_lock(&mdsc->mutex); 3281 mutex_lock(&mdsc->mutex);
3278 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { 3282 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
3279 dout("handle_map epoch %u <= our %u\n", 3283 dout("handle_map epoch %u <= our %u\n",
3280 epoch, mdsc->mdsmap->m_epoch); 3284 epoch, mdsc->mdsmap->m_epoch);
3281 mutex_unlock(&mdsc->mutex); 3285 mutex_unlock(&mdsc->mutex);
3282 return; 3286 return;
3283 } 3287 }
3284 3288
3285 newmap = ceph_mdsmap_decode(&p, end); 3289 newmap = ceph_mdsmap_decode(&p, end);
3286 if (IS_ERR(newmap)) { 3290 if (IS_ERR(newmap)) {
3287 err = PTR_ERR(newmap); 3291 err = PTR_ERR(newmap);
3288 goto bad_unlock; 3292 goto bad_unlock;
3289 } 3293 }
3290 3294
3291 /* swap into place */ 3295 /* swap into place */
3292 if (mdsc->mdsmap) { 3296 if (mdsc->mdsmap) {
3293 oldmap = mdsc->mdsmap; 3297 oldmap = mdsc->mdsmap;
3294 mdsc->mdsmap = newmap; 3298 mdsc->mdsmap = newmap;
3295 check_new_map(mdsc, newmap, oldmap); 3299 check_new_map(mdsc, newmap, oldmap);
3296 ceph_mdsmap_destroy(oldmap); 3300 ceph_mdsmap_destroy(oldmap);
3297 } else { 3301 } else {
3298 mdsc->mdsmap = newmap; /* first mds map */ 3302 mdsc->mdsmap = newmap; /* first mds map */
3299 } 3303 }
3300 mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; 3304 mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
3301 3305
3302 __wake_requests(mdsc, &mdsc->waiting_for_map); 3306 __wake_requests(mdsc, &mdsc->waiting_for_map);
3303 3307
3304 mutex_unlock(&mdsc->mutex); 3308 mutex_unlock(&mdsc->mutex);
3305 schedule_delayed(mdsc); 3309 schedule_delayed(mdsc);
3306 return; 3310 return;
3307 3311
3308 bad_unlock: 3312 bad_unlock:
3309 mutex_unlock(&mdsc->mutex); 3313 mutex_unlock(&mdsc->mutex);
3310 bad: 3314 bad:
3311 pr_err("error decoding mdsmap %d\n", err); 3315 pr_err("error decoding mdsmap %d\n", err);
3312 return; 3316 return;
3313 } 3317 }
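
The decode prologue above follows the usual libceph convention: ceph_decode_need() jumps to the supplied label when fewer bytes remain than requested, so every fixed-size read is bounds-checked up front (the "fix length validation in parse_reply_info()" patch in this merge tightens exactly this kind of check elsewhere). A minimal sketch of the idiom:

	void *p = msg->front.iov_base;
	void *end = p + msg->front.iov_len;
	u32 epoch, maplen;

	ceph_decode_need(&p, end, 2 * sizeof(u32), bad);	/* verify length first... */
	epoch = ceph_decode_32(&p);				/* ...then read */
	maplen = ceph_decode_32(&p);
	return;
bad:
	pr_err("short or corrupt message\n");
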
3314 3318
3315 static struct ceph_connection *con_get(struct ceph_connection *con) 3319 static struct ceph_connection *con_get(struct ceph_connection *con)
3316 { 3320 {
3317 struct ceph_mds_session *s = con->private; 3321 struct ceph_mds_session *s = con->private;
3318 3322
3319 if (get_session(s)) { 3323 if (get_session(s)) {
3320 dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref)); 3324 dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
3321 return con; 3325 return con;
3322 } 3326 }
3323 dout("mdsc con_get %p FAIL\n", s); 3327 dout("mdsc con_get %p FAIL\n", s);
3324 return NULL; 3328 return NULL;
3325 } 3329 }
3326 3330
3327 static void con_put(struct ceph_connection *con) 3331 static void con_put(struct ceph_connection *con)
3328 { 3332 {
3329 struct ceph_mds_session *s = con->private; 3333 struct ceph_mds_session *s = con->private;
3330 3334
3331 dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1); 3335 dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1);
3332 ceph_put_mds_session(s); 3336 ceph_put_mds_session(s);
3333 } 3337 }
3334 3338
3335 /* 3339 /*
3336 * if the client is unresponsive for long enough, the mds will kill 3340 * if the client is unresponsive for long enough, the mds will kill
3337 * the session entirely. 3341 * the session entirely.
3338 */ 3342 */
3339 static void peer_reset(struct ceph_connection *con) 3343 static void peer_reset(struct ceph_connection *con)
3340 { 3344 {
3341 struct ceph_mds_session *s = con->private; 3345 struct ceph_mds_session *s = con->private;
3342 struct ceph_mds_client *mdsc = s->s_mdsc; 3346 struct ceph_mds_client *mdsc = s->s_mdsc;
3343 3347
3344 pr_warning("mds%d closed our session\n", s->s_mds); 3348 pr_warning("mds%d closed our session\n", s->s_mds);
3345 send_mds_reconnect(mdsc, s); 3349 send_mds_reconnect(mdsc, s);
3346 } 3350 }
3347 3351
3348 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) 3352 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
3349 { 3353 {
3350 struct ceph_mds_session *s = con->private; 3354 struct ceph_mds_session *s = con->private;
3351 struct ceph_mds_client *mdsc = s->s_mdsc; 3355 struct ceph_mds_client *mdsc = s->s_mdsc;
3352 int type = le16_to_cpu(msg->hdr.type); 3356 int type = le16_to_cpu(msg->hdr.type);
3353 3357
3354 mutex_lock(&mdsc->mutex); 3358 mutex_lock(&mdsc->mutex);
3355 if (__verify_registered_session(mdsc, s) < 0) { 3359 if (__verify_registered_session(mdsc, s) < 0) {
3356 mutex_unlock(&mdsc->mutex); 3360 mutex_unlock(&mdsc->mutex);
3357 goto out; 3361 goto out;
3358 } 3362 }
3359 mutex_unlock(&mdsc->mutex); 3363 mutex_unlock(&mdsc->mutex);
3360 3364
3361 switch (type) { 3365 switch (type) {
3362 case CEPH_MSG_MDS_MAP: 3366 case CEPH_MSG_MDS_MAP:
3363 ceph_mdsc_handle_map(mdsc, msg); 3367 ceph_mdsc_handle_map(mdsc, msg);
3364 break; 3368 break;
3365 case CEPH_MSG_CLIENT_SESSION: 3369 case CEPH_MSG_CLIENT_SESSION:
3366 handle_session(s, msg); 3370 handle_session(s, msg);
3367 break; 3371 break;
3368 case CEPH_MSG_CLIENT_REPLY: 3372 case CEPH_MSG_CLIENT_REPLY:
3369 handle_reply(s, msg); 3373 handle_reply(s, msg);
3370 break; 3374 break;
3371 case CEPH_MSG_CLIENT_REQUEST_FORWARD: 3375 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
3372 handle_forward(mdsc, s, msg); 3376 handle_forward(mdsc, s, msg);
3373 break; 3377 break;
3374 case CEPH_MSG_CLIENT_CAPS: 3378 case CEPH_MSG_CLIENT_CAPS:
3375 ceph_handle_caps(s, msg); 3379 ceph_handle_caps(s, msg);
3376 break; 3380 break;
3377 case CEPH_MSG_CLIENT_SNAP: 3381 case CEPH_MSG_CLIENT_SNAP:
3378 ceph_handle_snap(mdsc, s, msg); 3382 ceph_handle_snap(mdsc, s, msg);
3379 break; 3383 break;
3380 case CEPH_MSG_CLIENT_LEASE: 3384 case CEPH_MSG_CLIENT_LEASE:
3381 handle_lease(mdsc, s, msg); 3385 handle_lease(mdsc, s, msg);
3382 break; 3386 break;
3383 3387
3384 default: 3388 default:
3385 pr_err("received unknown message type %d %s\n", type, 3389 pr_err("received unknown message type %d %s\n", type,
3386 ceph_msg_type_name(type)); 3390 ceph_msg_type_name(type));
3387 } 3391 }
3388 out: 3392 out:
3389 ceph_msg_put(msg); 3393 ceph_msg_put(msg);
3390 } 3394 }
3391 3395
3392 /* 3396 /*
3393 * authentication 3397 * authentication
3394 */ 3398 */
3395 static int get_authorizer(struct ceph_connection *con, 3399 static int get_authorizer(struct ceph_connection *con,
3396 void **buf, int *len, int *proto, 3400 void **buf, int *len, int *proto,
3397 void **reply_buf, int *reply_len, int force_new) 3401 void **reply_buf, int *reply_len, int force_new)
3398 { 3402 {
3399 struct ceph_mds_session *s = con->private; 3403 struct ceph_mds_session *s = con->private;
3400 struct ceph_mds_client *mdsc = s->s_mdsc; 3404 struct ceph_mds_client *mdsc = s->s_mdsc;
3401 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; 3405 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3402 int ret = 0; 3406 int ret = 0;
3403 3407
3404 if (force_new && s->s_authorizer) { 3408 if (force_new && s->s_authorizer) {
3405 ac->ops->destroy_authorizer(ac, s->s_authorizer); 3409 ac->ops->destroy_authorizer(ac, s->s_authorizer);
3406 s->s_authorizer = NULL; 3410 s->s_authorizer = NULL;
3407 } 3411 }
3408 if (s->s_authorizer == NULL) { 3412 if (s->s_authorizer == NULL) {
3409 if (ac->ops->create_authorizer) { 3413 if (ac->ops->create_authorizer) {
3410 ret = ac->ops->create_authorizer( 3414 ret = ac->ops->create_authorizer(
3411 ac, CEPH_ENTITY_TYPE_MDS, 3415 ac, CEPH_ENTITY_TYPE_MDS,
3412 &s->s_authorizer, 3416 &s->s_authorizer,
3413 &s->s_authorizer_buf, 3417 &s->s_authorizer_buf,
3414 &s->s_authorizer_buf_len, 3418 &s->s_authorizer_buf_len,
3415 &s->s_authorizer_reply_buf, 3419 &s->s_authorizer_reply_buf,
3416 &s->s_authorizer_reply_buf_len); 3420 &s->s_authorizer_reply_buf_len);
3417 if (ret) 3421 if (ret)
3418 return ret; 3422 return ret;
3419 } 3423 }
3420 } 3424 }
3421 3425
3422 *proto = ac->protocol; 3426 *proto = ac->protocol;
3423 *buf = s->s_authorizer_buf; 3427 *buf = s->s_authorizer_buf;
3424 *len = s->s_authorizer_buf_len; 3428 *len = s->s_authorizer_buf_len;
3425 *reply_buf = s->s_authorizer_reply_buf; 3429 *reply_buf = s->s_authorizer_reply_buf;
3426 *reply_len = s->s_authorizer_reply_buf_len; 3430 *reply_len = s->s_authorizer_reply_buf_len;
3427 return 0; 3431 return 0;
3428 } 3432 }
3429 3433
3430 3434
3431 static int verify_authorizer_reply(struct ceph_connection *con, int len) 3435 static int verify_authorizer_reply(struct ceph_connection *con, int len)
3432 { 3436 {
3433 struct ceph_mds_session *s = con->private; 3437 struct ceph_mds_session *s = con->private;
3434 struct ceph_mds_client *mdsc = s->s_mdsc; 3438 struct ceph_mds_client *mdsc = s->s_mdsc;
3435 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; 3439 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3436 3440
3437 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len); 3441 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
3438 } 3442 }
3439 3443
3440 static int invalidate_authorizer(struct ceph_connection *con) 3444 static int invalidate_authorizer(struct ceph_connection *con)
3441 { 3445 {
3442 struct ceph_mds_session *s = con->private; 3446 struct ceph_mds_session *s = con->private;
3443 struct ceph_mds_client *mdsc = s->s_mdsc; 3447 struct ceph_mds_client *mdsc = s->s_mdsc;
3444 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; 3448 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3445 3449
3446 if (ac->ops->invalidate_authorizer) 3450 if (ac->ops->invalidate_authorizer)
3447 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); 3451 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3448 3452
3449 return ceph_monc_validate_auth(&mdsc->fsc->client->monc); 3453 return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
3450 } 3454 }
3451 3455
3452 static const struct ceph_connection_operations mds_con_ops = { 3456 static const struct ceph_connection_operations mds_con_ops = {
3453 .get = con_get, 3457 .get = con_get,
3454 .put = con_put, 3458 .put = con_put,
3455 .dispatch = dispatch, 3459 .dispatch = dispatch,
3456 .get_authorizer = get_authorizer, 3460 .get_authorizer = get_authorizer,
3457 .verify_authorizer_reply = verify_authorizer_reply, 3461 .verify_authorizer_reply = verify_authorizer_reply,
3458 .invalidate_authorizer = invalidate_authorizer, 3462 .invalidate_authorizer = invalidate_authorizer,
3459 .peer_reset = peer_reset, 3463 .peer_reset = peer_reset,
3460 }; 3464 };
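
The messenger layer never calls these handlers by name; it pins the session and dispatches through this vtable. Roughly (a sketch of the calling convention, not actual messenger source):

	if (con->ops->get(con)) {		/* con_get(): may fail if session is dying */
		con->ops->dispatch(con, msg);	/* dispatch(): route by message type */
		con->ops->put(con);		/* con_put(): drop the session ref */
	}
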
3461 3465
3462 /* eof */ 3466 /* eof */
3463 3467
fs/ceph/mds_client.h
1 #ifndef _FS_CEPH_MDS_CLIENT_H 1 #ifndef _FS_CEPH_MDS_CLIENT_H
2 #define _FS_CEPH_MDS_CLIENT_H 2 #define _FS_CEPH_MDS_CLIENT_H
3 3
4 #include <linux/completion.h> 4 #include <linux/completion.h>
5 #include <linux/kref.h> 5 #include <linux/kref.h>
6 #include <linux/list.h> 6 #include <linux/list.h>
7 #include <linux/mutex.h> 7 #include <linux/mutex.h>
8 #include <linux/rbtree.h> 8 #include <linux/rbtree.h>
9 #include <linux/spinlock.h> 9 #include <linux/spinlock.h>
10 10
11 #include <linux/ceph/types.h> 11 #include <linux/ceph/types.h>
12 #include <linux/ceph/messenger.h> 12 #include <linux/ceph/messenger.h>
13 #include <linux/ceph/mdsmap.h> 13 #include <linux/ceph/mdsmap.h>
14 14
15 /* 15 /*
16 * Some lock dependencies: 16 * Some lock dependencies:
17 * 17 *
18 * session->s_mutex 18 * session->s_mutex
19 * mdsc->mutex 19 * mdsc->mutex
20 * 20 *
21 * mdsc->snap_rwsem 21 * mdsc->snap_rwsem
22 * 22 *
23 * ci->i_ceph_lock 23 * ci->i_ceph_lock
24 * mdsc->snap_flush_lock 24 * mdsc->snap_flush_lock
25 * mdsc->cap_delay_lock 25 * mdsc->cap_delay_lock
26 * 26 *
27 */ 27 */
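
Concretely, the ordering listed above means any path that needs both locks must take s_mutex first; code that already holds mdsc->mutex (like the delayed_work session scan in mds_client.c) drops it before taking s_mutex. A sketch of the legal nesting:

	mutex_lock(&session->s_mutex);	/* outer */
	mutex_lock(&mdsc->mutex);	/* inner */
	/* ... touch both session and client state ... */
	mutex_unlock(&mdsc->mutex);
	mutex_unlock(&session->s_mutex);
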
28 28
29 struct ceph_fs_client; 29 struct ceph_fs_client;
30 struct ceph_cap; 30 struct ceph_cap;
31 31
32 /* 32 /*
33 * parsed info about a single inode. pointers are into the encoded 33 * parsed info about a single inode. pointers are into the encoded
34 * on-wire structures within the mds reply message payload. 34 * on-wire structures within the mds reply message payload.
35 */ 35 */
36 struct ceph_mds_reply_info_in { 36 struct ceph_mds_reply_info_in {
37 struct ceph_mds_reply_inode *in; 37 struct ceph_mds_reply_inode *in;
38 struct ceph_dir_layout dir_layout; 38 struct ceph_dir_layout dir_layout;
39 u32 symlink_len; 39 u32 symlink_len;
40 char *symlink; 40 char *symlink;
41 u32 xattr_len; 41 u32 xattr_len;
42 char *xattr_data; 42 char *xattr_data;
43 }; 43 };
44 44
45 /* 45 /*
46 * parsed info about an mds reply, including information about 46 * parsed info about an mds reply, including information about
47 * either: 1) the target inode and/or its parent directory and dentry, 47 * either: 1) the target inode and/or its parent directory and dentry,
48 * and directory contents (for readdir results), or 48 * and directory contents (for readdir results), or
49 * 2) the file range lock info (for fcntl F_GETLK results). 49 * 2) the file range lock info (for fcntl F_GETLK results).
50 */ 50 */
51 struct ceph_mds_reply_info_parsed { 51 struct ceph_mds_reply_info_parsed {
52 struct ceph_mds_reply_head *head; 52 struct ceph_mds_reply_head *head;
53 53
54 /* trace */ 54 /* trace */
55 struct ceph_mds_reply_info_in diri, targeti; 55 struct ceph_mds_reply_info_in diri, targeti;
56 struct ceph_mds_reply_dirfrag *dirfrag; 56 struct ceph_mds_reply_dirfrag *dirfrag;
57 char *dname; 57 char *dname;
58 u32 dname_len; 58 u32 dname_len;
59 struct ceph_mds_reply_lease *dlease; 59 struct ceph_mds_reply_lease *dlease;
60 60
61 /* extra */ 61 /* extra */
62 union { 62 union {
63 /* for fcntl F_GETLK results */ 63 /* for fcntl F_GETLK results */
64 struct ceph_filelock *filelock_reply; 64 struct ceph_filelock *filelock_reply;
65 65
66 /* for readdir results */ 66 /* for readdir results */
67 struct { 67 struct {
68 struct ceph_mds_reply_dirfrag *dir_dir; 68 struct ceph_mds_reply_dirfrag *dir_dir;
69 int dir_nr; 69 int dir_nr;
70 char **dir_dname; 70 char **dir_dname;
71 u32 *dir_dname_len; 71 u32 *dir_dname_len;
72 struct ceph_mds_reply_lease **dir_dlease; 72 struct ceph_mds_reply_lease **dir_dlease;
73 struct ceph_mds_reply_info_in *dir_in; 73 struct ceph_mds_reply_info_in *dir_in;
74 u8 dir_complete, dir_end; 74 u8 dir_complete, dir_end;
75 }; 75 };
76 }; 76 };
77 77
78 /* encoded blob describing snapshot contexts for certain 78 /* encoded blob describing snapshot contexts for certain
79 operations (e.g., open) */ 79 operations (e.g., open) */
80 void *snapblob; 80 void *snapblob;
81 int snapblob_len; 81 int snapblob_len;
82 }; 82 };
83 83
84 84
85 /* 85 /*
86 * cap releases are batched and sent to the MDS en masse. 86 * cap releases are batched and sent to the MDS en masse.
87 */ 87 */
88 #define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \ 88 #define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \
89 sizeof(struct ceph_mds_cap_release)) / \ 89 sizeof(struct ceph_mds_cap_release)) / \
90 sizeof(struct ceph_mds_cap_item)) 90 sizeof(struct ceph_mds_cap_item))
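
To put a number on the batching: assuming a 4 KB PAGE_CACHE_SIZE and, purely for illustration, a 16-byte ceph_mds_cap_release header and 24-byte ceph_mds_cap_item (the real sizes come from the wire-protocol headers), the macro works out to:

	CEPH_CAPS_PER_RELEASE = (PAGE_CACHE_SIZE - sizeof(release head)) / sizeof(cap item)
	                      = (4096 - 16) / 24
	                      = 170 cap releases per message   (illustrative sizes)
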
91 91
92 92
93 /* 93 /*
94 * state associated with each MDS<->client session 94 * state associated with each MDS<->client session
95 */ 95 */
96 enum { 96 enum {
97 CEPH_MDS_SESSION_NEW = 1, 97 CEPH_MDS_SESSION_NEW = 1,
98 CEPH_MDS_SESSION_OPENING = 2, 98 CEPH_MDS_SESSION_OPENING = 2,
99 CEPH_MDS_SESSION_OPEN = 3, 99 CEPH_MDS_SESSION_OPEN = 3,
100 CEPH_MDS_SESSION_HUNG = 4, 100 CEPH_MDS_SESSION_HUNG = 4,
101 CEPH_MDS_SESSION_CLOSING = 5, 101 CEPH_MDS_SESSION_CLOSING = 5,
102 CEPH_MDS_SESSION_RESTARTING = 6, 102 CEPH_MDS_SESSION_RESTARTING = 6,
103 CEPH_MDS_SESSION_RECONNECTING = 7, 103 CEPH_MDS_SESSION_RECONNECTING = 7,
104 }; 104 };
105 105
106 struct ceph_mds_session { 106 struct ceph_mds_session {
107 struct ceph_mds_client *s_mdsc; 107 struct ceph_mds_client *s_mdsc;
108 int s_mds; 108 int s_mds;
109 int s_state; 109 int s_state;
110 unsigned long s_ttl; /* time until mds kills us */ 110 unsigned long s_ttl; /* time until mds kills us */
111 u64 s_seq; /* incoming msg seq # */ 111 u64 s_seq; /* incoming msg seq # */
112 struct mutex s_mutex; /* serialize session messages */ 112 struct mutex s_mutex; /* serialize session messages */
113 113
114 struct ceph_connection s_con; 114 struct ceph_connection s_con;
115 115
116 struct ceph_authorizer *s_authorizer; 116 struct ceph_authorizer *s_authorizer;
117 void *s_authorizer_buf, *s_authorizer_reply_buf; 117 void *s_authorizer_buf, *s_authorizer_reply_buf;
118 size_t s_authorizer_buf_len, s_authorizer_reply_buf_len; 118 size_t s_authorizer_buf_len, s_authorizer_reply_buf_len;
119 119
120 /* protected by s_cap_lock */ 120 /* protected by s_gen_ttl_lock */
121 spinlock_t s_cap_lock; 121 spinlock_t s_gen_ttl_lock;
122 u32 s_cap_gen; /* inc each time we get mds stale msg */ 122 u32 s_cap_gen; /* inc each time we get mds stale msg */
123 unsigned long s_cap_ttl; /* when session caps expire */ 123 unsigned long s_cap_ttl; /* when session caps expire */
124
125 /* protected by s_cap_lock */
126 spinlock_t s_cap_lock;
124 struct list_head s_caps; /* all caps issued by this session */ 127 struct list_head s_caps; /* all caps issued by this session */
125 int s_nr_caps, s_trim_caps; 128 int s_nr_caps, s_trim_caps;
126 int s_num_cap_releases; 129 int s_num_cap_releases;
127 struct list_head s_cap_releases; /* waiting cap_release messages */ 130 struct list_head s_cap_releases; /* waiting cap_release messages */
128 struct list_head s_cap_releases_done; /* ready to send */ 131 struct list_head s_cap_releases_done; /* ready to send */
129 struct ceph_cap *s_cap_iterator; 132 struct ceph_cap *s_cap_iterator;
130 133
131 /* protected by mutex */ 134 /* protected by mutex */
132 struct list_head s_cap_flushing; /* inodes w/ flushing caps */ 135 struct list_head s_cap_flushing; /* inodes w/ flushing caps */
133 struct list_head s_cap_snaps_flushing; 136 struct list_head s_cap_snaps_flushing;
134 unsigned long s_renew_requested; /* last time we sent a renew req */ 137 unsigned long s_renew_requested; /* last time we sent a renew req */
135 u64 s_renew_seq; 138 u64 s_renew_seq;
136 139
137 atomic_t s_ref; 140 atomic_t s_ref;
138 struct list_head s_waiting; /* waiting requests */ 141 struct list_head s_waiting; /* waiting requests */
139 struct list_head s_unsafe; /* unsafe requests */ 142 struct list_head s_unsafe; /* unsafe requests */
140 }; 143 };
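
This hunk is the header side of the lock-inversion fix in this merge: s_cap_gen and s_cap_ttl move out from under s_cap_lock to a dedicated s_gen_ttl_lock, so stale-session checks no longer need to touch the cap-list lock. A sketch of a reader under the new lock (variable names are illustrative):

	u32 gen;
	unsigned long ttl;

	spin_lock(&session->s_gen_ttl_lock);
	gen = session->s_cap_gen;
	ttl = session->s_cap_ttl;
	spin_unlock(&session->s_gen_ttl_lock);
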
141 144
142 /* 145 /*
143 * modes of choosing which MDS to send a request to 146 * modes of choosing which MDS to send a request to
144 */ 147 */
145 enum { 148 enum {
146 USE_ANY_MDS, 149 USE_ANY_MDS,
147 USE_RANDOM_MDS, 150 USE_RANDOM_MDS,
148 USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */ 151 USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */
149 }; 152 };
150 153
151 struct ceph_mds_request; 154 struct ceph_mds_request;
152 struct ceph_mds_client; 155 struct ceph_mds_client;
153 156
154 /* 157 /*
155 * request completion callback 158 * request completion callback
156 */ 159 */
157 typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, 160 typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
158 struct ceph_mds_request *req); 161 struct ceph_mds_request *req);
159 162
160 /* 163 /*
161 * an in-flight mds request 164 * an in-flight mds request
162 */ 165 */
163 struct ceph_mds_request { 166 struct ceph_mds_request {
164 u64 r_tid; /* transaction id */ 167 u64 r_tid; /* transaction id */
165 struct rb_node r_node; 168 struct rb_node r_node;
166 struct ceph_mds_client *r_mdsc; 169 struct ceph_mds_client *r_mdsc;
167 170
168 int r_op; /* mds op code */ 171 int r_op; /* mds op code */
169 172
170 /* operation on what? */ 173 /* operation on what? */
171 struct inode *r_inode; /* arg1 */ 174 struct inode *r_inode; /* arg1 */
172 struct dentry *r_dentry; /* arg1 */ 175 struct dentry *r_dentry; /* arg1 */
173 struct dentry *r_old_dentry; /* arg2: rename from or link from */ 176 struct dentry *r_old_dentry; /* arg2: rename from or link from */
174 struct inode *r_old_dentry_dir; /* arg2: old dentry's parent dir */ 177 struct inode *r_old_dentry_dir; /* arg2: old dentry's parent dir */
175 char *r_path1, *r_path2; 178 char *r_path1, *r_path2;
176 struct ceph_vino r_ino1, r_ino2; 179 struct ceph_vino r_ino1, r_ino2;
177 180
178 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ 181 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
179 struct inode *r_target_inode; /* resulting inode */ 182 struct inode *r_target_inode; /* resulting inode */
180 183
181 struct mutex r_fill_mutex; 184 struct mutex r_fill_mutex;
182 185
183 union ceph_mds_request_args r_args; 186 union ceph_mds_request_args r_args;
184 int r_fmode; /* file mode, if expecting cap */ 187 int r_fmode; /* file mode, if expecting cap */
185 uid_t r_uid; 188 uid_t r_uid;
186 gid_t r_gid; 189 gid_t r_gid;
187 190
188 /* for choosing which mds to send this request to */ 191 /* for choosing which mds to send this request to */
189 int r_direct_mode; 192 int r_direct_mode;
190 u32 r_direct_hash; /* choose dir frag based on this dentry hash */ 193 u32 r_direct_hash; /* choose dir frag based on this dentry hash */
191 bool r_direct_is_hash; /* true if r_direct_hash is valid */ 194 bool r_direct_is_hash; /* true if r_direct_hash is valid */
192 195
193 /* data payload is used for xattr ops */ 196 /* data payload is used for xattr ops */
194 struct page **r_pages; 197 struct page **r_pages;
195 int r_num_pages; 198 int r_num_pages;
196 int r_data_len; 199 int r_data_len;
197 200
198 /* what caps shall we drop? */ 201 /* what caps shall we drop? */
199 int r_inode_drop, r_inode_unless; 202 int r_inode_drop, r_inode_unless;
200 int r_dentry_drop, r_dentry_unless; 203 int r_dentry_drop, r_dentry_unless;
201 int r_old_dentry_drop, r_old_dentry_unless; 204 int r_old_dentry_drop, r_old_dentry_unless;
202 struct inode *r_old_inode; 205 struct inode *r_old_inode;
203 int r_old_inode_drop, r_old_inode_unless; 206 int r_old_inode_drop, r_old_inode_unless;
204 207
205 struct ceph_msg *r_request; /* original request */ 208 struct ceph_msg *r_request; /* original request */
206 int r_request_release_offset; 209 int r_request_release_offset;
207 struct ceph_msg *r_reply; 210 struct ceph_msg *r_reply;
208 struct ceph_mds_reply_info_parsed r_reply_info; 211 struct ceph_mds_reply_info_parsed r_reply_info;
209 int r_err; 212 int r_err;
210 bool r_aborted; 213 bool r_aborted;
211 214
212 unsigned long r_timeout; /* optional. jiffies */ 215 unsigned long r_timeout; /* optional. jiffies */
213 unsigned long r_started; /* start time to measure timeout against */ 216 unsigned long r_started; /* start time to measure timeout against */
214 unsigned long r_request_started; /* start time for mds request only, 217 unsigned long r_request_started; /* start time for mds request only,
215 used to measure lease durations */ 218 used to measure lease durations */
216 219
217 /* link unsafe requests to parent directory, for fsync */ 220 /* link unsafe requests to parent directory, for fsync */
218 struct inode *r_unsafe_dir; 221 struct inode *r_unsafe_dir;
219 struct list_head r_unsafe_dir_item; 222 struct list_head r_unsafe_dir_item;
220 223
221 struct ceph_mds_session *r_session; 224 struct ceph_mds_session *r_session;
222 225
223 int r_attempts; /* resend attempts */ 226 int r_attempts; /* resend attempts */
224 int r_num_fwd; /* number of forward attempts */ 227 int r_num_fwd; /* number of forward attempts */
225 int r_resend_mds; /* mds to resend to next, if any*/ 228 int r_resend_mds; /* mds to resend to next, if any*/
226 u32 r_sent_on_mseq; /* cap mseq request was sent at*/ 229 u32 r_sent_on_mseq; /* cap mseq request was sent at*/
227 230
228 struct kref r_kref; 231 struct kref r_kref;
229 struct list_head r_wait; 232 struct list_head r_wait;
230 struct completion r_completion; 233 struct completion r_completion;
231 struct completion r_safe_completion; 234 struct completion r_safe_completion;
232 ceph_mds_request_callback_t r_callback; 235 ceph_mds_request_callback_t r_callback;
233 struct list_head r_unsafe_item; /* per-session unsafe list item */ 236 struct list_head r_unsafe_item; /* per-session unsafe list item */
234 bool r_got_unsafe, r_got_safe, r_got_result; 237 bool r_got_unsafe, r_got_safe, r_got_result;
235 238
236 bool r_did_prepopulate; 239 bool r_did_prepopulate;
237 u32 r_readdir_offset; 240 u32 r_readdir_offset;
238 241
239 struct ceph_cap_reservation r_caps_reservation; 242 struct ceph_cap_reservation r_caps_reservation;
240 int r_num_caps; 243 int r_num_caps;
241 }; 244 };
242 245
243 /* 246 /*
244 * mds client state 247 * mds client state
245 */ 248 */
246 struct ceph_mds_client { 249 struct ceph_mds_client {
247 struct ceph_fs_client *fsc; 250 struct ceph_fs_client *fsc;
248 struct mutex mutex; /* all nested structures */ 251 struct mutex mutex; /* all nested structures */
249 252
250 struct ceph_mdsmap *mdsmap; 253 struct ceph_mdsmap *mdsmap;
251 struct completion safe_umount_waiters; 254 struct completion safe_umount_waiters;
252 wait_queue_head_t session_close_wq; 255 wait_queue_head_t session_close_wq;
253 struct list_head waiting_for_map; 256 struct list_head waiting_for_map;
254 257
255 struct ceph_mds_session **sessions; /* NULL for mds if no session */ 258 struct ceph_mds_session **sessions; /* NULL for mds if no session */
256 int max_sessions; /* len of s_mds_sessions */ 259 int max_sessions; /* len of s_mds_sessions */
257 int stopping; /* true if shutting down */ 260 int stopping; /* true if shutting down */
258 261
259 /* 262 /*
260 * snap_rwsem will cover cap linkage into snaprealms, and 263 * snap_rwsem will cover cap linkage into snaprealms, and
261 * realm snap contexts. (later, we can do per-realm snap 264 * realm snap contexts. (later, we can do per-realm snap
262 * contexts locks..) the empty list contains realms with no 265 * contexts locks..) the empty list contains realms with no
263 * references (implying they contain no inodes with caps) that 266 * references (implying they contain no inodes with caps) that
264 * should be destroyed. 267 * should be destroyed.
265 */ 268 */
266 struct rw_semaphore snap_rwsem; 269 struct rw_semaphore snap_rwsem;
267 struct rb_root snap_realms; 270 struct rb_root snap_realms;
268 struct list_head snap_empty; 271 struct list_head snap_empty;
269 spinlock_t snap_empty_lock; /* protect snap_empty */ 272 spinlock_t snap_empty_lock; /* protect snap_empty */
270 273
271 u64 last_tid; /* most recent mds request */ 274 u64 last_tid; /* most recent mds request */
272 struct rb_root request_tree; /* pending mds requests */ 275 struct rb_root request_tree; /* pending mds requests */
273 struct delayed_work delayed_work; /* delayed work */ 276 struct delayed_work delayed_work; /* delayed work */
274 unsigned long last_renew_caps; /* last time we renewed our caps */ 277 unsigned long last_renew_caps; /* last time we renewed our caps */
275 struct list_head cap_delay_list; /* caps with delayed release */ 278 struct list_head cap_delay_list; /* caps with delayed release */
276 spinlock_t cap_delay_lock; /* protects cap_delay_list */ 279 spinlock_t cap_delay_lock; /* protects cap_delay_list */
277 struct list_head snap_flush_list; /* cap_snaps ready to flush */ 280 struct list_head snap_flush_list; /* cap_snaps ready to flush */
278 spinlock_t snap_flush_lock; 281 spinlock_t snap_flush_lock;
279 282
280 u64 cap_flush_seq; 283 u64 cap_flush_seq;
281 struct list_head cap_dirty; /* inodes with dirty caps */ 284 struct list_head cap_dirty; /* inodes with dirty caps */
282 struct list_head cap_dirty_migrating; /* ...that are migration... */ 285 struct list_head cap_dirty_migrating; /* ...that are migration... */
283 int num_cap_flushing; /* # caps we are flushing */ 286 int num_cap_flushing; /* # caps we are flushing */
284 spinlock_t cap_dirty_lock; /* protects above items */ 287 spinlock_t cap_dirty_lock; /* protects above items */
285 wait_queue_head_t cap_flushing_wq; 288 wait_queue_head_t cap_flushing_wq;
286 289
287 /* 290 /*
288 * Cap reservations 291 * Cap reservations
289 * 292 *
290 * Maintain a global pool of preallocated struct ceph_caps, referenced 293 * Maintain a global pool of preallocated struct ceph_caps, referenced
291 * by struct ceph_caps_reservations. This ensures that we preallocate 294 * by struct ceph_caps_reservations. This ensures that we preallocate
292 * memory needed to successfully process an MDS response. (If an MDS 295 * memory needed to successfully process an MDS response. (If an MDS
293 * sends us cap information and we fail to process it, we will have 296 * sends us cap information and we fail to process it, we will have
294 * problems due to the client and MDS being out of sync.) 297 * problems due to the client and MDS being out of sync.)
295 * 298 *
296 * Reservations are 'owned' by a ceph_cap_reservation context. 299 * Reservations are 'owned' by a ceph_cap_reservation context.
297 */ 300 */
298 spinlock_t caps_list_lock; 301 spinlock_t caps_list_lock;
299 struct list_head caps_list; /* unused (reserved or 302 struct list_head caps_list; /* unused (reserved or
300 unreserved) */ 303 unreserved) */
301 int caps_total_count; /* total caps allocated */ 304 int caps_total_count; /* total caps allocated */
302 int caps_use_count; /* in use */ 305 int caps_use_count; /* in use */
303 int caps_reserve_count; /* unused, reserved */ 306 int caps_reserve_count; /* unused, reserved */
304 int caps_avail_count; /* unused, unreserved */ 307 int caps_avail_count; /* unused, unreserved */
305 int caps_min_count; /* keep at least this many 308 int caps_min_count; /* keep at least this many
306 (unreserved) */ 309 (unreserved) */
307 spinlock_t dentry_lru_lock; 310 spinlock_t dentry_lru_lock;
308 struct list_head dentry_lru; 311 struct list_head dentry_lru;
309 int num_dentry; 312 int num_dentry;
310 }; 313 };
311 314
312 extern const char *ceph_mds_op_name(int op); 315 extern const char *ceph_mds_op_name(int op);
313 316
314 extern struct ceph_mds_session * 317 extern struct ceph_mds_session *
315 __ceph_lookup_mds_session(struct ceph_mds_client *, int mds); 318 __ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
316 319
317 static inline struct ceph_mds_session * 320 static inline struct ceph_mds_session *
318 ceph_get_mds_session(struct ceph_mds_session *s) 321 ceph_get_mds_session(struct ceph_mds_session *s)
319 { 322 {
320 atomic_inc(&s->s_ref); 323 atomic_inc(&s->s_ref);
321 return s; 324 return s;
322 } 325 }
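
Note the asymmetry with the connection layer: ceph_get_mds_session() unconditionally bumps s_ref, so it is only safe when the caller already holds a reference, whereas con_get() in mds_client.c goes through get_session() and can return NULL for a session whose refcount is gone. Typical paired usage when a reference is already held:

	struct ceph_mds_session *s = ceph_get_mds_session(session);

	/* ... use s ... */
	ceph_put_mds_session(s);
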
323 326
324 extern void ceph_put_mds_session(struct ceph_mds_session *s); 327 extern void ceph_put_mds_session(struct ceph_mds_session *s);
325 328
326 extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, 329 extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
327 struct ceph_msg *msg, int mds); 330 struct ceph_msg *msg, int mds);
328 331
329 extern int ceph_mdsc_init(struct ceph_fs_client *fsc); 332 extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
330 extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); 333 extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
331 extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc); 334 extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
332 335
333 extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); 336 extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
334 337
335 extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, 338 extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
336 struct inode *inode, 339 struct inode *inode,
337 struct dentry *dn); 340 struct dentry *dn);
338 341
339 extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); 342 extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
340 343
341 extern struct ceph_mds_request * 344 extern struct ceph_mds_request *
342 ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); 345 ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
343 extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, 346 extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
344 struct ceph_mds_request *req); 347 struct ceph_mds_request *req);
345 extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, 348 extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
346 struct inode *dir, 349 struct inode *dir,
347 struct ceph_mds_request *req); 350 struct ceph_mds_request *req);
348 static inline void ceph_mdsc_get_request(struct ceph_mds_request *req) 351 static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
349 { 352 {
350 kref_get(&req->r_kref); 353 kref_get(&req->r_kref);
351 } 354 }
352 extern void ceph_mdsc_release_request(struct kref *kref); 355 extern void ceph_mdsc_release_request(struct kref *kref);
353 static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) 356 static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
354 { 357 {
355 kref_put(&req->r_kref, ceph_mdsc_release_request); 358 kref_put(&req->r_kref, ceph_mdsc_release_request);
356 } 359 }
357 360
358 extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc, 361 extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
359 struct ceph_mds_session *session); 362 struct ceph_mds_session *session);
360 extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, 363 extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
361 struct ceph_mds_session *session); 364 struct ceph_mds_session *session);
362 365
363 extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); 366 extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
364 367
365 extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, 368 extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
366 int stop_on_nosnap); 369 int stop_on_nosnap);
367 370
368 extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry); 371 extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
369 extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, 372 extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
370 struct inode *inode, 373 struct inode *inode,
371 struct dentry *dentry, char action, 374 struct dentry *dentry, char action,
372 u32 seq); 375 u32 seq);
373 376
374 extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, 377 extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
375 struct ceph_msg *msg); 378 struct ceph_msg *msg);
376 379
377 extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, 380 extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
378 struct ceph_mds_session *session); 381 struct ceph_mds_session *session);
379 382
380 #endif 383 #endif
381 384
fs/ceph/xattr.c
1 #include <linux/ceph/ceph_debug.h> 1 #include <linux/ceph/ceph_debug.h>
2 2
3 #include "super.h" 3 #include "super.h"
4 #include "mds_client.h" 4 #include "mds_client.h"
5 5
6 #include <linux/ceph/decode.h> 6 #include <linux/ceph/decode.h>
7 7
8 #include <linux/xattr.h> 8 #include <linux/xattr.h>
9 #include <linux/slab.h> 9 #include <linux/slab.h>
10 10
11 static bool ceph_is_valid_xattr(const char *name) 11 static bool ceph_is_valid_xattr(const char *name)
12 { 12 {
13 return !strncmp(name, "ceph.", 5) || 13 return !strncmp(name, "ceph.", 5) ||
14 !strncmp(name, XATTR_SECURITY_PREFIX, 14 !strncmp(name, XATTR_SECURITY_PREFIX,
15 XATTR_SECURITY_PREFIX_LEN) || 15 XATTR_SECURITY_PREFIX_LEN) ||
16 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 16 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
17 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); 17 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
18 } 18 }
19 19
20 /* 20 /*
21 * These define virtual xattrs exposing the recursive directory 21 * These define virtual xattrs exposing the recursive directory
22 * statistics and layout metadata. 22 * statistics and layout metadata.
23 */ 23 */
24 struct ceph_vxattr_cb { 24 struct ceph_vxattr_cb {
25 bool readonly; 25 bool readonly;
26 char *name; 26 char *name;
27 size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, 27 size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
28 size_t size); 28 size_t size);
29 }; 29 };
30 30
31 /* directories */ 31 /* directories */
32 32
33 static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val, 33 static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
34 size_t size) 34 size_t size)
35 { 35 {
36 return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs); 36 return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
37 } 37 }
38 38
39 static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val, 39 static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
40 size_t size) 40 size_t size)
41 { 41 {
42 return snprintf(val, size, "%lld", ci->i_files); 42 return snprintf(val, size, "%lld", ci->i_files);
43 } 43 }
44 44
45 static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val, 45 static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
46 size_t size) 46 size_t size)
47 { 47 {
48 return snprintf(val, size, "%lld", ci->i_subdirs); 48 return snprintf(val, size, "%lld", ci->i_subdirs);
49 } 49 }
50 50
51 static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val, 51 static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
52 size_t size) 52 size_t size)
53 { 53 {
54 return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs); 54 return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
55 } 55 }
56 56
57 static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val, 57 static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
58 size_t size) 58 size_t size)
59 { 59 {
60 return snprintf(val, size, "%lld", ci->i_rfiles); 60 return snprintf(val, size, "%lld", ci->i_rfiles);
61 } 61 }
62 62
63 static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val, 63 static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
64 size_t size) 64 size_t size)
65 { 65 {
66 return snprintf(val, size, "%lld", ci->i_rsubdirs); 66 return snprintf(val, size, "%lld", ci->i_rsubdirs);
67 } 67 }
68 68
69 static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val, 69 static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
70 size_t size) 70 size_t size)
71 { 71 {
72 return snprintf(val, size, "%lld", ci->i_rbytes); 72 return snprintf(val, size, "%lld", ci->i_rbytes);
73 } 73 }
74 74
75 static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val, 75 static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
76 size_t size) 76 size_t size)
77 { 77 {
78 return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec, 78 return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
79 (long)ci->i_rctime.tv_nsec); 79 (long)ci->i_rctime.tv_nsec);
80 } 80 }
81 81
82 static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { 82 static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
83 { true, "ceph.dir.entries", ceph_vxattrcb_entries}, 83 { true, "ceph.dir.entries", ceph_vxattrcb_entries},
84 { true, "ceph.dir.files", ceph_vxattrcb_files}, 84 { true, "ceph.dir.files", ceph_vxattrcb_files},
85 { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs}, 85 { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
86 { true, "ceph.dir.rentries", ceph_vxattrcb_rentries}, 86 { true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
87 { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles}, 87 { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
88 { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, 88 { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
89 { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes}, 89 { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
90 { true, "ceph.dir.rctime", ceph_vxattrcb_rctime}, 90 { true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
91 { true, NULL, NULL } 91 { true, NULL, NULL }
92 }; 92 };
93 93
94 /* files */ 94 /* files */
95 95
96 static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, 96 static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
97 size_t size) 97 size_t size)
98 { 98 {
99 int ret; 99 int ret;
100 100
101 ret = snprintf(val, size, 101 ret = snprintf(val, size,
102 "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n", 102 "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
103 (unsigned long long)ceph_file_layout_su(ci->i_layout), 103 (unsigned long long)ceph_file_layout_su(ci->i_layout),
104 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), 104 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
105 (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); 105 (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
106 if (ceph_file_layout_pg_preferred(ci->i_layout)) 106 if (ceph_file_layout_pg_preferred(ci->i_layout))
107 ret += snprintf(val + ret, size, "preferred_osd=%lld\n", 107 ret += snprintf(val + ret, size, "preferred_osd=%lld\n",
108 (unsigned long long)ceph_file_layout_pg_preferred( 108 (unsigned long long)ceph_file_layout_pg_preferred(
109 ci->i_layout)); 109 ci->i_layout));
110 return ret; 110 return ret;
111 } 111 }
112 112
113 static struct ceph_vxattr_cb ceph_file_vxattrs[] = { 113 static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
114 { true, "ceph.file.layout", ceph_vxattrcb_layout},
115 /* The following extended attribute name is deprecated */
114 { true, "ceph.layout", ceph_vxattrcb_layout}, 116 { true, "ceph.layout", ceph_vxattrcb_layout},
115 { NULL, NULL } 117 { true, NULL, NULL }
116 }; 118 };
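/*
 * Aside (illustrative, not part of this diff): with the rename above,
 * userspace reads the layout through "ceph.file.layout"; the old
 * "ceph.layout" name is kept only for compatibility. A minimal sketch
 * using the standard getxattr(2) call; the mount path is a placeholder.
 */
#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(void)
{
	char buf[256];
	ssize_t len = getxattr("/mnt/ceph/file", "ceph.file.layout",
			       buf, sizeof(buf) - 1);

	if (len < 0) {
		perror("getxattr");
		return 1;
	}
	buf[len] = '\0';
	printf("%s", buf);	/* chunk_bytes=..., stripe_count=..., ... */
	return 0;
}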
117 119
118 static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) 120 static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
119 { 121 {
120 if (S_ISDIR(inode->i_mode)) 122 if (S_ISDIR(inode->i_mode))
121 return ceph_dir_vxattrs; 123 return ceph_dir_vxattrs;
122 else if (S_ISREG(inode->i_mode)) 124 else if (S_ISREG(inode->i_mode))
123 return ceph_file_vxattrs; 125 return ceph_file_vxattrs;
124 return NULL; 126 return NULL;
125 } 127 }
126 128
127 static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr, 129 static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
128 const char *name) 130 const char *name)
129 { 131 {
130 do { 132 do {
131 if (strcmp(vxattr->name, name) == 0) 133 if (strcmp(vxattr->name, name) == 0)
132 return vxattr; 134 return vxattr;
133 vxattr++; 135 vxattr++;
134 } while (vxattr->name); 136 } while (vxattr->name);
135 return NULL; 137 return NULL;
136 } 138 }
137 139
138 static int __set_xattr(struct ceph_inode_info *ci, 140 static int __set_xattr(struct ceph_inode_info *ci,
139 const char *name, int name_len, 141 const char *name, int name_len,
140 const char *val, int val_len, 142 const char *val, int val_len,
141 int dirty, 143 int dirty,
142 int should_free_name, int should_free_val, 144 int should_free_name, int should_free_val,
143 struct ceph_inode_xattr **newxattr) 145 struct ceph_inode_xattr **newxattr)
144 { 146 {
145 struct rb_node **p; 147 struct rb_node **p;
146 struct rb_node *parent = NULL; 148 struct rb_node *parent = NULL;
147 struct ceph_inode_xattr *xattr = NULL; 149 struct ceph_inode_xattr *xattr = NULL;
148 int c; 150 int c;
149 int new = 0; 151 int new = 0;
150 152
151 p = &ci->i_xattrs.index.rb_node; 153 p = &ci->i_xattrs.index.rb_node;
152 while (*p) { 154 while (*p) {
153 parent = *p; 155 parent = *p;
154 xattr = rb_entry(parent, struct ceph_inode_xattr, node); 156 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
155 c = strncmp(name, xattr->name, min(name_len, xattr->name_len)); 157 c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
156 if (c < 0) 158 if (c < 0)
157 p = &(*p)->rb_left; 159 p = &(*p)->rb_left;
158 else if (c > 0) 160 else if (c > 0)
159 p = &(*p)->rb_right; 161 p = &(*p)->rb_right;
160 else { 162 else {
161 if (name_len == xattr->name_len) 163 if (name_len == xattr->name_len)
162 break; 164 break;
163 else if (name_len < xattr->name_len) 165 else if (name_len < xattr->name_len)
164 p = &(*p)->rb_left; 166 p = &(*p)->rb_left;
165 else 167 else
166 p = &(*p)->rb_right; 168 p = &(*p)->rb_right;
167 } 169 }
168 xattr = NULL; 170 xattr = NULL;
169 } 171 }
170 172
171 if (!xattr) { 173 if (!xattr) {
172 new = 1; 174 new = 1;
173 xattr = *newxattr; 175 xattr = *newxattr;
174 xattr->name = name; 176 xattr->name = name;
175 xattr->name_len = name_len; 177 xattr->name_len = name_len;
176 xattr->should_free_name = should_free_name; 178 xattr->should_free_name = should_free_name;
177 179
178 ci->i_xattrs.count++; 180 ci->i_xattrs.count++;
179 dout("__set_xattr count=%d\n", ci->i_xattrs.count); 181 dout("__set_xattr count=%d\n", ci->i_xattrs.count);
180 } else { 182 } else {
181 kfree(*newxattr); 183 kfree(*newxattr);
182 *newxattr = NULL; 184 *newxattr = NULL;
183 if (xattr->should_free_val) 185 if (xattr->should_free_val)
184 kfree((void *)xattr->val); 186 kfree((void *)xattr->val);
185 187
186 if (should_free_name) { 188 if (should_free_name) {
187 kfree((void *)name); 189 kfree((void *)name);
188 name = xattr->name; 190 name = xattr->name;
189 } 191 }
190 ci->i_xattrs.names_size -= xattr->name_len; 192 ci->i_xattrs.names_size -= xattr->name_len;
191 ci->i_xattrs.vals_size -= xattr->val_len; 193 ci->i_xattrs.vals_size -= xattr->val_len;
192 } 194 }
193 ci->i_xattrs.names_size += name_len; 195 ci->i_xattrs.names_size += name_len;
194 ci->i_xattrs.vals_size += val_len; 196 ci->i_xattrs.vals_size += val_len;
195 if (val) 197 if (val)
196 xattr->val = val; 198 xattr->val = val;
197 else 199 else
198 xattr->val = ""; 200 xattr->val = "";
199 201
200 xattr->val_len = val_len; 202 xattr->val_len = val_len;
201 xattr->dirty = dirty; 203 xattr->dirty = dirty;
202 xattr->should_free_val = (val && should_free_val); 204 xattr->should_free_val = (val && should_free_val);
203 205
204 if (new) { 206 if (new) {
205 rb_link_node(&xattr->node, parent, p); 207 rb_link_node(&xattr->node, parent, p);
206 rb_insert_color(&xattr->node, &ci->i_xattrs.index); 208 rb_insert_color(&xattr->node, &ci->i_xattrs.index);
207 dout("__set_xattr_val p=%p\n", p); 209 dout("__set_xattr_val p=%p\n", p);
208 } 210 }
209 211
210 dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n", 212 dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
211 ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val); 213 ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
212 214
213 return 0; 215 return 0;
214 } 216 }
215 217
216 static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci, 218 static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
217 const char *name) 219 const char *name)
218 { 220 {
219 struct rb_node **p; 221 struct rb_node **p;
220 struct rb_node *parent = NULL; 222 struct rb_node *parent = NULL;
221 struct ceph_inode_xattr *xattr = NULL; 223 struct ceph_inode_xattr *xattr = NULL;
222 int name_len = strlen(name); 224 int name_len = strlen(name);
223 int c; 225 int c;
224 226
225 p = &ci->i_xattrs.index.rb_node; 227 p = &ci->i_xattrs.index.rb_node;
226 while (*p) { 228 while (*p) {
227 parent = *p; 229 parent = *p;
228 xattr = rb_entry(parent, struct ceph_inode_xattr, node); 230 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
229 c = strncmp(name, xattr->name, xattr->name_len); 231 c = strncmp(name, xattr->name, xattr->name_len);
230 if (c == 0 && name_len > xattr->name_len) 232 if (c == 0 && name_len > xattr->name_len)
231 c = 1; 233 c = 1;
232 if (c < 0) 234 if (c < 0)
233 p = &(*p)->rb_left; 235 p = &(*p)->rb_left;
234 else if (c > 0) 236 else if (c > 0)
235 p = &(*p)->rb_right; 237 p = &(*p)->rb_right;
236 else { 238 else {
237 dout("__get_xattr %s: found %.*s\n", name, 239 dout("__get_xattr %s: found %.*s\n", name,
238 xattr->val_len, xattr->val); 240 xattr->val_len, xattr->val);
239 return xattr; 241 return xattr;
240 } 242 }
241 } 243 }
242 244
243 dout("__get_xattr %s: not found\n", name); 245 dout("__get_xattr %s: not found\n", name);
244 246
245 return NULL; 247 return NULL;
246 } 248 }
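/*
 * Aside (illustrative sketch, not in the diff): both tree walks above
 * order names by their common prefix first and break ties on length,
 * so "user.a" sorts before "user.ab". Standalone, that ordering is:
 */
#include <string.h>

static int xattr_name_cmp(const char *a, size_t a_len,
			  const char *b, size_t b_len)
{
	size_t n = a_len < b_len ? a_len : b_len;
	int c = strncmp(a, b, n);

	if (c)
		return c;
	if (a_len == b_len)
		return 0;
	return a_len < b_len ? -1 : 1;	/* shorter name sorts first */
}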
247 249
248 static void __free_xattr(struct ceph_inode_xattr *xattr) 250 static void __free_xattr(struct ceph_inode_xattr *xattr)
249 { 251 {
250 BUG_ON(!xattr); 252 BUG_ON(!xattr);
251 253
252 if (xattr->should_free_name) 254 if (xattr->should_free_name)
253 kfree((void *)xattr->name); 255 kfree((void *)xattr->name);
254 if (xattr->should_free_val) 256 if (xattr->should_free_val)
255 kfree((void *)xattr->val); 257 kfree((void *)xattr->val);
256 258
257 kfree(xattr); 259 kfree(xattr);
258 } 260 }
259 261
260 static int __remove_xattr(struct ceph_inode_info *ci, 262 static int __remove_xattr(struct ceph_inode_info *ci,
261 struct ceph_inode_xattr *xattr) 263 struct ceph_inode_xattr *xattr)
262 { 264 {
263 if (!xattr) 265 if (!xattr)
264 return -EOPNOTSUPP; 266 return -EOPNOTSUPP;
265 267
266 rb_erase(&xattr->node, &ci->i_xattrs.index); 268 rb_erase(&xattr->node, &ci->i_xattrs.index);
267 269
268 if (xattr->should_free_name) 270 if (xattr->should_free_name)
269 kfree((void *)xattr->name); 271 kfree((void *)xattr->name);
270 if (xattr->should_free_val) 272 if (xattr->should_free_val)
271 kfree((void *)xattr->val); 273 kfree((void *)xattr->val);
272 274
273 ci->i_xattrs.names_size -= xattr->name_len; 275 ci->i_xattrs.names_size -= xattr->name_len;
274 ci->i_xattrs.vals_size -= xattr->val_len; 276 ci->i_xattrs.vals_size -= xattr->val_len;
275 ci->i_xattrs.count--; 277 ci->i_xattrs.count--;
276 kfree(xattr); 278 kfree(xattr);
277 279
278 return 0; 280 return 0;
279 } 281 }
280 282
281 static int __remove_xattr_by_name(struct ceph_inode_info *ci, 283 static int __remove_xattr_by_name(struct ceph_inode_info *ci,
282 const char *name) 284 const char *name)
283 { 285 {
284 struct rb_node **p; 286 struct rb_node **p;
285 struct ceph_inode_xattr *xattr; 287 struct ceph_inode_xattr *xattr;
286 int err; 288 int err;
287 289
288 p = &ci->i_xattrs.index.rb_node; 290 p = &ci->i_xattrs.index.rb_node;
289 xattr = __get_xattr(ci, name); 291 xattr = __get_xattr(ci, name);
290 err = __remove_xattr(ci, xattr); 292 err = __remove_xattr(ci, xattr);
291 return err; 293 return err;
292 } 294 }
293 295
294 static char *__copy_xattr_names(struct ceph_inode_info *ci, 296 static char *__copy_xattr_names(struct ceph_inode_info *ci,
295 char *dest) 297 char *dest)
296 { 298 {
297 struct rb_node *p; 299 struct rb_node *p;
298 struct ceph_inode_xattr *xattr = NULL; 300 struct ceph_inode_xattr *xattr = NULL;
299 301
300 p = rb_first(&ci->i_xattrs.index); 302 p = rb_first(&ci->i_xattrs.index);
301 dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count); 303 dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
302 304
303 while (p) { 305 while (p) {
304 xattr = rb_entry(p, struct ceph_inode_xattr, node); 306 xattr = rb_entry(p, struct ceph_inode_xattr, node);
305 memcpy(dest, xattr->name, xattr->name_len); 307 memcpy(dest, xattr->name, xattr->name_len);
306 dest[xattr->name_len] = '\0'; 308 dest[xattr->name_len] = '\0';
307 309
308 dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name, 310 dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
309 xattr->name_len, ci->i_xattrs.names_size); 311 xattr->name_len, ci->i_xattrs.names_size);
310 312
311 dest += xattr->name_len + 1; 313 dest += xattr->name_len + 1;
312 p = rb_next(p); 314 p = rb_next(p);
313 } 315 }
314 316
315 return dest; 317 return dest;
316 } 318 }
317 319
318 void __ceph_destroy_xattrs(struct ceph_inode_info *ci) 320 void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
319 { 321 {
320 struct rb_node *p, *tmp; 322 struct rb_node *p, *tmp;
321 struct ceph_inode_xattr *xattr = NULL; 323 struct ceph_inode_xattr *xattr = NULL;
322 324
323 p = rb_first(&ci->i_xattrs.index); 325 p = rb_first(&ci->i_xattrs.index);
324 326
325 dout("__ceph_destroy_xattrs p=%p\n", p); 327 dout("__ceph_destroy_xattrs p=%p\n", p);
326 328
327 while (p) { 329 while (p) {
328 xattr = rb_entry(p, struct ceph_inode_xattr, node); 330 xattr = rb_entry(p, struct ceph_inode_xattr, node);
329 tmp = p; 331 tmp = p;
330 p = rb_next(tmp); 332 p = rb_next(tmp);
331 dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p, 333 dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
332 xattr->name_len, xattr->name); 334 xattr->name_len, xattr->name);
333 rb_erase(tmp, &ci->i_xattrs.index); 335 rb_erase(tmp, &ci->i_xattrs.index);
334 336
335 __free_xattr(xattr); 337 __free_xattr(xattr);
336 } 338 }
337 339
338 ci->i_xattrs.names_size = 0; 340 ci->i_xattrs.names_size = 0;
339 ci->i_xattrs.vals_size = 0; 341 ci->i_xattrs.vals_size = 0;
340 ci->i_xattrs.index_version = 0; 342 ci->i_xattrs.index_version = 0;
341 ci->i_xattrs.count = 0; 343 ci->i_xattrs.count = 0;
342 ci->i_xattrs.index = RB_ROOT; 344 ci->i_xattrs.index = RB_ROOT;
343 } 345 }
344 346
345 static int __build_xattrs(struct inode *inode) 347 static int __build_xattrs(struct inode *inode)
346 __releases(ci->i_ceph_lock) 348 __releases(ci->i_ceph_lock)
347 __acquires(ci->i_ceph_lock) 349 __acquires(ci->i_ceph_lock)
348 { 350 {
349 u32 namelen; 351 u32 namelen;
350 u32 numattr = 0; 352 u32 numattr = 0;
351 void *p, *end; 353 void *p, *end;
352 u32 len; 354 u32 len;
353 const char *name, *val; 355 const char *name, *val;
354 struct ceph_inode_info *ci = ceph_inode(inode); 356 struct ceph_inode_info *ci = ceph_inode(inode);
355 int xattr_version; 357 int xattr_version;
356 struct ceph_inode_xattr **xattrs = NULL; 358 struct ceph_inode_xattr **xattrs = NULL;
357 int err = 0; 359 int err = 0;
358 int i; 360 int i;
359 361
360 dout("__build_xattrs() len=%d\n", 362 dout("__build_xattrs() len=%d\n",
361 ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0); 363 ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
362 364
363 if (ci->i_xattrs.index_version >= ci->i_xattrs.version) 365 if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
364 return 0; /* already built */ 366 return 0; /* already built */
365 367
366 __ceph_destroy_xattrs(ci); 368 __ceph_destroy_xattrs(ci);
367 369
368 start: 370 start:
369 /* update the internal xattr rb tree */ 371 /* update the internal xattr rb tree */
370 if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) { 372 if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) {
371 p = ci->i_xattrs.blob->vec.iov_base; 373 p = ci->i_xattrs.blob->vec.iov_base;
372 end = p + ci->i_xattrs.blob->vec.iov_len; 374 end = p + ci->i_xattrs.blob->vec.iov_len;
373 ceph_decode_32_safe(&p, end, numattr, bad); 375 ceph_decode_32_safe(&p, end, numattr, bad);
374 xattr_version = ci->i_xattrs.version; 376 xattr_version = ci->i_xattrs.version;
375 spin_unlock(&ci->i_ceph_lock); 377 spin_unlock(&ci->i_ceph_lock);
376 378
377 xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *), 379 xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *),
378 GFP_NOFS); 380 GFP_NOFS);
379 err = -ENOMEM; 381 err = -ENOMEM;
380 if (!xattrs) 382 if (!xattrs)
381 goto bad_lock; 383 goto bad_lock;
382 memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *)); 384 memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *));
383 for (i = 0; i < numattr; i++) { 385 for (i = 0; i < numattr; i++) {
384 xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr), 386 xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
385 GFP_NOFS); 387 GFP_NOFS);
386 if (!xattrs[i]) 388 if (!xattrs[i])
387 goto bad_lock; 389 goto bad_lock;
388 } 390 }
389 391
390 spin_lock(&ci->i_ceph_lock); 392 spin_lock(&ci->i_ceph_lock);
391 if (ci->i_xattrs.version != xattr_version) { 393 if (ci->i_xattrs.version != xattr_version) {
392 /* lost a race, retry */ 394 /* lost a race, retry */
393 for (i = 0; i < numattr; i++) 395 for (i = 0; i < numattr; i++)
394 kfree(xattrs[i]); 396 kfree(xattrs[i]);
395 kfree(xattrs); 397 kfree(xattrs);
396 goto start; 398 goto start;
397 } 399 }
398 err = -EIO; 400 err = -EIO;
399 while (numattr--) { 401 while (numattr--) {
400 ceph_decode_32_safe(&p, end, len, bad); 402 ceph_decode_32_safe(&p, end, len, bad);
401 namelen = len; 403 namelen = len;
402 name = p; 404 name = p;
403 p += len; 405 p += len;
404 ceph_decode_32_safe(&p, end, len, bad); 406 ceph_decode_32_safe(&p, end, len, bad);
405 val = p; 407 val = p;
406 p += len; 408 p += len;
407 409
408 err = __set_xattr(ci, name, namelen, val, len, 410 err = __set_xattr(ci, name, namelen, val, len,
409 0, 0, 0, &xattrs[numattr]); 411 0, 0, 0, &xattrs[numattr]);
410 412
411 if (err < 0) 413 if (err < 0)
412 goto bad; 414 goto bad;
413 } 415 }
414 kfree(xattrs); 416 kfree(xattrs);
415 } 417 }
416 ci->i_xattrs.index_version = ci->i_xattrs.version; 418 ci->i_xattrs.index_version = ci->i_xattrs.version;
417 ci->i_xattrs.dirty = false; 419 ci->i_xattrs.dirty = false;
418 420
419 return err; 421 return err;
420 bad_lock: 422 bad_lock:
421 spin_lock(&ci->i_ceph_lock); 423 spin_lock(&ci->i_ceph_lock);
422 bad: 424 bad:
423 if (xattrs) { 425 if (xattrs) {
424 for (i = 0; i < numattr; i++) 426 for (i = 0; i < numattr; i++)
425 kfree(xattrs[i]); 427 kfree(xattrs[i]);
426 kfree(xattrs); 428 kfree(xattrs);
427 } 429 }
428 ci->i_xattrs.names_size = 0; 430 ci->i_xattrs.names_size = 0;
429 return err; 431 return err;
430 } 432 }
431 433
432 static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size, 434 static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
433 int val_size) 435 int val_size)
434 { 436 {
435 /* 437 /*
436 * 4 bytes for the xattr count, plus 4 bytes per xattr name 438 * 4 bytes for the xattr count, plus 4 bytes per xattr name
437 * and 4 bytes per value 439 * and 4 bytes per value
438 */ 440 */
439 int size = 4 + ci->i_xattrs.count*(4 + 4) + 441 int size = 4 + ci->i_xattrs.count*(4 + 4) +
440 ci->i_xattrs.names_size + 442 ci->i_xattrs.names_size +
441 ci->i_xattrs.vals_size; 443 ci->i_xattrs.vals_size;
442 dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n", 444 dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
443 ci->i_xattrs.count, ci->i_xattrs.names_size, 445 ci->i_xattrs.count, ci->i_xattrs.names_size,
444 ci->i_xattrs.vals_size); 446 ci->i_xattrs.vals_size);
445 447
446 if (name_size) 448 if (name_size)
447 size += 4 + 4 + name_size + val_size; 449 size += 4 + 4 + name_size + val_size;
448 450
449 return size; 451 return size;
450 } 452 }
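/*
 * Worked example (illustrative, not part of the diff): with two cached
 * xattrs "user.a"="x" and "user.bc"="yz", count = 2,
 * names_size = 6 + 7 = 13 and vals_size = 1 + 2 = 3, so
 *
 *	size = 4 + 2 * (4 + 4) + 13 + 3 = 36 bytes
 *
 * before any additional name/value being set is accounted for.
 */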
451 453
452 /* 454 /*
453 * If there are dirty xattrs, reencode xattrs into the prealloc_blob 455 * If there are dirty xattrs, reencode xattrs into the prealloc_blob
454 * and swap into place. 456 * and swap into place.
455 */ 457 */
456 void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) 458 void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
457 { 459 {
458 struct rb_node *p; 460 struct rb_node *p;
459 struct ceph_inode_xattr *xattr = NULL; 461 struct ceph_inode_xattr *xattr = NULL;
460 void *dest; 462 void *dest;
461 463
462 dout("__build_xattrs_blob %p\n", &ci->vfs_inode); 464 dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
463 if (ci->i_xattrs.dirty) { 465 if (ci->i_xattrs.dirty) {
464 int need = __get_required_blob_size(ci, 0, 0); 466 int need = __get_required_blob_size(ci, 0, 0);
465 467
466 BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len); 468 BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);
467 469
468 p = rb_first(&ci->i_xattrs.index); 470 p = rb_first(&ci->i_xattrs.index);
469 dest = ci->i_xattrs.prealloc_blob->vec.iov_base; 471 dest = ci->i_xattrs.prealloc_blob->vec.iov_base;
470 472
471 ceph_encode_32(&dest, ci->i_xattrs.count); 473 ceph_encode_32(&dest, ci->i_xattrs.count);
472 while (p) { 474 while (p) {
473 xattr = rb_entry(p, struct ceph_inode_xattr, node); 475 xattr = rb_entry(p, struct ceph_inode_xattr, node);
474 476
475 ceph_encode_32(&dest, xattr->name_len); 477 ceph_encode_32(&dest, xattr->name_len);
476 memcpy(dest, xattr->name, xattr->name_len); 478 memcpy(dest, xattr->name, xattr->name_len);
477 dest += xattr->name_len; 479 dest += xattr->name_len;
478 ceph_encode_32(&dest, xattr->val_len); 480 ceph_encode_32(&dest, xattr->val_len);
479 memcpy(dest, xattr->val, xattr->val_len); 481 memcpy(dest, xattr->val, xattr->val_len);
480 dest += xattr->val_len; 482 dest += xattr->val_len;
481 483
482 p = rb_next(p); 484 p = rb_next(p);
483 } 485 }
484 486
485 /* adjust buffer len; it may be larger than we need */ 487 /* adjust buffer len; it may be larger than we need */
486 ci->i_xattrs.prealloc_blob->vec.iov_len = 488 ci->i_xattrs.prealloc_blob->vec.iov_len =
487 dest - ci->i_xattrs.prealloc_blob->vec.iov_base; 489 dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
488 490
489 if (ci->i_xattrs.blob) 491 if (ci->i_xattrs.blob)
490 ceph_buffer_put(ci->i_xattrs.blob); 492 ceph_buffer_put(ci->i_xattrs.blob);
491 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob; 493 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
492 ci->i_xattrs.prealloc_blob = NULL; 494 ci->i_xattrs.prealloc_blob = NULL;
493 ci->i_xattrs.dirty = false; 495 ci->i_xattrs.dirty = false;
494 ci->i_xattrs.version++; 496 ci->i_xattrs.version++;
495 } 497 }
496 } 498 }
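/*
 * Aside (illustrative sketch, not in the diff): the blob built above is
 * simply a 32-bit count followed by (32-bit length, bytes) pairs for
 * each name and value. A minimal decoder under those assumptions,
 * taking a little-endian host for brevity:
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t get_u32(const unsigned char **p)
{
	uint32_t v;

	memcpy(&v, *p, 4);
	*p += 4;
	return v;
}

static void dump_xattr_blob(const unsigned char *p, size_t blob_len)
{
	const unsigned char *end = p + blob_len;
	uint32_t count = get_u32(&p);

	while (count-- && p < end) {
		uint32_t name_len = get_u32(&p);
		const unsigned char *name = p;

		p += name_len;
		uint32_t val_len = get_u32(&p);

		printf("%.*s=%.*s\n", (int)name_len, (const char *)name,
		       (int)val_len, (const char *)p);
		p += val_len;
	}
}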
497 499
498 ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, 500 ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
499 size_t size) 501 size_t size)
500 { 502 {
501 struct inode *inode = dentry->d_inode; 503 struct inode *inode = dentry->d_inode;
502 struct ceph_inode_info *ci = ceph_inode(inode); 504 struct ceph_inode_info *ci = ceph_inode(inode);
503 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); 505 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
504 int err; 506 int err;
505 struct ceph_inode_xattr *xattr; 507 struct ceph_inode_xattr *xattr;
506 struct ceph_vxattr_cb *vxattr = NULL; 508 struct ceph_vxattr_cb *vxattr = NULL;
507 509
508 if (!ceph_is_valid_xattr(name)) 510 if (!ceph_is_valid_xattr(name))
509 return -ENODATA; 511 return -ENODATA;
510 512
511 /* let's see if a virtual xattr was requested */ 513 /* let's see if a virtual xattr was requested */
512 if (vxattrs) 514 if (vxattrs)
513 vxattr = ceph_match_vxattr(vxattrs, name); 515 vxattr = ceph_match_vxattr(vxattrs, name);
514 516
515 spin_lock(&ci->i_ceph_lock); 517 spin_lock(&ci->i_ceph_lock);
516 dout("getxattr %p ver=%lld index_ver=%lld\n", inode, 518 dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
517 ci->i_xattrs.version, ci->i_xattrs.index_version); 519 ci->i_xattrs.version, ci->i_xattrs.index_version);
518 520
519 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && 521 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
520 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { 522 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
521 goto get_xattr; 523 goto get_xattr;
522 } else { 524 } else {
523 spin_unlock(&ci->i_ceph_lock); 525 spin_unlock(&ci->i_ceph_lock);
524 /* get xattrs from mds (if we don't already have them) */ 526 /* get xattrs from mds (if we don't already have them) */
525 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); 527 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
526 if (err) 528 if (err)
527 return err; 529 return err;
528 } 530 }
529 531
530 spin_lock(&ci->i_ceph_lock); 532 spin_lock(&ci->i_ceph_lock);
531 533
532 if (vxattr && vxattr->readonly) { 534 if (vxattr && vxattr->readonly) {
533 err = vxattr->getxattr_cb(ci, value, size); 535 err = vxattr->getxattr_cb(ci, value, size);
534 goto out; 536 goto out;
535 } 537 }
536 538
537 err = __build_xattrs(inode); 539 err = __build_xattrs(inode);
538 if (err < 0) 540 if (err < 0)
539 goto out; 541 goto out;
540 542
541 get_xattr: 543 get_xattr:
542 err = -ENODATA; /* == ENOATTR */ 544 err = -ENODATA; /* == ENOATTR */
543 xattr = __get_xattr(ci, name); 545 xattr = __get_xattr(ci, name);
544 if (!xattr) { 546 if (!xattr) {
545 if (vxattr) 547 if (vxattr)
546 err = vxattr->getxattr_cb(ci, value, size); 548 err = vxattr->getxattr_cb(ci, value, size);
547 goto out; 549 goto out;
548 } 550 }
549 551
550 err = -ERANGE; 552 err = -ERANGE;
551 if (size && size < xattr->val_len) 553 if (size && size < xattr->val_len)
552 goto out; 554 goto out;
553 555
554 err = xattr->val_len; 556 err = xattr->val_len;
555 if (size == 0) 557 if (size == 0)
556 goto out; 558 goto out;
557 559
558 memcpy(value, xattr->val, xattr->val_len); 560 memcpy(value, xattr->val, xattr->val_len);
559 561
560 out: 562 out:
561 spin_unlock(&ci->i_ceph_lock); 563 spin_unlock(&ci->i_ceph_lock);
562 return err; 564 return err;
563 } 565 }
564 566
565 ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) 567 ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
566 { 568 {
567 struct inode *inode = dentry->d_inode; 569 struct inode *inode = dentry->d_inode;
568 struct ceph_inode_info *ci = ceph_inode(inode); 570 struct ceph_inode_info *ci = ceph_inode(inode);
569 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); 571 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
570 u32 vir_namelen = 0; 572 u32 vir_namelen = 0;
571 u32 namelen; 573 u32 namelen;
572 int err; 574 int err;
573 u32 len; 575 u32 len;
574 int i; 576 int i;
575 577
576 spin_lock(&ci->i_ceph_lock); 578 spin_lock(&ci->i_ceph_lock);
577 dout("listxattr %p ver=%lld index_ver=%lld\n", inode, 579 dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
578 ci->i_xattrs.version, ci->i_xattrs.index_version); 580 ci->i_xattrs.version, ci->i_xattrs.index_version);
579 581
580 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && 582 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
581 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { 583 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
582 goto list_xattr; 584 goto list_xattr;
583 } else { 585 } else {
584 spin_unlock(&ci->i_ceph_lock); 586 spin_unlock(&ci->i_ceph_lock);
585 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); 587 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
586 if (err) 588 if (err)
587 return err; 589 return err;
588 } 590 }
589 591
590 spin_lock(&ci->i_ceph_lock); 592 spin_lock(&ci->i_ceph_lock);
591 593
592 err = __build_xattrs(inode); 594 err = __build_xattrs(inode);
593 if (err < 0) 595 if (err < 0)
594 goto out; 596 goto out;
595 597
596 list_xattr: 598 list_xattr:
597 vir_namelen = 0; 599 vir_namelen = 0;
598 /* include virtual dir xattrs */ 600 /* include virtual dir xattrs */
599 if (vxattrs) 601 if (vxattrs)
600 for (i = 0; vxattrs[i].name; i++) 602 for (i = 0; vxattrs[i].name; i++)
601 vir_namelen += strlen(vxattrs[i].name) + 1; 603 vir_namelen += strlen(vxattrs[i].name) + 1;
602 /* add 1 byte per name for its null termination */ 604 /* add 1 byte per name for its null termination */
603 namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count; 605 namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
604 err = -ERANGE; 606 err = -ERANGE;
605 if (size && namelen > size) 607 if (size && namelen > size)
606 goto out; 608 goto out;
607 609
608 err = namelen; 610 err = namelen;
609 if (size == 0) 611 if (size == 0)
610 goto out; 612 goto out;
611 613
612 names = __copy_xattr_names(ci, names); 614 names = __copy_xattr_names(ci, names);
613 615
614 /* virtual xattr names, too */ 616 /* virtual xattr names, too */
615 if (vxattrs) 617 if (vxattrs)
616 for (i = 0; vxattrs[i].name; i++) { 618 for (i = 0; vxattrs[i].name; i++) {
617 len = sprintf(names, "%s", vxattrs[i].name); 619 len = sprintf(names, "%s", vxattrs[i].name);
618 names += len + 1; 620 names += len + 1;
619 } 621 }
620 622
621 out: 623 out:
622 spin_unlock(&ci->i_ceph_lock); 624 spin_unlock(&ci->i_ceph_lock);
623 return err; 625 return err;
624 } 626 }
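/*
 * Aside (illustrative, not in the diff): as in other filesystems, the
 * size == 0 path above lets callers probe the required buffer length
 * first. A typical userspace pattern (the path is a placeholder):
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

static void list_names(const char *path)
{
	ssize_t len = listxattr(path, NULL, 0);	/* probe length only */

	if (len <= 0)
		return;

	char *names = malloc(len);
	if (!names)
		return;

	len = listxattr(path, names, len);
	/* names is a sequence of NUL-terminated strings */
	for (ssize_t off = 0; off < len; off += strlen(names + off) + 1)
		puts(names + off);

	free(names);
}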
625 627
626 static int ceph_sync_setxattr(struct dentry *dentry, const char *name, 628 static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
627 const char *value, size_t size, int flags) 629 const char *value, size_t size, int flags)
628 { 630 {
629 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 631 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
630 struct inode *inode = dentry->d_inode; 632 struct inode *inode = dentry->d_inode;
631 struct ceph_inode_info *ci = ceph_inode(inode); 633 struct ceph_inode_info *ci = ceph_inode(inode);
632 struct inode *parent_inode; 634 struct inode *parent_inode;
633 struct ceph_mds_request *req; 635 struct ceph_mds_request *req;
634 struct ceph_mds_client *mdsc = fsc->mdsc; 636 struct ceph_mds_client *mdsc = fsc->mdsc;
635 int err; 637 int err;
636 int i, nr_pages; 638 int i, nr_pages;
637 struct page **pages = NULL; 639 struct page **pages = NULL;
638 void *kaddr; 640 void *kaddr;
639 641
640 /* copy value into some pages */ 642 /* copy value into some pages */
641 nr_pages = calc_pages_for(0, size); 643 nr_pages = calc_pages_for(0, size);
642 if (nr_pages) { 644 if (nr_pages) {
643 pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS); 645 pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS);
644 if (!pages) 646 if (!pages)
645 return -ENOMEM; 647 return -ENOMEM;
646 err = -ENOMEM; 648 err = -ENOMEM;
647 for (i = 0; i < nr_pages; i++) { 649 for (i = 0; i < nr_pages; i++) {
648 pages[i] = __page_cache_alloc(GFP_NOFS); 650 pages[i] = __page_cache_alloc(GFP_NOFS);
649 if (!pages[i]) { 651 if (!pages[i]) {
650 nr_pages = i; 652 nr_pages = i;
651 goto out; 653 goto out;
652 } 654 }
653 kaddr = kmap(pages[i]); 655 kaddr = kmap(pages[i]);
654 memcpy(kaddr, value + i*PAGE_CACHE_SIZE, 656 memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
655 min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE)); 657 min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
656 } 658 }
657 } 659 }
658 660
659 dout("setxattr value=%.*s\n", (int)size, value); 661 dout("setxattr value=%.*s\n", (int)size, value);
660 662
661 /* do request */ 663 /* do request */
662 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, 664 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
663 USE_AUTH_MDS); 665 USE_AUTH_MDS);
664 if (IS_ERR(req)) { 666 if (IS_ERR(req)) {
665 err = PTR_ERR(req); 667 err = PTR_ERR(req);
666 goto out; 668 goto out;
667 } 669 }
668 req->r_inode = inode; 670 req->r_inode = inode;
669 ihold(inode); 671 ihold(inode);
670 req->r_inode_drop = CEPH_CAP_XATTR_SHARED; 672 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
671 req->r_num_caps = 1; 673 req->r_num_caps = 1;
672 req->r_args.setxattr.flags = cpu_to_le32(flags); 674 req->r_args.setxattr.flags = cpu_to_le32(flags);
673 req->r_path2 = kstrdup(name, GFP_NOFS); 675 req->r_path2 = kstrdup(name, GFP_NOFS);
674 676
675 req->r_pages = pages; 677 req->r_pages = pages;
676 req->r_num_pages = nr_pages; 678 req->r_num_pages = nr_pages;
677 req->r_data_len = size; 679 req->r_data_len = size;
678 680
679 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); 681 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
680 parent_inode = ceph_get_dentry_parent_inode(dentry); 682 parent_inode = ceph_get_dentry_parent_inode(dentry);
681 err = ceph_mdsc_do_request(mdsc, parent_inode, req); 683 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
682 iput(parent_inode); 684 iput(parent_inode);
683 ceph_mdsc_put_request(req); 685 ceph_mdsc_put_request(req);
684 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); 686 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
685 687
686 out: 688 out:
687 if (pages) { 689 if (pages) {
688 for (i = 0; i < nr_pages; i++) 690 for (i = 0; i < nr_pages; i++)
689 __free_page(pages[i]); 691 __free_page(pages[i]);
690 kfree(pages); 692 kfree(pages);
691 } 693 }
692 return err; 694 return err;
693 } 695 }
694 696
695 int ceph_setxattr(struct dentry *dentry, const char *name, 697 int ceph_setxattr(struct dentry *dentry, const char *name,
696 const void *value, size_t size, int flags) 698 const void *value, size_t size, int flags)
697 { 699 {
698 struct inode *inode = dentry->d_inode; 700 struct inode *inode = dentry->d_inode;
699 struct ceph_inode_info *ci = ceph_inode(inode); 701 struct ceph_inode_info *ci = ceph_inode(inode);
700 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); 702 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
701 int err; 703 int err;
702 int name_len = strlen(name); 704 int name_len = strlen(name);
703 int val_len = size; 705 int val_len = size;
704 char *newname = NULL; 706 char *newname = NULL;
705 char *newval = NULL; 707 char *newval = NULL;
706 struct ceph_inode_xattr *xattr = NULL; 708 struct ceph_inode_xattr *xattr = NULL;
707 int issued; 709 int issued;
708 int required_blob_size; 710 int required_blob_size;
709 int dirty; 711 int dirty;
710 712
711 if (ceph_snap(inode) != CEPH_NOSNAP) 713 if (ceph_snap(inode) != CEPH_NOSNAP)
712 return -EROFS; 714 return -EROFS;
713 715
714 if (!ceph_is_valid_xattr(name)) 716 if (!ceph_is_valid_xattr(name))
715 return -EOPNOTSUPP; 717 return -EOPNOTSUPP;
716 718
717 if (vxattrs) { 719 if (vxattrs) {
718 struct ceph_vxattr_cb *vxattr = 720 struct ceph_vxattr_cb *vxattr =
719 ceph_match_vxattr(vxattrs, name); 721 ceph_match_vxattr(vxattrs, name);
720 if (vxattr && vxattr->readonly) 722 if (vxattr && vxattr->readonly)
721 return -EOPNOTSUPP; 723 return -EOPNOTSUPP;
722 } 724 }
723 725
724 /* preallocate memory for xattr name, value, index node */ 726 /* preallocate memory for xattr name, value, index node */
725 err = -ENOMEM; 727 err = -ENOMEM;
726 newname = kmemdup(name, name_len + 1, GFP_NOFS); 728 newname = kmemdup(name, name_len + 1, GFP_NOFS);
727 if (!newname) 729 if (!newname)
728 goto out; 730 goto out;
729 731
730 if (val_len) { 732 if (val_len) {
731 newval = kmalloc(val_len + 1, GFP_NOFS); 733 newval = kmalloc(val_len + 1, GFP_NOFS);
732 if (!newval) 734 if (!newval)
733 goto out; 735 goto out;
734 memcpy(newval, value, val_len); 736 memcpy(newval, value, val_len);
735 newval[val_len] = '\0'; 737 newval[val_len] = '\0';
736 } 738 }
737 739
738 xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS); 740 xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
739 if (!xattr) 741 if (!xattr)
740 goto out; 742 goto out;
741 743
742 spin_lock(&ci->i_ceph_lock); 744 spin_lock(&ci->i_ceph_lock);
743 retry: 745 retry:
744 issued = __ceph_caps_issued(ci, NULL); 746 issued = __ceph_caps_issued(ci, NULL);
745 if (!(issued & CEPH_CAP_XATTR_EXCL)) 747 if (!(issued & CEPH_CAP_XATTR_EXCL))
746 goto do_sync; 748 goto do_sync;
747 __build_xattrs(inode); 749 __build_xattrs(inode);
748 750
749 required_blob_size = __get_required_blob_size(ci, name_len, val_len); 751 required_blob_size = __get_required_blob_size(ci, name_len, val_len);
750 752
751 if (!ci->i_xattrs.prealloc_blob || 753 if (!ci->i_xattrs.prealloc_blob ||
752 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { 754 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
753 struct ceph_buffer *blob = NULL; 755 struct ceph_buffer *blob = NULL;
754 756
755 spin_unlock(&ci->i_ceph_lock); 757 spin_unlock(&ci->i_ceph_lock);
756 dout(" preaallocating new blob size=%d\n", required_blob_size); 758 dout(" preaallocating new blob size=%d\n", required_blob_size);
757 blob = ceph_buffer_new(required_blob_size, GFP_NOFS); 759 blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
758 if (!blob) 760 if (!blob)
759 goto out; 761 goto out;
760 spin_lock(&ci->i_ceph_lock); 762 spin_lock(&ci->i_ceph_lock);
761 if (ci->i_xattrs.prealloc_blob) 763 if (ci->i_xattrs.prealloc_blob)
762 ceph_buffer_put(ci->i_xattrs.prealloc_blob); 764 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
763 ci->i_xattrs.prealloc_blob = blob; 765 ci->i_xattrs.prealloc_blob = blob;
764 goto retry; 766 goto retry;
765 } 767 }
766 768
767 dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); 769 dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
768 err = __set_xattr(ci, newname, name_len, newval, 770 err = __set_xattr(ci, newname, name_len, newval,
769 val_len, 1, 1, 1, &xattr); 771 val_len, 1, 1, 1, &xattr);
770 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); 772 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
771 ci->i_xattrs.dirty = true; 773 ci->i_xattrs.dirty = true;
772 inode->i_ctime = CURRENT_TIME; 774 inode->i_ctime = CURRENT_TIME;
773 spin_unlock(&ci->i_ceph_lock); 775 spin_unlock(&ci->i_ceph_lock);
774 if (dirty) 776 if (dirty)
775 __mark_inode_dirty(inode, dirty); 777 __mark_inode_dirty(inode, dirty);
776 return err; 778 return err;
777 779
778 do_sync: 780 do_sync:
779 spin_unlock(&ci->i_ceph_lock); 781 spin_unlock(&ci->i_ceph_lock);
780 err = ceph_sync_setxattr(dentry, name, value, size, flags); 782 err = ceph_sync_setxattr(dentry, name, value, size, flags);
781 out: 783 out:
782 kfree(newname); 784 kfree(newname);
783 kfree(newval); 785 kfree(newval);
784 kfree(xattr); 786 kfree(xattr);
785 return err; 787 return err;
786 } 788 }
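/*
 * Aside (illustrative, not in the diff): the flags forwarded to the MDS
 * by ceph_sync_setxattr() are the standard setxattr(2) flags. A sketch
 * with a placeholder path:
 */
#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
	/* XATTR_CREATE: fail with EEXIST if the attribute already exists */
	if (setxattr("/mnt/ceph/file", "user.comment", "hello", 5,
		     XATTR_CREATE) < 0)
		perror("setxattr");
	return 0;
}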
787 789
788 static int ceph_send_removexattr(struct dentry *dentry, const char *name) 790 static int ceph_send_removexattr(struct dentry *dentry, const char *name)
789 { 791 {
790 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 792 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
791 struct ceph_mds_client *mdsc = fsc->mdsc; 793 struct ceph_mds_client *mdsc = fsc->mdsc;
792 struct inode *inode = dentry->d_inode; 794 struct inode *inode = dentry->d_inode;
793 struct inode *parent_inode; 795 struct inode *parent_inode;
794 struct ceph_mds_request *req; 796 struct ceph_mds_request *req;
795 int err; 797 int err;
796 798
797 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR, 799 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR,
798 USE_AUTH_MDS); 800 USE_AUTH_MDS);
799 if (IS_ERR(req)) 801 if (IS_ERR(req))
800 return PTR_ERR(req); 802 return PTR_ERR(req);
801 req->r_inode = inode; 803 req->r_inode = inode;
802 ihold(inode); 804 ihold(inode);
803 req->r_inode_drop = CEPH_CAP_XATTR_SHARED; 805 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
804 req->r_num_caps = 1; 806 req->r_num_caps = 1;
805 req->r_path2 = kstrdup(name, GFP_NOFS); 807 req->r_path2 = kstrdup(name, GFP_NOFS);
806 808
807 parent_inode = ceph_get_dentry_parent_inode(dentry); 809 parent_inode = ceph_get_dentry_parent_inode(dentry);
808 err = ceph_mdsc_do_request(mdsc, parent_inode, req); 810 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
809 iput(parent_inode); 811 iput(parent_inode);
810 ceph_mdsc_put_request(req); 812 ceph_mdsc_put_request(req);
811 return err; 813 return err;
812 } 814 }
813 815
814 int ceph_removexattr(struct dentry *dentry, const char *name) 816 int ceph_removexattr(struct dentry *dentry, const char *name)
815 { 817 {
816 struct inode *inode = dentry->d_inode; 818 struct inode *inode = dentry->d_inode;
817 struct ceph_inode_info *ci = ceph_inode(inode); 819 struct ceph_inode_info *ci = ceph_inode(inode);
818 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); 820 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
819 int issued; 821 int issued;
820 int err; 822 int err;
821 int required_blob_size; 823 int required_blob_size;
822 int dirty; 824 int dirty;
823 825
824 if (ceph_snap(inode) != CEPH_NOSNAP) 826 if (ceph_snap(inode) != CEPH_NOSNAP)
825 return -EROFS; 827 return -EROFS;
826 828
827 if (!ceph_is_valid_xattr(name)) 829 if (!ceph_is_valid_xattr(name))
828 return -EOPNOTSUPP; 830 return -EOPNOTSUPP;
829 831
830 if (vxattrs) { 832 if (vxattrs) {
831 struct ceph_vxattr_cb *vxattr = 833 struct ceph_vxattr_cb *vxattr =
832 ceph_match_vxattr(vxattrs, name); 834 ceph_match_vxattr(vxattrs, name);
833 if (vxattr && vxattr->readonly) 835 if (vxattr && vxattr->readonly)
834 return -EOPNOTSUPP; 836 return -EOPNOTSUPP;
835 } 837 }
836 838
837 err = -ENOMEM; 839 err = -ENOMEM;
838 spin_lock(&ci->i_ceph_lock); 840 spin_lock(&ci->i_ceph_lock);
839 __build_xattrs(inode); 841 __build_xattrs(inode);
840 retry: 842 retry:
841 issued = __ceph_caps_issued(ci, NULL); 843 issued = __ceph_caps_issued(ci, NULL);
842 dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); 844 dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
843 845
844 if (!(issued & CEPH_CAP_XATTR_EXCL)) 846 if (!(issued & CEPH_CAP_XATTR_EXCL))
845 goto do_sync; 847 goto do_sync;
846 848
847 required_blob_size = __get_required_blob_size(ci, 0, 0); 849 required_blob_size = __get_required_blob_size(ci, 0, 0);
848 850
849 if (!ci->i_xattrs.prealloc_blob || 851 if (!ci->i_xattrs.prealloc_blob ||
850 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { 852 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
851 struct ceph_buffer *blob; 853 struct ceph_buffer *blob;
852 854
853 spin_unlock(&ci->i_ceph_lock); 855 spin_unlock(&ci->i_ceph_lock);
854 dout(" preaallocating new blob size=%d\n", required_blob_size); 856 dout(" preaallocating new blob size=%d\n", required_blob_size);
855 blob = ceph_buffer_new(required_blob_size, GFP_NOFS); 857 blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
856 if (!blob) 858 if (!blob)
857 goto out; 859 goto out;
858 spin_lock(&ci->i_ceph_lock); 860 spin_lock(&ci->i_ceph_lock);
859 if (ci->i_xattrs.prealloc_blob) 861 if (ci->i_xattrs.prealloc_blob)
860 ceph_buffer_put(ci->i_xattrs.prealloc_blob); 862 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
861 ci->i_xattrs.prealloc_blob = blob; 863 ci->i_xattrs.prealloc_blob = blob;
862 goto retry; 864 goto retry;
863 } 865 }
864 866
865 err = __remove_xattr_by_name(ceph_inode(inode), name); 867 err = __remove_xattr_by_name(ceph_inode(inode), name);
866 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); 868 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
867 ci->i_xattrs.dirty = true; 869 ci->i_xattrs.dirty = true;
868 inode->i_ctime = CURRENT_TIME; 870 inode->i_ctime = CURRENT_TIME;
869 871
870 spin_unlock(&ci->i_ceph_lock); 872 spin_unlock(&ci->i_ceph_lock);
871 if (dirty) 873 if (dirty)
872 __mark_inode_dirty(inode, dirty); 874 __mark_inode_dirty(inode, dirty);
873 return err; 875 return err;
874 do_sync: 876 do_sync:
875 spin_unlock(&ci->i_ceph_lock); 877 spin_unlock(&ci->i_ceph_lock);
876 err = ceph_send_removexattr(dentry, name); 878 err = ceph_send_removexattr(dentry, name);
877 out: 879 out:
878 return err; 880 return err;
879 } 881 }
880 882
881 883
net/ceph/ceph_common.c
1 1
2 #include <linux/ceph/ceph_debug.h> 2 #include <linux/ceph/ceph_debug.h>
3 #include <linux/backing-dev.h> 3 #include <linux/backing-dev.h>
4 #include <linux/ctype.h> 4 #include <linux/ctype.h>
5 #include <linux/fs.h> 5 #include <linux/fs.h>
6 #include <linux/inet.h> 6 #include <linux/inet.h>
7 #include <linux/in6.h> 7 #include <linux/in6.h>
8 #include <linux/key.h> 8 #include <linux/key.h>
9 #include <keys/ceph-type.h> 9 #include <keys/ceph-type.h>
10 #include <linux/module.h> 10 #include <linux/module.h>
11 #include <linux/mount.h> 11 #include <linux/mount.h>
12 #include <linux/parser.h> 12 #include <linux/parser.h>
13 #include <linux/sched.h> 13 #include <linux/sched.h>
14 #include <linux/seq_file.h> 14 #include <linux/seq_file.h>
15 #include <linux/slab.h> 15 #include <linux/slab.h>
16 #include <linux/statfs.h> 16 #include <linux/statfs.h>
17 #include <linux/string.h> 17 #include <linux/string.h>
18 18
19 19
20 #include <linux/ceph/libceph.h> 20 #include <linux/ceph/libceph.h>
21 #include <linux/ceph/debugfs.h> 21 #include <linux/ceph/debugfs.h>
22 #include <linux/ceph/decode.h> 22 #include <linux/ceph/decode.h>
23 #include <linux/ceph/mon_client.h> 23 #include <linux/ceph/mon_client.h>
24 #include <linux/ceph/auth.h> 24 #include <linux/ceph/auth.h>
25 #include "crypto.h" 25 #include "crypto.h"
26 26
27 27
28 28
29 /* 29 /*
30 * find filename portion of a path (/foo/bar/baz -> baz) 30 * find filename portion of a path (/foo/bar/baz -> baz)
31 */ 31 */
32 const char *ceph_file_part(const char *s, int len) 32 const char *ceph_file_part(const char *s, int len)
33 { 33 {
34 const char *e = s + len; 34 const char *e = s + len;
35 35
36 while (e != s && *(e-1) != '/') 36 while (e != s && *(e-1) != '/')
37 e--; 37 e--;
38 return e; 38 return e;
39 } 39 }
40 EXPORT_SYMBOL(ceph_file_part); 40 EXPORT_SYMBOL(ceph_file_part);
41 41
42 const char *ceph_msg_type_name(int type) 42 const char *ceph_msg_type_name(int type)
43 { 43 {
44 switch (type) { 44 switch (type) {
45 case CEPH_MSG_SHUTDOWN: return "shutdown"; 45 case CEPH_MSG_SHUTDOWN: return "shutdown";
46 case CEPH_MSG_PING: return "ping"; 46 case CEPH_MSG_PING: return "ping";
47 case CEPH_MSG_AUTH: return "auth"; 47 case CEPH_MSG_AUTH: return "auth";
48 case CEPH_MSG_AUTH_REPLY: return "auth_reply"; 48 case CEPH_MSG_AUTH_REPLY: return "auth_reply";
49 case CEPH_MSG_MON_MAP: return "mon_map"; 49 case CEPH_MSG_MON_MAP: return "mon_map";
50 case CEPH_MSG_MON_GET_MAP: return "mon_get_map"; 50 case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
51 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe"; 51 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
52 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack"; 52 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
53 case CEPH_MSG_STATFS: return "statfs"; 53 case CEPH_MSG_STATFS: return "statfs";
54 case CEPH_MSG_STATFS_REPLY: return "statfs_reply"; 54 case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
55 case CEPH_MSG_MDS_MAP: return "mds_map"; 55 case CEPH_MSG_MDS_MAP: return "mds_map";
56 case CEPH_MSG_CLIENT_SESSION: return "client_session"; 56 case CEPH_MSG_CLIENT_SESSION: return "client_session";
57 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect"; 57 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
58 case CEPH_MSG_CLIENT_REQUEST: return "client_request"; 58 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
59 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward"; 59 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
60 case CEPH_MSG_CLIENT_REPLY: return "client_reply"; 60 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
61 case CEPH_MSG_CLIENT_CAPS: return "client_caps"; 61 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
62 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release"; 62 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
63 case CEPH_MSG_CLIENT_SNAP: return "client_snap"; 63 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
64 case CEPH_MSG_CLIENT_LEASE: return "client_lease"; 64 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
65 case CEPH_MSG_OSD_MAP: return "osd_map"; 65 case CEPH_MSG_OSD_MAP: return "osd_map";
66 case CEPH_MSG_OSD_OP: return "osd_op"; 66 case CEPH_MSG_OSD_OP: return "osd_op";
67 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; 67 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
68 case CEPH_MSG_WATCH_NOTIFY: return "watch_notify"; 68 case CEPH_MSG_WATCH_NOTIFY: return "watch_notify";
69 default: return "unknown"; 69 default: return "unknown";
70 } 70 }
71 } 71 }
72 EXPORT_SYMBOL(ceph_msg_type_name); 72 EXPORT_SYMBOL(ceph_msg_type_name);
73 73
74 /* 74 /*
75 * Initially learn our fsid, or verify an fsid matches. 75 * Initially learn our fsid, or verify an fsid matches.
76 */ 76 */
77 int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) 77 int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
78 { 78 {
79 if (client->have_fsid) { 79 if (client->have_fsid) {
80 if (ceph_fsid_compare(&client->fsid, fsid)) { 80 if (ceph_fsid_compare(&client->fsid, fsid)) {
81 pr_err("bad fsid, had %pU got %pU", 81 pr_err("bad fsid, had %pU got %pU",
82 &client->fsid, fsid); 82 &client->fsid, fsid);
83 return -1; 83 return -1;
84 } 84 }
85 } else { 85 } else {
86 pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid); 86 pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid);
87 memcpy(&client->fsid, fsid, sizeof(*fsid)); 87 memcpy(&client->fsid, fsid, sizeof(*fsid));
88 ceph_debugfs_client_init(client);
89 client->have_fsid = true;
90 } 88 }
91 return 0; 89 return 0;
92 } 90 }
93 EXPORT_SYMBOL(ceph_check_fsid); 91 EXPORT_SYMBOL(ceph_check_fsid);
94 92
95 static int strcmp_null(const char *s1, const char *s2) 93 static int strcmp_null(const char *s1, const char *s2)
96 { 94 {
97 if (!s1 && !s2) 95 if (!s1 && !s2)
98 return 0; 96 return 0;
99 if (s1 && !s2) 97 if (s1 && !s2)
100 return -1; 98 return -1;
101 if (!s1 && s2) 99 if (!s1 && s2)
102 return 1; 100 return 1;
103 return strcmp(s1, s2); 101 return strcmp(s1, s2);
104 } 102 }
105 103
106 int ceph_compare_options(struct ceph_options *new_opt, 104 int ceph_compare_options(struct ceph_options *new_opt,
107 struct ceph_client *client) 105 struct ceph_client *client)
108 { 106 {
109 struct ceph_options *opt1 = new_opt; 107 struct ceph_options *opt1 = new_opt;
110 struct ceph_options *opt2 = client->options; 108 struct ceph_options *opt2 = client->options;
111 int ofs = offsetof(struct ceph_options, mon_addr); 109 int ofs = offsetof(struct ceph_options, mon_addr);
112 int i; 110 int i;
113 int ret; 111 int ret;
114 112
115 ret = memcmp(opt1, opt2, ofs); 113 ret = memcmp(opt1, opt2, ofs);
116 if (ret) 114 if (ret)
117 return ret; 115 return ret;
118 116
119 ret = strcmp_null(opt1->name, opt2->name); 117 ret = strcmp_null(opt1->name, opt2->name);
120 if (ret) 118 if (ret)
121 return ret; 119 return ret;
122 120
123 if (opt1->key && !opt2->key) 121 if (opt1->key && !opt2->key)
124 return -1; 122 return -1;
125 if (!opt1->key && opt2->key) 123 if (!opt1->key && opt2->key)
126 return 1; 124 return 1;
127 if (opt1->key && opt2->key) { 125 if (opt1->key && opt2->key) {
128 if (opt1->key->type != opt2->key->type) 126 if (opt1->key->type != opt2->key->type)
129 return -1; 127 return -1;
130 if (opt1->key->created.tv_sec != opt2->key->created.tv_sec) 128 if (opt1->key->created.tv_sec != opt2->key->created.tv_sec)
131 return -1; 129 return -1;
132 if (opt1->key->created.tv_nsec != opt2->key->created.tv_nsec) 130 if (opt1->key->created.tv_nsec != opt2->key->created.tv_nsec)
133 return -1; 131 return -1;
134 if (opt1->key->len != opt2->key->len) 132 if (opt1->key->len != opt2->key->len)
135 return -1; 133 return -1;
136 if (opt1->key->key && !opt2->key->key) 134 if (opt1->key->key && !opt2->key->key)
137 return -1; 135 return -1;
138 if (!opt1->key->key && opt2->key->key) 136 if (!opt1->key->key && opt2->key->key)
139 return 1; 137 return 1;
140 if (opt1->key->key && opt2->key->key) { 138 if (opt1->key->key && opt2->key->key) {
141 ret = memcmp(opt1->key->key, opt2->key->key, opt1->key->len); 139 ret = memcmp(opt1->key->key, opt2->key->key, opt1->key->len);
142 if (ret) 140 if (ret)
143 return ret; 141 return ret;
144 } 142 }
145 } 143 }
146 144
147 /* any matching mon ip implies a match */ 145 /* any matching mon ip implies a match */
148 for (i = 0; i < opt1->num_mon; i++) { 146 for (i = 0; i < opt1->num_mon; i++) {
149 if (ceph_monmap_contains(client->monc.monmap, 147 if (ceph_monmap_contains(client->monc.monmap,
150 &opt1->mon_addr[i])) 148 &opt1->mon_addr[i]))
151 return 0; 149 return 0;
152 } 150 }
153 return -1; 151 return -1;
154 } 152 }
155 EXPORT_SYMBOL(ceph_compare_options); 153 EXPORT_SYMBOL(ceph_compare_options);
156 154
157 155
158 static int parse_fsid(const char *str, struct ceph_fsid *fsid) 156 static int parse_fsid(const char *str, struct ceph_fsid *fsid)
159 { 157 {
160 int i = 0; 158 int i = 0;
161 char tmp[3]; 159 char tmp[3];
162 int err = -EINVAL; 160 int err = -EINVAL;
163 int d; 161 int d;
164 162
165 dout("parse_fsid '%s'\n", str); 163 dout("parse_fsid '%s'\n", str);
166 tmp[2] = 0; 164 tmp[2] = 0;
167 while (*str && i < 16) { 165 while (*str && i < 16) {
168 if (ispunct(*str)) { 166 if (ispunct(*str)) {
169 str++; 167 str++;
170 continue; 168 continue;
171 } 169 }
172 if (!isxdigit(str[0]) || !isxdigit(str[1])) 170 if (!isxdigit(str[0]) || !isxdigit(str[1]))
173 break; 171 break;
174 tmp[0] = str[0]; 172 tmp[0] = str[0];
175 tmp[1] = str[1]; 173 tmp[1] = str[1];
176 if (sscanf(tmp, "%x", &d) < 1) 174 if (sscanf(tmp, "%x", &d) < 1)
177 break; 175 break;
178 fsid->fsid[i] = d & 0xff; 176 fsid->fsid[i] = d & 0xff;
179 i++; 177 i++;
180 str += 2; 178 str += 2;
181 } 179 }
182 180
183 if (i == 16) 181 if (i == 16)
184 err = 0; 182 err = 0;
185 dout("parse_fsid ret %d got fsid %pU", err, fsid); 183 dout("parse_fsid ret %d got fsid %pU", err, fsid);
186 return err; 184 return err;
187 } 185 }
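/*
 * Example (illustrative, not in the diff): punctuation is skipped, so
 * the canonical UUID form "fsid=12345678-89ab-cdef-0123-456789abcdef"
 * and the same 32 hex digits without dashes both decode to the same
 * 16 raw bytes; anything short of 16 parsed bytes returns -EINVAL.
 */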
188 186
189 /* 187 /*
190 * ceph options 188 * ceph options
191 */ 189 */
192 enum { 190 enum {
193 Opt_osdtimeout, 191 Opt_osdtimeout,
194 Opt_osdkeepalivetimeout, 192 Opt_osdkeepalivetimeout,
195 Opt_mount_timeout, 193 Opt_mount_timeout,
196 Opt_osd_idle_ttl, 194 Opt_osd_idle_ttl,
197 Opt_last_int, 195 Opt_last_int,
198 /* int args above */ 196 /* int args above */
199 Opt_fsid, 197 Opt_fsid,
200 Opt_name, 198 Opt_name,
201 Opt_secret, 199 Opt_secret,
202 Opt_key, 200 Opt_key,
203 Opt_ip, 201 Opt_ip,
204 Opt_last_string, 202 Opt_last_string,
205 /* string args above */ 203 /* string args above */
206 Opt_noshare, 204 Opt_noshare,
207 Opt_nocrc, 205 Opt_nocrc,
208 }; 206 };
209 207
210 static match_table_t opt_tokens = { 208 static match_table_t opt_tokens = {
211 {Opt_osdtimeout, "osdtimeout=%d"}, 209 {Opt_osdtimeout, "osdtimeout=%d"},
212 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"}, 210 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
213 {Opt_mount_timeout, "mount_timeout=%d"}, 211 {Opt_mount_timeout, "mount_timeout=%d"},
214 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, 212 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
215 /* int args above */ 213 /* int args above */
216 {Opt_fsid, "fsid=%s"}, 214 {Opt_fsid, "fsid=%s"},
217 {Opt_name, "name=%s"}, 215 {Opt_name, "name=%s"},
218 {Opt_secret, "secret=%s"}, 216 {Opt_secret, "secret=%s"},
219 {Opt_key, "key=%s"}, 217 {Opt_key, "key=%s"},
220 {Opt_ip, "ip=%s"}, 218 {Opt_ip, "ip=%s"},
221 /* string args above */ 219 /* string args above */
222 {Opt_noshare, "noshare"}, 220 {Opt_noshare, "noshare"},
223 {Opt_nocrc, "nocrc"}, 221 {Opt_nocrc, "nocrc"},
224 {-1, NULL} 222 {-1, NULL}
225 }; 223 };
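/*
 * Example (illustrative): given the token table above, an option string
 * such as
 *
 *	"name=admin,osdtimeout=60,noshare"
 *
 * is split on ',' and matched token by token; tokens the table does not
 * recognize fall through to the caller's parse_extra_token() hook.
 */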
226 224
227 void ceph_destroy_options(struct ceph_options *opt) 225 void ceph_destroy_options(struct ceph_options *opt)
228 { 226 {
229 dout("destroy_options %p\n", opt); 227 dout("destroy_options %p\n", opt);
230 kfree(opt->name); 228 kfree(opt->name);
231 if (opt->key) { 229 if (opt->key) {
232 ceph_crypto_key_destroy(opt->key); 230 ceph_crypto_key_destroy(opt->key);
233 kfree(opt->key); 231 kfree(opt->key);
234 } 232 }
235 kfree(opt->mon_addr); 233 kfree(opt->mon_addr);
236 kfree(opt); 234 kfree(opt);
237 } 235 }
238 EXPORT_SYMBOL(ceph_destroy_options); 236 EXPORT_SYMBOL(ceph_destroy_options);
239 237
240 /* get secret from key store */ 238 /* get secret from key store */
241 static int get_secret(struct ceph_crypto_key *dst, const char *name) { 239 static int get_secret(struct ceph_crypto_key *dst, const char *name) {
242 struct key *ukey; 240 struct key *ukey;
243 int key_err; 241 int key_err;
244 int err = 0; 242 int err = 0;
245 struct ceph_crypto_key *ckey; 243 struct ceph_crypto_key *ckey;
246 244
247 ukey = request_key(&key_type_ceph, name, NULL); 245 ukey = request_key(&key_type_ceph, name, NULL);
248 if (!ukey || IS_ERR(ukey)) { 246 if (!ukey || IS_ERR(ukey)) {
249 /* request_key errors don't map nicely to mount(2) 247 /* request_key errors don't map nicely to mount(2)
250 errors; don't even try, but still printk */ 248 errors; don't even try, but still printk */
251 key_err = PTR_ERR(ukey); 249 key_err = PTR_ERR(ukey);
252 switch (key_err) { 250 switch (key_err) {
253 case -ENOKEY: 251 case -ENOKEY:
254 pr_warning("ceph: Mount failed due to key not found: %s\n", name); 252 pr_warning("ceph: Mount failed due to key not found: %s\n", name);
255 break; 253 break;
256 case -EKEYEXPIRED: 254 case -EKEYEXPIRED:
257 pr_warning("ceph: Mount failed due to expired key: %s\n", name); 255 pr_warning("ceph: Mount failed due to expired key: %s\n", name);
258 break; 256 break;
259 case -EKEYREVOKED: 257 case -EKEYREVOKED:
260 pr_warning("ceph: Mount failed due to revoked key: %s\n", name); 258 pr_warning("ceph: Mount failed due to revoked key: %s\n", name);
261 break; 259 break;
262 default: 260 default:
263 pr_warning("ceph: Mount failed due to unknown key error" 261 pr_warning("ceph: Mount failed due to unknown key error"
264 " %d: %s\n", key_err, name); 262 " %d: %s\n", key_err, name);
265 } 263 }
266 err = -EPERM; 264 err = -EPERM;
267 goto out; 265 goto out;
268 } 266 }
269 267
270 ckey = ukey->payload.data; 268 ckey = ukey->payload.data;
271 err = ceph_crypto_key_clone(dst, ckey); 269 err = ceph_crypto_key_clone(dst, ckey);
272 if (err) 270 if (err)
273 goto out_key; 271 goto out_key;
274 /* pass through, err is 0 */ 272 /* pass through, err is 0 */
275 273
276 out_key: 274 out_key:
277 key_put(ukey); 275 key_put(ukey);
278 out: 276 out:
279 return err; 277 return err;
280 } 278 }
281 279
282 int ceph_parse_options(struct ceph_options **popt, char *options, 280 int ceph_parse_options(struct ceph_options **popt, char *options,
283 const char *dev_name, const char *dev_name_end, 281 const char *dev_name, const char *dev_name_end,
284 int (*parse_extra_token)(char *c, void *private), 282 int (*parse_extra_token)(char *c, void *private),
285 void *private) 283 void *private)
286 { 284 {
287 struct ceph_options *opt; 285 struct ceph_options *opt;
288 const char *c; 286 const char *c;
289 int err = -ENOMEM; 287 int err = -ENOMEM;
290 substring_t argstr[MAX_OPT_ARGS]; 288 substring_t argstr[MAX_OPT_ARGS];
291 289
292 opt = kzalloc(sizeof(*opt), GFP_KERNEL); 290 opt = kzalloc(sizeof(*opt), GFP_KERNEL);
293 if (!opt) 291 if (!opt)
294 return err; 292 return err;
295 opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), 293 opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
296 GFP_KERNEL); 294 GFP_KERNEL);
297 if (!opt->mon_addr) 295 if (!opt->mon_addr)
298 goto out; 296 goto out;
299 297
300 dout("parse_options %p options '%s' dev_name '%s'\n", opt, options, 298 dout("parse_options %p options '%s' dev_name '%s'\n", opt, options,
301 dev_name); 299 dev_name);
302 300
303 /* start with defaults */ 301 /* start with defaults */
304 opt->flags = CEPH_OPT_DEFAULT; 302 opt->flags = CEPH_OPT_DEFAULT;
305 opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; 303 opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
306 opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; 304 opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
307 opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ 305 opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
308 opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ 306 opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
309 307
310 /* get mon ip(s) */ 308 /* get mon ip(s) */
311 /* ip1[:port1][,ip2[:port2]...] */ 309 /* ip1[:port1][,ip2[:port2]...] */
312 err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr, 310 err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr,
313 CEPH_MAX_MON, &opt->num_mon); 311 CEPH_MAX_MON, &opt->num_mon);
314 if (err < 0) 312 if (err < 0)
315 goto out; 313 goto out;
316 314
317 /* parse mount options */ 315 /* parse mount options */
318 while ((c = strsep(&options, ",")) != NULL) { 316 while ((c = strsep(&options, ",")) != NULL) {
319 int token, intval, ret; 317 int token, intval, ret;
320 if (!*c) 318 if (!*c)
321 continue; 319 continue;
322 err = -EINVAL; 320 err = -EINVAL;
323 token = match_token((char *)c, opt_tokens, argstr); 321 token = match_token((char *)c, opt_tokens, argstr);
324 if (token < 0 && parse_extra_token) { 322 if (token < 0 && parse_extra_token) {
325 /* extra? */ 323 /* extra? */
326 err = parse_extra_token((char *)c, private); 324 err = parse_extra_token((char *)c, private);
327 if (err < 0) { 325 if (err < 0) {
328 pr_err("bad option at '%s'\n", c); 326 pr_err("bad option at '%s'\n", c);
329 goto out; 327 goto out;
330 } 328 }
331 continue; 329 continue;
332 } 330 }
333 if (token < Opt_last_int) { 331 if (token < Opt_last_int) {
334 ret = match_int(&argstr[0], &intval); 332 ret = match_int(&argstr[0], &intval);
335 if (ret < 0) { 333 if (ret < 0) {
336 pr_err("bad mount option arg (not int) " 334 pr_err("bad mount option arg (not int) "
337 "at '%s'\n", c); 335 "at '%s'\n", c);
338 continue; 336 continue;
339 } 337 }
340 dout("got int token %d val %d\n", token, intval); 338 dout("got int token %d val %d\n", token, intval);
341 } else if (token > Opt_last_int && token < Opt_last_string) { 339 } else if (token > Opt_last_int && token < Opt_last_string) {
342 dout("got string token %d val %s\n", token, 340 dout("got string token %d val %s\n", token,
343 argstr[0].from); 341 argstr[0].from);
344 } else { 342 } else {
345 dout("got token %d\n", token); 343 dout("got token %d\n", token);
346 } 344 }
347 switch (token) { 345 switch (token) {
348 case Opt_ip: 346 case Opt_ip:
349 err = ceph_parse_ips(argstr[0].from, 347 err = ceph_parse_ips(argstr[0].from,
350 argstr[0].to, 348 argstr[0].to,
351 &opt->my_addr, 349 &opt->my_addr,
352 1, NULL); 350 1, NULL);
353 if (err < 0) 351 if (err < 0)
354 goto out; 352 goto out;
355 opt->flags |= CEPH_OPT_MYIP; 353 opt->flags |= CEPH_OPT_MYIP;
356 break; 354 break;
357 355
358 case Opt_fsid: 356 case Opt_fsid:
359 err = parse_fsid(argstr[0].from, &opt->fsid); 357 err = parse_fsid(argstr[0].from, &opt->fsid);
360 if (err == 0) 358 if (err == 0)
361 opt->flags |= CEPH_OPT_FSID; 359 opt->flags |= CEPH_OPT_FSID;
362 break; 360 break;
363 case Opt_name: 361 case Opt_name:
364 opt->name = kstrndup(argstr[0].from, 362 opt->name = kstrndup(argstr[0].from,
365 argstr[0].to-argstr[0].from, 363 argstr[0].to-argstr[0].from,
366 GFP_KERNEL); 364 GFP_KERNEL);
367 break; 365 break;
368 case Opt_secret: 366 case Opt_secret:
369 opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); 367 opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL);
370 if (!opt->key) { 368 if (!opt->key) {
371 err = -ENOMEM; 369 err = -ENOMEM;
372 goto out; 370 goto out;
373 } 371 }
374 err = ceph_crypto_key_unarmor(opt->key, argstr[0].from); 372 err = ceph_crypto_key_unarmor(opt->key, argstr[0].from);
375 if (err < 0) 373 if (err < 0)
376 goto out; 374 goto out;
377 break; 375 break;
378 case Opt_key: 376 case Opt_key:
379 opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); 377 opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL);
380 if (!opt->key) { 378 if (!opt->key) {
381 err = -ENOMEM; 379 err = -ENOMEM;
382 goto out; 380 goto out;
383 } 381 }
384 err = get_secret(opt->key, argstr[0].from); 382 err = get_secret(opt->key, argstr[0].from);
385 if (err < 0) 383 if (err < 0)
386 goto out; 384 goto out;
387 break; 385 break;
388 386
389 /* misc */ 387 /* misc */
390 case Opt_osdtimeout: 388 case Opt_osdtimeout:
391 opt->osd_timeout = intval; 389 opt->osd_timeout = intval;
392 break; 390 break;
393 case Opt_osdkeepalivetimeout: 391 case Opt_osdkeepalivetimeout:
394 opt->osd_keepalive_timeout = intval; 392 opt->osd_keepalive_timeout = intval;
395 break; 393 break;
396 case Opt_osd_idle_ttl: 394 case Opt_osd_idle_ttl:
397 opt->osd_idle_ttl = intval; 395 opt->osd_idle_ttl = intval;
398 break; 396 break;
399 case Opt_mount_timeout: 397 case Opt_mount_timeout:
400 opt->mount_timeout = intval; 398 opt->mount_timeout = intval;
401 break; 399 break;
402 400
403 case Opt_noshare: 401 case Opt_noshare:
404 opt->flags |= CEPH_OPT_NOSHARE; 402 opt->flags |= CEPH_OPT_NOSHARE;
405 break; 403 break;
406 404
407 case Opt_nocrc: 405 case Opt_nocrc:
408 opt->flags |= CEPH_OPT_NOCRC; 406 opt->flags |= CEPH_OPT_NOCRC;
409 break; 407 break;
410 408
411 default: 409 default:
412 BUG_ON(token); 410 BUG_ON(token);
413 } 411 }
414 } 412 }
415 413
416 /* success */ 414 /* success */
417 *popt = opt; 415 *popt = opt;
418 return 0; 416 return 0;
419 417
420 out: 418 out:
421 ceph_destroy_options(opt); 419 ceph_destroy_options(opt);
422 return err; 420 return err;
423 } 421 }
424 EXPORT_SYMBOL(ceph_parse_options); 422 EXPORT_SYMBOL(ceph_parse_options);
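/*
 * Editor's note: a minimal, hypothetical sketch of driving the parser
 * above. The monitor address and option string are illustrative only,
 * and no extra-token callback is registered. On success the caller owns
 * the returned ceph_options and must release it with
 * ceph_destroy_options().
 */
#include <linux/string.h>
#include <linux/ceph/libceph.h>

static int example_parse_options(void)
{
	struct ceph_options *opt;
	char options[] = "name=admin,osdtimeout=60";	/* mutable: strsep() writes to it */
	const char *mons = "192.168.0.1:6789";		/* placeholder monitor address */
	int err;

	err = ceph_parse_options(&opt, options, mons, mons + strlen(mons),
				 NULL, NULL);		/* no extra-token hook */
	if (err < 0)
		return err;

	ceph_destroy_options(opt);
	return 0;
}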
425 423
426 u64 ceph_client_id(struct ceph_client *client) 424 u64 ceph_client_id(struct ceph_client *client)
427 { 425 {
428 return client->monc.auth->global_id; 426 return client->monc.auth->global_id;
429 } 427 }
430 EXPORT_SYMBOL(ceph_client_id); 428 EXPORT_SYMBOL(ceph_client_id);
431 429
432 /* 430 /*
433 * create a fresh client instance 431 * create a fresh client instance
434 */ 432 */
435 struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, 433 struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
436 unsigned supported_features, 434 unsigned supported_features,
437 unsigned required_features) 435 unsigned required_features)
438 { 436 {
439 struct ceph_client *client; 437 struct ceph_client *client;
440 struct ceph_entity_addr *myaddr = NULL; 438 struct ceph_entity_addr *myaddr = NULL;
441 int err = -ENOMEM; 439 int err = -ENOMEM;
442 440
443 client = kzalloc(sizeof(*client), GFP_KERNEL); 441 client = kzalloc(sizeof(*client), GFP_KERNEL);
444 if (client == NULL) 442 if (client == NULL)
445 return ERR_PTR(-ENOMEM); 443 return ERR_PTR(-ENOMEM);
446 444
447 client->private = private; 445 client->private = private;
448 client->options = opt; 446 client->options = opt;
449 447
450 mutex_init(&client->mount_mutex); 448 mutex_init(&client->mount_mutex);
451 init_waitqueue_head(&client->auth_wq); 449 init_waitqueue_head(&client->auth_wq);
452 client->auth_err = 0; 450 client->auth_err = 0;
453 451
454 client->extra_mon_dispatch = NULL; 452 client->extra_mon_dispatch = NULL;
455 client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT | 453 client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT |
456 supported_features; 454 supported_features;
457 client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT | 455 client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT |
458 required_features; 456 required_features;
459 457
460 /* msgr */ 458 /* msgr */
461 if (ceph_test_opt(client, MYIP)) 459 if (ceph_test_opt(client, MYIP))
462 myaddr = &client->options->my_addr; 460 myaddr = &client->options->my_addr;
463 client->msgr = ceph_messenger_create(myaddr, 461 client->msgr = ceph_messenger_create(myaddr,
464 client->supported_features, 462 client->supported_features,
465 client->required_features); 463 client->required_features);
466 if (IS_ERR(client->msgr)) { 464 if (IS_ERR(client->msgr)) {
467 err = PTR_ERR(client->msgr); 465 err = PTR_ERR(client->msgr);
468 goto fail; 466 goto fail;
469 } 467 }
470 client->msgr->nocrc = ceph_test_opt(client, NOCRC); 468 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
471 469
472 /* subsystems */ 470 /* subsystems */
473 err = ceph_monc_init(&client->monc, client); 471 err = ceph_monc_init(&client->monc, client);
474 if (err < 0) 472 if (err < 0)
475 goto fail_msgr; 473 goto fail_msgr;
476 err = ceph_osdc_init(&client->osdc, client); 474 err = ceph_osdc_init(&client->osdc, client);
477 if (err < 0) 475 if (err < 0)
478 goto fail_monc; 476 goto fail_monc;
479 477
480 return client; 478 return client;
481 479
482 fail_monc: 480 fail_monc:
483 ceph_monc_stop(&client->monc); 481 ceph_monc_stop(&client->monc);
484 fail_msgr: 482 fail_msgr:
485 ceph_messenger_destroy(client->msgr); 483 ceph_messenger_destroy(client->msgr);
486 fail: 484 fail:
487 kfree(client); 485 kfree(client);
488 return ERR_PTR(err); 486 return ERR_PTR(err);
489 } 487 }
490 EXPORT_SYMBOL(ceph_create_client); 488 EXPORT_SYMBOL(ceph_create_client);
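/*
 * Editor's note: hedged sketch of the ownership contract implied by the
 * error paths above. ceph_create_client() adopts 'opt' only on success
 * (ceph_destroy_client() frees it later); on failure the options remain
 * the caller's to destroy. The zero feature masks are placeholders.
 */
static struct ceph_client *example_create(struct ceph_options *opt)
{
	struct ceph_client *client;

	client = ceph_create_client(opt, NULL /* private */, 0, 0);
	if (IS_ERR(client))
		ceph_destroy_options(opt);	/* not adopted on error */
	return client;
}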
491 489
492 void ceph_destroy_client(struct ceph_client *client) 490 void ceph_destroy_client(struct ceph_client *client)
493 { 491 {
494 dout("destroy_client %p\n", client); 492 dout("destroy_client %p\n", client);
495 493
496 /* unmount */ 494 /* unmount */
497 ceph_osdc_stop(&client->osdc); 495 ceph_osdc_stop(&client->osdc);
498 496
499 /* 497 /*
500 * make sure osd connections close out before destroying the 498 * make sure osd connections close out before destroying the
501 * auth module, which is needed to free those connections' 499 * auth module, which is needed to free those connections'
502 * ceph_authorizers. 500 * ceph_authorizers.
503 */ 501 */
504 ceph_msgr_flush(); 502 ceph_msgr_flush();
505 503
506 ceph_monc_stop(&client->monc); 504 ceph_monc_stop(&client->monc);
507 505
508 ceph_debugfs_client_cleanup(client); 506 ceph_debugfs_client_cleanup(client);
509 507
510 ceph_messenger_destroy(client->msgr); 508 ceph_messenger_destroy(client->msgr);
511 509
512 ceph_destroy_options(client->options); 510 ceph_destroy_options(client->options);
513 511
514 kfree(client); 512 kfree(client);
515 dout("destroy_client %p done\n", client); 513 dout("destroy_client %p done\n", client);
516 } 514 }
517 EXPORT_SYMBOL(ceph_destroy_client); 515 EXPORT_SYMBOL(ceph_destroy_client);
518 516
519 /* 517 /*
520 * true if we have the mon and osd maps, and have thus joined the cluster 518 * true if we have the mon and osd maps, and have thus joined the cluster
521 */ 519 */
522 static int have_mon_and_osd_map(struct ceph_client *client) 520 static int have_mon_and_osd_map(struct ceph_client *client)
523 { 521 {
524 return client->monc.monmap && client->monc.monmap->epoch && 522 return client->monc.monmap && client->monc.monmap->epoch &&
525 client->osdc.osdmap && client->osdc.osdmap->epoch; 523 client->osdc.osdmap && client->osdc.osdmap->epoch;
526 } 524 }
527 525
528 /* 526 /*
529 * mount: join the ceph cluster, and open root directory. 527 * mount: join the ceph cluster, and open root directory.
530 */ 528 */
531 int __ceph_open_session(struct ceph_client *client, unsigned long started) 529 int __ceph_open_session(struct ceph_client *client, unsigned long started)
532 { 530 {
533 int err; 531 int err;
534 unsigned long timeout = client->options->mount_timeout * HZ; 532 unsigned long timeout = client->options->mount_timeout * HZ;
535 533
536 /* open session, and wait for mon and osd maps */ 534 /* open session, and wait for mon and osd maps */
537 err = ceph_monc_open_session(&client->monc); 535 err = ceph_monc_open_session(&client->monc);
538 if (err < 0) 536 if (err < 0)
539 return err; 537 return err;
540 538
541 while (!have_mon_and_osd_map(client)) { 539 while (!have_mon_and_osd_map(client)) {
542 err = -EIO; 540 err = -EIO;
543 if (timeout && time_after_eq(jiffies, started + timeout)) 541 if (timeout && time_after_eq(jiffies, started + timeout))
544 return err; 542 return err;
545 543
546 /* wait */ 544 /* wait */
547 dout("mount waiting for mon_map\n"); 545 dout("mount waiting for mon_map\n");
548 err = wait_event_interruptible_timeout(client->auth_wq, 546 err = wait_event_interruptible_timeout(client->auth_wq,
549 have_mon_and_osd_map(client) || (client->auth_err < 0), 547 have_mon_and_osd_map(client) || (client->auth_err < 0),
550 timeout); 548 timeout);
551 if (err == -EINTR || err == -ERESTARTSYS) 549 if (err == -EINTR || err == -ERESTARTSYS)
552 return err; 550 return err;
553 if (client->auth_err < 0) 551 if (client->auth_err < 0)
554 return client->auth_err; 552 return client->auth_err;
555 } 553 }
556 554
557 return 0; 555 return 0;
558 } 556 }
559 EXPORT_SYMBOL(__ceph_open_session); 557 EXPORT_SYMBOL(__ceph_open_session);
560 558
561 559
562 int ceph_open_session(struct ceph_client *client) 560 int ceph_open_session(struct ceph_client *client)
563 { 561 {
564 int ret; 562 int ret;
565 unsigned long started = jiffies; /* note the start time */ 563 unsigned long started = jiffies; /* note the start time */
566 564
567 dout("open_session start\n"); 565 dout("open_session start\n");
568 mutex_lock(&client->mount_mutex); 566 mutex_lock(&client->mount_mutex);
569 567
570 ret = __ceph_open_session(client, started); 568 ret = __ceph_open_session(client, started);
571 569
572 mutex_unlock(&client->mount_mutex); 570 mutex_unlock(&client->mount_mutex);
573 return ret; 571 return ret;
574 } 572 }
575 EXPORT_SYMBOL(ceph_open_session); 573 EXPORT_SYMBOL(ceph_open_session);
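/*
 * Editor's note: illustrative call sequence only. ceph_open_session()
 * blocks until both the mon and osd maps arrive, authentication fails,
 * or the mount_timeout from the options elapses; per the loop in
 * __ceph_open_session() above, the timeout case returns -EIO.
 */
static int example_mount(struct ceph_client *client)
{
	int err = ceph_open_session(client);

	if (err < 0)
		ceph_destroy_client(client);
	return err;
}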
576 574
577 575
578 static int __init init_ceph_lib(void) 576 static int __init init_ceph_lib(void)
579 { 577 {
580 int ret = 0; 578 int ret = 0;
581 579
582 ret = ceph_debugfs_init(); 580 ret = ceph_debugfs_init();
583 if (ret < 0) 581 if (ret < 0)
584 goto out; 582 goto out;
585 583
586 ret = ceph_crypto_init(); 584 ret = ceph_crypto_init();
587 if (ret < 0) 585 if (ret < 0)
588 goto out_debugfs; 586 goto out_debugfs;
589 587
590 ret = ceph_msgr_init(); 588 ret = ceph_msgr_init();
591 if (ret < 0) 589 if (ret < 0)
592 goto out_crypto; 590 goto out_crypto;
593 591
594 pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n", 592 pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n",
595 CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL, 593 CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL,
596 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT, 594 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
597 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT); 595 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
598 596
599 return 0; 597 return 0;
600 598
601 out_crypto: 599 out_crypto:
602 ceph_crypto_shutdown(); 600 ceph_crypto_shutdown();
603 out_debugfs: 601 out_debugfs:
604 ceph_debugfs_cleanup(); 602 ceph_debugfs_cleanup();
605 out: 603 out:
606 return ret; 604 return ret;
607 } 605 }
608 606
609 static void __exit exit_ceph_lib(void) 607 static void __exit exit_ceph_lib(void)
610 { 608 {
611 dout("exit_ceph_lib\n"); 609 dout("exit_ceph_lib\n");
612 ceph_msgr_exit(); 610 ceph_msgr_exit();
613 ceph_crypto_shutdown(); 611 ceph_crypto_shutdown();
614 ceph_debugfs_cleanup(); 612 ceph_debugfs_cleanup();
615 } 613 }
616 614
617 module_init(init_ceph_lib); 615 module_init(init_ceph_lib);
618 module_exit(exit_ceph_lib); 616 module_exit(exit_ceph_lib);
619 617
620 MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 618 MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
621 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 619 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
622 MODULE_AUTHOR("Patience Warnick <patience@newdream.net>"); 620 MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
623 MODULE_DESCRIPTION("Ceph filesystem for Linux"); 621 MODULE_DESCRIPTION("Ceph filesystem for Linux");
624 MODULE_LICENSE("GPL"); 622 MODULE_LICENSE("GPL");
625 623
net/ceph/mon_client.c
1 #include <linux/ceph/ceph_debug.h> 1 #include <linux/ceph/ceph_debug.h>
2 2
3 #include <linux/module.h> 3 #include <linux/module.h>
4 #include <linux/types.h> 4 #include <linux/types.h>
5 #include <linux/slab.h> 5 #include <linux/slab.h>
6 #include <linux/random.h> 6 #include <linux/random.h>
7 #include <linux/sched.h> 7 #include <linux/sched.h>
8 8
9 #include <linux/ceph/mon_client.h> 9 #include <linux/ceph/mon_client.h>
10 #include <linux/ceph/libceph.h> 10 #include <linux/ceph/libceph.h>
11 #include <linux/ceph/debugfs.h>
11 #include <linux/ceph/decode.h> 12 #include <linux/ceph/decode.h>
12
13 #include <linux/ceph/auth.h> 13 #include <linux/ceph/auth.h>
14 14
15 /* 15 /*
16 * Interact with Ceph monitor cluster. Handle requests for new map 16 * Interact with Ceph monitor cluster. Handle requests for new map
17 * versions, and periodically resend as needed. Also implement 17 * versions, and periodically resend as needed. Also implement
18 * statfs() and umount(). 18 * statfs() and umount().
19 * 19 *
20 * A small cluster of Ceph "monitors" is responsible for managing critical 20 * A small cluster of Ceph "monitors" is responsible for managing critical
21 * cluster configuration and state information. An odd number (e.g., 3, 5) 21 * cluster configuration and state information. An odd number (e.g., 3, 5)
22 * of cmon daemons use a modified version of the Paxos part-time parliament 22 * of cmon daemons use a modified version of the Paxos part-time parliament
23 * algorithm to manage the MDS map (mds cluster membership), OSD map, and 23 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
24 * list of clients who have mounted the file system. 24 * list of clients who have mounted the file system.
25 * 25 *
26 * We maintain an open, active session with a monitor at all times in order to 26 * We maintain an open, active session with a monitor at all times in order to
27 * receive timely MDSMap updates. We periodically send a keepalive byte on the 27 * receive timely MDSMap updates. We periodically send a keepalive byte on the
28 * TCP socket to ensure we detect a failure. If the connection does break, we 28 * TCP socket to ensure we detect a failure. If the connection does break, we
29 * randomly hunt for a new monitor. Once the connection is reestablished, we 29 * randomly hunt for a new monitor. Once the connection is reestablished, we
30 * resend any outstanding requests. 30 * resend any outstanding requests.
31 */ 31 */
32 32
33 static const struct ceph_connection_operations mon_con_ops; 33 static const struct ceph_connection_operations mon_con_ops;
34 34
35 static int __validate_auth(struct ceph_mon_client *monc); 35 static int __validate_auth(struct ceph_mon_client *monc);
36 36
37 /* 37 /*
38 * Decode a monmap blob (e.g., during mount). 38 * Decode a monmap blob (e.g., during mount).
39 */ 39 */
40 struct ceph_monmap *ceph_monmap_decode(void *p, void *end) 40 struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
41 { 41 {
42 struct ceph_monmap *m = NULL; 42 struct ceph_monmap *m = NULL;
43 int i, err = -EINVAL; 43 int i, err = -EINVAL;
44 struct ceph_fsid fsid; 44 struct ceph_fsid fsid;
45 u32 epoch, num_mon; 45 u32 epoch, num_mon;
46 u16 version; 46 u16 version;
47 u32 len; 47 u32 len;
48 48
49 ceph_decode_32_safe(&p, end, len, bad); 49 ceph_decode_32_safe(&p, end, len, bad);
50 ceph_decode_need(&p, end, len, bad); 50 ceph_decode_need(&p, end, len, bad);
51 51
52 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p)); 52 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
53 53
54 ceph_decode_16_safe(&p, end, version, bad); 54 ceph_decode_16_safe(&p, end, version, bad);
55 55
56 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad); 56 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
57 ceph_decode_copy(&p, &fsid, sizeof(fsid)); 57 ceph_decode_copy(&p, &fsid, sizeof(fsid));
58 epoch = ceph_decode_32(&p); 58 epoch = ceph_decode_32(&p);
59 59
60 num_mon = ceph_decode_32(&p); 60 num_mon = ceph_decode_32(&p);
61 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad); 61 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
62 62
63 if (num_mon >= CEPH_MAX_MON) 63 if (num_mon >= CEPH_MAX_MON)
64 goto bad; 64 goto bad;
65 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS); 65 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
66 if (m == NULL) 66 if (m == NULL)
67 return ERR_PTR(-ENOMEM); 67 return ERR_PTR(-ENOMEM);
68 m->fsid = fsid; 68 m->fsid = fsid;
69 m->epoch = epoch; 69 m->epoch = epoch;
70 m->num_mon = num_mon; 70 m->num_mon = num_mon;
71 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0])); 71 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
72 for (i = 0; i < num_mon; i++) 72 for (i = 0; i < num_mon; i++)
73 ceph_decode_addr(&m->mon_inst[i].addr); 73 ceph_decode_addr(&m->mon_inst[i].addr);
74 74
75 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch, 75 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
76 m->num_mon); 76 m->num_mon);
77 for (i = 0; i < m->num_mon; i++) 77 for (i = 0; i < m->num_mon; i++)
78 dout("monmap_decode mon%d is %s\n", i, 78 dout("monmap_decode mon%d is %s\n", i,
79 ceph_pr_addr(&m->mon_inst[i].addr.in_addr)); 79 ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
80 return m; 80 return m;
81 81
82 bad: 82 bad:
83 dout("monmap_decode failed with %d\n", err); 83 dout("monmap_decode failed with %d\n", err);
84 kfree(m); 84 kfree(m);
85 return ERR_PTR(err); 85 return ERR_PTR(err);
86 } 86 }
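/*
 * Editor's note: the monmap blob layout, inferred purely from the
 * ceph_decode_* calls above (this file carries no separate spec):
 *
 *	__le32 len;                          bytes that follow
 *	__le16 version;
 *	struct ceph_fsid fsid;
 *	__le32 epoch;
 *	__le32 num_mon;                      must be < CEPH_MAX_MON
 *	struct ceph_entity_inst mon_inst[];  num_mon entries; addresses
 *	                                     fixed up by ceph_decode_addr()
 */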
87 87
88 /* 88 /*
89 * return true if *addr is included in the monmap. 89 * return true if *addr is included in the monmap.
90 */ 90 */
91 int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr) 91 int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
92 { 92 {
93 int i; 93 int i;
94 94
95 for (i = 0; i < m->num_mon; i++) 95 for (i = 0; i < m->num_mon; i++)
96 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0) 96 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
97 return 1; 97 return 1;
98 return 0; 98 return 0;
99 } 99 }
100 100
101 /* 101 /*
102 * Send an auth request. 102 * Send an auth request.
103 */ 103 */
104 static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) 104 static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
105 { 105 {
106 monc->pending_auth = 1; 106 monc->pending_auth = 1;
107 monc->m_auth->front.iov_len = len; 107 monc->m_auth->front.iov_len = len;
108 monc->m_auth->hdr.front_len = cpu_to_le32(len); 108 monc->m_auth->hdr.front_len = cpu_to_le32(len);
109 ceph_con_revoke(monc->con, monc->m_auth); 109 ceph_con_revoke(monc->con, monc->m_auth);
110 ceph_msg_get(monc->m_auth); /* keep our ref */ 110 ceph_msg_get(monc->m_auth); /* keep our ref */
111 ceph_con_send(monc->con, monc->m_auth); 111 ceph_con_send(monc->con, monc->m_auth);
112 } 112 }
113 113
114 /* 114 /*
115 * Close monitor session, if any. 115 * Close monitor session, if any.
116 */ 116 */
117 static void __close_session(struct ceph_mon_client *monc) 117 static void __close_session(struct ceph_mon_client *monc)
118 { 118 {
119 dout("__close_session closing mon%d\n", monc->cur_mon); 119 dout("__close_session closing mon%d\n", monc->cur_mon);
120 ceph_con_revoke(monc->con, monc->m_auth); 120 ceph_con_revoke(monc->con, monc->m_auth);
121 ceph_con_close(monc->con); 121 ceph_con_close(monc->con);
122 monc->cur_mon = -1; 122 monc->cur_mon = -1;
123 monc->pending_auth = 0; 123 monc->pending_auth = 0;
124 ceph_auth_reset(monc->auth); 124 ceph_auth_reset(monc->auth);
125 } 125 }
126 126
127 /* 127 /*
128 * Open a session with a (new) monitor. 128 * Open a session with a (new) monitor.
129 */ 129 */
130 static int __open_session(struct ceph_mon_client *monc) 130 static int __open_session(struct ceph_mon_client *monc)
131 { 131 {
132 unsigned char r; 132 unsigned char r;
133 int ret; 133 int ret;
134 134
135 if (monc->cur_mon < 0) { 135 if (monc->cur_mon < 0) {
136 get_random_bytes(&r, 1); 136 get_random_bytes(&r, 1);
137 monc->cur_mon = r % monc->monmap->num_mon; 137 monc->cur_mon = r % monc->monmap->num_mon;
138 dout("open_session num=%d r=%d -> mon%d\n", 138 dout("open_session num=%d r=%d -> mon%d\n",
139 monc->monmap->num_mon, r, monc->cur_mon); 139 monc->monmap->num_mon, r, monc->cur_mon);
140 monc->sub_sent = 0; 140 monc->sub_sent = 0;
141 monc->sub_renew_after = jiffies; /* i.e., expired */ 141 monc->sub_renew_after = jiffies; /* i.e., expired */
142 monc->want_next_osdmap = !!monc->want_next_osdmap; 142 monc->want_next_osdmap = !!monc->want_next_osdmap;
143 143
144 dout("open_session mon%d opening\n", monc->cur_mon); 144 dout("open_session mon%d opening\n", monc->cur_mon);
145 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON; 145 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
146 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon); 146 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
147 ceph_con_open(monc->con, 147 ceph_con_open(monc->con,
148 &monc->monmap->mon_inst[monc->cur_mon].addr); 148 &monc->monmap->mon_inst[monc->cur_mon].addr);
149 149
150 /* initiate authentication handshake */ 150 /* initiate authentication handshake */
151 ret = ceph_auth_build_hello(monc->auth, 151 ret = ceph_auth_build_hello(monc->auth,
152 monc->m_auth->front.iov_base, 152 monc->m_auth->front.iov_base,
153 monc->m_auth->front_max); 153 monc->m_auth->front_max);
154 __send_prepared_auth_request(monc, ret); 154 __send_prepared_auth_request(monc, ret);
155 } else { 155 } else {
156 dout("open_session mon%d already open\n", monc->cur_mon); 156 dout("open_session mon%d already open\n", monc->cur_mon);
157 } 157 }
158 return 0; 158 return 0;
159 } 159 }
160 160
161 static bool __sub_expired(struct ceph_mon_client *monc) 161 static bool __sub_expired(struct ceph_mon_client *monc)
162 { 162 {
163 return time_after_eq(jiffies, monc->sub_renew_after); 163 return time_after_eq(jiffies, monc->sub_renew_after);
164 } 164 }
165 165
166 /* 166 /*
167 * Reschedule delayed work timer. 167 * Reschedule delayed work timer.
168 */ 168 */
169 static void __schedule_delayed(struct ceph_mon_client *monc) 169 static void __schedule_delayed(struct ceph_mon_client *monc)
170 { 170 {
171 unsigned delay; 171 unsigned delay;
172 172
173 if (monc->cur_mon < 0 || __sub_expired(monc)) 173 if (monc->cur_mon < 0 || __sub_expired(monc))
174 delay = 10 * HZ; 174 delay = 10 * HZ;
175 else 175 else
176 delay = 20 * HZ; 176 delay = 20 * HZ;
177 dout("__schedule_delayed after %u\n", delay); 177 dout("__schedule_delayed after %u\n", delay);
178 schedule_delayed_work(&monc->delayed_work, delay); 178 schedule_delayed_work(&monc->delayed_work, delay);
179 } 179 }
180 180
181 /* 181 /*
182 * Send subscribe request for mdsmap and/or osdmap. 182 * Send subscribe request for mdsmap and/or osdmap.
183 */ 183 */
184 static void __send_subscribe(struct ceph_mon_client *monc) 184 static void __send_subscribe(struct ceph_mon_client *monc)
185 { 185 {
186 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n", 186 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
187 (unsigned)monc->sub_sent, __sub_expired(monc), 187 (unsigned)monc->sub_sent, __sub_expired(monc),
188 monc->want_next_osdmap); 188 monc->want_next_osdmap);
189 if ((__sub_expired(monc) && !monc->sub_sent) || 189 if ((__sub_expired(monc) && !monc->sub_sent) ||
190 monc->want_next_osdmap == 1) { 190 monc->want_next_osdmap == 1) {
191 struct ceph_msg *msg = monc->m_subscribe; 191 struct ceph_msg *msg = monc->m_subscribe;
192 struct ceph_mon_subscribe_item *i; 192 struct ceph_mon_subscribe_item *i;
193 void *p, *end; 193 void *p, *end;
194 int num; 194 int num;
195 195
196 p = msg->front.iov_base; 196 p = msg->front.iov_base;
197 end = p + msg->front_max; 197 end = p + msg->front_max;
198 198
199 num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap; 199 num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
200 ceph_encode_32(&p, num); 200 ceph_encode_32(&p, num);
201 201
202 if (monc->want_next_osdmap) { 202 if (monc->want_next_osdmap) {
203 dout("__send_subscribe to 'osdmap' %u\n", 203 dout("__send_subscribe to 'osdmap' %u\n",
204 (unsigned)monc->have_osdmap); 204 (unsigned)monc->have_osdmap);
205 ceph_encode_string(&p, end, "osdmap", 6); 205 ceph_encode_string(&p, end, "osdmap", 6);
206 i = p; 206 i = p;
207 i->have = cpu_to_le64(monc->have_osdmap); 207 i->have = cpu_to_le64(monc->have_osdmap);
208 i->onetime = 1; 208 i->onetime = 1;
209 p += sizeof(*i); 209 p += sizeof(*i);
210 monc->want_next_osdmap = 2; /* requested */ 210 monc->want_next_osdmap = 2; /* requested */
211 } 211 }
212 if (monc->want_mdsmap) { 212 if (monc->want_mdsmap) {
213 dout("__send_subscribe to 'mdsmap' %u+\n", 213 dout("__send_subscribe to 'mdsmap' %u+\n",
214 (unsigned)monc->have_mdsmap); 214 (unsigned)monc->have_mdsmap);
215 ceph_encode_string(&p, end, "mdsmap", 6); 215 ceph_encode_string(&p, end, "mdsmap", 6);
216 i = p; 216 i = p;
217 i->have = cpu_to_le64(monc->have_mdsmap); 217 i->have = cpu_to_le64(monc->have_mdsmap);
218 i->onetime = 0; 218 i->onetime = 0;
219 p += sizeof(*i); 219 p += sizeof(*i);
220 } 220 }
221 ceph_encode_string(&p, end, "monmap", 6); 221 ceph_encode_string(&p, end, "monmap", 6);
222 i = p; 222 i = p;
223 i->have = 0; 223 i->have = 0;
224 i->onetime = 0; 224 i->onetime = 0;
225 p += sizeof(*i); 225 p += sizeof(*i);
226 226
227 msg->front.iov_len = p - msg->front.iov_base; 227 msg->front.iov_len = p - msg->front.iov_base;
228 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 228 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
229 ceph_con_revoke(monc->con, msg); 229 ceph_con_revoke(monc->con, msg);
230 ceph_con_send(monc->con, ceph_msg_get(msg)); 230 ceph_con_send(monc->con, ceph_msg_get(msg));
231 231
232 monc->sub_sent = jiffies | 1; /* never 0 */ 232 monc->sub_sent = jiffies | 1; /* never 0 */
233 } 233 }
234 } 234 }
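/*
 * Editor's note: the subscribe payload assembled above is, informally
 * (inferred from the encoding calls, not from a protocol document):
 *
 *	__le32 num_entries;
 *	repeated {
 *		string name;                    "osdmap" / "mdsmap" / "monmap"
 *		struct ceph_mon_subscribe_item; { have, onetime }
 *	}
 *
 * 'have' is the epoch already held, so the monitor sends only newer
 * maps; 'onetime' asks for a single update instead of a standing
 * subscription.
 */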
235 235
236 static void handle_subscribe_ack(struct ceph_mon_client *monc, 236 static void handle_subscribe_ack(struct ceph_mon_client *monc,
237 struct ceph_msg *msg) 237 struct ceph_msg *msg)
238 { 238 {
239 unsigned seconds; 239 unsigned seconds;
240 struct ceph_mon_subscribe_ack *h = msg->front.iov_base; 240 struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
241 241
242 if (msg->front.iov_len < sizeof(*h)) 242 if (msg->front.iov_len < sizeof(*h))
243 goto bad; 243 goto bad;
244 seconds = le32_to_cpu(h->duration); 244 seconds = le32_to_cpu(h->duration);
245 245
246 mutex_lock(&monc->mutex); 246 mutex_lock(&monc->mutex);
247 if (monc->hunting) { 247 if (monc->hunting) {
248 pr_info("mon%d %s session established\n", 248 pr_info("mon%d %s session established\n",
249 monc->cur_mon, 249 monc->cur_mon,
250 ceph_pr_addr(&monc->con->peer_addr.in_addr)); 250 ceph_pr_addr(&monc->con->peer_addr.in_addr));
251 monc->hunting = false; 251 monc->hunting = false;
252 } 252 }
253 dout("handle_subscribe_ack after %d seconds\n", seconds); 253 dout("handle_subscribe_ack after %d seconds\n", seconds);
254 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1; 254 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
255 monc->sub_sent = 0; 255 monc->sub_sent = 0;
256 mutex_unlock(&monc->mutex); 256 mutex_unlock(&monc->mutex);
257 return; 257 return;
258 bad: 258 bad:
259 pr_err("got corrupt subscribe-ack msg\n"); 259 pr_err("got corrupt subscribe-ack msg\n");
260 ceph_msg_dump(msg); 260 ceph_msg_dump(msg);
261 } 261 }
262 262
263 /* 263 /*
264 * Keep track of which maps we have 264 * Keep track of which maps we have
265 */ 265 */
266 int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got) 266 int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
267 { 267 {
268 mutex_lock(&monc->mutex); 268 mutex_lock(&monc->mutex);
269 monc->have_mdsmap = got; 269 monc->have_mdsmap = got;
270 mutex_unlock(&monc->mutex); 270 mutex_unlock(&monc->mutex);
271 return 0; 271 return 0;
272 } 272 }
273 EXPORT_SYMBOL(ceph_monc_got_mdsmap); 273 EXPORT_SYMBOL(ceph_monc_got_mdsmap);
274 274
275 int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got) 275 int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
276 { 276 {
277 mutex_lock(&monc->mutex); 277 mutex_lock(&monc->mutex);
278 monc->have_osdmap = got; 278 monc->have_osdmap = got;
279 monc->want_next_osdmap = 0; 279 monc->want_next_osdmap = 0;
280 mutex_unlock(&monc->mutex); 280 mutex_unlock(&monc->mutex);
281 return 0; 281 return 0;
282 } 282 }
283 283
284 /* 284 /*
285 * Register interest in the next osdmap 285 * Register interest in the next osdmap
286 */ 286 */
287 void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) 287 void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
288 { 288 {
289 dout("request_next_osdmap have %u\n", monc->have_osdmap); 289 dout("request_next_osdmap have %u\n", monc->have_osdmap);
290 mutex_lock(&monc->mutex); 290 mutex_lock(&monc->mutex);
291 if (!monc->want_next_osdmap) 291 if (!monc->want_next_osdmap)
292 monc->want_next_osdmap = 1; 292 monc->want_next_osdmap = 1;
293 if (monc->want_next_osdmap < 2) 293 if (monc->want_next_osdmap < 2)
294 __send_subscribe(monc); 294 __send_subscribe(monc);
295 mutex_unlock(&monc->mutex); 295 mutex_unlock(&monc->mutex);
296 } 296 }
297 297
298 /* 298 /*
299 * Open a session with a monitor, scheduling the periodic keepalive. 299 * Open a session with a monitor, scheduling the periodic keepalive.
300 */ 300 */
301 int ceph_monc_open_session(struct ceph_mon_client *monc) 301 int ceph_monc_open_session(struct ceph_mon_client *monc)
302 { 302 {
303 mutex_lock(&monc->mutex); 303 mutex_lock(&monc->mutex);
304 __open_session(monc); 304 __open_session(monc);
305 __schedule_delayed(monc); 305 __schedule_delayed(monc);
306 mutex_unlock(&monc->mutex); 306 mutex_unlock(&monc->mutex);
307 return 0; 307 return 0;
308 } 308 }
309 EXPORT_SYMBOL(ceph_monc_open_session); 309 EXPORT_SYMBOL(ceph_monc_open_session);
310 310
311 /* 311 /*
312 * The monitor responds with a mount ack to indicate mount success. The 312 * The monitor responds with a mount ack to indicate mount success. The
313 * included client ticket allows the client to talk to MDSs and OSDs. 313 * included client ticket allows the client to talk to MDSs and OSDs.
314 */ 314 */
315 static void ceph_monc_handle_map(struct ceph_mon_client *monc, 315 static void ceph_monc_handle_map(struct ceph_mon_client *monc,
316 struct ceph_msg *msg) 316 struct ceph_msg *msg)
317 { 317 {
318 struct ceph_client *client = monc->client; 318 struct ceph_client *client = monc->client;
319 struct ceph_monmap *monmap = NULL, *old = monc->monmap; 319 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
320 void *p, *end; 320 void *p, *end;
321 321
322 mutex_lock(&monc->mutex); 322 mutex_lock(&monc->mutex);
323 323
324 dout("handle_monmap\n"); 324 dout("handle_monmap\n");
325 p = msg->front.iov_base; 325 p = msg->front.iov_base;
326 end = p + msg->front.iov_len; 326 end = p + msg->front.iov_len;
327 327
328 monmap = ceph_monmap_decode(p, end); 328 monmap = ceph_monmap_decode(p, end);
329 if (IS_ERR(monmap)) { 329 if (IS_ERR(monmap)) {
330 pr_err("problem decoding monmap, %d\n", 330 pr_err("problem decoding monmap, %d\n",
331 (int)PTR_ERR(monmap)); 331 (int)PTR_ERR(monmap));
332 goto out; 332 goto out;
333 } 333 }
334 334
335 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) { 335 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
336 kfree(monmap); 336 kfree(monmap);
337 goto out; 337 goto out;
338 } 338 }
339 339
340 client->monc.monmap = monmap; 340 client->monc.monmap = monmap;
341 kfree(old); 341 kfree(old);
342 342
343 if (!client->have_fsid) {
344 client->have_fsid = true;
345 mutex_unlock(&monc->mutex);
346 /*
347 * do debugfs initialization without mutex to avoid
348 * creating a locking dependency
349 */
350 ceph_debugfs_client_init(client);
351 goto out_unlocked;
352 }
343 out: 353 out:
344 mutex_unlock(&monc->mutex); 354 mutex_unlock(&monc->mutex);
355 out_unlocked:
345 wake_up_all(&client->auth_wq); 356 wake_up_all(&client->auth_wq);
346 } 357 }
347 358
348 /* 359 /*
349 * generic requests (e.g., statfs, poolop) 360 * generic requests (e.g., statfs, poolop)
350 */ 361 */
351 static struct ceph_mon_generic_request *__lookup_generic_req( 362 static struct ceph_mon_generic_request *__lookup_generic_req(
352 struct ceph_mon_client *monc, u64 tid) 363 struct ceph_mon_client *monc, u64 tid)
353 { 364 {
354 struct ceph_mon_generic_request *req; 365 struct ceph_mon_generic_request *req;
355 struct rb_node *n = monc->generic_request_tree.rb_node; 366 struct rb_node *n = monc->generic_request_tree.rb_node;
356 367
357 while (n) { 368 while (n) {
358 req = rb_entry(n, struct ceph_mon_generic_request, node); 369 req = rb_entry(n, struct ceph_mon_generic_request, node);
359 if (tid < req->tid) 370 if (tid < req->tid)
360 n = n->rb_left; 371 n = n->rb_left;
361 else if (tid > req->tid) 372 else if (tid > req->tid)
362 n = n->rb_right; 373 n = n->rb_right;
363 else 374 else
364 return req; 375 return req;
365 } 376 }
366 return NULL; 377 return NULL;
367 } 378 }
368 379
369 static void __insert_generic_request(struct ceph_mon_client *monc, 380 static void __insert_generic_request(struct ceph_mon_client *monc,
370 struct ceph_mon_generic_request *new) 381 struct ceph_mon_generic_request *new)
371 { 382 {
372 struct rb_node **p = &monc->generic_request_tree.rb_node; 383 struct rb_node **p = &monc->generic_request_tree.rb_node;
373 struct rb_node *parent = NULL; 384 struct rb_node *parent = NULL;
374 struct ceph_mon_generic_request *req = NULL; 385 struct ceph_mon_generic_request *req = NULL;
375 386
376 while (*p) { 387 while (*p) {
377 parent = *p; 388 parent = *p;
378 req = rb_entry(parent, struct ceph_mon_generic_request, node); 389 req = rb_entry(parent, struct ceph_mon_generic_request, node);
379 if (new->tid < req->tid) 390 if (new->tid < req->tid)
380 p = &(*p)->rb_left; 391 p = &(*p)->rb_left;
381 else if (new->tid > req->tid) 392 else if (new->tid > req->tid)
382 p = &(*p)->rb_right; 393 p = &(*p)->rb_right;
383 else 394 else
384 BUG(); 395 BUG();
385 } 396 }
386 397
387 rb_link_node(&new->node, parent, p); 398 rb_link_node(&new->node, parent, p);
388 rb_insert_color(&new->node, &monc->generic_request_tree); 399 rb_insert_color(&new->node, &monc->generic_request_tree);
389 } 400 }
390 401
391 static void release_generic_request(struct kref *kref) 402 static void release_generic_request(struct kref *kref)
392 { 403 {
393 struct ceph_mon_generic_request *req = 404 struct ceph_mon_generic_request *req =
394 container_of(kref, struct ceph_mon_generic_request, kref); 405 container_of(kref, struct ceph_mon_generic_request, kref);
395 406
396 if (req->reply) 407 if (req->reply)
397 ceph_msg_put(req->reply); 408 ceph_msg_put(req->reply);
398 if (req->request) 409 if (req->request)
399 ceph_msg_put(req->request); 410 ceph_msg_put(req->request);
400 411
401 kfree(req); 412 kfree(req);
402 } 413 }
403 414
404 static void put_generic_request(struct ceph_mon_generic_request *req) 415 static void put_generic_request(struct ceph_mon_generic_request *req)
405 { 416 {
406 kref_put(&req->kref, release_generic_request); 417 kref_put(&req->kref, release_generic_request);
407 } 418 }
408 419
409 static void get_generic_request(struct ceph_mon_generic_request *req) 420 static void get_generic_request(struct ceph_mon_generic_request *req)
410 { 421 {
411 kref_get(&req->kref); 422 kref_get(&req->kref);
412 } 423 }
413 424
414 static struct ceph_msg *get_generic_reply(struct ceph_connection *con, 425 static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
415 struct ceph_msg_header *hdr, 426 struct ceph_msg_header *hdr,
416 int *skip) 427 int *skip)
417 { 428 {
418 struct ceph_mon_client *monc = con->private; 429 struct ceph_mon_client *monc = con->private;
419 struct ceph_mon_generic_request *req; 430 struct ceph_mon_generic_request *req;
420 u64 tid = le64_to_cpu(hdr->tid); 431 u64 tid = le64_to_cpu(hdr->tid);
421 struct ceph_msg *m; 432 struct ceph_msg *m;
422 433
423 mutex_lock(&monc->mutex); 434 mutex_lock(&monc->mutex);
424 req = __lookup_generic_req(monc, tid); 435 req = __lookup_generic_req(monc, tid);
425 if (!req) { 436 if (!req) {
426 dout("get_generic_reply %lld dne\n", tid); 437 dout("get_generic_reply %lld dne\n", tid);
427 *skip = 1; 438 *skip = 1;
428 m = NULL; 439 m = NULL;
429 } else { 440 } else {
430 dout("get_generic_reply %lld got %p\n", tid, req->reply); 441 dout("get_generic_reply %lld got %p\n", tid, req->reply);
431 m = ceph_msg_get(req->reply); 442 m = ceph_msg_get(req->reply);
432 /* 443 /*
433 * we don't need to track the connection reading into 444 * we don't need to track the connection reading into
434 * this reply because we only have one open connection 445 * this reply because we only have one open connection
435 * at a time, ever. 446 * at a time, ever.
436 */ 447 */
437 } 448 }
438 mutex_unlock(&monc->mutex); 449 mutex_unlock(&monc->mutex);
439 return m; 450 return m;
440 } 451 }
441 452
442 static int do_generic_request(struct ceph_mon_client *monc, 453 static int do_generic_request(struct ceph_mon_client *monc,
443 struct ceph_mon_generic_request *req) 454 struct ceph_mon_generic_request *req)
444 { 455 {
445 int err; 456 int err;
446 457
447 /* register request */ 458 /* register request */
448 mutex_lock(&monc->mutex); 459 mutex_lock(&monc->mutex);
449 req->tid = ++monc->last_tid; 460 req->tid = ++monc->last_tid;
450 req->request->hdr.tid = cpu_to_le64(req->tid); 461 req->request->hdr.tid = cpu_to_le64(req->tid);
451 __insert_generic_request(monc, req); 462 __insert_generic_request(monc, req);
452 monc->num_generic_requests++; 463 monc->num_generic_requests++;
453 ceph_con_send(monc->con, ceph_msg_get(req->request)); 464 ceph_con_send(monc->con, ceph_msg_get(req->request));
454 mutex_unlock(&monc->mutex); 465 mutex_unlock(&monc->mutex);
455 466
456 err = wait_for_completion_interruptible(&req->completion); 467 err = wait_for_completion_interruptible(&req->completion);
457 468
458 mutex_lock(&monc->mutex); 469 mutex_lock(&monc->mutex);
459 rb_erase(&req->node, &monc->generic_request_tree); 470 rb_erase(&req->node, &monc->generic_request_tree);
460 monc->num_generic_requests--; 471 monc->num_generic_requests--;
461 mutex_unlock(&monc->mutex); 472 mutex_unlock(&monc->mutex);
462 473
463 if (!err) 474 if (!err)
464 err = req->result; 475 err = req->result;
465 return err; 476 return err;
466 } 477 }
467 478
468 /* 479 /*
469 * statfs 480 * statfs
470 */ 481 */
471 static void handle_statfs_reply(struct ceph_mon_client *monc, 482 static void handle_statfs_reply(struct ceph_mon_client *monc,
472 struct ceph_msg *msg) 483 struct ceph_msg *msg)
473 { 484 {
474 struct ceph_mon_generic_request *req; 485 struct ceph_mon_generic_request *req;
475 struct ceph_mon_statfs_reply *reply = msg->front.iov_base; 486 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
476 u64 tid = le64_to_cpu(msg->hdr.tid); 487 u64 tid = le64_to_cpu(msg->hdr.tid);
477 488
478 if (msg->front.iov_len != sizeof(*reply)) 489 if (msg->front.iov_len != sizeof(*reply))
479 goto bad; 490 goto bad;
480 dout("handle_statfs_reply %p tid %llu\n", msg, tid); 491 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
481 492
482 mutex_lock(&monc->mutex); 493 mutex_lock(&monc->mutex);
483 req = __lookup_generic_req(monc, tid); 494 req = __lookup_generic_req(monc, tid);
484 if (req) { 495 if (req) {
485 *(struct ceph_statfs *)req->buf = reply->st; 496 *(struct ceph_statfs *)req->buf = reply->st;
486 req->result = 0; 497 req->result = 0;
487 get_generic_request(req); 498 get_generic_request(req);
488 } 499 }
489 mutex_unlock(&monc->mutex); 500 mutex_unlock(&monc->mutex);
490 if (req) { 501 if (req) {
491 complete_all(&req->completion); 502 complete_all(&req->completion);
492 put_generic_request(req); 503 put_generic_request(req);
493 } 504 }
494 return; 505 return;
495 506
496 bad: 507 bad:
497 pr_err("corrupt generic reply, tid %llu\n", tid); 508 pr_err("corrupt generic reply, tid %llu\n", tid);
498 ceph_msg_dump(msg); 509 ceph_msg_dump(msg);
499 } 510 }
500 511
501 /* 512 /*
502 * Do a synchronous statfs(). 513 * Do a synchronous statfs().
503 */ 514 */
504 int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) 515 int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
505 { 516 {
506 struct ceph_mon_generic_request *req; 517 struct ceph_mon_generic_request *req;
507 struct ceph_mon_statfs *h; 518 struct ceph_mon_statfs *h;
508 int err; 519 int err;
509 520
510 req = kzalloc(sizeof(*req), GFP_NOFS); 521 req = kzalloc(sizeof(*req), GFP_NOFS);
511 if (!req) 522 if (!req)
512 return -ENOMEM; 523 return -ENOMEM;
513 524
514 kref_init(&req->kref); 525 kref_init(&req->kref);
515 req->buf = buf; 526 req->buf = buf;
516 req->buf_len = sizeof(*buf); 527 req->buf_len = sizeof(*buf);
517 init_completion(&req->completion); 528 init_completion(&req->completion);
518 529
519 err = -ENOMEM; 530 err = -ENOMEM;
520 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS, 531 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS,
521 true); 532 true);
522 if (!req->request) 533 if (!req->request)
523 goto out; 534 goto out;
524 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS, 535 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS,
525 true); 536 true);
526 if (!req->reply) 537 if (!req->reply)
527 goto out; 538 goto out;
528 539
529 /* fill out request */ 540 /* fill out request */
530 h = req->request->front.iov_base; 541 h = req->request->front.iov_base;
531 h->monhdr.have_version = 0; 542 h->monhdr.have_version = 0;
532 h->monhdr.session_mon = cpu_to_le16(-1); 543 h->monhdr.session_mon = cpu_to_le16(-1);
533 h->monhdr.session_mon_tid = 0; 544 h->monhdr.session_mon_tid = 0;
534 h->fsid = monc->monmap->fsid; 545 h->fsid = monc->monmap->fsid;
535 546
536 err = do_generic_request(monc, req); 547 err = do_generic_request(monc, req);
537 548
538 out: 549 out:
539 kref_put(&req->kref, release_generic_request); 550 kref_put(&req->kref, release_generic_request);
540 return err; 551 return err;
541 } 552 }
542 EXPORT_SYMBOL(ceph_monc_do_statfs); 553 EXPORT_SYMBOL(ceph_monc_do_statfs);
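/*
 * Editor's note: hedged usage sketch. The field names follow this era's
 * struct ceph_statfs (little-endian counters in KB), and the client is
 * assumed to already have an open session.
 */
static int example_statfs(struct ceph_client *client)
{
	struct ceph_statfs st;
	int err = ceph_monc_do_statfs(&client->monc, &st);

	if (!err)
		pr_info("cluster: %llu KB total, %llu KB avail\n",
			le64_to_cpu(st.kb), le64_to_cpu(st.kb_avail));
	return err;
}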
543 554
544 /* 555 /*
545 * pool ops 556 * pool ops
546 */ 557 */
547 static int get_poolop_reply_buf(const char *src, size_t src_len, 558 static int get_poolop_reply_buf(const char *src, size_t src_len,
548 char *dst, size_t dst_len) 559 char *dst, size_t dst_len)
549 { 560 {
550 u32 buf_len; 561 u32 buf_len;
551 562
552 if (src_len != sizeof(u32) + dst_len) 563 if (src_len != sizeof(u32) + dst_len)
553 return -EINVAL; 564 return -EINVAL;
554 565
555 buf_len = le32_to_cpu(*(u32 *)src); 566 buf_len = le32_to_cpu(*(u32 *)src);
556 if (buf_len != dst_len) 567 if (buf_len != dst_len)
557 return -EINVAL; 568 return -EINVAL;
558 569
559 memcpy(dst, src + sizeof(u32), dst_len); 570 memcpy(dst, src + sizeof(u32), dst_len);
560 return 0; 571 return 0;
561 } 572 }
562 573
563 static void handle_poolop_reply(struct ceph_mon_client *monc, 574 static void handle_poolop_reply(struct ceph_mon_client *monc,
564 struct ceph_msg *msg) 575 struct ceph_msg *msg)
565 { 576 {
566 struct ceph_mon_generic_request *req; 577 struct ceph_mon_generic_request *req;
567 struct ceph_mon_poolop_reply *reply = msg->front.iov_base; 578 struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
568 u64 tid = le64_to_cpu(msg->hdr.tid); 579 u64 tid = le64_to_cpu(msg->hdr.tid);
569 580
570 if (msg->front.iov_len < sizeof(*reply)) 581 if (msg->front.iov_len < sizeof(*reply))
571 goto bad; 582 goto bad;
572 dout("handle_poolop_reply %p tid %llu\n", msg, tid); 583 dout("handle_poolop_reply %p tid %llu\n", msg, tid);
573 584
574 mutex_lock(&monc->mutex); 585 mutex_lock(&monc->mutex);
575 req = __lookup_generic_req(monc, tid); 586 req = __lookup_generic_req(monc, tid);
576 if (req) { 587 if (req) {
577 if (req->buf_len && 588 if (req->buf_len &&
578 get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply), 589 get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
579 msg->front.iov_len - sizeof(*reply), 590 msg->front.iov_len - sizeof(*reply),
580 req->buf, req->buf_len) < 0) { 591 req->buf, req->buf_len) < 0) {
581 mutex_unlock(&monc->mutex); 592 mutex_unlock(&monc->mutex);
582 goto bad; 593 goto bad;
583 } 594 }
584 req->result = le32_to_cpu(reply->reply_code); 595 req->result = le32_to_cpu(reply->reply_code);
585 get_generic_request(req); 596 get_generic_request(req);
586 } 597 }
587 mutex_unlock(&monc->mutex); 598 mutex_unlock(&monc->mutex);
588 if (req) { 599 if (req) {
589 complete(&req->completion); 600 complete(&req->completion);
590 put_generic_request(req); 601 put_generic_request(req);
591 } 602 }
592 return; 603 return;
593 604
594 bad: 605 bad:
595 pr_err("corrupt generic reply, tid %llu\n", tid); 606 pr_err("corrupt generic reply, tid %llu\n", tid);
596 ceph_msg_dump(msg); 607 ceph_msg_dump(msg);
597 } 608 }
598 609
599 /* 610 /*
600 * Do a synchronous pool op. 611 * Do a synchronous pool op.
601 */ 612 */
602 int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op, 613 int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
603 u32 pool, u64 snapid, 614 u32 pool, u64 snapid,
604 char *buf, int len) 615 char *buf, int len)
605 { 616 {
606 struct ceph_mon_generic_request *req; 617 struct ceph_mon_generic_request *req;
607 struct ceph_mon_poolop *h; 618 struct ceph_mon_poolop *h;
608 int err; 619 int err;
609 620
610 req = kzalloc(sizeof(*req), GFP_NOFS); 621 req = kzalloc(sizeof(*req), GFP_NOFS);
611 if (!req) 622 if (!req)
612 return -ENOMEM; 623 return -ENOMEM;
613 624
614 kref_init(&req->kref); 625 kref_init(&req->kref);
615 req->buf = buf; 626 req->buf = buf;
616 req->buf_len = len; 627 req->buf_len = len;
617 init_completion(&req->completion); 628 init_completion(&req->completion);
618 629
619 err = -ENOMEM; 630 err = -ENOMEM;
620 req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS, 631 req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS,
621 true); 632 true);
622 if (!req->request) 633 if (!req->request)
623 goto out; 634 goto out;
624 req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS, 635 req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS,
625 true); 636 true);
626 if (!req->reply) 637 if (!req->reply)
627 goto out; 638 goto out;
628 639
629 /* fill out request */ 640 /* fill out request */
630 req->request->hdr.version = cpu_to_le16(2); 641 req->request->hdr.version = cpu_to_le16(2);
631 h = req->request->front.iov_base; 642 h = req->request->front.iov_base;
632 h->monhdr.have_version = 0; 643 h->monhdr.have_version = 0;
633 h->monhdr.session_mon = cpu_to_le16(-1); 644 h->monhdr.session_mon = cpu_to_le16(-1);
634 h->monhdr.session_mon_tid = 0; 645 h->monhdr.session_mon_tid = 0;
635 h->fsid = monc->monmap->fsid; 646 h->fsid = monc->monmap->fsid;
636 h->pool = cpu_to_le32(pool); 647 h->pool = cpu_to_le32(pool);
637 h->op = cpu_to_le32(op); 648 h->op = cpu_to_le32(op);
638 h->auid = 0; 649 h->auid = 0;
639 h->snapid = cpu_to_le64(snapid); 650 h->snapid = cpu_to_le64(snapid);
640 h->name_len = 0; 651 h->name_len = 0;
641 652
642 err = do_generic_request(monc, req); 653 err = do_generic_request(monc, req);
643 654
644 out: 655 out:
645 kref_put(&req->kref, release_generic_request); 656 kref_put(&req->kref, release_generic_request);
646 return err; 657 return err;
647 } 658 }
648 659
649 int ceph_monc_create_snapid(struct ceph_mon_client *monc, 660 int ceph_monc_create_snapid(struct ceph_mon_client *monc,
650 u32 pool, u64 *snapid) 661 u32 pool, u64 *snapid)
651 { 662 {
652 return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, 663 return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
653 pool, 0, (char *)snapid, sizeof(*snapid)); 664 pool, 0, (char *)snapid, sizeof(*snapid));
654 665
655 } 666 }
656 EXPORT_SYMBOL(ceph_monc_create_snapid); 667 EXPORT_SYMBOL(ceph_monc_create_snapid);
657 668
658 int ceph_monc_delete_snapid(struct ceph_mon_client *monc, 669 int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
659 u32 pool, u64 snapid) 670 u32 pool, u64 snapid)
660 { 671 {
661 return ceph_monc_do_poolop(monc, POOL_OP_DELETE_UNMANAGED_SNAP, 672 return ceph_monc_do_poolop(monc, POOL_OP_DELETE_UNMANAGED_SNAP,
662 pool, snapid, NULL, 0); 673 pool, snapid, NULL, 0);
663 674
664 } 675 }
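/*
 * Editor's note: illustrative pairing of the two snapid helpers above;
 * the pool id is a placeholder and errors are simply passed through.
 */
static int example_snap_roundtrip(struct ceph_mon_client *monc, u32 pool)
{
	u64 snapid;
	int err = ceph_monc_create_snapid(monc, pool, &snapid);

	if (!err)
		err = ceph_monc_delete_snapid(monc, pool, snapid);
	return err;
}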
665 676
666 /* 677 /*
667 * Resend pending generic requests. 678 * Resend pending generic requests.
668 */ 679 */
669 static void __resend_generic_request(struct ceph_mon_client *monc) 680 static void __resend_generic_request(struct ceph_mon_client *monc)
670 { 681 {
671 struct ceph_mon_generic_request *req; 682 struct ceph_mon_generic_request *req;
672 struct rb_node *p; 683 struct rb_node *p;
673 684
674 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) { 685 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
675 req = rb_entry(p, struct ceph_mon_generic_request, node); 686 req = rb_entry(p, struct ceph_mon_generic_request, node);
676 ceph_con_revoke(monc->con, req->request); 687 ceph_con_revoke(monc->con, req->request);
677 ceph_con_send(monc->con, ceph_msg_get(req->request)); 688 ceph_con_send(monc->con, ceph_msg_get(req->request));
678 } 689 }
679 } 690 }
680 691
681 /* 692 /*
682 * Delayed work. If we haven't mounted yet, retry. Otherwise, 693 * Delayed work. If we haven't mounted yet, retry. Otherwise,
683 * renew/retry subscription as needed (in case it is timing out, or we 694 * renew/retry subscription as needed (in case it is timing out, or we
684 * got an ENOMEM). And keep the monitor connection alive. 695 * got an ENOMEM). And keep the monitor connection alive.
685 */ 696 */
686 static void delayed_work(struct work_struct *work) 697 static void delayed_work(struct work_struct *work)
687 { 698 {
688 struct ceph_mon_client *monc = 699 struct ceph_mon_client *monc =
689 container_of(work, struct ceph_mon_client, delayed_work.work); 700 container_of(work, struct ceph_mon_client, delayed_work.work);
690 701
691 dout("monc delayed_work\n"); 702 dout("monc delayed_work\n");
692 mutex_lock(&monc->mutex); 703 mutex_lock(&monc->mutex);
693 if (monc->hunting) { 704 if (monc->hunting) {
694 __close_session(monc); 705 __close_session(monc);
695 __open_session(monc); /* continue hunting */ 706 __open_session(monc); /* continue hunting */
696 } else { 707 } else {
697 ceph_con_keepalive(monc->con); 708 ceph_con_keepalive(monc->con);
698 709
699 __validate_auth(monc); 710 __validate_auth(monc);
700 711
701 if (monc->auth->ops->is_authenticated(monc->auth)) 712 if (monc->auth->ops->is_authenticated(monc->auth))
702 __send_subscribe(monc); 713 __send_subscribe(monc);
703 } 714 }
704 __schedule_delayed(monc); 715 __schedule_delayed(monc);
705 mutex_unlock(&monc->mutex); 716 mutex_unlock(&monc->mutex);
706 } 717 }
707 718
708 /* 719 /*
709 * On startup, we build a temporary monmap populated with the IPs 720 * On startup, we build a temporary monmap populated with the IPs
710 * provided by mount(2). 721 * provided by mount(2).
711 */ 722 */
712 static int build_initial_monmap(struct ceph_mon_client *monc) 723 static int build_initial_monmap(struct ceph_mon_client *monc)
713 { 724 {
714 struct ceph_options *opt = monc->client->options; 725 struct ceph_options *opt = monc->client->options;
715 struct ceph_entity_addr *mon_addr = opt->mon_addr; 726 struct ceph_entity_addr *mon_addr = opt->mon_addr;
716 int num_mon = opt->num_mon; 727 int num_mon = opt->num_mon;
717 int i; 728 int i;
718 729
719 /* build initial monmap */ 730 /* build initial monmap */
720 monc->monmap = kzalloc(sizeof(*monc->monmap) + 731 monc->monmap = kzalloc(sizeof(*monc->monmap) +
721 num_mon*sizeof(monc->monmap->mon_inst[0]), 732 num_mon*sizeof(monc->monmap->mon_inst[0]),
722 GFP_KERNEL); 733 GFP_KERNEL);
723 if (!monc->monmap) 734 if (!monc->monmap)
724 return -ENOMEM; 735 return -ENOMEM;
725 for (i = 0; i < num_mon; i++) { 736 for (i = 0; i < num_mon; i++) {
726 monc->monmap->mon_inst[i].addr = mon_addr[i]; 737 monc->monmap->mon_inst[i].addr = mon_addr[i];
727 monc->monmap->mon_inst[i].addr.nonce = 0; 738 monc->monmap->mon_inst[i].addr.nonce = 0;
728 monc->monmap->mon_inst[i].name.type = 739 monc->monmap->mon_inst[i].name.type =
729 CEPH_ENTITY_TYPE_MON; 740 CEPH_ENTITY_TYPE_MON;
730 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); 741 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
731 } 742 }
732 monc->monmap->num_mon = num_mon; 743 monc->monmap->num_mon = num_mon;
733 monc->have_fsid = false; 744 monc->have_fsid = false;
734 return 0; 745 return 0;
735 } 746 }
736 747
int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
{
	int err = 0;

	dout("init\n");
	memset(monc, 0, sizeof(*monc));
	monc->client = cl;
	monc->monmap = NULL;
	mutex_init(&monc->mutex);

	err = build_initial_monmap(monc);
	if (err)
		goto out;

	/* connection */
	err = -ENOMEM;	/* don't return 0 if the allocation fails */
	monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
	if (!monc->con)
		goto out_monmap;
	ceph_con_init(monc->client->msgr, monc->con);
	monc->con->private = monc;
	monc->con->ops = &mon_con_ops;

	/* authentication */
	monc->auth = ceph_auth_init(cl->options->name,
				    cl->options->key);
	if (IS_ERR(monc->auth)) {
		err = PTR_ERR(monc->auth);
		goto out_con;
	}
	monc->auth->want_keys =
		CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
		CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;

	/* msgs */
	err = -ENOMEM;
	monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
				     sizeof(struct ceph_mon_subscribe_ack),
				     GFP_NOFS, true);
	if (!monc->m_subscribe_ack)
		goto out_auth;

	monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS,
					 true);
	if (!monc->m_subscribe)
		goto out_subscribe_ack;

	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS,
					  true);
	if (!monc->m_auth_reply)
		goto out_subscribe;

	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS, true);
	monc->pending_auth = 0;
	if (!monc->m_auth)
		goto out_auth_reply;

	monc->cur_mon = -1;
	monc->hunting = true;
	monc->sub_renew_after = jiffies;
	monc->sub_sent = 0;

	INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
	monc->generic_request_tree = RB_ROOT;
	monc->num_generic_requests = 0;
	monc->last_tid = 0;

	monc->have_mdsmap = 0;
	monc->have_osdmap = 0;
	monc->want_next_osdmap = 1;
	return 0;

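	/* error paths: undo the setup above in reverse order */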
out_auth_reply:
	ceph_msg_put(monc->m_auth_reply);
out_subscribe:
	ceph_msg_put(monc->m_subscribe);
out_subscribe_ack:
	ceph_msg_put(monc->m_subscribe_ack);
out_auth:
	ceph_auth_destroy(monc->auth);
out_con:
	monc->con->ops->put(monc->con);
out_monmap:
	kfree(monc->monmap);
out:
	return err;
}
EXPORT_SYMBOL(ceph_monc_init);
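
/*
 * A minimal usage sketch (assuming ceph_monc_open_session(), defined
 * elsewhere in this file, is used to kick off the first session):
 *
 *	err = ceph_monc_init(&client->monc, client);
 *	if (err)
 *		return err;
 *	err = ceph_monc_open_session(&client->monc);
 *	if (err)
 *		ceph_monc_stop(&client->monc);
 */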
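/*
 * Tear down a mon_client: cancel the periodic work, close any open
 * session, drop the connection, and release the auth state and the
 * messages preallocated by ceph_monc_init().
 */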
void ceph_monc_stop(struct ceph_mon_client *monc)
{
	dout("stop\n");
	cancel_delayed_work_sync(&monc->delayed_work);

	mutex_lock(&monc->mutex);
	__close_session(monc);

	monc->con->private = NULL;
	monc->con->ops->put(monc->con);
	monc->con = NULL;

	mutex_unlock(&monc->mutex);

	ceph_auth_destroy(monc->auth);

	ceph_msg_put(monc->m_auth);
	ceph_msg_put(monc->m_auth_reply);
	ceph_msg_put(monc->m_subscribe);
	ceph_msg_put(monc->m_subscribe_ack);

	kfree(monc->monmap);
}
EXPORT_SYMBOL(ceph_monc_stop);

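/*
 * Process an authentication reply from the monitor.  A negative
 * result is fatal and wakes anyone waiting on auth_wq; a positive
 * result means another request/reply round trip is needed; on the
 * transition into the authenticated state we adopt our global_id as
 * the client entity name and start the session proper.
 */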
static void handle_auth_reply(struct ceph_mon_client *monc,
			      struct ceph_msg *msg)
{
	int ret;
	int was_auth = 0;

	mutex_lock(&monc->mutex);
	if (monc->auth->ops)
		was_auth = monc->auth->ops->is_authenticated(monc->auth);
	monc->pending_auth = 0;
	ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
				     msg->front.iov_len,
				     monc->m_auth->front.iov_base,
				     monc->m_auth->front_max);
	if (ret < 0) {
		monc->client->auth_err = ret;
		wake_up_all(&monc->client->auth_wq);
	} else if (ret > 0) {
		__send_prepared_auth_request(monc, ret);
	} else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
		dout("authenticated, starting session\n");

		monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
		monc->client->msgr->inst.name.num =
					cpu_to_le64(monc->auth->global_id);

		__send_subscribe(monc);
		__resend_generic_request(monc);
	}
	mutex_unlock(&monc->mutex);
}

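/*
 * (Re)send an auth request if the handshake needs another round trip
 * and none is already pending.  Caller must hold monc->mutex.
 */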
static int __validate_auth(struct ceph_mon_client *monc)
{
	int ret;

	if (monc->pending_auth)
		return 0;

	ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
			      monc->m_auth->front_max);
	if (ret <= 0)
		return ret; /* either an error, or no need to authenticate */
	__send_prepared_auth_request(monc, ret);
	return 0;
}

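/* locked wrapper so other subsystems can revalidate authentication */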
int ceph_monc_validate_auth(struct ceph_mon_client *monc)
{
	int ret;

	mutex_lock(&monc->mutex);
	ret = __validate_auth(monc);
	mutex_unlock(&monc->mutex);
	return ret;
}
EXPORT_SYMBOL(ceph_monc_validate_auth);

/*
 * Handle an incoming message from the monitor: route it by type,
 * giving the chained dispatcher (client->extra_mon_dispatch, e.g. the
 * ceph filesystem's MDS map handler) a chance at anything mon_client
 * does not recognize itself.
 */
static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
	struct ceph_mon_client *monc = con->private;
	int type = le16_to_cpu(msg->hdr.type);

	if (!monc)
		return;

	switch (type) {
	case CEPH_MSG_AUTH_REPLY:
		handle_auth_reply(monc, msg);
		break;

	case CEPH_MSG_MON_SUBSCRIBE_ACK:
		handle_subscribe_ack(monc, msg);
		break;

	case CEPH_MSG_STATFS_REPLY:
		handle_statfs_reply(monc, msg);
		break;

	case CEPH_MSG_POOLOP_REPLY:
		handle_poolop_reply(monc, msg);
		break;

	case CEPH_MSG_MON_MAP:
		ceph_monc_handle_map(monc, msg);
		break;

	case CEPH_MSG_OSD_MAP:
		ceph_osdc_handle_map(&monc->client->osdc, msg);
		break;

	default:
		/* can the chained handler handle it? */
		if (monc->client->extra_mon_dispatch &&
		    monc->client->extra_mon_dispatch(monc->client, msg) == 0)
			break;

		pr_err("received unknown message type %d %s\n", type,
		       ceph_msg_type_name(type));
	}
	ceph_msg_put(msg);
}

/*
 * Allocate memory for an incoming message.  Fixed-size replies reuse
 * the messages preallocated in ceph_monc_init(), statfs/poolop
 * replies are matched to their pending generic request, maps are
 * allocated fresh, and unrecognized types are skipped.
 */
static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
				      struct ceph_msg_header *hdr,
				      int *skip)
{
	struct ceph_mon_client *monc = con->private;
	int type = le16_to_cpu(hdr->type);
	int front_len = le32_to_cpu(hdr->front_len);
	struct ceph_msg *m = NULL;

	*skip = 0;

	switch (type) {
	case CEPH_MSG_MON_SUBSCRIBE_ACK:
		m = ceph_msg_get(monc->m_subscribe_ack);
		break;
	case CEPH_MSG_POOLOP_REPLY:
	case CEPH_MSG_STATFS_REPLY:
		return get_generic_reply(con, hdr, skip);
	case CEPH_MSG_AUTH_REPLY:
		m = ceph_msg_get(monc->m_auth_reply);
		break;
	case CEPH_MSG_MON_MAP:
	case CEPH_MSG_MDS_MAP:
	case CEPH_MSG_OSD_MAP:
		m = ceph_msg_new(type, front_len, GFP_NOFS, false);
		break;
	}

	if (!m) {
		pr_info("alloc_msg unknown type %d\n", type);
		*skip = 1;
	}
	return m;
}

/*
 * If the monitor connection resets, pick a new monitor and resubmit
 * any pending requests.
 */
static void mon_fault(struct ceph_connection *con)
{
	struct ceph_mon_client *monc = con->private;

	if (!monc)
		return;

	dout("mon_fault\n");
	mutex_lock(&monc->mutex);
	if (!con->private)
		goto out;	/* raced with ceph_monc_stop() tearing us down */

	if (!monc->hunting)
		pr_info("mon%d %s session lost, hunting for new mon\n",
			monc->cur_mon,
			ceph_pr_addr(&monc->con->peer_addr.in_addr));

	__close_session(monc);
	if (!monc->hunting) {
		/* start hunting */
		monc->hunting = true;
		__open_session(monc);
	} else {
		/* already hunting, let's wait a bit */
		__schedule_delayed(monc);
	}
out:
	mutex_unlock(&monc->mutex);
}

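/* callbacks the messenger invokes on the monitor connection */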
static const struct ceph_connection_operations mon_con_ops = {
	.get = ceph_con_get,
	.put = ceph_con_put,
	.dispatch = dispatch,
	.fault = mon_fault,
	.alloc_msg = mon_alloc_msg,
};