Commit 6c073a7ee250118b8be3a2379c96fd7f78382b06
Exists in
smarc-l5.0.0_1.0.0-ga
and in
5 other branches
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client:
  rbd: fix safety of rbd_put_client()
  rbd: fix a memory leak in rbd_get_client()
  ceph: create a new session lock to avoid lock inversion
  ceph: fix length validation in parse_reply_info()
  ceph: initialize client debugfs outside of monc->mutex
  ceph: change "ceph.layout" xattr to be "ceph.file.layout"
Showing 8 changed files (inline diff)
drivers/block/rbd.c
1 | /* | 1 | /* |
2 | rbd.c -- Export ceph rados objects as a Linux block device | 2 | rbd.c -- Export ceph rados objects as a Linux block device |
3 | 3 | ||
4 | 4 | ||
5 | based on drivers/block/osdblk.c: | 5 | based on drivers/block/osdblk.c: |
6 | 6 | ||
7 | Copyright 2009 Red Hat, Inc. | 7 | Copyright 2009 Red Hat, Inc. |
8 | 8 | ||
9 | This program is free software; you can redistribute it and/or modify | 9 | This program is free software; you can redistribute it and/or modify |
10 | it under the terms of the GNU General Public License as published by | 10 | it under the terms of the GNU General Public License as published by |
11 | the Free Software Foundation. | 11 | the Free Software Foundation. |
12 | 12 | ||
13 | This program is distributed in the hope that it will be useful, | 13 | This program is distributed in the hope that it will be useful, |
14 | but WITHOUT ANY WARRANTY; without even the implied warranty of | 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | GNU General Public License for more details. | 16 | GNU General Public License for more details. |
17 | 17 | ||
18 | You should have received a copy of the GNU General Public License | 18 | You should have received a copy of the GNU General Public License |
19 | along with this program; see the file COPYING. If not, write to | 19 | along with this program; see the file COPYING. If not, write to |
20 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | 20 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. |
21 | 21 | ||
22 | 22 | ||
23 | 23 | ||
24 | For usage instructions, please refer to: | 24 | For usage instructions, please refer to: |
25 | 25 | ||
26 | Documentation/ABI/testing/sysfs-bus-rbd | 26 | Documentation/ABI/testing/sysfs-bus-rbd |
27 | 27 | ||
28 | */ | 28 | */ |
29 | 29 | ||
30 | #include <linux/ceph/libceph.h> | 30 | #include <linux/ceph/libceph.h> |
31 | #include <linux/ceph/osd_client.h> | 31 | #include <linux/ceph/osd_client.h> |
32 | #include <linux/ceph/mon_client.h> | 32 | #include <linux/ceph/mon_client.h> |
33 | #include <linux/ceph/decode.h> | 33 | #include <linux/ceph/decode.h> |
34 | #include <linux/parser.h> | 34 | #include <linux/parser.h> |
35 | 35 | ||
36 | #include <linux/kernel.h> | 36 | #include <linux/kernel.h> |
37 | #include <linux/device.h> | 37 | #include <linux/device.h> |
38 | #include <linux/module.h> | 38 | #include <linux/module.h> |
39 | #include <linux/fs.h> | 39 | #include <linux/fs.h> |
40 | #include <linux/blkdev.h> | 40 | #include <linux/blkdev.h> |
41 | 41 | ||
42 | #include "rbd_types.h" | 42 | #include "rbd_types.h" |
43 | 43 | ||
44 | #define DRV_NAME "rbd" | 44 | #define DRV_NAME "rbd" |
45 | #define DRV_NAME_LONG "rbd (rados block device)" | 45 | #define DRV_NAME_LONG "rbd (rados block device)" |
46 | 46 | ||
47 | #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ | 47 | #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ |
48 | 48 | ||
49 | #define RBD_MAX_MD_NAME_LEN (96 + sizeof(RBD_SUFFIX)) | 49 | #define RBD_MAX_MD_NAME_LEN (96 + sizeof(RBD_SUFFIX)) |
50 | #define RBD_MAX_POOL_NAME_LEN 64 | 50 | #define RBD_MAX_POOL_NAME_LEN 64 |
51 | #define RBD_MAX_SNAP_NAME_LEN 32 | 51 | #define RBD_MAX_SNAP_NAME_LEN 32 |
52 | #define RBD_MAX_OPT_LEN 1024 | 52 | #define RBD_MAX_OPT_LEN 1024 |
53 | 53 | ||
54 | #define RBD_SNAP_HEAD_NAME "-" | 54 | #define RBD_SNAP_HEAD_NAME "-" |
55 | 55 | ||
56 | #define DEV_NAME_LEN 32 | 56 | #define DEV_NAME_LEN 32 |
57 | 57 | ||
58 | #define RBD_NOTIFY_TIMEOUT_DEFAULT 10 | 58 | #define RBD_NOTIFY_TIMEOUT_DEFAULT 10 |
59 | 59 | ||
60 | /* | 60 | /* |
61 | * block device image metadata (in-memory version) | 61 | * block device image metadata (in-memory version) |
62 | */ | 62 | */ |
63 | struct rbd_image_header { | 63 | struct rbd_image_header { |
64 | u64 image_size; | 64 | u64 image_size; |
65 | char block_name[32]; | 65 | char block_name[32]; |
66 | __u8 obj_order; | 66 | __u8 obj_order; |
67 | __u8 crypt_type; | 67 | __u8 crypt_type; |
68 | __u8 comp_type; | 68 | __u8 comp_type; |
69 | struct rw_semaphore snap_rwsem; | 69 | struct rw_semaphore snap_rwsem; |
70 | struct ceph_snap_context *snapc; | 70 | struct ceph_snap_context *snapc; |
71 | size_t snap_names_len; | 71 | size_t snap_names_len; |
72 | u64 snap_seq; | 72 | u64 snap_seq; |
73 | u32 total_snaps; | 73 | u32 total_snaps; |
74 | 74 | ||
75 | char *snap_names; | 75 | char *snap_names; |
76 | u64 *snap_sizes; | 76 | u64 *snap_sizes; |
77 | 77 | ||
78 | u64 obj_version; | 78 | u64 obj_version; |
79 | }; | 79 | }; |
80 | 80 | ||
81 | struct rbd_options { | 81 | struct rbd_options { |
82 | int notify_timeout; | 82 | int notify_timeout; |
83 | }; | 83 | }; |
84 | 84 | ||
85 | /* | 85 | /* |
86 | * an instance of the client. multiple devices may share a client. | 86 | * an instance of the client. multiple devices may share a client. |
87 | */ | 87 | */ |
88 | struct rbd_client { | 88 | struct rbd_client { |
89 | struct ceph_client *client; | 89 | struct ceph_client *client; |
90 | struct rbd_options *rbd_opts; | 90 | struct rbd_options *rbd_opts; |
91 | struct kref kref; | 91 | struct kref kref; |
92 | struct list_head node; | 92 | struct list_head node; |
93 | }; | 93 | }; |
94 | 94 | ||
95 | struct rbd_req_coll; | 95 | struct rbd_req_coll; |
96 | 96 | ||
97 | /* | 97 | /* |
98 | * a single io request | 98 | * a single io request |
99 | */ | 99 | */ |
100 | struct rbd_request { | 100 | struct rbd_request { |
101 | struct request *rq; /* blk layer request */ | 101 | struct request *rq; /* blk layer request */ |
102 | struct bio *bio; /* cloned bio */ | 102 | struct bio *bio; /* cloned bio */ |
103 | struct page **pages; /* list of used pages */ | 103 | struct page **pages; /* list of used pages */ |
104 | u64 len; | 104 | u64 len; |
105 | int coll_index; | 105 | int coll_index; |
106 | struct rbd_req_coll *coll; | 106 | struct rbd_req_coll *coll; |
107 | }; | 107 | }; |
108 | 108 | ||
109 | struct rbd_req_status { | 109 | struct rbd_req_status { |
110 | int done; | 110 | int done; |
111 | int rc; | 111 | int rc; |
112 | u64 bytes; | 112 | u64 bytes; |
113 | }; | 113 | }; |
114 | 114 | ||
115 | /* | 115 | /* |
116 | * a collection of requests | 116 | * a collection of requests |
117 | */ | 117 | */ |
118 | struct rbd_req_coll { | 118 | struct rbd_req_coll { |
119 | int total; | 119 | int total; |
120 | int num_done; | 120 | int num_done; |
121 | struct kref kref; | 121 | struct kref kref; |
122 | struct rbd_req_status status[0]; | 122 | struct rbd_req_status status[0]; |
123 | }; | 123 | }; |
124 | 124 | ||
125 | struct rbd_snap { | 125 | struct rbd_snap { |
126 | struct device dev; | 126 | struct device dev; |
127 | const char *name; | 127 | const char *name; |
128 | size_t size; | 128 | size_t size; |
129 | struct list_head node; | 129 | struct list_head node; |
130 | u64 id; | 130 | u64 id; |
131 | }; | 131 | }; |
132 | 132 | ||
133 | /* | 133 | /* |
134 | * a single device | 134 | * a single device |
135 | */ | 135 | */ |
136 | struct rbd_device { | 136 | struct rbd_device { |
137 | int id; /* blkdev unique id */ | 137 | int id; /* blkdev unique id */ |
138 | 138 | ||
139 | int major; /* blkdev assigned major */ | 139 | int major; /* blkdev assigned major */ |
140 | struct gendisk *disk; /* blkdev's gendisk and rq */ | 140 | struct gendisk *disk; /* blkdev's gendisk and rq */ |
141 | struct request_queue *q; | 141 | struct request_queue *q; |
142 | 142 | ||
143 | struct ceph_client *client; | 143 | struct ceph_client *client; |
144 | struct rbd_client *rbd_client; | 144 | struct rbd_client *rbd_client; |
145 | 145 | ||
146 | char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ | 146 | char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ |
147 | 147 | ||
148 | spinlock_t lock; /* queue lock */ | 148 | spinlock_t lock; /* queue lock */ |
149 | 149 | ||
150 | struct rbd_image_header header; | 150 | struct rbd_image_header header; |
151 | char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */ | 151 | char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */ |
152 | int obj_len; | 152 | int obj_len; |
153 | char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */ | 153 | char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */ |
154 | char pool_name[RBD_MAX_POOL_NAME_LEN]; | 154 | char pool_name[RBD_MAX_POOL_NAME_LEN]; |
155 | int poolid; | 155 | int poolid; |
156 | 156 | ||
157 | struct ceph_osd_event *watch_event; | 157 | struct ceph_osd_event *watch_event; |
158 | struct ceph_osd_request *watch_request; | 158 | struct ceph_osd_request *watch_request; |
159 | 159 | ||
160 | char snap_name[RBD_MAX_SNAP_NAME_LEN]; | 160 | char snap_name[RBD_MAX_SNAP_NAME_LEN]; |
161 | u32 cur_snap; /* index+1 of current snapshot within snap context | 161 | u32 cur_snap; /* index+1 of current snapshot within snap context |
162 | 0 - for the head */ | 162 | 0 - for the head */ |
163 | int read_only; | 163 | int read_only; |
164 | 164 | ||
165 | struct list_head node; | 165 | struct list_head node; |
166 | 166 | ||
167 | /* list of snapshots */ | 167 | /* list of snapshots */ |
168 | struct list_head snaps; | 168 | struct list_head snaps; |
169 | 169 | ||
170 | /* sysfs related */ | 170 | /* sysfs related */ |
171 | struct device dev; | 171 | struct device dev; |
172 | }; | 172 | }; |
173 | 173 | ||
174 | static struct bus_type rbd_bus_type = { | 174 | static struct bus_type rbd_bus_type = { |
175 | .name = "rbd", | 175 | .name = "rbd", |
176 | }; | 176 | }; |
177 | 177 | ||
178 | static spinlock_t node_lock; /* protects client get/put */ | 178 | static spinlock_t node_lock; /* protects client get/put */ |
179 | 179 | ||
180 | static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ | 180 | static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ |
181 | static LIST_HEAD(rbd_dev_list); /* devices */ | 181 | static LIST_HEAD(rbd_dev_list); /* devices */ |
182 | static LIST_HEAD(rbd_client_list); /* clients */ | 182 | static LIST_HEAD(rbd_client_list); /* clients */ |
183 | 183 | ||
184 | static int __rbd_init_snaps_header(struct rbd_device *rbd_dev); | 184 | static int __rbd_init_snaps_header(struct rbd_device *rbd_dev); |
185 | static void rbd_dev_release(struct device *dev); | 185 | static void rbd_dev_release(struct device *dev); |
186 | static ssize_t rbd_snap_add(struct device *dev, | 186 | static ssize_t rbd_snap_add(struct device *dev, |
187 | struct device_attribute *attr, | 187 | struct device_attribute *attr, |
188 | const char *buf, | 188 | const char *buf, |
189 | size_t count); | 189 | size_t count); |
190 | static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev, | 190 | static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev, |
191 | struct rbd_snap *snap); | 191 | struct rbd_snap *snap); |
192 | 192 | ||
193 | 193 | ||
194 | static struct rbd_device *dev_to_rbd(struct device *dev) | 194 | static struct rbd_device *dev_to_rbd(struct device *dev) |
195 | { | 195 | { |
196 | return container_of(dev, struct rbd_device, dev); | 196 | return container_of(dev, struct rbd_device, dev); |
197 | } | 197 | } |
198 | 198 | ||
199 | static struct device *rbd_get_dev(struct rbd_device *rbd_dev) | 199 | static struct device *rbd_get_dev(struct rbd_device *rbd_dev) |
200 | { | 200 | { |
201 | return get_device(&rbd_dev->dev); | 201 | return get_device(&rbd_dev->dev); |
202 | } | 202 | } |
203 | 203 | ||
204 | static void rbd_put_dev(struct rbd_device *rbd_dev) | 204 | static void rbd_put_dev(struct rbd_device *rbd_dev) |
205 | { | 205 | { |
206 | put_device(&rbd_dev->dev); | 206 | put_device(&rbd_dev->dev); |
207 | } | 207 | } |
208 | 208 | ||
209 | static int __rbd_update_snaps(struct rbd_device *rbd_dev); | 209 | static int __rbd_update_snaps(struct rbd_device *rbd_dev); |
210 | 210 | ||
211 | static int rbd_open(struct block_device *bdev, fmode_t mode) | 211 | static int rbd_open(struct block_device *bdev, fmode_t mode) |
212 | { | 212 | { |
213 | struct gendisk *disk = bdev->bd_disk; | 213 | struct gendisk *disk = bdev->bd_disk; |
214 | struct rbd_device *rbd_dev = disk->private_data; | 214 | struct rbd_device *rbd_dev = disk->private_data; |
215 | 215 | ||
216 | rbd_get_dev(rbd_dev); | 216 | rbd_get_dev(rbd_dev); |
217 | 217 | ||
218 | set_device_ro(bdev, rbd_dev->read_only); | 218 | set_device_ro(bdev, rbd_dev->read_only); |
219 | 219 | ||
220 | if ((mode & FMODE_WRITE) && rbd_dev->read_only) | 220 | if ((mode & FMODE_WRITE) && rbd_dev->read_only) |
221 | return -EROFS; | 221 | return -EROFS; |
222 | 222 | ||
223 | return 0; | 223 | return 0; |
224 | } | 224 | } |
225 | 225 | ||
226 | static int rbd_release(struct gendisk *disk, fmode_t mode) | 226 | static int rbd_release(struct gendisk *disk, fmode_t mode) |
227 | { | 227 | { |
228 | struct rbd_device *rbd_dev = disk->private_data; | 228 | struct rbd_device *rbd_dev = disk->private_data; |
229 | 229 | ||
230 | rbd_put_dev(rbd_dev); | 230 | rbd_put_dev(rbd_dev); |
231 | 231 | ||
232 | return 0; | 232 | return 0; |
233 | } | 233 | } |
234 | 234 | ||
235 | static const struct block_device_operations rbd_bd_ops = { | 235 | static const struct block_device_operations rbd_bd_ops = { |
236 | .owner = THIS_MODULE, | 236 | .owner = THIS_MODULE, |
237 | .open = rbd_open, | 237 | .open = rbd_open, |
238 | .release = rbd_release, | 238 | .release = rbd_release, |
239 | }; | 239 | }; |
240 | 240 | ||
241 | /* | 241 | /* |
242 | * Initialize an rbd client instance. | 242 | * Initialize an rbd client instance. |
243 | * We own *opt. | 243 | * We own *opt. |
244 | */ | 244 | */ |
245 | static struct rbd_client *rbd_client_create(struct ceph_options *opt, | 245 | static struct rbd_client *rbd_client_create(struct ceph_options *opt, |
246 | struct rbd_options *rbd_opts) | 246 | struct rbd_options *rbd_opts) |
247 | { | 247 | { |
248 | struct rbd_client *rbdc; | 248 | struct rbd_client *rbdc; |
249 | int ret = -ENOMEM; | 249 | int ret = -ENOMEM; |
250 | 250 | ||
251 | dout("rbd_client_create\n"); | 251 | dout("rbd_client_create\n"); |
252 | rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); | 252 | rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); |
253 | if (!rbdc) | 253 | if (!rbdc) |
254 | goto out_opt; | 254 | goto out_opt; |
255 | 255 | ||
256 | kref_init(&rbdc->kref); | 256 | kref_init(&rbdc->kref); |
257 | INIT_LIST_HEAD(&rbdc->node); | 257 | INIT_LIST_HEAD(&rbdc->node); |
258 | 258 | ||
259 | rbdc->client = ceph_create_client(opt, rbdc, 0, 0); | 259 | rbdc->client = ceph_create_client(opt, rbdc, 0, 0); |
260 | if (IS_ERR(rbdc->client)) | 260 | if (IS_ERR(rbdc->client)) |
261 | goto out_rbdc; | 261 | goto out_rbdc; |
262 | opt = NULL; /* Now rbdc->client is responsible for opt */ | 262 | opt = NULL; /* Now rbdc->client is responsible for opt */ |
263 | 263 | ||
264 | ret = ceph_open_session(rbdc->client); | 264 | ret = ceph_open_session(rbdc->client); |
265 | if (ret < 0) | 265 | if (ret < 0) |
266 | goto out_err; | 266 | goto out_err; |
267 | 267 | ||
268 | rbdc->rbd_opts = rbd_opts; | 268 | rbdc->rbd_opts = rbd_opts; |
269 | 269 | ||
270 | spin_lock(&node_lock); | 270 | spin_lock(&node_lock); |
271 | list_add_tail(&rbdc->node, &rbd_client_list); | 271 | list_add_tail(&rbdc->node, &rbd_client_list); |
272 | spin_unlock(&node_lock); | 272 | spin_unlock(&node_lock); |
273 | 273 | ||
274 | dout("rbd_client_create created %p\n", rbdc); | 274 | dout("rbd_client_create created %p\n", rbdc); |
275 | return rbdc; | 275 | return rbdc; |
276 | 276 | ||
277 | out_err: | 277 | out_err: |
278 | ceph_destroy_client(rbdc->client); | 278 | ceph_destroy_client(rbdc->client); |
279 | out_rbdc: | 279 | out_rbdc: |
280 | kfree(rbdc); | 280 | kfree(rbdc); |
281 | out_opt: | 281 | out_opt: |
282 | if (opt) | 282 | if (opt) |
283 | ceph_destroy_options(opt); | 283 | ceph_destroy_options(opt); |
284 | return ERR_PTR(ret); | 284 | return ERR_PTR(ret); |
285 | } | 285 | } |
286 | 286 | ||
287 | /* | 287 | /* |
288 | * Find a ceph client with specific addr and configuration. | 288 | * Find a ceph client with specific addr and configuration. |
289 | */ | 289 | */ |
290 | static struct rbd_client *__rbd_client_find(struct ceph_options *opt) | 290 | static struct rbd_client *__rbd_client_find(struct ceph_options *opt) |
291 | { | 291 | { |
292 | struct rbd_client *client_node; | 292 | struct rbd_client *client_node; |
293 | 293 | ||
294 | if (opt->flags & CEPH_OPT_NOSHARE) | 294 | if (opt->flags & CEPH_OPT_NOSHARE) |
295 | return NULL; | 295 | return NULL; |
296 | 296 | ||
297 | list_for_each_entry(client_node, &rbd_client_list, node) | 297 | list_for_each_entry(client_node, &rbd_client_list, node) |
298 | if (ceph_compare_options(opt, client_node->client) == 0) | 298 | if (ceph_compare_options(opt, client_node->client) == 0) |
299 | return client_node; | 299 | return client_node; |
300 | return NULL; | 300 | return NULL; |
301 | } | 301 | } |
302 | 302 | ||
303 | /* | 303 | /* |
304 | * mount options | 304 | * mount options |
305 | */ | 305 | */ |
306 | enum { | 306 | enum { |
307 | Opt_notify_timeout, | 307 | Opt_notify_timeout, |
308 | Opt_last_int, | 308 | Opt_last_int, |
309 | /* int args above */ | 309 | /* int args above */ |
310 | Opt_last_string, | 310 | Opt_last_string, |
311 | /* string args above */ | 311 | /* string args above */ |
312 | }; | 312 | }; |
313 | 313 | ||
314 | static match_table_t rbdopt_tokens = { | 314 | static match_table_t rbdopt_tokens = { |
315 | {Opt_notify_timeout, "notify_timeout=%d"}, | 315 | {Opt_notify_timeout, "notify_timeout=%d"}, |
316 | /* int args above */ | 316 | /* int args above */ |
317 | /* string args above */ | 317 | /* string args above */ |
318 | {-1, NULL} | 318 | {-1, NULL} |
319 | }; | 319 | }; |
320 | 320 | ||
321 | static int parse_rbd_opts_token(char *c, void *private) | 321 | static int parse_rbd_opts_token(char *c, void *private) |
322 | { | 322 | { |
323 | struct rbd_options *rbdopt = private; | 323 | struct rbd_options *rbdopt = private; |
324 | substring_t argstr[MAX_OPT_ARGS]; | 324 | substring_t argstr[MAX_OPT_ARGS]; |
325 | int token, intval, ret; | 325 | int token, intval, ret; |
326 | 326 | ||
327 | token = match_token((char *)c, rbdopt_tokens, argstr); | 327 | token = match_token((char *)c, rbdopt_tokens, argstr); |
328 | if (token < 0) | 328 | if (token < 0) |
329 | return -EINVAL; | 329 | return -EINVAL; |
330 | 330 | ||
331 | if (token < Opt_last_int) { | 331 | if (token < Opt_last_int) { |
332 | ret = match_int(&argstr[0], &intval); | 332 | ret = match_int(&argstr[0], &intval); |
333 | if (ret < 0) { | 333 | if (ret < 0) { |
334 | pr_err("bad mount option arg (not int) " | 334 | pr_err("bad mount option arg (not int) " |
335 | "at '%s'\n", c); | 335 | "at '%s'\n", c); |
336 | return ret; | 336 | return ret; |
337 | } | 337 | } |
338 | dout("got int token %d val %d\n", token, intval); | 338 | dout("got int token %d val %d\n", token, intval); |
339 | } else if (token > Opt_last_int && token < Opt_last_string) { | 339 | } else if (token > Opt_last_int && token < Opt_last_string) { |
340 | dout("got string token %d val %s\n", token, | 340 | dout("got string token %d val %s\n", token, |
341 | argstr[0].from); | 341 | argstr[0].from); |
342 | } else { | 342 | } else { |
343 | dout("got token %d\n", token); | 343 | dout("got token %d\n", token); |
344 | } | 344 | } |
345 | 345 | ||
346 | switch (token) { | 346 | switch (token) { |
347 | case Opt_notify_timeout: | 347 | case Opt_notify_timeout: |
348 | rbdopt->notify_timeout = intval; | 348 | rbdopt->notify_timeout = intval; |
349 | break; | 349 | break; |
350 | default: | 350 | default: |
351 | BUG_ON(token); | 351 | BUG_ON(token); |
352 | } | 352 | } |
353 | return 0; | 353 | return 0; |
354 | } | 354 | } |
355 | 355 | ||
356 | /* | 356 | /* |
357 | * Get a ceph client with specific addr and configuration, if one does | 357 | * Get a ceph client with specific addr and configuration, if one does |
358 | * not exist create it. | 358 | * not exist create it. |
359 | */ | 359 | */ |
360 | static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, | 360 | static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, |
361 | char *options) | 361 | char *options) |
362 | { | 362 | { |
363 | struct rbd_client *rbdc; | 363 | struct rbd_client *rbdc; |
364 | struct ceph_options *opt; | 364 | struct ceph_options *opt; |
365 | int ret; | 365 | int ret; |
366 | struct rbd_options *rbd_opts; | 366 | struct rbd_options *rbd_opts; |
367 | 367 | ||
368 | rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL); | 368 | rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL); |
369 | if (!rbd_opts) | 369 | if (!rbd_opts) |
370 | return -ENOMEM; | 370 | return -ENOMEM; |
371 | 371 | ||
372 | rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; | 372 | rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; |
373 | 373 | ||
374 | ret = ceph_parse_options(&opt, options, mon_addr, | 374 | ret = ceph_parse_options(&opt, options, mon_addr, |
375 | mon_addr + strlen(mon_addr), parse_rbd_opts_token, rbd_opts); | 375 | mon_addr + strlen(mon_addr), parse_rbd_opts_token, rbd_opts); |
376 | if (ret < 0) | 376 | if (ret < 0) |
377 | goto done_err; | 377 | goto done_err; |
378 | 378 | ||
379 | spin_lock(&node_lock); | 379 | spin_lock(&node_lock); |
380 | rbdc = __rbd_client_find(opt); | 380 | rbdc = __rbd_client_find(opt); |
381 | if (rbdc) { | 381 | if (rbdc) { |
382 | ceph_destroy_options(opt); | 382 | ceph_destroy_options(opt); |
383 | kfree(rbd_opts); | ||
383 | 384 | ||
384 | /* using an existing client */ | 385 | /* using an existing client */ |
385 | kref_get(&rbdc->kref); | 386 | kref_get(&rbdc->kref); |
386 | rbd_dev->rbd_client = rbdc; | 387 | rbd_dev->rbd_client = rbdc; |
387 | rbd_dev->client = rbdc->client; | 388 | rbd_dev->client = rbdc->client; |
388 | spin_unlock(&node_lock); | 389 | spin_unlock(&node_lock); |
389 | return 0; | 390 | return 0; |
390 | } | 391 | } |
391 | spin_unlock(&node_lock); | 392 | spin_unlock(&node_lock); |
392 | 393 | ||
393 | rbdc = rbd_client_create(opt, rbd_opts); | 394 | rbdc = rbd_client_create(opt, rbd_opts); |
394 | if (IS_ERR(rbdc)) { | 395 | if (IS_ERR(rbdc)) { |
395 | ret = PTR_ERR(rbdc); | 396 | ret = PTR_ERR(rbdc); |
396 | goto done_err; | 397 | goto done_err; |
397 | } | 398 | } |
398 | 399 | ||
399 | rbd_dev->rbd_client = rbdc; | 400 | rbd_dev->rbd_client = rbdc; |
400 | rbd_dev->client = rbdc->client; | 401 | rbd_dev->client = rbdc->client; |
401 | return 0; | 402 | return 0; |
402 | done_err: | 403 | done_err: |
403 | kfree(rbd_opts); | 404 | kfree(rbd_opts); |
404 | return ret; | 405 | return ret; |
405 | } | 406 | } |
406 | 407 | ||
407 | /* | 408 | /* |
408 | * Destroy ceph client | 409 | * Destroy ceph client |
410 | * | ||
411 | * Caller must hold node_lock. | ||
409 | */ | 412 | */ |
410 | static void rbd_client_release(struct kref *kref) | 413 | static void rbd_client_release(struct kref *kref) |
411 | { | 414 | { |
412 | struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); | 415 | struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); |
413 | 416 | ||
414 | dout("rbd_release_client %p\n", rbdc); | 417 | dout("rbd_release_client %p\n", rbdc); |
415 | spin_lock(&node_lock); | ||
416 | list_del(&rbdc->node); | 418 | list_del(&rbdc->node); |
417 | spin_unlock(&node_lock); | ||
418 | 419 | ||
419 | ceph_destroy_client(rbdc->client); | 420 | ceph_destroy_client(rbdc->client); |
420 | kfree(rbdc->rbd_opts); | 421 | kfree(rbdc->rbd_opts); |
421 | kfree(rbdc); | 422 | kfree(rbdc); |
422 | } | 423 | } |
423 | 424 | ||
424 | /* | 425 | /* |
425 | * Drop reference to ceph client node. If it's not referenced anymore, release | 426 | * Drop reference to ceph client node. If it's not referenced anymore, release |
426 | * it. | 427 | * it. |
427 | */ | 428 | */ |
428 | static void rbd_put_client(struct rbd_device *rbd_dev) | 429 | static void rbd_put_client(struct rbd_device *rbd_dev) |
429 | { | 430 | { |
431 | spin_lock(&node_lock); | ||
430 | kref_put(&rbd_dev->rbd_client->kref, rbd_client_release); | 432 | kref_put(&rbd_dev->rbd_client->kref, rbd_client_release); |
433 | spin_unlock(&node_lock); | ||
431 | rbd_dev->rbd_client = NULL; | 434 | rbd_dev->rbd_client = NULL; |
432 | rbd_dev->client = NULL; | 435 | rbd_dev->client = NULL; |
433 | } | 436 | } |
434 | 437 | ||
435 | /* | 438 | /* |
436 | * Destroy requests collection | 439 | * Destroy requests collection |
437 | */ | 440 | */ |
438 | static void rbd_coll_release(struct kref *kref) | 441 | static void rbd_coll_release(struct kref *kref) |
439 | { | 442 | { |
440 | struct rbd_req_coll *coll = | 443 | struct rbd_req_coll *coll = |
441 | container_of(kref, struct rbd_req_coll, kref); | 444 | container_of(kref, struct rbd_req_coll, kref); |
442 | 445 | ||
443 | dout("rbd_coll_release %p\n", coll); | 446 | dout("rbd_coll_release %p\n", coll); |
444 | kfree(coll); | 447 | kfree(coll); |
445 | } | 448 | } |
446 | 449 | ||
447 | /* | 450 | /* |
448 | * Create a new header structure, translate header format from the on-disk | 451 | * Create a new header structure, translate header format from the on-disk |
449 | * header. | 452 | * header. |
450 | */ | 453 | */ |
451 | static int rbd_header_from_disk(struct rbd_image_header *header, | 454 | static int rbd_header_from_disk(struct rbd_image_header *header, |
452 | struct rbd_image_header_ondisk *ondisk, | 455 | struct rbd_image_header_ondisk *ondisk, |
453 | int allocated_snaps, | 456 | int allocated_snaps, |
454 | gfp_t gfp_flags) | 457 | gfp_t gfp_flags) |
455 | { | 458 | { |
456 | int i; | 459 | int i; |
457 | u32 snap_count = le32_to_cpu(ondisk->snap_count); | 460 | u32 snap_count = le32_to_cpu(ondisk->snap_count); |
458 | int ret = -ENOMEM; | 461 | int ret = -ENOMEM; |
459 | 462 | ||
460 | if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT))) { | 463 | if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT))) { |
461 | return -ENXIO; | 464 | return -ENXIO; |
462 | } | 465 | } |
463 | 466 | ||
464 | init_rwsem(&header->snap_rwsem); | 467 | init_rwsem(&header->snap_rwsem); |
465 | header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); | 468 | header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); |
466 | header->snapc = kmalloc(sizeof(struct ceph_snap_context) + | 469 | header->snapc = kmalloc(sizeof(struct ceph_snap_context) + |
467 | snap_count * | 470 | snap_count * |
468 | sizeof(struct rbd_image_snap_ondisk), | 471 | sizeof(struct rbd_image_snap_ondisk), |
469 | gfp_flags); | 472 | gfp_flags); |
470 | if (!header->snapc) | 473 | if (!header->snapc) |
471 | return -ENOMEM; | 474 | return -ENOMEM; |
472 | if (snap_count) { | 475 | if (snap_count) { |
473 | header->snap_names = kmalloc(header->snap_names_len, | 476 | header->snap_names = kmalloc(header->snap_names_len, |
474 | GFP_KERNEL); | 477 | GFP_KERNEL); |
475 | if (!header->snap_names) | 478 | if (!header->snap_names) |
476 | goto err_snapc; | 479 | goto err_snapc; |
477 | header->snap_sizes = kmalloc(snap_count * sizeof(u64), | 480 | header->snap_sizes = kmalloc(snap_count * sizeof(u64), |
478 | GFP_KERNEL); | 481 | GFP_KERNEL); |
479 | if (!header->snap_sizes) | 482 | if (!header->snap_sizes) |
480 | goto err_names; | 483 | goto err_names; |
481 | } else { | 484 | } else { |
482 | header->snap_names = NULL; | 485 | header->snap_names = NULL; |
483 | header->snap_sizes = NULL; | 486 | header->snap_sizes = NULL; |
484 | } | 487 | } |
485 | memcpy(header->block_name, ondisk->block_name, | 488 | memcpy(header->block_name, ondisk->block_name, |
486 | sizeof(ondisk->block_name)); | 489 | sizeof(ondisk->block_name)); |
487 | 490 | ||
488 | header->image_size = le64_to_cpu(ondisk->image_size); | 491 | header->image_size = le64_to_cpu(ondisk->image_size); |
489 | header->obj_order = ondisk->options.order; | 492 | header->obj_order = ondisk->options.order; |
490 | header->crypt_type = ondisk->options.crypt_type; | 493 | header->crypt_type = ondisk->options.crypt_type; |
491 | header->comp_type = ondisk->options.comp_type; | 494 | header->comp_type = ondisk->options.comp_type; |
492 | 495 | ||
493 | atomic_set(&header->snapc->nref, 1); | 496 | atomic_set(&header->snapc->nref, 1); |
494 | header->snap_seq = le64_to_cpu(ondisk->snap_seq); | 497 | header->snap_seq = le64_to_cpu(ondisk->snap_seq); |
495 | header->snapc->num_snaps = snap_count; | 498 | header->snapc->num_snaps = snap_count; |
496 | header->total_snaps = snap_count; | 499 | header->total_snaps = snap_count; |
497 | 500 | ||
498 | if (snap_count && | 501 | if (snap_count && |
499 | allocated_snaps == snap_count) { | 502 | allocated_snaps == snap_count) { |
500 | for (i = 0; i < snap_count; i++) { | 503 | for (i = 0; i < snap_count; i++) { |
501 | header->snapc->snaps[i] = | 504 | header->snapc->snaps[i] = |
502 | le64_to_cpu(ondisk->snaps[i].id); | 505 | le64_to_cpu(ondisk->snaps[i].id); |
503 | header->snap_sizes[i] = | 506 | header->snap_sizes[i] = |
504 | le64_to_cpu(ondisk->snaps[i].image_size); | 507 | le64_to_cpu(ondisk->snaps[i].image_size); |
505 | } | 508 | } |
506 | 509 | ||
507 | /* copy snapshot names */ | 510 | /* copy snapshot names */ |
508 | memcpy(header->snap_names, &ondisk->snaps[i], | 511 | memcpy(header->snap_names, &ondisk->snaps[i], |
509 | header->snap_names_len); | 512 | header->snap_names_len); |
510 | } | 513 | } |
511 | 514 | ||
512 | return 0; | 515 | return 0; |
513 | 516 | ||
514 | err_names: | 517 | err_names: |
515 | kfree(header->snap_names); | 518 | kfree(header->snap_names); |
516 | err_snapc: | 519 | err_snapc: |
517 | kfree(header->snapc); | 520 | kfree(header->snapc); |
518 | return ret; | 521 | return ret; |
519 | } | 522 | } |
520 | 523 | ||
521 | static int snap_index(struct rbd_image_header *header, int snap_num) | 524 | static int snap_index(struct rbd_image_header *header, int snap_num) |
522 | { | 525 | { |
523 | return header->total_snaps - snap_num; | 526 | return header->total_snaps - snap_num; |
524 | } | 527 | } |
525 | 528 | ||
526 | static u64 cur_snap_id(struct rbd_device *rbd_dev) | 529 | static u64 cur_snap_id(struct rbd_device *rbd_dev) |
527 | { | 530 | { |
528 | struct rbd_image_header *header = &rbd_dev->header; | 531 | struct rbd_image_header *header = &rbd_dev->header; |
529 | 532 | ||
530 | if (!rbd_dev->cur_snap) | 533 | if (!rbd_dev->cur_snap) |
531 | return 0; | 534 | return 0; |
532 | 535 | ||
533 | return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)]; | 536 | return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)]; |
534 | } | 537 | } |
535 | 538 | ||
536 | static int snap_by_name(struct rbd_image_header *header, const char *snap_name, | 539 | static int snap_by_name(struct rbd_image_header *header, const char *snap_name, |
537 | u64 *seq, u64 *size) | 540 | u64 *seq, u64 *size) |
538 | { | 541 | { |
539 | int i; | 542 | int i; |
540 | char *p = header->snap_names; | 543 | char *p = header->snap_names; |
541 | 544 | ||
542 | for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) { | 545 | for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) { |
543 | if (strcmp(snap_name, p) == 0) | 546 | if (strcmp(snap_name, p) == 0) |
544 | break; | 547 | break; |
545 | } | 548 | } |
546 | if (i == header->total_snaps) | 549 | if (i == header->total_snaps) |
547 | return -ENOENT; | 550 | return -ENOENT; |
548 | if (seq) | 551 | if (seq) |
549 | *seq = header->snapc->snaps[i]; | 552 | *seq = header->snapc->snaps[i]; |
550 | 553 | ||
551 | if (size) | 554 | if (size) |
552 | *size = header->snap_sizes[i]; | 555 | *size = header->snap_sizes[i]; |
553 | 556 | ||
554 | return i; | 557 | return i; |
555 | } | 558 | } |
556 | 559 | ||
/*
 * Select which snapshot (or the live head) the device maps.
 *
 * An empty name, "-", or RBD_SNAP_HEAD_NAME selects the head image and
 * makes the device writable; any other name is looked up and, if found,
 * the device becomes a read-only view of that snapshot.  On success,
 * *size (when non-NULL) receives the size of the selected image.
 * Returns 0 on success or -ENOENT if the named snapshot doesn't exist.
 *
 * Takes snap_rwsem for write since it updates snapc->seq and the
 * device's cur_snap/read_only state together.
 */
static int rbd_header_set_snap(struct rbd_device *dev,
			       const char *snap_name,
			       u64 *size)
{
	struct rbd_image_header *header = &dev->header;
	struct ceph_snap_context *snapc = header->snapc;
	int ret = -ENOENT;

	down_write(&header->snap_rwsem);

	if (!snap_name ||
	    !*snap_name ||
	    strcmp(snap_name, "-") == 0 ||
	    strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) {
		/* head image: writes use the latest snap_seq (0 if none) */
		if (header->total_snaps)
			snapc->seq = header->snap_seq;
		else
			snapc->seq = 0;
		dev->cur_snap = 0;
		dev->read_only = 0;
		if (size)
			*size = header->image_size;
	} else {
		ret = snap_by_name(header, snap_name, &snapc->seq, size);
		if (ret < 0)
			goto done;

		/* store as 1-based count from the newest snapshot */
		dev->cur_snap = header->total_snaps - ret;
		dev->read_only = 1;	/* snapshots are immutable */
	}

	ret = 0;
done:
	up_write(&header->snap_rwsem);
	return ret;
}
593 | 596 | ||
594 | static void rbd_header_free(struct rbd_image_header *header) | 597 | static void rbd_header_free(struct rbd_image_header *header) |
595 | { | 598 | { |
596 | kfree(header->snapc); | 599 | kfree(header->snapc); |
597 | kfree(header->snap_names); | 600 | kfree(header->snap_names); |
598 | kfree(header->snap_sizes); | 601 | kfree(header->snap_sizes); |
599 | } | 602 | } |
600 | 603 | ||
601 | /* | 604 | /* |
602 | * get the actual striped segment name, offset and length | 605 | * get the actual striped segment name, offset and length |
603 | */ | 606 | */ |
604 | static u64 rbd_get_segment(struct rbd_image_header *header, | 607 | static u64 rbd_get_segment(struct rbd_image_header *header, |
605 | const char *block_name, | 608 | const char *block_name, |
606 | u64 ofs, u64 len, | 609 | u64 ofs, u64 len, |
607 | char *seg_name, u64 *segofs) | 610 | char *seg_name, u64 *segofs) |
608 | { | 611 | { |
609 | u64 seg = ofs >> header->obj_order; | 612 | u64 seg = ofs >> header->obj_order; |
610 | 613 | ||
611 | if (seg_name) | 614 | if (seg_name) |
612 | snprintf(seg_name, RBD_MAX_SEG_NAME_LEN, | 615 | snprintf(seg_name, RBD_MAX_SEG_NAME_LEN, |
613 | "%s.%012llx", block_name, seg); | 616 | "%s.%012llx", block_name, seg); |
614 | 617 | ||
615 | ofs = ofs & ((1 << header->obj_order) - 1); | 618 | ofs = ofs & ((1 << header->obj_order) - 1); |
616 | len = min_t(u64, len, (1 << header->obj_order) - ofs); | 619 | len = min_t(u64, len, (1 << header->obj_order) - ofs); |
617 | 620 | ||
618 | if (segofs) | 621 | if (segofs) |
619 | *segofs = ofs; | 622 | *segofs = ofs; |
620 | 623 | ||
621 | return len; | 624 | return len; |
622 | } | 625 | } |
623 | 626 | ||
624 | static int rbd_get_num_segments(struct rbd_image_header *header, | 627 | static int rbd_get_num_segments(struct rbd_image_header *header, |
625 | u64 ofs, u64 len) | 628 | u64 ofs, u64 len) |
626 | { | 629 | { |
627 | u64 start_seg = ofs >> header->obj_order; | 630 | u64 start_seg = ofs >> header->obj_order; |
628 | u64 end_seg = (ofs + len - 1) >> header->obj_order; | 631 | u64 end_seg = (ofs + len - 1) >> header->obj_order; |
629 | return end_seg - start_seg + 1; | 632 | return end_seg - start_seg + 1; |
630 | } | 633 | } |
631 | 634 | ||
632 | /* | 635 | /* |
633 | * returns the size of an object in the image | 636 | * returns the size of an object in the image |
634 | */ | 637 | */ |
635 | static u64 rbd_obj_bytes(struct rbd_image_header *header) | 638 | static u64 rbd_obj_bytes(struct rbd_image_header *header) |
636 | { | 639 | { |
637 | return 1 << header->obj_order; | 640 | return 1 << header->obj_order; |
638 | } | 641 | } |
639 | 642 | ||
640 | /* | 643 | /* |
641 | * bio helpers | 644 | * bio helpers |
642 | */ | 645 | */ |
643 | 646 | ||
644 | static void bio_chain_put(struct bio *chain) | 647 | static void bio_chain_put(struct bio *chain) |
645 | { | 648 | { |
646 | struct bio *tmp; | 649 | struct bio *tmp; |
647 | 650 | ||
648 | while (chain) { | 651 | while (chain) { |
649 | tmp = chain; | 652 | tmp = chain; |
650 | chain = chain->bi_next; | 653 | chain = chain->bi_next; |
651 | bio_put(tmp); | 654 | bio_put(tmp); |
652 | } | 655 | } |
653 | } | 656 | } |
654 | 657 | ||
/*
 * zeros a bio chain, starting at specific offset
 *
 * Walks every segment of every bio in the chain, tracking the running
 * byte position; any bytes at or beyond start_ofs are cleared.  Used to
 * zero-fill the unread tail of short or missing-object reads.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;		/* byte offset of current segment in chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* zero from start_ofs if it falls inside
				   this segment, else the whole segment */
				int remainder = max(start_ofs - pos, 0);
				/* kmap with irqs off: pages may be highmem */
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
681 | 684 | ||
/*
 * bio_chain_clone - clone a chain of bios up to a certain length.
 * might return a bio_pair that will need to be released.
 *
 * Clones bios from *old until the clone covers len bytes.  If the last
 * source bio straddles the boundary, it is split with bio_split(): the
 * first half is cloned, and *next points into the resulting bio_pair's
 * second half so the caller can continue from there.  Otherwise *next
 * is the first unconsumed bio.  On return *old is advanced past the
 * consumed bios.  Returns the new chain, or NULL on allocation failure
 * (any partial clone is released).
 */
static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
				   struct bio_pair **bp,
				   int len, gfp_t gfpmask)
{
	struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
	int total = 0;

	/* release any bio_pair left over from a previous call */
	if (*bp) {
		bio_pair_release(*bp);
		*bp = NULL;
	}

	while (old_chain && (total < len)) {
		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
		if (!tmp)
			goto err_out;

		if (total + old_chain->bi_size > len) {
			/* shadows the **bp parameter on purpose; the pair
			   is handed back to the caller via *next */
			struct bio_pair *bp;

			/*
			 * this split can only happen with a single paged bio,
			 * split_bio will BUG_ON if this is not the case
			 */
			dout("bio_chain_clone split! total=%d remaining=%d"
			     "bi_size=%d\n",
			     (int)total, (int)len-total,
			     (int)old_chain->bi_size);

			/* split the bio. We'll release it either in the next
			   call, or it will have to be released outside */
			bp = bio_split(old_chain, (len - total) / 512ULL);
			if (!bp)
				goto err_out;

			__bio_clone(tmp, &bp->bio1);

			*next = &bp->bio2;
		} else {
			__bio_clone(tmp, old_chain);
			*next = old_chain->bi_next;
		}

		tmp->bi_bdev = NULL;
		/* only the first allocation may block */
		gfpmask &= ~__GFP_WAIT;
		tmp->bi_next = NULL;

		if (!new_chain) {
			new_chain = tail = tmp;
		} else {
			tail->bi_next = tmp;
			tail = tmp;
		}
		old_chain = old_chain->bi_next;

		total += tmp->bi_size;
	}

	/* caller guarantees the source chain covers len bytes */
	BUG_ON(total < len);

	if (tail)
		tail->bi_next = NULL;

	*old = old_chain;

	return new_chain;

err_out:
	dout("bio_chain_clone with err\n");
	bio_chain_put(new_chain);
	return NULL;
}
758 | 761 | ||
759 | /* | 762 | /* |
760 | * helpers for osd request op vectors. | 763 | * helpers for osd request op vectors. |
761 | */ | 764 | */ |
762 | static int rbd_create_rw_ops(struct ceph_osd_req_op **ops, | 765 | static int rbd_create_rw_ops(struct ceph_osd_req_op **ops, |
763 | int num_ops, | 766 | int num_ops, |
764 | int opcode, | 767 | int opcode, |
765 | u32 payload_len) | 768 | u32 payload_len) |
766 | { | 769 | { |
767 | *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1), | 770 | *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1), |
768 | GFP_NOIO); | 771 | GFP_NOIO); |
769 | if (!*ops) | 772 | if (!*ops) |
770 | return -ENOMEM; | 773 | return -ENOMEM; |
771 | (*ops)[0].op = opcode; | 774 | (*ops)[0].op = opcode; |
772 | /* | 775 | /* |
773 | * op extent offset and length will be set later on | 776 | * op extent offset and length will be set later on |
774 | * in calc_raw_layout() | 777 | * in calc_raw_layout() |
775 | */ | 778 | */ |
776 | (*ops)[0].payload_len = payload_len; | 779 | (*ops)[0].payload_len = payload_len; |
777 | return 0; | 780 | return 0; |
778 | } | 781 | } |
779 | 782 | ||
/* Free an op vector allocated by rbd_create_rw_ops(). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
784 | 787 | ||
/*
 * Complete one sub-request of a collection and push completions to the
 * block layer in order.
 *
 * A request split across several objects completes piecewise; results
 * are recorded per index, and contiguous completed entries starting at
 * num_done are handed to __blk_end_request() so the block request sees
 * its parts finish in order.  With no collection the whole request is
 * completed immediately.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
	     coll, index, ret, len);

	/* the block request may already be gone */
	if (!rq)
		return;

	if (!coll) {
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* queue_lock serializes coll state and __blk_end_request() */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	/* extend the run of in-order completed entries */
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i<max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		/* each completed entry drops one collection reference */
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
822 | 825 | ||
/* Complete the collection slot associated with an rbd_request. */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
828 | 831 | ||
829 | /* | 832 | /* |
830 | * Send ceph osd request | 833 | * Send ceph osd request |
831 | */ | 834 | */ |
832 | static int rbd_do_request(struct request *rq, | 835 | static int rbd_do_request(struct request *rq, |
833 | struct rbd_device *dev, | 836 | struct rbd_device *dev, |
834 | struct ceph_snap_context *snapc, | 837 | struct ceph_snap_context *snapc, |
835 | u64 snapid, | 838 | u64 snapid, |
836 | const char *obj, u64 ofs, u64 len, | 839 | const char *obj, u64 ofs, u64 len, |
837 | struct bio *bio, | 840 | struct bio *bio, |
838 | struct page **pages, | 841 | struct page **pages, |
839 | int num_pages, | 842 | int num_pages, |
840 | int flags, | 843 | int flags, |
841 | struct ceph_osd_req_op *ops, | 844 | struct ceph_osd_req_op *ops, |
842 | int num_reply, | 845 | int num_reply, |
843 | struct rbd_req_coll *coll, | 846 | struct rbd_req_coll *coll, |
844 | int coll_index, | 847 | int coll_index, |
845 | void (*rbd_cb)(struct ceph_osd_request *req, | 848 | void (*rbd_cb)(struct ceph_osd_request *req, |
846 | struct ceph_msg *msg), | 849 | struct ceph_msg *msg), |
847 | struct ceph_osd_request **linger_req, | 850 | struct ceph_osd_request **linger_req, |
848 | u64 *ver) | 851 | u64 *ver) |
849 | { | 852 | { |
850 | struct ceph_osd_request *req; | 853 | struct ceph_osd_request *req; |
851 | struct ceph_file_layout *layout; | 854 | struct ceph_file_layout *layout; |
852 | int ret; | 855 | int ret; |
853 | u64 bno; | 856 | u64 bno; |
854 | struct timespec mtime = CURRENT_TIME; | 857 | struct timespec mtime = CURRENT_TIME; |
855 | struct rbd_request *req_data; | 858 | struct rbd_request *req_data; |
856 | struct ceph_osd_request_head *reqhead; | 859 | struct ceph_osd_request_head *reqhead; |
857 | struct rbd_image_header *header = &dev->header; | 860 | struct rbd_image_header *header = &dev->header; |
858 | 861 | ||
859 | req_data = kzalloc(sizeof(*req_data), GFP_NOIO); | 862 | req_data = kzalloc(sizeof(*req_data), GFP_NOIO); |
860 | if (!req_data) { | 863 | if (!req_data) { |
861 | if (coll) | 864 | if (coll) |
862 | rbd_coll_end_req_index(rq, coll, coll_index, | 865 | rbd_coll_end_req_index(rq, coll, coll_index, |
863 | -ENOMEM, len); | 866 | -ENOMEM, len); |
864 | return -ENOMEM; | 867 | return -ENOMEM; |
865 | } | 868 | } |
866 | 869 | ||
867 | if (coll) { | 870 | if (coll) { |
868 | req_data->coll = coll; | 871 | req_data->coll = coll; |
869 | req_data->coll_index = coll_index; | 872 | req_data->coll_index = coll_index; |
870 | } | 873 | } |
871 | 874 | ||
872 | dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs); | 875 | dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs); |
873 | 876 | ||
874 | down_read(&header->snap_rwsem); | 877 | down_read(&header->snap_rwsem); |
875 | 878 | ||
876 | req = ceph_osdc_alloc_request(&dev->client->osdc, flags, | 879 | req = ceph_osdc_alloc_request(&dev->client->osdc, flags, |
877 | snapc, | 880 | snapc, |
878 | ops, | 881 | ops, |
879 | false, | 882 | false, |
880 | GFP_NOIO, pages, bio); | 883 | GFP_NOIO, pages, bio); |
881 | if (!req) { | 884 | if (!req) { |
882 | up_read(&header->snap_rwsem); | 885 | up_read(&header->snap_rwsem); |
883 | ret = -ENOMEM; | 886 | ret = -ENOMEM; |
884 | goto done_pages; | 887 | goto done_pages; |
885 | } | 888 | } |
886 | 889 | ||
887 | req->r_callback = rbd_cb; | 890 | req->r_callback = rbd_cb; |
888 | 891 | ||
889 | req_data->rq = rq; | 892 | req_data->rq = rq; |
890 | req_data->bio = bio; | 893 | req_data->bio = bio; |
891 | req_data->pages = pages; | 894 | req_data->pages = pages; |
892 | req_data->len = len; | 895 | req_data->len = len; |
893 | 896 | ||
894 | req->r_priv = req_data; | 897 | req->r_priv = req_data; |
895 | 898 | ||
896 | reqhead = req->r_request->front.iov_base; | 899 | reqhead = req->r_request->front.iov_base; |
897 | reqhead->snapid = cpu_to_le64(CEPH_NOSNAP); | 900 | reqhead->snapid = cpu_to_le64(CEPH_NOSNAP); |
898 | 901 | ||
899 | strncpy(req->r_oid, obj, sizeof(req->r_oid)); | 902 | strncpy(req->r_oid, obj, sizeof(req->r_oid)); |
900 | req->r_oid_len = strlen(req->r_oid); | 903 | req->r_oid_len = strlen(req->r_oid); |
901 | 904 | ||
902 | layout = &req->r_file_layout; | 905 | layout = &req->r_file_layout; |
903 | memset(layout, 0, sizeof(*layout)); | 906 | memset(layout, 0, sizeof(*layout)); |
904 | layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); | 907 | layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); |
905 | layout->fl_stripe_count = cpu_to_le32(1); | 908 | layout->fl_stripe_count = cpu_to_le32(1); |
906 | layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); | 909 | layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); |
907 | layout->fl_pg_preferred = cpu_to_le32(-1); | 910 | layout->fl_pg_preferred = cpu_to_le32(-1); |
908 | layout->fl_pg_pool = cpu_to_le32(dev->poolid); | 911 | layout->fl_pg_pool = cpu_to_le32(dev->poolid); |
909 | ceph_calc_raw_layout(&dev->client->osdc, layout, snapid, | 912 | ceph_calc_raw_layout(&dev->client->osdc, layout, snapid, |
910 | ofs, &len, &bno, req, ops); | 913 | ofs, &len, &bno, req, ops); |
911 | 914 | ||
912 | ceph_osdc_build_request(req, ofs, &len, | 915 | ceph_osdc_build_request(req, ofs, &len, |
913 | ops, | 916 | ops, |
914 | snapc, | 917 | snapc, |
915 | &mtime, | 918 | &mtime, |
916 | req->r_oid, req->r_oid_len); | 919 | req->r_oid, req->r_oid_len); |
917 | up_read(&header->snap_rwsem); | 920 | up_read(&header->snap_rwsem); |
918 | 921 | ||
919 | if (linger_req) { | 922 | if (linger_req) { |
920 | ceph_osdc_set_request_linger(&dev->client->osdc, req); | 923 | ceph_osdc_set_request_linger(&dev->client->osdc, req); |
921 | *linger_req = req; | 924 | *linger_req = req; |
922 | } | 925 | } |
923 | 926 | ||
924 | ret = ceph_osdc_start_request(&dev->client->osdc, req, false); | 927 | ret = ceph_osdc_start_request(&dev->client->osdc, req, false); |
925 | if (ret < 0) | 928 | if (ret < 0) |
926 | goto done_err; | 929 | goto done_err; |
927 | 930 | ||
928 | if (!rbd_cb) { | 931 | if (!rbd_cb) { |
929 | ret = ceph_osdc_wait_request(&dev->client->osdc, req); | 932 | ret = ceph_osdc_wait_request(&dev->client->osdc, req); |
930 | if (ver) | 933 | if (ver) |
931 | *ver = le64_to_cpu(req->r_reassert_version.version); | 934 | *ver = le64_to_cpu(req->r_reassert_version.version); |
932 | dout("reassert_ver=%lld\n", | 935 | dout("reassert_ver=%lld\n", |
933 | le64_to_cpu(req->r_reassert_version.version)); | 936 | le64_to_cpu(req->r_reassert_version.version)); |
934 | ceph_osdc_put_request(req); | 937 | ceph_osdc_put_request(req); |
935 | } | 938 | } |
936 | return ret; | 939 | return ret; |
937 | 940 | ||
938 | done_err: | 941 | done_err: |
939 | bio_chain_put(req_data->bio); | 942 | bio_chain_put(req_data->bio); |
940 | ceph_osdc_put_request(req); | 943 | ceph_osdc_put_request(req); |
941 | done_pages: | 944 | done_pages: |
942 | rbd_coll_end_req(req_data, ret, len); | 945 | rbd_coll_end_req(req_data, ret, len); |
943 | kfree(req_data); | 946 | kfree(req_data); |
944 | return ret; | 947 | return ret; |
945 | } | 948 | } |
946 | 949 | ||
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous requests submitted by
 * rbd_do_request().  Parses the reply, papers over short/missing-object
 * reads by zero-filling the bio chain, completes the collection slot,
 * and releases the request and its private data.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	/* ops array immediately follows the reply header */
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);

	if (rc == -ENOENT && read_op) {
		/* object doesn't exist: reads as all zeros */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero the remainder */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
985 | 988 | ||
/* Minimal completion callback: just drop the request reference. */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
990 | 993 | ||
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a temporary page vector for the data, builds an op vector
 * when the caller didn't supply one (orig_ops == NULL), copies buf in
 * for writes / out for reads, and issues the request synchronously via
 * rbd_do_request() (no callback).  Ops are only freed here if they were
 * allocated here; caller-supplied ops remain caller-owned.
 */
static int rbd_req_sync_op(struct rbd_device *dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int opcode,
			   int flags,
			   struct ceph_osd_req_op *orig_ops,
			   int num_reply,
			   const char *obj,
			   u64 ofs, u64 len,
			   char *buf,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;
	struct ceph_osd_req_op *ops = orig_ops;
	u32 payload_len;

	num_pages = calc_pages_for(ofs , len);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	if (!orig_ops) {
		/* build a single read/write op ourselves */
		payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
		ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
		if (ret < 0)
			goto done;

		if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
			ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
			if (ret < 0)
				goto done_ops;
		}
	}

	/* no callback => rbd_do_request() waits for completion */
	ret = rbd_do_request(NULL, dev, snapc, snapid,
			     obj, ofs, len, NULL,
			     pages, num_pages,
			     flags,
			     ops,
			     2,
			     NULL, 0,
			     NULL,
			     linger_req, ver);
	if (ret < 0)
		goto done_ops;

	/* for reads, ret is the number of bytes actually read */
	if ((flags & CEPH_OSD_FLAG_READ) && buf)
		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);

done_ops:
	if (!orig_ops)
		rbd_destroy_ops(ops);
done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1053 | 1056 | ||
1054 | /* | 1057 | /* |
1055 | * Do an asynchronous ceph osd operation | 1058 | * Do an asynchronous ceph osd operation |
1056 | */ | 1059 | */ |
1057 | static int rbd_do_op(struct request *rq, | 1060 | static int rbd_do_op(struct request *rq, |
1058 | struct rbd_device *rbd_dev , | 1061 | struct rbd_device *rbd_dev , |
1059 | struct ceph_snap_context *snapc, | 1062 | struct ceph_snap_context *snapc, |
1060 | u64 snapid, | 1063 | u64 snapid, |
1061 | int opcode, int flags, int num_reply, | 1064 | int opcode, int flags, int num_reply, |
1062 | u64 ofs, u64 len, | 1065 | u64 ofs, u64 len, |
1063 | struct bio *bio, | 1066 | struct bio *bio, |
1064 | struct rbd_req_coll *coll, | 1067 | struct rbd_req_coll *coll, |
1065 | int coll_index) | 1068 | int coll_index) |
1066 | { | 1069 | { |
1067 | char *seg_name; | 1070 | char *seg_name; |
1068 | u64 seg_ofs; | 1071 | u64 seg_ofs; |
1069 | u64 seg_len; | 1072 | u64 seg_len; |
1070 | int ret; | 1073 | int ret; |
1071 | struct ceph_osd_req_op *ops; | 1074 | struct ceph_osd_req_op *ops; |
1072 | u32 payload_len; | 1075 | u32 payload_len; |
1073 | 1076 | ||
1074 | seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); | 1077 | seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); |
1075 | if (!seg_name) | 1078 | if (!seg_name) |
1076 | return -ENOMEM; | 1079 | return -ENOMEM; |
1077 | 1080 | ||
1078 | seg_len = rbd_get_segment(&rbd_dev->header, | 1081 | seg_len = rbd_get_segment(&rbd_dev->header, |
1079 | rbd_dev->header.block_name, | 1082 | rbd_dev->header.block_name, |
1080 | ofs, len, | 1083 | ofs, len, |
1081 | seg_name, &seg_ofs); | 1084 | seg_name, &seg_ofs); |
1082 | 1085 | ||
1083 | payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0); | 1086 | payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0); |
1084 | 1087 | ||
1085 | ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len); | 1088 | ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len); |
1086 | if (ret < 0) | 1089 | if (ret < 0) |
1087 | goto done; | 1090 | goto done; |
1088 | 1091 | ||
1089 | /* we've taken care of segment sizes earlier when we | 1092 | /* we've taken care of segment sizes earlier when we |
1090 | cloned the bios. We should never have a segment | 1093 | cloned the bios. We should never have a segment |
1091 | truncated at this point */ | 1094 | truncated at this point */ |
1092 | BUG_ON(seg_len < len); | 1095 | BUG_ON(seg_len < len); |
1093 | 1096 | ||
1094 | ret = rbd_do_request(rq, rbd_dev, snapc, snapid, | 1097 | ret = rbd_do_request(rq, rbd_dev, snapc, snapid, |
1095 | seg_name, seg_ofs, seg_len, | 1098 | seg_name, seg_ofs, seg_len, |
1096 | bio, | 1099 | bio, |
1097 | NULL, 0, | 1100 | NULL, 0, |
1098 | flags, | 1101 | flags, |
1099 | ops, | 1102 | ops, |
1100 | num_reply, | 1103 | num_reply, |
1101 | coll, coll_index, | 1104 | coll, coll_index, |
1102 | rbd_req_cb, 0, NULL); | 1105 | rbd_req_cb, 0, NULL); |
1103 | 1106 | ||
1104 | rbd_destroy_ops(ops); | 1107 | rbd_destroy_ops(ops); |
1105 | done: | 1108 | done: |
1106 | kfree(seg_name); | 1109 | kfree(seg_name); |
1107 | return ret; | 1110 | return ret; |
1108 | } | 1111 | } |
1109 | 1112 | ||
/*
 * Request async osd write
 *
 * Thin wrapper around rbd_do_op() for a write to the head image
 * (CEPH_NOSNAP) with the given snapshot context.
 */
static int rbd_req_write(struct request *rq,
			 struct rbd_device *rbd_dev,
			 struct ceph_snap_context *snapc,
			 u64 ofs, u64 len,
			 struct bio *bio,
			 struct rbd_req_coll *coll,
			 int coll_index)
{
	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
			 CEPH_OSD_OP_WRITE,
			 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			 2,
			 ofs, len, bio, coll, coll_index);
}
1127 | 1130 | ||
1128 | /* | 1131 | /* |
1129 | * Request async osd read | 1132 | * Request async osd read |
1130 | */ | 1133 | */ |
1131 | static int rbd_req_read(struct request *rq, | 1134 | static int rbd_req_read(struct request *rq, |
1132 | struct rbd_device *rbd_dev, | 1135 | struct rbd_device *rbd_dev, |
1133 | u64 snapid, | 1136 | u64 snapid, |
1134 | u64 ofs, u64 len, | 1137 | u64 ofs, u64 len, |
1135 | struct bio *bio, | 1138 | struct bio *bio, |
1136 | struct rbd_req_coll *coll, | 1139 | struct rbd_req_coll *coll, |
1137 | int coll_index) | 1140 | int coll_index) |
1138 | { | 1141 | { |
1139 | return rbd_do_op(rq, rbd_dev, NULL, | 1142 | return rbd_do_op(rq, rbd_dev, NULL, |
1140 | (snapid ? snapid : CEPH_NOSNAP), | 1143 | (snapid ? snapid : CEPH_NOSNAP), |
1141 | CEPH_OSD_OP_READ, | 1144 | CEPH_OSD_OP_READ, |
1142 | CEPH_OSD_FLAG_READ, | 1145 | CEPH_OSD_FLAG_READ, |
1143 | 2, | 1146 | 2, |
1144 | ofs, len, bio, coll, coll_index); | 1147 | ofs, len, bio, coll, coll_index); |
1145 | } | 1148 | } |
1146 | 1149 | ||
1147 | /* | 1150 | /* |
1148 | * Request sync osd read | 1151 | * Request sync osd read |
1149 | */ | 1152 | */ |
1150 | static int rbd_req_sync_read(struct rbd_device *dev, | 1153 | static int rbd_req_sync_read(struct rbd_device *dev, |
1151 | struct ceph_snap_context *snapc, | 1154 | struct ceph_snap_context *snapc, |
1152 | u64 snapid, | 1155 | u64 snapid, |
1153 | const char *obj, | 1156 | const char *obj, |
1154 | u64 ofs, u64 len, | 1157 | u64 ofs, u64 len, |
1155 | char *buf, | 1158 | char *buf, |
1156 | u64 *ver) | 1159 | u64 *ver) |
1157 | { | 1160 | { |
1158 | return rbd_req_sync_op(dev, NULL, | 1161 | return rbd_req_sync_op(dev, NULL, |
1159 | (snapid ? snapid : CEPH_NOSNAP), | 1162 | (snapid ? snapid : CEPH_NOSNAP), |
1160 | CEPH_OSD_OP_READ, | 1163 | CEPH_OSD_OP_READ, |
1161 | CEPH_OSD_FLAG_READ, | 1164 | CEPH_OSD_FLAG_READ, |
1162 | NULL, | 1165 | NULL, |
1163 | 1, obj, ofs, len, buf, NULL, ver); | 1166 | 1, obj, ofs, len, buf, NULL, ver); |
1164 | } | 1167 | } |
1165 | 1168 | ||
/*
 * Acknowledge a watch notification: send CEPH_OSD_OP_NOTIFY_ACK back to
 * the OSD so it stops re-delivering the notify.  Fired from the watch
 * callback; the request itself completes asynchronously through
 * rbd_simple_req_cb.
 *
 * NOTE(review): the @ver argument is unused -- the ack carries
 * dev->header.obj_version instead; confirm that is intentional.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *dev,
				   u64 ver,
				   u64 notify_id,
				   const char *obj)
{
	struct ceph_osd_req_op *ops;
	struct page **pages = NULL;	/* the ack carries no data payload */
	int ret;

	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (ret < 0)
		return ret;

	/* NOTE(review): ver is byte-swapped here but cookie is stored raw;
	 * verify both against the osd_client op-encoding to rule out a
	 * double swap on big-endian. */
	ops[0].watch.ver = cpu_to_le64(dev->header.obj_version);
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP,
			  obj, 0, 0, NULL,
			  pages, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  1,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1198 | 1201 | ||
1199 | static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) | 1202 | static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) |
1200 | { | 1203 | { |
1201 | struct rbd_device *dev = (struct rbd_device *)data; | 1204 | struct rbd_device *dev = (struct rbd_device *)data; |
1202 | int rc; | 1205 | int rc; |
1203 | 1206 | ||
1204 | if (!dev) | 1207 | if (!dev) |
1205 | return; | 1208 | return; |
1206 | 1209 | ||
1207 | dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name, | 1210 | dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name, |
1208 | notify_id, (int)opcode); | 1211 | notify_id, (int)opcode); |
1209 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); | 1212 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); |
1210 | rc = __rbd_update_snaps(dev); | 1213 | rc = __rbd_update_snaps(dev); |
1211 | mutex_unlock(&ctl_mutex); | 1214 | mutex_unlock(&ctl_mutex); |
1212 | if (rc) | 1215 | if (rc) |
1213 | pr_warning(DRV_NAME "%d got notification but failed to update" | 1216 | pr_warning(DRV_NAME "%d got notification but failed to update" |
1214 | " snaps: %d\n", dev->major, rc); | 1217 | " snaps: %d\n", dev->major, rc); |
1215 | 1218 | ||
1216 | rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name); | 1219 | rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name); |
1217 | } | 1220 | } |
1218 | 1221 | ||
/*
 * Register a watch on header object @obj so rbd_watch_cb() fires when
 * another client updates the header (snapshot create, resize, ...).
 *
 * The osd event and the outstanding watch request are stored in the
 * device (dev->watch_event / dev->watch_request) and stay live until
 * rbd_req_sync_unwatch() tears them down.  On failure both are cleaned
 * up here and the error is returned.
 */
static int rbd_req_sync_watch(struct rbd_device *dev,
			      const char *obj,
			      u64 ver)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &dev->client->osdc;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	/* event must exist first: its cookie identifies the watch */
	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)dev, &dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* flag == 1: register the watch */

	ret = rbd_req_sync_op(dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL,
			      &dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	/* undo the event registration so a later watch can retry cleanly */
	ceph_osdc_cancel_event(dev->watch_event);
	dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1263 | 1266 | ||
/*
 * Tear down the watch on header object @obj: send a CEPH_OSD_OP_WATCH
 * with flag == 0 (unwatch) carrying the registered event's cookie, then
 * drop the event itself.
 *
 * The event is cancelled and dev->watch_event cleared even when the
 * unwatch op fails, so the device never keeps a stale event; the op's
 * result is still returned to the caller.
 */
static int rbd_req_sync_unwatch(struct rbd_device *dev,
				const char *obj)
{
	struct ceph_osd_req_op *ops;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	/* flag == 0 turns the WATCH op into an unwatch */
	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
	ops[0].watch.flag = 0;

	ret = rbd_req_sync_op(dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL, NULL, NULL);

	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(dev->watch_event);
	dev->watch_event = NULL;
	return ret;
}
1292 | 1295 | ||
/*
 * Context handed to rbd_notify_cb() through the osd event registered in
 * rbd_req_sync_notify(); it lives on that function's stack for the
 * duration of the synchronous notify.
 */
struct rbd_notify_info {
	struct rbd_device *dev;		/* device that issued the notify */
};
1296 | 1299 | ||
1297 | static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data) | 1300 | static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data) |
1298 | { | 1301 | { |
1299 | struct rbd_device *dev = (struct rbd_device *)data; | 1302 | struct rbd_device *dev = (struct rbd_device *)data; |
1300 | if (!dev) | 1303 | if (!dev) |
1301 | return; | 1304 | return; |
1302 | 1305 | ||
1303 | dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name, | 1306 | dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name, |
1304 | notify_id, (int)opcode); | 1307 | notify_id, (int)opcode); |
1305 | } | 1308 | } |
1306 | 1309 | ||
1307 | /* | 1310 | /* |
1308 | * Request sync osd notify | 1311 | * Request sync osd notify |
1309 | */ | 1312 | */ |
1310 | static int rbd_req_sync_notify(struct rbd_device *dev, | 1313 | static int rbd_req_sync_notify(struct rbd_device *dev, |
1311 | const char *obj) | 1314 | const char *obj) |
1312 | { | 1315 | { |
1313 | struct ceph_osd_req_op *ops; | 1316 | struct ceph_osd_req_op *ops; |
1314 | struct ceph_osd_client *osdc = &dev->client->osdc; | 1317 | struct ceph_osd_client *osdc = &dev->client->osdc; |
1315 | struct ceph_osd_event *event; | 1318 | struct ceph_osd_event *event; |
1316 | struct rbd_notify_info info; | 1319 | struct rbd_notify_info info; |
1317 | int payload_len = sizeof(u32) + sizeof(u32); | 1320 | int payload_len = sizeof(u32) + sizeof(u32); |
1318 | int ret; | 1321 | int ret; |
1319 | 1322 | ||
1320 | ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len); | 1323 | ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len); |
1321 | if (ret < 0) | 1324 | if (ret < 0) |
1322 | return ret; | 1325 | return ret; |
1323 | 1326 | ||
1324 | info.dev = dev; | 1327 | info.dev = dev; |
1325 | 1328 | ||
1326 | ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1, | 1329 | ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1, |
1327 | (void *)&info, &event); | 1330 | (void *)&info, &event); |
1328 | if (ret < 0) | 1331 | if (ret < 0) |
1329 | goto fail; | 1332 | goto fail; |
1330 | 1333 | ||
1331 | ops[0].watch.ver = 1; | 1334 | ops[0].watch.ver = 1; |
1332 | ops[0].watch.flag = 1; | 1335 | ops[0].watch.flag = 1; |
1333 | ops[0].watch.cookie = event->cookie; | 1336 | ops[0].watch.cookie = event->cookie; |
1334 | ops[0].watch.prot_ver = RADOS_NOTIFY_VER; | 1337 | ops[0].watch.prot_ver = RADOS_NOTIFY_VER; |
1335 | ops[0].watch.timeout = 12; | 1338 | ops[0].watch.timeout = 12; |
1336 | 1339 | ||
1337 | ret = rbd_req_sync_op(dev, NULL, | 1340 | ret = rbd_req_sync_op(dev, NULL, |
1338 | CEPH_NOSNAP, | 1341 | CEPH_NOSNAP, |
1339 | 0, | 1342 | 0, |
1340 | CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, | 1343 | CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, |
1341 | ops, | 1344 | ops, |
1342 | 1, obj, 0, 0, NULL, NULL, NULL); | 1345 | 1, obj, 0, 0, NULL, NULL, NULL); |
1343 | if (ret < 0) | 1346 | if (ret < 0) |
1344 | goto fail_event; | 1347 | goto fail_event; |
1345 | 1348 | ||
1346 | ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT); | 1349 | ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT); |
1347 | dout("ceph_osdc_wait_event returned %d\n", ret); | 1350 | dout("ceph_osdc_wait_event returned %d\n", ret); |
1348 | rbd_destroy_ops(ops); | 1351 | rbd_destroy_ops(ops); |
1349 | return 0; | 1352 | return 0; |
1350 | 1353 | ||
1351 | fail_event: | 1354 | fail_event: |
1352 | ceph_osdc_cancel_event(event); | 1355 | ceph_osdc_cancel_event(event); |
1353 | fail: | 1356 | fail: |
1354 | rbd_destroy_ops(ops); | 1357 | rbd_destroy_ops(ops); |
1355 | return ret; | 1358 | return ret; |
1356 | } | 1359 | } |
1357 | 1360 | ||
1358 | /* | 1361 | /* |
1359 | * Request sync osd read | 1362 | * Request sync osd read |
1360 | */ | 1363 | */ |
1361 | static int rbd_req_sync_exec(struct rbd_device *dev, | 1364 | static int rbd_req_sync_exec(struct rbd_device *dev, |
1362 | const char *obj, | 1365 | const char *obj, |
1363 | const char *cls, | 1366 | const char *cls, |
1364 | const char *method, | 1367 | const char *method, |
1365 | const char *data, | 1368 | const char *data, |
1366 | int len, | 1369 | int len, |
1367 | u64 *ver) | 1370 | u64 *ver) |
1368 | { | 1371 | { |
1369 | struct ceph_osd_req_op *ops; | 1372 | struct ceph_osd_req_op *ops; |
1370 | int cls_len = strlen(cls); | 1373 | int cls_len = strlen(cls); |
1371 | int method_len = strlen(method); | 1374 | int method_len = strlen(method); |
1372 | int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL, | 1375 | int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL, |
1373 | cls_len + method_len + len); | 1376 | cls_len + method_len + len); |
1374 | if (ret < 0) | 1377 | if (ret < 0) |
1375 | return ret; | 1378 | return ret; |
1376 | 1379 | ||
1377 | ops[0].cls.class_name = cls; | 1380 | ops[0].cls.class_name = cls; |
1378 | ops[0].cls.class_len = (__u8)cls_len; | 1381 | ops[0].cls.class_len = (__u8)cls_len; |
1379 | ops[0].cls.method_name = method; | 1382 | ops[0].cls.method_name = method; |
1380 | ops[0].cls.method_len = (__u8)method_len; | 1383 | ops[0].cls.method_len = (__u8)method_len; |
1381 | ops[0].cls.argc = 0; | 1384 | ops[0].cls.argc = 0; |
1382 | ops[0].cls.indata = data; | 1385 | ops[0].cls.indata = data; |
1383 | ops[0].cls.indata_len = len; | 1386 | ops[0].cls.indata_len = len; |
1384 | 1387 | ||
1385 | ret = rbd_req_sync_op(dev, NULL, | 1388 | ret = rbd_req_sync_op(dev, NULL, |
1386 | CEPH_NOSNAP, | 1389 | CEPH_NOSNAP, |
1387 | 0, | 1390 | 0, |
1388 | CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, | 1391 | CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, |
1389 | ops, | 1392 | ops, |
1390 | 1, obj, 0, 0, NULL, NULL, ver); | 1393 | 1, obj, 0, 0, NULL, NULL, ver); |
1391 | 1394 | ||
1392 | rbd_destroy_ops(ops); | 1395 | rbd_destroy_ops(ops); |
1393 | 1396 | ||
1394 | dout("cls_exec returned %d\n", ret); | 1397 | dout("cls_exec returned %d\n", ret); |
1395 | return ret; | 1398 | return ret; |
1396 | } | 1399 | } |
1397 | 1400 | ||
1398 | static struct rbd_req_coll *rbd_alloc_coll(int num_reqs) | 1401 | static struct rbd_req_coll *rbd_alloc_coll(int num_reqs) |
1399 | { | 1402 | { |
1400 | struct rbd_req_coll *coll = | 1403 | struct rbd_req_coll *coll = |
1401 | kzalloc(sizeof(struct rbd_req_coll) + | 1404 | kzalloc(sizeof(struct rbd_req_coll) + |
1402 | sizeof(struct rbd_req_status) * num_reqs, | 1405 | sizeof(struct rbd_req_status) * num_reqs, |
1403 | GFP_ATOMIC); | 1406 | GFP_ATOMIC); |
1404 | 1407 | ||
1405 | if (!coll) | 1408 | if (!coll) |
1406 | return NULL; | 1409 | return NULL; |
1407 | coll->total = num_reqs; | 1410 | coll->total = num_reqs; |
1408 | kref_init(&coll->kref); | 1411 | kref_init(&coll->kref); |
1409 | return coll; | 1412 | return coll; |
1410 | } | 1413 | } |
1411 | 1414 | ||
/*
 * block device queue callback
 *
 * Drains the request queue: each request is split along rados object
 * boundaries and one asynchronous OSD read/write is issued per
 * segment.  A ref-counted rbd_req_coll ties the per-segment
 * completions back to the originating request.  Entered with
 * q->queue_lock held; the lock is dropped while segments are cloned
 * and submitted, and re-taken before touching the queue again.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	rq = blk_fetch_request(q);

	while (1) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		int size, op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;

		/* peek at request from block layer */
		if (!rq)
			break;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			goto next;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * 512ULL;	/* sectors -> bytes */
		rq_bio = rq->bio;
		if (do_write && rbd_dev->read_only) {
			__blk_end_request_all(rq, -EROFS);
			goto next;
		}

		/* drop the queue lock while we talk to the OSDs */
		spin_unlock_irq(q->queue_lock);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, blk_rq_pos(rq) * 512ULL);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			/* queue lock must be re-held to end the request */
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			goto next;
		}

		/* walk the request one object segment at a time */
		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
			op_size = rbd_get_segment(&rbd_dev->header,
						  rbd_dev->header.block_name,
						  ofs, size,
						  NULL, NULL);
			/* one coll ref per in-flight segment */
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				/* record the failure for this slot and
				   keep going with the next segment */
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}


			/* init OSD command: write or read */
			if (do_write)
				rbd_req_write(rq, rbd_dev,
					      rbd_dev->header.snapc,
					      ofs,
					      op_size, bio,
					      coll, cur_seg);
			else
				rbd_req_read(rq, rbd_dev,
					     cur_snap_id(rbd_dev),
					     ofs,
					     op_size, bio,
					     coll, cur_seg);

next_seg:
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop the initial ref; segment completions hold the rest */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		/* re-take the queue lock before fetching the next request */
		spin_lock_irq(q->queue_lock);
next:
		rq = blk_fetch_request(q);
	}
}
1516 | 1519 | ||
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone
 *
 * Returns the number of bytes of @bvec that may be added to the bio
 * described by @bmd without crossing an object boundary.
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	/* object size in 512-byte sectors: obj_order is log2 of the
	   object size in bytes, so subtracting 9 converts to sectors */
	unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
	sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
	unsigned int bio_sectors = bmd->bi_size >> 9;
	int max;

	/* bytes left in the object after the bio's current end */
	max = (chunk_sectors - ((sector & (chunk_sectors - 1))
				+ bio_sectors)) << 9;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	/* empty bio that already sits at/near the boundary: let the
	   single-page bio through; bio_chain_clone splits it later */
	if (max <= bvec->bv_len && bio_sectors == 0)
		return bvec->bv_len;
	return max;
}
1539 | 1542 | ||
1540 | static void rbd_free_disk(struct rbd_device *rbd_dev) | 1543 | static void rbd_free_disk(struct rbd_device *rbd_dev) |
1541 | { | 1544 | { |
1542 | struct gendisk *disk = rbd_dev->disk; | 1545 | struct gendisk *disk = rbd_dev->disk; |
1543 | 1546 | ||
1544 | if (!disk) | 1547 | if (!disk) |
1545 | return; | 1548 | return; |
1546 | 1549 | ||
1547 | rbd_header_free(&rbd_dev->header); | 1550 | rbd_header_free(&rbd_dev->header); |
1548 | 1551 | ||
1549 | if (disk->flags & GENHD_FL_UP) | 1552 | if (disk->flags & GENHD_FL_UP) |
1550 | del_gendisk(disk); | 1553 | del_gendisk(disk); |
1551 | if (disk->queue) | 1554 | if (disk->queue) |
1552 | blk_cleanup_queue(disk->queue); | 1555 | blk_cleanup_queue(disk->queue); |
1553 | put_disk(disk); | 1556 | put_disk(disk); |
1554 | } | 1557 | } |
1555 | 1558 | ||
/*
 * reload the ondisk the header
 *
 * The on-disk header is variable length: it ends with a snapshot table
 * and the snapshot names.  Since the snapshot count isn't known before
 * reading, read with a guessed size (initially zero snapshots); if the
 * decoded count differs from the guess, free everything and retry with
 * the sizes just learned, until a read and its decode agree.
 *
 * Returns 0 and fills *header on success, negative errno on failure.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	ssize_t rc;
	struct rbd_image_header_ondisk *dh;
	int snap_count = 0;	/* current guess; updated after each decode */
	u64 snap_names_len = 0;
	u64 ver;

	while (1) {
		/* buffer sized for the header plus snap_count snapshots */
		int len = sizeof(*dh) +
			  snap_count * sizeof(struct rbd_image_snap_ondisk) +
			  snap_names_len;

		rc = -ENOMEM;
		dh = kmalloc(len, GFP_KERNEL);
		if (!dh)
			return -ENOMEM;

		rc = rbd_req_sync_read(rbd_dev,
				       NULL, CEPH_NOSNAP,
				       rbd_dev->obj_md_name,
				       0, len,
				       (char *)dh, &ver);
		if (rc < 0)
			goto out_dh;

		rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
		if (rc < 0) {
			if (rc == -ENXIO) {
				pr_warning("unrecognized header format"
					   " for image %s", rbd_dev->obj);
			}
			goto out_dh;
		}

		if (snap_count != header->total_snaps) {
			/* snapshot count changed under us (or this was the
			   first pass): retry with the sizes just read */
			snap_count = header->total_snaps;
			snap_names_len = header->snap_names_len;
			rbd_header_free(header);
			kfree(dh);
			continue;
		}
		break;
	}
	header->obj_version = ver;

out_dh:
	kfree(dh);
	return rc;
}
1610 | 1613 | ||
1611 | /* | 1614 | /* |
1612 | * create a snapshot | 1615 | * create a snapshot |
1613 | */ | 1616 | */ |
1614 | static int rbd_header_add_snap(struct rbd_device *dev, | 1617 | static int rbd_header_add_snap(struct rbd_device *dev, |
1615 | const char *snap_name, | 1618 | const char *snap_name, |
1616 | gfp_t gfp_flags) | 1619 | gfp_t gfp_flags) |
1617 | { | 1620 | { |
1618 | int name_len = strlen(snap_name); | 1621 | int name_len = strlen(snap_name); |
1619 | u64 new_snapid; | 1622 | u64 new_snapid; |
1620 | int ret; | 1623 | int ret; |
1621 | void *data, *p, *e; | 1624 | void *data, *p, *e; |
1622 | u64 ver; | 1625 | u64 ver; |
1623 | 1626 | ||
1624 | /* we should create a snapshot only if we're pointing at the head */ | 1627 | /* we should create a snapshot only if we're pointing at the head */ |
1625 | if (dev->cur_snap) | 1628 | if (dev->cur_snap) |
1626 | return -EINVAL; | 1629 | return -EINVAL; |
1627 | 1630 | ||
1628 | ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid, | 1631 | ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid, |
1629 | &new_snapid); | 1632 | &new_snapid); |
1630 | dout("created snapid=%lld\n", new_snapid); | 1633 | dout("created snapid=%lld\n", new_snapid); |
1631 | if (ret < 0) | 1634 | if (ret < 0) |
1632 | return ret; | 1635 | return ret; |
1633 | 1636 | ||
1634 | data = kmalloc(name_len + 16, gfp_flags); | 1637 | data = kmalloc(name_len + 16, gfp_flags); |
1635 | if (!data) | 1638 | if (!data) |
1636 | return -ENOMEM; | 1639 | return -ENOMEM; |
1637 | 1640 | ||
1638 | p = data; | 1641 | p = data; |
1639 | e = data + name_len + 16; | 1642 | e = data + name_len + 16; |
1640 | 1643 | ||
1641 | ceph_encode_string_safe(&p, e, snap_name, name_len, bad); | 1644 | ceph_encode_string_safe(&p, e, snap_name, name_len, bad); |
1642 | ceph_encode_64_safe(&p, e, new_snapid, bad); | 1645 | ceph_encode_64_safe(&p, e, new_snapid, bad); |
1643 | 1646 | ||
1644 | ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add", | 1647 | ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add", |
1645 | data, p - data, &ver); | 1648 | data, p - data, &ver); |
1646 | 1649 | ||
1647 | kfree(data); | 1650 | kfree(data); |
1648 | 1651 | ||
1649 | if (ret < 0) | 1652 | if (ret < 0) |
1650 | return ret; | 1653 | return ret; |
1651 | 1654 | ||
1652 | dev->header.snapc->seq = new_snapid; | 1655 | dev->header.snapc->seq = new_snapid; |
1653 | 1656 | ||
1654 | return 0; | 1657 | return 0; |
1655 | bad: | 1658 | bad: |
1656 | return -ERANGE; | 1659 | return -ERANGE; |
1657 | } | 1660 | } |
1658 | 1661 | ||
1659 | static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev) | 1662 | static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev) |
1660 | { | 1663 | { |
1661 | struct rbd_snap *snap; | 1664 | struct rbd_snap *snap; |
1662 | 1665 | ||
1663 | while (!list_empty(&rbd_dev->snaps)) { | 1666 | while (!list_empty(&rbd_dev->snaps)) { |
1664 | snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node); | 1667 | snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node); |
1665 | __rbd_remove_snap_dev(rbd_dev, snap); | 1668 | __rbd_remove_snap_dev(rbd_dev, snap); |
1666 | } | 1669 | } |
1667 | } | 1670 | } |
1668 | 1671 | ||
/*
 * Refresh the in-core image header from the OSDs.
 *
 * Only the first part of the on-disk header (without the snaps info)
 * is read here via rbd_read_header(); the in-core snapshot device list
 * is then rebuilt with __rbd_init_snaps_header().
 *
 * Returns 0 on success or a negative errno from the header read or the
 * snapshot-list rebuild.
 */
static int __rbd_update_snaps(struct rbd_device *rbd_dev)
{
	int ret;
	struct rbd_image_header h;
	u64 snap_seq;
	int follow_seq = 0;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	/* resized?  propagate the (possibly new) image size to the disk */
	set_capacity(rbd_dev->disk, h.image_size / 512ULL);

	down_write(&rbd_dev->header.snap_rwsem);

	/*
	 * Remember where seq pointed.  If it equals the newest snap id
	 * (snaps[0]) we are "at the head" and must keep following the
	 * head even if it moved in the freshly read header.
	 */
	snap_seq = rbd_dev->header.snapc->seq;
	if (rbd_dev->header.total_snaps &&
	    rbd_dev->header.snapc->snaps[0] == snap_seq)
		/* pointing at the head, will need to follow that
		   if head moves */
		follow_seq = 1;

	/* free the previous generation of snapshot metadata ... */
	kfree(rbd_dev->header.snapc);
	kfree(rbd_dev->header.snap_names);
	kfree(rbd_dev->header.snap_sizes);

	/* ... and adopt the buffers filled in by rbd_read_header() */
	rbd_dev->header.total_snaps = h.total_snaps;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_names_len = h.snap_names_len;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	if (follow_seq)
		rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
	else
		rbd_dev->header.snapc->seq = snap_seq;

	ret = __rbd_init_snaps_header(rbd_dev);

	up_write(&rbd_dev->header.snap_rwsem);

	return ret;
}
1715 | 1718 | ||
/*
 * Read the image header from the OSDs, build the snapshot list, set up
 * the gendisk plus request queue for the mapped image/snapshot, and
 * announce the block device via add_disk().
 *
 * Returns 0 on success or a negative errno.
 * NOTE(review): error paths after rbd_read_header() succeeds do not
 * release the header buffers it populated — confirm the caller's
 * error path cleans them up.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	int rc;
	u64 total_size = 0;

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		return rc;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = __rbd_init_snaps_header(rbd_dev);
	if (rc)
		return rc;

	/* select the mapped snapshot (or head) and learn its size */
	rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
	if (rc)
		return rc;

	/* create gendisk info */
	rc = -ENOMEM;
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		goto out;

	snprintf(disk->disk_name, sizeof(disk->disk_name), DRV_NAME "%d",
		 rbd_dev->id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	rc = -ENOMEM;
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* set io sizes to object size */
	blk_queue_max_hw_sectors(q, rbd_obj_bytes(&rbd_dev->header) / 512ULL);
	blk_queue_max_segment_size(q, rbd_obj_bytes(&rbd_dev->header));
	blk_queue_io_min(q, rbd_obj_bytes(&rbd_dev->header));
	blk_queue_io_opt(q, rbd_obj_bytes(&rbd_dev->header));

	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;
	rbd_dev->q = q;

	/* finally, announce the disk to the world */
	set_capacity(disk, total_size / 512ULL);
	add_disk(disk);

	pr_info("%s: added with size 0x%llx\n",
		disk->disk_name, (unsigned long long)total_size);
	return 0;

out_disk:
	put_disk(disk);
out:
	return rc;
}
1783 | 1786 | ||
1784 | /* | 1787 | /* |
1785 | sysfs | 1788 | sysfs |
1786 | */ | 1789 | */ |
1787 | 1790 | ||
1788 | static ssize_t rbd_size_show(struct device *dev, | 1791 | static ssize_t rbd_size_show(struct device *dev, |
1789 | struct device_attribute *attr, char *buf) | 1792 | struct device_attribute *attr, char *buf) |
1790 | { | 1793 | { |
1791 | struct rbd_device *rbd_dev = dev_to_rbd(dev); | 1794 | struct rbd_device *rbd_dev = dev_to_rbd(dev); |
1792 | 1795 | ||
1793 | return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size); | 1796 | return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size); |
1794 | } | 1797 | } |
1795 | 1798 | ||
1796 | static ssize_t rbd_major_show(struct device *dev, | 1799 | static ssize_t rbd_major_show(struct device *dev, |
1797 | struct device_attribute *attr, char *buf) | 1800 | struct device_attribute *attr, char *buf) |
1798 | { | 1801 | { |
1799 | struct rbd_device *rbd_dev = dev_to_rbd(dev); | 1802 | struct rbd_device *rbd_dev = dev_to_rbd(dev); |
1800 | 1803 | ||
1801 | return sprintf(buf, "%d\n", rbd_dev->major); | 1804 | return sprintf(buf, "%d\n", rbd_dev->major); |
1802 | } | 1805 | } |
1803 | 1806 | ||
1804 | static ssize_t rbd_client_id_show(struct device *dev, | 1807 | static ssize_t rbd_client_id_show(struct device *dev, |
1805 | struct device_attribute *attr, char *buf) | 1808 | struct device_attribute *attr, char *buf) |
1806 | { | 1809 | { |
1807 | struct rbd_device *rbd_dev = dev_to_rbd(dev); | 1810 | struct rbd_device *rbd_dev = dev_to_rbd(dev); |
1808 | 1811 | ||
1809 | return sprintf(buf, "client%lld\n", ceph_client_id(rbd_dev->client)); | 1812 | return sprintf(buf, "client%lld\n", ceph_client_id(rbd_dev->client)); |
1810 | } | 1813 | } |
1811 | 1814 | ||
1812 | static ssize_t rbd_pool_show(struct device *dev, | 1815 | static ssize_t rbd_pool_show(struct device *dev, |
1813 | struct device_attribute *attr, char *buf) | 1816 | struct device_attribute *attr, char *buf) |
1814 | { | 1817 | { |
1815 | struct rbd_device *rbd_dev = dev_to_rbd(dev); | 1818 | struct rbd_device *rbd_dev = dev_to_rbd(dev); |
1816 | 1819 | ||
1817 | return sprintf(buf, "%s\n", rbd_dev->pool_name); | 1820 | return sprintf(buf, "%s\n", rbd_dev->pool_name); |
1818 | } | 1821 | } |
1819 | 1822 | ||
1820 | static ssize_t rbd_name_show(struct device *dev, | 1823 | static ssize_t rbd_name_show(struct device *dev, |
1821 | struct device_attribute *attr, char *buf) | 1824 | struct device_attribute *attr, char *buf) |
1822 | { | 1825 | { |
1823 | struct rbd_device *rbd_dev = dev_to_rbd(dev); | 1826 | struct rbd_device *rbd_dev = dev_to_rbd(dev); |
1824 | 1827 | ||
1825 | return sprintf(buf, "%s\n", rbd_dev->obj); | 1828 | return sprintf(buf, "%s\n", rbd_dev->obj); |
1826 | } | 1829 | } |
1827 | 1830 | ||
1828 | static ssize_t rbd_snap_show(struct device *dev, | 1831 | static ssize_t rbd_snap_show(struct device *dev, |
1829 | struct device_attribute *attr, | 1832 | struct device_attribute *attr, |
1830 | char *buf) | 1833 | char *buf) |
1831 | { | 1834 | { |
1832 | struct rbd_device *rbd_dev = dev_to_rbd(dev); | 1835 | struct rbd_device *rbd_dev = dev_to_rbd(dev); |
1833 | 1836 | ||
1834 | return sprintf(buf, "%s\n", rbd_dev->snap_name); | 1837 | return sprintf(buf, "%s\n", rbd_dev->snap_name); |
1835 | } | 1838 | } |
1836 | 1839 | ||
1837 | static ssize_t rbd_image_refresh(struct device *dev, | 1840 | static ssize_t rbd_image_refresh(struct device *dev, |
1838 | struct device_attribute *attr, | 1841 | struct device_attribute *attr, |
1839 | const char *buf, | 1842 | const char *buf, |
1840 | size_t size) | 1843 | size_t size) |
1841 | { | 1844 | { |
1842 | struct rbd_device *rbd_dev = dev_to_rbd(dev); | 1845 | struct rbd_device *rbd_dev = dev_to_rbd(dev); |
1843 | int rc; | 1846 | int rc; |
1844 | int ret = size; | 1847 | int ret = size; |
1845 | 1848 | ||
1846 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); | 1849 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); |
1847 | 1850 | ||
1848 | rc = __rbd_update_snaps(rbd_dev); | 1851 | rc = __rbd_update_snaps(rbd_dev); |
1849 | if (rc < 0) | 1852 | if (rc < 0) |
1850 | ret = rc; | 1853 | ret = rc; |
1851 | 1854 | ||
1852 | mutex_unlock(&ctl_mutex); | 1855 | mutex_unlock(&ctl_mutex); |
1853 | return ret; | 1856 | return ret; |
1854 | } | 1857 | } |
1855 | 1858 | ||
/* sysfs attributes exported for each mapped rbd device */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/*
 * Empty release stub for the device type.
 * NOTE(review): rbd_bus_add_dev() also assigns dev->release =
 * rbd_dev_release explicitly; confirm which release the driver core
 * actually invokes, and whether this stub is intentionally unused.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name = "rbd",
	.groups = rbd_attr_groups,
	.release = rbd_sysfs_dev_release,
};
1895 | 1898 | ||
1896 | 1899 | ||
1897 | /* | 1900 | /* |
1898 | sysfs - snapshots | 1901 | sysfs - snapshots |
1899 | */ | 1902 | */ |
1900 | 1903 | ||
1901 | static ssize_t rbd_snap_size_show(struct device *dev, | 1904 | static ssize_t rbd_snap_size_show(struct device *dev, |
1902 | struct device_attribute *attr, | 1905 | struct device_attribute *attr, |
1903 | char *buf) | 1906 | char *buf) |
1904 | { | 1907 | { |
1905 | struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); | 1908 | struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); |
1906 | 1909 | ||
1907 | return sprintf(buf, "%lld\n", (long long)snap->size); | 1910 | return sprintf(buf, "%lld\n", (long long)snap->size); |
1908 | } | 1911 | } |
1909 | 1912 | ||
1910 | static ssize_t rbd_snap_id_show(struct device *dev, | 1913 | static ssize_t rbd_snap_id_show(struct device *dev, |
1911 | struct device_attribute *attr, | 1914 | struct device_attribute *attr, |
1912 | char *buf) | 1915 | char *buf) |
1913 | { | 1916 | { |
1914 | struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); | 1917 | struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); |
1915 | 1918 | ||
1916 | return sprintf(buf, "%lld\n", (long long)snap->id); | 1919 | return sprintf(buf, "%lld\n", (long long)snap->id); |
1917 | } | 1920 | } |
1918 | 1921 | ||
/* sysfs attributes exported for each snapshot sub-device */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};
1931 | 1934 | ||
1932 | static void rbd_snap_dev_release(struct device *dev) | 1935 | static void rbd_snap_dev_release(struct device *dev) |
1933 | { | 1936 | { |
1934 | struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); | 1937 | struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); |
1935 | kfree(snap->name); | 1938 | kfree(snap->name); |
1936 | kfree(snap); | 1939 | kfree(snap); |
1937 | } | 1940 | } |
1938 | 1941 | ||
static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

/* device type for snapshot sub-devices; release frees the rbd_snap */
static struct device_type rbd_snap_device_type = {
	.groups = rbd_snap_attr_groups,
	.release = rbd_snap_dev_release,
};
1948 | 1951 | ||
/*
 * Unlink @snap from the device's snapshot list and unregister its
 * device.  The release callback (rbd_snap_dev_release) frees @snap on
 * the final reference drop, so @snap must not be used after this.
 */
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
				  struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
1955 | 1958 | ||
1956 | static int rbd_register_snap_dev(struct rbd_device *rbd_dev, | 1959 | static int rbd_register_snap_dev(struct rbd_device *rbd_dev, |
1957 | struct rbd_snap *snap, | 1960 | struct rbd_snap *snap, |
1958 | struct device *parent) | 1961 | struct device *parent) |
1959 | { | 1962 | { |
1960 | struct device *dev = &snap->dev; | 1963 | struct device *dev = &snap->dev; |
1961 | int ret; | 1964 | int ret; |
1962 | 1965 | ||
1963 | dev->type = &rbd_snap_device_type; | 1966 | dev->type = &rbd_snap_device_type; |
1964 | dev->parent = parent; | 1967 | dev->parent = parent; |
1965 | dev->release = rbd_snap_dev_release; | 1968 | dev->release = rbd_snap_dev_release; |
1966 | dev_set_name(dev, "snap_%s", snap->name); | 1969 | dev_set_name(dev, "snap_%s", snap->name); |
1967 | ret = device_register(dev); | 1970 | ret = device_register(dev); |
1968 | 1971 | ||
1969 | return ret; | 1972 | return ret; |
1970 | } | 1973 | } |
1971 | 1974 | ||
1972 | static int __rbd_add_snap_dev(struct rbd_device *rbd_dev, | 1975 | static int __rbd_add_snap_dev(struct rbd_device *rbd_dev, |
1973 | int i, const char *name, | 1976 | int i, const char *name, |
1974 | struct rbd_snap **snapp) | 1977 | struct rbd_snap **snapp) |
1975 | { | 1978 | { |
1976 | int ret; | 1979 | int ret; |
1977 | struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL); | 1980 | struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL); |
1978 | if (!snap) | 1981 | if (!snap) |
1979 | return -ENOMEM; | 1982 | return -ENOMEM; |
1980 | snap->name = kstrdup(name, GFP_KERNEL); | 1983 | snap->name = kstrdup(name, GFP_KERNEL); |
1981 | snap->size = rbd_dev->header.snap_sizes[i]; | 1984 | snap->size = rbd_dev->header.snap_sizes[i]; |
1982 | snap->id = rbd_dev->header.snapc->snaps[i]; | 1985 | snap->id = rbd_dev->header.snapc->snaps[i]; |
1983 | if (device_is_registered(&rbd_dev->dev)) { | 1986 | if (device_is_registered(&rbd_dev->dev)) { |
1984 | ret = rbd_register_snap_dev(rbd_dev, snap, | 1987 | ret = rbd_register_snap_dev(rbd_dev, snap, |
1985 | &rbd_dev->dev); | 1988 | &rbd_dev->dev); |
1986 | if (ret < 0) | 1989 | if (ret < 0) |
1987 | goto err; | 1990 | goto err; |
1988 | } | 1991 | } |
1989 | *snapp = snap; | 1992 | *snapp = snap; |
1990 | return 0; | 1993 | return 0; |
1991 | err: | 1994 | err: |
1992 | kfree(snap->name); | 1995 | kfree(snap->name); |
1993 | kfree(snap); | 1996 | kfree(snap); |
1994 | return ret; | 1997 | return ret; |
1995 | } | 1998 | } |
1996 | 1999 | ||
/*
 * Search backward for the start of the name that precedes @name in a
 * NUL-delimited list of strings beginning at @start.  Returns NULL
 * when there is no room for a predecessor.
 */
const char *rbd_prev_snap_name(const char *name, const char *start)
{
	const char *p;

	/* need at least one character plus its terminating NUL before us */
	if (name - start < 2)
		return NULL;

	/* step over the previous NUL, then walk back to a NUL or @start */
	for (p = name - 2; *p; p--)
		if (p == start)
			return start;

	return p + 1;
}
2013 | 2016 | ||
/*
 * compare the old list of snapshots that we have to what's in the header
 * and update it accordingly. Note that the header holds the snapshots
 * in a reverse order (from newest to oldest) and we need to go from
 * older to new so that we don't get a duplicate snap name when
 * doing the process (e.g., removed snapshot and recreated a new
 * one with the same name).
 *
 * Walks the existing list backward (oldest first) while co-iterating
 * the header's snap id array (index i counts down) and the
 * NUL-delimited name blob (via rbd_prev_snap_name()); snaps missing
 * from the header are removed, snaps missing from the list are added.
 *
 * Returns 0 on success, -EINVAL on a malformed name blob, or the
 * error from __rbd_add_snap_dev().
 */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
{
	const char *name, *first_name;
	int i = rbd_dev->header.total_snaps;
	struct rbd_snap *snap, *old_snap = NULL;
	int ret;
	struct list_head *p, *n;

	/* start one past the end of the name blob; we walk it backward */
	first_name = rbd_dev->header.snap_names;
	name = first_name + rbd_dev->header.snap_names_len;

	list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
		u64 cur_id;

		old_snap = list_entry(p, struct rbd_snap, node);

		/* oldest remaining header snap (cur_id unused when i == 0) */
		if (i)
			cur_id = rbd_dev->header.snapc->snaps[i - 1];

		if (!i || old_snap->id < cur_id) {
			/* old_snap->id was skipped, thus was removed */
			__rbd_remove_snap_dev(rbd_dev, old_snap);
			continue;
		}
		if (old_snap->id == cur_id) {
			/* we have this snapshot already */
			i--;
			name = rbd_prev_snap_name(name, first_name);
			continue;
		}
		/* old_snap->id > cur_id: header has snaps we don't know yet */
		for (; i > 0;
		     i--, name = rbd_prev_snap_name(name, first_name)) {
			if (!name) {
				WARN_ON(1);
				return -EINVAL;
			}
			/*
			 * NOTE(review): reads snaps[i] here while the add
			 * below uses slot i - 1 — confirm this asymmetry
			 * is the intended indexing.
			 */
			cur_id = rbd_dev->header.snapc->snaps[i];
			/* snapshot removal? handle it above */
			if (cur_id >= old_snap->id)
				break;
			/* a new snapshot */
			ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
			if (ret < 0)
				return ret;

			/* note that we add it backward so using n and not p */
			list_add(&snap->node, n);
			p = &snap->node;
		}
	}
	/* we're done going over the old snap list, just add what's left */
	for (; i > 0; i--) {
		name = rbd_prev_snap_name(name, first_name);
		if (!name) {
			WARN_ON(1);
			return -EINVAL;
		}
		ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
		if (ret < 0)
			return ret;
		list_add(&snap->node, &rbd_dev->snaps);
	}

	return 0;
}
2087 | 2090 | ||
2088 | 2091 | ||
/* release for the statically allocated root device; nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}

/* parent device of every rbd device (see rbd_bus_add_dev()) */
static struct device rbd_root_dev = {
	.init_name = "rbd",
	.release = rbd_root_dev_release,
};
2097 | 2100 | ||
/*
 * Register @rbd_dev (and all of its currently known snapshots) with
 * the driver core under the rbd bus, serialized by ctl_mutex.
 *
 * NOTE(review): a failure from rbd_register_snap_dev() only breaks
 * out of the loop — the function still returns 0 with some snaps
 * unregistered.  Also the "done_free" label frees nothing; both look
 * worth confirming against the callers' expectations.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	int ret = -ENOMEM;
	struct device *dev;
	struct rbd_snap *snap;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	dev = &rbd_dev->dev;

	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->id);
	ret = device_register(dev);
	if (ret < 0)
		goto done_free;

	/* expose the already-built snapshot list in sysfs */
	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		ret = rbd_register_snap_dev(rbd_dev, snap,
					     &rbd_dev->dev);
		if (ret < 0)
			break;
	}

	mutex_unlock(&ctl_mutex);
	return 0;
done_free:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2129 | 2132 | ||
/* tear down the device-model presence created by rbd_bus_add_dev() */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2134 | 2137 | ||
2135 | static int rbd_init_watch_dev(struct rbd_device *rbd_dev) | 2138 | static int rbd_init_watch_dev(struct rbd_device *rbd_dev) |
2136 | { | 2139 | { |
2137 | int ret, rc; | 2140 | int ret, rc; |
2138 | 2141 | ||
2139 | do { | 2142 | do { |
2140 | ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name, | 2143 | ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name, |
2141 | rbd_dev->header.obj_version); | 2144 | rbd_dev->header.obj_version); |
2142 | if (ret == -ERANGE) { | 2145 | if (ret == -ERANGE) { |
2143 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); | 2146 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); |
2144 | rc = __rbd_update_snaps(rbd_dev); | 2147 | rc = __rbd_update_snaps(rbd_dev); |
2145 | mutex_unlock(&ctl_mutex); | 2148 | mutex_unlock(&ctl_mutex); |
2146 | if (rc < 0) | 2149 | if (rc < 0) |
2147 | return rc; | 2150 | return rc; |
2148 | } | 2151 | } |
2149 | } while (ret == -ERANGE); | 2152 | } while (ret == -ERANGE); |
2150 | 2153 | ||
2151 | return ret; | 2154 | return ret; |
2152 | } | 2155 | } |
2153 | 2156 | ||
2154 | static ssize_t rbd_add(struct bus_type *bus, | 2157 | static ssize_t rbd_add(struct bus_type *bus, |
2155 | const char *buf, | 2158 | const char *buf, |
2156 | size_t count) | 2159 | size_t count) |
2157 | { | 2160 | { |
2158 | struct ceph_osd_client *osdc; | 2161 | struct ceph_osd_client *osdc; |
2159 | struct rbd_device *rbd_dev; | 2162 | struct rbd_device *rbd_dev; |
2160 | ssize_t rc = -ENOMEM; | 2163 | ssize_t rc = -ENOMEM; |
2161 | int irc, new_id = 0; | 2164 | int irc, new_id = 0; |
2162 | struct list_head *tmp; | 2165 | struct list_head *tmp; |
2163 | char *mon_dev_name; | 2166 | char *mon_dev_name; |
2164 | char *options; | 2167 | char *options; |
2165 | 2168 | ||
2166 | if (!try_module_get(THIS_MODULE)) | 2169 | if (!try_module_get(THIS_MODULE)) |
2167 | return -ENODEV; | 2170 | return -ENODEV; |
2168 | 2171 | ||
2169 | mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL); | 2172 | mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL); |
2170 | if (!mon_dev_name) | 2173 | if (!mon_dev_name) |
2171 | goto err_out_mod; | 2174 | goto err_out_mod; |
2172 | 2175 | ||
2173 | options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL); | 2176 | options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL); |
2174 | if (!options) | 2177 | if (!options) |
2175 | goto err_mon_dev; | 2178 | goto err_mon_dev; |
2176 | 2179 | ||
2177 | /* new rbd_device object */ | 2180 | /* new rbd_device object */ |
2178 | rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); | 2181 | rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); |
2179 | if (!rbd_dev) | 2182 | if (!rbd_dev) |
2180 | goto err_out_opt; | 2183 | goto err_out_opt; |
2181 | 2184 | ||
2182 | /* static rbd_device initialization */ | 2185 | /* static rbd_device initialization */ |
2183 | spin_lock_init(&rbd_dev->lock); | 2186 | spin_lock_init(&rbd_dev->lock); |
2184 | INIT_LIST_HEAD(&rbd_dev->node); | 2187 | INIT_LIST_HEAD(&rbd_dev->node); |
2185 | INIT_LIST_HEAD(&rbd_dev->snaps); | 2188 | INIT_LIST_HEAD(&rbd_dev->snaps); |
2186 | 2189 | ||
2187 | init_rwsem(&rbd_dev->header.snap_rwsem); | 2190 | init_rwsem(&rbd_dev->header.snap_rwsem); |
2188 | 2191 | ||
2189 | /* generate unique id: find highest unique id, add one */ | 2192 | /* generate unique id: find highest unique id, add one */ |
2190 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); | 2193 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); |
2191 | 2194 | ||
2192 | list_for_each(tmp, &rbd_dev_list) { | 2195 | list_for_each(tmp, &rbd_dev_list) { |
2193 | struct rbd_device *rbd_dev; | 2196 | struct rbd_device *rbd_dev; |
2194 | 2197 | ||
2195 | rbd_dev = list_entry(tmp, struct rbd_device, node); | 2198 | rbd_dev = list_entry(tmp, struct rbd_device, node); |
2196 | if (rbd_dev->id >= new_id) | 2199 | if (rbd_dev->id >= new_id) |
2197 | new_id = rbd_dev->id + 1; | 2200 | new_id = rbd_dev->id + 1; |
2198 | } | 2201 | } |
2199 | 2202 | ||
2200 | rbd_dev->id = new_id; | 2203 | rbd_dev->id = new_id; |
2201 | 2204 | ||
2202 | /* add to global list */ | 2205 | /* add to global list */ |
2203 | list_add_tail(&rbd_dev->node, &rbd_dev_list); | 2206 | list_add_tail(&rbd_dev->node, &rbd_dev_list); |
2204 | 2207 | ||
2205 | /* parse add command */ | 2208 | /* parse add command */ |
2206 | if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s " | 2209 | if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s " |
2207 | "%" __stringify(RBD_MAX_OPT_LEN) "s " | 2210 | "%" __stringify(RBD_MAX_OPT_LEN) "s " |
2208 | "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s " | 2211 | "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s " |
2209 | "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s" | 2212 | "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s" |
2210 | "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s", | 2213 | "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s", |
2211 | mon_dev_name, options, rbd_dev->pool_name, | 2214 | mon_dev_name, options, rbd_dev->pool_name, |
2212 | rbd_dev->obj, rbd_dev->snap_name) < 4) { | 2215 | rbd_dev->obj, rbd_dev->snap_name) < 4) { |
2213 | rc = -EINVAL; | 2216 | rc = -EINVAL; |
2214 | goto err_out_slot; | 2217 | goto err_out_slot; |
2215 | } | 2218 | } |
2216 | 2219 | ||
2217 | if (rbd_dev->snap_name[0] == 0) | 2220 | if (rbd_dev->snap_name[0] == 0) |
2218 | rbd_dev->snap_name[0] = '-'; | 2221 | rbd_dev->snap_name[0] = '-'; |
2219 | 2222 | ||
2220 | rbd_dev->obj_len = strlen(rbd_dev->obj); | 2223 | rbd_dev->obj_len = strlen(rbd_dev->obj); |
2221 | snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s", | 2224 | snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s", |
2222 | rbd_dev->obj, RBD_SUFFIX); | 2225 | rbd_dev->obj, RBD_SUFFIX); |
2223 | 2226 | ||
2224 | /* initialize rest of new object */ | 2227 | /* initialize rest of new object */ |
2225 | snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id); | 2228 | snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id); |
2226 | rc = rbd_get_client(rbd_dev, mon_dev_name, options); | 2229 | rc = rbd_get_client(rbd_dev, mon_dev_name, options); |
2227 | if (rc < 0) | 2230 | if (rc < 0) |
2228 | goto err_out_slot; | 2231 | goto err_out_slot; |
2229 | 2232 | ||
2230 | mutex_unlock(&ctl_mutex); | 2233 | mutex_unlock(&ctl_mutex); |
2231 | 2234 | ||
2232 | /* pick the pool */ | 2235 | /* pick the pool */ |
2233 | osdc = &rbd_dev->client->osdc; | 2236 | osdc = &rbd_dev->client->osdc; |
2234 | rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name); | 2237 | rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name); |
2235 | if (rc < 0) | 2238 | if (rc < 0) |
2236 | goto err_out_client; | 2239 | goto err_out_client; |
2237 | rbd_dev->poolid = rc; | 2240 | rbd_dev->poolid = rc; |
2238 | 2241 | ||
2239 | /* register our block device */ | 2242 | /* register our block device */ |
2240 | irc = register_blkdev(0, rbd_dev->name); | 2243 | irc = register_blkdev(0, rbd_dev->name); |
2241 | if (irc < 0) { | 2244 | if (irc < 0) { |
2242 | rc = irc; | 2245 | rc = irc; |
2243 | goto err_out_client; | 2246 | goto err_out_client; |
2244 | } | 2247 | } |
2245 | rbd_dev->major = irc; | 2248 | rbd_dev->major = irc; |
2246 | 2249 | ||
2247 | rc = rbd_bus_add_dev(rbd_dev); | 2250 | rc = rbd_bus_add_dev(rbd_dev); |
2248 | if (rc) | 2251 | if (rc) |
2249 | goto err_out_blkdev; | 2252 | goto err_out_blkdev; |
2250 | 2253 | ||
2251 | /* set up and announce blkdev mapping */ | 2254 | /* set up and announce blkdev mapping */ |
2252 | rc = rbd_init_disk(rbd_dev); | 2255 | rc = rbd_init_disk(rbd_dev); |
2253 | if (rc) | 2256 | if (rc) |
2254 | goto err_out_bus; | 2257 | goto err_out_bus; |
2255 | 2258 | ||
2256 | rc = rbd_init_watch_dev(rbd_dev); | 2259 | rc = rbd_init_watch_dev(rbd_dev); |
2257 | if (rc) | 2260 | if (rc) |
2258 | goto err_out_bus; | 2261 | goto err_out_bus; |
2259 | 2262 | ||
2260 | return count; | 2263 | return count; |
2261 | 2264 | ||
2262 | err_out_bus: | 2265 | err_out_bus: |
2263 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); | 2266 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); |
2264 | list_del_init(&rbd_dev->node); | 2267 | list_del_init(&rbd_dev->node); |
2265 | mutex_unlock(&ctl_mutex); | 2268 | mutex_unlock(&ctl_mutex); |
2266 | 2269 | ||
2267 | /* this will also clean up rest of rbd_dev stuff */ | 2270 | /* this will also clean up rest of rbd_dev stuff */ |
2268 | 2271 | ||
2269 | rbd_bus_del_dev(rbd_dev); | 2272 | rbd_bus_del_dev(rbd_dev); |
2270 | kfree(options); | 2273 | kfree(options); |
2271 | kfree(mon_dev_name); | 2274 | kfree(mon_dev_name); |
2272 | return rc; | 2275 | return rc; |
2273 | 2276 | ||
2274 | err_out_blkdev: | 2277 | err_out_blkdev: |
2275 | unregister_blkdev(rbd_dev->major, rbd_dev->name); | 2278 | unregister_blkdev(rbd_dev->major, rbd_dev->name); |
2276 | err_out_client: | 2279 | err_out_client: |
2277 | rbd_put_client(rbd_dev); | 2280 | rbd_put_client(rbd_dev); |
2278 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); | 2281 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); |
2279 | err_out_slot: | 2282 | err_out_slot: |
2280 | list_del_init(&rbd_dev->node); | 2283 | list_del_init(&rbd_dev->node); |
2281 | mutex_unlock(&ctl_mutex); | 2284 | mutex_unlock(&ctl_mutex); |
2282 | 2285 | ||
2283 | kfree(rbd_dev); | 2286 | kfree(rbd_dev); |
2284 | err_out_opt: | 2287 | err_out_opt: |
2285 | kfree(options); | 2288 | kfree(options); |
2286 | err_mon_dev: | 2289 | err_mon_dev: |
2287 | kfree(mon_dev_name); | 2290 | kfree(mon_dev_name); |
2288 | err_out_mod: | 2291 | err_out_mod: |
2289 | dout("Error adding device %s\n", buf); | 2292 | dout("Error adding device %s\n", buf); |
2290 | module_put(THIS_MODULE); | 2293 | module_put(THIS_MODULE); |
2291 | return rc; | 2294 | return rc; |
2292 | } | 2295 | } |
2293 | 2296 | ||
2294 | static struct rbd_device *__rbd_get_dev(unsigned long id) | 2297 | static struct rbd_device *__rbd_get_dev(unsigned long id) |
2295 | { | 2298 | { |
2296 | struct list_head *tmp; | 2299 | struct list_head *tmp; |
2297 | struct rbd_device *rbd_dev; | 2300 | struct rbd_device *rbd_dev; |
2298 | 2301 | ||
2299 | list_for_each(tmp, &rbd_dev_list) { | 2302 | list_for_each(tmp, &rbd_dev_list) { |
2300 | rbd_dev = list_entry(tmp, struct rbd_device, node); | 2303 | rbd_dev = list_entry(tmp, struct rbd_device, node); |
2301 | if (rbd_dev->id == id) | 2304 | if (rbd_dev->id == id) |
2302 | return rbd_dev; | 2305 | return rbd_dev; |
2303 | } | 2306 | } |
2304 | return NULL; | 2307 | return NULL; |
2305 | } | 2308 | } |
2306 | 2309 | ||
/*
 * Release callback for an rbd device's embedded struct device.
 *
 * Runs when the device's last reference is dropped: tears down any
 * header-watch state, drops the ceph client reference, frees the disk
 * and block-device registration, and finally releases the module
 * reference taken in rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev =
			container_of(dev, struct rbd_device, dev);

	/* unregister the lingering watch request, if one was set up */
	if (rbd_dev->watch_request)
		ceph_osdc_unregister_linger_request(&rbd_dev->client->osdc,
						    rbd_dev->watch_request);
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
2328 | 2331 | ||
2329 | static ssize_t rbd_remove(struct bus_type *bus, | 2332 | static ssize_t rbd_remove(struct bus_type *bus, |
2330 | const char *buf, | 2333 | const char *buf, |
2331 | size_t count) | 2334 | size_t count) |
2332 | { | 2335 | { |
2333 | struct rbd_device *rbd_dev = NULL; | 2336 | struct rbd_device *rbd_dev = NULL; |
2334 | int target_id, rc; | 2337 | int target_id, rc; |
2335 | unsigned long ul; | 2338 | unsigned long ul; |
2336 | int ret = count; | 2339 | int ret = count; |
2337 | 2340 | ||
2338 | rc = strict_strtoul(buf, 10, &ul); | 2341 | rc = strict_strtoul(buf, 10, &ul); |
2339 | if (rc) | 2342 | if (rc) |
2340 | return rc; | 2343 | return rc; |
2341 | 2344 | ||
2342 | /* convert to int; abort if we lost anything in the conversion */ | 2345 | /* convert to int; abort if we lost anything in the conversion */ |
2343 | target_id = (int) ul; | 2346 | target_id = (int) ul; |
2344 | if (target_id != ul) | 2347 | if (target_id != ul) |
2345 | return -EINVAL; | 2348 | return -EINVAL; |
2346 | 2349 | ||
2347 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); | 2350 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); |
2348 | 2351 | ||
2349 | rbd_dev = __rbd_get_dev(target_id); | 2352 | rbd_dev = __rbd_get_dev(target_id); |
2350 | if (!rbd_dev) { | 2353 | if (!rbd_dev) { |
2351 | ret = -ENOENT; | 2354 | ret = -ENOENT; |
2352 | goto done; | 2355 | goto done; |
2353 | } | 2356 | } |
2354 | 2357 | ||
2355 | list_del_init(&rbd_dev->node); | 2358 | list_del_init(&rbd_dev->node); |
2356 | 2359 | ||
2357 | __rbd_remove_all_snaps(rbd_dev); | 2360 | __rbd_remove_all_snaps(rbd_dev); |
2358 | rbd_bus_del_dev(rbd_dev); | 2361 | rbd_bus_del_dev(rbd_dev); |
2359 | 2362 | ||
2360 | done: | 2363 | done: |
2361 | mutex_unlock(&ctl_mutex); | 2364 | mutex_unlock(&ctl_mutex); |
2362 | return ret; | 2365 | return ret; |
2363 | } | 2366 | } |
2364 | 2367 | ||
2365 | static ssize_t rbd_snap_add(struct device *dev, | 2368 | static ssize_t rbd_snap_add(struct device *dev, |
2366 | struct device_attribute *attr, | 2369 | struct device_attribute *attr, |
2367 | const char *buf, | 2370 | const char *buf, |
2368 | size_t count) | 2371 | size_t count) |
2369 | { | 2372 | { |
2370 | struct rbd_device *rbd_dev = dev_to_rbd(dev); | 2373 | struct rbd_device *rbd_dev = dev_to_rbd(dev); |
2371 | int ret; | 2374 | int ret; |
2372 | char *name = kmalloc(count + 1, GFP_KERNEL); | 2375 | char *name = kmalloc(count + 1, GFP_KERNEL); |
2373 | if (!name) | 2376 | if (!name) |
2374 | return -ENOMEM; | 2377 | return -ENOMEM; |
2375 | 2378 | ||
2376 | snprintf(name, count, "%s", buf); | 2379 | snprintf(name, count, "%s", buf); |
2377 | 2380 | ||
2378 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); | 2381 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); |
2379 | 2382 | ||
2380 | ret = rbd_header_add_snap(rbd_dev, | 2383 | ret = rbd_header_add_snap(rbd_dev, |
2381 | name, GFP_KERNEL); | 2384 | name, GFP_KERNEL); |
2382 | if (ret < 0) | 2385 | if (ret < 0) |
2383 | goto err_unlock; | 2386 | goto err_unlock; |
2384 | 2387 | ||
2385 | ret = __rbd_update_snaps(rbd_dev); | 2388 | ret = __rbd_update_snaps(rbd_dev); |
2386 | if (ret < 0) | 2389 | if (ret < 0) |
2387 | goto err_unlock; | 2390 | goto err_unlock; |
2388 | 2391 | ||
2389 | /* shouldn't hold ctl_mutex when notifying.. notify might | 2392 | /* shouldn't hold ctl_mutex when notifying.. notify might |
2390 | trigger a watch callback that would need to get that mutex */ | 2393 | trigger a watch callback that would need to get that mutex */ |
2391 | mutex_unlock(&ctl_mutex); | 2394 | mutex_unlock(&ctl_mutex); |
2392 | 2395 | ||
2393 | /* make a best effort, don't error if failed */ | 2396 | /* make a best effort, don't error if failed */ |
2394 | rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name); | 2397 | rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name); |
2395 | 2398 | ||
2396 | ret = count; | 2399 | ret = count; |
2397 | kfree(name); | 2400 | kfree(name); |
2398 | return ret; | 2401 | return ret; |
2399 | 2402 | ||
2400 | err_unlock: | 2403 | err_unlock: |
2401 | mutex_unlock(&ctl_mutex); | 2404 | mutex_unlock(&ctl_mutex); |
2402 | kfree(name); | 2405 | kfree(name); |
2403 | return ret; | 2406 | return ret; |
2404 | } | 2407 | } |
2405 | 2408 | ||
/*
 * Bus-level sysfs attributes: /sys/bus/rbd/add and /sys/bus/rbd/remove,
 * write-only for root, used to map and unmap rbd images.
 */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};
2411 | 2414 | ||
2412 | /* | 2415 | /* |
2413 | * create control files in sysfs | 2416 | * create control files in sysfs |
2414 | * /sys/bus/rbd/... | 2417 | * /sys/bus/rbd/... |
2415 | */ | 2418 | */ |
2416 | static int rbd_sysfs_init(void) | 2419 | static int rbd_sysfs_init(void) |
2417 | { | 2420 | { |
2418 | int ret; | 2421 | int ret; |
2419 | 2422 | ||
2420 | rbd_bus_type.bus_attrs = rbd_bus_attrs; | 2423 | rbd_bus_type.bus_attrs = rbd_bus_attrs; |
2421 | 2424 | ||
2422 | ret = bus_register(&rbd_bus_type); | 2425 | ret = bus_register(&rbd_bus_type); |
2423 | if (ret < 0) | 2426 | if (ret < 0) |
2424 | return ret; | 2427 | return ret; |
2425 | 2428 | ||
2426 | ret = device_register(&rbd_root_dev); | 2429 | ret = device_register(&rbd_root_dev); |
2427 | 2430 | ||
2428 | return ret; | 2431 | return ret; |
2429 | } | 2432 | } |
2430 | 2433 | ||
/*
 * Remove the sysfs control files; mirror image of rbd_sysfs_init()
 * (the root device first, then the bus it hangs off).
 */
static void rbd_sysfs_cleanup(void)
{
	device_unregister(&rbd_root_dev);
	bus_unregister(&rbd_bus_type);
}
2436 | 2439 | ||
2437 | int __init rbd_init(void) | 2440 | int __init rbd_init(void) |
2438 | { | 2441 | { |
2439 | int rc; | 2442 | int rc; |
2440 | 2443 | ||
2441 | rc = rbd_sysfs_init(); | 2444 | rc = rbd_sysfs_init(); |
2442 | if (rc) | 2445 | if (rc) |
2443 | return rc; | 2446 | return rc; |
2444 | spin_lock_init(&node_lock); | 2447 | spin_lock_init(&node_lock); |
2445 | pr_info("loaded " DRV_NAME_LONG "\n"); | 2448 | pr_info("loaded " DRV_NAME_LONG "\n"); |
2446 | return 0; | 2449 | return 0; |
2447 | } | 2450 | } |
2448 | 2451 | ||
/*
 * Module exit: tear down the sysfs control interface.  Mapped devices
 * each hold a module reference (taken in rbd_add, dropped in
 * rbd_dev_release), so we only get here once none remain.
 */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2453 | 2456 | ||
2454 | module_init(rbd_init); | 2457 | module_init(rbd_init); |
2455 | module_exit(rbd_exit); | 2458 | module_exit(rbd_exit); |
2456 | 2459 | ||
2457 | MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); | 2460 | MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); |
2458 | MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); | 2461 | MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); |
2459 | MODULE_DESCRIPTION("rados block device"); | 2462 | MODULE_DESCRIPTION("rados block device"); |
2460 | 2463 | ||
2461 | /* following authorship retained from original osdblk.c */ | 2464 | /* following authorship retained from original osdblk.c */ |
2462 | MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); | 2465 | MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); |
2463 | 2466 |
fs/ceph/caps.c
1 | #include <linux/ceph/ceph_debug.h> | 1 | #include <linux/ceph/ceph_debug.h> |
2 | 2 | ||
3 | #include <linux/fs.h> | 3 | #include <linux/fs.h> |
4 | #include <linux/kernel.h> | 4 | #include <linux/kernel.h> |
5 | #include <linux/sched.h> | 5 | #include <linux/sched.h> |
6 | #include <linux/slab.h> | 6 | #include <linux/slab.h> |
7 | #include <linux/vmalloc.h> | 7 | #include <linux/vmalloc.h> |
8 | #include <linux/wait.h> | 8 | #include <linux/wait.h> |
9 | #include <linux/writeback.h> | 9 | #include <linux/writeback.h> |
10 | 10 | ||
11 | #include "super.h" | 11 | #include "super.h" |
12 | #include "mds_client.h" | 12 | #include "mds_client.h" |
13 | #include <linux/ceph/decode.h> | 13 | #include <linux/ceph/decode.h> |
14 | #include <linux/ceph/messenger.h> | 14 | #include <linux/ceph/messenger.h> |
15 | 15 | ||
16 | /* | 16 | /* |
17 | * Capability management | 17 | * Capability management |
18 | * | 18 | * |
19 | * The Ceph metadata servers control client access to inode metadata | 19 | * The Ceph metadata servers control client access to inode metadata |
20 | * and file data by issuing capabilities, granting clients permission | 20 | * and file data by issuing capabilities, granting clients permission |
21 | * to read and/or write both inode field and file data to OSDs | 21 | * to read and/or write both inode field and file data to OSDs |
22 | * (storage nodes). Each capability consists of a set of bits | 22 | * (storage nodes). Each capability consists of a set of bits |
23 | * indicating which operations are allowed. | 23 | * indicating which operations are allowed. |
24 | * | 24 | * |
25 | * If the client holds a *_SHARED cap, the client has a coherent value | 25 | * If the client holds a *_SHARED cap, the client has a coherent value |
26 | * that can be safely read from the cached inode. | 26 | * that can be safely read from the cached inode. |
27 | * | 27 | * |
28 | * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the | 28 | * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the |
29 | * client is allowed to change inode attributes (e.g., file size, | 29 | * client is allowed to change inode attributes (e.g., file size, |
30 | * mtime), note its dirty state in the ceph_cap, and asynchronously | 30 | * mtime), note its dirty state in the ceph_cap, and asynchronously |
31 | * flush that metadata change to the MDS. | 31 | * flush that metadata change to the MDS. |
32 | * | 32 | * |
33 | * In the event of a conflicting operation (perhaps by another | 33 | * In the event of a conflicting operation (perhaps by another |
34 | * client), the MDS will revoke the conflicting client capabilities. | 34 | * client), the MDS will revoke the conflicting client capabilities. |
35 | * | 35 | * |
36 | * In order for a client to cache an inode, it must hold a capability | 36 | * In order for a client to cache an inode, it must hold a capability |
37 | * with at least one MDS server. When inodes are released, release | 37 | * with at least one MDS server. When inodes are released, release |
38 | * notifications are batched and periodically sent en masse to the MDS | 38 | * notifications are batched and periodically sent en masse to the MDS |
39 | * cluster to release server state. | 39 | * cluster to release server state. |
40 | */ | 40 | */ |
41 | 41 | ||
42 | 42 | ||
43 | /* | 43 | /* |
44 | * Generate readable cap strings for debugging output. | 44 | * Generate readable cap strings for debugging output. |
45 | */ | 45 | */ |
46 | #define MAX_CAP_STR 20 | 46 | #define MAX_CAP_STR 20 |
47 | static char cap_str[MAX_CAP_STR][40]; | 47 | static char cap_str[MAX_CAP_STR][40]; |
48 | static DEFINE_SPINLOCK(cap_str_lock); | 48 | static DEFINE_SPINLOCK(cap_str_lock); |
49 | static int last_cap_str; | 49 | static int last_cap_str; |
50 | 50 | ||
51 | static char *gcap_string(char *s, int c) | 51 | static char *gcap_string(char *s, int c) |
52 | { | 52 | { |
53 | if (c & CEPH_CAP_GSHARED) | 53 | if (c & CEPH_CAP_GSHARED) |
54 | *s++ = 's'; | 54 | *s++ = 's'; |
55 | if (c & CEPH_CAP_GEXCL) | 55 | if (c & CEPH_CAP_GEXCL) |
56 | *s++ = 'x'; | 56 | *s++ = 'x'; |
57 | if (c & CEPH_CAP_GCACHE) | 57 | if (c & CEPH_CAP_GCACHE) |
58 | *s++ = 'c'; | 58 | *s++ = 'c'; |
59 | if (c & CEPH_CAP_GRD) | 59 | if (c & CEPH_CAP_GRD) |
60 | *s++ = 'r'; | 60 | *s++ = 'r'; |
61 | if (c & CEPH_CAP_GWR) | 61 | if (c & CEPH_CAP_GWR) |
62 | *s++ = 'w'; | 62 | *s++ = 'w'; |
63 | if (c & CEPH_CAP_GBUFFER) | 63 | if (c & CEPH_CAP_GBUFFER) |
64 | *s++ = 'b'; | 64 | *s++ = 'b'; |
65 | if (c & CEPH_CAP_GLAZYIO) | 65 | if (c & CEPH_CAP_GLAZYIO) |
66 | *s++ = 'l'; | 66 | *s++ = 'l'; |
67 | return s; | 67 | return s; |
68 | } | 68 | } |
69 | 69 | ||
/*
 * Render @caps as a human-readable string (e.g. "pAsxFsxcrwb").
 *
 * Results live in a small ring of static buffers (MAX_CAP_STR deep),
 * so a few recent strings remain valid at once; callers must not hold
 * the returned pointer long-term.
 */
const char *ceph_cap_string(int caps)
{
	int i;
	char *s;
	int c;

	/* claim the next ring slot; the lock only guards the index */
	spin_lock(&cap_str_lock);
	i = last_cap_str++;
	if (last_cap_str == MAX_CAP_STR)
		last_cap_str = 0;
	spin_unlock(&cap_str_lock);

	s = cap_str[i];

	if (caps & CEPH_CAP_PIN)
		*s++ = 'p';

	/* each section is two bits wide, hence the "& 3" masks */
	c = (caps >> CEPH_CAP_SAUTH) & 3;
	if (c) {
		*s++ = 'A';
		s = gcap_string(s, c);
	}

	c = (caps >> CEPH_CAP_SLINK) & 3;
	if (c) {
		*s++ = 'L';
		s = gcap_string(s, c);
	}

	c = (caps >> CEPH_CAP_SXATTR) & 3;
	if (c) {
		*s++ = 'X';
		s = gcap_string(s, c);
	}

	/* FILE caps occupy the top bits, so no mask is applied here */
	c = caps >> CEPH_CAP_SFILE;
	if (c) {
		*s++ = 'F';
		s = gcap_string(s, c);
	}

	if (s == cap_str[i])
		*s++ = '-';	/* no caps at all */
	*s = 0;
	return cap_str[i];
}
116 | 116 | ||
/*
 * Initialize the per-mdsc preallocated-cap pool (list and its lock).
 */
void ceph_caps_init(struct ceph_mds_client *mdsc)
{
	INIT_LIST_HEAD(&mdsc->caps_list);
	spin_lock_init(&mdsc->caps_list_lock);
}
122 | 122 | ||
123 | void ceph_caps_finalize(struct ceph_mds_client *mdsc) | 123 | void ceph_caps_finalize(struct ceph_mds_client *mdsc) |
124 | { | 124 | { |
125 | struct ceph_cap *cap; | 125 | struct ceph_cap *cap; |
126 | 126 | ||
127 | spin_lock(&mdsc->caps_list_lock); | 127 | spin_lock(&mdsc->caps_list_lock); |
128 | while (!list_empty(&mdsc->caps_list)) { | 128 | while (!list_empty(&mdsc->caps_list)) { |
129 | cap = list_first_entry(&mdsc->caps_list, | 129 | cap = list_first_entry(&mdsc->caps_list, |
130 | struct ceph_cap, caps_item); | 130 | struct ceph_cap, caps_item); |
131 | list_del(&cap->caps_item); | 131 | list_del(&cap->caps_item); |
132 | kmem_cache_free(ceph_cap_cachep, cap); | 132 | kmem_cache_free(ceph_cap_cachep, cap); |
133 | } | 133 | } |
134 | mdsc->caps_total_count = 0; | 134 | mdsc->caps_total_count = 0; |
135 | mdsc->caps_avail_count = 0; | 135 | mdsc->caps_avail_count = 0; |
136 | mdsc->caps_use_count = 0; | 136 | mdsc->caps_use_count = 0; |
137 | mdsc->caps_reserve_count = 0; | 137 | mdsc->caps_reserve_count = 0; |
138 | mdsc->caps_min_count = 0; | 138 | mdsc->caps_min_count = 0; |
139 | spin_unlock(&mdsc->caps_list_lock); | 139 | spin_unlock(&mdsc->caps_list_lock); |
140 | } | 140 | } |
141 | 141 | ||
/*
 * Adjust the number of caps to keep preallocated around (@delta may be
 * negative); the floor must never drop below zero.
 */
void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
{
	spin_lock(&mdsc->caps_list_lock);
	mdsc->caps_min_count += delta;
	BUG_ON(mdsc->caps_min_count < 0);
	spin_unlock(&mdsc->caps_list_lock);
}
149 | 149 | ||
150 | int ceph_reserve_caps(struct ceph_mds_client *mdsc, | 150 | int ceph_reserve_caps(struct ceph_mds_client *mdsc, |
151 | struct ceph_cap_reservation *ctx, int need) | 151 | struct ceph_cap_reservation *ctx, int need) |
152 | { | 152 | { |
153 | int i; | 153 | int i; |
154 | struct ceph_cap *cap; | 154 | struct ceph_cap *cap; |
155 | int have; | 155 | int have; |
156 | int alloc = 0; | 156 | int alloc = 0; |
157 | LIST_HEAD(newcaps); | 157 | LIST_HEAD(newcaps); |
158 | int ret = 0; | 158 | int ret = 0; |
159 | 159 | ||
160 | dout("reserve caps ctx=%p need=%d\n", ctx, need); | 160 | dout("reserve caps ctx=%p need=%d\n", ctx, need); |
161 | 161 | ||
162 | /* first reserve any caps that are already allocated */ | 162 | /* first reserve any caps that are already allocated */ |
163 | spin_lock(&mdsc->caps_list_lock); | 163 | spin_lock(&mdsc->caps_list_lock); |
164 | if (mdsc->caps_avail_count >= need) | 164 | if (mdsc->caps_avail_count >= need) |
165 | have = need; | 165 | have = need; |
166 | else | 166 | else |
167 | have = mdsc->caps_avail_count; | 167 | have = mdsc->caps_avail_count; |
168 | mdsc->caps_avail_count -= have; | 168 | mdsc->caps_avail_count -= have; |
169 | mdsc->caps_reserve_count += have; | 169 | mdsc->caps_reserve_count += have; |
170 | BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + | 170 | BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + |
171 | mdsc->caps_reserve_count + | 171 | mdsc->caps_reserve_count + |
172 | mdsc->caps_avail_count); | 172 | mdsc->caps_avail_count); |
173 | spin_unlock(&mdsc->caps_list_lock); | 173 | spin_unlock(&mdsc->caps_list_lock); |
174 | 174 | ||
175 | for (i = have; i < need; i++) { | 175 | for (i = have; i < need; i++) { |
176 | cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); | 176 | cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); |
177 | if (!cap) { | 177 | if (!cap) { |
178 | ret = -ENOMEM; | 178 | ret = -ENOMEM; |
179 | goto out_alloc_count; | 179 | goto out_alloc_count; |
180 | } | 180 | } |
181 | list_add(&cap->caps_item, &newcaps); | 181 | list_add(&cap->caps_item, &newcaps); |
182 | alloc++; | 182 | alloc++; |
183 | } | 183 | } |
184 | BUG_ON(have + alloc != need); | 184 | BUG_ON(have + alloc != need); |
185 | 185 | ||
186 | spin_lock(&mdsc->caps_list_lock); | 186 | spin_lock(&mdsc->caps_list_lock); |
187 | mdsc->caps_total_count += alloc; | 187 | mdsc->caps_total_count += alloc; |
188 | mdsc->caps_reserve_count += alloc; | 188 | mdsc->caps_reserve_count += alloc; |
189 | list_splice(&newcaps, &mdsc->caps_list); | 189 | list_splice(&newcaps, &mdsc->caps_list); |
190 | 190 | ||
191 | BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + | 191 | BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + |
192 | mdsc->caps_reserve_count + | 192 | mdsc->caps_reserve_count + |
193 | mdsc->caps_avail_count); | 193 | mdsc->caps_avail_count); |
194 | spin_unlock(&mdsc->caps_list_lock); | 194 | spin_unlock(&mdsc->caps_list_lock); |
195 | 195 | ||
196 | ctx->count = need; | 196 | ctx->count = need; |
197 | dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", | 197 | dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", |
198 | ctx, mdsc->caps_total_count, mdsc->caps_use_count, | 198 | ctx, mdsc->caps_total_count, mdsc->caps_use_count, |
199 | mdsc->caps_reserve_count, mdsc->caps_avail_count); | 199 | mdsc->caps_reserve_count, mdsc->caps_avail_count); |
200 | return 0; | 200 | return 0; |
201 | 201 | ||
202 | out_alloc_count: | 202 | out_alloc_count: |
203 | /* we didn't manage to reserve as much as we needed */ | 203 | /* we didn't manage to reserve as much as we needed */ |
204 | pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n", | 204 | pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n", |
205 | ctx, need, have); | 205 | ctx, need, have); |
206 | return ret; | 206 | return ret; |
207 | } | 207 | } |
208 | 208 | ||
209 | int ceph_unreserve_caps(struct ceph_mds_client *mdsc, | 209 | int ceph_unreserve_caps(struct ceph_mds_client *mdsc, |
210 | struct ceph_cap_reservation *ctx) | 210 | struct ceph_cap_reservation *ctx) |
211 | { | 211 | { |
212 | dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); | 212 | dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); |
213 | if (ctx->count) { | 213 | if (ctx->count) { |
214 | spin_lock(&mdsc->caps_list_lock); | 214 | spin_lock(&mdsc->caps_list_lock); |
215 | BUG_ON(mdsc->caps_reserve_count < ctx->count); | 215 | BUG_ON(mdsc->caps_reserve_count < ctx->count); |
216 | mdsc->caps_reserve_count -= ctx->count; | 216 | mdsc->caps_reserve_count -= ctx->count; |
217 | mdsc->caps_avail_count += ctx->count; | 217 | mdsc->caps_avail_count += ctx->count; |
218 | ctx->count = 0; | 218 | ctx->count = 0; |
219 | dout("unreserve caps %d = %d used + %d resv + %d avail\n", | 219 | dout("unreserve caps %d = %d used + %d resv + %d avail\n", |
220 | mdsc->caps_total_count, mdsc->caps_use_count, | 220 | mdsc->caps_total_count, mdsc->caps_use_count, |
221 | mdsc->caps_reserve_count, mdsc->caps_avail_count); | 221 | mdsc->caps_reserve_count, mdsc->caps_avail_count); |
222 | BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + | 222 | BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + |
223 | mdsc->caps_reserve_count + | 223 | mdsc->caps_reserve_count + |
224 | mdsc->caps_avail_count); | 224 | mdsc->caps_avail_count); |
225 | spin_unlock(&mdsc->caps_list_lock); | 225 | spin_unlock(&mdsc->caps_list_lock); |
226 | } | 226 | } |
227 | return 0; | 227 | return 0; |
228 | } | 228 | } |
229 | 229 | ||
230 | static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc, | 230 | static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc, |
231 | struct ceph_cap_reservation *ctx) | 231 | struct ceph_cap_reservation *ctx) |
232 | { | 232 | { |
233 | struct ceph_cap *cap = NULL; | 233 | struct ceph_cap *cap = NULL; |
234 | 234 | ||
235 | /* temporary, until we do something about cap import/export */ | 235 | /* temporary, until we do something about cap import/export */ |
236 | if (!ctx) { | 236 | if (!ctx) { |
237 | cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); | 237 | cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); |
238 | if (cap) { | 238 | if (cap) { |
239 | mdsc->caps_use_count++; | 239 | mdsc->caps_use_count++; |
240 | mdsc->caps_total_count++; | 240 | mdsc->caps_total_count++; |
241 | } | 241 | } |
242 | return cap; | 242 | return cap; |
243 | } | 243 | } |
244 | 244 | ||
245 | spin_lock(&mdsc->caps_list_lock); | 245 | spin_lock(&mdsc->caps_list_lock); |
246 | dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", | 246 | dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", |
247 | ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count, | 247 | ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count, |
248 | mdsc->caps_reserve_count, mdsc->caps_avail_count); | 248 | mdsc->caps_reserve_count, mdsc->caps_avail_count); |
249 | BUG_ON(!ctx->count); | 249 | BUG_ON(!ctx->count); |
250 | BUG_ON(ctx->count > mdsc->caps_reserve_count); | 250 | BUG_ON(ctx->count > mdsc->caps_reserve_count); |
251 | BUG_ON(list_empty(&mdsc->caps_list)); | 251 | BUG_ON(list_empty(&mdsc->caps_list)); |
252 | 252 | ||
253 | ctx->count--; | 253 | ctx->count--; |
254 | mdsc->caps_reserve_count--; | 254 | mdsc->caps_reserve_count--; |
255 | mdsc->caps_use_count++; | 255 | mdsc->caps_use_count++; |
256 | 256 | ||
257 | cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); | 257 | cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); |
258 | list_del(&cap->caps_item); | 258 | list_del(&cap->caps_item); |
259 | 259 | ||
260 | BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + | 260 | BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + |
261 | mdsc->caps_reserve_count + mdsc->caps_avail_count); | 261 | mdsc->caps_reserve_count + mdsc->caps_avail_count); |
262 | spin_unlock(&mdsc->caps_list_lock); | 262 | spin_unlock(&mdsc->caps_list_lock); |
263 | return cap; | 263 | return cap; |
264 | } | 264 | } |
265 | 265 | ||
266 | void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap) | 266 | void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap) |
267 | { | 267 | { |
268 | spin_lock(&mdsc->caps_list_lock); | 268 | spin_lock(&mdsc->caps_list_lock); |
269 | dout("put_cap %p %d = %d used + %d resv + %d avail\n", | 269 | dout("put_cap %p %d = %d used + %d resv + %d avail\n", |
270 | cap, mdsc->caps_total_count, mdsc->caps_use_count, | 270 | cap, mdsc->caps_total_count, mdsc->caps_use_count, |
271 | mdsc->caps_reserve_count, mdsc->caps_avail_count); | 271 | mdsc->caps_reserve_count, mdsc->caps_avail_count); |
272 | mdsc->caps_use_count--; | 272 | mdsc->caps_use_count--; |
273 | /* | 273 | /* |
274 | * Keep some preallocated caps around (ceph_min_count), to | 274 | * Keep some preallocated caps around (ceph_min_count), to |
275 | * avoid lots of free/alloc churn. | 275 | * avoid lots of free/alloc churn. |
276 | */ | 276 | */ |
277 | if (mdsc->caps_avail_count >= mdsc->caps_reserve_count + | 277 | if (mdsc->caps_avail_count >= mdsc->caps_reserve_count + |
278 | mdsc->caps_min_count) { | 278 | mdsc->caps_min_count) { |
279 | mdsc->caps_total_count--; | 279 | mdsc->caps_total_count--; |
280 | kmem_cache_free(ceph_cap_cachep, cap); | 280 | kmem_cache_free(ceph_cap_cachep, cap); |
281 | } else { | 281 | } else { |
282 | mdsc->caps_avail_count++; | 282 | mdsc->caps_avail_count++; |
283 | list_add(&cap->caps_item, &mdsc->caps_list); | 283 | list_add(&cap->caps_item, &mdsc->caps_list); |
284 | } | 284 | } |
285 | 285 | ||
286 | BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + | 286 | BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + |
287 | mdsc->caps_reserve_count + mdsc->caps_avail_count); | 287 | mdsc->caps_reserve_count + mdsc->caps_avail_count); |
288 | spin_unlock(&mdsc->caps_list_lock); | 288 | spin_unlock(&mdsc->caps_list_lock); |
289 | } | 289 | } |
290 | 290 | ||
291 | void ceph_reservation_status(struct ceph_fs_client *fsc, | 291 | void ceph_reservation_status(struct ceph_fs_client *fsc, |
292 | int *total, int *avail, int *used, int *reserved, | 292 | int *total, int *avail, int *used, int *reserved, |
293 | int *min) | 293 | int *min) |
294 | { | 294 | { |
295 | struct ceph_mds_client *mdsc = fsc->mdsc; | 295 | struct ceph_mds_client *mdsc = fsc->mdsc; |
296 | 296 | ||
297 | if (total) | 297 | if (total) |
298 | *total = mdsc->caps_total_count; | 298 | *total = mdsc->caps_total_count; |
299 | if (avail) | 299 | if (avail) |
300 | *avail = mdsc->caps_avail_count; | 300 | *avail = mdsc->caps_avail_count; |
301 | if (used) | 301 | if (used) |
302 | *used = mdsc->caps_use_count; | 302 | *used = mdsc->caps_use_count; |
303 | if (reserved) | 303 | if (reserved) |
304 | *reserved = mdsc->caps_reserve_count; | 304 | *reserved = mdsc->caps_reserve_count; |
305 | if (min) | 305 | if (min) |
306 | *min = mdsc->caps_min_count; | 306 | *min = mdsc->caps_min_count; |
307 | } | 307 | } |
308 | 308 | ||
309 | /* | 309 | /* |
310 | * Find ceph_cap for given mds, if any. | 310 | * Find ceph_cap for given mds, if any. |
311 | * | 311 | * |
312 | * Called with i_ceph_lock held. | 312 | * Called with i_ceph_lock held. |
313 | */ | 313 | */ |
314 | static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) | 314 | static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) |
315 | { | 315 | { |
316 | struct ceph_cap *cap; | 316 | struct ceph_cap *cap; |
317 | struct rb_node *n = ci->i_caps.rb_node; | 317 | struct rb_node *n = ci->i_caps.rb_node; |
318 | 318 | ||
319 | while (n) { | 319 | while (n) { |
320 | cap = rb_entry(n, struct ceph_cap, ci_node); | 320 | cap = rb_entry(n, struct ceph_cap, ci_node); |
321 | if (mds < cap->mds) | 321 | if (mds < cap->mds) |
322 | n = n->rb_left; | 322 | n = n->rb_left; |
323 | else if (mds > cap->mds) | 323 | else if (mds > cap->mds) |
324 | n = n->rb_right; | 324 | n = n->rb_right; |
325 | else | 325 | else |
326 | return cap; | 326 | return cap; |
327 | } | 327 | } |
328 | return NULL; | 328 | return NULL; |
329 | } | 329 | } |
330 | 330 | ||
331 | struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) | 331 | struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) |
332 | { | 332 | { |
333 | struct ceph_cap *cap; | 333 | struct ceph_cap *cap; |
334 | 334 | ||
335 | spin_lock(&ci->i_ceph_lock); | 335 | spin_lock(&ci->i_ceph_lock); |
336 | cap = __get_cap_for_mds(ci, mds); | 336 | cap = __get_cap_for_mds(ci, mds); |
337 | spin_unlock(&ci->i_ceph_lock); | 337 | spin_unlock(&ci->i_ceph_lock); |
338 | return cap; | 338 | return cap; |
339 | } | 339 | } |
340 | 340 | ||
341 | /* | 341 | /* |
342 | * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1. | 342 | * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1. |
343 | */ | 343 | */ |
344 | static int __ceph_get_cap_mds(struct ceph_inode_info *ci) | 344 | static int __ceph_get_cap_mds(struct ceph_inode_info *ci) |
345 | { | 345 | { |
346 | struct ceph_cap *cap; | 346 | struct ceph_cap *cap; |
347 | int mds = -1; | 347 | int mds = -1; |
348 | struct rb_node *p; | 348 | struct rb_node *p; |
349 | 349 | ||
350 | /* prefer mds with WR|BUFFER|EXCL caps */ | 350 | /* prefer mds with WR|BUFFER|EXCL caps */ |
351 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { | 351 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { |
352 | cap = rb_entry(p, struct ceph_cap, ci_node); | 352 | cap = rb_entry(p, struct ceph_cap, ci_node); |
353 | mds = cap->mds; | 353 | mds = cap->mds; |
354 | if (cap->issued & (CEPH_CAP_FILE_WR | | 354 | if (cap->issued & (CEPH_CAP_FILE_WR | |
355 | CEPH_CAP_FILE_BUFFER | | 355 | CEPH_CAP_FILE_BUFFER | |
356 | CEPH_CAP_FILE_EXCL)) | 356 | CEPH_CAP_FILE_EXCL)) |
357 | break; | 357 | break; |
358 | } | 358 | } |
359 | return mds; | 359 | return mds; |
360 | } | 360 | } |
361 | 361 | ||
362 | int ceph_get_cap_mds(struct inode *inode) | 362 | int ceph_get_cap_mds(struct inode *inode) |
363 | { | 363 | { |
364 | struct ceph_inode_info *ci = ceph_inode(inode); | 364 | struct ceph_inode_info *ci = ceph_inode(inode); |
365 | int mds; | 365 | int mds; |
366 | spin_lock(&ci->i_ceph_lock); | 366 | spin_lock(&ci->i_ceph_lock); |
367 | mds = __ceph_get_cap_mds(ceph_inode(inode)); | 367 | mds = __ceph_get_cap_mds(ceph_inode(inode)); |
368 | spin_unlock(&ci->i_ceph_lock); | 368 | spin_unlock(&ci->i_ceph_lock); |
369 | return mds; | 369 | return mds; |
370 | } | 370 | } |
371 | 371 | ||
372 | /* | 372 | /* |
373 | * Called under i_ceph_lock. | 373 | * Called under i_ceph_lock. |
374 | */ | 374 | */ |
375 | static void __insert_cap_node(struct ceph_inode_info *ci, | 375 | static void __insert_cap_node(struct ceph_inode_info *ci, |
376 | struct ceph_cap *new) | 376 | struct ceph_cap *new) |
377 | { | 377 | { |
378 | struct rb_node **p = &ci->i_caps.rb_node; | 378 | struct rb_node **p = &ci->i_caps.rb_node; |
379 | struct rb_node *parent = NULL; | 379 | struct rb_node *parent = NULL; |
380 | struct ceph_cap *cap = NULL; | 380 | struct ceph_cap *cap = NULL; |
381 | 381 | ||
382 | while (*p) { | 382 | while (*p) { |
383 | parent = *p; | 383 | parent = *p; |
384 | cap = rb_entry(parent, struct ceph_cap, ci_node); | 384 | cap = rb_entry(parent, struct ceph_cap, ci_node); |
385 | if (new->mds < cap->mds) | 385 | if (new->mds < cap->mds) |
386 | p = &(*p)->rb_left; | 386 | p = &(*p)->rb_left; |
387 | else if (new->mds > cap->mds) | 387 | else if (new->mds > cap->mds) |
388 | p = &(*p)->rb_right; | 388 | p = &(*p)->rb_right; |
389 | else | 389 | else |
390 | BUG(); | 390 | BUG(); |
391 | } | 391 | } |
392 | 392 | ||
393 | rb_link_node(&new->ci_node, parent, p); | 393 | rb_link_node(&new->ci_node, parent, p); |
394 | rb_insert_color(&new->ci_node, &ci->i_caps); | 394 | rb_insert_color(&new->ci_node, &ci->i_caps); |
395 | } | 395 | } |
396 | 396 | ||
397 | /* | 397 | /* |
398 | * (re)set cap hold timeouts, which control the delayed release | 398 | * (re)set cap hold timeouts, which control the delayed release |
399 | * of unused caps back to the MDS. Should be called on cap use. | 399 | * of unused caps back to the MDS. Should be called on cap use. |
400 | */ | 400 | */ |
401 | static void __cap_set_timeouts(struct ceph_mds_client *mdsc, | 401 | static void __cap_set_timeouts(struct ceph_mds_client *mdsc, |
402 | struct ceph_inode_info *ci) | 402 | struct ceph_inode_info *ci) |
403 | { | 403 | { |
404 | struct ceph_mount_options *ma = mdsc->fsc->mount_options; | 404 | struct ceph_mount_options *ma = mdsc->fsc->mount_options; |
405 | 405 | ||
406 | ci->i_hold_caps_min = round_jiffies(jiffies + | 406 | ci->i_hold_caps_min = round_jiffies(jiffies + |
407 | ma->caps_wanted_delay_min * HZ); | 407 | ma->caps_wanted_delay_min * HZ); |
408 | ci->i_hold_caps_max = round_jiffies(jiffies + | 408 | ci->i_hold_caps_max = round_jiffies(jiffies + |
409 | ma->caps_wanted_delay_max * HZ); | 409 | ma->caps_wanted_delay_max * HZ); |
410 | dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode, | 410 | dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode, |
411 | ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies); | 411 | ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies); |
412 | } | 412 | } |
413 | 413 | ||
414 | /* | 414 | /* |
415 | * (Re)queue cap at the end of the delayed cap release list. | 415 | * (Re)queue cap at the end of the delayed cap release list. |
416 | * | 416 | * |
417 | * If I_FLUSH is set, leave the inode at the front of the list. | 417 | * If I_FLUSH is set, leave the inode at the front of the list. |
418 | * | 418 | * |
419 | * Caller holds i_ceph_lock | 419 | * Caller holds i_ceph_lock |
420 | * -> we take mdsc->cap_delay_lock | 420 | * -> we take mdsc->cap_delay_lock |
421 | */ | 421 | */ |
422 | static void __cap_delay_requeue(struct ceph_mds_client *mdsc, | 422 | static void __cap_delay_requeue(struct ceph_mds_client *mdsc, |
423 | struct ceph_inode_info *ci) | 423 | struct ceph_inode_info *ci) |
424 | { | 424 | { |
425 | __cap_set_timeouts(mdsc, ci); | 425 | __cap_set_timeouts(mdsc, ci); |
426 | dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode, | 426 | dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode, |
427 | ci->i_ceph_flags, ci->i_hold_caps_max); | 427 | ci->i_ceph_flags, ci->i_hold_caps_max); |
428 | if (!mdsc->stopping) { | 428 | if (!mdsc->stopping) { |
429 | spin_lock(&mdsc->cap_delay_lock); | 429 | spin_lock(&mdsc->cap_delay_lock); |
430 | if (!list_empty(&ci->i_cap_delay_list)) { | 430 | if (!list_empty(&ci->i_cap_delay_list)) { |
431 | if (ci->i_ceph_flags & CEPH_I_FLUSH) | 431 | if (ci->i_ceph_flags & CEPH_I_FLUSH) |
432 | goto no_change; | 432 | goto no_change; |
433 | list_del_init(&ci->i_cap_delay_list); | 433 | list_del_init(&ci->i_cap_delay_list); |
434 | } | 434 | } |
435 | list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); | 435 | list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); |
436 | no_change: | 436 | no_change: |
437 | spin_unlock(&mdsc->cap_delay_lock); | 437 | spin_unlock(&mdsc->cap_delay_lock); |
438 | } | 438 | } |
439 | } | 439 | } |
440 | 440 | ||
441 | /* | 441 | /* |
442 | * Queue an inode for immediate writeback. Mark inode with I_FLUSH, | 442 | * Queue an inode for immediate writeback. Mark inode with I_FLUSH, |
443 | * indicating we should send a cap message to flush dirty metadata | 443 | * indicating we should send a cap message to flush dirty metadata |
444 | * asap, and move to the front of the delayed cap list. | 444 | * asap, and move to the front of the delayed cap list. |
445 | */ | 445 | */ |
446 | static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, | 446 | static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, |
447 | struct ceph_inode_info *ci) | 447 | struct ceph_inode_info *ci) |
448 | { | 448 | { |
449 | dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode); | 449 | dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode); |
450 | spin_lock(&mdsc->cap_delay_lock); | 450 | spin_lock(&mdsc->cap_delay_lock); |
451 | ci->i_ceph_flags |= CEPH_I_FLUSH; | 451 | ci->i_ceph_flags |= CEPH_I_FLUSH; |
452 | if (!list_empty(&ci->i_cap_delay_list)) | 452 | if (!list_empty(&ci->i_cap_delay_list)) |
453 | list_del_init(&ci->i_cap_delay_list); | 453 | list_del_init(&ci->i_cap_delay_list); |
454 | list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list); | 454 | list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list); |
455 | spin_unlock(&mdsc->cap_delay_lock); | 455 | spin_unlock(&mdsc->cap_delay_lock); |
456 | } | 456 | } |
457 | 457 | ||
458 | /* | 458 | /* |
459 | * Cancel delayed work on cap. | 459 | * Cancel delayed work on cap. |
460 | * | 460 | * |
461 | * Caller must hold i_ceph_lock. | 461 | * Caller must hold i_ceph_lock. |
462 | */ | 462 | */ |
463 | static void __cap_delay_cancel(struct ceph_mds_client *mdsc, | 463 | static void __cap_delay_cancel(struct ceph_mds_client *mdsc, |
464 | struct ceph_inode_info *ci) | 464 | struct ceph_inode_info *ci) |
465 | { | 465 | { |
466 | dout("__cap_delay_cancel %p\n", &ci->vfs_inode); | 466 | dout("__cap_delay_cancel %p\n", &ci->vfs_inode); |
467 | if (list_empty(&ci->i_cap_delay_list)) | 467 | if (list_empty(&ci->i_cap_delay_list)) |
468 | return; | 468 | return; |
469 | spin_lock(&mdsc->cap_delay_lock); | 469 | spin_lock(&mdsc->cap_delay_lock); |
470 | list_del_init(&ci->i_cap_delay_list); | 470 | list_del_init(&ci->i_cap_delay_list); |
471 | spin_unlock(&mdsc->cap_delay_lock); | 471 | spin_unlock(&mdsc->cap_delay_lock); |
472 | } | 472 | } |
473 | 473 | ||
474 | /* | 474 | /* |
475 | * Common issue checks for add_cap, handle_cap_grant. | 475 | * Common issue checks for add_cap, handle_cap_grant. |
476 | */ | 476 | */ |
477 | static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, | 477 | static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, |
478 | unsigned issued) | 478 | unsigned issued) |
479 | { | 479 | { |
480 | unsigned had = __ceph_caps_issued(ci, NULL); | 480 | unsigned had = __ceph_caps_issued(ci, NULL); |
481 | 481 | ||
482 | /* | 482 | /* |
483 | * Each time we receive FILE_CACHE anew, we increment | 483 | * Each time we receive FILE_CACHE anew, we increment |
484 | * i_rdcache_gen. | 484 | * i_rdcache_gen. |
485 | */ | 485 | */ |
486 | if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && | 486 | if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && |
487 | (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) | 487 | (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) |
488 | ci->i_rdcache_gen++; | 488 | ci->i_rdcache_gen++; |
489 | 489 | ||
490 | /* | 490 | /* |
491 | * if we are newly issued FILE_SHARED, clear D_COMPLETE; we | 491 | * if we are newly issued FILE_SHARED, clear D_COMPLETE; we |
492 | * don't know what happened to this directory while we didn't | 492 | * don't know what happened to this directory while we didn't |
493 | * have the cap. | 493 | * have the cap. |
494 | */ | 494 | */ |
495 | if ((issued & CEPH_CAP_FILE_SHARED) && | 495 | if ((issued & CEPH_CAP_FILE_SHARED) && |
496 | (had & CEPH_CAP_FILE_SHARED) == 0) { | 496 | (had & CEPH_CAP_FILE_SHARED) == 0) { |
497 | ci->i_shared_gen++; | 497 | ci->i_shared_gen++; |
498 | if (S_ISDIR(ci->vfs_inode.i_mode)) | 498 | if (S_ISDIR(ci->vfs_inode.i_mode)) |
499 | ceph_dir_clear_complete(&ci->vfs_inode); | 499 | ceph_dir_clear_complete(&ci->vfs_inode); |
500 | } | 500 | } |
501 | } | 501 | } |
502 | 502 | ||
503 | /* | 503 | /* |
504 | * Add a capability under the given MDS session. | 504 | * Add a capability under the given MDS session. |
505 | * | 505 | * |
506 | * Caller should hold session snap_rwsem (read) and s_mutex. | 506 | * Caller should hold session snap_rwsem (read) and s_mutex. |
507 | * | 507 | * |
508 | * @fmode is the open file mode, if we are opening a file, otherwise | 508 | * @fmode is the open file mode, if we are opening a file, otherwise |
509 | * it is < 0. (This is so we can atomically add the cap and add an | 509 | * it is < 0. (This is so we can atomically add the cap and add an |
510 | * open file reference to it.) | 510 | * open file reference to it.) |
511 | */ | 511 | */ |
512 | int ceph_add_cap(struct inode *inode, | 512 | int ceph_add_cap(struct inode *inode, |
513 | struct ceph_mds_session *session, u64 cap_id, | 513 | struct ceph_mds_session *session, u64 cap_id, |
514 | int fmode, unsigned issued, unsigned wanted, | 514 | int fmode, unsigned issued, unsigned wanted, |
515 | unsigned seq, unsigned mseq, u64 realmino, int flags, | 515 | unsigned seq, unsigned mseq, u64 realmino, int flags, |
516 | struct ceph_cap_reservation *caps_reservation) | 516 | struct ceph_cap_reservation *caps_reservation) |
517 | { | 517 | { |
518 | struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; | 518 | struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; |
519 | struct ceph_inode_info *ci = ceph_inode(inode); | 519 | struct ceph_inode_info *ci = ceph_inode(inode); |
520 | struct ceph_cap *new_cap = NULL; | 520 | struct ceph_cap *new_cap = NULL; |
521 | struct ceph_cap *cap; | 521 | struct ceph_cap *cap; |
522 | int mds = session->s_mds; | 522 | int mds = session->s_mds; |
523 | int actual_wanted; | 523 | int actual_wanted; |
524 | 524 | ||
525 | dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, | 525 | dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, |
526 | session->s_mds, cap_id, ceph_cap_string(issued), seq); | 526 | session->s_mds, cap_id, ceph_cap_string(issued), seq); |
527 | 527 | ||
528 | /* | 528 | /* |
529 | * If we are opening the file, include file mode wanted bits | 529 | * If we are opening the file, include file mode wanted bits |
530 | * in wanted. | 530 | * in wanted. |
531 | */ | 531 | */ |
532 | if (fmode >= 0) | 532 | if (fmode >= 0) |
533 | wanted |= ceph_caps_for_mode(fmode); | 533 | wanted |= ceph_caps_for_mode(fmode); |
534 | 534 | ||
535 | retry: | 535 | retry: |
536 | spin_lock(&ci->i_ceph_lock); | 536 | spin_lock(&ci->i_ceph_lock); |
537 | cap = __get_cap_for_mds(ci, mds); | 537 | cap = __get_cap_for_mds(ci, mds); |
538 | if (!cap) { | 538 | if (!cap) { |
539 | if (new_cap) { | 539 | if (new_cap) { |
540 | cap = new_cap; | 540 | cap = new_cap; |
541 | new_cap = NULL; | 541 | new_cap = NULL; |
542 | } else { | 542 | } else { |
543 | spin_unlock(&ci->i_ceph_lock); | 543 | spin_unlock(&ci->i_ceph_lock); |
544 | new_cap = get_cap(mdsc, caps_reservation); | 544 | new_cap = get_cap(mdsc, caps_reservation); |
545 | if (new_cap == NULL) | 545 | if (new_cap == NULL) |
546 | return -ENOMEM; | 546 | return -ENOMEM; |
547 | goto retry; | 547 | goto retry; |
548 | } | 548 | } |
549 | 549 | ||
550 | cap->issued = 0; | 550 | cap->issued = 0; |
551 | cap->implemented = 0; | 551 | cap->implemented = 0; |
552 | cap->mds = mds; | 552 | cap->mds = mds; |
553 | cap->mds_wanted = 0; | 553 | cap->mds_wanted = 0; |
554 | 554 | ||
555 | cap->ci = ci; | 555 | cap->ci = ci; |
556 | __insert_cap_node(ci, cap); | 556 | __insert_cap_node(ci, cap); |
557 | 557 | ||
558 | /* clear out old exporting info? (i.e. on cap import) */ | 558 | /* clear out old exporting info? (i.e. on cap import) */ |
559 | if (ci->i_cap_exporting_mds == mds) { | 559 | if (ci->i_cap_exporting_mds == mds) { |
560 | ci->i_cap_exporting_issued = 0; | 560 | ci->i_cap_exporting_issued = 0; |
561 | ci->i_cap_exporting_mseq = 0; | 561 | ci->i_cap_exporting_mseq = 0; |
562 | ci->i_cap_exporting_mds = -1; | 562 | ci->i_cap_exporting_mds = -1; |
563 | } | 563 | } |
564 | 564 | ||
565 | /* add to session cap list */ | 565 | /* add to session cap list */ |
566 | cap->session = session; | 566 | cap->session = session; |
567 | spin_lock(&session->s_cap_lock); | 567 | spin_lock(&session->s_cap_lock); |
568 | list_add_tail(&cap->session_caps, &session->s_caps); | 568 | list_add_tail(&cap->session_caps, &session->s_caps); |
569 | session->s_nr_caps++; | 569 | session->s_nr_caps++; |
570 | spin_unlock(&session->s_cap_lock); | 570 | spin_unlock(&session->s_cap_lock); |
571 | } else if (new_cap) | 571 | } else if (new_cap) |
572 | ceph_put_cap(mdsc, new_cap); | 572 | ceph_put_cap(mdsc, new_cap); |
573 | 573 | ||
574 | if (!ci->i_snap_realm) { | 574 | if (!ci->i_snap_realm) { |
575 | /* | 575 | /* |
576 | * add this inode to the appropriate snap realm | 576 | * add this inode to the appropriate snap realm |
577 | */ | 577 | */ |
578 | struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, | 578 | struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, |
579 | realmino); | 579 | realmino); |
580 | if (realm) { | 580 | if (realm) { |
581 | ceph_get_snap_realm(mdsc, realm); | 581 | ceph_get_snap_realm(mdsc, realm); |
582 | spin_lock(&realm->inodes_with_caps_lock); | 582 | spin_lock(&realm->inodes_with_caps_lock); |
583 | ci->i_snap_realm = realm; | 583 | ci->i_snap_realm = realm; |
584 | list_add(&ci->i_snap_realm_item, | 584 | list_add(&ci->i_snap_realm_item, |
585 | &realm->inodes_with_caps); | 585 | &realm->inodes_with_caps); |
586 | spin_unlock(&realm->inodes_with_caps_lock); | 586 | spin_unlock(&realm->inodes_with_caps_lock); |
587 | } else { | 587 | } else { |
588 | pr_err("ceph_add_cap: couldn't find snap realm %llx\n", | 588 | pr_err("ceph_add_cap: couldn't find snap realm %llx\n", |
589 | realmino); | 589 | realmino); |
590 | WARN_ON(!realm); | 590 | WARN_ON(!realm); |
591 | } | 591 | } |
592 | } | 592 | } |
593 | 593 | ||
594 | __check_cap_issue(ci, cap, issued); | 594 | __check_cap_issue(ci, cap, issued); |
595 | 595 | ||
596 | /* | 596 | /* |
597 | * If we are issued caps we don't want, or the mds' wanted | 597 | * If we are issued caps we don't want, or the mds' wanted |
598 | * value appears to be off, queue a check so we'll release | 598 | * value appears to be off, queue a check so we'll release |
599 | * later and/or update the mds wanted value. | 599 | * later and/or update the mds wanted value. |
600 | */ | 600 | */ |
601 | actual_wanted = __ceph_caps_wanted(ci); | 601 | actual_wanted = __ceph_caps_wanted(ci); |
602 | if ((wanted & ~actual_wanted) || | 602 | if ((wanted & ~actual_wanted) || |
603 | (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) { | 603 | (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) { |
604 | dout(" issued %s, mds wanted %s, actual %s, queueing\n", | 604 | dout(" issued %s, mds wanted %s, actual %s, queueing\n", |
605 | ceph_cap_string(issued), ceph_cap_string(wanted), | 605 | ceph_cap_string(issued), ceph_cap_string(wanted), |
606 | ceph_cap_string(actual_wanted)); | 606 | ceph_cap_string(actual_wanted)); |
607 | __cap_delay_requeue(mdsc, ci); | 607 | __cap_delay_requeue(mdsc, ci); |
608 | } | 608 | } |
609 | 609 | ||
610 | if (flags & CEPH_CAP_FLAG_AUTH) | 610 | if (flags & CEPH_CAP_FLAG_AUTH) |
611 | ci->i_auth_cap = cap; | 611 | ci->i_auth_cap = cap; |
612 | else if (ci->i_auth_cap == cap) | 612 | else if (ci->i_auth_cap == cap) |
613 | ci->i_auth_cap = NULL; | 613 | ci->i_auth_cap = NULL; |
614 | 614 | ||
615 | dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", | 615 | dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", |
616 | inode, ceph_vinop(inode), cap, ceph_cap_string(issued), | 616 | inode, ceph_vinop(inode), cap, ceph_cap_string(issued), |
617 | ceph_cap_string(issued|cap->issued), seq, mds); | 617 | ceph_cap_string(issued|cap->issued), seq, mds); |
618 | cap->cap_id = cap_id; | 618 | cap->cap_id = cap_id; |
619 | cap->issued = issued; | 619 | cap->issued = issued; |
620 | cap->implemented |= issued; | 620 | cap->implemented |= issued; |
621 | cap->mds_wanted |= wanted; | 621 | cap->mds_wanted |= wanted; |
622 | cap->seq = seq; | 622 | cap->seq = seq; |
623 | cap->issue_seq = seq; | 623 | cap->issue_seq = seq; |
624 | cap->mseq = mseq; | 624 | cap->mseq = mseq; |
625 | cap->cap_gen = session->s_cap_gen; | 625 | cap->cap_gen = session->s_cap_gen; |
626 | 626 | ||
627 | if (fmode >= 0) | 627 | if (fmode >= 0) |
628 | __ceph_get_fmode(ci, fmode); | 628 | __ceph_get_fmode(ci, fmode); |
629 | spin_unlock(&ci->i_ceph_lock); | 629 | spin_unlock(&ci->i_ceph_lock); |
630 | wake_up_all(&ci->i_cap_wq); | 630 | wake_up_all(&ci->i_cap_wq); |
631 | return 0; | 631 | return 0; |
632 | } | 632 | } |
633 | 633 | ||
634 | /* | 634 | /* |
635 | * Return true if cap has not timed out and belongs to the current | 635 | * Return true if cap has not timed out and belongs to the current |
636 | * generation of the MDS session (i.e. has not gone 'stale' due to | 636 | * generation of the MDS session (i.e. has not gone 'stale' due to |
637 | * us losing touch with the mds). | 637 | * us losing touch with the mds). |
638 | */ | 638 | */ |
639 | static int __cap_is_valid(struct ceph_cap *cap) | 639 | static int __cap_is_valid(struct ceph_cap *cap) |
640 | { | 640 | { |
641 | unsigned long ttl; | 641 | unsigned long ttl; |
642 | u32 gen; | 642 | u32 gen; |
643 | 643 | ||
644 | spin_lock(&cap->session->s_cap_lock); | 644 | spin_lock(&cap->session->s_gen_ttl_lock); |
645 | gen = cap->session->s_cap_gen; | 645 | gen = cap->session->s_cap_gen; |
646 | ttl = cap->session->s_cap_ttl; | 646 | ttl = cap->session->s_cap_ttl; |
647 | spin_unlock(&cap->session->s_cap_lock); | 647 | spin_unlock(&cap->session->s_gen_ttl_lock); |
648 | 648 | ||
649 | if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { | 649 | if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { |
650 | dout("__cap_is_valid %p cap %p issued %s " | 650 | dout("__cap_is_valid %p cap %p issued %s " |
651 | "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode, | 651 | "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode, |
652 | cap, ceph_cap_string(cap->issued), cap->cap_gen, gen); | 652 | cap, ceph_cap_string(cap->issued), cap->cap_gen, gen); |
653 | return 0; | 653 | return 0; |
654 | } | 654 | } |
655 | 655 | ||
656 | return 1; | 656 | return 1; |
657 | } | 657 | } |
658 | 658 | ||
659 | /* | 659 | /* |
660 | * Return set of valid cap bits issued to us. Note that caps time | 660 | * Return set of valid cap bits issued to us. Note that caps time |
661 | * out, and may be invalidated in bulk if the client session times out | 661 | * out, and may be invalidated in bulk if the client session times out |
662 | * and session->s_cap_gen is bumped. | 662 | * and session->s_cap_gen is bumped. |
663 | */ | 663 | */ |
664 | int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) | 664 | int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) |
665 | { | 665 | { |
666 | int have = ci->i_snap_caps | ci->i_cap_exporting_issued; | 666 | int have = ci->i_snap_caps | ci->i_cap_exporting_issued; |
667 | struct ceph_cap *cap; | 667 | struct ceph_cap *cap; |
668 | struct rb_node *p; | 668 | struct rb_node *p; |
669 | 669 | ||
670 | if (implemented) | 670 | if (implemented) |
671 | *implemented = 0; | 671 | *implemented = 0; |
672 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { | 672 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { |
673 | cap = rb_entry(p, struct ceph_cap, ci_node); | 673 | cap = rb_entry(p, struct ceph_cap, ci_node); |
674 | if (!__cap_is_valid(cap)) | 674 | if (!__cap_is_valid(cap)) |
675 | continue; | 675 | continue; |
676 | dout("__ceph_caps_issued %p cap %p issued %s\n", | 676 | dout("__ceph_caps_issued %p cap %p issued %s\n", |
677 | &ci->vfs_inode, cap, ceph_cap_string(cap->issued)); | 677 | &ci->vfs_inode, cap, ceph_cap_string(cap->issued)); |
678 | have |= cap->issued; | 678 | have |= cap->issued; |
679 | if (implemented) | 679 | if (implemented) |
680 | *implemented |= cap->implemented; | 680 | *implemented |= cap->implemented; |
681 | } | 681 | } |
682 | return have; | 682 | return have; |
683 | } | 683 | } |
684 | 684 | ||
685 | /* | 685 | /* |
686 | * Get cap bits issued by caps other than @ocap | 686 | * Get cap bits issued by caps other than @ocap |
687 | */ | 687 | */ |
688 | int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap) | 688 | int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap) |
689 | { | 689 | { |
690 | int have = ci->i_snap_caps; | 690 | int have = ci->i_snap_caps; |
691 | struct ceph_cap *cap; | 691 | struct ceph_cap *cap; |
692 | struct rb_node *p; | 692 | struct rb_node *p; |
693 | 693 | ||
694 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { | 694 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { |
695 | cap = rb_entry(p, struct ceph_cap, ci_node); | 695 | cap = rb_entry(p, struct ceph_cap, ci_node); |
696 | if (cap == ocap) | 696 | if (cap == ocap) |
697 | continue; | 697 | continue; |
698 | if (!__cap_is_valid(cap)) | 698 | if (!__cap_is_valid(cap)) |
699 | continue; | 699 | continue; |
700 | have |= cap->issued; | 700 | have |= cap->issued; |
701 | } | 701 | } |
702 | return have; | 702 | return have; |
703 | } | 703 | } |
704 | 704 | ||
705 | /* | 705 | /* |
706 | * Move a cap to the end of the LRU (oldest caps at list head, newest | 706 | * Move a cap to the end of the LRU (oldest caps at list head, newest |
707 | * at list tail). | 707 | * at list tail). |
708 | */ | 708 | */ |
709 | static void __touch_cap(struct ceph_cap *cap) | 709 | static void __touch_cap(struct ceph_cap *cap) |
710 | { | 710 | { |
711 | struct ceph_mds_session *s = cap->session; | 711 | struct ceph_mds_session *s = cap->session; |
712 | 712 | ||
713 | spin_lock(&s->s_cap_lock); | 713 | spin_lock(&s->s_cap_lock); |
714 | if (s->s_cap_iterator == NULL) { | 714 | if (s->s_cap_iterator == NULL) { |
715 | dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap, | 715 | dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap, |
716 | s->s_mds); | 716 | s->s_mds); |
717 | list_move_tail(&cap->session_caps, &s->s_caps); | 717 | list_move_tail(&cap->session_caps, &s->s_caps); |
718 | } else { | 718 | } else { |
719 | dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n", | 719 | dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n", |
720 | &cap->ci->vfs_inode, cap, s->s_mds); | 720 | &cap->ci->vfs_inode, cap, s->s_mds); |
721 | } | 721 | } |
722 | spin_unlock(&s->s_cap_lock); | 722 | spin_unlock(&s->s_cap_lock); |
723 | } | 723 | } |
724 | 724 | ||
/*
 * Check if we hold the given mask.  If so, move the cap(s) to the
 * front of their respective LRUs.  (This is the preferred way for
 * callers to check for caps they want.)
 *
 * Returns 1 if the mask is covered by the snap caps, by a single valid
 * cap, or by the union of all valid caps; 0 otherwise.
 * NOTE(review): double-underscore convention suggests the caller holds
 * i_ceph_lock — confirm at call sites.
 */
int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
{
	struct ceph_cap *cap;
	struct rb_node *p;
	int have = ci->i_snap_caps;	/* caps implicitly held on snapped inodes */

	/* snap caps alone may already cover the whole mask */
	if ((have & mask) == mask) {
		dout("__ceph_caps_issued_mask %p snap issued %s"
		     " (mask %s)\n", &ci->vfs_inode,
		     ceph_cap_string(have),
		     ceph_cap_string(mask));
		return 1;
	}

	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		cap = rb_entry(p, struct ceph_cap, ci_node);
		/* skip caps that are not currently valid */
		if (!__cap_is_valid(cap))
			continue;
		/* does this one cap satisfy the mask by itself? */
		if ((cap->issued & mask) == mask) {
			dout("__ceph_caps_issued_mask %p cap %p issued %s"
			     " (mask %s)\n", &ci->vfs_inode, cap,
			     ceph_cap_string(cap->issued),
			     ceph_cap_string(mask));
			if (touch)
				__touch_cap(cap);
			return 1;
		}

		/* does a combination of caps satisfy mask? */
		have |= cap->issued;
		if ((have & mask) == mask) {
			dout("__ceph_caps_issued_mask %p combo issued %s"
			     " (mask %s)\n", &ci->vfs_inode,
			     ceph_cap_string(cap->issued),
			     ceph_cap_string(mask));
			if (touch) {
				struct rb_node *q;

				/* touch this + preceding caps */
				__touch_cap(cap);
				/*
				 * every valid cap up to (but not including)
				 * p contributed to the combination, so touch
				 * each of them as well
				 */
				for (q = rb_first(&ci->i_caps); q != p;
				     q = rb_next(q)) {
					cap = rb_entry(q, struct ceph_cap,
						       ci_node);
					if (!__cap_is_valid(cap))
						continue;
					__touch_cap(cap);
				}
			}
			return 1;
		}
	}

	return 0;
}
785 | 785 | ||
786 | /* | 786 | /* |
787 | * Return true if mask caps are currently being revoked by an MDS. | 787 | * Return true if mask caps are currently being revoked by an MDS. |
788 | */ | 788 | */ |
789 | int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) | 789 | int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) |
790 | { | 790 | { |
791 | struct inode *inode = &ci->vfs_inode; | 791 | struct inode *inode = &ci->vfs_inode; |
792 | struct ceph_cap *cap; | 792 | struct ceph_cap *cap; |
793 | struct rb_node *p; | 793 | struct rb_node *p; |
794 | int ret = 0; | 794 | int ret = 0; |
795 | 795 | ||
796 | spin_lock(&ci->i_ceph_lock); | 796 | spin_lock(&ci->i_ceph_lock); |
797 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { | 797 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { |
798 | cap = rb_entry(p, struct ceph_cap, ci_node); | 798 | cap = rb_entry(p, struct ceph_cap, ci_node); |
799 | if (__cap_is_valid(cap) && | 799 | if (__cap_is_valid(cap) && |
800 | (cap->implemented & ~cap->issued & mask)) { | 800 | (cap->implemented & ~cap->issued & mask)) { |
801 | ret = 1; | 801 | ret = 1; |
802 | break; | 802 | break; |
803 | } | 803 | } |
804 | } | 804 | } |
805 | spin_unlock(&ci->i_ceph_lock); | 805 | spin_unlock(&ci->i_ceph_lock); |
806 | dout("ceph_caps_revoking %p %s = %d\n", inode, | 806 | dout("ceph_caps_revoking %p %s = %d\n", inode, |
807 | ceph_cap_string(mask), ret); | 807 | ceph_cap_string(mask), ret); |
808 | return ret; | 808 | return ret; |
809 | } | 809 | } |
810 | 810 | ||
811 | int __ceph_caps_used(struct ceph_inode_info *ci) | 811 | int __ceph_caps_used(struct ceph_inode_info *ci) |
812 | { | 812 | { |
813 | int used = 0; | 813 | int used = 0; |
814 | if (ci->i_pin_ref) | 814 | if (ci->i_pin_ref) |
815 | used |= CEPH_CAP_PIN; | 815 | used |= CEPH_CAP_PIN; |
816 | if (ci->i_rd_ref) | 816 | if (ci->i_rd_ref) |
817 | used |= CEPH_CAP_FILE_RD; | 817 | used |= CEPH_CAP_FILE_RD; |
818 | if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages) | 818 | if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages) |
819 | used |= CEPH_CAP_FILE_CACHE; | 819 | used |= CEPH_CAP_FILE_CACHE; |
820 | if (ci->i_wr_ref) | 820 | if (ci->i_wr_ref) |
821 | used |= CEPH_CAP_FILE_WR; | 821 | used |= CEPH_CAP_FILE_WR; |
822 | if (ci->i_wb_ref || ci->i_wrbuffer_ref) | 822 | if (ci->i_wb_ref || ci->i_wrbuffer_ref) |
823 | used |= CEPH_CAP_FILE_BUFFER; | 823 | used |= CEPH_CAP_FILE_BUFFER; |
824 | return used; | 824 | return used; |
825 | } | 825 | } |
826 | 826 | ||
827 | /* | 827 | /* |
828 | * wanted, by virtue of open file modes | 828 | * wanted, by virtue of open file modes |
829 | */ | 829 | */ |
830 | int __ceph_caps_file_wanted(struct ceph_inode_info *ci) | 830 | int __ceph_caps_file_wanted(struct ceph_inode_info *ci) |
831 | { | 831 | { |
832 | int want = 0; | 832 | int want = 0; |
833 | int mode; | 833 | int mode; |
834 | for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++) | 834 | for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++) |
835 | if (ci->i_nr_by_mode[mode]) | 835 | if (ci->i_nr_by_mode[mode]) |
836 | want |= ceph_caps_for_mode(mode); | 836 | want |= ceph_caps_for_mode(mode); |
837 | return want; | 837 | return want; |
838 | } | 838 | } |
839 | 839 | ||
840 | /* | 840 | /* |
841 | * Return caps we have registered with the MDS(s) as 'wanted'. | 841 | * Return caps we have registered with the MDS(s) as 'wanted'. |
842 | */ | 842 | */ |
843 | int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) | 843 | int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) |
844 | { | 844 | { |
845 | struct ceph_cap *cap; | 845 | struct ceph_cap *cap; |
846 | struct rb_node *p; | 846 | struct rb_node *p; |
847 | int mds_wanted = 0; | 847 | int mds_wanted = 0; |
848 | 848 | ||
849 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { | 849 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { |
850 | cap = rb_entry(p, struct ceph_cap, ci_node); | 850 | cap = rb_entry(p, struct ceph_cap, ci_node); |
851 | if (!__cap_is_valid(cap)) | 851 | if (!__cap_is_valid(cap)) |
852 | continue; | 852 | continue; |
853 | mds_wanted |= cap->mds_wanted; | 853 | mds_wanted |= cap->mds_wanted; |
854 | } | 854 | } |
855 | return mds_wanted; | 855 | return mds_wanted; |
856 | } | 856 | } |
857 | 857 | ||
858 | /* | 858 | /* |
859 | * called under i_ceph_lock | 859 | * called under i_ceph_lock |
860 | */ | 860 | */ |
861 | static int __ceph_is_any_caps(struct ceph_inode_info *ci) | 861 | static int __ceph_is_any_caps(struct ceph_inode_info *ci) |
862 | { | 862 | { |
863 | return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; | 863 | return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; |
864 | } | 864 | } |
865 | 865 | ||
/*
 * Remove a cap.  Take steps to deal with a racing iterate_session_caps.
 *
 * caller should hold i_ceph_lock.
 * caller will not hold session s_mutex if called from destroy_inode.
 */
void __ceph_remove_cap(struct ceph_cap *cap)
{
	struct ceph_mds_session *session = cap->session;
	struct ceph_inode_info *ci = cap->ci;
	struct ceph_mds_client *mdsc =
		ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
	int removed = 0;	/* did we unlink from the session list here? */

	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);

	/* remove from session list */
	spin_lock(&session->s_cap_lock);
	if (session->s_cap_iterator == cap) {
		/* not yet, we are iterating over this very cap */
		dout("__ceph_remove_cap delaying %p removal from session %p\n",
		     cap, cap->session);
	} else {
		list_del_init(&cap->session_caps);
		session->s_nr_caps--;
		cap->session = NULL;
		removed = 1;
	}
	/* protect backpointer with s_cap_lock: see iterate_session_caps */
	cap->ci = NULL;
	spin_unlock(&session->s_cap_lock);

	/* remove from inode list */
	rb_erase(&cap->ci_node, &ci->i_caps);
	if (ci->i_auth_cap == cap)
		ci->i_auth_cap = NULL;

	/*
	 * only drop the ref if we completed the session unlink above;
	 * otherwise the iterator still owns a reference to this cap
	 */
	if (removed)
		ceph_put_cap(mdsc, cap);

	/* last cap gone: drop this inode out of its snap realm */
	if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
		struct ceph_snap_realm *realm = ci->i_snap_realm;
		spin_lock(&realm->inodes_with_caps_lock);
		list_del_init(&ci->i_snap_realm_item);
		ci->i_snap_realm_counter++;
		ci->i_snap_realm = NULL;
		spin_unlock(&realm->inodes_with_caps_lock);
		ceph_put_snap_realm(mdsc, realm);
	}
	/* no real caps left: no further delayed cap work needed */
	if (!__ceph_is_any_real_caps(ci))
		__cap_delay_cancel(mdsc, ci);
}
918 | 918 | ||
/*
 * Build and send a cap message to the given MDS.
 *
 * Marshals all of the cap state (issued/wanted/dirty bits, seq numbers,
 * file metadata, and optionally an xattr blob as the message "middle")
 * into a CEPH_MSG_CLIENT_CAPS message.  Fields are encoded little-endian
 * per the ceph wire protocol.
 *
 * Returns 0 on success, -ENOMEM if the message could not be allocated.
 *
 * Caller should be holding s_mutex.
 */
static int send_cap_msg(struct ceph_mds_session *session,
			u64 ino, u64 cid, int op,
			int caps, int wanted, int dirty,
			u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
			u64 size, u64 max_size,
			struct timespec *mtime, struct timespec *atime,
			u64 time_warp_seq,
			uid_t uid, gid_t gid, umode_t mode,
			u64 xattr_version,
			struct ceph_buffer *xattrs_buf,
			u64 follows)
{
	struct ceph_mds_caps *fc;
	struct ceph_msg *msg;

	dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
	     " seq %u/%u mseq %u follows %lld size %llu/%llu"
	     " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
	     cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
	     ceph_cap_string(dirty),
	     seq, issue_seq, mseq, follows, size, max_size,
	     xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);

	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false);
	if (!msg)
		return -ENOMEM;

	/* the flush tid rides in the message header, not the cap struct */
	msg->hdr.tid = cpu_to_le64(flush_tid);

	fc = msg->front.iov_base;
	memset(fc, 0, sizeof(*fc));

	fc->cap_id = cpu_to_le64(cid);
	fc->op = cpu_to_le32(op);
	fc->seq = cpu_to_le32(seq);
	fc->issue_seq = cpu_to_le32(issue_seq);
	fc->migrate_seq = cpu_to_le32(mseq);
	fc->caps = cpu_to_le32(caps);
	fc->wanted = cpu_to_le32(wanted);
	fc->dirty = cpu_to_le32(dirty);
	fc->ino = cpu_to_le64(ino);
	fc->snap_follows = cpu_to_le64(follows);

	fc->size = cpu_to_le64(size);
	fc->max_size = cpu_to_le64(max_size);
	/* timestamps are optional; encode only what the caller provided */
	if (mtime)
		ceph_encode_timespec(&fc->mtime, mtime);
	if (atime)
		ceph_encode_timespec(&fc->atime, atime);
	fc->time_warp_seq = cpu_to_le32(time_warp_seq);

	fc->uid = cpu_to_le32(uid);
	fc->gid = cpu_to_le32(gid);
	fc->mode = cpu_to_le32(mode);

	fc->xattr_version = cpu_to_le64(xattr_version);
	/* xattr blob, if any, travels as the message middle section */
	if (xattrs_buf) {
		msg->middle = ceph_buffer_get(xattrs_buf);
		fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
		msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
	}

	ceph_con_send(&session->s_con, msg);
	return 0;
}
989 | 989 | ||
/*
 * Append one cap-release record to the session's current (partially
 * filled) release message.  When the message reaches
 * CEPH_CAPS_PER_RELEASE entries it is moved to the "done" list for
 * transmission.
 *
 * NOTE(review): the caller appears responsible for having pre-allocated
 * release message space (BUG_ON s_num_cap_releases below) — confirm
 * against ceph_add_cap/ceph_reserve paths.
 */
static void __queue_cap_release(struct ceph_mds_session *session,
				u64 ino, u64 cap_id, u32 migrate_seq,
				u32 issue_seq)
{
	struct ceph_msg *msg;
	struct ceph_mds_cap_release *head;
	struct ceph_mds_cap_item *item;

	spin_lock(&session->s_cap_lock);
	BUG_ON(!session->s_num_cap_releases);
	/* the message at the front of the list always has room */
	msg = list_first_entry(&session->s_cap_releases,
			       struct ceph_msg, list_head);

	dout(" adding %llx release to mds%d msg %p (%d left)\n",
	     ino, session->s_mds, msg, session->s_num_cap_releases);

	BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
	/* bump the record count in the message header */
	head = msg->front.iov_base;
	head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
	/* append the new item at the current end of the front buffer */
	item = msg->front.iov_base + msg->front.iov_len;
	item->ino = cpu_to_le64(ino);
	item->cap_id = cpu_to_le64(cap_id);
	item->migrate_seq = cpu_to_le32(migrate_seq);
	item->seq = cpu_to_le32(issue_seq);

	session->s_num_cap_releases--;

	msg->front.iov_len += sizeof(*item);
	if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
		dout(" release msg %p full\n", msg);
		list_move_tail(&msg->list_head, &session->s_cap_releases_done);
	} else {
		dout(" release msg %p at %d/%d (%d)\n", msg,
		     (int)le32_to_cpu(head->num),
		     (int)CEPH_CAPS_PER_RELEASE,
		     (int)msg->front.iov_len);
	}
	spin_unlock(&session->s_cap_lock);
}
1029 | 1029 | ||
1030 | /* | 1030 | /* |
1031 | * Queue cap releases when an inode is dropped from our cache. Since | 1031 | * Queue cap releases when an inode is dropped from our cache. Since |
1032 | * inode is about to be destroyed, there is no need for i_ceph_lock. | 1032 | * inode is about to be destroyed, there is no need for i_ceph_lock. |
1033 | */ | 1033 | */ |
1034 | void ceph_queue_caps_release(struct inode *inode) | 1034 | void ceph_queue_caps_release(struct inode *inode) |
1035 | { | 1035 | { |
1036 | struct ceph_inode_info *ci = ceph_inode(inode); | 1036 | struct ceph_inode_info *ci = ceph_inode(inode); |
1037 | struct rb_node *p; | 1037 | struct rb_node *p; |
1038 | 1038 | ||
1039 | p = rb_first(&ci->i_caps); | 1039 | p = rb_first(&ci->i_caps); |
1040 | while (p) { | 1040 | while (p) { |
1041 | struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); | 1041 | struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); |
1042 | struct ceph_mds_session *session = cap->session; | 1042 | struct ceph_mds_session *session = cap->session; |
1043 | 1043 | ||
1044 | __queue_cap_release(session, ceph_ino(inode), cap->cap_id, | 1044 | __queue_cap_release(session, ceph_ino(inode), cap->cap_id, |
1045 | cap->mseq, cap->issue_seq); | 1045 | cap->mseq, cap->issue_seq); |
1046 | p = rb_next(p); | 1046 | p = rb_next(p); |
1047 | __ceph_remove_cap(cap); | 1047 | __ceph_remove_cap(cap); |
1048 | } | 1048 | } |
1049 | } | 1049 | } |
1050 | 1050 | ||
/*
 * Send a cap msg on the given inode.  Update our caps state, then
 * drop i_ceph_lock and send the message.
 *
 * Make note of max_size reported/requested from mds, revoked caps
 * that have now been implemented.
 *
 * Make half-hearted attempt to invalidate page cache if we are
 * dropping RDCACHE.  Note that this will leave behind locked pages
 * that we'll then need to deal with elsewhere.
 *
 * Return non-zero if delayed release, or we experienced an error
 * such that the caller should requeue + retry later.
 *
 * called with i_ceph_lock, then drops it.
 * caller should hold snap_rwsem (read), s_mutex.
 */
static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
		      int op, int used, int want, int retain, int flushing,
		      unsigned *pflush_tid)
	__releases(cap->ci->i_ceph_lock)
{
	struct ceph_inode_info *ci = cap->ci;
	struct inode *inode = &ci->vfs_inode;
	u64 cap_id = cap->cap_id;
	int held, revoking, dropping, keep;
	u64 seq, issue_seq, mseq, time_warp_seq, follows;
	u64 size, max_size;
	struct timespec mtime, atime;
	int wake = 0;		/* wake waiters after a revoke completes */
	umode_t mode;
	uid_t uid;
	gid_t gid;
	struct ceph_mds_session *session;
	u64 xattr_version = 0;
	struct ceph_buffer *xattr_blob = NULL;
	int delayed = 0;
	u64 flush_tid = 0;
	int i;
	int ret;

	held = cap->issued | cap->implemented;
	revoking = cap->implemented & ~cap->issued;
	retain &= ~revoking;	/* never retain bits being revoked */
	dropping = cap->issued & ~retain;

	dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
	     inode, cap, cap->session,
	     ceph_cap_string(held), ceph_cap_string(held & retain),
	     ceph_cap_string(revoking));
	BUG_ON((retain & CEPH_CAP_PIN) == 0);

	session = cap->session;

	/* don't release wanted unless we've waited a bit. */
	if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
	    time_before(jiffies, ci->i_hold_caps_min)) {
		dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
		     ceph_cap_string(cap->issued),
		     ceph_cap_string(cap->issued & retain),
		     ceph_cap_string(cap->mds_wanted),
		     ceph_cap_string(want));
		want |= cap->mds_wanted;
		retain |= cap->issued;
		delayed = 1;
	}
	ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);

	cap->issued &= retain;  /* drop bits we don't want */
	if (cap->implemented & ~cap->issued) {
		/*
		 * Wake up any waiters on wanted -> needed transition.
		 * This is due to the weird transition from buffered
		 * to sync IO... we need to flush dirty pages _before_
		 * allowing sync writes to avoid reordering.
		 */
		wake = 1;
	}
	cap->implemented &= cap->issued | used;
	cap->mds_wanted = want;

	if (flushing) {
		/*
		 * assign a tid for flush operations so we can avoid
		 * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
		 * clean type races.  track latest tid for every bit
		 * so we can handle flush AxFw, flush Fw, and have the
		 * first ack clean Ax.
		 */
		flush_tid = ++ci->i_cap_flush_last_tid;
		/*
		 * NOTE(review): flush_tid is u64 but *pflush_tid is
		 * unsigned — value is narrowed here; confirm callers
		 * only compare low 32 bits.
		 */
		if (pflush_tid)
			*pflush_tid = flush_tid;
		dout(" cap_flush_tid %d\n", (int)flush_tid);
		for (i = 0; i < CEPH_CAP_BITS; i++)
			if (flushing & (1 << i))
				ci->i_cap_flush_tid[i] = flush_tid;

		follows = ci->i_head_snapc->seq;
	} else {
		follows = 0;
	}

	/* snapshot inode metadata while we still hold i_ceph_lock */
	keep = cap->implemented;
	seq = cap->seq;
	issue_seq = cap->issue_seq;
	mseq = cap->mseq;
	size = inode->i_size;
	ci->i_reported_size = size;
	max_size = ci->i_wanted_max_size;
	ci->i_requested_max_size = max_size;
	mtime = inode->i_mtime;
	atime = inode->i_atime;
	time_warp_seq = ci->i_time_warp_seq;
	uid = inode->i_uid;
	gid = inode->i_gid;
	mode = inode->i_mode;

	if (flushing & CEPH_CAP_XATTR_EXCL) {
		__ceph_build_xattrs_blob(ci);
		xattr_blob = ci->i_xattrs.blob;
		xattr_version = ci->i_xattrs.version;
	}

	spin_unlock(&ci->i_ceph_lock);

	ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
		op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
		size, max_size, &mtime, &atime, time_warp_seq,
		uid, gid, mode, xattr_version, xattr_blob,
		follows);
	if (ret < 0) {
		dout("error sending cap msg, must requeue %p\n", inode);
		delayed = 1;
	}

	if (wake)
		wake_up_all(&ci->i_cap_wq);

	return delayed;
}
1191 | 1191 | ||
/*
 * When a snapshot is taken, clients accumulate dirty metadata on
 * inodes with capabilities in ceph_cap_snaps to describe the file
 * state at the time the snapshot was taken.  This must be flushed
 * asynchronously back to the MDS once sync writes complete and dirty
 * data is written out.
 *
 * Unless @again is true, skip cap_snaps that were already sent to
 * the MDS (i.e., during this session).
 *
 * Called under i_ceph_lock.  Takes s_mutex as needed.
 *
 * If @psession is non-NULL, *psession is both input (a session whose
 * s_mutex the caller already holds, or NULL) and output (the session,
 * if any, whose s_mutex is held on return; caller must release it).
 * With a NULL @psession, any session mutex taken here is dropped
 * before returning.
 */
void __ceph_flush_snaps(struct ceph_inode_info *ci,
			struct ceph_mds_session **psession,
			int again)
		__releases(ci->i_ceph_lock)
		__acquires(ci->i_ceph_lock)
{
	struct inode *inode = &ci->vfs_inode;
	int mds;
	struct ceph_cap_snap *capsnap;
	u32 mseq;
	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
	struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
						    session->s_mutex */
	u64 next_follows = 0;  /* keep track of how far we've gotten through the
			     i_cap_snaps list, and skip these entries next time
			     around to avoid an infinite loop */

	if (psession)
		session = *psession;

	dout("__flush_snaps %p\n", inode);
retry:
	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
		/* avoid an infinite loop after retry */
		if (capsnap->follows < next_follows)
			continue;
		/*
		 * we need to wait for sync writes to complete and for dirty
		 * pages to be written out.
		 */
		if (capsnap->dirty_pages || capsnap->writing)
			break;

		/*
		 * if cap writeback already occurred, we should have dropped
		 * the capsnap in ceph_put_wrbuffer_cap_refs.
		 */
		BUG_ON(capsnap->dirty == 0);

		/* pick mds, take s_mutex */
		if (ci->i_auth_cap == NULL) {
			dout("no auth cap (migrating?), doing nothing\n");
			goto out;
		}

		/* only flush each capsnap once */
		if (!again && !list_empty(&capsnap->flushing_item)) {
			dout("already flushed %p, skipping\n", capsnap);
			continue;
		}

		mds = ci->i_auth_cap->session->s_mds;
		mseq = ci->i_auth_cap->mseq;

		/* holding the wrong session's mutex?  drop it first */
		if (session && session->s_mds != mds) {
			dout("oops, wrong session %p mutex\n", session);
			mutex_unlock(&session->s_mutex);
			ceph_put_mds_session(session);
			session = NULL;
		}
		if (!session) {
			/*
			 * Lock order is s_mutex before i_ceph_lock, so we
			 * must drop i_ceph_lock before looking up and
			 * locking the session, then rescan from the top.
			 */
			spin_unlock(&ci->i_ceph_lock);
			mutex_lock(&mdsc->mutex);
			session = __ceph_lookup_mds_session(mdsc, mds);
			mutex_unlock(&mdsc->mutex);
			if (session) {
				dout("inverting session/ino locks on %p\n",
				     session);
				mutex_lock(&session->s_mutex);
			}
			/*
			 * if session == NULL, we raced against a cap
			 * deletion or migration.  retry, and we'll
			 * get a better @mds value next time.
			 */
			spin_lock(&ci->i_ceph_lock);
			goto retry;
		}

		capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
		/* hold a capsnap ref across the unlocked send below */
		atomic_inc(&capsnap->nref);
		if (!list_empty(&capsnap->flushing_item))
			list_del_init(&capsnap->flushing_item);
		list_add_tail(&capsnap->flushing_item,
			      &session->s_cap_snaps_flushing);
		spin_unlock(&ci->i_ceph_lock);

		dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
		     inode, capsnap, capsnap->follows, capsnap->flush_tid);
		send_cap_msg(session, ceph_vino(inode).ino, 0,
			     CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
			     capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
			     capsnap->size, 0,
			     &capsnap->mtime, &capsnap->atime,
			     capsnap->time_warp_seq,
			     capsnap->uid, capsnap->gid, capsnap->mode,
			     capsnap->xattr_version, capsnap->xattr_blob,
			     capsnap->follows);

		next_follows = capsnap->follows + 1;
		ceph_put_cap_snap(capsnap);

		/* the list may have changed while unlocked; rescan */
		spin_lock(&ci->i_ceph_lock);
		goto retry;
	}

	/* we flushed them all; remove this inode from the queue */
	spin_lock(&mdsc->snap_flush_lock);
	list_del_init(&ci->i_snap_flush_item);
	spin_unlock(&mdsc->snap_flush_lock);

out:
	if (psession)
		*psession = session;
	else if (session) {
		mutex_unlock(&session->s_mutex);
		ceph_put_mds_session(session);
	}
}
1323 | 1323 | ||
/*
 * Flush pending cap_snaps for this inode.  Lock-taking wrapper around
 * __ceph_flush_snaps(); passing a NULL psession means any session
 * mutex taken internally is released before this returns.
 */
static void ceph_flush_snaps(struct ceph_inode_info *ci)
{
	spin_lock(&ci->i_ceph_lock);
	__ceph_flush_snaps(ci, NULL, 0);
	spin_unlock(&ci->i_ceph_lock);
}
1330 | 1330 | ||
/*
 * Mark caps dirty.  If inode is newly dirty, return the dirty flags.
 * Caller is then responsible for calling __mark_inode_dirty with the
 * returned flags value.
 *
 * NOTE(review): like the other __-prefixed helpers here, this appears
 * to rely on the caller holding i_ceph_lock -- confirm at call sites.
 */
int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
{
	struct ceph_mds_client *mdsc =
		ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
	struct inode *inode = &ci->vfs_inode;
	int was = ci->i_dirty_caps;
	int dirty = 0;

	dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
	     ceph_cap_string(mask), ceph_cap_string(was),
	     ceph_cap_string(was | mask));
	ci->i_dirty_caps |= mask;
	if (was == 0) {
		/*
		 * clean -> dirty transition: pin the snap context the
		 * dirty data belongs to, and put the inode on the
		 * mdsc-wide dirty list.
		 */
		if (!ci->i_head_snapc)
			ci->i_head_snapc = ceph_get_snap_context(
				ci->i_snap_realm->cached_context);
		dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode,
		     ci->i_head_snapc);
		BUG_ON(!list_empty(&ci->i_dirty_item));
		spin_lock(&mdsc->cap_dirty_lock);
		list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
		spin_unlock(&mdsc->cap_dirty_lock);
		if (ci->i_flushing_caps == 0) {
			/* take an inode reference for the dirty state */
			ihold(inode);
			dirty |= I_DIRTY_SYNC;
		}
	}
	BUG_ON(list_empty(&ci->i_dirty_item));
	/* buffered file data newly dirtied on top of prior buffer caps */
	if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
	    (mask & CEPH_CAP_FILE_BUFFER))
		dirty |= I_DIRTY_DATASYNC;
	__cap_delay_requeue(mdsc, ci);
	return dirty;
}
1370 | 1370 | ||
/*
 * Add dirty inode to the flushing list.  Assigned a seq number so we
 * can wait for caps to flush without starving.
 *
 * Called under i_ceph_lock.
 *
 * Moves i_dirty_caps into i_flushing_caps, assigns a fresh
 * cap_flush_seq, and (re)queues the inode on @session's
 * s_cap_flushing list.  Returns the cap mask now being flushed.
 */
static int __mark_caps_flushing(struct inode *inode,
				struct ceph_mds_session *session)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
	struct ceph_inode_info *ci = ceph_inode(inode);
	int flushing;

	/* callers must have something dirty, and on the dirty list */
	BUG_ON(ci->i_dirty_caps == 0);
	BUG_ON(list_empty(&ci->i_dirty_item));

	flushing = ci->i_dirty_caps;
	dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
	     ceph_cap_string(flushing),
	     ceph_cap_string(ci->i_flushing_caps),
	     ceph_cap_string(ci->i_flushing_caps | flushing));
	ci->i_flushing_caps |= flushing;
	ci->i_dirty_caps = 0;
	dout(" inode %p now !dirty\n", inode);

	spin_lock(&mdsc->cap_dirty_lock);
	list_del_init(&ci->i_dirty_item);

	ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
	if (list_empty(&ci->i_flushing_item)) {
		/* first flush for this inode: add and count it */
		list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
		mdsc->num_cap_flushing++;
		dout(" inode %p now flushing seq %lld\n", inode,
		     ci->i_cap_flush_seq);
	} else {
		/* already flushing: move to the tail of the session list */
		list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
		dout(" inode %p now flushing (more) seq %lld\n", inode,
		     ci->i_cap_flush_seq);
	}
	spin_unlock(&mdsc->cap_dirty_lock);

	return flushing;
}
1414 | 1414 | ||
1415 | /* | 1415 | /* |
1416 | * try to invalidate mapping pages without blocking. | 1416 | * try to invalidate mapping pages without blocking. |
1417 | */ | 1417 | */ |
1418 | static int try_nonblocking_invalidate(struct inode *inode) | 1418 | static int try_nonblocking_invalidate(struct inode *inode) |
1419 | { | 1419 | { |
1420 | struct ceph_inode_info *ci = ceph_inode(inode); | 1420 | struct ceph_inode_info *ci = ceph_inode(inode); |
1421 | u32 invalidating_gen = ci->i_rdcache_gen; | 1421 | u32 invalidating_gen = ci->i_rdcache_gen; |
1422 | 1422 | ||
1423 | spin_unlock(&ci->i_ceph_lock); | 1423 | spin_unlock(&ci->i_ceph_lock); |
1424 | invalidate_mapping_pages(&inode->i_data, 0, -1); | 1424 | invalidate_mapping_pages(&inode->i_data, 0, -1); |
1425 | spin_lock(&ci->i_ceph_lock); | 1425 | spin_lock(&ci->i_ceph_lock); |
1426 | 1426 | ||
1427 | if (inode->i_data.nrpages == 0 && | 1427 | if (inode->i_data.nrpages == 0 && |
1428 | invalidating_gen == ci->i_rdcache_gen) { | 1428 | invalidating_gen == ci->i_rdcache_gen) { |
1429 | /* success. */ | 1429 | /* success. */ |
1430 | dout("try_nonblocking_invalidate %p success\n", inode); | 1430 | dout("try_nonblocking_invalidate %p success\n", inode); |
1431 | /* save any racing async invalidate some trouble */ | 1431 | /* save any racing async invalidate some trouble */ |
1432 | ci->i_rdcache_revoking = ci->i_rdcache_gen - 1; | 1432 | ci->i_rdcache_revoking = ci->i_rdcache_gen - 1; |
1433 | return 0; | 1433 | return 0; |
1434 | } | 1434 | } |
1435 | dout("try_nonblocking_invalidate %p failed\n", inode); | 1435 | dout("try_nonblocking_invalidate %p failed\n", inode); |
1436 | return -1; | 1436 | return -1; |
1437 | } | 1437 | } |
1438 | 1438 | ||
1439 | /* | 1439 | /* |
1440 | * Swiss army knife function to examine currently used and wanted | 1440 | * Swiss army knife function to examine currently used and wanted |
1441 | * versus held caps. Release, flush, ack revoked caps to mds as | 1441 | * versus held caps. Release, flush, ack revoked caps to mds as |
1442 | * appropriate. | 1442 | * appropriate. |
1443 | * | 1443 | * |
1444 | * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay | 1444 | * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay |
1445 | * cap release further. | 1445 | * cap release further. |
1446 | * CHECK_CAPS_AUTHONLY - we should only check the auth cap | 1446 | * CHECK_CAPS_AUTHONLY - we should only check the auth cap |
1447 | * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without | 1447 | * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without |
1448 | * further delay. | 1448 | * further delay. |
1449 | */ | 1449 | */ |
1450 | void ceph_check_caps(struct ceph_inode_info *ci, int flags, | 1450 | void ceph_check_caps(struct ceph_inode_info *ci, int flags, |
1451 | struct ceph_mds_session *session) | 1451 | struct ceph_mds_session *session) |
1452 | { | 1452 | { |
1453 | struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); | 1453 | struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); |
1454 | struct ceph_mds_client *mdsc = fsc->mdsc; | 1454 | struct ceph_mds_client *mdsc = fsc->mdsc; |
1455 | struct inode *inode = &ci->vfs_inode; | 1455 | struct inode *inode = &ci->vfs_inode; |
1456 | struct ceph_cap *cap; | 1456 | struct ceph_cap *cap; |
1457 | int file_wanted, used; | 1457 | int file_wanted, used; |
1458 | int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ | 1458 | int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ |
1459 | int issued, implemented, want, retain, revoking, flushing = 0; | 1459 | int issued, implemented, want, retain, revoking, flushing = 0; |
1460 | int mds = -1; /* keep track of how far we've gone through i_caps list | 1460 | int mds = -1; /* keep track of how far we've gone through i_caps list |
1461 | to avoid an infinite loop on retry */ | 1461 | to avoid an infinite loop on retry */ |
1462 | struct rb_node *p; | 1462 | struct rb_node *p; |
1463 | int tried_invalidate = 0; | 1463 | int tried_invalidate = 0; |
1464 | int delayed = 0, sent = 0, force_requeue = 0, num; | 1464 | int delayed = 0, sent = 0, force_requeue = 0, num; |
1465 | int queue_invalidate = 0; | 1465 | int queue_invalidate = 0; |
1466 | int is_delayed = flags & CHECK_CAPS_NODELAY; | 1466 | int is_delayed = flags & CHECK_CAPS_NODELAY; |
1467 | 1467 | ||
1468 | /* if we are unmounting, flush any unused caps immediately. */ | 1468 | /* if we are unmounting, flush any unused caps immediately. */ |
1469 | if (mdsc->stopping) | 1469 | if (mdsc->stopping) |
1470 | is_delayed = 1; | 1470 | is_delayed = 1; |
1471 | 1471 | ||
1472 | spin_lock(&ci->i_ceph_lock); | 1472 | spin_lock(&ci->i_ceph_lock); |
1473 | 1473 | ||
1474 | if (ci->i_ceph_flags & CEPH_I_FLUSH) | 1474 | if (ci->i_ceph_flags & CEPH_I_FLUSH) |
1475 | flags |= CHECK_CAPS_FLUSH; | 1475 | flags |= CHECK_CAPS_FLUSH; |
1476 | 1476 | ||
1477 | /* flush snaps first time around only */ | 1477 | /* flush snaps first time around only */ |
1478 | if (!list_empty(&ci->i_cap_snaps)) | 1478 | if (!list_empty(&ci->i_cap_snaps)) |
1479 | __ceph_flush_snaps(ci, &session, 0); | 1479 | __ceph_flush_snaps(ci, &session, 0); |
1480 | goto retry_locked; | 1480 | goto retry_locked; |
1481 | retry: | 1481 | retry: |
1482 | spin_lock(&ci->i_ceph_lock); | 1482 | spin_lock(&ci->i_ceph_lock); |
1483 | retry_locked: | 1483 | retry_locked: |
1484 | file_wanted = __ceph_caps_file_wanted(ci); | 1484 | file_wanted = __ceph_caps_file_wanted(ci); |
1485 | used = __ceph_caps_used(ci); | 1485 | used = __ceph_caps_used(ci); |
1486 | want = file_wanted | used; | 1486 | want = file_wanted | used; |
1487 | issued = __ceph_caps_issued(ci, &implemented); | 1487 | issued = __ceph_caps_issued(ci, &implemented); |
1488 | revoking = implemented & ~issued; | 1488 | revoking = implemented & ~issued; |
1489 | 1489 | ||
1490 | retain = want | CEPH_CAP_PIN; | 1490 | retain = want | CEPH_CAP_PIN; |
1491 | if (!mdsc->stopping && inode->i_nlink > 0) { | 1491 | if (!mdsc->stopping && inode->i_nlink > 0) { |
1492 | if (want) { | 1492 | if (want) { |
1493 | retain |= CEPH_CAP_ANY; /* be greedy */ | 1493 | retain |= CEPH_CAP_ANY; /* be greedy */ |
1494 | } else { | 1494 | } else { |
1495 | retain |= CEPH_CAP_ANY_SHARED; | 1495 | retain |= CEPH_CAP_ANY_SHARED; |
1496 | /* | 1496 | /* |
1497 | * keep RD only if we didn't have the file open RW, | 1497 | * keep RD only if we didn't have the file open RW, |
1498 | * because then the mds would revoke it anyway to | 1498 | * because then the mds would revoke it anyway to |
1499 | * journal max_size=0. | 1499 | * journal max_size=0. |
1500 | */ | 1500 | */ |
1501 | if (ci->i_max_size == 0) | 1501 | if (ci->i_max_size == 0) |
1502 | retain |= CEPH_CAP_ANY_RD; | 1502 | retain |= CEPH_CAP_ANY_RD; |
1503 | } | 1503 | } |
1504 | } | 1504 | } |
1505 | 1505 | ||
1506 | dout("check_caps %p file_want %s used %s dirty %s flushing %s" | 1506 | dout("check_caps %p file_want %s used %s dirty %s flushing %s" |
1507 | " issued %s revoking %s retain %s %s%s%s\n", inode, | 1507 | " issued %s revoking %s retain %s %s%s%s\n", inode, |
1508 | ceph_cap_string(file_wanted), | 1508 | ceph_cap_string(file_wanted), |
1509 | ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), | 1509 | ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), |
1510 | ceph_cap_string(ci->i_flushing_caps), | 1510 | ceph_cap_string(ci->i_flushing_caps), |
1511 | ceph_cap_string(issued), ceph_cap_string(revoking), | 1511 | ceph_cap_string(issued), ceph_cap_string(revoking), |
1512 | ceph_cap_string(retain), | 1512 | ceph_cap_string(retain), |
1513 | (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", | 1513 | (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", |
1514 | (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "", | 1514 | (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "", |
1515 | (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : ""); | 1515 | (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : ""); |
1516 | 1516 | ||
1517 | /* | 1517 | /* |
1518 | * If we no longer need to hold onto old our caps, and we may | 1518 | * If we no longer need to hold onto old our caps, and we may |
1519 | * have cached pages, but don't want them, then try to invalidate. | 1519 | * have cached pages, but don't want them, then try to invalidate. |
1520 | * If we fail, it's because pages are locked.... try again later. | 1520 | * If we fail, it's because pages are locked.... try again later. |
1521 | */ | 1521 | */ |
1522 | if ((!is_delayed || mdsc->stopping) && | 1522 | if ((!is_delayed || mdsc->stopping) && |
1523 | ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ | 1523 | ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ |
1524 | inode->i_data.nrpages && /* have cached pages */ | 1524 | inode->i_data.nrpages && /* have cached pages */ |
1525 | (file_wanted == 0 || /* no open files */ | 1525 | (file_wanted == 0 || /* no open files */ |
1526 | (revoking & (CEPH_CAP_FILE_CACHE| | 1526 | (revoking & (CEPH_CAP_FILE_CACHE| |
1527 | CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ | 1527 | CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ |
1528 | !tried_invalidate) { | 1528 | !tried_invalidate) { |
1529 | dout("check_caps trying to invalidate on %p\n", inode); | 1529 | dout("check_caps trying to invalidate on %p\n", inode); |
1530 | if (try_nonblocking_invalidate(inode) < 0) { | 1530 | if (try_nonblocking_invalidate(inode) < 0) { |
1531 | if (revoking & (CEPH_CAP_FILE_CACHE| | 1531 | if (revoking & (CEPH_CAP_FILE_CACHE| |
1532 | CEPH_CAP_FILE_LAZYIO)) { | 1532 | CEPH_CAP_FILE_LAZYIO)) { |
1533 | dout("check_caps queuing invalidate\n"); | 1533 | dout("check_caps queuing invalidate\n"); |
1534 | queue_invalidate = 1; | 1534 | queue_invalidate = 1; |
1535 | ci->i_rdcache_revoking = ci->i_rdcache_gen; | 1535 | ci->i_rdcache_revoking = ci->i_rdcache_gen; |
1536 | } else { | 1536 | } else { |
1537 | dout("check_caps failed to invalidate pages\n"); | 1537 | dout("check_caps failed to invalidate pages\n"); |
1538 | /* we failed to invalidate pages. check these | 1538 | /* we failed to invalidate pages. check these |
1539 | caps again later. */ | 1539 | caps again later. */ |
1540 | force_requeue = 1; | 1540 | force_requeue = 1; |
1541 | __cap_set_timeouts(mdsc, ci); | 1541 | __cap_set_timeouts(mdsc, ci); |
1542 | } | 1542 | } |
1543 | } | 1543 | } |
1544 | tried_invalidate = 1; | 1544 | tried_invalidate = 1; |
1545 | goto retry_locked; | 1545 | goto retry_locked; |
1546 | } | 1546 | } |
1547 | 1547 | ||
1548 | num = 0; | 1548 | num = 0; |
1549 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { | 1549 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { |
1550 | cap = rb_entry(p, struct ceph_cap, ci_node); | 1550 | cap = rb_entry(p, struct ceph_cap, ci_node); |
1551 | num++; | 1551 | num++; |
1552 | 1552 | ||
1553 | /* avoid looping forever */ | 1553 | /* avoid looping forever */ |
1554 | if (mds >= cap->mds || | 1554 | if (mds >= cap->mds || |
1555 | ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap)) | 1555 | ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap)) |
1556 | continue; | 1556 | continue; |
1557 | 1557 | ||
1558 | /* NOTE: no side-effects allowed, until we take s_mutex */ | 1558 | /* NOTE: no side-effects allowed, until we take s_mutex */ |
1559 | 1559 | ||
1560 | revoking = cap->implemented & ~cap->issued; | 1560 | revoking = cap->implemented & ~cap->issued; |
1561 | dout(" mds%d cap %p issued %s implemented %s revoking %s\n", | 1561 | dout(" mds%d cap %p issued %s implemented %s revoking %s\n", |
1562 | cap->mds, cap, ceph_cap_string(cap->issued), | 1562 | cap->mds, cap, ceph_cap_string(cap->issued), |
1563 | ceph_cap_string(cap->implemented), | 1563 | ceph_cap_string(cap->implemented), |
1564 | ceph_cap_string(revoking)); | 1564 | ceph_cap_string(revoking)); |
1565 | 1565 | ||
1566 | if (cap == ci->i_auth_cap && | 1566 | if (cap == ci->i_auth_cap && |
1567 | (cap->issued & CEPH_CAP_FILE_WR)) { | 1567 | (cap->issued & CEPH_CAP_FILE_WR)) { |
1568 | /* request larger max_size from MDS? */ | 1568 | /* request larger max_size from MDS? */ |
1569 | if (ci->i_wanted_max_size > ci->i_max_size && | 1569 | if (ci->i_wanted_max_size > ci->i_max_size && |
1570 | ci->i_wanted_max_size > ci->i_requested_max_size) { | 1570 | ci->i_wanted_max_size > ci->i_requested_max_size) { |
1571 | dout("requesting new max_size\n"); | 1571 | dout("requesting new max_size\n"); |
1572 | goto ack; | 1572 | goto ack; |
1573 | } | 1573 | } |
1574 | 1574 | ||
1575 | /* approaching file_max? */ | 1575 | /* approaching file_max? */ |
1576 | if ((inode->i_size << 1) >= ci->i_max_size && | 1576 | if ((inode->i_size << 1) >= ci->i_max_size && |
1577 | (ci->i_reported_size << 1) < ci->i_max_size) { | 1577 | (ci->i_reported_size << 1) < ci->i_max_size) { |
1578 | dout("i_size approaching max_size\n"); | 1578 | dout("i_size approaching max_size\n"); |
1579 | goto ack; | 1579 | goto ack; |
1580 | } | 1580 | } |
1581 | } | 1581 | } |
1582 | /* flush anything dirty? */ | 1582 | /* flush anything dirty? */ |
1583 | if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) && | 1583 | if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) && |
1584 | ci->i_dirty_caps) { | 1584 | ci->i_dirty_caps) { |
1585 | dout("flushing dirty caps\n"); | 1585 | dout("flushing dirty caps\n"); |
1586 | goto ack; | 1586 | goto ack; |
1587 | } | 1587 | } |
1588 | 1588 | ||
1589 | /* completed revocation? going down and there are no caps? */ | 1589 | /* completed revocation? going down and there are no caps? */ |
1590 | if (revoking && (revoking & used) == 0) { | 1590 | if (revoking && (revoking & used) == 0) { |
1591 | dout("completed revocation of %s\n", | 1591 | dout("completed revocation of %s\n", |
1592 | ceph_cap_string(cap->implemented & ~cap->issued)); | 1592 | ceph_cap_string(cap->implemented & ~cap->issued)); |
1593 | goto ack; | 1593 | goto ack; |
1594 | } | 1594 | } |
1595 | 1595 | ||
1596 | /* want more caps from mds? */ | 1596 | /* want more caps from mds? */ |
1597 | if (want & ~(cap->mds_wanted | cap->issued)) | 1597 | if (want & ~(cap->mds_wanted | cap->issued)) |
1598 | goto ack; | 1598 | goto ack; |
1599 | 1599 | ||
1600 | /* things we might delay */ | 1600 | /* things we might delay */ |
1601 | if ((cap->issued & ~retain) == 0 && | 1601 | if ((cap->issued & ~retain) == 0 && |
1602 | cap->mds_wanted == want) | 1602 | cap->mds_wanted == want) |
1603 | continue; /* nope, all good */ | 1603 | continue; /* nope, all good */ |
1604 | 1604 | ||
1605 | if (is_delayed) | 1605 | if (is_delayed) |
1606 | goto ack; | 1606 | goto ack; |
1607 | 1607 | ||
1608 | /* delay? */ | 1608 | /* delay? */ |
1609 | if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && | 1609 | if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && |
1610 | time_before(jiffies, ci->i_hold_caps_max)) { | 1610 | time_before(jiffies, ci->i_hold_caps_max)) { |
1611 | dout(" delaying issued %s -> %s, wanted %s -> %s\n", | 1611 | dout(" delaying issued %s -> %s, wanted %s -> %s\n", |
1612 | ceph_cap_string(cap->issued), | 1612 | ceph_cap_string(cap->issued), |
1613 | ceph_cap_string(cap->issued & retain), | 1613 | ceph_cap_string(cap->issued & retain), |
1614 | ceph_cap_string(cap->mds_wanted), | 1614 | ceph_cap_string(cap->mds_wanted), |
1615 | ceph_cap_string(want)); | 1615 | ceph_cap_string(want)); |
1616 | delayed++; | 1616 | delayed++; |
1617 | continue; | 1617 | continue; |
1618 | } | 1618 | } |
1619 | 1619 | ||
1620 | ack: | 1620 | ack: |
1621 | if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { | 1621 | if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { |
1622 | dout(" skipping %p I_NOFLUSH set\n", inode); | 1622 | dout(" skipping %p I_NOFLUSH set\n", inode); |
1623 | continue; | 1623 | continue; |
1624 | } | 1624 | } |
1625 | 1625 | ||
1626 | if (session && session != cap->session) { | 1626 | if (session && session != cap->session) { |
1627 | dout("oops, wrong session %p mutex\n", session); | 1627 | dout("oops, wrong session %p mutex\n", session); |
1628 | mutex_unlock(&session->s_mutex); | 1628 | mutex_unlock(&session->s_mutex); |
1629 | session = NULL; | 1629 | session = NULL; |
1630 | } | 1630 | } |
1631 | if (!session) { | 1631 | if (!session) { |
1632 | session = cap->session; | 1632 | session = cap->session; |
1633 | if (mutex_trylock(&session->s_mutex) == 0) { | 1633 | if (mutex_trylock(&session->s_mutex) == 0) { |
1634 | dout("inverting session/ino locks on %p\n", | 1634 | dout("inverting session/ino locks on %p\n", |
1635 | session); | 1635 | session); |
1636 | spin_unlock(&ci->i_ceph_lock); | 1636 | spin_unlock(&ci->i_ceph_lock); |
1637 | if (took_snap_rwsem) { | 1637 | if (took_snap_rwsem) { |
1638 | up_read(&mdsc->snap_rwsem); | 1638 | up_read(&mdsc->snap_rwsem); |
1639 | took_snap_rwsem = 0; | 1639 | took_snap_rwsem = 0; |
1640 | } | 1640 | } |
1641 | mutex_lock(&session->s_mutex); | 1641 | mutex_lock(&session->s_mutex); |
1642 | goto retry; | 1642 | goto retry; |
1643 | } | 1643 | } |
1644 | } | 1644 | } |
1645 | /* take snap_rwsem after session mutex */ | 1645 | /* take snap_rwsem after session mutex */ |
1646 | if (!took_snap_rwsem) { | 1646 | if (!took_snap_rwsem) { |
1647 | if (down_read_trylock(&mdsc->snap_rwsem) == 0) { | 1647 | if (down_read_trylock(&mdsc->snap_rwsem) == 0) { |
1648 | dout("inverting snap/in locks on %p\n", | 1648 | dout("inverting snap/in locks on %p\n", |
1649 | inode); | 1649 | inode); |
1650 | spin_unlock(&ci->i_ceph_lock); | 1650 | spin_unlock(&ci->i_ceph_lock); |
1651 | down_read(&mdsc->snap_rwsem); | 1651 | down_read(&mdsc->snap_rwsem); |
1652 | took_snap_rwsem = 1; | 1652 | took_snap_rwsem = 1; |
1653 | goto retry; | 1653 | goto retry; |
1654 | } | 1654 | } |
1655 | took_snap_rwsem = 1; | 1655 | took_snap_rwsem = 1; |
1656 | } | 1656 | } |
1657 | 1657 | ||
1658 | if (cap == ci->i_auth_cap && ci->i_dirty_caps) | 1658 | if (cap == ci->i_auth_cap && ci->i_dirty_caps) |
1659 | flushing = __mark_caps_flushing(inode, session); | 1659 | flushing = __mark_caps_flushing(inode, session); |
1660 | else | 1660 | else |
1661 | flushing = 0; | 1661 | flushing = 0; |
1662 | 1662 | ||
1663 | mds = cap->mds; /* remember mds, so we don't repeat */ | 1663 | mds = cap->mds; /* remember mds, so we don't repeat */ |
1664 | sent++; | 1664 | sent++; |
1665 | 1665 | ||
1666 | /* __send_cap drops i_ceph_lock */ | 1666 | /* __send_cap drops i_ceph_lock */ |
1667 | delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, | 1667 | delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, |
1668 | retain, flushing, NULL); | 1668 | retain, flushing, NULL); |
1669 | goto retry; /* retake i_ceph_lock and restart our cap scan. */ | 1669 | goto retry; /* retake i_ceph_lock and restart our cap scan. */ |
1670 | } | 1670 | } |
1671 | 1671 | ||
1672 | /* | 1672 | /* |
1673 | * Reschedule delayed caps release if we delayed anything, | 1673 | * Reschedule delayed caps release if we delayed anything, |
1674 | * otherwise cancel. | 1674 | * otherwise cancel. |
1675 | */ | 1675 | */ |
1676 | if (delayed && is_delayed) | 1676 | if (delayed && is_delayed) |
1677 | force_requeue = 1; /* __send_cap delayed release; requeue */ | 1677 | force_requeue = 1; /* __send_cap delayed release; requeue */ |
1678 | if (!delayed && !is_delayed) | 1678 | if (!delayed && !is_delayed) |
1679 | __cap_delay_cancel(mdsc, ci); | 1679 | __cap_delay_cancel(mdsc, ci); |
1680 | else if (!is_delayed || force_requeue) | 1680 | else if (!is_delayed || force_requeue) |
1681 | __cap_delay_requeue(mdsc, ci); | 1681 | __cap_delay_requeue(mdsc, ci); |
1682 | 1682 | ||
1683 | spin_unlock(&ci->i_ceph_lock); | 1683 | spin_unlock(&ci->i_ceph_lock); |
1684 | 1684 | ||
1685 | if (queue_invalidate) | 1685 | if (queue_invalidate) |
1686 | ceph_queue_invalidate(inode); | 1686 | ceph_queue_invalidate(inode); |
1687 | 1687 | ||
1688 | if (session) | 1688 | if (session) |
1689 | mutex_unlock(&session->s_mutex); | 1689 | mutex_unlock(&session->s_mutex); |
1690 | if (took_snap_rwsem) | 1690 | if (took_snap_rwsem) |
1691 | up_read(&mdsc->snap_rwsem); | 1691 | up_read(&mdsc->snap_rwsem); |
1692 | } | 1692 | } |
1693 | 1693 | ||
1694 | /* | 1694 | /* |
1695 | * Try to flush dirty caps back to the auth mds. | 1695 | * Try to flush dirty caps back to the auth mds. |
1696 | */ | 1696 | */ |
1697 | static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, | 1697 | static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, |
1698 | unsigned *flush_tid) | 1698 | unsigned *flush_tid) |
1699 | { | 1699 | { |
1700 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; | 1700 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; |
1701 | struct ceph_inode_info *ci = ceph_inode(inode); | 1701 | struct ceph_inode_info *ci = ceph_inode(inode); |
1702 | int unlock_session = session ? 0 : 1; | 1702 | int unlock_session = session ? 0 : 1; |
1703 | int flushing = 0; | 1703 | int flushing = 0; |
1704 | 1704 | ||
1705 | retry: | 1705 | retry: |
1706 | spin_lock(&ci->i_ceph_lock); | 1706 | spin_lock(&ci->i_ceph_lock); |
1707 | if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { | 1707 | if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { |
1708 | dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); | 1708 | dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); |
1709 | goto out; | 1709 | goto out; |
1710 | } | 1710 | } |
1711 | if (ci->i_dirty_caps && ci->i_auth_cap) { | 1711 | if (ci->i_dirty_caps && ci->i_auth_cap) { |
1712 | struct ceph_cap *cap = ci->i_auth_cap; | 1712 | struct ceph_cap *cap = ci->i_auth_cap; |
1713 | int used = __ceph_caps_used(ci); | 1713 | int used = __ceph_caps_used(ci); |
1714 | int want = __ceph_caps_wanted(ci); | 1714 | int want = __ceph_caps_wanted(ci); |
1715 | int delayed; | 1715 | int delayed; |
1716 | 1716 | ||
1717 | if (!session) { | 1717 | if (!session) { |
1718 | spin_unlock(&ci->i_ceph_lock); | 1718 | spin_unlock(&ci->i_ceph_lock); |
1719 | session = cap->session; | 1719 | session = cap->session; |
1720 | mutex_lock(&session->s_mutex); | 1720 | mutex_lock(&session->s_mutex); |
1721 | goto retry; | 1721 | goto retry; |
1722 | } | 1722 | } |
1723 | BUG_ON(session != cap->session); | 1723 | BUG_ON(session != cap->session); |
1724 | if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) | 1724 | if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) |
1725 | goto out; | 1725 | goto out; |
1726 | 1726 | ||
1727 | flushing = __mark_caps_flushing(inode, session); | 1727 | flushing = __mark_caps_flushing(inode, session); |
1728 | 1728 | ||
1729 | /* __send_cap drops i_ceph_lock */ | 1729 | /* __send_cap drops i_ceph_lock */ |
1730 | delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, | 1730 | delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, |
1731 | cap->issued | cap->implemented, flushing, | 1731 | cap->issued | cap->implemented, flushing, |
1732 | flush_tid); | 1732 | flush_tid); |
1733 | if (!delayed) | 1733 | if (!delayed) |
1734 | goto out_unlocked; | 1734 | goto out_unlocked; |
1735 | 1735 | ||
1736 | spin_lock(&ci->i_ceph_lock); | 1736 | spin_lock(&ci->i_ceph_lock); |
1737 | __cap_delay_requeue(mdsc, ci); | 1737 | __cap_delay_requeue(mdsc, ci); |
1738 | } | 1738 | } |
1739 | out: | 1739 | out: |
1740 | spin_unlock(&ci->i_ceph_lock); | 1740 | spin_unlock(&ci->i_ceph_lock); |
1741 | out_unlocked: | 1741 | out_unlocked: |
1742 | if (session && unlock_session) | 1742 | if (session && unlock_session) |
1743 | mutex_unlock(&session->s_mutex); | 1743 | mutex_unlock(&session->s_mutex); |
1744 | return flushing; | 1744 | return flushing; |
1745 | } | 1745 | } |
1746 | 1746 | ||
1747 | /* | 1747 | /* |
1748 | * Return true if we've flushed caps through the given flush_tid. | 1748 | * Return true if we've flushed caps through the given flush_tid. |
1749 | */ | 1749 | */ |
1750 | static int caps_are_flushed(struct inode *inode, unsigned tid) | 1750 | static int caps_are_flushed(struct inode *inode, unsigned tid) |
1751 | { | 1751 | { |
1752 | struct ceph_inode_info *ci = ceph_inode(inode); | 1752 | struct ceph_inode_info *ci = ceph_inode(inode); |
1753 | int i, ret = 1; | 1753 | int i, ret = 1; |
1754 | 1754 | ||
1755 | spin_lock(&ci->i_ceph_lock); | 1755 | spin_lock(&ci->i_ceph_lock); |
1756 | for (i = 0; i < CEPH_CAP_BITS; i++) | 1756 | for (i = 0; i < CEPH_CAP_BITS; i++) |
1757 | if ((ci->i_flushing_caps & (1 << i)) && | 1757 | if ((ci->i_flushing_caps & (1 << i)) && |
1758 | ci->i_cap_flush_tid[i] <= tid) { | 1758 | ci->i_cap_flush_tid[i] <= tid) { |
1759 | /* still flushing this bit */ | 1759 | /* still flushing this bit */ |
1760 | ret = 0; | 1760 | ret = 0; |
1761 | break; | 1761 | break; |
1762 | } | 1762 | } |
1763 | spin_unlock(&ci->i_ceph_lock); | 1763 | spin_unlock(&ci->i_ceph_lock); |
1764 | return ret; | 1764 | return ret; |
1765 | } | 1765 | } |
1766 | 1766 | ||
/*
 * Wait on any unsafe replies for the given inode.  First wait on the
 * newest request, and make that the upper bound.  Then, if there are
 * more requests, keep waiting on the oldest as long as it is still older
 * than the original request.
 */
static void sync_write_wait(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct list_head *head = &ci->i_unsafe_writes;
	struct ceph_osd_request *req;
	u64 last_tid;

	spin_lock(&ci->i_unsafe_lock);
	if (list_empty(head))
		goto out;

	/* set upper bound as _last_ entry in chain */
	req = list_entry(head->prev, struct ceph_osd_request,
			 r_unsafe_item);
	last_tid = req->r_tid;

	do {
		/*
		 * take a ref so the request can't be freed while we sleep
		 * with i_unsafe_lock dropped
		 */
		ceph_osdc_get_request(req);
		spin_unlock(&ci->i_unsafe_lock);
		dout("sync_write_wait on tid %llu (until %llu)\n",
		     req->r_tid, last_tid);
		wait_for_completion(&req->r_safe_completion);
		spin_lock(&ci->i_unsafe_lock);
		ceph_osdc_put_request(req);

		/*
		 * from here on look at first entry in chain, since we
		 * only want to wait for anything older than last_tid
		 */
		if (list_empty(head))
			break;
		req = list_entry(head->next, struct ceph_osd_request,
				 r_unsafe_item);
	} while (req->r_tid < last_tid);
out:
	spin_unlock(&ci->i_unsafe_lock);
}
1810 | 1810 | ||
/*
 * fsync/fdatasync: flush dirty page data, then flush dirty caps
 * (metadata) to the auth MDS and, for a full fsync, wait for the
 * cap flush to be acknowledged.  Takes inode->i_mutex.
 */
int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct inode *inode = file->f_mapping->host;
	struct ceph_inode_info *ci = ceph_inode(inode);
	unsigned flush_tid;
	int ret;
	int dirty;

	dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
	/* wait for in-flight "unsafe" (not-yet-committed) OSD writes */
	sync_write_wait(inode);

	/* write back dirty page cache data before taking i_mutex */
	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (ret < 0)
		return ret;
	mutex_lock(&inode->i_mutex);

	/* flush_tid identifies this cap flush for caps_are_flushed() below */
	dirty = try_flush_caps(inode, NULL, &flush_tid);
	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));

	/*
	 * only wait on non-file metadata writeback (the mds
	 * can recover size and mtime, so we don't need to
	 * wait for that)
	 */
	if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
		dout("fsync waiting for flush_tid %u\n", flush_tid);
		ret = wait_event_interruptible(ci->i_cap_wq,
				       caps_are_flushed(inode, flush_tid));
	}

	dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
	mutex_unlock(&inode->i_mutex);
	return ret;
}
1845 | 1845 | ||
1846 | /* | 1846 | /* |
1847 | * Flush any dirty caps back to the mds. If we aren't asked to wait, | 1847 | * Flush any dirty caps back to the mds. If we aren't asked to wait, |
1848 | * queue inode for flush but don't do so immediately, because we can | 1848 | * queue inode for flush but don't do so immediately, because we can |
1849 | * get by with fewer MDS messages if we wait for data writeback to | 1849 | * get by with fewer MDS messages if we wait for data writeback to |
1850 | * complete first. | 1850 | * complete first. |
1851 | */ | 1851 | */ |
1852 | int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) | 1852 | int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) |
1853 | { | 1853 | { |
1854 | struct ceph_inode_info *ci = ceph_inode(inode); | 1854 | struct ceph_inode_info *ci = ceph_inode(inode); |
1855 | unsigned flush_tid; | 1855 | unsigned flush_tid; |
1856 | int err = 0; | 1856 | int err = 0; |
1857 | int dirty; | 1857 | int dirty; |
1858 | int wait = wbc->sync_mode == WB_SYNC_ALL; | 1858 | int wait = wbc->sync_mode == WB_SYNC_ALL; |
1859 | 1859 | ||
1860 | dout("write_inode %p wait=%d\n", inode, wait); | 1860 | dout("write_inode %p wait=%d\n", inode, wait); |
1861 | if (wait) { | 1861 | if (wait) { |
1862 | dirty = try_flush_caps(inode, NULL, &flush_tid); | 1862 | dirty = try_flush_caps(inode, NULL, &flush_tid); |
1863 | if (dirty) | 1863 | if (dirty) |
1864 | err = wait_event_interruptible(ci->i_cap_wq, | 1864 | err = wait_event_interruptible(ci->i_cap_wq, |
1865 | caps_are_flushed(inode, flush_tid)); | 1865 | caps_are_flushed(inode, flush_tid)); |
1866 | } else { | 1866 | } else { |
1867 | struct ceph_mds_client *mdsc = | 1867 | struct ceph_mds_client *mdsc = |
1868 | ceph_sb_to_client(inode->i_sb)->mdsc; | 1868 | ceph_sb_to_client(inode->i_sb)->mdsc; |
1869 | 1869 | ||
1870 | spin_lock(&ci->i_ceph_lock); | 1870 | spin_lock(&ci->i_ceph_lock); |
1871 | if (__ceph_caps_dirty(ci)) | 1871 | if (__ceph_caps_dirty(ci)) |
1872 | __cap_delay_requeue_front(mdsc, ci); | 1872 | __cap_delay_requeue_front(mdsc, ci); |
1873 | spin_unlock(&ci->i_ceph_lock); | 1873 | spin_unlock(&ci->i_ceph_lock); |
1874 | } | 1874 | } |
1875 | return err; | 1875 | return err; |
1876 | } | 1876 | } |
1877 | 1877 | ||
/*
 * After a recovering MDS goes active, we need to resend any caps
 * we were flushing.
 *
 * Caller holds session->s_mutex.
 */
static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
				   struct ceph_mds_session *session)
{
	struct ceph_cap_snap *capsnap;

	dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
	list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
			    flushing_item) {
		struct ceph_inode_info *ci = capsnap->ci;
		struct inode *inode = &ci->vfs_inode;
		struct ceph_cap *cap;

		spin_lock(&ci->i_ceph_lock);
		cap = ci->i_auth_cap;
		if (cap && cap->session == session) {
			dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
			     cap, capsnap);
			__ceph_flush_snaps(ci, &session, 1);
		} else {
			/*
			 * auth cap on some other MDS while this capsnap is
			 * still on our flushing list -- unexpected
			 */
			pr_err("%p auth cap %p not mds%d ???\n", inode,
			       cap, session->s_mds);
		}
		spin_unlock(&ci->i_ceph_lock);
	}
}
1909 | 1909 | ||
/*
 * Resend all in-progress flushes (both cap snaps and regular cap
 * flushes) for a session, e.g. after the MDS recovers.
 *
 * Caller holds session->s_mutex (see kick_flushing_capsnaps).
 */
void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
			     struct ceph_mds_session *session)
{
	struct ceph_inode_info *ci;

	kick_flushing_capsnaps(mdsc, session);

	dout("kick_flushing_caps mds%d\n", session->s_mds);
	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
		struct inode *inode = &ci->vfs_inode;
		struct ceph_cap *cap;
		int delayed = 0;

		spin_lock(&ci->i_ceph_lock);
		cap = ci->i_auth_cap;
		if (cap && cap->session == session) {
			dout("kick_flushing_caps %p cap %p %s\n", inode,
			     cap, ceph_cap_string(ci->i_flushing_caps));
			/* __send_cap drops i_ceph_lock */
			delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
					     __ceph_caps_used(ci),
					     __ceph_caps_wanted(ci),
					     cap->issued | cap->implemented,
					     ci->i_flushing_caps, NULL);
			if (delayed) {
				/* send was delayed; retake lock to requeue */
				spin_lock(&ci->i_ceph_lock);
				__cap_delay_requeue(mdsc, ci);
				spin_unlock(&ci->i_ceph_lock);
			}
		} else {
			pr_err("%p auth cap %p not mds%d ???\n", inode,
			       cap, session->s_mds);
			spin_unlock(&ci->i_ceph_lock);
		}
	}
}
1945 | 1945 | ||
/*
 * Resend any in-progress snap and cap flushes for a single inode to
 * @session (used on MDS recovery, like ceph_kick_flushing_caps but
 * scoped to one inode).
 */
static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
				     struct ceph_mds_session *session,
				     struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_cap *cap;
	int delayed = 0;

	spin_lock(&ci->i_ceph_lock);
	cap = ci->i_auth_cap;
	dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
	     ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
	/* push any pending snap flushes first */
	__ceph_flush_snaps(ci, &session, 1);
	if (ci->i_flushing_caps) {
		/*
		 * NOTE(review): assumes i_auth_cap is non-NULL whenever
		 * i_flushing_caps is set -- cap is dereferenced in
		 * __send_cap without a NULL check; confirm that invariant.
		 */
		/* __send_cap drops i_ceph_lock */
		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
				     __ceph_caps_used(ci),
				     __ceph_caps_wanted(ci),
				     cap->issued | cap->implemented,
				     ci->i_flushing_caps, NULL);
		if (delayed) {
			/* send was delayed; retake lock to requeue */
			spin_lock(&ci->i_ceph_lock);
			__cap_delay_requeue(mdsc, ci);
			spin_unlock(&ci->i_ceph_lock);
		}
	} else {
		spin_unlock(&ci->i_ceph_lock);
	}
}
1974 | 1974 | ||
1975 | 1975 | ||
1976 | /* | 1976 | /* |
1977 | * Take references to capabilities we hold, so that we don't release | 1977 | * Take references to capabilities we hold, so that we don't release |
1978 | * them to the MDS prematurely. | 1978 | * them to the MDS prematurely. |
1979 | * | 1979 | * |
1980 | * Protected by i_ceph_lock. | 1980 | * Protected by i_ceph_lock. |
1981 | */ | 1981 | */ |
1982 | static void __take_cap_refs(struct ceph_inode_info *ci, int got) | 1982 | static void __take_cap_refs(struct ceph_inode_info *ci, int got) |
1983 | { | 1983 | { |
1984 | if (got & CEPH_CAP_PIN) | 1984 | if (got & CEPH_CAP_PIN) |
1985 | ci->i_pin_ref++; | 1985 | ci->i_pin_ref++; |
1986 | if (got & CEPH_CAP_FILE_RD) | 1986 | if (got & CEPH_CAP_FILE_RD) |
1987 | ci->i_rd_ref++; | 1987 | ci->i_rd_ref++; |
1988 | if (got & CEPH_CAP_FILE_CACHE) | 1988 | if (got & CEPH_CAP_FILE_CACHE) |
1989 | ci->i_rdcache_ref++; | 1989 | ci->i_rdcache_ref++; |
1990 | if (got & CEPH_CAP_FILE_WR) | 1990 | if (got & CEPH_CAP_FILE_WR) |
1991 | ci->i_wr_ref++; | 1991 | ci->i_wr_ref++; |
1992 | if (got & CEPH_CAP_FILE_BUFFER) { | 1992 | if (got & CEPH_CAP_FILE_BUFFER) { |
1993 | if (ci->i_wb_ref == 0) | 1993 | if (ci->i_wb_ref == 0) |
1994 | ihold(&ci->vfs_inode); | 1994 | ihold(&ci->vfs_inode); |
1995 | ci->i_wb_ref++; | 1995 | ci->i_wb_ref++; |
1996 | dout("__take_cap_refs %p wb %d -> %d (?)\n", | 1996 | dout("__take_cap_refs %p wb %d -> %d (?)\n", |
1997 | &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref); | 1997 | &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref); |
1998 | } | 1998 | } |
1999 | } | 1999 | } |
2000 | 2000 | ||
/*
 * Try to grab cap references.  Specify those refs we @want, and the
 * minimal set we @need.  Also include the larger offset we are writing
 * to (when applicable), and check against max_size here as well.
 * Note that caller is responsible for ensuring max_size increases are
 * requested from the MDS.
 *
 * Returns nonzero when the wait is over: either the refs were taken
 * (*@got filled in), or *@err is set (-EBADF), or *@check_max asks the
 * caller to request a larger max_size and retry.  Returns 0 when the
 * caller should keep waiting.
 */
static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
			    int *got, loff_t endoff, int *check_max, int *err)
{
	struct inode *inode = &ci->vfs_inode;
	int ret = 0;
	int have, implemented;
	int file_wanted;

	dout("get_cap_refs %p need %s want %s\n", inode,
	     ceph_cap_string(need), ceph_cap_string(want));
	spin_lock(&ci->i_ceph_lock);

	/* make sure file is actually open */
	file_wanted = __ceph_caps_file_wanted(ci);
	if ((file_wanted & need) == 0) {
		dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
		     ceph_cap_string(need), ceph_cap_string(file_wanted));
		*err = -EBADF;
		ret = 1;
		goto out;
	}

	if (need & CEPH_CAP_FILE_WR) {
		if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
			/* write past max_size: caller must ask MDS for more */
			dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
			     inode, endoff, ci->i_max_size);
			if (endoff > ci->i_wanted_max_size) {
				*check_max = 1;
				ret = 1;
			}
			goto out;
		}
		/*
		 * If a sync write is in progress, we must wait, so that we
		 * can get a final snapshot value for size+mtime.
		 */
		if (__ceph_have_pending_cap_snap(ci)) {
			dout("get_cap_refs %p cap_snap_pending\n", inode);
			goto out;
		}
	}
	have = __ceph_caps_issued(ci, &implemented);

	/*
	 * disallow writes while a truncate is pending
	 */
	if (ci->i_truncate_pending)
		have &= ~CEPH_CAP_FILE_WR;

	if ((have & need) == need) {
		/*
		 * Look at (implemented & ~have & not) so that we keep waiting
		 * on transition from wanted -> needed caps.  This is needed
		 * for WRBUFFER|WR -> WR to avoid a new WR sync write from
		 * going before a prior buffered writeback happens.
		 */
		int not = want & ~(have & need);
		int revoking = implemented & ~have;
		dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
		     inode, ceph_cap_string(have), ceph_cap_string(not),
		     ceph_cap_string(revoking));
		if ((revoking & not) == 0) {
			*got = need | (have & want);
			__take_cap_refs(ci, *got);
			ret = 1;
		}
	} else {
		dout("get_cap_refs %p have %s needed %s\n", inode,
		     ceph_cap_string(have), ceph_cap_string(need));
	}
out:
	spin_unlock(&ci->i_ceph_lock);
	dout("get_cap_refs %p ret %d got %s\n", inode,
	     ret, ceph_cap_string(*got));
	return ret;
}
2084 | 2084 | ||
2085 | /* | 2085 | /* |
2086 | * Check the offset we are writing up to against our current | 2086 | * Check the offset we are writing up to against our current |
2087 | * max_size. If necessary, tell the MDS we want to write to | 2087 | * max_size. If necessary, tell the MDS we want to write to |
2088 | * a larger offset. | 2088 | * a larger offset. |
2089 | */ | 2089 | */ |
2090 | static void check_max_size(struct inode *inode, loff_t endoff) | 2090 | static void check_max_size(struct inode *inode, loff_t endoff) |
2091 | { | 2091 | { |
2092 | struct ceph_inode_info *ci = ceph_inode(inode); | 2092 | struct ceph_inode_info *ci = ceph_inode(inode); |
2093 | int check = 0; | 2093 | int check = 0; |
2094 | 2094 | ||
2095 | /* do we need to explicitly request a larger max_size? */ | 2095 | /* do we need to explicitly request a larger max_size? */ |
2096 | spin_lock(&ci->i_ceph_lock); | 2096 | spin_lock(&ci->i_ceph_lock); |
2097 | if ((endoff >= ci->i_max_size || | 2097 | if ((endoff >= ci->i_max_size || |
2098 | endoff > (inode->i_size << 1)) && | 2098 | endoff > (inode->i_size << 1)) && |
2099 | endoff > ci->i_wanted_max_size) { | 2099 | endoff > ci->i_wanted_max_size) { |
2100 | dout("write %p at large endoff %llu, req max_size\n", | 2100 | dout("write %p at large endoff %llu, req max_size\n", |
2101 | inode, endoff); | 2101 | inode, endoff); |
2102 | ci->i_wanted_max_size = endoff; | 2102 | ci->i_wanted_max_size = endoff; |
2103 | check = 1; | 2103 | check = 1; |
2104 | } | 2104 | } |
2105 | spin_unlock(&ci->i_ceph_lock); | 2105 | spin_unlock(&ci->i_ceph_lock); |
2106 | if (check) | 2106 | if (check) |
2107 | ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); | 2107 | ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); |
2108 | } | 2108 | } |
2109 | 2109 | ||
2110 | /* | 2110 | /* |
2111 | * Wait for caps, and take cap references. If we can't get a WR cap | 2111 | * Wait for caps, and take cap references. If we can't get a WR cap |
2112 | * due to a small max_size, make sure we check_max_size (and possibly | 2112 | * due to a small max_size, make sure we check_max_size (and possibly |
2113 | * ask the mds) so we don't get hung up indefinitely. | 2113 | * ask the mds) so we don't get hung up indefinitely. |
2114 | */ | 2114 | */ |
2115 | int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got, | 2115 | int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got, |
2116 | loff_t endoff) | 2116 | loff_t endoff) |
2117 | { | 2117 | { |
2118 | int check_max, ret, err; | 2118 | int check_max, ret, err; |
2119 | 2119 | ||
2120 | retry: | 2120 | retry: |
2121 | if (endoff > 0) | 2121 | if (endoff > 0) |
2122 | check_max_size(&ci->vfs_inode, endoff); | 2122 | check_max_size(&ci->vfs_inode, endoff); |
2123 | check_max = 0; | 2123 | check_max = 0; |
2124 | err = 0; | 2124 | err = 0; |
2125 | ret = wait_event_interruptible(ci->i_cap_wq, | 2125 | ret = wait_event_interruptible(ci->i_cap_wq, |
2126 | try_get_cap_refs(ci, need, want, | 2126 | try_get_cap_refs(ci, need, want, |
2127 | got, endoff, | 2127 | got, endoff, |
2128 | &check_max, &err)); | 2128 | &check_max, &err)); |
2129 | if (err) | 2129 | if (err) |
2130 | ret = err; | 2130 | ret = err; |
2131 | if (check_max) | 2131 | if (check_max) |
2132 | goto retry; | 2132 | goto retry; |
2133 | return ret; | 2133 | return ret; |
2134 | } | 2134 | } |
2135 | 2135 | ||
/*
 * Take cap refs.  Caller must already know we hold at least one ref
 * on the caps in question or we don't know this is safe.
 */
void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
{
	/* i_ceph_lock protects the per-cap reference counters that
	 * __take_cap_refs() bumps */
	spin_lock(&ci->i_ceph_lock);
	__take_cap_refs(ci, caps);
	spin_unlock(&ci->i_ceph_lock);
}
2146 | 2146 | ||
/*
 * Release cap refs.
 *
 * If we released the last ref on any given cap, call ceph_check_caps
 * to release (or schedule a release).
 *
 * If we are releasing a WR cap (from a sync write), finalize any affected
 * cap_snap, and wake up any waiters.
 */
void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
{
	struct inode *inode = &ci->vfs_inode;
	int last = 0, put = 0, flushsnaps = 0, wake = 0;
	struct ceph_cap_snap *capsnap;

	/* drop the relevant per-cap counters; record what follow-up work
	 * is needed so it can run after the lock is released */
	spin_lock(&ci->i_ceph_lock);
	if (had & CEPH_CAP_PIN)
		--ci->i_pin_ref;
	if (had & CEPH_CAP_FILE_RD)
		if (--ci->i_rd_ref == 0)
			last++;
	if (had & CEPH_CAP_FILE_CACHE)
		if (--ci->i_rdcache_ref == 0)
			last++;
	if (had & CEPH_CAP_FILE_BUFFER) {
		if (--ci->i_wb_ref == 0) {
			last++;
			put++;	/* also drop an inode reference (iput below) */
		}
		dout("put_cap_refs %p wb %d -> %d (?)\n",
		     inode, ci->i_wb_ref+1, ci->i_wb_ref);
	}
	if (had & CEPH_CAP_FILE_WR)
		if (--ci->i_wr_ref == 0) {
			last++;
			/* the last WR ref gone may let the oldest cap_snap
			 * be finalized for flushing to the MDS */
			if (!list_empty(&ci->i_cap_snaps)) {
				capsnap = list_first_entry(&ci->i_cap_snaps,
						     struct ceph_cap_snap,
						     ci_item);
				if (capsnap->writing) {
					capsnap->writing = 0;
					flushsnaps =
						__ceph_finish_cap_snap(ci,
								       capsnap);
					wake = 1;
				}
			}
		}
	spin_unlock(&ci->i_ceph_lock);

	dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
	     last ? " last" : "", put ? " put" : "");

	/* follow-up work, now that i_ceph_lock is dropped */
	if (last && !flushsnaps)
		ceph_check_caps(ci, 0, NULL);
	else if (flushsnaps)
		ceph_flush_snaps(ci);
	if (wake)
		wake_up_all(&ci->i_cap_wq);
	if (put)
		iput(inode);
}
2209 | 2209 | ||
/*
 * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
 * context.  Adjust per-snap dirty page accounting as appropriate.
 * Once all dirty data for a cap_snap is flushed, flush snapped file
 * metadata back to the MDS.  If we dropped the last ref, call
 * ceph_check_caps.
 */
void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
				struct ceph_snap_context *snapc)
{
	struct inode *inode = &ci->vfs_inode;
	int last = 0;			/* dropped the last wrbuffer ref */
	int complete_capsnap = 0;	/* capsnap has no dirty pages left */
	int drop_capsnap = 0;		/* capsnap removed; iput at the end */
	int found = 0;
	struct ceph_cap_snap *capsnap = NULL;

	spin_lock(&ci->i_ceph_lock);
	ci->i_wrbuffer_ref -= nr;
	last = !ci->i_wrbuffer_ref;

	if (ci->i_head_snapc == snapc) {
		/* refs were held against the "head" (live) snap context */
		ci->i_wrbuffer_ref_head -= nr;
		if (ci->i_wrbuffer_ref_head == 0 &&
		    ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) {
			/* nothing dirty or flushing remains; release the
			 * head snap context reference */
			BUG_ON(!ci->i_head_snapc);
			ceph_put_snap_context(ci->i_head_snapc);
			ci->i_head_snapc = NULL;
		}
		dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
		     inode,
		     ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
		     ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
		     last ? " LAST" : "");
	} else {
		/* otherwise the refs must belong to one of our cap_snaps */
		list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
			if (capsnap->context == snapc) {
				found = 1;
				break;
			}
		}
		BUG_ON(!found);
		capsnap->dirty_pages -= nr;
		if (capsnap->dirty_pages == 0) {
			complete_capsnap = 1;
			if (capsnap->dirty == 0)
				/* cap writeback completed before we created
				 * the cap_snap; no FLUSHSNAP is needed */
				drop_capsnap = 1;
		}
		dout("put_wrbuffer_cap_refs on %p cap_snap %p "
		     " snap %lld %d/%d -> %d/%d %s%s%s\n",
		     inode, capsnap, capsnap->context->seq,
		     ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
		     ci->i_wrbuffer_ref, capsnap->dirty_pages,
		     last ? " (wrbuffer last)" : "",
		     complete_capsnap ? " (complete capsnap)" : "",
		     drop_capsnap ? " (drop capsnap)" : "");
		if (drop_capsnap) {
			/* unlink and release the now-unneeded cap_snap */
			ceph_put_snap_context(capsnap->context);
			list_del(&capsnap->ci_item);
			list_del(&capsnap->flushing_item);
			ceph_put_cap_snap(capsnap);
		}
	}

	spin_unlock(&ci->i_ceph_lock);

	/* follow-up work outside of i_ceph_lock */
	if (last) {
		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
		iput(inode);
	} else if (complete_capsnap) {
		ceph_flush_snaps(ci);
		wake_up_all(&ci->i_cap_wq);
	}
	if (drop_capsnap)
		iput(inode);
}
2288 | 2288 | ||
/*
 * Handle a cap GRANT message from the MDS.  (Note that a GRANT may
 * actually be a revocation if it specifies a smaller cap set.)
 *
 * caller holds s_mutex and i_ceph_lock, we drop both.
 *
 * NOTE(review): the old "return value: 0/1/2" text was stale -- this
 * function is void; the 0/1/2 distinction now lives in the local
 * check_caps flag (1 = check auth cap only for writeback, 2 = check
 * all caps to ack the revocation) and the corresponding
 * ceph_check_caps() call is made here before returning.
 */
static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
			     struct ceph_mds_session *session,
			     struct ceph_cap *cap,
			     struct ceph_buffer *xattr_buf)
	__releases(ci->i_ceph_lock)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int mds = session->s_mds;
	int seq = le32_to_cpu(grant->seq);
	int newcaps = le32_to_cpu(grant->caps);
	int issued, implemented, used, wanted, dirty;
	u64 size = le64_to_cpu(grant->size);
	u64 max_size = le64_to_cpu(grant->max_size);
	struct timespec mtime, atime, ctime;
	int check_caps = 0;		/* 1 = auth cap only, 2 = all caps */
	int wake = 0;
	int writeback = 0;
	int revoked_rdcache = 0;
	int queue_invalidate = 0;

	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
	     inode, cap, mds, seq, ceph_cap_string(newcaps));
	dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
	     inode->i_size);

	/*
	 * If CACHE is being revoked, and we have no dirty buffers,
	 * try to invalidate (once).  (If there are dirty buffers, we
	 * will invalidate _after_ writeback.)
	 */
	if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
	    (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
	    !ci->i_wrbuffer_ref) {
		if (try_nonblocking_invalidate(inode) == 0) {
			revoked_rdcache = 1;
		} else {
			/* there were locked pages.. invalidate later
			   in a separate thread. */
			if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
				queue_invalidate = 1;
				ci->i_rdcache_revoking = ci->i_rdcache_gen;
			}
		}
	}

	/* side effects now are allowed */

	issued = __ceph_caps_issued(ci, &implemented);
	issued |= implemented | __ceph_caps_dirty(ci);

	cap->cap_gen = session->s_cap_gen;

	__check_cap_issue(ci, cap, newcaps);

	/* only trust the MDS's copy of fields we don't hold EXCL for */
	if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
		inode->i_mode = le32_to_cpu(grant->mode);
		inode->i_uid = le32_to_cpu(grant->uid);
		inode->i_gid = le32_to_cpu(grant->gid);
		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
		     inode->i_uid, inode->i_gid);
	}

	if ((issued & CEPH_CAP_LINK_EXCL) == 0)
		set_nlink(inode, le32_to_cpu(grant->nlink));

	if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
		int len = le32_to_cpu(grant->xattr_len);
		u64 version = le64_to_cpu(grant->xattr_version);

		/* swap in the newer xattr blob from the MDS */
		if (version > ci->i_xattrs.version) {
			dout(" got new xattrs v%llu on %p len %d\n",
			     version, inode, len);
			if (ci->i_xattrs.blob)
				ceph_buffer_put(ci->i_xattrs.blob);
			ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
			ci->i_xattrs.version = version;
		}
	}

	/* size/ctime/mtime/atime? */
	ceph_fill_file_size(inode, issued,
			    le32_to_cpu(grant->truncate_seq),
			    le64_to_cpu(grant->truncate_size), size);
	ceph_decode_timespec(&mtime, &grant->mtime);
	ceph_decode_timespec(&atime, &grant->atime);
	ceph_decode_timespec(&ctime, &grant->ctime);
	ceph_fill_file_time(inode, issued,
			    le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
			    &atime);

	/* max size increase? */
	if (max_size != ci->i_max_size) {
		dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
		ci->i_max_size = max_size;
		if (max_size >= ci->i_wanted_max_size) {
			ci->i_wanted_max_size = 0;  /* reset */
			ci->i_requested_max_size = 0;
		}
		wake = 1;
	}

	/* check cap bits */
	wanted = __ceph_caps_wanted(ci);
	used = __ceph_caps_used(ci);
	dirty = __ceph_caps_dirty(ci);
	dout(" my wanted = %s, used = %s, dirty %s\n",
	     ceph_cap_string(wanted),
	     ceph_cap_string(used),
	     ceph_cap_string(dirty));
	if (wanted != le32_to_cpu(grant->wanted)) {
		dout("mds wanted %s -> %s\n",
		     ceph_cap_string(le32_to_cpu(grant->wanted)),
		     ceph_cap_string(wanted));
		grant->wanted = cpu_to_le32(wanted);
	}

	cap->seq = seq;

	/* file layout may have changed */
	ci->i_layout = grant->layout;

	/* revocation, grant, or no-op? */
	if (cap->issued & ~newcaps) {
		int revoking = cap->issued & ~newcaps;

		dout("revocation: %s -> %s (revoking %s)\n",
		     ceph_cap_string(cap->issued),
		     ceph_cap_string(newcaps),
		     ceph_cap_string(revoking));
		if (revoking & used & CEPH_CAP_FILE_BUFFER)
			writeback = 1; /* initiate writeback; will delay ack */
		else if (revoking == CEPH_CAP_FILE_CACHE &&
			 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
			 queue_invalidate)
			; /* do nothing yet, invalidation will be queued */
		else if (cap == ci->i_auth_cap)
			check_caps = 1; /* check auth cap only */
		else
			check_caps = 2; /* check all caps */
		cap->issued = newcaps;
		cap->implemented |= newcaps;
	} else if (cap->issued == newcaps) {
		dout("caps unchanged: %s -> %s\n",
		     ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
	} else {
		dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
		     ceph_cap_string(newcaps));
		cap->issued = newcaps;
		cap->implemented |= newcaps; /* add bits only, to
					      * avoid stepping on a
					      * pending revocation */
		wake = 1;
	}
	BUG_ON(cap->issued & ~cap->implemented);

	spin_unlock(&ci->i_ceph_lock);
	if (writeback)
		/*
		 * queue inode for writeback: we can't actually call
		 * filemap_write_and_wait, etc. from message handler
		 * context.
		 */
		ceph_queue_writeback(inode);
	if (queue_invalidate)
		ceph_queue_invalidate(inode);
	if (wake)
		wake_up_all(&ci->i_cap_wq);

	/* NOTE(review): on the check_caps paths s_mutex is presumably
	 * released by ceph_check_caps() via the session argument --
	 * confirm against ceph_check_caps() before relying on this */
	if (check_caps == 1)
		ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
				session);
	else if (check_caps == 2)
		ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
	else
		mutex_unlock(&session->s_mutex);
}
2476 | 2476 | ||
/*
 * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
 * MDS has been safely committed.
 */
static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
				 struct ceph_mds_caps *m,
				 struct ceph_mds_session *session,
				 struct ceph_cap *cap)
	__releases(ci->i_ceph_lock)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
	unsigned seq = le32_to_cpu(m->seq);
	int dirty = le32_to_cpu(m->dirty);
	int cleaned = 0;	/* cap bits this ack actually cleans */
	int drop = 0;		/* inode became clean; drop a reference */
	int i;

	/* only count a bit as cleaned if the ack's tid matches the tid
	 * of the flush we sent for that bit */
	for (i = 0; i < CEPH_CAP_BITS; i++)
		if ((dirty & (1 << i)) &&
		    flush_tid == ci->i_cap_flush_tid[i])
			cleaned |= 1 << i;

	dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
	     " flushing %s -> %s\n",
	     inode, session->s_mds, seq, ceph_cap_string(dirty),
	     ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
	     ceph_cap_string(ci->i_flushing_caps & ~cleaned));

	/* nothing new got cleaned; nothing to do */
	if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
		goto out;

	ci->i_flushing_caps &= ~cleaned;

	spin_lock(&mdsc->cap_dirty_lock);
	if (ci->i_flushing_caps == 0) {
		/* inode is no longer flushing anything */
		list_del_init(&ci->i_flushing_item);
		if (!list_empty(&session->s_cap_flushing))
			dout(" mds%d still flushing cap on %p\n",
			     session->s_mds,
			     &list_entry(session->s_cap_flushing.next,
					 struct ceph_inode_info,
					 i_flushing_item)->vfs_inode);
		mdsc->num_cap_flushing--;
		wake_up_all(&mdsc->cap_flushing_wq);
		dout(" inode %p now !flushing\n", inode);

		if (ci->i_dirty_caps == 0) {
			dout(" inode %p now clean\n", inode);
			BUG_ON(!list_empty(&ci->i_dirty_item));
			drop = 1;
			if (ci->i_wrbuffer_ref_head == 0) {
				/* no dirty pages either; release the head
				 * snap context reference */
				BUG_ON(!ci->i_head_snapc);
				ceph_put_snap_context(ci->i_head_snapc);
				ci->i_head_snapc = NULL;
			}
		} else {
			BUG_ON(list_empty(&ci->i_dirty_item));
		}
	}
	spin_unlock(&mdsc->cap_dirty_lock);
	wake_up_all(&ci->i_cap_wq);

out:
	spin_unlock(&ci->i_ceph_lock);
	if (drop)
		iput(inode);
}
2545 | 2545 | ||
/*
 * Handle FLUSHSNAP_ACK.  MDS has flushed snap data to disk and we can
 * throw away our cap_snap.
 *
 * Caller hold s_mutex.
 */
static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
				     struct ceph_mds_caps *m,
				     struct ceph_mds_session *session)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	u64 follows = le64_to_cpu(m->snap_follows);
	struct ceph_cap_snap *capsnap;
	int drop = 0;	/* a cap_snap was removed; iput after unlock */

	dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
	     inode, ci, session->s_mds, follows);

	/* find the cap_snap matching this ack's follows value; only
	 * drop it if the flush tid also matches the one we sent */
	spin_lock(&ci->i_ceph_lock);
	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
		if (capsnap->follows == follows) {
			if (capsnap->flush_tid != flush_tid) {
				dout(" cap_snap %p follows %lld tid %lld !="
				     " %lld\n", capsnap, follows,
				     flush_tid, capsnap->flush_tid);
				break;
			}
			WARN_ON(capsnap->dirty_pages || capsnap->writing);
			dout(" removing %p cap_snap %p follows %lld\n",
			     inode, capsnap, follows);
			ceph_put_snap_context(capsnap->context);
			list_del(&capsnap->ci_item);
			list_del(&capsnap->flushing_item);
			ceph_put_cap_snap(capsnap);
			drop = 1;
			break;
		} else {
			dout(" skipping cap_snap %p follows %lld\n",
			     capsnap, capsnap->follows);
		}
	}
	spin_unlock(&ci->i_ceph_lock);
	if (drop)
		iput(inode);
}
2591 | 2591 | ||
2592 | /* | 2592 | /* |
2593 | * Handle TRUNC from MDS, indicating file truncation. | 2593 | * Handle TRUNC from MDS, indicating file truncation. |
2594 | * | 2594 | * |
2595 | * caller hold s_mutex. | 2595 | * caller hold s_mutex. |
2596 | */ | 2596 | */ |
2597 | static void handle_cap_trunc(struct inode *inode, | 2597 | static void handle_cap_trunc(struct inode *inode, |
2598 | struct ceph_mds_caps *trunc, | 2598 | struct ceph_mds_caps *trunc, |
2599 | struct ceph_mds_session *session) | 2599 | struct ceph_mds_session *session) |
2600 | __releases(ci->i_ceph_lock) | 2600 | __releases(ci->i_ceph_lock) |
2601 | { | 2601 | { |
2602 | struct ceph_inode_info *ci = ceph_inode(inode); | 2602 | struct ceph_inode_info *ci = ceph_inode(inode); |
2603 | int mds = session->s_mds; | 2603 | int mds = session->s_mds; |
2604 | int seq = le32_to_cpu(trunc->seq); | 2604 | int seq = le32_to_cpu(trunc->seq); |
2605 | u32 truncate_seq = le32_to_cpu(trunc->truncate_seq); | 2605 | u32 truncate_seq = le32_to_cpu(trunc->truncate_seq); |
2606 | u64 truncate_size = le64_to_cpu(trunc->truncate_size); | 2606 | u64 truncate_size = le64_to_cpu(trunc->truncate_size); |
2607 | u64 size = le64_to_cpu(trunc->size); | 2607 | u64 size = le64_to_cpu(trunc->size); |
2608 | int implemented = 0; | 2608 | int implemented = 0; |
2609 | int dirty = __ceph_caps_dirty(ci); | 2609 | int dirty = __ceph_caps_dirty(ci); |
2610 | int issued = __ceph_caps_issued(ceph_inode(inode), &implemented); | 2610 | int issued = __ceph_caps_issued(ceph_inode(inode), &implemented); |
2611 | int queue_trunc = 0; | 2611 | int queue_trunc = 0; |
2612 | 2612 | ||
2613 | issued |= implemented | dirty; | 2613 | issued |= implemented | dirty; |
2614 | 2614 | ||
2615 | dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n", | 2615 | dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n", |
2616 | inode, mds, seq, truncate_size, truncate_seq); | 2616 | inode, mds, seq, truncate_size, truncate_seq); |
2617 | queue_trunc = ceph_fill_file_size(inode, issued, | 2617 | queue_trunc = ceph_fill_file_size(inode, issued, |
2618 | truncate_seq, truncate_size, size); | 2618 | truncate_seq, truncate_size, size); |
2619 | spin_unlock(&ci->i_ceph_lock); | 2619 | spin_unlock(&ci->i_ceph_lock); |
2620 | 2620 | ||
2621 | if (queue_trunc) | 2621 | if (queue_trunc) |
2622 | ceph_queue_vmtruncate(inode); | 2622 | ceph_queue_vmtruncate(inode); |
2623 | } | 2623 | } |
2624 | 2624 | ||
/*
 * Handle EXPORT from MDS.  Cap is being migrated _from_ this mds to a
 * different one.  If we are the most recent migration we've seen (as
 * indicated by mseq), make note of the migrating cap bits for the
 * duration (until we see the corresponding IMPORT).
 *
 * caller holds s_mutex
 */
static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
			      struct ceph_mds_session *session,
			      int *open_target_sessions)
{
	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
	struct ceph_inode_info *ci = ceph_inode(inode);
	int mds = session->s_mds;
	unsigned mseq = le32_to_cpu(ex->migrate_seq);
	struct ceph_cap *cap = NULL, *t;
	struct rb_node *p;
	int remember = 1;	/* cleared if a newer migration was already seen */

	dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
	     inode, ci, mds, mseq);

	spin_lock(&ci->i_ceph_lock);

	/* make sure we haven't seen a higher mseq */
	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		t = rb_entry(p, struct ceph_cap, ci_node);
		if (ceph_seq_cmp(t->mseq, mseq) > 0) {
			dout(" higher mseq on cap from mds%d\n",
			     t->session->s_mds);
			remember = 0;
		}
		/* remember the cap held from the exporting mds, if any */
		if (t->session->s_mds == mds)
			cap = t;
	}

	if (cap) {
		if (remember) {
			/* make note of the migrating bits until IMPORT arrives */
			ci->i_cap_exporting_mds = mds;
			ci->i_cap_exporting_mseq = mseq;
			ci->i_cap_exporting_issued = cap->issued;

			/*
			 * make sure we have open sessions with all possible
			 * export targets, so that we get the matching IMPORT
			 */
			*open_target_sessions = 1;

			/*
			 * we can't flush dirty caps that we've seen the
			 * EXPORT but no IMPORT for
			 */
			spin_lock(&mdsc->cap_dirty_lock);
			if (!list_empty(&ci->i_dirty_item)) {
				dout(" moving %p to cap_dirty_migrating\n",
				     inode);
				list_move(&ci->i_dirty_item,
					  &mdsc->cap_dirty_migrating);
			}
			spin_unlock(&mdsc->cap_dirty_lock);
		}
		/* in either case, the cap no longer belongs to this mds */
		__ceph_remove_cap(cap);
	}
	/* else, we already released it */

	spin_unlock(&ci->i_ceph_lock);
}
2694 | 2694 | ||
/*
 * Handle cap IMPORT.  If there are temp bits from an older EXPORT,
 * clean them up.
 *
 * caller holds s_mutex.
 */
static void handle_cap_import(struct ceph_mds_client *mdsc,
			      struct inode *inode, struct ceph_mds_caps *im,
			      struct ceph_mds_session *session,
			      void *snaptrace, int snaptrace_len)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int mds = session->s_mds;
	unsigned issued = le32_to_cpu(im->caps);
	unsigned wanted = le32_to_cpu(im->wanted);
	unsigned seq = le32_to_cpu(im->seq);
	unsigned mseq = le32_to_cpu(im->migrate_seq);
	u64 realmino = le64_to_cpu(im->realm);
	u64 cap_id = le64_to_cpu(im->cap_id);

	if (ci->i_cap_exporting_mds >= 0 &&
	    ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
		/* this IMPORT completes an earlier EXPORT; clear the note */
		dout("handle_cap_import inode %p ci %p mds%d mseq %d"
		     " - cleared exporting from mds%d\n",
		     inode, ci, mds, mseq,
		     ci->i_cap_exporting_mds);
		ci->i_cap_exporting_issued = 0;
		ci->i_cap_exporting_mseq = 0;
		ci->i_cap_exporting_mds = -1;

		/* migration is over: dirty caps may be flushed again */
		spin_lock(&mdsc->cap_dirty_lock);
		if (!list_empty(&ci->i_dirty_item)) {
			dout(" moving %p back to cap_dirty\n", inode);
			list_move(&ci->i_dirty_item, &mdsc->cap_dirty);
		}
		spin_unlock(&mdsc->cap_dirty_lock);
	} else {
		dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
		     inode, ci, mds, mseq);
	}

	/*
	 * update the snap trace with snap_rwsem held for write, then
	 * downgrade to read; ceph_add_cap and kick_flushing_inode_caps
	 * run under the read side (presumably for snap realm lookup —
	 * confirm against ceph_add_cap).
	 */
	down_write(&mdsc->snap_rwsem);
	ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
			       false);
	downgrade_write(&mdsc->snap_rwsem);
	ceph_add_cap(inode, session, cap_id, -1,
		     issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
		     NULL /* no caps context */);
	kick_flushing_inode_caps(mdsc, session, inode);
	up_read(&mdsc->snap_rwsem);

	/* make sure we re-request max_size, if necessary */
	spin_lock(&ci->i_ceph_lock);
	ci->i_requested_max_size = 0;
	spin_unlock(&ci->i_ceph_lock);
}
2751 | 2751 | ||
2752 | /* | 2752 | /* |
2753 | * Handle a caps message from the MDS. | 2753 | * Handle a caps message from the MDS. |
2754 | * | 2754 | * |
2755 | * Identify the appropriate session, inode, and call the right handler | 2755 | * Identify the appropriate session, inode, and call the right handler |
2756 | * based on the cap op. | 2756 | * based on the cap op. |
2757 | */ | 2757 | */ |
2758 | void ceph_handle_caps(struct ceph_mds_session *session, | 2758 | void ceph_handle_caps(struct ceph_mds_session *session, |
2759 | struct ceph_msg *msg) | 2759 | struct ceph_msg *msg) |
2760 | { | 2760 | { |
2761 | struct ceph_mds_client *mdsc = session->s_mdsc; | 2761 | struct ceph_mds_client *mdsc = session->s_mdsc; |
2762 | struct super_block *sb = mdsc->fsc->sb; | 2762 | struct super_block *sb = mdsc->fsc->sb; |
2763 | struct inode *inode; | 2763 | struct inode *inode; |
2764 | struct ceph_inode_info *ci; | 2764 | struct ceph_inode_info *ci; |
2765 | struct ceph_cap *cap; | 2765 | struct ceph_cap *cap; |
2766 | struct ceph_mds_caps *h; | 2766 | struct ceph_mds_caps *h; |
2767 | int mds = session->s_mds; | 2767 | int mds = session->s_mds; |
2768 | int op; | 2768 | int op; |
2769 | u32 seq, mseq; | 2769 | u32 seq, mseq; |
2770 | struct ceph_vino vino; | 2770 | struct ceph_vino vino; |
2771 | u64 cap_id; | 2771 | u64 cap_id; |
2772 | u64 size, max_size; | 2772 | u64 size, max_size; |
2773 | u64 tid; | 2773 | u64 tid; |
2774 | void *snaptrace; | 2774 | void *snaptrace; |
2775 | size_t snaptrace_len; | 2775 | size_t snaptrace_len; |
2776 | void *flock; | 2776 | void *flock; |
2777 | u32 flock_len; | 2777 | u32 flock_len; |
2778 | int open_target_sessions = 0; | 2778 | int open_target_sessions = 0; |
2779 | 2779 | ||
2780 | dout("handle_caps from mds%d\n", mds); | 2780 | dout("handle_caps from mds%d\n", mds); |
2781 | 2781 | ||
2782 | /* decode */ | 2782 | /* decode */ |
2783 | tid = le64_to_cpu(msg->hdr.tid); | 2783 | tid = le64_to_cpu(msg->hdr.tid); |
2784 | if (msg->front.iov_len < sizeof(*h)) | 2784 | if (msg->front.iov_len < sizeof(*h)) |
2785 | goto bad; | 2785 | goto bad; |
2786 | h = msg->front.iov_base; | 2786 | h = msg->front.iov_base; |
2787 | op = le32_to_cpu(h->op); | 2787 | op = le32_to_cpu(h->op); |
2788 | vino.ino = le64_to_cpu(h->ino); | 2788 | vino.ino = le64_to_cpu(h->ino); |
2789 | vino.snap = CEPH_NOSNAP; | 2789 | vino.snap = CEPH_NOSNAP; |
2790 | cap_id = le64_to_cpu(h->cap_id); | 2790 | cap_id = le64_to_cpu(h->cap_id); |
2791 | seq = le32_to_cpu(h->seq); | 2791 | seq = le32_to_cpu(h->seq); |
2792 | mseq = le32_to_cpu(h->migrate_seq); | 2792 | mseq = le32_to_cpu(h->migrate_seq); |
2793 | size = le64_to_cpu(h->size); | 2793 | size = le64_to_cpu(h->size); |
2794 | max_size = le64_to_cpu(h->max_size); | 2794 | max_size = le64_to_cpu(h->max_size); |
2795 | 2795 | ||
2796 | snaptrace = h + 1; | 2796 | snaptrace = h + 1; |
2797 | snaptrace_len = le32_to_cpu(h->snap_trace_len); | 2797 | snaptrace_len = le32_to_cpu(h->snap_trace_len); |
2798 | 2798 | ||
2799 | if (le16_to_cpu(msg->hdr.version) >= 2) { | 2799 | if (le16_to_cpu(msg->hdr.version) >= 2) { |
2800 | void *p, *end; | 2800 | void *p, *end; |
2801 | 2801 | ||
2802 | p = snaptrace + snaptrace_len; | 2802 | p = snaptrace + snaptrace_len; |
2803 | end = msg->front.iov_base + msg->front.iov_len; | 2803 | end = msg->front.iov_base + msg->front.iov_len; |
2804 | ceph_decode_32_safe(&p, end, flock_len, bad); | 2804 | ceph_decode_32_safe(&p, end, flock_len, bad); |
2805 | flock = p; | 2805 | flock = p; |
2806 | } else { | 2806 | } else { |
2807 | flock = NULL; | 2807 | flock = NULL; |
2808 | flock_len = 0; | 2808 | flock_len = 0; |
2809 | } | 2809 | } |
2810 | 2810 | ||
2811 | mutex_lock(&session->s_mutex); | 2811 | mutex_lock(&session->s_mutex); |
2812 | session->s_seq++; | 2812 | session->s_seq++; |
2813 | dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, | 2813 | dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, |
2814 | (unsigned)seq); | 2814 | (unsigned)seq); |
2815 | 2815 | ||
2816 | /* lookup ino */ | 2816 | /* lookup ino */ |
2817 | inode = ceph_find_inode(sb, vino); | 2817 | inode = ceph_find_inode(sb, vino); |
2818 | ci = ceph_inode(inode); | 2818 | ci = ceph_inode(inode); |
2819 | dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, | 2819 | dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, |
2820 | vino.snap, inode); | 2820 | vino.snap, inode); |
2821 | if (!inode) { | 2821 | if (!inode) { |
2822 | dout(" i don't have ino %llx\n", vino.ino); | 2822 | dout(" i don't have ino %llx\n", vino.ino); |
2823 | 2823 | ||
2824 | if (op == CEPH_CAP_OP_IMPORT) | 2824 | if (op == CEPH_CAP_OP_IMPORT) |
2825 | __queue_cap_release(session, vino.ino, cap_id, | 2825 | __queue_cap_release(session, vino.ino, cap_id, |
2826 | mseq, seq); | 2826 | mseq, seq); |
2827 | goto flush_cap_releases; | 2827 | goto flush_cap_releases; |
2828 | } | 2828 | } |
2829 | 2829 | ||
2830 | /* these will work even if we don't have a cap yet */ | 2830 | /* these will work even if we don't have a cap yet */ |
2831 | switch (op) { | 2831 | switch (op) { |
2832 | case CEPH_CAP_OP_FLUSHSNAP_ACK: | 2832 | case CEPH_CAP_OP_FLUSHSNAP_ACK: |
2833 | handle_cap_flushsnap_ack(inode, tid, h, session); | 2833 | handle_cap_flushsnap_ack(inode, tid, h, session); |
2834 | goto done; | 2834 | goto done; |
2835 | 2835 | ||
2836 | case CEPH_CAP_OP_EXPORT: | 2836 | case CEPH_CAP_OP_EXPORT: |
2837 | handle_cap_export(inode, h, session, &open_target_sessions); | 2837 | handle_cap_export(inode, h, session, &open_target_sessions); |
2838 | goto done; | 2838 | goto done; |
2839 | 2839 | ||
2840 | case CEPH_CAP_OP_IMPORT: | 2840 | case CEPH_CAP_OP_IMPORT: |
2841 | handle_cap_import(mdsc, inode, h, session, | 2841 | handle_cap_import(mdsc, inode, h, session, |
2842 | snaptrace, snaptrace_len); | 2842 | snaptrace, snaptrace_len); |
2843 | ceph_check_caps(ceph_inode(inode), 0, session); | 2843 | ceph_check_caps(ceph_inode(inode), 0, session); |
2844 | goto done_unlocked; | 2844 | goto done_unlocked; |
2845 | } | 2845 | } |
2846 | 2846 | ||
2847 | /* the rest require a cap */ | 2847 | /* the rest require a cap */ |
2848 | spin_lock(&ci->i_ceph_lock); | 2848 | spin_lock(&ci->i_ceph_lock); |
2849 | cap = __get_cap_for_mds(ceph_inode(inode), mds); | 2849 | cap = __get_cap_for_mds(ceph_inode(inode), mds); |
2850 | if (!cap) { | 2850 | if (!cap) { |
2851 | dout(" no cap on %p ino %llx.%llx from mds%d\n", | 2851 | dout(" no cap on %p ino %llx.%llx from mds%d\n", |
2852 | inode, ceph_ino(inode), ceph_snap(inode), mds); | 2852 | inode, ceph_ino(inode), ceph_snap(inode), mds); |
2853 | spin_unlock(&ci->i_ceph_lock); | 2853 | spin_unlock(&ci->i_ceph_lock); |
2854 | goto flush_cap_releases; | 2854 | goto flush_cap_releases; |
2855 | } | 2855 | } |
2856 | 2856 | ||
2857 | /* note that each of these drops i_ceph_lock for us */ | 2857 | /* note that each of these drops i_ceph_lock for us */ |
2858 | switch (op) { | 2858 | switch (op) { |
2859 | case CEPH_CAP_OP_REVOKE: | 2859 | case CEPH_CAP_OP_REVOKE: |
2860 | case CEPH_CAP_OP_GRANT: | 2860 | case CEPH_CAP_OP_GRANT: |
2861 | handle_cap_grant(inode, h, session, cap, msg->middle); | 2861 | handle_cap_grant(inode, h, session, cap, msg->middle); |
2862 | goto done_unlocked; | 2862 | goto done_unlocked; |
2863 | 2863 | ||
2864 | case CEPH_CAP_OP_FLUSH_ACK: | 2864 | case CEPH_CAP_OP_FLUSH_ACK: |
2865 | handle_cap_flush_ack(inode, tid, h, session, cap); | 2865 | handle_cap_flush_ack(inode, tid, h, session, cap); |
2866 | break; | 2866 | break; |
2867 | 2867 | ||
2868 | case CEPH_CAP_OP_TRUNC: | 2868 | case CEPH_CAP_OP_TRUNC: |
2869 | handle_cap_trunc(inode, h, session); | 2869 | handle_cap_trunc(inode, h, session); |
2870 | break; | 2870 | break; |
2871 | 2871 | ||
2872 | default: | 2872 | default: |
2873 | spin_unlock(&ci->i_ceph_lock); | 2873 | spin_unlock(&ci->i_ceph_lock); |
2874 | pr_err("ceph_handle_caps: unknown cap op %d %s\n", op, | 2874 | pr_err("ceph_handle_caps: unknown cap op %d %s\n", op, |
2875 | ceph_cap_op_name(op)); | 2875 | ceph_cap_op_name(op)); |
2876 | } | 2876 | } |
2877 | 2877 | ||
2878 | goto done; | 2878 | goto done; |
2879 | 2879 | ||
2880 | flush_cap_releases: | 2880 | flush_cap_releases: |
2881 | /* | 2881 | /* |
2882 | * send any full release message to try to move things | 2882 | * send any full release message to try to move things |
2883 | * along for the mds (who clearly thinks we still have this | 2883 | * along for the mds (who clearly thinks we still have this |
2884 | * cap). | 2884 | * cap). |
2885 | */ | 2885 | */ |
2886 | ceph_add_cap_releases(mdsc, session); | 2886 | ceph_add_cap_releases(mdsc, session); |
2887 | ceph_send_cap_releases(mdsc, session); | 2887 | ceph_send_cap_releases(mdsc, session); |
2888 | 2888 | ||
2889 | done: | 2889 | done: |
2890 | mutex_unlock(&session->s_mutex); | 2890 | mutex_unlock(&session->s_mutex); |
2891 | done_unlocked: | 2891 | done_unlocked: |
2892 | if (inode) | 2892 | if (inode) |
2893 | iput(inode); | 2893 | iput(inode); |
2894 | if (open_target_sessions) | 2894 | if (open_target_sessions) |
2895 | ceph_mdsc_open_export_target_sessions(mdsc, session); | 2895 | ceph_mdsc_open_export_target_sessions(mdsc, session); |
2896 | return; | 2896 | return; |
2897 | 2897 | ||
2898 | bad: | 2898 | bad: |
2899 | pr_err("ceph_handle_caps: corrupt message\n"); | 2899 | pr_err("ceph_handle_caps: corrupt message\n"); |
2900 | ceph_msg_dump(msg); | 2900 | ceph_msg_dump(msg); |
2901 | return; | 2901 | return; |
2902 | } | 2902 | } |
2903 | 2903 | ||
2904 | /* | 2904 | /* |
2905 | * Delayed work handler to process end of delayed cap release LRU list. | 2905 | * Delayed work handler to process end of delayed cap release LRU list. |
2906 | */ | 2906 | */ |
2907 | void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) | 2907 | void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) |
2908 | { | 2908 | { |
2909 | struct ceph_inode_info *ci; | 2909 | struct ceph_inode_info *ci; |
2910 | int flags = CHECK_CAPS_NODELAY; | 2910 | int flags = CHECK_CAPS_NODELAY; |
2911 | 2911 | ||
2912 | dout("check_delayed_caps\n"); | 2912 | dout("check_delayed_caps\n"); |
2913 | while (1) { | 2913 | while (1) { |
2914 | spin_lock(&mdsc->cap_delay_lock); | 2914 | spin_lock(&mdsc->cap_delay_lock); |
2915 | if (list_empty(&mdsc->cap_delay_list)) | 2915 | if (list_empty(&mdsc->cap_delay_list)) |
2916 | break; | 2916 | break; |
2917 | ci = list_first_entry(&mdsc->cap_delay_list, | 2917 | ci = list_first_entry(&mdsc->cap_delay_list, |
2918 | struct ceph_inode_info, | 2918 | struct ceph_inode_info, |
2919 | i_cap_delay_list); | 2919 | i_cap_delay_list); |
2920 | if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 && | 2920 | if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 && |
2921 | time_before(jiffies, ci->i_hold_caps_max)) | 2921 | time_before(jiffies, ci->i_hold_caps_max)) |
2922 | break; | 2922 | break; |
2923 | list_del_init(&ci->i_cap_delay_list); | 2923 | list_del_init(&ci->i_cap_delay_list); |
2924 | spin_unlock(&mdsc->cap_delay_lock); | 2924 | spin_unlock(&mdsc->cap_delay_lock); |
2925 | dout("check_delayed_caps on %p\n", &ci->vfs_inode); | 2925 | dout("check_delayed_caps on %p\n", &ci->vfs_inode); |
2926 | ceph_check_caps(ci, flags, NULL); | 2926 | ceph_check_caps(ci, flags, NULL); |
2927 | } | 2927 | } |
2928 | spin_unlock(&mdsc->cap_delay_lock); | 2928 | spin_unlock(&mdsc->cap_delay_lock); |
2929 | } | 2929 | } |
2930 | 2930 | ||
2931 | /* | 2931 | /* |
2932 | * Flush all dirty caps to the mds | 2932 | * Flush all dirty caps to the mds |
2933 | */ | 2933 | */ |
2934 | void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) | 2934 | void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) |
2935 | { | 2935 | { |
2936 | struct ceph_inode_info *ci; | 2936 | struct ceph_inode_info *ci; |
2937 | struct inode *inode; | 2937 | struct inode *inode; |
2938 | 2938 | ||
2939 | dout("flush_dirty_caps\n"); | 2939 | dout("flush_dirty_caps\n"); |
2940 | spin_lock(&mdsc->cap_dirty_lock); | 2940 | spin_lock(&mdsc->cap_dirty_lock); |
2941 | while (!list_empty(&mdsc->cap_dirty)) { | 2941 | while (!list_empty(&mdsc->cap_dirty)) { |
2942 | ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, | 2942 | ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, |
2943 | i_dirty_item); | 2943 | i_dirty_item); |
2944 | inode = &ci->vfs_inode; | 2944 | inode = &ci->vfs_inode; |
2945 | ihold(inode); | 2945 | ihold(inode); |
2946 | dout("flush_dirty_caps %p\n", inode); | 2946 | dout("flush_dirty_caps %p\n", inode); |
2947 | spin_unlock(&mdsc->cap_dirty_lock); | 2947 | spin_unlock(&mdsc->cap_dirty_lock); |
2948 | ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL); | 2948 | ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL); |
2949 | iput(inode); | 2949 | iput(inode); |
2950 | spin_lock(&mdsc->cap_dirty_lock); | 2950 | spin_lock(&mdsc->cap_dirty_lock); |
2951 | } | 2951 | } |
2952 | spin_unlock(&mdsc->cap_dirty_lock); | 2952 | spin_unlock(&mdsc->cap_dirty_lock); |
2953 | dout("flush_dirty_caps done\n"); | 2953 | dout("flush_dirty_caps done\n"); |
2954 | } | 2954 | } |
2955 | 2955 | ||
2956 | /* | 2956 | /* |
2957 | * Drop open file reference. If we were the last open file, | 2957 | * Drop open file reference. If we were the last open file, |
2958 | * we may need to release capabilities to the MDS (or schedule | 2958 | * we may need to release capabilities to the MDS (or schedule |
2959 | * their delayed release). | 2959 | * their delayed release). |
2960 | */ | 2960 | */ |
2961 | void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) | 2961 | void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) |
2962 | { | 2962 | { |
2963 | struct inode *inode = &ci->vfs_inode; | 2963 | struct inode *inode = &ci->vfs_inode; |
2964 | int last = 0; | 2964 | int last = 0; |
2965 | 2965 | ||
2966 | spin_lock(&ci->i_ceph_lock); | 2966 | spin_lock(&ci->i_ceph_lock); |
2967 | dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode, | 2967 | dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode, |
2968 | ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1); | 2968 | ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1); |
2969 | BUG_ON(ci->i_nr_by_mode[fmode] == 0); | 2969 | BUG_ON(ci->i_nr_by_mode[fmode] == 0); |
2970 | if (--ci->i_nr_by_mode[fmode] == 0) | 2970 | if (--ci->i_nr_by_mode[fmode] == 0) |
2971 | last++; | 2971 | last++; |
2972 | spin_unlock(&ci->i_ceph_lock); | 2972 | spin_unlock(&ci->i_ceph_lock); |
2973 | 2973 | ||
2974 | if (last && ci->i_vino.snap == CEPH_NOSNAP) | 2974 | if (last && ci->i_vino.snap == CEPH_NOSNAP) |
2975 | ceph_check_caps(ci, 0, NULL); | 2975 | ceph_check_caps(ci, 0, NULL); |
2976 | } | 2976 | } |
2977 | 2977 | ||
2978 | /* | 2978 | /* |
2979 | * Helpers for embedding cap and dentry lease releases into mds | 2979 | * Helpers for embedding cap and dentry lease releases into mds |
2980 | * requests. | 2980 | * requests. |
2981 | * | 2981 | * |
2982 | * @force is used by dentry_release (below) to force inclusion of a | 2982 | * @force is used by dentry_release (below) to force inclusion of a |
2983 | * record for the directory inode, even when there aren't any caps to | 2983 | * record for the directory inode, even when there aren't any caps to |
2984 | * drop. | 2984 | * drop. |
2985 | */ | 2985 | */ |
2986 | int ceph_encode_inode_release(void **p, struct inode *inode, | 2986 | int ceph_encode_inode_release(void **p, struct inode *inode, |
2987 | int mds, int drop, int unless, int force) | 2987 | int mds, int drop, int unless, int force) |
2988 | { | 2988 | { |
2989 | struct ceph_inode_info *ci = ceph_inode(inode); | 2989 | struct ceph_inode_info *ci = ceph_inode(inode); |
2990 | struct ceph_cap *cap; | 2990 | struct ceph_cap *cap; |
2991 | struct ceph_mds_request_release *rel = *p; | 2991 | struct ceph_mds_request_release *rel = *p; |
2992 | int used, dirty; | 2992 | int used, dirty; |
2993 | int ret = 0; | 2993 | int ret = 0; |
2994 | 2994 | ||
2995 | spin_lock(&ci->i_ceph_lock); | 2995 | spin_lock(&ci->i_ceph_lock); |
2996 | used = __ceph_caps_used(ci); | 2996 | used = __ceph_caps_used(ci); |
2997 | dirty = __ceph_caps_dirty(ci); | 2997 | dirty = __ceph_caps_dirty(ci); |
2998 | 2998 | ||
2999 | dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n", | 2999 | dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n", |
3000 | inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop), | 3000 | inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop), |
3001 | ceph_cap_string(unless)); | 3001 | ceph_cap_string(unless)); |
3002 | 3002 | ||
3003 | /* only drop unused, clean caps */ | 3003 | /* only drop unused, clean caps */ |
3004 | drop &= ~(used | dirty); | 3004 | drop &= ~(used | dirty); |
3005 | 3005 | ||
3006 | cap = __get_cap_for_mds(ci, mds); | 3006 | cap = __get_cap_for_mds(ci, mds); |
3007 | if (cap && __cap_is_valid(cap)) { | 3007 | if (cap && __cap_is_valid(cap)) { |
3008 | if (force || | 3008 | if (force || |
3009 | ((cap->issued & drop) && | 3009 | ((cap->issued & drop) && |
3010 | (cap->issued & unless) == 0)) { | 3010 | (cap->issued & unless) == 0)) { |
3011 | if ((cap->issued & drop) && | 3011 | if ((cap->issued & drop) && |
3012 | (cap->issued & unless) == 0) { | 3012 | (cap->issued & unless) == 0) { |
3013 | dout("encode_inode_release %p cap %p %s -> " | 3013 | dout("encode_inode_release %p cap %p %s -> " |
3014 | "%s\n", inode, cap, | 3014 | "%s\n", inode, cap, |
3015 | ceph_cap_string(cap->issued), | 3015 | ceph_cap_string(cap->issued), |
3016 | ceph_cap_string(cap->issued & ~drop)); | 3016 | ceph_cap_string(cap->issued & ~drop)); |
3017 | cap->issued &= ~drop; | 3017 | cap->issued &= ~drop; |
3018 | cap->implemented &= ~drop; | 3018 | cap->implemented &= ~drop; |
3019 | if (ci->i_ceph_flags & CEPH_I_NODELAY) { | 3019 | if (ci->i_ceph_flags & CEPH_I_NODELAY) { |
3020 | int wanted = __ceph_caps_wanted(ci); | 3020 | int wanted = __ceph_caps_wanted(ci); |
3021 | dout(" wanted %s -> %s (act %s)\n", | 3021 | dout(" wanted %s -> %s (act %s)\n", |
3022 | ceph_cap_string(cap->mds_wanted), | 3022 | ceph_cap_string(cap->mds_wanted), |
3023 | ceph_cap_string(cap->mds_wanted & | 3023 | ceph_cap_string(cap->mds_wanted & |
3024 | ~wanted), | 3024 | ~wanted), |
3025 | ceph_cap_string(wanted)); | 3025 | ceph_cap_string(wanted)); |
3026 | cap->mds_wanted &= wanted; | 3026 | cap->mds_wanted &= wanted; |
3027 | } | 3027 | } |
3028 | } else { | 3028 | } else { |
3029 | dout("encode_inode_release %p cap %p %s" | 3029 | dout("encode_inode_release %p cap %p %s" |
3030 | " (force)\n", inode, cap, | 3030 | " (force)\n", inode, cap, |
3031 | ceph_cap_string(cap->issued)); | 3031 | ceph_cap_string(cap->issued)); |
3032 | } | 3032 | } |
3033 | 3033 | ||
3034 | rel->ino = cpu_to_le64(ceph_ino(inode)); | 3034 | rel->ino = cpu_to_le64(ceph_ino(inode)); |
3035 | rel->cap_id = cpu_to_le64(cap->cap_id); | 3035 | rel->cap_id = cpu_to_le64(cap->cap_id); |
3036 | rel->seq = cpu_to_le32(cap->seq); | 3036 | rel->seq = cpu_to_le32(cap->seq); |
3037 | rel->issue_seq = cpu_to_le32(cap->issue_seq), | 3037 | rel->issue_seq = cpu_to_le32(cap->issue_seq), |
3038 | rel->mseq = cpu_to_le32(cap->mseq); | 3038 | rel->mseq = cpu_to_le32(cap->mseq); |
3039 | rel->caps = cpu_to_le32(cap->issued); | 3039 | rel->caps = cpu_to_le32(cap->issued); |
3040 | rel->wanted = cpu_to_le32(cap->mds_wanted); | 3040 | rel->wanted = cpu_to_le32(cap->mds_wanted); |
3041 | rel->dname_len = 0; | 3041 | rel->dname_len = 0; |
3042 | rel->dname_seq = 0; | 3042 | rel->dname_seq = 0; |
3043 | *p += sizeof(*rel); | 3043 | *p += sizeof(*rel); |
3044 | ret = 1; | 3044 | ret = 1; |
3045 | } else { | 3045 | } else { |
3046 | dout("encode_inode_release %p cap %p %s\n", | 3046 | dout("encode_inode_release %p cap %p %s\n", |
3047 | inode, cap, ceph_cap_string(cap->issued)); | 3047 | inode, cap, ceph_cap_string(cap->issued)); |
3048 | } | 3048 | } |
3049 | } | 3049 | } |
3050 | spin_unlock(&ci->i_ceph_lock); | 3050 | spin_unlock(&ci->i_ceph_lock); |
3051 | return ret; | 3051 | return ret; |
3052 | } | 3052 | } |
3053 | 3053 | ||
/*
 * Encode a dentry release (plus the associated cap release on the
 * containing directory) into the buffer at *p for the given mds.
 *
 * Advances *p past the encoded record(s).  Returns the result of
 * ceph_encode_inode_release() on the parent directory (nonzero if a
 * release record was encoded).
 */
int ceph_encode_dentry_release(void **p, struct dentry *dentry,
			       int mds, int drop, int unless)
{
	struct inode *dir = dentry->d_parent->d_inode;
	struct ceph_mds_request_release *rel = *p;
	struct ceph_dentry_info *di = ceph_dentry(dentry);
	int force = 0;
	int ret;

	/*
	 * force a record for the directory caps if we have a dentry lease.
	 * this is racy (can't take i_ceph_lock and d_lock together), but it
	 * doesn't have to be perfect; the mds will revoke anything we don't
	 * release.
	 */
	spin_lock(&dentry->d_lock);
	if (di->lease_session && di->lease_session->s_mds == mds)
		force = 1;
	spin_unlock(&dentry->d_lock);

	ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);

	/* re-check the lease under d_lock before appending the dname */
	spin_lock(&dentry->d_lock);
	if (ret && di->lease_session && di->lease_session->s_mds == mds) {
		dout("encode_dentry_release %p mds%d seq %d\n",
		     dentry, mds, (int)di->lease_seq);
		/* append the dentry name right after the inode release */
		rel->dname_len = cpu_to_le32(dentry->d_name.len);
		memcpy(*p, dentry->d_name.name, dentry->d_name.len);
		*p += dentry->d_name.len;
		rel->dname_seq = cpu_to_le32(di->lease_seq);
		/* we are giving the lease back along with the caps */
		__ceph_mdsc_drop_dentry_lease(dentry);
	}
	spin_unlock(&dentry->d_lock);
	return ret;
}
3089 | 3089 |
fs/ceph/dir.c
1 | #include <linux/ceph/ceph_debug.h> | 1 | #include <linux/ceph/ceph_debug.h> |
2 | 2 | ||
3 | #include <linux/spinlock.h> | 3 | #include <linux/spinlock.h> |
4 | #include <linux/fs_struct.h> | 4 | #include <linux/fs_struct.h> |
5 | #include <linux/namei.h> | 5 | #include <linux/namei.h> |
6 | #include <linux/slab.h> | 6 | #include <linux/slab.h> |
7 | #include <linux/sched.h> | 7 | #include <linux/sched.h> |
8 | 8 | ||
9 | #include "super.h" | 9 | #include "super.h" |
10 | #include "mds_client.h" | 10 | #include "mds_client.h" |
11 | 11 | ||
12 | /* | 12 | /* |
13 | * Directory operations: readdir, lookup, create, link, unlink, | 13 | * Directory operations: readdir, lookup, create, link, unlink, |
14 | * rename, etc. | 14 | * rename, etc. |
15 | */ | 15 | */ |
16 | 16 | ||
17 | /* | 17 | /* |
18 | * Ceph MDS operations are specified in terms of a base ino and | 18 | * Ceph MDS operations are specified in terms of a base ino and |
19 | * relative path. Thus, the client can specify an operation on a | 19 | * relative path. Thus, the client can specify an operation on a |
20 | * specific inode (e.g., a getattr due to fstat(2)), or as a path | 20 | * specific inode (e.g., a getattr due to fstat(2)), or as a path |
21 | * relative to, say, the root directory. | 21 | * relative to, say, the root directory. |
22 | * | 22 | * |
23 | * Normally, we limit ourselves to strict inode ops (no path component) | 23 | * Normally, we limit ourselves to strict inode ops (no path component) |
24 | * or dentry operations (a single path component relative to an ino). The | 24 | * or dentry operations (a single path component relative to an ino). The |
25 | * exception to this is open_root_dentry(), which will open the mount | 25 | * exception to this is open_root_dentry(), which will open the mount |
26 | * point by name. | 26 | * point by name. |
27 | */ | 27 | */ |
28 | 28 | ||
29 | const struct inode_operations ceph_dir_iops; | 29 | const struct inode_operations ceph_dir_iops; |
30 | const struct file_operations ceph_dir_fops; | 30 | const struct file_operations ceph_dir_fops; |
31 | const struct dentry_operations ceph_dentry_ops; | 31 | const struct dentry_operations ceph_dentry_ops; |
32 | 32 | ||
/*
 * Initialize ceph dentry state.
 *
 * Allocates and attaches a ceph_dentry_info to dentry->d_fsdata and
 * installs the appropriate dentry_operations (plain, snapdir, or snap)
 * based on the parent inode's snap context.  Idempotent: returns 0
 * immediately if d_fsdata is already set.  Returns -ENOMEM on
 * allocation failure.
 */
int ceph_init_dentry(struct dentry *dentry)
{
	struct ceph_dentry_info *di;

	/* unlocked fast path: already initialized? */
	if (dentry->d_fsdata)
		return 0;

	di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
	if (!di)
		return -ENOMEM;          /* oh well */

	spin_lock(&dentry->d_lock);
	if (dentry->d_fsdata) {
		/* lost a race with a concurrent initializer */
		kmem_cache_free(ceph_dentry_cachep, di);
		goto out_unlock;
	}

	/* choose d_ops based on the parent's snap state */
	if (dentry->d_parent == NULL ||   /* nfs fh_to_dentry */
	    ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
		d_set_d_op(dentry, &ceph_dentry_ops);
	else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
		d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
	else
		d_set_d_op(dentry, &ceph_snap_dentry_ops);

	di->dentry = dentry;
	di->lease_session = NULL;
	dentry->d_time = jiffies;
	/* avoid reordering d_fsdata setup so that the check above is safe */
	smp_mb();
	dentry->d_fsdata = di;
	ceph_dentry_lru_add(dentry);
out_unlock:
	spin_unlock(&dentry->d_lock);
	return 0;
}
73 | 73 | ||
74 | struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) | 74 | struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) |
75 | { | 75 | { |
76 | struct inode *inode = NULL; | 76 | struct inode *inode = NULL; |
77 | 77 | ||
78 | if (!dentry) | 78 | if (!dentry) |
79 | return NULL; | 79 | return NULL; |
80 | 80 | ||
81 | spin_lock(&dentry->d_lock); | 81 | spin_lock(&dentry->d_lock); |
82 | if (dentry->d_parent) { | 82 | if (dentry->d_parent) { |
83 | inode = dentry->d_parent->d_inode; | 83 | inode = dentry->d_parent->d_inode; |
84 | ihold(inode); | 84 | ihold(inode); |
85 | } | 85 | } |
86 | spin_unlock(&dentry->d_lock); | 86 | spin_unlock(&dentry->d_lock); |
87 | return inode; | 87 | return inode; |
88 | } | 88 | } |
89 | 89 | ||
90 | 90 | ||
/*
 * for readdir, we encode the directory frag and offset within that
 * frag into f_pos: the frag in the high 32 bits, the offset within
 * the frag in the low 32 bits.
 */
static unsigned fpos_frag(loff_t p)
{
	/* high 32 bits of f_pos hold the frag */
	return p >> 32;
}
/* low 32 bits of f_pos hold the offset within the frag */
static unsigned fpos_off(loff_t p)
{
	return p & 0xffffffff;
}
103 | 103 | ||
/*
 * When possible, we try to satisfy a readdir by peeking at the
 * dcache.  We make this work by carefully ordering dentries on
 * d_u.d_child when we initially get results back from the MDS, and
 * falling back to a "normal" sync readdir if any dentries in the dir
 * are dropped.
 *
 * D_COMPLETE indicates we have all dentries in the dir.  It is
 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
 * the MDS if/when the directory is modified).
 *
 * Returns 0 on success, -EAGAIN if the caller must fall back to a
 * synchronous MDS readdir, or a negative error from filldir.
 */
static int __dcache_readdir(struct file *filp,
			    void *dirent, filldir_t filldir)
{
	struct ceph_file_info *fi = filp->private_data;
	struct dentry *parent = filp->f_dentry;
	struct inode *dir = parent->d_inode;
	struct list_head *p;
	struct dentry *dentry, *last;
	struct ceph_dentry_info *di;
	int err = 0;

	/* claim ref on last dentry we returned */
	last = fi->dentry;
	fi->dentry = NULL;

	dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
	     last);

	spin_lock(&parent->d_lock);

	/* start at beginning? */
	if (filp->f_pos == 2 || last == NULL ||
	    filp->f_pos < ceph_dentry(last)->offset) {
		if (list_empty(&parent->d_subdirs))
			goto out_unlock;
		/* d_subdirs is walked in reverse; start at the tail */
		p = parent->d_subdirs.prev;
		dout(" initial p %p/%p\n", p->prev, p->next);
	} else {
		/* resume just before where we left off */
		p = last->d_u.d_child.prev;
	}

more:
	dentry = list_entry(p, struct dentry, d_u.d_child);
	di = ceph_dentry(dentry);
	/* scan backwards for the next dentry at or past f_pos */
	while (1) {
		dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
		     d_unhashed(dentry) ? "!hashed" : "hashed",
		     parent->d_subdirs.prev, parent->d_subdirs.next);
		if (p == &parent->d_subdirs) {
			/* wrapped to the list head: no more entries */
			fi->flags |= CEPH_F_ATEND;
			goto out_unlock;
		}
		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
		if (!d_unhashed(dentry) && dentry->d_inode &&
		    ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
		    ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
		    filp->f_pos <= di->offset)
			break;		/* usable entry; d_lock still held */
		dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
		     dentry->d_name.len, dentry->d_name.name, di->offset,
		     filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
		     !dentry->d_inode ? " null" : "");
		spin_unlock(&dentry->d_lock);
		p = p->prev;
		dentry = list_entry(p, struct dentry, d_u.d_child);
		di = ceph_dentry(dentry);
	}

	/* pin the dentry before dropping both locks to call filldir */
	dget_dlock(dentry);
	spin_unlock(&dentry->d_lock);
	spin_unlock(&parent->d_lock);

	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
	     dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
	filp->f_pos = di->offset;
	err = filldir(dirent, dentry->d_name.name,
		      dentry->d_name.len, di->offset,
		      ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
		      dentry->d_inode->i_mode >> 12);

	if (last) {
		if (err < 0) {
			/* remember our position */
			fi->dentry = last;
			fi->next_offset = di->offset;
		} else {
			dput(last);
		}
	}
	/* the current dentry's ref is dropped on the next pass (or at out) */
	last = dentry;

	if (err < 0)
		goto out;

	filp->f_pos++;

	/* make sure a dentry wasn't dropped while we didn't have parent lock */
	if (!ceph_dir_test_complete(dir)) {
		dout(" lost D_COMPLETE on %p; falling back to mds\n", dir);
		err = -EAGAIN;
		goto out;
	}

	spin_lock(&parent->d_lock);
	p = p->prev;	/* advance to next dentry */
	goto more;

out_unlock:
	spin_unlock(&parent->d_lock);
out:
	if (last)
		dput(last);
	return err;
}
219 | 219 | ||
220 | /* | 220 | /* |
221 | * make note of the last dentry we read, so we can | 221 | * make note of the last dentry we read, so we can |
222 | * continue at the same lexicographical point, | 222 | * continue at the same lexicographical point, |
223 | * regardless of what dir changes take place on the | 223 | * regardless of what dir changes take place on the |
224 | * server. | 224 | * server. |
225 | */ | 225 | */ |
226 | static int note_last_dentry(struct ceph_file_info *fi, const char *name, | 226 | static int note_last_dentry(struct ceph_file_info *fi, const char *name, |
227 | int len) | 227 | int len) |
228 | { | 228 | { |
229 | kfree(fi->last_name); | 229 | kfree(fi->last_name); |
230 | fi->last_name = kmalloc(len+1, GFP_NOFS); | 230 | fi->last_name = kmalloc(len+1, GFP_NOFS); |
231 | if (!fi->last_name) | 231 | if (!fi->last_name) |
232 | return -ENOMEM; | 232 | return -ENOMEM; |
233 | memcpy(fi->last_name, name, len); | 233 | memcpy(fi->last_name, name, len); |
234 | fi->last_name[len] = 0; | 234 | fi->last_name[len] = 0; |
235 | dout("note_last_dentry '%s'\n", fi->last_name); | 235 | dout("note_last_dentry '%s'\n", fi->last_name); |
236 | return 0; | 236 | return 0; |
237 | } | 237 | } |
238 | 238 | ||
/*
 * readdir(2) for a ceph directory.
 *
 * Emits "." and ".." first, then tries a dcache-only walk via
 * __dcache_readdir() when the directory is known complete; otherwise
 * issues READDIR (or LSSNAP for a snapdir) requests to the MDS one
 * frag at a time, buffering the most recent reply in fi->last_readdir
 * and resuming from fi->last_name / fi->next_offset.
 */
static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
	struct ceph_file_info *fi = filp->private_data;
	struct inode *inode = filp->f_dentry->d_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	unsigned frag = fpos_frag(filp->f_pos);  /* high 32 bits of f_pos */
	int off = fpos_off(filp->f_pos);         /* low 32 bits of f_pos */
	int err;
	u32 ftype;
	struct ceph_mds_reply_info_parsed *rinfo;
	const int max_entries = fsc->mount_options->max_readdir;
	const int max_bytes = fsc->mount_options->max_readdir_bytes;

	dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
	if (fi->flags & CEPH_F_ATEND)
		return 0;

	/* always start with . and .. */
	if (filp->f_pos == 0) {
		/* note dir version at start of readdir so we can tell
		 * if any dentries get dropped */
		fi->dir_release_count = ci->i_release_count;

		dout("readdir off 0 -> '.'\n");
		if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
			    ceph_translate_ino(inode->i_sb, inode->i_ino),
			    inode->i_mode >> 12) < 0)
			return 0;
		filp->f_pos = 1;
		off = 1;
	}
	if (filp->f_pos == 1) {
		ino_t ino = parent_ino(filp->f_dentry);
		dout("readdir off 1 -> '..'\n");
		if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
			    ceph_translate_ino(inode->i_sb, ino),
			    inode->i_mode >> 12) < 0)
			return 0;
		filp->f_pos = 2;
		off = 2;
	}

	/* can we use the dcache? */
	spin_lock(&ci->i_ceph_lock);
	if ((filp->f_pos == 2 || fi->dentry) &&
	    !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
	    ceph_snap(inode) != CEPH_SNAPDIR &&
	    ceph_dir_test_complete(inode) &&
	    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
		spin_unlock(&ci->i_ceph_lock);
		err = __dcache_readdir(filp, dirent, filldir);
		/* -EAGAIN means a dentry was dropped; fall through to mds */
		if (err != -EAGAIN)
			return err;
	} else {
		spin_unlock(&ci->i_ceph_lock);
	}
	if (fi->dentry) {
		/* record where the dcache walk stopped, then drop the ref */
		err = note_last_dentry(fi, fi->dentry->d_name.name,
				       fi->dentry->d_name.len);
		if (err)
			return err;
		dput(fi->dentry);
		fi->dentry = NULL;
	}

	/* proceed with a normal readdir */

more:
	/* do we have the correct frag content buffered? */
	if (fi->frag != frag || fi->last_readdir == NULL) {
		struct ceph_mds_request *req;
		int op = ceph_snap(inode) == CEPH_SNAPDIR ?
			CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;

		/* discard old result, if any */
		if (fi->last_readdir) {
			ceph_mdsc_put_request(fi->last_readdir);
			fi->last_readdir = NULL;
		}

		/* requery frag tree, as the frag topology may have changed */
		frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);

		dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
		     ceph_vinop(inode), frag, fi->last_name);
		req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
		if (IS_ERR(req))
			return PTR_ERR(req);
		req->r_inode = inode;
		ihold(inode);
		req->r_dentry = dget(filp->f_dentry);
		/* hints to request -> mds selection code */
		req->r_direct_mode = USE_AUTH_MDS;
		req->r_direct_hash = ceph_frag_value(frag);
		req->r_direct_is_hash = true;
		/* resume listing after the last name we saw */
		req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
		req->r_readdir_offset = fi->next_offset;
		req->r_args.readdir.frag = cpu_to_le32(frag);
		req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
		req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
		req->r_num_caps = max_entries + 1;
		err = ceph_mdsc_do_request(mdsc, NULL, req);
		if (err < 0) {
			ceph_mdsc_put_request(req);
			return err;
		}
		dout("readdir got and parsed readdir result=%d"
		     " on frag %x, end=%d, complete=%d\n", err, frag,
		     (int)req->r_reply_info.dir_end,
		     (int)req->r_reply_info.dir_complete);

		if (!req->r_did_prepopulate) {
			dout("readdir !did_prepopulate");
			fi->dir_release_count--;    /* preclude D_COMPLETE */
		}

		/* note next offset and last dentry name */
		fi->offset = fi->next_offset;
		fi->last_readdir = req;

		if (req->r_reply_info.dir_end) {
			/* frag exhausted: reset resume state */
			kfree(fi->last_name);
			fi->last_name = NULL;
			if (ceph_frag_is_rightmost(frag))
				fi->next_offset = 2;
			else
				fi->next_offset = 0;
		} else {
			/* remember the last name so the next request resumes */
			rinfo = &req->r_reply_info;
			err = note_last_dentry(fi,
				       rinfo->dir_dname[rinfo->dir_nr-1],
				       rinfo->dir_dname_len[rinfo->dir_nr-1]);
			if (err)
				return err;
			fi->next_offset += rinfo->dir_nr;
		}
	}

	/* emit buffered entries starting at off within this frag */
	rinfo = &fi->last_readdir->r_reply_info;
	dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
	     rinfo->dir_nr, off, fi->offset);
	while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
		u64 pos = ceph_make_fpos(frag, off);
		struct ceph_mds_reply_inode *in =
			rinfo->dir_in[off - fi->offset].in;
		struct ceph_vino vino;
		ino_t ino;

		dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
		     off, off - fi->offset, rinfo->dir_nr, pos,
		     rinfo->dir_dname_len[off - fi->offset],
		     rinfo->dir_dname[off - fi->offset], in);
		BUG_ON(!in);
		ftype = le32_to_cpu(in->mode) >> 12;
		vino.ino = le64_to_cpu(in->ino);
		vino.snap = le64_to_cpu(in->snapid);
		ino = ceph_vino_to_ino(vino);
		if (filldir(dirent,
			    rinfo->dir_dname[off - fi->offset],
			    rinfo->dir_dname_len[off - fi->offset],
			    pos,
			    ceph_translate_ino(inode->i_sb, ino), ftype) < 0) {
			dout("filldir stopping us...\n");
			return 0;
		}
		off++;
		filp->f_pos = pos + 1;
	}

	/* last_name set means the frag has more entries: fetch next chunk */
	if (fi->last_name) {
		ceph_mdsc_put_request(fi->last_readdir);
		fi->last_readdir = NULL;
		goto more;
	}

	/* more frags? */
	if (!ceph_frag_is_rightmost(frag)) {
		frag = ceph_frag_next(frag);
		off = 0;
		filp->f_pos = ceph_make_fpos(frag, off);
		dout("readdir next frag is %x\n", frag);
		goto more;
	}
	fi->flags |= CEPH_F_ATEND;

	/*
	 * if dir_release_count still matches the dir, no dentries
	 * were released during the whole readdir, and we should have
	 * the complete dir contents in our cache.
	 */
	spin_lock(&ci->i_ceph_lock);
	if (ci->i_release_count == fi->dir_release_count) {
		ceph_dir_set_complete(inode);
		ci->i_max_offset = filp->f_pos;
	}
	spin_unlock(&ci->i_ceph_lock);

	dout("readdir %p filp %p done.\n", inode, filp);
	return 0;
}
441 | 441 | ||
442 | static void reset_readdir(struct ceph_file_info *fi) | 442 | static void reset_readdir(struct ceph_file_info *fi) |
443 | { | 443 | { |
444 | if (fi->last_readdir) { | 444 | if (fi->last_readdir) { |
445 | ceph_mdsc_put_request(fi->last_readdir); | 445 | ceph_mdsc_put_request(fi->last_readdir); |
446 | fi->last_readdir = NULL; | 446 | fi->last_readdir = NULL; |
447 | } | 447 | } |
448 | kfree(fi->last_name); | 448 | kfree(fi->last_name); |
449 | fi->last_name = NULL; | 449 | fi->last_name = NULL; |
450 | fi->next_offset = 2; /* compensate for . and .. */ | 450 | fi->next_offset = 2; /* compensate for . and .. */ |
451 | if (fi->dentry) { | 451 | if (fi->dentry) { |
452 | dput(fi->dentry); | 452 | dput(fi->dentry); |
453 | fi->dentry = NULL; | 453 | fi->dentry = NULL; |
454 | } | 454 | } |
455 | fi->flags &= ~CEPH_F_ATEND; | 455 | fi->flags &= ~CEPH_F_ATEND; |
456 | } | 456 | } |
457 | 457 | ||
458 | static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) | 458 | static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) |
459 | { | 459 | { |
460 | struct ceph_file_info *fi = file->private_data; | 460 | struct ceph_file_info *fi = file->private_data; |
461 | struct inode *inode = file->f_mapping->host; | 461 | struct inode *inode = file->f_mapping->host; |
462 | loff_t old_offset = offset; | 462 | loff_t old_offset = offset; |
463 | loff_t retval; | 463 | loff_t retval; |
464 | 464 | ||
465 | mutex_lock(&inode->i_mutex); | 465 | mutex_lock(&inode->i_mutex); |
466 | retval = -EINVAL; | 466 | retval = -EINVAL; |
467 | switch (origin) { | 467 | switch (origin) { |
468 | case SEEK_END: | 468 | case SEEK_END: |
469 | offset += inode->i_size + 2; /* FIXME */ | 469 | offset += inode->i_size + 2; /* FIXME */ |
470 | break; | 470 | break; |
471 | case SEEK_CUR: | 471 | case SEEK_CUR: |
472 | offset += file->f_pos; | 472 | offset += file->f_pos; |
473 | case SEEK_SET: | 473 | case SEEK_SET: |
474 | break; | 474 | break; |
475 | default: | 475 | default: |
476 | goto out; | 476 | goto out; |
477 | } | 477 | } |
478 | 478 | ||
479 | if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { | 479 | if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { |
480 | if (offset != file->f_pos) { | 480 | if (offset != file->f_pos) { |
481 | file->f_pos = offset; | 481 | file->f_pos = offset; |
482 | file->f_version = 0; | 482 | file->f_version = 0; |
483 | fi->flags &= ~CEPH_F_ATEND; | 483 | fi->flags &= ~CEPH_F_ATEND; |
484 | } | 484 | } |
485 | retval = offset; | 485 | retval = offset; |
486 | 486 | ||
487 | /* | 487 | /* |
488 | * discard buffered readdir content on seekdir(0), or | 488 | * discard buffered readdir content on seekdir(0), or |
489 | * seek to new frag, or seek prior to current chunk. | 489 | * seek to new frag, or seek prior to current chunk. |
490 | */ | 490 | */ |
491 | if (offset == 0 || | 491 | if (offset == 0 || |
492 | fpos_frag(offset) != fpos_frag(old_offset) || | 492 | fpos_frag(offset) != fpos_frag(old_offset) || |
493 | fpos_off(offset) < fi->offset) { | 493 | fpos_off(offset) < fi->offset) { |
494 | dout("dir_llseek dropping %p content\n", file); | 494 | dout("dir_llseek dropping %p content\n", file); |
495 | reset_readdir(fi); | 495 | reset_readdir(fi); |
496 | } | 496 | } |
497 | 497 | ||
498 | /* bump dir_release_count if we did a forward seek */ | 498 | /* bump dir_release_count if we did a forward seek */ |
499 | if (offset > old_offset) | 499 | if (offset > old_offset) |
500 | fi->dir_release_count--; | 500 | fi->dir_release_count--; |
501 | } | 501 | } |
502 | out: | 502 | out: |
503 | mutex_unlock(&inode->i_mutex); | 503 | mutex_unlock(&inode->i_mutex); |
504 | return retval; | 504 | return retval; |
505 | } | 505 | } |
506 | 506 | ||
507 | /* | 507 | /* |
508 | * Handle lookups for the hidden .snap directory. | 508 | * Handle lookups for the hidden .snap directory. |
509 | */ | 509 | */ |
510 | int ceph_handle_snapdir(struct ceph_mds_request *req, | 510 | int ceph_handle_snapdir(struct ceph_mds_request *req, |
511 | struct dentry *dentry, int err) | 511 | struct dentry *dentry, int err) |
512 | { | 512 | { |
513 | struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); | 513 | struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); |
514 | struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */ | 514 | struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */ |
515 | 515 | ||
516 | /* .snap dir? */ | 516 | /* .snap dir? */ |
517 | if (err == -ENOENT && | 517 | if (err == -ENOENT && |
518 | ceph_snap(parent) == CEPH_NOSNAP && | 518 | ceph_snap(parent) == CEPH_NOSNAP && |
519 | strcmp(dentry->d_name.name, | 519 | strcmp(dentry->d_name.name, |
520 | fsc->mount_options->snapdir_name) == 0) { | 520 | fsc->mount_options->snapdir_name) == 0) { |
521 | struct inode *inode = ceph_get_snapdir(parent); | 521 | struct inode *inode = ceph_get_snapdir(parent); |
522 | dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", | 522 | dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", |
523 | dentry, dentry->d_name.len, dentry->d_name.name, inode); | 523 | dentry, dentry->d_name.len, dentry->d_name.name, inode); |
524 | BUG_ON(!d_unhashed(dentry)); | 524 | BUG_ON(!d_unhashed(dentry)); |
525 | d_add(dentry, inode); | 525 | d_add(dentry, inode); |
526 | err = 0; | 526 | err = 0; |
527 | } | 527 | } |
528 | return err; | 528 | return err; |
529 | } | 529 | } |
530 | 530 | ||
531 | /* | 531 | /* |
532 | * Figure out final result of a lookup/open request. | 532 | * Figure out final result of a lookup/open request. |
533 | * | 533 | * |
534 | * Mainly, make sure we return the final req->r_dentry (if it already | 534 | * Mainly, make sure we return the final req->r_dentry (if it already |
535 | * existed) in place of the original VFS-provided dentry when they | 535 | * existed) in place of the original VFS-provided dentry when they |
536 | * differ. | 536 | * differ. |
537 | * | 537 | * |
538 | * Gracefully handle the case where the MDS replies with -ENOENT and | 538 | * Gracefully handle the case where the MDS replies with -ENOENT and |
539 | * no trace (which it may do, at its discretion, e.g., if it doesn't | 539 | * no trace (which it may do, at its discretion, e.g., if it doesn't |
540 | * care to issue a lease on the negative dentry). | 540 | * care to issue a lease on the negative dentry). |
541 | */ | 541 | */ |
542 | struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, | 542 | struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, |
543 | struct dentry *dentry, int err) | 543 | struct dentry *dentry, int err) |
544 | { | 544 | { |
545 | if (err == -ENOENT) { | 545 | if (err == -ENOENT) { |
546 | /* no trace? */ | 546 | /* no trace? */ |
547 | err = 0; | 547 | err = 0; |
548 | if (!req->r_reply_info.head->is_dentry) { | 548 | if (!req->r_reply_info.head->is_dentry) { |
549 | dout("ENOENT and no trace, dentry %p inode %p\n", | 549 | dout("ENOENT and no trace, dentry %p inode %p\n", |
550 | dentry, dentry->d_inode); | 550 | dentry, dentry->d_inode); |
551 | if (dentry->d_inode) { | 551 | if (dentry->d_inode) { |
552 | d_drop(dentry); | 552 | d_drop(dentry); |
553 | err = -ENOENT; | 553 | err = -ENOENT; |
554 | } else { | 554 | } else { |
555 | d_add(dentry, NULL); | 555 | d_add(dentry, NULL); |
556 | } | 556 | } |
557 | } | 557 | } |
558 | } | 558 | } |
559 | if (err) | 559 | if (err) |
560 | dentry = ERR_PTR(err); | 560 | dentry = ERR_PTR(err); |
561 | else if (dentry != req->r_dentry) | 561 | else if (dentry != req->r_dentry) |
562 | dentry = dget(req->r_dentry); /* we got spliced */ | 562 | dentry = dget(req->r_dentry); /* we got spliced */ |
563 | else | 563 | else |
564 | dentry = NULL; | 564 | dentry = NULL; |
565 | return dentry; | 565 | return dentry; |
566 | } | 566 | } |
567 | 567 | ||
568 | static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) | 568 | static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) |
569 | { | 569 | { |
570 | return ceph_ino(inode) == CEPH_INO_ROOT && | 570 | return ceph_ino(inode) == CEPH_INO_ROOT && |
571 | strncmp(dentry->d_name.name, ".ceph", 5) == 0; | 571 | strncmp(dentry->d_name.name, ".ceph", 5) == 0; |
572 | } | 572 | } |
573 | 573 | ||
/*
 * Look up a single dir entry.  If there is a lookup intent, inform
 * the MDS so that it gets our 'caps wanted' value in a single op.
 *
 * Returns NULL on success with the VFS-provided dentry instantiated,
 * a spliced dentry, or an ERR_PTR.
 */
static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
				  struct nameidata *nd)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	int op;
	int err;

	dout("lookup %p dentry %p '%.*s'\n",
	     dir, dentry, dentry->d_name.len, dentry->d_name.name);

	if (dentry->d_name.len > NAME_MAX)
		return ERR_PTR(-ENAMETOOLONG);

	err = ceph_init_dentry(dentry);
	if (err < 0)
		return ERR_PTR(err);

	/* open (but not create!) intent? */
	if (nd &&
	    (nd->flags & LOOKUP_OPEN) &&
	    !(nd->intent.open.flags & O_CREAT)) {
		int mode = nd->intent.open.create_mode & ~current->fs->umask;
		return ceph_lookup_open(dir, dentry, nd, mode, 1);
	}

	/* can we conclude ENOENT locally? */
	if (dentry->d_inode == NULL) {
		struct ceph_inode_info *ci = ceph_inode(dir);
		struct ceph_dentry_info *di = ceph_dentry(dentry);

		spin_lock(&ci->i_ceph_lock);
		dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
		/*
		 * If the name is neither the snapdir nor the hidden root
		 * ".ceph" entry, the dir is marked complete in our cache,
		 * and we hold a FILE_SHARED cap on it, then the name
		 * really does not exist: answer ENOENT without asking
		 * the MDS.  (The d_add/lease_shared_gen updates happen
		 * after dropping i_ceph_lock; keep the order as-is.)
		 */
		if (strncmp(dentry->d_name.name,
			    fsc->mount_options->snapdir_name,
			    dentry->d_name.len) &&
		    !is_root_ceph_dentry(dir, dentry) &&
		    ceph_dir_test_complete(dir) &&
		    (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
			spin_unlock(&ci->i_ceph_lock);
			dout(" dir %p complete, -ENOENT\n", dir);
			d_add(dentry, NULL);	/* cache negative dentry */
			di->lease_shared_gen = ci->i_shared_gen;
			return NULL;
		}
		spin_unlock(&ci->i_ceph_lock);
	}

	/* a lookup inside the snapdir is a snapshot lookup */
	op = ceph_snap(dir) == CEPH_SNAPDIR ?
		CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
	req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
	if (IS_ERR(req))
		return ERR_CAST(req);
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	/* we only need inode linkage */
	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
	req->r_locked_dir = dir;
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	err = ceph_handle_snapdir(req, dentry, err);
	dentry = ceph_finish_lookup(req, dentry, err);
	ceph_mdsc_put_request(req);  /* will dput(dentry) */
	dout("lookup result=%p\n", dentry);
	return dentry;
}
644 | 644 | ||
645 | /* | 645 | /* |
646 | * If we do a create but get no trace back from the MDS, follow up with | 646 | * If we do a create but get no trace back from the MDS, follow up with |
647 | * a lookup (the VFS expects us to link up the provided dentry). | 647 | * a lookup (the VFS expects us to link up the provided dentry). |
648 | */ | 648 | */ |
649 | int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) | 649 | int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) |
650 | { | 650 | { |
651 | struct dentry *result = ceph_lookup(dir, dentry, NULL); | 651 | struct dentry *result = ceph_lookup(dir, dentry, NULL); |
652 | 652 | ||
653 | if (result && !IS_ERR(result)) { | 653 | if (result && !IS_ERR(result)) { |
654 | /* | 654 | /* |
655 | * We created the item, then did a lookup, and found | 655 | * We created the item, then did a lookup, and found |
656 | * it was already linked to another inode we already | 656 | * it was already linked to another inode we already |
657 | * had in our cache (and thus got spliced). Link our | 657 | * had in our cache (and thus got spliced). Link our |
658 | * dentry to that inode, but don't hash it, just in | 658 | * dentry to that inode, but don't hash it, just in |
659 | * case the VFS wants to dereference it. | 659 | * case the VFS wants to dereference it. |
660 | */ | 660 | */ |
661 | BUG_ON(!result->d_inode); | 661 | BUG_ON(!result->d_inode); |
662 | d_instantiate(dentry, result->d_inode); | 662 | d_instantiate(dentry, result->d_inode); |
663 | return 0; | 663 | return 0; |
664 | } | 664 | } |
665 | return PTR_ERR(result); | 665 | return PTR_ERR(result); |
666 | } | 666 | } |
667 | 667 | ||
/*
 * Create a special file (also used by ceph_create() for regular files)
 * by sending a MKNOD request to the auth MDS for the parent directory.
 */
static int ceph_mknod(struct inode *dir, struct dentry *dentry,
		      umode_t mode, dev_t rdev)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	int err;

	/* snapshots are read-only */
	if (ceph_snap(dir) != CEPH_NOSNAP)
		return -EROFS;

	dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
	     dir, dentry, mode, rdev);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		d_drop(dentry);
		return PTR_ERR(req);
	}
	req->r_dentry = dget(dentry);	/* ref released with the request */
	req->r_num_caps = 2;
	req->r_locked_dir = dir;
	req->r_args.mknod.mode = cpu_to_le32(mode);
	req->r_args.mknod.rdev = cpu_to_le32(rdev);
	/* drop FILE_SHARED on the dir unless we hold FILE_EXCL */
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	err = ceph_mdsc_do_request(mdsc, dir, req);
	/* no trace in the reply?  do a lookup to link up the dentry */
	if (!err && !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);
	ceph_mdsc_put_request(req);
	if (err)
		d_drop(dentry);
	return err;
}
701 | 701 | ||
702 | static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode, | 702 | static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode, |
703 | struct nameidata *nd) | 703 | struct nameidata *nd) |
704 | { | 704 | { |
705 | dout("create in dir %p dentry %p name '%.*s'\n", | 705 | dout("create in dir %p dentry %p name '%.*s'\n", |
706 | dir, dentry, dentry->d_name.len, dentry->d_name.name); | 706 | dir, dentry, dentry->d_name.len, dentry->d_name.name); |
707 | 707 | ||
708 | if (ceph_snap(dir) != CEPH_NOSNAP) | 708 | if (ceph_snap(dir) != CEPH_NOSNAP) |
709 | return -EROFS; | 709 | return -EROFS; |
710 | 710 | ||
711 | if (nd) { | 711 | if (nd) { |
712 | BUG_ON((nd->flags & LOOKUP_OPEN) == 0); | 712 | BUG_ON((nd->flags & LOOKUP_OPEN) == 0); |
713 | dentry = ceph_lookup_open(dir, dentry, nd, mode, 0); | 713 | dentry = ceph_lookup_open(dir, dentry, nd, mode, 0); |
714 | /* hrm, what should i do here if we get aliased? */ | 714 | /* hrm, what should i do here if we get aliased? */ |
715 | if (IS_ERR(dentry)) | 715 | if (IS_ERR(dentry)) |
716 | return PTR_ERR(dentry); | 716 | return PTR_ERR(dentry); |
717 | return 0; | 717 | return 0; |
718 | } | 718 | } |
719 | 719 | ||
720 | /* fall back to mknod */ | 720 | /* fall back to mknod */ |
721 | return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0); | 721 | return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0); |
722 | } | 722 | } |
723 | 723 | ||
724 | static int ceph_symlink(struct inode *dir, struct dentry *dentry, | 724 | static int ceph_symlink(struct inode *dir, struct dentry *dentry, |
725 | const char *dest) | 725 | const char *dest) |
726 | { | 726 | { |
727 | struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); | 727 | struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); |
728 | struct ceph_mds_client *mdsc = fsc->mdsc; | 728 | struct ceph_mds_client *mdsc = fsc->mdsc; |
729 | struct ceph_mds_request *req; | 729 | struct ceph_mds_request *req; |
730 | int err; | 730 | int err; |
731 | 731 | ||
732 | if (ceph_snap(dir) != CEPH_NOSNAP) | 732 | if (ceph_snap(dir) != CEPH_NOSNAP) |
733 | return -EROFS; | 733 | return -EROFS; |
734 | 734 | ||
735 | dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); | 735 | dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); |
736 | req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); | 736 | req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); |
737 | if (IS_ERR(req)) { | 737 | if (IS_ERR(req)) { |
738 | d_drop(dentry); | 738 | d_drop(dentry); |
739 | return PTR_ERR(req); | 739 | return PTR_ERR(req); |
740 | } | 740 | } |
741 | req->r_dentry = dget(dentry); | 741 | req->r_dentry = dget(dentry); |
742 | req->r_num_caps = 2; | 742 | req->r_num_caps = 2; |
743 | req->r_path2 = kstrdup(dest, GFP_NOFS); | 743 | req->r_path2 = kstrdup(dest, GFP_NOFS); |
744 | req->r_locked_dir = dir; | 744 | req->r_locked_dir = dir; |
745 | req->r_dentry_drop = CEPH_CAP_FILE_SHARED; | 745 | req->r_dentry_drop = CEPH_CAP_FILE_SHARED; |
746 | req->r_dentry_unless = CEPH_CAP_FILE_EXCL; | 746 | req->r_dentry_unless = CEPH_CAP_FILE_EXCL; |
747 | err = ceph_mdsc_do_request(mdsc, dir, req); | 747 | err = ceph_mdsc_do_request(mdsc, dir, req); |
748 | if (!err && !req->r_reply_info.head->is_dentry) | 748 | if (!err && !req->r_reply_info.head->is_dentry) |
749 | err = ceph_handle_notrace_create(dir, dentry); | 749 | err = ceph_handle_notrace_create(dir, dentry); |
750 | ceph_mdsc_put_request(req); | 750 | ceph_mdsc_put_request(req); |
751 | if (err) | 751 | if (err) |
752 | d_drop(dentry); | 752 | d_drop(dentry); |
753 | return err; | 753 | return err; |
754 | } | 754 | } |
755 | 755 | ||
/*
 * mkdir -- or, inside the hidden .snap dir, snapshot creation.
 */
static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	int err = -EROFS;
	int op;

	if (ceph_snap(dir) == CEPH_SNAPDIR) {
		/* mkdir .snap/foo is a MKSNAP */
		op = CEPH_MDS_OP_MKSNAP;
		dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
		     dentry->d_name.len, dentry->d_name.name, dentry);
	} else if (ceph_snap(dir) == CEPH_NOSNAP) {
		dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
		op = CEPH_MDS_OP_MKDIR;
	} else {
		/* anywhere else inside a snapshot is read-only */
		goto out;
	}
	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}

	req->r_dentry = dget(dentry);	/* ref released with the request */
	req->r_num_caps = 2;
	req->r_locked_dir = dir;
	req->r_args.mkdir.mode = cpu_to_le32(mode);
	/* drop FILE_SHARED on the dir unless we hold FILE_EXCL */
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	err = ceph_mdsc_do_request(mdsc, dir, req);
	/* no trace in the reply?  do a lookup to link up the dentry */
	if (!err && !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);
	ceph_mdsc_put_request(req);
out:
	if (err < 0)
		d_drop(dentry);
	return err;
}
796 | 796 | ||
/*
 * Create a hard link: link old_dentry's inode into dir as dentry.
 */
static int ceph_link(struct dentry *old_dentry, struct inode *dir,
		     struct dentry *dentry)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	int err;

	/* snapshots are read-only */
	if (ceph_snap(dir) != CEPH_NOSNAP)
		return -EROFS;

	dout("link in dir %p old_dentry %p dentry %p\n", dir,
	     old_dentry, dentry);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		d_drop(dentry);
		return PTR_ERR(req);
	}
	req->r_dentry = dget(dentry);	/* ref released with the request */
	req->r_num_caps = 2;
	req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
	req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
	req->r_locked_dir = dir;
	/* drop FILE_SHARED on the dir unless we hold FILE_EXCL */
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (err) {
		d_drop(dentry);
	} else if (!req->r_reply_info.head->is_dentry) {
		/* no trace: instantiate the new dentry ourselves */
		ihold(old_dentry->d_inode);
		d_instantiate(dentry, old_dentry->d_inode);
	}
	ceph_mdsc_put_request(req);
	return err;
}
832 | 832 | ||
833 | /* | 833 | /* |
834 | * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it | 834 | * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it |
835 | * looks like the link count will hit 0, drop any other caps (other | 835 | * looks like the link count will hit 0, drop any other caps (other |
836 | * than PIN) we don't specifically want (due to the file still being | 836 | * than PIN) we don't specifically want (due to the file still being |
837 | * open). | 837 | * open). |
838 | */ | 838 | */ |
839 | static int drop_caps_for_unlink(struct inode *inode) | 839 | static int drop_caps_for_unlink(struct inode *inode) |
840 | { | 840 | { |
841 | struct ceph_inode_info *ci = ceph_inode(inode); | 841 | struct ceph_inode_info *ci = ceph_inode(inode); |
842 | int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; | 842 | int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; |
843 | 843 | ||
844 | spin_lock(&ci->i_ceph_lock); | 844 | spin_lock(&ci->i_ceph_lock); |
845 | if (inode->i_nlink == 1) { | 845 | if (inode->i_nlink == 1) { |
846 | drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); | 846 | drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); |
847 | ci->i_ceph_flags |= CEPH_I_NODELAY; | 847 | ci->i_ceph_flags |= CEPH_I_NODELAY; |
848 | } | 848 | } |
849 | spin_unlock(&ci->i_ceph_lock); | 849 | spin_unlock(&ci->i_ceph_lock); |
850 | return drop; | 850 | return drop; |
851 | } | 851 | } |
852 | 852 | ||
/*
 * rmdir and unlink are differ only by the metadata op code
 */
static int ceph_unlink(struct inode *dir, struct dentry *dentry)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct inode *inode = dentry->d_inode;
	struct ceph_mds_request *req;
	int err = -EROFS;
	int op;

	if (ceph_snap(dir) == CEPH_SNAPDIR) {
		/* rmdir .snap/foo is RMSNAP */
		dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
		     dentry->d_name.name, dentry);
		op = CEPH_MDS_OP_RMSNAP;
	} else if (ceph_snap(dir) == CEPH_NOSNAP) {
		dout("unlink/rmdir dir %p dn %p inode %p\n",
		     dir, dentry, inode);
		/* pick the op from the victim's type */
		op = S_ISDIR(dentry->d_inode->i_mode) ?
			CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
	} else
		/* anywhere else inside a snapshot is read-only */
		goto out;
	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}
	req->r_dentry = dget(dentry);	/* ref released with the request */
	req->r_num_caps = 2;
	req->r_locked_dir = dir;
	/* drop FILE_SHARED on the dir unless we hold FILE_EXCL */
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	/* shed caps on the victim inode (see drop_caps_for_unlink) */
	req->r_inode_drop = drop_caps_for_unlink(inode);
	err = ceph_mdsc_do_request(mdsc, dir, req);
	/* no trace in the reply?  delete the dentry ourselves */
	if (!err && !req->r_reply_info.head->is_dentry)
		d_delete(dentry);
	ceph_mdsc_put_request(req);
out:
	return err;
}
895 | 895 | ||
/*
 * Rename old_dir/old_dentry to new_dir/new_dentry via an MDS RENAME
 * request.  Returns 0 on success or a negative errno.
 */
static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
		       struct inode *new_dir, struct dentry *new_dentry)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	int err;

	/* cannot rename across differing snapshot contexts */
	if (ceph_snap(old_dir) != ceph_snap(new_dir))
		return -EXDEV;
	/* snapshots are read-only */
	if (ceph_snap(old_dir) != CEPH_NOSNAP ||
	    ceph_snap(new_dir) != CEPH_NOSNAP)
		return -EROFS;
	dout("rename dir %p dentry %p to dir %p dentry %p\n",
	     old_dir, old_dentry, new_dir, new_dentry);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
	req->r_dentry = dget(new_dentry);
	req->r_num_caps = 2;
	req->r_old_dentry = dget(old_dentry);
	req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
	req->r_locked_dir = new_dir;
	/* drop shared caps on both dentries unless exclusively held */
	req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	/* release LINK_RDCACHE on source inode (mds will lock it) */
	req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
	/* if the target exists, this rename unlinks it */
	if (new_dentry->d_inode)
		req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
	err = ceph_mdsc_do_request(mdsc, old_dir, req);
	if (!err && !req->r_reply_info.head->is_dentry) {
		/*
		 * Normally d_move() is done by fill_trace (called by
		 * do_request, above).  If there is no trace, we need
		 * to do it here.
		 */

		/* d_move screws up d_subdirs order */
		ceph_dir_clear_complete(new_dir);

		d_move(old_dentry, new_dentry);

		/* ensure target dentry is invalidated, despite
		   rehashing bug in vfs_rename_dir */
		ceph_invalidate_dentry_lease(new_dentry);
	}
	ceph_mdsc_put_request(req);
	return err;
}
947 | 947 | ||
/*
 * Ensure a dentry lease will no longer revalidate.
 *
 * Setting d_time to the current jiffies and zeroing lease_shared_gen
 * makes both the per-dentry lease check and the directory-wide lease
 * check fail on the next d_revalidate.
 */
void ceph_invalidate_dentry_lease(struct dentry *dentry)
{
	spin_lock(&dentry->d_lock);
	dentry->d_time = jiffies;
	ceph_dentry(dentry)->lease_shared_gen = 0;
	spin_unlock(&dentry->d_lock);
}
958 | 958 | ||
/*
 * Check if dentry lease is valid.  If not, delete the lease.  Try to
 * renew if the lease is more than half up.
 *
 * Returns 1 if the lease is valid, 0 otherwise.  May send an async
 * LEASE_RENEW message (outside all locks) as a side effect.
 */
static int dentry_lease_is_valid(struct dentry *dentry)
{
	struct ceph_dentry_info *di;
	struct ceph_mds_session *s;
	int valid = 0;
	u32 gen;
	unsigned long ttl;
	struct ceph_mds_session *session = NULL;
	struct inode *dir = NULL;
	u32 seq = 0;

	spin_lock(&dentry->d_lock);
	di = ceph_dentry(dentry);
	if (di->lease_session) {
		s = di->lease_session;
		/*
		 * s_gen_ttl_lock nests inside d_lock here; it protects
		 * only s_cap_gen/s_cap_ttl (introduced to avoid a lock
		 * inversion with the wider s_cap_lock).
		 */
		spin_lock(&s->s_gen_ttl_lock);
		gen = s->s_cap_gen;
		ttl = s->s_cap_ttl;
		spin_unlock(&s->s_gen_ttl_lock);

		/* lease is valid only for the current session generation
		 * and while neither the dentry nor session ttl expired */
		if (di->lease_gen == gen &&
		    time_before(jiffies, dentry->d_time) &&
		    time_before(jiffies, ttl)) {
			valid = 1;
			if (di->lease_renew_after &&
			    time_after(jiffies, di->lease_renew_after)) {
				/* we should renew */
				dir = dentry->d_parent->d_inode;
				session = ceph_get_mds_session(s);
				seq = di->lease_seq;
				di->lease_renew_after = 0;
				di->lease_renew_from = jiffies;
			}
		}
	}
	spin_unlock(&dentry->d_lock);

	/* send the renewal after dropping d_lock */
	if (session) {
		ceph_mdsc_lease_send_msg(session, dir, dentry,
					 CEPH_MDS_LEASE_RENEW, seq);
		ceph_put_mds_session(session);
	}
	dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
	return valid;
}
1008 | 1008 | ||
/*
 * Check if directory-wide content lease/cap is valid.
 *
 * Valid when the dentry's cached lease_shared_gen matches the parent
 * directory's current i_shared_gen AND we hold the FILE_SHARED cap on
 * the directory.  Returns 1 if valid, 0 otherwise.
 */
static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
{
	struct ceph_inode_info *ci = ceph_inode(dir);
	struct ceph_dentry_info *di = ceph_dentry(dentry);
	int valid = 0;

	spin_lock(&ci->i_ceph_lock);
	if (ci->i_shared_gen == di->lease_shared_gen)
		valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
	spin_unlock(&ci->i_ceph_lock);
	dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
	     dir, (unsigned)ci->i_shared_gen, dentry,
	     (unsigned)di->lease_shared_gen, valid);
	return valid;
}
1027 | 1027 | ||
/*
 * Check if cached dentry can be trusted.
 *
 * Returns 1 (valid), 0 (invalid; dentry is dropped), or -ECHILD when
 * called in RCU-walk mode, which we cannot handle because we take
 * locks and references below.
 */
static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
	int valid = 0;
	struct inode *dir;

	if (nd && nd->flags & LOOKUP_RCU)
		return -ECHILD;

	dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
	     dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
	     ceph_dentry(dentry)->offset);

	/* takes a reference on the parent inode; released via iput below */
	dir = ceph_get_dentry_parent_inode(dentry);

	/* always trust cached snapped dentries, snapdir dentry */
	if (ceph_snap(dir) != CEPH_NOSNAP) {
		dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
		     dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
		valid = 1;
	} else if (dentry->d_inode &&
		   ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) {
		valid = 1;
	} else if (dentry_lease_is_valid(dentry) ||
		   dir_lease_is_valid(dir, dentry)) {
		/* either a per-dentry lease or the directory-wide
		 * shared cap vouches for this entry */
		valid = 1;
	}

	dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
	if (valid)
		ceph_dentry_lru_touch(dentry);
	else
		d_drop(dentry);
	iput(dir);
	return valid;
}
1066 | 1066 | ||
1067 | /* | 1067 | /* |
1068 | * Release our ceph_dentry_info. | 1068 | * Release our ceph_dentry_info. |
1069 | */ | 1069 | */ |
1070 | static void ceph_d_release(struct dentry *dentry) | 1070 | static void ceph_d_release(struct dentry *dentry) |
1071 | { | 1071 | { |
1072 | struct ceph_dentry_info *di = ceph_dentry(dentry); | 1072 | struct ceph_dentry_info *di = ceph_dentry(dentry); |
1073 | 1073 | ||
1074 | dout("d_release %p\n", dentry); | 1074 | dout("d_release %p\n", dentry); |
1075 | ceph_dentry_lru_del(dentry); | 1075 | ceph_dentry_lru_del(dentry); |
1076 | if (di->lease_session) | 1076 | if (di->lease_session) |
1077 | ceph_put_mds_session(di->lease_session); | 1077 | ceph_put_mds_session(di->lease_session); |
1078 | kmem_cache_free(ceph_dentry_cachep, di); | 1078 | kmem_cache_free(ceph_dentry_cachep, di); |
1079 | dentry->d_fsdata = NULL; | 1079 | dentry->d_fsdata = NULL; |
1080 | } | 1080 | } |
1081 | 1081 | ||
/*
 * d_revalidate for snapdir dentries: always report valid.
 *
 * Eventually, we'll want to revalidate snapped metadata
 * too... probably...
 */
static int ceph_snapdir_d_revalidate(struct dentry *dentry,
				     struct nameidata *nd)
{
	return 1;
}
1091 | 1091 | ||
1092 | /* | 1092 | /* |
1093 | * Set/clear/test dir complete flag on the dir's dentry. | 1093 | * Set/clear/test dir complete flag on the dir's dentry. |
1094 | */ | 1094 | */ |
1095 | void ceph_dir_set_complete(struct inode *inode) | 1095 | void ceph_dir_set_complete(struct inode *inode) |
1096 | { | 1096 | { |
1097 | struct dentry *dentry = d_find_any_alias(inode); | 1097 | struct dentry *dentry = d_find_any_alias(inode); |
1098 | 1098 | ||
1099 | if (dentry && ceph_dentry(dentry) && | 1099 | if (dentry && ceph_dentry(dentry) && |
1100 | ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) { | 1100 | ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) { |
1101 | dout(" marking %p (%p) complete\n", inode, dentry); | 1101 | dout(" marking %p (%p) complete\n", inode, dentry); |
1102 | set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); | 1102 | set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); |
1103 | } | 1103 | } |
1104 | dput(dentry); | 1104 | dput(dentry); |
1105 | } | 1105 | } |
1106 | 1106 | ||
1107 | void ceph_dir_clear_complete(struct inode *inode) | 1107 | void ceph_dir_clear_complete(struct inode *inode) |
1108 | { | 1108 | { |
1109 | struct dentry *dentry = d_find_any_alias(inode); | 1109 | struct dentry *dentry = d_find_any_alias(inode); |
1110 | 1110 | ||
1111 | if (dentry && ceph_dentry(dentry)) { | 1111 | if (dentry && ceph_dentry(dentry)) { |
1112 | dout(" marking %p (%p) complete\n", inode, dentry); | 1112 | dout(" marking %p (%p) complete\n", inode, dentry); |
1113 | set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); | 1113 | set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); |
1114 | } | 1114 | } |
1115 | dput(dentry); | 1115 | dput(dentry); |
1116 | } | 1116 | } |
1117 | 1117 | ||
1118 | bool ceph_dir_test_complete(struct inode *inode) | 1118 | bool ceph_dir_test_complete(struct inode *inode) |
1119 | { | 1119 | { |
1120 | struct dentry *dentry = d_find_any_alias(inode); | 1120 | struct dentry *dentry = d_find_any_alias(inode); |
1121 | 1121 | ||
1122 | if (dentry && ceph_dentry(dentry)) { | 1122 | if (dentry && ceph_dentry(dentry)) { |
1123 | dout(" marking %p (%p) NOT complete\n", inode, dentry); | 1123 | dout(" marking %p (%p) NOT complete\n", inode, dentry); |
1124 | clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); | 1124 | clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); |
1125 | } | 1125 | } |
1126 | dput(dentry); | 1126 | dput(dentry); |
1127 | return false; | 1127 | return false; |
1128 | } | 1128 | } |
1129 | 1129 | ||
/*
 * When the VFS prunes a dentry from the cache, we need to clear the
 * complete flag on the parent directory.
 *
 * Called under dentry->d_lock.
 */
static void ceph_d_prune(struct dentry *dentry)
{
	struct ceph_dentry_info *di;

	dout("ceph_d_prune %p\n", dentry);

	/* do we have a valid parent? */
	if (!dentry->d_parent || IS_ROOT(dentry))
		return;

	/* if we are not hashed, we don't affect D_COMPLETE */
	if (d_unhashed(dentry))
		return;

	/*
	 * we hold d_lock, so d_parent is stable, and d_fsdata is never
	 * cleared until d_release
	 */
	di = ceph_dentry(dentry->d_parent);
	clear_bit(CEPH_D_COMPLETE, &di->flags);
}
1157 | 1157 | ||
/*
 * read() on a dir.  This weird interface hack only works if mounted
 * with '-o dirstat'.
 *
 * On first read the directory statistics are formatted into a buffer
 * cached on the open file (cf->dir_info); subsequent reads serve from
 * that snapshot.  Returns bytes copied, 0 at EOF, or a negative errno.
 */
static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
			     loff_t *ppos)
{
	struct ceph_file_info *cf = file->private_data;
	struct inode *inode = file->f_dentry->d_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	int left;
	const int bufsize = 1024;

	if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
		return -EISDIR;

	/* lazily build the stats text on first read of this open file */
	if (!cf->dir_info) {
		cf->dir_info = kmalloc(bufsize, GFP_NOFS);
		if (!cf->dir_info)
			return -ENOMEM;
		cf->dir_info_len =
			snprintf(cf->dir_info, bufsize,
				 "entries:   %20lld\n"
				 " files:    %20lld\n"
				 " subdirs:  %20lld\n"
				 "rentries:  %20lld\n"
				 " rfiles:   %20lld\n"
				 " rsubdirs: %20lld\n"
				 "rbytes:    %20lld\n"
				 "rctime:    %10ld.%09ld\n",
				 ci->i_files + ci->i_subdirs,
				 ci->i_files,
				 ci->i_subdirs,
				 ci->i_rfiles + ci->i_rsubdirs,
				 ci->i_rfiles,
				 ci->i_rsubdirs,
				 ci->i_rbytes,
				 (long)ci->i_rctime.tv_sec,
				 (long)ci->i_rctime.tv_nsec);
	}

	if (*ppos >= cf->dir_info_len)
		return 0;
	size = min_t(unsigned, size, cf->dir_info_len-*ppos);
	/* copy_to_user returns the number of bytes NOT copied */
	left = copy_to_user(buf, cf->dir_info + *ppos, size);
	if (left == size)
		return -EFAULT;	/* nothing copied at all */
	/* deliberate: a partial copy is reported as a short read */
	*ppos += (size - left);
	return size - left;
}
1208 | 1208 | ||
/*
 * an fsync() on a dir will wait for any uncommitted directory
 * operations to commit.
 *
 * We snapshot the tid of the newest unsafe request and then wait, one
 * request at a time, until every request up to that tid has reached
 * its safe (committed) completion.  i_unsafe_lock is dropped while
 * sleeping on each request and reacquired afterwards, which is why a
 * reference is taken on the request before unlocking.
 */
static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
			  int datasync)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct list_head *head = &ci->i_unsafe_dirops;
	struct ceph_mds_request *req;
	u64 last_tid;
	int ret = 0;

	dout("dir_fsync %p\n", inode);
	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (ret)
		return ret;
	mutex_lock(&inode->i_mutex);

	spin_lock(&ci->i_unsafe_lock);
	if (list_empty(head))
		goto out;

	/* newest unsafe op: everything up to this tid must commit */
	req = list_entry(head->prev,
			 struct ceph_mds_request, r_unsafe_dir_item);
	last_tid = req->r_tid;

	do {
		/* hold a ref so req survives while we sleep unlocked */
		ceph_mdsc_get_request(req);
		spin_unlock(&ci->i_unsafe_lock);

		dout("dir_fsync %p wait on tid %llu (until %llu)\n",
		     inode, req->r_tid, last_tid);
		if (req->r_timeout) {
			ret = wait_for_completion_timeout(
				&req->r_safe_completion, req->r_timeout);
			if (ret > 0)
				ret = 0;
			else if (ret == 0)
				ret = -EIO;  /* timed out */
		} else {
			wait_for_completion(&req->r_safe_completion);
		}
		ceph_mdsc_put_request(req);

		spin_lock(&ci->i_unsafe_lock);
		if (ret || list_empty(head))
			break;
		/* oldest remaining unsafe op */
		req = list_entry(head->next,
				 struct ceph_mds_request, r_unsafe_dir_item);
	} while (req->r_tid < last_tid);
out:
	spin_unlock(&ci->i_unsafe_lock);
	mutex_unlock(&inode->i_mutex);

	return ret;
}
1267 | 1267 | ||
1268 | /* | 1268 | /* |
1269 | * We maintain a private dentry LRU. | 1269 | * We maintain a private dentry LRU. |
1270 | * | 1270 | * |
1271 | * FIXME: this needs to be changed to a per-mds lru to be useful. | 1271 | * FIXME: this needs to be changed to a per-mds lru to be useful. |
1272 | */ | 1272 | */ |
1273 | void ceph_dentry_lru_add(struct dentry *dn) | 1273 | void ceph_dentry_lru_add(struct dentry *dn) |
1274 | { | 1274 | { |
1275 | struct ceph_dentry_info *di = ceph_dentry(dn); | 1275 | struct ceph_dentry_info *di = ceph_dentry(dn); |
1276 | struct ceph_mds_client *mdsc; | 1276 | struct ceph_mds_client *mdsc; |
1277 | 1277 | ||
1278 | dout("dentry_lru_add %p %p '%.*s'\n", di, dn, | 1278 | dout("dentry_lru_add %p %p '%.*s'\n", di, dn, |
1279 | dn->d_name.len, dn->d_name.name); | 1279 | dn->d_name.len, dn->d_name.name); |
1280 | mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; | 1280 | mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; |
1281 | spin_lock(&mdsc->dentry_lru_lock); | 1281 | spin_lock(&mdsc->dentry_lru_lock); |
1282 | list_add_tail(&di->lru, &mdsc->dentry_lru); | 1282 | list_add_tail(&di->lru, &mdsc->dentry_lru); |
1283 | mdsc->num_dentry++; | 1283 | mdsc->num_dentry++; |
1284 | spin_unlock(&mdsc->dentry_lru_lock); | 1284 | spin_unlock(&mdsc->dentry_lru_lock); |
1285 | } | 1285 | } |
1286 | 1286 | ||
1287 | void ceph_dentry_lru_touch(struct dentry *dn) | 1287 | void ceph_dentry_lru_touch(struct dentry *dn) |
1288 | { | 1288 | { |
1289 | struct ceph_dentry_info *di = ceph_dentry(dn); | 1289 | struct ceph_dentry_info *di = ceph_dentry(dn); |
1290 | struct ceph_mds_client *mdsc; | 1290 | struct ceph_mds_client *mdsc; |
1291 | 1291 | ||
1292 | dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, | 1292 | dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, |
1293 | dn->d_name.len, dn->d_name.name, di->offset); | 1293 | dn->d_name.len, dn->d_name.name, di->offset); |
1294 | mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; | 1294 | mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; |
1295 | spin_lock(&mdsc->dentry_lru_lock); | 1295 | spin_lock(&mdsc->dentry_lru_lock); |
1296 | list_move_tail(&di->lru, &mdsc->dentry_lru); | 1296 | list_move_tail(&di->lru, &mdsc->dentry_lru); |
1297 | spin_unlock(&mdsc->dentry_lru_lock); | 1297 | spin_unlock(&mdsc->dentry_lru_lock); |
1298 | } | 1298 | } |
1299 | 1299 | ||
1300 | void ceph_dentry_lru_del(struct dentry *dn) | 1300 | void ceph_dentry_lru_del(struct dentry *dn) |
1301 | { | 1301 | { |
1302 | struct ceph_dentry_info *di = ceph_dentry(dn); | 1302 | struct ceph_dentry_info *di = ceph_dentry(dn); |
1303 | struct ceph_mds_client *mdsc; | 1303 | struct ceph_mds_client *mdsc; |
1304 | 1304 | ||
1305 | dout("dentry_lru_del %p %p '%.*s'\n", di, dn, | 1305 | dout("dentry_lru_del %p %p '%.*s'\n", di, dn, |
1306 | dn->d_name.len, dn->d_name.name); | 1306 | dn->d_name.len, dn->d_name.name); |
1307 | mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; | 1307 | mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; |
1308 | spin_lock(&mdsc->dentry_lru_lock); | 1308 | spin_lock(&mdsc->dentry_lru_lock); |
1309 | list_del_init(&di->lru); | 1309 | list_del_init(&di->lru); |
1310 | mdsc->num_dentry--; | 1310 | mdsc->num_dentry--; |
1311 | spin_unlock(&mdsc->dentry_lru_lock); | 1311 | spin_unlock(&mdsc->dentry_lru_lock); |
1312 | } | 1312 | } |
1313 | 1313 | ||
1314 | /* | 1314 | /* |
1315 | * Return name hash for a given dentry. This is dependent on | 1315 | * Return name hash for a given dentry. This is dependent on |
1316 | * the parent directory's hash function. | 1316 | * the parent directory's hash function. |
1317 | */ | 1317 | */ |
1318 | unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) | 1318 | unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) |
1319 | { | 1319 | { |
1320 | struct ceph_inode_info *dci = ceph_inode(dir); | 1320 | struct ceph_inode_info *dci = ceph_inode(dir); |
1321 | 1321 | ||
1322 | switch (dci->i_dir_layout.dl_dir_hash) { | 1322 | switch (dci->i_dir_layout.dl_dir_hash) { |
1323 | case 0: /* for backward compat */ | 1323 | case 0: /* for backward compat */ |
1324 | case CEPH_STR_HASH_LINUX: | 1324 | case CEPH_STR_HASH_LINUX: |
1325 | return dn->d_name.hash; | 1325 | return dn->d_name.hash; |
1326 | 1326 | ||
1327 | default: | 1327 | default: |
1328 | return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, | 1328 | return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, |
1329 | dn->d_name.name, dn->d_name.len); | 1329 | dn->d_name.name, dn->d_name.len); |
1330 | } | 1330 | } |
1331 | } | 1331 | } |
1332 | 1332 | ||
/* file_operations for directories (read only works with -o dirstat) */
const struct file_operations ceph_dir_fops = {
	.read = ceph_read_dir,
	.readdir = ceph_readdir,
	.llseek = ceph_dir_llseek,
	.open = ceph_open,
	.release = ceph_release,
	.unlocked_ioctl = ceph_ioctl,
	.fsync = ceph_dir_fsync,
};
1342 | 1342 | ||
/* inode_operations for directories; note rmdir shares ceph_unlink */
const struct inode_operations ceph_dir_iops = {
	.lookup = ceph_lookup,
	.permission = ceph_permission,
	.getattr = ceph_getattr,
	.setattr = ceph_setattr,
	.setxattr = ceph_setxattr,
	.getxattr = ceph_getxattr,
	.listxattr = ceph_listxattr,
	.removexattr = ceph_removexattr,
	.mknod = ceph_mknod,
	.symlink = ceph_symlink,
	.mkdir = ceph_mkdir,
	.link = ceph_link,
	.unlink = ceph_unlink,
	.rmdir = ceph_unlink,
	.rename = ceph_rename,
	.create = ceph_create,
};
1361 | 1361 | ||
/* dentry operations for regular (non-snapshot) dentries */
const struct dentry_operations ceph_dentry_ops = {
	.d_revalidate = ceph_d_revalidate,
	.d_release = ceph_d_release,
	.d_prune = ceph_d_prune,
};
1367 | 1367 | ||
/* dentry operations for the .snap directory itself (snapdir revalidate) */
const struct dentry_operations ceph_snapdir_dentry_ops = {
	.d_revalidate = ceph_snapdir_d_revalidate,
	.d_release = ceph_d_release,
};
1372 | 1372 | ||
/* dentry operations for dentries inside a snapshot (no revalidate) */
const struct dentry_operations ceph_snap_dentry_ops = {
	.d_release = ceph_d_release,
	.d_prune = ceph_d_prune,
};
1377 | 1377 |
fs/ceph/mds_client.c
1 | #include <linux/ceph/ceph_debug.h> | 1 | #include <linux/ceph/ceph_debug.h> |
2 | 2 | ||
3 | #include <linux/fs.h> | 3 | #include <linux/fs.h> |
4 | #include <linux/wait.h> | 4 | #include <linux/wait.h> |
5 | #include <linux/slab.h> | 5 | #include <linux/slab.h> |
6 | #include <linux/sched.h> | 6 | #include <linux/sched.h> |
7 | #include <linux/debugfs.h> | 7 | #include <linux/debugfs.h> |
8 | #include <linux/seq_file.h> | 8 | #include <linux/seq_file.h> |
9 | 9 | ||
10 | #include "super.h" | 10 | #include "super.h" |
11 | #include "mds_client.h" | 11 | #include "mds_client.h" |
12 | 12 | ||
13 | #include <linux/ceph/messenger.h> | 13 | #include <linux/ceph/messenger.h> |
14 | #include <linux/ceph/decode.h> | 14 | #include <linux/ceph/decode.h> |
15 | #include <linux/ceph/pagelist.h> | 15 | #include <linux/ceph/pagelist.h> |
16 | #include <linux/ceph/auth.h> | 16 | #include <linux/ceph/auth.h> |
17 | #include <linux/ceph/debugfs.h> | 17 | #include <linux/ceph/debugfs.h> |
18 | 18 | ||
/*
 * A cluster of MDS (metadata server) daemons is responsible for
 * managing the file system namespace (the directory hierarchy and
 * inodes) and for coordinating shared access to storage.  Metadata is
 * partitioned hierarchically across a number of servers, and that
 * partition varies over time as the cluster adjusts the distribution
 * in order to balance load.
 *
 * The MDS client is primarily responsible for managing synchronous
 * metadata requests for operations like open, unlink, and so forth.
 * If there is an MDS failure, we find out about it when we (possibly
 * request and) receive a new MDS map, and can resubmit affected
 * requests.
 *
 * For the most part, though, we take advantage of a lossless
 * communications channel to the MDS, and do not need to worry about
 * timing out or resubmitting requests.
 *
 * We maintain a stateful "session" with each MDS we interact with.
 * Within each session, we send periodic heartbeat messages to ensure
 * any capabilities or leases we have been issued remain valid.  If
 * the session times out and goes stale, our leases and capabilities
 * are no longer valid.
 */
43 | 43 | ||
/* state carried while building an MDS reconnect message */
struct ceph_reconnect_state {
	struct ceph_pagelist *pagelist;	/* accumulates the encoded payload */
	bool flock;	/* include file-locking state? -- presumably a peer
			 * feature flag; confirm against the caller */
};
48 | 48 | ||
49 | static void __wake_requests(struct ceph_mds_client *mdsc, | 49 | static void __wake_requests(struct ceph_mds_client *mdsc, |
50 | struct list_head *head); | 50 | struct list_head *head); |
51 | 51 | ||
52 | static const struct ceph_connection_operations mds_con_ops; | 52 | static const struct ceph_connection_operations mds_con_ops; |
53 | 53 | ||
54 | 54 | ||
55 | /* | 55 | /* |
56 | * mds reply parsing | 56 | * mds reply parsing |
57 | */ | 57 | */ |
58 | 58 | ||
/*
 * parse individual inode info
 *
 * Decodes one ceph_mds_reply_inode record at *p (advancing *p past it),
 * recording pointers into the message buffer in @info: the raw inode
 * struct, the symlink target, the optional dir layout, and the xattr
 * blob.  Returns 0 on success or -EIO if the buffer is exhausted.
 */
static int parse_reply_info_in(void **p, void *end,
			       struct ceph_mds_reply_info_in *info,
			       int features)
{
	int err = -EIO;

	/* fixed inode struct plus nsplits fragtree entries.
	 * NOTE(review): info->in->fragtree.nsplits is read before *p is
	 * bounds-checked against end; the *_safe decodes below catch an
	 * overrun of *p afterwards, but the read itself is unchecked. */
	info->in = *p;
	*p += sizeof(struct ceph_mds_reply_inode) +
		sizeof(*info->in->fragtree.splits) *
		le32_to_cpu(info->in->fragtree.nsplits);

	/* length-prefixed symlink target (empty for non-symlinks) */
	ceph_decode_32_safe(p, end, info->symlink_len, bad);
	ceph_decode_need(p, end, info->symlink_len, bad);
	info->symlink = *p;
	*p += info->symlink_len;

	/* dir layout is only present if the peer advertises the feature */
	if (features & CEPH_FEATURE_DIRLAYOUTHASH)
		ceph_decode_copy_safe(p, end, &info->dir_layout,
				      sizeof(info->dir_layout), bad);
	else
		memset(&info->dir_layout, 0, sizeof(info->dir_layout));

	/* length-prefixed xattr blob */
	ceph_decode_32_safe(p, end, info->xattr_len, bad);
	ceph_decode_need(p, end, info->xattr_len, bad);
	info->xattr_data = *p;
	*p += info->xattr_len;
	return 0;
bad:
	return err;
}
92 | 92 | ||
/*
 * parse a normal reply, which may contain a (dir+)dentry and/or a
 * target inode.
 *
 * The trace section must be consumed exactly: *p must land on @end.
 * Returns 0 on success, -EIO (or a sub-parser's error) on failure.
 */
static int parse_reply_info_trace(void **p, void *end,
				  struct ceph_mds_reply_info_parsed *info,
				  int features)
{
	int err;

	if (info->head->is_dentry) {
		/* inode info for the containing directory */
		err = parse_reply_info_in(p, end, &info->diri, features);
		if (err < 0)
			goto out_bad;

		/* fixed dirfrag header followed by ndist u32 entries */
		if (unlikely(*p + sizeof(*info->dirfrag) > end))
			goto bad;
		info->dirfrag = *p;
		*p += sizeof(*info->dirfrag) +
			sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
		if (unlikely(*p > end))
			goto bad;

		/* length-prefixed dentry name ... */
		ceph_decode_32_safe(p, end, info->dname_len, bad);
		ceph_decode_need(p, end, info->dname_len, bad);
		info->dname = *p;
		*p += info->dname_len;
		/* ... followed by its lease.  NOTE(review): *p is advanced
		 * past the lease without an explicit bounds check here; the
		 * final (*p != end) check below is what catches overrun. */
		info->dlease = *p;
		*p += sizeof(*info->dlease);
	}

	if (info->head->is_target) {
		/* inode info for the operation's target */
		err = parse_reply_info_in(p, end, &info->targeti, features);
		if (err < 0)
			goto out_bad;
	}

	/* trailing bytes (or overrun) mean a malformed trace */
	if (unlikely(*p != end))
		goto bad;
	return 0;

bad:
	err = -EIO;
out_bad:
	pr_err("problem parsing mds trace %d\n", err);
	return err;
}
140 | 140 | ||
/*
 * parse readdir results
 *
 * Decodes a dirfrag header, an entry count with end/complete flags,
 * and then (name, lease, inode-info) triples for each entry.  All of
 * the per-entry bookkeeping lives in one kcalloc'd region, freed later
 * by destroy_reply_info().  Returns 0, -ENOMEM, or -EIO.
 */
static int parse_reply_info_dir(void **p, void *end,
				struct ceph_mds_reply_info_parsed *info,
				int features)
{
	u32 num, i = 0;
	int err;

	/* fixed dirfrag header followed by ndist u32 replica entries */
	info->dir_dir = *p;
	if (*p + sizeof(*info->dir_dir) > end)
		goto bad;
	*p += sizeof(*info->dir_dir) +
		sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
	if (*p > end)
		goto bad;

	/* entry count (u32) plus the dir_end/dir_complete flag bytes */
	ceph_decode_need(p, end, sizeof(num) + 2, bad);
	num = ceph_decode_32(p);
	info->dir_end = ceph_decode_8(p);
	info->dir_complete = ceph_decode_8(p);
	if (num == 0)
		goto done;

	/* alloc large array: a single allocation backing four parallel
	 * per-entry arrays (inode info, name ptr, name len, lease ptr);
	 * kcalloc checks the num * size multiplication for overflow */
	info->dir_nr = num;
	info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
			       sizeof(*info->dir_dname) +
			       sizeof(*info->dir_dname_len) +
			       sizeof(*info->dir_dlease),
			       GFP_NOFS);
	if (info->dir_in == NULL) {
		err = -ENOMEM;
		goto out_bad;
	}
	/* carve the parallel arrays out of the single region */
	info->dir_dname = (void *)(info->dir_in + num);
	info->dir_dname_len = (void *)(info->dir_dname + num);
	info->dir_dlease = (void *)(info->dir_dname_len + num);

	while (num) {
		/* dentry */
		ceph_decode_need(p, end, sizeof(u32)*2, bad);
		info->dir_dname_len[i] = ceph_decode_32(p);
		ceph_decode_need(p, end, info->dir_dname_len[i], bad);
		info->dir_dname[i] = *p;
		*p += info->dir_dname_len[i];
		dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
		     info->dir_dname[i]);
		/* lease follows the name.  NOTE(review): *p is advanced
		 * past it without a fresh bounds check; the final
		 * (*p != end) test catches an overrun. */
		info->dir_dlease[i] = *p;
		*p += sizeof(struct ceph_mds_reply_lease);

		/* inode */
		err = parse_reply_info_in(p, end, &info->dir_in[i], features);
		if (err < 0)
			goto out_bad;
		i++;
		num--;
	}

done:
	/* the section must be consumed exactly */
	if (*p != end)
		goto bad;
	return 0;

bad:
	err = -EIO;
out_bad:
	pr_err("problem parsing dir contents %d\n", err);
	return err;
}
212 | 212 | ||
213 | /* | 213 | /* |
214 | * parse fcntl F_GETLK results | 214 | * parse fcntl F_GETLK results |
215 | */ | 215 | */ |
216 | static int parse_reply_info_filelock(void **p, void *end, | 216 | static int parse_reply_info_filelock(void **p, void *end, |
217 | struct ceph_mds_reply_info_parsed *info, | 217 | struct ceph_mds_reply_info_parsed *info, |
218 | int features) | 218 | int features) |
219 | { | 219 | { |
220 | if (*p + sizeof(*info->filelock_reply) > end) | 220 | if (*p + sizeof(*info->filelock_reply) > end) |
221 | goto bad; | 221 | goto bad; |
222 | 222 | ||
223 | info->filelock_reply = *p; | 223 | info->filelock_reply = *p; |
224 | *p += sizeof(*info->filelock_reply); | 224 | *p += sizeof(*info->filelock_reply); |
225 | 225 | ||
226 | if (unlikely(*p != end)) | 226 | if (unlikely(*p != end)) |
227 | goto bad; | 227 | goto bad; |
228 | return 0; | 228 | return 0; |
229 | 229 | ||
230 | bad: | 230 | bad: |
231 | return -EIO; | 231 | return -EIO; |
232 | } | 232 | } |
233 | 233 | ||
234 | /* | 234 | /* |
235 | * parse extra results | 235 | * parse extra results |
236 | */ | 236 | */ |
237 | static int parse_reply_info_extra(void **p, void *end, | 237 | static int parse_reply_info_extra(void **p, void *end, |
238 | struct ceph_mds_reply_info_parsed *info, | 238 | struct ceph_mds_reply_info_parsed *info, |
239 | int features) | 239 | int features) |
240 | { | 240 | { |
241 | if (info->head->op == CEPH_MDS_OP_GETFILELOCK) | 241 | if (info->head->op == CEPH_MDS_OP_GETFILELOCK) |
242 | return parse_reply_info_filelock(p, end, info, features); | 242 | return parse_reply_info_filelock(p, end, info, features); |
243 | else | 243 | else |
244 | return parse_reply_info_dir(p, end, info, features); | 244 | return parse_reply_info_dir(p, end, info, features); |
245 | } | 245 | } |
246 | 246 | ||
/*
 * parse entire mds reply
 *
 * The reply front is a ceph_mds_reply_head followed by three
 * length-prefixed sections: trace, extra, and snap blob.  Each
 * declared length is validated against the remaining buffer
 * (ceph_decode_need) before p+len is handed to a sub-parser as its
 * 'end', so a bogus length cannot push the sub-parser past the
 * message.  Returns 0 or -EIO / a sub-parser error.
 */
static int parse_reply_info(struct ceph_msg *msg,
			    struct ceph_mds_reply_info_parsed *info,
			    int features)
{
	void *p, *end;
	u32 len;
	int err;

	info->head = msg->front.iov_base;
	p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
	end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);

	/* trace */
	ceph_decode_32_safe(&p, end, len, bad);
	if (len > 0) {
		/* ensure the declared length actually fits in the buffer */
		ceph_decode_need(&p, end, len, bad);
		err = parse_reply_info_trace(&p, p+len, info, features);
		if (err < 0)
			goto out_bad;
	}

	/* extra */
	ceph_decode_32_safe(&p, end, len, bad);
	if (len > 0) {
		/* ensure the declared length actually fits in the buffer */
		ceph_decode_need(&p, end, len, bad);
		err = parse_reply_info_extra(&p, p+len, info, features);
		if (err < 0)
			goto out_bad;
	}

	/* snap blob: kept as a raw (pointer, length) pair for later use */
	ceph_decode_32_safe(&p, end, len, bad);
	info->snapblob_len = len;
	info->snapblob = p;
	p += len;

	/* the whole front must be consumed exactly */
	if (p != end)
		goto bad;
	return 0;

bad:
	err = -EIO;
out_bad:
	pr_err("mds parse_reply err %d\n", err);
	return err;
}
294 | 296 | ||
/*
 * Free the per-entry arrays allocated by parse_reply_info_dir();
 * dir_in is the single backing allocation (may be NULL).
 */
static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
{
	kfree(info->dir_in);
}
299 | 301 | ||
300 | 302 | ||
301 | /* | 303 | /* |
302 | * sessions | 304 | * sessions |
303 | */ | 305 | */ |
304 | static const char *session_state_name(int s) | 306 | static const char *session_state_name(int s) |
305 | { | 307 | { |
306 | switch (s) { | 308 | switch (s) { |
307 | case CEPH_MDS_SESSION_NEW: return "new"; | 309 | case CEPH_MDS_SESSION_NEW: return "new"; |
308 | case CEPH_MDS_SESSION_OPENING: return "opening"; | 310 | case CEPH_MDS_SESSION_OPENING: return "opening"; |
309 | case CEPH_MDS_SESSION_OPEN: return "open"; | 311 | case CEPH_MDS_SESSION_OPEN: return "open"; |
310 | case CEPH_MDS_SESSION_HUNG: return "hung"; | 312 | case CEPH_MDS_SESSION_HUNG: return "hung"; |
311 | case CEPH_MDS_SESSION_CLOSING: return "closing"; | 313 | case CEPH_MDS_SESSION_CLOSING: return "closing"; |
312 | case CEPH_MDS_SESSION_RESTARTING: return "restarting"; | 314 | case CEPH_MDS_SESSION_RESTARTING: return "restarting"; |
313 | case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting"; | 315 | case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting"; |
314 | default: return "???"; | 316 | default: return "???"; |
315 | } | 317 | } |
316 | } | 318 | } |
317 | 319 | ||
/*
 * Take a reference on a session, but only if its refcount is still
 * non-zero -- a session racing toward its final put must not be
 * resurrected.  Returns the session, or NULL if it was already dying.
 */
static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
{
	if (atomic_inc_not_zero(&s->s_ref)) {
		dout("mdsc get_session %p %d -> %d\n", s,
		     atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
		return s;
	} else {
		dout("mdsc get_session %p 0 -- FAIL", s);
		return NULL;
	}
}
329 | 331 | ||
/*
 * Drop a session reference; on the final put, destroy the session's
 * authorizer (if any) and free the session itself.
 */
void ceph_put_mds_session(struct ceph_mds_session *s)
{
	dout("mdsc put_session %p %d -> %d\n", s,
	     atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
	if (atomic_dec_and_test(&s->s_ref)) {
		if (s->s_authorizer)
			s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
				s->s_mdsc->fsc->client->monc.auth,
				s->s_authorizer);
		kfree(s);
	}
}
342 | 344 | ||
/*
 * called under mdsc->mutex
 *
 * Look up the session for @mds and take a reference on it.  Returns
 * NULL if no session is registered for that rank.
 * NOTE(review): the get_session() result is ignored here; this relies
 * on mdsc->mutex keeping a registered session's refcount non-zero.
 */
struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
						   int mds)
{
	struct ceph_mds_session *session;

	if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
		return NULL;
	session = mdsc->sessions[mds];
	dout("lookup_mds_session %p %d\n", session,
	     atomic_read(&session->s_ref));
	get_session(session);
	return session;
}
359 | 361 | ||
360 | static bool __have_session(struct ceph_mds_client *mdsc, int mds) | 362 | static bool __have_session(struct ceph_mds_client *mdsc, int mds) |
361 | { | 363 | { |
362 | if (mds >= mdsc->max_sessions) | 364 | if (mds >= mdsc->max_sessions) |
363 | return false; | 365 | return false; |
364 | return mdsc->sessions[mds]; | 366 | return mdsc->sessions[mds]; |
365 | } | 367 | } |
366 | 368 | ||
367 | static int __verify_registered_session(struct ceph_mds_client *mdsc, | 369 | static int __verify_registered_session(struct ceph_mds_client *mdsc, |
368 | struct ceph_mds_session *s) | 370 | struct ceph_mds_session *s) |
369 | { | 371 | { |
370 | if (s->s_mds >= mdsc->max_sessions || | 372 | if (s->s_mds >= mdsc->max_sessions || |
371 | mdsc->sessions[s->s_mds] != s) | 373 | mdsc->sessions[s->s_mds] != s) |
372 | return -ENOENT; | 374 | return -ENOENT; |
373 | return 0; | 375 | return 0; |
374 | } | 376 | } |
375 | 377 | ||
/*
 * create+register a new session for given mds.
 * called under mdsc->mutex.
 *
 * Allocates and initializes the session (locks, lists, connection),
 * grows the mdsc->sessions[] array if @mds is beyond its current size,
 * installs the session, and opens its connection.  Returns the session
 * with two references (one for sessions[], one for the caller), or an
 * ERR_PTR on allocation failure.
 */
static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
						 int mds)
{
	struct ceph_mds_session *s;

	s = kzalloc(sizeof(*s), GFP_NOFS);
	if (!s)
		return ERR_PTR(-ENOMEM);
	s->s_mdsc = mdsc;
	s->s_mds = mds;
	s->s_state = CEPH_MDS_SESSION_NEW;
	s->s_ttl = 0;
	s->s_seq = 0;
	mutex_init(&s->s_mutex);

	/* wire up the messenger connection to this MDS rank */
	ceph_con_init(mdsc->fsc->client->msgr, &s->s_con);
	s->s_con.private = s;
	s->s_con.ops = &mds_con_ops;
	s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
	s->s_con.peer_name.num = cpu_to_le64(mds);

	/* cap gen/ttl get their own lock, separate from s_cap_lock */
	spin_lock_init(&s->s_gen_ttl_lock);
	s->s_cap_gen = 0;
	s->s_cap_ttl = 0;

	spin_lock_init(&s->s_cap_lock);
	s->s_renew_requested = 0;
	s->s_renew_seq = 0;
	INIT_LIST_HEAD(&s->s_caps);
	s->s_nr_caps = 0;
	s->s_trim_caps = 0;
	atomic_set(&s->s_ref, 1);
	INIT_LIST_HEAD(&s->s_waiting);
	INIT_LIST_HEAD(&s->s_unsafe);
	s->s_num_cap_releases = 0;
	s->s_cap_iterator = NULL;
	INIT_LIST_HEAD(&s->s_cap_releases);
	INIT_LIST_HEAD(&s->s_cap_releases_done);
	INIT_LIST_HEAD(&s->s_cap_flushing);
	INIT_LIST_HEAD(&s->s_cap_snaps_flushing);

	dout("register_session mds%d\n", mds);
	if (mds >= mdsc->max_sessions) {
		/* grow sessions[] to the next power of two above mds */
		int newmax = 1 << get_count_order(mds+1);
		struct ceph_mds_session **sa;

		dout("register_session realloc to %d\n", newmax);
		sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
		if (sa == NULL)
			goto fail_realloc;
		if (mdsc->sessions) {
			memcpy(sa, mdsc->sessions,
			       mdsc->max_sessions * sizeof(void *));
			kfree(mdsc->sessions);
		}
		mdsc->sessions = sa;
		mdsc->max_sessions = newmax;
	}
	mdsc->sessions[mds] = s;
	atomic_inc(&s->s_ref);  /* one ref to sessions[], one to caller */

	ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));

	return s;

fail_realloc:
	kfree(s);
	return ERR_PTR(-ENOMEM);
}
447 | 451 | ||
/*
 * called under mdsc->mutex
 *
 * Remove @s from mdsc->sessions[], close its connection, and drop the
 * sessions[] reference taken at register time.
 */
static void __unregister_session(struct ceph_mds_client *mdsc,
				 struct ceph_mds_session *s)
{
	dout("__unregister_session mds%d %p\n", s->s_mds, s);
	BUG_ON(mdsc->sessions[s->s_mds] != s);
	mdsc->sessions[s->s_mds] = NULL;
	ceph_con_close(&s->s_con);
	ceph_put_mds_session(s);
}
460 | 464 | ||
461 | /* | 465 | /* |
462 | * drop session refs in request. | 466 | * drop session refs in request. |
463 | * | 467 | * |
464 | * should be last request ref, or hold mdsc->mutex | 468 | * should be last request ref, or hold mdsc->mutex |
465 | */ | 469 | */ |
466 | static void put_request_session(struct ceph_mds_request *req) | 470 | static void put_request_session(struct ceph_mds_request *req) |
467 | { | 471 | { |
468 | if (req->r_session) { | 472 | if (req->r_session) { |
469 | ceph_put_mds_session(req->r_session); | 473 | ceph_put_mds_session(req->r_session); |
470 | req->r_session = NULL; | 474 | req->r_session = NULL; |
471 | } | 475 | } |
472 | } | 476 | } |
473 | 477 | ||
/*
 * Final teardown of an MDS request, called when its kref hits zero:
 * drop message refs, parsed-reply storage, inode/dentry references and
 * the CEPH_CAP_PIN refs taken while the request was in flight, path
 * strings, the session ref, and the cap reservation.
 */
void ceph_mdsc_release_request(struct kref *kref)
{
	struct ceph_mds_request *req = container_of(kref,
						    struct ceph_mds_request,
						    r_kref);
	if (req->r_request)
		ceph_msg_put(req->r_request);
	if (req->r_reply) {
		ceph_msg_put(req->r_reply);
		/* frees the readdir arrays parsed out of the reply */
		destroy_reply_info(&req->r_reply_info);
	}
	if (req->r_inode) {
		/* drop the PIN taken when the request was set up */
		ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
		iput(req->r_inode);
	}
	if (req->r_locked_dir)
		ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
	if (req->r_target_inode)
		iput(req->r_target_inode);
	if (req->r_dentry)
		dput(req->r_dentry);
	if (req->r_old_dentry) {
		/*
		 * track (and drop pins for) r_old_dentry_dir
		 * separately, since r_old_dentry's d_parent may have
		 * changed between the dir mutex being dropped and
		 * this request being freed.
		 */
		ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
				  CEPH_CAP_PIN);
		dput(req->r_old_dentry);
		iput(req->r_old_dentry_dir);
	}
	kfree(req->r_path1);
	kfree(req->r_path2);
	put_request_session(req);
	ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
	kfree(req);
}
513 | 517 | ||
514 | /* | 518 | /* |
515 | * lookup session, bump ref if found. | 519 | * lookup session, bump ref if found. |
516 | * | 520 | * |
517 | * called under mdsc->mutex. | 521 | * called under mdsc->mutex. |
518 | */ | 522 | */ |
519 | static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, | 523 | static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, |
520 | u64 tid) | 524 | u64 tid) |
521 | { | 525 | { |
522 | struct ceph_mds_request *req; | 526 | struct ceph_mds_request *req; |
523 | struct rb_node *n = mdsc->request_tree.rb_node; | 527 | struct rb_node *n = mdsc->request_tree.rb_node; |
524 | 528 | ||
525 | while (n) { | 529 | while (n) { |
526 | req = rb_entry(n, struct ceph_mds_request, r_node); | 530 | req = rb_entry(n, struct ceph_mds_request, r_node); |
527 | if (tid < req->r_tid) | 531 | if (tid < req->r_tid) |
528 | n = n->rb_left; | 532 | n = n->rb_left; |
529 | else if (tid > req->r_tid) | 533 | else if (tid > req->r_tid) |
530 | n = n->rb_right; | 534 | n = n->rb_right; |
531 | else { | 535 | else { |
532 | ceph_mdsc_get_request(req); | 536 | ceph_mdsc_get_request(req); |
533 | return req; | 537 | return req; |
534 | } | 538 | } |
535 | } | 539 | } |
536 | return NULL; | 540 | return NULL; |
537 | } | 541 | } |
538 | 542 | ||
539 | static void __insert_request(struct ceph_mds_client *mdsc, | 543 | static void __insert_request(struct ceph_mds_client *mdsc, |
540 | struct ceph_mds_request *new) | 544 | struct ceph_mds_request *new) |
541 | { | 545 | { |
542 | struct rb_node **p = &mdsc->request_tree.rb_node; | 546 | struct rb_node **p = &mdsc->request_tree.rb_node; |
543 | struct rb_node *parent = NULL; | 547 | struct rb_node *parent = NULL; |
544 | struct ceph_mds_request *req = NULL; | 548 | struct ceph_mds_request *req = NULL; |
545 | 549 | ||
546 | while (*p) { | 550 | while (*p) { |
547 | parent = *p; | 551 | parent = *p; |
548 | req = rb_entry(parent, struct ceph_mds_request, r_node); | 552 | req = rb_entry(parent, struct ceph_mds_request, r_node); |
549 | if (new->r_tid < req->r_tid) | 553 | if (new->r_tid < req->r_tid) |
550 | p = &(*p)->rb_left; | 554 | p = &(*p)->rb_left; |
551 | else if (new->r_tid > req->r_tid) | 555 | else if (new->r_tid > req->r_tid) |
552 | p = &(*p)->rb_right; | 556 | p = &(*p)->rb_right; |
553 | else | 557 | else |
554 | BUG(); | 558 | BUG(); |
555 | } | 559 | } |
556 | 560 | ||
557 | rb_link_node(&new->r_node, parent, p); | 561 | rb_link_node(&new->r_node, parent, p); |
558 | rb_insert_color(&new->r_node, &mdsc->request_tree); | 562 | rb_insert_color(&new->r_node, &mdsc->request_tree); |
559 | } | 563 | } |
560 | 564 | ||
/*
 * Register an in-flight request: assign a tid, reserve caps if
 * requested, take a tree reference, insert into the request tree, and
 * (if @dir is given) link the request onto the directory's
 * i_unsafe_dirops list (presumably so waiters on the dir can find
 * outstanding unsafe ops — see the list's consumers).
 *
 * Called under mdsc->mutex.
 */
static void __register_request(struct ceph_mds_client *mdsc,
			       struct ceph_mds_request *req,
			       struct inode *dir)
{
	req->r_tid = ++mdsc->last_tid;
	if (req->r_num_caps)
		ceph_reserve_caps(mdsc, &req->r_caps_reservation,
				  req->r_num_caps);
	dout("__register_request %p tid %lld\n", req, req->r_tid);
	ceph_mdsc_get_request(req);	/* the tree holds a reference */
	__insert_request(mdsc, req);

	req->r_uid = current_fsuid();
	req->r_gid = current_fsgid();

	if (dir) {
		struct ceph_inode_info *ci = ceph_inode(dir);

		ihold(dir);	/* dropped in __unregister_request */
		spin_lock(&ci->i_unsafe_lock);
		req->r_unsafe_dir = dir;
		list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
		spin_unlock(&ci->i_unsafe_lock);
	}
}
592 | 596 | ||
/*
 * Undo __register_request(): remove the request from the request tree,
 * unlink it from the unsafe-dir list, and drop the inode hold and the
 * tree's request reference.
 */
static void __unregister_request(struct ceph_mds_client *mdsc,
				 struct ceph_mds_request *req)
{
	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
	rb_erase(&req->r_node, &mdsc->request_tree);
	RB_CLEAR_NODE(&req->r_node);

	if (req->r_unsafe_dir) {
		struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);

		spin_lock(&ci->i_unsafe_lock);
		list_del_init(&req->r_unsafe_dir_item);
		spin_unlock(&ci->i_unsafe_lock);

		iput(req->r_unsafe_dir);	/* from ihold() at register time */
		req->r_unsafe_dir = NULL;
	}

	ceph_mdsc_put_request(req);	/* drop the tree's reference */
}
613 | 617 | ||
614 | /* | 618 | /* |
615 | * Choose mds to send request to next. If there is a hint set in the | 619 | * Choose mds to send request to next. If there is a hint set in the |
616 | * request (e.g., due to a prior forward hint from the mds), use that. | 620 | * request (e.g., due to a prior forward hint from the mds), use that. |
617 | * Otherwise, consult frag tree and/or caps to identify the | 621 | * Otherwise, consult frag tree and/or caps to identify the |
618 | * appropriate mds. If all else fails, choose randomly. | 622 | * appropriate mds. If all else fails, choose randomly. |
619 | * | 623 | * |
620 | * Called under mdsc->mutex. | 624 | * Called under mdsc->mutex. |
621 | */ | 625 | */ |
622 | static struct dentry *get_nonsnap_parent(struct dentry *dentry) | 626 | static struct dentry *get_nonsnap_parent(struct dentry *dentry) |
623 | { | 627 | { |
624 | /* | 628 | /* |
625 | * we don't need to worry about protecting the d_parent access | 629 | * we don't need to worry about protecting the d_parent access |
626 | * here because we never renaming inside the snapped namespace | 630 | * here because we never renaming inside the snapped namespace |
627 | * except to resplice to another snapdir, and either the old or new | 631 | * except to resplice to another snapdir, and either the old or new |
628 | * result is a valid result. | 632 | * result is a valid result. |
629 | */ | 633 | */ |
630 | while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP) | 634 | while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP) |
631 | dentry = dentry->d_parent; | 635 | dentry = dentry->d_parent; |
632 | return dentry; | 636 | return dentry; |
633 | } | 637 | } |
634 | 638 | ||
/*
 * Pick the mds to send this request to, in order of preference:
 * an explicit resend hint (r_resend_mds), the frag tree and/or caps of
 * the inode the request operates on, and finally a random mds.
 *
 * Called under mdsc->mutex.
 */
static int __choose_mds(struct ceph_mds_client *mdsc,
			struct ceph_mds_request *req)
{
	struct inode *inode;
	struct ceph_inode_info *ci;
	struct ceph_cap *cap;
	int mode = req->r_direct_mode;
	int mds = -1;
	u32 hash = req->r_direct_hash;
	bool is_hash = req->r_direct_is_hash;

	/*
	 * is there a specific mds we should try?  ignore hint if we have
	 * no session and the mds is not up (active or recovering).
	 */
	if (req->r_resend_mds >= 0 &&
	    (__have_session(mdsc, req->r_resend_mds) ||
	     ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
		dout("choose_mds using resend_mds mds%d\n",
		     req->r_resend_mds);
		return req->r_resend_mds;
	}

	if (mode == USE_RANDOM_MDS)
		goto random;

	/* pick the inode whose frag tree / caps will guide the choice */
	inode = NULL;
	if (req->r_inode) {
		inode = req->r_inode;
	} else if (req->r_dentry) {
		/* ignore race with rename; old or new d_parent is okay */
		struct dentry *parent = req->r_dentry->d_parent;
		struct inode *dir = parent->d_inode;

		if (dir->i_sb != mdsc->fsc->sb) {
			/* not this fs! */
			inode = req->r_dentry->d_inode;
		} else if (ceph_snap(dir) != CEPH_NOSNAP) {
			/* direct snapped/virtual snapdir requests
			 * based on parent dir inode */
			struct dentry *dn = get_nonsnap_parent(parent);
			inode = dn->d_inode;
			dout("__choose_mds using nonsnap parent %p\n", inode);
		} else if (req->r_dentry->d_inode) {
			/* dentry target */
			inode = req->r_dentry->d_inode;
		} else {
			/* dir + name: hash the dentry name within the dir */
			inode = dir;
			hash = ceph_dentry_hash(dir, req->r_dentry);
			is_hash = true;
		}
	}

	dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
	     (int)hash, mode);
	if (!inode)
		goto random;
	ci = ceph_inode(inode);

	/* hashed dir operation: consult the directory's frag tree */
	if (is_hash && S_ISDIR(inode->i_mode)) {
		struct ceph_inode_frag frag;
		int found;

		ceph_choose_frag(ci, hash, &frag, &found);
		if (found) {
			if (mode == USE_ANY_MDS && frag.ndist > 0) {
				u8 r;

				/* choose a random replica */
				get_random_bytes(&r, 1);
				r %= frag.ndist;
				mds = frag.dist[r];
				dout("choose_mds %p %llx.%llx "
				     "frag %u mds%d (%d/%d)\n",
				     inode, ceph_vinop(inode),
				     frag.frag, mds,
				     (int)r, frag.ndist);
				/* only use the replica if it is active */
				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
				    CEPH_MDS_STATE_ACTIVE)
					return mds;
			}

			/* since this file/dir wasn't known to be
			 * replicated, then we want to look for the
			 * authoritative mds. */
			mode = USE_AUTH_MDS;
			if (frag.mds >= 0) {
				/* choose auth mds */
				mds = frag.mds;
				dout("choose_mds %p %llx.%llx "
				     "frag %u mds%d (auth)\n",
				     inode, ceph_vinop(inode), frag.frag, mds);
				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
				    CEPH_MDS_STATE_ACTIVE)
					return mds;
			}
		}
	}

	/* fall back to the mds holding our (auth) cap on the inode */
	spin_lock(&ci->i_ceph_lock);
	cap = NULL;
	if (mode == USE_AUTH_MDS)
		cap = ci->i_auth_cap;
	if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
		cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
	if (!cap) {
		spin_unlock(&ci->i_ceph_lock);
		goto random;
	}
	mds = cap->session->s_mds;
	dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
	     inode, ceph_vinop(inode), mds,
	     cap == ci->i_auth_cap ? "auth " : "", cap);
	spin_unlock(&ci->i_ceph_lock);
	return mds;

random:
	mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
	dout("choose_mds chose random mds%d\n", mds);
	return mds;
}
757 | 761 | ||
758 | 762 | ||
759 | /* | 763 | /* |
760 | * session messages | 764 | * session messages |
761 | */ | 765 | */ |
762 | static struct ceph_msg *create_session_msg(u32 op, u64 seq) | 766 | static struct ceph_msg *create_session_msg(u32 op, u64 seq) |
763 | { | 767 | { |
764 | struct ceph_msg *msg; | 768 | struct ceph_msg *msg; |
765 | struct ceph_mds_session_head *h; | 769 | struct ceph_mds_session_head *h; |
766 | 770 | ||
767 | msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS, | 771 | msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS, |
768 | false); | 772 | false); |
769 | if (!msg) { | 773 | if (!msg) { |
770 | pr_err("create_session_msg ENOMEM creating msg\n"); | 774 | pr_err("create_session_msg ENOMEM creating msg\n"); |
771 | return NULL; | 775 | return NULL; |
772 | } | 776 | } |
773 | h = msg->front.iov_base; | 777 | h = msg->front.iov_base; |
774 | h->op = cpu_to_le32(op); | 778 | h->op = cpu_to_le32(op); |
775 | h->seq = cpu_to_le64(seq); | 779 | h->seq = cpu_to_le64(seq); |
776 | return msg; | 780 | return msg; |
777 | } | 781 | } |
778 | 782 | ||
779 | /* | 783 | /* |
780 | * send session open request. | 784 | * send session open request. |
781 | * | 785 | * |
782 | * called under mdsc->mutex | 786 | * called under mdsc->mutex |
783 | */ | 787 | */ |
784 | static int __open_session(struct ceph_mds_client *mdsc, | 788 | static int __open_session(struct ceph_mds_client *mdsc, |
785 | struct ceph_mds_session *session) | 789 | struct ceph_mds_session *session) |
786 | { | 790 | { |
787 | struct ceph_msg *msg; | 791 | struct ceph_msg *msg; |
788 | int mstate; | 792 | int mstate; |
789 | int mds = session->s_mds; | 793 | int mds = session->s_mds; |
790 | 794 | ||
791 | /* wait for mds to go active? */ | 795 | /* wait for mds to go active? */ |
792 | mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); | 796 | mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); |
793 | dout("open_session to mds%d (%s)\n", mds, | 797 | dout("open_session to mds%d (%s)\n", mds, |
794 | ceph_mds_state_name(mstate)); | 798 | ceph_mds_state_name(mstate)); |
795 | session->s_state = CEPH_MDS_SESSION_OPENING; | 799 | session->s_state = CEPH_MDS_SESSION_OPENING; |
796 | session->s_renew_requested = jiffies; | 800 | session->s_renew_requested = jiffies; |
797 | 801 | ||
798 | /* send connect message */ | 802 | /* send connect message */ |
799 | msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); | 803 | msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); |
800 | if (!msg) | 804 | if (!msg) |
801 | return -ENOMEM; | 805 | return -ENOMEM; |
802 | ceph_con_send(&session->s_con, msg); | 806 | ceph_con_send(&session->s_con, msg); |
803 | return 0; | 807 | return 0; |
804 | } | 808 | } |
805 | 809 | ||
806 | /* | 810 | /* |
807 | * open sessions for any export targets for the given mds | 811 | * open sessions for any export targets for the given mds |
808 | * | 812 | * |
809 | * called under mdsc->mutex | 813 | * called under mdsc->mutex |
810 | */ | 814 | */ |
811 | static void __open_export_target_sessions(struct ceph_mds_client *mdsc, | 815 | static void __open_export_target_sessions(struct ceph_mds_client *mdsc, |
812 | struct ceph_mds_session *session) | 816 | struct ceph_mds_session *session) |
813 | { | 817 | { |
814 | struct ceph_mds_info *mi; | 818 | struct ceph_mds_info *mi; |
815 | struct ceph_mds_session *ts; | 819 | struct ceph_mds_session *ts; |
816 | int i, mds = session->s_mds; | 820 | int i, mds = session->s_mds; |
817 | int target; | 821 | int target; |
818 | 822 | ||
819 | if (mds >= mdsc->mdsmap->m_max_mds) | 823 | if (mds >= mdsc->mdsmap->m_max_mds) |
820 | return; | 824 | return; |
821 | mi = &mdsc->mdsmap->m_info[mds]; | 825 | mi = &mdsc->mdsmap->m_info[mds]; |
822 | dout("open_export_target_sessions for mds%d (%d targets)\n", | 826 | dout("open_export_target_sessions for mds%d (%d targets)\n", |
823 | session->s_mds, mi->num_export_targets); | 827 | session->s_mds, mi->num_export_targets); |
824 | 828 | ||
825 | for (i = 0; i < mi->num_export_targets; i++) { | 829 | for (i = 0; i < mi->num_export_targets; i++) { |
826 | target = mi->export_targets[i]; | 830 | target = mi->export_targets[i]; |
827 | ts = __ceph_lookup_mds_session(mdsc, target); | 831 | ts = __ceph_lookup_mds_session(mdsc, target); |
828 | if (!ts) { | 832 | if (!ts) { |
829 | ts = register_session(mdsc, target); | 833 | ts = register_session(mdsc, target); |
830 | if (IS_ERR(ts)) | 834 | if (IS_ERR(ts)) |
831 | return; | 835 | return; |
832 | } | 836 | } |
833 | if (session->s_state == CEPH_MDS_SESSION_NEW || | 837 | if (session->s_state == CEPH_MDS_SESSION_NEW || |
834 | session->s_state == CEPH_MDS_SESSION_CLOSING) | 838 | session->s_state == CEPH_MDS_SESSION_CLOSING) |
835 | __open_session(mdsc, session); | 839 | __open_session(mdsc, session); |
836 | else | 840 | else |
837 | dout(" mds%d target mds%d %p is %s\n", session->s_mds, | 841 | dout(" mds%d target mds%d %p is %s\n", session->s_mds, |
838 | i, ts, session_state_name(ts->s_state)); | 842 | i, ts, session_state_name(ts->s_state)); |
839 | ceph_put_mds_session(ts); | 843 | ceph_put_mds_session(ts); |
840 | } | 844 | } |
841 | } | 845 | } |
842 | 846 | ||
/*
 * Public wrapper: take mdsc->mutex (required by the __ helper) and open
 * sessions to all of @session's export targets.
 */
void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
					   struct ceph_mds_session *session)
{
	mutex_lock(&mdsc->mutex);
	__open_export_target_sessions(mdsc, session);
	mutex_unlock(&mdsc->mutex);
}
850 | 854 | ||
851 | /* | 855 | /* |
852 | * session caps | 856 | * session caps |
853 | */ | 857 | */ |
854 | 858 | ||
855 | /* | 859 | /* |
856 | * Free preallocated cap messages assigned to this session | 860 | * Free preallocated cap messages assigned to this session |
857 | */ | 861 | */ |
858 | static void cleanup_cap_releases(struct ceph_mds_session *session) | 862 | static void cleanup_cap_releases(struct ceph_mds_session *session) |
859 | { | 863 | { |
860 | struct ceph_msg *msg; | 864 | struct ceph_msg *msg; |
861 | 865 | ||
862 | spin_lock(&session->s_cap_lock); | 866 | spin_lock(&session->s_cap_lock); |
863 | while (!list_empty(&session->s_cap_releases)) { | 867 | while (!list_empty(&session->s_cap_releases)) { |
864 | msg = list_first_entry(&session->s_cap_releases, | 868 | msg = list_first_entry(&session->s_cap_releases, |
865 | struct ceph_msg, list_head); | 869 | struct ceph_msg, list_head); |
866 | list_del_init(&msg->list_head); | 870 | list_del_init(&msg->list_head); |
867 | ceph_msg_put(msg); | 871 | ceph_msg_put(msg); |
868 | } | 872 | } |
869 | while (!list_empty(&session->s_cap_releases_done)) { | 873 | while (!list_empty(&session->s_cap_releases_done)) { |
870 | msg = list_first_entry(&session->s_cap_releases_done, | 874 | msg = list_first_entry(&session->s_cap_releases_done, |
871 | struct ceph_msg, list_head); | 875 | struct ceph_msg, list_head); |
872 | list_del_init(&msg->list_head); | 876 | list_del_init(&msg->list_head); |
873 | ceph_msg_put(msg); | 877 | ceph_msg_put(msg); |
874 | } | 878 | } |
875 | spin_unlock(&session->s_cap_lock); | 879 | spin_unlock(&session->s_cap_lock); |
876 | } | 880 | } |
877 | 881 | ||
/*
 * Helper to safely iterate over all caps associated with a session, with
 * special care taken to handle a racing __ceph_remove_cap().
 *
 * Invokes @cb(inode, cap, arg) for each cap whose inode can be grabbed;
 * stops early and returns @cb's value if it is negative, else 0.
 *
 * Caller must hold session s_mutex.
 */
static int iterate_session_caps(struct ceph_mds_session *session,
				 int (*cb)(struct inode *, struct ceph_cap *,
					   void *), void *arg)
{
	struct list_head *p;
	struct ceph_cap *cap;
	struct inode *inode, *last_inode = NULL;
	struct ceph_cap *old_cap = NULL;
	int ret;

	dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
	spin_lock(&session->s_cap_lock);
	p = session->s_caps.next;
	while (p != &session->s_caps) {
		cap = list_entry(p, struct ceph_cap, session_caps);
		/* skip caps whose inode is already going away */
		inode = igrab(&cap->ci->vfs_inode);
		if (!inode) {
			p = p->next;
			continue;
		}
		/*
		 * Publish our position; a racing __ceph_remove_cap() can
		 * then leave the final list removal to us (see cap->ci ==
		 * NULL check below).
		 */
		session->s_cap_iterator = cap;
		spin_unlock(&session->s_cap_lock);

		/* drop the previous iteration's references w/o locks held */
		if (last_inode) {
			iput(last_inode);
			last_inode = NULL;
		}
		if (old_cap) {
			ceph_put_cap(session->s_mdsc, old_cap);
			old_cap = NULL;
		}

		ret = cb(inode, cap, arg);
		last_inode = inode;

		spin_lock(&session->s_cap_lock);
		p = p->next;
		if (cap->ci == NULL) {
			/* cap was removed while we held the inode ref */
			dout("iterate_session_caps finishing cap %p removal\n",
			     cap);
			BUG_ON(cap->session != session);
			list_del_init(&cap->session_caps);
			session->s_nr_caps--;
			cap->session = NULL;
			old_cap = cap; /* put_cap it w/o locks held */
		}
		if (ret < 0)
			goto out;
	}
	ret = 0;
out:
	session->s_cap_iterator = NULL;
	spin_unlock(&session->s_cap_lock);

	/* release whatever is left over from the last iteration */
	if (last_inode)
		iput(last_inode);
	if (old_cap)
		ceph_put_cap(session->s_mdsc, old_cap);

	return ret;
}
945 | 949 | ||
/*
 * iterate_session_caps() callback: remove @cap from @inode.  If that
 * was the inode's last real cap, also discard any dirty/flushing cap
 * state and dirty page refs.  @drop counts the inode references
 * presumed held by the discarded state (TODO confirm against the cap
 * dirty/flushing accounting); they are released via iput() at the end.
 */
static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
				  void *arg)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int drop = 0;

	dout("removing cap %p, ci is %p, inode is %p\n",
	     cap, ci, &ci->vfs_inode);
	spin_lock(&ci->i_ceph_lock);
	__ceph_remove_cap(cap);
	if (!__ceph_is_any_real_caps(ci)) {
		struct ceph_mds_client *mdsc =
			ceph_sb_to_client(inode->i_sb)->mdsc;

		/* no caps left: nothing can be written back, so drop
		 * dirty/flushing state rather than leaving it stranded */
		spin_lock(&mdsc->cap_dirty_lock);
		if (!list_empty(&ci->i_dirty_item)) {
			pr_info(" dropping dirty %s state for %p %lld\n",
				ceph_cap_string(ci->i_dirty_caps),
				inode, ceph_ino(inode));
			ci->i_dirty_caps = 0;
			list_del_init(&ci->i_dirty_item);
			drop = 1;
		}
		if (!list_empty(&ci->i_flushing_item)) {
			pr_info(" dropping dirty+flushing %s state for %p %lld\n",
				ceph_cap_string(ci->i_flushing_caps),
				inode, ceph_ino(inode));
			ci->i_flushing_caps = 0;
			list_del_init(&ci->i_flushing_item);
			mdsc->num_cap_flushing--;
			drop = 1;
		}
		if (drop && ci->i_wrbuffer_ref) {
			pr_info(" dropping dirty data for %p %lld\n",
				inode, ceph_ino(inode));
			ci->i_wrbuffer_ref = 0;
			ci->i_wrbuffer_ref_head = 0;
			drop++;
		}
		spin_unlock(&mdsc->cap_dirty_lock);
	}
	spin_unlock(&ci->i_ceph_lock);
	while (drop--)
		iput(inode);
	return 0;
}
992 | 996 | ||
/*
 * Remove every cap attached to this session, then verify nothing is
 * left (no caps, nothing flushing) and free the preallocated
 * cap-release messages.
 *
 * caller must hold session s_mutex
 */
static void remove_session_caps(struct ceph_mds_session *session)
{
	dout("remove_session_caps on %p\n", session);
	iterate_session_caps(session, remove_session_caps_cb, NULL);
	BUG_ON(session->s_nr_caps > 0);
	BUG_ON(!list_empty(&session->s_cap_flushing));
	cleanup_cap_releases(session);
}
1004 | 1008 | ||
1005 | /* | 1009 | /* |
1006 | * wake up any threads waiting on this session's caps. if the cap is | 1010 | * wake up any threads waiting on this session's caps. if the cap is |
1007 | * old (didn't get renewed on the client reconnect), remove it now. | 1011 | * old (didn't get renewed on the client reconnect), remove it now. |
1008 | * | 1012 | * |
1009 | * caller must hold s_mutex. | 1013 | * caller must hold s_mutex. |
1010 | */ | 1014 | */ |
1011 | static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, | 1015 | static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, |
1012 | void *arg) | 1016 | void *arg) |
1013 | { | 1017 | { |
1014 | struct ceph_inode_info *ci = ceph_inode(inode); | 1018 | struct ceph_inode_info *ci = ceph_inode(inode); |
1015 | 1019 | ||
1016 | wake_up_all(&ci->i_cap_wq); | 1020 | wake_up_all(&ci->i_cap_wq); |
1017 | if (arg) { | 1021 | if (arg) { |
1018 | spin_lock(&ci->i_ceph_lock); | 1022 | spin_lock(&ci->i_ceph_lock); |
1019 | ci->i_wanted_max_size = 0; | 1023 | ci->i_wanted_max_size = 0; |
1020 | ci->i_requested_max_size = 0; | 1024 | ci->i_requested_max_size = 0; |
1021 | spin_unlock(&ci->i_ceph_lock); | 1025 | spin_unlock(&ci->i_ceph_lock); |
1022 | } | 1026 | } |
1023 | return 0; | 1027 | return 0; |
1024 | } | 1028 | } |
1025 | 1029 | ||
1026 | static void wake_up_session_caps(struct ceph_mds_session *session, | 1030 | static void wake_up_session_caps(struct ceph_mds_session *session, |
1027 | int reconnect) | 1031 | int reconnect) |
1028 | { | 1032 | { |
1029 | dout("wake_up_session_caps %p mds%d\n", session, session->s_mds); | 1033 | dout("wake_up_session_caps %p mds%d\n", session, session->s_mds); |
1030 | iterate_session_caps(session, wake_up_session_cb, | 1034 | iterate_session_caps(session, wake_up_session_cb, |
1031 | (void *)(unsigned long)reconnect); | 1035 | (void *)(unsigned long)reconnect); |
1032 | } | 1036 | } |
1033 | 1037 | ||
/*
 * Send periodic message to MDS renewing all currently held caps.  The
 * ack will reset the expiration for all caps from this session.
 *
 * Returns 0 (including when the renew is skipped), -ENOMEM on
 * allocation failure.
 *
 * caller holds s_mutex
 */
static int send_renew_caps(struct ceph_mds_client *mdsc,
                           struct ceph_mds_session *session)
{
        struct ceph_msg *msg;
        int state;

        /* log a stale transition: ttl has passed and a renew had been
         * requested before the ttl (i.e. we really did try) */
        if (time_after_eq(jiffies, session->s_cap_ttl) &&
            time_after_eq(session->s_cap_ttl, session->s_renew_requested))
                pr_info("mds%d caps stale\n", session->s_mds);
        session->s_renew_requested = jiffies;

        /* do not try to renew caps until a recovering mds has reconnected
         * with its clients. */
        state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
        if (state < CEPH_MDS_STATE_RECONNECT) {
                dout("send_renew_caps ignoring mds%d (%s)\n",
                     session->s_mds, ceph_mds_state_name(state));
                return 0;
        }

        dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
             ceph_mds_state_name(state));
        /* bump s_renew_seq so the ack can be matched to this request */
        msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
                                 ++session->s_renew_seq);
        if (!msg)
                return -ENOMEM;
        ceph_con_send(&session->s_con, msg);
        return 0;
}
1069 | 1073 | ||
1070 | /* | 1074 | /* |
1071 | * Note new cap ttl, and any transition from stale -> not stale (fresh?). | 1075 | * Note new cap ttl, and any transition from stale -> not stale (fresh?). |
1072 | * | 1076 | * |
1073 | * Called under session->s_mutex | 1077 | * Called under session->s_mutex |
1074 | */ | 1078 | */ |
1075 | static void renewed_caps(struct ceph_mds_client *mdsc, | 1079 | static void renewed_caps(struct ceph_mds_client *mdsc, |
1076 | struct ceph_mds_session *session, int is_renew) | 1080 | struct ceph_mds_session *session, int is_renew) |
1077 | { | 1081 | { |
1078 | int was_stale; | 1082 | int was_stale; |
1079 | int wake = 0; | 1083 | int wake = 0; |
1080 | 1084 | ||
1081 | spin_lock(&session->s_cap_lock); | 1085 | spin_lock(&session->s_cap_lock); |
1082 | was_stale = is_renew && (session->s_cap_ttl == 0 || | 1086 | was_stale = is_renew && (session->s_cap_ttl == 0 || |
1083 | time_after_eq(jiffies, session->s_cap_ttl)); | 1087 | time_after_eq(jiffies, session->s_cap_ttl)); |
1084 | 1088 | ||
1085 | session->s_cap_ttl = session->s_renew_requested + | 1089 | session->s_cap_ttl = session->s_renew_requested + |
1086 | mdsc->mdsmap->m_session_timeout*HZ; | 1090 | mdsc->mdsmap->m_session_timeout*HZ; |
1087 | 1091 | ||
1088 | if (was_stale) { | 1092 | if (was_stale) { |
1089 | if (time_before(jiffies, session->s_cap_ttl)) { | 1093 | if (time_before(jiffies, session->s_cap_ttl)) { |
1090 | pr_info("mds%d caps renewed\n", session->s_mds); | 1094 | pr_info("mds%d caps renewed\n", session->s_mds); |
1091 | wake = 1; | 1095 | wake = 1; |
1092 | } else { | 1096 | } else { |
1093 | pr_info("mds%d caps still stale\n", session->s_mds); | 1097 | pr_info("mds%d caps still stale\n", session->s_mds); |
1094 | } | 1098 | } |
1095 | } | 1099 | } |
1096 | dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n", | 1100 | dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n", |
1097 | session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh", | 1101 | session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh", |
1098 | time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh"); | 1102 | time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh"); |
1099 | spin_unlock(&session->s_cap_lock); | 1103 | spin_unlock(&session->s_cap_lock); |
1100 | 1104 | ||
1101 | if (wake) | 1105 | if (wake) |
1102 | wake_up_session_caps(session, 0); | 1106 | wake_up_session_caps(session, 0); |
1103 | } | 1107 | } |
1104 | 1108 | ||
/*
 * send a session close request
 *
 * Builds a CLOSE session message carrying the current session seq and
 * hands it to the session's connection.  Returns 0 on success, -ENOMEM
 * if the message could not be allocated.
 */
static int request_close_session(struct ceph_mds_client *mdsc,
                                 struct ceph_mds_session *session)
{
        struct ceph_msg *msg;

        dout("request_close_session mds%d state %s seq %lld\n",
             session->s_mds, session_state_name(session->s_state),
             session->s_seq);
        msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
        if (!msg)
                return -ENOMEM;
        ceph_con_send(&session->s_con, msg);
        return 0;
}
1122 | 1126 | ||
1123 | /* | 1127 | /* |
1124 | * Called with s_mutex held. | 1128 | * Called with s_mutex held. |
1125 | */ | 1129 | */ |
1126 | static int __close_session(struct ceph_mds_client *mdsc, | 1130 | static int __close_session(struct ceph_mds_client *mdsc, |
1127 | struct ceph_mds_session *session) | 1131 | struct ceph_mds_session *session) |
1128 | { | 1132 | { |
1129 | if (session->s_state >= CEPH_MDS_SESSION_CLOSING) | 1133 | if (session->s_state >= CEPH_MDS_SESSION_CLOSING) |
1130 | return 0; | 1134 | return 0; |
1131 | session->s_state = CEPH_MDS_SESSION_CLOSING; | 1135 | session->s_state = CEPH_MDS_SESSION_CLOSING; |
1132 | return request_close_session(mdsc, session); | 1136 | return request_close_session(mdsc, session); |
1133 | } | 1137 | } |
1134 | 1138 | ||
/*
 * Trim old(er) caps.
 *
 * Because we can't cache an inode without one or more caps, we do
 * this indirectly: if a cap is unused, we prune its aliases, at which
 * point the inode will hopefully get dropped to.
 *
 * Yes, this is a bit sloppy.  Our only real goal here is to respond to
 * memory pressure from the MDS, though, so it needn't be perfect.
 *
 * Returns -1 to stop the iteration once the trim quota is used up,
 * 0 to keep going.
 */
static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
{
        struct ceph_mds_session *session = arg;
        struct ceph_inode_info *ci = ceph_inode(inode);
        int used, oissued, mine;

        /* quota exhausted: tell the iterator to stop */
        if (session->s_trim_caps <= 0)
                return -1;

        spin_lock(&ci->i_ceph_lock);
        mine = cap->issued | cap->implemented;
        used = __ceph_caps_used(ci);
        oissued = __ceph_caps_issued_other(ci, cap);

        dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
             inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
             ceph_cap_string(used));
        if (ci->i_dirty_caps)
                goto out;   /* dirty caps */
        if ((used & ~oissued) & mine)
                goto out;   /* we need these caps */

        session->s_trim_caps--;
        if (oissued) {
                /* we aren't the only cap.. just remove us */
                __ceph_remove_cap(cap);
        } else {
                /* try to drop referring dentries; i_ceph_lock must be
                 * dropped first -- d_prune_aliases takes dentry locks */
                spin_unlock(&ci->i_ceph_lock);
                d_prune_aliases(inode);
                dout("trim_caps_cb %p cap %p  pruned, count now %d\n",
                     inode, cap, atomic_read(&inode->i_count));
                return 0;
        }

out:
        spin_unlock(&ci->i_ceph_lock);
        return 0;
}
1184 | 1188 | ||
1185 | /* | 1189 | /* |
1186 | * Trim session cap count down to some max number. | 1190 | * Trim session cap count down to some max number. |
1187 | */ | 1191 | */ |
1188 | static int trim_caps(struct ceph_mds_client *mdsc, | 1192 | static int trim_caps(struct ceph_mds_client *mdsc, |
1189 | struct ceph_mds_session *session, | 1193 | struct ceph_mds_session *session, |
1190 | int max_caps) | 1194 | int max_caps) |
1191 | { | 1195 | { |
1192 | int trim_caps = session->s_nr_caps - max_caps; | 1196 | int trim_caps = session->s_nr_caps - max_caps; |
1193 | 1197 | ||
1194 | dout("trim_caps mds%d start: %d / %d, trim %d\n", | 1198 | dout("trim_caps mds%d start: %d / %d, trim %d\n", |
1195 | session->s_mds, session->s_nr_caps, max_caps, trim_caps); | 1199 | session->s_mds, session->s_nr_caps, max_caps, trim_caps); |
1196 | if (trim_caps > 0) { | 1200 | if (trim_caps > 0) { |
1197 | session->s_trim_caps = trim_caps; | 1201 | session->s_trim_caps = trim_caps; |
1198 | iterate_session_caps(session, trim_caps_cb, session); | 1202 | iterate_session_caps(session, trim_caps_cb, session); |
1199 | dout("trim_caps mds%d done: %d / %d, trimmed %d\n", | 1203 | dout("trim_caps mds%d done: %d / %d, trimmed %d\n", |
1200 | session->s_mds, session->s_nr_caps, max_caps, | 1204 | session->s_mds, session->s_nr_caps, max_caps, |
1201 | trim_caps - session->s_trim_caps); | 1205 | trim_caps - session->s_trim_caps); |
1202 | session->s_trim_caps = 0; | 1206 | session->s_trim_caps = 0; |
1203 | } | 1207 | } |
1204 | return 0; | 1208 | return 0; |
1205 | } | 1209 | } |
1206 | 1210 | ||
/*
 * Allocate cap_release messages.  If there is a partially full message
 * in the queue, try to allocate enough to cover it's remainder, so that
 * we can send it immediately.
 *
 * Returns 0 on success, -ENOMEM if a message allocation fails.
 *
 * Called under s_mutex.
 */
int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
                          struct ceph_mds_session *session)
{
        struct ceph_msg *msg, *partial = NULL;
        struct ceph_mds_cap_release *head;
        int err = -ENOMEM;
        /* keep this many free release slots in reserve */
        int extra = mdsc->fsc->mount_options->cap_release_safety;
        int num;

        dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
             extra);

        spin_lock(&session->s_cap_lock);

        /* if the head message is partially filled, budget enough extra
         * slots to cover its remainder; it is moved to the done queue
         * below so it can be sent immediately */
        if (!list_empty(&session->s_cap_releases)) {
                msg = list_first_entry(&session->s_cap_releases,
                                       struct ceph_msg,
                                       list_head);
                head = msg->front.iov_base;
                num = le32_to_cpu(head->num);
                if (num) {
                        dout(" partial %p with (%d/%d)\n", msg, num,
                             (int)CEPH_CAPS_PER_RELEASE);
                        extra += CEPH_CAPS_PER_RELEASE - num;
                        partial = msg;
                }
        }
        /* allocate empty messages until enough slots are reserved;
         * s_cap_lock is dropped around the GFP_NOFS allocation and the
         * condition rechecked after retaking it */
        while (session->s_num_cap_releases < session->s_nr_caps + extra) {
                spin_unlock(&session->s_cap_lock);
                msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
                                   GFP_NOFS, false);
                if (!msg)
                        goto out_unlocked;
                dout("add_cap_releases %p msg %p now %d\n", session, msg,
                     (int)msg->front.iov_len);
                head = msg->front.iov_base;
                head->num = cpu_to_le32(0);
                msg->front.iov_len = sizeof(*head);
                spin_lock(&session->s_cap_lock);
                list_add(&msg->list_head, &session->s_cap_releases);
                session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
        }

        if (partial) {
                head = partial->front.iov_base;
                num = le32_to_cpu(head->num);
                dout(" queueing partial %p with %d/%d\n", partial, num,
                     (int)CEPH_CAPS_PER_RELEASE);
                list_move_tail(&partial->list_head,
                               &session->s_cap_releases_done);
                /* its unused remainder no longer counts as reserve */
                session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num;
        }
        err = 0;
        spin_unlock(&session->s_cap_lock);
out_unlocked:
        return err;
}
1271 | 1275 | ||
/*
 * flush all dirty inode data to disk.
 *
 * returns true if we've flushed through want_flush_seq
 */
static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
{
        int mds, ret = 1;

        dout("check_cap_flush want %lld\n", want_flush_seq);
        mutex_lock(&mdsc->mutex);
        for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
                struct ceph_mds_session *session = mdsc->sessions[mds];

                if (!session)
                        continue;
                /* pin the session, then drop mdsc->mutex before taking
                 * s_mutex (mdsc->mutex is not held across s_mutex) */
                get_session(session);
                mutex_unlock(&mdsc->mutex);

                mutex_lock(&session->s_mutex);
                if (!list_empty(&session->s_cap_flushing)) {
                        /* head of s_cap_flushing is the oldest flush
                         * still in flight for this session */
                        struct ceph_inode_info *ci =
                                list_entry(session->s_cap_flushing.next,
                                           struct ceph_inode_info,
                                           i_flushing_item);
                        struct inode *inode = &ci->vfs_inode;

                        spin_lock(&ci->i_ceph_lock);
                        if (ci->i_cap_flush_seq <= want_flush_seq) {
                                dout("check_cap_flush still flushing %p "
                                     "seq %lld <= %lld to mds%d\n", inode,
                                     ci->i_cap_flush_seq, want_flush_seq,
                                     session->s_mds);
                                ret = 0;
                        }
                        spin_unlock(&ci->i_ceph_lock);
                }
                mutex_unlock(&session->s_mutex);
                ceph_put_mds_session(session);

                /* a still-flushing inode was found: give up early */
                if (!ret)
                        return ret;
                mutex_lock(&mdsc->mutex);
        }

        mutex_unlock(&mdsc->mutex);
        dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
        return ret;
}
1321 | 1325 | ||
/*
 * Send every completed cap-release message queued on
 * s_cap_releases_done to the MDS.
 *
 * called under s_mutex
 */
void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
                            struct ceph_mds_session *session)
{
        struct ceph_msg *msg;

        dout("send_cap_releases mds%d\n", session->s_mds);
        spin_lock(&session->s_cap_lock);
        while (!list_empty(&session->s_cap_releases_done)) {
                msg = list_first_entry(&session->s_cap_releases_done,
                                       struct ceph_msg, list_head);
                list_del_init(&msg->list_head);
                /* s_cap_lock is released around the send and retaken
                 * before re-examining the list */
                spin_unlock(&session->s_cap_lock);
                /* finalize the wire header length before sending */
                msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
                dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
                ceph_con_send(&session->s_con, msg);
                spin_lock(&session->s_cap_lock);
        }
        spin_unlock(&session->s_cap_lock);
}
1344 | 1348 | ||
1345 | static void discard_cap_releases(struct ceph_mds_client *mdsc, | 1349 | static void discard_cap_releases(struct ceph_mds_client *mdsc, |
1346 | struct ceph_mds_session *session) | 1350 | struct ceph_mds_session *session) |
1347 | { | 1351 | { |
1348 | struct ceph_msg *msg; | 1352 | struct ceph_msg *msg; |
1349 | struct ceph_mds_cap_release *head; | 1353 | struct ceph_mds_cap_release *head; |
1350 | unsigned num; | 1354 | unsigned num; |
1351 | 1355 | ||
1352 | dout("discard_cap_releases mds%d\n", session->s_mds); | 1356 | dout("discard_cap_releases mds%d\n", session->s_mds); |
1353 | spin_lock(&session->s_cap_lock); | 1357 | spin_lock(&session->s_cap_lock); |
1354 | 1358 | ||
1355 | /* zero out the in-progress message */ | 1359 | /* zero out the in-progress message */ |
1356 | msg = list_first_entry(&session->s_cap_releases, | 1360 | msg = list_first_entry(&session->s_cap_releases, |
1357 | struct ceph_msg, list_head); | 1361 | struct ceph_msg, list_head); |
1358 | head = msg->front.iov_base; | 1362 | head = msg->front.iov_base; |
1359 | num = le32_to_cpu(head->num); | 1363 | num = le32_to_cpu(head->num); |
1360 | dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); | 1364 | dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); |
1361 | head->num = cpu_to_le32(0); | 1365 | head->num = cpu_to_le32(0); |
1362 | session->s_num_cap_releases += num; | 1366 | session->s_num_cap_releases += num; |
1363 | 1367 | ||
1364 | /* requeue completed messages */ | 1368 | /* requeue completed messages */ |
1365 | while (!list_empty(&session->s_cap_releases_done)) { | 1369 | while (!list_empty(&session->s_cap_releases_done)) { |
1366 | msg = list_first_entry(&session->s_cap_releases_done, | 1370 | msg = list_first_entry(&session->s_cap_releases_done, |
1367 | struct ceph_msg, list_head); | 1371 | struct ceph_msg, list_head); |
1368 | list_del_init(&msg->list_head); | 1372 | list_del_init(&msg->list_head); |
1369 | 1373 | ||
1370 | head = msg->front.iov_base; | 1374 | head = msg->front.iov_base; |
1371 | num = le32_to_cpu(head->num); | 1375 | num = le32_to_cpu(head->num); |
1372 | dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, | 1376 | dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, |
1373 | num); | 1377 | num); |
1374 | session->s_num_cap_releases += num; | 1378 | session->s_num_cap_releases += num; |
1375 | head->num = cpu_to_le32(0); | 1379 | head->num = cpu_to_le32(0); |
1376 | msg->front.iov_len = sizeof(*head); | 1380 | msg->front.iov_len = sizeof(*head); |
1377 | list_add(&msg->list_head, &session->s_cap_releases); | 1381 | list_add(&msg->list_head, &session->s_cap_releases); |
1378 | } | 1382 | } |
1379 | 1383 | ||
1380 | spin_unlock(&session->s_cap_lock); | 1384 | spin_unlock(&session->s_cap_lock); |
1381 | } | 1385 | } |
1382 | 1386 | ||
1383 | /* | 1387 | /* |
1384 | * requests | 1388 | * requests |
1385 | */ | 1389 | */ |
1386 | 1390 | ||
1387 | /* | 1391 | /* |
1388 | * Create an mds request. | 1392 | * Create an mds request. |
1389 | */ | 1393 | */ |
1390 | struct ceph_mds_request * | 1394 | struct ceph_mds_request * |
1391 | ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) | 1395 | ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) |
1392 | { | 1396 | { |
1393 | struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS); | 1397 | struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS); |
1394 | 1398 | ||
1395 | if (!req) | 1399 | if (!req) |
1396 | return ERR_PTR(-ENOMEM); | 1400 | return ERR_PTR(-ENOMEM); |
1397 | 1401 | ||
1398 | mutex_init(&req->r_fill_mutex); | 1402 | mutex_init(&req->r_fill_mutex); |
1399 | req->r_mdsc = mdsc; | 1403 | req->r_mdsc = mdsc; |
1400 | req->r_started = jiffies; | 1404 | req->r_started = jiffies; |
1401 | req->r_resend_mds = -1; | 1405 | req->r_resend_mds = -1; |
1402 | INIT_LIST_HEAD(&req->r_unsafe_dir_item); | 1406 | INIT_LIST_HEAD(&req->r_unsafe_dir_item); |
1403 | req->r_fmode = -1; | 1407 | req->r_fmode = -1; |
1404 | kref_init(&req->r_kref); | 1408 | kref_init(&req->r_kref); |
1405 | INIT_LIST_HEAD(&req->r_wait); | 1409 | INIT_LIST_HEAD(&req->r_wait); |
1406 | init_completion(&req->r_completion); | 1410 | init_completion(&req->r_completion); |
1407 | init_completion(&req->r_safe_completion); | 1411 | init_completion(&req->r_safe_completion); |
1408 | INIT_LIST_HEAD(&req->r_unsafe_item); | 1412 | INIT_LIST_HEAD(&req->r_unsafe_item); |
1409 | 1413 | ||
1410 | req->r_op = op; | 1414 | req->r_op = op; |
1411 | req->r_direct_mode = mode; | 1415 | req->r_direct_mode = mode; |
1412 | return req; | 1416 | return req; |
1413 | } | 1417 | } |
1414 | 1418 | ||
1415 | /* | 1419 | /* |
1416 | * return oldest (lowest) request, tid in request tree, 0 if none. | 1420 | * return oldest (lowest) request, tid in request tree, 0 if none. |
1417 | * | 1421 | * |
1418 | * called under mdsc->mutex. | 1422 | * called under mdsc->mutex. |
1419 | */ | 1423 | */ |
1420 | static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc) | 1424 | static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc) |
1421 | { | 1425 | { |
1422 | if (RB_EMPTY_ROOT(&mdsc->request_tree)) | 1426 | if (RB_EMPTY_ROOT(&mdsc->request_tree)) |
1423 | return NULL; | 1427 | return NULL; |
1424 | return rb_entry(rb_first(&mdsc->request_tree), | 1428 | return rb_entry(rb_first(&mdsc->request_tree), |
1425 | struct ceph_mds_request, r_node); | 1429 | struct ceph_mds_request, r_node); |
1426 | } | 1430 | } |
1427 | 1431 | ||
1428 | static u64 __get_oldest_tid(struct ceph_mds_client *mdsc) | 1432 | static u64 __get_oldest_tid(struct ceph_mds_client *mdsc) |
1429 | { | 1433 | { |
1430 | struct ceph_mds_request *req = __get_oldest_req(mdsc); | 1434 | struct ceph_mds_request *req = __get_oldest_req(mdsc); |
1431 | 1435 | ||
1432 | if (req) | 1436 | if (req) |
1433 | return req->r_tid; | 1437 | return req->r_tid; |
1434 | return 0; | 1438 | return 0; |
1435 | } | 1439 | } |
1436 | 1440 | ||
1437 | /* | 1441 | /* |
/*
 * Build a dentry's path.  Allocate on heap; caller must kfree.  Based
 * on build_path_from_dentry in fs/cifs/dir.c.
 *
 * If @stop_on_nosnap, generate path relative to the first non-snapped
 * inode.
 *
 * Encode hidden .snap dirs as a double /, i.e.
 *   foo/.snap/bar -> foo//bar
 *
 * Two-pass algorithm: first walk toward the root to measure the path
 * length, then allocate and walk again filling the buffer from the
 * end.  The whole sequence runs under a rename_lock seqlock read
 * section; if a rename races with us (detected via read_seqretry or
 * by the fill pass not landing exactly at pos == 0) we free the
 * buffer and retry from scratch.
 *
 * On success, *plen is the path length and *base the ino of the
 * ancestor the path is relative to.  Returns ERR_PTR(-EINVAL) on a
 * NULL/corrupt dentry chain, ERR_PTR(-ENOMEM) on allocation failure.
 */
char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
			   int stop_on_nosnap)
{
	struct dentry *temp;
	char *path;
	int len, pos;
	unsigned seq;

	if (dentry == NULL)
		return ERR_PTR(-EINVAL);

retry:
	len = 0;
	seq = read_seqbegin(&rename_lock);
	rcu_read_lock();
	/* pass 1: compute required length, walking up to the root (or
	 * to the first NOSNAP inode if @stop_on_nosnap) */
	for (temp = dentry; !IS_ROOT(temp);) {
		struct inode *inode = temp->d_inode;
		if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
			len++;  /* slash only */
		else if (stop_on_nosnap && inode &&
			 ceph_snap(inode) == CEPH_NOSNAP)
			break;
		else
			len += 1 + temp->d_name.len;
		temp = temp->d_parent;
		if (temp == NULL) {
			rcu_read_unlock();
			pr_err("build_path corrupt dentry %p\n", dentry);
			return ERR_PTR(-EINVAL);
		}
	}
	rcu_read_unlock();
	if (len)
		len--;  /* no leading '/' */

	path = kmalloc(len+1, GFP_NOFS);
	if (path == NULL)
		return ERR_PTR(-ENOMEM);
	pos = len;
	path[pos] = 0;	/* trailing null */
	rcu_read_lock();
	/* pass 2: fill the buffer back-to-front; each component is
	 * copied under its dentry's d_lock so the name is stable */
	for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
		struct inode *inode;

		spin_lock(&temp->d_lock);
		inode = temp->d_inode;
		if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
			dout("build_path path+%d: %p SNAPDIR\n",
			     pos, temp);
		} else if (stop_on_nosnap && inode &&
			   ceph_snap(inode) == CEPH_NOSNAP) {
			spin_unlock(&temp->d_lock);
			break;
		} else {
			pos -= temp->d_name.len;
			if (pos < 0) {
				/* name grew since pass 1 (rename race);
				 * the retry check below will catch it */
				spin_unlock(&temp->d_lock);
				break;
			}
			/* bounded, non-terminated copy into the middle
			 * of the buffer; terminator was set above */
			strncpy(path + pos, temp->d_name.name,
				temp->d_name.len);
		}
		spin_unlock(&temp->d_lock);
		if (pos)
			path[--pos] = '/';
		temp = temp->d_parent;
		if (temp == NULL) {
			rcu_read_unlock();
			pr_err("build_path corrupt dentry\n");
			kfree(path);
			return ERR_PTR(-EINVAL);
		}
	}
	rcu_read_unlock();
	if (pos != 0 || read_seqretry(&rename_lock, seq)) {
		pr_err("build_path did not end path lookup where "
		       "expected, namelen is %d, pos is %d\n", len, pos);
		/* presumably this is only possible if racing with a
		   rename of one of the parent directories (we can not
		   lock the dentries above us to prevent this, but
		   retrying should be harmless) */
		kfree(path);
		goto retry;
	}

	*base = ceph_ino(temp->d_inode);
	*plen = len;
	dout("build_path on %p %d built %llx '%.*s'\n",
	     dentry, dentry->d_count, *base, len, path);
	return path;
}
1538 | 1542 | ||
1539 | static int build_dentry_path(struct dentry *dentry, | 1543 | static int build_dentry_path(struct dentry *dentry, |
1540 | const char **ppath, int *ppathlen, u64 *pino, | 1544 | const char **ppath, int *ppathlen, u64 *pino, |
1541 | int *pfreepath) | 1545 | int *pfreepath) |
1542 | { | 1546 | { |
1543 | char *path; | 1547 | char *path; |
1544 | 1548 | ||
1545 | if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) { | 1549 | if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) { |
1546 | *pino = ceph_ino(dentry->d_parent->d_inode); | 1550 | *pino = ceph_ino(dentry->d_parent->d_inode); |
1547 | *ppath = dentry->d_name.name; | 1551 | *ppath = dentry->d_name.name; |
1548 | *ppathlen = dentry->d_name.len; | 1552 | *ppathlen = dentry->d_name.len; |
1549 | return 0; | 1553 | return 0; |
1550 | } | 1554 | } |
1551 | path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); | 1555 | path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); |
1552 | if (IS_ERR(path)) | 1556 | if (IS_ERR(path)) |
1553 | return PTR_ERR(path); | 1557 | return PTR_ERR(path); |
1554 | *ppath = path; | 1558 | *ppath = path; |
1555 | *pfreepath = 1; | 1559 | *pfreepath = 1; |
1556 | return 0; | 1560 | return 0; |
1557 | } | 1561 | } |
1558 | 1562 | ||
/*
 * Resolve the path attributes for a request specified by inode.
 *
 * Fast path: a non-snapped inode is addressed by ino alone (empty
 * path, *pfreepath untouched).  Otherwise pick any connected alias
 * of the inode and build a full path from it; *pfreepath is set so
 * the caller kfrees the allocated path.
 *
 * NOTE(review): d_find_alias() can return NULL for an inode with no
 * aliases; ceph_mdsc_build_path() turns that into ERR_PTR(-EINVAL)
 * (and dput(NULL) is a no-op), so the error propagates cleanly —
 * but callers then see -EINVAL rather than a more specific error.
 *
 * Returns 0 on success or a negative errno.
 */
static int build_inode_path(struct inode *inode,
			    const char **ppath, int *ppathlen, u64 *pino,
			    int *pfreepath)
{
	struct dentry *dentry;
	char *path;

	if (ceph_snap(inode) == CEPH_NOSNAP) {
		*pino = ceph_ino(inode);
		*ppathlen = 0;
		return 0;
	}
	dentry = d_find_alias(inode);
	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
	dput(dentry);
	if (IS_ERR(path))
		return PTR_ERR(path);
	*ppath = path;
	*pfreepath = 1;
	return 0;
}
1580 | 1584 | ||
1581 | /* | 1585 | /* |
1582 | * request arguments may be specified via an inode *, a dentry *, or | 1586 | * request arguments may be specified via an inode *, a dentry *, or |
1583 | * an explicit ino+path. | 1587 | * an explicit ino+path. |
1584 | */ | 1588 | */ |
1585 | static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, | 1589 | static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, |
1586 | const char *rpath, u64 rino, | 1590 | const char *rpath, u64 rino, |
1587 | const char **ppath, int *pathlen, | 1591 | const char **ppath, int *pathlen, |
1588 | u64 *ino, int *freepath) | 1592 | u64 *ino, int *freepath) |
1589 | { | 1593 | { |
1590 | int r = 0; | 1594 | int r = 0; |
1591 | 1595 | ||
1592 | if (rinode) { | 1596 | if (rinode) { |
1593 | r = build_inode_path(rinode, ppath, pathlen, ino, freepath); | 1597 | r = build_inode_path(rinode, ppath, pathlen, ino, freepath); |
1594 | dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), | 1598 | dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), |
1595 | ceph_snap(rinode)); | 1599 | ceph_snap(rinode)); |
1596 | } else if (rdentry) { | 1600 | } else if (rdentry) { |
1597 | r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath); | 1601 | r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath); |
1598 | dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, | 1602 | dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, |
1599 | *ppath); | 1603 | *ppath); |
1600 | } else if (rpath || rino) { | 1604 | } else if (rpath || rino) { |
1601 | *ino = rino; | 1605 | *ino = rino; |
1602 | *ppath = rpath; | 1606 | *ppath = rpath; |
1603 | *pathlen = strlen(rpath); | 1607 | *pathlen = strlen(rpath); |
1604 | dout(" path %.*s\n", *pathlen, rpath); | 1608 | dout(" path %.*s\n", *pathlen, rpath); |
1605 | } | 1609 | } |
1606 | 1610 | ||
1607 | return r; | 1611 | return r; |
1608 | } | 1612 | } |
1609 | 1613 | ||
1610 | /* | 1614 | /* |
1611 | * called under mdsc->mutex | 1615 | * called under mdsc->mutex |
1612 | */ | 1616 | */ |
1613 | static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, | 1617 | static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, |
1614 | struct ceph_mds_request *req, | 1618 | struct ceph_mds_request *req, |
1615 | int mds) | 1619 | int mds) |
1616 | { | 1620 | { |
1617 | struct ceph_msg *msg; | 1621 | struct ceph_msg *msg; |
1618 | struct ceph_mds_request_head *head; | 1622 | struct ceph_mds_request_head *head; |
1619 | const char *path1 = NULL; | 1623 | const char *path1 = NULL; |
1620 | const char *path2 = NULL; | 1624 | const char *path2 = NULL; |
1621 | u64 ino1 = 0, ino2 = 0; | 1625 | u64 ino1 = 0, ino2 = 0; |
1622 | int pathlen1 = 0, pathlen2 = 0; | 1626 | int pathlen1 = 0, pathlen2 = 0; |
1623 | int freepath1 = 0, freepath2 = 0; | 1627 | int freepath1 = 0, freepath2 = 0; |
1624 | int len; | 1628 | int len; |
1625 | u16 releases; | 1629 | u16 releases; |
1626 | void *p, *end; | 1630 | void *p, *end; |
1627 | int ret; | 1631 | int ret; |
1628 | 1632 | ||
1629 | ret = set_request_path_attr(req->r_inode, req->r_dentry, | 1633 | ret = set_request_path_attr(req->r_inode, req->r_dentry, |
1630 | req->r_path1, req->r_ino1.ino, | 1634 | req->r_path1, req->r_ino1.ino, |
1631 | &path1, &pathlen1, &ino1, &freepath1); | 1635 | &path1, &pathlen1, &ino1, &freepath1); |
1632 | if (ret < 0) { | 1636 | if (ret < 0) { |
1633 | msg = ERR_PTR(ret); | 1637 | msg = ERR_PTR(ret); |
1634 | goto out; | 1638 | goto out; |
1635 | } | 1639 | } |
1636 | 1640 | ||
1637 | ret = set_request_path_attr(NULL, req->r_old_dentry, | 1641 | ret = set_request_path_attr(NULL, req->r_old_dentry, |
1638 | req->r_path2, req->r_ino2.ino, | 1642 | req->r_path2, req->r_ino2.ino, |
1639 | &path2, &pathlen2, &ino2, &freepath2); | 1643 | &path2, &pathlen2, &ino2, &freepath2); |
1640 | if (ret < 0) { | 1644 | if (ret < 0) { |
1641 | msg = ERR_PTR(ret); | 1645 | msg = ERR_PTR(ret); |
1642 | goto out_free1; | 1646 | goto out_free1; |
1643 | } | 1647 | } |
1644 | 1648 | ||
1645 | len = sizeof(*head) + | 1649 | len = sizeof(*head) + |
1646 | pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)); | 1650 | pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)); |
1647 | 1651 | ||
1648 | /* calculate (max) length for cap releases */ | 1652 | /* calculate (max) length for cap releases */ |
1649 | len += sizeof(struct ceph_mds_request_release) * | 1653 | len += sizeof(struct ceph_mds_request_release) * |
1650 | (!!req->r_inode_drop + !!req->r_dentry_drop + | 1654 | (!!req->r_inode_drop + !!req->r_dentry_drop + |
1651 | !!req->r_old_inode_drop + !!req->r_old_dentry_drop); | 1655 | !!req->r_old_inode_drop + !!req->r_old_dentry_drop); |
1652 | if (req->r_dentry_drop) | 1656 | if (req->r_dentry_drop) |
1653 | len += req->r_dentry->d_name.len; | 1657 | len += req->r_dentry->d_name.len; |
1654 | if (req->r_old_dentry_drop) | 1658 | if (req->r_old_dentry_drop) |
1655 | len += req->r_old_dentry->d_name.len; | 1659 | len += req->r_old_dentry->d_name.len; |
1656 | 1660 | ||
1657 | msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false); | 1661 | msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false); |
1658 | if (!msg) { | 1662 | if (!msg) { |
1659 | msg = ERR_PTR(-ENOMEM); | 1663 | msg = ERR_PTR(-ENOMEM); |
1660 | goto out_free2; | 1664 | goto out_free2; |
1661 | } | 1665 | } |
1662 | 1666 | ||
1663 | msg->hdr.tid = cpu_to_le64(req->r_tid); | 1667 | msg->hdr.tid = cpu_to_le64(req->r_tid); |
1664 | 1668 | ||
1665 | head = msg->front.iov_base; | 1669 | head = msg->front.iov_base; |
1666 | p = msg->front.iov_base + sizeof(*head); | 1670 | p = msg->front.iov_base + sizeof(*head); |
1667 | end = msg->front.iov_base + msg->front.iov_len; | 1671 | end = msg->front.iov_base + msg->front.iov_len; |
1668 | 1672 | ||
1669 | head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); | 1673 | head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); |
1670 | head->op = cpu_to_le32(req->r_op); | 1674 | head->op = cpu_to_le32(req->r_op); |
1671 | head->caller_uid = cpu_to_le32(req->r_uid); | 1675 | head->caller_uid = cpu_to_le32(req->r_uid); |
1672 | head->caller_gid = cpu_to_le32(req->r_gid); | 1676 | head->caller_gid = cpu_to_le32(req->r_gid); |
1673 | head->args = req->r_args; | 1677 | head->args = req->r_args; |
1674 | 1678 | ||
1675 | ceph_encode_filepath(&p, end, ino1, path1); | 1679 | ceph_encode_filepath(&p, end, ino1, path1); |
1676 | ceph_encode_filepath(&p, end, ino2, path2); | 1680 | ceph_encode_filepath(&p, end, ino2, path2); |
1677 | 1681 | ||
1678 | /* make note of release offset, in case we need to replay */ | 1682 | /* make note of release offset, in case we need to replay */ |
1679 | req->r_request_release_offset = p - msg->front.iov_base; | 1683 | req->r_request_release_offset = p - msg->front.iov_base; |
1680 | 1684 | ||
1681 | /* cap releases */ | 1685 | /* cap releases */ |
1682 | releases = 0; | 1686 | releases = 0; |
1683 | if (req->r_inode_drop) | 1687 | if (req->r_inode_drop) |
1684 | releases += ceph_encode_inode_release(&p, | 1688 | releases += ceph_encode_inode_release(&p, |
1685 | req->r_inode ? req->r_inode : req->r_dentry->d_inode, | 1689 | req->r_inode ? req->r_inode : req->r_dentry->d_inode, |
1686 | mds, req->r_inode_drop, req->r_inode_unless, 0); | 1690 | mds, req->r_inode_drop, req->r_inode_unless, 0); |
1687 | if (req->r_dentry_drop) | 1691 | if (req->r_dentry_drop) |
1688 | releases += ceph_encode_dentry_release(&p, req->r_dentry, | 1692 | releases += ceph_encode_dentry_release(&p, req->r_dentry, |
1689 | mds, req->r_dentry_drop, req->r_dentry_unless); | 1693 | mds, req->r_dentry_drop, req->r_dentry_unless); |
1690 | if (req->r_old_dentry_drop) | 1694 | if (req->r_old_dentry_drop) |
1691 | releases += ceph_encode_dentry_release(&p, req->r_old_dentry, | 1695 | releases += ceph_encode_dentry_release(&p, req->r_old_dentry, |
1692 | mds, req->r_old_dentry_drop, req->r_old_dentry_unless); | 1696 | mds, req->r_old_dentry_drop, req->r_old_dentry_unless); |
1693 | if (req->r_old_inode_drop) | 1697 | if (req->r_old_inode_drop) |
1694 | releases += ceph_encode_inode_release(&p, | 1698 | releases += ceph_encode_inode_release(&p, |
1695 | req->r_old_dentry->d_inode, | 1699 | req->r_old_dentry->d_inode, |
1696 | mds, req->r_old_inode_drop, req->r_old_inode_unless, 0); | 1700 | mds, req->r_old_inode_drop, req->r_old_inode_unless, 0); |
1697 | head->num_releases = cpu_to_le16(releases); | 1701 | head->num_releases = cpu_to_le16(releases); |
1698 | 1702 | ||
1699 | BUG_ON(p > end); | 1703 | BUG_ON(p > end); |
1700 | msg->front.iov_len = p - msg->front.iov_base; | 1704 | msg->front.iov_len = p - msg->front.iov_base; |
1701 | msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); | 1705 | msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); |
1702 | 1706 | ||
1703 | msg->pages = req->r_pages; | 1707 | msg->pages = req->r_pages; |
1704 | msg->nr_pages = req->r_num_pages; | 1708 | msg->nr_pages = req->r_num_pages; |
1705 | msg->hdr.data_len = cpu_to_le32(req->r_data_len); | 1709 | msg->hdr.data_len = cpu_to_le32(req->r_data_len); |
1706 | msg->hdr.data_off = cpu_to_le16(0); | 1710 | msg->hdr.data_off = cpu_to_le16(0); |
1707 | 1711 | ||
1708 | out_free2: | 1712 | out_free2: |
1709 | if (freepath2) | 1713 | if (freepath2) |
1710 | kfree((char *)path2); | 1714 | kfree((char *)path2); |
1711 | out_free1: | 1715 | out_free1: |
1712 | if (freepath1) | 1716 | if (freepath1) |
1713 | kfree((char *)path1); | 1717 | kfree((char *)path1); |
1714 | out: | 1718 | out: |
1715 | return msg; | 1719 | return msg; |
1716 | } | 1720 | } |
1717 | 1721 | ||
1718 | /* | 1722 | /* |
1719 | * called under mdsc->mutex if error, under no mutex if | 1723 | * called under mdsc->mutex if error, under no mutex if |
1720 | * success. | 1724 | * success. |
1721 | */ | 1725 | */ |
1722 | static void complete_request(struct ceph_mds_client *mdsc, | 1726 | static void complete_request(struct ceph_mds_client *mdsc, |
1723 | struct ceph_mds_request *req) | 1727 | struct ceph_mds_request *req) |
1724 | { | 1728 | { |
1725 | if (req->r_callback) | 1729 | if (req->r_callback) |
1726 | req->r_callback(mdsc, req); | 1730 | req->r_callback(mdsc, req); |
1727 | else | 1731 | else |
1728 | complete_all(&req->r_completion); | 1732 | complete_all(&req->r_completion); |
1729 | } | 1733 | } |
1730 | 1734 | ||
1731 | /* | 1735 | /* |
1732 | * called under mdsc->mutex | 1736 | * called under mdsc->mutex |
1733 | */ | 1737 | */ |
1734 | static int __prepare_send_request(struct ceph_mds_client *mdsc, | 1738 | static int __prepare_send_request(struct ceph_mds_client *mdsc, |
1735 | struct ceph_mds_request *req, | 1739 | struct ceph_mds_request *req, |
1736 | int mds) | 1740 | int mds) |
1737 | { | 1741 | { |
1738 | struct ceph_mds_request_head *rhead; | 1742 | struct ceph_mds_request_head *rhead; |
1739 | struct ceph_msg *msg; | 1743 | struct ceph_msg *msg; |
1740 | int flags = 0; | 1744 | int flags = 0; |
1741 | 1745 | ||
1742 | req->r_attempts++; | 1746 | req->r_attempts++; |
1743 | if (req->r_inode) { | 1747 | if (req->r_inode) { |
1744 | struct ceph_cap *cap = | 1748 | struct ceph_cap *cap = |
1745 | ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds); | 1749 | ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds); |
1746 | 1750 | ||
1747 | if (cap) | 1751 | if (cap) |
1748 | req->r_sent_on_mseq = cap->mseq; | 1752 | req->r_sent_on_mseq = cap->mseq; |
1749 | else | 1753 | else |
1750 | req->r_sent_on_mseq = -1; | 1754 | req->r_sent_on_mseq = -1; |
1751 | } | 1755 | } |
1752 | dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, | 1756 | dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, |
1753 | req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); | 1757 | req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); |
1754 | 1758 | ||
1755 | if (req->r_got_unsafe) { | 1759 | if (req->r_got_unsafe) { |
1756 | /* | 1760 | /* |
1757 | * Replay. Do not regenerate message (and rebuild | 1761 | * Replay. Do not regenerate message (and rebuild |
1758 | * paths, etc.); just use the original message. | 1762 | * paths, etc.); just use the original message. |
1759 | * Rebuilding paths will break for renames because | 1763 | * Rebuilding paths will break for renames because |
1760 | * d_move mangles the src name. | 1764 | * d_move mangles the src name. |
1761 | */ | 1765 | */ |
1762 | msg = req->r_request; | 1766 | msg = req->r_request; |
1763 | rhead = msg->front.iov_base; | 1767 | rhead = msg->front.iov_base; |
1764 | 1768 | ||
1765 | flags = le32_to_cpu(rhead->flags); | 1769 | flags = le32_to_cpu(rhead->flags); |
1766 | flags |= CEPH_MDS_FLAG_REPLAY; | 1770 | flags |= CEPH_MDS_FLAG_REPLAY; |
1767 | rhead->flags = cpu_to_le32(flags); | 1771 | rhead->flags = cpu_to_le32(flags); |
1768 | 1772 | ||
1769 | if (req->r_target_inode) | 1773 | if (req->r_target_inode) |
1770 | rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); | 1774 | rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); |
1771 | 1775 | ||
1772 | rhead->num_retry = req->r_attempts - 1; | 1776 | rhead->num_retry = req->r_attempts - 1; |
1773 | 1777 | ||
1774 | /* remove cap/dentry releases from message */ | 1778 | /* remove cap/dentry releases from message */ |
1775 | rhead->num_releases = 0; | 1779 | rhead->num_releases = 0; |
1776 | msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset); | 1780 | msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset); |
1777 | msg->front.iov_len = req->r_request_release_offset; | 1781 | msg->front.iov_len = req->r_request_release_offset; |
1778 | return 0; | 1782 | return 0; |
1779 | } | 1783 | } |
1780 | 1784 | ||
1781 | if (req->r_request) { | 1785 | if (req->r_request) { |
1782 | ceph_msg_put(req->r_request); | 1786 | ceph_msg_put(req->r_request); |
1783 | req->r_request = NULL; | 1787 | req->r_request = NULL; |
1784 | } | 1788 | } |
1785 | msg = create_request_message(mdsc, req, mds); | 1789 | msg = create_request_message(mdsc, req, mds); |
1786 | if (IS_ERR(msg)) { | 1790 | if (IS_ERR(msg)) { |
1787 | req->r_err = PTR_ERR(msg); | 1791 | req->r_err = PTR_ERR(msg); |
1788 | complete_request(mdsc, req); | 1792 | complete_request(mdsc, req); |
1789 | return PTR_ERR(msg); | 1793 | return PTR_ERR(msg); |
1790 | } | 1794 | } |
1791 | req->r_request = msg; | 1795 | req->r_request = msg; |
1792 | 1796 | ||
1793 | rhead = msg->front.iov_base; | 1797 | rhead = msg->front.iov_base; |
1794 | rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); | 1798 | rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); |
1795 | if (req->r_got_unsafe) | 1799 | if (req->r_got_unsafe) |
1796 | flags |= CEPH_MDS_FLAG_REPLAY; | 1800 | flags |= CEPH_MDS_FLAG_REPLAY; |
1797 | if (req->r_locked_dir) | 1801 | if (req->r_locked_dir) |
1798 | flags |= CEPH_MDS_FLAG_WANT_DENTRY; | 1802 | flags |= CEPH_MDS_FLAG_WANT_DENTRY; |
1799 | rhead->flags = cpu_to_le32(flags); | 1803 | rhead->flags = cpu_to_le32(flags); |
1800 | rhead->num_fwd = req->r_num_fwd; | 1804 | rhead->num_fwd = req->r_num_fwd; |
1801 | rhead->num_retry = req->r_attempts - 1; | 1805 | rhead->num_retry = req->r_attempts - 1; |
1802 | rhead->ino = 0; | 1806 | rhead->ino = 0; |
1803 | 1807 | ||
1804 | dout(" r_locked_dir = %p\n", req->r_locked_dir); | 1808 | dout(" r_locked_dir = %p\n", req->r_locked_dir); |
1805 | return 0; | 1809 | return 0; |
1806 | } | 1810 | } |
1807 | 1811 | ||
1808 | /* | 1812 | /* |
1809 | * send request, or put it on the appropriate wait list. | 1813 | * send request, or put it on the appropriate wait list. |
1810 | */ | 1814 | */ |
1811 | static int __do_request(struct ceph_mds_client *mdsc, | 1815 | static int __do_request(struct ceph_mds_client *mdsc, |
1812 | struct ceph_mds_request *req) | 1816 | struct ceph_mds_request *req) |
1813 | { | 1817 | { |
1814 | struct ceph_mds_session *session = NULL; | 1818 | struct ceph_mds_session *session = NULL; |
1815 | int mds = -1; | 1819 | int mds = -1; |
1816 | int err = -EAGAIN; | 1820 | int err = -EAGAIN; |
1817 | 1821 | ||
1818 | if (req->r_err || req->r_got_result) | 1822 | if (req->r_err || req->r_got_result) |
1819 | goto out; | 1823 | goto out; |
1820 | 1824 | ||
1821 | if (req->r_timeout && | 1825 | if (req->r_timeout && |
1822 | time_after_eq(jiffies, req->r_started + req->r_timeout)) { | 1826 | time_after_eq(jiffies, req->r_started + req->r_timeout)) { |
1823 | dout("do_request timed out\n"); | 1827 | dout("do_request timed out\n"); |
1824 | err = -EIO; | 1828 | err = -EIO; |
1825 | goto finish; | 1829 | goto finish; |
1826 | } | 1830 | } |
1827 | 1831 | ||
1828 | put_request_session(req); | 1832 | put_request_session(req); |
1829 | 1833 | ||
1830 | mds = __choose_mds(mdsc, req); | 1834 | mds = __choose_mds(mdsc, req); |
1831 | if (mds < 0 || | 1835 | if (mds < 0 || |
1832 | ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { | 1836 | ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { |
1833 | dout("do_request no mds or not active, waiting for map\n"); | 1837 | dout("do_request no mds or not active, waiting for map\n"); |
1834 | list_add(&req->r_wait, &mdsc->waiting_for_map); | 1838 | list_add(&req->r_wait, &mdsc->waiting_for_map); |
1835 | goto out; | 1839 | goto out; |
1836 | } | 1840 | } |
1837 | 1841 | ||
1838 | /* get, open session */ | 1842 | /* get, open session */ |
1839 | session = __ceph_lookup_mds_session(mdsc, mds); | 1843 | session = __ceph_lookup_mds_session(mdsc, mds); |
1840 | if (!session) { | 1844 | if (!session) { |
1841 | session = register_session(mdsc, mds); | 1845 | session = register_session(mdsc, mds); |
1842 | if (IS_ERR(session)) { | 1846 | if (IS_ERR(session)) { |
1843 | err = PTR_ERR(session); | 1847 | err = PTR_ERR(session); |
1844 | goto finish; | 1848 | goto finish; |
1845 | } | 1849 | } |
1846 | } | 1850 | } |
1847 | req->r_session = get_session(session); | 1851 | req->r_session = get_session(session); |
1848 | 1852 | ||
1849 | dout("do_request mds%d session %p state %s\n", mds, session, | 1853 | dout("do_request mds%d session %p state %s\n", mds, session, |
1850 | session_state_name(session->s_state)); | 1854 | session_state_name(session->s_state)); |
1851 | if (session->s_state != CEPH_MDS_SESSION_OPEN && | 1855 | if (session->s_state != CEPH_MDS_SESSION_OPEN && |
1852 | session->s_state != CEPH_MDS_SESSION_HUNG) { | 1856 | session->s_state != CEPH_MDS_SESSION_HUNG) { |
1853 | if (session->s_state == CEPH_MDS_SESSION_NEW || | 1857 | if (session->s_state == CEPH_MDS_SESSION_NEW || |
1854 | session->s_state == CEPH_MDS_SESSION_CLOSING) | 1858 | session->s_state == CEPH_MDS_SESSION_CLOSING) |
1855 | __open_session(mdsc, session); | 1859 | __open_session(mdsc, session); |
1856 | list_add(&req->r_wait, &session->s_waiting); | 1860 | list_add(&req->r_wait, &session->s_waiting); |
1857 | goto out_session; | 1861 | goto out_session; |
1858 | } | 1862 | } |
1859 | 1863 | ||
1860 | /* send request */ | 1864 | /* send request */ |
1861 | req->r_resend_mds = -1; /* forget any previous mds hint */ | 1865 | req->r_resend_mds = -1; /* forget any previous mds hint */ |
1862 | 1866 | ||
1863 | if (req->r_request_started == 0) /* note request start time */ | 1867 | if (req->r_request_started == 0) /* note request start time */ |
1864 | req->r_request_started = jiffies; | 1868 | req->r_request_started = jiffies; |
1865 | 1869 | ||
1866 | err = __prepare_send_request(mdsc, req, mds); | 1870 | err = __prepare_send_request(mdsc, req, mds); |
1867 | if (!err) { | 1871 | if (!err) { |
1868 | ceph_msg_get(req->r_request); | 1872 | ceph_msg_get(req->r_request); |
1869 | ceph_con_send(&session->s_con, req->r_request); | 1873 | ceph_con_send(&session->s_con, req->r_request); |
1870 | } | 1874 | } |
1871 | 1875 | ||
1872 | out_session: | 1876 | out_session: |
1873 | ceph_put_mds_session(session); | 1877 | ceph_put_mds_session(session); |
1874 | out: | 1878 | out: |
1875 | return err; | 1879 | return err; |
1876 | 1880 | ||
1877 | finish: | 1881 | finish: |
1878 | req->r_err = err; | 1882 | req->r_err = err; |
1879 | complete_request(mdsc, req); | 1883 | complete_request(mdsc, req); |
1880 | goto out; | 1884 | goto out; |
1881 | } | 1885 | } |
1882 | 1886 | ||
1883 | /* | 1887 | /* |
1884 | * called under mdsc->mutex | 1888 | * called under mdsc->mutex |
1885 | */ | 1889 | */ |
1886 | static void __wake_requests(struct ceph_mds_client *mdsc, | 1890 | static void __wake_requests(struct ceph_mds_client *mdsc, |
1887 | struct list_head *head) | 1891 | struct list_head *head) |
1888 | { | 1892 | { |
1889 | struct ceph_mds_request *req, *nreq; | 1893 | struct ceph_mds_request *req, *nreq; |
1890 | 1894 | ||
1891 | list_for_each_entry_safe(req, nreq, head, r_wait) { | 1895 | list_for_each_entry_safe(req, nreq, head, r_wait) { |
1892 | list_del_init(&req->r_wait); | 1896 | list_del_init(&req->r_wait); |
1893 | __do_request(mdsc, req); | 1897 | __do_request(mdsc, req); |
1894 | } | 1898 | } |
1895 | } | 1899 | } |
1896 | 1900 | ||
1897 | /* | 1901 | /* |
1898 | * Wake up threads with requests pending for @mds, so that they can | 1902 | * Wake up threads with requests pending for @mds, so that they can |
1899 | * resubmit their requests to a possibly different mds. | 1903 | * resubmit their requests to a possibly different mds. |
1900 | */ | 1904 | */ |
1901 | static void kick_requests(struct ceph_mds_client *mdsc, int mds) | 1905 | static void kick_requests(struct ceph_mds_client *mdsc, int mds) |
1902 | { | 1906 | { |
1903 | struct ceph_mds_request *req; | 1907 | struct ceph_mds_request *req; |
1904 | struct rb_node *p; | 1908 | struct rb_node *p; |
1905 | 1909 | ||
1906 | dout("kick_requests mds%d\n", mds); | 1910 | dout("kick_requests mds%d\n", mds); |
1907 | for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) { | 1911 | for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) { |
1908 | req = rb_entry(p, struct ceph_mds_request, r_node); | 1912 | req = rb_entry(p, struct ceph_mds_request, r_node); |
1909 | if (req->r_got_unsafe) | 1913 | if (req->r_got_unsafe) |
1910 | continue; | 1914 | continue; |
1911 | if (req->r_session && | 1915 | if (req->r_session && |
1912 | req->r_session->s_mds == mds) { | 1916 | req->r_session->s_mds == mds) { |
1913 | dout(" kicking tid %llu\n", req->r_tid); | 1917 | dout(" kicking tid %llu\n", req->r_tid); |
1914 | __do_request(mdsc, req); | 1918 | __do_request(mdsc, req); |
1915 | } | 1919 | } |
1916 | } | 1920 | } |
1917 | } | 1921 | } |
1918 | 1922 | ||
/*
 * Submit @req asynchronously: register it in the request tree (with
 * no parent dir) and attempt an immediate send under mdsc->mutex.
 * Completion is delivered later via req->r_callback or
 * req->r_completion (see complete_request).
 */
void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
			      struct ceph_mds_request *req)
{
	dout("submit_request on %p\n", req);
	mutex_lock(&mdsc->mutex);
	__register_request(mdsc, req, NULL);
	__do_request(mdsc, req);
	mutex_unlock(&mdsc->mutex);
}
1928 | 1932 | ||
/*
 * Synchronously perform an mds request.  Take care of all of the
 * session setup, forwarding, retry details.
 *
 * @mdsc: mds client state
 * @dir:  directory the request is registered against (may be NULL)
 * @req:  the request; caller holds a reference
 *
 * Returns 0 or the MDS result on success, or a negative errno if the
 * request failed early or the wait was interrupted/timed out.
 */
int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
			 struct inode *dir,
			 struct ceph_mds_request *req)
{
	int err;

	dout("do_request on %p\n", req);

	/* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
	if (req->r_inode)
		ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
	if (req->r_locked_dir)
		ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
	if (req->r_old_dentry)
		ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
				  CEPH_CAP_PIN);

	/* issue */
	mutex_lock(&mdsc->mutex);
	__register_request(mdsc, req, dir);
	__do_request(mdsc, req);

	/* __do_request may have failed synchronously (e.g. no usable mds) */
	if (req->r_err) {
		err = req->r_err;
		__unregister_request(mdsc, req);
		dout("do_request early error %d\n", err);
		goto out;
	}

	/* wait: drop mdsc->mutex while blocking so replies can be handled */
	mutex_unlock(&mdsc->mutex);
	dout("do_request waiting\n");
	if (req->r_timeout) {
		err = (long)wait_for_completion_killable_timeout(
			&req->r_completion, req->r_timeout);
		if (err == 0)
			err = -EIO;	/* timed out */
	} else {
		err = wait_for_completion_killable(&req->r_completion);
	}
	dout("do_request waited, got %d\n", err);
	mutex_lock(&mdsc->mutex);

	/* only abort if we didn't race with a real reply */
	if (req->r_got_result) {
		/* a reply arrived; report the MDS's result */
		err = le32_to_cpu(req->r_reply_info.head->result);
	} else if (err < 0) {
		/* wait was killed or timed out; abort the request */
		dout("aborted request %lld with %d\n", req->r_tid, err);

		/*
		 * ensure we aren't running concurrently with
		 * ceph_fill_trace or ceph_readdir_prepopulate, which
		 * rely on locks (dir mutex) held by our caller.
		 */
		mutex_lock(&req->r_fill_mutex);
		req->r_err = err;
		req->r_aborted = true;
		mutex_unlock(&req->r_fill_mutex);

		/* a namespace-mutating op may have partially completed */
		if (req->r_locked_dir &&
		    (req->r_op & CEPH_MDS_OP_WRITE))
			ceph_invalidate_dir_request(req);
	} else {
		err = req->r_err;
	}

out:
	mutex_unlock(&mdsc->mutex);
	dout("do_request %p done, result %d\n", req, err);
	return err;
}
2004 | 2008 | ||
2005 | /* | 2009 | /* |
2006 | * Invalidate dir D_COMPLETE, dentry lease state on an aborted MDS | 2010 | * Invalidate dir D_COMPLETE, dentry lease state on an aborted MDS |
2007 | * namespace request. | 2011 | * namespace request. |
2008 | */ | 2012 | */ |
2009 | void ceph_invalidate_dir_request(struct ceph_mds_request *req) | 2013 | void ceph_invalidate_dir_request(struct ceph_mds_request *req) |
2010 | { | 2014 | { |
2011 | struct inode *inode = req->r_locked_dir; | 2015 | struct inode *inode = req->r_locked_dir; |
2012 | struct ceph_inode_info *ci = ceph_inode(inode); | 2016 | struct ceph_inode_info *ci = ceph_inode(inode); |
2013 | 2017 | ||
2014 | dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode); | 2018 | dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode); |
2015 | spin_lock(&ci->i_ceph_lock); | 2019 | spin_lock(&ci->i_ceph_lock); |
2016 | ceph_dir_clear_complete(inode); | 2020 | ceph_dir_clear_complete(inode); |
2017 | ci->i_release_count++; | 2021 | ci->i_release_count++; |
2018 | spin_unlock(&ci->i_ceph_lock); | 2022 | spin_unlock(&ci->i_ceph_lock); |
2019 | 2023 | ||
2020 | if (req->r_dentry) | 2024 | if (req->r_dentry) |
2021 | ceph_invalidate_dentry_lease(req->r_dentry); | 2025 | ceph_invalidate_dentry_lease(req->r_dentry); |
2022 | if (req->r_old_dentry) | 2026 | if (req->r_old_dentry) |
2023 | ceph_invalidate_dentry_lease(req->r_old_dentry); | 2027 | ceph_invalidate_dentry_lease(req->r_old_dentry); |
2024 | } | 2028 | } |
2025 | 2029 | ||
/*
 * Handle mds reply.
 *
 * We take the session mutex and parse and process the reply immediately.
 * This preserves the logical ordering of replies, capabilities, etc., sent
 * by the MDS as they are applied to our local cache.
 *
 * Lock ordering here is mdsc->mutex, then session->s_mutex, then
 * mdsc->snap_rwsem and req->r_fill_mutex; mdsc->mutex is dropped before
 * s_mutex is taken and reacquired afterwards for the final bookkeeping.
 */
static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
{
	struct ceph_mds_client *mdsc = session->s_mdsc;
	struct ceph_mds_request *req;
	struct ceph_mds_reply_head *head = msg->front.iov_base;
	struct ceph_mds_reply_info_parsed *rinfo;  /* parsed reply info */
	u64 tid;
	int err, result;
	int mds = session->s_mds;

	/* sanity: message must at least contain the reply header */
	if (msg->front.iov_len < sizeof(*head)) {
		pr_err("mdsc_handle_reply got corrupt (short) reply\n");
		ceph_msg_dump(msg);
		return;
	}

	/* get request, session */
	tid = le64_to_cpu(msg->hdr.tid);
	mutex_lock(&mdsc->mutex);
	req = __lookup_request(mdsc, tid);
	if (!req) {
		dout("handle_reply on unknown tid %llu\n", tid);
		mutex_unlock(&mdsc->mutex);
		return;
	}
	dout("handle_reply %p\n", req);

	/* correct session? */
	if (req->r_session != session) {
		pr_err("mdsc_handle_reply got %llu on session mds%d"
		       " not mds%d\n", tid, session->s_mds,
		       req->r_session ? req->r_session->s_mds : -1);
		mutex_unlock(&mdsc->mutex);
		goto out;
	}

	/* dup? (each request expects at most one unsafe + one safe reply) */
	if ((req->r_got_unsafe && !head->safe) ||
	    (req->r_got_safe && head->safe)) {
		pr_warning("got a dup %s reply on %llu from mds%d\n",
			   head->safe ? "safe" : "unsafe", tid, mds);
		mutex_unlock(&mdsc->mutex);
		goto out;
	}
	if (req->r_got_safe && !head->safe) {
		pr_warning("got unsafe after safe on %llu from mds%d\n",
			   tid, mds);
		mutex_unlock(&mdsc->mutex);
		goto out;
	}

	result = le32_to_cpu(head->result);

	/*
	 * Handle an ESTALE
	 * if we're not talking to the authority, send to them
	 * if the authority has changed while we weren't looking,
	 * send to new authority
	 * Otherwise we just have to return an ESTALE
	 */
	if (result == -ESTALE) {
		dout("got ESTALE on request %llu", req->r_tid);
		if (!req->r_inode) {
			/* do nothing; not an authority problem */
		} else if (req->r_direct_mode != USE_AUTH_MDS) {
			/* retry against the authoritative mds */
			dout("not using auth, setting for that now");
			req->r_direct_mode = USE_AUTH_MDS;
			__do_request(mdsc, req);
			mutex_unlock(&mdsc->mutex);
			goto out;
		} else {
			struct ceph_inode_info *ci = ceph_inode(req->r_inode);
			struct ceph_cap *cap = NULL;

			if (req->r_session)
				cap = ceph_get_cap_for_mds(ci,
						   req->r_session->s_mds);

			/* resend if the auth cap moved since we sent */
			dout("already using auth");
			if ((!cap || cap != ci->i_auth_cap) ||
			    (cap->mseq != req->r_sent_on_mseq)) {
				dout("but cap changed, so resending");
				__do_request(mdsc, req);
				mutex_unlock(&mdsc->mutex);
				goto out;
			}
		}
		dout("have to return ESTALE on request %llu", req->r_tid);
	}


	if (head->safe) {
		/* safe (committed) reply: the request is done */
		req->r_got_safe = true;
		__unregister_request(mdsc, req);
		complete_all(&req->r_safe_completion);

		if (req->r_got_unsafe) {
			/*
			 * We already handled the unsafe response, now do the
			 * cleanup.  No need to examine the response; the MDS
			 * doesn't include any result info in the safe
			 * response.  And even if it did, there is nothing
			 * useful we could do with a revised return value.
			 */
			dout("got safe reply %llu, mds%d\n", tid, mds);
			list_del_init(&req->r_unsafe_item);

			/* last unsafe request during umount? */
			if (mdsc->stopping && !__get_oldest_req(mdsc))
				complete_all(&mdsc->safe_umount_waiters);
			mutex_unlock(&mdsc->mutex);
			goto out;
		}
	} else {
		/* unsafe reply: track it until the safe one arrives */
		req->r_got_unsafe = true;
		list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
	}

	dout("handle_reply tid %lld result %d\n", tid, result);
	rinfo = &req->r_reply_info;
	err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
	mutex_unlock(&mdsc->mutex);

	mutex_lock(&session->s_mutex);
	if (err < 0) {
		pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
		ceph_msg_dump(msg);
		goto out_err;
	}

	/* snap trace: apply under snap_rwsem for write, keep read for fill */
	if (rinfo->snapblob_len) {
		down_write(&mdsc->snap_rwsem);
		ceph_update_snap_trace(mdsc, rinfo->snapblob,
			       rinfo->snapblob + rinfo->snapblob_len,
			       le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
		downgrade_write(&mdsc->snap_rwsem);
	} else {
		down_read(&mdsc->snap_rwsem);
	}

	/* insert trace into our cache */
	mutex_lock(&req->r_fill_mutex);
	err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
	if (err == 0) {
		if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK &&
		    rinfo->dir_nr)
			ceph_readdir_prepopulate(req, req->r_session);
		ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
	}
	mutex_unlock(&req->r_fill_mutex);

	up_read(&mdsc->snap_rwsem);
out_err:
	/* record the outcome unless the waiter already aborted the request */
	mutex_lock(&mdsc->mutex);
	if (!req->r_aborted) {
		if (err) {
			req->r_err = err;
		} else {
			req->r_reply = msg;
			ceph_msg_get(msg);
			req->r_got_result = true;
		}
	} else {
		dout("reply arrived after request %lld was aborted\n", tid);
	}
	mutex_unlock(&mdsc->mutex);

	ceph_add_cap_releases(mdsc, req->r_session);
	mutex_unlock(&session->s_mutex);

	/* kick calling process */
	complete_request(mdsc, req);
out:
	/* drop the lookup reference taken by __lookup_request */
	ceph_mdsc_put_request(req);
	return;
}
2210 | 2214 | ||
2211 | 2215 | ||
2212 | 2216 | ||
/*
 * handle mds notification that our request has been forwarded.
 *
 * The payload carries the mds the request was forwarded to and a
 * forward sequence number; stale (already-seen) forwards are ignored,
 * otherwise we resend the request ourselves to the new mds.
 */
static void handle_forward(struct ceph_mds_client *mdsc,
			   struct ceph_mds_session *session,
			   struct ceph_msg *msg)
{
	struct ceph_mds_request *req;
	u64 tid = le64_to_cpu(msg->hdr.tid);
	u32 next_mds;
	u32 fwd_seq;
	int err = -EINVAL;
	void *p = msg->front.iov_base;
	void *end = p + msg->front.iov_len;

	/* decode: two u32s (target mds, forward seq); jumps to "bad" if short */
	ceph_decode_need(&p, end, 2*sizeof(u32), bad);
	next_mds = ceph_decode_32(&p);
	fwd_seq = ceph_decode_32(&p);

	mutex_lock(&mdsc->mutex);
	req = __lookup_request(mdsc, tid);
	if (!req) {
		dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
		goto out;  /* dup reply? */
	}

	if (req->r_aborted) {
		/* waiter gave up; just drop the request */
		dout("forward tid %llu aborted, unregistering\n", tid);
		__unregister_request(mdsc, req);
	} else if (fwd_seq <= req->r_num_fwd) {
		/* stale forward notification; ignore */
		dout("forward tid %llu to mds%d - old seq %d <= %d\n",
		     tid, next_mds, req->r_num_fwd, fwd_seq);
	} else {
		/* resend. forward race not possible; mds would drop */
		dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
		BUG_ON(req->r_err);
		BUG_ON(req->r_got_result);
		req->r_num_fwd = fwd_seq;
		req->r_resend_mds = next_mds;
		put_request_session(req);
		__do_request(mdsc, req);
	}
	/* drop the lookup reference */
	ceph_mdsc_put_request(req);
out:
	mutex_unlock(&mdsc->mutex);
	return;

bad:
	pr_err("mdsc_handle_forward decode error err=%d\n", err);
}
2263 | 2267 | ||
/*
 * handle a mds session control message
 *
 * Updates session state under session->s_mutex; requests waiting on the
 * session are woken (outside s_mutex, under mdsc->mutex) when the state
 * change could unblock them.
 */
static void handle_session(struct ceph_mds_session *session,
			   struct ceph_msg *msg)
{
	struct ceph_mds_client *mdsc = session->s_mdsc;
	u32 op;
	u64 seq;
	int mds = session->s_mds;
	struct ceph_mds_session_head *h = msg->front.iov_base;
	int wake = 0;	/* wake requests waiting on this session? */

	/* decode */
	if (msg->front.iov_len != sizeof(*h))
		goto bad;
	op = le32_to_cpu(h->op);
	seq = le64_to_cpu(h->seq);

	mutex_lock(&mdsc->mutex);
	if (op == CEPH_SESSION_CLOSE)
		__unregister_session(mdsc, session);
	/* FIXME: this ttl calculation is generous */
	session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
	mutex_unlock(&mdsc->mutex);

	mutex_lock(&session->s_mutex);

	dout("handle_session mds%d %s %p state %s seq %llu\n",
	     mds, ceph_session_op_name(op), session,
	     session_state_name(session->s_state), seq);

	/* any message from a hung mds means it is alive again */
	if (session->s_state == CEPH_MDS_SESSION_HUNG) {
		session->s_state = CEPH_MDS_SESSION_OPEN;
		pr_info("mds%d came back\n", session->s_mds);
	}

	switch (op) {
	case CEPH_SESSION_OPEN:
		if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
			pr_info("mds%d reconnect success\n", session->s_mds);
		session->s_state = CEPH_MDS_SESSION_OPEN;
		renewed_caps(mdsc, session, 0);
		wake = 1;
		if (mdsc->stopping)
			__close_session(mdsc, session);
		break;

	case CEPH_SESSION_RENEWCAPS:
		/* only count the renewal if it matches our latest attempt */
		if (session->s_renew_seq == seq)
			renewed_caps(mdsc, session, 1);
		break;

	case CEPH_SESSION_CLOSE:
		if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
			pr_info("mds%d reconnect denied\n", session->s_mds);
		remove_session_caps(session);
		wake = 1; /* for good measure */
		wake_up_all(&mdsc->session_close_wq);
		kick_requests(mdsc, mds);
		break;

	case CEPH_SESSION_STALE:
		pr_info("mds%d caps went stale, renewing\n",
			session->s_mds);
		/*
		 * bump the cap generation and zero the ttl under the
		 * dedicated gen/ttl lock (not s_cap_lock, to avoid lock
		 * inversion) so stale caps are ignored until renewed
		 */
		spin_lock(&session->s_gen_ttl_lock);
		session->s_cap_gen++;
		session->s_cap_ttl = 0;
		spin_unlock(&session->s_gen_ttl_lock);
		send_renew_caps(mdsc, session);
		break;

	case CEPH_SESSION_RECALL_STATE:
		trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
		break;

	default:
		pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
		WARN_ON(1);
	}

	mutex_unlock(&session->s_mutex);
	if (wake) {
		mutex_lock(&mdsc->mutex);
		__wake_requests(mdsc, &session->s_waiting);
		mutex_unlock(&mdsc->mutex);
	}
	return;

bad:
	pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
	       (int)msg->front.iov_len);
	ceph_msg_dump(msg);
	return;
}
2359 | 2363 | ||
2360 | 2364 | ||
2361 | /* | 2365 | /* |
2362 | * called under session->mutex. | 2366 | * called under session->mutex. |
2363 | */ | 2367 | */ |
2364 | static void replay_unsafe_requests(struct ceph_mds_client *mdsc, | 2368 | static void replay_unsafe_requests(struct ceph_mds_client *mdsc, |
2365 | struct ceph_mds_session *session) | 2369 | struct ceph_mds_session *session) |
2366 | { | 2370 | { |
2367 | struct ceph_mds_request *req, *nreq; | 2371 | struct ceph_mds_request *req, *nreq; |
2368 | int err; | 2372 | int err; |
2369 | 2373 | ||
2370 | dout("replay_unsafe_requests mds%d\n", session->s_mds); | 2374 | dout("replay_unsafe_requests mds%d\n", session->s_mds); |
2371 | 2375 | ||
2372 | mutex_lock(&mdsc->mutex); | 2376 | mutex_lock(&mdsc->mutex); |
2373 | list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) { | 2377 | list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) { |
2374 | err = __prepare_send_request(mdsc, req, session->s_mds); | 2378 | err = __prepare_send_request(mdsc, req, session->s_mds); |
2375 | if (!err) { | 2379 | if (!err) { |
2376 | ceph_msg_get(req->r_request); | 2380 | ceph_msg_get(req->r_request); |
2377 | ceph_con_send(&session->s_con, req->r_request); | 2381 | ceph_con_send(&session->s_con, req->r_request); |
2378 | } | 2382 | } |
2379 | } | 2383 | } |
2380 | mutex_unlock(&mdsc->mutex); | 2384 | mutex_unlock(&mdsc->mutex); |
2381 | } | 2385 | } |
2382 | 2386 | ||
2383 | /* | 2387 | /* |
2384 | * Encode information about a cap for a reconnect with the MDS. | 2388 | * Encode information about a cap for a reconnect with the MDS. |
2385 | */ | 2389 | */ |
2386 | static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, | 2390 | static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, |
2387 | void *arg) | 2391 | void *arg) |
2388 | { | 2392 | { |
2389 | union { | 2393 | union { |
2390 | struct ceph_mds_cap_reconnect v2; | 2394 | struct ceph_mds_cap_reconnect v2; |
2391 | struct ceph_mds_cap_reconnect_v1 v1; | 2395 | struct ceph_mds_cap_reconnect_v1 v1; |
2392 | } rec; | 2396 | } rec; |
2393 | size_t reclen; | 2397 | size_t reclen; |
2394 | struct ceph_inode_info *ci; | 2398 | struct ceph_inode_info *ci; |
2395 | struct ceph_reconnect_state *recon_state = arg; | 2399 | struct ceph_reconnect_state *recon_state = arg; |
2396 | struct ceph_pagelist *pagelist = recon_state->pagelist; | 2400 | struct ceph_pagelist *pagelist = recon_state->pagelist; |
2397 | char *path; | 2401 | char *path; |
2398 | int pathlen, err; | 2402 | int pathlen, err; |
2399 | u64 pathbase; | 2403 | u64 pathbase; |
2400 | struct dentry *dentry; | 2404 | struct dentry *dentry; |
2401 | 2405 | ||
2402 | ci = cap->ci; | 2406 | ci = cap->ci; |
2403 | 2407 | ||
2404 | dout(" adding %p ino %llx.%llx cap %p %lld %s\n", | 2408 | dout(" adding %p ino %llx.%llx cap %p %lld %s\n", |
2405 | inode, ceph_vinop(inode), cap, cap->cap_id, | 2409 | inode, ceph_vinop(inode), cap, cap->cap_id, |
2406 | ceph_cap_string(cap->issued)); | 2410 | ceph_cap_string(cap->issued)); |
2407 | err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); | 2411 | err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); |
2408 | if (err) | 2412 | if (err) |
2409 | return err; | 2413 | return err; |
2410 | 2414 | ||
2411 | dentry = d_find_alias(inode); | 2415 | dentry = d_find_alias(inode); |
2412 | if (dentry) { | 2416 | if (dentry) { |
2413 | path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0); | 2417 | path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0); |
2414 | if (IS_ERR(path)) { | 2418 | if (IS_ERR(path)) { |
2415 | err = PTR_ERR(path); | 2419 | err = PTR_ERR(path); |
2416 | goto out_dput; | 2420 | goto out_dput; |
2417 | } | 2421 | } |
2418 | } else { | 2422 | } else { |
2419 | path = NULL; | 2423 | path = NULL; |
2420 | pathlen = 0; | 2424 | pathlen = 0; |
2421 | } | 2425 | } |
2422 | err = ceph_pagelist_encode_string(pagelist, path, pathlen); | 2426 | err = ceph_pagelist_encode_string(pagelist, path, pathlen); |
2423 | if (err) | 2427 | if (err) |
2424 | goto out_free; | 2428 | goto out_free; |
2425 | 2429 | ||
2426 | spin_lock(&ci->i_ceph_lock); | 2430 | spin_lock(&ci->i_ceph_lock); |
2427 | cap->seq = 0; /* reset cap seq */ | 2431 | cap->seq = 0; /* reset cap seq */ |
2428 | cap->issue_seq = 0; /* and issue_seq */ | 2432 | cap->issue_seq = 0; /* and issue_seq */ |
2429 | 2433 | ||
2430 | if (recon_state->flock) { | 2434 | if (recon_state->flock) { |
2431 | rec.v2.cap_id = cpu_to_le64(cap->cap_id); | 2435 | rec.v2.cap_id = cpu_to_le64(cap->cap_id); |
2432 | rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); | 2436 | rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); |
2433 | rec.v2.issued = cpu_to_le32(cap->issued); | 2437 | rec.v2.issued = cpu_to_le32(cap->issued); |
2434 | rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); | 2438 | rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); |
2435 | rec.v2.pathbase = cpu_to_le64(pathbase); | 2439 | rec.v2.pathbase = cpu_to_le64(pathbase); |
2436 | rec.v2.flock_len = 0; | 2440 | rec.v2.flock_len = 0; |
2437 | reclen = sizeof(rec.v2); | 2441 | reclen = sizeof(rec.v2); |
2438 | } else { | 2442 | } else { |
2439 | rec.v1.cap_id = cpu_to_le64(cap->cap_id); | 2443 | rec.v1.cap_id = cpu_to_le64(cap->cap_id); |
2440 | rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); | 2444 | rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); |
2441 | rec.v1.issued = cpu_to_le32(cap->issued); | 2445 | rec.v1.issued = cpu_to_le32(cap->issued); |
2442 | rec.v1.size = cpu_to_le64(inode->i_size); | 2446 | rec.v1.size = cpu_to_le64(inode->i_size); |
2443 | ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime); | 2447 | ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime); |
2444 | ceph_encode_timespec(&rec.v1.atime, &inode->i_atime); | 2448 | ceph_encode_timespec(&rec.v1.atime, &inode->i_atime); |
2445 | rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); | 2449 | rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); |
2446 | rec.v1.pathbase = cpu_to_le64(pathbase); | 2450 | rec.v1.pathbase = cpu_to_le64(pathbase); |
2447 | reclen = sizeof(rec.v1); | 2451 | reclen = sizeof(rec.v1); |
2448 | } | 2452 | } |
2449 | spin_unlock(&ci->i_ceph_lock); | 2453 | spin_unlock(&ci->i_ceph_lock); |
2450 | 2454 | ||
2451 | if (recon_state->flock) { | 2455 | if (recon_state->flock) { |
2452 | int num_fcntl_locks, num_flock_locks; | 2456 | int num_fcntl_locks, num_flock_locks; |
2453 | struct ceph_pagelist_cursor trunc_point; | 2457 | struct ceph_pagelist_cursor trunc_point; |
2454 | 2458 | ||
2455 | ceph_pagelist_set_cursor(pagelist, &trunc_point); | 2459 | ceph_pagelist_set_cursor(pagelist, &trunc_point); |
2456 | do { | 2460 | do { |
2457 | lock_flocks(); | 2461 | lock_flocks(); |
2458 | ceph_count_locks(inode, &num_fcntl_locks, | 2462 | ceph_count_locks(inode, &num_fcntl_locks, |
2459 | &num_flock_locks); | 2463 | &num_flock_locks); |
2460 | rec.v2.flock_len = (2*sizeof(u32) + | 2464 | rec.v2.flock_len = (2*sizeof(u32) + |
2461 | (num_fcntl_locks+num_flock_locks) * | 2465 | (num_fcntl_locks+num_flock_locks) * |
2462 | sizeof(struct ceph_filelock)); | 2466 | sizeof(struct ceph_filelock)); |
2463 | unlock_flocks(); | 2467 | unlock_flocks(); |
2464 | 2468 | ||
2465 | /* pre-alloc pagelist */ | 2469 | /* pre-alloc pagelist */ |
2466 | ceph_pagelist_truncate(pagelist, &trunc_point); | 2470 | ceph_pagelist_truncate(pagelist, &trunc_point); |
2467 | err = ceph_pagelist_append(pagelist, &rec, reclen); | 2471 | err = ceph_pagelist_append(pagelist, &rec, reclen); |
2468 | if (!err) | 2472 | if (!err) |
2469 | err = ceph_pagelist_reserve(pagelist, | 2473 | err = ceph_pagelist_reserve(pagelist, |
2470 | rec.v2.flock_len); | 2474 | rec.v2.flock_len); |
2471 | 2475 | ||
2472 | /* encode locks */ | 2476 | /* encode locks */ |
2473 | if (!err) { | 2477 | if (!err) { |
2474 | lock_flocks(); | 2478 | lock_flocks(); |
2475 | err = ceph_encode_locks(inode, | 2479 | err = ceph_encode_locks(inode, |
2476 | pagelist, | 2480 | pagelist, |
2477 | num_fcntl_locks, | 2481 | num_fcntl_locks, |
2478 | num_flock_locks); | 2482 | num_flock_locks); |
2479 | unlock_flocks(); | 2483 | unlock_flocks(); |
2480 | } | 2484 | } |
2481 | } while (err == -ENOSPC); | 2485 | } while (err == -ENOSPC); |
2482 | } else { | 2486 | } else { |
2483 | err = ceph_pagelist_append(pagelist, &rec, reclen); | 2487 | err = ceph_pagelist_append(pagelist, &rec, reclen); |
2484 | } | 2488 | } |
2485 | 2489 | ||
2486 | out_free: | 2490 | out_free: |
2487 | kfree(path); | 2491 | kfree(path); |
2488 | out_dput: | 2492 | out_dput: |
2489 | dput(dentry); | 2493 | dput(dentry); |
2490 | return err; | 2494 | return err; |
2491 | } | 2495 | } |
2492 | 2496 | ||
2493 | 2497 | ||
/*
 * If an MDS fails and recovers, clients need to reconnect in order to
 * reestablish shared state.  This includes all caps issued through
 * this session _and_ the snap_realm hierarchy.  Because it's not
 * clear which snap realms the mds cares about, we send everything we
 * know about.. that ensures we'll then get any new info the
 * recovering MDS might have.
 *
 * This is a relatively heavyweight operation, but it's rare.
 *
 * called with mdsc->mutex held.
 */
static void send_mds_reconnect(struct ceph_mds_client *mdsc,
			       struct ceph_mds_session *session)
{
	struct ceph_msg *reply;
	struct rb_node *p;
	int mds = session->s_mds;
	int err = -ENOMEM;		/* default for the two alloc-failure paths */
	struct ceph_pagelist *pagelist;
	struct ceph_reconnect_state recon_state;

	pr_info("mds%d reconnect start\n", mds);

	/* pagelist accumulates the variable-length cap + snaprealm payload */
	pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
	if (!pagelist)
		goto fail_nopagelist;
	ceph_pagelist_init(pagelist);

	/* front len 0: all data rides in the pagelist attached below */
	reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false);
	if (!reply)
		goto fail_nomsg;

	mutex_lock(&session->s_mutex);
	session->s_state = CEPH_MDS_SESSION_RECONNECTING;
	session->s_seq = 0;	/* message seq restarts with the new connection */

	ceph_con_open(&session->s_con,
		      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));

	/* replay unsafe requests */
	replay_unsafe_requests(mdsc, session);

	/* hold off snap realm changes while we encode realm records below */
	down_read(&mdsc->snap_rwsem);

	dout("session %p state %s\n", session,
	     session_state_name(session->s_state));

	/* drop old cap expires; we're about to reestablish that state */
	discard_cap_releases(mdsc, session);

	/* traverse this session's caps */
	err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
	if (err)
		goto fail;

	recon_state.pagelist = pagelist;
	/* v2 (flock-capable) records only if the peer advertises FLOCK */
	recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
	err = iterate_session_caps(session, encode_caps_cb, &recon_state);
	if (err < 0)
		goto fail;

	/*
	 * snaprealms.  we provide mds with the ino, seq (version), and
	 * parent for all of our realms.  If the mds has any newer info,
	 * it will tell us.
	 */
	for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
		struct ceph_snap_realm *realm =
			rb_entry(p, struct ceph_snap_realm, node);
		struct ceph_mds_snaprealm_reconnect sr_rec;

		dout(" adding snap realm %llx seq %lld parent %llx\n",
		     realm->ino, realm->seq, realm->parent_ino);
		sr_rec.ino = cpu_to_le64(realm->ino);
		sr_rec.seq = cpu_to_le64(realm->seq);
		sr_rec.parent = cpu_to_le64(realm->parent_ino);
		err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
		if (err)
			goto fail;
	}

	/* hand the pagelist to the message; hdr.version 2 signals v2 records */
	reply->pagelist = pagelist;
	if (recon_state.flock)
		reply->hdr.version = cpu_to_le16(2);
	reply->hdr.data_len = cpu_to_le32(pagelist->length);
	reply->nr_pages = calc_pages_for(0, pagelist->length);
	ceph_con_send(&session->s_con, reply);

	mutex_unlock(&session->s_mutex);

	/* re-take mdsc->mutex (caller's lock was not held across the send) */
	mutex_lock(&mdsc->mutex);
	__wake_requests(mdsc, &session->s_waiting);
	mutex_unlock(&mdsc->mutex);

	up_read(&mdsc->snap_rwsem);
	return;

fail:
	/* unwind in reverse acquisition order; pagelist freed below */
	ceph_msg_put(reply);
	up_read(&mdsc->snap_rwsem);
	mutex_unlock(&session->s_mutex);
fail_nomsg:
	ceph_pagelist_release(pagelist);
	kfree(pagelist);
fail_nopagelist:
	pr_err("error %d preparing reconnect for mds%d\n", err, mds);
	return;
}
2603 | 2607 | ||
2604 | 2608 | ||
/*
 * compare old and new mdsmaps, kicking requests
 * and closing out old connections as necessary
 *
 * called under mdsc->mutex.
 */
static void check_new_map(struct ceph_mds_client *mdsc,
			  struct ceph_mdsmap *newmap,
			  struct ceph_mdsmap *oldmap)
{
	int i;
	int oldstate, newstate;
	struct ceph_mds_session *s;

	dout("check_new_map new %u old %u\n",
	     newmap->m_epoch, oldmap->m_epoch);

	/* pass 1: walk mds ranks we have sessions for in the OLD map */
	for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
		if (mdsc->sessions[i] == NULL)
			continue;
		s = mdsc->sessions[i];
		oldstate = ceph_mdsmap_get_state(oldmap, i);
		newstate = ceph_mdsmap_get_state(newmap, i);

		dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
		     i, ceph_mds_state_name(oldstate),
		     ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
		     ceph_mds_state_name(newstate),
		     ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
		     session_state_name(s->s_state));

		/* did the mds's address change between the two maps? */
		if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
			   ceph_mdsmap_get_addr(newmap, i),
			   sizeof(struct ceph_entity_addr))) {
			if (s->s_state == CEPH_MDS_SESSION_OPENING) {
				/* the session never opened, just close it
				 * out now */
				__wake_requests(mdsc, &s->s_waiting);
				__unregister_session(mdsc, s);
			} else {
				/* just close it */
				/*
				 * lock ordering: s_mutex must be taken
				 * outside mdsc->mutex, so drop and re-take
				 * mdsc->mutex around the acquisition.
				 */
				mutex_unlock(&mdsc->mutex);
				mutex_lock(&s->s_mutex);
				mutex_lock(&mdsc->mutex);
				ceph_con_close(&s->s_con);
				mutex_unlock(&s->s_mutex);
				s->s_state = CEPH_MDS_SESSION_RESTARTING;
			}

			/* kick any requests waiting on the recovering mds */
			kick_requests(mdsc, i);
		} else if (oldstate == newstate) {
			continue;  /* nothing new with this mds */
		}

		/*
		 * send reconnect?
		 */
		if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
		    newstate >= CEPH_MDS_STATE_RECONNECT) {
			/* send_mds_reconnect takes s_mutex itself */
			mutex_unlock(&mdsc->mutex);
			send_mds_reconnect(mdsc, s);
			mutex_lock(&mdsc->mutex);
		}

		/*
		 * kick request on any mds that has gone active.
		 */
		if (oldstate < CEPH_MDS_STATE_ACTIVE &&
		    newstate >= CEPH_MDS_STATE_ACTIVE) {
			/* freshly created mds ranks were never "recovering" */
			if (oldstate != CEPH_MDS_STATE_CREATING &&
			    oldstate != CEPH_MDS_STATE_STARTING)
				pr_info("mds%d recovery completed\n", s->s_mds);
			kick_requests(mdsc, i);
			ceph_kick_flushing_caps(mdsc, s);
			wake_up_session_caps(s, 1);
		}
	}

	/* pass 2: open export-target sessions for any mds laggy in NEW map */
	for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) {
		s = mdsc->sessions[i];
		if (!s)
			continue;
		if (!ceph_mdsmap_is_laggy(newmap, i))
			continue;
		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
		    s->s_state == CEPH_MDS_SESSION_HUNG ||
		    s->s_state == CEPH_MDS_SESSION_CLOSING) {
			dout(" connecting to export targets of laggy mds%d\n",
			     i);
			__open_export_target_sessions(mdsc, s);
		}
	}
}
2699 | 2703 | ||
2700 | 2704 | ||
2701 | 2705 | ||
2702 | /* | 2706 | /* |
2703 | * leases | 2707 | * leases |
2704 | */ | 2708 | */ |
2705 | 2709 | ||
2706 | /* | 2710 | /* |
2707 | * caller must hold session s_mutex, dentry->d_lock | 2711 | * caller must hold session s_mutex, dentry->d_lock |
2708 | */ | 2712 | */ |
2709 | void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry) | 2713 | void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry) |
2710 | { | 2714 | { |
2711 | struct ceph_dentry_info *di = ceph_dentry(dentry); | 2715 | struct ceph_dentry_info *di = ceph_dentry(dentry); |
2712 | 2716 | ||
2713 | ceph_put_mds_session(di->lease_session); | 2717 | ceph_put_mds_session(di->lease_session); |
2714 | di->lease_session = NULL; | 2718 | di->lease_session = NULL; |
2715 | } | 2719 | } |
2716 | 2720 | ||
/*
 * Handle an incoming CEPH_MSG_CLIENT_LEASE from the MDS: revoke or
 * renew the dentry lease named in the message, replying with a
 * REVOKE_ACK (reusing the same message) when a release is required.
 */
static void handle_lease(struct ceph_mds_client *mdsc,
			 struct ceph_mds_session *session,
			 struct ceph_msg *msg)
{
	struct super_block *sb = mdsc->fsc->sb;
	struct inode *inode;
	struct dentry *parent, *dentry;
	struct ceph_dentry_info *di;
	int mds = session->s_mds;
	struct ceph_mds_lease *h = msg->front.iov_base;
	u32 seq;
	struct ceph_vino vino;
	struct qstr dname;
	int release = 0;

	dout("handle_lease from mds%d\n", mds);

	/* decode */
	/* fixed header is followed by a le32 name length, then the name */
	if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
		goto bad;
	vino.ino = le64_to_cpu(h->ino);
	vino.snap = CEPH_NOSNAP;
	seq = le32_to_cpu(h->seq);
	dname.name = (void *)h + sizeof(*h) + sizeof(u32);
	dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
	/* embedded length must match what the message size implies */
	if (dname.len != get_unaligned_le32(h+1))
		goto bad;

	mutex_lock(&session->s_mutex);
	session->s_seq++;

	/* lookup inode */
	inode = ceph_find_inode(sb, vino);
	dout("handle_lease %s, ino %llx %p %.*s\n",
	     ceph_lease_op_name(h->action), vino.ino, inode,
	     dname.len, dname.name);
	if (inode == NULL) {
		dout("handle_lease no inode %llx\n", vino.ino);
		goto release;
	}

	/* dentry */
	parent = d_find_alias(inode);
	if (!parent) {
		dout("no parent dentry on inode %p\n", inode);
		WARN_ON(1);
		goto release;  /* hrm... */
	}
	dname.hash = full_name_hash(dname.name, dname.len);
	dentry = d_lookup(parent, &dname);
	dput(parent);
	if (!dentry)
		goto release;

	spin_lock(&dentry->d_lock);
	di = ceph_dentry(dentry);
	switch (h->action) {
	case CEPH_MDS_LEASE_REVOKE:
		if (di->lease_session == session) {
			/* ack with our newer seq if we raced ahead of the mds */
			if (ceph_seq_cmp(di->lease_seq, seq) > 0)
				h->seq = cpu_to_le32(di->lease_seq);
			__ceph_mdsc_drop_dentry_lease(dentry);
		}
		release = 1;
		break;

	case CEPH_MDS_LEASE_RENEW:
		/* only renew if we asked for it and nothing has changed since */
		if (di->lease_session == session &&
		    di->lease_gen == session->s_cap_gen &&
		    di->lease_renew_from &&
		    di->lease_renew_after == 0) {
			unsigned long duration =
				le32_to_cpu(h->duration_ms) * HZ / 1000;

			di->lease_seq = seq;
			dentry->d_time = di->lease_renew_from + duration;
			/* schedule next renewal at the halfway point */
			di->lease_renew_after = di->lease_renew_from +
				(duration >> 1);
			di->lease_renew_from = 0;
		}
		break;
	}
	spin_unlock(&dentry->d_lock);
	dput(dentry);

	/* REVOKE falls through into the ack below; RENEW skips it */
	if (!release)
		goto out;

release:
	/* let's just reuse the same message */
	h->action = CEPH_MDS_LEASE_REVOKE_ACK;
	ceph_msg_get(msg);	/* extra ref: caller still owns msg */
	ceph_con_send(&session->s_con, msg);

out:
	iput(inode);	/* iput(NULL) is a no-op on the no-inode path */
	mutex_unlock(&session->s_mutex);
	return;

bad:
	pr_err("corrupt lease message\n");
	ceph_msg_dump(msg);
}
2820 | 2824 | ||
2821 | void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, | 2825 | void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, |
2822 | struct inode *inode, | 2826 | struct inode *inode, |
2823 | struct dentry *dentry, char action, | 2827 | struct dentry *dentry, char action, |
2824 | u32 seq) | 2828 | u32 seq) |
2825 | { | 2829 | { |
2826 | struct ceph_msg *msg; | 2830 | struct ceph_msg *msg; |
2827 | struct ceph_mds_lease *lease; | 2831 | struct ceph_mds_lease *lease; |
2828 | int len = sizeof(*lease) + sizeof(u32); | 2832 | int len = sizeof(*lease) + sizeof(u32); |
2829 | int dnamelen = 0; | 2833 | int dnamelen = 0; |
2830 | 2834 | ||
2831 | dout("lease_send_msg inode %p dentry %p %s to mds%d\n", | 2835 | dout("lease_send_msg inode %p dentry %p %s to mds%d\n", |
2832 | inode, dentry, ceph_lease_op_name(action), session->s_mds); | 2836 | inode, dentry, ceph_lease_op_name(action), session->s_mds); |
2833 | dnamelen = dentry->d_name.len; | 2837 | dnamelen = dentry->d_name.len; |
2834 | len += dnamelen; | 2838 | len += dnamelen; |
2835 | 2839 | ||
2836 | msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false); | 2840 | msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false); |
2837 | if (!msg) | 2841 | if (!msg) |
2838 | return; | 2842 | return; |
2839 | lease = msg->front.iov_base; | 2843 | lease = msg->front.iov_base; |
2840 | lease->action = action; | 2844 | lease->action = action; |
2841 | lease->ino = cpu_to_le64(ceph_vino(inode).ino); | 2845 | lease->ino = cpu_to_le64(ceph_vino(inode).ino); |
2842 | lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); | 2846 | lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); |
2843 | lease->seq = cpu_to_le32(seq); | 2847 | lease->seq = cpu_to_le32(seq); |
2844 | put_unaligned_le32(dnamelen, lease + 1); | 2848 | put_unaligned_le32(dnamelen, lease + 1); |
2845 | memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen); | 2849 | memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen); |
2846 | 2850 | ||
2847 | /* | 2851 | /* |
2848 | * if this is a preemptive lease RELEASE, no need to | 2852 | * if this is a preemptive lease RELEASE, no need to |
2849 | * flush request stream, since the actual request will | 2853 | * flush request stream, since the actual request will |
2850 | * soon follow. | 2854 | * soon follow. |
2851 | */ | 2855 | */ |
2852 | msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE); | 2856 | msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE); |
2853 | 2857 | ||
2854 | ceph_con_send(&session->s_con, msg); | 2858 | ceph_con_send(&session->s_con, msg); |
2855 | } | 2859 | } |
2856 | 2860 | ||
/*
 * Preemptively release a lease we expect to invalidate anyway.
 * Pass @inode always, @dentry is optional.
 */
void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
			     struct dentry *dentry)
{
	struct ceph_dentry_info *di;
	struct ceph_mds_session *session;
	u32 seq;

	BUG_ON(inode == NULL);
	BUG_ON(dentry == NULL);

	/* is dentry lease valid? */
	spin_lock(&dentry->d_lock);
	di = ceph_dentry(dentry);
	/* no info, no session, stale generation, or already expired -> nothing to do */
	if (!di || !di->lease_session ||
	    di->lease_session->s_mds < 0 ||
	    di->lease_gen != di->lease_session->s_cap_gen ||
	    !time_before(jiffies, dentry->d_time)) {
		dout("lease_release inode %p dentry %p -- "
		     "no lease\n",
		     inode, dentry);
		spin_unlock(&dentry->d_lock);
		return;
	}

	/* we do have a lease on this dentry; note mds and seq */
	/* grab a session ref and snapshot seq before dropping the lease,
	 * since __ceph_mdsc_drop_dentry_lease() clears both */
	session = ceph_get_mds_session(di->lease_session);
	seq = di->lease_seq;
	__ceph_mdsc_drop_dentry_lease(dentry);
	spin_unlock(&dentry->d_lock);

	/* send outside d_lock; message path may sleep */
	dout("lease_release inode %p dentry %p to mds%d\n",
	     inode, dentry, session->s_mds);
	ceph_mdsc_lease_send_msg(session, inode, dentry,
				 CEPH_MDS_LEASE_RELEASE, seq);
	ceph_put_mds_session(session);
}
2897 | 2901 | ||
/*
 * drop all leases (and dentry refs) in preparation for umount
 */
static void drop_leases(struct ceph_mds_client *mdsc)
{
	int i;

	dout("drop_leases\n");
	mutex_lock(&mdsc->mutex);
	for (i = 0; i < mdsc->max_sessions; i++) {
		struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
		if (!s)
			continue;
		/*
		 * drop mdsc->mutex before taking s_mutex (lock ordering),
		 * then lock/unlock s_mutex with an intentionally empty
		 * critical section — NOTE(review): presumably this just
		 * synchronizes with any work currently holding the session
		 * mutex; confirm against the umount path.
		 */
		mutex_unlock(&mdsc->mutex);
		mutex_lock(&s->s_mutex);
		mutex_unlock(&s->s_mutex);
		ceph_put_mds_session(s);
		mutex_lock(&mdsc->mutex);
	}
	mutex_unlock(&mdsc->mutex);
}
2919 | 2923 | ||
2920 | 2924 | ||
2921 | 2925 | ||
2922 | /* | 2926 | /* |
2923 | * delayed work -- periodically trim expired leases, renew caps with mds | 2927 | * delayed work -- periodically trim expired leases, renew caps with mds |
2924 | */ | 2928 | */ |
2925 | static void schedule_delayed(struct ceph_mds_client *mdsc) | 2929 | static void schedule_delayed(struct ceph_mds_client *mdsc) |
2926 | { | 2930 | { |
2927 | int delay = 5; | 2931 | int delay = 5; |
2928 | unsigned hz = round_jiffies_relative(HZ * delay); | 2932 | unsigned hz = round_jiffies_relative(HZ * delay); |
2929 | schedule_delayed_work(&mdsc->delayed_work, hz); | 2933 | schedule_delayed_work(&mdsc->delayed_work, hz); |
2930 | } | 2934 | } |
2931 | 2935 | ||
/*
 * Periodic work: resend pending session closes, detect hung sessions,
 * renew caps (or just send a keepalive) on each live session, and push
 * out accumulated cap releases.  Re-arms itself at the end.
 */
static void delayed_work(struct work_struct *work)
{
	int i;
	struct ceph_mds_client *mdsc =
		container_of(work, struct ceph_mds_client, delayed_work.work);
	int renew_interval;
	int renew_caps;

	dout("mdsc delayed_work\n");
	ceph_check_delayed_caps(mdsc);

	mutex_lock(&mdsc->mutex);
	/* renew caps once every quarter of the MDS session timeout */
	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
	renew_caps = time_after_eq(jiffies, HZ*renew_interval +
				   mdsc->last_renew_caps);
	if (renew_caps)
		mdsc->last_renew_caps = jiffies;

	for (i = 0; i < mdsc->max_sessions; i++) {
		struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
		if (s == NULL)
			continue;
		if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
			dout("resending session close request for mds%d\n",
			     s->s_mds);
			request_close_session(mdsc, s);
			ceph_put_mds_session(s);
			continue;
		}
		if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
			if (s->s_state == CEPH_MDS_SESSION_OPEN) {
				s->s_state = CEPH_MDS_SESSION_HUNG;
				pr_info("mds%d hung\n", s->s_mds);
			}
		}
		if (s->s_state < CEPH_MDS_SESSION_OPEN) {
			/* this mds is failed or recovering, just wait */
			ceph_put_mds_session(s);
			continue;
		}
		/*
		 * Drop mdsc->mutex before taking s->s_mutex (lock ordering).
		 * The session ref from the lookup keeps s valid meanwhile.
		 */
		mutex_unlock(&mdsc->mutex);

		mutex_lock(&s->s_mutex);
		if (renew_caps)
			send_renew_caps(mdsc, s);
		else
			ceph_con_keepalive(&s->s_con);
		ceph_add_cap_releases(mdsc, s);
		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
		    s->s_state == CEPH_MDS_SESSION_HUNG)
			ceph_send_cap_releases(mdsc, s);
		mutex_unlock(&s->s_mutex);
		ceph_put_mds_session(s);

		mutex_lock(&mdsc->mutex);
	}
	mutex_unlock(&mdsc->mutex);

	/* re-arm the timer for the next pass */
	schedule_delayed(mdsc);
}
2992 | 2996 | ||
2993 | int ceph_mdsc_init(struct ceph_fs_client *fsc) | 2997 | int ceph_mdsc_init(struct ceph_fs_client *fsc) |
2994 | 2998 | ||
2995 | { | 2999 | { |
2996 | struct ceph_mds_client *mdsc; | 3000 | struct ceph_mds_client *mdsc; |
2997 | 3001 | ||
2998 | mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS); | 3002 | mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS); |
2999 | if (!mdsc) | 3003 | if (!mdsc) |
3000 | return -ENOMEM; | 3004 | return -ENOMEM; |
3001 | mdsc->fsc = fsc; | 3005 | mdsc->fsc = fsc; |
3002 | fsc->mdsc = mdsc; | 3006 | fsc->mdsc = mdsc; |
3003 | mutex_init(&mdsc->mutex); | 3007 | mutex_init(&mdsc->mutex); |
3004 | mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); | 3008 | mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); |
3005 | if (mdsc->mdsmap == NULL) | 3009 | if (mdsc->mdsmap == NULL) |
3006 | return -ENOMEM; | 3010 | return -ENOMEM; |
3007 | 3011 | ||
3008 | init_completion(&mdsc->safe_umount_waiters); | 3012 | init_completion(&mdsc->safe_umount_waiters); |
3009 | init_waitqueue_head(&mdsc->session_close_wq); | 3013 | init_waitqueue_head(&mdsc->session_close_wq); |
3010 | INIT_LIST_HEAD(&mdsc->waiting_for_map); | 3014 | INIT_LIST_HEAD(&mdsc->waiting_for_map); |
3011 | mdsc->sessions = NULL; | 3015 | mdsc->sessions = NULL; |
3012 | mdsc->max_sessions = 0; | 3016 | mdsc->max_sessions = 0; |
3013 | mdsc->stopping = 0; | 3017 | mdsc->stopping = 0; |
3014 | init_rwsem(&mdsc->snap_rwsem); | 3018 | init_rwsem(&mdsc->snap_rwsem); |
3015 | mdsc->snap_realms = RB_ROOT; | 3019 | mdsc->snap_realms = RB_ROOT; |
3016 | INIT_LIST_HEAD(&mdsc->snap_empty); | 3020 | INIT_LIST_HEAD(&mdsc->snap_empty); |
3017 | spin_lock_init(&mdsc->snap_empty_lock); | 3021 | spin_lock_init(&mdsc->snap_empty_lock); |
3018 | mdsc->last_tid = 0; | 3022 | mdsc->last_tid = 0; |
3019 | mdsc->request_tree = RB_ROOT; | 3023 | mdsc->request_tree = RB_ROOT; |
3020 | INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); | 3024 | INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); |
3021 | mdsc->last_renew_caps = jiffies; | 3025 | mdsc->last_renew_caps = jiffies; |
3022 | INIT_LIST_HEAD(&mdsc->cap_delay_list); | 3026 | INIT_LIST_HEAD(&mdsc->cap_delay_list); |
3023 | spin_lock_init(&mdsc->cap_delay_lock); | 3027 | spin_lock_init(&mdsc->cap_delay_lock); |
3024 | INIT_LIST_HEAD(&mdsc->snap_flush_list); | 3028 | INIT_LIST_HEAD(&mdsc->snap_flush_list); |
3025 | spin_lock_init(&mdsc->snap_flush_lock); | 3029 | spin_lock_init(&mdsc->snap_flush_lock); |
3026 | mdsc->cap_flush_seq = 0; | 3030 | mdsc->cap_flush_seq = 0; |
3027 | INIT_LIST_HEAD(&mdsc->cap_dirty); | 3031 | INIT_LIST_HEAD(&mdsc->cap_dirty); |
3028 | INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); | 3032 | INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); |
3029 | mdsc->num_cap_flushing = 0; | 3033 | mdsc->num_cap_flushing = 0; |
3030 | spin_lock_init(&mdsc->cap_dirty_lock); | 3034 | spin_lock_init(&mdsc->cap_dirty_lock); |
3031 | init_waitqueue_head(&mdsc->cap_flushing_wq); | 3035 | init_waitqueue_head(&mdsc->cap_flushing_wq); |
3032 | spin_lock_init(&mdsc->dentry_lru_lock); | 3036 | spin_lock_init(&mdsc->dentry_lru_lock); |
3033 | INIT_LIST_HEAD(&mdsc->dentry_lru); | 3037 | INIT_LIST_HEAD(&mdsc->dentry_lru); |
3034 | 3038 | ||
3035 | ceph_caps_init(mdsc); | 3039 | ceph_caps_init(mdsc); |
3036 | ceph_adjust_min_caps(mdsc, fsc->min_caps); | 3040 | ceph_adjust_min_caps(mdsc, fsc->min_caps); |
3037 | 3041 | ||
3038 | return 0; | 3042 | return 0; |
3039 | } | 3043 | } |
3040 | 3044 | ||
3041 | /* | 3045 | /* |
3042 | * Wait for safe replies on open mds requests. If we time out, drop | 3046 | * Wait for safe replies on open mds requests. If we time out, drop |
3043 | * all requests from the tree to avoid dangling dentry refs. | 3047 | * all requests from the tree to avoid dangling dentry refs. |
3044 | */ | 3048 | */ |
3045 | static void wait_requests(struct ceph_mds_client *mdsc) | 3049 | static void wait_requests(struct ceph_mds_client *mdsc) |
3046 | { | 3050 | { |
3047 | struct ceph_mds_request *req; | 3051 | struct ceph_mds_request *req; |
3048 | struct ceph_fs_client *fsc = mdsc->fsc; | 3052 | struct ceph_fs_client *fsc = mdsc->fsc; |
3049 | 3053 | ||
3050 | mutex_lock(&mdsc->mutex); | 3054 | mutex_lock(&mdsc->mutex); |
3051 | if (__get_oldest_req(mdsc)) { | 3055 | if (__get_oldest_req(mdsc)) { |
3052 | mutex_unlock(&mdsc->mutex); | 3056 | mutex_unlock(&mdsc->mutex); |
3053 | 3057 | ||
3054 | dout("wait_requests waiting for requests\n"); | 3058 | dout("wait_requests waiting for requests\n"); |
3055 | wait_for_completion_timeout(&mdsc->safe_umount_waiters, | 3059 | wait_for_completion_timeout(&mdsc->safe_umount_waiters, |
3056 | fsc->client->options->mount_timeout * HZ); | 3060 | fsc->client->options->mount_timeout * HZ); |
3057 | 3061 | ||
3058 | /* tear down remaining requests */ | 3062 | /* tear down remaining requests */ |
3059 | mutex_lock(&mdsc->mutex); | 3063 | mutex_lock(&mdsc->mutex); |
3060 | while ((req = __get_oldest_req(mdsc))) { | 3064 | while ((req = __get_oldest_req(mdsc))) { |
3061 | dout("wait_requests timed out on tid %llu\n", | 3065 | dout("wait_requests timed out on tid %llu\n", |
3062 | req->r_tid); | 3066 | req->r_tid); |
3063 | __unregister_request(mdsc, req); | 3067 | __unregister_request(mdsc, req); |
3064 | } | 3068 | } |
3065 | } | 3069 | } |
3066 | mutex_unlock(&mdsc->mutex); | 3070 | mutex_unlock(&mdsc->mutex); |
3067 | dout("wait_requests done\n"); | 3071 | dout("wait_requests done\n"); |
3068 | } | 3072 | } |
3069 | 3073 | ||
3070 | /* | 3074 | /* |
3071 | * called before mount is ro, and before dentries are torn down. | 3075 | * called before mount is ro, and before dentries are torn down. |
3072 | * (hmm, does this still race with new lookups?) | 3076 | * (hmm, does this still race with new lookups?) |
3073 | */ | 3077 | */ |
3074 | void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) | 3078 | void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) |
3075 | { | 3079 | { |
3076 | dout("pre_umount\n"); | 3080 | dout("pre_umount\n"); |
3077 | mdsc->stopping = 1; | 3081 | mdsc->stopping = 1; |
3078 | 3082 | ||
3079 | drop_leases(mdsc); | 3083 | drop_leases(mdsc); |
3080 | ceph_flush_dirty_caps(mdsc); | 3084 | ceph_flush_dirty_caps(mdsc); |
3081 | wait_requests(mdsc); | 3085 | wait_requests(mdsc); |
3082 | 3086 | ||
3083 | /* | 3087 | /* |
3084 | * wait for reply handlers to drop their request refs and | 3088 | * wait for reply handlers to drop their request refs and |
3085 | * their inode/dcache refs | 3089 | * their inode/dcache refs |
3086 | */ | 3090 | */ |
3087 | ceph_msgr_flush(); | 3091 | ceph_msgr_flush(); |
3088 | } | 3092 | } |
3089 | 3093 | ||
3090 | /* | 3094 | /* |
3091 | * wait for all write mds requests to flush. | 3095 | * wait for all write mds requests to flush. |
3092 | */ | 3096 | */ |
3093 | static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) | 3097 | static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) |
3094 | { | 3098 | { |
3095 | struct ceph_mds_request *req = NULL, *nextreq; | 3099 | struct ceph_mds_request *req = NULL, *nextreq; |
3096 | struct rb_node *n; | 3100 | struct rb_node *n; |
3097 | 3101 | ||
3098 | mutex_lock(&mdsc->mutex); | 3102 | mutex_lock(&mdsc->mutex); |
3099 | dout("wait_unsafe_requests want %lld\n", want_tid); | 3103 | dout("wait_unsafe_requests want %lld\n", want_tid); |
3100 | restart: | 3104 | restart: |
3101 | req = __get_oldest_req(mdsc); | 3105 | req = __get_oldest_req(mdsc); |
3102 | while (req && req->r_tid <= want_tid) { | 3106 | while (req && req->r_tid <= want_tid) { |
3103 | /* find next request */ | 3107 | /* find next request */ |
3104 | n = rb_next(&req->r_node); | 3108 | n = rb_next(&req->r_node); |
3105 | if (n) | 3109 | if (n) |
3106 | nextreq = rb_entry(n, struct ceph_mds_request, r_node); | 3110 | nextreq = rb_entry(n, struct ceph_mds_request, r_node); |
3107 | else | 3111 | else |
3108 | nextreq = NULL; | 3112 | nextreq = NULL; |
3109 | if ((req->r_op & CEPH_MDS_OP_WRITE)) { | 3113 | if ((req->r_op & CEPH_MDS_OP_WRITE)) { |
3110 | /* write op */ | 3114 | /* write op */ |
3111 | ceph_mdsc_get_request(req); | 3115 | ceph_mdsc_get_request(req); |
3112 | if (nextreq) | 3116 | if (nextreq) |
3113 | ceph_mdsc_get_request(nextreq); | 3117 | ceph_mdsc_get_request(nextreq); |
3114 | mutex_unlock(&mdsc->mutex); | 3118 | mutex_unlock(&mdsc->mutex); |
3115 | dout("wait_unsafe_requests wait on %llu (want %llu)\n", | 3119 | dout("wait_unsafe_requests wait on %llu (want %llu)\n", |
3116 | req->r_tid, want_tid); | 3120 | req->r_tid, want_tid); |
3117 | wait_for_completion(&req->r_safe_completion); | 3121 | wait_for_completion(&req->r_safe_completion); |
3118 | mutex_lock(&mdsc->mutex); | 3122 | mutex_lock(&mdsc->mutex); |
3119 | ceph_mdsc_put_request(req); | 3123 | ceph_mdsc_put_request(req); |
3120 | if (!nextreq) | 3124 | if (!nextreq) |
3121 | break; /* next dne before, so we're done! */ | 3125 | break; /* next dne before, so we're done! */ |
3122 | if (RB_EMPTY_NODE(&nextreq->r_node)) { | 3126 | if (RB_EMPTY_NODE(&nextreq->r_node)) { |
3123 | /* next request was removed from tree */ | 3127 | /* next request was removed from tree */ |
3124 | ceph_mdsc_put_request(nextreq); | 3128 | ceph_mdsc_put_request(nextreq); |
3125 | goto restart; | 3129 | goto restart; |
3126 | } | 3130 | } |
3127 | ceph_mdsc_put_request(nextreq); /* won't go away */ | 3131 | ceph_mdsc_put_request(nextreq); /* won't go away */ |
3128 | } | 3132 | } |
3129 | req = nextreq; | 3133 | req = nextreq; |
3130 | } | 3134 | } |
3131 | mutex_unlock(&mdsc->mutex); | 3135 | mutex_unlock(&mdsc->mutex); |
3132 | dout("wait_unsafe_requests done\n"); | 3136 | dout("wait_unsafe_requests done\n"); |
3133 | } | 3137 | } |
3134 | 3138 | ||
3135 | void ceph_mdsc_sync(struct ceph_mds_client *mdsc) | 3139 | void ceph_mdsc_sync(struct ceph_mds_client *mdsc) |
3136 | { | 3140 | { |
3137 | u64 want_tid, want_flush; | 3141 | u64 want_tid, want_flush; |
3138 | 3142 | ||
3139 | if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) | 3143 | if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) |
3140 | return; | 3144 | return; |
3141 | 3145 | ||
3142 | dout("sync\n"); | 3146 | dout("sync\n"); |
3143 | mutex_lock(&mdsc->mutex); | 3147 | mutex_lock(&mdsc->mutex); |
3144 | want_tid = mdsc->last_tid; | 3148 | want_tid = mdsc->last_tid; |
3145 | want_flush = mdsc->cap_flush_seq; | 3149 | want_flush = mdsc->cap_flush_seq; |
3146 | mutex_unlock(&mdsc->mutex); | 3150 | mutex_unlock(&mdsc->mutex); |
3147 | dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); | 3151 | dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); |
3148 | 3152 | ||
3149 | ceph_flush_dirty_caps(mdsc); | 3153 | ceph_flush_dirty_caps(mdsc); |
3150 | 3154 | ||
3151 | wait_unsafe_requests(mdsc, want_tid); | 3155 | wait_unsafe_requests(mdsc, want_tid); |
3152 | wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); | 3156 | wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); |
3153 | } | 3157 | } |
3154 | 3158 | ||
3155 | /* | 3159 | /* |
3156 | * true if all sessions are closed, or we force unmount | 3160 | * true if all sessions are closed, or we force unmount |
3157 | */ | 3161 | */ |
3158 | static bool done_closing_sessions(struct ceph_mds_client *mdsc) | 3162 | static bool done_closing_sessions(struct ceph_mds_client *mdsc) |
3159 | { | 3163 | { |
3160 | int i, n = 0; | 3164 | int i, n = 0; |
3161 | 3165 | ||
3162 | if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) | 3166 | if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) |
3163 | return true; | 3167 | return true; |
3164 | 3168 | ||
3165 | mutex_lock(&mdsc->mutex); | 3169 | mutex_lock(&mdsc->mutex); |
3166 | for (i = 0; i < mdsc->max_sessions; i++) | 3170 | for (i = 0; i < mdsc->max_sessions; i++) |
3167 | if (mdsc->sessions[i]) | 3171 | if (mdsc->sessions[i]) |
3168 | n++; | 3172 | n++; |
3169 | mutex_unlock(&mdsc->mutex); | 3173 | mutex_unlock(&mdsc->mutex); |
3170 | return n == 0; | 3174 | return n == 0; |
3171 | } | 3175 | } |
3172 | 3176 | ||
3173 | /* | 3177 | /* |
3174 | * called after sb is ro. | 3178 | * called after sb is ro. |
3175 | */ | 3179 | */ |
3176 | void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) | 3180 | void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) |
3177 | { | 3181 | { |
3178 | struct ceph_mds_session *session; | 3182 | struct ceph_mds_session *session; |
3179 | int i; | 3183 | int i; |
3180 | struct ceph_fs_client *fsc = mdsc->fsc; | 3184 | struct ceph_fs_client *fsc = mdsc->fsc; |
3181 | unsigned long timeout = fsc->client->options->mount_timeout * HZ; | 3185 | unsigned long timeout = fsc->client->options->mount_timeout * HZ; |
3182 | 3186 | ||
3183 | dout("close_sessions\n"); | 3187 | dout("close_sessions\n"); |
3184 | 3188 | ||
3185 | /* close sessions */ | 3189 | /* close sessions */ |
3186 | mutex_lock(&mdsc->mutex); | 3190 | mutex_lock(&mdsc->mutex); |
3187 | for (i = 0; i < mdsc->max_sessions; i++) { | 3191 | for (i = 0; i < mdsc->max_sessions; i++) { |
3188 | session = __ceph_lookup_mds_session(mdsc, i); | 3192 | session = __ceph_lookup_mds_session(mdsc, i); |
3189 | if (!session) | 3193 | if (!session) |
3190 | continue; | 3194 | continue; |
3191 | mutex_unlock(&mdsc->mutex); | 3195 | mutex_unlock(&mdsc->mutex); |
3192 | mutex_lock(&session->s_mutex); | 3196 | mutex_lock(&session->s_mutex); |
3193 | __close_session(mdsc, session); | 3197 | __close_session(mdsc, session); |
3194 | mutex_unlock(&session->s_mutex); | 3198 | mutex_unlock(&session->s_mutex); |
3195 | ceph_put_mds_session(session); | 3199 | ceph_put_mds_session(session); |
3196 | mutex_lock(&mdsc->mutex); | 3200 | mutex_lock(&mdsc->mutex); |
3197 | } | 3201 | } |
3198 | mutex_unlock(&mdsc->mutex); | 3202 | mutex_unlock(&mdsc->mutex); |
3199 | 3203 | ||
3200 | dout("waiting for sessions to close\n"); | 3204 | dout("waiting for sessions to close\n"); |
3201 | wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc), | 3205 | wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc), |
3202 | timeout); | 3206 | timeout); |
3203 | 3207 | ||
3204 | /* tear down remaining sessions */ | 3208 | /* tear down remaining sessions */ |
3205 | mutex_lock(&mdsc->mutex); | 3209 | mutex_lock(&mdsc->mutex); |
3206 | for (i = 0; i < mdsc->max_sessions; i++) { | 3210 | for (i = 0; i < mdsc->max_sessions; i++) { |
3207 | if (mdsc->sessions[i]) { | 3211 | if (mdsc->sessions[i]) { |
3208 | session = get_session(mdsc->sessions[i]); | 3212 | session = get_session(mdsc->sessions[i]); |
3209 | __unregister_session(mdsc, session); | 3213 | __unregister_session(mdsc, session); |
3210 | mutex_unlock(&mdsc->mutex); | 3214 | mutex_unlock(&mdsc->mutex); |
3211 | mutex_lock(&session->s_mutex); | 3215 | mutex_lock(&session->s_mutex); |
3212 | remove_session_caps(session); | 3216 | remove_session_caps(session); |
3213 | mutex_unlock(&session->s_mutex); | 3217 | mutex_unlock(&session->s_mutex); |
3214 | ceph_put_mds_session(session); | 3218 | ceph_put_mds_session(session); |
3215 | mutex_lock(&mdsc->mutex); | 3219 | mutex_lock(&mdsc->mutex); |
3216 | } | 3220 | } |
3217 | } | 3221 | } |
3218 | WARN_ON(!list_empty(&mdsc->cap_delay_list)); | 3222 | WARN_ON(!list_empty(&mdsc->cap_delay_list)); |
3219 | mutex_unlock(&mdsc->mutex); | 3223 | mutex_unlock(&mdsc->mutex); |
3220 | 3224 | ||
3221 | ceph_cleanup_empty_realms(mdsc); | 3225 | ceph_cleanup_empty_realms(mdsc); |
3222 | 3226 | ||
3223 | cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ | 3227 | cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ |
3224 | 3228 | ||
3225 | dout("stopped\n"); | 3229 | dout("stopped\n"); |
3226 | } | 3230 | } |
3227 | 3231 | ||
3228 | static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) | 3232 | static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) |
3229 | { | 3233 | { |
3230 | dout("stop\n"); | 3234 | dout("stop\n"); |
3231 | cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ | 3235 | cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ |
3232 | if (mdsc->mdsmap) | 3236 | if (mdsc->mdsmap) |
3233 | ceph_mdsmap_destroy(mdsc->mdsmap); | 3237 | ceph_mdsmap_destroy(mdsc->mdsmap); |
3234 | kfree(mdsc->sessions); | 3238 | kfree(mdsc->sessions); |
3235 | ceph_caps_finalize(mdsc); | 3239 | ceph_caps_finalize(mdsc); |
3236 | } | 3240 | } |
3237 | 3241 | ||
/*
 * Tear down and free the mds client for this fs client.
 */
void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
{
	struct ceph_mds_client *mdsc = fsc->mdsc;

	dout("mdsc_destroy %p\n", mdsc);
	ceph_mdsc_stop(mdsc);

	/* flush out any connection work with references to us */
	ceph_msgr_flush();

	fsc->mdsc = NULL;
	kfree(mdsc);
	/* only prints the (now stale) pointer value; nothing is dereferenced */
	dout("mdsc_destroy %p done\n", mdsc);
}
3252 | 3256 | ||
3253 | 3257 | ||
3254 | /* | 3258 | /* |
3255 | * handle mds map update. | 3259 | * handle mds map update. |
3256 | */ | 3260 | */ |
3257 | void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) | 3261 | void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) |
3258 | { | 3262 | { |
3259 | u32 epoch; | 3263 | u32 epoch; |
3260 | u32 maplen; | 3264 | u32 maplen; |
3261 | void *p = msg->front.iov_base; | 3265 | void *p = msg->front.iov_base; |
3262 | void *end = p + msg->front.iov_len; | 3266 | void *end = p + msg->front.iov_len; |
3263 | struct ceph_mdsmap *newmap, *oldmap; | 3267 | struct ceph_mdsmap *newmap, *oldmap; |
3264 | struct ceph_fsid fsid; | 3268 | struct ceph_fsid fsid; |
3265 | int err = -EINVAL; | 3269 | int err = -EINVAL; |
3266 | 3270 | ||
3267 | ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); | 3271 | ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); |
3268 | ceph_decode_copy(&p, &fsid, sizeof(fsid)); | 3272 | ceph_decode_copy(&p, &fsid, sizeof(fsid)); |
3269 | if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0) | 3273 | if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0) |
3270 | return; | 3274 | return; |
3271 | epoch = ceph_decode_32(&p); | 3275 | epoch = ceph_decode_32(&p); |
3272 | maplen = ceph_decode_32(&p); | 3276 | maplen = ceph_decode_32(&p); |
3273 | dout("handle_map epoch %u len %d\n", epoch, (int)maplen); | 3277 | dout("handle_map epoch %u len %d\n", epoch, (int)maplen); |
3274 | 3278 | ||
3275 | /* do we need it? */ | 3279 | /* do we need it? */ |
3276 | ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch); | 3280 | ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch); |
3277 | mutex_lock(&mdsc->mutex); | 3281 | mutex_lock(&mdsc->mutex); |
3278 | if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { | 3282 | if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { |
3279 | dout("handle_map epoch %u <= our %u\n", | 3283 | dout("handle_map epoch %u <= our %u\n", |
3280 | epoch, mdsc->mdsmap->m_epoch); | 3284 | epoch, mdsc->mdsmap->m_epoch); |
3281 | mutex_unlock(&mdsc->mutex); | 3285 | mutex_unlock(&mdsc->mutex); |
3282 | return; | 3286 | return; |
3283 | } | 3287 | } |
3284 | 3288 | ||
3285 | newmap = ceph_mdsmap_decode(&p, end); | 3289 | newmap = ceph_mdsmap_decode(&p, end); |
3286 | if (IS_ERR(newmap)) { | 3290 | if (IS_ERR(newmap)) { |
3287 | err = PTR_ERR(newmap); | 3291 | err = PTR_ERR(newmap); |
3288 | goto bad_unlock; | 3292 | goto bad_unlock; |
3289 | } | 3293 | } |
3290 | 3294 | ||
3291 | /* swap into place */ | 3295 | /* swap into place */ |
3292 | if (mdsc->mdsmap) { | 3296 | if (mdsc->mdsmap) { |
3293 | oldmap = mdsc->mdsmap; | 3297 | oldmap = mdsc->mdsmap; |
3294 | mdsc->mdsmap = newmap; | 3298 | mdsc->mdsmap = newmap; |
3295 | check_new_map(mdsc, newmap, oldmap); | 3299 | check_new_map(mdsc, newmap, oldmap); |
3296 | ceph_mdsmap_destroy(oldmap); | 3300 | ceph_mdsmap_destroy(oldmap); |
3297 | } else { | 3301 | } else { |
3298 | mdsc->mdsmap = newmap; /* first mds map */ | 3302 | mdsc->mdsmap = newmap; /* first mds map */ |
3299 | } | 3303 | } |
3300 | mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; | 3304 | mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; |
3301 | 3305 | ||
3302 | __wake_requests(mdsc, &mdsc->waiting_for_map); | 3306 | __wake_requests(mdsc, &mdsc->waiting_for_map); |
3303 | 3307 | ||
3304 | mutex_unlock(&mdsc->mutex); | 3308 | mutex_unlock(&mdsc->mutex); |
3305 | schedule_delayed(mdsc); | 3309 | schedule_delayed(mdsc); |
3306 | return; | 3310 | return; |
3307 | 3311 | ||
3308 | bad_unlock: | 3312 | bad_unlock: |
3309 | mutex_unlock(&mdsc->mutex); | 3313 | mutex_unlock(&mdsc->mutex); |
3310 | bad: | 3314 | bad: |
3311 | pr_err("error decoding mdsmap %d\n", err); | 3315 | pr_err("error decoding mdsmap %d\n", err); |
3312 | return; | 3316 | return; |
3313 | } | 3317 | } |
3314 | 3318 | ||
3315 | static struct ceph_connection *con_get(struct ceph_connection *con) | 3319 | static struct ceph_connection *con_get(struct ceph_connection *con) |
3316 | { | 3320 | { |
3317 | struct ceph_mds_session *s = con->private; | 3321 | struct ceph_mds_session *s = con->private; |
3318 | 3322 | ||
3319 | if (get_session(s)) { | 3323 | if (get_session(s)) { |
3320 | dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref)); | 3324 | dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref)); |
3321 | return con; | 3325 | return con; |
3322 | } | 3326 | } |
3323 | dout("mdsc con_get %p FAIL\n", s); | 3327 | dout("mdsc con_get %p FAIL\n", s); |
3324 | return NULL; | 3328 | return NULL; |
3325 | } | 3329 | } |
3326 | 3330 | ||
3327 | static void con_put(struct ceph_connection *con) | 3331 | static void con_put(struct ceph_connection *con) |
3328 | { | 3332 | { |
3329 | struct ceph_mds_session *s = con->private; | 3333 | struct ceph_mds_session *s = con->private; |
3330 | 3334 | ||
3331 | dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1); | 3335 | dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1); |
3332 | ceph_put_mds_session(s); | 3336 | ceph_put_mds_session(s); |
3333 | } | 3337 | } |
3334 | 3338 | ||
3335 | /* | 3339 | /* |
3336 | * if the client is unresponsive for long enough, the mds will kill | 3340 | * if the client is unresponsive for long enough, the mds will kill |
3337 | * the session entirely. | 3341 | * the session entirely. |
3338 | */ | 3342 | */ |
3339 | static void peer_reset(struct ceph_connection *con) | 3343 | static void peer_reset(struct ceph_connection *con) |
3340 | { | 3344 | { |
3341 | struct ceph_mds_session *s = con->private; | 3345 | struct ceph_mds_session *s = con->private; |
3342 | struct ceph_mds_client *mdsc = s->s_mdsc; | 3346 | struct ceph_mds_client *mdsc = s->s_mdsc; |
3343 | 3347 | ||
3344 | pr_warning("mds%d closed our session\n", s->s_mds); | 3348 | pr_warning("mds%d closed our session\n", s->s_mds); |
3345 | send_mds_reconnect(mdsc, s); | 3349 | send_mds_reconnect(mdsc, s); |
3346 | } | 3350 | } |
3347 | 3351 | ||
/*
 * Route an incoming message from an mds connection to the appropriate
 * handler.  Messages for sessions no longer registered are dropped.
 * Always consumes the msg reference.
 */
static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
	struct ceph_mds_session *s = con->private;
	struct ceph_mds_client *mdsc = s->s_mdsc;
	int type = le16_to_cpu(msg->hdr.type);

	mutex_lock(&mdsc->mutex);
	if (__verify_registered_session(mdsc, s) < 0) {
		mutex_unlock(&mdsc->mutex);
		goto out;
	}
	mutex_unlock(&mdsc->mutex);

	switch (type) {
	case CEPH_MSG_MDS_MAP:
		ceph_mdsc_handle_map(mdsc, msg);
		break;
	case CEPH_MSG_CLIENT_SESSION:
		handle_session(s, msg);
		break;
	case CEPH_MSG_CLIENT_REPLY:
		handle_reply(s, msg);
		break;
	case CEPH_MSG_CLIENT_REQUEST_FORWARD:
		handle_forward(mdsc, s, msg);
		break;
	case CEPH_MSG_CLIENT_CAPS:
		ceph_handle_caps(s, msg);
		break;
	case CEPH_MSG_CLIENT_SNAP:
		ceph_handle_snap(mdsc, s, msg);
		break;
	case CEPH_MSG_CLIENT_LEASE:
		handle_lease(mdsc, s, msg);
		break;

	default:
		pr_err("received unknown message type %d %s\n", type,
		       ceph_msg_type_name(type));
	}
out:
	ceph_msg_put(msg);
}
3391 | 3395 | ||
3392 | /* | 3396 | /* |
3393 | * authentication | 3397 | * authentication |
3394 | */ | 3398 | */ |
/*
 * Build (or reuse) the authorizer for this MDS connection.
 *
 * If @force_new is set, any cached authorizer on the session is
 * destroyed first so that a fresh one is created below.  On success the
 * out parameters are pointed at the session's cached authorizer
 * buffers; the buffers remain owned by the session.
 *
 * Returns 0 on success, or the error from create_authorizer.
 */
static int get_authorizer(struct ceph_connection *con,
			  void **buf, int *len, int *proto,
			  void **reply_buf, int *reply_len, int force_new)
{
	struct ceph_mds_session *s = con->private;
	struct ceph_mds_client *mdsc = s->s_mdsc;
	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
	int ret = 0;

	/* throw away the old authorizer so the check below rebuilds it */
	if (force_new && s->s_authorizer) {
		ac->ops->destroy_authorizer(ac, s->s_authorizer);
		s->s_authorizer = NULL;
	}
	if (s->s_authorizer == NULL) {
		/* create_authorizer is optional for the auth method */
		if (ac->ops->create_authorizer) {
			ret = ac->ops->create_authorizer(
				ac, CEPH_ENTITY_TYPE_MDS,
				&s->s_authorizer,
				&s->s_authorizer_buf,
				&s->s_authorizer_buf_len,
				&s->s_authorizer_reply_buf,
				&s->s_authorizer_reply_buf_len);
			if (ret)
				return ret;
		}
	}

	/* hand the (possibly empty) cached buffers back to the messenger */
	*proto = ac->protocol;
	*buf = s->s_authorizer_buf;
	*len = s->s_authorizer_buf_len;
	*reply_buf = s->s_authorizer_reply_buf;
	*reply_len = s->s_authorizer_reply_buf_len;
	return 0;
}
3429 | 3433 | ||
3430 | 3434 | ||
3431 | static int verify_authorizer_reply(struct ceph_connection *con, int len) | 3435 | static int verify_authorizer_reply(struct ceph_connection *con, int len) |
3432 | { | 3436 | { |
3433 | struct ceph_mds_session *s = con->private; | 3437 | struct ceph_mds_session *s = con->private; |
3434 | struct ceph_mds_client *mdsc = s->s_mdsc; | 3438 | struct ceph_mds_client *mdsc = s->s_mdsc; |
3435 | struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; | 3439 | struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; |
3436 | 3440 | ||
3437 | return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len); | 3441 | return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len); |
3438 | } | 3442 | } |
3439 | 3443 | ||
3440 | static int invalidate_authorizer(struct ceph_connection *con) | 3444 | static int invalidate_authorizer(struct ceph_connection *con) |
3441 | { | 3445 | { |
3442 | struct ceph_mds_session *s = con->private; | 3446 | struct ceph_mds_session *s = con->private; |
3443 | struct ceph_mds_client *mdsc = s->s_mdsc; | 3447 | struct ceph_mds_client *mdsc = s->s_mdsc; |
3444 | struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; | 3448 | struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; |
3445 | 3449 | ||
3446 | if (ac->ops->invalidate_authorizer) | 3450 | if (ac->ops->invalidate_authorizer) |
3447 | ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); | 3451 | ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); |
3448 | 3452 | ||
3449 | return ceph_monc_validate_auth(&mdsc->fsc->client->monc); | 3453 | return ceph_monc_validate_auth(&mdsc->fsc->client->monc); |
3450 | } | 3454 | } |
3451 | 3455 | ||
/*
 * Messenger callbacks installed on each MDS session's connection:
 * connection refcounting, incoming message dispatch, the authorizer
 * hooks defined above, and peer-reset notification.
 */
static const struct ceph_connection_operations mds_con_ops = {
	.get = con_get,
	.put = con_put,
	.dispatch = dispatch,
	.get_authorizer = get_authorizer,
	.verify_authorizer_reply = verify_authorizer_reply,
	.invalidate_authorizer = invalidate_authorizer,
	.peer_reset = peer_reset,
};
3461 | 3465 | ||
3462 | /* eof */ | 3466 | /* eof */ |
3463 | 3467 |
fs/ceph/mds_client.h
1 | #ifndef _FS_CEPH_MDS_CLIENT_H | 1 | #ifndef _FS_CEPH_MDS_CLIENT_H |
2 | #define _FS_CEPH_MDS_CLIENT_H | 2 | #define _FS_CEPH_MDS_CLIENT_H |
3 | 3 | ||
4 | #include <linux/completion.h> | 4 | #include <linux/completion.h> |
5 | #include <linux/kref.h> | 5 | #include <linux/kref.h> |
6 | #include <linux/list.h> | 6 | #include <linux/list.h> |
7 | #include <linux/mutex.h> | 7 | #include <linux/mutex.h> |
8 | #include <linux/rbtree.h> | 8 | #include <linux/rbtree.h> |
9 | #include <linux/spinlock.h> | 9 | #include <linux/spinlock.h> |
10 | 10 | ||
11 | #include <linux/ceph/types.h> | 11 | #include <linux/ceph/types.h> |
12 | #include <linux/ceph/messenger.h> | 12 | #include <linux/ceph/messenger.h> |
13 | #include <linux/ceph/mdsmap.h> | 13 | #include <linux/ceph/mdsmap.h> |
14 | 14 | ||
15 | /* | 15 | /* |
16 | * Some lock dependencies: | 16 | * Some lock dependencies: |
17 | * | 17 | * |
18 | * session->s_mutex | 18 | * session->s_mutex |
19 | * mdsc->mutex | 19 | * mdsc->mutex |
20 | * | 20 | * |
21 | * mdsc->snap_rwsem | 21 | * mdsc->snap_rwsem |
22 | * | 22 | * |
23 | * ci->i_ceph_lock | 23 | * ci->i_ceph_lock |
24 | * mdsc->snap_flush_lock | 24 | * mdsc->snap_flush_lock |
25 | * mdsc->cap_delay_lock | 25 | * mdsc->cap_delay_lock |
26 | * | 26 | * |
27 | */ | 27 | */ |
28 | 28 | ||
29 | struct ceph_fs_client; | 29 | struct ceph_fs_client; |
30 | struct ceph_cap; | 30 | struct ceph_cap; |
31 | 31 | ||
32 | /* | 32 | /* |
33 | * parsed info about a single inode. pointers are into the encoded | 33 | * parsed info about a single inode. pointers are into the encoded |
34 | * on-wire structures within the mds reply message payload. | 34 | * on-wire structures within the mds reply message payload. |
35 | */ | 35 | */ |
struct ceph_mds_reply_info_in {
	struct ceph_mds_reply_inode *in;   /* encoded inode record (in msg) */
	struct ceph_dir_layout dir_layout;
	u32 symlink_len;                   /* length of symlink below */
	char *symlink;                     /* symlink target (in msg payload) */
	u32 xattr_len;                     /* length of xattr_data below */
	char *xattr_data;                  /* encoded xattr blob (in msg payload) */
};
44 | 44 | ||
45 | /* | 45 | /* |
46 | * parsed info about an mds reply, including information about | 46 | * parsed info about an mds reply, including information about |
47 | * either: 1) the target inode and/or its parent directory and dentry, | 47 | * either: 1) the target inode and/or its parent directory and dentry, |
48 | * and directory contents (for readdir results), or | 48 | * and directory contents (for readdir results), or |
49 | * 2) the file range lock info (for fcntl F_GETLK results). | 49 | * 2) the file range lock info (for fcntl F_GETLK results). |
50 | */ | 50 | */ |
struct ceph_mds_reply_info_parsed {
	struct ceph_mds_reply_head *head;	/* reply header within the msg */

	/* trace */
	struct ceph_mds_reply_info_in diri, targeti; /* dir and target inodes */
	struct ceph_mds_reply_dirfrag *dirfrag;
	char *dname;				/* dentry name (in msg payload) */
	u32 dname_len;
	struct ceph_mds_reply_lease *dlease;

	/* extra */
	union {
		/* for fcntl F_GETLK results */
		struct ceph_filelock *filelock_reply;

		/* for readdir results */
		struct {
			struct ceph_mds_reply_dirfrag *dir_dir;
			int dir_nr;			/* number of entries */
			char **dir_dname;		/* per-entry name ptrs */
			u32 *dir_dname_len;		/* per-entry name lens */
			struct ceph_mds_reply_lease **dir_dlease;
			struct ceph_mds_reply_info_in *dir_in;
			u8 dir_complete, dir_end;
		};
	};

	/* encoded blob describing snapshot contexts for certain
	   operations (e.g., open) */
	void *snapblob;
	int snapblob_len;
};
83 | 83 | ||
84 | 84 | ||
85 | /* | 85 | /* |
86 | * cap releases are batched and sent to the MDS en masse. | 86 | * cap releases are batched and sent to the MDS en masse. |
87 | */ | 87 | */ |
/*
 * Number of cap items that fit in a single page-sized cap_release
 * message after the ceph_mds_cap_release header.
 */
#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE -			\
				sizeof(struct ceph_mds_cap_release)) /	\
			       sizeof(struct ceph_mds_cap_item))
91 | 91 | ||
92 | 92 | ||
93 | /* | 93 | /* |
94 | * state associated with each MDS<->client session | 94 | * state associated with each MDS<->client session |
95 | */ | 95 | */ |
/* values for ceph_mds_session->s_state */
enum {
	CEPH_MDS_SESSION_NEW = 1,
	CEPH_MDS_SESSION_OPENING = 2,
	CEPH_MDS_SESSION_OPEN = 3,
	CEPH_MDS_SESSION_HUNG = 4,
	CEPH_MDS_SESSION_CLOSING = 5,
	CEPH_MDS_SESSION_RESTARTING = 6,
	CEPH_MDS_SESSION_RECONNECTING = 7,
};
105 | 105 | ||
struct ceph_mds_session {
	struct ceph_mds_client *s_mdsc;	/* back pointer to the mds client */
	int               s_mds;	/* mds rank this session talks to */
	int               s_state;	/* CEPH_MDS_SESSION_* */
	unsigned long     s_ttl;      /* time until mds kills us */
	u64               s_seq;      /* incoming msg seq # */
	struct mutex      s_mutex;    /* serialize session messages */

	struct ceph_connection s_con;

	/* cached authorizer, handed out via get_authorizer */
	struct ceph_authorizer *s_authorizer;
	void             *s_authorizer_buf, *s_authorizer_reply_buf;
	size_t            s_authorizer_buf_len, s_authorizer_reply_buf_len;

	/* protected by s_gen_ttl_lock */
	spinlock_t        s_gen_ttl_lock;
	u32               s_cap_gen;  /* inc each time we get mds stale msg */
	unsigned long     s_cap_ttl;  /* when session caps expire */

	/* protected by s_cap_lock */
	spinlock_t        s_cap_lock;
	struct list_head  s_caps;     /* all caps issued by this session */
	int               s_nr_caps, s_trim_caps;
	int               s_num_cap_releases;
	struct list_head  s_cap_releases; /* waiting cap_release messages */
	struct list_head  s_cap_releases_done; /* ready to send */
	struct ceph_cap  *s_cap_iterator;

	/* protected by mutex */
	struct list_head  s_cap_flushing;     /* inodes w/ flushing caps */
	struct list_head  s_cap_snaps_flushing;
	unsigned long     s_renew_requested; /* last time we sent a renew req */
	u64               s_renew_seq;

	atomic_t          s_ref;      /* reference count */
	struct list_head  s_waiting;  /* waiting requests */
	struct list_head  s_unsafe;   /* unsafe requests */
};
141 | 144 | ||
142 | /* | 145 | /* |
143 | * modes of choosing which MDS to send a request to | 146 | * modes of choosing which MDS to send a request to |
144 | */ | 147 | */ |
/* values for ceph_mds_request->r_direct_mode */
enum {
	USE_ANY_MDS,
	USE_RANDOM_MDS,
	USE_AUTH_MDS,   /* prefer authoritative mds for this metadata item */
};
150 | 153 | ||
151 | struct ceph_mds_request; | 154 | struct ceph_mds_request; |
152 | struct ceph_mds_client; | 155 | struct ceph_mds_client; |
153 | 156 | ||
154 | /* | 157 | /* |
155 | * request completion callback | 158 | * request completion callback |
156 | */ | 159 | */ |
157 | typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, | 160 | typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, |
158 | struct ceph_mds_request *req); | 161 | struct ceph_mds_request *req); |
159 | 162 | ||
160 | /* | 163 | /* |
161 | * an in-flight mds request | 164 | * an in-flight mds request |
162 | */ | 165 | */ |
struct ceph_mds_request {
	u64 r_tid;                   /* transaction id */
	struct rb_node r_node;       /* node in mdsc->request_tree */
	struct ceph_mds_client *r_mdsc;

	int r_op;                    /* mds op code */

	/* operation on what? */
	struct inode *r_inode;              /* arg1 */
	struct dentry *r_dentry;            /* arg1 */
	struct dentry *r_old_dentry;        /* arg2: rename from or link from */
	struct inode *r_old_dentry_dir;     /* arg2: old dentry's parent dir */
	char *r_path1, *r_path2;            /* string forms of args (if any) */
	struct ceph_vino r_ino1, r_ino2;    /* vino forms of args (if any) */

	struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
	struct inode *r_target_inode;       /* resulting inode */

	struct mutex r_fill_mutex;

	union ceph_mds_request_args r_args; /* on-wire request arguments */
	int r_fmode;        /* file mode, if expecting cap */
	uid_t r_uid;
	gid_t r_gid;

	/* for choosing which mds to send this request to */
	int r_direct_mode;          /* USE_*_MDS */
	u32 r_direct_hash;      /* choose dir frag based on this dentry hash */
	bool r_direct_is_hash;  /* true if r_direct_hash is valid */

	/* data payload is used for xattr ops */
	struct page **r_pages;
	int r_num_pages;
	int r_data_len;

	/* what caps shall we drop? */
	int r_inode_drop, r_inode_unless;
	int r_dentry_drop, r_dentry_unless;
	int r_old_dentry_drop, r_old_dentry_unless;
	struct inode *r_old_inode;
	int r_old_inode_drop, r_old_inode_unless;

	struct ceph_msg  *r_request;  /* original request */
	int r_request_release_offset;
	struct ceph_msg  *r_reply;
	struct ceph_mds_reply_info_parsed r_reply_info; /* parsed reply */
	int r_err;                    /* result code (0 or negative errno) */
	bool r_aborted;               /* request was aborted before reply */

	unsigned long r_timeout;  /* optional.  jiffies */
	unsigned long r_started;  /* start time to measure timeout against */
	unsigned long r_request_started; /* start time for mds request only,
					    used to measure lease durations */

	/* link unsafe requests to parent directory, for fsync */
	struct inode	*r_unsafe_dir;
	struct list_head r_unsafe_dir_item;

	struct ceph_mds_session *r_session; /* session request was sent on */

	int               r_attempts;   /* resend attempts */
	int               r_num_fwd;    /* number of forward attempts */
	int               r_resend_mds; /* mds to resend to next, if any*/
	u32               r_sent_on_mseq; /* cap mseq request was sent at*/

	struct kref       r_kref;     /* reference count */
	struct list_head  r_wait;
	struct completion r_completion;
	struct completion r_safe_completion;
	ceph_mds_request_callback_t r_callback; /* called on completion */
	struct list_head  r_unsafe_item;  /* per-session unsafe list item */
	bool		  r_got_unsafe, r_got_safe, r_got_result;

	bool              r_did_prepopulate;
	u32               r_readdir_offset;

	struct ceph_cap_reservation r_caps_reservation;
	int r_num_caps;
};
242 | 245 | ||
243 | /* | 246 | /* |
244 | * mds client state | 247 | * mds client state |
245 | */ | 248 | */ |
struct ceph_mds_client {
	struct ceph_fs_client  *fsc;	/* owning fs client */
	struct mutex            mutex;         /* all nested structures */

	struct ceph_mdsmap      *mdsmap;
	struct completion       safe_umount_waiters;
	wait_queue_head_t       session_close_wq;
	struct list_head        waiting_for_map;

	struct ceph_mds_session **sessions;    /* NULL for mds if no session */
	int                     max_sessions;  /* len of s_mds_sessions */
	int                     stopping;      /* true if shutting down */

	/*
	 * snap_rwsem will cover cap linkage into snaprealms, and
	 * realm snap contexts.  (later, we can do per-realm snap
	 * contexts locks..)  the empty list contains realms with no
	 * references (implying they contain no inodes with caps) that
	 * should be destroyed.
	 */
	struct rw_semaphore     snap_rwsem;
	struct rb_root          snap_realms;
	struct list_head        snap_empty;
	spinlock_t              snap_empty_lock;  /* protect snap_empty */

	u64                    last_tid;      /* most recent mds request */
	struct rb_root         request_tree;  /* pending mds requests */
	struct delayed_work    delayed_work;  /* delayed work */
	unsigned long    last_renew_caps;  /* last time we renewed our caps */
	struct list_head cap_delay_list;   /* caps with delayed release */
	spinlock_t       cap_delay_lock;   /* protects cap_delay_list */
	struct list_head snap_flush_list;  /* cap_snaps ready to flush */
	spinlock_t       snap_flush_lock;

	u64               cap_flush_seq;
	struct list_head  cap_dirty;        /* inodes with dirty caps */
	struct list_head  cap_dirty_migrating; /* ...that are migration... */
	int               num_cap_flushing; /* # caps we are flushing */
	spinlock_t        cap_dirty_lock;   /* protects above items */
	wait_queue_head_t cap_flushing_wq;

	/*
	 * Cap reservations
	 *
	 * Maintain a global pool of preallocated struct ceph_caps, referenced
	 * by struct ceph_caps_reservations.  This ensures that we preallocate
	 * memory needed to successfully process an MDS response.  (If an MDS
	 * sends us cap information and we fail to process it, we will have
	 * problems due to the client and MDS being out of sync.)
	 *
	 * Reservations are 'owned' by a ceph_cap_reservation context.
	 */
	spinlock_t	caps_list_lock;
	struct		list_head caps_list; /* unused (reserved or
						unreserved) */
	int		caps_total_count;    /* total caps allocated */
	int		caps_use_count;      /* in use */
	int		caps_reserve_count;  /* unused, reserved */
	int		caps_avail_count;    /* unused, unreserved */
	int		caps_min_count;      /* keep at least this many
						(unreserved) */
	spinlock_t	  dentry_lru_lock;
	struct list_head  dentry_lru;	/* LRU of dentries, for trimming */
	int		  num_dentry;	/* length of dentry_lru */
};
311 | 314 | ||
extern const char *ceph_mds_op_name(int op);

extern struct ceph_mds_session *
__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);

/* Take an additional reference on a session and return it. */
static inline struct ceph_mds_session *
ceph_get_mds_session(struct ceph_mds_session *s)
{
	atomic_inc(&s->s_ref);
	return s;
}

extern void ceph_put_mds_session(struct ceph_mds_session *s);

extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
			     struct ceph_msg *msg, int mds);

/* mds client lifecycle */
extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);

extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);

extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
				    struct inode *inode,
				    struct dentry *dn);

extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);

/* request creation and submission */
extern struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
				     struct ceph_mds_request *req);
extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
				struct inode *dir,
				struct ceph_mds_request *req);

/* Take an additional reference on an in-flight request. */
static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
{
	kref_get(&req->r_kref);
}

extern void ceph_mdsc_release_request(struct kref *kref);

/*
 * Drop a request reference; ceph_mdsc_release_request runs when the
 * count reaches zero.
 */
static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
{
	kref_put(&req->r_kref, ceph_mdsc_release_request);
}

extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
				 struct ceph_mds_session *session);
extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
				   struct ceph_mds_session *session);

extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);

extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
				  int stop_on_nosnap);

extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
				     struct inode *inode,
				     struct dentry *dentry, char action,
				     u32 seq);

extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
				 struct ceph_msg *msg);

extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
					  struct ceph_mds_session *session);
381 | 384 |
fs/ceph/xattr.c
1 | #include <linux/ceph/ceph_debug.h> | 1 | #include <linux/ceph/ceph_debug.h> |
2 | 2 | ||
3 | #include "super.h" | 3 | #include "super.h" |
4 | #include "mds_client.h" | 4 | #include "mds_client.h" |
5 | 5 | ||
6 | #include <linux/ceph/decode.h> | 6 | #include <linux/ceph/decode.h> |
7 | 7 | ||
8 | #include <linux/xattr.h> | 8 | #include <linux/xattr.h> |
9 | #include <linux/slab.h> | 9 | #include <linux/slab.h> |
10 | 10 | ||
11 | static bool ceph_is_valid_xattr(const char *name) | 11 | static bool ceph_is_valid_xattr(const char *name) |
12 | { | 12 | { |
13 | return !strncmp(name, "ceph.", 5) || | 13 | return !strncmp(name, "ceph.", 5) || |
14 | !strncmp(name, XATTR_SECURITY_PREFIX, | 14 | !strncmp(name, XATTR_SECURITY_PREFIX, |
15 | XATTR_SECURITY_PREFIX_LEN) || | 15 | XATTR_SECURITY_PREFIX_LEN) || |
16 | !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || | 16 | !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || |
17 | !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); | 17 | !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); |
18 | } | 18 | } |
19 | 19 | ||
20 | /* | 20 | /* |
21 | * These define virtual xattrs exposing the recursive directory | 21 | * These define virtual xattrs exposing the recursive directory |
22 | * statistics and layout metadata. | 22 | * statistics and layout metadata. |
23 | */ | 23 | */ |
24 | struct ceph_vxattr_cb { | 24 | struct ceph_vxattr_cb { |
25 | bool readonly; | 25 | bool readonly; |
26 | char *name; | 26 | char *name; |
27 | size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, | 27 | size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, |
28 | size_t size); | 28 | size_t size); |
29 | }; | 29 | }; |
30 | 30 | ||
31 | /* directories */ | 31 | /* directories */ |
32 | 32 | ||
/* "ceph.dir.entries": total entries in this dir (files + subdirs). */
static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
					size_t size)
{
	return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
}
38 | 38 | ||
/* "ceph.dir.files": number of regular files directly in this dir. */
static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
				      size_t size)
{
	return snprintf(val, size, "%lld", ci->i_files);
}
44 | 44 | ||
/* "ceph.dir.subdirs": number of subdirectories directly in this dir. */
static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
					size_t size)
{
	return snprintf(val, size, "%lld", ci->i_subdirs);
}
50 | 50 | ||
/* "ceph.dir.rentries": recursive entry count over the whole subtree. */
static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
					 size_t size)
{
	return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
}
56 | 56 | ||
/* "ceph.dir.rfiles": recursive file count over the whole subtree. */
static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
				       size_t size)
{
	return snprintf(val, size, "%lld", ci->i_rfiles);
}
62 | 62 | ||
/* "ceph.dir.rsubdirs": recursive subdir count over the whole subtree. */
static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
					 size_t size)
{
	return snprintf(val, size, "%lld", ci->i_rsubdirs);
}
68 | 68 | ||
/* "ceph.dir.rbytes": recursive byte total over the whole subtree. */
static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
				       size_t size)
{
	return snprintf(val, size, "%lld", ci->i_rbytes);
}
74 | 74 | ||
/*
 * "ceph.dir.rctime": most recent recursive ctime as "sec.nsec".
 * NOTE(review): the nanoseconds are printed with "%ld", not "%09ld",
 * so e.g. 5 ns renders as ".5" — ambiguous for parsers.  Confirm
 * whether userspace depends on this format before changing it.
 */
static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
				       size_t size)
{
	return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
			(long)ci->i_rctime.tv_nsec);
}
81 | 81 | ||
82 | static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { | 82 | static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { |
83 | { true, "ceph.dir.entries", ceph_vxattrcb_entries}, | 83 | { true, "ceph.dir.entries", ceph_vxattrcb_entries}, |
84 | { true, "ceph.dir.files", ceph_vxattrcb_files}, | 84 | { true, "ceph.dir.files", ceph_vxattrcb_files}, |
85 | { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs}, | 85 | { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs}, |
86 | { true, "ceph.dir.rentries", ceph_vxattrcb_rentries}, | 86 | { true, "ceph.dir.rentries", ceph_vxattrcb_rentries}, |
87 | { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles}, | 87 | { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles}, |
88 | { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, | 88 | { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, |
89 | { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes}, | 89 | { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes}, |
90 | { true, "ceph.dir.rctime", ceph_vxattrcb_rctime}, | 90 | { true, "ceph.dir.rctime", ceph_vxattrcb_rctime}, |
91 | { true, NULL, NULL } | 91 | { true, NULL, NULL } |
92 | }; | 92 | }; |
93 | 93 | ||
94 | /* files */ | 94 | /* files */ |
95 | 95 | ||
96 | static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, | 96 | static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, |
97 | size_t size) | 97 | size_t size) |
98 | { | 98 | { |
99 | int ret; | 99 | int ret; |
100 | 100 | ||
101 | ret = snprintf(val, size, | 101 | ret = snprintf(val, size, |
102 | "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n", | 102 | "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n", |
103 | (unsigned long long)ceph_file_layout_su(ci->i_layout), | 103 | (unsigned long long)ceph_file_layout_su(ci->i_layout), |
104 | (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), | 104 | (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), |
105 | (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); | 105 | (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); |
106 | if (ceph_file_layout_pg_preferred(ci->i_layout)) | 106 | if (ceph_file_layout_pg_preferred(ci->i_layout)) |
107 | ret += snprintf(val + ret, size, "preferred_osd=%lld\n", | 107 | ret += snprintf(val + ret, size, "preferred_osd=%lld\n", |
108 | (unsigned long long)ceph_file_layout_pg_preferred( | 108 | (unsigned long long)ceph_file_layout_pg_preferred( |
109 | ci->i_layout)); | 109 | ci->i_layout)); |
110 | return ret; | 110 | return ret; |
111 | } | 111 | } |
112 | 112 | ||
113 | static struct ceph_vxattr_cb ceph_file_vxattrs[] = { | 113 | static struct ceph_vxattr_cb ceph_file_vxattrs[] = { |
114 | { true, "ceph.file.layout", ceph_vxattrcb_layout}, | ||
115 | /* The following extended attribute name is deprecated */ | ||
114 | { true, "ceph.layout", ceph_vxattrcb_layout}, | 116 | { true, "ceph.layout", ceph_vxattrcb_layout}, |
115 | { NULL, NULL } | 117 | { true, NULL, NULL } |
116 | }; | 118 | }; |
117 | 119 | ||
118 | static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) | 120 | static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) |
119 | { | 121 | { |
120 | if (S_ISDIR(inode->i_mode)) | 122 | if (S_ISDIR(inode->i_mode)) |
121 | return ceph_dir_vxattrs; | 123 | return ceph_dir_vxattrs; |
122 | else if (S_ISREG(inode->i_mode)) | 124 | else if (S_ISREG(inode->i_mode)) |
123 | return ceph_file_vxattrs; | 125 | return ceph_file_vxattrs; |
124 | return NULL; | 126 | return NULL; |
125 | } | 127 | } |
126 | 128 | ||
127 | static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr, | 129 | static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr, |
128 | const char *name) | 130 | const char *name) |
129 | { | 131 | { |
130 | do { | 132 | do { |
131 | if (strcmp(vxattr->name, name) == 0) | 133 | if (strcmp(vxattr->name, name) == 0) |
132 | return vxattr; | 134 | return vxattr; |
133 | vxattr++; | 135 | vxattr++; |
134 | } while (vxattr->name); | 136 | } while (vxattr->name); |
135 | return NULL; | 137 | return NULL; |
136 | } | 138 | } |
137 | 139 | ||
/*
 * Insert a new xattr into ci->i_xattrs.index (an rbtree keyed by name,
 * ordered by common prefix and then by length), or update the node
 * already holding that name.
 *
 * *newxattr is a caller-preallocated node: on insert it is linked into
 * the tree; on update it is kfree()d and *newxattr is set to NULL.
 * When should_free_name/should_free_val are set, the tree takes
 * ownership of the name/value buffers and kfree()s them on removal.
 * @dirty marks the xattr as locally modified (not yet flushed).
 *
 * Caller must hold ci->i_ceph_lock.  Always returns 0.
 */
static int __set_xattr(struct ceph_inode_info *ci,
			   const char *name, int name_len,
			   const char *val, int val_len,
			   int dirty,
			   int should_free_name, int should_free_val,
			   struct ceph_inode_xattr **newxattr)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct ceph_inode_xattr *xattr = NULL;
	int c;
	int new = 0;

	/* walk down to the insertion point or the existing node */
	p = &ci->i_xattrs.index.rb_node;
	while (*p) {
		parent = *p;
		xattr = rb_entry(parent, struct ceph_inode_xattr, node);
		/* compare the shared prefix; ties broken on length below */
		c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
		if (c < 0)
			p = &(*p)->rb_left;
		else if (c > 0)
			p = &(*p)->rb_right;
		else {
			if (name_len == xattr->name_len)
				break;	/* exact match: update in place */
			else if (name_len < xattr->name_len)
				p = &(*p)->rb_left;
			else
				p = &(*p)->rb_right;
		}
		xattr = NULL;	/* not (yet) a match */
	}

	if (!xattr) {
		/* no existing node: consume the preallocated one */
		new = 1;
		xattr = *newxattr;
		xattr->name = name;
		xattr->name_len = name_len;
		xattr->should_free_name = should_free_name;

		ci->i_xattrs.count++;
		dout("__set_xattr count=%d\n", ci->i_xattrs.count);
	} else {
		/* update: drop the spare node and the old value */
		kfree(*newxattr);
		*newxattr = NULL;
		if (xattr->should_free_val)
			kfree((void *)xattr->val);

		if (should_free_name) {
			/* node already owns a copy of the name */
			kfree((void *)name);
			name = xattr->name;
		}
		/* back out the old sizes; new ones are added below */
		ci->i_xattrs.names_size -= xattr->name_len;
		ci->i_xattrs.vals_size -= xattr->val_len;
	}
	ci->i_xattrs.names_size += name_len;
	ci->i_xattrs.vals_size += val_len;
	if (val)
		xattr->val = val;
	else
		xattr->val = "";	/* never leave ->val NULL */

	xattr->val_len = val_len;
	xattr->dirty = dirty;
	xattr->should_free_val = (val && should_free_val);

	if (new) {
		rb_link_node(&xattr->node, parent, p);
		rb_insert_color(&xattr->node, &ci->i_xattrs.index);
		dout("__set_xattr_val p=%p\n", p);
	}

	dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
	     ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);

	return 0;
}
215 | 217 | ||
/*
 * Look up @name in ci->i_xattrs.index.  Uses the same ordering as
 * __set_xattr(): compare the common prefix, then treat the longer
 * name as greater.  Returns the node or NULL if not present.
 * Caller must hold ci->i_ceph_lock.
 */
static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
					    const char *name)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct ceph_inode_xattr *xattr = NULL;
	int name_len = strlen(name);
	int c;

	p = &ci->i_xattrs.index.rb_node;
	while (*p) {
		parent = *p;
		xattr = rb_entry(parent, struct ceph_inode_xattr, node);
		c = strncmp(name, xattr->name, xattr->name_len);
		/* equal prefix but @name is longer: @name sorts after */
		if (c == 0 && name_len > xattr->name_len)
			c = 1;
		if (c < 0)
			p = &(*p)->rb_left;
		else if (c > 0)
			p = &(*p)->rb_right;
		else {
			dout("__get_xattr %s: found %.*s\n", name,
			     xattr->val_len, xattr->val);
			return xattr;
		}
	}

	dout("__get_xattr %s: not found\n", name);

	return NULL;
}
247 | 249 | ||
/*
 * Free one (already unlinked) xattr node, along with its name/value
 * buffers when the node owns them.
 */
static void __free_xattr(struct ceph_inode_xattr *xattr)
{
	BUG_ON(!xattr);

	if (xattr->should_free_name)
		kfree((void *)xattr->name);
	if (xattr->should_free_val)
		kfree((void *)xattr->val);

	kfree(xattr);
}
259 | 261 | ||
260 | static int __remove_xattr(struct ceph_inode_info *ci, | 262 | static int __remove_xattr(struct ceph_inode_info *ci, |
261 | struct ceph_inode_xattr *xattr) | 263 | struct ceph_inode_xattr *xattr) |
262 | { | 264 | { |
263 | if (!xattr) | 265 | if (!xattr) |
264 | return -EOPNOTSUPP; | 266 | return -EOPNOTSUPP; |
265 | 267 | ||
266 | rb_erase(&xattr->node, &ci->i_xattrs.index); | 268 | rb_erase(&xattr->node, &ci->i_xattrs.index); |
267 | 269 | ||
268 | if (xattr->should_free_name) | 270 | if (xattr->should_free_name) |
269 | kfree((void *)xattr->name); | 271 | kfree((void *)xattr->name); |
270 | if (xattr->should_free_val) | 272 | if (xattr->should_free_val) |
271 | kfree((void *)xattr->val); | 273 | kfree((void *)xattr->val); |
272 | 274 | ||
273 | ci->i_xattrs.names_size -= xattr->name_len; | 275 | ci->i_xattrs.names_size -= xattr->name_len; |
274 | ci->i_xattrs.vals_size -= xattr->val_len; | 276 | ci->i_xattrs.vals_size -= xattr->val_len; |
275 | ci->i_xattrs.count--; | 277 | ci->i_xattrs.count--; |
276 | kfree(xattr); | 278 | kfree(xattr); |
277 | 279 | ||
278 | return 0; | 280 | return 0; |
279 | } | 281 | } |
280 | 282 | ||
/*
 * Look up an xattr by name and remove it.  Returns -EOPNOTSUPP when
 * the name is not present (via __remove_xattr()'s NULL handling).
 * Caller must hold ci->i_ceph_lock.
 *
 * The old version kept an unused rbtree-root pointer 'p' around; the
 * dead store has been dropped.
 */
static int __remove_xattr_by_name(struct ceph_inode_info *ci,
				  const char *name)
{
	struct ceph_inode_xattr *xattr;

	xattr = __get_xattr(ci, name);
	return __remove_xattr(ci, xattr);
}
293 | 295 | ||
294 | static char *__copy_xattr_names(struct ceph_inode_info *ci, | 296 | static char *__copy_xattr_names(struct ceph_inode_info *ci, |
295 | char *dest) | 297 | char *dest) |
296 | { | 298 | { |
297 | struct rb_node *p; | 299 | struct rb_node *p; |
298 | struct ceph_inode_xattr *xattr = NULL; | 300 | struct ceph_inode_xattr *xattr = NULL; |
299 | 301 | ||
300 | p = rb_first(&ci->i_xattrs.index); | 302 | p = rb_first(&ci->i_xattrs.index); |
301 | dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count); | 303 | dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count); |
302 | 304 | ||
303 | while (p) { | 305 | while (p) { |
304 | xattr = rb_entry(p, struct ceph_inode_xattr, node); | 306 | xattr = rb_entry(p, struct ceph_inode_xattr, node); |
305 | memcpy(dest, xattr->name, xattr->name_len); | 307 | memcpy(dest, xattr->name, xattr->name_len); |
306 | dest[xattr->name_len] = '\0'; | 308 | dest[xattr->name_len] = '\0'; |
307 | 309 | ||
308 | dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name, | 310 | dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name, |
309 | xattr->name_len, ci->i_xattrs.names_size); | 311 | xattr->name_len, ci->i_xattrs.names_size); |
310 | 312 | ||
311 | dest += xattr->name_len + 1; | 313 | dest += xattr->name_len + 1; |
312 | p = rb_next(p); | 314 | p = rb_next(p); |
313 | } | 315 | } |
314 | 316 | ||
315 | return dest; | 317 | return dest; |
316 | } | 318 | } |
317 | 319 | ||
318 | void __ceph_destroy_xattrs(struct ceph_inode_info *ci) | 320 | void __ceph_destroy_xattrs(struct ceph_inode_info *ci) |
319 | { | 321 | { |
320 | struct rb_node *p, *tmp; | 322 | struct rb_node *p, *tmp; |
321 | struct ceph_inode_xattr *xattr = NULL; | 323 | struct ceph_inode_xattr *xattr = NULL; |
322 | 324 | ||
323 | p = rb_first(&ci->i_xattrs.index); | 325 | p = rb_first(&ci->i_xattrs.index); |
324 | 326 | ||
325 | dout("__ceph_destroy_xattrs p=%p\n", p); | 327 | dout("__ceph_destroy_xattrs p=%p\n", p); |
326 | 328 | ||
327 | while (p) { | 329 | while (p) { |
328 | xattr = rb_entry(p, struct ceph_inode_xattr, node); | 330 | xattr = rb_entry(p, struct ceph_inode_xattr, node); |
329 | tmp = p; | 331 | tmp = p; |
330 | p = rb_next(tmp); | 332 | p = rb_next(tmp); |
331 | dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p, | 333 | dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p, |
332 | xattr->name_len, xattr->name); | 334 | xattr->name_len, xattr->name); |
333 | rb_erase(tmp, &ci->i_xattrs.index); | 335 | rb_erase(tmp, &ci->i_xattrs.index); |
334 | 336 | ||
335 | __free_xattr(xattr); | 337 | __free_xattr(xattr); |
336 | } | 338 | } |
337 | 339 | ||
338 | ci->i_xattrs.names_size = 0; | 340 | ci->i_xattrs.names_size = 0; |
339 | ci->i_xattrs.vals_size = 0; | 341 | ci->i_xattrs.vals_size = 0; |
340 | ci->i_xattrs.index_version = 0; | 342 | ci->i_xattrs.index_version = 0; |
341 | ci->i_xattrs.count = 0; | 343 | ci->i_xattrs.count = 0; |
342 | ci->i_xattrs.index = RB_ROOT; | 344 | ci->i_xattrs.index = RB_ROOT; |
343 | } | 345 | } |
344 | 346 | ||
345 | static int __build_xattrs(struct inode *inode) | 347 | static int __build_xattrs(struct inode *inode) |
346 | __releases(ci->i_ceph_lock) | 348 | __releases(ci->i_ceph_lock) |
347 | __acquires(ci->i_ceph_lock) | 349 | __acquires(ci->i_ceph_lock) |
348 | { | 350 | { |
349 | u32 namelen; | 351 | u32 namelen; |
350 | u32 numattr = 0; | 352 | u32 numattr = 0; |
351 | void *p, *end; | 353 | void *p, *end; |
352 | u32 len; | 354 | u32 len; |
353 | const char *name, *val; | 355 | const char *name, *val; |
354 | struct ceph_inode_info *ci = ceph_inode(inode); | 356 | struct ceph_inode_info *ci = ceph_inode(inode); |
355 | int xattr_version; | 357 | int xattr_version; |
356 | struct ceph_inode_xattr **xattrs = NULL; | 358 | struct ceph_inode_xattr **xattrs = NULL; |
357 | int err = 0; | 359 | int err = 0; |
358 | int i; | 360 | int i; |
359 | 361 | ||
360 | dout("__build_xattrs() len=%d\n", | 362 | dout("__build_xattrs() len=%d\n", |
361 | ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0); | 363 | ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0); |
362 | 364 | ||
363 | if (ci->i_xattrs.index_version >= ci->i_xattrs.version) | 365 | if (ci->i_xattrs.index_version >= ci->i_xattrs.version) |
364 | return 0; /* already built */ | 366 | return 0; /* already built */ |
365 | 367 | ||
366 | __ceph_destroy_xattrs(ci); | 368 | __ceph_destroy_xattrs(ci); |
367 | 369 | ||
368 | start: | 370 | start: |
369 | /* updated internal xattr rb tree */ | 371 | /* updated internal xattr rb tree */ |
370 | if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) { | 372 | if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) { |
371 | p = ci->i_xattrs.blob->vec.iov_base; | 373 | p = ci->i_xattrs.blob->vec.iov_base; |
372 | end = p + ci->i_xattrs.blob->vec.iov_len; | 374 | end = p + ci->i_xattrs.blob->vec.iov_len; |
373 | ceph_decode_32_safe(&p, end, numattr, bad); | 375 | ceph_decode_32_safe(&p, end, numattr, bad); |
374 | xattr_version = ci->i_xattrs.version; | 376 | xattr_version = ci->i_xattrs.version; |
375 | spin_unlock(&ci->i_ceph_lock); | 377 | spin_unlock(&ci->i_ceph_lock); |
376 | 378 | ||
377 | xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *), | 379 | xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *), |
378 | GFP_NOFS); | 380 | GFP_NOFS); |
379 | err = -ENOMEM; | 381 | err = -ENOMEM; |
380 | if (!xattrs) | 382 | if (!xattrs) |
381 | goto bad_lock; | 383 | goto bad_lock; |
382 | memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *)); | 384 | memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *)); |
383 | for (i = 0; i < numattr; i++) { | 385 | for (i = 0; i < numattr; i++) { |
384 | xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr), | 386 | xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr), |
385 | GFP_NOFS); | 387 | GFP_NOFS); |
386 | if (!xattrs[i]) | 388 | if (!xattrs[i]) |
387 | goto bad_lock; | 389 | goto bad_lock; |
388 | } | 390 | } |
389 | 391 | ||
390 | spin_lock(&ci->i_ceph_lock); | 392 | spin_lock(&ci->i_ceph_lock); |
391 | if (ci->i_xattrs.version != xattr_version) { | 393 | if (ci->i_xattrs.version != xattr_version) { |
392 | /* lost a race, retry */ | 394 | /* lost a race, retry */ |
393 | for (i = 0; i < numattr; i++) | 395 | for (i = 0; i < numattr; i++) |
394 | kfree(xattrs[i]); | 396 | kfree(xattrs[i]); |
395 | kfree(xattrs); | 397 | kfree(xattrs); |
396 | goto start; | 398 | goto start; |
397 | } | 399 | } |
398 | err = -EIO; | 400 | err = -EIO; |
399 | while (numattr--) { | 401 | while (numattr--) { |
400 | ceph_decode_32_safe(&p, end, len, bad); | 402 | ceph_decode_32_safe(&p, end, len, bad); |
401 | namelen = len; | 403 | namelen = len; |
402 | name = p; | 404 | name = p; |
403 | p += len; | 405 | p += len; |
404 | ceph_decode_32_safe(&p, end, len, bad); | 406 | ceph_decode_32_safe(&p, end, len, bad); |
405 | val = p; | 407 | val = p; |
406 | p += len; | 408 | p += len; |
407 | 409 | ||
408 | err = __set_xattr(ci, name, namelen, val, len, | 410 | err = __set_xattr(ci, name, namelen, val, len, |
409 | 0, 0, 0, &xattrs[numattr]); | 411 | 0, 0, 0, &xattrs[numattr]); |
410 | 412 | ||
411 | if (err < 0) | 413 | if (err < 0) |
412 | goto bad; | 414 | goto bad; |
413 | } | 415 | } |
414 | kfree(xattrs); | 416 | kfree(xattrs); |
415 | } | 417 | } |
416 | ci->i_xattrs.index_version = ci->i_xattrs.version; | 418 | ci->i_xattrs.index_version = ci->i_xattrs.version; |
417 | ci->i_xattrs.dirty = false; | 419 | ci->i_xattrs.dirty = false; |
418 | 420 | ||
419 | return err; | 421 | return err; |
420 | bad_lock: | 422 | bad_lock: |
421 | spin_lock(&ci->i_ceph_lock); | 423 | spin_lock(&ci->i_ceph_lock); |
422 | bad: | 424 | bad: |
423 | if (xattrs) { | 425 | if (xattrs) { |
424 | for (i = 0; i < numattr; i++) | 426 | for (i = 0; i < numattr; i++) |
425 | kfree(xattrs[i]); | 427 | kfree(xattrs[i]); |
426 | kfree(xattrs); | 428 | kfree(xattrs); |
427 | } | 429 | } |
428 | ci->i_xattrs.names_size = 0; | 430 | ci->i_xattrs.names_size = 0; |
429 | return err; | 431 | return err; |
430 | } | 432 | } |
431 | 433 | ||
432 | static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size, | 434 | static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size, |
433 | int val_size) | 435 | int val_size) |
434 | { | 436 | { |
435 | /* | 437 | /* |
436 | * 4 bytes for the length, and additional 4 bytes per each xattr name, | 438 | * 4 bytes for the length, and additional 4 bytes per each xattr name, |
437 | * 4 bytes per each value | 439 | * 4 bytes per each value |
438 | */ | 440 | */ |
439 | int size = 4 + ci->i_xattrs.count*(4 + 4) + | 441 | int size = 4 + ci->i_xattrs.count*(4 + 4) + |
440 | ci->i_xattrs.names_size + | 442 | ci->i_xattrs.names_size + |
441 | ci->i_xattrs.vals_size; | 443 | ci->i_xattrs.vals_size; |
442 | dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n", | 444 | dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n", |
443 | ci->i_xattrs.count, ci->i_xattrs.names_size, | 445 | ci->i_xattrs.count, ci->i_xattrs.names_size, |
444 | ci->i_xattrs.vals_size); | 446 | ci->i_xattrs.vals_size); |
445 | 447 | ||
446 | if (name_size) | 448 | if (name_size) |
447 | size += 4 + 4 + name_size + val_size; | 449 | size += 4 + 4 + name_size + val_size; |
448 | 450 | ||
449 | return size; | 451 | return size; |
450 | } | 452 | } |
451 | 453 | ||
452 | /* | 454 | /* |
453 | * If there are dirty xattrs, reencode xattrs into the prealloc_blob | 455 | * If there are dirty xattrs, reencode xattrs into the prealloc_blob |
454 | * and swap into place. | 456 | * and swap into place. |
455 | */ | 457 | */ |
/*
 * If there are dirty xattrs, reencode xattrs into the prealloc_blob
 * and swap into place.
 *
 * The caller must have sized prealloc_blob with
 * __get_required_blob_size() beforehand (the BUG_ON enforces this) and
 * must hold ci->i_ceph_lock.  On completion the old blob reference is
 * dropped, prealloc_blob becomes the live blob, the dirty flag is
 * cleared, and the xattr version is bumped.
 */
void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
{
	struct rb_node *p;
	struct ceph_inode_xattr *xattr = NULL;
	void *dest;

	dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
	if (ci->i_xattrs.dirty) {
		int need = __get_required_blob_size(ci, 0, 0);

		BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);

		p = rb_first(&ci->i_xattrs.index);
		dest = ci->i_xattrs.prealloc_blob->vec.iov_base;

		/* wire format: count, then (name_len, name, val_len, val)* */
		ceph_encode_32(&dest, ci->i_xattrs.count);
		while (p) {
			xattr = rb_entry(p, struct ceph_inode_xattr, node);

			ceph_encode_32(&dest, xattr->name_len);
			memcpy(dest, xattr->name, xattr->name_len);
			dest += xattr->name_len;
			ceph_encode_32(&dest, xattr->val_len);
			memcpy(dest, xattr->val, xattr->val_len);
			dest += xattr->val_len;

			p = rb_next(p);
		}

		/* adjust buffer len; it may be larger than we need */
		ci->i_xattrs.prealloc_blob->vec.iov_len =
			dest - ci->i_xattrs.prealloc_blob->vec.iov_base;

		/* swap the freshly encoded blob into place */
		if (ci->i_xattrs.blob)
			ceph_buffer_put(ci->i_xattrs.blob);
		ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
		ci->i_xattrs.prealloc_blob = NULL;
		ci->i_xattrs.dirty = false;
		ci->i_xattrs.version++;
	}
}
497 | 499 | ||
/*
 * getxattr entry point.  Serves virtual "ceph.*" xattrs from the vxattr
 * tables when possible, otherwise looks up the name in the in-memory
 * index, fetching fresh xattrs from the MDS first if our cached copy is
 * stale.  Returns the value length, -ERANGE if @size is too small, or
 * -ENODATA if the attribute does not exist.  Per getxattr convention, a
 * @size of 0 only probes for the required length.
 */
ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
		      size_t size)
{
	struct inode *inode = dentry->d_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
	int err;
	struct ceph_inode_xattr *xattr;
	struct ceph_vxattr_cb *vxattr = NULL;

	if (!ceph_is_valid_xattr(name))
		return -ENODATA;

	/* let's see if a virtual xattr was requested */
	if (vxattrs)
		vxattr = ceph_match_vxattr(vxattrs, name);

	spin_lock(&ci->i_ceph_lock);
	dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
	     ci->i_xattrs.version, ci->i_xattrs.index_version);

	/* fast path: cached xattrs are current and the index is built */
	if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
	    (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
		goto get_xattr;
	} else {
		/* must drop the lock to talk to the MDS */
		spin_unlock(&ci->i_ceph_lock);
		/* get xattrs from mds (if we don't already have them) */
		err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
		if (err)
			return err;
	}

	spin_lock(&ci->i_ceph_lock);

	/* read-only virtual xattrs are answered from cached inode state */
	if (vxattr && vxattr->readonly) {
		err = vxattr->getxattr_cb(ci, value, size);
		goto out;
	}

	err = __build_xattrs(inode);
	if (err < 0)
		goto out;

get_xattr:
	err = -ENODATA;  /* == ENOATTR */
	xattr = __get_xattr(ci, name);
	if (!xattr) {
		/* fall back to the vxattr callback when not stored */
		if (vxattr)
			err = vxattr->getxattr_cb(ci, value, size);
		goto out;
	}

	err = -ERANGE;
	if (size && size < xattr->val_len)
		goto out;

	err = xattr->val_len;
	if (size == 0)
		goto out;	/* size probe only */

	memcpy(value, xattr->val, xattr->val_len);

out:
	spin_unlock(&ci->i_ceph_lock);
	return err;
}
563 | } | 565 | } |
564 | 566 | ||
565 | ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) | 567 | ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) |
566 | { | 568 | { |
567 | struct inode *inode = dentry->d_inode; | 569 | struct inode *inode = dentry->d_inode; |
568 | struct ceph_inode_info *ci = ceph_inode(inode); | 570 | struct ceph_inode_info *ci = ceph_inode(inode); |
569 | struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); | 571 | struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); |
570 | u32 vir_namelen = 0; | 572 | u32 vir_namelen = 0; |
571 | u32 namelen; | 573 | u32 namelen; |
572 | int err; | 574 | int err; |
573 | u32 len; | 575 | u32 len; |
574 | int i; | 576 | int i; |
575 | 577 | ||
576 | spin_lock(&ci->i_ceph_lock); | 578 | spin_lock(&ci->i_ceph_lock); |
577 | dout("listxattr %p ver=%lld index_ver=%lld\n", inode, | 579 | dout("listxattr %p ver=%lld index_ver=%lld\n", inode, |
578 | ci->i_xattrs.version, ci->i_xattrs.index_version); | 580 | ci->i_xattrs.version, ci->i_xattrs.index_version); |
579 | 581 | ||
580 | if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && | 582 | if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && |
581 | (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { | 583 | (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { |
582 | goto list_xattr; | 584 | goto list_xattr; |
583 | } else { | 585 | } else { |
584 | spin_unlock(&ci->i_ceph_lock); | 586 | spin_unlock(&ci->i_ceph_lock); |
585 | err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); | 587 | err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); |
586 | if (err) | 588 | if (err) |
587 | return err; | 589 | return err; |
588 | } | 590 | } |
589 | 591 | ||
590 | spin_lock(&ci->i_ceph_lock); | 592 | spin_lock(&ci->i_ceph_lock); |
591 | 593 | ||
592 | err = __build_xattrs(inode); | 594 | err = __build_xattrs(inode); |
593 | if (err < 0) | 595 | if (err < 0) |
594 | goto out; | 596 | goto out; |
595 | 597 | ||
596 | list_xattr: | 598 | list_xattr: |
597 | vir_namelen = 0; | 599 | vir_namelen = 0; |
598 | /* include virtual dir xattrs */ | 600 | /* include virtual dir xattrs */ |
599 | if (vxattrs) | 601 | if (vxattrs) |
600 | for (i = 0; vxattrs[i].name; i++) | 602 | for (i = 0; vxattrs[i].name; i++) |
601 | vir_namelen += strlen(vxattrs[i].name) + 1; | 603 | vir_namelen += strlen(vxattrs[i].name) + 1; |
602 | /* adding 1 byte per each variable due to the null termination */ | 604 | /* adding 1 byte per each variable due to the null termination */ |
603 | namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count; | 605 | namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count; |
604 | err = -ERANGE; | 606 | err = -ERANGE; |
605 | if (size && namelen > size) | 607 | if (size && namelen > size) |
606 | goto out; | 608 | goto out; |
607 | 609 | ||
608 | err = namelen; | 610 | err = namelen; |
609 | if (size == 0) | 611 | if (size == 0) |
610 | goto out; | 612 | goto out; |
611 | 613 | ||
612 | names = __copy_xattr_names(ci, names); | 614 | names = __copy_xattr_names(ci, names); |
613 | 615 | ||
614 | /* virtual xattr names, too */ | 616 | /* virtual xattr names, too */ |
615 | if (vxattrs) | 617 | if (vxattrs) |
616 | for (i = 0; vxattrs[i].name; i++) { | 618 | for (i = 0; vxattrs[i].name; i++) { |
617 | len = sprintf(names, "%s", vxattrs[i].name); | 619 | len = sprintf(names, "%s", vxattrs[i].name); |
618 | names += len + 1; | 620 | names += len + 1; |
619 | } | 621 | } |
620 | 622 | ||
621 | out: | 623 | out: |
622 | spin_unlock(&ci->i_ceph_lock); | 624 | spin_unlock(&ci->i_ceph_lock); |
623 | return err; | 625 | return err; |
624 | } | 626 | } |
625 | 627 | ||
626 | static int ceph_sync_setxattr(struct dentry *dentry, const char *name, | 628 | static int ceph_sync_setxattr(struct dentry *dentry, const char *name, |
627 | const char *value, size_t size, int flags) | 629 | const char *value, size_t size, int flags) |
628 | { | 630 | { |
629 | struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); | 631 | struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); |
630 | struct inode *inode = dentry->d_inode; | 632 | struct inode *inode = dentry->d_inode; |
631 | struct ceph_inode_info *ci = ceph_inode(inode); | 633 | struct ceph_inode_info *ci = ceph_inode(inode); |
632 | struct inode *parent_inode; | 634 | struct inode *parent_inode; |
633 | struct ceph_mds_request *req; | 635 | struct ceph_mds_request *req; |
634 | struct ceph_mds_client *mdsc = fsc->mdsc; | 636 | struct ceph_mds_client *mdsc = fsc->mdsc; |
635 | int err; | 637 | int err; |
636 | int i, nr_pages; | 638 | int i, nr_pages; |
637 | struct page **pages = NULL; | 639 | struct page **pages = NULL; |
638 | void *kaddr; | 640 | void *kaddr; |
639 | 641 | ||
640 | /* copy value into some pages */ | 642 | /* copy value into some pages */ |
641 | nr_pages = calc_pages_for(0, size); | 643 | nr_pages = calc_pages_for(0, size); |
642 | if (nr_pages) { | 644 | if (nr_pages) { |
643 | pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS); | 645 | pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS); |
644 | if (!pages) | 646 | if (!pages) |
645 | return -ENOMEM; | 647 | return -ENOMEM; |
646 | err = -ENOMEM; | 648 | err = -ENOMEM; |
647 | for (i = 0; i < nr_pages; i++) { | 649 | for (i = 0; i < nr_pages; i++) { |
648 | pages[i] = __page_cache_alloc(GFP_NOFS); | 650 | pages[i] = __page_cache_alloc(GFP_NOFS); |
649 | if (!pages[i]) { | 651 | if (!pages[i]) { |
650 | nr_pages = i; | 652 | nr_pages = i; |
651 | goto out; | 653 | goto out; |
652 | } | 654 | } |
653 | kaddr = kmap(pages[i]); | 655 | kaddr = kmap(pages[i]); |
654 | memcpy(kaddr, value + i*PAGE_CACHE_SIZE, | 656 | memcpy(kaddr, value + i*PAGE_CACHE_SIZE, |
655 | min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE)); | 657 | min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE)); |
656 | } | 658 | } |
657 | } | 659 | } |
658 | 660 | ||
659 | dout("setxattr value=%.*s\n", (int)size, value); | 661 | dout("setxattr value=%.*s\n", (int)size, value); |
660 | 662 | ||
661 | /* do request */ | 663 | /* do request */ |
662 | req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, | 664 | req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, |
663 | USE_AUTH_MDS); | 665 | USE_AUTH_MDS); |
664 | if (IS_ERR(req)) { | 666 | if (IS_ERR(req)) { |
665 | err = PTR_ERR(req); | 667 | err = PTR_ERR(req); |
666 | goto out; | 668 | goto out; |
667 | } | 669 | } |
668 | req->r_inode = inode; | 670 | req->r_inode = inode; |
669 | ihold(inode); | 671 | ihold(inode); |
670 | req->r_inode_drop = CEPH_CAP_XATTR_SHARED; | 672 | req->r_inode_drop = CEPH_CAP_XATTR_SHARED; |
671 | req->r_num_caps = 1; | 673 | req->r_num_caps = 1; |
672 | req->r_args.setxattr.flags = cpu_to_le32(flags); | 674 | req->r_args.setxattr.flags = cpu_to_le32(flags); |
673 | req->r_path2 = kstrdup(name, GFP_NOFS); | 675 | req->r_path2 = kstrdup(name, GFP_NOFS); |
674 | 676 | ||
675 | req->r_pages = pages; | 677 | req->r_pages = pages; |
676 | req->r_num_pages = nr_pages; | 678 | req->r_num_pages = nr_pages; |
677 | req->r_data_len = size; | 679 | req->r_data_len = size; |
678 | 680 | ||
679 | dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); | 681 | dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); |
680 | parent_inode = ceph_get_dentry_parent_inode(dentry); | 682 | parent_inode = ceph_get_dentry_parent_inode(dentry); |
681 | err = ceph_mdsc_do_request(mdsc, parent_inode, req); | 683 | err = ceph_mdsc_do_request(mdsc, parent_inode, req); |
682 | iput(parent_inode); | 684 | iput(parent_inode); |
683 | ceph_mdsc_put_request(req); | 685 | ceph_mdsc_put_request(req); |
684 | dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); | 686 | dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); |
685 | 687 | ||
686 | out: | 688 | out: |
687 | if (pages) { | 689 | if (pages) { |
688 | for (i = 0; i < nr_pages; i++) | 690 | for (i = 0; i < nr_pages; i++) |
689 | __free_page(pages[i]); | 691 | __free_page(pages[i]); |
690 | kfree(pages); | 692 | kfree(pages); |
691 | } | 693 | } |
692 | return err; | 694 | return err; |
693 | } | 695 | } |
694 | 696 | ||
695 | int ceph_setxattr(struct dentry *dentry, const char *name, | 697 | int ceph_setxattr(struct dentry *dentry, const char *name, |
696 | const void *value, size_t size, int flags) | 698 | const void *value, size_t size, int flags) |
697 | { | 699 | { |
698 | struct inode *inode = dentry->d_inode; | 700 | struct inode *inode = dentry->d_inode; |
699 | struct ceph_inode_info *ci = ceph_inode(inode); | 701 | struct ceph_inode_info *ci = ceph_inode(inode); |
700 | struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); | 702 | struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); |
701 | int err; | 703 | int err; |
702 | int name_len = strlen(name); | 704 | int name_len = strlen(name); |
703 | int val_len = size; | 705 | int val_len = size; |
704 | char *newname = NULL; | 706 | char *newname = NULL; |
705 | char *newval = NULL; | 707 | char *newval = NULL; |
706 | struct ceph_inode_xattr *xattr = NULL; | 708 | struct ceph_inode_xattr *xattr = NULL; |
707 | int issued; | 709 | int issued; |
708 | int required_blob_size; | 710 | int required_blob_size; |
709 | int dirty; | 711 | int dirty; |
710 | 712 | ||
711 | if (ceph_snap(inode) != CEPH_NOSNAP) | 713 | if (ceph_snap(inode) != CEPH_NOSNAP) |
712 | return -EROFS; | 714 | return -EROFS; |
713 | 715 | ||
714 | if (!ceph_is_valid_xattr(name)) | 716 | if (!ceph_is_valid_xattr(name)) |
715 | return -EOPNOTSUPP; | 717 | return -EOPNOTSUPP; |
716 | 718 | ||
717 | if (vxattrs) { | 719 | if (vxattrs) { |
718 | struct ceph_vxattr_cb *vxattr = | 720 | struct ceph_vxattr_cb *vxattr = |
719 | ceph_match_vxattr(vxattrs, name); | 721 | ceph_match_vxattr(vxattrs, name); |
720 | if (vxattr && vxattr->readonly) | 722 | if (vxattr && vxattr->readonly) |
721 | return -EOPNOTSUPP; | 723 | return -EOPNOTSUPP; |
722 | } | 724 | } |
723 | 725 | ||
724 | /* preallocate memory for xattr name, value, index node */ | 726 | /* preallocate memory for xattr name, value, index node */ |
725 | err = -ENOMEM; | 727 | err = -ENOMEM; |
726 | newname = kmemdup(name, name_len + 1, GFP_NOFS); | 728 | newname = kmemdup(name, name_len + 1, GFP_NOFS); |
727 | if (!newname) | 729 | if (!newname) |
728 | goto out; | 730 | goto out; |
729 | 731 | ||
730 | if (val_len) { | 732 | if (val_len) { |
731 | newval = kmalloc(val_len + 1, GFP_NOFS); | 733 | newval = kmalloc(val_len + 1, GFP_NOFS); |
732 | if (!newval) | 734 | if (!newval) |
733 | goto out; | 735 | goto out; |
734 | memcpy(newval, value, val_len); | 736 | memcpy(newval, value, val_len); |
735 | newval[val_len] = '\0'; | 737 | newval[val_len] = '\0'; |
736 | } | 738 | } |
737 | 739 | ||
738 | xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS); | 740 | xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS); |
739 | if (!xattr) | 741 | if (!xattr) |
740 | goto out; | 742 | goto out; |
741 | 743 | ||
742 | spin_lock(&ci->i_ceph_lock); | 744 | spin_lock(&ci->i_ceph_lock); |
743 | retry: | 745 | retry: |
744 | issued = __ceph_caps_issued(ci, NULL); | 746 | issued = __ceph_caps_issued(ci, NULL); |
745 | if (!(issued & CEPH_CAP_XATTR_EXCL)) | 747 | if (!(issued & CEPH_CAP_XATTR_EXCL)) |
746 | goto do_sync; | 748 | goto do_sync; |
747 | __build_xattrs(inode); | 749 | __build_xattrs(inode); |
748 | 750 | ||
749 | required_blob_size = __get_required_blob_size(ci, name_len, val_len); | 751 | required_blob_size = __get_required_blob_size(ci, name_len, val_len); |
750 | 752 | ||
751 | if (!ci->i_xattrs.prealloc_blob || | 753 | if (!ci->i_xattrs.prealloc_blob || |
752 | required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { | 754 | required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { |
753 | struct ceph_buffer *blob = NULL; | 755 | struct ceph_buffer *blob = NULL; |
754 | 756 | ||
755 | spin_unlock(&ci->i_ceph_lock); | 757 | spin_unlock(&ci->i_ceph_lock); |
756 | dout(" preaallocating new blob size=%d\n", required_blob_size); | 758 | dout(" preaallocating new blob size=%d\n", required_blob_size); |
757 | blob = ceph_buffer_new(required_blob_size, GFP_NOFS); | 759 | blob = ceph_buffer_new(required_blob_size, GFP_NOFS); |
758 | if (!blob) | 760 | if (!blob) |
759 | goto out; | 761 | goto out; |
760 | spin_lock(&ci->i_ceph_lock); | 762 | spin_lock(&ci->i_ceph_lock); |
761 | if (ci->i_xattrs.prealloc_blob) | 763 | if (ci->i_xattrs.prealloc_blob) |
762 | ceph_buffer_put(ci->i_xattrs.prealloc_blob); | 764 | ceph_buffer_put(ci->i_xattrs.prealloc_blob); |
763 | ci->i_xattrs.prealloc_blob = blob; | 765 | ci->i_xattrs.prealloc_blob = blob; |
764 | goto retry; | 766 | goto retry; |
765 | } | 767 | } |
766 | 768 | ||
767 | dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); | 769 | dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); |
768 | err = __set_xattr(ci, newname, name_len, newval, | 770 | err = __set_xattr(ci, newname, name_len, newval, |
769 | val_len, 1, 1, 1, &xattr); | 771 | val_len, 1, 1, 1, &xattr); |
770 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); | 772 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); |
771 | ci->i_xattrs.dirty = true; | 773 | ci->i_xattrs.dirty = true; |
772 | inode->i_ctime = CURRENT_TIME; | 774 | inode->i_ctime = CURRENT_TIME; |
773 | spin_unlock(&ci->i_ceph_lock); | 775 | spin_unlock(&ci->i_ceph_lock); |
774 | if (dirty) | 776 | if (dirty) |
775 | __mark_inode_dirty(inode, dirty); | 777 | __mark_inode_dirty(inode, dirty); |
776 | return err; | 778 | return err; |
777 | 779 | ||
778 | do_sync: | 780 | do_sync: |
779 | spin_unlock(&ci->i_ceph_lock); | 781 | spin_unlock(&ci->i_ceph_lock); |
780 | err = ceph_sync_setxattr(dentry, name, value, size, flags); | 782 | err = ceph_sync_setxattr(dentry, name, value, size, flags); |
781 | out: | 783 | out: |
782 | kfree(newname); | 784 | kfree(newname); |
783 | kfree(newval); | 785 | kfree(newval); |
784 | kfree(xattr); | 786 | kfree(xattr); |
785 | return err; | 787 | return err; |
786 | } | 788 | } |
787 | 789 | ||
788 | static int ceph_send_removexattr(struct dentry *dentry, const char *name) | 790 | static int ceph_send_removexattr(struct dentry *dentry, const char *name) |
789 | { | 791 | { |
790 | struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); | 792 | struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); |
791 | struct ceph_mds_client *mdsc = fsc->mdsc; | 793 | struct ceph_mds_client *mdsc = fsc->mdsc; |
792 | struct inode *inode = dentry->d_inode; | 794 | struct inode *inode = dentry->d_inode; |
793 | struct inode *parent_inode; | 795 | struct inode *parent_inode; |
794 | struct ceph_mds_request *req; | 796 | struct ceph_mds_request *req; |
795 | int err; | 797 | int err; |
796 | 798 | ||
797 | req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR, | 799 | req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR, |
798 | USE_AUTH_MDS); | 800 | USE_AUTH_MDS); |
799 | if (IS_ERR(req)) | 801 | if (IS_ERR(req)) |
800 | return PTR_ERR(req); | 802 | return PTR_ERR(req); |
801 | req->r_inode = inode; | 803 | req->r_inode = inode; |
802 | ihold(inode); | 804 | ihold(inode); |
803 | req->r_inode_drop = CEPH_CAP_XATTR_SHARED; | 805 | req->r_inode_drop = CEPH_CAP_XATTR_SHARED; |
804 | req->r_num_caps = 1; | 806 | req->r_num_caps = 1; |
805 | req->r_path2 = kstrdup(name, GFP_NOFS); | 807 | req->r_path2 = kstrdup(name, GFP_NOFS); |
806 | 808 | ||
807 | parent_inode = ceph_get_dentry_parent_inode(dentry); | 809 | parent_inode = ceph_get_dentry_parent_inode(dentry); |
808 | err = ceph_mdsc_do_request(mdsc, parent_inode, req); | 810 | err = ceph_mdsc_do_request(mdsc, parent_inode, req); |
809 | iput(parent_inode); | 811 | iput(parent_inode); |
810 | ceph_mdsc_put_request(req); | 812 | ceph_mdsc_put_request(req); |
811 | return err; | 813 | return err; |
812 | } | 814 | } |
813 | 815 | ||
814 | int ceph_removexattr(struct dentry *dentry, const char *name) | 816 | int ceph_removexattr(struct dentry *dentry, const char *name) |
815 | { | 817 | { |
816 | struct inode *inode = dentry->d_inode; | 818 | struct inode *inode = dentry->d_inode; |
817 | struct ceph_inode_info *ci = ceph_inode(inode); | 819 | struct ceph_inode_info *ci = ceph_inode(inode); |
818 | struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); | 820 | struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); |
819 | int issued; | 821 | int issued; |
820 | int err; | 822 | int err; |
821 | int required_blob_size; | 823 | int required_blob_size; |
822 | int dirty; | 824 | int dirty; |
823 | 825 | ||
824 | if (ceph_snap(inode) != CEPH_NOSNAP) | 826 | if (ceph_snap(inode) != CEPH_NOSNAP) |
825 | return -EROFS; | 827 | return -EROFS; |
826 | 828 | ||
827 | if (!ceph_is_valid_xattr(name)) | 829 | if (!ceph_is_valid_xattr(name)) |
828 | return -EOPNOTSUPP; | 830 | return -EOPNOTSUPP; |
829 | 831 | ||
830 | if (vxattrs) { | 832 | if (vxattrs) { |
831 | struct ceph_vxattr_cb *vxattr = | 833 | struct ceph_vxattr_cb *vxattr = |
832 | ceph_match_vxattr(vxattrs, name); | 834 | ceph_match_vxattr(vxattrs, name); |
833 | if (vxattr && vxattr->readonly) | 835 | if (vxattr && vxattr->readonly) |
834 | return -EOPNOTSUPP; | 836 | return -EOPNOTSUPP; |
835 | } | 837 | } |
836 | 838 | ||
837 | err = -ENOMEM; | 839 | err = -ENOMEM; |
838 | spin_lock(&ci->i_ceph_lock); | 840 | spin_lock(&ci->i_ceph_lock); |
839 | __build_xattrs(inode); | 841 | __build_xattrs(inode); |
840 | retry: | 842 | retry: |
841 | issued = __ceph_caps_issued(ci, NULL); | 843 | issued = __ceph_caps_issued(ci, NULL); |
842 | dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); | 844 | dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); |
843 | 845 | ||
844 | if (!(issued & CEPH_CAP_XATTR_EXCL)) | 846 | if (!(issued & CEPH_CAP_XATTR_EXCL)) |
845 | goto do_sync; | 847 | goto do_sync; |
846 | 848 | ||
847 | required_blob_size = __get_required_blob_size(ci, 0, 0); | 849 | required_blob_size = __get_required_blob_size(ci, 0, 0); |
848 | 850 | ||
849 | if (!ci->i_xattrs.prealloc_blob || | 851 | if (!ci->i_xattrs.prealloc_blob || |
850 | required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { | 852 | required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { |
851 | struct ceph_buffer *blob; | 853 | struct ceph_buffer *blob; |
852 | 854 | ||
853 | spin_unlock(&ci->i_ceph_lock); | 855 | spin_unlock(&ci->i_ceph_lock); |
854 | dout(" preaallocating new blob size=%d\n", required_blob_size); | 856 | dout(" preaallocating new blob size=%d\n", required_blob_size); |
855 | blob = ceph_buffer_new(required_blob_size, GFP_NOFS); | 857 | blob = ceph_buffer_new(required_blob_size, GFP_NOFS); |
856 | if (!blob) | 858 | if (!blob) |
857 | goto out; | 859 | goto out; |
858 | spin_lock(&ci->i_ceph_lock); | 860 | spin_lock(&ci->i_ceph_lock); |
859 | if (ci->i_xattrs.prealloc_blob) | 861 | if (ci->i_xattrs.prealloc_blob) |
860 | ceph_buffer_put(ci->i_xattrs.prealloc_blob); | 862 | ceph_buffer_put(ci->i_xattrs.prealloc_blob); |
861 | ci->i_xattrs.prealloc_blob = blob; | 863 | ci->i_xattrs.prealloc_blob = blob; |
862 | goto retry; | 864 | goto retry; |
863 | } | 865 | } |
864 | 866 | ||
865 | err = __remove_xattr_by_name(ceph_inode(inode), name); | 867 | err = __remove_xattr_by_name(ceph_inode(inode), name); |
866 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); | 868 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); |
867 | ci->i_xattrs.dirty = true; | 869 | ci->i_xattrs.dirty = true; |
868 | inode->i_ctime = CURRENT_TIME; | 870 | inode->i_ctime = CURRENT_TIME; |
869 | 871 | ||
870 | spin_unlock(&ci->i_ceph_lock); | 872 | spin_unlock(&ci->i_ceph_lock); |
871 | if (dirty) | 873 | if (dirty) |
872 | __mark_inode_dirty(inode, dirty); | 874 | __mark_inode_dirty(inode, dirty); |
873 | return err; | 875 | return err; |
874 | do_sync: | 876 | do_sync: |
875 | spin_unlock(&ci->i_ceph_lock); | 877 | spin_unlock(&ci->i_ceph_lock); |
876 | err = ceph_send_removexattr(dentry, name); | 878 | err = ceph_send_removexattr(dentry, name); |
877 | out: | 879 | out: |
878 | return err; | 880 | return err; |
879 | } | 881 | } |
880 | 882 | ||
881 | 883 |
net/ceph/ceph_common.c
1 | 1 | ||
2 | #include <linux/ceph/ceph_debug.h> | 2 | #include <linux/ceph/ceph_debug.h> |
3 | #include <linux/backing-dev.h> | 3 | #include <linux/backing-dev.h> |
4 | #include <linux/ctype.h> | 4 | #include <linux/ctype.h> |
5 | #include <linux/fs.h> | 5 | #include <linux/fs.h> |
6 | #include <linux/inet.h> | 6 | #include <linux/inet.h> |
7 | #include <linux/in6.h> | 7 | #include <linux/in6.h> |
8 | #include <linux/key.h> | 8 | #include <linux/key.h> |
9 | #include <keys/ceph-type.h> | 9 | #include <keys/ceph-type.h> |
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/mount.h> | 11 | #include <linux/mount.h> |
12 | #include <linux/parser.h> | 12 | #include <linux/parser.h> |
13 | #include <linux/sched.h> | 13 | #include <linux/sched.h> |
14 | #include <linux/seq_file.h> | 14 | #include <linux/seq_file.h> |
15 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
16 | #include <linux/statfs.h> | 16 | #include <linux/statfs.h> |
17 | #include <linux/string.h> | 17 | #include <linux/string.h> |
18 | 18 | ||
19 | 19 | ||
20 | #include <linux/ceph/libceph.h> | 20 | #include <linux/ceph/libceph.h> |
21 | #include <linux/ceph/debugfs.h> | 21 | #include <linux/ceph/debugfs.h> |
22 | #include <linux/ceph/decode.h> | 22 | #include <linux/ceph/decode.h> |
23 | #include <linux/ceph/mon_client.h> | 23 | #include <linux/ceph/mon_client.h> |
24 | #include <linux/ceph/auth.h> | 24 | #include <linux/ceph/auth.h> |
25 | #include "crypto.h" | 25 | #include "crypto.h" |
26 | 26 | ||
27 | 27 | ||
28 | 28 | ||
29 | /* | 29 | /* |
30 | * find filename portion of a path (/foo/bar/baz -> baz) | 30 | * find filename portion of a path (/foo/bar/baz -> baz) |
31 | */ | 31 | */ |
32 | const char *ceph_file_part(const char *s, int len) | 32 | const char *ceph_file_part(const char *s, int len) |
33 | { | 33 | { |
34 | const char *e = s + len; | 34 | const char *e = s + len; |
35 | 35 | ||
36 | while (e != s && *(e-1) != '/') | 36 | while (e != s && *(e-1) != '/') |
37 | e--; | 37 | e--; |
38 | return e; | 38 | return e; |
39 | } | 39 | } |
40 | EXPORT_SYMBOL(ceph_file_part); | 40 | EXPORT_SYMBOL(ceph_file_part); |
41 | 41 | ||
42 | const char *ceph_msg_type_name(int type) | 42 | const char *ceph_msg_type_name(int type) |
43 | { | 43 | { |
44 | switch (type) { | 44 | switch (type) { |
45 | case CEPH_MSG_SHUTDOWN: return "shutdown"; | 45 | case CEPH_MSG_SHUTDOWN: return "shutdown"; |
46 | case CEPH_MSG_PING: return "ping"; | 46 | case CEPH_MSG_PING: return "ping"; |
47 | case CEPH_MSG_AUTH: return "auth"; | 47 | case CEPH_MSG_AUTH: return "auth"; |
48 | case CEPH_MSG_AUTH_REPLY: return "auth_reply"; | 48 | case CEPH_MSG_AUTH_REPLY: return "auth_reply"; |
49 | case CEPH_MSG_MON_MAP: return "mon_map"; | 49 | case CEPH_MSG_MON_MAP: return "mon_map"; |
50 | case CEPH_MSG_MON_GET_MAP: return "mon_get_map"; | 50 | case CEPH_MSG_MON_GET_MAP: return "mon_get_map"; |
51 | case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe"; | 51 | case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe"; |
52 | case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack"; | 52 | case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack"; |
53 | case CEPH_MSG_STATFS: return "statfs"; | 53 | case CEPH_MSG_STATFS: return "statfs"; |
54 | case CEPH_MSG_STATFS_REPLY: return "statfs_reply"; | 54 | case CEPH_MSG_STATFS_REPLY: return "statfs_reply"; |
55 | case CEPH_MSG_MDS_MAP: return "mds_map"; | 55 | case CEPH_MSG_MDS_MAP: return "mds_map"; |
56 | case CEPH_MSG_CLIENT_SESSION: return "client_session"; | 56 | case CEPH_MSG_CLIENT_SESSION: return "client_session"; |
57 | case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect"; | 57 | case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect"; |
58 | case CEPH_MSG_CLIENT_REQUEST: return "client_request"; | 58 | case CEPH_MSG_CLIENT_REQUEST: return "client_request"; |
59 | case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward"; | 59 | case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward"; |
60 | case CEPH_MSG_CLIENT_REPLY: return "client_reply"; | 60 | case CEPH_MSG_CLIENT_REPLY: return "client_reply"; |
61 | case CEPH_MSG_CLIENT_CAPS: return "client_caps"; | 61 | case CEPH_MSG_CLIENT_CAPS: return "client_caps"; |
62 | case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release"; | 62 | case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release"; |
63 | case CEPH_MSG_CLIENT_SNAP: return "client_snap"; | 63 | case CEPH_MSG_CLIENT_SNAP: return "client_snap"; |
64 | case CEPH_MSG_CLIENT_LEASE: return "client_lease"; | 64 | case CEPH_MSG_CLIENT_LEASE: return "client_lease"; |
65 | case CEPH_MSG_OSD_MAP: return "osd_map"; | 65 | case CEPH_MSG_OSD_MAP: return "osd_map"; |
66 | case CEPH_MSG_OSD_OP: return "osd_op"; | 66 | case CEPH_MSG_OSD_OP: return "osd_op"; |
67 | case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; | 67 | case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; |
68 | case CEPH_MSG_WATCH_NOTIFY: return "watch_notify"; | 68 | case CEPH_MSG_WATCH_NOTIFY: return "watch_notify"; |
69 | default: return "unknown"; | 69 | default: return "unknown"; |
70 | } | 70 | } |
71 | } | 71 | } |
72 | EXPORT_SYMBOL(ceph_msg_type_name); | 72 | EXPORT_SYMBOL(ceph_msg_type_name); |
73 | 73 | ||
74 | /* | 74 | /* |
75 | * Initially learn our fsid, or verify an fsid matches. | 75 | * Initially learn our fsid, or verify an fsid matches. |
76 | */ | 76 | */ |
77 | int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) | 77 | int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) |
78 | { | 78 | { |
79 | if (client->have_fsid) { | 79 | if (client->have_fsid) { |
80 | if (ceph_fsid_compare(&client->fsid, fsid)) { | 80 | if (ceph_fsid_compare(&client->fsid, fsid)) { |
81 | pr_err("bad fsid, had %pU got %pU", | 81 | pr_err("bad fsid, had %pU got %pU", |
82 | &client->fsid, fsid); | 82 | &client->fsid, fsid); |
83 | return -1; | 83 | return -1; |
84 | } | 84 | } |
85 | } else { | 85 | } else { |
86 | pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid); | 86 | pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid); |
87 | memcpy(&client->fsid, fsid, sizeof(*fsid)); | 87 | memcpy(&client->fsid, fsid, sizeof(*fsid)); |
88 | ceph_debugfs_client_init(client); | ||
89 | client->have_fsid = true; | ||
90 | } | 88 | } |
91 | return 0; | 89 | return 0; |
92 | } | 90 | } |
93 | EXPORT_SYMBOL(ceph_check_fsid); | 91 | EXPORT_SYMBOL(ceph_check_fsid); |
94 | 92 | ||
95 | static int strcmp_null(const char *s1, const char *s2) | 93 | static int strcmp_null(const char *s1, const char *s2) |
96 | { | 94 | { |
97 | if (!s1 && !s2) | 95 | if (!s1 && !s2) |
98 | return 0; | 96 | return 0; |
99 | if (s1 && !s2) | 97 | if (s1 && !s2) |
100 | return -1; | 98 | return -1; |
101 | if (!s1 && s2) | 99 | if (!s1 && s2) |
102 | return 1; | 100 | return 1; |
103 | return strcmp(s1, s2); | 101 | return strcmp(s1, s2); |
104 | } | 102 | } |
105 | 103 | ||
106 | int ceph_compare_options(struct ceph_options *new_opt, | 104 | int ceph_compare_options(struct ceph_options *new_opt, |
107 | struct ceph_client *client) | 105 | struct ceph_client *client) |
108 | { | 106 | { |
109 | struct ceph_options *opt1 = new_opt; | 107 | struct ceph_options *opt1 = new_opt; |
110 | struct ceph_options *opt2 = client->options; | 108 | struct ceph_options *opt2 = client->options; |
111 | int ofs = offsetof(struct ceph_options, mon_addr); | 109 | int ofs = offsetof(struct ceph_options, mon_addr); |
112 | int i; | 110 | int i; |
113 | int ret; | 111 | int ret; |
114 | 112 | ||
115 | ret = memcmp(opt1, opt2, ofs); | 113 | ret = memcmp(opt1, opt2, ofs); |
116 | if (ret) | 114 | if (ret) |
117 | return ret; | 115 | return ret; |
118 | 116 | ||
119 | ret = strcmp_null(opt1->name, opt2->name); | 117 | ret = strcmp_null(opt1->name, opt2->name); |
120 | if (ret) | 118 | if (ret) |
121 | return ret; | 119 | return ret; |
122 | 120 | ||
123 | if (opt1->key && !opt2->key) | 121 | if (opt1->key && !opt2->key) |
124 | return -1; | 122 | return -1; |
125 | if (!opt1->key && opt2->key) | 123 | if (!opt1->key && opt2->key) |
126 | return 1; | 124 | return 1; |
127 | if (opt1->key && opt2->key) { | 125 | if (opt1->key && opt2->key) { |
128 | if (opt1->key->type != opt2->key->type) | 126 | if (opt1->key->type != opt2->key->type) |
129 | return -1; | 127 | return -1; |
130 | if (opt1->key->created.tv_sec != opt2->key->created.tv_sec) | 128 | if (opt1->key->created.tv_sec != opt2->key->created.tv_sec) |
131 | return -1; | 129 | return -1; |
132 | if (opt1->key->created.tv_nsec != opt2->key->created.tv_nsec) | 130 | if (opt1->key->created.tv_nsec != opt2->key->created.tv_nsec) |
133 | return -1; | 131 | return -1; |
134 | if (opt1->key->len != opt2->key->len) | 132 | if (opt1->key->len != opt2->key->len) |
135 | return -1; | 133 | return -1; |
136 | if (opt1->key->key && !opt2->key->key) | 134 | if (opt1->key->key && !opt2->key->key) |
137 | return -1; | 135 | return -1; |
138 | if (!opt1->key->key && opt2->key->key) | 136 | if (!opt1->key->key && opt2->key->key) |
139 | return 1; | 137 | return 1; |
140 | if (opt1->key->key && opt2->key->key) { | 138 | if (opt1->key->key && opt2->key->key) { |
141 | ret = memcmp(opt1->key->key, opt2->key->key, opt1->key->len); | 139 | ret = memcmp(opt1->key->key, opt2->key->key, opt1->key->len); |
142 | if (ret) | 140 | if (ret) |
143 | return ret; | 141 | return ret; |
144 | } | 142 | } |
145 | } | 143 | } |
146 | 144 | ||
147 | /* any matching mon ip implies a match */ | 145 | /* any matching mon ip implies a match */ |
148 | for (i = 0; i < opt1->num_mon; i++) { | 146 | for (i = 0; i < opt1->num_mon; i++) { |
149 | if (ceph_monmap_contains(client->monc.monmap, | 147 | if (ceph_monmap_contains(client->monc.monmap, |
150 | &opt1->mon_addr[i])) | 148 | &opt1->mon_addr[i])) |
151 | return 0; | 149 | return 0; |
152 | } | 150 | } |
153 | return -1; | 151 | return -1; |
154 | } | 152 | } |
155 | EXPORT_SYMBOL(ceph_compare_options); | 153 | EXPORT_SYMBOL(ceph_compare_options); |
156 | 154 | ||
157 | 155 | ||
158 | static int parse_fsid(const char *str, struct ceph_fsid *fsid) | 156 | static int parse_fsid(const char *str, struct ceph_fsid *fsid) |
159 | { | 157 | { |
160 | int i = 0; | 158 | int i = 0; |
161 | char tmp[3]; | 159 | char tmp[3]; |
162 | int err = -EINVAL; | 160 | int err = -EINVAL; |
163 | int d; | 161 | int d; |
164 | 162 | ||
165 | dout("parse_fsid '%s'\n", str); | 163 | dout("parse_fsid '%s'\n", str); |
166 | tmp[2] = 0; | 164 | tmp[2] = 0; |
167 | while (*str && i < 16) { | 165 | while (*str && i < 16) { |
168 | if (ispunct(*str)) { | 166 | if (ispunct(*str)) { |
169 | str++; | 167 | str++; |
170 | continue; | 168 | continue; |
171 | } | 169 | } |
172 | if (!isxdigit(str[0]) || !isxdigit(str[1])) | 170 | if (!isxdigit(str[0]) || !isxdigit(str[1])) |
173 | break; | 171 | break; |
174 | tmp[0] = str[0]; | 172 | tmp[0] = str[0]; |
175 | tmp[1] = str[1]; | 173 | tmp[1] = str[1]; |
176 | if (sscanf(tmp, "%x", &d) < 1) | 174 | if (sscanf(tmp, "%x", &d) < 1) |
177 | break; | 175 | break; |
178 | fsid->fsid[i] = d & 0xff; | 176 | fsid->fsid[i] = d & 0xff; |
179 | i++; | 177 | i++; |
180 | str += 2; | 178 | str += 2; |
181 | } | 179 | } |
182 | 180 | ||
183 | if (i == 16) | 181 | if (i == 16) |
184 | err = 0; | 182 | err = 0; |
185 | dout("parse_fsid ret %d got fsid %pU", err, fsid); | 183 | dout("parse_fsid ret %d got fsid %pU", err, fsid); |
186 | return err; | 184 | return err; |
187 | } | 185 | } |
188 | 186 | ||
189 | /* | 187 | /* |
190 | * ceph options | 188 | * ceph options |
191 | */ | 189 | */ |
192 | enum { | 190 | enum { |
193 | Opt_osdtimeout, | 191 | Opt_osdtimeout, |
194 | Opt_osdkeepalivetimeout, | 192 | Opt_osdkeepalivetimeout, |
195 | Opt_mount_timeout, | 193 | Opt_mount_timeout, |
196 | Opt_osd_idle_ttl, | 194 | Opt_osd_idle_ttl, |
197 | Opt_last_int, | 195 | Opt_last_int, |
198 | /* int args above */ | 196 | /* int args above */ |
199 | Opt_fsid, | 197 | Opt_fsid, |
200 | Opt_name, | 198 | Opt_name, |
201 | Opt_secret, | 199 | Opt_secret, |
202 | Opt_key, | 200 | Opt_key, |
203 | Opt_ip, | 201 | Opt_ip, |
204 | Opt_last_string, | 202 | Opt_last_string, |
205 | /* string args above */ | 203 | /* string args above */ |
206 | Opt_noshare, | 204 | Opt_noshare, |
207 | Opt_nocrc, | 205 | Opt_nocrc, |
208 | }; | 206 | }; |
209 | 207 | ||
210 | static match_table_t opt_tokens = { | 208 | static match_table_t opt_tokens = { |
211 | {Opt_osdtimeout, "osdtimeout=%d"}, | 209 | {Opt_osdtimeout, "osdtimeout=%d"}, |
212 | {Opt_osdkeepalivetimeout, "osdkeepalive=%d"}, | 210 | {Opt_osdkeepalivetimeout, "osdkeepalive=%d"}, |
213 | {Opt_mount_timeout, "mount_timeout=%d"}, | 211 | {Opt_mount_timeout, "mount_timeout=%d"}, |
214 | {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, | 212 | {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, |
215 | /* int args above */ | 213 | /* int args above */ |
216 | {Opt_fsid, "fsid=%s"}, | 214 | {Opt_fsid, "fsid=%s"}, |
217 | {Opt_name, "name=%s"}, | 215 | {Opt_name, "name=%s"}, |
218 | {Opt_secret, "secret=%s"}, | 216 | {Opt_secret, "secret=%s"}, |
219 | {Opt_key, "key=%s"}, | 217 | {Opt_key, "key=%s"}, |
220 | {Opt_ip, "ip=%s"}, | 218 | {Opt_ip, "ip=%s"}, |
221 | /* string args above */ | 219 | /* string args above */ |
222 | {Opt_noshare, "noshare"}, | 220 | {Opt_noshare, "noshare"}, |
223 | {Opt_nocrc, "nocrc"}, | 221 | {Opt_nocrc, "nocrc"}, |
224 | {-1, NULL} | 222 | {-1, NULL} |
225 | }; | 223 | }; |
226 | 224 | ||
227 | void ceph_destroy_options(struct ceph_options *opt) | 225 | void ceph_destroy_options(struct ceph_options *opt) |
228 | { | 226 | { |
229 | dout("destroy_options %p\n", opt); | 227 | dout("destroy_options %p\n", opt); |
230 | kfree(opt->name); | 228 | kfree(opt->name); |
231 | if (opt->key) { | 229 | if (opt->key) { |
232 | ceph_crypto_key_destroy(opt->key); | 230 | ceph_crypto_key_destroy(opt->key); |
233 | kfree(opt->key); | 231 | kfree(opt->key); |
234 | } | 232 | } |
235 | kfree(opt->mon_addr); | 233 | kfree(opt->mon_addr); |
236 | kfree(opt); | 234 | kfree(opt); |
237 | } | 235 | } |
238 | EXPORT_SYMBOL(ceph_destroy_options); | 236 | EXPORT_SYMBOL(ceph_destroy_options); |
239 | 237 | ||
240 | /* get secret from key store */ | 238 | /* get secret from key store */ |
241 | static int get_secret(struct ceph_crypto_key *dst, const char *name) { | 239 | static int get_secret(struct ceph_crypto_key *dst, const char *name) { |
242 | struct key *ukey; | 240 | struct key *ukey; |
243 | int key_err; | 241 | int key_err; |
244 | int err = 0; | 242 | int err = 0; |
245 | struct ceph_crypto_key *ckey; | 243 | struct ceph_crypto_key *ckey; |
246 | 244 | ||
247 | ukey = request_key(&key_type_ceph, name, NULL); | 245 | ukey = request_key(&key_type_ceph, name, NULL); |
248 | if (!ukey || IS_ERR(ukey)) { | 246 | if (!ukey || IS_ERR(ukey)) { |
249 | /* request_key errors don't map nicely to mount(2) | 247 | /* request_key errors don't map nicely to mount(2) |
250 | errors; don't even try, but still printk */ | 248 | errors; don't even try, but still printk */ |
251 | key_err = PTR_ERR(ukey); | 249 | key_err = PTR_ERR(ukey); |
252 | switch (key_err) { | 250 | switch (key_err) { |
253 | case -ENOKEY: | 251 | case -ENOKEY: |
254 | pr_warning("ceph: Mount failed due to key not found: %s\n", name); | 252 | pr_warning("ceph: Mount failed due to key not found: %s\n", name); |
255 | break; | 253 | break; |
256 | case -EKEYEXPIRED: | 254 | case -EKEYEXPIRED: |
257 | pr_warning("ceph: Mount failed due to expired key: %s\n", name); | 255 | pr_warning("ceph: Mount failed due to expired key: %s\n", name); |
258 | break; | 256 | break; |
259 | case -EKEYREVOKED: | 257 | case -EKEYREVOKED: |
260 | pr_warning("ceph: Mount failed due to revoked key: %s\n", name); | 258 | pr_warning("ceph: Mount failed due to revoked key: %s\n", name); |
261 | break; | 259 | break; |
262 | default: | 260 | default: |
263 | pr_warning("ceph: Mount failed due to unknown key error" | 261 | pr_warning("ceph: Mount failed due to unknown key error" |
264 | " %d: %s\n", key_err, name); | 262 | " %d: %s\n", key_err, name); |
265 | } | 263 | } |
266 | err = -EPERM; | 264 | err = -EPERM; |
267 | goto out; | 265 | goto out; |
268 | } | 266 | } |
269 | 267 | ||
270 | ckey = ukey->payload.data; | 268 | ckey = ukey->payload.data; |
271 | err = ceph_crypto_key_clone(dst, ckey); | 269 | err = ceph_crypto_key_clone(dst, ckey); |
272 | if (err) | 270 | if (err) |
273 | goto out_key; | 271 | goto out_key; |
274 | /* pass through, err is 0 */ | 272 | /* pass through, err is 0 */ |
275 | 273 | ||
276 | out_key: | 274 | out_key: |
277 | key_put(ukey); | 275 | key_put(ukey); |
278 | out: | 276 | out: |
279 | return err; | 277 | return err; |
280 | } | 278 | } |
281 | 279 | ||
282 | int ceph_parse_options(struct ceph_options **popt, char *options, | 280 | int ceph_parse_options(struct ceph_options **popt, char *options, |
283 | const char *dev_name, const char *dev_name_end, | 281 | const char *dev_name, const char *dev_name_end, |
284 | int (*parse_extra_token)(char *c, void *private), | 282 | int (*parse_extra_token)(char *c, void *private), |
285 | void *private) | 283 | void *private) |
286 | { | 284 | { |
287 | struct ceph_options *opt; | 285 | struct ceph_options *opt; |
288 | const char *c; | 286 | const char *c; |
289 | int err = -ENOMEM; | 287 | int err = -ENOMEM; |
290 | substring_t argstr[MAX_OPT_ARGS]; | 288 | substring_t argstr[MAX_OPT_ARGS]; |
291 | 289 | ||
292 | opt = kzalloc(sizeof(*opt), GFP_KERNEL); | 290 | opt = kzalloc(sizeof(*opt), GFP_KERNEL); |
293 | if (!opt) | 291 | if (!opt) |
294 | return err; | 292 | return err; |
295 | opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), | 293 | opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), |
296 | GFP_KERNEL); | 294 | GFP_KERNEL); |
297 | if (!opt->mon_addr) | 295 | if (!opt->mon_addr) |
298 | goto out; | 296 | goto out; |
299 | 297 | ||
300 | dout("parse_options %p options '%s' dev_name '%s'\n", opt, options, | 298 | dout("parse_options %p options '%s' dev_name '%s'\n", opt, options, |
301 | dev_name); | 299 | dev_name); |
302 | 300 | ||
303 | /* start with defaults */ | 301 | /* start with defaults */ |
304 | opt->flags = CEPH_OPT_DEFAULT; | 302 | opt->flags = CEPH_OPT_DEFAULT; |
305 | opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; | 303 | opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; |
306 | opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; | 304 | opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; |
307 | opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ | 305 | opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ |
308 | opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ | 306 | opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ |
309 | 307 | ||
310 | /* get mon ip(s) */ | 308 | /* get mon ip(s) */ |
311 | /* ip1[:port1][,ip2[:port2]...] */ | 309 | /* ip1[:port1][,ip2[:port2]...] */ |
312 | err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr, | 310 | err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr, |
313 | CEPH_MAX_MON, &opt->num_mon); | 311 | CEPH_MAX_MON, &opt->num_mon); |
314 | if (err < 0) | 312 | if (err < 0) |
315 | goto out; | 313 | goto out; |
316 | 314 | ||
317 | /* parse mount options */ | 315 | /* parse mount options */ |
318 | while ((c = strsep(&options, ",")) != NULL) { | 316 | while ((c = strsep(&options, ",")) != NULL) { |
319 | int token, intval, ret; | 317 | int token, intval, ret; |
320 | if (!*c) | 318 | if (!*c) |
321 | continue; | 319 | continue; |
322 | err = -EINVAL; | 320 | err = -EINVAL; |
323 | token = match_token((char *)c, opt_tokens, argstr); | 321 | token = match_token((char *)c, opt_tokens, argstr); |
324 | if (token < 0 && parse_extra_token) { | 322 | if (token < 0 && parse_extra_token) { |
325 | /* extra? */ | 323 | /* extra? */ |
326 | err = parse_extra_token((char *)c, private); | 324 | err = parse_extra_token((char *)c, private); |
327 | if (err < 0) { | 325 | if (err < 0) { |
328 | pr_err("bad option at '%s'\n", c); | 326 | pr_err("bad option at '%s'\n", c); |
329 | goto out; | 327 | goto out; |
330 | } | 328 | } |
331 | continue; | 329 | continue; |
332 | } | 330 | } |
333 | if (token < Opt_last_int) { | 331 | if (token < Opt_last_int) { |
334 | ret = match_int(&argstr[0], &intval); | 332 | ret = match_int(&argstr[0], &intval); |
335 | if (ret < 0) { | 333 | if (ret < 0) { |
336 | pr_err("bad mount option arg (not int) " | 334 | pr_err("bad mount option arg (not int) " |
337 | "at '%s'\n", c); | 335 | "at '%s'\n", c); |
338 | continue; | 336 | continue; |
339 | } | 337 | } |
340 | dout("got int token %d val %d\n", token, intval); | 338 | dout("got int token %d val %d\n", token, intval); |
341 | } else if (token > Opt_last_int && token < Opt_last_string) { | 339 | } else if (token > Opt_last_int && token < Opt_last_string) { |
342 | dout("got string token %d val %s\n", token, | 340 | dout("got string token %d val %s\n", token, |
343 | argstr[0].from); | 341 | argstr[0].from); |
344 | } else { | 342 | } else { |
345 | dout("got token %d\n", token); | 343 | dout("got token %d\n", token); |
346 | } | 344 | } |
347 | switch (token) { | 345 | switch (token) { |
348 | case Opt_ip: | 346 | case Opt_ip: |
349 | err = ceph_parse_ips(argstr[0].from, | 347 | err = ceph_parse_ips(argstr[0].from, |
350 | argstr[0].to, | 348 | argstr[0].to, |
351 | &opt->my_addr, | 349 | &opt->my_addr, |
352 | 1, NULL); | 350 | 1, NULL); |
353 | if (err < 0) | 351 | if (err < 0) |
354 | goto out; | 352 | goto out; |
355 | opt->flags |= CEPH_OPT_MYIP; | 353 | opt->flags |= CEPH_OPT_MYIP; |
356 | break; | 354 | break; |
357 | 355 | ||
358 | case Opt_fsid: | 356 | case Opt_fsid: |
359 | err = parse_fsid(argstr[0].from, &opt->fsid); | 357 | err = parse_fsid(argstr[0].from, &opt->fsid); |
360 | if (err == 0) | 358 | if (err == 0) |
361 | opt->flags |= CEPH_OPT_FSID; | 359 | opt->flags |= CEPH_OPT_FSID; |
362 | break; | 360 | break; |
363 | case Opt_name: | 361 | case Opt_name: |
364 | opt->name = kstrndup(argstr[0].from, | 362 | opt->name = kstrndup(argstr[0].from, |
365 | argstr[0].to-argstr[0].from, | 363 | argstr[0].to-argstr[0].from, |
366 | GFP_KERNEL); | 364 | GFP_KERNEL); |
367 | break; | 365 | break; |
368 | case Opt_secret: | 366 | case Opt_secret: |
369 | opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); | 367 | opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); |
370 | if (!opt->key) { | 368 | if (!opt->key) { |
371 | err = -ENOMEM; | 369 | err = -ENOMEM; |
372 | goto out; | 370 | goto out; |
373 | } | 371 | } |
374 | err = ceph_crypto_key_unarmor(opt->key, argstr[0].from); | 372 | err = ceph_crypto_key_unarmor(opt->key, argstr[0].from); |
375 | if (err < 0) | 373 | if (err < 0) |
376 | goto out; | 374 | goto out; |
377 | break; | 375 | break; |
378 | case Opt_key: | 376 | case Opt_key: |
379 | opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); | 377 | opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); |
380 | if (!opt->key) { | 378 | if (!opt->key) { |
381 | err = -ENOMEM; | 379 | err = -ENOMEM; |
382 | goto out; | 380 | goto out; |
383 | } | 381 | } |
384 | err = get_secret(opt->key, argstr[0].from); | 382 | err = get_secret(opt->key, argstr[0].from); |
385 | if (err < 0) | 383 | if (err < 0) |
386 | goto out; | 384 | goto out; |
387 | break; | 385 | break; |
388 | 386 | ||
389 | /* misc */ | 387 | /* misc */ |
390 | case Opt_osdtimeout: | 388 | case Opt_osdtimeout: |
391 | opt->osd_timeout = intval; | 389 | opt->osd_timeout = intval; |
392 | break; | 390 | break; |
393 | case Opt_osdkeepalivetimeout: | 391 | case Opt_osdkeepalivetimeout: |
394 | opt->osd_keepalive_timeout = intval; | 392 | opt->osd_keepalive_timeout = intval; |
395 | break; | 393 | break; |
396 | case Opt_osd_idle_ttl: | 394 | case Opt_osd_idle_ttl: |
397 | opt->osd_idle_ttl = intval; | 395 | opt->osd_idle_ttl = intval; |
398 | break; | 396 | break; |
399 | case Opt_mount_timeout: | 397 | case Opt_mount_timeout: |
400 | opt->mount_timeout = intval; | 398 | opt->mount_timeout = intval; |
401 | break; | 399 | break; |
402 | 400 | ||
403 | case Opt_noshare: | 401 | case Opt_noshare: |
404 | opt->flags |= CEPH_OPT_NOSHARE; | 402 | opt->flags |= CEPH_OPT_NOSHARE; |
405 | break; | 403 | break; |
406 | 404 | ||
407 | case Opt_nocrc: | 405 | case Opt_nocrc: |
408 | opt->flags |= CEPH_OPT_NOCRC; | 406 | opt->flags |= CEPH_OPT_NOCRC; |
409 | break; | 407 | break; |
410 | 408 | ||
411 | default: | 409 | default: |
412 | BUG_ON(token); | 410 | BUG_ON(token); |
413 | } | 411 | } |
414 | } | 412 | } |
415 | 413 | ||
416 | /* success */ | 414 | /* success */ |
417 | *popt = opt; | 415 | *popt = opt; |
418 | return 0; | 416 | return 0; |
419 | 417 | ||
420 | out: | 418 | out: |
421 | ceph_destroy_options(opt); | 419 | ceph_destroy_options(opt); |
422 | return err; | 420 | return err; |
423 | } | 421 | } |
424 | EXPORT_SYMBOL(ceph_parse_options); | 422 | EXPORT_SYMBOL(ceph_parse_options); |
425 | 423 | ||
426 | u64 ceph_client_id(struct ceph_client *client) | 424 | u64 ceph_client_id(struct ceph_client *client) |
427 | { | 425 | { |
428 | return client->monc.auth->global_id; | 426 | return client->monc.auth->global_id; |
429 | } | 427 | } |
430 | EXPORT_SYMBOL(ceph_client_id); | 428 | EXPORT_SYMBOL(ceph_client_id); |
431 | 429 | ||
432 | /* | 430 | /* |
433 | * create a fresh client instance | 431 | * create a fresh client instance |
434 | */ | 432 | */ |
435 | struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, | 433 | struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, |
436 | unsigned supported_features, | 434 | unsigned supported_features, |
437 | unsigned required_features) | 435 | unsigned required_features) |
438 | { | 436 | { |
439 | struct ceph_client *client; | 437 | struct ceph_client *client; |
440 | struct ceph_entity_addr *myaddr = NULL; | 438 | struct ceph_entity_addr *myaddr = NULL; |
441 | int err = -ENOMEM; | 439 | int err = -ENOMEM; |
442 | 440 | ||
443 | client = kzalloc(sizeof(*client), GFP_KERNEL); | 441 | client = kzalloc(sizeof(*client), GFP_KERNEL); |
444 | if (client == NULL) | 442 | if (client == NULL) |
445 | return ERR_PTR(-ENOMEM); | 443 | return ERR_PTR(-ENOMEM); |
446 | 444 | ||
447 | client->private = private; | 445 | client->private = private; |
448 | client->options = opt; | 446 | client->options = opt; |
449 | 447 | ||
450 | mutex_init(&client->mount_mutex); | 448 | mutex_init(&client->mount_mutex); |
451 | init_waitqueue_head(&client->auth_wq); | 449 | init_waitqueue_head(&client->auth_wq); |
452 | client->auth_err = 0; | 450 | client->auth_err = 0; |
453 | 451 | ||
454 | client->extra_mon_dispatch = NULL; | 452 | client->extra_mon_dispatch = NULL; |
455 | client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT | | 453 | client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT | |
456 | supported_features; | 454 | supported_features; |
457 | client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT | | 455 | client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT | |
458 | required_features; | 456 | required_features; |
459 | 457 | ||
460 | /* msgr */ | 458 | /* msgr */ |
461 | if (ceph_test_opt(client, MYIP)) | 459 | if (ceph_test_opt(client, MYIP)) |
462 | myaddr = &client->options->my_addr; | 460 | myaddr = &client->options->my_addr; |
463 | client->msgr = ceph_messenger_create(myaddr, | 461 | client->msgr = ceph_messenger_create(myaddr, |
464 | client->supported_features, | 462 | client->supported_features, |
465 | client->required_features); | 463 | client->required_features); |
466 | if (IS_ERR(client->msgr)) { | 464 | if (IS_ERR(client->msgr)) { |
467 | err = PTR_ERR(client->msgr); | 465 | err = PTR_ERR(client->msgr); |
468 | goto fail; | 466 | goto fail; |
469 | } | 467 | } |
470 | client->msgr->nocrc = ceph_test_opt(client, NOCRC); | 468 | client->msgr->nocrc = ceph_test_opt(client, NOCRC); |
471 | 469 | ||
472 | /* subsystems */ | 470 | /* subsystems */ |
473 | err = ceph_monc_init(&client->monc, client); | 471 | err = ceph_monc_init(&client->monc, client); |
474 | if (err < 0) | 472 | if (err < 0) |
475 | goto fail_msgr; | 473 | goto fail_msgr; |
476 | err = ceph_osdc_init(&client->osdc, client); | 474 | err = ceph_osdc_init(&client->osdc, client); |
477 | if (err < 0) | 475 | if (err < 0) |
478 | goto fail_monc; | 476 | goto fail_monc; |
479 | 477 | ||
480 | return client; | 478 | return client; |
481 | 479 | ||
482 | fail_monc: | 480 | fail_monc: |
483 | ceph_monc_stop(&client->monc); | 481 | ceph_monc_stop(&client->monc); |
484 | fail_msgr: | 482 | fail_msgr: |
485 | ceph_messenger_destroy(client->msgr); | 483 | ceph_messenger_destroy(client->msgr); |
486 | fail: | 484 | fail: |
487 | kfree(client); | 485 | kfree(client); |
488 | return ERR_PTR(err); | 486 | return ERR_PTR(err); |
489 | } | 487 | } |
490 | EXPORT_SYMBOL(ceph_create_client); | 488 | EXPORT_SYMBOL(ceph_create_client); |
491 | 489 | ||
492 | void ceph_destroy_client(struct ceph_client *client) | 490 | void ceph_destroy_client(struct ceph_client *client) |
493 | { | 491 | { |
494 | dout("destroy_client %p\n", client); | 492 | dout("destroy_client %p\n", client); |
495 | 493 | ||
496 | /* unmount */ | 494 | /* unmount */ |
497 | ceph_osdc_stop(&client->osdc); | 495 | ceph_osdc_stop(&client->osdc); |
498 | 496 | ||
499 | /* | 497 | /* |
500 | * make sure osd connections close out before destroying the | 498 | * make sure osd connections close out before destroying the |
501 | * auth module, which is needed to free those connections' | 499 | * auth module, which is needed to free those connections' |
502 | * ceph_authorizers. | 500 | * ceph_authorizers. |
503 | */ | 501 | */ |
504 | ceph_msgr_flush(); | 502 | ceph_msgr_flush(); |
505 | 503 | ||
506 | ceph_monc_stop(&client->monc); | 504 | ceph_monc_stop(&client->monc); |
507 | 505 | ||
508 | ceph_debugfs_client_cleanup(client); | 506 | ceph_debugfs_client_cleanup(client); |
509 | 507 | ||
510 | ceph_messenger_destroy(client->msgr); | 508 | ceph_messenger_destroy(client->msgr); |
511 | 509 | ||
512 | ceph_destroy_options(client->options); | 510 | ceph_destroy_options(client->options); |
513 | 511 | ||
514 | kfree(client); | 512 | kfree(client); |
515 | dout("destroy_client %p done\n", client); | 513 | dout("destroy_client %p done\n", client); |
516 | } | 514 | } |
517 | EXPORT_SYMBOL(ceph_destroy_client); | 515 | EXPORT_SYMBOL(ceph_destroy_client); |
518 | 516 | ||
519 | /* | 517 | /* |
520 | * true if we have the mon map (and have thus joined the cluster) | 518 | * true if we have the mon map (and have thus joined the cluster) |
521 | */ | 519 | */ |
522 | static int have_mon_and_osd_map(struct ceph_client *client) | 520 | static int have_mon_and_osd_map(struct ceph_client *client) |
523 | { | 521 | { |
524 | return client->monc.monmap && client->monc.monmap->epoch && | 522 | return client->monc.monmap && client->monc.monmap->epoch && |
525 | client->osdc.osdmap && client->osdc.osdmap->epoch; | 523 | client->osdc.osdmap && client->osdc.osdmap->epoch; |
526 | } | 524 | } |
527 | 525 | ||
528 | /* | 526 | /* |
529 | * mount: join the ceph cluster, and open root directory. | 527 | * mount: join the ceph cluster, and open root directory. |
530 | */ | 528 | */ |
531 | int __ceph_open_session(struct ceph_client *client, unsigned long started) | 529 | int __ceph_open_session(struct ceph_client *client, unsigned long started) |
532 | { | 530 | { |
533 | int err; | 531 | int err; |
534 | unsigned long timeout = client->options->mount_timeout * HZ; | 532 | unsigned long timeout = client->options->mount_timeout * HZ; |
535 | 533 | ||
536 | /* open session, and wait for mon and osd maps */ | 534 | /* open session, and wait for mon and osd maps */ |
537 | err = ceph_monc_open_session(&client->monc); | 535 | err = ceph_monc_open_session(&client->monc); |
538 | if (err < 0) | 536 | if (err < 0) |
539 | return err; | 537 | return err; |
540 | 538 | ||
541 | while (!have_mon_and_osd_map(client)) { | 539 | while (!have_mon_and_osd_map(client)) { |
542 | err = -EIO; | 540 | err = -EIO; |
543 | if (timeout && time_after_eq(jiffies, started + timeout)) | 541 | if (timeout && time_after_eq(jiffies, started + timeout)) |
544 | return err; | 542 | return err; |
545 | 543 | ||
546 | /* wait */ | 544 | /* wait */ |
547 | dout("mount waiting for mon_map\n"); | 545 | dout("mount waiting for mon_map\n"); |
548 | err = wait_event_interruptible_timeout(client->auth_wq, | 546 | err = wait_event_interruptible_timeout(client->auth_wq, |
549 | have_mon_and_osd_map(client) || (client->auth_err < 0), | 547 | have_mon_and_osd_map(client) || (client->auth_err < 0), |
550 | timeout); | 548 | timeout); |
551 | if (err == -EINTR || err == -ERESTARTSYS) | 549 | if (err == -EINTR || err == -ERESTARTSYS) |
552 | return err; | 550 | return err; |
553 | if (client->auth_err < 0) | 551 | if (client->auth_err < 0) |
554 | return client->auth_err; | 552 | return client->auth_err; |
555 | } | 553 | } |
556 | 554 | ||
557 | return 0; | 555 | return 0; |
558 | } | 556 | } |
559 | EXPORT_SYMBOL(__ceph_open_session); | 557 | EXPORT_SYMBOL(__ceph_open_session); |
560 | 558 | ||
561 | 559 | ||
562 | int ceph_open_session(struct ceph_client *client) | 560 | int ceph_open_session(struct ceph_client *client) |
563 | { | 561 | { |
564 | int ret; | 562 | int ret; |
565 | unsigned long started = jiffies; /* note the start time */ | 563 | unsigned long started = jiffies; /* note the start time */ |
566 | 564 | ||
567 | dout("open_session start\n"); | 565 | dout("open_session start\n"); |
568 | mutex_lock(&client->mount_mutex); | 566 | mutex_lock(&client->mount_mutex); |
569 | 567 | ||
570 | ret = __ceph_open_session(client, started); | 568 | ret = __ceph_open_session(client, started); |
571 | 569 | ||
572 | mutex_unlock(&client->mount_mutex); | 570 | mutex_unlock(&client->mount_mutex); |
573 | return ret; | 571 | return ret; |
574 | } | 572 | } |
575 | EXPORT_SYMBOL(ceph_open_session); | 573 | EXPORT_SYMBOL(ceph_open_session); |
576 | 574 | ||
577 | 575 | ||
578 | static int __init init_ceph_lib(void) | 576 | static int __init init_ceph_lib(void) |
579 | { | 577 | { |
580 | int ret = 0; | 578 | int ret = 0; |
581 | 579 | ||
582 | ret = ceph_debugfs_init(); | 580 | ret = ceph_debugfs_init(); |
583 | if (ret < 0) | 581 | if (ret < 0) |
584 | goto out; | 582 | goto out; |
585 | 583 | ||
586 | ret = ceph_crypto_init(); | 584 | ret = ceph_crypto_init(); |
587 | if (ret < 0) | 585 | if (ret < 0) |
588 | goto out_debugfs; | 586 | goto out_debugfs; |
589 | 587 | ||
590 | ret = ceph_msgr_init(); | 588 | ret = ceph_msgr_init(); |
591 | if (ret < 0) | 589 | if (ret < 0) |
592 | goto out_crypto; | 590 | goto out_crypto; |
593 | 591 | ||
594 | pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n", | 592 | pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n", |
595 | CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL, | 593 | CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL, |
596 | CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT, | 594 | CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT, |
597 | CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT); | 595 | CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT); |
598 | 596 | ||
599 | return 0; | 597 | return 0; |
600 | 598 | ||
601 | out_crypto: | 599 | out_crypto: |
602 | ceph_crypto_shutdown(); | 600 | ceph_crypto_shutdown(); |
603 | out_debugfs: | 601 | out_debugfs: |
604 | ceph_debugfs_cleanup(); | 602 | ceph_debugfs_cleanup(); |
605 | out: | 603 | out: |
606 | return ret; | 604 | return ret; |
607 | } | 605 | } |
608 | 606 | ||
609 | static void __exit exit_ceph_lib(void) | 607 | static void __exit exit_ceph_lib(void) |
610 | { | 608 | { |
611 | dout("exit_ceph_lib\n"); | 609 | dout("exit_ceph_lib\n"); |
612 | ceph_msgr_exit(); | 610 | ceph_msgr_exit(); |
613 | ceph_crypto_shutdown(); | 611 | ceph_crypto_shutdown(); |
614 | ceph_debugfs_cleanup(); | 612 | ceph_debugfs_cleanup(); |
615 | } | 613 | } |
616 | 614 | ||
617 | module_init(init_ceph_lib); | 615 | module_init(init_ceph_lib); |
618 | module_exit(exit_ceph_lib); | 616 | module_exit(exit_ceph_lib); |
619 | 617 | ||
620 | MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); | 618 | MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); |
621 | MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); | 619 | MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); |
622 | MODULE_AUTHOR("Patience Warnick <patience@newdream.net>"); | 620 | MODULE_AUTHOR("Patience Warnick <patience@newdream.net>"); |
623 | MODULE_DESCRIPTION("Ceph filesystem for Linux"); | 621 | MODULE_DESCRIPTION("Ceph filesystem for Linux"); |
624 | MODULE_LICENSE("GPL"); | 622 | MODULE_LICENSE("GPL"); |
625 | 623 |
net/ceph/mon_client.c
1 | #include <linux/ceph/ceph_debug.h> | 1 | #include <linux/ceph/ceph_debug.h> |
2 | 2 | ||
3 | #include <linux/module.h> | 3 | #include <linux/module.h> |
4 | #include <linux/types.h> | 4 | #include <linux/types.h> |
5 | #include <linux/slab.h> | 5 | #include <linux/slab.h> |
6 | #include <linux/random.h> | 6 | #include <linux/random.h> |
7 | #include <linux/sched.h> | 7 | #include <linux/sched.h> |
8 | 8 | ||
9 | #include <linux/ceph/mon_client.h> | 9 | #include <linux/ceph/mon_client.h> |
10 | #include <linux/ceph/libceph.h> | 10 | #include <linux/ceph/libceph.h> |
11 | #include <linux/ceph/debugfs.h> | ||
11 | #include <linux/ceph/decode.h> | 12 | #include <linux/ceph/decode.h> |
12 | |||
13 | #include <linux/ceph/auth.h> | 13 | #include <linux/ceph/auth.h> |
14 | 14 | ||
15 | /* | 15 | /* |
16 | * Interact with Ceph monitor cluster. Handle requests for new map | 16 | * Interact with Ceph monitor cluster. Handle requests for new map |
17 | * versions, and periodically resend as needed. Also implement | 17 | * versions, and periodically resend as needed. Also implement |
18 | * statfs() and umount(). | 18 | * statfs() and umount(). |
19 | * | 19 | * |
20 | * A small cluster of Ceph "monitors" are responsible for managing critical | 20 | * A small cluster of Ceph "monitors" are responsible for managing critical |
21 | * cluster configuration and state information. An odd number (e.g., 3, 5) | 21 | * cluster configuration and state information. An odd number (e.g., 3, 5) |
22 | * of cmon daemons use a modified version of the Paxos part-time parliament | 22 | * of cmon daemons use a modified version of the Paxos part-time parliament |
23 | * algorithm to manage the MDS map (mds cluster membership), OSD map, and | 23 | * algorithm to manage the MDS map (mds cluster membership), OSD map, and |
24 | * list of clients who have mounted the file system. | 24 | * list of clients who have mounted the file system. |
25 | * | 25 | * |
26 | * We maintain an open, active session with a monitor at all times in order to | 26 | * We maintain an open, active session with a monitor at all times in order to |
27 | * receive timely MDSMap updates. We periodically send a keepalive byte on the | 27 | * receive timely MDSMap updates. We periodically send a keepalive byte on the |
28 | * TCP socket to ensure we detect a failure. If the connection does break, we | 28 | * TCP socket to ensure we detect a failure. If the connection does break, we |
29 | * randomly hunt for a new monitor. Once the connection is reestablished, we | 29 | * randomly hunt for a new monitor. Once the connection is reestablished, we |
30 | * resend any outstanding requests. | 30 | * resend any outstanding requests. |
31 | */ | 31 | */ |
32 | 32 | ||
33 | static const struct ceph_connection_operations mon_con_ops; | 33 | static const struct ceph_connection_operations mon_con_ops; |
34 | 34 | ||
35 | static int __validate_auth(struct ceph_mon_client *monc); | 35 | static int __validate_auth(struct ceph_mon_client *monc); |
36 | 36 | ||
37 | /* | 37 | /* |
38 | * Decode a monmap blob (e.g., during mount). | 38 | * Decode a monmap blob (e.g., during mount). |
39 | */ | 39 | */ |
40 | struct ceph_monmap *ceph_monmap_decode(void *p, void *end) | 40 | struct ceph_monmap *ceph_monmap_decode(void *p, void *end) |
41 | { | 41 | { |
42 | struct ceph_monmap *m = NULL; | 42 | struct ceph_monmap *m = NULL; |
43 | int i, err = -EINVAL; | 43 | int i, err = -EINVAL; |
44 | struct ceph_fsid fsid; | 44 | struct ceph_fsid fsid; |
45 | u32 epoch, num_mon; | 45 | u32 epoch, num_mon; |
46 | u16 version; | 46 | u16 version; |
47 | u32 len; | 47 | u32 len; |
48 | 48 | ||
49 | ceph_decode_32_safe(&p, end, len, bad); | 49 | ceph_decode_32_safe(&p, end, len, bad); |
50 | ceph_decode_need(&p, end, len, bad); | 50 | ceph_decode_need(&p, end, len, bad); |
51 | 51 | ||
52 | dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p)); | 52 | dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p)); |
53 | 53 | ||
54 | ceph_decode_16_safe(&p, end, version, bad); | 54 | ceph_decode_16_safe(&p, end, version, bad); |
55 | 55 | ||
56 | ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad); | 56 | ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad); |
57 | ceph_decode_copy(&p, &fsid, sizeof(fsid)); | 57 | ceph_decode_copy(&p, &fsid, sizeof(fsid)); |
58 | epoch = ceph_decode_32(&p); | 58 | epoch = ceph_decode_32(&p); |
59 | 59 | ||
60 | num_mon = ceph_decode_32(&p); | 60 | num_mon = ceph_decode_32(&p); |
61 | ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad); | 61 | ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad); |
62 | 62 | ||
63 | if (num_mon >= CEPH_MAX_MON) | 63 | if (num_mon >= CEPH_MAX_MON) |
64 | goto bad; | 64 | goto bad; |
65 | m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS); | 65 | m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS); |
66 | if (m == NULL) | 66 | if (m == NULL) |
67 | return ERR_PTR(-ENOMEM); | 67 | return ERR_PTR(-ENOMEM); |
68 | m->fsid = fsid; | 68 | m->fsid = fsid; |
69 | m->epoch = epoch; | 69 | m->epoch = epoch; |
70 | m->num_mon = num_mon; | 70 | m->num_mon = num_mon; |
71 | ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0])); | 71 | ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0])); |
72 | for (i = 0; i < num_mon; i++) | 72 | for (i = 0; i < num_mon; i++) |
73 | ceph_decode_addr(&m->mon_inst[i].addr); | 73 | ceph_decode_addr(&m->mon_inst[i].addr); |
74 | 74 | ||
75 | dout("monmap_decode epoch %d, num_mon %d\n", m->epoch, | 75 | dout("monmap_decode epoch %d, num_mon %d\n", m->epoch, |
76 | m->num_mon); | 76 | m->num_mon); |
77 | for (i = 0; i < m->num_mon; i++) | 77 | for (i = 0; i < m->num_mon; i++) |
78 | dout("monmap_decode mon%d is %s\n", i, | 78 | dout("monmap_decode mon%d is %s\n", i, |
79 | ceph_pr_addr(&m->mon_inst[i].addr.in_addr)); | 79 | ceph_pr_addr(&m->mon_inst[i].addr.in_addr)); |
80 | return m; | 80 | return m; |
81 | 81 | ||
82 | bad: | 82 | bad: |
83 | dout("monmap_decode failed with %d\n", err); | 83 | dout("monmap_decode failed with %d\n", err); |
84 | kfree(m); | 84 | kfree(m); |
85 | return ERR_PTR(err); | 85 | return ERR_PTR(err); |
86 | } | 86 | } |
87 | 87 | ||
88 | /* | 88 | /* |
89 | * return true if *addr is included in the monmap. | 89 | * return true if *addr is included in the monmap. |
90 | */ | 90 | */ |
91 | int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr) | 91 | int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr) |
92 | { | 92 | { |
93 | int i; | 93 | int i; |
94 | 94 | ||
95 | for (i = 0; i < m->num_mon; i++) | 95 | for (i = 0; i < m->num_mon; i++) |
96 | if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0) | 96 | if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0) |
97 | return 1; | 97 | return 1; |
98 | return 0; | 98 | return 0; |
99 | } | 99 | } |
100 | 100 | ||
101 | /* | 101 | /* |
102 | * Send an auth request. | 102 | * Send an auth request. |
103 | */ | 103 | */ |
static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
{
	/*
	 * Caller has already filled monc->m_auth->front with 'len' bytes
	 * of payload; we just fix up the lengths and (re)send the message.
	 */
	monc->pending_auth = 1;
	monc->m_auth->front.iov_len = len;
	monc->m_auth->hdr.front_len = cpu_to_le32(len);
	/* revoke any previously queued copy of this message before resending */
	ceph_con_revoke(monc->con, monc->m_auth);
	ceph_msg_get(monc->m_auth); /* keep our ref */
	ceph_con_send(monc->con, monc->m_auth);
}
113 | 113 | ||
114 | /* | 114 | /* |
115 | * Close monitor session, if any. | 115 | * Close monitor session, if any. |
116 | */ | 116 | */ |
static void __close_session(struct ceph_mon_client *monc)
{
	dout("__close_session closing mon%d\n", monc->cur_mon);
	/* pull back any in-flight auth request, then drop the connection */
	ceph_con_revoke(monc->con, monc->m_auth);
	ceph_con_close(monc->con);
	monc->cur_mon = -1;	/* no monitor selected; __open_session will pick */
	monc->pending_auth = 0;
	/* forget any partially negotiated authentication state */
	ceph_auth_reset(monc->auth);
}
126 | 126 | ||
127 | /* | 127 | /* |
128 | * Open a session with a (new) monitor. | 128 | * Open a session with a (new) monitor. |
129 | */ | 129 | */ |
130 | static int __open_session(struct ceph_mon_client *monc) | 130 | static int __open_session(struct ceph_mon_client *monc) |
131 | { | 131 | { |
132 | char r; | 132 | char r; |
133 | int ret; | 133 | int ret; |
134 | 134 | ||
135 | if (monc->cur_mon < 0) { | 135 | if (monc->cur_mon < 0) { |
136 | get_random_bytes(&r, 1); | 136 | get_random_bytes(&r, 1); |
137 | monc->cur_mon = r % monc->monmap->num_mon; | 137 | monc->cur_mon = r % monc->monmap->num_mon; |
138 | dout("open_session num=%d r=%d -> mon%d\n", | 138 | dout("open_session num=%d r=%d -> mon%d\n", |
139 | monc->monmap->num_mon, r, monc->cur_mon); | 139 | monc->monmap->num_mon, r, monc->cur_mon); |
140 | monc->sub_sent = 0; | 140 | monc->sub_sent = 0; |
141 | monc->sub_renew_after = jiffies; /* i.e., expired */ | 141 | monc->sub_renew_after = jiffies; /* i.e., expired */ |
142 | monc->want_next_osdmap = !!monc->want_next_osdmap; | 142 | monc->want_next_osdmap = !!monc->want_next_osdmap; |
143 | 143 | ||
144 | dout("open_session mon%d opening\n", monc->cur_mon); | 144 | dout("open_session mon%d opening\n", monc->cur_mon); |
145 | monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON; | 145 | monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON; |
146 | monc->con->peer_name.num = cpu_to_le64(monc->cur_mon); | 146 | monc->con->peer_name.num = cpu_to_le64(monc->cur_mon); |
147 | ceph_con_open(monc->con, | 147 | ceph_con_open(monc->con, |
148 | &monc->monmap->mon_inst[monc->cur_mon].addr); | 148 | &monc->monmap->mon_inst[monc->cur_mon].addr); |
149 | 149 | ||
150 | /* initiatiate authentication handshake */ | 150 | /* initiatiate authentication handshake */ |
151 | ret = ceph_auth_build_hello(monc->auth, | 151 | ret = ceph_auth_build_hello(monc->auth, |
152 | monc->m_auth->front.iov_base, | 152 | monc->m_auth->front.iov_base, |
153 | monc->m_auth->front_max); | 153 | monc->m_auth->front_max); |
154 | __send_prepared_auth_request(monc, ret); | 154 | __send_prepared_auth_request(monc, ret); |
155 | } else { | 155 | } else { |
156 | dout("open_session mon%d already open\n", monc->cur_mon); | 156 | dout("open_session mon%d already open\n", monc->cur_mon); |
157 | } | 157 | } |
158 | return 0; | 158 | return 0; |
159 | } | 159 | } |
160 | 160 | ||
/* true once our subscription renewal deadline has passed */
static bool __sub_expired(struct ceph_mon_client *monc)
{
	return time_after_eq(jiffies, monc->sub_renew_after);
}
165 | 165 | ||
166 | /* | 166 | /* |
167 | * Reschedule delayed work timer. | 167 | * Reschedule delayed work timer. |
168 | */ | 168 | */ |
169 | static void __schedule_delayed(struct ceph_mon_client *monc) | 169 | static void __schedule_delayed(struct ceph_mon_client *monc) |
170 | { | 170 | { |
171 | unsigned delay; | 171 | unsigned delay; |
172 | 172 | ||
173 | if (monc->cur_mon < 0 || __sub_expired(monc)) | 173 | if (monc->cur_mon < 0 || __sub_expired(monc)) |
174 | delay = 10 * HZ; | 174 | delay = 10 * HZ; |
175 | else | 175 | else |
176 | delay = 20 * HZ; | 176 | delay = 20 * HZ; |
177 | dout("__schedule_delayed after %u\n", delay); | 177 | dout("__schedule_delayed after %u\n", delay); |
178 | schedule_delayed_work(&monc->delayed_work, delay); | 178 | schedule_delayed_work(&monc->delayed_work, delay); |
179 | } | 179 | } |
180 | 180 | ||
181 | /* | 181 | /* |
182 | * Send subscribe request for mdsmap and/or osdmap. | 182 | * Send subscribe request for mdsmap and/or osdmap. |
183 | */ | 183 | */ |
static void __send_subscribe(struct ceph_mon_client *monc)
{
	/*
	 * (Re)send our map subscriptions if the old subscription has
	 * expired with nothing in flight, or if a fresh osdmap was just
	 * requested (want_next_osdmap == 1 means "wanted, not yet asked").
	 */
	dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
	     (unsigned)monc->sub_sent, __sub_expired(monc),
	     monc->want_next_osdmap);
	if ((__sub_expired(monc) && !monc->sub_sent) ||
	    monc->want_next_osdmap == 1) {
		struct ceph_msg *msg = monc->m_subscribe;
		struct ceph_mon_subscribe_item *i;
		void *p, *end;
		int num;

		p = msg->front.iov_base;
		end = p + msg->front_max;

		/* count of subscribe items: monmap always, plus optionals */
		num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
		ceph_encode_32(&p, num);

		if (monc->want_next_osdmap) {
			dout("__send_subscribe to 'osdmap' %u\n",
			     (unsigned)monc->have_osdmap);
			ceph_encode_string(&p, end, "osdmap", 6);
			i = p;
			i->have = cpu_to_le64(monc->have_osdmap);
			i->onetime = 1;	/* one-shot: just the next map */
			p += sizeof(*i);
			monc->want_next_osdmap = 2; /* requested */
		}
		if (monc->want_mdsmap) {
			dout("__send_subscribe to 'mdsmap' %u+\n",
			     (unsigned)monc->have_mdsmap);
			ceph_encode_string(&p, end, "mdsmap", 6);
			i = p;
			i->have = cpu_to_le64(monc->have_mdsmap);
			i->onetime = 0;	/* persistent subscription */
			p += sizeof(*i);
		}
		/* always (re)subscribe to the monmap itself */
		ceph_encode_string(&p, end, "monmap", 6);
		i = p;
		i->have = 0;
		i->onetime = 0;
		p += sizeof(*i);

		msg->front.iov_len = p - msg->front.iov_base;
		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
		/* revoke a previously queued copy before resending */
		ceph_con_revoke(monc->con, msg);
		ceph_con_send(monc->con, ceph_msg_get(msg));

		/* record send time; OR with 1 so a jiffies of 0 still
		 * reads as "sent" (0 means "nothing in flight") */
		monc->sub_sent = jiffies | 1; /* never 0 */
	}
}
235 | 235 | ||
static void handle_subscribe_ack(struct ceph_mon_client *monc,
				 struct ceph_msg *msg)
{
	unsigned seconds;
	struct ceph_mon_subscribe_ack *h = msg->front.iov_base;

	/* validate payload length before touching the struct */
	if (msg->front.iov_len < sizeof(*h))
		goto bad;
	seconds = le32_to_cpu(h->duration);

	mutex_lock(&monc->mutex);
	if (monc->hunting) {
		/* first ack from this monitor: the session is now up */
		pr_info("mon%d %s session established\n",
			monc->cur_mon,
			ceph_pr_addr(&monc->con->peer_addr.in_addr));
		monc->hunting = false;
	}
	dout("handle_subscribe_ack after %d seconds\n", seconds);
	/* schedule renewal at half the granted duration, to stay subscribed */
	monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
	monc->sub_sent = 0;	/* nothing in flight anymore */
	mutex_unlock(&monc->mutex);
	return;
bad:
	pr_err("got corrupt subscribe-ack msg\n");
	ceph_msg_dump(msg);
}
262 | 262 | ||
263 | /* | 263 | /* |
264 | * Keep track of which maps we have | 264 | * Keep track of which maps we have |
265 | */ | 265 | */ |
/* record the mdsmap epoch we now have; always succeeds */
int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
{
	mutex_lock(&monc->mutex);
	monc->have_mdsmap = got;
	mutex_unlock(&monc->mutex);
	return 0;
}
EXPORT_SYMBOL(ceph_monc_got_mdsmap);
274 | 274 | ||
/* record the osdmap epoch we now have and clear any pending want */
int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
{
	mutex_lock(&monc->mutex);
	monc->have_osdmap = got;
	monc->want_next_osdmap = 0;
	mutex_unlock(&monc->mutex);
	return 0;
}
283 | 283 | ||
284 | /* | 284 | /* |
285 | * Register interest in the next osdmap | 285 | * Register interest in the next osdmap |
286 | */ | 286 | */ |
void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
{
	/* NOTE(review): have_osdmap is read for the dout before taking
	 * the mutex — harmless for debug output, but racy; confirm */
	dout("request_next_osdmap have %u\n", monc->have_osdmap);
	mutex_lock(&monc->mutex);
	/* want_next_osdmap: 0 = no, 1 = wanted, 2 = already requested */
	if (!monc->want_next_osdmap)
		monc->want_next_osdmap = 1;
	/* only send if a request is not already outstanding */
	if (monc->want_next_osdmap < 2)
		__send_subscribe(monc);
	mutex_unlock(&monc->mutex);
}
297 | 297 | ||
298 | /* | 298 | /* |
299 | * | 299 | * |
300 | */ | 300 | */ |
/* open a monitor session and arm the periodic keepalive work */
int ceph_monc_open_session(struct ceph_mon_client *monc)
{
	mutex_lock(&monc->mutex);
	__open_session(monc);
	__schedule_delayed(monc);
	mutex_unlock(&monc->mutex);
	return 0;
}
EXPORT_SYMBOL(ceph_monc_open_session);
310 | 310 | ||
311 | /* | 311 | /* |
312 | * The monitor responds with mount ack indicate mount success. The | 312 | * The monitor responds with mount ack indicate mount success. The |
313 | * included client ticket allows the client to talk to MDSs and OSDs. | 313 | * included client ticket allows the client to talk to MDSs and OSDs. |
314 | */ | 314 | */ |
static void ceph_monc_handle_map(struct ceph_mon_client *monc,
				 struct ceph_msg *msg)
{
	struct ceph_client *client = monc->client;
	struct ceph_monmap *monmap = NULL, *old = monc->monmap;
	void *p, *end;

	mutex_lock(&monc->mutex);

	dout("handle_monmap\n");
	p = msg->front.iov_base;
	end = p + msg->front.iov_len;

	monmap = ceph_monmap_decode(p, end);
	if (IS_ERR(monmap)) {
		pr_err("problem decoding monmap, %d\n",
		       (int)PTR_ERR(monmap));
		goto out;
	}

	/* reject a monmap for some other cluster */
	if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
		kfree(monmap);
		goto out;
	}

	/* install the new map and free the one it replaces */
	client->monc.monmap = monmap;
	kfree(old);

	if (!client->have_fsid) {
		client->have_fsid = true;
		mutex_unlock(&monc->mutex);
		/*
		 * do debugfs initialization without mutex to avoid
		 * creating a locking dependency
		 */
		ceph_debugfs_client_init(client);
		goto out_unlocked;
	}
out:
	mutex_unlock(&monc->mutex);
out_unlocked:
	/* wake anyone waiting for the initial map/auth to complete */
	wake_up_all(&client->auth_wq);
}
347 | 358 | ||
348 | /* | 359 | /* |
349 | * generic requests (e.g., statfs, poolop) | 360 | * generic requests (e.g., statfs, poolop) |
350 | */ | 361 | */ |
351 | static struct ceph_mon_generic_request *__lookup_generic_req( | 362 | static struct ceph_mon_generic_request *__lookup_generic_req( |
352 | struct ceph_mon_client *monc, u64 tid) | 363 | struct ceph_mon_client *monc, u64 tid) |
353 | { | 364 | { |
354 | struct ceph_mon_generic_request *req; | 365 | struct ceph_mon_generic_request *req; |
355 | struct rb_node *n = monc->generic_request_tree.rb_node; | 366 | struct rb_node *n = monc->generic_request_tree.rb_node; |
356 | 367 | ||
357 | while (n) { | 368 | while (n) { |
358 | req = rb_entry(n, struct ceph_mon_generic_request, node); | 369 | req = rb_entry(n, struct ceph_mon_generic_request, node); |
359 | if (tid < req->tid) | 370 | if (tid < req->tid) |
360 | n = n->rb_left; | 371 | n = n->rb_left; |
361 | else if (tid > req->tid) | 372 | else if (tid > req->tid) |
362 | n = n->rb_right; | 373 | n = n->rb_right; |
363 | else | 374 | else |
364 | return req; | 375 | return req; |
365 | } | 376 | } |
366 | return NULL; | 377 | return NULL; |
367 | } | 378 | } |
368 | 379 | ||
369 | static void __insert_generic_request(struct ceph_mon_client *monc, | 380 | static void __insert_generic_request(struct ceph_mon_client *monc, |
370 | struct ceph_mon_generic_request *new) | 381 | struct ceph_mon_generic_request *new) |
371 | { | 382 | { |
372 | struct rb_node **p = &monc->generic_request_tree.rb_node; | 383 | struct rb_node **p = &monc->generic_request_tree.rb_node; |
373 | struct rb_node *parent = NULL; | 384 | struct rb_node *parent = NULL; |
374 | struct ceph_mon_generic_request *req = NULL; | 385 | struct ceph_mon_generic_request *req = NULL; |
375 | 386 | ||
376 | while (*p) { | 387 | while (*p) { |
377 | parent = *p; | 388 | parent = *p; |
378 | req = rb_entry(parent, struct ceph_mon_generic_request, node); | 389 | req = rb_entry(parent, struct ceph_mon_generic_request, node); |
379 | if (new->tid < req->tid) | 390 | if (new->tid < req->tid) |
380 | p = &(*p)->rb_left; | 391 | p = &(*p)->rb_left; |
381 | else if (new->tid > req->tid) | 392 | else if (new->tid > req->tid) |
382 | p = &(*p)->rb_right; | 393 | p = &(*p)->rb_right; |
383 | else | 394 | else |
384 | BUG(); | 395 | BUG(); |
385 | } | 396 | } |
386 | 397 | ||
387 | rb_link_node(&new->node, parent, p); | 398 | rb_link_node(&new->node, parent, p); |
388 | rb_insert_color(&new->node, &monc->generic_request_tree); | 399 | rb_insert_color(&new->node, &monc->generic_request_tree); |
389 | } | 400 | } |
390 | 401 | ||
391 | static void release_generic_request(struct kref *kref) | 402 | static void release_generic_request(struct kref *kref) |
392 | { | 403 | { |
393 | struct ceph_mon_generic_request *req = | 404 | struct ceph_mon_generic_request *req = |
394 | container_of(kref, struct ceph_mon_generic_request, kref); | 405 | container_of(kref, struct ceph_mon_generic_request, kref); |
395 | 406 | ||
396 | if (req->reply) | 407 | if (req->reply) |
397 | ceph_msg_put(req->reply); | 408 | ceph_msg_put(req->reply); |
398 | if (req->request) | 409 | if (req->request) |
399 | ceph_msg_put(req->request); | 410 | ceph_msg_put(req->request); |
400 | 411 | ||
401 | kfree(req); | 412 | kfree(req); |
402 | } | 413 | } |
403 | 414 | ||
/* drop a reference; frees the request when the last ref goes away */
static void put_generic_request(struct ceph_mon_generic_request *req)
{
	kref_put(&req->kref, release_generic_request);
}
408 | 419 | ||
/* take an additional reference on a generic request */
static void get_generic_request(struct ceph_mon_generic_request *req)
{
	kref_get(&req->kref);
}
413 | 424 | ||
/*
 * Messenger callback: supply the preallocated reply message for an
 * incoming generic reply, or set *skip if the tid is unknown (e.g. the
 * waiter already gave up).  Returns a ref the messenger will put.
 */
static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
					  struct ceph_msg_header *hdr,
					  int *skip)
{
	struct ceph_mon_client *monc = con->private;
	struct ceph_mon_generic_request *req;
	u64 tid = le64_to_cpu(hdr->tid);
	struct ceph_msg *m;

	mutex_lock(&monc->mutex);
	req = __lookup_generic_req(monc, tid);
	if (!req) {
		dout("get_generic_reply %lld dne\n", tid);
		*skip = 1;
		m = NULL;
	} else {
		dout("get_generic_reply %lld got %p\n", tid, req->reply);
		m = ceph_msg_get(req->reply);
		/*
		 * we don't need to track the connection reading into
		 * this reply because we only have one open connection
		 * at a time, ever.
		 */
	}
	mutex_unlock(&monc->mutex);
	return m;
}
441 | 452 | ||
/*
 * Register a generic request, send it, and wait (interruptibly) for the
 * reply handler to complete it.  Returns the handler's result, or a
 * negative error if the wait was interrupted.
 */
static int do_generic_request(struct ceph_mon_client *monc,
			      struct ceph_mon_generic_request *req)
{
	int err;

	/* register request */
	mutex_lock(&monc->mutex);
	req->tid = ++monc->last_tid;
	req->request->hdr.tid = cpu_to_le64(req->tid);
	__insert_generic_request(monc, req);
	monc->num_generic_requests++;
	/* extra msg ref: the messenger owns it until send completes */
	ceph_con_send(monc->con, ceph_msg_get(req->request));
	mutex_unlock(&monc->mutex);

	err = wait_for_completion_interruptible(&req->completion);

	/* unregister; a concurrent reply handler holds its own kref,
	 * so the struct stays valid even if we were interrupted */
	mutex_lock(&monc->mutex);
	rb_erase(&req->node, &monc->generic_request_tree);
	monc->num_generic_requests--;
	mutex_unlock(&monc->mutex);

	if (!err)
		err = req->result;
	return err;
}
467 | 478 | ||
468 | /* | 479 | /* |
469 | * statfs | 480 | * statfs |
470 | */ | 481 | */ |
static void handle_statfs_reply(struct ceph_mon_client *monc,
				struct ceph_msg *msg)
{
	struct ceph_mon_generic_request *req;
	struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
	u64 tid = le64_to_cpu(msg->hdr.tid);

	/* validate payload length before touching the struct */
	if (msg->front.iov_len != sizeof(*reply))
		goto bad;
	dout("handle_statfs_reply %p tid %llu\n", msg, tid);

	mutex_lock(&monc->mutex);
	req = __lookup_generic_req(monc, tid);
	if (req) {
		/* copy the statfs result into the waiter's buffer */
		*(struct ceph_statfs *)req->buf = reply->st;
		req->result = 0;
		/* hold a ref so the req survives after we drop the mutex */
		get_generic_request(req);
	}
	mutex_unlock(&monc->mutex);
	if (req) {
		complete_all(&req->completion);
		put_generic_request(req);
	}
	return;

bad:
	pr_err("corrupt generic reply, tid %llu\n", tid);
	ceph_msg_dump(msg);
}
500 | 511 | ||
501 | /* | 512 | /* |
502 | * Do a synchronous statfs(). | 513 | * Do a synchronous statfs(). |
503 | */ | 514 | */ |
int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
{
	struct ceph_mon_generic_request *req;
	struct ceph_mon_statfs *h;
	int err;

	req = kzalloc(sizeof(*req), GFP_NOFS);
	if (!req)
		return -ENOMEM;

	kref_init(&req->kref);
	req->buf = buf;		/* reply handler copies the result here */
	req->buf_len = sizeof(*buf);
	init_completion(&req->completion);

	err = -ENOMEM;
	req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS,
				    true);
	if (!req->request)
		goto out;
	/* preallocate the reply so the messenger never has to allocate */
	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS,
				  true);
	if (!req->reply)
		goto out;

	/* fill out request */
	h = req->request->front.iov_base;
	h->monhdr.have_version = 0;
	h->monhdr.session_mon = cpu_to_le16(-1);	/* any monitor */
	h->monhdr.session_mon_tid = 0;
	h->fsid = monc->monmap->fsid;

	err = do_generic_request(monc, req);

out:
	/* drops our ref; release_generic_request also frees the msgs */
	kref_put(&req->kref, release_generic_request);
	return err;
}
EXPORT_SYMBOL(ceph_monc_do_statfs);
543 | 554 | ||
544 | /* | 555 | /* |
545 | * pool ops | 556 | * pool ops |
546 | */ | 557 | */ |
547 | static int get_poolop_reply_buf(const char *src, size_t src_len, | 558 | static int get_poolop_reply_buf(const char *src, size_t src_len, |
548 | char *dst, size_t dst_len) | 559 | char *dst, size_t dst_len) |
549 | { | 560 | { |
550 | u32 buf_len; | 561 | u32 buf_len; |
551 | 562 | ||
552 | if (src_len != sizeof(u32) + dst_len) | 563 | if (src_len != sizeof(u32) + dst_len) |
553 | return -EINVAL; | 564 | return -EINVAL; |
554 | 565 | ||
555 | buf_len = le32_to_cpu(*(u32 *)src); | 566 | buf_len = le32_to_cpu(*(u32 *)src); |
556 | if (buf_len != dst_len) | 567 | if (buf_len != dst_len) |
557 | return -EINVAL; | 568 | return -EINVAL; |
558 | 569 | ||
559 | memcpy(dst, src + sizeof(u32), dst_len); | 570 | memcpy(dst, src + sizeof(u32), dst_len); |
560 | return 0; | 571 | return 0; |
561 | } | 572 | } |
562 | 573 | ||
static void handle_poolop_reply(struct ceph_mon_client *monc,
				struct ceph_msg *msg)
{
	struct ceph_mon_generic_request *req;
	struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
	u64 tid = le64_to_cpu(msg->hdr.tid);

	/* validate payload length before touching the struct */
	if (msg->front.iov_len < sizeof(*reply))
		goto bad;
	dout("handle_poolop_reply %p tid %llu\n", msg, tid);

	mutex_lock(&monc->mutex);
	req = __lookup_generic_req(monc, tid);
	if (req) {
		/* copy any trailing reply data into the waiter's buffer */
		if (req->buf_len &&
		    get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
				     msg->front.iov_len - sizeof(*reply),
				     req->buf, req->buf_len) < 0) {
			mutex_unlock(&monc->mutex);
			goto bad;
		}
		req->result = le32_to_cpu(reply->reply_code);
		/* hold a ref so the req survives after we drop the mutex */
		get_generic_request(req);
	}
	mutex_unlock(&monc->mutex);
	if (req) {
		complete(&req->completion);
		put_generic_request(req);
	}
	return;

bad:
	pr_err("corrupt generic reply, tid %llu\n", tid);
	ceph_msg_dump(msg);
}
598 | 609 | ||
/*
 * Do a synchronous pool op.
 *
 * Builds a ceph_mon_poolop request for @op on pool @pool (and @snapid,
 * for snapshot ops), submits it via do_generic_request() and blocks
 * until the monitor replies.  Any reply payload is copied into @buf
 * (up to @len bytes) by handle_poolop_reply().
 *
 * Returns the monitor's result code, or a negative errno on failure.
 */
int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
			u32 pool, u64 snapid,
			char *buf, int len)
{
	struct ceph_mon_generic_request *req;
	struct ceph_mon_poolop *h;
	int err;

	req = kzalloc(sizeof(*req), GFP_NOFS);
	if (!req)
		return -ENOMEM;

	kref_init(&req->kref);
	req->buf = buf;
	req->buf_len = len;
	init_completion(&req->completion);

	err = -ENOMEM;
	req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS,
				    true);
	if (!req->request)
		goto out;
	req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS,
				  true);
	if (!req->reply)
		goto out;

	/* fill out request */
	req->request->hdr.version = cpu_to_le16(2);
	h = req->request->front.iov_base;
	h->monhdr.have_version = 0;
	h->monhdr.session_mon = cpu_to_le16(-1);	/* any monitor */
	h->monhdr.session_mon_tid = 0;
	h->fsid = monc->monmap->fsid;
	h->pool = cpu_to_le32(pool);
	h->op = cpu_to_le32(op);
	h->auid = 0;
	h->snapid = cpu_to_le64(snapid);
	h->name_len = 0;

	err = do_generic_request(monc, req);

out:
	/* drop our ref; release_generic_request() frees req and its msgs
	 * once all references are gone */
	kref_put(&req->kref, release_generic_request);
	return err;
}
648 | 659 | ||
649 | int ceph_monc_create_snapid(struct ceph_mon_client *monc, | 660 | int ceph_monc_create_snapid(struct ceph_mon_client *monc, |
650 | u32 pool, u64 *snapid) | 661 | u32 pool, u64 *snapid) |
651 | { | 662 | { |
652 | return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, | 663 | return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, |
653 | pool, 0, (char *)snapid, sizeof(*snapid)); | 664 | pool, 0, (char *)snapid, sizeof(*snapid)); |
654 | 665 | ||
655 | } | 666 | } |
656 | EXPORT_SYMBOL(ceph_monc_create_snapid); | 667 | EXPORT_SYMBOL(ceph_monc_create_snapid); |
657 | 668 | ||
658 | int ceph_monc_delete_snapid(struct ceph_mon_client *monc, | 669 | int ceph_monc_delete_snapid(struct ceph_mon_client *monc, |
659 | u32 pool, u64 snapid) | 670 | u32 pool, u64 snapid) |
660 | { | 671 | { |
661 | return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, | 672 | return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, |
662 | pool, snapid, 0, 0); | 673 | pool, snapid, 0, 0); |
663 | 674 | ||
664 | } | 675 | } |
665 | 676 | ||
666 | /* | 677 | /* |
667 | * Resend pending generic requests. | 678 | * Resend pending generic requests. |
668 | */ | 679 | */ |
669 | static void __resend_generic_request(struct ceph_mon_client *monc) | 680 | static void __resend_generic_request(struct ceph_mon_client *monc) |
670 | { | 681 | { |
671 | struct ceph_mon_generic_request *req; | 682 | struct ceph_mon_generic_request *req; |
672 | struct rb_node *p; | 683 | struct rb_node *p; |
673 | 684 | ||
674 | for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) { | 685 | for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) { |
675 | req = rb_entry(p, struct ceph_mon_generic_request, node); | 686 | req = rb_entry(p, struct ceph_mon_generic_request, node); |
676 | ceph_con_revoke(monc->con, req->request); | 687 | ceph_con_revoke(monc->con, req->request); |
677 | ceph_con_send(monc->con, ceph_msg_get(req->request)); | 688 | ceph_con_send(monc->con, ceph_msg_get(req->request)); |
678 | } | 689 | } |
679 | } | 690 | } |
680 | 691 | ||
/*
 * Delayed work.  If we haven't mounted yet, retry.  Otherwise,
 * renew/retry subscription as needed (in case it is timing out, or we
 * got an ENOMEM).  And keep the monitor connection alive.
 */
static void delayed_work(struct work_struct *work)
{
	struct ceph_mon_client *monc =
		container_of(work, struct ceph_mon_client, delayed_work.work);

	dout("monc delayed_work\n");
	mutex_lock(&monc->mutex);
	if (monc->hunting) {
		/* no usable session yet: tear down the half-open one and
		 * try again (possibly with a different monitor) */
		__close_session(monc);
		__open_session(monc);  /* continue hunting */
	} else {
		/* established session: keep the connection from idling out */
		ceph_con_keepalive(monc->con);

		__validate_auth(monc);

		/* only (re)send our subscription once authenticated */
		if (monc->auth->ops->is_authenticated(monc->auth))
			__send_subscribe(monc);
	}
	__schedule_delayed(monc);	/* re-arm this work item */
	mutex_unlock(&monc->mutex);
}
707 | 718 | ||
708 | /* | 719 | /* |
709 | * On startup, we build a temporary monmap populated with the IPs | 720 | * On startup, we build a temporary monmap populated with the IPs |
710 | * provided by mount(2). | 721 | * provided by mount(2). |
711 | */ | 722 | */ |
712 | static int build_initial_monmap(struct ceph_mon_client *monc) | 723 | static int build_initial_monmap(struct ceph_mon_client *monc) |
713 | { | 724 | { |
714 | struct ceph_options *opt = monc->client->options; | 725 | struct ceph_options *opt = monc->client->options; |
715 | struct ceph_entity_addr *mon_addr = opt->mon_addr; | 726 | struct ceph_entity_addr *mon_addr = opt->mon_addr; |
716 | int num_mon = opt->num_mon; | 727 | int num_mon = opt->num_mon; |
717 | int i; | 728 | int i; |
718 | 729 | ||
719 | /* build initial monmap */ | 730 | /* build initial monmap */ |
720 | monc->monmap = kzalloc(sizeof(*monc->monmap) + | 731 | monc->monmap = kzalloc(sizeof(*monc->monmap) + |
721 | num_mon*sizeof(monc->monmap->mon_inst[0]), | 732 | num_mon*sizeof(monc->monmap->mon_inst[0]), |
722 | GFP_KERNEL); | 733 | GFP_KERNEL); |
723 | if (!monc->monmap) | 734 | if (!monc->monmap) |
724 | return -ENOMEM; | 735 | return -ENOMEM; |
725 | for (i = 0; i < num_mon; i++) { | 736 | for (i = 0; i < num_mon; i++) { |
726 | monc->monmap->mon_inst[i].addr = mon_addr[i]; | 737 | monc->monmap->mon_inst[i].addr = mon_addr[i]; |
727 | monc->monmap->mon_inst[i].addr.nonce = 0; | 738 | monc->monmap->mon_inst[i].addr.nonce = 0; |
728 | monc->monmap->mon_inst[i].name.type = | 739 | monc->monmap->mon_inst[i].name.type = |
729 | CEPH_ENTITY_TYPE_MON; | 740 | CEPH_ENTITY_TYPE_MON; |
730 | monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); | 741 | monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); |
731 | } | 742 | } |
732 | monc->monmap->num_mon = num_mon; | 743 | monc->monmap->num_mon = num_mon; |
733 | monc->have_fsid = false; | 744 | monc->have_fsid = false; |
734 | return 0; | 745 | return 0; |
735 | } | 746 | } |
736 | 747 | ||
737 | int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) | 748 | int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) |
738 | { | 749 | { |
739 | int err = 0; | 750 | int err = 0; |
740 | 751 | ||
741 | dout("init\n"); | 752 | dout("init\n"); |
742 | memset(monc, 0, sizeof(*monc)); | 753 | memset(monc, 0, sizeof(*monc)); |
743 | monc->client = cl; | 754 | monc->client = cl; |
744 | monc->monmap = NULL; | 755 | monc->monmap = NULL; |
745 | mutex_init(&monc->mutex); | 756 | mutex_init(&monc->mutex); |
746 | 757 | ||
747 | err = build_initial_monmap(monc); | 758 | err = build_initial_monmap(monc); |
748 | if (err) | 759 | if (err) |
749 | goto out; | 760 | goto out; |
750 | 761 | ||
751 | /* connection */ | 762 | /* connection */ |
752 | monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL); | 763 | monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL); |
753 | if (!monc->con) | 764 | if (!monc->con) |
754 | goto out_monmap; | 765 | goto out_monmap; |
755 | ceph_con_init(monc->client->msgr, monc->con); | 766 | ceph_con_init(monc->client->msgr, monc->con); |
756 | monc->con->private = monc; | 767 | monc->con->private = monc; |
757 | monc->con->ops = &mon_con_ops; | 768 | monc->con->ops = &mon_con_ops; |
758 | 769 | ||
759 | /* authentication */ | 770 | /* authentication */ |
760 | monc->auth = ceph_auth_init(cl->options->name, | 771 | monc->auth = ceph_auth_init(cl->options->name, |
761 | cl->options->key); | 772 | cl->options->key); |
762 | if (IS_ERR(monc->auth)) { | 773 | if (IS_ERR(monc->auth)) { |
763 | err = PTR_ERR(monc->auth); | 774 | err = PTR_ERR(monc->auth); |
764 | goto out_con; | 775 | goto out_con; |
765 | } | 776 | } |
766 | monc->auth->want_keys = | 777 | monc->auth->want_keys = |
767 | CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | | 778 | CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | |
768 | CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; | 779 | CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; |
769 | 780 | ||
770 | /* msgs */ | 781 | /* msgs */ |
771 | err = -ENOMEM; | 782 | err = -ENOMEM; |
772 | monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK, | 783 | monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK, |
773 | sizeof(struct ceph_mon_subscribe_ack), | 784 | sizeof(struct ceph_mon_subscribe_ack), |
774 | GFP_NOFS, true); | 785 | GFP_NOFS, true); |
775 | if (!monc->m_subscribe_ack) | 786 | if (!monc->m_subscribe_ack) |
776 | goto out_auth; | 787 | goto out_auth; |
777 | 788 | ||
778 | monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS, | 789 | monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS, |
779 | true); | 790 | true); |
780 | if (!monc->m_subscribe) | 791 | if (!monc->m_subscribe) |
781 | goto out_subscribe_ack; | 792 | goto out_subscribe_ack; |
782 | 793 | ||
783 | monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS, | 794 | monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS, |
784 | true); | 795 | true); |
785 | if (!monc->m_auth_reply) | 796 | if (!monc->m_auth_reply) |
786 | goto out_subscribe; | 797 | goto out_subscribe; |
787 | 798 | ||
788 | monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS, true); | 799 | monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS, true); |
789 | monc->pending_auth = 0; | 800 | monc->pending_auth = 0; |
790 | if (!monc->m_auth) | 801 | if (!monc->m_auth) |
791 | goto out_auth_reply; | 802 | goto out_auth_reply; |
792 | 803 | ||
793 | monc->cur_mon = -1; | 804 | monc->cur_mon = -1; |
794 | monc->hunting = true; | 805 | monc->hunting = true; |
795 | monc->sub_renew_after = jiffies; | 806 | monc->sub_renew_after = jiffies; |
796 | monc->sub_sent = 0; | 807 | monc->sub_sent = 0; |
797 | 808 | ||
798 | INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); | 809 | INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); |
799 | monc->generic_request_tree = RB_ROOT; | 810 | monc->generic_request_tree = RB_ROOT; |
800 | monc->num_generic_requests = 0; | 811 | monc->num_generic_requests = 0; |
801 | monc->last_tid = 0; | 812 | monc->last_tid = 0; |
802 | 813 | ||
803 | monc->have_mdsmap = 0; | 814 | monc->have_mdsmap = 0; |
804 | monc->have_osdmap = 0; | 815 | monc->have_osdmap = 0; |
805 | monc->want_next_osdmap = 1; | 816 | monc->want_next_osdmap = 1; |
806 | return 0; | 817 | return 0; |
807 | 818 | ||
808 | out_auth_reply: | 819 | out_auth_reply: |
809 | ceph_msg_put(monc->m_auth_reply); | 820 | ceph_msg_put(monc->m_auth_reply); |
810 | out_subscribe: | 821 | out_subscribe: |
811 | ceph_msg_put(monc->m_subscribe); | 822 | ceph_msg_put(monc->m_subscribe); |
812 | out_subscribe_ack: | 823 | out_subscribe_ack: |
813 | ceph_msg_put(monc->m_subscribe_ack); | 824 | ceph_msg_put(monc->m_subscribe_ack); |
814 | out_auth: | 825 | out_auth: |
815 | ceph_auth_destroy(monc->auth); | 826 | ceph_auth_destroy(monc->auth); |
816 | out_con: | 827 | out_con: |
817 | monc->con->ops->put(monc->con); | 828 | monc->con->ops->put(monc->con); |
818 | out_monmap: | 829 | out_monmap: |
819 | kfree(monc->monmap); | 830 | kfree(monc->monmap); |
820 | out: | 831 | out: |
821 | return err; | 832 | return err; |
822 | } | 833 | } |
823 | EXPORT_SYMBOL(ceph_monc_init); | 834 | EXPORT_SYMBOL(ceph_monc_init); |
824 | 835 | ||
/*
 * Shut down the monitor client and release everything that
 * ceph_monc_init() set up.
 */
void ceph_monc_stop(struct ceph_mon_client *monc)
{
	dout("stop\n");
	/* stop the timer first so it can't race with the teardown below */
	cancel_delayed_work_sync(&monc->delayed_work);

	mutex_lock(&monc->mutex);
	__close_session(monc);

	/* clear ->private so a late fault/dispatch callback sees NULL
	 * and bails out instead of touching a dying monc */
	monc->con->private = NULL;
	monc->con->ops->put(monc->con);
	monc->con = NULL;

	mutex_unlock(&monc->mutex);

	ceph_auth_destroy(monc->auth);

	/* drop our refs on the preallocated handshake messages */
	ceph_msg_put(monc->m_auth);
	ceph_msg_put(monc->m_auth_reply);
	ceph_msg_put(monc->m_subscribe);
	ceph_msg_put(monc->m_subscribe_ack);

	kfree(monc->monmap);
}
EXPORT_SYMBOL(ceph_monc_stop);
849 | 860 | ||
/*
 * Process an AUTH_REPLY from the monitor: feed it to the auth module
 * and either record a fatal auth error, continue the handshake with a
 * follow-up request, or — on a newly completed authentication — adopt
 * our global id and start the session.
 */
static void handle_auth_reply(struct ceph_mon_client *monc,
			      struct ceph_msg *msg)
{
	int ret;
	int was_auth = 0;

	mutex_lock(&monc->mutex);
	/* ->ops may be unset until the auth protocol is negotiated */
	if (monc->auth->ops)
		was_auth = monc->auth->ops->is_authenticated(monc->auth);
	monc->pending_auth = 0;
	/* may build a follow-up request into m_auth's front buffer;
	 * ret > 0 is that request's length */
	ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
				     msg->front.iov_len,
				     monc->m_auth->front.iov_base,
				     monc->m_auth->front_max);
	if (ret < 0) {
		/* fatal: record the error and wake anyone waiting on auth */
		monc->client->auth_err = ret;
		wake_up_all(&monc->client->auth_wq);
	} else if (ret > 0) {
		__send_prepared_auth_request(monc, ret);
	} else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
		dout("authenticated, starting session\n");

		/* we now know our global id; use it as our entity name */
		monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
		monc->client->msgr->inst.name.num =
					cpu_to_le64(monc->auth->global_id);

		/* kick off subscriptions and resend anything queued
		 * while we were unauthenticated */
		__send_subscribe(monc);
		__resend_generic_request(monc);
	}
	mutex_unlock(&monc->mutex);
}
881 | 892 | ||
882 | static int __validate_auth(struct ceph_mon_client *monc) | 893 | static int __validate_auth(struct ceph_mon_client *monc) |
883 | { | 894 | { |
884 | int ret; | 895 | int ret; |
885 | 896 | ||
886 | if (monc->pending_auth) | 897 | if (monc->pending_auth) |
887 | return 0; | 898 | return 0; |
888 | 899 | ||
889 | ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base, | 900 | ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base, |
890 | monc->m_auth->front_max); | 901 | monc->m_auth->front_max); |
891 | if (ret <= 0) | 902 | if (ret <= 0) |
892 | return ret; /* either an error, or no need to authenticate */ | 903 | return ret; /* either an error, or no need to authenticate */ |
893 | __send_prepared_auth_request(monc, ret); | 904 | __send_prepared_auth_request(monc, ret); |
894 | return 0; | 905 | return 0; |
895 | } | 906 | } |
896 | 907 | ||
897 | int ceph_monc_validate_auth(struct ceph_mon_client *monc) | 908 | int ceph_monc_validate_auth(struct ceph_mon_client *monc) |
898 | { | 909 | { |
899 | int ret; | 910 | int ret; |
900 | 911 | ||
901 | mutex_lock(&monc->mutex); | 912 | mutex_lock(&monc->mutex); |
902 | ret = __validate_auth(monc); | 913 | ret = __validate_auth(monc); |
903 | mutex_unlock(&monc->mutex); | 914 | mutex_unlock(&monc->mutex); |
904 | return ret; | 915 | return ret; |
905 | } | 916 | } |
906 | EXPORT_SYMBOL(ceph_monc_validate_auth); | 917 | EXPORT_SYMBOL(ceph_monc_validate_auth); |
907 | 918 | ||
/*
 * handle incoming message
 */
static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
	struct ceph_mon_client *monc = con->private;
	int type = le16_to_cpu(msg->hdr.type);

	/* ->private is cleared by ceph_monc_stop(); drop late messages */
	if (!monc)
		return;

	switch (type) {
	case CEPH_MSG_AUTH_REPLY:
		handle_auth_reply(monc, msg);
		break;

	case CEPH_MSG_MON_SUBSCRIBE_ACK:
		handle_subscribe_ack(monc, msg);
		break;

	case CEPH_MSG_STATFS_REPLY:
		handle_statfs_reply(monc, msg);
		break;

	case CEPH_MSG_POOLOP_REPLY:
		handle_poolop_reply(monc, msg);
		break;

	case CEPH_MSG_MON_MAP:
		ceph_monc_handle_map(monc, msg);
		break;

	case CEPH_MSG_OSD_MAP:
		ceph_osdc_handle_map(&monc->client->osdc, msg);
		break;

	default:
		/* can the chained handler handle it? */
		if (monc->client->extra_mon_dispatch &&
		    monc->client->extra_mon_dispatch(monc->client, msg) == 0)
			break;

		pr_err("received unknown message type %d %s\n", type,
		       ceph_msg_type_name(type));
	}
	/* all paths above consume the message: drop its reference */
	ceph_msg_put(msg);
}
955 | 966 | ||
/*
 * Allocate memory for incoming message
 */
static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
				      struct ceph_msg_header *hdr,
				      int *skip)
{
	struct ceph_mon_client *monc = con->private;
	int type = le16_to_cpu(hdr->type);
	int front_len = le32_to_cpu(hdr->front_len);
	struct ceph_msg *m = NULL;

	*skip = 0;

	switch (type) {
	case CEPH_MSG_MON_SUBSCRIBE_ACK:
		/* hand back a ref on the preallocated reply message */
		m = ceph_msg_get(monc->m_subscribe_ack);
		break;
	case CEPH_MSG_POOLOP_REPLY:
	case CEPH_MSG_STATFS_REPLY:
		/* reply buffer belongs to the matching pending request */
		return get_generic_reply(con, hdr, skip);
	case CEPH_MSG_AUTH_REPLY:
		m = ceph_msg_get(monc->m_auth_reply);
		break;
	case CEPH_MSG_MON_MAP:
	case CEPH_MSG_MDS_MAP:
	case CEPH_MSG_OSD_MAP:
		/* maps vary in size; allocate one to fit front_len */
		m = ceph_msg_new(type, front_len, GFP_NOFS, false);
		break;
	}

	if (!m) {
		/* unknown type (or allocation failure): tell the
		 * messenger to skip this message entirely */
		pr_info("alloc_msg unknown type %d\n", type);
		*skip = 1;
	}
	return m;
}
993 | 1004 | ||
/*
 * If the monitor connection resets, pick a new monitor and resubmit
 * any pending requests.
 */
static void mon_fault(struct ceph_connection *con)
{
	struct ceph_mon_client *monc = con->private;

	if (!monc)
		return;

	dout("mon_fault\n");
	mutex_lock(&monc->mutex);
	/* re-check under the mutex: ceph_monc_stop() may have detached
	 * us while we were waiting for the lock */
	if (!con->private)
		goto out;

	if (!monc->hunting)
		pr_info("mon%d %s session lost, "
			"hunting for new mon\n", monc->cur_mon,
			ceph_pr_addr(&monc->con->peer_addr.in_addr));

	__close_session(monc);
	if (!monc->hunting) {
		/* start hunting */
		monc->hunting = true;
		__open_session(monc);
	} else {
		/* already hunting, let's wait a bit */
		__schedule_delayed(monc);
	}
out:
	mutex_unlock(&monc->mutex);
}
1027 | 1038 | ||
/* Monitor connection callbacks, wired into the ceph messenger. */
static const struct ceph_connection_operations mon_con_ops = {
	.get = ceph_con_get,
	.put = ceph_con_put,
	.dispatch = dispatch,
	.fault = mon_fault,
	.alloc_msg = mon_alloc_msg,
};