Commit 812164f8c3f6f5348aa69003a2f81775c2872ac0
Committed by: Sage Weil
1 parent: 4f0dcb10cf
Exists in: smarc-l5.0.0_1.0.0-ga and 5 other branches
ceph: use ceph_create_snap_context()

Now that we have a library routine to create snap contexts, use it.

This is part of:
    http://tracker.ceph.com/issues/4857

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
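For reference, the rbd_snap_context_create() helper removed from rbd.c below, together with the new call site ceph_create_snap_context(snap_count, GFP_KERNEL), suggests the shared libceph routine is essentially the old rbd allocator with the GFP flags promoted to a parameter. A sketch reconstructed from this diff (not copied from net/ceph, so details may differ):

	/*
	 * Reconstructed from the rbd_snap_context_create() removed in
	 * this commit; the gfp_t parameter is inferred from the new
	 * call site. As with the old helper, the caller must still
	 * fill in snapc->seq and snapc->snaps[0..snap_count-1].
	 */
	struct ceph_snap_context *ceph_create_snap_context(u32 snap_count,
							   gfp_t gfp_flags)
	{
		struct ceph_snap_context *snapc;
		size_t size;

		/* Flexible array: header plus snap_count snapshot ids */
		size = sizeof (struct ceph_snap_context);
		size += snap_count * sizeof (snapc->snaps[0]);
		snapc = kzalloc(size, gfp_flags);
		if (!snapc)
			return NULL;

		atomic_set(&snapc->nref, 1);	/* caller owns one reference */
		snapc->num_snaps = snap_count;

		return snapc;
	}

The diff below also drops rbd's thin rbd_snap_context_get()/rbd_snap_context_put() wrappers, presumably in favor of the ceph_get_snap_context()/ceph_put_snap_context() calls they wrapped.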
Showing 2 changed files with 7 additions and 37 deletions
drivers/block/rbd.c
1 | /* | 1 | /* |
2 | rbd.c -- Export ceph rados objects as a Linux block device | 2 | rbd.c -- Export ceph rados objects as a Linux block device |
3 | 3 | ||
4 | 4 | ||
5 | based on drivers/block/osdblk.c: | 5 | based on drivers/block/osdblk.c: |
6 | 6 | ||
7 | Copyright 2009 Red Hat, Inc. | 7 | Copyright 2009 Red Hat, Inc. |
8 | 8 | ||
9 | This program is free software; you can redistribute it and/or modify | 9 | This program is free software; you can redistribute it and/or modify |
10 | it under the terms of the GNU General Public License as published by | 10 | it under the terms of the GNU General Public License as published by |
11 | the Free Software Foundation. | 11 | the Free Software Foundation. |
12 | 12 | ||
13 | This program is distributed in the hope that it will be useful, | 13 | This program is distributed in the hope that it will be useful, |
14 | but WITHOUT ANY WARRANTY; without even the implied warranty of | 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | GNU General Public License for more details. | 16 | GNU General Public License for more details. |
17 | 17 | ||
18 | You should have received a copy of the GNU General Public License | 18 | You should have received a copy of the GNU General Public License |
19 | along with this program; see the file COPYING. If not, write to | 19 | along with this program; see the file COPYING. If not, write to |
20 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | 20 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. |
21 | 21 | ||
22 | 22 | ||
23 | 23 | ||
24 | For usage instructions, please refer to: | 24 | For usage instructions, please refer to: |
25 | 25 | ||
26 | Documentation/ABI/testing/sysfs-bus-rbd | 26 | Documentation/ABI/testing/sysfs-bus-rbd |
27 | 27 | ||
28 | */ | 28 | */ |
29 | 29 | ||
30 | #include <linux/ceph/libceph.h> | 30 | #include <linux/ceph/libceph.h> |
31 | #include <linux/ceph/osd_client.h> | 31 | #include <linux/ceph/osd_client.h> |
32 | #include <linux/ceph/mon_client.h> | 32 | #include <linux/ceph/mon_client.h> |
33 | #include <linux/ceph/decode.h> | 33 | #include <linux/ceph/decode.h> |
34 | #include <linux/parser.h> | 34 | #include <linux/parser.h> |
35 | 35 | ||
36 | #include <linux/kernel.h> | 36 | #include <linux/kernel.h> |
37 | #include <linux/device.h> | 37 | #include <linux/device.h> |
38 | #include <linux/module.h> | 38 | #include <linux/module.h> |
39 | #include <linux/fs.h> | 39 | #include <linux/fs.h> |
40 | #include <linux/blkdev.h> | 40 | #include <linux/blkdev.h> |
41 | 41 | ||
42 | #include "rbd_types.h" | 42 | #include "rbd_types.h" |
43 | 43 | ||
44 | #define RBD_DEBUG /* Activate rbd_assert() calls */ | 44 | #define RBD_DEBUG /* Activate rbd_assert() calls */ |
45 | 45 | ||
46 | /* | 46 | /* |
47 | * The basic unit of block I/O is a sector. It is interpreted in a | 47 | * The basic unit of block I/O is a sector. It is interpreted in a |
48 | * number of contexts in Linux (blk, bio, genhd), but the default is | 48 | * number of contexts in Linux (blk, bio, genhd), but the default is |
49 | * universally 512 bytes. These symbols are just slightly more | 49 | * universally 512 bytes. These symbols are just slightly more |
50 | * meaningful than the bare numbers they represent. | 50 | * meaningful than the bare numbers they represent. |
51 | */ | 51 | */ |
52 | #define SECTOR_SHIFT 9 | 52 | #define SECTOR_SHIFT 9 |
53 | #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) | 53 | #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) |
54 | 54 | ||
55 | #define RBD_DRV_NAME "rbd" | 55 | #define RBD_DRV_NAME "rbd" |
56 | #define RBD_DRV_NAME_LONG "rbd (rados block device)" | 56 | #define RBD_DRV_NAME_LONG "rbd (rados block device)" |
57 | 57 | ||
58 | #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ | 58 | #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ |
59 | 59 | ||
60 | #define RBD_SNAP_DEV_NAME_PREFIX "snap_" | 60 | #define RBD_SNAP_DEV_NAME_PREFIX "snap_" |
61 | #define RBD_MAX_SNAP_NAME_LEN \ | 61 | #define RBD_MAX_SNAP_NAME_LEN \ |
62 | (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) | 62 | (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) |
63 | 63 | ||
64 | #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ | 64 | #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ |
65 | 65 | ||
66 | #define RBD_SNAP_HEAD_NAME "-" | 66 | #define RBD_SNAP_HEAD_NAME "-" |
67 | 67 | ||
68 | /* This allows a single page to hold an image name sent by OSD */ | 68 | /* This allows a single page to hold an image name sent by OSD */ |
69 | #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) | 69 | #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) |
70 | #define RBD_IMAGE_ID_LEN_MAX 64 | 70 | #define RBD_IMAGE_ID_LEN_MAX 64 |
71 | 71 | ||
72 | #define RBD_OBJ_PREFIX_LEN_MAX 64 | 72 | #define RBD_OBJ_PREFIX_LEN_MAX 64 |
73 | 73 | ||
74 | /* Feature bits */ | 74 | /* Feature bits */ |
75 | 75 | ||
76 | #define RBD_FEATURE_LAYERING (1<<0) | 76 | #define RBD_FEATURE_LAYERING (1<<0) |
77 | #define RBD_FEATURE_STRIPINGV2 (1<<1) | 77 | #define RBD_FEATURE_STRIPINGV2 (1<<1) |
78 | #define RBD_FEATURES_ALL \ | 78 | #define RBD_FEATURES_ALL \ |
79 | (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2) | 79 | (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2) |
80 | 80 | ||
81 | /* Features supported by this (client software) implementation. */ | 81 | /* Features supported by this (client software) implementation. */ |
82 | 82 | ||
83 | #define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) | 83 | #define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) |
84 | 84 | ||
85 | /* | 85 | /* |
86 | * An RBD device name will be "rbd#", where the "rbd" comes from | 86 | * An RBD device name will be "rbd#", where the "rbd" comes from |
87 | * RBD_DRV_NAME above, and # is a unique integer identifier. | 87 | * RBD_DRV_NAME above, and # is a unique integer identifier. |
88 | * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big | 88 | * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big |
89 | * enough to hold all possible device names. | 89 | * enough to hold all possible device names. |
90 | */ | 90 | */ |
91 | #define DEV_NAME_LEN 32 | 91 | #define DEV_NAME_LEN 32 |
92 | #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) | 92 | #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) |
93 | 93 | ||
94 | /* | 94 | /* |
95 | * block device image metadata (in-memory version) | 95 | * block device image metadata (in-memory version) |
96 | */ | 96 | */ |
97 | struct rbd_image_header { | 97 | struct rbd_image_header { |
98 | /* These four fields never change for a given rbd image */ | 98 | /* These four fields never change for a given rbd image */ |
99 | char *object_prefix; | 99 | char *object_prefix; |
100 | u64 features; | 100 | u64 features; |
101 | __u8 obj_order; | 101 | __u8 obj_order; |
102 | __u8 crypt_type; | 102 | __u8 crypt_type; |
103 | __u8 comp_type; | 103 | __u8 comp_type; |
104 | 104 | ||
105 | /* The remaining fields need to be updated occasionally */ | 105 | /* The remaining fields need to be updated occasionally */ |
106 | u64 image_size; | 106 | u64 image_size; |
107 | struct ceph_snap_context *snapc; | 107 | struct ceph_snap_context *snapc; |
108 | char *snap_names; | 108 | char *snap_names; |
109 | u64 *snap_sizes; | 109 | u64 *snap_sizes; |
110 | 110 | ||
111 | u64 stripe_unit; | 111 | u64 stripe_unit; |
112 | u64 stripe_count; | 112 | u64 stripe_count; |
113 | 113 | ||
114 | u64 obj_version; | 114 | u64 obj_version; |
115 | }; | 115 | }; |
116 | 116 | ||
117 | /* | 117 | /* |
118 | * An rbd image specification. | 118 | * An rbd image specification. |
119 | * | 119 | * |
120 | * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely | 120 | * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely |
121 | * identify an image. Each rbd_dev structure includes a pointer to | 121 | * identify an image. Each rbd_dev structure includes a pointer to |
122 | * an rbd_spec structure that encapsulates this identity. | 122 | * an rbd_spec structure that encapsulates this identity. |
123 | * | 123 | * |
124 | * Each of the id's in an rbd_spec has an associated name. For a | 124 | * Each of the id's in an rbd_spec has an associated name. For a |
125 | * user-mapped image, the names are supplied and the id's associated | 125 | * user-mapped image, the names are supplied and the id's associated |
126 | * with them are looked up. For a layered image, a parent image is | 126 | * with them are looked up. For a layered image, a parent image is |
127 | * defined by the tuple, and the names are looked up. | 127 | * defined by the tuple, and the names are looked up. |
128 | * | 128 | * |
129 | * An rbd_dev structure contains a parent_spec pointer which is | 129 | * An rbd_dev structure contains a parent_spec pointer which is |
130 | * non-null if the image it represents is a child in a layered | 130 | * non-null if the image it represents is a child in a layered |
131 | * image. This pointer will refer to the rbd_spec structure used | 131 | * image. This pointer will refer to the rbd_spec structure used |
132 | * by the parent rbd_dev for its own identity (i.e., the structure | 132 | * by the parent rbd_dev for its own identity (i.e., the structure |
133 | * is shared between the parent and child). | 133 | * is shared between the parent and child). |
134 | * | 134 | * |
135 | * Since these structures are populated once, during the discovery | 135 | * Since these structures are populated once, during the discovery |
136 | * phase of image construction, they are effectively immutable so | 136 | * phase of image construction, they are effectively immutable so |
137 | * we make no effort to synchronize access to them. | 137 | * we make no effort to synchronize access to them. |
138 | * | 138 | * |
139 | * Note that code herein does not assume the image name is known (it | 139 | * Note that code herein does not assume the image name is known (it |
140 | * could be a null pointer). | 140 | * could be a null pointer). |
141 | */ | 141 | */ |
142 | struct rbd_spec { | 142 | struct rbd_spec { |
143 | u64 pool_id; | 143 | u64 pool_id; |
144 | const char *pool_name; | 144 | const char *pool_name; |
145 | 145 | ||
146 | const char *image_id; | 146 | const char *image_id; |
147 | const char *image_name; | 147 | const char *image_name; |
148 | 148 | ||
149 | u64 snap_id; | 149 | u64 snap_id; |
150 | const char *snap_name; | 150 | const char *snap_name; |
151 | 151 | ||
152 | struct kref kref; | 152 | struct kref kref; |
153 | }; | 153 | }; |
154 | 154 | ||
155 | /* | 155 | /* |
156 | * an instance of the client. multiple devices may share an rbd client. | 156 | * an instance of the client. multiple devices may share an rbd client. |
157 | */ | 157 | */ |
158 | struct rbd_client { | 158 | struct rbd_client { |
159 | struct ceph_client *client; | 159 | struct ceph_client *client; |
160 | struct kref kref; | 160 | struct kref kref; |
161 | struct list_head node; | 161 | struct list_head node; |
162 | }; | 162 | }; |
163 | 163 | ||
164 | struct rbd_img_request; | 164 | struct rbd_img_request; |
165 | typedef void (*rbd_img_callback_t)(struct rbd_img_request *); | 165 | typedef void (*rbd_img_callback_t)(struct rbd_img_request *); |
166 | 166 | ||
167 | #define BAD_WHICH U32_MAX /* Good which or bad which, which? */ | 167 | #define BAD_WHICH U32_MAX /* Good which or bad which, which? */ |
168 | 168 | ||
169 | struct rbd_obj_request; | 169 | struct rbd_obj_request; |
170 | typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); | 170 | typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); |
171 | 171 | ||
172 | enum obj_request_type { | 172 | enum obj_request_type { |
173 | OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES | 173 | OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES |
174 | }; | 174 | }; |
175 | 175 | ||
176 | enum obj_req_flags { | 176 | enum obj_req_flags { |
177 | OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ | 177 | OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ |
178 | OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ | 178 | OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ |
179 | OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */ | 179 | OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */ |
180 | OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ | 180 | OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ |
181 | }; | 181 | }; |
182 | 182 | ||
183 | struct rbd_obj_request { | 183 | struct rbd_obj_request { |
184 | const char *object_name; | 184 | const char *object_name; |
185 | u64 offset; /* object start byte */ | 185 | u64 offset; /* object start byte */ |
186 | u64 length; /* bytes from offset */ | 186 | u64 length; /* bytes from offset */ |
187 | unsigned long flags; | 187 | unsigned long flags; |
188 | 188 | ||
189 | /* | 189 | /* |
190 | * An object request associated with an image will have its | 190 | * An object request associated with an image will have its |
191 | * img_data flag set; a standalone object request will not. | 191 | * img_data flag set; a standalone object request will not. |
192 | * | 192 | * |
193 | * A standalone object request will have which == BAD_WHICH | 193 | * A standalone object request will have which == BAD_WHICH |
194 | * and a null obj_request pointer. | 194 | * and a null obj_request pointer. |
195 | * | 195 | * |
196 | * An object request initiated in support of a layered image | 196 | * An object request initiated in support of a layered image |
197 | * object (to check for its existence before a write) will | 197 | * object (to check for its existence before a write) will |
198 | * have which == BAD_WHICH and a non-null obj_request pointer. | 198 | * have which == BAD_WHICH and a non-null obj_request pointer. |
199 | * | 199 | * |
200 | * Finally, an object request for rbd image data will have | 200 | * Finally, an object request for rbd image data will have |
201 | * which != BAD_WHICH, and will have a non-null img_request | 201 | * which != BAD_WHICH, and will have a non-null img_request |
202 | * pointer. The value of which will be in the range | 202 | * pointer. The value of which will be in the range |
203 | * 0..(img_request->obj_request_count-1). | 203 | * 0..(img_request->obj_request_count-1). |
204 | */ | 204 | */ |
205 | union { | 205 | union { |
206 | struct rbd_obj_request *obj_request; /* STAT op */ | 206 | struct rbd_obj_request *obj_request; /* STAT op */ |
207 | struct { | 207 | struct { |
208 | struct rbd_img_request *img_request; | 208 | struct rbd_img_request *img_request; |
209 | u64 img_offset; | 209 | u64 img_offset; |
210 | /* links for img_request->obj_requests list */ | 210 | /* links for img_request->obj_requests list */ |
211 | struct list_head links; | 211 | struct list_head links; |
212 | }; | 212 | }; |
213 | }; | 213 | }; |
214 | u32 which; /* posn image request list */ | 214 | u32 which; /* posn image request list */ |
215 | 215 | ||
216 | enum obj_request_type type; | 216 | enum obj_request_type type; |
217 | union { | 217 | union { |
218 | struct bio *bio_list; | 218 | struct bio *bio_list; |
219 | struct { | 219 | struct { |
220 | struct page **pages; | 220 | struct page **pages; |
221 | u32 page_count; | 221 | u32 page_count; |
222 | }; | 222 | }; |
223 | }; | 223 | }; |
224 | struct page **copyup_pages; | 224 | struct page **copyup_pages; |
225 | 225 | ||
226 | struct ceph_osd_request *osd_req; | 226 | struct ceph_osd_request *osd_req; |
227 | 227 | ||
228 | u64 xferred; /* bytes transferred */ | 228 | u64 xferred; /* bytes transferred */ |
229 | u64 version; | 229 | u64 version; |
230 | int result; | 230 | int result; |
231 | 231 | ||
232 | rbd_obj_callback_t callback; | 232 | rbd_obj_callback_t callback; |
233 | struct completion completion; | 233 | struct completion completion; |
234 | 234 | ||
235 | struct kref kref; | 235 | struct kref kref; |
236 | }; | 236 | }; |
237 | 237 | ||
238 | enum img_req_flags { | 238 | enum img_req_flags { |
239 | IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ | 239 | IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ |
240 | IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ | 240 | IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ |
241 | IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ | 241 | IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ |
242 | }; | 242 | }; |
243 | 243 | ||
244 | struct rbd_img_request { | 244 | struct rbd_img_request { |
245 | struct rbd_device *rbd_dev; | 245 | struct rbd_device *rbd_dev; |
246 | u64 offset; /* starting image byte offset */ | 246 | u64 offset; /* starting image byte offset */ |
247 | u64 length; /* byte count from offset */ | 247 | u64 length; /* byte count from offset */ |
248 | unsigned long flags; | 248 | unsigned long flags; |
249 | union { | 249 | union { |
250 | u64 snap_id; /* for reads */ | 250 | u64 snap_id; /* for reads */ |
251 | struct ceph_snap_context *snapc; /* for writes */ | 251 | struct ceph_snap_context *snapc; /* for writes */ |
252 | }; | 252 | }; |
253 | union { | 253 | union { |
254 | struct request *rq; /* block request */ | 254 | struct request *rq; /* block request */ |
255 | struct rbd_obj_request *obj_request; /* obj req initiator */ | 255 | struct rbd_obj_request *obj_request; /* obj req initiator */ |
256 | }; | 256 | }; |
257 | struct page **copyup_pages; | 257 | struct page **copyup_pages; |
258 | spinlock_t completion_lock;/* protects next_completion */ | 258 | spinlock_t completion_lock;/* protects next_completion */ |
259 | u32 next_completion; | 259 | u32 next_completion; |
260 | rbd_img_callback_t callback; | 260 | rbd_img_callback_t callback; |
261 | u64 xferred;/* aggregate bytes transferred */ | 261 | u64 xferred;/* aggregate bytes transferred */ |
262 | int result; /* first nonzero obj_request result */ | 262 | int result; /* first nonzero obj_request result */ |
263 | 263 | ||
264 | u32 obj_request_count; | 264 | u32 obj_request_count; |
265 | struct list_head obj_requests; /* rbd_obj_request structs */ | 265 | struct list_head obj_requests; /* rbd_obj_request structs */ |
266 | 266 | ||
267 | struct kref kref; | 267 | struct kref kref; |
268 | }; | 268 | }; |
269 | 269 | ||
270 | #define for_each_obj_request(ireq, oreq) \ | 270 | #define for_each_obj_request(ireq, oreq) \ |
271 | list_for_each_entry(oreq, &(ireq)->obj_requests, links) | 271 | list_for_each_entry(oreq, &(ireq)->obj_requests, links) |
272 | #define for_each_obj_request_from(ireq, oreq) \ | 272 | #define for_each_obj_request_from(ireq, oreq) \ |
273 | list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) | 273 | list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) |
274 | #define for_each_obj_request_safe(ireq, oreq, n) \ | 274 | #define for_each_obj_request_safe(ireq, oreq, n) \ |
275 | list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) | 275 | list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) |
276 | 276 | ||
277 | struct rbd_snap { | 277 | struct rbd_snap { |
278 | const char *name; | 278 | const char *name; |
279 | u64 size; | 279 | u64 size; |
280 | struct list_head node; | 280 | struct list_head node; |
281 | u64 id; | 281 | u64 id; |
282 | u64 features; | 282 | u64 features; |
283 | }; | 283 | }; |
284 | 284 | ||
285 | struct rbd_mapping { | 285 | struct rbd_mapping { |
286 | u64 size; | 286 | u64 size; |
287 | u64 features; | 287 | u64 features; |
288 | bool read_only; | 288 | bool read_only; |
289 | }; | 289 | }; |
290 | 290 | ||
291 | /* | 291 | /* |
292 | * a single device | 292 | * a single device |
293 | */ | 293 | */ |
294 | struct rbd_device { | 294 | struct rbd_device { |
295 | int dev_id; /* blkdev unique id */ | 295 | int dev_id; /* blkdev unique id */ |
296 | 296 | ||
297 | int major; /* blkdev assigned major */ | 297 | int major; /* blkdev assigned major */ |
298 | struct gendisk *disk; /* blkdev's gendisk and rq */ | 298 | struct gendisk *disk; /* blkdev's gendisk and rq */ |
299 | 299 | ||
300 | u32 image_format; /* Either 1 or 2 */ | 300 | u32 image_format; /* Either 1 or 2 */ |
301 | struct rbd_client *rbd_client; | 301 | struct rbd_client *rbd_client; |
302 | 302 | ||
303 | char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ | 303 | char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ |
304 | 304 | ||
305 | spinlock_t lock; /* queue, flags, open_count */ | 305 | spinlock_t lock; /* queue, flags, open_count */ |
306 | 306 | ||
307 | struct rbd_image_header header; | 307 | struct rbd_image_header header; |
308 | unsigned long flags; /* possibly lock protected */ | 308 | unsigned long flags; /* possibly lock protected */ |
309 | struct rbd_spec *spec; | 309 | struct rbd_spec *spec; |
310 | 310 | ||
311 | char *header_name; | 311 | char *header_name; |
312 | 312 | ||
313 | struct ceph_file_layout layout; | 313 | struct ceph_file_layout layout; |
314 | 314 | ||
315 | struct ceph_osd_event *watch_event; | 315 | struct ceph_osd_event *watch_event; |
316 | struct rbd_obj_request *watch_request; | 316 | struct rbd_obj_request *watch_request; |
317 | 317 | ||
318 | struct rbd_spec *parent_spec; | 318 | struct rbd_spec *parent_spec; |
319 | u64 parent_overlap; | 319 | u64 parent_overlap; |
320 | struct rbd_device *parent; | 320 | struct rbd_device *parent; |
321 | 321 | ||
322 | /* protects updating the header */ | 322 | /* protects updating the header */ |
323 | struct rw_semaphore header_rwsem; | 323 | struct rw_semaphore header_rwsem; |
324 | 324 | ||
325 | struct rbd_mapping mapping; | 325 | struct rbd_mapping mapping; |
326 | 326 | ||
327 | struct list_head node; | 327 | struct list_head node; |
328 | 328 | ||
329 | /* list of snapshots */ | 329 | /* list of snapshots */ |
330 | struct list_head snaps; | 330 | struct list_head snaps; |
331 | 331 | ||
332 | /* sysfs related */ | 332 | /* sysfs related */ |
333 | struct device dev; | 333 | struct device dev; |
334 | unsigned long open_count; /* protected by lock */ | 334 | unsigned long open_count; /* protected by lock */ |
335 | }; | 335 | }; |
336 | 336 | ||
337 | /* | 337 | /* |
338 | * Flag bits for rbd_dev->flags. If atomicity is required, | 338 | * Flag bits for rbd_dev->flags. If atomicity is required, |
339 | * rbd_dev->lock is used to protect access. | 339 | * rbd_dev->lock is used to protect access. |
340 | * | 340 | * |
341 | * Currently, only the "removing" flag (which is coupled with the | 341 | * Currently, only the "removing" flag (which is coupled with the |
342 | * "open_count" field) requires atomic access. | 342 | * "open_count" field) requires atomic access. |
343 | */ | 343 | */ |
344 | enum rbd_dev_flags { | 344 | enum rbd_dev_flags { |
345 | RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ | 345 | RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ |
346 | RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ | 346 | RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ |
347 | }; | 347 | }; |
348 | 348 | ||
349 | static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ | 349 | static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ |
350 | 350 | ||
351 | static LIST_HEAD(rbd_dev_list); /* devices */ | 351 | static LIST_HEAD(rbd_dev_list); /* devices */ |
352 | static DEFINE_SPINLOCK(rbd_dev_list_lock); | 352 | static DEFINE_SPINLOCK(rbd_dev_list_lock); |
353 | 353 | ||
354 | static LIST_HEAD(rbd_client_list); /* clients */ | 354 | static LIST_HEAD(rbd_client_list); /* clients */ |
355 | static DEFINE_SPINLOCK(rbd_client_list_lock); | 355 | static DEFINE_SPINLOCK(rbd_client_list_lock); |
356 | 356 | ||
357 | static int rbd_img_request_submit(struct rbd_img_request *img_request); | 357 | static int rbd_img_request_submit(struct rbd_img_request *img_request); |
358 | 358 | ||
359 | static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); | 359 | static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); |
360 | 360 | ||
361 | static void rbd_dev_device_release(struct device *dev); | 361 | static void rbd_dev_device_release(struct device *dev); |
362 | static void rbd_snap_destroy(struct rbd_snap *snap); | 362 | static void rbd_snap_destroy(struct rbd_snap *snap); |
363 | 363 | ||
364 | static ssize_t rbd_add(struct bus_type *bus, const char *buf, | 364 | static ssize_t rbd_add(struct bus_type *bus, const char *buf, |
365 | size_t count); | 365 | size_t count); |
366 | static ssize_t rbd_remove(struct bus_type *bus, const char *buf, | 366 | static ssize_t rbd_remove(struct bus_type *bus, const char *buf, |
367 | size_t count); | 367 | size_t count); |
368 | static int rbd_dev_image_probe(struct rbd_device *rbd_dev); | 368 | static int rbd_dev_image_probe(struct rbd_device *rbd_dev); |
369 | 369 | ||
370 | static struct bus_attribute rbd_bus_attrs[] = { | 370 | static struct bus_attribute rbd_bus_attrs[] = { |
371 | __ATTR(add, S_IWUSR, NULL, rbd_add), | 371 | __ATTR(add, S_IWUSR, NULL, rbd_add), |
372 | __ATTR(remove, S_IWUSR, NULL, rbd_remove), | 372 | __ATTR(remove, S_IWUSR, NULL, rbd_remove), |
373 | __ATTR_NULL | 373 | __ATTR_NULL |
374 | }; | 374 | }; |
375 | 375 | ||
376 | static struct bus_type rbd_bus_type = { | 376 | static struct bus_type rbd_bus_type = { |
377 | .name = "rbd", | 377 | .name = "rbd", |
378 | .bus_attrs = rbd_bus_attrs, | 378 | .bus_attrs = rbd_bus_attrs, |
379 | }; | 379 | }; |
380 | 380 | ||
381 | static void rbd_root_dev_release(struct device *dev) | 381 | static void rbd_root_dev_release(struct device *dev) |
382 | { | 382 | { |
383 | } | 383 | } |
384 | 384 | ||
385 | static struct device rbd_root_dev = { | 385 | static struct device rbd_root_dev = { |
386 | .init_name = "rbd", | 386 | .init_name = "rbd", |
387 | .release = rbd_root_dev_release, | 387 | .release = rbd_root_dev_release, |
388 | }; | 388 | }; |
389 | 389 | ||
390 | static __printf(2, 3) | 390 | static __printf(2, 3) |
391 | void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) | 391 | void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) |
392 | { | 392 | { |
393 | struct va_format vaf; | 393 | struct va_format vaf; |
394 | va_list args; | 394 | va_list args; |
395 | 395 | ||
396 | va_start(args, fmt); | 396 | va_start(args, fmt); |
397 | vaf.fmt = fmt; | 397 | vaf.fmt = fmt; |
398 | vaf.va = &args; | 398 | vaf.va = &args; |
399 | 399 | ||
400 | if (!rbd_dev) | 400 | if (!rbd_dev) |
401 | printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); | 401 | printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); |
402 | else if (rbd_dev->disk) | 402 | else if (rbd_dev->disk) |
403 | printk(KERN_WARNING "%s: %s: %pV\n", | 403 | printk(KERN_WARNING "%s: %s: %pV\n", |
404 | RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); | 404 | RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); |
405 | else if (rbd_dev->spec && rbd_dev->spec->image_name) | 405 | else if (rbd_dev->spec && rbd_dev->spec->image_name) |
406 | printk(KERN_WARNING "%s: image %s: %pV\n", | 406 | printk(KERN_WARNING "%s: image %s: %pV\n", |
407 | RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); | 407 | RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); |
408 | else if (rbd_dev->spec && rbd_dev->spec->image_id) | 408 | else if (rbd_dev->spec && rbd_dev->spec->image_id) |
409 | printk(KERN_WARNING "%s: id %s: %pV\n", | 409 | printk(KERN_WARNING "%s: id %s: %pV\n", |
410 | RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); | 410 | RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); |
411 | else /* punt */ | 411 | else /* punt */ |
412 | printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", | 412 | printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", |
413 | RBD_DRV_NAME, rbd_dev, &vaf); | 413 | RBD_DRV_NAME, rbd_dev, &vaf); |
414 | va_end(args); | 414 | va_end(args); |
415 | } | 415 | } |
416 | 416 | ||
417 | #ifdef RBD_DEBUG | 417 | #ifdef RBD_DEBUG |
418 | #define rbd_assert(expr) \ | 418 | #define rbd_assert(expr) \ |
419 | if (unlikely(!(expr))) { \ | 419 | if (unlikely(!(expr))) { \ |
420 | printk(KERN_ERR "\nAssertion failure in %s() " \ | 420 | printk(KERN_ERR "\nAssertion failure in %s() " \ |
421 | "at line %d:\n\n" \ | 421 | "at line %d:\n\n" \ |
422 | "\trbd_assert(%s);\n\n", \ | 422 | "\trbd_assert(%s);\n\n", \ |
423 | __func__, __LINE__, #expr); \ | 423 | __func__, __LINE__, #expr); \ |
424 | BUG(); \ | 424 | BUG(); \ |
425 | } | 425 | } |
426 | #else /* !RBD_DEBUG */ | 426 | #else /* !RBD_DEBUG */ |
427 | # define rbd_assert(expr) ((void) 0) | 427 | # define rbd_assert(expr) ((void) 0) |
428 | #endif /* !RBD_DEBUG */ | 428 | #endif /* !RBD_DEBUG */ |
429 | 429 | ||
430 | static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request); | 430 | static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request); |
431 | static void rbd_img_parent_read(struct rbd_obj_request *obj_request); | 431 | static void rbd_img_parent_read(struct rbd_obj_request *obj_request); |
432 | static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); | 432 | static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); |
433 | 433 | ||
434 | static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver); | 434 | static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver); |
435 | static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); | 435 | static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); |
436 | 436 | ||
437 | static int rbd_open(struct block_device *bdev, fmode_t mode) | 437 | static int rbd_open(struct block_device *bdev, fmode_t mode) |
438 | { | 438 | { |
439 | struct rbd_device *rbd_dev = bdev->bd_disk->private_data; | 439 | struct rbd_device *rbd_dev = bdev->bd_disk->private_data; |
440 | bool removing = false; | 440 | bool removing = false; |
441 | 441 | ||
442 | if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) | 442 | if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) |
443 | return -EROFS; | 443 | return -EROFS; |
444 | 444 | ||
445 | spin_lock_irq(&rbd_dev->lock); | 445 | spin_lock_irq(&rbd_dev->lock); |
446 | if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) | 446 | if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) |
447 | removing = true; | 447 | removing = true; |
448 | else | 448 | else |
449 | rbd_dev->open_count++; | 449 | rbd_dev->open_count++; |
450 | spin_unlock_irq(&rbd_dev->lock); | 450 | spin_unlock_irq(&rbd_dev->lock); |
451 | if (removing) | 451 | if (removing) |
452 | return -ENOENT; | 452 | return -ENOENT; |
453 | 453 | ||
454 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); | 454 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); |
455 | (void) get_device(&rbd_dev->dev); | 455 | (void) get_device(&rbd_dev->dev); |
456 | set_device_ro(bdev, rbd_dev->mapping.read_only); | 456 | set_device_ro(bdev, rbd_dev->mapping.read_only); |
457 | mutex_unlock(&ctl_mutex); | 457 | mutex_unlock(&ctl_mutex); |
458 | 458 | ||
459 | return 0; | 459 | return 0; |
460 | } | 460 | } |
461 | 461 | ||
462 | static int rbd_release(struct gendisk *disk, fmode_t mode) | 462 | static int rbd_release(struct gendisk *disk, fmode_t mode) |
463 | { | 463 | { |
464 | struct rbd_device *rbd_dev = disk->private_data; | 464 | struct rbd_device *rbd_dev = disk->private_data; |
465 | unsigned long open_count_before; | 465 | unsigned long open_count_before; |
466 | 466 | ||
467 | spin_lock_irq(&rbd_dev->lock); | 467 | spin_lock_irq(&rbd_dev->lock); |
468 | open_count_before = rbd_dev->open_count--; | 468 | open_count_before = rbd_dev->open_count--; |
469 | spin_unlock_irq(&rbd_dev->lock); | 469 | spin_unlock_irq(&rbd_dev->lock); |
470 | rbd_assert(open_count_before > 0); | 470 | rbd_assert(open_count_before > 0); |
471 | 471 | ||
472 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); | 472 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); |
473 | put_device(&rbd_dev->dev); | 473 | put_device(&rbd_dev->dev); |
474 | mutex_unlock(&ctl_mutex); | 474 | mutex_unlock(&ctl_mutex); |
475 | 475 | ||
476 | return 0; | 476 | return 0; |
477 | } | 477 | } |
478 | 478 | ||
479 | static const struct block_device_operations rbd_bd_ops = { | 479 | static const struct block_device_operations rbd_bd_ops = { |
480 | .owner = THIS_MODULE, | 480 | .owner = THIS_MODULE, |
481 | .open = rbd_open, | 481 | .open = rbd_open, |
482 | .release = rbd_release, | 482 | .release = rbd_release, |
483 | }; | 483 | }; |
484 | 484 | ||
485 | /* | 485 | /* |
486 | * Initialize an rbd client instance. | 486 | * Initialize an rbd client instance. |
487 | * We own *ceph_opts. | 487 | * We own *ceph_opts. |
488 | */ | 488 | */ |
489 | static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) | 489 | static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) |
490 | { | 490 | { |
491 | struct rbd_client *rbdc; | 491 | struct rbd_client *rbdc; |
492 | int ret = -ENOMEM; | 492 | int ret = -ENOMEM; |
493 | 493 | ||
494 | dout("%s:\n", __func__); | 494 | dout("%s:\n", __func__); |
495 | rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); | 495 | rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); |
496 | if (!rbdc) | 496 | if (!rbdc) |
497 | goto out_opt; | 497 | goto out_opt; |
498 | 498 | ||
499 | kref_init(&rbdc->kref); | 499 | kref_init(&rbdc->kref); |
500 | INIT_LIST_HEAD(&rbdc->node); | 500 | INIT_LIST_HEAD(&rbdc->node); |
501 | 501 | ||
502 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); | 502 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); |
503 | 503 | ||
504 | rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); | 504 | rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); |
505 | if (IS_ERR(rbdc->client)) | 505 | if (IS_ERR(rbdc->client)) |
506 | goto out_mutex; | 506 | goto out_mutex; |
507 | ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ | 507 | ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ |
508 | 508 | ||
509 | ret = ceph_open_session(rbdc->client); | 509 | ret = ceph_open_session(rbdc->client); |
510 | if (ret < 0) | 510 | if (ret < 0) |
511 | goto out_err; | 511 | goto out_err; |
512 | 512 | ||
513 | spin_lock(&rbd_client_list_lock); | 513 | spin_lock(&rbd_client_list_lock); |
514 | list_add_tail(&rbdc->node, &rbd_client_list); | 514 | list_add_tail(&rbdc->node, &rbd_client_list); |
515 | spin_unlock(&rbd_client_list_lock); | 515 | spin_unlock(&rbd_client_list_lock); |
516 | 516 | ||
517 | mutex_unlock(&ctl_mutex); | 517 | mutex_unlock(&ctl_mutex); |
518 | dout("%s: rbdc %p\n", __func__, rbdc); | 518 | dout("%s: rbdc %p\n", __func__, rbdc); |
519 | 519 | ||
520 | return rbdc; | 520 | return rbdc; |
521 | 521 | ||
522 | out_err: | 522 | out_err: |
523 | ceph_destroy_client(rbdc->client); | 523 | ceph_destroy_client(rbdc->client); |
524 | out_mutex: | 524 | out_mutex: |
525 | mutex_unlock(&ctl_mutex); | 525 | mutex_unlock(&ctl_mutex); |
526 | kfree(rbdc); | 526 | kfree(rbdc); |
527 | out_opt: | 527 | out_opt: |
528 | if (ceph_opts) | 528 | if (ceph_opts) |
529 | ceph_destroy_options(ceph_opts); | 529 | ceph_destroy_options(ceph_opts); |
530 | dout("%s: error %d\n", __func__, ret); | 530 | dout("%s: error %d\n", __func__, ret); |
531 | 531 | ||
532 | return ERR_PTR(ret); | 532 | return ERR_PTR(ret); |
533 | } | 533 | } |
534 | 534 | ||
535 | static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) | 535 | static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) |
536 | { | 536 | { |
537 | kref_get(&rbdc->kref); | 537 | kref_get(&rbdc->kref); |
538 | 538 | ||
539 | return rbdc; | 539 | return rbdc; |
540 | } | 540 | } |
541 | 541 | ||
542 | /* | 542 | /* |
543 | * Find a ceph client with specific addr and configuration. If | 543 | * Find a ceph client with specific addr and configuration. If |
544 | * found, bump its reference count. | 544 | * found, bump its reference count. |
545 | */ | 545 | */ |
546 | static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) | 546 | static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) |
547 | { | 547 | { |
548 | struct rbd_client *client_node; | 548 | struct rbd_client *client_node; |
549 | bool found = false; | 549 | bool found = false; |
550 | 550 | ||
551 | if (ceph_opts->flags & CEPH_OPT_NOSHARE) | 551 | if (ceph_opts->flags & CEPH_OPT_NOSHARE) |
552 | return NULL; | 552 | return NULL; |
553 | 553 | ||
554 | spin_lock(&rbd_client_list_lock); | 554 | spin_lock(&rbd_client_list_lock); |
555 | list_for_each_entry(client_node, &rbd_client_list, node) { | 555 | list_for_each_entry(client_node, &rbd_client_list, node) { |
556 | if (!ceph_compare_options(ceph_opts, client_node->client)) { | 556 | if (!ceph_compare_options(ceph_opts, client_node->client)) { |
557 | __rbd_get_client(client_node); | 557 | __rbd_get_client(client_node); |
558 | 558 | ||
559 | found = true; | 559 | found = true; |
560 | break; | 560 | break; |
561 | } | 561 | } |
562 | } | 562 | } |
563 | spin_unlock(&rbd_client_list_lock); | 563 | spin_unlock(&rbd_client_list_lock); |
564 | 564 | ||
565 | return found ? client_node : NULL; | 565 | return found ? client_node : NULL; |
566 | } | 566 | } |
567 | 567 | ||
568 | /* | 568 | /* |
569 | * mount options | 569 | * mount options |
570 | */ | 570 | */ |
571 | enum { | 571 | enum { |
572 | Opt_last_int, | 572 | Opt_last_int, |
573 | /* int args above */ | 573 | /* int args above */ |
574 | Opt_last_string, | 574 | Opt_last_string, |
575 | /* string args above */ | 575 | /* string args above */ |
576 | Opt_read_only, | 576 | Opt_read_only, |
577 | Opt_read_write, | 577 | Opt_read_write, |
578 | /* Boolean args above */ | 578 | /* Boolean args above */ |
579 | Opt_last_bool, | 579 | Opt_last_bool, |
580 | }; | 580 | }; |
581 | 581 | ||
582 | static match_table_t rbd_opts_tokens = { | 582 | static match_table_t rbd_opts_tokens = { |
583 | /* int args above */ | 583 | /* int args above */ |
584 | /* string args above */ | 584 | /* string args above */ |
585 | {Opt_read_only, "read_only"}, | 585 | {Opt_read_only, "read_only"}, |
586 | {Opt_read_only, "ro"}, /* Alternate spelling */ | 586 | {Opt_read_only, "ro"}, /* Alternate spelling */ |
587 | {Opt_read_write, "read_write"}, | 587 | {Opt_read_write, "read_write"}, |
588 | {Opt_read_write, "rw"}, /* Alternate spelling */ | 588 | {Opt_read_write, "rw"}, /* Alternate spelling */ |
589 | /* Boolean args above */ | 589 | /* Boolean args above */ |
590 | {-1, NULL} | 590 | {-1, NULL} |
591 | }; | 591 | }; |
592 | 592 | ||
593 | struct rbd_options { | 593 | struct rbd_options { |
594 | bool read_only; | 594 | bool read_only; |
595 | }; | 595 | }; |
596 | 596 | ||
597 | #define RBD_READ_ONLY_DEFAULT false | 597 | #define RBD_READ_ONLY_DEFAULT false |
598 | 598 | ||
599 | static int parse_rbd_opts_token(char *c, void *private) | 599 | static int parse_rbd_opts_token(char *c, void *private) |
600 | { | 600 | { |
601 | struct rbd_options *rbd_opts = private; | 601 | struct rbd_options *rbd_opts = private; |
602 | substring_t argstr[MAX_OPT_ARGS]; | 602 | substring_t argstr[MAX_OPT_ARGS]; |
603 | int token, intval, ret; | 603 | int token, intval, ret; |
604 | 604 | ||
605 | token = match_token(c, rbd_opts_tokens, argstr); | 605 | token = match_token(c, rbd_opts_tokens, argstr); |
606 | if (token < 0) | 606 | if (token < 0) |
607 | return -EINVAL; | 607 | return -EINVAL; |
608 | 608 | ||
609 | if (token < Opt_last_int) { | 609 | if (token < Opt_last_int) { |
610 | ret = match_int(&argstr[0], &intval); | 610 | ret = match_int(&argstr[0], &intval); |
611 | if (ret < 0) { | 611 | if (ret < 0) { |
612 | pr_err("bad mount option arg (not int) " | 612 | pr_err("bad mount option arg (not int) " |
613 | "at '%s'\n", c); | 613 | "at '%s'\n", c); |
614 | return ret; | 614 | return ret; |
615 | } | 615 | } |
616 | dout("got int token %d val %d\n", token, intval); | 616 | dout("got int token %d val %d\n", token, intval); |
617 | } else if (token > Opt_last_int && token < Opt_last_string) { | 617 | } else if (token > Opt_last_int && token < Opt_last_string) { |
618 | dout("got string token %d val %s\n", token, | 618 | dout("got string token %d val %s\n", token, |
619 | argstr[0].from); | 619 | argstr[0].from); |
620 | } else if (token > Opt_last_string && token < Opt_last_bool) { | 620 | } else if (token > Opt_last_string && token < Opt_last_bool) { |
621 | dout("got Boolean token %d\n", token); | 621 | dout("got Boolean token %d\n", token); |
622 | } else { | 622 | } else { |
623 | dout("got token %d\n", token); | 623 | dout("got token %d\n", token); |
624 | } | 624 | } |
625 | 625 | ||
626 | switch (token) { | 626 | switch (token) { |
627 | case Opt_read_only: | 627 | case Opt_read_only: |
628 | rbd_opts->read_only = true; | 628 | rbd_opts->read_only = true; |
629 | break; | 629 | break; |
630 | case Opt_read_write: | 630 | case Opt_read_write: |
631 | rbd_opts->read_only = false; | 631 | rbd_opts->read_only = false; |
632 | break; | 632 | break; |
633 | default: | 633 | default: |
634 | rbd_assert(false); | 634 | rbd_assert(false); |
635 | break; | 635 | break; |
636 | } | 636 | } |
637 | return 0; | 637 | return 0; |
638 | } | 638 | } |
639 | 639 | ||
640 | /* | 640 | /* |
641 | * Get a ceph client with specific addr and configuration, if one does | 641 | * Get a ceph client with specific addr and configuration, if one does |
642 | * not exist create it. | 642 | * not exist create it. |
643 | */ | 643 | */ |
644 | static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) | 644 | static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) |
645 | { | 645 | { |
646 | struct rbd_client *rbdc; | 646 | struct rbd_client *rbdc; |
647 | 647 | ||
648 | rbdc = rbd_client_find(ceph_opts); | 648 | rbdc = rbd_client_find(ceph_opts); |
649 | if (rbdc) /* using an existing client */ | 649 | if (rbdc) /* using an existing client */ |
650 | ceph_destroy_options(ceph_opts); | 650 | ceph_destroy_options(ceph_opts); |
651 | else | 651 | else |
652 | rbdc = rbd_client_create(ceph_opts); | 652 | rbdc = rbd_client_create(ceph_opts); |
653 | 653 | ||
654 | return rbdc; | 654 | return rbdc; |
655 | } | 655 | } |
656 | 656 | ||
657 | /* | 657 | /* |
658 | * Destroy ceph client | 658 | * Destroy ceph client |
659 | * | 659 | * |
660 | * Caller must hold rbd_client_list_lock. | 660 | * Caller must hold rbd_client_list_lock. |
661 | */ | 661 | */ |
662 | static void rbd_client_release(struct kref *kref) | 662 | static void rbd_client_release(struct kref *kref) |
663 | { | 663 | { |
664 | struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); | 664 | struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); |
665 | 665 | ||
666 | dout("%s: rbdc %p\n", __func__, rbdc); | 666 | dout("%s: rbdc %p\n", __func__, rbdc); |
667 | spin_lock(&rbd_client_list_lock); | 667 | spin_lock(&rbd_client_list_lock); |
668 | list_del(&rbdc->node); | 668 | list_del(&rbdc->node); |
669 | spin_unlock(&rbd_client_list_lock); | 669 | spin_unlock(&rbd_client_list_lock); |
670 | 670 | ||
671 | ceph_destroy_client(rbdc->client); | 671 | ceph_destroy_client(rbdc->client); |
672 | kfree(rbdc); | 672 | kfree(rbdc); |
673 | } | 673 | } |
674 | 674 | ||
675 | /* Caller has to fill in snapc->seq and snapc->snaps[0..snap_count-1] */ | ||
676 | |||
677 | static struct ceph_snap_context *rbd_snap_context_create(u32 snap_count) | ||
678 | { | ||
679 | struct ceph_snap_context *snapc; | ||
680 | size_t size; | ||
681 | |||
682 | size = sizeof (struct ceph_snap_context); | ||
683 | size += snap_count * sizeof (snapc->snaps[0]); | ||
684 | snapc = kzalloc(size, GFP_KERNEL); | ||
685 | if (!snapc) | ||
686 | return NULL; | ||
687 | |||
688 | atomic_set(&snapc->nref, 1); | ||
689 | snapc->num_snaps = snap_count; | ||
690 | |||
691 | return snapc; | ||
692 | } | ||
693 | |||
694 | static inline void rbd_snap_context_get(struct ceph_snap_context *snapc) | ||
695 | { | ||
696 | (void)ceph_get_snap_context(snapc); | ||
697 | } | ||
698 | |||
699 | static inline void rbd_snap_context_put(struct ceph_snap_context *snapc) | ||
700 | { | ||
701 | ceph_put_snap_context(snapc); | ||
702 | } | ||
703 | |||
704 | /* | 675 | /* |
705 | * Drop reference to ceph client node. If it's not referenced anymore, release | 676 | * Drop reference to ceph client node. If it's not referenced anymore, release |
706 | * it. | 677 | * it. |
707 | */ | 678 | */ |
708 | static void rbd_put_client(struct rbd_client *rbdc) | 679 | static void rbd_put_client(struct rbd_client *rbdc) |
709 | { | 680 | { |
710 | if (rbdc) | 681 | if (rbdc) |
711 | kref_put(&rbdc->kref, rbd_client_release); | 682 | kref_put(&rbdc->kref, rbd_client_release); |
712 | } | 683 | } |
713 | 684 | ||
714 | static bool rbd_image_format_valid(u32 image_format) | 685 | static bool rbd_image_format_valid(u32 image_format) |
715 | { | 686 | { |
716 | return image_format == 1 || image_format == 2; | 687 | return image_format == 1 || image_format == 2; |
717 | } | 688 | } |
718 | 689 | ||
719 | static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) | 690 | static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) |
720 | { | 691 | { |
721 | size_t size; | 692 | size_t size; |
722 | u32 snap_count; | 693 | u32 snap_count; |
723 | 694 | ||
724 | /* The header has to start with the magic rbd header text */ | 695 | /* The header has to start with the magic rbd header text */ |
725 | if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) | 696 | if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) |
726 | return false; | 697 | return false; |
727 | 698 | ||
728 | /* The bio layer requires at least sector-sized I/O */ | 699 | /* The bio layer requires at least sector-sized I/O */ |
729 | 700 | ||
730 | if (ondisk->options.order < SECTOR_SHIFT) | 701 | if (ondisk->options.order < SECTOR_SHIFT) |
731 | return false; | 702 | return false; |
732 | 703 | ||
733 | /* If we use u64 in a few spots we may be able to loosen this */ | 704 | /* If we use u64 in a few spots we may be able to loosen this */ |
734 | 705 | ||
735 | if (ondisk->options.order > 8 * sizeof (int) - 1) | 706 | if (ondisk->options.order > 8 * sizeof (int) - 1) |
736 | return false; | 707 | return false; |
737 | 708 | ||
738 | /* | 709 | /* |
739 | * The size of a snapshot header has to fit in a size_t, and | 710 | * The size of a snapshot header has to fit in a size_t, and |
740 | * that limits the number of snapshots. | 711 | * that limits the number of snapshots. |
741 | */ | 712 | */ |
742 | snap_count = le32_to_cpu(ondisk->snap_count); | 713 | snap_count = le32_to_cpu(ondisk->snap_count); |
743 | size = SIZE_MAX - sizeof (struct ceph_snap_context); | 714 | size = SIZE_MAX - sizeof (struct ceph_snap_context); |
744 | if (snap_count > size / sizeof (__le64)) | 715 | if (snap_count > size / sizeof (__le64)) |
745 | return false; | 716 | return false; |
746 | 717 | ||
747 | /* | 718 | /* |
748 | * Not only that, but the size of the entire snapshot | 719 | * Not only that, but the size of the entire snapshot |
749 | * header must also be representable in a size_t. | 720 | * header must also be representable in a size_t. |
750 | */ | 721 | */ |
751 | size -= snap_count * sizeof (__le64); | 722 | size -= snap_count * sizeof (__le64); |
752 | if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) | 723 | if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) |
753 | return false; | 724 | return false; |
754 | 725 | ||
755 | return true; | 726 | return true; |
756 | } | 727 | } |
757 | 728 | ||
758 | /* | 729 | /* |
759 | * Create a new header structure, translate header format from the on-disk | 730 | * Create a new header structure, translate header format from the on-disk |
760 | * header. | 731 | * header. |
761 | */ | 732 | */ |
762 | static int rbd_header_from_disk(struct rbd_image_header *header, | 733 | static int rbd_header_from_disk(struct rbd_image_header *header, |
763 | struct rbd_image_header_ondisk *ondisk) | 734 | struct rbd_image_header_ondisk *ondisk) |
764 | { | 735 | { |
765 | u32 snap_count; | 736 | u32 snap_count; |
766 | size_t len; | 737 | size_t len; |
767 | size_t size; | 738 | size_t size; |
768 | u32 i; | 739 | u32 i; |
769 | 740 | ||
770 | memset(header, 0, sizeof (*header)); | 741 | memset(header, 0, sizeof (*header)); |
771 | 742 | ||
772 | snap_count = le32_to_cpu(ondisk->snap_count); | 743 | snap_count = le32_to_cpu(ondisk->snap_count); |
773 | 744 | ||
774 | len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix)); | 745 | len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix)); |
775 | header->object_prefix = kmalloc(len + 1, GFP_KERNEL); | 746 | header->object_prefix = kmalloc(len + 1, GFP_KERNEL); |
776 | if (!header->object_prefix) | 747 | if (!header->object_prefix) |
777 | return -ENOMEM; | 748 | return -ENOMEM; |
778 | memcpy(header->object_prefix, ondisk->object_prefix, len); | 749 | memcpy(header->object_prefix, ondisk->object_prefix, len); |
779 | header->object_prefix[len] = '\0'; | 750 | header->object_prefix[len] = '\0'; |
780 | 751 | ||
781 | if (snap_count) { | 752 | if (snap_count) { |
782 | u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); | 753 | u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); |
783 | 754 | ||
784 | /* Save a copy of the snapshot names */ | 755 | /* Save a copy of the snapshot names */ |
785 | 756 | ||
786 | if (snap_names_len > (u64) SIZE_MAX) | 757 | if (snap_names_len > (u64) SIZE_MAX) |
787 | return -EIO; | 758 | return -EIO; |
788 | header->snap_names = kmalloc(snap_names_len, GFP_KERNEL); | 759 | header->snap_names = kmalloc(snap_names_len, GFP_KERNEL); |
789 | if (!header->snap_names) | 760 | if (!header->snap_names) |
790 | goto out_err; | 761 | goto out_err; |
791 | /* | 762 | /* |
792 | * Note that rbd_dev_v1_header_read() guarantees | 763 | * Note that rbd_dev_v1_header_read() guarantees |
793 | * the ondisk buffer we're working with has | 764 | * the ondisk buffer we're working with has |
794 | * snap_names_len bytes beyond the end of the | 765 | * snap_names_len bytes beyond the end of the |
795 | * snapshot id array, this memcpy() is safe. | 766 | * snapshot id array, this memcpy() is safe. |
796 | */ | 767 | */ |
797 | memcpy(header->snap_names, &ondisk->snaps[snap_count], | 768 | memcpy(header->snap_names, &ondisk->snaps[snap_count], |
798 | snap_names_len); | 769 | snap_names_len); |
799 | 770 | ||
800 | /* Record each snapshot's size */ | 771 | /* Record each snapshot's size */ |
801 | 772 | ||
802 | size = snap_count * sizeof (*header->snap_sizes); | 773 | size = snap_count * sizeof (*header->snap_sizes); |
803 | header->snap_sizes = kmalloc(size, GFP_KERNEL); | 774 | header->snap_sizes = kmalloc(size, GFP_KERNEL); |
804 | if (!header->snap_sizes) | 775 | if (!header->snap_sizes) |
805 | goto out_err; | 776 | goto out_err; |
806 | for (i = 0; i < snap_count; i++) | 777 | for (i = 0; i < snap_count; i++) |
807 | header->snap_sizes[i] = | 778 | header->snap_sizes[i] = |
808 | le64_to_cpu(ondisk->snaps[i].image_size); | 779 | le64_to_cpu(ondisk->snaps[i].image_size); |
809 | } else { | 780 | } else { |
810 | header->snap_names = NULL; | 781 | header->snap_names = NULL; |
811 | header->snap_sizes = NULL; | 782 | header->snap_sizes = NULL; |
812 | } | 783 | } |
813 | 784 | ||
814 | header->features = 0; /* No features support in v1 images */ | 785 | header->features = 0; /* No features support in v1 images */ |
815 | header->obj_order = ondisk->options.order; | 786 | header->obj_order = ondisk->options.order; |
816 | header->crypt_type = ondisk->options.crypt_type; | 787 | header->crypt_type = ondisk->options.crypt_type; |
817 | header->comp_type = ondisk->options.comp_type; | 788 | header->comp_type = ondisk->options.comp_type; |
818 | 789 | ||
819 | /* Allocate and fill in the snapshot context */ | 790 | /* Allocate and fill in the snapshot context */ |
820 | 791 | ||
821 | header->image_size = le64_to_cpu(ondisk->image_size); | 792 | header->image_size = le64_to_cpu(ondisk->image_size); |
822 | 793 | ||
823 | header->snapc = rbd_snap_context_create(snap_count); | 794 | header->snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); |
824 | if (!header->snapc) | 795 | if (!header->snapc) |
825 | goto out_err; | 796 | goto out_err; |
826 | header->snapc->seq = le64_to_cpu(ondisk->snap_seq); | 797 | header->snapc->seq = le64_to_cpu(ondisk->snap_seq); |
827 | for (i = 0; i < snap_count; i++) | 798 | for (i = 0; i < snap_count; i++) |
828 | header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id); | 799 | header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id); |
829 | 800 | ||
830 | return 0; | 801 | return 0; |
831 | 802 | ||
832 | out_err: | 803 | out_err: |
833 | kfree(header->snap_sizes); | 804 | kfree(header->snap_sizes); |
834 | header->snap_sizes = NULL; | 805 | header->snap_sizes = NULL; |
835 | kfree(header->snap_names); | 806 | kfree(header->snap_names); |
836 | header->snap_names = NULL; | 807 | header->snap_names = NULL; |
837 | kfree(header->object_prefix); | 808 | kfree(header->object_prefix); |
838 | header->object_prefix = NULL; | 809 | header->object_prefix = NULL; |
839 | 810 | ||
840 | return -ENOMEM; | 811 | return -ENOMEM; |
841 | } | 812 | } |
842 | 813 | ||
843 | static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) | 814 | static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) |
844 | { | 815 | { |
845 | struct rbd_snap *snap; | 816 | struct rbd_snap *snap; |
846 | 817 | ||
847 | if (snap_id == CEPH_NOSNAP) | 818 | if (snap_id == CEPH_NOSNAP) |
848 | return RBD_SNAP_HEAD_NAME; | 819 | return RBD_SNAP_HEAD_NAME; |
849 | 820 | ||
850 | list_for_each_entry(snap, &rbd_dev->snaps, node) | 821 | list_for_each_entry(snap, &rbd_dev->snaps, node) |
851 | if (snap_id == snap->id) | 822 | if (snap_id == snap->id) |
852 | return snap->name; | 823 | return snap->name; |
853 | 824 | ||
854 | return NULL; | 825 | return NULL; |
855 | } | 826 | } |
856 | 827 | ||
857 | static struct rbd_snap *snap_by_name(struct rbd_device *rbd_dev, | 828 | static struct rbd_snap *snap_by_name(struct rbd_device *rbd_dev, |
858 | const char *snap_name) | 829 | const char *snap_name) |
859 | { | 830 | { |
860 | struct rbd_snap *snap; | 831 | struct rbd_snap *snap; |
861 | 832 | ||
862 | list_for_each_entry(snap, &rbd_dev->snaps, node) | 833 | list_for_each_entry(snap, &rbd_dev->snaps, node) |
863 | if (!strcmp(snap_name, snap->name)) | 834 | if (!strcmp(snap_name, snap->name)) |
864 | return snap; | 835 | return snap; |
865 | 836 | ||
866 | return NULL; | 837 | return NULL; |
867 | } | 838 | } |
868 | 839 | ||
869 | static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) | 840 | static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) |
870 | { | 841 | { |
871 | if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME, | 842 | if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME, |
872 | sizeof (RBD_SNAP_HEAD_NAME))) { | 843 | sizeof (RBD_SNAP_HEAD_NAME))) { |
873 | rbd_dev->mapping.size = rbd_dev->header.image_size; | 844 | rbd_dev->mapping.size = rbd_dev->header.image_size; |
874 | rbd_dev->mapping.features = rbd_dev->header.features; | 845 | rbd_dev->mapping.features = rbd_dev->header.features; |
875 | } else { | 846 | } else { |
876 | struct rbd_snap *snap; | 847 | struct rbd_snap *snap; |
877 | 848 | ||
878 | snap = snap_by_name(rbd_dev, rbd_dev->spec->snap_name); | 849 | snap = snap_by_name(rbd_dev, rbd_dev->spec->snap_name); |
879 | if (!snap) | 850 | if (!snap) |
880 | return -ENOENT; | 851 | return -ENOENT; |
881 | rbd_dev->mapping.size = snap->size; | 852 | rbd_dev->mapping.size = snap->size; |
882 | rbd_dev->mapping.features = snap->features; | 853 | rbd_dev->mapping.features = snap->features; |
883 | rbd_dev->mapping.read_only = true; | 854 | rbd_dev->mapping.read_only = true; |
884 | } | 855 | } |
885 | 856 | ||
886 | return 0; | 857 | return 0; |
887 | } | 858 | } |
888 | 859 | ||
889 | static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) | 860 | static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) |
890 | { | 861 | { |
891 | rbd_dev->mapping.size = 0; | 862 | rbd_dev->mapping.size = 0; |
892 | rbd_dev->mapping.features = 0; | 863 | rbd_dev->mapping.features = 0; |
893 | rbd_dev->mapping.read_only = true; | 864 | rbd_dev->mapping.read_only = true; |
894 | } | 865 | } |
895 | 866 | ||
896 | static void rbd_dev_clear_mapping(struct rbd_device *rbd_dev) | 867 | static void rbd_dev_clear_mapping(struct rbd_device *rbd_dev) |
897 | { | 868 | { |
898 | rbd_dev->mapping.size = 0; | 869 | rbd_dev->mapping.size = 0; |
899 | rbd_dev->mapping.features = 0; | 870 | rbd_dev->mapping.features = 0; |
900 | rbd_dev->mapping.read_only = true; | 871 | rbd_dev->mapping.read_only = true; |
901 | } | 872 | } |
902 | 873 | ||
903 | static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) | 874 | static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) |
904 | { | 875 | { |
905 | char *name; | 876 | char *name; |
906 | u64 segment; | 877 | u64 segment; |
907 | int ret; | 878 | int ret; |
908 | 879 | ||
909 | name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO); | 880 | name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO); |
910 | if (!name) | 881 | if (!name) |
911 | return NULL; | 882 | return NULL; |
912 | segment = offset >> rbd_dev->header.obj_order; | 883 | segment = offset >> rbd_dev->header.obj_order; |
913 | ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx", | 884 | ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx", |
914 | rbd_dev->header.object_prefix, segment); | 885 | rbd_dev->header.object_prefix, segment); |
915 | if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { | 886 | if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { |
916 | pr_err("error formatting segment name for #%llu (%d)\n", | 887 | pr_err("error formatting segment name for #%llu (%d)\n", |
917 | segment, ret); | 888 | segment, ret); |
918 | kfree(name); | 889 | kfree(name); |
919 | name = NULL; | 890 | name = NULL; |
920 | } | 891 | } |
921 | 892 | ||
922 | return name; | 893 | return name; |
923 | } | 894 | } |
924 | 895 | ||
925 | static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) | 896 | static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) |
926 | { | 897 | { |
927 | u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; | 898 | u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; |
928 | 899 | ||
929 | return offset & (segment_size - 1); | 900 | return offset & (segment_size - 1); |
930 | } | 901 | } |
931 | 902 | ||
932 | static u64 rbd_segment_length(struct rbd_device *rbd_dev, | 903 | static u64 rbd_segment_length(struct rbd_device *rbd_dev, |
933 | u64 offset, u64 length) | 904 | u64 offset, u64 length) |
934 | { | 905 | { |
935 | u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; | 906 | u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; |
936 | 907 | ||
937 | offset &= segment_size - 1; | 908 | offset &= segment_size - 1; |
938 | 909 | ||
939 | rbd_assert(length <= U64_MAX - offset); | 910 | rbd_assert(length <= U64_MAX - offset); |
940 | if (offset + length > segment_size) | 911 | if (offset + length > segment_size) |
941 | length = segment_size - offset; | 912 | length = segment_size - offset; |
942 | 913 | ||
943 | return length; | 914 | return length; |
944 | } | 915 | } |
945 | 916 | ||
946 | /* | 917 | /* |
947 | * returns the size of an object in the image | 918 | * returns the size of an object in the image |
948 | */ | 919 | */ |
949 | static u64 rbd_obj_bytes(struct rbd_image_header *header) | 920 | static u64 rbd_obj_bytes(struct rbd_image_header *header) |
950 | { | 921 | { |
951 | return 1 << header->obj_order; | 922 | return 1 << header->obj_order; |
952 | } | 923 | } |
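
The three segment helpers above are pure bit arithmetic on a power-of-two object size: the object number is the image offset shifted down by obj_order, the in-object offset is the low bits, and the length is clamped so one request piece never crosses an object boundary. A minimal standalone sketch of the same math, assuming a made-up obj_order of 22 (4 MiB objects) rather than a value read from a real image header:

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t obj_order = 22;				/* assumed: 4 MiB objects */
	uint64_t segment_size = (uint64_t)1 << obj_order;
	uint64_t offset = 5 * segment_size + 123456;	/* arbitrary image offset */
	uint64_t length = segment_size;			/* spans an object boundary */

	uint64_t segment = offset >> obj_order;		/* object number: 5 */
	uint64_t seg_off = offset & (segment_size - 1);	/* offset within it: 123456 */
	uint64_t seg_len = length;

	/* Clamp, exactly as rbd_segment_length() does, so this piece of
	 * the request stops at the end of object 5. */
	if (seg_off + seg_len > segment_size)
		seg_len = segment_size - seg_off;

	printf("segment %" PRIu64 " offset %" PRIu64 " length %" PRIu64 "\n",
	       segment, seg_off, seg_len);
	assert(seg_off + seg_len <= segment_size);
	return 0;
}
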
953 | 924 | ||
954 | /* | 925 | /* |
955 | * bio helpers | 926 | * bio helpers |
956 | */ | 927 | */ |
957 | 928 | ||
958 | static void bio_chain_put(struct bio *chain) | 929 | static void bio_chain_put(struct bio *chain) |
959 | { | 930 | { |
960 | struct bio *tmp; | 931 | struct bio *tmp; |
961 | 932 | ||
962 | while (chain) { | 933 | while (chain) { |
963 | tmp = chain; | 934 | tmp = chain; |
964 | chain = chain->bi_next; | 935 | chain = chain->bi_next; |
965 | bio_put(tmp); | 936 | bio_put(tmp); |
966 | } | 937 | } |
967 | } | 938 | } |
968 | 939 | ||
969 | /* | 940 | /* |
970 | * zeros a bio chain, starting at a specific offset | 941 | * zeros a bio chain, starting at a specific offset |
971 | */ | 942 | */ |
972 | static void zero_bio_chain(struct bio *chain, int start_ofs) | 943 | static void zero_bio_chain(struct bio *chain, int start_ofs) |
973 | { | 944 | { |
974 | struct bio_vec *bv; | 945 | struct bio_vec *bv; |
975 | unsigned long flags; | 946 | unsigned long flags; |
976 | void *buf; | 947 | void *buf; |
977 | int i; | 948 | int i; |
978 | int pos = 0; | 949 | int pos = 0; |
979 | 950 | ||
980 | while (chain) { | 951 | while (chain) { |
981 | bio_for_each_segment(bv, chain, i) { | 952 | bio_for_each_segment(bv, chain, i) { |
982 | if (pos + bv->bv_len > start_ofs) { | 953 | if (pos + bv->bv_len > start_ofs) { |
983 | int remainder = max(start_ofs - pos, 0); | 954 | int remainder = max(start_ofs - pos, 0); |
984 | buf = bvec_kmap_irq(bv, &flags); | 955 | buf = bvec_kmap_irq(bv, &flags); |
985 | memset(buf + remainder, 0, | 956 | memset(buf + remainder, 0, |
986 | bv->bv_len - remainder); | 957 | bv->bv_len - remainder); |
987 | bvec_kunmap_irq(buf, &flags); | 958 | bvec_kunmap_irq(buf, &flags); |
988 | } | 959 | } |
989 | pos += bv->bv_len; | 960 | pos += bv->bv_len; |
990 | } | 961 | } |
991 | 962 | ||
992 | chain = chain->bi_next; | 963 | chain = chain->bi_next; |
993 | } | 964 | } |
994 | } | 965 | } |
995 | 966 | ||
996 | /* | 967 | /* |
997 | * similar to zero_bio_chain(), zeros data defined by a page array, | 968 | * similar to zero_bio_chain(), zeros data defined by a page array, |
998 | * starting at the given byte offset from the start of the array and | 969 | * starting at the given byte offset from the start of the array and |
999 | * continuing up to the given end offset. The pages array is | 970 | * continuing up to the given end offset. The pages array is |
1000 | * assumed to be big enough to hold all bytes up to the end. | 971 | * assumed to be big enough to hold all bytes up to the end. |
1001 | */ | 972 | */ |
1002 | static void zero_pages(struct page **pages, u64 offset, u64 end) | 973 | static void zero_pages(struct page **pages, u64 offset, u64 end) |
1003 | { | 974 | { |
1004 | struct page **page = &pages[offset >> PAGE_SHIFT]; | 975 | struct page **page = &pages[offset >> PAGE_SHIFT]; |
1005 | 976 | ||
1006 | rbd_assert(end > offset); | 977 | rbd_assert(end > offset); |
1007 | rbd_assert(end - offset <= (u64)SIZE_MAX); | 978 | rbd_assert(end - offset <= (u64)SIZE_MAX); |
1008 | while (offset < end) { | 979 | while (offset < end) { |
1009 | size_t page_offset; | 980 | size_t page_offset; |
1010 | size_t length; | 981 | size_t length; |
1011 | unsigned long flags; | 982 | unsigned long flags; |
1012 | void *kaddr; | 983 | void *kaddr; |
1013 | 984 | ||
1014 | page_offset = (size_t)(offset & ~PAGE_MASK); | 985 | page_offset = (size_t)(offset & ~PAGE_MASK); |
1015 | length = min(PAGE_SIZE - page_offset, (size_t)(end - offset)); | 986 | length = min(PAGE_SIZE - page_offset, (size_t)(end - offset)); |
1016 | local_irq_save(flags); | 987 | local_irq_save(flags); |
1017 | kaddr = kmap_atomic(*page); | 988 | kaddr = kmap_atomic(*page); |
1018 | memset(kaddr + page_offset, 0, length); | 989 | memset(kaddr + page_offset, 0, length); |
1019 | kunmap_atomic(kaddr); | 990 | kunmap_atomic(kaddr); |
1020 | local_irq_restore(flags); | 991 | local_irq_restore(flags); |
1021 | 992 | ||
1022 | offset += length; | 993 | offset += length; |
1023 | page++; | 994 | page++; |
1024 | } | 995 | } |
1025 | } | 996 | } |
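
zero_bio_chain() and zero_pages() both walk a scatter of fixed-size pieces and zero everything from a starting byte offset onward. The per-page arithmetic in zero_pages() is the easier one to see in isolation; below is a userspace sketch of just that loop, with the page size pinned to an assumed 4096 and the byte range chosen arbitrarily for illustration:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define SK_PAGE_SIZE 4096ULL	/* assumed page size, illustration only */

int main(void)
{
	uint64_t offset = 5000;	/* zero the byte range [5000, 13000) */
	uint64_t end = 13000;

	while (offset < end) {
		uint64_t page = offset / SK_PAGE_SIZE;
		uint64_t page_offset = offset % SK_PAGE_SIZE;
		uint64_t length = SK_PAGE_SIZE - page_offset;

		if (length > end - offset)
			length = end - offset;
		/* zero_pages() kmaps pages[page] and memsets here. */
		printf("page %" PRIu64 ": zero %" PRIu64 " bytes at offset %" PRIu64 "\n",
		       page, length, page_offset);
		offset += length;
	}
	return 0;
}
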
1026 | 997 | ||
1027 | /* | 998 | /* |
1028 | * Clone a portion of a bio, starting at the given byte offset | 999 | * Clone a portion of a bio, starting at the given byte offset |
1029 | * and continuing for the number of bytes indicated. | 1000 | * and continuing for the number of bytes indicated. |
1030 | */ | 1001 | */ |
1031 | static struct bio *bio_clone_range(struct bio *bio_src, | 1002 | static struct bio *bio_clone_range(struct bio *bio_src, |
1032 | unsigned int offset, | 1003 | unsigned int offset, |
1033 | unsigned int len, | 1004 | unsigned int len, |
1034 | gfp_t gfpmask) | 1005 | gfp_t gfpmask) |
1035 | { | 1006 | { |
1036 | struct bio_vec *bv; | 1007 | struct bio_vec *bv; |
1037 | unsigned int resid; | 1008 | unsigned int resid; |
1038 | unsigned short idx; | 1009 | unsigned short idx; |
1039 | unsigned int voff; | 1010 | unsigned int voff; |
1040 | unsigned short end_idx; | 1011 | unsigned short end_idx; |
1041 | unsigned short vcnt; | 1012 | unsigned short vcnt; |
1042 | struct bio *bio; | 1013 | struct bio *bio; |
1043 | 1014 | ||
1044 | /* Handle the easy case for the caller */ | 1015 | /* Handle the easy case for the caller */ |
1045 | 1016 | ||
1046 | if (!offset && len == bio_src->bi_size) | 1017 | if (!offset && len == bio_src->bi_size) |
1047 | return bio_clone(bio_src, gfpmask); | 1018 | return bio_clone(bio_src, gfpmask); |
1048 | 1019 | ||
1049 | if (WARN_ON_ONCE(!len)) | 1020 | if (WARN_ON_ONCE(!len)) |
1050 | return NULL; | 1021 | return NULL; |
1051 | if (WARN_ON_ONCE(len > bio_src->bi_size)) | 1022 | if (WARN_ON_ONCE(len > bio_src->bi_size)) |
1052 | return NULL; | 1023 | return NULL; |
1053 | if (WARN_ON_ONCE(offset > bio_src->bi_size - len)) | 1024 | if (WARN_ON_ONCE(offset > bio_src->bi_size - len)) |
1054 | return NULL; | 1025 | return NULL; |
1055 | 1026 | ||
1056 | /* Find first affected segment... */ | 1027 | /* Find first affected segment... */ |
1057 | 1028 | ||
1058 | resid = offset; | 1029 | resid = offset; |
1059 | __bio_for_each_segment(bv, bio_src, idx, 0) { | 1030 | __bio_for_each_segment(bv, bio_src, idx, 0) { |
1060 | if (resid < bv->bv_len) | 1031 | if (resid < bv->bv_len) |
1061 | break; | 1032 | break; |
1062 | resid -= bv->bv_len; | 1033 | resid -= bv->bv_len; |
1063 | } | 1034 | } |
1064 | voff = resid; | 1035 | voff = resid; |
1065 | 1036 | ||
1066 | /* ...and the last affected segment */ | 1037 | /* ...and the last affected segment */ |
1067 | 1038 | ||
1068 | resid += len; | 1039 | resid += len; |
1069 | __bio_for_each_segment(bv, bio_src, end_idx, idx) { | 1040 | __bio_for_each_segment(bv, bio_src, end_idx, idx) { |
1070 | if (resid <= bv->bv_len) | 1041 | if (resid <= bv->bv_len) |
1071 | break; | 1042 | break; |
1072 | resid -= bv->bv_len; | 1043 | resid -= bv->bv_len; |
1073 | } | 1044 | } |
1074 | vcnt = end_idx - idx + 1; | 1045 | vcnt = end_idx - idx + 1; |
1075 | 1046 | ||
1076 | /* Build the clone */ | 1047 | /* Build the clone */ |
1077 | 1048 | ||
1078 | bio = bio_alloc(gfpmask, (unsigned int) vcnt); | 1049 | bio = bio_alloc(gfpmask, (unsigned int) vcnt); |
1079 | if (!bio) | 1050 | if (!bio) |
1080 | return NULL; /* ENOMEM */ | 1051 | return NULL; /* ENOMEM */ |
1081 | 1052 | ||
1082 | bio->bi_bdev = bio_src->bi_bdev; | 1053 | bio->bi_bdev = bio_src->bi_bdev; |
1083 | bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT); | 1054 | bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT); |
1084 | bio->bi_rw = bio_src->bi_rw; | 1055 | bio->bi_rw = bio_src->bi_rw; |
1085 | bio->bi_flags |= 1 << BIO_CLONED; | 1056 | bio->bi_flags |= 1 << BIO_CLONED; |
1086 | 1057 | ||
1087 | /* | 1058 | /* |
1088 | * Copy over our part of the bio_vec, then update the first | 1059 | * Copy over our part of the bio_vec, then update the first |
1089 | * and last (or only) entries. | 1060 | * and last (or only) entries. |
1090 | */ | 1061 | */ |
1091 | memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx], | 1062 | memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx], |
1092 | vcnt * sizeof (struct bio_vec)); | 1063 | vcnt * sizeof (struct bio_vec)); |
1093 | bio->bi_io_vec[0].bv_offset += voff; | 1064 | bio->bi_io_vec[0].bv_offset += voff; |
1094 | if (vcnt > 1) { | 1065 | if (vcnt > 1) { |
1095 | bio->bi_io_vec[0].bv_len -= voff; | 1066 | bio->bi_io_vec[0].bv_len -= voff; |
1096 | bio->bi_io_vec[vcnt - 1].bv_len = resid; | 1067 | bio->bi_io_vec[vcnt - 1].bv_len = resid; |
1097 | } else { | 1068 | } else { |
1098 | bio->bi_io_vec[0].bv_len = len; | 1069 | bio->bi_io_vec[0].bv_len = len; |
1099 | } | 1070 | } |
1100 | 1071 | ||
1101 | bio->bi_vcnt = vcnt; | 1072 | bio->bi_vcnt = vcnt; |
1102 | bio->bi_size = len; | 1073 | bio->bi_size = len; |
1103 | bio->bi_idx = 0; | 1074 | bio->bi_idx = 0; |
1104 | 1075 | ||
1105 | return bio; | 1076 | return bio; |
1106 | } | 1077 | } |
1107 | 1078 | ||
1108 | /* | 1079 | /* |
1109 | * Clone a portion of a bio chain, starting at the given byte offset | 1080 | * Clone a portion of a bio chain, starting at the given byte offset |
1110 | * into the first bio in the source chain and continuing for the | 1081 | * into the first bio in the source chain and continuing for the |
1111 | * number of bytes indicated. The result is another bio chain of | 1082 | * number of bytes indicated. The result is another bio chain of |
1112 | * exactly the given length, or a null pointer on error. | 1083 | * exactly the given length, or a null pointer on error. |
1113 | * | 1084 | * |
1114 | * The bio_src and offset parameters are both in-out. On entry they | 1085 | * The bio_src and offset parameters are both in-out. On entry they |
1115 | * refer to the first source bio and the offset into that bio where | 1086 | * refer to the first source bio and the offset into that bio where |
1116 | * the start of data to be cloned is located. | 1087 | * the start of data to be cloned is located. |
1117 | * | 1088 | * |
1118 | * On return, bio_src is updated to refer to the bio in the source | 1089 | * On return, bio_src is updated to refer to the bio in the source |
1119 | * chain that contains the first un-cloned byte, and *offset will | 1090 | * chain that contains the first un-cloned byte, and *offset will |
1120 | * contain the offset of that byte within that bio. | 1091 | * contain the offset of that byte within that bio. |
1121 | */ | 1092 | */ |
1122 | static struct bio *bio_chain_clone_range(struct bio **bio_src, | 1093 | static struct bio *bio_chain_clone_range(struct bio **bio_src, |
1123 | unsigned int *offset, | 1094 | unsigned int *offset, |
1124 | unsigned int len, | 1095 | unsigned int len, |
1125 | gfp_t gfpmask) | 1096 | gfp_t gfpmask) |
1126 | { | 1097 | { |
1127 | struct bio *bi = *bio_src; | 1098 | struct bio *bi = *bio_src; |
1128 | unsigned int off = *offset; | 1099 | unsigned int off = *offset; |
1129 | struct bio *chain = NULL; | 1100 | struct bio *chain = NULL; |
1130 | struct bio **end; | 1101 | struct bio **end; |
1131 | 1102 | ||
1132 | /* Build up a chain of clone bios up to the limit */ | 1103 | /* Build up a chain of clone bios up to the limit */ |
1133 | 1104 | ||
1134 | if (!bi || off >= bi->bi_size || !len) | 1105 | if (!bi || off >= bi->bi_size || !len) |
1135 | return NULL; /* Nothing to clone */ | 1106 | return NULL; /* Nothing to clone */ |
1136 | 1107 | ||
1137 | end = &chain; | 1108 | end = &chain; |
1138 | while (len) { | 1109 | while (len) { |
1139 | unsigned int bi_size; | 1110 | unsigned int bi_size; |
1140 | struct bio *bio; | 1111 | struct bio *bio; |
1141 | 1112 | ||
1142 | if (!bi) { | 1113 | if (!bi) { |
1143 | rbd_warn(NULL, "bio_chain exhausted with %u left", len); | 1114 | rbd_warn(NULL, "bio_chain exhausted with %u left", len); |
1144 | goto out_err; /* EINVAL; ran out of bios */ | 1115 | goto out_err; /* EINVAL; ran out of bios */ |
1145 | } | 1116 | } |
1146 | bi_size = min_t(unsigned int, bi->bi_size - off, len); | 1117 | bi_size = min_t(unsigned int, bi->bi_size - off, len); |
1147 | bio = bio_clone_range(bi, off, bi_size, gfpmask); | 1118 | bio = bio_clone_range(bi, off, bi_size, gfpmask); |
1148 | if (!bio) | 1119 | if (!bio) |
1149 | goto out_err; /* ENOMEM */ | 1120 | goto out_err; /* ENOMEM */ |
1150 | 1121 | ||
1151 | *end = bio; | 1122 | *end = bio; |
1152 | end = &bio->bi_next; | 1123 | end = &bio->bi_next; |
1153 | 1124 | ||
1154 | off += bi_size; | 1125 | off += bi_size; |
1155 | if (off == bi->bi_size) { | 1126 | if (off == bi->bi_size) { |
1156 | bi = bi->bi_next; | 1127 | bi = bi->bi_next; |
1157 | off = 0; | 1128 | off = 0; |
1158 | } | 1129 | } |
1159 | len -= bi_size; | 1130 | len -= bi_size; |
1160 | } | 1131 | } |
1161 | *bio_src = bi; | 1132 | *bio_src = bi; |
1162 | *offset = off; | 1133 | *offset = off; |
1163 | 1134 | ||
1164 | return chain; | 1135 | return chain; |
1165 | out_err: | 1136 | out_err: |
1166 | bio_chain_put(chain); | 1137 | bio_chain_put(chain); |
1167 | 1138 | ||
1168 | return NULL; | 1139 | return NULL; |
1169 | } | 1140 | } |
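
The chain-clone walk above keeps an in-out cursor (*bio_src, *offset) and peels off min(bytes left in this bio, bytes left to clone) per step, advancing to the next bio whenever the current one is fully consumed. A sketch of the same cursor arithmetic, with a plain array of segment lengths standing in for the bio chain (segment sizes and the requested range are invented example values):

#include <stdio.h>

int main(void)
{
	unsigned int seg_len[] = { 4096, 8192, 4096 };	/* three "bios" */
	unsigned int nsegs = 3;
	unsigned int idx = 0;		/* *bio_src analogue */
	unsigned int off = 1000;	/* *offset analogue */
	unsigned int len = 10000;	/* bytes to clone */

	while (len) {
		if (idx == nsegs) {
			fprintf(stderr, "chain exhausted with %u left\n", len);
			return 1;
		}
		unsigned int chunk = seg_len[idx] - off;
		if (chunk > len)
			chunk = len;
		printf("clone %u bytes from segment %u at offset %u\n",
		       chunk, idx, off);
		off += chunk;
		if (off == seg_len[idx]) {	/* segment fully consumed */
			idx++;
			off = 0;
		}
		len -= chunk;
	}
	/* The cursor now points at the first un-cloned byte. */
	printf("cursor: segment %u offset %u\n", idx, off);
	return 0;
}
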
1170 | 1141 | ||
1171 | /* | 1142 | /* |
1172 | * The default/initial value for all object request flags is 0. For | 1143 | * The default/initial value for all object request flags is 0. For |
1173 | * each flag, once its value is set to 1 it is never reset to 0 | 1144 | * each flag, once its value is set to 1 it is never reset to 0 |
1174 | * again. | 1145 | * again. |
1175 | */ | 1146 | */ |
1176 | static void obj_request_img_data_set(struct rbd_obj_request *obj_request) | 1147 | static void obj_request_img_data_set(struct rbd_obj_request *obj_request) |
1177 | { | 1148 | { |
1178 | if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { | 1149 | if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { |
1179 | struct rbd_device *rbd_dev; | 1150 | struct rbd_device *rbd_dev; |
1180 | 1151 | ||
1181 | rbd_dev = obj_request->img_request->rbd_dev; | 1152 | rbd_dev = obj_request->img_request->rbd_dev; |
1182 | rbd_warn(rbd_dev, "obj_request %p already marked img_data\n", | 1153 | rbd_warn(rbd_dev, "obj_request %p already marked img_data\n", |
1183 | obj_request); | 1154 | obj_request); |
1184 | } | 1155 | } |
1185 | } | 1156 | } |
1186 | 1157 | ||
1187 | static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) | 1158 | static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) |
1188 | { | 1159 | { |
1189 | smp_mb(); | 1160 | smp_mb(); |
1190 | return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; | 1161 | return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; |
1191 | } | 1162 | } |
1192 | 1163 | ||
1193 | static void obj_request_done_set(struct rbd_obj_request *obj_request) | 1164 | static void obj_request_done_set(struct rbd_obj_request *obj_request) |
1194 | { | 1165 | { |
1195 | if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { | 1166 | if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { |
1196 | struct rbd_device *rbd_dev = NULL; | 1167 | struct rbd_device *rbd_dev = NULL; |
1197 | 1168 | ||
1198 | if (obj_request_img_data_test(obj_request)) | 1169 | if (obj_request_img_data_test(obj_request)) |
1199 | rbd_dev = obj_request->img_request->rbd_dev; | 1170 | rbd_dev = obj_request->img_request->rbd_dev; |
1200 | rbd_warn(rbd_dev, "obj_request %p already marked done\n", | 1171 | rbd_warn(rbd_dev, "obj_request %p already marked done\n", |
1201 | obj_request); | 1172 | obj_request); |
1202 | } | 1173 | } |
1203 | } | 1174 | } |
1204 | 1175 | ||
1205 | static bool obj_request_done_test(struct rbd_obj_request *obj_request) | 1176 | static bool obj_request_done_test(struct rbd_obj_request *obj_request) |
1206 | { | 1177 | { |
1207 | smp_mb(); | 1178 | smp_mb(); |
1208 | return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; | 1179 | return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; |
1209 | } | 1180 | } |
1210 | 1181 | ||
1211 | /* | 1182 | /* |
1212 | * This sets the KNOWN flag after (possibly) setting the EXISTS | 1183 | * This sets the KNOWN flag after (possibly) setting the EXISTS |
1213 | * flag. The latter is set based on the "exists" value provided. | 1184 | * flag. The latter is set based on the "exists" value provided. |
1214 | * | 1185 | * |
1215 | * Note that for our purposes once an object exists it never goes | 1186 | * Note that for our purposes once an object exists it never goes |
1216 | * away again. It's possible that the responses from two existence | 1187 | * away again. It's possible that the responses from two existence |
1217 | * checks are separated by the creation of the target object, and | 1188 | * checks are separated by the creation of the target object, and |
1218 | * the first ("doesn't exist") response arrives *after* the second | 1189 | * the first ("doesn't exist") response arrives *after* the second |
1219 | * ("does exist"). In that case we ignore the second one. | 1190 | * ("does exist"). In that case we ignore the second one. |
1220 | */ | 1191 | */ |
1221 | static void obj_request_existence_set(struct rbd_obj_request *obj_request, | 1192 | static void obj_request_existence_set(struct rbd_obj_request *obj_request, |
1222 | bool exists) | 1193 | bool exists) |
1223 | { | 1194 | { |
1224 | if (exists) | 1195 | if (exists) |
1225 | set_bit(OBJ_REQ_EXISTS, &obj_request->flags); | 1196 | set_bit(OBJ_REQ_EXISTS, &obj_request->flags); |
1226 | set_bit(OBJ_REQ_KNOWN, &obj_request->flags); | 1197 | set_bit(OBJ_REQ_KNOWN, &obj_request->flags); |
1227 | smp_mb(); | 1198 | smp_mb(); |
1228 | } | 1199 | } |
1229 | 1200 | ||
1230 | static bool obj_request_known_test(struct rbd_obj_request *obj_request) | 1201 | static bool obj_request_known_test(struct rbd_obj_request *obj_request) |
1231 | { | 1202 | { |
1232 | smp_mb(); | 1203 | smp_mb(); |
1233 | return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0; | 1204 | return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0; |
1234 | } | 1205 | } |
1235 | 1206 | ||
1236 | static bool obj_request_exists_test(struct rbd_obj_request *obj_request) | 1207 | static bool obj_request_exists_test(struct rbd_obj_request *obj_request) |
1237 | { | 1208 | { |
1238 | smp_mb(); | 1209 | smp_mb(); |
1239 | return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0; | 1210 | return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0; |
1240 | } | 1211 | } |
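
These flag helpers implement a one-way protocol: each flag is set at most once via an atomic test-and-set (warning if it was already set) and is never cleared, which is what makes the reordered existence-check responses described above safe to resolve by simply ignoring the late "doesn't exist" answer. A userspace analogue of the pattern using C11 atomics rather than the kernel bitops API (the REQ_DONE name is invented for the sketch):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define REQ_DONE	(1UL << 0)	/* one-way flag bit */

static atomic_ulong flags;		/* starts at 0, like obj_request->flags */

static void req_done_set(void)
{
	/* atomic_fetch_or is the C11 analogue of test_and_set_bit(). */
	if (atomic_fetch_or(&flags, REQ_DONE) & REQ_DONE)
		fprintf(stderr, "request already marked done\n");
}

static bool req_done_test(void)
{
	/* A seq_cst load plays the role of the smp_mb() + test_bit() pair. */
	return atomic_load(&flags) & REQ_DONE;
}

int main(void)
{
	printf("done? %d\n", req_done_test());	/* 0 */
	req_done_set();
	printf("done? %d\n", req_done_test());	/* 1 */
	req_done_set();				/* warns: already set */
	return 0;
}
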
1241 | 1212 | ||
1242 | static void rbd_obj_request_get(struct rbd_obj_request *obj_request) | 1213 | static void rbd_obj_request_get(struct rbd_obj_request *obj_request) |
1243 | { | 1214 | { |
1244 | dout("%s: obj %p (was %d)\n", __func__, obj_request, | 1215 | dout("%s: obj %p (was %d)\n", __func__, obj_request, |
1245 | atomic_read(&obj_request->kref.refcount)); | 1216 | atomic_read(&obj_request->kref.refcount)); |
1246 | kref_get(&obj_request->kref); | 1217 | kref_get(&obj_request->kref); |
1247 | } | 1218 | } |
1248 | 1219 | ||
1249 | static void rbd_obj_request_destroy(struct kref *kref); | 1220 | static void rbd_obj_request_destroy(struct kref *kref); |
1250 | static void rbd_obj_request_put(struct rbd_obj_request *obj_request) | 1221 | static void rbd_obj_request_put(struct rbd_obj_request *obj_request) |
1251 | { | 1222 | { |
1252 | rbd_assert(obj_request != NULL); | 1223 | rbd_assert(obj_request != NULL); |
1253 | dout("%s: obj %p (was %d)\n", __func__, obj_request, | 1224 | dout("%s: obj %p (was %d)\n", __func__, obj_request, |
1254 | atomic_read(&obj_request->kref.refcount)); | 1225 | atomic_read(&obj_request->kref.refcount)); |
1255 | kref_put(&obj_request->kref, rbd_obj_request_destroy); | 1226 | kref_put(&obj_request->kref, rbd_obj_request_destroy); |
1256 | } | 1227 | } |
1257 | 1228 | ||
1258 | static void rbd_img_request_get(struct rbd_img_request *img_request) | 1229 | static void rbd_img_request_get(struct rbd_img_request *img_request) |
1259 | { | 1230 | { |
1260 | dout("%s: img %p (was %d)\n", __func__, img_request, | 1231 | dout("%s: img %p (was %d)\n", __func__, img_request, |
1261 | atomic_read(&img_request->kref.refcount)); | 1232 | atomic_read(&img_request->kref.refcount)); |
1262 | kref_get(&img_request->kref); | 1233 | kref_get(&img_request->kref); |
1263 | } | 1234 | } |
1264 | 1235 | ||
1265 | static void rbd_img_request_destroy(struct kref *kref); | 1236 | static void rbd_img_request_destroy(struct kref *kref); |
1266 | static void rbd_img_request_put(struct rbd_img_request *img_request) | 1237 | static void rbd_img_request_put(struct rbd_img_request *img_request) |
1267 | { | 1238 | { |
1268 | rbd_assert(img_request != NULL); | 1239 | rbd_assert(img_request != NULL); |
1269 | dout("%s: img %p (was %d)\n", __func__, img_request, | 1240 | dout("%s: img %p (was %d)\n", __func__, img_request, |
1270 | atomic_read(&img_request->kref.refcount)); | 1241 | atomic_read(&img_request->kref.refcount)); |
1271 | kref_put(&img_request->kref, rbd_img_request_destroy); | 1242 | kref_put(&img_request->kref, rbd_img_request_destroy); |
1272 | } | 1243 | } |
1273 | 1244 | ||
1274 | static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, | 1245 | static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, |
1275 | struct rbd_obj_request *obj_request) | 1246 | struct rbd_obj_request *obj_request) |
1276 | { | 1247 | { |
1277 | rbd_assert(obj_request->img_request == NULL); | 1248 | rbd_assert(obj_request->img_request == NULL); |
1278 | 1249 | ||
1279 | /* Image request now owns object's original reference */ | 1250 | /* Image request now owns object's original reference */ |
1280 | obj_request->img_request = img_request; | 1251 | obj_request->img_request = img_request; |
1281 | obj_request->which = img_request->obj_request_count; | 1252 | obj_request->which = img_request->obj_request_count; |
1282 | rbd_assert(!obj_request_img_data_test(obj_request)); | 1253 | rbd_assert(!obj_request_img_data_test(obj_request)); |
1283 | obj_request_img_data_set(obj_request); | 1254 | obj_request_img_data_set(obj_request); |
1284 | rbd_assert(obj_request->which != BAD_WHICH); | 1255 | rbd_assert(obj_request->which != BAD_WHICH); |
1285 | img_request->obj_request_count++; | 1256 | img_request->obj_request_count++; |
1286 | list_add_tail(&obj_request->links, &img_request->obj_requests); | 1257 | list_add_tail(&obj_request->links, &img_request->obj_requests); |
1287 | dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, | 1258 | dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, |
1288 | obj_request->which); | 1259 | obj_request->which); |
1289 | } | 1260 | } |
1290 | 1261 | ||
1291 | static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, | 1262 | static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, |
1292 | struct rbd_obj_request *obj_request) | 1263 | struct rbd_obj_request *obj_request) |
1293 | { | 1264 | { |
1294 | rbd_assert(obj_request->which != BAD_WHICH); | 1265 | rbd_assert(obj_request->which != BAD_WHICH); |
1295 | 1266 | ||
1296 | dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, | 1267 | dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, |
1297 | obj_request->which); | 1268 | obj_request->which); |
1298 | list_del(&obj_request->links); | 1269 | list_del(&obj_request->links); |
1299 | rbd_assert(img_request->obj_request_count > 0); | 1270 | rbd_assert(img_request->obj_request_count > 0); |
1300 | img_request->obj_request_count--; | 1271 | img_request->obj_request_count--; |
1301 | rbd_assert(obj_request->which == img_request->obj_request_count); | 1272 | rbd_assert(obj_request->which == img_request->obj_request_count); |
1302 | obj_request->which = BAD_WHICH; | 1273 | obj_request->which = BAD_WHICH; |
1303 | rbd_assert(obj_request_img_data_test(obj_request)); | 1274 | rbd_assert(obj_request_img_data_test(obj_request)); |
1304 | rbd_assert(obj_request->img_request == img_request); | 1275 | rbd_assert(obj_request->img_request == img_request); |
1305 | obj_request->img_request = NULL; | 1276 | obj_request->img_request = NULL; |
1306 | obj_request->callback = NULL; | 1277 | obj_request->callback = NULL; |
1307 | rbd_obj_request_put(obj_request); | 1278 | rbd_obj_request_put(obj_request); |
1308 | } | 1279 | } |
1309 | 1280 | ||
1310 | static bool obj_request_type_valid(enum obj_request_type type) | 1281 | static bool obj_request_type_valid(enum obj_request_type type) |
1311 | { | 1282 | { |
1312 | switch (type) { | 1283 | switch (type) { |
1313 | case OBJ_REQUEST_NODATA: | 1284 | case OBJ_REQUEST_NODATA: |
1314 | case OBJ_REQUEST_BIO: | 1285 | case OBJ_REQUEST_BIO: |
1315 | case OBJ_REQUEST_PAGES: | 1286 | case OBJ_REQUEST_PAGES: |
1316 | return true; | 1287 | return true; |
1317 | default: | 1288 | default: |
1318 | return false; | 1289 | return false; |
1319 | } | 1290 | } |
1320 | } | 1291 | } |
1321 | 1292 | ||
1322 | static int rbd_obj_request_submit(struct ceph_osd_client *osdc, | 1293 | static int rbd_obj_request_submit(struct ceph_osd_client *osdc, |
1323 | struct rbd_obj_request *obj_request) | 1294 | struct rbd_obj_request *obj_request) |
1324 | { | 1295 | { |
1325 | dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request); | 1296 | dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request); |
1326 | 1297 | ||
1327 | return ceph_osdc_start_request(osdc, obj_request->osd_req, false); | 1298 | return ceph_osdc_start_request(osdc, obj_request->osd_req, false); |
1328 | } | 1299 | } |
1329 | 1300 | ||
1330 | static void rbd_img_request_complete(struct rbd_img_request *img_request) | 1301 | static void rbd_img_request_complete(struct rbd_img_request *img_request) |
1331 | { | 1302 | { |
1332 | 1303 | ||
1333 | dout("%s: img %p\n", __func__, img_request); | 1304 | dout("%s: img %p\n", __func__, img_request); |
1334 | 1305 | ||
1335 | /* | 1306 | /* |
1336 | * If no error occurred, compute the aggregate transfer | 1307 | * If no error occurred, compute the aggregate transfer |
1337 | * count for the image request. We could instead use | 1308 | * count for the image request. We could instead use |
1338 | * atomic64_cmpxchg() to update it as each object request | 1309 | * atomic64_cmpxchg() to update it as each object request |
1339 | * completes; not clear which way is better offhand. | 1310 | * completes; not clear which way is better offhand. |
1340 | */ | 1311 | */ |
1341 | if (!img_request->result) { | 1312 | if (!img_request->result) { |
1342 | struct rbd_obj_request *obj_request; | 1313 | struct rbd_obj_request *obj_request; |
1343 | u64 xferred = 0; | 1314 | u64 xferred = 0; |
1344 | 1315 | ||
1345 | for_each_obj_request(img_request, obj_request) | 1316 | for_each_obj_request(img_request, obj_request) |
1346 | xferred += obj_request->xferred; | 1317 | xferred += obj_request->xferred; |
1347 | img_request->xferred = xferred; | 1318 | img_request->xferred = xferred; |
1348 | } | 1319 | } |
1349 | 1320 | ||
1350 | if (img_request->callback) | 1321 | if (img_request->callback) |
1351 | img_request->callback(img_request); | 1322 | img_request->callback(img_request); |
1352 | else | 1323 | else |
1353 | rbd_img_request_put(img_request); | 1324 | rbd_img_request_put(img_request); |
1354 | } | 1325 | } |
1355 | 1326 | ||
1356 | /* Caller is responsible for rbd_obj_request_destroy(obj_request) */ | 1327 | /* Caller is responsible for rbd_obj_request_destroy(obj_request) */ |
1357 | 1328 | ||
1358 | static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) | 1329 | static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) |
1359 | { | 1330 | { |
1360 | dout("%s: obj %p\n", __func__, obj_request); | 1331 | dout("%s: obj %p\n", __func__, obj_request); |
1361 | 1332 | ||
1362 | return wait_for_completion_interruptible(&obj_request->completion); | 1333 | return wait_for_completion_interruptible(&obj_request->completion); |
1363 | } | 1334 | } |
1364 | 1335 | ||
1365 | /* | 1336 | /* |
1366 | * The default/initial value for all image request flags is 0. Each | 1337 | * The default/initial value for all image request flags is 0. Each |
1367 | * is conditionally set to 1 at image request initialization time | 1338 | * is conditionally set to 1 at image request initialization time |
1368 | * and currently never changes thereafter. | 1339 | * and currently never changes thereafter. |
1369 | */ | 1340 | */ |
1370 | static void img_request_write_set(struct rbd_img_request *img_request) | 1341 | static void img_request_write_set(struct rbd_img_request *img_request) |
1371 | { | 1342 | { |
1372 | set_bit(IMG_REQ_WRITE, &img_request->flags); | 1343 | set_bit(IMG_REQ_WRITE, &img_request->flags); |
1373 | smp_mb(); | 1344 | smp_mb(); |
1374 | } | 1345 | } |
1375 | 1346 | ||
1376 | static bool img_request_write_test(struct rbd_img_request *img_request) | 1347 | static bool img_request_write_test(struct rbd_img_request *img_request) |
1377 | { | 1348 | { |
1378 | smp_mb(); | 1349 | smp_mb(); |
1379 | return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; | 1350 | return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; |
1380 | } | 1351 | } |
1381 | 1352 | ||
1382 | static void img_request_child_set(struct rbd_img_request *img_request) | 1353 | static void img_request_child_set(struct rbd_img_request *img_request) |
1383 | { | 1354 | { |
1384 | set_bit(IMG_REQ_CHILD, &img_request->flags); | 1355 | set_bit(IMG_REQ_CHILD, &img_request->flags); |
1385 | smp_mb(); | 1356 | smp_mb(); |
1386 | } | 1357 | } |
1387 | 1358 | ||
1388 | static bool img_request_child_test(struct rbd_img_request *img_request) | 1359 | static bool img_request_child_test(struct rbd_img_request *img_request) |
1389 | { | 1360 | { |
1390 | smp_mb(); | 1361 | smp_mb(); |
1391 | return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; | 1362 | return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; |
1392 | } | 1363 | } |
1393 | 1364 | ||
1394 | static void img_request_layered_set(struct rbd_img_request *img_request) | 1365 | static void img_request_layered_set(struct rbd_img_request *img_request) |
1395 | { | 1366 | { |
1396 | set_bit(IMG_REQ_LAYERED, &img_request->flags); | 1367 | set_bit(IMG_REQ_LAYERED, &img_request->flags); |
1397 | smp_mb(); | 1368 | smp_mb(); |
1398 | } | 1369 | } |
1399 | 1370 | ||
1400 | static bool img_request_layered_test(struct rbd_img_request *img_request) | 1371 | static bool img_request_layered_test(struct rbd_img_request *img_request) |
1401 | { | 1372 | { |
1402 | smp_mb(); | 1373 | smp_mb(); |
1403 | return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; | 1374 | return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; |
1404 | } | 1375 | } |
1405 | 1376 | ||
1406 | static void | 1377 | static void |
1407 | rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) | 1378 | rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) |
1408 | { | 1379 | { |
1409 | u64 xferred = obj_request->xferred; | 1380 | u64 xferred = obj_request->xferred; |
1410 | u64 length = obj_request->length; | 1381 | u64 length = obj_request->length; |
1411 | 1382 | ||
1412 | dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, | 1383 | dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, |
1413 | obj_request, obj_request->img_request, obj_request->result, | 1384 | obj_request, obj_request->img_request, obj_request->result, |
1414 | xferred, length); | 1385 | xferred, length); |
1415 | /* | 1386 | /* |
1416 | * ENOENT means a hole in the image. We zero-fill the | 1387 | * ENOENT means a hole in the image. We zero-fill the |
1417 | * entire length of the request. A short read also implies | 1388 | * entire length of the request. A short read also implies |
1418 | * zero-fill to the end of the request. Either way we | 1389 | * zero-fill to the end of the request. Either way we |
1419 | * update the xferred count to indicate the whole request | 1390 | * update the xferred count to indicate the whole request |
1420 | * was satisfied. | 1391 | * was satisfied. |
1421 | */ | 1392 | */ |
1422 | rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); | 1393 | rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); |
1423 | if (obj_request->result == -ENOENT) { | 1394 | if (obj_request->result == -ENOENT) { |
1424 | if (obj_request->type == OBJ_REQUEST_BIO) | 1395 | if (obj_request->type == OBJ_REQUEST_BIO) |
1425 | zero_bio_chain(obj_request->bio_list, 0); | 1396 | zero_bio_chain(obj_request->bio_list, 0); |
1426 | else | 1397 | else |
1427 | zero_pages(obj_request->pages, 0, length); | 1398 | zero_pages(obj_request->pages, 0, length); |
1428 | obj_request->result = 0; | 1399 | obj_request->result = 0; |
1429 | obj_request->xferred = length; | 1400 | obj_request->xferred = length; |
1430 | } else if (xferred < length && !obj_request->result) { | 1401 | } else if (xferred < length && !obj_request->result) { |
1431 | if (obj_request->type == OBJ_REQUEST_BIO) | 1402 | if (obj_request->type == OBJ_REQUEST_BIO) |
1432 | zero_bio_chain(obj_request->bio_list, xferred); | 1403 | zero_bio_chain(obj_request->bio_list, xferred); |
1433 | else | 1404 | else |
1434 | zero_pages(obj_request->pages, xferred, length); | 1405 | zero_pages(obj_request->pages, xferred, length); |
1435 | obj_request->xferred = length; | 1406 | obj_request->xferred = length; |
1436 | } | 1407 | } |
1437 | obj_request_done_set(obj_request); | 1408 | obj_request_done_set(obj_request); |
1438 | } | 1409 | } |
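
The read completion normalizes three outcomes to "the whole request was satisfied": a hole (-ENOENT) is zero-filled over the full length, a short read is zero-filled from xferred to the end, and a full read passes through; in the first two cases xferred is promoted to the request length. A sketch of the same decision applied to a plain buffer, with illustrative values:

#include <errno.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void fixup_read(char *buf, uint64_t length, uint64_t *xferred, int *result)
{
	if (*result == -ENOENT) {			/* hole: zero it all */
		memset(buf, 0, length);
		*result = 0;
		*xferred = length;
	} else if (*xferred < length && !*result) {	/* short read: zero the tail */
		memset(buf + *xferred, 0, length - *xferred);
		*xferred = length;
	}
}

int main(void)
{
	char buf[16] = "partially read!";
	uint64_t xferred = 9;	/* only 9 of 16 bytes arrived */
	int result = 0;

	fixup_read(buf, sizeof(buf), &xferred, &result);
	printf("xferred=%" PRIu64 " result=%d tail_zeroed=%d\n",
	       xferred, result, buf[12] == 0);
	return 0;
}
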
1439 | 1410 | ||
1440 | static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) | 1411 | static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) |
1441 | { | 1412 | { |
1442 | dout("%s: obj %p cb %p\n", __func__, obj_request, | 1413 | dout("%s: obj %p cb %p\n", __func__, obj_request, |
1443 | obj_request->callback); | 1414 | obj_request->callback); |
1444 | if (obj_request->callback) | 1415 | if (obj_request->callback) |
1445 | obj_request->callback(obj_request); | 1416 | obj_request->callback(obj_request); |
1446 | else | 1417 | else |
1447 | complete_all(&obj_request->completion); | 1418 | complete_all(&obj_request->completion); |
1448 | } | 1419 | } |
1449 | 1420 | ||
1450 | static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) | 1421 | static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) |
1451 | { | 1422 | { |
1452 | dout("%s: obj %p\n", __func__, obj_request); | 1423 | dout("%s: obj %p\n", __func__, obj_request); |
1453 | obj_request_done_set(obj_request); | 1424 | obj_request_done_set(obj_request); |
1454 | } | 1425 | } |
1455 | 1426 | ||
1456 | static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) | 1427 | static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) |
1457 | { | 1428 | { |
1458 | struct rbd_img_request *img_request = NULL; | 1429 | struct rbd_img_request *img_request = NULL; |
1459 | struct rbd_device *rbd_dev = NULL; | 1430 | struct rbd_device *rbd_dev = NULL; |
1460 | bool layered = false; | 1431 | bool layered = false; |
1461 | 1432 | ||
1462 | if (obj_request_img_data_test(obj_request)) { | 1433 | if (obj_request_img_data_test(obj_request)) { |
1463 | img_request = obj_request->img_request; | 1434 | img_request = obj_request->img_request; |
1464 | layered = img_request && img_request_layered_test(img_request); | 1435 | layered = img_request && img_request_layered_test(img_request); |
1465 | rbd_dev = img_request->rbd_dev; | 1436 | rbd_dev = img_request->rbd_dev; |
1466 | } | 1437 | } |
1467 | 1438 | ||
1468 | dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, | 1439 | dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, |
1469 | obj_request, img_request, obj_request->result, | 1440 | obj_request, img_request, obj_request->result, |
1470 | obj_request->xferred, obj_request->length); | 1441 | obj_request->xferred, obj_request->length); |
1471 | if (layered && obj_request->result == -ENOENT && | 1442 | if (layered && obj_request->result == -ENOENT && |
1472 | obj_request->img_offset < rbd_dev->parent_overlap) | 1443 | obj_request->img_offset < rbd_dev->parent_overlap) |
1473 | rbd_img_parent_read(obj_request); | 1444 | rbd_img_parent_read(obj_request); |
1474 | else if (img_request) | 1445 | else if (img_request) |
1475 | rbd_img_obj_request_read_callback(obj_request); | 1446 | rbd_img_obj_request_read_callback(obj_request); |
1476 | else | 1447 | else |
1477 | obj_request_done_set(obj_request); | 1448 | obj_request_done_set(obj_request); |
1478 | } | 1449 | } |
1479 | 1450 | ||
1480 | static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) | 1451 | static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) |
1481 | { | 1452 | { |
1482 | dout("%s: obj %p result %d %llu\n", __func__, obj_request, | 1453 | dout("%s: obj %p result %d %llu\n", __func__, obj_request, |
1483 | obj_request->result, obj_request->length); | 1454 | obj_request->result, obj_request->length); |
1484 | /* | 1455 | /* |
1485 | * There is no such thing as a successful short write. Set | 1456 | * There is no such thing as a successful short write. Set |
1486 | * it to our originally-requested length. | 1457 | * it to our originally-requested length. |
1487 | */ | 1458 | */ |
1488 | obj_request->xferred = obj_request->length; | 1459 | obj_request->xferred = obj_request->length; |
1489 | obj_request_done_set(obj_request); | 1460 | obj_request_done_set(obj_request); |
1490 | } | 1461 | } |
1491 | 1462 | ||
1492 | /* | 1463 | /* |
1493 | * For a simple stat call there's nothing to do. We'll do more if | 1464 | * For a simple stat call there's nothing to do. We'll do more if |
1494 | * this is part of a write sequence for a layered image. | 1465 | * this is part of a write sequence for a layered image. |
1495 | */ | 1466 | */ |
1496 | static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) | 1467 | static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) |
1497 | { | 1468 | { |
1498 | dout("%s: obj %p\n", __func__, obj_request); | 1469 | dout("%s: obj %p\n", __func__, obj_request); |
1499 | obj_request_done_set(obj_request); | 1470 | obj_request_done_set(obj_request); |
1500 | } | 1471 | } |
1501 | 1472 | ||
1502 | static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, | 1473 | static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, |
1503 | struct ceph_msg *msg) | 1474 | struct ceph_msg *msg) |
1504 | { | 1475 | { |
1505 | struct rbd_obj_request *obj_request = osd_req->r_priv; | 1476 | struct rbd_obj_request *obj_request = osd_req->r_priv; |
1506 | u16 opcode; | 1477 | u16 opcode; |
1507 | 1478 | ||
1508 | dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); | 1479 | dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); |
1509 | rbd_assert(osd_req == obj_request->osd_req); | 1480 | rbd_assert(osd_req == obj_request->osd_req); |
1510 | if (obj_request_img_data_test(obj_request)) { | 1481 | if (obj_request_img_data_test(obj_request)) { |
1511 | rbd_assert(obj_request->img_request); | 1482 | rbd_assert(obj_request->img_request); |
1512 | rbd_assert(obj_request->which != BAD_WHICH); | 1483 | rbd_assert(obj_request->which != BAD_WHICH); |
1513 | } else { | 1484 | } else { |
1514 | rbd_assert(obj_request->which == BAD_WHICH); | 1485 | rbd_assert(obj_request->which == BAD_WHICH); |
1515 | } | 1486 | } |
1516 | 1487 | ||
1517 | if (osd_req->r_result < 0) | 1488 | if (osd_req->r_result < 0) |
1518 | obj_request->result = osd_req->r_result; | 1489 | obj_request->result = osd_req->r_result; |
1519 | obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version); | 1490 | obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version); |
1520 | 1491 | ||
1521 | BUG_ON(osd_req->r_num_ops > 2); | 1492 | BUG_ON(osd_req->r_num_ops > 2); |
1522 | 1493 | ||
1523 | /* | 1494 | /* |
1524 | * We support a 64-bit length, but ultimately it has to be | 1495 | * We support a 64-bit length, but ultimately it has to be |
1525 | * passed to blk_end_request(), which takes an unsigned int. | 1496 | * passed to blk_end_request(), which takes an unsigned int. |
1526 | */ | 1497 | */ |
1527 | obj_request->xferred = osd_req->r_reply_op_len[0]; | 1498 | obj_request->xferred = osd_req->r_reply_op_len[0]; |
1528 | rbd_assert(obj_request->xferred < (u64)UINT_MAX); | 1499 | rbd_assert(obj_request->xferred < (u64)UINT_MAX); |
1529 | opcode = osd_req->r_ops[0].op; | 1500 | opcode = osd_req->r_ops[0].op; |
1530 | switch (opcode) { | 1501 | switch (opcode) { |
1531 | case CEPH_OSD_OP_READ: | 1502 | case CEPH_OSD_OP_READ: |
1532 | rbd_osd_read_callback(obj_request); | 1503 | rbd_osd_read_callback(obj_request); |
1533 | break; | 1504 | break; |
1534 | case CEPH_OSD_OP_WRITE: | 1505 | case CEPH_OSD_OP_WRITE: |
1535 | rbd_osd_write_callback(obj_request); | 1506 | rbd_osd_write_callback(obj_request); |
1536 | break; | 1507 | break; |
1537 | case CEPH_OSD_OP_STAT: | 1508 | case CEPH_OSD_OP_STAT: |
1538 | rbd_osd_stat_callback(obj_request); | 1509 | rbd_osd_stat_callback(obj_request); |
1539 | break; | 1510 | break; |
1540 | case CEPH_OSD_OP_CALL: | 1511 | case CEPH_OSD_OP_CALL: |
1541 | case CEPH_OSD_OP_NOTIFY_ACK: | 1512 | case CEPH_OSD_OP_NOTIFY_ACK: |
1542 | case CEPH_OSD_OP_WATCH: | 1513 | case CEPH_OSD_OP_WATCH: |
1543 | rbd_osd_trivial_callback(obj_request); | 1514 | rbd_osd_trivial_callback(obj_request); |
1544 | break; | 1515 | break; |
1545 | default: | 1516 | default: |
1546 | rbd_warn(NULL, "%s: unsupported op %hu\n", | 1517 | rbd_warn(NULL, "%s: unsupported op %hu\n", |
1547 | obj_request->object_name, (unsigned short) opcode); | 1518 | obj_request->object_name, (unsigned short) opcode); |
1548 | break; | 1519 | break; |
1549 | } | 1520 | } |
1550 | 1521 | ||
1551 | if (obj_request_done_test(obj_request)) | 1522 | if (obj_request_done_test(obj_request)) |
1552 | rbd_obj_request_complete(obj_request); | 1523 | rbd_obj_request_complete(obj_request); |
1553 | } | 1524 | } |
1554 | 1525 | ||
1555 | static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) | 1526 | static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) |
1556 | { | 1527 | { |
1557 | struct rbd_img_request *img_request = obj_request->img_request; | 1528 | struct rbd_img_request *img_request = obj_request->img_request; |
1558 | struct ceph_osd_request *osd_req = obj_request->osd_req; | 1529 | struct ceph_osd_request *osd_req = obj_request->osd_req; |
1559 | u64 snap_id; | 1530 | u64 snap_id; |
1560 | 1531 | ||
1561 | rbd_assert(osd_req != NULL); | 1532 | rbd_assert(osd_req != NULL); |
1562 | 1533 | ||
1563 | snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP; | 1534 | snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP; |
1564 | ceph_osdc_build_request(osd_req, obj_request->offset, | 1535 | ceph_osdc_build_request(osd_req, obj_request->offset, |
1565 | NULL, snap_id, NULL); | 1536 | NULL, snap_id, NULL); |
1566 | } | 1537 | } |
1567 | 1538 | ||
1568 | static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) | 1539 | static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) |
1569 | { | 1540 | { |
1570 | struct rbd_img_request *img_request = obj_request->img_request; | 1541 | struct rbd_img_request *img_request = obj_request->img_request; |
1571 | struct ceph_osd_request *osd_req = obj_request->osd_req; | 1542 | struct ceph_osd_request *osd_req = obj_request->osd_req; |
1572 | struct ceph_snap_context *snapc; | 1543 | struct ceph_snap_context *snapc; |
1573 | struct timespec mtime = CURRENT_TIME; | 1544 | struct timespec mtime = CURRENT_TIME; |
1574 | 1545 | ||
1575 | rbd_assert(osd_req != NULL); | 1546 | rbd_assert(osd_req != NULL); |
1576 | 1547 | ||
1577 | snapc = img_request ? img_request->snapc : NULL; | 1548 | snapc = img_request ? img_request->snapc : NULL; |
1578 | ceph_osdc_build_request(osd_req, obj_request->offset, | 1549 | ceph_osdc_build_request(osd_req, obj_request->offset, |
1579 | snapc, CEPH_NOSNAP, &mtime); | 1550 | snapc, CEPH_NOSNAP, &mtime); |
1580 | } | 1551 | } |
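
The two format helpers stamp a request differently by direction: a read is pinned to a single snapshot id (CEPH_NOSNAP meaning the writable head), while a write carries the whole snapshot context plus a modification time so the OSD knows which older snapshots to preserve before overwriting. A shape-only sketch of what each direction supplies; the field layout is simplified and the snapshot ids are invented example values:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define NOSNAP UINT64_MAX	/* stand-in for CEPH_NOSNAP: "the head" */

/* Simplified snap context: a sequence number plus snapshot ids,
 * newest first. Not the kernel struct, just its essential shape. */
struct snap_context {
	uint64_t seq;
	uint32_t num_snaps;
	uint64_t snaps[2];
};

int main(void)
{
	/* A read names exactly one snapshot to read from. */
	uint64_t read_snap_id = NOSNAP;

	/* A write carries the full context and an mtime. */
	struct snap_context write_snapc = {
		.seq = 7, .num_snaps = 2, .snaps = { 7, 3 },
	};
	time_t mtime = time(NULL);

	printf("read:  snap_id=%" PRIu64 "\n", read_snap_id);
	printf("write: seq=%" PRIu64 " snaps=%u mtime=%ld\n",
	       write_snapc.seq, write_snapc.num_snaps, (long)mtime);
	return 0;
}
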
1581 | 1552 | ||
1582 | static struct ceph_osd_request *rbd_osd_req_create( | 1553 | static struct ceph_osd_request *rbd_osd_req_create( |
1583 | struct rbd_device *rbd_dev, | 1554 | struct rbd_device *rbd_dev, |
1584 | bool write_request, | 1555 | bool write_request, |
1585 | struct rbd_obj_request *obj_request) | 1556 | struct rbd_obj_request *obj_request) |
1586 | { | 1557 | { |
1587 | struct ceph_snap_context *snapc = NULL; | 1558 | struct ceph_snap_context *snapc = NULL; |
1588 | struct ceph_osd_client *osdc; | 1559 | struct ceph_osd_client *osdc; |
1589 | struct ceph_osd_request *osd_req; | 1560 | struct ceph_osd_request *osd_req; |
1590 | 1561 | ||
1591 | if (obj_request_img_data_test(obj_request)) { | 1562 | if (obj_request_img_data_test(obj_request)) { |
1592 | struct rbd_img_request *img_request = obj_request->img_request; | 1563 | struct rbd_img_request *img_request = obj_request->img_request; |
1593 | 1564 | ||
1594 | rbd_assert(write_request == | 1565 | rbd_assert(write_request == |
1595 | img_request_write_test(img_request)); | 1566 | img_request_write_test(img_request)); |
1596 | if (write_request) | 1567 | if (write_request) |
1597 | snapc = img_request->snapc; | 1568 | snapc = img_request->snapc; |
1598 | } | 1569 | } |
1599 | 1570 | ||
1600 | /* Allocate and initialize the request, for the single op */ | 1571 | /* Allocate and initialize the request, for the single op */ |
1601 | 1572 | ||
1602 | osdc = &rbd_dev->rbd_client->client->osdc; | 1573 | osdc = &rbd_dev->rbd_client->client->osdc; |
1603 | osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC); | 1574 | osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC); |
1604 | if (!osd_req) | 1575 | if (!osd_req) |
1605 | return NULL; /* ENOMEM */ | 1576 | return NULL; /* ENOMEM */ |
1606 | 1577 | ||
1607 | if (write_request) | 1578 | if (write_request) |
1608 | osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; | 1579 | osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; |
1609 | else | 1580 | else |
1610 | osd_req->r_flags = CEPH_OSD_FLAG_READ; | 1581 | osd_req->r_flags = CEPH_OSD_FLAG_READ; |
1611 | 1582 | ||
1612 | osd_req->r_callback = rbd_osd_req_callback; | 1583 | osd_req->r_callback = rbd_osd_req_callback; |
1613 | osd_req->r_priv = obj_request; | 1584 | osd_req->r_priv = obj_request; |
1614 | 1585 | ||
1615 | osd_req->r_oid_len = strlen(obj_request->object_name); | 1586 | osd_req->r_oid_len = strlen(obj_request->object_name); |
1616 | rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); | 1587 | rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); |
1617 | memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); | 1588 | memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); |
1618 | 1589 | ||
1619 | osd_req->r_file_layout = rbd_dev->layout; /* struct */ | 1590 | osd_req->r_file_layout = rbd_dev->layout; /* struct */ |
1620 | 1591 | ||
1621 | return osd_req; | 1592 | return osd_req; |
1622 | } | 1593 | } |
1623 | 1594 | ||
1624 | /* | 1595 | /* |
1625 | * Create a copyup osd request based on the information in the | 1596 | * Create a copyup osd request based on the information in the |
1626 | * object request supplied. A copyup request has two osd ops, | 1597 | * object request supplied. A copyup request has two osd ops, |
1627 | * a copyup method call, and a "normal" write request. | 1598 | * a copyup method call, and a "normal" write request. |
1628 | */ | 1599 | */ |
1629 | static struct ceph_osd_request * | 1600 | static struct ceph_osd_request * |
1630 | rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) | 1601 | rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) |
1631 | { | 1602 | { |
1632 | struct rbd_img_request *img_request; | 1603 | struct rbd_img_request *img_request; |
1633 | struct ceph_snap_context *snapc; | 1604 | struct ceph_snap_context *snapc; |
1634 | struct rbd_device *rbd_dev; | 1605 | struct rbd_device *rbd_dev; |
1635 | struct ceph_osd_client *osdc; | 1606 | struct ceph_osd_client *osdc; |
1636 | struct ceph_osd_request *osd_req; | 1607 | struct ceph_osd_request *osd_req; |
1637 | 1608 | ||
1638 | rbd_assert(obj_request_img_data_test(obj_request)); | 1609 | rbd_assert(obj_request_img_data_test(obj_request)); |
1639 | img_request = obj_request->img_request; | 1610 | img_request = obj_request->img_request; |
1640 | rbd_assert(img_request); | 1611 | rbd_assert(img_request); |
1641 | rbd_assert(img_request_write_test(img_request)); | 1612 | rbd_assert(img_request_write_test(img_request)); |
1642 | 1613 | ||
1643 | /* Allocate and initialize the request, for the two ops */ | 1614 | /* Allocate and initialize the request, for the two ops */ |
1644 | 1615 | ||
1645 | snapc = img_request->snapc; | 1616 | snapc = img_request->snapc; |
1646 | rbd_dev = img_request->rbd_dev; | 1617 | rbd_dev = img_request->rbd_dev; |
1647 | osdc = &rbd_dev->rbd_client->client->osdc; | 1618 | osdc = &rbd_dev->rbd_client->client->osdc; |
1648 | osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC); | 1619 | osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC); |
1649 | if (!osd_req) | 1620 | if (!osd_req) |
1650 | return NULL; /* ENOMEM */ | 1621 | return NULL; /* ENOMEM */ |
1651 | 1622 | ||
1652 | osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; | 1623 | osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; |
1653 | osd_req->r_callback = rbd_osd_req_callback; | 1624 | osd_req->r_callback = rbd_osd_req_callback; |
1654 | osd_req->r_priv = obj_request; | 1625 | osd_req->r_priv = obj_request; |
1655 | 1626 | ||
1656 | osd_req->r_oid_len = strlen(obj_request->object_name); | 1627 | osd_req->r_oid_len = strlen(obj_request->object_name); |
1657 | rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); | 1628 | rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); |
1658 | memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); | 1629 | memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); |
1659 | 1630 | ||
1660 | osd_req->r_file_layout = rbd_dev->layout; /* struct */ | 1631 | osd_req->r_file_layout = rbd_dev->layout; /* struct */ |
1661 | 1632 | ||
1662 | return osd_req; | 1633 | return osd_req; |
1663 | } | 1634 | } |
1664 | 1635 | ||
1665 | 1636 | ||
1666 | static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) | 1637 | static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) |
1667 | { | 1638 | { |
1668 | ceph_osdc_put_request(osd_req); | 1639 | ceph_osdc_put_request(osd_req); |
1669 | } | 1640 | } |
1670 | 1641 | ||
1671 | /* object_name is assumed to be a non-null pointer and NUL-terminated */ | 1642 | /* object_name is assumed to be a non-null pointer and NUL-terminated */ |
1672 | 1643 | ||
1673 | static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, | 1644 | static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, |
1674 | u64 offset, u64 length, | 1645 | u64 offset, u64 length, |
1675 | enum obj_request_type type) | 1646 | enum obj_request_type type) |
1676 | { | 1647 | { |
1677 | struct rbd_obj_request *obj_request; | 1648 | struct rbd_obj_request *obj_request; |
1678 | size_t size; | 1649 | size_t size; |
1679 | char *name; | 1650 | char *name; |
1680 | 1651 | ||
1681 | rbd_assert(obj_request_type_valid(type)); | 1652 | rbd_assert(obj_request_type_valid(type)); |
1682 | 1653 | ||
1683 | size = strlen(object_name) + 1; | 1654 | size = strlen(object_name) + 1; |
1684 | obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL); | 1655 | obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL); |
1685 | if (!obj_request) | 1656 | if (!obj_request) |
1686 | return NULL; | 1657 | return NULL; |
1687 | 1658 | ||
1688 | name = (char *)(obj_request + 1); | 1659 | name = (char *)(obj_request + 1); |
1689 | obj_request->object_name = memcpy(name, object_name, size); | 1660 | obj_request->object_name = memcpy(name, object_name, size); |
1690 | obj_request->offset = offset; | 1661 | obj_request->offset = offset; |
1691 | obj_request->length = length; | 1662 | obj_request->length = length; |
1692 | obj_request->flags = 0; | 1663 | obj_request->flags = 0; |
1693 | obj_request->which = BAD_WHICH; | 1664 | obj_request->which = BAD_WHICH; |
1694 | obj_request->type = type; | 1665 | obj_request->type = type; |
1695 | INIT_LIST_HEAD(&obj_request->links); | 1666 | INIT_LIST_HEAD(&obj_request->links); |
1696 | init_completion(&obj_request->completion); | 1667 | init_completion(&obj_request->completion); |
1697 | kref_init(&obj_request->kref); | 1668 | kref_init(&obj_request->kref); |
1698 | 1669 | ||
1699 | dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name, | 1670 | dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name, |
1700 | offset, length, (int)type, obj_request); | 1671 | offset, length, (int)type, obj_request); |
1701 | 1672 | ||
1702 | return obj_request; | 1673 | return obj_request; |
1703 | } | 1674 | } |
1704 | 1675 | ||
1705 | static void rbd_obj_request_destroy(struct kref *kref) | 1676 | static void rbd_obj_request_destroy(struct kref *kref) |
1706 | { | 1677 | { |
1707 | struct rbd_obj_request *obj_request; | 1678 | struct rbd_obj_request *obj_request; |
1708 | 1679 | ||
1709 | obj_request = container_of(kref, struct rbd_obj_request, kref); | 1680 | obj_request = container_of(kref, struct rbd_obj_request, kref); |
1710 | 1681 | ||
1711 | dout("%s: obj %p\n", __func__, obj_request); | 1682 | dout("%s: obj %p\n", __func__, obj_request); |
1712 | 1683 | ||
1713 | rbd_assert(obj_request->img_request == NULL); | 1684 | rbd_assert(obj_request->img_request == NULL); |
1714 | rbd_assert(obj_request->which == BAD_WHICH); | 1685 | rbd_assert(obj_request->which == BAD_WHICH); |
1715 | 1686 | ||
1716 | if (obj_request->osd_req) | 1687 | if (obj_request->osd_req) |
1717 | rbd_osd_req_destroy(obj_request->osd_req); | 1688 | rbd_osd_req_destroy(obj_request->osd_req); |
1718 | 1689 | ||
1719 | rbd_assert(obj_request_type_valid(obj_request->type)); | 1690 | rbd_assert(obj_request_type_valid(obj_request->type)); |
1720 | switch (obj_request->type) { | 1691 | switch (obj_request->type) { |
1721 | case OBJ_REQUEST_NODATA: | 1692 | case OBJ_REQUEST_NODATA: |
1722 | break; /* Nothing to do */ | 1693 | break; /* Nothing to do */ |
1723 | case OBJ_REQUEST_BIO: | 1694 | case OBJ_REQUEST_BIO: |
1724 | if (obj_request->bio_list) | 1695 | if (obj_request->bio_list) |
1725 | bio_chain_put(obj_request->bio_list); | 1696 | bio_chain_put(obj_request->bio_list); |
1726 | break; | 1697 | break; |
1727 | case OBJ_REQUEST_PAGES: | 1698 | case OBJ_REQUEST_PAGES: |
1728 | if (obj_request->pages) | 1699 | if (obj_request->pages) |
1729 | ceph_release_page_vector(obj_request->pages, | 1700 | ceph_release_page_vector(obj_request->pages, |
1730 | obj_request->page_count); | 1701 | obj_request->page_count); |
1731 | break; | 1702 | break; |
1732 | } | 1703 | } |
1733 | 1704 | ||
1734 | kfree(obj_request); | 1705 | kfree(obj_request); |
1735 | } | 1706 | } |
1736 | 1707 | ||
1737 | /* | 1708 | /* |
1738 | * Caller is responsible for filling in the list of object requests | 1709 | * Caller is responsible for filling in the list of object requests |
1739 | * that comprises the image request, and the Linux request pointer | 1710 | * that comprises the image request, and the Linux request pointer |
1740 | * (if there is one). | 1711 | * (if there is one). |
1741 | */ | 1712 | */ |
1742 | static struct rbd_img_request *rbd_img_request_create( | 1713 | static struct rbd_img_request *rbd_img_request_create( |
1743 | struct rbd_device *rbd_dev, | 1714 | struct rbd_device *rbd_dev, |
1744 | u64 offset, u64 length, | 1715 | u64 offset, u64 length, |
1745 | bool write_request, | 1716 | bool write_request, |
1746 | bool child_request) | 1717 | bool child_request) |
1747 | { | 1718 | { |
1748 | struct rbd_img_request *img_request; | 1719 | struct rbd_img_request *img_request; |
1749 | 1720 | ||
1750 | img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC); | 1721 | img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC); |
1751 | if (!img_request) | 1722 | if (!img_request) |
1752 | return NULL; | 1723 | return NULL; |
1753 | 1724 | ||
1754 | if (write_request) { | 1725 | if (write_request) { |
1755 | down_read(&rbd_dev->header_rwsem); | 1726 | down_read(&rbd_dev->header_rwsem); |
1756 | rbd_snap_context_get(rbd_dev->header.snapc); | 1727 | ceph_get_snap_context(rbd_dev->header.snapc); |
1757 | up_read(&rbd_dev->header_rwsem); | 1728 | up_read(&rbd_dev->header_rwsem); |
1758 | } | 1729 | } |
1759 | 1730 | ||
1760 | img_request->rq = NULL; | 1731 | img_request->rq = NULL; |
1761 | img_request->rbd_dev = rbd_dev; | 1732 | img_request->rbd_dev = rbd_dev; |
1762 | img_request->offset = offset; | 1733 | img_request->offset = offset; |
1763 | img_request->length = length; | 1734 | img_request->length = length; |
1764 | img_request->flags = 0; | 1735 | img_request->flags = 0; |
1765 | if (write_request) { | 1736 | if (write_request) { |
1766 | img_request_write_set(img_request); | 1737 | img_request_write_set(img_request); |
1767 | img_request->snapc = rbd_dev->header.snapc; | 1738 | img_request->snapc = rbd_dev->header.snapc; |
1768 | } else { | 1739 | } else { |
1769 | img_request->snap_id = rbd_dev->spec->snap_id; | 1740 | img_request->snap_id = rbd_dev->spec->snap_id; |
1770 | } | 1741 | } |
1771 | if (child_request) | 1742 | if (child_request) |
1772 | img_request_child_set(img_request); | 1743 | img_request_child_set(img_request); |
1773 | if (rbd_dev->parent_spec) | 1744 | if (rbd_dev->parent_spec) |
1774 | img_request_layered_set(img_request); | 1745 | img_request_layered_set(img_request); |
1775 | spin_lock_init(&img_request->completion_lock); | 1746 | spin_lock_init(&img_request->completion_lock); |
1776 | img_request->next_completion = 0; | 1747 | img_request->next_completion = 0; |
1777 | img_request->callback = NULL; | 1748 | img_request->callback = NULL; |
1778 | img_request->result = 0; | 1749 | img_request->result = 0; |
1779 | img_request->obj_request_count = 0; | 1750 | img_request->obj_request_count = 0; |
1780 | INIT_LIST_HEAD(&img_request->obj_requests); | 1751 | INIT_LIST_HEAD(&img_request->obj_requests); |
1781 | kref_init(&img_request->kref); | 1752 | kref_init(&img_request->kref); |
1782 | 1753 | ||
1783 | rbd_img_request_get(img_request); /* Avoid a warning */ | 1754 | rbd_img_request_get(img_request); /* Avoid a warning */ |
1784 | rbd_img_request_put(img_request); /* TEMPORARY */ | 1755 | rbd_img_request_put(img_request); /* TEMPORARY */ |
1785 | 1756 | ||
1786 | dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, | 1757 | dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, |
1787 | write_request ? "write" : "read", offset, length, | 1758 | write_request ? "write" : "read", offset, length, |
1788 | img_request); | 1759 | img_request); |
1789 | 1760 | ||
1790 | return img_request; | 1761 | return img_request; |
1791 | } | 1762 | } |
1792 | 1763 | ||
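This hunk is half of what the commit changes: for writes, the snap context is now pinned with the libceph helper ceph_get_snap_context() while header_rwsem is held, replacing the driver-local rbd_snap_context_get(). For reference, a minimal sketch of the libceph snap-context helpers being adopted, written from their expected behavior rather than copied from net/ceph/snapshot.c (the nref/seq/num_snaps field layout is assumed):

	struct ceph_snap_context {
		atomic_t nref;
		u64 seq;
		u32 num_snaps;
		u64 snaps[];		/* snapids, descending */
	};

	struct ceph_snap_context *ceph_create_snap_context(u32 snap_count,
							   gfp_t gfp_flags)
	{
		struct ceph_snap_context *snapc;
		size_t size;

		size = sizeof (struct ceph_snap_context);
		size += snap_count * sizeof (u64);
		snapc = kzalloc(size, gfp_flags);
		if (!snapc)
			return NULL;

		atomic_set(&snapc->nref, 1);	/* caller holds one ref */
		snapc->num_snaps = snap_count;

		return snapc;
	}

	struct ceph_snap_context *ceph_get_snap_context(struct ceph_snap_context *sc)
	{
		if (sc)
			atomic_inc(&sc->nref);
		return sc;
	}

	void ceph_put_snap_context(struct ceph_snap_context *sc)
	{
		if (!sc)
			return;
		if (atomic_dec_and_test(&sc->nref))
			kfree(sc);
	}

The reference taken here is dropped by the matching ceph_put_snap_context() call in rbd_img_request_destroy() below, so the context outlives any concurrent header refresh for as long as the write image request does.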
1793 | static void rbd_img_request_destroy(struct kref *kref) | 1764 | static void rbd_img_request_destroy(struct kref *kref) |
1794 | { | 1765 | { |
1795 | struct rbd_img_request *img_request; | 1766 | struct rbd_img_request *img_request; |
1796 | struct rbd_obj_request *obj_request; | 1767 | struct rbd_obj_request *obj_request; |
1797 | struct rbd_obj_request *next_obj_request; | 1768 | struct rbd_obj_request *next_obj_request; |
1798 | 1769 | ||
1799 | img_request = container_of(kref, struct rbd_img_request, kref); | 1770 | img_request = container_of(kref, struct rbd_img_request, kref); |
1800 | 1771 | ||
1801 | dout("%s: img %p\n", __func__, img_request); | 1772 | dout("%s: img %p\n", __func__, img_request); |
1802 | 1773 | ||
1803 | for_each_obj_request_safe(img_request, obj_request, next_obj_request) | 1774 | for_each_obj_request_safe(img_request, obj_request, next_obj_request) |
1804 | rbd_img_obj_request_del(img_request, obj_request); | 1775 | rbd_img_obj_request_del(img_request, obj_request); |
1805 | rbd_assert(img_request->obj_request_count == 0); | 1776 | rbd_assert(img_request->obj_request_count == 0); |
1806 | 1777 | ||
1807 | if (img_request_write_test(img_request)) | 1778 | if (img_request_write_test(img_request)) |
1808 | rbd_snap_context_put(img_request->snapc); | 1779 | ceph_put_snap_context(img_request->snapc); |
1809 | 1780 | ||
1810 | if (img_request_child_test(img_request)) | 1781 | if (img_request_child_test(img_request)) |
1811 | rbd_obj_request_put(img_request->obj_request); | 1782 | rbd_obj_request_put(img_request->obj_request); |
1812 | 1783 | ||
1813 | kfree(img_request); | 1784 | kfree(img_request); |
1814 | } | 1785 | } |
1815 | 1786 | ||
1816 | static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) | 1787 | static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) |
1817 | { | 1788 | { |
1818 | struct rbd_img_request *img_request; | 1789 | struct rbd_img_request *img_request; |
1819 | unsigned int xferred; | 1790 | unsigned int xferred; |
1820 | int result; | 1791 | int result; |
1821 | bool more; | 1792 | bool more; |
1822 | 1793 | ||
1823 | rbd_assert(obj_request_img_data_test(obj_request)); | 1794 | rbd_assert(obj_request_img_data_test(obj_request)); |
1824 | img_request = obj_request->img_request; | 1795 | img_request = obj_request->img_request; |
1825 | 1796 | ||
1826 | rbd_assert(obj_request->xferred <= (u64)UINT_MAX); | 1797 | rbd_assert(obj_request->xferred <= (u64)UINT_MAX); |
1827 | xferred = (unsigned int)obj_request->xferred; | 1798 | xferred = (unsigned int)obj_request->xferred; |
1828 | result = obj_request->result; | 1799 | result = obj_request->result; |
1829 | if (result) { | 1800 | if (result) { |
1830 | struct rbd_device *rbd_dev = img_request->rbd_dev; | 1801 | struct rbd_device *rbd_dev = img_request->rbd_dev; |
1831 | 1802 | ||
1832 | rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n", | 1803 | rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n", |
1833 | img_request_write_test(img_request) ? "write" : "read", | 1804 | img_request_write_test(img_request) ? "write" : "read", |
1834 | obj_request->length, obj_request->img_offset, | 1805 | obj_request->length, obj_request->img_offset, |
1835 | obj_request->offset); | 1806 | obj_request->offset); |
1836 | rbd_warn(rbd_dev, " result %d xferred %x\n", | 1807 | rbd_warn(rbd_dev, " result %d xferred %x\n", |
1837 | result, xferred); | 1808 | result, xferred); |
1838 | if (!img_request->result) | 1809 | if (!img_request->result) |
1839 | img_request->result = result; | 1810 | img_request->result = result; |
1840 | } | 1811 | } |
1841 | 1812 | ||
1842 | /* Image object requests don't own their page array */ | 1813 | /* Image object requests don't own their page array */ |
1843 | 1814 | ||
1844 | if (obj_request->type == OBJ_REQUEST_PAGES) { | 1815 | if (obj_request->type == OBJ_REQUEST_PAGES) { |
1845 | obj_request->pages = NULL; | 1816 | obj_request->pages = NULL; |
1846 | obj_request->page_count = 0; | 1817 | obj_request->page_count = 0; |
1847 | } | 1818 | } |
1848 | 1819 | ||
1849 | if (img_request_child_test(img_request)) { | 1820 | if (img_request_child_test(img_request)) { |
1850 | rbd_assert(img_request->obj_request != NULL); | 1821 | rbd_assert(img_request->obj_request != NULL); |
1851 | more = obj_request->which < img_request->obj_request_count - 1; | 1822 | more = obj_request->which < img_request->obj_request_count - 1; |
1852 | } else { | 1823 | } else { |
1853 | rbd_assert(img_request->rq != NULL); | 1824 | rbd_assert(img_request->rq != NULL); |
1854 | more = blk_end_request(img_request->rq, result, xferred); | 1825 | more = blk_end_request(img_request->rq, result, xferred); |
1855 | } | 1826 | } |
1856 | 1827 | ||
1857 | return more; | 1828 | return more; |
1858 | } | 1829 | } |
1859 | 1830 | ||
1860 | static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) | 1831 | static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) |
1861 | { | 1832 | { |
1862 | struct rbd_img_request *img_request; | 1833 | struct rbd_img_request *img_request; |
1863 | u32 which = obj_request->which; | 1834 | u32 which = obj_request->which; |
1864 | bool more = true; | 1835 | bool more = true; |
1865 | 1836 | ||
1866 | rbd_assert(obj_request_img_data_test(obj_request)); | 1837 | rbd_assert(obj_request_img_data_test(obj_request)); |
1867 | img_request = obj_request->img_request; | 1838 | img_request = obj_request->img_request; |
1868 | 1839 | ||
1869 | dout("%s: img %p obj %p\n", __func__, img_request, obj_request); | 1840 | dout("%s: img %p obj %p\n", __func__, img_request, obj_request); |
1870 | rbd_assert(img_request != NULL); | 1841 | rbd_assert(img_request != NULL); |
1871 | rbd_assert(img_request->obj_request_count > 0); | 1842 | rbd_assert(img_request->obj_request_count > 0); |
1872 | rbd_assert(which != BAD_WHICH); | 1843 | rbd_assert(which != BAD_WHICH); |
1873 | rbd_assert(which < img_request->obj_request_count); | 1844 | rbd_assert(which < img_request->obj_request_count); |
1874 | rbd_assert(which >= img_request->next_completion); | 1845 | rbd_assert(which >= img_request->next_completion); |
1875 | 1846 | ||
1876 | spin_lock_irq(&img_request->completion_lock); | 1847 | spin_lock_irq(&img_request->completion_lock); |
1877 | if (which != img_request->next_completion) | 1848 | if (which != img_request->next_completion) |
1878 | goto out; | 1849 | goto out; |
1879 | 1850 | ||
1880 | for_each_obj_request_from(img_request, obj_request) { | 1851 | for_each_obj_request_from(img_request, obj_request) { |
1881 | rbd_assert(more); | 1852 | rbd_assert(more); |
1882 | rbd_assert(which < img_request->obj_request_count); | 1853 | rbd_assert(which < img_request->obj_request_count); |
1883 | 1854 | ||
1884 | if (!obj_request_done_test(obj_request)) | 1855 | if (!obj_request_done_test(obj_request)) |
1885 | break; | 1856 | break; |
1886 | more = rbd_img_obj_end_request(obj_request); | 1857 | more = rbd_img_obj_end_request(obj_request); |
1887 | which++; | 1858 | which++; |
1888 | } | 1859 | } |
1889 | 1860 | ||
1890 | rbd_assert(more ^ (which == img_request->obj_request_count)); | 1861 | rbd_assert(more ^ (which == img_request->obj_request_count)); |
1891 | img_request->next_completion = which; | 1862 | img_request->next_completion = which; |
1892 | out: | 1863 | out: |
1893 | spin_unlock_irq(&img_request->completion_lock); | 1864 | spin_unlock_irq(&img_request->completion_lock); |
1894 | 1865 | ||
1895 | if (!more) | 1866 | if (!more) |
1896 | rbd_img_request_complete(img_request); | 1867 | rbd_img_request_complete(img_request); |
1897 | } | 1868 | } |
1898 | 1869 | ||
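rbd_img_obj_callback() retires object requests strictly in index order: a completion that arrives out of order only marks its request done, and the request sitting at next_completion later sweeps the window forward past every consecutively finished entry. A stripped-down sketch of that pattern with hypothetical types (the driver tracks doneness per object request rather than in a bitmap):

	struct completion_window {
		spinlock_t lock;
		u32 next;		/* lowest index not yet retired */
		u32 count;		/* total number of entries */
		unsigned long *done;	/* bitmap: entry i has completed */
	};

	static void retire_in_order(struct completion_window *w, u32 which)
	{
		spin_lock_irq(&w->lock);
		if (which != w->next)	/* an earlier entry is still pending */
			goto out;
		while (w->next < w->count && test_bit(w->next, w->done))
			w->next++;	/* retire each completed entry in order */
	out:
		spin_unlock_irq(&w->lock);
	}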
1899 | /* | 1870 | /* |
1900 | * Split up an image request into one or more object requests, each | 1871 | * Split up an image request into one or more object requests, each |
1901 | * to a different object. The "type" parameter indicates whether | 1872 | * to a different object. The "type" parameter indicates whether |
1902 | * "data_desc" is the pointer to the head of a list of bio | 1873 | * "data_desc" is the pointer to the head of a list of bio |
1903 | * structures, or the base of a page array. In either case this | 1874 | * structures, or the base of a page array. In either case this |
1904 | * function assumes data_desc describes memory sufficient to hold | 1875 | * function assumes data_desc describes memory sufficient to hold |
1905 | * all data described by the image request. | 1876 | * all data described by the image request. |
1906 | */ | 1877 | */ |
1907 | static int rbd_img_request_fill(struct rbd_img_request *img_request, | 1878 | static int rbd_img_request_fill(struct rbd_img_request *img_request, |
1908 | enum obj_request_type type, | 1879 | enum obj_request_type type, |
1909 | void *data_desc) | 1880 | void *data_desc) |
1910 | { | 1881 | { |
1911 | struct rbd_device *rbd_dev = img_request->rbd_dev; | 1882 | struct rbd_device *rbd_dev = img_request->rbd_dev; |
1912 | struct rbd_obj_request *obj_request = NULL; | 1883 | struct rbd_obj_request *obj_request = NULL; |
1913 | struct rbd_obj_request *next_obj_request; | 1884 | struct rbd_obj_request *next_obj_request; |
1914 | bool write_request = img_request_write_test(img_request); | 1885 | bool write_request = img_request_write_test(img_request); |
1915 | struct bio *bio_list; | 1886 | struct bio *bio_list; |
1916 | unsigned int bio_offset = 0; | 1887 | unsigned int bio_offset = 0; |
1917 | struct page **pages; | 1888 | struct page **pages; |
1918 | u64 img_offset; | 1889 | u64 img_offset; |
1919 | u64 resid; | 1890 | u64 resid; |
1920 | u16 opcode; | 1891 | u16 opcode; |
1921 | 1892 | ||
1922 | dout("%s: img %p type %d data_desc %p\n", __func__, img_request, | 1893 | dout("%s: img %p type %d data_desc %p\n", __func__, img_request, |
1923 | (int)type, data_desc); | 1894 | (int)type, data_desc); |
1924 | 1895 | ||
1925 | opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ; | 1896 | opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ; |
1926 | img_offset = img_request->offset; | 1897 | img_offset = img_request->offset; |
1927 | resid = img_request->length; | 1898 | resid = img_request->length; |
1928 | rbd_assert(resid > 0); | 1899 | rbd_assert(resid > 0); |
1929 | 1900 | ||
1930 | if (type == OBJ_REQUEST_BIO) { | 1901 | if (type == OBJ_REQUEST_BIO) { |
1931 | bio_list = data_desc; | 1902 | bio_list = data_desc; |
1932 | rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT); | 1903 | rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT); |
1933 | } else { | 1904 | } else { |
1934 | rbd_assert(type == OBJ_REQUEST_PAGES); | 1905 | rbd_assert(type == OBJ_REQUEST_PAGES); |
1935 | pages = data_desc; | 1906 | pages = data_desc; |
1936 | } | 1907 | } |
1937 | 1908 | ||
1938 | while (resid) { | 1909 | while (resid) { |
1939 | struct ceph_osd_request *osd_req; | 1910 | struct ceph_osd_request *osd_req; |
1940 | const char *object_name; | 1911 | const char *object_name; |
1941 | u64 offset; | 1912 | u64 offset; |
1942 | u64 length; | 1913 | u64 length; |
1943 | 1914 | ||
1944 | object_name = rbd_segment_name(rbd_dev, img_offset); | 1915 | object_name = rbd_segment_name(rbd_dev, img_offset); |
1945 | if (!object_name) | 1916 | if (!object_name) |
1946 | goto out_unwind; | 1917 | goto out_unwind; |
1947 | offset = rbd_segment_offset(rbd_dev, img_offset); | 1918 | offset = rbd_segment_offset(rbd_dev, img_offset); |
1948 | length = rbd_segment_length(rbd_dev, img_offset, resid); | 1919 | length = rbd_segment_length(rbd_dev, img_offset, resid); |
1949 | obj_request = rbd_obj_request_create(object_name, | 1920 | obj_request = rbd_obj_request_create(object_name, |
1950 | offset, length, type); | 1921 | offset, length, type); |
1951 | kfree(object_name); /* object request has its own copy */ | 1922 | kfree(object_name); /* object request has its own copy */ |
1952 | if (!obj_request) | 1923 | if (!obj_request) |
1953 | goto out_unwind; | 1924 | goto out_unwind; |
1954 | 1925 | ||
1955 | if (type == OBJ_REQUEST_BIO) { | 1926 | if (type == OBJ_REQUEST_BIO) { |
1956 | unsigned int clone_size; | 1927 | unsigned int clone_size; |
1957 | 1928 | ||
1958 | rbd_assert(length <= (u64)UINT_MAX); | 1929 | rbd_assert(length <= (u64)UINT_MAX); |
1959 | clone_size = (unsigned int)length; | 1930 | clone_size = (unsigned int)length; |
1960 | obj_request->bio_list = | 1931 | obj_request->bio_list = |
1961 | bio_chain_clone_range(&bio_list, | 1932 | bio_chain_clone_range(&bio_list, |
1962 | &bio_offset, | 1933 | &bio_offset, |
1963 | clone_size, | 1934 | clone_size, |
1964 | GFP_ATOMIC); | 1935 | GFP_ATOMIC); |
1965 | if (!obj_request->bio_list) | 1936 | if (!obj_request->bio_list) |
1966 | goto out_partial; | 1937 | goto out_partial; |
1967 | } else { | 1938 | } else { |
1968 | unsigned int page_count; | 1939 | unsigned int page_count; |
1969 | 1940 | ||
1970 | obj_request->pages = pages; | 1941 | obj_request->pages = pages; |
1971 | page_count = (u32)calc_pages_for(offset, length); | 1942 | page_count = (u32)calc_pages_for(offset, length); |
1972 | obj_request->page_count = page_count; | 1943 | obj_request->page_count = page_count; |
1973 | if ((offset + length) & ~PAGE_MASK) | 1944 | if ((offset + length) & ~PAGE_MASK) |
1974 | page_count--; /* more on last page */ | 1945 | page_count--; /* more on last page */ |
1975 | pages += page_count; | 1946 | pages += page_count; |
1976 | } | 1947 | } |
1977 | 1948 | ||
1978 | osd_req = rbd_osd_req_create(rbd_dev, write_request, | 1949 | osd_req = rbd_osd_req_create(rbd_dev, write_request, |
1979 | obj_request); | 1950 | obj_request); |
1980 | if (!osd_req) | 1951 | if (!osd_req) |
1981 | goto out_partial; | 1952 | goto out_partial; |
1982 | obj_request->osd_req = osd_req; | 1953 | obj_request->osd_req = osd_req; |
1983 | obj_request->callback = rbd_img_obj_callback; | 1954 | obj_request->callback = rbd_img_obj_callback; |
1984 | 1955 | ||
1985 | osd_req_op_extent_init(osd_req, 0, opcode, offset, length, | 1956 | osd_req_op_extent_init(osd_req, 0, opcode, offset, length, |
1986 | 0, 0); | 1957 | 0, 0); |
1987 | if (type == OBJ_REQUEST_BIO) | 1958 | if (type == OBJ_REQUEST_BIO) |
1988 | osd_req_op_extent_osd_data_bio(osd_req, 0, | 1959 | osd_req_op_extent_osd_data_bio(osd_req, 0, |
1989 | obj_request->bio_list, length); | 1960 | obj_request->bio_list, length); |
1990 | else | 1961 | else |
1991 | osd_req_op_extent_osd_data_pages(osd_req, 0, | 1962 | osd_req_op_extent_osd_data_pages(osd_req, 0, |
1992 | obj_request->pages, length, | 1963 | obj_request->pages, length, |
1993 | offset & ~PAGE_MASK, false, false); | 1964 | offset & ~PAGE_MASK, false, false); |
1994 | 1965 | ||
1995 | if (write_request) | 1966 | if (write_request) |
1996 | rbd_osd_req_format_write(obj_request); | 1967 | rbd_osd_req_format_write(obj_request); |
1997 | else | 1968 | else |
1998 | rbd_osd_req_format_read(obj_request); | 1969 | rbd_osd_req_format_read(obj_request); |
1999 | 1970 | ||
2000 | obj_request->img_offset = img_offset; | 1971 | obj_request->img_offset = img_offset; |
2001 | rbd_img_obj_request_add(img_request, obj_request); | 1972 | rbd_img_obj_request_add(img_request, obj_request); |
2002 | 1973 | ||
2003 | img_offset += length; | 1974 | img_offset += length; |
2004 | resid -= length; | 1975 | resid -= length; |
2005 | } | 1976 | } |
2006 | 1977 | ||
2007 | return 0; | 1978 | return 0; |
2008 | 1979 | ||
2009 | out_partial: | 1980 | out_partial: |
2010 | rbd_obj_request_put(obj_request); | 1981 | rbd_obj_request_put(obj_request); |
2011 | out_unwind: | 1982 | out_unwind: |
2012 | for_each_obj_request_safe(img_request, obj_request, next_obj_request) | 1983 | for_each_obj_request_safe(img_request, obj_request, next_obj_request) |
2013 | rbd_obj_request_put(obj_request); | 1984 | rbd_obj_request_put(obj_request); |
2014 | 1985 | ||
2015 | return -ENOMEM; | 1986 | return -ENOMEM; |
2016 | } | 1987 | } |
2017 | 1988 | ||
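Each pass of the loop above maps img_offset to one backing object. A sketch of the arithmetic the rbd_segment_offset() and rbd_segment_length() helpers are assumed to perform, with segment_size = 1 << obj_order (a power of two):

	static u64 segment_offset(u64 segment_size, u64 img_offset)
	{
		/* Offset within the object backing this segment */
		return img_offset & (segment_size - 1);
	}

	static u64 segment_length(u64 segment_size, u64 img_offset, u64 resid)
	{
		u64 offset = segment_offset(segment_size, img_offset);

		/* Never cross a segment boundary */
		return min(resid, segment_size - offset);
	}

Clamping length this way means img_offset lands on a segment boundary after the first iteration, so every later object request starts at offset 0 within its object.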
2018 | static void | 1989 | static void |
2019 | rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request) | 1990 | rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request) |
2020 | { | 1991 | { |
2021 | struct rbd_img_request *img_request; | 1992 | struct rbd_img_request *img_request; |
2022 | struct rbd_device *rbd_dev; | 1993 | struct rbd_device *rbd_dev; |
2023 | u64 length; | 1994 | u64 length; |
2024 | u32 page_count; | 1995 | u32 page_count; |
2025 | 1996 | ||
2026 | rbd_assert(obj_request->type == OBJ_REQUEST_BIO); | 1997 | rbd_assert(obj_request->type == OBJ_REQUEST_BIO); |
2027 | rbd_assert(obj_request_img_data_test(obj_request)); | 1998 | rbd_assert(obj_request_img_data_test(obj_request)); |
2028 | img_request = obj_request->img_request; | 1999 | img_request = obj_request->img_request; |
2029 | rbd_assert(img_request); | 2000 | rbd_assert(img_request); |
2030 | 2001 | ||
2031 | rbd_dev = img_request->rbd_dev; | 2002 | rbd_dev = img_request->rbd_dev; |
2032 | rbd_assert(rbd_dev); | 2003 | rbd_assert(rbd_dev); |
2033 | length = (u64)1 << rbd_dev->header.obj_order; | 2004 | length = (u64)1 << rbd_dev->header.obj_order; |
2034 | page_count = (u32)calc_pages_for(0, length); | 2005 | page_count = (u32)calc_pages_for(0, length); |
2035 | 2006 | ||
2036 | rbd_assert(obj_request->copyup_pages); | 2007 | rbd_assert(obj_request->copyup_pages); |
2037 | ceph_release_page_vector(obj_request->copyup_pages, page_count); | 2008 | ceph_release_page_vector(obj_request->copyup_pages, page_count); |
2038 | obj_request->copyup_pages = NULL; | 2009 | obj_request->copyup_pages = NULL; |
2039 | 2010 | ||
2040 | /* | 2011 | /* |
2041 | * We want the transfer count to reflect the size of the | 2012 | * We want the transfer count to reflect the size of the |
2042 | * original write request. There is no such thing as a | 2013 | * original write request. There is no such thing as a |
2043 | * successful short write, so if the request was successful | 2014 | * successful short write, so if the request was successful |
2044 | * we can just set it to the originally-requested length. | 2015 | * we can just set it to the originally-requested length. |
2045 | */ | 2016 | */ |
2046 | if (!obj_request->result) | 2017 | if (!obj_request->result) |
2047 | obj_request->xferred = obj_request->length; | 2018 | obj_request->xferred = obj_request->length; |
2048 | 2019 | ||
2049 | /* Finish up with the normal image object callback */ | 2020 | /* Finish up with the normal image object callback */ |
2050 | 2021 | ||
2051 | rbd_img_obj_callback(obj_request); | 2022 | rbd_img_obj_callback(obj_request); |
2052 | } | 2023 | } |
2053 | 2024 | ||
2054 | static void | 2025 | static void |
2055 | rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) | 2026 | rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) |
2056 | { | 2027 | { |
2057 | struct rbd_obj_request *orig_request; | 2028 | struct rbd_obj_request *orig_request; |
2058 | struct ceph_osd_request *osd_req; | 2029 | struct ceph_osd_request *osd_req; |
2059 | struct ceph_osd_client *osdc; | 2030 | struct ceph_osd_client *osdc; |
2060 | struct rbd_device *rbd_dev; | 2031 | struct rbd_device *rbd_dev; |
2061 | struct page **pages; | 2032 | struct page **pages; |
2062 | int result; | 2033 | int result; |
2063 | u64 obj_size; | 2034 | u64 obj_size; |
2064 | u64 xferred; | 2035 | u64 xferred; |
2065 | 2036 | ||
2066 | rbd_assert(img_request_child_test(img_request)); | 2037 | rbd_assert(img_request_child_test(img_request)); |
2067 | 2038 | ||
2068 | /* First get what we need from the image request */ | 2039 | /* First get what we need from the image request */ |
2069 | 2040 | ||
2070 | pages = img_request->copyup_pages; | 2041 | pages = img_request->copyup_pages; |
2071 | rbd_assert(pages != NULL); | 2042 | rbd_assert(pages != NULL); |
2072 | img_request->copyup_pages = NULL; | 2043 | img_request->copyup_pages = NULL; |
2073 | 2044 | ||
2074 | orig_request = img_request->obj_request; | 2045 | orig_request = img_request->obj_request; |
2075 | rbd_assert(orig_request != NULL); | 2046 | rbd_assert(orig_request != NULL); |
2076 | rbd_assert(orig_request->type == OBJ_REQUEST_BIO); | 2047 | rbd_assert(orig_request->type == OBJ_REQUEST_BIO); |
2077 | result = img_request->result; | 2048 | result = img_request->result; |
2078 | obj_size = img_request->length; | 2049 | obj_size = img_request->length; |
2079 | xferred = img_request->xferred; | 2050 | xferred = img_request->xferred; |
2080 | 2051 | ||
2081 | rbd_dev = img_request->rbd_dev; | 2052 | rbd_dev = img_request->rbd_dev; |
2082 | rbd_assert(rbd_dev); | 2053 | rbd_assert(rbd_dev); |
2083 | rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order); | 2054 | rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order); |
2084 | 2055 | ||
2085 | rbd_img_request_put(img_request); | 2056 | rbd_img_request_put(img_request); |
2086 | 2057 | ||
2087 | if (result) | 2058 | if (result) |
2088 | goto out_err; | 2059 | goto out_err; |
2089 | 2060 | ||
2090 | /* Allocate the new copyup osd request for the original request */ | 2061 | /* Allocate the new copyup osd request for the original request */ |
2091 | 2062 | ||
2092 | result = -ENOMEM; | 2063 | result = -ENOMEM; |
2093 | rbd_assert(!orig_request->osd_req); | 2064 | rbd_assert(!orig_request->osd_req); |
2094 | osd_req = rbd_osd_req_create_copyup(orig_request); | 2065 | osd_req = rbd_osd_req_create_copyup(orig_request); |
2095 | if (!osd_req) | 2066 | if (!osd_req) |
2096 | goto out_err; | 2067 | goto out_err; |
2097 | orig_request->osd_req = osd_req; | 2068 | orig_request->osd_req = osd_req; |
2098 | orig_request->copyup_pages = pages; | 2069 | orig_request->copyup_pages = pages; |
2099 | 2070 | ||
2100 | /* Initialize the copyup op */ | 2071 | /* Initialize the copyup op */ |
2101 | 2072 | ||
2102 | osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); | 2073 | osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); |
2103 | osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0, | 2074 | osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0, |
2104 | false, false); | 2075 | false, false); |
2105 | 2076 | ||
2106 | /* Then the original write request op */ | 2077 | /* Then the original write request op */ |
2107 | 2078 | ||
2108 | osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE, | 2079 | osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE, |
2109 | orig_request->offset, | 2080 | orig_request->offset, |
2110 | orig_request->length, 0, 0); | 2081 | orig_request->length, 0, 0); |
2111 | osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list, | 2082 | osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list, |
2112 | orig_request->length); | 2083 | orig_request->length); |
2113 | 2084 | ||
2114 | rbd_osd_req_format_write(orig_request); | 2085 | rbd_osd_req_format_write(orig_request); |
2115 | 2086 | ||
2116 | /* All set, send it off. */ | 2087 | /* All set, send it off. */ |
2117 | 2088 | ||
2118 | orig_request->callback = rbd_img_obj_copyup_callback; | 2089 | orig_request->callback = rbd_img_obj_copyup_callback; |
2119 | osdc = &rbd_dev->rbd_client->client->osdc; | 2090 | osdc = &rbd_dev->rbd_client->client->osdc; |
2120 | result = rbd_obj_request_submit(osdc, orig_request); | 2091 | result = rbd_obj_request_submit(osdc, orig_request); |
2121 | if (!result) | 2092 | if (!result) |
2122 | return; | 2093 | return; |
2123 | out_err: | 2094 | out_err: |
2124 | /* Record the error code and complete the request */ | 2095 | /* Record the error code and complete the request */ |
2125 | 2096 | ||
2126 | orig_request->result = result; | 2097 | orig_request->result = result; |
2127 | orig_request->xferred = 0; | 2098 | orig_request->xferred = 0; |
2128 | obj_request_done_set(orig_request); | 2099 | obj_request_done_set(orig_request); |
2129 | rbd_obj_request_complete(orig_request); | 2100 | rbd_obj_request_complete(orig_request); |
2130 | } | 2101 | } |
2131 | 2102 | ||
2132 | /* | 2103 | /* |
2133 | * Read from the parent image the range of data that covers the | 2104 | * Read from the parent image the range of data that covers the |
2134 | * entire target of the given object request. This is used for | 2105 | * entire target of the given object request. This is used for |
2135 | * satisfying a layered image write request when the target of an | 2106 | * satisfying a layered image write request when the target of an |
2136 | * object request from the image request does not exist. | 2107 | * object request from the image request does not exist. |
2137 | * | 2108 | * |
2138 | * A page array big enough to hold the returned data is allocated | 2109 | * A page array big enough to hold the returned data is allocated |
2139 | * and supplied to rbd_img_request_fill() as the "data descriptor." | 2110 | * and supplied to rbd_img_request_fill() as the "data descriptor." |
2140 | * When the read completes, this page array will be transferred to | 2111 | * When the read completes, this page array will be transferred to |
2141 | * the original object request for the copyup operation. | 2112 | * the original object request for the copyup operation. |
2142 | * | 2113 | * |
2143 | * If an error occurs, record it as the result of the original | 2114 | * If an error occurs, record it as the result of the original |
2144 | * object request and mark it done so it gets completed. | 2115 | * object request and mark it done so it gets completed. |
2145 | */ | 2116 | */ |
2146 | static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) | 2117 | static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) |
2147 | { | 2118 | { |
2148 | struct rbd_img_request *img_request = NULL; | 2119 | struct rbd_img_request *img_request = NULL; |
2149 | struct rbd_img_request *parent_request = NULL; | 2120 | struct rbd_img_request *parent_request = NULL; |
2150 | struct rbd_device *rbd_dev; | 2121 | struct rbd_device *rbd_dev; |
2151 | u64 img_offset; | 2122 | u64 img_offset; |
2152 | u64 length; | 2123 | u64 length; |
2153 | struct page **pages = NULL; | 2124 | struct page **pages = NULL; |
2154 | u32 page_count; | 2125 | u32 page_count; |
2155 | int result; | 2126 | int result; |
2156 | 2127 | ||
2157 | rbd_assert(obj_request_img_data_test(obj_request)); | 2128 | rbd_assert(obj_request_img_data_test(obj_request)); |
2158 | rbd_assert(obj_request->type == OBJ_REQUEST_BIO); | 2129 | rbd_assert(obj_request->type == OBJ_REQUEST_BIO); |
2159 | 2130 | ||
2160 | img_request = obj_request->img_request; | 2131 | img_request = obj_request->img_request; |
2161 | rbd_assert(img_request != NULL); | 2132 | rbd_assert(img_request != NULL); |
2162 | rbd_dev = img_request->rbd_dev; | 2133 | rbd_dev = img_request->rbd_dev; |
2163 | rbd_assert(rbd_dev->parent != NULL); | 2134 | rbd_assert(rbd_dev->parent != NULL); |
2164 | 2135 | ||
2165 | /* | 2136 | /* |
2166 | * First things first. The original osd request is of no | 2137 | * First things first. The original osd request is of no |
2167 | * use to us any more; we'll need a new one that can hold | 2138 | * use to us any more; we'll need a new one that can hold |
2168 | * the two ops in a copyup request. We'll get that later, | 2139 | * the two ops in a copyup request. We'll get that later, |
2169 | * but for now we can release the old one. | 2140 | * but for now we can release the old one. |
2170 | */ | 2141 | */ |
2171 | rbd_osd_req_destroy(obj_request->osd_req); | 2142 | rbd_osd_req_destroy(obj_request->osd_req); |
2172 | obj_request->osd_req = NULL; | 2143 | obj_request->osd_req = NULL; |
2173 | 2144 | ||
2174 | /* | 2145 | /* |
2175 | * Determine the byte range covered by the object in the | 2146 | * Determine the byte range covered by the object in the |
2176 | * child image to which the original request was to be sent. | 2147 | * child image to which the original request was to be sent. |
2177 | */ | 2148 | */ |
2178 | img_offset = obj_request->img_offset - obj_request->offset; | 2149 | img_offset = obj_request->img_offset - obj_request->offset; |
2179 | length = (u64)1 << rbd_dev->header.obj_order; | 2150 | length = (u64)1 << rbd_dev->header.obj_order; |
2180 | 2151 | ||
2181 | /* | 2152 | /* |
2182 | * There is no defined parent data beyond the parent | 2153 | * There is no defined parent data beyond the parent |
2183 | * overlap, so limit what we read at that boundary if | 2154 | * overlap, so limit what we read at that boundary if |
2184 | * necessary. | 2155 | * necessary. |
2185 | */ | 2156 | */ |
2186 | if (img_offset + length > rbd_dev->parent_overlap) { | 2157 | if (img_offset + length > rbd_dev->parent_overlap) { |
2187 | rbd_assert(img_offset < rbd_dev->parent_overlap); | 2158 | rbd_assert(img_offset < rbd_dev->parent_overlap); |
2188 | length = rbd_dev->parent_overlap - img_offset; | 2159 | length = rbd_dev->parent_overlap - img_offset; |
2189 | } | 2160 | } |
2190 | 2161 | ||
2191 | /* | 2162 | /* |
2192 | * Allocate a page array big enough to receive the data read | 2163 | * Allocate a page array big enough to receive the data read |
2193 | * from the parent. | 2164 | * from the parent. |
2194 | */ | 2165 | */ |
2195 | page_count = (u32)calc_pages_for(0, length); | 2166 | page_count = (u32)calc_pages_for(0, length); |
2196 | pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); | 2167 | pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); |
2197 | if (IS_ERR(pages)) { | 2168 | if (IS_ERR(pages)) { |
2198 | result = PTR_ERR(pages); | 2169 | result = PTR_ERR(pages); |
2199 | pages = NULL; | 2170 | pages = NULL; |
2200 | goto out_err; | 2171 | goto out_err; |
2201 | } | 2172 | } |
2202 | 2173 | ||
2203 | result = -ENOMEM; | 2174 | result = -ENOMEM; |
2204 | parent_request = rbd_img_request_create(rbd_dev->parent, | 2175 | parent_request = rbd_img_request_create(rbd_dev->parent, |
2205 | img_offset, length, | 2176 | img_offset, length, |
2206 | false, true); | 2177 | false, true); |
2207 | if (!parent_request) | 2178 | if (!parent_request) |
2208 | goto out_err; | 2179 | goto out_err; |
2209 | rbd_obj_request_get(obj_request); | 2180 | rbd_obj_request_get(obj_request); |
2210 | parent_request->obj_request = obj_request; | 2181 | parent_request->obj_request = obj_request; |
2211 | 2182 | ||
2212 | result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); | 2183 | result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); |
2213 | if (result) | 2184 | if (result) |
2214 | goto out_err; | 2185 | goto out_err; |
2215 | parent_request->copyup_pages = pages; | 2186 | parent_request->copyup_pages = pages; |
2216 | 2187 | ||
2217 | parent_request->callback = rbd_img_obj_parent_read_full_callback; | 2188 | parent_request->callback = rbd_img_obj_parent_read_full_callback; |
2218 | result = rbd_img_request_submit(parent_request); | 2189 | result = rbd_img_request_submit(parent_request); |
2219 | if (!result) | 2190 | if (!result) |
2220 | return 0; | 2191 | return 0; |
2221 | 2192 | ||
2222 | parent_request->copyup_pages = NULL; | 2193 | parent_request->copyup_pages = NULL; |
2223 | parent_request->obj_request = NULL; | 2194 | parent_request->obj_request = NULL; |
2224 | rbd_obj_request_put(obj_request); | 2195 | rbd_obj_request_put(obj_request); |
2225 | out_err: | 2196 | out_err: |
2226 | if (pages) | 2197 | if (pages) |
2227 | ceph_release_page_vector(pages, page_count); | 2198 | ceph_release_page_vector(pages, page_count); |
2228 | if (parent_request) | 2199 | if (parent_request) |
2229 | rbd_img_request_put(parent_request); | 2200 | rbd_img_request_put(parent_request); |
2230 | obj_request->result = result; | 2201 | obj_request->result = result; |
2231 | obj_request->xferred = 0; | 2202 | obj_request->xferred = 0; |
2232 | obj_request_done_set(obj_request); | 2203 | obj_request_done_set(obj_request); |
2233 | 2204 | ||
2234 | return result; | 2205 | return result; |
2235 | } | 2206 | } |
2236 | 2207 | ||
2237 | static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) | 2208 | static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) |
2238 | { | 2209 | { |
2239 | struct rbd_obj_request *orig_request; | 2210 | struct rbd_obj_request *orig_request; |
2240 | int result; | 2211 | int result; |
2241 | 2212 | ||
2242 | rbd_assert(!obj_request_img_data_test(obj_request)); | 2213 | rbd_assert(!obj_request_img_data_test(obj_request)); |
2243 | 2214 | ||
2244 | /* | 2215 | /* |
2245 | * All we need from the object request is the original | 2216 | * All we need from the object request is the original |
2246 | * request and the result of the STAT op. Grab those, then | 2217 | * request and the result of the STAT op. Grab those, then |
2247 | * we're done with the request. | 2218 | * we're done with the request. |
2248 | */ | 2219 | */ |
2249 | orig_request = obj_request->obj_request; | 2220 | orig_request = obj_request->obj_request; |
2250 | obj_request->obj_request = NULL; | 2221 | obj_request->obj_request = NULL; |
2251 | rbd_assert(orig_request); | 2222 | rbd_assert(orig_request); |
2252 | rbd_assert(orig_request->img_request); | 2223 | rbd_assert(orig_request->img_request); |
2253 | 2224 | ||
2254 | result = obj_request->result; | 2225 | result = obj_request->result; |
2255 | obj_request->result = 0; | 2226 | obj_request->result = 0; |
2256 | 2227 | ||
2257 | dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__, | 2228 | dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__, |
2258 | obj_request, orig_request, result, | 2229 | obj_request, orig_request, result, |
2259 | obj_request->xferred, obj_request->length); | 2230 | obj_request->xferred, obj_request->length); |
2260 | rbd_obj_request_put(obj_request); | 2231 | rbd_obj_request_put(obj_request); |
2261 | 2232 | ||
2262 | rbd_assert(orig_request); | 2233 | rbd_assert(orig_request); |
2263 | rbd_assert(orig_request->img_request); | 2234 | rbd_assert(orig_request->img_request); |
2264 | 2235 | ||
2265 | /* | 2236 | /* |
2266 | * Our only purpose here is to determine whether the object | 2237 | * Our only purpose here is to determine whether the object |
2267 | * exists, and we don't want to treat the non-existence as | 2238 | * exists, and we don't want to treat the non-existence as |
2268 | * an error. If something else comes back, transfer the | 2239 | * an error. If something else comes back, transfer the |
2269 | * error to the original request and complete it now. | 2240 | * error to the original request and complete it now. |
2270 | */ | 2241 | */ |
2271 | if (!result) { | 2242 | if (!result) { |
2272 | obj_request_existence_set(orig_request, true); | 2243 | obj_request_existence_set(orig_request, true); |
2273 | } else if (result == -ENOENT) { | 2244 | } else if (result == -ENOENT) { |
2274 | obj_request_existence_set(orig_request, false); | 2245 | obj_request_existence_set(orig_request, false); |
2275 | } else if (result) { | 2246 | } else if (result) { |
2276 | orig_request->result = result; | 2247 | orig_request->result = result; |
2277 | goto out; | 2248 | goto out; |
2278 | } | 2249 | } |
2279 | 2250 | ||
2280 | /* | 2251 | /* |
2281 | * Resubmit the original request now that we have recorded | 2252 | * Resubmit the original request now that we have recorded |
2282 | * whether the target object exists. | 2253 | * whether the target object exists. |
2283 | */ | 2254 | */ |
2284 | orig_request->result = rbd_img_obj_request_submit(orig_request); | 2255 | orig_request->result = rbd_img_obj_request_submit(orig_request); |
2285 | out: | 2256 | out: |
2286 | if (orig_request->result) | 2257 | if (orig_request->result) |
2287 | rbd_obj_request_complete(orig_request); | 2258 | rbd_obj_request_complete(orig_request); |
2288 | rbd_obj_request_put(orig_request); | 2259 | rbd_obj_request_put(orig_request); |
2289 | } | 2260 | } |
2290 | 2261 | ||
2291 | static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) | 2262 | static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) |
2292 | { | 2263 | { |
2293 | struct rbd_obj_request *stat_request; | 2264 | struct rbd_obj_request *stat_request; |
2294 | struct rbd_device *rbd_dev; | 2265 | struct rbd_device *rbd_dev; |
2295 | struct ceph_osd_client *osdc; | 2266 | struct ceph_osd_client *osdc; |
2296 | struct page **pages = NULL; | 2267 | struct page **pages = NULL; |
2297 | u32 page_count; | 2268 | u32 page_count; |
2298 | size_t size; | 2269 | size_t size; |
2299 | int ret; | 2270 | int ret; |
2300 | 2271 | ||
2301 | /* | 2272 | /* |
2302 | * The response data for a STAT call consists of: | 2273 | * The response data for a STAT call consists of: |
2303 | * le64 length; | 2274 | * le64 length; |
2304 | * struct { | 2275 | * struct { |
2305 | * le32 tv_sec; | 2276 | * le32 tv_sec; |
2306 | * le32 tv_nsec; | 2277 | * le32 tv_nsec; |
2307 | * } mtime; | 2278 | * } mtime; |
2308 | */ | 2279 | */ |
2309 | size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32); | 2280 | size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32); |
2310 | page_count = (u32)calc_pages_for(0, size); | 2281 | page_count = (u32)calc_pages_for(0, size); |
2311 | pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); | 2282 | pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); |
2312 | if (IS_ERR(pages)) | 2283 | if (IS_ERR(pages)) |
2313 | return PTR_ERR(pages); | 2284 | return PTR_ERR(pages); |
2314 | 2285 | ||
2315 | ret = -ENOMEM; | 2286 | ret = -ENOMEM; |
2316 | stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0, | 2287 | stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0, |
2317 | OBJ_REQUEST_PAGES); | 2288 | OBJ_REQUEST_PAGES); |
2318 | if (!stat_request) | 2289 | if (!stat_request) |
2319 | goto out; | 2290 | goto out; |
2320 | 2291 | ||
2321 | rbd_obj_request_get(obj_request); | 2292 | rbd_obj_request_get(obj_request); |
2322 | stat_request->obj_request = obj_request; | 2293 | stat_request->obj_request = obj_request; |
2323 | stat_request->pages = pages; | 2294 | stat_request->pages = pages; |
2324 | stat_request->page_count = page_count; | 2295 | stat_request->page_count = page_count; |
2325 | 2296 | ||
2326 | rbd_assert(obj_request->img_request); | 2297 | rbd_assert(obj_request->img_request); |
2327 | rbd_dev = obj_request->img_request->rbd_dev; | 2298 | rbd_dev = obj_request->img_request->rbd_dev; |
2328 | stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, | 2299 | stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, |
2329 | stat_request); | 2300 | stat_request); |
2330 | if (!stat_request->osd_req) | 2301 | if (!stat_request->osd_req) |
2331 | goto out; | 2302 | goto out; |
2332 | stat_request->callback = rbd_img_obj_exists_callback; | 2303 | stat_request->callback = rbd_img_obj_exists_callback; |
2333 | 2304 | ||
2334 | osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT); | 2305 | osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT); |
2335 | osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, | 2306 | osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, |
2336 | false, false); | 2307 | false, false); |
2337 | rbd_osd_req_format_read(stat_request); | 2308 | rbd_osd_req_format_read(stat_request); |
2338 | 2309 | ||
2339 | osdc = &rbd_dev->rbd_client->client->osdc; | 2310 | osdc = &rbd_dev->rbd_client->client->osdc; |
2340 | ret = rbd_obj_request_submit(osdc, stat_request); | 2311 | ret = rbd_obj_request_submit(osdc, stat_request); |
2341 | out: | 2312 | out: |
2342 | if (ret) | 2313 | if (ret) |
2343 | rbd_obj_request_put(obj_request); | 2314 | rbd_obj_request_put(obj_request); |
2344 | 2315 | ||
2345 | return ret; | 2316 | return ret; |
2346 | } | 2317 | } |
2347 | 2318 | ||
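Purely for illustration, the 16-byte STAT reply laid out in the comment above could be decoded with the ceph_decode_* helpers as follows (hypothetical function; the driver only inspects the op's result code and never parses this payload):

	#include <linux/ceph/decode.h>

	static void stat_reply_decode(void *payload, u64 *obj_size,
				      u32 *tv_sec, u32 *tv_nsec)
	{
		void *p = payload;

		*obj_size = ceph_decode_64(&p);	/* le64 length */
		*tv_sec = ceph_decode_32(&p);	/* le32 tv_sec */
		*tv_nsec = ceph_decode_32(&p);	/* le32 tv_nsec */
	}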
2348 | static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) | 2319 | static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) |
2349 | { | 2320 | { |
2350 | struct rbd_img_request *img_request; | 2321 | struct rbd_img_request *img_request; |
2351 | struct rbd_device *rbd_dev; | 2322 | struct rbd_device *rbd_dev; |
2352 | bool known; | 2323 | bool known; |
2353 | 2324 | ||
2354 | rbd_assert(obj_request_img_data_test(obj_request)); | 2325 | rbd_assert(obj_request_img_data_test(obj_request)); |
2355 | 2326 | ||
2356 | img_request = obj_request->img_request; | 2327 | img_request = obj_request->img_request; |
2357 | rbd_assert(img_request); | 2328 | rbd_assert(img_request); |
2358 | rbd_dev = img_request->rbd_dev; | 2329 | rbd_dev = img_request->rbd_dev; |
2359 | 2330 | ||
2360 | /* | 2331 | /* |
2361 | * Only writes to layered images need special handling. | 2332 | * Only writes to layered images need special handling. |
2362 | * Reads and non-layered writes are simple object requests. | 2333 | * Reads and non-layered writes are simple object requests. |
2363 | * Layered writes that start beyond the end of the overlap | 2334 | * Layered writes that start beyond the end of the overlap |
2364 | * with the parent have no parent data, so they too are | 2335 | * with the parent have no parent data, so they too are |
2365 | * simple object requests. Finally, if the target object is | 2336 | * simple object requests. Finally, if the target object is |
2366 | * known to already exist, its parent data has already been | 2337 | * known to already exist, its parent data has already been |
2367 | * copied, so a write to the object can also be handled as a | 2338 | * copied, so a write to the object can also be handled as a |
2368 | * simple object request. | 2339 | * simple object request. |
2369 | */ | 2340 | */ |
2370 | if (!img_request_write_test(img_request) || | 2341 | if (!img_request_write_test(img_request) || |
2371 | !img_request_layered_test(img_request) || | 2342 | !img_request_layered_test(img_request) || |
2372 | rbd_dev->parent_overlap <= obj_request->img_offset || | 2343 | rbd_dev->parent_overlap <= obj_request->img_offset || |
2373 | ((known = obj_request_known_test(obj_request)) && | 2344 | ((known = obj_request_known_test(obj_request)) && |
2374 | obj_request_exists_test(obj_request))) { | 2345 | obj_request_exists_test(obj_request))) { |
2375 | 2346 | ||
2376 | struct rbd_device *rbd_dev; | 2347 | struct rbd_device *rbd_dev; |
2377 | struct ceph_osd_client *osdc; | 2348 | struct ceph_osd_client *osdc; |
2378 | 2349 | ||
2379 | rbd_dev = obj_request->img_request->rbd_dev; | 2350 | rbd_dev = obj_request->img_request->rbd_dev; |
2380 | osdc = &rbd_dev->rbd_client->client->osdc; | 2351 | osdc = &rbd_dev->rbd_client->client->osdc; |
2381 | 2352 | ||
2382 | return rbd_obj_request_submit(osdc, obj_request); | 2353 | return rbd_obj_request_submit(osdc, obj_request); |
2383 | } | 2354 | } |
2384 | 2355 | ||
2385 | /* | 2356 | /* |
2386 | * It's a layered write. The target object might exist but | 2357 | * It's a layered write. The target object might exist but |
2387 | * we may not know that yet. If we know it doesn't exist, | 2358 | * we may not know that yet. If we know it doesn't exist, |
2388 | * start by reading the data for the full target object from | 2359 | * start by reading the data for the full target object from |
2389 | * the parent so we can use it for a copyup to the target. | 2360 | * the parent so we can use it for a copyup to the target. |
2390 | */ | 2361 | */ |
2391 | if (known) | 2362 | if (known) |
2392 | return rbd_img_obj_parent_read_full(obj_request); | 2363 | return rbd_img_obj_parent_read_full(obj_request); |
2393 | 2364 | ||
2394 | /* We don't know whether the target exists. Go find out. */ | 2365 | /* We don't know whether the target exists. Go find out. */ |
2395 | 2366 | ||
2396 | return rbd_img_obj_exists_submit(obj_request); | 2367 | return rbd_img_obj_exists_submit(obj_request); |
2397 | } | 2368 | } |
2398 | 2369 | ||
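The compound condition above packs four tests into one if. Restated as a sketch (hypothetical helper name, same predicates the driver uses):

	static bool rbd_obj_needs_parent_data(struct rbd_img_request *img_request,
					      struct rbd_obj_request *obj_request,
					      u64 parent_overlap)
	{
		if (!img_request_write_test(img_request))
			return false;	/* reads are plain object requests */
		if (!img_request_layered_test(img_request))
			return false;	/* no parent image to copy from */
		if (parent_overlap <= obj_request->img_offset)
			return false;	/* object lies beyond the overlap */
		if (obj_request_known_test(obj_request) &&
		    obj_request_exists_test(obj_request))
			return false;	/* parent data already copied up */
		return true;	/* copyup or existence check needed */
	}

When the helper would return true, the function falls through to the layered-write path: rbd_img_obj_parent_read_full() if the object is known not to exist, otherwise the STAT-based rbd_img_obj_exists_submit().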
2399 | static int rbd_img_request_submit(struct rbd_img_request *img_request) | 2370 | static int rbd_img_request_submit(struct rbd_img_request *img_request) |
2400 | { | 2371 | { |
2401 | struct rbd_obj_request *obj_request; | 2372 | struct rbd_obj_request *obj_request; |
2402 | struct rbd_obj_request *next_obj_request; | 2373 | struct rbd_obj_request *next_obj_request; |
2403 | 2374 | ||
2404 | dout("%s: img %p\n", __func__, img_request); | 2375 | dout("%s: img %p\n", __func__, img_request); |
2405 | for_each_obj_request_safe(img_request, obj_request, next_obj_request) { | 2376 | for_each_obj_request_safe(img_request, obj_request, next_obj_request) { |
2406 | int ret; | 2377 | int ret; |
2407 | 2378 | ||
2408 | ret = rbd_img_obj_request_submit(obj_request); | 2379 | ret = rbd_img_obj_request_submit(obj_request); |
2409 | if (ret) | 2380 | if (ret) |
2410 | return ret; | 2381 | return ret; |
2411 | } | 2382 | } |
2412 | 2383 | ||
2413 | return 0; | 2384 | return 0; |
2414 | } | 2385 | } |
2415 | 2386 | ||
2416 | static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) | 2387 | static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) |
2417 | { | 2388 | { |
2418 | struct rbd_obj_request *obj_request; | 2389 | struct rbd_obj_request *obj_request; |
2419 | struct rbd_device *rbd_dev; | 2390 | struct rbd_device *rbd_dev; |
2420 | u64 obj_end; | 2391 | u64 obj_end; |
2421 | 2392 | ||
2422 | rbd_assert(img_request_child_test(img_request)); | 2393 | rbd_assert(img_request_child_test(img_request)); |
2423 | 2394 | ||
2424 | obj_request = img_request->obj_request; | 2395 | obj_request = img_request->obj_request; |
2425 | rbd_assert(obj_request); | 2396 | rbd_assert(obj_request); |
2426 | rbd_assert(obj_request->img_request); | 2397 | rbd_assert(obj_request->img_request); |
2427 | 2398 | ||
2428 | obj_request->result = img_request->result; | 2399 | obj_request->result = img_request->result; |
2429 | if (obj_request->result) | 2400 | if (obj_request->result) |
2430 | goto out; | 2401 | goto out; |
2431 | 2402 | ||
2432 | /* | 2403 | /* |
2433 | * We need to zero anything beyond the parent overlap | 2404 | * We need to zero anything beyond the parent overlap |
2434 | * boundary. Since rbd_img_obj_request_read_callback() | 2405 | * boundary. Since rbd_img_obj_request_read_callback() |
2435 | * will zero anything beyond the end of a short read, an | 2406 | * will zero anything beyond the end of a short read, an |
2436 | * easy way to do this is to pretend the data from the | 2407 | * easy way to do this is to pretend the data from the |
2437 | * parent came up short--ending at the overlap boundary. | 2408 | * parent came up short--ending at the overlap boundary. |
2438 | */ | 2409 | */ |
2439 | rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length); | 2410 | rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length); |
2440 | obj_end = obj_request->img_offset + obj_request->length; | 2411 | obj_end = obj_request->img_offset + obj_request->length; |
2441 | rbd_dev = obj_request->img_request->rbd_dev; | 2412 | rbd_dev = obj_request->img_request->rbd_dev; |
2442 | if (obj_end > rbd_dev->parent_overlap) { | 2413 | if (obj_end > rbd_dev->parent_overlap) { |
2443 | u64 xferred = 0; | 2414 | u64 xferred = 0; |
2444 | 2415 | ||
2445 | if (obj_request->img_offset < rbd_dev->parent_overlap) | 2416 | if (obj_request->img_offset < rbd_dev->parent_overlap) |
2446 | xferred = rbd_dev->parent_overlap - | 2417 | xferred = rbd_dev->parent_overlap - |
2447 | obj_request->img_offset; | 2418 | obj_request->img_offset; |
2448 | 2419 | ||
2449 | obj_request->xferred = min(img_request->xferred, xferred); | 2420 | obj_request->xferred = min(img_request->xferred, xferred); |
2450 | } else { | 2421 | } else { |
2451 | obj_request->xferred = img_request->xferred; | 2422 | obj_request->xferred = img_request->xferred; |
2452 | } | 2423 | } |
2453 | out: | 2424 | out: |
2454 | rbd_img_obj_request_read_callback(obj_request); | 2425 | rbd_img_obj_request_read_callback(obj_request); |
2455 | rbd_obj_request_complete(obj_request); | 2426 | rbd_obj_request_complete(obj_request); |
2456 | } | 2427 | } |
2457 | 2428 | ||
2458 | static void rbd_img_parent_read(struct rbd_obj_request *obj_request) | 2429 | static void rbd_img_parent_read(struct rbd_obj_request *obj_request) |
2459 | { | 2430 | { |
2460 | struct rbd_device *rbd_dev; | 2431 | struct rbd_device *rbd_dev; |
2461 | struct rbd_img_request *img_request; | 2432 | struct rbd_img_request *img_request; |
2462 | int result; | 2433 | int result; |
2463 | 2434 | ||
2464 | rbd_assert(obj_request_img_data_test(obj_request)); | 2435 | rbd_assert(obj_request_img_data_test(obj_request)); |
2465 | rbd_assert(obj_request->img_request != NULL); | 2436 | rbd_assert(obj_request->img_request != NULL); |
2466 | rbd_assert(obj_request->result == (s32) -ENOENT); | 2437 | rbd_assert(obj_request->result == (s32) -ENOENT); |
2467 | rbd_assert(obj_request->type == OBJ_REQUEST_BIO); | 2438 | rbd_assert(obj_request->type == OBJ_REQUEST_BIO); |
2468 | 2439 | ||
2469 | rbd_dev = obj_request->img_request->rbd_dev; | 2440 | rbd_dev = obj_request->img_request->rbd_dev; |
2470 | rbd_assert(rbd_dev->parent != NULL); | 2441 | rbd_assert(rbd_dev->parent != NULL); |
2471 | /* rbd_read_finish(obj_request, obj_request->length); */ | 2442 | /* rbd_read_finish(obj_request, obj_request->length); */ |
2472 | img_request = rbd_img_request_create(rbd_dev->parent, | 2443 | img_request = rbd_img_request_create(rbd_dev->parent, |
2473 | obj_request->img_offset, | 2444 | obj_request->img_offset, |
2474 | obj_request->length, | 2445 | obj_request->length, |
2475 | false, true); | 2446 | false, true); |
2476 | result = -ENOMEM; | 2447 | result = -ENOMEM; |
2477 | if (!img_request) | 2448 | if (!img_request) |
2478 | goto out_err; | 2449 | goto out_err; |
2479 | 2450 | ||
2480 | rbd_obj_request_get(obj_request); | 2451 | rbd_obj_request_get(obj_request); |
2481 | img_request->obj_request = obj_request; | 2452 | img_request->obj_request = obj_request; |
2482 | 2453 | ||
2483 | result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, | 2454 | result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, |
2484 | obj_request->bio_list); | 2455 | obj_request->bio_list); |
2485 | if (result) | 2456 | if (result) |
2486 | goto out_err; | 2457 | goto out_err; |
2487 | 2458 | ||
2488 | img_request->callback = rbd_img_parent_read_callback; | 2459 | img_request->callback = rbd_img_parent_read_callback; |
2489 | result = rbd_img_request_submit(img_request); | 2460 | result = rbd_img_request_submit(img_request); |
2490 | if (result) | 2461 | if (result) |
2491 | goto out_err; | 2462 | goto out_err; |
2492 | 2463 | ||
2493 | return; | 2464 | return; |
2494 | out_err: | 2465 | out_err: |
2495 | if (img_request) | 2466 | if (img_request) |
2496 | rbd_img_request_put(img_request); | 2467 | rbd_img_request_put(img_request); |
2497 | obj_request->result = result; | 2468 | obj_request->result = result; |
2498 | obj_request->xferred = 0; | 2469 | obj_request->xferred = 0; |
2499 | obj_request_done_set(obj_request); | 2470 | obj_request_done_set(obj_request); |
2500 | } | 2471 | } |
2501 | 2472 | ||
2502 | static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, | 2473 | static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, |
2503 | u64 ver, u64 notify_id) | 2474 | u64 ver, u64 notify_id) |
2504 | { | 2475 | { |
2505 | struct rbd_obj_request *obj_request; | 2476 | struct rbd_obj_request *obj_request; |
2506 | struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; | 2477 | struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; |
2507 | int ret; | 2478 | int ret; |
2508 | 2479 | ||
2509 | obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, | 2480 | obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, |
2510 | OBJ_REQUEST_NODATA); | 2481 | OBJ_REQUEST_NODATA); |
2511 | if (!obj_request) | 2482 | if (!obj_request) |
2512 | return -ENOMEM; | 2483 | return -ENOMEM; |
2513 | 2484 | ||
2514 | ret = -ENOMEM; | 2485 | ret = -ENOMEM; |
2515 | obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); | 2486 | obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); |
2516 | if (!obj_request->osd_req) | 2487 | if (!obj_request->osd_req) |
2517 | goto out; | 2488 | goto out; |
2518 | obj_request->callback = rbd_obj_request_put; | 2489 | obj_request->callback = rbd_obj_request_put; |
2519 | 2490 | ||
2520 | osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, | 2491 | osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, |
2521 | notify_id, ver, 0); | 2492 | notify_id, ver, 0); |
2522 | rbd_osd_req_format_read(obj_request); | 2493 | rbd_osd_req_format_read(obj_request); |
2523 | 2494 | ||
2524 | ret = rbd_obj_request_submit(osdc, obj_request); | 2495 | ret = rbd_obj_request_submit(osdc, obj_request); |
2525 | out: | 2496 | out: |
2526 | if (ret) | 2497 | if (ret) |
2527 | rbd_obj_request_put(obj_request); | 2498 | rbd_obj_request_put(obj_request); |
2528 | 2499 | ||
2529 | return ret; | 2500 | return ret; |
2530 | } | 2501 | } |
2531 | 2502 | ||
2532 | static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) | 2503 | static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) |
2533 | { | 2504 | { |
2534 | struct rbd_device *rbd_dev = (struct rbd_device *)data; | 2505 | struct rbd_device *rbd_dev = (struct rbd_device *)data; |
2535 | u64 hver; | 2506 | u64 hver; |
2536 | 2507 | ||
2537 | if (!rbd_dev) | 2508 | if (!rbd_dev) |
2538 | return; | 2509 | return; |
2539 | 2510 | ||
2540 | dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, | 2511 | dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, |
2541 | rbd_dev->header_name, (unsigned long long) notify_id, | 2512 | rbd_dev->header_name, (unsigned long long) notify_id, |
2542 | (unsigned int) opcode); | 2513 | (unsigned int) opcode); |
2543 | (void)rbd_dev_refresh(rbd_dev, &hver); | 2514 | (void)rbd_dev_refresh(rbd_dev, &hver); |
2544 | 2515 | ||
2545 | rbd_obj_notify_ack(rbd_dev, hver, notify_id); | 2516 | rbd_obj_notify_ack(rbd_dev, hver, notify_id); |
2546 | } | 2517 | } |
2547 | 2518 | ||
2548 | /* | 2519 | /* |
2549 | * Request sync osd watch/unwatch. The value of "start" determines | 2520 | * Request sync osd watch/unwatch. The value of "start" determines |
2550 | * whether a watch request is being initiated or torn down. | 2521 | * whether a watch request is being initiated or torn down. |
2551 | */ | 2522 | */ |
2552 | static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) | 2523 | static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) |
2553 | { | 2524 | { |
2554 | struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; | 2525 | struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; |
2555 | struct rbd_obj_request *obj_request; | 2526 | struct rbd_obj_request *obj_request; |
2556 | int ret; | 2527 | int ret; |
2557 | 2528 | ||
2558 | rbd_assert(start ^ !!rbd_dev->watch_event); | 2529 | rbd_assert(start ^ !!rbd_dev->watch_event); |
2559 | rbd_assert(start ^ !!rbd_dev->watch_request); | 2530 | rbd_assert(start ^ !!rbd_dev->watch_request); |
2560 | 2531 | ||
2561 | if (start) { | 2532 | if (start) { |
2562 | ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, | 2533 | ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, |
2563 | &rbd_dev->watch_event); | 2534 | &rbd_dev->watch_event); |
2564 | if (ret < 0) | 2535 | if (ret < 0) |
2565 | return ret; | 2536 | return ret; |
2566 | rbd_assert(rbd_dev->watch_event != NULL); | 2537 | rbd_assert(rbd_dev->watch_event != NULL); |
2567 | } | 2538 | } |
2568 | 2539 | ||
2569 | ret = -ENOMEM; | 2540 | ret = -ENOMEM; |
2570 | obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, | 2541 | obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, |
2571 | OBJ_REQUEST_NODATA); | 2542 | OBJ_REQUEST_NODATA); |
2572 | if (!obj_request) | 2543 | if (!obj_request) |
2573 | goto out_cancel; | 2544 | goto out_cancel; |
2574 | 2545 | ||
2575 | obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request); | 2546 | obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request); |
2576 | if (!obj_request->osd_req) | 2547 | if (!obj_request->osd_req) |
2577 | goto out_cancel; | 2548 | goto out_cancel; |
2578 | 2549 | ||
2579 | if (start) | 2550 | if (start) |
2580 | ceph_osdc_set_request_linger(osdc, obj_request->osd_req); | 2551 | ceph_osdc_set_request_linger(osdc, obj_request->osd_req); |
2581 | else | 2552 | else |
2582 | ceph_osdc_unregister_linger_request(osdc, | 2553 | ceph_osdc_unregister_linger_request(osdc, |
2583 | rbd_dev->watch_request->osd_req); | 2554 | rbd_dev->watch_request->osd_req); |
2584 | 2555 | ||
2585 | osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, | 2556 | osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, |
2586 | rbd_dev->watch_event->cookie, | 2557 | rbd_dev->watch_event->cookie, |
2587 | rbd_dev->header.obj_version, start); | 2558 | rbd_dev->header.obj_version, start); |
2588 | rbd_osd_req_format_write(obj_request); | 2559 | rbd_osd_req_format_write(obj_request); |
2589 | 2560 | ||
2590 | ret = rbd_obj_request_submit(osdc, obj_request); | 2561 | ret = rbd_obj_request_submit(osdc, obj_request); |
2591 | if (ret) | 2562 | if (ret) |
2592 | goto out_cancel; | 2563 | goto out_cancel; |
2593 | ret = rbd_obj_request_wait(obj_request); | 2564 | ret = rbd_obj_request_wait(obj_request); |
2594 | if (ret) | 2565 | if (ret) |
2595 | goto out_cancel; | 2566 | goto out_cancel; |
2596 | ret = obj_request->result; | 2567 | ret = obj_request->result; |
2597 | if (ret) | 2568 | if (ret) |
2598 | goto out_cancel; | 2569 | goto out_cancel; |
2599 | 2570 | ||
2600 | /* | 2571 | /* |
2601 | * A watch request is set to linger, so the underlying osd | 2572 | * A watch request is set to linger, so the underlying osd |
2602 | * request won't go away until we unregister it. We retain | 2573 | * request won't go away until we unregister it. We retain |
2603 | * a pointer to the object request during that time (in | 2574 | * a pointer to the object request during that time (in |
2604 | * rbd_dev->watch_request), so we'll keep a reference to | 2575 | * rbd_dev->watch_request), so we'll keep a reference to |
2605 | * it. We'll drop that reference (below) after we've | 2576 | * it. We'll drop that reference (below) after we've |
2606 | * unregistered it. | 2577 | * unregistered it. |
2607 | */ | 2578 | */ |
2608 | if (start) { | 2579 | if (start) { |
2609 | rbd_dev->watch_request = obj_request; | 2580 | rbd_dev->watch_request = obj_request; |
2610 | 2581 | ||
2611 | return 0; | 2582 | return 0; |
2612 | } | 2583 | } |
2613 | 2584 | ||
2614 | /* We have successfully torn down the watch request */ | 2585 | /* We have successfully torn down the watch request */ |
2615 | 2586 | ||
2616 | rbd_obj_request_put(rbd_dev->watch_request); | 2587 | rbd_obj_request_put(rbd_dev->watch_request); |
2617 | rbd_dev->watch_request = NULL; | 2588 | rbd_dev->watch_request = NULL; |
2618 | out_cancel: | 2589 | out_cancel: |
2619 | /* Cancel the event if we're tearing down, or on error */ | 2590 | /* Cancel the event if we're tearing down, or on error */ |
2620 | ceph_osdc_cancel_event(rbd_dev->watch_event); | 2591 | ceph_osdc_cancel_event(rbd_dev->watch_event); |
2621 | rbd_dev->watch_event = NULL; | 2592 | rbd_dev->watch_event = NULL; |
2622 | if (obj_request) | 2593 | if (obj_request) |
2623 | rbd_obj_request_put(obj_request); | 2594 | rbd_obj_request_put(obj_request); |
2624 | 2595 | ||
2625 | return ret; | 2596 | return ret; |
2626 | } | 2597 | } |
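
The call sites that start and stop this watch are outside this hunk. As a hedged sketch of how the helper above is driven (the example_* wrapper names are illustrative, not part of the driver):

    /* Hypothetical wrappers; rbd_dev_header_watch_sync() is the real helper. */
    static int example_start_header_watch(struct rbd_device *rbd_dev)
    {
            /* Register a lingering watch on the image header object. */
            return rbd_dev_header_watch_sync(rbd_dev, 1);
    }

    static int example_stop_header_watch(struct rbd_device *rbd_dev)
    {
            /* Unregister the linger request and cancel the osd event. */
            return rbd_dev_header_watch_sync(rbd_dev, 0);
    }
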
2627 | 2598 | ||
2628 | /* | 2599 | /* |
2629 | * Synchronous osd object method call. Returns the number of bytes | 2600 | * Synchronous osd object method call. Returns the number of bytes |
2630 | * returned in the inbound buffer, or a negative error code. | 2600 | * returned in the inbound buffer, or a negative error code. |
2631 | */ | 2602 | */ |
2632 | static int rbd_obj_method_sync(struct rbd_device *rbd_dev, | 2603 | static int rbd_obj_method_sync(struct rbd_device *rbd_dev, |
2633 | const char *object_name, | 2604 | const char *object_name, |
2634 | const char *class_name, | 2605 | const char *class_name, |
2635 | const char *method_name, | 2606 | const char *method_name, |
2636 | const void *outbound, | 2607 | const void *outbound, |
2637 | size_t outbound_size, | 2608 | size_t outbound_size, |
2638 | void *inbound, | 2609 | void *inbound, |
2639 | size_t inbound_size, | 2610 | size_t inbound_size, |
2640 | u64 *version) | 2611 | u64 *version) |
2641 | { | 2612 | { |
2642 | struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; | 2613 | struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; |
2643 | struct rbd_obj_request *obj_request; | 2614 | struct rbd_obj_request *obj_request; |
2644 | struct page **pages; | 2615 | struct page **pages; |
2645 | u32 page_count; | 2616 | u32 page_count; |
2646 | int ret; | 2617 | int ret; |
2647 | 2618 | ||
2648 | /* | 2619 | /* |
2649 | * Method calls are ultimately read operations. The result | 2620 | * Method calls are ultimately read operations. The result |
2650 | * should be placed into the inbound buffer provided. They | 2621 | * should be placed into the inbound buffer provided. They |
2651 | * also supply outbound data--parameters for the object | 2622 | * also supply outbound data--parameters for the object |
2652 | * method. Currently if this is present it will be a | 2623 | * method. Currently if this is present it will be a |
2653 | * snapshot id. | 2624 | * snapshot id. |
2654 | */ | 2625 | */ |
2655 | page_count = (u32)calc_pages_for(0, inbound_size); | 2626 | page_count = (u32)calc_pages_for(0, inbound_size); |
2656 | pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); | 2627 | pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); |
2657 | if (IS_ERR(pages)) | 2628 | if (IS_ERR(pages)) |
2658 | return PTR_ERR(pages); | 2629 | return PTR_ERR(pages); |
2659 | 2630 | ||
2660 | ret = -ENOMEM; | 2631 | ret = -ENOMEM; |
2661 | obj_request = rbd_obj_request_create(object_name, 0, inbound_size, | 2632 | obj_request = rbd_obj_request_create(object_name, 0, inbound_size, |
2662 | OBJ_REQUEST_PAGES); | 2633 | OBJ_REQUEST_PAGES); |
2663 | if (!obj_request) | 2634 | if (!obj_request) |
2664 | goto out; | 2635 | goto out; |
2665 | 2636 | ||
2666 | obj_request->pages = pages; | 2637 | obj_request->pages = pages; |
2667 | obj_request->page_count = page_count; | 2638 | obj_request->page_count = page_count; |
2668 | 2639 | ||
2669 | obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); | 2640 | obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); |
2670 | if (!obj_request->osd_req) | 2641 | if (!obj_request->osd_req) |
2671 | goto out; | 2642 | goto out; |
2672 | 2643 | ||
2673 | osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, | 2644 | osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, |
2674 | class_name, method_name); | 2645 | class_name, method_name); |
2675 | if (outbound_size) { | 2646 | if (outbound_size) { |
2676 | struct ceph_pagelist *pagelist; | 2647 | struct ceph_pagelist *pagelist; |
2677 | 2648 | ||
2678 | pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); | 2649 | pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); |
2679 | if (!pagelist) | 2650 | if (!pagelist) |
2680 | goto out; | 2651 | goto out; |
2681 | 2652 | ||
2682 | ceph_pagelist_init(pagelist); | 2653 | ceph_pagelist_init(pagelist); |
2683 | ceph_pagelist_append(pagelist, outbound, outbound_size); | 2654 | ceph_pagelist_append(pagelist, outbound, outbound_size); |
2684 | osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0, | 2655 | osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0, |
2685 | pagelist); | 2656 | pagelist); |
2686 | } | 2657 | } |
2687 | osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, | 2658 | osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, |
2688 | obj_request->pages, inbound_size, | 2659 | obj_request->pages, inbound_size, |
2689 | 0, false, false); | 2660 | 0, false, false); |
2690 | rbd_osd_req_format_read(obj_request); | 2661 | rbd_osd_req_format_read(obj_request); |
2691 | 2662 | ||
2692 | ret = rbd_obj_request_submit(osdc, obj_request); | 2663 | ret = rbd_obj_request_submit(osdc, obj_request); |
2693 | if (ret) | 2664 | if (ret) |
2694 | goto out; | 2665 | goto out; |
2695 | ret = rbd_obj_request_wait(obj_request); | 2666 | ret = rbd_obj_request_wait(obj_request); |
2696 | if (ret) | 2667 | if (ret) |
2697 | goto out; | 2668 | goto out; |
2698 | 2669 | ||
2699 | ret = obj_request->result; | 2670 | ret = obj_request->result; |
2700 | if (ret < 0) | 2671 | if (ret < 0) |
2701 | goto out; | 2672 | goto out; |
2702 | 2673 | ||
2703 | rbd_assert(obj_request->xferred < (u64)INT_MAX); | 2674 | rbd_assert(obj_request->xferred < (u64)INT_MAX); |
2704 | ret = (int)obj_request->xferred; | 2675 | ret = (int)obj_request->xferred; |
2705 | ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); | 2676 | ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); |
2706 | if (version) | 2677 | if (version) |
2707 | *version = obj_request->version; | 2678 | *version = obj_request->version; |
2708 | out: | 2679 | out: |
2709 | if (obj_request) | 2680 | if (obj_request) |
2710 | rbd_obj_request_put(obj_request); | 2681 | rbd_obj_request_put(obj_request); |
2711 | else | 2682 | else |
2712 | ceph_release_page_vector(pages, page_count); | 2683 | ceph_release_page_vector(pages, page_count); |
2713 | 2684 | ||
2714 | return ret; | 2685 | return ret; |
2715 | } | 2686 | } |
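
For a sense of how this is used: elsewhere in the driver, image metadata is fetched through class methods such as "rbd"/"get_size". A hedged sketch along those lines (example_get_size() and its buffer layout are illustrative assumptions, not code from this commit):

    /* Query an image's size for one snapshot via the "rbd" object class. */
    static int example_get_size(struct rbd_device *rbd_dev, u64 snap_id)
    {
            __le64 snapid = cpu_to_le64(snap_id);
            struct {
                    u8 order;
                    __le64 size;
            } __attribute__ ((packed)) size_buf;
            int ret;

            ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
                                    "rbd", "get_size",
                                    &snapid, sizeof (snapid),
                                    &size_buf, sizeof (size_buf), NULL);
            /* On success ret is the byte count placed in size_buf. */
            return ret < 0 ? ret : 0;
    }
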
2716 | 2687 | ||
2717 | static void rbd_request_fn(struct request_queue *q) | 2688 | static void rbd_request_fn(struct request_queue *q) |
2718 | __releases(q->queue_lock) __acquires(q->queue_lock) | 2689 | __releases(q->queue_lock) __acquires(q->queue_lock) |
2719 | { | 2690 | { |
2720 | struct rbd_device *rbd_dev = q->queuedata; | 2691 | struct rbd_device *rbd_dev = q->queuedata; |
2721 | bool read_only = rbd_dev->mapping.read_only; | 2692 | bool read_only = rbd_dev->mapping.read_only; |
2722 | struct request *rq; | 2693 | struct request *rq; |
2723 | int result; | 2694 | int result; |
2724 | 2695 | ||
2725 | while ((rq = blk_fetch_request(q))) { | 2696 | while ((rq = blk_fetch_request(q))) { |
2726 | bool write_request = rq_data_dir(rq) == WRITE; | 2697 | bool write_request = rq_data_dir(rq) == WRITE; |
2727 | struct rbd_img_request *img_request; | 2698 | struct rbd_img_request *img_request; |
2728 | u64 offset; | 2699 | u64 offset; |
2729 | u64 length; | 2700 | u64 length; |
2730 | 2701 | ||
2731 | /* Ignore any non-FS requests that filter through. */ | 2702 | /* Ignore any non-FS requests that filter through. */ |
2732 | 2703 | ||
2733 | if (rq->cmd_type != REQ_TYPE_FS) { | 2704 | if (rq->cmd_type != REQ_TYPE_FS) { |
2734 | dout("%s: non-fs request type %d\n", __func__, | 2705 | dout("%s: non-fs request type %d\n", __func__, |
2735 | (int) rq->cmd_type); | 2706 | (int) rq->cmd_type); |
2736 | __blk_end_request_all(rq, 0); | 2707 | __blk_end_request_all(rq, 0); |
2737 | continue; | 2708 | continue; |
2738 | } | 2709 | } |
2739 | 2710 | ||
2740 | /* Ignore/skip any zero-length requests */ | 2711 | /* Ignore/skip any zero-length requests */ |
2741 | 2712 | ||
2742 | offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT; | 2713 | offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT; |
2743 | length = (u64) blk_rq_bytes(rq); | 2714 | length = (u64) blk_rq_bytes(rq); |
2744 | 2715 | ||
2745 | if (!length) { | 2716 | if (!length) { |
2746 | dout("%s: zero-length request\n", __func__); | 2717 | dout("%s: zero-length request\n", __func__); |
2747 | __blk_end_request_all(rq, 0); | 2718 | __blk_end_request_all(rq, 0); |
2748 | continue; | 2719 | continue; |
2749 | } | 2720 | } |
2750 | 2721 | ||
2751 | spin_unlock_irq(q->queue_lock); | 2722 | spin_unlock_irq(q->queue_lock); |
2752 | 2723 | ||
2753 | /* Disallow writes to a read-only device */ | 2724 | /* Disallow writes to a read-only device */ |
2754 | 2725 | ||
2755 | if (write_request) { | 2726 | if (write_request) { |
2756 | result = -EROFS; | 2727 | result = -EROFS; |
2757 | if (read_only) | 2728 | if (read_only) |
2758 | goto end_request; | 2729 | goto end_request; |
2759 | rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); | 2730 | rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); |
2760 | } | 2731 | } |
2761 | 2732 | ||
2762 | /* | 2733 | /* |
2763 | * Quit early if the mapped snapshot no longer | 2734 | * Quit early if the mapped snapshot no longer |
2764 | * exists. It's still possible the snapshot will | 2735 | * exists. It's still possible the snapshot will |
2765 | * have disappeared by the time our request arrives | 2736 | * have disappeared by the time our request arrives |
2766 | * at the osd, but there's no sense in sending it if | 2737 | * at the osd, but there's no sense in sending it if |
2767 | * we already know. | 2738 | * we already know. |
2768 | */ | 2739 | */ |
2769 | if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { | 2740 | if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { |
2770 | dout("request for non-existent snapshot"); | 2741 | dout("request for non-existent snapshot"); |
2771 | rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); | 2742 | rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); |
2772 | result = -ENXIO; | 2743 | result = -ENXIO; |
2773 | goto end_request; | 2744 | goto end_request; |
2774 | } | 2745 | } |
2775 | 2746 | ||
2776 | result = -EINVAL; | 2747 | result = -EINVAL; |
2777 | if (offset && length > U64_MAX - offset + 1) { | 2748 | if (offset && length > U64_MAX - offset + 1) { |
2778 | rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n", | 2749 | rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n", |
2779 | offset, length); | 2750 | offset, length); |
2780 | goto end_request; /* Shouldn't happen */ | 2751 | goto end_request; /* Shouldn't happen */ |
2781 | } | 2752 | } |
2782 | 2753 | ||
2783 | result = -ENOMEM; | 2754 | result = -ENOMEM; |
2784 | img_request = rbd_img_request_create(rbd_dev, offset, length, | 2755 | img_request = rbd_img_request_create(rbd_dev, offset, length, |
2785 | write_request, false); | 2756 | write_request, false); |
2786 | if (!img_request) | 2757 | if (!img_request) |
2787 | goto end_request; | 2758 | goto end_request; |
2788 | 2759 | ||
2789 | img_request->rq = rq; | 2760 | img_request->rq = rq; |
2790 | 2761 | ||
2791 | result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, | 2762 | result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, |
2792 | rq->bio); | 2763 | rq->bio); |
2793 | if (!result) | 2764 | if (!result) |
2794 | result = rbd_img_request_submit(img_request); | 2765 | result = rbd_img_request_submit(img_request); |
2795 | if (result) | 2766 | if (result) |
2796 | rbd_img_request_put(img_request); | 2767 | rbd_img_request_put(img_request); |
2797 | end_request: | 2768 | end_request: |
2798 | spin_lock_irq(q->queue_lock); | 2769 | spin_lock_irq(q->queue_lock); |
2799 | if (result < 0) { | 2770 | if (result < 0) { |
2800 | rbd_warn(rbd_dev, "%s %llx at %llx result %d\n", | 2771 | rbd_warn(rbd_dev, "%s %llx at %llx result %d\n", |
2801 | write_request ? "write" : "read", | 2772 | write_request ? "write" : "read", |
2802 | length, offset, result); | 2773 | length, offset, result); |
2803 | 2774 | ||
2804 | __blk_end_request_all(rq, result); | 2775 | __blk_end_request_all(rq, result); |
2805 | } | 2776 | } |
2806 | } | 2777 | } |
2807 | } | 2778 | } |
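
The -EINVAL test above is an overflow guard on the request range. As a worked restatement (rbd_range_wraps() is an illustrative name; the arithmetic is exactly the check above):

    /*
     * A request covering [offset, offset + length - 1] wraps u64
     * arithmetic when length exceeds the room left above offset.
     * Example: offset = U64_MAX - 3 leaves room for at most 4 bytes,
     * so length = 5 must be rejected.
     */
    static inline bool rbd_range_wraps(u64 offset, u64 length)
    {
            return offset && length > U64_MAX - offset + 1;
    }
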
2808 | 2779 | ||
2809 | /* | 2780 | /* |
2810 | * A queue callback. Makes sure that we don't create a bio that spans | 2781 | * A queue callback. Makes sure that we don't create a bio that spans |
2811 | * multiple osd objects. One exception is single-page bios, | 2782 | * multiple osd objects. One exception is single-page bios, |
2812 | * which we handle later in bio_chain_clone_range() | 2783 | * which we handle later in bio_chain_clone_range() |
2813 | */ | 2784 | */ |
2814 | static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, | 2785 | static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, |
2815 | struct bio_vec *bvec) | 2786 | struct bio_vec *bvec) |
2816 | { | 2787 | { |
2817 | struct rbd_device *rbd_dev = q->queuedata; | 2788 | struct rbd_device *rbd_dev = q->queuedata; |
2818 | sector_t sector_offset; | 2789 | sector_t sector_offset; |
2819 | sector_t sectors_per_obj; | 2790 | sector_t sectors_per_obj; |
2820 | sector_t obj_sector_offset; | 2791 | sector_t obj_sector_offset; |
2821 | int ret; | 2792 | int ret; |
2822 | 2793 | ||
2823 | /* | 2794 | /* |
2824 | * Find how far into its rbd object the bio's start sector falls. | 2795 | * Find how far into its rbd object the bio's start sector falls. |
2825 | * The bio sector is partition-relative, so first convert it to an | 2796 | * The bio sector is partition-relative, so first convert it to an |
2826 | * offset relative to the enclosing device. | 2797 | * offset relative to the enclosing device. |
2827 | */ | 2798 | */ |
2828 | sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector; | 2799 | sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector; |
2829 | sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); | 2800 | sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); |
2830 | obj_sector_offset = sector_offset & (sectors_per_obj - 1); | 2801 | obj_sector_offset = sector_offset & (sectors_per_obj - 1); |
2831 | 2802 | ||
2832 | /* | 2803 | /* |
2833 | * Compute the number of bytes from that offset to the end | 2804 | * Compute the number of bytes from that offset to the end |
2834 | * of the object. Account for what's already used by the bio. | 2805 | * of the object. Account for what's already used by the bio. |
2835 | */ | 2806 | */ |
2836 | ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT; | 2807 | ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT; |
2837 | if (ret > bmd->bi_size) | 2808 | if (ret > bmd->bi_size) |
2838 | ret -= bmd->bi_size; | 2809 | ret -= bmd->bi_size; |
2839 | else | 2810 | else |
2840 | ret = 0; | 2811 | ret = 0; |
2841 | 2812 | ||
2842 | /* | 2813 | /* |
2843 | * Don't send back more than was asked for. And if the bio | 2814 | * Don't send back more than was asked for. And if the bio |
2844 | * was empty, let the whole thing through because: "Note | 2815 | * was empty, let the whole thing through because: "Note |
2845 | * that a block device *must* allow a single page to be | 2816 | * that a block device *must* allow a single page to be |
2846 | * added to an empty bio." | 2817 | * added to an empty bio." |
2847 | */ | 2818 | */ |
2848 | rbd_assert(bvec->bv_len <= PAGE_SIZE); | 2819 | rbd_assert(bvec->bv_len <= PAGE_SIZE); |
2849 | if (ret > (int) bvec->bv_len || !bmd->bi_size) | 2820 | if (ret > (int) bvec->bv_len || !bmd->bi_size) |
2850 | ret = (int) bvec->bv_len; | 2821 | ret = (int) bvec->bv_len; |
2851 | 2822 | ||
2852 | return ret; | 2823 | return ret; |
2853 | } | 2824 | } |
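
To make the arithmetic concrete, a worked example assuming the common object order of 22 (4 MiB objects; the values are illustrative):

    /*
     *  sectors_per_obj   = 1 << (22 - SECTOR_SHIFT)       = 8192
     *  sector_offset     = 12288
     *  obj_sector_offset = 12288 & (8192 - 1)             = 4096
     *  bytes to obj end  = (8192 - 4096) << SECTOR_SHIFT  = 2 MiB
     *
     * A bio already carrying bi_size bytes may therefore grow by at
     * most 2 MiB - bi_size before crossing the object boundary.
     */
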
2854 | 2825 | ||
2855 | static void rbd_free_disk(struct rbd_device *rbd_dev) | 2826 | static void rbd_free_disk(struct rbd_device *rbd_dev) |
2856 | { | 2827 | { |
2857 | struct gendisk *disk = rbd_dev->disk; | 2828 | struct gendisk *disk = rbd_dev->disk; |
2858 | 2829 | ||
2859 | if (!disk) | 2830 | if (!disk) |
2860 | return; | 2831 | return; |
2861 | 2832 | ||
2862 | rbd_dev->disk = NULL; | 2833 | rbd_dev->disk = NULL; |
2863 | if (disk->flags & GENHD_FL_UP) { | 2834 | if (disk->flags & GENHD_FL_UP) { |
2864 | del_gendisk(disk); | 2835 | del_gendisk(disk); |
2865 | if (disk->queue) | 2836 | if (disk->queue) |
2866 | blk_cleanup_queue(disk->queue); | 2837 | blk_cleanup_queue(disk->queue); |
2867 | } | 2838 | } |
2868 | put_disk(disk); | 2839 | put_disk(disk); |
2869 | } | 2840 | } |
2870 | 2841 | ||
2871 | static int rbd_obj_read_sync(struct rbd_device *rbd_dev, | 2842 | static int rbd_obj_read_sync(struct rbd_device *rbd_dev, |
2872 | const char *object_name, | 2843 | const char *object_name, |
2873 | u64 offset, u64 length, | 2844 | u64 offset, u64 length, |
2874 | void *buf, u64 *version) | 2845 | void *buf, u64 *version) |
2875 | 2846 | ||
2876 | { | 2847 | { |
2877 | struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; | 2848 | struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; |
2878 | struct rbd_obj_request *obj_request; | 2849 | struct rbd_obj_request *obj_request; |
2879 | struct page **pages = NULL; | 2850 | struct page **pages = NULL; |
2880 | u32 page_count; | 2851 | u32 page_count; |
2881 | size_t size; | 2852 | size_t size; |
2882 | int ret; | 2853 | int ret; |
2883 | 2854 | ||
2884 | page_count = (u32) calc_pages_for(offset, length); | 2855 | page_count = (u32) calc_pages_for(offset, length); |
2885 | pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); | 2856 | pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); |
2886 | if (IS_ERR(pages)) | 2857 | if (IS_ERR(pages)) |
2887 | return PTR_ERR(pages); | 2858 | return PTR_ERR(pages); |
2888 | 2859 | ||
2889 | ret = -ENOMEM; | 2860 | ret = -ENOMEM; |
2890 | obj_request = rbd_obj_request_create(object_name, offset, length, | 2861 | obj_request = rbd_obj_request_create(object_name, offset, length, |
2891 | OBJ_REQUEST_PAGES); | 2862 | OBJ_REQUEST_PAGES); |
2892 | if (!obj_request) | 2863 | if (!obj_request) |
2893 | goto out; | 2864 | goto out; |
2894 | 2865 | ||
2895 | obj_request->pages = pages; | 2866 | obj_request->pages = pages; |
2896 | obj_request->page_count = page_count; | 2867 | obj_request->page_count = page_count; |
2897 | 2868 | ||
2898 | obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); | 2869 | obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); |
2899 | if (!obj_request->osd_req) | 2870 | if (!obj_request->osd_req) |
2900 | goto out; | 2871 | goto out; |
2901 | 2872 | ||
2902 | osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, | 2873 | osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, |
2903 | offset, length, 0, 0); | 2874 | offset, length, 0, 0); |
2904 | osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, | 2875 | osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, |
2905 | obj_request->pages, | 2876 | obj_request->pages, |
2906 | obj_request->length, | 2877 | obj_request->length, |
2907 | obj_request->offset & ~PAGE_MASK, | 2878 | obj_request->offset & ~PAGE_MASK, |
2908 | false, false); | 2879 | false, false); |
2909 | rbd_osd_req_format_read(obj_request); | 2880 | rbd_osd_req_format_read(obj_request); |
2910 | 2881 | ||
2911 | ret = rbd_obj_request_submit(osdc, obj_request); | 2882 | ret = rbd_obj_request_submit(osdc, obj_request); |
2912 | if (ret) | 2883 | if (ret) |
2913 | goto out; | 2884 | goto out; |
2914 | ret = rbd_obj_request_wait(obj_request); | 2885 | ret = rbd_obj_request_wait(obj_request); |
2915 | if (ret) | 2886 | if (ret) |
2916 | goto out; | 2887 | goto out; |
2917 | 2888 | ||
2918 | ret = obj_request->result; | 2889 | ret = obj_request->result; |
2919 | if (ret < 0) | 2890 | if (ret < 0) |
2920 | goto out; | 2891 | goto out; |
2921 | 2892 | ||
2922 | rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); | 2893 | rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); |
2923 | size = (size_t) obj_request->xferred; | 2894 | size = (size_t) obj_request->xferred; |
2924 | ceph_copy_from_page_vector(pages, buf, 0, size); | 2895 | ceph_copy_from_page_vector(pages, buf, 0, size); |
2925 | rbd_assert(size <= (size_t) INT_MAX); | 2896 | rbd_assert(size <= (size_t) INT_MAX); |
2926 | ret = (int) size; | 2897 | ret = (int) size; |
2927 | if (version) | 2898 | if (version) |
2928 | *version = obj_request->version; | 2899 | *version = obj_request->version; |
2929 | out: | 2900 | out: |
2930 | if (obj_request) | 2901 | if (obj_request) |
2931 | rbd_obj_request_put(obj_request); | 2902 | rbd_obj_request_put(obj_request); |
2932 | else | 2903 | else |
2933 | ceph_release_page_vector(pages, page_count); | 2904 | ceph_release_page_vector(pages, page_count); |
2934 | 2905 | ||
2935 | return ret; | 2906 | return ret; |
2936 | } | 2907 | } |
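
calc_pages_for() above is libceph's page-count helper; a quick worked example of the math, assuming 4 KiB pages:

    /*
     * calc_pages_for(offset, length) counts the pages a byte range
     * touches.  With 4 KiB pages, offset = 512 and length = 8192
     * cover bytes 512..8703, touching pages 0, 1 and 2, so
     * page_count = 3.
     */
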
2937 | 2908 | ||
2938 | /* | 2909 | /* |
2939 | * Read the complete header for the given rbd device. | 2910 | * Read the complete header for the given rbd device. |
2940 | * | 2911 | * |
2941 | * Returns a pointer to a dynamically-allocated buffer containing | 2912 | * Returns a pointer to a dynamically-allocated buffer containing |
2942 | * the complete and validated header. Caller can pass the address | 2913 | * the complete and validated header. Caller can pass the address |
2943 | * of a variable that will be filled in with the version of the | 2914 | * of a variable that will be filled in with the version of the |
2944 | * header object at the time it was read. | 2915 | * header object at the time it was read. |
2945 | * | 2916 | * |
2946 | * Returns a pointer-coded errno if a failure occurs. | 2917 | * Returns a pointer-coded errno if a failure occurs. |
2947 | */ | 2918 | */ |
2948 | static struct rbd_image_header_ondisk * | 2919 | static struct rbd_image_header_ondisk * |
2949 | rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) | 2920 | rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) |
2950 | { | 2921 | { |
2951 | struct rbd_image_header_ondisk *ondisk = NULL; | 2922 | struct rbd_image_header_ondisk *ondisk = NULL; |
2952 | u32 snap_count = 0; | 2923 | u32 snap_count = 0; |
2953 | u64 names_size = 0; | 2924 | u64 names_size = 0; |
2954 | u32 want_count; | 2925 | u32 want_count; |
2955 | int ret; | 2926 | int ret; |
2956 | 2927 | ||
2957 | /* | 2928 | /* |
2958 | * The complete header will include an array of its 64-bit | 2929 | * The complete header will include an array of its 64-bit |
2959 | * snapshot ids, followed by the names of those snapshots as | 2930 | * snapshot ids, followed by the names of those snapshots as |
2960 | * a contiguous block of NUL-terminated strings. Note that | 2931 | * a contiguous block of NUL-terminated strings. Note that |
2961 | * the number of snapshots could change by the time we read | 2932 | * the number of snapshots could change by the time we read |
2962 | * it in, in which case we re-read it. | 2933 | * it in, in which case we re-read it. |
2963 | */ | 2934 | */ |
2964 | do { | 2935 | do { |
2965 | size_t size; | 2936 | size_t size; |
2966 | 2937 | ||
2967 | kfree(ondisk); | 2938 | kfree(ondisk); |
2968 | 2939 | ||
2969 | size = sizeof (*ondisk); | 2940 | size = sizeof (*ondisk); |
2970 | size += snap_count * sizeof (struct rbd_image_snap_ondisk); | 2941 | size += snap_count * sizeof (struct rbd_image_snap_ondisk); |
2971 | size += names_size; | 2942 | size += names_size; |
2972 | ondisk = kmalloc(size, GFP_KERNEL); | 2943 | ondisk = kmalloc(size, GFP_KERNEL); |
2973 | if (!ondisk) | 2944 | if (!ondisk) |
2974 | return ERR_PTR(-ENOMEM); | 2945 | return ERR_PTR(-ENOMEM); |
2975 | 2946 | ||
2976 | ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, | 2947 | ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, |
2977 | 0, size, ondisk, version); | 2948 | 0, size, ondisk, version); |
2978 | if (ret < 0) | 2949 | if (ret < 0) |
2979 | goto out_err; | 2950 | goto out_err; |
2980 | if ((size_t)ret < size) { | 2951 | if ((size_t)ret < size) { |
2981 | ret = -ENXIO; | 2952 | ret = -ENXIO; |
2982 | rbd_warn(rbd_dev, "short header read (want %zd got %d)", | 2953 | rbd_warn(rbd_dev, "short header read (want %zd got %d)", |
2983 | size, ret); | 2954 | size, ret); |
2984 | goto out_err; | 2955 | goto out_err; |
2985 | } | 2956 | } |
2986 | if (!rbd_dev_ondisk_valid(ondisk)) { | 2957 | if (!rbd_dev_ondisk_valid(ondisk)) { |
2987 | ret = -ENXIO; | 2958 | ret = -ENXIO; |
2988 | rbd_warn(rbd_dev, "invalid header"); | 2959 | rbd_warn(rbd_dev, "invalid header"); |
2989 | goto out_err; | 2960 | goto out_err; |
2990 | } | 2961 | } |
2991 | 2962 | ||
2992 | names_size = le64_to_cpu(ondisk->snap_names_len); | 2963 | names_size = le64_to_cpu(ondisk->snap_names_len); |
2993 | want_count = snap_count; | 2964 | want_count = snap_count; |
2994 | snap_count = le32_to_cpu(ondisk->snap_count); | 2965 | snap_count = le32_to_cpu(ondisk->snap_count); |
2995 | } while (snap_count != want_count); | 2966 | } while (snap_count != want_count); |
2996 | 2967 | ||
2997 | return ondisk; | 2968 | return ondisk; |
2998 | 2969 | ||
2999 | out_err: | 2970 | out_err: |
3000 | kfree(ondisk); | 2971 | kfree(ondisk); |
3001 | 2972 | ||
3002 | return ERR_PTR(ret); | 2973 | return ERR_PTR(ret); |
3003 | } | 2974 | } |
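
A sketch of the buffer this loop is sizing (the authoritative layout is struct rbd_image_header_ondisk in rbd_types.h, which is not part of this hunk):

    /*
     *  [ fixed-size ondisk header      ]  sizeof (*ondisk)
     *  [ snap_count snapshot records   ]  snap_count * sizeof (struct rbd_image_snap_ondisk)
     *  [ NUL-terminated snapshot names ]  names_size bytes
     *
     * snap_count and names_size are only trusted once a full read
     * comes back with the same snap_count used to size the buffer.
     */
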
3004 | 2975 | ||
3005 | /* | 2976 | /* |
3006 | * reload the ondisk header | 2977 | * reload the ondisk header |
3007 | */ | 2978 | */ |
3008 | static int rbd_read_header(struct rbd_device *rbd_dev, | 2979 | static int rbd_read_header(struct rbd_device *rbd_dev, |
3009 | struct rbd_image_header *header) | 2980 | struct rbd_image_header *header) |
3010 | { | 2981 | { |
3011 | struct rbd_image_header_ondisk *ondisk; | 2982 | struct rbd_image_header_ondisk *ondisk; |
3012 | u64 ver = 0; | 2983 | u64 ver = 0; |
3013 | int ret; | 2984 | int ret; |
3014 | 2985 | ||
3015 | ondisk = rbd_dev_v1_header_read(rbd_dev, &ver); | 2986 | ondisk = rbd_dev_v1_header_read(rbd_dev, &ver); |
3016 | if (IS_ERR(ondisk)) | 2987 | if (IS_ERR(ondisk)) |
3017 | return PTR_ERR(ondisk); | 2988 | return PTR_ERR(ondisk); |
3018 | ret = rbd_header_from_disk(header, ondisk); | 2989 | ret = rbd_header_from_disk(header, ondisk); |
3019 | if (ret >= 0) | 2990 | if (ret >= 0) |
3020 | header->obj_version = ver; | 2991 | header->obj_version = ver; |
3021 | kfree(ondisk); | 2992 | kfree(ondisk); |
3022 | 2993 | ||
3023 | return ret; | 2994 | return ret; |
3024 | } | 2995 | } |
3025 | 2996 | ||
3026 | static void rbd_remove_all_snaps(struct rbd_device *rbd_dev) | 2997 | static void rbd_remove_all_snaps(struct rbd_device *rbd_dev) |
3027 | { | 2998 | { |
3028 | struct rbd_snap *snap; | 2999 | struct rbd_snap *snap; |
3029 | struct rbd_snap *next; | 3000 | struct rbd_snap *next; |
3030 | 3001 | ||
3031 | list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) { | 3002 | list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) { |
3032 | list_del(&snap->node); | 3003 | list_del(&snap->node); |
3033 | rbd_snap_destroy(snap); | 3004 | rbd_snap_destroy(snap); |
3034 | } | 3005 | } |
3035 | } | 3006 | } |
3036 | 3007 | ||
3037 | static void rbd_update_mapping_size(struct rbd_device *rbd_dev) | 3008 | static void rbd_update_mapping_size(struct rbd_device *rbd_dev) |
3038 | { | 3009 | { |
3039 | if (rbd_dev->spec->snap_id != CEPH_NOSNAP) | 3010 | if (rbd_dev->spec->snap_id != CEPH_NOSNAP) |
3040 | return; | 3011 | return; |
3041 | 3012 | ||
3042 | if (rbd_dev->mapping.size != rbd_dev->header.image_size) { | 3013 | if (rbd_dev->mapping.size != rbd_dev->header.image_size) { |
3043 | sector_t size; | 3014 | sector_t size; |
3044 | 3015 | ||
3045 | rbd_dev->mapping.size = rbd_dev->header.image_size; | 3016 | rbd_dev->mapping.size = rbd_dev->header.image_size; |
3046 | size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; | 3017 | size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; |
3047 | dout("setting size to %llu sectors", (unsigned long long)size); | 3018 | dout("setting size to %llu sectors", (unsigned long long)size); |
3048 | set_capacity(rbd_dev->disk, size); | 3019 | set_capacity(rbd_dev->disk, size); |
3049 | } | 3020 | } |
3050 | } | 3021 | } |
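
A quick worked example of the sector conversion above:

    /*
     * image_size = 1 GiB maps to
     *  size = 1073741824 / SECTOR_SIZE (512) = 2097152 sectors
     */
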
3051 | 3022 | ||
3052 | /* | 3023 | /* |
3053 | * Re-read the ondisk header and update the in-core image header, including snapshots | 3024 | * Re-read the ondisk header and update the in-core image header, including snapshots |
3054 | */ | 3025 | */ |
3055 | static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) | 3026 | static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) |
3056 | { | 3027 | { |
3057 | int ret; | 3028 | int ret; |
3058 | struct rbd_image_header h; | 3029 | struct rbd_image_header h; |
3059 | 3030 | ||
3060 | ret = rbd_read_header(rbd_dev, &h); | 3031 | ret = rbd_read_header(rbd_dev, &h); |
3061 | if (ret < 0) | 3032 | if (ret < 0) |
3062 | return ret; | 3033 | return ret; |
3063 | 3034 | ||
3064 | down_write(&rbd_dev->header_rwsem); | 3035 | down_write(&rbd_dev->header_rwsem); |
3065 | 3036 | ||
3066 | /* Update image size, and check for resize of mapped image */ | 3037 | /* Update image size, and check for resize of mapped image */ |
3067 | rbd_dev->header.image_size = h.image_size; | 3038 | rbd_dev->header.image_size = h.image_size; |
3068 | rbd_update_mapping_size(rbd_dev); | 3039 | rbd_update_mapping_size(rbd_dev); |
3069 | 3040 | ||
3070 | /* rbd_dev->header.object_prefix shouldn't change */ | 3041 | /* rbd_dev->header.object_prefix shouldn't change */ |
3071 | kfree(rbd_dev->header.snap_sizes); | 3042 | kfree(rbd_dev->header.snap_sizes); |
3072 | kfree(rbd_dev->header.snap_names); | 3043 | kfree(rbd_dev->header.snap_names); |
3073 | /* osd requests may still refer to snapc */ | 3044 | /* osd requests may still refer to snapc */ |
3074 | rbd_snap_context_put(rbd_dev->header.snapc); | 3045 | ceph_put_snap_context(rbd_dev->header.snapc); |
3075 | 3046 | ||
3076 | if (hver) | 3047 | if (hver) |
3077 | *hver = h.obj_version; | 3048 | *hver = h.obj_version; |
3078 | rbd_dev->header.obj_version = h.obj_version; | 3049 | rbd_dev->header.obj_version = h.obj_version; |
3079 | rbd_dev->header.image_size = h.image_size; | 3050 | rbd_dev->header.image_size = h.image_size; |
3080 | rbd_dev->header.snapc = h.snapc; | 3051 | rbd_dev->header.snapc = h.snapc; |
3081 | rbd_dev->header.snap_names = h.snap_names; | 3052 | rbd_dev->header.snap_names = h.snap_names; |
3082 | rbd_dev->header.snap_sizes = h.snap_sizes; | 3053 | rbd_dev->header.snap_sizes = h.snap_sizes; |
3083 | /* Free the extra copy of the object prefix */ | 3054 | /* Free the extra copy of the object prefix */ |
3084 | if (strcmp(rbd_dev->header.object_prefix, h.object_prefix)) | 3055 | if (strcmp(rbd_dev->header.object_prefix, h.object_prefix)) |
3085 | rbd_warn(rbd_dev, "object prefix changed (ignoring)"); | 3056 | rbd_warn(rbd_dev, "object prefix changed (ignoring)"); |
3086 | kfree(h.object_prefix); | 3057 | kfree(h.object_prefix); |
3087 | 3058 | ||
3088 | ret = rbd_dev_snaps_update(rbd_dev); | 3059 | ret = rbd_dev_snaps_update(rbd_dev); |
3089 | 3060 | ||
3090 | up_write(&rbd_dev->header_rwsem); | 3061 | up_write(&rbd_dev->header_rwsem); |
3091 | 3062 | ||
3092 | return ret; | 3063 | return ret; |
3093 | } | 3064 | } |
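
The one functional change in this hunk, rbd_snap_context_put() becoming ceph_put_snap_context(), is the point of this commit: snap contexts are now refcounted by libceph. A hedged sketch of that lifecycle as introduced by this series (example_new_snapc() is illustrative):

    static int example_new_snapc(u32 snap_count, u64 seq)
    {
            struct ceph_snap_context *snapc;

            /* Allocates the context plus room for snap_count ids; nref = 1. */
            snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
            if (!snapc)
                    return -ENOMEM;
            snapc->seq = seq;       /* caller fills seq and snapc->snaps[] */

            /* ... install in rbd_dev->header.snapc, issue osd requests ... */

            ceph_put_snap_context(snapc);   /* drop this reference */
            return 0;
    }
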
3094 | 3065 | ||
3095 | static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver) | 3066 | static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver) |
3096 | { | 3067 | { |
3097 | int ret; | 3068 | int ret; |
3098 | 3069 | ||
3099 | rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); | 3070 | rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); |
3100 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); | 3071 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); |
3101 | if (rbd_dev->image_format == 1) | 3072 | if (rbd_dev->image_format == 1) |
3102 | ret = rbd_dev_v1_refresh(rbd_dev, hver); | 3073 | ret = rbd_dev_v1_refresh(rbd_dev, hver); |
3103 | else | 3074 | else |
3104 | ret = rbd_dev_v2_refresh(rbd_dev, hver); | 3075 | ret = rbd_dev_v2_refresh(rbd_dev, hver); |
3105 | mutex_unlock(&ctl_mutex); | 3076 | mutex_unlock(&ctl_mutex); |
3106 | revalidate_disk(rbd_dev->disk); | 3077 | revalidate_disk(rbd_dev->disk); |
3107 | if (ret) | 3078 | if (ret) |
3108 | rbd_warn(rbd_dev, "got notification but failed to " | 3079 | rbd_warn(rbd_dev, "got notification but failed to " |
3109 | " update snaps: %d\n", ret); | 3080 | " update snaps: %d\n", ret); |
3110 | 3081 | ||
3111 | return ret; | 3082 | return ret; |
3112 | } | 3083 | } |
3113 | 3084 | ||
3114 | static int rbd_init_disk(struct rbd_device *rbd_dev) | 3085 | static int rbd_init_disk(struct rbd_device *rbd_dev) |
3115 | { | 3086 | { |
3116 | struct gendisk *disk; | 3087 | struct gendisk *disk; |
3117 | struct request_queue *q; | 3088 | struct request_queue *q; |
3118 | u64 segment_size; | 3089 | u64 segment_size; |
3119 | 3090 | ||
3120 | /* create gendisk info */ | 3091 | /* create gendisk info */ |
3121 | disk = alloc_disk(RBD_MINORS_PER_MAJOR); | 3092 | disk = alloc_disk(RBD_MINORS_PER_MAJOR); |
3122 | if (!disk) | 3093 | if (!disk) |
3123 | return -ENOMEM; | 3094 | return -ENOMEM; |
3124 | 3095 | ||
3125 | snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", | 3096 | snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", |
3126 | rbd_dev->dev_id); | 3097 | rbd_dev->dev_id); |
3127 | disk->major = rbd_dev->major; | 3098 | disk->major = rbd_dev->major; |
3128 | disk->first_minor = 0; | 3099 | disk->first_minor = 0; |
3129 | disk->fops = &rbd_bd_ops; | 3100 | disk->fops = &rbd_bd_ops; |
3130 | disk->private_data = rbd_dev; | 3101 | disk->private_data = rbd_dev; |
3131 | 3102 | ||
3132 | q = blk_init_queue(rbd_request_fn, &rbd_dev->lock); | 3103 | q = blk_init_queue(rbd_request_fn, &rbd_dev->lock); |
3133 | if (!q) | 3104 | if (!q) |
3134 | goto out_disk; | 3105 | goto out_disk; |
3135 | 3106 | ||
3136 | /* We use the default size, but let's be explicit about it. */ | 3107 | /* We use the default size, but let's be explicit about it. */ |
3137 | blk_queue_physical_block_size(q, SECTOR_SIZE); | 3108 | blk_queue_physical_block_size(q, SECTOR_SIZE); |
3138 | 3109 | ||
3139 | /* set io sizes to object size */ | 3110 | /* set io sizes to object size */ |
3140 | segment_size = rbd_obj_bytes(&rbd_dev->header); | 3111 | segment_size = rbd_obj_bytes(&rbd_dev->header); |
3141 | blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); | 3112 | blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); |
3142 | blk_queue_max_segment_size(q, segment_size); | 3113 | blk_queue_max_segment_size(q, segment_size); |
3143 | blk_queue_io_min(q, segment_size); | 3114 | blk_queue_io_min(q, segment_size); |
3144 | blk_queue_io_opt(q, segment_size); | 3115 | blk_queue_io_opt(q, segment_size); |
3145 | 3116 | ||
3146 | blk_queue_merge_bvec(q, rbd_merge_bvec); | 3117 | blk_queue_merge_bvec(q, rbd_merge_bvec); |
3147 | disk->queue = q; | 3118 | disk->queue = q; |
3148 | 3119 | ||
3149 | q->queuedata = rbd_dev; | 3120 | q->queuedata = rbd_dev; |
3150 | 3121 | ||
3151 | rbd_dev->disk = disk; | 3122 | rbd_dev->disk = disk; |
3152 | 3123 | ||
3153 | return 0; | 3124 | return 0; |
3154 | out_disk: | 3125 | out_disk: |
3155 | put_disk(disk); | 3126 | put_disk(disk); |
3156 | 3127 | ||
3157 | return -ENOMEM; | 3128 | return -ENOMEM; |
3158 | } | 3129 | } |
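
Plugged-in numbers for the queue limits above (assuming the common object order of 22):

    /*
     * segment_size = rbd_obj_bytes() = 4 MiB, so max_hw_sectors is
     * 4194304 / 512 = 8192 and io_min/io_opt are 4 MiB; together
     * with rbd_merge_bvec() this keeps a bio inside one rbd object.
     */
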
3159 | 3130 | ||
3160 | /* | 3131 | /* |
3161 | sysfs | 3132 | sysfs |
3162 | */ | 3133 | */ |
3163 | 3134 | ||
3164 | static struct rbd_device *dev_to_rbd_dev(struct device *dev) | 3135 | static struct rbd_device *dev_to_rbd_dev(struct device *dev) |
3165 | { | 3136 | { |
3166 | return container_of(dev, struct rbd_device, dev); | 3137 | return container_of(dev, struct rbd_device, dev); |
3167 | } | 3138 | } |
3168 | 3139 | ||
3169 | static ssize_t rbd_size_show(struct device *dev, | 3140 | static ssize_t rbd_size_show(struct device *dev, |
3170 | struct device_attribute *attr, char *buf) | 3141 | struct device_attribute *attr, char *buf) |
3171 | { | 3142 | { |
3172 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); | 3143 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); |
3173 | 3144 | ||
3174 | return sprintf(buf, "%llu\n", | 3145 | return sprintf(buf, "%llu\n", |
3175 | (unsigned long long)rbd_dev->mapping.size); | 3146 | (unsigned long long)rbd_dev->mapping.size); |
3176 | } | 3147 | } |
3177 | 3148 | ||
3178 | /* | 3149 | /* |
3179 | * Note this shows the features for whatever's mapped, which is not | 3150 | * Note this shows the features for whatever's mapped, which is not |
3180 | * necessarily the base image. | 3151 | * necessarily the base image. |
3181 | */ | 3152 | */ |
3182 | static ssize_t rbd_features_show(struct device *dev, | 3153 | static ssize_t rbd_features_show(struct device *dev, |
3183 | struct device_attribute *attr, char *buf) | 3154 | struct device_attribute *attr, char *buf) |
3184 | { | 3155 | { |
3185 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); | 3156 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); |
3186 | 3157 | ||
3187 | return sprintf(buf, "0x%016llx\n", | 3158 | return sprintf(buf, "0x%016llx\n", |
3188 | (unsigned long long)rbd_dev->mapping.features); | 3159 | (unsigned long long)rbd_dev->mapping.features); |
3189 | } | 3160 | } |
3190 | 3161 | ||
3191 | static ssize_t rbd_major_show(struct device *dev, | 3162 | static ssize_t rbd_major_show(struct device *dev, |
3192 | struct device_attribute *attr, char *buf) | 3163 | struct device_attribute *attr, char *buf) |
3193 | { | 3164 | { |
3194 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); | 3165 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); |
3195 | 3166 | ||
3196 | if (rbd_dev->major) | 3167 | if (rbd_dev->major) |
3197 | return sprintf(buf, "%d\n", rbd_dev->major); | 3168 | return sprintf(buf, "%d\n", rbd_dev->major); |
3198 | 3169 | ||
3199 | return sprintf(buf, "(none)\n"); | 3170 | return sprintf(buf, "(none)\n"); |
3200 | 3171 | ||
3201 | } | 3172 | } |
3202 | 3173 | ||
3203 | static ssize_t rbd_client_id_show(struct device *dev, | 3174 | static ssize_t rbd_client_id_show(struct device *dev, |
3204 | struct device_attribute *attr, char *buf) | 3175 | struct device_attribute *attr, char *buf) |
3205 | { | 3176 | { |
3206 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); | 3177 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); |
3207 | 3178 | ||
3208 | return sprintf(buf, "client%lld\n", | 3179 | return sprintf(buf, "client%lld\n", |
3209 | ceph_client_id(rbd_dev->rbd_client->client)); | 3180 | ceph_client_id(rbd_dev->rbd_client->client)); |
3210 | } | 3181 | } |
3211 | 3182 | ||
3212 | static ssize_t rbd_pool_show(struct device *dev, | 3183 | static ssize_t rbd_pool_show(struct device *dev, |
3213 | struct device_attribute *attr, char *buf) | 3184 | struct device_attribute *attr, char *buf) |
3214 | { | 3185 | { |
3215 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); | 3186 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); |
3216 | 3187 | ||
3217 | return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); | 3188 | return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); |
3218 | } | 3189 | } |
3219 | 3190 | ||
3220 | static ssize_t rbd_pool_id_show(struct device *dev, | 3191 | static ssize_t rbd_pool_id_show(struct device *dev, |
3221 | struct device_attribute *attr, char *buf) | 3192 | struct device_attribute *attr, char *buf) |
3222 | { | 3193 | { |
3223 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); | 3194 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); |
3224 | 3195 | ||
3225 | return sprintf(buf, "%llu\n", | 3196 | return sprintf(buf, "%llu\n", |
3226 | (unsigned long long) rbd_dev->spec->pool_id); | 3197 | (unsigned long long) rbd_dev->spec->pool_id); |
3227 | } | 3198 | } |
3228 | 3199 | ||
3229 | static ssize_t rbd_name_show(struct device *dev, | 3200 | static ssize_t rbd_name_show(struct device *dev, |
3230 | struct device_attribute *attr, char *buf) | 3201 | struct device_attribute *attr, char *buf) |
3231 | { | 3202 | { |
3232 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); | 3203 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); |
3233 | 3204 | ||
3234 | if (rbd_dev->spec->image_name) | 3205 | if (rbd_dev->spec->image_name) |
3235 | return sprintf(buf, "%s\n", rbd_dev->spec->image_name); | 3206 | return sprintf(buf, "%s\n", rbd_dev->spec->image_name); |
3236 | 3207 | ||
3237 | return sprintf(buf, "(unknown)\n"); | 3208 | return sprintf(buf, "(unknown)\n"); |
3238 | } | 3209 | } |
3239 | 3210 | ||
3240 | static ssize_t rbd_image_id_show(struct device *dev, | 3211 | static ssize_t rbd_image_id_show(struct device *dev, |
3241 | struct device_attribute *attr, char *buf) | 3212 | struct device_attribute *attr, char *buf) |
3242 | { | 3213 | { |
3243 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); | 3214 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); |
3244 | 3215 | ||
3245 | return sprintf(buf, "%s\n", rbd_dev->spec->image_id); | 3216 | return sprintf(buf, "%s\n", rbd_dev->spec->image_id); |
3246 | } | 3217 | } |
3247 | 3218 | ||
3248 | /* | 3219 | /* |
3249 | * Shows the name of the currently-mapped snapshot (or | 3220 | * Shows the name of the currently-mapped snapshot (or |
3250 | * RBD_SNAP_HEAD_NAME for the base image). | 3221 | * RBD_SNAP_HEAD_NAME for the base image). |
3251 | */ | 3222 | */ |
3252 | static ssize_t rbd_snap_show(struct device *dev, | 3223 | static ssize_t rbd_snap_show(struct device *dev, |
3253 | struct device_attribute *attr, | 3224 | struct device_attribute *attr, |
3254 | char *buf) | 3225 | char *buf) |
3255 | { | 3226 | { |
3256 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); | 3227 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); |
3257 | 3228 | ||
3258 | return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); | 3229 | return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); |
3259 | } | 3230 | } |
3260 | 3231 | ||
3261 | /* | 3232 | /* |
3262 | * For an rbd v2 image, shows the pool id, image id, and snapshot id | 3233 | * For an rbd v2 image, shows the pool id, image id, and snapshot id |
3263 | * for the parent image. If there is no parent, simply shows | 3234 | * for the parent image. If there is no parent, simply shows |
3264 | * "(no parent image)". | 3235 | * "(no parent image)". |
3265 | */ | 3236 | */ |
3266 | static ssize_t rbd_parent_show(struct device *dev, | 3237 | static ssize_t rbd_parent_show(struct device *dev, |
3267 | struct device_attribute *attr, | 3238 | struct device_attribute *attr, |
3268 | char *buf) | 3239 | char *buf) |
3269 | { | 3240 | { |
3270 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); | 3241 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); |
3271 | struct rbd_spec *spec = rbd_dev->parent_spec; | 3242 | struct rbd_spec *spec = rbd_dev->parent_spec; |
3272 | int count; | 3243 | int count; |
3273 | char *bufp = buf; | 3244 | char *bufp = buf; |
3274 | 3245 | ||
3275 | if (!spec) | 3246 | if (!spec) |
3276 | return sprintf(buf, "(no parent image)\n"); | 3247 | return sprintf(buf, "(no parent image)\n"); |
3277 | 3248 | ||
3278 | count = sprintf(bufp, "pool_id %llu\npool_name %s\n", | 3249 | count = sprintf(bufp, "pool_id %llu\npool_name %s\n", |
3279 | (unsigned long long) spec->pool_id, spec->pool_name); | 3250 | (unsigned long long) spec->pool_id, spec->pool_name); |
3280 | if (count < 0) | 3251 | if (count < 0) |
3281 | return count; | 3252 | return count; |
3282 | bufp += count; | 3253 | bufp += count; |
3283 | 3254 | ||
3284 | count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id, | 3255 | count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id, |
3285 | spec->image_name ? spec->image_name : "(unknown)"); | 3256 | spec->image_name ? spec->image_name : "(unknown)"); |
3286 | if (count < 0) | 3257 | if (count < 0) |
3287 | return count; | 3258 | return count; |
3288 | bufp += count; | 3259 | bufp += count; |
3289 | 3260 | ||
3290 | count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n", | 3261 | count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n", |
3291 | (unsigned long long) spec->snap_id, spec->snap_name); | 3262 | (unsigned long long) spec->snap_id, spec->snap_name); |
3292 | if (count < 0) | 3263 | if (count < 0) |
3293 | return count; | 3264 | return count; |
3294 | bufp += count; | 3265 | bufp += count; |
3295 | 3266 | ||
3296 | count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap); | 3267 | count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap); |
3297 | if (count < 0) | 3268 | if (count < 0) |
3298 | return count; | 3269 | return count; |
3299 | bufp += count; | 3270 | bufp += count; |
3300 | 3271 | ||
3301 | return (ssize_t) (bufp - buf); | 3272 | return (ssize_t) (bufp - buf); |
3302 | } | 3273 | } |
3303 | 3274 | ||
3304 | static ssize_t rbd_image_refresh(struct device *dev, | 3275 | static ssize_t rbd_image_refresh(struct device *dev, |
3305 | struct device_attribute *attr, | 3276 | struct device_attribute *attr, |
3306 | const char *buf, | 3277 | const char *buf, |
3307 | size_t size) | 3278 | size_t size) |
3308 | { | 3279 | { |
3309 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); | 3280 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); |
3310 | int ret; | 3281 | int ret; |
3311 | 3282 | ||
3312 | ret = rbd_dev_refresh(rbd_dev, NULL); | 3283 | ret = rbd_dev_refresh(rbd_dev, NULL); |
3313 | 3284 | ||
3314 | return ret < 0 ? ret : size; | 3285 | return ret < 0 ? ret : size; |
3315 | } | 3286 | } |
3316 | 3287 | ||
3317 | static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); | 3288 | static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); |
3318 | static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); | 3289 | static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); |
3319 | static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); | 3290 | static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); |
3320 | static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); | 3291 | static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); |
3321 | static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); | 3292 | static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); |
3322 | static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); | 3293 | static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); |
3323 | static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); | 3294 | static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); |
3324 | static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); | 3295 | static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); |
3325 | static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); | 3296 | static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); |
3326 | static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); | 3297 | static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); |
3327 | static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); | 3298 | static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); |
3328 | 3299 | ||
3329 | static struct attribute *rbd_attrs[] = { | 3300 | static struct attribute *rbd_attrs[] = { |
3330 | &dev_attr_size.attr, | 3301 | &dev_attr_size.attr, |
3331 | &dev_attr_features.attr, | 3302 | &dev_attr_features.attr, |
3332 | &dev_attr_major.attr, | 3303 | &dev_attr_major.attr, |
3333 | &dev_attr_client_id.attr, | 3304 | &dev_attr_client_id.attr, |
3334 | &dev_attr_pool.attr, | 3305 | &dev_attr_pool.attr, |
3335 | &dev_attr_pool_id.attr, | 3306 | &dev_attr_pool_id.attr, |
3336 | &dev_attr_name.attr, | 3307 | &dev_attr_name.attr, |
3337 | &dev_attr_image_id.attr, | 3308 | &dev_attr_image_id.attr, |
3338 | &dev_attr_current_snap.attr, | 3309 | &dev_attr_current_snap.attr, |
3339 | &dev_attr_parent.attr, | 3310 | &dev_attr_parent.attr, |
3340 | &dev_attr_refresh.attr, | 3311 | &dev_attr_refresh.attr, |
3341 | NULL | 3312 | NULL |
3342 | }; | 3313 | }; |
3343 | 3314 | ||
3344 | static struct attribute_group rbd_attr_group = { | 3315 | static struct attribute_group rbd_attr_group = { |
3345 | .attrs = rbd_attrs, | 3316 | .attrs = rbd_attrs, |
3346 | }; | 3317 | }; |
3347 | 3318 | ||
3348 | static const struct attribute_group *rbd_attr_groups[] = { | 3319 | static const struct attribute_group *rbd_attr_groups[] = { |
3349 | &rbd_attr_group, | 3320 | &rbd_attr_group, |
3350 | NULL | 3321 | NULL |
3351 | }; | 3322 | }; |
3352 | 3323 | ||
3353 | static void rbd_sysfs_dev_release(struct device *dev) | 3324 | static void rbd_sysfs_dev_release(struct device *dev) |
3354 | { | 3325 | { |
3355 | } | 3326 | } |
3356 | 3327 | ||
3357 | static struct device_type rbd_device_type = { | 3328 | static struct device_type rbd_device_type = { |
3358 | .name = "rbd", | 3329 | .name = "rbd", |
3359 | .groups = rbd_attr_groups, | 3330 | .groups = rbd_attr_groups, |
3360 | .release = rbd_sysfs_dev_release, | 3331 | .release = rbd_sysfs_dev_release, |
3361 | }; | 3332 | }; |
3362 | 3333 | ||
3363 | static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) | 3334 | static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) |
3364 | { | 3335 | { |
3365 | kref_get(&spec->kref); | 3336 | kref_get(&spec->kref); |
3366 | 3337 | ||
3367 | return spec; | 3338 | return spec; |
3368 | } | 3339 | } |
3369 | 3340 | ||
3370 | static void rbd_spec_free(struct kref *kref); | 3341 | static void rbd_spec_free(struct kref *kref); |
3371 | static void rbd_spec_put(struct rbd_spec *spec) | 3342 | static void rbd_spec_put(struct rbd_spec *spec) |
3372 | { | 3343 | { |
3373 | if (spec) | 3344 | if (spec) |
3374 | kref_put(&spec->kref, rbd_spec_free); | 3345 | kref_put(&spec->kref, rbd_spec_free); |
3375 | } | 3346 | } |
3376 | 3347 | ||
3377 | static struct rbd_spec *rbd_spec_alloc(void) | 3348 | static struct rbd_spec *rbd_spec_alloc(void) |
3378 | { | 3349 | { |
3379 | struct rbd_spec *spec; | 3350 | struct rbd_spec *spec; |
3380 | 3351 | ||
3381 | spec = kzalloc(sizeof (*spec), GFP_KERNEL); | 3352 | spec = kzalloc(sizeof (*spec), GFP_KERNEL); |
3382 | if (!spec) | 3353 | if (!spec) |
3383 | return NULL; | 3354 | return NULL; |
3384 | kref_init(&spec->kref); | 3355 | kref_init(&spec->kref); |
3385 | 3356 | ||
3386 | return spec; | 3357 | return spec; |
3387 | } | 3358 | } |
3388 | 3359 | ||
3389 | static void rbd_spec_free(struct kref *kref) | 3360 | static void rbd_spec_free(struct kref *kref) |
3390 | { | 3361 | { |
3391 | struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); | 3362 | struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); |
3392 | 3363 | ||
3393 | kfree(spec->pool_name); | 3364 | kfree(spec->pool_name); |
3394 | kfree(spec->image_id); | 3365 | kfree(spec->image_id); |
3395 | kfree(spec->image_name); | 3366 | kfree(spec->image_name); |
3396 | kfree(spec->snap_name); | 3367 | kfree(spec->snap_name); |
3397 | kfree(spec); | 3368 | kfree(spec); |
3398 | } | 3369 | } |
3399 | 3370 | ||
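The rbd_spec get/put/alloc/free quartet above is the stock kernel kref pattern: the object embeds a reference count initialized to 1 by the allocator, rbd_spec_get() takes an extra reference, and rbd_spec_put() drops one, running the free path only when the last reference goes away. A minimal userspace sketch of the same lifetime rule, using C11 atomics in place of struct kref (all names below are illustrative, not part of rbd):

    #include <stdatomic.h>
    #include <stdlib.h>

    struct spec {
        atomic_int refcount;            /* plays the role of struct kref */
        char *pool_name;                /* owned string, freed on release */
    };

    static struct spec *spec_alloc(void)
    {
        struct spec *s = calloc(1, sizeof(*s));

        if (!s)
            return NULL;
        atomic_init(&s->refcount, 1);   /* like kref_init(): count starts at 1 */
        return s;
    }

    static struct spec *spec_get(struct spec *s)
    {
        atomic_fetch_add(&s->refcount, 1);      /* like kref_get() */
        return s;
    }

    static void spec_put(struct spec *s)
    {
        if (!s)                         /* rbd_spec_put() tolerates NULL too */
            return;
        /* like kref_put(): run the release path when the last ref drops */
        if (atomic_fetch_sub(&s->refcount, 1) == 1) {
            free(s->pool_name);
            free(s);
        }
    }

    int main(void)
    {
        struct spec *s = spec_alloc();

        if (!s)
            return 1;
        spec_get(s);    /* two references outstanding */
        spec_put(s);    /* one left; nothing freed yet */
        spec_put(s);    /* last reference: release runs */
        return 0;
    }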
3400 | static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, | 3371 | static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, |
3401 | struct rbd_spec *spec) | 3372 | struct rbd_spec *spec) |
3402 | { | 3373 | { |
3403 | struct rbd_device *rbd_dev; | 3374 | struct rbd_device *rbd_dev; |
3404 | 3375 | ||
3405 | rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL); | 3376 | rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL); |
3406 | if (!rbd_dev) | 3377 | if (!rbd_dev) |
3407 | return NULL; | 3378 | return NULL; |
3408 | 3379 | ||
3409 | spin_lock_init(&rbd_dev->lock); | 3380 | spin_lock_init(&rbd_dev->lock); |
3410 | rbd_dev->flags = 0; | 3381 | rbd_dev->flags = 0; |
3411 | INIT_LIST_HEAD(&rbd_dev->node); | 3382 | INIT_LIST_HEAD(&rbd_dev->node); |
3412 | INIT_LIST_HEAD(&rbd_dev->snaps); | 3383 | INIT_LIST_HEAD(&rbd_dev->snaps); |
3413 | init_rwsem(&rbd_dev->header_rwsem); | 3384 | init_rwsem(&rbd_dev->header_rwsem); |
3414 | 3385 | ||
3415 | rbd_dev->spec = spec; | 3386 | rbd_dev->spec = spec; |
3416 | rbd_dev->rbd_client = rbdc; | 3387 | rbd_dev->rbd_client = rbdc; |
3417 | 3388 | ||
3418 | /* Initialize the layout used for all rbd requests */ | 3389 | /* Initialize the layout used for all rbd requests */ |
3419 | 3390 | ||
3420 | rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); | 3391 | rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); |
3421 | rbd_dev->layout.fl_stripe_count = cpu_to_le32(1); | 3392 | rbd_dev->layout.fl_stripe_count = cpu_to_le32(1); |
3422 | rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); | 3393 | rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); |
3423 | rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); | 3394 | rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); |
3424 | 3395 | ||
3425 | return rbd_dev; | 3396 | return rbd_dev; |
3426 | } | 3397 | } |
3427 | 3398 | ||
3428 | static void rbd_dev_destroy(struct rbd_device *rbd_dev) | 3399 | static void rbd_dev_destroy(struct rbd_device *rbd_dev) |
3429 | { | 3400 | { |
3430 | rbd_put_client(rbd_dev->rbd_client); | 3401 | rbd_put_client(rbd_dev->rbd_client); |
3431 | rbd_spec_put(rbd_dev->spec); | 3402 | rbd_spec_put(rbd_dev->spec); |
3432 | kfree(rbd_dev); | 3403 | kfree(rbd_dev); |
3433 | } | 3404 | } |
3434 | 3405 | ||
3435 | static void rbd_snap_destroy(struct rbd_snap *snap) | 3406 | static void rbd_snap_destroy(struct rbd_snap *snap) |
3436 | { | 3407 | { |
3437 | kfree(snap->name); | 3408 | kfree(snap->name); |
3438 | kfree(snap); | 3409 | kfree(snap); |
3439 | } | 3410 | } |
3440 | 3411 | ||
3441 | static struct rbd_snap *rbd_snap_create(struct rbd_device *rbd_dev, | 3412 | static struct rbd_snap *rbd_snap_create(struct rbd_device *rbd_dev, |
3442 | const char *snap_name, | 3413 | const char *snap_name, |
3443 | u64 snap_id, u64 snap_size, | 3414 | u64 snap_id, u64 snap_size, |
3444 | u64 snap_features) | 3415 | u64 snap_features) |
3445 | { | 3416 | { |
3446 | struct rbd_snap *snap; | 3417 | struct rbd_snap *snap; |
3447 | 3418 | ||
3448 | snap = kzalloc(sizeof (*snap), GFP_KERNEL); | 3419 | snap = kzalloc(sizeof (*snap), GFP_KERNEL); |
3449 | if (!snap) | 3420 | if (!snap) |
3450 | return ERR_PTR(-ENOMEM); | 3421 | return ERR_PTR(-ENOMEM); |
3451 | 3422 | ||
3452 | snap->name = snap_name; | 3423 | snap->name = snap_name; |
3453 | snap->id = snap_id; | 3424 | snap->id = snap_id; |
3454 | snap->size = snap_size; | 3425 | snap->size = snap_size; |
3455 | snap->features = snap_features; | 3426 | snap->features = snap_features; |
3456 | 3427 | ||
3457 | return snap; | 3428 | return snap; |
3458 | } | 3429 | } |
3459 | 3430 | ||
3460 | /* | 3431 | /* |
3461 | * Returns a dynamically-allocated snapshot name if successful, or a | 3432 | * Returns a dynamically-allocated snapshot name if successful, or a |
3462 | * pointer-coded error otherwise. | 3433 | * pointer-coded error otherwise. |
3463 | */ | 3434 | */ |
3464 | static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which, | 3435 | static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which, |
3465 | u64 *snap_size, u64 *snap_features) | 3436 | u64 *snap_size, u64 *snap_features) |
3466 | { | 3437 | { |
3467 | char *snap_name; | 3438 | char *snap_name; |
3468 | int i; | 3439 | int i; |
3469 | 3440 | ||
3470 | rbd_assert(which < rbd_dev->header.snapc->num_snaps); | 3441 | rbd_assert(which < rbd_dev->header.snapc->num_snaps); |
3471 | 3442 | ||
3472 | /* Skip over names until we find the one we are looking for */ | 3443 | /* Skip over names until we find the one we are looking for */ |
3473 | 3444 | ||
3474 | snap_name = rbd_dev->header.snap_names; | 3445 | snap_name = rbd_dev->header.snap_names; |
3475 | for (i = 0; i < which; i++) | 3446 | for (i = 0; i < which; i++) |
3476 | snap_name += strlen(snap_name) + 1; | 3447 | snap_name += strlen(snap_name) + 1; |
3477 | 3448 | ||
3478 | snap_name = kstrdup(snap_name, GFP_KERNEL); | 3449 | snap_name = kstrdup(snap_name, GFP_KERNEL); |
3479 | if (!snap_name) | 3450 | if (!snap_name) |
3480 | return ERR_PTR(-ENOMEM); | 3451 | return ERR_PTR(-ENOMEM); |
3481 | 3452 | ||
3482 | *snap_size = rbd_dev->header.snap_sizes[which]; | 3453 | *snap_size = rbd_dev->header.snap_sizes[which]; |
3483 | *snap_features = 0; /* No features for v1 */ | 3454 | *snap_features = 0; /* No features for v1 */ |
3484 | 3455 | ||
3485 | return snap_name; | 3456 | return snap_name; |
3486 | } | 3457 | } |
3487 | 3458 | ||
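rbd_dev_v1_snap_info() above depends on the format-1 header laying out snapshot names back to back, each terminated by '\0', so finding name N means hopping over N earlier strings. A standalone sketch of that walk, with fabricated names:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        /* Three names packed the way a format-1 header stores them */
        const char names[] = "snap-a\0snap-b\0snap-c";
        const char *p = names;
        unsigned which = 2;             /* index wanted, as in the code above */

        for (unsigned i = 0; i < which; i++)
            p += strlen(p) + 1;         /* hop over one name and its '\0' */

        printf("name[%u] = %s\n", which, p);    /* prints snap-c */
        return 0;
    }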
3488 | /* | 3459 | /* |
3489 | * Get the size and object order for an image snapshot, or if | 3460 | * Get the size and object order for an image snapshot, or if |
3490 | * snap_id is CEPH_NOSNAP, gets this information for the base | 3461 | * snap_id is CEPH_NOSNAP, gets this information for the base |
3491 | * image. | 3462 | * image. |
3492 | */ | 3463 | */ |
3493 | static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, | 3464 | static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, |
3494 | u8 *order, u64 *snap_size) | 3465 | u8 *order, u64 *snap_size) |
3495 | { | 3466 | { |
3496 | __le64 snapid = cpu_to_le64(snap_id); | 3467 | __le64 snapid = cpu_to_le64(snap_id); |
3497 | int ret; | 3468 | int ret; |
3498 | struct { | 3469 | struct { |
3499 | u8 order; | 3470 | u8 order; |
3500 | __le64 size; | 3471 | __le64 size; |
3501 | } __attribute__ ((packed)) size_buf = { 0 }; | 3472 | } __attribute__ ((packed)) size_buf = { 0 }; |
3502 | 3473 | ||
3503 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, | 3474 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, |
3504 | "rbd", "get_size", | 3475 | "rbd", "get_size", |
3505 | &snapid, sizeof (snapid), | 3476 | &snapid, sizeof (snapid), |
3506 | &size_buf, sizeof (size_buf), NULL); | 3477 | &size_buf, sizeof (size_buf), NULL); |
3507 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); | 3478 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
3508 | if (ret < 0) | 3479 | if (ret < 0) |
3509 | return ret; | 3480 | return ret; |
3510 | if (ret < sizeof (size_buf)) | 3481 | if (ret < sizeof (size_buf)) |
3511 | return -ERANGE; | 3482 | return -ERANGE; |
3512 | 3483 | ||
3513 | if (order) | 3484 | if (order) |
3514 | *order = size_buf.order; | 3485 | *order = size_buf.order; |
3515 | *snap_size = le64_to_cpu(size_buf.size); | 3486 | *snap_size = le64_to_cpu(size_buf.size); |
3516 | 3487 | ||
3517 | dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n", | 3488 | dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n", |
3518 | (unsigned long long)snap_id, (unsigned int)*order, | 3489 | (unsigned long long)snap_id, (unsigned int)*order, |
3519 | (unsigned long long)*snap_size); | 3490 | (unsigned long long)*snap_size); |
3520 | 3491 | ||
3521 | return 0; | 3492 | return 0; |
3522 | } | 3493 | } |
3523 | 3494 | ||
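The get_size reply decoded above is a fixed packed layout: one byte of object order followed by a little-endian 64-bit size, which is why the code checks ret against sizeof (size_buf) before trusting the fields. A hedged userspace rendition of the same decode, with a hand-built reply buffer and a helper standing in for le64_to_cpu():

    #include <stdint.h>
    #include <stdio.h>

    /* Portable little-endian 64-bit read, standing in for le64_to_cpu() */
    static uint64_t get_le64(const unsigned char *p)
    {
        uint64_t v = 0;

        for (int i = 7; i >= 0; i--)
            v = (v << 8) | p[i];
        return v;
    }

    int main(void)
    {
        /* Fabricated reply: order = 22, size = 0x100000 (1 MiB), LE */
        const unsigned char reply[9] = {
            22, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00
        };
        uint8_t order = reply[0];
        uint64_t size = get_le64(reply + 1);

        printf("order = %u, size = %llu\n", order, (unsigned long long)size);
        return 0;
    }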
3524 | static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) | 3495 | static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) |
3525 | { | 3496 | { |
3526 | return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, | 3497 | return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, |
3527 | &rbd_dev->header.obj_order, | 3498 | &rbd_dev->header.obj_order, |
3528 | &rbd_dev->header.image_size); | 3499 | &rbd_dev->header.image_size); |
3529 | } | 3500 | } |
3530 | 3501 | ||
3531 | static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) | 3502 | static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) |
3532 | { | 3503 | { |
3533 | void *reply_buf; | 3504 | void *reply_buf; |
3534 | int ret; | 3505 | int ret; |
3535 | void *p; | 3506 | void *p; |
3536 | 3507 | ||
3537 | reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); | 3508 | reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); |
3538 | if (!reply_buf) | 3509 | if (!reply_buf) |
3539 | return -ENOMEM; | 3510 | return -ENOMEM; |
3540 | 3511 | ||
3541 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, | 3512 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, |
3542 | "rbd", "get_object_prefix", NULL, 0, | 3513 | "rbd", "get_object_prefix", NULL, 0, |
3543 | reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL); | 3514 | reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL); |
3544 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); | 3515 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
3545 | if (ret < 0) | 3516 | if (ret < 0) |
3546 | goto out; | 3517 | goto out; |
3547 | 3518 | ||
3548 | p = reply_buf; | 3519 | p = reply_buf; |
3549 | rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, | 3520 | rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, |
3550 | p + ret, NULL, GFP_NOIO); | 3521 | p + ret, NULL, GFP_NOIO); |
3551 | ret = 0; | 3522 | ret = 0; |
3552 | 3523 | ||
3553 | if (IS_ERR(rbd_dev->header.object_prefix)) { | 3524 | if (IS_ERR(rbd_dev->header.object_prefix)) { |
3554 | ret = PTR_ERR(rbd_dev->header.object_prefix); | 3525 | ret = PTR_ERR(rbd_dev->header.object_prefix); |
3555 | rbd_dev->header.object_prefix = NULL; | 3526 | rbd_dev->header.object_prefix = NULL; |
3556 | } else { | 3527 | } else { |
3557 | dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); | 3528 | dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); |
3558 | } | 3529 | } |
3559 | out: | 3530 | out: |
3560 | kfree(reply_buf); | 3531 | kfree(reply_buf); |
3561 | 3532 | ||
3562 | return ret; | 3533 | return ret; |
3563 | } | 3534 | } |
3564 | 3535 | ||
3565 | static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, | 3536 | static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, |
3566 | u64 *snap_features) | 3537 | u64 *snap_features) |
3567 | { | 3538 | { |
3568 | __le64 snapid = cpu_to_le64(snap_id); | 3539 | __le64 snapid = cpu_to_le64(snap_id); |
3569 | struct { | 3540 | struct { |
3570 | __le64 features; | 3541 | __le64 features; |
3571 | __le64 incompat; | 3542 | __le64 incompat; |
3572 | } __attribute__ ((packed)) features_buf = { 0 }; | 3543 | } __attribute__ ((packed)) features_buf = { 0 }; |
3573 | u64 incompat; | 3544 | u64 incompat; |
3574 | int ret; | 3545 | int ret; |
3575 | 3546 | ||
3576 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, | 3547 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, |
3577 | "rbd", "get_features", | 3548 | "rbd", "get_features", |
3578 | &snapid, sizeof (snapid), | 3549 | &snapid, sizeof (snapid), |
3579 | &features_buf, sizeof (features_buf), NULL); | 3550 | &features_buf, sizeof (features_buf), NULL); |
3580 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); | 3551 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
3581 | if (ret < 0) | 3552 | if (ret < 0) |
3582 | return ret; | 3553 | return ret; |
3583 | if (ret < sizeof (features_buf)) | 3554 | if (ret < sizeof (features_buf)) |
3584 | return -ERANGE; | 3555 | return -ERANGE; |
3585 | 3556 | ||
3586 | incompat = le64_to_cpu(features_buf.incompat); | 3557 | incompat = le64_to_cpu(features_buf.incompat); |
3587 | if (incompat & ~RBD_FEATURES_SUPPORTED) | 3558 | if (incompat & ~RBD_FEATURES_SUPPORTED) |
3588 | return -ENXIO; | 3559 | return -ENXIO; |
3589 | 3560 | ||
3590 | *snap_features = le64_to_cpu(features_buf.features); | 3561 | *snap_features = le64_to_cpu(features_buf.features); |
3591 | 3562 | ||
3592 | dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", | 3563 | dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", |
3593 | (unsigned long long)snap_id, | 3564 | (unsigned long long)snap_id, |
3594 | (unsigned long long)*snap_features, | 3565 | (unsigned long long)*snap_features, |
3595 | (unsigned long long)le64_to_cpu(features_buf.incompat)); | 3566 | (unsigned long long)le64_to_cpu(features_buf.incompat)); |
3596 | 3567 | ||
3597 | return 0; | 3568 | return 0; |
3598 | } | 3569 | } |
3599 | 3570 | ||
3600 | static int rbd_dev_v2_features(struct rbd_device *rbd_dev) | 3571 | static int rbd_dev_v2_features(struct rbd_device *rbd_dev) |
3601 | { | 3572 | { |
3602 | return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, | 3573 | return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, |
3603 | &rbd_dev->header.features); | 3574 | &rbd_dev->header.features); |
3604 | } | 3575 | } |
3605 | 3576 | ||
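The incompat check in _rbd_dev_v2_snap_features() is a plain bitmask gate: if the image advertises any incompatible feature bit outside the set this driver handles, mapping is refused with -ENXIO. A sketch of that test; the feature bit values below are illustrative stand-ins, not the kernel's definitions:

    #include <stdint.h>
    #include <stdio.h>

    #define FEAT_LAYERING           (1ULL << 0)     /* illustrative bits */
    #define FEAT_STRIPINGV2         (1ULL << 1)
    #define FEATURES_SUPPORTED      FEAT_LAYERING

    int main(void)
    {
        uint64_t incompat = FEAT_LAYERING | FEAT_STRIPINGV2;

        if (incompat & ~FEATURES_SUPPORTED) {
            /* the driver returns -ENXIO at this point */
            printf("unsupported incompatible features: 0x%llx\n",
                   (unsigned long long)(incompat & ~FEATURES_SUPPORTED));
            return 1;
        }
        return 0;
    }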
3606 | static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) | 3577 | static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) |
3607 | { | 3578 | { |
3608 | struct rbd_spec *parent_spec; | 3579 | struct rbd_spec *parent_spec; |
3609 | size_t size; | 3580 | size_t size; |
3610 | void *reply_buf = NULL; | 3581 | void *reply_buf = NULL; |
3611 | __le64 snapid; | 3582 | __le64 snapid; |
3612 | void *p; | 3583 | void *p; |
3613 | void *end; | 3584 | void *end; |
3614 | char *image_id; | 3585 | char *image_id; |
3615 | u64 overlap; | 3586 | u64 overlap; |
3616 | int ret; | 3587 | int ret; |
3617 | 3588 | ||
3618 | parent_spec = rbd_spec_alloc(); | 3589 | parent_spec = rbd_spec_alloc(); |
3619 | if (!parent_spec) | 3590 | if (!parent_spec) |
3620 | return -ENOMEM; | 3591 | return -ENOMEM; |
3621 | 3592 | ||
3622 | size = sizeof (__le64) + /* pool_id */ | 3593 | size = sizeof (__le64) + /* pool_id */ |
3623 | sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ | 3594 | sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ |
3624 | sizeof (__le64) + /* snap_id */ | 3595 | sizeof (__le64) + /* snap_id */ |
3625 | sizeof (__le64); /* overlap */ | 3596 | sizeof (__le64); /* overlap */ |
3626 | reply_buf = kmalloc(size, GFP_KERNEL); | 3597 | reply_buf = kmalloc(size, GFP_KERNEL); |
3627 | if (!reply_buf) { | 3598 | if (!reply_buf) { |
3628 | ret = -ENOMEM; | 3599 | ret = -ENOMEM; |
3629 | goto out_err; | 3600 | goto out_err; |
3630 | } | 3601 | } |
3631 | 3602 | ||
3632 | snapid = cpu_to_le64(CEPH_NOSNAP); | 3603 | snapid = cpu_to_le64(CEPH_NOSNAP); |
3633 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, | 3604 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, |
3634 | "rbd", "get_parent", | 3605 | "rbd", "get_parent", |
3635 | &snapid, sizeof (snapid), | 3606 | &snapid, sizeof (snapid), |
3636 | reply_buf, size, NULL); | 3607 | reply_buf, size, NULL); |
3637 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); | 3608 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
3638 | if (ret < 0) | 3609 | if (ret < 0) |
3639 | goto out_err; | 3610 | goto out_err; |
3640 | 3611 | ||
3641 | p = reply_buf; | 3612 | p = reply_buf; |
3642 | end = reply_buf + ret; | 3613 | end = reply_buf + ret; |
3643 | ret = -ERANGE; | 3614 | ret = -ERANGE; |
3644 | ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err); | 3615 | ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err); |
3645 | if (parent_spec->pool_id == CEPH_NOPOOL) | 3616 | if (parent_spec->pool_id == CEPH_NOPOOL) |
3646 | goto out; /* No parent? No problem. */ | 3617 | goto out; /* No parent? No problem. */ |
3647 | 3618 | ||
3648 | /* The ceph file layout needs to fit pool id in 32 bits */ | 3619 | /* The ceph file layout needs to fit pool id in 32 bits */ |
3649 | 3620 | ||
3650 | ret = -EIO; | 3621 | ret = -EIO; |
3651 | if (parent_spec->pool_id > (u64)U32_MAX) { | 3622 | if (parent_spec->pool_id > (u64)U32_MAX) { |
3652 | rbd_warn(NULL, "parent pool id too large (%llu > %u)\n", | 3623 | rbd_warn(NULL, "parent pool id too large (%llu > %u)\n", |
3653 | (unsigned long long)parent_spec->pool_id, U32_MAX); | 3624 | (unsigned long long)parent_spec->pool_id, U32_MAX); |
3654 | goto out_err; | 3625 | goto out_err; |
3655 | } | 3626 | } |
3656 | 3627 | ||
3657 | image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); | 3628 | image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); |
3658 | if (IS_ERR(image_id)) { | 3629 | if (IS_ERR(image_id)) { |
3659 | ret = PTR_ERR(image_id); | 3630 | ret = PTR_ERR(image_id); |
3660 | goto out_err; | 3631 | goto out_err; |
3661 | } | 3632 | } |
3662 | parent_spec->image_id = image_id; | 3633 | parent_spec->image_id = image_id; |
3663 | ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); | 3634 | ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); |
3664 | ceph_decode_64_safe(&p, end, overlap, out_err); | 3635 | ceph_decode_64_safe(&p, end, overlap, out_err); |
3665 | 3636 | ||
3666 | rbd_dev->parent_overlap = overlap; | 3637 | rbd_dev->parent_overlap = overlap; |
3667 | rbd_dev->parent_spec = parent_spec; | 3638 | rbd_dev->parent_spec = parent_spec; |
3668 | parent_spec = NULL; /* rbd_dev now owns this */ | 3639 | parent_spec = NULL; /* rbd_dev now owns this */ |
3669 | out: | 3640 | out: |
3670 | ret = 0; | 3641 | ret = 0; |
3671 | out_err: | 3642 | out_err: |
3672 | kfree(reply_buf); | 3643 | kfree(reply_buf); |
3673 | rbd_spec_put(parent_spec); | 3644 | rbd_spec_put(parent_spec); |
3674 | 3645 | ||
3675 | return ret; | 3646 | return ret; |
3676 | } | 3647 | } |
3677 | 3648 | ||
3678 | static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) | 3649 | static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) |
3679 | { | 3650 | { |
3680 | struct { | 3651 | struct { |
3681 | __le64 stripe_unit; | 3652 | __le64 stripe_unit; |
3682 | __le64 stripe_count; | 3653 | __le64 stripe_count; |
3683 | } __attribute__ ((packed)) striping_info_buf = { 0 }; | 3654 | } __attribute__ ((packed)) striping_info_buf = { 0 }; |
3684 | size_t size = sizeof (striping_info_buf); | 3655 | size_t size = sizeof (striping_info_buf); |
3685 | void *p; | 3656 | void *p; |
3686 | u64 obj_size; | 3657 | u64 obj_size; |
3687 | u64 stripe_unit; | 3658 | u64 stripe_unit; |
3688 | u64 stripe_count; | 3659 | u64 stripe_count; |
3689 | int ret; | 3660 | int ret; |
3690 | 3661 | ||
3691 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, | 3662 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, |
3692 | "rbd", "get_stripe_unit_count", NULL, 0, | 3663 | "rbd", "get_stripe_unit_count", NULL, 0, |
3693 | (char *)&striping_info_buf, size, NULL); | 3664 | (char *)&striping_info_buf, size, NULL); |
3694 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); | 3665 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
3695 | if (ret < 0) | 3666 | if (ret < 0) |
3696 | return ret; | 3667 | return ret; |
3697 | if (ret < size) | 3668 | if (ret < size) |
3698 | return -ERANGE; | 3669 | return -ERANGE; |
3699 | 3670 | ||
3700 | /* | 3671 | /* |
3701 | * We don't actually support the "fancy striping" feature | 3672 | * We don't actually support the "fancy striping" feature |
3702 | * (STRIPINGV2) yet, but if the striping sizes are the | 3673 | * (STRIPINGV2) yet, but if the striping sizes are the |
3703 | * defaults the behavior is the same as before. So find | 3674 | * defaults the behavior is the same as before. So find |
3704 | * out, and only fail if the image has non-default values. | 3675 | * out, and only fail if the image has non-default values. |
3705 | */ | 3676 | */ |
3706 | ret = -EINVAL; | 3677 | ret = -EINVAL; |
3707 | obj_size = (u64)1 << rbd_dev->header.obj_order; | 3678 | obj_size = (u64)1 << rbd_dev->header.obj_order; |
3708 | p = &striping_info_buf; | 3679 | p = &striping_info_buf; |
3709 | stripe_unit = ceph_decode_64(&p); | 3680 | stripe_unit = ceph_decode_64(&p); |
3710 | if (stripe_unit != obj_size) { | 3681 | if (stripe_unit != obj_size) { |
3711 | rbd_warn(rbd_dev, "unsupported stripe unit " | 3682 | rbd_warn(rbd_dev, "unsupported stripe unit " |
3712 | "(got %llu want %llu)", | 3683 | "(got %llu want %llu)", |
3713 | stripe_unit, obj_size); | 3684 | stripe_unit, obj_size); |
3714 | return -EINVAL; | 3685 | return -EINVAL; |
3715 | } | 3686 | } |
3716 | stripe_count = ceph_decode_64(&p); | 3687 | stripe_count = ceph_decode_64(&p); |
3717 | if (stripe_count != 1) { | 3688 | if (stripe_count != 1) { |
3718 | rbd_warn(rbd_dev, "unsupported stripe count " | 3689 | rbd_warn(rbd_dev, "unsupported stripe count " |
3719 | "(got %llu want 1)", stripe_count); | 3690 | "(got %llu want 1)", stripe_count); |
3720 | return -EINVAL; | 3691 | return -EINVAL; |
3721 | } | 3692 | } |
3722 | rbd_dev->header.stripe_unit = stripe_unit; | 3693 | rbd_dev->header.stripe_unit = stripe_unit; |
3723 | rbd_dev->header.stripe_count = stripe_count; | 3694 | rbd_dev->header.stripe_count = stripe_count; |
3724 | 3695 | ||
3725 | return 0; | 3696 | return 0; |
3726 | } | 3697 | } |
3727 | 3698 | ||
3728 | static char *rbd_dev_image_name(struct rbd_device *rbd_dev) | 3699 | static char *rbd_dev_image_name(struct rbd_device *rbd_dev) |
3729 | { | 3700 | { |
3730 | size_t image_id_size; | 3701 | size_t image_id_size; |
3731 | char *image_id; | 3702 | char *image_id; |
3732 | void *p; | 3703 | void *p; |
3733 | void *end; | 3704 | void *end; |
3734 | size_t size; | 3705 | size_t size; |
3735 | void *reply_buf = NULL; | 3706 | void *reply_buf = NULL; |
3736 | size_t len = 0; | 3707 | size_t len = 0; |
3737 | char *image_name = NULL; | 3708 | char *image_name = NULL; |
3738 | int ret; | 3709 | int ret; |
3739 | 3710 | ||
3740 | rbd_assert(!rbd_dev->spec->image_name); | 3711 | rbd_assert(!rbd_dev->spec->image_name); |
3741 | 3712 | ||
3742 | len = strlen(rbd_dev->spec->image_id); | 3713 | len = strlen(rbd_dev->spec->image_id); |
3743 | image_id_size = sizeof (__le32) + len; | 3714 | image_id_size = sizeof (__le32) + len; |
3744 | image_id = kmalloc(image_id_size, GFP_KERNEL); | 3715 | image_id = kmalloc(image_id_size, GFP_KERNEL); |
3745 | if (!image_id) | 3716 | if (!image_id) |
3746 | return NULL; | 3717 | return NULL; |
3747 | 3718 | ||
3748 | p = image_id; | 3719 | p = image_id; |
3749 | end = image_id + image_id_size; | 3720 | end = image_id + image_id_size; |
3750 | ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); | 3721 | ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); |
3751 | 3722 | ||
3752 | size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; | 3723 | size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; |
3753 | reply_buf = kmalloc(size, GFP_KERNEL); | 3724 | reply_buf = kmalloc(size, GFP_KERNEL); |
3754 | if (!reply_buf) | 3725 | if (!reply_buf) |
3755 | goto out; | 3726 | goto out; |
3756 | 3727 | ||
3757 | ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, | 3728 | ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, |
3758 | "rbd", "dir_get_name", | 3729 | "rbd", "dir_get_name", |
3759 | image_id, image_id_size, | 3730 | image_id, image_id_size, |
3760 | reply_buf, size, NULL); | 3731 | reply_buf, size, NULL); |
3761 | if (ret < 0) | 3732 | if (ret < 0) |
3762 | goto out; | 3733 | goto out; |
3763 | p = reply_buf; | 3734 | p = reply_buf; |
3764 | end = reply_buf + ret; | 3735 | end = reply_buf + ret; |
3765 | 3736 | ||
3766 | image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); | 3737 | image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); |
3767 | if (IS_ERR(image_name)) | 3738 | if (IS_ERR(image_name)) |
3768 | image_name = NULL; | 3739 | image_name = NULL; |
3769 | else | 3740 | else |
3770 | dout("%s: name is %s len is %zd\n", __func__, image_name, len); | 3741 | dout("%s: name is %s len is %zd\n", __func__, image_name, len); |
3771 | out: | 3742 | out: |
3772 | kfree(reply_buf); | 3743 | kfree(reply_buf); |
3773 | kfree(image_id); | 3744 | kfree(image_id); |
3774 | 3745 | ||
3775 | return image_name; | 3746 | return image_name; |
3776 | } | 3747 | } |
3777 | 3748 | ||
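Several calls above (ceph_encode_string(), ceph_extract_encoded_string()) move strings across the wire as a little-endian 32-bit byte count followed by the raw bytes, with no NUL terminator on the wire. A self-contained sketch of that framing; the helper names are mine, not libceph's:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Write a 32-bit length prefix (little-endian) followed by the bytes */
    static unsigned char *encode_string(unsigned char *p, const char *s)
    {
        uint32_t len = (uint32_t)strlen(s);

        for (int i = 0; i < 4; i++)
            *p++ = (len >> (8 * i)) & 0xff;
        memcpy(p, s, len);
        return p + len;
    }

    /* Read the prefix and return a freshly allocated NUL-terminated copy */
    static char *extract_string(const unsigned char **p, const unsigned char *end)
    {
        uint32_t len = 0;
        char *s;

        if (end - *p < 4)
            return NULL;                /* no room for the length prefix */
        for (int i = 3; i >= 0; i--)
            len = (len << 8) | (*p)[i];
        *p += 4;
        if ((uint32_t)(end - *p) < len)
            return NULL;                /* truncated buffer */
        s = malloc(len + 1);
        if (!s)
            return NULL;
        memcpy(s, *p, len);
        s[len] = '\0';
        *p += len;
        return s;
    }

    int main(void)
    {
        unsigned char buf[64];
        unsigned char *w = encode_string(buf, "rbd-image");
        const unsigned char *r = buf;
        char *name = extract_string(&r, w);

        if (name)
            printf("decoded: %s\n", name);
        free(name);
        return 0;
    }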
3778 | /* | 3749 | /* |
3779 | * When an rbd image has a parent image, it is identified by the | 3750 | * When an rbd image has a parent image, it is identified by the |
3780 | * pool, image, and snapshot ids (not names). This function fills | 3751 | * pool, image, and snapshot ids (not names). This function fills |
3781 | * in the names for those ids. (It's OK if we can't figure out the | 3752 | * in the names for those ids. (It's OK if we can't figure out the |
3782 | * name for an image id, but the pool and snapshot ids should always | 3753 | * name for an image id, but the pool and snapshot ids should always |
3783 | * exist and have names.) All names in an rbd spec are dynamically | 3754 | * exist and have names.) All names in an rbd spec are dynamically |
3784 | * allocated. | 3755 | * allocated. |
3785 | * | 3756 | * |
3786 | * When an image being mapped (not a parent) is probed, we have the | 3757 | * When an image being mapped (not a parent) is probed, we have the |
3787 | * pool name and pool id, image name and image id, and the snapshot | 3758 | * pool name and pool id, image name and image id, and the snapshot |
3788 | * name. The only thing we're missing is the snapshot id. | 3759 | * name. The only thing we're missing is the snapshot id. |
3789 | * | 3760 | * |
3790 | * The set of snapshots for an image is not known until they have | 3761 | * The set of snapshots for an image is not known until they have |
3791 | * been read by rbd_dev_snaps_update(), so we can't completely fill | 3762 | * been read by rbd_dev_snaps_update(), so we can't completely fill |
3792 | * in this information until after that has been called. | 3763 | * in this information until after that has been called. |
3793 | */ | 3764 | */ |
3794 | static int rbd_dev_spec_update(struct rbd_device *rbd_dev) | 3765 | static int rbd_dev_spec_update(struct rbd_device *rbd_dev) |
3795 | { | 3766 | { |
3796 | struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; | 3767 | struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; |
3797 | struct rbd_spec *spec = rbd_dev->spec; | 3768 | struct rbd_spec *spec = rbd_dev->spec; |
3798 | const char *pool_name; | 3769 | const char *pool_name; |
3799 | const char *image_name; | 3770 | const char *image_name; |
3800 | const char *snap_name; | 3771 | const char *snap_name; |
3801 | int ret; | 3772 | int ret; |
3802 | 3773 | ||
3803 | /* | 3774 | /* |
3804 | * An image being mapped will have the pool name (etc.), but | 3775 | * An image being mapped will have the pool name (etc.), but |
3805 | * we need to look up the snapshot id. | 3776 | * we need to look up the snapshot id. |
3806 | */ | 3777 | */ |
3807 | if (spec->pool_name) { | 3778 | if (spec->pool_name) { |
3808 | if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { | 3779 | if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { |
3809 | struct rbd_snap *snap; | 3780 | struct rbd_snap *snap; |
3810 | 3781 | ||
3811 | snap = snap_by_name(rbd_dev, spec->snap_name); | 3782 | snap = snap_by_name(rbd_dev, spec->snap_name); |
3812 | if (!snap) | 3783 | if (!snap) |
3813 | return -ENOENT; | 3784 | return -ENOENT; |
3814 | spec->snap_id = snap->id; | 3785 | spec->snap_id = snap->id; |
3815 | } else { | 3786 | } else { |
3816 | spec->snap_id = CEPH_NOSNAP; | 3787 | spec->snap_id = CEPH_NOSNAP; |
3817 | } | 3788 | } |
3818 | 3789 | ||
3819 | return 0; | 3790 | return 0; |
3820 | } | 3791 | } |
3821 | 3792 | ||
3822 | /* Get the pool name; we have to make our own copy of this */ | 3793 | /* Get the pool name; we have to make our own copy of this */ |
3823 | 3794 | ||
3824 | pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id); | 3795 | pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id); |
3825 | if (!pool_name) { | 3796 | if (!pool_name) { |
3826 | rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id); | 3797 | rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id); |
3827 | return -EIO; | 3798 | return -EIO; |
3828 | } | 3799 | } |
3829 | pool_name = kstrdup(pool_name, GFP_KERNEL); | 3800 | pool_name = kstrdup(pool_name, GFP_KERNEL); |
3830 | if (!pool_name) | 3801 | if (!pool_name) |
3831 | return -ENOMEM; | 3802 | return -ENOMEM; |
3832 | 3803 | ||
3833 | /* Fetch the image name; tolerate failure here */ | 3804 | /* Fetch the image name; tolerate failure here */ |
3834 | 3805 | ||
3835 | image_name = rbd_dev_image_name(rbd_dev); | 3806 | image_name = rbd_dev_image_name(rbd_dev); |
3836 | if (!image_name) | 3807 | if (!image_name) |
3837 | rbd_warn(rbd_dev, "unable to get image name"); | 3808 | rbd_warn(rbd_dev, "unable to get image name"); |
3838 | 3809 | ||
3839 | /* Look up the snapshot name, and make a copy */ | 3810 | /* Look up the snapshot name, and make a copy */ |
3840 | 3811 | ||
3841 | snap_name = rbd_snap_name(rbd_dev, spec->snap_id); | 3812 | snap_name = rbd_snap_name(rbd_dev, spec->snap_id); |
3842 | if (!snap_name) { | 3813 | if (!snap_name) { |
3843 | rbd_warn(rbd_dev, "no snapshot with id %llu", spec->snap_id); | 3814 | rbd_warn(rbd_dev, "no snapshot with id %llu", spec->snap_id); |
3844 | ret = -EIO; | 3815 | ret = -EIO; |
3845 | goto out_err; | 3816 | goto out_err; |
3846 | } | 3817 | } |
3847 | snap_name = kstrdup(snap_name, GFP_KERNEL); | 3818 | snap_name = kstrdup(snap_name, GFP_KERNEL); |
3848 | if (!snap_name) { | 3819 | if (!snap_name) { |
3849 | ret = -ENOMEM; | 3820 | ret = -ENOMEM; |
3850 | goto out_err; | 3821 | goto out_err; |
3851 | } | 3822 | } |
3852 | 3823 | ||
3853 | spec->pool_name = pool_name; | 3824 | spec->pool_name = pool_name; |
3854 | spec->image_name = image_name; | 3825 | spec->image_name = image_name; |
3855 | spec->snap_name = snap_name; | 3826 | spec->snap_name = snap_name; |
3856 | 3827 | ||
3857 | return 0; | 3828 | return 0; |
3858 | out_err: | 3829 | out_err: |
3859 | kfree(image_name); | 3830 | kfree(image_name); |
3860 | kfree(pool_name); | 3831 | kfree(pool_name); |
3861 | 3832 | ||
3862 | return ret; | 3833 | return ret; |
3863 | } | 3834 | } |
3864 | 3835 | ||
3865 | static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) | 3836 | static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) |
3866 | { | 3837 | { |
3867 | size_t size; | 3838 | size_t size; |
3868 | int ret; | 3839 | int ret; |
3869 | void *reply_buf; | 3840 | void *reply_buf; |
3870 | void *p; | 3841 | void *p; |
3871 | void *end; | 3842 | void *end; |
3872 | u64 seq; | 3843 | u64 seq; |
3873 | u32 snap_count; | 3844 | u32 snap_count; |
3874 | struct ceph_snap_context *snapc; | 3845 | struct ceph_snap_context *snapc; |
3875 | u32 i; | 3846 | u32 i; |
3876 | 3847 | ||
3877 | /* | 3848 | /* |
3878 | * We'll need room for the seq value (maximum snapshot id), | 3849 | * We'll need room for the seq value (maximum snapshot id), |
3879 | * snapshot count, and array of that many snapshot ids. | 3850 | * snapshot count, and array of that many snapshot ids. |
3880 | * For now we have a fixed upper limit on the number we're | 3851 | * For now we have a fixed upper limit on the number we're |
3881 | * prepared to receive. | 3852 | * prepared to receive. |
3882 | */ | 3853 | */ |
3883 | size = sizeof (__le64) + sizeof (__le32) + | 3854 | size = sizeof (__le64) + sizeof (__le32) + |
3884 | RBD_MAX_SNAP_COUNT * sizeof (__le64); | 3855 | RBD_MAX_SNAP_COUNT * sizeof (__le64); |
3885 | reply_buf = kzalloc(size, GFP_KERNEL); | 3856 | reply_buf = kzalloc(size, GFP_KERNEL); |
3886 | if (!reply_buf) | 3857 | if (!reply_buf) |
3887 | return -ENOMEM; | 3858 | return -ENOMEM; |
3888 | 3859 | ||
3889 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, | 3860 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, |
3890 | "rbd", "get_snapcontext", NULL, 0, | 3861 | "rbd", "get_snapcontext", NULL, 0, |
3891 | reply_buf, size, ver); | 3862 | reply_buf, size, ver); |
3892 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); | 3863 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
3893 | if (ret < 0) | 3864 | if (ret < 0) |
3894 | goto out; | 3865 | goto out; |
3895 | 3866 | ||
3896 | p = reply_buf; | 3867 | p = reply_buf; |
3897 | end = reply_buf + ret; | 3868 | end = reply_buf + ret; |
3898 | ret = -ERANGE; | 3869 | ret = -ERANGE; |
3899 | ceph_decode_64_safe(&p, end, seq, out); | 3870 | ceph_decode_64_safe(&p, end, seq, out); |
3900 | ceph_decode_32_safe(&p, end, snap_count, out); | 3871 | ceph_decode_32_safe(&p, end, snap_count, out); |
3901 | 3872 | ||
3902 | /* | 3873 | /* |
3903 | * Make sure the reported number of snapshot ids wouldn't go | 3874 | * Make sure the reported number of snapshot ids wouldn't go |
3904 | * beyond the end of our buffer. But before checking that, | 3875 | * beyond the end of our buffer. But before checking that, |
3905 | * make sure the computed size of the snapshot context we | 3876 | * make sure the computed size of the snapshot context we |
3906 | * allocate is representable in a size_t. | 3877 | * allocate is representable in a size_t. |
3907 | */ | 3878 | */ |
3908 | if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) | 3879 | if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) |
3909 | / sizeof (u64)) { | 3880 | / sizeof (u64)) { |
3910 | ret = -EINVAL; | 3881 | ret = -EINVAL; |
3911 | goto out; | 3882 | goto out; |
3912 | } | 3883 | } |
3913 | if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) | 3884 | if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) |
3914 | goto out; | 3885 | goto out; |
3915 | ret = 0; | 3886 | ret = 0; |
3916 | 3887 | ||
3917 | snapc = rbd_snap_context_create(snap_count); | 3888 | snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); |
3918 | if (!snapc) { | 3889 | if (!snapc) { |
3919 | ret = -ENOMEM; | 3890 | ret = -ENOMEM; |
3920 | goto out; | 3891 | goto out; |
3921 | } | 3892 | } |
3922 | snapc->seq = seq; | 3893 | snapc->seq = seq; |
3923 | for (i = 0; i < snap_count; i++) | 3894 | for (i = 0; i < snap_count; i++) |
3924 | snapc->snaps[i] = ceph_decode_64(&p); | 3895 | snapc->snaps[i] = ceph_decode_64(&p); |
3925 | 3896 | ||
3926 | rbd_dev->header.snapc = snapc; | 3897 | rbd_dev->header.snapc = snapc; |
3927 | 3898 | ||
3928 | dout(" snap context seq = %llu, snap_count = %u\n", | 3899 | dout(" snap context seq = %llu, snap_count = %u\n", |
3929 | (unsigned long long)seq, (unsigned int)snap_count); | 3900 | (unsigned long long)seq, (unsigned int)snap_count); |
3930 | out: | 3901 | out: |
3931 | kfree(reply_buf); | 3902 | kfree(reply_buf); |
3932 | 3903 | ||
3933 | return ret; | 3904 | return ret; |
3934 | } | 3905 | } |
3935 | 3906 | ||
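The snap_count guard above is the standard recipe for sizing a header-plus-flexible-array allocation without integer overflow: before computing sizeof (header) + count * sizeof (element), bound count by the largest value that keeps the sum representable in a size_t. A self-contained sketch of the check that a snap-context constructor like ceph_create_snap_context() needs to make internally; the struct layout below is illustrative, not the real ceph_snap_context:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct snap_ctx {
        uint64_t seq;                   /* highest snapshot id seen */
        uint32_t num_snaps;
        uint64_t snaps[];               /* flexible array of snapshot ids */
    };

    static struct snap_ctx *snap_ctx_create(uint32_t snap_count)
    {
        /* Refuse counts whose allocation size would overflow size_t */
        if (snap_count > (SIZE_MAX - sizeof(struct snap_ctx)) / sizeof(uint64_t))
            return NULL;

        return calloc(1, sizeof(struct snap_ctx) +
                         snap_count * sizeof(uint64_t));
    }

    int main(void)
    {
        struct snap_ctx *ctx = snap_ctx_create(3);

        if (!ctx)
            return 1;
        ctx->seq = 42;
        ctx->num_snaps = 3;
        for (uint32_t i = 0; i < ctx->num_snaps; i++)
            ctx->snaps[i] = 30 - 10 * i;    /* rbd keeps ids highest first */
        printf("seq = %llu, count = %u\n",
               (unsigned long long)ctx->seq, ctx->num_snaps);
        free(ctx);
        return 0;
    }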
3936 | static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) | 3907 | static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) |
3937 | { | 3908 | { |
3938 | size_t size; | 3909 | size_t size; |
3939 | void *reply_buf; | 3910 | void *reply_buf; |
3940 | __le64 snap_id; | 3911 | __le64 snap_id; |
3941 | int ret; | 3912 | int ret; |
3942 | void *p; | 3913 | void *p; |
3943 | void *end; | 3914 | void *end; |
3944 | char *snap_name; | 3915 | char *snap_name; |
3945 | 3916 | ||
3946 | size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; | 3917 | size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; |
3947 | reply_buf = kmalloc(size, GFP_KERNEL); | 3918 | reply_buf = kmalloc(size, GFP_KERNEL); |
3948 | if (!reply_buf) | 3919 | if (!reply_buf) |
3949 | return ERR_PTR(-ENOMEM); | 3920 | return ERR_PTR(-ENOMEM); |
3950 | 3921 | ||
3951 | rbd_assert(which < rbd_dev->header.snapc->num_snaps); | 3922 | rbd_assert(which < rbd_dev->header.snapc->num_snaps); |
3952 | snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); | 3923 | snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); |
3953 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, | 3924 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, |
3954 | "rbd", "get_snapshot_name", | 3925 | "rbd", "get_snapshot_name", |
3955 | &snap_id, sizeof (snap_id), | 3926 | &snap_id, sizeof (snap_id), |
3956 | reply_buf, size, NULL); | 3927 | reply_buf, size, NULL); |
3957 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); | 3928 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
3958 | if (ret < 0) { | 3929 | if (ret < 0) { |
3959 | snap_name = ERR_PTR(ret); | 3930 | snap_name = ERR_PTR(ret); |
3960 | goto out; | 3931 | goto out; |
3961 | } | 3932 | } |
3962 | 3933 | ||
3963 | p = reply_buf; | 3934 | p = reply_buf; |
3964 | end = reply_buf + ret; | 3935 | end = reply_buf + ret; |
3965 | snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); | 3936 | snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); |
3966 | if (IS_ERR(snap_name)) | 3937 | if (IS_ERR(snap_name)) |
3967 | goto out; | 3938 | goto out; |
3968 | 3939 | ||
3969 | dout(" snap_id 0x%016llx snap_name = %s\n", | 3940 | dout(" snap_id 0x%016llx snap_name = %s\n", |
3970 | (unsigned long long)le64_to_cpu(snap_id), snap_name); | 3941 | (unsigned long long)le64_to_cpu(snap_id), snap_name); |
3971 | out: | 3942 | out: |
3972 | kfree(reply_buf); | 3943 | kfree(reply_buf); |
3973 | 3944 | ||
3974 | return snap_name; | 3945 | return snap_name; |
3975 | } | 3946 | } |
3976 | 3947 | ||
3977 | static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, | 3948 | static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, |
3978 | u64 *snap_size, u64 *snap_features) | 3949 | u64 *snap_size, u64 *snap_features) |
3979 | { | 3950 | { |
3980 | u64 snap_id; | 3951 | u64 snap_id; |
3981 | u64 size; | 3952 | u64 size; |
3982 | u64 features; | 3953 | u64 features; |
3983 | char *snap_name; | 3954 | char *snap_name; |
3984 | int ret; | 3955 | int ret; |
3985 | 3956 | ||
3986 | rbd_assert(which < rbd_dev->header.snapc->num_snaps); | 3957 | rbd_assert(which < rbd_dev->header.snapc->num_snaps); |
3987 | snap_id = rbd_dev->header.snapc->snaps[which]; | 3958 | snap_id = rbd_dev->header.snapc->snaps[which]; |
3988 | ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); | 3959 | ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); |
3989 | if (ret) | 3960 | if (ret) |
3990 | goto out_err; | 3961 | goto out_err; |
3991 | 3962 | ||
3992 | ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); | 3963 | ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); |
3993 | if (ret) | 3964 | if (ret) |
3994 | goto out_err; | 3965 | goto out_err; |
3995 | 3966 | ||
3996 | snap_name = rbd_dev_v2_snap_name(rbd_dev, which); | 3967 | snap_name = rbd_dev_v2_snap_name(rbd_dev, which); |
3997 | if (!IS_ERR(snap_name)) { | 3968 | if (!IS_ERR(snap_name)) { |
3998 | *snap_size = size; | 3969 | *snap_size = size; |
3999 | *snap_features = features; | 3970 | *snap_features = features; |
4000 | } | 3971 | } |
4001 | 3972 | ||
4002 | return snap_name; | 3973 | return snap_name; |
4003 | out_err: | 3974 | out_err: |
4004 | return ERR_PTR(ret); | 3975 | return ERR_PTR(ret); |
4005 | } | 3976 | } |
4006 | 3977 | ||
4007 | static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, | 3978 | static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, |
4008 | u64 *snap_size, u64 *snap_features) | 3979 | u64 *snap_size, u64 *snap_features) |
4009 | { | 3980 | { |
4010 | if (rbd_dev->image_format == 1) | 3981 | if (rbd_dev->image_format == 1) |
4011 | return rbd_dev_v1_snap_info(rbd_dev, which, | 3982 | return rbd_dev_v1_snap_info(rbd_dev, which, |
4012 | snap_size, snap_features); | 3983 | snap_size, snap_features); |
4013 | if (rbd_dev->image_format == 2) | 3984 | if (rbd_dev->image_format == 2) |
4014 | return rbd_dev_v2_snap_info(rbd_dev, which, | 3985 | return rbd_dev_v2_snap_info(rbd_dev, which, |
4015 | snap_size, snap_features); | 3986 | snap_size, snap_features); |
4016 | return ERR_PTR(-EINVAL); | 3987 | return ERR_PTR(-EINVAL); |
4017 | } | 3988 | } |
4018 | 3989 | ||
4019 | static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver) | 3990 | static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver) |
4020 | { | 3991 | { |
4021 | int ret; | 3992 | int ret; |
4022 | 3993 | ||
4023 | down_write(&rbd_dev->header_rwsem); | 3994 | down_write(&rbd_dev->header_rwsem); |
4024 | 3995 | ||
4025 | ret = rbd_dev_v2_image_size(rbd_dev); | 3996 | ret = rbd_dev_v2_image_size(rbd_dev); |
4026 | if (ret) | 3997 | if (ret) |
4027 | goto out; | 3998 | goto out; |
4028 | rbd_update_mapping_size(rbd_dev); | 3999 | rbd_update_mapping_size(rbd_dev); |
4029 | 4000 | ||
4030 | ret = rbd_dev_v2_snap_context(rbd_dev, hver); | 4001 | ret = rbd_dev_v2_snap_context(rbd_dev, hver); |
4031 | dout("rbd_dev_v2_snap_context returned %d\n", ret); | 4002 | dout("rbd_dev_v2_snap_context returned %d\n", ret); |
4032 | if (ret) | 4003 | if (ret) |
4033 | goto out; | 4004 | goto out; |
4034 | ret = rbd_dev_snaps_update(rbd_dev); | 4005 | ret = rbd_dev_snaps_update(rbd_dev); |
4035 | dout("rbd_dev_snaps_update returned %d\n", ret); | 4006 | dout("rbd_dev_snaps_update returned %d\n", ret); |
4036 | if (ret) | 4007 | if (ret) |
4037 | goto out; | 4008 | goto out; |
4038 | out: | 4009 | out: |
4039 | up_write(&rbd_dev->header_rwsem); | 4010 | up_write(&rbd_dev->header_rwsem); |
4040 | 4011 | ||
4041 | return ret; | 4012 | return ret; |
4042 | } | 4013 | } |
4043 | 4014 | ||
4044 | /* | 4015 | /* |
4045 | * Scan the rbd device's current snapshot list and compare it to the | 4016 | * Scan the rbd device's current snapshot list and compare it to the |
4046 | * newly-received snapshot context. Remove any existing snapshots | 4017 | * newly-received snapshot context. Remove any existing snapshots |
4047 | * not present in the new snapshot context. Add a new snapshot for | 4018 | * not present in the new snapshot context. Add a new snapshot for |
4048 | * any snapshots in the snapshot context not in the current list. | 4019 | * any snapshots in the snapshot context not in the current list. |
4049 | * And verify there are no changes to snapshots we already know | 4020 | * And verify there are no changes to snapshots we already know |
4050 | * about. | 4021 | * about. |
4051 | * | 4022 | * |
4052 | * Assumes the snapshots in the snapshot context are sorted by | 4023 | * Assumes the snapshots in the snapshot context are sorted by |
4053 | * snapshot id, highest id first. (Snapshots in the rbd_dev's list | 4024 | * snapshot id, highest id first. (Snapshots in the rbd_dev's list |
4054 | * are also maintained in that order.) | 4025 | * are also maintained in that order.) |
4055 | * | 4026 | * |
4056 | * Note that any error that occurs while updating the snapshot list | 4027 | * Note that any error that occurs while updating the snapshot list |
4057 | * aborts the update, and the entire list is cleared. The snapshot | 4028 | * aborts the update, and the entire list is cleared. The snapshot |
4058 | * list becomes inconsistent at that point anyway, so it might as | 4029 | * list becomes inconsistent at that point anyway, so it might as |
4059 | * well be empty. | 4030 | * well be empty. |
4060 | */ | 4031 | */ |
4061 | static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) | 4032 | static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) |
4062 | { | 4033 | { |
4063 | struct ceph_snap_context *snapc = rbd_dev->header.snapc; | 4034 | struct ceph_snap_context *snapc = rbd_dev->header.snapc; |
4064 | const u32 snap_count = snapc->num_snaps; | 4035 | const u32 snap_count = snapc->num_snaps; |
4065 | struct list_head *head = &rbd_dev->snaps; | 4036 | struct list_head *head = &rbd_dev->snaps; |
4066 | struct list_head *links = head->next; | 4037 | struct list_head *links = head->next; |
4067 | u32 index = 0; | 4038 | u32 index = 0; |
4068 | int ret = 0; | 4039 | int ret = 0; |
4069 | 4040 | ||
4070 | dout("%s: snap count is %u\n", __func__, (unsigned int)snap_count); | 4041 | dout("%s: snap count is %u\n", __func__, (unsigned int)snap_count); |
4071 | while (index < snap_count || links != head) { | 4042 | while (index < snap_count || links != head) { |
4072 | u64 snap_id; | 4043 | u64 snap_id; |
4073 | struct rbd_snap *snap; | 4044 | struct rbd_snap *snap; |
4074 | char *snap_name; | 4045 | char *snap_name; |
4075 | u64 snap_size = 0; | 4046 | u64 snap_size = 0; |
4076 | u64 snap_features = 0; | 4047 | u64 snap_features = 0; |
4077 | 4048 | ||
4078 | snap_id = index < snap_count ? snapc->snaps[index] | 4049 | snap_id = index < snap_count ? snapc->snaps[index] |
4079 | : CEPH_NOSNAP; | 4050 | : CEPH_NOSNAP; |
4080 | snap = links != head ? list_entry(links, struct rbd_snap, node) | 4051 | snap = links != head ? list_entry(links, struct rbd_snap, node) |
4081 | : NULL; | 4052 | : NULL; |
4082 | rbd_assert(!snap || snap->id != CEPH_NOSNAP); | 4053 | rbd_assert(!snap || snap->id != CEPH_NOSNAP); |
4083 | 4054 | ||
4084 | if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { | 4055 | if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { |
4085 | struct list_head *next = links->next; | 4056 | struct list_head *next = links->next; |
4086 | 4057 | ||
4087 | /* | 4058 | /* |
4088 | * A previously-existing snapshot is not in | 4059 | * A previously-existing snapshot is not in |
4089 | * the new snap context. | 4060 | * the new snap context. |
4090 | * | 4061 | * |
4091 | * If the now-missing snapshot is the one | 4062 | * If the now-missing snapshot is the one |
4092 | * the image represents, clear its existence | 4063 | * the image represents, clear its existence |
4093 | * flag so we can avoid sending any more | 4064 | * flag so we can avoid sending any more |
4094 | * requests to it. | 4065 | * requests to it. |
4095 | */ | 4066 | */ |
4096 | if (rbd_dev->spec->snap_id == snap->id) | 4067 | if (rbd_dev->spec->snap_id == snap->id) |
4097 | clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); | 4068 | clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); |
4098 | dout("removing %ssnap id %llu\n", | 4069 | dout("removing %ssnap id %llu\n", |
4099 | rbd_dev->spec->snap_id == snap->id ? | 4070 | rbd_dev->spec->snap_id == snap->id ? |
4100 | "mapped " : "", | 4071 | "mapped " : "", |
4101 | (unsigned long long)snap->id); | 4072 | (unsigned long long)snap->id); |
4102 | 4073 | ||
4103 | list_del(&snap->node); | 4074 | list_del(&snap->node); |
4104 | rbd_snap_destroy(snap); | 4075 | rbd_snap_destroy(snap); |
4105 | 4076 | ||
4106 | /* Done with this list entry; advance */ | 4077 | /* Done with this list entry; advance */ |
4107 | 4078 | ||
4108 | links = next; | 4079 | links = next; |
4109 | continue; | 4080 | continue; |
4110 | } | 4081 | } |
4111 | 4082 | ||
4112 | snap_name = rbd_dev_snap_info(rbd_dev, index, | 4083 | snap_name = rbd_dev_snap_info(rbd_dev, index, |
4113 | &snap_size, &snap_features); | 4084 | &snap_size, &snap_features); |
4114 | if (IS_ERR(snap_name)) { | 4085 | if (IS_ERR(snap_name)) { |
4115 | ret = PTR_ERR(snap_name); | 4086 | ret = PTR_ERR(snap_name); |
4116 | dout("failed to get snap info, error %d\n", ret); | 4087 | dout("failed to get snap info, error %d\n", ret); |
4117 | goto out_err; | 4088 | goto out_err; |
4118 | } | 4089 | } |
4119 | 4090 | ||
4120 | dout("entry %u: snap_id = %llu\n", (unsigned int)snap_count, | 4091 | dout("entry %u: snap_id = %llu\n", (unsigned int)snap_count, |
4121 | (unsigned long long)snap_id); | 4092 | (unsigned long long)snap_id); |
4122 | if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) { | 4093 | if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) { |
4123 | struct rbd_snap *new_snap; | 4094 | struct rbd_snap *new_snap; |
4124 | 4095 | ||
4125 | /* We haven't seen this snapshot before */ | 4096 | /* We haven't seen this snapshot before */ |
4126 | 4097 | ||
4127 | new_snap = rbd_snap_create(rbd_dev, snap_name, | 4098 | new_snap = rbd_snap_create(rbd_dev, snap_name, |
4128 | snap_id, snap_size, snap_features); | 4099 | snap_id, snap_size, snap_features); |
4129 | if (IS_ERR(new_snap)) { | 4100 | if (IS_ERR(new_snap)) { |
4130 | ret = PTR_ERR(new_snap); | 4101 | ret = PTR_ERR(new_snap); |
4131 | dout(" failed to add dev, error %d\n", ret); | 4102 | dout(" failed to add dev, error %d\n", ret); |
4132 | goto out_err; | 4103 | goto out_err; |
4133 | } | 4104 | } |
4134 | 4105 | ||
4135 | /* New goes before existing, or at end of list */ | 4106 | /* New goes before existing, or at end of list */ |
4136 | 4107 | ||
4137 | dout(" added dev%s\n", snap ? "" : " at end\n"); | 4108 | dout(" added dev%s\n", snap ? "" : " at end\n"); |
4138 | if (snap) | 4109 | if (snap) |
4139 | list_add_tail(&new_snap->node, &snap->node); | 4110 | list_add_tail(&new_snap->node, &snap->node); |
4140 | else | 4111 | else |
4141 | list_add_tail(&new_snap->node, head); | 4112 | list_add_tail(&new_snap->node, head); |
4142 | } else { | 4113 | } else { |
4143 | /* Already have this one */ | 4114 | /* Already have this one */ |
4144 | 4115 | ||
4145 | dout(" already present\n"); | 4116 | dout(" already present\n"); |
4146 | 4117 | ||
4147 | rbd_assert(snap->size == snap_size); | 4118 | rbd_assert(snap->size == snap_size); |
4148 | rbd_assert(!strcmp(snap->name, snap_name)); | 4119 | rbd_assert(!strcmp(snap->name, snap_name)); |
4149 | rbd_assert(snap->features == snap_features); | 4120 | rbd_assert(snap->features == snap_features); |
4150 | 4121 | ||
4151 | /* Done with this list entry; advance */ | 4122 | /* Done with this list entry; advance */ |
4152 | 4123 | ||
4153 | links = links->next; | 4124 | links = links->next; |
4154 | } | 4125 | } |
4155 | 4126 | ||
4156 | /* Advance to the next entry in the snapshot context */ | 4127 | /* Advance to the next entry in the snapshot context */ |
4157 | 4128 | ||
4158 | index++; | 4129 | index++; |
4159 | } | 4130 | } |
4160 | dout("%s: done\n", __func__); | 4131 | dout("%s: done\n", __func__); |
4161 | 4132 | ||
4162 | return 0; | 4133 | return 0; |
4163 | out_err: | 4134 | out_err: |
4164 | rbd_remove_all_snaps(rbd_dev); | 4135 | rbd_remove_all_snaps(rbd_dev); |
4165 | 4136 | ||
4166 | return ret; | 4137 | return ret; |
4167 | } | 4138 | } |
4168 | 4139 | ||
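The loop in rbd_dev_snaps_update() is a merge walk over two sequences sharing one sort order (snapshot id, highest first): entries present only in the old device list are removed, ids present only in the new snapshot context are added, and matches advance both cursors. The same control flow over two plain arrays, with made-up ids:

    #include <stdio.h>

    int main(void)
    {
        /* Old device list vs. new snap context, both highest id first */
        long old_ids[] = { 9, 7, 4 };
        long new_ids[] = { 9, 5, 4 };
        unsigned oi = 0, ni = 0;
        const unsigned on = 3, nn = 3;

        while (oi < on || ni < nn) {
            if (ni == nn || (oi < on && old_ids[oi] > new_ids[ni])) {
                printf("remove snap %ld\n", old_ids[oi++]); /* old-only */
            } else if (oi == on || old_ids[oi] < new_ids[ni]) {
                printf("add snap %ld\n", new_ids[ni++]);    /* new-only */
            } else {
                printf("keep snap %ld\n", old_ids[oi]);     /* in both */
                oi++;
                ni++;
            }
        }
        return 0;
    }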
4169 | static int rbd_bus_add_dev(struct rbd_device *rbd_dev) | 4140 | static int rbd_bus_add_dev(struct rbd_device *rbd_dev) |
4170 | { | 4141 | { |
4171 | struct device *dev; | 4142 | struct device *dev; |
4172 | int ret; | 4143 | int ret; |
4173 | 4144 | ||
4174 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); | 4145 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); |
4175 | 4146 | ||
4176 | dev = &rbd_dev->dev; | 4147 | dev = &rbd_dev->dev; |
4177 | dev->bus = &rbd_bus_type; | 4148 | dev->bus = &rbd_bus_type; |
4178 | dev->type = &rbd_device_type; | 4149 | dev->type = &rbd_device_type; |
4179 | dev->parent = &rbd_root_dev; | 4150 | dev->parent = &rbd_root_dev; |
4180 | dev->release = rbd_dev_device_release; | 4151 | dev->release = rbd_dev_device_release; |
4181 | dev_set_name(dev, "%d", rbd_dev->dev_id); | 4152 | dev_set_name(dev, "%d", rbd_dev->dev_id); |
4182 | ret = device_register(dev); | 4153 | ret = device_register(dev); |
4183 | 4154 | ||
4184 | mutex_unlock(&ctl_mutex); | 4155 | mutex_unlock(&ctl_mutex); |
4185 | 4156 | ||
4186 | return ret; | 4157 | return ret; |
4187 | } | 4158 | } |
4188 | 4159 | ||
4189 | static void rbd_bus_del_dev(struct rbd_device *rbd_dev) | 4160 | static void rbd_bus_del_dev(struct rbd_device *rbd_dev) |
4190 | { | 4161 | { |
4191 | device_unregister(&rbd_dev->dev); | 4162 | device_unregister(&rbd_dev->dev); |
4192 | } | 4163 | } |
4193 | 4164 | ||
4194 | static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); | 4165 | static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); |
4195 | 4166 | ||
4196 | /* | 4167 | /* |
4197 | * Get a unique rbd identifier for the given new rbd_dev, and add | 4168 | * Get a unique rbd identifier for the given new rbd_dev, and add |
4198 | * the rbd_dev to the global list. The minimum rbd id is 1. | 4169 | * the rbd_dev to the global list. The minimum rbd id is 1. |
4199 | */ | 4170 | */ |
4200 | static void rbd_dev_id_get(struct rbd_device *rbd_dev) | 4171 | static void rbd_dev_id_get(struct rbd_device *rbd_dev) |
4201 | { | 4172 | { |
4202 | rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max); | 4173 | rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max); |
4203 | 4174 | ||
4204 | spin_lock(&rbd_dev_list_lock); | 4175 | spin_lock(&rbd_dev_list_lock); |
4205 | list_add_tail(&rbd_dev->node, &rbd_dev_list); | 4176 | list_add_tail(&rbd_dev->node, &rbd_dev_list); |
4206 | spin_unlock(&rbd_dev_list_lock); | 4177 | spin_unlock(&rbd_dev_list_lock); |
4207 | dout("rbd_dev %p given dev id %llu\n", rbd_dev, | 4178 | dout("rbd_dev %p given dev id %llu\n", rbd_dev, |
4208 | (unsigned long long) rbd_dev->dev_id); | 4179 | (unsigned long long) rbd_dev->dev_id); |
4209 | } | 4180 | } |
4210 | 4181 | ||
4211 | /* | 4182 | /* |
4212 | * Remove an rbd_dev from the global list, and record that its | 4183 | * Remove an rbd_dev from the global list, and record that its |
4213 | * identifier is no longer in use. | 4184 | * identifier is no longer in use. |
4214 | */ | 4185 | */ |
4215 | static void rbd_dev_id_put(struct rbd_device *rbd_dev) | 4186 | static void rbd_dev_id_put(struct rbd_device *rbd_dev) |
4216 | { | 4187 | { |
4217 | struct list_head *tmp; | 4188 | struct list_head *tmp; |
4218 | int rbd_id = rbd_dev->dev_id; | 4189 | int rbd_id = rbd_dev->dev_id; |
4219 | int max_id; | 4190 | int max_id; |
4220 | 4191 | ||
4221 | rbd_assert(rbd_id > 0); | 4192 | rbd_assert(rbd_id > 0); |
4222 | 4193 | ||
4223 | dout("rbd_dev %p released dev id %llu\n", rbd_dev, | 4194 | dout("rbd_dev %p released dev id %llu\n", rbd_dev, |
4224 | (unsigned long long) rbd_dev->dev_id); | 4195 | (unsigned long long) rbd_dev->dev_id); |
4225 | spin_lock(&rbd_dev_list_lock); | 4196 | spin_lock(&rbd_dev_list_lock); |
4226 | list_del_init(&rbd_dev->node); | 4197 | list_del_init(&rbd_dev->node); |
4227 | 4198 | ||
4228 | /* | 4199 | /* |
4229 | * If the id being "put" is not the current maximum, there | 4200 | * If the id being "put" is not the current maximum, there |
4230 | * is nothing special we need to do. | 4201 | * is nothing special we need to do. |
4231 | */ | 4202 | */ |
4232 | if (rbd_id != atomic64_read(&rbd_dev_id_max)) { | 4203 | if (rbd_id != atomic64_read(&rbd_dev_id_max)) { |
4233 | spin_unlock(&rbd_dev_list_lock); | 4204 | spin_unlock(&rbd_dev_list_lock); |
4234 | return; | 4205 | return; |
4235 | } | 4206 | } |
4236 | 4207 | ||
4237 | /* | 4208 | /* |
4238 | * We need to update the current maximum id. Search the | 4209 | * We need to update the current maximum id. Search the |
4239 | * list to find out what it is. We're more likely to find | 4210 | * list to find out what it is. We're more likely to find |
4240 | * the maximum at the end, so search the list backward. | 4211 | * the maximum at the end, so search the list backward. |
4241 | */ | 4212 | */ |
4242 | max_id = 0; | 4213 | max_id = 0; |
4243 | list_for_each_prev(tmp, &rbd_dev_list) { | 4214 | list_for_each_prev(tmp, &rbd_dev_list) { |
4244 | struct rbd_device *rbd_dev; | 4215 | struct rbd_device *rbd_dev; |
4245 | 4216 | ||
4246 | rbd_dev = list_entry(tmp, struct rbd_device, node); | 4217 | rbd_dev = list_entry(tmp, struct rbd_device, node); |
4247 | if (rbd_dev->dev_id > max_id) | 4218 | if (rbd_dev->dev_id > max_id) |
4248 | max_id = rbd_dev->dev_id; | 4219 | max_id = rbd_dev->dev_id; |
4249 | } | 4220 | } |
4250 | spin_unlock(&rbd_dev_list_lock); | 4221 | spin_unlock(&rbd_dev_list_lock); |
4251 | 4222 | ||
4252 | /* | 4223 | /* |
4253 | * The max id could have been updated by rbd_dev_id_get(), in | 4224 | * The max id could have been updated by rbd_dev_id_get(), in |
4254 | * which case it now accurately reflects the new maximum. | 4225 | * which case it now accurately reflects the new maximum. |
4255 | * Be careful not to overwrite the maximum value in that | 4226 | * Be careful not to overwrite the maximum value in that |
4256 | * case. | 4227 | * case. |
4257 | */ | 4228 | */ |
4258 | atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id); | 4229 | atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id); |
4259 | dout(" max dev id has been reset\n"); | 4230 | dout(" max dev id has been reset\n"); |
4260 | } | 4231 | } |
4261 | 4232 | ||
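Editor's note: the id scheme above is a small allocator. rbd_dev_id_get() hands out ids from a monotonically increasing atomic counter, and rbd_dev_id_put() only has real work to do when the id being released is the current maximum, in which case it rescans the device list and uses atomic64_cmpxchg() so a concurrent get() is never clobbered. Below is a minimal userspace sketch of the same pattern, with C11 atomics standing in for atomic64_t and the list lock omitted; all names are illustrative, not from the driver.

#include <stdatomic.h>
#include <stdio.h>

static _Atomic long id_max;           /* plays the role of rbd_dev_id_max */

static long live[16];                 /* stand-in for rbd_dev_list */
static int nlive;

static long id_get(void)
{
	long id = atomic_fetch_add(&id_max, 1) + 1;  /* atomic64_inc_return() */

	live[nlive++] = id;
	return id;
}

static void id_put(long id)
{
	long max = 0;
	int i, n = 0;

	for (i = 0; i < nlive; i++)               /* drop id from the list */
		if (live[i] != id)
			live[n++] = live[i];
	nlive = n;

	if (atomic_load(&id_max) != id)
		return;                           /* not the maximum: done */

	for (i = 0; i < nlive; i++)               /* recompute the maximum */
		if (live[i] > max)
			max = live[i];

	/* Reset only if no concurrent id_get() raced past us. */
	atomic_compare_exchange_strong(&id_max, &id, max);
}

int main(void)
{
	long a = id_get();
	long b = id_get();

	id_put(b);
	printf("released %ld, max back to %ld (== %ld)\n",
	       b, atomic_load(&id_max), a);
	return 0;
}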
4262 | /* | 4233 | /* |
4263 | * Skips over white space at *buf, and updates *buf to point to the | 4234 | * Skips over white space at *buf, and updates *buf to point to the |
4264 | * first found non-space character (if any). Returns the length of | 4235 | * first found non-space character (if any). Returns the length of |
4265 | * the token (string of non-white space characters) found. Note | 4236 | * the token (string of non-white space characters) found. Note |
4266 | * that *buf must be terminated with '\0'. | 4237 | * that *buf must be terminated with '\0'. |
4267 | */ | 4238 | */ |
4268 | static inline size_t next_token(const char **buf) | 4239 | static inline size_t next_token(const char **buf) |
4269 | { | 4240 | { |
4270 | /* | 4241 | /* |
4271 | * These are the characters that produce nonzero for | 4242 | * These are the characters that produce nonzero for |
4272 | * isspace() in the "C" and "POSIX" locales. | 4243 | * isspace() in the "C" and "POSIX" locales. |
4273 | */ | 4244 | */ |
4274 | const char *spaces = " \f\n\r\t\v"; | 4245 | const char *spaces = " \f\n\r\t\v"; |
4275 | 4246 | ||
4276 | *buf += strspn(*buf, spaces); /* Find start of token */ | 4247 | *buf += strspn(*buf, spaces); /* Find start of token */ |
4277 | 4248 | ||
4278 | return strcspn(*buf, spaces); /* Return token length */ | 4249 | return strcspn(*buf, spaces); /* Return token length */ |
4279 | } | 4250 | } |
4280 | 4251 | ||
4281 | /* | 4252 | /* |
4282 | * Finds the next token in *buf, and if the provided token buffer is | 4253 | * Finds the next token in *buf, and if the provided token buffer is |
4283 | * big enough, copies the found token into it. The result, if | 4254 | * big enough, copies the found token into it. The result, if |
4284 | * copied, is guaranteed to be terminated with '\0'. Note that *buf | 4255 | * copied, is guaranteed to be terminated with '\0'. Note that *buf |
4285 | * must be terminated with '\0' on entry. | 4256 | * must be terminated with '\0' on entry. |
4286 | * | 4257 | * |
4287 | * Returns the length of the token found (not including the '\0'). | 4258 | * Returns the length of the token found (not including the '\0'). |
4288 | * Return value will be 0 if no token is found, and it will be >= | 4259 | * Return value will be 0 if no token is found, and it will be >= |
4289 | * token_size if the token would not fit. | 4260 | * token_size if the token would not fit. |
4290 | * | 4261 | * |
4291 | * The *buf pointer will be updated to point beyond the end of the | 4262 | * The *buf pointer will be updated to point beyond the end of the |
4292 | * found token. Note that this occurs even if the token buffer is | 4263 | * found token. Note that this occurs even if the token buffer is |
4293 | * too small to hold it. | 4264 | * too small to hold it. |
4294 | */ | 4265 | */ |
4295 | static inline size_t copy_token(const char **buf, | 4266 | static inline size_t copy_token(const char **buf, |
4296 | char *token, | 4267 | char *token, |
4297 | size_t token_size) | 4268 | size_t token_size) |
4298 | { | 4269 | { |
4299 | size_t len; | 4270 | size_t len; |
4300 | 4271 | ||
4301 | len = next_token(buf); | 4272 | len = next_token(buf); |
4302 | if (len < token_size) { | 4273 | if (len < token_size) { |
4303 | memcpy(token, *buf, len); | 4274 | memcpy(token, *buf, len); |
4304 | *(token + len) = '\0'; | 4275 | *(token + len) = '\0'; |
4305 | } | 4276 | } |
4306 | *buf += len; | 4277 | *buf += len; |
4307 | 4278 | ||
4308 | return len; | 4279 | return len; |
4309 | } | 4280 | } |
4310 | 4281 | ||
4311 | /* | 4282 | /* |
4312 | * Finds the next token in *buf, dynamically allocates a buffer big | 4283 | * Finds the next token in *buf, dynamically allocates a buffer big |
4313 | * enough to hold a copy of it, and copies the token into the new | 4284 | * enough to hold a copy of it, and copies the token into the new |
4314 | * buffer. The copy is guaranteed to be terminated with '\0'. Note | 4285 | * buffer. The copy is guaranteed to be terminated with '\0'. Note |
4315 | * that a duplicate buffer is created even for a zero-length token. | 4286 | * that a duplicate buffer is created even for a zero-length token. |
4316 | * | 4287 | * |
4317 | * Returns a pointer to the newly-allocated duplicate, or a null | 4288 | * Returns a pointer to the newly-allocated duplicate, or a null |
4318 | * pointer if memory for the duplicate was not available. If | 4289 | * pointer if memory for the duplicate was not available. If |
4319 | * the lenp argument is a non-null pointer, the length of the token | 4290 | * the lenp argument is a non-null pointer, the length of the token |
4320 | * (not including the '\0') is returned in *lenp. | 4291 | * (not including the '\0') is returned in *lenp. |
4321 | * | 4292 | * |
4322 | * If successful, the *buf pointer will be updated to point beyond | 4293 | * If successful, the *buf pointer will be updated to point beyond |
4323 | * the end of the found token. | 4294 | * the end of the found token. |
4324 | * | 4295 | * |
4325 | * Note: uses GFP_KERNEL for allocation. | 4296 | * Note: uses GFP_KERNEL for allocation. |
4326 | */ | 4297 | */ |
4327 | static inline char *dup_token(const char **buf, size_t *lenp) | 4298 | static inline char *dup_token(const char **buf, size_t *lenp) |
4328 | { | 4299 | { |
4329 | char *dup; | 4300 | char *dup; |
4330 | size_t len; | 4301 | size_t len; |
4331 | 4302 | ||
4332 | len = next_token(buf); | 4303 | len = next_token(buf); |
4333 | dup = kmemdup(*buf, len + 1, GFP_KERNEL); | 4304 | dup = kmemdup(*buf, len + 1, GFP_KERNEL); |
4334 | if (!dup) | 4305 | if (!dup) |
4335 | return NULL; | 4306 | return NULL; |
4336 | *(dup + len) = '\0'; | 4307 | *(dup + len) = '\0'; |
4337 | *buf += len; | 4308 | *buf += len; |
4338 | 4309 | ||
4339 | if (lenp) | 4310 | if (lenp) |
4340 | *lenp = len; | 4311 | *lenp = len; |
4341 | 4312 | ||
4342 | return dup; | 4313 | return dup; |
4343 | } | 4314 | } |
4344 | 4315 | ||
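Editor's note: next_token(), copy_token() and dup_token() are one tokenizer in three flavors, all built on the strspn()/strcspn() pair -- skip the leading run of white space, then measure the run of non-white-space that follows. The same two calls work unchanged in userspace; here is a self-contained sketch (the input line is made up):

#include <stdio.h>
#include <string.h>

/* The space set the driver uses (nonzero for isspace() in "C"/"POSIX"). */
static const char *spaces = " \f\n\r\t\v";

static size_t next_token(const char **buf)
{
	*buf += strspn(*buf, spaces);     /* find start of token */

	return strcspn(*buf, spaces);     /* return token length */
}

int main(void)
{
	const char *buf = " 1.2.3.4:6789  name=admin rbd myimage ";
	size_t len;

	while ((len = next_token(&buf)) != 0) {
		printf("token: '%.*s'\n", (int)len, buf);
		buf += len;               /* step past the token */
	}
	return 0;
}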
4345 | /* | 4316 | /* |
4346 | * Parse the options provided for an "rbd add" (i.e., rbd image | 4317 | * Parse the options provided for an "rbd add" (i.e., rbd image |
4347 | * mapping) request. These arrive via a write to /sys/bus/rbd/add, | 4318 | * mapping) request. These arrive via a write to /sys/bus/rbd/add, |
4348 | * and the data written is passed here via a NUL-terminated buffer. | 4319 | * and the data written is passed here via a NUL-terminated buffer. |
4349 | * Returns 0 if successful or an error code otherwise. | 4320 | * Returns 0 if successful or an error code otherwise. |
4350 | * | 4321 | * |
4351 | * The information extracted from these options is recorded in | 4322 | * The information extracted from these options is recorded in |
4352 | * the other parameters which return dynamically-allocated | 4323 | * the other parameters which return dynamically-allocated |
4353 | * structures: | 4324 | * structures: |
4354 | * ceph_opts | 4325 | * ceph_opts |
4355 | * The address of a pointer that will refer to a ceph options | 4326 | * The address of a pointer that will refer to a ceph options |
4356 | * structure. Caller must release the returned pointer using | 4327 | * structure. Caller must release the returned pointer using |
4357 | * ceph_destroy_options() when it is no longer needed. | 4328 | * ceph_destroy_options() when it is no longer needed. |
4358 | * rbd_opts | 4329 | * rbd_opts |
4359 | * Address of an rbd options pointer. Fully initialized by | 4330 | * Address of an rbd options pointer. Fully initialized by |
4360 | * this function; caller must release with kfree(). | 4331 | * this function; caller must release with kfree(). |
4361 | * spec | 4332 | * spec |
4362 | * Address of an rbd image specification pointer. Fully | 4333 | * Address of an rbd image specification pointer. Fully |
4363 | * initialized by this function based on parsed options. | 4334 | * initialized by this function based on parsed options. |
4364 | * Caller must release with rbd_spec_put(). | 4335 | * Caller must release with rbd_spec_put(). |
4365 | * | 4336 | * |
4366 | * The options passed take this form: | 4337 | * The options passed take this form: |
4367 | * <mon_addrs> <options> <pool_name> <image_name> [<snap_name>] | 4338 | * <mon_addrs> <options> <pool_name> <image_name> [<snap_name>] |
4368 | * where: | 4339 | * where: |
4369 | * <mon_addrs> | 4340 | * <mon_addrs> |
4370 | * A comma-separated list of one or more monitor addresses. | 4341 | * A comma-separated list of one or more monitor addresses. |
4371 | * A monitor address is an ip address, optionally followed | 4342 | * A monitor address is an ip address, optionally followed |
4372 | * by a port number (separated by a colon). | 4343 | * by a port number (separated by a colon). |
4373 | * I.e.: ip1[:port1][,ip2[:port2]...] | 4344 | * I.e.: ip1[:port1][,ip2[:port2]...] |
4374 | * <options> | 4345 | * <options> |
4375 | * A comma-separated list of ceph and/or rbd options. | 4346 | * A comma-separated list of ceph and/or rbd options. |
4376 | * <pool_name> | 4347 | * <pool_name> |
4377 | * The name of the rados pool containing the rbd image. | 4348 | * The name of the rados pool containing the rbd image. |
4378 | * <image_name> | 4349 | * <image_name> |
4379 | * The name of the image in that pool to map. | 4350 | * The name of the image in that pool to map. |
4380 | * <snap_name> | 4351 | * <snap_name> |
4381 | * An optional snapshot name. If provided, the mapping will | 4352 | * An optional snapshot name. If provided, the mapping will |
4382 | * present data from the image at the time that snapshot was | 4353 | * present data from the image at the time that snapshot was |
4383 | * created. The image head is used if no snapshot name is | 4354 | * created. The image head is used if no snapshot name is |
4384 | * provided. Snapshot mappings are always read-only. | 4355 | * provided. Snapshot mappings are always read-only. |
4385 | */ | 4356 | */ |
4386 | static int rbd_add_parse_args(const char *buf, | 4357 | static int rbd_add_parse_args(const char *buf, |
4387 | struct ceph_options **ceph_opts, | 4358 | struct ceph_options **ceph_opts, |
4388 | struct rbd_options **opts, | 4359 | struct rbd_options **opts, |
4389 | struct rbd_spec **rbd_spec) | 4360 | struct rbd_spec **rbd_spec) |
4390 | { | 4361 | { |
4391 | size_t len; | 4362 | size_t len; |
4392 | char *options; | 4363 | char *options; |
4393 | const char *mon_addrs; | 4364 | const char *mon_addrs; |
4394 | char *snap_name; | 4365 | char *snap_name; |
4395 | size_t mon_addrs_size; | 4366 | size_t mon_addrs_size; |
4396 | struct rbd_spec *spec = NULL; | 4367 | struct rbd_spec *spec = NULL; |
4397 | struct rbd_options *rbd_opts = NULL; | 4368 | struct rbd_options *rbd_opts = NULL; |
4398 | struct ceph_options *copts; | 4369 | struct ceph_options *copts; |
4399 | int ret; | 4370 | int ret; |
4400 | 4371 | ||
4401 | /* The first four tokens are required */ | 4372 | /* The first four tokens are required */ |
4402 | 4373 | ||
4403 | len = next_token(&buf); | 4374 | len = next_token(&buf); |
4404 | if (!len) { | 4375 | if (!len) { |
4405 | rbd_warn(NULL, "no monitor address(es) provided"); | 4376 | rbd_warn(NULL, "no monitor address(es) provided"); |
4406 | return -EINVAL; | 4377 | return -EINVAL; |
4407 | } | 4378 | } |
4408 | mon_addrs = buf; | 4379 | mon_addrs = buf; |
4409 | mon_addrs_size = len + 1; | 4380 | mon_addrs_size = len + 1; |
4410 | buf += len; | 4381 | buf += len; |
4411 | 4382 | ||
4412 | ret = -EINVAL; | 4383 | ret = -EINVAL; |
4413 | options = dup_token(&buf, NULL); | 4384 | options = dup_token(&buf, NULL); |
4414 | if (!options) | 4385 | if (!options) |
4415 | return -ENOMEM; | 4386 | return -ENOMEM; |
4416 | if (!*options) { | 4387 | if (!*options) { |
4417 | rbd_warn(NULL, "no options provided"); | 4388 | rbd_warn(NULL, "no options provided"); |
4418 | goto out_err; | 4389 | goto out_err; |
4419 | } | 4390 | } |
4420 | 4391 | ||
4421 | spec = rbd_spec_alloc(); | 4392 | spec = rbd_spec_alloc(); |
4422 | if (!spec) | 4393 | if (!spec) |
4423 | goto out_mem; | 4394 | goto out_mem; |
4424 | 4395 | ||
4425 | spec->pool_name = dup_token(&buf, NULL); | 4396 | spec->pool_name = dup_token(&buf, NULL); |
4426 | if (!spec->pool_name) | 4397 | if (!spec->pool_name) |
4427 | goto out_mem; | 4398 | goto out_mem; |
4428 | if (!*spec->pool_name) { | 4399 | if (!*spec->pool_name) { |
4429 | rbd_warn(NULL, "no pool name provided"); | 4400 | rbd_warn(NULL, "no pool name provided"); |
4430 | goto out_err; | 4401 | goto out_err; |
4431 | } | 4402 | } |
4432 | 4403 | ||
4433 | spec->image_name = dup_token(&buf, NULL); | 4404 | spec->image_name = dup_token(&buf, NULL); |
4434 | if (!spec->image_name) | 4405 | if (!spec->image_name) |
4435 | goto out_mem; | 4406 | goto out_mem; |
4436 | if (!*spec->image_name) { | 4407 | if (!*spec->image_name) { |
4437 | rbd_warn(NULL, "no image name provided"); | 4408 | rbd_warn(NULL, "no image name provided"); |
4438 | goto out_err; | 4409 | goto out_err; |
4439 | } | 4410 | } |
4440 | 4411 | ||
4441 | /* | 4412 | /* |
4442 | * Snapshot name is optional; default is to use "-" | 4413 | * Snapshot name is optional; default is to use "-" |
4443 | * (indicating the head/no snapshot). | 4414 | * (indicating the head/no snapshot). |
4444 | */ | 4415 | */ |
4445 | len = next_token(&buf); | 4416 | len = next_token(&buf); |
4446 | if (!len) { | 4417 | if (!len) { |
4447 | buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ | 4418 | buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ |
4448 | len = sizeof (RBD_SNAP_HEAD_NAME) - 1; | 4419 | len = sizeof (RBD_SNAP_HEAD_NAME) - 1; |
4449 | } else if (len > RBD_MAX_SNAP_NAME_LEN) { | 4420 | } else if (len > RBD_MAX_SNAP_NAME_LEN) { |
4450 | ret = -ENAMETOOLONG; | 4421 | ret = -ENAMETOOLONG; |
4451 | goto out_err; | 4422 | goto out_err; |
4452 | } | 4423 | } |
4453 | snap_name = kmemdup(buf, len + 1, GFP_KERNEL); | 4424 | snap_name = kmemdup(buf, len + 1, GFP_KERNEL); |
4454 | if (!snap_name) | 4425 | if (!snap_name) |
4455 | goto out_mem; | 4426 | goto out_mem; |
4456 | *(snap_name + len) = '\0'; | 4427 | *(snap_name + len) = '\0'; |
4457 | spec->snap_name = snap_name; | 4428 | spec->snap_name = snap_name; |
4458 | 4429 | ||
4459 | /* Initialize all rbd options to the defaults */ | 4430 | /* Initialize all rbd options to the defaults */ |
4460 | 4431 | ||
4461 | rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); | 4432 | rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); |
4462 | if (!rbd_opts) | 4433 | if (!rbd_opts) |
4463 | goto out_mem; | 4434 | goto out_mem; |
4464 | 4435 | ||
4465 | rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; | 4436 | rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; |
4466 | 4437 | ||
4467 | copts = ceph_parse_options(options, mon_addrs, | 4438 | copts = ceph_parse_options(options, mon_addrs, |
4468 | mon_addrs + mon_addrs_size - 1, | 4439 | mon_addrs + mon_addrs_size - 1, |
4469 | parse_rbd_opts_token, rbd_opts); | 4440 | parse_rbd_opts_token, rbd_opts); |
4470 | if (IS_ERR(copts)) { | 4441 | if (IS_ERR(copts)) { |
4471 | ret = PTR_ERR(copts); | 4442 | ret = PTR_ERR(copts); |
4472 | goto out_err; | 4443 | goto out_err; |
4473 | } | 4444 | } |
4474 | kfree(options); | 4445 | kfree(options); |
4475 | 4446 | ||
4476 | *ceph_opts = copts; | 4447 | *ceph_opts = copts; |
4477 | *opts = rbd_opts; | 4448 | *opts = rbd_opts; |
4478 | *rbd_spec = spec; | 4449 | *rbd_spec = spec; |
4479 | 4450 | ||
4480 | return 0; | 4451 | return 0; |
4481 | out_mem: | 4452 | out_mem: |
4482 | ret = -ENOMEM; | 4453 | ret = -ENOMEM; |
4483 | out_err: | 4454 | out_err: |
4484 | kfree(rbd_opts); | 4455 | kfree(rbd_opts); |
4485 | rbd_spec_put(spec); | 4456 | rbd_spec_put(spec); |
4486 | kfree(options); | 4457 | kfree(options); |
4487 | 4458 | ||
4488 | return ret; | 4459 | return ret; |
4489 | } | 4460 | } |
4490 | 4461 | ||
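Editor's note: for reference, this is roughly what feeding that syntax to /sys/bus/rbd/add could look like from userspace. It is a hedged sketch: the monitor address, option list, and pool/image names are placeholders, and a real cluster will normally require authentication options as well. Omitting the trailing snapshot name maps the image head, per the default described above.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* <mon_addrs> <options> <pool_name> <image_name> -- all values
	 * here are placeholders, not a working configuration. */
	const char *spec = "1.2.3.4:6789 name=admin rbd myimage";
	int fd = open("/sys/bus/rbd/add", O_WRONLY);

	if (fd < 0) {
		perror("open /sys/bus/rbd/add");
		return 1;
	}
	if (write(fd, spec, strlen(spec)) < 0)
		perror("rbd add");
	close(fd);
	return 0;
}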
4491 | /* | 4462 | /* |
4492 | * An rbd format 2 image has a unique identifier, distinct from the | 4463 | * An rbd format 2 image has a unique identifier, distinct from the |
4493 | * name given to it by the user. Internally, that identifier is | 4464 | * name given to it by the user. Internally, that identifier is |
4494 | * what's used to specify the names of objects related to the image. | 4465 | * what's used to specify the names of objects related to the image. |
4495 | * | 4466 | * |
4496 | * A special "rbd id" object is used to map an rbd image name to its | 4467 | * A special "rbd id" object is used to map an rbd image name to its |
4497 | * id. If that object doesn't exist, then there is no v2 rbd image | 4468 | * id. If that object doesn't exist, then there is no v2 rbd image |
4498 | * with the supplied name. | 4469 | * with the supplied name. |
4499 | * | 4470 | * |
4500 | * This function will record the given rbd_dev's image_id field if | 4471 | * This function will record the given rbd_dev's image_id field if |
4501 | * it can be determined, and in that case will return 0. If any | 4472 | * it can be determined, and in that case will return 0. If any |
4502 | * errors occur a negative errno will be returned and the rbd_dev's | 4473 | * errors occur a negative errno will be returned and the rbd_dev's |
4503 | * image_id field will be unchanged (and should be NULL). | 4474 | * image_id field will be unchanged (and should be NULL). |
4504 | */ | 4475 | */ |
4505 | static int rbd_dev_image_id(struct rbd_device *rbd_dev) | 4476 | static int rbd_dev_image_id(struct rbd_device *rbd_dev) |
4506 | { | 4477 | { |
4507 | int ret; | 4478 | int ret; |
4508 | size_t size; | 4479 | size_t size; |
4509 | char *object_name; | 4480 | char *object_name; |
4510 | void *response; | 4481 | void *response; |
4511 | char *image_id; | 4482 | char *image_id; |
4512 | 4483 | ||
4513 | /* | 4484 | /* |
4514 | * When probing a parent image, the image id is already | 4485 | * When probing a parent image, the image id is already |
4515 | * known (and the image name likely is not). There's no | 4486 | * known (and the image name likely is not). There's no |
4516 | * need to fetch the image id again in this case. We | 4487 | * need to fetch the image id again in this case. We |
4517 | * do still need to set the image format though. | 4488 | * do still need to set the image format though. |
4518 | */ | 4489 | */ |
4519 | if (rbd_dev->spec->image_id) { | 4490 | if (rbd_dev->spec->image_id) { |
4520 | rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1; | 4491 | rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1; |
4521 | 4492 | ||
4522 | return 0; | 4493 | return 0; |
4523 | } | 4494 | } |
4524 | 4495 | ||
4525 | /* | 4496 | /* |
4526 | * First, see if the format 2 image id file exists, and if | 4497 | * First, see if the format 2 image id file exists, and if |
4527 | * so, get the image's persistent id from it. | 4498 | * so, get the image's persistent id from it. |
4528 | */ | 4499 | */ |
4529 | size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); | 4500 | size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); |
4530 | object_name = kmalloc(size, GFP_NOIO); | 4501 | object_name = kmalloc(size, GFP_NOIO); |
4531 | if (!object_name) | 4502 | if (!object_name) |
4532 | return -ENOMEM; | 4503 | return -ENOMEM; |
4533 | sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); | 4504 | sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); |
4534 | dout("rbd id object name is %s\n", object_name); | 4505 | dout("rbd id object name is %s\n", object_name); |
4535 | 4506 | ||
4536 | /* Response will be an encoded string, which includes a length */ | 4507 | /* Response will be an encoded string, which includes a length */ |
4537 | 4508 | ||
4538 | size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; | 4509 | size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; |
4539 | response = kzalloc(size, GFP_NOIO); | 4510 | response = kzalloc(size, GFP_NOIO); |
4540 | if (!response) { | 4511 | if (!response) { |
4541 | ret = -ENOMEM; | 4512 | ret = -ENOMEM; |
4542 | goto out; | 4513 | goto out; |
4543 | } | 4514 | } |
4544 | 4515 | ||
4545 | /* If it doesn't exist we'll assume it's a format 1 image */ | 4516 | /* If it doesn't exist we'll assume it's a format 1 image */ |
4546 | 4517 | ||
4547 | ret = rbd_obj_method_sync(rbd_dev, object_name, | 4518 | ret = rbd_obj_method_sync(rbd_dev, object_name, |
4548 | "rbd", "get_id", NULL, 0, | 4519 | "rbd", "get_id", NULL, 0, |
4549 | response, RBD_IMAGE_ID_LEN_MAX, NULL); | 4520 | response, RBD_IMAGE_ID_LEN_MAX, NULL); |
4550 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); | 4521 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
4551 | if (ret == -ENOENT) { | 4522 | if (ret == -ENOENT) { |
4552 | image_id = kstrdup("", GFP_KERNEL); | 4523 | image_id = kstrdup("", GFP_KERNEL); |
4553 | ret = image_id ? 0 : -ENOMEM; | 4524 | ret = image_id ? 0 : -ENOMEM; |
4554 | if (!ret) | 4525 | if (!ret) |
4555 | rbd_dev->image_format = 1; | 4526 | rbd_dev->image_format = 1; |
4556 | } else if (ret > sizeof (__le32)) { | 4527 | } else if (ret > sizeof (__le32)) { |
4557 | void *p = response; | 4528 | void *p = response; |
4558 | 4529 | ||
4559 | image_id = ceph_extract_encoded_string(&p, p + ret, | 4530 | image_id = ceph_extract_encoded_string(&p, p + ret, |
4560 | NULL, GFP_NOIO); | 4531 | NULL, GFP_NOIO); |
4561 | ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0; | 4532 | ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0; |
4562 | if (!ret) | 4533 | if (!ret) |
4563 | rbd_dev->image_format = 2; | 4534 | rbd_dev->image_format = 2; |
4564 | } else { | 4535 | } else { |
4565 | ret = -EINVAL; | 4536 | ret = -EINVAL; |
4566 | } | 4537 | } |
4567 | 4538 | ||
4568 | if (!ret) { | 4539 | if (!ret) { |
4569 | rbd_dev->spec->image_id = image_id; | 4540 | rbd_dev->spec->image_id = image_id; |
4570 | dout("image_id is %s\n", image_id); | 4541 | dout("image_id is %s\n", image_id); |
4571 | } | 4542 | } |
4572 | out: | 4543 | out: |
4573 | kfree(response); | 4544 | kfree(response); |
4574 | kfree(object_name); | 4545 | kfree(object_name); |
4575 | 4546 | ||
4576 | return ret; | 4547 | return ret; |
4577 | } | 4548 | } |
4578 | 4549 | ||
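Editor's note: the "get_id" reply is a ceph-encoded string -- a little-endian 32-bit length followed by that many bytes -- which is why the success path requires ret > sizeof (__le32) before handing the buffer to ceph_extract_encoded_string(). Below is a standalone sketch of that decode step, assuming a little-endian host and using only libc; the helper name and sample buffer are invented for illustration.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Decode a ceph-style encoded string: __le32 length, then the bytes. */
static char *extract_encoded_string(const void *buf, size_t buf_len)
{
	uint32_t len;
	char *s;

	if (buf_len < sizeof(len))
		return NULL;
	memcpy(&len, buf, sizeof(len));      /* assumes little-endian host */
	if (buf_len - sizeof(len) < len)
		return NULL;                 /* truncated response */

	s = malloc(len + 1);
	if (!s)
		return NULL;
	memcpy(s, (const char *)buf + sizeof(len), len);
	s[len] = '\0';
	return s;
}

int main(void)
{
	/* A 7-byte payload "1234abc" behind its length prefix. */
	unsigned char resp[] = { 7, 0, 0, 0, '1', '2', '3', '4', 'a', 'b', 'c' };
	char *id = extract_encoded_string(resp, sizeof(resp));

	printf("image id: %s\n", id ? id : "(bad encoding)");
	free(id);
	return 0;
}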
4579 | /* Undo whatever state changes are made by v1 or v2 image probe */ | 4550 | /* Undo whatever state changes are made by v1 or v2 image probe */ |
4580 | 4551 | ||
4581 | static void rbd_dev_unprobe(struct rbd_device *rbd_dev) | 4552 | static void rbd_dev_unprobe(struct rbd_device *rbd_dev) |
4582 | { | 4553 | { |
4583 | struct rbd_image_header *header; | 4554 | struct rbd_image_header *header; |
4584 | 4555 | ||
4585 | rbd_dev_remove_parent(rbd_dev); | 4556 | rbd_dev_remove_parent(rbd_dev); |
4586 | rbd_spec_put(rbd_dev->parent_spec); | 4557 | rbd_spec_put(rbd_dev->parent_spec); |
4587 | rbd_dev->parent_spec = NULL; | 4558 | rbd_dev->parent_spec = NULL; |
4588 | rbd_dev->parent_overlap = 0; | 4559 | rbd_dev->parent_overlap = 0; |
4589 | 4560 | ||
4590 | /* Free dynamic fields from the header, then zero it out */ | 4561 | /* Free dynamic fields from the header, then zero it out */ |
4591 | 4562 | ||
4592 | header = &rbd_dev->header; | 4563 | header = &rbd_dev->header; |
4593 | rbd_snap_context_put(header->snapc); | 4564 | ceph_put_snap_context(header->snapc); |
4594 | kfree(header->snap_sizes); | 4565 | kfree(header->snap_sizes); |
4595 | kfree(header->snap_names); | 4566 | kfree(header->snap_names); |
4596 | kfree(header->object_prefix); | 4567 | kfree(header->object_prefix); |
4597 | memset(header, 0, sizeof (*header)); | 4568 | memset(header, 0, sizeof (*header)); |
4598 | } | 4569 | } |
4599 | 4570 | ||
4600 | static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) | 4571 | static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) |
4601 | { | 4572 | { |
4602 | int ret; | 4573 | int ret; |
4603 | 4574 | ||
4604 | /* Populate rbd image metadata */ | 4575 | /* Populate rbd image metadata */ |
4605 | 4576 | ||
4606 | ret = rbd_read_header(rbd_dev, &rbd_dev->header); | 4577 | ret = rbd_read_header(rbd_dev, &rbd_dev->header); |
4607 | if (ret < 0) | 4578 | if (ret < 0) |
4608 | goto out_err; | 4579 | goto out_err; |
4609 | 4580 | ||
4610 | /* Version 1 images have no parent (no layering) */ | 4581 | /* Version 1 images have no parent (no layering) */ |
4611 | 4582 | ||
4612 | rbd_dev->parent_spec = NULL; | 4583 | rbd_dev->parent_spec = NULL; |
4613 | rbd_dev->parent_overlap = 0; | 4584 | rbd_dev->parent_overlap = 0; |
4614 | 4585 | ||
4615 | dout("discovered version 1 image, header name is %s\n", | 4586 | dout("discovered version 1 image, header name is %s\n", |
4616 | rbd_dev->header_name); | 4587 | rbd_dev->header_name); |
4617 | 4588 | ||
4618 | return 0; | 4589 | return 0; |
4619 | 4590 | ||
4620 | out_err: | 4591 | out_err: |
4621 | kfree(rbd_dev->header_name); | 4592 | kfree(rbd_dev->header_name); |
4622 | rbd_dev->header_name = NULL; | 4593 | rbd_dev->header_name = NULL; |
4623 | kfree(rbd_dev->spec->image_id); | 4594 | kfree(rbd_dev->spec->image_id); |
4624 | rbd_dev->spec->image_id = NULL; | 4595 | rbd_dev->spec->image_id = NULL; |
4625 | 4596 | ||
4626 | return ret; | 4597 | return ret; |
4627 | } | 4598 | } |
4628 | 4599 | ||
4629 | static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) | 4600 | static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) |
4630 | { | 4601 | { |
4631 | int ret; | 4602 | int ret; |
4632 | u64 ver = 0; | 4603 | u64 ver = 0; |
4633 | 4604 | ||
4634 | ret = rbd_dev_v2_image_size(rbd_dev); | 4605 | ret = rbd_dev_v2_image_size(rbd_dev); |
4635 | if (ret) | 4606 | if (ret) |
4636 | goto out_err; | 4607 | goto out_err; |
4637 | 4608 | ||
4638 | /* Get the object prefix (a.k.a. block_name) for the image */ | 4609 | /* Get the object prefix (a.k.a. block_name) for the image */ |
4639 | 4610 | ||
4640 | ret = rbd_dev_v2_object_prefix(rbd_dev); | 4611 | ret = rbd_dev_v2_object_prefix(rbd_dev); |
4641 | if (ret) | 4612 | if (ret) |
4642 | goto out_err; | 4613 | goto out_err; |
4643 | 4614 | ||
4644 | /* Get and check the features for the image */ | 4615 | /* Get and check the features for the image */ |
4645 | 4616 | ||
4646 | ret = rbd_dev_v2_features(rbd_dev); | 4617 | ret = rbd_dev_v2_features(rbd_dev); |
4647 | if (ret) | 4618 | if (ret) |
4648 | goto out_err; | 4619 | goto out_err; |
4649 | 4620 | ||
4650 | /* If the image supports layering, get the parent info */ | 4621 | /* If the image supports layering, get the parent info */ |
4651 | 4622 | ||
4652 | if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { | 4623 | if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { |
4653 | ret = rbd_dev_v2_parent_info(rbd_dev); | 4624 | ret = rbd_dev_v2_parent_info(rbd_dev); |
4654 | if (ret) | 4625 | if (ret) |
4655 | goto out_err; | 4626 | goto out_err; |
4656 | rbd_warn(rbd_dev, "WARNING: kernel support for " | 4627 | rbd_warn(rbd_dev, "WARNING: kernel support for " |
4657 | "layered rbd images is EXPERIMENTAL!"); | 4628 | "layered rbd images is EXPERIMENTAL!"); |
4658 | } | 4629 | } |
4659 | 4630 | ||
4660 | /* If the image supports fancy striping, get its parameters */ | 4631 | /* If the image supports fancy striping, get its parameters */ |
4661 | 4632 | ||
4662 | if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { | 4633 | if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { |
4663 | ret = rbd_dev_v2_striping_info(rbd_dev); | 4634 | ret = rbd_dev_v2_striping_info(rbd_dev); |
4664 | if (ret < 0) | 4635 | if (ret < 0) |
4665 | goto out_err; | 4636 | goto out_err; |
4666 | } | 4637 | } |
4667 | 4638 | ||
4668 | /* crypto and compression type aren't (yet) supported for v2 images */ | 4639 | /* crypto and compression type aren't (yet) supported for v2 images */ |
4669 | 4640 | ||
4670 | rbd_dev->header.crypt_type = 0; | 4641 | rbd_dev->header.crypt_type = 0; |
4671 | rbd_dev->header.comp_type = 0; | 4642 | rbd_dev->header.comp_type = 0; |
4672 | 4643 | ||
4673 | /* Get the snapshot context, plus the header version */ | 4644 | /* Get the snapshot context, plus the header version */ |
4674 | 4645 | ||
4675 | ret = rbd_dev_v2_snap_context(rbd_dev, &ver); | 4646 | ret = rbd_dev_v2_snap_context(rbd_dev, &ver); |
4676 | if (ret) | 4647 | if (ret) |
4677 | goto out_err; | 4648 | goto out_err; |
4678 | rbd_dev->header.obj_version = ver; | 4649 | rbd_dev->header.obj_version = ver; |
4679 | 4650 | ||
4680 | dout("discovered version 2 image, header name is %s\n", | 4651 | dout("discovered version 2 image, header name is %s\n", |
4681 | rbd_dev->header_name); | 4652 | rbd_dev->header_name); |
4682 | 4653 | ||
4683 | return 0; | 4654 | return 0; |
4684 | out_err: | 4655 | out_err: |
4685 | rbd_dev->parent_overlap = 0; | 4656 | rbd_dev->parent_overlap = 0; |
4686 | rbd_spec_put(rbd_dev->parent_spec); | 4657 | rbd_spec_put(rbd_dev->parent_spec); |
4687 | rbd_dev->parent_spec = NULL; | 4658 | rbd_dev->parent_spec = NULL; |
4688 | kfree(rbd_dev->header_name); | 4659 | kfree(rbd_dev->header_name); |
4689 | rbd_dev->header_name = NULL; | 4660 | rbd_dev->header_name = NULL; |
4690 | kfree(rbd_dev->header.object_prefix); | 4661 | kfree(rbd_dev->header.object_prefix); |
4691 | rbd_dev->header.object_prefix = NULL; | 4662 | rbd_dev->header.object_prefix = NULL; |
4692 | 4663 | ||
4693 | return ret; | 4664 | return ret; |
4694 | } | 4665 | } |
4695 | 4666 | ||
4696 | static int rbd_dev_probe_parent(struct rbd_device *rbd_dev) | 4667 | static int rbd_dev_probe_parent(struct rbd_device *rbd_dev) |
4697 | { | 4668 | { |
4698 | struct rbd_device *parent = NULL; | 4669 | struct rbd_device *parent = NULL; |
4699 | struct rbd_spec *parent_spec; | 4670 | struct rbd_spec *parent_spec; |
4700 | struct rbd_client *rbdc; | 4671 | struct rbd_client *rbdc; |
4701 | int ret; | 4672 | int ret; |
4702 | 4673 | ||
4703 | if (!rbd_dev->parent_spec) | 4674 | if (!rbd_dev->parent_spec) |
4704 | return 0; | 4675 | return 0; |
4705 | /* | 4676 | /* |
4706 | * We need to pass a reference to the client and the parent | 4677 | * We need to pass a reference to the client and the parent |
4707 | * spec when creating the parent rbd_dev. Images related by | 4678 | * spec when creating the parent rbd_dev. Images related by |
4708 | * parent/child relationships always share both. | 4679 | * parent/child relationships always share both. |
4709 | */ | 4680 | */ |
4710 | parent_spec = rbd_spec_get(rbd_dev->parent_spec); | 4681 | parent_spec = rbd_spec_get(rbd_dev->parent_spec); |
4711 | rbdc = __rbd_get_client(rbd_dev->rbd_client); | 4682 | rbdc = __rbd_get_client(rbd_dev->rbd_client); |
4712 | 4683 | ||
4713 | ret = -ENOMEM; | 4684 | ret = -ENOMEM; |
4714 | parent = rbd_dev_create(rbdc, parent_spec); | 4685 | parent = rbd_dev_create(rbdc, parent_spec); |
4715 | if (!parent) | 4686 | if (!parent) |
4716 | goto out_err; | 4687 | goto out_err; |
4717 | 4688 | ||
4718 | ret = rbd_dev_image_probe(parent); | 4689 | ret = rbd_dev_image_probe(parent); |
4719 | if (ret < 0) | 4690 | if (ret < 0) |
4720 | goto out_err; | 4691 | goto out_err; |
4721 | rbd_dev->parent = parent; | 4692 | rbd_dev->parent = parent; |
4722 | 4693 | ||
4723 | return 0; | 4694 | return 0; |
4724 | out_err: | 4695 | out_err: |
4725 | if (parent) { | 4696 | if (parent) { |
4726 | rbd_spec_put(rbd_dev->parent_spec); | 4697 | rbd_spec_put(rbd_dev->parent_spec); |
4727 | kfree(rbd_dev->header_name); | 4698 | kfree(rbd_dev->header_name); |
4728 | rbd_dev_destroy(parent); | 4699 | rbd_dev_destroy(parent); |
4729 | } else { | 4700 | } else { |
4730 | rbd_put_client(rbdc); | 4701 | rbd_put_client(rbdc); |
4731 | rbd_spec_put(parent_spec); | 4702 | rbd_spec_put(parent_spec); |
4732 | } | 4703 | } |
4733 | 4704 | ||
4734 | return ret; | 4705 | return ret; |
4735 | } | 4706 | } |
4736 | 4707 | ||
4737 | static int rbd_dev_device_setup(struct rbd_device *rbd_dev) | 4708 | static int rbd_dev_device_setup(struct rbd_device *rbd_dev) |
4738 | { | 4709 | { |
4739 | int ret; | 4710 | int ret; |
4740 | 4711 | ||
4741 | ret = rbd_dev_mapping_set(rbd_dev); | 4712 | ret = rbd_dev_mapping_set(rbd_dev); |
4742 | if (ret) | 4713 | if (ret) |
4743 | return ret; | 4714 | return ret; |
4744 | 4715 | ||
4745 | /* generate unique id: find highest unique id, add one */ | 4716 | /* generate unique id: find highest unique id, add one */ |
4746 | rbd_dev_id_get(rbd_dev); | 4717 | rbd_dev_id_get(rbd_dev); |
4747 | 4718 | ||
4748 | /* Fill in the device name, now that we have its id. */ | 4719 | /* Fill in the device name, now that we have its id. */ |
4749 | BUILD_BUG_ON(DEV_NAME_LEN | 4720 | BUILD_BUG_ON(DEV_NAME_LEN |
4750 | < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); | 4721 | < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); |
4751 | sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); | 4722 | sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); |
4752 | 4723 | ||
4753 | /* Get our block major device number. */ | 4724 | /* Get our block major device number. */ |
4754 | 4725 | ||
4755 | ret = register_blkdev(0, rbd_dev->name); | 4726 | ret = register_blkdev(0, rbd_dev->name); |
4756 | if (ret < 0) | 4727 | if (ret < 0) |
4757 | goto err_out_id; | 4728 | goto err_out_id; |
4758 | rbd_dev->major = ret; | 4729 | rbd_dev->major = ret; |
4759 | 4730 | ||
4760 | /* Set up the blkdev mapping. */ | 4731 | /* Set up the blkdev mapping. */ |
4761 | 4732 | ||
4762 | ret = rbd_init_disk(rbd_dev); | 4733 | ret = rbd_init_disk(rbd_dev); |
4763 | if (ret) | 4734 | if (ret) |
4764 | goto err_out_blkdev; | 4735 | goto err_out_blkdev; |
4765 | 4736 | ||
4766 | ret = rbd_bus_add_dev(rbd_dev); | 4737 | ret = rbd_bus_add_dev(rbd_dev); |
4767 | if (ret) | 4738 | if (ret) |
4768 | goto err_out_disk; | 4739 | goto err_out_disk; |
4769 | 4740 | ||
4770 | /* Everything's ready. Announce the disk to the world. */ | 4741 | /* Everything's ready. Announce the disk to the world. */ |
4771 | 4742 | ||
4772 | set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); | 4743 | set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); |
4773 | set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); | 4744 | set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); |
4774 | add_disk(rbd_dev->disk); | 4745 | add_disk(rbd_dev->disk); |
4775 | 4746 | ||
4776 | pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, | 4747 | pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, |
4777 | (unsigned long long) rbd_dev->mapping.size); | 4748 | (unsigned long long) rbd_dev->mapping.size); |
4778 | 4749 | ||
4779 | return ret; | 4750 | return ret; |
4780 | 4751 | ||
4781 | err_out_disk: | 4752 | err_out_disk: |
4782 | rbd_free_disk(rbd_dev); | 4753 | rbd_free_disk(rbd_dev); |
4783 | err_out_blkdev: | 4754 | err_out_blkdev: |
4784 | unregister_blkdev(rbd_dev->major, rbd_dev->name); | 4755 | unregister_blkdev(rbd_dev->major, rbd_dev->name); |
4785 | err_out_id: | 4756 | err_out_id: |
4786 | rbd_dev_id_put(rbd_dev); | 4757 | rbd_dev_id_put(rbd_dev); |
4787 | rbd_dev_mapping_clear(rbd_dev); | 4758 | rbd_dev_mapping_clear(rbd_dev); |
4788 | 4759 | ||
4789 | return ret; | 4760 | return ret; |
4790 | } | 4761 | } |
4791 | 4762 | ||
4792 | static int rbd_dev_header_name(struct rbd_device *rbd_dev) | 4763 | static int rbd_dev_header_name(struct rbd_device *rbd_dev) |
4793 | { | 4764 | { |
4794 | struct rbd_spec *spec = rbd_dev->spec; | 4765 | struct rbd_spec *spec = rbd_dev->spec; |
4795 | size_t size; | 4766 | size_t size; |
4796 | 4767 | ||
4797 | /* Record the header object name for this rbd image. */ | 4768 | /* Record the header object name for this rbd image. */ |
4798 | 4769 | ||
4799 | rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); | 4770 | rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); |
4800 | 4771 | ||
4801 | if (rbd_dev->image_format == 1) | 4772 | if (rbd_dev->image_format == 1) |
4802 | size = strlen(spec->image_name) + sizeof (RBD_SUFFIX); | 4773 | size = strlen(spec->image_name) + sizeof (RBD_SUFFIX); |
4803 | else | 4774 | else |
4804 | size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id); | 4775 | size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id); |
4805 | 4776 | ||
4806 | rbd_dev->header_name = kmalloc(size, GFP_KERNEL); | 4777 | rbd_dev->header_name = kmalloc(size, GFP_KERNEL); |
4807 | if (!rbd_dev->header_name) | 4778 | if (!rbd_dev->header_name) |
4808 | return -ENOMEM; | 4779 | return -ENOMEM; |
4809 | 4780 | ||
4810 | if (rbd_dev->image_format == 1) | 4781 | if (rbd_dev->image_format == 1) |
4811 | sprintf(rbd_dev->header_name, "%s%s", | 4782 | sprintf(rbd_dev->header_name, "%s%s", |
4812 | spec->image_name, RBD_SUFFIX); | 4783 | spec->image_name, RBD_SUFFIX); |
4813 | else | 4784 | else |
4814 | sprintf(rbd_dev->header_name, "%s%s", | 4785 | sprintf(rbd_dev->header_name, "%s%s", |
4815 | RBD_HEADER_PREFIX, spec->image_id); | 4786 | RBD_HEADER_PREFIX, spec->image_id); |
4816 | return 0; | 4787 | return 0; |
4817 | } | 4788 | } |
4818 | 4789 | ||
4819 | static void rbd_dev_image_release(struct rbd_device *rbd_dev) | 4790 | static void rbd_dev_image_release(struct rbd_device *rbd_dev) |
4820 | { | 4791 | { |
4821 | int ret; | 4792 | int ret; |
4822 | 4793 | ||
4823 | rbd_remove_all_snaps(rbd_dev); | 4794 | rbd_remove_all_snaps(rbd_dev); |
4824 | rbd_dev_unprobe(rbd_dev); | 4795 | rbd_dev_unprobe(rbd_dev); |
4825 | ret = rbd_dev_header_watch_sync(rbd_dev, 0); | 4796 | ret = rbd_dev_header_watch_sync(rbd_dev, 0); |
4826 | if (ret) | 4797 | if (ret) |
4827 | rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret); | 4798 | rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret); |
4828 | kfree(rbd_dev->header_name); | 4799 | kfree(rbd_dev->header_name); |
4829 | rbd_dev->header_name = NULL; | 4800 | rbd_dev->header_name = NULL; |
4830 | rbd_dev->image_format = 0; | 4801 | rbd_dev->image_format = 0; |
4831 | kfree(rbd_dev->spec->image_id); | 4802 | kfree(rbd_dev->spec->image_id); |
4832 | rbd_dev->spec->image_id = NULL; | 4803 | rbd_dev->spec->image_id = NULL; |
4833 | 4804 | ||
4834 | rbd_dev_destroy(rbd_dev); | 4805 | rbd_dev_destroy(rbd_dev); |
4835 | } | 4806 | } |
4836 | 4807 | ||
4837 | /* | 4808 | /* |
4838 | * Probe for the existence of the header object for the given rbd | 4809 | * Probe for the existence of the header object for the given rbd |
4839 | * device. For format 2 images this includes determining the image | 4810 | * device. For format 2 images this includes determining the image |
4840 | * id. | 4811 | * id. |
4841 | */ | 4812 | */ |
4842 | static int rbd_dev_image_probe(struct rbd_device *rbd_dev) | 4813 | static int rbd_dev_image_probe(struct rbd_device *rbd_dev) |
4843 | { | 4814 | { |
4844 | int ret; | 4815 | int ret; |
4845 | int tmp; | 4816 | int tmp; |
4846 | 4817 | ||
4847 | /* | 4818 | /* |
4848 | * Get the id from the image id object. If it's not a | 4819 | * Get the id from the image id object. If it's not a |
4849 | * format 2 image, we'll get ENOENT back, and we'll assume | 4820 | * format 2 image, we'll get ENOENT back, and we'll assume |
4850 | * it's a format 1 image. | 4821 | * it's a format 1 image. |
4851 | */ | 4822 | */ |
4852 | ret = rbd_dev_image_id(rbd_dev); | 4823 | ret = rbd_dev_image_id(rbd_dev); |
4853 | if (ret) | 4824 | if (ret) |
4854 | return ret; | 4825 | return ret; |
4855 | rbd_assert(rbd_dev->spec->image_id); | 4826 | rbd_assert(rbd_dev->spec->image_id); |
4856 | rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); | 4827 | rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); |
4857 | 4828 | ||
4858 | ret = rbd_dev_header_name(rbd_dev); | 4829 | ret = rbd_dev_header_name(rbd_dev); |
4859 | if (ret) | 4830 | if (ret) |
4860 | goto err_out_format; | 4831 | goto err_out_format; |
4861 | 4832 | ||
4862 | ret = rbd_dev_header_watch_sync(rbd_dev, 1); | 4833 | ret = rbd_dev_header_watch_sync(rbd_dev, 1); |
4863 | if (ret) | 4834 | if (ret) |
4864 | goto out_header_name; | 4835 | goto out_header_name; |
4865 | 4836 | ||
4866 | if (rbd_dev->image_format == 1) | 4837 | if (rbd_dev->image_format == 1) |
4867 | ret = rbd_dev_v1_probe(rbd_dev); | 4838 | ret = rbd_dev_v1_probe(rbd_dev); |
4868 | else | 4839 | else |
4869 | ret = rbd_dev_v2_probe(rbd_dev); | 4840 | ret = rbd_dev_v2_probe(rbd_dev); |
4870 | if (ret) | 4841 | if (ret) |
4871 | goto err_out_watch; | 4842 | goto err_out_watch; |
4872 | 4843 | ||
4873 | ret = rbd_dev_snaps_update(rbd_dev); | 4844 | ret = rbd_dev_snaps_update(rbd_dev); |
4874 | if (ret) | 4845 | if (ret) |
4875 | goto err_out_probe; | 4846 | goto err_out_probe; |
4876 | 4847 | ||
4877 | ret = rbd_dev_spec_update(rbd_dev); | 4848 | ret = rbd_dev_spec_update(rbd_dev); |
4878 | if (ret) | 4849 | if (ret) |
4879 | goto err_out_snaps; | 4850 | goto err_out_snaps; |
4880 | 4851 | ||
4881 | ret = rbd_dev_probe_parent(rbd_dev); | 4852 | ret = rbd_dev_probe_parent(rbd_dev); |
4882 | if (!ret) | 4853 | if (!ret) |
4883 | return 0; | 4854 | return 0; |
4884 | 4855 | ||
4885 | err_out_snaps: | 4856 | err_out_snaps: |
4886 | rbd_remove_all_snaps(rbd_dev); | 4857 | rbd_remove_all_snaps(rbd_dev); |
4887 | err_out_probe: | 4858 | err_out_probe: |
4888 | rbd_dev_unprobe(rbd_dev); | 4859 | rbd_dev_unprobe(rbd_dev); |
4889 | err_out_watch: | 4860 | err_out_watch: |
4890 | tmp = rbd_dev_header_watch_sync(rbd_dev, 0); | 4861 | tmp = rbd_dev_header_watch_sync(rbd_dev, 0); |
4891 | if (tmp) | 4862 | if (tmp) |
4892 | rbd_warn(rbd_dev, "unable to tear down watch request\n"); | 4863 | rbd_warn(rbd_dev, "unable to tear down watch request\n"); |
4893 | out_header_name: | 4864 | out_header_name: |
4894 | kfree(rbd_dev->header_name); | 4865 | kfree(rbd_dev->header_name); |
4895 | rbd_dev->header_name = NULL; | 4866 | rbd_dev->header_name = NULL; |
4896 | err_out_format: | 4867 | err_out_format: |
4897 | rbd_dev->image_format = 0; | 4868 | rbd_dev->image_format = 0; |
4898 | kfree(rbd_dev->spec->image_id); | 4869 | kfree(rbd_dev->spec->image_id); |
4899 | rbd_dev->spec->image_id = NULL; | 4870 | rbd_dev->spec->image_id = NULL; |
4900 | 4871 | ||
4901 | dout("probe failed, returning %d\n", ret); | 4872 | dout("probe failed, returning %d\n", ret); |
4902 | 4873 | ||
4903 | return ret; | 4874 | return ret; |
4904 | } | 4875 | } |
4905 | 4876 | ||
4906 | static ssize_t rbd_add(struct bus_type *bus, | 4877 | static ssize_t rbd_add(struct bus_type *bus, |
4907 | const char *buf, | 4878 | const char *buf, |
4908 | size_t count) | 4879 | size_t count) |
4909 | { | 4880 | { |
4910 | struct rbd_device *rbd_dev = NULL; | 4881 | struct rbd_device *rbd_dev = NULL; |
4911 | struct ceph_options *ceph_opts = NULL; | 4882 | struct ceph_options *ceph_opts = NULL; |
4912 | struct rbd_options *rbd_opts = NULL; | 4883 | struct rbd_options *rbd_opts = NULL; |
4913 | struct rbd_spec *spec = NULL; | 4884 | struct rbd_spec *spec = NULL; |
4914 | struct rbd_client *rbdc; | 4885 | struct rbd_client *rbdc; |
4915 | struct ceph_osd_client *osdc; | 4886 | struct ceph_osd_client *osdc; |
4916 | int rc = -ENOMEM; | 4887 | int rc = -ENOMEM; |
4917 | 4888 | ||
4918 | if (!try_module_get(THIS_MODULE)) | 4889 | if (!try_module_get(THIS_MODULE)) |
4919 | return -ENODEV; | 4890 | return -ENODEV; |
4920 | 4891 | ||
4921 | /* parse add command */ | 4892 | /* parse add command */ |
4922 | rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); | 4893 | rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); |
4923 | if (rc < 0) | 4894 | if (rc < 0) |
4924 | goto err_out_module; | 4895 | goto err_out_module; |
4925 | 4896 | ||
4926 | rbdc = rbd_get_client(ceph_opts); | 4897 | rbdc = rbd_get_client(ceph_opts); |
4927 | if (IS_ERR(rbdc)) { | 4898 | if (IS_ERR(rbdc)) { |
4928 | rc = PTR_ERR(rbdc); | 4899 | rc = PTR_ERR(rbdc); |
4929 | goto err_out_args; | 4900 | goto err_out_args; |
4930 | } | 4901 | } |
4931 | ceph_opts = NULL; /* rbd_dev client now owns this */ | 4902 | ceph_opts = NULL; /* rbd_dev client now owns this */ |
4932 | 4903 | ||
4933 | /* pick the pool */ | 4904 | /* pick the pool */ |
4934 | osdc = &rbdc->client->osdc; | 4905 | osdc = &rbdc->client->osdc; |
4935 | rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name); | 4906 | rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name); |
4936 | if (rc < 0) | 4907 | if (rc < 0) |
4937 | goto err_out_client; | 4908 | goto err_out_client; |
4938 | spec->pool_id = (u64)rc; | 4909 | spec->pool_id = (u64)rc; |
4939 | 4910 | ||
4940 | /* The ceph file layout needs to fit pool id in 32 bits */ | 4911 | /* The ceph file layout needs to fit pool id in 32 bits */ |
4941 | 4912 | ||
4942 | if (spec->pool_id > (u64)U32_MAX) { | 4913 | if (spec->pool_id > (u64)U32_MAX) { |
4943 | rbd_warn(NULL, "pool id too large (%llu > %u)\n", | 4914 | rbd_warn(NULL, "pool id too large (%llu > %u)\n", |
4944 | (unsigned long long)spec->pool_id, U32_MAX); | 4915 | (unsigned long long)spec->pool_id, U32_MAX); |
4945 | rc = -EIO; | 4916 | rc = -EIO; |
4946 | goto err_out_client; | 4917 | goto err_out_client; |
4947 | } | 4918 | } |
4948 | 4919 | ||
4949 | rbd_dev = rbd_dev_create(rbdc, spec); | 4920 | rbd_dev = rbd_dev_create(rbdc, spec); |
4950 | if (!rbd_dev) | 4921 | if (!rbd_dev) |
4951 | goto err_out_client; | 4922 | goto err_out_client; |
4952 | rbdc = NULL; /* rbd_dev now owns this */ | 4923 | rbdc = NULL; /* rbd_dev now owns this */ |
4953 | spec = NULL; /* rbd_dev now owns this */ | 4924 | spec = NULL; /* rbd_dev now owns this */ |
4954 | 4925 | ||
4955 | rbd_dev->mapping.read_only = rbd_opts->read_only; | 4926 | rbd_dev->mapping.read_only = rbd_opts->read_only; |
4956 | kfree(rbd_opts); | 4927 | kfree(rbd_opts); |
4957 | rbd_opts = NULL; /* done with this */ | 4928 | rbd_opts = NULL; /* done with this */ |
4958 | 4929 | ||
4959 | rc = rbd_dev_image_probe(rbd_dev); | 4930 | rc = rbd_dev_image_probe(rbd_dev); |
4960 | if (rc < 0) | 4931 | if (rc < 0) |
4961 | goto err_out_rbd_dev; | 4932 | goto err_out_rbd_dev; |
4962 | 4933 | ||
4963 | rc = rbd_dev_device_setup(rbd_dev); | 4934 | rc = rbd_dev_device_setup(rbd_dev); |
4964 | if (!rc) | 4935 | if (!rc) |
4965 | return count; | 4936 | return count; |
4966 | 4937 | ||
4967 | rbd_dev_image_release(rbd_dev); | 4938 | rbd_dev_image_release(rbd_dev); |
4968 | err_out_rbd_dev: | 4939 | err_out_rbd_dev: |
4969 | rbd_dev_destroy(rbd_dev); | 4940 | rbd_dev_destroy(rbd_dev); |
4970 | err_out_client: | 4941 | err_out_client: |
4971 | rbd_put_client(rbdc); | 4942 | rbd_put_client(rbdc); |
4972 | err_out_args: | 4943 | err_out_args: |
4973 | if (ceph_opts) | 4944 | if (ceph_opts) |
4974 | ceph_destroy_options(ceph_opts); | 4945 | ceph_destroy_options(ceph_opts); |
4975 | kfree(rbd_opts); | 4946 | kfree(rbd_opts); |
4976 | rbd_spec_put(spec); | 4947 | rbd_spec_put(spec); |
4977 | err_out_module: | 4948 | err_out_module: |
4978 | module_put(THIS_MODULE); | 4949 | module_put(THIS_MODULE); |
4979 | 4950 | ||
4980 | dout("Error adding device %s\n", buf); | 4951 | dout("Error adding device %s\n", buf); |
4981 | 4952 | ||
4982 | return (ssize_t)rc; | 4953 | return (ssize_t)rc; |
4983 | } | 4954 | } |
4984 | 4955 | ||
4985 | static struct rbd_device *__rbd_get_dev(unsigned long dev_id) | 4956 | static struct rbd_device *__rbd_get_dev(unsigned long dev_id) |
4986 | { | 4957 | { |
4987 | struct list_head *tmp; | 4958 | struct list_head *tmp; |
4988 | struct rbd_device *rbd_dev; | 4959 | struct rbd_device *rbd_dev; |
4989 | 4960 | ||
4990 | spin_lock(&rbd_dev_list_lock); | 4961 | spin_lock(&rbd_dev_list_lock); |
4991 | list_for_each(tmp, &rbd_dev_list) { | 4962 | list_for_each(tmp, &rbd_dev_list) { |
4992 | rbd_dev = list_entry(tmp, struct rbd_device, node); | 4963 | rbd_dev = list_entry(tmp, struct rbd_device, node); |
4993 | if (rbd_dev->dev_id == dev_id) { | 4964 | if (rbd_dev->dev_id == dev_id) { |
4994 | spin_unlock(&rbd_dev_list_lock); | 4965 | spin_unlock(&rbd_dev_list_lock); |
4995 | return rbd_dev; | 4966 | return rbd_dev; |
4996 | } | 4967 | } |
4997 | } | 4968 | } |
4998 | spin_unlock(&rbd_dev_list_lock); | 4969 | spin_unlock(&rbd_dev_list_lock); |
4999 | return NULL; | 4970 | return NULL; |
5000 | } | 4971 | } |
5001 | 4972 | ||
5002 | static void rbd_dev_device_release(struct device *dev) | 4973 | static void rbd_dev_device_release(struct device *dev) |
5003 | { | 4974 | { |
5004 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); | 4975 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); |
5005 | 4976 | ||
5006 | rbd_free_disk(rbd_dev); | 4977 | rbd_free_disk(rbd_dev); |
5007 | clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); | 4978 | clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); |
5008 | rbd_dev_clear_mapping(rbd_dev); | 4979 | rbd_dev_clear_mapping(rbd_dev); |
5009 | unregister_blkdev(rbd_dev->major, rbd_dev->name); | 4980 | unregister_blkdev(rbd_dev->major, rbd_dev->name); |
5010 | rbd_dev->major = 0; | 4981 | rbd_dev->major = 0; |
5011 | rbd_dev_id_put(rbd_dev); | 4982 | rbd_dev_id_put(rbd_dev); |
5012 | rbd_dev_mapping_clear(rbd_dev); | 4983 | rbd_dev_mapping_clear(rbd_dev); |
5013 | } | 4984 | } |
5014 | 4985 | ||
5015 | static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) | 4986 | static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) |
5016 | { | 4987 | { |
5017 | while (rbd_dev->parent) { | 4988 | while (rbd_dev->parent) { |
5018 | struct rbd_device *first = rbd_dev; | 4989 | struct rbd_device *first = rbd_dev; |
5019 | struct rbd_device *second = first->parent; | 4990 | struct rbd_device *second = first->parent; |
5020 | struct rbd_device *third; | 4991 | struct rbd_device *third; |
5021 | 4992 | ||
5022 | /* | 4993 | /* |
5023 | * Follow to the parent with no grandparent and | 4994 | * Follow to the parent with no grandparent and |
5024 | * remove it. | 4995 | * remove it. |
5025 | */ | 4996 | */ |
5026 | while (second && (third = second->parent)) { | 4997 | while (second && (third = second->parent)) { |
5027 | first = second; | 4998 | first = second; |
5028 | second = third; | 4999 | second = third; |
5029 | } | 5000 | } |
5030 | rbd_assert(second); | 5001 | rbd_assert(second); |
5031 | rbd_dev_image_release(second); | 5002 | rbd_dev_image_release(second); |
5032 | first->parent = NULL; | 5003 | first->parent = NULL; |
5033 | first->parent_overlap = 0; | 5004 | first->parent_overlap = 0; |
5034 | 5005 | ||
5035 | rbd_assert(first->parent_spec); | 5006 | rbd_assert(first->parent_spec); |
5036 | rbd_spec_put(first->parent_spec); | 5007 | rbd_spec_put(first->parent_spec); |
5037 | first->parent_spec = NULL; | 5008 | first->parent_spec = NULL; |
5038 | } | 5009 | } |
5039 | } | 5010 | } |
5040 | 5011 | ||
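Editor's note: rbd_dev_remove_parent() tears a chain of layered images down from the far end -- each pass walks to the ancestor that has no parent of its own, releases it, and repeats until the mapped device has no parent left, so a node is only freed after everything above it is gone. The walk is quadratic in chain depth, but chains are short. A sketch of the same traversal over a plain singly linked chain (the struct and names are illustrative only):

#include <stdio.h>
#include <stdlib.h>

struct dev {
	struct dev *parent;
	int id;
};

/* Release the deepest ancestor first, as rbd_dev_remove_parent() does. */
static void remove_parents(struct dev *d)
{
	while (d->parent) {
		struct dev *first = d;
		struct dev *second = first->parent;

		while (second->parent) {  /* find parent w/o grandparent */
			first = second;
			second = second->parent;
		}
		printf("releasing %d\n", second->id);
		free(second);
		first->parent = NULL;
	}
}

int main(void)
{
	struct dev *gp = calloc(1, sizeof(*gp));
	struct dev *p = calloc(1, sizeof(*p));
	struct dev child = { .parent = p, .id = 1 };

	gp->id = 3;
	p->id = 2;
	p->parent = gp;

	remove_parents(&child);           /* prints 3, then 2 */
	return 0;
}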
5041 | static ssize_t rbd_remove(struct bus_type *bus, | 5012 | static ssize_t rbd_remove(struct bus_type *bus, |
5042 | const char *buf, | 5013 | const char *buf, |
5043 | size_t count) | 5014 | size_t count) |
5044 | { | 5015 | { |
5045 | struct rbd_device *rbd_dev = NULL; | 5016 | struct rbd_device *rbd_dev = NULL; |
5046 | int target_id; | 5017 | int target_id; |
5047 | unsigned long ul; | 5018 | unsigned long ul; |
5048 | int ret; | 5019 | int ret; |
5049 | 5020 | ||
5050 | ret = strict_strtoul(buf, 10, &ul); | 5021 | ret = strict_strtoul(buf, 10, &ul); |
5051 | if (ret) | 5022 | if (ret) |
5052 | return ret; | 5023 | return ret; |
5053 | 5024 | ||
5054 | /* convert to int; abort if we lost anything in the conversion */ | 5025 | /* convert to int; abort if we lost anything in the conversion */ |
5055 | target_id = (int) ul; | 5026 | target_id = (int) ul; |
5056 | if (target_id != ul) | 5027 | if (target_id != ul) |
5057 | return -EINVAL; | 5028 | return -EINVAL; |
5058 | 5029 | ||
5059 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); | 5030 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); |
5060 | 5031 | ||
5061 | rbd_dev = __rbd_get_dev(target_id); | 5032 | rbd_dev = __rbd_get_dev(target_id); |
5062 | if (!rbd_dev) { | 5033 | if (!rbd_dev) { |
5063 | ret = -ENOENT; | 5034 | ret = -ENOENT; |
5064 | goto done; | 5035 | goto done; |
5065 | } | 5036 | } |
5066 | 5037 | ||
5067 | spin_lock_irq(&rbd_dev->lock); | 5038 | spin_lock_irq(&rbd_dev->lock); |
5068 | if (rbd_dev->open_count) | 5039 | if (rbd_dev->open_count) |
5069 | ret = -EBUSY; | 5040 | ret = -EBUSY; |
5070 | else | 5041 | else |
5071 | set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); | 5042 | set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); |
5072 | spin_unlock_irq(&rbd_dev->lock); | 5043 | spin_unlock_irq(&rbd_dev->lock); |
5073 | if (ret < 0) | 5044 | if (ret < 0) |
5074 | goto done; | 5045 | goto done; |
5075 | ret = count; | 5046 | ret = count; |
5076 | rbd_bus_del_dev(rbd_dev); | 5047 | rbd_bus_del_dev(rbd_dev); |
5077 | rbd_dev_image_release(rbd_dev); | 5048 | rbd_dev_image_release(rbd_dev); |
5078 | module_put(THIS_MODULE); | 5049 | module_put(THIS_MODULE); |
5079 | done: | 5050 | done: |
5080 | mutex_unlock(&ctl_mutex); | 5051 | mutex_unlock(&ctl_mutex); |
5081 | 5052 | ||
5082 | return ret; | 5053 | return ret; |
5083 | } | 5054 | } |
5084 | 5055 | ||
5085 | /* | 5056 | /* |
5086 | * create control files in sysfs | 5057 | * create control files in sysfs |
5087 | * /sys/bus/rbd/... | 5058 | * /sys/bus/rbd/... |
5088 | */ | 5059 | */ |
5089 | static int rbd_sysfs_init(void) | 5060 | static int rbd_sysfs_init(void) |
5090 | { | 5061 | { |
5091 | int ret; | 5062 | int ret; |
5092 | 5063 | ||
5093 | ret = device_register(&rbd_root_dev); | 5064 | ret = device_register(&rbd_root_dev); |
5094 | if (ret < 0) | 5065 | if (ret < 0) |
5095 | return ret; | 5066 | return ret; |
5096 | 5067 | ||
5097 | ret = bus_register(&rbd_bus_type); | 5068 | ret = bus_register(&rbd_bus_type); |
5098 | if (ret < 0) | 5069 | if (ret < 0) |
5099 | device_unregister(&rbd_root_dev); | 5070 | device_unregister(&rbd_root_dev); |
5100 | 5071 | ||
5101 | return ret; | 5072 | return ret; |
5102 | } | 5073 | } |
5103 | 5074 | ||
5104 | static void rbd_sysfs_cleanup(void) | 5075 | static void rbd_sysfs_cleanup(void) |
5105 | { | 5076 | { |
5106 | bus_unregister(&rbd_bus_type); | 5077 | bus_unregister(&rbd_bus_type); |
5107 | device_unregister(&rbd_root_dev); | 5078 | device_unregister(&rbd_root_dev); |
5108 | } | 5079 | } |
5109 | 5080 | ||
5110 | static int __init rbd_init(void) | 5081 | static int __init rbd_init(void) |
5111 | { | 5082 | { |
5112 | int rc; | 5083 | int rc; |
5113 | 5084 | ||
5114 | if (!libceph_compatible(NULL)) { | 5085 | if (!libceph_compatible(NULL)) { |
5115 | rbd_warn(NULL, "libceph incompatibility (quitting)"); | 5086 | rbd_warn(NULL, "libceph incompatibility (quitting)"); |
5116 | 5087 | ||
5117 | return -EINVAL; | 5088 | return -EINVAL; |
5118 | } | 5089 | } |
5119 | rc = rbd_sysfs_init(); | 5090 | rc = rbd_sysfs_init(); |
5120 | if (rc) | 5091 | if (rc) |
5121 | return rc; | 5092 | return rc; |
5122 | pr_info("loaded " RBD_DRV_NAME_LONG "\n"); | 5093 | pr_info("loaded " RBD_DRV_NAME_LONG "\n"); |
5123 | return 0; | 5094 | return 0; |
5124 | } | 5095 | } |
5125 | 5096 | ||
5126 | static void __exit rbd_exit(void) | 5097 | static void __exit rbd_exit(void) |
5127 | { | 5098 | { |
5128 | rbd_sysfs_cleanup(); | 5099 | rbd_sysfs_cleanup(); |
5129 | } | 5100 | } |
5130 | 5101 | ||
5131 | module_init(rbd_init); | 5102 | module_init(rbd_init); |
5132 | module_exit(rbd_exit); | 5103 | module_exit(rbd_exit); |
5133 | 5104 | ||
5134 | MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); | 5105 | MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); |
5135 | MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); | 5106 | MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); |
5136 | MODULE_DESCRIPTION("rados block device"); | 5107 | MODULE_DESCRIPTION("rados block device"); |
5137 | 5108 | ||
5138 | /* following authorship retained from original osdblk.c */ | 5109 | /* following authorship retained from original osdblk.c */ |
5139 | MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); | 5110 | MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); |
5140 | 5111 | ||
5141 | MODULE_LICENSE("GPL"); | 5112 | MODULE_LICENSE("GPL"); |
5142 | 5113 |
fs/ceph/snap.c
1 | #include <linux/ceph/ceph_debug.h> | 1 | #include <linux/ceph/ceph_debug.h> |
2 | 2 | ||
3 | #include <linux/sort.h> | 3 | #include <linux/sort.h> |
4 | #include <linux/slab.h> | 4 | #include <linux/slab.h> |
5 | 5 | ||
6 | #include "super.h" | 6 | #include "super.h" |
7 | #include "mds_client.h" | 7 | #include "mds_client.h" |
8 | 8 | ||
9 | #include <linux/ceph/decode.h> | 9 | #include <linux/ceph/decode.h> |
10 | 10 | ||
11 | /* | 11 | /* |
12 | * Snapshots in ceph are driven in large part by cooperation from the | 12 | * Snapshots in ceph are driven in large part by cooperation from the |
13 | * client. In contrast to local file systems or file servers that | 13 | * client. In contrast to local file systems or file servers that |
14 | * implement snapshots at a single point in the system, ceph's | 14 | * implement snapshots at a single point in the system, ceph's |
15 | * distributed access to storage requires clients to help decide | 15 | * distributed access to storage requires clients to help decide |
16 | * whether a write logically occurs before or after a recently created | 16 | * whether a write logically occurs before or after a recently created |
17 | * snapshot. | 17 | * snapshot. |
18 | * | 18 | * |
19 | * This provides a perfect instantaneous client-wide snapshot. Between | 19 | * This provides a perfect instantaneous client-wide snapshot. Between |
20 | * clients, however, snapshots may appear to be applied at slightly | 20 | * clients, however, snapshots may appear to be applied at slightly |
21 | * different points in time, depending on delays in delivering the | 21 | * different points in time, depending on delays in delivering the |
22 | * snapshot notification. | 22 | * snapshot notification. |
23 | * | 23 | * |
24 | * Snapshots are _not_ file system-wide. Instead, each snapshot | 24 | * Snapshots are _not_ file system-wide. Instead, each snapshot |
25 | * applies to the subdirectory nested beneath some directory. This | 25 | * applies to the subdirectory nested beneath some directory. This |
26 | * effectively divides the hierarchy into multiple "realms," where all | 26 | * effectively divides the hierarchy into multiple "realms," where all |
27 | * of the files contained by each realm share the same set of | 27 | * of the files contained by each realm share the same set of |
28 | * snapshots. An individual realm's snap set contains snapshots | 28 | * snapshots. An individual realm's snap set contains snapshots |
29 | * explicitly created on that realm, as well as any snaps in its | 29 | * explicitly created on that realm, as well as any snaps in its |
30 | * parent's snap set _after_ the point at which the parent became its | 30 | * parent's snap set _after_ the point at which the parent became its |
31 | * parent (due to, say, a rename). Similarly, snaps from prior parents | 31 | * parent (due to, say, a rename). Similarly, snaps from prior parents |
32 | * during the intervals in which they were the parent are included. | 32 | * during the intervals in which they were the parent are included. |
33 | * | 33 | * |
34 | * The client is spared most of this detail, fortunately... it need only | 34 | * The client is spared most of this detail, fortunately... it need only |
35 | * maintain a hierarchy of realms reflecting the current parent/child | 35 | * maintain a hierarchy of realms reflecting the current parent/child |
36 | * realm relationship, and for each realm has an explicit list of snaps | 36 | * realm relationship, and for each realm has an explicit list of snaps |
37 | * inherited from prior parents. | 37 | * inherited from prior parents. |
38 | * | 38 | * |
39 | * A snap_realm struct is maintained for realms containing every inode | 39 | * A snap_realm struct is maintained for realms containing every inode |
40 | * with an open cap in the system. (The needed snap realm information is | 40 | * with an open cap in the system. (The needed snap realm information is |
41 | * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq' | 41 | * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq' |
42 | * version number is used to ensure that as realm parameters change (new | 42 | * version number is used to ensure that as realm parameters change (new |
43 | * snapshot, new parent, etc.) the client's realm hierarchy is updated. | 43 | * snapshot, new parent, etc.) the client's realm hierarchy is updated. |
44 | * | 44 | * |
45 | * The realm hierarchy drives the generation of a 'snap context' for each | 45 | * The realm hierarchy drives the generation of a 'snap context' for each |
46 | * realm, which simply lists the resulting set of snaps for the realm. This | 46 | * realm, which simply lists the resulting set of snaps for the realm. This |
47 | * is attached to any writes sent to OSDs. | 47 | * is attached to any writes sent to OSDs. |
48 | */ | 48 | */ |
49 | /* | 49 | /* |
50 | * Unfortunately error handling is a bit mixed here. If we get a snap | 50 | * Unfortunately error handling is a bit mixed here. If we get a snap |
51 | * update, but don't have enough memory to update our realm hierarchy, | 51 | * update, but don't have enough memory to update our realm hierarchy, |
52 | * it's not clear what we can do about it (besides complaining to the | 52 | * it's not clear what we can do about it (besides complaining to the |
53 | * console). | 53 | * console). |
54 | */ | 54 | */ |
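
For orientation, the snap context built and consumed throughout this file is a reference-counted header followed by a flexible array of snap ids. A sketch of its shape, with field names taken from the accesses in build_snap_context() below (the authoritative definition lives in the ceph headers; treat this as illustrative):

/* Shape of the snap context as implied by this file's accesses:
 * a refcount, the highest included snap seq, and a descending
 * array of snap ids allocated inline with the header. */
struct ceph_snap_context {
	atomic_t nref;		/* dropped via ceph_put_snap_context() */
	u64 seq;		/* seq of the most recent snap included */
	u32 num_snaps;		/* number of entries in snaps[] */
	u64 snaps[];		/* snap ids, sorted in descending order */
};
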
55 | 55 | ||
56 | 56 | ||
57 | /* | 57 | /* |
58 | * increase ref count for the realm | 58 | * increase ref count for the realm |
59 | * | 59 | * |
60 | * caller must hold snap_rwsem for write. | 60 | * caller must hold snap_rwsem for write. |
61 | */ | 61 | */ |
62 | void ceph_get_snap_realm(struct ceph_mds_client *mdsc, | 62 | void ceph_get_snap_realm(struct ceph_mds_client *mdsc, |
63 | struct ceph_snap_realm *realm) | 63 | struct ceph_snap_realm *realm) |
64 | { | 64 | { |
65 | dout("get_realm %p %d -> %d\n", realm, | 65 | dout("get_realm %p %d -> %d\n", realm, |
66 | atomic_read(&realm->nref), atomic_read(&realm->nref)+1); | 66 | atomic_read(&realm->nref), atomic_read(&realm->nref)+1); |
67 | /* | 67 | /* |
68 | * since we _only_ increment realm refs or empty the empty | 68 | * since we _only_ increment realm refs or empty the empty |
69 | * list with snap_rwsem held, adjusting the empty list here is | 69 | * list with snap_rwsem held, adjusting the empty list here is |
70 | * safe. we do need to protect against concurrent empty list | 70 | * safe. we do need to protect against concurrent empty list |
71 | * additions, however. | 71 | * additions, however. |
72 | */ | 72 | */ |
73 | if (atomic_read(&realm->nref) == 0) { | 73 | if (atomic_read(&realm->nref) == 0) { |
74 | spin_lock(&mdsc->snap_empty_lock); | 74 | spin_lock(&mdsc->snap_empty_lock); |
75 | list_del_init(&realm->empty_item); | 75 | list_del_init(&realm->empty_item); |
76 | spin_unlock(&mdsc->snap_empty_lock); | 76 | spin_unlock(&mdsc->snap_empty_lock); |
77 | } | 77 | } |
78 | 78 | ||
79 | atomic_inc(&realm->nref); | 79 | atomic_inc(&realm->nref); |
80 | } | 80 | } |
81 | 81 | ||
82 | static void __insert_snap_realm(struct rb_root *root, | 82 | static void __insert_snap_realm(struct rb_root *root, |
83 | struct ceph_snap_realm *new) | 83 | struct ceph_snap_realm *new) |
84 | { | 84 | { |
85 | struct rb_node **p = &root->rb_node; | 85 | struct rb_node **p = &root->rb_node; |
86 | struct rb_node *parent = NULL; | 86 | struct rb_node *parent = NULL; |
87 | struct ceph_snap_realm *r = NULL; | 87 | struct ceph_snap_realm *r = NULL; |
88 | 88 | ||
89 | while (*p) { | 89 | while (*p) { |
90 | parent = *p; | 90 | parent = *p; |
91 | r = rb_entry(parent, struct ceph_snap_realm, node); | 91 | r = rb_entry(parent, struct ceph_snap_realm, node); |
92 | if (new->ino < r->ino) | 92 | if (new->ino < r->ino) |
93 | p = &(*p)->rb_left; | 93 | p = &(*p)->rb_left; |
94 | else if (new->ino > r->ino) | 94 | else if (new->ino > r->ino) |
95 | p = &(*p)->rb_right; | 95 | p = &(*p)->rb_right; |
96 | else | 96 | else |
97 | BUG(); | 97 | BUG(); |
98 | } | 98 | } |
99 | 99 | ||
100 | rb_link_node(&new->node, parent, p); | 100 | rb_link_node(&new->node, parent, p); |
101 | rb_insert_color(&new->node, root); | 101 | rb_insert_color(&new->node, root); |
102 | } | 102 | } |
103 | 103 | ||
104 | /* | 104 | /* |
105 | * create the realm rooted at @ino; the tree itself does not take a ref. | 105 | * create the realm rooted at @ino; the tree itself does not take a ref. |
106 | * | 106 | * |
107 | * caller must hold snap_rwsem for write. | 107 | * caller must hold snap_rwsem for write. |
108 | */ | 108 | */ |
109 | static struct ceph_snap_realm *ceph_create_snap_realm( | 109 | static struct ceph_snap_realm *ceph_create_snap_realm( |
110 | struct ceph_mds_client *mdsc, | 110 | struct ceph_mds_client *mdsc, |
111 | u64 ino) | 111 | u64 ino) |
112 | { | 112 | { |
113 | struct ceph_snap_realm *realm; | 113 | struct ceph_snap_realm *realm; |
114 | 114 | ||
115 | realm = kzalloc(sizeof(*realm), GFP_NOFS); | 115 | realm = kzalloc(sizeof(*realm), GFP_NOFS); |
116 | if (!realm) | 116 | if (!realm) |
117 | return ERR_PTR(-ENOMEM); | 117 | return ERR_PTR(-ENOMEM); |
118 | 118 | ||
119 | atomic_set(&realm->nref, 0); /* tree does not take a ref */ | 119 | atomic_set(&realm->nref, 0); /* tree does not take a ref */ |
120 | realm->ino = ino; | 120 | realm->ino = ino; |
121 | INIT_LIST_HEAD(&realm->children); | 121 | INIT_LIST_HEAD(&realm->children); |
122 | INIT_LIST_HEAD(&realm->child_item); | 122 | INIT_LIST_HEAD(&realm->child_item); |
123 | INIT_LIST_HEAD(&realm->empty_item); | 123 | INIT_LIST_HEAD(&realm->empty_item); |
124 | INIT_LIST_HEAD(&realm->dirty_item); | 124 | INIT_LIST_HEAD(&realm->dirty_item); |
125 | INIT_LIST_HEAD(&realm->inodes_with_caps); | 125 | INIT_LIST_HEAD(&realm->inodes_with_caps); |
126 | spin_lock_init(&realm->inodes_with_caps_lock); | 126 | spin_lock_init(&realm->inodes_with_caps_lock); |
127 | __insert_snap_realm(&mdsc->snap_realms, realm); | 127 | __insert_snap_realm(&mdsc->snap_realms, realm); |
128 | dout("create_snap_realm %llx %p\n", realm->ino, realm); | 128 | dout("create_snap_realm %llx %p\n", realm->ino, realm); |
129 | return realm; | 129 | return realm; |
130 | } | 130 | } |
131 | 131 | ||
132 | /* | 132 | /* |
133 | * lookup the realm rooted at @ino. | 133 | * lookup the realm rooted at @ino. |
134 | * | 134 | * |
135 | * caller must hold snap_rwsem for write. | 135 | * caller must hold snap_rwsem for write. |
136 | */ | 136 | */ |
137 | struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, | 137 | struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, |
138 | u64 ino) | 138 | u64 ino) |
139 | { | 139 | { |
140 | struct rb_node *n = mdsc->snap_realms.rb_node; | 140 | struct rb_node *n = mdsc->snap_realms.rb_node; |
141 | struct ceph_snap_realm *r; | 141 | struct ceph_snap_realm *r; |
142 | 142 | ||
143 | while (n) { | 143 | while (n) { |
144 | r = rb_entry(n, struct ceph_snap_realm, node); | 144 | r = rb_entry(n, struct ceph_snap_realm, node); |
145 | if (ino < r->ino) | 145 | if (ino < r->ino) |
146 | n = n->rb_left; | 146 | n = n->rb_left; |
147 | else if (ino > r->ino) | 147 | else if (ino > r->ino) |
148 | n = n->rb_right; | 148 | n = n->rb_right; |
149 | else { | 149 | else { |
150 | dout("lookup_snap_realm %llx %p\n", r->ino, r); | 150 | dout("lookup_snap_realm %llx %p\n", r->ino, r); |
151 | return r; | 151 | return r; |
152 | } | 152 | } |
153 | } | 153 | } |
154 | return NULL; | 154 | return NULL; |
155 | } | 155 | } |
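
The callers below (adjust_snap_realm_parent() and ceph_update_snap_trace()) always pair this lookup with ceph_create_snap_realm() on a miss. A hypothetical helper capturing that idiom, shown only to make the pattern explicit (it does not exist in this file):

/* Hypothetical lookup-or-create wrapper; returns an existing realm,
 * a freshly created one, or ERR_PTR(-ENOMEM).  Like both halves it
 * wraps, the caller must hold snap_rwsem for write. */
static struct ceph_snap_realm *lookup_or_create_snap_realm(
		struct ceph_mds_client *mdsc, u64 ino)
{
	struct ceph_snap_realm *realm;

	realm = ceph_lookup_snap_realm(mdsc, ino);
	if (realm)
		return realm;
	return ceph_create_snap_realm(mdsc, ino);
}
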
156 | 156 | ||
157 | static void __put_snap_realm(struct ceph_mds_client *mdsc, | 157 | static void __put_snap_realm(struct ceph_mds_client *mdsc, |
158 | struct ceph_snap_realm *realm); | 158 | struct ceph_snap_realm *realm); |
159 | 159 | ||
160 | /* | 160 | /* |
161 | * called with snap_rwsem (write) | 161 | * called with snap_rwsem (write) |
162 | */ | 162 | */ |
163 | static void __destroy_snap_realm(struct ceph_mds_client *mdsc, | 163 | static void __destroy_snap_realm(struct ceph_mds_client *mdsc, |
164 | struct ceph_snap_realm *realm) | 164 | struct ceph_snap_realm *realm) |
165 | { | 165 | { |
166 | dout("__destroy_snap_realm %p %llx\n", realm, realm->ino); | 166 | dout("__destroy_snap_realm %p %llx\n", realm, realm->ino); |
167 | 167 | ||
168 | rb_erase(&realm->node, &mdsc->snap_realms); | 168 | rb_erase(&realm->node, &mdsc->snap_realms); |
169 | 169 | ||
170 | if (realm->parent) { | 170 | if (realm->parent) { |
171 | list_del_init(&realm->child_item); | 171 | list_del_init(&realm->child_item); |
172 | __put_snap_realm(mdsc, realm->parent); | 172 | __put_snap_realm(mdsc, realm->parent); |
173 | } | 173 | } |
174 | 174 | ||
175 | kfree(realm->prior_parent_snaps); | 175 | kfree(realm->prior_parent_snaps); |
176 | kfree(realm->snaps); | 176 | kfree(realm->snaps); |
177 | ceph_put_snap_context(realm->cached_context); | 177 | ceph_put_snap_context(realm->cached_context); |
178 | kfree(realm); | 178 | kfree(realm); |
179 | } | 179 | } |
180 | 180 | ||
181 | /* | 181 | /* |
182 | * caller holds snap_rwsem (write) | 182 | * caller holds snap_rwsem (write) |
183 | */ | 183 | */ |
184 | static void __put_snap_realm(struct ceph_mds_client *mdsc, | 184 | static void __put_snap_realm(struct ceph_mds_client *mdsc, |
185 | struct ceph_snap_realm *realm) | 185 | struct ceph_snap_realm *realm) |
186 | { | 186 | { |
187 | dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm, | 187 | dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm, |
188 | atomic_read(&realm->nref), atomic_read(&realm->nref)-1); | 188 | atomic_read(&realm->nref), atomic_read(&realm->nref)-1); |
189 | if (atomic_dec_and_test(&realm->nref)) | 189 | if (atomic_dec_and_test(&realm->nref)) |
190 | __destroy_snap_realm(mdsc, realm); | 190 | __destroy_snap_realm(mdsc, realm); |
191 | } | 191 | } |
192 | 192 | ||
193 | /* | 193 | /* |
194 | * caller needn't hold any locks | 194 | * caller needn't hold any locks |
195 | */ | 195 | */ |
196 | void ceph_put_snap_realm(struct ceph_mds_client *mdsc, | 196 | void ceph_put_snap_realm(struct ceph_mds_client *mdsc, |
197 | struct ceph_snap_realm *realm) | 197 | struct ceph_snap_realm *realm) |
198 | { | 198 | { |
199 | dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm, | 199 | dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm, |
200 | atomic_read(&realm->nref), atomic_read(&realm->nref)-1); | 200 | atomic_read(&realm->nref), atomic_read(&realm->nref)-1); |
201 | if (!atomic_dec_and_test(&realm->nref)) | 201 | if (!atomic_dec_and_test(&realm->nref)) |
202 | return; | 202 | return; |
203 | 203 | ||
204 | if (down_write_trylock(&mdsc->snap_rwsem)) { | 204 | if (down_write_trylock(&mdsc->snap_rwsem)) { |
205 | __destroy_snap_realm(mdsc, realm); | 205 | __destroy_snap_realm(mdsc, realm); |
206 | up_write(&mdsc->snap_rwsem); | 206 | up_write(&mdsc->snap_rwsem); |
207 | } else { | 207 | } else { |
208 | spin_lock(&mdsc->snap_empty_lock); | 208 | spin_lock(&mdsc->snap_empty_lock); |
209 | list_add(&realm->empty_item, &mdsc->snap_empty); | 209 | list_add(&realm->empty_item, &mdsc->snap_empty); |
210 | spin_unlock(&mdsc->snap_empty_lock); | 210 | spin_unlock(&mdsc->snap_empty_lock); |
211 | } | 211 | } |
212 | } | 212 | } |
213 | 213 | ||
214 | /* | 214 | /* |
215 | * Clean up any realms whose ref counts have dropped to zero. Note | 215 | * Clean up any realms whose ref counts have dropped to zero. Note |
216 | * that this does not include realms that were created but not yet | 216 | * that this does not include realms that were created but not yet |
217 | * used. | 217 | * used. |
218 | * | 218 | * |
219 | * Called under snap_rwsem (write) | 219 | * Called under snap_rwsem (write) |
220 | */ | 220 | */ |
221 | static void __cleanup_empty_realms(struct ceph_mds_client *mdsc) | 221 | static void __cleanup_empty_realms(struct ceph_mds_client *mdsc) |
222 | { | 222 | { |
223 | struct ceph_snap_realm *realm; | 223 | struct ceph_snap_realm *realm; |
224 | 224 | ||
225 | spin_lock(&mdsc->snap_empty_lock); | 225 | spin_lock(&mdsc->snap_empty_lock); |
226 | while (!list_empty(&mdsc->snap_empty)) { | 226 | while (!list_empty(&mdsc->snap_empty)) { |
227 | realm = list_first_entry(&mdsc->snap_empty, | 227 | realm = list_first_entry(&mdsc->snap_empty, |
228 | struct ceph_snap_realm, empty_item); | 228 | struct ceph_snap_realm, empty_item); |
229 | list_del(&realm->empty_item); | 229 | list_del(&realm->empty_item); |
230 | spin_unlock(&mdsc->snap_empty_lock); | 230 | spin_unlock(&mdsc->snap_empty_lock); |
231 | __destroy_snap_realm(mdsc, realm); | 231 | __destroy_snap_realm(mdsc, realm); |
232 | spin_lock(&mdsc->snap_empty_lock); | 232 | spin_lock(&mdsc->snap_empty_lock); |
233 | } | 233 | } |
234 | spin_unlock(&mdsc->snap_empty_lock); | 234 | spin_unlock(&mdsc->snap_empty_lock); |
235 | } | 235 | } |
236 | 236 | ||
237 | void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc) | 237 | void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc) |
238 | { | 238 | { |
239 | down_write(&mdsc->snap_rwsem); | 239 | down_write(&mdsc->snap_rwsem); |
240 | __cleanup_empty_realms(mdsc); | 240 | __cleanup_empty_realms(mdsc); |
241 | up_write(&mdsc->snap_rwsem); | 241 | up_write(&mdsc->snap_rwsem); |
242 | } | 242 | } |
243 | 243 | ||
244 | /* | 244 | /* |
245 | * adjust the parent realm of a given @realm. adjust child list, and parent | 245 | * adjust the parent realm of a given @realm. adjust child list, and parent |
246 | * pointers, and ref counts appropriately. | 246 | * pointers, and ref counts appropriately. |
247 | * | 247 | * |
248 | * return 1 if the parent was changed, 0 if unchanged, <0 on error. | 248 | * return 1 if the parent was changed, 0 if unchanged, <0 on error. |
249 | * | 249 | * |
250 | * caller must hold snap_rwsem for write. | 250 | * caller must hold snap_rwsem for write. |
251 | */ | 251 | */ |
252 | static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc, | 252 | static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc, |
253 | struct ceph_snap_realm *realm, | 253 | struct ceph_snap_realm *realm, |
254 | u64 parentino) | 254 | u64 parentino) |
255 | { | 255 | { |
256 | struct ceph_snap_realm *parent; | 256 | struct ceph_snap_realm *parent; |
257 | 257 | ||
258 | if (realm->parent_ino == parentino) | 258 | if (realm->parent_ino == parentino) |
259 | return 0; | 259 | return 0; |
260 | 260 | ||
261 | parent = ceph_lookup_snap_realm(mdsc, parentino); | 261 | parent = ceph_lookup_snap_realm(mdsc, parentino); |
262 | if (!parent) { | 262 | if (!parent) { |
263 | parent = ceph_create_snap_realm(mdsc, parentino); | 263 | parent = ceph_create_snap_realm(mdsc, parentino); |
264 | if (IS_ERR(parent)) | 264 | if (IS_ERR(parent)) |
265 | return PTR_ERR(parent); | 265 | return PTR_ERR(parent); |
266 | } | 266 | } |
267 | dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n", | 267 | dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n", |
268 | realm->ino, realm, realm->parent_ino, realm->parent, | 268 | realm->ino, realm, realm->parent_ino, realm->parent, |
269 | parentino, parent); | 269 | parentino, parent); |
270 | if (realm->parent) { | 270 | if (realm->parent) { |
271 | list_del_init(&realm->child_item); | 271 | list_del_init(&realm->child_item); |
272 | ceph_put_snap_realm(mdsc, realm->parent); | 272 | ceph_put_snap_realm(mdsc, realm->parent); |
273 | } | 273 | } |
274 | realm->parent_ino = parentino; | 274 | realm->parent_ino = parentino; |
275 | realm->parent = parent; | 275 | realm->parent = parent; |
276 | ceph_get_snap_realm(mdsc, parent); | 276 | ceph_get_snap_realm(mdsc, parent); |
277 | list_add(&realm->child_item, &parent->children); | 277 | list_add(&realm->child_item, &parent->children); |
278 | return 1; | 278 | return 1; |
279 | } | 279 | } |
280 | 280 | ||
281 | 281 | ||
282 | static int cmpu64_rev(const void *a, const void *b) | 282 | static int cmpu64_rev(const void *a, const void *b) |
283 | { | 283 | { |
284 | if (*(u64 *)a < *(u64 *)b) | 284 | if (*(u64 *)a < *(u64 *)b) |
285 | return 1; | 285 | return 1; |
286 | if (*(u64 *)a > *(u64 *)b) | 286 | if (*(u64 *)a > *(u64 *)b) |
287 | return -1; | 287 | return -1; |
288 | return 0; | 288 | return 0; |
289 | } | 289 | } |
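
cmpu64_rev() deliberately inverts the usual comparator convention so that sort() in build_snap_context() leaves snaps[] in descending order, newest snap first. A self-contained userspace analogue of the same ordering, using qsort() instead of the kernel's lib/sort.c:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Same reverse comparator as above: report "less than" for the
 * larger value so the array ends up sorted high-to-low. */
static int cmpu64_rev(const void *a, const void *b)
{
	uint64_t x = *(const uint64_t *)a;
	uint64_t y = *(const uint64_t *)b;

	return (x < y) - (x > y);
}

int main(void)
{
	uint64_t snaps[] = { 3, 11, 7, 5 };
	size_t i, n = sizeof(snaps) / sizeof(snaps[0]);

	qsort(snaps, n, sizeof(snaps[0]), cmpu64_rev);
	for (i = 0; i < n; i++)
		printf("%llu ", (unsigned long long)snaps[i]);
	printf("\n");	/* prints: 11 7 5 3 */
	return 0;
}
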
290 | 290 | ||
291 | /* | 291 | /* |
292 | * build the snap context for a given realm. | 292 | * build the snap context for a given realm. |
293 | */ | 293 | */ |
294 | static int build_snap_context(struct ceph_snap_realm *realm) | 294 | static int build_snap_context(struct ceph_snap_realm *realm) |
295 | { | 295 | { |
296 | struct ceph_snap_realm *parent = realm->parent; | 296 | struct ceph_snap_realm *parent = realm->parent; |
297 | struct ceph_snap_context *snapc; | 297 | struct ceph_snap_context *snapc; |
298 | int err = 0; | 298 | int err = 0; |
299 | u32 num = realm->num_prior_parent_snaps + realm->num_snaps; | 299 | u32 num = realm->num_prior_parent_snaps + realm->num_snaps; |
300 | 300 | ||
301 | /* | 301 | /* |
302 | * build parent context, if it hasn't been built. | 302 | * build parent context, if it hasn't been built. |
303 | * conservatively estimate that all parent snaps might be | 303 | * conservatively estimate that all parent snaps might be |
304 | * included by us. | 304 | * included by us. |
305 | */ | 305 | */ |
306 | if (parent) { | 306 | if (parent) { |
307 | if (!parent->cached_context) { | 307 | if (!parent->cached_context) { |
308 | err = build_snap_context(parent); | 308 | err = build_snap_context(parent); |
309 | if (err) | 309 | if (err) |
310 | goto fail; | 310 | goto fail; |
311 | } | 311 | } |
312 | num += parent->cached_context->num_snaps; | 312 | num += parent->cached_context->num_snaps; |
313 | } | 313 | } |
314 | 314 | ||
315 | /* do i actually need to update? not if my context seq | 315 | /* do i actually need to update? not if my context seq |
316 | matches realm seq, and my parents' does too. (this works | 316 | matches realm seq, and my parents' does too. (this works |
317 | because rebuild_snap_realms() works _downward_ in the | 317 | because rebuild_snap_realms() works _downward_ in the |
318 | hierarchy after each update.) */ | 318 | hierarchy after each update.) */ |
319 | if (realm->cached_context && | 319 | if (realm->cached_context && |
320 | realm->cached_context->seq == realm->seq && | 320 | realm->cached_context->seq == realm->seq && |
321 | (!parent || | 321 | (!parent || |
322 | realm->cached_context->seq >= parent->cached_context->seq)) { | 322 | realm->cached_context->seq >= parent->cached_context->seq)) { |
323 | dout("build_snap_context %llx %p: %p seq %lld (%u snaps)" | 323 | dout("build_snap_context %llx %p: %p seq %lld (%u snaps)" |
324 | " (unchanged)\n", | 324 | " (unchanged)\n", |
325 | realm->ino, realm, realm->cached_context, | 325 | realm->ino, realm, realm->cached_context, |
326 | realm->cached_context->seq, | 326 | realm->cached_context->seq, |
327 | (unsigned int) realm->cached_context->num_snaps); | 327 | (unsigned int) realm->cached_context->num_snaps); |
328 | return 0; | 328 | return 0; |
329 | } | 329 | } |
330 | 330 | ||
331 | /* alloc new snap context */ | 331 | /* alloc new snap context */ |
332 | err = -ENOMEM; | 332 | err = -ENOMEM; |
333 | if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) | 333 | if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) |
334 | goto fail; | 334 | goto fail; |
335 | snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS); | 335 | snapc = ceph_create_snap_context(num, GFP_NOFS); |
336 | if (!snapc) | 336 | if (!snapc) |
337 | goto fail; | 337 | goto fail; |
338 | atomic_set(&snapc->nref, 1); | ||
339 | 338 | ||
340 | /* build (reverse sorted) snap vector */ | 339 | /* build (reverse sorted) snap vector */ |
341 | num = 0; | 340 | num = 0; |
342 | snapc->seq = realm->seq; | 341 | snapc->seq = realm->seq; |
343 | if (parent) { | 342 | if (parent) { |
344 | u32 i; | 343 | u32 i; |
345 | 344 | ||
346 | /* include any of parent's snaps occurring _after_ my | 345 | /* include any of parent's snaps occurring _after_ my |
347 | parent became my parent */ | 346 | parent became my parent */ |
348 | for (i = 0; i < parent->cached_context->num_snaps; i++) | 347 | for (i = 0; i < parent->cached_context->num_snaps; i++) |
349 | if (parent->cached_context->snaps[i] >= | 348 | if (parent->cached_context->snaps[i] >= |
350 | realm->parent_since) | 349 | realm->parent_since) |
351 | snapc->snaps[num++] = | 350 | snapc->snaps[num++] = |
352 | parent->cached_context->snaps[i]; | 351 | parent->cached_context->snaps[i]; |
353 | if (parent->cached_context->seq > snapc->seq) | 352 | if (parent->cached_context->seq > snapc->seq) |
354 | snapc->seq = parent->cached_context->seq; | 353 | snapc->seq = parent->cached_context->seq; |
355 | } | 354 | } |
356 | memcpy(snapc->snaps + num, realm->snaps, | 355 | memcpy(snapc->snaps + num, realm->snaps, |
357 | sizeof(u64)*realm->num_snaps); | 356 | sizeof(u64)*realm->num_snaps); |
358 | num += realm->num_snaps; | 357 | num += realm->num_snaps; |
359 | memcpy(snapc->snaps + num, realm->prior_parent_snaps, | 358 | memcpy(snapc->snaps + num, realm->prior_parent_snaps, |
360 | sizeof(u64)*realm->num_prior_parent_snaps); | 359 | sizeof(u64)*realm->num_prior_parent_snaps); |
361 | num += realm->num_prior_parent_snaps; | 360 | num += realm->num_prior_parent_snaps; |
362 | 361 | ||
363 | sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL); | 362 | sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL); |
364 | snapc->num_snaps = num; | 363 | snapc->num_snaps = num; |
365 | dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n", | 364 | dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n", |
366 | realm->ino, realm, snapc, snapc->seq, | 365 | realm->ino, realm, snapc, snapc->seq, |
367 | (unsigned int) snapc->num_snaps); | 366 | (unsigned int) snapc->num_snaps); |
368 | 367 | ||
369 | if (realm->cached_context) | 368 | if (realm->cached_context) |
370 | ceph_put_snap_context(realm->cached_context); | 369 | ceph_put_snap_context(realm->cached_context); |
371 | realm->cached_context = snapc; | 370 | realm->cached_context = snapc; |
372 | return 0; | 371 | return 0; |
373 | 372 | ||
374 | fail: | 373 | fail: |
375 | /* | 374 | /* |
376 | * if we fail, clear old (incorrect) cached_context... hopefully | 375 | * if we fail, clear old (incorrect) cached_context... hopefully |
377 | * we'll have better luck building it later | 376 | * we'll have better luck building it later |
378 | */ | 377 | */ |
379 | if (realm->cached_context) { | 378 | if (realm->cached_context) { |
380 | ceph_put_snap_context(realm->cached_context); | 379 | ceph_put_snap_context(realm->cached_context); |
381 | realm->cached_context = NULL; | 380 | realm->cached_context = NULL; |
382 | } | 381 | } |
383 | pr_err("build_snap_context %llx %p fail %d\n", realm->ino, | 382 | pr_err("build_snap_context %llx %p fail %d\n", realm->ino, |
384 | realm, err); | 383 | realm, err); |
385 | return err; | 384 | return err; |
386 | } | 385 | } |
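
The hunk above is the point of this commit: the open-coded kzalloc() of the header plus num snap slots, followed by atomic_set(&snapc->nref, 1), is replaced by the new libceph helper named in the commit message. Reconstructed from the code it replaces, the helper plausibly looks like the sketch below; whether it also pre-sets num_snaps is an assumption (build_snap_context() overwrites that field either way), and the SIZE_MAX overflow guard remains at the call site:

/* Plausible shape of ceph_create_snap_context(), inferred from the
 * open-coded allocation it replaces above; the real implementation
 * lives in libceph, not in this file. */
struct ceph_snap_context *ceph_create_snap_context(u32 snap_count,
						   gfp_t gfp_flags)
{
	struct ceph_snap_context *snapc;

	snapc = kzalloc(sizeof(*snapc) + snap_count * sizeof(u64), gfp_flags);
	if (!snapc)
		return NULL;

	atomic_set(&snapc->nref, 1);	/* caller gets the initial reference */
	snapc->num_snaps = snap_count;	/* assumption; see note above */

	return snapc;
}
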
387 | 386 | ||
388 | /* | 387 | /* |
389 | * rebuild snap context for the given realm and all of its children. | 388 | * rebuild snap context for the given realm and all of its children. |
390 | */ | 389 | */ |
391 | static void rebuild_snap_realms(struct ceph_snap_realm *realm) | 390 | static void rebuild_snap_realms(struct ceph_snap_realm *realm) |
392 | { | 391 | { |
393 | struct ceph_snap_realm *child; | 392 | struct ceph_snap_realm *child; |
394 | 393 | ||
395 | dout("rebuild_snap_realms %llx %p\n", realm->ino, realm); | 394 | dout("rebuild_snap_realms %llx %p\n", realm->ino, realm); |
396 | build_snap_context(realm); | 395 | build_snap_context(realm); |
397 | 396 | ||
398 | list_for_each_entry(child, &realm->children, child_item) | 397 | list_for_each_entry(child, &realm->children, child_item) |
399 | rebuild_snap_realms(child); | 398 | rebuild_snap_realms(child); |
400 | } | 399 | } |
401 | 400 | ||
402 | 401 | ||
403 | /* | 402 | /* |
404 | * helper to allocate and decode an array of snapids. free prior | 403 | * helper to allocate and decode an array of snapids. free prior |
405 | * instance, if any. | 404 | * instance, if any. |
406 | */ | 405 | */ |
407 | static int dup_array(u64 **dst, __le64 *src, u32 num) | 406 | static int dup_array(u64 **dst, __le64 *src, u32 num) |
408 | { | 407 | { |
409 | u32 i; | 408 | u32 i; |
410 | 409 | ||
411 | kfree(*dst); | 410 | kfree(*dst); |
412 | if (num) { | 411 | if (num) { |
413 | *dst = kcalloc(num, sizeof(u64), GFP_NOFS); | 412 | *dst = kcalloc(num, sizeof(u64), GFP_NOFS); |
414 | if (!*dst) | 413 | if (!*dst) |
415 | return -ENOMEM; | 414 | return -ENOMEM; |
416 | for (i = 0; i < num; i++) | 415 | for (i = 0; i < num; i++) |
417 | (*dst)[i] = get_unaligned_le64(src + i); | 416 | (*dst)[i] = get_unaligned_le64(src + i); |
418 | } else { | 417 | } else { |
419 | *dst = NULL; | 418 | *dst = NULL; |
420 | } | 419 | } |
421 | return 0; | 420 | return 0; |
422 | } | 421 | } |
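
dup_array() converts each wire-format id with get_unaligned_le64(), which reads eight little-endian bytes at any alignment and assembles a host-order value. A runnable userspace analogue of that conversion, for illustration only:

#include <stdint.h>
#include <stdio.h>

/* Userspace stand-in for get_unaligned_le64(): byte-wise assembly
 * avoids both alignment traps and host-endianness assumptions. */
static uint64_t get_le64(const void *p)
{
	const uint8_t *b = p;
	uint64_t v = 0;
	int i;

	for (i = 7; i >= 0; i--)
		v = (v << 8) | b[i];
	return v;
}

int main(void)
{
	/* 0x0102030405060708 encoded little-endian (low byte first) */
	uint8_t wire[8] = { 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01 };

	printf("0x%llx\n", (unsigned long long)get_le64(wire));
	return 0;
}
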
423 | 422 | ||
424 | 423 | ||
425 | /* | 424 | /* |
426 | * When a snapshot is applied, the size/mtime inode metadata is queued | 425 | * When a snapshot is applied, the size/mtime inode metadata is queued |
427 | * in a ceph_cap_snap (one for each snapshot) until writeback | 426 | * in a ceph_cap_snap (one for each snapshot) until writeback |
428 | * completes and the metadata can be flushed back to the MDS. | 427 | * completes and the metadata can be flushed back to the MDS. |
429 | * | 428 | * |
430 | * However, if a (sync) write is currently in-progress when we apply | 429 | * However, if a (sync) write is currently in-progress when we apply |
431 | * the snapshot, we have to wait until the write succeeds or fails | 430 | * the snapshot, we have to wait until the write succeeds or fails |
432 | * (and a final size/mtime is known). In this case we set | 431 | * (and a final size/mtime is known). In this case we set |
433 | * cap_snap->writing = 1, and the cap_snap is said to be "pending." When the write | 432 | * cap_snap->writing = 1, and the cap_snap is said to be "pending." When the write |
434 | * finishes, we __ceph_finish_cap_snap(). | 433 | * finishes, we __ceph_finish_cap_snap(). |
435 | * | 434 | * |
436 | * Caller must hold snap_rwsem for read (i.e., the realm topology won't | 435 | * Caller must hold snap_rwsem for read (i.e., the realm topology won't |
437 | * change). | 436 | * change). |
438 | */ | 437 | */ |
439 | void ceph_queue_cap_snap(struct ceph_inode_info *ci) | 438 | void ceph_queue_cap_snap(struct ceph_inode_info *ci) |
440 | { | 439 | { |
441 | struct inode *inode = &ci->vfs_inode; | 440 | struct inode *inode = &ci->vfs_inode; |
442 | struct ceph_cap_snap *capsnap; | 441 | struct ceph_cap_snap *capsnap; |
443 | int used, dirty; | 442 | int used, dirty; |
444 | 443 | ||
445 | capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); | 444 | capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); |
446 | if (!capsnap) { | 445 | if (!capsnap) { |
447 | pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode); | 446 | pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode); |
448 | return; | 447 | return; |
449 | } | 448 | } |
450 | 449 | ||
451 | spin_lock(&ci->i_ceph_lock); | 450 | spin_lock(&ci->i_ceph_lock); |
452 | used = __ceph_caps_used(ci); | 451 | used = __ceph_caps_used(ci); |
453 | dirty = __ceph_caps_dirty(ci); | 452 | dirty = __ceph_caps_dirty(ci); |
454 | 453 | ||
455 | /* | 454 | /* |
456 | * If there is a write in progress, treat that as a dirty Fw, | 455 | * If there is a write in progress, treat that as a dirty Fw, |
457 | * even though it hasn't completed yet; by the time we finish | 456 | * even though it hasn't completed yet; by the time we finish |
458 | * up this capsnap it will be. | 457 | * up this capsnap it will be. |
459 | */ | 458 | */ |
460 | if (used & CEPH_CAP_FILE_WR) | 459 | if (used & CEPH_CAP_FILE_WR) |
461 | dirty |= CEPH_CAP_FILE_WR; | 460 | dirty |= CEPH_CAP_FILE_WR; |
462 | 461 | ||
463 | if (__ceph_have_pending_cap_snap(ci)) { | 462 | if (__ceph_have_pending_cap_snap(ci)) { |
464 | /* there is no point in queuing multiple "pending" cap_snaps, | 463 | /* there is no point in queuing multiple "pending" cap_snaps, |
465 | as no new writes are allowed to start when pending, so any | 464 | as no new writes are allowed to start when pending, so any |
466 | writes in progress now were started before the previous | 465 | writes in progress now were started before the previous |
467 | cap_snap. lucky us. */ | 466 | cap_snap. lucky us. */ |
468 | dout("queue_cap_snap %p already pending\n", inode); | 467 | dout("queue_cap_snap %p already pending\n", inode); |
469 | kfree(capsnap); | 468 | kfree(capsnap); |
470 | } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| | 469 | } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| |
471 | CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) { | 470 | CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) { |
472 | struct ceph_snap_context *snapc = ci->i_head_snapc; | 471 | struct ceph_snap_context *snapc = ci->i_head_snapc; |
473 | 472 | ||
474 | /* | 473 | /* |
475 | * if we are a sync write, we may need to go to the snaprealm | 474 | * if we are a sync write, we may need to go to the snaprealm |
476 | * to get the current snapc. | 475 | * to get the current snapc. |
477 | */ | 476 | */ |
478 | if (!snapc) | 477 | if (!snapc) |
479 | snapc = ci->i_snap_realm->cached_context; | 478 | snapc = ci->i_snap_realm->cached_context; |
480 | 479 | ||
481 | dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n", | 480 | dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n", |
482 | inode, capsnap, snapc, ceph_cap_string(dirty)); | 481 | inode, capsnap, snapc, ceph_cap_string(dirty)); |
483 | ihold(inode); | 482 | ihold(inode); |
484 | 483 | ||
485 | atomic_set(&capsnap->nref, 1); | 484 | atomic_set(&capsnap->nref, 1); |
486 | capsnap->ci = ci; | 485 | capsnap->ci = ci; |
487 | INIT_LIST_HEAD(&capsnap->ci_item); | 486 | INIT_LIST_HEAD(&capsnap->ci_item); |
488 | INIT_LIST_HEAD(&capsnap->flushing_item); | 487 | INIT_LIST_HEAD(&capsnap->flushing_item); |
489 | 488 | ||
490 | capsnap->follows = snapc->seq; | 489 | capsnap->follows = snapc->seq; |
491 | capsnap->issued = __ceph_caps_issued(ci, NULL); | 490 | capsnap->issued = __ceph_caps_issued(ci, NULL); |
492 | capsnap->dirty = dirty; | 491 | capsnap->dirty = dirty; |
493 | 492 | ||
494 | capsnap->mode = inode->i_mode; | 493 | capsnap->mode = inode->i_mode; |
495 | capsnap->uid = inode->i_uid; | 494 | capsnap->uid = inode->i_uid; |
496 | capsnap->gid = inode->i_gid; | 495 | capsnap->gid = inode->i_gid; |
497 | 496 | ||
498 | if (dirty & CEPH_CAP_XATTR_EXCL) { | 497 | if (dirty & CEPH_CAP_XATTR_EXCL) { |
499 | __ceph_build_xattrs_blob(ci); | 498 | __ceph_build_xattrs_blob(ci); |
500 | capsnap->xattr_blob = | 499 | capsnap->xattr_blob = |
501 | ceph_buffer_get(ci->i_xattrs.blob); | 500 | ceph_buffer_get(ci->i_xattrs.blob); |
502 | capsnap->xattr_version = ci->i_xattrs.version; | 501 | capsnap->xattr_version = ci->i_xattrs.version; |
503 | } else { | 502 | } else { |
504 | capsnap->xattr_blob = NULL; | 503 | capsnap->xattr_blob = NULL; |
505 | capsnap->xattr_version = 0; | 504 | capsnap->xattr_version = 0; |
506 | } | 505 | } |
507 | 506 | ||
508 | /* dirty page count moved from _head to this cap_snap; | 507 | /* dirty page count moved from _head to this cap_snap; |
509 | all subsequent page dirties occur _after_ this | 508 | all subsequent page dirties occur _after_ this |
510 | snapshot. */ | 509 | snapshot. */ |
511 | capsnap->dirty_pages = ci->i_wrbuffer_ref_head; | 510 | capsnap->dirty_pages = ci->i_wrbuffer_ref_head; |
512 | ci->i_wrbuffer_ref_head = 0; | 511 | ci->i_wrbuffer_ref_head = 0; |
513 | capsnap->context = snapc; | 512 | capsnap->context = snapc; |
514 | ci->i_head_snapc = | 513 | ci->i_head_snapc = |
515 | ceph_get_snap_context(ci->i_snap_realm->cached_context); | 514 | ceph_get_snap_context(ci->i_snap_realm->cached_context); |
516 | dout(" new snapc is %p\n", ci->i_head_snapc); | 515 | dout(" new snapc is %p\n", ci->i_head_snapc); |
517 | list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); | 516 | list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); |
518 | 517 | ||
519 | if (used & CEPH_CAP_FILE_WR) { | 518 | if (used & CEPH_CAP_FILE_WR) { |
520 | dout("queue_cap_snap %p cap_snap %p snapc %p" | 519 | dout("queue_cap_snap %p cap_snap %p snapc %p" |
521 | " seq %llu used WR, now pending\n", inode, | 520 | " seq %llu used WR, now pending\n", inode, |
522 | capsnap, snapc, snapc->seq); | 521 | capsnap, snapc, snapc->seq); |
523 | capsnap->writing = 1; | 522 | capsnap->writing = 1; |
524 | } else { | 523 | } else { |
525 | /* note mtime, size NOW. */ | 524 | /* note mtime, size NOW. */ |
526 | __ceph_finish_cap_snap(ci, capsnap); | 525 | __ceph_finish_cap_snap(ci, capsnap); |
527 | } | 526 | } |
528 | } else { | 527 | } else { |
529 | dout("queue_cap_snap %p nothing dirty|writing\n", inode); | 528 | dout("queue_cap_snap %p nothing dirty|writing\n", inode); |
530 | kfree(capsnap); | 529 | kfree(capsnap); |
531 | } | 530 | } |
532 | 531 | ||
533 | spin_unlock(&ci->i_ceph_lock); | 532 | spin_unlock(&ci->i_ceph_lock); |
534 | } | 533 | } |
535 | 534 | ||
536 | /* | 535 | /* |
537 | * Finalize the size, mtime for a cap_snap; that is, settle on final values | 536 | * Finalize the size, mtime for a cap_snap; that is, settle on final values |
538 | * to be used for the snapshot, to be flushed back to the mds. | 537 | * to be used for the snapshot, to be flushed back to the mds. |
539 | * | 538 | * |
540 | * If capsnap can now be flushed, add to snap_flush list, and return 1. | 539 | * If capsnap can now be flushed, add to snap_flush list, and return 1. |
541 | * | 540 | * |
542 | * Caller must hold i_ceph_lock. | 541 | * Caller must hold i_ceph_lock. |
543 | */ | 542 | */ |
544 | int __ceph_finish_cap_snap(struct ceph_inode_info *ci, | 543 | int __ceph_finish_cap_snap(struct ceph_inode_info *ci, |
545 | struct ceph_cap_snap *capsnap) | 544 | struct ceph_cap_snap *capsnap) |
546 | { | 545 | { |
547 | struct inode *inode = &ci->vfs_inode; | 546 | struct inode *inode = &ci->vfs_inode; |
548 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; | 547 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; |
549 | 548 | ||
550 | BUG_ON(capsnap->writing); | 549 | BUG_ON(capsnap->writing); |
551 | capsnap->size = inode->i_size; | 550 | capsnap->size = inode->i_size; |
552 | capsnap->mtime = inode->i_mtime; | 551 | capsnap->mtime = inode->i_mtime; |
553 | capsnap->atime = inode->i_atime; | 552 | capsnap->atime = inode->i_atime; |
554 | capsnap->ctime = inode->i_ctime; | 553 | capsnap->ctime = inode->i_ctime; |
555 | capsnap->time_warp_seq = ci->i_time_warp_seq; | 554 | capsnap->time_warp_seq = ci->i_time_warp_seq; |
556 | if (capsnap->dirty_pages) { | 555 | if (capsnap->dirty_pages) { |
557 | dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " | 556 | dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " |
558 | "still has %d dirty pages\n", inode, capsnap, | 557 | "still has %d dirty pages\n", inode, capsnap, |
559 | capsnap->context, capsnap->context->seq, | 558 | capsnap->context, capsnap->context->seq, |
560 | ceph_cap_string(capsnap->dirty), capsnap->size, | 559 | ceph_cap_string(capsnap->dirty), capsnap->size, |
561 | capsnap->dirty_pages); | 560 | capsnap->dirty_pages); |
562 | return 0; | 561 | return 0; |
563 | } | 562 | } |
564 | dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n", | 563 | dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n", |
565 | inode, capsnap, capsnap->context, | 564 | inode, capsnap, capsnap->context, |
566 | capsnap->context->seq, ceph_cap_string(capsnap->dirty), | 565 | capsnap->context->seq, ceph_cap_string(capsnap->dirty), |
567 | capsnap->size); | 566 | capsnap->size); |
568 | 567 | ||
569 | spin_lock(&mdsc->snap_flush_lock); | 568 | spin_lock(&mdsc->snap_flush_lock); |
570 | list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); | 569 | list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); |
571 | spin_unlock(&mdsc->snap_flush_lock); | 570 | spin_unlock(&mdsc->snap_flush_lock); |
572 | return 1; /* caller may want to ceph_flush_snaps */ | 571 | return 1; /* caller may want to ceph_flush_snaps */ |
573 | } | 572 | } |
574 | 573 | ||
575 | /* | 574 | /* |
576 | * Queue cap_snaps for snap writeback for this realm and its children. | 575 | * Queue cap_snaps for snap writeback for this realm and its children. |
577 | * Called under snap_rwsem, so realm topology won't change. | 576 | * Called under snap_rwsem, so realm topology won't change. |
578 | */ | 577 | */ |
579 | static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) | 578 | static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) |
580 | { | 579 | { |
581 | struct ceph_inode_info *ci; | 580 | struct ceph_inode_info *ci; |
582 | struct inode *lastinode = NULL; | 581 | struct inode *lastinode = NULL; |
583 | struct ceph_snap_realm *child; | 582 | struct ceph_snap_realm *child; |
584 | 583 | ||
585 | dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino); | 584 | dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino); |
586 | 585 | ||
587 | spin_lock(&realm->inodes_with_caps_lock); | 586 | spin_lock(&realm->inodes_with_caps_lock); |
588 | list_for_each_entry(ci, &realm->inodes_with_caps, | 587 | list_for_each_entry(ci, &realm->inodes_with_caps, |
589 | i_snap_realm_item) { | 588 | i_snap_realm_item) { |
590 | struct inode *inode = igrab(&ci->vfs_inode); | 589 | struct inode *inode = igrab(&ci->vfs_inode); |
591 | if (!inode) | 590 | if (!inode) |
592 | continue; | 591 | continue; |
593 | spin_unlock(&realm->inodes_with_caps_lock); | 592 | spin_unlock(&realm->inodes_with_caps_lock); |
594 | if (lastinode) | 593 | if (lastinode) |
595 | iput(lastinode); | 594 | iput(lastinode); |
596 | lastinode = inode; | 595 | lastinode = inode; |
597 | ceph_queue_cap_snap(ci); | 596 | ceph_queue_cap_snap(ci); |
598 | spin_lock(&realm->inodes_with_caps_lock); | 597 | spin_lock(&realm->inodes_with_caps_lock); |
599 | } | 598 | } |
600 | spin_unlock(&realm->inodes_with_caps_lock); | 599 | spin_unlock(&realm->inodes_with_caps_lock); |
601 | if (lastinode) | 600 | if (lastinode) |
602 | iput(lastinode); | 601 | iput(lastinode); |
603 | 602 | ||
604 | list_for_each_entry(child, &realm->children, child_item) { | 603 | list_for_each_entry(child, &realm->children, child_item) { |
605 | dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n", | 604 | dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n", |
606 | realm, realm->ino, child, child->ino); | 605 | realm, realm->ino, child, child->ino); |
607 | list_del_init(&child->dirty_item); | 606 | list_del_init(&child->dirty_item); |
608 | list_add(&child->dirty_item, &realm->dirty_item); | 607 | list_add(&child->dirty_item, &realm->dirty_item); |
609 | } | 608 | } |
610 | 609 | ||
611 | list_del_init(&realm->dirty_item); | 610 | list_del_init(&realm->dirty_item); |
612 | dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); | 611 | dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); |
613 | } | 612 | } |
614 | 613 | ||
615 | /* | 614 | /* |
616 | * Parse and apply a snapblob "snap trace" from the MDS. This specifies | 615 | * Parse and apply a snapblob "snap trace" from the MDS. This specifies |
617 | * the snap realm parameters from a given realm and all of its ancestors, | 616 | * the snap realm parameters from a given realm and all of its ancestors, |
618 | * up to the root. | 617 | * up to the root. |
619 | * | 618 | * |
620 | * Caller must hold snap_rwsem for write. | 619 | * Caller must hold snap_rwsem for write. |
621 | */ | 620 | */ |
622 | int ceph_update_snap_trace(struct ceph_mds_client *mdsc, | 621 | int ceph_update_snap_trace(struct ceph_mds_client *mdsc, |
623 | void *p, void *e, bool deletion) | 622 | void *p, void *e, bool deletion) |
624 | { | 623 | { |
625 | struct ceph_mds_snap_realm *ri; /* encoded */ | 624 | struct ceph_mds_snap_realm *ri; /* encoded */ |
626 | __le64 *snaps; /* encoded */ | 625 | __le64 *snaps; /* encoded */ |
627 | __le64 *prior_parent_snaps; /* encoded */ | 626 | __le64 *prior_parent_snaps; /* encoded */ |
628 | struct ceph_snap_realm *realm; | 627 | struct ceph_snap_realm *realm; |
629 | int invalidate = 0; | 628 | int invalidate = 0; |
630 | int err = -ENOMEM; | 629 | int err = -ENOMEM; |
631 | LIST_HEAD(dirty_realms); | 630 | LIST_HEAD(dirty_realms); |
632 | 631 | ||
633 | dout("update_snap_trace deletion=%d\n", deletion); | 632 | dout("update_snap_trace deletion=%d\n", deletion); |
634 | more: | 633 | more: |
635 | ceph_decode_need(&p, e, sizeof(*ri), bad); | 634 | ceph_decode_need(&p, e, sizeof(*ri), bad); |
636 | ri = p; | 635 | ri = p; |
637 | p += sizeof(*ri); | 636 | p += sizeof(*ri); |
638 | ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) + | 637 | ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) + |
639 | le32_to_cpu(ri->num_prior_parent_snaps)), bad); | 638 | le32_to_cpu(ri->num_prior_parent_snaps)), bad); |
640 | snaps = p; | 639 | snaps = p; |
641 | p += sizeof(u64) * le32_to_cpu(ri->num_snaps); | 640 | p += sizeof(u64) * le32_to_cpu(ri->num_snaps); |
642 | prior_parent_snaps = p; | 641 | prior_parent_snaps = p; |
643 | p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps); | 642 | p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps); |
644 | 643 | ||
645 | realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino)); | 644 | realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino)); |
646 | if (!realm) { | 645 | if (!realm) { |
647 | realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino)); | 646 | realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino)); |
648 | if (IS_ERR(realm)) { | 647 | if (IS_ERR(realm)) { |
649 | err = PTR_ERR(realm); | 648 | err = PTR_ERR(realm); |
650 | goto fail; | 649 | goto fail; |
651 | } | 650 | } |
652 | } | 651 | } |
653 | 652 | ||
654 | /* ensure the parent is correct */ | 653 | /* ensure the parent is correct */ |
655 | err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent)); | 654 | err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent)); |
656 | if (err < 0) | 655 | if (err < 0) |
657 | goto fail; | 656 | goto fail; |
658 | invalidate += err; | 657 | invalidate += err; |
659 | 658 | ||
660 | if (le64_to_cpu(ri->seq) > realm->seq) { | 659 | if (le64_to_cpu(ri->seq) > realm->seq) { |
661 | dout("update_snap_trace updating %llx %p %lld -> %lld\n", | 660 | dout("update_snap_trace updating %llx %p %lld -> %lld\n", |
662 | realm->ino, realm, realm->seq, le64_to_cpu(ri->seq)); | 661 | realm->ino, realm, realm->seq, le64_to_cpu(ri->seq)); |
663 | /* update realm parameters, snap lists */ | 662 | /* update realm parameters, snap lists */ |
664 | realm->seq = le64_to_cpu(ri->seq); | 663 | realm->seq = le64_to_cpu(ri->seq); |
665 | realm->created = le64_to_cpu(ri->created); | 664 | realm->created = le64_to_cpu(ri->created); |
666 | realm->parent_since = le64_to_cpu(ri->parent_since); | 665 | realm->parent_since = le64_to_cpu(ri->parent_since); |
667 | 666 | ||
668 | realm->num_snaps = le32_to_cpu(ri->num_snaps); | 667 | realm->num_snaps = le32_to_cpu(ri->num_snaps); |
669 | err = dup_array(&realm->snaps, snaps, realm->num_snaps); | 668 | err = dup_array(&realm->snaps, snaps, realm->num_snaps); |
670 | if (err < 0) | 669 | if (err < 0) |
671 | goto fail; | 670 | goto fail; |
672 | 671 | ||
673 | realm->num_prior_parent_snaps = | 672 | realm->num_prior_parent_snaps = |
674 | le32_to_cpu(ri->num_prior_parent_snaps); | 673 | le32_to_cpu(ri->num_prior_parent_snaps); |
675 | err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps, | 674 | err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps, |
676 | realm->num_prior_parent_snaps); | 675 | realm->num_prior_parent_snaps); |
677 | if (err < 0) | 676 | if (err < 0) |
678 | goto fail; | 677 | goto fail; |
679 | 678 | ||
680 | /* queue realm for cap_snap creation */ | 679 | /* queue realm for cap_snap creation */ |
681 | list_add(&realm->dirty_item, &dirty_realms); | 680 | list_add(&realm->dirty_item, &dirty_realms); |
682 | 681 | ||
683 | invalidate = 1; | 682 | invalidate = 1; |
684 | } else if (!realm->cached_context) { | 683 | } else if (!realm->cached_context) { |
685 | dout("update_snap_trace %llx %p seq %lld new\n", | 684 | dout("update_snap_trace %llx %p seq %lld new\n", |
686 | realm->ino, realm, realm->seq); | 685 | realm->ino, realm, realm->seq); |
687 | invalidate = 1; | 686 | invalidate = 1; |
688 | } else { | 687 | } else { |
689 | dout("update_snap_trace %llx %p seq %lld unchanged\n", | 688 | dout("update_snap_trace %llx %p seq %lld unchanged\n", |
690 | realm->ino, realm, realm->seq); | 689 | realm->ino, realm, realm->seq); |
691 | } | 690 | } |
692 | 691 | ||
693 | dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, | 692 | dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, |
694 | realm, invalidate, p, e); | 693 | realm, invalidate, p, e); |
695 | 694 | ||
696 | if (p < e) | 695 | if (p < e) |
697 | goto more; | 696 | goto more; |
698 | 697 | ||
699 | /* invalidate when we reach the _end_ (root) of the trace */ | 698 | /* invalidate when we reach the _end_ (root) of the trace */ |
700 | if (invalidate) | 699 | if (invalidate) |
701 | rebuild_snap_realms(realm); | 700 | rebuild_snap_realms(realm); |
702 | 701 | ||
703 | /* | 702 | /* |
704 | * queue cap snaps _after_ we've built the new snap contexts, | 703 | * queue cap snaps _after_ we've built the new snap contexts, |
705 | * so that i_head_snapc can be set appropriately. | 704 | * so that i_head_snapc can be set appropriately. |
706 | */ | 705 | */ |
707 | while (!list_empty(&dirty_realms)) { | 706 | while (!list_empty(&dirty_realms)) { |
708 | realm = list_first_entry(&dirty_realms, struct ceph_snap_realm, | 707 | realm = list_first_entry(&dirty_realms, struct ceph_snap_realm, |
709 | dirty_item); | 708 | dirty_item); |
710 | queue_realm_cap_snaps(realm); | 709 | queue_realm_cap_snaps(realm); |
711 | } | 710 | } |
712 | 711 | ||
713 | __cleanup_empty_realms(mdsc); | 712 | __cleanup_empty_realms(mdsc); |
714 | return 0; | 713 | return 0; |
715 | 714 | ||
716 | bad: | 715 | bad: |
717 | err = -EINVAL; | 716 | err = -EINVAL; |
718 | fail: | 717 | fail: |
719 | pr_err("update_snap_trace error %d\n", err); | 718 | pr_err("update_snap_trace error %d\n", err); |
720 | return err; | 719 | return err; |
721 | } | 720 | } |
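
Per trace entry, the decode above consumes a fixed header followed by two variable-length arrays of snap ids. A sketch of that wire layout as implied by the reads in ceph_update_snap_trace(); the field order shown is illustrative, and the authoritative packed definition is struct ceph_mds_snap_realm in the ceph protocol headers:

/* One snap trace entry as consumed above; all integers are
 * little-endian on the wire.  Field order is illustrative. */
struct snap_trace_entry {
	__le64 ino;			/* realm root inode */
	__le64 created;			/* snap seq when the realm was created */
	__le64 parent;			/* parent realm root inode */
	__le64 parent_since;		/* snap seq when parent became parent */
	__le64 seq;			/* realm version, bumped on each change */
	__le32 num_snaps;
	__le32 num_prior_parent_snaps;
	/* followed by num_snaps __le64 snap ids,
	 * then num_prior_parent_snaps __le64 snap ids */
};
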
722 | 721 | ||
723 | 722 | ||
724 | /* | 723 | /* |
725 | * Send any cap_snaps that are queued for flush. Try to carry | 724 | * Send any cap_snaps that are queued for flush. Try to carry |
726 | * s_mutex across multiple snap flushes to avoid locking overhead. | 725 | * s_mutex across multiple snap flushes to avoid locking overhead. |
727 | * | 726 | * |
728 | * Caller holds no locks. | 727 | * Caller holds no locks. |
729 | */ | 728 | */ |
730 | static void flush_snaps(struct ceph_mds_client *mdsc) | 729 | static void flush_snaps(struct ceph_mds_client *mdsc) |
731 | { | 730 | { |
732 | struct ceph_inode_info *ci; | 731 | struct ceph_inode_info *ci; |
733 | struct inode *inode; | 732 | struct inode *inode; |
734 | struct ceph_mds_session *session = NULL; | 733 | struct ceph_mds_session *session = NULL; |
735 | 734 | ||
736 | dout("flush_snaps\n"); | 735 | dout("flush_snaps\n"); |
737 | spin_lock(&mdsc->snap_flush_lock); | 736 | spin_lock(&mdsc->snap_flush_lock); |
738 | while (!list_empty(&mdsc->snap_flush_list)) { | 737 | while (!list_empty(&mdsc->snap_flush_list)) { |
739 | ci = list_first_entry(&mdsc->snap_flush_list, | 738 | ci = list_first_entry(&mdsc->snap_flush_list, |
740 | struct ceph_inode_info, i_snap_flush_item); | 739 | struct ceph_inode_info, i_snap_flush_item); |
741 | inode = &ci->vfs_inode; | 740 | inode = &ci->vfs_inode; |
742 | ihold(inode); | 741 | ihold(inode); |
743 | spin_unlock(&mdsc->snap_flush_lock); | 742 | spin_unlock(&mdsc->snap_flush_lock); |
744 | spin_lock(&ci->i_ceph_lock); | 743 | spin_lock(&ci->i_ceph_lock); |
745 | __ceph_flush_snaps(ci, &session, 0); | 744 | __ceph_flush_snaps(ci, &session, 0); |
746 | spin_unlock(&ci->i_ceph_lock); | 745 | spin_unlock(&ci->i_ceph_lock); |
747 | iput(inode); | 746 | iput(inode); |
748 | spin_lock(&mdsc->snap_flush_lock); | 747 | spin_lock(&mdsc->snap_flush_lock); |
749 | } | 748 | } |
750 | spin_unlock(&mdsc->snap_flush_lock); | 749 | spin_unlock(&mdsc->snap_flush_lock); |
751 | 750 | ||
752 | if (session) { | 751 | if (session) { |
753 | mutex_unlock(&session->s_mutex); | 752 | mutex_unlock(&session->s_mutex); |
754 | ceph_put_mds_session(session); | 753 | ceph_put_mds_session(session); |
755 | } | 754 | } |
756 | dout("flush_snaps done\n"); | 755 | dout("flush_snaps done\n"); |
757 | } | 756 | } |
758 | 757 | ||
759 | 758 | ||
760 | /* | 759 | /* |
761 | * Handle a snap notification from the MDS. | 760 | * Handle a snap notification from the MDS. |
762 | * | 761 | * |
763 | * This can take two basic forms: the simplest is just a snap creation | 762 | * This can take two basic forms: the simplest is just a snap creation |
764 | * or deletion notification on an existing realm. This should update the | 763 | * or deletion notification on an existing realm. This should update the |
765 | * realm and its children. | 764 | * realm and its children. |
766 | * | 765 | * |
767 | * The more difficult case is realm creation, due to snap creation at a | 766 | * The more difficult case is realm creation, due to snap creation at a |
768 | * new point in the file hierarchy, or due to a rename that moves a file or | 767 | * new point in the file hierarchy, or due to a rename that moves a file or |
769 | * directory into another realm. | 768 | * directory into another realm. |
770 | */ | 769 | */ |
771 | void ceph_handle_snap(struct ceph_mds_client *mdsc, | 770 | void ceph_handle_snap(struct ceph_mds_client *mdsc, |
772 | struct ceph_mds_session *session, | 771 | struct ceph_mds_session *session, |
773 | struct ceph_msg *msg) | 772 | struct ceph_msg *msg) |
774 | { | 773 | { |
775 | struct super_block *sb = mdsc->fsc->sb; | 774 | struct super_block *sb = mdsc->fsc->sb; |
776 | int mds = session->s_mds; | 775 | int mds = session->s_mds; |
777 | u64 split; | 776 | u64 split; |
778 | int op; | 777 | int op; |
779 | int trace_len; | 778 | int trace_len; |
780 | struct ceph_snap_realm *realm = NULL; | 779 | struct ceph_snap_realm *realm = NULL; |
781 | void *p = msg->front.iov_base; | 780 | void *p = msg->front.iov_base; |
782 | void *e = p + msg->front.iov_len; | 781 | void *e = p + msg->front.iov_len; |
783 | struct ceph_mds_snap_head *h; | 782 | struct ceph_mds_snap_head *h; |
784 | int num_split_inos, num_split_realms; | 783 | int num_split_inos, num_split_realms; |
785 | __le64 *split_inos = NULL, *split_realms = NULL; | 784 | __le64 *split_inos = NULL, *split_realms = NULL; |
786 | int i; | 785 | int i; |
787 | int locked_rwsem = 0; | 786 | int locked_rwsem = 0; |
788 | 787 | ||
789 | /* decode */ | 788 | /* decode */ |
790 | if (msg->front.iov_len < sizeof(*h)) | 789 | if (msg->front.iov_len < sizeof(*h)) |
791 | goto bad; | 790 | goto bad; |
792 | h = p; | 791 | h = p; |
793 | op = le32_to_cpu(h->op); | 792 | op = le32_to_cpu(h->op); |
794 | split = le64_to_cpu(h->split); /* non-zero if we are splitting an | 793 | split = le64_to_cpu(h->split); /* non-zero if we are splitting an |
795 | * existing realm */ | 794 | * existing realm */ |
796 | num_split_inos = le32_to_cpu(h->num_split_inos); | 795 | num_split_inos = le32_to_cpu(h->num_split_inos); |
797 | num_split_realms = le32_to_cpu(h->num_split_realms); | 796 | num_split_realms = le32_to_cpu(h->num_split_realms); |
798 | trace_len = le32_to_cpu(h->trace_len); | 797 | trace_len = le32_to_cpu(h->trace_len); |
799 | p += sizeof(*h); | 798 | p += sizeof(*h); |
800 | 799 | ||
801 | dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds, | 800 | dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds, |
802 | ceph_snap_op_name(op), split, trace_len); | 801 | ceph_snap_op_name(op), split, trace_len); |
803 | 802 | ||
804 | mutex_lock(&session->s_mutex); | 803 | mutex_lock(&session->s_mutex); |
805 | session->s_seq++; | 804 | session->s_seq++; |
806 | mutex_unlock(&session->s_mutex); | 805 | mutex_unlock(&session->s_mutex); |
807 | 806 | ||
808 | down_write(&mdsc->snap_rwsem); | 807 | down_write(&mdsc->snap_rwsem); |
809 | locked_rwsem = 1; | 808 | locked_rwsem = 1; |
810 | 809 | ||
811 | if (op == CEPH_SNAP_OP_SPLIT) { | 810 | if (op == CEPH_SNAP_OP_SPLIT) { |
812 | struct ceph_mds_snap_realm *ri; | 811 | struct ceph_mds_snap_realm *ri; |
813 | 812 | ||
814 | /* | 813 | /* |
815 | * A "split" breaks part of an existing realm off into | 814 | * A "split" breaks part of an existing realm off into |
816 | * a new realm. The MDS provides a list of inodes | 815 | * a new realm. The MDS provides a list of inodes |
817 | * (with caps) and child realms that belong to the new | 816 | * (with caps) and child realms that belong to the new |
818 | * child. | 817 | * child. |
819 | */ | 818 | */ |
820 | split_inos = p; | 819 | split_inos = p; |
821 | p += sizeof(u64) * num_split_inos; | 820 | p += sizeof(u64) * num_split_inos; |
822 | split_realms = p; | 821 | split_realms = p; |
823 | p += sizeof(u64) * num_split_realms; | 822 | p += sizeof(u64) * num_split_realms; |
824 | ceph_decode_need(&p, e, sizeof(*ri), bad); | 823 | ceph_decode_need(&p, e, sizeof(*ri), bad); |
825 | /* we will peek at realm info here, but will _not_ | 824 | /* we will peek at realm info here, but will _not_ |
826 | * advance p, as the realm update will occur below in | 825 | * advance p, as the realm update will occur below in |
827 | * ceph_update_snap_trace. */ | 826 | * ceph_update_snap_trace. */ |
828 | ri = p; | 827 | ri = p; |
829 | 828 | ||
830 | realm = ceph_lookup_snap_realm(mdsc, split); | 829 | realm = ceph_lookup_snap_realm(mdsc, split); |
831 | if (!realm) { | 830 | if (!realm) { |
832 | realm = ceph_create_snap_realm(mdsc, split); | 831 | realm = ceph_create_snap_realm(mdsc, split); |
833 | if (IS_ERR(realm)) | 832 | if (IS_ERR(realm)) |
834 | goto out; | 833 | goto out; |
835 | } | 834 | } |
836 | ceph_get_snap_realm(mdsc, realm); | 835 | ceph_get_snap_realm(mdsc, realm); |
837 | 836 | ||
838 | dout("splitting snap_realm %llx %p\n", realm->ino, realm); | 837 | dout("splitting snap_realm %llx %p\n", realm->ino, realm); |
839 | for (i = 0; i < num_split_inos; i++) { | 838 | for (i = 0; i < num_split_inos; i++) { |
840 | struct ceph_vino vino = { | 839 | struct ceph_vino vino = { |
841 | .ino = le64_to_cpu(split_inos[i]), | 840 | .ino = le64_to_cpu(split_inos[i]), |
842 | .snap = CEPH_NOSNAP, | 841 | .snap = CEPH_NOSNAP, |
843 | }; | 842 | }; |
844 | struct inode *inode = ceph_find_inode(sb, vino); | 843 | struct inode *inode = ceph_find_inode(sb, vino); |
845 | struct ceph_inode_info *ci; | 844 | struct ceph_inode_info *ci; |
846 | struct ceph_snap_realm *oldrealm; | 845 | struct ceph_snap_realm *oldrealm; |
847 | 846 | ||
848 | if (!inode) | 847 | if (!inode) |
849 | continue; | 848 | continue; |
850 | ci = ceph_inode(inode); | 849 | ci = ceph_inode(inode); |
851 | 850 | ||
852 | spin_lock(&ci->i_ceph_lock); | 851 | spin_lock(&ci->i_ceph_lock); |
853 | if (!ci->i_snap_realm) | 852 | if (!ci->i_snap_realm) |
854 | goto skip_inode; | 853 | goto skip_inode; |
855 | /* | 854 | /* |
856 | * If this inode belongs to a realm that was | 855 | * If this inode belongs to a realm that was |
857 | * created after our new realm, we experienced | 856 | * created after our new realm, we experienced |
858 | * a race (due to other split notifications | 857 | * a race (due to other split notifications |
859 | * arriving from a different MDS). So skip | 858 | * arriving from a different MDS). So skip |
860 | * this inode. | 859 | * this inode. |
861 | */ | 860 | */ |
862 | if (ci->i_snap_realm->created > | 861 | if (ci->i_snap_realm->created > |
863 | le64_to_cpu(ri->created)) { | 862 | le64_to_cpu(ri->created)) { |
864 | dout(" leaving %p in newer realm %llx %p\n", | 863 | dout(" leaving %p in newer realm %llx %p\n", |
865 | inode, ci->i_snap_realm->ino, | 864 | inode, ci->i_snap_realm->ino, |
866 | ci->i_snap_realm); | 865 | ci->i_snap_realm); |
867 | goto skip_inode; | 866 | goto skip_inode; |
868 | } | 867 | } |
869 | dout(" will move %p to split realm %llx %p\n", | 868 | dout(" will move %p to split realm %llx %p\n", |
870 | inode, realm->ino, realm); | 869 | inode, realm->ino, realm); |
871 | /* | 870 | /* |
872 | * Move the inode to the new realm | 871 | * Move the inode to the new realm |
873 | */ | 872 | */ |
874 | spin_lock(&realm->inodes_with_caps_lock); | 873 | spin_lock(&realm->inodes_with_caps_lock); |
875 | list_del_init(&ci->i_snap_realm_item); | 874 | list_del_init(&ci->i_snap_realm_item); |
876 | list_add(&ci->i_snap_realm_item, | 875 | list_add(&ci->i_snap_realm_item, |
877 | &realm->inodes_with_caps); | 876 | &realm->inodes_with_caps); |
878 | oldrealm = ci->i_snap_realm; | 877 | oldrealm = ci->i_snap_realm; |
879 | ci->i_snap_realm = realm; | 878 | ci->i_snap_realm = realm; |
880 | spin_unlock(&realm->inodes_with_caps_lock); | 879 | spin_unlock(&realm->inodes_with_caps_lock); |
881 | spin_unlock(&ci->i_ceph_lock); | 880 | spin_unlock(&ci->i_ceph_lock); |
882 | 881 | ||
883 | ceph_get_snap_realm(mdsc, realm); | 882 | ceph_get_snap_realm(mdsc, realm); |
884 | ceph_put_snap_realm(mdsc, oldrealm); | 883 | ceph_put_snap_realm(mdsc, oldrealm); |
885 | 884 | ||
886 | iput(inode); | 885 | iput(inode); |
887 | continue; | 886 | continue; |
888 | 887 | ||
889 | skip_inode: | 888 | skip_inode: |
890 | spin_unlock(&ci->i_ceph_lock); | 889 | spin_unlock(&ci->i_ceph_lock); |
891 | iput(inode); | 890 | iput(inode); |
892 | } | 891 | } |
893 | 892 | ||
894 | /* we may have taken some of the old realm's children. */ | 893 | /* we may have taken some of the old realm's children. */ |
895 | for (i = 0; i < num_split_realms; i++) { | 894 | for (i = 0; i < num_split_realms; i++) { |
896 | struct ceph_snap_realm *child = | 895 | struct ceph_snap_realm *child = |
897 | ceph_lookup_snap_realm(mdsc, | 896 | ceph_lookup_snap_realm(mdsc, |
898 | le64_to_cpu(split_realms[i])); | 897 | le64_to_cpu(split_realms[i])); |
899 | if (!child) | 898 | if (!child) |
900 | continue; | 899 | continue; |
901 | adjust_snap_realm_parent(mdsc, child, realm->ino); | 900 | adjust_snap_realm_parent(mdsc, child, realm->ino); |
902 | } | 901 | } |
903 | } | 902 | } |
904 | 903 | ||
905 | /* | 904 | /* |
906 | * update using the provided snap trace. if we are deleting a | 905 | * update using the provided snap trace. if we are deleting a |
907 | * snap, we can avoid queueing cap_snaps. | 906 | * snap, we can avoid queueing cap_snaps. |
908 | */ | 907 | */ |
909 | ceph_update_snap_trace(mdsc, p, e, | 908 | ceph_update_snap_trace(mdsc, p, e, |
910 | op == CEPH_SNAP_OP_DESTROY); | 909 | op == CEPH_SNAP_OP_DESTROY); |
911 | 910 | ||
912 | if (op == CEPH_SNAP_OP_SPLIT) | 911 | if (op == CEPH_SNAP_OP_SPLIT) |
913 | /* we took a reference when we created the realm, above */ | 912 | /* we took a reference when we created the realm, above */ |
914 | ceph_put_snap_realm(mdsc, realm); | 913 | ceph_put_snap_realm(mdsc, realm); |
915 | 914 | ||
916 | __cleanup_empty_realms(mdsc); | 915 | __cleanup_empty_realms(mdsc); |
917 | 916 | ||
918 | up_write(&mdsc->snap_rwsem); | 917 | up_write(&mdsc->snap_rwsem); |
919 | 918 | ||
920 | flush_snaps(mdsc); | 919 | flush_snaps(mdsc); |
921 | return; | 920 | return; |
922 | 921 | ||
923 | bad: | 922 | bad: |
924 | pr_err("corrupt snap message from mds%d\n", mds); | 923 | pr_err("corrupt snap message from mds%d\n", mds); |
925 | ceph_msg_dump(msg); | 924 | ceph_msg_dump(msg); |
926 | out: | 925 | out: |
927 | if (locked_rwsem) | 926 | if (locked_rwsem) |
928 | up_write(&mdsc->snap_rwsem); | 927 | up_write(&mdsc->snap_rwsem); |
929 | return; | 928 | return; |
930 | } | 929 | } |
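
The hunk above is all context, but the decode walk at its top is easy to lose in the diff columns: a fixed header, then two arrays of __le64 values (split inos, then child realm inos), then the snap trace. Below is a minimal userspace sketch of that same payload walk. This is not kernel code: the struct merely mirrors the ceph_mds_snap_head fields read above, glibc's le32toh()/le64toh()/htole*() from <endian.h> stand in for the kernel's le*_to_cpu() helpers, and the op value in main() is an illustrative stand-in, not a real CEPH_SNAP_OP_* constant.

    /* Illustrative only -- userspace sketch of the snap-message payload
     * layout that ceph_handle_snap() walks above. */
    #include <endian.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    struct snap_head {                 /* mirrors ceph_mds_snap_head */
            uint32_t op;               /* CEPH_SNAP_OP_* */
            uint64_t split;            /* non-zero: ino of realm being split */
            uint32_t num_split_inos;
            uint32_t num_split_realms;
            uint32_t trace_len;        /* bytes of snap trace that follow */
    } __attribute__((packed));

    /* Walk the payload the same way the handler does: header, then the
     * split ino list, then the child realm list, then the snap trace. */
    static int parse_snap_msg(const void *buf, size_t len)
    {
            const uint8_t *p = buf;
            const uint8_t *end = p + len;
            struct snap_head h;
            uint32_t i, n_inos, n_realms;

            if (len < sizeof(h))
                    return -1;         /* mirrors the "goto bad" above */
            memcpy(&h, p, sizeof(h));
            p += sizeof(h);

            n_inos = le32toh(h.num_split_inos);
            n_realms = le32toh(h.num_split_realms);
            printf("op %u split %llx trace_len %u\n", le32toh(h.op),
                   (unsigned long long)le64toh(h.split), le32toh(h.trace_len));

            /* the ino and child-realm lists are plain arrays of __le64 */
            if ((size_t)(end - p) < 8ULL * ((uint64_t)n_inos + n_realms))
                    return -1;
            for (i = 0; i < n_inos; i++, p += 8) {
                    uint64_t ino;

                    memcpy(&ino, p, sizeof(ino));
                    printf("  split ino %llx\n",
                           (unsigned long long)le64toh(ino));
            }
            p += 8ULL * n_realms;      /* skip the child realm inos */

            /* everything left is the snap trace that the real handler
             * hands to ceph_update_snap_trace() */
            printf("snap trace: %zu bytes\n", (size_t)(end - p));
            return 0;
    }

    int main(void)
    {
            uint8_t buf[sizeof(struct snap_head) + 8];
            struct snap_head h = {
                    .op = htole32(3),  /* stand-in for a split op */
                    .split = htole64(0x100),
                    .num_split_inos = htole32(1),
                    .num_split_realms = htole32(0),
                    .trace_len = htole32(0),
            };
            uint64_t ino = htole64(0x123);

            memcpy(buf, &h, sizeof(h));
            memcpy(buf + sizeof(h), &ino, sizeof(ino));
            return parse_snap_msg(buf, sizeof(buf));
    }

Note how the handler only peeks at the ceph_mds_snap_realm record after the two arrays without advancing p; the sketch's trailing "snap trace" region corresponds to everything ceph_update_snap_trace() later consumes.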
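One other pattern in the split loop deserves a note: when an inode is moved to the new realm, the handler repoints ci->i_snap_realm, takes a reference on the new realm (ceph_get_snap_realm), and only then drops the old realm's reference (ceph_put_snap_realm), so the realm the inode points at is always pinned. The following is a toy userspace analog of that ordering, with plain counters instead of the kernel's locked refcounts; every name here is made up for illustration.

    /* Toy sketch of the "move between refcounted containers" ordering
     * used in the split loop above -- not kernel code. */
    #include <assert.h>
    #include <stdio.h>

    struct realm {                      /* toy stand-in for ceph_snap_realm */
            const char *name;
            int refcount;               /* the kernel pairs this with locking */
    };

    static void realm_get(struct realm *r)
    {
            r->refcount++;
    }

    static void realm_put(struct realm *r)
    {
            if (--r->refcount == 0)
                    printf("%s: last ref dropped, realm would be freed\n",
                           r->name);
    }

    struct inode_info {                 /* toy stand-in for ceph_inode_info */
            struct realm *snap_realm;   /* the inode pins its realm */
    };

    /* Same ordering as the split loop: repoint the inode, grab the new
     * realm's reference, then release the old one, so neither realm can
     * be freed while the inode is in transit. */
    static void move_inode(struct inode_info *ci, struct realm *new_realm)
    {
            struct realm *old_realm = ci->snap_realm;

            ci->snap_realm = new_realm;
            realm_get(new_realm);
            realm_put(old_realm);
    }

    int main(void)
    {
            struct realm parent = { "parent", 1 };  /* pinned by the inode */
            struct realm child = { "child", 1 };    /* pinned by its creator */
            struct inode_info ci = { &parent };

            move_inode(&ci, &child);
            assert(ci.snap_realm == &child);
            assert(child.refcount == 2);            /* creator + inode */

            realm_put(&child);  /* creator's ref dropped, like the put
                                 * after ceph_update_snap_trace() above */
            return 0;
    }

This also explains the comment "we took a reference when we created the realm, above": the extra reference taken while building the new realm keeps it alive across ceph_update_snap_trace() and is released only once the trace has been applied.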