Commit 0adfc56ce8fdc5c17630434e49f30536ba7b8559
Exists in master and in 7 other branches
Merge git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
* git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client:
  rbd: use watch/notify for changes in rbd header
  libceph: add lingering request and watch/notify event framework
  rbd: update email address in Documentation
  ceph: rename dentry_release -> d_release, fix comment
  ceph: add request to the tail of unsafe write list
  ceph: remove request from unsafe list if it is canceled/timed out
  ceph: move readahead default to fs/ceph from libceph
  ceph: add ino32 mount option
  ceph: update common header files
  ceph: remove debugfs debug cruft
  libceph: fix osd request queuing on osdmap updates
  ceph: preserve I_COMPLETE across rename
  libceph: Fix base64-decoding when input ends in newline.
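
For orientation, the headline change in this merge is the libceph watch/notify event framework with lingering (automatically resubmitted) requests, which rbd uses to notice header updates. The sketch below is not part of the commit: it shows how a hypothetical in-kernel caller might arm a watch using the API declared in include/linux/ceph/osd_client.h further down in this diff. Only the ceph_osdc_* calls come from this merge; my_watch_cb, arm_watch, and the surrounding setup are illustrative names, and error handling plus the NOTIFY_ACK step are elided.

    #include <linux/ceph/osd_client.h>

    /* Callback invoked from the osd client's notify workqueue when another
     * client sends a notify on the watched object. */
    static void my_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
    {
            /* a real watcher would refresh its cached state here and then
             * acknowledge with a CEPH_OSD_OP_NOTIFY_ACK request */
    }

    /* Arm a watch: allocate an event, mark the CEPH_OSD_OP_WATCH request as
     * lingering so libceph resubmits it across osdmap changes, and submit it. */
    static int arm_watch(struct ceph_osd_client *osdc,
                         struct ceph_osd_request *watch_req, void *priv,
                         struct ceph_osd_event **pevent)
    {
            int ret;

            ret = ceph_osdc_create_event(osdc, my_watch_cb, 0, priv, pevent);
            if (ret < 0)
                    return ret;

            /* (*pevent)->cookie would be copied into the WATCH op's cookie
             * field, as rbd_req_sync_watch does in the rbd.c hunk below */
            ceph_osdc_set_request_linger(osdc, watch_req);

            ret = ceph_osdc_start_request(osdc, watch_req, false);
            if (ret < 0) {
                    ceph_osdc_cancel_event(*pevent);
                    *pevent = NULL;
            }
            return ret;
    }

Teardown mirrors rbd_dev_release in the diff: ceph_osdc_unregister_linger_request() on the watch request, then ceph_osdc_cancel_event() on the event.
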
Showing 15 changed files
- Documentation/ABI/testing/sysfs-bus-rbd
- drivers/block/rbd.c
- fs/ceph/debugfs.c
- fs/ceph/dir.c
- fs/ceph/file.c
- fs/ceph/inode.c
- fs/ceph/super.c
- fs/ceph/super.h
- include/linux/ceph/ceph_fs.h
- include/linux/ceph/libceph.h
- include/linux/ceph/osd_client.h
- include/linux/ceph/rados.h
- net/ceph/armor.c
- net/ceph/ceph_common.c
- net/ceph/osd_client.c
Documentation/ABI/testing/sysfs-bus-rbd
drivers/block/rbd.c
... | ... | @@ -31,6 +31,7 @@ |
31 | 31 | #include <linux/ceph/osd_client.h> |
32 | 32 | #include <linux/ceph/mon_client.h> |
33 | 33 | #include <linux/ceph/decode.h> |
34 | +#include <linux/parser.h> | |
34 | 35 | |
35 | 36 | #include <linux/kernel.h> |
36 | 37 | #include <linux/device.h> |
... | ... | @@ -54,6 +55,8 @@ |
54 | 55 | |
55 | 56 | #define DEV_NAME_LEN 32 |
56 | 57 | |
58 | +#define RBD_NOTIFY_TIMEOUT_DEFAULT 10 | |
59 | + | |
57 | 60 | /* |
58 | 61 | * block device image metadata (in-memory version) |
59 | 62 | */ |
60 | 63 | |
61 | 64 | |
... | ... | @@ -71,13 +74,20 @@ |
71 | 74 | |
72 | 75 | char *snap_names; |
73 | 76 | u64 *snap_sizes; |
77 | + | |
78 | + u64 obj_version; | |
74 | 79 | }; |
75 | 80 | |
81 | +struct rbd_options { | |
82 | + int notify_timeout; | |
83 | +}; | |
84 | + | |
76 | 85 | /* |
77 | 86 | * an instance of the client. multiple devices may share a client. |
78 | 87 | */ |
79 | 88 | struct rbd_client { |
80 | 89 | struct ceph_client *client; |
90 | + struct rbd_options *rbd_opts; | |
81 | 91 | struct kref kref; |
82 | 92 | struct list_head node; |
83 | 93 | }; |
... | ... | @@ -124,6 +134,9 @@ |
124 | 134 | char pool_name[RBD_MAX_POOL_NAME_LEN]; |
125 | 135 | int poolid; |
126 | 136 | |
137 | + struct ceph_osd_event *watch_event; | |
138 | + struct ceph_osd_request *watch_request; | |
139 | + | |
127 | 140 | char snap_name[RBD_MAX_SNAP_NAME_LEN]; |
128 | 141 | u32 cur_snap; /* index+1 of current snapshot within snap context |
129 | 142 | 0 - for the head */ |
... | ... | @@ -177,6 +190,8 @@ |
177 | 190 | put_device(&rbd_dev->dev); |
178 | 191 | } |
179 | 192 | |
193 | +static int __rbd_update_snaps(struct rbd_device *rbd_dev); | |
194 | + | |
180 | 195 | static int rbd_open(struct block_device *bdev, fmode_t mode) |
181 | 196 | { |
182 | 197 | struct gendisk *disk = bdev->bd_disk; |
... | ... | @@ -211,7 +226,8 @@ |
211 | 226 | * Initialize an rbd client instance. |
212 | 227 | * We own *opt. |
213 | 228 | */ |
214 | -static struct rbd_client *rbd_client_create(struct ceph_options *opt) | |
229 | +static struct rbd_client *rbd_client_create(struct ceph_options *opt, | |
230 | + struct rbd_options *rbd_opts) | |
215 | 231 | { |
216 | 232 | struct rbd_client *rbdc; |
217 | 233 | int ret = -ENOMEM; |
... | ... | @@ -233,6 +249,8 @@ |
233 | 249 | if (ret < 0) |
234 | 250 | goto out_err; |
235 | 251 | |
252 | + rbdc->rbd_opts = rbd_opts; | |
253 | + | |
236 | 254 | spin_lock(&node_lock); |
237 | 255 | list_add_tail(&rbdc->node, &rbd_client_list); |
238 | 256 | spin_unlock(&node_lock); |
... | ... | @@ -267,6 +285,59 @@ |
267 | 285 | } |
268 | 286 | |
269 | 287 | /* |
288 | + * mount options | |
289 | + */ | |
290 | +enum { | |
291 | + Opt_notify_timeout, | |
292 | + Opt_last_int, | |
293 | + /* int args above */ | |
294 | + Opt_last_string, | |
295 | + /* string args above */ | |
296 | +}; | |
297 | + | |
298 | +static match_table_t rbdopt_tokens = { | |
299 | + {Opt_notify_timeout, "notify_timeout=%d"}, | |
300 | + /* int args above */ | |
301 | + /* string args above */ | |
302 | + {-1, NULL} | |
303 | +}; | |
304 | + | |
305 | +static int parse_rbd_opts_token(char *c, void *private) | |
306 | +{ | |
307 | + struct rbd_options *rbdopt = private; | |
308 | + substring_t argstr[MAX_OPT_ARGS]; | |
309 | + int token, intval, ret; | |
310 | + | |
311 | + token = match_token((char *)c, rbdopt_tokens, argstr); | |
312 | + if (token < 0) | |
313 | + return -EINVAL; | |
314 | + | |
315 | + if (token < Opt_last_int) { | |
316 | + ret = match_int(&argstr[0], &intval); | |
317 | + if (ret < 0) { | |
318 | + pr_err("bad mount option arg (not int) " | |
319 | + "at '%s'\n", c); | |
320 | + return ret; | |
321 | + } | |
322 | + dout("got int token %d val %d\n", token, intval); | |
323 | + } else if (token > Opt_last_int && token < Opt_last_string) { | |
324 | + dout("got string token %d val %s\n", token, | |
325 | + argstr[0].from); | |
326 | + } else { | |
327 | + dout("got token %d\n", token); | |
328 | + } | |
329 | + | |
330 | + switch (token) { | |
331 | + case Opt_notify_timeout: | |
332 | + rbdopt->notify_timeout = intval; | |
333 | + break; | |
334 | + default: | |
335 | + BUG_ON(token); | |
336 | + } | |
337 | + return 0; | |
338 | +} | |
339 | + | |
340 | +/* | |
270 | 341 | * Get a ceph client with specific addr and configuration, if one does |
271 | 342 | * not exist create it. |
272 | 343 | */ |
273 | 344 | |
274 | 345 | |
275 | 346 | |
... | ... | @@ -276,11 +347,18 @@ |
276 | 347 | struct rbd_client *rbdc; |
277 | 348 | struct ceph_options *opt; |
278 | 349 | int ret; |
350 | + struct rbd_options *rbd_opts; | |
279 | 351 | |
352 | + rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL); | |
353 | + if (!rbd_opts) | |
354 | + return -ENOMEM; | |
355 | + | |
356 | + rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; | |
357 | + | |
280 | 358 | ret = ceph_parse_options(&opt, options, mon_addr, |
281 | - mon_addr + strlen(mon_addr), NULL, NULL); | |
359 | + mon_addr + strlen(mon_addr), parse_rbd_opts_token, rbd_opts); | |
282 | 360 | if (ret < 0) |
283 | - return ret; | |
361 | + goto done_err; | |
284 | 362 | |
285 | 363 | spin_lock(&node_lock); |
286 | 364 | rbdc = __rbd_client_find(opt); |
287 | 365 | |
... | ... | @@ -296,13 +374,18 @@ |
296 | 374 | } |
297 | 375 | spin_unlock(&node_lock); |
298 | 376 | |
299 | - rbdc = rbd_client_create(opt); | |
300 | - if (IS_ERR(rbdc)) | |
301 | - return PTR_ERR(rbdc); | |
377 | + rbdc = rbd_client_create(opt, rbd_opts); | |
378 | + if (IS_ERR(rbdc)) { | |
379 | + ret = PTR_ERR(rbdc); | |
380 | + goto done_err; | |
381 | + } | |
302 | 382 | |
303 | 383 | rbd_dev->rbd_client = rbdc; |
304 | 384 | rbd_dev->client = rbdc->client; |
305 | 385 | return 0; |
386 | +done_err: | |
387 | + kfree(rbd_opts); | |
388 | + return ret; | |
306 | 389 | } |
307 | 390 | |
308 | 391 | /* |
... | ... | @@ -318,6 +401,7 @@ |
318 | 401 | spin_unlock(&node_lock); |
319 | 402 | |
320 | 403 | ceph_destroy_client(rbdc->client); |
404 | + kfree(rbdc->rbd_opts); | |
321 | 405 | kfree(rbdc); |
322 | 406 | } |
323 | 407 | |
... | ... | @@ -666,7 +750,9 @@ |
666 | 750 | struct ceph_osd_req_op *ops, |
667 | 751 | int num_reply, |
668 | 752 | void (*rbd_cb)(struct ceph_osd_request *req, |
669 | - struct ceph_msg *msg)) | |
753 | + struct ceph_msg *msg), | |
754 | + struct ceph_osd_request **linger_req, | |
755 | + u64 *ver) | |
670 | 756 | { |
671 | 757 | struct ceph_osd_request *req; |
672 | 758 | struct ceph_file_layout *layout; |
673 | 759 | |
... | ... | @@ -729,12 +815,20 @@ |
729 | 815 | req->r_oid, req->r_oid_len); |
730 | 816 | up_read(&header->snap_rwsem); |
731 | 817 | |
818 | + if (linger_req) { | |
819 | + ceph_osdc_set_request_linger(&dev->client->osdc, req); | |
820 | + *linger_req = req; | |
821 | + } | |
822 | + | |
732 | 823 | ret = ceph_osdc_start_request(&dev->client->osdc, req, false); |
733 | 824 | if (ret < 0) |
734 | 825 | goto done_err; |
735 | 826 | |
736 | 827 | if (!rbd_cb) { |
737 | 828 | ret = ceph_osdc_wait_request(&dev->client->osdc, req); |
829 | + if (ver) | |
830 | + *ver = le64_to_cpu(req->r_reassert_version.version); | |
831 | + dout("reassert_ver=%lld\n", le64_to_cpu(req->r_reassert_version.version)); | |
738 | 832 | ceph_osdc_put_request(req); |
739 | 833 | } |
740 | 834 | return ret; |
... | ... | @@ -789,6 +883,11 @@ |
789 | 883 | kfree(req_data); |
790 | 884 | } |
791 | 885 | |
886 | +static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) | |
887 | +{ | |
888 | + ceph_osdc_put_request(req); | |
889 | +} | |
890 | + | |
792 | 891 | /* |
793 | 892 | * Do a synchronous ceph osd operation |
794 | 893 | */ |
... | ... | @@ -801,7 +900,9 @@ |
801 | 900 | int num_reply, |
802 | 901 | const char *obj, |
803 | 902 | u64 ofs, u64 len, |
804 | - char *buf) | |
903 | + char *buf, | |
904 | + struct ceph_osd_request **linger_req, | |
905 | + u64 *ver) | |
805 | 906 | { |
806 | 907 | int ret; |
807 | 908 | struct page **pages; |
... | ... | @@ -833,7 +934,8 @@ |
833 | 934 | flags, |
834 | 935 | ops, |
835 | 936 | 2, |
836 | - NULL); | |
937 | + NULL, | |
938 | + linger_req, ver); | |
837 | 939 | if (ret < 0) |
838 | 940 | goto done_ops; |
839 | 941 | |
... | ... | @@ -893,7 +995,7 @@ |
893 | 995 | flags, |
894 | 996 | ops, |
895 | 997 | num_reply, |
896 | - rbd_req_cb); | |
998 | + rbd_req_cb, 0, NULL); | |
897 | 999 | done: |
898 | 1000 | kfree(seg_name); |
899 | 1001 | return ret; |
900 | 1002 | |
901 | 1003 | |
902 | 1004 | |
... | ... | @@ -940,19 +1042,175 @@ |
940 | 1042 | u64 snapid, |
941 | 1043 | const char *obj, |
942 | 1044 | u64 ofs, u64 len, |
943 | - char *buf) | |
1045 | + char *buf, | |
1046 | + u64 *ver) | |
944 | 1047 | { |
945 | 1048 | return rbd_req_sync_op(dev, NULL, |
946 | 1049 | (snapid ? snapid : CEPH_NOSNAP), |
947 | 1050 | CEPH_OSD_OP_READ, |
948 | 1051 | CEPH_OSD_FLAG_READ, |
949 | 1052 | NULL, |
950 | - 1, obj, ofs, len, buf); | |
1053 | + 1, obj, ofs, len, buf, NULL, ver); | |
951 | 1054 | } |
952 | 1055 | |
953 | 1056 | /* |
954 | - * Request sync osd read | |
1057 | + * Request sync osd watch | |
955 | 1058 | */ |
1059 | +static int rbd_req_sync_notify_ack(struct rbd_device *dev, | |
1060 | + u64 ver, | |
1061 | + u64 notify_id, | |
1062 | + const char *obj) | |
1063 | +{ | |
1064 | + struct ceph_osd_req_op *ops; | |
1065 | + struct page **pages = NULL; | |
1066 | + int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0); | |
1067 | + if (ret < 0) | |
1068 | + return ret; | |
1069 | + | |
1070 | + ops[0].watch.ver = cpu_to_le64(dev->header.obj_version); | |
1071 | + ops[0].watch.cookie = notify_id; | |
1072 | + ops[0].watch.flag = 0; | |
1073 | + | |
1074 | + ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP, | |
1075 | + obj, 0, 0, NULL, | |
1076 | + pages, 0, | |
1077 | + CEPH_OSD_FLAG_READ, | |
1078 | + ops, | |
1079 | + 1, | |
1080 | + rbd_simple_req_cb, 0, NULL); | |
1081 | + | |
1082 | + rbd_destroy_ops(ops); | |
1083 | + return ret; | |
1084 | +} | |
1085 | + | |
1086 | +static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) | |
1087 | +{ | |
1088 | + struct rbd_device *dev = (struct rbd_device *)data; | |
1089 | + if (!dev) | |
1090 | + return; | |
1091 | + | |
1092 | + dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name, | |
1093 | + notify_id, (int)opcode); | |
1094 | + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); | |
1095 | + __rbd_update_snaps(dev); | |
1096 | + mutex_unlock(&ctl_mutex); | |
1097 | + | |
1098 | + rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name); | |
1099 | +} | |
1100 | + | |
1101 | +/* | |
1102 | + * Request sync osd watch | |
1103 | + */ | |
1104 | +static int rbd_req_sync_watch(struct rbd_device *dev, | |
1105 | + const char *obj, | |
1106 | + u64 ver) | |
1107 | +{ | |
1108 | + struct ceph_osd_req_op *ops; | |
1109 | + struct ceph_osd_client *osdc = &dev->client->osdc; | |
1110 | + | |
1111 | + int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0); | |
1112 | + if (ret < 0) | |
1113 | + return ret; | |
1114 | + | |
1115 | + ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, | |
1116 | + (void *)dev, &dev->watch_event); | |
1117 | + if (ret < 0) | |
1118 | + goto fail; | |
1119 | + | |
1120 | + ops[0].watch.ver = cpu_to_le64(ver); | |
1121 | + ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie); | |
1122 | + ops[0].watch.flag = 1; | |
1123 | + | |
1124 | + ret = rbd_req_sync_op(dev, NULL, | |
1125 | + CEPH_NOSNAP, | |
1126 | + 0, | |
1127 | + CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, | |
1128 | + ops, | |
1129 | + 1, obj, 0, 0, NULL, | |
1130 | + &dev->watch_request, NULL); | |
1131 | + | |
1132 | + if (ret < 0) | |
1133 | + goto fail_event; | |
1134 | + | |
1135 | + rbd_destroy_ops(ops); | |
1136 | + return 0; | |
1137 | + | |
1138 | +fail_event: | |
1139 | + ceph_osdc_cancel_event(dev->watch_event); | |
1140 | + dev->watch_event = NULL; | |
1141 | +fail: | |
1142 | + rbd_destroy_ops(ops); | |
1143 | + return ret; | |
1144 | +} | |
1145 | + | |
1146 | +struct rbd_notify_info { | |
1147 | + struct rbd_device *dev; | |
1148 | +}; | |
1149 | + | |
1150 | +static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data) | |
1151 | +{ | |
1152 | + struct rbd_device *dev = (struct rbd_device *)data; | |
1153 | + if (!dev) | |
1154 | + return; | |
1155 | + | |
1156 | + dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name, | |
1157 | + notify_id, (int)opcode); | |
1158 | +} | |
1159 | + | |
1160 | +/* | |
1161 | + * Request sync osd notify | |
1162 | + */ | |
1163 | +static int rbd_req_sync_notify(struct rbd_device *dev, | |
1164 | + const char *obj) | |
1165 | +{ | |
1166 | + struct ceph_osd_req_op *ops; | |
1167 | + struct ceph_osd_client *osdc = &dev->client->osdc; | |
1168 | + struct ceph_osd_event *event; | |
1169 | + struct rbd_notify_info info; | |
1170 | + int payload_len = sizeof(u32) + sizeof(u32); | |
1171 | + int ret; | |
1172 | + | |
1173 | + ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len); | |
1174 | + if (ret < 0) | |
1175 | + return ret; | |
1176 | + | |
1177 | + info.dev = dev; | |
1178 | + | |
1179 | + ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1, | |
1180 | + (void *)&info, &event); | |
1181 | + if (ret < 0) | |
1182 | + goto fail; | |
1183 | + | |
1184 | + ops[0].watch.ver = 1; | |
1185 | + ops[0].watch.flag = 1; | |
1186 | + ops[0].watch.cookie = event->cookie; | |
1187 | + ops[0].watch.prot_ver = RADOS_NOTIFY_VER; | |
1188 | + ops[0].watch.timeout = 12; | |
1189 | + | |
1190 | + ret = rbd_req_sync_op(dev, NULL, | |
1191 | + CEPH_NOSNAP, | |
1192 | + 0, | |
1193 | + CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, | |
1194 | + ops, | |
1195 | + 1, obj, 0, 0, NULL, NULL, NULL); | |
1196 | + if (ret < 0) | |
1197 | + goto fail_event; | |
1198 | + | |
1199 | + ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT); | |
1200 | + dout("ceph_osdc_wait_event returned %d\n", ret); | |
1201 | + rbd_destroy_ops(ops); | |
1202 | + return 0; | |
1203 | + | |
1204 | +fail_event: | |
1205 | + ceph_osdc_cancel_event(event); | |
1206 | +fail: | |
1207 | + rbd_destroy_ops(ops); | |
1208 | + return ret; | |
1209 | +} | |
1210 | + | |
1211 | +/* | |
1212 | + * Request sync osd rollback | |
1213 | + */ | |
956 | 1214 | static int rbd_req_sync_rollback_obj(struct rbd_device *dev, |
957 | 1215 | u64 snapid, |
958 | 1216 | const char *obj) |
959 | 1217 | |
... | ... | @@ -969,13 +1227,10 @@ |
969 | 1227 | 0, |
970 | 1228 | CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, |
971 | 1229 | ops, |
972 | - 1, obj, 0, 0, NULL); | |
1230 | + 1, obj, 0, 0, NULL, NULL, NULL); | |
973 | 1231 | |
974 | 1232 | rbd_destroy_ops(ops); |
975 | 1233 | |
976 | - if (ret < 0) | |
977 | - return ret; | |
978 | - | |
979 | 1234 | return ret; |
980 | 1235 | } |
981 | 1236 | |
... | ... | @@ -987,7 +1242,8 @@ |
987 | 1242 | const char *cls, |
988 | 1243 | const char *method, |
989 | 1244 | const char *data, |
990 | - int len) | |
1245 | + int len, | |
1246 | + u64 *ver) | |
991 | 1247 | { |
992 | 1248 | struct ceph_osd_req_op *ops; |
993 | 1249 | int cls_len = strlen(cls); |
... | ... | @@ -1010,7 +1266,7 @@ |
1010 | 1266 | 0, |
1011 | 1267 | CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, |
1012 | 1268 | ops, |
1013 | - 1, obj, 0, 0, NULL); | |
1269 | + 1, obj, 0, 0, NULL, NULL, ver); | |
1014 | 1270 | |
1015 | 1271 | rbd_destroy_ops(ops); |
1016 | 1272 | |
... | ... | @@ -1156,6 +1412,7 @@ |
1156 | 1412 | struct rbd_image_header_ondisk *dh; |
1157 | 1413 | int snap_count = 0; |
1158 | 1414 | u64 snap_names_len = 0; |
1415 | + u64 ver; | |
1159 | 1416 | |
1160 | 1417 | while (1) { |
1161 | 1418 | int len = sizeof(*dh) + |
... | ... | @@ -1171,7 +1428,7 @@ |
1171 | 1428 | NULL, CEPH_NOSNAP, |
1172 | 1429 | rbd_dev->obj_md_name, |
1173 | 1430 | 0, len, |
1174 | - (char *)dh); | |
1431 | + (char *)dh, &ver); | |
1175 | 1432 | if (rc < 0) |
1176 | 1433 | goto out_dh; |
1177 | 1434 | |
... | ... | @@ -1188,6 +1445,7 @@ |
1188 | 1445 | } |
1189 | 1446 | break; |
1190 | 1447 | } |
1448 | + header->obj_version = ver; | |
1191 | 1449 | |
1192 | 1450 | out_dh: |
1193 | 1451 | kfree(dh); |
... | ... | @@ -1205,6 +1463,7 @@ |
1205 | 1463 | u64 new_snapid; |
1206 | 1464 | int ret; |
1207 | 1465 | void *data, *data_start, *data_end; |
1466 | + u64 ver; | |
1208 | 1467 | |
1209 | 1468 | /* we should create a snapshot only if we're pointing at the head */ |
1210 | 1469 | if (dev->cur_snap) |
... | ... | @@ -1227,7 +1486,7 @@ |
1227 | 1486 | ceph_encode_64_safe(&data, data_end, new_snapid, bad); |
1228 | 1487 | |
1229 | 1488 | ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add", |
1230 | - data_start, data - data_start); | |
1489 | + data_start, data - data_start, &ver); | |
1231 | 1490 | |
1232 | 1491 | kfree(data_start); |
1233 | 1492 | |
... | ... | @@ -1259,6 +1518,7 @@ |
1259 | 1518 | int ret; |
1260 | 1519 | struct rbd_image_header h; |
1261 | 1520 | u64 snap_seq; |
1521 | + int follow_seq = 0; | |
1262 | 1522 | |
1263 | 1523 | ret = rbd_read_header(rbd_dev, &h); |
1264 | 1524 | if (ret < 0) |
... | ... | @@ -1267,6 +1527,11 @@ |
1267 | 1527 | down_write(&rbd_dev->header.snap_rwsem); |
1268 | 1528 | |
1269 | 1529 | snap_seq = rbd_dev->header.snapc->seq; |
1530 | + if (rbd_dev->header.total_snaps && | |
1531 | + rbd_dev->header.snapc->snaps[0] == snap_seq) | |
1532 | + /* pointing at the head, will need to follow that | |
1533 | + if head moves */ | |
1534 | + follow_seq = 1; | |
1270 | 1535 | |
1271 | 1536 | kfree(rbd_dev->header.snapc); |
1272 | 1537 | kfree(rbd_dev->header.snap_names); |
... | ... | @@ -1277,7 +1542,10 @@ |
1277 | 1542 | rbd_dev->header.snap_names = h.snap_names; |
1278 | 1543 | rbd_dev->header.snap_names_len = h.snap_names_len; |
1279 | 1544 | rbd_dev->header.snap_sizes = h.snap_sizes; |
1280 | - rbd_dev->header.snapc->seq = snap_seq; | |
1545 | + if (follow_seq) | |
1546 | + rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0]; | |
1547 | + else | |
1548 | + rbd_dev->header.snapc->seq = snap_seq; | |
1281 | 1549 | |
1282 | 1550 | ret = __rbd_init_snaps_header(rbd_dev); |
1283 | 1551 | |
1284 | 1552 | |
... | ... | @@ -1699,8 +1967,29 @@ |
1699 | 1967 | device_unregister(&rbd_dev->dev); |
1700 | 1968 | } |
1701 | 1969 | |
1702 | -static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count) | |
1970 | +static int rbd_init_watch_dev(struct rbd_device *rbd_dev) | |
1703 | 1971 | { |
1972 | + int ret, rc; | |
1973 | + | |
1974 | + do { | |
1975 | + ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name, | |
1976 | + rbd_dev->header.obj_version); | |
1977 | + if (ret == -ERANGE) { | |
1978 | + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); | |
1979 | + rc = __rbd_update_snaps(rbd_dev); | |
1980 | + mutex_unlock(&ctl_mutex); | |
1981 | + if (rc < 0) | |
1982 | + return rc; | |
1983 | + } | |
1984 | + } while (ret == -ERANGE); | |
1985 | + | |
1986 | + return ret; | |
1987 | +} | |
1988 | + | |
1989 | +static ssize_t rbd_add(struct bus_type *bus, | |
1990 | + const char *buf, | |
1991 | + size_t count) | |
1992 | +{ | |
1704 | 1993 | struct ceph_osd_client *osdc; |
1705 | 1994 | struct rbd_device *rbd_dev; |
1706 | 1995 | ssize_t rc = -ENOMEM; |
... | ... | @@ -1797,6 +2086,10 @@ |
1797 | 2086 | if (rc) |
1798 | 2087 | goto err_out_bus; |
1799 | 2088 | |
2089 | + rc = rbd_init_watch_dev(rbd_dev); | |
2090 | + if (rc) | |
2091 | + goto err_out_bus; | |
2092 | + | |
1800 | 2093 | return count; |
1801 | 2094 | |
1802 | 2095 | err_out_bus: |
... | ... | @@ -1849,6 +2142,12 @@ |
1849 | 2142 | struct rbd_device *rbd_dev = |
1850 | 2143 | container_of(dev, struct rbd_device, dev); |
1851 | 2144 | |
2145 | + if (rbd_dev->watch_request) | |
2146 | + ceph_osdc_unregister_linger_request(&rbd_dev->client->osdc, | |
2147 | + rbd_dev->watch_request); | |
2148 | + if (rbd_dev->watch_event) | |
2149 | + ceph_osdc_cancel_event(rbd_dev->watch_event); | |
2150 | + | |
1852 | 2151 | rbd_put_client(rbd_dev); |
1853 | 2152 | |
1854 | 2153 | /* clean up and free blkdev */ |
1855 | 2154 | |
1856 | 2155 | |
1857 | 2156 | |
... | ... | @@ -1914,14 +2213,24 @@ |
1914 | 2213 | ret = rbd_header_add_snap(rbd_dev, |
1915 | 2214 | name, GFP_KERNEL); |
1916 | 2215 | if (ret < 0) |
1917 | - goto done_unlock; | |
2216 | + goto err_unlock; | |
1918 | 2217 | |
1919 | 2218 | ret = __rbd_update_snaps(rbd_dev); |
1920 | 2219 | if (ret < 0) |
1921 | - goto done_unlock; | |
2220 | + goto err_unlock; | |
1922 | 2221 | |
2222 | + /* shouldn't hold ctl_mutex when notifying.. notify might | |
2223 | + trigger a watch callback that would need to get that mutex */ | |
2224 | + mutex_unlock(&ctl_mutex); | |
2225 | + | |
2226 | + /* make a best effort, don't error if failed */ | |
2227 | + rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name); | |
2228 | + | |
1923 | 2229 | ret = count; |
1924 | -done_unlock: | |
2230 | + kfree(name); | |
2231 | + return ret; | |
2232 | + | |
2233 | +err_unlock: | |
1925 | 2234 | mutex_unlock(&ctl_mutex); |
1926 | 2235 | kfree(name); |
1927 | 2236 | return ret; |
fs/ceph/debugfs.c
... | ... | @@ -210,8 +210,6 @@ |
210 | 210 | if (!fsc->debugfs_congestion_kb) |
211 | 211 | goto out; |
212 | 212 | |
213 | - dout("a\n"); | |
214 | - | |
215 | 213 | snprintf(name, sizeof(name), "../../bdi/%s", |
216 | 214 | dev_name(fsc->backing_dev_info.dev)); |
217 | 215 | fsc->debugfs_bdi = |
... | ... | @@ -221,7 +219,6 @@ |
221 | 219 | if (!fsc->debugfs_bdi) |
222 | 220 | goto out; |
223 | 221 | |
224 | - dout("b\n"); | |
225 | 222 | fsc->debugfs_mdsmap = debugfs_create_file("mdsmap", |
226 | 223 | 0600, |
227 | 224 | fsc->client->debugfs_dir, |
... | ... | @@ -230,7 +227,6 @@ |
230 | 227 | if (!fsc->debugfs_mdsmap) |
231 | 228 | goto out; |
232 | 229 | |
233 | - dout("ca\n"); | |
234 | 230 | fsc->debugfs_mdsc = debugfs_create_file("mdsc", |
235 | 231 | 0600, |
236 | 232 | fsc->client->debugfs_dir, |
... | ... | @@ -239,7 +235,6 @@ |
239 | 235 | if (!fsc->debugfs_mdsc) |
240 | 236 | goto out; |
241 | 237 | |
242 | - dout("da\n"); | |
243 | 238 | fsc->debugfs_caps = debugfs_create_file("caps", |
244 | 239 | 0400, |
245 | 240 | fsc->client->debugfs_dir, |
... | ... | @@ -248,7 +243,6 @@ |
248 | 243 | if (!fsc->debugfs_caps) |
249 | 244 | goto out; |
250 | 245 | |
251 | - dout("ea\n"); | |
252 | 246 | fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru", |
253 | 247 | 0600, |
254 | 248 | fsc->client->debugfs_dir, |
fs/ceph/dir.c
... | ... | @@ -161,7 +161,7 @@ |
161 | 161 | filp->f_pos = di->offset; |
162 | 162 | err = filldir(dirent, dentry->d_name.name, |
163 | 163 | dentry->d_name.len, di->offset, |
164 | - dentry->d_inode->i_ino, | |
164 | + ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), | |
165 | 165 | dentry->d_inode->i_mode >> 12); |
166 | 166 | |
167 | 167 | if (last) { |
168 | 168 | |
169 | 169 | |
... | ... | @@ -245,15 +245,17 @@ |
245 | 245 | |
246 | 246 | dout("readdir off 0 -> '.'\n"); |
247 | 247 | if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), |
248 | - inode->i_ino, inode->i_mode >> 12) < 0) | |
248 | + ceph_translate_ino(inode->i_sb, inode->i_ino), | |
249 | + inode->i_mode >> 12) < 0) | |
249 | 250 | return 0; |
250 | 251 | filp->f_pos = 1; |
251 | 252 | off = 1; |
252 | 253 | } |
253 | 254 | if (filp->f_pos == 1) { |
255 | + ino_t ino = filp->f_dentry->d_parent->d_inode->i_ino; | |
254 | 256 | dout("readdir off 1 -> '..'\n"); |
255 | 257 | if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), |
256 | - filp->f_dentry->d_parent->d_inode->i_ino, | |
258 | + ceph_translate_ino(inode->i_sb, ino), | |
257 | 259 | inode->i_mode >> 12) < 0) |
258 | 260 | return 0; |
259 | 261 | filp->f_pos = 2; |
... | ... | @@ -377,7 +379,8 @@ |
377 | 379 | if (filldir(dirent, |
378 | 380 | rinfo->dir_dname[off - fi->offset], |
379 | 381 | rinfo->dir_dname_len[off - fi->offset], |
380 | - pos, ino, ftype) < 0) { | |
382 | + pos, | |
383 | + ceph_translate_ino(inode->i_sb, ino), ftype) < 0) { | |
381 | 384 | dout("filldir stopping us...\n"); |
382 | 385 | return 0; |
383 | 386 | } |
384 | 387 | |
385 | 388 | |
... | ... | @@ -1024,14 +1027,13 @@ |
1024 | 1027 | } |
1025 | 1028 | |
1026 | 1029 | /* |
1027 | - * When a dentry is released, clear the dir I_COMPLETE if it was part | |
1028 | - * of the current dir gen or if this is in the snapshot namespace. | |
1030 | + * Release our ceph_dentry_info. | |
1029 | 1031 | */ |
1030 | -static void ceph_dentry_release(struct dentry *dentry) | |
1032 | +static void ceph_d_release(struct dentry *dentry) | |
1031 | 1033 | { |
1032 | 1034 | struct ceph_dentry_info *di = ceph_dentry(dentry); |
1033 | 1035 | |
1034 | - dout("dentry_release %p\n", dentry); | |
1036 | + dout("d_release %p\n", dentry); | |
1035 | 1037 | if (di) { |
1036 | 1038 | ceph_dentry_lru_del(dentry); |
1037 | 1039 | if (di->lease_session) |
1038 | 1040 | |
1039 | 1041 | |
... | ... | @@ -1256,15 +1258,15 @@ |
1256 | 1258 | |
1257 | 1259 | const struct dentry_operations ceph_dentry_ops = { |
1258 | 1260 | .d_revalidate = ceph_d_revalidate, |
1259 | - .d_release = ceph_dentry_release, | |
1261 | + .d_release = ceph_d_release, | |
1260 | 1262 | }; |
1261 | 1263 | |
1262 | 1264 | const struct dentry_operations ceph_snapdir_dentry_ops = { |
1263 | 1265 | .d_revalidate = ceph_snapdir_d_revalidate, |
1264 | - .d_release = ceph_dentry_release, | |
1266 | + .d_release = ceph_d_release, | |
1265 | 1267 | }; |
1266 | 1268 | |
1267 | 1269 | const struct dentry_operations ceph_snap_dentry_ops = { |
1268 | - .d_release = ceph_dentry_release, | |
1270 | + .d_release = ceph_d_release, | |
1269 | 1271 | }; |
fs/ceph/file.c
... | ... | @@ -564,11 +564,19 @@ |
564 | 564 | * start_request so that a tid has been assigned. |
565 | 565 | */ |
566 | 566 | spin_lock(&ci->i_unsafe_lock); |
567 | - list_add(&req->r_unsafe_item, &ci->i_unsafe_writes); | |
567 | + list_add_tail(&req->r_unsafe_item, | |
568 | + &ci->i_unsafe_writes); | |
568 | 569 | spin_unlock(&ci->i_unsafe_lock); |
569 | 570 | ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); |
570 | 571 | } |
572 | + | |
571 | 573 | ret = ceph_osdc_wait_request(&fsc->client->osdc, req); |
574 | + if (ret < 0 && req->r_safe_callback) { | |
575 | + spin_lock(&ci->i_unsafe_lock); | |
576 | + list_del_init(&req->r_unsafe_item); | |
577 | + spin_unlock(&ci->i_unsafe_lock); | |
578 | + ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); | |
579 | + } | |
572 | 580 | } |
573 | 581 | |
574 | 582 | if (file->f_flags & O_DIRECT) |
fs/ceph/inode.c
... | ... | @@ -36,6 +36,13 @@ |
36 | 36 | /* |
37 | 37 | * find or create an inode, given the ceph ino number |
38 | 38 | */ |
39 | +static int ceph_set_ino_cb(struct inode *inode, void *data) | |
40 | +{ | |
41 | + ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; | |
42 | + inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data); | |
43 | + return 0; | |
44 | +} | |
45 | + | |
39 | 46 | struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) |
40 | 47 | { |
41 | 48 | struct inode *inode; |
... | ... | @@ -1030,9 +1037,6 @@ |
1030 | 1037 | dout("fill_trace doing d_move %p -> %p\n", |
1031 | 1038 | req->r_old_dentry, dn); |
1032 | 1039 | |
1033 | - /* d_move screws up d_subdirs order */ | |
1034 | - ceph_i_clear(dir, CEPH_I_COMPLETE); | |
1035 | - | |
1036 | 1040 | d_move(req->r_old_dentry, dn); |
1037 | 1041 | dout(" src %p '%.*s' dst %p '%.*s'\n", |
1038 | 1042 | req->r_old_dentry, |
1039 | 1043 | |
... | ... | @@ -1044,12 +1048,15 @@ |
1044 | 1048 | rehashing bug in vfs_rename_dir */ |
1045 | 1049 | ceph_invalidate_dentry_lease(dn); |
1046 | 1050 | |
1047 | - /* take overwritten dentry's readdir offset */ | |
1048 | - dout("dn %p gets %p offset %lld (old offset %lld)\n", | |
1049 | - req->r_old_dentry, dn, ceph_dentry(dn)->offset, | |
1051 | + /* | |
1052 | + * d_move() puts the renamed dentry at the end of | |
1053 | + * d_subdirs. We need to assign it an appropriate | |
1054 | + * directory offset so we can behave when holding | |
1055 | + * I_COMPLETE. | |
1056 | + */ | |
1057 | + ceph_set_dentry_offset(req->r_old_dentry); | |
1058 | + dout("dn %p gets new offset %lld\n", req->r_old_dentry, | |
1050 | 1059 | ceph_dentry(req->r_old_dentry)->offset); |
1051 | - ceph_dentry(req->r_old_dentry)->offset = | |
1052 | - ceph_dentry(dn)->offset; | |
1053 | 1060 | |
1054 | 1061 | dn = req->r_old_dentry; /* use old_dentry */ |
1055 | 1062 | in = dn->d_inode; |
... | ... | @@ -1809,7 +1816,7 @@ |
1809 | 1816 | err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL); |
1810 | 1817 | if (!err) { |
1811 | 1818 | generic_fillattr(inode, stat); |
1812 | - stat->ino = inode->i_ino; | |
1819 | + stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); | |
1813 | 1820 | if (ceph_snap(inode) != CEPH_NOSNAP) |
1814 | 1821 | stat->dev = ceph_snap(inode); |
1815 | 1822 | else |
fs/ceph/super.c
... | ... | @@ -131,6 +131,7 @@ |
131 | 131 | Opt_rbytes, |
132 | 132 | Opt_norbytes, |
133 | 133 | Opt_noasyncreaddir, |
134 | + Opt_ino32, | |
134 | 135 | }; |
135 | 136 | |
136 | 137 | static match_table_t fsopt_tokens = { |
... | ... | @@ -150,6 +151,7 @@ |
150 | 151 | {Opt_rbytes, "rbytes"}, |
151 | 152 | {Opt_norbytes, "norbytes"}, |
152 | 153 | {Opt_noasyncreaddir, "noasyncreaddir"}, |
154 | + {Opt_ino32, "ino32"}, | |
153 | 155 | {-1, NULL} |
154 | 156 | }; |
155 | 157 | |
... | ... | @@ -225,6 +227,9 @@ |
225 | 227 | case Opt_noasyncreaddir: |
226 | 228 | fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; |
227 | 229 | break; |
230 | + case Opt_ino32: | |
231 | + fsopt->flags |= CEPH_MOUNT_OPT_INO32; | |
232 | + break; | |
228 | 233 | default: |
229 | 234 | BUG_ON(token); |
230 | 235 | } |
... | ... | @@ -288,7 +293,7 @@ |
288 | 293 | fsopt->sb_flags = flags; |
289 | 294 | fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; |
290 | 295 | |
291 | - fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT; | |
296 | + fsopt->rsize = CEPH_RSIZE_DEFAULT; | |
292 | 297 | fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); |
293 | 298 | fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; |
294 | 299 | fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; |
... | ... | @@ -370,7 +375,7 @@ |
370 | 375 | |
371 | 376 | if (fsopt->wsize) |
372 | 377 | seq_printf(m, ",wsize=%d", fsopt->wsize); |
373 | - if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT) | |
378 | + if (fsopt->rsize != CEPH_RSIZE_DEFAULT) | |
374 | 379 | seq_printf(m, ",rsize=%d", fsopt->rsize); |
375 | 380 | if (fsopt->congestion_kb != default_congestion_kb()) |
376 | 381 | seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); |
fs/ceph/super.h
... | ... | @@ -27,6 +27,7 @@ |
27 | 27 | #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ |
28 | 28 | #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ |
29 | 29 | #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ |
30 | +#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ | |
30 | 31 | |
31 | 32 | #define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) |
32 | 33 | |
... | ... | @@ -35,6 +36,7 @@ |
35 | 36 | #define ceph_test_mount_opt(fsc, opt) \ |
36 | 37 | (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) |
37 | 38 | |
39 | +#define CEPH_RSIZE_DEFAULT (512*1024) /* readahead */ | |
38 | 40 | #define CEPH_MAX_READDIR_DEFAULT 1024 |
39 | 41 | #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) |
40 | 42 | #define CEPH_SNAPDIRNAME_DEFAULT ".snap" |
... | ... | @@ -319,6 +321,16 @@ |
319 | 321 | return container_of(inode, struct ceph_inode_info, vfs_inode); |
320 | 322 | } |
321 | 323 | |
324 | +static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode) | |
325 | +{ | |
326 | + return (struct ceph_fs_client *)inode->i_sb->s_fs_info; | |
327 | +} | |
328 | + | |
329 | +static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb) | |
330 | +{ | |
331 | + return (struct ceph_fs_client *)sb->s_fs_info; | |
332 | +} | |
333 | + | |
322 | 334 | static inline struct ceph_vino ceph_vino(struct inode *inode) |
323 | 335 | { |
324 | 336 | return ceph_inode(inode)->i_vino; |
325 | 337 | |
326 | 338 | |
327 | 339 | |
... | ... | @@ -327,19 +339,49 @@ |
327 | 339 | /* |
328 | 340 | * ino_t is <64 bits on many architectures, blech. |
329 | 341 | * |
330 | - * don't include snap in ino hash, at least for now. | |
342 | + * i_ino (kernel inode) st_ino (userspace) | |
343 | + * i386 32 32 | |
344 | + * x86_64+ino32 64 32 | |
345 | + * x86_64 64 64 | |
331 | 346 | */ |
347 | +static inline u32 ceph_ino_to_ino32(ino_t ino) | |
348 | +{ | |
349 | + ino ^= ino >> (sizeof(ino) * 8 - 32); | |
350 | + if (!ino) | |
351 | + ino = 1; | |
352 | + return ino; | |
353 | +} | |
354 | + | |
355 | +/* | |
356 | + * kernel i_ino value | |
357 | + */ | |
332 | 358 | static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) |
333 | 359 | { |
334 | 360 | ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ |
335 | 361 | #if BITS_PER_LONG == 32 |
336 | - ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8; | |
337 | - if (!ino) | |
338 | - ino = 1; | |
362 | + ino = ceph_ino_to_ino32(ino); | |
339 | 363 | #endif |
340 | 364 | return ino; |
341 | 365 | } |
342 | 366 | |
367 | +/* | |
368 | + * user-visible ino (stat, filldir) | |
369 | + */ | |
370 | +#if BITS_PER_LONG == 32 | |
371 | +static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino) | |
372 | +{ | |
373 | + return ino; | |
374 | +} | |
375 | +#else | |
376 | +static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino) | |
377 | +{ | |
378 | + if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)) | |
379 | + ino = ceph_ino_to_ino32(ino); | |
380 | + return ino; | |
381 | +} | |
382 | +#endif | |
383 | + | |
384 | + | |
343 | 385 | /* for printf-style formatting */ |
344 | 386 | #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap |
345 | 387 | |
... | ... | @@ -428,13 +470,6 @@ |
428 | 470 | return ((loff_t)frag << 32) | (loff_t)off; |
429 | 471 | } |
430 | 472 | |
431 | -static inline int ceph_set_ino_cb(struct inode *inode, void *data) | |
432 | -{ | |
433 | - ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; | |
434 | - inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data); | |
435 | - return 0; | |
436 | -} | |
437 | - | |
438 | 473 | /* |
439 | 474 | * caps helpers |
440 | 475 | */ |
... | ... | @@ -503,15 +538,6 @@ |
503 | 538 | int *total, int *avail, int *used, |
504 | 539 | int *reserved, int *min); |
505 | 540 | |
506 | -static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode) | |
507 | -{ | |
508 | - return (struct ceph_fs_client *)inode->i_sb->s_fs_info; | |
509 | -} | |
510 | - | |
511 | -static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb) | |
512 | -{ | |
513 | - return (struct ceph_fs_client *)sb->s_fs_info; | |
514 | -} | |
515 | 541 | |
516 | 542 | |
517 | 543 | /* |
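
The ino32 mount option wired up above folds the 64-bit Ceph inode number into 32 bits by XORing the high half into the low half (ceph_ino_to_ino32 in the hunk above), so 32-bit userspace sees a stable, non-zero st_ino from stat() and readdir(). A minimal userspace sketch of that 64-bit folding follows; it is illustrative only (fold_ino32 and the sample values are not from the commit):

    #include <stdint.h>
    #include <stdio.h>

    /* Fold a 64-bit inode number to 32 bits the same way the hunk above
     * does: XOR the top half into the bottom half and never return 0. */
    static uint32_t fold_ino32(uint64_t ino)
    {
            ino ^= ino >> 32;
            if (!ino)
                    ino = 1;        /* 0 is not a valid inode number */
            return (uint32_t)ino;
    }

    int main(void)
    {
            uint64_t samples[] = { 0x10000000001ULL, 0x2000000000000ULL, 0 };
            int i;

            for (i = 0; i < 3; i++)
                    printf("0x%llx -> 0x%x\n",
                           (unsigned long long)samples[i],
                           fold_ino32(samples[i]));
            return 0;
    }
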
include/linux/ceph/ceph_fs.h
... | ... | @@ -136,10 +136,19 @@ |
136 | 136 | |
137 | 137 | |
138 | 138 | /* osd */ |
139 | -#define CEPH_MSG_OSD_MAP 41 | |
140 | -#define CEPH_MSG_OSD_OP 42 | |
141 | -#define CEPH_MSG_OSD_OPREPLY 43 | |
139 | +#define CEPH_MSG_OSD_MAP 41 | |
140 | +#define CEPH_MSG_OSD_OP 42 | |
141 | +#define CEPH_MSG_OSD_OPREPLY 43 | |
142 | +#define CEPH_MSG_WATCH_NOTIFY 44 | |
142 | 143 | |
144 | + | |
145 | +/* watch-notify operations */ | |
146 | +enum { | |
147 | + WATCH_NOTIFY = 1, /* notifying watcher */ | |
148 | + WATCH_NOTIFY_COMPLETE = 2, /* notifier notified when done */ | |
149 | +}; | |
150 | + | |
151 | + | |
143 | 152 | /* pool operations */ |
144 | 153 | enum { |
145 | 154 | POOL_OP_CREATE = 0x01, |
146 | 155 | |
... | ... | @@ -213,8 +222,10 @@ |
213 | 222 | struct ceph_mon_request_header monhdr; |
214 | 223 | } __attribute__ ((packed)); |
215 | 224 | |
225 | +#define CEPH_SUBSCRIBE_ONETIME 1 /* i want only 1 update after have */ | |
226 | + | |
216 | 227 | struct ceph_mon_subscribe_item { |
217 | - __le64 have_version; __le64 have; | |
228 | + __le64 have_version; __le64 have; | |
218 | 229 | __u8 onetime; |
219 | 230 | } __attribute__ ((packed)); |
220 | 231 |
include/linux/ceph/libceph.h
... | ... | @@ -71,7 +71,6 @@ |
71 | 71 | #define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */ |
72 | 72 | #define CEPH_OSD_KEEPALIVE_DEFAULT 5 |
73 | 73 | #define CEPH_OSD_IDLE_TTL_DEFAULT 60 |
74 | -#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ | |
75 | 74 | |
76 | 75 | #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) |
77 | 76 | #define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) |
include/linux/ceph/osd_client.h
... | ... | @@ -32,6 +32,7 @@ |
32 | 32 | struct rb_node o_node; |
33 | 33 | struct ceph_connection o_con; |
34 | 34 | struct list_head o_requests; |
35 | + struct list_head o_linger_requests; | |
35 | 36 | struct list_head o_osd_lru; |
36 | 37 | struct ceph_authorizer *o_authorizer; |
37 | 38 | void *o_authorizer_buf, *o_authorizer_reply_buf; |
... | ... | @@ -47,6 +48,8 @@ |
47 | 48 | struct rb_node r_node; |
48 | 49 | struct list_head r_req_lru_item; |
49 | 50 | struct list_head r_osd_item; |
51 | + struct list_head r_linger_item; | |
52 | + struct list_head r_linger_osd; | |
50 | 53 | struct ceph_osd *r_osd; |
51 | 54 | struct ceph_pg r_pgid; |
52 | 55 | int r_pg_osds[CEPH_PG_MAX_SIZE]; |
... | ... | @@ -59,6 +62,7 @@ |
59 | 62 | int r_flags; /* any additional flags for the osd */ |
60 | 63 | u32 r_sent; /* >0 if r_request is sending/sent */ |
61 | 64 | int r_got_reply; |
65 | + int r_linger; | |
62 | 66 | |
63 | 67 | struct ceph_osd_client *r_osdc; |
64 | 68 | struct kref r_kref; |
... | ... | @@ -74,7 +78,6 @@ |
74 | 78 | char r_oid[40]; /* object name */ |
75 | 79 | int r_oid_len; |
76 | 80 | unsigned long r_stamp; /* send OR check time */ |
77 | - bool r_resend; /* msg send failed, needs retry */ | |
78 | 81 | |
79 | 82 | struct ceph_file_layout r_file_layout; |
80 | 83 | struct ceph_snap_context *r_snapc; /* snap context for writes */ |
... | ... | @@ -90,6 +93,26 @@ |
90 | 93 | struct ceph_pagelist *r_trail; /* trailing part of the data */ |
91 | 94 | }; |
92 | 95 | |
96 | +struct ceph_osd_event { | |
97 | + u64 cookie; | |
98 | + int one_shot; | |
99 | + struct ceph_osd_client *osdc; | |
100 | + void (*cb)(u64, u64, u8, void *); | |
101 | + void *data; | |
102 | + struct rb_node node; | |
103 | + struct list_head osd_node; | |
104 | + struct kref kref; | |
105 | + struct completion completion; | |
106 | +}; | |
107 | + | |
108 | +struct ceph_osd_event_work { | |
109 | + struct work_struct work; | |
110 | + struct ceph_osd_event *event; | |
111 | + u64 ver; | |
112 | + u64 notify_id; | |
113 | + u8 opcode; | |
114 | +}; | |
115 | + | |
93 | 116 | struct ceph_osd_client { |
94 | 117 | struct ceph_client *client; |
95 | 118 | |
... | ... | @@ -104,7 +127,10 @@ |
104 | 127 | u64 timeout_tid; /* tid of timeout triggering rq */ |
105 | 128 | u64 last_tid; /* tid of last request */ |
106 | 129 | struct rb_root requests; /* pending requests */ |
107 | - struct list_head req_lru; /* pending requests lru */ | |
130 | + struct list_head req_lru; /* in-flight lru */ | |
131 | + struct list_head req_unsent; /* unsent/need-resend queue */ | |
132 | + struct list_head req_notarget; /* map to no osd */ | |
133 | + struct list_head req_linger; /* lingering requests */ | |
108 | 134 | int num_requests; |
109 | 135 | struct delayed_work timeout_work; |
110 | 136 | struct delayed_work osds_timeout_work; |
... | ... | @@ -116,6 +142,12 @@ |
116 | 142 | |
117 | 143 | struct ceph_msgpool msgpool_op; |
118 | 144 | struct ceph_msgpool msgpool_op_reply; |
145 | + | |
146 | + spinlock_t event_lock; | |
147 | + struct rb_root event_tree; | |
148 | + u64 event_count; | |
149 | + | |
150 | + struct workqueue_struct *notify_wq; | |
119 | 151 | }; |
120 | 152 | |
121 | 153 | struct ceph_osd_req_op { |
... | ... | @@ -150,6 +182,13 @@ |
150 | 182 | struct { |
151 | 183 | u64 snapid; |
152 | 184 | } snap; |
185 | + struct { | |
186 | + u64 cookie; | |
187 | + u64 ver; | |
188 | + __u8 flag; | |
189 | + u32 prot_ver; | |
190 | + u32 timeout; | |
191 | + } watch; | |
153 | 192 | }; |
154 | 193 | u32 payload_len; |
155 | 194 | }; |
... | ... | @@ -198,6 +237,11 @@ |
198 | 237 | bool use_mempool, int num_reply, |
199 | 238 | int page_align); |
200 | 239 | |
240 | +extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, | |
241 | + struct ceph_osd_request *req); | |
242 | +extern void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, | |
243 | + struct ceph_osd_request *req); | |
244 | + | |
201 | 245 | static inline void ceph_osdc_get_request(struct ceph_osd_request *req) |
202 | 246 | { |
203 | 247 | kref_get(&req->r_kref); |
... | ... | @@ -233,5 +277,14 @@ |
233 | 277 | struct page **pages, int nr_pages, |
234 | 278 | int flags, int do_sync, bool nofail); |
235 | 279 | |
280 | +/* watch/notify events */ | |
281 | +extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, | |
282 | + void (*event_cb)(u64, u64, u8, void *), | |
283 | + int one_shot, void *data, | |
284 | + struct ceph_osd_event **pevent); | |
285 | +extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); | |
286 | +extern int ceph_osdc_wait_event(struct ceph_osd_event *event, | |
287 | + unsigned long timeout); | |
288 | +extern void ceph_osdc_put_event(struct ceph_osd_event *event); | |
236 | 289 | #endif |
include/linux/ceph/rados.h
... | ... | @@ -12,9 +12,9 @@ |
12 | 12 | * osdmap encoding versions |
13 | 13 | */ |
14 | 14 | #define CEPH_OSDMAP_INC_VERSION 5 |
15 | -#define CEPH_OSDMAP_INC_VERSION_EXT 5 | |
15 | +#define CEPH_OSDMAP_INC_VERSION_EXT 6 | |
16 | 16 | #define CEPH_OSDMAP_VERSION 5 |
17 | -#define CEPH_OSDMAP_VERSION_EXT 5 | |
17 | +#define CEPH_OSDMAP_VERSION_EXT 6 | |
18 | 18 | |
19 | 19 | /* |
20 | 20 | * fs id |
21 | 21 | |
22 | 22 | |
... | ... | @@ -181,10 +181,18 @@ |
181 | 181 | /* read */ |
182 | 182 | CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1, |
183 | 183 | CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2, |
184 | + CEPH_OSD_OP_MAPEXT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 3, | |
184 | 185 | |
185 | 186 | /* fancy read */ |
186 | - CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4, | |
187 | + CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4, | |
188 | + CEPH_OSD_OP_SPARSE_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 5, | |
187 | 189 | |
190 | + CEPH_OSD_OP_NOTIFY = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 6, | |
191 | + CEPH_OSD_OP_NOTIFY_ACK = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 7, | |
192 | + | |
193 | + /* versioning */ | |
194 | + CEPH_OSD_OP_ASSERT_VER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 8, | |
195 | + | |
188 | 196 | /* write */ |
189 | 197 | CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1, |
190 | 198 | CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2, |
... | ... | @@ -205,6 +213,8 @@ |
205 | 213 | CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13, |
206 | 214 | CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14, |
207 | 215 | |
216 | + CEPH_OSD_OP_WATCH = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 15, | |
217 | + | |
208 | 218 | /** attrs **/ |
209 | 219 | /* read */ |
210 | 220 | CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, |
... | ... | @@ -218,11 +228,14 @@ |
218 | 228 | CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4, |
219 | 229 | |
220 | 230 | /** subop **/ |
221 | - CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1, | |
222 | - CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2, | |
223 | - CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3, | |
224 | - CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4, | |
225 | - CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5, | |
231 | + CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1, | |
232 | + CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2, | |
233 | + CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3, | |
234 | + CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4, | |
235 | + CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5, | |
236 | + CEPH_OSD_OP_SCRUB_RESERVE = CEPH_OSD_OP_MODE_SUB | 6, | |
237 | + CEPH_OSD_OP_SCRUB_UNRESERVE = CEPH_OSD_OP_MODE_SUB | 7, | |
238 | + CEPH_OSD_OP_SCRUB_STOP = CEPH_OSD_OP_MODE_SUB | 8, | |
226 | 239 | |
227 | 240 | /** lock **/ |
228 | 241 | CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1, |
... | ... | @@ -328,6 +341,8 @@ |
328 | 341 | CEPH_OSD_CMPXATTR_MODE_U64 = 2 |
329 | 342 | }; |
330 | 343 | |
344 | +#define RADOS_NOTIFY_VER 1 | |
345 | + | |
331 | 346 | /* |
332 | 347 | * an individual object operation. each may be accompanied by some data |
333 | 348 | * payload |
... | ... | @@ -359,7 +374,12 @@ |
359 | 374 | struct { |
360 | 375 | __le64 snapid; |
361 | 376 | } __attribute__ ((packed)) snap; |
362 | - }; | |
377 | + struct { | |
378 | + __le64 cookie; | |
379 | + __le64 ver; | |
380 | + __u8 flag; /* 0 = unwatch, 1 = watch */ | |
381 | + } __attribute__ ((packed)) watch; | |
382 | +}; | |
363 | 383 | __le32 payload_len; |
364 | 384 | } __attribute__ ((packed)); |
365 | 385 | |
... | ... | @@ -400,6 +420,7 @@ |
400 | 420 | __le32 num_ops; |
401 | 421 | struct ceph_osd_op ops[0]; /* ops[], object */ |
402 | 422 | } __attribute__ ((packed)); |
423 | + | |
403 | 424 | |
404 | 425 | |
405 | 426 | #endif |
net/ceph/armor.c
net/ceph/ceph_common.c
net/ceph/osd_client.c
... | ... | @@ -22,10 +22,15 @@ |
22 | 22 | #define OSD_OPREPLY_FRONT_LEN 512 |
23 | 23 | |
24 | 24 | static const struct ceph_connection_operations osd_con_ops; |
25 | -static int __kick_requests(struct ceph_osd_client *osdc, | |
26 | - struct ceph_osd *kickosd); | |
27 | 25 | |
28 | -static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd); | |
26 | +static void send_queued(struct ceph_osd_client *osdc); | |
27 | +static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd); | |
28 | +static void __register_request(struct ceph_osd_client *osdc, | |
29 | + struct ceph_osd_request *req); | |
30 | +static void __unregister_linger_request(struct ceph_osd_client *osdc, | |
31 | + struct ceph_osd_request *req); | |
32 | +static int __send_request(struct ceph_osd_client *osdc, | |
33 | + struct ceph_osd_request *req); | |
29 | 34 | |
30 | 35 | static int op_needs_trail(int op) |
31 | 36 | { |
... | ... | @@ -34,6 +39,7 @@ |
34 | 39 | case CEPH_OSD_OP_SETXATTR: |
35 | 40 | case CEPH_OSD_OP_CMPXATTR: |
36 | 41 | case CEPH_OSD_OP_CALL: |
42 | + case CEPH_OSD_OP_NOTIFY: | |
37 | 43 | return 1; |
38 | 44 | default: |
39 | 45 | return 0; |
... | ... | @@ -209,6 +215,8 @@ |
209 | 215 | init_completion(&req->r_completion); |
210 | 216 | init_completion(&req->r_safe_completion); |
211 | 217 | INIT_LIST_HEAD(&req->r_unsafe_item); |
218 | + INIT_LIST_HEAD(&req->r_linger_item); | |
219 | + INIT_LIST_HEAD(&req->r_linger_osd); | |
212 | 220 | req->r_flags = flags; |
213 | 221 | |
214 | 222 | WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); |
... | ... | @@ -315,6 +323,24 @@ |
315 | 323 | break; |
316 | 324 | case CEPH_OSD_OP_STARTSYNC: |
317 | 325 | break; |
326 | + case CEPH_OSD_OP_NOTIFY: | |
327 | + { | |
328 | + __le32 prot_ver = cpu_to_le32(src->watch.prot_ver); | |
329 | + __le32 timeout = cpu_to_le32(src->watch.timeout); | |
330 | + | |
331 | + BUG_ON(!req->r_trail); | |
332 | + | |
333 | + ceph_pagelist_append(req->r_trail, | |
334 | + &prot_ver, sizeof(prot_ver)); | |
335 | + ceph_pagelist_append(req->r_trail, | |
336 | + &timeout, sizeof(timeout)); | |
337 | + } | |
338 | + case CEPH_OSD_OP_NOTIFY_ACK: | |
339 | + case CEPH_OSD_OP_WATCH: | |
340 | + dst->watch.cookie = cpu_to_le64(src->watch.cookie); | |
341 | + dst->watch.ver = cpu_to_le64(src->watch.ver); | |
342 | + dst->watch.flag = src->watch.flag; | |
343 | + break; | |
318 | 344 | default: |
319 | 345 | pr_err("unrecognized osd opcode %d\n", dst->op); |
320 | 346 | WARN_ON(1); |
321 | 347 | |
... | ... | @@ -529,7 +555,46 @@ |
529 | 555 | return NULL; |
530 | 556 | } |
531 | 557 | |
558 | +/* | |
559 | + * Resubmit requests pending on the given osd. | |
560 | + */ | |
561 | +static void __kick_osd_requests(struct ceph_osd_client *osdc, | |
562 | + struct ceph_osd *osd) | |
563 | +{ | |
564 | + struct ceph_osd_request *req, *nreq; | |
565 | + int err; | |
532 | 566 | |
567 | + dout("__kick_osd_requests osd%d\n", osd->o_osd); | |
568 | + err = __reset_osd(osdc, osd); | |
569 | + if (err == -EAGAIN) | |
570 | + return; | |
571 | + | |
572 | + list_for_each_entry(req, &osd->o_requests, r_osd_item) { | |
573 | + list_move(&req->r_req_lru_item, &osdc->req_unsent); | |
574 | + dout("requeued %p tid %llu osd%d\n", req, req->r_tid, | |
575 | + osd->o_osd); | |
576 | + if (!req->r_linger) | |
577 | + req->r_flags |= CEPH_OSD_FLAG_RETRY; | |
578 | + } | |
579 | + | |
580 | + list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, | |
581 | + r_linger_osd) { | |
582 | + __unregister_linger_request(osdc, req); | |
583 | + __register_request(osdc, req); | |
584 | + list_move(&req->r_req_lru_item, &osdc->req_unsent); | |
585 | + dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid, | |
586 | + osd->o_osd); | |
587 | + } | |
588 | +} | |
589 | + | |
590 | +static void kick_osd_requests(struct ceph_osd_client *osdc, | |
591 | + struct ceph_osd *kickosd) | |
592 | +{ | |
593 | + mutex_lock(&osdc->request_mutex); | |
594 | + __kick_osd_requests(osdc, kickosd); | |
595 | + mutex_unlock(&osdc->request_mutex); | |
596 | +} | |
597 | + | |
533 | 598 | /* |
534 | 599 | * If the osd connection drops, we need to resubmit all requests. |
535 | 600 | */ |
... | ... | @@ -543,7 +608,8 @@ |
543 | 608 | dout("osd_reset osd%d\n", osd->o_osd); |
544 | 609 | osdc = osd->o_osdc; |
545 | 610 | down_read(&osdc->map_sem); |
546 | - kick_requests(osdc, osd); | |
611 | + kick_osd_requests(osdc, osd); | |
612 | + send_queued(osdc); | |
547 | 613 | up_read(&osdc->map_sem); |
548 | 614 | } |
549 | 615 | |
... | ... | @@ -561,6 +627,7 @@ |
561 | 627 | atomic_set(&osd->o_ref, 1); |
562 | 628 | osd->o_osdc = osdc; |
563 | 629 | INIT_LIST_HEAD(&osd->o_requests); |
630 | + INIT_LIST_HEAD(&osd->o_linger_requests); | |
564 | 631 | INIT_LIST_HEAD(&osd->o_osd_lru); |
565 | 632 | osd->o_incarnation = 1; |
566 | 633 | |
... | ... | @@ -650,7 +717,8 @@ |
650 | 717 | int ret = 0; |
651 | 718 | |
652 | 719 | dout("__reset_osd %p osd%d\n", osd, osd->o_osd); |
653 | - if (list_empty(&osd->o_requests)) { | |
720 | + if (list_empty(&osd->o_requests) && | |
721 | + list_empty(&osd->o_linger_requests)) { | |
654 | 722 | __remove_osd(osdc, osd); |
655 | 723 | } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd], |
656 | 724 | &osd->o_con.peer_addr, |
657 | 725 | |
... | ... | @@ -723,10 +791,9 @@ |
723 | 791 | * Register request, assign tid. If this is the first request, set up |
724 | 792 | * the timeout event. |
725 | 793 | */ |
726 | -static void register_request(struct ceph_osd_client *osdc, | |
727 | - struct ceph_osd_request *req) | |
794 | +static void __register_request(struct ceph_osd_client *osdc, | |
795 | + struct ceph_osd_request *req) | |
728 | 796 | { |
729 | - mutex_lock(&osdc->request_mutex); | |
730 | 797 | req->r_tid = ++osdc->last_tid; |
731 | 798 | req->r_request->hdr.tid = cpu_to_le64(req->r_tid); |
732 | 799 | INIT_LIST_HEAD(&req->r_req_lru_item); |
... | ... | @@ -740,6 +807,13 @@ |
740 | 807 | dout(" first request, scheduling timeout\n"); |
741 | 808 | __schedule_osd_timeout(osdc); |
742 | 809 | } |
810 | +} | |
811 | + | |
812 | +static void register_request(struct ceph_osd_client *osdc, | |
813 | + struct ceph_osd_request *req) | |
814 | +{ | |
815 | + mutex_lock(&osdc->request_mutex); | |
816 | + __register_request(osdc, req); | |
743 | 817 | mutex_unlock(&osdc->request_mutex); |
744 | 818 | } |
745 | 819 | |
746 | 820 | |
... | ... | @@ -758,9 +832,14 @@ |
758 | 832 | ceph_con_revoke(&req->r_osd->o_con, req->r_request); |
759 | 833 | |
760 | 834 | list_del_init(&req->r_osd_item); |
761 | - if (list_empty(&req->r_osd->o_requests)) | |
835 | + if (list_empty(&req->r_osd->o_requests) && | |
836 | + list_empty(&req->r_osd->o_linger_requests)) { | |
837 | + dout("moving osd to %p lru\n", req->r_osd); | |
762 | 838 | __move_osd_to_lru(osdc, req->r_osd); |
763 | - req->r_osd = NULL; | |
839 | + } | |
840 | + if (list_empty(&req->r_osd_item) && | |
841 | + list_empty(&req->r_linger_item)) | |
842 | + req->r_osd = NULL; | |
764 | 843 | } |
765 | 844 | |
766 | 845 | ceph_osdc_put_request(req); |
767 | 846 | |
768 | 847 | |
769 | 848 | |
... | ... | @@ -781,20 +860,72 @@ |
781 | 860 | ceph_con_revoke(&req->r_osd->o_con, req->r_request); |
782 | 861 | req->r_sent = 0; |
783 | 862 | } |
784 | - list_del_init(&req->r_req_lru_item); | |
785 | 863 | } |
786 | 864 | |
865 | +static void __register_linger_request(struct ceph_osd_client *osdc, | |
866 | + struct ceph_osd_request *req) | |
867 | +{ | |
868 | + dout("__register_linger_request %p\n", req); | |
869 | + list_add_tail(&req->r_linger_item, &osdc->req_linger); | |
870 | + list_add_tail(&req->r_linger_osd, &req->r_osd->o_linger_requests); | |
871 | +} | |
872 | + | |
873 | +static void __unregister_linger_request(struct ceph_osd_client *osdc, | |
874 | + struct ceph_osd_request *req) | |
875 | +{ | |
876 | + dout("__unregister_linger_request %p\n", req); | |
877 | + if (req->r_osd) { | |
878 | + list_del_init(&req->r_linger_item); | |
879 | + list_del_init(&req->r_linger_osd); | |
880 | + | |
881 | + if (list_empty(&req->r_osd->o_requests) && | |
882 | + list_empty(&req->r_osd->o_linger_requests)) { | |
883 | + dout("moving osd to %p lru\n", req->r_osd); | |
884 | + __move_osd_to_lru(osdc, req->r_osd); | |
885 | + } | |
886 | + req->r_osd = NULL; | |
887 | + } | |
888 | +} | |
889 | + | |
890 | +void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, | |
891 | + struct ceph_osd_request *req) | |
892 | +{ | |
893 | + mutex_lock(&osdc->request_mutex); | |
894 | + if (req->r_linger) { | |
895 | + __unregister_linger_request(osdc, req); | |
896 | + ceph_osdc_put_request(req); | |
897 | + } | |
898 | + mutex_unlock(&osdc->request_mutex); | |
899 | +} | |
900 | +EXPORT_SYMBOL(ceph_osdc_unregister_linger_request); | |
901 | + | |
902 | +void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, | |
903 | + struct ceph_osd_request *req) | |
904 | +{ | |
905 | + if (!req->r_linger) { | |
906 | + dout("set_request_linger %p\n", req); | |
907 | + req->r_linger = 1; | |
908 | + /* | |
909 | + * caller is now responsible for calling | |
910 | + * unregister_linger_request | |
911 | + */ | |
912 | + ceph_osdc_get_request(req); | |
913 | + } | |
914 | +} | |
915 | +EXPORT_SYMBOL(ceph_osdc_set_request_linger); | |
916 | + | |
787 | 917 | /* |
788 | 918 | * Pick an osd (the first 'up' osd in the pg), allocate the osd struct |
789 | 919 | * (as needed), and set the request r_osd appropriately. If there is |
790 | - * no up osd, set r_osd to NULL. | |
920 | + * no up osd, set r_osd to NULL. Move the request to the appropiate list | |
921 | + * (unsent, homeless) or leave on in-flight lru. | |
791 | 922 | * |
792 | 923 | * Return 0 if unchanged, 1 if changed, or negative on error. |
793 | 924 | * |
794 | 925 | * Caller should hold map_sem for read and request_mutex. |
795 | 926 | */ |
796 | -static int __map_osds(struct ceph_osd_client *osdc, | |
797 | - struct ceph_osd_request *req) | |
927 | +static int __map_request(struct ceph_osd_client *osdc, | |
928 | + struct ceph_osd_request *req) | |
798 | 929 | { |
799 | 930 | struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; |
800 | 931 | struct ceph_pg pgid; |
801 | 932 | |
802 | 933 | |
... | ... | @@ -802,11 +933,13 @@ |
802 | 933 | int o = -1, num = 0; |
803 | 934 | int err; |
804 | 935 | |
805 | - dout("map_osds %p tid %lld\n", req, req->r_tid); | |
936 | + dout("map_request %p tid %lld\n", req, req->r_tid); | |
806 | 937 | err = ceph_calc_object_layout(&reqhead->layout, req->r_oid, |
807 | 938 | &req->r_file_layout, osdc->osdmap); |
808 | - if (err) | |
939 | + if (err) { | |
940 | + list_move(&req->r_req_lru_item, &osdc->req_notarget); | |
809 | 941 | return err; |
942 | + } | |
810 | 943 | pgid = reqhead->layout.ol_pgid; |
811 | 944 | req->r_pgid = pgid; |
812 | 945 | |
... | ... | @@ -823,7 +956,7 @@ |
823 | 956 | (req->r_osd == NULL && o == -1)) |
824 | 957 | return 0; /* no change */ |
825 | 958 | |
826 | - dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n", | |
959 | + dout("map_request tid %llu pgid %d.%x osd%d (was osd%d)\n", | |
827 | 960 | req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o, |
828 | 961 | req->r_osd ? req->r_osd->o_osd : -1); |
829 | 962 | |
830 | 963 | |
831 | 964 | |
... | ... | @@ -841,10 +974,12 @@ |
841 | 974 | if (!req->r_osd && o >= 0) { |
842 | 975 | err = -ENOMEM; |
843 | 976 | req->r_osd = create_osd(osdc); |
844 | - if (!req->r_osd) | |
977 | + if (!req->r_osd) { | |
978 | + list_move(&req->r_req_lru_item, &osdc->req_notarget); | |
845 | 979 | goto out; |
980 | + } | |
846 | 981 | |
847 | - dout("map_osds osd %p is osd%d\n", req->r_osd, o); | |
982 | + dout("map_request osd %p is osd%d\n", req->r_osd, o); | |
848 | 983 | req->r_osd->o_osd = o; |
849 | 984 | req->r_osd->o_con.peer_name.num = cpu_to_le64(o); |
850 | 985 | __insert_osd(osdc, req->r_osd); |
... | ... | @@ -855,6 +990,9 @@ |
855 | 990 | if (req->r_osd) { |
856 | 991 | __remove_osd_from_lru(req->r_osd); |
857 | 992 | list_add(&req->r_osd_item, &req->r_osd->o_requests); |
993 | + list_move(&req->r_req_lru_item, &osdc->req_unsent); | |
994 | + } else { | |
995 | + list_move(&req->r_req_lru_item, &osdc->req_notarget); | |
858 | 996 | } |
859 | 997 | err = 1; /* osd or pg changed */ |
860 | 998 | |
861 | 999 | |
... | ... | @@ -869,17 +1007,7 @@ |
869 | 1007 | struct ceph_osd_request *req) |
870 | 1008 | { |
871 | 1009 | struct ceph_osd_request_head *reqhead; |
872 | - int err; | |
873 | 1010 | |
874 | - err = __map_osds(osdc, req); | |
875 | - if (err < 0) | |
876 | - return err; | |
877 | - if (req->r_osd == NULL) { | |
878 | - dout("send_request %p no up osds in pg\n", req); | |
879 | - ceph_monc_request_next_osdmap(&osdc->client->monc); | |
880 | - return 0; | |
881 | - } | |
882 | - | |
883 | 1011 | dout("send_request %p tid %llu to osd%d flags %d\n", |
884 | 1012 | req, req->r_tid, req->r_osd->o_osd, req->r_flags); |
885 | 1013 | |
... | ... | @@ -898,6 +1026,21 @@ |
898 | 1026 | } |
899 | 1027 | |
900 | 1028 | /* |
1029 | + * Send any requests in the queue (req_unsent). | |
1030 | + */ | |
1031 | +static void send_queued(struct ceph_osd_client *osdc) | |
1032 | +{ | |
1033 | + struct ceph_osd_request *req, *tmp; | |
1034 | + | |
1035 | + dout("send_queued\n"); | |
1036 | + mutex_lock(&osdc->request_mutex); | |
1037 | + list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) { | |
1038 | + __send_request(osdc, req); | |
1039 | + } | |
1040 | + mutex_unlock(&osdc->request_mutex); | |
1041 | +} | |
1042 | + | |
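send_queued() acquires request_mutex itself, so the callers introduced by this patch (handle_timeout() below and the osdmap handlers) share the same shape: remap and queue under the lock, then flush after dropping it. Roughly:

/* Illustrative only: the locking shape shared by the send_queued() callers. */
static void example_remap_and_flush(struct ceph_osd_client *osdc)
{
        down_read(&osdc->map_sem);

        mutex_lock(&osdc->request_mutex);
        /* ... __map_request() moves remapped requests onto osdc->req_unsent ... */
        mutex_unlock(&osdc->request_mutex);

        send_queued(osdc);              /* re-takes request_mutex internally */

        up_read(&osdc->map_sem);
}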
1043 | +/* | |
901 | 1044 | * Timeout callback, called every N seconds when 1 or more osd |
902 | 1045 | * requests has been active for more than N seconds. When this |
903 | 1046 | * happens, we ping all OSDs with requests who have timed out to |
904 | 1047 | |
905 | 1048 | |
906 | 1049 | |
... | ... | @@ -916,31 +1059,14 @@ |
916 | 1059 | unsigned long keepalive = |
917 | 1060 | osdc->client->options->osd_keepalive_timeout * HZ; |
918 | 1061 | unsigned long last_stamp = 0; |
919 | - struct rb_node *p; | |
920 | 1062 | struct list_head slow_osds; |
921 | - | |
922 | 1063 | dout("timeout\n"); |
923 | 1064 | down_read(&osdc->map_sem); |
924 | 1065 | |
925 | 1066 | ceph_monc_request_next_osdmap(&osdc->client->monc); |
926 | 1067 | |
927 | 1068 | mutex_lock(&osdc->request_mutex); |
928 | - for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { | |
929 | - req = rb_entry(p, struct ceph_osd_request, r_node); | |
930 | 1069 | |
931 | - if (req->r_resend) { | |
932 | - int err; | |
933 | - | |
934 | - dout("osdc resending prev failed %lld\n", req->r_tid); | |
935 | - err = __send_request(osdc, req); | |
936 | - if (err) | |
937 | - dout("osdc failed again on %lld\n", req->r_tid); | |
938 | - else | |
939 | - req->r_resend = false; | |
940 | - continue; | |
941 | - } | |
942 | - } | |
943 | - | |
944 | 1070 | /* |
945 | 1071 | * reset osds that appear to be _really_ unresponsive. this |
946 | 1072 | * is a failsafe measure.. we really shouldn't be getting to |
... | ... | @@ -963,7 +1089,7 @@ |
963 | 1089 | BUG_ON(!osd); |
964 | 1090 | pr_warning(" tid %llu timed out on osd%d, will reset osd\n", |
965 | 1091 | req->r_tid, osd->o_osd); |
966 | - __kick_requests(osdc, osd); | |
1092 | + __kick_osd_requests(osdc, osd); | |
967 | 1093 | } |
968 | 1094 | |
969 | 1095 | /* |
... | ... | @@ -991,7 +1117,7 @@ |
991 | 1117 | |
992 | 1118 | __schedule_osd_timeout(osdc); |
993 | 1119 | mutex_unlock(&osdc->request_mutex); |
994 | - | |
1120 | + send_queued(osdc); | |
995 | 1121 | up_read(&osdc->map_sem); |
996 | 1122 | } |
997 | 1123 | |
... | ... | @@ -1035,7 +1161,6 @@ |
1035 | 1161 | numops * sizeof(struct ceph_osd_op)) |
1036 | 1162 | goto bad; |
1037 | 1163 | dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result); |
1038 | - | |
1039 | 1164 | /* lookup */ |
1040 | 1165 | mutex_lock(&osdc->request_mutex); |
1041 | 1166 | req = __lookup_request(osdc, tid); |
... | ... | @@ -1079,6 +1204,9 @@ |
1079 | 1204 | |
1080 | 1205 | dout("handle_reply tid %llu flags %d\n", tid, flags); |
1081 | 1206 | |
1207 | + if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK)) | |
1208 | + __register_linger_request(osdc, req); | |
1209 | + | |
1082 | 1210 | /* either this is a read, or we got the safe response */ |
1083 | 1211 | if (result < 0 || |
1084 | 1212 | (flags & CEPH_OSD_FLAG_ONDISK) || |
... | ... | @@ -1099,6 +1227,7 @@ |
1099 | 1227 | } |
1100 | 1228 | |
1101 | 1229 | done: |
1230 | + dout("req=%p req->r_linger=%d\n", req, req->r_linger); | |
1102 | 1231 | ceph_osdc_put_request(req); |
1103 | 1232 | return; |
1104 | 1233 | |
1105 | 1234 | |
1106 | 1235 | |
1107 | 1236 | |
1108 | 1237 | |
1109 | 1238 | |
1110 | 1239 | |
1111 | 1240 | |
1112 | 1241 | |
1113 | 1242 | |
1114 | 1243 | |
1115 | 1244 | |
1116 | 1245 | |
1117 | 1246 | |
1118 | 1247 | |
1119 | 1248 | |
... | ... | @@ -1109,108 +1238,83 @@ |
1109 | 1238 | ceph_msg_dump(msg); |
1110 | 1239 | } |
1111 | 1240 | |
1112 | - | |
1113 | -static int __kick_requests(struct ceph_osd_client *osdc, | |
1114 | - struct ceph_osd *kickosd) | |
1241 | +static void reset_changed_osds(struct ceph_osd_client *osdc) | |
1115 | 1242 | { |
1116 | - struct ceph_osd_request *req; | |
1117 | 1243 | struct rb_node *p, *n; |
1118 | - int needmap = 0; | |
1119 | - int err; | |
1120 | 1244 | |
1121 | - dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1); | |
1122 | - if (kickosd) { | |
1123 | - err = __reset_osd(osdc, kickosd); | |
1124 | - if (err == -EAGAIN) | |
1125 | - return 1; | |
1126 | - } else { | |
1127 | - for (p = rb_first(&osdc->osds); p; p = n) { | |
1128 | - struct ceph_osd *osd = | |
1129 | - rb_entry(p, struct ceph_osd, o_node); | |
1245 | + for (p = rb_first(&osdc->osds); p; p = n) { | |
1246 | + struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); | |
1130 | 1247 | |
1131 | - n = rb_next(p); | |
1132 | - if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || | |
1133 | - memcmp(&osd->o_con.peer_addr, | |
1134 | - ceph_osd_addr(osdc->osdmap, | |
1135 | - osd->o_osd), | |
1136 | - sizeof(struct ceph_entity_addr)) != 0) | |
1137 | - __reset_osd(osdc, osd); | |
1138 | - } | |
1248 | + n = rb_next(p); | |
1249 | + if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || | |
1250 | + memcmp(&osd->o_con.peer_addr, | |
1251 | + ceph_osd_addr(osdc->osdmap, | |
1252 | + osd->o_osd), | |
1253 | + sizeof(struct ceph_entity_addr)) != 0) | |
1254 | + __reset_osd(osdc, osd); | |
1139 | 1255 | } |
1256 | +} | |
1140 | 1257 | |
1258 | +/* | |
1259 | + * Requeue requests whose mapping to an OSD has changed. If requests map to | |
1260 | + * no osd, request a new map. | |
1261 | + * | |
1262 | + * Caller should hold map_sem for read and request_mutex. | |
1263 | + */ | |
1264 | +static void kick_requests(struct ceph_osd_client *osdc) | |
1265 | +{ | |
1266 | + struct ceph_osd_request *req, *nreq; | |
1267 | + struct rb_node *p; | |
1268 | + int needmap = 0; | |
1269 | + int err; | |
1270 | + | |
1271 | + dout("kick_requests\n"); | |
1272 | + mutex_lock(&osdc->request_mutex); | |
1141 | 1273 | for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { |
1142 | 1274 | req = rb_entry(p, struct ceph_osd_request, r_node); |
1143 | - | |
1144 | - if (req->r_resend) { | |
1145 | - dout(" r_resend set on tid %llu\n", req->r_tid); | |
1146 | - __cancel_request(req); | |
1147 | - goto kick; | |
1275 | + err = __map_request(osdc, req); | |
1276 | + if (err < 0) | |
1277 | + continue; /* error */ | |
1278 | + if (req->r_osd == NULL) { | |
1279 | + dout("%p tid %llu maps to no osd\n", req, req->r_tid); | |
1280 | + needmap++; /* request a newer map */ | |
1281 | + } else if (err > 0) { | |
1282 | + dout("%p tid %llu requeued on osd%d\n", req, req->r_tid, | |
1283 | + req->r_osd ? req->r_osd->o_osd : -1); | |
1284 | + if (!req->r_linger) | |
1285 | + req->r_flags |= CEPH_OSD_FLAG_RETRY; | |
1148 | 1286 | } |
1149 | - if (req->r_osd && kickosd == req->r_osd) { | |
1150 | - __cancel_request(req); | |
1151 | - goto kick; | |
1152 | - } | |
1287 | + } | |
1153 | 1288 | |
1154 | - err = __map_osds(osdc, req); | |
1289 | + list_for_each_entry_safe(req, nreq, &osdc->req_linger, | |
1290 | + r_linger_item) { | |
1291 | + dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); | |
1292 | + | |
1293 | + err = __map_request(osdc, req); | |
1155 | 1294 | if (err == 0) |
1156 | - continue; /* no change */ | |
1157 | - if (err < 0) { | |
1158 | - /* | |
1159 | - * FIXME: really, we should set the request | |
1160 | - * error and fail if this isn't a 'nofail' | |
1161 | - * request, but that's a fair bit more | |
1162 | - * complicated to do. So retry! | |
1163 | - */ | |
1164 | - dout(" setting r_resend on %llu\n", req->r_tid); | |
1165 | - req->r_resend = true; | |
1166 | - continue; | |
1167 | - } | |
1295 | + continue; /* no change and no osd was specified */ | |
1296 | + if (err < 0) | |
1297 | + continue; /* mapping error; retry on the next map */ | 
1168 | 1298 | if (req->r_osd == NULL) { |
1169 | 1299 | dout("tid %llu maps to no valid osd\n", req->r_tid); |
1170 | 1300 | needmap++; /* request a newer map */ |
1171 | 1301 | continue; |
1172 | 1302 | } |
1173 | 1303 | |
1174 | -kick: | |
1175 | - dout("kicking %p tid %llu osd%d\n", req, req->r_tid, | |
1304 | + dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid, | |
1176 | 1305 | req->r_osd ? req->r_osd->o_osd : -1); |
1177 | - req->r_flags |= CEPH_OSD_FLAG_RETRY; | |
1178 | - err = __send_request(osdc, req); | |
1179 | - if (err) { | |
1180 | - dout(" setting r_resend on %llu\n", req->r_tid); | |
1181 | - req->r_resend = true; | |
1182 | - } | |
1306 | + __unregister_linger_request(osdc, req); | |
1307 | + __register_request(osdc, req); | |
1183 | 1308 | } |
1184 | - | |
1185 | - return needmap; | |
1186 | -} | |
1187 | - | |
1188 | -/* | |
1189 | - * Resubmit osd requests whose osd or osd address has changed. Request | |
1190 | - * a new osd map if osds are down, or we are otherwise unable to determine | |
1191 | - * how to direct a request. | |
1192 | - * | |
1193 | - * Close connections to down osds. | |
1194 | - * | |
1195 | - * If @who is specified, resubmit requests for that specific osd. | |
1196 | - * | |
1197 | - * Caller should hold map_sem for read and request_mutex. | |
1198 | - */ | |
1199 | -static void kick_requests(struct ceph_osd_client *osdc, | |
1200 | - struct ceph_osd *kickosd) | |
1201 | -{ | |
1202 | - int needmap; | |
1203 | - | |
1204 | - mutex_lock(&osdc->request_mutex); | |
1205 | - needmap = __kick_requests(osdc, kickosd); | |
1206 | 1309 | mutex_unlock(&osdc->request_mutex); |
1207 | 1310 | |
1208 | 1311 | if (needmap) { |
1209 | 1312 | dout("%d requests for down osds, need new map\n", needmap); |
1210 | 1313 | ceph_monc_request_next_osdmap(&osdc->client->monc); |
1211 | 1314 | } |
1212 | - | |
1213 | 1315 | } |
1316 | + | |
1317 | + | |
1214 | 1318 | /* |
1215 | 1319 | * Process updated osd map. |
1216 | 1320 | * |
... | ... | @@ -1263,6 +1367,8 @@ |
1263 | 1367 | ceph_osdmap_destroy(osdc->osdmap); |
1264 | 1368 | osdc->osdmap = newmap; |
1265 | 1369 | } |
1370 | + kick_requests(osdc); | |
1371 | + reset_changed_osds(osdc); | |
1266 | 1372 | } else { |
1267 | 1373 | dout("ignoring incremental map %u len %d\n", |
1268 | 1374 | epoch, maplen); |
... | ... | @@ -1300,6 +1406,7 @@ |
1300 | 1406 | osdc->osdmap = newmap; |
1301 | 1407 | if (oldmap) |
1302 | 1408 | ceph_osdmap_destroy(oldmap); |
1409 | + kick_requests(osdc); | |
1303 | 1410 | } |
1304 | 1411 | p += maplen; |
1305 | 1412 | nr_maps--; |
... | ... | @@ -1308,8 +1415,7 @@ |
1308 | 1415 | done: |
1309 | 1416 | downgrade_write(&osdc->map_sem); |
1310 | 1417 | ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch); |
1311 | - if (newmap) | |
1312 | - kick_requests(osdc, NULL); | |
1418 | + send_queued(osdc); | |
1313 | 1419 | up_read(&osdc->map_sem); |
1314 | 1420 | wake_up_all(&osdc->client->auth_wq); |
1315 | 1421 | return; |
... | ... | @@ -1322,6 +1428,223 @@ |
1322 | 1428 | } |
1323 | 1429 | |
1324 | 1430 | /* |
1431 | + * watch/notify callback event infrastructure | |
1432 | + * | |
1433 | + * These callbacks are used both for watch and notify operations. | |
1434 | + */ | |
1435 | +static void __release_event(struct kref *kref) | |
1436 | +{ | |
1437 | + struct ceph_osd_event *event = | |
1438 | + container_of(kref, struct ceph_osd_event, kref); | |
1439 | + | |
1440 | + dout("__release_event %p\n", event); | |
1441 | + kfree(event); | |
1442 | +} | |
1443 | + | |
1444 | +static void get_event(struct ceph_osd_event *event) | |
1445 | +{ | |
1446 | + kref_get(&event->kref); | |
1447 | +} | |
1448 | + | |
1449 | +void ceph_osdc_put_event(struct ceph_osd_event *event) | |
1450 | +{ | |
1451 | + kref_put(&event->kref, __release_event); | |
1452 | +} | |
1453 | +EXPORT_SYMBOL(ceph_osdc_put_event); | |
1454 | + | |
1455 | +static void __insert_event(struct ceph_osd_client *osdc, | |
1456 | + struct ceph_osd_event *new) | |
1457 | +{ | |
1458 | + struct rb_node **p = &osdc->event_tree.rb_node; | |
1459 | + struct rb_node *parent = NULL; | |
1460 | + struct ceph_osd_event *event = NULL; | |
1461 | + | |
1462 | + while (*p) { | |
1463 | + parent = *p; | |
1464 | + event = rb_entry(parent, struct ceph_osd_event, node); | |
1465 | + if (new->cookie < event->cookie) | |
1466 | + p = &(*p)->rb_left; | |
1467 | + else if (new->cookie > event->cookie) | |
1468 | + p = &(*p)->rb_right; | |
1469 | + else | |
1470 | + BUG(); | |
1471 | + } | |
1472 | + | |
1473 | + rb_link_node(&new->node, parent, p); | |
1474 | + rb_insert_color(&new->node, &osdc->event_tree); | |
1475 | +} | |
1476 | + | |
1477 | +static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc, | |
1478 | + u64 cookie) | |
1479 | +{ | |
1480 | + struct rb_node **p = &osdc->event_tree.rb_node; | |
1481 | + struct rb_node *parent = NULL; | |
1482 | + struct ceph_osd_event *event = NULL; | |
1483 | + | |
1484 | + while (*p) { | |
1485 | + parent = *p; | |
1486 | + event = rb_entry(parent, struct ceph_osd_event, node); | |
1487 | + if (cookie < event->cookie) | |
1488 | + p = &(*p)->rb_left; | |
1489 | + else if (cookie > event->cookie) | |
1490 | + p = &(*p)->rb_right; | |
1491 | + else | |
1492 | + return event; | |
1493 | + } | |
1494 | + return NULL; | |
1495 | +} | |
1496 | + | |
1497 | +static void __remove_event(struct ceph_osd_event *event) | |
1498 | +{ | |
1499 | + struct ceph_osd_client *osdc = event->osdc; | |
1500 | + | |
1501 | + if (!RB_EMPTY_NODE(&event->node)) { | |
1502 | + dout("__remove_event removed %p\n", event); | |
1503 | + rb_erase(&event->node, &osdc->event_tree); | |
1504 | + ceph_osdc_put_event(event); | |
1505 | + } else { | |
1506 | + dout("__remove_event didn't remove %p\n", event); | |
1507 | + } | |
1508 | +} | |
1509 | + | |
1510 | +int ceph_osdc_create_event(struct ceph_osd_client *osdc, | |
1511 | + void (*event_cb)(u64, u64, u8, void *), | |
1512 | + int one_shot, void *data, | |
1513 | + struct ceph_osd_event **pevent) | |
1514 | +{ | |
1515 | + struct ceph_osd_event *event; | |
1516 | + | |
1517 | + event = kmalloc(sizeof(*event), GFP_NOIO); | |
1518 | + if (!event) | |
1519 | + return -ENOMEM; | |
1520 | + | |
1521 | + dout("create_event %p\n", event); | |
1522 | + event->cb = event_cb; | |
1523 | + event->one_shot = one_shot; | |
1524 | + event->data = data; | |
1525 | + event->osdc = osdc; | |
1526 | + INIT_LIST_HEAD(&event->osd_node); | |
1527 | + kref_init(&event->kref); /* one ref for us */ | |
1528 | + kref_get(&event->kref); /* one ref for the caller */ | |
1529 | + init_completion(&event->completion); | |
1530 | + | |
1531 | + spin_lock(&osdc->event_lock); | |
1532 | + event->cookie = ++osdc->event_count; | |
1533 | + __insert_event(osdc, event); | |
1534 | + spin_unlock(&osdc->event_lock); | |
1535 | + | |
1536 | + *pevent = event; | |
1537 | + return 0; | |
1538 | +} | |
1539 | +EXPORT_SYMBOL(ceph_osdc_create_event); | |
1540 | + | |
1541 | +void ceph_osdc_cancel_event(struct ceph_osd_event *event) | |
1542 | +{ | |
1543 | + struct ceph_osd_client *osdc = event->osdc; | |
1544 | + | |
1545 | + dout("cancel_event %p\n", event); | |
1546 | + spin_lock(&osdc->event_lock); | |
1547 | + __remove_event(event); | |
1548 | + spin_unlock(&osdc->event_lock); | |
1549 | + ceph_osdc_put_event(event); /* caller's */ | |
1550 | +} | |
1551 | +EXPORT_SYMBOL(ceph_osdc_cancel_event); | |
1552 | + | |
1553 | + | |
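For a persistent watch the event is created with one_shot = 0 and survives any number of notifications; each time handle_watch_notify() (below) matches the cookie, the callback runs from the notify workqueue. A hedged sketch of how a client such as rbd might wire this up; the example_* names and the refresh step in the callback are illustrative, not part of the patch:

static void example_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
        /* runs from osdc->notify_wq via do_event_work() */
        dout("example_watch_cb ver %llu notify_id %llu opcode %u\n",
             ver, notify_id, (unsigned int)opcode);
        /* ... re-read the watched object's header and ack the notify ... */
}

static int example_setup_watch(struct ceph_osd_client *osdc, void *data,
                               struct ceph_osd_event **pevent)
{
        int ret;

        ret = ceph_osdc_create_event(osdc, example_watch_cb,
                                     0 /* !one_shot */, data, pevent);
        if (ret < 0)
                return ret;
        /*
         * (*pevent)->cookie must then be carried by a WATCH osd op, and that
         * request kept lingering (see ceph_osdc_set_request_linger() above).
         */
        return 0;
}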
1554 | +static void do_event_work(struct work_struct *work) | |
1555 | +{ | |
1556 | + struct ceph_osd_event_work *event_work = | |
1557 | + container_of(work, struct ceph_osd_event_work, work); | |
1558 | + struct ceph_osd_event *event = event_work->event; | |
1559 | + u64 ver = event_work->ver; | |
1560 | + u64 notify_id = event_work->notify_id; | |
1561 | + u8 opcode = event_work->opcode; | |
1562 | + | |
1563 | + dout("do_event_work completing %p\n", event); | |
1564 | + event->cb(ver, notify_id, opcode, event->data); | |
1565 | + complete(&event->completion); | |
1566 | + dout("do_event_work completed %p\n", event); | |
1567 | + ceph_osdc_put_event(event); | |
1568 | + kfree(event_work); | |
1569 | +} | |
1570 | + | |
1571 | + | |
1572 | +/* | |
1573 | + * Process osd watch notifications | |
1574 | + */ | |
1575 | +static void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg) | 
1576 | +{ | |
1577 | + void *p, *end; | |
1578 | + u8 proto_ver; | |
1579 | + u64 cookie, ver, notify_id; | |
1580 | + u8 opcode; | |
1581 | + struct ceph_osd_event *event; | |
1582 | + struct ceph_osd_event_work *event_work; | |
1583 | + | |
1584 | + p = msg->front.iov_base; | |
1585 | + end = p + msg->front.iov_len; | |
1586 | + | |
1587 | + ceph_decode_8_safe(&p, end, proto_ver, bad); | |
1588 | + ceph_decode_8_safe(&p, end, opcode, bad); | |
1589 | + ceph_decode_64_safe(&p, end, cookie, bad); | |
1590 | + ceph_decode_64_safe(&p, end, ver, bad); | |
1591 | + ceph_decode_64_safe(&p, end, notify_id, bad); | |
1592 | + | |
1593 | + spin_lock(&osdc->event_lock); | |
1594 | + event = __find_event(osdc, cookie); | |
1595 | + if (event) { | |
1596 | + get_event(event); | |
1597 | + if (event->one_shot) | |
1598 | + __remove_event(event); | |
1599 | + } | |
1600 | + spin_unlock(&osdc->event_lock); | |
1601 | + dout("handle_watch_notify cookie %lld ver %lld event %p\n", | |
1602 | + cookie, ver, event); | |
1603 | + if (event) { | |
1604 | + event_work = kmalloc(sizeof(*event_work), GFP_NOIO); | 
1605 | + if (!event_work) { | 
1606 | + dout("ERROR: could not allocate event_work\n"); | 
1607 | + goto done_err; | 
1608 | + } | 
1609 | + INIT_WORK(&event_work->work, do_event_work); | 
1610 | + event_work->event = event; | |
1611 | + event_work->ver = ver; | |
1612 | + event_work->notify_id = notify_id; | |
1613 | + event_work->opcode = opcode; | |
1614 | + if (!queue_work(osdc->notify_wq, &event_work->work)) { | |
1615 | + dout("WARNING: failed to queue notify event work\n"); | |
1616 | + goto done_err; | |
1617 | + } | |
1618 | + } | |
1619 | + | |
1620 | + return; | |
1621 | + | |
1622 | +done_err: | |
1623 | + complete(&event->completion); | |
1624 | + ceph_osdc_put_event(event); | |
1625 | + return; | |
1626 | + | |
1627 | +bad: | |
1628 | + pr_err("osdc handle_watch_notify corrupt msg\n"); | |
1629 | + return; | |
1630 | +} | |
1631 | + | |
1632 | +int ceph_osdc_wait_event(struct ceph_osd_event *event, unsigned long timeout) | |
1633 | +{ | |
1634 | + int err; | |
1635 | + | |
1636 | + dout("wait_event %p\n", event); | |
1637 | + err = wait_for_completion_interruptible_timeout(&event->completion, | |
1638 | + timeout * HZ); | |
1639 | + ceph_osdc_put_event(event); | |
1640 | + if (err > 0) | |
1641 | + err = 0; | |
1642 | + dout("wait_event %p returns %d\n", event, err); | |
1643 | + return err; | |
1644 | +} | |
1645 | +EXPORT_SYMBOL(ceph_osdc_wait_event); | |
1646 | + | |
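The one-shot variant pairs with ceph_osdc_wait_event(): the notifier creates an event, issues a NOTIFY osd op carrying event->cookie (request construction elided here), and blocks until the watchers acknowledge or the timeout expires. A sketch under those assumptions, with example_* names being illustrative only:

static void example_notify_done_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
        dout("example notify acked, ver %llu\n", ver);
}

static int example_notify(struct ceph_osd_client *osdc)
{
        struct ceph_osd_event *event;
        int ret;

        ret = ceph_osdc_create_event(osdc, example_notify_done_cb,
                                     1 /* one_shot */, NULL, &event);
        if (ret < 0)
                return ret;

        /* ... send a NOTIFY osd op that embeds event->cookie ... */

        /* always drops the caller's event reference before returning */
        return ceph_osdc_wait_event(event, 10 /* seconds */);
}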
1647 | +/* | |
1325 | 1648 | * Register request, send initial attempt. |
1326 | 1649 | */ |
1327 | 1650 | int ceph_osdc_start_request(struct ceph_osd_client *osdc, |
... | ... | @@ -1347,15 +1670,22 @@ |
1347 | 1670 | * the request still hasn't been touched. |
1348 | 1671 | */ |
1349 | 1672 | if (req->r_sent == 0) { |
1350 | - rc = __send_request(osdc, req); | |
1351 | - if (rc) { | |
1352 | - if (nofail) { | |
1353 | - dout("osdc_start_request failed send, " | |
1354 | - " marking %lld\n", req->r_tid); | |
1355 | - req->r_resend = true; | |
1356 | - rc = 0; | |
1357 | - } else { | |
1358 | - __unregister_request(osdc, req); | |
1673 | + rc = __map_request(osdc, req); | |
1674 | + if (rc < 0) | |
1675 | + return rc; | |
1676 | + if (req->r_osd == NULL) { | |
1677 | + dout("send_request %p no up osds in pg\n", req); | |
1678 | + ceph_monc_request_next_osdmap(&osdc->client->monc); | |
1679 | + } else { | |
1680 | + rc = __send_request(osdc, req); | |
1681 | + if (rc) { | |
1682 | + if (nofail) { | |
1683 | + dout("osdc_start_request failed send, " | |
1684 | + " will retry %lld\n", req->r_tid); | |
1685 | + rc = 0; | |
1686 | + } else { | |
1687 | + __unregister_request(osdc, req); | |
1688 | + } | |
1359 | 1689 | } |
1360 | 1690 | } |
1361 | 1691 | } |
1362 | 1692 | |
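With the mapping step folded into ceph_osdc_start_request(), a nofail caller that hits a send failure simply leaves the request registered for kick_requests()/send_queued() to retry on the next osdmap, while an ordinary caller gets the error back and must drop its own reference. An illustrative synchronous caller (not part of the patch):

static int example_submit_and_wait(struct ceph_osd_client *osdc,
                                   struct ceph_osd_request *req)
{
        int ret;

        ret = ceph_osdc_start_request(osdc, req, false /* !nofail */);
        if (ret < 0)
                goto out;
        ret = ceph_osdc_wait_request(osdc, req);
out:
        ceph_osdc_put_request(req);
        return ret;
}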
... | ... | @@ -1441,9 +1771,15 @@ |
1441 | 1771 | INIT_LIST_HEAD(&osdc->osd_lru); |
1442 | 1772 | osdc->requests = RB_ROOT; |
1443 | 1773 | INIT_LIST_HEAD(&osdc->req_lru); |
1774 | + INIT_LIST_HEAD(&osdc->req_unsent); | |
1775 | + INIT_LIST_HEAD(&osdc->req_notarget); | |
1776 | + INIT_LIST_HEAD(&osdc->req_linger); | |
1444 | 1777 | osdc->num_requests = 0; |
1445 | 1778 | INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); |
1446 | 1779 | INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); |
1780 | + spin_lock_init(&osdc->event_lock); | |
1781 | + osdc->event_tree = RB_ROOT; | |
1782 | + osdc->event_count = 0; | |
1447 | 1783 | |
1448 | 1784 | schedule_delayed_work(&osdc->osds_timeout_work, |
1449 | 1785 | round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ)); |
... | ... | @@ -1463,6 +1799,13 @@ |
1463 | 1799 | "osd_op_reply"); |
1464 | 1800 | if (err < 0) |
1465 | 1801 | goto out_msgpool; |
1802 | + | |
1803 | + osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify"); | |
1804 | + if (!osdc->notify_wq) { | 
1805 | + /* create_singlethread_workqueue() returns NULL on failure */ | 
1806 | + err = -ENOMEM; | 
1807 | + goto out_msgpool; | |
1808 | + } | |
1466 | 1809 | return 0; |
1467 | 1810 | |
1468 | 1811 | out_msgpool: |
... | ... | @@ -1476,6 +1819,8 @@ |
1476 | 1819 | |
1477 | 1820 | void ceph_osdc_stop(struct ceph_osd_client *osdc) |
1478 | 1821 | { |
1822 | + flush_workqueue(osdc->notify_wq); | |
1823 | + destroy_workqueue(osdc->notify_wq); | |
1479 | 1824 | cancel_delayed_work_sync(&osdc->timeout_work); |
1480 | 1825 | cancel_delayed_work_sync(&osdc->osds_timeout_work); |
1481 | 1826 | if (osdc->osdmap) { |
... | ... | @@ -1483,6 +1828,7 @@ |
1483 | 1828 | osdc->osdmap = NULL; |
1484 | 1829 | } |
1485 | 1830 | remove_old_osds(osdc, 1); |
1831 | + WARN_ON(!RB_EMPTY_ROOT(&osdc->osds)); | |
1486 | 1832 | mempool_destroy(osdc->req_mempool); |
1487 | 1833 | ceph_msgpool_destroy(&osdc->msgpool_op); |
1488 | 1834 | ceph_msgpool_destroy(&osdc->msgpool_op_reply); |
... | ... | @@ -1591,6 +1937,9 @@ |
1591 | 1937 | case CEPH_MSG_OSD_OPREPLY: |
1592 | 1938 | handle_reply(osdc, msg, con); |
1593 | 1939 | break; |
1940 | + case CEPH_MSG_WATCH_NOTIFY: | |
1941 | + handle_watch_notify(osdc, msg); | |
1942 | + break; | |
1594 | 1943 | |
1595 | 1944 | default: |
1596 | 1945 | pr_err("received unknown message type %d %s\n", type, |
... | ... | @@ -1684,6 +2033,7 @@ |
1684 | 2033 | |
1685 | 2034 | switch (type) { |
1686 | 2035 | case CEPH_MSG_OSD_MAP: |
2036 | + case CEPH_MSG_WATCH_NOTIFY: | |
1687 | 2037 | return ceph_msg_new(type, front, GFP_NOFS); |
1688 | 2038 | case CEPH_MSG_OSD_OPREPLY: |
1689 | 2039 | return get_reply(con, hdr, skip); |