Commit 0adfc56ce8fdc5c17630434e49f30536ba7b8559

Authored by Linus Torvalds

Merge git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

* git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client:
  rbd: use watch/notify for changes in rbd header
  libceph: add lingering request and watch/notify event framework
  rbd: update email address in Documentation
  ceph: rename dentry_release -> d_release, fix comment
  ceph: add request to the tail of unsafe write list
  ceph: remove request from unsafe list if it is canceled/timed out
  ceph: move readahead default to fs/ceph from libceph
  ceph: add ino32 mount option
  ceph: update common header files
  ceph: remove debugfs debug cruft
  libceph: fix osd request queuing on osdmap updates
  ceph: preserve I_COMPLETE across rename
  libceph: Fix base64-decoding when input ends in newline.

Showing 15 changed files Side-by-side Diff

Documentation/ABI/testing/sysfs-bus-rbd
1 1 What: /sys/bus/rbd/
2 2 Date: November 2010
3   -Contact: Yehuda Sadeh <yehuda@hq.newdream.net>,
  3 +Contact: Yehuda Sadeh <yehuda@newdream.net>,
4 4 Sage Weil <sage@newdream.net>
5 5 Description:
6 6  
... ... @@ -31,6 +31,7 @@
31 31 #include <linux/ceph/osd_client.h>
32 32 #include <linux/ceph/mon_client.h>
33 33 #include <linux/ceph/decode.h>
  34 +#include <linux/parser.h>
34 35  
35 36 #include <linux/kernel.h>
36 37 #include <linux/device.h>
... ... @@ -54,6 +55,8 @@
54 55  
55 56 #define DEV_NAME_LEN 32
56 57  
  58 +#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
  59 +
57 60 /*
58 61 * block device image metadata (in-memory version)
59 62 */
60 63  
61 64  
... ... @@ -71,13 +74,20 @@
71 74  
72 75 char *snap_names;
73 76 u64 *snap_sizes;
  77 +
  78 + u64 obj_version;
74 79 };
75 80  
  81 +struct rbd_options {
  82 + int notify_timeout;
  83 +};
  84 +
76 85 /*
77 86 * an instance of the client. multiple devices may share a client.
78 87 */
79 88 struct rbd_client {
80 89 struct ceph_client *client;
  90 + struct rbd_options *rbd_opts;
81 91 struct kref kref;
82 92 struct list_head node;
83 93 };
... ... @@ -124,6 +134,9 @@
124 134 char pool_name[RBD_MAX_POOL_NAME_LEN];
125 135 int poolid;
126 136  
  137 + struct ceph_osd_event *watch_event;
  138 + struct ceph_osd_request *watch_request;
  139 +
127 140 char snap_name[RBD_MAX_SNAP_NAME_LEN];
128 141 u32 cur_snap; /* index+1 of current snapshot within snap context
129 142 0 - for the head */
... ... @@ -177,6 +190,8 @@
177 190 put_device(&rbd_dev->dev);
178 191 }
179 192  
  193 +static int __rbd_update_snaps(struct rbd_device *rbd_dev);
  194 +
180 195 static int rbd_open(struct block_device *bdev, fmode_t mode)
181 196 {
182 197 struct gendisk *disk = bdev->bd_disk;
... ... @@ -211,7 +226,8 @@
211 226 * Initialize an rbd client instance.
212 227 * We own *opt.
213 228 */
214   -static struct rbd_client *rbd_client_create(struct ceph_options *opt)
  229 +static struct rbd_client *rbd_client_create(struct ceph_options *opt,
  230 + struct rbd_options *rbd_opts)
215 231 {
216 232 struct rbd_client *rbdc;
217 233 int ret = -ENOMEM;
... ... @@ -233,6 +249,8 @@
233 249 if (ret < 0)
234 250 goto out_err;
235 251  
  252 + rbdc->rbd_opts = rbd_opts;
  253 +
236 254 spin_lock(&node_lock);
237 255 list_add_tail(&rbdc->node, &rbd_client_list);
238 256 spin_unlock(&node_lock);
... ... @@ -267,6 +285,59 @@
267 285 }
268 286  
269 287 /*
  288 + * mount options
  289 + */
  290 +enum {
  291 + Opt_notify_timeout,
  292 + Opt_last_int,
  293 + /* int args above */
  294 + Opt_last_string,
  295 + /* string args above */
  296 +};
  297 +
  298 +static match_table_t rbdopt_tokens = {
  299 + {Opt_notify_timeout, "notify_timeout=%d"},
  300 + /* int args above */
  301 + /* string args above */
  302 + {-1, NULL}
  303 +};
  304 +
  305 +static int parse_rbd_opts_token(char *c, void *private)
  306 +{
  307 + struct rbd_options *rbdopt = private;
  308 + substring_t argstr[MAX_OPT_ARGS];
  309 + int token, intval, ret;
  310 +
  311 + token = match_token((char *)c, rbdopt_tokens, argstr);
  312 + if (token < 0)
  313 + return -EINVAL;
  314 +
  315 + if (token < Opt_last_int) {
  316 + ret = match_int(&argstr[0], &intval);
  317 + if (ret < 0) {
  318 + pr_err("bad mount option arg (not int) "
  319 + "at '%s'\n", c);
  320 + return ret;
  321 + }
  322 + dout("got int token %d val %d\n", token, intval);
  323 + } else if (token > Opt_last_int && token < Opt_last_string) {
  324 + dout("got string token %d val %s\n", token,
  325 + argstr[0].from);
  326 + } else {
  327 + dout("got token %d\n", token);
  328 + }
  329 +
  330 + switch (token) {
  331 + case Opt_notify_timeout:
  332 + rbdopt->notify_timeout = intval;
  333 + break;
  334 + default:
  335 + BUG_ON(token);
  336 + }
  337 + return 0;
  338 +}
  339 +
  340 +/*
270 341 * Get a ceph client with specific addr and configuration, if one does
271 342 * not exist create it.
272 343 */
273 344  
274 345  
275 346  
... ... @@ -276,11 +347,18 @@
276 347 struct rbd_client *rbdc;
277 348 struct ceph_options *opt;
278 349 int ret;
  350 + struct rbd_options *rbd_opts;
279 351  
  352 + rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
  353 + if (!rbd_opts)
  354 + return -ENOMEM;
  355 +
  356 + rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
  357 +
280 358 ret = ceph_parse_options(&opt, options, mon_addr,
281   - mon_addr + strlen(mon_addr), NULL, NULL);
  359 + mon_addr + strlen(mon_addr), parse_rbd_opts_token, rbd_opts);
282 360 if (ret < 0)
283   - return ret;
  361 + goto done_err;
284 362  
285 363 spin_lock(&node_lock);
286 364 rbdc = __rbd_client_find(opt);
287 365  
... ... @@ -296,13 +374,18 @@
296 374 }
297 375 spin_unlock(&node_lock);
298 376  
299   - rbdc = rbd_client_create(opt);
300   - if (IS_ERR(rbdc))
301   - return PTR_ERR(rbdc);
  377 + rbdc = rbd_client_create(opt, rbd_opts);
  378 + if (IS_ERR(rbdc)) {
  379 + ret = PTR_ERR(rbdc);
  380 + goto done_err;
  381 + }
302 382  
303 383 rbd_dev->rbd_client = rbdc;
304 384 rbd_dev->client = rbdc->client;
305 385 return 0;
  386 +done_err:
  387 + kfree(rbd_opts);
  388 + return ret;
306 389 }
307 390  
308 391 /*
... ... @@ -318,6 +401,7 @@
318 401 spin_unlock(&node_lock);
319 402  
320 403 ceph_destroy_client(rbdc->client);
  404 + kfree(rbdc->rbd_opts);
321 405 kfree(rbdc);
322 406 }
323 407  
... ... @@ -666,7 +750,9 @@
666 750 struct ceph_osd_req_op *ops,
667 751 int num_reply,
668 752 void (*rbd_cb)(struct ceph_osd_request *req,
669   - struct ceph_msg *msg))
  753 + struct ceph_msg *msg),
  754 + struct ceph_osd_request **linger_req,
  755 + u64 *ver)
670 756 {
671 757 struct ceph_osd_request *req;
672 758 struct ceph_file_layout *layout;
673 759  
... ... @@ -729,12 +815,20 @@
729 815 req->r_oid, req->r_oid_len);
730 816 up_read(&header->snap_rwsem);
731 817  
  818 + if (linger_req) {
  819 + ceph_osdc_set_request_linger(&dev->client->osdc, req);
  820 + *linger_req = req;
  821 + }
  822 +
732 823 ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
733 824 if (ret < 0)
734 825 goto done_err;
735 826  
736 827 if (!rbd_cb) {
737 828 ret = ceph_osdc_wait_request(&dev->client->osdc, req);
  829 + if (ver)
  830 + *ver = le64_to_cpu(req->r_reassert_version.version);
  831 + dout("reassert_ver=%lld\n", le64_to_cpu(req->r_reassert_version.version));
738 832 ceph_osdc_put_request(req);
739 833 }
740 834 return ret;
... ... @@ -789,6 +883,11 @@
789 883 kfree(req_data);
790 884 }
791 885  
  886 +static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
  887 +{
  888 + ceph_osdc_put_request(req);
  889 +}
  890 +
792 891 /*
793 892 * Do a synchronous ceph osd operation
794 893 */
... ... @@ -801,7 +900,9 @@
801 900 int num_reply,
802 901 const char *obj,
803 902 u64 ofs, u64 len,
804   - char *buf)
  903 + char *buf,
  904 + struct ceph_osd_request **linger_req,
  905 + u64 *ver)
805 906 {
806 907 int ret;
807 908 struct page **pages;
... ... @@ -833,7 +934,8 @@
833 934 flags,
834 935 ops,
835 936 2,
836   - NULL);
  937 + NULL,
  938 + linger_req, ver);
837 939 if (ret < 0)
838 940 goto done_ops;
839 941  
... ... @@ -893,7 +995,7 @@
893 995 flags,
894 996 ops,
895 997 num_reply,
896   - rbd_req_cb);
  998 + rbd_req_cb, 0, NULL);
897 999 done:
898 1000 kfree(seg_name);
899 1001 return ret;
900 1002  
901 1003  
902 1004  
... ... @@ -940,19 +1042,175 @@
940 1042 u64 snapid,
941 1043 const char *obj,
942 1044 u64 ofs, u64 len,
943   - char *buf)
  1045 + char *buf,
  1046 + u64 *ver)
944 1047 {
945 1048 return rbd_req_sync_op(dev, NULL,
946 1049 (snapid ? snapid : CEPH_NOSNAP),
947 1050 CEPH_OSD_OP_READ,
948 1051 CEPH_OSD_FLAG_READ,
949 1052 NULL,
950   - 1, obj, ofs, len, buf);
  1053 + 1, obj, ofs, len, buf, NULL, ver);
951 1054 }
952 1055  
953 1056 /*
954   - * Request sync osd read
  1057 + * Request sync osd watch
955 1058 */
  1059 +static int rbd_req_sync_notify_ack(struct rbd_device *dev,
  1060 + u64 ver,
  1061 + u64 notify_id,
  1062 + const char *obj)
  1063 +{
  1064 + struct ceph_osd_req_op *ops;
  1065 + struct page **pages = NULL;
  1066 + int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
  1067 + if (ret < 0)
  1068 + return ret;
  1069 +
  1070 + ops[0].watch.ver = cpu_to_le64(dev->header.obj_version);
  1071 + ops[0].watch.cookie = notify_id;
  1072 + ops[0].watch.flag = 0;
  1073 +
  1074 + ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP,
  1075 + obj, 0, 0, NULL,
  1076 + pages, 0,
  1077 + CEPH_OSD_FLAG_READ,
  1078 + ops,
  1079 + 1,
  1080 + rbd_simple_req_cb, 0, NULL);
  1081 +
  1082 + rbd_destroy_ops(ops);
  1083 + return ret;
  1084 +}
  1085 +
  1086 +static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
  1087 +{
  1088 + struct rbd_device *dev = (struct rbd_device *)data;
  1089 + if (!dev)
  1090 + return;
  1091 +
  1092 + dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
  1093 + notify_id, (int)opcode);
  1094 + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
  1095 + __rbd_update_snaps(dev);
  1096 + mutex_unlock(&ctl_mutex);
  1097 +
  1098 + rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name);
  1099 +}
  1100 +
  1101 +/*
  1102 + * Request sync osd watch
  1103 + */
  1104 +static int rbd_req_sync_watch(struct rbd_device *dev,
  1105 + const char *obj,
  1106 + u64 ver)
  1107 +{
  1108 + struct ceph_osd_req_op *ops;
  1109 + struct ceph_osd_client *osdc = &dev->client->osdc;
  1110 +
  1111 + int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
  1112 + if (ret < 0)
  1113 + return ret;
  1114 +
  1115 + ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
  1116 + (void *)dev, &dev->watch_event);
  1117 + if (ret < 0)
  1118 + goto fail;
  1119 +
  1120 + ops[0].watch.ver = cpu_to_le64(ver);
  1121 + ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
  1122 + ops[0].watch.flag = 1;
  1123 +
  1124 + ret = rbd_req_sync_op(dev, NULL,
  1125 + CEPH_NOSNAP,
  1126 + 0,
  1127 + CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
  1128 + ops,
  1129 + 1, obj, 0, 0, NULL,
  1130 + &dev->watch_request, NULL);
  1131 +
  1132 + if (ret < 0)
  1133 + goto fail_event;
  1134 +
  1135 + rbd_destroy_ops(ops);
  1136 + return 0;
  1137 +
  1138 +fail_event:
  1139 + ceph_osdc_cancel_event(dev->watch_event);
  1140 + dev->watch_event = NULL;
  1141 +fail:
  1142 + rbd_destroy_ops(ops);
  1143 + return ret;
  1144 +}
  1145 +
  1146 +struct rbd_notify_info {
  1147 + struct rbd_device *dev;
  1148 +};
  1149 +
  1150 +static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
  1151 +{
  1152 + struct rbd_device *dev = (struct rbd_device *)data;
  1153 + if (!dev)
  1154 + return;
  1155 +
  1156 + dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
  1157 + notify_id, (int)opcode);
  1158 +}
  1159 +
  1160 +/*
  1161 + * Request sync osd notify
  1162 + */
  1163 +static int rbd_req_sync_notify(struct rbd_device *dev,
  1164 + const char *obj)
  1165 +{
  1166 + struct ceph_osd_req_op *ops;
  1167 + struct ceph_osd_client *osdc = &dev->client->osdc;
  1168 + struct ceph_osd_event *event;
  1169 + struct rbd_notify_info info;
  1170 + int payload_len = sizeof(u32) + sizeof(u32);
  1171 + int ret;
  1172 +
  1173 + ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
  1174 + if (ret < 0)
  1175 + return ret;
  1176 +
  1177 + info.dev = dev;
  1178 +
  1179 + ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
  1180 + (void *)&info, &event);
  1181 + if (ret < 0)
  1182 + goto fail;
  1183 +
  1184 + ops[0].watch.ver = 1;
  1185 + ops[0].watch.flag = 1;
  1186 + ops[0].watch.cookie = event->cookie;
  1187 + ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
  1188 + ops[0].watch.timeout = 12;
  1189 +
  1190 + ret = rbd_req_sync_op(dev, NULL,
  1191 + CEPH_NOSNAP,
  1192 + 0,
  1193 + CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
  1194 + ops,
  1195 + 1, obj, 0, 0, NULL, NULL, NULL);
  1196 + if (ret < 0)
  1197 + goto fail_event;
  1198 +
  1199 + ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
  1200 + dout("ceph_osdc_wait_event returned %d\n", ret);
  1201 + rbd_destroy_ops(ops);
  1202 + return 0;
  1203 +
  1204 +fail_event:
  1205 + ceph_osdc_cancel_event(event);
  1206 +fail:
  1207 + rbd_destroy_ops(ops);
  1208 + return ret;
  1209 +}
  1210 +
  1211 +/*
  1212 + * Request sync osd rollback
  1213 + */
956 1214 static int rbd_req_sync_rollback_obj(struct rbd_device *dev,
957 1215 u64 snapid,
958 1216 const char *obj)
959 1217  
... ... @@ -969,13 +1227,10 @@
969 1227 0,
970 1228 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
971 1229 ops,
972   - 1, obj, 0, 0, NULL);
  1230 + 1, obj, 0, 0, NULL, NULL, NULL);
973 1231  
974 1232 rbd_destroy_ops(ops);
975 1233  
976   - if (ret < 0)
977   - return ret;
978   -
979 1234 return ret;
980 1235 }
981 1236  
... ... @@ -987,7 +1242,8 @@
987 1242 const char *cls,
988 1243 const char *method,
989 1244 const char *data,
990   - int len)
  1245 + int len,
  1246 + u64 *ver)
991 1247 {
992 1248 struct ceph_osd_req_op *ops;
993 1249 int cls_len = strlen(cls);
... ... @@ -1010,7 +1266,7 @@
1010 1266 0,
1011 1267 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1012 1268 ops,
1013   - 1, obj, 0, 0, NULL);
  1269 + 1, obj, 0, 0, NULL, NULL, ver);
1014 1270  
1015 1271 rbd_destroy_ops(ops);
1016 1272  
... ... @@ -1156,6 +1412,7 @@
1156 1412 struct rbd_image_header_ondisk *dh;
1157 1413 int snap_count = 0;
1158 1414 u64 snap_names_len = 0;
  1415 + u64 ver;
1159 1416  
1160 1417 while (1) {
1161 1418 int len = sizeof(*dh) +
... ... @@ -1171,7 +1428,7 @@
1171 1428 NULL, CEPH_NOSNAP,
1172 1429 rbd_dev->obj_md_name,
1173 1430 0, len,
1174   - (char *)dh);
  1431 + (char *)dh, &ver);
1175 1432 if (rc < 0)
1176 1433 goto out_dh;
1177 1434  
... ... @@ -1188,6 +1445,7 @@
1188 1445 }
1189 1446 break;
1190 1447 }
  1448 + header->obj_version = ver;
1191 1449  
1192 1450 out_dh:
1193 1451 kfree(dh);
... ... @@ -1205,6 +1463,7 @@
1205 1463 u64 new_snapid;
1206 1464 int ret;
1207 1465 void *data, *data_start, *data_end;
  1466 + u64 ver;
1208 1467  
1209 1468 /* we should create a snapshot only if we're pointing at the head */
1210 1469 if (dev->cur_snap)
... ... @@ -1227,7 +1486,7 @@
1227 1486 ceph_encode_64_safe(&data, data_end, new_snapid, bad);
1228 1487  
1229 1488 ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
1230   - data_start, data - data_start);
  1489 + data_start, data - data_start, &ver);
1231 1490  
1232 1491 kfree(data_start);
1233 1492  
... ... @@ -1259,6 +1518,7 @@
1259 1518 int ret;
1260 1519 struct rbd_image_header h;
1261 1520 u64 snap_seq;
  1521 + int follow_seq = 0;
1262 1522  
1263 1523 ret = rbd_read_header(rbd_dev, &h);
1264 1524 if (ret < 0)
... ... @@ -1267,6 +1527,11 @@
1267 1527 down_write(&rbd_dev->header.snap_rwsem);
1268 1528  
1269 1529 snap_seq = rbd_dev->header.snapc->seq;
  1530 + if (rbd_dev->header.total_snaps &&
  1531 + rbd_dev->header.snapc->snaps[0] == snap_seq)
  1532 + /* pointing at the head, will need to follow that
  1533 + if head moves */
  1534 + follow_seq = 1;
1270 1535  
1271 1536 kfree(rbd_dev->header.snapc);
1272 1537 kfree(rbd_dev->header.snap_names);
... ... @@ -1277,7 +1542,10 @@
1277 1542 rbd_dev->header.snap_names = h.snap_names;
1278 1543 rbd_dev->header.snap_names_len = h.snap_names_len;
1279 1544 rbd_dev->header.snap_sizes = h.snap_sizes;
1280   - rbd_dev->header.snapc->seq = snap_seq;
  1545 + if (follow_seq)
  1546 + rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
  1547 + else
  1548 + rbd_dev->header.snapc->seq = snap_seq;
1281 1549  
1282 1550 ret = __rbd_init_snaps_header(rbd_dev);
1283 1551  
1284 1552  
... ... @@ -1699,8 +1967,29 @@
1699 1967 device_unregister(&rbd_dev->dev);
1700 1968 }
1701 1969  
1702   -static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count)
  1970 +static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
1703 1971 {
  1972 + int ret, rc;
  1973 +
  1974 + do {
  1975 + ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name,
  1976 + rbd_dev->header.obj_version);
  1977 + if (ret == -ERANGE) {
  1978 + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
  1979 + rc = __rbd_update_snaps(rbd_dev);
  1980 + mutex_unlock(&ctl_mutex);
  1981 + if (rc < 0)
  1982 + return rc;
  1983 + }
  1984 + } while (ret == -ERANGE);
  1985 +
  1986 + return ret;
  1987 +}
  1988 +
  1989 +static ssize_t rbd_add(struct bus_type *bus,
  1990 + const char *buf,
  1991 + size_t count)
  1992 +{
1704 1993 struct ceph_osd_client *osdc;
1705 1994 struct rbd_device *rbd_dev;
1706 1995 ssize_t rc = -ENOMEM;
... ... @@ -1797,6 +2086,10 @@
1797 2086 if (rc)
1798 2087 goto err_out_bus;
1799 2088  
  2089 + rc = rbd_init_watch_dev(rbd_dev);
  2090 + if (rc)
  2091 + goto err_out_bus;
  2092 +
1800 2093 return count;
1801 2094  
1802 2095 err_out_bus:
... ... @@ -1849,6 +2142,12 @@
1849 2142 struct rbd_device *rbd_dev =
1850 2143 container_of(dev, struct rbd_device, dev);
1851 2144  
  2145 + if (rbd_dev->watch_request)
  2146 + ceph_osdc_unregister_linger_request(&rbd_dev->client->osdc,
  2147 + rbd_dev->watch_request);
  2148 + if (rbd_dev->watch_event)
  2149 + ceph_osdc_cancel_event(rbd_dev->watch_event);
  2150 +
1852 2151 rbd_put_client(rbd_dev);
1853 2152  
1854 2153 /* clean up and free blkdev */
1855 2154  
1856 2155  
1857 2156  
... ... @@ -1914,14 +2213,24 @@
1914 2213 ret = rbd_header_add_snap(rbd_dev,
1915 2214 name, GFP_KERNEL);
1916 2215 if (ret < 0)
1917   - goto done_unlock;
  2216 + goto err_unlock;
1918 2217  
1919 2218 ret = __rbd_update_snaps(rbd_dev);
1920 2219 if (ret < 0)
1921   - goto done_unlock;
  2220 + goto err_unlock;
1922 2221  
  2222 + /* shouldn't hold ctl_mutex when notifying.. notify might
  2223 + trigger a watch callback that would need to get that mutex */
  2224 + mutex_unlock(&ctl_mutex);
  2225 +
  2226 + /* make a best effort, don't error if failed */
  2227 + rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name);
  2228 +
1923 2229 ret = count;
1924   -done_unlock:
  2230 + kfree(name);
  2231 + return ret;
  2232 +
  2233 +err_unlock:
1925 2234 mutex_unlock(&ctl_mutex);
1926 2235 kfree(name);
1927 2236 return ret;
... ... @@ -210,8 +210,6 @@
210 210 if (!fsc->debugfs_congestion_kb)
211 211 goto out;
212 212  
213   - dout("a\n");
214   -
215 213 snprintf(name, sizeof(name), "../../bdi/%s",
216 214 dev_name(fsc->backing_dev_info.dev));
217 215 fsc->debugfs_bdi =
... ... @@ -221,7 +219,6 @@
221 219 if (!fsc->debugfs_bdi)
222 220 goto out;
223 221  
224   - dout("b\n");
225 222 fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
226 223 0600,
227 224 fsc->client->debugfs_dir,
... ... @@ -230,7 +227,6 @@
230 227 if (!fsc->debugfs_mdsmap)
231 228 goto out;
232 229  
233   - dout("ca\n");
234 230 fsc->debugfs_mdsc = debugfs_create_file("mdsc",
235 231 0600,
236 232 fsc->client->debugfs_dir,
... ... @@ -239,7 +235,6 @@
239 235 if (!fsc->debugfs_mdsc)
240 236 goto out;
241 237  
242   - dout("da\n");
243 238 fsc->debugfs_caps = debugfs_create_file("caps",
244 239 0400,
245 240 fsc->client->debugfs_dir,
... ... @@ -248,7 +243,6 @@
248 243 if (!fsc->debugfs_caps)
249 244 goto out;
250 245  
251   - dout("ea\n");
252 246 fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
253 247 0600,
254 248 fsc->client->debugfs_dir,
... ... @@ -161,7 +161,7 @@
161 161 filp->f_pos = di->offset;
162 162 err = filldir(dirent, dentry->d_name.name,
163 163 dentry->d_name.len, di->offset,
164   - dentry->d_inode->i_ino,
  164 + ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
165 165 dentry->d_inode->i_mode >> 12);
166 166  
167 167 if (last) {
168 168  
169 169  
... ... @@ -245,15 +245,17 @@
245 245  
246 246 dout("readdir off 0 -> '.'\n");
247 247 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
248   - inode->i_ino, inode->i_mode >> 12) < 0)
  248 + ceph_translate_ino(inode->i_sb, inode->i_ino),
  249 + inode->i_mode >> 12) < 0)
249 250 return 0;
250 251 filp->f_pos = 1;
251 252 off = 1;
252 253 }
253 254 if (filp->f_pos == 1) {
  255 + ino_t ino = filp->f_dentry->d_parent->d_inode->i_ino;
254 256 dout("readdir off 1 -> '..'\n");
255 257 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
256   - filp->f_dentry->d_parent->d_inode->i_ino,
  258 + ceph_translate_ino(inode->i_sb, ino),
257 259 inode->i_mode >> 12) < 0)
258 260 return 0;
259 261 filp->f_pos = 2;
... ... @@ -377,7 +379,8 @@
377 379 if (filldir(dirent,
378 380 rinfo->dir_dname[off - fi->offset],
379 381 rinfo->dir_dname_len[off - fi->offset],
380   - pos, ino, ftype) < 0) {
  382 + pos,
  383 + ceph_translate_ino(inode->i_sb, ino), ftype) < 0) {
381 384 dout("filldir stopping us...\n");
382 385 return 0;
383 386 }
384 387  
385 388  
... ... @@ -1024,14 +1027,13 @@
1024 1027 }
1025 1028  
1026 1029 /*
1027   - * When a dentry is released, clear the dir I_COMPLETE if it was part
1028   - * of the current dir gen or if this is in the snapshot namespace.
  1030 + * Release our ceph_dentry_info.
1029 1031 */
1030   -static void ceph_dentry_release(struct dentry *dentry)
  1032 +static void ceph_d_release(struct dentry *dentry)
1031 1033 {
1032 1034 struct ceph_dentry_info *di = ceph_dentry(dentry);
1033 1035  
1034   - dout("dentry_release %p\n", dentry);
  1036 + dout("d_release %p\n", dentry);
1035 1037 if (di) {
1036 1038 ceph_dentry_lru_del(dentry);
1037 1039 if (di->lease_session)
1038 1040  
1039 1041  
... ... @@ -1256,15 +1258,15 @@
1256 1258  
1257 1259 const struct dentry_operations ceph_dentry_ops = {
1258 1260 .d_revalidate = ceph_d_revalidate,
1259   - .d_release = ceph_dentry_release,
  1261 + .d_release = ceph_d_release,
1260 1262 };
1261 1263  
1262 1264 const struct dentry_operations ceph_snapdir_dentry_ops = {
1263 1265 .d_revalidate = ceph_snapdir_d_revalidate,
1264   - .d_release = ceph_dentry_release,
  1266 + .d_release = ceph_d_release,
1265 1267 };
1266 1268  
1267 1269 const struct dentry_operations ceph_snap_dentry_ops = {
1268   - .d_release = ceph_dentry_release,
  1270 + .d_release = ceph_d_release,
1269 1271 };
... ... @@ -564,11 +564,19 @@
564 564 * start_request so that a tid has been assigned.
565 565 */
566 566 spin_lock(&ci->i_unsafe_lock);
567   - list_add(&req->r_unsafe_item, &ci->i_unsafe_writes);
  567 + list_add_tail(&req->r_unsafe_item,
  568 + &ci->i_unsafe_writes);
568 569 spin_unlock(&ci->i_unsafe_lock);
569 570 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
570 571 }
  572 +
571 573 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
  574 + if (ret < 0 && req->r_safe_callback) {
  575 + spin_lock(&ci->i_unsafe_lock);
  576 + list_del_init(&req->r_unsafe_item);
  577 + spin_unlock(&ci->i_unsafe_lock);
  578 + ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
  579 + }
572 580 }
573 581  
574 582 if (file->f_flags & O_DIRECT)
... ... @@ -36,6 +36,13 @@
36 36 /*
37 37 * find or create an inode, given the ceph ino number
38 38 */
  39 +static int ceph_set_ino_cb(struct inode *inode, void *data)
  40 +{
  41 + ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
  42 + inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
  43 + return 0;
  44 +}
  45 +
39 46 struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
40 47 {
41 48 struct inode *inode;
... ... @@ -1030,9 +1037,6 @@
1030 1037 dout("fill_trace doing d_move %p -> %p\n",
1031 1038 req->r_old_dentry, dn);
1032 1039  
1033   - /* d_move screws up d_subdirs order */
1034   - ceph_i_clear(dir, CEPH_I_COMPLETE);
1035   -
1036 1040 d_move(req->r_old_dentry, dn);
1037 1041 dout(" src %p '%.*s' dst %p '%.*s'\n",
1038 1042 req->r_old_dentry,
1039 1043  
... ... @@ -1044,12 +1048,15 @@
1044 1048 rehashing bug in vfs_rename_dir */
1045 1049 ceph_invalidate_dentry_lease(dn);
1046 1050  
1047   - /* take overwritten dentry's readdir offset */
1048   - dout("dn %p gets %p offset %lld (old offset %lld)\n",
1049   - req->r_old_dentry, dn, ceph_dentry(dn)->offset,
  1051 + /*
  1052 + * d_move() puts the renamed dentry at the end of
  1053 + * d_subdirs. We need to assign it an appropriate
  1054 + * directory offset so we can behave when holding
  1055 + * I_COMPLETE.
  1056 + */
  1057 + ceph_set_dentry_offset(req->r_old_dentry);
  1058 + dout("dn %p gets new offset %lld\n", req->r_old_dentry,
1050 1059 ceph_dentry(req->r_old_dentry)->offset);
1051   - ceph_dentry(req->r_old_dentry)->offset =
1052   - ceph_dentry(dn)->offset;
1053 1060  
1054 1061 dn = req->r_old_dentry; /* use old_dentry */
1055 1062 in = dn->d_inode;
... ... @@ -1809,7 +1816,7 @@
1809 1816 err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
1810 1817 if (!err) {
1811 1818 generic_fillattr(inode, stat);
1812   - stat->ino = inode->i_ino;
  1819 + stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
1813 1820 if (ceph_snap(inode) != CEPH_NOSNAP)
1814 1821 stat->dev = ceph_snap(inode);
1815 1822 else
... ... @@ -131,6 +131,7 @@
131 131 Opt_rbytes,
132 132 Opt_norbytes,
133 133 Opt_noasyncreaddir,
  134 + Opt_ino32,
134 135 };
135 136  
136 137 static match_table_t fsopt_tokens = {
... ... @@ -150,6 +151,7 @@
150 151 {Opt_rbytes, "rbytes"},
151 152 {Opt_norbytes, "norbytes"},
152 153 {Opt_noasyncreaddir, "noasyncreaddir"},
  154 + {Opt_ino32, "ino32"},
153 155 {-1, NULL}
154 156 };
155 157  
... ... @@ -225,6 +227,9 @@
225 227 case Opt_noasyncreaddir:
226 228 fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
227 229 break;
  230 + case Opt_ino32:
  231 + fsopt->flags |= CEPH_MOUNT_OPT_INO32;
  232 + break;
228 233 default:
229 234 BUG_ON(token);
230 235 }
... ... @@ -288,7 +293,7 @@
288 293 fsopt->sb_flags = flags;
289 294 fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
290 295  
291   - fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
  296 + fsopt->rsize = CEPH_RSIZE_DEFAULT;
292 297 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
293 298 fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
294 299 fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
... ... @@ -370,7 +375,7 @@
370 375  
371 376 if (fsopt->wsize)
372 377 seq_printf(m, ",wsize=%d", fsopt->wsize);
373   - if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
  378 + if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
374 379 seq_printf(m, ",rsize=%d", fsopt->rsize);
375 380 if (fsopt->congestion_kb != default_congestion_kb())
376 381 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
... ... @@ -27,6 +27,7 @@
27 27 #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
28 28 #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
29 29 #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
  30 +#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */
30 31  
31 32 #define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
32 33  
... ... @@ -35,6 +36,7 @@
35 36 #define ceph_test_mount_opt(fsc, opt) \
36 37 (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
37 38  
  39 +#define CEPH_RSIZE_DEFAULT (512*1024) /* readahead */
38 40 #define CEPH_MAX_READDIR_DEFAULT 1024
39 41 #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
40 42 #define CEPH_SNAPDIRNAME_DEFAULT ".snap"
... ... @@ -319,6 +321,16 @@
319 321 return container_of(inode, struct ceph_inode_info, vfs_inode);
320 322 }
321 323  
  324 +static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
  325 +{
  326 + return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
  327 +}
  328 +
  329 +static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
  330 +{
  331 + return (struct ceph_fs_client *)sb->s_fs_info;
  332 +}
  333 +
322 334 static inline struct ceph_vino ceph_vino(struct inode *inode)
323 335 {
324 336 return ceph_inode(inode)->i_vino;
325 337  
326 338  
327 339  
... ... @@ -327,19 +339,49 @@
327 339 /*
328 340 * ino_t is <64 bits on many architectures, blech.
329 341 *
330   - * don't include snap in ino hash, at least for now.
  342 + * i_ino (kernel inode) st_ino (userspace)
  343 + * i386 32 32
  344 + * x86_64+ino32 64 32
  345 + * x86_64 64 64
331 346 */
  347 +static inline u32 ceph_ino_to_ino32(ino_t ino)
  348 +{
  349 + ino ^= ino >> (sizeof(ino) * 8 - 32);
  350 + if (!ino)
  351 + ino = 1;
  352 + return ino;
  353 +}
  354 +
  355 +/*
  356 + * kernel i_ino value
  357 + */
332 358 static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
333 359 {
334 360 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
335 361 #if BITS_PER_LONG == 32
336   - ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
337   - if (!ino)
338   - ino = 1;
  362 + ino = ceph_ino_to_ino32(ino);
339 363 #endif
340 364 return ino;
341 365 }
342 366  
  367 +/*
  368 + * user-visible ino (stat, filldir)
  369 + */
  370 +#if BITS_PER_LONG == 32
  371 +static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
  372 +{
  373 + return ino;
  374 +}
  375 +#else
  376 +static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
  377 +{
  378 + if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
  379 + ino = ceph_ino_to_ino32(ino);
  380 + return ino;
  381 +}
  382 +#endif
  383 +
  384 +
343 385 /* for printf-style formatting */
344 386 #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
345 387  
... ... @@ -428,13 +470,6 @@
428 470 return ((loff_t)frag << 32) | (loff_t)off;
429 471 }
430 472  
431   -static inline int ceph_set_ino_cb(struct inode *inode, void *data)
432   -{
433   - ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
434   - inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
435   - return 0;
436   -}
437   -
438 473 /*
439 474 * caps helpers
440 475 */
... ... @@ -503,15 +538,6 @@
503 538 int *total, int *avail, int *used,
504 539 int *reserved, int *min);
505 540  
506   -static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
507   -{
508   - return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
509   -}
510   -
511   -static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
512   -{
513   - return (struct ceph_fs_client *)sb->s_fs_info;
514   -}
515 541  
516 542  
517 543 /*
include/linux/ceph/ceph_fs.h
... ... @@ -136,10 +136,19 @@
136 136  
137 137  
138 138 /* osd */
139   -#define CEPH_MSG_OSD_MAP 41
140   -#define CEPH_MSG_OSD_OP 42
141   -#define CEPH_MSG_OSD_OPREPLY 43
  139 +#define CEPH_MSG_OSD_MAP 41
  140 +#define CEPH_MSG_OSD_OP 42
  141 +#define CEPH_MSG_OSD_OPREPLY 43
  142 +#define CEPH_MSG_WATCH_NOTIFY 44
142 143  
  144 +
  145 +/* watch-notify operations */
  146 +enum {
  147 + WATCH_NOTIFY = 1, /* notifying watcher */
  148 + WATCH_NOTIFY_COMPLETE = 2, /* notifier notified when done */
  149 +};
  150 +
  151 +
143 152 /* pool operations */
144 153 enum {
145 154 POOL_OP_CREATE = 0x01,
146 155  
... ... @@ -213,8 +222,10 @@
213 222 struct ceph_mon_request_header monhdr;
214 223 } __attribute__ ((packed));
215 224  
  225 +#define CEPH_SUBSCRIBE_ONETIME 1 /* i want only 1 update after have */
  226 +
216 227 struct ceph_mon_subscribe_item {
217   - __le64 have_version; __le64 have;
  228 + __le64 have_version; __le64 have;
218 229 __u8 onetime;
219 230 } __attribute__ ((packed));
220 231  
include/linux/ceph/libceph.h
... ... @@ -71,7 +71,6 @@
71 71 #define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
72 72 #define CEPH_OSD_KEEPALIVE_DEFAULT 5
73 73 #define CEPH_OSD_IDLE_TTL_DEFAULT 60
74   -#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
75 74  
76 75 #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
77 76 #define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
include/linux/ceph/osd_client.h
... ... @@ -32,6 +32,7 @@
32 32 struct rb_node o_node;
33 33 struct ceph_connection o_con;
34 34 struct list_head o_requests;
  35 + struct list_head o_linger_requests;
35 36 struct list_head o_osd_lru;
36 37 struct ceph_authorizer *o_authorizer;
37 38 void *o_authorizer_buf, *o_authorizer_reply_buf;
... ... @@ -47,6 +48,8 @@
47 48 struct rb_node r_node;
48 49 struct list_head r_req_lru_item;
49 50 struct list_head r_osd_item;
  51 + struct list_head r_linger_item;
  52 + struct list_head r_linger_osd;
50 53 struct ceph_osd *r_osd;
51 54 struct ceph_pg r_pgid;
52 55 int r_pg_osds[CEPH_PG_MAX_SIZE];
... ... @@ -59,6 +62,7 @@
59 62 int r_flags; /* any additional flags for the osd */
60 63 u32 r_sent; /* >0 if r_request is sending/sent */
61 64 int r_got_reply;
  65 + int r_linger;
62 66  
63 67 struct ceph_osd_client *r_osdc;
64 68 struct kref r_kref;
... ... @@ -74,7 +78,6 @@
74 78 char r_oid[40]; /* object name */
75 79 int r_oid_len;
76 80 unsigned long r_stamp; /* send OR check time */
77   - bool r_resend; /* msg send failed, needs retry */
78 81  
79 82 struct ceph_file_layout r_file_layout;
80 83 struct ceph_snap_context *r_snapc; /* snap context for writes */
... ... @@ -90,6 +93,26 @@
90 93 struct ceph_pagelist *r_trail; /* trailing part of the data */
91 94 };
92 95  
  96 +struct ceph_osd_event {
  97 + u64 cookie;
  98 + int one_shot;
  99 + struct ceph_osd_client *osdc;
  100 + void (*cb)(u64, u64, u8, void *);
  101 + void *data;
  102 + struct rb_node node;
  103 + struct list_head osd_node;
  104 + struct kref kref;
  105 + struct completion completion;
  106 +};
  107 +
  108 +struct ceph_osd_event_work {
  109 + struct work_struct work;
  110 + struct ceph_osd_event *event;
  111 + u64 ver;
  112 + u64 notify_id;
  113 + u8 opcode;
  114 +};
  115 +
93 116 struct ceph_osd_client {
94 117 struct ceph_client *client;
95 118  
... ... @@ -104,7 +127,10 @@
104 127 u64 timeout_tid; /* tid of timeout triggering rq */
105 128 u64 last_tid; /* tid of last request */
106 129 struct rb_root requests; /* pending requests */
107   - struct list_head req_lru; /* pending requests lru */
  130 + struct list_head req_lru; /* in-flight lru */
  131 + struct list_head req_unsent; /* unsent/need-resend queue */
  132 + struct list_head req_notarget; /* map to no osd */
  133 + struct list_head req_linger; /* lingering requests */
108 134 int num_requests;
109 135 struct delayed_work timeout_work;
110 136 struct delayed_work osds_timeout_work;
... ... @@ -116,6 +142,12 @@
116 142  
117 143 struct ceph_msgpool msgpool_op;
118 144 struct ceph_msgpool msgpool_op_reply;
  145 +
  146 + spinlock_t event_lock;
  147 + struct rb_root event_tree;
  148 + u64 event_count;
  149 +
  150 + struct workqueue_struct *notify_wq;
119 151 };
120 152  
121 153 struct ceph_osd_req_op {
... ... @@ -150,6 +182,13 @@
150 182 struct {
151 183 u64 snapid;
152 184 } snap;
  185 + struct {
  186 + u64 cookie;
  187 + u64 ver;
  188 + __u8 flag;
  189 + u32 prot_ver;
  190 + u32 timeout;
  191 + } watch;
153 192 };
154 193 u32 payload_len;
155 194 };
... ... @@ -198,6 +237,11 @@
198 237 bool use_mempool, int num_reply,
199 238 int page_align);
200 239  
  240 +extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
  241 + struct ceph_osd_request *req);
  242 +extern void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc,
  243 + struct ceph_osd_request *req);
  244 +
201 245 static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
202 246 {
203 247 kref_get(&req->r_kref);
... ... @@ -233,5 +277,14 @@
233 277 struct page **pages, int nr_pages,
234 278 int flags, int do_sync, bool nofail);
235 279  
  280 +/* watch/notify events */
  281 +extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
  282 + void (*event_cb)(u64, u64, u8, void *),
  283 + int one_shot, void *data,
  284 + struct ceph_osd_event **pevent);
  285 +extern void ceph_osdc_cancel_event(struct ceph_osd_event *event);
  286 +extern int ceph_osdc_wait_event(struct ceph_osd_event *event,
  287 + unsigned long timeout);
  288 +extern void ceph_osdc_put_event(struct ceph_osd_event *event);
236 289 #endif
include/linux/ceph/rados.h
... ... @@ -12,9 +12,9 @@
12 12 * osdmap encoding versions
13 13 */
14 14 #define CEPH_OSDMAP_INC_VERSION 5
15   -#define CEPH_OSDMAP_INC_VERSION_EXT 5
  15 +#define CEPH_OSDMAP_INC_VERSION_EXT 6
16 16 #define CEPH_OSDMAP_VERSION 5
17   -#define CEPH_OSDMAP_VERSION_EXT 5
  17 +#define CEPH_OSDMAP_VERSION_EXT 6
18 18  
19 19 /*
20 20 * fs id
21 21  
22 22  
... ... @@ -181,10 +181,18 @@
181 181 /* read */
182 182 CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
183 183 CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
  184 + CEPH_OSD_OP_MAPEXT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 3,
184 185  
185 186 /* fancy read */
186   - CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
  187 + CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
  188 + CEPH_OSD_OP_SPARSE_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 5,
187 189  
  190 + CEPH_OSD_OP_NOTIFY = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 6,
  191 + CEPH_OSD_OP_NOTIFY_ACK = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 7,
  192 +
  193 + /* versioning */
  194 + CEPH_OSD_OP_ASSERT_VER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 8,
  195 +
188 196 /* write */
189 197 CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
190 198 CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
... ... @@ -205,6 +213,8 @@
205 213 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
206 214 CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14,
207 215  
  216 + CEPH_OSD_OP_WATCH = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 15,
  217 +
208 218 /** attrs **/
209 219 /* read */
210 220 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
... ... @@ -218,11 +228,14 @@
218 228 CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
219 229  
220 230 /** subop **/
221   - CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
222   - CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
223   - CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
224   - CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
225   - CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
  231 + CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
  232 + CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
  233 + CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
  234 + CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
  235 + CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
  236 + CEPH_OSD_OP_SCRUB_RESERVE = CEPH_OSD_OP_MODE_SUB | 6,
  237 + CEPH_OSD_OP_SCRUB_UNRESERVE = CEPH_OSD_OP_MODE_SUB | 7,
  238 + CEPH_OSD_OP_SCRUB_STOP = CEPH_OSD_OP_MODE_SUB | 8,
226 239  
227 240 /** lock **/
228 241 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
... ... @@ -328,6 +341,8 @@
328 341 CEPH_OSD_CMPXATTR_MODE_U64 = 2
329 342 };
330 343  
  344 +#define RADOS_NOTIFY_VER 1
  345 +
331 346 /*
332 347 * an individual object operation. each may be accompanied by some data
333 348 * payload
... ... @@ -359,7 +374,12 @@
359 374 struct {
360 375 __le64 snapid;
361 376 } __attribute__ ((packed)) snap;
362   - };
  377 + struct {
  378 + __le64 cookie;
  379 + __le64 ver;
  380 + __u8 flag; /* 0 = unwatch, 1 = watch */
  381 + } __attribute__ ((packed)) watch;
  382 +};
363 383 __le32 payload_len;
364 384 } __attribute__ ((packed));
365 385  
... ... @@ -400,6 +420,7 @@
400 420 __le32 num_ops;
401 421 struct ceph_osd_op ops[0]; /* ops[], object */
402 422 } __attribute__ ((packed));
  423 +
403 424  
404 425  
405 426 #endif
... ... @@ -78,8 +78,10 @@
78 78 while (src < end) {
79 79 int a, b, c, d;
80 80  
81   - if (src < end && src[0] == '\n')
  81 + if (src[0] == '\n') {
82 82 src++;
  83 + continue;
  84 + }
83 85 if (src + 4 > end)
84 86 return -EINVAL;
85 87 a = decode_bits(src[0]);
net/ceph/ceph_common.c
... ... @@ -62,6 +62,7 @@
62 62 case CEPH_MSG_OSD_MAP: return "osd_map";
63 63 case CEPH_MSG_OSD_OP: return "osd_op";
64 64 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
  65 + case CEPH_MSG_WATCH_NOTIFY: return "watch_notify";
65 66 default: return "unknown";
66 67 }
67 68 }
net/ceph/osd_client.c
... ... @@ -22,10 +22,15 @@
22 22 #define OSD_OPREPLY_FRONT_LEN 512
23 23  
24 24 static const struct ceph_connection_operations osd_con_ops;
25   -static int __kick_requests(struct ceph_osd_client *osdc,
26   - struct ceph_osd *kickosd);
27 25  
28   -static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
  26 +static void send_queued(struct ceph_osd_client *osdc);
  27 +static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd);
  28 +static void __register_request(struct ceph_osd_client *osdc,
  29 + struct ceph_osd_request *req);
  30 +static void __unregister_linger_request(struct ceph_osd_client *osdc,
  31 + struct ceph_osd_request *req);
  32 +static int __send_request(struct ceph_osd_client *osdc,
  33 + struct ceph_osd_request *req);
29 34  
30 35 static int op_needs_trail(int op)
31 36 {
... ... @@ -34,6 +39,7 @@
34 39 case CEPH_OSD_OP_SETXATTR:
35 40 case CEPH_OSD_OP_CMPXATTR:
36 41 case CEPH_OSD_OP_CALL:
  42 + case CEPH_OSD_OP_NOTIFY:
37 43 return 1;
38 44 default:
39 45 return 0;
... ... @@ -209,6 +215,8 @@
209 215 init_completion(&req->r_completion);
210 216 init_completion(&req->r_safe_completion);
211 217 INIT_LIST_HEAD(&req->r_unsafe_item);
  218 + INIT_LIST_HEAD(&req->r_linger_item);
  219 + INIT_LIST_HEAD(&req->r_linger_osd);
212 220 req->r_flags = flags;
213 221  
214 222 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
... ... @@ -315,6 +323,24 @@
315 323 break;
316 324 case CEPH_OSD_OP_STARTSYNC:
317 325 break;
  326 + case CEPH_OSD_OP_NOTIFY:
  327 + {
  328 + __le32 prot_ver = cpu_to_le32(src->watch.prot_ver);
  329 + __le32 timeout = cpu_to_le32(src->watch.timeout);
  330 +
  331 + BUG_ON(!req->r_trail);
  332 +
  333 + ceph_pagelist_append(req->r_trail,
  334 + &prot_ver, sizeof(prot_ver));
  335 + ceph_pagelist_append(req->r_trail,
  336 + &timeout, sizeof(timeout));
  337 + }
  338 + case CEPH_OSD_OP_NOTIFY_ACK:
  339 + case CEPH_OSD_OP_WATCH:
  340 + dst->watch.cookie = cpu_to_le64(src->watch.cookie);
  341 + dst->watch.ver = cpu_to_le64(src->watch.ver);
  342 + dst->watch.flag = src->watch.flag;
  343 + break;
318 344 default:
319 345 pr_err("unrecognized osd opcode %d\n", dst->op);
320 346 WARN_ON(1);
321 347  
... ... @@ -529,7 +555,46 @@
529 555 return NULL;
530 556 }
531 557  
  558 +/*
  559 + * Resubmit requests pending on the given osd.
  560 + */
  561 +static void __kick_osd_requests(struct ceph_osd_client *osdc,
  562 + struct ceph_osd *osd)
  563 +{
  564 + struct ceph_osd_request *req, *nreq;
  565 + int err;
532 566  
  567 + dout("__kick_osd_requests osd%d\n", osd->o_osd);
  568 + err = __reset_osd(osdc, osd);
  569 + if (err == -EAGAIN)
  570 + return;
  571 +
  572 + list_for_each_entry(req, &osd->o_requests, r_osd_item) {
  573 + list_move(&req->r_req_lru_item, &osdc->req_unsent);
  574 + dout("requeued %p tid %llu osd%d\n", req, req->r_tid,
  575 + osd->o_osd);
  576 + if (!req->r_linger)
  577 + req->r_flags |= CEPH_OSD_FLAG_RETRY;
  578 + }
  579 +
  580 + list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
  581 + r_linger_osd) {
  582 + __unregister_linger_request(osdc, req);
  583 + __register_request(osdc, req);
  584 + list_move(&req->r_req_lru_item, &osdc->req_unsent);
  585 + dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid,
  586 + osd->o_osd);
  587 + }
  588 +}
  589 +
  590 +static void kick_osd_requests(struct ceph_osd_client *osdc,
  591 + struct ceph_osd *kickosd)
  592 +{
  593 + mutex_lock(&osdc->request_mutex);
  594 + __kick_osd_requests(osdc, kickosd);
  595 + mutex_unlock(&osdc->request_mutex);
  596 +}
  597 +
533 598 /*
534 599 * If the osd connection drops, we need to resubmit all requests.
535 600 */
... ... @@ -543,7 +608,8 @@
543 608 dout("osd_reset osd%d\n", osd->o_osd);
544 609 osdc = osd->o_osdc;
545 610 down_read(&osdc->map_sem);
546   - kick_requests(osdc, osd);
  611 + kick_osd_requests(osdc, osd);
  612 + send_queued(osdc);
547 613 up_read(&osdc->map_sem);
548 614 }
549 615  
... ... @@ -561,6 +627,7 @@
561 627 atomic_set(&osd->o_ref, 1);
562 628 osd->o_osdc = osdc;
563 629 INIT_LIST_HEAD(&osd->o_requests);
  630 + INIT_LIST_HEAD(&osd->o_linger_requests);
564 631 INIT_LIST_HEAD(&osd->o_osd_lru);
565 632 osd->o_incarnation = 1;
566 633  
... ... @@ -650,7 +717,8 @@
650 717 int ret = 0;
651 718  
652 719 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
653   - if (list_empty(&osd->o_requests)) {
  720 + if (list_empty(&osd->o_requests) &&
  721 + list_empty(&osd->o_linger_requests)) {
654 722 __remove_osd(osdc, osd);
655 723 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
656 724 &osd->o_con.peer_addr,
657 725  
... ... @@ -723,10 +791,9 @@
723 791 * Register request, assign tid. If this is the first request, set up
724 792 * the timeout event.
725 793 */
726   -static void register_request(struct ceph_osd_client *osdc,
727   - struct ceph_osd_request *req)
  794 +static void __register_request(struct ceph_osd_client *osdc,
  795 + struct ceph_osd_request *req)
728 796 {
729   - mutex_lock(&osdc->request_mutex);
730 797 req->r_tid = ++osdc->last_tid;
731 798 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
732 799 INIT_LIST_HEAD(&req->r_req_lru_item);
... ... @@ -740,6 +807,13 @@
740 807 dout(" first request, scheduling timeout\n");
741 808 __schedule_osd_timeout(osdc);
742 809 }
  810 +}
  811 +
  812 +static void register_request(struct ceph_osd_client *osdc,
  813 + struct ceph_osd_request *req)
  814 +{
  815 + mutex_lock(&osdc->request_mutex);
  816 + __register_request(osdc, req);
743 817 mutex_unlock(&osdc->request_mutex);
744 818 }
745 819  
746 820  
... ... @@ -758,9 +832,14 @@
758 832 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
759 833  
760 834 list_del_init(&req->r_osd_item);
761   - if (list_empty(&req->r_osd->o_requests))
  835 + if (list_empty(&req->r_osd->o_requests) &&
  836 + list_empty(&req->r_osd->o_linger_requests)) {
  837 + dout("moving osd to %p lru\n", req->r_osd);
762 838 __move_osd_to_lru(osdc, req->r_osd);
763   - req->r_osd = NULL;
  839 + }
  840 + if (list_empty(&req->r_osd_item) &&
  841 + list_empty(&req->r_linger_item))
  842 + req->r_osd = NULL;
764 843 }
765 844  
766 845 ceph_osdc_put_request(req);
767 846  
768 847  
769 848  
... ... @@ -781,20 +860,72 @@
781 860 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
782 861 req->r_sent = 0;
783 862 }
784   - list_del_init(&req->r_req_lru_item);
785 863 }
786 864  
  865 +static void __register_linger_request(struct ceph_osd_client *osdc,
  866 + struct ceph_osd_request *req)
  867 +{
  868 + dout("__register_linger_request %p\n", req);
  869 + list_add_tail(&req->r_linger_item, &osdc->req_linger);
  870 + list_add_tail(&req->r_linger_osd, &req->r_osd->o_linger_requests);
  871 +}
  872 +
  873 +static void __unregister_linger_request(struct ceph_osd_client *osdc,
  874 + struct ceph_osd_request *req)
  875 +{
  876 + dout("__unregister_linger_request %p\n", req);
  877 + if (req->r_osd) {
  878 + list_del_init(&req->r_linger_item);
  879 + list_del_init(&req->r_linger_osd);
  880 +
  881 + if (list_empty(&req->r_osd->o_requests) &&
  882 + list_empty(&req->r_osd->o_linger_requests)) {
  883 + dout("moving osd to %p lru\n", req->r_osd);
  884 + __move_osd_to_lru(osdc, req->r_osd);
  885 + }
  886 + req->r_osd = NULL;
  887 + }
  888 +}
  889 +
  890 +void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc,
  891 + struct ceph_osd_request *req)
  892 +{
  893 + mutex_lock(&osdc->request_mutex);
  894 + if (req->r_linger) {
  895 + __unregister_linger_request(osdc, req);
  896 + ceph_osdc_put_request(req);
  897 + }
  898 + mutex_unlock(&osdc->request_mutex);
  899 +}
  900 +EXPORT_SYMBOL(ceph_osdc_unregister_linger_request);
  901 +
  902 +void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
  903 + struct ceph_osd_request *req)
  904 +{
  905 + if (!req->r_linger) {
  906 + dout("set_request_linger %p\n", req);
  907 + req->r_linger = 1;
  908 + /*
  909 + * caller is now responsible for calling
  910 + * unregister_linger_request
  911 + */
  912 + ceph_osdc_get_request(req);
  913 + }
  914 +}
  915 +EXPORT_SYMBOL(ceph_osdc_set_request_linger);
  916 +
787 917 /*
788 918 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
789 919 * (as needed), and set the request r_osd appropriately. If there is
790   - * no up osd, set r_osd to NULL.
 790   - * no up osd, set r_osd to NULL. Move the request to the appropriate list
  921 + * (unsent, homeless) or leave on in-flight lru.
791 922 *
792 923 * Return 0 if unchanged, 1 if changed, or negative on error.
793 924 *
794 925 * Caller should hold map_sem for read and request_mutex.
795 926 */
796   -static int __map_osds(struct ceph_osd_client *osdc,
797   - struct ceph_osd_request *req)
  927 +static int __map_request(struct ceph_osd_client *osdc,
  928 + struct ceph_osd_request *req)
798 929 {
799 930 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
800 931 struct ceph_pg pgid;
801 932  
802 933  
... ... @@ -802,11 +933,13 @@
802 933 int o = -1, num = 0;
803 934 int err;
804 935  
805   - dout("map_osds %p tid %lld\n", req, req->r_tid);
  936 + dout("map_request %p tid %lld\n", req, req->r_tid);
806 937 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
807 938 &req->r_file_layout, osdc->osdmap);
808   - if (err)
  939 + if (err) {
  940 + list_move(&req->r_req_lru_item, &osdc->req_notarget);
809 941 return err;
  942 + }
810 943 pgid = reqhead->layout.ol_pgid;
811 944 req->r_pgid = pgid;
812 945  
... ... @@ -823,7 +956,7 @@
823 956 (req->r_osd == NULL && o == -1))
824 957 return 0; /* no change */
825 958  
826   - dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
  959 + dout("map_request tid %llu pgid %d.%x osd%d (was osd%d)\n",
827 960 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
828 961 req->r_osd ? req->r_osd->o_osd : -1);
829 962  
830 963  
831 964  
... ... @@ -841,10 +974,12 @@
841 974 if (!req->r_osd && o >= 0) {
842 975 err = -ENOMEM;
843 976 req->r_osd = create_osd(osdc);
844   - if (!req->r_osd)
  977 + if (!req->r_osd) {
  978 + list_move(&req->r_req_lru_item, &osdc->req_notarget);
845 979 goto out;
  980 + }
846 981  
847   - dout("map_osds osd %p is osd%d\n", req->r_osd, o);
  982 + dout("map_request osd %p is osd%d\n", req->r_osd, o);
848 983 req->r_osd->o_osd = o;
849 984 req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
850 985 __insert_osd(osdc, req->r_osd);
... ... @@ -855,6 +990,9 @@
855 990 if (req->r_osd) {
856 991 __remove_osd_from_lru(req->r_osd);
857 992 list_add(&req->r_osd_item, &req->r_osd->o_requests);
  993 + list_move(&req->r_req_lru_item, &osdc->req_unsent);
  994 + } else {
  995 + list_move(&req->r_req_lru_item, &osdc->req_notarget);
858 996 }
859 997 err = 1; /* osd or pg changed */
860 998  
861 999  
... ... @@ -869,17 +1007,7 @@
869 1007 struct ceph_osd_request *req)
870 1008 {
871 1009 struct ceph_osd_request_head *reqhead;
872   - int err;
873 1010  
874   - err = __map_osds(osdc, req);
875   - if (err < 0)
876   - return err;
877   - if (req->r_osd == NULL) {
878   - dout("send_request %p no up osds in pg\n", req);
879   - ceph_monc_request_next_osdmap(&osdc->client->monc);
880   - return 0;
881   - }
882   -
883 1011 dout("send_request %p tid %llu to osd%d flags %d\n",
884 1012 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
885 1013  
... ... @@ -898,6 +1026,21 @@
898 1026 }
899 1027  
900 1028 /*
  1029 + * Send any requests in the queue (req_unsent).
  1030 + */
  1031 +static void send_queued(struct ceph_osd_client *osdc)
  1032 +{
  1033 + struct ceph_osd_request *req, *tmp;
  1034 +
  1035 + dout("send_queued\n");
  1036 + mutex_lock(&osdc->request_mutex);
  1037 + list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) {
  1038 + __send_request(osdc, req);
  1039 + }
  1040 + mutex_unlock(&osdc->request_mutex);
  1041 +}
  1042 +
  1043 +/*
901 1044 * Timeout callback, called every N seconds when 1 or more osd
902 1045 * requests has been active for more than N seconds. When this
903 1046 * happens, we ping all OSDs with requests who have timed out to
904 1047  
905 1048  
906 1049  
... ... @@ -916,31 +1059,14 @@
916 1059 unsigned long keepalive =
917 1060 osdc->client->options->osd_keepalive_timeout * HZ;
918 1061 unsigned long last_stamp = 0;
919   - struct rb_node *p;
920 1062 struct list_head slow_osds;
921   -
922 1063 dout("timeout\n");
923 1064 down_read(&osdc->map_sem);
924 1065  
925 1066 ceph_monc_request_next_osdmap(&osdc->client->monc);
926 1067  
927 1068 mutex_lock(&osdc->request_mutex);
928   - for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
929   - req = rb_entry(p, struct ceph_osd_request, r_node);
930 1069  
931   - if (req->r_resend) {
932   - int err;
933   -
934   - dout("osdc resending prev failed %lld\n", req->r_tid);
935   - err = __send_request(osdc, req);
936   - if (err)
937   - dout("osdc failed again on %lld\n", req->r_tid);
938   - else
939   - req->r_resend = false;
940   - continue;
941   - }
942   - }
943   -
944 1070 /*
945 1071 * reset osds that appear to be _really_ unresponsive. this
946 1072 * is a failsafe measure.. we really shouldn't be getting to
... ... @@ -963,7 +1089,7 @@
963 1089 BUG_ON(!osd);
964 1090 pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
965 1091 req->r_tid, osd->o_osd);
966   - __kick_requests(osdc, osd);
  1092 + __kick_osd_requests(osdc, osd);
967 1093 }
968 1094  
969 1095 /*
... ... @@ -991,7 +1117,7 @@
991 1117  
992 1118 __schedule_osd_timeout(osdc);
993 1119 mutex_unlock(&osdc->request_mutex);
994   -
  1120 + send_queued(osdc);
995 1121 up_read(&osdc->map_sem);
996 1122 }
997 1123  
... ... @@ -1035,7 +1161,6 @@
1035 1161 numops * sizeof(struct ceph_osd_op))
1036 1162 goto bad;
1037 1163 dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
1038   -
1039 1164 /* lookup */
1040 1165 mutex_lock(&osdc->request_mutex);
1041 1166 req = __lookup_request(osdc, tid);
... ... @@ -1079,6 +1204,9 @@
1079 1204  
1080 1205 dout("handle_reply tid %llu flags %d\n", tid, flags);
1081 1206  
  1207 + if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK))
  1208 + __register_linger_request(osdc, req);
  1209 +
1082 1210 /* either this is a read, or we got the safe response */
1083 1211 if (result < 0 ||
1084 1212 (flags & CEPH_OSD_FLAG_ONDISK) ||
... ... @@ -1099,6 +1227,7 @@
1099 1227 }
1100 1228  
1101 1229 done:
  1230 + dout("req=%p req->r_linger=%d\n", req, req->r_linger);
1102 1231 ceph_osdc_put_request(req);
1103 1232 return;
1104 1233  
1105 1234  
1106 1235  
1107 1236  
1108 1237  
1109 1238  
1110 1239  
1111 1240  
1112 1241  
1113 1242  
1114 1243  
1115 1244  
1116 1245  
1117 1246  
1118 1247  
1119 1248  
... ... @@ -1109,108 +1238,83 @@
1109 1238 ceph_msg_dump(msg);
1110 1239 }
1111 1240  
1112   -
1113   -static int __kick_requests(struct ceph_osd_client *osdc,
1114   - struct ceph_osd *kickosd)
  1241 +static void reset_changed_osds(struct ceph_osd_client *osdc)
1115 1242 {
1116   - struct ceph_osd_request *req;
1117 1243 struct rb_node *p, *n;
1118   - int needmap = 0;
1119   - int err;
1120 1244  
1121   - dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
1122   - if (kickosd) {
1123   - err = __reset_osd(osdc, kickosd);
1124   - if (err == -EAGAIN)
1125   - return 1;
1126   - } else {
1127   - for (p = rb_first(&osdc->osds); p; p = n) {
1128   - struct ceph_osd *osd =
1129   - rb_entry(p, struct ceph_osd, o_node);
  1245 + for (p = rb_first(&osdc->osds); p; p = n) {
  1246 + struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node);
1130 1247  
1131   - n = rb_next(p);
1132   - if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
1133   - memcmp(&osd->o_con.peer_addr,
1134   - ceph_osd_addr(osdc->osdmap,
1135   - osd->o_osd),
1136   - sizeof(struct ceph_entity_addr)) != 0)
1137   - __reset_osd(osdc, osd);
1138   - }
  1248 + n = rb_next(p);
  1249 + if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
  1250 + memcmp(&osd->o_con.peer_addr,
  1251 + ceph_osd_addr(osdc->osdmap,
  1252 + osd->o_osd),
  1253 + sizeof(struct ceph_entity_addr)) != 0)
  1254 + __reset_osd(osdc, osd);
1139 1255 }
  1256 +}
1140 1257  
  1258 +/*
  1259 + * Requeue requests whose mapping to an OSD has changed. If requests map to
  1260 + * no osd, request a new map.
  1261 + *
  1262 + * Caller should hold map_sem for read and request_mutex.
  1263 + */
  1264 +static void kick_requests(struct ceph_osd_client *osdc)
  1265 +{
  1266 + struct ceph_osd_request *req, *nreq;
  1267 + struct rb_node *p;
  1268 + int needmap = 0;
  1269 + int err;
  1270 +
  1271 + dout("kick_requests\n");
  1272 + mutex_lock(&osdc->request_mutex);
1141 1273 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
1142 1274 req = rb_entry(p, struct ceph_osd_request, r_node);
1143   -
1144   - if (req->r_resend) {
1145   - dout(" r_resend set on tid %llu\n", req->r_tid);
1146   - __cancel_request(req);
1147   - goto kick;
  1275 + err = __map_request(osdc, req);
  1276 + if (err < 0)
  1277 + continue; /* error */
  1278 + if (req->r_osd == NULL) {
  1279 + dout("%p tid %llu maps to no osd\n", req, req->r_tid);
  1280 + needmap++; /* request a newer map */
  1281 + } else if (err > 0) {
  1282 + dout("%p tid %llu requeued on osd%d\n", req, req->r_tid,
  1283 + req->r_osd ? req->r_osd->o_osd : -1);
  1284 + if (!req->r_linger)
  1285 + req->r_flags |= CEPH_OSD_FLAG_RETRY;
1148 1286 }
1149   - if (req->r_osd && kickosd == req->r_osd) {
1150   - __cancel_request(req);
1151   - goto kick;
1152   - }
  1287 + }
1153 1288  
1154   - err = __map_osds(osdc, req);
  1289 + list_for_each_entry_safe(req, nreq, &osdc->req_linger,
  1290 + r_linger_item) {
  1291 + dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);
  1292 +
  1293 + err = __map_request(osdc, req);
1155 1294 if (err == 0)
1156   - continue; /* no change */
1157   - if (err < 0) {
1158   - /*
1159   - * FIXME: really, we should set the request
1160   - * error and fail if this isn't a 'nofail'
1161   - * request, but that's a fair bit more
1162   - * complicated to do. So retry!
1163   - */
1164   - dout(" setting r_resend on %llu\n", req->r_tid);
1165   - req->r_resend = true;
1166   - continue;
1167   - }
  1295 + continue; /* no change and no osd was specified */
  1296 + if (err < 0)
  1297 + continue; /* hrm! */
1168 1298 if (req->r_osd == NULL) {
1169 1299 dout("tid %llu maps to no valid osd\n", req->r_tid);
1170 1300 needmap++; /* request a newer map */
1171 1301 continue;
1172 1302 }
1173 1303  
1174   -kick:
1175   - dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
  1304 + dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid,
1176 1305 req->r_osd ? req->r_osd->o_osd : -1);
1177   - req->r_flags |= CEPH_OSD_FLAG_RETRY;
1178   - err = __send_request(osdc, req);
1179   - if (err) {
1180   - dout(" setting r_resend on %llu\n", req->r_tid);
1181   - req->r_resend = true;
1182   - }
  1306 + __unregister_linger_request(osdc, req);
  1307 + __register_request(osdc, req);
1183 1308 }
1184   -
1185   - return needmap;
1186   -}
1187   -
1188   -/*
1189   - * Resubmit osd requests whose osd or osd address has changed. Request
1190   - * a new osd map if osds are down, or we are otherwise unable to determine
1191   - * how to direct a request.
1192   - *
1193   - * Close connections to down osds.
1194   - *
1195   - * If @who is specified, resubmit requests for that specific osd.
1196   - *
1197   - * Caller should hold map_sem for read and request_mutex.
1198   - */
1199   -static void kick_requests(struct ceph_osd_client *osdc,
1200   - struct ceph_osd *kickosd)
1201   -{
1202   - int needmap;
1203   -
1204   - mutex_lock(&osdc->request_mutex);
1205   - needmap = __kick_requests(osdc, kickosd);
1206 1309 mutex_unlock(&osdc->request_mutex);
1207 1310  
1208 1311 if (needmap) {
1209 1312 dout("%d requests for down osds, need new map\n", needmap);
1210 1313 ceph_monc_request_next_osdmap(&osdc->client->monc);
1211 1314 }
1212   -
1213 1315 }
  1316 +
  1317 +
1214 1318 /*
1215 1319 * Process updated osd map.
1216 1320 *
... ... @@ -1263,6 +1367,8 @@
1263 1367 ceph_osdmap_destroy(osdc->osdmap);
1264 1368 osdc->osdmap = newmap;
1265 1369 }
  1370 + kick_requests(osdc);
  1371 + reset_changed_osds(osdc);
1266 1372 } else {
1267 1373 dout("ignoring incremental map %u len %d\n",
1268 1374 epoch, maplen);
... ... @@ -1300,6 +1406,7 @@
1300 1406 osdc->osdmap = newmap;
1301 1407 if (oldmap)
1302 1408 ceph_osdmap_destroy(oldmap);
  1409 + kick_requests(osdc);
1303 1410 }
1304 1411 p += maplen;
1305 1412 nr_maps--;
... ... @@ -1308,8 +1415,7 @@
1308 1415 done:
1309 1416 downgrade_write(&osdc->map_sem);
1310 1417 ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
1311   - if (newmap)
1312   - kick_requests(osdc, NULL);
  1418 + send_queued(osdc);
1313 1419 up_read(&osdc->map_sem);
1314 1420 wake_up_all(&osdc->client->auth_wq);
1315 1421 return;
... ... @@ -1322,6 +1428,223 @@
1322 1428 }
1323 1429  
1324 1430 /*
  1431 + * watch/notify callback event infrastructure
  1432 + *
  1433 + * These callbacks are used both for watch and notify operations.
  1434 + */
  1435 +static void __release_event(struct kref *kref)
  1436 +{
  1437 + struct ceph_osd_event *event =
  1438 + container_of(kref, struct ceph_osd_event, kref);
  1439 +
  1440 + dout("__release_event %p\n", event);
  1441 + kfree(event);
  1442 +}
  1443 +
  1444 +static void get_event(struct ceph_osd_event *event)
  1445 +{
  1446 + kref_get(&event->kref);
  1447 +}
  1448 +
  1449 +void ceph_osdc_put_event(struct ceph_osd_event *event)
  1450 +{
  1451 + kref_put(&event->kref, __release_event);
  1452 +}
  1453 +EXPORT_SYMBOL(ceph_osdc_put_event);
  1454 +
  1455 +static void __insert_event(struct ceph_osd_client *osdc,
  1456 + struct ceph_osd_event *new)
  1457 +{
  1458 + struct rb_node **p = &osdc->event_tree.rb_node;
  1459 + struct rb_node *parent = NULL;
  1460 + struct ceph_osd_event *event = NULL;
  1461 +
  1462 + while (*p) {
  1463 + parent = *p;
  1464 + event = rb_entry(parent, struct ceph_osd_event, node);
  1465 + if (new->cookie < event->cookie)
  1466 + p = &(*p)->rb_left;
  1467 + else if (new->cookie > event->cookie)
  1468 + p = &(*p)->rb_right;
  1469 + else
  1470 + BUG();
  1471 + }
  1472 +
  1473 + rb_link_node(&new->node, parent, p);
  1474 + rb_insert_color(&new->node, &osdc->event_tree);
  1475 +}
  1476 +
  1477 +static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc,
  1478 + u64 cookie)
  1479 +{
  1480 + struct rb_node **p = &osdc->event_tree.rb_node;
  1481 + struct rb_node *parent = NULL;
  1482 + struct ceph_osd_event *event = NULL;
  1483 +
  1484 + while (*p) {
  1485 + parent = *p;
  1486 + event = rb_entry(parent, struct ceph_osd_event, node);
  1487 + if (cookie < event->cookie)
  1488 + p = &(*p)->rb_left;
  1489 + else if (cookie > event->cookie)
  1490 + p = &(*p)->rb_right;
  1491 + else
  1492 + return event;
  1493 + }
  1494 + return NULL;
  1495 +}
  1496 +
  1497 +static void __remove_event(struct ceph_osd_event *event)
  1498 +{
  1499 + struct ceph_osd_client *osdc = event->osdc;
  1500 +
  1501 + if (!RB_EMPTY_NODE(&event->node)) {
  1502 + dout("__remove_event removed %p\n", event);
  1503 + rb_erase(&event->node, &osdc->event_tree);
  1504 + ceph_osdc_put_event(event);
  1505 + } else {
  1506 + dout("__remove_event didn't remove %p\n", event);
  1507 + }
  1508 +}
  1509 +
  1510 +int ceph_osdc_create_event(struct ceph_osd_client *osdc,
  1511 + void (*event_cb)(u64, u64, u8, void *),
  1512 + int one_shot, void *data,
  1513 + struct ceph_osd_event **pevent)
  1514 +{
  1515 + struct ceph_osd_event *event;
  1516 +
  1517 + event = kmalloc(sizeof(*event), GFP_NOIO);
  1518 + if (!event)
  1519 + return -ENOMEM;
  1520 +
  1521 + dout("create_event %p\n", event);
  1522 + event->cb = event_cb;
  1523 + event->one_shot = one_shot;
  1524 + event->data = data;
  1525 + event->osdc = osdc;
  1526 + INIT_LIST_HEAD(&event->osd_node);
  1527 + kref_init(&event->kref); /* one ref for us */
  1528 + kref_get(&event->kref); /* one ref for the caller */
  1529 + init_completion(&event->completion);
  1530 +
  1531 + spin_lock(&osdc->event_lock);
  1532 + event->cookie = ++osdc->event_count;
  1533 + __insert_event(osdc, event);
  1534 + spin_unlock(&osdc->event_lock);
  1535 +
  1536 + *pevent = event;
  1537 + return 0;
  1538 +}
  1539 +EXPORT_SYMBOL(ceph_osdc_create_event);
  1540 +
  1541 +void ceph_osdc_cancel_event(struct ceph_osd_event *event)
  1542 +{
  1543 + struct ceph_osd_client *osdc = event->osdc;
  1544 +
  1545 + dout("cancel_event %p\n", event);
  1546 + spin_lock(&osdc->event_lock);
  1547 + __remove_event(event);
  1548 + spin_unlock(&osdc->event_lock);
  1549 + ceph_osdc_put_event(event); /* caller's */
  1550 +}
  1551 +EXPORT_SYMBOL(ceph_osdc_cancel_event);
  1552 +
  1553 +
  1554 +static void do_event_work(struct work_struct *work)
  1555 +{
  1556 + struct ceph_osd_event_work *event_work =
  1557 + container_of(work, struct ceph_osd_event_work, work);
  1558 + struct ceph_osd_event *event = event_work->event;
  1559 + u64 ver = event_work->ver;
  1560 + u64 notify_id = event_work->notify_id;
  1561 + u8 opcode = event_work->opcode;
  1562 +
  1563 + dout("do_event_work completing %p\n", event);
  1564 + event->cb(ver, notify_id, opcode, event->data);
  1565 + complete(&event->completion);
  1566 + dout("do_event_work completed %p\n", event);
  1567 + ceph_osdc_put_event(event);
  1568 + kfree(event_work);
  1569 +}
  1570 +
  1571 +
  1572 +/*
  1573 + * Process osd watch notifications
  1574 + */
  1575 +void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg)
  1576 +{
  1577 + void *p, *end;
  1578 + u8 proto_ver;
  1579 + u64 cookie, ver, notify_id;
  1580 + u8 opcode;
  1581 + struct ceph_osd_event *event;
  1582 + struct ceph_osd_event_work *event_work;
  1583 +
  1584 + p = msg->front.iov_base;
  1585 + end = p + msg->front.iov_len;
  1586 +
  1587 + ceph_decode_8_safe(&p, end, proto_ver, bad);
  1588 + ceph_decode_8_safe(&p, end, opcode, bad);
  1589 + ceph_decode_64_safe(&p, end, cookie, bad);
  1590 + ceph_decode_64_safe(&p, end, ver, bad);
  1591 + ceph_decode_64_safe(&p, end, notify_id, bad);
  1592 +
  1593 + spin_lock(&osdc->event_lock);
  1594 + event = __find_event(osdc, cookie);
  1595 + if (event) {
  1596 + get_event(event);
  1597 + if (event->one_shot)
  1598 + __remove_event(event);
  1599 + }
  1600 + spin_unlock(&osdc->event_lock);
  1601 + dout("handle_watch_notify cookie %lld ver %lld event %p\n",
  1602 + cookie, ver, event);
  1603 + if (event) {
  1604 + event_work = kmalloc(sizeof(*event_work), GFP_NOIO);
  1605 + INIT_WORK(&event_work->work, do_event_work);
  1606 + if (!event_work) {
  1607 + dout("ERROR: could not allocate event_work\n");
  1608 + goto done_err;
  1609 + }
  1610 + event_work->event = event;
  1611 + event_work->ver = ver;
  1612 + event_work->notify_id = notify_id;
  1613 + event_work->opcode = opcode;
  1614 + if (!queue_work(osdc->notify_wq, &event_work->work)) {
  1615 + dout("WARNING: failed to queue notify event work\n");
  1616 + goto done_err;
  1617 + }
  1618 + }
  1619 +
  1620 + return;
  1621 +
  1622 +done_err:
  1623 + complete(&event->completion);
  1624 + ceph_osdc_put_event(event);
  1625 + return;
  1626 +
  1627 +bad:
  1628 + pr_err("osdc handle_watch_notify corrupt msg\n");
  1629 + return;
  1630 +}
  1631 +
  1632 +int ceph_osdc_wait_event(struct ceph_osd_event *event, unsigned long timeout)
  1633 +{
  1634 + int err;
  1635 +
  1636 + dout("wait_event %p\n", event);
  1637 + err = wait_for_completion_interruptible_timeout(&event->completion,
  1638 + timeout * HZ);
  1639 + ceph_osdc_put_event(event);
  1640 + if (err > 0)
  1641 + err = 0;
  1642 + dout("wait_event %p returns %d\n", event, err);
  1643 + return err;
  1644 +}
  1645 +EXPORT_SYMBOL(ceph_osdc_wait_event);
  1646 +
  1647 +/*
1325 1648 * Register request, send initial attempt.
1326 1649 */
1327 1650 int ceph_osdc_start_request(struct ceph_osd_client *osdc,
... ... @@ -1347,15 +1670,22 @@
1347 1670 * the request still hasn't been touched yet.
1348 1671 */
1349 1672 if (req->r_sent == 0) {
1350   - rc = __send_request(osdc, req);
1351   - if (rc) {
1352   - if (nofail) {
1353   - dout("osdc_start_request failed send, "
1354   - " marking %lld\n", req->r_tid);
1355   - req->r_resend = true;
1356   - rc = 0;
1357   - } else {
1358   - __unregister_request(osdc, req);
  1673 + rc = __map_request(osdc, req);
  1674 + if (rc < 0)
  1675 + return rc;
  1676 + if (req->r_osd == NULL) {
  1677 + dout("send_request %p no up osds in pg\n", req);
  1678 + ceph_monc_request_next_osdmap(&osdc->client->monc);
  1679 + } else {
  1680 + rc = __send_request(osdc, req);
  1681 + if (rc) {
  1682 + if (nofail) {
  1683 + dout("osdc_start_request failed send, "
  1684 + " will retry %lld\n", req->r_tid);
  1685 + rc = 0;
  1686 + } else {
  1687 + __unregister_request(osdc, req);
  1688 + }
1359 1689 }
1360 1690 }
1361 1691 }
1362 1692  
... ... @@ -1441,9 +1771,15 @@
1441 1771 INIT_LIST_HEAD(&osdc->osd_lru);
1442 1772 osdc->requests = RB_ROOT;
1443 1773 INIT_LIST_HEAD(&osdc->req_lru);
  1774 + INIT_LIST_HEAD(&osdc->req_unsent);
  1775 + INIT_LIST_HEAD(&osdc->req_notarget);
  1776 + INIT_LIST_HEAD(&osdc->req_linger);
1444 1777 osdc->num_requests = 0;
1445 1778 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
1446 1779 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
  1780 + spin_lock_init(&osdc->event_lock);
  1781 + osdc->event_tree = RB_ROOT;
  1782 + osdc->event_count = 0;
1447 1783  
1448 1784 schedule_delayed_work(&osdc->osds_timeout_work,
1449 1785 round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ));
... ... @@ -1463,6 +1799,13 @@
1463 1799 "osd_op_reply");
1464 1800 if (err < 0)
1465 1801 goto out_msgpool;
  1802 +
  1803 + osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify");
  1804 + if (IS_ERR(osdc->notify_wq)) {
  1805 + err = PTR_ERR(osdc->notify_wq);
  1806 + osdc->notify_wq = NULL;
  1807 + goto out_msgpool;
  1808 + }
1466 1809 return 0;
1467 1810  
1468 1811 out_msgpool:
... ... @@ -1476,6 +1819,8 @@
1476 1819  
1477 1820 void ceph_osdc_stop(struct ceph_osd_client *osdc)
1478 1821 {
  1822 + flush_workqueue(osdc->notify_wq);
  1823 + destroy_workqueue(osdc->notify_wq);
1479 1824 cancel_delayed_work_sync(&osdc->timeout_work);
1480 1825 cancel_delayed_work_sync(&osdc->osds_timeout_work);
1481 1826 if (osdc->osdmap) {
... ... @@ -1483,6 +1828,7 @@
1483 1828 osdc->osdmap = NULL;
1484 1829 }
1485 1830 remove_old_osds(osdc, 1);
  1831 + WARN_ON(!RB_EMPTY_ROOT(&osdc->osds));
1486 1832 mempool_destroy(osdc->req_mempool);
1487 1833 ceph_msgpool_destroy(&osdc->msgpool_op);
1488 1834 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
... ... @@ -1591,6 +1937,9 @@
1591 1937 case CEPH_MSG_OSD_OPREPLY:
1592 1938 handle_reply(osdc, msg, con);
1593 1939 break;
  1940 + case CEPH_MSG_WATCH_NOTIFY:
  1941 + handle_watch_notify(osdc, msg);
  1942 + break;
1594 1943  
1595 1944 default:
1596 1945 pr_err("received unknown message type %d %s\n", type,
... ... @@ -1684,6 +2033,7 @@
1684 2033  
1685 2034 switch (type) {
1686 2035 case CEPH_MSG_OSD_MAP:
  2036 + case CEPH_MSG_WATCH_NOTIFY:
1687 2037 return ceph_msg_new(type, front, GFP_NOFS);
1688 2038 case CEPH_MSG_OSD_OPREPLY:
1689 2039 return get_reply(con, hdr, skip);