Commit 8b3e1a56982d0eafff0afb0ff9e87c8b944a9bdc

Authored by Alex Elder
Committed by Sage Weil
1 parent 2f82ee54d9

rbd: implement layered reads

Implement layered read requests for format 2 rbd images.

If an rbd image is a clone of a snapshot, the snapshot will be the
clone's "parent" image.  When an object read request on a clone
comes back with ENOENT, it indicates that the clone is not yet
populated with that portion of the image's data, and the parent
image should be consulted to satisfy the read.

When this occurs, a new image request is created, directed to the
parent image.  The offset and length of the new image request are
the same as the image-relative offset and length of the object
request that produced ENOENT.  Data from the parent image therefore
satisfies the object read request for the original image request.

While this code works, it will not be active until we enable the
layering feature (by adding RBD_FEATURE_LAYERING to the value of
RBD_FEATURES_SUPPORTED).

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

Showing 1 changed file with 85 additions and 12 deletions Side-by-side Diff

... ... @@ -398,6 +398,8 @@
398 398 # define rbd_assert(expr) ((void) 0)
399 399 #endif /* !RBD_DEBUG */
400 400  
  401 +static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
  402 +
401 403 static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
402 404 static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
403 405  
... ... @@ -1336,9 +1338,15 @@
1336 1338  
1337 1339 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1338 1340 {
1339   - dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
1340   - obj_request->result, obj_request->xferred, obj_request->length);
1341   - if (obj_request->img_request)
  1341 + struct rbd_img_request *img_request = obj_request->img_request;
  1342 + bool layered = img_request && img_request_layered_test(img_request);
  1343 +
  1344 + dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
  1345 + obj_request, img_request, obj_request->result,
  1346 + obj_request->xferred, obj_request->length);
  1347 + if (layered && obj_request->result == -ENOENT)
  1348 + rbd_img_parent_read(obj_request);
  1349 + else if (img_request)
1342 1350 rbd_img_obj_request_read_callback(obj_request);
1343 1351 else
1344 1352 obj_request_done_set(obj_request);
... ... @@ -1349,9 +1357,8 @@
1349 1357 dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1350 1358 obj_request->result, obj_request->length);
1351 1359 /*
1352   - * There is no such thing as a successful short write.
1353   - * Our xferred value is the number of bytes transferred
1354   - * back. Set it to our originally-requested length.
  1360 + * There is no such thing as a successful short write. Set
  1361 + * xferred to our originally-requested length.
1355 1362 */
1356 1363 obj_request->xferred = obj_request->length;
1357 1364 obj_request_done_set(obj_request);
... ... @@ -1391,7 +1398,7 @@
1391 1398 * passed to blk_end_request(), which takes an unsigned int.
1392 1399 */
1393 1400 obj_request->xferred = osd_req->r_reply_op_len[0];
1394   - rbd_assert(obj_request->xferred < (u64) UINT_MAX);
  1401 + rbd_assert(obj_request->xferred < (u64)UINT_MAX);
1395 1402 opcode = osd_req->r_ops[0].op;
1396 1403 switch (opcode) {
1397 1404 case CEPH_OSD_OP_READ:
... ... @@ -1607,7 +1614,6 @@
1607 1614 INIT_LIST_HEAD(&img_request->obj_requests);
1608 1615 kref_init(&img_request->kref);
1609 1616  
1610   - (void) img_request_layered_test(img_request); /* Avoid a warning */
1611 1617 rbd_img_request_get(img_request); /* Avoid a warning */
1612 1618 rbd_img_request_put(img_request); /* TEMPORARY */
1613 1619  
... ... @@ -1635,6 +1641,9 @@
1635 1641 if (img_request_write_test(img_request))
1636 1642 ceph_put_snap_context(img_request->snapc);
1637 1643  
  1644 + if (img_request_child_test(img_request))
  1645 + rbd_obj_request_put(img_request->obj_request);
  1646 +
1638 1647 kfree(img_request);
1639 1648 }
1640 1649  
1641 1650  
... ... @@ -1643,13 +1652,11 @@
1643 1652 struct rbd_img_request *img_request;
1644 1653 unsigned int xferred;
1645 1654 int result;
  1655 + bool more;
1646 1656  
1647 1657 rbd_assert(obj_request_img_data_test(obj_request));
1648 1658 img_request = obj_request->img_request;
1649 1659  
1650   - rbd_assert(!img_request_child_test(img_request));
1651   - rbd_assert(img_request->rq != NULL);
1652   -
1653 1660 rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
1654 1661 xferred = (unsigned int)obj_request->xferred;
1655 1662 result = obj_request->result;
... ... @@ -1666,7 +1673,15 @@
1666 1673 img_request->result = result;
1667 1674 }
1668 1675  
1669   - return blk_end_request(img_request->rq, result, xferred);
  1676 + if (img_request_child_test(img_request)) {
  1677 + rbd_assert(img_request->obj_request != NULL);
  1678 + more = obj_request->which < img_request->obj_request_count - 1;
  1679 + } else {
  1680 + rbd_assert(img_request->rq != NULL);
  1681 + more = blk_end_request(img_request->rq, result, xferred);
  1682 + }
  1683 +
  1684 + return more;
1670 1685 }
1671 1686  
1672 1687 static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
... ... @@ -1809,6 +1824,64 @@
1809 1824 }
1810 1825  
1811 1826 return 0;
  1827 +}
  1828 +
  1829 +static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
  1830 +{
  1831 + struct rbd_obj_request *obj_request;
  1832 +
  1833 + rbd_assert(img_request_child_test(img_request));
  1834 +
  1835 + obj_request = img_request->obj_request;
  1836 + rbd_assert(obj_request != NULL);
  1837 + obj_request->result = img_request->result;
  1838 + obj_request->xferred = img_request->xferred;
  1839 +
  1840 + rbd_img_obj_request_read_callback(obj_request);
  1841 + rbd_obj_request_complete(obj_request);
  1842 +}
  1843 +
  1844 +static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
  1845 +{
  1846 + struct rbd_device *rbd_dev;
  1847 + struct rbd_img_request *img_request;
  1848 + int result;
  1849 +
  1850 + rbd_assert(obj_request_img_data_test(obj_request));
  1851 + rbd_assert(obj_request->img_request != NULL);
  1852 + rbd_assert(obj_request->result == (s32) -ENOENT);
  1853 + rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
  1854 +
  1855 + rbd_dev = obj_request->img_request->rbd_dev;
  1856 + rbd_assert(rbd_dev->parent != NULL);
  1857 + /* rbd_read_finish(obj_request, obj_request->length); */
  1858 + img_request = rbd_img_request_create(rbd_dev->parent,
  1859 + obj_request->img_offset,
  1860 + obj_request->length,
  1861 + false, true);
  1862 + result = -ENOMEM;
  1863 + if (!img_request)
  1864 + goto out_err;
  1865 +
  1866 + rbd_obj_request_get(obj_request);
  1867 + img_request->obj_request = obj_request;
  1868 +
  1869 + result = rbd_img_request_fill_bio(img_request, obj_request->bio_list);
  1870 + if (result)
  1871 + goto out_err;
  1872 +
  1873 + img_request->callback = rbd_img_parent_read_callback;
  1874 + result = rbd_img_request_submit(img_request);
  1875 + if (result)
  1876 + goto out_err;
  1877 +
  1878 + return;
  1879 +out_err:
  1880 + if (img_request)
  1881 + rbd_img_request_put(img_request);
  1882 + obj_request->result = result;
  1883 + obj_request->xferred = 0;
  1884 + obj_request_done_set(obj_request);
1812 1885 }
1813 1886  
1814 1887 static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,