Commit 3e1a0699095803e53072699a4a1485af7744601d

Authored by Joe Thornber
Committed by Mike Snitzer
1 parent 07f2b6e038

dm thin: fix out of data space handling

Ideally a thin pool would never run out of data space; the low water
mark would trigger userland to extend the pool before we completely run
out of space.  However, many small random IOs to unprovisioned space can
consume data space at an alarming rate.  Adjust your low water mark if
you're frequently seeing "out-of-data-space" mode.

Before this fix, if data space ran out, the pool would be put in
PM_READ_ONLY mode, which also aborted the pool's current metadata
transaction (data loss for any changes in the transaction).  This had
the side-effect of needlessly compromising data consistency.  In
addition, retrying queued unserviceable bios once the data pool was
resized could initiate changes to potentially inconsistent pool
metadata.

Now, when the pool's data space is exhausted, the pool transitions to a
new pool mode (PM_OUT_OF_DATA_SPACE) in which metadata may be changed
but data may not be allocated.  This allows users to remove thin volumes
or discard data to recover data space.

The pool is no longer put in PM_READ_ONLY mode in response to the pool
running out of data space.  And PM_READ_ONLY mode no longer aborts the
pool's current metadata transaction.  Also, set_pool_mode() will now
notify userspace when the pool mode is changed.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>

Showing 1 changed file with 102 additions and 45 deletions Side-by-side Diff

drivers/md/dm-thin.c
... ... @@ -130,10 +130,11 @@
130 130 struct dm_thin_new_mapping;
131 131  
132 132 /*
133   - * The pool runs in 3 modes. Ordered in degraded order for comparisons.
  133 + * The pool runs in 4 modes. Ordered in degraded order for comparisons.
134 134 */
135 135 enum pool_mode {
136 136 PM_WRITE, /* metadata may be changed */
  137 + PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */
137 138 PM_READ_ONLY, /* metadata may not be changed */
138 139 PM_FAIL, /* all I/O fails */
139 140 };
... ... @@ -198,7 +199,6 @@
198 199 };
199 200  
200 201 static enum pool_mode get_pool_mode(struct pool *pool);
201   -static void out_of_data_space(struct pool *pool);
202 202 static void metadata_operation_failed(struct pool *pool, const char *op, int r);
203 203  
204 204 /*
... ... @@ -399,6 +399,23 @@
399 399 spin_unlock_irqrestore(&pool->lock, flags);
400 400 }
401 401  
  402 +static void error_retry_list(struct pool *pool)
  403 +{
  404 + struct bio *bio;
  405 + unsigned long flags;
  406 + struct bio_list bios;
  407 +
  408 + bio_list_init(&bios);
  409 +
  410 + spin_lock_irqsave(&pool->lock, flags);
  411 + bio_list_merge(&bios, &pool->retry_on_resume_list);
  412 + bio_list_init(&pool->retry_on_resume_list);
  413 + spin_unlock_irqrestore(&pool->lock, flags);
  414 +
  415 + while ((bio = bio_list_pop(&bios)))
  416 + bio_io_error(bio);
  417 +}
  418 +
402 419 /*
403 420 * This section of code contains the logic for processing a thin device's IO.
404 421 * Much of the code depends on pool object resources (lists, workqueues, etc)
405 422  
... ... @@ -925,13 +942,15 @@
925 942 }
926 943 }
927 944  
  945 +static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
  946 +
928 947 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
929 948 {
930 949 int r;
931 950 dm_block_t free_blocks;
932 951 struct pool *pool = tc->pool;
933 952  
934   - if (get_pool_mode(pool) != PM_WRITE)
  953 + if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
935 954 return -EINVAL;
936 955  
937 956 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
... ... @@ -958,7 +977,7 @@
958 977 }
959 978  
960 979 if (!free_blocks) {
961   - out_of_data_space(pool);
  980 + set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
962 981 return -ENOSPC;
963 982 }
964 983 }
965 984  
966 985  
... ... @@ -988,15 +1007,32 @@
988 1007 spin_unlock_irqrestore(&pool->lock, flags);
989 1008 }
990 1009  
991   -static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
  1010 +static bool should_error_unserviceable_bio(struct pool *pool)
992 1011 {
993   - /*
994   - * When pool is read-only, no cell locking is needed because
995   - * nothing is changing.
996   - */
997   - WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY);
  1012 + enum pool_mode m = get_pool_mode(pool);
998 1013  
999   - if (pool->pf.error_if_no_space)
  1014 + switch (m) {
  1015 + case PM_WRITE:
  1016 + /* Shouldn't get here */
  1017 + DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
  1018 + return true;
  1019 +
  1020 + case PM_OUT_OF_DATA_SPACE:
  1021 + return pool->pf.error_if_no_space;
  1022 +
  1023 + case PM_READ_ONLY:
  1024 + case PM_FAIL:
  1025 + return true;
  1026 + default:
  1027 + /* Shouldn't get here */
  1028 + DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
  1029 + return true;
  1030 + }
  1031 +}
  1032 +
  1033 +static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
  1034 +{
  1035 + if (should_error_unserviceable_bio(pool))
1000 1036 bio_io_error(bio);
1001 1037 else
1002 1038 retry_on_resume(bio);
1003 1039  
... ... @@ -1007,11 +1043,20 @@
1007 1043 struct bio *bio;
1008 1044 struct bio_list bios;
1009 1045  
  1046 + if (should_error_unserviceable_bio(pool)) {
  1047 + cell_error(pool, cell);
  1048 + return;
  1049 + }
  1050 +
1010 1051 bio_list_init(&bios);
1011 1052 cell_release(pool, cell, &bios);
1012 1053  
1013   - while ((bio = bio_list_pop(&bios)))
1014   - handle_unserviceable_bio(pool, bio);
  1054 + if (should_error_unserviceable_bio(pool))
  1055 + while ((bio = bio_list_pop(&bios)))
  1056 + bio_io_error(bio);
  1057 + else
  1058 + while ((bio = bio_list_pop(&bios)))
  1059 + retry_on_resume(bio);
1015 1060 }
1016 1061  
1017 1062 static void process_discard(struct thin_c *tc, struct bio *bio)
... ... @@ -1296,6 +1341,11 @@
1296 1341 }
1297 1342 }
1298 1343  
  1344 +static void process_bio_success(struct thin_c *tc, struct bio *bio)
  1345 +{
  1346 + bio_endio(bio, 0);
  1347 +}
  1348 +
1299 1349 static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1300 1350 {
1301 1351 bio_io_error(bio);
1302 1352  
... ... @@ -1399,9 +1449,15 @@
1399 1449 return pool->pf.mode;
1400 1450 }
1401 1451  
  1452 +static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
  1453 +{
  1454 + dm_table_event(pool->ti->table);
  1455 + DMINFO("%s: switching pool to %s mode",
  1456 + dm_device_name(pool->pool_md), new_mode);
  1457 +}
  1458 +
1402 1459 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1403 1460 {
1404   - int r;
1405 1461 struct pool_c *pt = pool->ti->private;
1406 1462 bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
1407 1463 enum pool_mode old_mode = get_pool_mode(pool);
1408 1464  
1409 1465  
1410 1466  
1411 1467  
... ... @@ -1429,38 +1485,48 @@
1429 1485 switch (new_mode) {
1430 1486 case PM_FAIL:
1431 1487 if (old_mode != new_mode)
1432   - DMERR("%s: switching pool to failure mode",
1433   - dm_device_name(pool->pool_md));
  1488 + notify_of_pool_mode_change(pool, "failure");
1434 1489 dm_pool_metadata_read_only(pool->pmd);
1435 1490 pool->process_bio = process_bio_fail;
1436 1491 pool->process_discard = process_bio_fail;
1437 1492 pool->process_prepared_mapping = process_prepared_mapping_fail;
1438 1493 pool->process_prepared_discard = process_prepared_discard_fail;
  1494 +
  1495 + error_retry_list(pool);
1439 1496 break;
1440 1497  
1441 1498 case PM_READ_ONLY:
1442 1499 if (old_mode != new_mode)
1443   - DMERR("%s: switching pool to read-only mode",
1444   - dm_device_name(pool->pool_md));
1445   - r = dm_pool_abort_metadata(pool->pmd);
1446   - if (r) {
1447   - DMERR("%s: aborting transaction failed",
1448   - dm_device_name(pool->pool_md));
1449   - new_mode = PM_FAIL;
1450   - set_pool_mode(pool, new_mode);
1451   - } else {
1452   - dm_pool_metadata_read_only(pool->pmd);
1453   - pool->process_bio = process_bio_read_only;
1454   - pool->process_discard = process_discard;
1455   - pool->process_prepared_mapping = process_prepared_mapping_fail;
1456   - pool->process_prepared_discard = process_prepared_discard_passdown;
1457   - }
  1500 + notify_of_pool_mode_change(pool, "read-only");
  1501 + dm_pool_metadata_read_only(pool->pmd);
  1502 + pool->process_bio = process_bio_read_only;
  1503 + pool->process_discard = process_bio_success;
  1504 + pool->process_prepared_mapping = process_prepared_mapping_fail;
  1505 + pool->process_prepared_discard = process_prepared_discard_passdown;
  1506 +
  1507 + error_retry_list(pool);
1458 1508 break;
1459 1509  
  1510 + case PM_OUT_OF_DATA_SPACE:
  1511 + /*
  1512 + * Ideally we'd never hit this state; the low water mark
  1513 + * would trigger userland to extend the pool before we
  1514 + * completely run out of data space. However, many small
  1515 + * IOs to unprovisioned space can consume data space at an
  1516 + * alarming rate. Adjust your low water mark if you're
  1517 + * frequently seeing this mode.
  1518 + */
  1519 + if (old_mode != new_mode)
  1520 + notify_of_pool_mode_change(pool, "out-of-data-space");
  1521 + pool->process_bio = process_bio_read_only;
  1522 + pool->process_discard = process_discard;
  1523 + pool->process_prepared_mapping = process_prepared_mapping;
  1524 + pool->process_prepared_discard = process_prepared_discard_passdown;
  1525 + break;
  1526 +
1460 1527 case PM_WRITE:
1461 1528 if (old_mode != new_mode)
1462   - DMINFO("%s: switching pool to write mode",
1463   - dm_device_name(pool->pool_md));
  1529 + notify_of_pool_mode_change(pool, "write");
1464 1530 dm_pool_metadata_read_write(pool->pmd);
1465 1531 pool->process_bio = process_bio;
1466 1532 pool->process_discard = process_discard;
... ... @@ -1477,17 +1543,6 @@
1477 1543 pt->adjusted_pf.mode = new_mode;
1478 1544 }
1479 1545  
1480   -/*
1481   - * Rather than calling set_pool_mode directly, use these which describe the
1482   - * reason for mode degradation.
1483   - */
1484   -static void out_of_data_space(struct pool *pool)
1485   -{
1486   - DMERR_LIMIT("%s: no free data space available.",
1487   - dm_device_name(pool->pool_md));
1488   - set_pool_mode(pool, PM_READ_ONLY);
1489   -}
1490   -
1491 1546 static void abort_transaction(struct pool *pool)
1492 1547 {
1493 1548 const char *dev_name = dm_device_name(pool->pool_md);
... ... @@ -2719,7 +2774,9 @@
2719 2774 else
2720 2775 DMEMIT("- ");
2721 2776  
2722   - if (pool->pf.mode == PM_READ_ONLY)
  2777 + if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
  2778 + DMEMIT("out_of_data_space ");
  2779 + else if (pool->pf.mode == PM_READ_ONLY)
2723 2780 DMEMIT("ro ");
2724 2781 else
2725 2782 DMEMIT("rw ");