Commit 8b64e881eb40ac8b9bfcbce068a97eef819044ee

Authored by Mike Snitzer
1 parent 6d16202be7

dm thin: fix set_pool_mode exposed pool operation races

The pool mode must not be switched until after the corresponding pool
process_* methods have been established.  Otherwise, because
set_pool_mode() isn't interlocked with the IO path for performance
reasons, the IO path can end up executing process_* operations that
don't match the mode.  This patch eliminates problems like the following
(as seen on really fast PCIe SSD storage when transitioning the pool's
mode from PM_READ_ONLY to PM_WRITE):

kernel: device-mapper: thin: 253:2: reached low water mark for data device: sending event.
kernel: device-mapper: thin: 253:2: no free data space available.
kernel: device-mapper: thin: 253:2: switching pool to read-only mode
kernel: device-mapper: thin: 253:2: switching pool to write mode
kernel: ------------[ cut here ]------------
kernel: WARNING: CPU: 11 PID: 7564 at drivers/md/dm-thin.c:995 handle_unserviceable_bio+0x146/0x160 [dm_thin_pool]()
...
kernel: Workqueue: dm-thin do_worker [dm_thin_pool]
kernel: 00000000000003e3 ffff880308831cc8 ffffffff8152ebcb 00000000000003e3
kernel: 0000000000000000 ffff880308831d08 ffffffff8104c46c ffff88032502a800
kernel: ffff880036409000 ffff88030ec7ce00 0000000000000001 00000000ffffffc3
kernel: Call Trace:
kernel: [<ffffffff8152ebcb>] dump_stack+0x49/0x5e
kernel: [<ffffffff8104c46c>] warn_slowpath_common+0x8c/0xc0
kernel: [<ffffffff8104c4ba>] warn_slowpath_null+0x1a/0x20
kernel: [<ffffffffa001e2c6>] handle_unserviceable_bio+0x146/0x160 [dm_thin_pool]
kernel: [<ffffffffa001f276>] process_bio_read_only+0x136/0x180 [dm_thin_pool]
kernel: [<ffffffffa0020b75>] process_deferred_bios+0xc5/0x230 [dm_thin_pool]
kernel: [<ffffffffa0020d31>] do_worker+0x51/0x60 [dm_thin_pool]
kernel: [<ffffffff81067823>] process_one_work+0x183/0x490
kernel: [<ffffffff81068c70>] worker_thread+0x120/0x3a0
kernel: [<ffffffff81068b50>] ? manage_workers+0x160/0x160
kernel: [<ffffffff8106e86e>] kthread+0xce/0xf0
kernel: [<ffffffff8106e7a0>] ? kthread_freezable_should_stop+0x70/0x70
kernel: [<ffffffff8153b3ec>] ret_from_fork+0x7c/0xb0
kernel: [<ffffffff8106e7a0>] ? kthread_freezable_should_stop+0x70/0x70
kernel: ---[ end trace 3f00528e08ffa55c ]---
kernel: device-mapper: thin: pool mode is PM_WRITE not PM_READ_ONLY like expected!?

dm-thin.c:995 was the WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY);
at the top of handle_unserviceable_bio().  And as the additional
debugging I had conveys: the pool mode was _not_ PM_READ_ONLY like
expected, it was already PM_WRITE, yet pool->process_bio was still set
to process_bio_read_only().

Also, while fixing this up, reduce logging of redundant pool mode
transitions by checking new_mode is different from old_mode.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org

Showing 1 changed file with 27 additions and 13 deletions Side-by-side Diff

drivers/md/dm-thin.c
... ... @@ -1392,16 +1392,16 @@
1392 1392 return pool->pf.mode;
1393 1393 }
1394 1394  
1395   -static void set_pool_mode(struct pool *pool, enum pool_mode mode)
  1395 +static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1396 1396 {
1397 1397 int r;
  1398 + enum pool_mode old_mode = pool->pf.mode;
1398 1399  
1399   - pool->pf.mode = mode;
1400   -
1401   - switch (mode) {
  1400 + switch (new_mode) {
1402 1401 case PM_FAIL:
1403   - DMERR("%s: switching pool to failure mode",
1404   - dm_device_name(pool->pool_md));
  1402 + if (old_mode != new_mode)
  1403 + DMERR("%s: switching pool to failure mode",
  1404 + dm_device_name(pool->pool_md));
1405 1405 dm_pool_metadata_read_only(pool->pmd);
1406 1406 pool->process_bio = process_bio_fail;
1407 1407 pool->process_discard = process_bio_fail;
1408 1408  
... ... @@ -1410,13 +1410,15 @@
1410 1410 break;
1411 1411  
1412 1412 case PM_READ_ONLY:
1413   - DMERR("%s: switching pool to read-only mode",
1414   - dm_device_name(pool->pool_md));
  1413 + if (old_mode != new_mode)
  1414 + DMERR("%s: switching pool to read-only mode",
  1415 + dm_device_name(pool->pool_md));
1415 1416 r = dm_pool_abort_metadata(pool->pmd);
1416 1417 if (r) {
1417 1418 DMERR("%s: aborting transaction failed",
1418 1419 dm_device_name(pool->pool_md));
1419   - set_pool_mode(pool, PM_FAIL);
  1420 + new_mode = PM_FAIL;
  1421 + set_pool_mode(pool, new_mode);
1420 1422 } else {
1421 1423 dm_pool_metadata_read_only(pool->pmd);
1422 1424 pool->process_bio = process_bio_read_only;
... ... @@ -1427,6 +1429,9 @@
1427 1429 break;
1428 1430  
1429 1431 case PM_WRITE:
  1432 + if (old_mode != new_mode)
  1433 + DMINFO("%s: switching pool to write mode",
  1434 + dm_device_name(pool->pool_md));
1430 1435 dm_pool_metadata_read_write(pool->pmd);
1431 1436 pool->process_bio = process_bio;
1432 1437 pool->process_discard = process_discard;
... ... @@ -1434,6 +1439,8 @@
1434 1439 pool->process_prepared_discard = process_prepared_discard;
1435 1440 break;
1436 1441 }
  1442 +
  1443 + pool->pf.mode = new_mode;
1437 1444 }
1438 1445  
1439 1446 /*
... ... @@ -1677,6 +1684,17 @@
1677 1684 enum pool_mode new_mode = pt->adjusted_pf.mode;
1678 1685  
1679 1686 /*
  1687 + * Don't change the pool's mode until set_pool_mode() below.
  1688 + * Otherwise the pool's process_* function pointers may
  1689 + * not match the desired pool mode.
  1690 + */
  1691 + pt->adjusted_pf.mode = old_mode;
  1692 +
  1693 + pool->ti = ti;
  1694 + pool->pf = pt->adjusted_pf;
  1695 + pool->low_water_blocks = pt->low_water_blocks;
  1696 +
  1697 + /*
1680 1698 * If we were in PM_FAIL mode, rollback of metadata failed. We're
1681 1699 * not going to recover without a thin_repair. So we never let the
1682 1700 * pool move out of the old mode. On the other hand a PM_READ_ONLY
... ... @@ -1685,10 +1703,6 @@
1685 1703 */
1686 1704 if (old_mode == PM_FAIL)
1687 1705 new_mode = old_mode;
1688   -
1689   - pool->ti = ti;
1690   - pool->low_water_blocks = pt->low_water_blocks;
1691   - pool->pf = pt->adjusted_pf;
1692 1706  
1693 1707 set_pool_mode(pool, new_mode);
1694 1708