Commit 67324ea18812bc952ef96892fbd5817b9050413f

Authored by Mike Snitzer
1 parent c140e1c4e2

dm thin: sort the per thin deferred bios using an rb_tree

A thin-pool will allocate blocks using FIFO order for all thin devices
which share the thin-pool.  Because of this simplistic allocation the
thin-pool's space can become fragmented quite easily; especially when
multiple threads are requesting blocks in parallel.

Sort each thin device's deferred_bio_list based on logical sector to
help reduce fragmentation of the thin-pool's on-disk layout.

The following tables illustrate the gains realized (and the potential offered)
by sorting each thin device's deferred_bio_list.  An "io size"-sized random
read of the device would result in "seeks/io" fragments being read, with an
average distance of "distance/seek" between each fragment.

Data was written to a single thin device using multiple threads via
iozone (8 threads, 64K for both the block_size and io_size).

unsorted:

     io size   seeks/io distance/seek
  --------------------------------------
          4k    0.000   0b
         16k    0.013   11m
         64k    0.065   11m
        256k    0.274   10m
          1m    1.109   10m
          4m    4.411   10m
         16m    17.097  11m
         64m    60.055  13m
        256m    148.798 25m
          1g    809.929 21m

sorted:

     io size   seeks/io distance/seek
  --------------------------------------
          4k    0.000   0b
         16k    0.000   1g
         64k    0.001   1g
        256k    0.003   1g
          1m    0.011   1g
          4m    0.045   1g
         16m    0.181   1g
         64m    0.747   1011m
        256m    3.299   1g
          1g    14.373  1g

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Joe Thornber <ejt@redhat.com>

Showing 1 changed file with 82 additions and 2 deletions Side-by-side Diff

drivers/md/dm-thin.c
... ... @@ -16,6 +16,7 @@
16 16 #include <linux/init.h>
17 17 #include <linux/module.h>
18 18 #include <linux/slab.h>
  19 +#include <linux/rbtree.h>
19 20  
20 21 #define DM_MSG_PREFIX "thin"
21 22  
... ... @@ -230,6 +231,7 @@
230 231 spinlock_t lock;
231 232 struct bio_list deferred_bio_list;
232 233 struct bio_list retry_on_resume_list;
  234 + struct rb_root sort_bio_list; /* sorted list of deferred bios */
233 235 };
234 236  
235 237 /*----------------------------------------------------------------*/
... ... @@ -371,6 +373,7 @@
371 373 struct dm_deferred_entry *shared_read_entry;
372 374 struct dm_deferred_entry *all_io_entry;
373 375 struct dm_thin_new_mapping *overwrite_mapping;
  376 + struct rb_node rb_node;
374 377 };
375 378  
376 379 static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
377 380  
... ... @@ -1367,12 +1370,77 @@
1367 1370 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1368 1371 }
1369 1372  
  1373 +#define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
  1374 +#define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook))
  1375 +
  1376 +static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio)
  1377 +{
  1378 + struct rb_node **rbp, *parent;
  1379 + struct dm_thin_endio_hook *pbd;
  1380 + sector_t bi_sector = bio->bi_iter.bi_sector;
  1381 +
  1382 + rbp = &tc->sort_bio_list.rb_node;
  1383 + parent = NULL;
  1384 + while (*rbp) {
  1385 + parent = *rbp;
  1386 + pbd = thin_pbd(parent);
  1387 +
  1388 + if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
  1389 + rbp = &(*rbp)->rb_left;
  1390 + else
  1391 + rbp = &(*rbp)->rb_right;
  1392 + }
  1393 +
  1394 + pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
  1395 + rb_link_node(&pbd->rb_node, parent, rbp);
  1396 + rb_insert_color(&pbd->rb_node, &tc->sort_bio_list);
  1397 +}
  1398 +
  1399 +static void __extract_sorted_bios(struct thin_c *tc)
  1400 +{
  1401 + struct rb_node *node;
  1402 + struct dm_thin_endio_hook *pbd;
  1403 + struct bio *bio;
  1404 +
  1405 + for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) {
  1406 + pbd = thin_pbd(node);
  1407 + bio = thin_bio(pbd);
  1408 +
  1409 + bio_list_add(&tc->deferred_bio_list, bio);
  1410 + rb_erase(&pbd->rb_node, &tc->sort_bio_list);
  1411 + }
  1412 +
  1413 + WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
  1414 +}
  1415 +
  1416 +static void __sort_thin_deferred_bios(struct thin_c *tc)
  1417 +{
  1418 + struct bio *bio;
  1419 + struct bio_list bios;
  1420 +
  1421 + bio_list_init(&bios);
  1422 + bio_list_merge(&bios, &tc->deferred_bio_list);
  1423 + bio_list_init(&tc->deferred_bio_list);
  1424 +
  1425 + /* Sort deferred_bio_list using rb-tree */
  1426 + while ((bio = bio_list_pop(&bios)))
  1427 + __thin_bio_rb_add(tc, bio);
  1428 +
  1429 + /*
  1430 + * Transfer the sorted bios in sort_bio_list back to
  1431 + * deferred_bio_list to allow lockless submission of
  1432 + * all bios.
  1433 + */
  1434 + __extract_sorted_bios(tc);
  1435 +}
  1436 +
1370 1437 static void process_thin_deferred_bios(struct thin_c *tc)
1371 1438 {
1372 1439 struct pool *pool = tc->pool;
1373 1440 unsigned long flags;
1374 1441 struct bio *bio;
1375 1442 struct bio_list bios;
  1443 + struct blk_plug plug;
1376 1444  
1377 1445 if (tc->requeue_mode) {
1378 1446 requeue_bio_list(tc, &tc->deferred_bio_list);
1379 1447  
1380 1448  
... ... @@ -1382,10 +1450,20 @@
1382 1450 bio_list_init(&bios);
1383 1451  
1384 1452 spin_lock_irqsave(&tc->lock, flags);
  1453 +
  1454 + if (bio_list_empty(&tc->deferred_bio_list)) {
  1455 + spin_unlock_irqrestore(&tc->lock, flags);
  1456 + return;
  1457 + }
  1458 +
  1459 + __sort_thin_deferred_bios(tc);
  1460 +
1385 1461 bio_list_merge(&bios, &tc->deferred_bio_list);
1386 1462 bio_list_init(&tc->deferred_bio_list);
  1463 +
1387 1464 spin_unlock_irqrestore(&tc->lock, flags);
1388 1465  
  1466 + blk_start_plug(&plug);
1389 1467 while ((bio = bio_list_pop(&bios))) {
1390 1468 /*
1391 1469 * If we've got no free new_mapping structs, and processing
... ... @@ -1405,6 +1483,7 @@
1405 1483 else
1406 1484 pool->process_bio(tc, bio);
1407 1485 }
  1486 + blk_finish_plug(&plug);
1408 1487 }
1409 1488  
1410 1489 static void process_deferred_bios(struct pool *pool)
... ... @@ -2964,7 +3043,7 @@
2964 3043 .name = "thin-pool",
2965 3044 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2966 3045 DM_TARGET_IMMUTABLE,
2967   - .version = {1, 11, 0},
  3046 + .version = {1, 12, 0},
2968 3047 .module = THIS_MODULE,
2969 3048 .ctr = pool_ctr,
2970 3049 .dtr = pool_dtr,
... ... @@ -3040,6 +3119,7 @@
3040 3119 spin_lock_init(&tc->lock);
3041 3120 bio_list_init(&tc->deferred_bio_list);
3042 3121 bio_list_init(&tc->retry_on_resume_list);
  3122 + tc->sort_bio_list = RB_ROOT;
3043 3123  
3044 3124 if (argc == 3) {
3045 3125 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
... ... @@ -3287,7 +3367,7 @@
3287 3367  
3288 3368 static struct target_type thin_target = {
3289 3369 .name = "thin",
3290   - .version = {1, 11, 0},
  3370 + .version = {1, 12, 0},
3291 3371 .module = THIS_MODULE,
3292 3372 .ctr = thin_ctr,
3293 3373 .dtr = thin_dtr,