Commit 67324ea18812bc952ef96892fbd5817b9050413f
1 parent
c140e1c4e2
Exists in
master
and in
13 other branches
dm thin: sort the per thin deferred bios using an rb_tree
A thin-pool will allocate blocks using FIFO order for all thin devices
which share the thin-pool.  Because of this simplistic allocation the
thin-pool's space can become fragmented quite easily; especially when
multiple threads are requesting blocks in parallel.

Sort each thin device's deferred_bio_list based on logical sector to
help reduce fragmentation of the thin-pool's ondisk layout.

The following tables illustrate the realized gains/potential offered by
sorting each thin device's deferred_bio_list.  An "io size"-sized random
read of the device would result in "seeks/io" fragments being read, with
an average "distance/seek" between each fragment.

Data was written to a single thin device using multiple threads via
iozone (8 threads, 64K for both the block_size and io_size).

unsorted:

     io size   seeks/io distance/seek
  --------------------------------------
            4k    0.000     0b
           16k    0.013    11m
           64k    0.065    11m
          256k    0.274    10m
            1m    1.109    10m
            4m    4.411    10m
           16m   17.097    11m
           64m   60.055    13m
          256m  148.798    25m
            1g  809.929    21m

sorted:

     io size   seeks/io distance/seek
  --------------------------------------
            4k    0.000     0b
           16k    0.000     1g
           64k    0.001     1g
          256k    0.003     1g
            1m    0.011     1g
            4m    0.045     1g
           16m    0.181     1g
           64m    0.747  1011m
          256m    3.299     1g
            1g   14.373     1g

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Joe Thornber <ejt@redhat.com>
Showing 1 changed file with 82 additions and 2 deletions Side-by-side Diff
drivers/md/dm-thin.c
... | ... | @@ -16,6 +16,7 @@ |
16 | 16 | #include <linux/init.h> |
17 | 17 | #include <linux/module.h> |
18 | 18 | #include <linux/slab.h> |
19 | +#include <linux/rbtree.h> | |
19 | 20 | |
20 | 21 | #define DM_MSG_PREFIX "thin" |
21 | 22 | |
... | ... | @@ -230,6 +231,7 @@ |
230 | 231 | spinlock_t lock; |
231 | 232 | struct bio_list deferred_bio_list; |
232 | 233 | struct bio_list retry_on_resume_list; |
234 | + struct rb_root sort_bio_list; /* sorted list of deferred bios */ | |
233 | 235 | }; |
234 | 236 | |
235 | 237 | /*----------------------------------------------------------------*/ |
... | ... | @@ -371,6 +373,7 @@ |
371 | 373 | struct dm_deferred_entry *shared_read_entry; |
372 | 374 | struct dm_deferred_entry *all_io_entry; |
373 | 375 | struct dm_thin_new_mapping *overwrite_mapping; |
376 | + struct rb_node rb_node; | |
374 | 377 | }; |
375 | 378 | |
376 | 379 | static void requeue_bio_list(struct thin_c *tc, struct bio_list *master) |
377 | 380 | |
... | ... | @@ -1367,12 +1370,77 @@ |
1367 | 1370 | jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; |
1368 | 1371 | } |
1369 | 1372 | |
/*
 * Map between an rb_node embedded in a bio's per-bio data
 * (struct dm_thin_endio_hook) and the bio itself.
 */
#define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
#define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook))
1376 | +static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio) | |
1377 | +{ | |
1378 | + struct rb_node **rbp, *parent; | |
1379 | + struct dm_thin_endio_hook *pbd; | |
1380 | + sector_t bi_sector = bio->bi_iter.bi_sector; | |
1381 | + | |
1382 | + rbp = &tc->sort_bio_list.rb_node; | |
1383 | + parent = NULL; | |
1384 | + while (*rbp) { | |
1385 | + parent = *rbp; | |
1386 | + pbd = thin_pbd(parent); | |
1387 | + | |
1388 | + if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector) | |
1389 | + rbp = &(*rbp)->rb_left; | |
1390 | + else | |
1391 | + rbp = &(*rbp)->rb_right; | |
1392 | + } | |
1393 | + | |
1394 | + pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); | |
1395 | + rb_link_node(&pbd->rb_node, parent, rbp); | |
1396 | + rb_insert_color(&pbd->rb_node, &tc->sort_bio_list); | |
1397 | +} | |
1398 | + | |
1399 | +static void __extract_sorted_bios(struct thin_c *tc) | |
1400 | +{ | |
1401 | + struct rb_node *node; | |
1402 | + struct dm_thin_endio_hook *pbd; | |
1403 | + struct bio *bio; | |
1404 | + | |
1405 | + for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) { | |
1406 | + pbd = thin_pbd(node); | |
1407 | + bio = thin_bio(pbd); | |
1408 | + | |
1409 | + bio_list_add(&tc->deferred_bio_list, bio); | |
1410 | + rb_erase(&pbd->rb_node, &tc->sort_bio_list); | |
1411 | + } | |
1412 | + | |
1413 | + WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list)); | |
1414 | +} | |
1415 | + | |
/*
 * Sort tc->deferred_bio_list by starting logical sector, via a
 * temporary rb-tree (tc->sort_bio_list), to reduce fragmentation of
 * the pool's on-disk layout.  Caller must hold tc->lock.
 */
static void __sort_thin_deferred_bios(struct thin_c *tc)
{
	struct bio *bio;
	struct bio_list bios;

	/* Take ownership of the deferred bios, leaving the list empty. */
	bio_list_init(&bios);
	bio_list_merge(&bios, &tc->deferred_bio_list);
	bio_list_init(&tc->deferred_bio_list);

	/* Sort deferred_bio_list using rb-tree */
	while ((bio = bio_list_pop(&bios)))
		__thin_bio_rb_add(tc, bio);

	/*
	 * Transfer the sorted bios in sort_bio_list back to
	 * deferred_bio_list to allow lockless submission of
	 * all bios.
	 */
	__extract_sorted_bios(tc);
}
1436 | + | |
1370 | 1437 | static void process_thin_deferred_bios(struct thin_c *tc) |
1371 | 1438 | { |
1372 | 1439 | struct pool *pool = tc->pool; |
1373 | 1440 | unsigned long flags; |
1374 | 1441 | struct bio *bio; |
1375 | 1442 | struct bio_list bios; |
1443 | + struct blk_plug plug; | |
1376 | 1444 | |
1377 | 1445 | if (tc->requeue_mode) { |
1378 | 1446 | requeue_bio_list(tc, &tc->deferred_bio_list); |
1379 | 1447 | |
1380 | 1448 | |
... | ... | @@ -1382,10 +1450,20 @@ |
1382 | 1450 | bio_list_init(&bios); |
1383 | 1451 | |
1384 | 1452 | spin_lock_irqsave(&tc->lock, flags); |
1453 | + | |
1454 | + if (bio_list_empty(&tc->deferred_bio_list)) { | |
1455 | + spin_unlock_irqrestore(&tc->lock, flags); | |
1456 | + return; | |
1457 | + } | |
1458 | + | |
1459 | + __sort_thin_deferred_bios(tc); | |
1460 | + | |
1385 | 1461 | bio_list_merge(&bios, &tc->deferred_bio_list); |
1386 | 1462 | bio_list_init(&tc->deferred_bio_list); |
1463 | + | |
1387 | 1464 | spin_unlock_irqrestore(&tc->lock, flags); |
1388 | 1465 | |
1466 | + blk_start_plug(&plug); | |
1389 | 1467 | while ((bio = bio_list_pop(&bios))) { |
1390 | 1468 | /* |
1391 | 1469 | * If we've got no free new_mapping structs, and processing |
... | ... | @@ -1405,6 +1483,7 @@ |
1405 | 1483 | else |
1406 | 1484 | pool->process_bio(tc, bio); |
1407 | 1485 | } |
1486 | + blk_finish_plug(&plug); | |
1408 | 1487 | } |
1409 | 1488 | |
1410 | 1489 | static void process_deferred_bios(struct pool *pool) |
... | ... | @@ -2964,7 +3043,7 @@ |
2964 | 3043 | .name = "thin-pool", |
2965 | 3044 | .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | |
2966 | 3045 | DM_TARGET_IMMUTABLE, |
2967 | - .version = {1, 11, 0}, | |
3046 | + .version = {1, 12, 0}, | |
2968 | 3047 | .module = THIS_MODULE, |
2969 | 3048 | .ctr = pool_ctr, |
2970 | 3049 | .dtr = pool_dtr, |
... | ... | @@ -3040,6 +3119,7 @@ |
3040 | 3119 | spin_lock_init(&tc->lock); |
3041 | 3120 | bio_list_init(&tc->deferred_bio_list); |
3042 | 3121 | bio_list_init(&tc->retry_on_resume_list); |
3122 | + tc->sort_bio_list = RB_ROOT; | |
3043 | 3123 | |
3044 | 3124 | if (argc == 3) { |
3045 | 3125 | r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); |
... | ... | @@ -3287,7 +3367,7 @@ |
3287 | 3367 | |
3288 | 3368 | static struct target_type thin_target = { |
3289 | 3369 | .name = "thin", |
3290 | - .version = {1, 11, 0}, | |
3370 | + .version = {1, 12, 0}, | |
3291 | 3371 | .module = THIS_MODULE, |
3292 | 3372 | .ctr = thin_ctr, |
3293 | 3373 | .dtr = thin_dtr, |