Commit 78365411b344df35a198b119133e6515c2dcfb9f

Authored by Kent Overstreet
1 parent 1dd13c8d3c

bcache: Rework allocator reserves

We need a reserve for allocating buckets for new btree nodes - and now that
we've got multiple btrees, it really needs to be per btree.

This reworks the reserves so that each reserve gets its own freelist instead of
a watermark on a single free list, which makes things a bit cleaner, and it adds
code so that btree_split() can check that the reserve is available before it
starts.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
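
The gist of the change: the single ca->free FIFO guarded by per-consumer
watermarks becomes an array of FIFOs indexed by enum alloc_reserve, and the
allocator thread refills whichever reserve has room, prios/gens first. Below is
a minimal userspace sketch of that refill policy; the fifo type, its helpers and
the toy capacities are illustrative stand-ins for bcache's DECLARE_FIFO()
machinery, not the kernel API.

/* Toy model of the per-reserve freelists introduced by this patch. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

enum alloc_reserve {
	RESERVE_BTREE,
	RESERVE_PRIO,
	RESERVE_MOVINGGC,
	RESERVE_NONE,
	RESERVE_NR,
};

/* Stand-in for DECLARE_FIFO(long, ...): fixed capacity, FIFO semantics. */
struct fifo {
	long	*data;
	size_t	size, used;
};

static bool fifo_push(struct fifo *f, long v)
{
	if (f->used == f->size)
		return false;
	f->data[f->used++] = v;
	return true;
}

/*
 * Mirrors bch_allocator_push(): prios/gens are the most important reserve,
 * so try RESERVE_PRIO first, then fall back to the remaining reserves in
 * order.  Only when every reserve is full does the allocator thread wait.
 */
static bool allocator_push(struct fifo *freelists, long bucket)
{
	unsigned int i;

	if (fifo_push(&freelists[RESERVE_PRIO], bucket))
		return true;

	for (i = 0; i < RESERVE_NR; i++)
		if (fifo_push(&freelists[i], bucket))
			return true;

	return false;
}

int main(void)
{
	/* Arbitrary toy capacities; the patch sizes these per cache device. */
	long btree_buf[8], prio_buf[2], mgc_buf[4], none_buf[4];
	struct fifo freelists[RESERVE_NR] = {
		[RESERVE_BTREE]		= { btree_buf, 8, 0 },
		[RESERVE_PRIO]		= { prio_buf,  2, 0 },
		[RESERVE_MOVINGGC]	= { mgc_buf,   4, 0 },
		[RESERVE_NONE]		= { none_buf,  4, 0 },
	};
	long bucket;

	/* Refill until every reserve is full (8 + 2 + 4 + 4 = 18 buckets). */
	for (bucket = 0; allocator_push(freelists, bucket); bucket++)
		;
	printf("distributed %ld buckets across reserves\n", bucket);
	return 0;
}

On the allocation side, bch_bucket_alloc() now pops from RESERVE_NONE first and
only falls back to the caller's named reserve when the general pool is empty,
which is what replaces the old watermark comparison.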

Showing 8 changed files with 105 additions and 83 deletions

drivers/md/bcache/alloc.c
... ... @@ -132,10 +132,16 @@
132 132 {
133 133 BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));
134 134  
135   - if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] &&
136   - CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO)
137   - return false;
  135 + if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) {
  136 + unsigned i;
138 137  
  138 + for (i = 0; i < RESERVE_NONE; i++)
  139 + if (!fifo_full(&ca->free[i]))
  140 + goto add;
  141 +
  142 + return false;
  143 + }
  144 +add:
139 145 b->prio = 0;
140 146  
141 147 if (can_inc_bucket_gen(b) &&
... ... @@ -304,6 +310,21 @@
304 310 __set_current_state(TASK_RUNNING); \
305 311 } while (0)
306 312  
  313 +static int bch_allocator_push(struct cache *ca, long bucket)
  314 +{
  315 + unsigned i;
  316 +
  317 + /* Prios/gens are actually the most important reserve */
  318 + if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
  319 + return true;
  320 +
  321 + for (i = 0; i < RESERVE_NR; i++)
  322 + if (fifo_push(&ca->free[i], bucket))
  323 + return true;
  324 +
  325 + return false;
  326 +}
  327 +
307 328 static int bch_allocator_thread(void *arg)
308 329 {
309 330 struct cache *ca = arg;
... ... @@ -336,9 +357,7 @@
336 357 mutex_lock(&ca->set->bucket_lock);
337 358 }
338 359  
339   - allocator_wait(ca, !fifo_full(&ca->free));
340   -
341   - fifo_push(&ca->free, bucket);
  360 + allocator_wait(ca, bch_allocator_push(ca, bucket));
342 361 wake_up(&ca->set->bucket_wait);
343 362 }
344 363  
... ... @@ -365,34 +384,29 @@
365 384 }
366 385 }
367 386  
368   -long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait)
  387 +long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
369 388 {
370 389 DEFINE_WAIT(w);
371 390 struct bucket *b;
372 391 long r;
373 392  
374 393 /* fastpath */
375   - if (fifo_used(&ca->free) > ca->watermark[watermark]) {
376   - fifo_pop(&ca->free, r);
  394 + if (fifo_pop(&ca->free[RESERVE_NONE], r) ||
  395 + fifo_pop(&ca->free[reserve], r))
377 396 goto out;
378   - }
379 397  
380 398 if (!wait)
381 399 return -1;
382 400  
383   - while (1) {
384   - if (fifo_used(&ca->free) > ca->watermark[watermark]) {
385   - fifo_pop(&ca->free, r);
386   - break;
387   - }
388   -
  401 + do {
389 402 prepare_to_wait(&ca->set->bucket_wait, &w,
390 403 TASK_UNINTERRUPTIBLE);
391 404  
392 405 mutex_unlock(&ca->set->bucket_lock);
393 406 schedule();
394 407 mutex_lock(&ca->set->bucket_lock);
395   - }
  408 + } while (!fifo_pop(&ca->free[RESERVE_NONE], r) &&
  409 + !fifo_pop(&ca->free[reserve], r));
396 410  
397 411 finish_wait(&ca->set->bucket_wait, &w);
398 412 out:
... ... @@ -401,12 +415,14 @@
401 415 if (expensive_debug_checks(ca->set)) {
402 416 size_t iter;
403 417 long i;
  418 + unsigned j;
404 419  
405 420 for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
406 421 BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);
407 422  
408   - fifo_for_each(i, &ca->free, iter)
409   - BUG_ON(i == r);
  423 + for (j = 0; j < RESERVE_NR; j++)
  424 + fifo_for_each(i, &ca->free[j], iter)
  425 + BUG_ON(i == r);
410 426 fifo_for_each(i, &ca->free_inc, iter)
411 427 BUG_ON(i == r);
412 428 fifo_for_each(i, &ca->unused, iter)
... ... @@ -419,7 +435,7 @@
419 435  
420 436 SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
421 437  
422   - if (watermark <= WATERMARK_METADATA) {
  438 + if (reserve <= RESERVE_PRIO) {
423 439 SET_GC_MARK(b, GC_MARK_METADATA);
424 440 SET_GC_MOVE(b, 0);
425 441 b->prio = BTREE_PRIO;
... ... @@ -445,7 +461,7 @@
445 461 }
446 462 }
447 463  
448   -int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
  464 +int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
449 465 struct bkey *k, int n, bool wait)
450 466 {
451 467 int i;
... ... @@ -459,7 +475,7 @@
459 475  
460 476 for (i = 0; i < n; i++) {
461 477 struct cache *ca = c->cache_by_alloc[i];
462   - long b = bch_bucket_alloc(ca, watermark, wait);
  478 + long b = bch_bucket_alloc(ca, reserve, wait);
463 479  
464 480 if (b == -1)
465 481 goto err;
... ... @@ -478,12 +494,12 @@
478 494 return -1;
479 495 }
480 496  
481   -int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
  497 +int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
482 498 struct bkey *k, int n, bool wait)
483 499 {
484 500 int ret;
485 501 mutex_lock(&c->bucket_lock);
486   - ret = __bch_bucket_alloc_set(c, watermark, k, n, wait);
  502 + ret = __bch_bucket_alloc_set(c, reserve, k, n, wait);
487 503 mutex_unlock(&c->bucket_lock);
488 504 return ret;
489 505 }
... ... @@ -573,8 +589,8 @@
573 589  
574 590 while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) {
575 591 unsigned watermark = write_prio
576   - ? WATERMARK_MOVINGGC
577   - : WATERMARK_NONE;
  592 + ? RESERVE_MOVINGGC
  593 + : RESERVE_NONE;
578 594  
579 595 spin_unlock(&c->data_bucket_lock);
580 596  
... ... @@ -689,7 +705,7 @@
689 705 * Then 8 for btree allocations
690 706 * Then half for the moving garbage collector
691 707 */
692   -
  708 +#if 0
693 709 ca->watermark[WATERMARK_PRIO] = 0;
694 710  
695 711 ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
... ... @@ -699,7 +715,7 @@
699 715  
700 716 ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
701 717 ca->watermark[WATERMARK_MOVINGGC];
702   -
  718 +#endif
703 719 return 0;
704 720 }
drivers/md/bcache/bcache.h
... ... @@ -383,12 +383,12 @@
383 383 unsigned writeback_rate_p_term_inverse;
384 384 };
385 385  
386   -enum alloc_watermarks {
387   - WATERMARK_PRIO,
388   - WATERMARK_METADATA,
389   - WATERMARK_MOVINGGC,
390   - WATERMARK_NONE,
391   - WATERMARK_MAX
  386 +enum alloc_reserve {
  387 + RESERVE_BTREE,
  388 + RESERVE_PRIO,
  389 + RESERVE_MOVINGGC,
  390 + RESERVE_NONE,
  391 + RESERVE_NR,
392 392 };
393 393  
394 394 struct cache {
... ... @@ -400,8 +400,6 @@
400 400 struct kobject kobj;
401 401 struct block_device *bdev;
402 402  
403   - unsigned watermark[WATERMARK_MAX];
404   -
405 403 struct task_struct *alloc_thread;
406 404  
407 405 struct closure prio;
... ... @@ -430,7 +428,7 @@
430 428 * because all the data they contained was overwritten), so we only
431 429 * need to discard them before they can be moved to the free list.
432 430 */
433   - DECLARE_FIFO(long, free);
  431 + DECLARE_FIFO(long, free)[RESERVE_NR];
434 432 DECLARE_FIFO(long, free_inc);
435 433 DECLARE_FIFO(long, unused);
436 434  
drivers/md/bcache/btree.c
... ... @@ -167,6 +167,8 @@
167 167 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
168 168 } \
169 169 rw_unlock(_w, _b); \
  170 + if (_r == -EINTR) \
  171 + schedule(); \
170 172 bch_cannibalize_unlock(c); \
171 173 if (_r == -ENOSPC) { \
172 174 wait_event((c)->try_wait, \
... ... @@ -175,6 +177,7 @@
175 177 } \
176 178 } while (_r == -EINTR); \
177 179 \
  180 + finish_wait(&(c)->bucket_wait, &(op)->wait); \
178 181 _r; \
179 182 })
180 183  
... ... @@ -1075,7 +1078,7 @@
1075 1078  
1076 1079 mutex_lock(&c->bucket_lock);
1077 1080 retry:
1078   - if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait))
  1081 + if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
1079 1082 goto err;
1080 1083  
1081 1084 bkey_put(c, &k.key);
... ... @@ -1132,6 +1135,28 @@
1132 1135 atomic_inc(&b->c->prio_blocked);
1133 1136 }
1134 1137  
  1138 +static int btree_check_reserve(struct btree *b, struct btree_op *op)
  1139 +{
  1140 + struct cache_set *c = b->c;
  1141 + struct cache *ca;
  1142 + unsigned i, reserve = c->root->level * 2 + 1;
  1143 + int ret = 0;
  1144 +
  1145 + mutex_lock(&c->bucket_lock);
  1146 +
  1147 + for_each_cache(ca, c, i)
  1148 + if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
  1149 + if (op)
  1150 + prepare_to_wait(&c->bucket_wait, &op->wait,
  1151 + TASK_UNINTERRUPTIBLE);
  1152 + ret = -EINTR;
  1153 + break;
  1154 + }
  1155 +
  1156 + mutex_unlock(&c->bucket_lock);
  1157 + return ret;
  1158 +}
  1159 +
1135 1160 /* Garbage collection */
1136 1161  
1137 1162 uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
... ... @@ -1428,7 +1453,8 @@
1428 1453  
1429 1454 if (!IS_ERR(last->b)) {
1430 1455 should_rewrite = btree_gc_mark_node(last->b, gc);
1431   - if (should_rewrite) {
  1456 + if (should_rewrite &&
  1457 + !btree_check_reserve(b, NULL)) {
1432 1458 n = btree_node_alloc_replacement(last->b,
1433 1459 false);
1434 1460  
... ... @@ -2070,6 +2096,10 @@
2070 2096  
2071 2097 closure_init_stack(&cl);
2072 2098 bch_keylist_init(&parent_keys);
  2099 +
  2100 + if (!b->level &&
  2101 + btree_check_reserve(b, op))
  2102 + return -EINTR;
2073 2103  
2074 2104 n1 = btree_node_alloc_replacement(b, true);
2075 2105 if (IS_ERR(n1))
drivers/md/bcache/btree.h
... ... @@ -241,6 +241,9 @@
241 241 /* Recursing down the btree */
242 242  
243 243 struct btree_op {
  244 + /* for waiting on btree reserve in btree_split() */
  245 + wait_queue_t wait;
  246 +
244 247 /* Btree level at which we start taking write locks */
245 248 short lock;
246 249  
... ... @@ -250,6 +253,7 @@
250 253 static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
251 254 {
252 255 memset(op, 0, sizeof(struct btree_op));
  256 + init_wait(&op->wait);
253 257 op->lock = write_lock_level;
254 258 }
255 259  
drivers/md/bcache/movinggc.c
... ... @@ -211,7 +211,7 @@
211 211 for_each_cache(ca, c, i) {
212 212 unsigned sectors_to_move = 0;
213 213 unsigned reserve_sectors = ca->sb.bucket_size *
214   - min(fifo_used(&ca->free), ca->free.size / 2);
  214 + fifo_used(&ca->free[RESERVE_MOVINGGC]);
215 215  
216 216 ca->heap.used = 0;
217 217  
drivers/md/bcache/super.c
... ... @@ -444,7 +444,7 @@
444 444  
445 445 lockdep_assert_held(&bch_register_lock);
446 446  
447   - if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true))
  447 + if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
448 448 return 1;
449 449  
450 450 SET_KEY_SIZE(&k.key, c->sb.bucket_size);
... ... @@ -562,8 +562,8 @@
562 562 atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
563 563 &ca->meta_sectors_written);
564 564  
565   - pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
566   - fifo_used(&ca->free_inc), fifo_used(&ca->unused));
  565 + //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
  566 + // fifo_used(&ca->free_inc), fifo_used(&ca->unused));
567 567  
568 568 for (i = prio_buckets(ca) - 1; i >= 0; --i) {
569 569 long bucket;
... ... @@ -582,7 +582,7 @@
582 582 p->magic = pset_magic(&ca->sb);
583 583 p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
584 584  
585   - bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true);
  585 + bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
586 586 BUG_ON(bucket == -1);
587 587  
588 588 mutex_unlock(&ca->set->bucket_lock);
... ... @@ -1767,6 +1767,7 @@
1767 1767 void bch_cache_release(struct kobject *kobj)
1768 1768 {
1769 1769 struct cache *ca = container_of(kobj, struct cache, kobj);
  1770 + unsigned i;
1770 1771  
1771 1772 if (ca->set)
1772 1773 ca->set->cache[ca->sb.nr_this_dev] = NULL;
... ... @@ -1780,8 +1781,10 @@
1780 1781 free_heap(&ca->heap);
1781 1782 free_fifo(&ca->unused);
1782 1783 free_fifo(&ca->free_inc);
1783   - free_fifo(&ca->free);
1784 1784  
  1785 + for (i = 0; i < RESERVE_NR; i++)
  1786 + free_fifo(&ca->free[i]);
  1787 +
1785 1788 if (ca->sb_bio.bi_inline_vecs[0].bv_page)
1786 1789 put_page(ca->sb_bio.bi_io_vec[0].bv_page);
1787 1790  
... ... @@ -1806,10 +1809,12 @@
1806 1809 ca->journal.bio.bi_max_vecs = 8;
1807 1810 ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;
1808 1811  
1809   - free = roundup_pow_of_two(ca->sb.nbuckets) >> 9;
1810   - free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2);
  1812 + free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
1811 1813  
1812   - if (!init_fifo(&ca->free, free, GFP_KERNEL) ||
  1814 + if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) ||
  1815 + !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
  1816 + !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
  1817 + !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
1813 1818 !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
1814 1819 !init_fifo(&ca->unused, free << 2, GFP_KERNEL) ||
1815 1820 !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
drivers/md/bcache/sysfs.c
... ... @@ -102,7 +102,6 @@
102 102 rw_attribute(key_merging_disabled);
103 103 rw_attribute(gc_always_rewrite);
104 104 rw_attribute(expensive_debug_checks);
105   -rw_attribute(freelist_percent);
106 105 rw_attribute(cache_replacement_policy);
107 106 rw_attribute(btree_shrinker_disabled);
108 107 rw_attribute(copy_gc_enabled);
... ... @@ -711,9 +710,6 @@
711 710 sysfs_print(io_errors,
712 711 atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT);
713 712  
714   - sysfs_print(freelist_percent, ca->free.size * 100 /
715   - ((size_t) ca->sb.nbuckets));
716   -
717 713 if (attr == &sysfs_cache_replacement_policy)
718 714 return bch_snprint_string_list(buf, PAGE_SIZE,
719 715 cache_replacement_policies,
... ... @@ -820,32 +816,6 @@
820 816 }
821 817 }
822 818  
823   - if (attr == &sysfs_freelist_percent) {
824   - DECLARE_FIFO(long, free);
825   - long i;
826   - size_t p = strtoul_or_return(buf);
827   -
828   - p = clamp_t(size_t,
829   - ((size_t) ca->sb.nbuckets * p) / 100,
830   - roundup_pow_of_two(ca->sb.nbuckets) >> 9,
831   - ca->sb.nbuckets / 2);
832   -
833   - if (!init_fifo_exact(&free, p, GFP_KERNEL))
834   - return -ENOMEM;
835   -
836   - mutex_lock(&ca->set->bucket_lock);
837   -
838   - fifo_move(&free, &ca->free);
839   - fifo_swap(&free, &ca->free);
840   -
841   - mutex_unlock(&ca->set->bucket_lock);
842   -
843   - while (fifo_pop(&free, i))
844   - atomic_dec(&ca->buckets[i].pin);
845   -
846   - free_fifo(&free);
847   - }
848   -
849 819 if (attr == &sysfs_clear_stats) {
850 820 atomic_long_set(&ca->sectors_written, 0);
851 821 atomic_long_set(&ca->btree_sectors_written, 0);
... ... @@ -869,7 +839,6 @@
869 839 &sysfs_metadata_written,
870 840 &sysfs_io_errors,
871 841 &sysfs_clear_stats,
872   - &sysfs_freelist_percent,
873 842 &sysfs_cache_replacement_policy,
874 843 NULL
875 844 };
include/trace/events/bcache.h
... ... @@ -411,7 +411,7 @@
411 411 ),
412 412  
413 413 TP_fast_assign(
414   - __entry->free = fifo_used(&ca->free);
  414 + __entry->free = fifo_used(&ca->free[RESERVE_NONE]);
415 415 __entry->free_inc = fifo_used(&ca->free_inc);
416 416 __entry->free_inc_size = ca->free_inc.size;
417 417 __entry->unused = fifo_used(&ca->unused);
... ... @@ -422,8 +422,8 @@
422 422 );
423 423  
424 424 TRACE_EVENT(bcache_alloc_fail,
425   - TP_PROTO(struct cache *ca),
426   - TP_ARGS(ca),
  425 + TP_PROTO(struct cache *ca, unsigned reserve),
  426 + TP_ARGS(ca, reserve),
427 427  
428 428 TP_STRUCT__entry(
429 429 __field(unsigned, free )
... ... @@ -433,7 +433,7 @@
433 433 ),
434 434  
435 435 TP_fast_assign(
436   - __entry->free = fifo_used(&ca->free);
  436 + __entry->free = fifo_used(&ca->free[reserve]);
437 437 __entry->free_inc = fifo_used(&ca->free_inc);
438 438 __entry->unused = fifo_used(&ca->unused);
439 439 __entry->blocked = atomic_read(&ca->set->prio_blocked);