Commit 8e51e414a3c6d92ef2cc41720c67342a8e2c0bf7

Authored by Kent Overstreet
Committed by Kent Overstreet
1 parent 47cd2eb0ee

bcache: Use standard utility code

Some of bcache's utility code has made it into the rest of the kernel,
so drop the bcache versions.

Bcache used to have a workaround for allocating from a bio set under
generic_make_request(): if you allocated more than once, the bios you
had already allocated would get stuck on current->bio_list when you
submitted, and you'd risk deadlock. To avoid that, bcache would mask
out __GFP_WAIT when allocating bios under generic_make_request(), so
the allocation could fail and be retried from a workqueue. But
bio_alloc_bioset() now handles this itself, so we can drop the hack
and the associated error handling.
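
For illustration only (not code from this commit or from bcache): a
minimal sketch of the old pattern described above. The helper name is
hypothetical, and it assumes the pre-4.4 __GFP_WAIT flag and the
three-argument bio_alloc_bioset() of this era.

    #include <linux/bio.h>
    #include <linux/sched.h>

    /*
     * Hypothetical sketch of the old workaround: when running under
     * generic_make_request() (current->bio_list is non-NULL), don't
     * sleep waiting on the bio_set's mempool - a bio allocated earlier
     * may be parked on current->bio_list, so blocking could deadlock.
     * Instead let the allocation fail; the caller would punt the
     * request to a workqueue and retry the allocation from there.
     */
    static struct bio *sketch_bio_alloc(struct bio_set *bs, unsigned nr_vecs)
    {
    	gfp_t gfp = GFP_NOIO;

    	if (current->bio_list)
    		gfp &= ~__GFP_WAIT;

    	return bio_alloc_bioset(gfp, nr_vecs, bs);	/* may be NULL now */
    }

With the workaround now inside bio_alloc_bioset() itself (as the
message above says), callers no longer need this masking or the NULL
retry path, which is what this commit removes.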

Signed-off-by: Kent Overstreet <koverstreet@google.com>

Showing 8 changed files with 51 additions and 144 deletions

drivers/md/bcache/btree.c
1 /* 1 /*
2 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> 2 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
3 * 3 *
4 * Uses a block device as cache for other block devices; optimized for SSDs. 4 * Uses a block device as cache for other block devices; optimized for SSDs.
5 * All allocation is done in buckets, which should match the erase block size 5 * All allocation is done in buckets, which should match the erase block size
6 * of the device. 6 * of the device.
7 * 7 *
8 * Buckets containing cached data are kept on a heap sorted by priority; 8 * Buckets containing cached data are kept on a heap sorted by priority;
9 * bucket priority is increased on cache hit, and periodically all the buckets 9 * bucket priority is increased on cache hit, and periodically all the buckets
10 * on the heap have their priority scaled down. This currently is just used as 10 * on the heap have their priority scaled down. This currently is just used as
11 * an LRU but in the future should allow for more intelligent heuristics. 11 * an LRU but in the future should allow for more intelligent heuristics.
12 * 12 *
13 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the 13 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
14 * counter. Garbage collection is used to remove stale pointers. 14 * counter. Garbage collection is used to remove stale pointers.
15 * 15 *
16 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather 16 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
17 * as keys are inserted we only sort the pages that have not yet been written. 17 * as keys are inserted we only sort the pages that have not yet been written.
18 * When garbage collection is run, we resort the entire node. 18 * When garbage collection is run, we resort the entire node.
19 * 19 *
20 * All configuration is done via sysfs; see Documentation/bcache.txt. 20 * All configuration is done via sysfs; see Documentation/bcache.txt.
21 */ 21 */
22 22
23 #include "bcache.h" 23 #include "bcache.h"
24 #include "btree.h" 24 #include "btree.h"
25 #include "debug.h" 25 #include "debug.h"
26 #include "request.h" 26 #include "request.h"
27 #include "writeback.h" 27 #include "writeback.h"
28 28
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/bitops.h> 30 #include <linux/bitops.h>
31 #include <linux/hash.h> 31 #include <linux/hash.h>
32 #include <linux/prefetch.h> 32 #include <linux/prefetch.h>
33 #include <linux/random.h> 33 #include <linux/random.h>
34 #include <linux/rcupdate.h> 34 #include <linux/rcupdate.h>
35 #include <trace/events/bcache.h> 35 #include <trace/events/bcache.h>
36 36
37 /* 37 /*
38 * Todo: 38 * Todo:
39 * register_bcache: Return errors out to userspace correctly 39 * register_bcache: Return errors out to userspace correctly
40 * 40 *
41 * Writeback: don't undirty key until after a cache flush 41 * Writeback: don't undirty key until after a cache flush
42 * 42 *
43 * Create an iterator for key pointers 43 * Create an iterator for key pointers
44 * 44 *
45 * On btree write error, mark bucket such that it won't be freed from the cache 45 * On btree write error, mark bucket such that it won't be freed from the cache
46 * 46 *
47 * Journalling: 47 * Journalling:
48 * Check for bad keys in replay 48 * Check for bad keys in replay
49 * Propagate barriers 49 * Propagate barriers
50 * Refcount journal entries in journal_replay 50 * Refcount journal entries in journal_replay
51 * 51 *
52 * Garbage collection: 52 * Garbage collection:
53 * Finish incremental gc 53 * Finish incremental gc
54 * Gc should free old UUIDs, data for invalid UUIDs 54 * Gc should free old UUIDs, data for invalid UUIDs
55 * 55 *
56 * Provide a way to list backing device UUIDs we have data cached for, and 56 * Provide a way to list backing device UUIDs we have data cached for, and
57 * probably how long it's been since we've seen them, and a way to invalidate 57 * probably how long it's been since we've seen them, and a way to invalidate
58 * dirty data for devices that will never be attached again 58 * dirty data for devices that will never be attached again
59 * 59 *
60 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so 60 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
61 * that based on that and how much dirty data we have we can keep writeback 61 * that based on that and how much dirty data we have we can keep writeback
62 * from being starved 62 * from being starved
63 * 63 *
64 * Add a tracepoint or somesuch to watch for writeback starvation 64 * Add a tracepoint or somesuch to watch for writeback starvation
65 * 65 *
66 * When btree depth > 1 and splitting an interior node, we have to make sure 66 * When btree depth > 1 and splitting an interior node, we have to make sure
67 * alloc_bucket() cannot fail. This should be true but is not completely 67 * alloc_bucket() cannot fail. This should be true but is not completely
68 * obvious. 68 * obvious.
69 * 69 *
70 * Make sure all allocations get charged to the root cgroup 70 * Make sure all allocations get charged to the root cgroup
71 * 71 *
72 * Plugging? 72 * Plugging?
73 * 73 *
74 * If data write is less than hard sector size of ssd, round up offset in open 74 * If data write is less than hard sector size of ssd, round up offset in open
75 * bucket to the next whole sector 75 * bucket to the next whole sector
76 * 76 *
77 * Also lookup by cgroup in get_open_bucket() 77 * Also lookup by cgroup in get_open_bucket()
78 * 78 *
79 * Superblock needs to be fleshed out for multiple cache devices 79 * Superblock needs to be fleshed out for multiple cache devices
80 * 80 *
81 * Add a sysfs tunable for the number of writeback IOs in flight 81 * Add a sysfs tunable for the number of writeback IOs in flight
82 * 82 *
83 * Add a sysfs tunable for the number of open data buckets 83 * Add a sysfs tunable for the number of open data buckets
84 * 84 *
85 * IO tracking: Can we track when one process is doing io on behalf of another? 85 * IO tracking: Can we track when one process is doing io on behalf of another?
86 * IO tracking: Don't use just an average, weigh more recent stuff higher 86 * IO tracking: Don't use just an average, weigh more recent stuff higher
87 * 87 *
88 * Test module load/unload 88 * Test module load/unload
89 */ 89 */
90 90
91 static const char * const op_types[] = { 91 static const char * const op_types[] = {
92 "insert", "replace" 92 "insert", "replace"
93 }; 93 };
94 94
95 static const char *op_type(struct btree_op *op) 95 static const char *op_type(struct btree_op *op)
96 { 96 {
97 return op_types[op->type]; 97 return op_types[op->type];
98 } 98 }
99 99
100 #define MAX_NEED_GC 64 100 #define MAX_NEED_GC 64
101 #define MAX_SAVE_PRIO 72 101 #define MAX_SAVE_PRIO 72
102 102
103 #define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) 103 #define PTR_DIRTY_BIT (((uint64_t) 1 << 36))
104 104
105 #define PTR_HASH(c, k) \ 105 #define PTR_HASH(c, k) \
106 (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) 106 (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
107 107
108 struct workqueue_struct *bch_gc_wq; 108 struct workqueue_struct *bch_gc_wq;
109 static struct workqueue_struct *btree_io_wq; 109 static struct workqueue_struct *btree_io_wq;
110 110
111 void bch_btree_op_init_stack(struct btree_op *op) 111 void bch_btree_op_init_stack(struct btree_op *op)
112 { 112 {
113 memset(op, 0, sizeof(struct btree_op)); 113 memset(op, 0, sizeof(struct btree_op));
114 closure_init_stack(&op->cl); 114 closure_init_stack(&op->cl);
115 op->lock = -1; 115 op->lock = -1;
116 bch_keylist_init(&op->keys); 116 bch_keylist_init(&op->keys);
117 } 117 }
118 118
119 /* Btree key manipulation */ 119 /* Btree key manipulation */
120 120
121 static void bkey_put(struct cache_set *c, struct bkey *k, int level) 121 static void bkey_put(struct cache_set *c, struct bkey *k, int level)
122 { 122 {
123 if ((level && KEY_OFFSET(k)) || !level) 123 if ((level && KEY_OFFSET(k)) || !level)
124 __bkey_put(c, k); 124 __bkey_put(c, k);
125 } 125 }
126 126
127 /* Btree IO */ 127 /* Btree IO */
128 128
129 static uint64_t btree_csum_set(struct btree *b, struct bset *i) 129 static uint64_t btree_csum_set(struct btree *b, struct bset *i)
130 { 130 {
131 uint64_t crc = b->key.ptr[0]; 131 uint64_t crc = b->key.ptr[0];
132 void *data = (void *) i + 8, *end = end(i); 132 void *data = (void *) i + 8, *end = end(i);
133 133
134 crc = bch_crc64_update(crc, data, end - data); 134 crc = bch_crc64_update(crc, data, end - data);
135 return crc ^ 0xffffffffffffffffULL; 135 return crc ^ 0xffffffffffffffffULL;
136 } 136 }
137 137
138 static void bch_btree_node_read_done(struct btree *b) 138 static void bch_btree_node_read_done(struct btree *b)
139 { 139 {
140 const char *err = "bad btree header"; 140 const char *err = "bad btree header";
141 struct bset *i = b->sets[0].data; 141 struct bset *i = b->sets[0].data;
142 struct btree_iter *iter; 142 struct btree_iter *iter;
143 143
144 iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); 144 iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
145 iter->size = b->c->sb.bucket_size / b->c->sb.block_size; 145 iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
146 iter->used = 0; 146 iter->used = 0;
147 147
148 if (!i->seq) 148 if (!i->seq)
149 goto err; 149 goto err;
150 150
151 for (; 151 for (;
152 b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; 152 b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq;
153 i = write_block(b)) { 153 i = write_block(b)) {
154 err = "unsupported bset version"; 154 err = "unsupported bset version";
155 if (i->version > BCACHE_BSET_VERSION) 155 if (i->version > BCACHE_BSET_VERSION)
156 goto err; 156 goto err;
157 157
158 err = "bad btree header"; 158 err = "bad btree header";
159 if (b->written + set_blocks(i, b->c) > btree_blocks(b)) 159 if (b->written + set_blocks(i, b->c) > btree_blocks(b))
160 goto err; 160 goto err;
161 161
162 err = "bad magic"; 162 err = "bad magic";
163 if (i->magic != bset_magic(b->c)) 163 if (i->magic != bset_magic(b->c))
164 goto err; 164 goto err;
165 165
166 err = "bad checksum"; 166 err = "bad checksum";
167 switch (i->version) { 167 switch (i->version) {
168 case 0: 168 case 0:
169 if (i->csum != csum_set(i)) 169 if (i->csum != csum_set(i))
170 goto err; 170 goto err;
171 break; 171 break;
172 case BCACHE_BSET_VERSION: 172 case BCACHE_BSET_VERSION:
173 if (i->csum != btree_csum_set(b, i)) 173 if (i->csum != btree_csum_set(b, i))
174 goto err; 174 goto err;
175 break; 175 break;
176 } 176 }
177 177
178 err = "empty set"; 178 err = "empty set";
179 if (i != b->sets[0].data && !i->keys) 179 if (i != b->sets[0].data && !i->keys)
180 goto err; 180 goto err;
181 181
182 bch_btree_iter_push(iter, i->start, end(i)); 182 bch_btree_iter_push(iter, i->start, end(i));
183 183
184 b->written += set_blocks(i, b->c); 184 b->written += set_blocks(i, b->c);
185 } 185 }
186 186
187 err = "corrupted btree"; 187 err = "corrupted btree";
188 for (i = write_block(b); 188 for (i = write_block(b);
189 index(i, b) < btree_blocks(b); 189 index(i, b) < btree_blocks(b);
190 i = ((void *) i) + block_bytes(b->c)) 190 i = ((void *) i) + block_bytes(b->c))
191 if (i->seq == b->sets[0].data->seq) 191 if (i->seq == b->sets[0].data->seq)
192 goto err; 192 goto err;
193 193
194 bch_btree_sort_and_fix_extents(b, iter); 194 bch_btree_sort_and_fix_extents(b, iter);
195 195
196 i = b->sets[0].data; 196 i = b->sets[0].data;
197 err = "short btree key"; 197 err = "short btree key";
198 if (b->sets[0].size && 198 if (b->sets[0].size &&
199 bkey_cmp(&b->key, &b->sets[0].end) < 0) 199 bkey_cmp(&b->key, &b->sets[0].end) < 0)
200 goto err; 200 goto err;
201 201
202 if (b->written < btree_blocks(b)) 202 if (b->written < btree_blocks(b))
203 bch_bset_init_next(b); 203 bch_bset_init_next(b);
204 out: 204 out:
205 mempool_free(iter, b->c->fill_iter); 205 mempool_free(iter, b->c->fill_iter);
206 return; 206 return;
207 err: 207 err:
208 set_btree_node_io_error(b); 208 set_btree_node_io_error(b);
209 bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", 209 bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
210 err, PTR_BUCKET_NR(b->c, &b->key, 0), 210 err, PTR_BUCKET_NR(b->c, &b->key, 0),
211 index(i, b), i->keys); 211 index(i, b), i->keys);
212 goto out; 212 goto out;
213 } 213 }
214 214
215 static void btree_node_read_endio(struct bio *bio, int error) 215 static void btree_node_read_endio(struct bio *bio, int error)
216 { 216 {
217 struct closure *cl = bio->bi_private; 217 struct closure *cl = bio->bi_private;
218 closure_put(cl); 218 closure_put(cl);
219 } 219 }
220 220
221 void bch_btree_node_read(struct btree *b) 221 void bch_btree_node_read(struct btree *b)
222 { 222 {
223 uint64_t start_time = local_clock(); 223 uint64_t start_time = local_clock();
224 struct closure cl; 224 struct closure cl;
225 struct bio *bio; 225 struct bio *bio;
226 226
227 trace_bcache_btree_read(b); 227 trace_bcache_btree_read(b);
228 228
229 closure_init_stack(&cl); 229 closure_init_stack(&cl);
230 230
231 bio = bch_bbio_alloc(b->c); 231 bio = bch_bbio_alloc(b->c);
232 bio->bi_rw = REQ_META|READ_SYNC; 232 bio->bi_rw = REQ_META|READ_SYNC;
233 bio->bi_size = KEY_SIZE(&b->key) << 9; 233 bio->bi_size = KEY_SIZE(&b->key) << 9;
234 bio->bi_end_io = btree_node_read_endio; 234 bio->bi_end_io = btree_node_read_endio;
235 bio->bi_private = &cl; 235 bio->bi_private = &cl;
236 236
237 bch_bio_map(bio, b->sets[0].data); 237 bch_bio_map(bio, b->sets[0].data);
238 238
239 bch_submit_bbio(bio, b->c, &b->key, 0); 239 bch_submit_bbio(bio, b->c, &b->key, 0);
240 closure_sync(&cl); 240 closure_sync(&cl);
241 241
242 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 242 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
243 set_btree_node_io_error(b); 243 set_btree_node_io_error(b);
244 244
245 bch_bbio_free(bio, b->c); 245 bch_bbio_free(bio, b->c);
246 246
247 if (btree_node_io_error(b)) 247 if (btree_node_io_error(b))
248 goto err; 248 goto err;
249 249
250 bch_btree_node_read_done(b); 250 bch_btree_node_read_done(b);
251 251
252 spin_lock(&b->c->btree_read_time_lock); 252 spin_lock(&b->c->btree_read_time_lock);
253 bch_time_stats_update(&b->c->btree_read_time, start_time); 253 bch_time_stats_update(&b->c->btree_read_time, start_time);
254 spin_unlock(&b->c->btree_read_time_lock); 254 spin_unlock(&b->c->btree_read_time_lock);
255 255
256 return; 256 return;
257 err: 257 err:
258 bch_cache_set_error(b->c, "io error reading bucket %lu", 258 bch_cache_set_error(b->c, "io error reading bucket %lu",
259 PTR_BUCKET_NR(b->c, &b->key, 0)); 259 PTR_BUCKET_NR(b->c, &b->key, 0));
260 } 260 }
261 261
262 static void btree_complete_write(struct btree *b, struct btree_write *w) 262 static void btree_complete_write(struct btree *b, struct btree_write *w)
263 { 263 {
264 if (w->prio_blocked && 264 if (w->prio_blocked &&
265 !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) 265 !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
266 wake_up_allocators(b->c); 266 wake_up_allocators(b->c);
267 267
268 if (w->journal) { 268 if (w->journal) {
269 atomic_dec_bug(w->journal); 269 atomic_dec_bug(w->journal);
270 __closure_wake_up(&b->c->journal.wait); 270 __closure_wake_up(&b->c->journal.wait);
271 } 271 }
272 272
273 w->prio_blocked = 0; 273 w->prio_blocked = 0;
274 w->journal = NULL; 274 w->journal = NULL;
275 } 275 }
276 276
277 static void __btree_node_write_done(struct closure *cl) 277 static void __btree_node_write_done(struct closure *cl)
278 { 278 {
279 struct btree *b = container_of(cl, struct btree, io.cl); 279 struct btree *b = container_of(cl, struct btree, io.cl);
280 struct btree_write *w = btree_prev_write(b); 280 struct btree_write *w = btree_prev_write(b);
281 281
282 bch_bbio_free(b->bio, b->c); 282 bch_bbio_free(b->bio, b->c);
283 b->bio = NULL; 283 b->bio = NULL;
284 btree_complete_write(b, w); 284 btree_complete_write(b, w);
285 285
286 if (btree_node_dirty(b)) 286 if (btree_node_dirty(b))
287 queue_delayed_work(btree_io_wq, &b->work, 287 queue_delayed_work(btree_io_wq, &b->work,
288 msecs_to_jiffies(30000)); 288 msecs_to_jiffies(30000));
289 289
290 closure_return(cl); 290 closure_return(cl);
291 } 291 }
292 292
293 static void btree_node_write_done(struct closure *cl) 293 static void btree_node_write_done(struct closure *cl)
294 { 294 {
295 struct btree *b = container_of(cl, struct btree, io.cl); 295 struct btree *b = container_of(cl, struct btree, io.cl);
296 struct bio_vec *bv; 296 struct bio_vec *bv;
297 int n; 297 int n;
298 298
299 __bio_for_each_segment(bv, b->bio, n, 0) 299 __bio_for_each_segment(bv, b->bio, n, 0)
300 __free_page(bv->bv_page); 300 __free_page(bv->bv_page);
301 301
302 __btree_node_write_done(cl); 302 __btree_node_write_done(cl);
303 } 303 }
304 304
305 static void btree_node_write_endio(struct bio *bio, int error) 305 static void btree_node_write_endio(struct bio *bio, int error)
306 { 306 {
307 struct closure *cl = bio->bi_private; 307 struct closure *cl = bio->bi_private;
308 struct btree *b = container_of(cl, struct btree, io.cl); 308 struct btree *b = container_of(cl, struct btree, io.cl);
309 309
310 if (error) 310 if (error)
311 set_btree_node_io_error(b); 311 set_btree_node_io_error(b);
312 312
313 bch_bbio_count_io_errors(b->c, bio, error, "writing btree"); 313 bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
314 closure_put(cl); 314 closure_put(cl);
315 } 315 }
316 316
317 static void do_btree_node_write(struct btree *b) 317 static void do_btree_node_write(struct btree *b)
318 { 318 {
319 struct closure *cl = &b->io.cl; 319 struct closure *cl = &b->io.cl;
320 struct bset *i = b->sets[b->nsets].data; 320 struct bset *i = b->sets[b->nsets].data;
321 BKEY_PADDED(key) k; 321 BKEY_PADDED(key) k;
322 322
323 i->version = BCACHE_BSET_VERSION; 323 i->version = BCACHE_BSET_VERSION;
324 i->csum = btree_csum_set(b, i); 324 i->csum = btree_csum_set(b, i);
325 325
326 BUG_ON(b->bio); 326 BUG_ON(b->bio);
327 b->bio = bch_bbio_alloc(b->c); 327 b->bio = bch_bbio_alloc(b->c);
328 328
329 b->bio->bi_end_io = btree_node_write_endio; 329 b->bio->bi_end_io = btree_node_write_endio;
330 b->bio->bi_private = &b->io.cl; 330 b->bio->bi_private = &b->io.cl;
331 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; 331 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
332 b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); 332 b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
333 bch_bio_map(b->bio, i); 333 bch_bio_map(b->bio, i);
334 334
335 /* 335 /*
336 * If we're appending to a leaf node, we don't technically need FUA - 336 * If we're appending to a leaf node, we don't technically need FUA -
337 * this write just needs to be persisted before the next journal write, 337 * this write just needs to be persisted before the next journal write,
338 * which will be marked FLUSH|FUA. 338 * which will be marked FLUSH|FUA.
339 * 339 *
340 * Similarly if we're writing a new btree root - the pointer is going to 340 * Similarly if we're writing a new btree root - the pointer is going to
341 * be in the next journal entry. 341 * be in the next journal entry.
342 * 342 *
343 * But if we're writing a new btree node (that isn't a root) or 343 * But if we're writing a new btree node (that isn't a root) or
344 * appending to a non leaf btree node, we need either FUA or a flush 344 * appending to a non leaf btree node, we need either FUA or a flush
345 * when we write the parent with the new pointer. FUA is cheaper than a 345 * when we write the parent with the new pointer. FUA is cheaper than a
346 * flush, and writes appending to leaf nodes aren't blocking anything so 346 * flush, and writes appending to leaf nodes aren't blocking anything so
347 * just make all btree node writes FUA to keep things sane. 347 * just make all btree node writes FUA to keep things sane.
348 */ 348 */
349 349
350 bkey_copy(&k.key, &b->key); 350 bkey_copy(&k.key, &b->key);
351 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); 351 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
352 352
353 if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) { 353 if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
354 int j; 354 int j;
355 struct bio_vec *bv; 355 struct bio_vec *bv;
356 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); 356 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
357 357
358 bio_for_each_segment(bv, b->bio, j) 358 bio_for_each_segment(bv, b->bio, j)
359 memcpy(page_address(bv->bv_page), 359 memcpy(page_address(bv->bv_page),
360 base + j * PAGE_SIZE, PAGE_SIZE); 360 base + j * PAGE_SIZE, PAGE_SIZE);
361 361
362 bch_submit_bbio(b->bio, b->c, &k.key, 0); 362 bch_submit_bbio(b->bio, b->c, &k.key, 0);
363 363
364 continue_at(cl, btree_node_write_done, NULL); 364 continue_at(cl, btree_node_write_done, NULL);
365 } else { 365 } else {
366 b->bio->bi_vcnt = 0; 366 b->bio->bi_vcnt = 0;
367 bch_bio_map(b->bio, i); 367 bch_bio_map(b->bio, i);
368 368
369 bch_submit_bbio(b->bio, b->c, &k.key, 0); 369 bch_submit_bbio(b->bio, b->c, &k.key, 0);
370 370
371 closure_sync(cl); 371 closure_sync(cl);
372 __btree_node_write_done(cl); 372 __btree_node_write_done(cl);
373 } 373 }
374 } 374 }
375 375
376 void bch_btree_node_write(struct btree *b, struct closure *parent) 376 void bch_btree_node_write(struct btree *b, struct closure *parent)
377 { 377 {
378 struct bset *i = b->sets[b->nsets].data; 378 struct bset *i = b->sets[b->nsets].data;
379 379
380 trace_bcache_btree_write(b); 380 trace_bcache_btree_write(b);
381 381
382 BUG_ON(current->bio_list); 382 BUG_ON(current->bio_list);
383 BUG_ON(b->written >= btree_blocks(b)); 383 BUG_ON(b->written >= btree_blocks(b));
384 BUG_ON(b->written && !i->keys); 384 BUG_ON(b->written && !i->keys);
385 BUG_ON(b->sets->data->seq != i->seq); 385 BUG_ON(b->sets->data->seq != i->seq);
386 bch_check_key_order(b, i); 386 bch_check_key_order(b, i);
387 387
388 cancel_delayed_work(&b->work); 388 cancel_delayed_work(&b->work);
389 389
390 /* If caller isn't waiting for write, parent refcount is cache set */ 390 /* If caller isn't waiting for write, parent refcount is cache set */
391 closure_lock(&b->io, parent ?: &b->c->cl); 391 closure_lock(&b->io, parent ?: &b->c->cl);
392 392
393 clear_bit(BTREE_NODE_dirty, &b->flags); 393 clear_bit(BTREE_NODE_dirty, &b->flags);
394 change_bit(BTREE_NODE_write_idx, &b->flags); 394 change_bit(BTREE_NODE_write_idx, &b->flags);
395 395
396 do_btree_node_write(b); 396 do_btree_node_write(b);
397 397
398 b->written += set_blocks(i, b->c); 398 b->written += set_blocks(i, b->c);
399 atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, 399 atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
400 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); 400 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
401 401
402 bch_btree_sort_lazy(b); 402 bch_btree_sort_lazy(b);
403 403
404 if (b->written < btree_blocks(b)) 404 if (b->written < btree_blocks(b))
405 bch_bset_init_next(b); 405 bch_bset_init_next(b);
406 } 406 }
407 407
408 static void btree_node_write_work(struct work_struct *w) 408 static void btree_node_write_work(struct work_struct *w)
409 { 409 {
410 struct btree *b = container_of(to_delayed_work(w), struct btree, work); 410 struct btree *b = container_of(to_delayed_work(w), struct btree, work);
411 411
412 rw_lock(true, b, b->level); 412 rw_lock(true, b, b->level);
413 413
414 if (btree_node_dirty(b)) 414 if (btree_node_dirty(b))
415 bch_btree_node_write(b, NULL); 415 bch_btree_node_write(b, NULL);
416 rw_unlock(true, b); 416 rw_unlock(true, b);
417 } 417 }
418 418
419 static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) 419 static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op)
420 { 420 {
421 struct bset *i = b->sets[b->nsets].data; 421 struct bset *i = b->sets[b->nsets].data;
422 struct btree_write *w = btree_current_write(b); 422 struct btree_write *w = btree_current_write(b);
423 423
424 BUG_ON(!b->written); 424 BUG_ON(!b->written);
425 BUG_ON(!i->keys); 425 BUG_ON(!i->keys);
426 426
427 if (!btree_node_dirty(b)) 427 if (!btree_node_dirty(b))
428 queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); 428 queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
429 429
430 set_btree_node_dirty(b); 430 set_btree_node_dirty(b);
431 431
432 if (op && op->journal) { 432 if (op && op->journal) {
433 if (w->journal && 433 if (w->journal &&
434 journal_pin_cmp(b->c, w, op)) { 434 journal_pin_cmp(b->c, w, op)) {
435 atomic_dec_bug(w->journal); 435 atomic_dec_bug(w->journal);
436 w->journal = NULL; 436 w->journal = NULL;
437 } 437 }
438 438
439 if (!w->journal) { 439 if (!w->journal) {
440 w->journal = op->journal; 440 w->journal = op->journal;
441 atomic_inc(w->journal); 441 atomic_inc(w->journal);
442 } 442 }
443 } 443 }
444 444
445 /* Force write if set is too big */ 445 /* Force write if set is too big */
446 if (set_bytes(i) > PAGE_SIZE - 48 && 446 if (set_bytes(i) > PAGE_SIZE - 48 &&
447 !current->bio_list) 447 !current->bio_list)
448 bch_btree_node_write(b, NULL); 448 bch_btree_node_write(b, NULL);
449 } 449 }
450 450
451 /* 451 /*
452 * Btree in memory cache - allocation/freeing 452 * Btree in memory cache - allocation/freeing
453 * mca -> memory cache 453 * mca -> memory cache
454 */ 454 */
455 455
456 static void mca_reinit(struct btree *b) 456 static void mca_reinit(struct btree *b)
457 { 457 {
458 unsigned i; 458 unsigned i;
459 459
460 b->flags = 0; 460 b->flags = 0;
461 b->written = 0; 461 b->written = 0;
462 b->nsets = 0; 462 b->nsets = 0;
463 463
464 for (i = 0; i < MAX_BSETS; i++) 464 for (i = 0; i < MAX_BSETS; i++)
465 b->sets[i].size = 0; 465 b->sets[i].size = 0;
466 /* 466 /*
467 * Second loop starts at 1 because b->sets[0]->data is the memory we 467 * Second loop starts at 1 because b->sets[0]->data is the memory we
468 * allocated 468 * allocated
469 */ 469 */
470 for (i = 1; i < MAX_BSETS; i++) 470 for (i = 1; i < MAX_BSETS; i++)
471 b->sets[i].data = NULL; 471 b->sets[i].data = NULL;
472 } 472 }
473 473
474 #define mca_reserve(c) (((c->root && c->root->level) \ 474 #define mca_reserve(c) (((c->root && c->root->level) \
475 ? c->root->level : 1) * 8 + 16) 475 ? c->root->level : 1) * 8 + 16)
476 #define mca_can_free(c) \ 476 #define mca_can_free(c) \
477 max_t(int, 0, c->bucket_cache_used - mca_reserve(c)) 477 max_t(int, 0, c->bucket_cache_used - mca_reserve(c))
478 478
479 static void mca_data_free(struct btree *b) 479 static void mca_data_free(struct btree *b)
480 { 480 {
481 struct bset_tree *t = b->sets; 481 struct bset_tree *t = b->sets;
482 BUG_ON(!closure_is_unlocked(&b->io.cl)); 482 BUG_ON(!closure_is_unlocked(&b->io.cl));
483 483
484 if (bset_prev_bytes(b) < PAGE_SIZE) 484 if (bset_prev_bytes(b) < PAGE_SIZE)
485 kfree(t->prev); 485 kfree(t->prev);
486 else 486 else
487 free_pages((unsigned long) t->prev, 487 free_pages((unsigned long) t->prev,
488 get_order(bset_prev_bytes(b))); 488 get_order(bset_prev_bytes(b)));
489 489
490 if (bset_tree_bytes(b) < PAGE_SIZE) 490 if (bset_tree_bytes(b) < PAGE_SIZE)
491 kfree(t->tree); 491 kfree(t->tree);
492 else 492 else
493 free_pages((unsigned long) t->tree, 493 free_pages((unsigned long) t->tree,
494 get_order(bset_tree_bytes(b))); 494 get_order(bset_tree_bytes(b)));
495 495
496 free_pages((unsigned long) t->data, b->page_order); 496 free_pages((unsigned long) t->data, b->page_order);
497 497
498 t->prev = NULL; 498 t->prev = NULL;
499 t->tree = NULL; 499 t->tree = NULL;
500 t->data = NULL; 500 t->data = NULL;
501 list_move(&b->list, &b->c->btree_cache_freed); 501 list_move(&b->list, &b->c->btree_cache_freed);
502 b->c->bucket_cache_used--; 502 b->c->bucket_cache_used--;
503 } 503 }
504 504
505 static void mca_bucket_free(struct btree *b) 505 static void mca_bucket_free(struct btree *b)
506 { 506 {
507 BUG_ON(btree_node_dirty(b)); 507 BUG_ON(btree_node_dirty(b));
508 508
509 b->key.ptr[0] = 0; 509 b->key.ptr[0] = 0;
510 hlist_del_init_rcu(&b->hash); 510 hlist_del_init_rcu(&b->hash);
511 list_move(&b->list, &b->c->btree_cache_freeable); 511 list_move(&b->list, &b->c->btree_cache_freeable);
512 } 512 }
513 513
514 static unsigned btree_order(struct bkey *k) 514 static unsigned btree_order(struct bkey *k)
515 { 515 {
516 return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); 516 return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
517 } 517 }
518 518
519 static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) 519 static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
520 { 520 {
521 struct bset_tree *t = b->sets; 521 struct bset_tree *t = b->sets;
522 BUG_ON(t->data); 522 BUG_ON(t->data);
523 523
524 b->page_order = max_t(unsigned, 524 b->page_order = max_t(unsigned,
525 ilog2(b->c->btree_pages), 525 ilog2(b->c->btree_pages),
526 btree_order(k)); 526 btree_order(k));
527 527
528 t->data = (void *) __get_free_pages(gfp, b->page_order); 528 t->data = (void *) __get_free_pages(gfp, b->page_order);
529 if (!t->data) 529 if (!t->data)
530 goto err; 530 goto err;
531 531
532 t->tree = bset_tree_bytes(b) < PAGE_SIZE 532 t->tree = bset_tree_bytes(b) < PAGE_SIZE
533 ? kmalloc(bset_tree_bytes(b), gfp) 533 ? kmalloc(bset_tree_bytes(b), gfp)
534 : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); 534 : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
535 if (!t->tree) 535 if (!t->tree)
536 goto err; 536 goto err;
537 537
538 t->prev = bset_prev_bytes(b) < PAGE_SIZE 538 t->prev = bset_prev_bytes(b) < PAGE_SIZE
539 ? kmalloc(bset_prev_bytes(b), gfp) 539 ? kmalloc(bset_prev_bytes(b), gfp)
540 : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); 540 : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
541 if (!t->prev) 541 if (!t->prev)
542 goto err; 542 goto err;
543 543
544 list_move(&b->list, &b->c->btree_cache); 544 list_move(&b->list, &b->c->btree_cache);
545 b->c->bucket_cache_used++; 545 b->c->bucket_cache_used++;
546 return; 546 return;
547 err: 547 err:
548 mca_data_free(b); 548 mca_data_free(b);
549 } 549 }
550 550
551 static struct btree *mca_bucket_alloc(struct cache_set *c, 551 static struct btree *mca_bucket_alloc(struct cache_set *c,
552 struct bkey *k, gfp_t gfp) 552 struct bkey *k, gfp_t gfp)
553 { 553 {
554 struct btree *b = kzalloc(sizeof(struct btree), gfp); 554 struct btree *b = kzalloc(sizeof(struct btree), gfp);
555 if (!b) 555 if (!b)
556 return NULL; 556 return NULL;
557 557
558 init_rwsem(&b->lock); 558 init_rwsem(&b->lock);
559 lockdep_set_novalidate_class(&b->lock); 559 lockdep_set_novalidate_class(&b->lock);
560 INIT_LIST_HEAD(&b->list); 560 INIT_LIST_HEAD(&b->list);
561 INIT_DELAYED_WORK(&b->work, btree_node_write_work); 561 INIT_DELAYED_WORK(&b->work, btree_node_write_work);
562 b->c = c; 562 b->c = c;
563 closure_init_unlocked(&b->io); 563 closure_init_unlocked(&b->io);
564 564
565 mca_data_alloc(b, k, gfp); 565 mca_data_alloc(b, k, gfp);
566 return b; 566 return b;
567 } 567 }
568 568
569 static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) 569 static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order)
570 { 570 {
571 lockdep_assert_held(&b->c->bucket_lock); 571 lockdep_assert_held(&b->c->bucket_lock);
572 572
573 if (!down_write_trylock(&b->lock)) 573 if (!down_write_trylock(&b->lock))
574 return -ENOMEM; 574 return -ENOMEM;
575 575
576 if (b->page_order < min_order) { 576 if (b->page_order < min_order) {
577 rw_unlock(true, b); 577 rw_unlock(true, b);
578 return -ENOMEM; 578 return -ENOMEM;
579 } 579 }
580 580
581 BUG_ON(btree_node_dirty(b) && !b->sets[0].data); 581 BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
582 582
583 if (cl && btree_node_dirty(b)) 583 if (cl && btree_node_dirty(b))
584 bch_btree_node_write(b, NULL); 584 bch_btree_node_write(b, NULL);
585 585
586 if (cl) 586 if (cl)
587 closure_wait_event_async(&b->io.wait, cl, 587 closure_wait_event_async(&b->io.wait, cl,
588 atomic_read(&b->io.cl.remaining) == -1); 588 atomic_read(&b->io.cl.remaining) == -1);
589 589
590 if (btree_node_dirty(b) || 590 if (btree_node_dirty(b) ||
591 !closure_is_unlocked(&b->io.cl) || 591 !closure_is_unlocked(&b->io.cl) ||
592 work_pending(&b->work.work)) { 592 work_pending(&b->work.work)) {
593 rw_unlock(true, b); 593 rw_unlock(true, b);
594 return -EAGAIN; 594 return -EAGAIN;
595 } 595 }
596 596
597 return 0; 597 return 0;
598 } 598 }
599 599
600 static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc) 600 static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
601 { 601 {
602 struct cache_set *c = container_of(shrink, struct cache_set, shrink); 602 struct cache_set *c = container_of(shrink, struct cache_set, shrink);
603 struct btree *b, *t; 603 struct btree *b, *t;
604 unsigned long i, nr = sc->nr_to_scan; 604 unsigned long i, nr = sc->nr_to_scan;
605 605
606 if (c->shrinker_disabled) 606 if (c->shrinker_disabled)
607 return 0; 607 return 0;
608 608
609 if (c->try_harder) 609 if (c->try_harder)
610 return 0; 610 return 0;
611 611
612 /* 612 /*
613 * If nr == 0, we're supposed to return the number of items we have 613 * If nr == 0, we're supposed to return the number of items we have
614 * cached. Not allowed to return -1. 614 * cached. Not allowed to return -1.
615 */ 615 */
616 if (!nr) 616 if (!nr)
617 return mca_can_free(c) * c->btree_pages; 617 return mca_can_free(c) * c->btree_pages;
618 618
619 /* Return -1 if we can't do anything right now */ 619 /* Return -1 if we can't do anything right now */
620 if (sc->gfp_mask & __GFP_WAIT) 620 if (sc->gfp_mask & __GFP_WAIT)
621 mutex_lock(&c->bucket_lock); 621 mutex_lock(&c->bucket_lock);
622 else if (!mutex_trylock(&c->bucket_lock)) 622 else if (!mutex_trylock(&c->bucket_lock))
623 return -1; 623 return -1;
624 624
625 /* 625 /*
626 * It's _really_ critical that we don't free too many btree nodes - we 626 * It's _really_ critical that we don't free too many btree nodes - we
627 * have to always leave ourselves a reserve. The reserve is how we 627 * have to always leave ourselves a reserve. The reserve is how we
628 * guarantee that allocating memory for a new btree node can always 628 * guarantee that allocating memory for a new btree node can always
629 * succeed, so that inserting keys into the btree can always succeed and 629 * succeed, so that inserting keys into the btree can always succeed and
630 * IO can always make forward progress: 630 * IO can always make forward progress:
631 */ 631 */
632 nr /= c->btree_pages; 632 nr /= c->btree_pages;
633 nr = min_t(unsigned long, nr, mca_can_free(c)); 633 nr = min_t(unsigned long, nr, mca_can_free(c));
634 634
635 i = 0; 635 i = 0;
636 list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { 636 list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
637 if (!nr) 637 if (!nr)
638 break; 638 break;
639 639
640 if (++i > 3 && 640 if (++i > 3 &&
641 !mca_reap(b, NULL, 0)) { 641 !mca_reap(b, NULL, 0)) {
642 mca_data_free(b); 642 mca_data_free(b);
643 rw_unlock(true, b); 643 rw_unlock(true, b);
644 --nr; 644 --nr;
645 } 645 }
646 } 646 }
647 647
648 /* 648 /*
649 * Can happen right when we first start up, before we've read in any 649 * Can happen right when we first start up, before we've read in any
650 * btree nodes 650 * btree nodes
651 */ 651 */
652 if (list_empty(&c->btree_cache)) 652 if (list_empty(&c->btree_cache))
653 goto out; 653 goto out;
654 654
655 for (i = 0; nr && i < c->bucket_cache_used; i++) { 655 for (i = 0; nr && i < c->bucket_cache_used; i++) {
656 b = list_first_entry(&c->btree_cache, struct btree, list); 656 b = list_first_entry(&c->btree_cache, struct btree, list);
657 list_rotate_left(&c->btree_cache); 657 list_rotate_left(&c->btree_cache);
658 658
659 if (!b->accessed && 659 if (!b->accessed &&
660 !mca_reap(b, NULL, 0)) { 660 !mca_reap(b, NULL, 0)) {
661 mca_bucket_free(b); 661 mca_bucket_free(b);
662 mca_data_free(b); 662 mca_data_free(b);
663 rw_unlock(true, b); 663 rw_unlock(true, b);
664 --nr; 664 --nr;
665 } else 665 } else
666 b->accessed = 0; 666 b->accessed = 0;
667 } 667 }
668 out: 668 out:
669 nr = mca_can_free(c) * c->btree_pages; 669 nr = mca_can_free(c) * c->btree_pages;
670 mutex_unlock(&c->bucket_lock); 670 mutex_unlock(&c->bucket_lock);
671 return nr; 671 return nr;
672 } 672 }
673 673
674 void bch_btree_cache_free(struct cache_set *c) 674 void bch_btree_cache_free(struct cache_set *c)
675 { 675 {
676 struct btree *b; 676 struct btree *b;
677 struct closure cl; 677 struct closure cl;
678 closure_init_stack(&cl); 678 closure_init_stack(&cl);
679 679
680 if (c->shrink.list.next) 680 if (c->shrink.list.next)
681 unregister_shrinker(&c->shrink); 681 unregister_shrinker(&c->shrink);
682 682
683 mutex_lock(&c->bucket_lock); 683 mutex_lock(&c->bucket_lock);
684 684
685 #ifdef CONFIG_BCACHE_DEBUG 685 #ifdef CONFIG_BCACHE_DEBUG
686 if (c->verify_data) 686 if (c->verify_data)
687 list_move(&c->verify_data->list, &c->btree_cache); 687 list_move(&c->verify_data->list, &c->btree_cache);
688 #endif 688 #endif
689 689
690 list_splice(&c->btree_cache_freeable, 690 list_splice(&c->btree_cache_freeable,
691 &c->btree_cache); 691 &c->btree_cache);
692 692
693 while (!list_empty(&c->btree_cache)) { 693 while (!list_empty(&c->btree_cache)) {
694 b = list_first_entry(&c->btree_cache, struct btree, list); 694 b = list_first_entry(&c->btree_cache, struct btree, list);
695 695
696 if (btree_node_dirty(b)) 696 if (btree_node_dirty(b))
697 btree_complete_write(b, btree_current_write(b)); 697 btree_complete_write(b, btree_current_write(b));
698 clear_bit(BTREE_NODE_dirty, &b->flags); 698 clear_bit(BTREE_NODE_dirty, &b->flags);
699 699
700 mca_data_free(b); 700 mca_data_free(b);
701 } 701 }
702 702
703 while (!list_empty(&c->btree_cache_freed)) { 703 while (!list_empty(&c->btree_cache_freed)) {
704 b = list_first_entry(&c->btree_cache_freed, 704 b = list_first_entry(&c->btree_cache_freed,
705 struct btree, list); 705 struct btree, list);
706 list_del(&b->list); 706 list_del(&b->list);
707 cancel_delayed_work_sync(&b->work); 707 cancel_delayed_work_sync(&b->work);
708 kfree(b); 708 kfree(b);
709 } 709 }
710 710
711 mutex_unlock(&c->bucket_lock); 711 mutex_unlock(&c->bucket_lock);
712 } 712 }
713 713
714 int bch_btree_cache_alloc(struct cache_set *c) 714 int bch_btree_cache_alloc(struct cache_set *c)
715 { 715 {
716 unsigned i; 716 unsigned i;
717 717
718 /* XXX: doesn't check for errors */ 718 /* XXX: doesn't check for errors */
719 719
720 closure_init_unlocked(&c->gc); 720 closure_init_unlocked(&c->gc);
721 721
722 for (i = 0; i < mca_reserve(c); i++) 722 for (i = 0; i < mca_reserve(c); i++)
723 mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); 723 mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
724 724
725 list_splice_init(&c->btree_cache, 725 list_splice_init(&c->btree_cache,
726 &c->btree_cache_freeable); 726 &c->btree_cache_freeable);
727 727
728 #ifdef CONFIG_BCACHE_DEBUG 728 #ifdef CONFIG_BCACHE_DEBUG
729 mutex_init(&c->verify_lock); 729 mutex_init(&c->verify_lock);
730 730
731 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); 731 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
732 732
733 if (c->verify_data && 733 if (c->verify_data &&
734 c->verify_data->sets[0].data) 734 c->verify_data->sets[0].data)
735 list_del_init(&c->verify_data->list); 735 list_del_init(&c->verify_data->list);
736 else 736 else
737 c->verify_data = NULL; 737 c->verify_data = NULL;
738 #endif 738 #endif
739 739
740 c->shrink.shrink = bch_mca_shrink; 740 c->shrink.shrink = bch_mca_shrink;
741 c->shrink.seeks = 4; 741 c->shrink.seeks = 4;
742 c->shrink.batch = c->btree_pages * 2; 742 c->shrink.batch = c->btree_pages * 2;
743 register_shrinker(&c->shrink); 743 register_shrinker(&c->shrink);
744 744
745 return 0; 745 return 0;
746 } 746 }
747 747
748 /* Btree in memory cache - hash table */ 748 /* Btree in memory cache - hash table */
749 749
750 static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k) 750 static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
751 { 751 {
752 return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)]; 752 return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
753 } 753 }
754 754
755 static struct btree *mca_find(struct cache_set *c, struct bkey *k) 755 static struct btree *mca_find(struct cache_set *c, struct bkey *k)
756 { 756 {
757 struct btree *b; 757 struct btree *b;
758 758
759 rcu_read_lock(); 759 rcu_read_lock();
760 hlist_for_each_entry_rcu(b, mca_hash(c, k), hash) 760 hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
761 if (PTR_HASH(c, &b->key) == PTR_HASH(c, k)) 761 if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
762 goto out; 762 goto out;
763 b = NULL; 763 b = NULL;
764 out: 764 out:
765 rcu_read_unlock(); 765 rcu_read_unlock();
766 return b; 766 return b;
767 } 767 }
768 768
769 static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, 769 static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
770 int level, struct closure *cl) 770 int level, struct closure *cl)
771 { 771 {
772 int ret = -ENOMEM; 772 int ret = -ENOMEM;
773 struct btree *i; 773 struct btree *i;
774 774
775 trace_bcache_btree_cache_cannibalize(c); 775 trace_bcache_btree_cache_cannibalize(c);
776 776
777 if (!cl) 777 if (!cl)
778 return ERR_PTR(-ENOMEM); 778 return ERR_PTR(-ENOMEM);
779 779
780 /* 780 /*
781 * Trying to free up some memory - i.e. reuse some btree nodes - may 781 * Trying to free up some memory - i.e. reuse some btree nodes - may
782 * require initiating IO to flush the dirty part of the node. If we're 782 * require initiating IO to flush the dirty part of the node. If we're
783 * running under generic_make_request(), that IO will never finish and 783 * running under generic_make_request(), that IO will never finish and
784 * we would deadlock. Returning -EAGAIN causes the cache lookup code to 784 * we would deadlock. Returning -EAGAIN causes the cache lookup code to
785 * punt to workqueue and retry. 785 * punt to workqueue and retry.
786 */ 786 */
787 if (current->bio_list) 787 if (current->bio_list)
788 return ERR_PTR(-EAGAIN); 788 return ERR_PTR(-EAGAIN);
789 789
790 if (c->try_harder && c->try_harder != cl) { 790 if (c->try_harder && c->try_harder != cl) {
791 closure_wait_event_async(&c->try_wait, cl, !c->try_harder); 791 closure_wait_event_async(&c->try_wait, cl, !c->try_harder);
792 return ERR_PTR(-EAGAIN); 792 return ERR_PTR(-EAGAIN);
793 } 793 }
794 794
795 c->try_harder = cl; 795 c->try_harder = cl;
796 c->try_harder_start = local_clock(); 796 c->try_harder_start = local_clock();
797 retry: 797 retry:
798 list_for_each_entry_reverse(i, &c->btree_cache, list) { 798 list_for_each_entry_reverse(i, &c->btree_cache, list) {
799 int r = mca_reap(i, cl, btree_order(k)); 799 int r = mca_reap(i, cl, btree_order(k));
800 if (!r) 800 if (!r)
801 return i; 801 return i;
802 if (r != -ENOMEM) 802 if (r != -ENOMEM)
803 ret = r; 803 ret = r;
804 } 804 }
805 805
806 if (ret == -EAGAIN && 806 if (ret == -EAGAIN &&
807 closure_blocking(cl)) { 807 closure_blocking(cl)) {
808 mutex_unlock(&c->bucket_lock); 808 mutex_unlock(&c->bucket_lock);
809 closure_sync(cl); 809 closure_sync(cl);
810 mutex_lock(&c->bucket_lock); 810 mutex_lock(&c->bucket_lock);
811 goto retry; 811 goto retry;
812 } 812 }
813 813
814 return ERR_PTR(ret); 814 return ERR_PTR(ret);
815 } 815 }
816 816
817 /* 817 /*
818 * We can only have one thread cannibalizing other cached btree nodes at a time, 818 * We can only have one thread cannibalizing other cached btree nodes at a time,
819 * or we'll deadlock. We use an open coded mutex to ensure that, which a 819 * or we'll deadlock. We use an open coded mutex to ensure that, which a
820 * cannibalize_bucket() will take. This means every time we unlock the root of 820 * cannibalize_bucket() will take. This means every time we unlock the root of
821 * the btree, we need to release this lock if we have it held. 821 * the btree, we need to release this lock if we have it held.
822 */ 822 */
823 void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) 823 void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl)
824 { 824 {
825 if (c->try_harder == cl) { 825 if (c->try_harder == cl) {
826 bch_time_stats_update(&c->try_harder_time, c->try_harder_start); 826 bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
827 c->try_harder = NULL; 827 c->try_harder = NULL;
828 __closure_wake_up(&c->try_wait); 828 __closure_wake_up(&c->try_wait);
829 } 829 }
830 } 830 }
831 831
832 static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, 832 static struct btree *mca_alloc(struct cache_set *c, struct bkey *k,
833 int level, struct closure *cl) 833 int level, struct closure *cl)
834 { 834 {
835 struct btree *b; 835 struct btree *b;
836 836
837 lockdep_assert_held(&c->bucket_lock); 837 lockdep_assert_held(&c->bucket_lock);
838 838
839 if (mca_find(c, k)) 839 if (mca_find(c, k))
840 return NULL; 840 return NULL;
841 841
842 /* btree_free() doesn't free memory; it sticks the node on the end of 842 /* btree_free() doesn't free memory; it sticks the node on the end of
843 * the list. Check if there's any freed nodes there: 843 * the list. Check if there's any freed nodes there:
844 */ 844 */
845 list_for_each_entry(b, &c->btree_cache_freeable, list) 845 list_for_each_entry(b, &c->btree_cache_freeable, list)
846 if (!mca_reap(b, NULL, btree_order(k))) 846 if (!mca_reap(b, NULL, btree_order(k)))
847 goto out; 847 goto out;
848 848
849 /* We never free struct btree itself, just the memory that holds the on 849 /* We never free struct btree itself, just the memory that holds the on
850 * disk node. Check the freed list before allocating a new one: 850 * disk node. Check the freed list before allocating a new one:
851 */ 851 */
852 list_for_each_entry(b, &c->btree_cache_freed, list) 852 list_for_each_entry(b, &c->btree_cache_freed, list)
853 if (!mca_reap(b, NULL, 0)) { 853 if (!mca_reap(b, NULL, 0)) {
854 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); 854 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
855 if (!b->sets[0].data) 855 if (!b->sets[0].data)
856 goto err; 856 goto err;
857 else 857 else
858 goto out; 858 goto out;
859 } 859 }
860 860
861 b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO); 861 b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
862 if (!b) 862 if (!b)
863 goto err; 863 goto err;
864 864
865 BUG_ON(!down_write_trylock(&b->lock)); 865 BUG_ON(!down_write_trylock(&b->lock));
866 if (!b->sets->data) 866 if (!b->sets->data)
867 goto err; 867 goto err;
868 out: 868 out:
869 BUG_ON(!closure_is_unlocked(&b->io.cl)); 869 BUG_ON(!closure_is_unlocked(&b->io.cl));
870 870
871 bkey_copy(&b->key, k); 871 bkey_copy(&b->key, k);
872 list_move(&b->list, &c->btree_cache); 872 list_move(&b->list, &c->btree_cache);
873 hlist_del_init_rcu(&b->hash); 873 hlist_del_init_rcu(&b->hash);
874 hlist_add_head_rcu(&b->hash, mca_hash(c, k)); 874 hlist_add_head_rcu(&b->hash, mca_hash(c, k));
875 875
876 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); 876 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
877 b->level = level; 877 b->level = level;
878 878
879 mca_reinit(b); 879 mca_reinit(b);
880 880
881 return b; 881 return b;
882 err: 882 err:
883 if (b) 883 if (b)
884 rw_unlock(true, b); 884 rw_unlock(true, b);
885 885
886 b = mca_cannibalize(c, k, level, cl); 886 b = mca_cannibalize(c, k, level, cl);
887 if (!IS_ERR(b)) 887 if (!IS_ERR(b))
888 goto out; 888 goto out;
889 889
890 return b; 890 return b;
891 } 891 }
892 892
893 /** 893 /**
894 * bch_btree_node_get - find a btree node in the cache and lock it, reading it 894 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
895 * in from disk if necessary. 895 * in from disk if necessary.
896 * 896 *
897 * If IO is necessary, it uses the closure embedded in struct btree_op to wait; 897 * If IO is necessary, it uses the closure embedded in struct btree_op to wait;
898 * if that closure is in non blocking mode, will return -EAGAIN. 898 * if that closure is in non blocking mode, will return -EAGAIN.
899 * 899 *
900 * The btree node will have either a read or a write lock held, depending on 900 * The btree node will have either a read or a write lock held, depending on
901 * level and op->lock. 901 * level and op->lock.
902 */ 902 */
903 struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, 903 struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
904 int level, struct btree_op *op) 904 int level, struct btree_op *op)
905 { 905 {
906 int i = 0; 906 int i = 0;
907 bool write = level <= op->lock; 907 bool write = level <= op->lock;
908 struct btree *b; 908 struct btree *b;
909 909
910 BUG_ON(level < 0); 910 BUG_ON(level < 0);
911 retry: 911 retry:
912 b = mca_find(c, k); 912 b = mca_find(c, k);
913 913
914 if (!b) { 914 if (!b) {
915 if (current->bio_list) 915 if (current->bio_list)
916 return ERR_PTR(-EAGAIN); 916 return ERR_PTR(-EAGAIN);
917 917
918 mutex_lock(&c->bucket_lock); 918 mutex_lock(&c->bucket_lock);
919 b = mca_alloc(c, k, level, &op->cl); 919 b = mca_alloc(c, k, level, &op->cl);
920 mutex_unlock(&c->bucket_lock); 920 mutex_unlock(&c->bucket_lock);
921 921
922 if (!b) 922 if (!b)
923 goto retry; 923 goto retry;
924 if (IS_ERR(b)) 924 if (IS_ERR(b))
925 return b; 925 return b;
926 926
927 bch_btree_node_read(b); 927 bch_btree_node_read(b);
928 928
929 if (!write) 929 if (!write)
930 downgrade_write(&b->lock); 930 downgrade_write(&b->lock);
931 } else { 931 } else {
932 rw_lock(write, b, level); 932 rw_lock(write, b, level);
933 if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) { 933 if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
934 rw_unlock(write, b); 934 rw_unlock(write, b);
935 goto retry; 935 goto retry;
936 } 936 }
937 BUG_ON(b->level != level); 937 BUG_ON(b->level != level);
938 } 938 }
939 939
940 b->accessed = 1; 940 b->accessed = 1;
941 941
942 for (; i <= b->nsets && b->sets[i].size; i++) { 942 for (; i <= b->nsets && b->sets[i].size; i++) {
943 prefetch(b->sets[i].tree); 943 prefetch(b->sets[i].tree);
944 prefetch(b->sets[i].data); 944 prefetch(b->sets[i].data);
945 } 945 }
946 946
947 for (; i <= b->nsets; i++) 947 for (; i <= b->nsets; i++)
948 prefetch(b->sets[i].data); 948 prefetch(b->sets[i].data);
949 949
950 if (btree_node_io_error(b)) { 950 if (btree_node_io_error(b)) {
951 rw_unlock(write, b); 951 rw_unlock(write, b);
952 return ERR_PTR(-EIO); 952 return ERR_PTR(-EIO);
953 } 953 }
954 954
955 BUG_ON(!b->written); 955 BUG_ON(!b->written);
956 956
957 return b; 957 return b;
958 } 958 }
959 959
960 static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) 960 static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
961 { 961 {
962 struct btree *b; 962 struct btree *b;
963 963
964 mutex_lock(&c->bucket_lock); 964 mutex_lock(&c->bucket_lock);
965 b = mca_alloc(c, k, level, NULL); 965 b = mca_alloc(c, k, level, NULL);
966 mutex_unlock(&c->bucket_lock); 966 mutex_unlock(&c->bucket_lock);
967 967
968 if (!IS_ERR_OR_NULL(b)) { 968 if (!IS_ERR_OR_NULL(b)) {
969 bch_btree_node_read(b); 969 bch_btree_node_read(b);
970 rw_unlock(true, b); 970 rw_unlock(true, b);
971 } 971 }
972 } 972 }
973 973
974 /* Btree alloc */ 974 /* Btree alloc */
975 975
976 static void btree_node_free(struct btree *b, struct btree_op *op) 976 static void btree_node_free(struct btree *b, struct btree_op *op)
977 { 977 {
978 unsigned i; 978 unsigned i;
979 979
980 trace_bcache_btree_node_free(b); 980 trace_bcache_btree_node_free(b);
981 981
982 /* 982 /*
983 * The BUG_ON() in btree_node_get() implies that we must have a write 983 * The BUG_ON() in btree_node_get() implies that we must have a write
984 * lock on parent to free or even invalidate a node 984 * lock on parent to free or even invalidate a node
985 */ 985 */
986 BUG_ON(op->lock <= b->level); 986 BUG_ON(op->lock <= b->level);
987 BUG_ON(b == b->c->root); 987 BUG_ON(b == b->c->root);
988 988
989 if (btree_node_dirty(b)) 989 if (btree_node_dirty(b))
990 btree_complete_write(b, btree_current_write(b)); 990 btree_complete_write(b, btree_current_write(b));
991 clear_bit(BTREE_NODE_dirty, &b->flags); 991 clear_bit(BTREE_NODE_dirty, &b->flags);
992 992
993 cancel_delayed_work(&b->work); 993 cancel_delayed_work(&b->work);
994 994
995 mutex_lock(&b->c->bucket_lock); 995 mutex_lock(&b->c->bucket_lock);
996 996
997 for (i = 0; i < KEY_PTRS(&b->key); i++) { 997 for (i = 0; i < KEY_PTRS(&b->key); i++) {
998 BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin)); 998 BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin));
999 999
1000 bch_inc_gen(PTR_CACHE(b->c, &b->key, i), 1000 bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
1001 PTR_BUCKET(b->c, &b->key, i)); 1001 PTR_BUCKET(b->c, &b->key, i));
1002 } 1002 }
1003 1003
1004 bch_bucket_free(b->c, &b->key); 1004 bch_bucket_free(b->c, &b->key);
1005 mca_bucket_free(b); 1005 mca_bucket_free(b);
1006 mutex_unlock(&b->c->bucket_lock); 1006 mutex_unlock(&b->c->bucket_lock);
1007 } 1007 }
1008 1008
1009 struct btree *bch_btree_node_alloc(struct cache_set *c, int level, 1009 struct btree *bch_btree_node_alloc(struct cache_set *c, int level,
1010 struct closure *cl) 1010 struct closure *cl)
1011 { 1011 {
1012 BKEY_PADDED(key) k; 1012 BKEY_PADDED(key) k;
1013 struct btree *b = ERR_PTR(-EAGAIN); 1013 struct btree *b = ERR_PTR(-EAGAIN);
1014 1014
1015 mutex_lock(&c->bucket_lock); 1015 mutex_lock(&c->bucket_lock);
1016 retry: 1016 retry:
1017 if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl)) 1017 if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl))
1018 goto err; 1018 goto err;
1019 1019
1020 SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); 1020 SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
1021 1021
1022 b = mca_alloc(c, &k.key, level, cl); 1022 b = mca_alloc(c, &k.key, level, cl);
1023 if (IS_ERR(b)) 1023 if (IS_ERR(b))
1024 goto err_free; 1024 goto err_free;
1025 1025
1026 if (!b) { 1026 if (!b) {
1027 cache_bug(c, 1027 cache_bug(c,
1028 "Tried to allocate bucket that was in btree cache"); 1028 "Tried to allocate bucket that was in btree cache");
1029 __bkey_put(c, &k.key); 1029 __bkey_put(c, &k.key);
1030 goto retry; 1030 goto retry;
1031 } 1031 }
1032 1032
1033 b->accessed = 1; 1033 b->accessed = 1;
1034 bch_bset_init_next(b); 1034 bch_bset_init_next(b);
1035 1035
1036 mutex_unlock(&c->bucket_lock); 1036 mutex_unlock(&c->bucket_lock);
1037 1037
1038 trace_bcache_btree_node_alloc(b); 1038 trace_bcache_btree_node_alloc(b);
1039 return b; 1039 return b;
1040 err_free: 1040 err_free:
1041 bch_bucket_free(c, &k.key); 1041 bch_bucket_free(c, &k.key);
1042 __bkey_put(c, &k.key); 1042 __bkey_put(c, &k.key);
1043 err: 1043 err:
1044 mutex_unlock(&c->bucket_lock); 1044 mutex_unlock(&c->bucket_lock);
1045 1045
1046 trace_bcache_btree_node_alloc_fail(b); 1046 trace_bcache_btree_node_alloc_fail(b);
1047 return b; 1047 return b;
1048 } 1048 }
1049 1049
1050 static struct btree *btree_node_alloc_replacement(struct btree *b, 1050 static struct btree *btree_node_alloc_replacement(struct btree *b,
1051 struct closure *cl) 1051 struct closure *cl)
1052 { 1052 {
1053 struct btree *n = bch_btree_node_alloc(b->c, b->level, cl); 1053 struct btree *n = bch_btree_node_alloc(b->c, b->level, cl);
1054 if (!IS_ERR_OR_NULL(n)) 1054 if (!IS_ERR_OR_NULL(n))
1055 bch_btree_sort_into(b, n); 1055 bch_btree_sort_into(b, n);
1056 1056
1057 return n; 1057 return n;
1058 } 1058 }
1059 1059
1060 /* Garbage collection */ 1060 /* Garbage collection */
1061 1061
1062 uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) 1062 uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
1063 { 1063 {
1064 uint8_t stale = 0; 1064 uint8_t stale = 0;
1065 unsigned i; 1065 unsigned i;
1066 struct bucket *g; 1066 struct bucket *g;
1067 1067
1068 /* 1068 /*
1069 * ptr_invalid() can't return true for the keys that mark btree nodes as 1069 * ptr_invalid() can't return true for the keys that mark btree nodes as
1070 * freed, but since ptr_bad() returns true we'll never actually use them 1070 * freed, but since ptr_bad() returns true we'll never actually use them
1071 * for anything and thus we don't want to mark their pointers here 1071 * for anything and thus we don't want to mark their pointers here
1072 */ 1072 */
1073 if (!bkey_cmp(k, &ZERO_KEY)) 1073 if (!bkey_cmp(k, &ZERO_KEY))
1074 return stale; 1074 return stale;
1075 1075
1076 for (i = 0; i < KEY_PTRS(k); i++) { 1076 for (i = 0; i < KEY_PTRS(k); i++) {
1077 if (!ptr_available(c, k, i)) 1077 if (!ptr_available(c, k, i))
1078 continue; 1078 continue;
1079 1079
1080 g = PTR_BUCKET(c, k, i); 1080 g = PTR_BUCKET(c, k, i);
1081 1081
1082 if (gen_after(g->gc_gen, PTR_GEN(k, i))) 1082 if (gen_after(g->gc_gen, PTR_GEN(k, i)))
1083 g->gc_gen = PTR_GEN(k, i); 1083 g->gc_gen = PTR_GEN(k, i);
1084 1084
1085 if (ptr_stale(c, k, i)) { 1085 if (ptr_stale(c, k, i)) {
1086 stale = max(stale, ptr_stale(c, k, i)); 1086 stale = max(stale, ptr_stale(c, k, i));
1087 continue; 1087 continue;
1088 } 1088 }
1089 1089
1090 cache_bug_on(GC_MARK(g) && 1090 cache_bug_on(GC_MARK(g) &&
1091 (GC_MARK(g) == GC_MARK_METADATA) != (level != 0), 1091 (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
1092 c, "inconsistent ptrs: mark = %llu, level = %i", 1092 c, "inconsistent ptrs: mark = %llu, level = %i",
1093 GC_MARK(g), level); 1093 GC_MARK(g), level);
1094 1094
1095 if (level) 1095 if (level)
1096 SET_GC_MARK(g, GC_MARK_METADATA); 1096 SET_GC_MARK(g, GC_MARK_METADATA);
1097 else if (KEY_DIRTY(k)) 1097 else if (KEY_DIRTY(k))
1098 SET_GC_MARK(g, GC_MARK_DIRTY); 1098 SET_GC_MARK(g, GC_MARK_DIRTY);
1099 1099
1100 /* guard against overflow */ 1100 /* guard against overflow */
1101 SET_GC_SECTORS_USED(g, min_t(unsigned, 1101 SET_GC_SECTORS_USED(g, min_t(unsigned,
1102 GC_SECTORS_USED(g) + KEY_SIZE(k), 1102 GC_SECTORS_USED(g) + KEY_SIZE(k),
1103 (1 << 14) - 1)); 1103 (1 << 14) - 1));
1104 1104
1105 BUG_ON(!GC_SECTORS_USED(g)); 1105 BUG_ON(!GC_SECTORS_USED(g));
1106 } 1106 }
1107 1107
1108 return stale; 1108 return stale;
1109 } 1109 }
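
Editor's note: the staleness test above rests on 8-bit bucket generations — a key records the bucket's gen at allocation time and goes stale once the bucket's gen advances past it — so the comparison has to survive wraparound of the counter. The toy below reimplements just that idea; gen_after()/staleness() here are sketches, not the bcache functions.

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

/* Toy model of 8-bit bucket generations: a cached pointer remembers the
 * bucket's gen at allocation time and becomes stale once the bucket is
 * reused and its gen moves past that value.  The comparison is done via
 * a signed 8-bit difference so it stays correct across wraparound. */
static bool gen_after(uint8_t a, uint8_t b)
{
	return (int8_t)(a - b) > 0;
}

static uint8_t staleness(uint8_t bucket_gen, uint8_t key_gen)
{
	return gen_after(bucket_gen, key_gen) ?
		(uint8_t)(bucket_gen - key_gen) : 0;
}

int main(void)
{
	uint8_t key_gen = 250;		/* gen recorded in the key */
	uint8_t bucket_gen = 250;

	bucket_gen += 10;		/* bucket reused; counter wraps past 255 */

	printf("stale: %u generations\n", staleness(bucket_gen, key_gen));
	return 0;
}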
1110 1110
1111 #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) 1111 #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k)
1112 1112
1113 static int btree_gc_mark_node(struct btree *b, unsigned *keys, 1113 static int btree_gc_mark_node(struct btree *b, unsigned *keys,
1114 struct gc_stat *gc) 1114 struct gc_stat *gc)
1115 { 1115 {
1116 uint8_t stale = 0; 1116 uint8_t stale = 0;
1117 unsigned last_dev = -1; 1117 unsigned last_dev = -1;
1118 struct bcache_device *d = NULL; 1118 struct bcache_device *d = NULL;
1119 struct bkey *k; 1119 struct bkey *k;
1120 struct btree_iter iter; 1120 struct btree_iter iter;
1121 struct bset_tree *t; 1121 struct bset_tree *t;
1122 1122
1123 gc->nodes++; 1123 gc->nodes++;
1124 1124
1125 for_each_key_filter(b, k, &iter, bch_ptr_invalid) { 1125 for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
1126 if (last_dev != KEY_INODE(k)) { 1126 if (last_dev != KEY_INODE(k)) {
1127 last_dev = KEY_INODE(k); 1127 last_dev = KEY_INODE(k);
1128 1128
1129 d = KEY_INODE(k) < b->c->nr_uuids 1129 d = KEY_INODE(k) < b->c->nr_uuids
1130 ? b->c->devices[last_dev] 1130 ? b->c->devices[last_dev]
1131 : NULL; 1131 : NULL;
1132 } 1132 }
1133 1133
1134 stale = max(stale, btree_mark_key(b, k)); 1134 stale = max(stale, btree_mark_key(b, k));
1135 1135
1136 if (bch_ptr_bad(b, k)) 1136 if (bch_ptr_bad(b, k))
1137 continue; 1137 continue;
1138 1138
1139 *keys += bkey_u64s(k); 1139 *keys += bkey_u64s(k);
1140 1140
1141 gc->key_bytes += bkey_u64s(k); 1141 gc->key_bytes += bkey_u64s(k);
1142 gc->nkeys++; 1142 gc->nkeys++;
1143 1143
1144 gc->data += KEY_SIZE(k); 1144 gc->data += KEY_SIZE(k);
1145 if (KEY_DIRTY(k)) 1145 if (KEY_DIRTY(k))
1146 gc->dirty += KEY_SIZE(k); 1146 gc->dirty += KEY_SIZE(k);
1147 } 1147 }
1148 1148
1149 for (t = b->sets; t <= &b->sets[b->nsets]; t++) 1149 for (t = b->sets; t <= &b->sets[b->nsets]; t++)
1150 btree_bug_on(t->size && 1150 btree_bug_on(t->size &&
1151 bset_written(b, t) && 1151 bset_written(b, t) &&
1152 bkey_cmp(&b->key, &t->end) < 0, 1152 bkey_cmp(&b->key, &t->end) < 0,
1153 b, "found short btree key in gc"); 1153 b, "found short btree key in gc");
1154 1154
1155 return stale; 1155 return stale;
1156 } 1156 }
1157 1157
1158 static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, 1158 static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
1159 struct btree_op *op) 1159 struct btree_op *op)
1160 { 1160 {
1161 /* 1161 /*
1162 * We block priorities from being written for the duration of garbage 1162 * We block priorities from being written for the duration of garbage
1163 * collection, so we can't sleep in btree_alloc() -> 1163 * collection, so we can't sleep in btree_alloc() ->
1164 * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it 1164 * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it
1165 * our closure. 1165 * our closure.
1166 */ 1166 */
1167 struct btree *n = btree_node_alloc_replacement(b, NULL); 1167 struct btree *n = btree_node_alloc_replacement(b, NULL);
1168 1168
1169 if (!IS_ERR_OR_NULL(n)) { 1169 if (!IS_ERR_OR_NULL(n)) {
1170 swap(b, n); 1170 swap(b, n);
1171 __bkey_put(b->c, &b->key); 1171 __bkey_put(b->c, &b->key);
1172 1172
1173 memcpy(k->ptr, b->key.ptr, 1173 memcpy(k->ptr, b->key.ptr,
1174 sizeof(uint64_t) * KEY_PTRS(&b->key)); 1174 sizeof(uint64_t) * KEY_PTRS(&b->key));
1175 1175
1176 btree_node_free(n, op); 1176 btree_node_free(n, op);
1177 up_write(&n->lock); 1177 up_write(&n->lock);
1178 } 1178 }
1179 1179
1180 return b; 1180 return b;
1181 } 1181 }
1182 1182
1183 /* 1183 /*
1184 * Leaving this at 2 until we've got incremental garbage collection done; it 1184 * Leaving this at 2 until we've got incremental garbage collection done; it
1185 * could be higher (and has been tested with 4) except that garbage collection 1185 * could be higher (and has been tested with 4) except that garbage collection
1186 * could take much longer, adversely affecting latency. 1186 * could take much longer, adversely affecting latency.
1187 */ 1187 */
1188 #define GC_MERGE_NODES 2U 1188 #define GC_MERGE_NODES 2U
1189 1189
1190 struct gc_merge_info { 1190 struct gc_merge_info {
1191 struct btree *b; 1191 struct btree *b;
1192 struct bkey *k; 1192 struct bkey *k;
1193 unsigned keys; 1193 unsigned keys;
1194 }; 1194 };
1195 1195
1196 static void btree_gc_coalesce(struct btree *b, struct btree_op *op, 1196 static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
1197 struct gc_stat *gc, struct gc_merge_info *r) 1197 struct gc_stat *gc, struct gc_merge_info *r)
1198 { 1198 {
1199 unsigned nodes = 0, keys = 0, blocks; 1199 unsigned nodes = 0, keys = 0, blocks;
1200 int i; 1200 int i;
1201 1201
1202 while (nodes < GC_MERGE_NODES && r[nodes].b) 1202 while (nodes < GC_MERGE_NODES && r[nodes].b)
1203 keys += r[nodes++].keys; 1203 keys += r[nodes++].keys;
1204 1204
1205 blocks = btree_default_blocks(b->c) * 2 / 3; 1205 blocks = btree_default_blocks(b->c) * 2 / 3;
1206 1206
1207 if (nodes < 2 || 1207 if (nodes < 2 ||
1208 __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) 1208 __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
1209 return; 1209 return;
1210 1210
1211 for (i = nodes - 1; i >= 0; --i) { 1211 for (i = nodes - 1; i >= 0; --i) {
1212 if (r[i].b->written) 1212 if (r[i].b->written)
1213 r[i].b = btree_gc_alloc(r[i].b, r[i].k, op); 1213 r[i].b = btree_gc_alloc(r[i].b, r[i].k, op);
1214 1214
1215 if (r[i].b->written) 1215 if (r[i].b->written)
1216 return; 1216 return;
1217 } 1217 }
1218 1218
1219 for (i = nodes - 1; i > 0; --i) { 1219 for (i = nodes - 1; i > 0; --i) {
1220 struct bset *n1 = r[i].b->sets->data; 1220 struct bset *n1 = r[i].b->sets->data;
1221 struct bset *n2 = r[i - 1].b->sets->data; 1221 struct bset *n2 = r[i - 1].b->sets->data;
1222 struct bkey *k, *last = NULL; 1222 struct bkey *k, *last = NULL;
1223 1223
1224 keys = 0; 1224 keys = 0;
1225 1225
1226 if (i == 1) { 1226 if (i == 1) {
1227 /* 1227 /*
1228 * Last node we're not getting rid of - we're getting 1228 * Last node we're not getting rid of - we're getting
1229 * rid of the node at r[0]. We have to try to fit all of 1229 * rid of the node at r[0]. We have to try to fit all of
1230 * the remaining keys into this node; we can't ensure 1230 * the remaining keys into this node; we can't ensure
1231 * they will always fit due to rounding and variable 1231 * they will always fit due to rounding and variable
1232 * length keys (shouldn't be possible in practice, 1232 * length keys (shouldn't be possible in practice,
1233 * though) 1233 * though)
1234 */ 1234 */
1235 if (__set_blocks(n1, n1->keys + r->keys, 1235 if (__set_blocks(n1, n1->keys + r->keys,
1236 b->c) > btree_blocks(r[i].b)) 1236 b->c) > btree_blocks(r[i].b))
1237 return; 1237 return;
1238 1238
1239 keys = n2->keys; 1239 keys = n2->keys;
1240 last = &r->b->key; 1240 last = &r->b->key;
1241 } else 1241 } else
1242 for (k = n2->start; 1242 for (k = n2->start;
1243 k < end(n2); 1243 k < end(n2);
1244 k = bkey_next(k)) { 1244 k = bkey_next(k)) {
1245 if (__set_blocks(n1, n1->keys + keys + 1245 if (__set_blocks(n1, n1->keys + keys +
1246 bkey_u64s(k), b->c) > blocks) 1246 bkey_u64s(k), b->c) > blocks)
1247 break; 1247 break;
1248 1248
1249 last = k; 1249 last = k;
1250 keys += bkey_u64s(k); 1250 keys += bkey_u64s(k);
1251 } 1251 }
1252 1252
1253 BUG_ON(__set_blocks(n1, n1->keys + keys, 1253 BUG_ON(__set_blocks(n1, n1->keys + keys,
1254 b->c) > btree_blocks(r[i].b)); 1254 b->c) > btree_blocks(r[i].b));
1255 1255
1256 if (last) { 1256 if (last) {
1257 bkey_copy_key(&r[i].b->key, last); 1257 bkey_copy_key(&r[i].b->key, last);
1258 bkey_copy_key(r[i].k, last); 1258 bkey_copy_key(r[i].k, last);
1259 } 1259 }
1260 1260
1261 memcpy(end(n1), 1261 memcpy(end(n1),
1262 n2->start, 1262 n2->start,
1263 (void *) node(n2, keys) - (void *) n2->start); 1263 (void *) node(n2, keys) - (void *) n2->start);
1264 1264
1265 n1->keys += keys; 1265 n1->keys += keys;
1266 1266
1267 memmove(n2->start, 1267 memmove(n2->start,
1268 node(n2, keys), 1268 node(n2, keys),
1269 (void *) end(n2) - (void *) node(n2, keys)); 1269 (void *) end(n2) - (void *) node(n2, keys));
1270 1270
1271 n2->keys -= keys; 1271 n2->keys -= keys;
1272 1272
1273 r[i].keys = n1->keys; 1273 r[i].keys = n1->keys;
1274 r[i - 1].keys = n2->keys; 1274 r[i - 1].keys = n2->keys;
1275 } 1275 }
1276 1276
1277 btree_node_free(r->b, op); 1277 btree_node_free(r->b, op);
1278 up_write(&r->b->lock); 1278 up_write(&r->b->lock);
1279 1279
1280 trace_bcache_btree_gc_coalesce(nodes); 1280 trace_bcache_btree_gc_coalesce(nodes);
1281 1281
1282 gc->nodes--; 1282 gc->nodes--;
1283 nodes--; 1283 nodes--;
1284 1284
1285 memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes); 1285 memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes);
1286 memset(&r[nodes], 0, sizeof(struct gc_merge_info)); 1286 memset(&r[nodes], 0, sizeof(struct gc_merge_info));
1287 } 1287 }
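
Editor's note: the early return in btree_gc_coalesce() only lets a merge go ahead when all the keys from the candidate nodes would fit in one fewer node filled to roughly two thirds, so coalescing actually frees a node and still leaves slack for inserts. A toy of that decision, under invented sizes (the kernel computes block counts via __set_blocks(); NODE_BLOCKS here is made up):

#include <stdio.h>

/* Rough sketch of the coalesce decision: merge N adjacent nodes only if
 * their combined keys fit into N-1 nodes filled to ~2/3, so the merge
 * frees a whole node and leaves room for future inserts. */
#define NODE_BLOCKS	64		/* blocks per btree node (made up) */

static int should_coalesce(unsigned nodes, unsigned total_key_blocks)
{
	unsigned threshold = NODE_BLOCKS * 2 / 3;	/* target fill per node */

	return nodes >= 2 && total_key_blocks <= threshold * (nodes - 1);
}

int main(void)
{
	/* three nodes whose keys occupy 80 blocks total: fits in 2 * 42 */
	printf("3 nodes, 80 blocks -> %s\n",
	       should_coalesce(3, 80) ? "coalesce" : "leave alone");

	/* two nearly full nodes: 100 blocks won't fit in one */
	printf("2 nodes, 100 blocks -> %s\n",
	       should_coalesce(2, 100) ? "coalesce" : "leave alone");
	return 0;
}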
1288 1288
1289 static int btree_gc_recurse(struct btree *b, struct btree_op *op, 1289 static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1290 struct closure *writes, struct gc_stat *gc) 1290 struct closure *writes, struct gc_stat *gc)
1291 { 1291 {
1292 void write(struct btree *r) 1292 void write(struct btree *r)
1293 { 1293 {
1294 if (!r->written) 1294 if (!r->written)
1295 bch_btree_node_write(r, &op->cl); 1295 bch_btree_node_write(r, &op->cl);
1296 else if (btree_node_dirty(r)) 1296 else if (btree_node_dirty(r))
1297 bch_btree_node_write(r, writes); 1297 bch_btree_node_write(r, writes);
1298 1298
1299 up_write(&r->lock); 1299 up_write(&r->lock);
1300 } 1300 }
1301 1301
1302 int ret = 0, stale; 1302 int ret = 0, stale;
1303 unsigned i; 1303 unsigned i;
1304 struct gc_merge_info r[GC_MERGE_NODES]; 1304 struct gc_merge_info r[GC_MERGE_NODES];
1305 1305
1306 memset(r, 0, sizeof(r)); 1306 memset(r, 0, sizeof(r));
1307 1307
1308 while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { 1308 while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) {
1309 r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op); 1309 r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op);
1310 1310
1311 if (IS_ERR(r->b)) { 1311 if (IS_ERR(r->b)) {
1312 ret = PTR_ERR(r->b); 1312 ret = PTR_ERR(r->b);
1313 break; 1313 break;
1314 } 1314 }
1315 1315
1316 r->keys = 0; 1316 r->keys = 0;
1317 stale = btree_gc_mark_node(r->b, &r->keys, gc); 1317 stale = btree_gc_mark_node(r->b, &r->keys, gc);
1318 1318
1319 if (!b->written && 1319 if (!b->written &&
1320 (r->b->level || stale > 10 || 1320 (r->b->level || stale > 10 ||
1321 b->c->gc_always_rewrite)) 1321 b->c->gc_always_rewrite))
1322 r->b = btree_gc_alloc(r->b, r->k, op); 1322 r->b = btree_gc_alloc(r->b, r->k, op);
1323 1323
1324 if (r->b->level) 1324 if (r->b->level)
1325 ret = btree_gc_recurse(r->b, op, writes, gc); 1325 ret = btree_gc_recurse(r->b, op, writes, gc);
1326 1326
1327 if (ret) { 1327 if (ret) {
1328 write(r->b); 1328 write(r->b);
1329 break; 1329 break;
1330 } 1330 }
1331 1331
1332 bkey_copy_key(&b->c->gc_done, r->k); 1332 bkey_copy_key(&b->c->gc_done, r->k);
1333 1333
1334 if (!b->written) 1334 if (!b->written)
1335 btree_gc_coalesce(b, op, gc, r); 1335 btree_gc_coalesce(b, op, gc, r);
1336 1336
1337 if (r[GC_MERGE_NODES - 1].b) 1337 if (r[GC_MERGE_NODES - 1].b)
1338 write(r[GC_MERGE_NODES - 1].b); 1338 write(r[GC_MERGE_NODES - 1].b);
1339 1339
1340 memmove(&r[1], &r[0], 1340 memmove(&r[1], &r[0],
1341 sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1)); 1341 sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1));
1342 1342
1343 /* When we've got incremental GC working, we'll want to do 1343 /* When we've got incremental GC working, we'll want to do
1344 * if (should_resched()) 1344 * if (should_resched())
1345 * return -EAGAIN; 1345 * return -EAGAIN;
1346 */ 1346 */
1347 cond_resched(); 1347 cond_resched();
1348 #if 0 1348 #if 0
1349 if (need_resched()) { 1349 if (need_resched()) {
1350 ret = -EAGAIN; 1350 ret = -EAGAIN;
1351 break; 1351 break;
1352 } 1352 }
1353 #endif 1353 #endif
1354 } 1354 }
1355 1355
1356 for (i = 1; i < GC_MERGE_NODES && r[i].b; i++) 1356 for (i = 1; i < GC_MERGE_NODES && r[i].b; i++)
1357 write(r[i].b); 1357 write(r[i].b);
1358 1358
1359 /* Might have freed some children, must remove their keys */ 1359 /* Might have freed some children, must remove their keys */
1360 if (!b->written) 1360 if (!b->written)
1361 bch_btree_sort(b); 1361 bch_btree_sort(b);
1362 1362
1363 return ret; 1363 return ret;
1364 } 1364 }
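
Editor's note: the r[] array in btree_gc_recurse() acts as a small sliding window over the last GC_MERGE_NODES children visited — each new child lands in slot 0, older ones slide toward the end, and whatever falls off the far end gets written out. A minimal sketch of that window bookkeeping, with the element type reduced to an int and the flush order simplified:

#include <stdio.h>
#include <string.h>

#define WINDOW	2	/* mirrors GC_MERGE_NODES */

/* Sliding window over the last WINDOW children: the oldest entry is
 * flushed when it is about to fall off the end, then everything shifts
 * up one slot so slot 0 always holds the most recent child. */
static void visit(int w[WINDOW], int child)
{
	if (w[WINDOW - 1])
		printf("flush child %d\n", w[WINDOW - 1]);

	memmove(&w[1], &w[0], sizeof(int) * (WINDOW - 1));
	w[0] = child;
}

int main(void)
{
	int w[WINDOW] = { 0 };

	for (int child = 1; child <= 4; child++)
		visit(w, child);

	/* whatever is still in the window gets flushed at the end */
	for (int i = 0; i < WINDOW; i++)
		if (w[i])
			printf("final flush child %d\n", w[i]);
	return 0;
}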
1365 1365
1366 static int bch_btree_gc_root(struct btree *b, struct btree_op *op, 1366 static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
1367 struct closure *writes, struct gc_stat *gc) 1367 struct closure *writes, struct gc_stat *gc)
1368 { 1368 {
1369 struct btree *n = NULL; 1369 struct btree *n = NULL;
1370 unsigned keys = 0; 1370 unsigned keys = 0;
1371 int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); 1371 int ret = 0, stale = btree_gc_mark_node(b, &keys, gc);
1372 1372
1373 if (b->level || stale > 10) 1373 if (b->level || stale > 10)
1374 n = btree_node_alloc_replacement(b, NULL); 1374 n = btree_node_alloc_replacement(b, NULL);
1375 1375
1376 if (!IS_ERR_OR_NULL(n)) 1376 if (!IS_ERR_OR_NULL(n))
1377 swap(b, n); 1377 swap(b, n);
1378 1378
1379 if (b->level) 1379 if (b->level)
1380 ret = btree_gc_recurse(b, op, writes, gc); 1380 ret = btree_gc_recurse(b, op, writes, gc);
1381 1381
1382 if (!b->written || btree_node_dirty(b)) { 1382 if (!b->written || btree_node_dirty(b)) {
1383 bch_btree_node_write(b, n ? &op->cl : NULL); 1383 bch_btree_node_write(b, n ? &op->cl : NULL);
1384 } 1384 }
1385 1385
1386 if (!IS_ERR_OR_NULL(n)) { 1386 if (!IS_ERR_OR_NULL(n)) {
1387 closure_sync(&op->cl); 1387 closure_sync(&op->cl);
1388 bch_btree_set_root(b); 1388 bch_btree_set_root(b);
1389 btree_node_free(n, op); 1389 btree_node_free(n, op);
1390 rw_unlock(true, b); 1390 rw_unlock(true, b);
1391 } 1391 }
1392 1392
1393 return ret; 1393 return ret;
1394 } 1394 }
1395 1395
1396 static void btree_gc_start(struct cache_set *c) 1396 static void btree_gc_start(struct cache_set *c)
1397 { 1397 {
1398 struct cache *ca; 1398 struct cache *ca;
1399 struct bucket *b; 1399 struct bucket *b;
1400 unsigned i; 1400 unsigned i;
1401 1401
1402 if (!c->gc_mark_valid) 1402 if (!c->gc_mark_valid)
1403 return; 1403 return;
1404 1404
1405 mutex_lock(&c->bucket_lock); 1405 mutex_lock(&c->bucket_lock);
1406 1406
1407 c->gc_mark_valid = 0; 1407 c->gc_mark_valid = 0;
1408 c->gc_done = ZERO_KEY; 1408 c->gc_done = ZERO_KEY;
1409 1409
1410 for_each_cache(ca, c, i) 1410 for_each_cache(ca, c, i)
1411 for_each_bucket(b, ca) { 1411 for_each_bucket(b, ca) {
1412 b->gc_gen = b->gen; 1412 b->gc_gen = b->gen;
1413 if (!atomic_read(&b->pin)) 1413 if (!atomic_read(&b->pin))
1414 SET_GC_MARK(b, GC_MARK_RECLAIMABLE); 1414 SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
1415 } 1415 }
1416 1416
1417 mutex_unlock(&c->bucket_lock); 1417 mutex_unlock(&c->bucket_lock);
1418 } 1418 }
1419 1419
1420 size_t bch_btree_gc_finish(struct cache_set *c) 1420 size_t bch_btree_gc_finish(struct cache_set *c)
1421 { 1421 {
1422 size_t available = 0; 1422 size_t available = 0;
1423 struct bucket *b; 1423 struct bucket *b;
1424 struct cache *ca; 1424 struct cache *ca;
1425 unsigned i; 1425 unsigned i;
1426 1426
1427 mutex_lock(&c->bucket_lock); 1427 mutex_lock(&c->bucket_lock);
1428 1428
1429 set_gc_sectors(c); 1429 set_gc_sectors(c);
1430 c->gc_mark_valid = 1; 1430 c->gc_mark_valid = 1;
1431 c->need_gc = 0; 1431 c->need_gc = 0;
1432 1432
1433 if (c->root) 1433 if (c->root)
1434 for (i = 0; i < KEY_PTRS(&c->root->key); i++) 1434 for (i = 0; i < KEY_PTRS(&c->root->key); i++)
1435 SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i), 1435 SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i),
1436 GC_MARK_METADATA); 1436 GC_MARK_METADATA);
1437 1437
1438 for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) 1438 for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
1439 SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), 1439 SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
1440 GC_MARK_METADATA); 1440 GC_MARK_METADATA);
1441 1441
1442 for_each_cache(ca, c, i) { 1442 for_each_cache(ca, c, i) {
1443 uint64_t *i; 1443 uint64_t *i;
1444 1444
1445 ca->invalidate_needs_gc = 0; 1445 ca->invalidate_needs_gc = 0;
1446 1446
1447 for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++) 1447 for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
1448 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); 1448 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1449 1449
1450 for (i = ca->prio_buckets; 1450 for (i = ca->prio_buckets;
1451 i < ca->prio_buckets + prio_buckets(ca) * 2; i++) 1451 i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
1452 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); 1452 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1453 1453
1454 for_each_bucket(b, ca) { 1454 for_each_bucket(b, ca) {
1455 b->last_gc = b->gc_gen; 1455 b->last_gc = b->gc_gen;
1456 c->need_gc = max(c->need_gc, bucket_gc_gen(b)); 1456 c->need_gc = max(c->need_gc, bucket_gc_gen(b));
1457 1457
1458 if (!atomic_read(&b->pin) && 1458 if (!atomic_read(&b->pin) &&
1459 GC_MARK(b) == GC_MARK_RECLAIMABLE) { 1459 GC_MARK(b) == GC_MARK_RECLAIMABLE) {
1460 available++; 1460 available++;
1461 if (!GC_SECTORS_USED(b)) 1461 if (!GC_SECTORS_USED(b))
1462 bch_bucket_add_unused(ca, b); 1462 bch_bucket_add_unused(ca, b);
1463 } 1463 }
1464 } 1464 }
1465 } 1465 }
1466 1466
1467 mutex_unlock(&c->bucket_lock); 1467 mutex_unlock(&c->bucket_lock);
1468 return available; 1468 return available;
1469 } 1469 }
1470 1470
1471 static void bch_btree_gc(struct closure *cl) 1471 static void bch_btree_gc(struct closure *cl)
1472 { 1472 {
1473 struct cache_set *c = container_of(cl, struct cache_set, gc.cl); 1473 struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
1474 int ret; 1474 int ret;
1475 unsigned long available; 1475 unsigned long available;
1476 struct gc_stat stats; 1476 struct gc_stat stats;
1477 struct closure writes; 1477 struct closure writes;
1478 struct btree_op op; 1478 struct btree_op op;
1479 uint64_t start_time = local_clock(); 1479 uint64_t start_time = local_clock();
1480 1480
1481 trace_bcache_gc_start(c); 1481 trace_bcache_gc_start(c);
1482 1482
1483 memset(&stats, 0, sizeof(struct gc_stat)); 1483 memset(&stats, 0, sizeof(struct gc_stat));
1484 closure_init_stack(&writes); 1484 closure_init_stack(&writes);
1485 bch_btree_op_init_stack(&op); 1485 bch_btree_op_init_stack(&op);
1486 op.lock = SHRT_MAX; 1486 op.lock = SHRT_MAX;
1487 1487
1488 btree_gc_start(c); 1488 btree_gc_start(c);
1489 1489
1490 atomic_inc(&c->prio_blocked); 1490 atomic_inc(&c->prio_blocked);
1491 1491
1492 ret = btree_root(gc_root, c, &op, &writes, &stats); 1492 ret = btree_root(gc_root, c, &op, &writes, &stats);
1493 closure_sync(&op.cl); 1493 closure_sync(&op.cl);
1494 closure_sync(&writes); 1494 closure_sync(&writes);
1495 1495
1496 if (ret) { 1496 if (ret) {
1497 pr_warn("gc failed!"); 1497 pr_warn("gc failed!");
1498 continue_at(cl, bch_btree_gc, bch_gc_wq); 1498 continue_at(cl, bch_btree_gc, bch_gc_wq);
1499 } 1499 }
1500 1500
1501 /* Possibly wait for new UUIDs or whatever to hit disk */ 1501 /* Possibly wait for new UUIDs or whatever to hit disk */
1502 bch_journal_meta(c, &op.cl); 1502 bch_journal_meta(c, &op.cl);
1503 closure_sync(&op.cl); 1503 closure_sync(&op.cl);
1504 1504
1505 available = bch_btree_gc_finish(c); 1505 available = bch_btree_gc_finish(c);
1506 1506
1507 atomic_dec(&c->prio_blocked); 1507 atomic_dec(&c->prio_blocked);
1508 wake_up_allocators(c); 1508 wake_up_allocators(c);
1509 1509
1510 bch_time_stats_update(&c->btree_gc_time, start_time); 1510 bch_time_stats_update(&c->btree_gc_time, start_time);
1511 1511
1512 stats.key_bytes *= sizeof(uint64_t); 1512 stats.key_bytes *= sizeof(uint64_t);
1513 stats.dirty <<= 9; 1513 stats.dirty <<= 9;
1514 stats.data <<= 9; 1514 stats.data <<= 9;
1515 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; 1515 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
1516 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); 1516 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
1517 1517
1518 trace_bcache_gc_end(c); 1518 trace_bcache_gc_end(c);
1519 1519
1520 continue_at(cl, bch_moving_gc, bch_gc_wq); 1520 continue_at(cl, bch_moving_gc, bch_gc_wq);
1521 } 1521 }
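
Editor's note: the stats block at the end of bch_btree_gc() converts GC's internal units before publishing them — key_bytes is accumulated in u64 words, dirty/data in 512-byte sectors, and in_use is a whole-bucket percentage. A quick sketch of those conversions with arbitrary sample values:

#include <stdint.h>
#include <stdio.h>

/* GC stats are kept in convenient internal units while the tree is
 * walked and only converted when published: u64 words -> bytes,
 * 512-byte sectors -> bytes (<< 9), and buckets -> a percentage. */
int main(void)
{
	uint64_t key_words = 12345;		/* bkey u64s counted during gc */
	uint64_t dirty_sectors = 1 << 20;	/* arbitrary sample values */
	uint64_t nbuckets = 100000, available = 37000;

	uint64_t key_bytes = key_words * sizeof(uint64_t);
	uint64_t dirty_bytes = dirty_sectors << 9;
	unsigned in_use = (unsigned)((nbuckets - available) * 100 / nbuckets);

	printf("key_bytes=%llu dirty=%llu bytes in_use=%u%%\n",
	       (unsigned long long)key_bytes,
	       (unsigned long long)dirty_bytes, in_use);
	return 0;
}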
1522 1522
1523 void bch_queue_gc(struct cache_set *c) 1523 void bch_queue_gc(struct cache_set *c)
1524 { 1524 {
1525 closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl); 1525 closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl);
1526 } 1526 }
1527 1527
1528 /* Initial partial gc */ 1528 /* Initial partial gc */
1529 1529
1530 static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, 1530 static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1531 unsigned long **seen) 1531 unsigned long **seen)
1532 { 1532 {
1533 int ret; 1533 int ret;
1534 unsigned i; 1534 unsigned i;
1535 struct bkey *k; 1535 struct bkey *k;
1536 struct bucket *g; 1536 struct bucket *g;
1537 struct btree_iter iter; 1537 struct btree_iter iter;
1538 1538
1539 for_each_key_filter(b, k, &iter, bch_ptr_invalid) { 1539 for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
1540 for (i = 0; i < KEY_PTRS(k); i++) { 1540 for (i = 0; i < KEY_PTRS(k); i++) {
1541 if (!ptr_available(b->c, k, i)) 1541 if (!ptr_available(b->c, k, i))
1542 continue; 1542 continue;
1543 1543
1544 g = PTR_BUCKET(b->c, k, i); 1544 g = PTR_BUCKET(b->c, k, i);
1545 1545
1546 if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i), 1546 if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i),
1547 seen[PTR_DEV(k, i)]) || 1547 seen[PTR_DEV(k, i)]) ||
1548 !ptr_stale(b->c, k, i)) { 1548 !ptr_stale(b->c, k, i)) {
1549 g->gen = PTR_GEN(k, i); 1549 g->gen = PTR_GEN(k, i);
1550 1550
1551 if (b->level) 1551 if (b->level)
1552 g->prio = BTREE_PRIO; 1552 g->prio = BTREE_PRIO;
1553 else if (g->prio == BTREE_PRIO) 1553 else if (g->prio == BTREE_PRIO)
1554 g->prio = INITIAL_PRIO; 1554 g->prio = INITIAL_PRIO;
1555 } 1555 }
1556 } 1556 }
1557 1557
1558 btree_mark_key(b, k); 1558 btree_mark_key(b, k);
1559 } 1559 }
1560 1560
1561 if (b->level) { 1561 if (b->level) {
1562 k = bch_next_recurse_key(b, &ZERO_KEY); 1562 k = bch_next_recurse_key(b, &ZERO_KEY);
1563 1563
1564 while (k) { 1564 while (k) {
1565 struct bkey *p = bch_next_recurse_key(b, k); 1565 struct bkey *p = bch_next_recurse_key(b, k);
1566 if (p) 1566 if (p)
1567 btree_node_prefetch(b->c, p, b->level - 1); 1567 btree_node_prefetch(b->c, p, b->level - 1);
1568 1568
1569 ret = btree(check_recurse, k, b, op, seen); 1569 ret = btree(check_recurse, k, b, op, seen);
1570 if (ret) 1570 if (ret)
1571 return ret; 1571 return ret;
1572 1572
1573 k = p; 1573 k = p;
1574 } 1574 }
1575 } 1575 }
1576 1576
1577 return 0; 1577 return 0;
1578 } 1578 }
1579 1579
1580 int bch_btree_check(struct cache_set *c, struct btree_op *op) 1580 int bch_btree_check(struct cache_set *c, struct btree_op *op)
1581 { 1581 {
1582 int ret = -ENOMEM; 1582 int ret = -ENOMEM;
1583 unsigned i; 1583 unsigned i;
1584 unsigned long *seen[MAX_CACHES_PER_SET]; 1584 unsigned long *seen[MAX_CACHES_PER_SET];
1585 1585
1586 memset(seen, 0, sizeof(seen)); 1586 memset(seen, 0, sizeof(seen));
1587 1587
1588 for (i = 0; c->cache[i]; i++) { 1588 for (i = 0; c->cache[i]; i++) {
1589 size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); 1589 size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8);
1590 seen[i] = kmalloc(n, GFP_KERNEL); 1590 seen[i] = kmalloc(n, GFP_KERNEL);
1591 if (!seen[i]) 1591 if (!seen[i])
1592 goto err; 1592 goto err;
1593 1593
1594 /* Disables the seen array until prio_read() uses it too */ 1594 /* Disables the seen array until prio_read() uses it too */
1595 memset(seen[i], 0xFF, n); 1595 memset(seen[i], 0xFF, n);
1596 } 1596 }
1597 1597
1598 ret = btree_root(check_recurse, c, op, seen); 1598 ret = btree_root(check_recurse, c, op, seen);
1599 err: 1599 err:
1600 for (i = 0; i < MAX_CACHES_PER_SET; i++) 1600 for (i = 0; i < MAX_CACHES_PER_SET; i++)
1601 kfree(seen[i]); 1601 kfree(seen[i]);
1602 return ret; 1602 return ret;
1603 } 1603 }
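
Editor's note: the seen[] arrays are meant as one-bit-per-bucket bitmaps, sized at DIV_ROUND_UP(nbuckets, 8) bytes, with __test_and_set_bit() letting the recursion take a bucket's gen only the first time a pointer to it is seen (though the memset to 0xFF above currently neutralizes the check, as its comment says). A userspace sketch of the same test-and-set bitmap; test_and_set_bit() here is a plain, non-atomic stand-in for the kernel helper.

#include <stdio.h>
#include <stdlib.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Non-atomic stand-in for __test_and_set_bit(): returns the old value of
 * the bit and sets it, so the first caller for a given bucket wins and
 * later sightings of the same bucket are ignored. */
static int test_and_set_bit(unsigned long nr, unsigned char *bitmap)
{
	unsigned char mask = 1u << (nr & 7);
	int old = (bitmap[nr >> 3] & mask) != 0;

	bitmap[nr >> 3] |= mask;
	return old;
}

int main(void)
{
	unsigned long nbuckets = 1000;
	unsigned char *seen = calloc(DIV_ROUND_UP(nbuckets, 8), 1);

	if (!seen)
		return 1;

	printf("bucket 42 first seen:  %d\n", !test_and_set_bit(42, seen));
	printf("bucket 42 second seen: %d\n", !test_and_set_bit(42, seen));

	free(seen);
	return 0;
}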
1604 1604
1605 /* Btree insertion */ 1605 /* Btree insertion */
1606 1606
1607 static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) 1607 static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert)
1608 { 1608 {
1609 struct bset *i = b->sets[b->nsets].data; 1609 struct bset *i = b->sets[b->nsets].data;
1610 1610
1611 memmove((uint64_t *) where + bkey_u64s(insert), 1611 memmove((uint64_t *) where + bkey_u64s(insert),
1612 where, 1612 where,
1613 (void *) end(i) - (void *) where); 1613 (void *) end(i) - (void *) where);
1614 1614
1615 i->keys += bkey_u64s(insert); 1615 i->keys += bkey_u64s(insert);
1616 bkey_copy(where, insert); 1616 bkey_copy(where, insert);
1617 bch_bset_fix_lookup_table(b, where); 1617 bch_bset_fix_lookup_table(b, where);
1618 } 1618 }
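
Editor's note: shift_keys() opens a hole in the flat array of 64-bit words that backs a bset and copies the new key into it; everything after the insertion point slides up by the key's length in words. A standalone sketch of that memmove pattern on a plain uint64_t array (sizes and values are arbitrary):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* A bset is just a flat array of u64s holding variable-length keys, so
 * inserting means: move everything from the insertion point upward by
 * the new key's length in words, then copy the key into the hole. */
static void insert_words(uint64_t *set, unsigned *nr, unsigned pos,
			 const uint64_t *key, unsigned key_words)
{
	memmove(set + pos + key_words, set + pos,
		(*nr - pos) * sizeof(uint64_t));
	memcpy(set + pos, key, key_words * sizeof(uint64_t));
	*nr += key_words;
}

int main(void)
{
	uint64_t set[16] = { 10, 11, 30, 31 };
	unsigned nr = 4;
	uint64_t key[2] = { 20, 21 };		/* a two-word "key" */

	insert_words(set, &nr, 2, key, 2);	/* insert before the 30,31 pair */

	for (unsigned i = 0; i < nr; i++)
		printf("%llu ", (unsigned long long)set[i]);
	printf("\n");				/* 10 11 20 21 30 31 */
	return 0;
}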
1619 1619
1620 static bool fix_overlapping_extents(struct btree *b, 1620 static bool fix_overlapping_extents(struct btree *b,
1621 struct bkey *insert, 1621 struct bkey *insert,
1622 struct btree_iter *iter, 1622 struct btree_iter *iter,
1623 struct btree_op *op) 1623 struct btree_op *op)
1624 { 1624 {
1625 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) 1625 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
1626 { 1626 {
1627 if (KEY_DIRTY(k)) 1627 if (KEY_DIRTY(k))
1628 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), 1628 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1629 offset, -sectors); 1629 offset, -sectors);
1630 } 1630 }
1631 1631
1632 uint64_t old_offset; 1632 uint64_t old_offset;
1633 unsigned old_size, sectors_found = 0; 1633 unsigned old_size, sectors_found = 0;
1634 1634
1635 while (1) { 1635 while (1) {
1636 struct bkey *k = bch_btree_iter_next(iter); 1636 struct bkey *k = bch_btree_iter_next(iter);
1637 if (!k || 1637 if (!k ||
1638 bkey_cmp(&START_KEY(k), insert) >= 0) 1638 bkey_cmp(&START_KEY(k), insert) >= 0)
1639 break; 1639 break;
1640 1640
1641 if (bkey_cmp(k, &START_KEY(insert)) <= 0) 1641 if (bkey_cmp(k, &START_KEY(insert)) <= 0)
1642 continue; 1642 continue;
1643 1643
1644 old_offset = KEY_START(k); 1644 old_offset = KEY_START(k);
1645 old_size = KEY_SIZE(k); 1645 old_size = KEY_SIZE(k);
1646 1646
1647 /* 1647 /*
1648 * We might overlap with 0 size extents; we can't skip these 1648 * We might overlap with 0 size extents; we can't skip these
1649 * because if they're in the set we're inserting to we have to 1649 * because if they're in the set we're inserting to we have to
1650 * adjust them so they don't overlap with the key we're 1650 * adjust them so they don't overlap with the key we're
1651 * inserting. But we don't want to check them for BTREE_REPLACE 1651 * inserting. But we don't want to check them for BTREE_REPLACE
1652 * operations. 1652 * operations.
1653 */ 1653 */
1654 1654
1655 if (op->type == BTREE_REPLACE && 1655 if (op->type == BTREE_REPLACE &&
1656 KEY_SIZE(k)) { 1656 KEY_SIZE(k)) {
1657 /* 1657 /*
1658 * k might have been split since we inserted/found the 1658 * k might have been split since we inserted/found the
1659 * key we're replacing 1659 * key we're replacing
1660 */ 1660 */
1661 unsigned i; 1661 unsigned i;
1662 uint64_t offset = KEY_START(k) - 1662 uint64_t offset = KEY_START(k) -
1663 KEY_START(&op->replace); 1663 KEY_START(&op->replace);
1664 1664
1665 /* But it must be a subset of the replace key */ 1665 /* But it must be a subset of the replace key */
1666 if (KEY_START(k) < KEY_START(&op->replace) || 1666 if (KEY_START(k) < KEY_START(&op->replace) ||
1667 KEY_OFFSET(k) > KEY_OFFSET(&op->replace)) 1667 KEY_OFFSET(k) > KEY_OFFSET(&op->replace))
1668 goto check_failed; 1668 goto check_failed;
1669 1669
1670 /* We didn't find a key that we were supposed to */ 1670 /* We didn't find a key that we were supposed to */
1671 if (KEY_START(k) > KEY_START(insert) + sectors_found) 1671 if (KEY_START(k) > KEY_START(insert) + sectors_found)
1672 goto check_failed; 1672 goto check_failed;
1673 1673
1674 if (KEY_PTRS(&op->replace) != KEY_PTRS(k)) 1674 if (KEY_PTRS(&op->replace) != KEY_PTRS(k))
1675 goto check_failed; 1675 goto check_failed;
1676 1676
1677 /* skip past gen */ 1677 /* skip past gen */
1678 offset <<= 8; 1678 offset <<= 8;
1679 1679
1680 BUG_ON(!KEY_PTRS(&op->replace)); 1680 BUG_ON(!KEY_PTRS(&op->replace));
1681 1681
1682 for (i = 0; i < KEY_PTRS(&op->replace); i++) 1682 for (i = 0; i < KEY_PTRS(&op->replace); i++)
1683 if (k->ptr[i] != op->replace.ptr[i] + offset) 1683 if (k->ptr[i] != op->replace.ptr[i] + offset)
1684 goto check_failed; 1684 goto check_failed;
1685 1685
1686 sectors_found = KEY_OFFSET(k) - KEY_START(insert); 1686 sectors_found = KEY_OFFSET(k) - KEY_START(insert);
1687 } 1687 }
1688 1688
1689 if (bkey_cmp(insert, k) < 0 && 1689 if (bkey_cmp(insert, k) < 0 &&
1690 bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { 1690 bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
1691 /* 1691 /*
1692 * We overlapped in the middle of an existing key: that 1692 * We overlapped in the middle of an existing key: that
1693 * means we have to split the old key. But we have to do 1693 * means we have to split the old key. But we have to do
1694 * slightly different things depending on whether the 1694 * slightly different things depending on whether the
1695 * old key has been written out yet. 1695 * old key has been written out yet.
1696 */ 1696 */
1697 1697
1698 struct bkey *top; 1698 struct bkey *top;
1699 1699
1700 subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); 1700 subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
1701 1701
1702 if (bkey_written(b, k)) { 1702 if (bkey_written(b, k)) {
1703 /* 1703 /*
1704 * We insert a new key to cover the top of the 1704 * We insert a new key to cover the top of the
1705 * old key, and the old key is modified in place 1705 * old key, and the old key is modified in place
1706 * to represent the bottom split. 1706 * to represent the bottom split.
1707 * 1707 *
1708 * It's completely arbitrary whether the new key 1708 * It's completely arbitrary whether the new key
1709 * is the top or the bottom, but it has to match 1709 * is the top or the bottom, but it has to match
1710 * up with what btree_sort_fixup() does - it 1710 * up with what btree_sort_fixup() does - it
1711 * doesn't check for this kind of overlap, it 1711 * doesn't check for this kind of overlap, it
1712 * depends on us inserting a new key for the top 1712 * depends on us inserting a new key for the top
1713 * here. 1713 * here.
1714 */ 1714 */
1715 top = bch_bset_search(b, &b->sets[b->nsets], 1715 top = bch_bset_search(b, &b->sets[b->nsets],
1716 insert); 1716 insert);
1717 shift_keys(b, top, k); 1717 shift_keys(b, top, k);
1718 } else { 1718 } else {
1719 BKEY_PADDED(key) temp; 1719 BKEY_PADDED(key) temp;
1720 bkey_copy(&temp.key, k); 1720 bkey_copy(&temp.key, k);
1721 shift_keys(b, k, &temp.key); 1721 shift_keys(b, k, &temp.key);
1722 top = bkey_next(k); 1722 top = bkey_next(k);
1723 } 1723 }
1724 1724
1725 bch_cut_front(insert, top); 1725 bch_cut_front(insert, top);
1726 bch_cut_back(&START_KEY(insert), k); 1726 bch_cut_back(&START_KEY(insert), k);
1727 bch_bset_fix_invalidated_key(b, k); 1727 bch_bset_fix_invalidated_key(b, k);
1728 return false; 1728 return false;
1729 } 1729 }
1730 1730
1731 if (bkey_cmp(insert, k) < 0) { 1731 if (bkey_cmp(insert, k) < 0) {
1732 bch_cut_front(insert, k); 1732 bch_cut_front(insert, k);
1733 } else { 1733 } else {
1734 if (bkey_written(b, k) && 1734 if (bkey_written(b, k) &&
1735 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { 1735 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
1736 /* 1736 /*
1737 * Completely overwrote, so we don't have to 1737 * Completely overwrote, so we don't have to
1738 * invalidate the binary search tree 1738 * invalidate the binary search tree
1739 */ 1739 */
1740 bch_cut_front(k, k); 1740 bch_cut_front(k, k);
1741 } else { 1741 } else {
1742 __bch_cut_back(&START_KEY(insert), k); 1742 __bch_cut_back(&START_KEY(insert), k);
1743 bch_bset_fix_invalidated_key(b, k); 1743 bch_bset_fix_invalidated_key(b, k);
1744 } 1744 }
1745 } 1745 }
1746 1746
1747 subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); 1747 subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
1748 } 1748 }
1749 1749
1750 check_failed: 1750 check_failed:
1751 if (op->type == BTREE_REPLACE) { 1751 if (op->type == BTREE_REPLACE) {
1752 if (!sectors_found) { 1752 if (!sectors_found) {
1753 op->insert_collision = true; 1753 op->insert_collision = true;
1754 return true; 1754 return true;
1755 } else if (sectors_found < KEY_SIZE(insert)) { 1755 } else if (sectors_found < KEY_SIZE(insert)) {
1756 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - 1756 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
1757 (KEY_SIZE(insert) - sectors_found)); 1757 (KEY_SIZE(insert) - sectors_found));
1758 SET_KEY_SIZE(insert, sectors_found); 1758 SET_KEY_SIZE(insert, sectors_found);
1759 } 1759 }
1760 } 1760 }
1761 1761
1762 return false; 1762 return false;
1763 } 1763 }
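
Editor's note: the cut_front/cut_back calls above trim existing extents so nothing overlaps the key being inserted — an old extent overlapping only at its front keeps its tail, one overlapping only at its back keeps its head, and one straddling the insert is split in two. A toy of the trimming arithmetic on half-open [start, end) sector ranges; the struct and helpers below are invented for illustration and only mirror the idea.

#include <stdint.h>
#include <stdio.h>

/* Toy extents as half-open sector ranges [start, end).  cut_front()
 * drops everything below 'where', cut_back() drops everything at or
 * above it - the same trimming applied to existing keys around the one
 * being inserted. */
struct extent { uint64_t start, end; };

static void cut_front(struct extent *e, uint64_t where)
{
	if (e->start < where)
		e->start = where < e->end ? where : e->end;
}

static void cut_back(struct extent *e, uint64_t where)
{
	if (e->end > where)
		e->end = where > e->start ? where : e->start;
}

int main(void)
{
	struct extent old = { 100, 200 };
	struct extent insert = { 120, 160 };

	/* old straddles insert: keep [100,120) in place, add [160,200) */
	struct extent tail = old;

	cut_front(&tail, insert.end);	/* tail becomes [160, 200) */
	cut_back(&old, insert.start);	/* old becomes  [100, 120) */

	printf("bottom [%llu,%llu) top [%llu,%llu)\n",
	       (unsigned long long)old.start, (unsigned long long)old.end,
	       (unsigned long long)tail.start, (unsigned long long)tail.end);
	return 0;
}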
1764 1764
1765 static bool btree_insert_key(struct btree *b, struct btree_op *op, 1765 static bool btree_insert_key(struct btree *b, struct btree_op *op,
1766 struct bkey *k) 1766 struct bkey *k)
1767 { 1767 {
1768 struct bset *i = b->sets[b->nsets].data; 1768 struct bset *i = b->sets[b->nsets].data;
1769 struct bkey *m, *prev; 1769 struct bkey *m, *prev;
1770 unsigned status = BTREE_INSERT_STATUS_INSERT; 1770 unsigned status = BTREE_INSERT_STATUS_INSERT;
1771 1771
1772 BUG_ON(bkey_cmp(k, &b->key) > 0); 1772 BUG_ON(bkey_cmp(k, &b->key) > 0);
1773 BUG_ON(b->level && !KEY_PTRS(k)); 1773 BUG_ON(b->level && !KEY_PTRS(k));
1774 BUG_ON(!b->level && !KEY_OFFSET(k)); 1774 BUG_ON(!b->level && !KEY_OFFSET(k));
1775 1775
1776 if (!b->level) { 1776 if (!b->level) {
1777 struct btree_iter iter; 1777 struct btree_iter iter;
1778 struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0); 1778 struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0);
1779 1779
1780 /* 1780 /*
1781 * bset_search() returns the first key that is strictly greater 1781 * bset_search() returns the first key that is strictly greater
1782 * than the search key - but for back merging, we want to find 1782 * than the search key - but for back merging, we want to find
1783 * the first key that is greater than or equal to KEY_START(k) - 1783 * the first key that is greater than or equal to KEY_START(k) -
1784 * unless KEY_START(k) is 0. 1784 * unless KEY_START(k) is 0.
1785 */ 1785 */
1786 if (KEY_OFFSET(&search)) 1786 if (KEY_OFFSET(&search))
1787 SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1); 1787 SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1);
1788 1788
1789 prev = NULL; 1789 prev = NULL;
1790 m = bch_btree_iter_init(b, &iter, &search); 1790 m = bch_btree_iter_init(b, &iter, &search);
1791 1791
1792 if (fix_overlapping_extents(b, k, &iter, op)) 1792 if (fix_overlapping_extents(b, k, &iter, op))
1793 return false; 1793 return false;
1794 1794
1795 while (m != end(i) && 1795 while (m != end(i) &&
1796 bkey_cmp(k, &START_KEY(m)) > 0) 1796 bkey_cmp(k, &START_KEY(m)) > 0)
1797 prev = m, m = bkey_next(m); 1797 prev = m, m = bkey_next(m);
1798 1798
1799 if (key_merging_disabled(b->c)) 1799 if (key_merging_disabled(b->c))
1800 goto insert; 1800 goto insert;
1801 1801
1802 /* prev is in the tree, if we merge we're done */ 1802 /* prev is in the tree, if we merge we're done */
1803 status = BTREE_INSERT_STATUS_BACK_MERGE; 1803 status = BTREE_INSERT_STATUS_BACK_MERGE;
1804 if (prev && 1804 if (prev &&
1805 bch_bkey_try_merge(b, prev, k)) 1805 bch_bkey_try_merge(b, prev, k))
1806 goto merged; 1806 goto merged;
1807 1807
1808 status = BTREE_INSERT_STATUS_OVERWROTE; 1808 status = BTREE_INSERT_STATUS_OVERWROTE;
1809 if (m != end(i) && 1809 if (m != end(i) &&
1810 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) 1810 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
1811 goto copy; 1811 goto copy;
1812 1812
1813 status = BTREE_INSERT_STATUS_FRONT_MERGE; 1813 status = BTREE_INSERT_STATUS_FRONT_MERGE;
1814 if (m != end(i) && 1814 if (m != end(i) &&
1815 bch_bkey_try_merge(b, k, m)) 1815 bch_bkey_try_merge(b, k, m))
1816 goto copy; 1816 goto copy;
1817 } else 1817 } else
1818 m = bch_bset_search(b, &b->sets[b->nsets], k); 1818 m = bch_bset_search(b, &b->sets[b->nsets], k);
1819 1819
1820 insert: shift_keys(b, m, k); 1820 insert: shift_keys(b, m, k);
1821 copy: bkey_copy(m, k); 1821 copy: bkey_copy(m, k);
1822 merged: 1822 merged:
1823 if (KEY_DIRTY(k)) 1823 if (KEY_DIRTY(k))
1824 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), 1824 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1825 KEY_START(k), KEY_SIZE(k)); 1825 KEY_START(k), KEY_SIZE(k));
1826 1826
1827 bch_check_keys(b, "%u for %s", status, op_type(op)); 1827 bch_check_keys(b, "%u for %s", status, op_type(op));
1828 1828
1829 if (b->level && !KEY_OFFSET(k)) 1829 if (b->level && !KEY_OFFSET(k))
1830 btree_current_write(b)->prio_blocked++; 1830 btree_current_write(b)->prio_blocked++;
1831 1831
1832 trace_bcache_btree_insert_key(b, k, op->type, status); 1832 trace_bcache_btree_insert_key(b, k, op->type, status);
1833 1833
1834 return true; 1834 return true;
1835 } 1835 }
1836 1836
1837 static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) 1837 static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op)
1838 { 1838 {
1839 bool ret = false; 1839 bool ret = false;
1840 struct bkey *k; 1840 struct bkey *k;
1841 unsigned oldsize = bch_count_data(b); 1841 unsigned oldsize = bch_count_data(b);
1842 1842
1843 while ((k = bch_keylist_pop(&op->keys))) { 1843 while ((k = bch_keylist_pop(&op->keys))) {
1844 bkey_put(b->c, k, b->level); 1844 bkey_put(b->c, k, b->level);
1845 ret |= btree_insert_key(b, op, k); 1845 ret |= btree_insert_key(b, op, k);
1846 } 1846 }
1847 1847
1848 BUG_ON(bch_count_data(b) < oldsize); 1848 BUG_ON(bch_count_data(b) < oldsize);
1849 return ret; 1849 return ret;
1850 } 1850 }
1851 1851
1852 bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, 1852 bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
1853 struct bio *bio) 1853 struct bio *bio)
1854 { 1854 {
1855 bool ret = false; 1855 bool ret = false;
1856 uint64_t btree_ptr = b->key.ptr[0]; 1856 uint64_t btree_ptr = b->key.ptr[0];
1857 unsigned long seq = b->seq; 1857 unsigned long seq = b->seq;
1858 BKEY_PADDED(k) tmp; 1858 BKEY_PADDED(k) tmp;
1859 1859
1860 rw_unlock(false, b); 1860 rw_unlock(false, b);
1861 rw_lock(true, b, b->level); 1861 rw_lock(true, b, b->level);
1862 1862
1863 if (b->key.ptr[0] != btree_ptr || 1863 if (b->key.ptr[0] != btree_ptr ||
1864 b->seq != seq + 1 || 1864 b->seq != seq + 1 ||
1865 should_split(b)) 1865 should_split(b))
1866 goto out; 1866 goto out;
1867 1867
1868 op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio)); 1868 op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio));
1869 1869
1870 SET_KEY_PTRS(&op->replace, 1); 1870 SET_KEY_PTRS(&op->replace, 1);
1871 get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); 1871 get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t));
1872 1872
1873 SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV); 1873 SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV);
1874 1874
1875 bkey_copy(&tmp.k, &op->replace); 1875 bkey_copy(&tmp.k, &op->replace);
1876 1876
1877 BUG_ON(op->type != BTREE_INSERT); 1877 BUG_ON(op->type != BTREE_INSERT);
1878 BUG_ON(!btree_insert_key(b, op, &tmp.k)); 1878 BUG_ON(!btree_insert_key(b, op, &tmp.k));
1879 ret = true; 1879 ret = true;
1880 out: 1880 out:
1881 downgrade_write(&b->lock); 1881 downgrade_write(&b->lock);
1882 return ret; 1882 return ret;
1883 } 1883 }
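
Editor's note: bch_btree_insert_check_key() drops the read lock, retakes the lock for write, and only trusts the node if its first pointer is unchanged and the sequence number advanced by exactly its own write-lock bump; otherwise the node may have been split or freed in the window and the insert is skipped. A minimal single-threaded sketch of that revalidation; the struct and field names below are invented.

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

/* Sketch of "unlock, relock for write, revalidate": before dropping the
 * read lock we snapshot an identity (the node's first pointer) and a
 * sequence number that every write-lock acquisition bumps.  After the
 * write lock is taken, the node is only still "ours" if the identity
 * matches and the sequence advanced by exactly our own bump. */
struct toy_node {
	uint64_t	ptr0;	/* identity of the backing bucket */
	unsigned long	seq;	/* bumped on every write-lock acquire */
};

static bool still_valid(const struct toy_node *b,
			uint64_t snap_ptr0, unsigned long snap_seq)
{
	return b->ptr0 == snap_ptr0 && b->seq == snap_seq + 1;
}

int main(void)
{
	struct toy_node b = { .ptr0 = 0xabcd, .seq = 7 };

	uint64_t snap_ptr0 = b.ptr0;
	unsigned long snap_seq = b.seq;

	b.seq++;		/* our own write-lock acquisition */
	printf("clean relock: %d\n", still_valid(&b, snap_ptr0, snap_seq));

	b.seq++;		/* somebody else got in as well */
	printf("raced relock: %d\n", still_valid(&b, snap_ptr0, snap_seq));
	return 0;
}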
1884 1884
1885 static int btree_split(struct btree *b, struct btree_op *op) 1885 static int btree_split(struct btree *b, struct btree_op *op)
1886 { 1886 {
1887 bool split, root = b == b->c->root; 1887 bool split, root = b == b->c->root;
1888 struct btree *n1, *n2 = NULL, *n3 = NULL; 1888 struct btree *n1, *n2 = NULL, *n3 = NULL;
1889 uint64_t start_time = local_clock(); 1889 uint64_t start_time = local_clock();
1890 1890
1891 if (b->level) 1891 if (b->level)
1892 set_closure_blocking(&op->cl); 1892 set_closure_blocking(&op->cl);
1893 1893
1894 n1 = btree_node_alloc_replacement(b, &op->cl); 1894 n1 = btree_node_alloc_replacement(b, &op->cl);
1895 if (IS_ERR(n1)) 1895 if (IS_ERR(n1))
1896 goto err; 1896 goto err;
1897 1897
1898 split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; 1898 split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;
1899 1899
1900 if (split) { 1900 if (split) {
1901 unsigned keys = 0; 1901 unsigned keys = 0;
1902 1902
1903 trace_bcache_btree_node_split(b, n1->sets[0].data->keys); 1903 trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
1904 1904
1905 n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); 1905 n2 = bch_btree_node_alloc(b->c, b->level, &op->cl);
1906 if (IS_ERR(n2)) 1906 if (IS_ERR(n2))
1907 goto err_free1; 1907 goto err_free1;
1908 1908
1909 if (root) { 1909 if (root) {
1910 n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl); 1910 n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl);
1911 if (IS_ERR(n3)) 1911 if (IS_ERR(n3))
1912 goto err_free2; 1912 goto err_free2;
1913 } 1913 }
1914 1914
1915 bch_btree_insert_keys(n1, op); 1915 bch_btree_insert_keys(n1, op);
1916 1916
1917 /* Has to be a linear search because we don't have an auxiliary 1917 /* Has to be a linear search because we don't have an auxiliary
1918 * search tree yet 1918 * search tree yet
1919 */ 1919 */
1920 1920
1921 while (keys < (n1->sets[0].data->keys * 3) / 5) 1921 while (keys < (n1->sets[0].data->keys * 3) / 5)
1922 keys += bkey_u64s(node(n1->sets[0].data, keys)); 1922 keys += bkey_u64s(node(n1->sets[0].data, keys));
1923 1923
1924 bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); 1924 bkey_copy_key(&n1->key, node(n1->sets[0].data, keys));
1925 keys += bkey_u64s(node(n1->sets[0].data, keys)); 1925 keys += bkey_u64s(node(n1->sets[0].data, keys));
1926 1926
1927 n2->sets[0].data->keys = n1->sets[0].data->keys - keys; 1927 n2->sets[0].data->keys = n1->sets[0].data->keys - keys;
1928 n1->sets[0].data->keys = keys; 1928 n1->sets[0].data->keys = keys;
1929 1929
1930 memcpy(n2->sets[0].data->start, 1930 memcpy(n2->sets[0].data->start,
1931 end(n1->sets[0].data), 1931 end(n1->sets[0].data),
1932 n2->sets[0].data->keys * sizeof(uint64_t)); 1932 n2->sets[0].data->keys * sizeof(uint64_t));
1933 1933
1934 bkey_copy_key(&n2->key, &b->key); 1934 bkey_copy_key(&n2->key, &b->key);
1935 1935
1936 bch_keylist_add(&op->keys, &n2->key); 1936 bch_keylist_add(&op->keys, &n2->key);
1937 bch_btree_node_write(n2, &op->cl); 1937 bch_btree_node_write(n2, &op->cl);
1938 rw_unlock(true, n2); 1938 rw_unlock(true, n2);
1939 } else { 1939 } else {
1940 trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); 1940 trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
1941 1941
1942 bch_btree_insert_keys(n1, op); 1942 bch_btree_insert_keys(n1, op);
1943 } 1943 }
1944 1944
1945 bch_keylist_add(&op->keys, &n1->key); 1945 bch_keylist_add(&op->keys, &n1->key);
1946 bch_btree_node_write(n1, &op->cl); 1946 bch_btree_node_write(n1, &op->cl);
1947 1947
1948 if (n3) { 1948 if (n3) {
1949 bkey_copy_key(&n3->key, &MAX_KEY); 1949 bkey_copy_key(&n3->key, &MAX_KEY);
1950 bch_btree_insert_keys(n3, op); 1950 bch_btree_insert_keys(n3, op);
1951 bch_btree_node_write(n3, &op->cl); 1951 bch_btree_node_write(n3, &op->cl);
1952 1952
1953 closure_sync(&op->cl); 1953 closure_sync(&op->cl);
1954 bch_btree_set_root(n3); 1954 bch_btree_set_root(n3);
1955 rw_unlock(true, n3); 1955 rw_unlock(true, n3);
1956 } else if (root) { 1956 } else if (root) {
1957 op->keys.top = op->keys.bottom; 1957 op->keys.top = op->keys.bottom;
1958 closure_sync(&op->cl); 1958 closure_sync(&op->cl);
1959 bch_btree_set_root(n1); 1959 bch_btree_set_root(n1);
1960 } else { 1960 } else {
1961 unsigned i; 1961 unsigned i;
1962 1962
1963 bkey_copy(op->keys.top, &b->key); 1963 bkey_copy(op->keys.top, &b->key);
1964 bkey_copy_key(op->keys.top, &ZERO_KEY); 1964 bkey_copy_key(op->keys.top, &ZERO_KEY);
1965 1965
1966 for (i = 0; i < KEY_PTRS(&b->key); i++) { 1966 for (i = 0; i < KEY_PTRS(&b->key); i++) {
1967 uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1; 1967 uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1;
1968 1968
1969 SET_PTR_GEN(op->keys.top, i, g); 1969 SET_PTR_GEN(op->keys.top, i, g);
1970 } 1970 }
1971 1971
1972 bch_keylist_push(&op->keys); 1972 bch_keylist_push(&op->keys);
1973 closure_sync(&op->cl); 1973 closure_sync(&op->cl);
1974 atomic_inc(&b->c->prio_blocked); 1974 atomic_inc(&b->c->prio_blocked);
1975 } 1975 }
1976 1976
1977 rw_unlock(true, n1); 1977 rw_unlock(true, n1);
1978 btree_node_free(b, op); 1978 btree_node_free(b, op);
1979 1979
1980 bch_time_stats_update(&b->c->btree_split_time, start_time); 1980 bch_time_stats_update(&b->c->btree_split_time, start_time);
1981 1981
1982 return 0; 1982 return 0;
1983 err_free2: 1983 err_free2:
1984 __bkey_put(n2->c, &n2->key); 1984 __bkey_put(n2->c, &n2->key);
1985 btree_node_free(n2, op); 1985 btree_node_free(n2, op);
1986 rw_unlock(true, n2); 1986 rw_unlock(true, n2);
1987 err_free1: 1987 err_free1:
1988 __bkey_put(n1->c, &n1->key); 1988 __bkey_put(n1->c, &n1->key);
1989 btree_node_free(n1, op); 1989 btree_node_free(n1, op);
1990 rw_unlock(true, n1); 1990 rw_unlock(true, n1);
1991 err: 1991 err:
1992 if (n3 == ERR_PTR(-EAGAIN) || 1992 if (n3 == ERR_PTR(-EAGAIN) ||
1993 n2 == ERR_PTR(-EAGAIN) || 1993 n2 == ERR_PTR(-EAGAIN) ||
1994 n1 == ERR_PTR(-EAGAIN)) 1994 n1 == ERR_PTR(-EAGAIN))
1995 return -EAGAIN; 1995 return -EAGAIN;
1996 1996
1997 pr_warn("couldn't split"); 1997 pr_warn("couldn't split");
1998 return -ENOMEM; 1998 return -ENOMEM;
1999 } 1999 }
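
Editor's note: when a node really splits, btree_split() finds the split point by walking the flat key array until roughly 3/5 of the set (counted in u64 words) sits on the left; the walk is linear because keys are variable length, as the comment above notes. A toy of that walk over variable-length records, with made-up key sizes:

#include <stdio.h>

/* Keys in a bset are variable length, so finding "about 3/5 of the set"
 * means walking key by key and summing sizes in u64 words until the
 * running total crosses the threshold - there is no way to jump straight
 * to the split point. */
int main(void)
{
	unsigned key_words[] = { 3, 2, 5, 3, 2, 4, 3, 2 };	/* made-up sizes */
	unsigned n = sizeof(key_words) / sizeof(key_words[0]);
	unsigned total = 0, walked = 0, i;

	for (i = 0; i < n; i++)
		total += key_words[i];

	for (i = 0; i < n && walked < total * 3 / 5; i++)
		walked += key_words[i];

	printf("split after key %u: %u of %u words stay in the left node\n",
	       i, walked, total);
	return 0;
}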
2000 2000
2001 static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, 2001 static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
2002 struct keylist *stack_keys) 2002 struct keylist *stack_keys)
2003 { 2003 {
2004 if (b->level) { 2004 if (b->level) {
2005 int ret; 2005 int ret;
2006 struct bkey *insert = op->keys.bottom; 2006 struct bkey *insert = op->keys.bottom;
2007 struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert)); 2007 struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert));
2008 2008
2009 if (!k) { 2009 if (!k) {
2010 btree_bug(b, "no key to recurse on at level %i/%i", 2010 btree_bug(b, "no key to recurse on at level %i/%i",
2011 b->level, b->c->root->level); 2011 b->level, b->c->root->level);
2012 2012
2013 op->keys.top = op->keys.bottom; 2013 op->keys.top = op->keys.bottom;
2014 return -EIO; 2014 return -EIO;
2015 } 2015 }
2016 2016
2017 if (bkey_cmp(insert, k) > 0) { 2017 if (bkey_cmp(insert, k) > 0) {
2018 unsigned i; 2018 unsigned i;
2019 2019
2020 if (op->type == BTREE_REPLACE) { 2020 if (op->type == BTREE_REPLACE) {
2021 __bkey_put(b->c, insert); 2021 __bkey_put(b->c, insert);
2022 op->keys.top = op->keys.bottom; 2022 op->keys.top = op->keys.bottom;
2023 op->insert_collision = true; 2023 op->insert_collision = true;
2024 return 0; 2024 return 0;
2025 } 2025 }
2026 2026
2027 for (i = 0; i < KEY_PTRS(insert); i++) 2027 for (i = 0; i < KEY_PTRS(insert); i++)
2028 atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin); 2028 atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin);
2029 2029
2030 bkey_copy(stack_keys->top, insert); 2030 bkey_copy(stack_keys->top, insert);
2031 2031
2032 bch_cut_back(k, insert); 2032 bch_cut_back(k, insert);
2033 bch_cut_front(k, stack_keys->top); 2033 bch_cut_front(k, stack_keys->top);
2034 2034
2035 bch_keylist_push(stack_keys); 2035 bch_keylist_push(stack_keys);
2036 } 2036 }
2037 2037
2038 ret = btree(insert_recurse, k, b, op, stack_keys); 2038 ret = btree(insert_recurse, k, b, op, stack_keys);
2039 if (ret) 2039 if (ret)
2040 return ret; 2040 return ret;
2041 } 2041 }
2042 2042
2043 if (!bch_keylist_empty(&op->keys)) { 2043 if (!bch_keylist_empty(&op->keys)) {
2044 if (should_split(b)) { 2044 if (should_split(b)) {
2045 if (op->lock <= b->c->root->level) { 2045 if (op->lock <= b->c->root->level) {
2046 BUG_ON(b->level); 2046 BUG_ON(b->level);
2047 op->lock = b->c->root->level + 1; 2047 op->lock = b->c->root->level + 1;
2048 return -EINTR; 2048 return -EINTR;
2049 } 2049 }
2050 return btree_split(b, op); 2050 return btree_split(b, op);
2051 } 2051 }
2052 2052
2053 BUG_ON(write_block(b) != b->sets[b->nsets].data); 2053 BUG_ON(write_block(b) != b->sets[b->nsets].data);
2054 2054
2055 if (bch_btree_insert_keys(b, op)) { 2055 if (bch_btree_insert_keys(b, op)) {
2056 if (!b->level) 2056 if (!b->level)
2057 bch_btree_leaf_dirty(b, op); 2057 bch_btree_leaf_dirty(b, op);
2058 else 2058 else
2059 bch_btree_node_write(b, &op->cl); 2059 bch_btree_node_write(b, &op->cl);
2060 } 2060 }
2061 } 2061 }
2062 2062
2063 return 0; 2063 return 0;
2064 } 2064 }
2065 2065
2066 int bch_btree_insert(struct btree_op *op, struct cache_set *c) 2066 int bch_btree_insert(struct btree_op *op, struct cache_set *c)
2067 { 2067 {
2068 int ret = 0; 2068 int ret = 0;
2069 struct keylist stack_keys; 2069 struct keylist stack_keys;
2070 2070
2071 /* 2071 /*
2072 * Don't want to block with the btree locked unless we have to, 2072 * Don't want to block with the btree locked unless we have to,
2073 * otherwise we get deadlocks with try_harder and between split/gc 2073 * otherwise we get deadlocks with try_harder and between split/gc
2074 */ 2074 */
2075 clear_closure_blocking(&op->cl); 2075 clear_closure_blocking(&op->cl);
2076 2076
2077 BUG_ON(bch_keylist_empty(&op->keys)); 2077 BUG_ON(bch_keylist_empty(&op->keys));
2078 bch_keylist_copy(&stack_keys, &op->keys); 2078 bch_keylist_copy(&stack_keys, &op->keys);
2079 bch_keylist_init(&op->keys); 2079 bch_keylist_init(&op->keys);
2080 2080
2081 while (!bch_keylist_empty(&stack_keys) || 2081 while (!bch_keylist_empty(&stack_keys) ||
2082 !bch_keylist_empty(&op->keys)) { 2082 !bch_keylist_empty(&op->keys)) {
2083 if (bch_keylist_empty(&op->keys)) { 2083 if (bch_keylist_empty(&op->keys)) {
2084 bch_keylist_add(&op->keys, 2084 bch_keylist_add(&op->keys,
2085 bch_keylist_pop(&stack_keys)); 2085 bch_keylist_pop(&stack_keys));
2086 op->lock = 0; 2086 op->lock = 0;
2087 } 2087 }
2088 2088
2089 ret = btree_root(insert_recurse, c, op, &stack_keys); 2089 ret = btree_root(insert_recurse, c, op, &stack_keys);
2090 2090
2091 if (ret == -EAGAIN) { 2091 if (ret == -EAGAIN) {
2092 ret = 0; 2092 ret = 0;
2093 closure_sync(&op->cl); 2093 closure_sync(&op->cl);
2094 } else if (ret) { 2094 } else if (ret) {
2095 struct bkey *k; 2095 struct bkey *k;
2096 2096
2097 pr_err("error %i trying to insert key for %s", 2097 pr_err("error %i trying to insert key for %s",
2098 ret, op_type(op)); 2098 ret, op_type(op));
2099 2099
2100 while ((k = bch_keylist_pop(&stack_keys) ?: 2100 while ((k = bch_keylist_pop(&stack_keys) ?:
2101 bch_keylist_pop(&op->keys))) 2101 bch_keylist_pop(&op->keys)))
2102 bkey_put(c, k, 0); 2102 bkey_put(c, k, 0);
2103 } 2103 }
2104 } 2104 }
2105 2105
2106 bch_keylist_free(&stack_keys); 2106 bch_keylist_free(&stack_keys);
2107 2107
2108 if (op->journal) 2108 if (op->journal)
2109 atomic_dec_bug(op->journal); 2109 atomic_dec_bug(op->journal);
2110 op->journal = NULL; 2110 op->journal = NULL;
2111 return ret; 2111 return ret;
2112 } 2112 }
2113 2113
2114 void bch_btree_set_root(struct btree *b) 2114 void bch_btree_set_root(struct btree *b)
2115 { 2115 {
2116 unsigned i; 2116 unsigned i;
2117 struct closure cl; 2117 struct closure cl;
2118 2118
2119 closure_init_stack(&cl); 2119 closure_init_stack(&cl);
2120 2120
2121 trace_bcache_btree_set_root(b); 2121 trace_bcache_btree_set_root(b);
2122 2122
2123 BUG_ON(!b->written); 2123 BUG_ON(!b->written);
2124 2124
2125 for (i = 0; i < KEY_PTRS(&b->key); i++) 2125 for (i = 0; i < KEY_PTRS(&b->key); i++)
2126 BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO); 2126 BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);
2127 2127
2128 mutex_lock(&b->c->bucket_lock); 2128 mutex_lock(&b->c->bucket_lock);
2129 list_del_init(&b->list); 2129 list_del_init(&b->list);
2130 mutex_unlock(&b->c->bucket_lock); 2130 mutex_unlock(&b->c->bucket_lock);
2131 2131
2132 b->c->root = b; 2132 b->c->root = b;
2133 __bkey_put(b->c, &b->key); 2133 __bkey_put(b->c, &b->key);
2134 2134
2135 bch_journal_meta(b->c, &cl); 2135 bch_journal_meta(b->c, &cl);
2136 closure_sync(&cl); 2136 closure_sync(&cl);
2137 } 2137 }
2138 2138
2139 /* Cache lookup */ 2139 /* Cache lookup */
2140 2140
2141 static int submit_partial_cache_miss(struct btree *b, struct btree_op *op, 2141 static int submit_partial_cache_miss(struct btree *b, struct btree_op *op,
2142 struct bkey *k) 2142 struct bkey *k)
2143 { 2143 {
2144 struct search *s = container_of(op, struct search, op); 2144 struct search *s = container_of(op, struct search, op);
2145 struct bio *bio = &s->bio.bio; 2145 struct bio *bio = &s->bio.bio;
2146 int ret = 0; 2146 int ret = 0;
2147 2147
2148 while (!ret && 2148 while (!ret &&
2149 !op->lookup_done) { 2149 !op->lookup_done) {
2150 unsigned sectors = INT_MAX; 2150 unsigned sectors = INT_MAX;
2151 2151
2152 if (KEY_INODE(k) == op->inode) { 2152 if (KEY_INODE(k) == op->inode) {
2153 if (KEY_START(k) <= bio->bi_sector) 2153 if (KEY_START(k) <= bio->bi_sector)
2154 break; 2154 break;
2155 2155
2156 sectors = min_t(uint64_t, sectors, 2156 sectors = min_t(uint64_t, sectors,
2157 KEY_START(k) - bio->bi_sector); 2157 KEY_START(k) - bio->bi_sector);
2158 } 2158 }
2159 2159
2160 ret = s->d->cache_miss(b, s, bio, sectors); 2160 ret = s->d->cache_miss(b, s, bio, sectors);
2161 } 2161 }
2162 2162
2163 return ret; 2163 return ret;
2164 } 2164 }
2165 2165
2166 /* 2166 /*
2167 * Read from a single key, handling the initial cache miss if the key starts in 2167 * Read from a single key, handling the initial cache miss if the key starts in
2168 * the middle of the bio 2168 * the middle of the bio
2169 */ 2169 */
2170 static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, 2170 static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
2171 struct bkey *k) 2171 struct bkey *k)
2172 { 2172 {
2173 struct search *s = container_of(op, struct search, op); 2173 struct search *s = container_of(op, struct search, op);
2174 struct bio *bio = &s->bio.bio; 2174 struct bio *bio = &s->bio.bio;
2175 unsigned ptr; 2175 unsigned ptr;
2176 struct bio *n; 2176 struct bio *n;
2177 2177
2178 int ret = submit_partial_cache_miss(b, op, k); 2178 int ret = submit_partial_cache_miss(b, op, k);
2179 if (ret || op->lookup_done) 2179 if (ret || op->lookup_done)
2180 return ret; 2180 return ret;
2181 2181
2182 /* XXX: figure out best pointer - for multiple cache devices */ 2182 /* XXX: figure out best pointer - for multiple cache devices */
2183 ptr = 0; 2183 ptr = 0;
2184 2184
2185 PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; 2185 PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
2186 2186
2187 while (!op->lookup_done && 2187 while (!op->lookup_done &&
2188 KEY_INODE(k) == op->inode && 2188 KEY_INODE(k) == op->inode &&
2189 bio->bi_sector < KEY_OFFSET(k)) { 2189 bio->bi_sector < KEY_OFFSET(k)) {
2190 struct bkey *bio_key; 2190 struct bkey *bio_key;
2191 sector_t sector = PTR_OFFSET(k, ptr) + 2191 sector_t sector = PTR_OFFSET(k, ptr) +
2192 (bio->bi_sector - KEY_START(k)); 2192 (bio->bi_sector - KEY_START(k));
2193 unsigned sectors = min_t(uint64_t, INT_MAX, 2193 unsigned sectors = min_t(uint64_t, INT_MAX,
2194 KEY_OFFSET(k) - bio->bi_sector); 2194 KEY_OFFSET(k) - bio->bi_sector);
2195 2195
2196 n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); 2196 n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
2197 if (!n)
2198 return -EAGAIN;
2199
2200 if (n == bio) 2197 if (n == bio)
2201 op->lookup_done = true; 2198 op->lookup_done = true;
2202 2199
2203 bio_key = &container_of(n, struct bbio, bio)->key; 2200 bio_key = &container_of(n, struct bbio, bio)->key;
2204 2201
2205 /* 2202 /*
2206 * The bucket we're reading from might be reused while our bio 2203 * The bucket we're reading from might be reused while our bio
2207 * is in flight, and we could then end up reading the wrong 2204 * is in flight, and we could then end up reading the wrong
2208 * data. 2205 * data.
2209 * 2206 *
2210 * We guard against this by checking (in cache_read_endio()) if 2207 * We guard against this by checking (in cache_read_endio()) if
2211 * the pointer is stale again; if so, we treat it as an error 2208 * the pointer is stale again; if so, we treat it as an error
2212 * and reread from the backing device (but we don't pass that 2209 * and reread from the backing device (but we don't pass that
2213 * error up anywhere). 2210 * error up anywhere).
2214 */ 2211 */
2215 2212
2216 bch_bkey_copy_single_ptr(bio_key, k, ptr); 2213 bch_bkey_copy_single_ptr(bio_key, k, ptr);
2217 SET_PTR_OFFSET(bio_key, 0, sector); 2214 SET_PTR_OFFSET(bio_key, 0, sector);
2218 2215
2219 n->bi_end_io = bch_cache_read_endio; 2216 n->bi_end_io = bch_cache_read_endio;
2220 n->bi_private = &s->cl; 2217 n->bi_private = &s->cl;
2221 2218
2222 __bch_submit_bbio(n, b->c); 2219 __bch_submit_bbio(n, b->c);
2223 } 2220 }
2224 2221
2225 return 0; 2222 return 0;
2226 } 2223 }
2227 2224
2228 int bch_btree_search_recurse(struct btree *b, struct btree_op *op) 2225 int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
2229 { 2226 {
2230 struct search *s = container_of(op, struct search, op); 2227 struct search *s = container_of(op, struct search, op);
2231 struct bio *bio = &s->bio.bio; 2228 struct bio *bio = &s->bio.bio;
2232 2229
2233 int ret = 0; 2230 int ret = 0;
2234 struct bkey *k; 2231 struct bkey *k;
2235 struct btree_iter iter; 2232 struct btree_iter iter;
2236 bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); 2233 bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0));
2237 2234
2238 do { 2235 do {
2239 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); 2236 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
2240 if (!k) { 2237 if (!k) {
2241 /* 2238 /*
2242 * b->key would be exactly what we want, except that 2239 * b->key would be exactly what we want, except that
2243 * pointers to btree nodes have nonzero size - we 2240 * pointers to btree nodes have nonzero size - we
2244 * wouldn't go far enough 2241 * wouldn't go far enough
2245 */ 2242 */
2246 2243
2247 ret = submit_partial_cache_miss(b, op, 2244 ret = submit_partial_cache_miss(b, op,
2248 &KEY(KEY_INODE(&b->key), 2245 &KEY(KEY_INODE(&b->key),
2249 KEY_OFFSET(&b->key), 0)); 2246 KEY_OFFSET(&b->key), 0));
2250 break; 2247 break;
2251 } 2248 }
2252 2249
2253 ret = b->level 2250 ret = b->level
2254 ? btree(search_recurse, k, b, op) 2251 ? btree(search_recurse, k, b, op)
2255 : submit_partial_cache_hit(b, op, k); 2252 : submit_partial_cache_hit(b, op, k);
2256 } while (!ret && 2253 } while (!ret &&
2257 !op->lookup_done); 2254 !op->lookup_done);
2258 2255
2259 return ret; 2256 return ret;
2260 } 2257 }
2261 2258
2262 /* Keybuf code */ 2259 /* Keybuf code */
2263 2260
2264 static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) 2261 static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
2265 { 2262 {
2266 /* Overlapping keys compare equal */ 2263 /* Overlapping keys compare equal */
2267 if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0) 2264 if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
2268 return -1; 2265 return -1;
2269 if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0) 2266 if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
2270 return 1; 2267 return 1;
2271 return 0; 2268 return 0;
2272 } 2269 }
2273 2270
2274 static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, 2271 static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
2275 struct keybuf_key *r) 2272 struct keybuf_key *r)
2276 { 2273 {
2277 return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); 2274 return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
2278 } 2275 }
2279 2276
2280 static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, 2277 static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
2281 struct keybuf *buf, struct bkey *end, 2278 struct keybuf *buf, struct bkey *end,
2282 keybuf_pred_fn *pred) 2279 keybuf_pred_fn *pred)
2283 { 2280 {
2284 struct btree_iter iter; 2281 struct btree_iter iter;
2285 bch_btree_iter_init(b, &iter, &buf->last_scanned); 2282 bch_btree_iter_init(b, &iter, &buf->last_scanned);
2286 2283
2287 while (!array_freelist_empty(&buf->freelist)) { 2284 while (!array_freelist_empty(&buf->freelist)) {
2288 struct bkey *k = bch_btree_iter_next_filter(&iter, b, 2285 struct bkey *k = bch_btree_iter_next_filter(&iter, b,
2289 bch_ptr_bad); 2286 bch_ptr_bad);
2290 2287
2291 if (!b->level) { 2288 if (!b->level) {
2292 if (!k) { 2289 if (!k) {
2293 buf->last_scanned = b->key; 2290 buf->last_scanned = b->key;
2294 break; 2291 break;
2295 } 2292 }
2296 2293
2297 buf->last_scanned = *k; 2294 buf->last_scanned = *k;
2298 if (bkey_cmp(&buf->last_scanned, end) >= 0) 2295 if (bkey_cmp(&buf->last_scanned, end) >= 0)
2299 break; 2296 break;
2300 2297
2301 if (pred(buf, k)) { 2298 if (pred(buf, k)) {
2302 struct keybuf_key *w; 2299 struct keybuf_key *w;
2303 2300
2304 spin_lock(&buf->lock); 2301 spin_lock(&buf->lock);
2305 2302
2306 w = array_alloc(&buf->freelist); 2303 w = array_alloc(&buf->freelist);
2307 2304
2308 w->private = NULL; 2305 w->private = NULL;
2309 bkey_copy(&w->key, k); 2306 bkey_copy(&w->key, k);
2310 2307
2311 if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) 2308 if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
2312 array_free(&buf->freelist, w); 2309 array_free(&buf->freelist, w);
2313 2310
2314 spin_unlock(&buf->lock); 2311 spin_unlock(&buf->lock);
2315 } 2312 }
2316 } else { 2313 } else {
2317 if (!k) 2314 if (!k)
2318 break; 2315 break;
2319 2316
2320 btree(refill_keybuf, k, b, op, buf, end, pred); 2317 btree(refill_keybuf, k, b, op, buf, end, pred);
2321 /* 2318 /*
2322 * Might get an error here, but can't really do anything 2319 * Might get an error here, but can't really do anything
2323 * and it'll get logged elsewhere. Just read what we 2320 * and it'll get logged elsewhere. Just read what we
2324 * can. 2321 * can.
2325 */ 2322 */
2326 2323
2327 if (bkey_cmp(&buf->last_scanned, end) >= 0) 2324 if (bkey_cmp(&buf->last_scanned, end) >= 0)
2328 break; 2325 break;
2329 2326
2330 cond_resched(); 2327 cond_resched();
2331 } 2328 }
2332 } 2329 }
2333 2330
2334 return 0; 2331 return 0;
2335 } 2332 }
2336 2333
2337 void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, 2334 void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
2338 struct bkey *end, keybuf_pred_fn *pred) 2335 struct bkey *end, keybuf_pred_fn *pred)
2339 { 2336 {
2340 struct bkey start = buf->last_scanned; 2337 struct bkey start = buf->last_scanned;
2341 struct btree_op op; 2338 struct btree_op op;
2342 bch_btree_op_init_stack(&op); 2339 bch_btree_op_init_stack(&op);
2343 2340
2344 cond_resched(); 2341 cond_resched();
2345 2342
2346 btree_root(refill_keybuf, c, &op, buf, end, pred); 2343 btree_root(refill_keybuf, c, &op, buf, end, pred);
2347 closure_sync(&op.cl); 2344 closure_sync(&op.cl);
2348 2345
2349 pr_debug("found %s keys from %llu:%llu to %llu:%llu", 2346 pr_debug("found %s keys from %llu:%llu to %llu:%llu",
2350 RB_EMPTY_ROOT(&buf->keys) ? "no" : 2347 RB_EMPTY_ROOT(&buf->keys) ? "no" :
2351 array_freelist_empty(&buf->freelist) ? "some" : "a few", 2348 array_freelist_empty(&buf->freelist) ? "some" : "a few",
2352 KEY_INODE(&start), KEY_OFFSET(&start), 2349 KEY_INODE(&start), KEY_OFFSET(&start),
2353 KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned)); 2350 KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned));
2354 2351
2355 spin_lock(&buf->lock); 2352 spin_lock(&buf->lock);
2356 2353
2357 if (!RB_EMPTY_ROOT(&buf->keys)) { 2354 if (!RB_EMPTY_ROOT(&buf->keys)) {
2358 struct keybuf_key *w; 2355 struct keybuf_key *w;
2359 w = RB_FIRST(&buf->keys, struct keybuf_key, node); 2356 w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2360 buf->start = START_KEY(&w->key); 2357 buf->start = START_KEY(&w->key);
2361 2358
2362 w = RB_LAST(&buf->keys, struct keybuf_key, node); 2359 w = RB_LAST(&buf->keys, struct keybuf_key, node);
2363 buf->end = w->key; 2360 buf->end = w->key;
2364 } else { 2361 } else {
2365 buf->start = MAX_KEY; 2362 buf->start = MAX_KEY;
2366 buf->end = MAX_KEY; 2363 buf->end = MAX_KEY;
2367 } 2364 }
2368 2365
2369 spin_unlock(&buf->lock); 2366 spin_unlock(&buf->lock);
2370 } 2367 }
2371 2368
2372 static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) 2369 static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2373 { 2370 {
2374 rb_erase(&w->node, &buf->keys); 2371 rb_erase(&w->node, &buf->keys);
2375 array_free(&buf->freelist, w); 2372 array_free(&buf->freelist, w);
2376 } 2373 }
2377 2374
2378 void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) 2375 void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2379 { 2376 {
2380 spin_lock(&buf->lock); 2377 spin_lock(&buf->lock);
2381 __bch_keybuf_del(buf, w); 2378 __bch_keybuf_del(buf, w);
2382 spin_unlock(&buf->lock); 2379 spin_unlock(&buf->lock);
2383 } 2380 }
2384 2381
2385 bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, 2382 bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
2386 struct bkey *end) 2383 struct bkey *end)
2387 { 2384 {
2388 bool ret = false; 2385 bool ret = false;
2389 struct keybuf_key *p, *w, s; 2386 struct keybuf_key *p, *w, s;
2390 s.key = *start; 2387 s.key = *start;
2391 2388
2392 if (bkey_cmp(end, &buf->start) <= 0 || 2389 if (bkey_cmp(end, &buf->start) <= 0 ||
2393 bkey_cmp(start, &buf->end) >= 0) 2390 bkey_cmp(start, &buf->end) >= 0)
2394 return false; 2391 return false;
2395 2392
2396 spin_lock(&buf->lock); 2393 spin_lock(&buf->lock);
2397 w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp); 2394 w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);
2398 2395
2399 while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) { 2396 while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
2400 p = w; 2397 p = w;
2401 w = RB_NEXT(w, node); 2398 w = RB_NEXT(w, node);
2402 2399
2403 if (p->private) 2400 if (p->private)
2404 ret = true; 2401 ret = true;
2405 else 2402 else
2406 __bch_keybuf_del(buf, p); 2403 __bch_keybuf_del(buf, p);
2407 } 2404 }
2408 2405
2409 spin_unlock(&buf->lock); 2406 spin_unlock(&buf->lock);
2410 return ret; 2407 return ret;
2411 } 2408 }
2412 2409
2413 struct keybuf_key *bch_keybuf_next(struct keybuf *buf) 2410 struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
2414 { 2411 {
2415 struct keybuf_key *w; 2412 struct keybuf_key *w;
2416 spin_lock(&buf->lock); 2413 spin_lock(&buf->lock);
2417 2414
2418 w = RB_FIRST(&buf->keys, struct keybuf_key, node); 2415 w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2419 2416
2420 while (w && w->private) 2417 while (w && w->private)
2421 w = RB_NEXT(w, node); 2418 w = RB_NEXT(w, node);
2422 2419
2423 if (w) 2420 if (w)
2424 w->private = ERR_PTR(-EINTR); 2421 w->private = ERR_PTR(-EINTR);
2425 2422
2426 spin_unlock(&buf->lock); 2423 spin_unlock(&buf->lock);
2427 return w; 2424 return w;
2428 } 2425 }
2429 2426
2430 struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, 2427 struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
2431 struct keybuf *buf, 2428 struct keybuf *buf,
2432 struct bkey *end, 2429 struct bkey *end,
2433 keybuf_pred_fn *pred) 2430 keybuf_pred_fn *pred)
2434 { 2431 {
2435 struct keybuf_key *ret; 2432 struct keybuf_key *ret;
2436 2433
2437 while (1) { 2434 while (1) {
2438 ret = bch_keybuf_next(buf); 2435 ret = bch_keybuf_next(buf);
2439 if (ret) 2436 if (ret)
2440 break; 2437 break;
2441 2438
2442 if (bkey_cmp(&buf->last_scanned, end) >= 0) { 2439 if (bkey_cmp(&buf->last_scanned, end) >= 0) {
2443 pr_debug("scan finished"); 2440 pr_debug("scan finished");
2444 break; 2441 break;
2445 } 2442 }
2446 2443
2447 bch_refill_keybuf(c, buf, end, pred); 2444 bch_refill_keybuf(c, buf, end, pred);
2448 } 2445 }
2449 2446
2450 return ret; 2447 return ret;
2451 } 2448 }
2452 2449
2453 void bch_keybuf_init(struct keybuf *buf) 2450 void bch_keybuf_init(struct keybuf *buf)
2454 { 2451 {
2455 buf->last_scanned = MAX_KEY; 2452 buf->last_scanned = MAX_KEY;
2456 buf->keys = RB_ROOT; 2453 buf->keys = RB_ROOT;
2457 2454
2458 spin_lock_init(&buf->lock); 2455 spin_lock_init(&buf->lock);
2459 array_allocator_init(&buf->freelist); 2456 array_allocator_init(&buf->freelist);
2460 } 2457 }
2461 2458
2462 void bch_btree_exit(void) 2459 void bch_btree_exit(void)
2463 { 2460 {
2464 if (btree_io_wq) 2461 if (btree_io_wq)
2465 destroy_workqueue(btree_io_wq); 2462 destroy_workqueue(btree_io_wq);
2466 if (bch_gc_wq) 2463 if (bch_gc_wq)
2467 destroy_workqueue(bch_gc_wq); 2464 destroy_workqueue(bch_gc_wq);
2468 } 2465 }
2469 2466
2470 int __init bch_btree_init(void) 2467 int __init bch_btree_init(void)
2471 { 2468 {
2472 if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) || 2469 if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) ||
2473 !(btree_io_wq = create_singlethread_workqueue("bch_btree_io"))) 2470 !(btree_io_wq = create_singlethread_workqueue("bch_btree_io")))
2474 return -ENOMEM; 2471 return -ENOMEM;
2475 2472
2476 return 0; 2473 return 0;
2477 } 2474 }
2478 2475
drivers/md/bcache/debug.c
1 /* 1 /*
2 * Assorted bcache debug code 2 * Assorted bcache debug code
3 * 3 *
4 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> 4 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
5 * Copyright 2012 Google, Inc. 5 * Copyright 2012 Google, Inc.
6 */ 6 */
7 7
8 #include "bcache.h" 8 #include "bcache.h"
9 #include "btree.h" 9 #include "btree.h"
10 #include "debug.h" 10 #include "debug.h"
11 #include "request.h" 11 #include "request.h"
12 12
13 #include <linux/console.h> 13 #include <linux/console.h>
14 #include <linux/debugfs.h> 14 #include <linux/debugfs.h>
15 #include <linux/module.h> 15 #include <linux/module.h>
16 #include <linux/random.h> 16 #include <linux/random.h>
17 #include <linux/seq_file.h> 17 #include <linux/seq_file.h>
18 18
19 static struct dentry *debug; 19 static struct dentry *debug;
20 20
21 const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) 21 const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
22 { 22 {
23 unsigned i; 23 unsigned i;
24 24
25 for (i = 0; i < KEY_PTRS(k); i++) 25 for (i = 0; i < KEY_PTRS(k); i++)
26 if (ptr_available(c, k, i)) { 26 if (ptr_available(c, k, i)) {
27 struct cache *ca = PTR_CACHE(c, k, i); 27 struct cache *ca = PTR_CACHE(c, k, i);
28 size_t bucket = PTR_BUCKET_NR(c, k, i); 28 size_t bucket = PTR_BUCKET_NR(c, k, i);
29 size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); 29 size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
30 30
31 if (KEY_SIZE(k) + r > c->sb.bucket_size) 31 if (KEY_SIZE(k) + r > c->sb.bucket_size)
32 return "bad, length too big"; 32 return "bad, length too big";
33 if (bucket < ca->sb.first_bucket) 33 if (bucket < ca->sb.first_bucket)
34 return "bad, short offset"; 34 return "bad, short offset";
35 if (bucket >= ca->sb.nbuckets) 35 if (bucket >= ca->sb.nbuckets)
36 return "bad, offset past end of device"; 36 return "bad, offset past end of device";
37 if (ptr_stale(c, k, i)) 37 if (ptr_stale(c, k, i))
38 return "stale"; 38 return "stale";
39 } 39 }
40 40
41 if (!bkey_cmp(k, &ZERO_KEY)) 41 if (!bkey_cmp(k, &ZERO_KEY))
42 return "bad, null key"; 42 return "bad, null key";
43 if (!KEY_PTRS(k)) 43 if (!KEY_PTRS(k))
44 return "bad, no pointers"; 44 return "bad, no pointers";
45 if (!KEY_SIZE(k)) 45 if (!KEY_SIZE(k))
46 return "zeroed key"; 46 return "zeroed key";
47 return ""; 47 return "";
48 } 48 }
49 49
50 int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) 50 int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
51 { 51 {
52 unsigned i = 0; 52 unsigned i = 0;
53 char *out = buf, *end = buf + size; 53 char *out = buf, *end = buf + size;
54 54
55 #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) 55 #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
56 56
57 p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k)); 57 p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k));
58 58
59 if (KEY_PTRS(k)) 59 if (KEY_PTRS(k))
60 while (1) { 60 while (1) {
61 p("%llu:%llu gen %llu", 61 p("%llu:%llu gen %llu",
62 PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i)); 62 PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i));
63 63
64 if (++i == KEY_PTRS(k)) 64 if (++i == KEY_PTRS(k))
65 break; 65 break;
66 66
67 p(", "); 67 p(", ");
68 } 68 }
69 69
70 p("]"); 70 p("]");
71 71
72 if (KEY_DIRTY(k)) 72 if (KEY_DIRTY(k))
73 p(" dirty"); 73 p(" dirty");
74 if (KEY_CSUM(k)) 74 if (KEY_CSUM(k))
75 p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); 75 p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
76 #undef p 76 #undef p
77 return out - buf; 77 return out - buf;
78 } 78 }
79 79
80 int bch_btree_to_text(char *buf, size_t size, const struct btree *b) 80 int bch_btree_to_text(char *buf, size_t size, const struct btree *b)
81 { 81 {
82 return scnprintf(buf, size, "%zu level %i/%i", 82 return scnprintf(buf, size, "%zu level %i/%i",
83 PTR_BUCKET_NR(b->c, &b->key, 0), 83 PTR_BUCKET_NR(b->c, &b->key, 0),
84 b->level, b->c->root ? b->c->root->level : -1); 84 b->level, b->c->root ? b->c->root->level : -1);
85 } 85 }
86 86
87 #if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) 87 #if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG)
88 88
89 static bool skipped_backwards(struct btree *b, struct bkey *k) 89 static bool skipped_backwards(struct btree *b, struct bkey *k)
90 { 90 {
91 return bkey_cmp(k, (!b->level) 91 return bkey_cmp(k, (!b->level)
92 ? &START_KEY(bkey_next(k)) 92 ? &START_KEY(bkey_next(k))
93 : bkey_next(k)) > 0; 93 : bkey_next(k)) > 0;
94 } 94 }
95 95
96 static void dump_bset(struct btree *b, struct bset *i) 96 static void dump_bset(struct btree *b, struct bset *i)
97 { 97 {
98 struct bkey *k; 98 struct bkey *k;
99 unsigned j; 99 unsigned j;
100 char buf[80]; 100 char buf[80];
101 101
102 for (k = i->start; k < end(i); k = bkey_next(k)) { 102 for (k = i->start; k < end(i); k = bkey_next(k)) {
103 bch_bkey_to_text(buf, sizeof(buf), k); 103 bch_bkey_to_text(buf, sizeof(buf), k);
104 printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), 104 printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
105 (uint64_t *) k - i->d, i->keys, buf); 105 (uint64_t *) k - i->d, i->keys, buf);
106 106
107 for (j = 0; j < KEY_PTRS(k); j++) { 107 for (j = 0; j < KEY_PTRS(k); j++) {
108 size_t n = PTR_BUCKET_NR(b->c, k, j); 108 size_t n = PTR_BUCKET_NR(b->c, k, j);
109 printk(" bucket %zu", n); 109 printk(" bucket %zu", n);
110 110
111 if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) 111 if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
112 printk(" prio %i", 112 printk(" prio %i",
113 PTR_BUCKET(b->c, k, j)->prio); 113 PTR_BUCKET(b->c, k, j)->prio);
114 } 114 }
115 115
116 printk(" %s\n", bch_ptr_status(b->c, k)); 116 printk(" %s\n", bch_ptr_status(b->c, k));
117 117
118 if (bkey_next(k) < end(i) && 118 if (bkey_next(k) < end(i) &&
119 skipped_backwards(b, k)) 119 skipped_backwards(b, k))
120 printk(KERN_ERR "Key skipped backwards\n"); 120 printk(KERN_ERR "Key skipped backwards\n");
121 } 121 }
122 } 122 }
123 123
124 #endif 124 #endif
125 125
126 #ifdef CONFIG_BCACHE_DEBUG 126 #ifdef CONFIG_BCACHE_DEBUG
127 127
128 void bch_btree_verify(struct btree *b, struct bset *new) 128 void bch_btree_verify(struct btree *b, struct bset *new)
129 { 129 {
130 struct btree *v = b->c->verify_data; 130 struct btree *v = b->c->verify_data;
131 struct closure cl; 131 struct closure cl;
132 closure_init_stack(&cl); 132 closure_init_stack(&cl);
133 133
134 if (!b->c->verify) 134 if (!b->c->verify)
135 return; 135 return;
136 136
137 closure_wait_event(&b->io.wait, &cl, 137 closure_wait_event(&b->io.wait, &cl,
138 atomic_read(&b->io.cl.remaining) == -1); 138 atomic_read(&b->io.cl.remaining) == -1);
139 139
140 mutex_lock(&b->c->verify_lock); 140 mutex_lock(&b->c->verify_lock);
141 141
142 bkey_copy(&v->key, &b->key); 142 bkey_copy(&v->key, &b->key);
143 v->written = 0; 143 v->written = 0;
144 v->level = b->level; 144 v->level = b->level;
145 145
146 bch_btree_node_read(v); 146 bch_btree_node_read(v);
147 closure_wait_event(&v->io.wait, &cl, 147 closure_wait_event(&v->io.wait, &cl,
148 atomic_read(&b->io.cl.remaining) == -1); 148 atomic_read(&b->io.cl.remaining) == -1);
149 149
150 if (new->keys != v->sets[0].data->keys || 150 if (new->keys != v->sets[0].data->keys ||
151 memcmp(new->start, 151 memcmp(new->start,
152 v->sets[0].data->start, 152 v->sets[0].data->start,
153 (void *) end(new) - (void *) new->start)) { 153 (void *) end(new) - (void *) new->start)) {
154 unsigned i, j; 154 unsigned i, j;
155 155
156 console_lock(); 156 console_lock();
157 157
158 printk(KERN_ERR "*** original memory node:\n"); 158 printk(KERN_ERR "*** original memory node:\n");
159 for (i = 0; i <= b->nsets; i++) 159 for (i = 0; i <= b->nsets; i++)
160 dump_bset(b, b->sets[i].data); 160 dump_bset(b, b->sets[i].data);
161 161
162 printk(KERN_ERR "*** sorted memory node:\n"); 162 printk(KERN_ERR "*** sorted memory node:\n");
163 dump_bset(b, new); 163 dump_bset(b, new);
164 164
165 printk(KERN_ERR "*** on disk node:\n"); 165 printk(KERN_ERR "*** on disk node:\n");
166 dump_bset(v, v->sets[0].data); 166 dump_bset(v, v->sets[0].data);
167 167
168 for (j = 0; j < new->keys; j++) 168 for (j = 0; j < new->keys; j++)
169 if (new->d[j] != v->sets[0].data->d[j]) 169 if (new->d[j] != v->sets[0].data->d[j])
170 break; 170 break;
171 171
172 console_unlock(); 172 console_unlock();
173 panic("verify failed at %u\n", j); 173 panic("verify failed at %u\n", j);
174 } 174 }
175 175
176 mutex_unlock(&b->c->verify_lock); 176 mutex_unlock(&b->c->verify_lock);
177 } 177 }
178 178
179 static void data_verify_endio(struct bio *bio, int error) 179 static void data_verify_endio(struct bio *bio, int error)
180 { 180 {
181 struct closure *cl = bio->bi_private; 181 struct closure *cl = bio->bi_private;
182 closure_put(cl); 182 closure_put(cl);
183 } 183 }
184 184
185 void bch_data_verify(struct search *s) 185 void bch_data_verify(struct search *s)
186 { 186 {
187 char name[BDEVNAME_SIZE]; 187 char name[BDEVNAME_SIZE];
188 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 188 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
189 struct closure *cl = &s->cl; 189 struct closure *cl = &s->cl;
190 struct bio *check; 190 struct bio *check;
191 struct bio_vec *bv; 191 struct bio_vec *bv;
192 int i; 192 int i;
193 193
194 if (!s->unaligned_bvec) 194 if (!s->unaligned_bvec)
195 bio_for_each_segment(bv, s->orig_bio, i) 195 bio_for_each_segment(bv, s->orig_bio, i)
196 bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; 196 bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
197 197
198 check = bio_clone(s->orig_bio, GFP_NOIO); 198 check = bio_clone(s->orig_bio, GFP_NOIO);
199 if (!check) 199 if (!check)
200 return; 200 return;
201 201
202 if (bch_bio_alloc_pages(check, GFP_NOIO)) 202 if (bio_alloc_pages(check, GFP_NOIO))
203 goto out_put; 203 goto out_put;
204 204
205 check->bi_rw = READ_SYNC; 205 check->bi_rw = READ_SYNC;
206 check->bi_private = cl; 206 check->bi_private = cl;
207 check->bi_end_io = data_verify_endio; 207 check->bi_end_io = data_verify_endio;
208 208
209 closure_bio_submit(check, cl, &dc->disk); 209 closure_bio_submit(check, cl, &dc->disk);
210 closure_sync(cl); 210 closure_sync(cl);
211 211
212 bio_for_each_segment(bv, s->orig_bio, i) { 212 bio_for_each_segment(bv, s->orig_bio, i) {
213 void *p1 = kmap(bv->bv_page); 213 void *p1 = kmap(bv->bv_page);
214 void *p2 = kmap(check->bi_io_vec[i].bv_page); 214 void *p2 = kmap(check->bi_io_vec[i].bv_page);
215 215
216 if (memcmp(p1 + bv->bv_offset, 216 if (memcmp(p1 + bv->bv_offset,
217 p2 + bv->bv_offset, 217 p2 + bv->bv_offset,
218 bv->bv_len)) 218 bv->bv_len))
219 printk(KERN_ERR 219 printk(KERN_ERR
220 "bcache (%s): verify failed at sector %llu\n", 220 "bcache (%s): verify failed at sector %llu\n",
221 bdevname(dc->bdev, name), 221 bdevname(dc->bdev, name),
222 (uint64_t) s->orig_bio->bi_sector); 222 (uint64_t) s->orig_bio->bi_sector);
223 223
224 kunmap(bv->bv_page); 224 kunmap(bv->bv_page);
225 kunmap(check->bi_io_vec[i].bv_page); 225 kunmap(check->bi_io_vec[i].bv_page);
226 } 226 }
227 227
228 __bio_for_each_segment(bv, check, i, 0) 228 __bio_for_each_segment(bv, check, i, 0)
229 __free_page(bv->bv_page); 229 __free_page(bv->bv_page);
230 out_put: 230 out_put:
231 bio_put(check); 231 bio_put(check);
232 } 232 }
233 233
234 #endif 234 #endif
235 235
236 #ifdef CONFIG_BCACHE_EDEBUG 236 #ifdef CONFIG_BCACHE_EDEBUG
237 237
238 unsigned bch_count_data(struct btree *b) 238 unsigned bch_count_data(struct btree *b)
239 { 239 {
240 unsigned ret = 0; 240 unsigned ret = 0;
241 struct btree_iter iter; 241 struct btree_iter iter;
242 struct bkey *k; 242 struct bkey *k;
243 243
244 if (!b->level) 244 if (!b->level)
245 for_each_key(b, k, &iter) 245 for_each_key(b, k, &iter)
246 ret += KEY_SIZE(k); 246 ret += KEY_SIZE(k);
247 return ret; 247 return ret;
248 } 248 }
249 249
250 static void vdump_bucket_and_panic(struct btree *b, const char *fmt, 250 static void vdump_bucket_and_panic(struct btree *b, const char *fmt,
251 va_list args) 251 va_list args)
252 { 252 {
253 unsigned i; 253 unsigned i;
254 char buf[80]; 254 char buf[80];
255 255
256 console_lock(); 256 console_lock();
257 257
258 for (i = 0; i <= b->nsets; i++) 258 for (i = 0; i <= b->nsets; i++)
259 dump_bset(b, b->sets[i].data); 259 dump_bset(b, b->sets[i].data);
260 260
261 vprintk(fmt, args); 261 vprintk(fmt, args);
262 262
263 console_unlock(); 263 console_unlock();
264 264
265 bch_btree_to_text(buf, sizeof(buf), b); 265 bch_btree_to_text(buf, sizeof(buf), b);
266 panic("at %s\n", buf); 266 panic("at %s\n", buf);
267 } 267 }
268 268
269 void bch_check_key_order_msg(struct btree *b, struct bset *i, 269 void bch_check_key_order_msg(struct btree *b, struct bset *i,
270 const char *fmt, ...) 270 const char *fmt, ...)
271 { 271 {
272 struct bkey *k; 272 struct bkey *k;
273 273
274 if (!i->keys) 274 if (!i->keys)
275 return; 275 return;
276 276
277 for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k)) 277 for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k))
278 if (skipped_backwards(b, k)) { 278 if (skipped_backwards(b, k)) {
279 va_list args; 279 va_list args;
280 va_start(args, fmt); 280 va_start(args, fmt);
281 281
282 vdump_bucket_and_panic(b, fmt, args); 282 vdump_bucket_and_panic(b, fmt, args);
283 va_end(args); 283 va_end(args);
284 } 284 }
285 } 285 }
286 286
287 void bch_check_keys(struct btree *b, const char *fmt, ...) 287 void bch_check_keys(struct btree *b, const char *fmt, ...)
288 { 288 {
289 va_list args; 289 va_list args;
290 struct bkey *k, *p = NULL; 290 struct bkey *k, *p = NULL;
291 struct btree_iter iter; 291 struct btree_iter iter;
292 292
293 if (b->level) 293 if (b->level)
294 return; 294 return;
295 295
296 for_each_key(b, k, &iter) { 296 for_each_key(b, k, &iter) {
297 if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) { 297 if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) {
298 printk(KERN_ERR "Keys out of order:\n"); 298 printk(KERN_ERR "Keys out of order:\n");
299 goto bug; 299 goto bug;
300 } 300 }
301 301
302 if (bch_ptr_invalid(b, k)) 302 if (bch_ptr_invalid(b, k))
303 continue; 303 continue;
304 304
305 if (p && bkey_cmp(p, &START_KEY(k)) > 0) { 305 if (p && bkey_cmp(p, &START_KEY(k)) > 0) {
306 printk(KERN_ERR "Overlapping keys:\n"); 306 printk(KERN_ERR "Overlapping keys:\n");
307 goto bug; 307 goto bug;
308 } 308 }
309 p = k; 309 p = k;
310 } 310 }
311 return; 311 return;
312 bug: 312 bug:
313 va_start(args, fmt); 313 va_start(args, fmt);
314 vdump_bucket_and_panic(b, fmt, args); 314 vdump_bucket_and_panic(b, fmt, args);
315 va_end(args); 315 va_end(args);
316 } 316 }
317 317
318 #endif 318 #endif
319 319
320 #ifdef CONFIG_DEBUG_FS 320 #ifdef CONFIG_DEBUG_FS
321 321
322 /* XXX: cache set refcounting */ 322 /* XXX: cache set refcounting */
323 323
324 struct dump_iterator { 324 struct dump_iterator {
325 char buf[PAGE_SIZE]; 325 char buf[PAGE_SIZE];
326 size_t bytes; 326 size_t bytes;
327 struct cache_set *c; 327 struct cache_set *c;
328 struct keybuf keys; 328 struct keybuf keys;
329 }; 329 };
330 330
331 static bool dump_pred(struct keybuf *buf, struct bkey *k) 331 static bool dump_pred(struct keybuf *buf, struct bkey *k)
332 { 332 {
333 return true; 333 return true;
334 } 334 }
335 335
336 static ssize_t bch_dump_read(struct file *file, char __user *buf, 336 static ssize_t bch_dump_read(struct file *file, char __user *buf,
337 size_t size, loff_t *ppos) 337 size_t size, loff_t *ppos)
338 { 338 {
339 struct dump_iterator *i = file->private_data; 339 struct dump_iterator *i = file->private_data;
340 ssize_t ret = 0; 340 ssize_t ret = 0;
341 char kbuf[80]; 341 char kbuf[80];
342 342
343 while (size) { 343 while (size) {
344 struct keybuf_key *w; 344 struct keybuf_key *w;
345 unsigned bytes = min(i->bytes, size); 345 unsigned bytes = min(i->bytes, size);
346 346
347 int err = copy_to_user(buf, i->buf, bytes); 347 int err = copy_to_user(buf, i->buf, bytes);
348 if (err) 348 if (err)
349 return err; 349 return err;
350 350
351 ret += bytes; 351 ret += bytes;
352 buf += bytes; 352 buf += bytes;
353 size -= bytes; 353 size -= bytes;
354 i->bytes -= bytes; 354 i->bytes -= bytes;
355 memmove(i->buf, i->buf + bytes, i->bytes); 355 memmove(i->buf, i->buf + bytes, i->bytes);
356 356
357 if (i->bytes) 357 if (i->bytes)
358 break; 358 break;
359 359
360 w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred); 360 w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred);
361 if (!w) 361 if (!w)
362 break; 362 break;
363 363
364 bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key); 364 bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key);
365 i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); 365 i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf);
366 bch_keybuf_del(&i->keys, w); 366 bch_keybuf_del(&i->keys, w);
367 } 367 }
368 368
369 return ret; 369 return ret;
370 } 370 }
371 371
372 static int bch_dump_open(struct inode *inode, struct file *file) 372 static int bch_dump_open(struct inode *inode, struct file *file)
373 { 373 {
374 struct cache_set *c = inode->i_private; 374 struct cache_set *c = inode->i_private;
375 struct dump_iterator *i; 375 struct dump_iterator *i;
376 376
377 i = kzalloc(sizeof(struct dump_iterator), GFP_KERNEL); 377 i = kzalloc(sizeof(struct dump_iterator), GFP_KERNEL);
378 if (!i) 378 if (!i)
379 return -ENOMEM; 379 return -ENOMEM;
380 380
381 file->private_data = i; 381 file->private_data = i;
382 i->c = c; 382 i->c = c;
383 bch_keybuf_init(&i->keys); 383 bch_keybuf_init(&i->keys);
384 i->keys.last_scanned = KEY(0, 0, 0); 384 i->keys.last_scanned = KEY(0, 0, 0);
385 385
386 return 0; 386 return 0;
387 } 387 }
388 388
389 static int bch_dump_release(struct inode *inode, struct file *file) 389 static int bch_dump_release(struct inode *inode, struct file *file)
390 { 390 {
391 kfree(file->private_data); 391 kfree(file->private_data);
392 return 0; 392 return 0;
393 } 393 }
394 394
395 static const struct file_operations cache_set_debug_ops = { 395 static const struct file_operations cache_set_debug_ops = {
396 .owner = THIS_MODULE, 396 .owner = THIS_MODULE,
397 .open = bch_dump_open, 397 .open = bch_dump_open,
398 .read = bch_dump_read, 398 .read = bch_dump_read,
399 .release = bch_dump_release 399 .release = bch_dump_release
400 }; 400 };
401 401
402 void bch_debug_init_cache_set(struct cache_set *c) 402 void bch_debug_init_cache_set(struct cache_set *c)
403 { 403 {
404 if (!IS_ERR_OR_NULL(debug)) { 404 if (!IS_ERR_OR_NULL(debug)) {
405 char name[50]; 405 char name[50];
406 snprintf(name, 50, "bcache-%pU", c->sb.set_uuid); 406 snprintf(name, 50, "bcache-%pU", c->sb.set_uuid);
407 407
408 c->debug = debugfs_create_file(name, 0400, debug, c, 408 c->debug = debugfs_create_file(name, 0400, debug, c,
409 &cache_set_debug_ops); 409 &cache_set_debug_ops);
410 } 410 }
411 } 411 }
412 412
413 #endif 413 #endif
414 414
415 void bch_debug_exit(void) 415 void bch_debug_exit(void)
416 { 416 {
417 if (!IS_ERR_OR_NULL(debug)) 417 if (!IS_ERR_OR_NULL(debug))
418 debugfs_remove_recursive(debug); 418 debugfs_remove_recursive(debug);
419 } 419 }
420 420
421 int __init bch_debug_init(struct kobject *kobj) 421 int __init bch_debug_init(struct kobject *kobj)
422 { 422 {
423 int ret = 0; 423 int ret = 0;
424 424
425 debug = debugfs_create_dir("bcache", NULL); 425 debug = debugfs_create_dir("bcache", NULL);
426 return ret; 426 return ret;
427 } 427 }
428 428
drivers/md/bcache/io.c
1 /* 1 /*
2 * Some low level IO code, and hacks for various block layer limitations 2 * Some low level IO code, and hacks for various block layer limitations
3 * 3 *
4 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> 4 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
5 * Copyright 2012 Google, Inc. 5 * Copyright 2012 Google, Inc.
6 */ 6 */
7 7
8 #include "bcache.h" 8 #include "bcache.h"
9 #include "bset.h" 9 #include "bset.h"
10 #include "debug.h" 10 #include "debug.h"
11 11
12 #include <linux/blkdev.h> 12 #include <linux/blkdev.h>
13 13
14 static void bch_bi_idx_hack_endio(struct bio *bio, int error) 14 static void bch_bi_idx_hack_endio(struct bio *bio, int error)
15 { 15 {
16 struct bio *p = bio->bi_private; 16 struct bio *p = bio->bi_private;
17 17
18 bio_endio(p, error); 18 bio_endio(p, error);
19 bio_put(bio); 19 bio_put(bio);
20 } 20 }
21 21
22 static void bch_generic_make_request_hack(struct bio *bio) 22 static void bch_generic_make_request_hack(struct bio *bio)
23 { 23 {
24 if (bio->bi_idx) { 24 if (bio->bi_idx) {
25 struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio)); 25 struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio));
26 26
27 memcpy(clone->bi_io_vec, 27 memcpy(clone->bi_io_vec,
28 bio_iovec(bio), 28 bio_iovec(bio),
29 bio_segments(bio) * sizeof(struct bio_vec)); 29 bio_segments(bio) * sizeof(struct bio_vec));
30 30
31 clone->bi_sector = bio->bi_sector; 31 clone->bi_sector = bio->bi_sector;
32 clone->bi_bdev = bio->bi_bdev; 32 clone->bi_bdev = bio->bi_bdev;
33 clone->bi_rw = bio->bi_rw; 33 clone->bi_rw = bio->bi_rw;
34 clone->bi_vcnt = bio_segments(bio); 34 clone->bi_vcnt = bio_segments(bio);
35 clone->bi_size = bio->bi_size; 35 clone->bi_size = bio->bi_size;
36 36
37 clone->bi_private = bio; 37 clone->bi_private = bio;
38 clone->bi_end_io = bch_bi_idx_hack_endio; 38 clone->bi_end_io = bch_bi_idx_hack_endio;
39 39
40 bio = clone; 40 bio = clone;
41 } 41 }
42 42
43 /* 43 /*
44 * Hack, since drivers that clone bios clone up to bi_max_vecs, but our 44 * Hack, since drivers that clone bios clone up to bi_max_vecs, but our
45 * bios might have had more than that (before we split them per device 45 * bios might have had more than that (before we split them per device
46 * limitations). 46 * limitations).
47 * 47 *
48 * To be taken out once immutable bvec stuff is in. 48 * To be taken out once immutable bvec stuff is in.
49 */ 49 */
50 bio->bi_max_vecs = bio->bi_vcnt; 50 bio->bi_max_vecs = bio->bi_vcnt;
51 51
52 generic_make_request(bio); 52 generic_make_request(bio);
53 } 53 }
54 54
55 /** 55 /**
56 * bch_bio_split - split a bio 56 * bch_bio_split - split a bio
57 * @bio: bio to split 57 * @bio: bio to split
58 * @sectors: number of sectors to split from the front of @bio 58 * @sectors: number of sectors to split from the front of @bio
59 * @gfp: gfp mask 59 * @gfp: gfp mask
60 * @bs: bio set to allocate from 60 * @bs: bio set to allocate from
61 * 61 *
62 * Allocates and returns a new bio which represents @sectors from the start of 62 * Allocates and returns a new bio which represents @sectors from the start of
63 * @bio, and updates @bio to represent the remaining sectors. 63 * @bio, and updates @bio to represent the remaining sectors.
64 * 64 *
65 * If bio_sectors(@bio) was less than or equal to @sectors, returns @bio 65 * If bio_sectors(@bio) was less than or equal to @sectors, returns @bio
66 * unchanged. 66 * unchanged.
67 * 67 *
68 * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a 68 * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a
69 * bvec boundary; it is the caller's responsibility to ensure that @bio is not 69 * bvec boundary; it is the caller's responsibility to ensure that @bio is not
70 * freed before the split. 70 * freed before the split.
71 *
72 * If bch_bio_split() is running under generic_make_request(), it's not safe to
73 * allocate more than one bio from the same bio set. Therefore, if it is running
74 * under generic_make_request() it masks out __GFP_WAIT when doing the
75 * allocation. The caller must check for failure if there's any possibility of
76 * it being called from under generic_make_request(); it is then the caller's
77 * responsibility to retry from a safe context (by e.g. punting to workqueue).
78 */ 71 */
79 struct bio *bch_bio_split(struct bio *bio, int sectors, 72 struct bio *bch_bio_split(struct bio *bio, int sectors,
80 gfp_t gfp, struct bio_set *bs) 73 gfp_t gfp, struct bio_set *bs)
81 { 74 {
82 unsigned idx = bio->bi_idx, vcnt = 0, nbytes = sectors << 9; 75 unsigned idx = bio->bi_idx, vcnt = 0, nbytes = sectors << 9;
83 struct bio_vec *bv; 76 struct bio_vec *bv;
84 struct bio *ret = NULL; 77 struct bio *ret = NULL;
85 78
86 BUG_ON(sectors <= 0); 79 BUG_ON(sectors <= 0);
87 80
88 /*
89 * If we're being called from underneath generic_make_request() and we
90 * already allocated any bios from this bio set, we risk deadlock if we
91 * use the mempool. So instead, we possibly fail and let the caller punt
92 * to workqueue or somesuch and retry in a safe context.
93 */
94 if (current->bio_list)
95 gfp &= ~__GFP_WAIT;
96
97 if (sectors >= bio_sectors(bio)) 81 if (sectors >= bio_sectors(bio))
98 return bio; 82 return bio;
99 83
100 if (bio->bi_rw & REQ_DISCARD) { 84 if (bio->bi_rw & REQ_DISCARD) {
101 ret = bio_alloc_bioset(gfp, 1, bs); 85 ret = bio_alloc_bioset(gfp, 1, bs);
102 if (!ret) 86 if (!ret)
103 return NULL; 87 return NULL;
104 idx = 0; 88 idx = 0;
105 goto out; 89 goto out;
106 } 90 }
107 91
108 bio_for_each_segment(bv, bio, idx) { 92 bio_for_each_segment(bv, bio, idx) {
109 vcnt = idx - bio->bi_idx; 93 vcnt = idx - bio->bi_idx;
110 94
111 if (!nbytes) { 95 if (!nbytes) {
112 ret = bio_alloc_bioset(gfp, vcnt, bs); 96 ret = bio_alloc_bioset(gfp, vcnt, bs);
113 if (!ret) 97 if (!ret)
114 return NULL; 98 return NULL;
115 99
116 memcpy(ret->bi_io_vec, bio_iovec(bio), 100 memcpy(ret->bi_io_vec, bio_iovec(bio),
117 sizeof(struct bio_vec) * vcnt); 101 sizeof(struct bio_vec) * vcnt);
118 102
119 break; 103 break;
120 } else if (nbytes < bv->bv_len) { 104 } else if (nbytes < bv->bv_len) {
121 ret = bio_alloc_bioset(gfp, ++vcnt, bs); 105 ret = bio_alloc_bioset(gfp, ++vcnt, bs);
122 if (!ret) 106 if (!ret)
123 return NULL; 107 return NULL;
124 108
125 memcpy(ret->bi_io_vec, bio_iovec(bio), 109 memcpy(ret->bi_io_vec, bio_iovec(bio),
126 sizeof(struct bio_vec) * vcnt); 110 sizeof(struct bio_vec) * vcnt);
127 111
128 ret->bi_io_vec[vcnt - 1].bv_len = nbytes; 112 ret->bi_io_vec[vcnt - 1].bv_len = nbytes;
129 bv->bv_offset += nbytes; 113 bv->bv_offset += nbytes;
130 bv->bv_len -= nbytes; 114 bv->bv_len -= nbytes;
131 break; 115 break;
132 } 116 }
133 117
134 nbytes -= bv->bv_len; 118 nbytes -= bv->bv_len;
135 } 119 }
136 out: 120 out:
137 ret->bi_bdev = bio->bi_bdev; 121 ret->bi_bdev = bio->bi_bdev;
138 ret->bi_sector = bio->bi_sector; 122 ret->bi_sector = bio->bi_sector;
139 ret->bi_size = sectors << 9; 123 ret->bi_size = sectors << 9;
140 ret->bi_rw = bio->bi_rw; 124 ret->bi_rw = bio->bi_rw;
141 ret->bi_vcnt = vcnt; 125 ret->bi_vcnt = vcnt;
142 ret->bi_max_vecs = vcnt; 126 ret->bi_max_vecs = vcnt;
143 127
144 bio->bi_sector += sectors; 128 bio->bi_sector += sectors;
145 bio->bi_size -= sectors << 9; 129 bio->bi_size -= sectors << 9;
146 bio->bi_idx = idx; 130 bio->bi_idx = idx;
147 131
148 if (bio_integrity(bio)) { 132 if (bio_integrity(bio)) {
149 if (bio_integrity_clone(ret, bio, gfp)) { 133 if (bio_integrity_clone(ret, bio, gfp)) {
150 bio_put(ret); 134 bio_put(ret);
151 return NULL; 135 return NULL;
152 } 136 }
153 137
154 bio_integrity_trim(ret, 0, bio_sectors(ret)); 138 bio_integrity_trim(ret, 0, bio_sectors(ret));
155 bio_integrity_trim(bio, bio_sectors(ret), bio_sectors(bio)); 139 bio_integrity_trim(bio, bio_sectors(ret), bio_sectors(bio));
156 } 140 }
157 141
158 return ret; 142 return ret;
159 } 143 }
160 144
161 static unsigned bch_bio_max_sectors(struct bio *bio) 145 static unsigned bch_bio_max_sectors(struct bio *bio)
162 { 146 {
163 unsigned ret = bio_sectors(bio); 147 unsigned ret = bio_sectors(bio);
164 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 148 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
165 unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES, 149 unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES,
166 queue_max_segments(q)); 150 queue_max_segments(q));
167 struct bio_vec *bv, *end = bio_iovec(bio) +
168 min_t(int, bio_segments(bio), max_segments);
169 151
170 if (bio->bi_rw & REQ_DISCARD) 152 if (bio->bi_rw & REQ_DISCARD)
171 return min(ret, q->limits.max_discard_sectors); 153 return min(ret, q->limits.max_discard_sectors);
172 154
173 if (bio_segments(bio) > max_segments || 155 if (bio_segments(bio) > max_segments ||
174 q->merge_bvec_fn) { 156 q->merge_bvec_fn) {
157 struct bio_vec *bv;
158 int i, seg = 0;
159
175 ret = 0; 160 ret = 0;
176 161
177 for (bv = bio_iovec(bio); bv < end; bv++) { 162 bio_for_each_segment(bv, bio, i) {
178 struct bvec_merge_data bvm = { 163 struct bvec_merge_data bvm = {
179 .bi_bdev = bio->bi_bdev, 164 .bi_bdev = bio->bi_bdev,
180 .bi_sector = bio->bi_sector, 165 .bi_sector = bio->bi_sector,
181 .bi_size = ret << 9, 166 .bi_size = ret << 9,
182 .bi_rw = bio->bi_rw, 167 .bi_rw = bio->bi_rw,
183 }; 168 };
184 169
170 if (seg == max_segments)
171 break;
172
185 if (q->merge_bvec_fn && 173 if (q->merge_bvec_fn &&
186 q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len) 174 q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len)
187 break; 175 break;
188 176
177 seg++;
189 ret += bv->bv_len >> 9; 178 ret += bv->bv_len >> 9;
190 } 179 }
191 } 180 }
192 181
193 ret = min(ret, queue_max_sectors(q)); 182 ret = min(ret, queue_max_sectors(q));
194 183
195 WARN_ON(!ret); 184 WARN_ON(!ret);
196 ret = max_t(int, ret, bio_iovec(bio)->bv_len >> 9); 185 ret = max_t(int, ret, bio_iovec(bio)->bv_len >> 9);
197 186
198 return ret; 187 return ret;
199 } 188 }
200 189
201 static void bch_bio_submit_split_done(struct closure *cl) 190 static void bch_bio_submit_split_done(struct closure *cl)
202 { 191 {
203 struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); 192 struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
204 193
205 s->bio->bi_end_io = s->bi_end_io; 194 s->bio->bi_end_io = s->bi_end_io;
206 s->bio->bi_private = s->bi_private; 195 s->bio->bi_private = s->bi_private;
207 bio_endio(s->bio, 0); 196 bio_endio(s->bio, 0);
208 197
209 closure_debug_destroy(&s->cl); 198 closure_debug_destroy(&s->cl);
210 mempool_free(s, s->p->bio_split_hook); 199 mempool_free(s, s->p->bio_split_hook);
211 } 200 }
212 201
213 static void bch_bio_submit_split_endio(struct bio *bio, int error) 202 static void bch_bio_submit_split_endio(struct bio *bio, int error)
214 { 203 {
215 struct closure *cl = bio->bi_private; 204 struct closure *cl = bio->bi_private;
216 struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); 205 struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
217 206
218 if (error) 207 if (error)
219 clear_bit(BIO_UPTODATE, &s->bio->bi_flags); 208 clear_bit(BIO_UPTODATE, &s->bio->bi_flags);
220 209
221 bio_put(bio); 210 bio_put(bio);
222 closure_put(cl); 211 closure_put(cl);
223 } 212 }
224 213
225 static void __bch_bio_submit_split(struct closure *cl)
226 {
227 struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
228 struct bio *bio = s->bio, *n;
229
230 do {
231 n = bch_bio_split(bio, bch_bio_max_sectors(bio),
232 GFP_NOIO, s->p->bio_split);
233 if (!n)
234 continue_at(cl, __bch_bio_submit_split, system_wq);
235
236 n->bi_end_io = bch_bio_submit_split_endio;
237 n->bi_private = cl;
238
239 closure_get(cl);
240 bch_generic_make_request_hack(n);
241 } while (n != bio);
242
243 continue_at(cl, bch_bio_submit_split_done, NULL);
244 }
245
246 void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) 214 void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
247 { 215 {
248 struct bio_split_hook *s; 216 struct bio_split_hook *s;
217 struct bio *n;
249 218
250 if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD)) 219 if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD))
251 goto submit; 220 goto submit;
252 221
253 if (bio_sectors(bio) <= bch_bio_max_sectors(bio)) 222 if (bio_sectors(bio) <= bch_bio_max_sectors(bio))
254 goto submit; 223 goto submit;
255 224
256 s = mempool_alloc(p->bio_split_hook, GFP_NOIO); 225 s = mempool_alloc(p->bio_split_hook, GFP_NOIO);
226 closure_init(&s->cl, NULL);
257 227
258 s->bio = bio; 228 s->bio = bio;
259 s->p = p; 229 s->p = p;
260 s->bi_end_io = bio->bi_end_io; 230 s->bi_end_io = bio->bi_end_io;
261 s->bi_private = bio->bi_private; 231 s->bi_private = bio->bi_private;
262 bio_get(bio); 232 bio_get(bio);
263 233
264 closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL); 234 do {
265 return; 235 n = bch_bio_split(bio, bch_bio_max_sectors(bio),
236 GFP_NOIO, s->p->bio_split);
237
238 n->bi_end_io = bch_bio_submit_split_endio;
239 n->bi_private = &s->cl;
240
241 closure_get(&s->cl);
242 bch_generic_make_request_hack(n);
243 } while (n != bio);
244
245 continue_at(&s->cl, bch_bio_submit_split_done, NULL);
266 submit: 246 submit:
267 bch_generic_make_request_hack(bio); 247 bch_generic_make_request_hack(bio);
268 } 248 }
269 249
270 /* Bios with headers */ 250 /* Bios with headers */
271 251
272 void bch_bbio_free(struct bio *bio, struct cache_set *c) 252 void bch_bbio_free(struct bio *bio, struct cache_set *c)
273 { 253 {
274 struct bbio *b = container_of(bio, struct bbio, bio); 254 struct bbio *b = container_of(bio, struct bbio, bio);
275 mempool_free(b, c->bio_meta); 255 mempool_free(b, c->bio_meta);
276 } 256 }
277 257
278 struct bio *bch_bbio_alloc(struct cache_set *c) 258 struct bio *bch_bbio_alloc(struct cache_set *c)
279 { 259 {
280 struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO); 260 struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO);
281 struct bio *bio = &b->bio; 261 struct bio *bio = &b->bio;
282 262
283 bio_init(bio); 263 bio_init(bio);
284 bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; 264 bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET;
285 bio->bi_max_vecs = bucket_pages(c); 265 bio->bi_max_vecs = bucket_pages(c);
286 bio->bi_io_vec = bio->bi_inline_vecs; 266 bio->bi_io_vec = bio->bi_inline_vecs;
287 267
288 return bio; 268 return bio;
289 } 269 }
290 270
291 void __bch_submit_bbio(struct bio *bio, struct cache_set *c) 271 void __bch_submit_bbio(struct bio *bio, struct cache_set *c)
292 { 272 {
293 struct bbio *b = container_of(bio, struct bbio, bio); 273 struct bbio *b = container_of(bio, struct bbio, bio);
294 274
295 bio->bi_sector = PTR_OFFSET(&b->key, 0); 275 bio->bi_sector = PTR_OFFSET(&b->key, 0);
296 bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev; 276 bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev;
297 277
298 b->submit_time_us = local_clock_us(); 278 b->submit_time_us = local_clock_us();
299 closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0)); 279 closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0));
300 } 280 }
301 281
302 void bch_submit_bbio(struct bio *bio, struct cache_set *c, 282 void bch_submit_bbio(struct bio *bio, struct cache_set *c,
303 struct bkey *k, unsigned ptr) 283 struct bkey *k, unsigned ptr)
304 { 284 {
305 struct bbio *b = container_of(bio, struct bbio, bio); 285 struct bbio *b = container_of(bio, struct bbio, bio);
306 bch_bkey_copy_single_ptr(&b->key, k, ptr); 286 bch_bkey_copy_single_ptr(&b->key, k, ptr);
307 __bch_submit_bbio(bio, c); 287 __bch_submit_bbio(bio, c);
308 } 288 }
309 289
310 /* IO errors */ 290 /* IO errors */
311 291
312 void bch_count_io_errors(struct cache *ca, int error, const char *m) 292 void bch_count_io_errors(struct cache *ca, int error, const char *m)
313 { 293 {
314 /* 294 /*
315 * The halflife of an error is: 295 * The halflife of an error is:
316 * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh 296 * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh
317 */ 297 */
318 298
319 if (ca->set->error_decay) { 299 if (ca->set->error_decay) {
320 unsigned count = atomic_inc_return(&ca->io_count); 300 unsigned count = atomic_inc_return(&ca->io_count);
321 301
322 while (count > ca->set->error_decay) { 302 while (count > ca->set->error_decay) {
323 unsigned errors; 303 unsigned errors;
324 unsigned old = count; 304 unsigned old = count;
325 unsigned new = count - ca->set->error_decay; 305 unsigned new = count - ca->set->error_decay;
326 306
327 /* 307 /*
328 * First we subtract refresh from count; each time we 308 * First we subtract refresh from count; each time we
329 * successfully do so, we rescale the errors once: 309 * successfully do so, we rescale the errors once:
330 */ 310 */
331 311
332 count = atomic_cmpxchg(&ca->io_count, old, new); 312 count = atomic_cmpxchg(&ca->io_count, old, new);
333 313
334 if (count == old) { 314 if (count == old) {
335 count = new; 315 count = new;
336 316
337 errors = atomic_read(&ca->io_errors); 317 errors = atomic_read(&ca->io_errors);
338 do { 318 do {
339 old = errors; 319 old = errors;
340 new = ((uint64_t) errors * 127) / 128; 320 new = ((uint64_t) errors * 127) / 128;
341 errors = atomic_cmpxchg(&ca->io_errors, 321 errors = atomic_cmpxchg(&ca->io_errors,
342 old, new); 322 old, new);
343 } while (old != errors); 323 } while (old != errors);
344 } 324 }
345 } 325 }
346 } 326 }
347 327
348 if (error) { 328 if (error) {
349 char buf[BDEVNAME_SIZE]; 329 char buf[BDEVNAME_SIZE];
350 unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT, 330 unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT,
351 &ca->io_errors); 331 &ca->io_errors);
352 errors >>= IO_ERROR_SHIFT; 332 errors >>= IO_ERROR_SHIFT;
353 333
354 if (errors < ca->set->error_limit) 334 if (errors < ca->set->error_limit)
355 pr_err("%s: IO error on %s, recovering", 335 pr_err("%s: IO error on %s, recovering",
356 bdevname(ca->bdev, buf), m); 336 bdevname(ca->bdev, buf), m);
357 else 337 else
358 bch_cache_set_error(ca->set, 338 bch_cache_set_error(ca->set,
359 "%s: too many IO errors %s", 339 "%s: too many IO errors %s",
360 bdevname(ca->bdev, buf), m); 340 bdevname(ca->bdev, buf), m);
361 } 341 }
362 } 342 }
363 343
364 void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, 344 void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
365 int error, const char *m) 345 int error, const char *m)
366 { 346 {
367 struct bbio *b = container_of(bio, struct bbio, bio); 347 struct bbio *b = container_of(bio, struct bbio, bio);
368 struct cache *ca = PTR_CACHE(c, &b->key, 0); 348 struct cache *ca = PTR_CACHE(c, &b->key, 0);
369 349
370 unsigned threshold = bio->bi_rw & REQ_WRITE 350 unsigned threshold = bio->bi_rw & REQ_WRITE
371 ? c->congested_write_threshold_us 351 ? c->congested_write_threshold_us
372 : c->congested_read_threshold_us; 352 : c->congested_read_threshold_us;
373 353
374 if (threshold) { 354 if (threshold) {
375 unsigned t = local_clock_us(); 355 unsigned t = local_clock_us();
376 356
377 int us = t - b->submit_time_us; 357 int us = t - b->submit_time_us;
378 int congested = atomic_read(&c->congested); 358 int congested = atomic_read(&c->congested);
379 359
380 if (us > (int) threshold) { 360 if (us > (int) threshold) {
381 int ms = us / 1024; 361 int ms = us / 1024;
382 c->congested_last_us = t; 362 c->congested_last_us = t;
383 363
drivers/md/bcache/movinggc.c
1 /* 1 /*
2 * Moving/copying garbage collector 2 * Moving/copying garbage collector
3 * 3 *
4 * Copyright 2012 Google, Inc. 4 * Copyright 2012 Google, Inc.
5 */ 5 */
6 6
7 #include "bcache.h" 7 #include "bcache.h"
8 #include "btree.h" 8 #include "btree.h"
9 #include "debug.h" 9 #include "debug.h"
10 #include "request.h" 10 #include "request.h"
11 11
12 #include <trace/events/bcache.h> 12 #include <trace/events/bcache.h>
13 13
14 struct moving_io { 14 struct moving_io {
15 struct keybuf_key *w; 15 struct keybuf_key *w;
16 struct search s; 16 struct search s;
17 struct bbio bio; 17 struct bbio bio;
18 }; 18 };
19 19
20 static bool moving_pred(struct keybuf *buf, struct bkey *k) 20 static bool moving_pred(struct keybuf *buf, struct bkey *k)
21 { 21 {
22 struct cache_set *c = container_of(buf, struct cache_set, 22 struct cache_set *c = container_of(buf, struct cache_set,
23 moving_gc_keys); 23 moving_gc_keys);
24 unsigned i; 24 unsigned i;
25 25
26 for (i = 0; i < KEY_PTRS(k); i++) { 26 for (i = 0; i < KEY_PTRS(k); i++) {
27 struct cache *ca = PTR_CACHE(c, k, i); 27 struct cache *ca = PTR_CACHE(c, k, i);
28 struct bucket *g = PTR_BUCKET(c, k, i); 28 struct bucket *g = PTR_BUCKET(c, k, i);
29 29
30 if (GC_SECTORS_USED(g) < ca->gc_move_threshold) 30 if (GC_SECTORS_USED(g) < ca->gc_move_threshold)
31 return true; 31 return true;
32 } 32 }
33 33
34 return false; 34 return false;
35 } 35 }
36 36
37 /* Moving GC - IO loop */ 37 /* Moving GC - IO loop */
38 38
39 static void moving_io_destructor(struct closure *cl) 39 static void moving_io_destructor(struct closure *cl)
40 { 40 {
41 struct moving_io *io = container_of(cl, struct moving_io, s.cl); 41 struct moving_io *io = container_of(cl, struct moving_io, s.cl);
42 kfree(io); 42 kfree(io);
43 } 43 }
44 44
45 static void write_moving_finish(struct closure *cl) 45 static void write_moving_finish(struct closure *cl)
46 { 46 {
47 struct moving_io *io = container_of(cl, struct moving_io, s.cl); 47 struct moving_io *io = container_of(cl, struct moving_io, s.cl);
48 struct bio *bio = &io->bio.bio; 48 struct bio *bio = &io->bio.bio;
49 struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt); 49 struct bio_vec *bv;
50 int i;
50 51
51 while (bv-- != bio->bi_io_vec) 52 bio_for_each_segment_all(bv, bio, i)
52 __free_page(bv->bv_page); 53 __free_page(bv->bv_page);
53 54
54 if (io->s.op.insert_collision) 55 if (io->s.op.insert_collision)
55 trace_bcache_gc_copy_collision(&io->w->key); 56 trace_bcache_gc_copy_collision(&io->w->key);
56 57
57 bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); 58 bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w);
58 59
59 atomic_dec_bug(&io->s.op.c->in_flight); 60 atomic_dec_bug(&io->s.op.c->in_flight);
60 closure_wake_up(&io->s.op.c->moving_gc_wait); 61 closure_wake_up(&io->s.op.c->moving_gc_wait);
61 62
62 closure_return_with_destructor(cl, moving_io_destructor); 63 closure_return_with_destructor(cl, moving_io_destructor);
63 } 64 }
64 65
65 static void read_moving_endio(struct bio *bio, int error) 66 static void read_moving_endio(struct bio *bio, int error)
66 { 67 {
67 struct moving_io *io = container_of(bio->bi_private, 68 struct moving_io *io = container_of(bio->bi_private,
68 struct moving_io, s.cl); 69 struct moving_io, s.cl);
69 70
70 if (error) 71 if (error)
71 io->s.error = error; 72 io->s.error = error;
72 73
73 bch_bbio_endio(io->s.op.c, bio, error, "reading data to move"); 74 bch_bbio_endio(io->s.op.c, bio, error, "reading data to move");
74 } 75 }
75 76
76 static void moving_init(struct moving_io *io) 77 static void moving_init(struct moving_io *io)
77 { 78 {
78 struct bio *bio = &io->bio.bio; 79 struct bio *bio = &io->bio.bio;
79 80
80 bio_init(bio); 81 bio_init(bio);
81 bio_get(bio); 82 bio_get(bio);
82 bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); 83 bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
83 84
84 bio->bi_size = KEY_SIZE(&io->w->key) << 9; 85 bio->bi_size = KEY_SIZE(&io->w->key) << 9;
85 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), 86 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key),
86 PAGE_SECTORS); 87 PAGE_SECTORS);
87 bio->bi_private = &io->s.cl; 88 bio->bi_private = &io->s.cl;
88 bio->bi_io_vec = bio->bi_inline_vecs; 89 bio->bi_io_vec = bio->bi_inline_vecs;
89 bch_bio_map(bio, NULL); 90 bch_bio_map(bio, NULL);
90 } 91 }
91 92
92 static void write_moving(struct closure *cl) 93 static void write_moving(struct closure *cl)
93 { 94 {
94 struct search *s = container_of(cl, struct search, cl); 95 struct search *s = container_of(cl, struct search, cl);
95 struct moving_io *io = container_of(s, struct moving_io, s); 96 struct moving_io *io = container_of(s, struct moving_io, s);
96 97
97 if (!s->error) { 98 if (!s->error) {
98 moving_init(io); 99 moving_init(io);
99 100
100 io->bio.bio.bi_sector = KEY_START(&io->w->key); 101 io->bio.bio.bi_sector = KEY_START(&io->w->key);
101 s->op.lock = -1; 102 s->op.lock = -1;
102 s->op.write_prio = 1; 103 s->op.write_prio = 1;
103 s->op.cache_bio = &io->bio.bio; 104 s->op.cache_bio = &io->bio.bio;
104 105
105 s->writeback = KEY_DIRTY(&io->w->key); 106 s->writeback = KEY_DIRTY(&io->w->key);
106 s->op.csum = KEY_CSUM(&io->w->key); 107 s->op.csum = KEY_CSUM(&io->w->key);
107 108
108 s->op.type = BTREE_REPLACE; 109 s->op.type = BTREE_REPLACE;
109 bkey_copy(&s->op.replace, &io->w->key); 110 bkey_copy(&s->op.replace, &io->w->key);
110 111
111 closure_init(&s->op.cl, cl); 112 closure_init(&s->op.cl, cl);
112 bch_insert_data(&s->op.cl); 113 bch_insert_data(&s->op.cl);
113 } 114 }
114 115
115 continue_at(cl, write_moving_finish, NULL); 116 continue_at(cl, write_moving_finish, NULL);
116 } 117 }
117 118
118 static void read_moving_submit(struct closure *cl) 119 static void read_moving_submit(struct closure *cl)
119 { 120 {
120 struct search *s = container_of(cl, struct search, cl); 121 struct search *s = container_of(cl, struct search, cl);
121 struct moving_io *io = container_of(s, struct moving_io, s); 122 struct moving_io *io = container_of(s, struct moving_io, s);
122 struct bio *bio = &io->bio.bio; 123 struct bio *bio = &io->bio.bio;
123 124
124 bch_submit_bbio(bio, s->op.c, &io->w->key, 0); 125 bch_submit_bbio(bio, s->op.c, &io->w->key, 0);
125 126
126 continue_at(cl, write_moving, bch_gc_wq); 127 continue_at(cl, write_moving, bch_gc_wq);
127 } 128 }
128 129
129 static void read_moving(struct closure *cl) 130 static void read_moving(struct closure *cl)
130 { 131 {
131 struct cache_set *c = container_of(cl, struct cache_set, moving_gc); 132 struct cache_set *c = container_of(cl, struct cache_set, moving_gc);
132 struct keybuf_key *w; 133 struct keybuf_key *w;
133 struct moving_io *io; 134 struct moving_io *io;
134 struct bio *bio; 135 struct bio *bio;
135 136
136 /* XXX: if we error, background writeback could stall indefinitely */ 137 /* XXX: if we error, background writeback could stall indefinitely */
137 138
138 while (!test_bit(CACHE_SET_STOPPING, &c->flags)) { 139 while (!test_bit(CACHE_SET_STOPPING, &c->flags)) {
139 w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, 140 w = bch_keybuf_next_rescan(c, &c->moving_gc_keys,
140 &MAX_KEY, moving_pred); 141 &MAX_KEY, moving_pred);
141 if (!w) 142 if (!w)
142 break; 143 break;
143 144
144 io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) 145 io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec)
145 * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 146 * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
146 GFP_KERNEL); 147 GFP_KERNEL);
147 if (!io) 148 if (!io)
148 goto err; 149 goto err;
149 150
150 w->private = io; 151 w->private = io;
151 io->w = w; 152 io->w = w;
152 io->s.op.inode = KEY_INODE(&w->key); 153 io->s.op.inode = KEY_INODE(&w->key);
153 io->s.op.c = c; 154 io->s.op.c = c;
154 155
155 moving_init(io); 156 moving_init(io);
156 bio = &io->bio.bio; 157 bio = &io->bio.bio;
157 158
158 bio->bi_rw = READ; 159 bio->bi_rw = READ;
159 bio->bi_end_io = read_moving_endio; 160 bio->bi_end_io = read_moving_endio;
160 161
161 if (bch_bio_alloc_pages(bio, GFP_KERNEL)) 162 if (bio_alloc_pages(bio, GFP_KERNEL))
162 goto err; 163 goto err;
163 164
164 trace_bcache_gc_copy(&w->key); 165 trace_bcache_gc_copy(&w->key);
165 166
166 closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); 167 closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl);
167 168
168 if (atomic_inc_return(&c->in_flight) >= 64) { 169 if (atomic_inc_return(&c->in_flight) >= 64) {
169 closure_wait_event(&c->moving_gc_wait, cl, 170 closure_wait_event(&c->moving_gc_wait, cl,
170 atomic_read(&c->in_flight) < 64); 171 atomic_read(&c->in_flight) < 64);
171 continue_at(cl, read_moving, bch_gc_wq); 172 continue_at(cl, read_moving, bch_gc_wq);
172 } 173 }
173 } 174 }
174 175
175 if (0) { 176 if (0) {
176 err: if (!IS_ERR_OR_NULL(w->private)) 177 err: if (!IS_ERR_OR_NULL(w->private))
177 kfree(w->private); 178 kfree(w->private);
178 179
179 bch_keybuf_del(&c->moving_gc_keys, w); 180 bch_keybuf_del(&c->moving_gc_keys, w);
180 } 181 }
181 182
182 closure_return(cl); 183 closure_return(cl);
183 } 184 }
184 185
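For context, bio_alloc_pages() is the standard helper that replaces bcache's local bch_bio_alloc_pages() in read_moving() above: it attaches one freshly allocated page to each segment of the bio and returns non-zero on failure. A hedged sketch of roughly equivalent logic (written from memory, not the verbatim block-layer source; the hypothetical name avoids clashing with the real helper):

	/* Roughly what bio_alloc_pages(bio, gfp) provides. */
	static int alloc_pages_for_bio(struct bio *bio, gfp_t gfp_mask)
	{
		struct bio_vec *bv;
		int i;

		bio_for_each_segment_all(bv, bio, i) {
			bv->bv_page = alloc_page(gfp_mask);
			if (!bv->bv_page) {
				/* Unwind whatever was already allocated. */
				while (--bv >= bio->bi_io_vec)
					__free_page(bv->bv_page);
				return -ENOMEM;
			}
		}

		return 0;
	}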
185 static bool bucket_cmp(struct bucket *l, struct bucket *r) 186 static bool bucket_cmp(struct bucket *l, struct bucket *r)
186 { 187 {
187 return GC_SECTORS_USED(l) < GC_SECTORS_USED(r); 188 return GC_SECTORS_USED(l) < GC_SECTORS_USED(r);
188 } 189 }
189 190
190 static unsigned bucket_heap_top(struct cache *ca) 191 static unsigned bucket_heap_top(struct cache *ca)
191 { 192 {
192 return GC_SECTORS_USED(heap_peek(&ca->heap)); 193 return GC_SECTORS_USED(heap_peek(&ca->heap));
193 } 194 }
194 195
195 void bch_moving_gc(struct closure *cl) 196 void bch_moving_gc(struct closure *cl)
196 { 197 {
197 struct cache_set *c = container_of(cl, struct cache_set, gc.cl); 198 struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
198 struct cache *ca; 199 struct cache *ca;
199 struct bucket *b; 200 struct bucket *b;
200 unsigned i; 201 unsigned i;
201 202
202 if (!c->copy_gc_enabled) 203 if (!c->copy_gc_enabled)
203 closure_return(cl); 204 closure_return(cl);
204 205
205 mutex_lock(&c->bucket_lock); 206 mutex_lock(&c->bucket_lock);
206 207
207 for_each_cache(ca, c, i) { 208 for_each_cache(ca, c, i) {
208 unsigned sectors_to_move = 0; 209 unsigned sectors_to_move = 0;
209 unsigned reserve_sectors = ca->sb.bucket_size * 210 unsigned reserve_sectors = ca->sb.bucket_size *
210 min(fifo_used(&ca->free), ca->free.size / 2); 211 min(fifo_used(&ca->free), ca->free.size / 2);
211 212
212 ca->heap.used = 0; 213 ca->heap.used = 0;
213 214
214 for_each_bucket(b, ca) { 215 for_each_bucket(b, ca) {
215 if (!GC_SECTORS_USED(b)) 216 if (!GC_SECTORS_USED(b))
216 continue; 217 continue;
217 218
218 if (!heap_full(&ca->heap)) { 219 if (!heap_full(&ca->heap)) {
219 sectors_to_move += GC_SECTORS_USED(b); 220 sectors_to_move += GC_SECTORS_USED(b);
220 heap_add(&ca->heap, b, bucket_cmp); 221 heap_add(&ca->heap, b, bucket_cmp);
221 } else if (bucket_cmp(b, heap_peek(&ca->heap))) { 222 } else if (bucket_cmp(b, heap_peek(&ca->heap))) {
222 sectors_to_move -= bucket_heap_top(ca); 223 sectors_to_move -= bucket_heap_top(ca);
223 sectors_to_move += GC_SECTORS_USED(b); 224 sectors_to_move += GC_SECTORS_USED(b);
224 225
225 ca->heap.data[0] = b; 226 ca->heap.data[0] = b;
226 heap_sift(&ca->heap, 0, bucket_cmp); 227 heap_sift(&ca->heap, 0, bucket_cmp);
227 } 228 }
228 } 229 }
229 230
230 while (sectors_to_move > reserve_sectors) { 231 while (sectors_to_move > reserve_sectors) {
231 heap_pop(&ca->heap, b, bucket_cmp); 232 heap_pop(&ca->heap, b, bucket_cmp);
232 sectors_to_move -= GC_SECTORS_USED(b); 233 sectors_to_move -= GC_SECTORS_USED(b);
233 } 234 }
234 235
235 ca->gc_move_threshold = bucket_heap_top(ca); 236 ca->gc_move_threshold = bucket_heap_top(ca);
236 237
237 pr_debug("threshold %u", ca->gc_move_threshold); 238 pr_debug("threshold %u", ca->gc_move_threshold);
238 } 239 }
239 240
240 mutex_unlock(&c->bucket_lock); 241 mutex_unlock(&c->bucket_lock);
241 242
242 c->moving_gc_keys.last_scanned = ZERO_KEY; 243 c->moving_gc_keys.last_scanned = ZERO_KEY;
243 244
244 closure_init(&c->moving_gc, cl); 245 closure_init(&c->moving_gc, cl);
245 read_moving(&c->moving_gc); 246 read_moving(&c->moving_gc);
246 247
247 closure_return(cl); 248 closure_return(cl);
248 } 249 }
249 250
250 void bch_moving_init_cache_set(struct cache_set *c) 251 void bch_moving_init_cache_set(struct cache_set *c)
251 { 252 {
252 bch_keybuf_init(&c->moving_gc_keys); 253 bch_keybuf_init(&c->moving_gc_keys);
253 } 254 }
254 255
drivers/md/bcache/request.c
1 /* 1 /*
2 * Main bcache entry point - handle a read or a write request and decide what to 2 * Main bcache entry point - handle a read or a write request and decide what to
3 * do with it; the make_request functions are called by the block layer. 3 * do with it; the make_request functions are called by the block layer.
4 * 4 *
5 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> 5 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6 * Copyright 2012 Google, Inc. 6 * Copyright 2012 Google, Inc.
7 */ 7 */
8 8
9 #include "bcache.h" 9 #include "bcache.h"
10 #include "btree.h" 10 #include "btree.h"
11 #include "debug.h" 11 #include "debug.h"
12 #include "request.h" 12 #include "request.h"
13 #include "writeback.h" 13 #include "writeback.h"
14 14
15 #include <linux/cgroup.h> 15 #include <linux/cgroup.h>
16 #include <linux/module.h> 16 #include <linux/module.h>
17 #include <linux/hash.h> 17 #include <linux/hash.h>
18 #include <linux/random.h> 18 #include <linux/random.h>
19 #include "blk-cgroup.h" 19 #include "blk-cgroup.h"
20 20
21 #include <trace/events/bcache.h> 21 #include <trace/events/bcache.h>
22 22
23 #define CUTOFF_CACHE_ADD 95 23 #define CUTOFF_CACHE_ADD 95
24 #define CUTOFF_CACHE_READA 90 24 #define CUTOFF_CACHE_READA 90
25 25
26 struct kmem_cache *bch_search_cache; 26 struct kmem_cache *bch_search_cache;
27 27
28 static void check_should_skip(struct cached_dev *, struct search *); 28 static void check_should_skip(struct cached_dev *, struct search *);
29 29
30 /* Cgroup interface */ 30 /* Cgroup interface */
31 31
32 #ifdef CONFIG_CGROUP_BCACHE 32 #ifdef CONFIG_CGROUP_BCACHE
33 static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 }; 33 static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 };
34 34
35 static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup) 35 static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup)
36 { 36 {
37 struct cgroup_subsys_state *css; 37 struct cgroup_subsys_state *css;
38 return cgroup && 38 return cgroup &&
39 (css = cgroup_subsys_state(cgroup, bcache_subsys_id)) 39 (css = cgroup_subsys_state(cgroup, bcache_subsys_id))
40 ? container_of(css, struct bch_cgroup, css) 40 ? container_of(css, struct bch_cgroup, css)
41 : &bcache_default_cgroup; 41 : &bcache_default_cgroup;
42 } 42 }
43 43
44 struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio) 44 struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio)
45 { 45 {
46 struct cgroup_subsys_state *css = bio->bi_css 46 struct cgroup_subsys_state *css = bio->bi_css
47 ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id) 47 ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id)
48 : task_subsys_state(current, bcache_subsys_id); 48 : task_subsys_state(current, bcache_subsys_id);
49 49
50 return css 50 return css
51 ? container_of(css, struct bch_cgroup, css) 51 ? container_of(css, struct bch_cgroup, css)
52 : &bcache_default_cgroup; 52 : &bcache_default_cgroup;
53 } 53 }
54 54
55 static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft, 55 static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft,
56 struct file *file, 56 struct file *file,
57 char __user *buf, size_t nbytes, loff_t *ppos) 57 char __user *buf, size_t nbytes, loff_t *ppos)
58 { 58 {
59 char tmp[1024]; 59 char tmp[1024];
60 int len = bch_snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes, 60 int len = bch_snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes,
61 cgroup_to_bcache(cgrp)->cache_mode + 1); 61 cgroup_to_bcache(cgrp)->cache_mode + 1);
62 62
63 if (len < 0) 63 if (len < 0)
64 return len; 64 return len;
65 65
66 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 66 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
67 } 67 }
68 68
69 static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft, 69 static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft,
70 const char *buf) 70 const char *buf)
71 { 71 {
72 int v = bch_read_string_list(buf, bch_cache_modes); 72 int v = bch_read_string_list(buf, bch_cache_modes);
73 if (v < 0) 73 if (v < 0)
74 return v; 74 return v;
75 75
76 cgroup_to_bcache(cgrp)->cache_mode = v - 1; 76 cgroup_to_bcache(cgrp)->cache_mode = v - 1;
77 return 0; 77 return 0;
78 } 78 }
79 79
80 static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft) 80 static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft)
81 { 81 {
82 return cgroup_to_bcache(cgrp)->verify; 82 return cgroup_to_bcache(cgrp)->verify;
83 } 83 }
84 84
85 static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val) 85 static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
86 { 86 {
87 cgroup_to_bcache(cgrp)->verify = val; 87 cgroup_to_bcache(cgrp)->verify = val;
88 return 0; 88 return 0;
89 } 89 }
90 90
91 static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft) 91 static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft)
92 { 92 {
93 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); 93 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
94 return atomic_read(&bcachecg->stats.cache_hits); 94 return atomic_read(&bcachecg->stats.cache_hits);
95 } 95 }
96 96
97 static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft) 97 static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft)
98 { 98 {
99 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); 99 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
100 return atomic_read(&bcachecg->stats.cache_misses); 100 return atomic_read(&bcachecg->stats.cache_misses);
101 } 101 }
102 102
103 static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp, 103 static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp,
104 struct cftype *cft) 104 struct cftype *cft)
105 { 105 {
106 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); 106 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
107 return atomic_read(&bcachecg->stats.cache_bypass_hits); 107 return atomic_read(&bcachecg->stats.cache_bypass_hits);
108 } 108 }
109 109
110 static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp, 110 static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp,
111 struct cftype *cft) 111 struct cftype *cft)
112 { 112 {
113 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); 113 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
114 return atomic_read(&bcachecg->stats.cache_bypass_misses); 114 return atomic_read(&bcachecg->stats.cache_bypass_misses);
115 } 115 }
116 116
117 static struct cftype bch_files[] = { 117 static struct cftype bch_files[] = {
118 { 118 {
119 .name = "cache_mode", 119 .name = "cache_mode",
120 .read = cache_mode_read, 120 .read = cache_mode_read,
121 .write_string = cache_mode_write, 121 .write_string = cache_mode_write,
122 }, 122 },
123 { 123 {
124 .name = "verify", 124 .name = "verify",
125 .read_u64 = bch_verify_read, 125 .read_u64 = bch_verify_read,
126 .write_u64 = bch_verify_write, 126 .write_u64 = bch_verify_write,
127 }, 127 },
128 { 128 {
129 .name = "cache_hits", 129 .name = "cache_hits",
130 .read_u64 = bch_cache_hits_read, 130 .read_u64 = bch_cache_hits_read,
131 }, 131 },
132 { 132 {
133 .name = "cache_misses", 133 .name = "cache_misses",
134 .read_u64 = bch_cache_misses_read, 134 .read_u64 = bch_cache_misses_read,
135 }, 135 },
136 { 136 {
137 .name = "cache_bypass_hits", 137 .name = "cache_bypass_hits",
138 .read_u64 = bch_cache_bypass_hits_read, 138 .read_u64 = bch_cache_bypass_hits_read,
139 }, 139 },
140 { 140 {
141 .name = "cache_bypass_misses", 141 .name = "cache_bypass_misses",
142 .read_u64 = bch_cache_bypass_misses_read, 142 .read_u64 = bch_cache_bypass_misses_read,
143 }, 143 },
144 { } /* terminate */ 144 { } /* terminate */
145 }; 145 };
146 146
147 static void init_bch_cgroup(struct bch_cgroup *cg) 147 static void init_bch_cgroup(struct bch_cgroup *cg)
148 { 148 {
149 cg->cache_mode = -1; 149 cg->cache_mode = -1;
150 } 150 }
151 151
152 static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup) 152 static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup)
153 { 153 {
154 struct bch_cgroup *cg; 154 struct bch_cgroup *cg;
155 155
156 cg = kzalloc(sizeof(*cg), GFP_KERNEL); 156 cg = kzalloc(sizeof(*cg), GFP_KERNEL);
157 if (!cg) 157 if (!cg)
158 return ERR_PTR(-ENOMEM); 158 return ERR_PTR(-ENOMEM);
159 init_bch_cgroup(cg); 159 init_bch_cgroup(cg);
160 return &cg->css; 160 return &cg->css;
161 } 161 }
162 162
163 static void bcachecg_destroy(struct cgroup *cgroup) 163 static void bcachecg_destroy(struct cgroup *cgroup)
164 { 164 {
165 struct bch_cgroup *cg = cgroup_to_bcache(cgroup); 165 struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
166 free_css_id(&bcache_subsys, &cg->css); 166 free_css_id(&bcache_subsys, &cg->css);
167 kfree(cg); 167 kfree(cg);
168 } 168 }
169 169
170 struct cgroup_subsys bcache_subsys = { 170 struct cgroup_subsys bcache_subsys = {
171 .create = bcachecg_create, 171 .create = bcachecg_create,
172 .destroy = bcachecg_destroy, 172 .destroy = bcachecg_destroy,
173 .subsys_id = bcache_subsys_id, 173 .subsys_id = bcache_subsys_id,
174 .name = "bcache", 174 .name = "bcache",
175 .module = THIS_MODULE, 175 .module = THIS_MODULE,
176 }; 176 };
177 EXPORT_SYMBOL_GPL(bcache_subsys); 177 EXPORT_SYMBOL_GPL(bcache_subsys);
178 #endif 178 #endif
179 179
180 static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) 180 static unsigned cache_mode(struct cached_dev *dc, struct bio *bio)
181 { 181 {
182 #ifdef CONFIG_CGROUP_BCACHE 182 #ifdef CONFIG_CGROUP_BCACHE
183 int r = bch_bio_to_cgroup(bio)->cache_mode; 183 int r = bch_bio_to_cgroup(bio)->cache_mode;
184 if (r >= 0) 184 if (r >= 0)
185 return r; 185 return r;
186 #endif 186 #endif
187 return BDEV_CACHE_MODE(&dc->sb); 187 return BDEV_CACHE_MODE(&dc->sb);
188 } 188 }
189 189
190 static bool verify(struct cached_dev *dc, struct bio *bio) 190 static bool verify(struct cached_dev *dc, struct bio *bio)
191 { 191 {
192 #ifdef CONFIG_CGROUP_BCACHE 192 #ifdef CONFIG_CGROUP_BCACHE
193 if (bch_bio_to_cgroup(bio)->verify) 193 if (bch_bio_to_cgroup(bio)->verify)
194 return true; 194 return true;
195 #endif 195 #endif
196 return dc->verify; 196 return dc->verify;
197 } 197 }
198 198
199 static void bio_csum(struct bio *bio, struct bkey *k) 199 static void bio_csum(struct bio *bio, struct bkey *k)
200 { 200 {
201 struct bio_vec *bv; 201 struct bio_vec *bv;
202 uint64_t csum = 0; 202 uint64_t csum = 0;
203 int i; 203 int i;
204 204
205 bio_for_each_segment(bv, bio, i) { 205 bio_for_each_segment(bv, bio, i) {
206 void *d = kmap(bv->bv_page) + bv->bv_offset; 206 void *d = kmap(bv->bv_page) + bv->bv_offset;
207 csum = bch_crc64_update(csum, d, bv->bv_len); 207 csum = bch_crc64_update(csum, d, bv->bv_len);
208 kunmap(bv->bv_page); 208 kunmap(bv->bv_page);
209 } 209 }
210 210
211 k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1); 211 k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1);
212 } 212 }
213 213
214 /* Insert data into cache */ 214 /* Insert data into cache */
215 215
216 static void bio_invalidate(struct closure *cl) 216 static void bio_invalidate(struct closure *cl)
217 { 217 {
218 struct btree_op *op = container_of(cl, struct btree_op, cl); 218 struct btree_op *op = container_of(cl, struct btree_op, cl);
219 struct bio *bio = op->cache_bio; 219 struct bio *bio = op->cache_bio;
220 220
221 pr_debug("invalidating %i sectors from %llu", 221 pr_debug("invalidating %i sectors from %llu",
222 bio_sectors(bio), (uint64_t) bio->bi_sector); 222 bio_sectors(bio), (uint64_t) bio->bi_sector);
223 223
224 while (bio_sectors(bio)) { 224 while (bio_sectors(bio)) {
225 unsigned len = min(bio_sectors(bio), 1U << 14); 225 unsigned len = min(bio_sectors(bio), 1U << 14);
226 226
227 if (bch_keylist_realloc(&op->keys, 0, op->c)) 227 if (bch_keylist_realloc(&op->keys, 0, op->c))
228 goto out; 228 goto out;
229 229
230 bio->bi_sector += len; 230 bio->bi_sector += len;
231 bio->bi_size -= len << 9; 231 bio->bi_size -= len << 9;
232 232
233 bch_keylist_add(&op->keys, 233 bch_keylist_add(&op->keys,
234 &KEY(op->inode, bio->bi_sector, len)); 234 &KEY(op->inode, bio->bi_sector, len));
235 } 235 }
236 236
237 op->insert_data_done = true; 237 op->insert_data_done = true;
238 bio_put(bio); 238 bio_put(bio);
239 out: 239 out:
240 continue_at(cl, bch_journal, bcache_wq); 240 continue_at(cl, bch_journal, bcache_wq);
241 } 241 }
242 242
243 struct open_bucket { 243 struct open_bucket {
244 struct list_head list; 244 struct list_head list;
245 struct task_struct *last; 245 struct task_struct *last;
246 unsigned sectors_free; 246 unsigned sectors_free;
247 BKEY_PADDED(key); 247 BKEY_PADDED(key);
248 }; 248 };
249 249
250 void bch_open_buckets_free(struct cache_set *c) 250 void bch_open_buckets_free(struct cache_set *c)
251 { 251 {
252 struct open_bucket *b; 252 struct open_bucket *b;
253 253
254 while (!list_empty(&c->data_buckets)) { 254 while (!list_empty(&c->data_buckets)) {
255 b = list_first_entry(&c->data_buckets, 255 b = list_first_entry(&c->data_buckets,
256 struct open_bucket, list); 256 struct open_bucket, list);
257 list_del(&b->list); 257 list_del(&b->list);
258 kfree(b); 258 kfree(b);
259 } 259 }
260 } 260 }
261 261
262 int bch_open_buckets_alloc(struct cache_set *c) 262 int bch_open_buckets_alloc(struct cache_set *c)
263 { 263 {
264 int i; 264 int i;
265 265
266 spin_lock_init(&c->data_bucket_lock); 266 spin_lock_init(&c->data_bucket_lock);
267 267
268 for (i = 0; i < 6; i++) { 268 for (i = 0; i < 6; i++) {
269 struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); 269 struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
270 if (!b) 270 if (!b)
271 return -ENOMEM; 271 return -ENOMEM;
272 272
273 list_add(&b->list, &c->data_buckets); 273 list_add(&b->list, &c->data_buckets);
274 } 274 }
275 275
276 return 0; 276 return 0;
277 } 277 }
278 278
279 /* 279 /*
280 * We keep multiple buckets open for writes, and try to segregate different 280 * We keep multiple buckets open for writes, and try to segregate different
281 * write streams for better cache utilization: first we look for a bucket where 281 * write streams for better cache utilization: first we look for a bucket where
282 * the last write to it was sequential with the current write, and failing that 282 * the last write to it was sequential with the current write, and failing that
283 * we look for a bucket that was last used by the same task. 283 * we look for a bucket that was last used by the same task.
284 * 284 *
285 * The idea is that if you've got multiple tasks pulling data into the cache at the 285 * The idea is that if you've got multiple tasks pulling data into the cache at the
286 * same time, you'll get better cache utilization if you try to segregate their 286 * same time, you'll get better cache utilization if you try to segregate their
287 * data and preserve locality. 287 * data and preserve locality.
288 * 288 *
289 * For example, say you're starting Firefox at the same time you're copying a 289 * For example, say you're starting Firefox at the same time you're copying a
290 * bunch of files. Firefox will likely end up being fairly hot and stay in the 290 * bunch of files. Firefox will likely end up being fairly hot and stay in the
291 * cache awhile, but the data you copied might not be; if you wrote all that 291 * cache awhile, but the data you copied might not be; if you wrote all that
292 * data to the same buckets it'd get invalidated at the same time. 292 * data to the same buckets it'd get invalidated at the same time.
293 * 293 *
294 * Both of those tasks will be doing fairly random IO so we can't rely on 294 * Both of those tasks will be doing fairly random IO so we can't rely on
295 * detecting sequential IO to segregate their data, but going off of the task 295 * detecting sequential IO to segregate their data, but going off of the task
296 * should be a sane heuristic. 296 * should be a sane heuristic.
297 */ 297 */
298 static struct open_bucket *pick_data_bucket(struct cache_set *c, 298 static struct open_bucket *pick_data_bucket(struct cache_set *c,
299 const struct bkey *search, 299 const struct bkey *search,
300 struct task_struct *task, 300 struct task_struct *task,
301 struct bkey *alloc) 301 struct bkey *alloc)
302 { 302 {
303 struct open_bucket *ret, *ret_task = NULL; 303 struct open_bucket *ret, *ret_task = NULL;
304 304
305 list_for_each_entry_reverse(ret, &c->data_buckets, list) 305 list_for_each_entry_reverse(ret, &c->data_buckets, list)
306 if (!bkey_cmp(&ret->key, search)) 306 if (!bkey_cmp(&ret->key, search))
307 goto found; 307 goto found;
308 else if (ret->last == task) 308 else if (ret->last == task)
309 ret_task = ret; 309 ret_task = ret;
310 310
311 ret = ret_task ?: list_first_entry(&c->data_buckets, 311 ret = ret_task ?: list_first_entry(&c->data_buckets,
312 struct open_bucket, list); 312 struct open_bucket, list);
313 found: 313 found:
314 if (!ret->sectors_free && KEY_PTRS(alloc)) { 314 if (!ret->sectors_free && KEY_PTRS(alloc)) {
315 ret->sectors_free = c->sb.bucket_size; 315 ret->sectors_free = c->sb.bucket_size;
316 bkey_copy(&ret->key, alloc); 316 bkey_copy(&ret->key, alloc);
317 bkey_init(alloc); 317 bkey_init(alloc);
318 } 318 }
319 319
320 if (!ret->sectors_free) 320 if (!ret->sectors_free)
321 ret = NULL; 321 ret = NULL;
322 322
323 return ret; 323 return ret;
324 } 324 }
325 325
326 /* 326 /*
327 * Allocates some space in the cache to write to, sets k to point to the newly 327 * Allocates some space in the cache to write to, sets k to point to the newly
328 * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the 328 * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the
329 * end of the newly allocated space). 329 * end of the newly allocated space).
330 * 330 *
331 * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many 331 * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many
332 * sectors were actually allocated. 332 * sectors were actually allocated.
333 * 333 *
334 * If s->writeback is true, will not fail. 334 * If s->writeback is true, will not fail.
335 */ 335 */
336 static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, 336 static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
337 struct search *s) 337 struct search *s)
338 { 338 {
339 struct cache_set *c = s->op.c; 339 struct cache_set *c = s->op.c;
340 struct open_bucket *b; 340 struct open_bucket *b;
341 BKEY_PADDED(key) alloc; 341 BKEY_PADDED(key) alloc;
342 struct closure cl, *w = NULL; 342 struct closure cl, *w = NULL;
343 unsigned i; 343 unsigned i;
344 344
345 if (s->writeback) { 345 if (s->writeback) {
346 closure_init_stack(&cl); 346 closure_init_stack(&cl);
347 w = &cl; 347 w = &cl;
348 } 348 }
349 349
350 /* 350 /*
351 * We might have to allocate a new bucket, which we can't do with a 351 * We might have to allocate a new bucket, which we can't do with a
352 * spinlock held. So if we have to allocate, we drop the lock, allocate 352 * spinlock held. So if we have to allocate, we drop the lock, allocate
353 * and then retry. KEY_PTRS() indicates whether alloc points to 353 * and then retry. KEY_PTRS() indicates whether alloc points to
354 * allocated bucket(s). 354 * allocated bucket(s).
355 */ 355 */
356 356
357 bkey_init(&alloc.key); 357 bkey_init(&alloc.key);
358 spin_lock(&c->data_bucket_lock); 358 spin_lock(&c->data_bucket_lock);
359 359
360 while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) { 360 while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) {
361 unsigned watermark = s->op.write_prio 361 unsigned watermark = s->op.write_prio
362 ? WATERMARK_MOVINGGC 362 ? WATERMARK_MOVINGGC
363 : WATERMARK_NONE; 363 : WATERMARK_NONE;
364 364
365 spin_unlock(&c->data_bucket_lock); 365 spin_unlock(&c->data_bucket_lock);
366 366
367 if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w)) 367 if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w))
368 return false; 368 return false;
369 369
370 spin_lock(&c->data_bucket_lock); 370 spin_lock(&c->data_bucket_lock);
371 } 371 }
372 372
373 /* 373 /*
374 * If we had to allocate, we might race and not need to allocate the 374 * If we had to allocate, we might race and not need to allocate the
375 * second time we call pick_data_bucket(). If we allocated a bucket but 375 * second time we call pick_data_bucket(). If we allocated a bucket but
376 * didn't use it, drop the refcount bch_bucket_alloc_set() took: 376 * didn't use it, drop the refcount bch_bucket_alloc_set() took:
377 */ 377 */
378 if (KEY_PTRS(&alloc.key)) 378 if (KEY_PTRS(&alloc.key))
379 __bkey_put(c, &alloc.key); 379 __bkey_put(c, &alloc.key);
380 380
381 for (i = 0; i < KEY_PTRS(&b->key); i++) 381 for (i = 0; i < KEY_PTRS(&b->key); i++)
382 EBUG_ON(ptr_stale(c, &b->key, i)); 382 EBUG_ON(ptr_stale(c, &b->key, i));
383 383
384 /* Set up the pointer to the space we're allocating: */ 384 /* Set up the pointer to the space we're allocating: */
385 385
386 for (i = 0; i < KEY_PTRS(&b->key); i++) 386 for (i = 0; i < KEY_PTRS(&b->key); i++)
387 k->ptr[i] = b->key.ptr[i]; 387 k->ptr[i] = b->key.ptr[i];
388 388
389 sectors = min(sectors, b->sectors_free); 389 sectors = min(sectors, b->sectors_free);
390 390
391 SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); 391 SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors);
392 SET_KEY_SIZE(k, sectors); 392 SET_KEY_SIZE(k, sectors);
393 SET_KEY_PTRS(k, KEY_PTRS(&b->key)); 393 SET_KEY_PTRS(k, KEY_PTRS(&b->key));
394 394
395 /* 395 /*
396 * Move b to the end of the lru, and keep track of what this bucket was 396 * Move b to the end of the lru, and keep track of what this bucket was
397 * last used for: 397 * last used for:
398 */ 398 */
399 list_move_tail(&b->list, &c->data_buckets); 399 list_move_tail(&b->list, &c->data_buckets);
400 bkey_copy_key(&b->key, k); 400 bkey_copy_key(&b->key, k);
401 b->last = s->task; 401 b->last = s->task;
402 402
403 b->sectors_free -= sectors; 403 b->sectors_free -= sectors;
404 404
405 for (i = 0; i < KEY_PTRS(&b->key); i++) { 405 for (i = 0; i < KEY_PTRS(&b->key); i++) {
406 SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); 406 SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors);
407 407
408 atomic_long_add(sectors, 408 atomic_long_add(sectors,
409 &PTR_CACHE(c, &b->key, i)->sectors_written); 409 &PTR_CACHE(c, &b->key, i)->sectors_written);
410 } 410 }
411 411
412 if (b->sectors_free < c->sb.block_size) 412 if (b->sectors_free < c->sb.block_size)
413 b->sectors_free = 0; 413 b->sectors_free = 0;
414 414
415 /* 415 /*
416 * k takes refcounts on the buckets it points to until it's inserted 416 * k takes refcounts on the buckets it points to until it's inserted
417 * into the btree, but if we're done with this bucket we just transfer 417 * into the btree, but if we're done with this bucket we just transfer
418 * get_data_bucket()'s refcount. 418 * get_data_bucket()'s refcount.
419 */ 419 */
420 if (b->sectors_free) 420 if (b->sectors_free)
421 for (i = 0; i < KEY_PTRS(&b->key); i++) 421 for (i = 0; i < KEY_PTRS(&b->key); i++)
422 atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); 422 atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin);
423 423
424 spin_unlock(&c->data_bucket_lock); 424 spin_unlock(&c->data_bucket_lock);
425 return true; 425 return true;
426 } 426 }
427 427
428 static void bch_insert_data_error(struct closure *cl) 428 static void bch_insert_data_error(struct closure *cl)
429 { 429 {
430 struct btree_op *op = container_of(cl, struct btree_op, cl); 430 struct btree_op *op = container_of(cl, struct btree_op, cl);
431 431
432 /* 432 /*
433 * Our data write just errored, which means we've got a bunch of keys to 433 * Our data write just errored, which means we've got a bunch of keys to
434 * insert that point to data that wasn't successfully written. 434 * insert that point to data that wasn't successfully written.
435 * 435 *
436 * We don't have to insert those keys but we still have to invalidate 436 * We don't have to insert those keys but we still have to invalidate
437 * that region of the cache - so, if we just strip off all the pointers 437 * that region of the cache - so, if we just strip off all the pointers
438 * from the keys we'll accomplish just that. 438 * from the keys we'll accomplish just that.
439 */ 439 */
440 440
441 struct bkey *src = op->keys.bottom, *dst = op->keys.bottom; 441 struct bkey *src = op->keys.bottom, *dst = op->keys.bottom;
442 442
443 while (src != op->keys.top) { 443 while (src != op->keys.top) {
444 struct bkey *n = bkey_next(src); 444 struct bkey *n = bkey_next(src);
445 445
446 SET_KEY_PTRS(src, 0); 446 SET_KEY_PTRS(src, 0);
447 bkey_copy(dst, src); 447 bkey_copy(dst, src);
448 448
449 dst = bkey_next(dst); 449 dst = bkey_next(dst);
450 src = n; 450 src = n;
451 } 451 }
452 452
453 op->keys.top = dst; 453 op->keys.top = dst;
454 454
455 bch_journal(cl); 455 bch_journal(cl);
456 } 456 }
457 457
458 static void bch_insert_data_endio(struct bio *bio, int error) 458 static void bch_insert_data_endio(struct bio *bio, int error)
459 { 459 {
460 struct closure *cl = bio->bi_private; 460 struct closure *cl = bio->bi_private;
461 struct btree_op *op = container_of(cl, struct btree_op, cl); 461 struct btree_op *op = container_of(cl, struct btree_op, cl);
462 struct search *s = container_of(op, struct search, op); 462 struct search *s = container_of(op, struct search, op);
463 463
464 if (error) { 464 if (error) {
465 /* TODO: We could try to recover from this. */ 465 /* TODO: We could try to recover from this. */
466 if (s->writeback) 466 if (s->writeback)
467 s->error = error; 467 s->error = error;
468 else if (s->write) 468 else if (s->write)
469 set_closure_fn(cl, bch_insert_data_error, bcache_wq); 469 set_closure_fn(cl, bch_insert_data_error, bcache_wq);
470 else 470 else
471 set_closure_fn(cl, NULL, NULL); 471 set_closure_fn(cl, NULL, NULL);
472 } 472 }
473 473
474 bch_bbio_endio(op->c, bio, error, "writing data to cache"); 474 bch_bbio_endio(op->c, bio, error, "writing data to cache");
475 } 475 }
476 476
477 static void bch_insert_data_loop(struct closure *cl) 477 static void bch_insert_data_loop(struct closure *cl)
478 { 478 {
479 struct btree_op *op = container_of(cl, struct btree_op, cl); 479 struct btree_op *op = container_of(cl, struct btree_op, cl);
480 struct search *s = container_of(op, struct search, op); 480 struct search *s = container_of(op, struct search, op);
481 struct bio *bio = op->cache_bio, *n; 481 struct bio *bio = op->cache_bio, *n;
482 482
483 if (op->skip) 483 if (op->skip)
484 return bio_invalidate(cl); 484 return bio_invalidate(cl);
485 485
486 if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { 486 if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
487 set_gc_sectors(op->c); 487 set_gc_sectors(op->c);
488 bch_queue_gc(op->c); 488 bch_queue_gc(op->c);
489 } 489 }
490 490
491 do { 491 do {
492 unsigned i; 492 unsigned i;
493 struct bkey *k; 493 struct bkey *k;
494 struct bio_set *split = s->d 494 struct bio_set *split = s->d
495 ? s->d->bio_split : op->c->bio_split; 495 ? s->d->bio_split : op->c->bio_split;
496 496
497 /* 1 for the device pointer and 1 for the chksum */ 497 /* 1 for the device pointer and 1 for the chksum */
498 if (bch_keylist_realloc(&op->keys, 498 if (bch_keylist_realloc(&op->keys,
499 1 + (op->csum ? 1 : 0), 499 1 + (op->csum ? 1 : 0),
500 op->c)) 500 op->c))
501 continue_at(cl, bch_journal, bcache_wq); 501 continue_at(cl, bch_journal, bcache_wq);
502 502
503 k = op->keys.top; 503 k = op->keys.top;
504 bkey_init(k); 504 bkey_init(k);
505 SET_KEY_INODE(k, op->inode); 505 SET_KEY_INODE(k, op->inode);
506 SET_KEY_OFFSET(k, bio->bi_sector); 506 SET_KEY_OFFSET(k, bio->bi_sector);
507 507
508 if (!bch_alloc_sectors(k, bio_sectors(bio), s)) 508 if (!bch_alloc_sectors(k, bio_sectors(bio), s))
509 goto err; 509 goto err;
510 510
511 n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); 511 n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
512 if (!n) {
513 __bkey_put(op->c, k);
514 continue_at(cl, bch_insert_data_loop, bcache_wq);
515 }
516 512
517 n->bi_end_io = bch_insert_data_endio; 513 n->bi_end_io = bch_insert_data_endio;
518 n->bi_private = cl; 514 n->bi_private = cl;
519 515
520 if (s->writeback) { 516 if (s->writeback) {
521 SET_KEY_DIRTY(k, true); 517 SET_KEY_DIRTY(k, true);
522 518
523 for (i = 0; i < KEY_PTRS(k); i++) 519 for (i = 0; i < KEY_PTRS(k); i++)
524 SET_GC_MARK(PTR_BUCKET(op->c, k, i), 520 SET_GC_MARK(PTR_BUCKET(op->c, k, i),
525 GC_MARK_DIRTY); 521 GC_MARK_DIRTY);
526 } 522 }
527 523
528 SET_KEY_CSUM(k, op->csum); 524 SET_KEY_CSUM(k, op->csum);
529 if (KEY_CSUM(k)) 525 if (KEY_CSUM(k))
530 bio_csum(n, k); 526 bio_csum(n, k);
531 527
532 trace_bcache_cache_insert(k); 528 trace_bcache_cache_insert(k);
533 bch_keylist_push(&op->keys); 529 bch_keylist_push(&op->keys);
534 530
535 n->bi_rw |= REQ_WRITE; 531 n->bi_rw |= REQ_WRITE;
536 bch_submit_bbio(n, op->c, k, 0); 532 bch_submit_bbio(n, op->c, k, 0);
537 } while (n != bio); 533 } while (n != bio);
538 534
539 op->insert_data_done = true; 535 op->insert_data_done = true;
540 continue_at(cl, bch_journal, bcache_wq); 536 continue_at(cl, bch_journal, bcache_wq);
541 err: 537 err:
542 /* bch_alloc_sectors() blocks if s->writeback = true */ 538 /* bch_alloc_sectors() blocks if s->writeback = true */
543 BUG_ON(s->writeback); 539 BUG_ON(s->writeback);
544 540
545 /* 541 /*
546 * But if it's not a writeback write we'd rather just bail out if 542 * But if it's not a writeback write we'd rather just bail out if
547 * there aren't any buckets ready to write to - it might take a while and 543 * there aren't any buckets ready to write to - it might take a while and
548 * we might be starving btree writes for gc or something. 544 * we might be starving btree writes for gc or something.
549 */ 545 */
550 546
551 if (s->write) { 547 if (s->write) {
552 /* 548 /*
553 * Writethrough write: We can't complete the write until we've 549 * Writethrough write: We can't complete the write until we've
554 * updated the index. But we don't want to delay the write while 550 * updated the index. But we don't want to delay the write while
555 * we wait for buckets to be freed up, so just invalidate the 551 * we wait for buckets to be freed up, so just invalidate the
556 * rest of the write. 552 * rest of the write.
557 */ 553 */
558 op->skip = true; 554 op->skip = true;
559 return bio_invalidate(cl); 555 return bio_invalidate(cl);
560 } else { 556 } else {
561 /* 557 /*
562 * From a cache miss, we can just insert the keys for the data 558 * From a cache miss, we can just insert the keys for the data
563 * we have written or bail out if we didn't do anything. 559 * we have written or bail out if we didn't do anything.
564 */ 560 */
565 op->insert_data_done = true; 561 op->insert_data_done = true;
566 bio_put(bio); 562 bio_put(bio);
567 563
568 if (!bch_keylist_empty(&op->keys)) 564 if (!bch_keylist_empty(&op->keys))
569 continue_at(cl, bch_journal, bcache_wq); 565 continue_at(cl, bch_journal, bcache_wq);
570 else 566 else
571 closure_return(cl); 567 closure_return(cl);
572 } 568 }
573 } 569 }
574 570
575 /** 571 /**
576 * bch_insert_data - stick some data in the cache 572 * bch_insert_data - stick some data in the cache
577 * 573 *
578 * This is the starting point for any data to end up in a cache device; it could 574 * This is the starting point for any data to end up in a cache device; it could
579 * be from a normal write, or a writeback write, or a write to a flash only 575 * be from a normal write, or a writeback write, or a write to a flash only
580 * volume - it's also used by the moving garbage collector to compact data in 576 * volume - it's also used by the moving garbage collector to compact data in
581 * mostly empty buckets. 577 * mostly empty buckets.
582 * 578 *
583 * It first writes the data to the cache, creating a list of keys to be inserted 579 * It first writes the data to the cache, creating a list of keys to be inserted
584 * (if the data had to be fragmented there will be multiple keys); after the 580 * (if the data had to be fragmented there will be multiple keys); after the
585 * data is written it calls bch_journal, and after the keys have been added to 581 * data is written it calls bch_journal, and after the keys have been added to
586 * the next journal write they're inserted into the btree. 582 * the next journal write they're inserted into the btree.
587 * 583 *
588 * It inserts the data in op->cache_bio; bi_sector is used for the key offset, 584 * It inserts the data in op->cache_bio; bi_sector is used for the key offset,
589 * and op->inode is used for the key inode. 585 * and op->inode is used for the key inode.
590 * 586 *
591 * If op->skip is true, instead of inserting the data it invalidates the region 587 * If op->skip is true, instead of inserting the data it invalidates the region
592 * of the cache represented by op->cache_bio and op->inode. 588 * of the cache represented by op->cache_bio and op->inode.
593 */ 589 */
594 void bch_insert_data(struct closure *cl) 590 void bch_insert_data(struct closure *cl)
595 { 591 {
596 struct btree_op *op = container_of(cl, struct btree_op, cl); 592 struct btree_op *op = container_of(cl, struct btree_op, cl);
597 593
598 bch_keylist_init(&op->keys); 594 bch_keylist_init(&op->keys);
599 bio_get(op->cache_bio); 595 bio_get(op->cache_bio);
600 bch_insert_data_loop(cl); 596 bch_insert_data_loop(cl);
601 } 597 }
602 598
603 void bch_btree_insert_async(struct closure *cl) 599 void bch_btree_insert_async(struct closure *cl)
604 { 600 {
605 struct btree_op *op = container_of(cl, struct btree_op, cl); 601 struct btree_op *op = container_of(cl, struct btree_op, cl);
606 struct search *s = container_of(op, struct search, op); 602 struct search *s = container_of(op, struct search, op);
607 603
608 if (bch_btree_insert(op, op->c)) { 604 if (bch_btree_insert(op, op->c)) {
609 s->error = -ENOMEM; 605 s->error = -ENOMEM;
610 op->insert_data_done = true; 606 op->insert_data_done = true;
611 } 607 }
612 608
613 if (op->insert_data_done) { 609 if (op->insert_data_done) {
614 bch_keylist_free(&op->keys); 610 bch_keylist_free(&op->keys);
615 closure_return(cl); 611 closure_return(cl);
616 } else 612 } else
617 continue_at(cl, bch_insert_data_loop, bcache_wq); 613 continue_at(cl, bch_insert_data_loop, bcache_wq);
618 } 614 }
619 615
620 /* Common code for the make_request functions */ 616 /* Common code for the make_request functions */
621 617
622 static void request_endio(struct bio *bio, int error) 618 static void request_endio(struct bio *bio, int error)
623 { 619 {
624 struct closure *cl = bio->bi_private; 620 struct closure *cl = bio->bi_private;
625 621
626 if (error) { 622 if (error) {
627 struct search *s = container_of(cl, struct search, cl); 623 struct search *s = container_of(cl, struct search, cl);
628 s->error = error; 624 s->error = error;
629 /* Only cache read errors are recoverable */ 625 /* Only cache read errors are recoverable */
630 s->recoverable = false; 626 s->recoverable = false;
631 } 627 }
632 628
633 bio_put(bio); 629 bio_put(bio);
634 closure_put(cl); 630 closure_put(cl);
635 } 631 }
636 632
637 void bch_cache_read_endio(struct bio *bio, int error) 633 void bch_cache_read_endio(struct bio *bio, int error)
638 { 634 {
639 struct bbio *b = container_of(bio, struct bbio, bio); 635 struct bbio *b = container_of(bio, struct bbio, bio);
640 struct closure *cl = bio->bi_private; 636 struct closure *cl = bio->bi_private;
641 struct search *s = container_of(cl, struct search, cl); 637 struct search *s = container_of(cl, struct search, cl);
642 638
643 /* 639 /*
644 * If the bucket was reused while our bio was in flight, we might have 640 * If the bucket was reused while our bio was in flight, we might have
645 * read the wrong data. Set s->error but not error so it doesn't get 641 * read the wrong data. Set s->error but not error so it doesn't get
646 * counted against the cache device, but we'll still reread the data 642 * counted against the cache device, but we'll still reread the data
647 * from the backing device. 643 * from the backing device.
648 */ 644 */
649 645
650 if (error) 646 if (error)
651 s->error = error; 647 s->error = error;
652 else if (ptr_stale(s->op.c, &b->key, 0)) { 648 else if (ptr_stale(s->op.c, &b->key, 0)) {
653 atomic_long_inc(&s->op.c->cache_read_races); 649 atomic_long_inc(&s->op.c->cache_read_races);
654 s->error = -EINTR; 650 s->error = -EINTR;
655 } 651 }
656 652
657 bch_bbio_endio(s->op.c, bio, error, "reading from cache"); 653 bch_bbio_endio(s->op.c, bio, error, "reading from cache");
658 } 654 }
659 655
660 static void bio_complete(struct search *s) 656 static void bio_complete(struct search *s)
661 { 657 {
662 if (s->orig_bio) { 658 if (s->orig_bio) {
663 int cpu, rw = bio_data_dir(s->orig_bio); 659 int cpu, rw = bio_data_dir(s->orig_bio);
664 unsigned long duration = jiffies - s->start_time; 660 unsigned long duration = jiffies - s->start_time;
665 661
666 cpu = part_stat_lock(); 662 cpu = part_stat_lock();
667 part_round_stats(cpu, &s->d->disk->part0); 663 part_round_stats(cpu, &s->d->disk->part0);
668 part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration); 664 part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration);
669 part_stat_unlock(); 665 part_stat_unlock();
670 666
671 trace_bcache_request_end(s, s->orig_bio); 667 trace_bcache_request_end(s, s->orig_bio);
672 bio_endio(s->orig_bio, s->error); 668 bio_endio(s->orig_bio, s->error);
673 s->orig_bio = NULL; 669 s->orig_bio = NULL;
674 } 670 }
675 } 671 }
676 672
677 static void do_bio_hook(struct search *s) 673 static void do_bio_hook(struct search *s)
678 { 674 {
679 struct bio *bio = &s->bio.bio; 675 struct bio *bio = &s->bio.bio;
680 memcpy(bio, s->orig_bio, sizeof(struct bio)); 676 memcpy(bio, s->orig_bio, sizeof(struct bio));
681 677
682 bio->bi_end_io = request_endio; 678 bio->bi_end_io = request_endio;
683 bio->bi_private = &s->cl; 679 bio->bi_private = &s->cl;
684 atomic_set(&bio->bi_cnt, 3); 680 atomic_set(&bio->bi_cnt, 3);
685 } 681 }
686 682
687 static void search_free(struct closure *cl) 683 static void search_free(struct closure *cl)
688 { 684 {
689 struct search *s = container_of(cl, struct search, cl); 685 struct search *s = container_of(cl, struct search, cl);
690 bio_complete(s); 686 bio_complete(s);
691 687
692 if (s->op.cache_bio) 688 if (s->op.cache_bio)
693 bio_put(s->op.cache_bio); 689 bio_put(s->op.cache_bio);
694 690
695 if (s->unaligned_bvec) 691 if (s->unaligned_bvec)
696 mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); 692 mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec);
697 693
698 closure_debug_destroy(cl); 694 closure_debug_destroy(cl);
699 mempool_free(s, s->d->c->search); 695 mempool_free(s, s->d->c->search);
700 } 696 }
701 697
702 static struct search *search_alloc(struct bio *bio, struct bcache_device *d) 698 static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
703 { 699 {
704 struct bio_vec *bv; 700 struct bio_vec *bv;
705 struct search *s = mempool_alloc(d->c->search, GFP_NOIO); 701 struct search *s = mempool_alloc(d->c->search, GFP_NOIO);
706 memset(s, 0, offsetof(struct search, op.keys)); 702 memset(s, 0, offsetof(struct search, op.keys));
707 703
708 __closure_init(&s->cl, NULL); 704 __closure_init(&s->cl, NULL);
709 705
710 s->op.inode = d->id; 706 s->op.inode = d->id;
711 s->op.c = d->c; 707 s->op.c = d->c;
712 s->d = d; 708 s->d = d;
713 s->op.lock = -1; 709 s->op.lock = -1;
714 s->task = current; 710 s->task = current;
715 s->orig_bio = bio; 711 s->orig_bio = bio;
716 s->write = (bio->bi_rw & REQ_WRITE) != 0; 712 s->write = (bio->bi_rw & REQ_WRITE) != 0;
717 s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0; 713 s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0;
718 s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; 714 s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0;
719 s->recoverable = 1; 715 s->recoverable = 1;
720 s->start_time = jiffies; 716 s->start_time = jiffies;
721 do_bio_hook(s); 717 do_bio_hook(s);
722 718
723 if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) { 719 if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) {
724 bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO); 720 bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO);
725 memcpy(bv, bio_iovec(bio), 721 memcpy(bv, bio_iovec(bio),
726 sizeof(struct bio_vec) * bio_segments(bio)); 722 sizeof(struct bio_vec) * bio_segments(bio));
727 723
728 s->bio.bio.bi_io_vec = bv; 724 s->bio.bio.bi_io_vec = bv;
729 s->unaligned_bvec = 1; 725 s->unaligned_bvec = 1;
730 } 726 }
731 727
732 return s; 728 return s;
733 } 729 }
734 730
735 static void btree_read_async(struct closure *cl) 731 static void btree_read_async(struct closure *cl)
736 { 732 {
737 struct btree_op *op = container_of(cl, struct btree_op, cl); 733 struct btree_op *op = container_of(cl, struct btree_op, cl);
738 734
739 int ret = btree_root(search_recurse, op->c, op); 735 int ret = btree_root(search_recurse, op->c, op);
740 736
741 if (ret == -EAGAIN) 737 if (ret == -EAGAIN)
742 continue_at(cl, btree_read_async, bcache_wq); 738 continue_at(cl, btree_read_async, bcache_wq);
743 739
744 closure_return(cl); 740 closure_return(cl);
745 } 741 }
746 742
747 /* Cached devices */ 743 /* Cached devices */
748 744
749 static void cached_dev_bio_complete(struct closure *cl) 745 static void cached_dev_bio_complete(struct closure *cl)
750 { 746 {
751 struct search *s = container_of(cl, struct search, cl); 747 struct search *s = container_of(cl, struct search, cl);
752 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 748 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
753 749
754 search_free(cl); 750 search_free(cl);
755 cached_dev_put(dc); 751 cached_dev_put(dc);
756 } 752 }
757 753
758 /* Process reads */ 754 /* Process reads */
759 755
760 static void cached_dev_read_complete(struct closure *cl) 756 static void cached_dev_read_complete(struct closure *cl)
761 { 757 {
762 struct search *s = container_of(cl, struct search, cl); 758 struct search *s = container_of(cl, struct search, cl);
763 759
764 if (s->op.insert_collision) 760 if (s->op.insert_collision)
765 bch_mark_cache_miss_collision(s); 761 bch_mark_cache_miss_collision(s);
766 762
767 if (s->op.cache_bio) { 763 if (s->op.cache_bio) {
768 int i; 764 int i;
769 struct bio_vec *bv; 765 struct bio_vec *bv;
770 766
771 __bio_for_each_segment(bv, s->op.cache_bio, i, 0) 767 __bio_for_each_segment(bv, s->op.cache_bio, i, 0)
772 __free_page(bv->bv_page); 768 __free_page(bv->bv_page);
773 } 769 }
774 770
775 cached_dev_bio_complete(cl); 771 cached_dev_bio_complete(cl);
776 } 772 }
777 773
778 static void request_read_error(struct closure *cl) 774 static void request_read_error(struct closure *cl)
779 { 775 {
780 struct search *s = container_of(cl, struct search, cl); 776 struct search *s = container_of(cl, struct search, cl);
781 struct bio_vec *bv; 777 struct bio_vec *bv;
782 int i; 778 int i;
783 779
784 if (s->recoverable) { 780 if (s->recoverable) {
785 /* Retry from the backing device: */ 781 /* Retry from the backing device: */
786 trace_bcache_read_retry(s->orig_bio); 782 trace_bcache_read_retry(s->orig_bio);
787 783
788 s->error = 0; 784 s->error = 0;
789 bv = s->bio.bio.bi_io_vec; 785 bv = s->bio.bio.bi_io_vec;
790 do_bio_hook(s); 786 do_bio_hook(s);
791 s->bio.bio.bi_io_vec = bv; 787 s->bio.bio.bi_io_vec = bv;
792 788
793 if (!s->unaligned_bvec) 789 if (!s->unaligned_bvec)
794 bio_for_each_segment(bv, s->orig_bio, i) 790 bio_for_each_segment(bv, s->orig_bio, i)
795 bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; 791 bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
796 else 792 else
797 memcpy(s->bio.bio.bi_io_vec, 793 memcpy(s->bio.bio.bi_io_vec,
798 bio_iovec(s->orig_bio), 794 bio_iovec(s->orig_bio),
799 sizeof(struct bio_vec) * 795 sizeof(struct bio_vec) *
800 bio_segments(s->orig_bio)); 796 bio_segments(s->orig_bio));
801 797
802 /* XXX: invalidate cache */ 798 /* XXX: invalidate cache */
803 799
804 closure_bio_submit(&s->bio.bio, &s->cl, s->d); 800 closure_bio_submit(&s->bio.bio, &s->cl, s->d);
805 } 801 }
806 802
807 continue_at(cl, cached_dev_read_complete, NULL); 803 continue_at(cl, cached_dev_read_complete, NULL);
808 } 804 }
809 805
810 static void request_read_done(struct closure *cl) 806 static void request_read_done(struct closure *cl)
811 { 807 {
812 struct search *s = container_of(cl, struct search, cl); 808 struct search *s = container_of(cl, struct search, cl);
813 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 809 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
814 810
815 /* 811 /*
816 * s->cache_bio != NULL implies that we had a cache miss; cache_bio now 812 * s->cache_bio != NULL implies that we had a cache miss; cache_bio now
817 * contains data ready to be inserted into the cache. 813 * contains data ready to be inserted into the cache.
818 * 814 *
819 * First, we copy the data we just read from cache_bio's bounce buffers 815 * First, we copy the data we just read from cache_bio's bounce buffers
820 * to the buffers the original bio pointed to: 816 * to the buffers the original bio pointed to:
821 */ 817 */
822 818
823 if (s->op.cache_bio) { 819 if (s->op.cache_bio) {
824 struct bio_vec *src, *dst;
825 unsigned src_offset, dst_offset, bytes;
826 void *dst_ptr;
827
828 bio_reset(s->op.cache_bio); 820 bio_reset(s->op.cache_bio);
829 s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; 821 s->op.cache_bio->bi_sector = s->cache_miss->bi_sector;
830 s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; 822 s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev;
831 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; 823 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9;
832 bch_bio_map(s->op.cache_bio, NULL); 824 bch_bio_map(s->op.cache_bio, NULL);
833 825
834 src = bio_iovec(s->op.cache_bio); 826 bio_copy_data(s->cache_miss, s->op.cache_bio);
835 dst = bio_iovec(s->cache_miss);
836 src_offset = src->bv_offset;
837 dst_offset = dst->bv_offset;
838 dst_ptr = kmap(dst->bv_page);
839 827
840 while (1) {
841 if (dst_offset == dst->bv_offset + dst->bv_len) {
842 kunmap(dst->bv_page);
843 dst++;
844 if (dst == bio_iovec_idx(s->cache_miss,
845 s->cache_miss->bi_vcnt))
846 break;
847
848 dst_offset = dst->bv_offset;
849 dst_ptr = kmap(dst->bv_page);
850 }
851
852 if (src_offset == src->bv_offset + src->bv_len) {
853 src++;
854 if (src == bio_iovec_idx(s->op.cache_bio,
855 s->op.cache_bio->bi_vcnt))
856 BUG();
857
858 src_offset = src->bv_offset;
859 }
860
861 bytes = min(dst->bv_offset + dst->bv_len - dst_offset,
862 src->bv_offset + src->bv_len - src_offset);
863
864 memcpy(dst_ptr + dst_offset,
865 page_address(src->bv_page) + src_offset,
866 bytes);
867
868 src_offset += bytes;
869 dst_offset += bytes;
870 }
871
872 bio_put(s->cache_miss); 828 bio_put(s->cache_miss);
873 s->cache_miss = NULL; 829 s->cache_miss = NULL;
874 } 830 }
875 831
876 if (verify(dc, &s->bio.bio) && s->recoverable) 832 if (verify(dc, &s->bio.bio) && s->recoverable)
877 bch_data_verify(s); 833 bch_data_verify(s);
878 834
879 bio_complete(s); 835 bio_complete(s);
880 836
881 if (s->op.cache_bio && 837 if (s->op.cache_bio &&
882 !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) { 838 !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) {
883 s->op.type = BTREE_REPLACE; 839 s->op.type = BTREE_REPLACE;
884 closure_call(&s->op.cl, bch_insert_data, NULL, cl); 840 closure_call(&s->op.cl, bch_insert_data, NULL, cl);
885 } 841 }
886 842
887 continue_at(cl, cached_dev_read_complete, NULL); 843 continue_at(cl, cached_dev_read_complete, NULL);
888 } 844 }
889 845
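For context, the long kmap()/memcpy() loop deleted above is replaced by a single call to the block layer's bio_copy_data(dst, src), which walks the two bio_vec arrays independently, so the bounce bio and the original miss bio may have different segment layouts as long as they describe the same byte count. A minimal usage sketch combining the two helpers this commit leans on (hypothetical wrapper, not code from the commit; assumes both bios already have bi_size and their bio_vecs set up, e.g. via bch_bio_map()):

	/*
	 * Populate a bounce bio with its own pages, then duplicate the
	 * source bio's payload into it.
	 */
	static int clone_into_bounce(struct bio *bounce, struct bio *src)
	{
		if (bio_alloc_pages(bounce, GFP_NOIO))
			return -ENOMEM;

		bio_copy_data(bounce, src);	/* destination first, then source */
		return 0;
	}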
890 static void request_read_done_bh(struct closure *cl) 846 static void request_read_done_bh(struct closure *cl)
891 { 847 {
892 struct search *s = container_of(cl, struct search, cl); 848 struct search *s = container_of(cl, struct search, cl);
893 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 849 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
894 850
895 bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); 851 bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip);
896 trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip); 852 trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip);
897 853
898 if (s->error) 854 if (s->error)
899 continue_at_nobarrier(cl, request_read_error, bcache_wq); 855 continue_at_nobarrier(cl, request_read_error, bcache_wq);
900 else if (s->op.cache_bio || verify(dc, &s->bio.bio)) 856 else if (s->op.cache_bio || verify(dc, &s->bio.bio))
901 continue_at_nobarrier(cl, request_read_done, bcache_wq); 857 continue_at_nobarrier(cl, request_read_done, bcache_wq);
902 else 858 else
903 continue_at_nobarrier(cl, cached_dev_read_complete, NULL); 859 continue_at_nobarrier(cl, cached_dev_read_complete, NULL);
904 } 860 }
905 861
906 static int cached_dev_cache_miss(struct btree *b, struct search *s, 862 static int cached_dev_cache_miss(struct btree *b, struct search *s,
907 struct bio *bio, unsigned sectors) 863 struct bio *bio, unsigned sectors)
908 { 864 {
909 int ret = 0; 865 int ret = 0;
910 unsigned reada; 866 unsigned reada;
911 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 867 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
912 struct bio *miss; 868 struct bio *miss;
913 869
914 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); 870 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
915 if (!miss)
916 return -EAGAIN;
917
918 if (miss == bio) 871 if (miss == bio)
919 s->op.lookup_done = true; 872 s->op.lookup_done = true;
920 873
921 miss->bi_end_io = request_endio; 874 miss->bi_end_io = request_endio;
922 miss->bi_private = &s->cl; 875 miss->bi_private = &s->cl;
923 876
924 if (s->cache_miss || s->op.skip) 877 if (s->cache_miss || s->op.skip)
925 goto out_submit; 878 goto out_submit;
926 879
927 if (miss != bio || 880 if (miss != bio ||
928 (bio->bi_rw & REQ_RAHEAD) || 881 (bio->bi_rw & REQ_RAHEAD) ||
929 (bio->bi_rw & REQ_META) || 882 (bio->bi_rw & REQ_META) ||
930 s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA) 883 s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA)
931 reada = 0; 884 reada = 0;
932 else { 885 else {
933 reada = min(dc->readahead >> 9, 886 reada = min(dc->readahead >> 9,
934 sectors - bio_sectors(miss)); 887 sectors - bio_sectors(miss));
935 888
936 if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev)) 889 if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev))
937 reada = bdev_sectors(miss->bi_bdev) - bio_end(miss); 890 reada = bdev_sectors(miss->bi_bdev) -
891 bio_end_sector(miss);
938 } 892 }
939 893
940 s->cache_bio_sectors = bio_sectors(miss) + reada; 894 s->cache_bio_sectors = bio_sectors(miss) + reada;
941 s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT, 895 s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT,
942 DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS), 896 DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS),
943 dc->disk.bio_split); 897 dc->disk.bio_split);
944 898
945 if (!s->op.cache_bio) 899 if (!s->op.cache_bio)
946 goto out_submit; 900 goto out_submit;
947 901
948 s->op.cache_bio->bi_sector = miss->bi_sector; 902 s->op.cache_bio->bi_sector = miss->bi_sector;
949 s->op.cache_bio->bi_bdev = miss->bi_bdev; 903 s->op.cache_bio->bi_bdev = miss->bi_bdev;
950 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; 904 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9;
951 905
952 s->op.cache_bio->bi_end_io = request_endio; 906 s->op.cache_bio->bi_end_io = request_endio;
953 s->op.cache_bio->bi_private = &s->cl; 907 s->op.cache_bio->bi_private = &s->cl;
954 908
955 /* btree_search_recurse()'s btree iterator is no good anymore */ 909 /* btree_search_recurse()'s btree iterator is no good anymore */
956 ret = -EINTR; 910 ret = -EINTR;
957 if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio)) 911 if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio))
958 goto out_put; 912 goto out_put;
959 913
960 bch_bio_map(s->op.cache_bio, NULL); 914 bch_bio_map(s->op.cache_bio, NULL);
961 if (bch_bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) 915 if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO))
962 goto out_put; 916 goto out_put;
963 917
964 s->cache_miss = miss; 918 s->cache_miss = miss;
965 bio_get(s->op.cache_bio); 919 bio_get(s->op.cache_bio);
966 920
967 closure_bio_submit(s->op.cache_bio, &s->cl, s->d); 921 closure_bio_submit(s->op.cache_bio, &s->cl, s->d);
968 922
969 return ret; 923 return ret;
970 out_put: 924 out_put:
971 bio_put(s->op.cache_bio); 925 bio_put(s->op.cache_bio);
972 s->op.cache_bio = NULL; 926 s->op.cache_bio = NULL;
973 out_submit: 927 out_submit:
974 closure_bio_submit(miss, &s->cl, s->d); 928 closure_bio_submit(miss, &s->cl, s->d);
975 return ret; 929 return ret;
976 } 930 }
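The readahead sizing in cached_dev_cache_miss() is two clamps: read ahead no more than the configured budget or the part of the request this split did not cover, and never past the end of the backing device. The same arithmetic as a standalone helper, a sketch only (sector counts as plain uint64_t; all identifiers are illustrative, and as in the kernel code the miss itself is assumed to end on the device):

	#include <stdint.h>

	/* Clamp a readahead amount (in sectors), mirroring the logic above. */
	static uint64_t clamp_readahead(uint64_t readahead_sectors,
					uint64_t request_sectors,
					uint64_t miss_sectors,
					uint64_t miss_end_sector,
					uint64_t device_sectors)
	{
		uint64_t reada = readahead_sectors;

		/* no more than what the caller originally asked to read */
		if (reada > request_sectors - miss_sectors)
			reada = request_sectors - miss_sectors;

		/* and never past the end of the backing device */
		if (miss_end_sector + reada > device_sectors)
			reada = device_sectors - miss_end_sector;

		return reada;
	}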
977 931
978 static void request_read(struct cached_dev *dc, struct search *s) 932 static void request_read(struct cached_dev *dc, struct search *s)
979 { 933 {
980 struct closure *cl = &s->cl; 934 struct closure *cl = &s->cl;
981 935
982 check_should_skip(dc, s); 936 check_should_skip(dc, s);
983 closure_call(&s->op.cl, btree_read_async, NULL, cl); 937 closure_call(&s->op.cl, btree_read_async, NULL, cl);
984 938
985 continue_at(cl, request_read_done_bh, NULL); 939 continue_at(cl, request_read_done_bh, NULL);
986 } 940 }
987 941
988 /* Process writes */ 942 /* Process writes */
989 943
990 static void cached_dev_write_complete(struct closure *cl) 944 static void cached_dev_write_complete(struct closure *cl)
991 { 945 {
992 struct search *s = container_of(cl, struct search, cl); 946 struct search *s = container_of(cl, struct search, cl);
993 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 947 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
994 948
995 up_read_non_owner(&dc->writeback_lock); 949 up_read_non_owner(&dc->writeback_lock);
996 cached_dev_bio_complete(cl); 950 cached_dev_bio_complete(cl);
997 } 951 }
998 952
999 static void request_write(struct cached_dev *dc, struct search *s) 953 static void request_write(struct cached_dev *dc, struct search *s)
1000 { 954 {
1001 struct closure *cl = &s->cl; 955 struct closure *cl = &s->cl;
1002 struct bio *bio = &s->bio.bio; 956 struct bio *bio = &s->bio.bio;
1003 struct bkey start, end; 957 struct bkey start, end;
1004 start = KEY(dc->disk.id, bio->bi_sector, 0); 958 start = KEY(dc->disk.id, bio->bi_sector, 0);
1005 end = KEY(dc->disk.id, bio_end(bio), 0); 959 end = KEY(dc->disk.id, bio_end_sector(bio), 0);
1006 960
1007 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); 961 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end);
1008 962
1009 check_should_skip(dc, s); 963 check_should_skip(dc, s);
1010 down_read_non_owner(&dc->writeback_lock); 964 down_read_non_owner(&dc->writeback_lock);
1011 965
1012 if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { 966 if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) {
1013 s->op.skip = false; 967 s->op.skip = false;
1014 s->writeback = true; 968 s->writeback = true;
1015 } 969 }
1016 970
1017 if (bio->bi_rw & REQ_DISCARD) 971 if (bio->bi_rw & REQ_DISCARD)
1018 goto skip; 972 goto skip;
1019 973
1020 if (should_writeback(dc, s->orig_bio, 974 if (should_writeback(dc, s->orig_bio,
1021 cache_mode(dc, bio), 975 cache_mode(dc, bio),
1022 s->op.skip)) { 976 s->op.skip)) {
1023 s->op.skip = false; 977 s->op.skip = false;
1024 s->writeback = true; 978 s->writeback = true;
1025 } 979 }
1026 980
1027 if (s->op.skip) 981 if (s->op.skip)
1028 goto skip; 982 goto skip;
1029 983
1030 trace_bcache_write(s->orig_bio, s->writeback, s->op.skip); 984 trace_bcache_write(s->orig_bio, s->writeback, s->op.skip);
1031 985
1032 if (!s->writeback) { 986 if (!s->writeback) {
1033 s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, 987 s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
1034 dc->disk.bio_split); 988 dc->disk.bio_split);
1035 989
1036 closure_bio_submit(bio, cl, s->d); 990 closure_bio_submit(bio, cl, s->d);
1037 } else { 991 } else {
1038 bch_writeback_add(dc); 992 bch_writeback_add(dc);
1039 993
1040 if (s->op.flush_journal) { 994 if (s->op.flush_journal) {
1041 /* Also need to send a flush to the backing device */ 995 /* Also need to send a flush to the backing device */
1042 s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, 996 s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
1043 dc->disk.bio_split); 997 dc->disk.bio_split);
1044 998
1045 bio->bi_size = 0; 999 bio->bi_size = 0;
1046 bio->bi_vcnt = 0; 1000 bio->bi_vcnt = 0;
1047 closure_bio_submit(bio, cl, s->d); 1001 closure_bio_submit(bio, cl, s->d);
1048 } else { 1002 } else {
1049 s->op.cache_bio = bio; 1003 s->op.cache_bio = bio;
1050 } 1004 }
1051 } 1005 }
1052 out: 1006 out:
1053 closure_call(&s->op.cl, bch_insert_data, NULL, cl); 1007 closure_call(&s->op.cl, bch_insert_data, NULL, cl);
1054 continue_at(cl, cached_dev_write_complete, NULL); 1008 continue_at(cl, cached_dev_write_complete, NULL);
1055 skip: 1009 skip:
1056 s->op.skip = true; 1010 s->op.skip = true;
1057 s->op.cache_bio = s->orig_bio; 1011 s->op.cache_bio = s->orig_bio;
1058 bio_get(s->op.cache_bio); 1012 bio_get(s->op.cache_bio);
1059 1013
1060 if ((bio->bi_rw & REQ_DISCARD) && 1014 if ((bio->bi_rw & REQ_DISCARD) &&
1061 !blk_queue_discard(bdev_get_queue(dc->bdev))) 1015 !blk_queue_discard(bdev_get_queue(dc->bdev)))
1062 goto out; 1016 goto out;
1063 1017
1064 closure_bio_submit(bio, cl, s->d); 1018 closure_bio_submit(bio, cl, s->d);
1065 goto out; 1019 goto out;
1066 } 1020 }
1067 1021
1068 static void request_nodata(struct cached_dev *dc, struct search *s) 1022 static void request_nodata(struct cached_dev *dc, struct search *s)
1069 { 1023 {
1070 struct closure *cl = &s->cl; 1024 struct closure *cl = &s->cl;
1071 struct bio *bio = &s->bio.bio; 1025 struct bio *bio = &s->bio.bio;
1072 1026
1073 if (bio->bi_rw & REQ_DISCARD) { 1027 if (bio->bi_rw & REQ_DISCARD) {
1074 request_write(dc, s); 1028 request_write(dc, s);
1075 return; 1029 return;
1076 } 1030 }
1077 1031
1078 if (s->op.flush_journal) 1032 if (s->op.flush_journal)
1079 bch_journal_meta(s->op.c, cl); 1033 bch_journal_meta(s->op.c, cl);
1080 1034
1081 closure_bio_submit(bio, cl, s->d); 1035 closure_bio_submit(bio, cl, s->d);
1082 1036
1083 continue_at(cl, cached_dev_bio_complete, NULL); 1037 continue_at(cl, cached_dev_bio_complete, NULL);
1084 } 1038 }
1085 1039
1086 /* Cached devices - read & write stuff */ 1040 /* Cached devices - read & write stuff */
1087 1041
1088 unsigned bch_get_congested(struct cache_set *c) 1042 unsigned bch_get_congested(struct cache_set *c)
1089 { 1043 {
1090 int i; 1044 int i;
1091 long rand; 1045 long rand;
1092 1046
1093 if (!c->congested_read_threshold_us && 1047 if (!c->congested_read_threshold_us &&
1094 !c->congested_write_threshold_us) 1048 !c->congested_write_threshold_us)
1095 return 0; 1049 return 0;
1096 1050
1097 i = (local_clock_us() - c->congested_last_us) / 1024; 1051 i = (local_clock_us() - c->congested_last_us) / 1024;
1098 if (i < 0) 1052 if (i < 0)
1099 return 0; 1053 return 0;
1100 1054
1101 i += atomic_read(&c->congested); 1055 i += atomic_read(&c->congested);
1102 if (i >= 0) 1056 if (i >= 0)
1103 return 0; 1057 return 0;
1104 1058
1105 i += CONGESTED_MAX; 1059 i += CONGESTED_MAX;
1106 1060
1107 if (i > 0) 1061 if (i > 0)
1108 i = fract_exp_two(i, 6); 1062 i = fract_exp_two(i, 6);
1109 1063
1110 rand = get_random_int(); 1064 rand = get_random_int();
1111 i -= bitmap_weight(&rand, BITS_PER_LONG); 1065 i -= bitmap_weight(&rand, BITS_PER_LONG);
1112 1066
1113 return i > 0 ? i : 1; 1067 return i > 0 ? i : 1;
1114 } 1068 }
1115 1069
1116 static void add_sequential(struct task_struct *t) 1070 static void add_sequential(struct task_struct *t)
1117 { 1071 {
1118 ewma_add(t->sequential_io_avg, 1072 ewma_add(t->sequential_io_avg,
1119 t->sequential_io, 8, 0); 1073 t->sequential_io, 8, 0);
1120 1074
1121 t->sequential_io = 0; 1075 t->sequential_io = 0;
1122 } 1076 }
1123 1077
1124 static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) 1078 static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
1125 { 1079 {
1126 return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; 1080 return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
1127 } 1081 }
1128 1082
1129 static void check_should_skip(struct cached_dev *dc, struct search *s) 1083 static void check_should_skip(struct cached_dev *dc, struct search *s)
1130 { 1084 {
1131 struct cache_set *c = s->op.c; 1085 struct cache_set *c = s->op.c;
1132 struct bio *bio = &s->bio.bio; 1086 struct bio *bio = &s->bio.bio;
1133 unsigned mode = cache_mode(dc, bio); 1087 unsigned mode = cache_mode(dc, bio);
1134 unsigned sectors, congested = bch_get_congested(c); 1088 unsigned sectors, congested = bch_get_congested(c);
1135 1089
1136 if (atomic_read(&dc->disk.detaching) || 1090 if (atomic_read(&dc->disk.detaching) ||
1137 c->gc_stats.in_use > CUTOFF_CACHE_ADD || 1091 c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
1138 (bio->bi_rw & REQ_DISCARD)) 1092 (bio->bi_rw & REQ_DISCARD))
1139 goto skip; 1093 goto skip;
1140 1094
1141 if (mode == CACHE_MODE_NONE || 1095 if (mode == CACHE_MODE_NONE ||
1142 (mode == CACHE_MODE_WRITEAROUND && 1096 (mode == CACHE_MODE_WRITEAROUND &&
1143 (bio->bi_rw & REQ_WRITE))) 1097 (bio->bi_rw & REQ_WRITE)))
1144 goto skip; 1098 goto skip;
1145 1099
1146 if (bio->bi_sector & (c->sb.block_size - 1) || 1100 if (bio->bi_sector & (c->sb.block_size - 1) ||
1147 bio_sectors(bio) & (c->sb.block_size - 1)) { 1101 bio_sectors(bio) & (c->sb.block_size - 1)) {
1148 pr_debug("skipping unaligned io"); 1102 pr_debug("skipping unaligned io");
1149 goto skip; 1103 goto skip;
1150 } 1104 }
1151 1105
1152 if (!congested && !dc->sequential_cutoff) 1106 if (!congested && !dc->sequential_cutoff)
1153 goto rescale; 1107 goto rescale;
1154 1108
1155 if (!congested && 1109 if (!congested &&
1156 mode == CACHE_MODE_WRITEBACK && 1110 mode == CACHE_MODE_WRITEBACK &&
1157 (bio->bi_rw & REQ_WRITE) && 1111 (bio->bi_rw & REQ_WRITE) &&
1158 (bio->bi_rw & REQ_SYNC)) 1112 (bio->bi_rw & REQ_SYNC))
1159 goto rescale; 1113 goto rescale;
1160 1114
1161 if (dc->sequential_merge) { 1115 if (dc->sequential_merge) {
1162 struct io *i; 1116 struct io *i;
1163 1117
1164 spin_lock(&dc->io_lock); 1118 spin_lock(&dc->io_lock);
1165 1119
1166 hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) 1120 hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash)
1167 if (i->last == bio->bi_sector && 1121 if (i->last == bio->bi_sector &&
1168 time_before(jiffies, i->jiffies)) 1122 time_before(jiffies, i->jiffies))
1169 goto found; 1123 goto found;
1170 1124
1171 i = list_first_entry(&dc->io_lru, struct io, lru); 1125 i = list_first_entry(&dc->io_lru, struct io, lru);
1172 1126
1173 add_sequential(s->task); 1127 add_sequential(s->task);
1174 i->sequential = 0; 1128 i->sequential = 0;
1175 found: 1129 found:
1176 if (i->sequential + bio->bi_size > i->sequential) 1130 if (i->sequential + bio->bi_size > i->sequential)
1177 i->sequential += bio->bi_size; 1131 i->sequential += bio->bi_size;
1178 1132
1179 i->last = bio_end(bio); 1133 i->last = bio_end_sector(bio);
1180 i->jiffies = jiffies + msecs_to_jiffies(5000); 1134 i->jiffies = jiffies + msecs_to_jiffies(5000);
1181 s->task->sequential_io = i->sequential; 1135 s->task->sequential_io = i->sequential;
1182 1136
1183 hlist_del(&i->hash); 1137 hlist_del(&i->hash);
1184 hlist_add_head(&i->hash, iohash(dc, i->last)); 1138 hlist_add_head(&i->hash, iohash(dc, i->last));
1185 list_move_tail(&i->lru, &dc->io_lru); 1139 list_move_tail(&i->lru, &dc->io_lru);
1186 1140
1187 spin_unlock(&dc->io_lock); 1141 spin_unlock(&dc->io_lock);
1188 } else { 1142 } else {
1189 s->task->sequential_io = bio->bi_size; 1143 s->task->sequential_io = bio->bi_size;
1190 1144
1191 add_sequential(s->task); 1145 add_sequential(s->task);
1192 } 1146 }
1193 1147
1194 sectors = max(s->task->sequential_io, 1148 sectors = max(s->task->sequential_io,
1195 s->task->sequential_io_avg) >> 9; 1149 s->task->sequential_io_avg) >> 9;
1196 1150
1197 if (dc->sequential_cutoff && 1151 if (dc->sequential_cutoff &&
1198 sectors >= dc->sequential_cutoff >> 9) { 1152 sectors >= dc->sequential_cutoff >> 9) {
1199 trace_bcache_bypass_sequential(s->orig_bio); 1153 trace_bcache_bypass_sequential(s->orig_bio);
1200 goto skip; 1154 goto skip;
1201 } 1155 }
1202 1156
1203 if (congested && sectors >= congested) { 1157 if (congested && sectors >= congested) {
1204 trace_bcache_bypass_congested(s->orig_bio); 1158 trace_bcache_bypass_congested(s->orig_bio);
1205 goto skip; 1159 goto skip;
1206 } 1160 }
1207 1161
1208 rescale: 1162 rescale:
1209 bch_rescale_priorities(c, bio_sectors(bio)); 1163 bch_rescale_priorities(c, bio_sectors(bio));
1210 return; 1164 return;
1211 skip: 1165 skip:
1212 bch_mark_sectors_bypassed(s, bio_sectors(bio)); 1166 bch_mark_sectors_bypassed(s, bio_sectors(bio));
1213 s->op.skip = true; 1167 s->op.skip = true;
1214 } 1168 }
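check_should_skip() decides whether a request bypasses the cache: if a bio starts exactly where a recently tracked bio ended, the two are treated as one sequential stream and their sizes accumulate (hashed into dc->io_hash, recycled through dc->io_lru, expired after 5 seconds, and smoothed per task via sequential_io_avg); once the accumulated size reaches dc->sequential_cutoff the stream goes straight to the backing device. A much-reduced userspace sketch of that bookkeeping, with a fixed-size table instead of the hashed LRU, no expiry, no EWMA, and no locking (all names here are illustrative):

	#include <stdbool.h>
	#include <stdint.h>

	#define NTRACK 16		/* stand-in for the kernel's io_hash/io_lru */

	struct io_track {
		uint64_t next_sector;	/* sector right after the last bio in the stream */
		uint64_t sequential;	/* bytes accumulated so far in the stream */
	};

	static struct io_track track[NTRACK];

	/* Return true if this request should bypass the cache because it is part
	 * of a sequential stream at least `cutoff` bytes long. */
	static bool bypass_sequential(uint64_t sector, uint64_t len_bytes,
				      uint64_t cutoff)
	{
		struct io_track *t = &track[sector % NTRACK];
		uint64_t end = sector + (len_bytes >> 9);
		uint64_t seq;

		if (t->next_sector == sector)
			seq = t->sequential + len_bytes;	/* continues a stream */
		else
			seq = len_bytes;			/* starts a new stream */

		/* re-file the stream under its new end sector, like the
		 * hlist_add_head(&i->hash, iohash(dc, i->last)) above */
		t = &track[end % NTRACK];
		t->next_sector = end;
		t->sequential = seq;

		return cutoff && seq >= cutoff;
	}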
1215 1169
1216 static void cached_dev_make_request(struct request_queue *q, struct bio *bio) 1170 static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
1217 { 1171 {
1218 struct search *s; 1172 struct search *s;
1219 struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; 1173 struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
1220 struct cached_dev *dc = container_of(d, struct cached_dev, disk); 1174 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
1221 int cpu, rw = bio_data_dir(bio); 1175 int cpu, rw = bio_data_dir(bio);
1222 1176
1223 cpu = part_stat_lock(); 1177 cpu = part_stat_lock();
1224 part_stat_inc(cpu, &d->disk->part0, ios[rw]); 1178 part_stat_inc(cpu, &d->disk->part0, ios[rw]);
1225 part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); 1179 part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
1226 part_stat_unlock(); 1180 part_stat_unlock();
1227 1181
1228 bio->bi_bdev = dc->bdev; 1182 bio->bi_bdev = dc->bdev;
1229 bio->bi_sector += dc->sb.data_offset; 1183 bio->bi_sector += dc->sb.data_offset;
1230 1184
1231 if (cached_dev_get(dc)) { 1185 if (cached_dev_get(dc)) {
1232 s = search_alloc(bio, d); 1186 s = search_alloc(bio, d);
1233 trace_bcache_request_start(s, bio); 1187 trace_bcache_request_start(s, bio);
1234 1188
1235 if (!bio_has_data(bio)) 1189 if (!bio_has_data(bio))
1236 request_nodata(dc, s); 1190 request_nodata(dc, s);
1237 else if (rw) 1191 else if (rw)
1238 request_write(dc, s); 1192 request_write(dc, s);
1239 else 1193 else
1240 request_read(dc, s); 1194 request_read(dc, s);
1241 } else { 1195 } else {
1242 if ((bio->bi_rw & REQ_DISCARD) && 1196 if ((bio->bi_rw & REQ_DISCARD) &&
1243 !blk_queue_discard(bdev_get_queue(dc->bdev))) 1197 !blk_queue_discard(bdev_get_queue(dc->bdev)))
1244 bio_endio(bio, 0); 1198 bio_endio(bio, 0);
1245 else 1199 else
1246 bch_generic_make_request(bio, &d->bio_split_hook); 1200 bch_generic_make_request(bio, &d->bio_split_hook);
1247 } 1201 }
1248 } 1202 }
1249 1203
1250 static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, 1204 static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
1251 unsigned int cmd, unsigned long arg) 1205 unsigned int cmd, unsigned long arg)
1252 { 1206 {
1253 struct cached_dev *dc = container_of(d, struct cached_dev, disk); 1207 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
1254 return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); 1208 return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg);
1255 } 1209 }
1256 1210
1257 static int cached_dev_congested(void *data, int bits) 1211 static int cached_dev_congested(void *data, int bits)
1258 { 1212 {
1259 struct bcache_device *d = data; 1213 struct bcache_device *d = data;
1260 struct cached_dev *dc = container_of(d, struct cached_dev, disk); 1214 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
1261 struct request_queue *q = bdev_get_queue(dc->bdev); 1215 struct request_queue *q = bdev_get_queue(dc->bdev);
1262 int ret = 0; 1216 int ret = 0;
1263 1217
1264 if (bdi_congested(&q->backing_dev_info, bits)) 1218 if (bdi_congested(&q->backing_dev_info, bits))
1265 return 1; 1219 return 1;
1266 1220
1267 if (cached_dev_get(dc)) { 1221 if (cached_dev_get(dc)) {
1268 unsigned i; 1222 unsigned i;
1269 struct cache *ca; 1223 struct cache *ca;
1270 1224
1271 for_each_cache(ca, d->c, i) { 1225 for_each_cache(ca, d->c, i) {
1272 q = bdev_get_queue(ca->bdev); 1226 q = bdev_get_queue(ca->bdev);
1273 ret |= bdi_congested(&q->backing_dev_info, bits); 1227 ret |= bdi_congested(&q->backing_dev_info, bits);
1274 } 1228 }
1275 1229
1276 cached_dev_put(dc); 1230 cached_dev_put(dc);
1277 } 1231 }
1278 1232
1279 return ret; 1233 return ret;
1280 } 1234 }
1281 1235
1282 void bch_cached_dev_request_init(struct cached_dev *dc) 1236 void bch_cached_dev_request_init(struct cached_dev *dc)
1283 { 1237 {
1284 struct gendisk *g = dc->disk.disk; 1238 struct gendisk *g = dc->disk.disk;
1285 1239
1286 g->queue->make_request_fn = cached_dev_make_request; 1240 g->queue->make_request_fn = cached_dev_make_request;
1287 g->queue->backing_dev_info.congested_fn = cached_dev_congested; 1241 g->queue->backing_dev_info.congested_fn = cached_dev_congested;
1288 dc->disk.cache_miss = cached_dev_cache_miss; 1242 dc->disk.cache_miss = cached_dev_cache_miss;
1289 dc->disk.ioctl = cached_dev_ioctl; 1243 dc->disk.ioctl = cached_dev_ioctl;
1290 } 1244 }
1291 1245
1292 /* Flash backed devices */ 1246 /* Flash backed devices */
1293 1247
1294 static int flash_dev_cache_miss(struct btree *b, struct search *s, 1248 static int flash_dev_cache_miss(struct btree *b, struct search *s,
1295 struct bio *bio, unsigned sectors) 1249 struct bio *bio, unsigned sectors)
1296 { 1250 {
1251 struct bio_vec *bv;
1252 int i;
1253
1297 /* Zero fill bio */ 1254 /* Zero fill bio */
1298 1255
1299 while (bio->bi_idx != bio->bi_vcnt) { 1256 bio_for_each_segment(bv, bio, i) {
1300 struct bio_vec *bv = bio_iovec(bio);
1301 unsigned j = min(bv->bv_len >> 9, sectors); 1257 unsigned j = min(bv->bv_len >> 9, sectors);
1302 1258
1303 void *p = kmap(bv->bv_page); 1259 void *p = kmap(bv->bv_page);
1304 memset(p + bv->bv_offset, 0, j << 9); 1260 memset(p + bv->bv_offset, 0, j << 9);
1305 kunmap(bv->bv_page); 1261 kunmap(bv->bv_page);
1306 1262
1307 bv->bv_len -= j << 9; 1263 sectors -= j;
1308 bv->bv_offset += j << 9;
1309
1310 if (bv->bv_len)
1311 return 0;
1312
1313 bio->bi_sector += j;
1314 bio->bi_size -= j << 9;
1315
1316 bio->bi_idx++;
1317 sectors -= j;
1318 } 1264 }
1319 1265
1320 s->op.lookup_done = true; 1266 bio_advance(bio, min(sectors << 9, bio->bi_size));
1321 1267
1268 if (!bio->bi_size)
1269 s->op.lookup_done = true;
1270
1322 return 0; 1271 return 0;
1323 } 1272 }
1324 1273
1325 static void flash_dev_make_request(struct request_queue *q, struct bio *bio) 1274 static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
1326 { 1275 {
1327 struct search *s; 1276 struct search *s;
1328 struct closure *cl; 1277 struct closure *cl;
1329 struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; 1278 struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
1330 int cpu, rw = bio_data_dir(bio); 1279 int cpu, rw = bio_data_dir(bio);
1331 1280
1332 cpu = part_stat_lock(); 1281 cpu = part_stat_lock();
1333 part_stat_inc(cpu, &d->disk->part0, ios[rw]); 1282 part_stat_inc(cpu, &d->disk->part0, ios[rw]);
1334 part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); 1283 part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
1335 part_stat_unlock(); 1284 part_stat_unlock();
1336 1285
1337 s = search_alloc(bio, d); 1286 s = search_alloc(bio, d);
1338 cl = &s->cl; 1287 cl = &s->cl;
1339 bio = &s->bio.bio; 1288 bio = &s->bio.bio;
1340 1289
1341 trace_bcache_request_start(s, bio); 1290 trace_bcache_request_start(s, bio);
1342 1291
1343 if (bio_has_data(bio) && !rw) { 1292 if (bio_has_data(bio) && !rw) {
1344 closure_call(&s->op.cl, btree_read_async, NULL, cl); 1293 closure_call(&s->op.cl, btree_read_async, NULL, cl);
1345 } else if (bio_has_data(bio) || s->op.skip) { 1294 } else if (bio_has_data(bio) || s->op.skip) {
1346 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, 1295 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys,
1347 &KEY(d->id, bio->bi_sector, 0), 1296 &KEY(d->id, bio->bi_sector, 0),
1348 &KEY(d->id, bio_end(bio), 0)); 1297 &KEY(d->id, bio_end_sector(bio), 0));
1349 1298
1350 s->writeback = true; 1299 s->writeback = true;
1351 s->op.cache_bio = bio; 1300 s->op.cache_bio = bio;
1352 1301
1353 closure_call(&s->op.cl, bch_insert_data, NULL, cl); 1302 closure_call(&s->op.cl, bch_insert_data, NULL, cl);
1354 } else { 1303 } else {
1355 /* No data - probably a cache flush */ 1304 /* No data - probably a cache flush */
1356 if (s->op.flush_journal) 1305 if (s->op.flush_journal)
1357 bch_journal_meta(s->op.c, cl); 1306 bch_journal_meta(s->op.c, cl);
1358 } 1307 }
1359 1308
1360 continue_at(cl, search_free, NULL); 1309 continue_at(cl, search_free, NULL);
1361 } 1310 }
1362 1311
1363 static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode, 1312 static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode,
1364 unsigned int cmd, unsigned long arg) 1313 unsigned int cmd, unsigned long arg)
1365 { 1314 {
1366 return -ENOTTY; 1315 return -ENOTTY;
1367 } 1316 }
1368 1317
1369 static int flash_dev_congested(void *data, int bits) 1318 static int flash_dev_congested(void *data, int bits)
1370 { 1319 {
1371 struct bcache_device *d = data; 1320 struct bcache_device *d = data;
1372 struct request_queue *q; 1321 struct request_queue *q;
1373 struct cache *ca; 1322 struct cache *ca;
1374 unsigned i; 1323 unsigned i;
1375 int ret = 0; 1324 int ret = 0;
1376 1325
1377 for_each_cache(ca, d->c, i) { 1326 for_each_cache(ca, d->c, i) {
1378 q = bdev_get_queue(ca->bdev); 1327 q = bdev_get_queue(ca->bdev);
1379 ret |= bdi_congested(&q->backing_dev_info, bits); 1328 ret |= bdi_congested(&q->backing_dev_info, bits);
1380 } 1329 }
1381 1330
1382 return ret; 1331 return ret;
1383 } 1332 }
1384 1333
1385 void bch_flash_dev_request_init(struct bcache_device *d) 1334 void bch_flash_dev_request_init(struct bcache_device *d)
1386 { 1335 {
1387 struct gendisk *g = d->disk; 1336 struct gendisk *g = d->disk;
1388 1337
1389 g->queue->make_request_fn = flash_dev_make_request; 1338 g->queue->make_request_fn = flash_dev_make_request;
1390 g->queue->backing_dev_info.congested_fn = flash_dev_congested; 1339 g->queue->backing_dev_info.congested_fn = flash_dev_congested;
1391 d->cache_miss = flash_dev_cache_miss; 1340 d->cache_miss = flash_dev_cache_miss;
1392 d->ioctl = flash_dev_ioctl; 1341 d->ioctl = flash_dev_ioctl;
1393 } 1342 }
1394 1343
1395 void bch_request_exit(void) 1344 void bch_request_exit(void)
1396 { 1345 {
1397 #ifdef CONFIG_CGROUP_BCACHE 1346 #ifdef CONFIG_CGROUP_BCACHE
1398 cgroup_unload_subsys(&bcache_subsys); 1347 cgroup_unload_subsys(&bcache_subsys);
1399 #endif 1348 #endif
1400 if (bch_search_cache) 1349 if (bch_search_cache)
1401 kmem_cache_destroy(bch_search_cache); 1350 kmem_cache_destroy(bch_search_cache);
1402 } 1351 }
1403 1352
1404 int __init bch_request_init(void) 1353 int __init bch_request_init(void)
1405 { 1354 {
1406 bch_search_cache = KMEM_CACHE(search, 0); 1355 bch_search_cache = KMEM_CACHE(search, 0);
1407 if (!bch_search_cache) 1356 if (!bch_search_cache)
1408 return -ENOMEM; 1357 return -ENOMEM;
1409 1358
1410 #ifdef CONFIG_CGROUP_BCACHE 1359 #ifdef CONFIG_CGROUP_BCACHE
1411 cgroup_load_subsys(&bcache_subsys); 1360 cgroup_load_subsys(&bcache_subsys);
drivers/md/bcache/util.c
1 /* 1 /*
2 * random utility code, for bcache but in theory not specific to bcache 2 * random utility code, for bcache but in theory not specific to bcache
3 * 3 *
4 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> 4 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
5 * Copyright 2012 Google, Inc. 5 * Copyright 2012 Google, Inc.
6 */ 6 */
7 7
8 #include <linux/bio.h> 8 #include <linux/bio.h>
9 #include <linux/blkdev.h> 9 #include <linux/blkdev.h>
10 #include <linux/ctype.h> 10 #include <linux/ctype.h>
11 #include <linux/debugfs.h> 11 #include <linux/debugfs.h>
12 #include <linux/module.h> 12 #include <linux/module.h>
13 #include <linux/seq_file.h> 13 #include <linux/seq_file.h>
14 #include <linux/types.h> 14 #include <linux/types.h>
15 15
16 #include "util.h" 16 #include "util.h"
17 17
18 #define simple_strtoint(c, end, base) simple_strtol(c, end, base) 18 #define simple_strtoint(c, end, base) simple_strtol(c, end, base)
19 #define simple_strtouint(c, end, base) simple_strtoul(c, end, base) 19 #define simple_strtouint(c, end, base) simple_strtoul(c, end, base)
20 20
21 #define STRTO_H(name, type) \ 21 #define STRTO_H(name, type) \
22 int bch_ ## name ## _h(const char *cp, type *res) \ 22 int bch_ ## name ## _h(const char *cp, type *res) \
23 { \ 23 { \
24 int u = 0; \ 24 int u = 0; \
25 char *e; \ 25 char *e; \
26 type i = simple_ ## name(cp, &e, 10); \ 26 type i = simple_ ## name(cp, &e, 10); \
27 \ 27 \
28 switch (tolower(*e)) { \ 28 switch (tolower(*e)) { \
29 default: \ 29 default: \
30 return -EINVAL; \ 30 return -EINVAL; \
31 case 'y': \ 31 case 'y': \
32 case 'z': \ 32 case 'z': \
33 u++; \ 33 u++; \
34 case 'e': \ 34 case 'e': \
35 u++; \ 35 u++; \
36 case 'p': \ 36 case 'p': \
37 u++; \ 37 u++; \
38 case 't': \ 38 case 't': \
39 u++; \ 39 u++; \
40 case 'g': \ 40 case 'g': \
41 u++; \ 41 u++; \
42 case 'm': \ 42 case 'm': \
43 u++; \ 43 u++; \
44 case 'k': \ 44 case 'k': \
45 u++; \ 45 u++; \
46 if (e++ == cp) \ 46 if (e++ == cp) \
47 return -EINVAL; \ 47 return -EINVAL; \
48 case '\n': \ 48 case '\n': \
49 case '\0': \ 49 case '\0': \
50 if (*e == '\n') \ 50 if (*e == '\n') \
51 e++; \ 51 e++; \
52 } \ 52 } \
53 \ 53 \
54 if (*e) \ 54 if (*e) \
55 return -EINVAL; \ 55 return -EINVAL; \
56 \ 56 \
57 while (u--) { \ 57 while (u--) { \
58 if ((type) ~0 > 0 && \ 58 if ((type) ~0 > 0 && \
59 (type) ~0 / 1024 <= i) \ 59 (type) ~0 / 1024 <= i) \
60 return -EINVAL; \ 60 return -EINVAL; \
61 if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) || \ 61 if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) || \
62 (i < 0 && -ANYSINT_MAX(type) / 1024 > i)) \ 62 (i < 0 && -ANYSINT_MAX(type) / 1024 > i)) \
63 return -EINVAL; \ 63 return -EINVAL; \
64 i *= 1024; \ 64 i *= 1024; \
65 } \ 65 } \
66 \ 66 \
67 *res = i; \ 67 *res = i; \
68 return 0; \ 68 return 0; \
69 } \ 69 } \
70 70
71 STRTO_H(strtoint, int) 71 STRTO_H(strtoint, int)
72 STRTO_H(strtouint, unsigned int) 72 STRTO_H(strtouint, unsigned int)
73 STRTO_H(strtoll, long long) 73 STRTO_H(strtoll, long long)
74 STRTO_H(strtoull, unsigned long long) 74 STRTO_H(strtoull, unsigned long long)
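STRTO_H() generates the strto*_h() parsers, which accept a 1024-based suffix (k, m, g, ... up to y) and reject overflow; the deliberate switch fall-through turns the suffix into a repeat count of multiply-by-1024. A standalone userspace equivalent for unsigned 64-bit values only, as a sketch (no trailing-newline handling and no signed variants; the function name is an invention of the sketch):

	#include <ctype.h>
	#include <errno.h>
	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>

	/* Parse "128", "128k", "2m", "1g", ... into a count (1024-based suffixes).
	 * Returns 0 on success, -EINVAL on bad input or overflow. */
	static int strtoull_h(const char *cp, uint64_t *res)
	{
		static const char units[] = "kmgtpezy";
		const char *u;
		char *e;
		uint64_t i;
		int shift = 0;

		errno = 0;
		i = strtoull(cp, &e, 10);
		if (e == cp || errno)
			return -EINVAL;

		if (*e && (u = strchr(units, tolower((unsigned char) *e)))) {
			shift = (int) (u - units) + 1;	/* 'k' => 1, 'm' => 2, ... */
			e++;
		}
		if (*e)
			return -EINVAL;			/* trailing garbage */

		while (shift--) {
			if (i > UINT64_MAX / 1024)
				return -EINVAL;		/* would overflow */
			i *= 1024;
		}

		*res = i;
		return 0;
	}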
75 75
76 ssize_t bch_hprint(char *buf, int64_t v) 76 ssize_t bch_hprint(char *buf, int64_t v)
77 { 77 {
78 static const char units[] = "?kMGTPEZY"; 78 static const char units[] = "?kMGTPEZY";
79 char dec[4] = ""; 79 char dec[4] = "";
80 int u, t = 0; 80 int u, t = 0;
81 81
82 for (u = 0; v >= 1024 || v <= -1024; u++) { 82 for (u = 0; v >= 1024 || v <= -1024; u++) {
83 t = v & ~(~0 << 10); 83 t = v & ~(~0 << 10);
84 v >>= 10; 84 v >>= 10;
85 } 85 }
86 86
87 if (!u) 87 if (!u)
88 return sprintf(buf, "%llu", v); 88 return sprintf(buf, "%llu", v);
89 89
90 if (v < 100 && v > -100) 90 if (v < 100 && v > -100)
91 snprintf(dec, sizeof(dec), ".%i", t / 100); 91 snprintf(dec, sizeof(dec), ".%i", t / 100);
92 92
93 return sprintf(buf, "%lli%s%c", v, dec, units[u]); 93 return sprintf(buf, "%lli%s%c", v, dec, units[u]);
94 } 94 }
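bch_hprint() goes the other direction: render a raw count with the ?kMGTPEZY unit ladder, keeping a rough single decimal taken from the remainder of the final divide-by-1024. A hedged userspace sketch of the same idea (the output approximates, rather than byte-for-byte matches, the kernel function, and the buffer-size requirement is an assumption of the sketch):

	#include <inttypes.h>
	#include <stdio.h>

	/* Render v with a 1024-based unit suffix and one approximate decimal.
	 * buf is assumed to have room for ~24 bytes. */
	static int hprint(char *buf, int64_t v)
	{
		static const char units[] = "?kMGTPEZY";
		int64_t t = 0;
		int u;

		for (u = 0; v >= 1024 || v <= -1024; u++) {
			t = v & 1023;		/* remainder kept for the decimal */
			v >>= 10;
		}

		if (!u)
			return sprintf(buf, "%" PRId64, v);

		/* t/100 maps the 0..1023 remainder onto a rough 0..10 "decimal" */
		return sprintf(buf, "%" PRId64 ".%d%c", v, (int) (t / 100), units[u]);
	}

	int main(void)
	{
		char buf[32];

		hprint(buf, 3 << 20);		/* prints "3.0M" */
		puts(buf);
		return 0;
	}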
95 95
96 ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], 96 ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
97 size_t selected) 97 size_t selected)
98 { 98 {
99 char *out = buf; 99 char *out = buf;
100 size_t i; 100 size_t i;
101 101
102 for (i = 0; list[i]; i++) 102 for (i = 0; list[i]; i++)
103 out += snprintf(out, buf + size - out, 103 out += snprintf(out, buf + size - out,
104 i == selected ? "[%s] " : "%s ", list[i]); 104 i == selected ? "[%s] " : "%s ", list[i]);
105 105
106 out[-1] = '\n'; 106 out[-1] = '\n';
107 return out - buf; 107 return out - buf;
108 } 108 }
109 109
110 ssize_t bch_read_string_list(const char *buf, const char * const list[]) 110 ssize_t bch_read_string_list(const char *buf, const char * const list[])
111 { 111 {
112 size_t i; 112 size_t i;
113 char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL); 113 char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL);
114 if (!d) 114 if (!d)
115 return -ENOMEM; 115 return -ENOMEM;
116 116
117 s = strim(d); 117 s = strim(d);
118 118
119 for (i = 0; list[i]; i++) 119 for (i = 0; list[i]; i++)
120 if (!strcmp(list[i], s)) 120 if (!strcmp(list[i], s))
121 break; 121 break;
122 122
123 kfree(d); 123 kfree(d);
124 124
125 if (!list[i]) 125 if (!list[i])
126 return -EINVAL; 126 return -EINVAL;
127 127
128 return i; 128 return i;
129 } 129 }
130 130
131 bool bch_is_zero(const char *p, size_t n) 131 bool bch_is_zero(const char *p, size_t n)
132 { 132 {
133 size_t i; 133 size_t i;
134 134
135 for (i = 0; i < n; i++) 135 for (i = 0; i < n; i++)
136 if (p[i]) 136 if (p[i])
137 return false; 137 return false;
138 return true; 138 return true;
139 } 139 }
140 140
141 int bch_parse_uuid(const char *s, char *uuid) 141 int bch_parse_uuid(const char *s, char *uuid)
142 { 142 {
143 size_t i, j, x; 143 size_t i, j, x;
144 memset(uuid, 0, 16); 144 memset(uuid, 0, 16);
145 145
146 for (i = 0, j = 0; 146 for (i = 0, j = 0;
147 i < strspn(s, "-0123456789:ABCDEFabcdef") && j < 32; 147 i < strspn(s, "-0123456789:ABCDEFabcdef") && j < 32;
148 i++) { 148 i++) {
149 x = s[i] | 32; 149 x = s[i] | 32;
150 150
151 switch (x) { 151 switch (x) {
152 case '0'...'9': 152 case '0'...'9':
153 x -= '0'; 153 x -= '0';
154 break; 154 break;
155 case 'a'...'f': 155 case 'a'...'f':
156 x -= 'a' - 10; 156 x -= 'a' - 10;
157 break; 157 break;
158 default: 158 default:
159 continue; 159 continue;
160 } 160 }
161 161
162 if (!(j & 1)) 162 if (!(j & 1))
163 x <<= 4; 163 x <<= 4;
164 uuid[j++ >> 1] |= x; 164 uuid[j++ >> 1] |= x;
165 } 165 }
166 return i; 166 return i;
167 } 167 }
168 168
169 void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) 169 void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
170 { 170 {
171 uint64_t now = local_clock(); 171 uint64_t now = local_clock();
172 uint64_t duration = time_after64(now, start_time) 172 uint64_t duration = time_after64(now, start_time)
173 ? now - start_time : 0; 173 ? now - start_time : 0;
174 uint64_t last = time_after64(now, stats->last) 174 uint64_t last = time_after64(now, stats->last)
175 ? now - stats->last : 0; 175 ? now - stats->last : 0;
176 176
177 stats->max_duration = max(stats->max_duration, duration); 177 stats->max_duration = max(stats->max_duration, duration);
178 178
179 if (stats->last) { 179 if (stats->last) {
180 ewma_add(stats->average_duration, duration, 8, 8); 180 ewma_add(stats->average_duration, duration, 8, 8);
181 181
182 if (stats->average_frequency) 182 if (stats->average_frequency)
183 ewma_add(stats->average_frequency, last, 8, 8); 183 ewma_add(stats->average_frequency, last, 8, 8);
184 else 184 else
185 stats->average_frequency = last << 8; 185 stats->average_frequency = last << 8;
186 } else { 186 } else {
187 stats->average_duration = duration << 8; 187 stats->average_duration = duration << 8;
188 } 188 }
189 189
190 stats->last = now ?: 1; 190 stats->last = now ?: 1;
191 } 191 }
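bch_time_stats_update() keeps exponentially weighted moving averages stored with 8 fractional bits, which is why the first sample is seeded with `duration << 8`. ewma_add() itself is defined in util.h; as a generic illustration of that kind of fixed-point accumulator (the weight of 8 and the 8-bit scale are assumptions of the sketch, chosen to match the call sites above, not a copy of the macro):

	#include <stdint.h>

	/* Fixed-point EWMA: the stored average carries 8 fractional bits and each
	 * new sample contributes 1/weight of the result.  Seed with sample << 8,
	 * read back with avg >> 8. */
	static uint64_t ewma_update(uint64_t avg_fp8, uint64_t sample, unsigned weight)
	{
		/* avg = avg * (weight - 1) / weight + sample / weight, in fixed point */
		return (avg_fp8 * (weight - 1) + (sample << 8)) / weight;
	}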
192 192
193 unsigned bch_next_delay(struct ratelimit *d, uint64_t done) 193 unsigned bch_next_delay(struct ratelimit *d, uint64_t done)
194 { 194 {
195 uint64_t now = local_clock(); 195 uint64_t now = local_clock();
196 196
197 d->next += div_u64(done, d->rate); 197 d->next += div_u64(done, d->rate);
198 198
199 return time_after64(d->next, now) 199 return time_after64(d->next, now)
200 ? div_u64(d->next - now, NSEC_PER_SEC / HZ) 200 ? div_u64(d->next - now, NSEC_PER_SEC / HZ)
201 : 0; 201 : 0;
202 } 202 }
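bch_next_delay() paces work by keeping a virtual deadline: every unit of completed work pushes d->next forward by 1/rate, and the caller sleeps for whatever part of that deadline is still in the future (converted to jiffies). A userspace sketch of the same shape; the struct name, the choice of nanosecond timestamps, and expressing the rate in units per second are all assumptions of the sketch, not the kernel's units:

	#include <stdint.h>
	#include <time.h>

	struct pacer {
		uint64_t next_ns;	/* virtual deadline */
		uint64_t rate;		/* units of work allowed per second */
	};

	static uint64_t now_ns(void)
	{
		struct timespec ts;

		clock_gettime(CLOCK_MONOTONIC, &ts);
		return (uint64_t) ts.tv_sec * 1000000000ull + ts.tv_nsec;
	}

	/* Account for `done` units of work; return how many nanoseconds the caller
	 * should sleep to stay at `rate` units/second on average. */
	static uint64_t pacer_delay(struct pacer *p, uint64_t done)
	{
		uint64_t now = now_ns();

		if (!p->next_ns)
			p->next_ns = now;

		p->next_ns += done * 1000000000ull / p->rate;

		return p->next_ns > now ? p->next_ns - now : 0;
	}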
203 203
204 void bch_bio_map(struct bio *bio, void *base) 204 void bch_bio_map(struct bio *bio, void *base)
205 { 205 {
206 size_t size = bio->bi_size; 206 size_t size = bio->bi_size;
207 struct bio_vec *bv = bio->bi_io_vec; 207 struct bio_vec *bv = bio->bi_io_vec;
208 208
209 BUG_ON(!bio->bi_size); 209 BUG_ON(!bio->bi_size);
210 BUG_ON(bio->bi_vcnt); 210 BUG_ON(bio->bi_vcnt);
211 211
212 bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0; 212 bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0;
213 goto start; 213 goto start;
214 214
215 for (; size; bio->bi_vcnt++, bv++) { 215 for (; size; bio->bi_vcnt++, bv++) {
216 bv->bv_offset = 0; 216 bv->bv_offset = 0;
217 start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, 217 start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
218 size); 218 size);
219 if (base) { 219 if (base) {
220 bv->bv_page = is_vmalloc_addr(base) 220 bv->bv_page = is_vmalloc_addr(base)
221 ? vmalloc_to_page(base) 221 ? vmalloc_to_page(base)
222 : virt_to_page(base); 222 : virt_to_page(base);
223 223
224 base += bv->bv_len; 224 base += bv->bv_len;
225 } 225 }
226 226
227 size -= bv->bv_len; 227 size -= bv->bv_len;
228 } 228 }
229 } 229 }
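bch_bio_map() chops a contiguous kernel buffer into page-sized bio_vec segments: only the first segment may start at a nonzero in-page offset, every segment is at most a page long, and vmalloc'd addresses get translated page by page. The same slicing expressed in userspace terms, producing struct iovec entries instead of bio_vecs (PAGE_SZ, the function name, and the return convention are choices of the sketch):

	#include <stddef.h>
	#include <stdint.h>
	#include <sys/uio.h>

	#define PAGE_SZ 4096u

	/* Slice `size` bytes starting at `base` into page-bounded chunks, writing
	 * at most max_iov entries into iov[].  Returns the number of entries used,
	 * or -1 if more than max_iov entries would be needed. */
	static int map_buffer(struct iovec *iov, int max_iov, void *base, size_t size)
	{
		int n = 0;
		size_t offset = (uintptr_t) base % PAGE_SZ;	/* first chunk only */

		while (size) {
			size_t len = PAGE_SZ - offset;

			if (len > size)
				len = size;
			if (n == max_iov)
				return -1;

			iov[n].iov_base = base;
			iov[n].iov_len = len;
			n++;

			base = (char *) base + len;
			size -= len;
			offset = 0;		/* later chunks start page-aligned */
		}

		return n;
	}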
230 230
231 int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp)
232 {
233 int i;
234 struct bio_vec *bv;
235
236 bio_for_each_segment(bv, bio, i) {
237 bv->bv_page = alloc_page(gfp);
238 if (!bv->bv_page) {
239 while (bv-- != bio->bi_io_vec + bio->bi_idx)
240 __free_page(bv->bv_page);
241 return -ENOMEM;
242 }
243 }
244
245 return 0;
246 }
247
248 /* 231 /*
249 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any 232 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
250 * use permitted, subject to terms of PostgreSQL license; see.) 233 * use permitted, subject to terms of PostgreSQL license; see.)
251 234
252 * If we have a 64-bit integer type, then a 64-bit CRC looks just like the 235 * If we have a 64-bit integer type, then a 64-bit CRC looks just like the
253 * usual sort of implementation. (See Ross Williams' excellent introduction 236 * usual sort of implementation. (See Ross Williams' excellent introduction
254 * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from 237 * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from
255 * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.) 238 * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.)
256 * If we have no working 64-bit type, then fake it with two 32-bit registers. 239 * If we have no working 64-bit type, then fake it with two 32-bit registers.
257 * 240 *
258 * The present implementation is a normal (not "reflected", in Williams' 241 * The present implementation is a normal (not "reflected", in Williams'
259 * terms) 64-bit CRC, using initial all-ones register contents and a final 242 * terms) 64-bit CRC, using initial all-ones register contents and a final
260 * bit inversion. The chosen polynomial is borrowed from the DLT1 spec 243 * bit inversion. The chosen polynomial is borrowed from the DLT1 spec
261 * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM): 244 * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM):
262 * 245 *
263 * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + 246 * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
264 * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + 247 * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
265 * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + 248 * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
266 * x^7 + x^4 + x + 1 249 * x^7 + x^4 + x + 1
267 */ 250 */
268 251
269 static const uint64_t crc_table[256] = { 252 static const uint64_t crc_table[256] = {
270 0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL, 253 0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL,
271 0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL, 254 0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL,
272 0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL, 255 0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL,
273 0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL, 256 0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL,
274 0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL, 257 0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL,
275 0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL, 258 0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL,
276 0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL, 259 0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL,
277 0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL, 260 0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL,
278 0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL, 261 0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL,
279 0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL, 262 0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL,
280 0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL, 263 0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL,
281 0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL, 264 0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL,
282 0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL, 265 0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL,
283 0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL, 266 0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL,
284 0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL, 267 0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL,
285 0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL, 268 0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL,
286 0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL, 269 0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL,
287 0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL, 270 0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL,
288 0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL, 271 0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL,
289 0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL, 272 0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL,
290 0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL, 273 0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL,
291 0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL, 274 0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL,
292 0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL, 275 0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL,
293 0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL, 276 0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL,
294 0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL, 277 0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL,
295 0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL, 278 0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL,
296 0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL, 279 0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL,
297 0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL, 280 0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL,
298 0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL, 281 0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL,
299 0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL, 282 0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL,
300 0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL, 283 0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL,
301 0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL, 284 0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL,
302 0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL, 285 0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL,
303 0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL, 286 0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL,
304 0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL, 287 0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL,
305 0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL, 288 0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL,
306 0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL, 289 0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL,
307 0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL, 290 0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL,
308 0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL, 291 0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL,
309 0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL, 292 0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL,
310 0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL, 293 0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL,
311 0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL, 294 0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL,
312 0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL, 295 0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL,
313 0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL, 296 0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL,
314 0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL, 297 0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL,
315 0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL, 298 0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL,
316 0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL, 299 0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL,
317 0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL, 300 0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL,
318 0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL, 301 0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL,
319 0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL, 302 0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL,
320 0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL, 303 0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL,
321 0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL, 304 0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL,
322 0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL, 305 0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL,
323 0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL, 306 0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL,
324 0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL, 307 0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL,
325 0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL, 308 0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL,
326 0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL, 309 0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL,
327 0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL, 310 0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL,
328 0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL, 311 0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL,
329 0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL, 312 0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL,
330 0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL, 313 0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL,
331 0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL, 314 0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL,
332 0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL, 315 0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL,
333 0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL, 316 0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL,
334 0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL, 317 0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL,
335 0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL, 318 0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL,
336 0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL, 319 0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL,
337 0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL, 320 0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL,
338 0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL, 321 0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL,
339 0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL, 322 0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL,
340 0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL, 323 0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL,
341 0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL, 324 0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL,
342 0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL, 325 0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL,
343 0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL, 326 0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL,
344 0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL, 327 0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL,
345 0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL, 328 0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL,
346 0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL, 329 0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL,
347 0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL, 330 0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL,
348 0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL, 331 0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL,
349 0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL, 332 0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL,
350 0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL, 333 0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL,
351 0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL, 334 0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL,
352 0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL, 335 0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL,
353 0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL, 336 0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL,
354 0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL, 337 0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL,
355 0x9AFCE626CE85B507ULL, 338 0x9AFCE626CE85B507ULL,
356 }; 339 };
357 340
358 uint64_t bch_crc64_update(uint64_t crc, const void *_data, size_t len) 341 uint64_t bch_crc64_update(uint64_t crc, const void *_data, size_t len)
359 { 342 {
360 const unsigned char *data = _data; 343 const unsigned char *data = _data;
361 344
362 while (len--) { 345 while (len--) {
363 int i = ((int) (crc >> 56) ^ *data++) & 0xFF; 346 int i = ((int) (crc >> 56) ^ *data++) & 0xFF;
364 crc = crc_table[i] ^ (crc << 8); 347 crc = crc_table[i] ^ (crc << 8);
365 } 348 }
366 349
367 return crc; 350 return crc;
368 } 351 }
369 352
370 uint64_t bch_crc64(const void *data, size_t len) 353 uint64_t bch_crc64(const void *data, size_t len)
371 { 354 {
372 uint64_t crc = 0xffffffffffffffffULL; 355 uint64_t crc = 0xffffffffffffffffULL;
373 356
374 crc = bch_crc64_update(crc, data, len); 357 crc = bch_crc64_update(crc, data, len);
375 358
376 return crc ^ 0xffffffffffffffffULL; 359 return crc ^ 0xffffffffffffffffULL;
377 } 360 }
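The 256-entry crc_table[] above is the ECMA-182 polynomial expanded one input byte at a time; bch_crc64() then feeds bytes in MSB-first with an all-ones preset and a final inversion, exactly as the comment describes. A small generator for the same table as a sanity check - entry 1 should come out as the polynomial itself and entry 2 as the polynomial shifted once, matching the first constants listed above:

	#include <inttypes.h>
	#include <stdio.h>

	#define CRC64_POLY 0x42F0E1EBA9EA3693ull	/* ECMA-182, normal (not reflected) */

	int main(void)
	{
		uint64_t table[256];
		int i, bit;

		for (i = 0; i < 256; i++) {
			/* run one input byte through 8 steps of the shift register */
			uint64_t crc = (uint64_t) i << 56;

			for (bit = 0; bit < 8; bit++)
				crc = (crc & (1ull << 63))
					? (crc << 1) ^ CRC64_POLY
					: crc << 1;

			table[i] = crc;
		}

		/* expect 0x42F0E1EBA9EA3693 and 0x85E1C3D753D46D26 */
		printf("table[1] = 0x%016" PRIX64 "\n", table[1]);
		printf("table[2] = 0x%016" PRIX64 "\n", table[2]);
		return 0;
	}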
378 361
drivers/md/bcache/util.h
1 1
2 #ifndef _BCACHE_UTIL_H 2 #ifndef _BCACHE_UTIL_H
3 #define _BCACHE_UTIL_H 3 #define _BCACHE_UTIL_H
4 4
5 #include <linux/errno.h> 5 #include <linux/errno.h>
6 #include <linux/kernel.h> 6 #include <linux/kernel.h>
7 #include <linux/llist.h> 7 #include <linux/llist.h>
8 #include <linux/ratelimit.h> 8 #include <linux/ratelimit.h>
9 #include <linux/vmalloc.h> 9 #include <linux/vmalloc.h>
10 #include <linux/workqueue.h> 10 #include <linux/workqueue.h>
11 11
12 #include "closure.h" 12 #include "closure.h"
13 13
14 #define PAGE_SECTORS (PAGE_SIZE / 512) 14 #define PAGE_SECTORS (PAGE_SIZE / 512)
15 15
16 struct closure; 16 struct closure;
17 17
18 #ifdef CONFIG_BCACHE_EDEBUG 18 #ifdef CONFIG_BCACHE_EDEBUG
19 19
20 #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) 20 #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
21 #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) 21 #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i)
22 22
23 #else /* EDEBUG */ 23 #else /* EDEBUG */
24 24
25 #define atomic_dec_bug(v) atomic_dec(v) 25 #define atomic_dec_bug(v) atomic_dec(v)
26 #define atomic_inc_bug(v, i) atomic_inc(v) 26 #define atomic_inc_bug(v, i) atomic_inc(v)
27 27
28 #endif 28 #endif
29 29
30 #define BITMASK(name, type, field, offset, size) \ 30 #define BITMASK(name, type, field, offset, size) \
31 static inline uint64_t name(const type *k) \ 31 static inline uint64_t name(const type *k) \
32 { return (k->field >> offset) & ~(((uint64_t) ~0) << size); } \ 32 { return (k->field >> offset) & ~(((uint64_t) ~0) << size); } \
33 \ 33 \
34 static inline void SET_##name(type *k, uint64_t v) \ 34 static inline void SET_##name(type *k, uint64_t v) \
35 { \ 35 { \
36 k->field &= ~(~((uint64_t) ~0 << size) << offset); \ 36 k->field &= ~(~((uint64_t) ~0 << size) << offset); \
37 k->field |= v << offset; \ 37 k->field |= v << offset; \
38 } 38 }
39 39
40 #define DECLARE_HEAP(type, name) \ 40 #define DECLARE_HEAP(type, name) \
41 struct { \ 41 struct { \
42 size_t size, used; \ 42 size_t size, used; \
43 type *data; \ 43 type *data; \
44 } name 44 } name
45 45
46 #define init_heap(heap, _size, gfp) \ 46 #define init_heap(heap, _size, gfp) \
47 ({ \ 47 ({ \
48 size_t _bytes; \ 48 size_t _bytes; \
49 (heap)->used = 0; \ 49 (heap)->used = 0; \
50 (heap)->size = (_size); \ 50 (heap)->size = (_size); \
51 _bytes = (heap)->size * sizeof(*(heap)->data); \ 51 _bytes = (heap)->size * sizeof(*(heap)->data); \
52 (heap)->data = NULL; \ 52 (heap)->data = NULL; \
53 if (_bytes < KMALLOC_MAX_SIZE) \ 53 if (_bytes < KMALLOC_MAX_SIZE) \
54 (heap)->data = kmalloc(_bytes, (gfp)); \ 54 (heap)->data = kmalloc(_bytes, (gfp)); \
55 if ((!(heap)->data) && ((gfp) & GFP_KERNEL)) \ 55 if ((!(heap)->data) && ((gfp) & GFP_KERNEL)) \
56 (heap)->data = vmalloc(_bytes); \ 56 (heap)->data = vmalloc(_bytes); \
57 (heap)->data; \ 57 (heap)->data; \
58 }) 58 })
59 59
60 #define free_heap(heap) \ 60 #define free_heap(heap) \
61 do { \ 61 do { \
62 if (is_vmalloc_addr((heap)->data)) \ 62 if (is_vmalloc_addr((heap)->data)) \
63 vfree((heap)->data); \ 63 vfree((heap)->data); \
64 else \ 64 else \
65 kfree((heap)->data); \ 65 kfree((heap)->data); \
66 (heap)->data = NULL; \ 66 (heap)->data = NULL; \
67 } while (0) 67 } while (0)
68 68
69 #define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j]) 69 #define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j])
70 70
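/*
 * Naming note: heap_sift() moves data[i] towards the leaves (a classic
 * sift-down), while heap_sift_down() moves it towards the root (a sift-up);
 * heap_add() calls both after appending the new element.
 */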
71 #define heap_sift(h, i, cmp) \ 71 #define heap_sift(h, i, cmp) \
72 do { \ 72 do { \
73 size_t _r, _j = i; \ 73 size_t _r, _j = i; \
74 \ 74 \
75 for (; _j * 2 + 1 < (h)->used; _j = _r) { \ 75 for (; _j * 2 + 1 < (h)->used; _j = _r) { \
76 _r = _j * 2 + 1; \ 76 _r = _j * 2 + 1; \
77 if (_r + 1 < (h)->used && \ 77 if (_r + 1 < (h)->used && \
78 cmp((h)->data[_r], (h)->data[_r + 1])) \ 78 cmp((h)->data[_r], (h)->data[_r + 1])) \
79 _r++; \ 79 _r++; \
80 \ 80 \
81 if (cmp((h)->data[_r], (h)->data[_j])) \ 81 if (cmp((h)->data[_r], (h)->data[_j])) \
82 break; \ 82 break; \
83 heap_swap(h, _r, _j); \ 83 heap_swap(h, _r, _j); \
84 } \ 84 } \
85 } while (0) 85 } while (0)
86 86
87 #define heap_sift_down(h, i, cmp) \ 87 #define heap_sift_down(h, i, cmp) \
88 do { \ 88 do { \
89 while (i) { \ 89 while (i) { \
90 size_t p = (i - 1) / 2; \ 90 size_t p = (i - 1) / 2; \
91 if (cmp((h)->data[i], (h)->data[p])) \ 91 if (cmp((h)->data[i], (h)->data[p])) \
92 break; \ 92 break; \
93 heap_swap(h, i, p); \ 93 heap_swap(h, i, p); \
94 i = p; \ 94 i = p; \
95 } \ 95 } \
96 } while (0) 96 } while (0)
97 97
98 #define heap_add(h, d, cmp) \ 98 #define heap_add(h, d, cmp) \
99 ({ \ 99 ({ \
100 bool _r = !heap_full(h); \ 100 bool _r = !heap_full(h); \
101 if (_r) { \ 101 if (_r) { \
102 size_t _i = (h)->used++; \ 102 size_t _i = (h)->used++; \
103 (h)->data[_i] = d; \ 103 (h)->data[_i] = d; \
104 \ 104 \
105 heap_sift_down(h, _i, cmp); \ 105 heap_sift_down(h, _i, cmp); \
106 heap_sift(h, _i, cmp); \ 106 heap_sift(h, _i, cmp); \
107 } \ 107 } \
108 _r; \ 108 _r; \
109 }) 109 })
110 110
111 #define heap_pop(h, d, cmp) \ 111 #define heap_pop(h, d, cmp) \
112 ({ \ 112 ({ \
113 bool _r = (h)->used; \ 113 bool _r = (h)->used; \
114 if (_r) { \ 114 if (_r) { \
115 (d) = (h)->data[0]; \ 115 (d) = (h)->data[0]; \
116 (h)->used--; \ 116 (h)->used--; \
117 heap_swap(h, 0, (h)->used); \ 117 heap_swap(h, 0, (h)->used); \
118 heap_sift(h, 0, cmp); \ 118 heap_sift(h, 0, cmp); \
119 } \ 119 } \
120 _r; \ 120 _r; \
121 }) 121 })
122 122
123 #define heap_peek(h) ((h)->size ? (h)->data[0] : NULL) 123 #define heap_peek(h) ((h)->size ? (h)->data[0] : NULL)
124 124
125 #define heap_full(h) ((h)->used == (h)->size) 125 #define heap_full(h) ((h)->used == (h)->size)
126 126
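/*
 * Minimal usage sketch for the heap macros above (hypothetical caller, a
 * sketch rather than a canonical pattern): reading heap_sift() and
 * heap_add(), cmp(l, r) == true is taken to mean "l sorts below r", so a
 * "<" comparison keeps the largest element at data[0].  Assumes the
 * including file also provides <linux/slab.h>, as existing init_heap()
 * users do.
 */
static inline bool example_int_cmp(int l, int r)
{
	return l < r;
}

static inline int example_heap_usage(void)
{
	DECLARE_HEAP(int, prios);
	int top = 0;

	if (!init_heap(&prios, 16, GFP_KERNEL))
		return -ENOMEM;

	heap_add(&prios, 7, example_int_cmp);
	heap_add(&prios, 42, example_int_cmp);
	heap_add(&prios, 3, example_int_cmp);

	heap_pop(&prios, top, example_int_cmp);	/* top == 42 */

	free_heap(&prios);
	return top;
}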
127 #define DECLARE_FIFO(type, name) \ 127 #define DECLARE_FIFO(type, name) \
128 struct { \ 128 struct { \
129 size_t front, back, size, mask; \ 129 size_t front, back, size, mask; \
130 type *data; \ 130 type *data; \
131 } name 131 } name
132 132
133 #define fifo_for_each(c, fifo, iter) \ 133 #define fifo_for_each(c, fifo, iter) \
134 for (iter = (fifo)->front; \ 134 for (iter = (fifo)->front; \
135 c = (fifo)->data[iter], iter != (fifo)->back; \ 135 c = (fifo)->data[iter], iter != (fifo)->back; \
136 iter = (iter + 1) & (fifo)->mask) 136 iter = (iter + 1) & (fifo)->mask)
137 137
138 #define __init_fifo(fifo, gfp) \ 138 #define __init_fifo(fifo, gfp) \
139 ({ \ 139 ({ \
140 size_t _allocated_size, _bytes; \ 140 size_t _allocated_size, _bytes; \
141 BUG_ON(!(fifo)->size); \ 141 BUG_ON(!(fifo)->size); \
142 \ 142 \
143 _allocated_size = roundup_pow_of_two((fifo)->size + 1); \ 143 _allocated_size = roundup_pow_of_two((fifo)->size + 1); \
144 _bytes = _allocated_size * sizeof(*(fifo)->data); \ 144 _bytes = _allocated_size * sizeof(*(fifo)->data); \
145 \ 145 \
146 (fifo)->mask = _allocated_size - 1; \ 146 (fifo)->mask = _allocated_size - 1; \
147 (fifo)->front = (fifo)->back = 0; \ 147 (fifo)->front = (fifo)->back = 0; \
148 (fifo)->data = NULL; \ 148 (fifo)->data = NULL; \
149 \ 149 \
150 if (_bytes < KMALLOC_MAX_SIZE) \ 150 if (_bytes < KMALLOC_MAX_SIZE) \
151 (fifo)->data = kmalloc(_bytes, (gfp)); \ 151 (fifo)->data = kmalloc(_bytes, (gfp)); \
152 if ((!(fifo)->data) && ((gfp) & GFP_KERNEL)) \ 152 if ((!(fifo)->data) && ((gfp) & GFP_KERNEL)) \
153 (fifo)->data = vmalloc(_bytes); \ 153 (fifo)->data = vmalloc(_bytes); \
154 (fifo)->data; \ 154 (fifo)->data; \
155 }) 155 })
156 156
157 #define init_fifo_exact(fifo, _size, gfp) \ 157 #define init_fifo_exact(fifo, _size, gfp) \
158 ({ \ 158 ({ \
159 (fifo)->size = (_size); \ 159 (fifo)->size = (_size); \
160 __init_fifo(fifo, gfp); \ 160 __init_fifo(fifo, gfp); \
161 }) 161 })
162 162
163 #define init_fifo(fifo, _size, gfp) \ 163 #define init_fifo(fifo, _size, gfp) \
164 ({ \ 164 ({ \
165 (fifo)->size = (_size); \ 165 (fifo)->size = (_size); \
166 if ((fifo)->size > 4) \ 166 if ((fifo)->size > 4) \
167 (fifo)->size = roundup_pow_of_two((fifo)->size) - 1; \ 167 (fifo)->size = roundup_pow_of_two((fifo)->size) - 1; \
168 __init_fifo(fifo, gfp); \ 168 __init_fifo(fifo, gfp); \
169 }) 169 })
170 170
171 #define free_fifo(fifo) \ 171 #define free_fifo(fifo) \
172 do { \ 172 do { \
173 if (is_vmalloc_addr((fifo)->data)) \ 173 if (is_vmalloc_addr((fifo)->data)) \
174 vfree((fifo)->data); \ 174 vfree((fifo)->data); \
175 else \ 175 else \
176 kfree((fifo)->data); \ 176 kfree((fifo)->data); \
177 (fifo)->data = NULL; \ 177 (fifo)->data = NULL; \
178 } while (0) 178 } while (0)
179 179
180 #define fifo_used(fifo) (((fifo)->back - (fifo)->front) & (fifo)->mask) 180 #define fifo_used(fifo) (((fifo)->back - (fifo)->front) & (fifo)->mask)
181 #define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) 181 #define fifo_free(fifo) ((fifo)->size - fifo_used(fifo))
182 182
183 #define fifo_empty(fifo) (!fifo_used(fifo)) 183 #define fifo_empty(fifo) (!fifo_used(fifo))
184 #define fifo_full(fifo) (!fifo_free(fifo)) 184 #define fifo_full(fifo) (!fifo_free(fifo))
185 185
186 #define fifo_front(fifo) ((fifo)->data[(fifo)->front]) 186 #define fifo_front(fifo) ((fifo)->data[(fifo)->front])
187 #define fifo_back(fifo) \ 187 #define fifo_back(fifo) \
188 ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) 188 ((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
189 189
190 #define fifo_idx(fifo, p) (((p) - &fifo_front(fifo)) & (fifo)->mask) 190 #define fifo_idx(fifo, p) (((p) - &fifo_front(fifo)) & (fifo)->mask)
191 191
192 #define fifo_push_back(fifo, i) \ 192 #define fifo_push_back(fifo, i) \
193 ({ \ 193 ({ \
194 bool _r = !fifo_full((fifo)); \ 194 bool _r = !fifo_full((fifo)); \
195 if (_r) { \ 195 if (_r) { \
196 (fifo)->data[(fifo)->back++] = (i); \ 196 (fifo)->data[(fifo)->back++] = (i); \
197 (fifo)->back &= (fifo)->mask; \ 197 (fifo)->back &= (fifo)->mask; \
198 } \ 198 } \
199 _r; \ 199 _r; \
200 }) 200 })
201 201
202 #define fifo_pop_front(fifo, i) \ 202 #define fifo_pop_front(fifo, i) \
203 ({ \ 203 ({ \
204 bool _r = !fifo_empty((fifo)); \ 204 bool _r = !fifo_empty((fifo)); \
205 if (_r) { \ 205 if (_r) { \
206 (i) = (fifo)->data[(fifo)->front++]; \ 206 (i) = (fifo)->data[(fifo)->front++]; \
207 (fifo)->front &= (fifo)->mask; \ 207 (fifo)->front &= (fifo)->mask; \
208 } \ 208 } \
209 _r; \ 209 _r; \
210 }) 210 })
211 211
212 #define fifo_push_front(fifo, i) \ 212 #define fifo_push_front(fifo, i) \
213 ({ \ 213 ({ \
214 bool _r = !fifo_full((fifo)); \ 214 bool _r = !fifo_full((fifo)); \
215 if (_r) { \ 215 if (_r) { \
216 --(fifo)->front; \ 216 --(fifo)->front; \
217 (fifo)->front &= (fifo)->mask; \ 217 (fifo)->front &= (fifo)->mask; \
218 (fifo)->data[(fifo)->front] = (i); \ 218 (fifo)->data[(fifo)->front] = (i); \
219 } \ 219 } \
220 _r; \ 220 _r; \
221 }) 221 })
222 222
223 #define fifo_pop_back(fifo, i) \ 223 #define fifo_pop_back(fifo, i) \
224 ({ \ 224 ({ \
225 bool _r = !fifo_empty((fifo)); \ 225 bool _r = !fifo_empty((fifo)); \
226 if (_r) { \ 226 if (_r) { \
227 --(fifo)->back; \ 227 --(fifo)->back; \
228 (fifo)->back &= (fifo)->mask; \ 228 (fifo)->back &= (fifo)->mask; \
229 (i) = (fifo)->data[(fifo)->back]; \ 229 (i) = (fifo)->data[(fifo)->back]; \
230 } \ 230 } \
231 _r; \ 231 _r; \
232 }) 232 })
233 233
234 #define fifo_push(fifo, i) fifo_push_back(fifo, (i)) 234 #define fifo_push(fifo, i) fifo_push_back(fifo, (i))
235 #define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) 235 #define fifo_pop(fifo, i) fifo_pop_front(fifo, (i))
236 236
237 #define fifo_swap(l, r) \ 237 #define fifo_swap(l, r) \
238 do { \ 238 do { \
239 swap((l)->front, (r)->front); \ 239 swap((l)->front, (r)->front); \
240 swap((l)->back, (r)->back); \ 240 swap((l)->back, (r)->back); \
241 swap((l)->size, (r)->size); \ 241 swap((l)->size, (r)->size); \
242 swap((l)->mask, (r)->mask); \ 242 swap((l)->mask, (r)->mask); \
243 swap((l)->data, (r)->data); \ 243 swap((l)->data, (r)->data); \
244 } while (0) 244 } while (0)
245 245
246 #define fifo_move(dest, src) \ 246 #define fifo_move(dest, src) \
247 do { \ 247 do { \
248 typeof(*((dest)->data)) _t; \ 248 typeof(*((dest)->data)) _t; \
249 while (!fifo_full(dest) && \ 249 while (!fifo_full(dest) && \
250 fifo_pop(src, _t)) \ 250 fifo_pop(src, _t)) \
251 fifo_push(dest, _t); \ 251 fifo_push(dest, _t); \
252 } while (0) 252 } while (0)
253 253
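/*
 * Minimal usage sketch for the FIFO macros above (hypothetical caller):
 * fifo_push()/fifo_pop() default to pushing at the back and popping from
 * the front, giving FIFO order.  As with init_heap(), this assumes the
 * including file provides <linux/slab.h>.
 */
static inline unsigned example_fifo_usage(void)
{
	DECLARE_FIFO(unsigned, q);
	unsigned v = 0;

	if (!init_fifo(&q, 8, GFP_KERNEL))
		return 0;

	fifo_push(&q, 1);
	fifo_push(&q, 2);

	fifo_pop(&q, v);	/* v == 1 */

	free_fifo(&q);
	return v;
}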
254 /* 254 /*
255 * Simple array based allocator - preallocates a fixed number of elements; 255 * Simple array based allocator - preallocates a fixed number of elements;
256 * you can never allocate more than that, and there is no locking. 256 * you can never allocate more than that, and there is no locking.
257 * 257 *
258 * Handy because if you know you only need a fixed number of elements you don't 258 * Handy because if you know you only need a fixed number of elements you don't
259 * have to worry about memory allocation failure, and sometimes a mempool isn't 259 * have to worry about memory allocation failure, and sometimes a mempool isn't
260 * what you want. 260 * what you want.
261 * 261 *
262 * We treat the free elements as entries in a singly linked list, and the 262 * We treat the free elements as entries in a singly linked list, and the
263 * freelist as a stack - allocating and freeing push and pop off the freelist. 263 * freelist as a stack - allocating and freeing push and pop off the freelist.
264 */ 264 */
265 265
266 #define DECLARE_ARRAY_ALLOCATOR(type, name, size) \ 266 #define DECLARE_ARRAY_ALLOCATOR(type, name, size) \
267 struct { \ 267 struct { \
268 type *freelist; \ 268 type *freelist; \
269 type data[size]; \ 269 type data[size]; \
270 } name 270 } name
271 271
272 #define array_alloc(array) \ 272 #define array_alloc(array) \
273 ({ \ 273 ({ \
274 typeof((array)->freelist) _ret = (array)->freelist; \ 274 typeof((array)->freelist) _ret = (array)->freelist; \
275 \ 275 \
276 if (_ret) \ 276 if (_ret) \
277 (array)->freelist = *((typeof((array)->freelist) *) _ret);\ 277 (array)->freelist = *((typeof((array)->freelist) *) _ret);\
278 \ 278 \
279 _ret; \ 279 _ret; \
280 }) 280 })
281 281
282 #define array_free(array, ptr) \ 282 #define array_free(array, ptr) \
283 do { \ 283 do { \
284 typeof((array)->freelist) _ptr = ptr; \ 284 typeof((array)->freelist) _ptr = ptr; \
285 \ 285 \
286 *((typeof((array)->freelist) *) _ptr) = (array)->freelist; \ 286 *((typeof((array)->freelist) *) _ptr) = (array)->freelist; \
287 (array)->freelist = _ptr; \ 287 (array)->freelist = _ptr; \
288 } while (0) 288 } while (0)
289 289
290 #define array_allocator_init(array) \ 290 #define array_allocator_init(array) \
291 do { \ 291 do { \
292 typeof((array)->freelist) _i; \ 292 typeof((array)->freelist) _i; \
293 \ 293 \
294 BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *)); \ 294 BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *)); \
295 (array)->freelist = NULL; \ 295 (array)->freelist = NULL; \
296 \ 296 \
297 for (_i = (array)->data; \ 297 for (_i = (array)->data; \
298 _i < (array)->data + ARRAY_SIZE((array)->data); \ 298 _i < (array)->data + ARRAY_SIZE((array)->data); \
299 _i++) \ 299 _i++) \
300 array_free(array, _i); \ 300 array_free(array, _i); \
301 } while (0) 301 } while (0)
302 302
303 #define array_freelist_empty(array) ((array)->freelist == NULL) 303 #define array_freelist_empty(array) ((array)->freelist == NULL)
304 304
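/*
 * Minimal usage sketch for the array allocator described above
 * (hypothetical caller): all storage is embedded in the declaring object,
 * so there is nothing to allocate at init time and array_alloc() can only
 * fail once the fixed pool is exhausted.
 */
static inline void example_array_alloc_usage(void)
{
	DECLARE_ARRAY_ALLOCATOR(uint64_t, pool, 4);
	uint64_t *e;

	array_allocator_init(&pool);

	e = array_alloc(&pool);		/* NULL only when all 4 are in use */
	if (e) {
		*e = 42;
		array_free(&pool, e);
	}
}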
305 #define ANYSINT_MAX(t) \ 305 #define ANYSINT_MAX(t) \
306 ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) 306 ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
307 307
308 int bch_strtoint_h(const char *, int *); 308 int bch_strtoint_h(const char *, int *);
309 int bch_strtouint_h(const char *, unsigned int *); 309 int bch_strtouint_h(const char *, unsigned int *);
310 int bch_strtoll_h(const char *, long long *); 310 int bch_strtoll_h(const char *, long long *);
311 int bch_strtoull_h(const char *, unsigned long long *); 311 int bch_strtoull_h(const char *, unsigned long long *);
312 312
313 static inline int bch_strtol_h(const char *cp, long *res) 313 static inline int bch_strtol_h(const char *cp, long *res)
314 { 314 {
315 #if BITS_PER_LONG == 32 315 #if BITS_PER_LONG == 32
316 return bch_strtoint_h(cp, (int *) res); 316 return bch_strtoint_h(cp, (int *) res);
317 #else 317 #else
318 return bch_strtoll_h(cp, (long long *) res); 318 return bch_strtoll_h(cp, (long long *) res);
319 #endif 319 #endif
320 } 320 }
321 321
322 static inline int bch_strtoul_h(const char *cp, long *res) 322 static inline int bch_strtoul_h(const char *cp, long *res)
323 { 323 {
324 #if BITS_PER_LONG == 32 324 #if BITS_PER_LONG == 32
325 return bch_strtouint_h(cp, (unsigned int *) res); 325 return bch_strtouint_h(cp, (unsigned int *) res);
326 #else 326 #else
327 return bch_strtoull_h(cp, (unsigned long long *) res); 327 return bch_strtoull_h(cp, (unsigned long long *) res);
328 #endif 328 #endif
329 } 329 }
330 330
331 #define strtoi_h(cp, res) \ 331 #define strtoi_h(cp, res) \
332 (__builtin_types_compatible_p(typeof(*res), int) \ 332 (__builtin_types_compatible_p(typeof(*res), int) \
333 ? bch_strtoint_h(cp, (void *) res) \ 333 ? bch_strtoint_h(cp, (void *) res) \
334 : __builtin_types_compatible_p(typeof(*res), long) \ 334 : __builtin_types_compatible_p(typeof(*res), long) \
335 ? bch_strtol_h(cp, (void *) res) \ 335 ? bch_strtol_h(cp, (void *) res) \
336 : __builtin_types_compatible_p(typeof(*res), long long) \ 336 : __builtin_types_compatible_p(typeof(*res), long long) \
337 ? bch_strtoll_h(cp, (void *) res) \ 337 ? bch_strtoll_h(cp, (void *) res) \
338 : __builtin_types_compatible_p(typeof(*res), unsigned int) \ 338 : __builtin_types_compatible_p(typeof(*res), unsigned int) \
339 ? bch_strtouint_h(cp, (void *) res) \ 339 ? bch_strtouint_h(cp, (void *) res) \
340 : __builtin_types_compatible_p(typeof(*res), unsigned long) \ 340 : __builtin_types_compatible_p(typeof(*res), unsigned long) \
341 ? bch_strtoul_h(cp, (void *) res) \ 341 ? bch_strtoul_h(cp, (void *) res) \
342 : __builtin_types_compatible_p(typeof(*res), unsigned long long)\ 342 : __builtin_types_compatible_p(typeof(*res), unsigned long long)\
343 ? bch_strtoull_h(cp, (void *) res) : -EINVAL) 343 ? bch_strtoull_h(cp, (void *) res) : -EINVAL)
344 344
345 #define strtoul_safe(cp, var) \ 345 #define strtoul_safe(cp, var) \
346 ({ \ 346 ({ \
347 unsigned long _v; \ 347 unsigned long _v; \
348 int _r = kstrtoul(cp, 10, &_v); \ 348 int _r = kstrtoul(cp, 10, &_v); \
349 if (!_r) \ 349 if (!_r) \
350 var = _v; \ 350 var = _v; \
351 _r; \ 351 _r; \
352 }) 352 })
353 353
354 #define strtoul_safe_clamp(cp, var, min, max) \ 354 #define strtoul_safe_clamp(cp, var, min, max) \
355 ({ \ 355 ({ \
356 unsigned long _v; \ 356 unsigned long _v; \
357 int _r = kstrtoul(cp, 10, &_v); \ 357 int _r = kstrtoul(cp, 10, &_v); \
358 if (!_r) \ 358 if (!_r) \
359 var = clamp_t(typeof(var), _v, min, max); \ 359 var = clamp_t(typeof(var), _v, min, max); \
360 _r; \ 360 _r; \
361 }) 361 })
362 362
363 #define snprint(buf, size, var) \ 363 #define snprint(buf, size, var) \
364 snprintf(buf, size, \ 364 snprintf(buf, size, \
365 __builtin_types_compatible_p(typeof(var), int) \ 365 __builtin_types_compatible_p(typeof(var), int) \
366 ? "%i\n" : \ 366 ? "%i\n" : \
367 __builtin_types_compatible_p(typeof(var), unsigned) \ 367 __builtin_types_compatible_p(typeof(var), unsigned) \
368 ? "%u\n" : \ 368 ? "%u\n" : \
369 __builtin_types_compatible_p(typeof(var), long) \ 369 __builtin_types_compatible_p(typeof(var), long) \
370 ? "%li\n" : \ 370 ? "%li\n" : \
371 __builtin_types_compatible_p(typeof(var), unsigned long)\ 371 __builtin_types_compatible_p(typeof(var), unsigned long)\
372 ? "%lu\n" : \ 372 ? "%lu\n" : \
373 __builtin_types_compatible_p(typeof(var), int64_t) \ 373 __builtin_types_compatible_p(typeof(var), int64_t) \
374 ? "%lli\n" : \ 374 ? "%lli\n" : \
375 __builtin_types_compatible_p(typeof(var), uint64_t) \ 375 __builtin_types_compatible_p(typeof(var), uint64_t) \
376 ? "%llu\n" : \ 376 ? "%llu\n" : \
377 __builtin_types_compatible_p(typeof(var), const char *) \ 377 __builtin_types_compatible_p(typeof(var), const char *) \
378 ? "%s\n" : "%i\n", var) 378 ? "%s\n" : "%i\n", var)
379 379
380 ssize_t bch_hprint(char *buf, int64_t v); 380 ssize_t bch_hprint(char *buf, int64_t v);
381 381
382 bool bch_is_zero(const char *p, size_t n); 382 bool bch_is_zero(const char *p, size_t n);
383 int bch_parse_uuid(const char *s, char *uuid); 383 int bch_parse_uuid(const char *s, char *uuid);
384 384
385 ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], 385 ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
386 size_t selected); 386 size_t selected);
387 387
388 ssize_t bch_read_string_list(const char *buf, const char * const list[]); 388 ssize_t bch_read_string_list(const char *buf, const char * const list[]);
389 389
390 struct time_stats { 390 struct time_stats {
391 /* 391 /*
392 * all fields are in nanoseconds, averages are ewmas stored left shifted 392 * all fields are in nanoseconds, averages are ewmas stored left shifted
393 * by 8 393 * by 8
394 */ 394 */
395 uint64_t max_duration; 395 uint64_t max_duration;
396 uint64_t average_duration; 396 uint64_t average_duration;
397 uint64_t average_frequency; 397 uint64_t average_frequency;
398 uint64_t last; 398 uint64_t last;
399 }; 399 };
400 400
401 void bch_time_stats_update(struct time_stats *stats, uint64_t time); 401 void bch_time_stats_update(struct time_stats *stats, uint64_t time);
402 402
403 #define NSEC_PER_ns 1L 403 #define NSEC_PER_ns 1L
404 #define NSEC_PER_us NSEC_PER_USEC 404 #define NSEC_PER_us NSEC_PER_USEC
405 #define NSEC_PER_ms NSEC_PER_MSEC 405 #define NSEC_PER_ms NSEC_PER_MSEC
406 #define NSEC_PER_sec NSEC_PER_SEC 406 #define NSEC_PER_sec NSEC_PER_SEC
407 407
408 #define __print_time_stat(stats, name, stat, units) \ 408 #define __print_time_stat(stats, name, stat, units) \
409 sysfs_print(name ## _ ## stat ## _ ## units, \ 409 sysfs_print(name ## _ ## stat ## _ ## units, \
410 div_u64((stats)->stat >> 8, NSEC_PER_ ## units)) 410 div_u64((stats)->stat >> 8, NSEC_PER_ ## units))
411 411
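/*
 * Worked example for the scaling above: if the ewma of durations is 3 ms,
 * average_duration holds 3000000 << 8, and __print_time_stat() with
 * duration_units == ms computes div_u64((3000000 << 8) >> 8, NSEC_PER_ms)
 * == 3; the shift by 8 undoes the fixed-point ewma scaling and NSEC_PER_*
 * converts nanoseconds to the requested unit.
 */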
412 #define sysfs_print_time_stats(stats, name, \ 412 #define sysfs_print_time_stats(stats, name, \
413 frequency_units, \ 413 frequency_units, \
414 duration_units) \ 414 duration_units) \
415 do { \ 415 do { \
416 __print_time_stat(stats, name, \ 416 __print_time_stat(stats, name, \
417 average_frequency, frequency_units); \ 417 average_frequency, frequency_units); \
418 __print_time_stat(stats, name, \ 418 __print_time_stat(stats, name, \
419 average_duration, duration_units); \ 419 average_duration, duration_units); \
420 __print_time_stat(stats, name, \ 420 __print_time_stat(stats, name, \
421 max_duration, duration_units); \ 421 max_duration, duration_units); \
422 \ 422 \
423 sysfs_print(name ## _last_ ## frequency_units, (stats)->last \ 423 sysfs_print(name ## _last_ ## frequency_units, (stats)->last \
424 ? div_s64(local_clock() - (stats)->last, \ 424 ? div_s64(local_clock() - (stats)->last, \
425 NSEC_PER_ ## frequency_units) \ 425 NSEC_PER_ ## frequency_units) \
426 : -1LL); \ 426 : -1LL); \
427 } while (0) 427 } while (0)
428 428
429 #define sysfs_time_stats_attribute(name, \ 429 #define sysfs_time_stats_attribute(name, \
430 frequency_units, \ 430 frequency_units, \
431 duration_units) \ 431 duration_units) \
432 read_attribute(name ## _average_frequency_ ## frequency_units); \ 432 read_attribute(name ## _average_frequency_ ## frequency_units); \
433 read_attribute(name ## _average_duration_ ## duration_units); \ 433 read_attribute(name ## _average_duration_ ## duration_units); \
434 read_attribute(name ## _max_duration_ ## duration_units); \ 434 read_attribute(name ## _max_duration_ ## duration_units); \
435 read_attribute(name ## _last_ ## frequency_units) 435 read_attribute(name ## _last_ ## frequency_units)
436 436
437 #define sysfs_time_stats_attribute_list(name, \ 437 #define sysfs_time_stats_attribute_list(name, \
438 frequency_units, \ 438 frequency_units, \
439 duration_units) \ 439 duration_units) \
440 &sysfs_ ## name ## _average_frequency_ ## frequency_units, \ 440 &sysfs_ ## name ## _average_frequency_ ## frequency_units, \
441 &sysfs_ ## name ## _average_duration_ ## duration_units, \ 441 &sysfs_ ## name ## _average_duration_ ## duration_units, \
442 &sysfs_ ## name ## _max_duration_ ## duration_units, \ 442 &sysfs_ ## name ## _max_duration_ ## duration_units, \
443 &sysfs_ ## name ## _last_ ## frequency_units, 443 &sysfs_ ## name ## _last_ ## frequency_units,
444 444
445 #define ewma_add(ewma, val, weight, factor) \ 445 #define ewma_add(ewma, val, weight, factor) \
446 ({ \ 446 ({ \
447 (ewma) *= (weight) - 1; \ 447 (ewma) *= (weight) - 1; \
448 (ewma) += (val) << factor; \ 448 (ewma) += (val) << factor; \
449 (ewma) /= (weight); \ 449 (ewma) /= (weight); \
450 (ewma) >> factor; \ 450 (ewma) >> factor; \
451 }) 451 })
452 452
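/*
 * Worked example for ewma_add() above: the writeback code calls
 * ewma_add(dc->disk.sectors_dirty_derivative, derivative,
 * dc->writeback_rate_d_smooth, 0) with the default d_smooth of 8, which
 * reduces to
 *
 *	new = (old * 7 + val) / 8
 *
 * so roughly 1/8th of each new sample is mixed in.  With a nonzero factor
 * the stored ewma stays left shifted by that many bits, while the
 * expression itself evaluates to the unshifted value.
 */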
453 struct ratelimit { 453 struct ratelimit {
454 uint64_t next; 454 uint64_t next;
455 unsigned rate; 455 unsigned rate;
456 }; 456 };
457 457
458 static inline void ratelimit_reset(struct ratelimit *d) 458 static inline void ratelimit_reset(struct ratelimit *d)
459 { 459 {
460 d->next = local_clock(); 460 d->next = local_clock();
461 } 461 }
462 462
463 unsigned bch_next_delay(struct ratelimit *d, uint64_t done); 463 unsigned bch_next_delay(struct ratelimit *d, uint64_t done);
464 464
465 #define __DIV_SAFE(n, d, zero) \ 465 #define __DIV_SAFE(n, d, zero) \
466 ({ \ 466 ({ \
467 typeof(n) _n = (n); \ 467 typeof(n) _n = (n); \
468 typeof(d) _d = (d); \ 468 typeof(d) _d = (d); \
469 _d ? _n / _d : zero; \ 469 _d ? _n / _d : zero; \
470 }) 470 })
471 471
472 #define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0) 472 #define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0)
473 473
474 #define container_of_or_null(ptr, type, member) \ 474 #define container_of_or_null(ptr, type, member) \
475 ({ \ 475 ({ \
476 typeof(ptr) _ptr = ptr; \ 476 typeof(ptr) _ptr = ptr; \
477 _ptr ? container_of(_ptr, type, member) : NULL; \ 477 _ptr ? container_of(_ptr, type, member) : NULL; \
478 }) 478 })
479 479
480 #define RB_INSERT(root, new, member, cmp) \ 480 #define RB_INSERT(root, new, member, cmp) \
481 ({ \ 481 ({ \
482 __label__ dup; \ 482 __label__ dup; \
483 struct rb_node **n = &(root)->rb_node, *parent = NULL; \ 483 struct rb_node **n = &(root)->rb_node, *parent = NULL; \
484 typeof(new) this; \ 484 typeof(new) this; \
485 int res, ret = -1; \ 485 int res, ret = -1; \
486 \ 486 \
487 while (*n) { \ 487 while (*n) { \
488 parent = *n; \ 488 parent = *n; \
489 this = container_of(*n, typeof(*(new)), member); \ 489 this = container_of(*n, typeof(*(new)), member); \
490 res = cmp(new, this); \ 490 res = cmp(new, this); \
491 if (!res) \ 491 if (!res) \
492 goto dup; \ 492 goto dup; \
493 n = res < 0 \ 493 n = res < 0 \
494 ? &(*n)->rb_left \ 494 ? &(*n)->rb_left \
495 : &(*n)->rb_right; \ 495 : &(*n)->rb_right; \
496 } \ 496 } \
497 \ 497 \
498 rb_link_node(&(new)->member, parent, n); \ 498 rb_link_node(&(new)->member, parent, n); \
499 rb_insert_color(&(new)->member, root); \ 499 rb_insert_color(&(new)->member, root); \
500 ret = 0; \ 500 ret = 0; \
501 dup: \ 501 dup: \
502 ret; \ 502 ret; \
503 }) 503 })
504 504
505 #define RB_SEARCH(root, search, member, cmp) \ 505 #define RB_SEARCH(root, search, member, cmp) \
506 ({ \ 506 ({ \
507 struct rb_node *n = (root)->rb_node; \ 507 struct rb_node *n = (root)->rb_node; \
508 typeof(&(search)) this, ret = NULL; \ 508 typeof(&(search)) this, ret = NULL; \
509 int res; \ 509 int res; \
510 \ 510 \
511 while (n) { \ 511 while (n) { \
512 this = container_of(n, typeof(search), member); \ 512 this = container_of(n, typeof(search), member); \
513 res = cmp(&(search), this); \ 513 res = cmp(&(search), this); \
514 if (!res) { \ 514 if (!res) { \
515 ret = this; \ 515 ret = this; \
516 break; \ 516 break; \
517 } \ 517 } \
518 n = res < 0 \ 518 n = res < 0 \
519 ? n->rb_left \ 519 ? n->rb_left \
520 : n->rb_right; \ 520 : n->rb_right; \
521 } \ 521 } \
522 ret; \ 522 ret; \
523 }) 523 })
524 524
525 #define RB_GREATER(root, search, member, cmp) \ 525 #define RB_GREATER(root, search, member, cmp) \
526 ({ \ 526 ({ \
527 struct rb_node *n = (root)->rb_node; \ 527 struct rb_node *n = (root)->rb_node; \
528 typeof(&(search)) this, ret = NULL; \ 528 typeof(&(search)) this, ret = NULL; \
529 int res; \ 529 int res; \
530 \ 530 \
531 while (n) { \ 531 while (n) { \
532 this = container_of(n, typeof(search), member); \ 532 this = container_of(n, typeof(search), member); \
533 res = cmp(&(search), this); \ 533 res = cmp(&(search), this); \
534 if (res < 0) { \ 534 if (res < 0) { \
535 ret = this; \ 535 ret = this; \
536 n = n->rb_left; \ 536 n = n->rb_left; \
537 } else \ 537 } else \
538 n = n->rb_right; \ 538 n = n->rb_right; \
539 } \ 539 } \
540 ret; \ 540 ret; \
541 }) 541 })
542 542
543 #define RB_FIRST(root, type, member) \ 543 #define RB_FIRST(root, type, member) \
544 container_of_or_null(rb_first(root), type, member) 544 container_of_or_null(rb_first(root), type, member)
545 545
546 #define RB_LAST(root, type, member) \ 546 #define RB_LAST(root, type, member) \
547 container_of_or_null(rb_last(root), type, member) 547 container_of_or_null(rb_last(root), type, member)
548 548
549 #define RB_NEXT(ptr, member) \ 549 #define RB_NEXT(ptr, member) \
550 container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member) 550 container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member)
551 551
552 #define RB_PREV(ptr, member) \ 552 #define RB_PREV(ptr, member) \
553 container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member) 553 container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member)
554 554
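/*
 * Usage sketch for the RB_* helpers above (hypothetical types; util.h
 * itself does not pull in <linux/rbtree.h>, so callers are expected to):
 * cmp() follows memcmp() convention and is handed pointers to the
 * containing structs.
 *
 *	struct example_entry {
 *		struct rb_node	node;
 *		uint64_t	key;
 *	};
 *
 *	#define example_cmp(l, r)	((l)->key < (r)->key ? -1 :	\
 *					 (l)->key > (r)->key ?  1 : 0)
 *
 *	ret = RB_INSERT(&root, new, node, example_cmp);
 *	// ret == -1 if an entry with an equal key was already present
 *
 *	struct example_entry k = { .key = 5 };
 *	found = RB_SEARCH(&root, k, node, example_cmp);
 */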
555 /* Does linear interpolation between powers of two */ 555 /* Does linear interpolation between powers of two */
556 static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) 556 static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
557 { 557 {
558 unsigned fract = x & ~(~0 << fract_bits); 558 unsigned fract = x & ~(~0 << fract_bits);
559 559
560 x >>= fract_bits; 560 x >>= fract_bits;
561 x = 1 << x; 561 x = 1 << x;
562 x += (x * fract) >> fract_bits; 562 x += (x * fract) >> fract_bits;
563 563
564 return x; 564 return x;
565 } 565 }
566 566
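/*
 * Worked example for fract_exp_two() above: with fract_bits == 3 and
 * x == 20, the integer part is 20 >> 3 == 2 and the fraction is
 * (20 & 7) / 8 == 4/8, so the result is (1 << 2) + ((4 * 4) >> 3) == 6,
 * linearly halfway between 2^2 == 4 and 2^3 == 8.
 */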
567 #define bio_end(bio) ((bio)->bi_sector + bio_sectors(bio))
568
569 void bch_bio_map(struct bio *bio, void *base); 567 void bch_bio_map(struct bio *bio, void *base);
570
571 int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp);
572 568
573 static inline sector_t bdev_sectors(struct block_device *bdev) 569 static inline sector_t bdev_sectors(struct block_device *bdev)
574 { 570 {
575 return bdev->bd_inode->i_size >> 9; 571 return bdev->bd_inode->i_size >> 9;
576 } 572 }
577 573
578 #define closure_bio_submit(bio, cl, dev) \ 574 #define closure_bio_submit(bio, cl, dev) \
579 do { \ 575 do { \
580 closure_get(cl); \ 576 closure_get(cl); \
581 bch_generic_make_request(bio, &(dev)->bio_split_hook); \ 577 bch_generic_make_request(bio, &(dev)->bio_split_hook); \
582 } while (0) 578 } while (0)
583 579
584 uint64_t bch_crc64_update(uint64_t, const void *, size_t); 580 uint64_t bch_crc64_update(uint64_t, const void *, size_t);
585 uint64_t bch_crc64(const void *, size_t); 581 uint64_t bch_crc64(const void *, size_t);
586 582
587 #endif /* _BCACHE_UTIL_H */ 583 #endif /* _BCACHE_UTIL_H */
588 584
drivers/md/bcache/writeback.c
1 /* 1 /*
2 * background writeback - scan btree for dirty data and write it to the backing 2 * background writeback - scan btree for dirty data and write it to the backing
3 * device 3 * device
4 * 4 *
5 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> 5 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6 * Copyright 2012 Google, Inc. 6 * Copyright 2012 Google, Inc.
7 */ 7 */
8 8
9 #include "bcache.h" 9 #include "bcache.h"
10 #include "btree.h" 10 #include "btree.h"
11 #include "debug.h" 11 #include "debug.h"
12 #include "writeback.h" 12 #include "writeback.h"
13 13
14 #include <trace/events/bcache.h> 14 #include <trace/events/bcache.h>
15 15
16 static struct workqueue_struct *dirty_wq; 16 static struct workqueue_struct *dirty_wq;
17 17
18 static void read_dirty(struct closure *); 18 static void read_dirty(struct closure *);
19 19
20 struct dirty_io { 20 struct dirty_io {
21 struct closure cl; 21 struct closure cl;
22 struct cached_dev *dc; 22 struct cached_dev *dc;
23 struct bio bio; 23 struct bio bio;
24 }; 24 };
25 25
26 /* Rate limiting */ 26 /* Rate limiting */
27 27
28 static void __update_writeback_rate(struct cached_dev *dc) 28 static void __update_writeback_rate(struct cached_dev *dc)
29 { 29 {
30 struct cache_set *c = dc->disk.c; 30 struct cache_set *c = dc->disk.c;
31 uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size; 31 uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
32 uint64_t cache_dirty_target = 32 uint64_t cache_dirty_target =
33 div_u64(cache_sectors * dc->writeback_percent, 100); 33 div_u64(cache_sectors * dc->writeback_percent, 100);
34 34
35 int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), 35 int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
36 c->cached_dev_sectors); 36 c->cached_dev_sectors);
37 37
38 /* PD controller */ 38 /* PD controller */
39 39
40 int change = 0; 40 int change = 0;
41 int64_t error; 41 int64_t error;
42 int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); 42 int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
43 int64_t derivative = dirty - dc->disk.sectors_dirty_last; 43 int64_t derivative = dirty - dc->disk.sectors_dirty_last;
44 44
45 dc->disk.sectors_dirty_last = dirty; 45 dc->disk.sectors_dirty_last = dirty;
46 46
47 derivative *= dc->writeback_rate_d_term; 47 derivative *= dc->writeback_rate_d_term;
48 derivative = clamp(derivative, -dirty, dirty); 48 derivative = clamp(derivative, -dirty, dirty);
49 49
50 derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, 50 derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
51 dc->writeback_rate_d_smooth, 0); 51 dc->writeback_rate_d_smooth, 0);
52 52
53 /* Avoid divide by zero */ 53 /* Avoid divide by zero */
54 if (!target) 54 if (!target)
55 goto out; 55 goto out;
56 56
57 error = div64_s64((dirty + derivative - target) << 8, target); 57 error = div64_s64((dirty + derivative - target) << 8, target);
58 58
59 change = div_s64((dc->writeback_rate.rate * error) >> 8, 59 change = div_s64((dc->writeback_rate.rate * error) >> 8,
60 dc->writeback_rate_p_term_inverse); 60 dc->writeback_rate_p_term_inverse);
61 61
62 /* Don't increase writeback rate if the device isn't keeping up */ 62 /* Don't increase writeback rate if the device isn't keeping up */
63 if (change > 0 && 63 if (change > 0 &&
64 time_after64(local_clock(), 64 time_after64(local_clock(),
65 dc->writeback_rate.next + 10 * NSEC_PER_MSEC)) 65 dc->writeback_rate.next + 10 * NSEC_PER_MSEC))
66 change = 0; 66 change = 0;
67 67
68 dc->writeback_rate.rate = 68 dc->writeback_rate.rate =
69 clamp_t(int64_t, dc->writeback_rate.rate + change, 69 clamp_t(int64_t, dc->writeback_rate.rate + change,
70 1, NSEC_PER_MSEC); 70 1, NSEC_PER_MSEC);
71 out: 71 out:
72 dc->writeback_rate_derivative = derivative; 72 dc->writeback_rate_derivative = derivative;
73 dc->writeback_rate_change = change; 73 dc->writeback_rate_change = change;
74 dc->writeback_rate_target = target; 74 dc->writeback_rate_target = target;
75 75
76 schedule_delayed_work(&dc->writeback_rate_update, 76 schedule_delayed_work(&dc->writeback_rate_update,
77 dc->writeback_rate_update_seconds * HZ); 77 dc->writeback_rate_update_seconds * HZ);
78 } 78 }
79 79
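/*
 * Worked example for the PD controller above, using the defaults from
 * bch_cached_dev_writeback_init() (writeback_rate.rate == 1024,
 * p_term_inverse == 64) and hypothetical inputs of target == 1000
 * sectors, dirty == 1500 and a zero derivative:
 *
 *	error  = ((1500 + 0 - 1000) << 8) / 1000	= 128
 *	change = ((1024 * 128) >> 8) / 64		= 8
 *
 * so the rate is nudged from 1024 to 1032, unless the "isn't keeping up"
 * check zeroes the change first.
 */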
80 static void update_writeback_rate(struct work_struct *work) 80 static void update_writeback_rate(struct work_struct *work)
81 { 81 {
82 struct cached_dev *dc = container_of(to_delayed_work(work), 82 struct cached_dev *dc = container_of(to_delayed_work(work),
83 struct cached_dev, 83 struct cached_dev,
84 writeback_rate_update); 84 writeback_rate_update);
85 85
86 down_read(&dc->writeback_lock); 86 down_read(&dc->writeback_lock);
87 87
88 if (atomic_read(&dc->has_dirty) && 88 if (atomic_read(&dc->has_dirty) &&
89 dc->writeback_percent) 89 dc->writeback_percent)
90 __update_writeback_rate(dc); 90 __update_writeback_rate(dc);
91 91
92 up_read(&dc->writeback_lock); 92 up_read(&dc->writeback_lock);
93 } 93 }
94 94
95 static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) 95 static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
96 { 96 {
97 if (atomic_read(&dc->disk.detaching) || 97 if (atomic_read(&dc->disk.detaching) ||
98 !dc->writeback_percent) 98 !dc->writeback_percent)
99 return 0; 99 return 0;
100 100
101 return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); 101 return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
102 } 102 }
103 103
104 /* Background writeback */ 104 /* Background writeback */
105 105
106 static bool dirty_pred(struct keybuf *buf, struct bkey *k) 106 static bool dirty_pred(struct keybuf *buf, struct bkey *k)
107 { 107 {
108 return KEY_DIRTY(k); 108 return KEY_DIRTY(k);
109 } 109 }
110 110
111 static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) 111 static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k)
112 { 112 {
113 uint64_t stripe; 113 uint64_t stripe;
114 unsigned nr_sectors = KEY_SIZE(k); 114 unsigned nr_sectors = KEY_SIZE(k);
115 struct cached_dev *dc = container_of(buf, struct cached_dev, 115 struct cached_dev *dc = container_of(buf, struct cached_dev,
116 writeback_keys); 116 writeback_keys);
117 unsigned stripe_size = 1 << dc->disk.stripe_size_bits; 117 unsigned stripe_size = 1 << dc->disk.stripe_size_bits;
118 118
119 if (!KEY_DIRTY(k)) 119 if (!KEY_DIRTY(k))
120 return false; 120 return false;
121 121
122 stripe = KEY_START(k) >> dc->disk.stripe_size_bits; 122 stripe = KEY_START(k) >> dc->disk.stripe_size_bits;
123 while (1) { 123 while (1) {
124 if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) != 124 if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) !=
125 stripe_size) 125 stripe_size)
126 return false; 126 return false;
127 127
128 if (nr_sectors <= stripe_size) 128 if (nr_sectors <= stripe_size)
129 return true; 129 return true;
130 130
131 nr_sectors -= stripe_size; 131 nr_sectors -= stripe_size;
132 stripe++; 132 stripe++;
133 } 133 }
134 } 134 }
135 135
136 static void dirty_init(struct keybuf_key *w) 136 static void dirty_init(struct keybuf_key *w)
137 { 137 {
138 struct dirty_io *io = w->private; 138 struct dirty_io *io = w->private;
139 struct bio *bio = &io->bio; 139 struct bio *bio = &io->bio;
140 140
141 bio_init(bio); 141 bio_init(bio);
142 if (!io->dc->writeback_percent) 142 if (!io->dc->writeback_percent)
143 bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); 143 bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
144 144
145 bio->bi_size = KEY_SIZE(&w->key) << 9; 145 bio->bi_size = KEY_SIZE(&w->key) << 9;
146 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS); 146 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS);
147 bio->bi_private = w; 147 bio->bi_private = w;
148 bio->bi_io_vec = bio->bi_inline_vecs; 148 bio->bi_io_vec = bio->bi_inline_vecs;
149 bch_bio_map(bio, NULL); 149 bch_bio_map(bio, NULL);
150 } 150 }
151 151
152 static void refill_dirty(struct closure *cl) 152 static void refill_dirty(struct closure *cl)
153 { 153 {
154 struct cached_dev *dc = container_of(cl, struct cached_dev, 154 struct cached_dev *dc = container_of(cl, struct cached_dev,
155 writeback.cl); 155 writeback.cl);
156 struct keybuf *buf = &dc->writeback_keys; 156 struct keybuf *buf = &dc->writeback_keys;
157 bool searched_from_start = false; 157 bool searched_from_start = false;
158 struct bkey end = MAX_KEY; 158 struct bkey end = MAX_KEY;
159 SET_KEY_INODE(&end, dc->disk.id); 159 SET_KEY_INODE(&end, dc->disk.id);
160 160
161 if (!atomic_read(&dc->disk.detaching) && 161 if (!atomic_read(&dc->disk.detaching) &&
162 !dc->writeback_running) 162 !dc->writeback_running)
163 closure_return(cl); 163 closure_return(cl);
164 164
165 down_write(&dc->writeback_lock); 165 down_write(&dc->writeback_lock);
166 166
167 if (!atomic_read(&dc->has_dirty)) { 167 if (!atomic_read(&dc->has_dirty)) {
168 SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); 168 SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
169 bch_write_bdev_super(dc, NULL); 169 bch_write_bdev_super(dc, NULL);
170 170
171 up_write(&dc->writeback_lock); 171 up_write(&dc->writeback_lock);
172 closure_return(cl); 172 closure_return(cl);
173 } 173 }
174 174
175 if (bkey_cmp(&buf->last_scanned, &end) >= 0) { 175 if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
176 buf->last_scanned = KEY(dc->disk.id, 0, 0); 176 buf->last_scanned = KEY(dc->disk.id, 0, 0);
177 searched_from_start = true; 177 searched_from_start = true;
178 } 178 }
179 179
180 if (dc->partial_stripes_expensive) { 180 if (dc->partial_stripes_expensive) {
181 uint64_t i; 181 uint64_t i;
182 182
183 for (i = 0; i < dc->disk.nr_stripes; i++) 183 for (i = 0; i < dc->disk.nr_stripes; i++)
184 if (atomic_read(dc->disk.stripe_sectors_dirty + i) == 184 if (atomic_read(dc->disk.stripe_sectors_dirty + i) ==
185 1 << dc->disk.stripe_size_bits) 185 1 << dc->disk.stripe_size_bits)
186 goto full_stripes; 186 goto full_stripes;
187 187
188 goto normal_refill; 188 goto normal_refill;
189 full_stripes: 189 full_stripes:
190 bch_refill_keybuf(dc->disk.c, buf, &end, 190 bch_refill_keybuf(dc->disk.c, buf, &end,
191 dirty_full_stripe_pred); 191 dirty_full_stripe_pred);
192 } else { 192 } else {
193 normal_refill: 193 normal_refill:
194 bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); 194 bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
195 } 195 }
196 196
197 if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { 197 if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
198 /* Searched the entire btree - delay awhile */ 198 /* Searched the entire btree - delay awhile */
199 199
200 if (RB_EMPTY_ROOT(&buf->keys)) { 200 if (RB_EMPTY_ROOT(&buf->keys)) {
201 atomic_set(&dc->has_dirty, 0); 201 atomic_set(&dc->has_dirty, 0);
202 cached_dev_put(dc); 202 cached_dev_put(dc);
203 } 203 }
204 204
205 if (!atomic_read(&dc->disk.detaching)) 205 if (!atomic_read(&dc->disk.detaching))
206 closure_delay(&dc->writeback, dc->writeback_delay * HZ); 206 closure_delay(&dc->writeback, dc->writeback_delay * HZ);
207 } 207 }
208 208
209 up_write(&dc->writeback_lock); 209 up_write(&dc->writeback_lock);
210 210
211 ratelimit_reset(&dc->writeback_rate); 211 ratelimit_reset(&dc->writeback_rate);
212 212
213 /* Punt to workqueue only so we don't recurse and blow the stack */ 213 /* Punt to workqueue only so we don't recurse and blow the stack */
214 continue_at(cl, read_dirty, dirty_wq); 214 continue_at(cl, read_dirty, dirty_wq);
215 } 215 }
216 216
217 void bch_writeback_queue(struct cached_dev *dc) 217 void bch_writeback_queue(struct cached_dev *dc)
218 { 218 {
219 if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) { 219 if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) {
220 if (!atomic_read(&dc->disk.detaching)) 220 if (!atomic_read(&dc->disk.detaching))
221 closure_delay(&dc->writeback, dc->writeback_delay * HZ); 221 closure_delay(&dc->writeback, dc->writeback_delay * HZ);
222 222
223 continue_at(&dc->writeback.cl, refill_dirty, dirty_wq); 223 continue_at(&dc->writeback.cl, refill_dirty, dirty_wq);
224 } 224 }
225 } 225 }
226 226
227 void bch_writeback_add(struct cached_dev *dc) 227 void bch_writeback_add(struct cached_dev *dc)
228 { 228 {
229 if (!atomic_read(&dc->has_dirty) && 229 if (!atomic_read(&dc->has_dirty) &&
230 !atomic_xchg(&dc->has_dirty, 1)) { 230 !atomic_xchg(&dc->has_dirty, 1)) {
231 atomic_inc(&dc->count); 231 atomic_inc(&dc->count);
232 232
233 if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { 233 if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
234 SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); 234 SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
235 /* XXX: should do this synchronously */ 235 /* XXX: should do this synchronously */
236 bch_write_bdev_super(dc, NULL); 236 bch_write_bdev_super(dc, NULL);
237 } 237 }
238 238
239 bch_writeback_queue(dc); 239 bch_writeback_queue(dc);
240 240
241 if (dc->writeback_percent) 241 if (dc->writeback_percent)
242 schedule_delayed_work(&dc->writeback_rate_update, 242 schedule_delayed_work(&dc->writeback_rate_update,
243 dc->writeback_rate_update_seconds * HZ); 243 dc->writeback_rate_update_seconds * HZ);
244 } 244 }
245 } 245 }
246 246
247 void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, 247 void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
248 uint64_t offset, int nr_sectors) 248 uint64_t offset, int nr_sectors)
249 { 249 {
250 struct bcache_device *d = c->devices[inode]; 250 struct bcache_device *d = c->devices[inode];
251 unsigned stripe_size, stripe_offset; 251 unsigned stripe_size, stripe_offset;
252 uint64_t stripe; 252 uint64_t stripe;
253 253
254 if (!d) 254 if (!d)
255 return; 255 return;
256 256
257 stripe_size = 1 << d->stripe_size_bits; 257 stripe_size = 1 << d->stripe_size_bits;
258 stripe = offset >> d->stripe_size_bits; 258 stripe = offset >> d->stripe_size_bits;
259 stripe_offset = offset & (stripe_size - 1); 259 stripe_offset = offset & (stripe_size - 1);
260 260
261 while (nr_sectors) { 261 while (nr_sectors) {
262 int s = min_t(unsigned, abs(nr_sectors), 262 int s = min_t(unsigned, abs(nr_sectors),
263 stripe_size - stripe_offset); 263 stripe_size - stripe_offset);
264 264
265 if (nr_sectors < 0) 265 if (nr_sectors < 0)
266 s = -s; 266 s = -s;
267 267
268 atomic_add(s, d->stripe_sectors_dirty + stripe); 268 atomic_add(s, d->stripe_sectors_dirty + stripe);
269 nr_sectors -= s; 269 nr_sectors -= s;
270 stripe_offset = 0; 270 stripe_offset = 0;
271 stripe++; 271 stripe++;
272 } 272 }
273 } 273 }
274 274
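/*
 * Worked example for the stripe bookkeeping above (hypothetical geometry):
 * with stripe_size_bits == 4 (16-sector stripes), offset == 21 falls in
 * stripe 1 at stripe_offset == 5, so nr_sectors == 20 credits 11 sectors
 * to stripe 1 and the remaining 9 to stripe 2.
 */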
275 /* Background writeback - IO loop */ 275 /* Background writeback - IO loop */
276 276
277 static void dirty_io_destructor(struct closure *cl) 277 static void dirty_io_destructor(struct closure *cl)
278 { 278 {
279 struct dirty_io *io = container_of(cl, struct dirty_io, cl); 279 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
280 kfree(io); 280 kfree(io);
281 } 281 }
282 282
283 static void write_dirty_finish(struct closure *cl) 283 static void write_dirty_finish(struct closure *cl)
284 { 284 {
285 struct dirty_io *io = container_of(cl, struct dirty_io, cl); 285 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
286 struct keybuf_key *w = io->bio.bi_private; 286 struct keybuf_key *w = io->bio.bi_private;
287 struct cached_dev *dc = io->dc; 287 struct cached_dev *dc = io->dc;
288 struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt); 288 struct bio_vec *bv;
289 int i;
289 290
290 while (bv-- != io->bio.bi_io_vec) 291 bio_for_each_segment_all(bv, &io->bio, i)
291 __free_page(bv->bv_page); 292 __free_page(bv->bv_page);
292 293
293 /* This is kind of a dumb way of signalling errors. */ 294 /* This is kind of a dumb way of signalling errors. */
294 if (KEY_DIRTY(&w->key)) { 295 if (KEY_DIRTY(&w->key)) {
295 unsigned i; 296 unsigned i;
296 struct btree_op op; 297 struct btree_op op;
297 bch_btree_op_init_stack(&op); 298 bch_btree_op_init_stack(&op);
298 299
299 op.type = BTREE_REPLACE; 300 op.type = BTREE_REPLACE;
300 bkey_copy(&op.replace, &w->key); 301 bkey_copy(&op.replace, &w->key);
301 302
302 SET_KEY_DIRTY(&w->key, false); 303 SET_KEY_DIRTY(&w->key, false);
303 bch_keylist_add(&op.keys, &w->key); 304 bch_keylist_add(&op.keys, &w->key);
304 305
305 for (i = 0; i < KEY_PTRS(&w->key); i++) 306 for (i = 0; i < KEY_PTRS(&w->key); i++)
306 atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); 307 atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
307 308
308 bch_btree_insert(&op, dc->disk.c); 309 bch_btree_insert(&op, dc->disk.c);
309 closure_sync(&op.cl); 310 closure_sync(&op.cl);
310 311
311 if (op.insert_collision) 312 if (op.insert_collision)
312 trace_bcache_writeback_collision(&w->key); 313 trace_bcache_writeback_collision(&w->key);
313 314
314 atomic_long_inc(op.insert_collision 315 atomic_long_inc(op.insert_collision
315 ? &dc->disk.c->writeback_keys_failed 316 ? &dc->disk.c->writeback_keys_failed
316 : &dc->disk.c->writeback_keys_done); 317 : &dc->disk.c->writeback_keys_done);
317 } 318 }
318 319
319 bch_keybuf_del(&dc->writeback_keys, w); 320 bch_keybuf_del(&dc->writeback_keys, w);
320 atomic_dec_bug(&dc->in_flight); 321 atomic_dec_bug(&dc->in_flight);
321 322
322 closure_wake_up(&dc->writeback_wait); 323 closure_wake_up(&dc->writeback_wait);
323 324
324 closure_return_with_destructor(cl, dirty_io_destructor); 325 closure_return_with_destructor(cl, dirty_io_destructor);
325 } 326 }
326 327
327 static void dirty_endio(struct bio *bio, int error) 328 static void dirty_endio(struct bio *bio, int error)
328 { 329 {
329 struct keybuf_key *w = bio->bi_private; 330 struct keybuf_key *w = bio->bi_private;
330 struct dirty_io *io = w->private; 331 struct dirty_io *io = w->private;
331 332
332 if (error) 333 if (error)
333 SET_KEY_DIRTY(&w->key, false); 334 SET_KEY_DIRTY(&w->key, false);
334 335
335 closure_put(&io->cl); 336 closure_put(&io->cl);
336 } 337 }
337 338
338 static void write_dirty(struct closure *cl) 339 static void write_dirty(struct closure *cl)
339 { 340 {
340 struct dirty_io *io = container_of(cl, struct dirty_io, cl); 341 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
341 struct keybuf_key *w = io->bio.bi_private; 342 struct keybuf_key *w = io->bio.bi_private;
342 343
343 dirty_init(w); 344 dirty_init(w);
344 io->bio.bi_rw = WRITE; 345 io->bio.bi_rw = WRITE;
345 io->bio.bi_sector = KEY_START(&w->key); 346 io->bio.bi_sector = KEY_START(&w->key);
346 io->bio.bi_bdev = io->dc->bdev; 347 io->bio.bi_bdev = io->dc->bdev;
347 io->bio.bi_end_io = dirty_endio; 348 io->bio.bi_end_io = dirty_endio;
348 349
349 closure_bio_submit(&io->bio, cl, &io->dc->disk); 350 closure_bio_submit(&io->bio, cl, &io->dc->disk);
350 351
351 continue_at(cl, write_dirty_finish, dirty_wq); 352 continue_at(cl, write_dirty_finish, dirty_wq);
352 } 353 }
353 354
354 static void read_dirty_endio(struct bio *bio, int error) 355 static void read_dirty_endio(struct bio *bio, int error)
355 { 356 {
356 struct keybuf_key *w = bio->bi_private; 357 struct keybuf_key *w = bio->bi_private;
357 struct dirty_io *io = w->private; 358 struct dirty_io *io = w->private;
358 359
359 bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), 360 bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
360 error, "reading dirty data from cache"); 361 error, "reading dirty data from cache");
361 362
362 dirty_endio(bio, error); 363 dirty_endio(bio, error);
363 } 364 }
364 365
365 static void read_dirty_submit(struct closure *cl) 366 static void read_dirty_submit(struct closure *cl)
366 { 367 {
367 struct dirty_io *io = container_of(cl, struct dirty_io, cl); 368 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
368 369
369 closure_bio_submit(&io->bio, cl, &io->dc->disk); 370 closure_bio_submit(&io->bio, cl, &io->dc->disk);
370 371
371 continue_at(cl, write_dirty, dirty_wq); 372 continue_at(cl, write_dirty, dirty_wq);
372 } 373 }
373 374
374 static void read_dirty(struct closure *cl) 375 static void read_dirty(struct closure *cl)
375 { 376 {
376 struct cached_dev *dc = container_of(cl, struct cached_dev, 377 struct cached_dev *dc = container_of(cl, struct cached_dev,
377 writeback.cl); 378 writeback.cl);
378 unsigned delay = writeback_delay(dc, 0); 379 unsigned delay = writeback_delay(dc, 0);
379 struct keybuf_key *w; 380 struct keybuf_key *w;
380 struct dirty_io *io; 381 struct dirty_io *io;
381 382
382 /* 383 /*
383 * XXX: if we error, background writeback just spins. Should use some 384 * XXX: if we error, background writeback just spins. Should use some
384 * mempools. 385 * mempools.
385 */ 386 */
386 387
387 while (1) { 388 while (1) {
388 w = bch_keybuf_next(&dc->writeback_keys); 389 w = bch_keybuf_next(&dc->writeback_keys);
389 if (!w) 390 if (!w)
390 break; 391 break;
391 392
392 BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); 393 BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
393 394
394 if (delay > 0 && 395 if (delay > 0 &&
395 (KEY_START(&w->key) != dc->last_read || 396 (KEY_START(&w->key) != dc->last_read ||
396 jiffies_to_msecs(delay) > 50)) { 397 jiffies_to_msecs(delay) > 50)) {
397 w->private = NULL; 398 w->private = NULL;
398 399
399 closure_delay(&dc->writeback, delay); 400 closure_delay(&dc->writeback, delay);
400 continue_at(cl, read_dirty, dirty_wq); 401 continue_at(cl, read_dirty, dirty_wq);
401 } 402 }
402 403
403 dc->last_read = KEY_OFFSET(&w->key); 404 dc->last_read = KEY_OFFSET(&w->key);
404 405
405 io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) 406 io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
406 * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 407 * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
407 GFP_KERNEL); 408 GFP_KERNEL);
408 if (!io) 409 if (!io)
409 goto err; 410 goto err;
410 411
411 w->private = io; 412 w->private = io;
412 io->dc = dc; 413 io->dc = dc;
413 414
414 dirty_init(w); 415 dirty_init(w);
415 io->bio.bi_sector = PTR_OFFSET(&w->key, 0); 416 io->bio.bi_sector = PTR_OFFSET(&w->key, 0);
416 io->bio.bi_bdev = PTR_CACHE(dc->disk.c, 417 io->bio.bi_bdev = PTR_CACHE(dc->disk.c,
417 &w->key, 0)->bdev; 418 &w->key, 0)->bdev;
418 io->bio.bi_rw = READ; 419 io->bio.bi_rw = READ;
419 io->bio.bi_end_io = read_dirty_endio; 420 io->bio.bi_end_io = read_dirty_endio;
420 421
421 if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) 422 if (bio_alloc_pages(&io->bio, GFP_KERNEL))
422 goto err_free; 423 goto err_free;
423 424
424 trace_bcache_writeback(&w->key); 425 trace_bcache_writeback(&w->key);
425 426
426 closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); 427 closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);
427 428
428 delay = writeback_delay(dc, KEY_SIZE(&w->key)); 429 delay = writeback_delay(dc, KEY_SIZE(&w->key));
429 430
430 atomic_inc(&dc->in_flight); 431 atomic_inc(&dc->in_flight);
431 432
432 if (!closure_wait_event(&dc->writeback_wait, cl, 433 if (!closure_wait_event(&dc->writeback_wait, cl,
433 atomic_read(&dc->in_flight) < 64)) 434 atomic_read(&dc->in_flight) < 64))
434 continue_at(cl, read_dirty, dirty_wq); 435 continue_at(cl, read_dirty, dirty_wq);
435 } 436 }
436 437
437 if (0) { 438 if (0) {
438 err_free: 439 err_free:
439 kfree(w->private); 440 kfree(w->private);
440 err: 441 err:
441 bch_keybuf_del(&dc->writeback_keys, w); 442 bch_keybuf_del(&dc->writeback_keys, w);
442 } 443 }
443 444
444 refill_dirty(cl); 445 refill_dirty(cl);
445 } 446 }
446 447
447 /* Init */ 448 /* Init */
448 449
449 static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op, 450 static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op,
450 struct cached_dev *dc) 451 struct cached_dev *dc)
451 { 452 {
452 struct bkey *k; 453 struct bkey *k;
453 struct btree_iter iter; 454 struct btree_iter iter;
454 455
455 bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0)); 456 bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0));
456 while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) 457 while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad)))
457 if (!b->level) { 458 if (!b->level) {
458 if (KEY_INODE(k) > dc->disk.id) 459 if (KEY_INODE(k) > dc->disk.id)
459 break; 460 break;
460 461
461 if (KEY_DIRTY(k)) 462 if (KEY_DIRTY(k))
462 bcache_dev_sectors_dirty_add(b->c, dc->disk.id, 463 bcache_dev_sectors_dirty_add(b->c, dc->disk.id,
463 KEY_START(k), 464 KEY_START(k),
464 KEY_SIZE(k)); 465 KEY_SIZE(k));
465 } else { 466 } else {
466 btree(sectors_dirty_init, k, b, op, dc); 467 btree(sectors_dirty_init, k, b, op, dc);
467 if (KEY_INODE(k) > dc->disk.id) 468 if (KEY_INODE(k) > dc->disk.id)
468 break; 469 break;
469 470
470 cond_resched(); 471 cond_resched();
471 } 472 }
472 473
473 return 0; 474 return 0;
474 } 475 }
475 476
476 void bch_sectors_dirty_init(struct cached_dev *dc) 477 void bch_sectors_dirty_init(struct cached_dev *dc)
477 { 478 {
478 struct btree_op op; 479 struct btree_op op;
479 480
480 bch_btree_op_init_stack(&op); 481 bch_btree_op_init_stack(&op);
481 btree_root(sectors_dirty_init, dc->disk.c, &op, dc); 482 btree_root(sectors_dirty_init, dc->disk.c, &op, dc);
482 } 483 }
483 484
484 void bch_cached_dev_writeback_init(struct cached_dev *dc) 485 void bch_cached_dev_writeback_init(struct cached_dev *dc)
485 { 486 {
486 closure_init_unlocked(&dc->writeback); 487 closure_init_unlocked(&dc->writeback);
487 init_rwsem(&dc->writeback_lock); 488 init_rwsem(&dc->writeback_lock);
488 489
489 bch_keybuf_init(&dc->writeback_keys); 490 bch_keybuf_init(&dc->writeback_keys);
490 491
491 dc->writeback_metadata = true; 492 dc->writeback_metadata = true;
492 dc->writeback_running = true; 493 dc->writeback_running = true;
493 dc->writeback_percent = 10; 494 dc->writeback_percent = 10;
494 dc->writeback_delay = 30; 495 dc->writeback_delay = 30;
495 dc->writeback_rate.rate = 1024; 496 dc->writeback_rate.rate = 1024;
496 497
497 dc->writeback_rate_update_seconds = 30; 498 dc->writeback_rate_update_seconds = 30;
498 dc->writeback_rate_d_term = 16; 499 dc->writeback_rate_d_term = 16;
499 dc->writeback_rate_p_term_inverse = 64; 500 dc->writeback_rate_p_term_inverse = 64;
500 dc->writeback_rate_d_smooth = 8; 501 dc->writeback_rate_d_smooth = 8;
501 502
502 INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); 503 INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
503 schedule_delayed_work(&dc->writeback_rate_update, 504 schedule_delayed_work(&dc->writeback_rate_update,
504 dc->writeback_rate_update_seconds * HZ); 505 dc->writeback_rate_update_seconds * HZ);
505 } 506 }
506 507
507 void bch_writeback_exit(void) 508 void bch_writeback_exit(void)
508 { 509 {
509 if (dirty_wq) 510 if (dirty_wq)
510 destroy_workqueue(dirty_wq); 511 destroy_workqueue(dirty_wq);
511 } 512 }
512 513
513 int __init bch_writeback_init(void) 514 int __init bch_writeback_init(void)
514 { 515 {
515 dirty_wq = create_singlethread_workqueue("bcache_writeback"); 516 dirty_wq = create_singlethread_workqueue("bcache_writeback");
516 if (!dirty_wq) 517 if (!dirty_wq)
517 return -ENOMEM; 518 return -ENOMEM;
518 519
519 return 0; 520 return 0;
520 } 521 }
521 522