Commit 8e51e414a3c6d92ef2cc41720c67342a8e2c0bf7

Authored by Kent Overstreet
Committed by Kent Overstreet
1 parent 47cd2eb0ee

bcache: Use standard utility code

Some of bcache's utility code has made it into the rest of the kernel,
so drop the bcache versions.

Bcache used to have a workaround for allocating from a bio set under
generic_make_request(): if you allocated more than once, the bios you
had already allocated would get stuck on current->bio_list when you
submitted, and you'd risk deadlock. To avoid that, bcache would mask
out __GFP_WAIT when allocating bios under generic_make_request(), so
the allocation could fail and be retried from a workqueue. But
bio_alloc_bioset() now handles this itself, so we can drop the hack
and the associated error handling.
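
For illustration only (not code from this commit or from bcache): a
minimal sketch of the old pattern described above. The helper name is
hypothetical, and it assumes the pre-4.4 __GFP_WAIT flag and the
three-argument bio_alloc_bioset() of this era.

    #include <linux/bio.h>
    #include <linux/sched.h>

    /*
     * Hypothetical sketch of the old workaround: when running under
     * generic_make_request() (current->bio_list is non-NULL), don't
     * sleep waiting on the bio_set's mempool - a bio allocated earlier
     * may be parked on current->bio_list, so blocking could deadlock.
     * Instead let the allocation fail; the caller would punt the
     * request to a workqueue and retry the allocation from there.
     */
    static struct bio *sketch_bio_alloc(struct bio_set *bs, unsigned nr_vecs)
    {
    	gfp_t gfp = GFP_NOIO;

    	if (current->bio_list)
    		gfp &= ~__GFP_WAIT;

    	return bio_alloc_bioset(gfp, nr_vecs, bs);	/* may be NULL now */
    }

With the workaround now inside bio_alloc_bioset() itself (as the
message above says), callers no longer need this masking or the NULL
retry path, which is what this commit removes.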

Signed-off-by: Kent Overstreet <koverstreet@google.com>

Showing 8 changed files with 51 additions and 144 deletions

drivers/md/bcache/btree.c
1 /* 1 /*
2 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> 2 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
3 * 3 *
4 * Uses a block device as cache for other block devices; optimized for SSDs. 4 * Uses a block device as cache for other block devices; optimized for SSDs.
5 * All allocation is done in buckets, which should match the erase block size 5 * All allocation is done in buckets, which should match the erase block size
6 * of the device. 6 * of the device.
7 * 7 *
8 * Buckets containing cached data are kept on a heap sorted by priority; 8 * Buckets containing cached data are kept on a heap sorted by priority;
9 * bucket priority is increased on cache hit, and periodically all the buckets 9 * bucket priority is increased on cache hit, and periodically all the buckets
10 * on the heap have their priority scaled down. This currently is just used as 10 * on the heap have their priority scaled down. This currently is just used as
11 * an LRU but in the future should allow for more intelligent heuristics. 11 * an LRU but in the future should allow for more intelligent heuristics.
12 * 12 *
13 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the 13 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
14 * counter. Garbage collection is used to remove stale pointers. 14 * counter. Garbage collection is used to remove stale pointers.
15 * 15 *
16 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather 16 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
17 * as keys are inserted we only sort the pages that have not yet been written. 17 * as keys are inserted we only sort the pages that have not yet been written.
18 * When garbage collection is run, we resort the entire node. 18 * When garbage collection is run, we resort the entire node.
19 * 19 *
20 * All configuration is done via sysfs; see Documentation/bcache.txt. 20 * All configuration is done via sysfs; see Documentation/bcache.txt.
21 */ 21 */
22 22
23 #include "bcache.h" 23 #include "bcache.h"
24 #include "btree.h" 24 #include "btree.h"
25 #include "debug.h" 25 #include "debug.h"
26 #include "request.h" 26 #include "request.h"
27 #include "writeback.h" 27 #include "writeback.h"
28 28
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/bitops.h> 30 #include <linux/bitops.h>
31 #include <linux/hash.h> 31 #include <linux/hash.h>
32 #include <linux/prefetch.h> 32 #include <linux/prefetch.h>
33 #include <linux/random.h> 33 #include <linux/random.h>
34 #include <linux/rcupdate.h> 34 #include <linux/rcupdate.h>
35 #include <trace/events/bcache.h> 35 #include <trace/events/bcache.h>
36 36
37 /* 37 /*
38 * Todo: 38 * Todo:
39 * register_bcache: Return errors out to userspace correctly 39 * register_bcache: Return errors out to userspace correctly
40 * 40 *
41 * Writeback: don't undirty key until after a cache flush 41 * Writeback: don't undirty key until after a cache flush
42 * 42 *
43 * Create an iterator for key pointers 43 * Create an iterator for key pointers
44 * 44 *
45 * On btree write error, mark bucket such that it won't be freed from the cache 45 * On btree write error, mark bucket such that it won't be freed from the cache
46 * 46 *
47 * Journalling: 47 * Journalling:
48 * Check for bad keys in replay 48 * Check for bad keys in replay
49 * Propagate barriers 49 * Propagate barriers
50 * Refcount journal entries in journal_replay 50 * Refcount journal entries in journal_replay
51 * 51 *
52 * Garbage collection: 52 * Garbage collection:
53 * Finish incremental gc 53 * Finish incremental gc
54 * Gc should free old UUIDs, data for invalid UUIDs 54 * Gc should free old UUIDs, data for invalid UUIDs
55 * 55 *
56 * Provide a way to list backing device UUIDs we have data cached for, and 56 * Provide a way to list backing device UUIDs we have data cached for, and
57 * probably how long it's been since we've seen them, and a way to invalidate 57 * probably how long it's been since we've seen them, and a way to invalidate
58 * dirty data for devices that will never be attached again 58 * dirty data for devices that will never be attached again
59 * 59 *
60 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so 60 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
61 * that based on that and how much dirty data we have we can keep writeback 61 * that based on that and how much dirty data we have we can keep writeback
62 * from being starved 62 * from being starved
63 * 63 *
64 * Add a tracepoint or somesuch to watch for writeback starvation 64 * Add a tracepoint or somesuch to watch for writeback starvation
65 * 65 *
66 * When btree depth > 1 and splitting an interior node, we have to make sure 66 * When btree depth > 1 and splitting an interior node, we have to make sure
67 * alloc_bucket() cannot fail. This should be true but is not completely 67 * alloc_bucket() cannot fail. This should be true but is not completely
68 * obvious. 68 * obvious.
69 * 69 *
70 * Make sure all allocations get charged to the root cgroup 70 * Make sure all allocations get charged to the root cgroup
71 * 71 *
72 * Plugging? 72 * Plugging?
73 * 73 *
74 * If data write is less than hard sector size of ssd, round up offset in open 74 * If data write is less than hard sector size of ssd, round up offset in open
75 * bucket to the next whole sector 75 * bucket to the next whole sector
76 * 76 *
77 * Also lookup by cgroup in get_open_bucket() 77 * Also lookup by cgroup in get_open_bucket()
78 * 78 *
79 * Superblock needs to be fleshed out for multiple cache devices 79 * Superblock needs to be fleshed out for multiple cache devices
80 * 80 *
81 * Add a sysfs tunable for the number of writeback IOs in flight 81 * Add a sysfs tunable for the number of writeback IOs in flight
82 * 82 *
83 * Add a sysfs tunable for the number of open data buckets 83 * Add a sysfs tunable for the number of open data buckets
84 * 84 *
85 * IO tracking: Can we track when one process is doing io on behalf of another? 85 * IO tracking: Can we track when one process is doing io on behalf of another?
86 * IO tracking: Don't use just an average, weigh more recent stuff higher 86 * IO tracking: Don't use just an average, weigh more recent stuff higher
87 * 87 *
88 * Test module load/unload 88 * Test module load/unload
89 */ 89 */
90 90
91 static const char * const op_types[] = { 91 static const char * const op_types[] = {
92 "insert", "replace" 92 "insert", "replace"
93 }; 93 };
94 94
95 static const char *op_type(struct btree_op *op) 95 static const char *op_type(struct btree_op *op)
96 { 96 {
97 return op_types[op->type]; 97 return op_types[op->type];
98 } 98 }
99 99
100 #define MAX_NEED_GC 64 100 #define MAX_NEED_GC 64
101 #define MAX_SAVE_PRIO 72 101 #define MAX_SAVE_PRIO 72
102 102
103 #define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) 103 #define PTR_DIRTY_BIT (((uint64_t) 1 << 36))
104 104
105 #define PTR_HASH(c, k) \ 105 #define PTR_HASH(c, k) \
106 (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) 106 (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
107 107
108 struct workqueue_struct *bch_gc_wq; 108 struct workqueue_struct *bch_gc_wq;
109 static struct workqueue_struct *btree_io_wq; 109 static struct workqueue_struct *btree_io_wq;
110 110
111 void bch_btree_op_init_stack(struct btree_op *op) 111 void bch_btree_op_init_stack(struct btree_op *op)
112 { 112 {
113 memset(op, 0, sizeof(struct btree_op)); 113 memset(op, 0, sizeof(struct btree_op));
114 closure_init_stack(&op->cl); 114 closure_init_stack(&op->cl);
115 op->lock = -1; 115 op->lock = -1;
116 bch_keylist_init(&op->keys); 116 bch_keylist_init(&op->keys);
117 } 117 }
118 118
119 /* Btree key manipulation */ 119 /* Btree key manipulation */
120 120
121 static void bkey_put(struct cache_set *c, struct bkey *k, int level) 121 static void bkey_put(struct cache_set *c, struct bkey *k, int level)
122 { 122 {
123 if ((level && KEY_OFFSET(k)) || !level) 123 if ((level && KEY_OFFSET(k)) || !level)
124 __bkey_put(c, k); 124 __bkey_put(c, k);
125 } 125 }
126 126
127 /* Btree IO */ 127 /* Btree IO */
128 128
129 static uint64_t btree_csum_set(struct btree *b, struct bset *i) 129 static uint64_t btree_csum_set(struct btree *b, struct bset *i)
130 { 130 {
131 uint64_t crc = b->key.ptr[0]; 131 uint64_t crc = b->key.ptr[0];
132 void *data = (void *) i + 8, *end = end(i); 132 void *data = (void *) i + 8, *end = end(i);
133 133
134 crc = bch_crc64_update(crc, data, end - data); 134 crc = bch_crc64_update(crc, data, end - data);
135 return crc ^ 0xffffffffffffffffULL; 135 return crc ^ 0xffffffffffffffffULL;
136 } 136 }
137 137
138 static void bch_btree_node_read_done(struct btree *b) 138 static void bch_btree_node_read_done(struct btree *b)
139 { 139 {
140 const char *err = "bad btree header"; 140 const char *err = "bad btree header";
141 struct bset *i = b->sets[0].data; 141 struct bset *i = b->sets[0].data;
142 struct btree_iter *iter; 142 struct btree_iter *iter;
143 143
144 iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); 144 iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
145 iter->size = b->c->sb.bucket_size / b->c->sb.block_size; 145 iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
146 iter->used = 0; 146 iter->used = 0;
147 147
148 if (!i->seq) 148 if (!i->seq)
149 goto err; 149 goto err;
150 150
151 for (; 151 for (;
152 b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; 152 b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq;
153 i = write_block(b)) { 153 i = write_block(b)) {
154 err = "unsupported bset version"; 154 err = "unsupported bset version";
155 if (i->version > BCACHE_BSET_VERSION) 155 if (i->version > BCACHE_BSET_VERSION)
156 goto err; 156 goto err;
157 157
158 err = "bad btree header"; 158 err = "bad btree header";
159 if (b->written + set_blocks(i, b->c) > btree_blocks(b)) 159 if (b->written + set_blocks(i, b->c) > btree_blocks(b))
160 goto err; 160 goto err;
161 161
162 err = "bad magic"; 162 err = "bad magic";
163 if (i->magic != bset_magic(b->c)) 163 if (i->magic != bset_magic(b->c))
164 goto err; 164 goto err;
165 165
166 err = "bad checksum"; 166 err = "bad checksum";
167 switch (i->version) { 167 switch (i->version) {
168 case 0: 168 case 0:
169 if (i->csum != csum_set(i)) 169 if (i->csum != csum_set(i))
170 goto err; 170 goto err;
171 break; 171 break;
172 case BCACHE_BSET_VERSION: 172 case BCACHE_BSET_VERSION:
173 if (i->csum != btree_csum_set(b, i)) 173 if (i->csum != btree_csum_set(b, i))
174 goto err; 174 goto err;
175 break; 175 break;
176 } 176 }
177 177
178 err = "empty set"; 178 err = "empty set";
179 if (i != b->sets[0].data && !i->keys) 179 if (i != b->sets[0].data && !i->keys)
180 goto err; 180 goto err;
181 181
182 bch_btree_iter_push(iter, i->start, end(i)); 182 bch_btree_iter_push(iter, i->start, end(i));
183 183
184 b->written += set_blocks(i, b->c); 184 b->written += set_blocks(i, b->c);
185 } 185 }
186 186
187 err = "corrupted btree"; 187 err = "corrupted btree";
188 for (i = write_block(b); 188 for (i = write_block(b);
189 index(i, b) < btree_blocks(b); 189 index(i, b) < btree_blocks(b);
190 i = ((void *) i) + block_bytes(b->c)) 190 i = ((void *) i) + block_bytes(b->c))
191 if (i->seq == b->sets[0].data->seq) 191 if (i->seq == b->sets[0].data->seq)
192 goto err; 192 goto err;
193 193
194 bch_btree_sort_and_fix_extents(b, iter); 194 bch_btree_sort_and_fix_extents(b, iter);
195 195
196 i = b->sets[0].data; 196 i = b->sets[0].data;
197 err = "short btree key"; 197 err = "short btree key";
198 if (b->sets[0].size && 198 if (b->sets[0].size &&
199 bkey_cmp(&b->key, &b->sets[0].end) < 0) 199 bkey_cmp(&b->key, &b->sets[0].end) < 0)
200 goto err; 200 goto err;
201 201
202 if (b->written < btree_blocks(b)) 202 if (b->written < btree_blocks(b))
203 bch_bset_init_next(b); 203 bch_bset_init_next(b);
204 out: 204 out:
205 mempool_free(iter, b->c->fill_iter); 205 mempool_free(iter, b->c->fill_iter);
206 return; 206 return;
207 err: 207 err:
208 set_btree_node_io_error(b); 208 set_btree_node_io_error(b);
209 bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", 209 bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
210 err, PTR_BUCKET_NR(b->c, &b->key, 0), 210 err, PTR_BUCKET_NR(b->c, &b->key, 0),
211 index(i, b), i->keys); 211 index(i, b), i->keys);
212 goto out; 212 goto out;
213 } 213 }
214 214
215 static void btree_node_read_endio(struct bio *bio, int error) 215 static void btree_node_read_endio(struct bio *bio, int error)
216 { 216 {
217 struct closure *cl = bio->bi_private; 217 struct closure *cl = bio->bi_private;
218 closure_put(cl); 218 closure_put(cl);
219 } 219 }
220 220
221 void bch_btree_node_read(struct btree *b) 221 void bch_btree_node_read(struct btree *b)
222 { 222 {
223 uint64_t start_time = local_clock(); 223 uint64_t start_time = local_clock();
224 struct closure cl; 224 struct closure cl;
225 struct bio *bio; 225 struct bio *bio;
226 226
227 trace_bcache_btree_read(b); 227 trace_bcache_btree_read(b);
228 228
229 closure_init_stack(&cl); 229 closure_init_stack(&cl);
230 230
231 bio = bch_bbio_alloc(b->c); 231 bio = bch_bbio_alloc(b->c);
232 bio->bi_rw = REQ_META|READ_SYNC; 232 bio->bi_rw = REQ_META|READ_SYNC;
233 bio->bi_size = KEY_SIZE(&b->key) << 9; 233 bio->bi_size = KEY_SIZE(&b->key) << 9;
234 bio->bi_end_io = btree_node_read_endio; 234 bio->bi_end_io = btree_node_read_endio;
235 bio->bi_private = &cl; 235 bio->bi_private = &cl;
236 236
237 bch_bio_map(bio, b->sets[0].data); 237 bch_bio_map(bio, b->sets[0].data);
238 238
239 bch_submit_bbio(bio, b->c, &b->key, 0); 239 bch_submit_bbio(bio, b->c, &b->key, 0);
240 closure_sync(&cl); 240 closure_sync(&cl);
241 241
242 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 242 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
243 set_btree_node_io_error(b); 243 set_btree_node_io_error(b);
244 244
245 bch_bbio_free(bio, b->c); 245 bch_bbio_free(bio, b->c);
246 246
247 if (btree_node_io_error(b)) 247 if (btree_node_io_error(b))
248 goto err; 248 goto err;
249 249
250 bch_btree_node_read_done(b); 250 bch_btree_node_read_done(b);
251 251
252 spin_lock(&b->c->btree_read_time_lock); 252 spin_lock(&b->c->btree_read_time_lock);
253 bch_time_stats_update(&b->c->btree_read_time, start_time); 253 bch_time_stats_update(&b->c->btree_read_time, start_time);
254 spin_unlock(&b->c->btree_read_time_lock); 254 spin_unlock(&b->c->btree_read_time_lock);
255 255
256 return; 256 return;
257 err: 257 err:
258 bch_cache_set_error(b->c, "io error reading bucket %lu", 258 bch_cache_set_error(b->c, "io error reading bucket %lu",
259 PTR_BUCKET_NR(b->c, &b->key, 0)); 259 PTR_BUCKET_NR(b->c, &b->key, 0));
260 } 260 }
261 261
262 static void btree_complete_write(struct btree *b, struct btree_write *w) 262 static void btree_complete_write(struct btree *b, struct btree_write *w)
263 { 263 {
264 if (w->prio_blocked && 264 if (w->prio_blocked &&
265 !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) 265 !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
266 wake_up_allocators(b->c); 266 wake_up_allocators(b->c);
267 267
268 if (w->journal) { 268 if (w->journal) {
269 atomic_dec_bug(w->journal); 269 atomic_dec_bug(w->journal);
270 __closure_wake_up(&b->c->journal.wait); 270 __closure_wake_up(&b->c->journal.wait);
271 } 271 }
272 272
273 w->prio_blocked = 0; 273 w->prio_blocked = 0;
274 w->journal = NULL; 274 w->journal = NULL;
275 } 275 }
276 276
277 static void __btree_node_write_done(struct closure *cl) 277 static void __btree_node_write_done(struct closure *cl)
278 { 278 {
279 struct btree *b = container_of(cl, struct btree, io.cl); 279 struct btree *b = container_of(cl, struct btree, io.cl);
280 struct btree_write *w = btree_prev_write(b); 280 struct btree_write *w = btree_prev_write(b);
281 281
282 bch_bbio_free(b->bio, b->c); 282 bch_bbio_free(b->bio, b->c);
283 b->bio = NULL; 283 b->bio = NULL;
284 btree_complete_write(b, w); 284 btree_complete_write(b, w);
285 285
286 if (btree_node_dirty(b)) 286 if (btree_node_dirty(b))
287 queue_delayed_work(btree_io_wq, &b->work, 287 queue_delayed_work(btree_io_wq, &b->work,
288 msecs_to_jiffies(30000)); 288 msecs_to_jiffies(30000));
289 289
290 closure_return(cl); 290 closure_return(cl);
291 } 291 }
292 292
293 static void btree_node_write_done(struct closure *cl) 293 static void btree_node_write_done(struct closure *cl)
294 { 294 {
295 struct btree *b = container_of(cl, struct btree, io.cl); 295 struct btree *b = container_of(cl, struct btree, io.cl);
296 struct bio_vec *bv; 296 struct bio_vec *bv;
297 int n; 297 int n;
298 298
299 __bio_for_each_segment(bv, b->bio, n, 0) 299 __bio_for_each_segment(bv, b->bio, n, 0)
300 __free_page(bv->bv_page); 300 __free_page(bv->bv_page);
301 301
302 __btree_node_write_done(cl); 302 __btree_node_write_done(cl);
303 } 303 }
304 304
305 static void btree_node_write_endio(struct bio *bio, int error) 305 static void btree_node_write_endio(struct bio *bio, int error)
306 { 306 {
307 struct closure *cl = bio->bi_private; 307 struct closure *cl = bio->bi_private;
308 struct btree *b = container_of(cl, struct btree, io.cl); 308 struct btree *b = container_of(cl, struct btree, io.cl);
309 309
310 if (error) 310 if (error)
311 set_btree_node_io_error(b); 311 set_btree_node_io_error(b);
312 312
313 bch_bbio_count_io_errors(b->c, bio, error, "writing btree"); 313 bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
314 closure_put(cl); 314 closure_put(cl);
315 } 315 }
316 316
317 static void do_btree_node_write(struct btree *b) 317 static void do_btree_node_write(struct btree *b)
318 { 318 {
319 struct closure *cl = &b->io.cl; 319 struct closure *cl = &b->io.cl;
320 struct bset *i = b->sets[b->nsets].data; 320 struct bset *i = b->sets[b->nsets].data;
321 BKEY_PADDED(key) k; 321 BKEY_PADDED(key) k;
322 322
323 i->version = BCACHE_BSET_VERSION; 323 i->version = BCACHE_BSET_VERSION;
324 i->csum = btree_csum_set(b, i); 324 i->csum = btree_csum_set(b, i);
325 325
326 BUG_ON(b->bio); 326 BUG_ON(b->bio);
327 b->bio = bch_bbio_alloc(b->c); 327 b->bio = bch_bbio_alloc(b->c);
328 328
329 b->bio->bi_end_io = btree_node_write_endio; 329 b->bio->bi_end_io = btree_node_write_endio;
330 b->bio->bi_private = &b->io.cl; 330 b->bio->bi_private = &b->io.cl;
331 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; 331 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
332 b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); 332 b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
333 bch_bio_map(b->bio, i); 333 bch_bio_map(b->bio, i);
334 334
335 /* 335 /*
336 * If we're appending to a leaf node, we don't technically need FUA - 336 * If we're appending to a leaf node, we don't technically need FUA -
337 * this write just needs to be persisted before the next journal write, 337 * this write just needs to be persisted before the next journal write,
338 * which will be marked FLUSH|FUA. 338 * which will be marked FLUSH|FUA.
339 * 339 *
340 * Similarly if we're writing a new btree root - the pointer is going to 340 * Similarly if we're writing a new btree root - the pointer is going to
341 * be in the next journal entry. 341 * be in the next journal entry.
342 * 342 *
343 * But if we're writing a new btree node (that isn't a root) or 343 * But if we're writing a new btree node (that isn't a root) or
344 * appending to a non leaf btree node, we need either FUA or a flush 344 * appending to a non leaf btree node, we need either FUA or a flush
345 * when we write the parent with the new pointer. FUA is cheaper than a 345 * when we write the parent with the new pointer. FUA is cheaper than a
346 * flush, and writes appending to leaf nodes aren't blocking anything so 346 * flush, and writes appending to leaf nodes aren't blocking anything so
347 * just make all btree node writes FUA to keep things sane. 347 * just make all btree node writes FUA to keep things sane.
348 */ 348 */
349 349
350 bkey_copy(&k.key, &b->key); 350 bkey_copy(&k.key, &b->key);
351 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); 351 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
352 352
353 if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) { 353 if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
354 int j; 354 int j;
355 struct bio_vec *bv; 355 struct bio_vec *bv;
356 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); 356 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
357 357
358 bio_for_each_segment(bv, b->bio, j) 358 bio_for_each_segment(bv, b->bio, j)
359 memcpy(page_address(bv->bv_page), 359 memcpy(page_address(bv->bv_page),
360 base + j * PAGE_SIZE, PAGE_SIZE); 360 base + j * PAGE_SIZE, PAGE_SIZE);
361 361
362 bch_submit_bbio(b->bio, b->c, &k.key, 0); 362 bch_submit_bbio(b->bio, b->c, &k.key, 0);
363 363
364 continue_at(cl, btree_node_write_done, NULL); 364 continue_at(cl, btree_node_write_done, NULL);
365 } else { 365 } else {
366 b->bio->bi_vcnt = 0; 366 b->bio->bi_vcnt = 0;
367 bch_bio_map(b->bio, i); 367 bch_bio_map(b->bio, i);
368 368
369 bch_submit_bbio(b->bio, b->c, &k.key, 0); 369 bch_submit_bbio(b->bio, b->c, &k.key, 0);
370 370
371 closure_sync(cl); 371 closure_sync(cl);
372 __btree_node_write_done(cl); 372 __btree_node_write_done(cl);
373 } 373 }
374 } 374 }
375 375
376 void bch_btree_node_write(struct btree *b, struct closure *parent) 376 void bch_btree_node_write(struct btree *b, struct closure *parent)
377 { 377 {
378 struct bset *i = b->sets[b->nsets].data; 378 struct bset *i = b->sets[b->nsets].data;
379 379
380 trace_bcache_btree_write(b); 380 trace_bcache_btree_write(b);
381 381
382 BUG_ON(current->bio_list); 382 BUG_ON(current->bio_list);
383 BUG_ON(b->written >= btree_blocks(b)); 383 BUG_ON(b->written >= btree_blocks(b));
384 BUG_ON(b->written && !i->keys); 384 BUG_ON(b->written && !i->keys);
385 BUG_ON(b->sets->data->seq != i->seq); 385 BUG_ON(b->sets->data->seq != i->seq);
386 bch_check_key_order(b, i); 386 bch_check_key_order(b, i);
387 387
388 cancel_delayed_work(&b->work); 388 cancel_delayed_work(&b->work);
389 389
390 /* If caller isn't waiting for write, parent refcount is cache set */ 390 /* If caller isn't waiting for write, parent refcount is cache set */
391 closure_lock(&b->io, parent ?: &b->c->cl); 391 closure_lock(&b->io, parent ?: &b->c->cl);
392 392
393 clear_bit(BTREE_NODE_dirty, &b->flags); 393 clear_bit(BTREE_NODE_dirty, &b->flags);
394 change_bit(BTREE_NODE_write_idx, &b->flags); 394 change_bit(BTREE_NODE_write_idx, &b->flags);
395 395
396 do_btree_node_write(b); 396 do_btree_node_write(b);
397 397
398 b->written += set_blocks(i, b->c); 398 b->written += set_blocks(i, b->c);
399 atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, 399 atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
400 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); 400 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
401 401
402 bch_btree_sort_lazy(b); 402 bch_btree_sort_lazy(b);
403 403
404 if (b->written < btree_blocks(b)) 404 if (b->written < btree_blocks(b))
405 bch_bset_init_next(b); 405 bch_bset_init_next(b);
406 } 406 }
407 407
408 static void btree_node_write_work(struct work_struct *w) 408 static void btree_node_write_work(struct work_struct *w)
409 { 409 {
410 struct btree *b = container_of(to_delayed_work(w), struct btree, work); 410 struct btree *b = container_of(to_delayed_work(w), struct btree, work);
411 411
412 rw_lock(true, b, b->level); 412 rw_lock(true, b, b->level);
413 413
414 if (btree_node_dirty(b)) 414 if (btree_node_dirty(b))
415 bch_btree_node_write(b, NULL); 415 bch_btree_node_write(b, NULL);
416 rw_unlock(true, b); 416 rw_unlock(true, b);
417 } 417 }
418 418
419 static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) 419 static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op)
420 { 420 {
421 struct bset *i = b->sets[b->nsets].data; 421 struct bset *i = b->sets[b->nsets].data;
422 struct btree_write *w = btree_current_write(b); 422 struct btree_write *w = btree_current_write(b);
423 423
424 BUG_ON(!b->written); 424 BUG_ON(!b->written);
425 BUG_ON(!i->keys); 425 BUG_ON(!i->keys);
426 426
427 if (!btree_node_dirty(b)) 427 if (!btree_node_dirty(b))
428 queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); 428 queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
429 429
430 set_btree_node_dirty(b); 430 set_btree_node_dirty(b);
431 431
432 if (op && op->journal) { 432 if (op && op->journal) {
433 if (w->journal && 433 if (w->journal &&
434 journal_pin_cmp(b->c, w, op)) { 434 journal_pin_cmp(b->c, w, op)) {
435 atomic_dec_bug(w->journal); 435 atomic_dec_bug(w->journal);
436 w->journal = NULL; 436 w->journal = NULL;
437 } 437 }
438 438
439 if (!w->journal) { 439 if (!w->journal) {
440 w->journal = op->journal; 440 w->journal = op->journal;
441 atomic_inc(w->journal); 441 atomic_inc(w->journal);
442 } 442 }
443 } 443 }
444 444
445 /* Force write if set is too big */ 445 /* Force write if set is too big */
446 if (set_bytes(i) > PAGE_SIZE - 48 && 446 if (set_bytes(i) > PAGE_SIZE - 48 &&
447 !current->bio_list) 447 !current->bio_list)
448 bch_btree_node_write(b, NULL); 448 bch_btree_node_write(b, NULL);
449 } 449 }
450 450
451 /* 451 /*
452 * Btree in memory cache - allocation/freeing 452 * Btree in memory cache - allocation/freeing
453 * mca -> memory cache 453 * mca -> memory cache
454 */ 454 */
455 455
456 static void mca_reinit(struct btree *b) 456 static void mca_reinit(struct btree *b)
457 { 457 {
458 unsigned i; 458 unsigned i;
459 459
460 b->flags = 0; 460 b->flags = 0;
461 b->written = 0; 461 b->written = 0;
462 b->nsets = 0; 462 b->nsets = 0;
463 463
464 for (i = 0; i < MAX_BSETS; i++) 464 for (i = 0; i < MAX_BSETS; i++)
465 b->sets[i].size = 0; 465 b->sets[i].size = 0;
466 /* 466 /*
467 * Second loop starts at 1 because b->sets[0]->data is the memory we 467 * Second loop starts at 1 because b->sets[0]->data is the memory we
468 * allocated 468 * allocated
469 */ 469 */
470 for (i = 1; i < MAX_BSETS; i++) 470 for (i = 1; i < MAX_BSETS; i++)
471 b->sets[i].data = NULL; 471 b->sets[i].data = NULL;
472 } 472 }
473 473
474 #define mca_reserve(c) (((c->root && c->root->level) \ 474 #define mca_reserve(c) (((c->root && c->root->level) \
475 ? c->root->level : 1) * 8 + 16) 475 ? c->root->level : 1) * 8 + 16)
476 #define mca_can_free(c) \ 476 #define mca_can_free(c) \
477 max_t(int, 0, c->bucket_cache_used - mca_reserve(c)) 477 max_t(int, 0, c->bucket_cache_used - mca_reserve(c))
478 478
479 static void mca_data_free(struct btree *b) 479 static void mca_data_free(struct btree *b)
480 { 480 {
481 struct bset_tree *t = b->sets; 481 struct bset_tree *t = b->sets;
482 BUG_ON(!closure_is_unlocked(&b->io.cl)); 482 BUG_ON(!closure_is_unlocked(&b->io.cl));
483 483
484 if (bset_prev_bytes(b) < PAGE_SIZE) 484 if (bset_prev_bytes(b) < PAGE_SIZE)
485 kfree(t->prev); 485 kfree(t->prev);
486 else 486 else
487 free_pages((unsigned long) t->prev, 487 free_pages((unsigned long) t->prev,
488 get_order(bset_prev_bytes(b))); 488 get_order(bset_prev_bytes(b)));
489 489
490 if (bset_tree_bytes(b) < PAGE_SIZE) 490 if (bset_tree_bytes(b) < PAGE_SIZE)
491 kfree(t->tree); 491 kfree(t->tree);
492 else 492 else
493 free_pages((unsigned long) t->tree, 493 free_pages((unsigned long) t->tree,
494 get_order(bset_tree_bytes(b))); 494 get_order(bset_tree_bytes(b)));
495 495
496 free_pages((unsigned long) t->data, b->page_order); 496 free_pages((unsigned long) t->data, b->page_order);
497 497
498 t->prev = NULL; 498 t->prev = NULL;
499 t->tree = NULL; 499 t->tree = NULL;
500 t->data = NULL; 500 t->data = NULL;
501 list_move(&b->list, &b->c->btree_cache_freed); 501 list_move(&b->list, &b->c->btree_cache_freed);
502 b->c->bucket_cache_used--; 502 b->c->bucket_cache_used--;
503 } 503 }
504 504
505 static void mca_bucket_free(struct btree *b) 505 static void mca_bucket_free(struct btree *b)
506 { 506 {
507 BUG_ON(btree_node_dirty(b)); 507 BUG_ON(btree_node_dirty(b));
508 508
509 b->key.ptr[0] = 0; 509 b->key.ptr[0] = 0;
510 hlist_del_init_rcu(&b->hash); 510 hlist_del_init_rcu(&b->hash);
511 list_move(&b->list, &b->c->btree_cache_freeable); 511 list_move(&b->list, &b->c->btree_cache_freeable);
512 } 512 }
513 513
514 static unsigned btree_order(struct bkey *k) 514 static unsigned btree_order(struct bkey *k)
515 { 515 {
516 return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); 516 return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
517 } 517 }
518 518
519 static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) 519 static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
520 { 520 {
521 struct bset_tree *t = b->sets; 521 struct bset_tree *t = b->sets;
522 BUG_ON(t->data); 522 BUG_ON(t->data);
523 523
524 b->page_order = max_t(unsigned, 524 b->page_order = max_t(unsigned,
525 ilog2(b->c->btree_pages), 525 ilog2(b->c->btree_pages),
526 btree_order(k)); 526 btree_order(k));
527 527
528 t->data = (void *) __get_free_pages(gfp, b->page_order); 528 t->data = (void *) __get_free_pages(gfp, b->page_order);
529 if (!t->data) 529 if (!t->data)
530 goto err; 530 goto err;
531 531
532 t->tree = bset_tree_bytes(b) < PAGE_SIZE 532 t->tree = bset_tree_bytes(b) < PAGE_SIZE
533 ? kmalloc(bset_tree_bytes(b), gfp) 533 ? kmalloc(bset_tree_bytes(b), gfp)
534 : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); 534 : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
535 if (!t->tree) 535 if (!t->tree)
536 goto err; 536 goto err;
537 537
538 t->prev = bset_prev_bytes(b) < PAGE_SIZE 538 t->prev = bset_prev_bytes(b) < PAGE_SIZE
539 ? kmalloc(bset_prev_bytes(b), gfp) 539 ? kmalloc(bset_prev_bytes(b), gfp)
540 : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); 540 : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
541 if (!t->prev) 541 if (!t->prev)
542 goto err; 542 goto err;
543 543
544 list_move(&b->list, &b->c->btree_cache); 544 list_move(&b->list, &b->c->btree_cache);
545 b->c->bucket_cache_used++; 545 b->c->bucket_cache_used++;
546 return; 546 return;
547 err: 547 err:
548 mca_data_free(b); 548 mca_data_free(b);
549 } 549 }
550 550
551 static struct btree *mca_bucket_alloc(struct cache_set *c, 551 static struct btree *mca_bucket_alloc(struct cache_set *c,
552 struct bkey *k, gfp_t gfp) 552 struct bkey *k, gfp_t gfp)
553 { 553 {
554 struct btree *b = kzalloc(sizeof(struct btree), gfp); 554 struct btree *b = kzalloc(sizeof(struct btree), gfp);
555 if (!b) 555 if (!b)
556 return NULL; 556 return NULL;
557 557
558 init_rwsem(&b->lock); 558 init_rwsem(&b->lock);
559 lockdep_set_novalidate_class(&b->lock); 559 lockdep_set_novalidate_class(&b->lock);
560 INIT_LIST_HEAD(&b->list); 560 INIT_LIST_HEAD(&b->list);
561 INIT_DELAYED_WORK(&b->work, btree_node_write_work); 561 INIT_DELAYED_WORK(&b->work, btree_node_write_work);
562 b->c = c; 562 b->c = c;
563 closure_init_unlocked(&b->io); 563 closure_init_unlocked(&b->io);
564 564
565 mca_data_alloc(b, k, gfp); 565 mca_data_alloc(b, k, gfp);
566 return b; 566 return b;
567 } 567 }
568 568
569 static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) 569 static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order)
570 { 570 {
571 lockdep_assert_held(&b->c->bucket_lock); 571 lockdep_assert_held(&b->c->bucket_lock);
572 572
573 if (!down_write_trylock(&b->lock)) 573 if (!down_write_trylock(&b->lock))
574 return -ENOMEM; 574 return -ENOMEM;
575 575
576 if (b->page_order < min_order) { 576 if (b->page_order < min_order) {
577 rw_unlock(true, b); 577 rw_unlock(true, b);
578 return -ENOMEM; 578 return -ENOMEM;
579 } 579 }
580 580
581 BUG_ON(btree_node_dirty(b) && !b->sets[0].data); 581 BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
582 582
583 if (cl && btree_node_dirty(b)) 583 if (cl && btree_node_dirty(b))
584 bch_btree_node_write(b, NULL); 584 bch_btree_node_write(b, NULL);
585 585
586 if (cl) 586 if (cl)
587 closure_wait_event_async(&b->io.wait, cl, 587 closure_wait_event_async(&b->io.wait, cl,
588 atomic_read(&b->io.cl.remaining) == -1); 588 atomic_read(&b->io.cl.remaining) == -1);
589 589
590 if (btree_node_dirty(b) || 590 if (btree_node_dirty(b) ||
591 !closure_is_unlocked(&b->io.cl) || 591 !closure_is_unlocked(&b->io.cl) ||
592 work_pending(&b->work.work)) { 592 work_pending(&b->work.work)) {
593 rw_unlock(true, b); 593 rw_unlock(true, b);
594 return -EAGAIN; 594 return -EAGAIN;
595 } 595 }
596 596
597 return 0; 597 return 0;
598 } 598 }
599 599
600 static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc) 600 static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
601 { 601 {
602 struct cache_set *c = container_of(shrink, struct cache_set, shrink); 602 struct cache_set *c = container_of(shrink, struct cache_set, shrink);
603 struct btree *b, *t; 603 struct btree *b, *t;
604 unsigned long i, nr = sc->nr_to_scan; 604 unsigned long i, nr = sc->nr_to_scan;
605 605
606 if (c->shrinker_disabled) 606 if (c->shrinker_disabled)
607 return 0; 607 return 0;
608 608
609 if (c->try_harder) 609 if (c->try_harder)
610 return 0; 610 return 0;
611 611
612 /* 612 /*
613 * If nr == 0, we're supposed to return the number of items we have 613 * If nr == 0, we're supposed to return the number of items we have
614 * cached. Not allowed to return -1. 614 * cached. Not allowed to return -1.
615 */ 615 */
616 if (!nr) 616 if (!nr)
617 return mca_can_free(c) * c->btree_pages; 617 return mca_can_free(c) * c->btree_pages;
618 618
619 /* Return -1 if we can't do anything right now */ 619 /* Return -1 if we can't do anything right now */
620 if (sc->gfp_mask & __GFP_WAIT) 620 if (sc->gfp_mask & __GFP_WAIT)
621 mutex_lock(&c->bucket_lock); 621 mutex_lock(&c->bucket_lock);
622 else if (!mutex_trylock(&c->bucket_lock)) 622 else if (!mutex_trylock(&c->bucket_lock))
623 return -1; 623 return -1;
624 624
625 /* 625 /*
626 * It's _really_ critical that we don't free too many btree nodes - we 626 * It's _really_ critical that we don't free too many btree nodes - we
627 * have to always leave ourselves a reserve. The reserve is how we 627 * have to always leave ourselves a reserve. The reserve is how we
628 * guarantee that allocating memory for a new btree node can always 628 * guarantee that allocating memory for a new btree node can always
629 * succeed, so that inserting keys into the btree can always succeed and 629 * succeed, so that inserting keys into the btree can always succeed and
630 * IO can always make forward progress: 630 * IO can always make forward progress:
631 */ 631 */
632 nr /= c->btree_pages; 632 nr /= c->btree_pages;
633 nr = min_t(unsigned long, nr, mca_can_free(c)); 633 nr = min_t(unsigned long, nr, mca_can_free(c));
634 634
635 i = 0; 635 i = 0;
636 list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { 636 list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
637 if (!nr) 637 if (!nr)
638 break; 638 break;
639 639
640 if (++i > 3 && 640 if (++i > 3 &&
641 !mca_reap(b, NULL, 0)) { 641 !mca_reap(b, NULL, 0)) {
642 mca_data_free(b); 642 mca_data_free(b);
643 rw_unlock(true, b); 643 rw_unlock(true, b);
644 --nr; 644 --nr;
645 } 645 }
646 } 646 }
647 647
648 /* 648 /*
649 * Can happen right when we first start up, before we've read in any 649 * Can happen right when we first start up, before we've read in any
650 * btree nodes 650 * btree nodes
651 */ 651 */
652 if (list_empty(&c->btree_cache)) 652 if (list_empty(&c->btree_cache))
653 goto out; 653 goto out;
654 654
655 for (i = 0; nr && i < c->bucket_cache_used; i++) { 655 for (i = 0; nr && i < c->bucket_cache_used; i++) {
656 b = list_first_entry(&c->btree_cache, struct btree, list); 656 b = list_first_entry(&c->btree_cache, struct btree, list);
657 list_rotate_left(&c->btree_cache); 657 list_rotate_left(&c->btree_cache);
658 658
659 if (!b->accessed && 659 if (!b->accessed &&
660 !mca_reap(b, NULL, 0)) { 660 !mca_reap(b, NULL, 0)) {
661 mca_bucket_free(b); 661 mca_bucket_free(b);
662 mca_data_free(b); 662 mca_data_free(b);
663 rw_unlock(true, b); 663 rw_unlock(true, b);
664 --nr; 664 --nr;
665 } else 665 } else
666 b->accessed = 0; 666 b->accessed = 0;
667 } 667 }
668 out: 668 out:
669 nr = mca_can_free(c) * c->btree_pages; 669 nr = mca_can_free(c) * c->btree_pages;
670 mutex_unlock(&c->bucket_lock); 670 mutex_unlock(&c->bucket_lock);
671 return nr; 671 return nr;
672 } 672 }
673 673
674 void bch_btree_cache_free(struct cache_set *c) 674 void bch_btree_cache_free(struct cache_set *c)
675 { 675 {
676 struct btree *b; 676 struct btree *b;
677 struct closure cl; 677 struct closure cl;
678 closure_init_stack(&cl); 678 closure_init_stack(&cl);
679 679
680 if (c->shrink.list.next) 680 if (c->shrink.list.next)
681 unregister_shrinker(&c->shrink); 681 unregister_shrinker(&c->shrink);
682 682
683 mutex_lock(&c->bucket_lock); 683 mutex_lock(&c->bucket_lock);
684 684
685 #ifdef CONFIG_BCACHE_DEBUG 685 #ifdef CONFIG_BCACHE_DEBUG
686 if (c->verify_data) 686 if (c->verify_data)
687 list_move(&c->verify_data->list, &c->btree_cache); 687 list_move(&c->verify_data->list, &c->btree_cache);
688 #endif 688 #endif
689 689
690 list_splice(&c->btree_cache_freeable, 690 list_splice(&c->btree_cache_freeable,
691 &c->btree_cache); 691 &c->btree_cache);
692 692
693 while (!list_empty(&c->btree_cache)) { 693 while (!list_empty(&c->btree_cache)) {
694 b = list_first_entry(&c->btree_cache, struct btree, list); 694 b = list_first_entry(&c->btree_cache, struct btree, list);
695 695
696 if (btree_node_dirty(b)) 696 if (btree_node_dirty(b))
697 btree_complete_write(b, btree_current_write(b)); 697 btree_complete_write(b, btree_current_write(b));
698 clear_bit(BTREE_NODE_dirty, &b->flags); 698 clear_bit(BTREE_NODE_dirty, &b->flags);
699 699
700 mca_data_free(b); 700 mca_data_free(b);
701 } 701 }
702 702
703 while (!list_empty(&c->btree_cache_freed)) { 703 while (!list_empty(&c->btree_cache_freed)) {
704 b = list_first_entry(&c->btree_cache_freed, 704 b = list_first_entry(&c->btree_cache_freed,
705 struct btree, list); 705 struct btree, list);
706 list_del(&b->list); 706 list_del(&b->list);
707 cancel_delayed_work_sync(&b->work); 707 cancel_delayed_work_sync(&b->work);
708 kfree(b); 708 kfree(b);
709 } 709 }
710 710
711 mutex_unlock(&c->bucket_lock); 711 mutex_unlock(&c->bucket_lock);
712 } 712 }
713 713
714 int bch_btree_cache_alloc(struct cache_set *c) 714 int bch_btree_cache_alloc(struct cache_set *c)
715 { 715 {
716 unsigned i; 716 unsigned i;
717 717
718 /* XXX: doesn't check for errors */ 718 /* XXX: doesn't check for errors */
719 719
720 closure_init_unlocked(&c->gc); 720 closure_init_unlocked(&c->gc);
721 721
722 for (i = 0; i < mca_reserve(c); i++) 722 for (i = 0; i < mca_reserve(c); i++)
723 mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); 723 mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
724 724
725 list_splice_init(&c->btree_cache, 725 list_splice_init(&c->btree_cache,
726 &c->btree_cache_freeable); 726 &c->btree_cache_freeable);
727 727
728 #ifdef CONFIG_BCACHE_DEBUG 728 #ifdef CONFIG_BCACHE_DEBUG
729 mutex_init(&c->verify_lock); 729 mutex_init(&c->verify_lock);
730 730
731 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); 731 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
732 732
733 if (c->verify_data && 733 if (c->verify_data &&
734 c->verify_data->sets[0].data) 734 c->verify_data->sets[0].data)
735 list_del_init(&c->verify_data->list); 735 list_del_init(&c->verify_data->list);
736 else 736 else
737 c->verify_data = NULL; 737 c->verify_data = NULL;
738 #endif 738 #endif
739 739
740 c->shrink.shrink = bch_mca_shrink; 740 c->shrink.shrink = bch_mca_shrink;
741 c->shrink.seeks = 4; 741 c->shrink.seeks = 4;
742 c->shrink.batch = c->btree_pages * 2; 742 c->shrink.batch = c->btree_pages * 2;
743 register_shrinker(&c->shrink); 743 register_shrinker(&c->shrink);
744 744
745 return 0; 745 return 0;
746 } 746 }
747 747
748 /* Btree in memory cache - hash table */ 748 /* Btree in memory cache - hash table */
749 749
750 static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k) 750 static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
751 { 751 {
752 return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)]; 752 return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
753 } 753 }
754 754
755 static struct btree *mca_find(struct cache_set *c, struct bkey *k) 755 static struct btree *mca_find(struct cache_set *c, struct bkey *k)
756 { 756 {
757 struct btree *b; 757 struct btree *b;
758 758
759 rcu_read_lock(); 759 rcu_read_lock();
760 hlist_for_each_entry_rcu(b, mca_hash(c, k), hash) 760 hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
761 if (PTR_HASH(c, &b->key) == PTR_HASH(c, k)) 761 if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
762 goto out; 762 goto out;
763 b = NULL; 763 b = NULL;
764 out: 764 out:
765 rcu_read_unlock(); 765 rcu_read_unlock();
766 return b; 766 return b;
767 } 767 }
768 768
769 static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, 769 static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
770 int level, struct closure *cl) 770 int level, struct closure *cl)
771 { 771 {
772 int ret = -ENOMEM; 772 int ret = -ENOMEM;
773 struct btree *i; 773 struct btree *i;
774 774
775 trace_bcache_btree_cache_cannibalize(c); 775 trace_bcache_btree_cache_cannibalize(c);
776 776
777 if (!cl) 777 if (!cl)
778 return ERR_PTR(-ENOMEM); 778 return ERR_PTR(-ENOMEM);
779 779
780 /* 780 /*
781 * Trying to free up some memory - i.e. reuse some btree nodes - may 781 * Trying to free up some memory - i.e. reuse some btree nodes - may
782 * require initiating IO to flush the dirty part of the node. If we're 782 * require initiating IO to flush the dirty part of the node. If we're
783 * running under generic_make_request(), that IO will never finish and 783 * running under generic_make_request(), that IO will never finish and
784 * we would deadlock. Returning -EAGAIN causes the cache lookup code to 784 * we would deadlock. Returning -EAGAIN causes the cache lookup code to
785 * punt to workqueue and retry. 785 * punt to workqueue and retry.
786 */ 786 */
787 if (current->bio_list) 787 if (current->bio_list)
788 return ERR_PTR(-EAGAIN); 788 return ERR_PTR(-EAGAIN);
789 789
790 if (c->try_harder && c->try_harder != cl) { 790 if (c->try_harder && c->try_harder != cl) {
791 closure_wait_event_async(&c->try_wait, cl, !c->try_harder); 791 closure_wait_event_async(&c->try_wait, cl, !c->try_harder);
792 return ERR_PTR(-EAGAIN); 792 return ERR_PTR(-EAGAIN);
793 } 793 }
794 794
795 c->try_harder = cl; 795 c->try_harder = cl;
796 c->try_harder_start = local_clock(); 796 c->try_harder_start = local_clock();
797 retry: 797 retry:
798 list_for_each_entry_reverse(i, &c->btree_cache, list) { 798 list_for_each_entry_reverse(i, &c->btree_cache, list) {
799 int r = mca_reap(i, cl, btree_order(k)); 799 int r = mca_reap(i, cl, btree_order(k));
800 if (!r) 800 if (!r)
801 return i; 801 return i;
802 if (r != -ENOMEM) 802 if (r != -ENOMEM)
803 ret = r; 803 ret = r;
804 } 804 }
805 805
806 if (ret == -EAGAIN && 806 if (ret == -EAGAIN &&
807 closure_blocking(cl)) { 807 closure_blocking(cl)) {
808 mutex_unlock(&c->bucket_lock); 808 mutex_unlock(&c->bucket_lock);
809 closure_sync(cl); 809 closure_sync(cl);
810 mutex_lock(&c->bucket_lock); 810 mutex_lock(&c->bucket_lock);
811 goto retry; 811 goto retry;
812 } 812 }
813 813
814 return ERR_PTR(ret); 814 return ERR_PTR(ret);
815 } 815 }
816 816
817 /* 817 /*
818 * We can only have one thread cannibalizing other cached btree nodes at a time, 818 * We can only have one thread cannibalizing other cached btree nodes at a time,
819 * or we'll deadlock. We use an open coded mutex to ensure that, which a 819 * or we'll deadlock. We use an open coded mutex to ensure that, which a
820 * cannibalize_bucket() will take. This means every time we unlock the root of 820 * cannibalize_bucket() will take. This means every time we unlock the root of
821 * the btree, we need to release this lock if we have it held. 821 * the btree, we need to release this lock if we have it held.
822 */ 822 */
823 void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) 823 void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl)
824 { 824 {
825 if (c->try_harder == cl) { 825 if (c->try_harder == cl) {
826 bch_time_stats_update(&c->try_harder_time, c->try_harder_start); 826 bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
827 c->try_harder = NULL; 827 c->try_harder = NULL;
828 __closure_wake_up(&c->try_wait); 828 __closure_wake_up(&c->try_wait);
829 } 829 }
830 } 830 }
831 831
832 static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, 832 static struct btree *mca_alloc(struct cache_set *c, struct bkey *k,
833 int level, struct closure *cl) 833 int level, struct closure *cl)
834 { 834 {
835 struct btree *b; 835 struct btree *b;
836 836
837 lockdep_assert_held(&c->bucket_lock); 837 lockdep_assert_held(&c->bucket_lock);
838 838
839 if (mca_find(c, k)) 839 if (mca_find(c, k))
840 return NULL; 840 return NULL;
841 841
842 /* btree_free() doesn't free memory; it sticks the node on the end of 842 /* btree_free() doesn't free memory; it sticks the node on the end of
843 * the list. Check if there's any freed nodes there: 843 * the list. Check if there's any freed nodes there:
844 */ 844 */
845 list_for_each_entry(b, &c->btree_cache_freeable, list) 845 list_for_each_entry(b, &c->btree_cache_freeable, list)
846 if (!mca_reap(b, NULL, btree_order(k))) 846 if (!mca_reap(b, NULL, btree_order(k)))
847 goto out; 847 goto out;
848 848
849 /* We never free struct btree itself, just the memory that holds the on 849 /* We never free struct btree itself, just the memory that holds the on
850 * disk node. Check the freed list before allocating a new one: 850 * disk node. Check the freed list before allocating a new one:
851 */ 851 */
852 list_for_each_entry(b, &c->btree_cache_freed, list) 852 list_for_each_entry(b, &c->btree_cache_freed, list)
853 if (!mca_reap(b, NULL, 0)) { 853 if (!mca_reap(b, NULL, 0)) {
854 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); 854 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
855 if (!b->sets[0].data) 855 if (!b->sets[0].data)
856 goto err; 856 goto err;
857 else 857 else
858 goto out; 858 goto out;
859 } 859 }
860 860
861 b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO); 861 b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
862 if (!b) 862 if (!b)
863 goto err; 863 goto err;
864 864
865 BUG_ON(!down_write_trylock(&b->lock)); 865 BUG_ON(!down_write_trylock(&b->lock));
866 if (!b->sets->data) 866 if (!b->sets->data)
867 goto err; 867 goto err;
868 out: 868 out:
869 BUG_ON(!closure_is_unlocked(&b->io.cl)); 869 BUG_ON(!closure_is_unlocked(&b->io.cl));
870 870
871 bkey_copy(&b->key, k); 871 bkey_copy(&b->key, k);
872 list_move(&b->list, &c->btree_cache); 872 list_move(&b->list, &c->btree_cache);
873 hlist_del_init_rcu(&b->hash); 873 hlist_del_init_rcu(&b->hash);
874 hlist_add_head_rcu(&b->hash, mca_hash(c, k)); 874 hlist_add_head_rcu(&b->hash, mca_hash(c, k));
875 875
876 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); 876 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
877 b->level = level; 877 b->level = level;
878 878
879 mca_reinit(b); 879 mca_reinit(b);
880 880
881 return b; 881 return b;
882 err: 882 err:
883 if (b) 883 if (b)
884 rw_unlock(true, b); 884 rw_unlock(true, b);
885 885
886 b = mca_cannibalize(c, k, level, cl); 886 b = mca_cannibalize(c, k, level, cl);
887 if (!IS_ERR(b)) 887 if (!IS_ERR(b))
888 goto out; 888 goto out;
889 889
890 return b; 890 return b;
891 } 891 }
892 892
893 /** 893 /**
894 * bch_btree_node_get - find a btree node in the cache and lock it, reading it 894 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
895 * in from disk if necessary. 895 * in from disk if necessary.
896 * 896 *
897 * If IO is necessary, it uses the closure embedded in struct btree_op to wait; 897 * If IO is necessary, it uses the closure embedded in struct btree_op to wait;
898 * if that closure is in non blocking mode, will return -EAGAIN. 898 * if that closure is in non blocking mode, will return -EAGAIN.
899 * 899 *
900 * The btree node will have either a read or a write lock held, depending on 900 * The btree node will have either a read or a write lock held, depending on
901 * level and op->lock. 901 * level and op->lock.
902 */ 902 */
903 struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, 903 struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
904 int level, struct btree_op *op) 904 int level, struct btree_op *op)
905 { 905 {
906 int i = 0; 906 int i = 0;
907 bool write = level <= op->lock; 907 bool write = level <= op->lock;
908 struct btree *b; 908 struct btree *b;
909 909
910 BUG_ON(level < 0); 910 BUG_ON(level < 0);
911 retry: 911 retry:
912 b = mca_find(c, k); 912 b = mca_find(c, k);
913 913
914 if (!b) { 914 if (!b) {
915 if (current->bio_list) 915 if (current->bio_list)
916 return ERR_PTR(-EAGAIN); 916 return ERR_PTR(-EAGAIN);
917 917
918 mutex_lock(&c->bucket_lock); 918 mutex_lock(&c->bucket_lock);
919 b = mca_alloc(c, k, level, &op->cl); 919 b = mca_alloc(c, k, level, &op->cl);
920 mutex_unlock(&c->bucket_lock); 920 mutex_unlock(&c->bucket_lock);
921 921
922 if (!b) 922 if (!b)
923 goto retry; 923 goto retry;
924 if (IS_ERR(b)) 924 if (IS_ERR(b))
925 return b; 925 return b;
926 926
927 bch_btree_node_read(b); 927 bch_btree_node_read(b);
928 928
929 if (!write) 929 if (!write)
930 downgrade_write(&b->lock); 930 downgrade_write(&b->lock);
931 } else { 931 } else {
932 rw_lock(write, b, level); 932 rw_lock(write, b, level);
933 if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) { 933 if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
934 rw_unlock(write, b); 934 rw_unlock(write, b);
935 goto retry; 935 goto retry;
936 } 936 }
937 BUG_ON(b->level != level); 937 BUG_ON(b->level != level);
938 } 938 }
939 939
940 b->accessed = 1; 940 b->accessed = 1;
941 941
942 for (; i <= b->nsets && b->sets[i].size; i++) { 942 for (; i <= b->nsets && b->sets[i].size; i++) {
943 prefetch(b->sets[i].tree); 943 prefetch(b->sets[i].tree);
944 prefetch(b->sets[i].data); 944 prefetch(b->sets[i].data);
945 } 945 }
946 946
947 for (; i <= b->nsets; i++) 947 for (; i <= b->nsets; i++)
948 prefetch(b->sets[i].data); 948 prefetch(b->sets[i].data);
949 949
950 if (btree_node_io_error(b)) { 950 if (btree_node_io_error(b)) {
951 rw_unlock(write, b); 951 rw_unlock(write, b);
952 return ERR_PTR(-EIO); 952 return ERR_PTR(-EIO);
953 } 953 }
954 954
955 BUG_ON(!b->written); 955 BUG_ON(!b->written);
956 956
957 return b; 957 return b;
958 } 958 }
959 959
960 static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) 960 static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
961 { 961 {
962 struct btree *b; 962 struct btree *b;
963 963
964 mutex_lock(&c->bucket_lock); 964 mutex_lock(&c->bucket_lock);
965 b = mca_alloc(c, k, level, NULL); 965 b = mca_alloc(c, k, level, NULL);
966 mutex_unlock(&c->bucket_lock); 966 mutex_unlock(&c->bucket_lock);
967 967
968 if (!IS_ERR_OR_NULL(b)) { 968 if (!IS_ERR_OR_NULL(b)) {
969 bch_btree_node_read(b); 969 bch_btree_node_read(b);
970 rw_unlock(true, b); 970 rw_unlock(true, b);
971 } 971 }
972 } 972 }
973 973
974 /* Btree alloc */ 974 /* Btree alloc */
975 975
976 static void btree_node_free(struct btree *b, struct btree_op *op) 976 static void btree_node_free(struct btree *b, struct btree_op *op)
977 { 977 {
978 unsigned i; 978 unsigned i;
979 979
980 trace_bcache_btree_node_free(b); 980 trace_bcache_btree_node_free(b);
981 981
982 /* 982 /*
983 * The BUG_ON() in btree_node_get() implies that we must have a write 983 * The BUG_ON() in btree_node_get() implies that we must have a write
984 * lock on parent to free or even invalidate a node 984 * lock on parent to free or even invalidate a node
985 */ 985 */
986 BUG_ON(op->lock <= b->level); 986 BUG_ON(op->lock <= b->level);
987 BUG_ON(b == b->c->root); 987 BUG_ON(b == b->c->root);
988 988
989 if (btree_node_dirty(b)) 989 if (btree_node_dirty(b))
990 btree_complete_write(b, btree_current_write(b)); 990 btree_complete_write(b, btree_current_write(b));
991 clear_bit(BTREE_NODE_dirty, &b->flags); 991 clear_bit(BTREE_NODE_dirty, &b->flags);
992 992
993 cancel_delayed_work(&b->work); 993 cancel_delayed_work(&b->work);
994 994
995 mutex_lock(&b->c->bucket_lock); 995 mutex_lock(&b->c->bucket_lock);
996 996
997 for (i = 0; i < KEY_PTRS(&b->key); i++) { 997 for (i = 0; i < KEY_PTRS(&b->key); i++) {
998 BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin)); 998 BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin));
999 999
1000 bch_inc_gen(PTR_CACHE(b->c, &b->key, i), 1000 bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
1001 PTR_BUCKET(b->c, &b->key, i)); 1001 PTR_BUCKET(b->c, &b->key, i));
1002 } 1002 }
1003 1003
1004 bch_bucket_free(b->c, &b->key); 1004 bch_bucket_free(b->c, &b->key);
1005 mca_bucket_free(b); 1005 mca_bucket_free(b);
1006 mutex_unlock(&b->c->bucket_lock); 1006 mutex_unlock(&b->c->bucket_lock);
1007 } 1007 }
1008 1008
1009 struct btree *bch_btree_node_alloc(struct cache_set *c, int level, 1009 struct btree *bch_btree_node_alloc(struct cache_set *c, int level,
1010 struct closure *cl) 1010 struct closure *cl)
1011 { 1011 {
1012 BKEY_PADDED(key) k; 1012 BKEY_PADDED(key) k;
1013 struct btree *b = ERR_PTR(-EAGAIN); 1013 struct btree *b = ERR_PTR(-EAGAIN);
1014 1014
1015 mutex_lock(&c->bucket_lock); 1015 mutex_lock(&c->bucket_lock);
1016 retry: 1016 retry:
1017 if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl)) 1017 if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl))
1018 goto err; 1018 goto err;
1019 1019
1020 SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); 1020 SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
1021 1021
1022 b = mca_alloc(c, &k.key, level, cl); 1022 b = mca_alloc(c, &k.key, level, cl);
1023 if (IS_ERR(b)) 1023 if (IS_ERR(b))
1024 goto err_free; 1024 goto err_free;
1025 1025
1026 if (!b) { 1026 if (!b) {
1027 cache_bug(c, 1027 cache_bug(c,
1028 "Tried to allocate bucket that was in btree cache"); 1028 "Tried to allocate bucket that was in btree cache");
1029 __bkey_put(c, &k.key); 1029 __bkey_put(c, &k.key);
1030 goto retry; 1030 goto retry;
1031 } 1031 }
1032 1032
1033 b->accessed = 1; 1033 b->accessed = 1;
1034 bch_bset_init_next(b); 1034 bch_bset_init_next(b);
1035 1035
1036 mutex_unlock(&c->bucket_lock); 1036 mutex_unlock(&c->bucket_lock);
1037 1037
1038 trace_bcache_btree_node_alloc(b); 1038 trace_bcache_btree_node_alloc(b);
1039 return b; 1039 return b;
1040 err_free: 1040 err_free:
1041 bch_bucket_free(c, &k.key); 1041 bch_bucket_free(c, &k.key);
1042 __bkey_put(c, &k.key); 1042 __bkey_put(c, &k.key);
1043 err: 1043 err:
1044 mutex_unlock(&c->bucket_lock); 1044 mutex_unlock(&c->bucket_lock);
1045 1045
1046 trace_bcache_btree_node_alloc_fail(b); 1046 trace_bcache_btree_node_alloc_fail(b);
1047 return b; 1047 return b;
1048 } 1048 }
1049 1049
1050 static struct btree *btree_node_alloc_replacement(struct btree *b, 1050 static struct btree *btree_node_alloc_replacement(struct btree *b,
1051 struct closure *cl) 1051 struct closure *cl)
1052 { 1052 {
1053 struct btree *n = bch_btree_node_alloc(b->c, b->level, cl); 1053 struct btree *n = bch_btree_node_alloc(b->c, b->level, cl);
1054 if (!IS_ERR_OR_NULL(n)) 1054 if (!IS_ERR_OR_NULL(n))
1055 bch_btree_sort_into(b, n); 1055 bch_btree_sort_into(b, n);
1056 1056
1057 return n; 1057 return n;
1058 } 1058 }
1059 1059
1060 /* Garbage collection */ 1060 /* Garbage collection */
1061 1061
1062 uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) 1062 uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
1063 { 1063 {
1064 uint8_t stale = 0; 1064 uint8_t stale = 0;
1065 unsigned i; 1065 unsigned i;
1066 struct bucket *g; 1066 struct bucket *g;
1067 1067
1068 /* 1068 /*
1069 * ptr_invalid() can't return true for the keys that mark btree nodes as 1069 * ptr_invalid() can't return true for the keys that mark btree nodes as
1070 * freed, but since ptr_bad() returns true we'll never actually use them 1070 * freed, but since ptr_bad() returns true we'll never actually use them
1071 * for anything and thus we don't want to mark their pointers here 1071 * for anything and thus we don't want to mark their pointers here
1072 */ 1072 */
1073 if (!bkey_cmp(k, &ZERO_KEY)) 1073 if (!bkey_cmp(k, &ZERO_KEY))
1074 return stale; 1074 return stale;
1075 1075
1076 for (i = 0; i < KEY_PTRS(k); i++) { 1076 for (i = 0; i < KEY_PTRS(k); i++) {
1077 if (!ptr_available(c, k, i)) 1077 if (!ptr_available(c, k, i))
1078 continue; 1078 continue;
1079 1079
1080 g = PTR_BUCKET(c, k, i); 1080 g = PTR_BUCKET(c, k, i);
1081 1081
1082 if (gen_after(g->gc_gen, PTR_GEN(k, i))) 1082 if (gen_after(g->gc_gen, PTR_GEN(k, i)))
1083 g->gc_gen = PTR_GEN(k, i); 1083 g->gc_gen = PTR_GEN(k, i);
1084 1084
1085 if (ptr_stale(c, k, i)) { 1085 if (ptr_stale(c, k, i)) {
1086 stale = max(stale, ptr_stale(c, k, i)); 1086 stale = max(stale, ptr_stale(c, k, i));
1087 continue; 1087 continue;
1088 } 1088 }
1089 1089
1090 cache_bug_on(GC_MARK(g) && 1090 cache_bug_on(GC_MARK(g) &&
1091 (GC_MARK(g) == GC_MARK_METADATA) != (level != 0), 1091 (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
1092 c, "inconsistent ptrs: mark = %llu, level = %i", 1092 c, "inconsistent ptrs: mark = %llu, level = %i",
1093 GC_MARK(g), level); 1093 GC_MARK(g), level);
1094 1094
1095 if (level) 1095 if (level)
1096 SET_GC_MARK(g, GC_MARK_METADATA); 1096 SET_GC_MARK(g, GC_MARK_METADATA);
1097 else if (KEY_DIRTY(k)) 1097 else if (KEY_DIRTY(k))
1098 SET_GC_MARK(g, GC_MARK_DIRTY); 1098 SET_GC_MARK(g, GC_MARK_DIRTY);
1099 1099
1100 /* guard against overflow */ 1100 /* guard against overflow */
1101 SET_GC_SECTORS_USED(g, min_t(unsigned, 1101 SET_GC_SECTORS_USED(g, min_t(unsigned,
1102 GC_SECTORS_USED(g) + KEY_SIZE(k), 1102 GC_SECTORS_USED(g) + KEY_SIZE(k),
1103 (1 << 14) - 1)); 1103 (1 << 14) - 1));
1104 1104
1105 BUG_ON(!GC_SECTORS_USED(g)); 1105 BUG_ON(!GC_SECTORS_USED(g));
1106 } 1106 }
1107 1107
1108 return stale; 1108 return stale;
1109 } 1109 }
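
Editor's note: the staleness test above rests on 8-bit bucket generations — a key records the bucket's gen at allocation time and goes stale once the bucket's gen advances past it — so the comparison has to survive wraparound of the counter. The toy below reimplements just that idea; gen_after()/staleness() here are sketches, not the bcache functions.

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

/* Toy model of 8-bit bucket generations: a cached pointer remembers the
 * bucket's gen at allocation time and becomes stale once the bucket is
 * reused and its gen moves past that value.  The comparison is done via
 * a signed 8-bit difference so it stays correct across wraparound. */
static bool gen_after(uint8_t a, uint8_t b)
{
	return (int8_t)(a - b) > 0;
}

static uint8_t staleness(uint8_t bucket_gen, uint8_t key_gen)
{
	return gen_after(bucket_gen, key_gen) ?
		(uint8_t)(bucket_gen - key_gen) : 0;
}

int main(void)
{
	uint8_t key_gen = 250;		/* gen recorded in the key */
	uint8_t bucket_gen = 250;

	bucket_gen += 10;		/* bucket reused; counter wraps past 255 */

	printf("stale: %u generations\n", staleness(bucket_gen, key_gen));
	return 0;
}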
1110 1110
1111 #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) 1111 #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k)
1112 1112
1113 static int btree_gc_mark_node(struct btree *b, unsigned *keys, 1113 static int btree_gc_mark_node(struct btree *b, unsigned *keys,
1114 struct gc_stat *gc) 1114 struct gc_stat *gc)
1115 { 1115 {
1116 uint8_t stale = 0; 1116 uint8_t stale = 0;
1117 unsigned last_dev = -1; 1117 unsigned last_dev = -1;
1118 struct bcache_device *d = NULL; 1118 struct bcache_device *d = NULL;
1119 struct bkey *k; 1119 struct bkey *k;
1120 struct btree_iter iter; 1120 struct btree_iter iter;
1121 struct bset_tree *t; 1121 struct bset_tree *t;
1122 1122
1123 gc->nodes++; 1123 gc->nodes++;
1124 1124
1125 for_each_key_filter(b, k, &iter, bch_ptr_invalid) { 1125 for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
1126 if (last_dev != KEY_INODE(k)) { 1126 if (last_dev != KEY_INODE(k)) {
1127 last_dev = KEY_INODE(k); 1127 last_dev = KEY_INODE(k);
1128 1128
1129 d = KEY_INODE(k) < b->c->nr_uuids 1129 d = KEY_INODE(k) < b->c->nr_uuids
1130 ? b->c->devices[last_dev] 1130 ? b->c->devices[last_dev]
1131 : NULL; 1131 : NULL;
1132 } 1132 }
1133 1133
1134 stale = max(stale, btree_mark_key(b, k)); 1134 stale = max(stale, btree_mark_key(b, k));
1135 1135
1136 if (bch_ptr_bad(b, k)) 1136 if (bch_ptr_bad(b, k))
1137 continue; 1137 continue;
1138 1138
1139 *keys += bkey_u64s(k); 1139 *keys += bkey_u64s(k);
1140 1140
1141 gc->key_bytes += bkey_u64s(k); 1141 gc->key_bytes += bkey_u64s(k);
1142 gc->nkeys++; 1142 gc->nkeys++;
1143 1143
1144 gc->data += KEY_SIZE(k); 1144 gc->data += KEY_SIZE(k);
1145 if (KEY_DIRTY(k)) 1145 if (KEY_DIRTY(k))
1146 gc->dirty += KEY_SIZE(k); 1146 gc->dirty += KEY_SIZE(k);
1147 } 1147 }
1148 1148
1149 for (t = b->sets; t <= &b->sets[b->nsets]; t++) 1149 for (t = b->sets; t <= &b->sets[b->nsets]; t++)
1150 btree_bug_on(t->size && 1150 btree_bug_on(t->size &&
1151 bset_written(b, t) && 1151 bset_written(b, t) &&
1152 bkey_cmp(&b->key, &t->end) < 0, 1152 bkey_cmp(&b->key, &t->end) < 0,
1153 b, "found short btree key in gc"); 1153 b, "found short btree key in gc");
1154 1154
1155 return stale; 1155 return stale;
1156 } 1156 }
1157 1157
1158 static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, 1158 static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
1159 struct btree_op *op) 1159 struct btree_op *op)
1160 { 1160 {
1161 /* 1161 /*
1162 * We block priorities from being written for the duration of garbage 1162 * We block priorities from being written for the duration of garbage
1163 * collection, so we can't sleep in btree_alloc() -> 1163 * collection, so we can't sleep in btree_alloc() ->
1164 * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it 1164 * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it
1165 * our closure. 1165 * our closure.
1166 */ 1166 */
1167 struct btree *n = btree_node_alloc_replacement(b, NULL); 1167 struct btree *n = btree_node_alloc_replacement(b, NULL);
1168 1168
1169 if (!IS_ERR_OR_NULL(n)) { 1169 if (!IS_ERR_OR_NULL(n)) {
1170 swap(b, n); 1170 swap(b, n);
1171 __bkey_put(b->c, &b->key); 1171 __bkey_put(b->c, &b->key);
1172 1172
1173 memcpy(k->ptr, b->key.ptr, 1173 memcpy(k->ptr, b->key.ptr,
1174 sizeof(uint64_t) * KEY_PTRS(&b->key)); 1174 sizeof(uint64_t) * KEY_PTRS(&b->key));
1175 1175
1176 btree_node_free(n, op); 1176 btree_node_free(n, op);
1177 up_write(&n->lock); 1177 up_write(&n->lock);
1178 } 1178 }
1179 1179
1180 return b; 1180 return b;
1181 } 1181 }
1182 1182
1183 /* 1183 /*
1184 * Leaving this at 2 until we've got incremental garbage collection done; it 1184 * Leaving this at 2 until we've got incremental garbage collection done; it
1185 * could be higher (and has been tested with 4) except that garbage collection 1185 * could be higher (and has been tested with 4) except that garbage collection
1186 * could take much longer, adversely affecting latency. 1186 * could take much longer, adversely affecting latency.
1187 */ 1187 */
1188 #define GC_MERGE_NODES 2U 1188 #define GC_MERGE_NODES 2U
1189 1189
1190 struct gc_merge_info { 1190 struct gc_merge_info {
1191 struct btree *b; 1191 struct btree *b;
1192 struct bkey *k; 1192 struct bkey *k;
1193 unsigned keys; 1193 unsigned keys;
1194 }; 1194 };
1195 1195
1196 static void btree_gc_coalesce(struct btree *b, struct btree_op *op, 1196 static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
1197 struct gc_stat *gc, struct gc_merge_info *r) 1197 struct gc_stat *gc, struct gc_merge_info *r)
1198 { 1198 {
1199 unsigned nodes = 0, keys = 0, blocks; 1199 unsigned nodes = 0, keys = 0, blocks;
1200 int i; 1200 int i;
1201 1201
1202 while (nodes < GC_MERGE_NODES && r[nodes].b) 1202 while (nodes < GC_MERGE_NODES && r[nodes].b)
1203 keys += r[nodes++].keys; 1203 keys += r[nodes++].keys;
1204 1204
1205 blocks = btree_default_blocks(b->c) * 2 / 3; 1205 blocks = btree_default_blocks(b->c) * 2 / 3;
1206 1206
1207 if (nodes < 2 || 1207 if (nodes < 2 ||
1208 __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) 1208 __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
1209 return; 1209 return;
1210 1210
1211 for (i = nodes - 1; i >= 0; --i) { 1211 for (i = nodes - 1; i >= 0; --i) {
1212 if (r[i].b->written) 1212 if (r[i].b->written)
1213 r[i].b = btree_gc_alloc(r[i].b, r[i].k, op); 1213 r[i].b = btree_gc_alloc(r[i].b, r[i].k, op);
1214 1214
1215 if (r[i].b->written) 1215 if (r[i].b->written)
1216 return; 1216 return;
1217 } 1217 }
1218 1218
1219 for (i = nodes - 1; i > 0; --i) { 1219 for (i = nodes - 1; i > 0; --i) {
1220 struct bset *n1 = r[i].b->sets->data; 1220 struct bset *n1 = r[i].b->sets->data;
1221 struct bset *n2 = r[i - 1].b->sets->data; 1221 struct bset *n2 = r[i - 1].b->sets->data;
1222 struct bkey *k, *last = NULL; 1222 struct bkey *k, *last = NULL;
1223 1223
1224 keys = 0; 1224 keys = 0;
1225 1225
1226 if (i == 1) { 1226 if (i == 1) {
1227 /* 1227 /*
1228 * Last node we're not getting rid of - we're getting 1228 * Last node we're not getting rid of - we're getting
1229 * rid of the node at r[0]. We have to try to fit all of 1229 * rid of the node at r[0]. We have to try to fit all of
1230 * the remaining keys into this node; we can't ensure 1230 * the remaining keys into this node; we can't ensure
1231 * they will always fit due to rounding and variable 1231 * they will always fit due to rounding and variable
1232 * length keys (shouldn't be possible in practice, 1232 * length keys (shouldn't be possible in practice,
1233 * though) 1233 * though)
1234 */ 1234 */
1235 if (__set_blocks(n1, n1->keys + r->keys, 1235 if (__set_blocks(n1, n1->keys + r->keys,
1236 b->c) > btree_blocks(r[i].b)) 1236 b->c) > btree_blocks(r[i].b))
1237 return; 1237 return;
1238 1238
1239 keys = n2->keys; 1239 keys = n2->keys;
1240 last = &r->b->key; 1240 last = &r->b->key;
1241 } else 1241 } else
1242 for (k = n2->start; 1242 for (k = n2->start;
1243 k < end(n2); 1243 k < end(n2);
1244 k = bkey_next(k)) { 1244 k = bkey_next(k)) {
1245 if (__set_blocks(n1, n1->keys + keys + 1245 if (__set_blocks(n1, n1->keys + keys +
1246 bkey_u64s(k), b->c) > blocks) 1246 bkey_u64s(k), b->c) > blocks)
1247 break; 1247 break;
1248 1248
1249 last = k; 1249 last = k;
1250 keys += bkey_u64s(k); 1250 keys += bkey_u64s(k);
1251 } 1251 }
1252 1252
1253 BUG_ON(__set_blocks(n1, n1->keys + keys, 1253 BUG_ON(__set_blocks(n1, n1->keys + keys,
1254 b->c) > btree_blocks(r[i].b)); 1254 b->c) > btree_blocks(r[i].b));
1255 1255
1256 if (last) { 1256 if (last) {
1257 bkey_copy_key(&r[i].b->key, last); 1257 bkey_copy_key(&r[i].b->key, last);
1258 bkey_copy_key(r[i].k, last); 1258 bkey_copy_key(r[i].k, last);
1259 } 1259 }
1260 1260
1261 memcpy(end(n1), 1261 memcpy(end(n1),
1262 n2->start, 1262 n2->start,
1263 (void *) node(n2, keys) - (void *) n2->start); 1263 (void *) node(n2, keys) - (void *) n2->start);
1264 1264
1265 n1->keys += keys; 1265 n1->keys += keys;
1266 1266
1267 memmove(n2->start, 1267 memmove(n2->start,
1268 node(n2, keys), 1268 node(n2, keys),
1269 (void *) end(n2) - (void *) node(n2, keys)); 1269 (void *) end(n2) - (void *) node(n2, keys));
1270 1270
1271 n2->keys -= keys; 1271 n2->keys -= keys;
1272 1272
1273 r[i].keys = n1->keys; 1273 r[i].keys = n1->keys;
1274 r[i - 1].keys = n2->keys; 1274 r[i - 1].keys = n2->keys;
1275 } 1275 }
1276 1276
1277 btree_node_free(r->b, op); 1277 btree_node_free(r->b, op);
1278 up_write(&r->b->lock); 1278 up_write(&r->b->lock);
1279 1279
1280 trace_bcache_btree_gc_coalesce(nodes); 1280 trace_bcache_btree_gc_coalesce(nodes);
1281 1281
1282 gc->nodes--; 1282 gc->nodes--;
1283 nodes--; 1283 nodes--;
1284 1284
1285 memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes); 1285 memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes);
1286 memset(&r[nodes], 0, sizeof(struct gc_merge_info)); 1286 memset(&r[nodes], 0, sizeof(struct gc_merge_info));
1287 } 1287 }
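
Editor's note: the early return in btree_gc_coalesce() only lets a merge go ahead when all the keys from the candidate nodes would fit in one fewer node filled to roughly two thirds, so coalescing actually frees a node and still leaves slack for inserts. A toy of that decision, under invented sizes (the kernel computes block counts via __set_blocks(); NODE_BLOCKS here is made up):

#include <stdio.h>

/* Rough sketch of the coalesce decision: merge N adjacent nodes only if
 * their combined keys fit into N-1 nodes filled to ~2/3, so the merge
 * frees a whole node and leaves room for future inserts. */
#define NODE_BLOCKS	64		/* blocks per btree node (made up) */

static int should_coalesce(unsigned nodes, unsigned total_key_blocks)
{
	unsigned threshold = NODE_BLOCKS * 2 / 3;	/* target fill per node */

	return nodes >= 2 && total_key_blocks <= threshold * (nodes - 1);
}

int main(void)
{
	/* three nodes whose keys occupy 80 blocks total: fits in 2 * 42 */
	printf("3 nodes, 80 blocks -> %s\n",
	       should_coalesce(3, 80) ? "coalesce" : "leave alone");

	/* two nearly full nodes: 100 blocks won't fit in one */
	printf("2 nodes, 100 blocks -> %s\n",
	       should_coalesce(2, 100) ? "coalesce" : "leave alone");
	return 0;
}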
1288 1288
1289 static int btree_gc_recurse(struct btree *b, struct btree_op *op, 1289 static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1290 struct closure *writes, struct gc_stat *gc) 1290 struct closure *writes, struct gc_stat *gc)
1291 { 1291 {
1292 void write(struct btree *r) 1292 void write(struct btree *r)
1293 { 1293 {
1294 if (!r->written) 1294 if (!r->written)
1295 bch_btree_node_write(r, &op->cl); 1295 bch_btree_node_write(r, &op->cl);
1296 else if (btree_node_dirty(r)) 1296 else if (btree_node_dirty(r))
1297 bch_btree_node_write(r, writes); 1297 bch_btree_node_write(r, writes);
1298 1298
1299 up_write(&r->lock); 1299 up_write(&r->lock);
1300 } 1300 }
1301 1301
1302 int ret = 0, stale; 1302 int ret = 0, stale;
1303 unsigned i; 1303 unsigned i;
1304 struct gc_merge_info r[GC_MERGE_NODES]; 1304 struct gc_merge_info r[GC_MERGE_NODES];
1305 1305
1306 memset(r, 0, sizeof(r)); 1306 memset(r, 0, sizeof(r));
1307 1307
1308 while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { 1308 while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) {
1309 r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op); 1309 r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op);
1310 1310
1311 if (IS_ERR(r->b)) { 1311 if (IS_ERR(r->b)) {
1312 ret = PTR_ERR(r->b); 1312 ret = PTR_ERR(r->b);
1313 break; 1313 break;
1314 } 1314 }
1315 1315
1316 r->keys = 0; 1316 r->keys = 0;
1317 stale = btree_gc_mark_node(r->b, &r->keys, gc); 1317 stale = btree_gc_mark_node(r->b, &r->keys, gc);
1318 1318
1319 if (!b->written && 1319 if (!b->written &&
1320 (r->b->level || stale > 10 || 1320 (r->b->level || stale > 10 ||
1321 b->c->gc_always_rewrite)) 1321 b->c->gc_always_rewrite))
1322 r->b = btree_gc_alloc(r->b, r->k, op); 1322 r->b = btree_gc_alloc(r->b, r->k, op);
1323 1323
1324 if (r->b->level) 1324 if (r->b->level)
1325 ret = btree_gc_recurse(r->b, op, writes, gc); 1325 ret = btree_gc_recurse(r->b, op, writes, gc);
1326 1326
1327 if (ret) { 1327 if (ret) {
1328 write(r->b); 1328 write(r->b);
1329 break; 1329 break;
1330 } 1330 }
1331 1331
1332 bkey_copy_key(&b->c->gc_done, r->k); 1332 bkey_copy_key(&b->c->gc_done, r->k);
1333 1333
1334 if (!b->written) 1334 if (!b->written)
1335 btree_gc_coalesce(b, op, gc, r); 1335 btree_gc_coalesce(b, op, gc, r);
1336 1336
1337 if (r[GC_MERGE_NODES - 1].b) 1337 if (r[GC_MERGE_NODES - 1].b)
1338 write(r[GC_MERGE_NODES - 1].b); 1338 write(r[GC_MERGE_NODES - 1].b);
1339 1339
1340 memmove(&r[1], &r[0], 1340 memmove(&r[1], &r[0],
1341 sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1)); 1341 sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1));
1342 1342
1343 /* When we've got incremental GC working, we'll want to do 1343 /* When we've got incremental GC working, we'll want to do
1344 * if (should_resched()) 1344 * if (should_resched())
1345 * return -EAGAIN; 1345 * return -EAGAIN;
1346 */ 1346 */
1347 cond_resched(); 1347 cond_resched();
1348 #if 0 1348 #if 0
1349 if (need_resched()) { 1349 if (need_resched()) {
1350 ret = -EAGAIN; 1350 ret = -EAGAIN;
1351 break; 1351 break;
1352 } 1352 }
1353 #endif 1353 #endif
1354 } 1354 }
1355 1355
1356 for (i = 1; i < GC_MERGE_NODES && r[i].b; i++) 1356 for (i = 1; i < GC_MERGE_NODES && r[i].b; i++)
1357 write(r[i].b); 1357 write(r[i].b);
1358 1358
1359 /* Might have freed some children, must remove their keys */ 1359 /* Might have freed some children, must remove their keys */
1360 if (!b->written) 1360 if (!b->written)
1361 bch_btree_sort(b); 1361 bch_btree_sort(b);
1362 1362
1363 return ret; 1363 return ret;
1364 } 1364 }
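
Editor's note: the r[] array in btree_gc_recurse() acts as a small sliding window over the last GC_MERGE_NODES children visited — each new child lands in slot 0, older ones slide toward the end, and whatever falls off the far end gets written out. A minimal sketch of that window bookkeeping, with the element type reduced to an int and the flush order simplified:

#include <stdio.h>
#include <string.h>

#define WINDOW	2	/* mirrors GC_MERGE_NODES */

/* Sliding window over the last WINDOW children: the oldest entry is
 * flushed when it is about to fall off the end, then everything shifts
 * up one slot so slot 0 always holds the most recent child. */
static void visit(int w[WINDOW], int child)
{
	if (w[WINDOW - 1])
		printf("flush child %d\n", w[WINDOW - 1]);

	memmove(&w[1], &w[0], sizeof(int) * (WINDOW - 1));
	w[0] = child;
}

int main(void)
{
	int w[WINDOW] = { 0 };

	for (int child = 1; child <= 4; child++)
		visit(w, child);

	/* whatever is still in the window gets flushed at the end */
	for (int i = 0; i < WINDOW; i++)
		if (w[i])
			printf("final flush child %d\n", w[i]);
	return 0;
}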
1365 1365
1366 static int bch_btree_gc_root(struct btree *b, struct btree_op *op, 1366 static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
1367 struct closure *writes, struct gc_stat *gc) 1367 struct closure *writes, struct gc_stat *gc)
1368 { 1368 {
1369 struct btree *n = NULL; 1369 struct btree *n = NULL;
1370 unsigned keys = 0; 1370 unsigned keys = 0;
1371 int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); 1371 int ret = 0, stale = btree_gc_mark_node(b, &keys, gc);
1372 1372
1373 if (b->level || stale > 10) 1373 if (b->level || stale > 10)
1374 n = btree_node_alloc_replacement(b, NULL); 1374 n = btree_node_alloc_replacement(b, NULL);
1375 1375
1376 if (!IS_ERR_OR_NULL(n)) 1376 if (!IS_ERR_OR_NULL(n))
1377 swap(b, n); 1377 swap(b, n);
1378 1378
1379 if (b->level) 1379 if (b->level)
1380 ret = btree_gc_recurse(b, op, writes, gc); 1380 ret = btree_gc_recurse(b, op, writes, gc);
1381 1381
1382 if (!b->written || btree_node_dirty(b)) { 1382 if (!b->written || btree_node_dirty(b)) {
1383 bch_btree_node_write(b, n ? &op->cl : NULL); 1383 bch_btree_node_write(b, n ? &op->cl : NULL);
1384 } 1384 }
1385 1385
1386 if (!IS_ERR_OR_NULL(n)) { 1386 if (!IS_ERR_OR_NULL(n)) {
1387 closure_sync(&op->cl); 1387 closure_sync(&op->cl);
1388 bch_btree_set_root(b); 1388 bch_btree_set_root(b);
1389 btree_node_free(n, op); 1389 btree_node_free(n, op);
1390 rw_unlock(true, b); 1390 rw_unlock(true, b);
1391 } 1391 }
1392 1392
1393 return ret; 1393 return ret;
1394 } 1394 }
1395 1395
1396 static void btree_gc_start(struct cache_set *c) 1396 static void btree_gc_start(struct cache_set *c)
1397 { 1397 {
1398 struct cache *ca; 1398 struct cache *ca;
1399 struct bucket *b; 1399 struct bucket *b;
1400 unsigned i; 1400 unsigned i;
1401 1401
1402 if (!c->gc_mark_valid) 1402 if (!c->gc_mark_valid)
1403 return; 1403 return;
1404 1404
1405 mutex_lock(&c->bucket_lock); 1405 mutex_lock(&c->bucket_lock);
1406 1406
1407 c->gc_mark_valid = 0; 1407 c->gc_mark_valid = 0;
1408 c->gc_done = ZERO_KEY; 1408 c->gc_done = ZERO_KEY;
1409 1409
1410 for_each_cache(ca, c, i) 1410 for_each_cache(ca, c, i)
1411 for_each_bucket(b, ca) { 1411 for_each_bucket(b, ca) {
1412 b->gc_gen = b->gen; 1412 b->gc_gen = b->gen;
1413 if (!atomic_read(&b->pin)) 1413 if (!atomic_read(&b->pin))
1414 SET_GC_MARK(b, GC_MARK_RECLAIMABLE); 1414 SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
1415 } 1415 }
1416 1416
1417 mutex_unlock(&c->bucket_lock); 1417 mutex_unlock(&c->bucket_lock);
1418 } 1418 }
1419 1419
1420 size_t bch_btree_gc_finish(struct cache_set *c) 1420 size_t bch_btree_gc_finish(struct cache_set *c)
1421 { 1421 {
1422 size_t available = 0; 1422 size_t available = 0;
1423 struct bucket *b; 1423 struct bucket *b;
1424 struct cache *ca; 1424 struct cache *ca;
1425 unsigned i; 1425 unsigned i;
1426 1426
1427 mutex_lock(&c->bucket_lock); 1427 mutex_lock(&c->bucket_lock);
1428 1428
1429 set_gc_sectors(c); 1429 set_gc_sectors(c);
1430 c->gc_mark_valid = 1; 1430 c->gc_mark_valid = 1;
1431 c->need_gc = 0; 1431 c->need_gc = 0;
1432 1432
1433 if (c->root) 1433 if (c->root)
1434 for (i = 0; i < KEY_PTRS(&c->root->key); i++) 1434 for (i = 0; i < KEY_PTRS(&c->root->key); i++)
1435 SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i), 1435 SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i),
1436 GC_MARK_METADATA); 1436 GC_MARK_METADATA);
1437 1437
1438 for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) 1438 for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
1439 SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), 1439 SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
1440 GC_MARK_METADATA); 1440 GC_MARK_METADATA);
1441 1441
1442 for_each_cache(ca, c, i) { 1442 for_each_cache(ca, c, i) {
1443 uint64_t *i; 1443 uint64_t *i;
1444 1444
1445 ca->invalidate_needs_gc = 0; 1445 ca->invalidate_needs_gc = 0;
1446 1446
1447 for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++) 1447 for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
1448 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); 1448 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1449 1449
1450 for (i = ca->prio_buckets; 1450 for (i = ca->prio_buckets;
1451 i < ca->prio_buckets + prio_buckets(ca) * 2; i++) 1451 i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
1452 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); 1452 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1453 1453
1454 for_each_bucket(b, ca) { 1454 for_each_bucket(b, ca) {
1455 b->last_gc = b->gc_gen; 1455 b->last_gc = b->gc_gen;
1456 c->need_gc = max(c->need_gc, bucket_gc_gen(b)); 1456 c->need_gc = max(c->need_gc, bucket_gc_gen(b));
1457 1457
1458 if (!atomic_read(&b->pin) && 1458 if (!atomic_read(&b->pin) &&
1459 GC_MARK(b) == GC_MARK_RECLAIMABLE) { 1459 GC_MARK(b) == GC_MARK_RECLAIMABLE) {
1460 available++; 1460 available++;
1461 if (!GC_SECTORS_USED(b)) 1461 if (!GC_SECTORS_USED(b))
1462 bch_bucket_add_unused(ca, b); 1462 bch_bucket_add_unused(ca, b);
1463 } 1463 }
1464 } 1464 }
1465 } 1465 }
1466 1466
1467 mutex_unlock(&c->bucket_lock); 1467 mutex_unlock(&c->bucket_lock);
1468 return available; 1468 return available;
1469 } 1469 }
1470 1470
1471 static void bch_btree_gc(struct closure *cl) 1471 static void bch_btree_gc(struct closure *cl)
1472 { 1472 {
1473 struct cache_set *c = container_of(cl, struct cache_set, gc.cl); 1473 struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
1474 int ret; 1474 int ret;
1475 unsigned long available; 1475 unsigned long available;
1476 struct gc_stat stats; 1476 struct gc_stat stats;
1477 struct closure writes; 1477 struct closure writes;
1478 struct btree_op op; 1478 struct btree_op op;
1479 uint64_t start_time = local_clock(); 1479 uint64_t start_time = local_clock();
1480 1480
1481 trace_bcache_gc_start(c); 1481 trace_bcache_gc_start(c);
1482 1482
1483 memset(&stats, 0, sizeof(struct gc_stat)); 1483 memset(&stats, 0, sizeof(struct gc_stat));
1484 closure_init_stack(&writes); 1484 closure_init_stack(&writes);
1485 bch_btree_op_init_stack(&op); 1485 bch_btree_op_init_stack(&op);
1486 op.lock = SHRT_MAX; 1486 op.lock = SHRT_MAX;
1487 1487
1488 btree_gc_start(c); 1488 btree_gc_start(c);
1489 1489
1490 atomic_inc(&c->prio_blocked); 1490 atomic_inc(&c->prio_blocked);
1491 1491
1492 ret = btree_root(gc_root, c, &op, &writes, &stats); 1492 ret = btree_root(gc_root, c, &op, &writes, &stats);
1493 closure_sync(&op.cl); 1493 closure_sync(&op.cl);
1494 closure_sync(&writes); 1494 closure_sync(&writes);
1495 1495
1496 if (ret) { 1496 if (ret) {
1497 pr_warn("gc failed!"); 1497 pr_warn("gc failed!");
1498 continue_at(cl, bch_btree_gc, bch_gc_wq); 1498 continue_at(cl, bch_btree_gc, bch_gc_wq);
1499 } 1499 }
1500 1500
1501 /* Possibly wait for new UUIDs or whatever to hit disk */ 1501 /* Possibly wait for new UUIDs or whatever to hit disk */
1502 bch_journal_meta(c, &op.cl); 1502 bch_journal_meta(c, &op.cl);
1503 closure_sync(&op.cl); 1503 closure_sync(&op.cl);
1504 1504
1505 available = bch_btree_gc_finish(c); 1505 available = bch_btree_gc_finish(c);
1506 1506
1507 atomic_dec(&c->prio_blocked); 1507 atomic_dec(&c->prio_blocked);
1508 wake_up_allocators(c); 1508 wake_up_allocators(c);
1509 1509
1510 bch_time_stats_update(&c->btree_gc_time, start_time); 1510 bch_time_stats_update(&c->btree_gc_time, start_time);
1511 1511
1512 stats.key_bytes *= sizeof(uint64_t); 1512 stats.key_bytes *= sizeof(uint64_t);
1513 stats.dirty <<= 9; 1513 stats.dirty <<= 9;
1514 stats.data <<= 9; 1514 stats.data <<= 9;
1515 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; 1515 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
1516 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); 1516 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
1517 1517
1518 trace_bcache_gc_end(c); 1518 trace_bcache_gc_end(c);
1519 1519
1520 continue_at(cl, bch_moving_gc, bch_gc_wq); 1520 continue_at(cl, bch_moving_gc, bch_gc_wq);
1521 } 1521 }
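
Editor's note: the stats block at the end of bch_btree_gc() converts GC's internal units before publishing them — key_bytes is accumulated in u64 words, dirty/data in 512-byte sectors, and in_use is a whole-bucket percentage. A quick sketch of those conversions with arbitrary sample values:

#include <stdint.h>
#include <stdio.h>

/* GC stats are kept in convenient internal units while the tree is
 * walked and only converted when published: u64 words -> bytes,
 * 512-byte sectors -> bytes (<< 9), and buckets -> a percentage. */
int main(void)
{
	uint64_t key_words = 12345;		/* bkey u64s counted during gc */
	uint64_t dirty_sectors = 1 << 20;	/* arbitrary sample values */
	uint64_t nbuckets = 100000, available = 37000;

	uint64_t key_bytes = key_words * sizeof(uint64_t);
	uint64_t dirty_bytes = dirty_sectors << 9;
	unsigned in_use = (unsigned)((nbuckets - available) * 100 / nbuckets);

	printf("key_bytes=%llu dirty=%llu bytes in_use=%u%%\n",
	       (unsigned long long)key_bytes,
	       (unsigned long long)dirty_bytes, in_use);
	return 0;
}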
1522 1522
1523 void bch_queue_gc(struct cache_set *c) 1523 void bch_queue_gc(struct cache_set *c)
1524 { 1524 {
1525 closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl); 1525 closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl);
1526 } 1526 }
1527 1527
1528 /* Initial partial gc */ 1528 /* Initial partial gc */
1529 1529
1530 static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, 1530 static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1531 unsigned long **seen) 1531 unsigned long **seen)
1532 { 1532 {
1533 int ret; 1533 int ret;
1534 unsigned i; 1534 unsigned i;
1535 struct bkey *k; 1535 struct bkey *k;
1536 struct bucket *g; 1536 struct bucket *g;
1537 struct btree_iter iter; 1537 struct btree_iter iter;
1538 1538
1539 for_each_key_filter(b, k, &iter, bch_ptr_invalid) { 1539 for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
1540 for (i = 0; i < KEY_PTRS(k); i++) { 1540 for (i = 0; i < KEY_PTRS(k); i++) {
1541 if (!ptr_available(b->c, k, i)) 1541 if (!ptr_available(b->c, k, i))
1542 continue; 1542 continue;
1543 1543
1544 g = PTR_BUCKET(b->c, k, i); 1544 g = PTR_BUCKET(b->c, k, i);
1545 1545
1546 if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i), 1546 if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i),
1547 seen[PTR_DEV(k, i)]) || 1547 seen[PTR_DEV(k, i)]) ||
1548 !ptr_stale(b->c, k, i)) { 1548 !ptr_stale(b->c, k, i)) {
1549 g->gen = PTR_GEN(k, i); 1549 g->gen = PTR_GEN(k, i);
1550 1550
1551 if (b->level) 1551 if (b->level)
1552 g->prio = BTREE_PRIO; 1552 g->prio = BTREE_PRIO;
1553 else if (g->prio == BTREE_PRIO) 1553 else if (g->prio == BTREE_PRIO)
1554 g->prio = INITIAL_PRIO; 1554 g->prio = INITIAL_PRIO;
1555 } 1555 }
1556 } 1556 }
1557 1557
1558 btree_mark_key(b, k); 1558 btree_mark_key(b, k);
1559 } 1559 }
1560 1560
1561 if (b->level) { 1561 if (b->level) {
1562 k = bch_next_recurse_key(b, &ZERO_KEY); 1562 k = bch_next_recurse_key(b, &ZERO_KEY);
1563 1563
1564 while (k) { 1564 while (k) {
1565 struct bkey *p = bch_next_recurse_key(b, k); 1565 struct bkey *p = bch_next_recurse_key(b, k);
1566 if (p) 1566 if (p)
1567 btree_node_prefetch(b->c, p, b->level - 1); 1567 btree_node_prefetch(b->c, p, b->level - 1);
1568 1568
1569 ret = btree(check_recurse, k, b, op, seen); 1569 ret = btree(check_recurse, k, b, op, seen);
1570 if (ret) 1570 if (ret)
1571 return ret; 1571 return ret;
1572 1572
1573 k = p; 1573 k = p;
1574 } 1574 }
1575 } 1575 }
1576 1576
1577 return 0; 1577 return 0;
1578 } 1578 }
1579 1579
1580 int bch_btree_check(struct cache_set *c, struct btree_op *op) 1580 int bch_btree_check(struct cache_set *c, struct btree_op *op)
1581 { 1581 {
1582 int ret = -ENOMEM; 1582 int ret = -ENOMEM;
1583 unsigned i; 1583 unsigned i;
1584 unsigned long *seen[MAX_CACHES_PER_SET]; 1584 unsigned long *seen[MAX_CACHES_PER_SET];
1585 1585
1586 memset(seen, 0, sizeof(seen)); 1586 memset(seen, 0, sizeof(seen));
1587 1587
1588 for (i = 0; c->cache[i]; i++) { 1588 for (i = 0; c->cache[i]; i++) {
1589 size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); 1589 size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8);
1590 seen[i] = kmalloc(n, GFP_KERNEL); 1590 seen[i] = kmalloc(n, GFP_KERNEL);
1591 if (!seen[i]) 1591 if (!seen[i])
1592 goto err; 1592 goto err;
1593 1593
1594 /* Disables the seen array until prio_read() uses it too */ 1594 /* Disables the seen array until prio_read() uses it too */
1595 memset(seen[i], 0xFF, n); 1595 memset(seen[i], 0xFF, n);
1596 } 1596 }
1597 1597
1598 ret = btree_root(check_recurse, c, op, seen); 1598 ret = btree_root(check_recurse, c, op, seen);
1599 err: 1599 err:
1600 for (i = 0; i < MAX_CACHES_PER_SET; i++) 1600 for (i = 0; i < MAX_CACHES_PER_SET; i++)
1601 kfree(seen[i]); 1601 kfree(seen[i]);
1602 return ret; 1602 return ret;
1603 } 1603 }
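
Editor's note: the seen[] arrays are meant as one-bit-per-bucket bitmaps, sized at DIV_ROUND_UP(nbuckets, 8) bytes, with __test_and_set_bit() letting the recursion take a bucket's gen only the first time a pointer to it is seen (though the memset to 0xFF above currently neutralizes the check, as its comment says). A userspace sketch of the same test-and-set bitmap; test_and_set_bit() here is a plain, non-atomic stand-in for the kernel helper.

#include <stdio.h>
#include <stdlib.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Non-atomic stand-in for __test_and_set_bit(): returns the old value of
 * the bit and sets it, so the first caller for a given bucket wins and
 * later sightings of the same bucket are ignored. */
static int test_and_set_bit(unsigned long nr, unsigned char *bitmap)
{
	unsigned char mask = 1u << (nr & 7);
	int old = (bitmap[nr >> 3] & mask) != 0;

	bitmap[nr >> 3] |= mask;
	return old;
}

int main(void)
{
	unsigned long nbuckets = 1000;
	unsigned char *seen = calloc(DIV_ROUND_UP(nbuckets, 8), 1);

	if (!seen)
		return 1;

	printf("bucket 42 first seen:  %d\n", !test_and_set_bit(42, seen));
	printf("bucket 42 second seen: %d\n", !test_and_set_bit(42, seen));

	free(seen);
	return 0;
}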
1604 1604
1605 /* Btree insertion */ 1605 /* Btree insertion */
1606 1606
1607 static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) 1607 static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert)
1608 { 1608 {
1609 struct bset *i = b->sets[b->nsets].data; 1609 struct bset *i = b->sets[b->nsets].data;
1610 1610
1611 memmove((uint64_t *) where + bkey_u64s(insert), 1611 memmove((uint64_t *) where + bkey_u64s(insert),
1612 where, 1612 where,
1613 (void *) end(i) - (void *) where); 1613 (void *) end(i) - (void *) where);
1614 1614
1615 i->keys += bkey_u64s(insert); 1615 i->keys += bkey_u64s(insert);
1616 bkey_copy(where, insert); 1616 bkey_copy(where, insert);
1617 bch_bset_fix_lookup_table(b, where); 1617 bch_bset_fix_lookup_table(b, where);
1618 } 1618 }
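
Editor's note: shift_keys() opens a hole in the flat array of 64-bit words that backs a bset and copies the new key into it; everything after the insertion point slides up by the key's length in words. A standalone sketch of that memmove pattern on a plain uint64_t array (sizes and values are arbitrary):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* A bset is just a flat array of u64s holding variable-length keys, so
 * inserting means: move everything from the insertion point upward by
 * the new key's length in words, then copy the key into the hole. */
static void insert_words(uint64_t *set, unsigned *nr, unsigned pos,
			 const uint64_t *key, unsigned key_words)
{
	memmove(set + pos + key_words, set + pos,
		(*nr - pos) * sizeof(uint64_t));
	memcpy(set + pos, key, key_words * sizeof(uint64_t));
	*nr += key_words;
}

int main(void)
{
	uint64_t set[16] = { 10, 11, 30, 31 };
	unsigned nr = 4;
	uint64_t key[2] = { 20, 21 };		/* a two-word "key" */

	insert_words(set, &nr, 2, key, 2);	/* insert before the 30,31 pair */

	for (unsigned i = 0; i < nr; i++)
		printf("%llu ", (unsigned long long)set[i]);
	printf("\n");				/* 10 11 20 21 30 31 */
	return 0;
}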
1619 1619
1620 static bool fix_overlapping_extents(struct btree *b, 1620 static bool fix_overlapping_extents(struct btree *b,
1621 struct bkey *insert, 1621 struct bkey *insert,
1622 struct btree_iter *iter, 1622 struct btree_iter *iter,
1623 struct btree_op *op) 1623 struct btree_op *op)
1624 { 1624 {
1625 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) 1625 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
1626 { 1626 {
1627 if (KEY_DIRTY(k)) 1627 if (KEY_DIRTY(k))
1628 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), 1628 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1629 offset, -sectors); 1629 offset, -sectors);
1630 } 1630 }
1631 1631
1632 uint64_t old_offset; 1632 uint64_t old_offset;
1633 unsigned old_size, sectors_found = 0; 1633 unsigned old_size, sectors_found = 0;
1634 1634
1635 while (1) { 1635 while (1) {
1636 struct bkey *k = bch_btree_iter_next(iter); 1636 struct bkey *k = bch_btree_iter_next(iter);
1637 if (!k || 1637 if (!k ||
1638 bkey_cmp(&START_KEY(k), insert) >= 0) 1638 bkey_cmp(&START_KEY(k), insert) >= 0)
1639 break; 1639 break;
1640 1640
1641 if (bkey_cmp(k, &START_KEY(insert)) <= 0) 1641 if (bkey_cmp(k, &START_KEY(insert)) <= 0)
1642 continue; 1642 continue;
1643 1643
1644 old_offset = KEY_START(k); 1644 old_offset = KEY_START(k);
1645 old_size = KEY_SIZE(k); 1645 old_size = KEY_SIZE(k);
1646 1646
1647 /* 1647 /*
1648 * We might overlap with 0 size extents; we can't skip these 1648 * We might overlap with 0 size extents; we can't skip these
1649 * because if they're in the set we're inserting to we have to 1649 * because if they're in the set we're inserting to we have to
1650 * adjust them so they don't overlap with the key we're 1650 * adjust them so they don't overlap with the key we're
1651 * inserting. But we don't want to check them for BTREE_REPLACE 1651 * inserting. But we don't want to check them for BTREE_REPLACE
1652 * operations. 1652 * operations.
1653 */ 1653 */
1654 1654
1655 if (op->type == BTREE_REPLACE && 1655 if (op->type == BTREE_REPLACE &&
1656 KEY_SIZE(k)) { 1656 KEY_SIZE(k)) {
1657 /* 1657 /*
1658 * k might have been split since we inserted/found the 1658 * k might have been split since we inserted/found the
1659 * key we're replacing 1659 * key we're replacing
1660 */ 1660 */
1661 unsigned i; 1661 unsigned i;
1662 uint64_t offset = KEY_START(k) - 1662 uint64_t offset = KEY_START(k) -
1663 KEY_START(&op->replace); 1663 KEY_START(&op->replace);
1664 1664
1665 /* But it must be a subset of the replace key */ 1665 /* But it must be a subset of the replace key */
1666 if (KEY_START(k) < KEY_START(&op->replace) || 1666 if (KEY_START(k) < KEY_START(&op->replace) ||
1667 KEY_OFFSET(k) > KEY_OFFSET(&op->replace)) 1667 KEY_OFFSET(k) > KEY_OFFSET(&op->replace))
1668 goto check_failed; 1668 goto check_failed;
1669 1669
1670 /* We didn't find a key that we were supposed to */ 1670 /* We didn't find a key that we were supposed to */
1671 if (KEY_START(k) > KEY_START(insert) + sectors_found) 1671 if (KEY_START(k) > KEY_START(insert) + sectors_found)
1672 goto check_failed; 1672 goto check_failed;
1673 1673
1674 if (KEY_PTRS(&op->replace) != KEY_PTRS(k)) 1674 if (KEY_PTRS(&op->replace) != KEY_PTRS(k))
1675 goto check_failed; 1675 goto check_failed;
1676 1676
1677 /* skip past gen */ 1677 /* skip past gen */
1678 offset <<= 8; 1678 offset <<= 8;
1679 1679
1680 BUG_ON(!KEY_PTRS(&op->replace)); 1680 BUG_ON(!KEY_PTRS(&op->replace));
1681 1681
1682 for (i = 0; i < KEY_PTRS(&op->replace); i++) 1682 for (i = 0; i < KEY_PTRS(&op->replace); i++)
1683 if (k->ptr[i] != op->replace.ptr[i] + offset) 1683 if (k->ptr[i] != op->replace.ptr[i] + offset)
1684 goto check_failed; 1684 goto check_failed;
1685 1685
1686 sectors_found = KEY_OFFSET(k) - KEY_START(insert); 1686 sectors_found = KEY_OFFSET(k) - KEY_START(insert);
1687 } 1687 }
1688 1688
1689 if (bkey_cmp(insert, k) < 0 && 1689 if (bkey_cmp(insert, k) < 0 &&
1690 bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { 1690 bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
1691 /* 1691 /*
1692 * We overlapped in the middle of an existing key: that 1692 * We overlapped in the middle of an existing key: that
1693 * means we have to split the old key. But we have to do 1693 * means we have to split the old key. But we have to do
1694 * slightly different things depending on whether the 1694 * slightly different things depending on whether the
1695 * old key has been written out yet. 1695 * old key has been written out yet.
1696 */ 1696 */
1697 1697
1698 struct bkey *top; 1698 struct bkey *top;
1699 1699
1700 subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); 1700 subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
1701 1701
1702 if (bkey_written(b, k)) { 1702 if (bkey_written(b, k)) {
1703 /* 1703 /*
1704 * We insert a new key to cover the top of the 1704 * We insert a new key to cover the top of the
1705 * old key, and the old key is modified in place 1705 * old key, and the old key is modified in place
1706 * to represent the bottom split. 1706 * to represent the bottom split.
1707 * 1707 *
1708 * It's completely arbitrary whether the new key 1708 * It's completely arbitrary whether the new key
1709 * is the top or the bottom, but it has to match 1709 * is the top or the bottom, but it has to match
1710 * up with what btree_sort_fixup() does - it 1710 * up with what btree_sort_fixup() does - it
1711 * doesn't check for this kind of overlap, it 1711 * doesn't check for this kind of overlap, it
1712 * depends on us inserting a new key for the top 1712 * depends on us inserting a new key for the top
1713 * here. 1713 * here.
1714 */ 1714 */
1715 top = bch_bset_search(b, &b->sets[b->nsets], 1715 top = bch_bset_search(b, &b->sets[b->nsets],
1716 insert); 1716 insert);
1717 shift_keys(b, top, k); 1717 shift_keys(b, top, k);
1718 } else { 1718 } else {
1719 BKEY_PADDED(key) temp; 1719 BKEY_PADDED(key) temp;
1720 bkey_copy(&temp.key, k); 1720 bkey_copy(&temp.key, k);
1721 shift_keys(b, k, &temp.key); 1721 shift_keys(b, k, &temp.key);
1722 top = bkey_next(k); 1722 top = bkey_next(k);
1723 } 1723 }
1724 1724
1725 bch_cut_front(insert, top); 1725 bch_cut_front(insert, top);
1726 bch_cut_back(&START_KEY(insert), k); 1726 bch_cut_back(&START_KEY(insert), k);
1727 bch_bset_fix_invalidated_key(b, k); 1727 bch_bset_fix_invalidated_key(b, k);
1728 return false; 1728 return false;
1729 } 1729 }
1730 1730
1731 if (bkey_cmp(insert, k) < 0) { 1731 if (bkey_cmp(insert, k) < 0) {
1732 bch_cut_front(insert, k); 1732 bch_cut_front(insert, k);
1733 } else { 1733 } else {
1734 if (bkey_written(b, k) && 1734 if (bkey_written(b, k) &&
1735 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { 1735 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
1736 /* 1736 /*
1737 * Completely overwrote, so we don't have to 1737 * Completely overwrote, so we don't have to
1738 * invalidate the binary search tree 1738 * invalidate the binary search tree
1739 */ 1739 */
1740 bch_cut_front(k, k); 1740 bch_cut_front(k, k);
1741 } else { 1741 } else {
1742 __bch_cut_back(&START_KEY(insert), k); 1742 __bch_cut_back(&START_KEY(insert), k);
1743 bch_bset_fix_invalidated_key(b, k); 1743 bch_bset_fix_invalidated_key(b, k);
1744 } 1744 }
1745 } 1745 }
1746 1746
1747 subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); 1747 subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
1748 } 1748 }
1749 1749
1750 check_failed: 1750 check_failed:
1751 if (op->type == BTREE_REPLACE) { 1751 if (op->type == BTREE_REPLACE) {
1752 if (!sectors_found) { 1752 if (!sectors_found) {
1753 op->insert_collision = true; 1753 op->insert_collision = true;
1754 return true; 1754 return true;
1755 } else if (sectors_found < KEY_SIZE(insert)) { 1755 } else if (sectors_found < KEY_SIZE(insert)) {
1756 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - 1756 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
1757 (KEY_SIZE(insert) - sectors_found)); 1757 (KEY_SIZE(insert) - sectors_found));
1758 SET_KEY_SIZE(insert, sectors_found); 1758 SET_KEY_SIZE(insert, sectors_found);
1759 } 1759 }
1760 } 1760 }
1761 1761
1762 return false; 1762 return false;
1763 } 1763 }
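
Editor's note: the cut_front/cut_back calls above trim existing extents so nothing overlaps the key being inserted — an old extent overlapping only at its front keeps its tail, one overlapping only at its back keeps its head, and one straddling the insert is split in two. A toy of the trimming arithmetic on half-open [start, end) sector ranges; the struct and helpers below are invented for illustration and only mirror the idea.

#include <stdint.h>
#include <stdio.h>

/* Toy extents as half-open sector ranges [start, end).  cut_front()
 * drops everything below 'where', cut_back() drops everything at or
 * above it - the same trimming applied to existing keys around the one
 * being inserted. */
struct extent { uint64_t start, end; };

static void cut_front(struct extent *e, uint64_t where)
{
	if (e->start < where)
		e->start = where < e->end ? where : e->end;
}

static void cut_back(struct extent *e, uint64_t where)
{
	if (e->end > where)
		e->end = where > e->start ? where : e->start;
}

int main(void)
{
	struct extent old = { 100, 200 };
	struct extent insert = { 120, 160 };

	/* old straddles insert: keep [100,120) in place, add [160,200) */
	struct extent tail = old;

	cut_front(&tail, insert.end);	/* tail becomes [160, 200) */
	cut_back(&old, insert.start);	/* old becomes  [100, 120) */

	printf("bottom [%llu,%llu) top [%llu,%llu)\n",
	       (unsigned long long)old.start, (unsigned long long)old.end,
	       (unsigned long long)tail.start, (unsigned long long)tail.end);
	return 0;
}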
1764 1764
1765 static bool btree_insert_key(struct btree *b, struct btree_op *op, 1765 static bool btree_insert_key(struct btree *b, struct btree_op *op,
1766 struct bkey *k) 1766 struct bkey *k)
1767 { 1767 {
1768 struct bset *i = b->sets[b->nsets].data; 1768 struct bset *i = b->sets[b->nsets].data;
1769 struct bkey *m, *prev; 1769 struct bkey *m, *prev;
1770 unsigned status = BTREE_INSERT_STATUS_INSERT; 1770 unsigned status = BTREE_INSERT_STATUS_INSERT;
1771 1771
1772 BUG_ON(bkey_cmp(k, &b->key) > 0); 1772 BUG_ON(bkey_cmp(k, &b->key) > 0);
1773 BUG_ON(b->level && !KEY_PTRS(k)); 1773 BUG_ON(b->level && !KEY_PTRS(k));
1774 BUG_ON(!b->level && !KEY_OFFSET(k)); 1774 BUG_ON(!b->level && !KEY_OFFSET(k));
1775 1775
1776 if (!b->level) { 1776 if (!b->level) {
1777 struct btree_iter iter; 1777 struct btree_iter iter;
1778 struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0); 1778 struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0);
1779 1779
1780 /* 1780 /*
1781 * bset_search() returns the first key that is strictly greater 1781 * bset_search() returns the first key that is strictly greater
1782 * than the search key - but for back merging, we want to find 1782 * than the search key - but for back merging, we want to find
1783 * the first key that is greater than or equal to KEY_START(k) - 1783 * the first key that is greater than or equal to KEY_START(k) -
1784 * unless KEY_START(k) is 0. 1784 * unless KEY_START(k) is 0.
1785 */ 1785 */
1786 if (KEY_OFFSET(&search)) 1786 if (KEY_OFFSET(&search))
1787 SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1); 1787 SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1);
1788 1788
1789 prev = NULL; 1789 prev = NULL;
1790 m = bch_btree_iter_init(b, &iter, &search); 1790 m = bch_btree_iter_init(b, &iter, &search);
1791 1791
1792 if (fix_overlapping_extents(b, k, &iter, op)) 1792 if (fix_overlapping_extents(b, k, &iter, op))
1793 return false; 1793 return false;
1794 1794
1795 while (m != end(i) && 1795 while (m != end(i) &&
1796 bkey_cmp(k, &START_KEY(m)) > 0) 1796 bkey_cmp(k, &START_KEY(m)) > 0)
1797 prev = m, m = bkey_next(m); 1797 prev = m, m = bkey_next(m);
1798 1798
1799 if (key_merging_disabled(b->c)) 1799 if (key_merging_disabled(b->c))
1800 goto insert; 1800 goto insert;
1801 1801
1802 /* prev is in the tree, if we merge we're done */ 1802 /* prev is in the tree, if we merge we're done */
1803 status = BTREE_INSERT_STATUS_BACK_MERGE; 1803 status = BTREE_INSERT_STATUS_BACK_MERGE;
1804 if (prev && 1804 if (prev &&
1805 bch_bkey_try_merge(b, prev, k)) 1805 bch_bkey_try_merge(b, prev, k))
1806 goto merged; 1806 goto merged;
1807 1807
1808 status = BTREE_INSERT_STATUS_OVERWROTE; 1808 status = BTREE_INSERT_STATUS_OVERWROTE;
1809 if (m != end(i) && 1809 if (m != end(i) &&
1810 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) 1810 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
1811 goto copy; 1811 goto copy;
1812 1812
1813 status = BTREE_INSERT_STATUS_FRONT_MERGE; 1813 status = BTREE_INSERT_STATUS_FRONT_MERGE;
1814 if (m != end(i) && 1814 if (m != end(i) &&
1815 bch_bkey_try_merge(b, k, m)) 1815 bch_bkey_try_merge(b, k, m))
1816 goto copy; 1816 goto copy;
1817 } else 1817 } else
1818 m = bch_bset_search(b, &b->sets[b->nsets], k); 1818 m = bch_bset_search(b, &b->sets[b->nsets], k);
1819 1819
1820 insert: shift_keys(b, m, k); 1820 insert: shift_keys(b, m, k);
1821 copy: bkey_copy(m, k); 1821 copy: bkey_copy(m, k);
1822 merged: 1822 merged:
1823 if (KEY_DIRTY(k)) 1823 if (KEY_DIRTY(k))
1824 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), 1824 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1825 KEY_START(k), KEY_SIZE(k)); 1825 KEY_START(k), KEY_SIZE(k));
1826 1826
1827 bch_check_keys(b, "%u for %s", status, op_type(op)); 1827 bch_check_keys(b, "%u for %s", status, op_type(op));
1828 1828
1829 if (b->level && !KEY_OFFSET(k)) 1829 if (b->level && !KEY_OFFSET(k))
1830 btree_current_write(b)->prio_blocked++; 1830 btree_current_write(b)->prio_blocked++;
1831 1831
1832 trace_bcache_btree_insert_key(b, k, op->type, status); 1832 trace_bcache_btree_insert_key(b, k, op->type, status);
1833 1833
1834 return true; 1834 return true;
1835 } 1835 }
1836 1836
1837 static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) 1837 static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op)
1838 { 1838 {
1839 bool ret = false; 1839 bool ret = false;
1840 struct bkey *k; 1840 struct bkey *k;
1841 unsigned oldsize = bch_count_data(b); 1841 unsigned oldsize = bch_count_data(b);
1842 1842
1843 while ((k = bch_keylist_pop(&op->keys))) { 1843 while ((k = bch_keylist_pop(&op->keys))) {
1844 bkey_put(b->c, k, b->level); 1844 bkey_put(b->c, k, b->level);
1845 ret |= btree_insert_key(b, op, k); 1845 ret |= btree_insert_key(b, op, k);
1846 } 1846 }
1847 1847
1848 BUG_ON(bch_count_data(b) < oldsize); 1848 BUG_ON(bch_count_data(b) < oldsize);
1849 return ret; 1849 return ret;
1850 } 1850 }
1851 1851
1852 bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, 1852 bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
1853 struct bio *bio) 1853 struct bio *bio)
1854 { 1854 {
1855 bool ret = false; 1855 bool ret = false;
1856 uint64_t btree_ptr = b->key.ptr[0]; 1856 uint64_t btree_ptr = b->key.ptr[0];
1857 unsigned long seq = b->seq; 1857 unsigned long seq = b->seq;
1858 BKEY_PADDED(k) tmp; 1858 BKEY_PADDED(k) tmp;
1859 1859
1860 rw_unlock(false, b); 1860 rw_unlock(false, b);
1861 rw_lock(true, b, b->level); 1861 rw_lock(true, b, b->level);
1862 1862
1863 if (b->key.ptr[0] != btree_ptr || 1863 if (b->key.ptr[0] != btree_ptr ||
1864 b->seq != seq + 1 || 1864 b->seq != seq + 1 ||
1865 should_split(b)) 1865 should_split(b))
1866 goto out; 1866 goto out;
1867 1867
1868 op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio)); 1868 op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio));
1869 1869
1870 SET_KEY_PTRS(&op->replace, 1); 1870 SET_KEY_PTRS(&op->replace, 1);
1871 get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); 1871 get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t));
1872 1872
1873 SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV); 1873 SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV);
1874 1874
1875 bkey_copy(&tmp.k, &op->replace); 1875 bkey_copy(&tmp.k, &op->replace);
1876 1876
1877 BUG_ON(op->type != BTREE_INSERT); 1877 BUG_ON(op->type != BTREE_INSERT);
1878 BUG_ON(!btree_insert_key(b, op, &tmp.k)); 1878 BUG_ON(!btree_insert_key(b, op, &tmp.k));
1879 ret = true; 1879 ret = true;
1880 out: 1880 out:
1881 downgrade_write(&b->lock); 1881 downgrade_write(&b->lock);
1882 return ret; 1882 return ret;
1883 } 1883 }
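
Editor's note: bch_btree_insert_check_key() drops the read lock, retakes the lock for write, and only trusts the node if its first pointer is unchanged and the sequence number advanced by exactly its own write-lock bump; otherwise the node may have been split or freed in the window and the insert is skipped. A minimal single-threaded sketch of that revalidation; the struct and field names below are invented.

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

/* Sketch of "unlock, relock for write, revalidate": before dropping the
 * read lock we snapshot an identity (the node's first pointer) and a
 * sequence number that every write-lock acquisition bumps.  After the
 * write lock is taken, the node is only still "ours" if the identity
 * matches and the sequence advanced by exactly our own bump. */
struct toy_node {
	uint64_t	ptr0;	/* identity of the backing bucket */
	unsigned long	seq;	/* bumped on every write-lock acquire */
};

static bool still_valid(const struct toy_node *b,
			uint64_t snap_ptr0, unsigned long snap_seq)
{
	return b->ptr0 == snap_ptr0 && b->seq == snap_seq + 1;
}

int main(void)
{
	struct toy_node b = { .ptr0 = 0xabcd, .seq = 7 };

	uint64_t snap_ptr0 = b.ptr0;
	unsigned long snap_seq = b.seq;

	b.seq++;		/* our own write-lock acquisition */
	printf("clean relock: %d\n", still_valid(&b, snap_ptr0, snap_seq));

	b.seq++;		/* somebody else got in as well */
	printf("raced relock: %d\n", still_valid(&b, snap_ptr0, snap_seq));
	return 0;
}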
1884 1884
1885 static int btree_split(struct btree *b, struct btree_op *op) 1885 static int btree_split(struct btree *b, struct btree_op *op)
1886 { 1886 {
1887 bool split, root = b == b->c->root; 1887 bool split, root = b == b->c->root;
1888 struct btree *n1, *n2 = NULL, *n3 = NULL; 1888 struct btree *n1, *n2 = NULL, *n3 = NULL;
1889 uint64_t start_time = local_clock(); 1889 uint64_t start_time = local_clock();
1890 1890
1891 if (b->level) 1891 if (b->level)
1892 set_closure_blocking(&op->cl); 1892 set_closure_blocking(&op->cl);
1893 1893
1894 n1 = btree_node_alloc_replacement(b, &op->cl); 1894 n1 = btree_node_alloc_replacement(b, &op->cl);
1895 if (IS_ERR(n1)) 1895 if (IS_ERR(n1))
1896 goto err; 1896 goto err;
1897 1897
1898 split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; 1898 split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;
1899 1899
1900 if (split) { 1900 if (split) {
1901 unsigned keys = 0; 1901 unsigned keys = 0;
1902 1902
1903 trace_bcache_btree_node_split(b, n1->sets[0].data->keys); 1903 trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
1904 1904
1905 n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); 1905 n2 = bch_btree_node_alloc(b->c, b->level, &op->cl);
1906 if (IS_ERR(n2)) 1906 if (IS_ERR(n2))
1907 goto err_free1; 1907 goto err_free1;
1908 1908
1909 if (root) { 1909 if (root) {
1910 n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl); 1910 n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl);
1911 if (IS_ERR(n3)) 1911 if (IS_ERR(n3))
1912 goto err_free2; 1912 goto err_free2;
1913 } 1913 }
1914 1914
1915 bch_btree_insert_keys(n1, op); 1915 bch_btree_insert_keys(n1, op);
1916 1916
1917 /* Has to be a linear search because we don't have an auxiliary 1917 /* Has to be a linear search because we don't have an auxiliary
1918 * search tree yet 1918 * search tree yet
1919 */ 1919 */
1920 1920
1921 while (keys < (n1->sets[0].data->keys * 3) / 5) 1921 while (keys < (n1->sets[0].data->keys * 3) / 5)
1922 keys += bkey_u64s(node(n1->sets[0].data, keys)); 1922 keys += bkey_u64s(node(n1->sets[0].data, keys));
1923 1923
1924 bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); 1924 bkey_copy_key(&n1->key, node(n1->sets[0].data, keys));
1925 keys += bkey_u64s(node(n1->sets[0].data, keys)); 1925 keys += bkey_u64s(node(n1->sets[0].data, keys));
1926 1926
1927 n2->sets[0].data->keys = n1->sets[0].data->keys - keys; 1927 n2->sets[0].data->keys = n1->sets[0].data->keys - keys;
1928 n1->sets[0].data->keys = keys; 1928 n1->sets[0].data->keys = keys;
1929 1929
1930 memcpy(n2->sets[0].data->start, 1930 memcpy(n2->sets[0].data->start,
1931 end(n1->sets[0].data), 1931 end(n1->sets[0].data),
1932 n2->sets[0].data->keys * sizeof(uint64_t)); 1932 n2->sets[0].data->keys * sizeof(uint64_t));
1933 1933
1934 bkey_copy_key(&n2->key, &b->key); 1934 bkey_copy_key(&n2->key, &b->key);
1935 1935
1936 bch_keylist_add(&op->keys, &n2->key); 1936 bch_keylist_add(&op->keys, &n2->key);
1937 bch_btree_node_write(n2, &op->cl); 1937 bch_btree_node_write(n2, &op->cl);
1938 rw_unlock(true, n2); 1938 rw_unlock(true, n2);
1939 } else { 1939 } else {
1940 trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); 1940 trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
1941 1941
1942 bch_btree_insert_keys(n1, op); 1942 bch_btree_insert_keys(n1, op);
1943 } 1943 }
1944 1944
1945 bch_keylist_add(&op->keys, &n1->key); 1945 bch_keylist_add(&op->keys, &n1->key);
1946 bch_btree_node_write(n1, &op->cl); 1946 bch_btree_node_write(n1, &op->cl);
1947 1947
1948 if (n3) { 1948 if (n3) {
1949 bkey_copy_key(&n3->key, &MAX_KEY); 1949 bkey_copy_key(&n3->key, &MAX_KEY);
1950 bch_btree_insert_keys(n3, op); 1950 bch_btree_insert_keys(n3, op);
1951 bch_btree_node_write(n3, &op->cl); 1951 bch_btree_node_write(n3, &op->cl);
1952 1952
1953 closure_sync(&op->cl); 1953 closure_sync(&op->cl);
1954 bch_btree_set_root(n3); 1954 bch_btree_set_root(n3);
1955 rw_unlock(true, n3); 1955 rw_unlock(true, n3);
1956 } else if (root) { 1956 } else if (root) {
1957 op->keys.top = op->keys.bottom; 1957 op->keys.top = op->keys.bottom;
1958 closure_sync(&op->cl); 1958 closure_sync(&op->cl);
1959 bch_btree_set_root(n1); 1959 bch_btree_set_root(n1);
1960 } else { 1960 } else {
1961 unsigned i; 1961 unsigned i;
1962 1962
1963 bkey_copy(op->keys.top, &b->key); 1963 bkey_copy(op->keys.top, &b->key);
1964 bkey_copy_key(op->keys.top, &ZERO_KEY); 1964 bkey_copy_key(op->keys.top, &ZERO_KEY);
1965 1965
1966 for (i = 0; i < KEY_PTRS(&b->key); i++) { 1966 for (i = 0; i < KEY_PTRS(&b->key); i++) {
1967 uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1; 1967 uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1;
1968 1968
1969 SET_PTR_GEN(op->keys.top, i, g); 1969 SET_PTR_GEN(op->keys.top, i, g);
1970 } 1970 }
1971 1971
1972 bch_keylist_push(&op->keys); 1972 bch_keylist_push(&op->keys);
1973 closure_sync(&op->cl); 1973 closure_sync(&op->cl);
1974 atomic_inc(&b->c->prio_blocked); 1974 atomic_inc(&b->c->prio_blocked);
1975 } 1975 }
1976 1976
1977 rw_unlock(true, n1); 1977 rw_unlock(true, n1);
1978 btree_node_free(b, op); 1978 btree_node_free(b, op);
1979 1979
1980 bch_time_stats_update(&b->c->btree_split_time, start_time); 1980 bch_time_stats_update(&b->c->btree_split_time, start_time);
1981 1981
1982 return 0; 1982 return 0;
1983 err_free2: 1983 err_free2:
1984 __bkey_put(n2->c, &n2->key); 1984 __bkey_put(n2->c, &n2->key);
1985 btree_node_free(n2, op); 1985 btree_node_free(n2, op);
1986 rw_unlock(true, n2); 1986 rw_unlock(true, n2);
1987 err_free1: 1987 err_free1:
1988 __bkey_put(n1->c, &n1->key); 1988 __bkey_put(n1->c, &n1->key);
1989 btree_node_free(n1, op); 1989 btree_node_free(n1, op);
1990 rw_unlock(true, n1); 1990 rw_unlock(true, n1);
1991 err: 1991 err:
1992 if (n3 == ERR_PTR(-EAGAIN) || 1992 if (n3 == ERR_PTR(-EAGAIN) ||
1993 n2 == ERR_PTR(-EAGAIN) || 1993 n2 == ERR_PTR(-EAGAIN) ||
1994 n1 == ERR_PTR(-EAGAIN)) 1994 n1 == ERR_PTR(-EAGAIN))
1995 return -EAGAIN; 1995 return -EAGAIN;
1996 1996
1997 pr_warn("couldn't split"); 1997 pr_warn("couldn't split");
1998 return -ENOMEM; 1998 return -ENOMEM;
1999 } 1999 }
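
Editor's note: when a node really splits, btree_split() finds the split point by walking the flat key array until roughly 3/5 of the set (counted in u64 words) sits on the left; the walk is linear because keys are variable length, as the comment above notes. A toy of that walk over variable-length records, with made-up key sizes:

#include <stdio.h>

/* Keys in a bset are variable length, so finding "about 3/5 of the set"
 * means walking key by key and summing sizes in u64 words until the
 * running total crosses the threshold - there is no way to jump straight
 * to the split point. */
int main(void)
{
	unsigned key_words[] = { 3, 2, 5, 3, 2, 4, 3, 2 };	/* made-up sizes */
	unsigned n = sizeof(key_words) / sizeof(key_words[0]);
	unsigned total = 0, walked = 0, i;

	for (i = 0; i < n; i++)
		total += key_words[i];

	for (i = 0; i < n && walked < total * 3 / 5; i++)
		walked += key_words[i];

	printf("split after key %u: %u of %u words stay in the left node\n",
	       i, walked, total);
	return 0;
}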
2000 2000
2001 static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, 2001 static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
2002 struct keylist *stack_keys) 2002 struct keylist *stack_keys)
2003 { 2003 {
2004 if (b->level) { 2004 if (b->level) {
2005 int ret; 2005 int ret;
2006 struct bkey *insert = op->keys.bottom; 2006 struct bkey *insert = op->keys.bottom;
2007 struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert)); 2007 struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert));
2008 2008
2009 if (!k) { 2009 if (!k) {
2010 btree_bug(b, "no key to recurse on at level %i/%i", 2010 btree_bug(b, "no key to recurse on at level %i/%i",
2011 b->level, b->c->root->level); 2011 b->level, b->c->root->level);
2012 2012
2013 op->keys.top = op->keys.bottom; 2013 op->keys.top = op->keys.bottom;
2014 return -EIO; 2014 return -EIO;
2015 } 2015 }
2016 2016
2017 if (bkey_cmp(insert, k) > 0) { 2017 if (bkey_cmp(insert, k) > 0) {
2018 unsigned i; 2018 unsigned i;
2019 2019
2020 if (op->type == BTREE_REPLACE) { 2020 if (op->type == BTREE_REPLACE) {
2021 __bkey_put(b->c, insert); 2021 __bkey_put(b->c, insert);
2022 op->keys.top = op->keys.bottom; 2022 op->keys.top = op->keys.bottom;
2023 op->insert_collision = true; 2023 op->insert_collision = true;
2024 return 0; 2024 return 0;
2025 } 2025 }
2026 2026
2027 for (i = 0; i < KEY_PTRS(insert); i++) 2027 for (i = 0; i < KEY_PTRS(insert); i++)
2028 atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin); 2028 atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin);
2029 2029
2030 bkey_copy(stack_keys->top, insert); 2030 bkey_copy(stack_keys->top, insert);
2031 2031
2032 bch_cut_back(k, insert); 2032 bch_cut_back(k, insert);
2033 bch_cut_front(k, stack_keys->top); 2033 bch_cut_front(k, stack_keys->top);
2034 2034
2035 bch_keylist_push(stack_keys); 2035 bch_keylist_push(stack_keys);
2036 } 2036 }
2037 2037
2038 ret = btree(insert_recurse, k, b, op, stack_keys); 2038 ret = btree(insert_recurse, k, b, op, stack_keys);
2039 if (ret) 2039 if (ret)
2040 return ret; 2040 return ret;
2041 } 2041 }
2042 2042
2043 if (!bch_keylist_empty(&op->keys)) { 2043 if (!bch_keylist_empty(&op->keys)) {
2044 if (should_split(b)) { 2044 if (should_split(b)) {
2045 if (op->lock <= b->c->root->level) { 2045 if (op->lock <= b->c->root->level) {
2046 BUG_ON(b->level); 2046 BUG_ON(b->level);
2047 op->lock = b->c->root->level + 1; 2047 op->lock = b->c->root->level + 1;
2048 return -EINTR; 2048 return -EINTR;
2049 } 2049 }
2050 return btree_split(b, op); 2050 return btree_split(b, op);
2051 } 2051 }
2052 2052
2053 BUG_ON(write_block(b) != b->sets[b->nsets].data); 2053 BUG_ON(write_block(b) != b->sets[b->nsets].data);
2054 2054
2055 if (bch_btree_insert_keys(b, op)) { 2055 if (bch_btree_insert_keys(b, op)) {
2056 if (!b->level) 2056 if (!b->level)
2057 bch_btree_leaf_dirty(b, op); 2057 bch_btree_leaf_dirty(b, op);
2058 else 2058 else
2059 bch_btree_node_write(b, &op->cl); 2059 bch_btree_node_write(b, &op->cl);
2060 } 2060 }
2061 } 2061 }
2062 2062
2063 return 0; 2063 return 0;
2064 } 2064 }
2065 2065
2066 int bch_btree_insert(struct btree_op *op, struct cache_set *c) 2066 int bch_btree_insert(struct btree_op *op, struct cache_set *c)
2067 { 2067 {
2068 int ret = 0; 2068 int ret = 0;
2069 struct keylist stack_keys; 2069 struct keylist stack_keys;
2070 2070
2071 /* 2071 /*
2072 * Don't want to block with the btree locked unless we have to, 2072 * Don't want to block with the btree locked unless we have to,
2073 * otherwise we get deadlocks with try_harder and between split/gc 2073 * otherwise we get deadlocks with try_harder and between split/gc
2074 */ 2074 */
2075 clear_closure_blocking(&op->cl); 2075 clear_closure_blocking(&op->cl);
2076 2076
2077 BUG_ON(bch_keylist_empty(&op->keys)); 2077 BUG_ON(bch_keylist_empty(&op->keys));
2078 bch_keylist_copy(&stack_keys, &op->keys); 2078 bch_keylist_copy(&stack_keys, &op->keys);
2079 bch_keylist_init(&op->keys); 2079 bch_keylist_init(&op->keys);
2080 2080
2081 while (!bch_keylist_empty(&stack_keys) || 2081 while (!bch_keylist_empty(&stack_keys) ||
2082 !bch_keylist_empty(&op->keys)) { 2082 !bch_keylist_empty(&op->keys)) {
2083 if (bch_keylist_empty(&op->keys)) { 2083 if (bch_keylist_empty(&op->keys)) {
2084 bch_keylist_add(&op->keys, 2084 bch_keylist_add(&op->keys,
2085 bch_keylist_pop(&stack_keys)); 2085 bch_keylist_pop(&stack_keys));
2086 op->lock = 0; 2086 op->lock = 0;
2087 } 2087 }
2088 2088
2089 ret = btree_root(insert_recurse, c, op, &stack_keys); 2089 ret = btree_root(insert_recurse, c, op, &stack_keys);
2090 2090
2091 if (ret == -EAGAIN) { 2091 if (ret == -EAGAIN) {
2092 ret = 0; 2092 ret = 0;
2093 closure_sync(&op->cl); 2093 closure_sync(&op->cl);
2094 } else if (ret) { 2094 } else if (ret) {
2095 struct bkey *k; 2095 struct bkey *k;
2096 2096
2097 pr_err("error %i trying to insert key for %s", 2097 pr_err("error %i trying to insert key for %s",
2098 ret, op_type(op)); 2098 ret, op_type(op));
2099 2099
2100 while ((k = bch_keylist_pop(&stack_keys) ?: 2100 while ((k = bch_keylist_pop(&stack_keys) ?:
2101 bch_keylist_pop(&op->keys))) 2101 bch_keylist_pop(&op->keys)))
2102 bkey_put(c, k, 0); 2102 bkey_put(c, k, 0);
2103 } 2103 }
2104 } 2104 }
2105 2105
2106 bch_keylist_free(&stack_keys); 2106 bch_keylist_free(&stack_keys);
2107 2107
2108 if (op->journal) 2108 if (op->journal)
2109 atomic_dec_bug(op->journal); 2109 atomic_dec_bug(op->journal);
2110 op->journal = NULL; 2110 op->journal = NULL;
2111 return ret; 2111 return ret;
2112 } 2112 }
2113 2113
2114 void bch_btree_set_root(struct btree *b) 2114 void bch_btree_set_root(struct btree *b)
2115 { 2115 {
2116 unsigned i; 2116 unsigned i;
2117 struct closure cl; 2117 struct closure cl;
2118 2118
2119 closure_init_stack(&cl); 2119 closure_init_stack(&cl);
2120 2120
2121 trace_bcache_btree_set_root(b); 2121 trace_bcache_btree_set_root(b);
2122 2122
2123 BUG_ON(!b->written); 2123 BUG_ON(!b->written);
2124 2124
2125 for (i = 0; i < KEY_PTRS(&b->key); i++) 2125 for (i = 0; i < KEY_PTRS(&b->key); i++)
2126 BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO); 2126 BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);
2127 2127
2128 mutex_lock(&b->c->bucket_lock); 2128 mutex_lock(&b->c->bucket_lock);
2129 list_del_init(&b->list); 2129 list_del_init(&b->list);
2130 mutex_unlock(&b->c->bucket_lock); 2130 mutex_unlock(&b->c->bucket_lock);
2131 2131
2132 b->c->root = b; 2132 b->c->root = b;
2133 __bkey_put(b->c, &b->key); 2133 __bkey_put(b->c, &b->key);
2134 2134
2135 bch_journal_meta(b->c, &cl); 2135 bch_journal_meta(b->c, &cl);
2136 closure_sync(&cl); 2136 closure_sync(&cl);
2137 } 2137 }
2138 2138
2139 /* Cache lookup */ 2139 /* Cache lookup */
2140 2140
2141 static int submit_partial_cache_miss(struct btree *b, struct btree_op *op, 2141 static int submit_partial_cache_miss(struct btree *b, struct btree_op *op,
2142 struct bkey *k) 2142 struct bkey *k)
2143 { 2143 {
2144 struct search *s = container_of(op, struct search, op); 2144 struct search *s = container_of(op, struct search, op);
2145 struct bio *bio = &s->bio.bio; 2145 struct bio *bio = &s->bio.bio;
2146 int ret = 0; 2146 int ret = 0;
2147 2147
2148 while (!ret && 2148 while (!ret &&
2149 !op->lookup_done) { 2149 !op->lookup_done) {
2150 unsigned sectors = INT_MAX; 2150 unsigned sectors = INT_MAX;
2151 2151
2152 if (KEY_INODE(k) == op->inode) { 2152 if (KEY_INODE(k) == op->inode) {
2153 if (KEY_START(k) <= bio->bi_sector) 2153 if (KEY_START(k) <= bio->bi_sector)
2154 break; 2154 break;
2155 2155
2156 sectors = min_t(uint64_t, sectors, 2156 sectors = min_t(uint64_t, sectors,
2157 KEY_START(k) - bio->bi_sector); 2157 KEY_START(k) - bio->bi_sector);
2158 } 2158 }
2159 2159
2160 ret = s->d->cache_miss(b, s, bio, sectors); 2160 ret = s->d->cache_miss(b, s, bio, sectors);
2161 } 2161 }
2162 2162
2163 return ret; 2163 return ret;
2164 } 2164 }
2165 2165
2166 /* 2166 /*
2167 * Read from a single key, handling the initial cache miss if the key starts in 2167 * Read from a single key, handling the initial cache miss if the key starts in
2168 * the middle of the bio 2168 * the middle of the bio
2169 */ 2169 */
2170 static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, 2170 static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
2171 struct bkey *k) 2171 struct bkey *k)
2172 { 2172 {
2173 struct search *s = container_of(op, struct search, op); 2173 struct search *s = container_of(op, struct search, op);
2174 struct bio *bio = &s->bio.bio; 2174 struct bio *bio = &s->bio.bio;
2175 unsigned ptr; 2175 unsigned ptr;
2176 struct bio *n; 2176 struct bio *n;
2177 2177
2178 int ret = submit_partial_cache_miss(b, op, k); 2178 int ret = submit_partial_cache_miss(b, op, k);
2179 if (ret || op->lookup_done) 2179 if (ret || op->lookup_done)
2180 return ret; 2180 return ret;
2181 2181
2182 /* XXX: figure out best pointer - for multiple cache devices */ 2182 /* XXX: figure out best pointer - for multiple cache devices */
2183 ptr = 0; 2183 ptr = 0;
2184 2184
2185 PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; 2185 PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
2186 2186
2187 while (!op->lookup_done && 2187 while (!op->lookup_done &&
2188 KEY_INODE(k) == op->inode && 2188 KEY_INODE(k) == op->inode &&
2189 bio->bi_sector < KEY_OFFSET(k)) { 2189 bio->bi_sector < KEY_OFFSET(k)) {
2190 struct bkey *bio_key; 2190 struct bkey *bio_key;
2191 sector_t sector = PTR_OFFSET(k, ptr) + 2191 sector_t sector = PTR_OFFSET(k, ptr) +
2192 (bio->bi_sector - KEY_START(k)); 2192 (bio->bi_sector - KEY_START(k));
2193 unsigned sectors = min_t(uint64_t, INT_MAX, 2193 unsigned sectors = min_t(uint64_t, INT_MAX,
2194 KEY_OFFSET(k) - bio->bi_sector); 2194 KEY_OFFSET(k) - bio->bi_sector);
2195 2195
2196 n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); 2196 n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
2197 if (!n)
2198 return -EAGAIN;
2199
2200 if (n == bio) 2197 if (n == bio)
2201 op->lookup_done = true; 2198 op->lookup_done = true;
2202 2199
2203 bio_key = &container_of(n, struct bbio, bio)->key; 2200 bio_key = &container_of(n, struct bbio, bio)->key;
2204 2201
2205 /* 2202 /*
2206 * The bucket we're reading from might be reused while our bio 2203 * The bucket we're reading from might be reused while our bio
2207 * is in flight, and we could then end up reading the wrong 2204 * is in flight, and we could then end up reading the wrong
2208 * data. 2205 * data.
2209 * 2206 *
2210 * We guard against this by checking (in cache_read_endio()) if 2207 * We guard against this by checking (in cache_read_endio()) if
2211 * the pointer is stale again; if so, we treat it as an error 2208 * the pointer is stale again; if so, we treat it as an error
2212 * and reread from the backing device (but we don't pass that 2209 * and reread from the backing device (but we don't pass that
2213 * error up anywhere). 2210 * error up anywhere).
2214 */ 2211 */
2215 2212
2216 bch_bkey_copy_single_ptr(bio_key, k, ptr); 2213 bch_bkey_copy_single_ptr(bio_key, k, ptr);
2217 SET_PTR_OFFSET(bio_key, 0, sector); 2214 SET_PTR_OFFSET(bio_key, 0, sector);
2218 2215
2219 n->bi_end_io = bch_cache_read_endio; 2216 n->bi_end_io = bch_cache_read_endio;
2220 n->bi_private = &s->cl; 2217 n->bi_private = &s->cl;
2221 2218
2222 __bch_submit_bbio(n, b->c); 2219 __bch_submit_bbio(n, b->c);
2223 } 2220 }
2224 2221
2225 return 0; 2222 return 0;
2226 } 2223 }
2227 2224
2228 int bch_btree_search_recurse(struct btree *b, struct btree_op *op) 2225 int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
2229 { 2226 {
2230 struct search *s = container_of(op, struct search, op); 2227 struct search *s = container_of(op, struct search, op);
2231 struct bio *bio = &s->bio.bio; 2228 struct bio *bio = &s->bio.bio;
2232 2229
2233 int ret = 0; 2230 int ret = 0;
2234 struct bkey *k; 2231 struct bkey *k;
2235 struct btree_iter iter; 2232 struct btree_iter iter;
2236 bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); 2233 bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0));
2237 2234
2238 do { 2235 do {
2239 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); 2236 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
2240 if (!k) { 2237 if (!k) {
2241 /* 2238 /*
2242 * b->key would be exactly what we want, except that 2239 * b->key would be exactly what we want, except that
2243 * pointers to btree nodes have nonzero size - we 2240 * pointers to btree nodes have nonzero size - we
2244 * wouldn't go far enough 2241 * wouldn't go far enough
2245 */ 2242 */
2246 2243
2247 ret = submit_partial_cache_miss(b, op, 2244 ret = submit_partial_cache_miss(b, op,
2248 &KEY(KEY_INODE(&b->key), 2245 &KEY(KEY_INODE(&b->key),
2249 KEY_OFFSET(&b->key), 0)); 2246 KEY_OFFSET(&b->key), 0));
2250 break; 2247 break;
2251 } 2248 }
2252 2249
2253 ret = b->level 2250 ret = b->level
2254 ? btree(search_recurse, k, b, op) 2251 ? btree(search_recurse, k, b, op)
2255 : submit_partial_cache_hit(b, op, k); 2252 : submit_partial_cache_hit(b, op, k);
2256 } while (!ret && 2253 } while (!ret &&
2257 !op->lookup_done); 2254 !op->lookup_done);
2258 2255
2259 return ret; 2256 return ret;
2260 } 2257 }
2261 2258
2262 /* Keybuf code */ 2259 /* Keybuf code */
2263 2260
2264 static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) 2261 static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
2265 { 2262 {
2266 /* Overlapping keys compare equal */ 2263 /* Overlapping keys compare equal */
2267 if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0) 2264 if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
2268 return -1; 2265 return -1;
2269 if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0) 2266 if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
2270 return 1; 2267 return 1;
2271 return 0; 2268 return 0;
2272 } 2269 }
2273 2270
2274 static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, 2271 static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
2275 struct keybuf_key *r) 2272 struct keybuf_key *r)
2276 { 2273 {
2277 return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); 2274 return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
2278 } 2275 }
2279 2276
2280 static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, 2277 static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
2281 struct keybuf *buf, struct bkey *end, 2278 struct keybuf *buf, struct bkey *end,
2282 keybuf_pred_fn *pred) 2279 keybuf_pred_fn *pred)
2283 { 2280 {
2284 struct btree_iter iter; 2281 struct btree_iter iter;
2285 bch_btree_iter_init(b, &iter, &buf->last_scanned); 2282 bch_btree_iter_init(b, &iter, &buf->last_scanned);
2286 2283
2287 while (!array_freelist_empty(&buf->freelist)) { 2284 while (!array_freelist_empty(&buf->freelist)) {
2288 struct bkey *k = bch_btree_iter_next_filter(&iter, b, 2285 struct bkey *k = bch_btree_iter_next_filter(&iter, b,
2289 bch_ptr_bad); 2286 bch_ptr_bad);
2290 2287
2291 if (!b->level) { 2288 if (!b->level) {
2292 if (!k) { 2289 if (!k) {
2293 buf->last_scanned = b->key; 2290 buf->last_scanned = b->key;
2294 break; 2291 break;
2295 } 2292 }
2296 2293
2297 buf->last_scanned = *k; 2294 buf->last_scanned = *k;
2298 if (bkey_cmp(&buf->last_scanned, end) >= 0) 2295 if (bkey_cmp(&buf->last_scanned, end) >= 0)
2299 break; 2296 break;
2300 2297
2301 if (pred(buf, k)) { 2298 if (pred(buf, k)) {
2302 struct keybuf_key *w; 2299 struct keybuf_key *w;
2303 2300
2304 spin_lock(&buf->lock); 2301 spin_lock(&buf->lock);
2305 2302
2306 w = array_alloc(&buf->freelist); 2303 w = array_alloc(&buf->freelist);
2307 2304
2308 w->private = NULL; 2305 w->private = NULL;
2309 bkey_copy(&w->key, k); 2306 bkey_copy(&w->key, k);
2310 2307
2311 if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) 2308 if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
2312 array_free(&buf->freelist, w); 2309 array_free(&buf->freelist, w);
2313 2310
2314 spin_unlock(&buf->lock); 2311 spin_unlock(&buf->lock);
2315 } 2312 }
2316 } else { 2313 } else {
2317 if (!k) 2314 if (!k)
2318 break; 2315 break;
2319 2316
2320 btree(refill_keybuf, k, b, op, buf, end, pred); 2317 btree(refill_keybuf, k, b, op, buf, end, pred);
2321 /* 2318 /*
2322 * Might get an error here, but can't really do anything 2319 * Might get an error here, but can't really do anything
2323 * and it'll get logged elsewhere. Just read what we 2320 * and it'll get logged elsewhere. Just read what we
2324 * can. 2321 * can.
2325 */ 2322 */
2326 2323
2327 if (bkey_cmp(&buf->last_scanned, end) >= 0) 2324 if (bkey_cmp(&buf->last_scanned, end) >= 0)
2328 break; 2325 break;
2329 2326
2330 cond_resched(); 2327 cond_resched();
2331 } 2328 }
2332 } 2329 }
2333 2330
2334 return 0; 2331 return 0;
2335 } 2332 }
2336 2333
2337 void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, 2334 void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
2338 struct bkey *end, keybuf_pred_fn *pred) 2335 struct bkey *end, keybuf_pred_fn *pred)
2339 { 2336 {
2340 struct bkey start = buf->last_scanned; 2337 struct bkey start = buf->last_scanned;
2341 struct btree_op op; 2338 struct btree_op op;
2342 bch_btree_op_init_stack(&op); 2339 bch_btree_op_init_stack(&op);
2343 2340
2344 cond_resched(); 2341 cond_resched();
2345 2342
2346 btree_root(refill_keybuf, c, &op, buf, end, pred); 2343 btree_root(refill_keybuf, c, &op, buf, end, pred);
2347 closure_sync(&op.cl); 2344 closure_sync(&op.cl);
2348 2345
2349 pr_debug("found %s keys from %llu:%llu to %llu:%llu", 2346 pr_debug("found %s keys from %llu:%llu to %llu:%llu",
2350 RB_EMPTY_ROOT(&buf->keys) ? "no" : 2347 RB_EMPTY_ROOT(&buf->keys) ? "no" :
2351 array_freelist_empty(&buf->freelist) ? "some" : "a few", 2348 array_freelist_empty(&buf->freelist) ? "some" : "a few",
2352 KEY_INODE(&start), KEY_OFFSET(&start), 2349 KEY_INODE(&start), KEY_OFFSET(&start),
2353 KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned)); 2350 KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned));
2354 2351
2355 spin_lock(&buf->lock); 2352 spin_lock(&buf->lock);
2356 2353
2357 if (!RB_EMPTY_ROOT(&buf->keys)) { 2354 if (!RB_EMPTY_ROOT(&buf->keys)) {
2358 struct keybuf_key *w; 2355 struct keybuf_key *w;
2359 w = RB_FIRST(&buf->keys, struct keybuf_key, node); 2356 w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2360 buf->start = START_KEY(&w->key); 2357 buf->start = START_KEY(&w->key);
2361 2358
2362 w = RB_LAST(&buf->keys, struct keybuf_key, node); 2359 w = RB_LAST(&buf->keys, struct keybuf_key, node);
2363 buf->end = w->key; 2360 buf->end = w->key;
2364 } else { 2361 } else {
2365 buf->start = MAX_KEY; 2362 buf->start = MAX_KEY;
2366 buf->end = MAX_KEY; 2363 buf->end = MAX_KEY;
2367 } 2364 }
2368 2365
2369 spin_unlock(&buf->lock); 2366 spin_unlock(&buf->lock);
2370 } 2367 }
2371 2368
2372 static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) 2369 static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2373 { 2370 {
2374 rb_erase(&w->node, &buf->keys); 2371 rb_erase(&w->node, &buf->keys);
2375 array_free(&buf->freelist, w); 2372 array_free(&buf->freelist, w);
2376 } 2373 }
2377 2374
2378 void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) 2375 void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2379 { 2376 {
2380 spin_lock(&buf->lock); 2377 spin_lock(&buf->lock);
2381 __bch_keybuf_del(buf, w); 2378 __bch_keybuf_del(buf, w);
2382 spin_unlock(&buf->lock); 2379 spin_unlock(&buf->lock);
2383 } 2380 }
2384 2381
2385 bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, 2382 bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
2386 struct bkey *end) 2383 struct bkey *end)
2387 { 2384 {
2388 bool ret = false; 2385 bool ret = false;
2389 struct keybuf_key *p, *w, s; 2386 struct keybuf_key *p, *w, s;
2390 s.key = *start; 2387 s.key = *start;
2391 2388
2392 if (bkey_cmp(end, &buf->start) <= 0 || 2389 if (bkey_cmp(end, &buf->start) <= 0 ||
2393 bkey_cmp(start, &buf->end) >= 0) 2390 bkey_cmp(start, &buf->end) >= 0)
2394 return false; 2391 return false;
2395 2392
2396 spin_lock(&buf->lock); 2393 spin_lock(&buf->lock);
2397 w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp); 2394 w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);
2398 2395
2399 while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) { 2396 while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
2400 p = w; 2397 p = w;
2401 w = RB_NEXT(w, node); 2398 w = RB_NEXT(w, node);
2402 2399
2403 if (p->private) 2400 if (p->private)
2404 ret = true; 2401 ret = true;
2405 else 2402 else
2406 __bch_keybuf_del(buf, p); 2403 __bch_keybuf_del(buf, p);
2407 } 2404 }
2408 2405
2409 spin_unlock(&buf->lock); 2406 spin_unlock(&buf->lock);
2410 return ret; 2407 return ret;
2411 } 2408 }
2412 2409
2413 struct keybuf_key *bch_keybuf_next(struct keybuf *buf) 2410 struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
2414 { 2411 {
2415 struct keybuf_key *w; 2412 struct keybuf_key *w;
2416 spin_lock(&buf->lock); 2413 spin_lock(&buf->lock);
2417 2414
2418 w = RB_FIRST(&buf->keys, struct keybuf_key, node); 2415 w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2419 2416
2420 while (w && w->private) 2417 while (w && w->private)
2421 w = RB_NEXT(w, node); 2418 w = RB_NEXT(w, node);
2422 2419
2423 if (w) 2420 if (w)
2424 w->private = ERR_PTR(-EINTR); 2421 w->private = ERR_PTR(-EINTR);
2425 2422
2426 spin_unlock(&buf->lock); 2423 spin_unlock(&buf->lock);
2427 return w; 2424 return w;
2428 } 2425 }
2429 2426
2430 struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, 2427 struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
2431 struct keybuf *buf, 2428 struct keybuf *buf,
2432 struct bkey *end, 2429 struct bkey *end,
2433 keybuf_pred_fn *pred) 2430 keybuf_pred_fn *pred)
2434 { 2431 {
2435 struct keybuf_key *ret; 2432 struct keybuf_key *ret;
2436 2433
2437 while (1) { 2434 while (1) {
2438 ret = bch_keybuf_next(buf); 2435 ret = bch_keybuf_next(buf);
2439 if (ret) 2436 if (ret)
2440 break; 2437 break;
2441 2438
2442 if (bkey_cmp(&buf->last_scanned, end) >= 0) { 2439 if (bkey_cmp(&buf->last_scanned, end) >= 0) {
2443 pr_debug("scan finished"); 2440 pr_debug("scan finished");
2444 break; 2441 break;
2445 } 2442 }
2446 2443
2447 bch_refill_keybuf(c, buf, end, pred); 2444 bch_refill_keybuf(c, buf, end, pred);
2448 } 2445 }
2449 2446
2450 return ret; 2447 return ret;
2451 } 2448 }
2452 2449
2453 void bch_keybuf_init(struct keybuf *buf) 2450 void bch_keybuf_init(struct keybuf *buf)
2454 { 2451 {
2455 buf->last_scanned = MAX_KEY; 2452 buf->last_scanned = MAX_KEY;
2456 buf->keys = RB_ROOT; 2453 buf->keys = RB_ROOT;
2457 2454
2458 spin_lock_init(&buf->lock); 2455 spin_lock_init(&buf->lock);
2459 array_allocator_init(&buf->freelist); 2456 array_allocator_init(&buf->freelist);
2460 } 2457 }
2461 2458
2462 void bch_btree_exit(void) 2459 void bch_btree_exit(void)
2463 { 2460 {
2464 if (btree_io_wq) 2461 if (btree_io_wq)
2465 destroy_workqueue(btree_io_wq); 2462 destroy_workqueue(btree_io_wq);
2466 if (bch_gc_wq) 2463 if (bch_gc_wq)
2467 destroy_workqueue(bch_gc_wq); 2464 destroy_workqueue(bch_gc_wq);
2468 } 2465 }
2469 2466
2470 int __init bch_btree_init(void) 2467 int __init bch_btree_init(void)
2471 { 2468 {
2472 if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) || 2469 if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) ||
2473 !(btree_io_wq = create_singlethread_workqueue("bch_btree_io"))) 2470 !(btree_io_wq = create_singlethread_workqueue("bch_btree_io")))
2474 return -ENOMEM; 2471 return -ENOMEM;
2475 2472
2476 return 0; 2473 return 0;
2477 } 2474 }
2478 2475
drivers/md/bcache/debug.c
1 /* 1 /*
2 * Assorted bcache debug code 2 * Assorted bcache debug code
3 * 3 *
4 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> 4 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
5 * Copyright 2012 Google, Inc. 5 * Copyright 2012 Google, Inc.
6 */ 6 */
7 7
8 #include "bcache.h" 8 #include "bcache.h"
9 #include "btree.h" 9 #include "btree.h"
10 #include "debug.h" 10 #include "debug.h"
11 #include "request.h" 11 #include "request.h"
12 12
13 #include <linux/console.h> 13 #include <linux/console.h>
14 #include <linux/debugfs.h> 14 #include <linux/debugfs.h>
15 #include <linux/module.h> 15 #include <linux/module.h>
16 #include <linux/random.h> 16 #include <linux/random.h>
17 #include <linux/seq_file.h> 17 #include <linux/seq_file.h>
18 18
19 static struct dentry *debug; 19 static struct dentry *debug;
20 20
21 const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) 21 const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
22 { 22 {
23 unsigned i; 23 unsigned i;
24 24
25 for (i = 0; i < KEY_PTRS(k); i++) 25 for (i = 0; i < KEY_PTRS(k); i++)
26 if (ptr_available(c, k, i)) { 26 if (ptr_available(c, k, i)) {
27 struct cache *ca = PTR_CACHE(c, k, i); 27 struct cache *ca = PTR_CACHE(c, k, i);
28 size_t bucket = PTR_BUCKET_NR(c, k, i); 28 size_t bucket = PTR_BUCKET_NR(c, k, i);
29 size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); 29 size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
30 30
31 if (KEY_SIZE(k) + r > c->sb.bucket_size) 31 if (KEY_SIZE(k) + r > c->sb.bucket_size)
32 return "bad, length too big"; 32 return "bad, length too big";
33 if (bucket < ca->sb.first_bucket) 33 if (bucket < ca->sb.first_bucket)
34 return "bad, short offset"; 34 return "bad, short offset";
35 if (bucket >= ca->sb.nbuckets) 35 if (bucket >= ca->sb.nbuckets)
36 return "bad, offset past end of device"; 36 return "bad, offset past end of device";
37 if (ptr_stale(c, k, i)) 37 if (ptr_stale(c, k, i))
38 return "stale"; 38 return "stale";
39 } 39 }
40 40
41 if (!bkey_cmp(k, &ZERO_KEY)) 41 if (!bkey_cmp(k, &ZERO_KEY))
42 return "bad, null key"; 42 return "bad, null key";
43 if (!KEY_PTRS(k)) 43 if (!KEY_PTRS(k))
44 return "bad, no pointers"; 44 return "bad, no pointers";
45 if (!KEY_SIZE(k)) 45 if (!KEY_SIZE(k))
46 return "zeroed key"; 46 return "zeroed key";
47 return ""; 47 return "";
48 } 48 }
49 49
50 int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) 50 int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
51 { 51 {
52 unsigned i = 0; 52 unsigned i = 0;
53 char *out = buf, *end = buf + size; 53 char *out = buf, *end = buf + size;
54 54
55 #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) 55 #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
56 56
57 p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k)); 57 p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k));
58 58
59 if (KEY_PTRS(k)) 59 if (KEY_PTRS(k))
60 while (1) { 60 while (1) {
61 p("%llu:%llu gen %llu", 61 p("%llu:%llu gen %llu",
62 PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i)); 62 PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i));
63 63
64 if (++i == KEY_PTRS(k)) 64 if (++i == KEY_PTRS(k))
65 break; 65 break;
66 66
67 p(", "); 67 p(", ");
68 } 68 }
69 69
70 p("]"); 70 p("]");
71 71
72 if (KEY_DIRTY(k)) 72 if (KEY_DIRTY(k))
73 p(" dirty"); 73 p(" dirty");
74 if (KEY_CSUM(k)) 74 if (KEY_CSUM(k))
75 p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); 75 p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
76 #undef p 76 #undef p
77 return out - buf; 77 return out - buf;
78 } 78 }
79 79
80 int bch_btree_to_text(char *buf, size_t size, const struct btree *b) 80 int bch_btree_to_text(char *buf, size_t size, const struct btree *b)
81 { 81 {
82 return scnprintf(buf, size, "%zu level %i/%i", 82 return scnprintf(buf, size, "%zu level %i/%i",
83 PTR_BUCKET_NR(b->c, &b->key, 0), 83 PTR_BUCKET_NR(b->c, &b->key, 0),
84 b->level, b->c->root ? b->c->root->level : -1); 84 b->level, b->c->root ? b->c->root->level : -1);
85 } 85 }
86 86
87 #if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) 87 #if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG)
88 88
89 static bool skipped_backwards(struct btree *b, struct bkey *k) 89 static bool skipped_backwards(struct btree *b, struct bkey *k)
90 { 90 {
91 return bkey_cmp(k, (!b->level) 91 return bkey_cmp(k, (!b->level)
92 ? &START_KEY(bkey_next(k)) 92 ? &START_KEY(bkey_next(k))
93 : bkey_next(k)) > 0; 93 : bkey_next(k)) > 0;
94 } 94 }
95 95
96 static void dump_bset(struct btree *b, struct bset *i) 96 static void dump_bset(struct btree *b, struct bset *i)
97 { 97 {
98 struct bkey *k; 98 struct bkey *k;
99 unsigned j; 99 unsigned j;
100 char buf[80]; 100 char buf[80];
101 101
102 for (k = i->start; k < end(i); k = bkey_next(k)) { 102 for (k = i->start; k < end(i); k = bkey_next(k)) {
103 bch_bkey_to_text(buf, sizeof(buf), k); 103 bch_bkey_to_text(buf, sizeof(buf), k);
104 printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), 104 printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
105 (uint64_t *) k - i->d, i->keys, buf); 105 (uint64_t *) k - i->d, i->keys, buf);
106 106
107 for (j = 0; j < KEY_PTRS(k); j++) { 107 for (j = 0; j < KEY_PTRS(k); j++) {
108 size_t n = PTR_BUCKET_NR(b->c, k, j); 108 size_t n = PTR_BUCKET_NR(b->c, k, j);
109 printk(" bucket %zu", n); 109 printk(" bucket %zu", n);
110 110
111 if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) 111 if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
112 printk(" prio %i", 112 printk(" prio %i",
113 PTR_BUCKET(b->c, k, j)->prio); 113 PTR_BUCKET(b->c, k, j)->prio);
114 } 114 }
115 115
116 printk(" %s\n", bch_ptr_status(b->c, k)); 116 printk(" %s\n", bch_ptr_status(b->c, k));
117 117
118 if (bkey_next(k) < end(i) && 118 if (bkey_next(k) < end(i) &&
119 skipped_backwards(b, k)) 119 skipped_backwards(b, k))
120 printk(KERN_ERR "Key skipped backwards\n"); 120 printk(KERN_ERR "Key skipped backwards\n");
121 } 121 }
122 } 122 }
123 123
124 #endif 124 #endif
125 125
126 #ifdef CONFIG_BCACHE_DEBUG 126 #ifdef CONFIG_BCACHE_DEBUG
127 127
128 void bch_btree_verify(struct btree *b, struct bset *new) 128 void bch_btree_verify(struct btree *b, struct bset *new)
129 { 129 {
130 struct btree *v = b->c->verify_data; 130 struct btree *v = b->c->verify_data;
131 struct closure cl; 131 struct closure cl;
132 closure_init_stack(&cl); 132 closure_init_stack(&cl);
133 133
134 if (!b->c->verify) 134 if (!b->c->verify)
135 return; 135 return;
136 136
137 closure_wait_event(&b->io.wait, &cl, 137 closure_wait_event(&b->io.wait, &cl,
138 atomic_read(&b->io.cl.remaining) == -1); 138 atomic_read(&b->io.cl.remaining) == -1);
139 139
140 mutex_lock(&b->c->verify_lock); 140 mutex_lock(&b->c->verify_lock);
141 141
142 bkey_copy(&v->key, &b->key); 142 bkey_copy(&v->key, &b->key);
143 v->written = 0; 143 v->written = 0;
144 v->level = b->level; 144 v->level = b->level;
145 145
146 bch_btree_node_read(v); 146 bch_btree_node_read(v);
147 closure_wait_event(&v->io.wait, &cl, 147 closure_wait_event(&v->io.wait, &cl,
148 atomic_read(&b->io.cl.remaining) == -1); 148 atomic_read(&b->io.cl.remaining) == -1);
149 149
150 if (new->keys != v->sets[0].data->keys || 150 if (new->keys != v->sets[0].data->keys ||
151 memcmp(new->start, 151 memcmp(new->start,
152 v->sets[0].data->start, 152 v->sets[0].data->start,
153 (void *) end(new) - (void *) new->start)) { 153 (void *) end(new) - (void *) new->start)) {
154 unsigned i, j; 154 unsigned i, j;
155 155
156 console_lock(); 156 console_lock();
157 157
158 printk(KERN_ERR "*** original memory node:\n"); 158 printk(KERN_ERR "*** original memory node:\n");
159 for (i = 0; i <= b->nsets; i++) 159 for (i = 0; i <= b->nsets; i++)
160 dump_bset(b, b->sets[i].data); 160 dump_bset(b, b->sets[i].data);
161 161
162 printk(KERN_ERR "*** sorted memory node:\n"); 162 printk(KERN_ERR "*** sorted memory node:\n");
163 dump_bset(b, new); 163 dump_bset(b, new);
164 164
165 printk(KERN_ERR "*** on disk node:\n"); 165 printk(KERN_ERR "*** on disk node:\n");
166 dump_bset(v, v->sets[0].data); 166 dump_bset(v, v->sets[0].data);
167 167
168 for (j = 0; j < new->keys; j++) 168 for (j = 0; j < new->keys; j++)
169 if (new->d[j] != v->sets[0].data->d[j]) 169 if (new->d[j] != v->sets[0].data->d[j])
170 break; 170 break;
171 171
172 console_unlock(); 172 console_unlock();
173 panic("verify failed at %u\n", j); 173 panic("verify failed at %u\n", j);
174 } 174 }
175 175
176 mutex_unlock(&b->c->verify_lock); 176 mutex_unlock(&b->c->verify_lock);
177 } 177 }
178 178
179 static void data_verify_endio(struct bio *bio, int error) 179 static void data_verify_endio(struct bio *bio, int error)
180 { 180 {
181 struct closure *cl = bio->bi_private; 181 struct closure *cl = bio->bi_private;
182 closure_put(cl); 182 closure_put(cl);
183 } 183 }
184 184
185 void bch_data_verify(struct search *s) 185 void bch_data_verify(struct search *s)
186 { 186 {
187 char name[BDEVNAME_SIZE]; 187 char name[BDEVNAME_SIZE];
188 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 188 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
189 struct closure *cl = &s->cl; 189 struct closure *cl = &s->cl;
190 struct bio *check; 190 struct bio *check;
191 struct bio_vec *bv; 191 struct bio_vec *bv;
192 int i; 192 int i;
193 193
194 if (!s->unaligned_bvec) 194 if (!s->unaligned_bvec)
195 bio_for_each_segment(bv, s->orig_bio, i) 195 bio_for_each_segment(bv, s->orig_bio, i)
196 bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; 196 bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
197 197
198 check = bio_clone(s->orig_bio, GFP_NOIO); 198 check = bio_clone(s->orig_bio, GFP_NOIO);
199 if (!check) 199 if (!check)
200 return; 200 return;
201 201
202 if (bch_bio_alloc_pages(check, GFP_NOIO)) 202 if (bio_alloc_pages(check, GFP_NOIO))
203 goto out_put; 203 goto out_put;
204 204
205 check->bi_rw = READ_SYNC; 205 check->bi_rw = READ_SYNC;
206 check->bi_private = cl; 206 check->bi_private = cl;
207 check->bi_end_io = data_verify_endio; 207 check->bi_end_io = data_verify_endio;
208 208
209 closure_bio_submit(check, cl, &dc->disk); 209 closure_bio_submit(check, cl, &dc->disk);
210 closure_sync(cl); 210 closure_sync(cl);
211 211
212 bio_for_each_segment(bv, s->orig_bio, i) { 212 bio_for_each_segment(bv, s->orig_bio, i) {
213 void *p1 = kmap(bv->bv_page); 213 void *p1 = kmap(bv->bv_page);
214 void *p2 = kmap(check->bi_io_vec[i].bv_page); 214 void *p2 = kmap(check->bi_io_vec[i].bv_page);
215 215
216 if (memcmp(p1 + bv->bv_offset, 216 if (memcmp(p1 + bv->bv_offset,
217 p2 + bv->bv_offset, 217 p2 + bv->bv_offset,
218 bv->bv_len)) 218 bv->bv_len))
219 printk(KERN_ERR 219 printk(KERN_ERR
220 "bcache (%s): verify failed at sector %llu\n", 220 "bcache (%s): verify failed at sector %llu\n",
221 bdevname(dc->bdev, name), 221 bdevname(dc->bdev, name),
222 (uint64_t) s->orig_bio->bi_sector); 222 (uint64_t) s->orig_bio->bi_sector);
223 223
224 kunmap(bv->bv_page); 224 kunmap(bv->bv_page);
225 kunmap(check->bi_io_vec[i].bv_page); 225 kunmap(check->bi_io_vec[i].bv_page);
226 } 226 }
227 227
228 __bio_for_each_segment(bv, check, i, 0) 228 __bio_for_each_segment(bv, check, i, 0)
229 __free_page(bv->bv_page); 229 __free_page(bv->bv_page);
230 out_put: 230 out_put:
231 bio_put(check); 231 bio_put(check);
232 } 232 }
233 233
234 #endif 234 #endif
235 235
236 #ifdef CONFIG_BCACHE_EDEBUG 236 #ifdef CONFIG_BCACHE_EDEBUG
237 237
238 unsigned bch_count_data(struct btree *b) 238 unsigned bch_count_data(struct btree *b)
239 { 239 {
240 unsigned ret = 0; 240 unsigned ret = 0;
241 struct btree_iter iter; 241 struct btree_iter iter;
242 struct bkey *k; 242 struct bkey *k;
243 243
244 if (!b->level) 244 if (!b->level)
245 for_each_key(b, k, &iter) 245 for_each_key(b, k, &iter)
246 ret += KEY_SIZE(k); 246 ret += KEY_SIZE(k);
247 return ret; 247 return ret;
248 } 248 }
249 249
250 static void vdump_bucket_and_panic(struct btree *b, const char *fmt, 250 static void vdump_bucket_and_panic(struct btree *b, const char *fmt,
251 va_list args) 251 va_list args)
252 { 252 {
253 unsigned i; 253 unsigned i;
254 char buf[80]; 254 char buf[80];
255 255
256 console_lock(); 256 console_lock();
257 257
258 for (i = 0; i <= b->nsets; i++) 258 for (i = 0; i <= b->nsets; i++)
259 dump_bset(b, b->sets[i].data); 259 dump_bset(b, b->sets[i].data);
260 260
261 vprintk(fmt, args); 261 vprintk(fmt, args);
262 262
263 console_unlock(); 263 console_unlock();
264 264
265 bch_btree_to_text(buf, sizeof(buf), b); 265 bch_btree_to_text(buf, sizeof(buf), b);
266 panic("at %s\n", buf); 266 panic("at %s\n", buf);
267 } 267 }
268 268
269 void bch_check_key_order_msg(struct btree *b, struct bset *i, 269 void bch_check_key_order_msg(struct btree *b, struct bset *i,
270 const char *fmt, ...) 270 const char *fmt, ...)
271 { 271 {
272 struct bkey *k; 272 struct bkey *k;
273 273
274 if (!i->keys) 274 if (!i->keys)
275 return; 275 return;
276 276
277 for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k)) 277 for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k))
278 if (skipped_backwards(b, k)) { 278 if (skipped_backwards(b, k)) {
279 va_list args; 279 va_list args;
280 va_start(args, fmt); 280 va_start(args, fmt);
281 281
282 vdump_bucket_and_panic(b, fmt, args); 282 vdump_bucket_and_panic(b, fmt, args);
283 va_end(args); 283 va_end(args);
284 } 284 }
285 } 285 }
286 286
287 void bch_check_keys(struct btree *b, const char *fmt, ...) 287 void bch_check_keys(struct btree *b, const char *fmt, ...)
288 { 288 {
289 va_list args; 289 va_list args;
290 struct bkey *k, *p = NULL; 290 struct bkey *k, *p = NULL;
291 struct btree_iter iter; 291 struct btree_iter iter;
292 292
293 if (b->level) 293 if (b->level)
294 return; 294 return;
295 295
296 for_each_key(b, k, &iter) { 296 for_each_key(b, k, &iter) {
297 if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) { 297 if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) {
298 printk(KERN_ERR "Keys out of order:\n"); 298 printk(KERN_ERR "Keys out of order:\n");
299 goto bug; 299 goto bug;
300 } 300 }
301 301
302 if (bch_ptr_invalid(b, k)) 302 if (bch_ptr_invalid(b, k))
303 continue; 303 continue;
304 304
305 if (p && bkey_cmp(p, &START_KEY(k)) > 0) { 305 if (p && bkey_cmp(p, &START_KEY(k)) > 0) {
306 printk(KERN_ERR "Overlapping keys:\n"); 306 printk(KERN_ERR "Overlapping keys:\n");
307 goto bug; 307 goto bug;
308 } 308 }
309 p = k; 309 p = k;
310 } 310 }
311 return; 311 return;
312 bug: 312 bug:
313 va_start(args, fmt); 313 va_start(args, fmt);
314 vdump_bucket_and_panic(b, fmt, args); 314 vdump_bucket_and_panic(b, fmt, args);
315 va_end(args); 315 va_end(args);
316 } 316 }
317 317
318 #endif 318 #endif
319 319
320 #ifdef CONFIG_DEBUG_FS 320 #ifdef CONFIG_DEBUG_FS
321 321
322 /* XXX: cache set refcounting */ 322 /* XXX: cache set refcounting */
323 323
324 struct dump_iterator { 324 struct dump_iterator {
325 char buf[PAGE_SIZE]; 325 char buf[PAGE_SIZE];
326 size_t bytes; 326 size_t bytes;
327 struct cache_set *c; 327 struct cache_set *c;
328 struct keybuf keys; 328 struct keybuf keys;
329 }; 329 };
330 330
331 static bool dump_pred(struct keybuf *buf, struct bkey *k) 331 static bool dump_pred(struct keybuf *buf, struct bkey *k)
332 { 332 {
333 return true; 333 return true;
334 } 334 }
335 335
336 static ssize_t bch_dump_read(struct file *file, char __user *buf, 336 static ssize_t bch_dump_read(struct file *file, char __user *buf,
337 size_t size, loff_t *ppos) 337 size_t size, loff_t *ppos)
338 { 338 {
339 struct dump_iterator *i = file->private_data; 339 struct dump_iterator *i = file->private_data;
340 ssize_t ret = 0; 340 ssize_t ret = 0;
341 char kbuf[80]; 341 char kbuf[80];
342 342
343 while (size) { 343 while (size) {
344 struct keybuf_key *w; 344 struct keybuf_key *w;
345 unsigned bytes = min(i->bytes, size); 345 unsigned bytes = min(i->bytes, size);
346 346
347 int err = copy_to_user(buf, i->buf, bytes); 347 int err = copy_to_user(buf, i->buf, bytes);
348 if (err) 348 if (err)
349 return err; 349 return err;
350 350
351 ret += bytes; 351 ret += bytes;
352 buf += bytes; 352 buf += bytes;
353 size -= bytes; 353 size -= bytes;
354 i->bytes -= bytes; 354 i->bytes -= bytes;
355 memmove(i->buf, i->buf + bytes, i->bytes); 355 memmove(i->buf, i->buf + bytes, i->bytes);
356 356
357 if (i->bytes) 357 if (i->bytes)
358 break; 358 break;
359 359
360 w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred); 360 w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred);
361 if (!w) 361 if (!w)
362 break; 362 break;
363 363
364 bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key); 364 bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key);
365 i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); 365 i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf);
366 bch_keybuf_del(&i->keys, w); 366 bch_keybuf_del(&i->keys, w);
367 } 367 }
368 368
369 return ret; 369 return ret;
370 } 370 }
371 371
372 static int bch_dump_open(struct inode *inode, struct file *file) 372 static int bch_dump_open(struct inode *inode, struct file *file)
373 { 373 {
374 struct cache_set *c = inode->i_private; 374 struct cache_set *c = inode->i_private;
375 struct dump_iterator *i; 375 struct dump_iterator *i;
376 376
377 i = kzalloc(sizeof(struct dump_iterator), GFP_KERNEL); 377 i = kzalloc(sizeof(struct dump_iterator), GFP_KERNEL);
378 if (!i) 378 if (!i)
379 return -ENOMEM; 379 return -ENOMEM;
380 380
381 file->private_data = i; 381 file->private_data = i;
382 i->c = c; 382 i->c = c;
383 bch_keybuf_init(&i->keys); 383 bch_keybuf_init(&i->keys);
384 i->keys.last_scanned = KEY(0, 0, 0); 384 i->keys.last_scanned = KEY(0, 0, 0);
385 385
386 return 0; 386 return 0;
387 } 387 }
388 388
389 static int bch_dump_release(struct inode *inode, struct file *file) 389 static int bch_dump_release(struct inode *inode, struct file *file)
390 { 390 {
391 kfree(file->private_data); 391 kfree(file->private_data);
392 return 0; 392 return 0;
393 } 393 }
394 394
395 static const struct file_operations cache_set_debug_ops = { 395 static const struct file_operations cache_set_debug_ops = {
396 .owner = THIS_MODULE, 396 .owner = THIS_MODULE,
397 .open = bch_dump_open, 397 .open = bch_dump_open,
398 .read = bch_dump_read, 398 .read = bch_dump_read,
399 .release = bch_dump_release 399 .release = bch_dump_release
400 }; 400 };
401 401
402 void bch_debug_init_cache_set(struct cache_set *c) 402 void bch_debug_init_cache_set(struct cache_set *c)
403 { 403 {
404 if (!IS_ERR_OR_NULL(debug)) { 404 if (!IS_ERR_OR_NULL(debug)) {
405 char name[50]; 405 char name[50];
406 snprintf(name, 50, "bcache-%pU", c->sb.set_uuid); 406 snprintf(name, 50, "bcache-%pU", c->sb.set_uuid);
407 407
408 c->debug = debugfs_create_file(name, 0400, debug, c, 408 c->debug = debugfs_create_file(name, 0400, debug, c,
409 &cache_set_debug_ops); 409 &cache_set_debug_ops);
410 } 410 }
411 } 411 }
412 412
413 #endif 413 #endif
414 414
415 void bch_debug_exit(void) 415 void bch_debug_exit(void)
416 { 416 {
417 if (!IS_ERR_OR_NULL(debug)) 417 if (!IS_ERR_OR_NULL(debug))
418 debugfs_remove_recursive(debug); 418 debugfs_remove_recursive(debug);
419 } 419 }
420 420
421 int __init bch_debug_init(struct kobject *kobj) 421 int __init bch_debug_init(struct kobject *kobj)
422 { 422 {
423 int ret = 0; 423 int ret = 0;
424 424
425 debug = debugfs_create_dir("bcache", NULL); 425 debug = debugfs_create_dir("bcache", NULL);
426 return ret; 426 return ret;
427 } 427 }
428 428
drivers/md/bcache/io.c
1 /* 1 /*
2 * Some low level IO code, and hacks for various block layer limitations 2 * Some low level IO code, and hacks for various block layer limitations
3 * 3 *
4 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> 4 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
5 * Copyright 2012 Google, Inc. 5 * Copyright 2012 Google, Inc.
6 */ 6 */
7 7
8 #include "bcache.h" 8 #include "bcache.h"
9 #include "bset.h" 9 #include "bset.h"
10 #include "debug.h" 10 #include "debug.h"
11 11
12 #include <linux/blkdev.h> 12 #include <linux/blkdev.h>
13 13
14 static void bch_bi_idx_hack_endio(struct bio *bio, int error) 14 static void bch_bi_idx_hack_endio(struct bio *bio, int error)
15 { 15 {
16 struct bio *p = bio->bi_private; 16 struct bio *p = bio->bi_private;
17 17
18 bio_endio(p, error); 18 bio_endio(p, error);
19 bio_put(bio); 19 bio_put(bio);
20 } 20 }
21 21
22 static void bch_generic_make_request_hack(struct bio *bio) 22 static void bch_generic_make_request_hack(struct bio *bio)
23 { 23 {
24 if (bio->bi_idx) { 24 if (bio->bi_idx) {
25 struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio)); 25 struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio));
26 26
27 memcpy(clone->bi_io_vec, 27 memcpy(clone->bi_io_vec,
28 bio_iovec(bio), 28 bio_iovec(bio),
29 bio_segments(bio) * sizeof(struct bio_vec)); 29 bio_segments(bio) * sizeof(struct bio_vec));
30 30
31 clone->bi_sector = bio->bi_sector; 31 clone->bi_sector = bio->bi_sector;
32 clone->bi_bdev = bio->bi_bdev; 32 clone->bi_bdev = bio->bi_bdev;
33 clone->bi_rw = bio->bi_rw; 33 clone->bi_rw = bio->bi_rw;
34 clone->bi_vcnt = bio_segments(bio); 34 clone->bi_vcnt = bio_segments(bio);
35 clone->bi_size = bio->bi_size; 35 clone->bi_size = bio->bi_size;
36 36
37 clone->bi_private = bio; 37 clone->bi_private = bio;
38 clone->bi_end_io = bch_bi_idx_hack_endio; 38 clone->bi_end_io = bch_bi_idx_hack_endio;
39 39
40 bio = clone; 40 bio = clone;
41 } 41 }
42 42
43 /* 43 /*
44 * Hack, since drivers that clone bios clone up to bi_max_vecs, but our 44 * Hack, since drivers that clone bios clone up to bi_max_vecs, but our
45 * bios might have had more than that (before we split them per device 45 * bios might have had more than that (before we split them per device
46 * limitations). 46 * limitations).
47 * 47 *
48 * To be taken out once immutable bvec stuff is in. 48 * To be taken out once immutable bvec stuff is in.
49 */ 49 */
50 bio->bi_max_vecs = bio->bi_vcnt; 50 bio->bi_max_vecs = bio->bi_vcnt;
51 51
52 generic_make_request(bio); 52 generic_make_request(bio);
53 } 53 }
54 54
55 /** 55 /**
56 * bch_bio_split - split a bio 56 * bch_bio_split - split a bio
57 * @bio: bio to split 57 * @bio: bio to split
58 * @sectors: number of sectors to split from the front of @bio 58 * @sectors: number of sectors to split from the front of @bio
59 * @gfp: gfp mask 59 * @gfp: gfp mask
60 * @bs: bio set to allocate from 60 * @bs: bio set to allocate from
61 * 61 *
62 * Allocates and returns a new bio which represents @sectors from the start of 62 * Allocates and returns a new bio which represents @sectors from the start of
63 * @bio, and updates @bio to represent the remaining sectors. 63 * @bio, and updates @bio to represent the remaining sectors.
64 * 64 *
65 * If bio_sectors(@bio) was less than or equal to @sectors, returns @bio 65 * If bio_sectors(@bio) was less than or equal to @sectors, returns @bio
66 * unchanged. 66 * unchanged.
67 * 67 *
68 * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a 68 * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a
69 * bvec boundary; it is the caller's responsibility to ensure that @bio is not 69 * bvec boundary; it is the caller's responsibility to ensure that @bio is not
70 * freed before the split. 70 * freed before the split.
71 *
72 * If bch_bio_split() is running under generic_make_request(), it's not safe to
73 * allocate more than one bio from the same bio set. Therefore, if it is running
74 * under generic_make_request() it masks out __GFP_WAIT when doing the
75 * allocation. The caller must check for failure if there's any possibility of
76 * it being called from under generic_make_request(); it is then the caller's
77 * responsibility to retry from a safe context (by e.g. punting to workqueue).
78 */ 71 */
79 struct bio *bch_bio_split(struct bio *bio, int sectors, 72 struct bio *bch_bio_split(struct bio *bio, int sectors,
80 gfp_t gfp, struct bio_set *bs) 73 gfp_t gfp, struct bio_set *bs)
81 { 74 {
82 unsigned idx = bio->bi_idx, vcnt = 0, nbytes = sectors << 9; 75 unsigned idx = bio->bi_idx, vcnt = 0, nbytes = sectors << 9;
83 struct bio_vec *bv; 76 struct bio_vec *bv;
84 struct bio *ret = NULL; 77 struct bio *ret = NULL;
85 78
86 BUG_ON(sectors <= 0); 79 BUG_ON(sectors <= 0);
87 80
88 /*
89 * If we're being called from underneath generic_make_request() and we
90 * already allocated any bios from this bio set, we risk deadlock if we
91 * use the mempool. So instead, we possibly fail and let the caller punt
92 * to workqueue or somesuch and retry in a safe context.
93 */
94 if (current->bio_list)
95 gfp &= ~__GFP_WAIT;
96
97 if (sectors >= bio_sectors(bio)) 81 if (sectors >= bio_sectors(bio))
98 return bio; 82 return bio;
99 83
100 if (bio->bi_rw & REQ_DISCARD) { 84 if (bio->bi_rw & REQ_DISCARD) {
101 ret = bio_alloc_bioset(gfp, 1, bs); 85 ret = bio_alloc_bioset(gfp, 1, bs);
102 if (!ret) 86 if (!ret)
103 return NULL; 87 return NULL;
104 idx = 0; 88 idx = 0;
105 goto out; 89 goto out;
106 } 90 }
107 91
108 bio_for_each_segment(bv, bio, idx) { 92 bio_for_each_segment(bv, bio, idx) {
109 vcnt = idx - bio->bi_idx; 93 vcnt = idx - bio->bi_idx;
110 94
111 if (!nbytes) { 95 if (!nbytes) {
112 ret = bio_alloc_bioset(gfp, vcnt, bs); 96 ret = bio_alloc_bioset(gfp, vcnt, bs);
113 if (!ret) 97 if (!ret)
114 return NULL; 98 return NULL;
115 99
116 memcpy(ret->bi_io_vec, bio_iovec(bio), 100 memcpy(ret->bi_io_vec, bio_iovec(bio),
117 sizeof(struct bio_vec) * vcnt); 101 sizeof(struct bio_vec) * vcnt);
118 102
119 break; 103 break;
120 } else if (nbytes < bv->bv_len) { 104 } else if (nbytes < bv->bv_len) {
121 ret = bio_alloc_bioset(gfp, ++vcnt, bs); 105 ret = bio_alloc_bioset(gfp, ++vcnt, bs);
122 if (!ret) 106 if (!ret)
123 return NULL; 107 return NULL;
124 108
125 memcpy(ret->bi_io_vec, bio_iovec(bio), 109 memcpy(ret->bi_io_vec, bio_iovec(bio),
126 sizeof(struct bio_vec) * vcnt); 110 sizeof(struct bio_vec) * vcnt);
127 111
128 ret->bi_io_vec[vcnt - 1].bv_len = nbytes; 112 ret->bi_io_vec[vcnt - 1].bv_len = nbytes;
129 bv->bv_offset += nbytes; 113 bv->bv_offset += nbytes;
130 bv->bv_len -= nbytes; 114 bv->bv_len -= nbytes;
131 break; 115 break;
132 } 116 }
133 117
134 nbytes -= bv->bv_len; 118 nbytes -= bv->bv_len;
135 } 119 }
136 out: 120 out:
137 ret->bi_bdev = bio->bi_bdev; 121 ret->bi_bdev = bio->bi_bdev;
138 ret->bi_sector = bio->bi_sector; 122 ret->bi_sector = bio->bi_sector;
139 ret->bi_size = sectors << 9; 123 ret->bi_size = sectors << 9;
140 ret->bi_rw = bio->bi_rw; 124 ret->bi_rw = bio->bi_rw;
141 ret->bi_vcnt = vcnt; 125 ret->bi_vcnt = vcnt;
142 ret->bi_max_vecs = vcnt; 126 ret->bi_max_vecs = vcnt;
143 127
144 bio->bi_sector += sectors; 128 bio->bi_sector += sectors;
145 bio->bi_size -= sectors << 9; 129 bio->bi_size -= sectors << 9;
146 bio->bi_idx = idx; 130 bio->bi_idx = idx;
147 131
148 if (bio_integrity(bio)) { 132 if (bio_integrity(bio)) {
149 if (bio_integrity_clone(ret, bio, gfp)) { 133 if (bio_integrity_clone(ret, bio, gfp)) {
150 bio_put(ret); 134 bio_put(ret);
151 return NULL; 135 return NULL;
152 } 136 }
153 137
154 bio_integrity_trim(ret, 0, bio_sectors(ret)); 138 bio_integrity_trim(ret, 0, bio_sectors(ret));
155 bio_integrity_trim(bio, bio_sectors(ret), bio_sectors(bio)); 139 bio_integrity_trim(bio, bio_sectors(ret), bio_sectors(bio));
156 } 140 }
157 141
158 return ret; 142 return ret;
159 } 143 }
160 144
161 static unsigned bch_bio_max_sectors(struct bio *bio) 145 static unsigned bch_bio_max_sectors(struct bio *bio)
162 { 146 {
163 unsigned ret = bio_sectors(bio); 147 unsigned ret = bio_sectors(bio);
164 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 148 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
165 unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES, 149 unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES,
166 queue_max_segments(q)); 150 queue_max_segments(q));
167 struct bio_vec *bv, *end = bio_iovec(bio) +
168 min_t(int, bio_segments(bio), max_segments);
169 151
170 if (bio->bi_rw & REQ_DISCARD) 152 if (bio->bi_rw & REQ_DISCARD)
171 return min(ret, q->limits.max_discard_sectors); 153 return min(ret, q->limits.max_discard_sectors);
172 154
173 if (bio_segments(bio) > max_segments || 155 if (bio_segments(bio) > max_segments ||
174 q->merge_bvec_fn) { 156 q->merge_bvec_fn) {
157 struct bio_vec *bv;
158 int i, seg = 0;
159
175 ret = 0; 160 ret = 0;
176 161
177 for (bv = bio_iovec(bio); bv < end; bv++) { 162 bio_for_each_segment(bv, bio, i) {
178 struct bvec_merge_data bvm = { 163 struct bvec_merge_data bvm = {
179 .bi_bdev = bio->bi_bdev, 164 .bi_bdev = bio->bi_bdev,
180 .bi_sector = bio->bi_sector, 165 .bi_sector = bio->bi_sector,
181 .bi_size = ret << 9, 166 .bi_size = ret << 9,
182 .bi_rw = bio->bi_rw, 167 .bi_rw = bio->bi_rw,
183 }; 168 };
184 169
170 if (seg == max_segments)
171 break;
172
185 if (q->merge_bvec_fn && 173 if (q->merge_bvec_fn &&
186 q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len) 174 q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len)
187 break; 175 break;
188 176
177 seg++;
189 ret += bv->bv_len >> 9; 178 ret += bv->bv_len >> 9;
190 } 179 }
191 } 180 }
192 181
193 ret = min(ret, queue_max_sectors(q)); 182 ret = min(ret, queue_max_sectors(q));
194 183
195 WARN_ON(!ret); 184 WARN_ON(!ret);
196 ret = max_t(int, ret, bio_iovec(bio)->bv_len >> 9); 185 ret = max_t(int, ret, bio_iovec(bio)->bv_len >> 9);
197 186
198 return ret; 187 return ret;
199 } 188 }
200 189
201 static void bch_bio_submit_split_done(struct closure *cl) 190 static void bch_bio_submit_split_done(struct closure *cl)
202 { 191 {
203 struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); 192 struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
204 193
205 s->bio->bi_end_io = s->bi_end_io; 194 s->bio->bi_end_io = s->bi_end_io;
206 s->bio->bi_private = s->bi_private; 195 s->bio->bi_private = s->bi_private;
207 bio_endio(s->bio, 0); 196 bio_endio(s->bio, 0);
208 197
209 closure_debug_destroy(&s->cl); 198 closure_debug_destroy(&s->cl);
210 mempool_free(s, s->p->bio_split_hook); 199 mempool_free(s, s->p->bio_split_hook);
211 } 200 }
212 201
213 static void bch_bio_submit_split_endio(struct bio *bio, int error) 202 static void bch_bio_submit_split_endio(struct bio *bio, int error)
214 { 203 {
215 struct closure *cl = bio->bi_private; 204 struct closure *cl = bio->bi_private;
216 struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); 205 struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
217 206
218 if (error) 207 if (error)
219 clear_bit(BIO_UPTODATE, &s->bio->bi_flags); 208 clear_bit(BIO_UPTODATE, &s->bio->bi_flags);
220 209
221 bio_put(bio); 210 bio_put(bio);
222 closure_put(cl); 211 closure_put(cl);
223 } 212 }
224 213
225 static void __bch_bio_submit_split(struct closure *cl)
226 {
227 struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
228 struct bio *bio = s->bio, *n;
229
230 do {
231 n = bch_bio_split(bio, bch_bio_max_sectors(bio),
232 GFP_NOIO, s->p->bio_split);
233 if (!n)
234 continue_at(cl, __bch_bio_submit_split, system_wq);
235
236 n->bi_end_io = bch_bio_submit_split_endio;
237 n->bi_private = cl;
238
239 closure_get(cl);
240 bch_generic_make_request_hack(n);
241 } while (n != bio);
242
243 continue_at(cl, bch_bio_submit_split_done, NULL);
244 }
245
246 void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) 214 void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
247 { 215 {
248 struct bio_split_hook *s; 216 struct bio_split_hook *s;
217 struct bio *n;
249 218
250 if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD)) 219 if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD))
251 goto submit; 220 goto submit;
252 221
253 if (bio_sectors(bio) <= bch_bio_max_sectors(bio)) 222 if (bio_sectors(bio) <= bch_bio_max_sectors(bio))
254 goto submit; 223 goto submit;
255 224
256 s = mempool_alloc(p->bio_split_hook, GFP_NOIO); 225 s = mempool_alloc(p->bio_split_hook, GFP_NOIO);
226 closure_init(&s->cl, NULL);
257 227
258 s->bio = bio; 228 s->bio = bio;
259 s->p = p; 229 s->p = p;
260 s->bi_end_io = bio->bi_end_io; 230 s->bi_end_io = bio->bi_end_io;
261 s->bi_private = bio->bi_private; 231 s->bi_private = bio->bi_private;
262 bio_get(bio); 232 bio_get(bio);
263 233
264 closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL); 234 do {
265 return; 235 n = bch_bio_split(bio, bch_bio_max_sectors(bio),
236 GFP_NOIO, s->p->bio_split);
237
238 n->bi_end_io = bch_bio_submit_split_endio;
239 n->bi_private = &s->cl;
240
241 closure_get(&s->cl);
242 bch_generic_make_request_hack(n);
243 } while (n != bio);
244
245 continue_at(&s->cl, bch_bio_submit_split_done, NULL);
266 submit: 246 submit:
267 bch_generic_make_request_hack(bio); 247 bch_generic_make_request_hack(bio);
268 } 248 }
269 249
270 /* Bios with headers */ 250 /* Bios with headers */
271 251
272 void bch_bbio_free(struct bio *bio, struct cache_set *c) 252 void bch_bbio_free(struct bio *bio, struct cache_set *c)
273 { 253 {
274 struct bbio *b = container_of(bio, struct bbio, bio); 254 struct bbio *b = container_of(bio, struct bbio, bio);
275 mempool_free(b, c->bio_meta); 255 mempool_free(b, c->bio_meta);
276 } 256 }
277 257
278 struct bio *bch_bbio_alloc(struct cache_set *c) 258 struct bio *bch_bbio_alloc(struct cache_set *c)
279 { 259 {
280 struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO); 260 struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO);
281 struct bio *bio = &b->bio; 261 struct bio *bio = &b->bio;
282 262
283 bio_init(bio); 263 bio_init(bio);
284 bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; 264 bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET;
285 bio->bi_max_vecs = bucket_pages(c); 265 bio->bi_max_vecs = bucket_pages(c);
286 bio->bi_io_vec = bio->bi_inline_vecs; 266 bio->bi_io_vec = bio->bi_inline_vecs;
287 267
288 return bio; 268 return bio;
289 } 269 }
290 270
291 void __bch_submit_bbio(struct bio *bio, struct cache_set *c) 271 void __bch_submit_bbio(struct bio *bio, struct cache_set *c)
292 { 272 {
293 struct bbio *b = container_of(bio, struct bbio, bio); 273 struct bbio *b = container_of(bio, struct bbio, bio);
294 274
295 bio->bi_sector = PTR_OFFSET(&b->key, 0); 275 bio->bi_sector = PTR_OFFSET(&b->key, 0);
296 bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev; 276 bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev;
297 277
298 b->submit_time_us = local_clock_us(); 278 b->submit_time_us = local_clock_us();
299 closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0)); 279 closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0));
300 } 280 }
301 281
302 void bch_submit_bbio(struct bio *bio, struct cache_set *c, 282 void bch_submit_bbio(struct bio *bio, struct cache_set *c,
303 struct bkey *k, unsigned ptr) 283 struct bkey *k, unsigned ptr)
304 { 284 {
305 struct bbio *b = container_of(bio, struct bbio, bio); 285 struct bbio *b = container_of(bio, struct bbio, bio);
306 bch_bkey_copy_single_ptr(&b->key, k, ptr); 286 bch_bkey_copy_single_ptr(&b->key, k, ptr);
307 __bch_submit_bbio(bio, c); 287 __bch_submit_bbio(bio, c);
308 } 288 }
309 289
310 /* IO errors */ 290 /* IO errors */
311 291
312 void bch_count_io_errors(struct cache *ca, int error, const char *m) 292 void bch_count_io_errors(struct cache *ca, int error, const char *m)
313 { 293 {
314 /* 294 /*
315 * The halflife of an error is: 295 * The halflife of an error is:
316 * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh 296 * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh
317 */ 297 */
318 298
319 if (ca->set->error_decay) { 299 if (ca->set->error_decay) {
320 unsigned count = atomic_inc_return(&ca->io_count); 300 unsigned count = atomic_inc_return(&ca->io_count);
321 301
322 while (count > ca->set->error_decay) { 302 while (count > ca->set->error_decay) {
323 unsigned errors; 303 unsigned errors;
324 unsigned old = count; 304 unsigned old = count;
325 unsigned new = count - ca->set->error_decay; 305 unsigned new = count - ca->set->error_decay;
326 306
327 /* 307 /*
328 * First we subtract refresh from count; each time we 308 * First we subtract refresh from count; each time we
329 * successfully do so, we rescale the errors once: 309 * successfully do so, we rescale the errors once:
330 */ 310 */
331 311
332 count = atomic_cmpxchg(&ca->io_count, old, new); 312 count = atomic_cmpxchg(&ca->io_count, old, new);
333 313
334 if (count == old) { 314 if (count == old) {
335 count = new; 315 count = new;
336 316
337 errors = atomic_read(&ca->io_errors); 317 errors = atomic_read(&ca->io_errors);
338 do { 318 do {
339 old = errors; 319 old = errors;
340 new = ((uint64_t) errors * 127) / 128; 320 new = ((uint64_t) errors * 127) / 128;
341 errors = atomic_cmpxchg(&ca->io_errors, 321 errors = atomic_cmpxchg(&ca->io_errors,
342 old, new); 322 old, new);
343 } while (old != errors); 323 } while (old != errors);
344 } 324 }
345 } 325 }
346 } 326 }
347 327
348 if (error) { 328 if (error) {
349 char buf[BDEVNAME_SIZE]; 329 char buf[BDEVNAME_SIZE];
350 unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT, 330 unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT,
351 &ca->io_errors); 331 &ca->io_errors);
352 errors >>= IO_ERROR_SHIFT; 332 errors >>= IO_ERROR_SHIFT;
353 333
354 if (errors < ca->set->error_limit) 334 if (errors < ca->set->error_limit)
355 pr_err("%s: IO error on %s, recovering", 335 pr_err("%s: IO error on %s, recovering",
356 bdevname(ca->bdev, buf), m); 336 bdevname(ca->bdev, buf), m);
357 else 337 else
358 bch_cache_set_error(ca->set, 338 bch_cache_set_error(ca->set,
359 "%s: too many IO errors %s", 339 "%s: too many IO errors %s",
360 bdevname(ca->bdev, buf), m); 340 bdevname(ca->bdev, buf), m);
361 } 341 }
362 } 342 }
363 343
364 void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, 344 void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
365 int error, const char *m) 345 int error, const char *m)
366 { 346 {
367 struct bbio *b = container_of(bio, struct bbio, bio); 347 struct bbio *b = container_of(bio, struct bbio, bio);
368 struct cache *ca = PTR_CACHE(c, &b->key, 0); 348 struct cache *ca = PTR_CACHE(c, &b->key, 0);
369 349
370 unsigned threshold = bio->bi_rw & REQ_WRITE 350 unsigned threshold = bio->bi_rw & REQ_WRITE
371 ? c->congested_write_threshold_us 351 ? c->congested_write_threshold_us
372 : c->congested_read_threshold_us; 352 : c->congested_read_threshold_us;
373 353
374 if (threshold) { 354 if (threshold) {
375 unsigned t = local_clock_us(); 355 unsigned t = local_clock_us();
376 356
377 int us = t - b->submit_time_us; 357 int us = t - b->submit_time_us;
378 int congested = atomic_read(&c->congested); 358 int congested = atomic_read(&c->congested);
379 359
380 if (us > (int) threshold) { 360 if (us > (int) threshold) {
381 int ms = us / 1024; 361 int ms = us / 1024;
382 c->congested_last_us = t; 362 c->congested_last_us = t;
383 363
drivers/md/bcache/movinggc.c
1 /* 1 /*
2 * Moving/copying garbage collector 2 * Moving/copying garbage collector
3 * 3 *
4 * Copyright 2012 Google, Inc. 4 * Copyright 2012 Google, Inc.
5 */ 5 */
6 6
7 #include "bcache.h" 7 #include "bcache.h"
8 #include "btree.h" 8 #include "btree.h"
9 #include "debug.h" 9 #include "debug.h"
10 #include "request.h" 10 #include "request.h"
11 11
12 #include <trace/events/bcache.h> 12 #include <trace/events/bcache.h>
13 13
14 struct moving_io { 14 struct moving_io {
15 struct keybuf_key *w; 15 struct keybuf_key *w;
16 struct search s; 16 struct search s;
17 struct bbio bio; 17 struct bbio bio;
18 }; 18 };
19 19
20 static bool moving_pred(struct keybuf *buf, struct bkey *k) 20 static bool moving_pred(struct keybuf *buf, struct bkey *k)
21 { 21 {
22 struct cache_set *c = container_of(buf, struct cache_set, 22 struct cache_set *c = container_of(buf, struct cache_set,
23 moving_gc_keys); 23 moving_gc_keys);
24 unsigned i; 24 unsigned i;
25 25
26 for (i = 0; i < KEY_PTRS(k); i++) { 26 for (i = 0; i < KEY_PTRS(k); i++) {
27 struct cache *ca = PTR_CACHE(c, k, i); 27 struct cache *ca = PTR_CACHE(c, k, i);
28 struct bucket *g = PTR_BUCKET(c, k, i); 28 struct bucket *g = PTR_BUCKET(c, k, i);
29 29
30 if (GC_SECTORS_USED(g) < ca->gc_move_threshold) 30 if (GC_SECTORS_USED(g) < ca->gc_move_threshold)
31 return true; 31 return true;
32 } 32 }
33 33
34 return false; 34 return false;
35 } 35 }
36 36
37 /* Moving GC - IO loop */ 37 /* Moving GC - IO loop */
38 38
39 static void moving_io_destructor(struct closure *cl) 39 static void moving_io_destructor(struct closure *cl)
40 { 40 {
41 struct moving_io *io = container_of(cl, struct moving_io, s.cl); 41 struct moving_io *io = container_of(cl, struct moving_io, s.cl);
42 kfree(io); 42 kfree(io);
43 } 43 }
44 44
45 static void write_moving_finish(struct closure *cl) 45 static void write_moving_finish(struct closure *cl)
46 { 46 {
47 struct moving_io *io = container_of(cl, struct moving_io, s.cl); 47 struct moving_io *io = container_of(cl, struct moving_io, s.cl);
48 struct bio *bio = &io->bio.bio; 48 struct bio *bio = &io->bio.bio;
49 struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt); 49 struct bio_vec *bv;
50 int i;
50 51
51 while (bv-- != bio->bi_io_vec) 52 bio_for_each_segment_all(bv, bio, i)
52 __free_page(bv->bv_page); 53 __free_page(bv->bv_page);
53 54
54 if (io->s.op.insert_collision) 55 if (io->s.op.insert_collision)
55 trace_bcache_gc_copy_collision(&io->w->key); 56 trace_bcache_gc_copy_collision(&io->w->key);
56 57
57 bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); 58 bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w);
58 59
59 atomic_dec_bug(&io->s.op.c->in_flight); 60 atomic_dec_bug(&io->s.op.c->in_flight);
60 closure_wake_up(&io->s.op.c->moving_gc_wait); 61 closure_wake_up(&io->s.op.c->moving_gc_wait);
61 62
62 closure_return_with_destructor(cl, moving_io_destructor); 63 closure_return_with_destructor(cl, moving_io_destructor);
63 } 64 }
64 65
65 static void read_moving_endio(struct bio *bio, int error) 66 static void read_moving_endio(struct bio *bio, int error)
66 { 67 {
67 struct moving_io *io = container_of(bio->bi_private, 68 struct moving_io *io = container_of(bio->bi_private,
68 struct moving_io, s.cl); 69 struct moving_io, s.cl);
69 70
70 if (error) 71 if (error)
71 io->s.error = error; 72 io->s.error = error;
72 73
73 bch_bbio_endio(io->s.op.c, bio, error, "reading data to move"); 74 bch_bbio_endio(io->s.op.c, bio, error, "reading data to move");
74 } 75 }
75 76
76 static void moving_init(struct moving_io *io) 77 static void moving_init(struct moving_io *io)
77 { 78 {
78 struct bio *bio = &io->bio.bio; 79 struct bio *bio = &io->bio.bio;
79 80
80 bio_init(bio); 81 bio_init(bio);
81 bio_get(bio); 82 bio_get(bio);
82 bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); 83 bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
83 84
84 bio->bi_size = KEY_SIZE(&io->w->key) << 9; 85 bio->bi_size = KEY_SIZE(&io->w->key) << 9;
85 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), 86 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key),
86 PAGE_SECTORS); 87 PAGE_SECTORS);
87 bio->bi_private = &io->s.cl; 88 bio->bi_private = &io->s.cl;
88 bio->bi_io_vec = bio->bi_inline_vecs; 89 bio->bi_io_vec = bio->bi_inline_vecs;
89 bch_bio_map(bio, NULL); 90 bch_bio_map(bio, NULL);
90 } 91 }
91 92
92 static void write_moving(struct closure *cl) 93 static void write_moving(struct closure *cl)
93 { 94 {
94 struct search *s = container_of(cl, struct search, cl); 95 struct search *s = container_of(cl, struct search, cl);
95 struct moving_io *io = container_of(s, struct moving_io, s); 96 struct moving_io *io = container_of(s, struct moving_io, s);
96 97
97 if (!s->error) { 98 if (!s->error) {
98 moving_init(io); 99 moving_init(io);
99 100
100 io->bio.bio.bi_sector = KEY_START(&io->w->key); 101 io->bio.bio.bi_sector = KEY_START(&io->w->key);
101 s->op.lock = -1; 102 s->op.lock = -1;
102 s->op.write_prio = 1; 103 s->op.write_prio = 1;
103 s->op.cache_bio = &io->bio.bio; 104 s->op.cache_bio = &io->bio.bio;
104 105
105 s->writeback = KEY_DIRTY(&io->w->key); 106 s->writeback = KEY_DIRTY(&io->w->key);
106 s->op.csum = KEY_CSUM(&io->w->key); 107 s->op.csum = KEY_CSUM(&io->w->key);
107 108
108 s->op.type = BTREE_REPLACE; 109 s->op.type = BTREE_REPLACE;
109 bkey_copy(&s->op.replace, &io->w->key); 110 bkey_copy(&s->op.replace, &io->w->key);
110 111
111 closure_init(&s->op.cl, cl); 112 closure_init(&s->op.cl, cl);
112 bch_insert_data(&s->op.cl); 113 bch_insert_data(&s->op.cl);
113 } 114 }
114 115
115 continue_at(cl, write_moving_finish, NULL); 116 continue_at(cl, write_moving_finish, NULL);
116 } 117 }
117 118
118 static void read_moving_submit(struct closure *cl) 119 static void read_moving_submit(struct closure *cl)
119 { 120 {
120 struct search *s = container_of(cl, struct search, cl); 121 struct search *s = container_of(cl, struct search, cl);
121 struct moving_io *io = container_of(s, struct moving_io, s); 122 struct moving_io *io = container_of(s, struct moving_io, s);
122 struct bio *bio = &io->bio.bio; 123 struct bio *bio = &io->bio.bio;
123 124
124 bch_submit_bbio(bio, s->op.c, &io->w->key, 0); 125 bch_submit_bbio(bio, s->op.c, &io->w->key, 0);
125 126
126 continue_at(cl, write_moving, bch_gc_wq); 127 continue_at(cl, write_moving, bch_gc_wq);
127 } 128 }
128 129
129 static void read_moving(struct closure *cl) 130 static void read_moving(struct closure *cl)
130 { 131 {
131 struct cache_set *c = container_of(cl, struct cache_set, moving_gc); 132 struct cache_set *c = container_of(cl, struct cache_set, moving_gc);
132 struct keybuf_key *w; 133 struct keybuf_key *w;
133 struct moving_io *io; 134 struct moving_io *io;
134 struct bio *bio; 135 struct bio *bio;
135 136
136 /* XXX: if we error, background writeback could stall indefinitely */ 137 /* XXX: if we error, background writeback could stall indefinitely */
137 138
138 while (!test_bit(CACHE_SET_STOPPING, &c->flags)) { 139 while (!test_bit(CACHE_SET_STOPPING, &c->flags)) {
139 w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, 140 w = bch_keybuf_next_rescan(c, &c->moving_gc_keys,
140 &MAX_KEY, moving_pred); 141 &MAX_KEY, moving_pred);
141 if (!w) 142 if (!w)
142 break; 143 break;
143 144
144 io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) 145 io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec)
145 * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 146 * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
146 GFP_KERNEL); 147 GFP_KERNEL);
147 if (!io) 148 if (!io)
148 goto err; 149 goto err;
149 150
150 w->private = io; 151 w->private = io;
151 io->w = w; 152 io->w = w;
152 io->s.op.inode = KEY_INODE(&w->key); 153 io->s.op.inode = KEY_INODE(&w->key);
153 io->s.op.c = c; 154 io->s.op.c = c;
154 155
155 moving_init(io); 156 moving_init(io);
156 bio = &io->bio.bio; 157 bio = &io->bio.bio;
157 158
158 bio->bi_rw = READ; 159 bio->bi_rw = READ;
159 bio->bi_end_io = read_moving_endio; 160 bio->bi_end_io = read_moving_endio;
160 161
161 if (bch_bio_alloc_pages(bio, GFP_KERNEL)) 162 if (bio_alloc_pages(bio, GFP_KERNEL))
162 goto err; 163 goto err;
163 164
164 trace_bcache_gc_copy(&w->key); 165 trace_bcache_gc_copy(&w->key);
165 166
166 closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); 167 closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl);
167 168
168 if (atomic_inc_return(&c->in_flight) >= 64) { 169 if (atomic_inc_return(&c->in_flight) >= 64) {
169 closure_wait_event(&c->moving_gc_wait, cl, 170 closure_wait_event(&c->moving_gc_wait, cl,
170 atomic_read(&c->in_flight) < 64); 171 atomic_read(&c->in_flight) < 64);
171 continue_at(cl, read_moving, bch_gc_wq); 172 continue_at(cl, read_moving, bch_gc_wq);
172 } 173 }
173 } 174 }
174 175
175 if (0) { 176 if (0) {
176 err: if (!IS_ERR_OR_NULL(w->private)) 177 err: if (!IS_ERR_OR_NULL(w->private))
177 kfree(w->private); 178 kfree(w->private);
178 179
179 bch_keybuf_del(&c->moving_gc_keys, w); 180 bch_keybuf_del(&c->moving_gc_keys, w);
180 } 181 }
181 182
182 closure_return(cl); 183 closure_return(cl);
183 } 184 }
184 185
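For context, bio_alloc_pages() is the standard helper that replaces bcache's local bch_bio_alloc_pages() in read_moving() above: it attaches one freshly allocated page to each segment of the bio and returns non-zero on failure. A hedged sketch of roughly equivalent logic (written from memory, not the verbatim block-layer source; the hypothetical name avoids clashing with the real helper):

	/* Roughly what bio_alloc_pages(bio, gfp) provides. */
	static int alloc_pages_for_bio(struct bio *bio, gfp_t gfp_mask)
	{
		struct bio_vec *bv;
		int i;

		bio_for_each_segment_all(bv, bio, i) {
			bv->bv_page = alloc_page(gfp_mask);
			if (!bv->bv_page) {
				/* Unwind whatever was already allocated. */
				while (--bv >= bio->bi_io_vec)
					__free_page(bv->bv_page);
				return -ENOMEM;
			}
		}

		return 0;
	}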
185 static bool bucket_cmp(struct bucket *l, struct bucket *r) 186 static bool bucket_cmp(struct bucket *l, struct bucket *r)
186 { 187 {
187 return GC_SECTORS_USED(l) < GC_SECTORS_USED(r); 188 return GC_SECTORS_USED(l) < GC_SECTORS_USED(r);
188 } 189 }
189 190
190 static unsigned bucket_heap_top(struct cache *ca) 191 static unsigned bucket_heap_top(struct cache *ca)
191 { 192 {
192 return GC_SECTORS_USED(heap_peek(&ca->heap)); 193 return GC_SECTORS_USED(heap_peek(&ca->heap));
193 } 194 }
194 195
195 void bch_moving_gc(struct closure *cl) 196 void bch_moving_gc(struct closure *cl)
196 { 197 {
197 struct cache_set *c = container_of(cl, struct cache_set, gc.cl); 198 struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
198 struct cache *ca; 199 struct cache *ca;
199 struct bucket *b; 200 struct bucket *b;
200 unsigned i; 201 unsigned i;
201 202
202 if (!c->copy_gc_enabled) 203 if (!c->copy_gc_enabled)
203 closure_return(cl); 204 closure_return(cl);
204 205
205 mutex_lock(&c->bucket_lock); 206 mutex_lock(&c->bucket_lock);
206 207
207 for_each_cache(ca, c, i) { 208 for_each_cache(ca, c, i) {
208 unsigned sectors_to_move = 0; 209 unsigned sectors_to_move = 0;
209 unsigned reserve_sectors = ca->sb.bucket_size * 210 unsigned reserve_sectors = ca->sb.bucket_size *
210 min(fifo_used(&ca->free), ca->free.size / 2); 211 min(fifo_used(&ca->free), ca->free.size / 2);
211 212
212 ca->heap.used = 0; 213 ca->heap.used = 0;
213 214
214 for_each_bucket(b, ca) { 215 for_each_bucket(b, ca) {
215 if (!GC_SECTORS_USED(b)) 216 if (!GC_SECTORS_USED(b))
216 continue; 217 continue;
217 218
218 if (!heap_full(&ca->heap)) { 219 if (!heap_full(&ca->heap)) {
219 sectors_to_move += GC_SECTORS_USED(b); 220 sectors_to_move += GC_SECTORS_USED(b);
220 heap_add(&ca->heap, b, bucket_cmp); 221 heap_add(&ca->heap, b, bucket_cmp);
221 } else if (bucket_cmp(b, heap_peek(&ca->heap))) { 222 } else if (bucket_cmp(b, heap_peek(&ca->heap))) {
222 sectors_to_move -= bucket_heap_top(ca); 223 sectors_to_move -= bucket_heap_top(ca);
223 sectors_to_move += GC_SECTORS_USED(b); 224 sectors_to_move += GC_SECTORS_USED(b);
224 225
225 ca->heap.data[0] = b; 226 ca->heap.data[0] = b;
226 heap_sift(&ca->heap, 0, bucket_cmp); 227 heap_sift(&ca->heap, 0, bucket_cmp);
227 } 228 }
228 } 229 }
229 230
230 while (sectors_to_move > reserve_sectors) { 231 while (sectors_to_move > reserve_sectors) {
231 heap_pop(&ca->heap, b, bucket_cmp); 232 heap_pop(&ca->heap, b, bucket_cmp);
232 sectors_to_move -= GC_SECTORS_USED(b); 233 sectors_to_move -= GC_SECTORS_USED(b);
233 } 234 }
234 235
235 ca->gc_move_threshold = bucket_heap_top(ca); 236 ca->gc_move_threshold = bucket_heap_top(ca);
236 237
237 pr_debug("threshold %u", ca->gc_move_threshold); 238 pr_debug("threshold %u", ca->gc_move_threshold);
238 } 239 }
239 240
240 mutex_unlock(&c->bucket_lock); 241 mutex_unlock(&c->bucket_lock);
241 242
242 c->moving_gc_keys.last_scanned = ZERO_KEY; 243 c->moving_gc_keys.last_scanned = ZERO_KEY;
243 244
244 closure_init(&c->moving_gc, cl); 245 closure_init(&c->moving_gc, cl);
245 read_moving(&c->moving_gc); 246 read_moving(&c->moving_gc);
246 247
247 closure_return(cl); 248 closure_return(cl);
248 } 249 }
249 250
250 void bch_moving_init_cache_set(struct cache_set *c) 251 void bch_moving_init_cache_set(struct cache_set *c)
251 { 252 {
252 bch_keybuf_init(&c->moving_gc_keys); 253 bch_keybuf_init(&c->moving_gc_keys);
253 } 254 }
254 255
drivers/md/bcache/request.c
1 /* 1 /*
2 * Main bcache entry point - handle a read or a write request and decide what to 2 * Main bcache entry point - handle a read or a write request and decide what to
3 * do with it; the make_request functions are called by the block layer. 3 * do with it; the make_request functions are called by the block layer.
4 * 4 *
5 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> 5 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6 * Copyright 2012 Google, Inc. 6 * Copyright 2012 Google, Inc.
7 */ 7 */
8 8
9 #include "bcache.h" 9 #include "bcache.h"
10 #include "btree.h" 10 #include "btree.h"
11 #include "debug.h" 11 #include "debug.h"
12 #include "request.h" 12 #include "request.h"
13 #include "writeback.h" 13 #include "writeback.h"
14 14
15 #include <linux/cgroup.h> 15 #include <linux/cgroup.h>
16 #include <linux/module.h> 16 #include <linux/module.h>
17 #include <linux/hash.h> 17 #include <linux/hash.h>
18 #include <linux/random.h> 18 #include <linux/random.h>
19 #include "blk-cgroup.h" 19 #include "blk-cgroup.h"
20 20
21 #include <trace/events/bcache.h> 21 #include <trace/events/bcache.h>
22 22
23 #define CUTOFF_CACHE_ADD 95 23 #define CUTOFF_CACHE_ADD 95
24 #define CUTOFF_CACHE_READA 90 24 #define CUTOFF_CACHE_READA 90
25 25
26 struct kmem_cache *bch_search_cache; 26 struct kmem_cache *bch_search_cache;
27 27
28 static void check_should_skip(struct cached_dev *, struct search *); 28 static void check_should_skip(struct cached_dev *, struct search *);
29 29
30 /* Cgroup interface */ 30 /* Cgroup interface */
31 31
32 #ifdef CONFIG_CGROUP_BCACHE 32 #ifdef CONFIG_CGROUP_BCACHE
33 static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 }; 33 static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 };
34 34
35 static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup) 35 static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup)
36 { 36 {
37 struct cgroup_subsys_state *css; 37 struct cgroup_subsys_state *css;
38 return cgroup && 38 return cgroup &&
39 (css = cgroup_subsys_state(cgroup, bcache_subsys_id)) 39 (css = cgroup_subsys_state(cgroup, bcache_subsys_id))
40 ? container_of(css, struct bch_cgroup, css) 40 ? container_of(css, struct bch_cgroup, css)
41 : &bcache_default_cgroup; 41 : &bcache_default_cgroup;
42 } 42 }
43 43
44 struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio) 44 struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio)
45 { 45 {
46 struct cgroup_subsys_state *css = bio->bi_css 46 struct cgroup_subsys_state *css = bio->bi_css
47 ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id) 47 ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id)
48 : task_subsys_state(current, bcache_subsys_id); 48 : task_subsys_state(current, bcache_subsys_id);
49 49
50 return css 50 return css
51 ? container_of(css, struct bch_cgroup, css) 51 ? container_of(css, struct bch_cgroup, css)
52 : &bcache_default_cgroup; 52 : &bcache_default_cgroup;
53 } 53 }
54 54
55 static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft, 55 static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft,
56 struct file *file, 56 struct file *file,
57 char __user *buf, size_t nbytes, loff_t *ppos) 57 char __user *buf, size_t nbytes, loff_t *ppos)
58 { 58 {
59 char tmp[1024]; 59 char tmp[1024];
60 int len = bch_snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes, 60 int len = bch_snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes,
61 cgroup_to_bcache(cgrp)->cache_mode + 1); 61 cgroup_to_bcache(cgrp)->cache_mode + 1);
62 62
63 if (len < 0) 63 if (len < 0)
64 return len; 64 return len;
65 65
66 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 66 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
67 } 67 }
68 68
69 static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft, 69 static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft,
70 const char *buf) 70 const char *buf)
71 { 71 {
72 int v = bch_read_string_list(buf, bch_cache_modes); 72 int v = bch_read_string_list(buf, bch_cache_modes);
73 if (v < 0) 73 if (v < 0)
74 return v; 74 return v;
75 75
76 cgroup_to_bcache(cgrp)->cache_mode = v - 1; 76 cgroup_to_bcache(cgrp)->cache_mode = v - 1;
77 return 0; 77 return 0;
78 } 78 }
79 79
80 static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft) 80 static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft)
81 { 81 {
82 return cgroup_to_bcache(cgrp)->verify; 82 return cgroup_to_bcache(cgrp)->verify;
83 } 83 }
84 84
85 static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val) 85 static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
86 { 86 {
87 cgroup_to_bcache(cgrp)->verify = val; 87 cgroup_to_bcache(cgrp)->verify = val;
88 return 0; 88 return 0;
89 } 89 }
90 90
91 static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft) 91 static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft)
92 { 92 {
93 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); 93 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
94 return atomic_read(&bcachecg->stats.cache_hits); 94 return atomic_read(&bcachecg->stats.cache_hits);
95 } 95 }
96 96
97 static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft) 97 static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft)
98 { 98 {
99 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); 99 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
100 return atomic_read(&bcachecg->stats.cache_misses); 100 return atomic_read(&bcachecg->stats.cache_misses);
101 } 101 }
102 102
103 static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp, 103 static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp,
104 struct cftype *cft) 104 struct cftype *cft)
105 { 105 {
106 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); 106 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
107 return atomic_read(&bcachecg->stats.cache_bypass_hits); 107 return atomic_read(&bcachecg->stats.cache_bypass_hits);
108 } 108 }
109 109
110 static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp, 110 static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp,
111 struct cftype *cft) 111 struct cftype *cft)
112 { 112 {
113 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); 113 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
114 return atomic_read(&bcachecg->stats.cache_bypass_misses); 114 return atomic_read(&bcachecg->stats.cache_bypass_misses);
115 } 115 }
116 116
117 static struct cftype bch_files[] = { 117 static struct cftype bch_files[] = {
118 { 118 {
119 .name = "cache_mode", 119 .name = "cache_mode",
120 .read = cache_mode_read, 120 .read = cache_mode_read,
121 .write_string = cache_mode_write, 121 .write_string = cache_mode_write,
122 }, 122 },
123 { 123 {
124 .name = "verify", 124 .name = "verify",
125 .read_u64 = bch_verify_read, 125 .read_u64 = bch_verify_read,
126 .write_u64 = bch_verify_write, 126 .write_u64 = bch_verify_write,
127 }, 127 },
128 { 128 {
129 .name = "cache_hits", 129 .name = "cache_hits",
130 .read_u64 = bch_cache_hits_read, 130 .read_u64 = bch_cache_hits_read,
131 }, 131 },
132 { 132 {
133 .name = "cache_misses", 133 .name = "cache_misses",
134 .read_u64 = bch_cache_misses_read, 134 .read_u64 = bch_cache_misses_read,
135 }, 135 },
136 { 136 {
137 .name = "cache_bypass_hits", 137 .name = "cache_bypass_hits",
138 .read_u64 = bch_cache_bypass_hits_read, 138 .read_u64 = bch_cache_bypass_hits_read,
139 }, 139 },
140 { 140 {
141 .name = "cache_bypass_misses", 141 .name = "cache_bypass_misses",
142 .read_u64 = bch_cache_bypass_misses_read, 142 .read_u64 = bch_cache_bypass_misses_read,
143 }, 143 },
144 { } /* terminate */ 144 { } /* terminate */
145 }; 145 };
146 146
147 static void init_bch_cgroup(struct bch_cgroup *cg) 147 static void init_bch_cgroup(struct bch_cgroup *cg)
148 { 148 {
149 cg->cache_mode = -1; 149 cg->cache_mode = -1;
150 } 150 }
151 151
152 static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup) 152 static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup)
153 { 153 {
154 struct bch_cgroup *cg; 154 struct bch_cgroup *cg;
155 155
156 cg = kzalloc(sizeof(*cg), GFP_KERNEL); 156 cg = kzalloc(sizeof(*cg), GFP_KERNEL);
157 if (!cg) 157 if (!cg)
158 return ERR_PTR(-ENOMEM); 158 return ERR_PTR(-ENOMEM);
159 init_bch_cgroup(cg); 159 init_bch_cgroup(cg);
160 return &cg->css; 160 return &cg->css;
161 } 161 }
162 162
163 static void bcachecg_destroy(struct cgroup *cgroup) 163 static void bcachecg_destroy(struct cgroup *cgroup)
164 { 164 {
165 struct bch_cgroup *cg = cgroup_to_bcache(cgroup); 165 struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
166 free_css_id(&bcache_subsys, &cg->css); 166 free_css_id(&bcache_subsys, &cg->css);
167 kfree(cg); 167 kfree(cg);
168 } 168 }
169 169
170 struct cgroup_subsys bcache_subsys = { 170 struct cgroup_subsys bcache_subsys = {
171 .create = bcachecg_create, 171 .create = bcachecg_create,
172 .destroy = bcachecg_destroy, 172 .destroy = bcachecg_destroy,
173 .subsys_id = bcache_subsys_id, 173 .subsys_id = bcache_subsys_id,
174 .name = "bcache", 174 .name = "bcache",
175 .module = THIS_MODULE, 175 .module = THIS_MODULE,
176 }; 176 };
177 EXPORT_SYMBOL_GPL(bcache_subsys); 177 EXPORT_SYMBOL_GPL(bcache_subsys);
178 #endif 178 #endif
179 179
180 static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) 180 static unsigned cache_mode(struct cached_dev *dc, struct bio *bio)
181 { 181 {
182 #ifdef CONFIG_CGROUP_BCACHE 182 #ifdef CONFIG_CGROUP_BCACHE
183 int r = bch_bio_to_cgroup(bio)->cache_mode; 183 int r = bch_bio_to_cgroup(bio)->cache_mode;
184 if (r >= 0) 184 if (r >= 0)
185 return r; 185 return r;
186 #endif 186 #endif
187 return BDEV_CACHE_MODE(&dc->sb); 187 return BDEV_CACHE_MODE(&dc->sb);
188 } 188 }
189 189
190 static bool verify(struct cached_dev *dc, struct bio *bio) 190 static bool verify(struct cached_dev *dc, struct bio *bio)
191 { 191 {
192 #ifdef CONFIG_CGROUP_BCACHE 192 #ifdef CONFIG_CGROUP_BCACHE
193 if (bch_bio_to_cgroup(bio)->verify) 193 if (bch_bio_to_cgroup(bio)->verify)
194 return true; 194 return true;
195 #endif 195 #endif
196 return dc->verify; 196 return dc->verify;
197 } 197 }
198 198
199 static void bio_csum(struct bio *bio, struct bkey *k) 199 static void bio_csum(struct bio *bio, struct bkey *k)
200 { 200 {
201 struct bio_vec *bv; 201 struct bio_vec *bv;
202 uint64_t csum = 0; 202 uint64_t csum = 0;
203 int i; 203 int i;
204 204
205 bio_for_each_segment(bv, bio, i) { 205 bio_for_each_segment(bv, bio, i) {
206 void *d = kmap(bv->bv_page) + bv->bv_offset; 206 void *d = kmap(bv->bv_page) + bv->bv_offset;
207 csum = bch_crc64_update(csum, d, bv->bv_len); 207 csum = bch_crc64_update(csum, d, bv->bv_len);
208 kunmap(bv->bv_page); 208 kunmap(bv->bv_page);
209 } 209 }
210 210
211 k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1); 211 k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1);
212 } 212 }
213 213
214 /* Insert data into cache */ 214 /* Insert data into cache */
215 215
216 static void bio_invalidate(struct closure *cl) 216 static void bio_invalidate(struct closure *cl)
217 { 217 {
218 struct btree_op *op = container_of(cl, struct btree_op, cl); 218 struct btree_op *op = container_of(cl, struct btree_op, cl);
219 struct bio *bio = op->cache_bio; 219 struct bio *bio = op->cache_bio;
220 220
221 pr_debug("invalidating %i sectors from %llu", 221 pr_debug("invalidating %i sectors from %llu",
222 bio_sectors(bio), (uint64_t) bio->bi_sector); 222 bio_sectors(bio), (uint64_t) bio->bi_sector);
223 223
224 while (bio_sectors(bio)) { 224 while (bio_sectors(bio)) {
225 unsigned len = min(bio_sectors(bio), 1U << 14); 225 unsigned len = min(bio_sectors(bio), 1U << 14);
226 226
227 if (bch_keylist_realloc(&op->keys, 0, op->c)) 227 if (bch_keylist_realloc(&op->keys, 0, op->c))
228 goto out; 228 goto out;
229 229
230 bio->bi_sector += len; 230 bio->bi_sector += len;
231 bio->bi_size -= len << 9; 231 bio->bi_size -= len << 9;
232 232
233 bch_keylist_add(&op->keys, 233 bch_keylist_add(&op->keys,
234 &KEY(op->inode, bio->bi_sector, len)); 234 &KEY(op->inode, bio->bi_sector, len));
235 } 235 }
236 236
237 op->insert_data_done = true; 237 op->insert_data_done = true;
238 bio_put(bio); 238 bio_put(bio);
239 out: 239 out:
240 continue_at(cl, bch_journal, bcache_wq); 240 continue_at(cl, bch_journal, bcache_wq);
241 } 241 }
242 242
243 struct open_bucket { 243 struct open_bucket {
244 struct list_head list; 244 struct list_head list;
245 struct task_struct *last; 245 struct task_struct *last;
246 unsigned sectors_free; 246 unsigned sectors_free;
247 BKEY_PADDED(key); 247 BKEY_PADDED(key);
248 }; 248 };
249 249
250 void bch_open_buckets_free(struct cache_set *c) 250 void bch_open_buckets_free(struct cache_set *c)
251 { 251 {
252 struct open_bucket *b; 252 struct open_bucket *b;
253 253
254 while (!list_empty(&c->data_buckets)) { 254 while (!list_empty(&c->data_buckets)) {
255 b = list_first_entry(&c->data_buckets, 255 b = list_first_entry(&c->data_buckets,
256 struct open_bucket, list); 256 struct open_bucket, list);
257 list_del(&b->list); 257 list_del(&b->list);
258 kfree(b); 258 kfree(b);
259 } 259 }
260 } 260 }
261 261
262 int bch_open_buckets_alloc(struct cache_set *c) 262 int bch_open_buckets_alloc(struct cache_set *c)
263 { 263 {
264 int i; 264 int i;
265 265
266 spin_lock_init(&c->data_bucket_lock); 266 spin_lock_init(&c->data_bucket_lock);
267 267
268 for (i = 0; i < 6; i++) { 268 for (i = 0; i < 6; i++) {
269 struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); 269 struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
270 if (!b) 270 if (!b)
271 return -ENOMEM; 271 return -ENOMEM;
272 272
273 list_add(&b->list, &c->data_buckets); 273 list_add(&b->list, &c->data_buckets);
274 } 274 }
275 275
276 return 0; 276 return 0;
277 } 277 }
278 278
279 /* 279 /*
280 * We keep multiple buckets open for writes, and try to segregate different 280 * We keep multiple buckets open for writes, and try to segregate different
281 * write streams for better cache utilization: first we look for a bucket where 281 * write streams for better cache utilization: first we look for a bucket where
282 * the last write to it was sequential with the current write, and failing that 282 * the last write to it was sequential with the current write, and failing that
283 * we look for a bucket that was last used by the same task. 283 * we look for a bucket that was last used by the same task.
284 * 284 *
285 * The idea is that if you've got multiple tasks pulling data into the cache at the 285 * The idea is that if you've got multiple tasks pulling data into the cache at the
286 * same time, you'll get better cache utilization if you try to segregate their 286 * same time, you'll get better cache utilization if you try to segregate their
287 * data and preserve locality. 287 * data and preserve locality.
288 * 288 *
289 * For example, say you're starting Firefox at the same time you're copying a 289 * For example, say you're starting Firefox at the same time you're copying a
290 * bunch of files. Firefox will likely end up being fairly hot and stay in the 290 * bunch of files. Firefox will likely end up being fairly hot and stay in the
291 * cache awhile, but the data you copied might not be; if you wrote all that 291 * cache awhile, but the data you copied might not be; if you wrote all that
292 * data to the same buckets it'd get invalidated at the same time. 292 * data to the same buckets it'd get invalidated at the same time.
293 * 293 *
294 * Both of those tasks will be doing fairly random IO so we can't rely on 294 * Both of those tasks will be doing fairly random IO so we can't rely on
295 * detecting sequential IO to segregate their data, but going off of the task 295 * detecting sequential IO to segregate their data, but going off of the task
296 * should be a sane heuristic. 296 * should be a sane heuristic.
297 */ 297 */
298 static struct open_bucket *pick_data_bucket(struct cache_set *c, 298 static struct open_bucket *pick_data_bucket(struct cache_set *c,
299 const struct bkey *search, 299 const struct bkey *search,
300 struct task_struct *task, 300 struct task_struct *task,
301 struct bkey *alloc) 301 struct bkey *alloc)
302 { 302 {
303 struct open_bucket *ret, *ret_task = NULL; 303 struct open_bucket *ret, *ret_task = NULL;
304 304
305 list_for_each_entry_reverse(ret, &c->data_buckets, list) 305 list_for_each_entry_reverse(ret, &c->data_buckets, list)
306 if (!bkey_cmp(&ret->key, search)) 306 if (!bkey_cmp(&ret->key, search))
307 goto found; 307 goto found;
308 else if (ret->last == task) 308 else if (ret->last == task)
309 ret_task = ret; 309 ret_task = ret;
310 310
311 ret = ret_task ?: list_first_entry(&c->data_buckets, 311 ret = ret_task ?: list_first_entry(&c->data_buckets,
312 struct open_bucket, list); 312 struct open_bucket, list);
313 found: 313 found:
314 if (!ret->sectors_free && KEY_PTRS(alloc)) { 314 if (!ret->sectors_free && KEY_PTRS(alloc)) {
315 ret->sectors_free = c->sb.bucket_size; 315 ret->sectors_free = c->sb.bucket_size;
316 bkey_copy(&ret->key, alloc); 316 bkey_copy(&ret->key, alloc);
317 bkey_init(alloc); 317 bkey_init(alloc);
318 } 318 }
319 319
320 if (!ret->sectors_free) 320 if (!ret->sectors_free)
321 ret = NULL; 321 ret = NULL;
322 322
323 return ret; 323 return ret;
324 } 324 }
325 325
326 /* 326 /*
327 * Allocates some space in the cache to write to, sets k to point to the newly 327 * Allocates some space in the cache to write to, sets k to point to the newly
328 * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the 328 * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the
329 * end of the newly allocated space). 329 * end of the newly allocated space).
330 * 330 *
331 * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many 331 * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many
332 * sectors were actually allocated. 332 * sectors were actually allocated.
333 * 333 *
334 * If s->writeback is true, will not fail. 334 * If s->writeback is true, will not fail.
335 */ 335 */
336 static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, 336 static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
337 struct search *s) 337 struct search *s)
338 { 338 {
339 struct cache_set *c = s->op.c; 339 struct cache_set *c = s->op.c;
340 struct open_bucket *b; 340 struct open_bucket *b;
341 BKEY_PADDED(key) alloc; 341 BKEY_PADDED(key) alloc;
342 struct closure cl, *w = NULL; 342 struct closure cl, *w = NULL;
343 unsigned i; 343 unsigned i;
344 344
345 if (s->writeback) { 345 if (s->writeback) {
346 closure_init_stack(&cl); 346 closure_init_stack(&cl);
347 w = &cl; 347 w = &cl;
348 } 348 }
349 349
350 /* 350 /*
351 * We might have to allocate a new bucket, which we can't do with a 351 * We might have to allocate a new bucket, which we can't do with a
352 * spinlock held. So if we have to allocate, we drop the lock, allocate 352 * spinlock held. So if we have to allocate, we drop the lock, allocate
353 * and then retry. KEY_PTRS() indicates whether alloc points to 353 * and then retry. KEY_PTRS() indicates whether alloc points to
354 * allocated bucket(s). 354 * allocated bucket(s).
355 */ 355 */
356 356
357 bkey_init(&alloc.key); 357 bkey_init(&alloc.key);
358 spin_lock(&c->data_bucket_lock); 358 spin_lock(&c->data_bucket_lock);
359 359
360 while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) { 360 while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) {
361 unsigned watermark = s->op.write_prio 361 unsigned watermark = s->op.write_prio
362 ? WATERMARK_MOVINGGC 362 ? WATERMARK_MOVINGGC
363 : WATERMARK_NONE; 363 : WATERMARK_NONE;
364 364
365 spin_unlock(&c->data_bucket_lock); 365 spin_unlock(&c->data_bucket_lock);
366 366
367 if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w)) 367 if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w))
368 return false; 368 return false;
369 369
370 spin_lock(&c->data_bucket_lock); 370 spin_lock(&c->data_bucket_lock);
371 } 371 }
372 372
373 /* 373 /*
374 * If we had to allocate, we might race and not need to allocate the 374 * If we had to allocate, we might race and not need to allocate the
375 * second time we call pick_data_bucket(). If we allocated a bucket but 375 * second time we call pick_data_bucket(). If we allocated a bucket but
376 * didn't use it, drop the refcount bch_bucket_alloc_set() took: 376 * didn't use it, drop the refcount bch_bucket_alloc_set() took:
377 */ 377 */
378 if (KEY_PTRS(&alloc.key)) 378 if (KEY_PTRS(&alloc.key))
379 __bkey_put(c, &alloc.key); 379 __bkey_put(c, &alloc.key);
380 380
381 for (i = 0; i < KEY_PTRS(&b->key); i++) 381 for (i = 0; i < KEY_PTRS(&b->key); i++)
382 EBUG_ON(ptr_stale(c, &b->key, i)); 382 EBUG_ON(ptr_stale(c, &b->key, i));
383 383
384 /* Set up the pointer to the space we're allocating: */ 384 /* Set up the pointer to the space we're allocating: */
385 385
386 for (i = 0; i < KEY_PTRS(&b->key); i++) 386 for (i = 0; i < KEY_PTRS(&b->key); i++)
387 k->ptr[i] = b->key.ptr[i]; 387 k->ptr[i] = b->key.ptr[i];
388 388
389 sectors = min(sectors, b->sectors_free); 389 sectors = min(sectors, b->sectors_free);
390 390
391 SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); 391 SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors);
392 SET_KEY_SIZE(k, sectors); 392 SET_KEY_SIZE(k, sectors);
393 SET_KEY_PTRS(k, KEY_PTRS(&b->key)); 393 SET_KEY_PTRS(k, KEY_PTRS(&b->key));
394 394
395 /* 395 /*
396 * Move b to the end of the lru, and keep track of what this bucket was 396 * Move b to the end of the lru, and keep track of what this bucket was
397 * last used for: 397 * last used for:
398 */ 398 */
399 list_move_tail(&b->list, &c->data_buckets); 399 list_move_tail(&b->list, &c->data_buckets);
400 bkey_copy_key(&b->key, k); 400 bkey_copy_key(&b->key, k);
401 b->last = s->task; 401 b->last = s->task;
402 402
403 b->sectors_free -= sectors; 403 b->sectors_free -= sectors;
404 404
405 for (i = 0; i < KEY_PTRS(&b->key); i++) { 405 for (i = 0; i < KEY_PTRS(&b->key); i++) {
406 SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); 406 SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors);
407 407
408 atomic_long_add(sectors, 408 atomic_long_add(sectors,
409 &PTR_CACHE(c, &b->key, i)->sectors_written); 409 &PTR_CACHE(c, &b->key, i)->sectors_written);
410 } 410 }
411 411
412 if (b->sectors_free < c->sb.block_size) 412 if (b->sectors_free < c->sb.block_size)
413 b->sectors_free = 0; 413 b->sectors_free = 0;
414 414
415 /* 415 /*
416 * k takes refcounts on the buckets it points to until it's inserted 416 * k takes refcounts on the buckets it points to until it's inserted
417 * into the btree, but if we're done with this bucket we just transfer 417 * into the btree, but if we're done with this bucket we just transfer
418 * get_data_bucket()'s refcount. 418 * get_data_bucket()'s refcount.
419 */ 419 */
420 if (b->sectors_free) 420 if (b->sectors_free)
421 for (i = 0; i < KEY_PTRS(&b->key); i++) 421 for (i = 0; i < KEY_PTRS(&b->key); i++)
422 atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); 422 atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin);
423 423
424 spin_unlock(&c->data_bucket_lock); 424 spin_unlock(&c->data_bucket_lock);
425 return true; 425 return true;
426 } 426 }
427 427
428 static void bch_insert_data_error(struct closure *cl) 428 static void bch_insert_data_error(struct closure *cl)
429 { 429 {
430 struct btree_op *op = container_of(cl, struct btree_op, cl); 430 struct btree_op *op = container_of(cl, struct btree_op, cl);
431 431
432 /* 432 /*
433 * Our data write just errored, which means we've got a bunch of keys to 433 * Our data write just errored, which means we've got a bunch of keys to
434 * insert that point to data that wasn't successfully written. 434 * insert that point to data that wasn't successfully written.
435 * 435 *
436 * We don't have to insert those keys but we still have to invalidate 436 * We don't have to insert those keys but we still have to invalidate
437 * that region of the cache - so, if we just strip off all the pointers 437 * that region of the cache - so, if we just strip off all the pointers
438 * from the keys we'll accomplish just that. 438 * from the keys we'll accomplish just that.
439 */ 439 */
440 440
441 struct bkey *src = op->keys.bottom, *dst = op->keys.bottom; 441 struct bkey *src = op->keys.bottom, *dst = op->keys.bottom;
442 442
443 while (src != op->keys.top) { 443 while (src != op->keys.top) {
444 struct bkey *n = bkey_next(src); 444 struct bkey *n = bkey_next(src);
445 445
446 SET_KEY_PTRS(src, 0); 446 SET_KEY_PTRS(src, 0);
447 bkey_copy(dst, src); 447 bkey_copy(dst, src);
448 448
449 dst = bkey_next(dst); 449 dst = bkey_next(dst);
450 src = n; 450 src = n;
451 } 451 }
452 452
453 op->keys.top = dst; 453 op->keys.top = dst;
454 454
455 bch_journal(cl); 455 bch_journal(cl);
456 } 456 }
457 457
458 static void bch_insert_data_endio(struct bio *bio, int error) 458 static void bch_insert_data_endio(struct bio *bio, int error)
459 { 459 {
460 struct closure *cl = bio->bi_private; 460 struct closure *cl = bio->bi_private;
461 struct btree_op *op = container_of(cl, struct btree_op, cl); 461 struct btree_op *op = container_of(cl, struct btree_op, cl);
462 struct search *s = container_of(op, struct search, op); 462 struct search *s = container_of(op, struct search, op);
463 463
464 if (error) { 464 if (error) {
465 /* TODO: We could try to recover from this. */ 465 /* TODO: We could try to recover from this. */
466 if (s->writeback) 466 if (s->writeback)
467 s->error = error; 467 s->error = error;
468 else if (s->write) 468 else if (s->write)
469 set_closure_fn(cl, bch_insert_data_error, bcache_wq); 469 set_closure_fn(cl, bch_insert_data_error, bcache_wq);
470 else 470 else
471 set_closure_fn(cl, NULL, NULL); 471 set_closure_fn(cl, NULL, NULL);
472 } 472 }
473 473
474 bch_bbio_endio(op->c, bio, error, "writing data to cache"); 474 bch_bbio_endio(op->c, bio, error, "writing data to cache");
475 } 475 }
476 476
477 static void bch_insert_data_loop(struct closure *cl) 477 static void bch_insert_data_loop(struct closure *cl)
478 { 478 {
479 struct btree_op *op = container_of(cl, struct btree_op, cl); 479 struct btree_op *op = container_of(cl, struct btree_op, cl);
480 struct search *s = container_of(op, struct search, op); 480 struct search *s = container_of(op, struct search, op);
481 struct bio *bio = op->cache_bio, *n; 481 struct bio *bio = op->cache_bio, *n;
482 482
483 if (op->skip) 483 if (op->skip)
484 return bio_invalidate(cl); 484 return bio_invalidate(cl);
485 485
486 if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { 486 if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
487 set_gc_sectors(op->c); 487 set_gc_sectors(op->c);
488 bch_queue_gc(op->c); 488 bch_queue_gc(op->c);
489 } 489 }
490 490
491 do { 491 do {
492 unsigned i; 492 unsigned i;
493 struct bkey *k; 493 struct bkey *k;
494 struct bio_set *split = s->d 494 struct bio_set *split = s->d
495 ? s->d->bio_split : op->c->bio_split; 495 ? s->d->bio_split : op->c->bio_split;
496 496
497 /* 1 for the device pointer and 1 for the chksum */ 497 /* 1 for the device pointer and 1 for the chksum */
498 if (bch_keylist_realloc(&op->keys, 498 if (bch_keylist_realloc(&op->keys,
499 1 + (op->csum ? 1 : 0), 499 1 + (op->csum ? 1 : 0),
500 op->c)) 500 op->c))
501 continue_at(cl, bch_journal, bcache_wq); 501 continue_at(cl, bch_journal, bcache_wq);
502 502
503 k = op->keys.top; 503 k = op->keys.top;
504 bkey_init(k); 504 bkey_init(k);
505 SET_KEY_INODE(k, op->inode); 505 SET_KEY_INODE(k, op->inode);
506 SET_KEY_OFFSET(k, bio->bi_sector); 506 SET_KEY_OFFSET(k, bio->bi_sector);
507 507
508 if (!bch_alloc_sectors(k, bio_sectors(bio), s)) 508 if (!bch_alloc_sectors(k, bio_sectors(bio), s))
509 goto err; 509 goto err;
510 510
511 n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); 511 n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
512 if (!n) {
513 __bkey_put(op->c, k);
514 continue_at(cl, bch_insert_data_loop, bcache_wq);
515 }
516 512
517 n->bi_end_io = bch_insert_data_endio; 513 n->bi_end_io = bch_insert_data_endio;
518 n->bi_private = cl; 514 n->bi_private = cl;
519 515
520 if (s->writeback) { 516 if (s->writeback) {
521 SET_KEY_DIRTY(k, true); 517 SET_KEY_DIRTY(k, true);
522 518
523 for (i = 0; i < KEY_PTRS(k); i++) 519 for (i = 0; i < KEY_PTRS(k); i++)
524 SET_GC_MARK(PTR_BUCKET(op->c, k, i), 520 SET_GC_MARK(PTR_BUCKET(op->c, k, i),
525 GC_MARK_DIRTY); 521 GC_MARK_DIRTY);
526 } 522 }
527 523
528 SET_KEY_CSUM(k, op->csum); 524 SET_KEY_CSUM(k, op->csum);
529 if (KEY_CSUM(k)) 525 if (KEY_CSUM(k))
530 bio_csum(n, k); 526 bio_csum(n, k);
531 527
532 trace_bcache_cache_insert(k); 528 trace_bcache_cache_insert(k);
533 bch_keylist_push(&op->keys); 529 bch_keylist_push(&op->keys);
534 530
535 n->bi_rw |= REQ_WRITE; 531 n->bi_rw |= REQ_WRITE;
536 bch_submit_bbio(n, op->c, k, 0); 532 bch_submit_bbio(n, op->c, k, 0);
537 } while (n != bio); 533 } while (n != bio);
538 534
539 op->insert_data_done = true; 535 op->insert_data_done = true;
540 continue_at(cl, bch_journal, bcache_wq); 536 continue_at(cl, bch_journal, bcache_wq);
541 err: 537 err:
542 /* bch_alloc_sectors() blocks if s->writeback = true */ 538 /* bch_alloc_sectors() blocks if s->writeback = true */
543 BUG_ON(s->writeback); 539 BUG_ON(s->writeback);
544 540
545 /* 541 /*
546 * But if it's not a writeback write we'd rather just bail out if 542 * But if it's not a writeback write we'd rather just bail out if
547 * there aren't any buckets ready to write to - it might take a while and 543 * there aren't any buckets ready to write to - it might take a while and
548 * we might be starving btree writes for gc or something. 544 * we might be starving btree writes for gc or something.
549 */ 545 */
550 546
551 if (s->write) { 547 if (s->write) {
552 /* 548 /*
553 * Writethrough write: We can't complete the write until we've 549 * Writethrough write: We can't complete the write until we've
554 * updated the index. But we don't want to delay the write while 550 * updated the index. But we don't want to delay the write while
555 * we wait for buckets to be freed up, so just invalidate the 551 * we wait for buckets to be freed up, so just invalidate the
556 * rest of the write. 552 * rest of the write.
557 */ 553 */
558 op->skip = true; 554 op->skip = true;
559 return bio_invalidate(cl); 555 return bio_invalidate(cl);
560 } else { 556 } else {
561 /* 557 /*
562 * From a cache miss, we can just insert the keys for the data 558 * From a cache miss, we can just insert the keys for the data
563 * we have written or bail out if we didn't do anything. 559 * we have written or bail out if we didn't do anything.
564 */ 560 */
565 op->insert_data_done = true; 561 op->insert_data_done = true;
566 bio_put(bio); 562 bio_put(bio);
567 563
568 if (!bch_keylist_empty(&op->keys)) 564 if (!bch_keylist_empty(&op->keys))
569 continue_at(cl, bch_journal, bcache_wq); 565 continue_at(cl, bch_journal, bcache_wq);
570 else 566 else
571 closure_return(cl); 567 closure_return(cl);
572 } 568 }
573 } 569 }
574 570
575 /** 571 /**
576 * bch_insert_data - stick some data in the cache 572 * bch_insert_data - stick some data in the cache
577 * 573 *
578 * This is the starting point for any data to end up in a cache device; it could 574 * This is the starting point for any data to end up in a cache device; it could
579 * be from a normal write, or a writeback write, or a write to a flash only 575 * be from a normal write, or a writeback write, or a write to a flash only
580 * volume - it's also used by the moving garbage collector to compact data in 576 * volume - it's also used by the moving garbage collector to compact data in
581 * mostly empty buckets. 577 * mostly empty buckets.
582 * 578 *
583 * It first writes the data to the cache, creating a list of keys to be inserted 579 * It first writes the data to the cache, creating a list of keys to be inserted
584 * (if the data had to be fragmented there will be multiple keys); after the 580 * (if the data had to be fragmented there will be multiple keys); after the
585 * data is written it calls bch_journal, and after the keys have been added to 581 * data is written it calls bch_journal, and after the keys have been added to
586 * the next journal write they're inserted into the btree. 582 * the next journal write they're inserted into the btree.
587 * 583 *
588 * It inserts the data in op->cache_bio; bi_sector is used for the key offset, 584 * It inserts the data in op->cache_bio; bi_sector is used for the key offset,
589 * and op->inode is used for the key inode. 585 * and op->inode is used for the key inode.
590 * 586 *
591 * If op->skip is true, instead of inserting the data it invalidates the region 587 * If op->skip is true, instead of inserting the data it invalidates the region
592 * of the cache represented by op->cache_bio and op->inode. 588 * of the cache represented by op->cache_bio and op->inode.
593 */ 589 */
594 void bch_insert_data(struct closure *cl) 590 void bch_insert_data(struct closure *cl)
595 { 591 {
596 struct btree_op *op = container_of(cl, struct btree_op, cl); 592 struct btree_op *op = container_of(cl, struct btree_op, cl);
597 593
598 bch_keylist_init(&op->keys); 594 bch_keylist_init(&op->keys);
599 bio_get(op->cache_bio); 595 bio_get(op->cache_bio);
600 bch_insert_data_loop(cl); 596 bch_insert_data_loop(cl);
601 } 597 }
602 598
603 void bch_btree_insert_async(struct closure *cl) 599 void bch_btree_insert_async(struct closure *cl)
604 { 600 {
605 struct btree_op *op = container_of(cl, struct btree_op, cl); 601 struct btree_op *op = container_of(cl, struct btree_op, cl);
606 struct search *s = container_of(op, struct search, op); 602 struct search *s = container_of(op, struct search, op);
607 603
608 if (bch_btree_insert(op, op->c)) { 604 if (bch_btree_insert(op, op->c)) {
609 s->error = -ENOMEM; 605 s->error = -ENOMEM;
610 op->insert_data_done = true; 606 op->insert_data_done = true;
611 } 607 }
612 608
613 if (op->insert_data_done) { 609 if (op->insert_data_done) {
614 bch_keylist_free(&op->keys); 610 bch_keylist_free(&op->keys);
615 closure_return(cl); 611 closure_return(cl);
616 } else 612 } else
617 continue_at(cl, bch_insert_data_loop, bcache_wq); 613 continue_at(cl, bch_insert_data_loop, bcache_wq);
618 } 614 }
619 615
620 /* Common code for the make_request functions */ 616 /* Common code for the make_request functions */
621 617
622 static void request_endio(struct bio *bio, int error) 618 static void request_endio(struct bio *bio, int error)
623 { 619 {
624 struct closure *cl = bio->bi_private; 620 struct closure *cl = bio->bi_private;
625 621
626 if (error) { 622 if (error) {
627 struct search *s = container_of(cl, struct search, cl); 623 struct search *s = container_of(cl, struct search, cl);
628 s->error = error; 624 s->error = error;
629 /* Only cache read errors are recoverable */ 625 /* Only cache read errors are recoverable */
630 s->recoverable = false; 626 s->recoverable = false;
631 } 627 }
632 628
633 bio_put(bio); 629 bio_put(bio);
634 closure_put(cl); 630 closure_put(cl);
635 } 631 }
636 632
637 void bch_cache_read_endio(struct bio *bio, int error) 633 void bch_cache_read_endio(struct bio *bio, int error)
638 { 634 {
639 struct bbio *b = container_of(bio, struct bbio, bio); 635 struct bbio *b = container_of(bio, struct bbio, bio);
640 struct closure *cl = bio->bi_private; 636 struct closure *cl = bio->bi_private;
641 struct search *s = container_of(cl, struct search, cl); 637 struct search *s = container_of(cl, struct search, cl);
642 638
643 /* 639 /*
644 * If the bucket was reused while our bio was in flight, we might have 640 * If the bucket was reused while our bio was in flight, we might have
645 * read the wrong data. Set s->error but not error so it doesn't get 641 * read the wrong data. Set s->error but not error so it doesn't get
646 * counted against the cache device, but we'll still reread the data 642 * counted against the cache device, but we'll still reread the data
647 * from the backing device. 643 * from the backing device.
648 */ 644 */
649 645
650 if (error) 646 if (error)
651 s->error = error; 647 s->error = error;
652 else if (ptr_stale(s->op.c, &b->key, 0)) { 648 else if (ptr_stale(s->op.c, &b->key, 0)) {
653 atomic_long_inc(&s->op.c->cache_read_races); 649 atomic_long_inc(&s->op.c->cache_read_races);
654 s->error = -EINTR; 650 s->error = -EINTR;
655 } 651 }
656 652
657 bch_bbio_endio(s->op.c, bio, error, "reading from cache"); 653 bch_bbio_endio(s->op.c, bio, error, "reading from cache");
658 } 654 }
659 655
660 static void bio_complete(struct search *s) 656 static void bio_complete(struct search *s)
661 { 657 {
662 if (s->orig_bio) { 658 if (s->orig_bio) {
663 int cpu, rw = bio_data_dir(s->orig_bio); 659 int cpu, rw = bio_data_dir(s->orig_bio);
664 unsigned long duration = jiffies - s->start_time; 660 unsigned long duration = jiffies - s->start_time;
665 661
666 cpu = part_stat_lock(); 662 cpu = part_stat_lock();
667 part_round_stats(cpu, &s->d->disk->part0); 663 part_round_stats(cpu, &s->d->disk->part0);
668 part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration); 664 part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration);
669 part_stat_unlock(); 665 part_stat_unlock();
670 666
671 trace_bcache_request_end(s, s->orig_bio); 667 trace_bcache_request_end(s, s->orig_bio);
672 bio_endio(s->orig_bio, s->error); 668 bio_endio(s->orig_bio, s->error);
673 s->orig_bio = NULL; 669 s->orig_bio = NULL;
674 } 670 }
675 } 671 }
676 672
677 static void do_bio_hook(struct search *s) 673 static void do_bio_hook(struct search *s)
678 { 674 {
679 struct bio *bio = &s->bio.bio; 675 struct bio *bio = &s->bio.bio;
680 memcpy(bio, s->orig_bio, sizeof(struct bio)); 676 memcpy(bio, s->orig_bio, sizeof(struct bio));
681 677
682 bio->bi_end_io = request_endio; 678 bio->bi_end_io = request_endio;
683 bio->bi_private = &s->cl; 679 bio->bi_private = &s->cl;
684 atomic_set(&bio->bi_cnt, 3); 680 atomic_set(&bio->bi_cnt, 3);
685 } 681 }
686 682
687 static void search_free(struct closure *cl) 683 static void search_free(struct closure *cl)
688 { 684 {
689 struct search *s = container_of(cl, struct search, cl); 685 struct search *s = container_of(cl, struct search, cl);
690 bio_complete(s); 686 bio_complete(s);
691 687
692 if (s->op.cache_bio) 688 if (s->op.cache_bio)
693 bio_put(s->op.cache_bio); 689 bio_put(s->op.cache_bio);
694 690
695 if (s->unaligned_bvec) 691 if (s->unaligned_bvec)
696 mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); 692 mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec);
697 693
698 closure_debug_destroy(cl); 694 closure_debug_destroy(cl);
699 mempool_free(s, s->d->c->search); 695 mempool_free(s, s->d->c->search);
700 } 696 }
701 697
702 static struct search *search_alloc(struct bio *bio, struct bcache_device *d) 698 static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
703 { 699 {
704 struct bio_vec *bv; 700 struct bio_vec *bv;
705 struct search *s = mempool_alloc(d->c->search, GFP_NOIO); 701 struct search *s = mempool_alloc(d->c->search, GFP_NOIO);
706 memset(s, 0, offsetof(struct search, op.keys)); 702 memset(s, 0, offsetof(struct search, op.keys));
707 703
708 __closure_init(&s->cl, NULL); 704 __closure_init(&s->cl, NULL);
709 705
710 s->op.inode = d->id; 706 s->op.inode = d->id;
711 s->op.c = d->c; 707 s->op.c = d->c;
712 s->d = d; 708 s->d = d;
713 s->op.lock = -1; 709 s->op.lock = -1;
714 s->task = current; 710 s->task = current;
715 s->orig_bio = bio; 711 s->orig_bio = bio;
716 s->write = (bio->bi_rw & REQ_WRITE) != 0; 712 s->write = (bio->bi_rw & REQ_WRITE) != 0;
717 s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0; 713 s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0;
718 s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; 714 s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0;
719 s->recoverable = 1; 715 s->recoverable = 1;
720 s->start_time = jiffies; 716 s->start_time = jiffies;
721 do_bio_hook(s); 717 do_bio_hook(s);
722 718
723 if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) { 719 if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) {
724 bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO); 720 bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO);
725 memcpy(bv, bio_iovec(bio), 721 memcpy(bv, bio_iovec(bio),
726 sizeof(struct bio_vec) * bio_segments(bio)); 722 sizeof(struct bio_vec) * bio_segments(bio));
727 723
728 s->bio.bio.bi_io_vec = bv; 724 s->bio.bio.bi_io_vec = bv;
729 s->unaligned_bvec = 1; 725 s->unaligned_bvec = 1;
730 } 726 }
731 727
732 return s; 728 return s;
733 } 729 }
734 730
735 static void btree_read_async(struct closure *cl) 731 static void btree_read_async(struct closure *cl)
736 { 732 {
737 struct btree_op *op = container_of(cl, struct btree_op, cl); 733 struct btree_op *op = container_of(cl, struct btree_op, cl);
738 734
739 int ret = btree_root(search_recurse, op->c, op); 735 int ret = btree_root(search_recurse, op->c, op);
740 736
741 if (ret == -EAGAIN) 737 if (ret == -EAGAIN)
742 continue_at(cl, btree_read_async, bcache_wq); 738 continue_at(cl, btree_read_async, bcache_wq);
743 739
744 closure_return(cl); 740 closure_return(cl);
745 } 741 }
746 742
747 /* Cached devices */ 743 /* Cached devices */
748 744
749 static void cached_dev_bio_complete(struct closure *cl) 745 static void cached_dev_bio_complete(struct closure *cl)
750 { 746 {
751 struct search *s = container_of(cl, struct search, cl); 747 struct search *s = container_of(cl, struct search, cl);
752 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 748 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
753 749
754 search_free(cl); 750 search_free(cl);
755 cached_dev_put(dc); 751 cached_dev_put(dc);
756 } 752 }
757 753
758 /* Process reads */ 754 /* Process reads */
759 755
760 static void cached_dev_read_complete(struct closure *cl) 756 static void cached_dev_read_complete(struct closure *cl)
761 { 757 {
762 struct search *s = container_of(cl, struct search, cl); 758 struct search *s = container_of(cl, struct search, cl);
763 759
764 if (s->op.insert_collision) 760 if (s->op.insert_collision)
765 bch_mark_cache_miss_collision(s); 761 bch_mark_cache_miss_collision(s);
766 762
767 if (s->op.cache_bio) { 763 if (s->op.cache_bio) {
768 int i; 764 int i;
769 struct bio_vec *bv; 765 struct bio_vec *bv;
770 766
771 __bio_for_each_segment(bv, s->op.cache_bio, i, 0) 767 __bio_for_each_segment(bv, s->op.cache_bio, i, 0)
772 __free_page(bv->bv_page); 768 __free_page(bv->bv_page);
773 } 769 }
774 770
775 cached_dev_bio_complete(cl); 771 cached_dev_bio_complete(cl);
776 } 772 }
777 773
778 static void request_read_error(struct closure *cl) 774 static void request_read_error(struct closure *cl)
779 { 775 {
780 struct search *s = container_of(cl, struct search, cl); 776 struct search *s = container_of(cl, struct search, cl);
781 struct bio_vec *bv; 777 struct bio_vec *bv;
782 int i; 778 int i;
783 779
784 if (s->recoverable) { 780 if (s->recoverable) {
785 /* Retry from the backing device: */ 781 /* Retry from the backing device: */
786 trace_bcache_read_retry(s->orig_bio); 782 trace_bcache_read_retry(s->orig_bio);
787 783
788 s->error = 0; 784 s->error = 0;
789 bv = s->bio.bio.bi_io_vec; 785 bv = s->bio.bio.bi_io_vec;
790 do_bio_hook(s); 786 do_bio_hook(s);
791 s->bio.bio.bi_io_vec = bv; 787 s->bio.bio.bi_io_vec = bv;
792 788
793 if (!s->unaligned_bvec) 789 if (!s->unaligned_bvec)
794 bio_for_each_segment(bv, s->orig_bio, i) 790 bio_for_each_segment(bv, s->orig_bio, i)
795 bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; 791 bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
796 else 792 else
797 memcpy(s->bio.bio.bi_io_vec, 793 memcpy(s->bio.bio.bi_io_vec,
798 bio_iovec(s->orig_bio), 794 bio_iovec(s->orig_bio),
799 sizeof(struct bio_vec) * 795 sizeof(struct bio_vec) *
800 bio_segments(s->orig_bio)); 796 bio_segments(s->orig_bio));
801 797
802 /* XXX: invalidate cache */ 798 /* XXX: invalidate cache */
803 799
804 closure_bio_submit(&s->bio.bio, &s->cl, s->d); 800 closure_bio_submit(&s->bio.bio, &s->cl, s->d);
805 } 801 }
806 802
807 continue_at(cl, cached_dev_read_complete, NULL); 803 continue_at(cl, cached_dev_read_complete, NULL);
808 } 804 }
809 805
810 static void request_read_done(struct closure *cl) 806 static void request_read_done(struct closure *cl)
811 { 807 {
812 struct search *s = container_of(cl, struct search, cl); 808 struct search *s = container_of(cl, struct search, cl);
813 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 809 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
814 810
815 /* 811 /*
816 * s->cache_bio != NULL implies that we had a cache miss; cache_bio now 812 * s->cache_bio != NULL implies that we had a cache miss; cache_bio now
817 * contains data ready to be inserted into the cache. 813 * contains data ready to be inserted into the cache.
818 * 814 *
819 * First, we copy the data we just read from cache_bio's bounce buffers 815 * First, we copy the data we just read from cache_bio's bounce buffers
820 * to the buffers the original bio pointed to: 816 * to the buffers the original bio pointed to:
821 */ 817 */
822 818
823 if (s->op.cache_bio) { 819 if (s->op.cache_bio) {
824 struct bio_vec *src, *dst;
825 unsigned src_offset, dst_offset, bytes;
826 void *dst_ptr;
827
828 bio_reset(s->op.cache_bio); 820 bio_reset(s->op.cache_bio);
829 s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; 821 s->op.cache_bio->bi_sector = s->cache_miss->bi_sector;
830 s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; 822 s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev;
831 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; 823 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9;
832 bch_bio_map(s->op.cache_bio, NULL); 824 bch_bio_map(s->op.cache_bio, NULL);
833 825
834 src = bio_iovec(s->op.cache_bio); 826 bio_copy_data(s->cache_miss, s->op.cache_bio);
835 dst = bio_iovec(s->cache_miss);
836 src_offset = src->bv_offset;
837 dst_offset = dst->bv_offset;
838 dst_ptr = kmap(dst->bv_page);
839 827
840 while (1) {
841 if (dst_offset == dst->bv_offset + dst->bv_len) {
842 kunmap(dst->bv_page);
843 dst++;
844 if (dst == bio_iovec_idx(s->cache_miss,
845 s->cache_miss->bi_vcnt))
846 break;
847
848 dst_offset = dst->bv_offset;
849 dst_ptr = kmap(dst->bv_page);
850 }
851
852 if (src_offset == src->bv_offset + src->bv_len) {
853 src++;
854 if (src == bio_iovec_idx(s->op.cache_bio,
855 s->op.cache_bio->bi_vcnt))
856 BUG();
857
858 src_offset = src->bv_offset;
859 }
860
861 bytes = min(dst->bv_offset + dst->bv_len - dst_offset,
862 src->bv_offset + src->bv_len - src_offset);
863
864 memcpy(dst_ptr + dst_offset,
865 page_address(src->bv_page) + src_offset,
866 bytes);
867
868 src_offset += bytes;
869 dst_offset += bytes;
870 }
871
872 bio_put(s->cache_miss); 828 bio_put(s->cache_miss);
873 s->cache_miss = NULL; 829 s->cache_miss = NULL;
874 } 830 }
875 831
876 if (verify(dc, &s->bio.bio) && s->recoverable) 832 if (verify(dc, &s->bio.bio) && s->recoverable)
877 bch_data_verify(s); 833 bch_data_verify(s);
878 834
879 bio_complete(s); 835 bio_complete(s);
880 836
881 if (s->op.cache_bio && 837 if (s->op.cache_bio &&
882 !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) { 838 !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) {
883 s->op.type = BTREE_REPLACE; 839 s->op.type = BTREE_REPLACE;
884 closure_call(&s->op.cl, bch_insert_data, NULL, cl); 840 closure_call(&s->op.cl, bch_insert_data, NULL, cl);
885 } 841 }
886 842
887 continue_at(cl, cached_dev_read_complete, NULL); 843 continue_at(cl, cached_dev_read_complete, NULL);
888 } 844 }
889 845
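For context, the long kmap()/memcpy() loop deleted above is replaced by a single call to the block layer's bio_copy_data(dst, src), which walks the two bio_vec arrays independently, so the bounce bio and the original miss bio may have different segment layouts as long as they describe the same byte count. A minimal usage sketch combining the two helpers this commit leans on (hypothetical wrapper, not code from the commit; assumes both bios already have bi_size and their bio_vecs set up, e.g. via bch_bio_map()):

	/*
	 * Populate a bounce bio with its own pages, then duplicate the
	 * source bio's payload into it.
	 */
	static int clone_into_bounce(struct bio *bounce, struct bio *src)
	{
		if (bio_alloc_pages(bounce, GFP_NOIO))
			return -ENOMEM;

		bio_copy_data(bounce, src);	/* destination first, then source */
		return 0;
	}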
890 static void request_read_done_bh(struct closure *cl) 846 static void request_read_done_bh(struct closure *cl)
891 { 847 {
892 struct search *s = container_of(cl, struct search, cl); 848 struct search *s = container_of(cl, struct search, cl);
893 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 849 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
894 850
895 bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); 851 bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip);
896 trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip); 852 trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip);
897 853
898 if (s->error) 854 if (s->error)
899 continue_at_nobarrier(cl, request_read_error, bcache_wq); 855 continue_at_nobarrier(cl, request_read_error, bcache_wq);
900 else if (s->op.cache_bio || verify(dc, &s->bio.bio)) 856 else if (s->op.cache_bio || verify(dc, &s->bio.bio))
901 continue_at_nobarrier(cl, request_read_done, bcache_wq); 857 continue_at_nobarrier(cl, request_read_done, bcache_wq);
902 else 858 else
903 continue_at_nobarrier(cl, cached_dev_read_complete, NULL); 859 continue_at_nobarrier(cl, cached_dev_read_complete, NULL);
904 } 860 }
905 861
906 static int cached_dev_cache_miss(struct btree *b, struct search *s, 862 static int cached_dev_cache_miss(struct btree *b, struct search *s,
907 struct bio *bio, unsigned sectors) 863 struct bio *bio, unsigned sectors)
908 { 864 {
909 int ret = 0; 865 int ret = 0;
910 unsigned reada; 866 unsigned reada;
911 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 867 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
912 struct bio *miss; 868 struct bio *miss;
913 869
914 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); 870 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
915 if (!miss)
916 return -EAGAIN;
917
918 if (miss == bio) 871 if (miss == bio)
919 s->op.lookup_done = true; 872 s->op.lookup_done = true;
920 873
921 miss->bi_end_io = request_endio; 874 miss->bi_end_io = request_endio;
922 miss->bi_private = &s->cl; 875 miss->bi_private = &s->cl;
923 876
924 if (s->cache_miss || s->op.skip) 877 if (s->cache_miss || s->op.skip)
925 goto out_submit; 878 goto out_submit;
926 879
927 if (miss != bio || 880 if (miss != bio ||
928 (bio->bi_rw & REQ_RAHEAD) || 881 (bio->bi_rw & REQ_RAHEAD) ||
929 (bio->bi_rw & REQ_META) || 882 (bio->bi_rw & REQ_META) ||
930 s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA) 883 s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA)
931 reada = 0; 884 reada = 0;
932 else { 885 else {
933 reada = min(dc->readahead >> 9, 886 reada = min(dc->readahead >> 9,
934 sectors - bio_sectors(miss)); 887 sectors - bio_sectors(miss));
935 888
936 if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev)) 889 if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev))
937 reada = bdev_sectors(miss->bi_bdev) - bio_end(miss); 890 reada = bdev_sectors(miss->bi_bdev) -
891 bio_end_sector(miss);
938 } 892 }
939 893
940 s->cache_bio_sectors = bio_sectors(miss) + reada; 894 s->cache_bio_sectors = bio_sectors(miss) + reada;
941 s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT, 895 s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT,
942 DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS), 896 DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS),
943 dc->disk.bio_split); 897 dc->disk.bio_split);
944 898
945 if (!s->op.cache_bio) 899 if (!s->op.cache_bio)
946 goto out_submit; 900 goto out_submit;
947 901
948 s->op.cache_bio->bi_sector = miss->bi_sector; 902 s->op.cache_bio->bi_sector = miss->bi_sector;
949 s->op.cache_bio->bi_bdev = miss->bi_bdev; 903 s->op.cache_bio->bi_bdev = miss->bi_bdev;
950 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; 904 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9;
951 905
952 s->op.cache_bio->bi_end_io = request_endio; 906 s->op.cache_bio->bi_end_io = request_endio;
953 s->op.cache_bio->bi_private = &s->cl; 907 s->op.cache_bio->bi_private = &s->cl;
954 908
955 /* btree_search_recurse()'s btree iterator is no good anymore */ 909 /* btree_search_recurse()'s btree iterator is no good anymore */
956 ret = -EINTR; 910 ret = -EINTR;
957 if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio)) 911 if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio))
958 goto out_put; 912 goto out_put;
959 913
960 bch_bio_map(s->op.cache_bio, NULL); 914 bch_bio_map(s->op.cache_bio, NULL);
961 if (bch_bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) 915 if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO))
962 goto out_put; 916 goto out_put;
963 917
964 s->cache_miss = miss; 918 s->cache_miss = miss;
965 bio_get(s->op.cache_bio); 919 bio_get(s->op.cache_bio);
966 920
967 closure_bio_submit(s->op.cache_bio, &s->cl, s->d); 921 closure_bio_submit(s->op.cache_bio, &s->cl, s->d);
968 922
969 return ret; 923 return ret;
970 out_put: 924 out_put:
971 bio_put(s->op.cache_bio); 925 bio_put(s->op.cache_bio);
972 s->op.cache_bio = NULL; 926 s->op.cache_bio = NULL;
973 out_submit: 927 out_submit:
974 closure_bio_submit(miss, &s->cl, s->d); 928 closure_bio_submit(miss, &s->cl, s->d);
975 return ret; 929 return ret;
976 } 930 }
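The readahead sizing in cached_dev_cache_miss() is two clamps: read ahead no more than the configured budget or the part of the request this split did not cover, and never past the end of the backing device. The same arithmetic as a standalone helper, a sketch only (sector counts as plain uint64_t; all identifiers are illustrative, and as in the kernel code the miss itself is assumed to end on the device):

	#include <stdint.h>

	/* Clamp a readahead amount (in sectors), mirroring the logic above. */
	static uint64_t clamp_readahead(uint64_t readahead_sectors,
					uint64_t request_sectors,
					uint64_t miss_sectors,
					uint64_t miss_end_sector,
					uint64_t device_sectors)
	{
		uint64_t reada = readahead_sectors;

		/* no more than what the caller originally asked to read */
		if (reada > request_sectors - miss_sectors)
			reada = request_sectors - miss_sectors;

		/* and never past the end of the backing device */
		if (miss_end_sector + reada > device_sectors)
			reada = device_sectors - miss_end_sector;

		return reada;
	}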
977 931
978 static void request_read(struct cached_dev *dc, struct search *s) 932 static void request_read(struct cached_dev *dc, struct search *s)
979 { 933 {
980 struct closure *cl = &s->cl; 934 struct closure *cl = &s->cl;
981 935
982 check_should_skip(dc, s); 936 check_should_skip(dc, s);
983 closure_call(&s->op.cl, btree_read_async, NULL, cl); 937 closure_call(&s->op.cl, btree_read_async, NULL, cl);
984 938
985 continue_at(cl, request_read_done_bh, NULL); 939 continue_at(cl, request_read_done_bh, NULL);
986 } 940 }
987 941
988 /* Process writes */ 942 /* Process writes */
989 943
990 static void cached_dev_write_complete(struct closure *cl) 944 static void cached_dev_write_complete(struct closure *cl)
991 { 945 {
992 struct search *s = container_of(cl, struct search, cl); 946 struct search *s = container_of(cl, struct search, cl);
993 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 947 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
994 948
995 up_read_non_owner(&dc->writeback_lock); 949 up_read_non_owner(&dc->writeback_lock);
996 cached_dev_bio_complete(cl); 950 cached_dev_bio_complete(cl);
997 } 951 }
998 952
999 static void request_write(struct cached_dev *dc, struct search *s) 953 static void request_write(struct cached_dev *dc, struct search *s)
1000 { 954 {
1001 struct closure *cl = &s->cl; 955 struct closure *cl = &s->cl;
1002 struct bio *bio = &s->bio.bio; 956 struct bio *bio = &s->bio.bio;
1003 struct bkey start, end; 957 struct bkey start, end;
1004 start = KEY(dc->disk.id, bio->bi_sector, 0); 958 start = KEY(dc->disk.id, bio->bi_sector, 0);
1005 end = KEY(dc->disk.id, bio_end(bio), 0); 959 end = KEY(dc->disk.id, bio_end_sector(bio), 0);
1006 960
1007 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); 961 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end);
1008 962
1009 check_should_skip(dc, s); 963 check_should_skip(dc, s);
1010 down_read_non_owner(&dc->writeback_lock); 964 down_read_non_owner(&dc->writeback_lock);
1011 965
1012 if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { 966 if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) {
1013 s->op.skip = false; 967 s->op.skip = false;
1014 s->writeback = true; 968 s->writeback = true;
1015 } 969 }
1016 970
1017 if (bio->bi_rw & REQ_DISCARD) 971 if (bio->bi_rw & REQ_DISCARD)
1018 goto skip; 972 goto skip;
1019 973
1020 if (should_writeback(dc, s->orig_bio, 974 if (should_writeback(dc, s->orig_bio,
1021 cache_mode(dc, bio), 975 cache_mode(dc, bio),
1022 s->op.skip)) { 976 s->op.skip)) {
1023 s->op.skip = false; 977 s->op.skip = false;
1024 s->writeback = true; 978 s->writeback = true;
1025 } 979 }
1026 980
1027 if (s->op.skip) 981 if (s->op.skip)
1028 goto skip; 982 goto skip;
1029 983
1030 trace_bcache_write(s->orig_bio, s->writeback, s->op.skip); 984 trace_bcache_write(s->orig_bio, s->writeback, s->op.skip);
1031 985
1032 if (!s->writeback) { 986 if (!s->writeback) {
1033 s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, 987 s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
1034 dc->disk.bio_split); 988 dc->disk.bio_split);
1035 989
1036 closure_bio_submit(bio, cl, s->d); 990 closure_bio_submit(bio, cl, s->d);
1037 } else { 991 } else {
1038 bch_writeback_add(dc); 992 bch_writeback_add(dc);
1039 993
1040 if (s->op.flush_journal) { 994 if (s->op.flush_journal) {
1041 /* Also need to send a flush to the backing device */ 995 /* Also need to send a flush to the backing device */
1042 s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, 996 s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
1043 dc->disk.bio_split); 997 dc->disk.bio_split);
1044 998
1045 bio->bi_size = 0; 999 bio->bi_size = 0;
1046 bio->bi_vcnt = 0; 1000 bio->bi_vcnt = 0;
1047 closure_bio_submit(bio, cl, s->d); 1001 closure_bio_submit(bio, cl, s->d);
1048 } else { 1002 } else {
1049 s->op.cache_bio = bio; 1003 s->op.cache_bio = bio;
1050 } 1004 }
1051 } 1005 }
1052 out: 1006 out:
1053 closure_call(&s->op.cl, bch_insert_data, NULL, cl); 1007 closure_call(&s->op.cl, bch_insert_data, NULL, cl);
1054 continue_at(cl, cached_dev_write_complete, NULL); 1008 continue_at(cl, cached_dev_write_complete, NULL);
1055 skip: 1009 skip:
1056 s->op.skip = true; 1010 s->op.skip = true;
1057 s->op.cache_bio = s->orig_bio; 1011 s->op.cache_bio = s->orig_bio;
1058 bio_get(s->op.cache_bio); 1012 bio_get(s->op.cache_bio);
1059 1013
1060 if ((bio->bi_rw & REQ_DISCARD) && 1014 if ((bio->bi_rw & REQ_DISCARD) &&
1061 !blk_queue_discard(bdev_get_queue(dc->bdev))) 1015 !blk_queue_discard(bdev_get_queue(dc->bdev)))
1062 goto out; 1016 goto out;
1063 1017
1064 closure_bio_submit(bio, cl, s->d); 1018 closure_bio_submit(bio, cl, s->d);
1065 goto out; 1019 goto out;
1066 } 1020 }
1067 1021
1068 static void request_nodata(struct cached_dev *dc, struct search *s) 1022 static void request_nodata(struct cached_dev *dc, struct search *s)
1069 { 1023 {
1070 struct closure *cl = &s->cl; 1024 struct closure *cl = &s->cl;
1071 struct bio *bio = &s->bio.bio; 1025 struct bio *bio = &s->bio.bio;
1072 1026
1073 if (bio->bi_rw & REQ_DISCARD) { 1027 if (bio->bi_rw & REQ_DISCARD) {
1074 request_write(dc, s); 1028 request_write(dc, s);
1075 return; 1029 return;
1076 } 1030 }
1077 1031
1078 if (s->op.flush_journal) 1032 if (s->op.flush_journal)
1079 bch_journal_meta(s->op.c, cl); 1033 bch_journal_meta(s->op.c, cl);
1080 1034
1081 closure_bio_submit(bio, cl, s->d); 1035 closure_bio_submit(bio, cl, s->d);
1082 1036
1083 continue_at(cl, cached_dev_bio_complete, NULL); 1037 continue_at(cl, cached_dev_bio_complete, NULL);
1084 } 1038 }
1085 1039
1086 /* Cached devices - read & write stuff */ 1040 /* Cached devices - read & write stuff */
1087 1041
1088 unsigned bch_get_congested(struct cache_set *c) 1042 unsigned bch_get_congested(struct cache_set *c)
1089 { 1043 {
1090 int i; 1044 int i;
1091 long rand; 1045 long rand;
1092 1046
1093 if (!c->congested_read_threshold_us && 1047 if (!c->congested_read_threshold_us &&
1094 !c->congested_write_threshold_us) 1048 !c->congested_write_threshold_us)
1095 return 0; 1049 return 0;
1096 1050
1097 i = (local_clock_us() - c->congested_last_us) / 1024; 1051 i = (local_clock_us() - c->congested_last_us) / 1024;
1098 if (i < 0) 1052 if (i < 0)
1099 return 0; 1053 return 0;
1100 1054
1101 i += atomic_read(&c->congested); 1055 i += atomic_read(&c->congested);
1102 if (i >= 0) 1056 if (i >= 0)
1103 return 0; 1057 return 0;
1104 1058
1105 i += CONGESTED_MAX; 1059 i += CONGESTED_MAX;
1106 1060
1107 if (i > 0) 1061 if (i > 0)
1108 i = fract_exp_two(i, 6); 1062 i = fract_exp_two(i, 6);
1109 1063
1110 rand = get_random_int(); 1064 rand = get_random_int();
1111 i -= bitmap_weight(&rand, BITS_PER_LONG); 1065 i -= bitmap_weight(&rand, BITS_PER_LONG);
1112 1066
1113 return i > 0 ? i : 1; 1067 return i > 0 ? i : 1;
1114 } 1068 }
1115 1069
1116 static void add_sequential(struct task_struct *t) 1070 static void add_sequential(struct task_struct *t)
1117 { 1071 {
1118 ewma_add(t->sequential_io_avg, 1072 ewma_add(t->sequential_io_avg,
1119 t->sequential_io, 8, 0); 1073 t->sequential_io, 8, 0);
1120 1074
1121 t->sequential_io = 0; 1075 t->sequential_io = 0;
1122 } 1076 }
1123 1077
1124 static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) 1078 static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
1125 { 1079 {
1126 return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; 1080 return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
1127 } 1081 }
1128 1082
1129 static void check_should_skip(struct cached_dev *dc, struct search *s) 1083 static void check_should_skip(struct cached_dev *dc, struct search *s)
1130 { 1084 {
1131 struct cache_set *c = s->op.c; 1085 struct cache_set *c = s->op.c;
1132 struct bio *bio = &s->bio.bio; 1086 struct bio *bio = &s->bio.bio;
1133 unsigned mode = cache_mode(dc, bio); 1087 unsigned mode = cache_mode(dc, bio);
1134 unsigned sectors, congested = bch_get_congested(c); 1088 unsigned sectors, congested = bch_get_congested(c);
1135 1089
1136 if (atomic_read(&dc->disk.detaching) || 1090 if (atomic_read(&dc->disk.detaching) ||
1137 c->gc_stats.in_use > CUTOFF_CACHE_ADD || 1091 c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
1138 (bio->bi_rw & REQ_DISCARD)) 1092 (bio->bi_rw & REQ_DISCARD))
1139 goto skip; 1093 goto skip;
1140 1094
1141 if (mode == CACHE_MODE_NONE || 1095 if (mode == CACHE_MODE_NONE ||
1142 (mode == CACHE_MODE_WRITEAROUND && 1096 (mode == CACHE_MODE_WRITEAROUND &&
1143 (bio->bi_rw & REQ_WRITE))) 1097 (bio->bi_rw & REQ_WRITE)))
1144 goto skip; 1098 goto skip;
1145 1099
1146 if (bio->bi_sector & (c->sb.block_size - 1) || 1100 if (bio->bi_sector & (c->sb.block_size - 1) ||
1147 bio_sectors(bio) & (c->sb.block_size - 1)) { 1101 bio_sectors(bio) & (c->sb.block_size - 1)) {
1148 pr_debug("skipping unaligned io"); 1102 pr_debug("skipping unaligned io");
1149 goto skip; 1103 goto skip;
1150 } 1104 }
1151 1105
1152 if (!congested && !dc->sequential_cutoff) 1106 if (!congested && !dc->sequential_cutoff)
1153 goto rescale; 1107 goto rescale;
1154 1108
1155 if (!congested && 1109 if (!congested &&
1156 mode == CACHE_MODE_WRITEBACK && 1110 mode == CACHE_MODE_WRITEBACK &&
1157 (bio->bi_rw & REQ_WRITE) && 1111 (bio->bi_rw & REQ_WRITE) &&
1158 (bio->bi_rw & REQ_SYNC)) 1112 (bio->bi_rw & REQ_SYNC))
1159 goto rescale; 1113 goto rescale;
1160 1114
1161 if (dc->sequential_merge) { 1115 if (dc->sequential_merge) {
1162 struct io *i; 1116 struct io *i;
1163 1117
1164 spin_lock(&dc->io_lock); 1118 spin_lock(&dc->io_lock);
1165 1119
1166 hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) 1120 hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash)
1167 if (i->last == bio->bi_sector && 1121 if (i->last == bio->bi_sector &&
1168 time_before(jiffies, i->jiffies)) 1122 time_before(jiffies, i->jiffies))
1169 goto found; 1123 goto found;
1170 1124
1171 i = list_first_entry(&dc->io_lru, struct io, lru); 1125 i = list_first_entry(&dc->io_lru, struct io, lru);
1172 1126
1173 add_sequential(s->task); 1127 add_sequential(s->task);
1174 i->sequential = 0; 1128 i->sequential = 0;
1175 found: 1129 found:
1176 if (i->sequential + bio->bi_size > i->sequential) 1130 if (i->sequential + bio->bi_size > i->sequential)
1177 i->sequential += bio->bi_size; 1131 i->sequential += bio->bi_size;
1178 1132
1179 i->last = bio_end(bio); 1133 i->last = bio_end_sector(bio);
1180 i->jiffies = jiffies + msecs_to_jiffies(5000); 1134 i->jiffies = jiffies + msecs_to_jiffies(5000);
1181 s->task->sequential_io = i->sequential; 1135 s->task->sequential_io = i->sequential;
1182 1136
1183 hlist_del(&i->hash); 1137 hlist_del(&i->hash);
1184 hlist_add_head(&i->hash, iohash(dc, i->last)); 1138 hlist_add_head(&i->hash, iohash(dc, i->last));
1185 list_move_tail(&i->lru, &dc->io_lru); 1139 list_move_tail(&i->lru, &dc->io_lru);
1186 1140
1187 spin_unlock(&dc->io_lock); 1141 spin_unlock(&dc->io_lock);
1188 } else { 1142 } else {
1189 s->task->sequential_io = bio->bi_size; 1143 s->task->sequential_io = bio->bi_size;
1190 1144
1191 add_sequential(s->task); 1145 add_sequential(s->task);
1192 } 1146 }
1193 1147
1194 sectors = max(s->task->sequential_io, 1148 sectors = max(s->task->sequential_io,
1195 s->task->sequential_io_avg) >> 9; 1149 s->task->sequential_io_avg) >> 9;
1196 1150
1197 if (dc->sequential_cutoff && 1151 if (dc->sequential_cutoff &&
1198 sectors >= dc->sequential_cutoff >> 9) { 1152 sectors >= dc->sequential_cutoff >> 9) {
1199 trace_bcache_bypass_sequential(s->orig_bio); 1153 trace_bcache_bypass_sequential(s->orig_bio);
1200 goto skip; 1154 goto skip;
1201 } 1155 }
1202 1156
1203 if (congested && sectors >= congested) { 1157 if (congested && sectors >= congested) {
1204 trace_bcache_bypass_congested(s->orig_bio); 1158 trace_bcache_bypass_congested(s->orig_bio);
1205 goto skip; 1159 goto skip;
1206 } 1160 }
1207 1161
1208 rescale: 1162 rescale:
1209 bch_rescale_priorities(c, bio_sectors(bio)); 1163 bch_rescale_priorities(c, bio_sectors(bio));
1210 return; 1164 return;
1211 skip: 1165 skip:
1212 bch_mark_sectors_bypassed(s, bio_sectors(bio)); 1166 bch_mark_sectors_bypassed(s, bio_sectors(bio));
1213 s->op.skip = true; 1167 s->op.skip = true;
1214 } 1168 }
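check_should_skip() decides whether a request bypasses the cache: if a bio starts exactly where a recently tracked bio ended, the two are treated as one sequential stream and their sizes accumulate (hashed into dc->io_hash, recycled through dc->io_lru, expired after 5 seconds, and smoothed per task via sequential_io_avg); once the accumulated size reaches dc->sequential_cutoff the stream goes straight to the backing device. A much-reduced userspace sketch of that bookkeeping, with a fixed-size table instead of the hashed LRU, no expiry, no EWMA, and no locking (all names here are illustrative):

	#include <stdbool.h>
	#include <stdint.h>

	#define NTRACK 16		/* stand-in for the kernel's io_hash/io_lru */

	struct io_track {
		uint64_t next_sector;	/* sector right after the last bio in the stream */
		uint64_t sequential;	/* bytes accumulated so far in the stream */
	};

	static struct io_track track[NTRACK];

	/* Return true if this request should bypass the cache because it is part
	 * of a sequential stream at least `cutoff` bytes long. */
	static bool bypass_sequential(uint64_t sector, uint64_t len_bytes,
				      uint64_t cutoff)
	{
		struct io_track *t = &track[sector % NTRACK];
		uint64_t end = sector + (len_bytes >> 9);
		uint64_t seq;

		if (t->next_sector == sector)
			seq = t->sequential + len_bytes;	/* continues a stream */
		else
			seq = len_bytes;			/* starts a new stream */

		/* re-file the stream under its new end sector, like the
		 * hlist_add_head(&i->hash, iohash(dc, i->last)) above */
		t = &track[end % NTRACK];
		t->next_sector = end;
		t->sequential = seq;

		return cutoff && seq >= cutoff;
	}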
1215 1169
1216 static void cached_dev_make_request(struct request_queue *q, struct bio *bio) 1170 static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
1217 { 1171 {
1218 struct search *s; 1172 struct search *s;
1219 struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; 1173 struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
1220 struct cached_dev *dc = container_of(d, struct cached_dev, disk); 1174 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
1221 int cpu, rw = bio_data_dir(bio); 1175 int cpu, rw = bio_data_dir(bio);
1222 1176
1223 cpu = part_stat_lock(); 1177 cpu = part_stat_lock();
1224 part_stat_inc(cpu, &d->disk->part0, ios[rw]); 1178 part_stat_inc(cpu, &d->disk->part0, ios[rw]);
1225 part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); 1179 part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
1226 part_stat_unlock(); 1180 part_stat_unlock();
1227 1181
1228 bio->bi_bdev = dc->bdev; 1182 bio->bi_bdev = dc->bdev;
1229 bio->bi_sector += dc->sb.data_offset; 1183 bio->bi_sector += dc->sb.data_offset;
1230 1184
1231 if (cached_dev_get(dc)) { 1185 if (cached_dev_get(dc)) {
1232 s = search_alloc(bio, d); 1186 s = search_alloc(bio, d);
1233 trace_bcache_request_start(s, bio); 1187 trace_bcache_request_start(s, bio);
1234 1188
1235 if (!bio_has_data(bio)) 1189 if (!bio_has_data(bio))
1236 request_nodata(dc, s); 1190 request_nodata(dc, s);
1237 else if (rw) 1191 else if (rw)
1238 request_write(dc, s); 1192 request_write(dc, s);
1239 else 1193 else
1240 request_read(dc, s); 1194 request_read(dc, s);
1241 } else { 1195 } else {
1242 if ((bio->bi_rw & REQ_DISCARD) && 1196 if ((bio->bi_rw & REQ_DISCARD) &&
1243 !blk_queue_discard(bdev_get_queue(dc->bdev))) 1197 !blk_queue_discard(bdev_get_queue(dc->bdev)))
1244 bio_endio(bio, 0); 1198 bio_endio(bio, 0);
1245 else 1199 else
1246 bch_generic_make_request(bio, &d->bio_split_hook); 1200 bch_generic_make_request(bio, &d->bio_split_hook);
1247 } 1201 }
1248 } 1202 }
1249 1203
1250 static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, 1204 static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
1251 unsigned int cmd, unsigned long arg) 1205 unsigned int cmd, unsigned long arg)
1252 { 1206 {
1253 struct cached_dev *dc = container_of(d, struct cached_dev, disk); 1207 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
1254 return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); 1208 return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg);
1255 } 1209 }
1256 1210
1257 static int cached_dev_congested(void *data, int bits) 1211 static int cached_dev_congested(void *data, int bits)
1258 { 1212 {
1259 struct bcache_device *d = data; 1213 struct bcache_device *d = data;
1260 struct cached_dev *dc = container_of(d, struct cached_dev, disk); 1214 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
1261 struct request_queue *q = bdev_get_queue(dc->bdev); 1215 struct request_queue *q = bdev_get_queue(dc->bdev);
1262 int ret = 0; 1216 int ret = 0;
1263 1217
1264 if (bdi_congested(&q->backing_dev_info, bits)) 1218 if (bdi_congested(&q->backing_dev_info, bits))
1265 return 1; 1219 return 1;
1266 1220
1267 if (cached_dev_get(dc)) { 1221 if (cached_dev_get(dc)) {
1268 unsigned i; 1222 unsigned i;
1269 struct cache *ca; 1223 struct cache *ca;
1270 1224
1271 for_each_cache(ca, d->c, i) { 1225 for_each_cache(ca, d->c, i) {
1272 q = bdev_get_queue(ca->bdev); 1226 q = bdev_get_queue(ca->bdev);
1273 ret |= bdi_congested(&q->backing_dev_info, bits); 1227 ret |= bdi_congested(&q->backing_dev_info, bits);
1274 } 1228 }
1275 1229
1276 cached_dev_put(dc); 1230 cached_dev_put(dc);
1277 } 1231 }
1278 1232
1279 return ret; 1233 return ret;
1280 } 1234 }
1281 1235
1282 void bch_cached_dev_request_init(struct cached_dev *dc) 1236 void bch_cached_dev_request_init(struct cached_dev *dc)
1283 { 1237 {
1284 struct gendisk *g = dc->disk.disk; 1238 struct gendisk *g = dc->disk.disk;
1285 1239
1286 g->queue->make_request_fn = cached_dev_make_request; 1240 g->queue->make_request_fn = cached_dev_make_request;
1287 g->queue->backing_dev_info.congested_fn = cached_dev_congested; 1241 g->queue->backing_dev_info.congested_fn = cached_dev_congested;
1288 dc->disk.cache_miss = cached_dev_cache_miss; 1242 dc->disk.cache_miss = cached_dev_cache_miss;
1289 dc->disk.ioctl = cached_dev_ioctl; 1243 dc->disk.ioctl = cached_dev_ioctl;
1290 } 1244 }
1291 1245
1292 /* Flash backed devices */ 1246 /* Flash backed devices */
1293 1247
1294 static int flash_dev_cache_miss(struct btree *b, struct search *s, 1248 static int flash_dev_cache_miss(struct btree *b, struct search *s,
1295 struct bio *bio, unsigned sectors) 1249 struct bio *bio, unsigned sectors)
1296 { 1250 {
1251 struct bio_vec *bv;
1252 int i;
1253
1297 /* Zero fill bio */ 1254 /* Zero fill bio */
1298 1255
1299 while (bio->bi_idx != bio->bi_vcnt) { 1256 bio_for_each_segment(bv, bio, i) {
1300 struct bio_vec *bv = bio_iovec(bio);
1301 unsigned j = min(bv->bv_len >> 9, sectors); 1257 unsigned j = min(bv->bv_len >> 9, sectors);
1302 1258
1303 void *p = kmap(bv->bv_page); 1259 void *p = kmap(bv->bv_page);
1304 memset(p + bv->bv_offset, 0, j << 9); 1260 memset(p + bv->bv_offset, 0, j << 9);
1305 kunmap(bv->bv_page); 1261 kunmap(bv->bv_page);
1306 1262
1307 bv->bv_len -= j << 9; 1263 sectors -= j;
1308 bv->bv_offset += j << 9;
1309
1310 if (bv->bv_len)
1311 return 0;
1312
1313 bio->bi_sector += j;
1314 bio->bi_size -= j << 9;
1315
1316 bio->bi_idx++;
1317 sectors -= j;
1318 } 1264 }
1319 1265
1320 s->op.lookup_done = true; 1266 bio_advance(bio, min(sectors << 9, bio->bi_size));
1321 1267
1268 if (!bio->bi_size)
1269 s->op.lookup_done = true;
1270
1322 return 0; 1271 return 0;
1323 } 1272 }
1324 1273
1325 static void flash_dev_make_request(struct request_queue *q, struct bio *bio) 1274 static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
1326 { 1275 {
1327 struct search *s; 1276 struct search *s;
1328 struct closure *cl; 1277 struct closure *cl;
1329 struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; 1278 struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
1330 int cpu, rw = bio_data_dir(bio); 1279 int cpu, rw = bio_data_dir(bio);
1331 1280
1332 cpu = part_stat_lock(); 1281 cpu = part_stat_lock();
1333 part_stat_inc(cpu, &d->disk->part0, ios[rw]); 1282 part_stat_inc(cpu, &d->disk->part0, ios[rw]);
1334 part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); 1283 part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
1335 part_stat_unlock(); 1284 part_stat_unlock();
1336 1285
1337 s = search_alloc(bio, d); 1286 s = search_alloc(bio, d);
1338 cl = &s->cl; 1287 cl = &s->cl;
1339 bio = &s->bio.bio; 1288 bio = &s->bio.bio;
1340 1289
1341 trace_bcache_request_start(s, bio); 1290 trace_bcache_request_start(s, bio);
1342 1291
1343 if (bio_has_data(bio) && !rw) { 1292 if (bio_has_data(bio) && !rw) {
1344 closure_call(&s->op.cl, btree_read_async, NULL, cl); 1293 closure_call(&s->op.cl, btree_read_async, NULL, cl);
1345 } else if (bio_has_data(bio) || s->op.skip) { 1294 } else if (bio_has_data(bio) || s->op.skip) {
1346 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, 1295 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys,
1347 &KEY(d->id, bio->bi_sector, 0), 1296 &KEY(d->id, bio->bi_sector, 0),
1348 &KEY(d->id, bio_end(bio), 0)); 1297 &KEY(d->id, bio_end_sector(bio), 0));
1349 1298
1350 s->writeback = true; 1299 s->writeback = true;
1351 s->op.cache_bio = bio; 1300 s->op.cache_bio = bio;
1352 1301
1353 closure_call(&s->op.cl, bch_insert_data, NULL, cl); 1302 closure_call(&s->op.cl, bch_insert_data, NULL, cl);
1354 } else { 1303 } else {
1355 /* No data - probably a cache flush */ 1304 /* No data - probably a cache flush */
1356 if (s->op.flush_journal) 1305 if (s->op.flush_journal)
1357 bch_journal_meta(s->op.c, cl); 1306 bch_journal_meta(s->op.c, cl);
1358 } 1307 }
1359 1308
1360 continue_at(cl, search_free, NULL); 1309 continue_at(cl, search_free, NULL);
1361 } 1310 }
1362 1311
1363 static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode, 1312 static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode,
1364 unsigned int cmd, unsigned long arg) 1313 unsigned int cmd, unsigned long arg)
1365 { 1314 {
1366 return -ENOTTY; 1315 return -ENOTTY;
1367 } 1316 }
1368 1317
1369 static int flash_dev_congested(void *data, int bits) 1318 static int flash_dev_congested(void *data, int bits)
1370 { 1319 {
1371 struct bcache_device *d = data; 1320 struct bcache_device *d = data;
1372 struct request_queue *q; 1321 struct request_queue *q;
1373 struct cache *ca; 1322 struct cache *ca;
1374 unsigned i; 1323 unsigned i;
1375 int ret = 0; 1324 int ret = 0;
1376 1325
1377 for_each_cache(ca, d->c, i) { 1326 for_each_cache(ca, d->c, i) {
1378 q = bdev_get_queue(ca->bdev); 1327 q = bdev_get_queue(ca->bdev);
1379 ret |= bdi_congested(&q->backing_dev_info, bits); 1328 ret |= bdi_congested(&q->backing_dev_info, bits);
1380 } 1329 }
1381 1330
1382 return ret; 1331 return ret;
1383 } 1332 }
1384 1333
1385 void bch_flash_dev_request_init(struct bcache_device *d) 1334 void bch_flash_dev_request_init(struct bcache_device *d)
1386 { 1335 {
1387 struct gendisk *g = d->disk; 1336 struct gendisk *g = d->disk;
1388 1337
1389 g->queue->make_request_fn = flash_dev_make_request; 1338 g->queue->make_request_fn = flash_dev_make_request;
1390 g->queue->backing_dev_info.congested_fn = flash_dev_congested; 1339 g->queue->backing_dev_info.congested_fn = flash_dev_congested;
1391 d->cache_miss = flash_dev_cache_miss; 1340 d->cache_miss = flash_dev_cache_miss;
1392 d->ioctl = flash_dev_ioctl; 1341 d->ioctl = flash_dev_ioctl;
1393 } 1342 }
1394 1343
1395 void bch_request_exit(void) 1344 void bch_request_exit(void)
1396 { 1345 {
1397 #ifdef CONFIG_CGROUP_BCACHE 1346 #ifdef CONFIG_CGROUP_BCACHE
1398 cgroup_unload_subsys(&bcache_subsys); 1347 cgroup_unload_subsys(&bcache_subsys);
1399 #endif 1348 #endif
1400 if (bch_search_cache) 1349 if (bch_search_cache)
1401 kmem_cache_destroy(bch_search_cache); 1350 kmem_cache_destroy(bch_search_cache);
1402 } 1351 }
1403 1352
1404 int __init bch_request_init(void) 1353 int __init bch_request_init(void)
1405 { 1354 {
1406 bch_search_cache = KMEM_CACHE(search, 0); 1355 bch_search_cache = KMEM_CACHE(search, 0);
1407 if (!bch_search_cache) 1356 if (!bch_search_cache)
1408 return -ENOMEM; 1357 return -ENOMEM;
1409 1358
1410 #ifdef CONFIG_CGROUP_BCACHE 1359 #ifdef CONFIG_CGROUP_BCACHE
1411 cgroup_load_subsys(&bcache_subsys); 1360 cgroup_load_subsys(&bcache_subsys);
drivers/md/bcache/util.c
1 /* 1 /*
2 * random utility code, for bcache but in theory not specific to bcache 2 * random utility code, for bcache but in theory not specific to bcache
3 * 3 *
4 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> 4 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
5 * Copyright 2012 Google, Inc. 5 * Copyright 2012 Google, Inc.
6 */ 6 */
7 7
8 #include <linux/bio.h> 8 #include <linux/bio.h>
9 #include <linux/blkdev.h> 9 #include <linux/blkdev.h>
10 #include <linux/ctype.h> 10 #include <linux/ctype.h>
11 #include <linux/debugfs.h> 11 #include <linux/debugfs.h>
12 #include <linux/module.h> 12 #include <linux/module.h>
13 #include <linux/seq_file.h> 13 #include <linux/seq_file.h>
14 #include <linux/types.h> 14 #include <linux/types.h>
15 15
16 #include "util.h" 16 #include "util.h"
17 17
18 #define simple_strtoint(c, end, base) simple_strtol(c, end, base) 18 #define simple_strtoint(c, end, base) simple_strtol(c, end, base)
19 #define simple_strtouint(c, end, base) simple_strtoul(c, end, base) 19 #define simple_strtouint(c, end, base) simple_strtoul(c, end, base)
20 20
21 #define STRTO_H(name, type) \ 21 #define STRTO_H(name, type) \
22 int bch_ ## name ## _h(const char *cp, type *res) \ 22 int bch_ ## name ## _h(const char *cp, type *res) \
23 { \ 23 { \
24 int u = 0; \ 24 int u = 0; \
25 char *e; \ 25 char *e; \
26 type i = simple_ ## name(cp, &e, 10); \ 26 type i = simple_ ## name(cp, &e, 10); \
27 \ 27 \
28 switch (tolower(*e)) { \ 28 switch (tolower(*e)) { \
29 default: \ 29 default: \
30 return -EINVAL; \ 30 return -EINVAL; \
31 case 'y': \ 31 case 'y': \
32 case 'z': \ 32 case 'z': \
33 u++; \ 33 u++; \
34 case 'e': \ 34 case 'e': \
35 u++; \ 35 u++; \
36 case 'p': \ 36 case 'p': \
37 u++; \ 37 u++; \
38 case 't': \ 38 case 't': \
39 u++; \ 39 u++; \
40 case 'g': \ 40 case 'g': \
41 u++; \ 41 u++; \
42 case 'm': \ 42 case 'm': \
43 u++; \ 43 u++; \
44 case 'k': \ 44 case 'k': \
45 u++; \ 45 u++; \
46 if (e++ == cp) \ 46 if (e++ == cp) \
47 return -EINVAL; \ 47 return -EINVAL; \
48 case '\n': \ 48 case '\n': \
49 case '\0': \ 49 case '\0': \
50 if (*e == '\n') \ 50 if (*e == '\n') \
51 e++; \ 51 e++; \
52 } \ 52 } \
53 \ 53 \
54 if (*e) \ 54 if (*e) \
55 return -EINVAL; \ 55 return -EINVAL; \
56 \ 56 \
57 while (u--) { \ 57 while (u--) { \
58 if ((type) ~0 > 0 && \ 58 if ((type) ~0 > 0 && \
59 (type) ~0 / 1024 <= i) \ 59 (type) ~0 / 1024 <= i) \
60 return -EINVAL; \ 60 return -EINVAL; \
61 if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) || \ 61 if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) || \
62 (i < 0 && -ANYSINT_MAX(type) / 1024 > i)) \ 62 (i < 0 && -ANYSINT_MAX(type) / 1024 > i)) \
63 return -EINVAL; \ 63 return -EINVAL; \
64 i *= 1024; \ 64 i *= 1024; \
65 } \ 65 } \
66 \ 66 \
67 *res = i; \ 67 *res = i; \
68 return 0; \ 68 return 0; \
69 } \ 69 } \
70 70
71 STRTO_H(strtoint, int) 71 STRTO_H(strtoint, int)
72 STRTO_H(strtouint, unsigned int) 72 STRTO_H(strtouint, unsigned int)
73 STRTO_H(strtoll, long long) 73 STRTO_H(strtoll, long long)
74 STRTO_H(strtoull, unsigned long long) 74 STRTO_H(strtoull, unsigned long long)
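STRTO_H() generates the strto*_h() parsers, which accept a 1024-based suffix (k, m, g, ... up to y) and reject overflow; the deliberate switch fall-through turns the suffix into a repeat count of multiply-by-1024. A standalone userspace equivalent for unsigned 64-bit values only, as a sketch (no trailing-newline handling and no signed variants; the function name is an invention of the sketch):

	#include <ctype.h>
	#include <errno.h>
	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>

	/* Parse "128", "128k", "2m", "1g", ... into a count (1024-based suffixes).
	 * Returns 0 on success, -EINVAL on bad input or overflow. */
	static int strtoull_h(const char *cp, uint64_t *res)
	{
		static const char units[] = "kmgtpezy";
		const char *u;
		char *e;
		uint64_t i;
		int shift = 0;

		errno = 0;
		i = strtoull(cp, &e, 10);
		if (e == cp || errno)
			return -EINVAL;

		if (*e && (u = strchr(units, tolower((unsigned char) *e)))) {
			shift = (int) (u - units) + 1;	/* 'k' => 1, 'm' => 2, ... */
			e++;
		}
		if (*e)
			return -EINVAL;			/* trailing garbage */

		while (shift--) {
			if (i > UINT64_MAX / 1024)
				return -EINVAL;		/* would overflow */
			i *= 1024;
		}

		*res = i;
		return 0;
	}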
75 75
76 ssize_t bch_hprint(char *buf, int64_t v) 76 ssize_t bch_hprint(char *buf, int64_t v)
77 { 77 {
78 static const char units[] = "?kMGTPEZY"; 78 static const char units[] = "?kMGTPEZY";
79 char dec[4] = ""; 79 char dec[4] = "";
80 int u, t = 0; 80 int u, t = 0;
81 81
82 for (u = 0; v >= 1024 || v <= -1024; u++) { 82 for (u = 0; v >= 1024 || v <= -1024; u++) {
83 t = v & ~(~0 << 10); 83 t = v & ~(~0 << 10);
84 v >>= 10; 84 v >>= 10;
85 } 85 }
86 86
87 if (!u) 87 if (!u)
88 return sprintf(buf, "%llu", v); 88 return sprintf(buf, "%llu", v);
89 89
90 if (v < 100 && v > -100) 90 if (v < 100 && v > -100)
91 snprintf(dec, sizeof(dec), ".%i", t / 100); 91 snprintf(dec, sizeof(dec), ".%i", t / 100);
92 92
93 return sprintf(buf, "%lli%s%c", v, dec, units[u]); 93 return sprintf(buf, "%lli%s%c", v, dec, units[u]);
94 } 94 }
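bch_hprint() goes the other direction: render a raw count with the ?kMGTPEZY unit ladder, keeping a rough single decimal taken from the remainder of the final divide-by-1024. A hedged userspace sketch of the same idea (the output approximates, rather than byte-for-byte matches, the kernel function, and the buffer-size requirement is an assumption of the sketch):

	#include <inttypes.h>
	#include <stdio.h>

	/* Render v with a 1024-based unit suffix and one approximate decimal.
	 * buf is assumed to have room for ~24 bytes. */
	static int hprint(char *buf, int64_t v)
	{
		static const char units[] = "?kMGTPEZY";
		int64_t t = 0;
		int u;

		for (u = 0; v >= 1024 || v <= -1024; u++) {
			t = v & 1023;		/* remainder kept for the decimal */
			v >>= 10;
		}

		if (!u)
			return sprintf(buf, "%" PRId64, v);

		/* t/100 maps the 0..1023 remainder onto a rough 0..10 "decimal" */
		return sprintf(buf, "%" PRId64 ".%d%c", v, (int) (t / 100), units[u]);
	}

	int main(void)
	{
		char buf[32];

		hprint(buf, 3 << 20);		/* prints "3.0M" */
		puts(buf);
		return 0;
	}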
95 95
96 ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], 96 ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
97 size_t selected) 97 size_t selected)
98 { 98 {
99 char *out = buf; 99 char *out = buf;
100 size_t i; 100 size_t i;
101 101
102 for (i = 0; list[i]; i++) 102 for (i = 0; list[i]; i++)
103 out += snprintf(out, buf + size - out, 103 out += snprintf(out, buf + size - out,
104 i == selected ? "[%s] " : "%s ", list[i]); 104 i == selected ? "[%s] " : "%s ", list[i]);
105 105
106 out[-1] = '\n'; 106 out[-1] = '\n';
107 return out - buf; 107 return out - buf;
108 } 108 }
109 109
110 ssize_t bch_read_string_list(const char *buf, const char * const list[]) 110 ssize_t bch_read_string_list(const char *buf, const char * const list[])
111 { 111 {
112 size_t i; 112 size_t i;
113 char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL); 113 char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL);
114 if (!d) 114 if (!d)
115 return -ENOMEM; 115 return -ENOMEM;
116 116
117 s = strim(d); 117 s = strim(d);
118 118
119 for (i = 0; list[i]; i++) 119 for (i = 0; list[i]; i++)
120 if (!strcmp(list[i], s)) 120 if (!strcmp(list[i], s))
121 break; 121 break;
122 122
123 kfree(d); 123 kfree(d);
124 124
125 if (!list[i]) 125 if (!list[i])
126 return -EINVAL; 126 return -EINVAL;
127 127
128 return i; 128 return i;
129 } 129 }
130 130
131 bool bch_is_zero(const char *p, size_t n) 131 bool bch_is_zero(const char *p, size_t n)
132 { 132 {
133 size_t i; 133 size_t i;
134 134
135 for (i = 0; i < n; i++) 135 for (i = 0; i < n; i++)
136 if (p[i]) 136 if (p[i])
137 return false; 137 return false;
138 return true; 138 return true;
139 } 139 }
140 140
141 int bch_parse_uuid(const char *s, char *uuid) 141 int bch_parse_uuid(const char *s, char *uuid)
142 { 142 {
143 size_t i, j, x; 143 size_t i, j, x;
144 memset(uuid, 0, 16); 144 memset(uuid, 0, 16);
145 145
146 for (i = 0, j = 0; 146 for (i = 0, j = 0;
147 i < strspn(s, "-0123456789:ABCDEFabcdef") && j < 32; 147 i < strspn(s, "-0123456789:ABCDEFabcdef") && j < 32;
148 i++) { 148 i++) {
149 x = s[i] | 32; 149 x = s[i] | 32;
150 150
151 switch (x) { 151 switch (x) {
152 case '0'...'9': 152 case '0'...'9':
153 x -= '0'; 153 x -= '0';
154 break; 154 break;
155 case 'a'...'f': 155 case 'a'...'f':
156 x -= 'a' - 10; 156 x -= 'a' - 10;
157 break; 157 break;
158 default: 158 default:
159 continue; 159 continue;
160 } 160 }
161 161
162 if (!(j & 1)) 162 if (!(j & 1))
163 x <<= 4; 163 x <<= 4;
164 uuid[j++ >> 1] |= x; 164 uuid[j++ >> 1] |= x;
165 } 165 }
166 return i; 166 return i;
167 } 167 }
168 168
169 void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) 169 void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
170 { 170 {
171 uint64_t now = local_clock(); 171 uint64_t now = local_clock();
172 uint64_t duration = time_after64(now, start_time) 172 uint64_t duration = time_after64(now, start_time)
173 ? now - start_time : 0; 173 ? now - start_time : 0;
174 uint64_t last = time_after64(now, stats->last) 174 uint64_t last = time_after64(now, stats->last)
175 ? now - stats->last : 0; 175 ? now - stats->last : 0;
176 176
177 stats->max_duration = max(stats->max_duration, duration); 177 stats->max_duration = max(stats->max_duration, duration);
178 178
179 if (stats->last) { 179 if (stats->last) {
180 ewma_add(stats->average_duration, duration, 8, 8); 180 ewma_add(stats->average_duration, duration, 8, 8);
181 181
182 if (stats->average_frequency) 182 if (stats->average_frequency)
183 ewma_add(stats->average_frequency, last, 8, 8); 183 ewma_add(stats->average_frequency, last, 8, 8);
184 else 184 else
185 stats->average_frequency = last << 8; 185 stats->average_frequency = last << 8;
186 } else { 186 } else {
187 stats->average_duration = duration << 8; 187 stats->average_duration = duration << 8;
188 } 188 }
189 189
190 stats->last = now ?: 1; 190 stats->last = now ?: 1;
191 } 191 }
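bch_time_stats_update() keeps exponentially weighted moving averages stored with 8 fractional bits, which is why the first sample is seeded with `duration << 8`. ewma_add() itself is defined in util.h; as a generic illustration of that kind of fixed-point accumulator (the weight of 8 and the 8-bit scale are assumptions of the sketch, chosen to match the call sites above, not a copy of the macro):

	#include <stdint.h>

	/* Fixed-point EWMA: the stored average carries 8 fractional bits and each
	 * new sample contributes 1/weight of the result.  Seed with sample << 8,
	 * read back with avg >> 8. */
	static uint64_t ewma_update(uint64_t avg_fp8, uint64_t sample, unsigned weight)
	{
		/* avg = avg * (weight - 1) / weight + sample / weight, in fixed point */
		return (avg_fp8 * (weight - 1) + (sample << 8)) / weight;
	}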
192 192
193 unsigned bch_next_delay(struct ratelimit *d, uint64_t done) 193 unsigned bch_next_delay(struct ratelimit *d, uint64_t done)
194 { 194 {
195 uint64_t now = local_clock(); 195 uint64_t now = local_clock();
196 196
197 d->next += div_u64(done, d->rate); 197 d->next += div_u64(done, d->rate);
198 198
199 return time_after64(d->next, now) 199 return time_after64(d->next, now)
200 ? div_u64(d->next - now, NSEC_PER_SEC / HZ) 200 ? div_u64(d->next - now, NSEC_PER_SEC / HZ)
201 : 0; 201 : 0;
202 } 202 }
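bch_next_delay() paces work by keeping a virtual deadline: every unit of completed work pushes d->next forward by 1/rate, and the caller sleeps for whatever part of that deadline is still in the future (converted to jiffies). A userspace sketch of the same shape; the struct name, the choice of nanosecond timestamps, and expressing the rate in units per second are all assumptions of the sketch, not the kernel's units:

	#include <stdint.h>
	#include <time.h>

	struct pacer {
		uint64_t next_ns;	/* virtual deadline */
		uint64_t rate;		/* units of work allowed per second */
	};

	static uint64_t now_ns(void)
	{
		struct timespec ts;

		clock_gettime(CLOCK_MONOTONIC, &ts);
		return (uint64_t) ts.tv_sec * 1000000000ull + ts.tv_nsec;
	}

	/* Account for `done` units of work; return how many nanoseconds the caller
	 * should sleep to stay at `rate` units/second on average. */
	static uint64_t pacer_delay(struct pacer *p, uint64_t done)
	{
		uint64_t now = now_ns();

		if (!p->next_ns)
			p->next_ns = now;

		p->next_ns += done * 1000000000ull / p->rate;

		return p->next_ns > now ? p->next_ns - now : 0;
	}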
203 203
204 void bch_bio_map(struct bio *bio, void *base) 204 void bch_bio_map(struct bio *bio, void *base)
205 { 205 {
206 size_t size = bio->bi_size; 206 size_t size = bio->bi_size;
207 struct bio_vec *bv = bio->bi_io_vec; 207 struct bio_vec *bv = bio->bi_io_vec;
208 208
209 BUG_ON(!bio->bi_size); 209 BUG_ON(!bio->bi_size);
210 BUG_ON(bio->bi_vcnt); 210 BUG_ON(bio->bi_vcnt);
211 211
212 bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0; 212 bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0;
213 goto start; 213 goto start;
214 214
215 for (; size; bio->bi_vcnt++, bv++) { 215 for (; size; bio->bi_vcnt++, bv++) {
216 bv->bv_offset = 0; 216 bv->bv_offset = 0;
217 start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, 217 start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
218 size); 218 size);
219 if (base) { 219 if (base) {
220 bv->bv_page = is_vmalloc_addr(base) 220 bv->bv_page = is_vmalloc_addr(base)
221 ? vmalloc_to_page(base) 221 ? vmalloc_to_page(base)
222 : virt_to_page(base); 222 : virt_to_page(base);
223 223
224 base += bv->bv_len; 224 base += bv->bv_len;
225 } 225 }
226 226
227 size -= bv->bv_len; 227 size -= bv->bv_len;
228 } 228 }
229 } 229 }
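bch_bio_map() chops a contiguous kernel buffer into page-sized bio_vec segments: only the first segment may start at a nonzero in-page offset, every segment is at most a page long, and vmalloc'd addresses get translated page by page. The same slicing expressed in userspace terms, producing struct iovec entries instead of bio_vecs (PAGE_SZ, the function name, and the return convention are choices of the sketch):

	#include <stddef.h>
	#include <stdint.h>
	#include <sys/uio.h>

	#define PAGE_SZ 4096u

	/* Slice `size` bytes starting at `base` into page-bounded chunks, writing
	 * at most max_iov entries into iov[].  Returns the number of entries used,
	 * or -1 if more than max_iov entries would be needed. */
	static int map_buffer(struct iovec *iov, int max_iov, void *base, size_t size)
	{
		int n = 0;
		size_t offset = (uintptr_t) base % PAGE_SZ;	/* first chunk only */

		while (size) {
			size_t len = PAGE_SZ - offset;

			if (len > size)
				len = size;
			if (n == max_iov)
				return -1;

			iov[n].iov_base = base;
			iov[n].iov_len = len;
			n++;

			base = (char *) base + len;
			size -= len;
			offset = 0;		/* later chunks start page-aligned */
		}

		return n;
	}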
230 230
231 int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp)
232 {
233 int i;
234 struct bio_vec *bv;
235
236 bio_for_each_segment(bv, bio, i) {
237 bv->bv_page = alloc_page(gfp);
238 if (!bv->bv_page) {
239 while (bv-- != bio->bi_io_vec + bio->bi_idx)
240 __free_page(bv->bv_page);
241 return -ENOMEM;
242 }
243 }
244
245 return 0;
246 }
247
248 /* 231 /*
249 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any 232 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
250 * use permitted, subject to terms of PostgreSQL license; see.) 233 * use permitted, subject to terms of PostgreSQL license; see.)
251 234
252 * If we have a 64-bit integer type, then a 64-bit CRC looks just like the 235 * If we have a 64-bit integer type, then a 64-bit CRC looks just like the
253 * usual sort of implementation. (See Ross Williams' excellent introduction 236 * usual sort of implementation. (See Ross Williams' excellent introduction
254 * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from 237 * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from
255 * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.) 238 * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.)
256 * If we have no working 64-bit type, then fake it with two 32-bit registers. 239 * If we have no working 64-bit type, then fake it with two 32-bit registers.
257 * 240 *
258 * The present implementation is a normal (not "reflected", in Williams' 241 * The present implementation is a normal (not "reflected", in Williams'
259 * terms) 64-bit CRC, using initial all-ones register contents and a final 242 * terms) 64-bit CRC, using initial all-ones register contents and a final
260 * bit inversion. The chosen polynomial is borrowed from the DLT1 spec 243 * bit inversion. The chosen polynomial is borrowed from the DLT1 spec
261 * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM): 244 * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM):
262 * 245 *
263 * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + 246 * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
264 * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + 247 * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
265 * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + 248 * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
266 * x^7 + x^4 + x + 1 249 * x^7 + x^4 + x + 1
267 */ 250 */
268 251
269 static const uint64_t crc_table[256] = { 252 static const uint64_t crc_table[256] = {
270 0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL, 253 0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL,
271 0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL, 254 0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL,
272 0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL, 255 0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL,
273 0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL, 256 0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL,
274 0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL, 257 0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL,
275 0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL, 258 0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL,
276 0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL, 259 0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL,
277 0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL, 260 0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL,
278 0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL, 261 0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL,
279 0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL, 262 0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL,
280 0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL, 263 0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL,
281 0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL, 264 0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL,
282 0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL, 265 0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL,
283 0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL, 266 0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL,
284 0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL, 267 0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL,
285 0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL, 268 0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL,
286 0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL, 269 0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL,
287 0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL, 270 0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL,
288 0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL, 271 0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL,
289 0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL, 272 0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL,
290 0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL, 273 0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL,
291 0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL, 274 0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL,
292 0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL, 275 0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL,
293 0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL, 276 0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL,
294 0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL, 277 0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL,
295 0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL, 278 0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL,
296 0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL, 279 0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL,
297 0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL, 280 0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL,
298 0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL, 281 0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL,
299 0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL, 282 0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL,
300 0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL, 283 0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL,
301 0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL, 284 0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL,
302 0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL, 285 0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL,
303 0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL, 286 0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL,
304 0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL, 287 0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL,
305 0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL, 288 0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL,
306 0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL, 289 0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL,
307 0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL, 290 0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL,
308 0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL, 291 0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL,
309 0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL, 292 0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL,
310 0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL, 293 0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL,
311 0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL, 294 0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL,
312 0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL, 295 0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL,
313 0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL, 296 0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL,
314 0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL, 297 0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL,
315 0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL, 298 0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL,
316 0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL, 299 0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL,
317 0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL, 300 0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL,
318 0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL, 301 0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL,
319 0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL, 302 0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL,
320 0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL, 303 0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL,
321 0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL, 304 0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL,
322 0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL, 305 0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL,
323 0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL, 306 0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL,
324 0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL, 307 0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL,
325 0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL, 308 0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL,
326 0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL, 309 0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL,
327 0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL, 310 0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL,
328 0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL, 311 0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL,
329 0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL, 312 0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL,
330 0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL, 313 0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL,
331 0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL, 314 0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL,
332 0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL, 315 0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL,
333 0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL, 316 0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL,
334 0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL, 317 0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL,
335 0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL, 318 0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL,
336 0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL, 319 0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL,
337 0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL, 320 0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL,
338 0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL, 321 0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL,
339 0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL, 322 0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL,
340 0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL, 323 0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL,
341 0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL, 324 0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL,
342 0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL, 325 0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL,
343 0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL, 326 0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL,
344 0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL, 327 0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL,
345 0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL, 328 0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL,
346 0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL, 329 0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL,
347 0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL, 330 0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL,
348 0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL, 331 0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL,
349 0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL, 332 0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL,
350 0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL, 333 0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL,
351 0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL, 334 0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL,
352 0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL, 335 0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL,
353 0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL, 336 0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL,
354 0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL, 337 0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL,
355 0x9AFCE626CE85B507ULL, 338 0x9AFCE626CE85B507ULL,
356 }; 339 };
357 340
358 uint64_t bch_crc64_update(uint64_t crc, const void *_data, size_t len) 341 uint64_t bch_crc64_update(uint64_t crc, const void *_data, size_t len)
359 { 342 {
360 const unsigned char *data = _data; 343 const unsigned char *data = _data;
361 344
362 while (len--) { 345 while (len--) {
363 int i = ((int) (crc >> 56) ^ *data++) & 0xFF; 346 int i = ((int) (crc >> 56) ^ *data++) & 0xFF;
364 crc = crc_table[i] ^ (crc << 8); 347 crc = crc_table[i] ^ (crc << 8);
365 } 348 }
366 349
367 return crc; 350 return crc;
368 } 351 }
369 352
370 uint64_t bch_crc64(const void *data, size_t len) 353 uint64_t bch_crc64(const void *data, size_t len)
371 { 354 {
372 uint64_t crc = 0xffffffffffffffffULL; 355 uint64_t crc = 0xffffffffffffffffULL;
373 356
374 crc = bch_crc64_update(crc, data, len); 357 crc = bch_crc64_update(crc, data, len);
375 358
376 return crc ^ 0xffffffffffffffffULL; 359 return crc ^ 0xffffffffffffffffULL;
377 } 360 }
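The 256-entry crc_table[] above is the ECMA-182 polynomial expanded one input byte at a time; bch_crc64() then feeds bytes in MSB-first with an all-ones preset and a final inversion, exactly as the comment describes. A small generator for the same table as a sanity check - entry 1 should come out as the polynomial itself and entry 2 as the polynomial shifted once, matching the first constants listed above:

	#include <inttypes.h>
	#include <stdio.h>

	#define CRC64_POLY 0x42F0E1EBA9EA3693ull	/* ECMA-182, normal (not reflected) */

	int main(void)
	{
		uint64_t table[256];
		int i, bit;

		for (i = 0; i < 256; i++) {
			/* run one input byte through 8 steps of the shift register */
			uint64_t crc = (uint64_t) i << 56;

			for (bit = 0; bit < 8; bit++)
				crc = (crc & (1ull << 63))
					? (crc << 1) ^ CRC64_POLY
					: crc << 1;

			table[i] = crc;
		}

		/* expect 0x42F0E1EBA9EA3693 and 0x85E1C3D753D46D26 */
		printf("table[1] = 0x%016" PRIX64 "\n", table[1]);
		printf("table[2] = 0x%016" PRIX64 "\n", table[2]);
		return 0;
	}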
378 361
drivers/md/bcache/util.h
1 1
2 #ifndef _BCACHE_UTIL_H 2 #ifndef _BCACHE_UTIL_H
3 #define _BCACHE_UTIL_H 3 #define _BCACHE_UTIL_H
4 4
5 #include <linux/errno.h> 5 #include <linux/errno.h>
6 #include <linux/kernel.h> 6 #include <linux/kernel.h>
7 #include <linux/llist.h> 7 #include <linux/llist.h>
8 #include <linux/ratelimit.h> 8 #include <linux/ratelimit.h>
9 #include <linux/vmalloc.h> 9 #include <linux/vmalloc.h>
10 #include <linux/workqueue.h> 10 #include <linux/workqueue.h>
11 11
12 #include "closure.h" 12 #include "closure.h"
13 13
14 #define PAGE_SECTORS (PAGE_SIZE / 512) 14 #define PAGE_SECTORS (PAGE_SIZE / 512)
15 15
16 struct closure; 16 struct closure;
17 17
18 #ifdef CONFIG_BCACHE_EDEBUG 18 #ifdef CONFIG_BCACHE_EDEBUG
19 19
20 #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) 20 #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
21 #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) 21 #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i)
22 22
23 #else /* EDEBUG */ 23 #else /* EDEBUG */
24 24
25 #define atomic_dec_bug(v) atomic_dec(v) 25 #define atomic_dec_bug(v) atomic_dec(v)
26 #define atomic_inc_bug(v, i) atomic_inc(v) 26 #define atomic_inc_bug(v, i) atomic_inc(v)
27 27
28 #endif 28 #endif
29 29
30 #define BITMASK(name, type, field, offset, size) \ 30 #define BITMASK(name, type, field, offset, size) \
31 static inline uint64_t name(const type *k) \ 31 static inline uint64_t name(const type *k) \
32 { return (k->field >> offset) & ~(((uint64_t) ~0) << size); } \ 32 { return (k->field >> offset) & ~(((uint64_t) ~0) << size); } \
33 \ 33 \
34 static inline void SET_##name(type *k, uint64_t v) \ 34 static inline void SET_##name(type *k, uint64_t v) \
35 { \ 35 { \
36 k->field &= ~(~((uint64_t) ~0 << size) << offset); \ 36 k->field &= ~(~((uint64_t) ~0 << size) << offset); \
37 k->field |= v << offset; \ 37 k->field |= v << offset; \
38 } 38 }
39 39
40 #define DECLARE_HEAP(type, name) \ 40 #define DECLARE_HEAP(type, name) \
41 struct { \ 41 struct { \
42 size_t size, used; \ 42 size_t size, used; \
43 type *data; \ 43 type *data; \
44 } name 44 } name
45 45
46 #define init_heap(heap, _size, gfp) \ 46 #define init_heap(heap, _size, gfp) \
47 ({ \ 47 ({ \
48 size_t _bytes; \ 48 size_t _bytes; \
49 (heap)->used = 0; \ 49 (heap)->used = 0; \
50 (heap)->size = (_size); \ 50 (heap)->size = (_size); \
51 _bytes = (heap)->size * sizeof(*(heap)->data); \ 51 _bytes = (heap)->size * sizeof(*(heap)->data); \
52 (heap)->data = NULL; \ 52 (heap)->data = NULL; \
53 if (_bytes < KMALLOC_MAX_SIZE) \ 53 if (_bytes < KMALLOC_MAX_SIZE) \
54 (heap)->data = kmalloc(_bytes, (gfp)); \ 54 (heap)->data = kmalloc(_bytes, (gfp)); \
55 if ((!(heap)->data) && ((gfp) & GFP_KERNEL)) \ 55 if ((!(heap)->data) && ((gfp) & GFP_KERNEL)) \
56 (heap)->data = vmalloc(_bytes); \ 56 (heap)->data = vmalloc(_bytes); \
57 (heap)->data; \ 57 (heap)->data; \
58 }) 58 })
59 59
60 #define free_heap(heap) \ 60 #define free_heap(heap) \
61 do { \ 61 do { \
62 if (is_vmalloc_addr((heap)->data)) \ 62 if (is_vmalloc_addr((heap)->data)) \
63 vfree((heap)->data); \ 63 vfree((heap)->data); \
64 else \ 64 else \
65 kfree((heap)->data); \ 65 kfree((heap)->data); \
66 (heap)->data = NULL; \ 66 (heap)->data = NULL; \
67 } while (0) 67 } while (0)
68 68
69 #define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j]) 69 #define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j])
70 70
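/*
 * Naming note: heap_sift() moves data[i] towards the leaves (a classic
 * sift-down), while heap_sift_down() moves it towards the root (a sift-up);
 * heap_add() calls both after appending the new element.
 */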
71 #define heap_sift(h, i, cmp) \ 71 #define heap_sift(h, i, cmp) \
72 do { \ 72 do { \
73 size_t _r, _j = i; \ 73 size_t _r, _j = i; \
74 \ 74 \
75 for (; _j * 2 + 1 < (h)->used; _j = _r) { \ 75 for (; _j * 2 + 1 < (h)->used; _j = _r) { \
76 _r = _j * 2 + 1; \ 76 _r = _j * 2 + 1; \
77 if (_r + 1 < (h)->used && \ 77 if (_r + 1 < (h)->used && \
78 cmp((h)->data[_r], (h)->data[_r + 1])) \ 78 cmp((h)->data[_r], (h)->data[_r + 1])) \
79 _r++; \ 79 _r++; \
80 \ 80 \
81 if (cmp((h)->data[_r], (h)->data[_j])) \ 81 if (cmp((h)->data[_r], (h)->data[_j])) \
82 break; \ 82 break; \
83 heap_swap(h, _r, _j); \ 83 heap_swap(h, _r, _j); \
84 } \ 84 } \
85 } while (0) 85 } while (0)
86 86
87 #define heap_sift_down(h, i, cmp) \ 87 #define heap_sift_down(h, i, cmp) \
88 do { \ 88 do { \
89 while (i) { \ 89 while (i) { \
90 size_t p = (i - 1) / 2; \ 90 size_t p = (i - 1) / 2; \
91 if (cmp((h)->data[i], (h)->data[p])) \ 91 if (cmp((h)->data[i], (h)->data[p])) \
92 break; \ 92 break; \
93 heap_swap(h, i, p); \ 93 heap_swap(h, i, p); \
94 i = p; \ 94 i = p; \
95 } \ 95 } \
96 } while (0) 96 } while (0)
97 97
98 #define heap_add(h, d, cmp) \ 98 #define heap_add(h, d, cmp) \
99 ({ \ 99 ({ \
100 bool _r = !heap_full(h); \ 100 bool _r = !heap_full(h); \
101 if (_r) { \ 101 if (_r) { \
102 size_t _i = (h)->used++; \ 102 size_t _i = (h)->used++; \
103 (h)->data[_i] = d; \ 103 (h)->data[_i] = d; \
104 \ 104 \
105 heap_sift_down(h, _i, cmp); \ 105 heap_sift_down(h, _i, cmp); \
106 heap_sift(h, _i, cmp); \ 106 heap_sift(h, _i, cmp); \
107 } \ 107 } \
108 _r; \ 108 _r; \
109 }) 109 })
110 110
111 #define heap_pop(h, d, cmp) \ 111 #define heap_pop(h, d, cmp) \
112 ({ \ 112 ({ \
113 bool _r = (h)->used; \ 113 bool _r = (h)->used; \
114 if (_r) { \ 114 if (_r) { \
115 (d) = (h)->data[0]; \ 115 (d) = (h)->data[0]; \
116 (h)->used--; \ 116 (h)->used--; \
117 heap_swap(h, 0, (h)->used); \ 117 heap_swap(h, 0, (h)->used); \
118 heap_sift(h, 0, cmp); \ 118 heap_sift(h, 0, cmp); \
119 } \ 119 } \
120 _r; \ 120 _r; \
121 }) 121 })
122 122
123 #define heap_peek(h) ((h)->size ? (h)->data[0] : NULL) 123 #define heap_peek(h) ((h)->size ? (h)->data[0] : NULL)
124 124
125 #define heap_full(h) ((h)->used == (h)->size) 125 #define heap_full(h) ((h)->used == (h)->size)
126 126
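/*
 * Minimal usage sketch for the heap macros above (hypothetical caller, a
 * sketch rather than a canonical pattern): reading heap_sift() and
 * heap_add(), cmp(l, r) == true is taken to mean "l sorts below r", so a
 * "<" comparison keeps the largest element at data[0].  Assumes the
 * including file also provides <linux/slab.h>, as existing init_heap()
 * users do.
 */
static inline bool example_int_cmp(int l, int r)
{
	return l < r;
}

static inline int example_heap_usage(void)
{
	DECLARE_HEAP(int, prios);
	int top = 0;

	if (!init_heap(&prios, 16, GFP_KERNEL))
		return -ENOMEM;

	heap_add(&prios, 7, example_int_cmp);
	heap_add(&prios, 42, example_int_cmp);
	heap_add(&prios, 3, example_int_cmp);

	heap_pop(&prios, top, example_int_cmp);	/* top == 42 */

	free_heap(&prios);
	return top;
}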
127 #define DECLARE_FIFO(type, name) \ 127 #define DECLARE_FIFO(type, name) \
128 struct { \ 128 struct { \
129 size_t front, back, size, mask; \ 129 size_t front, back, size, mask; \
130 type *data; \ 130 type *data; \
131 } name 131 } name
132 132
133 #define fifo_for_each(c, fifo, iter) \ 133 #define fifo_for_each(c, fifo, iter) \
134 for (iter = (fifo)->front; \ 134 for (iter = (fifo)->front; \
135 c = (fifo)->data[iter], iter != (fifo)->back; \ 135 c = (fifo)->data[iter], iter != (fifo)->back; \
136 iter = (iter + 1) & (fifo)->mask) 136 iter = (iter + 1) & (fifo)->mask)
137 137
138 #define __init_fifo(fifo, gfp) \ 138 #define __init_fifo(fifo, gfp) \
139 ({ \ 139 ({ \
140 size_t _allocated_size, _bytes; \ 140 size_t _allocated_size, _bytes; \
141 BUG_ON(!(fifo)->size); \ 141 BUG_ON(!(fifo)->size); \
142 \ 142 \
143 _allocated_size = roundup_pow_of_two((fifo)->size + 1); \ 143 _allocated_size = roundup_pow_of_two((fifo)->size + 1); \
144 _bytes = _allocated_size * sizeof(*(fifo)->data); \ 144 _bytes = _allocated_size * sizeof(*(fifo)->data); \
145 \ 145 \
146 (fifo)->mask = _allocated_size - 1; \ 146 (fifo)->mask = _allocated_size - 1; \
147 (fifo)->front = (fifo)->back = 0; \ 147 (fifo)->front = (fifo)->back = 0; \
148 (fifo)->data = NULL; \ 148 (fifo)->data = NULL; \
149 \ 149 \
150 if (_bytes < KMALLOC_MAX_SIZE) \ 150 if (_bytes < KMALLOC_MAX_SIZE) \
151 (fifo)->data = kmalloc(_bytes, (gfp)); \ 151 (fifo)->data = kmalloc(_bytes, (gfp)); \
152 if ((!(fifo)->data) && ((gfp) & GFP_KERNEL)) \ 152 if ((!(fifo)->data) && ((gfp) & GFP_KERNEL)) \
153 (fifo)->data = vmalloc(_bytes); \ 153 (fifo)->data = vmalloc(_bytes); \
154 (fifo)->data; \ 154 (fifo)->data; \
155 }) 155 })
156 156
157 #define init_fifo_exact(fifo, _size, gfp) \ 157 #define init_fifo_exact(fifo, _size, gfp) \
158 ({ \ 158 ({ \
159 (fifo)->size = (_size); \ 159 (fifo)->size = (_size); \
160 __init_fifo(fifo, gfp); \ 160 __init_fifo(fifo, gfp); \
161 }) 161 })
162 162
163 #define init_fifo(fifo, _size, gfp) \ 163 #define init_fifo(fifo, _size, gfp) \
164 ({ \ 164 ({ \
165 (fifo)->size = (_size); \ 165 (fifo)->size = (_size); \
166 if ((fifo)->size > 4) \ 166 if ((fifo)->size > 4) \
167 (fifo)->size = roundup_pow_of_two((fifo)->size) - 1; \ 167 (fifo)->size = roundup_pow_of_two((fifo)->size) - 1; \
168 __init_fifo(fifo, gfp); \ 168 __init_fifo(fifo, gfp); \
169 }) 169 })
170 170
171 #define free_fifo(fifo) \ 171 #define free_fifo(fifo) \
172 do { \ 172 do { \
173 if (is_vmalloc_addr((fifo)->data)) \ 173 if (is_vmalloc_addr((fifo)->data)) \
174 vfree((fifo)->data); \ 174 vfree((fifo)->data); \
175 else \ 175 else \
176 kfree((fifo)->data); \ 176 kfree((fifo)->data); \
177 (fifo)->data = NULL; \ 177 (fifo)->data = NULL; \
178 } while (0) 178 } while (0)
179 179
180 #define fifo_used(fifo) (((fifo)->back - (fifo)->front) & (fifo)->mask) 180 #define fifo_used(fifo) (((fifo)->back - (fifo)->front) & (fifo)->mask)
181 #define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) 181 #define fifo_free(fifo) ((fifo)->size - fifo_used(fifo))
182 182
183 #define fifo_empty(fifo) (!fifo_used(fifo)) 183 #define fifo_empty(fifo) (!fifo_used(fifo))
184 #define fifo_full(fifo) (!fifo_free(fifo)) 184 #define fifo_full(fifo) (!fifo_free(fifo))
185 185
186 #define fifo_front(fifo) ((fifo)->data[(fifo)->front]) 186 #define fifo_front(fifo) ((fifo)->data[(fifo)->front])
187 #define fifo_back(fifo) \ 187 #define fifo_back(fifo) \
188 ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) 188 ((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
189 189
190 #define fifo_idx(fifo, p) (((p) - &fifo_front(fifo)) & (fifo)->mask) 190 #define fifo_idx(fifo, p) (((p) - &fifo_front(fifo)) & (fifo)->mask)
191 191
192 #define fifo_push_back(fifo, i) \ 192 #define fifo_push_back(fifo, i) \
193 ({ \ 193 ({ \
194 bool _r = !fifo_full((fifo)); \ 194 bool _r = !fifo_full((fifo)); \
195 if (_r) { \ 195 if (_r) { \
196 (fifo)->data[(fifo)->back++] = (i); \ 196 (fifo)->data[(fifo)->back++] = (i); \
197 (fifo)->back &= (fifo)->mask; \ 197 (fifo)->back &= (fifo)->mask; \
198 } \ 198 } \
199 _r; \ 199 _r; \
200 }) 200 })
201 201
202 #define fifo_pop_front(fifo, i) \ 202 #define fifo_pop_front(fifo, i) \
203 ({ \ 203 ({ \
204 bool _r = !fifo_empty((fifo)); \ 204 bool _r = !fifo_empty((fifo)); \
205 if (_r) { \ 205 if (_r) { \
206 (i) = (fifo)->data[(fifo)->front++]; \ 206 (i) = (fifo)->data[(fifo)->front++]; \
207 (fifo)->front &= (fifo)->mask; \ 207 (fifo)->front &= (fifo)->mask; \
208 } \ 208 } \
209 _r; \ 209 _r; \
210 }) 210 })
211 211
212 #define fifo_push_front(fifo, i) \ 212 #define fifo_push_front(fifo, i) \
213 ({ \ 213 ({ \
214 bool _r = !fifo_full((fifo)); \ 214 bool _r = !fifo_full((fifo)); \
215 if (_r) { \ 215 if (_r) { \
216 --(fifo)->front; \ 216 --(fifo)->front; \
217 (fifo)->front &= (fifo)->mask; \ 217 (fifo)->front &= (fifo)->mask; \
218 (fifo)->data[(fifo)->front] = (i); \ 218 (fifo)->data[(fifo)->front] = (i); \
219 } \ 219 } \
220 _r; \ 220 _r; \
221 }) 221 })
222 222
223 #define fifo_pop_back(fifo, i) \ 223 #define fifo_pop_back(fifo, i) \
224 ({ \ 224 ({ \
225 bool _r = !fifo_empty((fifo)); \ 225 bool _r = !fifo_empty((fifo)); \
226 if (_r) { \ 226 if (_r) { \
227 --(fifo)->back; \ 227 --(fifo)->back; \
228 (fifo)->back &= (fifo)->mask; \ 228 (fifo)->back &= (fifo)->mask; \
229 (i) = (fifo)->data[(fifo)->back]; \ 229 (i) = (fifo)->data[(fifo)->back]; \
230 } \ 230 } \
231 _r; \ 231 _r; \
232 }) 232 })
233 233
234 #define fifo_push(fifo, i) fifo_push_back(fifo, (i)) 234 #define fifo_push(fifo, i) fifo_push_back(fifo, (i))
235 #define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) 235 #define fifo_pop(fifo, i) fifo_pop_front(fifo, (i))
236 236
237 #define fifo_swap(l, r) \ 237 #define fifo_swap(l, r) \
238 do { \ 238 do { \
239 swap((l)->front, (r)->front); \ 239 swap((l)->front, (r)->front); \
240 swap((l)->back, (r)->back); \ 240 swap((l)->back, (r)->back); \
241 swap((l)->size, (r)->size); \ 241 swap((l)->size, (r)->size); \
242 swap((l)->mask, (r)->mask); \ 242 swap((l)->mask, (r)->mask); \
243 swap((l)->data, (r)->data); \ 243 swap((l)->data, (r)->data); \
244 } while (0) 244 } while (0)
245 245
246 #define fifo_move(dest, src) \ 246 #define fifo_move(dest, src) \
247 do { \ 247 do { \
248 typeof(*((dest)->data)) _t; \ 248 typeof(*((dest)->data)) _t; \
249 while (!fifo_full(dest) && \ 249 while (!fifo_full(dest) && \
250 fifo_pop(src, _t)) \ 250 fifo_pop(src, _t)) \
251 fifo_push(dest, _t); \ 251 fifo_push(dest, _t); \
252 } while (0) 252 } while (0)
253 253
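/*
 * Minimal usage sketch for the FIFO macros above (hypothetical caller):
 * fifo_push()/fifo_pop() default to pushing at the back and popping from
 * the front, giving FIFO order.  As with init_heap(), this assumes the
 * including file provides <linux/slab.h>.
 */
static inline unsigned example_fifo_usage(void)
{
	DECLARE_FIFO(unsigned, q);
	unsigned v = 0;

	if (!init_fifo(&q, 8, GFP_KERNEL))
		return 0;

	fifo_push(&q, 1);
	fifo_push(&q, 2);

	fifo_pop(&q, v);	/* v == 1 */

	free_fifo(&q);
	return v;
}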
254 /* 254 /*
255 * Simple array based allocator - preallocates a fixed number of elements; 255 * Simple array based allocator - preallocates a fixed number of elements;
256 * you can never allocate more than that, and there is no locking. 256 * you can never allocate more than that, and there is no locking.
257 * 257 *
258 * Handy because if you know you only need a fixed number of elements you don't 258 * Handy because if you know you only need a fixed number of elements you don't
259 * have to worry about memory allocation failure, and sometimes a mempool isn't 259 * have to worry about memory allocation failure, and sometimes a mempool isn't
260 * what you want. 260 * what you want.
261 * 261 *
262 * We treat the free elements as entries in a singly linked list, and the 262 * We treat the free elements as entries in a singly linked list, and the
263 * freelist as a stack - allocating and freeing push and pop off the freelist. 263 * freelist as a stack - allocating and freeing push and pop off the freelist.
264 */ 264 */
265 265
266 #define DECLARE_ARRAY_ALLOCATOR(type, name, size) \ 266 #define DECLARE_ARRAY_ALLOCATOR(type, name, size) \
267 struct { \ 267 struct { \
268 type *freelist; \ 268 type *freelist; \
269 type data[size]; \ 269 type data[size]; \
270 } name 270 } name
271 271
272 #define array_alloc(array) \ 272 #define array_alloc(array) \
273 ({ \ 273 ({ \
274 typeof((array)->freelist) _ret = (array)->freelist; \ 274 typeof((array)->freelist) _ret = (array)->freelist; \
275 \ 275 \
276 if (_ret) \ 276 if (_ret) \
277 (array)->freelist = *((typeof((array)->freelist) *) _ret);\ 277 (array)->freelist = *((typeof((array)->freelist) *) _ret);\
278 \ 278 \
279 _ret; \ 279 _ret; \
280 }) 280 })
281 281
282 #define array_free(array, ptr) \ 282 #define array_free(array, ptr) \
283 do { \ 283 do { \
284 typeof((array)->freelist) _ptr = ptr; \ 284 typeof((array)->freelist) _ptr = ptr; \
285 \ 285 \
286 *((typeof((array)->freelist) *) _ptr) = (array)->freelist; \ 286 *((typeof((array)->freelist) *) _ptr) = (array)->freelist; \
287 (array)->freelist = _ptr; \ 287 (array)->freelist = _ptr; \
288 } while (0) 288 } while (0)
289 289
290 #define array_allocator_init(array) \ 290 #define array_allocator_init(array) \
291 do { \ 291 do { \
292 typeof((array)->freelist) _i; \ 292 typeof((array)->freelist) _i; \
293 \ 293 \
294 BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *)); \ 294 BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *)); \
295 (array)->freelist = NULL; \ 295 (array)->freelist = NULL; \
296 \ 296 \
297 for (_i = (array)->data; \ 297 for (_i = (array)->data; \
298 _i < (array)->data + ARRAY_SIZE((array)->data); \ 298 _i < (array)->data + ARRAY_SIZE((array)->data); \
299 _i++) \ 299 _i++) \
300 array_free(array, _i); \ 300 array_free(array, _i); \
301 } while (0) 301 } while (0)
302 302
303 #define array_freelist_empty(array) ((array)->freelist == NULL) 303 #define array_freelist_empty(array) ((array)->freelist == NULL)
304 304
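/*
 * Minimal usage sketch for the array allocator described above
 * (hypothetical caller): all storage is embedded in the declaring object,
 * so there is nothing to allocate at init time and array_alloc() can only
 * fail once the fixed pool is exhausted.
 */
static inline void example_array_alloc_usage(void)
{
	DECLARE_ARRAY_ALLOCATOR(uint64_t, pool, 4);
	uint64_t *e;

	array_allocator_init(&pool);

	e = array_alloc(&pool);		/* NULL only when all 4 are in use */
	if (e) {
		*e = 42;
		array_free(&pool, e);
	}
}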
305 #define ANYSINT_MAX(t) \ 305 #define ANYSINT_MAX(t) \
306 ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) 306 ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
307 307
308 int bch_strtoint_h(const char *, int *); 308 int bch_strtoint_h(const char *, int *);
309 int bch_strtouint_h(const char *, unsigned int *); 309 int bch_strtouint_h(const char *, unsigned int *);
310 int bch_strtoll_h(const char *, long long *); 310 int bch_strtoll_h(const char *, long long *);
311 int bch_strtoull_h(const char *, unsigned long long *); 311 int bch_strtoull_h(const char *, unsigned long long *);
312 312
313 static inline int bch_strtol_h(const char *cp, long *res) 313 static inline int bch_strtol_h(const char *cp, long *res)
314 { 314 {
315 #if BITS_PER_LONG == 32 315 #if BITS_PER_LONG == 32
316 return bch_strtoint_h(cp, (int *) res); 316 return bch_strtoint_h(cp, (int *) res);
317 #else 317 #else
318 return bch_strtoll_h(cp, (long long *) res); 318 return bch_strtoll_h(cp, (long long *) res);
319 #endif 319 #endif
320 } 320 }
321 321
322 static inline int bch_strtoul_h(const char *cp, long *res) 322 static inline int bch_strtoul_h(const char *cp, long *res)
323 { 323 {
324 #if BITS_PER_LONG == 32 324 #if BITS_PER_LONG == 32
325 return bch_strtouint_h(cp, (unsigned int *) res); 325 return bch_strtouint_h(cp, (unsigned int *) res);
326 #else 326 #else
327 return bch_strtoull_h(cp, (unsigned long long *) res); 327 return bch_strtoull_h(cp, (unsigned long long *) res);
328 #endif 328 #endif
329 } 329 }
330 330
331 #define strtoi_h(cp, res) \ 331 #define strtoi_h(cp, res) \
332 (__builtin_types_compatible_p(typeof(*res), int) \ 332 (__builtin_types_compatible_p(typeof(*res), int) \
333 ? bch_strtoint_h(cp, (void *) res) \ 333 ? bch_strtoint_h(cp, (void *) res) \
334 : __builtin_types_compatible_p(typeof(*res), long) \ 334 : __builtin_types_compatible_p(typeof(*res), long) \
335 ? bch_strtol_h(cp, (void *) res) \ 335 ? bch_strtol_h(cp, (void *) res) \
336 : __builtin_types_compatible_p(typeof(*res), long long) \ 336 : __builtin_types_compatible_p(typeof(*res), long long) \
337 ? bch_strtoll_h(cp, (void *) res) \ 337 ? bch_strtoll_h(cp, (void *) res) \
338 : __builtin_types_compatible_p(typeof(*res), unsigned int) \ 338 : __builtin_types_compatible_p(typeof(*res), unsigned int) \
339 ? bch_strtouint_h(cp, (void *) res) \ 339 ? bch_strtouint_h(cp, (void *) res) \
340 : __builtin_types_compatible_p(typeof(*res), unsigned long) \ 340 : __builtin_types_compatible_p(typeof(*res), unsigned long) \
341 ? bch_strtoul_h(cp, (void *) res) \ 341 ? bch_strtoul_h(cp, (void *) res) \
342 : __builtin_types_compatible_p(typeof(*res), unsigned long long)\ 342 : __builtin_types_compatible_p(typeof(*res), unsigned long long)\
343 ? bch_strtoull_h(cp, (void *) res) : -EINVAL) 343 ? bch_strtoull_h(cp, (void *) res) : -EINVAL)
344 344
345 #define strtoul_safe(cp, var) \ 345 #define strtoul_safe(cp, var) \
346 ({ \ 346 ({ \
347 unsigned long _v; \ 347 unsigned long _v; \
348 int _r = kstrtoul(cp, 10, &_v); \ 348 int _r = kstrtoul(cp, 10, &_v); \
349 if (!_r) \ 349 if (!_r) \
350 var = _v; \ 350 var = _v; \
351 _r; \ 351 _r; \
352 }) 352 })
353 353
354 #define strtoul_safe_clamp(cp, var, min, max) \ 354 #define strtoul_safe_clamp(cp, var, min, max) \
355 ({ \ 355 ({ \
356 unsigned long _v; \ 356 unsigned long _v; \
357 int _r = kstrtoul(cp, 10, &_v); \ 357 int _r = kstrtoul(cp, 10, &_v); \
358 if (!_r) \ 358 if (!_r) \
359 var = clamp_t(typeof(var), _v, min, max); \ 359 var = clamp_t(typeof(var), _v, min, max); \
360 _r; \ 360 _r; \
361 }) 361 })
362 362
363 #define snprint(buf, size, var) \ 363 #define snprint(buf, size, var) \
364 snprintf(buf, size, \ 364 snprintf(buf, size, \
365 __builtin_types_compatible_p(typeof(var), int) \ 365 __builtin_types_compatible_p(typeof(var), int) \
366 ? "%i\n" : \ 366 ? "%i\n" : \
367 __builtin_types_compatible_p(typeof(var), unsigned) \ 367 __builtin_types_compatible_p(typeof(var), unsigned) \
368 ? "%u\n" : \ 368 ? "%u\n" : \
369 __builtin_types_compatible_p(typeof(var), long) \ 369 __builtin_types_compatible_p(typeof(var), long) \
370 ? "%li\n" : \ 370 ? "%li\n" : \
371 __builtin_types_compatible_p(typeof(var), unsigned long)\ 371 __builtin_types_compatible_p(typeof(var), unsigned long)\
372 ? "%lu\n" : \ 372 ? "%lu\n" : \
373 __builtin_types_compatible_p(typeof(var), int64_t) \ 373 __builtin_types_compatible_p(typeof(var), int64_t) \
374 ? "%lli\n" : \ 374 ? "%lli\n" : \
375 __builtin_types_compatible_p(typeof(var), uint64_t) \ 375 __builtin_types_compatible_p(typeof(var), uint64_t) \
376 ? "%llu\n" : \ 376 ? "%llu\n" : \
377 __builtin_types_compatible_p(typeof(var), const char *) \ 377 __builtin_types_compatible_p(typeof(var), const char *) \
378 ? "%s\n" : "%i\n", var) 378 ? "%s\n" : "%i\n", var)
379 379
380 ssize_t bch_hprint(char *buf, int64_t v); 380 ssize_t bch_hprint(char *buf, int64_t v);
381 381
382 bool bch_is_zero(const char *p, size_t n); 382 bool bch_is_zero(const char *p, size_t n);
383 int bch_parse_uuid(const char *s, char *uuid); 383 int bch_parse_uuid(const char *s, char *uuid);
384 384
385 ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], 385 ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
386 size_t selected); 386 size_t selected);
387 387
388 ssize_t bch_read_string_list(const char *buf, const char * const list[]); 388 ssize_t bch_read_string_list(const char *buf, const char * const list[]);
389 389
390 struct time_stats { 390 struct time_stats {
391 /* 391 /*
392 * all fields are in nanoseconds, averages are ewmas stored left shifted 392 * all fields are in nanoseconds, averages are ewmas stored left shifted
393 * by 8 393 * by 8
394 */ 394 */
395 uint64_t max_duration; 395 uint64_t max_duration;
396 uint64_t average_duration; 396 uint64_t average_duration;
397 uint64_t average_frequency; 397 uint64_t average_frequency;
398 uint64_t last; 398 uint64_t last;
399 }; 399 };
400 400
401 void bch_time_stats_update(struct time_stats *stats, uint64_t time); 401 void bch_time_stats_update(struct time_stats *stats, uint64_t time);
402 402
403 #define NSEC_PER_ns 1L 403 #define NSEC_PER_ns 1L
404 #define NSEC_PER_us NSEC_PER_USEC 404 #define NSEC_PER_us NSEC_PER_USEC
405 #define NSEC_PER_ms NSEC_PER_MSEC 405 #define NSEC_PER_ms NSEC_PER_MSEC
406 #define NSEC_PER_sec NSEC_PER_SEC 406 #define NSEC_PER_sec NSEC_PER_SEC
407 407
408 #define __print_time_stat(stats, name, stat, units) \ 408 #define __print_time_stat(stats, name, stat, units) \
409 sysfs_print(name ## _ ## stat ## _ ## units, \ 409 sysfs_print(name ## _ ## stat ## _ ## units, \
410 div_u64((stats)->stat >> 8, NSEC_PER_ ## units)) 410 div_u64((stats)->stat >> 8, NSEC_PER_ ## units))
411 411
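/*
 * Worked example for the scaling above: if the ewma of durations is 3 ms,
 * average_duration holds 3000000 << 8, and __print_time_stat() with
 * duration_units == ms computes div_u64((3000000 << 8) >> 8, NSEC_PER_ms)
 * == 3; the shift by 8 undoes the fixed-point ewma scaling and NSEC_PER_*
 * converts nanoseconds to the requested unit.
 */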
412 #define sysfs_print_time_stats(stats, name, \ 412 #define sysfs_print_time_stats(stats, name, \
413 frequency_units, \ 413 frequency_units, \
414 duration_units) \ 414 duration_units) \
415 do { \ 415 do { \
416 __print_time_stat(stats, name, \ 416 __print_time_stat(stats, name, \
417 average_frequency, frequency_units); \ 417 average_frequency, frequency_units); \
418 __print_time_stat(stats, name, \ 418 __print_time_stat(stats, name, \
419 average_duration, duration_units); \ 419 average_duration, duration_units); \
420 __print_time_stat(stats, name, \ 420 __print_time_stat(stats, name, \
421 max_duration, duration_units); \ 421 max_duration, duration_units); \
422 \ 422 \
423 sysfs_print(name ## _last_ ## frequency_units, (stats)->last \ 423 sysfs_print(name ## _last_ ## frequency_units, (stats)->last \
424 ? div_s64(local_clock() - (stats)->last, \ 424 ? div_s64(local_clock() - (stats)->last, \
425 NSEC_PER_ ## frequency_units) \ 425 NSEC_PER_ ## frequency_units) \
426 : -1LL); \ 426 : -1LL); \
427 } while (0) 427 } while (0)
428 428
429 #define sysfs_time_stats_attribute(name, \ 429 #define sysfs_time_stats_attribute(name, \
430 frequency_units, \ 430 frequency_units, \
431 duration_units) \ 431 duration_units) \
432 read_attribute(name ## _average_frequency_ ## frequency_units); \ 432 read_attribute(name ## _average_frequency_ ## frequency_units); \
433 read_attribute(name ## _average_duration_ ## duration_units); \ 433 read_attribute(name ## _average_duration_ ## duration_units); \
434 read_attribute(name ## _max_duration_ ## duration_units); \ 434 read_attribute(name ## _max_duration_ ## duration_units); \
435 read_attribute(name ## _last_ ## frequency_units) 435 read_attribute(name ## _last_ ## frequency_units)
436 436
437 #define sysfs_time_stats_attribute_list(name, \ 437 #define sysfs_time_stats_attribute_list(name, \
438 frequency_units, \ 438 frequency_units, \
439 duration_units) \ 439 duration_units) \
440 &sysfs_ ## name ## _average_frequency_ ## frequency_units, \ 440 &sysfs_ ## name ## _average_frequency_ ## frequency_units, \
441 &sysfs_ ## name ## _average_duration_ ## duration_units, \ 441 &sysfs_ ## name ## _average_duration_ ## duration_units, \
442 &sysfs_ ## name ## _max_duration_ ## duration_units, \ 442 &sysfs_ ## name ## _max_duration_ ## duration_units, \
443 &sysfs_ ## name ## _last_ ## frequency_units, 443 &sysfs_ ## name ## _last_ ## frequency_units,
444 444
445 #define ewma_add(ewma, val, weight, factor) \ 445 #define ewma_add(ewma, val, weight, factor) \
446 ({ \ 446 ({ \
447 (ewma) *= (weight) - 1; \ 447 (ewma) *= (weight) - 1; \
448 (ewma) += (val) << factor; \ 448 (ewma) += (val) << factor; \
449 (ewma) /= (weight); \ 449 (ewma) /= (weight); \
450 (ewma) >> factor; \ 450 (ewma) >> factor; \
451 }) 451 })
452 452
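/*
 * Worked example for ewma_add() above: the writeback code calls
 * ewma_add(dc->disk.sectors_dirty_derivative, derivative,
 * dc->writeback_rate_d_smooth, 0) with the default d_smooth of 8, which
 * reduces to
 *
 *	new = (old * 7 + val) / 8
 *
 * so roughly 1/8th of each new sample is mixed in.  With a nonzero factor
 * the stored ewma stays left shifted by that many bits, while the
 * expression itself evaluates to the unshifted value.
 */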
453 struct ratelimit { 453 struct ratelimit {
454 uint64_t next; 454 uint64_t next;
455 unsigned rate; 455 unsigned rate;
456 }; 456 };
457 457
458 static inline void ratelimit_reset(struct ratelimit *d) 458 static inline void ratelimit_reset(struct ratelimit *d)
459 { 459 {
460 d->next = local_clock(); 460 d->next = local_clock();
461 } 461 }
462 462
463 unsigned bch_next_delay(struct ratelimit *d, uint64_t done); 463 unsigned bch_next_delay(struct ratelimit *d, uint64_t done);
464 464
465 #define __DIV_SAFE(n, d, zero) \ 465 #define __DIV_SAFE(n, d, zero) \
466 ({ \ 466 ({ \
467 typeof(n) _n = (n); \ 467 typeof(n) _n = (n); \
468 typeof(d) _d = (d); \ 468 typeof(d) _d = (d); \
469 _d ? _n / _d : zero; \ 469 _d ? _n / _d : zero; \
470 }) 470 })
471 471
472 #define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0) 472 #define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0)
473 473
474 #define container_of_or_null(ptr, type, member) \ 474 #define container_of_or_null(ptr, type, member) \
475 ({ \ 475 ({ \
476 typeof(ptr) _ptr = ptr; \ 476 typeof(ptr) _ptr = ptr; \
477 _ptr ? container_of(_ptr, type, member) : NULL; \ 477 _ptr ? container_of(_ptr, type, member) : NULL; \
478 }) 478 })
479 479
480 #define RB_INSERT(root, new, member, cmp) \ 480 #define RB_INSERT(root, new, member, cmp) \
481 ({ \ 481 ({ \
482 __label__ dup; \ 482 __label__ dup; \
483 struct rb_node **n = &(root)->rb_node, *parent = NULL; \ 483 struct rb_node **n = &(root)->rb_node, *parent = NULL; \
484 typeof(new) this; \ 484 typeof(new) this; \
485 int res, ret = -1; \ 485 int res, ret = -1; \
486 \ 486 \
487 while (*n) { \ 487 while (*n) { \
488 parent = *n; \ 488 parent = *n; \
489 this = container_of(*n, typeof(*(new)), member); \ 489 this = container_of(*n, typeof(*(new)), member); \
490 res = cmp(new, this); \ 490 res = cmp(new, this); \
491 if (!res) \ 491 if (!res) \
492 goto dup; \ 492 goto dup; \
493 n = res < 0 \ 493 n = res < 0 \
494 ? &(*n)->rb_left \ 494 ? &(*n)->rb_left \
495 : &(*n)->rb_right; \ 495 : &(*n)->rb_right; \
496 } \ 496 } \
497 \ 497 \
498 rb_link_node(&(new)->member, parent, n); \ 498 rb_link_node(&(new)->member, parent, n); \
499 rb_insert_color(&(new)->member, root); \ 499 rb_insert_color(&(new)->member, root); \
500 ret = 0; \ 500 ret = 0; \
501 dup: \ 501 dup: \
502 ret; \ 502 ret; \
503 }) 503 })
504 504
505 #define RB_SEARCH(root, search, member, cmp) \ 505 #define RB_SEARCH(root, search, member, cmp) \
506 ({ \ 506 ({ \
507 struct rb_node *n = (root)->rb_node; \ 507 struct rb_node *n = (root)->rb_node; \
508 typeof(&(search)) this, ret = NULL; \ 508 typeof(&(search)) this, ret = NULL; \
509 int res; \ 509 int res; \
510 \ 510 \
511 while (n) { \ 511 while (n) { \
512 this = container_of(n, typeof(search), member); \ 512 this = container_of(n, typeof(search), member); \
513 res = cmp(&(search), this); \ 513 res = cmp(&(search), this); \
514 if (!res) { \ 514 if (!res) { \
515 ret = this; \ 515 ret = this; \
516 break; \ 516 break; \
517 } \ 517 } \
518 n = res < 0 \ 518 n = res < 0 \
519 ? n->rb_left \ 519 ? n->rb_left \
520 : n->rb_right; \ 520 : n->rb_right; \
521 } \ 521 } \
522 ret; \ 522 ret; \
523 }) 523 })
524 524
525 #define RB_GREATER(root, search, member, cmp) \ 525 #define RB_GREATER(root, search, member, cmp) \
526 ({ \ 526 ({ \
527 struct rb_node *n = (root)->rb_node; \ 527 struct rb_node *n = (root)->rb_node; \
528 typeof(&(search)) this, ret = NULL; \ 528 typeof(&(search)) this, ret = NULL; \
529 int res; \ 529 int res; \
530 \ 530 \
531 while (n) { \ 531 while (n) { \
532 this = container_of(n, typeof(search), member); \ 532 this = container_of(n, typeof(search), member); \
533 res = cmp(&(search), this); \ 533 res = cmp(&(search), this); \
534 if (res < 0) { \ 534 if (res < 0) { \
535 ret = this; \ 535 ret = this; \
536 n = n->rb_left; \ 536 n = n->rb_left; \
537 } else \ 537 } else \
538 n = n->rb_right; \ 538 n = n->rb_right; \
539 } \ 539 } \
540 ret; \ 540 ret; \
541 }) 541 })
542 542
543 #define RB_FIRST(root, type, member) \ 543 #define RB_FIRST(root, type, member) \
544 container_of_or_null(rb_first(root), type, member) 544 container_of_or_null(rb_first(root), type, member)
545 545
546 #define RB_LAST(root, type, member) \ 546 #define RB_LAST(root, type, member) \
547 container_of_or_null(rb_last(root), type, member) 547 container_of_or_null(rb_last(root), type, member)
548 548
549 #define RB_NEXT(ptr, member) \ 549 #define RB_NEXT(ptr, member) \
550 container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member) 550 container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member)
551 551
552 #define RB_PREV(ptr, member) \ 552 #define RB_PREV(ptr, member) \
553 container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member) 553 container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member)
554 554
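/*
 * Usage sketch for the RB_* helpers above (hypothetical types; util.h
 * itself does not pull in <linux/rbtree.h>, so callers are expected to):
 * cmp() follows memcmp() convention and is handed pointers to the
 * containing structs.
 *
 *	struct example_entry {
 *		struct rb_node	node;
 *		uint64_t	key;
 *	};
 *
 *	#define example_cmp(l, r)	((l)->key < (r)->key ? -1 :	\
 *					 (l)->key > (r)->key ?  1 : 0)
 *
 *	ret = RB_INSERT(&root, new, node, example_cmp);
 *	// ret == -1 if an entry with an equal key was already present
 *
 *	struct example_entry k = { .key = 5 };
 *	found = RB_SEARCH(&root, k, node, example_cmp);
 */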
555 /* Does linear interpolation between powers of two */ 555 /* Does linear interpolation between powers of two */
556 static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) 556 static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
557 { 557 {
558 unsigned fract = x & ~(~0 << fract_bits); 558 unsigned fract = x & ~(~0 << fract_bits);
559 559
560 x >>= fract_bits; 560 x >>= fract_bits;
561 x = 1 << x; 561 x = 1 << x;
562 x += (x * fract) >> fract_bits; 562 x += (x * fract) >> fract_bits;
563 563
564 return x; 564 return x;
565 } 565 }
566 566
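/*
 * Worked example for fract_exp_two() above: with fract_bits == 3 and
 * x == 20, the integer part is 20 >> 3 == 2 and the fraction is
 * (20 & 7) / 8 == 4/8, so the result is (1 << 2) + ((4 * 4) >> 3) == 6,
 * linearly halfway between 2^2 == 4 and 2^3 == 8.
 */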
567 #define bio_end(bio) ((bio)->bi_sector + bio_sectors(bio))
568
569 void bch_bio_map(struct bio *bio, void *base); 567 void bch_bio_map(struct bio *bio, void *base);
570
571 int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp);
572 568
573 static inline sector_t bdev_sectors(struct block_device *bdev) 569 static inline sector_t bdev_sectors(struct block_device *bdev)
574 { 570 {
575 return bdev->bd_inode->i_size >> 9; 571 return bdev->bd_inode->i_size >> 9;
576 } 572 }
577 573
578 #define closure_bio_submit(bio, cl, dev) \ 574 #define closure_bio_submit(bio, cl, dev) \
579 do { \ 575 do { \
580 closure_get(cl); \ 576 closure_get(cl); \
581 bch_generic_make_request(bio, &(dev)->bio_split_hook); \ 577 bch_generic_make_request(bio, &(dev)->bio_split_hook); \
582 } while (0) 578 } while (0)
583 579
584 uint64_t bch_crc64_update(uint64_t, const void *, size_t); 580 uint64_t bch_crc64_update(uint64_t, const void *, size_t);
585 uint64_t bch_crc64(const void *, size_t); 581 uint64_t bch_crc64(const void *, size_t);
586 582
587 #endif /* _BCACHE_UTIL_H */ 583 #endif /* _BCACHE_UTIL_H */
588 584
drivers/md/bcache/writeback.c
1 /* 1 /*
2 * background writeback - scan btree for dirty data and write it to the backing 2 * background writeback - scan btree for dirty data and write it to the backing
3 * device 3 * device
4 * 4 *
5 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> 5 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6 * Copyright 2012 Google, Inc. 6 * Copyright 2012 Google, Inc.
7 */ 7 */
8 8
9 #include "bcache.h" 9 #include "bcache.h"
10 #include "btree.h" 10 #include "btree.h"
11 #include "debug.h" 11 #include "debug.h"
12 #include "writeback.h" 12 #include "writeback.h"
13 13
14 #include <trace/events/bcache.h> 14 #include <trace/events/bcache.h>
15 15
16 static struct workqueue_struct *dirty_wq; 16 static struct workqueue_struct *dirty_wq;
17 17
18 static void read_dirty(struct closure *); 18 static void read_dirty(struct closure *);
19 19
20 struct dirty_io { 20 struct dirty_io {
21 struct closure cl; 21 struct closure cl;
22 struct cached_dev *dc; 22 struct cached_dev *dc;
23 struct bio bio; 23 struct bio bio;
24 }; 24 };
25 25
26 /* Rate limiting */ 26 /* Rate limiting */
27 27
28 static void __update_writeback_rate(struct cached_dev *dc) 28 static void __update_writeback_rate(struct cached_dev *dc)
29 { 29 {
30 struct cache_set *c = dc->disk.c; 30 struct cache_set *c = dc->disk.c;
31 uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size; 31 uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
32 uint64_t cache_dirty_target = 32 uint64_t cache_dirty_target =
33 div_u64(cache_sectors * dc->writeback_percent, 100); 33 div_u64(cache_sectors * dc->writeback_percent, 100);
34 34
35 int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), 35 int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
36 c->cached_dev_sectors); 36 c->cached_dev_sectors);
37 37
38 /* PD controller */ 38 /* PD controller */
39 39
40 int change = 0; 40 int change = 0;
41 int64_t error; 41 int64_t error;
42 int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); 42 int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
43 int64_t derivative = dirty - dc->disk.sectors_dirty_last; 43 int64_t derivative = dirty - dc->disk.sectors_dirty_last;
44 44
45 dc->disk.sectors_dirty_last = dirty; 45 dc->disk.sectors_dirty_last = dirty;
46 46
47 derivative *= dc->writeback_rate_d_term; 47 derivative *= dc->writeback_rate_d_term;
48 derivative = clamp(derivative, -dirty, dirty); 48 derivative = clamp(derivative, -dirty, dirty);
49 49
50 derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, 50 derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
51 dc->writeback_rate_d_smooth, 0); 51 dc->writeback_rate_d_smooth, 0);
52 52
53 /* Avoid divide by zero */ 53 /* Avoid divide by zero */
54 if (!target) 54 if (!target)
55 goto out; 55 goto out;
56 56
57 error = div64_s64((dirty + derivative - target) << 8, target); 57 error = div64_s64((dirty + derivative - target) << 8, target);
58 58
59 change = div_s64((dc->writeback_rate.rate * error) >> 8, 59 change = div_s64((dc->writeback_rate.rate * error) >> 8,
60 dc->writeback_rate_p_term_inverse); 60 dc->writeback_rate_p_term_inverse);
61 61
62 /* Don't increase writeback rate if the device isn't keeping up */ 62 /* Don't increase writeback rate if the device isn't keeping up */
63 if (change > 0 && 63 if (change > 0 &&
64 time_after64(local_clock(), 64 time_after64(local_clock(),
65 dc->writeback_rate.next + 10 * NSEC_PER_MSEC)) 65 dc->writeback_rate.next + 10 * NSEC_PER_MSEC))
66 change = 0; 66 change = 0;
67 67
68 dc->writeback_rate.rate = 68 dc->writeback_rate.rate =
69 clamp_t(int64_t, dc->writeback_rate.rate + change, 69 clamp_t(int64_t, dc->writeback_rate.rate + change,
70 1, NSEC_PER_MSEC); 70 1, NSEC_PER_MSEC);
71 out: 71 out:
72 dc->writeback_rate_derivative = derivative; 72 dc->writeback_rate_derivative = derivative;
73 dc->writeback_rate_change = change; 73 dc->writeback_rate_change = change;
74 dc->writeback_rate_target = target; 74 dc->writeback_rate_target = target;
75 75
76 schedule_delayed_work(&dc->writeback_rate_update, 76 schedule_delayed_work(&dc->writeback_rate_update,
77 dc->writeback_rate_update_seconds * HZ); 77 dc->writeback_rate_update_seconds * HZ);
78 } 78 }
79 79
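/*
 * Worked example for the PD controller above, using the defaults from
 * bch_cached_dev_writeback_init() (writeback_rate.rate == 1024,
 * p_term_inverse == 64) and hypothetical inputs of target == 1000
 * sectors, dirty == 1500 and a zero derivative:
 *
 *	error  = ((1500 + 0 - 1000) << 8) / 1000	= 128
 *	change = ((1024 * 128) >> 8) / 64		= 8
 *
 * so the rate is nudged from 1024 to 1032, unless the "isn't keeping up"
 * check zeroes the change first.
 */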
80 static void update_writeback_rate(struct work_struct *work) 80 static void update_writeback_rate(struct work_struct *work)
81 { 81 {
82 struct cached_dev *dc = container_of(to_delayed_work(work), 82 struct cached_dev *dc = container_of(to_delayed_work(work),
83 struct cached_dev, 83 struct cached_dev,
84 writeback_rate_update); 84 writeback_rate_update);
85 85
86 down_read(&dc->writeback_lock); 86 down_read(&dc->writeback_lock);
87 87
88 if (atomic_read(&dc->has_dirty) && 88 if (atomic_read(&dc->has_dirty) &&
89 dc->writeback_percent) 89 dc->writeback_percent)
90 __update_writeback_rate(dc); 90 __update_writeback_rate(dc);
91 91
92 up_read(&dc->writeback_lock); 92 up_read(&dc->writeback_lock);
93 } 93 }
94 94
95 static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) 95 static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
96 { 96 {
97 if (atomic_read(&dc->disk.detaching) || 97 if (atomic_read(&dc->disk.detaching) ||
98 !dc->writeback_percent) 98 !dc->writeback_percent)
99 return 0; 99 return 0;
100 100
101 return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); 101 return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
102 } 102 }
103 103
104 /* Background writeback */ 104 /* Background writeback */
105 105
106 static bool dirty_pred(struct keybuf *buf, struct bkey *k) 106 static bool dirty_pred(struct keybuf *buf, struct bkey *k)
107 { 107 {
108 return KEY_DIRTY(k); 108 return KEY_DIRTY(k);
109 } 109 }
110 110
111 static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) 111 static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k)
112 { 112 {
113 uint64_t stripe; 113 uint64_t stripe;
114 unsigned nr_sectors = KEY_SIZE(k); 114 unsigned nr_sectors = KEY_SIZE(k);
115 struct cached_dev *dc = container_of(buf, struct cached_dev, 115 struct cached_dev *dc = container_of(buf, struct cached_dev,
116 writeback_keys); 116 writeback_keys);
117 unsigned stripe_size = 1 << dc->disk.stripe_size_bits; 117 unsigned stripe_size = 1 << dc->disk.stripe_size_bits;
118 118
119 if (!KEY_DIRTY(k)) 119 if (!KEY_DIRTY(k))
120 return false; 120 return false;
121 121
122 stripe = KEY_START(k) >> dc->disk.stripe_size_bits; 122 stripe = KEY_START(k) >> dc->disk.stripe_size_bits;
123 while (1) { 123 while (1) {
124 if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) != 124 if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) !=
125 stripe_size) 125 stripe_size)
126 return false; 126 return false;
127 127
128 if (nr_sectors <= stripe_size) 128 if (nr_sectors <= stripe_size)
129 return true; 129 return true;
130 130
131 nr_sectors -= stripe_size; 131 nr_sectors -= stripe_size;
132 stripe++; 132 stripe++;
133 } 133 }
134 } 134 }
135 135
136 static void dirty_init(struct keybuf_key *w) 136 static void dirty_init(struct keybuf_key *w)
137 { 137 {
138 struct dirty_io *io = w->private; 138 struct dirty_io *io = w->private;
139 struct bio *bio = &io->bio; 139 struct bio *bio = &io->bio;
140 140
141 bio_init(bio); 141 bio_init(bio);
142 if (!io->dc->writeback_percent) 142 if (!io->dc->writeback_percent)
143 bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); 143 bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
144 144
145 bio->bi_size = KEY_SIZE(&w->key) << 9; 145 bio->bi_size = KEY_SIZE(&w->key) << 9;
146 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS); 146 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS);
147 bio->bi_private = w; 147 bio->bi_private = w;
148 bio->bi_io_vec = bio->bi_inline_vecs; 148 bio->bi_io_vec = bio->bi_inline_vecs;
149 bch_bio_map(bio, NULL); 149 bch_bio_map(bio, NULL);
150 } 150 }
151 151
152 static void refill_dirty(struct closure *cl) 152 static void refill_dirty(struct closure *cl)
153 { 153 {
154 struct cached_dev *dc = container_of(cl, struct cached_dev, 154 struct cached_dev *dc = container_of(cl, struct cached_dev,
155 writeback.cl); 155 writeback.cl);
156 struct keybuf *buf = &dc->writeback_keys; 156 struct keybuf *buf = &dc->writeback_keys;
157 bool searched_from_start = false; 157 bool searched_from_start = false;
158 struct bkey end = MAX_KEY; 158 struct bkey end = MAX_KEY;
159 SET_KEY_INODE(&end, dc->disk.id); 159 SET_KEY_INODE(&end, dc->disk.id);
160 160
161 if (!atomic_read(&dc->disk.detaching) && 161 if (!atomic_read(&dc->disk.detaching) &&
162 !dc->writeback_running) 162 !dc->writeback_running)
163 closure_return(cl); 163 closure_return(cl);
164 164
165 down_write(&dc->writeback_lock); 165 down_write(&dc->writeback_lock);
166 166
167 if (!atomic_read(&dc->has_dirty)) { 167 if (!atomic_read(&dc->has_dirty)) {
168 SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); 168 SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
169 bch_write_bdev_super(dc, NULL); 169 bch_write_bdev_super(dc, NULL);
170 170
171 up_write(&dc->writeback_lock); 171 up_write(&dc->writeback_lock);
172 closure_return(cl); 172 closure_return(cl);
173 } 173 }
174 174
175 if (bkey_cmp(&buf->last_scanned, &end) >= 0) { 175 if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
176 buf->last_scanned = KEY(dc->disk.id, 0, 0); 176 buf->last_scanned = KEY(dc->disk.id, 0, 0);
177 searched_from_start = true; 177 searched_from_start = true;
178 } 178 }
179 179
180 if (dc->partial_stripes_expensive) { 180 if (dc->partial_stripes_expensive) {
181 uint64_t i; 181 uint64_t i;
182 182
183 for (i = 0; i < dc->disk.nr_stripes; i++) 183 for (i = 0; i < dc->disk.nr_stripes; i++)
184 if (atomic_read(dc->disk.stripe_sectors_dirty + i) == 184 if (atomic_read(dc->disk.stripe_sectors_dirty + i) ==
185 1 << dc->disk.stripe_size_bits) 185 1 << dc->disk.stripe_size_bits)
186 goto full_stripes; 186 goto full_stripes;
187 187
188 goto normal_refill; 188 goto normal_refill;
189 full_stripes: 189 full_stripes:
190 bch_refill_keybuf(dc->disk.c, buf, &end, 190 bch_refill_keybuf(dc->disk.c, buf, &end,
191 dirty_full_stripe_pred); 191 dirty_full_stripe_pred);
192 } else { 192 } else {
193 normal_refill: 193 normal_refill:
194 bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); 194 bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
195 } 195 }
196 196
197 if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { 197 if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
198 /* Searched the entire btree - delay awhile */ 198 /* Searched the entire btree - delay awhile */
199 199
200 if (RB_EMPTY_ROOT(&buf->keys)) { 200 if (RB_EMPTY_ROOT(&buf->keys)) {
201 atomic_set(&dc->has_dirty, 0); 201 atomic_set(&dc->has_dirty, 0);
202 cached_dev_put(dc); 202 cached_dev_put(dc);
203 } 203 }
204 204
205 if (!atomic_read(&dc->disk.detaching)) 205 if (!atomic_read(&dc->disk.detaching))
206 closure_delay(&dc->writeback, dc->writeback_delay * HZ); 206 closure_delay(&dc->writeback, dc->writeback_delay * HZ);
207 } 207 }
208 208
209 up_write(&dc->writeback_lock); 209 up_write(&dc->writeback_lock);
210 210
211 ratelimit_reset(&dc->writeback_rate); 211 ratelimit_reset(&dc->writeback_rate);
212 212
213 /* Punt to workqueue only so we don't recurse and blow the stack */ 213 /* Punt to workqueue only so we don't recurse and blow the stack */
214 continue_at(cl, read_dirty, dirty_wq); 214 continue_at(cl, read_dirty, dirty_wq);
215 } 215 }
216 216
217 void bch_writeback_queue(struct cached_dev *dc) 217 void bch_writeback_queue(struct cached_dev *dc)
218 { 218 {
219 if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) { 219 if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) {
220 if (!atomic_read(&dc->disk.detaching)) 220 if (!atomic_read(&dc->disk.detaching))
221 closure_delay(&dc->writeback, dc->writeback_delay * HZ); 221 closure_delay(&dc->writeback, dc->writeback_delay * HZ);
222 222
223 continue_at(&dc->writeback.cl, refill_dirty, dirty_wq); 223 continue_at(&dc->writeback.cl, refill_dirty, dirty_wq);
224 } 224 }
225 } 225 }
226 226
227 void bch_writeback_add(struct cached_dev *dc) 227 void bch_writeback_add(struct cached_dev *dc)
228 { 228 {
229 if (!atomic_read(&dc->has_dirty) && 229 if (!atomic_read(&dc->has_dirty) &&
230 !atomic_xchg(&dc->has_dirty, 1)) { 230 !atomic_xchg(&dc->has_dirty, 1)) {
231 atomic_inc(&dc->count); 231 atomic_inc(&dc->count);
232 232
233 if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { 233 if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
234 SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); 234 SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
235 /* XXX: should do this synchronously */ 235 /* XXX: should do this synchronously */
236 bch_write_bdev_super(dc, NULL); 236 bch_write_bdev_super(dc, NULL);
237 } 237 }
238 238
239 bch_writeback_queue(dc); 239 bch_writeback_queue(dc);
240 240
241 if (dc->writeback_percent) 241 if (dc->writeback_percent)
242 schedule_delayed_work(&dc->writeback_rate_update, 242 schedule_delayed_work(&dc->writeback_rate_update,
243 dc->writeback_rate_update_seconds * HZ); 243 dc->writeback_rate_update_seconds * HZ);
244 } 244 }
245 } 245 }
246 246
247 void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, 247 void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
248 uint64_t offset, int nr_sectors) 248 uint64_t offset, int nr_sectors)
249 { 249 {
250 struct bcache_device *d = c->devices[inode]; 250 struct bcache_device *d = c->devices[inode];
251 unsigned stripe_size, stripe_offset; 251 unsigned stripe_size, stripe_offset;
252 uint64_t stripe; 252 uint64_t stripe;
253 253
254 if (!d) 254 if (!d)
255 return; 255 return;
256 256
257 stripe_size = 1 << d->stripe_size_bits; 257 stripe_size = 1 << d->stripe_size_bits;
258 stripe = offset >> d->stripe_size_bits; 258 stripe = offset >> d->stripe_size_bits;
259 stripe_offset = offset & (stripe_size - 1); 259 stripe_offset = offset & (stripe_size - 1);
260 260
261 while (nr_sectors) { 261 while (nr_sectors) {
262 int s = min_t(unsigned, abs(nr_sectors), 262 int s = min_t(unsigned, abs(nr_sectors),
263 stripe_size - stripe_offset); 263 stripe_size - stripe_offset);
264 264
265 if (nr_sectors < 0) 265 if (nr_sectors < 0)
266 s = -s; 266 s = -s;
267 267
268 atomic_add(s, d->stripe_sectors_dirty + stripe); 268 atomic_add(s, d->stripe_sectors_dirty + stripe);
269 nr_sectors -= s; 269 nr_sectors -= s;
270 stripe_offset = 0; 270 stripe_offset = 0;
271 stripe++; 271 stripe++;
272 } 272 }
273 } 273 }
274 274
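/*
 * Worked example for the stripe bookkeeping above (hypothetical geometry):
 * with stripe_size_bits == 4 (16-sector stripes), offset == 21 falls in
 * stripe 1 at stripe_offset == 5, so nr_sectors == 20 credits 11 sectors
 * to stripe 1 and the remaining 9 to stripe 2.
 */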
275 /* Background writeback - IO loop */ 275 /* Background writeback - IO loop */
276 276
277 static void dirty_io_destructor(struct closure *cl) 277 static void dirty_io_destructor(struct closure *cl)
278 { 278 {
279 struct dirty_io *io = container_of(cl, struct dirty_io, cl); 279 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
280 kfree(io); 280 kfree(io);
281 } 281 }
282 282
283 static void write_dirty_finish(struct closure *cl) 283 static void write_dirty_finish(struct closure *cl)
284 { 284 {
285 struct dirty_io *io = container_of(cl, struct dirty_io, cl); 285 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
286 struct keybuf_key *w = io->bio.bi_private; 286 struct keybuf_key *w = io->bio.bi_private;
287 struct cached_dev *dc = io->dc; 287 struct cached_dev *dc = io->dc;
288 struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt); 288 struct bio_vec *bv;
289 int i;
289 290
290 while (bv-- != io->bio.bi_io_vec) 291 bio_for_each_segment_all(bv, &io->bio, i)
291 __free_page(bv->bv_page); 292 __free_page(bv->bv_page);
292 293
293 /* This is kind of a dumb way of signalling errors. */ 294 /* This is kind of a dumb way of signalling errors. */
294 if (KEY_DIRTY(&w->key)) { 295 if (KEY_DIRTY(&w->key)) {
295 unsigned i; 296 unsigned i;
296 struct btree_op op; 297 struct btree_op op;
297 bch_btree_op_init_stack(&op); 298 bch_btree_op_init_stack(&op);
298 299
299 op.type = BTREE_REPLACE; 300 op.type = BTREE_REPLACE;
300 bkey_copy(&op.replace, &w->key); 301 bkey_copy(&op.replace, &w->key);
301 302
302 SET_KEY_DIRTY(&w->key, false); 303 SET_KEY_DIRTY(&w->key, false);
303 bch_keylist_add(&op.keys, &w->key); 304 bch_keylist_add(&op.keys, &w->key);
304 305
305 for (i = 0; i < KEY_PTRS(&w->key); i++) 306 for (i = 0; i < KEY_PTRS(&w->key); i++)
306 atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); 307 atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
307 308
308 bch_btree_insert(&op, dc->disk.c); 309 bch_btree_insert(&op, dc->disk.c);
309 closure_sync(&op.cl); 310 closure_sync(&op.cl);
310 311
311 if (op.insert_collision) 312 if (op.insert_collision)
312 trace_bcache_writeback_collision(&w->key); 313 trace_bcache_writeback_collision(&w->key);
313 314
314 atomic_long_inc(op.insert_collision 315 atomic_long_inc(op.insert_collision
315 ? &dc->disk.c->writeback_keys_failed 316 ? &dc->disk.c->writeback_keys_failed
316 : &dc->disk.c->writeback_keys_done); 317 : &dc->disk.c->writeback_keys_done);
317 } 318 }
318 319
319 bch_keybuf_del(&dc->writeback_keys, w); 320 bch_keybuf_del(&dc->writeback_keys, w);
320 atomic_dec_bug(&dc->in_flight); 321 atomic_dec_bug(&dc->in_flight);
321 322
322 closure_wake_up(&dc->writeback_wait); 323 closure_wake_up(&dc->writeback_wait);
323 324
324 closure_return_with_destructor(cl, dirty_io_destructor); 325 closure_return_with_destructor(cl, dirty_io_destructor);
325 } 326 }
326 327
327 static void dirty_endio(struct bio *bio, int error) 328 static void dirty_endio(struct bio *bio, int error)
328 { 329 {
329 struct keybuf_key *w = bio->bi_private; 330 struct keybuf_key *w = bio->bi_private;
330 struct dirty_io *io = w->private; 331 struct dirty_io *io = w->private;
331 332
332 if (error) 333 if (error)
333 SET_KEY_DIRTY(&w->key, false); 334 SET_KEY_DIRTY(&w->key, false);
334 335
335 closure_put(&io->cl); 336 closure_put(&io->cl);
336 } 337 }
337 338
338 static void write_dirty(struct closure *cl) 339 static void write_dirty(struct closure *cl)
339 { 340 {
340 struct dirty_io *io = container_of(cl, struct dirty_io, cl); 341 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
341 struct keybuf_key *w = io->bio.bi_private; 342 struct keybuf_key *w = io->bio.bi_private;
342 343
343 dirty_init(w); 344 dirty_init(w);
344 io->bio.bi_rw = WRITE; 345 io->bio.bi_rw = WRITE;
345 io->bio.bi_sector = KEY_START(&w->key); 346 io->bio.bi_sector = KEY_START(&w->key);
346 io->bio.bi_bdev = io->dc->bdev; 347 io->bio.bi_bdev = io->dc->bdev;
347 io->bio.bi_end_io = dirty_endio; 348 io->bio.bi_end_io = dirty_endio;
348 349
349 closure_bio_submit(&io->bio, cl, &io->dc->disk); 350 closure_bio_submit(&io->bio, cl, &io->dc->disk);
350 351
351 continue_at(cl, write_dirty_finish, dirty_wq); 352 continue_at(cl, write_dirty_finish, dirty_wq);
352 } 353 }
353 354
354 static void read_dirty_endio(struct bio *bio, int error) 355 static void read_dirty_endio(struct bio *bio, int error)
355 { 356 {
356 struct keybuf_key *w = bio->bi_private; 357 struct keybuf_key *w = bio->bi_private;
357 struct dirty_io *io = w->private; 358 struct dirty_io *io = w->private;
358 359
359 bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), 360 bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
360 error, "reading dirty data from cache"); 361 error, "reading dirty data from cache");
361 362
362 dirty_endio(bio, error); 363 dirty_endio(bio, error);
363 } 364 }
364 365
365 static void read_dirty_submit(struct closure *cl) 366 static void read_dirty_submit(struct closure *cl)
366 { 367 {
367 struct dirty_io *io = container_of(cl, struct dirty_io, cl); 368 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
368 369
369 closure_bio_submit(&io->bio, cl, &io->dc->disk); 370 closure_bio_submit(&io->bio, cl, &io->dc->disk);
370 371
371 continue_at(cl, write_dirty, dirty_wq); 372 continue_at(cl, write_dirty, dirty_wq);
372 } 373 }
373 374
374 static void read_dirty(struct closure *cl) 375 static void read_dirty(struct closure *cl)
375 { 376 {
376 struct cached_dev *dc = container_of(cl, struct cached_dev, 377 struct cached_dev *dc = container_of(cl, struct cached_dev,
377 writeback.cl); 378 writeback.cl);
378 unsigned delay = writeback_delay(dc, 0); 379 unsigned delay = writeback_delay(dc, 0);
379 struct keybuf_key *w; 380 struct keybuf_key *w;
380 struct dirty_io *io; 381 struct dirty_io *io;
381 382
382 /* 383 /*
383 * XXX: if we error, background writeback just spins. Should use some 384 * XXX: if we error, background writeback just spins. Should use some
384 * mempools. 385 * mempools.
385 */ 386 */
386 387
387 while (1) { 388 while (1) {
388 w = bch_keybuf_next(&dc->writeback_keys); 389 w = bch_keybuf_next(&dc->writeback_keys);
389 if (!w) 390 if (!w)
390 break; 391 break;
391 392
392 BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); 393 BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
393 394
394 if (delay > 0 && 395 if (delay > 0 &&
395 (KEY_START(&w->key) != dc->last_read || 396 (KEY_START(&w->key) != dc->last_read ||
396 jiffies_to_msecs(delay) > 50)) { 397 jiffies_to_msecs(delay) > 50)) {
397 w->private = NULL; 398 w->private = NULL;
398 399
399 closure_delay(&dc->writeback, delay); 400 closure_delay(&dc->writeback, delay);
400 continue_at(cl, read_dirty, dirty_wq); 401 continue_at(cl, read_dirty, dirty_wq);
401 } 402 }
402 403
403 dc->last_read = KEY_OFFSET(&w->key); 404 dc->last_read = KEY_OFFSET(&w->key);
404 405
405 io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) 406 io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
406 * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 407 * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
407 GFP_KERNEL); 408 GFP_KERNEL);
408 if (!io) 409 if (!io)
409 goto err; 410 goto err;
410 411
411 w->private = io; 412 w->private = io;
412 io->dc = dc; 413 io->dc = dc;
413 414
414 dirty_init(w); 415 dirty_init(w);
415 io->bio.bi_sector = PTR_OFFSET(&w->key, 0); 416 io->bio.bi_sector = PTR_OFFSET(&w->key, 0);
416 io->bio.bi_bdev = PTR_CACHE(dc->disk.c, 417 io->bio.bi_bdev = PTR_CACHE(dc->disk.c,
417 &w->key, 0)->bdev; 418 &w->key, 0)->bdev;
418 io->bio.bi_rw = READ; 419 io->bio.bi_rw = READ;
419 io->bio.bi_end_io = read_dirty_endio; 420 io->bio.bi_end_io = read_dirty_endio;
420 421
421 if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) 422 if (bio_alloc_pages(&io->bio, GFP_KERNEL))
422 goto err_free; 423 goto err_free;
423 424
424 trace_bcache_writeback(&w->key); 425 trace_bcache_writeback(&w->key);
425 426
426 closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); 427 closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);
427 428
428 delay = writeback_delay(dc, KEY_SIZE(&w->key)); 429 delay = writeback_delay(dc, KEY_SIZE(&w->key));
429 430
430 atomic_inc(&dc->in_flight); 431 atomic_inc(&dc->in_flight);
431 432
432 if (!closure_wait_event(&dc->writeback_wait, cl, 433 if (!closure_wait_event(&dc->writeback_wait, cl,
433 atomic_read(&dc->in_flight) < 64)) 434 atomic_read(&dc->in_flight) < 64))
434 continue_at(cl, read_dirty, dirty_wq); 435 continue_at(cl, read_dirty, dirty_wq);
435 } 436 }
436 437
437 if (0) { 438 if (0) {
438 err_free: 439 err_free:
439 kfree(w->private); 440 kfree(w->private);
440 err: 441 err:
441 bch_keybuf_del(&dc->writeback_keys, w); 442 bch_keybuf_del(&dc->writeback_keys, w);
442 } 443 }
443 444
444 refill_dirty(cl); 445 refill_dirty(cl);
445 } 446 }
446 447
447 /* Init */ 448 /* Init */
448 449
449 static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op, 450 static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op,
450 struct cached_dev *dc) 451 struct cached_dev *dc)
451 { 452 {
452 struct bkey *k; 453 struct bkey *k;
453 struct btree_iter iter; 454 struct btree_iter iter;
454 455
455 bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0)); 456 bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0));
456 while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) 457 while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad)))
457 if (!b->level) { 458 if (!b->level) {
458 if (KEY_INODE(k) > dc->disk.id) 459 if (KEY_INODE(k) > dc->disk.id)
459 break; 460 break;
460 461
461 if (KEY_DIRTY(k)) 462 if (KEY_DIRTY(k))
462 bcache_dev_sectors_dirty_add(b->c, dc->disk.id, 463 bcache_dev_sectors_dirty_add(b->c, dc->disk.id,
463 KEY_START(k), 464 KEY_START(k),
464 KEY_SIZE(k)); 465 KEY_SIZE(k));
465 } else { 466 } else {
466 btree(sectors_dirty_init, k, b, op, dc); 467 btree(sectors_dirty_init, k, b, op, dc);
467 if (KEY_INODE(k) > dc->disk.id) 468 if (KEY_INODE(k) > dc->disk.id)
468 break; 469 break;
469 470
470 cond_resched(); 471 cond_resched();
471 } 472 }
472 473
473 return 0; 474 return 0;
474 } 475 }
475 476
476 void bch_sectors_dirty_init(struct cached_dev *dc) 477 void bch_sectors_dirty_init(struct cached_dev *dc)
477 { 478 {
478 struct btree_op op; 479 struct btree_op op;
479 480
480 bch_btree_op_init_stack(&op); 481 bch_btree_op_init_stack(&op);
481 btree_root(sectors_dirty_init, dc->disk.c, &op, dc); 482 btree_root(sectors_dirty_init, dc->disk.c, &op, dc);
482 } 483 }
483 484
484 void bch_cached_dev_writeback_init(struct cached_dev *dc) 485 void bch_cached_dev_writeback_init(struct cached_dev *dc)
485 { 486 {
486 closure_init_unlocked(&dc->writeback); 487 closure_init_unlocked(&dc->writeback);
487 init_rwsem(&dc->writeback_lock); 488 init_rwsem(&dc->writeback_lock);
488 489
489 bch_keybuf_init(&dc->writeback_keys); 490 bch_keybuf_init(&dc->writeback_keys);
490 491
491 dc->writeback_metadata = true; 492 dc->writeback_metadata = true;
492 dc->writeback_running = true; 493 dc->writeback_running = true;
493 dc->writeback_percent = 10; 494 dc->writeback_percent = 10;
494 dc->writeback_delay = 30; 495 dc->writeback_delay = 30;
495 dc->writeback_rate.rate = 1024; 496 dc->writeback_rate.rate = 1024;
496 497
497 dc->writeback_rate_update_seconds = 30; 498 dc->writeback_rate_update_seconds = 30;
498 dc->writeback_rate_d_term = 16; 499 dc->writeback_rate_d_term = 16;
499 dc->writeback_rate_p_term_inverse = 64; 500 dc->writeback_rate_p_term_inverse = 64;
500 dc->writeback_rate_d_smooth = 8; 501 dc->writeback_rate_d_smooth = 8;
501 502
502 INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); 503 INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
503 schedule_delayed_work(&dc->writeback_rate_update, 504 schedule_delayed_work(&dc->writeback_rate_update,
504 dc->writeback_rate_update_seconds * HZ); 505 dc->writeback_rate_update_seconds * HZ);
505 } 506 }
506 507
507 void bch_writeback_exit(void) 508 void bch_writeback_exit(void)
508 { 509 {
509 if (dirty_wq) 510 if (dirty_wq)
510 destroy_workqueue(dirty_wq); 511 destroy_workqueue(dirty_wq);
511 } 512 }
512 513
513 int __init bch_writeback_init(void) 514 int __init bch_writeback_init(void)
514 { 515 {
515 dirty_wq = create_singlethread_workqueue("bcache_writeback"); 516 dirty_wq = create_singlethread_workqueue("bcache_writeback");
516 if (!dirty_wq) 517 if (!dirty_wq)
517 return -ENOMEM; 518 return -ENOMEM;
518 519
519 return 0; 520 return 0;
520 } 521 }
521 522