Commit 8e51e414a3c6d92ef2cc41720c67342a8e2c0bf7
Committed by: Kent Overstreet
Parent: 47cd2eb0ee
Exists in: smarc-imx_3.14.28_1.0.0_ga and 1 other branch
bcache: Use standard utility code
Some of bcache's utility code has made it into the rest of the kernel, so drop the bcache versions.

Bcache used to have a workaround for allocating from a bio set under generic_make_request() (if you allocated more than once, the bios you already allocated would get stuck on current->bio_list when you submitted, and you'd risk deadlock) - bcache would mask out __GFP_WAIT when allocating bios under generic_make_request() so that allocation could fail and it could retry from workqueue. But bio_alloc_bioset() has a workaround now, so we can drop this hack and the associated error handling.

Signed-off-by: Kent Overstreet <koverstreet@google.com>
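For context, the allocation pattern being removed can be sketched as follows. This is an illustrative sketch only, not code from the patch; the helper names are invented. It contrasts the old "clear __GFP_WAIT under generic_make_request() and retry from a workqueue" approach with the plain blocking allocation that bio_alloc_bioset()'s built-in workaround now makes safe:

#include <linux/bio.h>
#include <linux/sched.h>

/* Old pattern (hypothetical helper): don't block while on current->bio_list. */
static struct bio *alloc_bio_old_style(struct bio_set *bs, unsigned nr_vecs)
{
	gfp_t gfp = GFP_NOIO;

	if (current->bio_list)		/* running under generic_make_request() */
		gfp &= ~__GFP_WAIT;	/* allocation may fail instead of deadlocking */

	/* On NULL, the caller would punt to a workqueue and retry. */
	return bio_alloc_bioset(gfp, nr_vecs, bs);
}

/* New pattern: bio_alloc_bioset() itself avoids the deadlock, so just block. */
static struct bio *alloc_bio_new_style(struct bio_set *bs, unsigned nr_vecs)
{
	return bio_alloc_bioset(GFP_NOIO, nr_vecs, bs);
}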
Showing 8 changed files with 51 additions and 144 deletions (inline diff)
drivers/md/bcache/btree.c
/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 *
 * Uses a block device as cache for other block devices; optimized for SSDs.
 * All allocation is done in buckets, which should match the erase block size
 * of the device.
 *
 * Buckets containing cached data are kept on a heap sorted by priority;
 * bucket priority is increased on cache hit, and periodically all the buckets
 * on the heap have their priority scaled down. This currently is just used as
 * an LRU but in the future should allow for more intelligent heuristics.
 *
 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
 * counter. Garbage collection is used to remove stale pointers.
 *
 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
 * as keys are inserted we only sort the pages that have not yet been written.
 * When garbage collection is run, we resort the entire node.
 *
 * All configuration is done via sysfs; see Documentation/bcache.txt.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "request.h"
#include "writeback.h"

#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/hash.h>
#include <linux/prefetch.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <trace/events/bcache.h>

/*
 * Todo:
 * register_bcache: Return errors out to userspace correctly
 *
 * Writeback: don't undirty key until after a cache flush
 *
 * Create an iterator for key pointers
 *
 * On btree write error, mark bucket such that it won't be freed from the cache
 *
 * Journalling:
 *   Check for bad keys in replay
 *   Propagate barriers
 *   Refcount journal entries in journal_replay
 *
 * Garbage collection:
 *   Finish incremental gc
 *   Gc should free old UUIDs, data for invalid UUIDs
 *
 * Provide a way to list backing device UUIDs we have data cached for, and
 * probably how long it's been since we've seen them, and a way to invalidate
 * dirty data for devices that will never be attached again
 *
 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
 * that based on that and how much dirty data we have we can keep writeback
 * from being starved
 *
 * Add a tracepoint or somesuch to watch for writeback starvation
 *
 * When btree depth > 1 and splitting an interior node, we have to make sure
 * alloc_bucket() cannot fail. This should be true but is not completely
 * obvious.
 *
 * Make sure all allocations get charged to the root cgroup
 *
 * Plugging?
 *
 * If data write is less than hard sector size of ssd, round up offset in open
 * bucket to the next whole sector
 *
 * Also lookup by cgroup in get_open_bucket()
 *
 * Superblock needs to be fleshed out for multiple cache devices
 *
 * Add a sysfs tunable for the number of writeback IOs in flight
 *
 * Add a sysfs tunable for the number of open data buckets
 *
 * IO tracking: Can we track when one process is doing io on behalf of another?
 * IO tracking: Don't use just an average, weigh more recent stuff higher
 *
 * Test module load/unload
 */

static const char * const op_types[] = {
	"insert", "replace"
};

static const char *op_type(struct btree_op *op)
{
	return op_types[op->type];
}

#define MAX_NEED_GC		64
#define MAX_SAVE_PRIO		72

#define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))

#define PTR_HASH(c, k)							\
	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))

struct workqueue_struct *bch_gc_wq;
static struct workqueue_struct *btree_io_wq;

void bch_btree_op_init_stack(struct btree_op *op)
{
	memset(op, 0, sizeof(struct btree_op));
	closure_init_stack(&op->cl);
	op->lock = -1;
	bch_keylist_init(&op->keys);
}

/* Btree key manipulation */

static void bkey_put(struct cache_set *c, struct bkey *k, int level)
{
	if ((level && KEY_OFFSET(k)) || !level)
		__bkey_put(c, k);
}

/* Btree IO */

static uint64_t btree_csum_set(struct btree *b, struct bset *i)
{
	uint64_t crc = b->key.ptr[0];
	void *data = (void *) i + 8, *end = end(i);

	crc = bch_crc64_update(crc, data, end - data);
	return crc ^ 0xffffffffffffffffULL;
}

static void bch_btree_node_read_done(struct btree *b)
{
	const char *err = "bad btree header";
	struct bset *i = b->sets[0].data;
	struct btree_iter *iter;

	iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
	iter->used = 0;

	if (!i->seq)
		goto err;

	for (;
	     b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq;
	     i = write_block(b)) {
		err = "unsupported bset version";
		if (i->version > BCACHE_BSET_VERSION)
			goto err;

		err = "bad btree header";
		if (b->written + set_blocks(i, b->c) > btree_blocks(b))
			goto err;

		err = "bad magic";
		if (i->magic != bset_magic(b->c))
			goto err;

		err = "bad checksum";
		switch (i->version) {
		case 0:
			if (i->csum != csum_set(i))
				goto err;
			break;
		case BCACHE_BSET_VERSION:
			if (i->csum != btree_csum_set(b, i))
				goto err;
			break;
		}

		err = "empty set";
		if (i != b->sets[0].data && !i->keys)
			goto err;

		bch_btree_iter_push(iter, i->start, end(i));

		b->written += set_blocks(i, b->c);
	}

	err = "corrupted btree";
	for (i = write_block(b);
	     index(i, b) < btree_blocks(b);
	     i = ((void *) i) + block_bytes(b->c))
		if (i->seq == b->sets[0].data->seq)
			goto err;

	bch_btree_sort_and_fix_extents(b, iter);

	i = b->sets[0].data;
	err = "short btree key";
	if (b->sets[0].size &&
	    bkey_cmp(&b->key, &b->sets[0].end) < 0)
		goto err;

	if (b->written < btree_blocks(b))
		bch_bset_init_next(b);
out:
	mempool_free(iter, b->c->fill_iter);
	return;
err:
	set_btree_node_io_error(b);
	bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
			    err, PTR_BUCKET_NR(b->c, &b->key, 0),
			    index(i, b), i->keys);
	goto out;
}

static void btree_node_read_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;
	closure_put(cl);
}

void bch_btree_node_read(struct btree *b)
{
	uint64_t start_time = local_clock();
	struct closure cl;
	struct bio *bio;

	trace_bcache_btree_read(b);

	closure_init_stack(&cl);

	bio = bch_bbio_alloc(b->c);
	bio->bi_rw = REQ_META|READ_SYNC;
	bio->bi_size = KEY_SIZE(&b->key) << 9;
	bio->bi_end_io = btree_node_read_endio;
	bio->bi_private = &cl;

	bch_bio_map(bio, b->sets[0].data);

	bch_submit_bbio(bio, b->c, &b->key, 0);
	closure_sync(&cl);

	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		set_btree_node_io_error(b);

	bch_bbio_free(bio, b->c);

	if (btree_node_io_error(b))
		goto err;

	bch_btree_node_read_done(b);

	spin_lock(&b->c->btree_read_time_lock);
	bch_time_stats_update(&b->c->btree_read_time, start_time);
	spin_unlock(&b->c->btree_read_time_lock);

	return;
err:
	bch_cache_set_error(b->c, "io error reading bucket %lu",
			    PTR_BUCKET_NR(b->c, &b->key, 0));
}

static void btree_complete_write(struct btree *b, struct btree_write *w)
{
	if (w->prio_blocked &&
	    !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
		wake_up_allocators(b->c);

	if (w->journal) {
		atomic_dec_bug(w->journal);
		__closure_wake_up(&b->c->journal.wait);
	}

	w->prio_blocked = 0;
	w->journal = NULL;
}

static void __btree_node_write_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io.cl);
	struct btree_write *w = btree_prev_write(b);

	bch_bbio_free(b->bio, b->c);
	b->bio = NULL;
	btree_complete_write(b, w);

	if (btree_node_dirty(b))
		queue_delayed_work(btree_io_wq, &b->work,
				   msecs_to_jiffies(30000));

	closure_return(cl);
}

static void btree_node_write_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io.cl);
	struct bio_vec *bv;
	int n;

	__bio_for_each_segment(bv, b->bio, n, 0)
		__free_page(bv->bv_page);

	__btree_node_write_done(cl);
}

static void btree_node_write_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;
	struct btree *b = container_of(cl, struct btree, io.cl);

	if (error)
		set_btree_node_io_error(b);

	bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
	closure_put(cl);
}

static void do_btree_node_write(struct btree *b)
{
	struct closure *cl = &b->io.cl;
	struct bset *i = b->sets[b->nsets].data;
	BKEY_PADDED(key) k;

	i->version = BCACHE_BSET_VERSION;
	i->csum = btree_csum_set(b, i);

	BUG_ON(b->bio);
	b->bio = bch_bbio_alloc(b->c);

	b->bio->bi_end_io = btree_node_write_endio;
	b->bio->bi_private = &b->io.cl;
	b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
	b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
	bch_bio_map(b->bio, i);

	/*
	 * If we're appending to a leaf node, we don't technically need FUA -
	 * this write just needs to be persisted before the next journal write,
	 * which will be marked FLUSH|FUA.
	 *
	 * Similarly if we're writing a new btree root - the pointer is going to
	 * be in the next journal entry.
	 *
	 * But if we're writing a new btree node (that isn't a root) or
	 * appending to a non leaf btree node, we need either FUA or a flush
	 * when we write the parent with the new pointer. FUA is cheaper than a
	 * flush, and writes appending to leaf nodes aren't blocking anything so
	 * just make all btree node writes FUA to keep things sane.
	 */

	bkey_copy(&k.key, &b->key);
	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));

-	if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) {
+	if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
354 | int j; | 354 | int j; |
355 | struct bio_vec *bv; | 355 | struct bio_vec *bv; |
356 | void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); | 356 | void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); |
357 | 357 | ||
358 | bio_for_each_segment(bv, b->bio, j) | 358 | bio_for_each_segment(bv, b->bio, j) |
359 | memcpy(page_address(bv->bv_page), | 359 | memcpy(page_address(bv->bv_page), |
360 | base + j * PAGE_SIZE, PAGE_SIZE); | 360 | base + j * PAGE_SIZE, PAGE_SIZE); |
361 | 361 | ||
362 | bch_submit_bbio(b->bio, b->c, &k.key, 0); | 362 | bch_submit_bbio(b->bio, b->c, &k.key, 0); |
363 | 363 | ||
364 | continue_at(cl, btree_node_write_done, NULL); | 364 | continue_at(cl, btree_node_write_done, NULL); |
365 | } else { | 365 | } else { |
366 | b->bio->bi_vcnt = 0; | 366 | b->bio->bi_vcnt = 0; |
367 | bch_bio_map(b->bio, i); | 367 | bch_bio_map(b->bio, i); |
368 | 368 | ||
369 | bch_submit_bbio(b->bio, b->c, &k.key, 0); | 369 | bch_submit_bbio(b->bio, b->c, &k.key, 0); |
370 | 370 | ||
371 | closure_sync(cl); | 371 | closure_sync(cl); |
372 | __btree_node_write_done(cl); | 372 | __btree_node_write_done(cl); |
373 | } | 373 | } |
374 | } | 374 | } |
375 | 375 | ||
376 | void bch_btree_node_write(struct btree *b, struct closure *parent) | 376 | void bch_btree_node_write(struct btree *b, struct closure *parent) |
377 | { | 377 | { |
378 | struct bset *i = b->sets[b->nsets].data; | 378 | struct bset *i = b->sets[b->nsets].data; |
379 | 379 | ||
380 | trace_bcache_btree_write(b); | 380 | trace_bcache_btree_write(b); |
381 | 381 | ||
382 | BUG_ON(current->bio_list); | 382 | BUG_ON(current->bio_list); |
383 | BUG_ON(b->written >= btree_blocks(b)); | 383 | BUG_ON(b->written >= btree_blocks(b)); |
384 | BUG_ON(b->written && !i->keys); | 384 | BUG_ON(b->written && !i->keys); |
385 | BUG_ON(b->sets->data->seq != i->seq); | 385 | BUG_ON(b->sets->data->seq != i->seq); |
386 | bch_check_key_order(b, i); | 386 | bch_check_key_order(b, i); |
387 | 387 | ||
388 | cancel_delayed_work(&b->work); | 388 | cancel_delayed_work(&b->work); |
389 | 389 | ||
390 | /* If caller isn't waiting for write, parent refcount is cache set */ | 390 | /* If caller isn't waiting for write, parent refcount is cache set */ |
391 | closure_lock(&b->io, parent ?: &b->c->cl); | 391 | closure_lock(&b->io, parent ?: &b->c->cl); |
392 | 392 | ||
393 | clear_bit(BTREE_NODE_dirty, &b->flags); | 393 | clear_bit(BTREE_NODE_dirty, &b->flags); |
394 | change_bit(BTREE_NODE_write_idx, &b->flags); | 394 | change_bit(BTREE_NODE_write_idx, &b->flags); |
395 | 395 | ||
396 | do_btree_node_write(b); | 396 | do_btree_node_write(b); |
397 | 397 | ||
398 | b->written += set_blocks(i, b->c); | 398 | b->written += set_blocks(i, b->c); |
399 | atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, | 399 | atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, |
400 | &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); | 400 | &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); |
401 | 401 | ||
402 | bch_btree_sort_lazy(b); | 402 | bch_btree_sort_lazy(b); |
403 | 403 | ||
404 | if (b->written < btree_blocks(b)) | 404 | if (b->written < btree_blocks(b)) |
405 | bch_bset_init_next(b); | 405 | bch_bset_init_next(b); |
406 | } | 406 | } |
407 | 407 | ||
408 | static void btree_node_write_work(struct work_struct *w) | 408 | static void btree_node_write_work(struct work_struct *w) |
409 | { | 409 | { |
410 | struct btree *b = container_of(to_delayed_work(w), struct btree, work); | 410 | struct btree *b = container_of(to_delayed_work(w), struct btree, work); |
411 | 411 | ||
412 | rw_lock(true, b, b->level); | 412 | rw_lock(true, b, b->level); |
413 | 413 | ||
414 | if (btree_node_dirty(b)) | 414 | if (btree_node_dirty(b)) |
415 | bch_btree_node_write(b, NULL); | 415 | bch_btree_node_write(b, NULL); |
416 | rw_unlock(true, b); | 416 | rw_unlock(true, b); |
417 | } | 417 | } |
418 | 418 | ||
419 | static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) | 419 | static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) |
420 | { | 420 | { |
421 | struct bset *i = b->sets[b->nsets].data; | 421 | struct bset *i = b->sets[b->nsets].data; |
422 | struct btree_write *w = btree_current_write(b); | 422 | struct btree_write *w = btree_current_write(b); |
423 | 423 | ||
424 | BUG_ON(!b->written); | 424 | BUG_ON(!b->written); |
425 | BUG_ON(!i->keys); | 425 | BUG_ON(!i->keys); |
426 | 426 | ||
427 | if (!btree_node_dirty(b)) | 427 | if (!btree_node_dirty(b)) |
428 | queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); | 428 | queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); |
429 | 429 | ||
430 | set_btree_node_dirty(b); | 430 | set_btree_node_dirty(b); |
431 | 431 | ||
432 | if (op && op->journal) { | 432 | if (op && op->journal) { |
433 | if (w->journal && | 433 | if (w->journal && |
434 | journal_pin_cmp(b->c, w, op)) { | 434 | journal_pin_cmp(b->c, w, op)) { |
435 | atomic_dec_bug(w->journal); | 435 | atomic_dec_bug(w->journal); |
436 | w->journal = NULL; | 436 | w->journal = NULL; |
437 | } | 437 | } |
438 | 438 | ||
439 | if (!w->journal) { | 439 | if (!w->journal) { |
440 | w->journal = op->journal; | 440 | w->journal = op->journal; |
441 | atomic_inc(w->journal); | 441 | atomic_inc(w->journal); |
442 | } | 442 | } |
443 | } | 443 | } |
444 | 444 | ||
445 | /* Force write if set is too big */ | 445 | /* Force write if set is too big */ |
446 | if (set_bytes(i) > PAGE_SIZE - 48 && | 446 | if (set_bytes(i) > PAGE_SIZE - 48 && |
447 | !current->bio_list) | 447 | !current->bio_list) |
448 | bch_btree_node_write(b, NULL); | 448 | bch_btree_node_write(b, NULL); |
449 | } | 449 | } |
450 | 450 | ||
451 | /* | 451 | /* |
452 | * Btree in memory cache - allocation/freeing | 452 | * Btree in memory cache - allocation/freeing |
453 | * mca -> memory cache | 453 | * mca -> memory cache |
454 | */ | 454 | */ |
455 | 455 | ||
456 | static void mca_reinit(struct btree *b) | 456 | static void mca_reinit(struct btree *b) |
457 | { | 457 | { |
458 | unsigned i; | 458 | unsigned i; |
459 | 459 | ||
460 | b->flags = 0; | 460 | b->flags = 0; |
461 | b->written = 0; | 461 | b->written = 0; |
462 | b->nsets = 0; | 462 | b->nsets = 0; |
463 | 463 | ||
464 | for (i = 0; i < MAX_BSETS; i++) | 464 | for (i = 0; i < MAX_BSETS; i++) |
465 | b->sets[i].size = 0; | 465 | b->sets[i].size = 0; |
466 | /* | 466 | /* |
467 | * Second loop starts at 1 because b->sets[0]->data is the memory we | 467 | * Second loop starts at 1 because b->sets[0]->data is the memory we |
468 | * allocated | 468 | * allocated |
469 | */ | 469 | */ |
470 | for (i = 1; i < MAX_BSETS; i++) | 470 | for (i = 1; i < MAX_BSETS; i++) |
471 | b->sets[i].data = NULL; | 471 | b->sets[i].data = NULL; |
472 | } | 472 | } |
473 | 473 | ||
474 | #define mca_reserve(c) (((c->root && c->root->level) \ | 474 | #define mca_reserve(c) (((c->root && c->root->level) \ |
475 | ? c->root->level : 1) * 8 + 16) | 475 | ? c->root->level : 1) * 8 + 16) |
476 | #define mca_can_free(c) \ | 476 | #define mca_can_free(c) \ |
477 | max_t(int, 0, c->bucket_cache_used - mca_reserve(c)) | 477 | max_t(int, 0, c->bucket_cache_used - mca_reserve(c)) |
478 | 478 | ||
479 | static void mca_data_free(struct btree *b) | 479 | static void mca_data_free(struct btree *b) |
480 | { | 480 | { |
481 | struct bset_tree *t = b->sets; | 481 | struct bset_tree *t = b->sets; |
482 | BUG_ON(!closure_is_unlocked(&b->io.cl)); | 482 | BUG_ON(!closure_is_unlocked(&b->io.cl)); |
483 | 483 | ||
484 | if (bset_prev_bytes(b) < PAGE_SIZE) | 484 | if (bset_prev_bytes(b) < PAGE_SIZE) |
485 | kfree(t->prev); | 485 | kfree(t->prev); |
486 | else | 486 | else |
487 | free_pages((unsigned long) t->prev, | 487 | free_pages((unsigned long) t->prev, |
488 | get_order(bset_prev_bytes(b))); | 488 | get_order(bset_prev_bytes(b))); |
489 | 489 | ||
490 | if (bset_tree_bytes(b) < PAGE_SIZE) | 490 | if (bset_tree_bytes(b) < PAGE_SIZE) |
491 | kfree(t->tree); | 491 | kfree(t->tree); |
492 | else | 492 | else |
493 | free_pages((unsigned long) t->tree, | 493 | free_pages((unsigned long) t->tree, |
494 | get_order(bset_tree_bytes(b))); | 494 | get_order(bset_tree_bytes(b))); |
495 | 495 | ||
496 | free_pages((unsigned long) t->data, b->page_order); | 496 | free_pages((unsigned long) t->data, b->page_order); |
497 | 497 | ||
498 | t->prev = NULL; | 498 | t->prev = NULL; |
499 | t->tree = NULL; | 499 | t->tree = NULL; |
500 | t->data = NULL; | 500 | t->data = NULL; |
501 | list_move(&b->list, &b->c->btree_cache_freed); | 501 | list_move(&b->list, &b->c->btree_cache_freed); |
502 | b->c->bucket_cache_used--; | 502 | b->c->bucket_cache_used--; |
503 | } | 503 | } |
504 | 504 | ||
505 | static void mca_bucket_free(struct btree *b) | 505 | static void mca_bucket_free(struct btree *b) |
506 | { | 506 | { |
507 | BUG_ON(btree_node_dirty(b)); | 507 | BUG_ON(btree_node_dirty(b)); |
508 | 508 | ||
509 | b->key.ptr[0] = 0; | 509 | b->key.ptr[0] = 0; |
510 | hlist_del_init_rcu(&b->hash); | 510 | hlist_del_init_rcu(&b->hash); |
511 | list_move(&b->list, &b->c->btree_cache_freeable); | 511 | list_move(&b->list, &b->c->btree_cache_freeable); |
512 | } | 512 | } |
513 | 513 | ||
514 | static unsigned btree_order(struct bkey *k) | 514 | static unsigned btree_order(struct bkey *k) |
515 | { | 515 | { |
516 | return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); | 516 | return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); |
517 | } | 517 | } |
518 | 518 | ||
519 | static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) | 519 | static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) |
520 | { | 520 | { |
521 | struct bset_tree *t = b->sets; | 521 | struct bset_tree *t = b->sets; |
522 | BUG_ON(t->data); | 522 | BUG_ON(t->data); |
523 | 523 | ||
524 | b->page_order = max_t(unsigned, | 524 | b->page_order = max_t(unsigned, |
525 | ilog2(b->c->btree_pages), | 525 | ilog2(b->c->btree_pages), |
526 | btree_order(k)); | 526 | btree_order(k)); |
527 | 527 | ||
528 | t->data = (void *) __get_free_pages(gfp, b->page_order); | 528 | t->data = (void *) __get_free_pages(gfp, b->page_order); |
529 | if (!t->data) | 529 | if (!t->data) |
530 | goto err; | 530 | goto err; |
531 | 531 | ||
532 | t->tree = bset_tree_bytes(b) < PAGE_SIZE | 532 | t->tree = bset_tree_bytes(b) < PAGE_SIZE |
533 | ? kmalloc(bset_tree_bytes(b), gfp) | 533 | ? kmalloc(bset_tree_bytes(b), gfp) |
534 | : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); | 534 | : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); |
535 | if (!t->tree) | 535 | if (!t->tree) |
536 | goto err; | 536 | goto err; |
537 | 537 | ||
538 | t->prev = bset_prev_bytes(b) < PAGE_SIZE | 538 | t->prev = bset_prev_bytes(b) < PAGE_SIZE |
539 | ? kmalloc(bset_prev_bytes(b), gfp) | 539 | ? kmalloc(bset_prev_bytes(b), gfp) |
540 | : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); | 540 | : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); |
541 | if (!t->prev) | 541 | if (!t->prev) |
542 | goto err; | 542 | goto err; |
543 | 543 | ||
544 | list_move(&b->list, &b->c->btree_cache); | 544 | list_move(&b->list, &b->c->btree_cache); |
545 | b->c->bucket_cache_used++; | 545 | b->c->bucket_cache_used++; |
546 | return; | 546 | return; |
547 | err: | 547 | err: |
548 | mca_data_free(b); | 548 | mca_data_free(b); |
549 | } | 549 | } |
550 | 550 | ||
551 | static struct btree *mca_bucket_alloc(struct cache_set *c, | 551 | static struct btree *mca_bucket_alloc(struct cache_set *c, |
552 | struct bkey *k, gfp_t gfp) | 552 | struct bkey *k, gfp_t gfp) |
553 | { | 553 | { |
554 | struct btree *b = kzalloc(sizeof(struct btree), gfp); | 554 | struct btree *b = kzalloc(sizeof(struct btree), gfp); |
555 | if (!b) | 555 | if (!b) |
556 | return NULL; | 556 | return NULL; |
557 | 557 | ||
558 | init_rwsem(&b->lock); | 558 | init_rwsem(&b->lock); |
559 | lockdep_set_novalidate_class(&b->lock); | 559 | lockdep_set_novalidate_class(&b->lock); |
560 | INIT_LIST_HEAD(&b->list); | 560 | INIT_LIST_HEAD(&b->list); |
561 | INIT_DELAYED_WORK(&b->work, btree_node_write_work); | 561 | INIT_DELAYED_WORK(&b->work, btree_node_write_work); |
562 | b->c = c; | 562 | b->c = c; |
563 | closure_init_unlocked(&b->io); | 563 | closure_init_unlocked(&b->io); |
564 | 564 | ||
565 | mca_data_alloc(b, k, gfp); | 565 | mca_data_alloc(b, k, gfp); |
566 | return b; | 566 | return b; |
567 | } | 567 | } |
568 | 568 | ||
569 | static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) | 569 | static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) |
570 | { | 570 | { |
571 | lockdep_assert_held(&b->c->bucket_lock); | 571 | lockdep_assert_held(&b->c->bucket_lock); |
572 | 572 | ||
573 | if (!down_write_trylock(&b->lock)) | 573 | if (!down_write_trylock(&b->lock)) |
574 | return -ENOMEM; | 574 | return -ENOMEM; |
575 | 575 | ||
576 | if (b->page_order < min_order) { | 576 | if (b->page_order < min_order) { |
577 | rw_unlock(true, b); | 577 | rw_unlock(true, b); |
578 | return -ENOMEM; | 578 | return -ENOMEM; |
579 | } | 579 | } |
580 | 580 | ||
581 | BUG_ON(btree_node_dirty(b) && !b->sets[0].data); | 581 | BUG_ON(btree_node_dirty(b) && !b->sets[0].data); |
582 | 582 | ||
583 | if (cl && btree_node_dirty(b)) | 583 | if (cl && btree_node_dirty(b)) |
584 | bch_btree_node_write(b, NULL); | 584 | bch_btree_node_write(b, NULL); |
585 | 585 | ||
586 | if (cl) | 586 | if (cl) |
587 | closure_wait_event_async(&b->io.wait, cl, | 587 | closure_wait_event_async(&b->io.wait, cl, |
588 | atomic_read(&b->io.cl.remaining) == -1); | 588 | atomic_read(&b->io.cl.remaining) == -1); |
589 | 589 | ||
590 | if (btree_node_dirty(b) || | 590 | if (btree_node_dirty(b) || |
591 | !closure_is_unlocked(&b->io.cl) || | 591 | !closure_is_unlocked(&b->io.cl) || |
592 | work_pending(&b->work.work)) { | 592 | work_pending(&b->work.work)) { |
593 | rw_unlock(true, b); | 593 | rw_unlock(true, b); |
594 | return -EAGAIN; | 594 | return -EAGAIN; |
595 | } | 595 | } |
596 | 596 | ||
597 | return 0; | 597 | return 0; |
598 | } | 598 | } |
599 | 599 | ||
600 | static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc) | 600 | static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc) |
601 | { | 601 | { |
602 | struct cache_set *c = container_of(shrink, struct cache_set, shrink); | 602 | struct cache_set *c = container_of(shrink, struct cache_set, shrink); |
603 | struct btree *b, *t; | 603 | struct btree *b, *t; |
604 | unsigned long i, nr = sc->nr_to_scan; | 604 | unsigned long i, nr = sc->nr_to_scan; |
605 | 605 | ||
606 | if (c->shrinker_disabled) | 606 | if (c->shrinker_disabled) |
607 | return 0; | 607 | return 0; |
608 | 608 | ||
609 | if (c->try_harder) | 609 | if (c->try_harder) |
610 | return 0; | 610 | return 0; |
611 | 611 | ||
612 | /* | 612 | /* |
613 | * If nr == 0, we're supposed to return the number of items we have | 613 | * If nr == 0, we're supposed to return the number of items we have |
614 | * cached. Not allowed to return -1. | 614 | * cached. Not allowed to return -1. |
615 | */ | 615 | */ |
616 | if (!nr) | 616 | if (!nr) |
617 | return mca_can_free(c) * c->btree_pages; | 617 | return mca_can_free(c) * c->btree_pages; |
618 | 618 | ||
619 | /* Return -1 if we can't do anything right now */ | 619 | /* Return -1 if we can't do anything right now */ |
620 | if (sc->gfp_mask & __GFP_WAIT) | 620 | if (sc->gfp_mask & __GFP_WAIT) |
621 | mutex_lock(&c->bucket_lock); | 621 | mutex_lock(&c->bucket_lock); |
622 | else if (!mutex_trylock(&c->bucket_lock)) | 622 | else if (!mutex_trylock(&c->bucket_lock)) |
623 | return -1; | 623 | return -1; |
624 | 624 | ||
625 | /* | 625 | /* |
626 | * It's _really_ critical that we don't free too many btree nodes - we | 626 | * It's _really_ critical that we don't free too many btree nodes - we |
627 | * have to always leave ourselves a reserve. The reserve is how we | 627 | * have to always leave ourselves a reserve. The reserve is how we |
628 | * guarantee that allocating memory for a new btree node can always | 628 | * guarantee that allocating memory for a new btree node can always |
629 | * succeed, so that inserting keys into the btree can always succeed and | 629 | * succeed, so that inserting keys into the btree can always succeed and |
630 | * IO can always make forward progress: | 630 | * IO can always make forward progress: |
631 | */ | 631 | */ |
632 | nr /= c->btree_pages; | 632 | nr /= c->btree_pages; |
633 | nr = min_t(unsigned long, nr, mca_can_free(c)); | 633 | nr = min_t(unsigned long, nr, mca_can_free(c)); |
634 | 634 | ||
635 | i = 0; | 635 | i = 0; |
636 | list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { | 636 | list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { |
637 | if (!nr) | 637 | if (!nr) |
638 | break; | 638 | break; |
639 | 639 | ||
640 | if (++i > 3 && | 640 | if (++i > 3 && |
641 | !mca_reap(b, NULL, 0)) { | 641 | !mca_reap(b, NULL, 0)) { |
642 | mca_data_free(b); | 642 | mca_data_free(b); |
643 | rw_unlock(true, b); | 643 | rw_unlock(true, b); |
644 | --nr; | 644 | --nr; |
645 | } | 645 | } |
646 | } | 646 | } |
647 | 647 | ||
648 | /* | 648 | /* |
649 | * Can happen right when we first start up, before we've read in any | 649 | * Can happen right when we first start up, before we've read in any |
650 | * btree nodes | 650 | * btree nodes |
651 | */ | 651 | */ |
652 | if (list_empty(&c->btree_cache)) | 652 | if (list_empty(&c->btree_cache)) |
653 | goto out; | 653 | goto out; |
654 | 654 | ||
655 | for (i = 0; nr && i < c->bucket_cache_used; i++) { | 655 | for (i = 0; nr && i < c->bucket_cache_used; i++) { |
656 | b = list_first_entry(&c->btree_cache, struct btree, list); | 656 | b = list_first_entry(&c->btree_cache, struct btree, list); |
657 | list_rotate_left(&c->btree_cache); | 657 | list_rotate_left(&c->btree_cache); |
658 | 658 | ||
659 | if (!b->accessed && | 659 | if (!b->accessed && |
660 | !mca_reap(b, NULL, 0)) { | 660 | !mca_reap(b, NULL, 0)) { |
661 | mca_bucket_free(b); | 661 | mca_bucket_free(b); |
662 | mca_data_free(b); | 662 | mca_data_free(b); |
663 | rw_unlock(true, b); | 663 | rw_unlock(true, b); |
664 | --nr; | 664 | --nr; |
665 | } else | 665 | } else |
666 | b->accessed = 0; | 666 | b->accessed = 0; |
667 | } | 667 | } |
668 | out: | 668 | out: |
669 | nr = mca_can_free(c) * c->btree_pages; | 669 | nr = mca_can_free(c) * c->btree_pages; |
670 | mutex_unlock(&c->bucket_lock); | 670 | mutex_unlock(&c->bucket_lock); |
671 | return nr; | 671 | return nr; |
672 | } | 672 | } |
673 | 673 | ||
674 | void bch_btree_cache_free(struct cache_set *c) | 674 | void bch_btree_cache_free(struct cache_set *c) |
675 | { | 675 | { |
676 | struct btree *b; | 676 | struct btree *b; |
677 | struct closure cl; | 677 | struct closure cl; |
678 | closure_init_stack(&cl); | 678 | closure_init_stack(&cl); |
679 | 679 | ||
680 | if (c->shrink.list.next) | 680 | if (c->shrink.list.next) |
681 | unregister_shrinker(&c->shrink); | 681 | unregister_shrinker(&c->shrink); |
682 | 682 | ||
683 | mutex_lock(&c->bucket_lock); | 683 | mutex_lock(&c->bucket_lock); |
684 | 684 | ||
685 | #ifdef CONFIG_BCACHE_DEBUG | 685 | #ifdef CONFIG_BCACHE_DEBUG |
686 | if (c->verify_data) | 686 | if (c->verify_data) |
687 | list_move(&c->verify_data->list, &c->btree_cache); | 687 | list_move(&c->verify_data->list, &c->btree_cache); |
688 | #endif | 688 | #endif |
689 | 689 | ||
690 | list_splice(&c->btree_cache_freeable, | 690 | list_splice(&c->btree_cache_freeable, |
691 | &c->btree_cache); | 691 | &c->btree_cache); |
692 | 692 | ||
693 | while (!list_empty(&c->btree_cache)) { | 693 | while (!list_empty(&c->btree_cache)) { |
694 | b = list_first_entry(&c->btree_cache, struct btree, list); | 694 | b = list_first_entry(&c->btree_cache, struct btree, list); |
695 | 695 | ||
696 | if (btree_node_dirty(b)) | 696 | if (btree_node_dirty(b)) |
697 | btree_complete_write(b, btree_current_write(b)); | 697 | btree_complete_write(b, btree_current_write(b)); |
698 | clear_bit(BTREE_NODE_dirty, &b->flags); | 698 | clear_bit(BTREE_NODE_dirty, &b->flags); |
699 | 699 | ||
700 | mca_data_free(b); | 700 | mca_data_free(b); |
701 | } | 701 | } |
702 | 702 | ||
703 | while (!list_empty(&c->btree_cache_freed)) { | 703 | while (!list_empty(&c->btree_cache_freed)) { |
704 | b = list_first_entry(&c->btree_cache_freed, | 704 | b = list_first_entry(&c->btree_cache_freed, |
705 | struct btree, list); | 705 | struct btree, list); |
706 | list_del(&b->list); | 706 | list_del(&b->list); |
707 | cancel_delayed_work_sync(&b->work); | 707 | cancel_delayed_work_sync(&b->work); |
708 | kfree(b); | 708 | kfree(b); |
709 | } | 709 | } |
710 | 710 | ||
711 | mutex_unlock(&c->bucket_lock); | 711 | mutex_unlock(&c->bucket_lock); |
712 | } | 712 | } |
713 | 713 | ||
714 | int bch_btree_cache_alloc(struct cache_set *c) | 714 | int bch_btree_cache_alloc(struct cache_set *c) |
715 | { | 715 | { |
716 | unsigned i; | 716 | unsigned i; |
717 | 717 | ||
718 | /* XXX: doesn't check for errors */ | 718 | /* XXX: doesn't check for errors */ |
719 | 719 | ||
720 | closure_init_unlocked(&c->gc); | 720 | closure_init_unlocked(&c->gc); |
721 | 721 | ||
722 | for (i = 0; i < mca_reserve(c); i++) | 722 | for (i = 0; i < mca_reserve(c); i++) |
723 | mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); | 723 | mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); |
724 | 724 | ||
725 | list_splice_init(&c->btree_cache, | 725 | list_splice_init(&c->btree_cache, |
726 | &c->btree_cache_freeable); | 726 | &c->btree_cache_freeable); |
727 | 727 | ||
728 | #ifdef CONFIG_BCACHE_DEBUG | 728 | #ifdef CONFIG_BCACHE_DEBUG |
729 | mutex_init(&c->verify_lock); | 729 | mutex_init(&c->verify_lock); |
730 | 730 | ||
731 | c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); | 731 | c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); |
732 | 732 | ||
733 | if (c->verify_data && | 733 | if (c->verify_data && |
734 | c->verify_data->sets[0].data) | 734 | c->verify_data->sets[0].data) |
735 | list_del_init(&c->verify_data->list); | 735 | list_del_init(&c->verify_data->list); |
736 | else | 736 | else |
737 | c->verify_data = NULL; | 737 | c->verify_data = NULL; |
738 | #endif | 738 | #endif |
739 | 739 | ||
740 | c->shrink.shrink = bch_mca_shrink; | 740 | c->shrink.shrink = bch_mca_shrink; |
741 | c->shrink.seeks = 4; | 741 | c->shrink.seeks = 4; |
742 | c->shrink.batch = c->btree_pages * 2; | 742 | c->shrink.batch = c->btree_pages * 2; |
743 | register_shrinker(&c->shrink); | 743 | register_shrinker(&c->shrink); |
744 | 744 | ||
745 | return 0; | 745 | return 0; |
746 | } | 746 | } |
747 | 747 | ||
748 | /* Btree in memory cache - hash table */ | 748 | /* Btree in memory cache - hash table */ |
749 | 749 | ||
750 | static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k) | 750 | static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k) |
751 | { | 751 | { |
752 | return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)]; | 752 | return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)]; |
753 | } | 753 | } |
754 | 754 | ||
755 | static struct btree *mca_find(struct cache_set *c, struct bkey *k) | 755 | static struct btree *mca_find(struct cache_set *c, struct bkey *k) |
756 | { | 756 | { |
757 | struct btree *b; | 757 | struct btree *b; |
758 | 758 | ||
759 | rcu_read_lock(); | 759 | rcu_read_lock(); |
760 | hlist_for_each_entry_rcu(b, mca_hash(c, k), hash) | 760 | hlist_for_each_entry_rcu(b, mca_hash(c, k), hash) |
761 | if (PTR_HASH(c, &b->key) == PTR_HASH(c, k)) | 761 | if (PTR_HASH(c, &b->key) == PTR_HASH(c, k)) |
762 | goto out; | 762 | goto out; |
763 | b = NULL; | 763 | b = NULL; |
764 | out: | 764 | out: |
765 | rcu_read_unlock(); | 765 | rcu_read_unlock(); |
766 | return b; | 766 | return b; |
767 | } | 767 | } |
768 | 768 | ||
769 | static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, | 769 | static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, |
770 | int level, struct closure *cl) | 770 | int level, struct closure *cl) |
771 | { | 771 | { |
772 | int ret = -ENOMEM; | 772 | int ret = -ENOMEM; |
773 | struct btree *i; | 773 | struct btree *i; |
774 | 774 | ||
775 | trace_bcache_btree_cache_cannibalize(c); | 775 | trace_bcache_btree_cache_cannibalize(c); |
776 | 776 | ||
777 | if (!cl) | 777 | if (!cl) |
778 | return ERR_PTR(-ENOMEM); | 778 | return ERR_PTR(-ENOMEM); |
779 | 779 | ||
780 | /* | 780 | /* |
781 | * Trying to free up some memory - i.e. reuse some btree nodes - may | 781 | * Trying to free up some memory - i.e. reuse some btree nodes - may |
782 | * require initiating IO to flush the dirty part of the node. If we're | 782 | * require initiating IO to flush the dirty part of the node. If we're |
783 | * running under generic_make_request(), that IO will never finish and | 783 | * running under generic_make_request(), that IO will never finish and |
784 | * we would deadlock. Returning -EAGAIN causes the cache lookup code to | 784 | * we would deadlock. Returning -EAGAIN causes the cache lookup code to |
785 | * punt to workqueue and retry. | 785 | * punt to workqueue and retry. |
786 | */ | 786 | */ |
787 | if (current->bio_list) | 787 | if (current->bio_list) |
788 | return ERR_PTR(-EAGAIN); | 788 | return ERR_PTR(-EAGAIN); |
789 | 789 | ||
790 | if (c->try_harder && c->try_harder != cl) { | 790 | if (c->try_harder && c->try_harder != cl) { |
791 | closure_wait_event_async(&c->try_wait, cl, !c->try_harder); | 791 | closure_wait_event_async(&c->try_wait, cl, !c->try_harder); |
792 | return ERR_PTR(-EAGAIN); | 792 | return ERR_PTR(-EAGAIN); |
793 | } | 793 | } |
794 | 794 | ||
795 | c->try_harder = cl; | 795 | c->try_harder = cl; |
796 | c->try_harder_start = local_clock(); | 796 | c->try_harder_start = local_clock(); |
797 | retry: | 797 | retry: |
798 | list_for_each_entry_reverse(i, &c->btree_cache, list) { | 798 | list_for_each_entry_reverse(i, &c->btree_cache, list) { |
799 | int r = mca_reap(i, cl, btree_order(k)); | 799 | int r = mca_reap(i, cl, btree_order(k)); |
800 | if (!r) | 800 | if (!r) |
801 | return i; | 801 | return i; |
802 | if (r != -ENOMEM) | 802 | if (r != -ENOMEM) |
803 | ret = r; | 803 | ret = r; |
804 | } | 804 | } |
805 | 805 | ||
806 | if (ret == -EAGAIN && | 806 | if (ret == -EAGAIN && |
807 | closure_blocking(cl)) { | 807 | closure_blocking(cl)) { |
808 | mutex_unlock(&c->bucket_lock); | 808 | mutex_unlock(&c->bucket_lock); |
809 | closure_sync(cl); | 809 | closure_sync(cl); |
810 | mutex_lock(&c->bucket_lock); | 810 | mutex_lock(&c->bucket_lock); |
811 | goto retry; | 811 | goto retry; |
812 | } | 812 | } |
813 | 813 | ||
814 | return ERR_PTR(ret); | 814 | return ERR_PTR(ret); |
815 | } | 815 | } |
816 | 816 | ||
817 | /* | 817 | /* |
818 | * We can only have one thread cannibalizing other cached btree nodes at a time, | 818 | * We can only have one thread cannibalizing other cached btree nodes at a time, |
819 | * or we'll deadlock. We use an open coded mutex to ensure that, which a | 819 | * or we'll deadlock. We use an open coded mutex to ensure that, which a |
820 | * cannibalize_bucket() will take. This means every time we unlock the root of | 820 | * cannibalize_bucket() will take. This means every time we unlock the root of |
821 | * the btree, we need to release this lock if we have it held. | 821 | * the btree, we need to release this lock if we have it held. |
822 | */ | 822 | */ |
823 | void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) | 823 | void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) |
824 | { | 824 | { |
825 | if (c->try_harder == cl) { | 825 | if (c->try_harder == cl) { |
826 | bch_time_stats_update(&c->try_harder_time, c->try_harder_start); | 826 | bch_time_stats_update(&c->try_harder_time, c->try_harder_start); |
827 | c->try_harder = NULL; | 827 | c->try_harder = NULL; |
828 | __closure_wake_up(&c->try_wait); | 828 | __closure_wake_up(&c->try_wait); |
829 | } | 829 | } |
830 | } | 830 | } |
831 | 831 | ||
832 | static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, | 832 | static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, |
833 | int level, struct closure *cl) | 833 | int level, struct closure *cl) |
834 | { | 834 | { |
835 | struct btree *b; | 835 | struct btree *b; |
836 | 836 | ||
837 | lockdep_assert_held(&c->bucket_lock); | 837 | lockdep_assert_held(&c->bucket_lock); |
838 | 838 | ||
839 | if (mca_find(c, k)) | 839 | if (mca_find(c, k)) |
840 | return NULL; | 840 | return NULL; |
841 | 841 | ||
842 | /* btree_free() doesn't free memory; it sticks the node on the end of | 842 | /* btree_free() doesn't free memory; it sticks the node on the end of |
843 | * the list. Check if there's any freed nodes there: | 843 | * the list. Check if there's any freed nodes there: |
844 | */ | 844 | */ |
845 | list_for_each_entry(b, &c->btree_cache_freeable, list) | 845 | list_for_each_entry(b, &c->btree_cache_freeable, list) |
846 | if (!mca_reap(b, NULL, btree_order(k))) | 846 | if (!mca_reap(b, NULL, btree_order(k))) |
847 | goto out; | 847 | goto out; |
848 | 848 | ||
849 | /* We never free struct btree itself, just the memory that holds the on | 849 | /* We never free struct btree itself, just the memory that holds the on |
850 | * disk node. Check the freed list before allocating a new one: | 850 | * disk node. Check the freed list before allocating a new one: |
851 | */ | 851 | */ |
852 | list_for_each_entry(b, &c->btree_cache_freed, list) | 852 | list_for_each_entry(b, &c->btree_cache_freed, list) |
853 | if (!mca_reap(b, NULL, 0)) { | 853 | if (!mca_reap(b, NULL, 0)) { |
854 | mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); | 854 | mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); |
855 | if (!b->sets[0].data) | 855 | if (!b->sets[0].data) |
856 | goto err; | 856 | goto err; |
857 | else | 857 | else |
858 | goto out; | 858 | goto out; |
859 | } | 859 | } |
860 | 860 | ||
861 | b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO); | 861 | b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO); |
862 | if (!b) | 862 | if (!b) |
863 | goto err; | 863 | goto err; |
864 | 864 | ||
865 | BUG_ON(!down_write_trylock(&b->lock)); | 865 | BUG_ON(!down_write_trylock(&b->lock)); |
866 | if (!b->sets->data) | 866 | if (!b->sets->data) |
867 | goto err; | 867 | goto err; |
868 | out: | 868 | out: |
869 | BUG_ON(!closure_is_unlocked(&b->io.cl)); | 869 | BUG_ON(!closure_is_unlocked(&b->io.cl)); |
870 | 870 | ||
871 | bkey_copy(&b->key, k); | 871 | bkey_copy(&b->key, k); |
872 | list_move(&b->list, &c->btree_cache); | 872 | list_move(&b->list, &c->btree_cache); |
873 | hlist_del_init_rcu(&b->hash); | 873 | hlist_del_init_rcu(&b->hash); |
874 | hlist_add_head_rcu(&b->hash, mca_hash(c, k)); | 874 | hlist_add_head_rcu(&b->hash, mca_hash(c, k)); |
875 | 875 | ||
876 | lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); | 876 | lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); |
877 | b->level = level; | 877 | b->level = level; |
878 | 878 | ||
879 | mca_reinit(b); | 879 | mca_reinit(b); |
880 | 880 | ||
881 | return b; | 881 | return b; |
882 | err: | 882 | err: |
883 | if (b) | 883 | if (b) |
884 | rw_unlock(true, b); | 884 | rw_unlock(true, b); |
885 | 885 | ||
886 | b = mca_cannibalize(c, k, level, cl); | 886 | b = mca_cannibalize(c, k, level, cl); |
887 | if (!IS_ERR(b)) | 887 | if (!IS_ERR(b)) |
888 | goto out; | 888 | goto out; |
889 | 889 | ||
890 | return b; | 890 | return b; |
891 | } | 891 | } |
892 | 892 | ||
893 | /** | 893 | /** |
894 | * bch_btree_node_get - find a btree node in the cache and lock it, reading it | 894 | * bch_btree_node_get - find a btree node in the cache and lock it, reading it |
895 | * in from disk if necessary. | 895 | * in from disk if necessary. |
896 | * | 896 | * |
897 | * If IO is necessary, it uses the closure embedded in struct btree_op to wait; | 897 | * If IO is necessary, it uses the closure embedded in struct btree_op to wait; |
898 | * if that closure is in non blocking mode, will return -EAGAIN. | 898 | * if that closure is in non blocking mode, will return -EAGAIN. |
899 | * | 899 | * |
900 | * The btree node will have either a read or a write lock held, depending on | 900 | * The btree node will have either a read or a write lock held, depending on |
901 | * level and op->lock. | 901 | * level and op->lock. |
902 | */ | 902 | */ |
903 | struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, | 903 | struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, |
904 | int level, struct btree_op *op) | 904 | int level, struct btree_op *op) |
905 | { | 905 | { |
906 | int i = 0; | 906 | int i = 0; |
907 | bool write = level <= op->lock; | 907 | bool write = level <= op->lock; |
908 | struct btree *b; | 908 | struct btree *b; |
909 | 909 | ||
910 | BUG_ON(level < 0); | 910 | BUG_ON(level < 0); |
911 | retry: | 911 | retry: |
912 | b = mca_find(c, k); | 912 | b = mca_find(c, k); |
913 | 913 | ||
914 | if (!b) { | 914 | if (!b) { |
915 | if (current->bio_list) | 915 | if (current->bio_list) |
916 | return ERR_PTR(-EAGAIN); | 916 | return ERR_PTR(-EAGAIN); |
917 | 917 | ||
918 | mutex_lock(&c->bucket_lock); | 918 | mutex_lock(&c->bucket_lock); |
919 | b = mca_alloc(c, k, level, &op->cl); | 919 | b = mca_alloc(c, k, level, &op->cl); |
920 | mutex_unlock(&c->bucket_lock); | 920 | mutex_unlock(&c->bucket_lock); |
921 | 921 | ||
922 | if (!b) | 922 | if (!b) |
923 | goto retry; | 923 | goto retry; |
924 | if (IS_ERR(b)) | 924 | if (IS_ERR(b)) |
925 | return b; | 925 | return b; |
926 | 926 | ||
927 | bch_btree_node_read(b); | 927 | bch_btree_node_read(b); |
928 | 928 | ||
929 | if (!write) | 929 | if (!write) |
930 | downgrade_write(&b->lock); | 930 | downgrade_write(&b->lock); |
931 | } else { | 931 | } else { |
932 | rw_lock(write, b, level); | 932 | rw_lock(write, b, level); |
933 | if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) { | 933 | if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) { |
934 | rw_unlock(write, b); | 934 | rw_unlock(write, b); |
935 | goto retry; | 935 | goto retry; |
936 | } | 936 | } |
937 | BUG_ON(b->level != level); | 937 | BUG_ON(b->level != level); |
938 | } | 938 | } |
939 | 939 | ||
940 | b->accessed = 1; | 940 | b->accessed = 1; |
941 | 941 | ||
942 | for (; i <= b->nsets && b->sets[i].size; i++) { | 942 | for (; i <= b->nsets && b->sets[i].size; i++) { |
943 | prefetch(b->sets[i].tree); | 943 | prefetch(b->sets[i].tree); |
944 | prefetch(b->sets[i].data); | 944 | prefetch(b->sets[i].data); |
945 | } | 945 | } |
946 | 946 | ||
947 | for (; i <= b->nsets; i++) | 947 | for (; i <= b->nsets; i++) |
948 | prefetch(b->sets[i].data); | 948 | prefetch(b->sets[i].data); |
949 | 949 | ||
950 | if (btree_node_io_error(b)) { | 950 | if (btree_node_io_error(b)) { |
951 | rw_unlock(write, b); | 951 | rw_unlock(write, b); |
952 | return ERR_PTR(-EIO); | 952 | return ERR_PTR(-EIO); |
953 | } | 953 | } |
954 | 954 | ||
955 | BUG_ON(!b->written); | 955 | BUG_ON(!b->written); |
956 | 956 | ||
957 | return b; | 957 | return b; |
958 | } | 958 | } |
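/*
 * A sketch of the expected caller pattern for the helper above (illustrative
 * only, not part of btree.c): callers must treat the return value as an
 * ERR_PTR() - -EAGAIN when the op's closure is non-blocking or we are already
 * under generic_make_request(), -EIO when the node had a read error - and
 * they drop the node with rw_unlock(), passing the same write/read choice
 * that bch_btree_node_get() derived from level <= op->lock:
 *
 *	struct btree *n = bch_btree_node_get(c, k, level, op);
 *	if (IS_ERR(n))
 *		return PTR_ERR(n);
 *	... use n ...
 *	rw_unlock(level <= op->lock, n);
 */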
959 | 959 | ||
960 | static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) | 960 | static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) |
961 | { | 961 | { |
962 | struct btree *b; | 962 | struct btree *b; |
963 | 963 | ||
964 | mutex_lock(&c->bucket_lock); | 964 | mutex_lock(&c->bucket_lock); |
965 | b = mca_alloc(c, k, level, NULL); | 965 | b = mca_alloc(c, k, level, NULL); |
966 | mutex_unlock(&c->bucket_lock); | 966 | mutex_unlock(&c->bucket_lock); |
967 | 967 | ||
968 | if (!IS_ERR_OR_NULL(b)) { | 968 | if (!IS_ERR_OR_NULL(b)) { |
969 | bch_btree_node_read(b); | 969 | bch_btree_node_read(b); |
970 | rw_unlock(true, b); | 970 | rw_unlock(true, b); |
971 | } | 971 | } |
972 | } | 972 | } |
973 | 973 | ||
974 | /* Btree alloc */ | 974 | /* Btree alloc */ |
975 | 975 | ||
976 | static void btree_node_free(struct btree *b, struct btree_op *op) | 976 | static void btree_node_free(struct btree *b, struct btree_op *op) |
977 | { | 977 | { |
978 | unsigned i; | 978 | unsigned i; |
979 | 979 | ||
980 | trace_bcache_btree_node_free(b); | 980 | trace_bcache_btree_node_free(b); |
981 | 981 | ||
982 | /* | 982 | /* |
983 | * The BUG_ON() in btree_node_get() implies that we must have a write | 983 | * The BUG_ON() in btree_node_get() implies that we must have a write |
984 | * lock on parent to free or even invalidate a node | 984 | * lock on parent to free or even invalidate a node |
985 | */ | 985 | */ |
986 | BUG_ON(op->lock <= b->level); | 986 | BUG_ON(op->lock <= b->level); |
987 | BUG_ON(b == b->c->root); | 987 | BUG_ON(b == b->c->root); |
988 | 988 | ||
989 | if (btree_node_dirty(b)) | 989 | if (btree_node_dirty(b)) |
990 | btree_complete_write(b, btree_current_write(b)); | 990 | btree_complete_write(b, btree_current_write(b)); |
991 | clear_bit(BTREE_NODE_dirty, &b->flags); | 991 | clear_bit(BTREE_NODE_dirty, &b->flags); |
992 | 992 | ||
993 | cancel_delayed_work(&b->work); | 993 | cancel_delayed_work(&b->work); |
994 | 994 | ||
995 | mutex_lock(&b->c->bucket_lock); | 995 | mutex_lock(&b->c->bucket_lock); |
996 | 996 | ||
997 | for (i = 0; i < KEY_PTRS(&b->key); i++) { | 997 | for (i = 0; i < KEY_PTRS(&b->key); i++) { |
998 | BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin)); | 998 | BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin)); |
999 | 999 | ||
1000 | bch_inc_gen(PTR_CACHE(b->c, &b->key, i), | 1000 | bch_inc_gen(PTR_CACHE(b->c, &b->key, i), |
1001 | PTR_BUCKET(b->c, &b->key, i)); | 1001 | PTR_BUCKET(b->c, &b->key, i)); |
1002 | } | 1002 | } |
1003 | 1003 | ||
1004 | bch_bucket_free(b->c, &b->key); | 1004 | bch_bucket_free(b->c, &b->key); |
1005 | mca_bucket_free(b); | 1005 | mca_bucket_free(b); |
1006 | mutex_unlock(&b->c->bucket_lock); | 1006 | mutex_unlock(&b->c->bucket_lock); |
1007 | } | 1007 | } |
1008 | 1008 | ||
1009 | struct btree *bch_btree_node_alloc(struct cache_set *c, int level, | 1009 | struct btree *bch_btree_node_alloc(struct cache_set *c, int level, |
1010 | struct closure *cl) | 1010 | struct closure *cl) |
1011 | { | 1011 | { |
1012 | BKEY_PADDED(key) k; | 1012 | BKEY_PADDED(key) k; |
1013 | struct btree *b = ERR_PTR(-EAGAIN); | 1013 | struct btree *b = ERR_PTR(-EAGAIN); |
1014 | 1014 | ||
1015 | mutex_lock(&c->bucket_lock); | 1015 | mutex_lock(&c->bucket_lock); |
1016 | retry: | 1016 | retry: |
1017 | if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl)) | 1017 | if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl)) |
1018 | goto err; | 1018 | goto err; |
1019 | 1019 | ||
1020 | SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); | 1020 | SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); |
1021 | 1021 | ||
1022 | b = mca_alloc(c, &k.key, level, cl); | 1022 | b = mca_alloc(c, &k.key, level, cl); |
1023 | if (IS_ERR(b)) | 1023 | if (IS_ERR(b)) |
1024 | goto err_free; | 1024 | goto err_free; |
1025 | 1025 | ||
1026 | if (!b) { | 1026 | if (!b) { |
1027 | cache_bug(c, | 1027 | cache_bug(c, |
1028 | "Tried to allocate bucket that was in btree cache"); | 1028 | "Tried to allocate bucket that was in btree cache"); |
1029 | __bkey_put(c, &k.key); | 1029 | __bkey_put(c, &k.key); |
1030 | goto retry; | 1030 | goto retry; |
1031 | } | 1031 | } |
1032 | 1032 | ||
1033 | b->accessed = 1; | 1033 | b->accessed = 1; |
1034 | bch_bset_init_next(b); | 1034 | bch_bset_init_next(b); |
1035 | 1035 | ||
1036 | mutex_unlock(&c->bucket_lock); | 1036 | mutex_unlock(&c->bucket_lock); |
1037 | 1037 | ||
1038 | trace_bcache_btree_node_alloc(b); | 1038 | trace_bcache_btree_node_alloc(b); |
1039 | return b; | 1039 | return b; |
1040 | err_free: | 1040 | err_free: |
1041 | bch_bucket_free(c, &k.key); | 1041 | bch_bucket_free(c, &k.key); |
1042 | __bkey_put(c, &k.key); | 1042 | __bkey_put(c, &k.key); |
1043 | err: | 1043 | err: |
1044 | mutex_unlock(&c->bucket_lock); | 1044 | mutex_unlock(&c->bucket_lock); |
1045 | 1045 | ||
1046 | trace_bcache_btree_node_alloc_fail(b); | 1046 | trace_bcache_btree_node_alloc_fail(b); |
1047 | return b; | 1047 | return b; |
1048 | } | 1048 | } |
1049 | 1049 | ||
1050 | static struct btree *btree_node_alloc_replacement(struct btree *b, | 1050 | static struct btree *btree_node_alloc_replacement(struct btree *b, |
1051 | struct closure *cl) | 1051 | struct closure *cl) |
1052 | { | 1052 | { |
1053 | struct btree *n = bch_btree_node_alloc(b->c, b->level, cl); | 1053 | struct btree *n = bch_btree_node_alloc(b->c, b->level, cl); |
1054 | if (!IS_ERR_OR_NULL(n)) | 1054 | if (!IS_ERR_OR_NULL(n)) |
1055 | bch_btree_sort_into(b, n); | 1055 | bch_btree_sort_into(b, n); |
1056 | 1056 | ||
1057 | return n; | 1057 | return n; |
1058 | } | 1058 | } |
1059 | 1059 | ||
1060 | /* Garbage collection */ | 1060 | /* Garbage collection */ |
1061 | 1061 | ||
1062 | uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) | 1062 | uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) |
1063 | { | 1063 | { |
1064 | uint8_t stale = 0; | 1064 | uint8_t stale = 0; |
1065 | unsigned i; | 1065 | unsigned i; |
1066 | struct bucket *g; | 1066 | struct bucket *g; |
1067 | 1067 | ||
1068 | /* | 1068 | /* |
1069 | * ptr_invalid() can't return true for the keys that mark btree nodes as | 1069 | * ptr_invalid() can't return true for the keys that mark btree nodes as |
1070 | * freed, but since ptr_bad() returns true we'll never actually use them | 1070 | * freed, but since ptr_bad() returns true we'll never actually use them |
1071 | * for anything and thus we don't want to mark their pointers here | 1071 | * for anything and thus we don't want to mark their pointers here |
1072 | */ | 1072 | */ |
1073 | if (!bkey_cmp(k, &ZERO_KEY)) | 1073 | if (!bkey_cmp(k, &ZERO_KEY)) |
1074 | return stale; | 1074 | return stale; |
1075 | 1075 | ||
1076 | for (i = 0; i < KEY_PTRS(k); i++) { | 1076 | for (i = 0; i < KEY_PTRS(k); i++) { |
1077 | if (!ptr_available(c, k, i)) | 1077 | if (!ptr_available(c, k, i)) |
1078 | continue; | 1078 | continue; |
1079 | 1079 | ||
1080 | g = PTR_BUCKET(c, k, i); | 1080 | g = PTR_BUCKET(c, k, i); |
1081 | 1081 | ||
1082 | if (gen_after(g->gc_gen, PTR_GEN(k, i))) | 1082 | if (gen_after(g->gc_gen, PTR_GEN(k, i))) |
1083 | g->gc_gen = PTR_GEN(k, i); | 1083 | g->gc_gen = PTR_GEN(k, i); |
1084 | 1084 | ||
1085 | if (ptr_stale(c, k, i)) { | 1085 | if (ptr_stale(c, k, i)) { |
1086 | stale = max(stale, ptr_stale(c, k, i)); | 1086 | stale = max(stale, ptr_stale(c, k, i)); |
1087 | continue; | 1087 | continue; |
1088 | } | 1088 | } |
1089 | 1089 | ||
1090 | cache_bug_on(GC_MARK(g) && | 1090 | cache_bug_on(GC_MARK(g) && |
1091 | (GC_MARK(g) == GC_MARK_METADATA) != (level != 0), | 1091 | (GC_MARK(g) == GC_MARK_METADATA) != (level != 0), |
1092 | c, "inconsistent ptrs: mark = %llu, level = %i", | 1092 | c, "inconsistent ptrs: mark = %llu, level = %i", |
1093 | GC_MARK(g), level); | 1093 | GC_MARK(g), level); |
1094 | 1094 | ||
1095 | if (level) | 1095 | if (level) |
1096 | SET_GC_MARK(g, GC_MARK_METADATA); | 1096 | SET_GC_MARK(g, GC_MARK_METADATA); |
1097 | else if (KEY_DIRTY(k)) | 1097 | else if (KEY_DIRTY(k)) |
1098 | SET_GC_MARK(g, GC_MARK_DIRTY); | 1098 | SET_GC_MARK(g, GC_MARK_DIRTY); |
1099 | 1099 | ||
1100 | /* guard against overflow */ | 1100 | /* guard against overflow */ |
1101 | SET_GC_SECTORS_USED(g, min_t(unsigned, | 1101 | SET_GC_SECTORS_USED(g, min_t(unsigned, |
1102 | GC_SECTORS_USED(g) + KEY_SIZE(k), | 1102 | GC_SECTORS_USED(g) + KEY_SIZE(k), |
1103 | (1 << 14) - 1)); | 1103 | (1 << 14) - 1)); |
1104 | 1104 | ||
1105 | BUG_ON(!GC_SECTORS_USED(g)); | 1105 | BUG_ON(!GC_SECTORS_USED(g)); |
1106 | } | 1106 | } |
1107 | 1107 | ||
1108 | return stale; | 1108 | return stale; |
1109 | } | 1109 | } |
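/*
 * Worked example for the min_t() clamp above: GC_SECTORS_USED() is capped at
 * (1 << 14) - 1 = 16383 sectors (presumably the width of the field), so a
 * bucket already accounting 16000 sectors that gains a 1024-sector key is
 * recorded as 16383 rather than overflowing.
 */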
1110 | 1110 | ||
1111 | #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) | 1111 | #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) |
1112 | 1112 | ||
1113 | static int btree_gc_mark_node(struct btree *b, unsigned *keys, | 1113 | static int btree_gc_mark_node(struct btree *b, unsigned *keys, |
1114 | struct gc_stat *gc) | 1114 | struct gc_stat *gc) |
1115 | { | 1115 | { |
1116 | uint8_t stale = 0; | 1116 | uint8_t stale = 0; |
1117 | unsigned last_dev = -1; | 1117 | unsigned last_dev = -1; |
1118 | struct bcache_device *d = NULL; | 1118 | struct bcache_device *d = NULL; |
1119 | struct bkey *k; | 1119 | struct bkey *k; |
1120 | struct btree_iter iter; | 1120 | struct btree_iter iter; |
1121 | struct bset_tree *t; | 1121 | struct bset_tree *t; |
1122 | 1122 | ||
1123 | gc->nodes++; | 1123 | gc->nodes++; |
1124 | 1124 | ||
1125 | for_each_key_filter(b, k, &iter, bch_ptr_invalid) { | 1125 | for_each_key_filter(b, k, &iter, bch_ptr_invalid) { |
1126 | if (last_dev != KEY_INODE(k)) { | 1126 | if (last_dev != KEY_INODE(k)) { |
1127 | last_dev = KEY_INODE(k); | 1127 | last_dev = KEY_INODE(k); |
1128 | 1128 | ||
1129 | d = KEY_INODE(k) < b->c->nr_uuids | 1129 | d = KEY_INODE(k) < b->c->nr_uuids |
1130 | ? b->c->devices[last_dev] | 1130 | ? b->c->devices[last_dev] |
1131 | : NULL; | 1131 | : NULL; |
1132 | } | 1132 | } |
1133 | 1133 | ||
1134 | stale = max(stale, btree_mark_key(b, k)); | 1134 | stale = max(stale, btree_mark_key(b, k)); |
1135 | 1135 | ||
1136 | if (bch_ptr_bad(b, k)) | 1136 | if (bch_ptr_bad(b, k)) |
1137 | continue; | 1137 | continue; |
1138 | 1138 | ||
1139 | *keys += bkey_u64s(k); | 1139 | *keys += bkey_u64s(k); |
1140 | 1140 | ||
1141 | gc->key_bytes += bkey_u64s(k); | 1141 | gc->key_bytes += bkey_u64s(k); |
1142 | gc->nkeys++; | 1142 | gc->nkeys++; |
1143 | 1143 | ||
1144 | gc->data += KEY_SIZE(k); | 1144 | gc->data += KEY_SIZE(k); |
1145 | if (KEY_DIRTY(k)) | 1145 | if (KEY_DIRTY(k)) |
1146 | gc->dirty += KEY_SIZE(k); | 1146 | gc->dirty += KEY_SIZE(k); |
1147 | } | 1147 | } |
1148 | 1148 | ||
1149 | for (t = b->sets; t <= &b->sets[b->nsets]; t++) | 1149 | for (t = b->sets; t <= &b->sets[b->nsets]; t++) |
1150 | btree_bug_on(t->size && | 1150 | btree_bug_on(t->size && |
1151 | bset_written(b, t) && | 1151 | bset_written(b, t) && |
1152 | bkey_cmp(&b->key, &t->end) < 0, | 1152 | bkey_cmp(&b->key, &t->end) < 0, |
1153 | b, "found short btree key in gc"); | 1153 | b, "found short btree key in gc"); |
1154 | 1154 | ||
1155 | return stale; | 1155 | return stale; |
1156 | } | 1156 | } |
1157 | 1157 | ||
1158 | static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, | 1158 | static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, |
1159 | struct btree_op *op) | 1159 | struct btree_op *op) |
1160 | { | 1160 | { |
1161 | /* | 1161 | /* |
1162 | * We block priorities from being written for the duration of garbage | 1162 | * We block priorities from being written for the duration of garbage |
1163 | * collection, so we can't sleep in btree_alloc() -> | 1163 | * collection, so we can't sleep in btree_alloc() -> |
1164 | * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it | 1164 | * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it |
1165 | * our closure. | 1165 | * our closure. |
1166 | */ | 1166 | */ |
1167 | struct btree *n = btree_node_alloc_replacement(b, NULL); | 1167 | struct btree *n = btree_node_alloc_replacement(b, NULL); |
1168 | 1168 | ||
1169 | if (!IS_ERR_OR_NULL(n)) { | 1169 | if (!IS_ERR_OR_NULL(n)) { |
1170 | swap(b, n); | 1170 | swap(b, n); |
1171 | __bkey_put(b->c, &b->key); | 1171 | __bkey_put(b->c, &b->key); |
1172 | 1172 | ||
1173 | memcpy(k->ptr, b->key.ptr, | 1173 | memcpy(k->ptr, b->key.ptr, |
1174 | sizeof(uint64_t) * KEY_PTRS(&b->key)); | 1174 | sizeof(uint64_t) * KEY_PTRS(&b->key)); |
1175 | 1175 | ||
1176 | btree_node_free(n, op); | 1176 | btree_node_free(n, op); |
1177 | up_write(&n->lock); | 1177 | up_write(&n->lock); |
1178 | } | 1178 | } |
1179 | 1179 | ||
1180 | return b; | 1180 | return b; |
1181 | } | 1181 | } |
1182 | 1182 | ||
1183 | /* | 1183 | /* |
1184 | * Leaving this at 2 until we've got incremental garbage collection done; it | 1184 | * Leaving this at 2 until we've got incremental garbage collection done; it |
1185 | * could be higher (and has been tested with 4) except that garbage collection | 1185 | * could be higher (and has been tested with 4) except that garbage collection |
1186 | * could take much longer, adversely affecting latency. | 1186 | * could take much longer, adversely affecting latency. |
1187 | */ | 1187 | */ |
1188 | #define GC_MERGE_NODES 2U | 1188 | #define GC_MERGE_NODES 2U |
1189 | 1189 | ||
1190 | struct gc_merge_info { | 1190 | struct gc_merge_info { |
1191 | struct btree *b; | 1191 | struct btree *b; |
1192 | struct bkey *k; | 1192 | struct bkey *k; |
1193 | unsigned keys; | 1193 | unsigned keys; |
1194 | }; | 1194 | }; |
1195 | 1195 | ||
1196 | static void btree_gc_coalesce(struct btree *b, struct btree_op *op, | 1196 | static void btree_gc_coalesce(struct btree *b, struct btree_op *op, |
1197 | struct gc_stat *gc, struct gc_merge_info *r) | 1197 | struct gc_stat *gc, struct gc_merge_info *r) |
1198 | { | 1198 | { |
1199 | unsigned nodes = 0, keys = 0, blocks; | 1199 | unsigned nodes = 0, keys = 0, blocks; |
1200 | int i; | 1200 | int i; |
1201 | 1201 | ||
1202 | while (nodes < GC_MERGE_NODES && r[nodes].b) | 1202 | while (nodes < GC_MERGE_NODES && r[nodes].b) |
1203 | keys += r[nodes++].keys; | 1203 | keys += r[nodes++].keys; |
1204 | 1204 | ||
1205 | blocks = btree_default_blocks(b->c) * 2 / 3; | 1205 | blocks = btree_default_blocks(b->c) * 2 / 3; |
1206 | 1206 | ||
1207 | if (nodes < 2 || | 1207 | if (nodes < 2 || |
1208 | __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) | 1208 | __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) |
1209 | return; | 1209 | return; |
1210 | 1210 | ||
1211 | for (i = nodes - 1; i >= 0; --i) { | 1211 | for (i = nodes - 1; i >= 0; --i) { |
1212 | if (r[i].b->written) | 1212 | if (r[i].b->written) |
1213 | r[i].b = btree_gc_alloc(r[i].b, r[i].k, op); | 1213 | r[i].b = btree_gc_alloc(r[i].b, r[i].k, op); |
1214 | 1214 | ||
1215 | if (r[i].b->written) | 1215 | if (r[i].b->written) |
1216 | return; | 1216 | return; |
1217 | } | 1217 | } |
1218 | 1218 | ||
1219 | for (i = nodes - 1; i > 0; --i) { | 1219 | for (i = nodes - 1; i > 0; --i) { |
1220 | struct bset *n1 = r[i].b->sets->data; | 1220 | struct bset *n1 = r[i].b->sets->data; |
1221 | struct bset *n2 = r[i - 1].b->sets->data; | 1221 | struct bset *n2 = r[i - 1].b->sets->data; |
1222 | struct bkey *k, *last = NULL; | 1222 | struct bkey *k, *last = NULL; |
1223 | 1223 | ||
1224 | keys = 0; | 1224 | keys = 0; |
1225 | 1225 | ||
1226 | if (i == 1) { | 1226 | if (i == 1) { |
1227 | /* | 1227 | /* |
1228 | * Last node we're not getting rid of - we're getting | 1228 | * Last node we're not getting rid of - we're getting |
1229 | * rid of the node at r[0]. Have to try and fit all of | 1229 | * rid of the node at r[0]. Have to try and fit all of |
1230 | * the remaining keys into this node; we can't ensure | 1230 | * the remaining keys into this node; we can't ensure |
1231 | * they will always fit due to rounding and variable | 1231 | * they will always fit due to rounding and variable |
1232 | * length keys (shouldn't be possible in practice, | 1232 | * length keys (shouldn't be possible in practice, |
1233 | * though) | 1233 | * though) |
1234 | */ | 1234 | */ |
1235 | if (__set_blocks(n1, n1->keys + r->keys, | 1235 | if (__set_blocks(n1, n1->keys + r->keys, |
1236 | b->c) > btree_blocks(r[i].b)) | 1236 | b->c) > btree_blocks(r[i].b)) |
1237 | return; | 1237 | return; |
1238 | 1238 | ||
1239 | keys = n2->keys; | 1239 | keys = n2->keys; |
1240 | last = &r->b->key; | 1240 | last = &r->b->key; |
1241 | } else | 1241 | } else |
1242 | for (k = n2->start; | 1242 | for (k = n2->start; |
1243 | k < end(n2); | 1243 | k < end(n2); |
1244 | k = bkey_next(k)) { | 1244 | k = bkey_next(k)) { |
1245 | if (__set_blocks(n1, n1->keys + keys + | 1245 | if (__set_blocks(n1, n1->keys + keys + |
1246 | bkey_u64s(k), b->c) > blocks) | 1246 | bkey_u64s(k), b->c) > blocks) |
1247 | break; | 1247 | break; |
1248 | 1248 | ||
1249 | last = k; | 1249 | last = k; |
1250 | keys += bkey_u64s(k); | 1250 | keys += bkey_u64s(k); |
1251 | } | 1251 | } |
1252 | 1252 | ||
1253 | BUG_ON(__set_blocks(n1, n1->keys + keys, | 1253 | BUG_ON(__set_blocks(n1, n1->keys + keys, |
1254 | b->c) > btree_blocks(r[i].b)); | 1254 | b->c) > btree_blocks(r[i].b)); |
1255 | 1255 | ||
1256 | if (last) { | 1256 | if (last) { |
1257 | bkey_copy_key(&r[i].b->key, last); | 1257 | bkey_copy_key(&r[i].b->key, last); |
1258 | bkey_copy_key(r[i].k, last); | 1258 | bkey_copy_key(r[i].k, last); |
1259 | } | 1259 | } |
1260 | 1260 | ||
1261 | memcpy(end(n1), | 1261 | memcpy(end(n1), |
1262 | n2->start, | 1262 | n2->start, |
1263 | (void *) node(n2, keys) - (void *) n2->start); | 1263 | (void *) node(n2, keys) - (void *) n2->start); |
1264 | 1264 | ||
1265 | n1->keys += keys; | 1265 | n1->keys += keys; |
1266 | 1266 | ||
1267 | memmove(n2->start, | 1267 | memmove(n2->start, |
1268 | node(n2, keys), | 1268 | node(n2, keys), |
1269 | (void *) end(n2) - (void *) node(n2, keys)); | 1269 | (void *) end(n2) - (void *) node(n2, keys)); |
1270 | 1270 | ||
1271 | n2->keys -= keys; | 1271 | n2->keys -= keys; |
1272 | 1272 | ||
1273 | r[i].keys = n1->keys; | 1273 | r[i].keys = n1->keys; |
1274 | r[i - 1].keys = n2->keys; | 1274 | r[i - 1].keys = n2->keys; |
1275 | } | 1275 | } |
1276 | 1276 | ||
1277 | btree_node_free(r->b, op); | 1277 | btree_node_free(r->b, op); |
1278 | up_write(&r->b->lock); | 1278 | up_write(&r->b->lock); |
1279 | 1279 | ||
1280 | trace_bcache_btree_gc_coalesce(nodes); | 1280 | trace_bcache_btree_gc_coalesce(nodes); |
1281 | 1281 | ||
1282 | gc->nodes--; | 1282 | gc->nodes--; |
1283 | nodes--; | 1283 | nodes--; |
1284 | 1284 | ||
1285 | memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes); | 1285 | memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes); |
1286 | memset(&r[nodes], 0, sizeof(struct gc_merge_info)); | 1286 | memset(&r[nodes], 0, sizeof(struct gc_merge_info)); |
1287 | } | 1287 | } |
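/*
 * Net effect of the coalesce above, with GC_MERGE_NODES == 2: the keys of two
 * adjacent nodes are merged only if they fit in (nodes - 1) == 1 node filled
 * to roughly two thirds (blocks = btree_default_blocks() * 2 / 3); keys flow
 * from r[0] into r[1], the emptied node at r[0] is freed, and the window is
 * shifted down by one entry.
 */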
1288 | 1288 | ||
1289 | static int btree_gc_recurse(struct btree *b, struct btree_op *op, | 1289 | static int btree_gc_recurse(struct btree *b, struct btree_op *op, |
1290 | struct closure *writes, struct gc_stat *gc) | 1290 | struct closure *writes, struct gc_stat *gc) |
1291 | { | 1291 | { |
1292 | void write(struct btree *r) | 1292 | void write(struct btree *r) |
1293 | { | 1293 | { |
1294 | if (!r->written) | 1294 | if (!r->written) |
1295 | bch_btree_node_write(r, &op->cl); | 1295 | bch_btree_node_write(r, &op->cl); |
1296 | else if (btree_node_dirty(r)) | 1296 | else if (btree_node_dirty(r)) |
1297 | bch_btree_node_write(r, writes); | 1297 | bch_btree_node_write(r, writes); |
1298 | 1298 | ||
1299 | up_write(&r->lock); | 1299 | up_write(&r->lock); |
1300 | } | 1300 | } |
1301 | 1301 | ||
1302 | int ret = 0, stale; | 1302 | int ret = 0, stale; |
1303 | unsigned i; | 1303 | unsigned i; |
1304 | struct gc_merge_info r[GC_MERGE_NODES]; | 1304 | struct gc_merge_info r[GC_MERGE_NODES]; |
1305 | 1305 | ||
1306 | memset(r, 0, sizeof(r)); | 1306 | memset(r, 0, sizeof(r)); |
1307 | 1307 | ||
1308 | while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { | 1308 | while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { |
1309 | r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op); | 1309 | r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op); |
1310 | 1310 | ||
1311 | if (IS_ERR(r->b)) { | 1311 | if (IS_ERR(r->b)) { |
1312 | ret = PTR_ERR(r->b); | 1312 | ret = PTR_ERR(r->b); |
1313 | break; | 1313 | break; |
1314 | } | 1314 | } |
1315 | 1315 | ||
1316 | r->keys = 0; | 1316 | r->keys = 0; |
1317 | stale = btree_gc_mark_node(r->b, &r->keys, gc); | 1317 | stale = btree_gc_mark_node(r->b, &r->keys, gc); |
1318 | 1318 | ||
1319 | if (!b->written && | 1319 | if (!b->written && |
1320 | (r->b->level || stale > 10 || | 1320 | (r->b->level || stale > 10 || |
1321 | b->c->gc_always_rewrite)) | 1321 | b->c->gc_always_rewrite)) |
1322 | r->b = btree_gc_alloc(r->b, r->k, op); | 1322 | r->b = btree_gc_alloc(r->b, r->k, op); |
1323 | 1323 | ||
1324 | if (r->b->level) | 1324 | if (r->b->level) |
1325 | ret = btree_gc_recurse(r->b, op, writes, gc); | 1325 | ret = btree_gc_recurse(r->b, op, writes, gc); |
1326 | 1326 | ||
1327 | if (ret) { | 1327 | if (ret) { |
1328 | write(r->b); | 1328 | write(r->b); |
1329 | break; | 1329 | break; |
1330 | } | 1330 | } |
1331 | 1331 | ||
1332 | bkey_copy_key(&b->c->gc_done, r->k); | 1332 | bkey_copy_key(&b->c->gc_done, r->k); |
1333 | 1333 | ||
1334 | if (!b->written) | 1334 | if (!b->written) |
1335 | btree_gc_coalesce(b, op, gc, r); | 1335 | btree_gc_coalesce(b, op, gc, r); |
1336 | 1336 | ||
1337 | if (r[GC_MERGE_NODES - 1].b) | 1337 | if (r[GC_MERGE_NODES - 1].b) |
1338 | write(r[GC_MERGE_NODES - 1].b); | 1338 | write(r[GC_MERGE_NODES - 1].b); |
1339 | 1339 | ||
1340 | memmove(&r[1], &r[0], | 1340 | memmove(&r[1], &r[0], |
1341 | sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1)); | 1341 | sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1)); |
1342 | 1342 | ||
1343 | /* When we've got incremental GC working, we'll want to do | 1343 | /* When we've got incremental GC working, we'll want to do |
1344 | * if (should_resched()) | 1344 | * if (should_resched()) |
1345 | * return -EAGAIN; | 1345 | * return -EAGAIN; |
1346 | */ | 1346 | */ |
1347 | cond_resched(); | 1347 | cond_resched(); |
1348 | #if 0 | 1348 | #if 0 |
1349 | if (need_resched()) { | 1349 | if (need_resched()) { |
1350 | ret = -EAGAIN; | 1350 | ret = -EAGAIN; |
1351 | break; | 1351 | break; |
1352 | } | 1352 | } |
1353 | #endif | 1353 | #endif |
1354 | } | 1354 | } |
1355 | 1355 | ||
1356 | for (i = 1; i < GC_MERGE_NODES && r[i].b; i++) | 1356 | for (i = 1; i < GC_MERGE_NODES && r[i].b; i++) |
1357 | write(r[i].b); | 1357 | write(r[i].b); |
1358 | 1358 | ||
1359 | /* Might have freed some children, must remove their keys */ | 1359 | /* Might have freed some children, must remove their keys */ |
1360 | if (!b->written) | 1360 | if (!b->written) |
1361 | bch_btree_sort(b); | 1361 | bch_btree_sort(b); |
1362 | 1362 | ||
1363 | return ret; | 1363 | return ret; |
1364 | } | 1364 | } |
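/*
 * The r[] array above acts as a sliding window over the last GC_MERGE_NODES
 * children visited: each new child is shifted in at r[0], coalescing is
 * attempted across the window, and the node falling out at
 * r[GC_MERGE_NODES - 1] is written back (to the op's closure if it was never
 * written, to the writes closure if merely dirty) and unlocked.
 */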
1365 | 1365 | ||
1366 | static int bch_btree_gc_root(struct btree *b, struct btree_op *op, | 1366 | static int bch_btree_gc_root(struct btree *b, struct btree_op *op, |
1367 | struct closure *writes, struct gc_stat *gc) | 1367 | struct closure *writes, struct gc_stat *gc) |
1368 | { | 1368 | { |
1369 | struct btree *n = NULL; | 1369 | struct btree *n = NULL; |
1370 | unsigned keys = 0; | 1370 | unsigned keys = 0; |
1371 | int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); | 1371 | int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); |
1372 | 1372 | ||
1373 | if (b->level || stale > 10) | 1373 | if (b->level || stale > 10) |
1374 | n = btree_node_alloc_replacement(b, NULL); | 1374 | n = btree_node_alloc_replacement(b, NULL); |
1375 | 1375 | ||
1376 | if (!IS_ERR_OR_NULL(n)) | 1376 | if (!IS_ERR_OR_NULL(n)) |
1377 | swap(b, n); | 1377 | swap(b, n); |
1378 | 1378 | ||
1379 | if (b->level) | 1379 | if (b->level) |
1380 | ret = btree_gc_recurse(b, op, writes, gc); | 1380 | ret = btree_gc_recurse(b, op, writes, gc); |
1381 | 1381 | ||
1382 | if (!b->written || btree_node_dirty(b)) { | 1382 | if (!b->written || btree_node_dirty(b)) { |
1383 | bch_btree_node_write(b, n ? &op->cl : NULL); | 1383 | bch_btree_node_write(b, n ? &op->cl : NULL); |
1384 | } | 1384 | } |
1385 | 1385 | ||
1386 | if (!IS_ERR_OR_NULL(n)) { | 1386 | if (!IS_ERR_OR_NULL(n)) { |
1387 | closure_sync(&op->cl); | 1387 | closure_sync(&op->cl); |
1388 | bch_btree_set_root(b); | 1388 | bch_btree_set_root(b); |
1389 | btree_node_free(n, op); | 1389 | btree_node_free(n, op); |
1390 | rw_unlock(true, b); | 1390 | rw_unlock(true, b); |
1391 | } | 1391 | } |
1392 | 1392 | ||
1393 | return ret; | 1393 | return ret; |
1394 | } | 1394 | } |
1395 | 1395 | ||
1396 | static void btree_gc_start(struct cache_set *c) | 1396 | static void btree_gc_start(struct cache_set *c) |
1397 | { | 1397 | { |
1398 | struct cache *ca; | 1398 | struct cache *ca; |
1399 | struct bucket *b; | 1399 | struct bucket *b; |
1400 | unsigned i; | 1400 | unsigned i; |
1401 | 1401 | ||
1402 | if (!c->gc_mark_valid) | 1402 | if (!c->gc_mark_valid) |
1403 | return; | 1403 | return; |
1404 | 1404 | ||
1405 | mutex_lock(&c->bucket_lock); | 1405 | mutex_lock(&c->bucket_lock); |
1406 | 1406 | ||
1407 | c->gc_mark_valid = 0; | 1407 | c->gc_mark_valid = 0; |
1408 | c->gc_done = ZERO_KEY; | 1408 | c->gc_done = ZERO_KEY; |
1409 | 1409 | ||
1410 | for_each_cache(ca, c, i) | 1410 | for_each_cache(ca, c, i) |
1411 | for_each_bucket(b, ca) { | 1411 | for_each_bucket(b, ca) { |
1412 | b->gc_gen = b->gen; | 1412 | b->gc_gen = b->gen; |
1413 | if (!atomic_read(&b->pin)) | 1413 | if (!atomic_read(&b->pin)) |
1414 | SET_GC_MARK(b, GC_MARK_RECLAIMABLE); | 1414 | SET_GC_MARK(b, GC_MARK_RECLAIMABLE); |
1415 | } | 1415 | } |
1416 | 1416 | ||
1417 | mutex_unlock(&c->bucket_lock); | 1417 | mutex_unlock(&c->bucket_lock); |
1418 | } | 1418 | } |
1419 | 1419 | ||
1420 | size_t bch_btree_gc_finish(struct cache_set *c) | 1420 | size_t bch_btree_gc_finish(struct cache_set *c) |
1421 | { | 1421 | { |
1422 | size_t available = 0; | 1422 | size_t available = 0; |
1423 | struct bucket *b; | 1423 | struct bucket *b; |
1424 | struct cache *ca; | 1424 | struct cache *ca; |
1425 | unsigned i; | 1425 | unsigned i; |
1426 | 1426 | ||
1427 | mutex_lock(&c->bucket_lock); | 1427 | mutex_lock(&c->bucket_lock); |
1428 | 1428 | ||
1429 | set_gc_sectors(c); | 1429 | set_gc_sectors(c); |
1430 | c->gc_mark_valid = 1; | 1430 | c->gc_mark_valid = 1; |
1431 | c->need_gc = 0; | 1431 | c->need_gc = 0; |
1432 | 1432 | ||
1433 | if (c->root) | 1433 | if (c->root) |
1434 | for (i = 0; i < KEY_PTRS(&c->root->key); i++) | 1434 | for (i = 0; i < KEY_PTRS(&c->root->key); i++) |
1435 | SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i), | 1435 | SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i), |
1436 | GC_MARK_METADATA); | 1436 | GC_MARK_METADATA); |
1437 | 1437 | ||
1438 | for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) | 1438 | for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) |
1439 | SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), | 1439 | SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), |
1440 | GC_MARK_METADATA); | 1440 | GC_MARK_METADATA); |
1441 | 1441 | ||
1442 | for_each_cache(ca, c, i) { | 1442 | for_each_cache(ca, c, i) { |
1443 | uint64_t *i; | 1443 | uint64_t *i; |
1444 | 1444 | ||
1445 | ca->invalidate_needs_gc = 0; | 1445 | ca->invalidate_needs_gc = 0; |
1446 | 1446 | ||
1447 | for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++) | 1447 | for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++) |
1448 | SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); | 1448 | SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); |
1449 | 1449 | ||
1450 | for (i = ca->prio_buckets; | 1450 | for (i = ca->prio_buckets; |
1451 | i < ca->prio_buckets + prio_buckets(ca) * 2; i++) | 1451 | i < ca->prio_buckets + prio_buckets(ca) * 2; i++) |
1452 | SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); | 1452 | SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); |
1453 | 1453 | ||
1454 | for_each_bucket(b, ca) { | 1454 | for_each_bucket(b, ca) { |
1455 | b->last_gc = b->gc_gen; | 1455 | b->last_gc = b->gc_gen; |
1456 | c->need_gc = max(c->need_gc, bucket_gc_gen(b)); | 1456 | c->need_gc = max(c->need_gc, bucket_gc_gen(b)); |
1457 | 1457 | ||
1458 | if (!atomic_read(&b->pin) && | 1458 | if (!atomic_read(&b->pin) && |
1459 | GC_MARK(b) == GC_MARK_RECLAIMABLE) { | 1459 | GC_MARK(b) == GC_MARK_RECLAIMABLE) { |
1460 | available++; | 1460 | available++; |
1461 | if (!GC_SECTORS_USED(b)) | 1461 | if (!GC_SECTORS_USED(b)) |
1462 | bch_bucket_add_unused(ca, b); | 1462 | bch_bucket_add_unused(ca, b); |
1463 | } | 1463 | } |
1464 | } | 1464 | } |
1465 | } | 1465 | } |
1466 | 1466 | ||
1467 | mutex_unlock(&c->bucket_lock); | 1467 | mutex_unlock(&c->bucket_lock); |
1468 | return available; | 1468 | return available; |
1469 | } | 1469 | } |
1470 | 1470 | ||
1471 | static void bch_btree_gc(struct closure *cl) | 1471 | static void bch_btree_gc(struct closure *cl) |
1472 | { | 1472 | { |
1473 | struct cache_set *c = container_of(cl, struct cache_set, gc.cl); | 1473 | struct cache_set *c = container_of(cl, struct cache_set, gc.cl); |
1474 | int ret; | 1474 | int ret; |
1475 | unsigned long available; | 1475 | unsigned long available; |
1476 | struct gc_stat stats; | 1476 | struct gc_stat stats; |
1477 | struct closure writes; | 1477 | struct closure writes; |
1478 | struct btree_op op; | 1478 | struct btree_op op; |
1479 | uint64_t start_time = local_clock(); | 1479 | uint64_t start_time = local_clock(); |
1480 | 1480 | ||
1481 | trace_bcache_gc_start(c); | 1481 | trace_bcache_gc_start(c); |
1482 | 1482 | ||
1483 | memset(&stats, 0, sizeof(struct gc_stat)); | 1483 | memset(&stats, 0, sizeof(struct gc_stat)); |
1484 | closure_init_stack(&writes); | 1484 | closure_init_stack(&writes); |
1485 | bch_btree_op_init_stack(&op); | 1485 | bch_btree_op_init_stack(&op); |
1486 | op.lock = SHRT_MAX; | 1486 | op.lock = SHRT_MAX; |
1487 | 1487 | ||
1488 | btree_gc_start(c); | 1488 | btree_gc_start(c); |
1489 | 1489 | ||
1490 | atomic_inc(&c->prio_blocked); | 1490 | atomic_inc(&c->prio_blocked); |
1491 | 1491 | ||
1492 | ret = btree_root(gc_root, c, &op, &writes, &stats); | 1492 | ret = btree_root(gc_root, c, &op, &writes, &stats); |
1493 | closure_sync(&op.cl); | 1493 | closure_sync(&op.cl); |
1494 | closure_sync(&writes); | 1494 | closure_sync(&writes); |
1495 | 1495 | ||
1496 | if (ret) { | 1496 | if (ret) { |
1497 | pr_warn("gc failed!"); | 1497 | pr_warn("gc failed!"); |
1498 | continue_at(cl, bch_btree_gc, bch_gc_wq); | 1498 | continue_at(cl, bch_btree_gc, bch_gc_wq); |
1499 | } | 1499 | } |
1500 | 1500 | ||
1501 | /* Possibly wait for new UUIDs or whatever to hit disk */ | 1501 | /* Possibly wait for new UUIDs or whatever to hit disk */ |
1502 | bch_journal_meta(c, &op.cl); | 1502 | bch_journal_meta(c, &op.cl); |
1503 | closure_sync(&op.cl); | 1503 | closure_sync(&op.cl); |
1504 | 1504 | ||
1505 | available = bch_btree_gc_finish(c); | 1505 | available = bch_btree_gc_finish(c); |
1506 | 1506 | ||
1507 | atomic_dec(&c->prio_blocked); | 1507 | atomic_dec(&c->prio_blocked); |
1508 | wake_up_allocators(c); | 1508 | wake_up_allocators(c); |
1509 | 1509 | ||
1510 | bch_time_stats_update(&c->btree_gc_time, start_time); | 1510 | bch_time_stats_update(&c->btree_gc_time, start_time); |
1511 | 1511 | ||
1512 | stats.key_bytes *= sizeof(uint64_t); | 1512 | stats.key_bytes *= sizeof(uint64_t); |
1513 | stats.dirty <<= 9; | 1513 | stats.dirty <<= 9; |
1514 | stats.data <<= 9; | 1514 | stats.data <<= 9; |
1515 | stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; | 1515 | stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; |
1516 | memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); | 1516 | memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); |
1517 | 1517 | ||
1518 | trace_bcache_gc_end(c); | 1518 | trace_bcache_gc_end(c); |
1519 | 1519 | ||
1520 | continue_at(cl, bch_moving_gc, bch_gc_wq); | 1520 | continue_at(cl, bch_moving_gc, bch_gc_wq); |
1521 | } | 1521 | } |
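/*
 * The stats fixups above are unit conversions: key_bytes goes from u64s to
 * bytes, dirty and data are shifted by 9 to turn 512-byte sectors into bytes,
 * and in_use is a plain percentage - e.g. nbuckets == 100000 with 25000
 * buckets reported available works out to
 * (100000 - 25000) * 100 / 100000 == 75.
 */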
1522 | 1522 | ||
1523 | void bch_queue_gc(struct cache_set *c) | 1523 | void bch_queue_gc(struct cache_set *c) |
1524 | { | 1524 | { |
1525 | closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl); | 1525 | closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl); |
1526 | } | 1526 | } |
1527 | 1527 | ||
1528 | /* Initial partial gc */ | 1528 | /* Initial partial gc */ |
1529 | 1529 | ||
1530 | static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, | 1530 | static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, |
1531 | unsigned long **seen) | 1531 | unsigned long **seen) |
1532 | { | 1532 | { |
1533 | int ret; | 1533 | int ret; |
1534 | unsigned i; | 1534 | unsigned i; |
1535 | struct bkey *k; | 1535 | struct bkey *k; |
1536 | struct bucket *g; | 1536 | struct bucket *g; |
1537 | struct btree_iter iter; | 1537 | struct btree_iter iter; |
1538 | 1538 | ||
1539 | for_each_key_filter(b, k, &iter, bch_ptr_invalid) { | 1539 | for_each_key_filter(b, k, &iter, bch_ptr_invalid) { |
1540 | for (i = 0; i < KEY_PTRS(k); i++) { | 1540 | for (i = 0; i < KEY_PTRS(k); i++) { |
1541 | if (!ptr_available(b->c, k, i)) | 1541 | if (!ptr_available(b->c, k, i)) |
1542 | continue; | 1542 | continue; |
1543 | 1543 | ||
1544 | g = PTR_BUCKET(b->c, k, i); | 1544 | g = PTR_BUCKET(b->c, k, i); |
1545 | 1545 | ||
1546 | if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i), | 1546 | if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i), |
1547 | seen[PTR_DEV(k, i)]) || | 1547 | seen[PTR_DEV(k, i)]) || |
1548 | !ptr_stale(b->c, k, i)) { | 1548 | !ptr_stale(b->c, k, i)) { |
1549 | g->gen = PTR_GEN(k, i); | 1549 | g->gen = PTR_GEN(k, i); |
1550 | 1550 | ||
1551 | if (b->level) | 1551 | if (b->level) |
1552 | g->prio = BTREE_PRIO; | 1552 | g->prio = BTREE_PRIO; |
1553 | else if (g->prio == BTREE_PRIO) | 1553 | else if (g->prio == BTREE_PRIO) |
1554 | g->prio = INITIAL_PRIO; | 1554 | g->prio = INITIAL_PRIO; |
1555 | } | 1555 | } |
1556 | } | 1556 | } |
1557 | 1557 | ||
1558 | btree_mark_key(b, k); | 1558 | btree_mark_key(b, k); |
1559 | } | 1559 | } |
1560 | 1560 | ||
1561 | if (b->level) { | 1561 | if (b->level) { |
1562 | k = bch_next_recurse_key(b, &ZERO_KEY); | 1562 | k = bch_next_recurse_key(b, &ZERO_KEY); |
1563 | 1563 | ||
1564 | while (k) { | 1564 | while (k) { |
1565 | struct bkey *p = bch_next_recurse_key(b, k); | 1565 | struct bkey *p = bch_next_recurse_key(b, k); |
1566 | if (p) | 1566 | if (p) |
1567 | btree_node_prefetch(b->c, p, b->level - 1); | 1567 | btree_node_prefetch(b->c, p, b->level - 1); |
1568 | 1568 | ||
1569 | ret = btree(check_recurse, k, b, op, seen); | 1569 | ret = btree(check_recurse, k, b, op, seen); |
1570 | if (ret) | 1570 | if (ret) |
1571 | return ret; | 1571 | return ret; |
1572 | 1572 | ||
1573 | k = p; | 1573 | k = p; |
1574 | } | 1574 | } |
1575 | } | 1575 | } |
1576 | 1576 | ||
1577 | return 0; | 1577 | return 0; |
1578 | } | 1578 | } |
1579 | 1579 | ||
1580 | int bch_btree_check(struct cache_set *c, struct btree_op *op) | 1580 | int bch_btree_check(struct cache_set *c, struct btree_op *op) |
1581 | { | 1581 | { |
1582 | int ret = -ENOMEM; | 1582 | int ret = -ENOMEM; |
1583 | unsigned i; | 1583 | unsigned i; |
1584 | unsigned long *seen[MAX_CACHES_PER_SET]; | 1584 | unsigned long *seen[MAX_CACHES_PER_SET]; |
1585 | 1585 | ||
1586 | memset(seen, 0, sizeof(seen)); | 1586 | memset(seen, 0, sizeof(seen)); |
1587 | 1587 | ||
1588 | for (i = 0; c->cache[i]; i++) { | 1588 | for (i = 0; c->cache[i]; i++) { |
1589 | size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); | 1589 | size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); |
1590 | seen[i] = kmalloc(n, GFP_KERNEL); | 1590 | seen[i] = kmalloc(n, GFP_KERNEL); |
1591 | if (!seen[i]) | 1591 | if (!seen[i]) |
1592 | goto err; | 1592 | goto err; |
1593 | 1593 | ||
1594 | /* Disables the seen array until prio_read() uses it too */ | 1594 | /* Disables the seen array until prio_read() uses it too */ |
1595 | memset(seen[i], 0xFF, n); | 1595 | memset(seen[i], 0xFF, n); |
1596 | } | 1596 | } |
1597 | 1597 | ||
1598 | ret = btree_root(check_recurse, c, op, seen); | 1598 | ret = btree_root(check_recurse, c, op, seen); |
1599 | err: | 1599 | err: |
1600 | for (i = 0; i < MAX_CACHES_PER_SET; i++) | 1600 | for (i = 0; i < MAX_CACHES_PER_SET; i++) |
1601 | kfree(seen[i]); | 1601 | kfree(seen[i]); |
1602 | return ret; | 1602 | return ret; |
1603 | } | 1603 | } |
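/*
 * Sizing note for the seen[] bitmaps above: DIV_ROUND_UP(nbuckets, 8) bytes
 * gives one bit per bucket, so e.g. a cache with 1,000,000 buckets needs a
 * 125,000 byte allocation per cache device. Filling it with 0xFF up front
 * makes every bucket appear already seen, which (as the in-line comment says)
 * disables the check until prio_read() participates too.
 */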
1604 | 1604 | ||
1605 | /* Btree insertion */ | 1605 | /* Btree insertion */ |
1606 | 1606 | ||
1607 | static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) | 1607 | static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) |
1608 | { | 1608 | { |
1609 | struct bset *i = b->sets[b->nsets].data; | 1609 | struct bset *i = b->sets[b->nsets].data; |
1610 | 1610 | ||
1611 | memmove((uint64_t *) where + bkey_u64s(insert), | 1611 | memmove((uint64_t *) where + bkey_u64s(insert), |
1612 | where, | 1612 | where, |
1613 | (void *) end(i) - (void *) where); | 1613 | (void *) end(i) - (void *) where); |
1614 | 1614 | ||
1615 | i->keys += bkey_u64s(insert); | 1615 | i->keys += bkey_u64s(insert); |
1616 | bkey_copy(where, insert); | 1616 | bkey_copy(where, insert); |
1617 | bch_bset_fix_lookup_table(b, where); | 1617 | bch_bset_fix_lookup_table(b, where); |
1618 | } | 1618 | } |
1619 | 1619 | ||
1620 | static bool fix_overlapping_extents(struct btree *b, | 1620 | static bool fix_overlapping_extents(struct btree *b, |
1621 | struct bkey *insert, | 1621 | struct bkey *insert, |
1622 | struct btree_iter *iter, | 1622 | struct btree_iter *iter, |
1623 | struct btree_op *op) | 1623 | struct btree_op *op) |
1624 | { | 1624 | { |
1625 | void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) | 1625 | void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) |
1626 | { | 1626 | { |
1627 | if (KEY_DIRTY(k)) | 1627 | if (KEY_DIRTY(k)) |
1628 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), | 1628 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), |
1629 | offset, -sectors); | 1629 | offset, -sectors); |
1630 | } | 1630 | } |
1631 | 1631 | ||
1632 | uint64_t old_offset; | 1632 | uint64_t old_offset; |
1633 | unsigned old_size, sectors_found = 0; | 1633 | unsigned old_size, sectors_found = 0; |
1634 | 1634 | ||
1635 | while (1) { | 1635 | while (1) { |
1636 | struct bkey *k = bch_btree_iter_next(iter); | 1636 | struct bkey *k = bch_btree_iter_next(iter); |
1637 | if (!k || | 1637 | if (!k || |
1638 | bkey_cmp(&START_KEY(k), insert) >= 0) | 1638 | bkey_cmp(&START_KEY(k), insert) >= 0) |
1639 | break; | 1639 | break; |
1640 | 1640 | ||
1641 | if (bkey_cmp(k, &START_KEY(insert)) <= 0) | 1641 | if (bkey_cmp(k, &START_KEY(insert)) <= 0) |
1642 | continue; | 1642 | continue; |
1643 | 1643 | ||
1644 | old_offset = KEY_START(k); | 1644 | old_offset = KEY_START(k); |
1645 | old_size = KEY_SIZE(k); | 1645 | old_size = KEY_SIZE(k); |
1646 | 1646 | ||
1647 | /* | 1647 | /* |
1648 | * We might overlap with 0 size extents; we can't skip these | 1648 | * We might overlap with 0 size extents; we can't skip these |
1649 | * because if they're in the set we're inserting to we have to | 1649 | * because if they're in the set we're inserting to we have to |
1650 | * adjust them so they don't overlap with the key we're | 1650 | * adjust them so they don't overlap with the key we're |
1651 | * inserting. But we don't want to check them for BTREE_REPLACE | 1651 | * inserting. But we don't want to check them for BTREE_REPLACE |
1652 | * operations. | 1652 | * operations. |
1653 | */ | 1653 | */ |
1654 | 1654 | ||
1655 | if (op->type == BTREE_REPLACE && | 1655 | if (op->type == BTREE_REPLACE && |
1656 | KEY_SIZE(k)) { | 1656 | KEY_SIZE(k)) { |
1657 | /* | 1657 | /* |
1658 | * k might have been split since we inserted/found the | 1658 | * k might have been split since we inserted/found the |
1659 | * key we're replacing | 1659 | * key we're replacing |
1660 | */ | 1660 | */ |
1661 | unsigned i; | 1661 | unsigned i; |
1662 | uint64_t offset = KEY_START(k) - | 1662 | uint64_t offset = KEY_START(k) - |
1663 | KEY_START(&op->replace); | 1663 | KEY_START(&op->replace); |
1664 | 1664 | ||
1665 | /* But it must be a subset of the replace key */ | 1665 | /* But it must be a subset of the replace key */ |
1666 | if (KEY_START(k) < KEY_START(&op->replace) || | 1666 | if (KEY_START(k) < KEY_START(&op->replace) || |
1667 | KEY_OFFSET(k) > KEY_OFFSET(&op->replace)) | 1667 | KEY_OFFSET(k) > KEY_OFFSET(&op->replace)) |
1668 | goto check_failed; | 1668 | goto check_failed; |
1669 | 1669 | ||
1670 | /* We didn't find a key that we were supposed to */ | 1670 | /* We didn't find a key that we were supposed to */ |
1671 | if (KEY_START(k) > KEY_START(insert) + sectors_found) | 1671 | if (KEY_START(k) > KEY_START(insert) + sectors_found) |
1672 | goto check_failed; | 1672 | goto check_failed; |
1673 | 1673 | ||
1674 | if (KEY_PTRS(&op->replace) != KEY_PTRS(k)) | 1674 | if (KEY_PTRS(&op->replace) != KEY_PTRS(k)) |
1675 | goto check_failed; | 1675 | goto check_failed; |
1676 | 1676 | ||
1677 | /* skip past gen */ | 1677 | /* skip past gen */ |
1678 | offset <<= 8; | 1678 | offset <<= 8; |
1679 | 1679 | ||
1680 | BUG_ON(!KEY_PTRS(&op->replace)); | 1680 | BUG_ON(!KEY_PTRS(&op->replace)); |
1681 | 1681 | ||
1682 | for (i = 0; i < KEY_PTRS(&op->replace); i++) | 1682 | for (i = 0; i < KEY_PTRS(&op->replace); i++) |
1683 | if (k->ptr[i] != op->replace.ptr[i] + offset) | 1683 | if (k->ptr[i] != op->replace.ptr[i] + offset) |
1684 | goto check_failed; | 1684 | goto check_failed; |
1685 | 1685 | ||
1686 | sectors_found = KEY_OFFSET(k) - KEY_START(insert); | 1686 | sectors_found = KEY_OFFSET(k) - KEY_START(insert); |
1687 | } | 1687 | } |
1688 | 1688 | ||
1689 | if (bkey_cmp(insert, k) < 0 && | 1689 | if (bkey_cmp(insert, k) < 0 && |
1690 | bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { | 1690 | bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { |
1691 | /* | 1691 | /* |
1692 | * We overlapped in the middle of an existing key: that | 1692 | * We overlapped in the middle of an existing key: that |
1693 | * means we have to split the old key. But we have to do | 1693 | * means we have to split the old key. But we have to do |
1694 | * slightly different things depending on whether the | 1694 | * slightly different things depending on whether the |
1695 | * old key has been written out yet. | 1695 | * old key has been written out yet. |
1696 | */ | 1696 | */ |
1697 | 1697 | ||
1698 | struct bkey *top; | 1698 | struct bkey *top; |
1699 | 1699 | ||
1700 | subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); | 1700 | subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); |
1701 | 1701 | ||
1702 | if (bkey_written(b, k)) { | 1702 | if (bkey_written(b, k)) { |
1703 | /* | 1703 | /* |
1704 | * We insert a new key to cover the top of the | 1704 | * We insert a new key to cover the top of the |
1705 | * old key, and the old key is modified in place | 1705 | * old key, and the old key is modified in place |
1706 | * to represent the bottom split. | 1706 | * to represent the bottom split. |
1707 | * | 1707 | * |
1708 | * It's completely arbitrary whether the new key | 1708 | * It's completely arbitrary whether the new key |
1709 | * is the top or the bottom, but it has to match | 1709 | * is the top or the bottom, but it has to match |
1710 | * up with what btree_sort_fixup() does - it | 1710 | * up with what btree_sort_fixup() does - it |
1711 | * doesn't check for this kind of overlap, it | 1711 | * doesn't check for this kind of overlap, it |
1712 | * depends on us inserting a new key for the top | 1712 | * depends on us inserting a new key for the top |
1713 | * here. | 1713 | * here. |
1714 | */ | 1714 | */ |
1715 | top = bch_bset_search(b, &b->sets[b->nsets], | 1715 | top = bch_bset_search(b, &b->sets[b->nsets], |
1716 | insert); | 1716 | insert); |
1717 | shift_keys(b, top, k); | 1717 | shift_keys(b, top, k); |
1718 | } else { | 1718 | } else { |
1719 | BKEY_PADDED(key) temp; | 1719 | BKEY_PADDED(key) temp; |
1720 | bkey_copy(&temp.key, k); | 1720 | bkey_copy(&temp.key, k); |
1721 | shift_keys(b, k, &temp.key); | 1721 | shift_keys(b, k, &temp.key); |
1722 | top = bkey_next(k); | 1722 | top = bkey_next(k); |
1723 | } | 1723 | } |
1724 | 1724 | ||
1725 | bch_cut_front(insert, top); | 1725 | bch_cut_front(insert, top); |
1726 | bch_cut_back(&START_KEY(insert), k); | 1726 | bch_cut_back(&START_KEY(insert), k); |
1727 | bch_bset_fix_invalidated_key(b, k); | 1727 | bch_bset_fix_invalidated_key(b, k); |
1728 | return false; | 1728 | return false; |
1729 | } | 1729 | } |
1730 | 1730 | ||
1731 | if (bkey_cmp(insert, k) < 0) { | 1731 | if (bkey_cmp(insert, k) < 0) { |
1732 | bch_cut_front(insert, k); | 1732 | bch_cut_front(insert, k); |
1733 | } else { | 1733 | } else { |
1734 | if (bkey_written(b, k) && | 1734 | if (bkey_written(b, k) && |
1735 | bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { | 1735 | bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { |
1736 | /* | 1736 | /* |
1737 | * Completely overwrote, so we don't have to | 1737 | * Completely overwrote, so we don't have to |
1738 | * invalidate the binary search tree | 1738 | * invalidate the binary search tree |
1739 | */ | 1739 | */ |
1740 | bch_cut_front(k, k); | 1740 | bch_cut_front(k, k); |
1741 | } else { | 1741 | } else { |
1742 | __bch_cut_back(&START_KEY(insert), k); | 1742 | __bch_cut_back(&START_KEY(insert), k); |
1743 | bch_bset_fix_invalidated_key(b, k); | 1743 | bch_bset_fix_invalidated_key(b, k); |
1744 | } | 1744 | } |
1745 | } | 1745 | } |
1746 | 1746 | ||
1747 | subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); | 1747 | subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); |
1748 | } | 1748 | } |
1749 | 1749 | ||
1750 | check_failed: | 1750 | check_failed: |
1751 | if (op->type == BTREE_REPLACE) { | 1751 | if (op->type == BTREE_REPLACE) { |
1752 | if (!sectors_found) { | 1752 | if (!sectors_found) { |
1753 | op->insert_collision = true; | 1753 | op->insert_collision = true; |
1754 | return true; | 1754 | return true; |
1755 | } else if (sectors_found < KEY_SIZE(insert)) { | 1755 | } else if (sectors_found < KEY_SIZE(insert)) { |
1756 | SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - | 1756 | SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - |
1757 | (KEY_SIZE(insert) - sectors_found)); | 1757 | (KEY_SIZE(insert) - sectors_found)); |
1758 | SET_KEY_SIZE(insert, sectors_found); | 1758 | SET_KEY_SIZE(insert, sectors_found); |
1759 | } | 1759 | } |
1760 | } | 1760 | } |
1761 | 1761 | ||
1762 | return false; | 1762 | return false; |
1763 | } | 1763 | } |
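/*
 * Summary of the overlap handling above (a reader's sketch): an existing key
 * that completely surrounds the insert is split into a top and bottom half
 * around it; a key overlapping only one end is trimmed with bch_cut_front()
 * or bch_cut_back(); dirty sector counts are adjusted as extents shrink. For
 * BTREE_REPLACE the insert is shrunk to the sectors actually found, and a
 * true return (with op->insert_collision set) tells btree_insert_key() to
 * drop the insert entirely.
 */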
1764 | 1764 | ||
1765 | static bool btree_insert_key(struct btree *b, struct btree_op *op, | 1765 | static bool btree_insert_key(struct btree *b, struct btree_op *op, |
1766 | struct bkey *k) | 1766 | struct bkey *k) |
1767 | { | 1767 | { |
1768 | struct bset *i = b->sets[b->nsets].data; | 1768 | struct bset *i = b->sets[b->nsets].data; |
1769 | struct bkey *m, *prev; | 1769 | struct bkey *m, *prev; |
1770 | unsigned status = BTREE_INSERT_STATUS_INSERT; | 1770 | unsigned status = BTREE_INSERT_STATUS_INSERT; |
1771 | 1771 | ||
1772 | BUG_ON(bkey_cmp(k, &b->key) > 0); | 1772 | BUG_ON(bkey_cmp(k, &b->key) > 0); |
1773 | BUG_ON(b->level && !KEY_PTRS(k)); | 1773 | BUG_ON(b->level && !KEY_PTRS(k)); |
1774 | BUG_ON(!b->level && !KEY_OFFSET(k)); | 1774 | BUG_ON(!b->level && !KEY_OFFSET(k)); |
1775 | 1775 | ||
1776 | if (!b->level) { | 1776 | if (!b->level) { |
1777 | struct btree_iter iter; | 1777 | struct btree_iter iter; |
1778 | struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0); | 1778 | struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0); |
1779 | 1779 | ||
1780 | /* | 1780 | /* |
1781 | * bset_search() returns the first key that is strictly greater | 1781 | * bset_search() returns the first key that is strictly greater |
1782 | * than the search key - but for back merging, we want to find | 1782 | * than the search key - but for back merging, we want to find |
1783 | * the first key that is greater than or equal to KEY_START(k) - | 1783 | * the first key that is greater than or equal to KEY_START(k) - |
1784 | * unless KEY_START(k) is 0. | 1784 | * unless KEY_START(k) is 0. |
1785 | */ | 1785 | */ |
1786 | if (KEY_OFFSET(&search)) | 1786 | if (KEY_OFFSET(&search)) |
1787 | SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1); | 1787 | SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1); |
1788 | 1788 | ||
1789 | prev = NULL; | 1789 | prev = NULL; |
1790 | m = bch_btree_iter_init(b, &iter, &search); | 1790 | m = bch_btree_iter_init(b, &iter, &search); |
1791 | 1791 | ||
1792 | if (fix_overlapping_extents(b, k, &iter, op)) | 1792 | if (fix_overlapping_extents(b, k, &iter, op)) |
1793 | return false; | 1793 | return false; |
1794 | 1794 | ||
1795 | while (m != end(i) && | 1795 | while (m != end(i) && |
1796 | bkey_cmp(k, &START_KEY(m)) > 0) | 1796 | bkey_cmp(k, &START_KEY(m)) > 0) |
1797 | prev = m, m = bkey_next(m); | 1797 | prev = m, m = bkey_next(m); |
1798 | 1798 | ||
1799 | if (key_merging_disabled(b->c)) | 1799 | if (key_merging_disabled(b->c)) |
1800 | goto insert; | 1800 | goto insert; |
1801 | 1801 | ||
1802 | /* prev is in the tree, if we merge we're done */ | 1802 | /* prev is in the tree, if we merge we're done */ |
1803 | status = BTREE_INSERT_STATUS_BACK_MERGE; | 1803 | status = BTREE_INSERT_STATUS_BACK_MERGE; |
1804 | if (prev && | 1804 | if (prev && |
1805 | bch_bkey_try_merge(b, prev, k)) | 1805 | bch_bkey_try_merge(b, prev, k)) |
1806 | goto merged; | 1806 | goto merged; |
1807 | 1807 | ||
1808 | status = BTREE_INSERT_STATUS_OVERWROTE; | 1808 | status = BTREE_INSERT_STATUS_OVERWROTE; |
1809 | if (m != end(i) && | 1809 | if (m != end(i) && |
1810 | KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) | 1810 | KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) |
1811 | goto copy; | 1811 | goto copy; |
1812 | 1812 | ||
1813 | status = BTREE_INSERT_STATUS_FRONT_MERGE; | 1813 | status = BTREE_INSERT_STATUS_FRONT_MERGE; |
1814 | if (m != end(i) && | 1814 | if (m != end(i) && |
1815 | bch_bkey_try_merge(b, k, m)) | 1815 | bch_bkey_try_merge(b, k, m)) |
1816 | goto copy; | 1816 | goto copy; |
1817 | } else | 1817 | } else |
1818 | m = bch_bset_search(b, &b->sets[b->nsets], k); | 1818 | m = bch_bset_search(b, &b->sets[b->nsets], k); |
1819 | 1819 | ||
1820 | insert: shift_keys(b, m, k); | 1820 | insert: shift_keys(b, m, k); |
1821 | copy: bkey_copy(m, k); | 1821 | copy: bkey_copy(m, k); |
1822 | merged: | 1822 | merged: |
1823 | if (KEY_DIRTY(k)) | 1823 | if (KEY_DIRTY(k)) |
1824 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), | 1824 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), |
1825 | KEY_START(k), KEY_SIZE(k)); | 1825 | KEY_START(k), KEY_SIZE(k)); |
1826 | 1826 | ||
1827 | bch_check_keys(b, "%u for %s", status, op_type(op)); | 1827 | bch_check_keys(b, "%u for %s", status, op_type(op)); |
1828 | 1828 | ||
1829 | if (b->level && !KEY_OFFSET(k)) | 1829 | if (b->level && !KEY_OFFSET(k)) |
1830 | btree_current_write(b)->prio_blocked++; | 1830 | btree_current_write(b)->prio_blocked++; |
1831 | 1831 | ||
1832 | trace_bcache_btree_insert_key(b, k, op->type, status); | 1832 | trace_bcache_btree_insert_key(b, k, op->type, status); |
1833 | 1833 | ||
1834 | return true; | 1834 | return true; |
1835 | } | 1835 | } |
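/*
 * The status values traced above map onto the paths taken: BACK_MERGE when
 * the key was merged into the preceding extent, FRONT_MERGE when it was
 * merged into the following one, OVERWROTE when it reused a zero-size key
 * slot in place, and INSERT when shift_keys() had to open a gap for it.
 */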
1836 | 1836 | ||
1837 | static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) | 1837 | static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) |
1838 | { | 1838 | { |
1839 | bool ret = false; | 1839 | bool ret = false; |
1840 | struct bkey *k; | 1840 | struct bkey *k; |
1841 | unsigned oldsize = bch_count_data(b); | 1841 | unsigned oldsize = bch_count_data(b); |
1842 | 1842 | ||
1843 | while ((k = bch_keylist_pop(&op->keys))) { | 1843 | while ((k = bch_keylist_pop(&op->keys))) { |
1844 | bkey_put(b->c, k, b->level); | 1844 | bkey_put(b->c, k, b->level); |
1845 | ret |= btree_insert_key(b, op, k); | 1845 | ret |= btree_insert_key(b, op, k); |
1846 | } | 1846 | } |
1847 | 1847 | ||
1848 | BUG_ON(bch_count_data(b) < oldsize); | 1848 | BUG_ON(bch_count_data(b) < oldsize); |
1849 | return ret; | 1849 | return ret; |
1850 | } | 1850 | } |
1851 | 1851 | ||
1852 | bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, | 1852 | bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, |
1853 | struct bio *bio) | 1853 | struct bio *bio) |
1854 | { | 1854 | { |
1855 | bool ret = false; | 1855 | bool ret = false; |
1856 | uint64_t btree_ptr = b->key.ptr[0]; | 1856 | uint64_t btree_ptr = b->key.ptr[0]; |
1857 | unsigned long seq = b->seq; | 1857 | unsigned long seq = b->seq; |
1858 | BKEY_PADDED(k) tmp; | 1858 | BKEY_PADDED(k) tmp; |
1859 | 1859 | ||
1860 | rw_unlock(false, b); | 1860 | rw_unlock(false, b); |
1861 | rw_lock(true, b, b->level); | 1861 | rw_lock(true, b, b->level); |
1862 | 1862 | ||
1863 | if (b->key.ptr[0] != btree_ptr || | 1863 | if (b->key.ptr[0] != btree_ptr || |
1864 | b->seq != seq + 1 || | 1864 | b->seq != seq + 1 || |
1865 | should_split(b)) | 1865 | should_split(b)) |
1866 | goto out; | 1866 | goto out; |
1867 | 1867 | ||
1868 | op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio)); | 1868 | op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio)); |
1869 | 1869 | ||
1870 | SET_KEY_PTRS(&op->replace, 1); | 1870 | SET_KEY_PTRS(&op->replace, 1); |
1871 | get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); | 1871 | get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); |
1872 | 1872 | ||
1873 | SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV); | 1873 | SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV); |
1874 | 1874 | ||
1875 | bkey_copy(&tmp.k, &op->replace); | 1875 | bkey_copy(&tmp.k, &op->replace); |
1876 | 1876 | ||
1877 | BUG_ON(op->type != BTREE_INSERT); | 1877 | BUG_ON(op->type != BTREE_INSERT); |
1878 | BUG_ON(!btree_insert_key(b, op, &tmp.k)); | 1878 | BUG_ON(!btree_insert_key(b, op, &tmp.k)); |
1879 | ret = true; | 1879 | ret = true; |
1880 | out: | 1880 | out: |
1881 | downgrade_write(&b->lock); | 1881 | downgrade_write(&b->lock); |
1882 | return ret; | 1882 | return ret; |
1883 | } | 1883 | } |
1884 | 1884 | ||
1885 | static int btree_split(struct btree *b, struct btree_op *op) | 1885 | static int btree_split(struct btree *b, struct btree_op *op) |
1886 | { | 1886 | { |
1887 | bool split, root = b == b->c->root; | 1887 | bool split, root = b == b->c->root; |
1888 | struct btree *n1, *n2 = NULL, *n3 = NULL; | 1888 | struct btree *n1, *n2 = NULL, *n3 = NULL; |
1889 | uint64_t start_time = local_clock(); | 1889 | uint64_t start_time = local_clock(); |
1890 | 1890 | ||
1891 | if (b->level) | 1891 | if (b->level) |
1892 | set_closure_blocking(&op->cl); | 1892 | set_closure_blocking(&op->cl); |
1893 | 1893 | ||
1894 | n1 = btree_node_alloc_replacement(b, &op->cl); | 1894 | n1 = btree_node_alloc_replacement(b, &op->cl); |
1895 | if (IS_ERR(n1)) | 1895 | if (IS_ERR(n1)) |
1896 | goto err; | 1896 | goto err; |
1897 | 1897 | ||
1898 | split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; | 1898 | split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; |
1899 | 1899 | ||
1900 | if (split) { | 1900 | if (split) { |
1901 | unsigned keys = 0; | 1901 | unsigned keys = 0; |
1902 | 1902 | ||
1903 | trace_bcache_btree_node_split(b, n1->sets[0].data->keys); | 1903 | trace_bcache_btree_node_split(b, n1->sets[0].data->keys); |
1904 | 1904 | ||
1905 | n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); | 1905 | n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); |
1906 | if (IS_ERR(n2)) | 1906 | if (IS_ERR(n2)) |
1907 | goto err_free1; | 1907 | goto err_free1; |
1908 | 1908 | ||
1909 | if (root) { | 1909 | if (root) { |
1910 | n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl); | 1910 | n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl); |
1911 | if (IS_ERR(n3)) | 1911 | if (IS_ERR(n3)) |
1912 | goto err_free2; | 1912 | goto err_free2; |
1913 | } | 1913 | } |
1914 | 1914 | ||
1915 | bch_btree_insert_keys(n1, op); | 1915 | bch_btree_insert_keys(n1, op); |
1916 | 1916 | ||
1917 | /* Has to be a linear search because we don't have an auxiliary | 1917 | /* Has to be a linear search because we don't have an auxiliary |
1918 | * search tree yet | 1918 | * search tree yet |
1919 | */ | 1919 | */ |
1920 | 1920 | ||
1921 | while (keys < (n1->sets[0].data->keys * 3) / 5) | 1921 | while (keys < (n1->sets[0].data->keys * 3) / 5) |
1922 | keys += bkey_u64s(node(n1->sets[0].data, keys)); | 1922 | keys += bkey_u64s(node(n1->sets[0].data, keys)); |
1923 | 1923 | ||
1924 | bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); | 1924 | bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); |
1925 | keys += bkey_u64s(node(n1->sets[0].data, keys)); | 1925 | keys += bkey_u64s(node(n1->sets[0].data, keys)); |
1926 | 1926 | ||
1927 | n2->sets[0].data->keys = n1->sets[0].data->keys - keys; | 1927 | n2->sets[0].data->keys = n1->sets[0].data->keys - keys; |
1928 | n1->sets[0].data->keys = keys; | 1928 | n1->sets[0].data->keys = keys; |
1929 | 1929 | ||
1930 | memcpy(n2->sets[0].data->start, | 1930 | memcpy(n2->sets[0].data->start, |
1931 | end(n1->sets[0].data), | 1931 | end(n1->sets[0].data), |
1932 | n2->sets[0].data->keys * sizeof(uint64_t)); | 1932 | n2->sets[0].data->keys * sizeof(uint64_t)); |
1933 | 1933 | ||
1934 | bkey_copy_key(&n2->key, &b->key); | 1934 | bkey_copy_key(&n2->key, &b->key); |
1935 | 1935 | ||
1936 | bch_keylist_add(&op->keys, &n2->key); | 1936 | bch_keylist_add(&op->keys, &n2->key); |
1937 | bch_btree_node_write(n2, &op->cl); | 1937 | bch_btree_node_write(n2, &op->cl); |
1938 | rw_unlock(true, n2); | 1938 | rw_unlock(true, n2); |
1939 | } else { | 1939 | } else { |
1940 | trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); | 1940 | trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); |
1941 | 1941 | ||
1942 | bch_btree_insert_keys(n1, op); | 1942 | bch_btree_insert_keys(n1, op); |
1943 | } | 1943 | } |
1944 | 1944 | ||
1945 | bch_keylist_add(&op->keys, &n1->key); | 1945 | bch_keylist_add(&op->keys, &n1->key); |
1946 | bch_btree_node_write(n1, &op->cl); | 1946 | bch_btree_node_write(n1, &op->cl); |
1947 | 1947 | ||
1948 | if (n3) { | 1948 | if (n3) { |
1949 | bkey_copy_key(&n3->key, &MAX_KEY); | 1949 | bkey_copy_key(&n3->key, &MAX_KEY); |
1950 | bch_btree_insert_keys(n3, op); | 1950 | bch_btree_insert_keys(n3, op); |
1951 | bch_btree_node_write(n3, &op->cl); | 1951 | bch_btree_node_write(n3, &op->cl); |
1952 | 1952 | ||
1953 | closure_sync(&op->cl); | 1953 | closure_sync(&op->cl); |
1954 | bch_btree_set_root(n3); | 1954 | bch_btree_set_root(n3); |
1955 | rw_unlock(true, n3); | 1955 | rw_unlock(true, n3); |
1956 | } else if (root) { | 1956 | } else if (root) { |
1957 | op->keys.top = op->keys.bottom; | 1957 | op->keys.top = op->keys.bottom; |
1958 | closure_sync(&op->cl); | 1958 | closure_sync(&op->cl); |
1959 | bch_btree_set_root(n1); | 1959 | bch_btree_set_root(n1); |
1960 | } else { | 1960 | } else { |
1961 | unsigned i; | 1961 | unsigned i; |
1962 | 1962 | ||
1963 | bkey_copy(op->keys.top, &b->key); | 1963 | bkey_copy(op->keys.top, &b->key); |
1964 | bkey_copy_key(op->keys.top, &ZERO_KEY); | 1964 | bkey_copy_key(op->keys.top, &ZERO_KEY); |
1965 | 1965 | ||
1966 | for (i = 0; i < KEY_PTRS(&b->key); i++) { | 1966 | for (i = 0; i < KEY_PTRS(&b->key); i++) { |
1967 | uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1; | 1967 | uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1; |
1968 | 1968 | ||
1969 | SET_PTR_GEN(op->keys.top, i, g); | 1969 | SET_PTR_GEN(op->keys.top, i, g); |
1970 | } | 1970 | } |
1971 | 1971 | ||
1972 | bch_keylist_push(&op->keys); | 1972 | bch_keylist_push(&op->keys); |
1973 | closure_sync(&op->cl); | 1973 | closure_sync(&op->cl); |
1974 | atomic_inc(&b->c->prio_blocked); | 1974 | atomic_inc(&b->c->prio_blocked); |
1975 | } | 1975 | } |
1976 | 1976 | ||
1977 | rw_unlock(true, n1); | 1977 | rw_unlock(true, n1); |
1978 | btree_node_free(b, op); | 1978 | btree_node_free(b, op); |
1979 | 1979 | ||
1980 | bch_time_stats_update(&b->c->btree_split_time, start_time); | 1980 | bch_time_stats_update(&b->c->btree_split_time, start_time); |
1981 | 1981 | ||
1982 | return 0; | 1982 | return 0; |
1983 | err_free2: | 1983 | err_free2: |
1984 | __bkey_put(n2->c, &n2->key); | 1984 | __bkey_put(n2->c, &n2->key); |
1985 | btree_node_free(n2, op); | 1985 | btree_node_free(n2, op); |
1986 | rw_unlock(true, n2); | 1986 | rw_unlock(true, n2); |
1987 | err_free1: | 1987 | err_free1: |
1988 | __bkey_put(n1->c, &n1->key); | 1988 | __bkey_put(n1->c, &n1->key); |
1989 | btree_node_free(n1, op); | 1989 | btree_node_free(n1, op); |
1990 | rw_unlock(true, n1); | 1990 | rw_unlock(true, n1); |
1991 | err: | 1991 | err: |
1992 | if (n3 == ERR_PTR(-EAGAIN) || | 1992 | if (n3 == ERR_PTR(-EAGAIN) || |
1993 | n2 == ERR_PTR(-EAGAIN) || | 1993 | n2 == ERR_PTR(-EAGAIN) || |
1994 | n1 == ERR_PTR(-EAGAIN)) | 1994 | n1 == ERR_PTR(-EAGAIN)) |
1995 | return -EAGAIN; | 1995 | return -EAGAIN; |
1996 | 1996 | ||
1997 | pr_warn("couldn't split"); | 1997 | pr_warn("couldn't split"); |
1998 | return -ENOMEM; | 1998 | return -ENOMEM; |
1999 | } | 1999 | } |
2000 | 2000 | ||
2001 | static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, | 2001 | static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, |
2002 | struct keylist *stack_keys) | 2002 | struct keylist *stack_keys) |
2003 | { | 2003 | { |
2004 | if (b->level) { | 2004 | if (b->level) { |
2005 | int ret; | 2005 | int ret; |
2006 | struct bkey *insert = op->keys.bottom; | 2006 | struct bkey *insert = op->keys.bottom; |
2007 | struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert)); | 2007 | struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert)); |
2008 | 2008 | ||
2009 | if (!k) { | 2009 | if (!k) { |
2010 | btree_bug(b, "no key to recurse on at level %i/%i", | 2010 | btree_bug(b, "no key to recurse on at level %i/%i", |
2011 | b->level, b->c->root->level); | 2011 | b->level, b->c->root->level); |
2012 | 2012 | ||
2013 | op->keys.top = op->keys.bottom; | 2013 | op->keys.top = op->keys.bottom; |
2014 | return -EIO; | 2014 | return -EIO; |
2015 | } | 2015 | } |
2016 | 2016 | ||
2017 | if (bkey_cmp(insert, k) > 0) { | 2017 | if (bkey_cmp(insert, k) > 0) { |
2018 | unsigned i; | 2018 | unsigned i; |
2019 | 2019 | ||
2020 | if (op->type == BTREE_REPLACE) { | 2020 | if (op->type == BTREE_REPLACE) { |
2021 | __bkey_put(b->c, insert); | 2021 | __bkey_put(b->c, insert); |
2022 | op->keys.top = op->keys.bottom; | 2022 | op->keys.top = op->keys.bottom; |
2023 | op->insert_collision = true; | 2023 | op->insert_collision = true; |
2024 | return 0; | 2024 | return 0; |
2025 | } | 2025 | } |
2026 | 2026 | ||
2027 | for (i = 0; i < KEY_PTRS(insert); i++) | 2027 | for (i = 0; i < KEY_PTRS(insert); i++) |
2028 | atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin); | 2028 | atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin); |
2029 | 2029 | ||
2030 | bkey_copy(stack_keys->top, insert); | 2030 | bkey_copy(stack_keys->top, insert); |
2031 | 2031 | ||
2032 | bch_cut_back(k, insert); | 2032 | bch_cut_back(k, insert); |
2033 | bch_cut_front(k, stack_keys->top); | 2033 | bch_cut_front(k, stack_keys->top); |
2034 | 2034 | ||
2035 | bch_keylist_push(stack_keys); | 2035 | bch_keylist_push(stack_keys); |
2036 | } | 2036 | } |
2037 | 2037 | ||
2038 | ret = btree(insert_recurse, k, b, op, stack_keys); | 2038 | ret = btree(insert_recurse, k, b, op, stack_keys); |
2039 | if (ret) | 2039 | if (ret) |
2040 | return ret; | 2040 | return ret; |
2041 | } | 2041 | } |
2042 | 2042 | ||
2043 | if (!bch_keylist_empty(&op->keys)) { | 2043 | if (!bch_keylist_empty(&op->keys)) { |
2044 | if (should_split(b)) { | 2044 | if (should_split(b)) { |
2045 | if (op->lock <= b->c->root->level) { | 2045 | if (op->lock <= b->c->root->level) { |
2046 | BUG_ON(b->level); | 2046 | BUG_ON(b->level); |
2047 | op->lock = b->c->root->level + 1; | 2047 | op->lock = b->c->root->level + 1; |
2048 | return -EINTR; | 2048 | return -EINTR; |
2049 | } | 2049 | } |
2050 | return btree_split(b, op); | 2050 | return btree_split(b, op); |
2051 | } | 2051 | } |
2052 | 2052 | ||
2053 | BUG_ON(write_block(b) != b->sets[b->nsets].data); | 2053 | BUG_ON(write_block(b) != b->sets[b->nsets].data); |
2054 | 2054 | ||
2055 | if (bch_btree_insert_keys(b, op)) { | 2055 | if (bch_btree_insert_keys(b, op)) { |
2056 | if (!b->level) | 2056 | if (!b->level) |
2057 | bch_btree_leaf_dirty(b, op); | 2057 | bch_btree_leaf_dirty(b, op); |
2058 | else | 2058 | else |
2059 | bch_btree_node_write(b, &op->cl); | 2059 | bch_btree_node_write(b, &op->cl); |
2060 | } | 2060 | } |
2061 | } | 2061 | } |
2062 | 2062 | ||
2063 | return 0; | 2063 | return 0; |
2064 | } | 2064 | } |
2065 | 2065 | ||
2066 | int bch_btree_insert(struct btree_op *op, struct cache_set *c) | 2066 | int bch_btree_insert(struct btree_op *op, struct cache_set *c) |
2067 | { | 2067 | { |
2068 | int ret = 0; | 2068 | int ret = 0; |
2069 | struct keylist stack_keys; | 2069 | struct keylist stack_keys; |
2070 | 2070 | ||
2071 | /* | 2071 | /* |
2072 | * Don't want to block with the btree locked unless we have to, | 2072 | * Don't want to block with the btree locked unless we have to, |
2073 | * otherwise we get deadlocks with try_harder and between split/gc | 2073 | * otherwise we get deadlocks with try_harder and between split/gc |
2074 | */ | 2074 | */ |
2075 | clear_closure_blocking(&op->cl); | 2075 | clear_closure_blocking(&op->cl); |
2076 | 2076 | ||
2077 | BUG_ON(bch_keylist_empty(&op->keys)); | 2077 | BUG_ON(bch_keylist_empty(&op->keys)); |
2078 | bch_keylist_copy(&stack_keys, &op->keys); | 2078 | bch_keylist_copy(&stack_keys, &op->keys); |
2079 | bch_keylist_init(&op->keys); | 2079 | bch_keylist_init(&op->keys); |
2080 | 2080 | ||
2081 | while (!bch_keylist_empty(&stack_keys) || | 2081 | while (!bch_keylist_empty(&stack_keys) || |
2082 | !bch_keylist_empty(&op->keys)) { | 2082 | !bch_keylist_empty(&op->keys)) { |
2083 | if (bch_keylist_empty(&op->keys)) { | 2083 | if (bch_keylist_empty(&op->keys)) { |
2084 | bch_keylist_add(&op->keys, | 2084 | bch_keylist_add(&op->keys, |
2085 | bch_keylist_pop(&stack_keys)); | 2085 | bch_keylist_pop(&stack_keys)); |
2086 | op->lock = 0; | 2086 | op->lock = 0; |
2087 | } | 2087 | } |
2088 | 2088 | ||
2089 | ret = btree_root(insert_recurse, c, op, &stack_keys); | 2089 | ret = btree_root(insert_recurse, c, op, &stack_keys); |
2090 | 2090 | ||
2091 | if (ret == -EAGAIN) { | 2091 | if (ret == -EAGAIN) { |
2092 | ret = 0; | 2092 | ret = 0; |
2093 | closure_sync(&op->cl); | 2093 | closure_sync(&op->cl); |
2094 | } else if (ret) { | 2094 | } else if (ret) { |
2095 | struct bkey *k; | 2095 | struct bkey *k; |
2096 | 2096 | ||
2097 | pr_err("error %i trying to insert key for %s", | 2097 | pr_err("error %i trying to insert key for %s", |
2098 | ret, op_type(op)); | 2098 | ret, op_type(op)); |
2099 | 2099 | ||
2100 | while ((k = bch_keylist_pop(&stack_keys) ?: | 2100 | while ((k = bch_keylist_pop(&stack_keys) ?: |
2101 | bch_keylist_pop(&op->keys))) | 2101 | bch_keylist_pop(&op->keys))) |
2102 | bkey_put(c, k, 0); | 2102 | bkey_put(c, k, 0); |
2103 | } | 2103 | } |
2104 | } | 2104 | } |
2105 | 2105 | ||
2106 | bch_keylist_free(&stack_keys); | 2106 | bch_keylist_free(&stack_keys); |
2107 | 2107 | ||
2108 | if (op->journal) | 2108 | if (op->journal) |
2109 | atomic_dec_bug(op->journal); | 2109 | atomic_dec_bug(op->journal); |
2110 | op->journal = NULL; | 2110 | op->journal = NULL; |
2111 | return ret; | 2111 | return ret; |
2112 | } | 2112 | } |
2113 | 2113 | ||
2114 | void bch_btree_set_root(struct btree *b) | 2114 | void bch_btree_set_root(struct btree *b) |
2115 | { | 2115 | { |
2116 | unsigned i; | 2116 | unsigned i; |
2117 | struct closure cl; | 2117 | struct closure cl; |
2118 | 2118 | ||
2119 | closure_init_stack(&cl); | 2119 | closure_init_stack(&cl); |
2120 | 2120 | ||
2121 | trace_bcache_btree_set_root(b); | 2121 | trace_bcache_btree_set_root(b); |
2122 | 2122 | ||
2123 | BUG_ON(!b->written); | 2123 | BUG_ON(!b->written); |
2124 | 2124 | ||
2125 | for (i = 0; i < KEY_PTRS(&b->key); i++) | 2125 | for (i = 0; i < KEY_PTRS(&b->key); i++) |
2126 | BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO); | 2126 | BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO); |
2127 | 2127 | ||
2128 | mutex_lock(&b->c->bucket_lock); | 2128 | mutex_lock(&b->c->bucket_lock); |
2129 | list_del_init(&b->list); | 2129 | list_del_init(&b->list); |
2130 | mutex_unlock(&b->c->bucket_lock); | 2130 | mutex_unlock(&b->c->bucket_lock); |
2131 | 2131 | ||
2132 | b->c->root = b; | 2132 | b->c->root = b; |
2133 | __bkey_put(b->c, &b->key); | 2133 | __bkey_put(b->c, &b->key); |
2134 | 2134 | ||
2135 | bch_journal_meta(b->c, &cl); | 2135 | bch_journal_meta(b->c, &cl); |
2136 | closure_sync(&cl); | 2136 | closure_sync(&cl); |
2137 | } | 2137 | } |
2138 | 2138 | ||
2139 | /* Cache lookup */ | 2139 | /* Cache lookup */ |
2140 | 2140 | ||
2141 | static int submit_partial_cache_miss(struct btree *b, struct btree_op *op, | 2141 | static int submit_partial_cache_miss(struct btree *b, struct btree_op *op, |
2142 | struct bkey *k) | 2142 | struct bkey *k) |
2143 | { | 2143 | { |
2144 | struct search *s = container_of(op, struct search, op); | 2144 | struct search *s = container_of(op, struct search, op); |
2145 | struct bio *bio = &s->bio.bio; | 2145 | struct bio *bio = &s->bio.bio; |
2146 | int ret = 0; | 2146 | int ret = 0; |
2147 | 2147 | ||
2148 | while (!ret && | 2148 | while (!ret && |
2149 | !op->lookup_done) { | 2149 | !op->lookup_done) { |
2150 | unsigned sectors = INT_MAX; | 2150 | unsigned sectors = INT_MAX; |
2151 | 2151 | ||
2152 | if (KEY_INODE(k) == op->inode) { | 2152 | if (KEY_INODE(k) == op->inode) { |
2153 | if (KEY_START(k) <= bio->bi_sector) | 2153 | if (KEY_START(k) <= bio->bi_sector) |
2154 | break; | 2154 | break; |
2155 | 2155 | ||
2156 | sectors = min_t(uint64_t, sectors, | 2156 | sectors = min_t(uint64_t, sectors, |
2157 | KEY_START(k) - bio->bi_sector); | 2157 | KEY_START(k) - bio->bi_sector); |
2158 | } | 2158 | } |
2159 | 2159 | ||
2160 | ret = s->d->cache_miss(b, s, bio, sectors); | 2160 | ret = s->d->cache_miss(b, s, bio, sectors); |
2161 | } | 2161 | } |
2162 | 2162 | ||
2163 | return ret; | 2163 | return ret; |
2164 | } | 2164 | } |
2165 | 2165 | ||
2166 | /* | 2166 | /* |
2167 | * Read from a single key, handling the initial cache miss if the key starts in | 2167 | * Read from a single key, handling the initial cache miss if the key starts in |
2168 | * the middle of the bio | 2168 | * the middle of the bio |
2169 | */ | 2169 | */ |
2170 | static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, | 2170 | static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, |
2171 | struct bkey *k) | 2171 | struct bkey *k) |
2172 | { | 2172 | { |
2173 | struct search *s = container_of(op, struct search, op); | 2173 | struct search *s = container_of(op, struct search, op); |
2174 | struct bio *bio = &s->bio.bio; | 2174 | struct bio *bio = &s->bio.bio; |
2175 | unsigned ptr; | 2175 | unsigned ptr; |
2176 | struct bio *n; | 2176 | struct bio *n; |
2177 | 2177 | ||
2178 | int ret = submit_partial_cache_miss(b, op, k); | 2178 | int ret = submit_partial_cache_miss(b, op, k); |
2179 | if (ret || op->lookup_done) | 2179 | if (ret || op->lookup_done) |
2180 | return ret; | 2180 | return ret; |
2181 | 2181 | ||
2182 | /* XXX: figure out best pointer - for multiple cache devices */ | 2182 | /* XXX: figure out best pointer - for multiple cache devices */ |
2183 | ptr = 0; | 2183 | ptr = 0; |
2184 | 2184 | ||
2185 | PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; | 2185 | PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; |
2186 | 2186 | ||
2187 | while (!op->lookup_done && | 2187 | while (!op->lookup_done && |
2188 | KEY_INODE(k) == op->inode && | 2188 | KEY_INODE(k) == op->inode && |
2189 | bio->bi_sector < KEY_OFFSET(k)) { | 2189 | bio->bi_sector < KEY_OFFSET(k)) { |
2190 | struct bkey *bio_key; | 2190 | struct bkey *bio_key; |
2191 | sector_t sector = PTR_OFFSET(k, ptr) + | 2191 | sector_t sector = PTR_OFFSET(k, ptr) + |
2192 | (bio->bi_sector - KEY_START(k)); | 2192 | (bio->bi_sector - KEY_START(k)); |
2193 | unsigned sectors = min_t(uint64_t, INT_MAX, | 2193 | unsigned sectors = min_t(uint64_t, INT_MAX, |
2194 | KEY_OFFSET(k) - bio->bi_sector); | 2194 | KEY_OFFSET(k) - bio->bi_sector); |
2195 | 2195 | ||
2196 | n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); | 2196 | n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); |
2197 | if (!n) | ||
2198 | return -EAGAIN; | ||
2199 | |||
2200 | if (n == bio) | 2197 | if (n == bio) |
2201 | op->lookup_done = true; | 2198 | op->lookup_done = true; |
2202 | 2199 | ||
2203 | bio_key = &container_of(n, struct bbio, bio)->key; | 2200 | bio_key = &container_of(n, struct bbio, bio)->key; |
2204 | 2201 | ||
2205 | /* | 2202 | /* |
2206 | * The bucket we're reading from might be reused while our bio | 2203 | * The bucket we're reading from might be reused while our bio |
2207 | * is in flight, and we could then end up reading the wrong | 2204 | * is in flight, and we could then end up reading the wrong |
2208 | * data. | 2205 | * data. |
2209 | * | 2206 | * |
2210 | * We guard against this by checking (in cache_read_endio()) if | 2207 | * We guard against this by checking (in cache_read_endio()) if |
2211 | * the pointer is stale again; if so, we treat it as an error | 2208 | * the pointer is stale again; if so, we treat it as an error |
2212 | * and reread from the backing device (but we don't pass that | 2209 | * and reread from the backing device (but we don't pass that |
2213 | * error up anywhere). | 2210 | * error up anywhere). |
2214 | */ | 2211 | */ |
2215 | 2212 | ||
2216 | bch_bkey_copy_single_ptr(bio_key, k, ptr); | 2213 | bch_bkey_copy_single_ptr(bio_key, k, ptr); |
2217 | SET_PTR_OFFSET(bio_key, 0, sector); | 2214 | SET_PTR_OFFSET(bio_key, 0, sector); |
2218 | 2215 | ||
2219 | n->bi_end_io = bch_cache_read_endio; | 2216 | n->bi_end_io = bch_cache_read_endio; |
2220 | n->bi_private = &s->cl; | 2217 | n->bi_private = &s->cl; |
2221 | 2218 | ||
2222 | __bch_submit_bbio(n, b->c); | 2219 | __bch_submit_bbio(n, b->c); |
2223 | } | 2220 | } |
2224 | 2221 | ||
2225 | return 0; | 2222 | return 0; |
2226 | } | 2223 | } |
2227 | 2224 | ||
2228 | int bch_btree_search_recurse(struct btree *b, struct btree_op *op) | 2225 | int bch_btree_search_recurse(struct btree *b, struct btree_op *op) |
2229 | { | 2226 | { |
2230 | struct search *s = container_of(op, struct search, op); | 2227 | struct search *s = container_of(op, struct search, op); |
2231 | struct bio *bio = &s->bio.bio; | 2228 | struct bio *bio = &s->bio.bio; |
2232 | 2229 | ||
2233 | int ret = 0; | 2230 | int ret = 0; |
2234 | struct bkey *k; | 2231 | struct bkey *k; |
2235 | struct btree_iter iter; | 2232 | struct btree_iter iter; |
2236 | bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); | 2233 | bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); |
2237 | 2234 | ||
2238 | do { | 2235 | do { |
2239 | k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); | 2236 | k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); |
2240 | if (!k) { | 2237 | if (!k) { |
2241 | /* | 2238 | /* |
2242 | * b->key would be exactly what we want, except that | 2239 | * b->key would be exactly what we want, except that |
2243 | * pointers to btree nodes have nonzero size - we | 2240 | * pointers to btree nodes have nonzero size - we |
2244 | * wouldn't go far enough | 2241 | * wouldn't go far enough |
2245 | */ | 2242 | */ |
2246 | 2243 | ||
2247 | ret = submit_partial_cache_miss(b, op, | 2244 | ret = submit_partial_cache_miss(b, op, |
2248 | &KEY(KEY_INODE(&b->key), | 2245 | &KEY(KEY_INODE(&b->key), |
2249 | KEY_OFFSET(&b->key), 0)); | 2246 | KEY_OFFSET(&b->key), 0)); |
2250 | break; | 2247 | break; |
2251 | } | 2248 | } |
2252 | 2249 | ||
2253 | ret = b->level | 2250 | ret = b->level |
2254 | ? btree(search_recurse, k, b, op) | 2251 | ? btree(search_recurse, k, b, op) |
2255 | : submit_partial_cache_hit(b, op, k); | 2252 | : submit_partial_cache_hit(b, op, k); |
2256 | } while (!ret && | 2253 | } while (!ret && |
2257 | !op->lookup_done); | 2254 | !op->lookup_done); |
2258 | 2255 | ||
2259 | return ret; | 2256 | return ret; |
2260 | } | 2257 | } |
2261 | 2258 | ||
2262 | /* Keybuf code */ | 2259 | /* Keybuf code */ |
2263 | 2260 | ||
2264 | static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) | 2261 | static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) |
2265 | { | 2262 | { |
2266 | /* Overlapping keys compare equal */ | 2263 | /* Overlapping keys compare equal */ |
2267 | if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0) | 2264 | if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0) |
2268 | return -1; | 2265 | return -1; |
2269 | if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0) | 2266 | if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0) |
2270 | return 1; | 2267 | return 1; |
2271 | return 0; | 2268 | return 0; |
2272 | } | 2269 | } |
2273 | 2270 | ||
2274 | static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, | 2271 | static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, |
2275 | struct keybuf_key *r) | 2272 | struct keybuf_key *r) |
2276 | { | 2273 | { |
2277 | return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); | 2274 | return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); |
2278 | } | 2275 | } |
2279 | 2276 | ||
2280 | static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, | 2277 | static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, |
2281 | struct keybuf *buf, struct bkey *end, | 2278 | struct keybuf *buf, struct bkey *end, |
2282 | keybuf_pred_fn *pred) | 2279 | keybuf_pred_fn *pred) |
2283 | { | 2280 | { |
2284 | struct btree_iter iter; | 2281 | struct btree_iter iter; |
2285 | bch_btree_iter_init(b, &iter, &buf->last_scanned); | 2282 | bch_btree_iter_init(b, &iter, &buf->last_scanned); |
2286 | 2283 | ||
2287 | while (!array_freelist_empty(&buf->freelist)) { | 2284 | while (!array_freelist_empty(&buf->freelist)) { |
2288 | struct bkey *k = bch_btree_iter_next_filter(&iter, b, | 2285 | struct bkey *k = bch_btree_iter_next_filter(&iter, b, |
2289 | bch_ptr_bad); | 2286 | bch_ptr_bad); |
2290 | 2287 | ||
2291 | if (!b->level) { | 2288 | if (!b->level) { |
2292 | if (!k) { | 2289 | if (!k) { |
2293 | buf->last_scanned = b->key; | 2290 | buf->last_scanned = b->key; |
2294 | break; | 2291 | break; |
2295 | } | 2292 | } |
2296 | 2293 | ||
2297 | buf->last_scanned = *k; | 2294 | buf->last_scanned = *k; |
2298 | if (bkey_cmp(&buf->last_scanned, end) >= 0) | 2295 | if (bkey_cmp(&buf->last_scanned, end) >= 0) |
2299 | break; | 2296 | break; |
2300 | 2297 | ||
2301 | if (pred(buf, k)) { | 2298 | if (pred(buf, k)) { |
2302 | struct keybuf_key *w; | 2299 | struct keybuf_key *w; |
2303 | 2300 | ||
2304 | spin_lock(&buf->lock); | 2301 | spin_lock(&buf->lock); |
2305 | 2302 | ||
2306 | w = array_alloc(&buf->freelist); | 2303 | w = array_alloc(&buf->freelist); |
2307 | 2304 | ||
2308 | w->private = NULL; | 2305 | w->private = NULL; |
2309 | bkey_copy(&w->key, k); | 2306 | bkey_copy(&w->key, k); |
2310 | 2307 | ||
2311 | if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) | 2308 | if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) |
2312 | array_free(&buf->freelist, w); | 2309 | array_free(&buf->freelist, w); |
2313 | 2310 | ||
2314 | spin_unlock(&buf->lock); | 2311 | spin_unlock(&buf->lock); |
2315 | } | 2312 | } |
2316 | } else { | 2313 | } else { |
2317 | if (!k) | 2314 | if (!k) |
2318 | break; | 2315 | break; |
2319 | 2316 | ||
2320 | btree(refill_keybuf, k, b, op, buf, end, pred); | 2317 | btree(refill_keybuf, k, b, op, buf, end, pred); |
2321 | /* | 2318 | /* |
2322 | * Might get an error here, but can't really do anything | 2319 | * Might get an error here, but can't really do anything |
2323 | * and it'll get logged elsewhere. Just read what we | 2320 | * and it'll get logged elsewhere. Just read what we |
2324 | * can. | 2321 | * can. |
2325 | */ | 2322 | */ |
2326 | 2323 | ||
2327 | if (bkey_cmp(&buf->last_scanned, end) >= 0) | 2324 | if (bkey_cmp(&buf->last_scanned, end) >= 0) |
2328 | break; | 2325 | break; |
2329 | 2326 | ||
2330 | cond_resched(); | 2327 | cond_resched(); |
2331 | } | 2328 | } |
2332 | } | 2329 | } |
2333 | 2330 | ||
2334 | return 0; | 2331 | return 0; |
2335 | } | 2332 | } |
2336 | 2333 | ||
2337 | void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, | 2334 | void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, |
2338 | struct bkey *end, keybuf_pred_fn *pred) | 2335 | struct bkey *end, keybuf_pred_fn *pred) |
2339 | { | 2336 | { |
2340 | struct bkey start = buf->last_scanned; | 2337 | struct bkey start = buf->last_scanned; |
2341 | struct btree_op op; | 2338 | struct btree_op op; |
2342 | bch_btree_op_init_stack(&op); | 2339 | bch_btree_op_init_stack(&op); |
2343 | 2340 | ||
2344 | cond_resched(); | 2341 | cond_resched(); |
2345 | 2342 | ||
2346 | btree_root(refill_keybuf, c, &op, buf, end, pred); | 2343 | btree_root(refill_keybuf, c, &op, buf, end, pred); |
2347 | closure_sync(&op.cl); | 2344 | closure_sync(&op.cl); |
2348 | 2345 | ||
2349 | pr_debug("found %s keys from %llu:%llu to %llu:%llu", | 2346 | pr_debug("found %s keys from %llu:%llu to %llu:%llu", |
2350 | RB_EMPTY_ROOT(&buf->keys) ? "no" : | 2347 | RB_EMPTY_ROOT(&buf->keys) ? "no" : |
2351 | array_freelist_empty(&buf->freelist) ? "some" : "a few", | 2348 | array_freelist_empty(&buf->freelist) ? "some" : "a few", |
2352 | KEY_INODE(&start), KEY_OFFSET(&start), | 2349 | KEY_INODE(&start), KEY_OFFSET(&start), |
2353 | KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned)); | 2350 | KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned)); |
2354 | 2351 | ||
2355 | spin_lock(&buf->lock); | 2352 | spin_lock(&buf->lock); |
2356 | 2353 | ||
2357 | if (!RB_EMPTY_ROOT(&buf->keys)) { | 2354 | if (!RB_EMPTY_ROOT(&buf->keys)) { |
2358 | struct keybuf_key *w; | 2355 | struct keybuf_key *w; |
2359 | w = RB_FIRST(&buf->keys, struct keybuf_key, node); | 2356 | w = RB_FIRST(&buf->keys, struct keybuf_key, node); |
2360 | buf->start = START_KEY(&w->key); | 2357 | buf->start = START_KEY(&w->key); |
2361 | 2358 | ||
2362 | w = RB_LAST(&buf->keys, struct keybuf_key, node); | 2359 | w = RB_LAST(&buf->keys, struct keybuf_key, node); |
2363 | buf->end = w->key; | 2360 | buf->end = w->key; |
2364 | } else { | 2361 | } else { |
2365 | buf->start = MAX_KEY; | 2362 | buf->start = MAX_KEY; |
2366 | buf->end = MAX_KEY; | 2363 | buf->end = MAX_KEY; |
2367 | } | 2364 | } |
2368 | 2365 | ||
2369 | spin_unlock(&buf->lock); | 2366 | spin_unlock(&buf->lock); |
2370 | } | 2367 | } |
2371 | 2368 | ||
2372 | static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) | 2369 | static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) |
2373 | { | 2370 | { |
2374 | rb_erase(&w->node, &buf->keys); | 2371 | rb_erase(&w->node, &buf->keys); |
2375 | array_free(&buf->freelist, w); | 2372 | array_free(&buf->freelist, w); |
2376 | } | 2373 | } |
2377 | 2374 | ||
2378 | void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) | 2375 | void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) |
2379 | { | 2376 | { |
2380 | spin_lock(&buf->lock); | 2377 | spin_lock(&buf->lock); |
2381 | __bch_keybuf_del(buf, w); | 2378 | __bch_keybuf_del(buf, w); |
2382 | spin_unlock(&buf->lock); | 2379 | spin_unlock(&buf->lock); |
2383 | } | 2380 | } |
2384 | 2381 | ||
2385 | bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, | 2382 | bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, |
2386 | struct bkey *end) | 2383 | struct bkey *end) |
2387 | { | 2384 | { |
2388 | bool ret = false; | 2385 | bool ret = false; |
2389 | struct keybuf_key *p, *w, s; | 2386 | struct keybuf_key *p, *w, s; |
2390 | s.key = *start; | 2387 | s.key = *start; |
2391 | 2388 | ||
2392 | if (bkey_cmp(end, &buf->start) <= 0 || | 2389 | if (bkey_cmp(end, &buf->start) <= 0 || |
2393 | bkey_cmp(start, &buf->end) >= 0) | 2390 | bkey_cmp(start, &buf->end) >= 0) |
2394 | return false; | 2391 | return false; |
2395 | 2392 | ||
2396 | spin_lock(&buf->lock); | 2393 | spin_lock(&buf->lock); |
2397 | w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp); | 2394 | w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp); |
2398 | 2395 | ||
2399 | while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) { | 2396 | while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) { |
2400 | p = w; | 2397 | p = w; |
2401 | w = RB_NEXT(w, node); | 2398 | w = RB_NEXT(w, node); |
2402 | 2399 | ||
2403 | if (p->private) | 2400 | if (p->private) |
2404 | ret = true; | 2401 | ret = true; |
2405 | else | 2402 | else |
2406 | __bch_keybuf_del(buf, p); | 2403 | __bch_keybuf_del(buf, p); |
2407 | } | 2404 | } |
2408 | 2405 | ||
2409 | spin_unlock(&buf->lock); | 2406 | spin_unlock(&buf->lock); |
2410 | return ret; | 2407 | return ret; |
2411 | } | 2408 | } |
2412 | 2409 | ||
2413 | struct keybuf_key *bch_keybuf_next(struct keybuf *buf) | 2410 | struct keybuf_key *bch_keybuf_next(struct keybuf *buf) |
2414 | { | 2411 | { |
2415 | struct keybuf_key *w; | 2412 | struct keybuf_key *w; |
2416 | spin_lock(&buf->lock); | 2413 | spin_lock(&buf->lock); |
2417 | 2414 | ||
2418 | w = RB_FIRST(&buf->keys, struct keybuf_key, node); | 2415 | w = RB_FIRST(&buf->keys, struct keybuf_key, node); |
2419 | 2416 | ||
2420 | while (w && w->private) | 2417 | while (w && w->private) |
2421 | w = RB_NEXT(w, node); | 2418 | w = RB_NEXT(w, node); |
2422 | 2419 | ||
2423 | if (w) | 2420 | if (w) |
2424 | w->private = ERR_PTR(-EINTR); | 2421 | w->private = ERR_PTR(-EINTR); |
2425 | 2422 | ||
2426 | spin_unlock(&buf->lock); | 2423 | spin_unlock(&buf->lock); |
2427 | return w; | 2424 | return w; |
2428 | } | 2425 | } |
2429 | 2426 | ||
2430 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, | 2427 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, |
2431 | struct keybuf *buf, | 2428 | struct keybuf *buf, |
2432 | struct bkey *end, | 2429 | struct bkey *end, |
2433 | keybuf_pred_fn *pred) | 2430 | keybuf_pred_fn *pred) |
2434 | { | 2431 | { |
2435 | struct keybuf_key *ret; | 2432 | struct keybuf_key *ret; |
2436 | 2433 | ||
2437 | while (1) { | 2434 | while (1) { |
2438 | ret = bch_keybuf_next(buf); | 2435 | ret = bch_keybuf_next(buf); |
2439 | if (ret) | 2436 | if (ret) |
2440 | break; | 2437 | break; |
2441 | 2438 | ||
2442 | if (bkey_cmp(&buf->last_scanned, end) >= 0) { | 2439 | if (bkey_cmp(&buf->last_scanned, end) >= 0) { |
2443 | pr_debug("scan finished"); | 2440 | pr_debug("scan finished"); |
2444 | break; | 2441 | break; |
2445 | } | 2442 | } |
2446 | 2443 | ||
2447 | bch_refill_keybuf(c, buf, end, pred); | 2444 | bch_refill_keybuf(c, buf, end, pred); |
2448 | } | 2445 | } |
2449 | 2446 | ||
2450 | return ret; | 2447 | return ret; |
2451 | } | 2448 | } |
2452 | 2449 | ||
2453 | void bch_keybuf_init(struct keybuf *buf) | 2450 | void bch_keybuf_init(struct keybuf *buf) |
2454 | { | 2451 | { |
2455 | buf->last_scanned = MAX_KEY; | 2452 | buf->last_scanned = MAX_KEY; |
2456 | buf->keys = RB_ROOT; | 2453 | buf->keys = RB_ROOT; |
2457 | 2454 | ||
2458 | spin_lock_init(&buf->lock); | 2455 | spin_lock_init(&buf->lock); |
2459 | array_allocator_init(&buf->freelist); | 2456 | array_allocator_init(&buf->freelist); |
2460 | } | 2457 | } |
2461 | 2458 | ||
2462 | void bch_btree_exit(void) | 2459 | void bch_btree_exit(void) |
2463 | { | 2460 | { |
2464 | if (btree_io_wq) | 2461 | if (btree_io_wq) |
2465 | destroy_workqueue(btree_io_wq); | 2462 | destroy_workqueue(btree_io_wq); |
2466 | if (bch_gc_wq) | 2463 | if (bch_gc_wq) |
2467 | destroy_workqueue(bch_gc_wq); | 2464 | destroy_workqueue(bch_gc_wq); |
2468 | } | 2465 | } |
2469 | 2466 | ||
2470 | int __init bch_btree_init(void) | 2467 | int __init bch_btree_init(void) |
2471 | { | 2468 | { |
2472 | if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) || | 2469 | if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) || |
2473 | !(btree_io_wq = create_singlethread_workqueue("bch_btree_io"))) | 2470 | !(btree_io_wq = create_singlethread_workqueue("bch_btree_io"))) |
2474 | return -ENOMEM; | 2471 | return -ENOMEM; |
2475 | 2472 | ||
2476 | return 0; | 2473 | return 0; |
2477 | } | 2474 | } |
2478 | 2475 |
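The split path in btree_split() above fills the replacement node and, once it would end up more than four fifths full, walks its keys linearly until roughly three fifths of the key data stays in the first new node and the remainder moves to the second. The standalone C sketch below is illustrative only, not bcache code: the sizes array stands in for hypothetical bkey_u64s() results, and the names are invented for the demo. It mimics that linear walk over per-key sizes.

#include <stdio.h>

int main(void)
{
	/* hypothetical bkey_u64s() results for ten keys (made up for the demo) */
	unsigned sizes[] = { 3, 2, 3, 4, 2, 3, 3, 2, 4, 3 };
	unsigned nr = sizeof(sizes) / sizeof(sizes[0]);
	unsigned total = 0, keys = 0, i = 0, j;

	for (j = 0; j < nr; j++)
		total += sizes[j];

	/* linear walk, as in btree_split(): stop once ~3/5 stays in node 1 */
	while (keys < (total * 3) / 5)
		keys += sizes[i++];

	printf("node 1 keeps %u of %u u64s, node 2 gets %u\n",
	       keys, total, total - keys);
	return 0;
}

The real code additionally copies the pivot key into n1->key and advances past it before handing the tail to n2, which this sketch leaves out.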
drivers/md/bcache/debug.c
1 | /* | 1 | /* |
2 | * Assorted bcache debug code | 2 | * Assorted bcache debug code |
3 | * | 3 | * |
4 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | 4 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> |
5 | * Copyright 2012 Google, Inc. | 5 | * Copyright 2012 Google, Inc. |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include "bcache.h" | 8 | #include "bcache.h" |
9 | #include "btree.h" | 9 | #include "btree.h" |
10 | #include "debug.h" | 10 | #include "debug.h" |
11 | #include "request.h" | 11 | #include "request.h" |
12 | 12 | ||
13 | #include <linux/console.h> | 13 | #include <linux/console.h> |
14 | #include <linux/debugfs.h> | 14 | #include <linux/debugfs.h> |
15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
16 | #include <linux/random.h> | 16 | #include <linux/random.h> |
17 | #include <linux/seq_file.h> | 17 | #include <linux/seq_file.h> |
18 | 18 | ||
19 | static struct dentry *debug; | 19 | static struct dentry *debug; |
20 | 20 | ||
21 | const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) | 21 | const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) |
22 | { | 22 | { |
23 | unsigned i; | 23 | unsigned i; |
24 | 24 | ||
25 | for (i = 0; i < KEY_PTRS(k); i++) | 25 | for (i = 0; i < KEY_PTRS(k); i++) |
26 | if (ptr_available(c, k, i)) { | 26 | if (ptr_available(c, k, i)) { |
27 | struct cache *ca = PTR_CACHE(c, k, i); | 27 | struct cache *ca = PTR_CACHE(c, k, i); |
28 | size_t bucket = PTR_BUCKET_NR(c, k, i); | 28 | size_t bucket = PTR_BUCKET_NR(c, k, i); |
29 | size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); | 29 | size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); |
30 | 30 | ||
31 | if (KEY_SIZE(k) + r > c->sb.bucket_size) | 31 | if (KEY_SIZE(k) + r > c->sb.bucket_size) |
32 | return "bad, length too big"; | 32 | return "bad, length too big"; |
33 | if (bucket < ca->sb.first_bucket) | 33 | if (bucket < ca->sb.first_bucket) |
34 | return "bad, short offset"; | 34 | return "bad, short offset"; |
35 | if (bucket >= ca->sb.nbuckets) | 35 | if (bucket >= ca->sb.nbuckets) |
36 | return "bad, offset past end of device"; | 36 | return "bad, offset past end of device"; |
37 | if (ptr_stale(c, k, i)) | 37 | if (ptr_stale(c, k, i)) |
38 | return "stale"; | 38 | return "stale"; |
39 | } | 39 | } |
40 | 40 | ||
41 | if (!bkey_cmp(k, &ZERO_KEY)) | 41 | if (!bkey_cmp(k, &ZERO_KEY)) |
42 | return "bad, null key"; | 42 | return "bad, null key"; |
43 | if (!KEY_PTRS(k)) | 43 | if (!KEY_PTRS(k)) |
44 | return "bad, no pointers"; | 44 | return "bad, no pointers"; |
45 | if (!KEY_SIZE(k)) | 45 | if (!KEY_SIZE(k)) |
46 | return "zeroed key"; | 46 | return "zeroed key"; |
47 | return ""; | 47 | return ""; |
48 | } | 48 | } |
49 | 49 | ||
50 | int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) | 50 | int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) |
51 | { | 51 | { |
52 | unsigned i = 0; | 52 | unsigned i = 0; |
53 | char *out = buf, *end = buf + size; | 53 | char *out = buf, *end = buf + size; |
54 | 54 | ||
55 | #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) | 55 | #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) |
56 | 56 | ||
57 | p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k)); | 57 | p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k)); |
58 | 58 | ||
59 | if (KEY_PTRS(k)) | 59 | if (KEY_PTRS(k)) |
60 | while (1) { | 60 | while (1) { |
61 | p("%llu:%llu gen %llu", | 61 | p("%llu:%llu gen %llu", |
62 | PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i)); | 62 | PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i)); |
63 | 63 | ||
64 | if (++i == KEY_PTRS(k)) | 64 | if (++i == KEY_PTRS(k)) |
65 | break; | 65 | break; |
66 | 66 | ||
67 | p(", "); | 67 | p(", "); |
68 | } | 68 | } |
69 | 69 | ||
70 | p("]"); | 70 | p("]"); |
71 | 71 | ||
72 | if (KEY_DIRTY(k)) | 72 | if (KEY_DIRTY(k)) |
73 | p(" dirty"); | 73 | p(" dirty"); |
74 | if (KEY_CSUM(k)) | 74 | if (KEY_CSUM(k)) |
75 | p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); | 75 | p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); |
76 | #undef p | 76 | #undef p |
77 | return out - buf; | 77 | return out - buf; |
78 | } | 78 | } |
79 | 79 | ||
80 | int bch_btree_to_text(char *buf, size_t size, const struct btree *b) | 80 | int bch_btree_to_text(char *buf, size_t size, const struct btree *b) |
81 | { | 81 | { |
82 | return scnprintf(buf, size, "%zu level %i/%i", | 82 | return scnprintf(buf, size, "%zu level %i/%i", |
83 | PTR_BUCKET_NR(b->c, &b->key, 0), | 83 | PTR_BUCKET_NR(b->c, &b->key, 0), |
84 | b->level, b->c->root ? b->c->root->level : -1); | 84 | b->level, b->c->root ? b->c->root->level : -1); |
85 | } | 85 | } |
86 | 86 | ||
87 | #if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) | 87 | #if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) |
88 | 88 | ||
89 | static bool skipped_backwards(struct btree *b, struct bkey *k) | 89 | static bool skipped_backwards(struct btree *b, struct bkey *k) |
90 | { | 90 | { |
91 | return bkey_cmp(k, (!b->level) | 91 | return bkey_cmp(k, (!b->level) |
92 | ? &START_KEY(bkey_next(k)) | 92 | ? &START_KEY(bkey_next(k)) |
93 | : bkey_next(k)) > 0; | 93 | : bkey_next(k)) > 0; |
94 | } | 94 | } |
95 | 95 | ||
96 | static void dump_bset(struct btree *b, struct bset *i) | 96 | static void dump_bset(struct btree *b, struct bset *i) |
97 | { | 97 | { |
98 | struct bkey *k; | 98 | struct bkey *k; |
99 | unsigned j; | 99 | unsigned j; |
100 | char buf[80]; | 100 | char buf[80]; |
101 | 101 | ||
102 | for (k = i->start; k < end(i); k = bkey_next(k)) { | 102 | for (k = i->start; k < end(i); k = bkey_next(k)) { |
103 | bch_bkey_to_text(buf, sizeof(buf), k); | 103 | bch_bkey_to_text(buf, sizeof(buf), k); |
104 | printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), | 104 | printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), |
105 | (uint64_t *) k - i->d, i->keys, buf); | 105 | (uint64_t *) k - i->d, i->keys, buf); |
106 | 106 | ||
107 | for (j = 0; j < KEY_PTRS(k); j++) { | 107 | for (j = 0; j < KEY_PTRS(k); j++) { |
108 | size_t n = PTR_BUCKET_NR(b->c, k, j); | 108 | size_t n = PTR_BUCKET_NR(b->c, k, j); |
109 | printk(" bucket %zu", n); | 109 | printk(" bucket %zu", n); |
110 | 110 | ||
111 | if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) | 111 | if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) |
112 | printk(" prio %i", | 112 | printk(" prio %i", |
113 | PTR_BUCKET(b->c, k, j)->prio); | 113 | PTR_BUCKET(b->c, k, j)->prio); |
114 | } | 114 | } |
115 | 115 | ||
116 | printk(" %s\n", bch_ptr_status(b->c, k)); | 116 | printk(" %s\n", bch_ptr_status(b->c, k)); |
117 | 117 | ||
118 | if (bkey_next(k) < end(i) && | 118 | if (bkey_next(k) < end(i) && |
119 | skipped_backwards(b, k)) | 119 | skipped_backwards(b, k)) |
120 | printk(KERN_ERR "Key skipped backwards\n"); | 120 | printk(KERN_ERR "Key skipped backwards\n"); |
121 | } | 121 | } |
122 | } | 122 | } |
123 | 123 | ||
124 | #endif | 124 | #endif |
125 | 125 | ||
126 | #ifdef CONFIG_BCACHE_DEBUG | 126 | #ifdef CONFIG_BCACHE_DEBUG |
127 | 127 | ||
128 | void bch_btree_verify(struct btree *b, struct bset *new) | 128 | void bch_btree_verify(struct btree *b, struct bset *new) |
129 | { | 129 | { |
130 | struct btree *v = b->c->verify_data; | 130 | struct btree *v = b->c->verify_data; |
131 | struct closure cl; | 131 | struct closure cl; |
132 | closure_init_stack(&cl); | 132 | closure_init_stack(&cl); |
133 | 133 | ||
134 | if (!b->c->verify) | 134 | if (!b->c->verify) |
135 | return; | 135 | return; |
136 | 136 | ||
137 | closure_wait_event(&b->io.wait, &cl, | 137 | closure_wait_event(&b->io.wait, &cl, |
138 | atomic_read(&b->io.cl.remaining) == -1); | 138 | atomic_read(&b->io.cl.remaining) == -1); |
139 | 139 | ||
140 | mutex_lock(&b->c->verify_lock); | 140 | mutex_lock(&b->c->verify_lock); |
141 | 141 | ||
142 | bkey_copy(&v->key, &b->key); | 142 | bkey_copy(&v->key, &b->key); |
143 | v->written = 0; | 143 | v->written = 0; |
144 | v->level = b->level; | 144 | v->level = b->level; |
145 | 145 | ||
146 | bch_btree_node_read(v); | 146 | bch_btree_node_read(v); |
147 | closure_wait_event(&v->io.wait, &cl, | 147 | closure_wait_event(&v->io.wait, &cl, |
148 | atomic_read(&b->io.cl.remaining) == -1); | 148 | atomic_read(&b->io.cl.remaining) == -1); |
149 | 149 | ||
150 | if (new->keys != v->sets[0].data->keys || | 150 | if (new->keys != v->sets[0].data->keys || |
151 | memcmp(new->start, | 151 | memcmp(new->start, |
152 | v->sets[0].data->start, | 152 | v->sets[0].data->start, |
153 | (void *) end(new) - (void *) new->start)) { | 153 | (void *) end(new) - (void *) new->start)) { |
154 | unsigned i, j; | 154 | unsigned i, j; |
155 | 155 | ||
156 | console_lock(); | 156 | console_lock(); |
157 | 157 | ||
158 | printk(KERN_ERR "*** original memory node:\n"); | 158 | printk(KERN_ERR "*** original memory node:\n"); |
159 | for (i = 0; i <= b->nsets; i++) | 159 | for (i = 0; i <= b->nsets; i++) |
160 | dump_bset(b, b->sets[i].data); | 160 | dump_bset(b, b->sets[i].data); |
161 | 161 | ||
162 | printk(KERN_ERR "*** sorted memory node:\n"); | 162 | printk(KERN_ERR "*** sorted memory node:\n"); |
163 | dump_bset(b, new); | 163 | dump_bset(b, new); |
164 | 164 | ||
165 | printk(KERN_ERR "*** on disk node:\n"); | 165 | printk(KERN_ERR "*** on disk node:\n"); |
166 | dump_bset(v, v->sets[0].data); | 166 | dump_bset(v, v->sets[0].data); |
167 | 167 | ||
168 | for (j = 0; j < new->keys; j++) | 168 | for (j = 0; j < new->keys; j++) |
169 | if (new->d[j] != v->sets[0].data->d[j]) | 169 | if (new->d[j] != v->sets[0].data->d[j]) |
170 | break; | 170 | break; |
171 | 171 | ||
172 | console_unlock(); | 172 | console_unlock(); |
173 | panic("verify failed at %u\n", j); | 173 | panic("verify failed at %u\n", j); |
174 | } | 174 | } |
175 | 175 | ||
176 | mutex_unlock(&b->c->verify_lock); | 176 | mutex_unlock(&b->c->verify_lock); |
177 | } | 177 | } |
178 | 178 | ||
179 | static void data_verify_endio(struct bio *bio, int error) | 179 | static void data_verify_endio(struct bio *bio, int error) |
180 | { | 180 | { |
181 | struct closure *cl = bio->bi_private; | 181 | struct closure *cl = bio->bi_private; |
182 | closure_put(cl); | 182 | closure_put(cl); |
183 | } | 183 | } |
184 | 184 | ||
185 | void bch_data_verify(struct search *s) | 185 | void bch_data_verify(struct search *s) |
186 | { | 186 | { |
187 | char name[BDEVNAME_SIZE]; | 187 | char name[BDEVNAME_SIZE]; |
188 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | 188 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); |
189 | struct closure *cl = &s->cl; | 189 | struct closure *cl = &s->cl; |
190 | struct bio *check; | 190 | struct bio *check; |
191 | struct bio_vec *bv; | 191 | struct bio_vec *bv; |
192 | int i; | 192 | int i; |
193 | 193 | ||
194 | if (!s->unaligned_bvec) | 194 | if (!s->unaligned_bvec) |
195 | bio_for_each_segment(bv, s->orig_bio, i) | 195 | bio_for_each_segment(bv, s->orig_bio, i) |
196 | bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; | 196 | bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; |
197 | 197 | ||
198 | check = bio_clone(s->orig_bio, GFP_NOIO); | 198 | check = bio_clone(s->orig_bio, GFP_NOIO); |
199 | if (!check) | 199 | if (!check) |
200 | return; | 200 | return; |
201 | 201 | ||
202 | if (bch_bio_alloc_pages(check, GFP_NOIO)) | 202 | if (bio_alloc_pages(check, GFP_NOIO)) |
203 | goto out_put; | 203 | goto out_put; |
204 | 204 | ||
205 | check->bi_rw = READ_SYNC; | 205 | check->bi_rw = READ_SYNC; |
206 | check->bi_private = cl; | 206 | check->bi_private = cl; |
207 | check->bi_end_io = data_verify_endio; | 207 | check->bi_end_io = data_verify_endio; |
208 | 208 | ||
209 | closure_bio_submit(check, cl, &dc->disk); | 209 | closure_bio_submit(check, cl, &dc->disk); |
210 | closure_sync(cl); | 210 | closure_sync(cl); |
211 | 211 | ||
212 | bio_for_each_segment(bv, s->orig_bio, i) { | 212 | bio_for_each_segment(bv, s->orig_bio, i) { |
213 | void *p1 = kmap(bv->bv_page); | 213 | void *p1 = kmap(bv->bv_page); |
214 | void *p2 = kmap(check->bi_io_vec[i].bv_page); | 214 | void *p2 = kmap(check->bi_io_vec[i].bv_page); |
215 | 215 | ||
216 | if (memcmp(p1 + bv->bv_offset, | 216 | if (memcmp(p1 + bv->bv_offset, |
217 | p2 + bv->bv_offset, | 217 | p2 + bv->bv_offset, |
218 | bv->bv_len)) | 218 | bv->bv_len)) |
219 | printk(KERN_ERR | 219 | printk(KERN_ERR |
220 | "bcache (%s): verify failed at sector %llu\n", | 220 | "bcache (%s): verify failed at sector %llu\n", |
221 | bdevname(dc->bdev, name), | 221 | bdevname(dc->bdev, name), |
222 | (uint64_t) s->orig_bio->bi_sector); | 222 | (uint64_t) s->orig_bio->bi_sector); |
223 | 223 | ||
224 | kunmap(bv->bv_page); | 224 | kunmap(bv->bv_page); |
225 | kunmap(check->bi_io_vec[i].bv_page); | 225 | kunmap(check->bi_io_vec[i].bv_page); |
226 | } | 226 | } |
227 | 227 | ||
228 | __bio_for_each_segment(bv, check, i, 0) | 228 | __bio_for_each_segment(bv, check, i, 0) |
229 | __free_page(bv->bv_page); | 229 | __free_page(bv->bv_page); |
230 | out_put: | 230 | out_put: |
231 | bio_put(check); | 231 | bio_put(check); |
232 | } | 232 | } |
233 | 233 | ||
234 | #endif | 234 | #endif |
235 | 235 | ||
236 | #ifdef CONFIG_BCACHE_EDEBUG | 236 | #ifdef CONFIG_BCACHE_EDEBUG |
237 | 237 | ||
238 | unsigned bch_count_data(struct btree *b) | 238 | unsigned bch_count_data(struct btree *b) |
239 | { | 239 | { |
240 | unsigned ret = 0; | 240 | unsigned ret = 0; |
241 | struct btree_iter iter; | 241 | struct btree_iter iter; |
242 | struct bkey *k; | 242 | struct bkey *k; |
243 | 243 | ||
244 | if (!b->level) | 244 | if (!b->level) |
245 | for_each_key(b, k, &iter) | 245 | for_each_key(b, k, &iter) |
246 | ret += KEY_SIZE(k); | 246 | ret += KEY_SIZE(k); |
247 | return ret; | 247 | return ret; |
248 | } | 248 | } |
249 | 249 | ||
250 | static void vdump_bucket_and_panic(struct btree *b, const char *fmt, | 250 | static void vdump_bucket_and_panic(struct btree *b, const char *fmt, |
251 | va_list args) | 251 | va_list args) |
252 | { | 252 | { |
253 | unsigned i; | 253 | unsigned i; |
254 | char buf[80]; | 254 | char buf[80]; |
255 | 255 | ||
256 | console_lock(); | 256 | console_lock(); |
257 | 257 | ||
258 | for (i = 0; i <= b->nsets; i++) | 258 | for (i = 0; i <= b->nsets; i++) |
259 | dump_bset(b, b->sets[i].data); | 259 | dump_bset(b, b->sets[i].data); |
260 | 260 | ||
261 | vprintk(fmt, args); | 261 | vprintk(fmt, args); |
262 | 262 | ||
263 | console_unlock(); | 263 | console_unlock(); |
264 | 264 | ||
265 | bch_btree_to_text(buf, sizeof(buf), b); | 265 | bch_btree_to_text(buf, sizeof(buf), b); |
266 | panic("at %s\n", buf); | 266 | panic("at %s\n", buf); |
267 | } | 267 | } |
268 | 268 | ||
269 | void bch_check_key_order_msg(struct btree *b, struct bset *i, | 269 | void bch_check_key_order_msg(struct btree *b, struct bset *i, |
270 | const char *fmt, ...) | 270 | const char *fmt, ...) |
271 | { | 271 | { |
272 | struct bkey *k; | 272 | struct bkey *k; |
273 | 273 | ||
274 | if (!i->keys) | 274 | if (!i->keys) |
275 | return; | 275 | return; |
276 | 276 | ||
277 | for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k)) | 277 | for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k)) |
278 | if (skipped_backwards(b, k)) { | 278 | if (skipped_backwards(b, k)) { |
279 | va_list args; | 279 | va_list args; |
280 | va_start(args, fmt); | 280 | va_start(args, fmt); |
281 | 281 | ||
282 | vdump_bucket_and_panic(b, fmt, args); | 282 | vdump_bucket_and_panic(b, fmt, args); |
283 | va_end(args); | 283 | va_end(args); |
284 | } | 284 | } |
285 | } | 285 | } |
286 | 286 | ||
287 | void bch_check_keys(struct btree *b, const char *fmt, ...) | 287 | void bch_check_keys(struct btree *b, const char *fmt, ...) |
288 | { | 288 | { |
289 | va_list args; | 289 | va_list args; |
290 | struct bkey *k, *p = NULL; | 290 | struct bkey *k, *p = NULL; |
291 | struct btree_iter iter; | 291 | struct btree_iter iter; |
292 | 292 | ||
293 | if (b->level) | 293 | if (b->level) |
294 | return; | 294 | return; |
295 | 295 | ||
296 | for_each_key(b, k, &iter) { | 296 | for_each_key(b, k, &iter) { |
297 | if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) { | 297 | if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) { |
298 | printk(KERN_ERR "Keys out of order:\n"); | 298 | printk(KERN_ERR "Keys out of order:\n"); |
299 | goto bug; | 299 | goto bug; |
300 | } | 300 | } |
301 | 301 | ||
302 | if (bch_ptr_invalid(b, k)) | 302 | if (bch_ptr_invalid(b, k)) |
303 | continue; | 303 | continue; |
304 | 304 | ||
305 | if (p && bkey_cmp(p, &START_KEY(k)) > 0) { | 305 | if (p && bkey_cmp(p, &START_KEY(k)) > 0) { |
306 | printk(KERN_ERR "Overlapping keys:\n"); | 306 | printk(KERN_ERR "Overlapping keys:\n"); |
307 | goto bug; | 307 | goto bug; |
308 | } | 308 | } |
309 | p = k; | 309 | p = k; |
310 | } | 310 | } |
311 | return; | 311 | return; |
312 | bug: | 312 | bug: |
313 | va_start(args, fmt); | 313 | va_start(args, fmt); |
314 | vdump_bucket_and_panic(b, fmt, args); | 314 | vdump_bucket_and_panic(b, fmt, args); |
315 | va_end(args); | 315 | va_end(args); |
316 | } | 316 | } |
317 | 317 | ||
318 | #endif | 318 | #endif |
319 | 319 | ||
320 | #ifdef CONFIG_DEBUG_FS | 320 | #ifdef CONFIG_DEBUG_FS |
321 | 321 | ||
322 | /* XXX: cache set refcounting */ | 322 | /* XXX: cache set refcounting */ |
323 | 323 | ||
324 | struct dump_iterator { | 324 | struct dump_iterator { |
325 | char buf[PAGE_SIZE]; | 325 | char buf[PAGE_SIZE]; |
326 | size_t bytes; | 326 | size_t bytes; |
327 | struct cache_set *c; | 327 | struct cache_set *c; |
328 | struct keybuf keys; | 328 | struct keybuf keys; |
329 | }; | 329 | }; |
330 | 330 | ||
331 | static bool dump_pred(struct keybuf *buf, struct bkey *k) | 331 | static bool dump_pred(struct keybuf *buf, struct bkey *k) |
332 | { | 332 | { |
333 | return true; | 333 | return true; |
334 | } | 334 | } |
335 | 335 | ||
336 | static ssize_t bch_dump_read(struct file *file, char __user *buf, | 336 | static ssize_t bch_dump_read(struct file *file, char __user *buf, |
337 | size_t size, loff_t *ppos) | 337 | size_t size, loff_t *ppos) |
338 | { | 338 | { |
339 | struct dump_iterator *i = file->private_data; | 339 | struct dump_iterator *i = file->private_data; |
340 | ssize_t ret = 0; | 340 | ssize_t ret = 0; |
341 | char kbuf[80]; | 341 | char kbuf[80]; |
342 | 342 | ||
343 | while (size) { | 343 | while (size) { |
344 | struct keybuf_key *w; | 344 | struct keybuf_key *w; |
345 | unsigned bytes = min(i->bytes, size); | 345 | unsigned bytes = min(i->bytes, size); |
346 | 346 | ||
347 | int err = copy_to_user(buf, i->buf, bytes); | 347 | int err = copy_to_user(buf, i->buf, bytes); |
348 | if (err) | 348 | if (err) |
349 | return err; | 349 | return err; |
350 | 350 | ||
351 | ret += bytes; | 351 | ret += bytes; |
352 | buf += bytes; | 352 | buf += bytes; |
353 | size -= bytes; | 353 | size -= bytes; |
354 | i->bytes -= bytes; | 354 | i->bytes -= bytes; |
355 | memmove(i->buf, i->buf + bytes, i->bytes); | 355 | memmove(i->buf, i->buf + bytes, i->bytes); |
356 | 356 | ||
357 | if (i->bytes) | 357 | if (i->bytes) |
358 | break; | 358 | break; |
359 | 359 | ||
360 | w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred); | 360 | w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred); |
361 | if (!w) | 361 | if (!w) |
362 | break; | 362 | break; |
363 | 363 | ||
364 | bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key); | 364 | bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key); |
365 | i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); | 365 | i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); |
366 | bch_keybuf_del(&i->keys, w); | 366 | bch_keybuf_del(&i->keys, w); |
367 | } | 367 | } |
368 | 368 | ||
369 | return ret; | 369 | return ret; |
370 | } | 370 | } |
371 | 371 | ||
372 | static int bch_dump_open(struct inode *inode, struct file *file) | 372 | static int bch_dump_open(struct inode *inode, struct file *file) |
373 | { | 373 | { |
374 | struct cache_set *c = inode->i_private; | 374 | struct cache_set *c = inode->i_private; |
375 | struct dump_iterator *i; | 375 | struct dump_iterator *i; |
376 | 376 | ||
377 | i = kzalloc(sizeof(struct dump_iterator), GFP_KERNEL); | 377 | i = kzalloc(sizeof(struct dump_iterator), GFP_KERNEL); |
378 | if (!i) | 378 | if (!i) |
379 | return -ENOMEM; | 379 | return -ENOMEM; |
380 | 380 | ||
381 | file->private_data = i; | 381 | file->private_data = i; |
382 | i->c = c; | 382 | i->c = c; |
383 | bch_keybuf_init(&i->keys); | 383 | bch_keybuf_init(&i->keys); |
384 | i->keys.last_scanned = KEY(0, 0, 0); | 384 | i->keys.last_scanned = KEY(0, 0, 0); |
385 | 385 | ||
386 | return 0; | 386 | return 0; |
387 | } | 387 | } |
388 | 388 | ||
389 | static int bch_dump_release(struct inode *inode, struct file *file) | 389 | static int bch_dump_release(struct inode *inode, struct file *file) |
390 | { | 390 | { |
391 | kfree(file->private_data); | 391 | kfree(file->private_data); |
392 | return 0; | 392 | return 0; |
393 | } | 393 | } |
394 | 394 | ||
395 | static const struct file_operations cache_set_debug_ops = { | 395 | static const struct file_operations cache_set_debug_ops = { |
396 | .owner = THIS_MODULE, | 396 | .owner = THIS_MODULE, |
397 | .open = bch_dump_open, | 397 | .open = bch_dump_open, |
398 | .read = bch_dump_read, | 398 | .read = bch_dump_read, |
399 | .release = bch_dump_release | 399 | .release = bch_dump_release |
400 | }; | 400 | }; |
401 | 401 | ||
402 | void bch_debug_init_cache_set(struct cache_set *c) | 402 | void bch_debug_init_cache_set(struct cache_set *c) |
403 | { | 403 | { |
404 | if (!IS_ERR_OR_NULL(debug)) { | 404 | if (!IS_ERR_OR_NULL(debug)) { |
405 | char name[50]; | 405 | char name[50]; |
406 | snprintf(name, 50, "bcache-%pU", c->sb.set_uuid); | 406 | snprintf(name, 50, "bcache-%pU", c->sb.set_uuid); |
407 | 407 | ||
408 | c->debug = debugfs_create_file(name, 0400, debug, c, | 408 | c->debug = debugfs_create_file(name, 0400, debug, c, |
409 | &cache_set_debug_ops); | 409 | &cache_set_debug_ops); |
410 | } | 410 | } |
411 | } | 411 | } |
412 | 412 | ||
413 | #endif | 413 | #endif |
414 | 414 | ||
415 | void bch_debug_exit(void) | 415 | void bch_debug_exit(void) |
416 | { | 416 | { |
417 | if (!IS_ERR_OR_NULL(debug)) | 417 | if (!IS_ERR_OR_NULL(debug)) |
418 | debugfs_remove_recursive(debug); | 418 | debugfs_remove_recursive(debug); |
419 | } | 419 | } |
420 | 420 | ||
421 | int __init bch_debug_init(struct kobject *kobj) | 421 | int __init bch_debug_init(struct kobject *kobj) |
422 | { | 422 | { |
423 | int ret = 0; | 423 | int ret = 0; |
424 | 424 | ||
425 | debug = debugfs_create_dir("bcache", NULL); | 425 | debug = debugfs_create_dir("bcache", NULL); |
426 | return ret; | 426 | return ret; |
427 | } | 427 | } |
428 | 428 |
drivers/md/bcache/io.c
1 | /* | 1 | /* |
2 | * Some low level IO code, and hacks for various block layer limitations | 2 | * Some low level IO code, and hacks for various block layer limitations |
3 | * | 3 | * |
4 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | 4 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> |
5 | * Copyright 2012 Google, Inc. | 5 | * Copyright 2012 Google, Inc. |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include "bcache.h" | 8 | #include "bcache.h" |
9 | #include "bset.h" | 9 | #include "bset.h" |
10 | #include "debug.h" | 10 | #include "debug.h" |
11 | 11 | ||
12 | #include <linux/blkdev.h> | 12 | #include <linux/blkdev.h> |
13 | 13 | ||
14 | static void bch_bi_idx_hack_endio(struct bio *bio, int error) | 14 | static void bch_bi_idx_hack_endio(struct bio *bio, int error) |
15 | { | 15 | { |
16 | struct bio *p = bio->bi_private; | 16 | struct bio *p = bio->bi_private; |
17 | 17 | ||
18 | bio_endio(p, error); | 18 | bio_endio(p, error); |
19 | bio_put(bio); | 19 | bio_put(bio); |
20 | } | 20 | } |
21 | 21 | ||
22 | static void bch_generic_make_request_hack(struct bio *bio) | 22 | static void bch_generic_make_request_hack(struct bio *bio) |
23 | { | 23 | { |
24 | if (bio->bi_idx) { | 24 | if (bio->bi_idx) { |
25 | struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio)); | 25 | struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio)); |
26 | 26 | ||
27 | memcpy(clone->bi_io_vec, | 27 | memcpy(clone->bi_io_vec, |
28 | bio_iovec(bio), | 28 | bio_iovec(bio), |
29 | bio_segments(bio) * sizeof(struct bio_vec)); | 29 | bio_segments(bio) * sizeof(struct bio_vec)); |
30 | 30 | ||
31 | clone->bi_sector = bio->bi_sector; | 31 | clone->bi_sector = bio->bi_sector; |
32 | clone->bi_bdev = bio->bi_bdev; | 32 | clone->bi_bdev = bio->bi_bdev; |
33 | clone->bi_rw = bio->bi_rw; | 33 | clone->bi_rw = bio->bi_rw; |
34 | clone->bi_vcnt = bio_segments(bio); | 34 | clone->bi_vcnt = bio_segments(bio); |
35 | clone->bi_size = bio->bi_size; | 35 | clone->bi_size = bio->bi_size; |
36 | 36 | ||
37 | clone->bi_private = bio; | 37 | clone->bi_private = bio; |
38 | clone->bi_end_io = bch_bi_idx_hack_endio; | 38 | clone->bi_end_io = bch_bi_idx_hack_endio; |
39 | 39 | ||
40 | bio = clone; | 40 | bio = clone; |
41 | } | 41 | } |
42 | 42 | ||
43 | /* | 43 | /* |
44 | * Hack, since drivers that clone bios clone up to bi_max_vecs, but our | 44 | * Hack, since drivers that clone bios clone up to bi_max_vecs, but our |
45 | * bios might have had more than that (before we split them to fit per-device | 45 | * bios might have had more than that (before we split them to fit per-device |
46 | * limitations). | 46 | * limitations). |
47 | * | 47 | * |
48 | * To be taken out once immutable bvec stuff is in. | 48 | * To be taken out once immutable bvec stuff is in. |
49 | */ | 49 | */ |
50 | bio->bi_max_vecs = bio->bi_vcnt; | 50 | bio->bi_max_vecs = bio->bi_vcnt; |
51 | 51 | ||
52 | generic_make_request(bio); | 52 | generic_make_request(bio); |
53 | } | 53 | } |
54 | 54 | ||
55 | /** | 55 | /** |
56 | * bch_bio_split - split a bio | 56 | * bch_bio_split - split a bio |
57 | * @bio: bio to split | 57 | * @bio: bio to split |
58 | * @sectors: number of sectors to split from the front of @bio | 58 | * @sectors: number of sectors to split from the front of @bio |
59 | * @gfp: gfp mask | 59 | * @gfp: gfp mask |
60 | * @bs: bio set to allocate from | 60 | * @bs: bio set to allocate from |
61 | * | 61 | * |
62 | * Allocates and returns a new bio which represents @sectors from the start of | 62 | * Allocates and returns a new bio which represents @sectors from the start of |
63 | * @bio, and updates @bio to represent the remaining sectors. | 63 | * @bio, and updates @bio to represent the remaining sectors. |
64 | * | 64 | * |
65 | * If bio_sectors(@bio) was less than or equal to @sectors, returns @bio | 65 | * If bio_sectors(@bio) was less than or equal to @sectors, returns @bio |
66 | * unchanged. | 66 | * unchanged. |
67 | * | 67 | * |
68 | * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a | 68 | * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a |
69 | * bvec boundary; it is the caller's responsibility to ensure that @bio is not | 69 | * bvec boundary; it is the caller's responsibility to ensure that @bio is not |
70 | * freed before the split. | 70 | * freed before the split. |
71 | * | ||
72 | * If bch_bio_split() is running under generic_make_request(), it's not safe to | ||
73 | * allocate more than one bio from the same bio set. Therefore, if it is running | ||
74 | * under generic_make_request() it masks out __GFP_WAIT when doing the | ||
75 | * allocation. The caller must check for failure if there's any possibility of | ||
76 | * it being called from under generic_make_request(); it is then the caller's | ||
77 | * responsibility to retry from a safe context (by e.g. punting to workqueue). | ||
78 | */ | 71 | */ |
79 | struct bio *bch_bio_split(struct bio *bio, int sectors, | 72 | struct bio *bch_bio_split(struct bio *bio, int sectors, |
80 | gfp_t gfp, struct bio_set *bs) | 73 | gfp_t gfp, struct bio_set *bs) |
81 | { | 74 | { |
82 | unsigned idx = bio->bi_idx, vcnt = 0, nbytes = sectors << 9; | 75 | unsigned idx = bio->bi_idx, vcnt = 0, nbytes = sectors << 9; |
83 | struct bio_vec *bv; | 76 | struct bio_vec *bv; |
84 | struct bio *ret = NULL; | 77 | struct bio *ret = NULL; |
85 | 78 | ||
86 | BUG_ON(sectors <= 0); | 79 | BUG_ON(sectors <= 0); |
87 | 80 | ||
88 | /* | ||
89 | * If we're being called from underneath generic_make_request() and we | ||
90 | * already allocated any bios from this bio set, we risk deadlock if we | ||
91 | * use the mempool. So instead, we possibly fail and let the caller punt | ||
92 | * to workqueue or somesuch and retry in a safe context. | ||
93 | */ | ||
94 | if (current->bio_list) | ||
95 | gfp &= ~__GFP_WAIT; | ||
96 | |||
97 | if (sectors >= bio_sectors(bio)) | 81 | if (sectors >= bio_sectors(bio)) |
98 | return bio; | 82 | return bio; |
99 | 83 | ||
100 | if (bio->bi_rw & REQ_DISCARD) { | 84 | if (bio->bi_rw & REQ_DISCARD) { |
101 | ret = bio_alloc_bioset(gfp, 1, bs); | 85 | ret = bio_alloc_bioset(gfp, 1, bs); |
102 | if (!ret) | 86 | if (!ret) |
103 | return NULL; | 87 | return NULL; |
104 | idx = 0; | 88 | idx = 0; |
105 | goto out; | 89 | goto out; |
106 | } | 90 | } |
107 | 91 | ||
108 | bio_for_each_segment(bv, bio, idx) { | 92 | bio_for_each_segment(bv, bio, idx) { |
109 | vcnt = idx - bio->bi_idx; | 93 | vcnt = idx - bio->bi_idx; |
110 | 94 | ||
111 | if (!nbytes) { | 95 | if (!nbytes) { |
112 | ret = bio_alloc_bioset(gfp, vcnt, bs); | 96 | ret = bio_alloc_bioset(gfp, vcnt, bs); |
113 | if (!ret) | 97 | if (!ret) |
114 | return NULL; | 98 | return NULL; |
115 | 99 | ||
116 | memcpy(ret->bi_io_vec, bio_iovec(bio), | 100 | memcpy(ret->bi_io_vec, bio_iovec(bio), |
117 | sizeof(struct bio_vec) * vcnt); | 101 | sizeof(struct bio_vec) * vcnt); |
118 | 102 | ||
119 | break; | 103 | break; |
120 | } else if (nbytes < bv->bv_len) { | 104 | } else if (nbytes < bv->bv_len) { |
121 | ret = bio_alloc_bioset(gfp, ++vcnt, bs); | 105 | ret = bio_alloc_bioset(gfp, ++vcnt, bs); |
122 | if (!ret) | 106 | if (!ret) |
123 | return NULL; | 107 | return NULL; |
124 | 108 | ||
125 | memcpy(ret->bi_io_vec, bio_iovec(bio), | 109 | memcpy(ret->bi_io_vec, bio_iovec(bio), |
126 | sizeof(struct bio_vec) * vcnt); | 110 | sizeof(struct bio_vec) * vcnt); |
127 | 111 | ||
128 | ret->bi_io_vec[vcnt - 1].bv_len = nbytes; | 112 | ret->bi_io_vec[vcnt - 1].bv_len = nbytes; |
129 | bv->bv_offset += nbytes; | 113 | bv->bv_offset += nbytes; |
130 | bv->bv_len -= nbytes; | 114 | bv->bv_len -= nbytes; |
131 | break; | 115 | break; |
132 | } | 116 | } |
133 | 117 | ||
134 | nbytes -= bv->bv_len; | 118 | nbytes -= bv->bv_len; |
135 | } | 119 | } |
136 | out: | 120 | out: |
137 | ret->bi_bdev = bio->bi_bdev; | 121 | ret->bi_bdev = bio->bi_bdev; |
138 | ret->bi_sector = bio->bi_sector; | 122 | ret->bi_sector = bio->bi_sector; |
139 | ret->bi_size = sectors << 9; | 123 | ret->bi_size = sectors << 9; |
140 | ret->bi_rw = bio->bi_rw; | 124 | ret->bi_rw = bio->bi_rw; |
141 | ret->bi_vcnt = vcnt; | 125 | ret->bi_vcnt = vcnt; |
142 | ret->bi_max_vecs = vcnt; | 126 | ret->bi_max_vecs = vcnt; |
143 | 127 | ||
144 | bio->bi_sector += sectors; | 128 | bio->bi_sector += sectors; |
145 | bio->bi_size -= sectors << 9; | 129 | bio->bi_size -= sectors << 9; |
146 | bio->bi_idx = idx; | 130 | bio->bi_idx = idx; |
147 | 131 | ||
148 | if (bio_integrity(bio)) { | 132 | if (bio_integrity(bio)) { |
149 | if (bio_integrity_clone(ret, bio, gfp)) { | 133 | if (bio_integrity_clone(ret, bio, gfp)) { |
150 | bio_put(ret); | 134 | bio_put(ret); |
151 | return NULL; | 135 | return NULL; |
152 | } | 136 | } |
153 | 137 | ||
154 | bio_integrity_trim(ret, 0, bio_sectors(ret)); | 138 | bio_integrity_trim(ret, 0, bio_sectors(ret)); |
155 | bio_integrity_trim(bio, bio_sectors(ret), bio_sectors(bio)); | 139 | bio_integrity_trim(bio, bio_sectors(ret), bio_sectors(bio)); |
156 | } | 140 | } |
157 | 141 | ||
158 | return ret; | 142 | return ret; |
159 | } | 143 | } |
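A minimal caller-side sketch of the contract documented above: keep peeling a fixed number of sectors off the front of a bio until the final call hands back the original bio, which terminates the loop (the same pattern bch_generic_make_request() uses below). The bio set my_split_bs and the submit_piece() helper are illustrative placeholders, not bcache API; with GFP_NOIO and a bio set, the allocation is expected not to fail, which is why no NULL check appears here.

static void split_and_submit_sketch(struct bio *bio, struct bio_set *my_split_bs)
{
	struct bio *n;

	do {
		/*
		 * If more than 8 sectors remain, a new bio covering the first
		 * 8 is allocated from my_split_bs and 'bio' is advanced past
		 * them; otherwise 'bio' itself is returned and the loop ends.
		 */
		n = bch_bio_split(bio, 8, GFP_NOIO, my_split_bs);

		submit_piece(n);	/* illustrative submit hook */
	} while (n != bio);
}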
160 | 144 | ||
161 | static unsigned bch_bio_max_sectors(struct bio *bio) | 145 | static unsigned bch_bio_max_sectors(struct bio *bio) |
162 | { | 146 | { |
163 | unsigned ret = bio_sectors(bio); | 147 | unsigned ret = bio_sectors(bio); |
164 | struct request_queue *q = bdev_get_queue(bio->bi_bdev); | 148 | struct request_queue *q = bdev_get_queue(bio->bi_bdev); |
165 | unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES, | 149 | unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES, |
166 | queue_max_segments(q)); | 150 | queue_max_segments(q)); |
167 | struct bio_vec *bv, *end = bio_iovec(bio) + | ||
168 | min_t(int, bio_segments(bio), max_segments); | ||
169 | 151 | ||
170 | if (bio->bi_rw & REQ_DISCARD) | 152 | if (bio->bi_rw & REQ_DISCARD) |
171 | return min(ret, q->limits.max_discard_sectors); | 153 | return min(ret, q->limits.max_discard_sectors); |
172 | 154 | ||
173 | if (bio_segments(bio) > max_segments || | 155 | if (bio_segments(bio) > max_segments || |
174 | q->merge_bvec_fn) { | 156 | q->merge_bvec_fn) { |
157 | struct bio_vec *bv; | ||
158 | int i, seg = 0; | ||
159 | |||
175 | ret = 0; | 160 | ret = 0; |
176 | 161 | ||
177 | for (bv = bio_iovec(bio); bv < end; bv++) { | 162 | bio_for_each_segment(bv, bio, i) { |
178 | struct bvec_merge_data bvm = { | 163 | struct bvec_merge_data bvm = { |
179 | .bi_bdev = bio->bi_bdev, | 164 | .bi_bdev = bio->bi_bdev, |
180 | .bi_sector = bio->bi_sector, | 165 | .bi_sector = bio->bi_sector, |
181 | .bi_size = ret << 9, | 166 | .bi_size = ret << 9, |
182 | .bi_rw = bio->bi_rw, | 167 | .bi_rw = bio->bi_rw, |
183 | }; | 168 | }; |
184 | 169 | ||
170 | if (seg == max_segments) | ||
171 | break; | ||
172 | |||
185 | if (q->merge_bvec_fn && | 173 | if (q->merge_bvec_fn && |
186 | q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len) | 174 | q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len) |
187 | break; | 175 | break; |
188 | 176 | ||
177 | seg++; | ||
189 | ret += bv->bv_len >> 9; | 178 | ret += bv->bv_len >> 9; |
190 | } | 179 | } |
191 | } | 180 | } |
192 | 181 | ||
193 | ret = min(ret, queue_max_sectors(q)); | 182 | ret = min(ret, queue_max_sectors(q)); |
194 | 183 | ||
195 | WARN_ON(!ret); | 184 | WARN_ON(!ret); |
196 | ret = max_t(int, ret, bio_iovec(bio)->bv_len >> 9); | 185 | ret = max_t(int, ret, bio_iovec(bio)->bv_len >> 9); |
197 | 186 | ||
198 | return ret; | 187 | return ret; |
199 | } | 188 | } |
200 | 189 | ||
201 | static void bch_bio_submit_split_done(struct closure *cl) | 190 | static void bch_bio_submit_split_done(struct closure *cl) |
202 | { | 191 | { |
203 | struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); | 192 | struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); |
204 | 193 | ||
205 | s->bio->bi_end_io = s->bi_end_io; | 194 | s->bio->bi_end_io = s->bi_end_io; |
206 | s->bio->bi_private = s->bi_private; | 195 | s->bio->bi_private = s->bi_private; |
207 | bio_endio(s->bio, 0); | 196 | bio_endio(s->bio, 0); |
208 | 197 | ||
209 | closure_debug_destroy(&s->cl); | 198 | closure_debug_destroy(&s->cl); |
210 | mempool_free(s, s->p->bio_split_hook); | 199 | mempool_free(s, s->p->bio_split_hook); |
211 | } | 200 | } |
212 | 201 | ||
213 | static void bch_bio_submit_split_endio(struct bio *bio, int error) | 202 | static void bch_bio_submit_split_endio(struct bio *bio, int error) |
214 | { | 203 | { |
215 | struct closure *cl = bio->bi_private; | 204 | struct closure *cl = bio->bi_private; |
216 | struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); | 205 | struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); |
217 | 206 | ||
218 | if (error) | 207 | if (error) |
219 | clear_bit(BIO_UPTODATE, &s->bio->bi_flags); | 208 | clear_bit(BIO_UPTODATE, &s->bio->bi_flags); |
220 | 209 | ||
221 | bio_put(bio); | 210 | bio_put(bio); |
222 | closure_put(cl); | 211 | closure_put(cl); |
223 | } | 212 | } |
224 | 213 | ||
225 | static void __bch_bio_submit_split(struct closure *cl) | ||
226 | { | ||
227 | struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); | ||
228 | struct bio *bio = s->bio, *n; | ||
229 | |||
230 | do { | ||
231 | n = bch_bio_split(bio, bch_bio_max_sectors(bio), | ||
232 | GFP_NOIO, s->p->bio_split); | ||
233 | if (!n) | ||
234 | continue_at(cl, __bch_bio_submit_split, system_wq); | ||
235 | |||
236 | n->bi_end_io = bch_bio_submit_split_endio; | ||
237 | n->bi_private = cl; | ||
238 | |||
239 | closure_get(cl); | ||
240 | bch_generic_make_request_hack(n); | ||
241 | } while (n != bio); | ||
242 | |||
243 | continue_at(cl, bch_bio_submit_split_done, NULL); | ||
244 | } | ||
245 | |||
246 | void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) | 214 | void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) |
247 | { | 215 | { |
248 | struct bio_split_hook *s; | 216 | struct bio_split_hook *s; |
217 | struct bio *n; | ||
249 | 218 | ||
250 | if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD)) | 219 | if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD)) |
251 | goto submit; | 220 | goto submit; |
252 | 221 | ||
253 | if (bio_sectors(bio) <= bch_bio_max_sectors(bio)) | 222 | if (bio_sectors(bio) <= bch_bio_max_sectors(bio)) |
254 | goto submit; | 223 | goto submit; |
255 | 224 | ||
256 | s = mempool_alloc(p->bio_split_hook, GFP_NOIO); | 225 | s = mempool_alloc(p->bio_split_hook, GFP_NOIO); |
226 | closure_init(&s->cl, NULL); | ||
257 | 227 | ||
258 | s->bio = bio; | 228 | s->bio = bio; |
259 | s->p = p; | 229 | s->p = p; |
260 | s->bi_end_io = bio->bi_end_io; | 230 | s->bi_end_io = bio->bi_end_io; |
261 | s->bi_private = bio->bi_private; | 231 | s->bi_private = bio->bi_private; |
262 | bio_get(bio); | 232 | bio_get(bio); |
263 | 233 | ||
264 | closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL); | 234 | do { |
265 | return; | 235 | n = bch_bio_split(bio, bch_bio_max_sectors(bio), |
236 | GFP_NOIO, s->p->bio_split); | ||
237 | |||
238 | n->bi_end_io = bch_bio_submit_split_endio; | ||
239 | n->bi_private = &s->cl; | ||
240 | |||
241 | closure_get(&s->cl); | ||
242 | bch_generic_make_request_hack(n); | ||
243 | } while (n != bio); | ||
244 | |||
245 | continue_at(&s->cl, bch_bio_submit_split_done, NULL); | ||
266 | submit: | 246 | submit: |
267 | bch_generic_make_request_hack(bio); | 247 | bch_generic_make_request_hack(bio); |
268 | } | 248 | } |
269 | 249 | ||
270 | /* Bios with headers */ | 250 | /* Bios with headers */ |
271 | 251 | ||
272 | void bch_bbio_free(struct bio *bio, struct cache_set *c) | 252 | void bch_bbio_free(struct bio *bio, struct cache_set *c) |
273 | { | 253 | { |
274 | struct bbio *b = container_of(bio, struct bbio, bio); | 254 | struct bbio *b = container_of(bio, struct bbio, bio); |
275 | mempool_free(b, c->bio_meta); | 255 | mempool_free(b, c->bio_meta); |
276 | } | 256 | } |
277 | 257 | ||
278 | struct bio *bch_bbio_alloc(struct cache_set *c) | 258 | struct bio *bch_bbio_alloc(struct cache_set *c) |
279 | { | 259 | { |
280 | struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO); | 260 | struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO); |
281 | struct bio *bio = &b->bio; | 261 | struct bio *bio = &b->bio; |
282 | 262 | ||
283 | bio_init(bio); | 263 | bio_init(bio); |
284 | bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; | 264 | bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; |
285 | bio->bi_max_vecs = bucket_pages(c); | 265 | bio->bi_max_vecs = bucket_pages(c); |
286 | bio->bi_io_vec = bio->bi_inline_vecs; | 266 | bio->bi_io_vec = bio->bi_inline_vecs; |
287 | 267 | ||
288 | return bio; | 268 | return bio; |
289 | } | 269 | } |
290 | 270 | ||
291 | void __bch_submit_bbio(struct bio *bio, struct cache_set *c) | 271 | void __bch_submit_bbio(struct bio *bio, struct cache_set *c) |
292 | { | 272 | { |
293 | struct bbio *b = container_of(bio, struct bbio, bio); | 273 | struct bbio *b = container_of(bio, struct bbio, bio); |
294 | 274 | ||
295 | bio->bi_sector = PTR_OFFSET(&b->key, 0); | 275 | bio->bi_sector = PTR_OFFSET(&b->key, 0); |
296 | bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev; | 276 | bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev; |
297 | 277 | ||
298 | b->submit_time_us = local_clock_us(); | 278 | b->submit_time_us = local_clock_us(); |
299 | closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0)); | 279 | closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0)); |
300 | } | 280 | } |
301 | 281 | ||
302 | void bch_submit_bbio(struct bio *bio, struct cache_set *c, | 282 | void bch_submit_bbio(struct bio *bio, struct cache_set *c, |
303 | struct bkey *k, unsigned ptr) | 283 | struct bkey *k, unsigned ptr) |
304 | { | 284 | { |
305 | struct bbio *b = container_of(bio, struct bbio, bio); | 285 | struct bbio *b = container_of(bio, struct bbio, bio); |
306 | bch_bkey_copy_single_ptr(&b->key, k, ptr); | 286 | bch_bkey_copy_single_ptr(&b->key, k, ptr); |
307 | __bch_submit_bbio(bio, c); | 287 | __bch_submit_bbio(bio, c); |
308 | } | 288 | } |
309 | 289 | ||
310 | /* IO errors */ | 290 | /* IO errors */ |
311 | 291 | ||
312 | void bch_count_io_errors(struct cache *ca, int error, const char *m) | 292 | void bch_count_io_errors(struct cache *ca, int error, const char *m) |
313 | { | 293 | { |
314 | /* | 294 | /* |
315 | * The halflife of an error is: | 295 | * The halflife of an error is: |
316 | * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh | 296 | * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh |
317 | */ | 297 | */ |
318 | 298 | ||
319 | if (ca->set->error_decay) { | 299 | if (ca->set->error_decay) { |
320 | unsigned count = atomic_inc_return(&ca->io_count); | 300 | unsigned count = atomic_inc_return(&ca->io_count); |
321 | 301 | ||
322 | while (count > ca->set->error_decay) { | 302 | while (count > ca->set->error_decay) { |
323 | unsigned errors; | 303 | unsigned errors; |
324 | unsigned old = count; | 304 | unsigned old = count; |
325 | unsigned new = count - ca->set->error_decay; | 305 | unsigned new = count - ca->set->error_decay; |
326 | 306 | ||
327 | /* | 307 | /* |
328 | * First we subtract refresh from count; each time we | 308 | * First we subtract refresh from count; each time we |
329 | * successfully do so, we rescale the errors once: | 309 | * successfully do so, we rescale the errors once: |
330 | */ | 310 | */ |
331 | 311 | ||
332 | count = atomic_cmpxchg(&ca->io_count, old, new); | 312 | count = atomic_cmpxchg(&ca->io_count, old, new); |
333 | 313 | ||
334 | if (count == old) { | 314 | if (count == old) { |
335 | count = new; | 315 | count = new; |
336 | 316 | ||
337 | errors = atomic_read(&ca->io_errors); | 317 | errors = atomic_read(&ca->io_errors); |
338 | do { | 318 | do { |
339 | old = errors; | 319 | old = errors; |
340 | new = ((uint64_t) errors * 127) / 128; | 320 | new = ((uint64_t) errors * 127) / 128; |
341 | errors = atomic_cmpxchg(&ca->io_errors, | 321 | errors = atomic_cmpxchg(&ca->io_errors, |
342 | old, new); | 322 | old, new); |
343 | } while (old != errors); | 323 | } while (old != errors); |
344 | } | 324 | } |
345 | } | 325 | } |
346 | } | 326 | } |
347 | 327 | ||
348 | if (error) { | 328 | if (error) { |
349 | char buf[BDEVNAME_SIZE]; | 329 | char buf[BDEVNAME_SIZE]; |
350 | unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT, | 330 | unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT, |
351 | &ca->io_errors); | 331 | &ca->io_errors); |
352 | errors >>= IO_ERROR_SHIFT; | 332 | errors >>= IO_ERROR_SHIFT; |
353 | 333 | ||
354 | if (errors < ca->set->error_limit) | 334 | if (errors < ca->set->error_limit) |
355 | pr_err("%s: IO error on %s, recovering", | 335 | pr_err("%s: IO error on %s, recovering", |
356 | bdevname(ca->bdev, buf), m); | 336 | bdevname(ca->bdev, buf), m); |
357 | else | 337 | else |
358 | bch_cache_set_error(ca->set, | 338 | bch_cache_set_error(ca->set, |
359 | "%s: too many IO errors %s", | 339 | "%s: too many IO errors %s", |
360 | bdevname(ca->bdev, buf), m); | 340 | bdevname(ca->bdev, buf), m); |
361 | } | 341 | } |
362 | } | 342 | } |
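A standalone arithmetic check of the halflife quoted in the comment above (plain userspace C, not kernel code): scaling the error count by 127/128 once per error_decay I/Os halves it after roughly 88 such refresh periods.

#include <math.h>
#include <stdio.h>

int main(void)
{
	/* Solve (127/128)^n = 1/2 for n: n = log(1/2) / log(127/128) */
	double n = log(0.5) / log(127.0 / 128.0);

	printf("error halflife ~= %.1f refresh periods\n", n);	/* ~88.4 */
	return 0;
}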
363 | 343 | ||
364 | void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, | 344 | void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, |
365 | int error, const char *m) | 345 | int error, const char *m) |
366 | { | 346 | { |
367 | struct bbio *b = container_of(bio, struct bbio, bio); | 347 | struct bbio *b = container_of(bio, struct bbio, bio); |
368 | struct cache *ca = PTR_CACHE(c, &b->key, 0); | 348 | struct cache *ca = PTR_CACHE(c, &b->key, 0); |
369 | 349 | ||
370 | unsigned threshold = bio->bi_rw & REQ_WRITE | 350 | unsigned threshold = bio->bi_rw & REQ_WRITE |
371 | ? c->congested_write_threshold_us | 351 | ? c->congested_write_threshold_us |
372 | : c->congested_read_threshold_us; | 352 | : c->congested_read_threshold_us; |
373 | 353 | ||
374 | if (threshold) { | 354 | if (threshold) { |
375 | unsigned t = local_clock_us(); | 355 | unsigned t = local_clock_us(); |
376 | 356 | ||
377 | int us = t - b->submit_time_us; | 357 | int us = t - b->submit_time_us; |
378 | int congested = atomic_read(&c->congested); | 358 | int congested = atomic_read(&c->congested); |
379 | 359 | ||
380 | if (us > (int) threshold) { | 360 | if (us > (int) threshold) { |
381 | int ms = us / 1024; | 361 | int ms = us / 1024; |
382 | c->congested_last_us = t; | 362 | c->congested_last_us = t; |
383 | 363 |
drivers/md/bcache/movinggc.c
1 | /* | 1 | /* |
2 | * Moving/copying garbage collector | 2 | * Moving/copying garbage collector |
3 | * | 3 | * |
4 | * Copyright 2012 Google, Inc. | 4 | * Copyright 2012 Google, Inc. |
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include "bcache.h" | 7 | #include "bcache.h" |
8 | #include "btree.h" | 8 | #include "btree.h" |
9 | #include "debug.h" | 9 | #include "debug.h" |
10 | #include "request.h" | 10 | #include "request.h" |
11 | 11 | ||
12 | #include <trace/events/bcache.h> | 12 | #include <trace/events/bcache.h> |
13 | 13 | ||
14 | struct moving_io { | 14 | struct moving_io { |
15 | struct keybuf_key *w; | 15 | struct keybuf_key *w; |
16 | struct search s; | 16 | struct search s; |
17 | struct bbio bio; | 17 | struct bbio bio; |
18 | }; | 18 | }; |
19 | 19 | ||
20 | static bool moving_pred(struct keybuf *buf, struct bkey *k) | 20 | static bool moving_pred(struct keybuf *buf, struct bkey *k) |
21 | { | 21 | { |
22 | struct cache_set *c = container_of(buf, struct cache_set, | 22 | struct cache_set *c = container_of(buf, struct cache_set, |
23 | moving_gc_keys); | 23 | moving_gc_keys); |
24 | unsigned i; | 24 | unsigned i; |
25 | 25 | ||
26 | for (i = 0; i < KEY_PTRS(k); i++) { | 26 | for (i = 0; i < KEY_PTRS(k); i++) { |
27 | struct cache *ca = PTR_CACHE(c, k, i); | 27 | struct cache *ca = PTR_CACHE(c, k, i); |
28 | struct bucket *g = PTR_BUCKET(c, k, i); | 28 | struct bucket *g = PTR_BUCKET(c, k, i); |
29 | 29 | ||
30 | if (GC_SECTORS_USED(g) < ca->gc_move_threshold) | 30 | if (GC_SECTORS_USED(g) < ca->gc_move_threshold) |
31 | return true; | 31 | return true; |
32 | } | 32 | } |
33 | 33 | ||
34 | return false; | 34 | return false; |
35 | } | 35 | } |
36 | 36 | ||
37 | /* Moving GC - IO loop */ | 37 | /* Moving GC - IO loop */ |
38 | 38 | ||
39 | static void moving_io_destructor(struct closure *cl) | 39 | static void moving_io_destructor(struct closure *cl) |
40 | { | 40 | { |
41 | struct moving_io *io = container_of(cl, struct moving_io, s.cl); | 41 | struct moving_io *io = container_of(cl, struct moving_io, s.cl); |
42 | kfree(io); | 42 | kfree(io); |
43 | } | 43 | } |
44 | 44 | ||
45 | static void write_moving_finish(struct closure *cl) | 45 | static void write_moving_finish(struct closure *cl) |
46 | { | 46 | { |
47 | struct moving_io *io = container_of(cl, struct moving_io, s.cl); | 47 | struct moving_io *io = container_of(cl, struct moving_io, s.cl); |
48 | struct bio *bio = &io->bio.bio; | 48 | struct bio *bio = &io->bio.bio; |
49 | struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt); | 49 | struct bio_vec *bv; |
50 | int i; | ||
50 | 51 | ||
51 | while (bv-- != bio->bi_io_vec) | 52 | bio_for_each_segment_all(bv, bio, i) |
52 | __free_page(bv->bv_page); | 53 | __free_page(bv->bv_page); |
53 | 54 | ||
54 | if (io->s.op.insert_collision) | 55 | if (io->s.op.insert_collision) |
55 | trace_bcache_gc_copy_collision(&io->w->key); | 56 | trace_bcache_gc_copy_collision(&io->w->key); |
56 | 57 | ||
57 | bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); | 58 | bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); |
58 | 59 | ||
59 | atomic_dec_bug(&io->s.op.c->in_flight); | 60 | atomic_dec_bug(&io->s.op.c->in_flight); |
60 | closure_wake_up(&io->s.op.c->moving_gc_wait); | 61 | closure_wake_up(&io->s.op.c->moving_gc_wait); |
61 | 62 | ||
62 | closure_return_with_destructor(cl, moving_io_destructor); | 63 | closure_return_with_destructor(cl, moving_io_destructor); |
63 | } | 64 | } |
64 | 65 | ||
65 | static void read_moving_endio(struct bio *bio, int error) | 66 | static void read_moving_endio(struct bio *bio, int error) |
66 | { | 67 | { |
67 | struct moving_io *io = container_of(bio->bi_private, | 68 | struct moving_io *io = container_of(bio->bi_private, |
68 | struct moving_io, s.cl); | 69 | struct moving_io, s.cl); |
69 | 70 | ||
70 | if (error) | 71 | if (error) |
71 | io->s.error = error; | 72 | io->s.error = error; |
72 | 73 | ||
73 | bch_bbio_endio(io->s.op.c, bio, error, "reading data to move"); | 74 | bch_bbio_endio(io->s.op.c, bio, error, "reading data to move"); |
74 | } | 75 | } |
75 | 76 | ||
76 | static void moving_init(struct moving_io *io) | 77 | static void moving_init(struct moving_io *io) |
77 | { | 78 | { |
78 | struct bio *bio = &io->bio.bio; | 79 | struct bio *bio = &io->bio.bio; |
79 | 80 | ||
80 | bio_init(bio); | 81 | bio_init(bio); |
81 | bio_get(bio); | 82 | bio_get(bio); |
82 | bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); | 83 | bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); |
83 | 84 | ||
84 | bio->bi_size = KEY_SIZE(&io->w->key) << 9; | 85 | bio->bi_size = KEY_SIZE(&io->w->key) << 9; |
85 | bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), | 86 | bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), |
86 | PAGE_SECTORS); | 87 | PAGE_SECTORS); |
87 | bio->bi_private = &io->s.cl; | 88 | bio->bi_private = &io->s.cl; |
88 | bio->bi_io_vec = bio->bi_inline_vecs; | 89 | bio->bi_io_vec = bio->bi_inline_vecs; |
89 | bch_bio_map(bio, NULL); | 90 | bch_bio_map(bio, NULL); |
90 | } | 91 | } |
91 | 92 | ||
92 | static void write_moving(struct closure *cl) | 93 | static void write_moving(struct closure *cl) |
93 | { | 94 | { |
94 | struct search *s = container_of(cl, struct search, cl); | 95 | struct search *s = container_of(cl, struct search, cl); |
95 | struct moving_io *io = container_of(s, struct moving_io, s); | 96 | struct moving_io *io = container_of(s, struct moving_io, s); |
96 | 97 | ||
97 | if (!s->error) { | 98 | if (!s->error) { |
98 | moving_init(io); | 99 | moving_init(io); |
99 | 100 | ||
100 | io->bio.bio.bi_sector = KEY_START(&io->w->key); | 101 | io->bio.bio.bi_sector = KEY_START(&io->w->key); |
101 | s->op.lock = -1; | 102 | s->op.lock = -1; |
102 | s->op.write_prio = 1; | 103 | s->op.write_prio = 1; |
103 | s->op.cache_bio = &io->bio.bio; | 104 | s->op.cache_bio = &io->bio.bio; |
104 | 105 | ||
105 | s->writeback = KEY_DIRTY(&io->w->key); | 106 | s->writeback = KEY_DIRTY(&io->w->key); |
106 | s->op.csum = KEY_CSUM(&io->w->key); | 107 | s->op.csum = KEY_CSUM(&io->w->key); |
107 | 108 | ||
108 | s->op.type = BTREE_REPLACE; | 109 | s->op.type = BTREE_REPLACE; |
109 | bkey_copy(&s->op.replace, &io->w->key); | 110 | bkey_copy(&s->op.replace, &io->w->key); |
110 | 111 | ||
111 | closure_init(&s->op.cl, cl); | 112 | closure_init(&s->op.cl, cl); |
112 | bch_insert_data(&s->op.cl); | 113 | bch_insert_data(&s->op.cl); |
113 | } | 114 | } |
114 | 115 | ||
115 | continue_at(cl, write_moving_finish, NULL); | 116 | continue_at(cl, write_moving_finish, NULL); |
116 | } | 117 | } |
117 | 118 | ||
118 | static void read_moving_submit(struct closure *cl) | 119 | static void read_moving_submit(struct closure *cl) |
119 | { | 120 | { |
120 | struct search *s = container_of(cl, struct search, cl); | 121 | struct search *s = container_of(cl, struct search, cl); |
121 | struct moving_io *io = container_of(s, struct moving_io, s); | 122 | struct moving_io *io = container_of(s, struct moving_io, s); |
122 | struct bio *bio = &io->bio.bio; | 123 | struct bio *bio = &io->bio.bio; |
123 | 124 | ||
124 | bch_submit_bbio(bio, s->op.c, &io->w->key, 0); | 125 | bch_submit_bbio(bio, s->op.c, &io->w->key, 0); |
125 | 126 | ||
126 | continue_at(cl, write_moving, bch_gc_wq); | 127 | continue_at(cl, write_moving, bch_gc_wq); |
127 | } | 128 | } |
128 | 129 | ||
129 | static void read_moving(struct closure *cl) | 130 | static void read_moving(struct closure *cl) |
130 | { | 131 | { |
131 | struct cache_set *c = container_of(cl, struct cache_set, moving_gc); | 132 | struct cache_set *c = container_of(cl, struct cache_set, moving_gc); |
132 | struct keybuf_key *w; | 133 | struct keybuf_key *w; |
133 | struct moving_io *io; | 134 | struct moving_io *io; |
134 | struct bio *bio; | 135 | struct bio *bio; |
135 | 136 | ||
136 | /* XXX: if we error, background writeback could stall indefinitely */ | 137 | /* XXX: if we error, background writeback could stall indefinitely */ |
137 | 138 | ||
138 | while (!test_bit(CACHE_SET_STOPPING, &c->flags)) { | 139 | while (!test_bit(CACHE_SET_STOPPING, &c->flags)) { |
139 | w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, | 140 | w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, |
140 | &MAX_KEY, moving_pred); | 141 | &MAX_KEY, moving_pred); |
141 | if (!w) | 142 | if (!w) |
142 | break; | 143 | break; |
143 | 144 | ||
144 | io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) | 145 | io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) |
145 | * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), | 146 | * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), |
146 | GFP_KERNEL); | 147 | GFP_KERNEL); |
147 | if (!io) | 148 | if (!io) |
148 | goto err; | 149 | goto err; |
149 | 150 | ||
150 | w->private = io; | 151 | w->private = io; |
151 | io->w = w; | 152 | io->w = w; |
152 | io->s.op.inode = KEY_INODE(&w->key); | 153 | io->s.op.inode = KEY_INODE(&w->key); |
153 | io->s.op.c = c; | 154 | io->s.op.c = c; |
154 | 155 | ||
155 | moving_init(io); | 156 | moving_init(io); |
156 | bio = &io->bio.bio; | 157 | bio = &io->bio.bio; |
157 | 158 | ||
158 | bio->bi_rw = READ; | 159 | bio->bi_rw = READ; |
159 | bio->bi_end_io = read_moving_endio; | 160 | bio->bi_end_io = read_moving_endio; |
160 | 161 | ||
161 | if (bch_bio_alloc_pages(bio, GFP_KERNEL)) | 162 | if (bio_alloc_pages(bio, GFP_KERNEL)) |
162 | goto err; | 163 | goto err; |
163 | 164 | ||
164 | trace_bcache_gc_copy(&w->key); | 165 | trace_bcache_gc_copy(&w->key); |
165 | 166 | ||
166 | closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); | 167 | closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); |
167 | 168 | ||
168 | if (atomic_inc_return(&c->in_flight) >= 64) { | 169 | if (atomic_inc_return(&c->in_flight) >= 64) { |
169 | closure_wait_event(&c->moving_gc_wait, cl, | 170 | closure_wait_event(&c->moving_gc_wait, cl, |
170 | atomic_read(&c->in_flight) < 64); | 171 | atomic_read(&c->in_flight) < 64); |
171 | continue_at(cl, read_moving, bch_gc_wq); | 172 | continue_at(cl, read_moving, bch_gc_wq); |
172 | } | 173 | } |
173 | } | 174 | } |
174 | 175 | ||
175 | if (0) { | 176 | if (0) { |
176 | err: if (!IS_ERR_OR_NULL(w->private)) | 177 | err: if (!IS_ERR_OR_NULL(w->private)) |
177 | kfree(w->private); | 178 | kfree(w->private); |
178 | 179 | ||
179 | bch_keybuf_del(&c->moving_gc_keys, w); | 180 | bch_keybuf_del(&c->moving_gc_keys, w); |
180 | } | 181 | } |
181 | 182 | ||
182 | closure_return(cl); | 183 | closure_return(cl); |
183 | } | 184 | } |
184 | 185 | ||
185 | static bool bucket_cmp(struct bucket *l, struct bucket *r) | 186 | static bool bucket_cmp(struct bucket *l, struct bucket *r) |
186 | { | 187 | { |
187 | return GC_SECTORS_USED(l) < GC_SECTORS_USED(r); | 188 | return GC_SECTORS_USED(l) < GC_SECTORS_USED(r); |
188 | } | 189 | } |
189 | 190 | ||
190 | static unsigned bucket_heap_top(struct cache *ca) | 191 | static unsigned bucket_heap_top(struct cache *ca) |
191 | { | 192 | { |
192 | return GC_SECTORS_USED(heap_peek(&ca->heap)); | 193 | return GC_SECTORS_USED(heap_peek(&ca->heap)); |
193 | } | 194 | } |
194 | 195 | ||
195 | void bch_moving_gc(struct closure *cl) | 196 | void bch_moving_gc(struct closure *cl) |
196 | { | 197 | { |
197 | struct cache_set *c = container_of(cl, struct cache_set, gc.cl); | 198 | struct cache_set *c = container_of(cl, struct cache_set, gc.cl); |
198 | struct cache *ca; | 199 | struct cache *ca; |
199 | struct bucket *b; | 200 | struct bucket *b; |
200 | unsigned i; | 201 | unsigned i; |
201 | 202 | ||
202 | if (!c->copy_gc_enabled) | 203 | if (!c->copy_gc_enabled) |
203 | closure_return(cl); | 204 | closure_return(cl); |
204 | 205 | ||
205 | mutex_lock(&c->bucket_lock); | 206 | mutex_lock(&c->bucket_lock); |
206 | 207 | ||
207 | for_each_cache(ca, c, i) { | 208 | for_each_cache(ca, c, i) { |
208 | unsigned sectors_to_move = 0; | 209 | unsigned sectors_to_move = 0; |
209 | unsigned reserve_sectors = ca->sb.bucket_size * | 210 | unsigned reserve_sectors = ca->sb.bucket_size * |
210 | min(fifo_used(&ca->free), ca->free.size / 2); | 211 | min(fifo_used(&ca->free), ca->free.size / 2); |
211 | 212 | ||
212 | ca->heap.used = 0; | 213 | ca->heap.used = 0; |
213 | 214 | ||
214 | for_each_bucket(b, ca) { | 215 | for_each_bucket(b, ca) { |
215 | if (!GC_SECTORS_USED(b)) | 216 | if (!GC_SECTORS_USED(b)) |
216 | continue; | 217 | continue; |
217 | 218 | ||
218 | if (!heap_full(&ca->heap)) { | 219 | if (!heap_full(&ca->heap)) { |
219 | sectors_to_move += GC_SECTORS_USED(b); | 220 | sectors_to_move += GC_SECTORS_USED(b); |
220 | heap_add(&ca->heap, b, bucket_cmp); | 221 | heap_add(&ca->heap, b, bucket_cmp); |
221 | } else if (bucket_cmp(b, heap_peek(&ca->heap))) { | 222 | } else if (bucket_cmp(b, heap_peek(&ca->heap))) { |
222 | sectors_to_move -= bucket_heap_top(ca); | 223 | sectors_to_move -= bucket_heap_top(ca); |
223 | sectors_to_move += GC_SECTORS_USED(b); | 224 | sectors_to_move += GC_SECTORS_USED(b); |
224 | 225 | ||
225 | ca->heap.data[0] = b; | 226 | ca->heap.data[0] = b; |
226 | heap_sift(&ca->heap, 0, bucket_cmp); | 227 | heap_sift(&ca->heap, 0, bucket_cmp); |
227 | } | 228 | } |
228 | } | 229 | } |
229 | 230 | ||
230 | while (sectors_to_move > reserve_sectors) { | 231 | while (sectors_to_move > reserve_sectors) { |
231 | heap_pop(&ca->heap, b, bucket_cmp); | 232 | heap_pop(&ca->heap, b, bucket_cmp); |
232 | sectors_to_move -= GC_SECTORS_USED(b); | 233 | sectors_to_move -= GC_SECTORS_USED(b); |
233 | } | 234 | } |
234 | 235 | ||
235 | ca->gc_move_threshold = bucket_heap_top(ca); | 236 | ca->gc_move_threshold = bucket_heap_top(ca); |
236 | 237 | ||
237 | pr_debug("threshold %u", ca->gc_move_threshold); | 238 | pr_debug("threshold %u", ca->gc_move_threshold); |
238 | } | 239 | } |
239 | 240 | ||
240 | mutex_unlock(&c->bucket_lock); | 241 | mutex_unlock(&c->bucket_lock); |
241 | 242 | ||
242 | c->moving_gc_keys.last_scanned = ZERO_KEY; | 243 | c->moving_gc_keys.last_scanned = ZERO_KEY; |
243 | 244 | ||
244 | closure_init(&c->moving_gc, cl); | 245 | closure_init(&c->moving_gc, cl); |
245 | read_moving(&c->moving_gc); | 246 | read_moving(&c->moving_gc); |
246 | 247 | ||
247 | closure_return(cl); | 248 | closure_return(cl); |
248 | } | 249 | } |
249 | 250 | ||
250 | void bch_moving_init_cache_set(struct cache_set *c) | 251 | void bch_moving_init_cache_set(struct cache_set *c) |
251 | { | 252 | { |
252 | bch_keybuf_init(&c->moving_gc_keys); | 253 | bch_keybuf_init(&c->moving_gc_keys); |
253 | } | 254 | } |
254 | 255 |
drivers/md/bcache/request.c
1 | /* | 1 | /* |
2 | * Main bcache entry point - handle a read or a write request and decide what to | 2 | * Main bcache entry point - handle a read or a write request and decide what to |
3 | * do with it; the make_request functions are called by the block layer. | 3 | * do with it; the make_request functions are called by the block layer. |
4 | * | 4 | * |
5 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | 5 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> |
6 | * Copyright 2012 Google, Inc. | 6 | * Copyright 2012 Google, Inc. |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include "bcache.h" | 9 | #include "bcache.h" |
10 | #include "btree.h" | 10 | #include "btree.h" |
11 | #include "debug.h" | 11 | #include "debug.h" |
12 | #include "request.h" | 12 | #include "request.h" |
13 | #include "writeback.h" | 13 | #include "writeback.h" |
14 | 14 | ||
15 | #include <linux/cgroup.h> | 15 | #include <linux/cgroup.h> |
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/hash.h> | 17 | #include <linux/hash.h> |
18 | #include <linux/random.h> | 18 | #include <linux/random.h> |
19 | #include "blk-cgroup.h" | 19 | #include "blk-cgroup.h" |
20 | 20 | ||
21 | #include <trace/events/bcache.h> | 21 | #include <trace/events/bcache.h> |
22 | 22 | ||
23 | #define CUTOFF_CACHE_ADD 95 | 23 | #define CUTOFF_CACHE_ADD 95 |
24 | #define CUTOFF_CACHE_READA 90 | 24 | #define CUTOFF_CACHE_READA 90 |
25 | 25 | ||
26 | struct kmem_cache *bch_search_cache; | 26 | struct kmem_cache *bch_search_cache; |
27 | 27 | ||
28 | static void check_should_skip(struct cached_dev *, struct search *); | 28 | static void check_should_skip(struct cached_dev *, struct search *); |
29 | 29 | ||
30 | /* Cgroup interface */ | 30 | /* Cgroup interface */ |
31 | 31 | ||
32 | #ifdef CONFIG_CGROUP_BCACHE | 32 | #ifdef CONFIG_CGROUP_BCACHE |
33 | static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 }; | 33 | static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 }; |
34 | 34 | ||
35 | static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup) | 35 | static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup) |
36 | { | 36 | { |
37 | struct cgroup_subsys_state *css; | 37 | struct cgroup_subsys_state *css; |
38 | return cgroup && | 38 | return cgroup && |
39 | (css = cgroup_subsys_state(cgroup, bcache_subsys_id)) | 39 | (css = cgroup_subsys_state(cgroup, bcache_subsys_id)) |
40 | ? container_of(css, struct bch_cgroup, css) | 40 | ? container_of(css, struct bch_cgroup, css) |
41 | : &bcache_default_cgroup; | 41 | : &bcache_default_cgroup; |
42 | } | 42 | } |
43 | 43 | ||
44 | struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio) | 44 | struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio) |
45 | { | 45 | { |
46 | struct cgroup_subsys_state *css = bio->bi_css | 46 | struct cgroup_subsys_state *css = bio->bi_css |
47 | ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id) | 47 | ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id) |
48 | : task_subsys_state(current, bcache_subsys_id); | 48 | : task_subsys_state(current, bcache_subsys_id); |
49 | 49 | ||
50 | return css | 50 | return css |
51 | ? container_of(css, struct bch_cgroup, css) | 51 | ? container_of(css, struct bch_cgroup, css) |
52 | : &bcache_default_cgroup; | 52 | : &bcache_default_cgroup; |
53 | } | 53 | } |
54 | 54 | ||
55 | static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft, | 55 | static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft, |
56 | struct file *file, | 56 | struct file *file, |
57 | char __user *buf, size_t nbytes, loff_t *ppos) | 57 | char __user *buf, size_t nbytes, loff_t *ppos) |
58 | { | 58 | { |
59 | char tmp[1024]; | 59 | char tmp[1024]; |
60 | int len = bch_snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes, | 60 | int len = bch_snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes, |
61 | cgroup_to_bcache(cgrp)->cache_mode + 1); | 61 | cgroup_to_bcache(cgrp)->cache_mode + 1); |
62 | 62 | ||
63 | if (len < 0) | 63 | if (len < 0) |
64 | return len; | 64 | return len; |
65 | 65 | ||
66 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | 66 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); |
67 | } | 67 | } |
68 | 68 | ||
69 | static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft, | 69 | static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft, |
70 | const char *buf) | 70 | const char *buf) |
71 | { | 71 | { |
72 | int v = bch_read_string_list(buf, bch_cache_modes); | 72 | int v = bch_read_string_list(buf, bch_cache_modes); |
73 | if (v < 0) | 73 | if (v < 0) |
74 | return v; | 74 | return v; |
75 | 75 | ||
76 | cgroup_to_bcache(cgrp)->cache_mode = v - 1; | 76 | cgroup_to_bcache(cgrp)->cache_mode = v - 1; |
77 | return 0; | 77 | return 0; |
78 | } | 78 | } |
79 | 79 | ||
80 | static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft) | 80 | static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft) |
81 | { | 81 | { |
82 | return cgroup_to_bcache(cgrp)->verify; | 82 | return cgroup_to_bcache(cgrp)->verify; |
83 | } | 83 | } |
84 | 84 | ||
85 | static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val) | 85 | static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val) |
86 | { | 86 | { |
87 | cgroup_to_bcache(cgrp)->verify = val; | 87 | cgroup_to_bcache(cgrp)->verify = val; |
88 | return 0; | 88 | return 0; |
89 | } | 89 | } |
90 | 90 | ||
91 | static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft) | 91 | static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft) |
92 | { | 92 | { |
93 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | 93 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); |
94 | return atomic_read(&bcachecg->stats.cache_hits); | 94 | return atomic_read(&bcachecg->stats.cache_hits); |
95 | } | 95 | } |
96 | 96 | ||
97 | static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft) | 97 | static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft) |
98 | { | 98 | { |
99 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | 99 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); |
100 | return atomic_read(&bcachecg->stats.cache_misses); | 100 | return atomic_read(&bcachecg->stats.cache_misses); |
101 | } | 101 | } |
102 | 102 | ||
103 | static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp, | 103 | static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp, |
104 | struct cftype *cft) | 104 | struct cftype *cft) |
105 | { | 105 | { |
106 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | 106 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); |
107 | return atomic_read(&bcachecg->stats.cache_bypass_hits); | 107 | return atomic_read(&bcachecg->stats.cache_bypass_hits); |
108 | } | 108 | } |
109 | 109 | ||
110 | static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp, | 110 | static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp, |
111 | struct cftype *cft) | 111 | struct cftype *cft) |
112 | { | 112 | { |
113 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | 113 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); |
114 | return atomic_read(&bcachecg->stats.cache_bypass_misses); | 114 | return atomic_read(&bcachecg->stats.cache_bypass_misses); |
115 | } | 115 | } |
116 | 116 | ||
117 | static struct cftype bch_files[] = { | 117 | static struct cftype bch_files[] = { |
118 | { | 118 | { |
119 | .name = "cache_mode", | 119 | .name = "cache_mode", |
120 | .read = cache_mode_read, | 120 | .read = cache_mode_read, |
121 | .write_string = cache_mode_write, | 121 | .write_string = cache_mode_write, |
122 | }, | 122 | }, |
123 | { | 123 | { |
124 | .name = "verify", | 124 | .name = "verify", |
125 | .read_u64 = bch_verify_read, | 125 | .read_u64 = bch_verify_read, |
126 | .write_u64 = bch_verify_write, | 126 | .write_u64 = bch_verify_write, |
127 | }, | 127 | }, |
128 | { | 128 | { |
129 | .name = "cache_hits", | 129 | .name = "cache_hits", |
130 | .read_u64 = bch_cache_hits_read, | 130 | .read_u64 = bch_cache_hits_read, |
131 | }, | 131 | }, |
132 | { | 132 | { |
133 | .name = "cache_misses", | 133 | .name = "cache_misses", |
134 | .read_u64 = bch_cache_misses_read, | 134 | .read_u64 = bch_cache_misses_read, |
135 | }, | 135 | }, |
136 | { | 136 | { |
137 | .name = "cache_bypass_hits", | 137 | .name = "cache_bypass_hits", |
138 | .read_u64 = bch_cache_bypass_hits_read, | 138 | .read_u64 = bch_cache_bypass_hits_read, |
139 | }, | 139 | }, |
140 | { | 140 | { |
141 | .name = "cache_bypass_misses", | 141 | .name = "cache_bypass_misses", |
142 | .read_u64 = bch_cache_bypass_misses_read, | 142 | .read_u64 = bch_cache_bypass_misses_read, |
143 | }, | 143 | }, |
144 | { } /* terminate */ | 144 | { } /* terminate */ |
145 | }; | 145 | }; |
146 | 146 | ||
147 | static void init_bch_cgroup(struct bch_cgroup *cg) | 147 | static void init_bch_cgroup(struct bch_cgroup *cg) |
148 | { | 148 | { |
149 | cg->cache_mode = -1; | 149 | cg->cache_mode = -1; |
150 | } | 150 | } |
151 | 151 | ||
152 | static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup) | 152 | static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup) |
153 | { | 153 | { |
154 | struct bch_cgroup *cg; | 154 | struct bch_cgroup *cg; |
155 | 155 | ||
156 | cg = kzalloc(sizeof(*cg), GFP_KERNEL); | 156 | cg = kzalloc(sizeof(*cg), GFP_KERNEL); |
157 | if (!cg) | 157 | if (!cg) |
158 | return ERR_PTR(-ENOMEM); | 158 | return ERR_PTR(-ENOMEM); |
159 | init_bch_cgroup(cg); | 159 | init_bch_cgroup(cg); |
160 | return &cg->css; | 160 | return &cg->css; |
161 | } | 161 | } |
162 | 162 | ||
163 | static void bcachecg_destroy(struct cgroup *cgroup) | 163 | static void bcachecg_destroy(struct cgroup *cgroup) |
164 | { | 164 | { |
165 | struct bch_cgroup *cg = cgroup_to_bcache(cgroup); | 165 | struct bch_cgroup *cg = cgroup_to_bcache(cgroup); |
166 | free_css_id(&bcache_subsys, &cg->css); | 166 | free_css_id(&bcache_subsys, &cg->css); |
167 | kfree(cg); | 167 | kfree(cg); |
168 | } | 168 | } |
169 | 169 | ||
170 | struct cgroup_subsys bcache_subsys = { | 170 | struct cgroup_subsys bcache_subsys = { |
171 | .create = bcachecg_create, | 171 | .create = bcachecg_create, |
172 | .destroy = bcachecg_destroy, | 172 | .destroy = bcachecg_destroy, |
173 | .subsys_id = bcache_subsys_id, | 173 | .subsys_id = bcache_subsys_id, |
174 | .name = "bcache", | 174 | .name = "bcache", |
175 | .module = THIS_MODULE, | 175 | .module = THIS_MODULE, |
176 | }; | 176 | }; |
177 | EXPORT_SYMBOL_GPL(bcache_subsys); | 177 | EXPORT_SYMBOL_GPL(bcache_subsys); |
178 | #endif | 178 | #endif |
179 | 179 | ||
180 | static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) | 180 | static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) |
181 | { | 181 | { |
182 | #ifdef CONFIG_CGROUP_BCACHE | 182 | #ifdef CONFIG_CGROUP_BCACHE |
183 | int r = bch_bio_to_cgroup(bio)->cache_mode; | 183 | int r = bch_bio_to_cgroup(bio)->cache_mode; |
184 | if (r >= 0) | 184 | if (r >= 0) |
185 | return r; | 185 | return r; |
186 | #endif | 186 | #endif |
187 | return BDEV_CACHE_MODE(&dc->sb); | 187 | return BDEV_CACHE_MODE(&dc->sb); |
188 | } | 188 | } |
189 | 189 | ||
190 | static bool verify(struct cached_dev *dc, struct bio *bio) | 190 | static bool verify(struct cached_dev *dc, struct bio *bio) |
191 | { | 191 | { |
192 | #ifdef CONFIG_CGROUP_BCACHE | 192 | #ifdef CONFIG_CGROUP_BCACHE |
193 | if (bch_bio_to_cgroup(bio)->verify) | 193 | if (bch_bio_to_cgroup(bio)->verify) |
194 | return true; | 194 | return true; |
195 | #endif | 195 | #endif |
196 | return dc->verify; | 196 | return dc->verify; |
197 | } | 197 | } |
198 | 198 | ||
199 | static void bio_csum(struct bio *bio, struct bkey *k) | 199 | static void bio_csum(struct bio *bio, struct bkey *k) |
200 | { | 200 | { |
201 | struct bio_vec *bv; | 201 | struct bio_vec *bv; |
202 | uint64_t csum = 0; | 202 | uint64_t csum = 0; |
203 | int i; | 203 | int i; |
204 | 204 | ||
205 | bio_for_each_segment(bv, bio, i) { | 205 | bio_for_each_segment(bv, bio, i) { |
206 | void *d = kmap(bv->bv_page) + bv->bv_offset; | 206 | void *d = kmap(bv->bv_page) + bv->bv_offset; |
207 | csum = bch_crc64_update(csum, d, bv->bv_len); | 207 | csum = bch_crc64_update(csum, d, bv->bv_len); |
208 | kunmap(bv->bv_page); | 208 | kunmap(bv->bv_page); |
209 | } | 209 | } |
210 | 210 | ||
211 | k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1); | 211 | k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1); |
212 | } | 212 | } |
213 | 213 | ||
214 | /* Insert data into cache */ | 214 | /* Insert data into cache */ |
215 | 215 | ||
216 | static void bio_invalidate(struct closure *cl) | 216 | static void bio_invalidate(struct closure *cl) |
217 | { | 217 | { |
218 | struct btree_op *op = container_of(cl, struct btree_op, cl); | 218 | struct btree_op *op = container_of(cl, struct btree_op, cl); |
219 | struct bio *bio = op->cache_bio; | 219 | struct bio *bio = op->cache_bio; |
220 | 220 | ||
221 | pr_debug("invalidating %i sectors from %llu", | 221 | pr_debug("invalidating %i sectors from %llu", |
222 | bio_sectors(bio), (uint64_t) bio->bi_sector); | 222 | bio_sectors(bio), (uint64_t) bio->bi_sector); |
223 | 223 | ||
224 | while (bio_sectors(bio)) { | 224 | while (bio_sectors(bio)) { |
225 | unsigned len = min(bio_sectors(bio), 1U << 14); | 225 | unsigned len = min(bio_sectors(bio), 1U << 14); |
226 | 226 | ||
227 | if (bch_keylist_realloc(&op->keys, 0, op->c)) | 227 | if (bch_keylist_realloc(&op->keys, 0, op->c)) |
228 | goto out; | 228 | goto out; |
229 | 229 | ||
230 | bio->bi_sector += len; | 230 | bio->bi_sector += len; |
231 | bio->bi_size -= len << 9; | 231 | bio->bi_size -= len << 9; |
232 | 232 | ||
233 | bch_keylist_add(&op->keys, | 233 | bch_keylist_add(&op->keys, |
234 | &KEY(op->inode, bio->bi_sector, len)); | 234 | &KEY(op->inode, bio->bi_sector, len)); |
235 | } | 235 | } |
236 | 236 | ||
237 | op->insert_data_done = true; | 237 | op->insert_data_done = true; |
238 | bio_put(bio); | 238 | bio_put(bio); |
239 | out: | 239 | out: |
240 | continue_at(cl, bch_journal, bcache_wq); | 240 | continue_at(cl, bch_journal, bcache_wq); |
241 | } | 241 | } |
242 | 242 | ||
243 | struct open_bucket { | 243 | struct open_bucket { |
244 | struct list_head list; | 244 | struct list_head list; |
245 | struct task_struct *last; | 245 | struct task_struct *last; |
246 | unsigned sectors_free; | 246 | unsigned sectors_free; |
247 | BKEY_PADDED(key); | 247 | BKEY_PADDED(key); |
248 | }; | 248 | }; |
249 | 249 | ||
250 | void bch_open_buckets_free(struct cache_set *c) | 250 | void bch_open_buckets_free(struct cache_set *c) |
251 | { | 251 | { |
252 | struct open_bucket *b; | 252 | struct open_bucket *b; |
253 | 253 | ||
254 | while (!list_empty(&c->data_buckets)) { | 254 | while (!list_empty(&c->data_buckets)) { |
255 | b = list_first_entry(&c->data_buckets, | 255 | b = list_first_entry(&c->data_buckets, |
256 | struct open_bucket, list); | 256 | struct open_bucket, list); |
257 | list_del(&b->list); | 257 | list_del(&b->list); |
258 | kfree(b); | 258 | kfree(b); |
259 | } | 259 | } |
260 | } | 260 | } |
261 | 261 | ||
262 | int bch_open_buckets_alloc(struct cache_set *c) | 262 | int bch_open_buckets_alloc(struct cache_set *c) |
263 | { | 263 | { |
264 | int i; | 264 | int i; |
265 | 265 | ||
266 | spin_lock_init(&c->data_bucket_lock); | 266 | spin_lock_init(&c->data_bucket_lock); |
267 | 267 | ||
268 | for (i = 0; i < 6; i++) { | 268 | for (i = 0; i < 6; i++) { |
269 | struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); | 269 | struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); |
270 | if (!b) | 270 | if (!b) |
271 | return -ENOMEM; | 271 | return -ENOMEM; |
272 | 272 | ||
273 | list_add(&b->list, &c->data_buckets); | 273 | list_add(&b->list, &c->data_buckets); |
274 | } | 274 | } |
275 | 275 | ||
276 | return 0; | 276 | return 0; |
277 | } | 277 | } |
278 | 278 | ||
279 | /* | 279 | /* |
280 | * We keep multiple buckets open for writes, and try to segregate different | 280 | * We keep multiple buckets open for writes, and try to segregate different |
281 | * write streams for better cache utilization: first we look for a bucket where | 281 | * write streams for better cache utilization: first we look for a bucket where |
282 | * the last write to it was sequential with the current write, and failing that | 282 | * the last write to it was sequential with the current write, and failing that |
283 | * we look for a bucket that was last used by the same task. | 283 | * we look for a bucket that was last used by the same task. |
284 | * | 284 | * |
285 | * The idea is that if you've got multiple tasks pulling data into the cache at the | 285 | * The idea is that if you've got multiple tasks pulling data into the cache at the |
286 | * same time, you'll get better cache utilization if you try to segregate their | 286 | * same time, you'll get better cache utilization if you try to segregate their |
287 | * data and preserve locality. | 287 | * data and preserve locality. |
288 | * | 288 | * |
289 | * For example, say you're starting Firefox at the same time you're copying a | 289 | * For example, say you're starting Firefox at the same time you're copying a |
290 | * bunch of files. Firefox will likely end up being fairly hot and stay in the | 290 | * bunch of files. Firefox will likely end up being fairly hot and stay in the |
291 | * cache awhile, but the data you copied might not be; if you wrote all that | 291 | * cache awhile, but the data you copied might not be; if you wrote all that |
292 | * data to the same buckets it'd get invalidated at the same time. | 292 | * data to the same buckets it'd get invalidated at the same time. |
293 | * | 293 | * |
294 | * Both of those tasks will be doing fairly random IO so we can't rely on | 294 | * Both of those tasks will be doing fairly random IO so we can't rely on |
295 | * detecting sequential IO to segregate their data, but going off of the task | 295 | * detecting sequential IO to segregate their data, but going off of the task |
296 | * should be a sane heuristic. | 296 | * should be a sane heuristic. |
297 | */ | 297 | */ |
298 | static struct open_bucket *pick_data_bucket(struct cache_set *c, | 298 | static struct open_bucket *pick_data_bucket(struct cache_set *c, |
299 | const struct bkey *search, | 299 | const struct bkey *search, |
300 | struct task_struct *task, | 300 | struct task_struct *task, |
301 | struct bkey *alloc) | 301 | struct bkey *alloc) |
302 | { | 302 | { |
303 | struct open_bucket *ret, *ret_task = NULL; | 303 | struct open_bucket *ret, *ret_task = NULL; |
304 | 304 | ||
305 | list_for_each_entry_reverse(ret, &c->data_buckets, list) | 305 | list_for_each_entry_reverse(ret, &c->data_buckets, list) |
306 | if (!bkey_cmp(&ret->key, search)) | 306 | if (!bkey_cmp(&ret->key, search)) |
307 | goto found; | 307 | goto found; |
308 | else if (ret->last == task) | 308 | else if (ret->last == task) |
309 | ret_task = ret; | 309 | ret_task = ret; |
310 | 310 | ||
311 | ret = ret_task ?: list_first_entry(&c->data_buckets, | 311 | ret = ret_task ?: list_first_entry(&c->data_buckets, |
312 | struct open_bucket, list); | 312 | struct open_bucket, list); |
313 | found: | 313 | found: |
314 | if (!ret->sectors_free && KEY_PTRS(alloc)) { | 314 | if (!ret->sectors_free && KEY_PTRS(alloc)) { |
315 | ret->sectors_free = c->sb.bucket_size; | 315 | ret->sectors_free = c->sb.bucket_size; |
316 | bkey_copy(&ret->key, alloc); | 316 | bkey_copy(&ret->key, alloc); |
317 | bkey_init(alloc); | 317 | bkey_init(alloc); |
318 | } | 318 | } |
319 | 319 | ||
320 | if (!ret->sectors_free) | 320 | if (!ret->sectors_free) |
321 | ret = NULL; | 321 | ret = NULL; |
322 | 322 | ||
323 | return ret; | 323 | return ret; |
324 | } | 324 | } |
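A simplified userspace sketch of the selection order implemented above, under the assumption that open buckets are kept newest-last so index 0 stands in for the list head the kernel code falls back to; the struct and function names here are illustrative only. Priority: a bucket whose key matches the search (sequential with the previous write), then a bucket last written by the same task, then the fallback entry.

struct demo_bucket {
	int	key;	/* stands in for !bkey_cmp(&ret->key, search) */
	int	task;	/* stands in for ret->last == task */
};

/* Walk newest to oldest, mirroring list_for_each_entry_reverse(). */
static int pick_demo_bucket(const struct demo_bucket *b, int n, int key, int task)
{
	int i, task_match = -1;

	for (i = n - 1; i >= 0; i--) {
		if (b[i].key == key)
			return i;	/* sequential with the last write: best choice */
		if (b[i].task == task)
			task_match = i;	/* remember a bucket this task wrote to */
	}

	return task_match >= 0 ? task_match : 0;	/* else fall back to the head entry */
}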
325 | 325 | ||
326 | /* | 326 | /* |
327 | * Allocates some space in the cache to write to, and k to point to the newly | 327 | * Allocates some space in the cache to write to, and k to point to the newly |
328 | * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the | 328 | * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the |
329 | * end of the newly allocated space). | 329 | * end of the newly allocated space). |
330 | * | 330 | * |
331 | * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many | 331 | * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many |
332 | * sectors were actually allocated. | 332 | * sectors were actually allocated. |
333 | * | 333 | * |
334 | * If s->writeback is true, will not fail. | 334 | * If s->writeback is true, will not fail. |
335 | */ | 335 | */ |
336 | static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, | 336 | static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, |
337 | struct search *s) | 337 | struct search *s) |
338 | { | 338 | { |
339 | struct cache_set *c = s->op.c; | 339 | struct cache_set *c = s->op.c; |
340 | struct open_bucket *b; | 340 | struct open_bucket *b; |
341 | BKEY_PADDED(key) alloc; | 341 | BKEY_PADDED(key) alloc; |
342 | struct closure cl, *w = NULL; | 342 | struct closure cl, *w = NULL; |
343 | unsigned i; | 343 | unsigned i; |
344 | 344 | ||
345 | if (s->writeback) { | 345 | if (s->writeback) { |
346 | closure_init_stack(&cl); | 346 | closure_init_stack(&cl); |
347 | w = &cl; | 347 | w = &cl; |
348 | } | 348 | } |
349 | 349 | ||
350 | /* | 350 | /* |
351 | * We might have to allocate a new bucket, which we can't do with a | 351 | * We might have to allocate a new bucket, which we can't do with a |
352 | * spinlock held. So if we have to allocate, we drop the lock, allocate | 352 | * spinlock held. So if we have to allocate, we drop the lock, allocate |
353 | * and then retry. KEY_PTRS() indicates whether alloc points to | 353 | * and then retry. KEY_PTRS() indicates whether alloc points to |
354 | * allocated bucket(s). | 354 | * allocated bucket(s). |
355 | */ | 355 | */ |
356 | 356 | ||
357 | bkey_init(&alloc.key); | 357 | bkey_init(&alloc.key); |
358 | spin_lock(&c->data_bucket_lock); | 358 | spin_lock(&c->data_bucket_lock); |
359 | 359 | ||
360 | while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) { | 360 | while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) { |
361 | unsigned watermark = s->op.write_prio | 361 | unsigned watermark = s->op.write_prio |
362 | ? WATERMARK_MOVINGGC | 362 | ? WATERMARK_MOVINGGC |
363 | : WATERMARK_NONE; | 363 | : WATERMARK_NONE; |
364 | 364 | ||
365 | spin_unlock(&c->data_bucket_lock); | 365 | spin_unlock(&c->data_bucket_lock); |
366 | 366 | ||
367 | if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w)) | 367 | if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w)) |
368 | return false; | 368 | return false; |
369 | 369 | ||
370 | spin_lock(&c->data_bucket_lock); | 370 | spin_lock(&c->data_bucket_lock); |
371 | } | 371 | } |
372 | 372 | ||
373 | /* | 373 | /* |
374 | * If we had to allocate, we might race and not need to allocate the | 374 | * If we had to allocate, we might race and not need to allocate the |
375 | * second time we call pick_data_bucket(). If we allocated a bucket but | 375 | * second time we call pick_data_bucket(). If we allocated a bucket but |
376 | * didn't use it, drop the refcount bch_bucket_alloc_set() took: | 376 | * didn't use it, drop the refcount bch_bucket_alloc_set() took: |
377 | */ | 377 | */ |
378 | if (KEY_PTRS(&alloc.key)) | 378 | if (KEY_PTRS(&alloc.key)) |
379 | __bkey_put(c, &alloc.key); | 379 | __bkey_put(c, &alloc.key); |
380 | 380 | ||
381 | for (i = 0; i < KEY_PTRS(&b->key); i++) | 381 | for (i = 0; i < KEY_PTRS(&b->key); i++) |
382 | EBUG_ON(ptr_stale(c, &b->key, i)); | 382 | EBUG_ON(ptr_stale(c, &b->key, i)); |
383 | 383 | ||
384 | /* Set up the pointer to the space we're allocating: */ | 384 | /* Set up the pointer to the space we're allocating: */ |
385 | 385 | ||
386 | for (i = 0; i < KEY_PTRS(&b->key); i++) | 386 | for (i = 0; i < KEY_PTRS(&b->key); i++) |
387 | k->ptr[i] = b->key.ptr[i]; | 387 | k->ptr[i] = b->key.ptr[i]; |
388 | 388 | ||
389 | sectors = min(sectors, b->sectors_free); | 389 | sectors = min(sectors, b->sectors_free); |
390 | 390 | ||
391 | SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); | 391 | SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); |
392 | SET_KEY_SIZE(k, sectors); | 392 | SET_KEY_SIZE(k, sectors); |
393 | SET_KEY_PTRS(k, KEY_PTRS(&b->key)); | 393 | SET_KEY_PTRS(k, KEY_PTRS(&b->key)); |
394 | 394 | ||
395 | /* | 395 | /* |
396 | * Move b to the end of the lru, and keep track of what this bucket was | 396 | * Move b to the end of the lru, and keep track of what this bucket was |
397 | * last used for: | 397 | * last used for: |
398 | */ | 398 | */ |
399 | list_move_tail(&b->list, &c->data_buckets); | 399 | list_move_tail(&b->list, &c->data_buckets); |
400 | bkey_copy_key(&b->key, k); | 400 | bkey_copy_key(&b->key, k); |
401 | b->last = s->task; | 401 | b->last = s->task; |
402 | 402 | ||
403 | b->sectors_free -= sectors; | 403 | b->sectors_free -= sectors; |
404 | 404 | ||
405 | for (i = 0; i < KEY_PTRS(&b->key); i++) { | 405 | for (i = 0; i < KEY_PTRS(&b->key); i++) { |
406 | SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); | 406 | SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); |
407 | 407 | ||
408 | atomic_long_add(sectors, | 408 | atomic_long_add(sectors, |
409 | &PTR_CACHE(c, &b->key, i)->sectors_written); | 409 | &PTR_CACHE(c, &b->key, i)->sectors_written); |
410 | } | 410 | } |
411 | 411 | ||
412 | if (b->sectors_free < c->sb.block_size) | 412 | if (b->sectors_free < c->sb.block_size) |
413 | b->sectors_free = 0; | 413 | b->sectors_free = 0; |
414 | 414 | ||
415 | /* | 415 | /* |
416 | * k takes refcounts on the buckets it points to until it's inserted | 416 | * k takes refcounts on the buckets it points to until it's inserted |
417 | * into the btree, but if we're done with this bucket we just transfer | 417 | * into the btree, but if we're done with this bucket we just transfer |
418 | * the refcount the open bucket was holding. | 418 | * the refcount the open bucket was holding. |
419 | */ | 419 | */ |
420 | if (b->sectors_free) | 420 | if (b->sectors_free) |
421 | for (i = 0; i < KEY_PTRS(&b->key); i++) | 421 | for (i = 0; i < KEY_PTRS(&b->key); i++) |
422 | atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); | 422 | atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); |
423 | 423 | ||
424 | spin_unlock(&c->data_bucket_lock); | 424 | spin_unlock(&c->data_bucket_lock); |
425 | return true; | 425 | return true; |
426 | } | 426 | } |
427 | 427 | ||
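The allocate-outside-the-lock dance in bch_alloc_sectors() is a recurring kernel pattern, so a stripped-down sketch may help: drop the spinlock before any allocation that can sleep, reacquire it, and recheck, since another thread may have satisfied the need in the meantime. Everything below (demo_cache, demo_item, demo_try_pick) is hypothetical scaffolding, not bcache code.

	#include <linux/list.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>

	struct demo_item  { struct list_head list; };
	struct demo_cache { spinlock_t lock; struct list_head free; };

	/* Grab a free item; consume *spare (if any) to refill the list first. */
	static struct demo_item *demo_try_pick(struct demo_cache *c,
					       struct demo_item **spare)
	{
		if (list_empty(&c->free) && *spare) {
			list_add(&(*spare)->list, &c->free);
			*spare = NULL;
		}

		if (list_empty(&c->free))
			return NULL;

		return list_first_entry(&c->free, struct demo_item, list);
	}

	static struct demo_item *demo_pick_or_alloc(struct demo_cache *c)
	{
		struct demo_item *it, *spare = NULL;

		spin_lock(&c->lock);

		while (!(it = demo_try_pick(c, &spare))) {
			/* kmalloc() may sleep, so we can't call it with the spinlock held. */
			spin_unlock(&c->lock);

			spare = kmalloc(sizeof(*spare), GFP_NOIO);
			if (!spare)
				return NULL;

			/* Retake the lock and retry; another thread may have raced us. */
			spin_lock(&c->lock);
		}

		/* If we allocated but didn't end up needing it, free the leftover. */
		kfree(spare);

		spin_unlock(&c->lock);
		return it;
	}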
428 | static void bch_insert_data_error(struct closure *cl) | 428 | static void bch_insert_data_error(struct closure *cl) |
429 | { | 429 | { |
430 | struct btree_op *op = container_of(cl, struct btree_op, cl); | 430 | struct btree_op *op = container_of(cl, struct btree_op, cl); |
431 | 431 | ||
432 | /* | 432 | /* |
433 | * Our data write just errored, which means we've got a bunch of keys to | 433 | * Our data write just errored, which means we've got a bunch of keys to |
434 | * insert that point to data that wasn't successfully written. | 434 | * insert that point to data that wasn't successfully written. |
435 | * | 435 | * |
436 | * We don't have to insert those keys but we still have to invalidate | 436 | * We don't have to insert those keys but we still have to invalidate |
437 | * that region of the cache - so, if we just strip off all the pointers | 437 | * that region of the cache - so, if we just strip off all the pointers |
438 | * from the keys we'll accomplish just that. | 438 | * from the keys we'll accomplish just that. |
439 | */ | 439 | */ |
440 | 440 | ||
441 | struct bkey *src = op->keys.bottom, *dst = op->keys.bottom; | 441 | struct bkey *src = op->keys.bottom, *dst = op->keys.bottom; |
442 | 442 | ||
443 | while (src != op->keys.top) { | 443 | while (src != op->keys.top) { |
444 | struct bkey *n = bkey_next(src); | 444 | struct bkey *n = bkey_next(src); |
445 | 445 | ||
446 | SET_KEY_PTRS(src, 0); | 446 | SET_KEY_PTRS(src, 0); |
447 | bkey_copy(dst, src); | 447 | bkey_copy(dst, src); |
448 | 448 | ||
449 | dst = bkey_next(dst); | 449 | dst = bkey_next(dst); |
450 | src = n; | 450 | src = n; |
451 | } | 451 | } |
452 | 452 | ||
453 | op->keys.top = dst; | 453 | op->keys.top = dst; |
454 | 454 | ||
455 | bch_journal(cl); | 455 | bch_journal(cl); |
456 | } | 456 | } |
457 | 457 | ||
458 | static void bch_insert_data_endio(struct bio *bio, int error) | 458 | static void bch_insert_data_endio(struct bio *bio, int error) |
459 | { | 459 | { |
460 | struct closure *cl = bio->bi_private; | 460 | struct closure *cl = bio->bi_private; |
461 | struct btree_op *op = container_of(cl, struct btree_op, cl); | 461 | struct btree_op *op = container_of(cl, struct btree_op, cl); |
462 | struct search *s = container_of(op, struct search, op); | 462 | struct search *s = container_of(op, struct search, op); |
463 | 463 | ||
464 | if (error) { | 464 | if (error) { |
465 | /* TODO: We could try to recover from this. */ | 465 | /* TODO: We could try to recover from this. */ |
466 | if (s->writeback) | 466 | if (s->writeback) |
467 | s->error = error; | 467 | s->error = error; |
468 | else if (s->write) | 468 | else if (s->write) |
469 | set_closure_fn(cl, bch_insert_data_error, bcache_wq); | 469 | set_closure_fn(cl, bch_insert_data_error, bcache_wq); |
470 | else | 470 | else |
471 | set_closure_fn(cl, NULL, NULL); | 471 | set_closure_fn(cl, NULL, NULL); |
472 | } | 472 | } |
473 | 473 | ||
474 | bch_bbio_endio(op->c, bio, error, "writing data to cache"); | 474 | bch_bbio_endio(op->c, bio, error, "writing data to cache"); |
475 | } | 475 | } |
476 | 476 | ||
477 | static void bch_insert_data_loop(struct closure *cl) | 477 | static void bch_insert_data_loop(struct closure *cl) |
478 | { | 478 | { |
479 | struct btree_op *op = container_of(cl, struct btree_op, cl); | 479 | struct btree_op *op = container_of(cl, struct btree_op, cl); |
480 | struct search *s = container_of(op, struct search, op); | 480 | struct search *s = container_of(op, struct search, op); |
481 | struct bio *bio = op->cache_bio, *n; | 481 | struct bio *bio = op->cache_bio, *n; |
482 | 482 | ||
483 | if (op->skip) | 483 | if (op->skip) |
484 | return bio_invalidate(cl); | 484 | return bio_invalidate(cl); |
485 | 485 | ||
486 | if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { | 486 | if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { |
487 | set_gc_sectors(op->c); | 487 | set_gc_sectors(op->c); |
488 | bch_queue_gc(op->c); | 488 | bch_queue_gc(op->c); |
489 | } | 489 | } |
490 | 490 | ||
491 | do { | 491 | do { |
492 | unsigned i; | 492 | unsigned i; |
493 | struct bkey *k; | 493 | struct bkey *k; |
494 | struct bio_set *split = s->d | 494 | struct bio_set *split = s->d |
495 | ? s->d->bio_split : op->c->bio_split; | 495 | ? s->d->bio_split : op->c->bio_split; |
496 | 496 | ||
497 | /* 1 for the device pointer and 1 for the chksum */ | 497 | /* 1 for the device pointer and 1 for the chksum */ |
498 | if (bch_keylist_realloc(&op->keys, | 498 | if (bch_keylist_realloc(&op->keys, |
499 | 1 + (op->csum ? 1 : 0), | 499 | 1 + (op->csum ? 1 : 0), |
500 | op->c)) | 500 | op->c)) |
501 | continue_at(cl, bch_journal, bcache_wq); | 501 | continue_at(cl, bch_journal, bcache_wq); |
502 | 502 | ||
503 | k = op->keys.top; | 503 | k = op->keys.top; |
504 | bkey_init(k); | 504 | bkey_init(k); |
505 | SET_KEY_INODE(k, op->inode); | 505 | SET_KEY_INODE(k, op->inode); |
506 | SET_KEY_OFFSET(k, bio->bi_sector); | 506 | SET_KEY_OFFSET(k, bio->bi_sector); |
507 | 507 | ||
508 | if (!bch_alloc_sectors(k, bio_sectors(bio), s)) | 508 | if (!bch_alloc_sectors(k, bio_sectors(bio), s)) |
509 | goto err; | 509 | goto err; |
510 | 510 | ||
511 | n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); | 511 | n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); |
512 | if (!n) { | ||
513 | __bkey_put(op->c, k); | ||
514 | continue_at(cl, bch_insert_data_loop, bcache_wq); | ||
515 | } | ||
516 | 512 | ||
517 | n->bi_end_io = bch_insert_data_endio; | 513 | n->bi_end_io = bch_insert_data_endio; |
518 | n->bi_private = cl; | 514 | n->bi_private = cl; |
519 | 515 | ||
520 | if (s->writeback) { | 516 | if (s->writeback) { |
521 | SET_KEY_DIRTY(k, true); | 517 | SET_KEY_DIRTY(k, true); |
522 | 518 | ||
523 | for (i = 0; i < KEY_PTRS(k); i++) | 519 | for (i = 0; i < KEY_PTRS(k); i++) |
524 | SET_GC_MARK(PTR_BUCKET(op->c, k, i), | 520 | SET_GC_MARK(PTR_BUCKET(op->c, k, i), |
525 | GC_MARK_DIRTY); | 521 | GC_MARK_DIRTY); |
526 | } | 522 | } |
527 | 523 | ||
528 | SET_KEY_CSUM(k, op->csum); | 524 | SET_KEY_CSUM(k, op->csum); |
529 | if (KEY_CSUM(k)) | 525 | if (KEY_CSUM(k)) |
530 | bio_csum(n, k); | 526 | bio_csum(n, k); |
531 | 527 | ||
532 | trace_bcache_cache_insert(k); | 528 | trace_bcache_cache_insert(k); |
533 | bch_keylist_push(&op->keys); | 529 | bch_keylist_push(&op->keys); |
534 | 530 | ||
535 | n->bi_rw |= REQ_WRITE; | 531 | n->bi_rw |= REQ_WRITE; |
536 | bch_submit_bbio(n, op->c, k, 0); | 532 | bch_submit_bbio(n, op->c, k, 0); |
537 | } while (n != bio); | 533 | } while (n != bio); |
538 | 534 | ||
539 | op->insert_data_done = true; | 535 | op->insert_data_done = true; |
540 | continue_at(cl, bch_journal, bcache_wq); | 536 | continue_at(cl, bch_journal, bcache_wq); |
541 | err: | 537 | err: |
542 | /* bch_alloc_sectors() blocks if s->writeback = true */ | 538 | /* bch_alloc_sectors() blocks if s->writeback = true */ |
543 | BUG_ON(s->writeback); | 539 | BUG_ON(s->writeback); |
544 | 540 | ||
545 | /* | 541 | /* |
546 | * But if it's not a writeback write we'd rather just bail out if | 542 | * But if it's not a writeback write we'd rather just bail out if |
547 | * there aren't any buckets ready to write to - it might take a while and | 543 | * there aren't any buckets ready to write to - it might take a while and |
548 | * we might be starving btree writes for gc or something. | 544 | * we might be starving btree writes for gc or something. |
549 | */ | 545 | */ |
550 | 546 | ||
551 | if (s->write) { | 547 | if (s->write) { |
552 | /* | 548 | /* |
553 | * Writethrough write: We can't complete the write until we've | 549 | * Writethrough write: We can't complete the write until we've |
554 | * updated the index. But we don't want to delay the write while | 550 | * updated the index. But we don't want to delay the write while |
555 | * we wait for buckets to be freed up, so just invalidate the | 551 | * we wait for buckets to be freed up, so just invalidate the |
556 | * rest of the write. | 552 | * rest of the write. |
557 | */ | 553 | */ |
558 | op->skip = true; | 554 | op->skip = true; |
559 | return bio_invalidate(cl); | 555 | return bio_invalidate(cl); |
560 | } else { | 556 | } else { |
561 | /* | 557 | /* |
562 | * From a cache miss, we can just insert the keys for the data | 558 | * From a cache miss, we can just insert the keys for the data |
563 | * we have written or bail out if we didn't do anything. | 559 | * we have written or bail out if we didn't do anything. |
564 | */ | 560 | */ |
565 | op->insert_data_done = true; | 561 | op->insert_data_done = true; |
566 | bio_put(bio); | 562 | bio_put(bio); |
567 | 563 | ||
568 | if (!bch_keylist_empty(&op->keys)) | 564 | if (!bch_keylist_empty(&op->keys)) |
569 | continue_at(cl, bch_journal, bcache_wq); | 565 | continue_at(cl, bch_journal, bcache_wq); |
570 | else | 566 | else |
571 | closure_return(cl); | 567 | closure_return(cl); |
572 | } | 568 | } |
573 | } | 569 | } |
574 | 570 | ||
575 | /** | 571 | /** |
576 | * bch_insert_data - stick some data in the cache | 572 | * bch_insert_data - stick some data in the cache |
577 | * | 573 | * |
578 | * This is the starting point for any data to end up in a cache device; it could | 574 | * This is the starting point for any data to end up in a cache device; it could |
579 | * be from a normal write, or a writeback write, or a write to a flash only | 575 | * be from a normal write, or a writeback write, or a write to a flash only |
580 | * volume - it's also used by the moving garbage collector to compact data in | 576 | * volume - it's also used by the moving garbage collector to compact data in |
581 | * mostly empty buckets. | 577 | * mostly empty buckets. |
582 | * | 578 | * |
583 | * It first writes the data to the cache, creating a list of keys to be inserted | 579 | * It first writes the data to the cache, creating a list of keys to be inserted |
584 | * (if the data had to be fragmented there will be multiple keys); after the | 580 | * (if the data had to be fragmented there will be multiple keys); after the |
585 | * data is written it calls bch_journal, and after the keys have been added to | 581 | * data is written it calls bch_journal, and after the keys have been added to |
586 | * the next journal write they're inserted into the btree. | 582 | * the next journal write they're inserted into the btree. |
587 | * | 583 | * |
588 | * It inserts the data in op->cache_bio; bi_sector is used for the key offset, | 584 | * It inserts the data in op->cache_bio; bi_sector is used for the key offset, |
589 | * and op->inode is used for the key inode. | 585 | * and op->inode is used for the key inode. |
590 | * | 586 | * |
591 | * If op->skip is true, instead of inserting the data it invalidates the region | 587 | * If op->skip is true, instead of inserting the data it invalidates the region |
592 | * of the cache represented by op->cache_bio and op->inode. | 588 | * of the cache represented by op->cache_bio and op->inode. |
593 | */ | 589 | */ |
594 | void bch_insert_data(struct closure *cl) | 590 | void bch_insert_data(struct closure *cl) |
595 | { | 591 | { |
596 | struct btree_op *op = container_of(cl, struct btree_op, cl); | 592 | struct btree_op *op = container_of(cl, struct btree_op, cl); |
597 | 593 | ||
598 | bch_keylist_init(&op->keys); | 594 | bch_keylist_init(&op->keys); |
599 | bio_get(op->cache_bio); | 595 | bio_get(op->cache_bio); |
600 | bch_insert_data_loop(cl); | 596 | bch_insert_data_loop(cl); |
601 | } | 597 | } |
602 | 598 | ||
603 | void bch_btree_insert_async(struct closure *cl) | 599 | void bch_btree_insert_async(struct closure *cl) |
604 | { | 600 | { |
605 | struct btree_op *op = container_of(cl, struct btree_op, cl); | 601 | struct btree_op *op = container_of(cl, struct btree_op, cl); |
606 | struct search *s = container_of(op, struct search, op); | 602 | struct search *s = container_of(op, struct search, op); |
607 | 603 | ||
608 | if (bch_btree_insert(op, op->c)) { | 604 | if (bch_btree_insert(op, op->c)) { |
609 | s->error = -ENOMEM; | 605 | s->error = -ENOMEM; |
610 | op->insert_data_done = true; | 606 | op->insert_data_done = true; |
611 | } | 607 | } |
612 | 608 | ||
613 | if (op->insert_data_done) { | 609 | if (op->insert_data_done) { |
614 | bch_keylist_free(&op->keys); | 610 | bch_keylist_free(&op->keys); |
615 | closure_return(cl); | 611 | closure_return(cl); |
616 | } else | 612 | } else |
617 | continue_at(cl, bch_insert_data_loop, bcache_wq); | 613 | continue_at(cl, bch_insert_data_loop, bcache_wq); |
618 | } | 614 | } |
619 | 615 | ||
620 | /* Common code for the make_request functions */ | 616 | /* Common code for the make_request functions */ |
621 | 617 | ||
622 | static void request_endio(struct bio *bio, int error) | 618 | static void request_endio(struct bio *bio, int error) |
623 | { | 619 | { |
624 | struct closure *cl = bio->bi_private; | 620 | struct closure *cl = bio->bi_private; |
625 | 621 | ||
626 | if (error) { | 622 | if (error) { |
627 | struct search *s = container_of(cl, struct search, cl); | 623 | struct search *s = container_of(cl, struct search, cl); |
628 | s->error = error; | 624 | s->error = error; |
629 | /* Only cache read errors are recoverable */ | 625 | /* Only cache read errors are recoverable */ |
630 | s->recoverable = false; | 626 | s->recoverable = false; |
631 | } | 627 | } |
632 | 628 | ||
633 | bio_put(bio); | 629 | bio_put(bio); |
634 | closure_put(cl); | 630 | closure_put(cl); |
635 | } | 631 | } |
636 | 632 | ||
637 | void bch_cache_read_endio(struct bio *bio, int error) | 633 | void bch_cache_read_endio(struct bio *bio, int error) |
638 | { | 634 | { |
639 | struct bbio *b = container_of(bio, struct bbio, bio); | 635 | struct bbio *b = container_of(bio, struct bbio, bio); |
640 | struct closure *cl = bio->bi_private; | 636 | struct closure *cl = bio->bi_private; |
641 | struct search *s = container_of(cl, struct search, cl); | 637 | struct search *s = container_of(cl, struct search, cl); |
642 | 638 | ||
643 | /* | 639 | /* |
644 | * If the bucket was reused while our bio was in flight, we might have | 640 | * If the bucket was reused while our bio was in flight, we might have |
645 | * read the wrong data. Set s->error but not error so it doesn't get | 641 | * read the wrong data. Set s->error but not error so it doesn't get |
646 | * counted against the cache device, but we'll still reread the data | 642 | * counted against the cache device, but we'll still reread the data |
647 | * from the backing device. | 643 | * from the backing device. |
648 | */ | 644 | */ |
649 | 645 | ||
650 | if (error) | 646 | if (error) |
651 | s->error = error; | 647 | s->error = error; |
652 | else if (ptr_stale(s->op.c, &b->key, 0)) { | 648 | else if (ptr_stale(s->op.c, &b->key, 0)) { |
653 | atomic_long_inc(&s->op.c->cache_read_races); | 649 | atomic_long_inc(&s->op.c->cache_read_races); |
654 | s->error = -EINTR; | 650 | s->error = -EINTR; |
655 | } | 651 | } |
656 | 652 | ||
657 | bch_bbio_endio(s->op.c, bio, error, "reading from cache"); | 653 | bch_bbio_endio(s->op.c, bio, error, "reading from cache"); |
658 | } | 654 | } |
659 | 655 | ||
660 | static void bio_complete(struct search *s) | 656 | static void bio_complete(struct search *s) |
661 | { | 657 | { |
662 | if (s->orig_bio) { | 658 | if (s->orig_bio) { |
663 | int cpu, rw = bio_data_dir(s->orig_bio); | 659 | int cpu, rw = bio_data_dir(s->orig_bio); |
664 | unsigned long duration = jiffies - s->start_time; | 660 | unsigned long duration = jiffies - s->start_time; |
665 | 661 | ||
666 | cpu = part_stat_lock(); | 662 | cpu = part_stat_lock(); |
667 | part_round_stats(cpu, &s->d->disk->part0); | 663 | part_round_stats(cpu, &s->d->disk->part0); |
668 | part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration); | 664 | part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration); |
669 | part_stat_unlock(); | 665 | part_stat_unlock(); |
670 | 666 | ||
671 | trace_bcache_request_end(s, s->orig_bio); | 667 | trace_bcache_request_end(s, s->orig_bio); |
672 | bio_endio(s->orig_bio, s->error); | 668 | bio_endio(s->orig_bio, s->error); |
673 | s->orig_bio = NULL; | 669 | s->orig_bio = NULL; |
674 | } | 670 | } |
675 | } | 671 | } |
676 | 672 | ||
677 | static void do_bio_hook(struct search *s) | 673 | static void do_bio_hook(struct search *s) |
678 | { | 674 | { |
679 | struct bio *bio = &s->bio.bio; | 675 | struct bio *bio = &s->bio.bio; |
680 | memcpy(bio, s->orig_bio, sizeof(struct bio)); | 676 | memcpy(bio, s->orig_bio, sizeof(struct bio)); |
681 | 677 | ||
682 | bio->bi_end_io = request_endio; | 678 | bio->bi_end_io = request_endio; |
683 | bio->bi_private = &s->cl; | 679 | bio->bi_private = &s->cl; |
684 | atomic_set(&bio->bi_cnt, 3); | 680 | atomic_set(&bio->bi_cnt, 3); |
685 | } | 681 | } |
686 | 682 | ||
687 | static void search_free(struct closure *cl) | 683 | static void search_free(struct closure *cl) |
688 | { | 684 | { |
689 | struct search *s = container_of(cl, struct search, cl); | 685 | struct search *s = container_of(cl, struct search, cl); |
690 | bio_complete(s); | 686 | bio_complete(s); |
691 | 687 | ||
692 | if (s->op.cache_bio) | 688 | if (s->op.cache_bio) |
693 | bio_put(s->op.cache_bio); | 689 | bio_put(s->op.cache_bio); |
694 | 690 | ||
695 | if (s->unaligned_bvec) | 691 | if (s->unaligned_bvec) |
696 | mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); | 692 | mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); |
697 | 693 | ||
698 | closure_debug_destroy(cl); | 694 | closure_debug_destroy(cl); |
699 | mempool_free(s, s->d->c->search); | 695 | mempool_free(s, s->d->c->search); |
700 | } | 696 | } |
701 | 697 | ||
702 | static struct search *search_alloc(struct bio *bio, struct bcache_device *d) | 698 | static struct search *search_alloc(struct bio *bio, struct bcache_device *d) |
703 | { | 699 | { |
704 | struct bio_vec *bv; | 700 | struct bio_vec *bv; |
705 | struct search *s = mempool_alloc(d->c->search, GFP_NOIO); | 701 | struct search *s = mempool_alloc(d->c->search, GFP_NOIO); |
706 | memset(s, 0, offsetof(struct search, op.keys)); | 702 | memset(s, 0, offsetof(struct search, op.keys)); |
707 | 703 | ||
708 | __closure_init(&s->cl, NULL); | 704 | __closure_init(&s->cl, NULL); |
709 | 705 | ||
710 | s->op.inode = d->id; | 706 | s->op.inode = d->id; |
711 | s->op.c = d->c; | 707 | s->op.c = d->c; |
712 | s->d = d; | 708 | s->d = d; |
713 | s->op.lock = -1; | 709 | s->op.lock = -1; |
714 | s->task = current; | 710 | s->task = current; |
715 | s->orig_bio = bio; | 711 | s->orig_bio = bio; |
716 | s->write = (bio->bi_rw & REQ_WRITE) != 0; | 712 | s->write = (bio->bi_rw & REQ_WRITE) != 0; |
717 | s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0; | 713 | s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0; |
718 | s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; | 714 | s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; |
719 | s->recoverable = 1; | 715 | s->recoverable = 1; |
720 | s->start_time = jiffies; | 716 | s->start_time = jiffies; |
721 | do_bio_hook(s); | 717 | do_bio_hook(s); |
722 | 718 | ||
723 | if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) { | 719 | if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) { |
724 | bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO); | 720 | bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO); |
725 | memcpy(bv, bio_iovec(bio), | 721 | memcpy(bv, bio_iovec(bio), |
726 | sizeof(struct bio_vec) * bio_segments(bio)); | 722 | sizeof(struct bio_vec) * bio_segments(bio)); |
727 | 723 | ||
728 | s->bio.bio.bi_io_vec = bv; | 724 | s->bio.bio.bi_io_vec = bv; |
729 | s->unaligned_bvec = 1; | 725 | s->unaligned_bvec = 1; |
730 | } | 726 | } |
731 | 727 | ||
732 | return s; | 728 | return s; |
733 | } | 729 | } |
734 | 730 | ||
735 | static void btree_read_async(struct closure *cl) | 731 | static void btree_read_async(struct closure *cl) |
736 | { | 732 | { |
737 | struct btree_op *op = container_of(cl, struct btree_op, cl); | 733 | struct btree_op *op = container_of(cl, struct btree_op, cl); |
738 | 734 | ||
739 | int ret = btree_root(search_recurse, op->c, op); | 735 | int ret = btree_root(search_recurse, op->c, op); |
740 | 736 | ||
741 | if (ret == -EAGAIN) | 737 | if (ret == -EAGAIN) |
742 | continue_at(cl, btree_read_async, bcache_wq); | 738 | continue_at(cl, btree_read_async, bcache_wq); |
743 | 739 | ||
744 | closure_return(cl); | 740 | closure_return(cl); |
745 | } | 741 | } |
746 | 742 | ||
747 | /* Cached devices */ | 743 | /* Cached devices */ |
748 | 744 | ||
749 | static void cached_dev_bio_complete(struct closure *cl) | 745 | static void cached_dev_bio_complete(struct closure *cl) |
750 | { | 746 | { |
751 | struct search *s = container_of(cl, struct search, cl); | 747 | struct search *s = container_of(cl, struct search, cl); |
752 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | 748 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); |
753 | 749 | ||
754 | search_free(cl); | 750 | search_free(cl); |
755 | cached_dev_put(dc); | 751 | cached_dev_put(dc); |
756 | } | 752 | } |
757 | 753 | ||
758 | /* Process reads */ | 754 | /* Process reads */ |
759 | 755 | ||
760 | static void cached_dev_read_complete(struct closure *cl) | 756 | static void cached_dev_read_complete(struct closure *cl) |
761 | { | 757 | { |
762 | struct search *s = container_of(cl, struct search, cl); | 758 | struct search *s = container_of(cl, struct search, cl); |
763 | 759 | ||
764 | if (s->op.insert_collision) | 760 | if (s->op.insert_collision) |
765 | bch_mark_cache_miss_collision(s); | 761 | bch_mark_cache_miss_collision(s); |
766 | 762 | ||
767 | if (s->op.cache_bio) { | 763 | if (s->op.cache_bio) { |
768 | int i; | 764 | int i; |
769 | struct bio_vec *bv; | 765 | struct bio_vec *bv; |
770 | 766 | ||
771 | __bio_for_each_segment(bv, s->op.cache_bio, i, 0) | 767 | __bio_for_each_segment(bv, s->op.cache_bio, i, 0) |
772 | __free_page(bv->bv_page); | 768 | __free_page(bv->bv_page); |
773 | } | 769 | } |
774 | 770 | ||
775 | cached_dev_bio_complete(cl); | 771 | cached_dev_bio_complete(cl); |
776 | } | 772 | } |
777 | 773 | ||
778 | static void request_read_error(struct closure *cl) | 774 | static void request_read_error(struct closure *cl) |
779 | { | 775 | { |
780 | struct search *s = container_of(cl, struct search, cl); | 776 | struct search *s = container_of(cl, struct search, cl); |
781 | struct bio_vec *bv; | 777 | struct bio_vec *bv; |
782 | int i; | 778 | int i; |
783 | 779 | ||
784 | if (s->recoverable) { | 780 | if (s->recoverable) { |
785 | /* Retry from the backing device: */ | 781 | /* Retry from the backing device: */ |
786 | trace_bcache_read_retry(s->orig_bio); | 782 | trace_bcache_read_retry(s->orig_bio); |
787 | 783 | ||
788 | s->error = 0; | 784 | s->error = 0; |
789 | bv = s->bio.bio.bi_io_vec; | 785 | bv = s->bio.bio.bi_io_vec; |
790 | do_bio_hook(s); | 786 | do_bio_hook(s); |
791 | s->bio.bio.bi_io_vec = bv; | 787 | s->bio.bio.bi_io_vec = bv; |
792 | 788 | ||
793 | if (!s->unaligned_bvec) | 789 | if (!s->unaligned_bvec) |
794 | bio_for_each_segment(bv, s->orig_bio, i) | 790 | bio_for_each_segment(bv, s->orig_bio, i) |
795 | bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; | 791 | bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; |
796 | else | 792 | else |
797 | memcpy(s->bio.bio.bi_io_vec, | 793 | memcpy(s->bio.bio.bi_io_vec, |
798 | bio_iovec(s->orig_bio), | 794 | bio_iovec(s->orig_bio), |
799 | sizeof(struct bio_vec) * | 795 | sizeof(struct bio_vec) * |
800 | bio_segments(s->orig_bio)); | 796 | bio_segments(s->orig_bio)); |
801 | 797 | ||
802 | /* XXX: invalidate cache */ | 798 | /* XXX: invalidate cache */ |
803 | 799 | ||
804 | closure_bio_submit(&s->bio.bio, &s->cl, s->d); | 800 | closure_bio_submit(&s->bio.bio, &s->cl, s->d); |
805 | } | 801 | } |
806 | 802 | ||
807 | continue_at(cl, cached_dev_read_complete, NULL); | 803 | continue_at(cl, cached_dev_read_complete, NULL); |
808 | } | 804 | } |
809 | 805 | ||
810 | static void request_read_done(struct closure *cl) | 806 | static void request_read_done(struct closure *cl) |
811 | { | 807 | { |
812 | struct search *s = container_of(cl, struct search, cl); | 808 | struct search *s = container_of(cl, struct search, cl); |
813 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | 809 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); |
814 | 810 | ||
815 | /* | 811 | /* |
816 | * s->op.cache_bio != NULL implies that we had a cache miss; cache_bio now | 812 | * s->op.cache_bio != NULL implies that we had a cache miss; cache_bio now |
817 | * contains data ready to be inserted into the cache. | 813 | * contains data ready to be inserted into the cache. |
818 | * | 814 | * |
819 | * First, we copy the data we just read from cache_bio's bounce buffers | 815 | * First, we copy the data we just read from cache_bio's bounce buffers |
820 | * to the buffers the original bio pointed to: | 816 | * to the buffers the original bio pointed to: |
821 | */ | 817 | */ |
822 | 818 | ||
823 | if (s->op.cache_bio) { | 819 | if (s->op.cache_bio) { |
824 | struct bio_vec *src, *dst; | ||
825 | unsigned src_offset, dst_offset, bytes; | ||
826 | void *dst_ptr; | ||
827 | |||
828 | bio_reset(s->op.cache_bio); | 820 | bio_reset(s->op.cache_bio); |
829 | s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; | 821 | s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; |
830 | s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; | 822 | s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; |
831 | s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; | 823 | s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; |
832 | bch_bio_map(s->op.cache_bio, NULL); | 824 | bch_bio_map(s->op.cache_bio, NULL); |
833 | 825 | ||
834 | src = bio_iovec(s->op.cache_bio); | 826 | bio_copy_data(s->cache_miss, s->op.cache_bio); |
835 | dst = bio_iovec(s->cache_miss); | ||
836 | src_offset = src->bv_offset; | ||
837 | dst_offset = dst->bv_offset; | ||
838 | dst_ptr = kmap(dst->bv_page); | ||
839 | 827 | ||
840 | while (1) { | ||
841 | if (dst_offset == dst->bv_offset + dst->bv_len) { | ||
842 | kunmap(dst->bv_page); | ||
843 | dst++; | ||
844 | if (dst == bio_iovec_idx(s->cache_miss, | ||
845 | s->cache_miss->bi_vcnt)) | ||
846 | break; | ||
847 | |||
848 | dst_offset = dst->bv_offset; | ||
849 | dst_ptr = kmap(dst->bv_page); | ||
850 | } | ||
851 | |||
852 | if (src_offset == src->bv_offset + src->bv_len) { | ||
853 | src++; | ||
854 | if (src == bio_iovec_idx(s->op.cache_bio, | ||
855 | s->op.cache_bio->bi_vcnt)) | ||
856 | BUG(); | ||
857 | |||
858 | src_offset = src->bv_offset; | ||
859 | } | ||
860 | |||
861 | bytes = min(dst->bv_offset + dst->bv_len - dst_offset, | ||
862 | src->bv_offset + src->bv_len - src_offset); | ||
863 | |||
864 | memcpy(dst_ptr + dst_offset, | ||
865 | page_address(src->bv_page) + src_offset, | ||
866 | bytes); | ||
867 | |||
868 | src_offset += bytes; | ||
869 | dst_offset += bytes; | ||
870 | } | ||
871 | |||
872 | bio_put(s->cache_miss); | 828 | bio_put(s->cache_miss); |
873 | s->cache_miss = NULL; | 829 | s->cache_miss = NULL; |
874 | } | 830 | } |
875 | 831 | ||
876 | if (verify(dc, &s->bio.bio) && s->recoverable) | 832 | if (verify(dc, &s->bio.bio) && s->recoverable) |
877 | bch_data_verify(s); | 833 | bch_data_verify(s); |
878 | 834 | ||
879 | bio_complete(s); | 835 | bio_complete(s); |
880 | 836 | ||
881 | if (s->op.cache_bio && | 837 | if (s->op.cache_bio && |
882 | !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) { | 838 | !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) { |
883 | s->op.type = BTREE_REPLACE; | 839 | s->op.type = BTREE_REPLACE; |
884 | closure_call(&s->op.cl, bch_insert_data, NULL, cl); | 840 | closure_call(&s->op.cl, bch_insert_data, NULL, cl); |
885 | } | 841 | } |
886 | 842 | ||
887 | continue_at(cl, cached_dev_read_complete, NULL); | 843 | continue_at(cl, cached_dev_read_complete, NULL); |
888 | } | 844 | } |
889 | 845 | ||
890 | static void request_read_done_bh(struct closure *cl) | 846 | static void request_read_done_bh(struct closure *cl) |
891 | { | 847 | { |
892 | struct search *s = container_of(cl, struct search, cl); | 848 | struct search *s = container_of(cl, struct search, cl); |
893 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | 849 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); |
894 | 850 | ||
895 | bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); | 851 | bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); |
896 | trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip); | 852 | trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip); |
897 | 853 | ||
898 | if (s->error) | 854 | if (s->error) |
899 | continue_at_nobarrier(cl, request_read_error, bcache_wq); | 855 | continue_at_nobarrier(cl, request_read_error, bcache_wq); |
900 | else if (s->op.cache_bio || verify(dc, &s->bio.bio)) | 856 | else if (s->op.cache_bio || verify(dc, &s->bio.bio)) |
901 | continue_at_nobarrier(cl, request_read_done, bcache_wq); | 857 | continue_at_nobarrier(cl, request_read_done, bcache_wq); |
902 | else | 858 | else |
903 | continue_at_nobarrier(cl, cached_dev_read_complete, NULL); | 859 | continue_at_nobarrier(cl, cached_dev_read_complete, NULL); |
904 | } | 860 | } |
905 | 861 | ||
906 | static int cached_dev_cache_miss(struct btree *b, struct search *s, | 862 | static int cached_dev_cache_miss(struct btree *b, struct search *s, |
907 | struct bio *bio, unsigned sectors) | 863 | struct bio *bio, unsigned sectors) |
908 | { | 864 | { |
909 | int ret = 0; | 865 | int ret = 0; |
910 | unsigned reada; | 866 | unsigned reada; |
911 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | 867 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); |
912 | struct bio *miss; | 868 | struct bio *miss; |
913 | 869 | ||
914 | miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); | 870 | miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); |
915 | if (!miss) | ||
916 | return -EAGAIN; | ||
917 | |||
918 | if (miss == bio) | 871 | if (miss == bio) |
919 | s->op.lookup_done = true; | 872 | s->op.lookup_done = true; |
920 | 873 | ||
921 | miss->bi_end_io = request_endio; | 874 | miss->bi_end_io = request_endio; |
922 | miss->bi_private = &s->cl; | 875 | miss->bi_private = &s->cl; |
923 | 876 | ||
924 | if (s->cache_miss || s->op.skip) | 877 | if (s->cache_miss || s->op.skip) |
925 | goto out_submit; | 878 | goto out_submit; |
926 | 879 | ||
927 | if (miss != bio || | 880 | if (miss != bio || |
928 | (bio->bi_rw & REQ_RAHEAD) || | 881 | (bio->bi_rw & REQ_RAHEAD) || |
929 | (bio->bi_rw & REQ_META) || | 882 | (bio->bi_rw & REQ_META) || |
930 | s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA) | 883 | s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA) |
931 | reada = 0; | 884 | reada = 0; |
932 | else { | 885 | else { |
933 | reada = min(dc->readahead >> 9, | 886 | reada = min(dc->readahead >> 9, |
934 | sectors - bio_sectors(miss)); | 887 | sectors - bio_sectors(miss)); |
935 | 888 | ||
936 | if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev)) | 889 | if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev)) |
937 | reada = bdev_sectors(miss->bi_bdev) - bio_end(miss); | 890 | reada = bdev_sectors(miss->bi_bdev) - |
891 | bio_end_sector(miss); | ||
938 | } | 892 | } |
939 | 893 | ||
940 | s->cache_bio_sectors = bio_sectors(miss) + reada; | 894 | s->cache_bio_sectors = bio_sectors(miss) + reada; |
941 | s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT, | 895 | s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT, |
942 | DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS), | 896 | DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS), |
943 | dc->disk.bio_split); | 897 | dc->disk.bio_split); |
944 | 898 | ||
945 | if (!s->op.cache_bio) | 899 | if (!s->op.cache_bio) |
946 | goto out_submit; | 900 | goto out_submit; |
947 | 901 | ||
948 | s->op.cache_bio->bi_sector = miss->bi_sector; | 902 | s->op.cache_bio->bi_sector = miss->bi_sector; |
949 | s->op.cache_bio->bi_bdev = miss->bi_bdev; | 903 | s->op.cache_bio->bi_bdev = miss->bi_bdev; |
950 | s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; | 904 | s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; |
951 | 905 | ||
952 | s->op.cache_bio->bi_end_io = request_endio; | 906 | s->op.cache_bio->bi_end_io = request_endio; |
953 | s->op.cache_bio->bi_private = &s->cl; | 907 | s->op.cache_bio->bi_private = &s->cl; |
954 | 908 | ||
955 | /* btree_search_recurse()'s btree iterator is no good anymore */ | 909 | /* btree_search_recurse()'s btree iterator is no good anymore */ |
956 | ret = -EINTR; | 910 | ret = -EINTR; |
957 | if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio)) | 911 | if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio)) |
958 | goto out_put; | 912 | goto out_put; |
959 | 913 | ||
960 | bch_bio_map(s->op.cache_bio, NULL); | 914 | bch_bio_map(s->op.cache_bio, NULL); |
961 | if (bch_bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) | 915 | if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) |
962 | goto out_put; | 916 | goto out_put; |
963 | 917 | ||
964 | s->cache_miss = miss; | 918 | s->cache_miss = miss; |
965 | bio_get(s->op.cache_bio); | 919 | bio_get(s->op.cache_bio); |
966 | 920 | ||
967 | closure_bio_submit(s->op.cache_bio, &s->cl, s->d); | 921 | closure_bio_submit(s->op.cache_bio, &s->cl, s->d); |
968 | 922 | ||
969 | return ret; | 923 | return ret; |
970 | out_put: | 924 | out_put: |
971 | bio_put(s->op.cache_bio); | 925 | bio_put(s->op.cache_bio); |
972 | s->op.cache_bio = NULL; | 926 | s->op.cache_bio = NULL; |
973 | out_submit: | 927 | out_submit: |
974 | closure_bio_submit(miss, &s->cl, s->d); | 928 | closure_bio_submit(miss, &s->cl, s->d); |
975 | return ret; | 929 | return ret; |
976 | } | 930 | } |
977 | 931 | ||
978 | static void request_read(struct cached_dev *dc, struct search *s) | 932 | static void request_read(struct cached_dev *dc, struct search *s) |
979 | { | 933 | { |
980 | struct closure *cl = &s->cl; | 934 | struct closure *cl = &s->cl; |
981 | 935 | ||
982 | check_should_skip(dc, s); | 936 | check_should_skip(dc, s); |
983 | closure_call(&s->op.cl, btree_read_async, NULL, cl); | 937 | closure_call(&s->op.cl, btree_read_async, NULL, cl); |
984 | 938 | ||
985 | continue_at(cl, request_read_done_bh, NULL); | 939 | continue_at(cl, request_read_done_bh, NULL); |
986 | } | 940 | } |
987 | 941 | ||
988 | /* Process writes */ | 942 | /* Process writes */ |
989 | 943 | ||
990 | static void cached_dev_write_complete(struct closure *cl) | 944 | static void cached_dev_write_complete(struct closure *cl) |
991 | { | 945 | { |
992 | struct search *s = container_of(cl, struct search, cl); | 946 | struct search *s = container_of(cl, struct search, cl); |
993 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | 947 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); |
994 | 948 | ||
995 | up_read_non_owner(&dc->writeback_lock); | 949 | up_read_non_owner(&dc->writeback_lock); |
996 | cached_dev_bio_complete(cl); | 950 | cached_dev_bio_complete(cl); |
997 | } | 951 | } |
998 | 952 | ||
999 | static void request_write(struct cached_dev *dc, struct search *s) | 953 | static void request_write(struct cached_dev *dc, struct search *s) |
1000 | { | 954 | { |
1001 | struct closure *cl = &s->cl; | 955 | struct closure *cl = &s->cl; |
1002 | struct bio *bio = &s->bio.bio; | 956 | struct bio *bio = &s->bio.bio; |
1003 | struct bkey start, end; | 957 | struct bkey start, end; |
1004 | start = KEY(dc->disk.id, bio->bi_sector, 0); | 958 | start = KEY(dc->disk.id, bio->bi_sector, 0); |
1005 | end = KEY(dc->disk.id, bio_end(bio), 0); | 959 | end = KEY(dc->disk.id, bio_end_sector(bio), 0); |
1006 | 960 | ||
1007 | bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); | 961 | bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); |
1008 | 962 | ||
1009 | check_should_skip(dc, s); | 963 | check_should_skip(dc, s); |
1010 | down_read_non_owner(&dc->writeback_lock); | 964 | down_read_non_owner(&dc->writeback_lock); |
1011 | 965 | ||
1012 | if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { | 966 | if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { |
1013 | s->op.skip = false; | 967 | s->op.skip = false; |
1014 | s->writeback = true; | 968 | s->writeback = true; |
1015 | } | 969 | } |
1016 | 970 | ||
1017 | if (bio->bi_rw & REQ_DISCARD) | 971 | if (bio->bi_rw & REQ_DISCARD) |
1018 | goto skip; | 972 | goto skip; |
1019 | 973 | ||
1020 | if (should_writeback(dc, s->orig_bio, | 974 | if (should_writeback(dc, s->orig_bio, |
1021 | cache_mode(dc, bio), | 975 | cache_mode(dc, bio), |
1022 | s->op.skip)) { | 976 | s->op.skip)) { |
1023 | s->op.skip = false; | 977 | s->op.skip = false; |
1024 | s->writeback = true; | 978 | s->writeback = true; |
1025 | } | 979 | } |
1026 | 980 | ||
1027 | if (s->op.skip) | 981 | if (s->op.skip) |
1028 | goto skip; | 982 | goto skip; |
1029 | 983 | ||
1030 | trace_bcache_write(s->orig_bio, s->writeback, s->op.skip); | 984 | trace_bcache_write(s->orig_bio, s->writeback, s->op.skip); |
1031 | 985 | ||
1032 | if (!s->writeback) { | 986 | if (!s->writeback) { |
1033 | s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, | 987 | s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, |
1034 | dc->disk.bio_split); | 988 | dc->disk.bio_split); |
1035 | 989 | ||
1036 | closure_bio_submit(bio, cl, s->d); | 990 | closure_bio_submit(bio, cl, s->d); |
1037 | } else { | 991 | } else { |
1038 | bch_writeback_add(dc); | 992 | bch_writeback_add(dc); |
1039 | 993 | ||
1040 | if (s->op.flush_journal) { | 994 | if (s->op.flush_journal) { |
1041 | /* Also need to send a flush to the backing device */ | 995 | /* Also need to send a flush to the backing device */ |
1042 | s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, | 996 | s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, |
1043 | dc->disk.bio_split); | 997 | dc->disk.bio_split); |
1044 | 998 | ||
1045 | bio->bi_size = 0; | 999 | bio->bi_size = 0; |
1046 | bio->bi_vcnt = 0; | 1000 | bio->bi_vcnt = 0; |
1047 | closure_bio_submit(bio, cl, s->d); | 1001 | closure_bio_submit(bio, cl, s->d); |
1048 | } else { | 1002 | } else { |
1049 | s->op.cache_bio = bio; | 1003 | s->op.cache_bio = bio; |
1050 | } | 1004 | } |
1051 | } | 1005 | } |
1052 | out: | 1006 | out: |
1053 | closure_call(&s->op.cl, bch_insert_data, NULL, cl); | 1007 | closure_call(&s->op.cl, bch_insert_data, NULL, cl); |
1054 | continue_at(cl, cached_dev_write_complete, NULL); | 1008 | continue_at(cl, cached_dev_write_complete, NULL); |
1055 | skip: | 1009 | skip: |
1056 | s->op.skip = true; | 1010 | s->op.skip = true; |
1057 | s->op.cache_bio = s->orig_bio; | 1011 | s->op.cache_bio = s->orig_bio; |
1058 | bio_get(s->op.cache_bio); | 1012 | bio_get(s->op.cache_bio); |
1059 | 1013 | ||
1060 | if ((bio->bi_rw & REQ_DISCARD) && | 1014 | if ((bio->bi_rw & REQ_DISCARD) && |
1061 | !blk_queue_discard(bdev_get_queue(dc->bdev))) | 1015 | !blk_queue_discard(bdev_get_queue(dc->bdev))) |
1062 | goto out; | 1016 | goto out; |
1063 | 1017 | ||
1064 | closure_bio_submit(bio, cl, s->d); | 1018 | closure_bio_submit(bio, cl, s->d); |
1065 | goto out; | 1019 | goto out; |
1066 | } | 1020 | } |
1067 | 1021 | ||
1068 | static void request_nodata(struct cached_dev *dc, struct search *s) | 1022 | static void request_nodata(struct cached_dev *dc, struct search *s) |
1069 | { | 1023 | { |
1070 | struct closure *cl = &s->cl; | 1024 | struct closure *cl = &s->cl; |
1071 | struct bio *bio = &s->bio.bio; | 1025 | struct bio *bio = &s->bio.bio; |
1072 | 1026 | ||
1073 | if (bio->bi_rw & REQ_DISCARD) { | 1027 | if (bio->bi_rw & REQ_DISCARD) { |
1074 | request_write(dc, s); | 1028 | request_write(dc, s); |
1075 | return; | 1029 | return; |
1076 | } | 1030 | } |
1077 | 1031 | ||
1078 | if (s->op.flush_journal) | 1032 | if (s->op.flush_journal) |
1079 | bch_journal_meta(s->op.c, cl); | 1033 | bch_journal_meta(s->op.c, cl); |
1080 | 1034 | ||
1081 | closure_bio_submit(bio, cl, s->d); | 1035 | closure_bio_submit(bio, cl, s->d); |
1082 | 1036 | ||
1083 | continue_at(cl, cached_dev_bio_complete, NULL); | 1037 | continue_at(cl, cached_dev_bio_complete, NULL); |
1084 | } | 1038 | } |
1085 | 1039 | ||
1086 | /* Cached devices - read & write stuff */ | 1040 | /* Cached devices - read & write stuff */ |
1087 | 1041 | ||
1088 | unsigned bch_get_congested(struct cache_set *c) | 1042 | unsigned bch_get_congested(struct cache_set *c) |
1089 | { | 1043 | { |
1090 | int i; | 1044 | int i; |
1091 | long rand; | 1045 | long rand; |
1092 | 1046 | ||
1093 | if (!c->congested_read_threshold_us && | 1047 | if (!c->congested_read_threshold_us && |
1094 | !c->congested_write_threshold_us) | 1048 | !c->congested_write_threshold_us) |
1095 | return 0; | 1049 | return 0; |
1096 | 1050 | ||
1097 | i = (local_clock_us() - c->congested_last_us) / 1024; | 1051 | i = (local_clock_us() - c->congested_last_us) / 1024; |
1098 | if (i < 0) | 1052 | if (i < 0) |
1099 | return 0; | 1053 | return 0; |
1100 | 1054 | ||
1101 | i += atomic_read(&c->congested); | 1055 | i += atomic_read(&c->congested); |
1102 | if (i >= 0) | 1056 | if (i >= 0) |
1103 | return 0; | 1057 | return 0; |
1104 | 1058 | ||
1105 | i += CONGESTED_MAX; | 1059 | i += CONGESTED_MAX; |
1106 | 1060 | ||
1107 | if (i > 0) | 1061 | if (i > 0) |
1108 | i = fract_exp_two(i, 6); | 1062 | i = fract_exp_two(i, 6); |
1109 | 1063 | ||
1110 | rand = get_random_int(); | 1064 | rand = get_random_int(); |
1111 | i -= bitmap_weight(&rand, BITS_PER_LONG); | 1065 | i -= bitmap_weight(&rand, BITS_PER_LONG); |
1112 | 1066 | ||
1113 | return i > 0 ? i : 1; | 1067 | return i > 0 ? i : 1; |
1114 | } | 1068 | } |
1115 | 1069 | ||
1116 | static void add_sequential(struct task_struct *t) | 1070 | static void add_sequential(struct task_struct *t) |
1117 | { | 1071 | { |
1118 | ewma_add(t->sequential_io_avg, | 1072 | ewma_add(t->sequential_io_avg, |
1119 | t->sequential_io, 8, 0); | 1073 | t->sequential_io, 8, 0); |
1120 | 1074 | ||
1121 | t->sequential_io = 0; | 1075 | t->sequential_io = 0; |
1122 | } | 1076 | } |
1123 | 1077 | ||
1124 | static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) | 1078 | static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) |
1125 | { | 1079 | { |
1126 | return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; | 1080 | return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; |
1127 | } | 1081 | } |
1128 | 1082 | ||
1129 | static void check_should_skip(struct cached_dev *dc, struct search *s) | 1083 | static void check_should_skip(struct cached_dev *dc, struct search *s) |
1130 | { | 1084 | { |
1131 | struct cache_set *c = s->op.c; | 1085 | struct cache_set *c = s->op.c; |
1132 | struct bio *bio = &s->bio.bio; | 1086 | struct bio *bio = &s->bio.bio; |
1133 | unsigned mode = cache_mode(dc, bio); | 1087 | unsigned mode = cache_mode(dc, bio); |
1134 | unsigned sectors, congested = bch_get_congested(c); | 1088 | unsigned sectors, congested = bch_get_congested(c); |
1135 | 1089 | ||
1136 | if (atomic_read(&dc->disk.detaching) || | 1090 | if (atomic_read(&dc->disk.detaching) || |
1137 | c->gc_stats.in_use > CUTOFF_CACHE_ADD || | 1091 | c->gc_stats.in_use > CUTOFF_CACHE_ADD || |
1138 | (bio->bi_rw & REQ_DISCARD)) | 1092 | (bio->bi_rw & REQ_DISCARD)) |
1139 | goto skip; | 1093 | goto skip; |
1140 | 1094 | ||
1141 | if (mode == CACHE_MODE_NONE || | 1095 | if (mode == CACHE_MODE_NONE || |
1142 | (mode == CACHE_MODE_WRITEAROUND && | 1096 | (mode == CACHE_MODE_WRITEAROUND && |
1143 | (bio->bi_rw & REQ_WRITE))) | 1097 | (bio->bi_rw & REQ_WRITE))) |
1144 | goto skip; | 1098 | goto skip; |
1145 | 1099 | ||
1146 | if (bio->bi_sector & (c->sb.block_size - 1) || | 1100 | if (bio->bi_sector & (c->sb.block_size - 1) || |
1147 | bio_sectors(bio) & (c->sb.block_size - 1)) { | 1101 | bio_sectors(bio) & (c->sb.block_size - 1)) { |
1148 | pr_debug("skipping unaligned io"); | 1102 | pr_debug("skipping unaligned io"); |
1149 | goto skip; | 1103 | goto skip; |
1150 | } | 1104 | } |
1151 | 1105 | ||
1152 | if (!congested && !dc->sequential_cutoff) | 1106 | if (!congested && !dc->sequential_cutoff) |
1153 | goto rescale; | 1107 | goto rescale; |
1154 | 1108 | ||
1155 | if (!congested && | 1109 | if (!congested && |
1156 | mode == CACHE_MODE_WRITEBACK && | 1110 | mode == CACHE_MODE_WRITEBACK && |
1157 | (bio->bi_rw & REQ_WRITE) && | 1111 | (bio->bi_rw & REQ_WRITE) && |
1158 | (bio->bi_rw & REQ_SYNC)) | 1112 | (bio->bi_rw & REQ_SYNC)) |
1159 | goto rescale; | 1113 | goto rescale; |
1160 | 1114 | ||
1161 | if (dc->sequential_merge) { | 1115 | if (dc->sequential_merge) { |
1162 | struct io *i; | 1116 | struct io *i; |
1163 | 1117 | ||
1164 | spin_lock(&dc->io_lock); | 1118 | spin_lock(&dc->io_lock); |
1165 | 1119 | ||
1166 | hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) | 1120 | hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) |
1167 | if (i->last == bio->bi_sector && | 1121 | if (i->last == bio->bi_sector && |
1168 | time_before(jiffies, i->jiffies)) | 1122 | time_before(jiffies, i->jiffies)) |
1169 | goto found; | 1123 | goto found; |
1170 | 1124 | ||
1171 | i = list_first_entry(&dc->io_lru, struct io, lru); | 1125 | i = list_first_entry(&dc->io_lru, struct io, lru); |
1172 | 1126 | ||
1173 | add_sequential(s->task); | 1127 | add_sequential(s->task); |
1174 | i->sequential = 0; | 1128 | i->sequential = 0; |
1175 | found: | 1129 | found: |
1176 | if (i->sequential + bio->bi_size > i->sequential) | 1130 | if (i->sequential + bio->bi_size > i->sequential) |
1177 | i->sequential += bio->bi_size; | 1131 | i->sequential += bio->bi_size; |
1178 | 1132 | ||
1179 | i->last = bio_end(bio); | 1133 | i->last = bio_end_sector(bio); |
1180 | i->jiffies = jiffies + msecs_to_jiffies(5000); | 1134 | i->jiffies = jiffies + msecs_to_jiffies(5000); |
1181 | s->task->sequential_io = i->sequential; | 1135 | s->task->sequential_io = i->sequential; |
1182 | 1136 | ||
1183 | hlist_del(&i->hash); | 1137 | hlist_del(&i->hash); |
1184 | hlist_add_head(&i->hash, iohash(dc, i->last)); | 1138 | hlist_add_head(&i->hash, iohash(dc, i->last)); |
1185 | list_move_tail(&i->lru, &dc->io_lru); | 1139 | list_move_tail(&i->lru, &dc->io_lru); |
1186 | 1140 | ||
1187 | spin_unlock(&dc->io_lock); | 1141 | spin_unlock(&dc->io_lock); |
1188 | } else { | 1142 | } else { |
1189 | s->task->sequential_io = bio->bi_size; | 1143 | s->task->sequential_io = bio->bi_size; |
1190 | 1144 | ||
1191 | add_sequential(s->task); | 1145 | add_sequential(s->task); |
1192 | } | 1146 | } |
1193 | 1147 | ||
1194 | sectors = max(s->task->sequential_io, | 1148 | sectors = max(s->task->sequential_io, |
1195 | s->task->sequential_io_avg) >> 9; | 1149 | s->task->sequential_io_avg) >> 9; |
1196 | 1150 | ||
1197 | if (dc->sequential_cutoff && | 1151 | if (dc->sequential_cutoff && |
1198 | sectors >= dc->sequential_cutoff >> 9) { | 1152 | sectors >= dc->sequential_cutoff >> 9) { |
1199 | trace_bcache_bypass_sequential(s->orig_bio); | 1153 | trace_bcache_bypass_sequential(s->orig_bio); |
1200 | goto skip; | 1154 | goto skip; |
1201 | } | 1155 | } |
1202 | 1156 | ||
1203 | if (congested && sectors >= congested) { | 1157 | if (congested && sectors >= congested) { |
1204 | trace_bcache_bypass_congested(s->orig_bio); | 1158 | trace_bcache_bypass_congested(s->orig_bio); |
1205 | goto skip; | 1159 | goto skip; |
1206 | } | 1160 | } |
1207 | 1161 | ||
1208 | rescale: | 1162 | rescale: |
1209 | bch_rescale_priorities(c, bio_sectors(bio)); | 1163 | bch_rescale_priorities(c, bio_sectors(bio)); |
1210 | return; | 1164 | return; |
1211 | skip: | 1165 | skip: |
1212 | bch_mark_sectors_bypassed(s, bio_sectors(bio)); | 1166 | bch_mark_sectors_bypassed(s, bio_sectors(bio)); |
1213 | s->op.skip = true; | 1167 | s->op.skip = true; |
1214 | } | 1168 | } |
1215 | 1169 | ||
1216 | static void cached_dev_make_request(struct request_queue *q, struct bio *bio) | 1170 | static void cached_dev_make_request(struct request_queue *q, struct bio *bio) |
1217 | { | 1171 | { |
1218 | struct search *s; | 1172 | struct search *s; |
1219 | struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; | 1173 | struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; |
1220 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); | 1174 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); |
1221 | int cpu, rw = bio_data_dir(bio); | 1175 | int cpu, rw = bio_data_dir(bio); |
1222 | 1176 | ||
1223 | cpu = part_stat_lock(); | 1177 | cpu = part_stat_lock(); |
1224 | part_stat_inc(cpu, &d->disk->part0, ios[rw]); | 1178 | part_stat_inc(cpu, &d->disk->part0, ios[rw]); |
1225 | part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); | 1179 | part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); |
1226 | part_stat_unlock(); | 1180 | part_stat_unlock(); |
1227 | 1181 | ||
1228 | bio->bi_bdev = dc->bdev; | 1182 | bio->bi_bdev = dc->bdev; |
1229 | bio->bi_sector += dc->sb.data_offset; | 1183 | bio->bi_sector += dc->sb.data_offset; |
1230 | 1184 | ||
1231 | if (cached_dev_get(dc)) { | 1185 | if (cached_dev_get(dc)) { |
1232 | s = search_alloc(bio, d); | 1186 | s = search_alloc(bio, d); |
1233 | trace_bcache_request_start(s, bio); | 1187 | trace_bcache_request_start(s, bio); |
1234 | 1188 | ||
1235 | if (!bio_has_data(bio)) | 1189 | if (!bio_has_data(bio)) |
1236 | request_nodata(dc, s); | 1190 | request_nodata(dc, s); |
1237 | else if (rw) | 1191 | else if (rw) |
1238 | request_write(dc, s); | 1192 | request_write(dc, s); |
1239 | else | 1193 | else |
1240 | request_read(dc, s); | 1194 | request_read(dc, s); |
1241 | } else { | 1195 | } else { |
1242 | if ((bio->bi_rw & REQ_DISCARD) && | 1196 | if ((bio->bi_rw & REQ_DISCARD) && |
1243 | !blk_queue_discard(bdev_get_queue(dc->bdev))) | 1197 | !blk_queue_discard(bdev_get_queue(dc->bdev))) |
1244 | bio_endio(bio, 0); | 1198 | bio_endio(bio, 0); |
1245 | else | 1199 | else |
1246 | bch_generic_make_request(bio, &d->bio_split_hook); | 1200 | bch_generic_make_request(bio, &d->bio_split_hook); |
1247 | } | 1201 | } |
1248 | } | 1202 | } |
1249 | 1203 | ||
1250 | static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, | 1204 | static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, |
1251 | unsigned int cmd, unsigned long arg) | 1205 | unsigned int cmd, unsigned long arg) |
1252 | { | 1206 | { |
1253 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); | 1207 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); |
1254 | return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); | 1208 | return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); |
1255 | } | 1209 | } |
1256 | 1210 | ||
1257 | static int cached_dev_congested(void *data, int bits) | 1211 | static int cached_dev_congested(void *data, int bits) |
1258 | { | 1212 | { |
1259 | struct bcache_device *d = data; | 1213 | struct bcache_device *d = data; |
1260 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); | 1214 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); |
1261 | struct request_queue *q = bdev_get_queue(dc->bdev); | 1215 | struct request_queue *q = bdev_get_queue(dc->bdev); |
1262 | int ret = 0; | 1216 | int ret = 0; |
1263 | 1217 | ||
1264 | if (bdi_congested(&q->backing_dev_info, bits)) | 1218 | if (bdi_congested(&q->backing_dev_info, bits)) |
1265 | return 1; | 1219 | return 1; |
1266 | 1220 | ||
1267 | if (cached_dev_get(dc)) { | 1221 | if (cached_dev_get(dc)) { |
1268 | unsigned i; | 1222 | unsigned i; |
1269 | struct cache *ca; | 1223 | struct cache *ca; |
1270 | 1224 | ||
1271 | for_each_cache(ca, d->c, i) { | 1225 | for_each_cache(ca, d->c, i) { |
1272 | q = bdev_get_queue(ca->bdev); | 1226 | q = bdev_get_queue(ca->bdev); |
1273 | ret |= bdi_congested(&q->backing_dev_info, bits); | 1227 | ret |= bdi_congested(&q->backing_dev_info, bits); |
1274 | } | 1228 | } |
1275 | 1229 | ||
1276 | cached_dev_put(dc); | 1230 | cached_dev_put(dc); |
1277 | } | 1231 | } |
1278 | 1232 | ||
1279 | return ret; | 1233 | return ret; |
1280 | } | 1234 | } |
1281 | 1235 | ||
1282 | void bch_cached_dev_request_init(struct cached_dev *dc) | 1236 | void bch_cached_dev_request_init(struct cached_dev *dc) |
1283 | { | 1237 | { |
1284 | struct gendisk *g = dc->disk.disk; | 1238 | struct gendisk *g = dc->disk.disk; |
1285 | 1239 | ||
1286 | g->queue->make_request_fn = cached_dev_make_request; | 1240 | g->queue->make_request_fn = cached_dev_make_request; |
1287 | g->queue->backing_dev_info.congested_fn = cached_dev_congested; | 1241 | g->queue->backing_dev_info.congested_fn = cached_dev_congested; |
1288 | dc->disk.cache_miss = cached_dev_cache_miss; | 1242 | dc->disk.cache_miss = cached_dev_cache_miss; |
1289 | dc->disk.ioctl = cached_dev_ioctl; | 1243 | dc->disk.ioctl = cached_dev_ioctl; |
1290 | } | 1244 | } |
1291 | 1245 | ||
1292 | /* Flash backed devices */ | 1246 | /* Flash backed devices */ |
1293 | 1247 | ||
1294 | static int flash_dev_cache_miss(struct btree *b, struct search *s, | 1248 | static int flash_dev_cache_miss(struct btree *b, struct search *s, |
1295 | struct bio *bio, unsigned sectors) | 1249 | struct bio *bio, unsigned sectors) |
1296 | { | 1250 | { |
1251 | struct bio_vec *bv; | ||
1252 | int i; | ||
1253 | |||
1297 | /* Zero fill bio */ | 1254 | /* Zero fill bio */ |
1298 | 1255 | ||
1299 | while (bio->bi_idx != bio->bi_vcnt) { | 1256 | bio_for_each_segment(bv, bio, i) { |
1300 | struct bio_vec *bv = bio_iovec(bio); | ||
1301 | unsigned j = min(bv->bv_len >> 9, sectors); | 1257 | unsigned j = min(bv->bv_len >> 9, sectors); |
1302 | 1258 | ||
1303 | void *p = kmap(bv->bv_page); | 1259 | void *p = kmap(bv->bv_page); |
1304 | memset(p + bv->bv_offset, 0, j << 9); | 1260 | memset(p + bv->bv_offset, 0, j << 9); |
1305 | kunmap(bv->bv_page); | 1261 | kunmap(bv->bv_page); |
1306 | 1262 | ||
1307 | bv->bv_len -= j << 9; | 1263 | sectors -= j; |
1308 | bv->bv_offset += j << 9; | ||
1309 | |||
1310 | if (bv->bv_len) | ||
1311 | return 0; | ||
1312 | |||
1313 | bio->bi_sector += j; | ||
1314 | bio->bi_size -= j << 9; | ||
1315 | |||
1316 | bio->bi_idx++; | ||
1317 | sectors -= j; | ||
1318 | } | 1264 | } |
1319 | 1265 | ||
1320 | s->op.lookup_done = true; | 1266 | bio_advance(bio, min(sectors << 9, bio->bi_size)); |
1321 | 1267 | ||
1268 | if (!bio->bi_size) | ||
1269 | s->op.lookup_done = true; | ||
1270 | |||
1322 | return 0; | 1271 | return 0; |
1323 | } | 1272 | } |
1324 | 1273 | ||
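The rewritten flash_dev_cache_miss() above zero-fills each remaining segment with bio_for_each_segment(), then advances the bio once with bio_advance() instead of hand-editing bv_len/bv_offset/bi_idx. A hedged userspace sketch of the per-segment zero-fill step, with a plain struct standing in for bio_vec (none of this is kernel API):

    #include <stdio.h>
    #include <string.h>

    struct seg {            /* stand-in for struct bio_vec */
        unsigned char *buf;
        unsigned len;       /* bytes */
    };

    /* Zero at most 'sectors' 512-byte units across the segments, the same
     * min(bv->bv_len >> 9, sectors) step used in the loop above. */
    static void zero_fill(struct seg *segs, int nseg, unsigned sectors)
    {
        for (int i = 0; i < nseg && sectors; i++) {
            unsigned j = segs[i].len >> 9;

            if (j > sectors)
                j = sectors;
            memset(segs[i].buf, 0, j << 9);
            sectors -= j;
        }
    }

    int main(void)
    {
        unsigned char a[1024], b[1024];
        struct seg segs[] = { { a, sizeof(a) }, { b, sizeof(b) } };

        memset(a, 0xff, sizeof(a));
        memset(b, 0xff, sizeof(b));
        zero_fill(segs, 2, 3);                      /* 3 sectors = 1536 bytes */
        printf("%u %u %u\n", a[0], b[0], b[512]);   /* 0 0 255 */
        return 0;
    }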
1325 | static void flash_dev_make_request(struct request_queue *q, struct bio *bio) | 1274 | static void flash_dev_make_request(struct request_queue *q, struct bio *bio) |
1326 | { | 1275 | { |
1327 | struct search *s; | 1276 | struct search *s; |
1328 | struct closure *cl; | 1277 | struct closure *cl; |
1329 | struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; | 1278 | struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; |
1330 | int cpu, rw = bio_data_dir(bio); | 1279 | int cpu, rw = bio_data_dir(bio); |
1331 | 1280 | ||
1332 | cpu = part_stat_lock(); | 1281 | cpu = part_stat_lock(); |
1333 | part_stat_inc(cpu, &d->disk->part0, ios[rw]); | 1282 | part_stat_inc(cpu, &d->disk->part0, ios[rw]); |
1334 | part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); | 1283 | part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); |
1335 | part_stat_unlock(); | 1284 | part_stat_unlock(); |
1336 | 1285 | ||
1337 | s = search_alloc(bio, d); | 1286 | s = search_alloc(bio, d); |
1338 | cl = &s->cl; | 1287 | cl = &s->cl; |
1339 | bio = &s->bio.bio; | 1288 | bio = &s->bio.bio; |
1340 | 1289 | ||
1341 | trace_bcache_request_start(s, bio); | 1290 | trace_bcache_request_start(s, bio); |
1342 | 1291 | ||
1343 | if (bio_has_data(bio) && !rw) { | 1292 | if (bio_has_data(bio) && !rw) { |
1344 | closure_call(&s->op.cl, btree_read_async, NULL, cl); | 1293 | closure_call(&s->op.cl, btree_read_async, NULL, cl); |
1345 | } else if (bio_has_data(bio) || s->op.skip) { | 1294 | } else if (bio_has_data(bio) || s->op.skip) { |
1346 | bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, | 1295 | bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, |
1347 | &KEY(d->id, bio->bi_sector, 0), | 1296 | &KEY(d->id, bio->bi_sector, 0), |
1348 | &KEY(d->id, bio_end(bio), 0)); | 1297 | &KEY(d->id, bio_end_sector(bio), 0)); |
1349 | 1298 | ||
1350 | s->writeback = true; | 1299 | s->writeback = true; |
1351 | s->op.cache_bio = bio; | 1300 | s->op.cache_bio = bio; |
1352 | 1301 | ||
1353 | closure_call(&s->op.cl, bch_insert_data, NULL, cl); | 1302 | closure_call(&s->op.cl, bch_insert_data, NULL, cl); |
1354 | } else { | 1303 | } else { |
1355 | /* No data - probably a cache flush */ | 1304 | /* No data - probably a cache flush */ |
1356 | if (s->op.flush_journal) | 1305 | if (s->op.flush_journal) |
1357 | bch_journal_meta(s->op.c, cl); | 1306 | bch_journal_meta(s->op.c, cl); |
1358 | } | 1307 | } |
1359 | 1308 | ||
1360 | continue_at(cl, search_free, NULL); | 1309 | continue_at(cl, search_free, NULL); |
1361 | } | 1310 | } |
1362 | 1311 | ||
1363 | static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode, | 1312 | static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode, |
1364 | unsigned int cmd, unsigned long arg) | 1313 | unsigned int cmd, unsigned long arg) |
1365 | { | 1314 | { |
1366 | return -ENOTTY; | 1315 | return -ENOTTY; |
1367 | } | 1316 | } |
1368 | 1317 | ||
1369 | static int flash_dev_congested(void *data, int bits) | 1318 | static int flash_dev_congested(void *data, int bits) |
1370 | { | 1319 | { |
1371 | struct bcache_device *d = data; | 1320 | struct bcache_device *d = data; |
1372 | struct request_queue *q; | 1321 | struct request_queue *q; |
1373 | struct cache *ca; | 1322 | struct cache *ca; |
1374 | unsigned i; | 1323 | unsigned i; |
1375 | int ret = 0; | 1324 | int ret = 0; |
1376 | 1325 | ||
1377 | for_each_cache(ca, d->c, i) { | 1326 | for_each_cache(ca, d->c, i) { |
1378 | q = bdev_get_queue(ca->bdev); | 1327 | q = bdev_get_queue(ca->bdev); |
1379 | ret |= bdi_congested(&q->backing_dev_info, bits); | 1328 | ret |= bdi_congested(&q->backing_dev_info, bits); |
1380 | } | 1329 | } |
1381 | 1330 | ||
1382 | return ret; | 1331 | return ret; |
1383 | } | 1332 | } |
1384 | 1333 | ||
1385 | void bch_flash_dev_request_init(struct bcache_device *d) | 1334 | void bch_flash_dev_request_init(struct bcache_device *d) |
1386 | { | 1335 | { |
1387 | struct gendisk *g = d->disk; | 1336 | struct gendisk *g = d->disk; |
1388 | 1337 | ||
1389 | g->queue->make_request_fn = flash_dev_make_request; | 1338 | g->queue->make_request_fn = flash_dev_make_request; |
1390 | g->queue->backing_dev_info.congested_fn = flash_dev_congested; | 1339 | g->queue->backing_dev_info.congested_fn = flash_dev_congested; |
1391 | d->cache_miss = flash_dev_cache_miss; | 1340 | d->cache_miss = flash_dev_cache_miss; |
1392 | d->ioctl = flash_dev_ioctl; | 1341 | d->ioctl = flash_dev_ioctl; |
1393 | } | 1342 | } |
1394 | 1343 | ||
1395 | void bch_request_exit(void) | 1344 | void bch_request_exit(void) |
1396 | { | 1345 | { |
1397 | #ifdef CONFIG_CGROUP_BCACHE | 1346 | #ifdef CONFIG_CGROUP_BCACHE |
1398 | cgroup_unload_subsys(&bcache_subsys); | 1347 | cgroup_unload_subsys(&bcache_subsys); |
1399 | #endif | 1348 | #endif |
1400 | if (bch_search_cache) | 1349 | if (bch_search_cache) |
1401 | kmem_cache_destroy(bch_search_cache); | 1350 | kmem_cache_destroy(bch_search_cache); |
1402 | } | 1351 | } |
1403 | 1352 | ||
1404 | int __init bch_request_init(void) | 1353 | int __init bch_request_init(void) |
1405 | { | 1354 | { |
1406 | bch_search_cache = KMEM_CACHE(search, 0); | 1355 | bch_search_cache = KMEM_CACHE(search, 0); |
1407 | if (!bch_search_cache) | 1356 | if (!bch_search_cache) |
1408 | return -ENOMEM; | 1357 | return -ENOMEM; |
1409 | 1358 | ||
1410 | #ifdef CONFIG_CGROUP_BCACHE | 1359 | #ifdef CONFIG_CGROUP_BCACHE |
1411 | cgroup_load_subsys(&bcache_subsys); | 1360 | cgroup_load_subsys(&bcache_subsys); |
drivers/md/bcache/util.c
1 | /* | 1 | /* |
2 | * random utility code, for bcache but in theory not specific to bcache | 2 | * random utility code, for bcache but in theory not specific to bcache |
3 | * | 3 | * |
4 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | 4 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> |
5 | * Copyright 2012 Google, Inc. | 5 | * Copyright 2012 Google, Inc. |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include <linux/bio.h> | 8 | #include <linux/bio.h> |
9 | #include <linux/blkdev.h> | 9 | #include <linux/blkdev.h> |
10 | #include <linux/ctype.h> | 10 | #include <linux/ctype.h> |
11 | #include <linux/debugfs.h> | 11 | #include <linux/debugfs.h> |
12 | #include <linux/module.h> | 12 | #include <linux/module.h> |
13 | #include <linux/seq_file.h> | 13 | #include <linux/seq_file.h> |
14 | #include <linux/types.h> | 14 | #include <linux/types.h> |
15 | 15 | ||
16 | #include "util.h" | 16 | #include "util.h" |
17 | 17 | ||
18 | #define simple_strtoint(c, end, base) simple_strtol(c, end, base) | 18 | #define simple_strtoint(c, end, base) simple_strtol(c, end, base) |
19 | #define simple_strtouint(c, end, base) simple_strtoul(c, end, base) | 19 | #define simple_strtouint(c, end, base) simple_strtoul(c, end, base) |
20 | 20 | ||
21 | #define STRTO_H(name, type) \ | 21 | #define STRTO_H(name, type) \ |
22 | int bch_ ## name ## _h(const char *cp, type *res) \ | 22 | int bch_ ## name ## _h(const char *cp, type *res) \ |
23 | { \ | 23 | { \ |
24 | int u = 0; \ | 24 | int u = 0; \ |
25 | char *e; \ | 25 | char *e; \ |
26 | type i = simple_ ## name(cp, &e, 10); \ | 26 | type i = simple_ ## name(cp, &e, 10); \ |
27 | \ | 27 | \ |
28 | switch (tolower(*e)) { \ | 28 | switch (tolower(*e)) { \ |
29 | default: \ | 29 | default: \ |
30 | return -EINVAL; \ | 30 | return -EINVAL; \ |
31 | case 'y': \ | 31 | case 'y': \ |
32 | case 'z': \ | 32 | case 'z': \ |
33 | u++; \ | 33 | u++; \ |
34 | case 'e': \ | 34 | case 'e': \ |
35 | u++; \ | 35 | u++; \ |
36 | case 'p': \ | 36 | case 'p': \ |
37 | u++; \ | 37 | u++; \ |
38 | case 't': \ | 38 | case 't': \ |
39 | u++; \ | 39 | u++; \ |
40 | case 'g': \ | 40 | case 'g': \ |
41 | u++; \ | 41 | u++; \ |
42 | case 'm': \ | 42 | case 'm': \ |
43 | u++; \ | 43 | u++; \ |
44 | case 'k': \ | 44 | case 'k': \ |
45 | u++; \ | 45 | u++; \ |
46 | if (e++ == cp) \ | 46 | if (e++ == cp) \ |
47 | return -EINVAL; \ | 47 | return -EINVAL; \ |
48 | case '\n': \ | 48 | case '\n': \ |
49 | case '\0': \ | 49 | case '\0': \ |
50 | if (*e == '\n') \ | 50 | if (*e == '\n') \ |
51 | e++; \ | 51 | e++; \ |
52 | } \ | 52 | } \ |
53 | \ | 53 | \ |
54 | if (*e) \ | 54 | if (*e) \ |
55 | return -EINVAL; \ | 55 | return -EINVAL; \ |
56 | \ | 56 | \ |
57 | while (u--) { \ | 57 | while (u--) { \ |
58 | if ((type) ~0 > 0 && \ | 58 | if ((type) ~0 > 0 && \ |
59 | (type) ~0 / 1024 <= i) \ | 59 | (type) ~0 / 1024 <= i) \ |
60 | return -EINVAL; \ | 60 | return -EINVAL; \ |
61 | if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) || \ | 61 | if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) || \ |
62 | (i < 0 && -ANYSINT_MAX(type) / 1024 > i)) \ | 62 | (i < 0 && -ANYSINT_MAX(type) / 1024 > i)) \ |
63 | return -EINVAL; \ | 63 | return -EINVAL; \ |
64 | i *= 1024; \ | 64 | i *= 1024; \ |
65 | } \ | 65 | } \ |
66 | \ | 66 | \ |
67 | *res = i; \ | 67 | *res = i; \ |
68 | return 0; \ | 68 | return 0; \ |
69 | } \ | 69 | } \ |
70 | 70 | ||
71 | STRTO_H(strtoint, int) | 71 | STRTO_H(strtoint, int) |
72 | STRTO_H(strtouint, unsigned int) | 72 | STRTO_H(strtouint, unsigned int) |
73 | STRTO_H(strtoll, long long) | 73 | STRTO_H(strtoll, long long) |
74 | STRTO_H(strtoull, unsigned long long) | 74 | STRTO_H(strtoull, unsigned long long) |
75 | 75 | ||
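STRTO_H generates bch_strtoint_h() and friends: simple_strtol() followed by a fall-through switch that multiplies by 1024 once per unit suffix (k, m, g, ...), with overflow checks at each step. A userspace sketch of the same suffix handling, restricted to unsigned 64-bit values and with the overflow checks omitted for brevity (strtoull_h below is a stand-in, not the kernel helper):

    #include <ctype.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Parse "128k", "4m", "1g", ...; each suffix step multiplies by 1024,
     * mirroring the fall-through switch in STRTO_H. Returns -1 on error. */
    static int strtoull_h(const char *cp, uint64_t *res)
    {
        const char *units = "kmgtpezy";
        const char *u;
        char *e;
        uint64_t i = strtoull(cp, &e, 10);

        if (e == cp)
            return -1;
        if (*e && (u = strchr(units, tolower((unsigned char)*e)))) {
            for (int n = (int)(u - units) + 1; n--; )
                i *= 1024;
            e++;
        }
        if (*e == '\n')
            e++;
        if (*e)
            return -1;
        *res = i;
        return 0;
    }

    int main(void)
    {
        uint64_t v;

        if (!strtoull_h("128k", &v))
            printf("%llu\n", (unsigned long long)v);    /* 131072 */
        if (!strtoull_h("4m", &v))
            printf("%llu\n", (unsigned long long)v);    /* 4194304 */
        return 0;
    }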
76 | ssize_t bch_hprint(char *buf, int64_t v) | 76 | ssize_t bch_hprint(char *buf, int64_t v) |
77 | { | 77 | { |
78 | static const char units[] = "?kMGTPEZY"; | 78 | static const char units[] = "?kMGTPEZY"; |
79 | char dec[4] = ""; | 79 | char dec[4] = ""; |
80 | int u, t = 0; | 80 | int u, t = 0; |
81 | 81 | ||
82 | for (u = 0; v >= 1024 || v <= -1024; u++) { | 82 | for (u = 0; v >= 1024 || v <= -1024; u++) { |
83 | t = v & ~(~0 << 10); | 83 | t = v & ~(~0 << 10); |
84 | v >>= 10; | 84 | v >>= 10; |
85 | } | 85 | } |
86 | 86 | ||
87 | if (!u) | 87 | if (!u) |
88 | return sprintf(buf, "%llu", v); | 88 | return sprintf(buf, "%llu", v); |
89 | 89 | ||
90 | if (v < 100 && v > -100) | 90 | if (v < 100 && v > -100) |
91 | snprintf(dec, sizeof(dec), ".%i", t / 100); | 91 | snprintf(dec, sizeof(dec), ".%i", t / 100); |
92 | 92 | ||
93 | return sprintf(buf, "%lli%s%c", v, dec, units[u]); | 93 | return sprintf(buf, "%lli%s%c", v, dec, units[u]); |
94 | } | 94 | } |
95 | 95 | ||
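bch_hprint() does the reverse: shift the value down by 10 bits at a time, keep the last remainder for a single decimal digit, and append a unit character from "?kMGTPEZY". A rough userspace equivalent, limited to non-negative values and always printing the decimal digit (the original only does so below 100):

    #include <stdint.h>
    #include <stdio.h>

    /* Human-readable size formatting in the style of bch_hprint(),
     * e.g. 1536 -> "1.5k" (non-negative values only in this sketch). */
    static int hprint(char *buf, uint64_t v)
    {
        static const char units[] = "?kMGTPEZY";
        uint64_t t = 0;
        int u;

        for (u = 0; v >= 1024; u++) {
            t = v & 1023;               /* remainder: the low 10 bits */
            v >>= 10;
        }

        if (!u)
            return sprintf(buf, "%llu", (unsigned long long)v);

        /* one decimal digit from the remainder (the original uses t / 100) */
        return sprintf(buf, "%llu.%llu%c", (unsigned long long)v,
                       (unsigned long long)(t * 10 / 1024), units[u]);
    }

    int main(void)
    {
        char buf[32];

        hprint(buf, 1536);
        puts(buf);                      /* 1.5k */
        hprint(buf, 3 << 20);
        puts(buf);                      /* 3.0M */
        return 0;
    }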
96 | ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], | 96 | ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], |
97 | size_t selected) | 97 | size_t selected) |
98 | { | 98 | { |
99 | char *out = buf; | 99 | char *out = buf; |
100 | size_t i; | 100 | size_t i; |
101 | 101 | ||
102 | for (i = 0; list[i]; i++) | 102 | for (i = 0; list[i]; i++) |
103 | out += snprintf(out, buf + size - out, | 103 | out += snprintf(out, buf + size - out, |
104 | i == selected ? "[%s] " : "%s ", list[i]); | 104 | i == selected ? "[%s] " : "%s ", list[i]); |
105 | 105 | ||
106 | out[-1] = '\n'; | 106 | out[-1] = '\n'; |
107 | return out - buf; | 107 | return out - buf; |
108 | } | 108 | } |
109 | 109 | ||
110 | ssize_t bch_read_string_list(const char *buf, const char * const list[]) | 110 | ssize_t bch_read_string_list(const char *buf, const char * const list[]) |
111 | { | 111 | { |
112 | size_t i; | 112 | size_t i; |
113 | char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL); | 113 | char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL); |
114 | if (!d) | 114 | if (!d) |
115 | return -ENOMEM; | 115 | return -ENOMEM; |
116 | 116 | ||
117 | s = strim(d); | 117 | s = strim(d); |
118 | 118 | ||
119 | for (i = 0; list[i]; i++) | 119 | for (i = 0; list[i]; i++) |
120 | if (!strcmp(list[i], s)) | 120 | if (!strcmp(list[i], s)) |
121 | break; | 121 | break; |
122 | 122 | ||
123 | kfree(d); | 123 | kfree(d); |
124 | 124 | ||
125 | if (!list[i]) | 125 | if (!list[i]) |
126 | return -EINVAL; | 126 | return -EINVAL; |
127 | 127 | ||
128 | return i; | 128 | return i; |
129 | } | 129 | } |
130 | 130 | ||
131 | bool bch_is_zero(const char *p, size_t n) | 131 | bool bch_is_zero(const char *p, size_t n) |
132 | { | 132 | { |
133 | size_t i; | 133 | size_t i; |
134 | 134 | ||
135 | for (i = 0; i < n; i++) | 135 | for (i = 0; i < n; i++) |
136 | if (p[i]) | 136 | if (p[i]) |
137 | return false; | 137 | return false; |
138 | return true; | 138 | return true; |
139 | } | 139 | } |
140 | 140 | ||
141 | int bch_parse_uuid(const char *s, char *uuid) | 141 | int bch_parse_uuid(const char *s, char *uuid) |
142 | { | 142 | { |
143 | size_t i, j, x; | 143 | size_t i, j, x; |
144 | memset(uuid, 0, 16); | 144 | memset(uuid, 0, 16); |
145 | 145 | ||
146 | for (i = 0, j = 0; | 146 | for (i = 0, j = 0; |
147 | i < strspn(s, "-0123456789:ABCDEFabcdef") && j < 32; | 147 | i < strspn(s, "-0123456789:ABCDEFabcdef") && j < 32; |
148 | i++) { | 148 | i++) { |
149 | x = s[i] | 32; | 149 | x = s[i] | 32; |
150 | 150 | ||
151 | switch (x) { | 151 | switch (x) { |
152 | case '0'...'9': | 152 | case '0'...'9': |
153 | x -= '0'; | 153 | x -= '0'; |
154 | break; | 154 | break; |
155 | case 'a'...'f': | 155 | case 'a'...'f': |
156 | x -= 'a' - 10; | 156 | x -= 'a' - 10; |
157 | break; | 157 | break; |
158 | default: | 158 | default: |
159 | continue; | 159 | continue; |
160 | } | 160 | } |
161 | 161 | ||
162 | if (!(j & 1)) | 162 | if (!(j & 1)) |
163 | x <<= 4; | 163 | x <<= 4; |
164 | uuid[j++ >> 1] |= x; | 164 | uuid[j++ >> 1] |= x; |
165 | } | 165 | } |
166 | return i; | 166 | return i; |
167 | } | 167 | } |
168 | 168 | ||
169 | void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) | 169 | void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) |
170 | { | 170 | { |
171 | uint64_t now = local_clock(); | 171 | uint64_t now = local_clock(); |
172 | uint64_t duration = time_after64(now, start_time) | 172 | uint64_t duration = time_after64(now, start_time) |
173 | ? now - start_time : 0; | 173 | ? now - start_time : 0; |
174 | uint64_t last = time_after64(now, stats->last) | 174 | uint64_t last = time_after64(now, stats->last) |
175 | ? now - stats->last : 0; | 175 | ? now - stats->last : 0; |
176 | 176 | ||
177 | stats->max_duration = max(stats->max_duration, duration); | 177 | stats->max_duration = max(stats->max_duration, duration); |
178 | 178 | ||
179 | if (stats->last) { | 179 | if (stats->last) { |
180 | ewma_add(stats->average_duration, duration, 8, 8); | 180 | ewma_add(stats->average_duration, duration, 8, 8); |
181 | 181 | ||
182 | if (stats->average_frequency) | 182 | if (stats->average_frequency) |
183 | ewma_add(stats->average_frequency, last, 8, 8); | 183 | ewma_add(stats->average_frequency, last, 8, 8); |
184 | else | 184 | else |
185 | stats->average_frequency = last << 8; | 185 | stats->average_frequency = last << 8; |
186 | } else { | 186 | } else { |
187 | stats->average_duration = duration << 8; | 187 | stats->average_duration = duration << 8; |
188 | } | 188 | } |
189 | 189 | ||
190 | stats->last = now ?: 1; | 190 | stats->last = now ?: 1; |
191 | } | 191 | } |
192 | 192 | ||
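bch_time_stats_update() keeps its running averages as 8-bit fixed point and feeds them through ewma_add(..., 8, 8). As an illustration of the general idea only (not of ewma_add()'s exact definition, which lives elsewhere in this header), a weight-8 exponentially weighted moving average with 8 fractional bits looks like:

    #include <stdint.h>
    #include <stdio.h>

    /* Fixed-point EWMA: keep 7/8 of the old average, add 1/8 of the new
     * sample, with 8 fractional bits of precision. */
    static uint64_t ewma_update(uint64_t avg_fp8, uint64_t sample)
    {
        avg_fp8 -= avg_fp8 / 8;
        avg_fp8 += (sample << 8) / 8;
        return avg_fp8;
    }

    int main(void)
    {
        uint64_t avg = 100 << 8;        /* start at 100, in 8.8 fixed point */

        for (int i = 0; i < 16; i++)
            avg = ewma_update(avg, 200);
        printf("%llu\n", (unsigned long long)(avg >> 8));   /* 188: converging toward 200 */
        return 0;
    }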
193 | unsigned bch_next_delay(struct ratelimit *d, uint64_t done) | 193 | unsigned bch_next_delay(struct ratelimit *d, uint64_t done) |
194 | { | 194 | { |
195 | uint64_t now = local_clock(); | 195 | uint64_t now = local_clock(); |
196 | 196 | ||
197 | d->next += div_u64(done, d->rate); | 197 | d->next += div_u64(done, d->rate); |
198 | 198 | ||
199 | return time_after64(d->next, now) | 199 | return time_after64(d->next, now) |
200 | ? div_u64(d->next - now, NSEC_PER_SEC / HZ) | 200 | ? div_u64(d->next - now, NSEC_PER_SEC / HZ) |
201 | : 0; | 201 | : 0; |
202 | } | 202 | } |
203 | 203 | ||
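bch_next_delay() implements a simple rate limiter: each call credits done/rate nanoseconds of work to a running deadline d->next, and the return value is how far that deadline lies in the future, converted to scheduler ticks. A hedged userspace sketch of the same arithmetic, with HZ and local_clock() replaced by plain stand-ins:

    #include <stdint.h>
    #include <stdio.h>

    #define NSEC_PER_SEC    1000000000ULL
    #define TICK_HZ         100             /* stand-in for the kernel's HZ */

    struct rl {
        uint64_t next;  /* deadline, in ns */
        uint64_t rate;  /* units of work per ns */
    };

    /* Credit 'done' units of work, return the delay in ticks until the
     * caller may continue -- the same shape as bch_next_delay() above. */
    static unsigned next_delay(struct rl *d, uint64_t done, uint64_t now_ns)
    {
        d->next += done / d->rate;

        return d->next > now_ns
            ? (unsigned)((d->next - now_ns) / (NSEC_PER_SEC / TICK_HZ))
            : 0;
    }

    int main(void)
    {
        struct rl d = { .next = 0, .rate = 1 };     /* 1 unit per ns */

        /* 50 ms worth of work submitted at t=0: wait 5 ticks at HZ=100. */
        printf("%u\n", next_delay(&d, 50 * 1000 * 1000, 0));
        return 0;
    }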
204 | void bch_bio_map(struct bio *bio, void *base) | 204 | void bch_bio_map(struct bio *bio, void *base) |
205 | { | 205 | { |
206 | size_t size = bio->bi_size; | 206 | size_t size = bio->bi_size; |
207 | struct bio_vec *bv = bio->bi_io_vec; | 207 | struct bio_vec *bv = bio->bi_io_vec; |
208 | 208 | ||
209 | BUG_ON(!bio->bi_size); | 209 | BUG_ON(!bio->bi_size); |
210 | BUG_ON(bio->bi_vcnt); | 210 | BUG_ON(bio->bi_vcnt); |
211 | 211 | ||
212 | bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0; | 212 | bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0; |
213 | goto start; | 213 | goto start; |
214 | 214 | ||
215 | for (; size; bio->bi_vcnt++, bv++) { | 215 | for (; size; bio->bi_vcnt++, bv++) { |
216 | bv->bv_offset = 0; | 216 | bv->bv_offset = 0; |
217 | start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, | 217 | start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, |
218 | size); | 218 | size); |
219 | if (base) { | 219 | if (base) { |
220 | bv->bv_page = is_vmalloc_addr(base) | 220 | bv->bv_page = is_vmalloc_addr(base) |
221 | ? vmalloc_to_page(base) | 221 | ? vmalloc_to_page(base) |
222 | : virt_to_page(base); | 222 | : virt_to_page(base); |
223 | 223 | ||
224 | base += bv->bv_len; | 224 | base += bv->bv_len; |
225 | } | 225 | } |
226 | 226 | ||
227 | size -= bv->bv_len; | 227 | size -= bv->bv_len; |
228 | } | 228 | } |
229 | } | 229 | } |
230 | 230 | ||
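bch_bio_map() carves one contiguous kernel buffer into page-sized bio_vec segments, trimming the first segment to the page boundary because base need not be page aligned. The same splitting logic in a standalone userspace sketch (struct seg and PG_SIZE are illustrative stand-ins):

    #include <stdint.h>
    #include <stdio.h>

    #define PG_SIZE 4096u

    struct seg {
        uintptr_t page;     /* page-aligned base of the segment's page */
        unsigned offset;    /* offset within that page */
        unsigned len;
    };

    /* Split [base, base + size) into page-sized segments, the first one
     * shortened to the next page boundary, as bch_bio_map() does. */
    static int buf_to_segs(uintptr_t base, size_t size, struct seg *segs, int max)
    {
        int n = 0;

        while (size && n < max) {
            unsigned off = base % PG_SIZE;
            unsigned len = PG_SIZE - off;

            if (len > size)
                len = (unsigned)size;

            segs[n].page = base - off;
            segs[n].offset = off;
            segs[n].len = len;
            n++;

            base += len;
            size -= len;
        }
        return n;
    }

    int main(void)
    {
        struct seg segs[4];
        /* a buffer starting 100 bytes into a page, 8000 bytes long */
        int n = buf_to_segs(100, 8000, segs, 4);

        for (int i = 0; i < n; i++)     /* offsets 100, 0; lengths 3996, 4004 */
            printf("seg %d: offset %u len %u\n", i, segs[i].offset, segs[i].len);
        return 0;
    }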
231 | int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp) | ||
232 | { | ||
233 | int i; | ||
234 | struct bio_vec *bv; | ||
235 | |||
236 | bio_for_each_segment(bv, bio, i) { | ||
237 | bv->bv_page = alloc_page(gfp); | ||
238 | if (!bv->bv_page) { | ||
239 | while (bv-- != bio->bi_io_vec + bio->bi_idx) | ||
240 | __free_page(bv->bv_page); | ||
241 | return -ENOMEM; | ||
242 | } | ||
243 | } | ||
244 | |||
245 | return 0; | ||
246 | } | ||
247 | |||
248 | /* | 231 | /* |
249 | * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any | 232 | * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any |
250 | * use permitted, subject to terms of PostgreSQL license; see.) | 233 | * use permitted, subject to terms of PostgreSQL license; see.) |
251 | 234 | ||
252 | * If we have a 64-bit integer type, then a 64-bit CRC looks just like the | 235 | * If we have a 64-bit integer type, then a 64-bit CRC looks just like the |
253 | * usual sort of implementation. (See Ross Williams' excellent introduction | 236 | * usual sort of implementation. (See Ross Williams' excellent introduction |
254 | * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from | 237 | * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from |
255 | * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.) | 238 | * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.) |
256 | * If we have no working 64-bit type, then fake it with two 32-bit registers. | 239 | * If we have no working 64-bit type, then fake it with two 32-bit registers. |
257 | * | 240 | * |
258 | * The present implementation is a normal (not "reflected", in Williams' | 241 | * The present implementation is a normal (not "reflected", in Williams' |
259 | * terms) 64-bit CRC, using initial all-ones register contents and a final | 242 | * terms) 64-bit CRC, using initial all-ones register contents and a final |
260 | * bit inversion. The chosen polynomial is borrowed from the DLT1 spec | 243 | * bit inversion. The chosen polynomial is borrowed from the DLT1 spec |
261 | * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM): | 244 | * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM): |
262 | * | 245 | * |
263 | * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + | 246 | * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + |
264 | * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + | 247 | * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + |
265 | * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + | 248 | * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + |
266 | * x^7 + x^4 + x + 1 | 249 | * x^7 + x^4 + x + 1 |
267 | */ | 250 | */ |
268 | 251 | ||
269 | static const uint64_t crc_table[256] = { | 252 | static const uint64_t crc_table[256] = { |
270 | 0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL, | 253 | 0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL, |
271 | 0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL, | 254 | 0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL, |
272 | 0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL, | 255 | 0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL, |
273 | 0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL, | 256 | 0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL, |
274 | 0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL, | 257 | 0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL, |
275 | 0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL, | 258 | 0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL, |
276 | 0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL, | 259 | 0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL, |
277 | 0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL, | 260 | 0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL, |
278 | 0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL, | 261 | 0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL, |
279 | 0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL, | 262 | 0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL, |
280 | 0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL, | 263 | 0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL, |
281 | 0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL, | 264 | 0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL, |
282 | 0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL, | 265 | 0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL, |
283 | 0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL, | 266 | 0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL, |
284 | 0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL, | 267 | 0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL, |
285 | 0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL, | 268 | 0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL, |
286 | 0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL, | 269 | 0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL, |
287 | 0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL, | 270 | 0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL, |
288 | 0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL, | 271 | 0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL, |
289 | 0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL, | 272 | 0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL, |
290 | 0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL, | 273 | 0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL, |
291 | 0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL, | 274 | 0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL, |
292 | 0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL, | 275 | 0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL, |
293 | 0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL, | 276 | 0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL, |
294 | 0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL, | 277 | 0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL, |
295 | 0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL, | 278 | 0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL, |
296 | 0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL, | 279 | 0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL, |
297 | 0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL, | 280 | 0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL, |
298 | 0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL, | 281 | 0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL, |
299 | 0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL, | 282 | 0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL, |
300 | 0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL, | 283 | 0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL, |
301 | 0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL, | 284 | 0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL, |
302 | 0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL, | 285 | 0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL, |
303 | 0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL, | 286 | 0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL, |
304 | 0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL, | 287 | 0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL, |
305 | 0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL, | 288 | 0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL, |
306 | 0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL, | 289 | 0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL, |
307 | 0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL, | 290 | 0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL, |
308 | 0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL, | 291 | 0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL, |
309 | 0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL, | 292 | 0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL, |
310 | 0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL, | 293 | 0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL, |
311 | 0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL, | 294 | 0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL, |
312 | 0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL, | 295 | 0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL, |
313 | 0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL, | 296 | 0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL, |
314 | 0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL, | 297 | 0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL, |
315 | 0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL, | 298 | 0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL, |
316 | 0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL, | 299 | 0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL, |
317 | 0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL, | 300 | 0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL, |
318 | 0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL, | 301 | 0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL, |
319 | 0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL, | 302 | 0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL, |
320 | 0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL, | 303 | 0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL, |
321 | 0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL, | 304 | 0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL, |
322 | 0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL, | 305 | 0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL, |
323 | 0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL, | 306 | 0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL, |
324 | 0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL, | 307 | 0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL, |
325 | 0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL, | 308 | 0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL, |
326 | 0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL, | 309 | 0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL, |
327 | 0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL, | 310 | 0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL, |
328 | 0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL, | 311 | 0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL, |
329 | 0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL, | 312 | 0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL, |
330 | 0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL, | 313 | 0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL, |
331 | 0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL, | 314 | 0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL, |
332 | 0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL, | 315 | 0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL, |
333 | 0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL, | 316 | 0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL, |
334 | 0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL, | 317 | 0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL, |
335 | 0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL, | 318 | 0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL, |
336 | 0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL, | 319 | 0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL, |
337 | 0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL, | 320 | 0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL, |
338 | 0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL, | 321 | 0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL, |
339 | 0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL, | 322 | 0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL, |
340 | 0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL, | 323 | 0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL, |
341 | 0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL, | 324 | 0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL, |
342 | 0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL, | 325 | 0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL, |
343 | 0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL, | 326 | 0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL, |
344 | 0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL, | 327 | 0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL, |
345 | 0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL, | 328 | 0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL, |
346 | 0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL, | 329 | 0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL, |
347 | 0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL, | 330 | 0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL, |
348 | 0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL, | 331 | 0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL, |
349 | 0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL, | 332 | 0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL, |
350 | 0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL, | 333 | 0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL, |
351 | 0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL, | 334 | 0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL, |
352 | 0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL, | 335 | 0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL, |
353 | 0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL, | 336 | 0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL, |
354 | 0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL, | 337 | 0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL, |
355 | 0x9AFCE626CE85B507ULL, | 338 | 0x9AFCE626CE85B507ULL, |
356 | }; | 339 | }; |
357 | 340 | ||
358 | uint64_t bch_crc64_update(uint64_t crc, const void *_data, size_t len) | 341 | uint64_t bch_crc64_update(uint64_t crc, const void *_data, size_t len) |
359 | { | 342 | { |
360 | const unsigned char *data = _data; | 343 | const unsigned char *data = _data; |
361 | 344 | ||
362 | while (len--) { | 345 | while (len--) { |
363 | int i = ((int) (crc >> 56) ^ *data++) & 0xFF; | 346 | int i = ((int) (crc >> 56) ^ *data++) & 0xFF; |
364 | crc = crc_table[i] ^ (crc << 8); | 347 | crc = crc_table[i] ^ (crc << 8); |
365 | } | 348 | } |
366 | 349 | ||
367 | return crc; | 350 | return crc; |
368 | } | 351 | } |
369 | 352 | ||
370 | uint64_t bch_crc64(const void *data, size_t len) | 353 | uint64_t bch_crc64(const void *data, size_t len) |
371 | { | 354 | { |
372 | uint64_t crc = 0xffffffffffffffffULL; | 355 | uint64_t crc = 0xffffffffffffffffULL; |
373 | 356 | ||
374 | crc = bch_crc64_update(crc, data, len); | 357 | crc = bch_crc64_update(crc, data, len); |
375 | 358 | ||
376 | return crc ^ 0xffffffffffffffffULL; | 359 | return crc ^ 0xffffffffffffffffULL; |
377 | } | 360 | } |
378 | 361 |
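The table above drives a byte-at-a-time CRC-64/ECMA-182; bch_crc64() seeds the register with all ones and inverts the result. For reference, a bit-at-a-time userspace version with the same parameters (generator polynomial 0x42F0E1EBA9EA3693, non-reflected, initial value and final XOR of ~0) -- slower but table-free, offered as an illustrative sketch rather than a drop-in replacement:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define CRC64_POLY 0x42F0E1EBA9EA3693ULL    /* ECMA-182 */

    /* MSB-first bitwise CRC-64 with all-ones init and final inversion,
     * matching the parameters used by bch_crc64() above. */
    static uint64_t crc64(const void *data, size_t len)
    {
        const unsigned char *p = data;
        uint64_t crc = ~0ULL;

        while (len--) {
            crc ^= (uint64_t)*p++ << 56;
            for (int i = 0; i < 8; i++)
                crc = (crc & (1ULL << 63))
                    ? (crc << 1) ^ CRC64_POLY
                    : crc << 1;
        }

        return crc ^ ~0ULL;
    }

    int main(void)
    {
        const char *s = "123456789";

        printf("%016llx\n", (unsigned long long)crc64(s, strlen(s)));
        return 0;
    }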
drivers/md/bcache/util.h
1 | 1 | ||
2 | #ifndef _BCACHE_UTIL_H | 2 | #ifndef _BCACHE_UTIL_H |
3 | #define _BCACHE_UTIL_H | 3 | #define _BCACHE_UTIL_H |
4 | 4 | ||
5 | #include <linux/errno.h> | 5 | #include <linux/errno.h> |
6 | #include <linux/kernel.h> | 6 | #include <linux/kernel.h> |
7 | #include <linux/llist.h> | 7 | #include <linux/llist.h> |
8 | #include <linux/ratelimit.h> | 8 | #include <linux/ratelimit.h> |
9 | #include <linux/vmalloc.h> | 9 | #include <linux/vmalloc.h> |
10 | #include <linux/workqueue.h> | 10 | #include <linux/workqueue.h> |
11 | 11 | ||
12 | #include "closure.h" | 12 | #include "closure.h" |
13 | 13 | ||
14 | #define PAGE_SECTORS (PAGE_SIZE / 512) | 14 | #define PAGE_SECTORS (PAGE_SIZE / 512) |
15 | 15 | ||
16 | struct closure; | 16 | struct closure; |
17 | 17 | ||
18 | #ifdef CONFIG_BCACHE_EDEBUG | 18 | #ifdef CONFIG_BCACHE_EDEBUG |
19 | 19 | ||
20 | #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) | 20 | #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) |
21 | #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) | 21 | #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) |
22 | 22 | ||
23 | #else /* EDEBUG */ | 23 | #else /* EDEBUG */ |
24 | 24 | ||
25 | #define atomic_dec_bug(v) atomic_dec(v) | 25 | #define atomic_dec_bug(v) atomic_dec(v) |
26 | #define atomic_inc_bug(v, i) atomic_inc(v) | 26 | #define atomic_inc_bug(v, i) atomic_inc(v) |
27 | 27 | ||
28 | #endif | 28 | #endif |
29 | 29 | ||
30 | #define BITMASK(name, type, field, offset, size) \ | 30 | #define BITMASK(name, type, field, offset, size) \ |
31 | static inline uint64_t name(const type *k) \ | 31 | static inline uint64_t name(const type *k) \ |
32 | { return (k->field >> offset) & ~(((uint64_t) ~0) << size); } \ | 32 | { return (k->field >> offset) & ~(((uint64_t) ~0) << size); } \ |
33 | \ | 33 | \ |
34 | static inline void SET_##name(type *k, uint64_t v) \ | 34 | static inline void SET_##name(type *k, uint64_t v) \ |
35 | { \ | 35 | { \ |
36 | k->field &= ~(~((uint64_t) ~0 << size) << offset); \ | 36 | k->field &= ~(~((uint64_t) ~0 << size) << offset); \ |
37 | k->field |= v << offset; \ | 37 | k->field |= v << offset; \ |
38 | } | 38 | } |
39 | 39 | ||
40 | #define DECLARE_HEAP(type, name) \ | 40 | #define DECLARE_HEAP(type, name) \ |
41 | struct { \ | 41 | struct { \ |
42 | size_t size, used; \ | 42 | size_t size, used; \ |
43 | type *data; \ | 43 | type *data; \ |
44 | } name | 44 | } name |
45 | 45 | ||
46 | #define init_heap(heap, _size, gfp) \ | 46 | #define init_heap(heap, _size, gfp) \ |
47 | ({ \ | 47 | ({ \ |
48 | size_t _bytes; \ | 48 | size_t _bytes; \ |
49 | (heap)->used = 0; \ | 49 | (heap)->used = 0; \ |
50 | (heap)->size = (_size); \ | 50 | (heap)->size = (_size); \ |
51 | _bytes = (heap)->size * sizeof(*(heap)->data); \ | 51 | _bytes = (heap)->size * sizeof(*(heap)->data); \ |
52 | (heap)->data = NULL; \ | 52 | (heap)->data = NULL; \ |
53 | if (_bytes < KMALLOC_MAX_SIZE) \ | 53 | if (_bytes < KMALLOC_MAX_SIZE) \ |
54 | (heap)->data = kmalloc(_bytes, (gfp)); \ | 54 | (heap)->data = kmalloc(_bytes, (gfp)); \ |
55 | if ((!(heap)->data) && ((gfp) & GFP_KERNEL)) \ | 55 | if ((!(heap)->data) && ((gfp) & GFP_KERNEL)) \ |
56 | (heap)->data = vmalloc(_bytes); \ | 56 | (heap)->data = vmalloc(_bytes); \ |
57 | (heap)->data; \ | 57 | (heap)->data; \ |
58 | }) | 58 | }) |
59 | 59 | ||
60 | #define free_heap(heap) \ | 60 | #define free_heap(heap) \ |
61 | do { \ | 61 | do { \ |
62 | if (is_vmalloc_addr((heap)->data)) \ | 62 | if (is_vmalloc_addr((heap)->data)) \ |
63 | vfree((heap)->data); \ | 63 | vfree((heap)->data); \ |
64 | else \ | 64 | else \ |
65 | kfree((heap)->data); \ | 65 | kfree((heap)->data); \ |
66 | (heap)->data = NULL; \ | 66 | (heap)->data = NULL; \ |
67 | } while (0) | 67 | } while (0) |
68 | 68 | ||
69 | #define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j]) | 69 | #define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j]) |
70 | 70 | ||
71 | #define heap_sift(h, i, cmp) \ | 71 | #define heap_sift(h, i, cmp) \ |
72 | do { \ | 72 | do { \ |
73 | size_t _r, _j = i; \ | 73 | size_t _r, _j = i; \ |
74 | \ | 74 | \ |
75 | for (; _j * 2 + 1 < (h)->used; _j = _r) { \ | 75 | for (; _j * 2 + 1 < (h)->used; _j = _r) { \ |
76 | _r = _j * 2 + 1; \ | 76 | _r = _j * 2 + 1; \ |
77 | if (_r + 1 < (h)->used && \ | 77 | if (_r + 1 < (h)->used && \ |
78 | cmp((h)->data[_r], (h)->data[_r + 1])) \ | 78 | cmp((h)->data[_r], (h)->data[_r + 1])) \ |
79 | _r++; \ | 79 | _r++; \ |
80 | \ | 80 | \ |
81 | if (cmp((h)->data[_r], (h)->data[_j])) \ | 81 | if (cmp((h)->data[_r], (h)->data[_j])) \ |
82 | break; \ | 82 | break; \ |
83 | heap_swap(h, _r, _j); \ | 83 | heap_swap(h, _r, _j); \ |
84 | } \ | 84 | } \ |
85 | } while (0) | 85 | } while (0) |
86 | 86 | ||
87 | #define heap_sift_down(h, i, cmp) \ | 87 | #define heap_sift_down(h, i, cmp) \ |
88 | do { \ | 88 | do { \ |
89 | while (i) { \ | 89 | while (i) { \ |
90 | size_t p = (i - 1) / 2; \ | 90 | size_t p = (i - 1) / 2; \ |
91 | if (cmp((h)->data[i], (h)->data[p])) \ | 91 | if (cmp((h)->data[i], (h)->data[p])) \ |
92 | break; \ | 92 | break; \ |
93 | heap_swap(h, i, p); \ | 93 | heap_swap(h, i, p); \ |
94 | i = p; \ | 94 | i = p; \ |
95 | } \ | 95 | } \ |
96 | } while (0) | 96 | } while (0) |
97 | 97 | ||
98 | #define heap_add(h, d, cmp) \ | 98 | #define heap_add(h, d, cmp) \ |
99 | ({ \ | 99 | ({ \ |
100 | bool _r = !heap_full(h); \ | 100 | bool _r = !heap_full(h); \ |
101 | if (_r) { \ | 101 | if (_r) { \ |
102 | size_t _i = (h)->used++; \ | 102 | size_t _i = (h)->used++; \ |
103 | (h)->data[_i] = d; \ | 103 | (h)->data[_i] = d; \ |
104 | \ | 104 | \ |
105 | heap_sift_down(h, _i, cmp); \ | 105 | heap_sift_down(h, _i, cmp); \ |
106 | heap_sift(h, _i, cmp); \ | 106 | heap_sift(h, _i, cmp); \ |
107 | } \ | 107 | } \ |
108 | _r; \ | 108 | _r; \ |
109 | }) | 109 | }) |
110 | 110 | ||
111 | #define heap_pop(h, d, cmp) \ | 111 | #define heap_pop(h, d, cmp) \ |
112 | ({ \ | 112 | ({ \ |
113 | bool _r = (h)->used; \ | 113 | bool _r = (h)->used; \ |
114 | if (_r) { \ | 114 | if (_r) { \ |
115 | (d) = (h)->data[0]; \ | 115 | (d) = (h)->data[0]; \ |
116 | (h)->used--; \ | 116 | (h)->used--; \ |
117 | heap_swap(h, 0, (h)->used); \ | 117 | heap_swap(h, 0, (h)->used); \ |
118 | heap_sift(h, 0, cmp); \ | 118 | heap_sift(h, 0, cmp); \ |
119 | } \ | 119 | } \ |
120 | _r; \ | 120 | _r; \ |
121 | }) | 121 | }) |
122 | 122 | ||
123 | #define heap_peek(h) ((h)->size ? (h)->data[0] : NULL) | 123 | #define heap_peek(h) ((h)->size ? (h)->data[0] : NULL) |
124 | 124 | ||
125 | #define heap_full(h) ((h)->used == (h)->size) | 125 | #define heap_full(h) ((h)->used == (h)->size) |
126 | 126 | ||
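heap_sift() is the usual binary-heap down-sift over the flat data[] array (children at 2i+1 and 2i+2), while heap_sift_down(), despite its name, moves an element up toward the root (parent at (i-1)/2); heap_add() appends at data[used] and restores the heap property, and heap_pop() moves the last element to the root and sifts down. The same index arithmetic in a compact standalone C min-heap (an illustration, not the macros above):

    #include <stddef.h>
    #include <stdio.h>

    #define SWAP(a, b) do { int _t = (a); (a) = (b); (b) = _t; } while (0)

    struct heap { size_t used, size; int *data; };

    static void sift(struct heap *h, size_t j)      /* down, like heap_sift() */
    {
        while (j * 2 + 1 < h->used) {
            size_t r = j * 2 + 1;

            if (r + 1 < h->used && h->data[r] > h->data[r + 1])
                r++;                                /* pick the smaller child */
            if (h->data[r] > h->data[j])
                break;
            SWAP(h->data[r], h->data[j]);
            j = r;
        }
    }

    static void sift_up(struct heap *h, size_t i)   /* up, like heap_sift_down() */
    {
        while (i) {
            size_t p = (i - 1) / 2;

            if (h->data[i] > h->data[p])
                break;
            SWAP(h->data[i], h->data[p]);
            i = p;
        }
    }

    static int heap_add(struct heap *h, int v)
    {
        if (h->used == h->size)
            return 0;
        h->data[h->used] = v;
        sift_up(h, h->used++);
        return 1;
    }

    static int heap_pop(struct heap *h, int *v)
    {
        if (!h->used)
            return 0;
        *v = h->data[0];
        h->data[0] = h->data[--h->used];
        sift(h, 0);
        return 1;
    }

    int main(void)
    {
        int buf[8], v;
        struct heap h = { 0, 8, buf };

        heap_add(&h, 3);
        heap_add(&h, 1);
        heap_add(&h, 2);
        while (heap_pop(&h, &v))
            printf("%d ", v);       /* 1 2 3 */
        printf("\n");
        return 0;
    }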
127 | #define DECLARE_FIFO(type, name) \ | 127 | #define DECLARE_FIFO(type, name) \ |
128 | struct { \ | 128 | struct { \ |
129 | size_t front, back, size, mask; \ | 129 | size_t front, back, size, mask; \ |
130 | type *data; \ | 130 | type *data; \ |
131 | } name | 131 | } name |
132 | 132 | ||
133 | #define fifo_for_each(c, fifo, iter) \ | 133 | #define fifo_for_each(c, fifo, iter) \ |
134 | for (iter = (fifo)->front; \ | 134 | for (iter = (fifo)->front; \ |
135 | c = (fifo)->data[iter], iter != (fifo)->back; \ | 135 | c = (fifo)->data[iter], iter != (fifo)->back; \ |
136 | iter = (iter + 1) & (fifo)->mask) | 136 | iter = (iter + 1) & (fifo)->mask) |
137 | 137 | ||
138 | #define __init_fifo(fifo, gfp) \ | 138 | #define __init_fifo(fifo, gfp) \ |
139 | ({ \ | 139 | ({ \ |
140 | size_t _allocated_size, _bytes; \ | 140 | size_t _allocated_size, _bytes; \ |
141 | BUG_ON(!(fifo)->size); \ | 141 | BUG_ON(!(fifo)->size); \ |
142 | \ | 142 | \ |
143 | _allocated_size = roundup_pow_of_two((fifo)->size + 1); \ | 143 | _allocated_size = roundup_pow_of_two((fifo)->size + 1); \ |
144 | _bytes = _allocated_size * sizeof(*(fifo)->data); \ | 144 | _bytes = _allocated_size * sizeof(*(fifo)->data); \ |
145 | \ | 145 | \ |
146 | (fifo)->mask = _allocated_size - 1; \ | 146 | (fifo)->mask = _allocated_size - 1; \ |
147 | (fifo)->front = (fifo)->back = 0; \ | 147 | (fifo)->front = (fifo)->back = 0; \ |
148 | (fifo)->data = NULL; \ | 148 | (fifo)->data = NULL; \ |
149 | \ | 149 | \ |
150 | if (_bytes < KMALLOC_MAX_SIZE) \ | 150 | if (_bytes < KMALLOC_MAX_SIZE) \ |
151 | (fifo)->data = kmalloc(_bytes, (gfp)); \ | 151 | (fifo)->data = kmalloc(_bytes, (gfp)); \ |
152 | if ((!(fifo)->data) && ((gfp) & GFP_KERNEL)) \ | 152 | if ((!(fifo)->data) && ((gfp) & GFP_KERNEL)) \ |
153 | (fifo)->data = vmalloc(_bytes); \ | 153 | (fifo)->data = vmalloc(_bytes); \ |
154 | (fifo)->data; \ | 154 | (fifo)->data; \ |
155 | }) | 155 | }) |
156 | 156 | ||
157 | #define init_fifo_exact(fifo, _size, gfp) \ | 157 | #define init_fifo_exact(fifo, _size, gfp) \ |
158 | ({ \ | 158 | ({ \ |
159 | (fifo)->size = (_size); \ | 159 | (fifo)->size = (_size); \ |
160 | __init_fifo(fifo, gfp); \ | 160 | __init_fifo(fifo, gfp); \ |
161 | }) | 161 | }) |
162 | 162 | ||
163 | #define init_fifo(fifo, _size, gfp) \ | 163 | #define init_fifo(fifo, _size, gfp) \ |
164 | ({ \ | 164 | ({ \ |
165 | (fifo)->size = (_size); \ | 165 | (fifo)->size = (_size); \ |
166 | if ((fifo)->size > 4) \ | 166 | if ((fifo)->size > 4) \ |
167 | (fifo)->size = roundup_pow_of_two((fifo)->size) - 1; \ | 167 | (fifo)->size = roundup_pow_of_two((fifo)->size) - 1; \ |
168 | __init_fifo(fifo, gfp); \ | 168 | __init_fifo(fifo, gfp); \ |
169 | }) | 169 | }) |
170 | 170 | ||
171 | #define free_fifo(fifo) \ | 171 | #define free_fifo(fifo) \ |
172 | do { \ | 172 | do { \ |
173 | if (is_vmalloc_addr((fifo)->data)) \ | 173 | if (is_vmalloc_addr((fifo)->data)) \ |
174 | vfree((fifo)->data); \ | 174 | vfree((fifo)->data); \ |
175 | else \ | 175 | else \ |
176 | kfree((fifo)->data); \ | 176 | kfree((fifo)->data); \ |
177 | (fifo)->data = NULL; \ | 177 | (fifo)->data = NULL; \ |
178 | } while (0) | 178 | } while (0) |
179 | 179 | ||
180 | #define fifo_used(fifo) (((fifo)->back - (fifo)->front) & (fifo)->mask) | 180 | #define fifo_used(fifo) (((fifo)->back - (fifo)->front) & (fifo)->mask) |
181 | #define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) | 181 | #define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) |
182 | 182 | ||
183 | #define fifo_empty(fifo) (!fifo_used(fifo)) | 183 | #define fifo_empty(fifo) (!fifo_used(fifo)) |
184 | #define fifo_full(fifo) (!fifo_free(fifo)) | 184 | #define fifo_full(fifo) (!fifo_free(fifo)) |
185 | 185 | ||
186 | #define fifo_front(fifo) ((fifo)->data[(fifo)->front]) | 186 | #define fifo_front(fifo) ((fifo)->data[(fifo)->front]) |
187 | #define fifo_back(fifo) \ | 187 | #define fifo_back(fifo) \ |
188 | ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) | 188 | ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) |
189 | 189 | ||
190 | #define fifo_idx(fifo, p) (((p) - &fifo_front(fifo)) & (fifo)->mask) | 190 | #define fifo_idx(fifo, p) (((p) - &fifo_front(fifo)) & (fifo)->mask) |
191 | 191 | ||
192 | #define fifo_push_back(fifo, i) \ | 192 | #define fifo_push_back(fifo, i) \ |
193 | ({ \ | 193 | ({ \ |
194 | bool _r = !fifo_full((fifo)); \ | 194 | bool _r = !fifo_full((fifo)); \ |
195 | if (_r) { \ | 195 | if (_r) { \ |
196 | (fifo)->data[(fifo)->back++] = (i); \ | 196 | (fifo)->data[(fifo)->back++] = (i); \ |
197 | (fifo)->back &= (fifo)->mask; \ | 197 | (fifo)->back &= (fifo)->mask; \ |
198 | } \ | 198 | } \ |
199 | _r; \ | 199 | _r; \ |
200 | }) | 200 | }) |
201 | 201 | ||
202 | #define fifo_pop_front(fifo, i) \ | 202 | #define fifo_pop_front(fifo, i) \ |
203 | ({ \ | 203 | ({ \ |
204 | bool _r = !fifo_empty((fifo)); \ | 204 | bool _r = !fifo_empty((fifo)); \ |
205 | if (_r) { \ | 205 | if (_r) { \ |
206 | (i) = (fifo)->data[(fifo)->front++]; \ | 206 | (i) = (fifo)->data[(fifo)->front++]; \ |
207 | (fifo)->front &= (fifo)->mask; \ | 207 | (fifo)->front &= (fifo)->mask; \ |
208 | } \ | 208 | } \ |
209 | _r; \ | 209 | _r; \ |
210 | }) | 210 | }) |
211 | 211 | ||
212 | #define fifo_push_front(fifo, i) \ | 212 | #define fifo_push_front(fifo, i) \ |
213 | ({ \ | 213 | ({ \ |
214 | bool _r = !fifo_full((fifo)); \ | 214 | bool _r = !fifo_full((fifo)); \ |
215 | if (_r) { \ | 215 | if (_r) { \ |
216 | --(fifo)->front; \ | 216 | --(fifo)->front; \ |
217 | (fifo)->front &= (fifo)->mask; \ | 217 | (fifo)->front &= (fifo)->mask; \ |
218 | (fifo)->data[(fifo)->front] = (i); \ | 218 | (fifo)->data[(fifo)->front] = (i); \ |
219 | } \ | 219 | } \ |
220 | _r; \ | 220 | _r; \ |
221 | }) | 221 | }) |
222 | 222 | ||
223 | #define fifo_pop_back(fifo, i) \ | 223 | #define fifo_pop_back(fifo, i) \ |
224 | ({ \ | 224 | ({ \ |
225 | bool _r = !fifo_empty((fifo)); \ | 225 | bool _r = !fifo_empty((fifo)); \ |
226 | if (_r) { \ | 226 | if (_r) { \ |
227 | --(fifo)->back; \ | 227 | --(fifo)->back; \ |
228 | (fifo)->back &= (fifo)->mask; \ | 228 | (fifo)->back &= (fifo)->mask; \ |
229 | (i) = (fifo)->data[(fifo)->back]; \ | 229 | (i) = (fifo)->data[(fifo)->back]; \ |
230 | } \ | 230 | } \ |
231 | _r; \ | 231 | _r; \ |
232 | }) | 232 | }) |
233 | 233 | ||
234 | #define fifo_push(fifo, i) fifo_push_back(fifo, (i)) | 234 | #define fifo_push(fifo, i) fifo_push_back(fifo, (i)) |
235 | #define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) | 235 | #define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) |
236 | 236 | ||
237 | #define fifo_swap(l, r) \ | 237 | #define fifo_swap(l, r) \ |
238 | do { \ | 238 | do { \ |
239 | swap((l)->front, (r)->front); \ | 239 | swap((l)->front, (r)->front); \ |
240 | swap((l)->back, (r)->back); \ | 240 | swap((l)->back, (r)->back); \ |
241 | swap((l)->size, (r)->size); \ | 241 | swap((l)->size, (r)->size); \ |
242 | swap((l)->mask, (r)->mask); \ | 242 | swap((l)->mask, (r)->mask); \ |
243 | swap((l)->data, (r)->data); \ | 243 | swap((l)->data, (r)->data); \ |
244 | } while (0) | 244 | } while (0) |
245 | 245 | ||
246 | #define fifo_move(dest, src) \ | 246 | #define fifo_move(dest, src) \ |
247 | do { \ | 247 | do { \ |
248 | typeof(*((dest)->data)) _t; \ | 248 | typeof(*((dest)->data)) _t; \ |
249 | while (!fifo_full(dest) && \ | 249 | while (!fifo_full(dest) && \ |
250 | fifo_pop(src, _t)) \ | 250 | fifo_pop(src, _t)) \ |
251 | fifo_push(dest, _t); \ | 251 | fifo_push(dest, _t); \ |
252 | } while (0) | 252 | } while (0) |
253 | 253 | ||
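The FIFO is a power-of-two ring buffer: __init_fifo() rounds the allocation up to a power of two, mask is allocated_size - 1, and front/back wrap with a cheap & mask instead of a modulo, which also makes fifo_used() a simple masked subtraction. The same push/pop arithmetic in a standalone, fixed-size C sketch with no kmalloc/vmalloc fallback:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    #define FIFO_ORDER 3            /* 2^3 = 8 slots, one always kept free */

    struct fifo {
        size_t front, back, mask;
        int data[1 << FIFO_ORDER];
    };

    static void fifo_init(struct fifo *f)
    {
        f->front = f->back = 0;
        f->mask = (1 << FIFO_ORDER) - 1;
    }

    /* fifo_used() as above; with size == mask here, full means used == mask. */
    static size_t fifo_used(struct fifo *f) { return (f->back - f->front) & f->mask; }
    static bool fifo_full(struct fifo *f)   { return fifo_used(f) == f->mask; }
    static bool fifo_empty(struct fifo *f)  { return !fifo_used(f); }

    static bool fifo_push(struct fifo *f, int v)
    {
        if (fifo_full(f))
            return false;
        f->data[f->back++] = v;
        f->back &= f->mask;
        return true;
    }

    static bool fifo_pop(struct fifo *f, int *v)
    {
        if (fifo_empty(f))
            return false;
        *v = f->data[f->front++];
        f->front &= f->mask;
        return true;
    }

    int main(void)
    {
        struct fifo f;
        int v;

        fifo_init(&f);
        for (int i = 0; i < 8; i++)
            if (!fifo_push(&f, i))
                printf("full at %d\n", i);      /* full at 7 */
        while (fifo_pop(&f, &v))
            printf("%d ", v);                   /* 0 1 2 3 4 5 6 */
        printf("\n");
        return 0;
    }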
254 | /* | 254 | /* |
255 | * Simple array based allocator - preallocates a number of elements and you can | 255 | * Simple array based allocator - preallocates a number of elements and you can |
256 | * never allocate more than that, also has no locking. | 256 | * never allocate more than that, also has no locking. |
257 | * | 257 | * |
258 | * Handy because if you know you only need a fixed number of elements you don't | 258 | * Handy because if you know you only need a fixed number of elements you don't |
259 | * have to worry about memory allocation failure, and sometimes a mempool isn't | 259 | * have to worry about memory allocation failure, and sometimes a mempool isn't |
260 | * what you want. | 260 | * what you want. |
261 | * | 261 | * |
262 | * We treat the free elements as entries in a singly linked list, and the | 262 | * We treat the free elements as entries in a singly linked list, and the |
263 | * freelist as a stack - allocating and freeing push and pop off the freelist. | 263 | * freelist as a stack - allocating and freeing push and pop off the freelist. |
264 | */ | 264 | */ |
265 | 265 | ||
266 | #define DECLARE_ARRAY_ALLOCATOR(type, name, size) \ | 266 | #define DECLARE_ARRAY_ALLOCATOR(type, name, size) \ |
267 | struct { \ | 267 | struct { \ |
268 | type *freelist; \ | 268 | type *freelist; \ |
269 | type data[size]; \ | 269 | type data[size]; \ |
270 | } name | 270 | } name |
271 | 271 | ||
272 | #define array_alloc(array) \ | 272 | #define array_alloc(array) \ |
273 | ({ \ | 273 | ({ \ |
274 | typeof((array)->freelist) _ret = (array)->freelist; \ | 274 | typeof((array)->freelist) _ret = (array)->freelist; \ |
275 | \ | 275 | \ |
276 | if (_ret) \ | 276 | if (_ret) \ |
277 | (array)->freelist = *((typeof((array)->freelist) *) _ret);\ | 277 | (array)->freelist = *((typeof((array)->freelist) *) _ret);\ |
278 | \ | 278 | \ |
279 | _ret; \ | 279 | _ret; \ |
280 | }) | 280 | }) |
281 | 281 | ||
282 | #define array_free(array, ptr) \ | 282 | #define array_free(array, ptr) \ |
283 | do { \ | 283 | do { \ |
284 | typeof((array)->freelist) _ptr = ptr; \ | 284 | typeof((array)->freelist) _ptr = ptr; \ |
285 | \ | 285 | \ |
286 | *((typeof((array)->freelist) *) _ptr) = (array)->freelist; \ | 286 | *((typeof((array)->freelist) *) _ptr) = (array)->freelist; \ |
287 | (array)->freelist = _ptr; \ | 287 | (array)->freelist = _ptr; \ |
288 | } while (0) | 288 | } while (0) |
289 | 289 | ||
290 | #define array_allocator_init(array) \ | 290 | #define array_allocator_init(array) \ |
291 | do { \ | 291 | do { \ |
292 | typeof((array)->freelist) _i; \ | 292 | typeof((array)->freelist) _i; \ |
293 | \ | 293 | \ |
294 | BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *)); \ | 294 | BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *)); \ |
295 | (array)->freelist = NULL; \ | 295 | (array)->freelist = NULL; \ |
296 | \ | 296 | \ |
297 | for (_i = (array)->data; \ | 297 | for (_i = (array)->data; \ |
298 | _i < (array)->data + ARRAY_SIZE((array)->data); \ | 298 | _i < (array)->data + ARRAY_SIZE((array)->data); \ |
299 | _i++) \ | 299 | _i++) \ |
300 | array_free(array, _i); \ | 300 | array_free(array, _i); \ |
301 | } while (0) | 301 | } while (0) |
302 | 302 | ||
303 | #define array_freelist_empty(array) ((array)->freelist == NULL) | 303 | #define array_freelist_empty(array) ((array)->freelist == NULL) |
304 | 304 | ||
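The array allocator keeps its freelist inside the free elements themselves: each free slot's first sizeof(void *) bytes store a pointer to the next free slot, so array_alloc() and array_free() are just a stack pop and push with zero bookkeeping overhead. The same trick as a standalone C sketch using plain functions instead of the macros (memcpy is used here to sidestep the aliasing cast the macros rely on):

    #include <stdio.h>
    #include <string.h>

    struct elem {
        char payload[32];   /* must be at least sizeof(void *) */
    };

    struct pool {
        struct elem *freelist;
        struct elem data[4];
    };

    /* Free: push onto the freelist by storing the old head inside the element. */
    static void pool_free(struct pool *p, struct elem *e)
    {
        memcpy(e, &p->freelist, sizeof(p->freelist));
        p->freelist = e;
    }

    /* Alloc: pop the head and read the next pointer back out of it. */
    static struct elem *pool_alloc(struct pool *p)
    {
        struct elem *e = p->freelist;

        if (e)
            memcpy(&p->freelist, e, sizeof(p->freelist));
        return e;
    }

    static void pool_init(struct pool *p)
    {
        p->freelist = NULL;
        for (size_t i = 0; i < sizeof(p->data) / sizeof(p->data[0]); i++)
            pool_free(p, &p->data[i]);
    }

    int main(void)
    {
        struct pool p;
        int n = 0;

        pool_init(&p);
        while (pool_alloc(&p))
            n++;
        printf("%d allocations before the pool is empty\n", n);     /* 4 */
        return 0;
    }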
305 | #define ANYSINT_MAX(t) \ | 305 | #define ANYSINT_MAX(t) \ |
306 | ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) | 306 | ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) |
307 | 307 | ||
308 | int bch_strtoint_h(const char *, int *); | 308 | int bch_strtoint_h(const char *, int *); |
309 | int bch_strtouint_h(const char *, unsigned int *); | 309 | int bch_strtouint_h(const char *, unsigned int *); |
310 | int bch_strtoll_h(const char *, long long *); | 310 | int bch_strtoll_h(const char *, long long *); |
311 | int bch_strtoull_h(const char *, unsigned long long *); | 311 | int bch_strtoull_h(const char *, unsigned long long *); |
312 | 312 | ||
313 | static inline int bch_strtol_h(const char *cp, long *res) | 313 | static inline int bch_strtol_h(const char *cp, long *res) |
314 | { | 314 | { |
315 | #if BITS_PER_LONG == 32 | 315 | #if BITS_PER_LONG == 32 |
316 | return bch_strtoint_h(cp, (int *) res); | 316 | return bch_strtoint_h(cp, (int *) res); |
317 | #else | 317 | #else |
318 | return bch_strtoll_h(cp, (long long *) res); | 318 | return bch_strtoll_h(cp, (long long *) res); |
319 | #endif | 319 | #endif |
320 | } | 320 | } |
321 | 321 | ||
322 | static inline int bch_strtoul_h(const char *cp, long *res) | 322 | static inline int bch_strtoul_h(const char *cp, long *res) |
323 | { | 323 | { |
324 | #if BITS_PER_LONG == 32 | 324 | #if BITS_PER_LONG == 32 |
325 | return bch_strtouint_h(cp, (unsigned int *) res); | 325 | return bch_strtouint_h(cp, (unsigned int *) res); |
326 | #else | 326 | #else |
327 | return bch_strtoull_h(cp, (unsigned long long *) res); | 327 | return bch_strtoull_h(cp, (unsigned long long *) res); |
328 | #endif | 328 | #endif |
329 | } | 329 | } |
330 | 330 | ||
331 | #define strtoi_h(cp, res) \ | 331 | #define strtoi_h(cp, res) \ |
332 | (__builtin_types_compatible_p(typeof(*res), int) \ | 332 | (__builtin_types_compatible_p(typeof(*res), int) \ |
333 | ? bch_strtoint_h(cp, (void *) res) \ | 333 | ? bch_strtoint_h(cp, (void *) res) \ |
334 | : __builtin_types_compatible_p(typeof(*res), long) \ | 334 | : __builtin_types_compatible_p(typeof(*res), long) \ |
335 | ? bch_strtol_h(cp, (void *) res) \ | 335 | ? bch_strtol_h(cp, (void *) res) \ |
336 | : __builtin_types_compatible_p(typeof(*res), long long) \ | 336 | : __builtin_types_compatible_p(typeof(*res), long long) \ |
337 | ? bch_strtoll_h(cp, (void *) res) \ | 337 | ? bch_strtoll_h(cp, (void *) res) \ |
338 | : __builtin_types_compatible_p(typeof(*res), unsigned int) \ | 338 | : __builtin_types_compatible_p(typeof(*res), unsigned int) \ |
339 | ? bch_strtouint_h(cp, (void *) res) \ | 339 | ? bch_strtouint_h(cp, (void *) res) \ |
340 | : __builtin_types_compatible_p(typeof(*res), unsigned long) \ | 340 | : __builtin_types_compatible_p(typeof(*res), unsigned long) \ |
341 | ? bch_strtoul_h(cp, (void *) res) \ | 341 | ? bch_strtoul_h(cp, (void *) res) \ |
342 | : __builtin_types_compatible_p(typeof(*res), unsigned long long)\ | 342 | : __builtin_types_compatible_p(typeof(*res), unsigned long long)\ |
343 | ? bch_strtoull_h(cp, (void *) res) : -EINVAL) | 343 | ? bch_strtoull_h(cp, (void *) res) : -EINVAL) |
344 | 344 | ||
345 | #define strtoul_safe(cp, var) \ | 345 | #define strtoul_safe(cp, var) \ |
346 | ({ \ | 346 | ({ \ |
347 | unsigned long _v; \ | 347 | unsigned long _v; \ |
348 | int _r = kstrtoul(cp, 10, &_v); \ | 348 | int _r = kstrtoul(cp, 10, &_v); \ |
349 | if (!_r) \ | 349 | if (!_r) \ |
350 | var = _v; \ | 350 | var = _v; \ |
351 | _r; \ | 351 | _r; \ |
352 | }) | 352 | }) |
353 | 353 | ||
354 | #define strtoul_safe_clamp(cp, var, min, max) \ | 354 | #define strtoul_safe_clamp(cp, var, min, max) \ |
355 | ({ \ | 355 | ({ \ |
356 | unsigned long _v; \ | 356 | unsigned long _v; \ |
357 | int _r = kstrtoul(cp, 10, &_v); \ | 357 | int _r = kstrtoul(cp, 10, &_v); \ |
358 | if (!_r) \ | 358 | if (!_r) \ |
359 | var = clamp_t(typeof(var), _v, min, max); \ | 359 | var = clamp_t(typeof(var), _v, min, max); \ |
360 | _r; \ | 360 | _r; \ |
361 | }) | 361 | }) |
362 | 362 | ||
363 | #define snprint(buf, size, var) \ | 363 | #define snprint(buf, size, var) \ |
364 | snprintf(buf, size, \ | 364 | snprintf(buf, size, \ |
365 | __builtin_types_compatible_p(typeof(var), int) \ | 365 | __builtin_types_compatible_p(typeof(var), int) \ |
366 | ? "%i\n" : \ | 366 | ? "%i\n" : \ |
367 | __builtin_types_compatible_p(typeof(var), unsigned) \ | 367 | __builtin_types_compatible_p(typeof(var), unsigned) \ |
368 | ? "%u\n" : \ | 368 | ? "%u\n" : \ |
369 | __builtin_types_compatible_p(typeof(var), long) \ | 369 | __builtin_types_compatible_p(typeof(var), long) \ |
370 | ? "%li\n" : \ | 370 | ? "%li\n" : \ |
371 | __builtin_types_compatible_p(typeof(var), unsigned long)\ | 371 | __builtin_types_compatible_p(typeof(var), unsigned long)\ |
372 | ? "%lu\n" : \ | 372 | ? "%lu\n" : \ |
373 | __builtin_types_compatible_p(typeof(var), int64_t) \ | 373 | __builtin_types_compatible_p(typeof(var), int64_t) \ |
374 | ? "%lli\n" : \ | 374 | ? "%lli\n" : \ |
375 | __builtin_types_compatible_p(typeof(var), uint64_t) \ | 375 | __builtin_types_compatible_p(typeof(var), uint64_t) \ |
376 | ? "%llu\n" : \ | 376 | ? "%llu\n" : \ |
377 | __builtin_types_compatible_p(typeof(var), const char *) \ | 377 | __builtin_types_compatible_p(typeof(var), const char *) \ |
378 | ? "%s\n" : "%i\n", var) | 378 | ? "%s\n" : "%i\n", var) |
379 | 379 | ||
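strtoi_h() and snprint() both rely on __builtin_types_compatible_p() to pick the right helper or format string from the static type of the argument. A small userspace demonstration of that dispatch trick, assuming a GCC/clang toolchain (fmt_for() is a hypothetical cut-down analogue of snprint()):

/* Userspace demo of dispatching on the argument's static type. */
#include <stdio.h>

#define fmt_for(var)							\
	(__builtin_types_compatible_p(typeof(var), int)			\
	 ? "%i\n"							\
	 : __builtin_types_compatible_p(typeof(var), unsigned long)	\
	 ? "%lu\n" : "%i\n")

int main(void)
{
	int x = -3;
	unsigned long y = 42;

	printf(fmt_for(x), x);		/* picks "%i\n"  */
	printf(fmt_for(y), y);		/* picks "%lu\n" */
	return 0;
}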
380 | ssize_t bch_hprint(char *buf, int64_t v); | 380 | ssize_t bch_hprint(char *buf, int64_t v); |
381 | 381 | ||
382 | bool bch_is_zero(const char *p, size_t n); | 382 | bool bch_is_zero(const char *p, size_t n); |
383 | int bch_parse_uuid(const char *s, char *uuid); | 383 | int bch_parse_uuid(const char *s, char *uuid); |
384 | 384 | ||
385 | ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], | 385 | ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], |
386 | size_t selected); | 386 | size_t selected); |
387 | 387 | ||
388 | ssize_t bch_read_string_list(const char *buf, const char * const list[]); | 388 | ssize_t bch_read_string_list(const char *buf, const char * const list[]); |
389 | 389 | ||
390 | struct time_stats { | 390 | struct time_stats { |
391 | /* | 391 | /* |
392 | * all fields are in nanoseconds, averages are ewmas stored left shifted | 392 | * all fields are in nanoseconds, averages are ewmas stored left shifted |
393 | * by 8 | 393 | * by 8 |
394 | */ | 394 | */ |
395 | uint64_t max_duration; | 395 | uint64_t max_duration; |
396 | uint64_t average_duration; | 396 | uint64_t average_duration; |
397 | uint64_t average_frequency; | 397 | uint64_t average_frequency; |
398 | uint64_t last; | 398 | uint64_t last; |
399 | }; | 399 | }; |
400 | 400 | ||
401 | void bch_time_stats_update(struct time_stats *stats, uint64_t time); | 401 | void bch_time_stats_update(struct time_stats *stats, uint64_t time); |
402 | 402 | ||
403 | #define NSEC_PER_ns 1L | 403 | #define NSEC_PER_ns 1L |
404 | #define NSEC_PER_us NSEC_PER_USEC | 404 | #define NSEC_PER_us NSEC_PER_USEC |
405 | #define NSEC_PER_ms NSEC_PER_MSEC | 405 | #define NSEC_PER_ms NSEC_PER_MSEC |
406 | #define NSEC_PER_sec NSEC_PER_SEC | 406 | #define NSEC_PER_sec NSEC_PER_SEC |
407 | 407 | ||
408 | #define __print_time_stat(stats, name, stat, units) \ | 408 | #define __print_time_stat(stats, name, stat, units) \ |
409 | sysfs_print(name ## _ ## stat ## _ ## units, \ | 409 | sysfs_print(name ## _ ## stat ## _ ## units, \ |
410 | div_u64((stats)->stat >> 8, NSEC_PER_ ## units)) | 410 | div_u64((stats)->stat >> 8, NSEC_PER_ ## units)) |
411 | 411 | ||
412 | #define sysfs_print_time_stats(stats, name, \ | 412 | #define sysfs_print_time_stats(stats, name, \ |
413 | frequency_units, \ | 413 | frequency_units, \ |
414 | duration_units) \ | 414 | duration_units) \ |
415 | do { \ | 415 | do { \ |
416 | __print_time_stat(stats, name, \ | 416 | __print_time_stat(stats, name, \ |
417 | average_frequency, frequency_units); \ | 417 | average_frequency, frequency_units); \ |
418 | __print_time_stat(stats, name, \ | 418 | __print_time_stat(stats, name, \ |
419 | average_duration, duration_units); \ | 419 | average_duration, duration_units); \ |
420 | __print_time_stat(stats, name, \ | 420 | __print_time_stat(stats, name, \ |
421 | max_duration, duration_units); \ | 421 | max_duration, duration_units); \ |
422 | \ | 422 | \ |
423 | sysfs_print(name ## _last_ ## frequency_units, (stats)->last \ | 423 | sysfs_print(name ## _last_ ## frequency_units, (stats)->last \ |
424 | ? div_s64(local_clock() - (stats)->last, \ | 424 | ? div_s64(local_clock() - (stats)->last, \ |
425 | NSEC_PER_ ## frequency_units) \ | 425 | NSEC_PER_ ## frequency_units) \ |
426 | : -1LL); \ | 426 | : -1LL); \ |
427 | } while (0) | 427 | } while (0) |
428 | 428 | ||
429 | #define sysfs_time_stats_attribute(name, \ | 429 | #define sysfs_time_stats_attribute(name, \ |
430 | frequency_units, \ | 430 | frequency_units, \ |
431 | duration_units) \ | 431 | duration_units) \ |
432 | read_attribute(name ## _average_frequency_ ## frequency_units); \ | 432 | read_attribute(name ## _average_frequency_ ## frequency_units); \ |
433 | read_attribute(name ## _average_duration_ ## duration_units); \ | 433 | read_attribute(name ## _average_duration_ ## duration_units); \ |
434 | read_attribute(name ## _max_duration_ ## duration_units); \ | 434 | read_attribute(name ## _max_duration_ ## duration_units); \ |
435 | read_attribute(name ## _last_ ## frequency_units) | 435 | read_attribute(name ## _last_ ## frequency_units) |
436 | 436 | ||
437 | #define sysfs_time_stats_attribute_list(name, \ | 437 | #define sysfs_time_stats_attribute_list(name, \ |
438 | frequency_units, \ | 438 | frequency_units, \ |
439 | duration_units) \ | 439 | duration_units) \ |
440 | &sysfs_ ## name ## _average_frequency_ ## frequency_units, \ | 440 | &sysfs_ ## name ## _average_frequency_ ## frequency_units, \ |
441 | &sysfs_ ## name ## _average_duration_ ## duration_units, \ | 441 | &sysfs_ ## name ## _average_duration_ ## duration_units, \ |
442 | &sysfs_ ## name ## _max_duration_ ## duration_units, \ | 442 | &sysfs_ ## name ## _max_duration_ ## duration_units, \ |
443 | &sysfs_ ## name ## _last_ ## frequency_units, | 443 | &sysfs_ ## name ## _last_ ## frequency_units, |
444 | 444 | ||
445 | #define ewma_add(ewma, val, weight, factor) \ | 445 | #define ewma_add(ewma, val, weight, factor) \ |
446 | ({ \ | 446 | ({ \ |
447 | (ewma) *= (weight) - 1; \ | 447 | (ewma) *= (weight) - 1; \ |
448 | (ewma) += (val) << factor; \ | 448 | (ewma) += (val) << factor; \ |
449 | (ewma) /= (weight); \ | 449 | (ewma) /= (weight); \ |
450 | (ewma) >> factor; \ | 450 | (ewma) >> factor; \ |
451 | }) | 451 | }) |
452 | 452 | ||
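ewma_add() keeps the running average left-shifted by `factor` so fractional precision survives the integer division by `weight`; the expression's value is the unshifted average. A standalone sketch of the update rule with made-up inputs (weight 8, factor 8, constant samples of 100):

/* Userspace sketch of the ewma_add() update rule. */
#include <stdio.h>
#include <stdint.h>

static uint64_t ewma_add_sketch(uint64_t *ewma, uint64_t val,
				unsigned weight, unsigned factor)
{
	*ewma *= weight - 1;
	*ewma += val << factor;
	*ewma /= weight;

	return *ewma >> factor;		/* unshifted average, as the macro yields */
}

int main(void)
{
	uint64_t acc = 0;
	unsigned i;

	/* feed a constant sample of 100; the average converges toward 100 */
	for (i = 0; i < 32; i++)
		printf("step %2u: avg = %llu\n", i,
		       (unsigned long long) ewma_add_sketch(&acc, 100, 8, 8));
	return 0;
}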
453 | struct ratelimit { | 453 | struct ratelimit { |
454 | uint64_t next; | 454 | uint64_t next; |
455 | unsigned rate; | 455 | unsigned rate; |
456 | }; | 456 | }; |
457 | 457 | ||
458 | static inline void ratelimit_reset(struct ratelimit *d) | 458 | static inline void ratelimit_reset(struct ratelimit *d) |
459 | { | 459 | { |
460 | d->next = local_clock(); | 460 | d->next = local_clock(); |
461 | } | 461 | } |
462 | 462 | ||
463 | unsigned bch_next_delay(struct ratelimit *d, uint64_t done); | 463 | unsigned bch_next_delay(struct ratelimit *d, uint64_t done); |
464 | 464 | ||
465 | #define __DIV_SAFE(n, d, zero) \ | 465 | #define __DIV_SAFE(n, d, zero) \ |
466 | ({ \ | 466 | ({ \ |
467 | typeof(n) _n = (n); \ | 467 | typeof(n) _n = (n); \ |
468 | typeof(d) _d = (d); \ | 468 | typeof(d) _d = (d); \ |
469 | _d ? _n / _d : zero; \ | 469 | _d ? _n / _d : zero; \ |
470 | }) | 470 | }) |
471 | 471 | ||
472 | #define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0) | 472 | #define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0) |
473 | 473 | ||
474 | #define container_of_or_null(ptr, type, member) \ | 474 | #define container_of_or_null(ptr, type, member) \ |
475 | ({ \ | 475 | ({ \ |
476 | typeof(ptr) _ptr = ptr; \ | 476 | typeof(ptr) _ptr = ptr; \ |
477 | _ptr ? container_of(_ptr, type, member) : NULL; \ | 477 | _ptr ? container_of(_ptr, type, member) : NULL; \ |
478 | }) | 478 | }) |
479 | 479 | ||
480 | #define RB_INSERT(root, new, member, cmp) \ | 480 | #define RB_INSERT(root, new, member, cmp) \ |
481 | ({ \ | 481 | ({ \ |
482 | __label__ dup; \ | 482 | __label__ dup; \ |
483 | struct rb_node **n = &(root)->rb_node, *parent = NULL; \ | 483 | struct rb_node **n = &(root)->rb_node, *parent = NULL; \ |
484 | typeof(new) this; \ | 484 | typeof(new) this; \ |
485 | int res, ret = -1; \ | 485 | int res, ret = -1; \ |
486 | \ | 486 | \ |
487 | while (*n) { \ | 487 | while (*n) { \ |
488 | parent = *n; \ | 488 | parent = *n; \ |
489 | this = container_of(*n, typeof(*(new)), member); \ | 489 | this = container_of(*n, typeof(*(new)), member); \ |
490 | res = cmp(new, this); \ | 490 | res = cmp(new, this); \ |
491 | if (!res) \ | 491 | if (!res) \ |
492 | goto dup; \ | 492 | goto dup; \ |
493 | n = res < 0 \ | 493 | n = res < 0 \ |
494 | ? &(*n)->rb_left \ | 494 | ? &(*n)->rb_left \ |
495 | : &(*n)->rb_right; \ | 495 | : &(*n)->rb_right; \ |
496 | } \ | 496 | } \ |
497 | \ | 497 | \ |
498 | rb_link_node(&(new)->member, parent, n); \ | 498 | rb_link_node(&(new)->member, parent, n); \ |
499 | rb_insert_color(&(new)->member, root); \ | 499 | rb_insert_color(&(new)->member, root); \ |
500 | ret = 0; \ | 500 | ret = 0; \ |
501 | dup: \ | 501 | dup: \ |
502 | ret; \ | 502 | ret; \ |
503 | }) | 503 | }) |
504 | 504 | ||
505 | #define RB_SEARCH(root, search, member, cmp) \ | 505 | #define RB_SEARCH(root, search, member, cmp) \ |
506 | ({ \ | 506 | ({ \ |
507 | struct rb_node *n = (root)->rb_node; \ | 507 | struct rb_node *n = (root)->rb_node; \ |
508 | typeof(&(search)) this, ret = NULL; \ | 508 | typeof(&(search)) this, ret = NULL; \ |
509 | int res; \ | 509 | int res; \ |
510 | \ | 510 | \ |
511 | while (n) { \ | 511 | while (n) { \ |
512 | this = container_of(n, typeof(search), member); \ | 512 | this = container_of(n, typeof(search), member); \ |
513 | res = cmp(&(search), this); \ | 513 | res = cmp(&(search), this); \ |
514 | if (!res) { \ | 514 | if (!res) { \ |
515 | ret = this; \ | 515 | ret = this; \ |
516 | break; \ | 516 | break; \ |
517 | } \ | 517 | } \ |
518 | n = res < 0 \ | 518 | n = res < 0 \ |
519 | ? n->rb_left \ | 519 | ? n->rb_left \ |
520 | : n->rb_right; \ | 520 | : n->rb_right; \ |
521 | } \ | 521 | } \ |
522 | ret; \ | 522 | ret; \ |
523 | }) | 523 | }) |
524 | 524 | ||
525 | #define RB_GREATER(root, search, member, cmp) \ | 525 | #define RB_GREATER(root, search, member, cmp) \ |
526 | ({ \ | 526 | ({ \ |
527 | struct rb_node *n = (root)->rb_node; \ | 527 | struct rb_node *n = (root)->rb_node; \ |
528 | typeof(&(search)) this, ret = NULL; \ | 528 | typeof(&(search)) this, ret = NULL; \ |
529 | int res; \ | 529 | int res; \ |
530 | \ | 530 | \ |
531 | while (n) { \ | 531 | while (n) { \ |
532 | this = container_of(n, typeof(search), member); \ | 532 | this = container_of(n, typeof(search), member); \ |
533 | res = cmp(&(search), this); \ | 533 | res = cmp(&(search), this); \ |
534 | if (res < 0) { \ | 534 | if (res < 0) { \ |
535 | ret = this; \ | 535 | ret = this; \ |
536 | n = n->rb_left; \ | 536 | n = n->rb_left; \ |
537 | } else \ | 537 | } else \ |
538 | n = n->rb_right; \ | 538 | n = n->rb_right; \ |
539 | } \ | 539 | } \ |
540 | ret; \ | 540 | ret; \ |
541 | }) | 541 | }) |
542 | 542 | ||
543 | #define RB_FIRST(root, type, member) \ | 543 | #define RB_FIRST(root, type, member) \ |
544 | container_of_or_null(rb_first(root), type, member) | 544 | container_of_or_null(rb_first(root), type, member) |
545 | 545 | ||
546 | #define RB_LAST(root, type, member) \ | 546 | #define RB_LAST(root, type, member) \ |
547 | container_of_or_null(rb_last(root), type, member) | 547 | container_of_or_null(rb_last(root), type, member) |
548 | 548 | ||
549 | #define RB_NEXT(ptr, member) \ | 549 | #define RB_NEXT(ptr, member) \ |
550 | container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member) | 550 | container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member) |
551 | 551 | ||
552 | #define RB_PREV(ptr, member) \ | 552 | #define RB_PREV(ptr, member) \ |
553 | container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member) | 553 | container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member) |
554 | 554 | ||
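A hedged sketch of how the RB_INSERT()/RB_SEARCH() wrappers are intended to be used; it assumes kernel context (<linux/rbtree.h> plus this header), and struct example, example_cmp() and the helpers are made up for illustration. The cmp callback must return negative, zero or positive, and RB_INSERT() evaluates to -1 when an equal key is already present:

/* Hypothetical kernel-context usage of RB_INSERT()/RB_SEARCH(). */
struct example {
	struct rb_node node;
	unsigned long key;
};

static int example_cmp(struct example *l, struct example *r)
{
	return l->key < r->key ? -1 : l->key > r->key ? 1 : 0;
}

static int example_add(struct rb_root *root, struct example *e)
{
	/* 0 on success, -1 if an entry with an equal key already exists */
	return RB_INSERT(root, e, node, example_cmp);
}

static struct example *example_find(struct rb_root *root, unsigned long key)
{
	struct example search = { .key = key };

	return RB_SEARCH(root, search, node, example_cmp);
}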
555 | /* Does linear interpolation between powers of two */ | 555 | /* Does linear interpolation between powers of two */ |
556 | static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) | 556 | static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) |
557 | { | 557 | { |
558 | unsigned fract = x & ~(~0 << fract_bits); | 558 | unsigned fract = x & ~(~0 << fract_bits); |
559 | 559 | ||
560 | x >>= fract_bits; | 560 | x >>= fract_bits; |
561 | x = 1 << x; | 561 | x = 1 << x; |
562 | x += (x * fract) >> fract_bits; | 562 | x += (x * fract) >> fract_bits; |
563 | 563 | ||
564 | return x; | 564 | return x; |
565 | } | 565 | } |
566 | 566 | ||
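fract_exp_two() treats x as a fixed-point exponent: the low fract_bits are the fraction and the remaining bits select a power of two, with the fraction interpolated linearly toward the next power. A userspace copy with a few worked inputs (fract_bits = 3 is an arbitrary choice):

/* Userspace copy of fract_exp_two() plus a few worked inputs. */
#include <stdio.h>

static unsigned fract_exp_two(unsigned x, unsigned fract_bits)
{
	unsigned fract = x & ~(~0U << fract_bits);

	x >>= fract_bits;
	x = 1U << x;
	x += (x * fract) >> fract_bits;

	return x;
}

int main(void)
{
	/* fract_bits = 3: the low 3 bits are the fraction, the rest the exponent */
	printf("%u\n", fract_exp_two(2 << 3, 3));	/* exactly 2^2          -> 4 */
	printf("%u\n", fract_exp_two((2 << 3) | 4, 3));	/* halfway from 4 to 8  -> 6 */
	printf("%u\n", fract_exp_two(3 << 3, 3));	/* exactly 2^3          -> 8 */
	return 0;
}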
567 | #define bio_end(bio) ((bio)->bi_sector + bio_sectors(bio)) | ||
568 | |||
569 | void bch_bio_map(struct bio *bio, void *base); | 567 | void bch_bio_map(struct bio *bio, void *base); |
570 | |||
571 | int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp); | ||
572 | 568 | ||
573 | static inline sector_t bdev_sectors(struct block_device *bdev) | 569 | static inline sector_t bdev_sectors(struct block_device *bdev) |
574 | { | 570 | { |
575 | return bdev->bd_inode->i_size >> 9; | 571 | return bdev->bd_inode->i_size >> 9; |
576 | } | 572 | } |
577 | 573 | ||
578 | #define closure_bio_submit(bio, cl, dev) \ | 574 | #define closure_bio_submit(bio, cl, dev) \ |
579 | do { \ | 575 | do { \ |
580 | closure_get(cl); \ | 576 | closure_get(cl); \ |
581 | bch_generic_make_request(bio, &(dev)->bio_split_hook); \ | 577 | bch_generic_make_request(bio, &(dev)->bio_split_hook); \ |
582 | } while (0) | 578 | } while (0) |
583 | 579 | ||
584 | uint64_t bch_crc64_update(uint64_t, const void *, size_t); | 580 | uint64_t bch_crc64_update(uint64_t, const void *, size_t); |
585 | uint64_t bch_crc64(const void *, size_t); | 581 | uint64_t bch_crc64(const void *, size_t); |
586 | 582 | ||
587 | #endif /* _BCACHE_UTIL_H */ | 583 | #endif /* _BCACHE_UTIL_H */ |
588 | 584 |
drivers/md/bcache/writeback.c
1 | /* | 1 | /* |
2 | * background writeback - scan btree for dirty data and write it to the backing | 2 | * background writeback - scan btree for dirty data and write it to the backing |
3 | * device | 3 | * device |
4 | * | 4 | * |
5 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | 5 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> |
6 | * Copyright 2012 Google, Inc. | 6 | * Copyright 2012 Google, Inc. |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include "bcache.h" | 9 | #include "bcache.h" |
10 | #include "btree.h" | 10 | #include "btree.h" |
11 | #include "debug.h" | 11 | #include "debug.h" |
12 | #include "writeback.h" | 12 | #include "writeback.h" |
13 | 13 | ||
14 | #include <trace/events/bcache.h> | 14 | #include <trace/events/bcache.h> |
15 | 15 | ||
16 | static struct workqueue_struct *dirty_wq; | 16 | static struct workqueue_struct *dirty_wq; |
17 | 17 | ||
18 | static void read_dirty(struct closure *); | 18 | static void read_dirty(struct closure *); |
19 | 19 | ||
20 | struct dirty_io { | 20 | struct dirty_io { |
21 | struct closure cl; | 21 | struct closure cl; |
22 | struct cached_dev *dc; | 22 | struct cached_dev *dc; |
23 | struct bio bio; | 23 | struct bio bio; |
24 | }; | 24 | }; |
25 | 25 | ||
26 | /* Rate limiting */ | 26 | /* Rate limiting */ |
27 | 27 | ||
28 | static void __update_writeback_rate(struct cached_dev *dc) | 28 | static void __update_writeback_rate(struct cached_dev *dc) |
29 | { | 29 | { |
30 | struct cache_set *c = dc->disk.c; | 30 | struct cache_set *c = dc->disk.c; |
31 | uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size; | 31 | uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size; |
32 | uint64_t cache_dirty_target = | 32 | uint64_t cache_dirty_target = |
33 | div_u64(cache_sectors * dc->writeback_percent, 100); | 33 | div_u64(cache_sectors * dc->writeback_percent, 100); |
34 | 34 | ||
35 | int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), | 35 | int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), |
36 | c->cached_dev_sectors); | 36 | c->cached_dev_sectors); |
37 | 37 | ||
38 | /* PD controller */ | 38 | /* PD controller */ |
39 | 39 | ||
40 | int change = 0; | 40 | int change = 0; |
41 | int64_t error; | 41 | int64_t error; |
42 | int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); | 42 | int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); |
43 | int64_t derivative = dirty - dc->disk.sectors_dirty_last; | 43 | int64_t derivative = dirty - dc->disk.sectors_dirty_last; |
44 | 44 | ||
45 | dc->disk.sectors_dirty_last = dirty; | 45 | dc->disk.sectors_dirty_last = dirty; |
46 | 46 | ||
47 | derivative *= dc->writeback_rate_d_term; | 47 | derivative *= dc->writeback_rate_d_term; |
48 | derivative = clamp(derivative, -dirty, dirty); | 48 | derivative = clamp(derivative, -dirty, dirty); |
49 | 49 | ||
50 | derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, | 50 | derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, |
51 | dc->writeback_rate_d_smooth, 0); | 51 | dc->writeback_rate_d_smooth, 0); |
52 | 52 | ||
53 | /* Avoid divide by zero */ | 53 | /* Avoid divide by zero */ |
54 | if (!target) | 54 | if (!target) |
55 | goto out; | 55 | goto out; |
56 | 56 | ||
57 | error = div64_s64((dirty + derivative - target) << 8, target); | 57 | error = div64_s64((dirty + derivative - target) << 8, target); |
58 | 58 | ||
59 | change = div_s64((dc->writeback_rate.rate * error) >> 8, | 59 | change = div_s64((dc->writeback_rate.rate * error) >> 8, |
60 | dc->writeback_rate_p_term_inverse); | 60 | dc->writeback_rate_p_term_inverse); |
61 | 61 | ||
62 | /* Don't increase writeback rate if the device isn't keeping up */ | 62 | /* Don't increase writeback rate if the device isn't keeping up */ |
63 | if (change > 0 && | 63 | if (change > 0 && |
64 | time_after64(local_clock(), | 64 | time_after64(local_clock(), |
65 | dc->writeback_rate.next + 10 * NSEC_PER_MSEC)) | 65 | dc->writeback_rate.next + 10 * NSEC_PER_MSEC)) |
66 | change = 0; | 66 | change = 0; |
67 | 67 | ||
68 | dc->writeback_rate.rate = | 68 | dc->writeback_rate.rate = |
69 | clamp_t(int64_t, dc->writeback_rate.rate + change, | 69 | clamp_t(int64_t, dc->writeback_rate.rate + change, |
70 | 1, NSEC_PER_MSEC); | 70 | 1, NSEC_PER_MSEC); |
71 | out: | 71 | out: |
72 | dc->writeback_rate_derivative = derivative; | 72 | dc->writeback_rate_derivative = derivative; |
73 | dc->writeback_rate_change = change; | 73 | dc->writeback_rate_change = change; |
74 | dc->writeback_rate_target = target; | 74 | dc->writeback_rate_target = target; |
75 | 75 | ||
76 | schedule_delayed_work(&dc->writeback_rate_update, | 76 | schedule_delayed_work(&dc->writeback_rate_update, |
77 | dc->writeback_rate_update_seconds * HZ); | 77 | dc->writeback_rate_update_seconds * HZ); |
78 | } | 78 | } |
79 | 79 | ||
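The proportional step above is the core of the controller: the relative error between dirty sectors and the target, kept as a <<8 fixed-point fraction, scales the current rate and is divided by writeback_rate_p_term_inverse. A standalone model of just that step with made-up numbers (the real code also folds in the EWMA-smoothed derivative, clamps the rate, and refuses to raise it when the device is not keeping up):

/* Standalone model of the proportional step of the writeback PD controller. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int64_t dirty          = 1200;	/* sectors currently dirty          */
	int64_t target         = 1000;	/* dirty sectors we want to sit at  */
	int64_t rate           = 512;	/* current writeback rate           */
	int64_t p_term_inverse = 64;	/* writeback_rate_p_term_inverse    */

	/* relative error as a <<8 fixed-point fraction, as in the driver */
	int64_t error  = ((dirty - target) << 8) / target;
	int64_t change = ((rate * error) >> 8) / p_term_inverse;

	printf("error = %lld/256, rate change = %+lld\n",
	       (long long) error, (long long) change);
	return 0;
}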
80 | static void update_writeback_rate(struct work_struct *work) | 80 | static void update_writeback_rate(struct work_struct *work) |
81 | { | 81 | { |
82 | struct cached_dev *dc = container_of(to_delayed_work(work), | 82 | struct cached_dev *dc = container_of(to_delayed_work(work), |
83 | struct cached_dev, | 83 | struct cached_dev, |
84 | writeback_rate_update); | 84 | writeback_rate_update); |
85 | 85 | ||
86 | down_read(&dc->writeback_lock); | 86 | down_read(&dc->writeback_lock); |
87 | 87 | ||
88 | if (atomic_read(&dc->has_dirty) && | 88 | if (atomic_read(&dc->has_dirty) && |
89 | dc->writeback_percent) | 89 | dc->writeback_percent) |
90 | __update_writeback_rate(dc); | 90 | __update_writeback_rate(dc); |
91 | 91 | ||
92 | up_read(&dc->writeback_lock); | 92 | up_read(&dc->writeback_lock); |
93 | } | 93 | } |
94 | 94 | ||
95 | static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) | 95 | static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) |
96 | { | 96 | { |
97 | if (atomic_read(&dc->disk.detaching) || | 97 | if (atomic_read(&dc->disk.detaching) || |
98 | !dc->writeback_percent) | 98 | !dc->writeback_percent) |
99 | return 0; | 99 | return 0; |
100 | 100 | ||
101 | return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); | 101 | return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); |
102 | } | 102 | } |
103 | 103 | ||
104 | /* Background writeback */ | 104 | /* Background writeback */ |
105 | 105 | ||
106 | static bool dirty_pred(struct keybuf *buf, struct bkey *k) | 106 | static bool dirty_pred(struct keybuf *buf, struct bkey *k) |
107 | { | 107 | { |
108 | return KEY_DIRTY(k); | 108 | return KEY_DIRTY(k); |
109 | } | 109 | } |
110 | 110 | ||
111 | static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) | 111 | static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) |
112 | { | 112 | { |
113 | uint64_t stripe; | 113 | uint64_t stripe; |
114 | unsigned nr_sectors = KEY_SIZE(k); | 114 | unsigned nr_sectors = KEY_SIZE(k); |
115 | struct cached_dev *dc = container_of(buf, struct cached_dev, | 115 | struct cached_dev *dc = container_of(buf, struct cached_dev, |
116 | writeback_keys); | 116 | writeback_keys); |
117 | unsigned stripe_size = 1 << dc->disk.stripe_size_bits; | 117 | unsigned stripe_size = 1 << dc->disk.stripe_size_bits; |
118 | 118 | ||
119 | if (!KEY_DIRTY(k)) | 119 | if (!KEY_DIRTY(k)) |
120 | return false; | 120 | return false; |
121 | 121 | ||
122 | stripe = KEY_START(k) >> dc->disk.stripe_size_bits; | 122 | stripe = KEY_START(k) >> dc->disk.stripe_size_bits; |
123 | while (1) { | 123 | while (1) { |
124 | if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) != | 124 | if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) != |
125 | stripe_size) | 125 | stripe_size) |
126 | return false; | 126 | return false; |
127 | 127 | ||
128 | if (nr_sectors <= stripe_size) | 128 | if (nr_sectors <= stripe_size) |
129 | return true; | 129 | return true; |
130 | 130 | ||
131 | nr_sectors -= stripe_size; | 131 | nr_sectors -= stripe_size; |
132 | stripe++; | 132 | stripe++; |
133 | } | 133 | } |
134 | } | 134 | } |
135 | 135 | ||
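dirty_full_stripe_pred() only accepts a key when every stripe it touches is completely dirty, so writeback can issue whole-stripe writes when partial stripes are expensive. A userspace sketch of that walk; the stripe size and the stripe_sectors_dirty contents are made up:

/* Userspace sketch of the "only whole dirty stripes" check. */
#include <stdbool.h>
#include <stdio.h>

#define STRIPE_SIZE_BITS 3			/* 8-sector stripes */
#define STRIPE_SIZE	 (1U << STRIPE_SIZE_BITS)

static unsigned stripe_sectors_dirty[8] = { 8, 8, 8, 5, 8, 8, 8, 8 };

static bool covers_only_full_stripes(unsigned long long start, unsigned nr_sectors)
{
	unsigned long long stripe = start >> STRIPE_SIZE_BITS;

	while (1) {
		if (stripe_sectors_dirty[stripe] != STRIPE_SIZE)
			return false;	/* a touched stripe is only partly dirty */
		if (nr_sectors <= STRIPE_SIZE)
			return true;	/* out of sectors: every stripe was full */
		nr_sectors -= STRIPE_SIZE;
		stripe++;
	}
}

int main(void)
{
	printf("%d\n", covers_only_full_stripes(0, 16));	/* stripes 0-1 full -> 1 */
	printf("%d\n", covers_only_full_stripes(24, 16));	/* stripe 3 partial -> 0 */
	return 0;
}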
136 | static void dirty_init(struct keybuf_key *w) | 136 | static void dirty_init(struct keybuf_key *w) |
137 | { | 137 | { |
138 | struct dirty_io *io = w->private; | 138 | struct dirty_io *io = w->private; |
139 | struct bio *bio = &io->bio; | 139 | struct bio *bio = &io->bio; |
140 | 140 | ||
141 | bio_init(bio); | 141 | bio_init(bio); |
142 | if (!io->dc->writeback_percent) | 142 | if (!io->dc->writeback_percent) |
143 | bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); | 143 | bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); |
144 | 144 | ||
145 | bio->bi_size = KEY_SIZE(&w->key) << 9; | 145 | bio->bi_size = KEY_SIZE(&w->key) << 9; |
146 | bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS); | 146 | bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS); |
147 | bio->bi_private = w; | 147 | bio->bi_private = w; |
148 | bio->bi_io_vec = bio->bi_inline_vecs; | 148 | bio->bi_io_vec = bio->bi_inline_vecs; |
149 | bch_bio_map(bio, NULL); | 149 | bch_bio_map(bio, NULL); |
150 | } | 150 | } |
151 | 151 | ||
152 | static void refill_dirty(struct closure *cl) | 152 | static void refill_dirty(struct closure *cl) |
153 | { | 153 | { |
154 | struct cached_dev *dc = container_of(cl, struct cached_dev, | 154 | struct cached_dev *dc = container_of(cl, struct cached_dev, |
155 | writeback.cl); | 155 | writeback.cl); |
156 | struct keybuf *buf = &dc->writeback_keys; | 156 | struct keybuf *buf = &dc->writeback_keys; |
157 | bool searched_from_start = false; | 157 | bool searched_from_start = false; |
158 | struct bkey end = MAX_KEY; | 158 | struct bkey end = MAX_KEY; |
159 | SET_KEY_INODE(&end, dc->disk.id); | 159 | SET_KEY_INODE(&end, dc->disk.id); |
160 | 160 | ||
161 | if (!atomic_read(&dc->disk.detaching) && | 161 | if (!atomic_read(&dc->disk.detaching) && |
162 | !dc->writeback_running) | 162 | !dc->writeback_running) |
163 | closure_return(cl); | 163 | closure_return(cl); |
164 | 164 | ||
165 | down_write(&dc->writeback_lock); | 165 | down_write(&dc->writeback_lock); |
166 | 166 | ||
167 | if (!atomic_read(&dc->has_dirty)) { | 167 | if (!atomic_read(&dc->has_dirty)) { |
168 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); | 168 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); |
169 | bch_write_bdev_super(dc, NULL); | 169 | bch_write_bdev_super(dc, NULL); |
170 | 170 | ||
171 | up_write(&dc->writeback_lock); | 171 | up_write(&dc->writeback_lock); |
172 | closure_return(cl); | 172 | closure_return(cl); |
173 | } | 173 | } |
174 | 174 | ||
175 | if (bkey_cmp(&buf->last_scanned, &end) >= 0) { | 175 | if (bkey_cmp(&buf->last_scanned, &end) >= 0) { |
176 | buf->last_scanned = KEY(dc->disk.id, 0, 0); | 176 | buf->last_scanned = KEY(dc->disk.id, 0, 0); |
177 | searched_from_start = true; | 177 | searched_from_start = true; |
178 | } | 178 | } |
179 | 179 | ||
180 | if (dc->partial_stripes_expensive) { | 180 | if (dc->partial_stripes_expensive) { |
181 | uint64_t i; | 181 | uint64_t i; |
182 | 182 | ||
183 | for (i = 0; i < dc->disk.nr_stripes; i++) | 183 | for (i = 0; i < dc->disk.nr_stripes; i++) |
184 | if (atomic_read(dc->disk.stripe_sectors_dirty + i) == | 184 | if (atomic_read(dc->disk.stripe_sectors_dirty + i) == |
185 | 1 << dc->disk.stripe_size_bits) | 185 | 1 << dc->disk.stripe_size_bits) |
186 | goto full_stripes; | 186 | goto full_stripes; |
187 | 187 | ||
188 | goto normal_refill; | 188 | goto normal_refill; |
189 | full_stripes: | 189 | full_stripes: |
190 | bch_refill_keybuf(dc->disk.c, buf, &end, | 190 | bch_refill_keybuf(dc->disk.c, buf, &end, |
191 | dirty_full_stripe_pred); | 191 | dirty_full_stripe_pred); |
192 | } else { | 192 | } else { |
193 | normal_refill: | 193 | normal_refill: |
194 | bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); | 194 | bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); |
195 | } | 195 | } |
196 | 196 | ||
197 | if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { | 197 | if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { |
198 | /* Searched the entire btree - delay awhile */ | 198 | /* Searched the entire btree - delay awhile */ |
199 | 199 | ||
200 | if (RB_EMPTY_ROOT(&buf->keys)) { | 200 | if (RB_EMPTY_ROOT(&buf->keys)) { |
201 | atomic_set(&dc->has_dirty, 0); | 201 | atomic_set(&dc->has_dirty, 0); |
202 | cached_dev_put(dc); | 202 | cached_dev_put(dc); |
203 | } | 203 | } |
204 | 204 | ||
205 | if (!atomic_read(&dc->disk.detaching)) | 205 | if (!atomic_read(&dc->disk.detaching)) |
206 | closure_delay(&dc->writeback, dc->writeback_delay * HZ); | 206 | closure_delay(&dc->writeback, dc->writeback_delay * HZ); |
207 | } | 207 | } |
208 | 208 | ||
209 | up_write(&dc->writeback_lock); | 209 | up_write(&dc->writeback_lock); |
210 | 210 | ||
211 | ratelimit_reset(&dc->writeback_rate); | 211 | ratelimit_reset(&dc->writeback_rate); |
212 | 212 | ||
213 | /* Punt to workqueue only so we don't recurse and blow the stack */ | 213 | /* Punt to workqueue only so we don't recurse and blow the stack */ |
214 | continue_at(cl, read_dirty, dirty_wq); | 214 | continue_at(cl, read_dirty, dirty_wq); |
215 | } | 215 | } |
216 | 216 | ||
217 | void bch_writeback_queue(struct cached_dev *dc) | 217 | void bch_writeback_queue(struct cached_dev *dc) |
218 | { | 218 | { |
219 | if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) { | 219 | if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) { |
220 | if (!atomic_read(&dc->disk.detaching)) | 220 | if (!atomic_read(&dc->disk.detaching)) |
221 | closure_delay(&dc->writeback, dc->writeback_delay * HZ); | 221 | closure_delay(&dc->writeback, dc->writeback_delay * HZ); |
222 | 222 | ||
223 | continue_at(&dc->writeback.cl, refill_dirty, dirty_wq); | 223 | continue_at(&dc->writeback.cl, refill_dirty, dirty_wq); |
224 | } | 224 | } |
225 | } | 225 | } |
226 | 226 | ||
227 | void bch_writeback_add(struct cached_dev *dc) | 227 | void bch_writeback_add(struct cached_dev *dc) |
228 | { | 228 | { |
229 | if (!atomic_read(&dc->has_dirty) && | 229 | if (!atomic_read(&dc->has_dirty) && |
230 | !atomic_xchg(&dc->has_dirty, 1)) { | 230 | !atomic_xchg(&dc->has_dirty, 1)) { |
231 | atomic_inc(&dc->count); | 231 | atomic_inc(&dc->count); |
232 | 232 | ||
233 | if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { | 233 | if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { |
234 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); | 234 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); |
235 | /* XXX: should do this synchronously */ | 235 | /* XXX: should do this synchronously */ |
236 | bch_write_bdev_super(dc, NULL); | 236 | bch_write_bdev_super(dc, NULL); |
237 | } | 237 | } |
238 | 238 | ||
239 | bch_writeback_queue(dc); | 239 | bch_writeback_queue(dc); |
240 | 240 | ||
241 | if (dc->writeback_percent) | 241 | if (dc->writeback_percent) |
242 | schedule_delayed_work(&dc->writeback_rate_update, | 242 | schedule_delayed_work(&dc->writeback_rate_update, |
243 | dc->writeback_rate_update_seconds * HZ); | 243 | dc->writeback_rate_update_seconds * HZ); |
244 | } | 244 | } |
245 | } | 245 | } |
246 | 246 | ||
247 | void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, | 247 | void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, |
248 | uint64_t offset, int nr_sectors) | 248 | uint64_t offset, int nr_sectors) |
249 | { | 249 | { |
250 | struct bcache_device *d = c->devices[inode]; | 250 | struct bcache_device *d = c->devices[inode]; |
251 | unsigned stripe_size, stripe_offset; | 251 | unsigned stripe_size, stripe_offset; |
252 | uint64_t stripe; | 252 | uint64_t stripe; |
253 | 253 | ||
254 | if (!d) | 254 | if (!d) |
255 | return; | 255 | return; |
256 | 256 | ||
257 | stripe_size = 1 << d->stripe_size_bits; | 257 | stripe_size = 1 << d->stripe_size_bits; |
258 | stripe = offset >> d->stripe_size_bits; | 258 | stripe = offset >> d->stripe_size_bits; |
259 | stripe_offset = offset & (stripe_size - 1); | 259 | stripe_offset = offset & (stripe_size - 1); |
260 | 260 | ||
261 | while (nr_sectors) { | 261 | while (nr_sectors) { |
262 | int s = min_t(unsigned, abs(nr_sectors), | 262 | int s = min_t(unsigned, abs(nr_sectors), |
263 | stripe_size - stripe_offset); | 263 | stripe_size - stripe_offset); |
264 | 264 | ||
265 | if (nr_sectors < 0) | 265 | if (nr_sectors < 0) |
266 | s = -s; | 266 | s = -s; |
267 | 267 | ||
268 | atomic_add(s, d->stripe_sectors_dirty + stripe); | 268 | atomic_add(s, d->stripe_sectors_dirty + stripe); |
269 | nr_sectors -= s; | 269 | nr_sectors -= s; |
270 | stripe_offset = 0; | 270 | stripe_offset = 0; |
271 | stripe++; | 271 | stripe++; |
272 | } | 272 | } |
273 | } | 273 | } |
274 | 274 | ||
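bcache_dev_sectors_dirty_add() splits an (offset, nr_sectors) range across per-stripe dirty counters, handling a partial first stripe and negative counts for clearing. A userspace sketch of the splitting arithmetic with made-up numbers (8-sector stripes, a range starting mid-stripe):

/* Userspace sketch of splitting a dirty range across stripe counters. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	unsigned stripe_size_bits = 3;			/* 8-sector stripes    */
	unsigned stripe_size = 1U << stripe_size_bits;
	unsigned long long offset = 13;			/* starts mid-stripe   */
	int nr_sectors = 20;				/* negative would clear */

	unsigned long long stripe = offset >> stripe_size_bits;
	unsigned stripe_offset = offset & (stripe_size - 1);

	while (nr_sectors) {
		/* sectors that fall into this stripe */
		int s = abs(nr_sectors);

		if (s > (int) (stripe_size - stripe_offset))
			s = stripe_size - stripe_offset;
		if (nr_sectors < 0)
			s = -s;

		printf("stripe %llu: add %d sectors\n", stripe, s);

		nr_sectors -= s;
		stripe_offset = 0;
		stripe++;
	}
	return 0;
}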
275 | /* Background writeback - IO loop */ | 275 | /* Background writeback - IO loop */ |
276 | 276 | ||
277 | static void dirty_io_destructor(struct closure *cl) | 277 | static void dirty_io_destructor(struct closure *cl) |
278 | { | 278 | { |
279 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | 279 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); |
280 | kfree(io); | 280 | kfree(io); |
281 | } | 281 | } |
282 | 282 | ||
283 | static void write_dirty_finish(struct closure *cl) | 283 | static void write_dirty_finish(struct closure *cl) |
284 | { | 284 | { |
285 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | 285 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); |
286 | struct keybuf_key *w = io->bio.bi_private; | 286 | struct keybuf_key *w = io->bio.bi_private; |
287 | struct cached_dev *dc = io->dc; | 287 | struct cached_dev *dc = io->dc; |
288 | struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt); | 288 | struct bio_vec *bv; |
289 | int i; | ||
289 | 290 | ||
290 | while (bv-- != io->bio.bi_io_vec) | 291 | bio_for_each_segment_all(bv, &io->bio, i) |
291 | __free_page(bv->bv_page); | 292 | __free_page(bv->bv_page); |
292 | 293 | ||
293 | /* This is kind of a dumb way of signalling errors. */ | 294 | /* This is kind of a dumb way of signalling errors. */ |
294 | if (KEY_DIRTY(&w->key)) { | 295 | if (KEY_DIRTY(&w->key)) { |
295 | unsigned i; | 296 | unsigned i; |
296 | struct btree_op op; | 297 | struct btree_op op; |
297 | bch_btree_op_init_stack(&op); | 298 | bch_btree_op_init_stack(&op); |
298 | 299 | ||
299 | op.type = BTREE_REPLACE; | 300 | op.type = BTREE_REPLACE; |
300 | bkey_copy(&op.replace, &w->key); | 301 | bkey_copy(&op.replace, &w->key); |
301 | 302 | ||
302 | SET_KEY_DIRTY(&w->key, false); | 303 | SET_KEY_DIRTY(&w->key, false); |
303 | bch_keylist_add(&op.keys, &w->key); | 304 | bch_keylist_add(&op.keys, &w->key); |
304 | 305 | ||
305 | for (i = 0; i < KEY_PTRS(&w->key); i++) | 306 | for (i = 0; i < KEY_PTRS(&w->key); i++) |
306 | atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); | 307 | atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); |
307 | 308 | ||
308 | bch_btree_insert(&op, dc->disk.c); | 309 | bch_btree_insert(&op, dc->disk.c); |
309 | closure_sync(&op.cl); | 310 | closure_sync(&op.cl); |
310 | 311 | ||
311 | if (op.insert_collision) | 312 | if (op.insert_collision) |
312 | trace_bcache_writeback_collision(&w->key); | 313 | trace_bcache_writeback_collision(&w->key); |
313 | 314 | ||
314 | atomic_long_inc(op.insert_collision | 315 | atomic_long_inc(op.insert_collision |
315 | ? &dc->disk.c->writeback_keys_failed | 316 | ? &dc->disk.c->writeback_keys_failed |
316 | : &dc->disk.c->writeback_keys_done); | 317 | : &dc->disk.c->writeback_keys_done); |
317 | } | 318 | } |
318 | 319 | ||
319 | bch_keybuf_del(&dc->writeback_keys, w); | 320 | bch_keybuf_del(&dc->writeback_keys, w); |
320 | atomic_dec_bug(&dc->in_flight); | 321 | atomic_dec_bug(&dc->in_flight); |
321 | 322 | ||
322 | closure_wake_up(&dc->writeback_wait); | 323 | closure_wake_up(&dc->writeback_wait); |
323 | 324 | ||
324 | closure_return_with_destructor(cl, dirty_io_destructor); | 325 | closure_return_with_destructor(cl, dirty_io_destructor); |
325 | } | 326 | } |
326 | 327 | ||
327 | static void dirty_endio(struct bio *bio, int error) | 328 | static void dirty_endio(struct bio *bio, int error) |
328 | { | 329 | { |
329 | struct keybuf_key *w = bio->bi_private; | 330 | struct keybuf_key *w = bio->bi_private; |
330 | struct dirty_io *io = w->private; | 331 | struct dirty_io *io = w->private; |
331 | 332 | ||
332 | if (error) | 333 | if (error) |
333 | SET_KEY_DIRTY(&w->key, false); | 334 | SET_KEY_DIRTY(&w->key, false); |
334 | 335 | ||
335 | closure_put(&io->cl); | 336 | closure_put(&io->cl); |
336 | } | 337 | } |
337 | 338 | ||
338 | static void write_dirty(struct closure *cl) | 339 | static void write_dirty(struct closure *cl) |
339 | { | 340 | { |
340 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | 341 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); |
341 | struct keybuf_key *w = io->bio.bi_private; | 342 | struct keybuf_key *w = io->bio.bi_private; |
342 | 343 | ||
343 | dirty_init(w); | 344 | dirty_init(w); |
344 | io->bio.bi_rw = WRITE; | 345 | io->bio.bi_rw = WRITE; |
345 | io->bio.bi_sector = KEY_START(&w->key); | 346 | io->bio.bi_sector = KEY_START(&w->key); |
346 | io->bio.bi_bdev = io->dc->bdev; | 347 | io->bio.bi_bdev = io->dc->bdev; |
347 | io->bio.bi_end_io = dirty_endio; | 348 | io->bio.bi_end_io = dirty_endio; |
348 | 349 | ||
349 | closure_bio_submit(&io->bio, cl, &io->dc->disk); | 350 | closure_bio_submit(&io->bio, cl, &io->dc->disk); |
350 | 351 | ||
351 | continue_at(cl, write_dirty_finish, dirty_wq); | 352 | continue_at(cl, write_dirty_finish, dirty_wq); |
352 | } | 353 | } |
353 | 354 | ||
354 | static void read_dirty_endio(struct bio *bio, int error) | 355 | static void read_dirty_endio(struct bio *bio, int error) |
355 | { | 356 | { |
356 | struct keybuf_key *w = bio->bi_private; | 357 | struct keybuf_key *w = bio->bi_private; |
357 | struct dirty_io *io = w->private; | 358 | struct dirty_io *io = w->private; |
358 | 359 | ||
359 | bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), | 360 | bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), |
360 | error, "reading dirty data from cache"); | 361 | error, "reading dirty data from cache"); |
361 | 362 | ||
362 | dirty_endio(bio, error); | 363 | dirty_endio(bio, error); |
363 | } | 364 | } |
364 | 365 | ||
365 | static void read_dirty_submit(struct closure *cl) | 366 | static void read_dirty_submit(struct closure *cl) |
366 | { | 367 | { |
367 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | 368 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); |
368 | 369 | ||
369 | closure_bio_submit(&io->bio, cl, &io->dc->disk); | 370 | closure_bio_submit(&io->bio, cl, &io->dc->disk); |
370 | 371 | ||
371 | continue_at(cl, write_dirty, dirty_wq); | 372 | continue_at(cl, write_dirty, dirty_wq); |
372 | } | 373 | } |
373 | 374 | ||
374 | static void read_dirty(struct closure *cl) | 375 | static void read_dirty(struct closure *cl) |
375 | { | 376 | { |
376 | struct cached_dev *dc = container_of(cl, struct cached_dev, | 377 | struct cached_dev *dc = container_of(cl, struct cached_dev, |
377 | writeback.cl); | 378 | writeback.cl); |
378 | unsigned delay = writeback_delay(dc, 0); | 379 | unsigned delay = writeback_delay(dc, 0); |
379 | struct keybuf_key *w; | 380 | struct keybuf_key *w; |
380 | struct dirty_io *io; | 381 | struct dirty_io *io; |
381 | 382 | ||
382 | /* | 383 | /* |
383 | * XXX: if we error, background writeback just spins. Should use some | 384 | * XXX: if we error, background writeback just spins. Should use some |
384 | * mempools. | 385 | * mempools. |
385 | */ | 386 | */ |
386 | 387 | ||
387 | while (1) { | 388 | while (1) { |
388 | w = bch_keybuf_next(&dc->writeback_keys); | 389 | w = bch_keybuf_next(&dc->writeback_keys); |
389 | if (!w) | 390 | if (!w) |
390 | break; | 391 | break; |
391 | 392 | ||
392 | BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); | 393 | BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); |
393 | 394 | ||
394 | if (delay > 0 && | 395 | if (delay > 0 && |
395 | (KEY_START(&w->key) != dc->last_read || | 396 | (KEY_START(&w->key) != dc->last_read || |
396 | jiffies_to_msecs(delay) > 50)) { | 397 | jiffies_to_msecs(delay) > 50)) { |
397 | w->private = NULL; | 398 | w->private = NULL; |
398 | 399 | ||
399 | closure_delay(&dc->writeback, delay); | 400 | closure_delay(&dc->writeback, delay); |
400 | continue_at(cl, read_dirty, dirty_wq); | 401 | continue_at(cl, read_dirty, dirty_wq); |
401 | } | 402 | } |
402 | 403 | ||
403 | dc->last_read = KEY_OFFSET(&w->key); | 404 | dc->last_read = KEY_OFFSET(&w->key); |
404 | 405 | ||
405 | io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) | 406 | io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) |
406 | * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), | 407 | * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), |
407 | GFP_KERNEL); | 408 | GFP_KERNEL); |
408 | if (!io) | 409 | if (!io) |
409 | goto err; | 410 | goto err; |
410 | 411 | ||
411 | w->private = io; | 412 | w->private = io; |
412 | io->dc = dc; | 413 | io->dc = dc; |
413 | 414 | ||
414 | dirty_init(w); | 415 | dirty_init(w); |
415 | io->bio.bi_sector = PTR_OFFSET(&w->key, 0); | 416 | io->bio.bi_sector = PTR_OFFSET(&w->key, 0); |
416 | io->bio.bi_bdev = PTR_CACHE(dc->disk.c, | 417 | io->bio.bi_bdev = PTR_CACHE(dc->disk.c, |
417 | &w->key, 0)->bdev; | 418 | &w->key, 0)->bdev; |
418 | io->bio.bi_rw = READ; | 419 | io->bio.bi_rw = READ; |
419 | io->bio.bi_end_io = read_dirty_endio; | 420 | io->bio.bi_end_io = read_dirty_endio; |
420 | 421 | ||
421 | if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) | 422 | if (bio_alloc_pages(&io->bio, GFP_KERNEL)) |
422 | goto err_free; | 423 | goto err_free; |
423 | 424 | ||
424 | trace_bcache_writeback(&w->key); | 425 | trace_bcache_writeback(&w->key); |
425 | 426 | ||
426 | closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); | 427 | closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); |
427 | 428 | ||
428 | delay = writeback_delay(dc, KEY_SIZE(&w->key)); | 429 | delay = writeback_delay(dc, KEY_SIZE(&w->key)); |
429 | 430 | ||
430 | atomic_inc(&dc->in_flight); | 431 | atomic_inc(&dc->in_flight); |
431 | 432 | ||
432 | if (!closure_wait_event(&dc->writeback_wait, cl, | 433 | if (!closure_wait_event(&dc->writeback_wait, cl, |
433 | atomic_read(&dc->in_flight) < 64)) | 434 | atomic_read(&dc->in_flight) < 64)) |
434 | continue_at(cl, read_dirty, dirty_wq); | 435 | continue_at(cl, read_dirty, dirty_wq); |
435 | } | 436 | } |
436 | 437 | ||
437 | if (0) { | 438 | if (0) { |
438 | err_free: | 439 | err_free: |
439 | kfree(w->private); | 440 | kfree(w->private); |
440 | err: | 441 | err: |
441 | bch_keybuf_del(&dc->writeback_keys, w); | 442 | bch_keybuf_del(&dc->writeback_keys, w); |
442 | } | 443 | } |
443 | 444 | ||
444 | refill_dirty(cl); | 445 | refill_dirty(cl); |
445 | } | 446 | } |
446 | 447 | ||
447 | /* Init */ | 448 | /* Init */ |
448 | 449 | ||
449 | static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op, | 450 | static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op, |
450 | struct cached_dev *dc) | 451 | struct cached_dev *dc) |
451 | { | 452 | { |
452 | struct bkey *k; | 453 | struct bkey *k; |
453 | struct btree_iter iter; | 454 | struct btree_iter iter; |
454 | 455 | ||
455 | bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0)); | 456 | bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0)); |
456 | while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) | 457 | while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) |
457 | if (!b->level) { | 458 | if (!b->level) { |
458 | if (KEY_INODE(k) > dc->disk.id) | 459 | if (KEY_INODE(k) > dc->disk.id) |
459 | break; | 460 | break; |
460 | 461 | ||
461 | if (KEY_DIRTY(k)) | 462 | if (KEY_DIRTY(k)) |
462 | bcache_dev_sectors_dirty_add(b->c, dc->disk.id, | 463 | bcache_dev_sectors_dirty_add(b->c, dc->disk.id, |
463 | KEY_START(k), | 464 | KEY_START(k), |
464 | KEY_SIZE(k)); | 465 | KEY_SIZE(k)); |
465 | } else { | 466 | } else { |
466 | btree(sectors_dirty_init, k, b, op, dc); | 467 | btree(sectors_dirty_init, k, b, op, dc); |
467 | if (KEY_INODE(k) > dc->disk.id) | 468 | if (KEY_INODE(k) > dc->disk.id) |
468 | break; | 469 | break; |
469 | 470 | ||
470 | cond_resched(); | 471 | cond_resched(); |
471 | } | 472 | } |
472 | 473 | ||
473 | return 0; | 474 | return 0; |
474 | } | 475 | } |
475 | 476 | ||
476 | void bch_sectors_dirty_init(struct cached_dev *dc) | 477 | void bch_sectors_dirty_init(struct cached_dev *dc) |
477 | { | 478 | { |
478 | struct btree_op op; | 479 | struct btree_op op; |
479 | 480 | ||
480 | bch_btree_op_init_stack(&op); | 481 | bch_btree_op_init_stack(&op); |
481 | btree_root(sectors_dirty_init, dc->disk.c, &op, dc); | 482 | btree_root(sectors_dirty_init, dc->disk.c, &op, dc); |
482 | } | 483 | } |
483 | 484 | ||
484 | void bch_cached_dev_writeback_init(struct cached_dev *dc) | 485 | void bch_cached_dev_writeback_init(struct cached_dev *dc) |
485 | { | 486 | { |
486 | closure_init_unlocked(&dc->writeback); | 487 | closure_init_unlocked(&dc->writeback); |
487 | init_rwsem(&dc->writeback_lock); | 488 | init_rwsem(&dc->writeback_lock); |
488 | 489 | ||
489 | bch_keybuf_init(&dc->writeback_keys); | 490 | bch_keybuf_init(&dc->writeback_keys); |
490 | 491 | ||
491 | dc->writeback_metadata = true; | 492 | dc->writeback_metadata = true; |
492 | dc->writeback_running = true; | 493 | dc->writeback_running = true; |
493 | dc->writeback_percent = 10; | 494 | dc->writeback_percent = 10; |
494 | dc->writeback_delay = 30; | 495 | dc->writeback_delay = 30; |
495 | dc->writeback_rate.rate = 1024; | 496 | dc->writeback_rate.rate = 1024; |
496 | 497 | ||
497 | dc->writeback_rate_update_seconds = 30; | 498 | dc->writeback_rate_update_seconds = 30; |
498 | dc->writeback_rate_d_term = 16; | 499 | dc->writeback_rate_d_term = 16; |
499 | dc->writeback_rate_p_term_inverse = 64; | 500 | dc->writeback_rate_p_term_inverse = 64; |
500 | dc->writeback_rate_d_smooth = 8; | 501 | dc->writeback_rate_d_smooth = 8; |
501 | 502 | ||
502 | INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); | 503 | INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); |
503 | schedule_delayed_work(&dc->writeback_rate_update, | 504 | schedule_delayed_work(&dc->writeback_rate_update, |
504 | dc->writeback_rate_update_seconds * HZ); | 505 | dc->writeback_rate_update_seconds * HZ); |
505 | } | 506 | } |
506 | 507 | ||
507 | void bch_writeback_exit(void) | 508 | void bch_writeback_exit(void) |
508 | { | 509 | { |
509 | if (dirty_wq) | 510 | if (dirty_wq) |
510 | destroy_workqueue(dirty_wq); | 511 | destroy_workqueue(dirty_wq); |
511 | } | 512 | } |
512 | 513 | ||
513 | int __init bch_writeback_init(void) | 514 | int __init bch_writeback_init(void) |
514 | { | 515 | { |
515 | dirty_wq = create_singlethread_workqueue("bcache_writeback"); | 516 | dirty_wq = create_singlethread_workqueue("bcache_writeback"); |
516 | if (!dirty_wq) | 517 | if (!dirty_wq) |
517 | return -ENOMEM; | 518 | return -ENOMEM; |
518 | 519 | ||
519 | return 0; | 520 | return 0; |
520 | } | 521 | } |
521 | 522 |