Commit 72c270612bd33192fa836ad0f2939af1ca218292
Parent: 279afbad4e
Exists in smarc-imx_3.14.28_1.0.0_ga and in 1 other branch
bcache: Write out full stripes
Now that we're tracking dirty data per stripe, we can add two optimizations
for raid5/6:

* If a stripe is already dirty, force writes to that stripe to writeback
  mode - to help build up full stripes of dirty data

* When flushing dirty data, preferentially write out full stripes first if
  there are any.

Signed-off-by: Kent Overstreet <koverstreet@google.com>
Showing 9 changed files with 121 additions and 37 deletions. Inline diff:
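Before the diff itself, a minimal userspace model of the two optimizations from the commit message may help. This is not the commit's code: the plain counter array, STRIPE_SECTORS, should_writeback() and pick_full_stripe() are hypothetical stand-ins for the kernel's per-stripe dirty accounting (the stripe_sectors_dirty array declared in struct bcache_device below) and for the writeback refill logic; it only illustrates the decision being added.

/*
 * Illustrative sketch only - not from the commit. Models "force writes to an
 * already-dirty stripe into writeback" and "flush full stripes first" on top
 * of a plain array of per-stripe dirty-sector counts.
 */
#include <stdbool.h>
#include <stdio.h>

#define STRIPE_SECTORS	64	/* hypothetical stripe size, in sectors */
#define NR_STRIPES	8

static unsigned stripe_dirty[NR_STRIPES];	/* dirty sectors per stripe */

/* Optimization 1: a write touching an already-dirty stripe goes to writeback */
static bool should_writeback(unsigned long long sector)
{
	return stripe_dirty[(sector / STRIPE_SECTORS) % NR_STRIPES] > 0;
}

/* Optimization 2: when flushing dirty data, pick a completely dirty stripe first */
static int pick_full_stripe(void)
{
	for (int i = 0; i < NR_STRIPES; i++)
		if (stripe_dirty[i] == STRIPE_SECTORS)
			return i;
	return -1;	/* none full: fall back to the normal dirty-key scan */
}

int main(void)
{
	stripe_dirty[2] = STRIPE_SECTORS;	/* fully dirty stripe */
	stripe_dirty[5] = 10;			/* partially dirty stripe */

	printf("write at sector 323 forced to writeback: %d\n",
	       should_writeback(5 * STRIPE_SECTORS + 3));
	printf("full stripe to flush first: %d\n", pick_full_stripe());
	return 0;
}

In the kernel the counters are atomics, presumably indexed via stripe_size_bits from struct bcache_device, and the new partial_stripes_expensive bit added to struct cached_dev in this commit is what lets a backing device such as raid5/6 opt into this behaviour.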
drivers/md/bcache/bcache.h
#ifndef _BCACHE_H
#define _BCACHE_H

/*
 * SOME HIGH LEVEL CODE DOCUMENTATION:
 *
 * Bcache mostly works with cache sets, cache devices, and backing devices.
 *
 * Support for multiple cache devices hasn't quite been finished off yet, but
 * it's about 95% plumbed through. A cache set and its cache devices is sort of
 * like a md raid array and its component devices. Most of the code doesn't care
 * about individual cache devices, the main abstraction is the cache set.
 *
 * Multiple cache devices is intended to give us the ability to mirror dirty
 * cached data and metadata, without mirroring clean cached data.
 *
 * Backing devices are different, in that they have a lifetime independent of a
 * cache set. When you register a newly formatted backing device it'll come up
 * in passthrough mode, and then you can attach and detach a backing device from
 * a cache set at runtime - while it's mounted and in use. Detaching implicitly
 * invalidates any cached data for that backing device.
 *
 * A cache set can have multiple (many) backing devices attached to it.
 *
 * There's also flash only volumes - this is the reason for the distinction
 * between struct cached_dev and struct bcache_device. A flash only volume
 * works much like a bcache device that has a backing device, except the
 * "cached" data is always dirty. The end result is that we get thin
 * provisioning with very little additional code.
 *
 * Flash only volumes work but they're not production ready because the moving
 * garbage collector needs more work. More on that later.
 *
 * BUCKETS/ALLOCATION:
 *
 * Bcache is primarily designed for caching, which means that in normal
 * operation all of our available space will be allocated. Thus, we need an
 * efficient way of deleting things from the cache so we can write new things to
 * it.
 *
 * To do this, we first divide the cache device up into buckets. A bucket is the
 * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
 * works efficiently.
 *
 * Each bucket has a 16 bit priority, and an 8 bit generation associated with
 * it. The gens and priorities for all the buckets are stored contiguously and
 * packed on disk (in a linked list of buckets - aside from the superblock, all
 * of bcache's metadata is stored in buckets).
 *
 * The priority is used to implement an LRU. We reset a bucket's priority when
 * we allocate it or on cache it, and every so often we decrement the priority
 * of each bucket. It could be used to implement something more sophisticated,
 * if anyone ever gets around to it.
 *
 * The generation is used for invalidating buckets. Each pointer also has an 8
 * bit generation embedded in it; for a pointer to be considered valid, its gen
 * must match the gen of the bucket it points into. Thus, to reuse a bucket all
 * we have to do is increment its gen (and write its new gen to disk; we batch
 * this up).
 *
 * Bcache is entirely COW - we never write twice to a bucket, even buckets that
 * contain metadata (including btree nodes).
 *
 * THE BTREE:
 *
 * Bcache is in large part design around the btree.
 *
 * At a high level, the btree is just an index of key -> ptr tuples.
 *
 * Keys represent extents, and thus have a size field. Keys also have a variable
 * number of pointers attached to them (potentially zero, which is handy for
 * invalidating the cache).
 *
 * The key itself is an inode:offset pair. The inode number corresponds to a
 * backing device or a flash only volume. The offset is the ending offset of the
 * extent within the inode - not the starting offset; this makes lookups
 * slightly more convenient.
 *
 * Pointers contain the cache device id, the offset on that device, and an 8 bit
 * generation number. More on the gen later.
 *
 * Index lookups are not fully abstracted - cache lookups in particular are
 * still somewhat mixed in with the btree code, but things are headed in that
 * direction.
 *
 * Updates are fairly well abstracted, though. There are two different ways of
 * updating the btree; insert and replace.
 *
 * BTREE_INSERT will just take a list of keys and insert them into the btree -
 * overwriting (possibly only partially) any extents they overlap with. This is
 * used to update the index after a write.
 *
 * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
 * overwriting a key that matches another given key. This is used for inserting
 * data into the cache after a cache miss, and for background writeback, and for
 * the moving garbage collector.
 *
 * There is no "delete" operation; deleting things from the index is
 * accomplished by either by invalidating pointers (by incrementing a bucket's
 * gen) or by inserting a key with 0 pointers - which will overwrite anything
 * previously present at that location in the index.
 *
 * This means that there are always stale/invalid keys in the btree. They're
 * filtered out by the code that iterates through a btree node, and removed when
 * a btree node is rewritten.
 *
 * BTREE NODES:
 *
 * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and
 * free smaller than a bucket - so, that's how big our btree nodes are.
 *
 * (If buckets are really big we'll only use part of the bucket for a btree node
 * - no less than 1/4th - but a bucket still contains no more than a single
 * btree node. I'd actually like to change this, but for now we rely on the
 * bucket's gen for deleting btree nodes when we rewrite/split a node.)
 *
 * Anyways, btree nodes are big - big enough to be inefficient with a textbook
 * btree implementation.
 *
 * The way this is solved is that btree nodes are internally log structured; we
 * can append new keys to an existing btree node without rewriting it. This
 * means each set of keys we write is sorted, but the node is not.
 *
 * We maintain this log structure in memory - keeping 1Mb of keys sorted would
 * be expensive, and we have to distinguish between the keys we have written and
 * the keys we haven't. So to do a lookup in a btree node, we have to search
 * each sorted set. But we do merge written sets together lazily, so the cost of
 * these extra searches is quite low (normally most of the keys in a btree node
 * will be in one big set, and then there'll be one or two sets that are much
 * smaller).
 *
 * This log structure makes bcache's btree more of a hybrid between a
 * conventional btree and a compacting data structure, with some of the
 * advantages of both.
 *
 * GARBAGE COLLECTION:
 *
 * We can't just invalidate any bucket - it might contain dirty data or
 * metadata. If it once contained dirty data, other writes might overwrite it
 * later, leaving no valid pointers into that bucket in the index.
 *
 * Thus, the primary purpose of garbage collection is to find buckets to reuse.
 * It also counts how much valid data it each bucket currently contains, so that
 * allocation can reuse buckets sooner when they've been mostly overwritten.
 *
 * It also does some things that are really internal to the btree
 * implementation. If a btree node contains pointers that are stale by more than
 * some threshold, it rewrites the btree node to avoid the bucket's generation
 * wrapping around. It also merges adjacent btree nodes if they're empty enough.
 *
 * THE JOURNAL:
 *
 * Bcache's journal is not necessary for consistency; we always strictly
 * order metadata writes so that the btree and everything else is consistent on
 * disk in the event of an unclean shutdown, and in fact bcache had writeback
 * caching (with recovery from unclean shutdown) before journalling was
 * implemented.
 *
 * Rather, the journal is purely a performance optimization; we can't complete a
 * write until we've updated the index on disk, otherwise the cache would be
 * inconsistent in the event of an unclean shutdown. This means that without the
 * journal, on random write workloads we constantly have to update all the leaf
 * nodes in the btree, and those writes will be mostly empty (appending at most
 * a few keys each) - highly inefficient in terms of amount of metadata writes,
 * and it puts more strain on the various btree resorting/compacting code.
 *
 * The journal is just a log of keys we've inserted; on startup we just reinsert
 * all the keys in the open journal entries. That means that when we're updating
 * a node in the btree, we can wait until a 4k block of keys fills up before
 * writing them out.
 *
 * For simplicity, we only journal updates to leaf nodes; updates to parent
 * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
 * the complexity to deal with journalling them (in particular, journal replay)
 * - updates to non leaf nodes just happen synchronously (see btree_split()).
 */

#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__

#include <linux/bio.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/rwsem.h>
#include <linux/types.h>
#include <linux/workqueue.h>

#include "util.h"
#include "closure.h"

struct bucket {
	atomic_t pin;
	uint16_t prio;
	uint8_t gen;
	uint8_t disk_gen;
	uint8_t last_gc; /* Most out of date gen in the btree */
	uint8_t gc_gen;
	uint16_t gc_mark;
};

/*
 * I'd use bitfields for these, but I don't trust the compiler not to screw me
 * as multiple threads touch struct bucket without locking
 */

BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2);
#define GC_MARK_RECLAIMABLE 0
#define GC_MARK_DIRTY 1
#define GC_MARK_METADATA 2
BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14);

struct bkey {
	uint64_t high;
	uint64_t low;
	uint64_t ptr[];
};

/* Enough for a key with 6 pointers */
#define BKEY_PAD 8

#define BKEY_PADDED(key) \
	union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; }

/* Version 0: Cache device
 * Version 1: Backing device
 * Version 2: Seed pointer into btree node checksum
 * Version 3: Cache device with new UUID format
 * Version 4: Backing device with data offset
 */
#define BCACHE_SB_VERSION_CDEV 0
#define BCACHE_SB_VERSION_BDEV 1
#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3
#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4
#define BCACHE_SB_MAX_VERSION 4

#define SB_SECTOR 8
#define SB_SIZE 4096
#define SB_LABEL_SIZE 32
#define SB_JOURNAL_BUCKETS 256U
/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */
#define MAX_CACHES_PER_SET 8

#define BDEV_DATA_START_DEFAULT 16 /* sectors */

struct cache_sb {
	uint64_t csum;
	uint64_t offset; /* sector where this sb was written */
	uint64_t version;

	uint8_t magic[16];

	uint8_t uuid[16];
	union {
		uint8_t set_uuid[16];
		uint64_t set_magic;
	};
	uint8_t label[SB_LABEL_SIZE];

	uint64_t flags;
	uint64_t seq;
	uint64_t pad[8];

	union {
		struct {
			/* Cache devices */
			uint64_t nbuckets; /* device size */

			uint16_t block_size; /* sectors */
			uint16_t bucket_size; /* sectors */

			uint16_t nr_in_set;
			uint16_t nr_this_dev;
		};
		struct {
			/* Backing devices */
			uint64_t data_offset;

			/*
			 * block_size from the cache device section is still used by
			 * backing devices, so don't add anything here until we fix
			 * things to not need it for backing devices anymore
			 */
		};
	};

	uint32_t last_mount; /* time_t */

	uint16_t first_bucket;
	union {
		uint16_t njournal_buckets;
		uint16_t keys;
	};
	uint64_t d[SB_JOURNAL_BUCKETS]; /* journal buckets */
};

BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1);
BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1);
BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3);
#define CACHE_REPLACEMENT_LRU 0U
#define CACHE_REPLACEMENT_FIFO 1U
#define CACHE_REPLACEMENT_RANDOM 2U

BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4);
#define CACHE_MODE_WRITETHROUGH 0U
#define CACHE_MODE_WRITEBACK 1U
#define CACHE_MODE_WRITEAROUND 2U
#define CACHE_MODE_NONE 3U
BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2);
#define BDEV_STATE_NONE 0U
#define BDEV_STATE_CLEAN 1U
#define BDEV_STATE_DIRTY 2U
#define BDEV_STATE_STALE 3U

/* Version 1: Seed pointer into btree node checksum
 */
#define BCACHE_BSET_VERSION 1

/*
 * This is the on disk format for btree nodes - a btree node on disk is a list
 * of these; within each set the keys are sorted
 */
struct bset {
	uint64_t csum;
	uint64_t magic;
	uint64_t seq;
	uint32_t version;
	uint32_t keys;

	union {
		struct bkey start[0];
		uint64_t d[0];
	};
};

/*
 * On disk format for priorities and gens - see super.c near prio_write() for
 * more.
 */
struct prio_set {
	uint64_t csum;
	uint64_t magic;
	uint64_t seq;
	uint32_t version;
	uint32_t pad;

	uint64_t next_bucket;

	struct bucket_disk {
		uint16_t prio;
		uint8_t gen;
	} __attribute((packed)) data[];
};

struct uuid_entry {
	union {
		struct {
			uint8_t uuid[16];
			uint8_t label[32];
			uint32_t first_reg;
			uint32_t last_reg;
			uint32_t invalidated;

			uint32_t flags;
			/* Size of flash only volumes */
			uint64_t sectors;
		};

		uint8_t pad[128];
	};
};

BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1);

#include "journal.h"
#include "stats.h"
struct search;
struct btree;
struct keybuf;

struct keybuf_key {
	struct rb_node node;
	BKEY_PADDED(key);
	void *private;
};

typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);

struct keybuf {
-	keybuf_pred_fn *key_predicate;
-
	struct bkey last_scanned;
	spinlock_t lock;

	/*
	 * Beginning and end of range in rb tree - so that we can skip taking
	 * lock and checking the rb tree when we need to check for overlapping
	 * keys.
	 */
	struct bkey start;
	struct bkey end;

	struct rb_root keys;

#define KEYBUF_NR 100
	DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR);
};

struct bio_split_pool {
	struct bio_set *bio_split;
	mempool_t *bio_split_hook;
};

struct bio_split_hook {
	struct closure cl;
	struct bio_split_pool *p;
	struct bio *bio;
	bio_end_io_t *bi_end_io;
	void *bi_private;
};

struct bcache_device {
	struct closure cl;

	struct kobject kobj;

	struct cache_set *c;
	unsigned id;
#define BCACHEDEVNAME_SIZE 12
	char name[BCACHEDEVNAME_SIZE];

	struct gendisk *disk;

	/* If nonzero, we're closing */
	atomic_t closing;

	/* If nonzero, we're detaching/unregistering from cache set */
	atomic_t detaching;

	uint64_t nr_stripes;
	unsigned stripe_size_bits;
	atomic_t *stripe_sectors_dirty;

	unsigned long sectors_dirty_last;
	long sectors_dirty_derivative;

	mempool_t *unaligned_bvec;
	struct bio_set *bio_split;

	unsigned data_csum:1;

	int (*cache_miss)(struct btree *, struct search *,
			  struct bio *, unsigned);
	int (*ioctl) (struct bcache_device *, fmode_t, unsigned, unsigned long);

	struct bio_split_pool bio_split_hook;
};

struct io {
	/* Used to track sequential IO so it can be skipped */
	struct hlist_node hash;
	struct list_head lru;

	unsigned long jiffies;
	unsigned sequential;
	sector_t last;
};

struct cached_dev {
	struct list_head list;
	struct bcache_device disk;
	struct block_device *bdev;

	struct cache_sb sb;
	struct bio sb_bio;
	struct bio_vec sb_bv[1];
	struct closure_with_waitlist sb_write;

	/* Refcount on the cache set. Always nonzero when we're caching. */
	atomic_t count;
	struct work_struct detach;

	/*
	 * Device might not be running if it's dirty and the cache set hasn't
	 * showed up yet.
	 */
	atomic_t running;

	/*
	 * Writes take a shared lock from start to finish; scanning for dirty
	 * data to refill the rb tree requires an exclusive lock.
	 */
	struct rw_semaphore writeback_lock;

	/*
	 * Nonzero, and writeback has a refcount (d->count), iff there is dirty
	 * data in the cache. Protected by writeback_lock; must have an
	 * shared lock to set and exclusive lock to clear.
	 */
	atomic_t has_dirty;

	struct ratelimit writeback_rate;
	struct delayed_work writeback_rate_update;

	/*
	 * Internal to the writeback code, so read_dirty() can keep track of
	 * where it's at.
	 */
	sector_t last_read;

	/* Number of writeback bios in flight */
	atomic_t in_flight;
	struct closure_with_timer writeback;
	struct closure_waitlist writeback_wait;

	struct keybuf writeback_keys;

	/* For tracking sequential IO */
#define RECENT_IO_BITS 7
#define RECENT_IO (1 << RECENT_IO_BITS)
	struct io io[RECENT_IO];
	struct hlist_head io_hash[RECENT_IO + 1];
	struct list_head io_lru;
	spinlock_t io_lock;

	struct cache_accounting accounting;

	/* The rest of this all shows up in sysfs */
	unsigned sequential_cutoff;
	unsigned readahead;

	unsigned sequential_merge:1;
	unsigned verify:1;

+	unsigned partial_stripes_expensive:1;
	unsigned writeback_metadata:1;
	unsigned writeback_running:1;
	unsigned char writeback_percent;
	unsigned writeback_delay;

	int writeback_rate_change;
	int64_t writeback_rate_derivative;
	uint64_t writeback_rate_target;

	unsigned writeback_rate_update_seconds;
	unsigned writeback_rate_d_term;
	unsigned writeback_rate_p_term_inverse;
	unsigned writeback_rate_d_smooth;
};

enum alloc_watermarks {
	WATERMARK_PRIO,
	WATERMARK_METADATA,
	WATERMARK_MOVINGGC,
	WATERMARK_NONE,
	WATERMARK_MAX
};

struct cache {
	struct cache_set *set;
	struct cache_sb sb;
	struct bio sb_bio;
	struct bio_vec sb_bv[1];

	struct kobject kobj;
	struct block_device *bdev;

	unsigned watermark[WATERMARK_MAX];

	struct task_struct *alloc_thread;

	struct closure prio;
	struct prio_set *disk_buckets;

	/*
	 * When allocating new buckets, prio_write() gets first dibs - since we
	 * may not be allocate at all without writing priorities and gens.
	 * prio_buckets[] contains the last buckets we wrote priorities to (so
	 * gc can mark them as metadata), prio_next[] contains the buckets
	 * allocated for the next prio write.
	 */
	uint64_t *prio_buckets;
	uint64_t *prio_last_buckets;

	/*
	 * free: Buckets that are ready to be used
	 *
	 * free_inc: Incoming buckets - these are buckets that currently have
	 * cached data in them, and we can't reuse them until after we write
	 * their new gen to disk. After prio_write() finishes writing the new
	 * gens/prios, they'll be moved to the free list (and possibly discarded
	 * in the process)
	 *
	 * unused: GC found nothing pointing into these buckets (possibly
	 * because all the data they contained was overwritten), so we only
	 * need to discard them before they can be moved to the free list.
	 */
	DECLARE_FIFO(long, free);
	DECLARE_FIFO(long, free_inc);
	DECLARE_FIFO(long, unused);

	size_t fifo_last_bucket;

	/* Allocation stuff: */
	struct bucket *buckets;

	DECLARE_HEAP(struct bucket *, heap);

	/*
	 * max(gen - disk_gen) for all buckets. When it gets too big we have to
	 * call prio_write() to keep gens from wrapping.
	 */
	uint8_t need_save_prio;
	unsigned gc_move_threshold;

	/*
	 * If nonzero, we know we aren't going to find any buckets to invalidate
	 * until a gc finishes - otherwise we could pointlessly burn a ton of
	 * cpu
	 */
	unsigned invalidate_needs_gc:1;

	bool discard; /* Get rid of? */

	/*
	 * We preallocate structs for issuing discards to buckets, and keep them
	 * on this list when they're not in use; do_discard() issues discards
	 * whenever there's work to do and is called by free_some_buckets() and
	 * when a discard finishes.
	 */
	atomic_t discards_in_flight;
	struct list_head discards;

	struct journal_device journal;

	/* The rest of this all shows up in sysfs */
#define IO_ERROR_SHIFT 20
	atomic_t io_errors;
	atomic_t io_count;

	atomic_long_t meta_sectors_written;
	atomic_long_t btree_sectors_written;
	atomic_long_t sectors_written;

	struct bio_split_pool bio_split_hook;
};

647 | struct gc_stat { | 646 | struct gc_stat { |
648 | size_t nodes; | 647 | size_t nodes; |
649 | size_t key_bytes; | 648 | size_t key_bytes; |
650 | 649 | ||
651 | size_t nkeys; | 650 | size_t nkeys; |
652 | uint64_t data; /* sectors */ | 651 | uint64_t data; /* sectors */ |
653 | uint64_t dirty; /* sectors */ | 652 | uint64_t dirty; /* sectors */ |
654 | unsigned in_use; /* percent */ | 653 | unsigned in_use; /* percent */ |
655 | }; | 654 | }; |
656 | 655 | ||
657 | /* | 656 | /* |
658 | * Flag bits, for how the cache set is shutting down, and what phase it's at: | 657 | * Flag bits, for how the cache set is shutting down, and what phase it's at: |
659 | * | 658 | * |
660 | * CACHE_SET_UNREGISTERING means we're not just shutting down, we're detaching | 659 | * CACHE_SET_UNREGISTERING means we're not just shutting down, we're detaching |
661 | * all the backing devices first (their cached data gets invalidated, and they | 660 | * all the backing devices first (their cached data gets invalidated, and they |
662 | * won't automatically reattach). | 661 | * won't automatically reattach). |
663 | * | 662 | * |
664 | * CACHE_SET_STOPPING always gets set first when we're closing down a cache set; | 663 | * CACHE_SET_STOPPING always gets set first when we're closing down a cache set; |
665 | * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e. | 664 | * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e. |
666 | * flushing dirty data). | 665 | * flushing dirty data). |
667 | * | 666 | * |
668 | * CACHE_SET_STOPPING_2 gets set at the last phase, when it's time to shut down | 667 | * CACHE_SET_STOPPING_2 gets set at the last phase, when it's time to shut down |
669 | * the allocation thread. | 668 | * the allocation thread. |
670 | */ | 669 | */ |
671 | #define CACHE_SET_UNREGISTERING 0 | 670 | #define CACHE_SET_UNREGISTERING 0 |
672 | #define CACHE_SET_STOPPING 1 | 671 | #define CACHE_SET_STOPPING 1 |
673 | #define CACHE_SET_STOPPING_2 2 | 672 | #define CACHE_SET_STOPPING_2 2 |
674 | 673 | ||
675 | struct cache_set { | 674 | struct cache_set { |
676 | struct closure cl; | 675 | struct closure cl; |
677 | 676 | ||
678 | struct list_head list; | 677 | struct list_head list; |
679 | struct kobject kobj; | 678 | struct kobject kobj; |
680 | struct kobject internal; | 679 | struct kobject internal; |
681 | struct dentry *debug; | 680 | struct dentry *debug; |
682 | struct cache_accounting accounting; | 681 | struct cache_accounting accounting; |
683 | 682 | ||
684 | unsigned long flags; | 683 | unsigned long flags; |
685 | 684 | ||
686 | struct cache_sb sb; | 685 | struct cache_sb sb; |
687 | 686 | ||
688 | struct cache *cache[MAX_CACHES_PER_SET]; | 687 | struct cache *cache[MAX_CACHES_PER_SET]; |
689 | struct cache *cache_by_alloc[MAX_CACHES_PER_SET]; | 688 | struct cache *cache_by_alloc[MAX_CACHES_PER_SET]; |
690 | int caches_loaded; | 689 | int caches_loaded; |
691 | 690 | ||
692 | struct bcache_device **devices; | 691 | struct bcache_device **devices; |
693 | struct list_head cached_devs; | 692 | struct list_head cached_devs; |
694 | uint64_t cached_dev_sectors; | 693 | uint64_t cached_dev_sectors; |
695 | struct closure caching; | 694 | struct closure caching; |
696 | 695 | ||
697 | struct closure_with_waitlist sb_write; | 696 | struct closure_with_waitlist sb_write; |
698 | 697 | ||
699 | mempool_t *search; | 698 | mempool_t *search; |
700 | mempool_t *bio_meta; | 699 | mempool_t *bio_meta; |
701 | struct bio_set *bio_split; | 700 | struct bio_set *bio_split; |
702 | 701 | ||
703 | /* For the btree cache */ | 702 | /* For the btree cache */ |
704 | struct shrinker shrink; | 703 | struct shrinker shrink; |
705 | 704 | ||
706 | /* For the btree cache and anything allocation related */ | 705 | /* For the btree cache and anything allocation related */ |
707 | struct mutex bucket_lock; | 706 | struct mutex bucket_lock; |
708 | 707 | ||
709 | /* log2(bucket_size), in sectors */ | 708 | /* log2(bucket_size), in sectors */ |
710 | unsigned short bucket_bits; | 709 | unsigned short bucket_bits; |
711 | 710 | ||
712 | /* log2(block_size), in sectors */ | 711 | /* log2(block_size), in sectors */ |
713 | unsigned short block_bits; | 712 | unsigned short block_bits; |
714 | 713 | ||
715 | /* | 714 | /* |
716 | * Default number of pages for a new btree node - may be less than a | 715 | * Default number of pages for a new btree node - may be less than a |
717 | * full bucket | 716 | * full bucket |
718 | */ | 717 | */ |
719 | unsigned btree_pages; | 718 | unsigned btree_pages; |
720 | 719 | ||
721 | /* | 720 | /* |
722 | * Lists of struct btrees; lru is the list for structs that have memory | 721 | * Lists of struct btrees; lru is the list for structs that have memory |
723 | * allocated for actual btree node, freed is for structs that do not. | 722 | * allocated for actual btree node, freed is for structs that do not. |
724 | * | 723 | * |
725 | * We never free a struct btree, except on shutdown - we just put it on | 724 | * We never free a struct btree, except on shutdown - we just put it on |
726 | * the btree_cache_freed list and reuse it later. This simplifies the | 725 | * the btree_cache_freed list and reuse it later. This simplifies the |
727 | * code, and it doesn't cost us much memory as the memory usage is | 726 | * code, and it doesn't cost us much memory as the memory usage is |
728 | * dominated by buffers that hold the actual btree node data and those | 727 | * dominated by buffers that hold the actual btree node data and those |
729 | * can be freed - and the number of struct btrees allocated is | 728 | * can be freed - and the number of struct btrees allocated is |
730 | * effectively bounded. | 729 | * effectively bounded. |
731 | * | 730 | * |
732 | * btree_cache_freeable effectively is a small cache - we use it because | 731 | * btree_cache_freeable effectively is a small cache - we use it because |
733 | * high order page allocations can be rather expensive, and it's quite | 732 | * high order page allocations can be rather expensive, and it's quite |
734 | * common to delete and allocate btree nodes in quick succession. It | 733 | * common to delete and allocate btree nodes in quick succession. It |
735 | * should never grow past ~2-3 nodes in practice. | 734 | * should never grow past ~2-3 nodes in practice. |
736 | */ | 735 | */ |
737 | struct list_head btree_cache; | 736 | struct list_head btree_cache; |
738 | struct list_head btree_cache_freeable; | 737 | struct list_head btree_cache_freeable; |
739 | struct list_head btree_cache_freed; | 738 | struct list_head btree_cache_freed; |
740 | 739 | ||
741 | /* Number of elements in btree_cache + btree_cache_freeable lists */ | 740 | /* Number of elements in btree_cache + btree_cache_freeable lists */ |
742 | unsigned bucket_cache_used; | 741 | unsigned bucket_cache_used; |
743 | 742 | ||
744 | /* | 743 | /* |
745 | * If we need to allocate memory for a new btree node and that | 744 | * If we need to allocate memory for a new btree node and that |
746 | * allocation fails, we can cannibalize another node in the btree cache | 745 | * allocation fails, we can cannibalize another node in the btree cache |
747 | * to satisfy the allocation. However, only one thread can be doing this | 746 | * to satisfy the allocation. However, only one thread can be doing this |
748 | * at a time, for obvious reasons - try_harder and try_wait are | 747 | * at a time, for obvious reasons - try_harder and try_wait are |
749 | * basically a lock for this that we can wait on asynchronously. The | 748 | * basically a lock for this that we can wait on asynchronously. The |
750 | * btree_root() macro releases the lock when it returns. | 749 | * btree_root() macro releases the lock when it returns. |
751 | */ | 750 | */ |
752 | struct closure *try_harder; | 751 | struct closure *try_harder; |
753 | struct closure_waitlist try_wait; | 752 | struct closure_waitlist try_wait; |
754 | uint64_t try_harder_start; | 753 | uint64_t try_harder_start; |
755 | 754 | ||
756 | /* | 755 | /* |
757 | * When we free a btree node, we increment the gen of the bucket the | 756 | * When we free a btree node, we increment the gen of the bucket the |
758 | * node is in - but we can't rewrite the prios and gens until we | 757 | * node is in - but we can't rewrite the prios and gens until we |
759 | * finished whatever it is we were doing, otherwise after a crash the | 758 | * finished whatever it is we were doing, otherwise after a crash the |
760 | * btree node would be freed but for say a split, we might not have the | 759 | * btree node would be freed but for say a split, we might not have the |
761 | * pointers to the new nodes inserted into the btree yet. | 760 | * pointers to the new nodes inserted into the btree yet. |
762 | * | 761 | * |
763 | * This is a refcount that blocks prio_write() until the new keys are | 762 | * This is a refcount that blocks prio_write() until the new keys are |
764 | * written. | 763 | * written. |
765 | */ | 764 | */ |
766 | atomic_t prio_blocked; | 765 | atomic_t prio_blocked; |
767 | struct closure_waitlist bucket_wait; | 766 | struct closure_waitlist bucket_wait; |
768 | 767 | ||
769 | /* | 768 | /* |
770 | * For any bio we don't skip, we subtract the number of sectors from | 769 | * For any bio we don't skip, we subtract the number of sectors from |
771 | * rescale; when it hits 0 we rescale all the bucket priorities. | 770 | * rescale; when it hits 0 we rescale all the bucket priorities. |
772 | */ | 771 | */ |
773 | atomic_t rescale; | 772 | atomic_t rescale; |
774 | /* | 773 | /* |
775 | * When we invalidate buckets, we use both the priority and the amount | 774 | * When we invalidate buckets, we use both the priority and the amount |
776 | * of good data to determine which buckets to reuse first - to weight | 775 | * of good data to determine which buckets to reuse first - to weight |
777 | * those together consistently we keep track of the smallest nonzero | 776 | * those together consistently we keep track of the smallest nonzero |
778 | * priority of any bucket. | 777 | * priority of any bucket. |
779 | */ | 778 | */ |
780 | uint16_t min_prio; | 779 | uint16_t min_prio; |
781 | 780 | ||
782 | /* | 781 | /* |
783 | * max(gen - gc_gen) for all buckets. When it gets too big we have to gc | 782 | * max(gen - gc_gen) for all buckets. When it gets too big we have to gc |
784 | * to keep gens from wrapping around. | 783 | * to keep gens from wrapping around. |
785 | */ | 784 | */ |
786 | uint8_t need_gc; | 785 | uint8_t need_gc; |
787 | struct gc_stat gc_stats; | 786 | struct gc_stat gc_stats; |
788 | size_t nbuckets; | 787 | size_t nbuckets; |
789 | 788 | ||
790 | struct closure_with_waitlist gc; | 789 | struct closure_with_waitlist gc; |
791 | /* Where in the btree gc currently is */ | 790 | /* Where in the btree gc currently is */ |
792 | struct bkey gc_done; | 791 | struct bkey gc_done; |
793 | 792 | ||
794 | /* | 793 | /* |
795 | * The allocation code needs gc_mark in struct bucket to be correct, but | 794 | * The allocation code needs gc_mark in struct bucket to be correct, but |
796 | * it's not while a gc is in progress. Protected by bucket_lock. | 795 | * it's not while a gc is in progress. Protected by bucket_lock. |
797 | */ | 796 | */ |
798 | int gc_mark_valid; | 797 | int gc_mark_valid; |
799 | 798 | ||
800 | /* Counts how many sectors bio_insert has added to the cache */ | 799 | /* Counts how many sectors bio_insert has added to the cache */ |
801 | atomic_t sectors_to_gc; | 800 | atomic_t sectors_to_gc; |
802 | 801 | ||
803 | struct closure moving_gc; | 802 | struct closure moving_gc; |
804 | struct closure_waitlist moving_gc_wait; | 803 | struct closure_waitlist moving_gc_wait; |
805 | struct keybuf moving_gc_keys; | 804 | struct keybuf moving_gc_keys; |
806 | /* Number of moving GC bios in flight */ | 805 | /* Number of moving GC bios in flight */ |
807 | atomic_t in_flight; | 806 | atomic_t in_flight; |
808 | 807 | ||
809 | struct btree *root; | 808 | struct btree *root; |
810 | 809 | ||
811 | #ifdef CONFIG_BCACHE_DEBUG | 810 | #ifdef CONFIG_BCACHE_DEBUG |
812 | struct btree *verify_data; | 811 | struct btree *verify_data; |
813 | struct mutex verify_lock; | 812 | struct mutex verify_lock; |
814 | #endif | 813 | #endif |
815 | 814 | ||
816 | unsigned nr_uuids; | 815 | unsigned nr_uuids; |
817 | struct uuid_entry *uuids; | 816 | struct uuid_entry *uuids; |
818 | BKEY_PADDED(uuid_bucket); | 817 | BKEY_PADDED(uuid_bucket); |
819 | struct closure_with_waitlist uuid_write; | 818 | struct closure_with_waitlist uuid_write; |
820 | 819 | ||
821 | /* | 820 | /* |
822 | * A btree node on disk could have too many bsets for an iterator to fit | 821 | * A btree node on disk could have too many bsets for an iterator to fit |
823 | * on the stack - so we have to allocate them dynamically | 822 | * on the stack - so we have to allocate them dynamically |
824 | */ | 823 | */ |
825 | mempool_t *fill_iter; | 824 | mempool_t *fill_iter; |
826 | 825 | ||
827 | /* | 826 | /* |
828 | * btree_sort() is a merge sort and requires temporary space - single | 827 | * btree_sort() is a merge sort and requires temporary space - single |
829 | * element mempool | 828 | * element mempool |
830 | */ | 829 | */ |
831 | struct mutex sort_lock; | 830 | struct mutex sort_lock; |
832 | struct bset *sort; | 831 | struct bset *sort; |
833 | unsigned sort_crit_factor; | 832 | unsigned sort_crit_factor; |
834 | 833 | ||
835 | /* List of buckets we're currently writing data to */ | 834 | /* List of buckets we're currently writing data to */ |
836 | struct list_head data_buckets; | 835 | struct list_head data_buckets; |
837 | spinlock_t data_bucket_lock; | 836 | spinlock_t data_bucket_lock; |
838 | 837 | ||
839 | struct journal journal; | 838 | struct journal journal; |
840 | 839 | ||
841 | #define CONGESTED_MAX 1024 | 840 | #define CONGESTED_MAX 1024 |
842 | unsigned congested_last_us; | 841 | unsigned congested_last_us; |
843 | atomic_t congested; | 842 | atomic_t congested; |
844 | 843 | ||
845 | /* The rest of this all shows up in sysfs */ | 844 | /* The rest of this all shows up in sysfs */ |
846 | unsigned congested_read_threshold_us; | 845 | unsigned congested_read_threshold_us; |
847 | unsigned congested_write_threshold_us; | 846 | unsigned congested_write_threshold_us; |
848 | 847 | ||
849 | spinlock_t sort_time_lock; | 848 | spinlock_t sort_time_lock; |
850 | struct time_stats sort_time; | 849 | struct time_stats sort_time; |
851 | struct time_stats btree_gc_time; | 850 | struct time_stats btree_gc_time; |
852 | struct time_stats btree_split_time; | 851 | struct time_stats btree_split_time; |
853 | spinlock_t btree_read_time_lock; | 852 | spinlock_t btree_read_time_lock; |
854 | struct time_stats btree_read_time; | 853 | struct time_stats btree_read_time; |
855 | struct time_stats try_harder_time; | 854 | struct time_stats try_harder_time; |
856 | 855 | ||
857 | atomic_long_t cache_read_races; | 856 | atomic_long_t cache_read_races; |
858 | atomic_long_t writeback_keys_done; | 857 | atomic_long_t writeback_keys_done; |
859 | atomic_long_t writeback_keys_failed; | 858 | atomic_long_t writeback_keys_failed; |
860 | unsigned error_limit; | 859 | unsigned error_limit; |
861 | unsigned error_decay; | 860 | unsigned error_decay; |
862 | unsigned short journal_delay_ms; | 861 | unsigned short journal_delay_ms; |
863 | unsigned verify:1; | 862 | unsigned verify:1; |
864 | unsigned key_merging_disabled:1; | 863 | unsigned key_merging_disabled:1; |
865 | unsigned gc_always_rewrite:1; | 864 | unsigned gc_always_rewrite:1; |
866 | unsigned shrinker_disabled:1; | 865 | unsigned shrinker_disabled:1; |
867 | unsigned copy_gc_enabled:1; | 866 | unsigned copy_gc_enabled:1; |
868 | 867 | ||
869 | #define BUCKET_HASH_BITS 12 | 868 | #define BUCKET_HASH_BITS 12 |
870 | struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; | 869 | struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; |
871 | }; | 870 | }; |
872 | 871 | ||
873 | static inline bool key_merging_disabled(struct cache_set *c) | 872 | static inline bool key_merging_disabled(struct cache_set *c) |
874 | { | 873 | { |
875 | #ifdef CONFIG_BCACHE_DEBUG | 874 | #ifdef CONFIG_BCACHE_DEBUG |
876 | return c->key_merging_disabled; | 875 | return c->key_merging_disabled; |
877 | #else | 876 | #else |
878 | return 0; | 877 | return 0; |
879 | #endif | 878 | #endif |
880 | } | 879 | } |
881 | 880 | ||
882 | static inline bool SB_IS_BDEV(const struct cache_sb *sb) | 881 | static inline bool SB_IS_BDEV(const struct cache_sb *sb) |
883 | { | 882 | { |
884 | return sb->version == BCACHE_SB_VERSION_BDEV | 883 | return sb->version == BCACHE_SB_VERSION_BDEV |
885 | || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; | 884 | || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; |
886 | } | 885 | } |
887 | 886 | ||
888 | struct bbio { | 887 | struct bbio { |
889 | unsigned submit_time_us; | 888 | unsigned submit_time_us; |
890 | union { | 889 | union { |
891 | struct bkey key; | 890 | struct bkey key; |
892 | uint64_t _pad[3]; | 891 | uint64_t _pad[3]; |
893 | /* | 892 | /* |
894 | * We only need pad = 3 here because we only ever carry around a | 893 | * We only need pad = 3 here because we only ever carry around a |
895 | * single pointer - i.e. the pointer we're doing io to/from. | 894 | * single pointer - i.e. the pointer we're doing io to/from. |
896 | */ | 895 | */ |
897 | }; | 896 | }; |
898 | struct bio bio; | 897 | struct bio bio; |
899 | }; | 898 | }; |
900 | 899 | ||
901 | static inline unsigned local_clock_us(void) | 900 | static inline unsigned local_clock_us(void) |
902 | { | 901 | { |
903 | return local_clock() >> 10; | 902 | return local_clock() >> 10; |
904 | } | 903 | } |
905 | 904 | ||
906 | #define BTREE_PRIO USHRT_MAX | 905 | #define BTREE_PRIO USHRT_MAX |
907 | #define INITIAL_PRIO 32768 | 906 | #define INITIAL_PRIO 32768 |
908 | 907 | ||
909 | #define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE) | 908 | #define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE) |
910 | #define btree_blocks(b) \ | 909 | #define btree_blocks(b) \ |
911 | ((unsigned) (KEY_SIZE(&b->key) >> (b)->c->block_bits)) | 910 | ((unsigned) (KEY_SIZE(&b->key) >> (b)->c->block_bits)) |
912 | 911 | ||
913 | #define btree_default_blocks(c) \ | 912 | #define btree_default_blocks(c) \ |
914 | ((unsigned) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits)) | 913 | ((unsigned) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits)) |
915 | 914 | ||
916 | #define bucket_pages(c) ((c)->sb.bucket_size / PAGE_SECTORS) | 915 | #define bucket_pages(c) ((c)->sb.bucket_size / PAGE_SECTORS) |
917 | #define bucket_bytes(c) ((c)->sb.bucket_size << 9) | 916 | #define bucket_bytes(c) ((c)->sb.bucket_size << 9) |
918 | #define block_bytes(c) ((c)->sb.block_size << 9) | 917 | #define block_bytes(c) ((c)->sb.block_size << 9) |
919 | 918 | ||
920 | #define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t)) | 919 | #define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t)) |
921 | #define set_bytes(i) __set_bytes(i, i->keys) | 920 | #define set_bytes(i) __set_bytes(i, i->keys) |
922 | 921 | ||
923 | #define __set_blocks(i, k, c) DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c)) | 922 | #define __set_blocks(i, k, c) DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c)) |
924 | #define set_blocks(i, c) __set_blocks(i, (i)->keys, c) | 923 | #define set_blocks(i, c) __set_blocks(i, (i)->keys, c) |
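/*
 * Editor's sketch (not part of the original commit): a worked example of
 * the macros above. __set_bytes() counts the bset header plus k 64-bit
 * words of key data, and __set_blocks() rounds that up to whole btree
 * blocks. With 4 KiB blocks, a bset holding 1000 u64s takes
 * sizeof(struct bset) + 8000 bytes, i.e. DIV_ROUND_UP() yields 2 blocks.
 */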
925 | 924 | ||
926 | #define node(i, j) ((struct bkey *) ((i)->d + (j))) | 925 | #define node(i, j) ((struct bkey *) ((i)->d + (j))) |
927 | #define end(i) node(i, (i)->keys) | 926 | #define end(i) node(i, (i)->keys) |
928 | 927 | ||
929 | #define index(i, b) \ | 928 | #define index(i, b) \ |
930 | ((size_t) (((void *) i - (void *) (b)->sets[0].data) / \ | 929 | ((size_t) (((void *) i - (void *) (b)->sets[0].data) / \ |
931 | block_bytes(b->c))) | 930 | block_bytes(b->c))) |
932 | 931 | ||
933 | #define btree_data_space(b) (PAGE_SIZE << (b)->page_order) | 932 | #define btree_data_space(b) (PAGE_SIZE << (b)->page_order) |
934 | 933 | ||
935 | #define prios_per_bucket(c) \ | 934 | #define prios_per_bucket(c) \ |
936 | ((bucket_bytes(c) - sizeof(struct prio_set)) / \ | 935 | ((bucket_bytes(c) - sizeof(struct prio_set)) / \ |
937 | sizeof(struct bucket_disk)) | 936 | sizeof(struct bucket_disk)) |
938 | #define prio_buckets(c) \ | 937 | #define prio_buckets(c) \ |
939 | DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c)) | 938 | DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c)) |
940 | 939 | ||
941 | #define JSET_MAGIC 0x245235c1a3625032ULL | 940 | #define JSET_MAGIC 0x245235c1a3625032ULL |
942 | #define PSET_MAGIC 0x6750e15f87337f91ULL | 941 | #define PSET_MAGIC 0x6750e15f87337f91ULL |
943 | #define BSET_MAGIC 0x90135c78b99e07f5ULL | 942 | #define BSET_MAGIC 0x90135c78b99e07f5ULL |
944 | 943 | ||
945 | #define jset_magic(c) ((c)->sb.set_magic ^ JSET_MAGIC) | 944 | #define jset_magic(c) ((c)->sb.set_magic ^ JSET_MAGIC) |
946 | #define pset_magic(c) ((c)->sb.set_magic ^ PSET_MAGIC) | 945 | #define pset_magic(c) ((c)->sb.set_magic ^ PSET_MAGIC) |
947 | #define bset_magic(c) ((c)->sb.set_magic ^ BSET_MAGIC) | 946 | #define bset_magic(c) ((c)->sb.set_magic ^ BSET_MAGIC) |
948 | 947 | ||
949 | /* Bkey fields: all units are in sectors */ | 948 | /* Bkey fields: all units are in sectors */ |
950 | 949 | ||
951 | #define KEY_FIELD(name, field, offset, size) \ | 950 | #define KEY_FIELD(name, field, offset, size) \ |
952 | BITMASK(name, struct bkey, field, offset, size) | 951 | BITMASK(name, struct bkey, field, offset, size) |
953 | 952 | ||
954 | #define PTR_FIELD(name, offset, size) \ | 953 | #define PTR_FIELD(name, offset, size) \ |
955 | static inline uint64_t name(const struct bkey *k, unsigned i) \ | 954 | static inline uint64_t name(const struct bkey *k, unsigned i) \ |
956 | { return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); } \ | 955 | { return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); } \ |
957 | \ | 956 | \ |
958 | static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\ | 957 | static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\ |
959 | { \ | 958 | { \ |
960 | k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset); \ | 959 | k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset); \ |
961 | k->ptr[i] |= v << offset; \ | 960 | k->ptr[i] |= v << offset; \ |
962 | } | 961 | } |
963 | 962 | ||
964 | KEY_FIELD(KEY_PTRS, high, 60, 3) | 963 | KEY_FIELD(KEY_PTRS, high, 60, 3) |
965 | KEY_FIELD(HEADER_SIZE, high, 58, 2) | 964 | KEY_FIELD(HEADER_SIZE, high, 58, 2) |
966 | KEY_FIELD(KEY_CSUM, high, 56, 2) | 965 | KEY_FIELD(KEY_CSUM, high, 56, 2) |
967 | KEY_FIELD(KEY_PINNED, high, 55, 1) | 966 | KEY_FIELD(KEY_PINNED, high, 55, 1) |
968 | KEY_FIELD(KEY_DIRTY, high, 36, 1) | 967 | KEY_FIELD(KEY_DIRTY, high, 36, 1) |
969 | 968 | ||
970 | KEY_FIELD(KEY_SIZE, high, 20, 16) | 969 | KEY_FIELD(KEY_SIZE, high, 20, 16) |
971 | KEY_FIELD(KEY_INODE, high, 0, 20) | 970 | KEY_FIELD(KEY_INODE, high, 0, 20) |
972 | 971 | ||
973 | /* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */ | 972 | /* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */ |
974 | 973 | ||
975 | static inline uint64_t KEY_OFFSET(const struct bkey *k) | 974 | static inline uint64_t KEY_OFFSET(const struct bkey *k) |
976 | { | 975 | { |
977 | return k->low; | 976 | return k->low; |
978 | } | 977 | } |
979 | 978 | ||
980 | static inline void SET_KEY_OFFSET(struct bkey *k, uint64_t v) | 979 | static inline void SET_KEY_OFFSET(struct bkey *k, uint64_t v) |
981 | { | 980 | { |
982 | k->low = v; | 981 | k->low = v; |
983 | } | 982 | } |
984 | 983 | ||
985 | PTR_FIELD(PTR_DEV, 51, 12) | 984 | PTR_FIELD(PTR_DEV, 51, 12) |
986 | PTR_FIELD(PTR_OFFSET, 8, 43) | 985 | PTR_FIELD(PTR_OFFSET, 8, 43) |
987 | PTR_FIELD(PTR_GEN, 0, 8) | 986 | PTR_FIELD(PTR_GEN, 0, 8) |
988 | 987 | ||
989 | #define PTR_CHECK_DEV ((1 << 12) - 1) | 988 | #define PTR_CHECK_DEV ((1 << 12) - 1) |
990 | 989 | ||
991 | #define PTR(gen, offset, dev) \ | 990 | #define PTR(gen, offset, dev) \ |
992 | ((((uint64_t) dev) << 51) | ((uint64_t) offset) << 8 | gen) | 991 | ((((uint64_t) dev) << 51) | ((uint64_t) offset) << 8 | gen) |
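/*
 * Editor's sketch (not part of the original commit): how PTR() packs a
 * pointer and how the PTR_* accessors above recover its fields - dev in
 * bits 51..62, offset in bits 8..50, gen in bits 0..7. BKEY_PADDED (used
 * elsewhere in this header) gives the on-stack key room for ptr[0].
 */
static inline void example_ptr_packing(void)
{
        BKEY_PADDED(key) k;

        k.key.ptr[0] = PTR(3, 0x1234, 2);       /* gen 3, offset 0x1234, dev 2 */

        BUG_ON(PTR_DEV(&k.key, 0)    != 2);
        BUG_ON(PTR_OFFSET(&k.key, 0) != 0x1234);
        BUG_ON(PTR_GEN(&k.key, 0)    != 3);
}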
993 | 992 | ||
994 | static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) | 993 | static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) |
995 | { | 994 | { |
996 | return s >> c->bucket_bits; | 995 | return s >> c->bucket_bits; |
997 | } | 996 | } |
998 | 997 | ||
999 | static inline sector_t bucket_to_sector(struct cache_set *c, size_t b) | 998 | static inline sector_t bucket_to_sector(struct cache_set *c, size_t b) |
1000 | { | 999 | { |
1001 | return ((sector_t) b) << c->bucket_bits; | 1000 | return ((sector_t) b) << c->bucket_bits; |
1002 | } | 1001 | } |
1003 | 1002 | ||
1004 | static inline sector_t bucket_remainder(struct cache_set *c, sector_t s) | 1003 | static inline sector_t bucket_remainder(struct cache_set *c, sector_t s) |
1005 | { | 1004 | { |
1006 | return s & (c->sb.bucket_size - 1); | 1005 | return s & (c->sb.bucket_size - 1); |
1007 | } | 1006 | } |
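/*
 * Editor's note (illustrative, not part of the original commit): with a
 * 1024-sector bucket, i.e. bucket_bits == 10, sector 5000 maps as
 *
 *      sector_to_bucket(c, 5000) == 5000 >> 10        == 4
 *      bucket_to_sector(c, 4)    == 4 << 10           == 4096
 *      bucket_remainder(c, 5000) == 5000 & (1024 - 1) == 904
 *
 * bucket_remainder() relies on sb.bucket_size being a power of two.
 */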
1008 | 1007 | ||
1009 | static inline struct cache *PTR_CACHE(struct cache_set *c, | 1008 | static inline struct cache *PTR_CACHE(struct cache_set *c, |
1010 | const struct bkey *k, | 1009 | const struct bkey *k, |
1011 | unsigned ptr) | 1010 | unsigned ptr) |
1012 | { | 1011 | { |
1013 | return c->cache[PTR_DEV(k, ptr)]; | 1012 | return c->cache[PTR_DEV(k, ptr)]; |
1014 | } | 1013 | } |
1015 | 1014 | ||
1016 | static inline size_t PTR_BUCKET_NR(struct cache_set *c, | 1015 | static inline size_t PTR_BUCKET_NR(struct cache_set *c, |
1017 | const struct bkey *k, | 1016 | const struct bkey *k, |
1018 | unsigned ptr) | 1017 | unsigned ptr) |
1019 | { | 1018 | { |
1020 | return sector_to_bucket(c, PTR_OFFSET(k, ptr)); | 1019 | return sector_to_bucket(c, PTR_OFFSET(k, ptr)); |
1021 | } | 1020 | } |
1022 | 1021 | ||
1023 | static inline struct bucket *PTR_BUCKET(struct cache_set *c, | 1022 | static inline struct bucket *PTR_BUCKET(struct cache_set *c, |
1024 | const struct bkey *k, | 1023 | const struct bkey *k, |
1025 | unsigned ptr) | 1024 | unsigned ptr) |
1026 | { | 1025 | { |
1027 | return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); | 1026 | return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); |
1028 | } | 1027 | } |
1029 | 1028 | ||
1030 | /* Btree key macros */ | 1029 | /* Btree key macros */ |
1031 | 1030 | ||
1032 | /* | 1031 | /* |
1033 | * The high bit being set is a relic from when we used it to do binary | 1032 | * The high bit being set is a relic from when we used it to do binary |
1034 | * searches - it told you where a key started. It's not used anymore, | 1033 | * searches - it told you where a key started. It's not used anymore, |
1035 | * and can probably be safely dropped. | 1034 | * and can probably be safely dropped. |
1036 | */ | 1035 | */ |
1037 | #define KEY(dev, sector, len) \ | 1036 | #define KEY(dev, sector, len) \ |
1038 | ((struct bkey) { \ | 1037 | ((struct bkey) { \ |
1039 | .high = (1ULL << 63) | ((uint64_t) (len) << 20) | (dev), \ | 1038 | .high = (1ULL << 63) | ((uint64_t) (len) << 20) | (dev), \ |
1040 | .low = (sector) \ | 1039 | .low = (sector) \ |
1041 | }) | 1040 | }) |
1042 | 1041 | ||
1043 | static inline void bkey_init(struct bkey *k) | 1042 | static inline void bkey_init(struct bkey *k) |
1044 | { | 1043 | { |
1045 | *k = KEY(0, 0, 0); | 1044 | *k = KEY(0, 0, 0); |
1046 | } | 1045 | } |
1047 | 1046 | ||
1048 | #define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) | 1047 | #define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) |
1049 | #define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) | 1048 | #define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) |
1050 | #define MAX_KEY KEY(~(~0 << 20), ((uint64_t) ~0) >> 1, 0) | 1049 | #define MAX_KEY KEY(~(~0 << 20), ((uint64_t) ~0) >> 1, 0) |
1051 | #define ZERO_KEY KEY(0, 0, 0) | 1050 | #define ZERO_KEY KEY(0, 0, 0) |
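/*
 * Editor's sketch (not part of the original commit): KEY(dev, sector, len)
 * names an extent by the sector at its end, as KEY_START() above implies -
 * a 20-sector key for inode 1 ending at sector 100 starts at sector 80.
 */
static inline void example_extent_key(void)
{
        struct bkey k = KEY(1, 100, 20);

        BUG_ON(KEY_INODE(&k)  != 1);
        BUG_ON(KEY_OFFSET(&k) != 100);
        BUG_ON(KEY_SIZE(&k)   != 20);
        BUG_ON(KEY_START(&k)  != 80);
}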
1052 | 1051 | ||
1053 | /* | 1052 | /* |
1054 | * This is used for various on disk data structures - cache_sb, prio_set, bset, | 1053 | * This is used for various on disk data structures - cache_sb, prio_set, bset, |
1055 | * jset; the checksum is _always_ the first 8 bytes of these structs | 1054 | * jset; the checksum is _always_ the first 8 bytes of these structs |
1056 | */ | 1055 | */ |
1057 | #define csum_set(i) \ | 1056 | #define csum_set(i) \ |
1058 | bch_crc64(((void *) (i)) + sizeof(uint64_t), \ | 1057 | bch_crc64(((void *) (i)) + sizeof(uint64_t), \ |
1059 | ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t))) | 1058 | ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t))) |
1060 | 1059 | ||
1061 | /* Error handling macros */ | 1060 | /* Error handling macros */ |
1062 | 1061 | ||
1063 | #define btree_bug(b, ...) \ | 1062 | #define btree_bug(b, ...) \ |
1064 | do { \ | 1063 | do { \ |
1065 | if (bch_cache_set_error((b)->c, __VA_ARGS__)) \ | 1064 | if (bch_cache_set_error((b)->c, __VA_ARGS__)) \ |
1066 | dump_stack(); \ | 1065 | dump_stack(); \ |
1067 | } while (0) | 1066 | } while (0) |
1068 | 1067 | ||
1069 | #define cache_bug(c, ...) \ | 1068 | #define cache_bug(c, ...) \ |
1070 | do { \ | 1069 | do { \ |
1071 | if (bch_cache_set_error(c, __VA_ARGS__)) \ | 1070 | if (bch_cache_set_error(c, __VA_ARGS__)) \ |
1072 | dump_stack(); \ | 1071 | dump_stack(); \ |
1073 | } while (0) | 1072 | } while (0) |
1074 | 1073 | ||
1075 | #define btree_bug_on(cond, b, ...) \ | 1074 | #define btree_bug_on(cond, b, ...) \ |
1076 | do { \ | 1075 | do { \ |
1077 | if (cond) \ | 1076 | if (cond) \ |
1078 | btree_bug(b, __VA_ARGS__); \ | 1077 | btree_bug(b, __VA_ARGS__); \ |
1079 | } while (0) | 1078 | } while (0) |
1080 | 1079 | ||
1081 | #define cache_bug_on(cond, c, ...) \ | 1080 | #define cache_bug_on(cond, c, ...) \ |
1082 | do { \ | 1081 | do { \ |
1083 | if (cond) \ | 1082 | if (cond) \ |
1084 | cache_bug(c, __VA_ARGS__); \ | 1083 | cache_bug(c, __VA_ARGS__); \ |
1085 | } while (0) | 1084 | } while (0) |
1086 | 1085 | ||
1087 | #define cache_set_err_on(cond, c, ...) \ | 1086 | #define cache_set_err_on(cond, c, ...) \ |
1088 | do { \ | 1087 | do { \ |
1089 | if (cond) \ | 1088 | if (cond) \ |
1090 | bch_cache_set_error(c, __VA_ARGS__); \ | 1089 | bch_cache_set_error(c, __VA_ARGS__); \ |
1091 | } while (0) | 1090 | } while (0) |
1092 | 1091 | ||
1093 | /* Looping macros */ | 1092 | /* Looping macros */ |
1094 | 1093 | ||
1095 | #define for_each_cache(ca, cs, iter) \ | 1094 | #define for_each_cache(ca, cs, iter) \ |
1096 | for (iter = 0; ca = cs->cache[iter], iter < (cs)->sb.nr_in_set; iter++) | 1095 | for (iter = 0; ca = cs->cache[iter], iter < (cs)->sb.nr_in_set; iter++) |
1097 | 1096 | ||
1098 | #define for_each_bucket(b, ca) \ | 1097 | #define for_each_bucket(b, ca) \ |
1099 | for (b = (ca)->buckets + (ca)->sb.first_bucket; \ | 1098 | for (b = (ca)->buckets + (ca)->sb.first_bucket; \ |
1100 | b < (ca)->buckets + (ca)->sb.nbuckets; b++) | 1099 | b < (ca)->buckets + (ca)->sb.nbuckets; b++) |
1101 | 1100 | ||
1102 | static inline void __bkey_put(struct cache_set *c, struct bkey *k) | 1101 | static inline void __bkey_put(struct cache_set *c, struct bkey *k) |
1103 | { | 1102 | { |
1104 | unsigned i; | 1103 | unsigned i; |
1105 | 1104 | ||
1106 | for (i = 0; i < KEY_PTRS(k); i++) | 1105 | for (i = 0; i < KEY_PTRS(k); i++) |
1107 | atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); | 1106 | atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); |
1108 | } | 1107 | } |
1109 | 1108 | ||
1110 | static inline void cached_dev_put(struct cached_dev *dc) | 1109 | static inline void cached_dev_put(struct cached_dev *dc) |
1111 | { | 1110 | { |
1112 | if (atomic_dec_and_test(&dc->count)) | 1111 | if (atomic_dec_and_test(&dc->count)) |
1113 | schedule_work(&dc->detach); | 1112 | schedule_work(&dc->detach); |
1114 | } | 1113 | } |
1115 | 1114 | ||
1116 | static inline bool cached_dev_get(struct cached_dev *dc) | 1115 | static inline bool cached_dev_get(struct cached_dev *dc) |
1117 | { | 1116 | { |
1118 | if (!atomic_inc_not_zero(&dc->count)) | 1117 | if (!atomic_inc_not_zero(&dc->count)) |
1119 | return false; | 1118 | return false; |
1120 | 1119 | ||
1121 | /* Paired with the mb in cached_dev_attach */ | 1120 | /* Paired with the mb in cached_dev_attach */ |
1122 | smp_mb__after_atomic_inc(); | 1121 | smp_mb__after_atomic_inc(); |
1123 | return true; | 1122 | return true; |
1124 | } | 1123 | } |
1125 | 1124 | ||
1126 | /* | 1125 | /* |
1127 | * bucket_gc_gen() returns the difference between the bucket's current gen and | 1126 | * bucket_gc_gen() returns the difference between the bucket's current gen and |
1128 | * the oldest gen of any pointer into that bucket in the btree (last_gc). | 1127 | * the oldest gen of any pointer into that bucket in the btree (last_gc). |
1129 | * | 1128 | * |
1130 | * bucket_disk_gen() returns the difference between the current gen and the gen | 1129 | * bucket_disk_gen() returns the difference between the current gen and the gen |
1131 | * on disk; they're both used to make sure gens don't wrap around. | 1130 | * on disk; they're both used to make sure gens don't wrap around. |
1132 | */ | 1131 | */ |
1133 | 1132 | ||
1134 | static inline uint8_t bucket_gc_gen(struct bucket *b) | 1133 | static inline uint8_t bucket_gc_gen(struct bucket *b) |
1135 | { | 1134 | { |
1136 | return b->gen - b->last_gc; | 1135 | return b->gen - b->last_gc; |
1137 | } | 1136 | } |
1138 | 1137 | ||
1139 | static inline uint8_t bucket_disk_gen(struct bucket *b) | 1138 | static inline uint8_t bucket_disk_gen(struct bucket *b) |
1140 | { | 1139 | { |
1141 | return b->gen - b->disk_gen; | 1140 | return b->gen - b->disk_gen; |
1142 | } | 1141 | } |
1143 | 1142 | ||
1144 | #define BUCKET_GC_GEN_MAX 96U | 1143 | #define BUCKET_GC_GEN_MAX 96U |
1145 | #define BUCKET_DISK_GEN_MAX 64U | 1144 | #define BUCKET_DISK_GEN_MAX 64U |
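/*
 * Editor's note (illustrative, not part of the original commit): gens are
 * 8-bit, so the subtractions in bucket_gc_gen() and bucket_disk_gen() stay
 * correct across wrap-around - with b->gen == 5 and b->last_gc == 250 the
 * difference is (5 - 250) & 0xff == 11, i.e. eleven increments since the
 * last gc. The *_MAX constants above bound how large those differences may
 * grow before garbage collection (and, for disk_gen, rewriting the prios)
 * has to catch up.
 */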
1146 | 1145 | ||
1147 | #define kobj_attribute_write(n, fn) \ | 1146 | #define kobj_attribute_write(n, fn) \ |
1148 | static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn) | 1147 | static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn) |
1149 | 1148 | ||
1150 | #define kobj_attribute_rw(n, show, store) \ | 1149 | #define kobj_attribute_rw(n, show, store) \ |
1151 | static struct kobj_attribute ksysfs_##n = \ | 1150 | static struct kobj_attribute ksysfs_##n = \ |
1152 | __ATTR(n, S_IWUSR|S_IRUSR, show, store) | 1151 | __ATTR(n, S_IWUSR|S_IRUSR, show, store) |
1153 | 1152 | ||
1154 | static inline void wake_up_allocators(struct cache_set *c) | 1153 | static inline void wake_up_allocators(struct cache_set *c) |
1155 | { | 1154 | { |
1156 | struct cache *ca; | 1155 | struct cache *ca; |
1157 | unsigned i; | 1156 | unsigned i; |
1158 | 1157 | ||
1159 | for_each_cache(ca, c, i) | 1158 | for_each_cache(ca, c, i) |
1160 | wake_up_process(ca->alloc_thread); | 1159 | wake_up_process(ca->alloc_thread); |
1161 | } | 1160 | } |
1162 | 1161 | ||
1163 | /* Forward declarations */ | 1162 | /* Forward declarations */ |
1164 | 1163 | ||
1165 | void bch_count_io_errors(struct cache *, int, const char *); | 1164 | void bch_count_io_errors(struct cache *, int, const char *); |
1166 | void bch_bbio_count_io_errors(struct cache_set *, struct bio *, | 1165 | void bch_bbio_count_io_errors(struct cache_set *, struct bio *, |
1167 | int, const char *); | 1166 | int, const char *); |
1168 | void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *); | 1167 | void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *); |
1169 | void bch_bbio_free(struct bio *, struct cache_set *); | 1168 | void bch_bbio_free(struct bio *, struct cache_set *); |
1170 | struct bio *bch_bbio_alloc(struct cache_set *); | 1169 | struct bio *bch_bbio_alloc(struct cache_set *); |
1171 | 1170 | ||
1172 | struct bio *bch_bio_split(struct bio *, int, gfp_t, struct bio_set *); | 1171 | struct bio *bch_bio_split(struct bio *, int, gfp_t, struct bio_set *); |
1173 | void bch_generic_make_request(struct bio *, struct bio_split_pool *); | 1172 | void bch_generic_make_request(struct bio *, struct bio_split_pool *); |
1174 | void __bch_submit_bbio(struct bio *, struct cache_set *); | 1173 | void __bch_submit_bbio(struct bio *, struct cache_set *); |
1175 | void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); | 1174 | void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); |
1176 | 1175 | ||
1177 | uint8_t bch_inc_gen(struct cache *, struct bucket *); | 1176 | uint8_t bch_inc_gen(struct cache *, struct bucket *); |
1178 | void bch_rescale_priorities(struct cache_set *, int); | 1177 | void bch_rescale_priorities(struct cache_set *, int); |
1179 | bool bch_bucket_add_unused(struct cache *, struct bucket *); | 1178 | bool bch_bucket_add_unused(struct cache *, struct bucket *); |
1180 | 1179 | ||
1181 | long bch_bucket_alloc(struct cache *, unsigned, struct closure *); | 1180 | long bch_bucket_alloc(struct cache *, unsigned, struct closure *); |
1182 | void bch_bucket_free(struct cache_set *, struct bkey *); | 1181 | void bch_bucket_free(struct cache_set *, struct bkey *); |
1183 | 1182 | ||
1184 | int __bch_bucket_alloc_set(struct cache_set *, unsigned, | 1183 | int __bch_bucket_alloc_set(struct cache_set *, unsigned, |
1185 | struct bkey *, int, struct closure *); | 1184 | struct bkey *, int, struct closure *); |
1186 | int bch_bucket_alloc_set(struct cache_set *, unsigned, | 1185 | int bch_bucket_alloc_set(struct cache_set *, unsigned, |
1187 | struct bkey *, int, struct closure *); | 1186 | struct bkey *, int, struct closure *); |
1188 | 1187 | ||
1189 | __printf(2, 3) | 1188 | __printf(2, 3) |
1190 | bool bch_cache_set_error(struct cache_set *, const char *, ...); | 1189 | bool bch_cache_set_error(struct cache_set *, const char *, ...); |
1191 | 1190 | ||
1192 | void bch_prio_write(struct cache *); | 1191 | void bch_prio_write(struct cache *); |
1193 | void bch_write_bdev_super(struct cached_dev *, struct closure *); | 1192 | void bch_write_bdev_super(struct cached_dev *, struct closure *); |
1194 | 1193 | ||
1195 | extern struct workqueue_struct *bcache_wq, *bch_gc_wq; | 1194 | extern struct workqueue_struct *bcache_wq, *bch_gc_wq; |
1196 | extern const char * const bch_cache_modes[]; | 1195 | extern const char * const bch_cache_modes[]; |
1197 | extern struct mutex bch_register_lock; | 1196 | extern struct mutex bch_register_lock; |
1198 | extern struct list_head bch_cache_sets; | 1197 | extern struct list_head bch_cache_sets; |
1199 | 1198 | ||
1200 | extern struct kobj_type bch_cached_dev_ktype; | 1199 | extern struct kobj_type bch_cached_dev_ktype; |
1201 | extern struct kobj_type bch_flash_dev_ktype; | 1200 | extern struct kobj_type bch_flash_dev_ktype; |
1202 | extern struct kobj_type bch_cache_set_ktype; | 1201 | extern struct kobj_type bch_cache_set_ktype; |
1203 | extern struct kobj_type bch_cache_set_internal_ktype; | 1202 | extern struct kobj_type bch_cache_set_internal_ktype; |
1204 | extern struct kobj_type bch_cache_ktype; | 1203 | extern struct kobj_type bch_cache_ktype; |
1205 | 1204 | ||
1206 | void bch_cached_dev_release(struct kobject *); | 1205 | void bch_cached_dev_release(struct kobject *); |
1207 | void bch_flash_dev_release(struct kobject *); | 1206 | void bch_flash_dev_release(struct kobject *); |
1208 | void bch_cache_set_release(struct kobject *); | 1207 | void bch_cache_set_release(struct kobject *); |
1209 | void bch_cache_release(struct kobject *); | 1208 | void bch_cache_release(struct kobject *); |
1210 | 1209 | ||
1211 | int bch_uuid_write(struct cache_set *); | 1210 | int bch_uuid_write(struct cache_set *); |
1212 | void bcache_write_super(struct cache_set *); | 1211 | void bcache_write_super(struct cache_set *); |
1213 | 1212 | ||
1214 | int bch_flash_dev_create(struct cache_set *c, uint64_t size); | 1213 | int bch_flash_dev_create(struct cache_set *c, uint64_t size); |
1215 | 1214 | ||
1216 | int bch_cached_dev_attach(struct cached_dev *, struct cache_set *); | 1215 | int bch_cached_dev_attach(struct cached_dev *, struct cache_set *); |
1217 | void bch_cached_dev_detach(struct cached_dev *); | 1216 | void bch_cached_dev_detach(struct cached_dev *); |
1218 | void bch_cached_dev_run(struct cached_dev *); | 1217 | void bch_cached_dev_run(struct cached_dev *); |
1219 | void bcache_device_stop(struct bcache_device *); | 1218 | void bcache_device_stop(struct bcache_device *); |
1220 | 1219 | ||
1221 | void bch_cache_set_unregister(struct cache_set *); | 1220 | void bch_cache_set_unregister(struct cache_set *); |
1222 | void bch_cache_set_stop(struct cache_set *); | 1221 | void bch_cache_set_stop(struct cache_set *); |
1223 | 1222 | ||
1224 | struct cache_set *bch_cache_set_alloc(struct cache_sb *); | 1223 | struct cache_set *bch_cache_set_alloc(struct cache_sb *); |
1225 | void bch_btree_cache_free(struct cache_set *); | 1224 | void bch_btree_cache_free(struct cache_set *); |
1226 | int bch_btree_cache_alloc(struct cache_set *); | 1225 | int bch_btree_cache_alloc(struct cache_set *); |
1227 | void bch_moving_init_cache_set(struct cache_set *); | 1226 | void bch_moving_init_cache_set(struct cache_set *); |
1228 | 1227 | ||
1229 | int bch_cache_allocator_start(struct cache *ca); | 1228 | int bch_cache_allocator_start(struct cache *ca); |
1230 | void bch_cache_allocator_exit(struct cache *ca); | 1229 | void bch_cache_allocator_exit(struct cache *ca); |
1231 | int bch_cache_allocator_init(struct cache *ca); | 1230 | int bch_cache_allocator_init(struct cache *ca); |
1232 | 1231 | ||
1233 | void bch_debug_exit(void); | 1232 | void bch_debug_exit(void); |
1234 | int bch_debug_init(struct kobject *); | 1233 | int bch_debug_init(struct kobject *); |
1235 | void bch_writeback_exit(void); | 1234 | void bch_writeback_exit(void); |
1236 | int bch_writeback_init(void); | 1235 | int bch_writeback_init(void); |
1237 | void bch_request_exit(void); | 1236 | void bch_request_exit(void); |
1238 | int bch_request_init(void); | 1237 | int bch_request_init(void); |
1239 | void bch_btree_exit(void); | 1238 | void bch_btree_exit(void); |
1240 | int bch_btree_init(void); | 1239 | int bch_btree_init(void); |
1241 | 1240 | ||
1242 | #endif /* _BCACHE_H */ | 1241 | #endif /* _BCACHE_H */ |
drivers/md/bcache/btree.c
1 | /* | 1 | /* |
2 | * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> | 2 | * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> |
3 | * | 3 | * |
4 | * Uses a block device as cache for other block devices; optimized for SSDs. | 4 | * Uses a block device as cache for other block devices; optimized for SSDs. |
5 | * All allocation is done in buckets, which should match the erase block size | 5 | * All allocation is done in buckets, which should match the erase block size |
6 | * of the device. | 6 | * of the device. |
7 | * | 7 | * |
8 | * Buckets containing cached data are kept on a heap sorted by priority; | 8 | * Buckets containing cached data are kept on a heap sorted by priority; |
9 | * bucket priority is increased on cache hit, and periodically all the buckets | 9 | * bucket priority is increased on cache hit, and periodically all the buckets |
10 | * on the heap have their priority scaled down. This currently is just used as | 10 | * on the heap have their priority scaled down. This currently is just used as |
11 | * an LRU but in the future should allow for more intelligent heuristics. | 11 | * an LRU but in the future should allow for more intelligent heuristics. |
12 | * | 12 | * |
13 | * Buckets have an 8 bit counter; freeing is accomplished by incrementing the | 13 | * Buckets have an 8 bit counter; freeing is accomplished by incrementing the |
14 | * counter. Garbage collection is used to remove stale pointers. | 14 | * counter. Garbage collection is used to remove stale pointers. |
15 | * | 15 | * |
16 | * Indexing is done via a btree; nodes are not necessarily fully sorted, rather | 16 | * Indexing is done via a btree; nodes are not necessarily fully sorted, rather |
17 | * as keys are inserted we only sort the pages that have not yet been written. | 17 | * as keys are inserted we only sort the pages that have not yet been written. |
18 | * When garbage collection is run, we resort the entire node. | 18 | * When garbage collection is run, we resort the entire node. |
19 | * | 19 | * |
20 | * All configuration is done via sysfs; see Documentation/bcache.txt. | 20 | * All configuration is done via sysfs; see Documentation/bcache.txt. |
21 | */ | 21 | */ |
22 | 22 | ||
23 | #include "bcache.h" | 23 | #include "bcache.h" |
24 | #include "btree.h" | 24 | #include "btree.h" |
25 | #include "debug.h" | 25 | #include "debug.h" |
26 | #include "request.h" | 26 | #include "request.h" |
27 | #include "writeback.h" | 27 | #include "writeback.h" |
28 | 28 | ||
29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
30 | #include <linux/bitops.h> | 30 | #include <linux/bitops.h> |
31 | #include <linux/hash.h> | 31 | #include <linux/hash.h> |
32 | #include <linux/prefetch.h> | 32 | #include <linux/prefetch.h> |
33 | #include <linux/random.h> | 33 | #include <linux/random.h> |
34 | #include <linux/rcupdate.h> | 34 | #include <linux/rcupdate.h> |
35 | #include <trace/events/bcache.h> | 35 | #include <trace/events/bcache.h> |
36 | 36 | ||
37 | /* | 37 | /* |
38 | * Todo: | 38 | * Todo: |
39 | * register_bcache: Return errors out to userspace correctly | 39 | * register_bcache: Return errors out to userspace correctly |
40 | * | 40 | * |
41 | * Writeback: don't undirty key until after a cache flush | 41 | * Writeback: don't undirty key until after a cache flush |
42 | * | 42 | * |
43 | * Create an iterator for key pointers | 43 | * Create an iterator for key pointers |
44 | * | 44 | * |
45 | * On btree write error, mark bucket such that it won't be freed from the cache | 45 | * On btree write error, mark bucket such that it won't be freed from the cache |
46 | * | 46 | * |
47 | * Journalling: | 47 | * Journalling: |
48 | * Check for bad keys in replay | 48 | * Check for bad keys in replay |
49 | * Propagate barriers | 49 | * Propagate barriers |
50 | * Refcount journal entries in journal_replay | 50 | * Refcount journal entries in journal_replay |
51 | * | 51 | * |
52 | * Garbage collection: | 52 | * Garbage collection: |
53 | * Finish incremental gc | 53 | * Finish incremental gc |
54 | * Gc should free old UUIDs, data for invalid UUIDs | 54 | * Gc should free old UUIDs, data for invalid UUIDs |
55 | * | 55 | * |
56 | * Provide a way to list backing device UUIDs we have data cached for, and | 56 | * Provide a way to list backing device UUIDs we have data cached for, and |
57 | * probably how long it's been since we've seen them, and a way to invalidate | 57 | * probably how long it's been since we've seen them, and a way to invalidate |
58 | * dirty data for devices that will never be attached again | 58 | * dirty data for devices that will never be attached again |
59 | * | 59 | * |
60 | * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so | 60 | * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so |
61 | * that based on that and how much dirty data we have we can keep writeback | 61 | * that based on that and how much dirty data we have we can keep writeback |
62 | * from being starved | 62 | * from being starved |
63 | * | 63 | * |
64 | * Add a tracepoint or somesuch to watch for writeback starvation | 64 | * Add a tracepoint or somesuch to watch for writeback starvation |
65 | * | 65 | * |
66 | * When btree depth > 1 and splitting an interior node, we have to make sure | 66 | * When btree depth > 1 and splitting an interior node, we have to make sure |
67 | * alloc_bucket() cannot fail. This should be true but is not completely | 67 | * alloc_bucket() cannot fail. This should be true but is not completely |
68 | * obvious. | 68 | * obvious. |
69 | * | 69 | * |
70 | * Make sure all allocations get charged to the root cgroup | 70 | * Make sure all allocations get charged to the root cgroup |
71 | * | 71 | * |
72 | * Plugging? | 72 | * Plugging? |
73 | * | 73 | * |
74 | * If data write is less than hard sector size of ssd, round up offset in open | 74 | * If data write is less than hard sector size of ssd, round up offset in open |
75 | * bucket to the next whole sector | 75 | * bucket to the next whole sector |
76 | * | 76 | * |
77 | * Also lookup by cgroup in get_open_bucket() | 77 | * Also lookup by cgroup in get_open_bucket() |
78 | * | 78 | * |
79 | * Superblock needs to be fleshed out for multiple cache devices | 79 | * Superblock needs to be fleshed out for multiple cache devices |
80 | * | 80 | * |
81 | * Add a sysfs tunable for the number of writeback IOs in flight | 81 | * Add a sysfs tunable for the number of writeback IOs in flight |
82 | * | 82 | * |
83 | * Add a sysfs tunable for the number of open data buckets | 83 | * Add a sysfs tunable for the number of open data buckets |
84 | * | 84 | * |
85 | * IO tracking: Can we track when one process is doing io on behalf of another? | 85 | * IO tracking: Can we track when one process is doing io on behalf of another? |
86 | * IO tracking: Don't use just an average, weigh more recent stuff higher | 86 | * IO tracking: Don't use just an average, weigh more recent stuff higher |
87 | * | 87 | * |
88 | * Test module load/unload | 88 | * Test module load/unload |
89 | */ | 89 | */ |
90 | 90 | ||
91 | static const char * const op_types[] = { | 91 | static const char * const op_types[] = { |
92 | "insert", "replace" | 92 | "insert", "replace" |
93 | }; | 93 | }; |
94 | 94 | ||
95 | static const char *op_type(struct btree_op *op) | 95 | static const char *op_type(struct btree_op *op) |
96 | { | 96 | { |
97 | return op_types[op->type]; | 97 | return op_types[op->type]; |
98 | } | 98 | } |
99 | 99 | ||
100 | #define MAX_NEED_GC 64 | 100 | #define MAX_NEED_GC 64 |
101 | #define MAX_SAVE_PRIO 72 | 101 | #define MAX_SAVE_PRIO 72 |
102 | 102 | ||
103 | #define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) | 103 | #define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) |
104 | 104 | ||
105 | #define PTR_HASH(c, k) \ | 105 | #define PTR_HASH(c, k) \ |
106 | (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) | 106 | (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) |
107 | 107 | ||
108 | struct workqueue_struct *bch_gc_wq; | 108 | struct workqueue_struct *bch_gc_wq; |
109 | static struct workqueue_struct *btree_io_wq; | 109 | static struct workqueue_struct *btree_io_wq; |
110 | 110 | ||
111 | void bch_btree_op_init_stack(struct btree_op *op) | 111 | void bch_btree_op_init_stack(struct btree_op *op) |
112 | { | 112 | { |
113 | memset(op, 0, sizeof(struct btree_op)); | 113 | memset(op, 0, sizeof(struct btree_op)); |
114 | closure_init_stack(&op->cl); | 114 | closure_init_stack(&op->cl); |
115 | op->lock = -1; | 115 | op->lock = -1; |
116 | bch_keylist_init(&op->keys); | 116 | bch_keylist_init(&op->keys); |
117 | } | 117 | } |
118 | 118 | ||
119 | /* Btree key manipulation */ | 119 | /* Btree key manipulation */ |
120 | 120 | ||
121 | static void bkey_put(struct cache_set *c, struct bkey *k, int level) | 121 | static void bkey_put(struct cache_set *c, struct bkey *k, int level) |
122 | { | 122 | { |
123 | if ((level && KEY_OFFSET(k)) || !level) | 123 | if ((level && KEY_OFFSET(k)) || !level) |
124 | __bkey_put(c, k); | 124 | __bkey_put(c, k); |
125 | } | 125 | } |
126 | 126 | ||
127 | /* Btree IO */ | 127 | /* Btree IO */ |
128 | 128 | ||
129 | static uint64_t btree_csum_set(struct btree *b, struct bset *i) | 129 | static uint64_t btree_csum_set(struct btree *b, struct bset *i) |
130 | { | 130 | { |
131 | uint64_t crc = b->key.ptr[0]; | 131 | uint64_t crc = b->key.ptr[0]; |
132 | void *data = (void *) i + 8, *end = end(i); | 132 | void *data = (void *) i + 8, *end = end(i); |
133 | 133 | ||
134 | crc = bch_crc64_update(crc, data, end - data); | 134 | crc = bch_crc64_update(crc, data, end - data); |
135 | return crc ^ 0xffffffffffffffffULL; | 135 | return crc ^ 0xffffffffffffffffULL; |
136 | } | 136 | } |
137 | 137 | ||
138 | void bch_btree_node_read_done(struct btree *b) | 138 | void bch_btree_node_read_done(struct btree *b) |
139 | { | 139 | { |
140 | const char *err = "bad btree header"; | 140 | const char *err = "bad btree header"; |
141 | struct bset *i = b->sets[0].data; | 141 | struct bset *i = b->sets[0].data; |
142 | struct btree_iter *iter; | 142 | struct btree_iter *iter; |
143 | 143 | ||
144 | iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); | 144 | iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); |
145 | iter->size = b->c->sb.bucket_size / b->c->sb.block_size; | 145 | iter->size = b->c->sb.bucket_size / b->c->sb.block_size; |
146 | iter->used = 0; | 146 | iter->used = 0; |
147 | 147 | ||
148 | if (!i->seq) | 148 | if (!i->seq) |
149 | goto err; | 149 | goto err; |
150 | 150 | ||
151 | for (; | 151 | for (; |
152 | b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; | 152 | b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; |
153 | i = write_block(b)) { | 153 | i = write_block(b)) { |
154 | err = "unsupported bset version"; | 154 | err = "unsupported bset version"; |
155 | if (i->version > BCACHE_BSET_VERSION) | 155 | if (i->version > BCACHE_BSET_VERSION) |
156 | goto err; | 156 | goto err; |
157 | 157 | ||
158 | err = "bad btree header"; | 158 | err = "bad btree header"; |
159 | if (b->written + set_blocks(i, b->c) > btree_blocks(b)) | 159 | if (b->written + set_blocks(i, b->c) > btree_blocks(b)) |
160 | goto err; | 160 | goto err; |
161 | 161 | ||
162 | err = "bad magic"; | 162 | err = "bad magic"; |
163 | if (i->magic != bset_magic(b->c)) | 163 | if (i->magic != bset_magic(b->c)) |
164 | goto err; | 164 | goto err; |
165 | 165 | ||
166 | err = "bad checksum"; | 166 | err = "bad checksum"; |
167 | switch (i->version) { | 167 | switch (i->version) { |
168 | case 0: | 168 | case 0: |
169 | if (i->csum != csum_set(i)) | 169 | if (i->csum != csum_set(i)) |
170 | goto err; | 170 | goto err; |
171 | break; | 171 | break; |
172 | case BCACHE_BSET_VERSION: | 172 | case BCACHE_BSET_VERSION: |
173 | if (i->csum != btree_csum_set(b, i)) | 173 | if (i->csum != btree_csum_set(b, i)) |
174 | goto err; | 174 | goto err; |
175 | break; | 175 | break; |
176 | } | 176 | } |
177 | 177 | ||
178 | err = "empty set"; | 178 | err = "empty set"; |
179 | if (i != b->sets[0].data && !i->keys) | 179 | if (i != b->sets[0].data && !i->keys) |
180 | goto err; | 180 | goto err; |
181 | 181 | ||
182 | bch_btree_iter_push(iter, i->start, end(i)); | 182 | bch_btree_iter_push(iter, i->start, end(i)); |
183 | 183 | ||
184 | b->written += set_blocks(i, b->c); | 184 | b->written += set_blocks(i, b->c); |
185 | } | 185 | } |
186 | 186 | ||
187 | err = "corrupted btree"; | 187 | err = "corrupted btree"; |
188 | for (i = write_block(b); | 188 | for (i = write_block(b); |
189 | index(i, b) < btree_blocks(b); | 189 | index(i, b) < btree_blocks(b); |
190 | i = ((void *) i) + block_bytes(b->c)) | 190 | i = ((void *) i) + block_bytes(b->c)) |
191 | if (i->seq == b->sets[0].data->seq) | 191 | if (i->seq == b->sets[0].data->seq) |
192 | goto err; | 192 | goto err; |
193 | 193 | ||
194 | bch_btree_sort_and_fix_extents(b, iter); | 194 | bch_btree_sort_and_fix_extents(b, iter); |
195 | 195 | ||
196 | i = b->sets[0].data; | 196 | i = b->sets[0].data; |
197 | err = "short btree key"; | 197 | err = "short btree key"; |
198 | if (b->sets[0].size && | 198 | if (b->sets[0].size && |
199 | bkey_cmp(&b->key, &b->sets[0].end) < 0) | 199 | bkey_cmp(&b->key, &b->sets[0].end) < 0) |
200 | goto err; | 200 | goto err; |
201 | 201 | ||
202 | if (b->written < btree_blocks(b)) | 202 | if (b->written < btree_blocks(b)) |
203 | bch_bset_init_next(b); | 203 | bch_bset_init_next(b); |
204 | out: | 204 | out: |
205 | mempool_free(iter, b->c->fill_iter); | 205 | mempool_free(iter, b->c->fill_iter); |
206 | return; | 206 | return; |
207 | err: | 207 | err: |
208 | set_btree_node_io_error(b); | 208 | set_btree_node_io_error(b); |
209 | bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", | 209 | bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", |
210 | err, PTR_BUCKET_NR(b->c, &b->key, 0), | 210 | err, PTR_BUCKET_NR(b->c, &b->key, 0), |
211 | index(i, b), i->keys); | 211 | index(i, b), i->keys); |
212 | goto out; | 212 | goto out; |
213 | } | 213 | } |
214 | 214 | ||
215 | static void btree_node_read_endio(struct bio *bio, int error) | 215 | static void btree_node_read_endio(struct bio *bio, int error) |
216 | { | 216 | { |
217 | struct closure *cl = bio->bi_private; | 217 | struct closure *cl = bio->bi_private; |
218 | closure_put(cl); | 218 | closure_put(cl); |
219 | } | 219 | } |
220 | 220 | ||
221 | void bch_btree_node_read(struct btree *b) | 221 | void bch_btree_node_read(struct btree *b) |
222 | { | 222 | { |
223 | uint64_t start_time = local_clock(); | 223 | uint64_t start_time = local_clock(); |
224 | struct closure cl; | 224 | struct closure cl; |
225 | struct bio *bio; | 225 | struct bio *bio; |
226 | 226 | ||
227 | trace_bcache_btree_read(b); | 227 | trace_bcache_btree_read(b); |
228 | 228 | ||
229 | closure_init_stack(&cl); | 229 | closure_init_stack(&cl); |
230 | 230 | ||
231 | bio = bch_bbio_alloc(b->c); | 231 | bio = bch_bbio_alloc(b->c); |
232 | bio->bi_rw = REQ_META|READ_SYNC; | 232 | bio->bi_rw = REQ_META|READ_SYNC; |
233 | bio->bi_size = KEY_SIZE(&b->key) << 9; | 233 | bio->bi_size = KEY_SIZE(&b->key) << 9; |
234 | bio->bi_end_io = btree_node_read_endio; | 234 | bio->bi_end_io = btree_node_read_endio; |
235 | bio->bi_private = &cl; | 235 | bio->bi_private = &cl; |
236 | 236 | ||
237 | bch_bio_map(bio, b->sets[0].data); | 237 | bch_bio_map(bio, b->sets[0].data); |
238 | 238 | ||
239 | bch_submit_bbio(bio, b->c, &b->key, 0); | 239 | bch_submit_bbio(bio, b->c, &b->key, 0); |
240 | closure_sync(&cl); | 240 | closure_sync(&cl); |
241 | 241 | ||
242 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | 242 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) |
243 | set_btree_node_io_error(b); | 243 | set_btree_node_io_error(b); |
244 | 244 | ||
245 | bch_bbio_free(bio, b->c); | 245 | bch_bbio_free(bio, b->c); |
246 | 246 | ||
247 | if (btree_node_io_error(b)) | 247 | if (btree_node_io_error(b)) |
248 | goto err; | 248 | goto err; |
249 | 249 | ||
250 | bch_btree_node_read_done(b); | 250 | bch_btree_node_read_done(b); |
251 | 251 | ||
252 | spin_lock(&b->c->btree_read_time_lock); | 252 | spin_lock(&b->c->btree_read_time_lock); |
253 | bch_time_stats_update(&b->c->btree_read_time, start_time); | 253 | bch_time_stats_update(&b->c->btree_read_time, start_time); |
254 | spin_unlock(&b->c->btree_read_time_lock); | 254 | spin_unlock(&b->c->btree_read_time_lock); |
255 | 255 | ||
256 | return; | 256 | return; |
257 | err: | 257 | err: |
258 | bch_cache_set_error(b->c, "io error reading bucket %lu", | 258 | bch_cache_set_error(b->c, "io error reading bucket %lu", |
259 | PTR_BUCKET_NR(b->c, &b->key, 0)); | 259 | PTR_BUCKET_NR(b->c, &b->key, 0)); |
260 | } | 260 | } |
261 | 261 | ||
262 | static void btree_complete_write(struct btree *b, struct btree_write *w) | 262 | static void btree_complete_write(struct btree *b, struct btree_write *w) |
263 | { | 263 | { |
264 | if (w->prio_blocked && | 264 | if (w->prio_blocked && |
265 | !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) | 265 | !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) |
266 | wake_up_allocators(b->c); | 266 | wake_up_allocators(b->c); |
267 | 267 | ||
268 | if (w->journal) { | 268 | if (w->journal) { |
269 | atomic_dec_bug(w->journal); | 269 | atomic_dec_bug(w->journal); |
270 | __closure_wake_up(&b->c->journal.wait); | 270 | __closure_wake_up(&b->c->journal.wait); |
271 | } | 271 | } |
272 | 272 | ||
273 | w->prio_blocked = 0; | 273 | w->prio_blocked = 0; |
274 | w->journal = NULL; | 274 | w->journal = NULL; |
275 | } | 275 | } |
276 | 276 | ||
277 | static void __btree_node_write_done(struct closure *cl) | 277 | static void __btree_node_write_done(struct closure *cl) |
278 | { | 278 | { |
279 | struct btree *b = container_of(cl, struct btree, io.cl); | 279 | struct btree *b = container_of(cl, struct btree, io.cl); |
280 | struct btree_write *w = btree_prev_write(b); | 280 | struct btree_write *w = btree_prev_write(b); |
281 | 281 | ||
282 | bch_bbio_free(b->bio, b->c); | 282 | bch_bbio_free(b->bio, b->c); |
283 | b->bio = NULL; | 283 | b->bio = NULL; |
284 | btree_complete_write(b, w); | 284 | btree_complete_write(b, w); |
285 | 285 | ||
286 | if (btree_node_dirty(b)) | 286 | if (btree_node_dirty(b)) |
287 | queue_delayed_work(btree_io_wq, &b->work, | 287 | queue_delayed_work(btree_io_wq, &b->work, |
288 | msecs_to_jiffies(30000)); | 288 | msecs_to_jiffies(30000)); |
289 | 289 | ||
290 | closure_return(cl); | 290 | closure_return(cl); |
291 | } | 291 | } |
292 | 292 | ||
293 | static void btree_node_write_done(struct closure *cl) | 293 | static void btree_node_write_done(struct closure *cl) |
294 | { | 294 | { |
295 | struct btree *b = container_of(cl, struct btree, io.cl); | 295 | struct btree *b = container_of(cl, struct btree, io.cl); |
296 | struct bio_vec *bv; | 296 | struct bio_vec *bv; |
297 | int n; | 297 | int n; |
298 | 298 | ||
299 | __bio_for_each_segment(bv, b->bio, n, 0) | 299 | __bio_for_each_segment(bv, b->bio, n, 0) |
300 | __free_page(bv->bv_page); | 300 | __free_page(bv->bv_page); |
301 | 301 | ||
302 | __btree_node_write_done(cl); | 302 | __btree_node_write_done(cl); |
303 | } | 303 | } |
304 | 304 | ||
305 | static void btree_node_write_endio(struct bio *bio, int error) | 305 | static void btree_node_write_endio(struct bio *bio, int error) |
306 | { | 306 | { |
307 | struct closure *cl = bio->bi_private; | 307 | struct closure *cl = bio->bi_private; |
308 | struct btree *b = container_of(cl, struct btree, io.cl); | 308 | struct btree *b = container_of(cl, struct btree, io.cl); |
309 | 309 | ||
310 | if (error) | 310 | if (error) |
311 | set_btree_node_io_error(b); | 311 | set_btree_node_io_error(b); |
312 | 312 | ||
313 | bch_bbio_count_io_errors(b->c, bio, error, "writing btree"); | 313 | bch_bbio_count_io_errors(b->c, bio, error, "writing btree"); |
314 | closure_put(cl); | 314 | closure_put(cl); |
315 | } | 315 | } |
316 | 316 | ||
317 | static void do_btree_node_write(struct btree *b) | 317 | static void do_btree_node_write(struct btree *b) |
318 | { | 318 | { |
319 | struct closure *cl = &b->io.cl; | 319 | struct closure *cl = &b->io.cl; |
320 | struct bset *i = b->sets[b->nsets].data; | 320 | struct bset *i = b->sets[b->nsets].data; |
321 | BKEY_PADDED(key) k; | 321 | BKEY_PADDED(key) k; |
322 | 322 | ||
323 | i->version = BCACHE_BSET_VERSION; | 323 | i->version = BCACHE_BSET_VERSION; |
324 | i->csum = btree_csum_set(b, i); | 324 | i->csum = btree_csum_set(b, i); |
325 | 325 | ||
326 | BUG_ON(b->bio); | 326 | BUG_ON(b->bio); |
327 | b->bio = bch_bbio_alloc(b->c); | 327 | b->bio = bch_bbio_alloc(b->c); |
328 | 328 | ||
329 | b->bio->bi_end_io = btree_node_write_endio; | 329 | b->bio->bi_end_io = btree_node_write_endio; |
330 | b->bio->bi_private = &b->io.cl; | 330 | b->bio->bi_private = &b->io.cl; |
331 | b->bio->bi_rw = REQ_META|WRITE_SYNC; | 331 | b->bio->bi_rw = REQ_META|WRITE_SYNC; |
332 | b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); | 332 | b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); |
333 | bch_bio_map(b->bio, i); | 333 | bch_bio_map(b->bio, i); |
334 | 334 | ||
335 | bkey_copy(&k.key, &b->key); | 335 | bkey_copy(&k.key, &b->key); |
336 | SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); | 336 | SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); |
337 | 337 | ||
338 | if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) { | 338 | if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) { |
339 | int j; | 339 | int j; |
340 | struct bio_vec *bv; | 340 | struct bio_vec *bv; |
341 | void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); | 341 | void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); |
342 | 342 | ||
343 | bio_for_each_segment(bv, b->bio, j) | 343 | bio_for_each_segment(bv, b->bio, j) |
344 | memcpy(page_address(bv->bv_page), | 344 | memcpy(page_address(bv->bv_page), |
345 | base + j * PAGE_SIZE, PAGE_SIZE); | 345 | base + j * PAGE_SIZE, PAGE_SIZE); |
346 | 346 | ||
347 | bch_submit_bbio(b->bio, b->c, &k.key, 0); | 347 | bch_submit_bbio(b->bio, b->c, &k.key, 0); |
348 | 348 | ||
349 | continue_at(cl, btree_node_write_done, NULL); | 349 | continue_at(cl, btree_node_write_done, NULL); |
350 | } else { | 350 | } else { |
351 | b->bio->bi_vcnt = 0; | 351 | b->bio->bi_vcnt = 0; |
352 | bch_bio_map(b->bio, i); | 352 | bch_bio_map(b->bio, i); |
353 | 353 | ||
354 | bch_submit_bbio(b->bio, b->c, &k.key, 0); | 354 | bch_submit_bbio(b->bio, b->c, &k.key, 0); |
355 | 355 | ||
356 | closure_sync(cl); | 356 | closure_sync(cl); |
357 | __btree_node_write_done(cl); | 357 | __btree_node_write_done(cl); |
358 | } | 358 | } |
359 | } | 359 | } |
360 | 360 | ||
361 | void bch_btree_node_write(struct btree *b, struct closure *parent) | 361 | void bch_btree_node_write(struct btree *b, struct closure *parent) |
362 | { | 362 | { |
363 | struct bset *i = b->sets[b->nsets].data; | 363 | struct bset *i = b->sets[b->nsets].data; |
364 | 364 | ||
365 | trace_bcache_btree_write(b); | 365 | trace_bcache_btree_write(b); |
366 | 366 | ||
367 | BUG_ON(current->bio_list); | 367 | BUG_ON(current->bio_list); |
368 | BUG_ON(b->written >= btree_blocks(b)); | 368 | BUG_ON(b->written >= btree_blocks(b)); |
369 | BUG_ON(b->written && !i->keys); | 369 | BUG_ON(b->written && !i->keys); |
370 | BUG_ON(b->sets->data->seq != i->seq); | 370 | BUG_ON(b->sets->data->seq != i->seq); |
371 | bch_check_key_order(b, i); | 371 | bch_check_key_order(b, i); |
372 | 372 | ||
373 | cancel_delayed_work(&b->work); | 373 | cancel_delayed_work(&b->work); |
374 | 374 | ||
375 | /* If caller isn't waiting for write, parent refcount is cache set */ | 375 | /* If caller isn't waiting for write, parent refcount is cache set */ |
376 | closure_lock(&b->io, parent ?: &b->c->cl); | 376 | closure_lock(&b->io, parent ?: &b->c->cl); |
377 | 377 | ||
378 | clear_bit(BTREE_NODE_dirty, &b->flags); | 378 | clear_bit(BTREE_NODE_dirty, &b->flags); |
379 | change_bit(BTREE_NODE_write_idx, &b->flags); | 379 | change_bit(BTREE_NODE_write_idx, &b->flags); |
380 | 380 | ||
381 | do_btree_node_write(b); | 381 | do_btree_node_write(b); |
382 | 382 | ||
383 | b->written += set_blocks(i, b->c); | 383 | b->written += set_blocks(i, b->c); |
384 | atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, | 384 | atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, |
385 | &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); | 385 | &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); |
386 | 386 | ||
387 | bch_btree_sort_lazy(b); | 387 | bch_btree_sort_lazy(b); |
388 | 388 | ||
389 | if (b->written < btree_blocks(b)) | 389 | if (b->written < btree_blocks(b)) |
390 | bch_bset_init_next(b); | 390 | bch_bset_init_next(b); |
391 | } | 391 | } |
392 | 392 | ||
393 | static void btree_node_write_work(struct work_struct *w) | 393 | static void btree_node_write_work(struct work_struct *w) |
394 | { | 394 | { |
395 | struct btree *b = container_of(to_delayed_work(w), struct btree, work); | 395 | struct btree *b = container_of(to_delayed_work(w), struct btree, work); |
396 | 396 | ||
397 | rw_lock(true, b, b->level); | 397 | rw_lock(true, b, b->level); |
398 | 398 | ||
399 | if (btree_node_dirty(b)) | 399 | if (btree_node_dirty(b)) |
400 | bch_btree_node_write(b, NULL); | 400 | bch_btree_node_write(b, NULL); |
401 | rw_unlock(true, b); | 401 | rw_unlock(true, b); |
402 | } | 402 | } |
403 | 403 | ||
404 | static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) | 404 | static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) |
405 | { | 405 | { |
406 | struct bset *i = b->sets[b->nsets].data; | 406 | struct bset *i = b->sets[b->nsets].data; |
407 | struct btree_write *w = btree_current_write(b); | 407 | struct btree_write *w = btree_current_write(b); |
408 | 408 | ||
409 | BUG_ON(!b->written); | 409 | BUG_ON(!b->written); |
410 | BUG_ON(!i->keys); | 410 | BUG_ON(!i->keys); |
411 | 411 | ||
412 | if (!btree_node_dirty(b)) | 412 | if (!btree_node_dirty(b)) |
413 | queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); | 413 | queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); |
414 | 414 | ||
415 | set_btree_node_dirty(b); | 415 | set_btree_node_dirty(b); |
416 | 416 | ||
417 | if (op && op->journal) { | 417 | if (op && op->journal) { |
418 | if (w->journal && | 418 | if (w->journal && |
419 | journal_pin_cmp(b->c, w, op)) { | 419 | journal_pin_cmp(b->c, w, op)) { |
420 | atomic_dec_bug(w->journal); | 420 | atomic_dec_bug(w->journal); |
421 | w->journal = NULL; | 421 | w->journal = NULL; |
422 | } | 422 | } |
423 | 423 | ||
424 | if (!w->journal) { | 424 | if (!w->journal) { |
425 | w->journal = op->journal; | 425 | w->journal = op->journal; |
426 | atomic_inc(w->journal); | 426 | atomic_inc(w->journal); |
427 | } | 427 | } |
428 | } | 428 | } |
429 | 429 | ||
430 | /* Force write if set is too big */ | 430 | /* Force write if set is too big */ |
431 | if (set_bytes(i) > PAGE_SIZE - 48 && | 431 | if (set_bytes(i) > PAGE_SIZE - 48 && |
432 | !current->bio_list) | 432 | !current->bio_list) |
433 | bch_btree_node_write(b, NULL); | 433 | bch_btree_node_write(b, NULL); |
434 | } | 434 | } |
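
The size check above amounts to: once a bset's key area outgrows roughly a page (minus 48 bytes of slack), write it out now instead of waiting for the 30-second timer. A standalone sketch of that threshold, assuming 4 KiB pages (the sample set size is invented):

#include <stdio.h>

#define DEMO_PAGE_SIZE 4096UL				/* assumes 4 KiB pages */

int main(void)
{
	unsigned long threshold = DEMO_PAGE_SIZE - 48;	/* same slack as the check above */
	unsigned long set_bytes = 4080;			/* invented sample set size */

	printf("threshold = %lu bytes\n", threshold);	/* 4048 */
	printf("force write now: %s\n", set_bytes > threshold ? "yes" : "no");
	return 0;
}
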
435 | 435 | ||
436 | /* | 436 | /* |
437 | * Btree in memory cache - allocation/freeing | 437 | * Btree in memory cache - allocation/freeing |
438 | * mca -> memory cache | 438 | * mca -> memory cache |
439 | */ | 439 | */ |
440 | 440 | ||
441 | static void mca_reinit(struct btree *b) | 441 | static void mca_reinit(struct btree *b) |
442 | { | 442 | { |
443 | unsigned i; | 443 | unsigned i; |
444 | 444 | ||
445 | b->flags = 0; | 445 | b->flags = 0; |
446 | b->written = 0; | 446 | b->written = 0; |
447 | b->nsets = 0; | 447 | b->nsets = 0; |
448 | 448 | ||
449 | for (i = 0; i < MAX_BSETS; i++) | 449 | for (i = 0; i < MAX_BSETS; i++) |
450 | b->sets[i].size = 0; | 450 | b->sets[i].size = 0; |
451 | /* | 451 | /* |
452 | * Second loop starts at 1 because b->sets[0]->data is the memory we | 452 | * Second loop starts at 1 because b->sets[0]->data is the memory we |
453 | * allocated | 453 | * allocated |
454 | */ | 454 | */ |
455 | for (i = 1; i < MAX_BSETS; i++) | 455 | for (i = 1; i < MAX_BSETS; i++) |
456 | b->sets[i].data = NULL; | 456 | b->sets[i].data = NULL; |
457 | } | 457 | } |
458 | 458 | ||
459 | #define mca_reserve(c) (((c->root && c->root->level) \ | 459 | #define mca_reserve(c) (((c->root && c->root->level) \ |
460 | ? c->root->level : 1) * 8 + 16) | 460 | ? c->root->level : 1) * 8 + 16) |
461 | #define mca_can_free(c) \ | 461 | #define mca_can_free(c) \ |
462 | max_t(int, 0, c->bucket_cache_used - mca_reserve(c)) | 462 | max_t(int, 0, c->bucket_cache_used - mca_reserve(c)) |
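
mca_reserve() keeps max(root level, 1) * 8 + 16 nodes pinned in memory, and mca_can_free() only counts what exceeds that reserve. The same arithmetic in isolation (the level and cache-usage figures are made up):

#include <stdio.h>

/* Mirrors mca_reserve()/mca_can_free() above, for illustration only. */
static int demo_reserve(int root_level)
{
	return (root_level ? root_level : 1) * 8 + 16;
}

static int demo_can_free(int cache_used, int root_level)
{
	int spare = cache_used - demo_reserve(root_level);
	return spare > 0 ? spare : 0;
}

int main(void)
{
	/* e.g. a three-level tree (root at level 2) with 50 cached nodes */
	printf("reserve  = %d\n", demo_reserve(2));		/* 32 */
	printf("can free = %d\n", demo_can_free(50, 2));	/* 18 */
	return 0;
}
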
463 | 463 | ||
464 | static void mca_data_free(struct btree *b) | 464 | static void mca_data_free(struct btree *b) |
465 | { | 465 | { |
466 | struct bset_tree *t = b->sets; | 466 | struct bset_tree *t = b->sets; |
467 | BUG_ON(!closure_is_unlocked(&b->io.cl)); | 467 | BUG_ON(!closure_is_unlocked(&b->io.cl)); |
468 | 468 | ||
469 | if (bset_prev_bytes(b) < PAGE_SIZE) | 469 | if (bset_prev_bytes(b) < PAGE_SIZE) |
470 | kfree(t->prev); | 470 | kfree(t->prev); |
471 | else | 471 | else |
472 | free_pages((unsigned long) t->prev, | 472 | free_pages((unsigned long) t->prev, |
473 | get_order(bset_prev_bytes(b))); | 473 | get_order(bset_prev_bytes(b))); |
474 | 474 | ||
475 | if (bset_tree_bytes(b) < PAGE_SIZE) | 475 | if (bset_tree_bytes(b) < PAGE_SIZE) |
476 | kfree(t->tree); | 476 | kfree(t->tree); |
477 | else | 477 | else |
478 | free_pages((unsigned long) t->tree, | 478 | free_pages((unsigned long) t->tree, |
479 | get_order(bset_tree_bytes(b))); | 479 | get_order(bset_tree_bytes(b))); |
480 | 480 | ||
481 | free_pages((unsigned long) t->data, b->page_order); | 481 | free_pages((unsigned long) t->data, b->page_order); |
482 | 482 | ||
483 | t->prev = NULL; | 483 | t->prev = NULL; |
484 | t->tree = NULL; | 484 | t->tree = NULL; |
485 | t->data = NULL; | 485 | t->data = NULL; |
486 | list_move(&b->list, &b->c->btree_cache_freed); | 486 | list_move(&b->list, &b->c->btree_cache_freed); |
487 | b->c->bucket_cache_used--; | 487 | b->c->bucket_cache_used--; |
488 | } | 488 | } |
489 | 489 | ||
490 | static void mca_bucket_free(struct btree *b) | 490 | static void mca_bucket_free(struct btree *b) |
491 | { | 491 | { |
492 | BUG_ON(btree_node_dirty(b)); | 492 | BUG_ON(btree_node_dirty(b)); |
493 | 493 | ||
494 | b->key.ptr[0] = 0; | 494 | b->key.ptr[0] = 0; |
495 | hlist_del_init_rcu(&b->hash); | 495 | hlist_del_init_rcu(&b->hash); |
496 | list_move(&b->list, &b->c->btree_cache_freeable); | 496 | list_move(&b->list, &b->c->btree_cache_freeable); |
497 | } | 497 | } |
498 | 498 | ||
499 | static unsigned btree_order(struct bkey *k) | 499 | static unsigned btree_order(struct bkey *k) |
500 | { | 500 | { |
501 | return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); | 501 | return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); |
502 | } | 502 | } |
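
btree_order() turns a key's size into a page-allocation order. Assuming the usual 512-byte sectors and 4 KiB pages, PAGE_SECTORS is 8, so a 256-sector (128 KiB) node key, for example, gives order 5. A self-contained version of that calculation:

#include <stdio.h>

#define DEMO_PAGE_SECTORS 8u	/* assumes 512-byte sectors, 4 KiB pages */

/* ilog2 for illustration: index of the highest set bit. */
static unsigned demo_ilog2(unsigned v)
{
	unsigned r = 0;
	while (v >>= 1)
		r++;
	return r;
}

/* Same shape as btree_order() above: order of the node's page allocation. */
static unsigned demo_node_order(unsigned key_size_sectors)
{
	unsigned pages = key_size_sectors / DEMO_PAGE_SECTORS;
	return demo_ilog2(pages ? pages : 1);	/* the "?: 1" guards tiny keys */
}

int main(void)
{
	printf("%u\n", demo_node_order(256));	/* 128 KiB node -> order 5 (32 pages) */
	printf("%u\n", demo_node_order(4));	/* sub-page key  -> order 0 */
	return 0;
}
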
503 | 503 | ||
504 | static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) | 504 | static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) |
505 | { | 505 | { |
506 | struct bset_tree *t = b->sets; | 506 | struct bset_tree *t = b->sets; |
507 | BUG_ON(t->data); | 507 | BUG_ON(t->data); |
508 | 508 | ||
509 | b->page_order = max_t(unsigned, | 509 | b->page_order = max_t(unsigned, |
510 | ilog2(b->c->btree_pages), | 510 | ilog2(b->c->btree_pages), |
511 | btree_order(k)); | 511 | btree_order(k)); |
512 | 512 | ||
513 | t->data = (void *) __get_free_pages(gfp, b->page_order); | 513 | t->data = (void *) __get_free_pages(gfp, b->page_order); |
514 | if (!t->data) | 514 | if (!t->data) |
515 | goto err; | 515 | goto err; |
516 | 516 | ||
517 | t->tree = bset_tree_bytes(b) < PAGE_SIZE | 517 | t->tree = bset_tree_bytes(b) < PAGE_SIZE |
518 | ? kmalloc(bset_tree_bytes(b), gfp) | 518 | ? kmalloc(bset_tree_bytes(b), gfp) |
519 | : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); | 519 | : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); |
520 | if (!t->tree) | 520 | if (!t->tree) |
521 | goto err; | 521 | goto err; |
522 | 522 | ||
523 | t->prev = bset_prev_bytes(b) < PAGE_SIZE | 523 | t->prev = bset_prev_bytes(b) < PAGE_SIZE |
524 | ? kmalloc(bset_prev_bytes(b), gfp) | 524 | ? kmalloc(bset_prev_bytes(b), gfp) |
525 | : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); | 525 | : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); |
526 | if (!t->prev) | 526 | if (!t->prev) |
527 | goto err; | 527 | goto err; |
528 | 528 | ||
529 | list_move(&b->list, &b->c->btree_cache); | 529 | list_move(&b->list, &b->c->btree_cache); |
530 | b->c->bucket_cache_used++; | 530 | b->c->bucket_cache_used++; |
531 | return; | 531 | return; |
532 | err: | 532 | err: |
533 | mca_data_free(b); | 533 | mca_data_free(b); |
534 | } | 534 | } |
535 | 535 | ||
536 | static struct btree *mca_bucket_alloc(struct cache_set *c, | 536 | static struct btree *mca_bucket_alloc(struct cache_set *c, |
537 | struct bkey *k, gfp_t gfp) | 537 | struct bkey *k, gfp_t gfp) |
538 | { | 538 | { |
539 | struct btree *b = kzalloc(sizeof(struct btree), gfp); | 539 | struct btree *b = kzalloc(sizeof(struct btree), gfp); |
540 | if (!b) | 540 | if (!b) |
541 | return NULL; | 541 | return NULL; |
542 | 542 | ||
543 | init_rwsem(&b->lock); | 543 | init_rwsem(&b->lock); |
544 | lockdep_set_novalidate_class(&b->lock); | 544 | lockdep_set_novalidate_class(&b->lock); |
545 | INIT_LIST_HEAD(&b->list); | 545 | INIT_LIST_HEAD(&b->list); |
546 | INIT_DELAYED_WORK(&b->work, btree_node_write_work); | 546 | INIT_DELAYED_WORK(&b->work, btree_node_write_work); |
547 | b->c = c; | 547 | b->c = c; |
548 | closure_init_unlocked(&b->io); | 548 | closure_init_unlocked(&b->io); |
549 | 549 | ||
550 | mca_data_alloc(b, k, gfp); | 550 | mca_data_alloc(b, k, gfp); |
551 | return b; | 551 | return b; |
552 | } | 552 | } |
553 | 553 | ||
554 | static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) | 554 | static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) |
555 | { | 555 | { |
556 | lockdep_assert_held(&b->c->bucket_lock); | 556 | lockdep_assert_held(&b->c->bucket_lock); |
557 | 557 | ||
558 | if (!down_write_trylock(&b->lock)) | 558 | if (!down_write_trylock(&b->lock)) |
559 | return -ENOMEM; | 559 | return -ENOMEM; |
560 | 560 | ||
561 | if (b->page_order < min_order) { | 561 | if (b->page_order < min_order) { |
562 | rw_unlock(true, b); | 562 | rw_unlock(true, b); |
563 | return -ENOMEM; | 563 | return -ENOMEM; |
564 | } | 564 | } |
565 | 565 | ||
566 | BUG_ON(btree_node_dirty(b) && !b->sets[0].data); | 566 | BUG_ON(btree_node_dirty(b) && !b->sets[0].data); |
567 | 567 | ||
568 | if (cl && btree_node_dirty(b)) | 568 | if (cl && btree_node_dirty(b)) |
569 | bch_btree_node_write(b, NULL); | 569 | bch_btree_node_write(b, NULL); |
570 | 570 | ||
571 | if (cl) | 571 | if (cl) |
572 | closure_wait_event_async(&b->io.wait, cl, | 572 | closure_wait_event_async(&b->io.wait, cl, |
573 | atomic_read(&b->io.cl.remaining) == -1); | 573 | atomic_read(&b->io.cl.remaining) == -1); |
574 | 574 | ||
575 | if (btree_node_dirty(b) || | 575 | if (btree_node_dirty(b) || |
576 | !closure_is_unlocked(&b->io.cl) || | 576 | !closure_is_unlocked(&b->io.cl) || |
577 | work_pending(&b->work.work)) { | 577 | work_pending(&b->work.work)) { |
578 | rw_unlock(true, b); | 578 | rw_unlock(true, b); |
579 | return -EAGAIN; | 579 | return -EAGAIN; |
580 | } | 580 | } |
581 | 581 | ||
582 | return 0; | 582 | return 0; |
583 | } | 583 | } |
584 | 584 | ||
585 | static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc) | 585 | static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc) |
586 | { | 586 | { |
587 | struct cache_set *c = container_of(shrink, struct cache_set, shrink); | 587 | struct cache_set *c = container_of(shrink, struct cache_set, shrink); |
588 | struct btree *b, *t; | 588 | struct btree *b, *t; |
589 | unsigned long i, nr = sc->nr_to_scan; | 589 | unsigned long i, nr = sc->nr_to_scan; |
590 | 590 | ||
591 | if (c->shrinker_disabled) | 591 | if (c->shrinker_disabled) |
592 | return 0; | 592 | return 0; |
593 | 593 | ||
594 | if (c->try_harder) | 594 | if (c->try_harder) |
595 | return 0; | 595 | return 0; |
596 | 596 | ||
597 | /* | 597 | /* |
598 | * If nr == 0, we're supposed to return the number of items we have | 598 | * If nr == 0, we're supposed to return the number of items we have |
599 | * cached. Not allowed to return -1. | 599 | * cached. Not allowed to return -1. |
600 | */ | 600 | */ |
601 | if (!nr) | 601 | if (!nr) |
602 | return mca_can_free(c) * c->btree_pages; | 602 | return mca_can_free(c) * c->btree_pages; |
603 | 603 | ||
604 | /* Return -1 if we can't do anything right now */ | 604 | /* Return -1 if we can't do anything right now */ |
605 | if (sc->gfp_mask & __GFP_WAIT) | 605 | if (sc->gfp_mask & __GFP_WAIT) |
606 | mutex_lock(&c->bucket_lock); | 606 | mutex_lock(&c->bucket_lock); |
607 | else if (!mutex_trylock(&c->bucket_lock)) | 607 | else if (!mutex_trylock(&c->bucket_lock)) |
608 | return -1; | 608 | return -1; |
609 | 609 | ||
610 | nr /= c->btree_pages; | 610 | nr /= c->btree_pages; |
611 | nr = min_t(unsigned long, nr, mca_can_free(c)); | 611 | nr = min_t(unsigned long, nr, mca_can_free(c)); |
612 | 612 | ||
613 | i = 0; | 613 | i = 0; |
614 | list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { | 614 | list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { |
615 | if (!nr) | 615 | if (!nr) |
616 | break; | 616 | break; |
617 | 617 | ||
618 | if (++i > 3 && | 618 | if (++i > 3 && |
619 | !mca_reap(b, NULL, 0)) { | 619 | !mca_reap(b, NULL, 0)) { |
620 | mca_data_free(b); | 620 | mca_data_free(b); |
621 | rw_unlock(true, b); | 621 | rw_unlock(true, b); |
622 | --nr; | 622 | --nr; |
623 | } | 623 | } |
624 | } | 624 | } |
625 | 625 | ||
626 | /* | 626 | /* |
627 | * Can happen right when we first start up, before we've read in any | 627 | * Can happen right when we first start up, before we've read in any |
628 | * btree nodes | 628 | * btree nodes |
629 | */ | 629 | */ |
630 | if (list_empty(&c->btree_cache)) | 630 | if (list_empty(&c->btree_cache)) |
631 | goto out; | 631 | goto out; |
632 | 632 | ||
633 | for (i = 0; nr && i < c->bucket_cache_used; i++) { | 633 | for (i = 0; nr && i < c->bucket_cache_used; i++) { |
634 | b = list_first_entry(&c->btree_cache, struct btree, list); | 634 | b = list_first_entry(&c->btree_cache, struct btree, list); |
635 | list_rotate_left(&c->btree_cache); | 635 | list_rotate_left(&c->btree_cache); |
636 | 636 | ||
637 | if (!b->accessed && | 637 | if (!b->accessed && |
638 | !mca_reap(b, NULL, 0)) { | 638 | !mca_reap(b, NULL, 0)) { |
639 | mca_bucket_free(b); | 639 | mca_bucket_free(b); |
640 | mca_data_free(b); | 640 | mca_data_free(b); |
641 | rw_unlock(true, b); | 641 | rw_unlock(true, b); |
642 | --nr; | 642 | --nr; |
643 | } else | 643 | } else |
644 | b->accessed = 0; | 644 | b->accessed = 0; |
645 | } | 645 | } |
646 | out: | 646 | out: |
647 | nr = mca_can_free(c) * c->btree_pages; | 647 | nr = mca_can_free(c) * c->btree_pages; |
648 | mutex_unlock(&c->bucket_lock); | 648 | mutex_unlock(&c->bucket_lock); |
649 | return nr; | 649 | return nr; |
650 | } | 650 | } |
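
bch_mca_shrink() follows the legacy single-callback shrinker contract: nr_to_scan == 0 means "report how many objects you hold" (and must not return -1), a nonzero value means "reclaim up to this many", and -1 signals that no progress can be made right now. A stripped-down user-space analogue of that contract (the types are stand-ins, not the kernel's):

#include <stdio.h>

/* Stand-in for the old struct shrink_control. */
struct demo_shrink_control {
	unsigned long nr_to_scan;
};

static int cached_objects = 100;

/* Same calling convention as the legacy ->shrink() hook above. */
static int demo_shrink(struct demo_shrink_control *sc)
{
	if (!sc->nr_to_scan)
		return cached_objects;	/* count query: never return -1 */

	/* A real shrinker returns -1 here if it cannot make progress
	 * (e.g. a trylock failed); this demo always proceeds. */
	while (sc->nr_to_scan-- && cached_objects)
		cached_objects--;	/* "free" one object per step */

	return cached_objects;		/* report what is still cached */
}

int main(void)
{
	struct demo_shrink_control count = { 0 }, scan = { 30 };

	printf("count: %d\n", demo_shrink(&count));		/* 100 */
	printf("after scan: %d\n", demo_shrink(&scan));		/* 70 */
	return 0;
}
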
651 | 651 | ||
652 | void bch_btree_cache_free(struct cache_set *c) | 652 | void bch_btree_cache_free(struct cache_set *c) |
653 | { | 653 | { |
654 | struct btree *b; | 654 | struct btree *b; |
655 | struct closure cl; | 655 | struct closure cl; |
656 | closure_init_stack(&cl); | 656 | closure_init_stack(&cl); |
657 | 657 | ||
658 | if (c->shrink.list.next) | 658 | if (c->shrink.list.next) |
659 | unregister_shrinker(&c->shrink); | 659 | unregister_shrinker(&c->shrink); |
660 | 660 | ||
661 | mutex_lock(&c->bucket_lock); | 661 | mutex_lock(&c->bucket_lock); |
662 | 662 | ||
663 | #ifdef CONFIG_BCACHE_DEBUG | 663 | #ifdef CONFIG_BCACHE_DEBUG |
664 | if (c->verify_data) | 664 | if (c->verify_data) |
665 | list_move(&c->verify_data->list, &c->btree_cache); | 665 | list_move(&c->verify_data->list, &c->btree_cache); |
666 | #endif | 666 | #endif |
667 | 667 | ||
668 | list_splice(&c->btree_cache_freeable, | 668 | list_splice(&c->btree_cache_freeable, |
669 | &c->btree_cache); | 669 | &c->btree_cache); |
670 | 670 | ||
671 | while (!list_empty(&c->btree_cache)) { | 671 | while (!list_empty(&c->btree_cache)) { |
672 | b = list_first_entry(&c->btree_cache, struct btree, list); | 672 | b = list_first_entry(&c->btree_cache, struct btree, list); |
673 | 673 | ||
674 | if (btree_node_dirty(b)) | 674 | if (btree_node_dirty(b)) |
675 | btree_complete_write(b, btree_current_write(b)); | 675 | btree_complete_write(b, btree_current_write(b)); |
676 | clear_bit(BTREE_NODE_dirty, &b->flags); | 676 | clear_bit(BTREE_NODE_dirty, &b->flags); |
677 | 677 | ||
678 | mca_data_free(b); | 678 | mca_data_free(b); |
679 | } | 679 | } |
680 | 680 | ||
681 | while (!list_empty(&c->btree_cache_freed)) { | 681 | while (!list_empty(&c->btree_cache_freed)) { |
682 | b = list_first_entry(&c->btree_cache_freed, | 682 | b = list_first_entry(&c->btree_cache_freed, |
683 | struct btree, list); | 683 | struct btree, list); |
684 | list_del(&b->list); | 684 | list_del(&b->list); |
685 | cancel_delayed_work_sync(&b->work); | 685 | cancel_delayed_work_sync(&b->work); |
686 | kfree(b); | 686 | kfree(b); |
687 | } | 687 | } |
688 | 688 | ||
689 | mutex_unlock(&c->bucket_lock); | 689 | mutex_unlock(&c->bucket_lock); |
690 | } | 690 | } |
691 | 691 | ||
692 | int bch_btree_cache_alloc(struct cache_set *c) | 692 | int bch_btree_cache_alloc(struct cache_set *c) |
693 | { | 693 | { |
694 | unsigned i; | 694 | unsigned i; |
695 | 695 | ||
696 | /* XXX: doesn't check for errors */ | 696 | /* XXX: doesn't check for errors */ |
697 | 697 | ||
698 | closure_init_unlocked(&c->gc); | 698 | closure_init_unlocked(&c->gc); |
699 | 699 | ||
700 | for (i = 0; i < mca_reserve(c); i++) | 700 | for (i = 0; i < mca_reserve(c); i++) |
701 | mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); | 701 | mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); |
702 | 702 | ||
703 | list_splice_init(&c->btree_cache, | 703 | list_splice_init(&c->btree_cache, |
704 | &c->btree_cache_freeable); | 704 | &c->btree_cache_freeable); |
705 | 705 | ||
706 | #ifdef CONFIG_BCACHE_DEBUG | 706 | #ifdef CONFIG_BCACHE_DEBUG |
707 | mutex_init(&c->verify_lock); | 707 | mutex_init(&c->verify_lock); |
708 | 708 | ||
709 | c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); | 709 | c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); |
710 | 710 | ||
711 | if (c->verify_data && | 711 | if (c->verify_data && |
712 | c->verify_data->sets[0].data) | 712 | c->verify_data->sets[0].data) |
713 | list_del_init(&c->verify_data->list); | 713 | list_del_init(&c->verify_data->list); |
714 | else | 714 | else |
715 | c->verify_data = NULL; | 715 | c->verify_data = NULL; |
716 | #endif | 716 | #endif |
717 | 717 | ||
718 | c->shrink.shrink = bch_mca_shrink; | 718 | c->shrink.shrink = bch_mca_shrink; |
719 | c->shrink.seeks = 4; | 719 | c->shrink.seeks = 4; |
720 | c->shrink.batch = c->btree_pages * 2; | 720 | c->shrink.batch = c->btree_pages * 2; |
721 | register_shrinker(&c->shrink); | 721 | register_shrinker(&c->shrink); |
722 | 722 | ||
723 | return 0; | 723 | return 0; |
724 | } | 724 | } |
725 | 725 | ||
726 | /* Btree in memory cache - hash table */ | 726 | /* Btree in memory cache - hash table */ |
727 | 727 | ||
728 | static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k) | 728 | static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k) |
729 | { | 729 | { |
730 | return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)]; | 730 | return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)]; |
731 | } | 731 | } |
732 | 732 | ||
733 | static struct btree *mca_find(struct cache_set *c, struct bkey *k) | 733 | static struct btree *mca_find(struct cache_set *c, struct bkey *k) |
734 | { | 734 | { |
735 | struct btree *b; | 735 | struct btree *b; |
736 | 736 | ||
737 | rcu_read_lock(); | 737 | rcu_read_lock(); |
738 | hlist_for_each_entry_rcu(b, mca_hash(c, k), hash) | 738 | hlist_for_each_entry_rcu(b, mca_hash(c, k), hash) |
739 | if (PTR_HASH(c, &b->key) == PTR_HASH(c, k)) | 739 | if (PTR_HASH(c, &b->key) == PTR_HASH(c, k)) |
740 | goto out; | 740 | goto out; |
741 | b = NULL; | 741 | b = NULL; |
742 | out: | 742 | out: |
743 | rcu_read_unlock(); | 743 | rcu_read_unlock(); |
744 | return b; | 744 | return b; |
745 | } | 745 | } |
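
mca_find() buckets cached nodes by hash_32() of the key's first pointer. The 32-bit multiplicative hash of that era boils down to one multiply by a golden-ratio constant followed by keeping the top bits; a standalone version of the same bucketing (the table size is a stand-in for BUCKET_HASH_BITS):

#include <stdio.h>

/* Golden-ratio constant used by hash_32() in kernels of this vintage. */
#define DEMO_GOLDEN_RATIO_PRIME_32	0x9e370001u
/* Stand-in table size; the real BUCKET_HASH_BITS may differ. */
#define DEMO_HASH_BITS			12

static unsigned demo_hash_32(unsigned val, unsigned bits)
{
	/* Multiply, then keep the top 'bits' bits - same shape as hash_32(). */
	return (val * DEMO_GOLDEN_RATIO_PRIME_32) >> (32 - bits);
}

int main(void)
{
	unsigned ptr_hash = 123456;	/* invented stand-in for PTR_HASH(c, k) */

	printf("hash bucket = %u of %u\n",
	       demo_hash_32(ptr_hash, DEMO_HASH_BITS), 1u << DEMO_HASH_BITS);
	return 0;
}
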
746 | 746 | ||
747 | static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, | 747 | static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, |
748 | int level, struct closure *cl) | 748 | int level, struct closure *cl) |
749 | { | 749 | { |
750 | int ret = -ENOMEM; | 750 | int ret = -ENOMEM; |
751 | struct btree *i; | 751 | struct btree *i; |
752 | 752 | ||
753 | trace_bcache_btree_cache_cannibalize(c); | 753 | trace_bcache_btree_cache_cannibalize(c); |
754 | 754 | ||
755 | if (!cl) | 755 | if (!cl) |
756 | return ERR_PTR(-ENOMEM); | 756 | return ERR_PTR(-ENOMEM); |
757 | 757 | ||
758 | /* | 758 | /* |
759 | * Trying to free up some memory - i.e. reuse some btree nodes - may | 759 | * Trying to free up some memory - i.e. reuse some btree nodes - may |
760 | * require initiating IO to flush the dirty part of the node. If we're | 760 | * require initiating IO to flush the dirty part of the node. If we're |
761 | * running under generic_make_request(), that IO will never finish and | 761 | * running under generic_make_request(), that IO will never finish and |
762 | * we would deadlock. Returning -EAGAIN causes the cache lookup code to | 762 | * we would deadlock. Returning -EAGAIN causes the cache lookup code to |
763 | * punt to workqueue and retry. | 763 | * punt to workqueue and retry. |
764 | */ | 764 | */ |
765 | if (current->bio_list) | 765 | if (current->bio_list) |
766 | return ERR_PTR(-EAGAIN); | 766 | return ERR_PTR(-EAGAIN); |
767 | 767 | ||
768 | if (c->try_harder && c->try_harder != cl) { | 768 | if (c->try_harder && c->try_harder != cl) { |
769 | closure_wait_event_async(&c->try_wait, cl, !c->try_harder); | 769 | closure_wait_event_async(&c->try_wait, cl, !c->try_harder); |
770 | return ERR_PTR(-EAGAIN); | 770 | return ERR_PTR(-EAGAIN); |
771 | } | 771 | } |
772 | 772 | ||
773 | c->try_harder = cl; | 773 | c->try_harder = cl; |
774 | c->try_harder_start = local_clock(); | 774 | c->try_harder_start = local_clock(); |
775 | retry: | 775 | retry: |
776 | list_for_each_entry_reverse(i, &c->btree_cache, list) { | 776 | list_for_each_entry_reverse(i, &c->btree_cache, list) { |
777 | int r = mca_reap(i, cl, btree_order(k)); | 777 | int r = mca_reap(i, cl, btree_order(k)); |
778 | if (!r) | 778 | if (!r) |
779 | return i; | 779 | return i; |
780 | if (r != -ENOMEM) | 780 | if (r != -ENOMEM) |
781 | ret = r; | 781 | ret = r; |
782 | } | 782 | } |
783 | 783 | ||
784 | if (ret == -EAGAIN && | 784 | if (ret == -EAGAIN && |
785 | closure_blocking(cl)) { | 785 | closure_blocking(cl)) { |
786 | mutex_unlock(&c->bucket_lock); | 786 | mutex_unlock(&c->bucket_lock); |
787 | closure_sync(cl); | 787 | closure_sync(cl); |
788 | mutex_lock(&c->bucket_lock); | 788 | mutex_lock(&c->bucket_lock); |
789 | goto retry; | 789 | goto retry; |
790 | } | 790 | } |
791 | 791 | ||
792 | return ERR_PTR(ret); | 792 | return ERR_PTR(ret); |
793 | } | 793 | } |
794 | 794 | ||
795 | /* | 795 | /* |
796 | * We can only have one thread cannibalizing other cached btree nodes at a time, | 796 | * We can only have one thread cannibalizing other cached btree nodes at a time, |
797 | * or we'll deadlock. We use an open-coded mutex to ensure that, which | 797 | * or we'll deadlock. We use an open-coded mutex to ensure that, which |
798 | * cannibalize_bucket() will take. This means every time we unlock the root of | 798 | * cannibalize_bucket() will take. This means every time we unlock the root of |
799 | * the btree, we need to release this lock if we have it held. | 799 | * the btree, we need to release this lock if we have it held. |
800 | */ | 800 | */ |
801 | void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) | 801 | void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) |
802 | { | 802 | { |
803 | if (c->try_harder == cl) { | 803 | if (c->try_harder == cl) { |
804 | bch_time_stats_update(&c->try_harder_time, c->try_harder_start); | 804 | bch_time_stats_update(&c->try_harder_time, c->try_harder_start); |
805 | c->try_harder = NULL; | 805 | c->try_harder = NULL; |
806 | __closure_wake_up(&c->try_wait); | 806 | __closure_wake_up(&c->try_wait); |
807 | } | 807 | } |
808 | } | 808 | } |
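
try_harder and try_wait above form a hand-rolled owner lock: one closure claims the right to cannibalize, everyone else parks on the wait list, and the unlock path hands it back and wakes the waiters. A rough pthread analogue of that pattern, for illustration only:

#include <pthread.h>
#include <stdbool.h>

/* Illustrative analogue of try_harder (owner) + try_wait (wait list). */
struct canni_lock {
	pthread_mutex_t lock;
	pthread_cond_t  wait;
	void           *owner;		/* NULL when free */
};

/* Claim the lock for 'me', or report that the caller must back off. */
static bool canni_trylock(struct canni_lock *c, void *me)
{
	bool got;

	pthread_mutex_lock(&c->lock);
	if (!c->owner || c->owner == me) {
		c->owner = me;
		got = true;
	} else {
		got = false;	/* the kernel code would queue on try_wait here */
	}
	pthread_mutex_unlock(&c->lock);
	return got;
}

/* Mirror of bch_cannibalize_unlock(): only the owner releases and wakes. */
static void canni_unlock(struct canni_lock *c, void *me)
{
	pthread_mutex_lock(&c->lock);
	if (c->owner == me) {
		c->owner = NULL;
		pthread_cond_broadcast(&c->wait);
	}
	pthread_mutex_unlock(&c->lock);
}

int main(void)
{
	struct canni_lock c = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, NULL
	};
	int a, b;

	canni_trylock(&c, &a);		/* a becomes the cannibalizing owner */
	(void)canni_trylock(&c, &b);	/* b would have to wait */
	canni_unlock(&c, &a);		/* a is done: waiters woken */
	return 0;
}
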
809 | 809 | ||
810 | static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, | 810 | static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, |
811 | int level, struct closure *cl) | 811 | int level, struct closure *cl) |
812 | { | 812 | { |
813 | struct btree *b; | 813 | struct btree *b; |
814 | 814 | ||
815 | lockdep_assert_held(&c->bucket_lock); | 815 | lockdep_assert_held(&c->bucket_lock); |
816 | 816 | ||
817 | if (mca_find(c, k)) | 817 | if (mca_find(c, k)) |
818 | return NULL; | 818 | return NULL; |
819 | 819 | ||
820 | /* btree_free() doesn't free memory; it sticks the node on the end of | 820 | /* btree_free() doesn't free memory; it sticks the node on the end of |
821 | * the list. Check if there are any freed nodes there: | 821 | * the list. Check if there are any freed nodes there: |
822 | */ | 822 | */ |
823 | list_for_each_entry(b, &c->btree_cache_freeable, list) | 823 | list_for_each_entry(b, &c->btree_cache_freeable, list) |
824 | if (!mca_reap(b, NULL, btree_order(k))) | 824 | if (!mca_reap(b, NULL, btree_order(k))) |
825 | goto out; | 825 | goto out; |
826 | 826 | ||
827 | /* We never free struct btree itself, just the memory that holds the on | 827 | /* We never free struct btree itself, just the memory that holds the on |
828 | * disk node. Check the freed list before allocating a new one: | 828 | * disk node. Check the freed list before allocating a new one: |
829 | */ | 829 | */ |
830 | list_for_each_entry(b, &c->btree_cache_freed, list) | 830 | list_for_each_entry(b, &c->btree_cache_freed, list) |
831 | if (!mca_reap(b, NULL, 0)) { | 831 | if (!mca_reap(b, NULL, 0)) { |
832 | mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); | 832 | mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); |
833 | if (!b->sets[0].data) | 833 | if (!b->sets[0].data) |
834 | goto err; | 834 | goto err; |
835 | else | 835 | else |
836 | goto out; | 836 | goto out; |
837 | } | 837 | } |
838 | 838 | ||
839 | b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO); | 839 | b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO); |
840 | if (!b) | 840 | if (!b) |
841 | goto err; | 841 | goto err; |
842 | 842 | ||
843 | BUG_ON(!down_write_trylock(&b->lock)); | 843 | BUG_ON(!down_write_trylock(&b->lock)); |
844 | if (!b->sets->data) | 844 | if (!b->sets->data) |
845 | goto err; | 845 | goto err; |
846 | out: | 846 | out: |
847 | BUG_ON(!closure_is_unlocked(&b->io.cl)); | 847 | BUG_ON(!closure_is_unlocked(&b->io.cl)); |
848 | 848 | ||
849 | bkey_copy(&b->key, k); | 849 | bkey_copy(&b->key, k); |
850 | list_move(&b->list, &c->btree_cache); | 850 | list_move(&b->list, &c->btree_cache); |
851 | hlist_del_init_rcu(&b->hash); | 851 | hlist_del_init_rcu(&b->hash); |
852 | hlist_add_head_rcu(&b->hash, mca_hash(c, k)); | 852 | hlist_add_head_rcu(&b->hash, mca_hash(c, k)); |
853 | 853 | ||
854 | lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); | 854 | lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); |
855 | b->level = level; | 855 | b->level = level; |
856 | 856 | ||
857 | mca_reinit(b); | 857 | mca_reinit(b); |
858 | 858 | ||
859 | return b; | 859 | return b; |
860 | err: | 860 | err: |
861 | if (b) | 861 | if (b) |
862 | rw_unlock(true, b); | 862 | rw_unlock(true, b); |
863 | 863 | ||
864 | b = mca_cannibalize(c, k, level, cl); | 864 | b = mca_cannibalize(c, k, level, cl); |
865 | if (!IS_ERR(b)) | 865 | if (!IS_ERR(b)) |
866 | goto out; | 866 | goto out; |
867 | 867 | ||
868 | return b; | 868 | return b; |
869 | } | 869 | } |
870 | 870 | ||
871 | /** | 871 | /** |
872 | * bch_btree_node_get - find a btree node in the cache and lock it, reading it | 872 | * bch_btree_node_get - find a btree node in the cache and lock it, reading it |
873 | * in from disk if necessary. | 873 | * in from disk if necessary. |
874 | * | 874 | * |
875 | * If IO is necessary, it uses the closure embedded in struct btree_op to wait; | 875 | * If IO is necessary, it uses the closure embedded in struct btree_op to wait; |
876 | * if that closure is in non blocking mode, will return -EAGAIN. | 876 | * if that closure is in non blocking mode, will return -EAGAIN. |
877 | * | 877 | * |
878 | * The btree node will have either a read or a write lock held, depending on | 878 | * The btree node will have either a read or a write lock held, depending on |
879 | * level and op->lock. | 879 | * level and op->lock. |
880 | */ | 880 | */ |
881 | struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, | 881 | struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, |
882 | int level, struct btree_op *op) | 882 | int level, struct btree_op *op) |
883 | { | 883 | { |
884 | int i = 0; | 884 | int i = 0; |
885 | bool write = level <= op->lock; | 885 | bool write = level <= op->lock; |
886 | struct btree *b; | 886 | struct btree *b; |
887 | 887 | ||
888 | BUG_ON(level < 0); | 888 | BUG_ON(level < 0); |
889 | retry: | 889 | retry: |
890 | b = mca_find(c, k); | 890 | b = mca_find(c, k); |
891 | 891 | ||
892 | if (!b) { | 892 | if (!b) { |
893 | if (current->bio_list) | 893 | if (current->bio_list) |
894 | return ERR_PTR(-EAGAIN); | 894 | return ERR_PTR(-EAGAIN); |
895 | 895 | ||
896 | mutex_lock(&c->bucket_lock); | 896 | mutex_lock(&c->bucket_lock); |
897 | b = mca_alloc(c, k, level, &op->cl); | 897 | b = mca_alloc(c, k, level, &op->cl); |
898 | mutex_unlock(&c->bucket_lock); | 898 | mutex_unlock(&c->bucket_lock); |
899 | 899 | ||
900 | if (!b) | 900 | if (!b) |
901 | goto retry; | 901 | goto retry; |
902 | if (IS_ERR(b)) | 902 | if (IS_ERR(b)) |
903 | return b; | 903 | return b; |
904 | 904 | ||
905 | bch_btree_node_read(b); | 905 | bch_btree_node_read(b); |
906 | 906 | ||
907 | if (!write) | 907 | if (!write) |
908 | downgrade_write(&b->lock); | 908 | downgrade_write(&b->lock); |
909 | } else { | 909 | } else { |
910 | rw_lock(write, b, level); | 910 | rw_lock(write, b, level); |
911 | if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) { | 911 | if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) { |
912 | rw_unlock(write, b); | 912 | rw_unlock(write, b); |
913 | goto retry; | 913 | goto retry; |
914 | } | 914 | } |
915 | BUG_ON(b->level != level); | 915 | BUG_ON(b->level != level); |
916 | } | 916 | } |
917 | 917 | ||
918 | b->accessed = 1; | 918 | b->accessed = 1; |
919 | 919 | ||
920 | for (; i <= b->nsets && b->sets[i].size; i++) { | 920 | for (; i <= b->nsets && b->sets[i].size; i++) { |
921 | prefetch(b->sets[i].tree); | 921 | prefetch(b->sets[i].tree); |
922 | prefetch(b->sets[i].data); | 922 | prefetch(b->sets[i].data); |
923 | } | 923 | } |
924 | 924 | ||
925 | for (; i <= b->nsets; i++) | 925 | for (; i <= b->nsets; i++) |
926 | prefetch(b->sets[i].data); | 926 | prefetch(b->sets[i].data); |
927 | 927 | ||
928 | if (btree_node_io_error(b)) { | 928 | if (btree_node_io_error(b)) { |
929 | rw_unlock(write, b); | 929 | rw_unlock(write, b); |
930 | return ERR_PTR(-EIO); | 930 | return ERR_PTR(-EIO); |
931 | } | 931 | } |
932 | 932 | ||
933 | BUG_ON(!b->written); | 933 | BUG_ON(!b->written); |
934 | 934 | ||
935 | return b; | 935 | return b; |
936 | } | 936 | } |
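
The lock mode is decided entirely by level <= op->lock: nodes at or below the level the operation asked for are write-locked, everything higher is read-locked. A tiny demonstration of the rule (op->lock = 1 is just an example value):

#include <stdio.h>
#include <stdbool.h>

/* Same rule as above: write-lock nodes at or below the requested level. */
static bool lock_for_write(int node_level, int op_lock)
{
	return node_level <= op_lock;
}

int main(void)
{
	int level, op_lock = 1;	/* example: write access wanted up to level 1 */

	for (level = 3; level >= 0; level--)
		printf("level %d: %s lock\n", level,
		       lock_for_write(level, op_lock) ? "write" : "read");
	return 0;
}
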
937 | 937 | ||
938 | static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) | 938 | static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) |
939 | { | 939 | { |
940 | struct btree *b; | 940 | struct btree *b; |
941 | 941 | ||
942 | mutex_lock(&c->bucket_lock); | 942 | mutex_lock(&c->bucket_lock); |
943 | b = mca_alloc(c, k, level, NULL); | 943 | b = mca_alloc(c, k, level, NULL); |
944 | mutex_unlock(&c->bucket_lock); | 944 | mutex_unlock(&c->bucket_lock); |
945 | 945 | ||
946 | if (!IS_ERR_OR_NULL(b)) { | 946 | if (!IS_ERR_OR_NULL(b)) { |
947 | bch_btree_node_read(b); | 947 | bch_btree_node_read(b); |
948 | rw_unlock(true, b); | 948 | rw_unlock(true, b); |
949 | } | 949 | } |
950 | } | 950 | } |
951 | 951 | ||
952 | /* Btree alloc */ | 952 | /* Btree alloc */ |
953 | 953 | ||
954 | static void btree_node_free(struct btree *b, struct btree_op *op) | 954 | static void btree_node_free(struct btree *b, struct btree_op *op) |
955 | { | 955 | { |
956 | unsigned i; | 956 | unsigned i; |
957 | 957 | ||
958 | trace_bcache_btree_node_free(b); | 958 | trace_bcache_btree_node_free(b); |
959 | 959 | ||
960 | /* | 960 | /* |
961 | * The BUG_ON() in btree_node_get() implies that we must have a write | 961 | * The BUG_ON() in btree_node_get() implies that we must have a write |
962 | * lock on the parent to free or even invalidate a node | 962 | * lock on the parent to free or even invalidate a node |
963 | */ | 963 | */ |
964 | BUG_ON(op->lock <= b->level); | 964 | BUG_ON(op->lock <= b->level); |
965 | BUG_ON(b == b->c->root); | 965 | BUG_ON(b == b->c->root); |
966 | 966 | ||
967 | if (btree_node_dirty(b)) | 967 | if (btree_node_dirty(b)) |
968 | btree_complete_write(b, btree_current_write(b)); | 968 | btree_complete_write(b, btree_current_write(b)); |
969 | clear_bit(BTREE_NODE_dirty, &b->flags); | 969 | clear_bit(BTREE_NODE_dirty, &b->flags); |
970 | 970 | ||
971 | cancel_delayed_work(&b->work); | 971 | cancel_delayed_work(&b->work); |
972 | 972 | ||
973 | mutex_lock(&b->c->bucket_lock); | 973 | mutex_lock(&b->c->bucket_lock); |
974 | 974 | ||
975 | for (i = 0; i < KEY_PTRS(&b->key); i++) { | 975 | for (i = 0; i < KEY_PTRS(&b->key); i++) { |
976 | BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin)); | 976 | BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin)); |
977 | 977 | ||
978 | bch_inc_gen(PTR_CACHE(b->c, &b->key, i), | 978 | bch_inc_gen(PTR_CACHE(b->c, &b->key, i), |
979 | PTR_BUCKET(b->c, &b->key, i)); | 979 | PTR_BUCKET(b->c, &b->key, i)); |
980 | } | 980 | } |
981 | 981 | ||
982 | bch_bucket_free(b->c, &b->key); | 982 | bch_bucket_free(b->c, &b->key); |
983 | mca_bucket_free(b); | 983 | mca_bucket_free(b); |
984 | mutex_unlock(&b->c->bucket_lock); | 984 | mutex_unlock(&b->c->bucket_lock); |
985 | } | 985 | } |
986 | 986 | ||
987 | struct btree *bch_btree_node_alloc(struct cache_set *c, int level, | 987 | struct btree *bch_btree_node_alloc(struct cache_set *c, int level, |
988 | struct closure *cl) | 988 | struct closure *cl) |
989 | { | 989 | { |
990 | BKEY_PADDED(key) k; | 990 | BKEY_PADDED(key) k; |
991 | struct btree *b = ERR_PTR(-EAGAIN); | 991 | struct btree *b = ERR_PTR(-EAGAIN); |
992 | 992 | ||
993 | mutex_lock(&c->bucket_lock); | 993 | mutex_lock(&c->bucket_lock); |
994 | retry: | 994 | retry: |
995 | if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl)) | 995 | if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl)) |
996 | goto err; | 996 | goto err; |
997 | 997 | ||
998 | SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); | 998 | SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); |
999 | 999 | ||
1000 | b = mca_alloc(c, &k.key, level, cl); | 1000 | b = mca_alloc(c, &k.key, level, cl); |
1001 | if (IS_ERR(b)) | 1001 | if (IS_ERR(b)) |
1002 | goto err_free; | 1002 | goto err_free; |
1003 | 1003 | ||
1004 | if (!b) { | 1004 | if (!b) { |
1005 | cache_bug(c, | 1005 | cache_bug(c, |
1006 | "Tried to allocate bucket that was in btree cache"); | 1006 | "Tried to allocate bucket that was in btree cache"); |
1007 | __bkey_put(c, &k.key); | 1007 | __bkey_put(c, &k.key); |
1008 | goto retry; | 1008 | goto retry; |
1009 | } | 1009 | } |
1010 | 1010 | ||
1011 | b->accessed = 1; | 1011 | b->accessed = 1; |
1012 | bch_bset_init_next(b); | 1012 | bch_bset_init_next(b); |
1013 | 1013 | ||
1014 | mutex_unlock(&c->bucket_lock); | 1014 | mutex_unlock(&c->bucket_lock); |
1015 | 1015 | ||
1016 | trace_bcache_btree_node_alloc(b); | 1016 | trace_bcache_btree_node_alloc(b); |
1017 | return b; | 1017 | return b; |
1018 | err_free: | 1018 | err_free: |
1019 | bch_bucket_free(c, &k.key); | 1019 | bch_bucket_free(c, &k.key); |
1020 | __bkey_put(c, &k.key); | 1020 | __bkey_put(c, &k.key); |
1021 | err: | 1021 | err: |
1022 | mutex_unlock(&c->bucket_lock); | 1022 | mutex_unlock(&c->bucket_lock); |
1023 | 1023 | ||
1024 | trace_bcache_btree_node_alloc_fail(b); | 1024 | trace_bcache_btree_node_alloc_fail(b); |
1025 | return b; | 1025 | return b; |
1026 | } | 1026 | } |
1027 | 1027 | ||
1028 | static struct btree *btree_node_alloc_replacement(struct btree *b, | 1028 | static struct btree *btree_node_alloc_replacement(struct btree *b, |
1029 | struct closure *cl) | 1029 | struct closure *cl) |
1030 | { | 1030 | { |
1031 | struct btree *n = bch_btree_node_alloc(b->c, b->level, cl); | 1031 | struct btree *n = bch_btree_node_alloc(b->c, b->level, cl); |
1032 | if (!IS_ERR_OR_NULL(n)) | 1032 | if (!IS_ERR_OR_NULL(n)) |
1033 | bch_btree_sort_into(b, n); | 1033 | bch_btree_sort_into(b, n); |
1034 | 1034 | ||
1035 | return n; | 1035 | return n; |
1036 | } | 1036 | } |
1037 | 1037 | ||
1038 | /* Garbage collection */ | 1038 | /* Garbage collection */ |
1039 | 1039 | ||
1040 | uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) | 1040 | uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) |
1041 | { | 1041 | { |
1042 | uint8_t stale = 0; | 1042 | uint8_t stale = 0; |
1043 | unsigned i; | 1043 | unsigned i; |
1044 | struct bucket *g; | 1044 | struct bucket *g; |
1045 | 1045 | ||
1046 | /* | 1046 | /* |
1047 | * ptr_invalid() can't return true for the keys that mark btree nodes as | 1047 | * ptr_invalid() can't return true for the keys that mark btree nodes as |
1048 | * freed, but since ptr_bad() returns true we'll never actually use them | 1048 | * freed, but since ptr_bad() returns true we'll never actually use them |
1049 | * for anything and thus we don't want to mark their pointers here | 1049 | * for anything and thus we don't want to mark their pointers here |
1050 | */ | 1050 | */ |
1051 | if (!bkey_cmp(k, &ZERO_KEY)) | 1051 | if (!bkey_cmp(k, &ZERO_KEY)) |
1052 | return stale; | 1052 | return stale; |
1053 | 1053 | ||
1054 | for (i = 0; i < KEY_PTRS(k); i++) { | 1054 | for (i = 0; i < KEY_PTRS(k); i++) { |
1055 | if (!ptr_available(c, k, i)) | 1055 | if (!ptr_available(c, k, i)) |
1056 | continue; | 1056 | continue; |
1057 | 1057 | ||
1058 | g = PTR_BUCKET(c, k, i); | 1058 | g = PTR_BUCKET(c, k, i); |
1059 | 1059 | ||
1060 | if (gen_after(g->gc_gen, PTR_GEN(k, i))) | 1060 | if (gen_after(g->gc_gen, PTR_GEN(k, i))) |
1061 | g->gc_gen = PTR_GEN(k, i); | 1061 | g->gc_gen = PTR_GEN(k, i); |
1062 | 1062 | ||
1063 | if (ptr_stale(c, k, i)) { | 1063 | if (ptr_stale(c, k, i)) { |
1064 | stale = max(stale, ptr_stale(c, k, i)); | 1064 | stale = max(stale, ptr_stale(c, k, i)); |
1065 | continue; | 1065 | continue; |
1066 | } | 1066 | } |
1067 | 1067 | ||
1068 | cache_bug_on(GC_MARK(g) && | 1068 | cache_bug_on(GC_MARK(g) && |
1069 | (GC_MARK(g) == GC_MARK_METADATA) != (level != 0), | 1069 | (GC_MARK(g) == GC_MARK_METADATA) != (level != 0), |
1070 | c, "inconsistent ptrs: mark = %llu, level = %i", | 1070 | c, "inconsistent ptrs: mark = %llu, level = %i", |
1071 | GC_MARK(g), level); | 1071 | GC_MARK(g), level); |
1072 | 1072 | ||
1073 | if (level) | 1073 | if (level) |
1074 | SET_GC_MARK(g, GC_MARK_METADATA); | 1074 | SET_GC_MARK(g, GC_MARK_METADATA); |
1075 | else if (KEY_DIRTY(k)) | 1075 | else if (KEY_DIRTY(k)) |
1076 | SET_GC_MARK(g, GC_MARK_DIRTY); | 1076 | SET_GC_MARK(g, GC_MARK_DIRTY); |
1077 | 1077 | ||
1078 | /* guard against overflow */ | 1078 | /* guard against overflow */ |
1079 | SET_GC_SECTORS_USED(g, min_t(unsigned, | 1079 | SET_GC_SECTORS_USED(g, min_t(unsigned, |
1080 | GC_SECTORS_USED(g) + KEY_SIZE(k), | 1080 | GC_SECTORS_USED(g) + KEY_SIZE(k), |
1081 | (1 << 14) - 1)); | 1081 | (1 << 14) - 1)); |
1082 | 1082 | ||
1083 | BUG_ON(!GC_SECTORS_USED(g)); | 1083 | BUG_ON(!GC_SECTORS_USED(g)); |
1084 | } | 1084 | } |
1085 | 1085 | ||
1086 | return stale; | 1086 | return stale; |
1087 | } | 1087 | } |
1088 | 1088 | ||
1089 | #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) | 1089 | #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) |
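
GC_SECTORS_USED is a 14-bit field, so the marking loop above clamps each bucket's count at (1 << 14) - 1 = 16383 sectors (about 8 MiB with 512-byte sectors) instead of letting it wrap. The same saturating add in isolation (the sample counts are invented):

#include <stdio.h>

#define DEMO_GC_SECTORS_MAX	((1u << 14) - 1)	/* 16383 sectors */

/* Mirror of the min_t() guard above: add, but never overflow the field. */
static unsigned add_sectors_saturating(unsigned used, unsigned key_sectors)
{
	unsigned sum = used + key_sectors;
	return sum < DEMO_GC_SECTORS_MAX ? sum : DEMO_GC_SECTORS_MAX;
}

int main(void)
{
	printf("%u\n", add_sectors_saturating(1000, 128));	/* 1128 */
	printf("%u\n", add_sectors_saturating(16300, 512));	/* clamped to 16383 */
	return 0;
}
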
1090 | 1090 | ||
1091 | static int btree_gc_mark_node(struct btree *b, unsigned *keys, | 1091 | static int btree_gc_mark_node(struct btree *b, unsigned *keys, |
1092 | struct gc_stat *gc) | 1092 | struct gc_stat *gc) |
1093 | { | 1093 | { |
1094 | uint8_t stale = 0; | 1094 | uint8_t stale = 0; |
1095 | unsigned last_dev = -1; | 1095 | unsigned last_dev = -1; |
1096 | struct bcache_device *d = NULL; | 1096 | struct bcache_device *d = NULL; |
1097 | struct bkey *k; | 1097 | struct bkey *k; |
1098 | struct btree_iter iter; | 1098 | struct btree_iter iter; |
1099 | struct bset_tree *t; | 1099 | struct bset_tree *t; |
1100 | 1100 | ||
1101 | gc->nodes++; | 1101 | gc->nodes++; |
1102 | 1102 | ||
1103 | for_each_key_filter(b, k, &iter, bch_ptr_invalid) { | 1103 | for_each_key_filter(b, k, &iter, bch_ptr_invalid) { |
1104 | if (last_dev != KEY_INODE(k)) { | 1104 | if (last_dev != KEY_INODE(k)) { |
1105 | last_dev = KEY_INODE(k); | 1105 | last_dev = KEY_INODE(k); |
1106 | 1106 | ||
1107 | d = KEY_INODE(k) < b->c->nr_uuids | 1107 | d = KEY_INODE(k) < b->c->nr_uuids |
1108 | ? b->c->devices[last_dev] | 1108 | ? b->c->devices[last_dev] |
1109 | : NULL; | 1109 | : NULL; |
1110 | } | 1110 | } |
1111 | 1111 | ||
1112 | stale = max(stale, btree_mark_key(b, k)); | 1112 | stale = max(stale, btree_mark_key(b, k)); |
1113 | 1113 | ||
1114 | if (bch_ptr_bad(b, k)) | 1114 | if (bch_ptr_bad(b, k)) |
1115 | continue; | 1115 | continue; |
1116 | 1116 | ||
1117 | *keys += bkey_u64s(k); | 1117 | *keys += bkey_u64s(k); |
1118 | 1118 | ||
1119 | gc->key_bytes += bkey_u64s(k); | 1119 | gc->key_bytes += bkey_u64s(k); |
1120 | gc->nkeys++; | 1120 | gc->nkeys++; |
1121 | 1121 | ||
1122 | gc->data += KEY_SIZE(k); | 1122 | gc->data += KEY_SIZE(k); |
1123 | if (KEY_DIRTY(k)) | 1123 | if (KEY_DIRTY(k)) |
1124 | gc->dirty += KEY_SIZE(k); | 1124 | gc->dirty += KEY_SIZE(k); |
1125 | } | 1125 | } |
1126 | 1126 | ||
1127 | for (t = b->sets; t <= &b->sets[b->nsets]; t++) | 1127 | for (t = b->sets; t <= &b->sets[b->nsets]; t++) |
1128 | btree_bug_on(t->size && | 1128 | btree_bug_on(t->size && |
1129 | bset_written(b, t) && | 1129 | bset_written(b, t) && |
1130 | bkey_cmp(&b->key, &t->end) < 0, | 1130 | bkey_cmp(&b->key, &t->end) < 0, |
1131 | b, "found short btree key in gc"); | 1131 | b, "found short btree key in gc"); |
1132 | 1132 | ||
1133 | return stale; | 1133 | return stale; |
1134 | } | 1134 | } |
1135 | 1135 | ||
1136 | static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, | 1136 | static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, |
1137 | struct btree_op *op) | 1137 | struct btree_op *op) |
1138 | { | 1138 | { |
1139 | /* | 1139 | /* |
1140 | * We block priorities from being written for the duration of garbage | 1140 | * We block priorities from being written for the duration of garbage |
1141 | * collection, so we can't sleep in btree_alloc() -> | 1141 | * collection, so we can't sleep in btree_alloc() -> |
1142 | * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it | 1142 | * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it |
1143 | * our closure. | 1143 | * our closure. |
1144 | */ | 1144 | */ |
1145 | struct btree *n = btree_node_alloc_replacement(b, NULL); | 1145 | struct btree *n = btree_node_alloc_replacement(b, NULL); |
1146 | 1146 | ||
1147 | if (!IS_ERR_OR_NULL(n)) { | 1147 | if (!IS_ERR_OR_NULL(n)) { |
1148 | swap(b, n); | 1148 | swap(b, n); |
1149 | __bkey_put(b->c, &b->key); | 1149 | __bkey_put(b->c, &b->key); |
1150 | 1150 | ||
1151 | memcpy(k->ptr, b->key.ptr, | 1151 | memcpy(k->ptr, b->key.ptr, |
1152 | sizeof(uint64_t) * KEY_PTRS(&b->key)); | 1152 | sizeof(uint64_t) * KEY_PTRS(&b->key)); |
1153 | 1153 | ||
1154 | btree_node_free(n, op); | 1154 | btree_node_free(n, op); |
1155 | up_write(&n->lock); | 1155 | up_write(&n->lock); |
1156 | } | 1156 | } |
1157 | 1157 | ||
1158 | return b; | 1158 | return b; |
1159 | } | 1159 | } |
1160 | 1160 | ||
1161 | /* | 1161 | /* |
1162 | * Leaving this at 2 until we've got incremental garbage collection done; it | 1162 | * Leaving this at 2 until we've got incremental garbage collection done; it |
1163 | * could be higher (and has been tested with 4) except that garbage collection | 1163 | * could be higher (and has been tested with 4) except that garbage collection |
1164 | * could take much longer, adversely affecting latency. | 1164 | * could take much longer, adversely affecting latency. |
1165 | */ | 1165 | */ |
1166 | #define GC_MERGE_NODES 2U | 1166 | #define GC_MERGE_NODES 2U |
1167 | 1167 | ||
1168 | struct gc_merge_info { | 1168 | struct gc_merge_info { |
1169 | struct btree *b; | 1169 | struct btree *b; |
1170 | struct bkey *k; | 1170 | struct bkey *k; |
1171 | unsigned keys; | 1171 | unsigned keys; |
1172 | }; | 1172 | }; |
1173 | 1173 | ||
1174 | static void btree_gc_coalesce(struct btree *b, struct btree_op *op, | 1174 | static void btree_gc_coalesce(struct btree *b, struct btree_op *op, |
1175 | struct gc_stat *gc, struct gc_merge_info *r) | 1175 | struct gc_stat *gc, struct gc_merge_info *r) |
1176 | { | 1176 | { |
1177 | unsigned nodes = 0, keys = 0, blocks; | 1177 | unsigned nodes = 0, keys = 0, blocks; |
1178 | int i; | 1178 | int i; |
1179 | 1179 | ||
1180 | while (nodes < GC_MERGE_NODES && r[nodes].b) | 1180 | while (nodes < GC_MERGE_NODES && r[nodes].b) |
1181 | keys += r[nodes++].keys; | 1181 | keys += r[nodes++].keys; |
1182 | 1182 | ||
1183 | blocks = btree_default_blocks(b->c) * 2 / 3; | 1183 | blocks = btree_default_blocks(b->c) * 2 / 3; |
1184 | 1184 | ||
1185 | if (nodes < 2 || | 1185 | if (nodes < 2 || |
1186 | __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) | 1186 | __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) |
1187 | return; | 1187 | return; |
1188 | 1188 | ||
1189 | for (i = nodes - 1; i >= 0; --i) { | 1189 | for (i = nodes - 1; i >= 0; --i) { |
1190 | if (r[i].b->written) | 1190 | if (r[i].b->written) |
1191 | r[i].b = btree_gc_alloc(r[i].b, r[i].k, op); | 1191 | r[i].b = btree_gc_alloc(r[i].b, r[i].k, op); |
1192 | 1192 | ||
1193 | if (r[i].b->written) | 1193 | if (r[i].b->written) |
1194 | return; | 1194 | return; |
1195 | } | 1195 | } |
1196 | 1196 | ||
1197 | for (i = nodes - 1; i > 0; --i) { | 1197 | for (i = nodes - 1; i > 0; --i) { |
1198 | struct bset *n1 = r[i].b->sets->data; | 1198 | struct bset *n1 = r[i].b->sets->data; |
1199 | struct bset *n2 = r[i - 1].b->sets->data; | 1199 | struct bset *n2 = r[i - 1].b->sets->data; |
1200 | struct bkey *k, *last = NULL; | 1200 | struct bkey *k, *last = NULL; |
1201 | 1201 | ||
1202 | keys = 0; | 1202 | keys = 0; |
1203 | 1203 | ||
1204 | if (i == 1) { | 1204 | if (i == 1) { |
1205 | /* | 1205 | /* |
1206 | * Last node we're not getting rid of - we're getting | 1206 | * Last node we're not getting rid of - we're getting |
1207 | * rid of the node at r[0]. Have to try and fit all of | 1207 | * rid of the node at r[0]. Have to try and fit all of |
1208 | * the remaining keys into this node; we can't ensure | 1208 | * the remaining keys into this node; we can't ensure |
1209 | * they will always fit due to rounding and variable | 1209 | * they will always fit due to rounding and variable |
1210 | * length keys (shouldn't be possible in practice, | 1210 | * length keys (shouldn't be possible in practice, |
1211 | * though) | 1211 | * though) |
1212 | */ | 1212 | */ |
1213 | if (__set_blocks(n1, n1->keys + r->keys, | 1213 | if (__set_blocks(n1, n1->keys + r->keys, |
1214 | b->c) > btree_blocks(r[i].b)) | 1214 | b->c) > btree_blocks(r[i].b)) |
1215 | return; | 1215 | return; |
1216 | 1216 | ||
1217 | keys = n2->keys; | 1217 | keys = n2->keys; |
1218 | last = &r->b->key; | 1218 | last = &r->b->key; |
1219 | } else | 1219 | } else |
1220 | for (k = n2->start; | 1220 | for (k = n2->start; |
1221 | k < end(n2); | 1221 | k < end(n2); |
1222 | k = bkey_next(k)) { | 1222 | k = bkey_next(k)) { |
1223 | if (__set_blocks(n1, n1->keys + keys + | 1223 | if (__set_blocks(n1, n1->keys + keys + |
1224 | bkey_u64s(k), b->c) > blocks) | 1224 | bkey_u64s(k), b->c) > blocks) |
1225 | break; | 1225 | break; |
1226 | 1226 | ||
1227 | last = k; | 1227 | last = k; |
1228 | keys += bkey_u64s(k); | 1228 | keys += bkey_u64s(k); |
1229 | } | 1229 | } |
1230 | 1230 | ||
1231 | BUG_ON(__set_blocks(n1, n1->keys + keys, | 1231 | BUG_ON(__set_blocks(n1, n1->keys + keys, |
1232 | b->c) > btree_blocks(r[i].b)); | 1232 | b->c) > btree_blocks(r[i].b)); |
1233 | 1233 | ||
1234 | if (last) { | 1234 | if (last) { |
1235 | bkey_copy_key(&r[i].b->key, last); | 1235 | bkey_copy_key(&r[i].b->key, last); |
1236 | bkey_copy_key(r[i].k, last); | 1236 | bkey_copy_key(r[i].k, last); |
1237 | } | 1237 | } |
1238 | 1238 | ||
1239 | memcpy(end(n1), | 1239 | memcpy(end(n1), |
1240 | n2->start, | 1240 | n2->start, |
1241 | (void *) node(n2, keys) - (void *) n2->start); | 1241 | (void *) node(n2, keys) - (void *) n2->start); |
1242 | 1242 | ||
1243 | n1->keys += keys; | 1243 | n1->keys += keys; |
1244 | 1244 | ||
1245 | memmove(n2->start, | 1245 | memmove(n2->start, |
1246 | node(n2, keys), | 1246 | node(n2, keys), |
1247 | (void *) end(n2) - (void *) node(n2, keys)); | 1247 | (void *) end(n2) - (void *) node(n2, keys)); |
1248 | 1248 | ||
1249 | n2->keys -= keys; | 1249 | n2->keys -= keys; |
1250 | 1250 | ||
1251 | r[i].keys = n1->keys; | 1251 | r[i].keys = n1->keys; |
1252 | r[i - 1].keys = n2->keys; | 1252 | r[i - 1].keys = n2->keys; |
1253 | } | 1253 | } |
1254 | 1254 | ||
1255 | btree_node_free(r->b, op); | 1255 | btree_node_free(r->b, op); |
1256 | up_write(&r->b->lock); | 1256 | up_write(&r->b->lock); |
1257 | 1257 | ||
1258 | trace_bcache_btree_gc_coalesce(nodes); | 1258 | trace_bcache_btree_gc_coalesce(nodes); |
1259 | 1259 | ||
1260 | gc->nodes--; | 1260 | gc->nodes--; |
1261 | nodes--; | 1261 | nodes--; |
1262 | 1262 | ||
1263 | memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes); | 1263 | memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes); |
1264 | memset(&r[nodes], 0, sizeof(struct gc_merge_info)); | 1264 | memset(&r[nodes], 0, sizeof(struct gc_merge_info)); |
1265 | } | 1265 | } |
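
btree_gc_coalesce() only proceeds when the candidates' keys fit into one fewer node at the 2/3 fill target, i.e. the total key blocks must not exceed target * (nodes - 1); otherwise merging would not actually free a node. A quick standalone check of that condition (the block counts are made up):

#include <stdio.h>
#include <stdbool.h>

/* Same shape as the check above: worth coalescing only if we save a node. */
static bool worth_coalescing(unsigned key_blocks, unsigned nodes,
			     unsigned default_blocks)
{
	unsigned target = default_blocks * 2 / 3;	/* 2/3 fill target */

	return nodes >= 2 && key_blocks <= target * (nodes - 1);
}

int main(void)
{
	/* e.g. nodes sized at 12 blocks -> per-node target of 8 blocks */
	printf("%d\n", worth_coalescing(7, 2, 12));	/* 1: two nodes fit in one */
	printf("%d\n", worth_coalescing(20, 3, 12));	/* 0: would not save a node */
	return 0;
}
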
1266 | 1266 | ||
1267 | static int btree_gc_recurse(struct btree *b, struct btree_op *op, | 1267 | static int btree_gc_recurse(struct btree *b, struct btree_op *op, |
1268 | struct closure *writes, struct gc_stat *gc) | 1268 | struct closure *writes, struct gc_stat *gc) |
1269 | { | 1269 | { |
1270 | void write(struct btree *r) | 1270 | void write(struct btree *r) |
1271 | { | 1271 | { |
1272 | if (!r->written) | 1272 | if (!r->written) |
1273 | bch_btree_node_write(r, &op->cl); | 1273 | bch_btree_node_write(r, &op->cl); |
1274 | else if (btree_node_dirty(r)) | 1274 | else if (btree_node_dirty(r)) |
1275 | bch_btree_node_write(r, writes); | 1275 | bch_btree_node_write(r, writes); |
1276 | 1276 | ||
1277 | up_write(&r->lock); | 1277 | up_write(&r->lock); |
1278 | } | 1278 | } |
1279 | 1279 | ||
1280 | int ret = 0, stale; | 1280 | int ret = 0, stale; |
1281 | unsigned i; | 1281 | unsigned i; |
1282 | struct gc_merge_info r[GC_MERGE_NODES]; | 1282 | struct gc_merge_info r[GC_MERGE_NODES]; |
1283 | 1283 | ||
1284 | memset(r, 0, sizeof(r)); | 1284 | memset(r, 0, sizeof(r)); |
1285 | 1285 | ||
1286 | while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { | 1286 | while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { |
1287 | r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op); | 1287 | r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op); |
1288 | 1288 | ||
1289 | if (IS_ERR(r->b)) { | 1289 | if (IS_ERR(r->b)) { |
1290 | ret = PTR_ERR(r->b); | 1290 | ret = PTR_ERR(r->b); |
1291 | break; | 1291 | break; |
1292 | } | 1292 | } |
1293 | 1293 | ||
1294 | r->keys = 0; | 1294 | r->keys = 0; |
1295 | stale = btree_gc_mark_node(r->b, &r->keys, gc); | 1295 | stale = btree_gc_mark_node(r->b, &r->keys, gc); |
1296 | 1296 | ||
1297 | if (!b->written && | 1297 | if (!b->written && |
1298 | (r->b->level || stale > 10 || | 1298 | (r->b->level || stale > 10 || |
1299 | b->c->gc_always_rewrite)) | 1299 | b->c->gc_always_rewrite)) |
1300 | r->b = btree_gc_alloc(r->b, r->k, op); | 1300 | r->b = btree_gc_alloc(r->b, r->k, op); |
1301 | 1301 | ||
1302 | if (r->b->level) | 1302 | if (r->b->level) |
1303 | ret = btree_gc_recurse(r->b, op, writes, gc); | 1303 | ret = btree_gc_recurse(r->b, op, writes, gc); |
1304 | 1304 | ||
1305 | if (ret) { | 1305 | if (ret) { |
1306 | write(r->b); | 1306 | write(r->b); |
1307 | break; | 1307 | break; |
1308 | } | 1308 | } |
1309 | 1309 | ||
1310 | bkey_copy_key(&b->c->gc_done, r->k); | 1310 | bkey_copy_key(&b->c->gc_done, r->k); |
1311 | 1311 | ||
1312 | if (!b->written) | 1312 | if (!b->written) |
1313 | btree_gc_coalesce(b, op, gc, r); | 1313 | btree_gc_coalesce(b, op, gc, r); |
1314 | 1314 | ||
1315 | if (r[GC_MERGE_NODES - 1].b) | 1315 | if (r[GC_MERGE_NODES - 1].b) |
1316 | write(r[GC_MERGE_NODES - 1].b); | 1316 | write(r[GC_MERGE_NODES - 1].b); |
1317 | 1317 | ||
1318 | memmove(&r[1], &r[0], | 1318 | memmove(&r[1], &r[0], |
1319 | sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1)); | 1319 | sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1)); |
1320 | 1320 | ||
1321 | /* When we've got incremental GC working, we'll want to do | 1321 | /* When we've got incremental GC working, we'll want to do |
1322 | * if (should_resched()) | 1322 | * if (should_resched()) |
1323 | * return -EAGAIN; | 1323 | * return -EAGAIN; |
1324 | */ | 1324 | */ |
1325 | cond_resched(); | 1325 | cond_resched(); |
1326 | #if 0 | 1326 | #if 0 |
1327 | if (need_resched()) { | 1327 | if (need_resched()) { |
1328 | ret = -EAGAIN; | 1328 | ret = -EAGAIN; |
1329 | break; | 1329 | break; |
1330 | } | 1330 | } |
1331 | #endif | 1331 | #endif |
1332 | } | 1332 | } |
1333 | 1333 | ||
1334 | for (i = 1; i < GC_MERGE_NODES && r[i].b; i++) | 1334 | for (i = 1; i < GC_MERGE_NODES && r[i].b; i++) |
1335 | write(r[i].b); | 1335 | write(r[i].b); |
1336 | 1336 | ||
1337 | /* Might have freed some children, must remove their keys */ | 1337 | /* Might have freed some children, must remove their keys */ |
1338 | if (!b->written) | 1338 | if (!b->written) |
1339 | bch_btree_sort(b); | 1339 | bch_btree_sort(b); |
1340 | 1340 | ||
1341 | return ret; | 1341 | return ret; |
1342 | } | 1342 | } |
1343 | 1343 | ||
1344 | static int bch_btree_gc_root(struct btree *b, struct btree_op *op, | 1344 | static int bch_btree_gc_root(struct btree *b, struct btree_op *op, |
1345 | struct closure *writes, struct gc_stat *gc) | 1345 | struct closure *writes, struct gc_stat *gc) |
1346 | { | 1346 | { |
1347 | struct btree *n = NULL; | 1347 | struct btree *n = NULL; |
1348 | unsigned keys = 0; | 1348 | unsigned keys = 0; |
1349 | int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); | 1349 | int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); |
1350 | 1350 | ||
1351 | if (b->level || stale > 10) | 1351 | if (b->level || stale > 10) |
1352 | n = btree_node_alloc_replacement(b, NULL); | 1352 | n = btree_node_alloc_replacement(b, NULL); |
1353 | 1353 | ||
1354 | if (!IS_ERR_OR_NULL(n)) | 1354 | if (!IS_ERR_OR_NULL(n)) |
1355 | swap(b, n); | 1355 | swap(b, n); |
1356 | 1356 | ||
1357 | if (b->level) | 1357 | if (b->level) |
1358 | ret = btree_gc_recurse(b, op, writes, gc); | 1358 | ret = btree_gc_recurse(b, op, writes, gc); |
1359 | 1359 | ||
1360 | if (!b->written || btree_node_dirty(b)) { | 1360 | if (!b->written || btree_node_dirty(b)) { |
1361 | bch_btree_node_write(b, n ? &op->cl : NULL); | 1361 | bch_btree_node_write(b, n ? &op->cl : NULL); |
1362 | } | 1362 | } |
1363 | 1363 | ||
1364 | if (!IS_ERR_OR_NULL(n)) { | 1364 | if (!IS_ERR_OR_NULL(n)) { |
1365 | closure_sync(&op->cl); | 1365 | closure_sync(&op->cl); |
1366 | bch_btree_set_root(b); | 1366 | bch_btree_set_root(b); |
1367 | btree_node_free(n, op); | 1367 | btree_node_free(n, op); |
1368 | rw_unlock(true, b); | 1368 | rw_unlock(true, b); |
1369 | } | 1369 | } |
1370 | 1370 | ||
1371 | return ret; | 1371 | return ret; |
1372 | } | 1372 | } |
1373 | 1373 | ||
1374 | static void btree_gc_start(struct cache_set *c) | 1374 | static void btree_gc_start(struct cache_set *c) |
1375 | { | 1375 | { |
1376 | struct cache *ca; | 1376 | struct cache *ca; |
1377 | struct bucket *b; | 1377 | struct bucket *b; |
1378 | unsigned i; | 1378 | unsigned i; |
1379 | 1379 | ||
1380 | if (!c->gc_mark_valid) | 1380 | if (!c->gc_mark_valid) |
1381 | return; | 1381 | return; |
1382 | 1382 | ||
1383 | mutex_lock(&c->bucket_lock); | 1383 | mutex_lock(&c->bucket_lock); |
1384 | 1384 | ||
1385 | c->gc_mark_valid = 0; | 1385 | c->gc_mark_valid = 0; |
1386 | c->gc_done = ZERO_KEY; | 1386 | c->gc_done = ZERO_KEY; |
1387 | 1387 | ||
1388 | for_each_cache(ca, c, i) | 1388 | for_each_cache(ca, c, i) |
1389 | for_each_bucket(b, ca) { | 1389 | for_each_bucket(b, ca) { |
1390 | b->gc_gen = b->gen; | 1390 | b->gc_gen = b->gen; |
1391 | if (!atomic_read(&b->pin)) | 1391 | if (!atomic_read(&b->pin)) |
1392 | SET_GC_MARK(b, GC_MARK_RECLAIMABLE); | 1392 | SET_GC_MARK(b, GC_MARK_RECLAIMABLE); |
1393 | } | 1393 | } |
1394 | 1394 | ||
1395 | mutex_unlock(&c->bucket_lock); | 1395 | mutex_unlock(&c->bucket_lock); |
1396 | } | 1396 | } |
1397 | 1397 | ||
1398 | size_t bch_btree_gc_finish(struct cache_set *c) | 1398 | size_t bch_btree_gc_finish(struct cache_set *c) |
1399 | { | 1399 | { |
1400 | size_t available = 0; | 1400 | size_t available = 0; |
1401 | struct bucket *b; | 1401 | struct bucket *b; |
1402 | struct cache *ca; | 1402 | struct cache *ca; |
1403 | unsigned i; | 1403 | unsigned i; |
1404 | 1404 | ||
1405 | mutex_lock(&c->bucket_lock); | 1405 | mutex_lock(&c->bucket_lock); |
1406 | 1406 | ||
1407 | set_gc_sectors(c); | 1407 | set_gc_sectors(c); |
1408 | c->gc_mark_valid = 1; | 1408 | c->gc_mark_valid = 1; |
1409 | c->need_gc = 0; | 1409 | c->need_gc = 0; |
1410 | 1410 | ||
1411 | if (c->root) | 1411 | if (c->root) |
1412 | for (i = 0; i < KEY_PTRS(&c->root->key); i++) | 1412 | for (i = 0; i < KEY_PTRS(&c->root->key); i++) |
1413 | SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i), | 1413 | SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i), |
1414 | GC_MARK_METADATA); | 1414 | GC_MARK_METADATA); |
1415 | 1415 | ||
1416 | for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) | 1416 | for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) |
1417 | SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), | 1417 | SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), |
1418 | GC_MARK_METADATA); | 1418 | GC_MARK_METADATA); |
1419 | 1419 | ||
1420 | for_each_cache(ca, c, i) { | 1420 | for_each_cache(ca, c, i) { |
1421 | uint64_t *i; | 1421 | uint64_t *i; |
1422 | 1422 | ||
1423 | ca->invalidate_needs_gc = 0; | 1423 | ca->invalidate_needs_gc = 0; |
1424 | 1424 | ||
1425 | for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++) | 1425 | for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++) |
1426 | SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); | 1426 | SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); |
1427 | 1427 | ||
1428 | for (i = ca->prio_buckets; | 1428 | for (i = ca->prio_buckets; |
1429 | i < ca->prio_buckets + prio_buckets(ca) * 2; i++) | 1429 | i < ca->prio_buckets + prio_buckets(ca) * 2; i++) |
1430 | SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); | 1430 | SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); |
1431 | 1431 | ||
1432 | for_each_bucket(b, ca) { | 1432 | for_each_bucket(b, ca) { |
1433 | b->last_gc = b->gc_gen; | 1433 | b->last_gc = b->gc_gen; |
1434 | c->need_gc = max(c->need_gc, bucket_gc_gen(b)); | 1434 | c->need_gc = max(c->need_gc, bucket_gc_gen(b)); |
1435 | 1435 | ||
1436 | if (!atomic_read(&b->pin) && | 1436 | if (!atomic_read(&b->pin) && |
1437 | GC_MARK(b) == GC_MARK_RECLAIMABLE) { | 1437 | GC_MARK(b) == GC_MARK_RECLAIMABLE) { |
1438 | available++; | 1438 | available++; |
1439 | if (!GC_SECTORS_USED(b)) | 1439 | if (!GC_SECTORS_USED(b)) |
1440 | bch_bucket_add_unused(ca, b); | 1440 | bch_bucket_add_unused(ca, b); |
1441 | } | 1441 | } |
1442 | } | 1442 | } |
1443 | } | 1443 | } |
1444 | 1444 | ||
1445 | mutex_unlock(&c->bucket_lock); | 1445 | mutex_unlock(&c->bucket_lock); |
1446 | return available; | 1446 | return available; |
1447 | } | 1447 | } |
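bch_btree_gc_finish() walks every bucket under bucket_lock, records the generation GC observed, and counts buckets that are unpinned and still marked reclaimable; buckets with no used sectors also go back on the unused list. A small self-contained sketch of that accounting, with a made-up bucket struct and no locking:

#include <stdio.h>

enum gc_mark { MARK_RECLAIMABLE, MARK_DIRTY, MARK_METADATA };

struct bucket {
    int pin;               /* stands in for the atomic pin count */
    enum gc_mark mark;
    unsigned sectors_used;
};

int main(void)
{
    struct bucket buckets[] = {
        { .pin = 0, .mark = MARK_RECLAIMABLE, .sectors_used = 0 },
        { .pin = 0, .mark = MARK_RECLAIMABLE, .sectors_used = 12 },
        { .pin = 1, .mark = MARK_RECLAIMABLE, .sectors_used = 0 },
        { .pin = 0, .mark = MARK_METADATA,    .sectors_used = 8 },
    };
    size_t available = 0, unused = 0;

    for (size_t i = 0; i < sizeof(buckets) / sizeof(buckets[0]); i++) {
        struct bucket *b = &buckets[i];

        if (!b->pin && b->mark == MARK_RECLAIMABLE) {
            available++;            /* can be invalidated when needed */
            if (!b->sectors_used)
                unused++;           /* already empty, reusable right away */
        }
    }

    printf("available=%zu unused=%zu\n", available, unused);  /* 2 and 1 */
    return 0;
}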
1448 | 1448 | ||
1449 | static void bch_btree_gc(struct closure *cl) | 1449 | static void bch_btree_gc(struct closure *cl) |
1450 | { | 1450 | { |
1451 | struct cache_set *c = container_of(cl, struct cache_set, gc.cl); | 1451 | struct cache_set *c = container_of(cl, struct cache_set, gc.cl); |
1452 | int ret; | 1452 | int ret; |
1453 | unsigned long available; | 1453 | unsigned long available; |
1454 | struct gc_stat stats; | 1454 | struct gc_stat stats; |
1455 | struct closure writes; | 1455 | struct closure writes; |
1456 | struct btree_op op; | 1456 | struct btree_op op; |
1457 | uint64_t start_time = local_clock(); | 1457 | uint64_t start_time = local_clock(); |
1458 | 1458 | ||
1459 | trace_bcache_gc_start(c); | 1459 | trace_bcache_gc_start(c); |
1460 | 1460 | ||
1461 | memset(&stats, 0, sizeof(struct gc_stat)); | 1461 | memset(&stats, 0, sizeof(struct gc_stat)); |
1462 | closure_init_stack(&writes); | 1462 | closure_init_stack(&writes); |
1463 | bch_btree_op_init_stack(&op); | 1463 | bch_btree_op_init_stack(&op); |
1464 | op.lock = SHRT_MAX; | 1464 | op.lock = SHRT_MAX; |
1465 | 1465 | ||
1466 | btree_gc_start(c); | 1466 | btree_gc_start(c); |
1467 | 1467 | ||
1468 | atomic_inc(&c->prio_blocked); | 1468 | atomic_inc(&c->prio_blocked); |
1469 | 1469 | ||
1470 | ret = btree_root(gc_root, c, &op, &writes, &stats); | 1470 | ret = btree_root(gc_root, c, &op, &writes, &stats); |
1471 | closure_sync(&op.cl); | 1471 | closure_sync(&op.cl); |
1472 | closure_sync(&writes); | 1472 | closure_sync(&writes); |
1473 | 1473 | ||
1474 | if (ret) { | 1474 | if (ret) { |
1475 | pr_warn("gc failed!"); | 1475 | pr_warn("gc failed!"); |
1476 | continue_at(cl, bch_btree_gc, bch_gc_wq); | 1476 | continue_at(cl, bch_btree_gc, bch_gc_wq); |
1477 | } | 1477 | } |
1478 | 1478 | ||
1479 | /* Possibly wait for new UUIDs or whatever to hit disk */ | 1479 | /* Possibly wait for new UUIDs or whatever to hit disk */ |
1480 | bch_journal_meta(c, &op.cl); | 1480 | bch_journal_meta(c, &op.cl); |
1481 | closure_sync(&op.cl); | 1481 | closure_sync(&op.cl); |
1482 | 1482 | ||
1483 | available = bch_btree_gc_finish(c); | 1483 | available = bch_btree_gc_finish(c); |
1484 | 1484 | ||
1485 | atomic_dec(&c->prio_blocked); | 1485 | atomic_dec(&c->prio_blocked); |
1486 | wake_up_allocators(c); | 1486 | wake_up_allocators(c); |
1487 | 1487 | ||
1488 | bch_time_stats_update(&c->btree_gc_time, start_time); | 1488 | bch_time_stats_update(&c->btree_gc_time, start_time); |
1489 | 1489 | ||
1490 | stats.key_bytes *= sizeof(uint64_t); | 1490 | stats.key_bytes *= sizeof(uint64_t); |
1491 | stats.dirty <<= 9; | 1491 | stats.dirty <<= 9; |
1492 | stats.data <<= 9; | 1492 | stats.data <<= 9; |
1493 | stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; | 1493 | stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; |
1494 | memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); | 1494 | memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); |
1495 | 1495 | ||
1496 | trace_bcache_gc_end(c); | 1496 | trace_bcache_gc_end(c); |
1497 | 1497 | ||
1498 | continue_at(cl, bch_moving_gc, bch_gc_wq); | 1498 | continue_at(cl, bch_moving_gc, bch_gc_wq); |
1499 | } | 1499 | } |
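After the GC pass, bch_btree_gc() converts the raw counters for reporting: key_bytes counts 64-bit words, so it is multiplied by sizeof(uint64_t); dirty and data are sector counts, so shifting left by 9 multiplies by 512 to get bytes; in_use is the percentage of buckets not counted as available. A worked example of those conversions with made-up numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t key_words     = 1500;      /* uint64_t words of bkeys seen */
    uint64_t dirty_sectors = 4096;      /* 512-byte sectors */
    uint64_t nbuckets = 1000, available = 640;

    uint64_t key_bytes   = key_words * sizeof(uint64_t);        /* 12000 bytes */
    uint64_t dirty_bytes = dirty_sectors << 9;                  /* 2 MiB */
    unsigned in_use = (nbuckets - available) * 100 / nbuckets;  /* 36% */

    printf("key_bytes=%llu dirty_bytes=%llu in_use=%u%%\n",
           (unsigned long long)key_bytes,
           (unsigned long long)dirty_bytes, in_use);
    return 0;
}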
1500 | 1500 | ||
1501 | void bch_queue_gc(struct cache_set *c) | 1501 | void bch_queue_gc(struct cache_set *c) |
1502 | { | 1502 | { |
1503 | closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl); | 1503 | closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl); |
1504 | } | 1504 | } |
1505 | 1505 | ||
1506 | /* Initial partial gc */ | 1506 | /* Initial partial gc */ |
1507 | 1507 | ||
1508 | static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, | 1508 | static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, |
1509 | unsigned long **seen) | 1509 | unsigned long **seen) |
1510 | { | 1510 | { |
1511 | int ret; | 1511 | int ret; |
1512 | unsigned i; | 1512 | unsigned i; |
1513 | struct bkey *k; | 1513 | struct bkey *k; |
1514 | struct bucket *g; | 1514 | struct bucket *g; |
1515 | struct btree_iter iter; | 1515 | struct btree_iter iter; |
1516 | 1516 | ||
1517 | for_each_key_filter(b, k, &iter, bch_ptr_invalid) { | 1517 | for_each_key_filter(b, k, &iter, bch_ptr_invalid) { |
1518 | for (i = 0; i < KEY_PTRS(k); i++) { | 1518 | for (i = 0; i < KEY_PTRS(k); i++) { |
1519 | if (!ptr_available(b->c, k, i)) | 1519 | if (!ptr_available(b->c, k, i)) |
1520 | continue; | 1520 | continue; |
1521 | 1521 | ||
1522 | g = PTR_BUCKET(b->c, k, i); | 1522 | g = PTR_BUCKET(b->c, k, i); |
1523 | 1523 | ||
1524 | if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i), | 1524 | if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i), |
1525 | seen[PTR_DEV(k, i)]) || | 1525 | seen[PTR_DEV(k, i)]) || |
1526 | !ptr_stale(b->c, k, i)) { | 1526 | !ptr_stale(b->c, k, i)) { |
1527 | g->gen = PTR_GEN(k, i); | 1527 | g->gen = PTR_GEN(k, i); |
1528 | 1528 | ||
1529 | if (b->level) | 1529 | if (b->level) |
1530 | g->prio = BTREE_PRIO; | 1530 | g->prio = BTREE_PRIO; |
1531 | else if (g->prio == BTREE_PRIO) | 1531 | else if (g->prio == BTREE_PRIO) |
1532 | g->prio = INITIAL_PRIO; | 1532 | g->prio = INITIAL_PRIO; |
1533 | } | 1533 | } |
1534 | } | 1534 | } |
1535 | 1535 | ||
1536 | btree_mark_key(b, k); | 1536 | btree_mark_key(b, k); |
1537 | } | 1537 | } |
1538 | 1538 | ||
1539 | if (b->level) { | 1539 | if (b->level) { |
1540 | k = bch_next_recurse_key(b, &ZERO_KEY); | 1540 | k = bch_next_recurse_key(b, &ZERO_KEY); |
1541 | 1541 | ||
1542 | while (k) { | 1542 | while (k) { |
1543 | struct bkey *p = bch_next_recurse_key(b, k); | 1543 | struct bkey *p = bch_next_recurse_key(b, k); |
1544 | if (p) | 1544 | if (p) |
1545 | btree_node_prefetch(b->c, p, b->level - 1); | 1545 | btree_node_prefetch(b->c, p, b->level - 1); |
1546 | 1546 | ||
1547 | ret = btree(check_recurse, k, b, op, seen); | 1547 | ret = btree(check_recurse, k, b, op, seen); |
1548 | if (ret) | 1548 | if (ret) |
1549 | return ret; | 1549 | return ret; |
1550 | 1550 | ||
1551 | k = p; | 1551 | k = p; |
1552 | } | 1552 | } |
1553 | } | 1553 | } |
1554 | 1554 | ||
1555 | return 0; | 1555 | return 0; |
1556 | } | 1556 | } |
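The traversal in bch_btree_check_recurse() looks one child ahead: it fetches the next recursion key p, prefetches that child node, then descends into the current key, so the read of the next node is in flight while the current subtree is being checked. A tiny sketch of that look-ahead shape on a plain linked list, using the GCC/Clang __builtin_prefetch hint - illustrative only, not the bcache prefetch path:

#include <stdio.h>

struct node {
    int value;
    struct node *next;
};

static void visit(struct node *n)
{
    printf("visiting %d\n", n->value);   /* stands in for the expensive work */
}

int main(void)
{
    /* build a short list 1 -> 2 -> 3 */
    struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };

    struct node *k = &a;
    while (k) {
        struct node *p = k->next;
        if (p)
            __builtin_prefetch(p);       /* start pulling the next node in */

        visit(k);                        /* work on the current node */
        k = p;
    }
    return 0;
}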
1557 | 1557 | ||
1558 | int bch_btree_check(struct cache_set *c, struct btree_op *op) | 1558 | int bch_btree_check(struct cache_set *c, struct btree_op *op) |
1559 | { | 1559 | { |
1560 | int ret = -ENOMEM; | 1560 | int ret = -ENOMEM; |
1561 | unsigned i; | 1561 | unsigned i; |
1562 | unsigned long *seen[MAX_CACHES_PER_SET]; | 1562 | unsigned long *seen[MAX_CACHES_PER_SET]; |
1563 | 1563 | ||
1564 | memset(seen, 0, sizeof(seen)); | 1564 | memset(seen, 0, sizeof(seen)); |
1565 | 1565 | ||
1566 | for (i = 0; c->cache[i]; i++) { | 1566 | for (i = 0; c->cache[i]; i++) { |
1567 | size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); | 1567 | size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); |
1568 | seen[i] = kmalloc(n, GFP_KERNEL); | 1568 | seen[i] = kmalloc(n, GFP_KERNEL); |
1569 | if (!seen[i]) | 1569 | if (!seen[i]) |
1570 | goto err; | 1570 | goto err; |
1571 | 1571 | ||
1572 | /* Disables the seen array until prio_read() uses it too */ | 1572 | /* Disables the seen array until prio_read() uses it too */ |
1573 | memset(seen[i], 0xFF, n); | 1573 | memset(seen[i], 0xFF, n); |
1574 | } | 1574 | } |
1575 | 1575 | ||
1576 | ret = btree_root(check_recurse, c, op, seen); | 1576 | ret = btree_root(check_recurse, c, op, seen); |
1577 | err: | 1577 | err: |
1578 | for (i = 0; i < MAX_CACHES_PER_SET; i++) | 1578 | for (i = 0; i < MAX_CACHES_PER_SET; i++) |
1579 | kfree(seen[i]); | 1579 | kfree(seen[i]); |
1580 | return ret; | 1580 | return ret; |
1581 | } | 1581 | } |
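bch_btree_check() allocates one bitmap per cache device, sized at one bit per bucket (DIV_ROUND_UP(nbuckets, 8) bytes), and the __test_and_set_bit() in the recursion records the first time each bucket is seen. A self-contained sketch of that first-seen test using a plain C bit array rather than the kernel bitops:

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

/* returns the old value of bit nr and sets it, like __test_and_set_bit() */
static bool test_and_set_bit_sim(size_t nr, unsigned char *bitmap)
{
    unsigned char mask = 1u << (nr % 8);
    bool old = bitmap[nr / 8] & mask;

    bitmap[nr / 8] |= mask;
    return old;
}

int main(void)
{
    size_t nbuckets = 20;
    unsigned char *seen = calloc((nbuckets + 7) / 8, 1);  /* one bit per bucket */
    if (!seen)
        return 1;

    size_t refs[] = { 3, 7, 3, 12, 7 };
    for (size_t i = 0; i < sizeof(refs) / sizeof(refs[0]); i++) {
        if (!test_and_set_bit_sim(refs[i], seen))
            printf("bucket %zu seen for the first time\n", refs[i]);
        else
            printf("bucket %zu already seen\n", refs[i]);
    }

    free(seen);
    return 0;
}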
1582 | 1582 | ||
1583 | /* Btree insertion */ | 1583 | /* Btree insertion */ |
1584 | 1584 | ||
1585 | static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) | 1585 | static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) |
1586 | { | 1586 | { |
1587 | struct bset *i = b->sets[b->nsets].data; | 1587 | struct bset *i = b->sets[b->nsets].data; |
1588 | 1588 | ||
1589 | memmove((uint64_t *) where + bkey_u64s(insert), | 1589 | memmove((uint64_t *) where + bkey_u64s(insert), |
1590 | where, | 1590 | where, |
1591 | (void *) end(i) - (void *) where); | 1591 | (void *) end(i) - (void *) where); |
1592 | 1592 | ||
1593 | i->keys += bkey_u64s(insert); | 1593 | i->keys += bkey_u64s(insert); |
1594 | bkey_copy(where, insert); | 1594 | bkey_copy(where, insert); |
1595 | bch_bset_fix_lookup_table(b, where); | 1595 | bch_bset_fix_lookup_table(b, where); |
1596 | } | 1596 | } |
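shift_keys() inserts a key into a packed bset by sliding everything from the insert position to the end of the set up by the key's size in 64-bit words, then copying the new key into the gap. The same move-then-copy pattern on a flat array of fixed-size records (simplified: real bkeys are variable length):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define CAP 8

int main(void)
{
    uint64_t keys[CAP] = { 10, 20, 40, 50 };
    size_t nr = 4;

    /* insert 30 before index 2, keeping the array sorted */
    size_t pos = 2;
    uint64_t insert = 30;

    /* open a gap: move keys[pos..nr-1] up by one slot */
    memmove(&keys[pos + 1], &keys[pos], (nr - pos) * sizeof(keys[0]));
    keys[pos] = insert;
    nr++;

    for (size_t i = 0; i < nr; i++)
        printf("%llu ", (unsigned long long)keys[i]);
    printf("\n");                        /* 10 20 30 40 50 */
    return 0;
}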
1597 | 1597 | ||
1598 | static bool fix_overlapping_extents(struct btree *b, | 1598 | static bool fix_overlapping_extents(struct btree *b, |
1599 | struct bkey *insert, | 1599 | struct bkey *insert, |
1600 | struct btree_iter *iter, | 1600 | struct btree_iter *iter, |
1601 | struct btree_op *op) | 1601 | struct btree_op *op) |
1602 | { | 1602 | { |
1603 | void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) | 1603 | void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) |
1604 | { | 1604 | { |
1605 | if (KEY_DIRTY(k)) | 1605 | if (KEY_DIRTY(k)) |
1606 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), | 1606 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), |
1607 | offset, -sectors); | 1607 | offset, -sectors); |
1608 | } | 1608 | } |
1609 | 1609 | ||
1610 | uint64_t old_offset; | 1610 | uint64_t old_offset; |
1611 | unsigned old_size, sectors_found = 0; | 1611 | unsigned old_size, sectors_found = 0; |
1612 | 1612 | ||
1613 | while (1) { | 1613 | while (1) { |
1614 | struct bkey *k = bch_btree_iter_next(iter); | 1614 | struct bkey *k = bch_btree_iter_next(iter); |
1615 | if (!k || | 1615 | if (!k || |
1616 | bkey_cmp(&START_KEY(k), insert) >= 0) | 1616 | bkey_cmp(&START_KEY(k), insert) >= 0) |
1617 | break; | 1617 | break; |
1618 | 1618 | ||
1619 | if (bkey_cmp(k, &START_KEY(insert)) <= 0) | 1619 | if (bkey_cmp(k, &START_KEY(insert)) <= 0) |
1620 | continue; | 1620 | continue; |
1621 | 1621 | ||
1622 | old_offset = KEY_START(k); | 1622 | old_offset = KEY_START(k); |
1623 | old_size = KEY_SIZE(k); | 1623 | old_size = KEY_SIZE(k); |
1624 | 1624 | ||
1625 | /* | 1625 | /* |
1626 | * We might overlap with 0 size extents; we can't skip these | 1626 | * We might overlap with 0 size extents; we can't skip these |
1627 | * because if they're in the set we're inserting to we have to | 1627 | * because if they're in the set we're inserting to we have to |
1628 | * adjust them so they don't overlap with the key we're | 1628 | * adjust them so they don't overlap with the key we're |
1629 | * inserting. But we don't want to check them for BTREE_REPLACE | 1629 | * inserting. But we don't want to check them for BTREE_REPLACE |
1630 | * operations. | 1630 | * operations. |
1631 | */ | 1631 | */ |
1632 | 1632 | ||
1633 | if (op->type == BTREE_REPLACE && | 1633 | if (op->type == BTREE_REPLACE && |
1634 | KEY_SIZE(k)) { | 1634 | KEY_SIZE(k)) { |
1635 | /* | 1635 | /* |
1636 | * k might have been split since we inserted/found the | 1636 | * k might have been split since we inserted/found the |
1637 | * key we're replacing | 1637 | * key we're replacing |
1638 | */ | 1638 | */ |
1639 | unsigned i; | 1639 | unsigned i; |
1640 | uint64_t offset = KEY_START(k) - | 1640 | uint64_t offset = KEY_START(k) - |
1641 | KEY_START(&op->replace); | 1641 | KEY_START(&op->replace); |
1642 | 1642 | ||
1643 | /* But it must be a subset of the replace key */ | 1643 | /* But it must be a subset of the replace key */ |
1644 | if (KEY_START(k) < KEY_START(&op->replace) || | 1644 | if (KEY_START(k) < KEY_START(&op->replace) || |
1645 | KEY_OFFSET(k) > KEY_OFFSET(&op->replace)) | 1645 | KEY_OFFSET(k) > KEY_OFFSET(&op->replace)) |
1646 | goto check_failed; | 1646 | goto check_failed; |
1647 | 1647 | ||
1648 | /* We didn't find a key that we were supposed to */ | 1648 | /* We didn't find a key that we were supposed to */ |
1649 | if (KEY_START(k) > KEY_START(insert) + sectors_found) | 1649 | if (KEY_START(k) > KEY_START(insert) + sectors_found) |
1650 | goto check_failed; | 1650 | goto check_failed; |
1651 | 1651 | ||
1652 | if (KEY_PTRS(&op->replace) != KEY_PTRS(k)) | 1652 | if (KEY_PTRS(&op->replace) != KEY_PTRS(k)) |
1653 | goto check_failed; | 1653 | goto check_failed; |
1654 | 1654 | ||
1655 | /* skip past gen */ | 1655 | /* skip past gen */ |
1656 | offset <<= 8; | 1656 | offset <<= 8; |
1657 | 1657 | ||
1658 | BUG_ON(!KEY_PTRS(&op->replace)); | 1658 | BUG_ON(!KEY_PTRS(&op->replace)); |
1659 | 1659 | ||
1660 | for (i = 0; i < KEY_PTRS(&op->replace); i++) | 1660 | for (i = 0; i < KEY_PTRS(&op->replace); i++) |
1661 | if (k->ptr[i] != op->replace.ptr[i] + offset) | 1661 | if (k->ptr[i] != op->replace.ptr[i] + offset) |
1662 | goto check_failed; | 1662 | goto check_failed; |
1663 | 1663 | ||
1664 | sectors_found = KEY_OFFSET(k) - KEY_START(insert); | 1664 | sectors_found = KEY_OFFSET(k) - KEY_START(insert); |
1665 | } | 1665 | } |
1666 | 1666 | ||
1667 | if (bkey_cmp(insert, k) < 0 && | 1667 | if (bkey_cmp(insert, k) < 0 && |
1668 | bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { | 1668 | bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { |
1669 | /* | 1669 | /* |
1670 | * We overlapped in the middle of an existing key: that | 1670 | * We overlapped in the middle of an existing key: that |
1671 | * means we have to split the old key. But we have to do | 1671 | * means we have to split the old key. But we have to do |
1672 | * slightly different things depending on whether the | 1672 | * slightly different things depending on whether the |
1673 | * old key has been written out yet. | 1673 | * old key has been written out yet. |
1674 | */ | 1674 | */ |
1675 | 1675 | ||
1676 | struct bkey *top; | 1676 | struct bkey *top; |
1677 | 1677 | ||
1678 | subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); | 1678 | subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); |
1679 | 1679 | ||
1680 | if (bkey_written(b, k)) { | 1680 | if (bkey_written(b, k)) { |
1681 | /* | 1681 | /* |
1682 | * We insert a new key to cover the top of the | 1682 | * We insert a new key to cover the top of the |
1683 | * old key, and the old key is modified in place | 1683 | * old key, and the old key is modified in place |
1684 | * to represent the bottom split. | 1684 | * to represent the bottom split. |
1685 | * | 1685 | * |
1686 | * It's completely arbitrary whether the new key | 1686 | * It's completely arbitrary whether the new key |
1687 | * is the top or the bottom, but it has to match | 1687 | * is the top or the bottom, but it has to match |
1688 | * up with what btree_sort_fixup() does - it | 1688 | * up with what btree_sort_fixup() does - it |
1689 | * doesn't check for this kind of overlap, it | 1689 | * doesn't check for this kind of overlap, it |
1690 | * depends on us inserting a new key for the top | 1690 | * depends on us inserting a new key for the top |
1691 | * here. | 1691 | * here. |
1692 | */ | 1692 | */ |
1693 | top = bch_bset_search(b, &b->sets[b->nsets], | 1693 | top = bch_bset_search(b, &b->sets[b->nsets], |
1694 | insert); | 1694 | insert); |
1695 | shift_keys(b, top, k); | 1695 | shift_keys(b, top, k); |
1696 | } else { | 1696 | } else { |
1697 | BKEY_PADDED(key) temp; | 1697 | BKEY_PADDED(key) temp; |
1698 | bkey_copy(&temp.key, k); | 1698 | bkey_copy(&temp.key, k); |
1699 | shift_keys(b, k, &temp.key); | 1699 | shift_keys(b, k, &temp.key); |
1700 | top = bkey_next(k); | 1700 | top = bkey_next(k); |
1701 | } | 1701 | } |
1702 | 1702 | ||
1703 | bch_cut_front(insert, top); | 1703 | bch_cut_front(insert, top); |
1704 | bch_cut_back(&START_KEY(insert), k); | 1704 | bch_cut_back(&START_KEY(insert), k); |
1705 | bch_bset_fix_invalidated_key(b, k); | 1705 | bch_bset_fix_invalidated_key(b, k); |
1706 | return false; | 1706 | return false; |
1707 | } | 1707 | } |
1708 | 1708 | ||
1709 | if (bkey_cmp(insert, k) < 0) { | 1709 | if (bkey_cmp(insert, k) < 0) { |
1710 | bch_cut_front(insert, k); | 1710 | bch_cut_front(insert, k); |
1711 | } else { | 1711 | } else { |
1712 | if (bkey_written(b, k) && | 1712 | if (bkey_written(b, k) && |
1713 | bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { | 1713 | bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { |
1714 | /* | 1714 | /* |
1715 | * Completely overwrote, so we don't have to | 1715 | * Completely overwrote, so we don't have to |
1716 | * invalidate the binary search tree | 1716 | * invalidate the binary search tree |
1717 | */ | 1717 | */ |
1718 | bch_cut_front(k, k); | 1718 | bch_cut_front(k, k); |
1719 | } else { | 1719 | } else { |
1720 | __bch_cut_back(&START_KEY(insert), k); | 1720 | __bch_cut_back(&START_KEY(insert), k); |
1721 | bch_bset_fix_invalidated_key(b, k); | 1721 | bch_bset_fix_invalidated_key(b, k); |
1722 | } | 1722 | } |
1723 | } | 1723 | } |
1724 | 1724 | ||
1725 | subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); | 1725 | subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); |
1726 | } | 1726 | } |
1727 | 1727 | ||
1728 | check_failed: | 1728 | check_failed: |
1729 | if (op->type == BTREE_REPLACE) { | 1729 | if (op->type == BTREE_REPLACE) { |
1730 | if (!sectors_found) { | 1730 | if (!sectors_found) { |
1731 | op->insert_collision = true; | 1731 | op->insert_collision = true; |
1732 | return true; | 1732 | return true; |
1733 | } else if (sectors_found < KEY_SIZE(insert)) { | 1733 | } else if (sectors_found < KEY_SIZE(insert)) { |
1734 | SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - | 1734 | SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - |
1735 | (KEY_SIZE(insert) - sectors_found)); | 1735 | (KEY_SIZE(insert) - sectors_found)); |
1736 | SET_KEY_SIZE(insert, sectors_found); | 1736 | SET_KEY_SIZE(insert, sectors_found); |
1737 | } | 1737 | } |
1738 | } | 1738 | } |
1739 | 1739 | ||
1740 | return false; | 1740 | return false; |
1741 | } | 1741 | } |
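fix_overlapping_extents() walks existing extents that overlap the one being inserted and trims them: the old extent's front is cut when it sticks out past the end of the insert, its back is cut when it sticks out before the start, and it is split into two pieces when the insert lands in the middle; whenever a dirty extent loses sectors, subtract_dirty() takes them off the per-device dirty count that this patch series tracks per stripe. A simplified interval version of those cases, with hypothetical types and none of the bkey encoding:

#include <stdio.h>
#include <stdint.h>

struct extent {
    uint64_t start, end;   /* [start, end) in 512-byte sectors */
    int dirty;
};

static long long dirty_sectors = 100;   /* pretend per-device running total */

static void subtract_dirty(const struct extent *e, uint64_t lost)
{
    if (e->dirty)
        dirty_sectors -= (long long)lost;
}

int main(void)
{
    struct extent old = { .start = 0, .end = 100, .dirty = 1 };
    struct extent ins = { .start = 30, .end = 60 };

    /* sectors of the old extent that the insert replaces */
    uint64_t lo = old.start > ins.start ? old.start : ins.start;
    uint64_t hi = old.end   < ins.end   ? old.end   : ins.end;
    if (hi <= lo)
        return 0;                        /* no overlap, nothing to trim */
    subtract_dirty(&old, hi - lo);

    if (ins.start > old.start && ins.end < old.end) {
        /* overlap in the middle: split into a bottom piece and a top piece */
        struct extent top = { ins.end, old.end, old.dirty };
        old.end = ins.start;
        printf("split into [%llu,%llu) and [%llu,%llu)\n",
               (unsigned long long)old.start, (unsigned long long)old.end,
               (unsigned long long)top.start, (unsigned long long)top.end);
    } else if (ins.end < old.end) {
        old.start = ins.end;             /* cut the front of the old extent */
    } else if (ins.start > old.start) {
        old.end = ins.start;             /* cut the back of the old extent */
    } else {
        old.end = old.start;             /* completely overwritten */
    }

    printf("dirty sectors now %lld\n", dirty_sectors);   /* 100 - 30 = 70 */
    return 0;
}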
1742 | 1742 | ||
1743 | static bool btree_insert_key(struct btree *b, struct btree_op *op, | 1743 | static bool btree_insert_key(struct btree *b, struct btree_op *op, |
1744 | struct bkey *k) | 1744 | struct bkey *k) |
1745 | { | 1745 | { |
1746 | struct bset *i = b->sets[b->nsets].data; | 1746 | struct bset *i = b->sets[b->nsets].data; |
1747 | struct bkey *m, *prev; | 1747 | struct bkey *m, *prev; |
1748 | unsigned status = BTREE_INSERT_STATUS_INSERT; | 1748 | unsigned status = BTREE_INSERT_STATUS_INSERT; |
1749 | 1749 | ||
1750 | BUG_ON(bkey_cmp(k, &b->key) > 0); | 1750 | BUG_ON(bkey_cmp(k, &b->key) > 0); |
1751 | BUG_ON(b->level && !KEY_PTRS(k)); | 1751 | BUG_ON(b->level && !KEY_PTRS(k)); |
1752 | BUG_ON(!b->level && !KEY_OFFSET(k)); | 1752 | BUG_ON(!b->level && !KEY_OFFSET(k)); |
1753 | 1753 | ||
1754 | if (!b->level) { | 1754 | if (!b->level) { |
1755 | struct btree_iter iter; | 1755 | struct btree_iter iter; |
1756 | struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0); | 1756 | struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0); |
1757 | 1757 | ||
1758 | /* | 1758 | /* |
1759 | * bset_search() returns the first key that is strictly greater | 1759 | * bset_search() returns the first key that is strictly greater |
1760 | * than the search key - but for back merging, we want to find | 1760 | * than the search key - but for back merging, we want to find |
1761 | * the first key that is greater than or equal to KEY_START(k) - | 1761 | * the first key that is greater than or equal to KEY_START(k) - |
1762 | * unless KEY_START(k) is 0. | 1762 | * unless KEY_START(k) is 0. |
1763 | */ | 1763 | */ |
1764 | if (KEY_OFFSET(&search)) | 1764 | if (KEY_OFFSET(&search)) |
1765 | SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1); | 1765 | SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1); |
1766 | 1766 | ||
1767 | prev = NULL; | 1767 | prev = NULL; |
1768 | m = bch_btree_iter_init(b, &iter, &search); | 1768 | m = bch_btree_iter_init(b, &iter, &search); |
1769 | 1769 | ||
1770 | if (fix_overlapping_extents(b, k, &iter, op)) | 1770 | if (fix_overlapping_extents(b, k, &iter, op)) |
1771 | return false; | 1771 | return false; |
1772 | 1772 | ||
1773 | while (m != end(i) && | 1773 | while (m != end(i) && |
1774 | bkey_cmp(k, &START_KEY(m)) > 0) | 1774 | bkey_cmp(k, &START_KEY(m)) > 0) |
1775 | prev = m, m = bkey_next(m); | 1775 | prev = m, m = bkey_next(m); |
1776 | 1776 | ||
1777 | if (key_merging_disabled(b->c)) | 1777 | if (key_merging_disabled(b->c)) |
1778 | goto insert; | 1778 | goto insert; |
1779 | 1779 | ||
1780 | /* prev is in the tree, if we merge we're done */ | 1780 | /* prev is in the tree, if we merge we're done */ |
1781 | status = BTREE_INSERT_STATUS_BACK_MERGE; | 1781 | status = BTREE_INSERT_STATUS_BACK_MERGE; |
1782 | if (prev && | 1782 | if (prev && |
1783 | bch_bkey_try_merge(b, prev, k)) | 1783 | bch_bkey_try_merge(b, prev, k)) |
1784 | goto merged; | 1784 | goto merged; |
1785 | 1785 | ||
1786 | status = BTREE_INSERT_STATUS_OVERWROTE; | 1786 | status = BTREE_INSERT_STATUS_OVERWROTE; |
1787 | if (m != end(i) && | 1787 | if (m != end(i) && |
1788 | KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) | 1788 | KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) |
1789 | goto copy; | 1789 | goto copy; |
1790 | 1790 | ||
1791 | status = BTREE_INSERT_STATUS_FRONT_MERGE; | 1791 | status = BTREE_INSERT_STATUS_FRONT_MERGE; |
1792 | if (m != end(i) && | 1792 | if (m != end(i) && |
1793 | bch_bkey_try_merge(b, k, m)) | 1793 | bch_bkey_try_merge(b, k, m)) |
1794 | goto copy; | 1794 | goto copy; |
1795 | } else | 1795 | } else |
1796 | m = bch_bset_search(b, &b->sets[b->nsets], k); | 1796 | m = bch_bset_search(b, &b->sets[b->nsets], k); |
1797 | 1797 | ||
1798 | insert: shift_keys(b, m, k); | 1798 | insert: shift_keys(b, m, k); |
1799 | copy: bkey_copy(m, k); | 1799 | copy: bkey_copy(m, k); |
1800 | merged: | 1800 | merged: |
1801 | if (KEY_DIRTY(k)) | 1801 | if (KEY_DIRTY(k)) |
1802 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), | 1802 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), |
1803 | KEY_START(k), KEY_SIZE(k)); | 1803 | KEY_START(k), KEY_SIZE(k)); |
1804 | 1804 | ||
1805 | bch_check_keys(b, "%u for %s", status, op_type(op)); | 1805 | bch_check_keys(b, "%u for %s", status, op_type(op)); |
1806 | 1806 | ||
1807 | if (b->level && !KEY_OFFSET(k)) | 1807 | if (b->level && !KEY_OFFSET(k)) |
1808 | btree_current_write(b)->prio_blocked++; | 1808 | btree_current_write(b)->prio_blocked++; |
1809 | 1809 | ||
1810 | trace_bcache_btree_insert_key(b, k, op->type, status); | 1810 | trace_bcache_btree_insert_key(b, k, op->type, status); |
1811 | 1811 | ||
1812 | return true; | 1812 | return true; |
1813 | } | 1813 | } |
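btree_insert_key() tries a back merge with the key just before the insert position and a front merge with the key just after it, so contiguous extents whose data is also contiguous in the cache collapse into a single key. A simplified merge check on plain extents - hypothetical fields only; the real bch_bkey_try_merge() also checks pointer counts, generations, and key flags:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct extent {
    uint64_t start, end;     /* logical range, in sectors */
    uint64_t cache_offset;   /* where the data lives on the cache device */
    bool dirty;
};

/* try to absorb r into l; returns true if l now covers both */
static bool try_merge(struct extent *l, const struct extent *r)
{
    if (l->end != r->start)
        return false;        /* logical ranges must be adjacent */
    if (l->cache_offset + (l->end - l->start) != r->cache_offset)
        return false;        /* cached data must be contiguous too */
    if (l->dirty != r->dirty)
        return false;        /* don't mix clean and dirty extents */

    l->end = r->end;
    return true;
}

int main(void)
{
    struct extent prev = { 0, 64, 1000, true };
    struct extent k    = { 64, 128, 1064, true };

    if (try_merge(&prev, &k))
        printf("merged into [%llu,%llu)\n",
               (unsigned long long)prev.start, (unsigned long long)prev.end);
    return 0;
}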
1814 | 1814 | ||
1815 | bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) | 1815 | bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) |
1816 | { | 1816 | { |
1817 | bool ret = false; | 1817 | bool ret = false; |
1818 | struct bkey *k; | 1818 | struct bkey *k; |
1819 | unsigned oldsize = bch_count_data(b); | 1819 | unsigned oldsize = bch_count_data(b); |
1820 | 1820 | ||
1821 | while ((k = bch_keylist_pop(&op->keys))) { | 1821 | while ((k = bch_keylist_pop(&op->keys))) { |
1822 | bkey_put(b->c, k, b->level); | 1822 | bkey_put(b->c, k, b->level); |
1823 | ret |= btree_insert_key(b, op, k); | 1823 | ret |= btree_insert_key(b, op, k); |
1824 | } | 1824 | } |
1825 | 1825 | ||
1826 | BUG_ON(bch_count_data(b) < oldsize); | 1826 | BUG_ON(bch_count_data(b) < oldsize); |
1827 | return ret; | 1827 | return ret; |
1828 | } | 1828 | } |
1829 | 1829 | ||
1830 | bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, | 1830 | bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, |
1831 | struct bio *bio) | 1831 | struct bio *bio) |
1832 | { | 1832 | { |
1833 | bool ret = false; | 1833 | bool ret = false; |
1834 | uint64_t btree_ptr = b->key.ptr[0]; | 1834 | uint64_t btree_ptr = b->key.ptr[0]; |
1835 | unsigned long seq = b->seq; | 1835 | unsigned long seq = b->seq; |
1836 | BKEY_PADDED(k) tmp; | 1836 | BKEY_PADDED(k) tmp; |
1837 | 1837 | ||
1838 | rw_unlock(false, b); | 1838 | rw_unlock(false, b); |
1839 | rw_lock(true, b, b->level); | 1839 | rw_lock(true, b, b->level); |
1840 | 1840 | ||
1841 | if (b->key.ptr[0] != btree_ptr || | 1841 | if (b->key.ptr[0] != btree_ptr || |
1842 | b->seq != seq + 1 || | 1842 | b->seq != seq + 1 || |
1843 | should_split(b)) | 1843 | should_split(b)) |
1844 | goto out; | 1844 | goto out; |
1845 | 1845 | ||
1846 | op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio)); | 1846 | op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio)); |
1847 | 1847 | ||
1848 | SET_KEY_PTRS(&op->replace, 1); | 1848 | SET_KEY_PTRS(&op->replace, 1); |
1849 | get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); | 1849 | get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); |
1850 | 1850 | ||
1851 | SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV); | 1851 | SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV); |
1852 | 1852 | ||
1853 | bkey_copy(&tmp.k, &op->replace); | 1853 | bkey_copy(&tmp.k, &op->replace); |
1854 | 1854 | ||
1855 | BUG_ON(op->type != BTREE_INSERT); | 1855 | BUG_ON(op->type != BTREE_INSERT); |
1856 | BUG_ON(!btree_insert_key(b, op, &tmp.k)); | 1856 | BUG_ON(!btree_insert_key(b, op, &tmp.k)); |
1857 | ret = true; | 1857 | ret = true; |
1858 | out: | 1858 | out: |
1859 | downgrade_write(&b->lock); | 1859 | downgrade_write(&b->lock); |
1860 | return ret; | 1860 | return ret; |
1861 | } | 1861 | } |
1862 | 1862 | ||
1863 | static int btree_split(struct btree *b, struct btree_op *op) | 1863 | static int btree_split(struct btree *b, struct btree_op *op) |
1864 | { | 1864 | { |
1865 | bool split, root = b == b->c->root; | 1865 | bool split, root = b == b->c->root; |
1866 | struct btree *n1, *n2 = NULL, *n3 = NULL; | 1866 | struct btree *n1, *n2 = NULL, *n3 = NULL; |
1867 | uint64_t start_time = local_clock(); | 1867 | uint64_t start_time = local_clock(); |
1868 | 1868 | ||
1869 | if (b->level) | 1869 | if (b->level) |
1870 | set_closure_blocking(&op->cl); | 1870 | set_closure_blocking(&op->cl); |
1871 | 1871 | ||
1872 | n1 = btree_node_alloc_replacement(b, &op->cl); | 1872 | n1 = btree_node_alloc_replacement(b, &op->cl); |
1873 | if (IS_ERR(n1)) | 1873 | if (IS_ERR(n1)) |
1874 | goto err; | 1874 | goto err; |
1875 | 1875 | ||
1876 | split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; | 1876 | split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; |
1877 | 1877 | ||
1878 | if (split) { | 1878 | if (split) { |
1879 | unsigned keys = 0; | 1879 | unsigned keys = 0; |
1880 | 1880 | ||
1881 | trace_bcache_btree_node_split(b, n1->sets[0].data->keys); | 1881 | trace_bcache_btree_node_split(b, n1->sets[0].data->keys); |
1882 | 1882 | ||
1883 | n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); | 1883 | n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); |
1884 | if (IS_ERR(n2)) | 1884 | if (IS_ERR(n2)) |
1885 | goto err_free1; | 1885 | goto err_free1; |
1886 | 1886 | ||
1887 | if (root) { | 1887 | if (root) { |
1888 | n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl); | 1888 | n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl); |
1889 | if (IS_ERR(n3)) | 1889 | if (IS_ERR(n3)) |
1890 | goto err_free2; | 1890 | goto err_free2; |
1891 | } | 1891 | } |
1892 | 1892 | ||
1893 | bch_btree_insert_keys(n1, op); | 1893 | bch_btree_insert_keys(n1, op); |
1894 | 1894 | ||
1895 | /* Has to be a linear search because we don't have an auxiliary | 1895 | /* Has to be a linear search because we don't have an auxiliary |
1896 | * search tree yet | 1896 | * search tree yet |
1897 | */ | 1897 | */ |
1898 | 1898 | ||
1899 | while (keys < (n1->sets[0].data->keys * 3) / 5) | 1899 | while (keys < (n1->sets[0].data->keys * 3) / 5) |
1900 | keys += bkey_u64s(node(n1->sets[0].data, keys)); | 1900 | keys += bkey_u64s(node(n1->sets[0].data, keys)); |
1901 | 1901 | ||
1902 | bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); | 1902 | bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); |
1903 | keys += bkey_u64s(node(n1->sets[0].data, keys)); | 1903 | keys += bkey_u64s(node(n1->sets[0].data, keys)); |
1904 | 1904 | ||
1905 | n2->sets[0].data->keys = n1->sets[0].data->keys - keys; | 1905 | n2->sets[0].data->keys = n1->sets[0].data->keys - keys; |
1906 | n1->sets[0].data->keys = keys; | 1906 | n1->sets[0].data->keys = keys; |
1907 | 1907 | ||
1908 | memcpy(n2->sets[0].data->start, | 1908 | memcpy(n2->sets[0].data->start, |
1909 | end(n1->sets[0].data), | 1909 | end(n1->sets[0].data), |
1910 | n2->sets[0].data->keys * sizeof(uint64_t)); | 1910 | n2->sets[0].data->keys * sizeof(uint64_t)); |
1911 | 1911 | ||
1912 | bkey_copy_key(&n2->key, &b->key); | 1912 | bkey_copy_key(&n2->key, &b->key); |
1913 | 1913 | ||
1914 | bch_keylist_add(&op->keys, &n2->key); | 1914 | bch_keylist_add(&op->keys, &n2->key); |
1915 | bch_btree_node_write(n2, &op->cl); | 1915 | bch_btree_node_write(n2, &op->cl); |
1916 | rw_unlock(true, n2); | 1916 | rw_unlock(true, n2); |
1917 | } else { | 1917 | } else { |
1918 | trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); | 1918 | trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); |
1919 | 1919 | ||
1920 | bch_btree_insert_keys(n1, op); | 1920 | bch_btree_insert_keys(n1, op); |
1921 | } | 1921 | } |
1922 | 1922 | ||
1923 | bch_keylist_add(&op->keys, &n1->key); | 1923 | bch_keylist_add(&op->keys, &n1->key); |
1924 | bch_btree_node_write(n1, &op->cl); | 1924 | bch_btree_node_write(n1, &op->cl); |
1925 | 1925 | ||
1926 | if (n3) { | 1926 | if (n3) { |
1927 | bkey_copy_key(&n3->key, &MAX_KEY); | 1927 | bkey_copy_key(&n3->key, &MAX_KEY); |
1928 | bch_btree_insert_keys(n3, op); | 1928 | bch_btree_insert_keys(n3, op); |
1929 | bch_btree_node_write(n3, &op->cl); | 1929 | bch_btree_node_write(n3, &op->cl); |
1930 | 1930 | ||
1931 | closure_sync(&op->cl); | 1931 | closure_sync(&op->cl); |
1932 | bch_btree_set_root(n3); | 1932 | bch_btree_set_root(n3); |
1933 | rw_unlock(true, n3); | 1933 | rw_unlock(true, n3); |
1934 | } else if (root) { | 1934 | } else if (root) { |
1935 | op->keys.top = op->keys.bottom; | 1935 | op->keys.top = op->keys.bottom; |
1936 | closure_sync(&op->cl); | 1936 | closure_sync(&op->cl); |
1937 | bch_btree_set_root(n1); | 1937 | bch_btree_set_root(n1); |
1938 | } else { | 1938 | } else { |
1939 | unsigned i; | 1939 | unsigned i; |
1940 | 1940 | ||
1941 | bkey_copy(op->keys.top, &b->key); | 1941 | bkey_copy(op->keys.top, &b->key); |
1942 | bkey_copy_key(op->keys.top, &ZERO_KEY); | 1942 | bkey_copy_key(op->keys.top, &ZERO_KEY); |
1943 | 1943 | ||
1944 | for (i = 0; i < KEY_PTRS(&b->key); i++) { | 1944 | for (i = 0; i < KEY_PTRS(&b->key); i++) { |
1945 | uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1; | 1945 | uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1; |
1946 | 1946 | ||
1947 | SET_PTR_GEN(op->keys.top, i, g); | 1947 | SET_PTR_GEN(op->keys.top, i, g); |
1948 | } | 1948 | } |
1949 | 1949 | ||
1950 | bch_keylist_push(&op->keys); | 1950 | bch_keylist_push(&op->keys); |
1951 | closure_sync(&op->cl); | 1951 | closure_sync(&op->cl); |
1952 | atomic_inc(&b->c->prio_blocked); | 1952 | atomic_inc(&b->c->prio_blocked); |
1953 | } | 1953 | } |
1954 | 1954 | ||
1955 | rw_unlock(true, n1); | 1955 | rw_unlock(true, n1); |
1956 | btree_node_free(b, op); | 1956 | btree_node_free(b, op); |
1957 | 1957 | ||
1958 | bch_time_stats_update(&b->c->btree_split_time, start_time); | 1958 | bch_time_stats_update(&b->c->btree_split_time, start_time); |
1959 | 1959 | ||
1960 | return 0; | 1960 | return 0; |
1961 | err_free2: | 1961 | err_free2: |
1962 | __bkey_put(n2->c, &n2->key); | 1962 | __bkey_put(n2->c, &n2->key); |
1963 | btree_node_free(n2, op); | 1963 | btree_node_free(n2, op); |
1964 | rw_unlock(true, n2); | 1964 | rw_unlock(true, n2); |
1965 | err_free1: | 1965 | err_free1: |
1966 | __bkey_put(n1->c, &n1->key); | 1966 | __bkey_put(n1->c, &n1->key); |
1967 | btree_node_free(n1, op); | 1967 | btree_node_free(n1, op); |
1968 | rw_unlock(true, n1); | 1968 | rw_unlock(true, n1); |
1969 | err: | 1969 | err: |
1970 | if (n3 == ERR_PTR(-EAGAIN) || | 1970 | if (n3 == ERR_PTR(-EAGAIN) || |
1971 | n2 == ERR_PTR(-EAGAIN) || | 1971 | n2 == ERR_PTR(-EAGAIN) || |
1972 | n1 == ERR_PTR(-EAGAIN)) | 1972 | n1 == ERR_PTR(-EAGAIN)) |
1973 | return -EAGAIN; | 1973 | return -EAGAIN; |
1974 | 1974 | ||
1975 | pr_warn("couldn't split"); | 1975 | pr_warn("couldn't split"); |
1976 | return -ENOMEM; | 1976 | return -ENOMEM; |
1977 | } | 1977 | } |
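When a node is too full, btree_split() fills a fresh node and then walks its keys linearly until roughly three fifths of the set has been counted; that point becomes the end of the first node and the remaining keys are memcpy'd into the second. A sketch of picking that split point over variable-length records, assuming a made-up layout where each record is a length word followed by its payload words:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
    /* flat set of records: { length-in-words, payload... } */
    uint64_t set[] = { 2, 11, 22,   1, 33,   3, 44, 55, 66,   2, 77, 88 };
    size_t total = sizeof(set) / sizeof(set[0]);

    /* walk whole records until we've passed ~3/5 of the words */
    size_t words = 0;
    while (words < total * 3 / 5)
        words += 1 + set[words];         /* header word + payload */

    uint64_t left[16], right[16];
    memcpy(left, set, words * sizeof(uint64_t));
    memcpy(right, set + words, (total - words) * sizeof(uint64_t));

    printf("left node gets %zu words, right node gets %zu words\n",
           words, total - words);        /* 9 and 3; records never split */
    return 0;
}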
1978 | 1978 | ||
1979 | static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, | 1979 | static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, |
1980 | struct keylist *stack_keys) | 1980 | struct keylist *stack_keys) |
1981 | { | 1981 | { |
1982 | if (b->level) { | 1982 | if (b->level) { |
1983 | int ret; | 1983 | int ret; |
1984 | struct bkey *insert = op->keys.bottom; | 1984 | struct bkey *insert = op->keys.bottom; |
1985 | struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert)); | 1985 | struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert)); |
1986 | 1986 | ||
1987 | if (!k) { | 1987 | if (!k) { |
1988 | btree_bug(b, "no key to recurse on at level %i/%i", | 1988 | btree_bug(b, "no key to recurse on at level %i/%i", |
1989 | b->level, b->c->root->level); | 1989 | b->level, b->c->root->level); |
1990 | 1990 | ||
1991 | op->keys.top = op->keys.bottom; | 1991 | op->keys.top = op->keys.bottom; |
1992 | return -EIO; | 1992 | return -EIO; |
1993 | } | 1993 | } |
1994 | 1994 | ||
1995 | if (bkey_cmp(insert, k) > 0) { | 1995 | if (bkey_cmp(insert, k) > 0) { |
1996 | unsigned i; | 1996 | unsigned i; |
1997 | 1997 | ||
1998 | if (op->type == BTREE_REPLACE) { | 1998 | if (op->type == BTREE_REPLACE) { |
1999 | __bkey_put(b->c, insert); | 1999 | __bkey_put(b->c, insert); |
2000 | op->keys.top = op->keys.bottom; | 2000 | op->keys.top = op->keys.bottom; |
2001 | op->insert_collision = true; | 2001 | op->insert_collision = true; |
2002 | return 0; | 2002 | return 0; |
2003 | } | 2003 | } |
2004 | 2004 | ||
2005 | for (i = 0; i < KEY_PTRS(insert); i++) | 2005 | for (i = 0; i < KEY_PTRS(insert); i++) |
2006 | atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin); | 2006 | atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin); |
2007 | 2007 | ||
2008 | bkey_copy(stack_keys->top, insert); | 2008 | bkey_copy(stack_keys->top, insert); |
2009 | 2009 | ||
2010 | bch_cut_back(k, insert); | 2010 | bch_cut_back(k, insert); |
2011 | bch_cut_front(k, stack_keys->top); | 2011 | bch_cut_front(k, stack_keys->top); |
2012 | 2012 | ||
2013 | bch_keylist_push(stack_keys); | 2013 | bch_keylist_push(stack_keys); |
2014 | } | 2014 | } |
2015 | 2015 | ||
2016 | ret = btree(insert_recurse, k, b, op, stack_keys); | 2016 | ret = btree(insert_recurse, k, b, op, stack_keys); |
2017 | if (ret) | 2017 | if (ret) |
2018 | return ret; | 2018 | return ret; |
2019 | } | 2019 | } |
2020 | 2020 | ||
2021 | if (!bch_keylist_empty(&op->keys)) { | 2021 | if (!bch_keylist_empty(&op->keys)) { |
2022 | if (should_split(b)) { | 2022 | if (should_split(b)) { |
2023 | if (op->lock <= b->c->root->level) { | 2023 | if (op->lock <= b->c->root->level) { |
2024 | BUG_ON(b->level); | 2024 | BUG_ON(b->level); |
2025 | op->lock = b->c->root->level + 1; | 2025 | op->lock = b->c->root->level + 1; |
2026 | return -EINTR; | 2026 | return -EINTR; |
2027 | } | 2027 | } |
2028 | return btree_split(b, op); | 2028 | return btree_split(b, op); |
2029 | } | 2029 | } |
2030 | 2030 | ||
2031 | BUG_ON(write_block(b) != b->sets[b->nsets].data); | 2031 | BUG_ON(write_block(b) != b->sets[b->nsets].data); |
2032 | 2032 | ||
2033 | if (bch_btree_insert_keys(b, op)) { | 2033 | if (bch_btree_insert_keys(b, op)) { |
2034 | if (!b->level) | 2034 | if (!b->level) |
2035 | bch_btree_leaf_dirty(b, op); | 2035 | bch_btree_leaf_dirty(b, op); |
2036 | else | 2036 | else |
2037 | bch_btree_node_write(b, &op->cl); | 2037 | bch_btree_node_write(b, &op->cl); |
2038 | } | 2038 | } |
2039 | } | 2039 | } |
2040 | 2040 | ||
2041 | return 0; | 2041 | return 0; |
2042 | } | 2042 | } |
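In bch_btree_insert_recurse(), an insert that extends past the boundary key of the child node chosen for it is split: the portion up to the boundary stays on op->keys for this descent, and the remainder is pushed onto stack_keys so a later pass of the loop in bch_btree_insert() handles it. A simplified cut of one range at a boundary, using plain intervals instead of bkeys:

#include <stdio.h>
#include <stdint.h>

struct range { uint64_t start, end; };

int main(void)
{
    struct range insert = { 40, 120 };
    uint64_t boundary = 100;        /* last offset covered by the chosen child */

    if (insert.end > boundary) {
        /* tail goes on the spill list for the next pass */
        struct range spill = { boundary, insert.end };
        insert.end = boundary;      /* head is inserted into this child now */

        printf("insert [%llu,%llu) now, spill [%llu,%llu) for later\n",
               (unsigned long long)insert.start, (unsigned long long)insert.end,
               (unsigned long long)spill.start, (unsigned long long)spill.end);
    }
    return 0;
}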
2043 | 2043 | ||
2044 | int bch_btree_insert(struct btree_op *op, struct cache_set *c) | 2044 | int bch_btree_insert(struct btree_op *op, struct cache_set *c) |
2045 | { | 2045 | { |
2046 | int ret = 0; | 2046 | int ret = 0; |
2047 | struct keylist stack_keys; | 2047 | struct keylist stack_keys; |
2048 | 2048 | ||
2049 | /* | 2049 | /* |
2050 | * Don't want to block with the btree locked unless we have to, | 2050 | * Don't want to block with the btree locked unless we have to, |
2051 | * otherwise we get deadlocks with try_harder and between split/gc | 2051 | * otherwise we get deadlocks with try_harder and between split/gc |
2052 | */ | 2052 | */ |
2053 | clear_closure_blocking(&op->cl); | 2053 | clear_closure_blocking(&op->cl); |
2054 | 2054 | ||
2055 | BUG_ON(bch_keylist_empty(&op->keys)); | 2055 | BUG_ON(bch_keylist_empty(&op->keys)); |
2056 | bch_keylist_copy(&stack_keys, &op->keys); | 2056 | bch_keylist_copy(&stack_keys, &op->keys); |
2057 | bch_keylist_init(&op->keys); | 2057 | bch_keylist_init(&op->keys); |
2058 | 2058 | ||
2059 | while (!bch_keylist_empty(&stack_keys) || | 2059 | while (!bch_keylist_empty(&stack_keys) || |
2060 | !bch_keylist_empty(&op->keys)) { | 2060 | !bch_keylist_empty(&op->keys)) { |
2061 | if (bch_keylist_empty(&op->keys)) { | 2061 | if (bch_keylist_empty(&op->keys)) { |
2062 | bch_keylist_add(&op->keys, | 2062 | bch_keylist_add(&op->keys, |
2063 | bch_keylist_pop(&stack_keys)); | 2063 | bch_keylist_pop(&stack_keys)); |
2064 | op->lock = 0; | 2064 | op->lock = 0; |
2065 | } | 2065 | } |
2066 | 2066 | ||
2067 | ret = btree_root(insert_recurse, c, op, &stack_keys); | 2067 | ret = btree_root(insert_recurse, c, op, &stack_keys); |
2068 | 2068 | ||
2069 | if (ret == -EAGAIN) { | 2069 | if (ret == -EAGAIN) { |
2070 | ret = 0; | 2070 | ret = 0; |
2071 | closure_sync(&op->cl); | 2071 | closure_sync(&op->cl); |
2072 | } else if (ret) { | 2072 | } else if (ret) { |
2073 | struct bkey *k; | 2073 | struct bkey *k; |
2074 | 2074 | ||
2075 | pr_err("error %i trying to insert key for %s", | 2075 | pr_err("error %i trying to insert key for %s", |
2076 | ret, op_type(op)); | 2076 | ret, op_type(op)); |
2077 | 2077 | ||
2078 | while ((k = bch_keylist_pop(&stack_keys) ?: | 2078 | while ((k = bch_keylist_pop(&stack_keys) ?: |
2079 | bch_keylist_pop(&op->keys))) | 2079 | bch_keylist_pop(&op->keys))) |
2080 | bkey_put(c, k, 0); | 2080 | bkey_put(c, k, 0); |
2081 | } | 2081 | } |
2082 | } | 2082 | } |
2083 | 2083 | ||
2084 | bch_keylist_free(&stack_keys); | 2084 | bch_keylist_free(&stack_keys); |
2085 | 2085 | ||
2086 | if (op->journal) | 2086 | if (op->journal) |
2087 | atomic_dec_bug(op->journal); | 2087 | atomic_dec_bug(op->journal); |
2088 | op->journal = NULL; | 2088 | op->journal = NULL; |
2089 | return ret; | 2089 | return ret; |
2090 | } | 2090 | } |
2091 | 2091 | ||
2092 | void bch_btree_set_root(struct btree *b) | 2092 | void bch_btree_set_root(struct btree *b) |
2093 | { | 2093 | { |
2094 | unsigned i; | 2094 | unsigned i; |
2095 | 2095 | ||
2096 | trace_bcache_btree_set_root(b); | 2096 | trace_bcache_btree_set_root(b); |
2097 | 2097 | ||
2098 | BUG_ON(!b->written); | 2098 | BUG_ON(!b->written); |
2099 | 2099 | ||
2100 | for (i = 0; i < KEY_PTRS(&b->key); i++) | 2100 | for (i = 0; i < KEY_PTRS(&b->key); i++) |
2101 | BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO); | 2101 | BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO); |
2102 | 2102 | ||
2103 | mutex_lock(&b->c->bucket_lock); | 2103 | mutex_lock(&b->c->bucket_lock); |
2104 | list_del_init(&b->list); | 2104 | list_del_init(&b->list); |
2105 | mutex_unlock(&b->c->bucket_lock); | 2105 | mutex_unlock(&b->c->bucket_lock); |
2106 | 2106 | ||
2107 | b->c->root = b; | 2107 | b->c->root = b; |
2108 | __bkey_put(b->c, &b->key); | 2108 | __bkey_put(b->c, &b->key); |
2109 | 2109 | ||
2110 | bch_journal_meta(b->c, NULL); | 2110 | bch_journal_meta(b->c, NULL); |
2111 | } | 2111 | } |
2112 | 2112 | ||
2113 | /* Cache lookup */ | 2113 | /* Cache lookup */ |
2114 | 2114 | ||
2115 | static int submit_partial_cache_miss(struct btree *b, struct btree_op *op, | 2115 | static int submit_partial_cache_miss(struct btree *b, struct btree_op *op, |
2116 | struct bkey *k) | 2116 | struct bkey *k) |
2117 | { | 2117 | { |
2118 | struct search *s = container_of(op, struct search, op); | 2118 | struct search *s = container_of(op, struct search, op); |
2119 | struct bio *bio = &s->bio.bio; | 2119 | struct bio *bio = &s->bio.bio; |
2120 | int ret = 0; | 2120 | int ret = 0; |
2121 | 2121 | ||
2122 | while (!ret && | 2122 | while (!ret && |
2123 | !op->lookup_done) { | 2123 | !op->lookup_done) { |
2124 | unsigned sectors = INT_MAX; | 2124 | unsigned sectors = INT_MAX; |
2125 | 2125 | ||
2126 | if (KEY_INODE(k) == op->inode) { | 2126 | if (KEY_INODE(k) == op->inode) { |
2127 | if (KEY_START(k) <= bio->bi_sector) | 2127 | if (KEY_START(k) <= bio->bi_sector) |
2128 | break; | 2128 | break; |
2129 | 2129 | ||
2130 | sectors = min_t(uint64_t, sectors, | 2130 | sectors = min_t(uint64_t, sectors, |
2131 | KEY_START(k) - bio->bi_sector); | 2131 | KEY_START(k) - bio->bi_sector); |
2132 | } | 2132 | } |
2133 | 2133 | ||
2134 | ret = s->d->cache_miss(b, s, bio, sectors); | 2134 | ret = s->d->cache_miss(b, s, bio, sectors); |
2135 | } | 2135 | } |
2136 | 2136 | ||
2137 | return ret; | 2137 | return ret; |
2138 | } | 2138 | } |
2139 | 2139 | ||
2140 | /* | 2140 | /* |
2141 | * Read from a single key, handling the initial cache miss if the key starts in | 2141 | * Read from a single key, handling the initial cache miss if the key starts in |
2142 | * the middle of the bio | 2142 | * the middle of the bio |
2143 | */ | 2143 | */ |
2144 | static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, | 2144 | static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, |
2145 | struct bkey *k) | 2145 | struct bkey *k) |
2146 | { | 2146 | { |
2147 | struct search *s = container_of(op, struct search, op); | 2147 | struct search *s = container_of(op, struct search, op); |
2148 | struct bio *bio = &s->bio.bio; | 2148 | struct bio *bio = &s->bio.bio; |
2149 | unsigned ptr; | 2149 | unsigned ptr; |
2150 | struct bio *n; | 2150 | struct bio *n; |
2151 | 2151 | ||
2152 | int ret = submit_partial_cache_miss(b, op, k); | 2152 | int ret = submit_partial_cache_miss(b, op, k); |
2153 | if (ret || op->lookup_done) | 2153 | if (ret || op->lookup_done) |
2154 | return ret; | 2154 | return ret; |
2155 | 2155 | ||
2156 | /* XXX: figure out best pointer - for multiple cache devices */ | 2156 | /* XXX: figure out best pointer - for multiple cache devices */ |
2157 | ptr = 0; | 2157 | ptr = 0; |
2158 | 2158 | ||
2159 | PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; | 2159 | PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; |
2160 | 2160 | ||
2161 | while (!op->lookup_done && | 2161 | while (!op->lookup_done && |
2162 | KEY_INODE(k) == op->inode && | 2162 | KEY_INODE(k) == op->inode && |
2163 | bio->bi_sector < KEY_OFFSET(k)) { | 2163 | bio->bi_sector < KEY_OFFSET(k)) { |
2164 | struct bkey *bio_key; | 2164 | struct bkey *bio_key; |
2165 | sector_t sector = PTR_OFFSET(k, ptr) + | 2165 | sector_t sector = PTR_OFFSET(k, ptr) + |
2166 | (bio->bi_sector - KEY_START(k)); | 2166 | (bio->bi_sector - KEY_START(k)); |
2167 | unsigned sectors = min_t(uint64_t, INT_MAX, | 2167 | unsigned sectors = min_t(uint64_t, INT_MAX, |
2168 | KEY_OFFSET(k) - bio->bi_sector); | 2168 | KEY_OFFSET(k) - bio->bi_sector); |
2169 | 2169 | ||
2170 | n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); | 2170 | n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); |
2171 | if (!n) | 2171 | if (!n) |
2172 | return -EAGAIN; | 2172 | return -EAGAIN; |
2173 | 2173 | ||
2174 | if (n == bio) | 2174 | if (n == bio) |
2175 | op->lookup_done = true; | 2175 | op->lookup_done = true; |
2176 | 2176 | ||
2177 | bio_key = &container_of(n, struct bbio, bio)->key; | 2177 | bio_key = &container_of(n, struct bbio, bio)->key; |
2178 | 2178 | ||
2179 | /* | 2179 | /* |
2180 | * The bucket we're reading from might be reused while our bio | 2180 | * The bucket we're reading from might be reused while our bio |
2181 | * is in flight, and we could then end up reading the wrong | 2181 | * is in flight, and we could then end up reading the wrong |
2182 | * data. | 2182 | * data. |
2183 | * | 2183 | * |
2184 | * We guard against this by checking (in cache_read_endio()) if | 2184 | * We guard against this by checking (in cache_read_endio()) if |
2185 | * the pointer is stale again; if so, we treat it as an error | 2185 | * the pointer is stale again; if so, we treat it as an error |
2186 | * and reread from the backing device (but we don't pass that | 2186 | * and reread from the backing device (but we don't pass that |
2187 | * error up anywhere). | 2187 | * error up anywhere). |
2188 | */ | 2188 | */ |
2189 | 2189 | ||
2190 | bch_bkey_copy_single_ptr(bio_key, k, ptr); | 2190 | bch_bkey_copy_single_ptr(bio_key, k, ptr); |
2191 | SET_PTR_OFFSET(bio_key, 0, sector); | 2191 | SET_PTR_OFFSET(bio_key, 0, sector); |
2192 | 2192 | ||
2193 | n->bi_end_io = bch_cache_read_endio; | 2193 | n->bi_end_io = bch_cache_read_endio; |
2194 | n->bi_private = &s->cl; | 2194 | n->bi_private = &s->cl; |
2195 | 2195 | ||
2196 | __bch_submit_bbio(n, b->c); | 2196 | __bch_submit_bbio(n, b->c); |
2197 | } | 2197 | } |
2198 | 2198 | ||
2199 | return 0; | 2199 | return 0; |
2200 | } | 2200 | } |
2201 | 2201 | ||
2202 | int bch_btree_search_recurse(struct btree *b, struct btree_op *op) | 2202 | int bch_btree_search_recurse(struct btree *b, struct btree_op *op) |
2203 | { | 2203 | { |
2204 | struct search *s = container_of(op, struct search, op); | 2204 | struct search *s = container_of(op, struct search, op); |
2205 | struct bio *bio = &s->bio.bio; | 2205 | struct bio *bio = &s->bio.bio; |
2206 | 2206 | ||
2207 | int ret = 0; | 2207 | int ret = 0; |
2208 | struct bkey *k; | 2208 | struct bkey *k; |
2209 | struct btree_iter iter; | 2209 | struct btree_iter iter; |
2210 | bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); | 2210 | bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); |
2211 | 2211 | ||
2212 | do { | 2212 | do { |
2213 | k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); | 2213 | k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); |
2214 | if (!k) { | 2214 | if (!k) { |
2215 | /* | 2215 | /* |
2216 | * b->key would be exactly what we want, except that | 2216 | * b->key would be exactly what we want, except that |
2217 | * pointers to btree nodes have nonzero size - we | 2217 | * pointers to btree nodes have nonzero size - we |
2218 | * wouldn't go far enough | 2218 | * wouldn't go far enough |
2219 | */ | 2219 | */ |
2220 | 2220 | ||
2221 | ret = submit_partial_cache_miss(b, op, | 2221 | ret = submit_partial_cache_miss(b, op, |
2222 | &KEY(KEY_INODE(&b->key), | 2222 | &KEY(KEY_INODE(&b->key), |
2223 | KEY_OFFSET(&b->key), 0)); | 2223 | KEY_OFFSET(&b->key), 0)); |
2224 | break; | 2224 | break; |
2225 | } | 2225 | } |
2226 | 2226 | ||
2227 | ret = b->level | 2227 | ret = b->level |
2228 | ? btree(search_recurse, k, b, op) | 2228 | ? btree(search_recurse, k, b, op) |
2229 | : submit_partial_cache_hit(b, op, k); | 2229 | : submit_partial_cache_hit(b, op, k); |
2230 | } while (!ret && | 2230 | } while (!ret && |
2231 | !op->lookup_done); | 2231 | !op->lookup_done); |
2232 | 2232 | ||
2233 | return ret; | 2233 | return ret; |
2234 | } | 2234 | } |
2235 | 2235 | ||
2236 | /* Keybuf code */ | 2236 | /* Keybuf code */ |
2237 | 2237 | ||
2238 | static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) | 2238 | static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) |
2239 | { | 2239 | { |
2240 | /* Overlapping keys compare equal */ | 2240 | /* Overlapping keys compare equal */ |
2241 | if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0) | 2241 | if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0) |
2242 | return -1; | 2242 | return -1; |
2243 | if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0) | 2243 | if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0) |
2244 | return 1; | 2244 | return 1; |
2245 | return 0; | 2245 | return 0; |
2246 | } | 2246 | } |
2247 | 2247 | ||
2248 | static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, | 2248 | static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, |
2249 | struct keybuf_key *r) | 2249 | struct keybuf_key *r) |
2250 | { | 2250 | { |
2251 | return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); | 2251 | return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); |
2252 | } | 2252 | } |
2253 | 2253 | ||
2254 | static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, | 2254 | static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, |
2255 | struct keybuf *buf, struct bkey *end) | 2255 | struct keybuf *buf, struct bkey *end, |
2256 | keybuf_pred_fn *pred) | ||
2256 | { | 2257 | { |
2257 | struct btree_iter iter; | 2258 | struct btree_iter iter; |
2258 | bch_btree_iter_init(b, &iter, &buf->last_scanned); | 2259 | bch_btree_iter_init(b, &iter, &buf->last_scanned); |
2259 | 2260 | ||
2260 | while (!array_freelist_empty(&buf->freelist)) { | 2261 | while (!array_freelist_empty(&buf->freelist)) { |
2261 | struct bkey *k = bch_btree_iter_next_filter(&iter, b, | 2262 | struct bkey *k = bch_btree_iter_next_filter(&iter, b, |
2262 | bch_ptr_bad); | 2263 | bch_ptr_bad); |
2263 | 2264 | ||
2264 | if (!b->level) { | 2265 | if (!b->level) { |
2265 | if (!k) { | 2266 | if (!k) { |
2266 | buf->last_scanned = b->key; | 2267 | buf->last_scanned = b->key; |
2267 | break; | 2268 | break; |
2268 | } | 2269 | } |
2269 | 2270 | ||
2270 | buf->last_scanned = *k; | 2271 | buf->last_scanned = *k; |
2271 | if (bkey_cmp(&buf->last_scanned, end) >= 0) | 2272 | if (bkey_cmp(&buf->last_scanned, end) >= 0) |
2272 | break; | 2273 | break; |
2273 | 2274 | ||
2274 | if (buf->key_predicate(buf, k)) { | 2275 | if (pred(buf, k)) { |
2275 | struct keybuf_key *w; | 2276 | struct keybuf_key *w; |
2276 | 2277 | ||
2277 | spin_lock(&buf->lock); | 2278 | spin_lock(&buf->lock); |
2278 | 2279 | ||
2279 | w = array_alloc(&buf->freelist); | 2280 | w = array_alloc(&buf->freelist); |
2280 | 2281 | ||
2281 | w->private = NULL; | 2282 | w->private = NULL; |
2282 | bkey_copy(&w->key, k); | 2283 | bkey_copy(&w->key, k); |
2283 | 2284 | ||
2284 | if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) | 2285 | if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) |
2285 | array_free(&buf->freelist, w); | 2286 | array_free(&buf->freelist, w); |
2286 | 2287 | ||
2287 | spin_unlock(&buf->lock); | 2288 | spin_unlock(&buf->lock); |
2288 | } | 2289 | } |
2289 | } else { | 2290 | } else { |
2290 | if (!k) | 2291 | if (!k) |
2291 | break; | 2292 | break; |
2292 | 2293 | ||
2293 | btree(refill_keybuf, k, b, op, buf, end); | 2294 | btree(refill_keybuf, k, b, op, buf, end, pred); |
2294 | /* | 2295 | /* |
2295 | * Might get an error here, but can't really do anything | 2296 | * Might get an error here, but can't really do anything |
2296 | * and it'll get logged elsewhere. Just read what we | 2297 | * and it'll get logged elsewhere. Just read what we |
2297 | * can. | 2298 | * can. |
2298 | */ | 2299 | */ |
2299 | 2300 | ||
2300 | if (bkey_cmp(&buf->last_scanned, end) >= 0) | 2301 | if (bkey_cmp(&buf->last_scanned, end) >= 0) |
2301 | break; | 2302 | break; |
2302 | 2303 | ||
2303 | cond_resched(); | 2304 | cond_resched(); |
2304 | } | 2305 | } |
2305 | } | 2306 | } |
2306 | 2307 | ||
2307 | return 0; | 2308 | return 0; |
2308 | } | 2309 | } |
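This hunk is where the refill path stops reading a predicate out of the keybuf and instead takes a keybuf_pred_fn argument, so each caller can pass its own filter for which keys belong in the buffer; the writeback code elsewhere in this patch supplies those predicates. A minimal userspace sketch of the callback pattern with hypothetical names (toy_key, dirty_pred, refill) - not the bcache API:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

struct toy_key {
    uint64_t offset;
    bool dirty;
};

/* same shape of question as keybuf_pred_fn: should this key be buffered? */
typedef bool (*pred_fn)(const struct toy_key *k);

static bool dirty_pred(const struct toy_key *k)
{
    return k->dirty;
}

/* scan keys in order, keeping only the ones the caller's predicate accepts */
static size_t refill(const struct toy_key *keys, size_t n,
                     const struct toy_key **out, size_t out_max, pred_fn pred)
{
    size_t found = 0;

    for (size_t i = 0; i < n && found < out_max; i++)
        if (pred(&keys[i]))
            out[found++] = &keys[i];

    return found;
}

int main(void)
{
    struct toy_key keys[] = {
        { 0, true }, { 8, false }, { 16, true }, { 24, true },
    };
    const struct toy_key *buf[4];

    size_t n = refill(keys, 4, buf, 4, dirty_pred);
    for (size_t i = 0; i < n; i++)
        printf("buffered dirty key at %llu\n",
               (unsigned long long)buf[i]->offset);
    return 0;
}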
2309 | 2310 | ||
2310 | void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, | 2311 | void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, |
2311 | struct bkey *end) | 2312 | struct bkey *end, keybuf_pred_fn *pred) |
2312 | { | 2313 | { |
2313 | struct bkey start = buf->last_scanned; | 2314 | struct bkey start = buf->last_scanned; |
2314 | struct btree_op op; | 2315 | struct btree_op op; |
2315 | bch_btree_op_init_stack(&op); | 2316 | bch_btree_op_init_stack(&op); |
2316 | 2317 | ||
2317 | cond_resched(); | 2318 | cond_resched(); |
2318 | 2319 | ||
2319 | btree_root(refill_keybuf, c, &op, buf, end); | 2320 | btree_root(refill_keybuf, c, &op, buf, end, pred); |
2320 | closure_sync(&op.cl); | 2321 | closure_sync(&op.cl); |
2321 | 2322 | ||
2322 | pr_debug("found %s keys from %llu:%llu to %llu:%llu", | 2323 | pr_debug("found %s keys from %llu:%llu to %llu:%llu", |
2323 | RB_EMPTY_ROOT(&buf->keys) ? "no" : | 2324 | RB_EMPTY_ROOT(&buf->keys) ? "no" : |
2324 | array_freelist_empty(&buf->freelist) ? "some" : "a few", | 2325 | array_freelist_empty(&buf->freelist) ? "some" : "a few", |
2325 | KEY_INODE(&start), KEY_OFFSET(&start), | 2326 | KEY_INODE(&start), KEY_OFFSET(&start), |
2326 | KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned)); | 2327 | KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned)); |
2327 | 2328 | ||
2328 | spin_lock(&buf->lock); | 2329 | spin_lock(&buf->lock); |
2329 | 2330 | ||
2330 | if (!RB_EMPTY_ROOT(&buf->keys)) { | 2331 | if (!RB_EMPTY_ROOT(&buf->keys)) { |
2331 | struct keybuf_key *w; | 2332 | struct keybuf_key *w; |
2332 | w = RB_FIRST(&buf->keys, struct keybuf_key, node); | 2333 | w = RB_FIRST(&buf->keys, struct keybuf_key, node); |
2333 | buf->start = START_KEY(&w->key); | 2334 | buf->start = START_KEY(&w->key); |
2334 | 2335 | ||
2335 | w = RB_LAST(&buf->keys, struct keybuf_key, node); | 2336 | w = RB_LAST(&buf->keys, struct keybuf_key, node); |
2336 | buf->end = w->key; | 2337 | buf->end = w->key; |
2337 | } else { | 2338 | } else { |
2338 | buf->start = MAX_KEY; | 2339 | buf->start = MAX_KEY; |
2339 | buf->end = MAX_KEY; | 2340 | buf->end = MAX_KEY; |
2340 | } | 2341 | } |
2341 | 2342 | ||
2342 | spin_unlock(&buf->lock); | 2343 | spin_unlock(&buf->lock); |
2343 | } | 2344 | } |
2344 | 2345 | ||
2345 | static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) | 2346 | static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) |
2346 | { | 2347 | { |
2347 | rb_erase(&w->node, &buf->keys); | 2348 | rb_erase(&w->node, &buf->keys); |
2348 | array_free(&buf->freelist, w); | 2349 | array_free(&buf->freelist, w); |
2349 | } | 2350 | } |
2350 | 2351 | ||
2351 | void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) | 2352 | void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) |
2352 | { | 2353 | { |
2353 | spin_lock(&buf->lock); | 2354 | spin_lock(&buf->lock); |
2354 | __bch_keybuf_del(buf, w); | 2355 | __bch_keybuf_del(buf, w); |
2355 | spin_unlock(&buf->lock); | 2356 | spin_unlock(&buf->lock); |
2356 | } | 2357 | } |
2357 | 2358 | ||
2358 | bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, | 2359 | bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, |
2359 | struct bkey *end) | 2360 | struct bkey *end) |
2360 | { | 2361 | { |
2361 | bool ret = false; | 2362 | bool ret = false; |
2362 | struct keybuf_key *p, *w, s; | 2363 | struct keybuf_key *p, *w, s; |
2363 | s.key = *start; | 2364 | s.key = *start; |
2364 | 2365 | ||
2365 | if (bkey_cmp(end, &buf->start) <= 0 || | 2366 | if (bkey_cmp(end, &buf->start) <= 0 || |
2366 | bkey_cmp(start, &buf->end) >= 0) | 2367 | bkey_cmp(start, &buf->end) >= 0) |
2367 | return false; | 2368 | return false; |
2368 | 2369 | ||
2369 | spin_lock(&buf->lock); | 2370 | spin_lock(&buf->lock); |
2370 | w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp); | 2371 | w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp); |
2371 | 2372 | ||
2372 | while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) { | 2373 | while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) { |
2373 | p = w; | 2374 | p = w; |
2374 | w = RB_NEXT(w, node); | 2375 | w = RB_NEXT(w, node); |
2375 | 2376 | ||
2376 | if (p->private) | 2377 | if (p->private) |
2377 | ret = true; | 2378 | ret = true; |
2378 | else | 2379 | else |
2379 | __bch_keybuf_del(buf, p); | 2380 | __bch_keybuf_del(buf, p); |
2380 | } | 2381 | } |
2381 | 2382 | ||
2382 | spin_unlock(&buf->lock); | 2383 | spin_unlock(&buf->lock); |
2383 | return ret; | 2384 | return ret; |
2384 | } | 2385 | } |
2385 | 2386 | ||
2386 | struct keybuf_key *bch_keybuf_next(struct keybuf *buf) | 2387 | struct keybuf_key *bch_keybuf_next(struct keybuf *buf) |
2387 | { | 2388 | { |
2388 | struct keybuf_key *w; | 2389 | struct keybuf_key *w; |
2389 | spin_lock(&buf->lock); | 2390 | spin_lock(&buf->lock); |
2390 | 2391 | ||
2391 | w = RB_FIRST(&buf->keys, struct keybuf_key, node); | 2392 | w = RB_FIRST(&buf->keys, struct keybuf_key, node); |
2392 | 2393 | ||
2393 | while (w && w->private) | 2394 | while (w && w->private) |
2394 | w = RB_NEXT(w, node); | 2395 | w = RB_NEXT(w, node); |
2395 | 2396 | ||
2396 | if (w) | 2397 | if (w) |
2397 | w->private = ERR_PTR(-EINTR); | 2398 | w->private = ERR_PTR(-EINTR); |
2398 | 2399 | ||
2399 | spin_unlock(&buf->lock); | 2400 | spin_unlock(&buf->lock); |
2400 | return w; | 2401 | return w; |
2401 | } | 2402 | } |
2402 | 2403 | ||
2403 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, | 2404 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, |
2404 | struct keybuf *buf, | 2405 | struct keybuf *buf, |
2405 | struct bkey *end) | 2406 | struct bkey *end, |
2407 | keybuf_pred_fn *pred) | ||
2406 | { | 2408 | { |
2407 | struct keybuf_key *ret; | 2409 | struct keybuf_key *ret; |
2408 | 2410 | ||
2409 | while (1) { | 2411 | while (1) { |
2410 | ret = bch_keybuf_next(buf); | 2412 | ret = bch_keybuf_next(buf); |
2411 | if (ret) | 2413 | if (ret) |
2412 | break; | 2414 | break; |
2413 | 2415 | ||
2414 | if (bkey_cmp(&buf->last_scanned, end) >= 0) { | 2416 | if (bkey_cmp(&buf->last_scanned, end) >= 0) { |
2415 | pr_debug("scan finished"); | 2417 | pr_debug("scan finished"); |
2416 | break; | 2418 | break; |
2417 | } | 2419 | } |
2418 | 2420 | ||
2419 | bch_refill_keybuf(c, buf, end); | 2421 | bch_refill_keybuf(c, buf, end, pred); |
2420 | } | 2422 | } |
2421 | 2423 | ||
2422 | return ret; | 2424 | return ret; |
2423 | } | 2425 | } |
2424 | 2426 | ||
2425 | void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn) | 2427 | void bch_keybuf_init(struct keybuf *buf) |
2426 | { | 2428 | { |
2427 | buf->key_predicate = fn; | ||
2428 | buf->last_scanned = MAX_KEY; | 2429 | buf->last_scanned = MAX_KEY; |
2429 | buf->keys = RB_ROOT; | 2430 | buf->keys = RB_ROOT; |
2430 | 2431 | ||
2431 | spin_lock_init(&buf->lock); | 2432 | spin_lock_init(&buf->lock); |
2432 | array_allocator_init(&buf->freelist); | 2433 | array_allocator_init(&buf->freelist); |
2433 | } | 2434 | } |
2434 | 2435 | ||
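Note how the predicate moves out of struct keybuf with this patch: bch_keybuf_init() no longer takes a keybuf_pred_fn, and the predicate is instead supplied on every refill/rescan call. A minimal sketch of a consumer driving the new interface - the predicate and the scan_keys() helper below are hypothetical, shown only to illustrate the call sequence:

	#include "bcache.h"
	#include "btree.h"

	/* Hypothetical predicate - accept every key, in the spirit of dump_pred() in debug.c */
	static bool pick_all_pred(struct keybuf *buf, struct bkey *k)
	{
		return true;
	}

	/* Hypothetical consumer: refill the keybuf and walk the keys it collected */
	static void scan_keys(struct cache_set *c, struct keybuf *buf)
	{
		struct keybuf_key *w;

		bch_keybuf_init(buf);		/* predicate is no longer stored here */

		while ((w = bch_keybuf_next_rescan(c, buf, &MAX_KEY, pick_all_pred))) {
			/* ... operate on w->key here ... */
			bch_keybuf_del(buf, w);	/* return the slot to the freelist */
		}
	}

This is the same pattern bch_dump_read() in debug.c follows, just with the predicate now passed at scan time rather than fixed at init time.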
2435 | void bch_btree_exit(void) | 2436 | void bch_btree_exit(void) |
2436 | { | 2437 | { |
2437 | if (btree_io_wq) | 2438 | if (btree_io_wq) |
2438 | destroy_workqueue(btree_io_wq); | 2439 | destroy_workqueue(btree_io_wq); |
2439 | if (bch_gc_wq) | 2440 | if (bch_gc_wq) |
2440 | destroy_workqueue(bch_gc_wq); | 2441 | destroy_workqueue(bch_gc_wq); |
2441 | } | 2442 | } |
2442 | 2443 | ||
2443 | int __init bch_btree_init(void) | 2444 | int __init bch_btree_init(void) |
2444 | { | 2445 | { |
2445 | if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) || | 2446 | if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) || |
2446 | !(btree_io_wq = create_singlethread_workqueue("bch_btree_io"))) | 2447 | !(btree_io_wq = create_singlethread_workqueue("bch_btree_io"))) |
2447 | return -ENOMEM; | 2448 | return -ENOMEM; |
2448 | 2449 | ||
2449 | return 0; | 2450 | return 0; |
2450 | } | 2451 | } |
drivers/md/bcache/btree.h
1 | #ifndef _BCACHE_BTREE_H | 1 | #ifndef _BCACHE_BTREE_H |
2 | #define _BCACHE_BTREE_H | 2 | #define _BCACHE_BTREE_H |
3 | 3 | ||
4 | /* | 4 | /* |
5 | * THE BTREE: | 5 | * THE BTREE: |
6 | * | 6 | * |
7 | * At a high level, bcache's btree is a relatively standard b+ tree. All keys and | 7 | * At a high level, bcache's btree is a relatively standard b+ tree. All keys and |
8 | * pointers are in the leaves; interior nodes only have pointers to the child | 8 | * pointers are in the leaves; interior nodes only have pointers to the child |
9 | * nodes. | 9 | * nodes. |
10 | * | 10 | * |
11 | * In the interior nodes, a struct bkey always points to a child btree node, and | 11 | * In the interior nodes, a struct bkey always points to a child btree node, and |
12 | * the key is the highest key in the child node - except that the highest key in | 12 | * the key is the highest key in the child node - except that the highest key in |
13 | * an interior node is always MAX_KEY. The size field refers to the size on disk | 13 | * an interior node is always MAX_KEY. The size field refers to the size on disk |
14 | * of the child node - this would allow us to have variable sized btree nodes | 14 | * of the child node - this would allow us to have variable sized btree nodes |
15 | * (handy for keeping the depth of the btree at 1 by expanding just the root). | 15 | * (handy for keeping the depth of the btree at 1 by expanding just the root). |
16 | * | 16 | * |
17 | * Btree nodes are themselves log structured, but this is hidden fairly | 17 | * Btree nodes are themselves log structured, but this is hidden fairly |
18 | * thoroughly. Btree nodes on disk will in practice have extents that overlap | 18 | * thoroughly. Btree nodes on disk will in practice have extents that overlap |
19 | * (because they were written at different times), but in memory we never have | 19 | * (because they were written at different times), but in memory we never have |
20 | * overlapping extents - when we read in a btree node from disk, the first thing | 20 | * overlapping extents - when we read in a btree node from disk, the first thing |
21 | * we do is resort all the sets of keys with a mergesort, and in the same pass | 21 | * we do is resort all the sets of keys with a mergesort, and in the same pass |
22 | * we check for overlapping extents and adjust them appropriately. | 22 | * we check for overlapping extents and adjust them appropriately. |
23 | * | 23 | * |
24 | * struct btree_op is a central interface to the btree code. It's used for | 24 | * struct btree_op is a central interface to the btree code. It's used for |
25 | * specifying read vs. write locking, and the embedded closure is used for | 25 | * specifying read vs. write locking, and the embedded closure is used for |
26 | * waiting on IO or reserve memory. | 26 | * waiting on IO or reserve memory. |
27 | * | 27 | * |
28 | * BTREE CACHE: | 28 | * BTREE CACHE: |
29 | * | 29 | * |
30 | * Btree nodes are cached in memory; traversing the btree might require reading | 30 | * Btree nodes are cached in memory; traversing the btree might require reading |
31 | * in btree nodes which is handled mostly transparently. | 31 | * in btree nodes which is handled mostly transparently. |
32 | * | 32 | * |
33 | * bch_btree_node_get() looks up a btree node in the cache and reads it in from | 33 | * bch_btree_node_get() looks up a btree node in the cache and reads it in from |
34 | * disk if necessary. This function is almost never called directly though - the | 34 | * disk if necessary. This function is almost never called directly though - the |
35 | * btree() macro is used to get a btree node, call some function on it, and | 35 | * btree() macro is used to get a btree node, call some function on it, and |
36 | * unlock the node after the function returns. | 36 | * unlock the node after the function returns. |
37 | * | 37 | * |
38 | * The root is special cased - it's taken out of the cache's lru (thus pinning | 38 | * The root is special cased - it's taken out of the cache's lru (thus pinning |
39 | * it in memory), so we can find the root of the btree by just dereferencing a | 39 | * it in memory), so we can find the root of the btree by just dereferencing a |
40 | * pointer instead of looking it up in the cache. This makes locking a bit | 40 | * pointer instead of looking it up in the cache. This makes locking a bit |
41 | * tricky, since the root pointer is protected by the lock in the btree node it | 41 | * tricky, since the root pointer is protected by the lock in the btree node it |
42 | * points to - the btree_root() macro handles this. | 42 | * points to - the btree_root() macro handles this. |
43 | * | 43 | * |
44 | * In various places we must be able to allocate memory for multiple btree nodes | 44 | * In various places we must be able to allocate memory for multiple btree nodes |
45 | * in order to make forward progress. To do this we use the btree cache itself | 45 | * in order to make forward progress. To do this we use the btree cache itself |
46 | * as a reserve; if __get_free_pages() fails, we'll find a node in the btree | 46 | * as a reserve; if __get_free_pages() fails, we'll find a node in the btree |
47 | * cache we can reuse. We can't allow more than one thread to be doing this at a | 47 | * cache we can reuse. We can't allow more than one thread to be doing this at a |
48 | * time, so there's a lock, implemented by a pointer to the btree_op closure - | 48 | * time, so there's a lock, implemented by a pointer to the btree_op closure - |
49 | * this allows the btree_root() macro to implicitly release this lock. | 49 | * this allows the btree_root() macro to implicitly release this lock. |
50 | * | 50 | * |
51 | * BTREE IO: | 51 | * BTREE IO: |
52 | * | 52 | * |
53 | * Btree nodes never have to be explicitly read in; bch_btree_node_get() handles | 53 | * Btree nodes never have to be explicitly read in; bch_btree_node_get() handles |
54 | * this. | 54 | * this. |
55 | * | 55 | * |
56 | * For writing, we have two btree_write structs embedded in struct btree - one | 56 | * For writing, we have two btree_write structs embedded in struct btree - one |
57 | * write in flight, and one being set up, and we toggle between them. | 57 | * write in flight, and one being set up, and we toggle between them. |
58 | * | 58 | * |
59 | * Writing is done with a single function - bch_btree_write() really serves two | 59 | * Writing is done with a single function - bch_btree_write() really serves two |
60 | * different purposes and should be broken up into two different functions. When | 60 | * different purposes and should be broken up into two different functions. When |
61 | * passing now = false, it merely indicates that the node is now dirty - calling | 61 | * passing now = false, it merely indicates that the node is now dirty - calling |
62 | * it ensures that the dirty keys will be written at some point in the future. | 62 | * it ensures that the dirty keys will be written at some point in the future. |
63 | * | 63 | * |
64 | * When passing now = true, bch_btree_write() causes a write to happen | 64 | * When passing now = true, bch_btree_write() causes a write to happen |
65 | * "immediately" (if there was already a write in flight, it'll cause the write | 65 | * "immediately" (if there was already a write in flight, it'll cause the write |
66 | * to happen as soon as the previous write completes). It returns immediately | 66 | * to happen as soon as the previous write completes). It returns immediately |
67 | * though - but it takes a refcount on the closure in struct btree_op you passed | 67 | * though - but it takes a refcount on the closure in struct btree_op you passed |
68 | * to it, so a closure_sync() later can be used to wait for the write to | 68 | * to it, so a closure_sync() later can be used to wait for the write to |
69 | * complete. | 69 | * complete. |
70 | * | 70 | * |
71 | * This is handy because btree_split() and garbage collection can issue writes | 71 | * This is handy because btree_split() and garbage collection can issue writes |
72 | * in parallel, reducing the amount of time they have to hold write locks. | 72 | * in parallel, reducing the amount of time they have to hold write locks. |
73 | * | 73 | * |
74 | * LOCKING: | 74 | * LOCKING: |
75 | * | 75 | * |
76 | * When traversing the btree, we may need write locks starting at some level - | 76 | * When traversing the btree, we may need write locks starting at some level - |
77 | * inserting a key into the btree will typically only require a write lock on | 77 | * inserting a key into the btree will typically only require a write lock on |
78 | * the leaf node. | 78 | * the leaf node. |
79 | * | 79 | * |
80 | * This is specified with the lock field in struct btree_op; lock = 0 means we | 80 | * This is specified with the lock field in struct btree_op; lock = 0 means we |
81 | * take write locks at level <= 0, i.e. only leaf nodes. bch_btree_node_get() | 81 | * take write locks at level <= 0, i.e. only leaf nodes. bch_btree_node_get() |
82 | * checks this field and returns the node with the appropriate lock held. | 82 | * checks this field and returns the node with the appropriate lock held. |
83 | * | 83 | * |
84 | * If, after traversing the btree, the insertion code discovers it has to split | 84 | * If, after traversing the btree, the insertion code discovers it has to split |
85 | * then it must restart from the root and take new locks - to do this it changes | 85 | * then it must restart from the root and take new locks - to do this it changes |
86 | * the lock field and returns -EINTR, which causes the btree_root() macro to | 86 | * the lock field and returns -EINTR, which causes the btree_root() macro to |
87 | * loop. | 87 | * loop. |
88 | * | 88 | * |
89 | * Handling cache misses requires a different mechanism for upgrading to a write | 89 | * Handling cache misses requires a different mechanism for upgrading to a write |
90 | * lock. We do cache lookups with only a read lock held, but if we get a cache | 90 | * lock. We do cache lookups with only a read lock held, but if we get a cache |
91 | * miss and we wish to insert this data into the cache, we have to insert a | 91 | * miss and we wish to insert this data into the cache, we have to insert a |
92 | * placeholder key to detect races - otherwise, we could race with a write and | 92 | * placeholder key to detect races - otherwise, we could race with a write and |
93 | * overwrite the data that was just written to the cache with stale data from | 93 | * overwrite the data that was just written to the cache with stale data from |
94 | * the backing device. | 94 | * the backing device. |
95 | * | 95 | * |
96 | * For this we use a sequence number that write locks and unlocks increment - to | 96 | * For this we use a sequence number that write locks and unlocks increment - to |
97 | * insert the check key it unlocks the btree node and then takes a write lock, | 97 | * insert the check key it unlocks the btree node and then takes a write lock, |
98 | * and fails if the sequence number doesn't match. | 98 | * and fails if the sequence number doesn't match. |
99 | */ | 99 | */ |
100 | 100 | ||
101 | #include "bset.h" | 101 | #include "bset.h" |
102 | #include "debug.h" | 102 | #include "debug.h" |
103 | 103 | ||
104 | struct btree_write { | 104 | struct btree_write { |
105 | atomic_t *journal; | 105 | atomic_t *journal; |
106 | 106 | ||
107 | /* If btree_split() frees a btree node, it writes a new pointer to that | 107 | /* If btree_split() frees a btree node, it writes a new pointer to that |
108 | * btree node indicating it was freed; it takes a refcount on | 108 | * btree node indicating it was freed; it takes a refcount on |
109 | * c->prio_blocked because we can't write the gens until the new | 109 | * c->prio_blocked because we can't write the gens until the new |
110 | * pointer is on disk. This allows btree_write_endio() to release the | 110 | * pointer is on disk. This allows btree_write_endio() to release the |
111 | * refcount that btree_split() took. | 111 | * refcount that btree_split() took. |
112 | */ | 112 | */ |
113 | int prio_blocked; | 113 | int prio_blocked; |
114 | }; | 114 | }; |
115 | 115 | ||
116 | struct btree { | 116 | struct btree { |
117 | /* Hottest entries first */ | 117 | /* Hottest entries first */ |
118 | struct hlist_node hash; | 118 | struct hlist_node hash; |
119 | 119 | ||
120 | /* Key/pointer for this btree node */ | 120 | /* Key/pointer for this btree node */ |
121 | BKEY_PADDED(key); | 121 | BKEY_PADDED(key); |
122 | 122 | ||
123 | /* Single bit - set when accessed, cleared by shrinker */ | 123 | /* Single bit - set when accessed, cleared by shrinker */ |
124 | unsigned long accessed; | 124 | unsigned long accessed; |
125 | unsigned long seq; | 125 | unsigned long seq; |
126 | struct rw_semaphore lock; | 126 | struct rw_semaphore lock; |
127 | struct cache_set *c; | 127 | struct cache_set *c; |
128 | 128 | ||
129 | unsigned long flags; | 129 | unsigned long flags; |
130 | uint16_t written; /* would be nice to kill */ | 130 | uint16_t written; /* would be nice to kill */ |
131 | uint8_t level; | 131 | uint8_t level; |
132 | uint8_t nsets; | 132 | uint8_t nsets; |
133 | uint8_t page_order; | 133 | uint8_t page_order; |
134 | 134 | ||
135 | /* | 135 | /* |
136 | * Set of sorted keys - the real btree node - plus a binary search tree | 136 | * Set of sorted keys - the real btree node - plus a binary search tree |
137 | * | 137 | * |
138 | * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point | 138 | * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point |
139 | * to the memory we have allocated for this btree node. Additionally, | 139 | * to the memory we have allocated for this btree node. Additionally, |
140 | * set[0]->data points to the entire btree node as it exists on disk. | 140 | * set[0]->data points to the entire btree node as it exists on disk. |
141 | */ | 141 | */ |
142 | struct bset_tree sets[MAX_BSETS]; | 142 | struct bset_tree sets[MAX_BSETS]; |
143 | 143 | ||
144 | /* For outstanding btree writes, used as a lock - protects write_idx */ | 144 | /* For outstanding btree writes, used as a lock - protects write_idx */ |
145 | struct closure_with_waitlist io; | 145 | struct closure_with_waitlist io; |
146 | 146 | ||
147 | struct list_head list; | 147 | struct list_head list; |
148 | struct delayed_work work; | 148 | struct delayed_work work; |
149 | 149 | ||
150 | struct btree_write writes[2]; | 150 | struct btree_write writes[2]; |
151 | struct bio *bio; | 151 | struct bio *bio; |
152 | }; | 152 | }; |
153 | 153 | ||
154 | #define BTREE_FLAG(flag) \ | 154 | #define BTREE_FLAG(flag) \ |
155 | static inline bool btree_node_ ## flag(struct btree *b) \ | 155 | static inline bool btree_node_ ## flag(struct btree *b) \ |
156 | { return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ | 156 | { return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ |
157 | \ | 157 | \ |
158 | static inline void set_btree_node_ ## flag(struct btree *b) \ | 158 | static inline void set_btree_node_ ## flag(struct btree *b) \ |
159 | { set_bit(BTREE_NODE_ ## flag, &b->flags); } \ | 159 | { set_bit(BTREE_NODE_ ## flag, &b->flags); } \ |
160 | 160 | ||
161 | enum btree_flags { | 161 | enum btree_flags { |
162 | BTREE_NODE_io_error, | 162 | BTREE_NODE_io_error, |
163 | BTREE_NODE_dirty, | 163 | BTREE_NODE_dirty, |
164 | BTREE_NODE_write_idx, | 164 | BTREE_NODE_write_idx, |
165 | }; | 165 | }; |
166 | 166 | ||
167 | BTREE_FLAG(io_error); | 167 | BTREE_FLAG(io_error); |
168 | BTREE_FLAG(dirty); | 168 | BTREE_FLAG(dirty); |
169 | BTREE_FLAG(write_idx); | 169 | BTREE_FLAG(write_idx); |
170 | 170 | ||
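For reference, this is what BTREE_FLAG(dirty) above generates once the preprocessor is done - a test helper and a set helper, but no clear helper (expansion shown only for illustration):

	static inline bool btree_node_dirty(struct btree *b)
	{ return test_bit(BTREE_NODE_dirty, &b->flags); }

	static inline void set_btree_node_dirty(struct btree *b)
	{ set_bit(BTREE_NODE_dirty, &b->flags); }

btree_node_io_error() and btree_node_write_idx() come from the same template.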
171 | static inline struct btree_write *btree_current_write(struct btree *b) | 171 | static inline struct btree_write *btree_current_write(struct btree *b) |
172 | { | 172 | { |
173 | return b->writes + btree_node_write_idx(b); | 173 | return b->writes + btree_node_write_idx(b); |
174 | } | 174 | } |
175 | 175 | ||
176 | static inline struct btree_write *btree_prev_write(struct btree *b) | 176 | static inline struct btree_write *btree_prev_write(struct btree *b) |
177 | { | 177 | { |
178 | return b->writes + (btree_node_write_idx(b) ^ 1); | 178 | return b->writes + (btree_node_write_idx(b) ^ 1); |
179 | } | 179 | } |
180 | 180 | ||
181 | static inline unsigned bset_offset(struct btree *b, struct bset *i) | 181 | static inline unsigned bset_offset(struct btree *b, struct bset *i) |
182 | { | 182 | { |
183 | return (((size_t) i) - ((size_t) b->sets->data)) >> 9; | 183 | return (((size_t) i) - ((size_t) b->sets->data)) >> 9; |
184 | } | 184 | } |
185 | 185 | ||
186 | static inline struct bset *write_block(struct btree *b) | 186 | static inline struct bset *write_block(struct btree *b) |
187 | { | 187 | { |
188 | return ((void *) b->sets[0].data) + b->written * block_bytes(b->c); | 188 | return ((void *) b->sets[0].data) + b->written * block_bytes(b->c); |
189 | } | 189 | } |
190 | 190 | ||
191 | static inline bool bset_written(struct btree *b, struct bset_tree *t) | 191 | static inline bool bset_written(struct btree *b, struct bset_tree *t) |
192 | { | 192 | { |
193 | return t->data < write_block(b); | 193 | return t->data < write_block(b); |
194 | } | 194 | } |
195 | 195 | ||
196 | static inline bool bkey_written(struct btree *b, struct bkey *k) | 196 | static inline bool bkey_written(struct btree *b, struct bkey *k) |
197 | { | 197 | { |
198 | return k < write_block(b)->start; | 198 | return k < write_block(b)->start; |
199 | } | 199 | } |
200 | 200 | ||
201 | static inline void set_gc_sectors(struct cache_set *c) | 201 | static inline void set_gc_sectors(struct cache_set *c) |
202 | { | 202 | { |
203 | atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8); | 203 | atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8); |
204 | } | 204 | } |
205 | 205 | ||
206 | static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) | 206 | static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) |
207 | { | 207 | { |
208 | return __bch_ptr_invalid(b->c, b->level, k); | 208 | return __bch_ptr_invalid(b->c, b->level, k); |
209 | } | 209 | } |
210 | 210 | ||
211 | static inline struct bkey *bch_btree_iter_init(struct btree *b, | 211 | static inline struct bkey *bch_btree_iter_init(struct btree *b, |
212 | struct btree_iter *iter, | 212 | struct btree_iter *iter, |
213 | struct bkey *search) | 213 | struct bkey *search) |
214 | { | 214 | { |
215 | return __bch_btree_iter_init(b, iter, search, b->sets); | 215 | return __bch_btree_iter_init(b, iter, search, b->sets); |
216 | } | 216 | } |
217 | 217 | ||
218 | /* Looping macros */ | 218 | /* Looping macros */ |
219 | 219 | ||
220 | #define for_each_cached_btree(b, c, iter) \ | 220 | #define for_each_cached_btree(b, c, iter) \ |
221 | for (iter = 0; \ | 221 | for (iter = 0; \ |
222 | iter < ARRAY_SIZE((c)->bucket_hash); \ | 222 | iter < ARRAY_SIZE((c)->bucket_hash); \ |
223 | iter++) \ | 223 | iter++) \ |
224 | hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash) | 224 | hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash) |
225 | 225 | ||
226 | #define for_each_key_filter(b, k, iter, filter) \ | 226 | #define for_each_key_filter(b, k, iter, filter) \ |
227 | for (bch_btree_iter_init((b), (iter), NULL); \ | 227 | for (bch_btree_iter_init((b), (iter), NULL); \ |
228 | ((k) = bch_btree_iter_next_filter((iter), b, filter));) | 228 | ((k) = bch_btree_iter_next_filter((iter), b, filter));) |
229 | 229 | ||
230 | #define for_each_key(b, k, iter) \ | 230 | #define for_each_key(b, k, iter) \ |
231 | for (bch_btree_iter_init((b), (iter), NULL); \ | 231 | for (bch_btree_iter_init((b), (iter), NULL); \ |
232 | ((k) = bch_btree_iter_next(iter));) | 232 | ((k) = bch_btree_iter_next(iter));) |
233 | 233 | ||
234 | /* Recursing down the btree */ | 234 | /* Recursing down the btree */ |
235 | 235 | ||
236 | struct btree_op { | 236 | struct btree_op { |
237 | struct closure cl; | 237 | struct closure cl; |
238 | struct cache_set *c; | 238 | struct cache_set *c; |
239 | 239 | ||
240 | /* Journal entry we have a refcount on */ | 240 | /* Journal entry we have a refcount on */ |
241 | atomic_t *journal; | 241 | atomic_t *journal; |
242 | 242 | ||
243 | /* Bio to be inserted into the cache */ | 243 | /* Bio to be inserted into the cache */ |
244 | struct bio *cache_bio; | 244 | struct bio *cache_bio; |
245 | 245 | ||
246 | unsigned inode; | 246 | unsigned inode; |
247 | 247 | ||
248 | uint16_t write_prio; | 248 | uint16_t write_prio; |
249 | 249 | ||
250 | /* Btree level at which we start taking write locks */ | 250 | /* Btree level at which we start taking write locks */ |
251 | short lock; | 251 | short lock; |
252 | 252 | ||
253 | /* Btree insertion type */ | 253 | /* Btree insertion type */ |
254 | enum { | 254 | enum { |
255 | BTREE_INSERT, | 255 | BTREE_INSERT, |
256 | BTREE_REPLACE | 256 | BTREE_REPLACE |
257 | } type:8; | 257 | } type:8; |
258 | 258 | ||
259 | unsigned csum:1; | 259 | unsigned csum:1; |
260 | unsigned skip:1; | 260 | unsigned skip:1; |
261 | unsigned flush_journal:1; | 261 | unsigned flush_journal:1; |
262 | 262 | ||
263 | unsigned insert_data_done:1; | 263 | unsigned insert_data_done:1; |
264 | unsigned lookup_done:1; | 264 | unsigned lookup_done:1; |
265 | unsigned insert_collision:1; | 265 | unsigned insert_collision:1; |
266 | 266 | ||
267 | /* Anything after this point won't get zeroed in do_bio_hook() */ | 267 | /* Anything after this point won't get zeroed in do_bio_hook() */ |
268 | 268 | ||
269 | /* Keys to be inserted */ | 269 | /* Keys to be inserted */ |
270 | struct keylist keys; | 270 | struct keylist keys; |
271 | BKEY_PADDED(replace); | 271 | BKEY_PADDED(replace); |
272 | }; | 272 | }; |
273 | 273 | ||
274 | enum { | 274 | enum { |
275 | BTREE_INSERT_STATUS_INSERT, | 275 | BTREE_INSERT_STATUS_INSERT, |
276 | BTREE_INSERT_STATUS_BACK_MERGE, | 276 | BTREE_INSERT_STATUS_BACK_MERGE, |
277 | BTREE_INSERT_STATUS_OVERWROTE, | 277 | BTREE_INSERT_STATUS_OVERWROTE, |
278 | BTREE_INSERT_STATUS_FRONT_MERGE, | 278 | BTREE_INSERT_STATUS_FRONT_MERGE, |
279 | }; | 279 | }; |
280 | 280 | ||
281 | void bch_btree_op_init_stack(struct btree_op *); | 281 | void bch_btree_op_init_stack(struct btree_op *); |
282 | 282 | ||
283 | static inline void rw_lock(bool w, struct btree *b, int level) | 283 | static inline void rw_lock(bool w, struct btree *b, int level) |
284 | { | 284 | { |
285 | w ? down_write_nested(&b->lock, level + 1) | 285 | w ? down_write_nested(&b->lock, level + 1) |
286 | : down_read_nested(&b->lock, level + 1); | 286 | : down_read_nested(&b->lock, level + 1); |
287 | if (w) | 287 | if (w) |
288 | b->seq++; | 288 | b->seq++; |
289 | } | 289 | } |
290 | 290 | ||
291 | static inline void rw_unlock(bool w, struct btree *b) | 291 | static inline void rw_unlock(bool w, struct btree *b) |
292 | { | 292 | { |
293 | #ifdef CONFIG_BCACHE_EDEBUG | 293 | #ifdef CONFIG_BCACHE_EDEBUG |
294 | unsigned i; | 294 | unsigned i; |
295 | 295 | ||
296 | if (w && b->key.ptr[0]) | 296 | if (w && b->key.ptr[0]) |
297 | for (i = 0; i <= b->nsets; i++) | 297 | for (i = 0; i <= b->nsets; i++) |
298 | bch_check_key_order(b, b->sets[i].data); | 298 | bch_check_key_order(b, b->sets[i].data); |
299 | #endif | 299 | #endif |
300 | 300 | ||
301 | if (w) | 301 | if (w) |
302 | b->seq++; | 302 | b->seq++; |
303 | (w ? up_write : up_read)(&b->lock); | 303 | (w ? up_write : up_read)(&b->lock); |
304 | } | 304 | } |
305 | 305 | ||
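Since rw_lock() and rw_unlock() each bump b->seq for write locks, a reader that has to drop its read lock can tell whether another writer slipped in before it reacquired the node. A conceptual sketch of that check - this is not the actual bch_btree_insert_check_key() code, just an illustration of how the sequence number described in the LOCKING comment above can be used:

	/* Conceptual only: upgrade a read lock to a write lock, failing if raced */
	static bool upgrade_lock_or_fail(struct btree *b, int level)
	{
		unsigned long seq = b->seq;	/* sampled while holding the read lock */

		rw_unlock(false, b);		/* drop the read lock... */
		rw_lock(true, b, level);	/* ...and take a write lock */

		/*
		 * Our rw_lock(true, ...) bumped seq exactly once; anything more
		 * means another writer locked (and unlocked) the node in the window.
		 */
		if (b->seq != seq + 1) {
			rw_unlock(true, b);
			return false;		/* caller should retry the traversal */
		}

		return true;
	}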
306 | #define insert_lock(s, b) ((b)->level <= (s)->lock) | 306 | #define insert_lock(s, b) ((b)->level <= (s)->lock) |
307 | 307 | ||
308 | /* | 308 | /* |
309 | * These macros are for recursing down the btree - they handle the details of | 309 | * These macros are for recursing down the btree - they handle the details of |
310 | * locking and looking up nodes in the cache for you. They're best treated as | 310 | * locking and looking up nodes in the cache for you. They're best treated as |
311 | * mere syntax when reading code that uses them. | 311 | * mere syntax when reading code that uses them. |
312 | * | 312 | * |
313 | * op->lock determines whether we take a read or a write lock at a given depth. | 313 | * op->lock determines whether we take a read or a write lock at a given depth. |
314 | * If you've got a read lock and find that you need a write lock (i.e. you're | 314 | * If you've got a read lock and find that you need a write lock (i.e. you're |
315 | * going to have to split), set op->lock and return -EINTR; btree_root() will | 315 | * going to have to split), set op->lock and return -EINTR; btree_root() will |
316 | * call you again and you'll have the correct lock. | 316 | * call you again and you'll have the correct lock. |
317 | */ | 317 | */ |
318 | 318 | ||
319 | /** | 319 | /** |
320 | * btree - recurse down the btree on a specified key | 320 | * btree - recurse down the btree on a specified key |
321 | * @fn: function to call, which will be passed the child node | 321 | * @fn: function to call, which will be passed the child node |
322 | * @key: key to recurse on | 322 | * @key: key to recurse on |
323 | * @b: parent btree node | 323 | * @b: parent btree node |
324 | * @op: pointer to struct btree_op | 324 | * @op: pointer to struct btree_op |
325 | */ | 325 | */ |
326 | #define btree(fn, key, b, op, ...) \ | 326 | #define btree(fn, key, b, op, ...) \ |
327 | ({ \ | 327 | ({ \ |
328 | int _r, l = (b)->level - 1; \ | 328 | int _r, l = (b)->level - 1; \ |
329 | bool _w = l <= (op)->lock; \ | 329 | bool _w = l <= (op)->lock; \ |
330 | struct btree *_b = bch_btree_node_get((b)->c, key, l, op); \ | 330 | struct btree *_b = bch_btree_node_get((b)->c, key, l, op); \ |
331 | if (!IS_ERR(_b)) { \ | 331 | if (!IS_ERR(_b)) { \ |
332 | _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ | 332 | _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ |
333 | rw_unlock(_w, _b); \ | 333 | rw_unlock(_w, _b); \ |
334 | } else \ | 334 | } else \ |
335 | _r = PTR_ERR(_b); \ | 335 | _r = PTR_ERR(_b); \ |
336 | _r; \ | 336 | _r; \ |
337 | }) | 337 | }) |
338 | 338 | ||
339 | /** | 339 | /** |
340 | * btree_root - call a function on the root of the btree | 340 | * btree_root - call a function on the root of the btree |
341 | * @fn: function to call, which will be passed the root node | 341 | * @fn: function to call, which will be passed the root node |
342 | * @c: cache set | 342 | * @c: cache set |
343 | * @op: pointer to struct btree_op | 343 | * @op: pointer to struct btree_op |
344 | */ | 344 | */ |
345 | #define btree_root(fn, c, op, ...) \ | 345 | #define btree_root(fn, c, op, ...) \ |
346 | ({ \ | 346 | ({ \ |
347 | int _r = -EINTR; \ | 347 | int _r = -EINTR; \ |
348 | do { \ | 348 | do { \ |
349 | struct btree *_b = (c)->root; \ | 349 | struct btree *_b = (c)->root; \ |
350 | bool _w = insert_lock(op, _b); \ | 350 | bool _w = insert_lock(op, _b); \ |
351 | rw_lock(_w, _b, _b->level); \ | 351 | rw_lock(_w, _b, _b->level); \ |
352 | if (_b == (c)->root && \ | 352 | if (_b == (c)->root && \ |
353 | _w == insert_lock(op, _b)) \ | 353 | _w == insert_lock(op, _b)) \ |
354 | _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ | 354 | _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ |
355 | rw_unlock(_w, _b); \ | 355 | rw_unlock(_w, _b); \ |
356 | bch_cannibalize_unlock(c, &(op)->cl); \ | 356 | bch_cannibalize_unlock(c, &(op)->cl); \ |
357 | } while (_r == -EINTR); \ | 357 | } while (_r == -EINTR); \ |
358 | \ | 358 | \ |
359 | _r; \ | 359 | _r; \ |
360 | }) | 360 | }) |
361 | 361 | ||
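To make the macro plumbing concrete: the refill path at the top of this diff calls btree(refill_keybuf, k, b, op, buf, end, pred), and after substitution that statement expression is essentially the following (illustrative expansion of the macro body above, nothing more):

	({
		int _r, l = (b)->level - 1;
		bool _w = l <= (op)->lock;	/* write-lock the child at this level? */
		struct btree *_b = bch_btree_node_get((b)->c, k, l, op);
		if (!IS_ERR(_b)) {
			_r = bch_btree_refill_keybuf(_b, op, buf, end, pred);
			rw_unlock(_w, _b);
		} else
			_r = PTR_ERR(_b);
		_r;	/* value of the whole btree(...) expression */
	})

btree_root(refill_keybuf, c, &op, buf, end, pred) wraps the same call in the retry loop shown above, restarting from the root whenever the callee returns -EINTR.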
362 | static inline bool should_split(struct btree *b) | 362 | static inline bool should_split(struct btree *b) |
363 | { | 363 | { |
364 | struct bset *i = write_block(b); | 364 | struct bset *i = write_block(b); |
365 | return b->written >= btree_blocks(b) || | 365 | return b->written >= btree_blocks(b) || |
366 | (i->seq == b->sets[0].data->seq && | 366 | (i->seq == b->sets[0].data->seq && |
367 | b->written + __set_blocks(i, i->keys + 15, b->c) | 367 | b->written + __set_blocks(i, i->keys + 15, b->c) |
368 | > btree_blocks(b)); | 368 | > btree_blocks(b)); |
369 | } | 369 | } |
370 | 370 | ||
371 | void bch_btree_node_read(struct btree *); | 371 | void bch_btree_node_read(struct btree *); |
372 | void bch_btree_node_read_done(struct btree *); | 372 | void bch_btree_node_read_done(struct btree *); |
373 | void bch_btree_node_write(struct btree *, struct closure *); | 373 | void bch_btree_node_write(struct btree *, struct closure *); |
374 | 374 | ||
375 | void bch_cannibalize_unlock(struct cache_set *, struct closure *); | 375 | void bch_cannibalize_unlock(struct cache_set *, struct closure *); |
376 | void bch_btree_set_root(struct btree *); | 376 | void bch_btree_set_root(struct btree *); |
377 | struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); | 377 | struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); |
378 | struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, | 378 | struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, |
379 | int, struct btree_op *); | 379 | int, struct btree_op *); |
380 | 380 | ||
381 | bool bch_btree_insert_keys(struct btree *, struct btree_op *); | 381 | bool bch_btree_insert_keys(struct btree *, struct btree_op *); |
382 | bool bch_btree_insert_check_key(struct btree *, struct btree_op *, | 382 | bool bch_btree_insert_check_key(struct btree *, struct btree_op *, |
383 | struct bio *); | 383 | struct bio *); |
384 | int bch_btree_insert(struct btree_op *, struct cache_set *); | 384 | int bch_btree_insert(struct btree_op *, struct cache_set *); |
385 | 385 | ||
386 | int bch_btree_search_recurse(struct btree *, struct btree_op *); | 386 | int bch_btree_search_recurse(struct btree *, struct btree_op *); |
387 | 387 | ||
388 | void bch_queue_gc(struct cache_set *); | 388 | void bch_queue_gc(struct cache_set *); |
389 | size_t bch_btree_gc_finish(struct cache_set *); | 389 | size_t bch_btree_gc_finish(struct cache_set *); |
390 | void bch_moving_gc(struct closure *); | 390 | void bch_moving_gc(struct closure *); |
391 | int bch_btree_check(struct cache_set *, struct btree_op *); | 391 | int bch_btree_check(struct cache_set *, struct btree_op *); |
392 | uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); | 392 | uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); |
393 | 393 | ||
394 | void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *); | 394 | void bch_keybuf_init(struct keybuf *); |
395 | void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *); | 395 | void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *, |
396 | keybuf_pred_fn *); | ||
396 | bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, | 397 | bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, |
397 | struct bkey *); | 398 | struct bkey *); |
398 | void bch_keybuf_del(struct keybuf *, struct keybuf_key *); | 399 | void bch_keybuf_del(struct keybuf *, struct keybuf_key *); |
399 | struct keybuf_key *bch_keybuf_next(struct keybuf *); | 400 | struct keybuf_key *bch_keybuf_next(struct keybuf *); |
400 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, | 401 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *, |
401 | struct keybuf *, struct bkey *); | 402 | struct bkey *, keybuf_pred_fn *); |
402 | 403 | ||
403 | #endif | 404 | #endif |
404 | 405 |
drivers/md/bcache/debug.c
1 | /* | 1 | /* |
2 | * Assorted bcache debug code | 2 | * Assorted bcache debug code |
3 | * | 3 | * |
4 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | 4 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> |
5 | * Copyright 2012 Google, Inc. | 5 | * Copyright 2012 Google, Inc. |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include "bcache.h" | 8 | #include "bcache.h" |
9 | #include "btree.h" | 9 | #include "btree.h" |
10 | #include "debug.h" | 10 | #include "debug.h" |
11 | #include "request.h" | 11 | #include "request.h" |
12 | 12 | ||
13 | #include <linux/console.h> | 13 | #include <linux/console.h> |
14 | #include <linux/debugfs.h> | 14 | #include <linux/debugfs.h> |
15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
16 | #include <linux/random.h> | 16 | #include <linux/random.h> |
17 | #include <linux/seq_file.h> | 17 | #include <linux/seq_file.h> |
18 | 18 | ||
19 | static struct dentry *debug; | 19 | static struct dentry *debug; |
20 | 20 | ||
21 | const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) | 21 | const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) |
22 | { | 22 | { |
23 | unsigned i; | 23 | unsigned i; |
24 | 24 | ||
25 | for (i = 0; i < KEY_PTRS(k); i++) | 25 | for (i = 0; i < KEY_PTRS(k); i++) |
26 | if (ptr_available(c, k, i)) { | 26 | if (ptr_available(c, k, i)) { |
27 | struct cache *ca = PTR_CACHE(c, k, i); | 27 | struct cache *ca = PTR_CACHE(c, k, i); |
28 | size_t bucket = PTR_BUCKET_NR(c, k, i); | 28 | size_t bucket = PTR_BUCKET_NR(c, k, i); |
29 | size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); | 29 | size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); |
30 | 30 | ||
31 | if (KEY_SIZE(k) + r > c->sb.bucket_size) | 31 | if (KEY_SIZE(k) + r > c->sb.bucket_size) |
32 | return "bad, length too big"; | 32 | return "bad, length too big"; |
33 | if (bucket < ca->sb.first_bucket) | 33 | if (bucket < ca->sb.first_bucket) |
34 | return "bad, short offset"; | 34 | return "bad, short offset"; |
35 | if (bucket >= ca->sb.nbuckets) | 35 | if (bucket >= ca->sb.nbuckets) |
36 | return "bad, offset past end of device"; | 36 | return "bad, offset past end of device"; |
37 | if (ptr_stale(c, k, i)) | 37 | if (ptr_stale(c, k, i)) |
38 | return "stale"; | 38 | return "stale"; |
39 | } | 39 | } |
40 | 40 | ||
41 | if (!bkey_cmp(k, &ZERO_KEY)) | 41 | if (!bkey_cmp(k, &ZERO_KEY)) |
42 | return "bad, null key"; | 42 | return "bad, null key"; |
43 | if (!KEY_PTRS(k)) | 43 | if (!KEY_PTRS(k)) |
44 | return "bad, no pointers"; | 44 | return "bad, no pointers"; |
45 | if (!KEY_SIZE(k)) | 45 | if (!KEY_SIZE(k)) |
46 | return "zeroed key"; | 46 | return "zeroed key"; |
47 | return ""; | 47 | return ""; |
48 | } | 48 | } |
49 | 49 | ||
50 | int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) | 50 | int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) |
51 | { | 51 | { |
52 | unsigned i = 0; | 52 | unsigned i = 0; |
53 | char *out = buf, *end = buf + size; | 53 | char *out = buf, *end = buf + size; |
54 | 54 | ||
55 | #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) | 55 | #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) |
56 | 56 | ||
57 | p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k)); | 57 | p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k)); |
58 | 58 | ||
59 | if (KEY_PTRS(k)) | 59 | if (KEY_PTRS(k)) |
60 | while (1) { | 60 | while (1) { |
61 | p("%llu:%llu gen %llu", | 61 | p("%llu:%llu gen %llu", |
62 | PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i)); | 62 | PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i)); |
63 | 63 | ||
64 | if (++i == KEY_PTRS(k)) | 64 | if (++i == KEY_PTRS(k)) |
65 | break; | 65 | break; |
66 | 66 | ||
67 | p(", "); | 67 | p(", "); |
68 | } | 68 | } |
69 | 69 | ||
70 | p("]"); | 70 | p("]"); |
71 | 71 | ||
72 | if (KEY_DIRTY(k)) | 72 | if (KEY_DIRTY(k)) |
73 | p(" dirty"); | 73 | p(" dirty"); |
74 | if (KEY_CSUM(k)) | 74 | if (KEY_CSUM(k)) |
75 | p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); | 75 | p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); |
76 | #undef p | 76 | #undef p |
77 | return out - buf; | 77 | return out - buf; |
78 | } | 78 | } |
79 | 79 | ||
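The string bch_bkey_to_text() builds is the one dump_bset() below prints: "inode:offset len size -> [dev:offset gen g, ...]" plus optional " dirty" and checksum suffixes. A small usage sketch - the key values in the comment are made up, purely for illustration:

	char buf[80];

	bch_bkey_to_text(buf, sizeof(buf), k);
	/* buf now holds something like: "1:4096 len 8 -> [0:2048 gen 3] dirty" */
	printk(KERN_ERR "%s\n", buf);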
80 | int bch_btree_to_text(char *buf, size_t size, const struct btree *b) | 80 | int bch_btree_to_text(char *buf, size_t size, const struct btree *b) |
81 | { | 81 | { |
82 | return scnprintf(buf, size, "%zu level %i/%i", | 82 | return scnprintf(buf, size, "%zu level %i/%i", |
83 | PTR_BUCKET_NR(b->c, &b->key, 0), | 83 | PTR_BUCKET_NR(b->c, &b->key, 0), |
84 | b->level, b->c->root ? b->c->root->level : -1); | 84 | b->level, b->c->root ? b->c->root->level : -1); |
85 | } | 85 | } |
86 | 86 | ||
87 | #if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) | 87 | #if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) |
88 | 88 | ||
89 | static bool skipped_backwards(struct btree *b, struct bkey *k) | 89 | static bool skipped_backwards(struct btree *b, struct bkey *k) |
90 | { | 90 | { |
91 | return bkey_cmp(k, (!b->level) | 91 | return bkey_cmp(k, (!b->level) |
92 | ? &START_KEY(bkey_next(k)) | 92 | ? &START_KEY(bkey_next(k)) |
93 | : bkey_next(k)) > 0; | 93 | : bkey_next(k)) > 0; |
94 | } | 94 | } |
95 | 95 | ||
96 | static void dump_bset(struct btree *b, struct bset *i) | 96 | static void dump_bset(struct btree *b, struct bset *i) |
97 | { | 97 | { |
98 | struct bkey *k; | 98 | struct bkey *k; |
99 | unsigned j; | 99 | unsigned j; |
100 | char buf[80]; | 100 | char buf[80]; |
101 | 101 | ||
102 | for (k = i->start; k < end(i); k = bkey_next(k)) { | 102 | for (k = i->start; k < end(i); k = bkey_next(k)) { |
103 | bch_bkey_to_text(buf, sizeof(buf), k); | 103 | bch_bkey_to_text(buf, sizeof(buf), k); |
104 | printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), | 104 | printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), |
105 | (uint64_t *) k - i->d, i->keys, buf); | 105 | (uint64_t *) k - i->d, i->keys, buf); |
106 | 106 | ||
107 | for (j = 0; j < KEY_PTRS(k); j++) { | 107 | for (j = 0; j < KEY_PTRS(k); j++) { |
108 | size_t n = PTR_BUCKET_NR(b->c, k, j); | 108 | size_t n = PTR_BUCKET_NR(b->c, k, j); |
109 | printk(" bucket %zu", n); | 109 | printk(" bucket %zu", n); |
110 | 110 | ||
111 | if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) | 111 | if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) |
112 | printk(" prio %i", | 112 | printk(" prio %i", |
113 | PTR_BUCKET(b->c, k, j)->prio); | 113 | PTR_BUCKET(b->c, k, j)->prio); |
114 | } | 114 | } |
115 | 115 | ||
116 | printk(" %s\n", bch_ptr_status(b->c, k)); | 116 | printk(" %s\n", bch_ptr_status(b->c, k)); |
117 | 117 | ||
118 | if (bkey_next(k) < end(i) && | 118 | if (bkey_next(k) < end(i) && |
119 | skipped_backwards(b, k)) | 119 | skipped_backwards(b, k)) |
120 | printk(KERN_ERR "Key skipped backwards\n"); | 120 | printk(KERN_ERR "Key skipped backwards\n"); |
121 | } | 121 | } |
122 | } | 122 | } |
123 | 123 | ||
124 | #endif | 124 | #endif |
125 | 125 | ||
126 | #ifdef CONFIG_BCACHE_DEBUG | 126 | #ifdef CONFIG_BCACHE_DEBUG |
127 | 127 | ||
128 | void bch_btree_verify(struct btree *b, struct bset *new) | 128 | void bch_btree_verify(struct btree *b, struct bset *new) |
129 | { | 129 | { |
130 | struct btree *v = b->c->verify_data; | 130 | struct btree *v = b->c->verify_data; |
131 | struct closure cl; | 131 | struct closure cl; |
132 | closure_init_stack(&cl); | 132 | closure_init_stack(&cl); |
133 | 133 | ||
134 | if (!b->c->verify) | 134 | if (!b->c->verify) |
135 | return; | 135 | return; |
136 | 136 | ||
137 | closure_wait_event(&b->io.wait, &cl, | 137 | closure_wait_event(&b->io.wait, &cl, |
138 | atomic_read(&b->io.cl.remaining) == -1); | 138 | atomic_read(&b->io.cl.remaining) == -1); |
139 | 139 | ||
140 | mutex_lock(&b->c->verify_lock); | 140 | mutex_lock(&b->c->verify_lock); |
141 | 141 | ||
142 | bkey_copy(&v->key, &b->key); | 142 | bkey_copy(&v->key, &b->key); |
143 | v->written = 0; | 143 | v->written = 0; |
144 | v->level = b->level; | 144 | v->level = b->level; |
145 | 145 | ||
146 | bch_btree_node_read(v); | 146 | bch_btree_node_read(v); |
147 | closure_wait_event(&v->io.wait, &cl, | 147 | closure_wait_event(&v->io.wait, &cl, |
148 | atomic_read(&b->io.cl.remaining) == -1); | 148 | atomic_read(&b->io.cl.remaining) == -1); |
149 | 149 | ||
150 | if (new->keys != v->sets[0].data->keys || | 150 | if (new->keys != v->sets[0].data->keys || |
151 | memcmp(new->start, | 151 | memcmp(new->start, |
152 | v->sets[0].data->start, | 152 | v->sets[0].data->start, |
153 | (void *) end(new) - (void *) new->start)) { | 153 | (void *) end(new) - (void *) new->start)) { |
154 | unsigned i, j; | 154 | unsigned i, j; |
155 | 155 | ||
156 | console_lock(); | 156 | console_lock(); |
157 | 157 | ||
158 | printk(KERN_ERR "*** original memory node:\n"); | 158 | printk(KERN_ERR "*** original memory node:\n"); |
159 | for (i = 0; i <= b->nsets; i++) | 159 | for (i = 0; i <= b->nsets; i++) |
160 | dump_bset(b, b->sets[i].data); | 160 | dump_bset(b, b->sets[i].data); |
161 | 161 | ||
162 | printk(KERN_ERR "*** sorted memory node:\n"); | 162 | printk(KERN_ERR "*** sorted memory node:\n"); |
163 | dump_bset(b, new); | 163 | dump_bset(b, new); |
164 | 164 | ||
165 | printk(KERN_ERR "*** on disk node:\n"); | 165 | printk(KERN_ERR "*** on disk node:\n"); |
166 | dump_bset(v, v->sets[0].data); | 166 | dump_bset(v, v->sets[0].data); |
167 | 167 | ||
168 | for (j = 0; j < new->keys; j++) | 168 | for (j = 0; j < new->keys; j++) |
169 | if (new->d[j] != v->sets[0].data->d[j]) | 169 | if (new->d[j] != v->sets[0].data->d[j]) |
170 | break; | 170 | break; |
171 | 171 | ||
172 | console_unlock(); | 172 | console_unlock(); |
173 | panic("verify failed at %u\n", j); | 173 | panic("verify failed at %u\n", j); |
174 | } | 174 | } |
175 | 175 | ||
176 | mutex_unlock(&b->c->verify_lock); | 176 | mutex_unlock(&b->c->verify_lock); |
177 | } | 177 | } |
178 | 178 | ||
179 | static void data_verify_endio(struct bio *bio, int error) | 179 | static void data_verify_endio(struct bio *bio, int error) |
180 | { | 180 | { |
181 | struct closure *cl = bio->bi_private; | 181 | struct closure *cl = bio->bi_private; |
182 | closure_put(cl); | 182 | closure_put(cl); |
183 | } | 183 | } |
184 | 184 | ||
185 | void bch_data_verify(struct search *s) | 185 | void bch_data_verify(struct search *s) |
186 | { | 186 | { |
187 | char name[BDEVNAME_SIZE]; | 187 | char name[BDEVNAME_SIZE]; |
188 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | 188 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); |
189 | struct closure *cl = &s->cl; | 189 | struct closure *cl = &s->cl; |
190 | struct bio *check; | 190 | struct bio *check; |
191 | struct bio_vec *bv; | 191 | struct bio_vec *bv; |
192 | int i; | 192 | int i; |
193 | 193 | ||
194 | if (!s->unaligned_bvec) | 194 | if (!s->unaligned_bvec) |
195 | bio_for_each_segment(bv, s->orig_bio, i) | 195 | bio_for_each_segment(bv, s->orig_bio, i) |
196 | bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; | 196 | bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; |
197 | 197 | ||
198 | check = bio_clone(s->orig_bio, GFP_NOIO); | 198 | check = bio_clone(s->orig_bio, GFP_NOIO); |
199 | if (!check) | 199 | if (!check) |
200 | return; | 200 | return; |
201 | 201 | ||
202 | if (bch_bio_alloc_pages(check, GFP_NOIO)) | 202 | if (bch_bio_alloc_pages(check, GFP_NOIO)) |
203 | goto out_put; | 203 | goto out_put; |
204 | 204 | ||
205 | check->bi_rw = READ_SYNC; | 205 | check->bi_rw = READ_SYNC; |
206 | check->bi_private = cl; | 206 | check->bi_private = cl; |
207 | check->bi_end_io = data_verify_endio; | 207 | check->bi_end_io = data_verify_endio; |
208 | 208 | ||
209 | closure_bio_submit(check, cl, &dc->disk); | 209 | closure_bio_submit(check, cl, &dc->disk); |
210 | closure_sync(cl); | 210 | closure_sync(cl); |
211 | 211 | ||
212 | bio_for_each_segment(bv, s->orig_bio, i) { | 212 | bio_for_each_segment(bv, s->orig_bio, i) { |
213 | void *p1 = kmap(bv->bv_page); | 213 | void *p1 = kmap(bv->bv_page); |
214 | void *p2 = kmap(check->bi_io_vec[i].bv_page); | 214 | void *p2 = kmap(check->bi_io_vec[i].bv_page); |
215 | 215 | ||
216 | if (memcmp(p1 + bv->bv_offset, | 216 | if (memcmp(p1 + bv->bv_offset, |
217 | p2 + bv->bv_offset, | 217 | p2 + bv->bv_offset, |
218 | bv->bv_len)) | 218 | bv->bv_len)) |
219 | printk(KERN_ERR | 219 | printk(KERN_ERR |
220 | "bcache (%s): verify failed at sector %llu\n", | 220 | "bcache (%s): verify failed at sector %llu\n", |
221 | bdevname(dc->bdev, name), | 221 | bdevname(dc->bdev, name), |
222 | (uint64_t) s->orig_bio->bi_sector); | 222 | (uint64_t) s->orig_bio->bi_sector); |
223 | 223 | ||
224 | kunmap(bv->bv_page); | 224 | kunmap(bv->bv_page); |
225 | kunmap(check->bi_io_vec[i].bv_page); | 225 | kunmap(check->bi_io_vec[i].bv_page); |
226 | } | 226 | } |
227 | 227 | ||
228 | __bio_for_each_segment(bv, check, i, 0) | 228 | __bio_for_each_segment(bv, check, i, 0) |
229 | __free_page(bv->bv_page); | 229 | __free_page(bv->bv_page); |
230 | out_put: | 230 | out_put: |
231 | bio_put(check); | 231 | bio_put(check); |
232 | } | 232 | } |
233 | 233 | ||
234 | #endif | 234 | #endif |
235 | 235 | ||
236 | #ifdef CONFIG_BCACHE_EDEBUG | 236 | #ifdef CONFIG_BCACHE_EDEBUG |
237 | 237 | ||
238 | unsigned bch_count_data(struct btree *b) | 238 | unsigned bch_count_data(struct btree *b) |
239 | { | 239 | { |
240 | unsigned ret = 0; | 240 | unsigned ret = 0; |
241 | struct btree_iter iter; | 241 | struct btree_iter iter; |
242 | struct bkey *k; | 242 | struct bkey *k; |
243 | 243 | ||
244 | if (!b->level) | 244 | if (!b->level) |
245 | for_each_key(b, k, &iter) | 245 | for_each_key(b, k, &iter) |
246 | ret += KEY_SIZE(k); | 246 | ret += KEY_SIZE(k); |
247 | return ret; | 247 | return ret; |
248 | } | 248 | } |
249 | 249 | ||
250 | static void vdump_bucket_and_panic(struct btree *b, const char *fmt, | 250 | static void vdump_bucket_and_panic(struct btree *b, const char *fmt, |
251 | va_list args) | 251 | va_list args) |
252 | { | 252 | { |
253 | unsigned i; | 253 | unsigned i; |
254 | char buf[80]; | 254 | char buf[80]; |
255 | 255 | ||
256 | console_lock(); | 256 | console_lock(); |
257 | 257 | ||
258 | for (i = 0; i <= b->nsets; i++) | 258 | for (i = 0; i <= b->nsets; i++) |
259 | dump_bset(b, b->sets[i].data); | 259 | dump_bset(b, b->sets[i].data); |
260 | 260 | ||
261 | vprintk(fmt, args); | 261 | vprintk(fmt, args); |
262 | 262 | ||
263 | console_unlock(); | 263 | console_unlock(); |
264 | 264 | ||
265 | bch_btree_to_text(buf, sizeof(buf), b); | 265 | bch_btree_to_text(buf, sizeof(buf), b); |
266 | panic("at %s\n", buf); | 266 | panic("at %s\n", buf); |
267 | } | 267 | } |
268 | 268 | ||
269 | void bch_check_key_order_msg(struct btree *b, struct bset *i, | 269 | void bch_check_key_order_msg(struct btree *b, struct bset *i, |
270 | const char *fmt, ...) | 270 | const char *fmt, ...) |
271 | { | 271 | { |
272 | struct bkey *k; | 272 | struct bkey *k; |
273 | 273 | ||
274 | if (!i->keys) | 274 | if (!i->keys) |
275 | return; | 275 | return; |
276 | 276 | ||
277 | for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k)) | 277 | for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k)) |
278 | if (skipped_backwards(b, k)) { | 278 | if (skipped_backwards(b, k)) { |
279 | va_list args; | 279 | va_list args; |
280 | va_start(args, fmt); | 280 | va_start(args, fmt); |
281 | 281 | ||
282 | vdump_bucket_and_panic(b, fmt, args); | 282 | vdump_bucket_and_panic(b, fmt, args); |
283 | va_end(args); | 283 | va_end(args); |
284 | } | 284 | } |
285 | } | 285 | } |
286 | 286 | ||
287 | void bch_check_keys(struct btree *b, const char *fmt, ...) | 287 | void bch_check_keys(struct btree *b, const char *fmt, ...) |
288 | { | 288 | { |
289 | va_list args; | 289 | va_list args; |
290 | struct bkey *k, *p = NULL; | 290 | struct bkey *k, *p = NULL; |
291 | struct btree_iter iter; | 291 | struct btree_iter iter; |
292 | 292 | ||
293 | if (b->level) | 293 | if (b->level) |
294 | return; | 294 | return; |
295 | 295 | ||
296 | for_each_key(b, k, &iter) { | 296 | for_each_key(b, k, &iter) { |
297 | if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) { | 297 | if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) { |
298 | printk(KERN_ERR "Keys out of order:\n"); | 298 | printk(KERN_ERR "Keys out of order:\n"); |
299 | goto bug; | 299 | goto bug; |
300 | } | 300 | } |
301 | 301 | ||
302 | if (bch_ptr_invalid(b, k)) | 302 | if (bch_ptr_invalid(b, k)) |
303 | continue; | 303 | continue; |
304 | 304 | ||
305 | if (p && bkey_cmp(p, &START_KEY(k)) > 0) { | 305 | if (p && bkey_cmp(p, &START_KEY(k)) > 0) { |
306 | printk(KERN_ERR "Overlapping keys:\n"); | 306 | printk(KERN_ERR "Overlapping keys:\n"); |
307 | goto bug; | 307 | goto bug; |
308 | } | 308 | } |
309 | p = k; | 309 | p = k; |
310 | } | 310 | } |
311 | return; | 311 | return; |
312 | bug: | 312 | bug: |
313 | va_start(args, fmt); | 313 | va_start(args, fmt); |
314 | vdump_bucket_and_panic(b, fmt, args); | 314 | vdump_bucket_and_panic(b, fmt, args); |
315 | va_end(args); | 315 | va_end(args); |
316 | } | 316 | } |
317 | 317 | ||
318 | #endif | 318 | #endif |
319 | 319 | ||
320 | #ifdef CONFIG_DEBUG_FS | 320 | #ifdef CONFIG_DEBUG_FS |
321 | 321 | ||
322 | /* XXX: cache set refcounting */ | 322 | /* XXX: cache set refcounting */ |
323 | 323 | ||
324 | struct dump_iterator { | 324 | struct dump_iterator { |
325 | char buf[PAGE_SIZE]; | 325 | char buf[PAGE_SIZE]; |
326 | size_t bytes; | 326 | size_t bytes; |
327 | struct cache_set *c; | 327 | struct cache_set *c; |
328 | struct keybuf keys; | 328 | struct keybuf keys; |
329 | }; | 329 | }; |
330 | 330 | ||
331 | static bool dump_pred(struct keybuf *buf, struct bkey *k) | 331 | static bool dump_pred(struct keybuf *buf, struct bkey *k) |
332 | { | 332 | { |
333 | return true; | 333 | return true; |
334 | } | 334 | } |
335 | 335 | ||
336 | static ssize_t bch_dump_read(struct file *file, char __user *buf, | 336 | static ssize_t bch_dump_read(struct file *file, char __user *buf, |
337 | size_t size, loff_t *ppos) | 337 | size_t size, loff_t *ppos) |
338 | { | 338 | { |
339 | struct dump_iterator *i = file->private_data; | 339 | struct dump_iterator *i = file->private_data; |
340 | ssize_t ret = 0; | 340 | ssize_t ret = 0; |
341 | char kbuf[80]; | 341 | char kbuf[80]; |
342 | 342 | ||
343 | while (size) { | 343 | while (size) { |
344 | struct keybuf_key *w; | 344 | struct keybuf_key *w; |
345 | unsigned bytes = min(i->bytes, size); | 345 | unsigned bytes = min(i->bytes, size); |
346 | 346 | ||
347 | int err = copy_to_user(buf, i->buf, bytes); | 347 | int err = copy_to_user(buf, i->buf, bytes); |
348 | if (err) | 348 | if (err) |
349 | return err; | 349 | return err; |
350 | 350 | ||
351 | ret += bytes; | 351 | ret += bytes; |
352 | buf += bytes; | 352 | buf += bytes; |
353 | size -= bytes; | 353 | size -= bytes; |
354 | i->bytes -= bytes; | 354 | i->bytes -= bytes; |
355 | memmove(i->buf, i->buf + bytes, i->bytes); | 355 | memmove(i->buf, i->buf + bytes, i->bytes); |
356 | 356 | ||
357 | if (i->bytes) | 357 | if (i->bytes) |
358 | break; | 358 | break; |
359 | 359 | ||
360 | w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY); | 360 | w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred); |
361 | if (!w) | 361 | if (!w) |
362 | break; | 362 | break; |
363 | 363 | ||
364 | bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key); | 364 | bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key); |
365 | i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); | 365 | i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); |
366 | bch_keybuf_del(&i->keys, w); | 366 | bch_keybuf_del(&i->keys, w); |
367 | } | 367 | } |
368 | 368 | ||
369 | return ret; | 369 | return ret; |
370 | } | 370 | } |
371 | 371 | ||
372 | static int bch_dump_open(struct inode *inode, struct file *file) | 372 | static int bch_dump_open(struct inode *inode, struct file *file) |
373 | { | 373 | { |
374 | struct cache_set *c = inode->i_private; | 374 | struct cache_set *c = inode->i_private; |
375 | struct dump_iterator *i; | 375 | struct dump_iterator *i; |
376 | 376 | ||
377 | i = kzalloc(sizeof(struct dump_iterator), GFP_KERNEL); | 377 | i = kzalloc(sizeof(struct dump_iterator), GFP_KERNEL); |
378 | if (!i) | 378 | if (!i) |
379 | return -ENOMEM; | 379 | return -ENOMEM; |
380 | 380 | ||
381 | file->private_data = i; | 381 | file->private_data = i; |
382 | i->c = c; | 382 | i->c = c; |
383 | bch_keybuf_init(&i->keys, dump_pred); | 383 | bch_keybuf_init(&i->keys); |
384 | i->keys.last_scanned = KEY(0, 0, 0); | 384 | i->keys.last_scanned = KEY(0, 0, 0); |
385 | 385 | ||
386 | return 0; | 386 | return 0; |
387 | } | 387 | } |
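The hunks above move the filter predicate out of bch_keybuf_init() and into bch_keybuf_next_rescan(): the keybuf no longer remembers its predicate, each rescan supplies one. Below is a minimal userspace sketch of that calling convention only; the toy_* names and structures are made-up stand-ins, not the bcache API.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_key { unsigned inode; unsigned long long offset; };
struct toy_keybuf { struct toy_key last_scanned; };

typedef bool (*toy_pred_fn)(struct toy_keybuf *, const struct toy_key *);

/* Like dump_pred(): accept every key. */
static bool accept_all(struct toy_keybuf *buf, const struct toy_key *k)
{
	(void) buf;
	(void) k;
	return true;
}

/* The predicate is a per-call argument, so one keybuf can serve scans with
 * different filters instead of being bound to one filter at init time. */
static const struct toy_key *toy_next_rescan(struct toy_keybuf *buf,
					     const struct toy_key *keys,
					     size_t nr, toy_pred_fn pred)
{
	for (size_t i = 0; i < nr; i++)
		if (keys[i].offset >= buf->last_scanned.offset &&
		    pred(buf, &keys[i])) {
			buf->last_scanned = keys[i];
			buf->last_scanned.offset++;	/* resume after this key */
			return &keys[i];
		}
	return NULL;
}

int main(void)
{
	const struct toy_key keys[] = { { 1, 8 }, { 1, 16 }, { 2, 24 } };
	struct toy_keybuf buf = { .last_scanned = { 0, 0 } };
	const struct toy_key *k;

	while ((k = toy_next_rescan(&buf, keys, 3, accept_all)))
		printf("%u:%llu\n", k->inode, k->offset);
	return 0;
}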
388 | 388 | ||
389 | static int bch_dump_release(struct inode *inode, struct file *file) | 389 | static int bch_dump_release(struct inode *inode, struct file *file) |
390 | { | 390 | { |
391 | kfree(file->private_data); | 391 | kfree(file->private_data); |
392 | return 0; | 392 | return 0; |
393 | } | 393 | } |
394 | 394 | ||
395 | static const struct file_operations cache_set_debug_ops = { | 395 | static const struct file_operations cache_set_debug_ops = { |
396 | .owner = THIS_MODULE, | 396 | .owner = THIS_MODULE, |
397 | .open = bch_dump_open, | 397 | .open = bch_dump_open, |
398 | .read = bch_dump_read, | 398 | .read = bch_dump_read, |
399 | .release = bch_dump_release | 399 | .release = bch_dump_release |
400 | }; | 400 | }; |
401 | 401 | ||
402 | void bch_debug_init_cache_set(struct cache_set *c) | 402 | void bch_debug_init_cache_set(struct cache_set *c) |
403 | { | 403 | { |
404 | if (!IS_ERR_OR_NULL(debug)) { | 404 | if (!IS_ERR_OR_NULL(debug)) { |
405 | char name[50]; | 405 | char name[50]; |
406 | snprintf(name, 50, "bcache-%pU", c->sb.set_uuid); | 406 | snprintf(name, 50, "bcache-%pU", c->sb.set_uuid); |
407 | 407 | ||
408 | c->debug = debugfs_create_file(name, 0400, debug, c, | 408 | c->debug = debugfs_create_file(name, 0400, debug, c, |
409 | &cache_set_debug_ops); | 409 | &cache_set_debug_ops); |
410 | } | 410 | } |
411 | } | 411 | } |
412 | 412 | ||
413 | #endif | 413 | #endif |
414 | 414 | ||
415 | /* Fuzz tester has rotted: */ | 415 | /* Fuzz tester has rotted: */ |
416 | #if 0 | 416 | #if 0 |
417 | 417 | ||
418 | static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a, | 418 | static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a, |
419 | const char *buffer, size_t size) | 419 | const char *buffer, size_t size) |
420 | { | 420 | { |
421 | void dump(struct btree *b) | 421 | void dump(struct btree *b) |
422 | { | 422 | { |
423 | struct bset *i; | 423 | struct bset *i; |
424 | 424 | ||
425 | for (i = b->sets[0].data; | 425 | for (i = b->sets[0].data; |
426 | index(i, b) < btree_blocks(b) && | 426 | index(i, b) < btree_blocks(b) && |
427 | i->seq == b->sets[0].data->seq; | 427 | i->seq == b->sets[0].data->seq; |
428 | i = ((void *) i) + set_blocks(i, b->c) * block_bytes(b->c)) | 428 | i = ((void *) i) + set_blocks(i, b->c) * block_bytes(b->c)) |
429 | dump_bset(b, i); | 429 | dump_bset(b, i); |
430 | } | 430 | } |
431 | 431 | ||
432 | struct cache_sb *sb; | 432 | struct cache_sb *sb; |
433 | struct cache_set *c; | 433 | struct cache_set *c; |
434 | struct btree *all[3], *b, *fill, *orig; | 434 | struct btree *all[3], *b, *fill, *orig; |
435 | int j; | 435 | int j; |
436 | 436 | ||
437 | struct btree_op op; | 437 | struct btree_op op; |
438 | bch_btree_op_init_stack(&op); | 438 | bch_btree_op_init_stack(&op); |
439 | 439 | ||
440 | sb = kzalloc(sizeof(struct cache_sb), GFP_KERNEL); | 440 | sb = kzalloc(sizeof(struct cache_sb), GFP_KERNEL); |
441 | if (!sb) | 441 | if (!sb) |
442 | return -ENOMEM; | 442 | return -ENOMEM; |
443 | 443 | ||
444 | sb->bucket_size = 128; | 444 | sb->bucket_size = 128; |
445 | sb->block_size = 4; | 445 | sb->block_size = 4; |
446 | 446 | ||
447 | c = bch_cache_set_alloc(sb); | 447 | c = bch_cache_set_alloc(sb); |
448 | if (!c) | 448 | if (!c) |
449 | return -ENOMEM; | 449 | return -ENOMEM; |
450 | 450 | ||
451 | for (j = 0; j < 3; j++) { | 451 | for (j = 0; j < 3; j++) { |
452 | BUG_ON(list_empty(&c->btree_cache)); | 452 | BUG_ON(list_empty(&c->btree_cache)); |
453 | all[j] = list_first_entry(&c->btree_cache, struct btree, list); | 453 | all[j] = list_first_entry(&c->btree_cache, struct btree, list); |
454 | list_del_init(&all[j]->list); | 454 | list_del_init(&all[j]->list); |
455 | 455 | ||
456 | all[j]->key = KEY(0, 0, c->sb.bucket_size); | 456 | all[j]->key = KEY(0, 0, c->sb.bucket_size); |
457 | bkey_copy_key(&all[j]->key, &MAX_KEY); | 457 | bkey_copy_key(&all[j]->key, &MAX_KEY); |
458 | } | 458 | } |
459 | 459 | ||
460 | b = all[0]; | 460 | b = all[0]; |
461 | fill = all[1]; | 461 | fill = all[1]; |
462 | orig = all[2]; | 462 | orig = all[2]; |
463 | 463 | ||
464 | while (1) { | 464 | while (1) { |
465 | for (j = 0; j < 3; j++) | 465 | for (j = 0; j < 3; j++) |
466 | all[j]->written = all[j]->nsets = 0; | 466 | all[j]->written = all[j]->nsets = 0; |
467 | 467 | ||
468 | bch_bset_init_next(b); | 468 | bch_bset_init_next(b); |
469 | 469 | ||
470 | while (1) { | 470 | while (1) { |
471 | struct bset *i = write_block(b); | 471 | struct bset *i = write_block(b); |
472 | struct bkey *k = op.keys.top; | 472 | struct bkey *k = op.keys.top; |
473 | unsigned rand; | 473 | unsigned rand; |
474 | 474 | ||
475 | bkey_init(k); | 475 | bkey_init(k); |
476 | rand = get_random_int(); | 476 | rand = get_random_int(); |
477 | 477 | ||
478 | op.type = rand & 1 | 478 | op.type = rand & 1 |
479 | ? BTREE_INSERT | 479 | ? BTREE_INSERT |
480 | : BTREE_REPLACE; | 480 | : BTREE_REPLACE; |
481 | rand >>= 1; | 481 | rand >>= 1; |
482 | 482 | ||
483 | SET_KEY_SIZE(k, bucket_remainder(c, rand)); | 483 | SET_KEY_SIZE(k, bucket_remainder(c, rand)); |
484 | rand >>= c->bucket_bits; | 484 | rand >>= c->bucket_bits; |
485 | rand &= 1024 * 512 - 1; | 485 | rand &= 1024 * 512 - 1; |
486 | rand += c->sb.bucket_size; | 486 | rand += c->sb.bucket_size; |
487 | SET_KEY_OFFSET(k, rand); | 487 | SET_KEY_OFFSET(k, rand); |
488 | #if 0 | 488 | #if 0 |
489 | SET_KEY_PTRS(k, 1); | 489 | SET_KEY_PTRS(k, 1); |
490 | #endif | 490 | #endif |
491 | bch_keylist_push(&op.keys); | 491 | bch_keylist_push(&op.keys); |
492 | bch_btree_insert_keys(b, &op); | 492 | bch_btree_insert_keys(b, &op); |
493 | 493 | ||
494 | if (should_split(b) || | 494 | if (should_split(b) || |
495 | set_blocks(i, b->c) != | 495 | set_blocks(i, b->c) != |
496 | __set_blocks(i, i->keys + 15, b->c)) { | 496 | __set_blocks(i, i->keys + 15, b->c)) { |
497 | i->csum = csum_set(i); | 497 | i->csum = csum_set(i); |
498 | 498 | ||
499 | memcpy(write_block(fill), | 499 | memcpy(write_block(fill), |
500 | i, set_bytes(i)); | 500 | i, set_bytes(i)); |
501 | 501 | ||
502 | b->written += set_blocks(i, b->c); | 502 | b->written += set_blocks(i, b->c); |
503 | fill->written = b->written; | 503 | fill->written = b->written; |
504 | if (b->written == btree_blocks(b)) | 504 | if (b->written == btree_blocks(b)) |
505 | break; | 505 | break; |
506 | 506 | ||
507 | bch_btree_sort_lazy(b); | 507 | bch_btree_sort_lazy(b); |
508 | bch_bset_init_next(b); | 508 | bch_bset_init_next(b); |
509 | } | 509 | } |
510 | } | 510 | } |
511 | 511 | ||
512 | memcpy(orig->sets[0].data, | 512 | memcpy(orig->sets[0].data, |
513 | fill->sets[0].data, | 513 | fill->sets[0].data, |
514 | btree_bytes(c)); | 514 | btree_bytes(c)); |
515 | 515 | ||
516 | bch_btree_sort(b); | 516 | bch_btree_sort(b); |
517 | fill->written = 0; | 517 | fill->written = 0; |
518 | bch_btree_node_read_done(fill); | 518 | bch_btree_node_read_done(fill); |
519 | 519 | ||
520 | if (b->sets[0].data->keys != fill->sets[0].data->keys || | 520 | if (b->sets[0].data->keys != fill->sets[0].data->keys || |
521 | memcmp(b->sets[0].data->start, | 521 | memcmp(b->sets[0].data->start, |
522 | fill->sets[0].data->start, | 522 | fill->sets[0].data->start, |
523 | b->sets[0].data->keys * sizeof(uint64_t))) { | 523 | b->sets[0].data->keys * sizeof(uint64_t))) { |
524 | struct bset *i = b->sets[0].data; | 524 | struct bset *i = b->sets[0].data; |
525 | struct bkey *k, *l; | 525 | struct bkey *k, *l; |
526 | 526 | ||
527 | for (k = i->start, | 527 | for (k = i->start, |
528 | l = fill->sets[0].data->start; | 528 | l = fill->sets[0].data->start; |
529 | k < end(i); | 529 | k < end(i); |
530 | k = bkey_next(k), l = bkey_next(l)) | 530 | k = bkey_next(k), l = bkey_next(l)) |
531 | if (bkey_cmp(k, l) || | 531 | if (bkey_cmp(k, l) || |
532 | KEY_SIZE(k) != KEY_SIZE(l)) { | 532 | KEY_SIZE(k) != KEY_SIZE(l)) { |
533 | char buf1[80]; | 533 | char buf1[80]; |
534 | char buf2[80]; | 534 | char buf2[80]; |
535 | 535 | ||
536 | bch_bkey_to_text(buf1, sizeof(buf1), k); | 536 | bch_bkey_to_text(buf1, sizeof(buf1), k); |
537 | bch_bkey_to_text(buf2, sizeof(buf2), l); | 537 | bch_bkey_to_text(buf2, sizeof(buf2), l); |
538 | 538 | ||
539 | pr_err("key %zi differs: %s != %s", | 539 | pr_err("key %zi differs: %s != %s", |
540 | (uint64_t *) k - i->d, | 540 | (uint64_t *) k - i->d, |
541 | buf1, buf2); | 541 | buf1, buf2); |
542 | } | 542 | } |
543 | 543 | ||
544 | for (j = 0; j < 3; j++) { | 544 | for (j = 0; j < 3; j++) { |
545 | pr_err("**** Set %i ****", j); | 545 | pr_err("**** Set %i ****", j); |
546 | dump(all[j]); | 546 | dump(all[j]); |
547 | } | 547 | } |
548 | panic("\n"); | 548 | panic("\n"); |
549 | } | 549 | } |
550 | 550 | ||
551 | pr_info("fuzz complete: %i keys", b->sets[0].data->keys); | 551 | pr_info("fuzz complete: %i keys", b->sets[0].data->keys); |
552 | } | 552 | } |
553 | } | 553 | } |
554 | 554 | ||
555 | kobj_attribute_write(fuzz, btree_fuzz); | 555 | kobj_attribute_write(fuzz, btree_fuzz); |
556 | #endif | 556 | #endif |
557 | 557 | ||
558 | void bch_debug_exit(void) | 558 | void bch_debug_exit(void) |
559 | { | 559 | { |
560 | if (!IS_ERR_OR_NULL(debug)) | 560 | if (!IS_ERR_OR_NULL(debug)) |
561 | debugfs_remove_recursive(debug); | 561 | debugfs_remove_recursive(debug); |
562 | } | 562 | } |
563 | 563 | ||
564 | int __init bch_debug_init(struct kobject *kobj) | 564 | int __init bch_debug_init(struct kobject *kobj) |
565 | { | 565 | { |
566 | int ret = 0; | 566 | int ret = 0; |
567 | #if 0 | 567 | #if 0 |
568 | ret = sysfs_create_file(kobj, &ksysfs_fuzz.attr); | 568 | ret = sysfs_create_file(kobj, &ksysfs_fuzz.attr); |
569 | if (ret) | 569 | if (ret) |
570 | return ret; | 570 | return ret; |
571 | #endif | 571 | #endif |
572 | 572 | ||
573 | debug = debugfs_create_dir("bcache", NULL); | 573 | debug = debugfs_create_dir("bcache", NULL); |
574 | return ret; | 574 | return ret; |
575 | } | 575 | } |
576 | 576 |
drivers/md/bcache/movinggc.c
1 | /* | 1 | /* |
2 | * Moving/copying garbage collector | 2 | * Moving/copying garbage collector |
3 | * | 3 | * |
4 | * Copyright 2012 Google, Inc. | 4 | * Copyright 2012 Google, Inc. |
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include "bcache.h" | 7 | #include "bcache.h" |
8 | #include "btree.h" | 8 | #include "btree.h" |
9 | #include "debug.h" | 9 | #include "debug.h" |
10 | #include "request.h" | 10 | #include "request.h" |
11 | 11 | ||
12 | #include <trace/events/bcache.h> | 12 | #include <trace/events/bcache.h> |
13 | 13 | ||
14 | struct moving_io { | 14 | struct moving_io { |
15 | struct keybuf_key *w; | 15 | struct keybuf_key *w; |
16 | struct search s; | 16 | struct search s; |
17 | struct bbio bio; | 17 | struct bbio bio; |
18 | }; | 18 | }; |
19 | 19 | ||
20 | static bool moving_pred(struct keybuf *buf, struct bkey *k) | 20 | static bool moving_pred(struct keybuf *buf, struct bkey *k) |
21 | { | 21 | { |
22 | struct cache_set *c = container_of(buf, struct cache_set, | 22 | struct cache_set *c = container_of(buf, struct cache_set, |
23 | moving_gc_keys); | 23 | moving_gc_keys); |
24 | unsigned i; | 24 | unsigned i; |
25 | 25 | ||
26 | for (i = 0; i < KEY_PTRS(k); i++) { | 26 | for (i = 0; i < KEY_PTRS(k); i++) { |
27 | struct cache *ca = PTR_CACHE(c, k, i); | 27 | struct cache *ca = PTR_CACHE(c, k, i); |
28 | struct bucket *g = PTR_BUCKET(c, k, i); | 28 | struct bucket *g = PTR_BUCKET(c, k, i); |
29 | 29 | ||
30 | if (GC_SECTORS_USED(g) < ca->gc_move_threshold) | 30 | if (GC_SECTORS_USED(g) < ca->gc_move_threshold) |
31 | return true; | 31 | return true; |
32 | } | 32 | } |
33 | 33 | ||
34 | return false; | 34 | return false; |
35 | } | 35 | } |
36 | 36 | ||
37 | /* Moving GC - IO loop */ | 37 | /* Moving GC - IO loop */ |
38 | 38 | ||
39 | static void moving_io_destructor(struct closure *cl) | 39 | static void moving_io_destructor(struct closure *cl) |
40 | { | 40 | { |
41 | struct moving_io *io = container_of(cl, struct moving_io, s.cl); | 41 | struct moving_io *io = container_of(cl, struct moving_io, s.cl); |
42 | kfree(io); | 42 | kfree(io); |
43 | } | 43 | } |
44 | 44 | ||
45 | static void write_moving_finish(struct closure *cl) | 45 | static void write_moving_finish(struct closure *cl) |
46 | { | 46 | { |
47 | struct moving_io *io = container_of(cl, struct moving_io, s.cl); | 47 | struct moving_io *io = container_of(cl, struct moving_io, s.cl); |
48 | struct bio *bio = &io->bio.bio; | 48 | struct bio *bio = &io->bio.bio; |
49 | struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt); | 49 | struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt); |
50 | 50 | ||
51 | while (bv-- != bio->bi_io_vec) | 51 | while (bv-- != bio->bi_io_vec) |
52 | __free_page(bv->bv_page); | 52 | __free_page(bv->bv_page); |
53 | 53 | ||
54 | if (io->s.op.insert_collision) | 54 | if (io->s.op.insert_collision) |
55 | trace_bcache_gc_copy_collision(&io->w->key); | 55 | trace_bcache_gc_copy_collision(&io->w->key); |
56 | 56 | ||
57 | bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); | 57 | bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); |
58 | 58 | ||
59 | atomic_dec_bug(&io->s.op.c->in_flight); | 59 | atomic_dec_bug(&io->s.op.c->in_flight); |
60 | closure_wake_up(&io->s.op.c->moving_gc_wait); | 60 | closure_wake_up(&io->s.op.c->moving_gc_wait); |
61 | 61 | ||
62 | closure_return_with_destructor(cl, moving_io_destructor); | 62 | closure_return_with_destructor(cl, moving_io_destructor); |
63 | } | 63 | } |
64 | 64 | ||
65 | static void read_moving_endio(struct bio *bio, int error) | 65 | static void read_moving_endio(struct bio *bio, int error) |
66 | { | 66 | { |
67 | struct moving_io *io = container_of(bio->bi_private, | 67 | struct moving_io *io = container_of(bio->bi_private, |
68 | struct moving_io, s.cl); | 68 | struct moving_io, s.cl); |
69 | 69 | ||
70 | if (error) | 70 | if (error) |
71 | io->s.error = error; | 71 | io->s.error = error; |
72 | 72 | ||
73 | bch_bbio_endio(io->s.op.c, bio, error, "reading data to move"); | 73 | bch_bbio_endio(io->s.op.c, bio, error, "reading data to move"); |
74 | } | 74 | } |
75 | 75 | ||
76 | static void moving_init(struct moving_io *io) | 76 | static void moving_init(struct moving_io *io) |
77 | { | 77 | { |
78 | struct bio *bio = &io->bio.bio; | 78 | struct bio *bio = &io->bio.bio; |
79 | 79 | ||
80 | bio_init(bio); | 80 | bio_init(bio); |
81 | bio_get(bio); | 81 | bio_get(bio); |
82 | bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); | 82 | bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); |
83 | 83 | ||
84 | bio->bi_size = KEY_SIZE(&io->w->key) << 9; | 84 | bio->bi_size = KEY_SIZE(&io->w->key) << 9; |
85 | bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), | 85 | bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), |
86 | PAGE_SECTORS); | 86 | PAGE_SECTORS); |
87 | bio->bi_private = &io->s.cl; | 87 | bio->bi_private = &io->s.cl; |
88 | bio->bi_io_vec = bio->bi_inline_vecs; | 88 | bio->bi_io_vec = bio->bi_inline_vecs; |
89 | bch_bio_map(bio, NULL); | 89 | bch_bio_map(bio, NULL); |
90 | } | 90 | } |
91 | 91 | ||
92 | static void write_moving(struct closure *cl) | 92 | static void write_moving(struct closure *cl) |
93 | { | 93 | { |
94 | struct search *s = container_of(cl, struct search, cl); | 94 | struct search *s = container_of(cl, struct search, cl); |
95 | struct moving_io *io = container_of(s, struct moving_io, s); | 95 | struct moving_io *io = container_of(s, struct moving_io, s); |
96 | 96 | ||
97 | if (!s->error) { | 97 | if (!s->error) { |
98 | moving_init(io); | 98 | moving_init(io); |
99 | 99 | ||
100 | io->bio.bio.bi_sector = KEY_START(&io->w->key); | 100 | io->bio.bio.bi_sector = KEY_START(&io->w->key); |
101 | s->op.lock = -1; | 101 | s->op.lock = -1; |
102 | s->op.write_prio = 1; | 102 | s->op.write_prio = 1; |
103 | s->op.cache_bio = &io->bio.bio; | 103 | s->op.cache_bio = &io->bio.bio; |
104 | 104 | ||
105 | s->writeback = KEY_DIRTY(&io->w->key); | 105 | s->writeback = KEY_DIRTY(&io->w->key); |
106 | s->op.csum = KEY_CSUM(&io->w->key); | 106 | s->op.csum = KEY_CSUM(&io->w->key); |
107 | 107 | ||
108 | s->op.type = BTREE_REPLACE; | 108 | s->op.type = BTREE_REPLACE; |
109 | bkey_copy(&s->op.replace, &io->w->key); | 109 | bkey_copy(&s->op.replace, &io->w->key); |
110 | 110 | ||
111 | closure_init(&s->op.cl, cl); | 111 | closure_init(&s->op.cl, cl); |
112 | bch_insert_data(&s->op.cl); | 112 | bch_insert_data(&s->op.cl); |
113 | } | 113 | } |
114 | 114 | ||
115 | continue_at(cl, write_moving_finish, NULL); | 115 | continue_at(cl, write_moving_finish, NULL); |
116 | } | 116 | } |
117 | 117 | ||
118 | static void read_moving_submit(struct closure *cl) | 118 | static void read_moving_submit(struct closure *cl) |
119 | { | 119 | { |
120 | struct search *s = container_of(cl, struct search, cl); | 120 | struct search *s = container_of(cl, struct search, cl); |
121 | struct moving_io *io = container_of(s, struct moving_io, s); | 121 | struct moving_io *io = container_of(s, struct moving_io, s); |
122 | struct bio *bio = &io->bio.bio; | 122 | struct bio *bio = &io->bio.bio; |
123 | 123 | ||
124 | bch_submit_bbio(bio, s->op.c, &io->w->key, 0); | 124 | bch_submit_bbio(bio, s->op.c, &io->w->key, 0); |
125 | 125 | ||
126 | continue_at(cl, write_moving, bch_gc_wq); | 126 | continue_at(cl, write_moving, bch_gc_wq); |
127 | } | 127 | } |
128 | 128 | ||
129 | static void read_moving(struct closure *cl) | 129 | static void read_moving(struct closure *cl) |
130 | { | 130 | { |
131 | struct cache_set *c = container_of(cl, struct cache_set, moving_gc); | 131 | struct cache_set *c = container_of(cl, struct cache_set, moving_gc); |
132 | struct keybuf_key *w; | 132 | struct keybuf_key *w; |
133 | struct moving_io *io; | 133 | struct moving_io *io; |
134 | struct bio *bio; | 134 | struct bio *bio; |
135 | 135 | ||
136 | /* XXX: if we error, background writeback could stall indefinitely */ | 136 | /* XXX: if we error, background writeback could stall indefinitely */ |
137 | 137 | ||
138 | while (!test_bit(CACHE_SET_STOPPING, &c->flags)) { | 138 | while (!test_bit(CACHE_SET_STOPPING, &c->flags)) { |
139 | w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY); | 139 | w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, |
140 | &MAX_KEY, moving_pred); | ||
140 | if (!w) | 141 | if (!w) |
141 | break; | 142 | break; |
142 | 143 | ||
143 | io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) | 144 | io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) |
144 | * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), | 145 | * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), |
145 | GFP_KERNEL); | 146 | GFP_KERNEL); |
146 | if (!io) | 147 | if (!io) |
147 | goto err; | 148 | goto err; |
148 | 149 | ||
149 | w->private = io; | 150 | w->private = io; |
150 | io->w = w; | 151 | io->w = w; |
151 | io->s.op.inode = KEY_INODE(&w->key); | 152 | io->s.op.inode = KEY_INODE(&w->key); |
152 | io->s.op.c = c; | 153 | io->s.op.c = c; |
153 | 154 | ||
154 | moving_init(io); | 155 | moving_init(io); |
155 | bio = &io->bio.bio; | 156 | bio = &io->bio.bio; |
156 | 157 | ||
157 | bio->bi_rw = READ; | 158 | bio->bi_rw = READ; |
158 | bio->bi_end_io = read_moving_endio; | 159 | bio->bi_end_io = read_moving_endio; |
159 | 160 | ||
160 | if (bch_bio_alloc_pages(bio, GFP_KERNEL)) | 161 | if (bch_bio_alloc_pages(bio, GFP_KERNEL)) |
161 | goto err; | 162 | goto err; |
162 | 163 | ||
163 | trace_bcache_gc_copy(&w->key); | 164 | trace_bcache_gc_copy(&w->key); |
164 | 165 | ||
165 | closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); | 166 | closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); |
166 | 167 | ||
167 | if (atomic_inc_return(&c->in_flight) >= 64) { | 168 | if (atomic_inc_return(&c->in_flight) >= 64) { |
168 | closure_wait_event(&c->moving_gc_wait, cl, | 169 | closure_wait_event(&c->moving_gc_wait, cl, |
169 | atomic_read(&c->in_flight) < 64); | 170 | atomic_read(&c->in_flight) < 64); |
170 | continue_at(cl, read_moving, bch_gc_wq); | 171 | continue_at(cl, read_moving, bch_gc_wq); |
171 | } | 172 | } |
172 | } | 173 | } |
173 | 174 | ||
174 | if (0) { | 175 | if (0) { |
175 | err: if (!IS_ERR_OR_NULL(w->private)) | 176 | err: if (!IS_ERR_OR_NULL(w->private)) |
176 | kfree(w->private); | 177 | kfree(w->private); |
177 | 178 | ||
178 | bch_keybuf_del(&c->moving_gc_keys, w); | 179 | bch_keybuf_del(&c->moving_gc_keys, w); |
179 | } | 180 | } |
180 | 181 | ||
181 | closure_return(cl); | 182 | closure_return(cl); |
182 | } | 183 | } |
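read_moving() above caps the number of copies in flight at 64 and sleeps on moving_gc_wait until completions bring the count back down. The following is a small pthread sketch of that throttling idea only; the real code uses atomics and bcache closures rather than a mutex and condition variable, and the names here are illustrative.

#include <pthread.h>
#include <stdio.h>

#define MAX_IN_FLIGHT 64

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  drained = PTHREAD_COND_INITIALIZER;
static int in_flight;

static void move_start(void)		/* before submitting one copy */
{
	pthread_mutex_lock(&lock);
	while (in_flight >= MAX_IN_FLIGHT)
		pthread_cond_wait(&drained, &lock);
	in_flight++;
	pthread_mutex_unlock(&lock);
}

static void move_finish(void)		/* from the copy's completion path */
{
	pthread_mutex_lock(&lock);
	in_flight--;
	pthread_cond_signal(&drained);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	/* Single-threaded demo: each move completes before the next starts,
	 * so the limit is never hit and move_start() never blocks. */
	for (int i = 0; i < 100; i++) {
		move_start();
		move_finish();
	}
	printf("done, in_flight=%d\n", in_flight);
	return 0;
}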
183 | 184 | ||
184 | static bool bucket_cmp(struct bucket *l, struct bucket *r) | 185 | static bool bucket_cmp(struct bucket *l, struct bucket *r) |
185 | { | 186 | { |
186 | return GC_SECTORS_USED(l) < GC_SECTORS_USED(r); | 187 | return GC_SECTORS_USED(l) < GC_SECTORS_USED(r); |
187 | } | 188 | } |
188 | 189 | ||
189 | static unsigned bucket_heap_top(struct cache *ca) | 190 | static unsigned bucket_heap_top(struct cache *ca) |
190 | { | 191 | { |
191 | return GC_SECTORS_USED(heap_peek(&ca->heap)); | 192 | return GC_SECTORS_USED(heap_peek(&ca->heap)); |
192 | } | 193 | } |
193 | 194 | ||
194 | void bch_moving_gc(struct closure *cl) | 195 | void bch_moving_gc(struct closure *cl) |
195 | { | 196 | { |
196 | struct cache_set *c = container_of(cl, struct cache_set, gc.cl); | 197 | struct cache_set *c = container_of(cl, struct cache_set, gc.cl); |
197 | struct cache *ca; | 198 | struct cache *ca; |
198 | struct bucket *b; | 199 | struct bucket *b; |
199 | unsigned i; | 200 | unsigned i; |
200 | 201 | ||
201 | if (!c->copy_gc_enabled) | 202 | if (!c->copy_gc_enabled) |
202 | closure_return(cl); | 203 | closure_return(cl); |
203 | 204 | ||
204 | mutex_lock(&c->bucket_lock); | 205 | mutex_lock(&c->bucket_lock); |
205 | 206 | ||
206 | for_each_cache(ca, c, i) { | 207 | for_each_cache(ca, c, i) { |
207 | unsigned sectors_to_move = 0; | 208 | unsigned sectors_to_move = 0; |
208 | unsigned reserve_sectors = ca->sb.bucket_size * | 209 | unsigned reserve_sectors = ca->sb.bucket_size * |
209 | min(fifo_used(&ca->free), ca->free.size / 2); | 210 | min(fifo_used(&ca->free), ca->free.size / 2); |
210 | 211 | ||
211 | ca->heap.used = 0; | 212 | ca->heap.used = 0; |
212 | 213 | ||
213 | for_each_bucket(b, ca) { | 214 | for_each_bucket(b, ca) { |
214 | if (!GC_SECTORS_USED(b)) | 215 | if (!GC_SECTORS_USED(b)) |
215 | continue; | 216 | continue; |
216 | 217 | ||
217 | if (!heap_full(&ca->heap)) { | 218 | if (!heap_full(&ca->heap)) { |
218 | sectors_to_move += GC_SECTORS_USED(b); | 219 | sectors_to_move += GC_SECTORS_USED(b); |
219 | heap_add(&ca->heap, b, bucket_cmp); | 220 | heap_add(&ca->heap, b, bucket_cmp); |
220 | } else if (bucket_cmp(b, heap_peek(&ca->heap))) { | 221 | } else if (bucket_cmp(b, heap_peek(&ca->heap))) { |
221 | sectors_to_move -= bucket_heap_top(ca); | 222 | sectors_to_move -= bucket_heap_top(ca); |
222 | sectors_to_move += GC_SECTORS_USED(b); | 223 | sectors_to_move += GC_SECTORS_USED(b); |
223 | 224 | ||
224 | ca->heap.data[0] = b; | 225 | ca->heap.data[0] = b; |
225 | heap_sift(&ca->heap, 0, bucket_cmp); | 226 | heap_sift(&ca->heap, 0, bucket_cmp); |
226 | } | 227 | } |
227 | } | 228 | } |
228 | 229 | ||
229 | while (sectors_to_move > reserve_sectors) { | 230 | while (sectors_to_move > reserve_sectors) { |
230 | heap_pop(&ca->heap, b, bucket_cmp); | 231 | heap_pop(&ca->heap, b, bucket_cmp); |
231 | sectors_to_move -= GC_SECTORS_USED(b); | 232 | sectors_to_move -= GC_SECTORS_USED(b); |
232 | } | 233 | } |
233 | 234 | ||
234 | ca->gc_move_threshold = bucket_heap_top(ca); | 235 | ca->gc_move_threshold = bucket_heap_top(ca); |
235 | 236 | ||
236 | pr_debug("threshold %u", ca->gc_move_threshold); | 237 | pr_debug("threshold %u", ca->gc_move_threshold); |
237 | } | 238 | } |
238 | 239 | ||
239 | mutex_unlock(&c->bucket_lock); | 240 | mutex_unlock(&c->bucket_lock); |
240 | 241 | ||
241 | c->moving_gc_keys.last_scanned = ZERO_KEY; | 242 | c->moving_gc_keys.last_scanned = ZERO_KEY; |
242 | 243 | ||
243 | closure_init(&c->moving_gc, cl); | 244 | closure_init(&c->moving_gc, cl); |
244 | read_moving(&c->moving_gc); | 245 | read_moving(&c->moving_gc); |
245 | 246 | ||
246 | closure_return(cl); | 247 | closure_return(cl); |
247 | } | 248 | } |
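bch_moving_gc() above picks, per cache device, the buckets that are cheapest to evacuate: it keeps the least-used buckets whose combined live data fits in the free-bucket reserve, and records the most-used bucket kept as gc_move_threshold, which moving_pred() then compares against. A standalone sketch of that selection follows, with a plain sort standing in for the kernel's bounded max-heap and invented numbers.

#include <stdio.h>
#include <stdlib.h>

static int cmp_used(const void *a, const void *b)
{
	unsigned l = *(const unsigned *) a, r = *(const unsigned *) b;
	return (l > r) - (l < r);		/* ascending by sectors used */
}

int main(void)
{
	unsigned used[] = { 900, 120, 40, 500, 60, 300 }; /* GC_SECTORS_USED per bucket */
	size_t n = sizeof(used) / sizeof(used[0]);
	unsigned reserve_sectors = 512;		/* space we may consume for copies */
	unsigned sectors_to_move = 0, threshold = 0;

	qsort(used, n, sizeof(used[0]), cmp_used);

	/* Keep cheap buckets while their live data still fits in the reserve. */
	for (size_t i = 0; i < n; i++) {
		if (sectors_to_move + used[i] > reserve_sectors)
			break;
		sectors_to_move += used[i];
		threshold = used[i];	/* most-used bucket selected so far */
	}

	/* moving_pred() would then copy keys whose bucket is under this. */
	printf("move %u sectors, gc_move_threshold = %u\n",
	       sectors_to_move, threshold);
	return 0;
}

With the numbers above this keeps the 40-, 60- and 120-sector buckets (220 sectors to move) and reports a threshold of 120.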
248 | 249 | ||
249 | void bch_moving_init_cache_set(struct cache_set *c) | 250 | void bch_moving_init_cache_set(struct cache_set *c) |
250 | { | 251 | { |
251 | bch_keybuf_init(&c->moving_gc_keys, moving_pred); | 252 | bch_keybuf_init(&c->moving_gc_keys); |
252 | } | 253 | } |
253 | 254 |
drivers/md/bcache/request.c
1 | /* | 1 | /* |
2 | * Main bcache entry point - handle a read or a write request and decide what to | 2 | * Main bcache entry point - handle a read or a write request and decide what to |
3 | * do with it; the make_request functions are called by the block layer. | 3 | * do with it; the make_request functions are called by the block layer. |
4 | * | 4 | * |
5 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | 5 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> |
6 | * Copyright 2012 Google, Inc. | 6 | * Copyright 2012 Google, Inc. |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include "bcache.h" | 9 | #include "bcache.h" |
10 | #include "btree.h" | 10 | #include "btree.h" |
11 | #include "debug.h" | 11 | #include "debug.h" |
12 | #include "request.h" | 12 | #include "request.h" |
13 | #include "writeback.h" | 13 | #include "writeback.h" |
14 | 14 | ||
15 | #include <linux/cgroup.h> | 15 | #include <linux/cgroup.h> |
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/hash.h> | 17 | #include <linux/hash.h> |
18 | #include <linux/random.h> | 18 | #include <linux/random.h> |
19 | #include "blk-cgroup.h" | 19 | #include "blk-cgroup.h" |
20 | 20 | ||
21 | #include <trace/events/bcache.h> | 21 | #include <trace/events/bcache.h> |
22 | 22 | ||
23 | #define CUTOFF_CACHE_ADD 95 | 23 | #define CUTOFF_CACHE_ADD 95 |
24 | #define CUTOFF_CACHE_READA 90 | 24 | #define CUTOFF_CACHE_READA 90 |
25 | #define CUTOFF_WRITEBACK 50 | ||
26 | #define CUTOFF_WRITEBACK_SYNC 75 | ||
27 | 25 | ||
28 | struct kmem_cache *bch_search_cache; | 26 | struct kmem_cache *bch_search_cache; |
29 | 27 | ||
30 | static void check_should_skip(struct cached_dev *, struct search *); | 28 | static void check_should_skip(struct cached_dev *, struct search *); |
31 | 29 | ||
32 | /* Cgroup interface */ | 30 | /* Cgroup interface */ |
33 | 31 | ||
34 | #ifdef CONFIG_CGROUP_BCACHE | 32 | #ifdef CONFIG_CGROUP_BCACHE |
35 | static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 }; | 33 | static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 }; |
36 | 34 | ||
37 | static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup) | 35 | static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup) |
38 | { | 36 | { |
39 | struct cgroup_subsys_state *css; | 37 | struct cgroup_subsys_state *css; |
40 | return cgroup && | 38 | return cgroup && |
41 | (css = cgroup_subsys_state(cgroup, bcache_subsys_id)) | 39 | (css = cgroup_subsys_state(cgroup, bcache_subsys_id)) |
42 | ? container_of(css, struct bch_cgroup, css) | 40 | ? container_of(css, struct bch_cgroup, css) |
43 | : &bcache_default_cgroup; | 41 | : &bcache_default_cgroup; |
44 | } | 42 | } |
45 | 43 | ||
46 | struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio) | 44 | struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio) |
47 | { | 45 | { |
48 | struct cgroup_subsys_state *css = bio->bi_css | 46 | struct cgroup_subsys_state *css = bio->bi_css |
49 | ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id) | 47 | ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id) |
50 | : task_subsys_state(current, bcache_subsys_id); | 48 | : task_subsys_state(current, bcache_subsys_id); |
51 | 49 | ||
52 | return css | 50 | return css |
53 | ? container_of(css, struct bch_cgroup, css) | 51 | ? container_of(css, struct bch_cgroup, css) |
54 | : &bcache_default_cgroup; | 52 | : &bcache_default_cgroup; |
55 | } | 53 | } |
56 | 54 | ||
57 | static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft, | 55 | static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft, |
58 | struct file *file, | 56 | struct file *file, |
59 | char __user *buf, size_t nbytes, loff_t *ppos) | 57 | char __user *buf, size_t nbytes, loff_t *ppos) |
60 | { | 58 | { |
61 | char tmp[1024]; | 59 | char tmp[1024]; |
62 | int len = bch_snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes, | 60 | int len = bch_snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes, |
63 | cgroup_to_bcache(cgrp)->cache_mode + 1); | 61 | cgroup_to_bcache(cgrp)->cache_mode + 1); |
64 | 62 | ||
65 | if (len < 0) | 63 | if (len < 0) |
66 | return len; | 64 | return len; |
67 | 65 | ||
68 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | 66 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); |
69 | } | 67 | } |
70 | 68 | ||
71 | static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft, | 69 | static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft, |
72 | const char *buf) | 70 | const char *buf) |
73 | { | 71 | { |
74 | int v = bch_read_string_list(buf, bch_cache_modes); | 72 | int v = bch_read_string_list(buf, bch_cache_modes); |
75 | if (v < 0) | 73 | if (v < 0) |
76 | return v; | 74 | return v; |
77 | 75 | ||
78 | cgroup_to_bcache(cgrp)->cache_mode = v - 1; | 76 | cgroup_to_bcache(cgrp)->cache_mode = v - 1; |
79 | return 0; | 77 | return 0; |
80 | } | 78 | } |
81 | 79 | ||
82 | static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft) | 80 | static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft) |
83 | { | 81 | { |
84 | return cgroup_to_bcache(cgrp)->verify; | 82 | return cgroup_to_bcache(cgrp)->verify; |
85 | } | 83 | } |
86 | 84 | ||
87 | static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val) | 85 | static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val) |
88 | { | 86 | { |
89 | cgroup_to_bcache(cgrp)->verify = val; | 87 | cgroup_to_bcache(cgrp)->verify = val; |
90 | return 0; | 88 | return 0; |
91 | } | 89 | } |
92 | 90 | ||
93 | static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft) | 91 | static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft) |
94 | { | 92 | { |
95 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | 93 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); |
96 | return atomic_read(&bcachecg->stats.cache_hits); | 94 | return atomic_read(&bcachecg->stats.cache_hits); |
97 | } | 95 | } |
98 | 96 | ||
99 | static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft) | 97 | static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft) |
100 | { | 98 | { |
101 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | 99 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); |
102 | return atomic_read(&bcachecg->stats.cache_misses); | 100 | return atomic_read(&bcachecg->stats.cache_misses); |
103 | } | 101 | } |
104 | 102 | ||
105 | static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp, | 103 | static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp, |
106 | struct cftype *cft) | 104 | struct cftype *cft) |
107 | { | 105 | { |
108 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | 106 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); |
109 | return atomic_read(&bcachecg->stats.cache_bypass_hits); | 107 | return atomic_read(&bcachecg->stats.cache_bypass_hits); |
110 | } | 108 | } |
111 | 109 | ||
112 | static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp, | 110 | static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp, |
113 | struct cftype *cft) | 111 | struct cftype *cft) |
114 | { | 112 | { |
115 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | 113 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); |
116 | return atomic_read(&bcachecg->stats.cache_bypass_misses); | 114 | return atomic_read(&bcachecg->stats.cache_bypass_misses); |
117 | } | 115 | } |
118 | 116 | ||
119 | static struct cftype bch_files[] = { | 117 | static struct cftype bch_files[] = { |
120 | { | 118 | { |
121 | .name = "cache_mode", | 119 | .name = "cache_mode", |
122 | .read = cache_mode_read, | 120 | .read = cache_mode_read, |
123 | .write_string = cache_mode_write, | 121 | .write_string = cache_mode_write, |
124 | }, | 122 | }, |
125 | { | 123 | { |
126 | .name = "verify", | 124 | .name = "verify", |
127 | .read_u64 = bch_verify_read, | 125 | .read_u64 = bch_verify_read, |
128 | .write_u64 = bch_verify_write, | 126 | .write_u64 = bch_verify_write, |
129 | }, | 127 | }, |
130 | { | 128 | { |
131 | .name = "cache_hits", | 129 | .name = "cache_hits", |
132 | .read_u64 = bch_cache_hits_read, | 130 | .read_u64 = bch_cache_hits_read, |
133 | }, | 131 | }, |
134 | { | 132 | { |
135 | .name = "cache_misses", | 133 | .name = "cache_misses", |
136 | .read_u64 = bch_cache_misses_read, | 134 | .read_u64 = bch_cache_misses_read, |
137 | }, | 135 | }, |
138 | { | 136 | { |
139 | .name = "cache_bypass_hits", | 137 | .name = "cache_bypass_hits", |
140 | .read_u64 = bch_cache_bypass_hits_read, | 138 | .read_u64 = bch_cache_bypass_hits_read, |
141 | }, | 139 | }, |
142 | { | 140 | { |
143 | .name = "cache_bypass_misses", | 141 | .name = "cache_bypass_misses", |
144 | .read_u64 = bch_cache_bypass_misses_read, | 142 | .read_u64 = bch_cache_bypass_misses_read, |
145 | }, | 143 | }, |
146 | { } /* terminate */ | 144 | { } /* terminate */ |
147 | }; | 145 | }; |
148 | 146 | ||
149 | static void init_bch_cgroup(struct bch_cgroup *cg) | 147 | static void init_bch_cgroup(struct bch_cgroup *cg) |
150 | { | 148 | { |
151 | cg->cache_mode = -1; | 149 | cg->cache_mode = -1; |
152 | } | 150 | } |
153 | 151 | ||
154 | static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup) | 152 | static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup) |
155 | { | 153 | { |
156 | struct bch_cgroup *cg; | 154 | struct bch_cgroup *cg; |
157 | 155 | ||
158 | cg = kzalloc(sizeof(*cg), GFP_KERNEL); | 156 | cg = kzalloc(sizeof(*cg), GFP_KERNEL); |
159 | if (!cg) | 157 | if (!cg) |
160 | return ERR_PTR(-ENOMEM); | 158 | return ERR_PTR(-ENOMEM); |
161 | init_bch_cgroup(cg); | 159 | init_bch_cgroup(cg); |
162 | return &cg->css; | 160 | return &cg->css; |
163 | } | 161 | } |
164 | 162 | ||
165 | static void bcachecg_destroy(struct cgroup *cgroup) | 163 | static void bcachecg_destroy(struct cgroup *cgroup) |
166 | { | 164 | { |
167 | struct bch_cgroup *cg = cgroup_to_bcache(cgroup); | 165 | struct bch_cgroup *cg = cgroup_to_bcache(cgroup); |
168 | free_css_id(&bcache_subsys, &cg->css); | 166 | free_css_id(&bcache_subsys, &cg->css); |
169 | kfree(cg); | 167 | kfree(cg); |
170 | } | 168 | } |
171 | 169 | ||
172 | struct cgroup_subsys bcache_subsys = { | 170 | struct cgroup_subsys bcache_subsys = { |
173 | .create = bcachecg_create, | 171 | .create = bcachecg_create, |
174 | .destroy = bcachecg_destroy, | 172 | .destroy = bcachecg_destroy, |
175 | .subsys_id = bcache_subsys_id, | 173 | .subsys_id = bcache_subsys_id, |
176 | .name = "bcache", | 174 | .name = "bcache", |
177 | .module = THIS_MODULE, | 175 | .module = THIS_MODULE, |
178 | }; | 176 | }; |
179 | EXPORT_SYMBOL_GPL(bcache_subsys); | 177 | EXPORT_SYMBOL_GPL(bcache_subsys); |
180 | #endif | 178 | #endif |
181 | 179 | ||
182 | static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) | 180 | static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) |
183 | { | 181 | { |
184 | #ifdef CONFIG_CGROUP_BCACHE | 182 | #ifdef CONFIG_CGROUP_BCACHE |
185 | int r = bch_bio_to_cgroup(bio)->cache_mode; | 183 | int r = bch_bio_to_cgroup(bio)->cache_mode; |
186 | if (r >= 0) | 184 | if (r >= 0) |
187 | return r; | 185 | return r; |
188 | #endif | 186 | #endif |
189 | return BDEV_CACHE_MODE(&dc->sb); | 187 | return BDEV_CACHE_MODE(&dc->sb); |
190 | } | 188 | } |
191 | 189 | ||
192 | static bool verify(struct cached_dev *dc, struct bio *bio) | 190 | static bool verify(struct cached_dev *dc, struct bio *bio) |
193 | { | 191 | { |
194 | #ifdef CONFIG_CGROUP_BCACHE | 192 | #ifdef CONFIG_CGROUP_BCACHE |
195 | if (bch_bio_to_cgroup(bio)->verify) | 193 | if (bch_bio_to_cgroup(bio)->verify) |
196 | return true; | 194 | return true; |
197 | #endif | 195 | #endif |
198 | return dc->verify; | 196 | return dc->verify; |
199 | } | 197 | } |
200 | 198 | ||
201 | static void bio_csum(struct bio *bio, struct bkey *k) | 199 | static void bio_csum(struct bio *bio, struct bkey *k) |
202 | { | 200 | { |
203 | struct bio_vec *bv; | 201 | struct bio_vec *bv; |
204 | uint64_t csum = 0; | 202 | uint64_t csum = 0; |
205 | int i; | 203 | int i; |
206 | 204 | ||
207 | bio_for_each_segment(bv, bio, i) { | 205 | bio_for_each_segment(bv, bio, i) { |
208 | void *d = kmap(bv->bv_page) + bv->bv_offset; | 206 | void *d = kmap(bv->bv_page) + bv->bv_offset; |
209 | csum = bch_crc64_update(csum, d, bv->bv_len); | 207 | csum = bch_crc64_update(csum, d, bv->bv_len); |
210 | kunmap(bv->bv_page); | 208 | kunmap(bv->bv_page); |
211 | } | 209 | } |
212 | 210 | ||
213 | k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1); | 211 | k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1); |
214 | } | 212 | } |
215 | 213 | ||
216 | /* Insert data into cache */ | 214 | /* Insert data into cache */ |
217 | 215 | ||
218 | static void bio_invalidate(struct closure *cl) | 216 | static void bio_invalidate(struct closure *cl) |
219 | { | 217 | { |
220 | struct btree_op *op = container_of(cl, struct btree_op, cl); | 218 | struct btree_op *op = container_of(cl, struct btree_op, cl); |
221 | struct bio *bio = op->cache_bio; | 219 | struct bio *bio = op->cache_bio; |
222 | 220 | ||
223 | pr_debug("invalidating %i sectors from %llu", | 221 | pr_debug("invalidating %i sectors from %llu", |
224 | bio_sectors(bio), (uint64_t) bio->bi_sector); | 222 | bio_sectors(bio), (uint64_t) bio->bi_sector); |
225 | 223 | ||
226 | while (bio_sectors(bio)) { | 224 | while (bio_sectors(bio)) { |
227 | unsigned len = min(bio_sectors(bio), 1U << 14); | 225 | unsigned len = min(bio_sectors(bio), 1U << 14); |
228 | 226 | ||
229 | if (bch_keylist_realloc(&op->keys, 0, op->c)) | 227 | if (bch_keylist_realloc(&op->keys, 0, op->c)) |
230 | goto out; | 228 | goto out; |
231 | 229 | ||
232 | bio->bi_sector += len; | 230 | bio->bi_sector += len; |
233 | bio->bi_size -= len << 9; | 231 | bio->bi_size -= len << 9; |
234 | 232 | ||
235 | bch_keylist_add(&op->keys, | 233 | bch_keylist_add(&op->keys, |
236 | &KEY(op->inode, bio->bi_sector, len)); | 234 | &KEY(op->inode, bio->bi_sector, len)); |
237 | } | 235 | } |
238 | 236 | ||
239 | op->insert_data_done = true; | 237 | op->insert_data_done = true; |
240 | bio_put(bio); | 238 | bio_put(bio); |
241 | out: | 239 | out: |
242 | continue_at(cl, bch_journal, bcache_wq); | 240 | continue_at(cl, bch_journal, bcache_wq); |
243 | } | 241 | } |
244 | 242 | ||
245 | struct open_bucket { | 243 | struct open_bucket { |
246 | struct list_head list; | 244 | struct list_head list; |
247 | struct task_struct *last; | 245 | struct task_struct *last; |
248 | unsigned sectors_free; | 246 | unsigned sectors_free; |
249 | BKEY_PADDED(key); | 247 | BKEY_PADDED(key); |
250 | }; | 248 | }; |
251 | 249 | ||
252 | void bch_open_buckets_free(struct cache_set *c) | 250 | void bch_open_buckets_free(struct cache_set *c) |
253 | { | 251 | { |
254 | struct open_bucket *b; | 252 | struct open_bucket *b; |
255 | 253 | ||
256 | while (!list_empty(&c->data_buckets)) { | 254 | while (!list_empty(&c->data_buckets)) { |
257 | b = list_first_entry(&c->data_buckets, | 255 | b = list_first_entry(&c->data_buckets, |
258 | struct open_bucket, list); | 256 | struct open_bucket, list); |
259 | list_del(&b->list); | 257 | list_del(&b->list); |
260 | kfree(b); | 258 | kfree(b); |
261 | } | 259 | } |
262 | } | 260 | } |
263 | 261 | ||
264 | int bch_open_buckets_alloc(struct cache_set *c) | 262 | int bch_open_buckets_alloc(struct cache_set *c) |
265 | { | 263 | { |
266 | int i; | 264 | int i; |
267 | 265 | ||
268 | spin_lock_init(&c->data_bucket_lock); | 266 | spin_lock_init(&c->data_bucket_lock); |
269 | 267 | ||
270 | for (i = 0; i < 6; i++) { | 268 | for (i = 0; i < 6; i++) { |
271 | struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); | 269 | struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); |
272 | if (!b) | 270 | if (!b) |
273 | return -ENOMEM; | 271 | return -ENOMEM; |
274 | 272 | ||
275 | list_add(&b->list, &c->data_buckets); | 273 | list_add(&b->list, &c->data_buckets); |
276 | } | 274 | } |
277 | 275 | ||
278 | return 0; | 276 | return 0; |
279 | } | 277 | } |
280 | 278 | ||
281 | /* | 279 | /* |
282 | * We keep multiple buckets open for writes, and try to segregate different | 280 | * We keep multiple buckets open for writes, and try to segregate different |
283 | * write streams for better cache utilization: first we look for a bucket where | 281 | * write streams for better cache utilization: first we look for a bucket where |
284 | * the last write to it was sequential with the current write, and failing that | 282 | * the last write to it was sequential with the current write, and failing that |
285 | * we look for a bucket that was last used by the same task. | 283 | * we look for a bucket that was last used by the same task. |
286 | * | 284 | * |
287 | * The idea is that if you've got multiple tasks pulling data into the cache at the | 285 | * The idea is that if you've got multiple tasks pulling data into the cache at the |
288 | * same time, you'll get better cache utilization if you try to segregate their | 286 | * same time, you'll get better cache utilization if you try to segregate their |
289 | * data and preserve locality. | 287 | * data and preserve locality. |
290 | * | 288 | * |
291 | * For example, say you're starting Firefox at the same time you're copying a | 289 | * For example, say you're starting Firefox at the same time you're copying a |
292 | * bunch of files. Firefox will likely end up being fairly hot and stay in the | 290 | * bunch of files. Firefox will likely end up being fairly hot and stay in the |
293 | * cache awhile, but the data you copied might not be; if you wrote all that | 291 | * cache awhile, but the data you copied might not be; if you wrote all that |
294 | * data to the same buckets it'd get invalidated at the same time. | 292 | * data to the same buckets it'd get invalidated at the same time. |
295 | * | 293 | * |
296 | * Both of those tasks will be doing fairly random IO so we can't rely on | 294 | * Both of those tasks will be doing fairly random IO so we can't rely on |
297 | * detecting sequential IO to segregate their data, but going off of the task | 295 | * detecting sequential IO to segregate their data, but going off of the task |
298 | * should be a sane heuristic. | 296 | * should be a sane heuristic. |
299 | */ | 297 | */ |
300 | static struct open_bucket *pick_data_bucket(struct cache_set *c, | 298 | static struct open_bucket *pick_data_bucket(struct cache_set *c, |
301 | const struct bkey *search, | 299 | const struct bkey *search, |
302 | struct task_struct *task, | 300 | struct task_struct *task, |
303 | struct bkey *alloc) | 301 | struct bkey *alloc) |
304 | { | 302 | { |
305 | struct open_bucket *ret, *ret_task = NULL; | 303 | struct open_bucket *ret, *ret_task = NULL; |
306 | 304 | ||
307 | list_for_each_entry_reverse(ret, &c->data_buckets, list) | 305 | list_for_each_entry_reverse(ret, &c->data_buckets, list) |
308 | if (!bkey_cmp(&ret->key, search)) | 306 | if (!bkey_cmp(&ret->key, search)) |
309 | goto found; | 307 | goto found; |
310 | else if (ret->last == task) | 308 | else if (ret->last == task) |
311 | ret_task = ret; | 309 | ret_task = ret; |
312 | 310 | ||
313 | ret = ret_task ?: list_first_entry(&c->data_buckets, | 311 | ret = ret_task ?: list_first_entry(&c->data_buckets, |
314 | struct open_bucket, list); | 312 | struct open_bucket, list); |
315 | found: | 313 | found: |
316 | if (!ret->sectors_free && KEY_PTRS(alloc)) { | 314 | if (!ret->sectors_free && KEY_PTRS(alloc)) { |
317 | ret->sectors_free = c->sb.bucket_size; | 315 | ret->sectors_free = c->sb.bucket_size; |
318 | bkey_copy(&ret->key, alloc); | 316 | bkey_copy(&ret->key, alloc); |
319 | bkey_init(alloc); | 317 | bkey_init(alloc); |
320 | } | 318 | } |
321 | 319 | ||
322 | if (!ret->sectors_free) | 320 | if (!ret->sectors_free) |
323 | ret = NULL; | 321 | ret = NULL; |
324 | 322 | ||
325 | return ret; | 323 | return ret; |
326 | } | 324 | } |
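A toy model of the priority order the comment above describes for pick_data_bucket(): a bucket whose last write is sequential with this one wins, then a bucket last used by the same task, and otherwise the least recently used bucket. The structures and names below are simplified stand-ins, not the bcache ones.

#include <stdio.h>
#include <string.h>

struct toy_bucket {
	unsigned long long next_offset;	/* where the last write to it ended */
	const char *last_task;
};

static struct toy_bucket *toy_pick(struct toy_bucket *b, size_t n,
				   unsigned long long offset, const char *task)
{
	struct toy_bucket *same_task = NULL;

	for (size_t i = 0; i < n; i++) {
		if (b[i].next_offset == offset)
			return &b[i];		/* sequential with last write */
		if (!same_task && !strcmp(b[i].last_task, task))
			same_task = &b[i];	/* same writer, keep locality */
	}
	return same_task ? same_task : &b[0];	/* else: least recently used */
}

int main(void)
{
	struct toy_bucket buckets[] = {
		{ 4096, "cp"      },
		{  512, "firefox" },
	};

	/* firefox writes at offset 1000: no sequential match, same-task wins */
	struct toy_bucket *b = toy_pick(buckets, 2, 1000, "firefox");
	printf("picked bucket with last_task=%s\n", b->last_task);
	return 0;
}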
327 | 325 | ||
328 | /* | 326 | /* |
329 | * Allocates some space in the cache to write to, and k to point to the newly | 327 | * Allocates some space in the cache to write to, and k to point to the newly |
330 | * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the | 328 | * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the |
331 | * end of the newly allocated space). | 329 | * end of the newly allocated space). |
332 | * | 330 | * |
333 | * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many | 331 | * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many |
334 | * sectors were actually allocated. | 332 | * sectors were actually allocated. |
335 | * | 333 | * |
336 | * If s->writeback is true, will not fail. | 334 | * If s->writeback is true, will not fail. |
337 | */ | 335 | */ |
338 | static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, | 336 | static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, |
339 | struct search *s) | 337 | struct search *s) |
340 | { | 338 | { |
341 | struct cache_set *c = s->op.c; | 339 | struct cache_set *c = s->op.c; |
342 | struct open_bucket *b; | 340 | struct open_bucket *b; |
343 | BKEY_PADDED(key) alloc; | 341 | BKEY_PADDED(key) alloc; |
344 | struct closure cl, *w = NULL; | 342 | struct closure cl, *w = NULL; |
345 | unsigned i; | 343 | unsigned i; |
346 | 344 | ||
347 | if (s->writeback) { | 345 | if (s->writeback) { |
348 | closure_init_stack(&cl); | 346 | closure_init_stack(&cl); |
349 | w = &cl; | 347 | w = &cl; |
350 | } | 348 | } |
351 | 349 | ||
352 | /* | 350 | /* |
353 | * We might have to allocate a new bucket, which we can't do with a | 351 | * We might have to allocate a new bucket, which we can't do with a |
354 | * spinlock held. So if we have to allocate, we drop the lock, allocate | 352 | * spinlock held. So if we have to allocate, we drop the lock, allocate |
355 | * and then retry. KEY_PTRS() indicates whether alloc points to | 353 | * and then retry. KEY_PTRS() indicates whether alloc points to |
356 | * allocated bucket(s). | 354 | * allocated bucket(s). |
357 | */ | 355 | */ |
358 | 356 | ||
359 | bkey_init(&alloc.key); | 357 | bkey_init(&alloc.key); |
360 | spin_lock(&c->data_bucket_lock); | 358 | spin_lock(&c->data_bucket_lock); |
361 | 359 | ||
362 | while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) { | 360 | while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) { |
363 | unsigned watermark = s->op.write_prio | 361 | unsigned watermark = s->op.write_prio |
364 | ? WATERMARK_MOVINGGC | 362 | ? WATERMARK_MOVINGGC |
365 | : WATERMARK_NONE; | 363 | : WATERMARK_NONE; |
366 | 364 | ||
367 | spin_unlock(&c->data_bucket_lock); | 365 | spin_unlock(&c->data_bucket_lock); |
368 | 366 | ||
369 | if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w)) | 367 | if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w)) |
370 | return false; | 368 | return false; |
371 | 369 | ||
372 | spin_lock(&c->data_bucket_lock); | 370 | spin_lock(&c->data_bucket_lock); |
373 | } | 371 | } |
374 | 372 | ||
375 | /* | 373 | /* |
376 | * If we had to allocate, we might race and not need to allocate the | 374 | * If we had to allocate, we might race and not need to allocate the |
377 | * second time we call find_data_bucket(). If we allocated a bucket but | 375 | * second time we call find_data_bucket(). If we allocated a bucket but |
378 | * didn't use it, drop the refcount bch_bucket_alloc_set() took: | 376 | * didn't use it, drop the refcount bch_bucket_alloc_set() took: |
379 | */ | 377 | */ |
380 | if (KEY_PTRS(&alloc.key)) | 378 | if (KEY_PTRS(&alloc.key)) |
381 | __bkey_put(c, &alloc.key); | 379 | __bkey_put(c, &alloc.key); |
382 | 380 | ||
383 | for (i = 0; i < KEY_PTRS(&b->key); i++) | 381 | for (i = 0; i < KEY_PTRS(&b->key); i++) |
384 | EBUG_ON(ptr_stale(c, &b->key, i)); | 382 | EBUG_ON(ptr_stale(c, &b->key, i)); |
385 | 383 | ||
386 | /* Set up the pointer to the space we're allocating: */ | 384 | /* Set up the pointer to the space we're allocating: */ |
387 | 385 | ||
388 | for (i = 0; i < KEY_PTRS(&b->key); i++) | 386 | for (i = 0; i < KEY_PTRS(&b->key); i++) |
389 | k->ptr[i] = b->key.ptr[i]; | 387 | k->ptr[i] = b->key.ptr[i]; |
390 | 388 | ||
391 | sectors = min(sectors, b->sectors_free); | 389 | sectors = min(sectors, b->sectors_free); |
392 | 390 | ||
393 | SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); | 391 | SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); |
394 | SET_KEY_SIZE(k, sectors); | 392 | SET_KEY_SIZE(k, sectors); |
395 | SET_KEY_PTRS(k, KEY_PTRS(&b->key)); | 393 | SET_KEY_PTRS(k, KEY_PTRS(&b->key)); |
396 | 394 | ||
397 | /* | 395 | /* |
398 | * Move b to the end of the lru, and keep track of what this bucket was | 396 | * Move b to the end of the lru, and keep track of what this bucket was |
399 | * last used for: | 397 | * last used for: |
400 | */ | 398 | */ |
401 | list_move_tail(&b->list, &c->data_buckets); | 399 | list_move_tail(&b->list, &c->data_buckets); |
402 | bkey_copy_key(&b->key, k); | 400 | bkey_copy_key(&b->key, k); |
403 | b->last = s->task; | 401 | b->last = s->task; |
404 | 402 | ||
405 | b->sectors_free -= sectors; | 403 | b->sectors_free -= sectors; |
406 | 404 | ||
407 | for (i = 0; i < KEY_PTRS(&b->key); i++) { | 405 | for (i = 0; i < KEY_PTRS(&b->key); i++) { |
408 | SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); | 406 | SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); |
409 | 407 | ||
410 | atomic_long_add(sectors, | 408 | atomic_long_add(sectors, |
411 | &PTR_CACHE(c, &b->key, i)->sectors_written); | 409 | &PTR_CACHE(c, &b->key, i)->sectors_written); |
412 | } | 410 | } |
413 | 411 | ||
414 | if (b->sectors_free < c->sb.block_size) | 412 | if (b->sectors_free < c->sb.block_size) |
415 | b->sectors_free = 0; | 413 | b->sectors_free = 0; |
416 | 414 | ||
417 | /* | 415 | /* |
418 | * k takes refcounts on the buckets it points to until it's inserted | 416 | * k takes refcounts on the buckets it points to until it's inserted |
419 | * into the btree, but if we're done with this bucket we just transfer | 417 | * into the btree, but if we're done with this bucket we just transfer |
420 | * get_data_bucket()'s refcount. | 418 | * get_data_bucket()'s refcount. |
421 | */ | 419 | */ |
422 | if (b->sectors_free) | 420 | if (b->sectors_free) |
423 | for (i = 0; i < KEY_PTRS(&b->key); i++) | 421 | for (i = 0; i < KEY_PTRS(&b->key); i++) |
424 | atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); | 422 | atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); |
425 | 423 | ||
426 | spin_unlock(&c->data_bucket_lock); | 424 | spin_unlock(&c->data_bucket_lock); |
427 | return true; | 425 | return true; |
428 | } | 426 | } |
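bch_alloc_sectors() above cannot allocate a new bucket while holding data_bucket_lock, so it drops the lock, allocates, retakes the lock and retries the pick, dropping the reference it took if the retry ends up not needing the new bucket. A small pthread sketch of that drop-lock-allocate-retry pattern follows; the single shared slot and the pool_* names are illustrative assumptions only.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static int *pool_slot;			/* the shared "open bucket" */

static int *pool_pick(int *fresh)
{
	if (!pool_slot && fresh) {	/* install the allocation we brought */
		pool_slot = fresh;
		return pool_slot;
	}
	return pool_slot;		/* may be NULL: caller must allocate */
}

static int *alloc_slot(void)
{
	int *fresh = NULL, *got;

	pthread_mutex_lock(&pool_lock);
	while (!(got = pool_pick(fresh))) {
		pthread_mutex_unlock(&pool_lock);
		fresh = malloc(sizeof(*fresh));	/* "slow" allocation, no lock held */
		if (!fresh)
			return NULL;
		pthread_mutex_lock(&pool_lock);
	}
	/* Raced with someone who installed a slot first? Drop ours. */
	if (fresh && got != fresh)
		free(fresh);
	pthread_mutex_unlock(&pool_lock);
	return got;
}

int main(void)
{
	int *a = alloc_slot(), *b = alloc_slot();
	printf("same slot reused: %s\n", a == b ? "yes" : "no");
	free(pool_slot);
	return 0;
}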
429 | 427 | ||
430 | static void bch_insert_data_error(struct closure *cl) | 428 | static void bch_insert_data_error(struct closure *cl) |
431 | { | 429 | { |
432 | struct btree_op *op = container_of(cl, struct btree_op, cl); | 430 | struct btree_op *op = container_of(cl, struct btree_op, cl); |
433 | 431 | ||
434 | /* | 432 | /* |
435 | * Our data write just errored, which means we've got a bunch of keys to | 433 | * Our data write just errored, which means we've got a bunch of keys to |
436 | * insert that point to data that wasn't successfully written. | 434 | * insert that point to data that wasn't successfully written. |
437 | * | 435 | * |
438 | * We don't have to insert those keys but we still have to invalidate | 436 | * We don't have to insert those keys but we still have to invalidate |
439 | * that region of the cache - so, if we just strip off all the pointers | 437 | * that region of the cache - so, if we just strip off all the pointers |
440 | * from the keys we'll accomplish just that. | 438 | * from the keys we'll accomplish just that. |
441 | */ | 439 | */ |
442 | 440 | ||
443 | struct bkey *src = op->keys.bottom, *dst = op->keys.bottom; | 441 | struct bkey *src = op->keys.bottom, *dst = op->keys.bottom; |
444 | 442 | ||
445 | while (src != op->keys.top) { | 443 | while (src != op->keys.top) { |
446 | struct bkey *n = bkey_next(src); | 444 | struct bkey *n = bkey_next(src); |
447 | 445 | ||
448 | SET_KEY_PTRS(src, 0); | 446 | SET_KEY_PTRS(src, 0); |
449 | bkey_copy(dst, src); | 447 | bkey_copy(dst, src); |
450 | 448 | ||
451 | dst = bkey_next(dst); | 449 | dst = bkey_next(dst); |
452 | src = n; | 450 | src = n; |
453 | } | 451 | } |
454 | 452 | ||
455 | op->keys.top = dst; | 453 | op->keys.top = dst; |
456 | 454 | ||
457 | bch_journal(cl); | 455 | bch_journal(cl); |
458 | } | 456 | } |
459 | 457 | ||
460 | static void bch_insert_data_endio(struct bio *bio, int error) | 458 | static void bch_insert_data_endio(struct bio *bio, int error) |
461 | { | 459 | { |
462 | struct closure *cl = bio->bi_private; | 460 | struct closure *cl = bio->bi_private; |
463 | struct btree_op *op = container_of(cl, struct btree_op, cl); | 461 | struct btree_op *op = container_of(cl, struct btree_op, cl); |
464 | struct search *s = container_of(op, struct search, op); | 462 | struct search *s = container_of(op, struct search, op); |
465 | 463 | ||
466 | if (error) { | 464 | if (error) { |
467 | /* TODO: We could try to recover from this. */ | 465 | /* TODO: We could try to recover from this. */ |
468 | if (s->writeback) | 466 | if (s->writeback) |
469 | s->error = error; | 467 | s->error = error; |
470 | else if (s->write) | 468 | else if (s->write) |
471 | set_closure_fn(cl, bch_insert_data_error, bcache_wq); | 469 | set_closure_fn(cl, bch_insert_data_error, bcache_wq); |
472 | else | 470 | else |
473 | set_closure_fn(cl, NULL, NULL); | 471 | set_closure_fn(cl, NULL, NULL); |
474 | } | 472 | } |
475 | 473 | ||
476 | bch_bbio_endio(op->c, bio, error, "writing data to cache"); | 474 | bch_bbio_endio(op->c, bio, error, "writing data to cache"); |
477 | } | 475 | } |
478 | 476 | ||
479 | static void bch_insert_data_loop(struct closure *cl) | 477 | static void bch_insert_data_loop(struct closure *cl) |
480 | { | 478 | { |
481 | struct btree_op *op = container_of(cl, struct btree_op, cl); | 479 | struct btree_op *op = container_of(cl, struct btree_op, cl); |
482 | struct search *s = container_of(op, struct search, op); | 480 | struct search *s = container_of(op, struct search, op); |
483 | struct bio *bio = op->cache_bio, *n; | 481 | struct bio *bio = op->cache_bio, *n; |
484 | 482 | ||
485 | if (op->skip) | 483 | if (op->skip) |
486 | return bio_invalidate(cl); | 484 | return bio_invalidate(cl); |
487 | 485 | ||
488 | if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { | 486 | if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { |
489 | set_gc_sectors(op->c); | 487 | set_gc_sectors(op->c); |
490 | bch_queue_gc(op->c); | 488 | bch_queue_gc(op->c); |
491 | } | 489 | } |
492 | 490 | ||
493 | do { | 491 | do { |
494 | unsigned i; | 492 | unsigned i; |
495 | struct bkey *k; | 493 | struct bkey *k; |
496 | struct bio_set *split = s->d | 494 | struct bio_set *split = s->d |
497 | ? s->d->bio_split : op->c->bio_split; | 495 | ? s->d->bio_split : op->c->bio_split; |
498 | 496 | ||
499 | /* 1 for the device pointer and 1 for the chksum */ | 497 | /* 1 for the device pointer and 1 for the chksum */ |
500 | if (bch_keylist_realloc(&op->keys, | 498 | if (bch_keylist_realloc(&op->keys, |
501 | 1 + (op->csum ? 1 : 0), | 499 | 1 + (op->csum ? 1 : 0), |
502 | op->c)) | 500 | op->c)) |
503 | continue_at(cl, bch_journal, bcache_wq); | 501 | continue_at(cl, bch_journal, bcache_wq); |
504 | 502 | ||
505 | k = op->keys.top; | 503 | k = op->keys.top; |
506 | bkey_init(k); | 504 | bkey_init(k); |
507 | SET_KEY_INODE(k, op->inode); | 505 | SET_KEY_INODE(k, op->inode); |
508 | SET_KEY_OFFSET(k, bio->bi_sector); | 506 | SET_KEY_OFFSET(k, bio->bi_sector); |
509 | 507 | ||
510 | if (!bch_alloc_sectors(k, bio_sectors(bio), s)) | 508 | if (!bch_alloc_sectors(k, bio_sectors(bio), s)) |
511 | goto err; | 509 | goto err; |
512 | 510 | ||
513 | n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); | 511 | n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); |
514 | if (!n) { | 512 | if (!n) { |
515 | __bkey_put(op->c, k); | 513 | __bkey_put(op->c, k); |
516 | continue_at(cl, bch_insert_data_loop, bcache_wq); | 514 | continue_at(cl, bch_insert_data_loop, bcache_wq); |
517 | } | 515 | } |
518 | 516 | ||
519 | n->bi_end_io = bch_insert_data_endio; | 517 | n->bi_end_io = bch_insert_data_endio; |
520 | n->bi_private = cl; | 518 | n->bi_private = cl; |
521 | 519 | ||
522 | if (s->writeback) { | 520 | if (s->writeback) { |
523 | SET_KEY_DIRTY(k, true); | 521 | SET_KEY_DIRTY(k, true); |
524 | 522 | ||
525 | for (i = 0; i < KEY_PTRS(k); i++) | 523 | for (i = 0; i < KEY_PTRS(k); i++) |
526 | SET_GC_MARK(PTR_BUCKET(op->c, k, i), | 524 | SET_GC_MARK(PTR_BUCKET(op->c, k, i), |
527 | GC_MARK_DIRTY); | 525 | GC_MARK_DIRTY); |
528 | } | 526 | } |
529 | 527 | ||
530 | SET_KEY_CSUM(k, op->csum); | 528 | SET_KEY_CSUM(k, op->csum); |
531 | if (KEY_CSUM(k)) | 529 | if (KEY_CSUM(k)) |
532 | bio_csum(n, k); | 530 | bio_csum(n, k); |
533 | 531 | ||
534 | trace_bcache_cache_insert(k); | 532 | trace_bcache_cache_insert(k); |
535 | bch_keylist_push(&op->keys); | 533 | bch_keylist_push(&op->keys); |
536 | 534 | ||
537 | n->bi_rw |= REQ_WRITE; | 535 | n->bi_rw |= REQ_WRITE; |
538 | bch_submit_bbio(n, op->c, k, 0); | 536 | bch_submit_bbio(n, op->c, k, 0); |
539 | } while (n != bio); | 537 | } while (n != bio); |
540 | 538 | ||
541 | op->insert_data_done = true; | 539 | op->insert_data_done = true; |
542 | continue_at(cl, bch_journal, bcache_wq); | 540 | continue_at(cl, bch_journal, bcache_wq); |
543 | err: | 541 | err: |
544 | /* bch_alloc_sectors() blocks if s->writeback = true */ | 542 | /* bch_alloc_sectors() blocks if s->writeback = true */ |
545 | BUG_ON(s->writeback); | 543 | BUG_ON(s->writeback); |
546 | 544 | ||
547 | /* | 545 | /* |
548 | * But if it's not a writeback write we'd rather just bail out if | 546 | * But if it's not a writeback write we'd rather just bail out if |
549 | * there aren't any buckets ready to write to - it might take a while and | 547 | * there aren't any buckets ready to write to - it might take a while and |
550 | * we might be starving btree writes for gc or something. | 548 | * we might be starving btree writes for gc or something. |
551 | */ | 549 | */ |
552 | 550 | ||
553 | if (s->write) { | 551 | if (s->write) { |
554 | /* | 552 | /* |
555 | * Writethrough write: We can't complete the write until we've | 553 | * Writethrough write: We can't complete the write until we've |
556 | * updated the index. But we don't want to delay the write while | 554 | * updated the index. But we don't want to delay the write while |
557 | * we wait for buckets to be freed up, so just invalidate the | 555 | * we wait for buckets to be freed up, so just invalidate the |
558 | * rest of the write. | 556 | * rest of the write. |
559 | */ | 557 | */ |
560 | op->skip = true; | 558 | op->skip = true; |
561 | return bio_invalidate(cl); | 559 | return bio_invalidate(cl); |
562 | } else { | 560 | } else { |
563 | /* | 561 | /* |
564 | * From a cache miss, we can just insert the keys for the data | 562 | * From a cache miss, we can just insert the keys for the data |
565 | * we have written or bail out if we didn't do anything. | 563 | * we have written or bail out if we didn't do anything. |
566 | */ | 564 | */ |
567 | op->insert_data_done = true; | 565 | op->insert_data_done = true; |
568 | bio_put(bio); | 566 | bio_put(bio); |
569 | 567 | ||
570 | if (!bch_keylist_empty(&op->keys)) | 568 | if (!bch_keylist_empty(&op->keys)) |
571 | continue_at(cl, bch_journal, bcache_wq); | 569 | continue_at(cl, bch_journal, bcache_wq); |
572 | else | 570 | else |
573 | closure_return(cl); | 571 | closure_return(cl); |
574 | } | 572 | } |
575 | } | 573 | } |
576 | 574 | ||
577 | /** | 575 | /** |
578 | * bch_insert_data - stick some data in the cache | 576 | * bch_insert_data - stick some data in the cache |
579 | * | 577 | * |
580 | * This is the starting point for any data to end up in a cache device; it could | 578 | * This is the starting point for any data to end up in a cache device; it could |
581 | * be from a normal write, or a writeback write, or a write to a flash only | 579 | * be from a normal write, or a writeback write, or a write to a flash only |
582 | * volume - it's also used by the moving garbage collector to compact data in | 580 | * volume - it's also used by the moving garbage collector to compact data in |
583 | * mostly empty buckets. | 581 | * mostly empty buckets. |
584 | * | 582 | * |
585 | * It first writes the data to the cache, creating a list of keys to be inserted | 583 | * It first writes the data to the cache, creating a list of keys to be inserted |
586 | * (if the data had to be fragmented there will be multiple keys); after the | 584 | * (if the data had to be fragmented there will be multiple keys); after the |
587 | * data is written it calls bch_journal, and after the keys have been added to | 585 | * data is written it calls bch_journal, and after the keys have been added to |
588 | * the next journal write they're inserted into the btree. | 586 | * the next journal write they're inserted into the btree. |
589 | * | 587 | * |
590 | * It inserts the data in op->cache_bio; bi_sector is used for the key offset, | 588 | * It inserts the data in op->cache_bio; bi_sector is used for the key offset, |
591 | * and op->inode is used for the key inode. | 589 | * and op->inode is used for the key inode. |
592 | * | 590 | * |
593 | * If op->skip is true, instead of inserting the data it invalidates the region | 591 | * If op->skip is true, instead of inserting the data it invalidates the region |
594 | * of the cache represented by op->cache_bio and op->inode. | 592 | * of the cache represented by op->cache_bio and op->inode. |
595 | */ | 593 | */ |
596 | void bch_insert_data(struct closure *cl) | 594 | void bch_insert_data(struct closure *cl) |
597 | { | 595 | { |
598 | struct btree_op *op = container_of(cl, struct btree_op, cl); | 596 | struct btree_op *op = container_of(cl, struct btree_op, cl); |
599 | 597 | ||
600 | bch_keylist_init(&op->keys); | 598 | bch_keylist_init(&op->keys); |
601 | bio_get(op->cache_bio); | 599 | bio_get(op->cache_bio); |
602 | bch_insert_data_loop(cl); | 600 | bch_insert_data_loop(cl); |
603 | } | 601 | } |
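
For readers following the insertion path: the write-side callers shown further down in this diff (request_write() and flash_dev_make_request()) all drive bch_insert_data() the same way. Below is a minimal sketch of that calling pattern, using only fields set up by search_alloc() and names taken from the surrounding code; it is illustrative only and not part of this commit.

/*
 * Illustrative sketch: hand a bio to bch_insert_data() via the closure
 * machinery, mirroring the call sites later in this file.
 */
static void example_insert(struct search *s, struct bio *bio)
{
	struct closure *cl = &s->cl;

	s->writeback     = true;	/* keys will be inserted as dirty */
	s->op.cache_bio  = bio;		/* data to write; bi_sector becomes the key offset */

	/* op.inode and op.c were already filled in by search_alloc() */
	closure_call(&s->op.cl, bch_insert_data, NULL, cl);
	continue_at(cl, search_free, NULL);
}

From there the closure chain proceeds as the comment above describes: bch_insert_data_loop() writes the data and builds keys, bch_journal() logs them, and bch_btree_insert_async() inserts them into the btree.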
604 | 602 | ||
605 | void bch_btree_insert_async(struct closure *cl) | 603 | void bch_btree_insert_async(struct closure *cl) |
606 | { | 604 | { |
607 | struct btree_op *op = container_of(cl, struct btree_op, cl); | 605 | struct btree_op *op = container_of(cl, struct btree_op, cl); |
608 | struct search *s = container_of(op, struct search, op); | 606 | struct search *s = container_of(op, struct search, op); |
609 | 607 | ||
610 | if (bch_btree_insert(op, op->c)) { | 608 | if (bch_btree_insert(op, op->c)) { |
611 | s->error = -ENOMEM; | 609 | s->error = -ENOMEM; |
612 | op->insert_data_done = true; | 610 | op->insert_data_done = true; |
613 | } | 611 | } |
614 | 612 | ||
615 | if (op->insert_data_done) { | 613 | if (op->insert_data_done) { |
616 | bch_keylist_free(&op->keys); | 614 | bch_keylist_free(&op->keys); |
617 | closure_return(cl); | 615 | closure_return(cl); |
618 | } else | 616 | } else |
619 | continue_at(cl, bch_insert_data_loop, bcache_wq); | 617 | continue_at(cl, bch_insert_data_loop, bcache_wq); |
620 | } | 618 | } |
621 | 619 | ||
622 | /* Common code for the make_request functions */ | 620 | /* Common code for the make_request functions */ |
623 | 621 | ||
624 | static void request_endio(struct bio *bio, int error) | 622 | static void request_endio(struct bio *bio, int error) |
625 | { | 623 | { |
626 | struct closure *cl = bio->bi_private; | 624 | struct closure *cl = bio->bi_private; |
627 | 625 | ||
628 | if (error) { | 626 | if (error) { |
629 | struct search *s = container_of(cl, struct search, cl); | 627 | struct search *s = container_of(cl, struct search, cl); |
630 | s->error = error; | 628 | s->error = error; |
631 | /* Only cache read errors are recoverable */ | 629 | /* Only cache read errors are recoverable */ |
632 | s->recoverable = false; | 630 | s->recoverable = false; |
633 | } | 631 | } |
634 | 632 | ||
635 | bio_put(bio); | 633 | bio_put(bio); |
636 | closure_put(cl); | 634 | closure_put(cl); |
637 | } | 635 | } |
638 | 636 | ||
639 | void bch_cache_read_endio(struct bio *bio, int error) | 637 | void bch_cache_read_endio(struct bio *bio, int error) |
640 | { | 638 | { |
641 | struct bbio *b = container_of(bio, struct bbio, bio); | 639 | struct bbio *b = container_of(bio, struct bbio, bio); |
642 | struct closure *cl = bio->bi_private; | 640 | struct closure *cl = bio->bi_private; |
643 | struct search *s = container_of(cl, struct search, cl); | 641 | struct search *s = container_of(cl, struct search, cl); |
644 | 642 | ||
645 | /* | 643 | /* |
646 | * If the bucket was reused while our bio was in flight, we might have | 644 | * If the bucket was reused while our bio was in flight, we might have |
647 | * read the wrong data. Set s->error but not error so it doesn't get | 645 | * read the wrong data. Set s->error but not error so it doesn't get |
648 | * counted against the cache device, but we'll still reread the data | 646 | * counted against the cache device, but we'll still reread the data |
649 | * from the backing device. | 647 | * from the backing device. |
650 | */ | 648 | */ |
651 | 649 | ||
652 | if (error) | 650 | if (error) |
653 | s->error = error; | 651 | s->error = error; |
654 | else if (ptr_stale(s->op.c, &b->key, 0)) { | 652 | else if (ptr_stale(s->op.c, &b->key, 0)) { |
655 | atomic_long_inc(&s->op.c->cache_read_races); | 653 | atomic_long_inc(&s->op.c->cache_read_races); |
656 | s->error = -EINTR; | 654 | s->error = -EINTR; |
657 | } | 655 | } |
658 | 656 | ||
659 | bch_bbio_endio(s->op.c, bio, error, "reading from cache"); | 657 | bch_bbio_endio(s->op.c, bio, error, "reading from cache"); |
660 | } | 658 | } |
661 | 659 | ||
662 | static void bio_complete(struct search *s) | 660 | static void bio_complete(struct search *s) |
663 | { | 661 | { |
664 | if (s->orig_bio) { | 662 | if (s->orig_bio) { |
665 | int cpu, rw = bio_data_dir(s->orig_bio); | 663 | int cpu, rw = bio_data_dir(s->orig_bio); |
666 | unsigned long duration = jiffies - s->start_time; | 664 | unsigned long duration = jiffies - s->start_time; |
667 | 665 | ||
668 | cpu = part_stat_lock(); | 666 | cpu = part_stat_lock(); |
669 | part_round_stats(cpu, &s->d->disk->part0); | 667 | part_round_stats(cpu, &s->d->disk->part0); |
670 | part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration); | 668 | part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration); |
671 | part_stat_unlock(); | 669 | part_stat_unlock(); |
672 | 670 | ||
673 | trace_bcache_request_end(s, s->orig_bio); | 671 | trace_bcache_request_end(s, s->orig_bio); |
674 | bio_endio(s->orig_bio, s->error); | 672 | bio_endio(s->orig_bio, s->error); |
675 | s->orig_bio = NULL; | 673 | s->orig_bio = NULL; |
676 | } | 674 | } |
677 | } | 675 | } |
678 | 676 | ||
679 | static void do_bio_hook(struct search *s) | 677 | static void do_bio_hook(struct search *s) |
680 | { | 678 | { |
681 | struct bio *bio = &s->bio.bio; | 679 | struct bio *bio = &s->bio.bio; |
682 | memcpy(bio, s->orig_bio, sizeof(struct bio)); | 680 | memcpy(bio, s->orig_bio, sizeof(struct bio)); |
683 | 681 | ||
684 | bio->bi_end_io = request_endio; | 682 | bio->bi_end_io = request_endio; |
685 | bio->bi_private = &s->cl; | 683 | bio->bi_private = &s->cl; |
686 | atomic_set(&bio->bi_cnt, 3); | 684 | atomic_set(&bio->bi_cnt, 3); |
687 | } | 685 | } |
688 | 686 | ||
689 | static void search_free(struct closure *cl) | 687 | static void search_free(struct closure *cl) |
690 | { | 688 | { |
691 | struct search *s = container_of(cl, struct search, cl); | 689 | struct search *s = container_of(cl, struct search, cl); |
692 | bio_complete(s); | 690 | bio_complete(s); |
693 | 691 | ||
694 | if (s->op.cache_bio) | 692 | if (s->op.cache_bio) |
695 | bio_put(s->op.cache_bio); | 693 | bio_put(s->op.cache_bio); |
696 | 694 | ||
697 | if (s->unaligned_bvec) | 695 | if (s->unaligned_bvec) |
698 | mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); | 696 | mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); |
699 | 697 | ||
700 | closure_debug_destroy(cl); | 698 | closure_debug_destroy(cl); |
701 | mempool_free(s, s->d->c->search); | 699 | mempool_free(s, s->d->c->search); |
702 | } | 700 | } |
703 | 701 | ||
704 | static struct search *search_alloc(struct bio *bio, struct bcache_device *d) | 702 | static struct search *search_alloc(struct bio *bio, struct bcache_device *d) |
705 | { | 703 | { |
706 | struct bio_vec *bv; | 704 | struct bio_vec *bv; |
707 | struct search *s = mempool_alloc(d->c->search, GFP_NOIO); | 705 | struct search *s = mempool_alloc(d->c->search, GFP_NOIO); |
708 | memset(s, 0, offsetof(struct search, op.keys)); | 706 | memset(s, 0, offsetof(struct search, op.keys)); |
709 | 707 | ||
710 | __closure_init(&s->cl, NULL); | 708 | __closure_init(&s->cl, NULL); |
711 | 709 | ||
712 | s->op.inode = d->id; | 710 | s->op.inode = d->id; |
713 | s->op.c = d->c; | 711 | s->op.c = d->c; |
714 | s->d = d; | 712 | s->d = d; |
715 | s->op.lock = -1; | 713 | s->op.lock = -1; |
716 | s->task = current; | 714 | s->task = current; |
717 | s->orig_bio = bio; | 715 | s->orig_bio = bio; |
718 | s->write = (bio->bi_rw & REQ_WRITE) != 0; | 716 | s->write = (bio->bi_rw & REQ_WRITE) != 0; |
719 | s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0; | 717 | s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0; |
720 | s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; | 718 | s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; |
721 | s->recoverable = 1; | 719 | s->recoverable = 1; |
722 | s->start_time = jiffies; | 720 | s->start_time = jiffies; |
723 | do_bio_hook(s); | 721 | do_bio_hook(s); |
724 | 722 | ||
725 | if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) { | 723 | if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) { |
726 | bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO); | 724 | bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO); |
727 | memcpy(bv, bio_iovec(bio), | 725 | memcpy(bv, bio_iovec(bio), |
728 | sizeof(struct bio_vec) * bio_segments(bio)); | 726 | sizeof(struct bio_vec) * bio_segments(bio)); |
729 | 727 | ||
730 | s->bio.bio.bi_io_vec = bv; | 728 | s->bio.bio.bi_io_vec = bv; |
731 | s->unaligned_bvec = 1; | 729 | s->unaligned_bvec = 1; |
732 | } | 730 | } |
733 | 731 | ||
734 | return s; | 732 | return s; |
735 | } | 733 | } |
736 | 734 | ||
737 | static void btree_read_async(struct closure *cl) | 735 | static void btree_read_async(struct closure *cl) |
738 | { | 736 | { |
739 | struct btree_op *op = container_of(cl, struct btree_op, cl); | 737 | struct btree_op *op = container_of(cl, struct btree_op, cl); |
740 | 738 | ||
741 | int ret = btree_root(search_recurse, op->c, op); | 739 | int ret = btree_root(search_recurse, op->c, op); |
742 | 740 | ||
743 | if (ret == -EAGAIN) | 741 | if (ret == -EAGAIN) |
744 | continue_at(cl, btree_read_async, bcache_wq); | 742 | continue_at(cl, btree_read_async, bcache_wq); |
745 | 743 | ||
746 | closure_return(cl); | 744 | closure_return(cl); |
747 | } | 745 | } |
748 | 746 | ||
749 | /* Cached devices */ | 747 | /* Cached devices */ |
750 | 748 | ||
751 | static void cached_dev_bio_complete(struct closure *cl) | 749 | static void cached_dev_bio_complete(struct closure *cl) |
752 | { | 750 | { |
753 | struct search *s = container_of(cl, struct search, cl); | 751 | struct search *s = container_of(cl, struct search, cl); |
754 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | 752 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); |
755 | 753 | ||
756 | search_free(cl); | 754 | search_free(cl); |
757 | cached_dev_put(dc); | 755 | cached_dev_put(dc); |
758 | } | 756 | } |
759 | 757 | ||
760 | /* Process reads */ | 758 | /* Process reads */ |
761 | 759 | ||
762 | static void cached_dev_read_complete(struct closure *cl) | 760 | static void cached_dev_read_complete(struct closure *cl) |
763 | { | 761 | { |
764 | struct search *s = container_of(cl, struct search, cl); | 762 | struct search *s = container_of(cl, struct search, cl); |
765 | 763 | ||
766 | if (s->op.insert_collision) | 764 | if (s->op.insert_collision) |
767 | bch_mark_cache_miss_collision(s); | 765 | bch_mark_cache_miss_collision(s); |
768 | 766 | ||
769 | if (s->op.cache_bio) { | 767 | if (s->op.cache_bio) { |
770 | int i; | 768 | int i; |
771 | struct bio_vec *bv; | 769 | struct bio_vec *bv; |
772 | 770 | ||
773 | __bio_for_each_segment(bv, s->op.cache_bio, i, 0) | 771 | __bio_for_each_segment(bv, s->op.cache_bio, i, 0) |
774 | __free_page(bv->bv_page); | 772 | __free_page(bv->bv_page); |
775 | } | 773 | } |
776 | 774 | ||
777 | cached_dev_bio_complete(cl); | 775 | cached_dev_bio_complete(cl); |
778 | } | 776 | } |
779 | 777 | ||
780 | static void request_read_error(struct closure *cl) | 778 | static void request_read_error(struct closure *cl) |
781 | { | 779 | { |
782 | struct search *s = container_of(cl, struct search, cl); | 780 | struct search *s = container_of(cl, struct search, cl); |
783 | struct bio_vec *bv; | 781 | struct bio_vec *bv; |
784 | int i; | 782 | int i; |
785 | 783 | ||
786 | if (s->recoverable) { | 784 | if (s->recoverable) { |
787 | /* Retry from the backing device: */ | 785 | /* Retry from the backing device: */ |
788 | trace_bcache_read_retry(s->orig_bio); | 786 | trace_bcache_read_retry(s->orig_bio); |
789 | 787 | ||
790 | s->error = 0; | 788 | s->error = 0; |
791 | bv = s->bio.bio.bi_io_vec; | 789 | bv = s->bio.bio.bi_io_vec; |
792 | do_bio_hook(s); | 790 | do_bio_hook(s); |
793 | s->bio.bio.bi_io_vec = bv; | 791 | s->bio.bio.bi_io_vec = bv; |
794 | 792 | ||
795 | if (!s->unaligned_bvec) | 793 | if (!s->unaligned_bvec) |
796 | bio_for_each_segment(bv, s->orig_bio, i) | 794 | bio_for_each_segment(bv, s->orig_bio, i) |
797 | bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; | 795 | bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; |
798 | else | 796 | else |
799 | memcpy(s->bio.bio.bi_io_vec, | 797 | memcpy(s->bio.bio.bi_io_vec, |
800 | bio_iovec(s->orig_bio), | 798 | bio_iovec(s->orig_bio), |
801 | sizeof(struct bio_vec) * | 799 | sizeof(struct bio_vec) * |
802 | bio_segments(s->orig_bio)); | 800 | bio_segments(s->orig_bio)); |
803 | 801 | ||
804 | /* XXX: invalidate cache */ | 802 | /* XXX: invalidate cache */ |
805 | 803 | ||
806 | closure_bio_submit(&s->bio.bio, &s->cl, s->d); | 804 | closure_bio_submit(&s->bio.bio, &s->cl, s->d); |
807 | } | 805 | } |
808 | 806 | ||
809 | continue_at(cl, cached_dev_read_complete, NULL); | 807 | continue_at(cl, cached_dev_read_complete, NULL); |
810 | } | 808 | } |
811 | 809 | ||
812 | static void request_read_done(struct closure *cl) | 810 | static void request_read_done(struct closure *cl) |
813 | { | 811 | { |
814 | struct search *s = container_of(cl, struct search, cl); | 812 | struct search *s = container_of(cl, struct search, cl); |
815 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | 813 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); |
816 | 814 | ||
817 | /* | 815 | /* |
818 | * s->cache_bio != NULL implies that we had a cache miss; cache_bio now | 816 | * s->cache_bio != NULL implies that we had a cache miss; cache_bio now |
819 | * contains data ready to be inserted into the cache. | 817 | * contains data ready to be inserted into the cache. |
820 | * | 818 | * |
821 | * First, we copy the data we just read from cache_bio's bounce buffers | 819 | * First, we copy the data we just read from cache_bio's bounce buffers |
822 | * to the buffers the original bio pointed to: | 820 | * to the buffers the original bio pointed to: |
823 | */ | 821 | */ |
824 | 822 | ||
825 | if (s->op.cache_bio) { | 823 | if (s->op.cache_bio) { |
826 | struct bio_vec *src, *dst; | 824 | struct bio_vec *src, *dst; |
827 | unsigned src_offset, dst_offset, bytes; | 825 | unsigned src_offset, dst_offset, bytes; |
828 | void *dst_ptr; | 826 | void *dst_ptr; |
829 | 827 | ||
830 | bio_reset(s->op.cache_bio); | 828 | bio_reset(s->op.cache_bio); |
831 | s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; | 829 | s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; |
832 | s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; | 830 | s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; |
833 | s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; | 831 | s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; |
834 | bch_bio_map(s->op.cache_bio, NULL); | 832 | bch_bio_map(s->op.cache_bio, NULL); |
835 | 833 | ||
836 | src = bio_iovec(s->op.cache_bio); | 834 | src = bio_iovec(s->op.cache_bio); |
837 | dst = bio_iovec(s->cache_miss); | 835 | dst = bio_iovec(s->cache_miss); |
838 | src_offset = src->bv_offset; | 836 | src_offset = src->bv_offset; |
839 | dst_offset = dst->bv_offset; | 837 | dst_offset = dst->bv_offset; |
840 | dst_ptr = kmap(dst->bv_page); | 838 | dst_ptr = kmap(dst->bv_page); |
841 | 839 | ||
842 | while (1) { | 840 | while (1) { |
843 | if (dst_offset == dst->bv_offset + dst->bv_len) { | 841 | if (dst_offset == dst->bv_offset + dst->bv_len) { |
844 | kunmap(dst->bv_page); | 842 | kunmap(dst->bv_page); |
845 | dst++; | 843 | dst++; |
846 | if (dst == bio_iovec_idx(s->cache_miss, | 844 | if (dst == bio_iovec_idx(s->cache_miss, |
847 | s->cache_miss->bi_vcnt)) | 845 | s->cache_miss->bi_vcnt)) |
848 | break; | 846 | break; |
849 | 847 | ||
850 | dst_offset = dst->bv_offset; | 848 | dst_offset = dst->bv_offset; |
851 | dst_ptr = kmap(dst->bv_page); | 849 | dst_ptr = kmap(dst->bv_page); |
852 | } | 850 | } |
853 | 851 | ||
854 | if (src_offset == src->bv_offset + src->bv_len) { | 852 | if (src_offset == src->bv_offset + src->bv_len) { |
855 | src++; | 853 | src++; |
856 | if (src == bio_iovec_idx(s->op.cache_bio, | 854 | if (src == bio_iovec_idx(s->op.cache_bio, |
857 | s->op.cache_bio->bi_vcnt)) | 855 | s->op.cache_bio->bi_vcnt)) |
858 | BUG(); | 856 | BUG(); |
859 | 857 | ||
860 | src_offset = src->bv_offset; | 858 | src_offset = src->bv_offset; |
861 | } | 859 | } |
862 | 860 | ||
863 | bytes = min(dst->bv_offset + dst->bv_len - dst_offset, | 861 | bytes = min(dst->bv_offset + dst->bv_len - dst_offset, |
864 | src->bv_offset + src->bv_len - src_offset); | 862 | src->bv_offset + src->bv_len - src_offset); |
865 | 863 | ||
866 | memcpy(dst_ptr + dst_offset, | 864 | memcpy(dst_ptr + dst_offset, |
867 | page_address(src->bv_page) + src_offset, | 865 | page_address(src->bv_page) + src_offset, |
868 | bytes); | 866 | bytes); |
869 | 867 | ||
870 | src_offset += bytes; | 868 | src_offset += bytes; |
871 | dst_offset += bytes; | 869 | dst_offset += bytes; |
872 | } | 870 | } |
873 | 871 | ||
874 | bio_put(s->cache_miss); | 872 | bio_put(s->cache_miss); |
875 | s->cache_miss = NULL; | 873 | s->cache_miss = NULL; |
876 | } | 874 | } |
877 | 875 | ||
878 | if (verify(dc, &s->bio.bio) && s->recoverable) | 876 | if (verify(dc, &s->bio.bio) && s->recoverable) |
879 | bch_data_verify(s); | 877 | bch_data_verify(s); |
880 | 878 | ||
881 | bio_complete(s); | 879 | bio_complete(s); |
882 | 880 | ||
883 | if (s->op.cache_bio && | 881 | if (s->op.cache_bio && |
884 | !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) { | 882 | !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) { |
885 | s->op.type = BTREE_REPLACE; | 883 | s->op.type = BTREE_REPLACE; |
886 | closure_call(&s->op.cl, bch_insert_data, NULL, cl); | 884 | closure_call(&s->op.cl, bch_insert_data, NULL, cl); |
887 | } | 885 | } |
888 | 886 | ||
889 | continue_at(cl, cached_dev_read_complete, NULL); | 887 | continue_at(cl, cached_dev_read_complete, NULL); |
890 | } | 888 | } |
891 | 889 | ||
892 | static void request_read_done_bh(struct closure *cl) | 890 | static void request_read_done_bh(struct closure *cl) |
893 | { | 891 | { |
894 | struct search *s = container_of(cl, struct search, cl); | 892 | struct search *s = container_of(cl, struct search, cl); |
895 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | 893 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); |
896 | 894 | ||
897 | bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); | 895 | bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); |
898 | trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip); | 896 | trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip); |
899 | 897 | ||
900 | if (s->error) | 898 | if (s->error) |
901 | continue_at_nobarrier(cl, request_read_error, bcache_wq); | 899 | continue_at_nobarrier(cl, request_read_error, bcache_wq); |
902 | else if (s->op.cache_bio || verify(dc, &s->bio.bio)) | 900 | else if (s->op.cache_bio || verify(dc, &s->bio.bio)) |
903 | continue_at_nobarrier(cl, request_read_done, bcache_wq); | 901 | continue_at_nobarrier(cl, request_read_done, bcache_wq); |
904 | else | 902 | else |
905 | continue_at_nobarrier(cl, cached_dev_read_complete, NULL); | 903 | continue_at_nobarrier(cl, cached_dev_read_complete, NULL); |
906 | } | 904 | } |
907 | 905 | ||
908 | static int cached_dev_cache_miss(struct btree *b, struct search *s, | 906 | static int cached_dev_cache_miss(struct btree *b, struct search *s, |
909 | struct bio *bio, unsigned sectors) | 907 | struct bio *bio, unsigned sectors) |
910 | { | 908 | { |
911 | int ret = 0; | 909 | int ret = 0; |
912 | unsigned reada; | 910 | unsigned reada; |
913 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | 911 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); |
914 | struct bio *miss; | 912 | struct bio *miss; |
915 | 913 | ||
916 | miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); | 914 | miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); |
917 | if (!miss) | 915 | if (!miss) |
918 | return -EAGAIN; | 916 | return -EAGAIN; |
919 | 917 | ||
920 | if (miss == bio) | 918 | if (miss == bio) |
921 | s->op.lookup_done = true; | 919 | s->op.lookup_done = true; |
922 | 920 | ||
923 | miss->bi_end_io = request_endio; | 921 | miss->bi_end_io = request_endio; |
924 | miss->bi_private = &s->cl; | 922 | miss->bi_private = &s->cl; |
925 | 923 | ||
926 | if (s->cache_miss || s->op.skip) | 924 | if (s->cache_miss || s->op.skip) |
927 | goto out_submit; | 925 | goto out_submit; |
928 | 926 | ||
929 | if (miss != bio || | 927 | if (miss != bio || |
930 | (bio->bi_rw & REQ_RAHEAD) || | 928 | (bio->bi_rw & REQ_RAHEAD) || |
931 | (bio->bi_rw & REQ_META) || | 929 | (bio->bi_rw & REQ_META) || |
932 | s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA) | 930 | s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA) |
933 | reada = 0; | 931 | reada = 0; |
934 | else { | 932 | else { |
935 | reada = min(dc->readahead >> 9, | 933 | reada = min(dc->readahead >> 9, |
936 | sectors - bio_sectors(miss)); | 934 | sectors - bio_sectors(miss)); |
937 | 935 | ||
938 | if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev)) | 936 | if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev)) |
939 | reada = bdev_sectors(miss->bi_bdev) - bio_end(miss); | 937 | reada = bdev_sectors(miss->bi_bdev) - bio_end(miss); |
940 | } | 938 | } |
941 | 939 | ||
942 | s->cache_bio_sectors = bio_sectors(miss) + reada; | 940 | s->cache_bio_sectors = bio_sectors(miss) + reada; |
943 | s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT, | 941 | s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT, |
944 | DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS), | 942 | DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS), |
945 | dc->disk.bio_split); | 943 | dc->disk.bio_split); |
946 | 944 | ||
947 | if (!s->op.cache_bio) | 945 | if (!s->op.cache_bio) |
948 | goto out_submit; | 946 | goto out_submit; |
949 | 947 | ||
950 | s->op.cache_bio->bi_sector = miss->bi_sector; | 948 | s->op.cache_bio->bi_sector = miss->bi_sector; |
951 | s->op.cache_bio->bi_bdev = miss->bi_bdev; | 949 | s->op.cache_bio->bi_bdev = miss->bi_bdev; |
952 | s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; | 950 | s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; |
953 | 951 | ||
954 | s->op.cache_bio->bi_end_io = request_endio; | 952 | s->op.cache_bio->bi_end_io = request_endio; |
955 | s->op.cache_bio->bi_private = &s->cl; | 953 | s->op.cache_bio->bi_private = &s->cl; |
956 | 954 | ||
957 | /* btree_search_recurse()'s btree iterator is no good anymore */ | 955 | /* btree_search_recurse()'s btree iterator is no good anymore */ |
958 | ret = -EINTR; | 956 | ret = -EINTR; |
959 | if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio)) | 957 | if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio)) |
960 | goto out_put; | 958 | goto out_put; |
961 | 959 | ||
962 | bch_bio_map(s->op.cache_bio, NULL); | 960 | bch_bio_map(s->op.cache_bio, NULL); |
963 | if (bch_bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) | 961 | if (bch_bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) |
964 | goto out_put; | 962 | goto out_put; |
965 | 963 | ||
966 | s->cache_miss = miss; | 964 | s->cache_miss = miss; |
967 | bio_get(s->op.cache_bio); | 965 | bio_get(s->op.cache_bio); |
968 | 966 | ||
969 | closure_bio_submit(s->op.cache_bio, &s->cl, s->d); | 967 | closure_bio_submit(s->op.cache_bio, &s->cl, s->d); |
970 | 968 | ||
971 | return ret; | 969 | return ret; |
972 | out_put: | 970 | out_put: |
973 | bio_put(s->op.cache_bio); | 971 | bio_put(s->op.cache_bio); |
974 | s->op.cache_bio = NULL; | 972 | s->op.cache_bio = NULL; |
975 | out_submit: | 973 | out_submit: |
976 | closure_bio_submit(miss, &s->cl, s->d); | 974 | closure_bio_submit(miss, &s->cl, s->d); |
977 | return ret; | 975 | return ret; |
978 | } | 976 | } |
979 | 977 | ||
980 | static void request_read(struct cached_dev *dc, struct search *s) | 978 | static void request_read(struct cached_dev *dc, struct search *s) |
981 | { | 979 | { |
982 | struct closure *cl = &s->cl; | 980 | struct closure *cl = &s->cl; |
983 | 981 | ||
984 | check_should_skip(dc, s); | 982 | check_should_skip(dc, s); |
985 | closure_call(&s->op.cl, btree_read_async, NULL, cl); | 983 | closure_call(&s->op.cl, btree_read_async, NULL, cl); |
986 | 984 | ||
987 | continue_at(cl, request_read_done_bh, NULL); | 985 | continue_at(cl, request_read_done_bh, NULL); |
988 | } | 986 | } |
989 | 987 | ||
990 | /* Process writes */ | 988 | /* Process writes */ |
991 | 989 | ||
992 | static void cached_dev_write_complete(struct closure *cl) | 990 | static void cached_dev_write_complete(struct closure *cl) |
993 | { | 991 | { |
994 | struct search *s = container_of(cl, struct search, cl); | 992 | struct search *s = container_of(cl, struct search, cl); |
995 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | 993 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); |
996 | 994 | ||
997 | up_read_non_owner(&dc->writeback_lock); | 995 | up_read_non_owner(&dc->writeback_lock); |
998 | cached_dev_bio_complete(cl); | 996 | cached_dev_bio_complete(cl); |
999 | } | 997 | } |
1000 | 998 | ||
1001 | static bool should_writeback(struct cached_dev *dc, struct bio *bio) | ||
1002 | { | ||
1003 | unsigned threshold = (bio->bi_rw & REQ_SYNC) | ||
1004 | ? CUTOFF_WRITEBACK_SYNC | ||
1005 | : CUTOFF_WRITEBACK; | ||
1006 | |||
1007 | return !atomic_read(&dc->disk.detaching) && | ||
1008 | cache_mode(dc, bio) == CACHE_MODE_WRITEBACK && | ||
1009 | dc->disk.c->gc_stats.in_use < threshold; | ||
1010 | } | ||
1011 | |||
1012 | static void request_write(struct cached_dev *dc, struct search *s) | 999 | static void request_write(struct cached_dev *dc, struct search *s) |
1013 | { | 1000 | { |
1014 | struct closure *cl = &s->cl; | 1001 | struct closure *cl = &s->cl; |
1015 | struct bio *bio = &s->bio.bio; | 1002 | struct bio *bio = &s->bio.bio; |
1016 | struct bkey start, end; | 1003 | struct bkey start, end; |
1017 | start = KEY(dc->disk.id, bio->bi_sector, 0); | 1004 | start = KEY(dc->disk.id, bio->bi_sector, 0); |
1018 | end = KEY(dc->disk.id, bio_end(bio), 0); | 1005 | end = KEY(dc->disk.id, bio_end(bio), 0); |
1019 | 1006 | ||
1020 | bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); | 1007 | bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); |
1021 | 1008 | ||
1022 | check_should_skip(dc, s); | 1009 | check_should_skip(dc, s); |
1023 | down_read_non_owner(&dc->writeback_lock); | 1010 | down_read_non_owner(&dc->writeback_lock); |
1024 | 1011 | ||
1025 | if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { | 1012 | if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { |
1026 | s->op.skip = false; | 1013 | s->op.skip = false; |
1027 | s->writeback = true; | 1014 | s->writeback = true; |
1028 | } | 1015 | } |
1029 | 1016 | ||
1030 | if (bio->bi_rw & REQ_DISCARD) | 1017 | if (bio->bi_rw & REQ_DISCARD) |
1031 | goto skip; | 1018 | goto skip; |
1032 | 1019 | ||
1020 | if (should_writeback(dc, s->orig_bio, | ||
1021 | cache_mode(dc, bio), | ||
1022 | s->op.skip)) { | ||
1023 | s->op.skip = false; | ||
1024 | s->writeback = true; | ||
1025 | } | ||
1026 | |||
1033 | if (s->op.skip) | 1027 | if (s->op.skip) |
1034 | goto skip; | 1028 | goto skip; |
1035 | |||
1036 | if (should_writeback(dc, s->orig_bio)) | ||
1037 | s->writeback = true; | ||
1038 | 1029 | ||
1039 | trace_bcache_write(s->orig_bio, s->writeback, s->op.skip); | 1030 | trace_bcache_write(s->orig_bio, s->writeback, s->op.skip); |
1040 | 1031 | ||
1041 | if (!s->writeback) { | 1032 | if (!s->writeback) { |
1042 | s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, | 1033 | s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, |
1043 | dc->disk.bio_split); | 1034 | dc->disk.bio_split); |
1044 | 1035 | ||
1045 | closure_bio_submit(bio, cl, s->d); | 1036 | closure_bio_submit(bio, cl, s->d); |
1046 | } else { | 1037 | } else { |
1047 | s->op.cache_bio = bio; | 1038 | s->op.cache_bio = bio; |
1048 | bch_writeback_add(dc); | 1039 | bch_writeback_add(dc); |
1049 | } | 1040 | } |
1050 | out: | 1041 | out: |
1051 | closure_call(&s->op.cl, bch_insert_data, NULL, cl); | 1042 | closure_call(&s->op.cl, bch_insert_data, NULL, cl); |
1052 | continue_at(cl, cached_dev_write_complete, NULL); | 1043 | continue_at(cl, cached_dev_write_complete, NULL); |
1053 | skip: | 1044 | skip: |
1054 | s->op.skip = true; | 1045 | s->op.skip = true; |
1055 | s->op.cache_bio = s->orig_bio; | 1046 | s->op.cache_bio = s->orig_bio; |
1056 | bio_get(s->op.cache_bio); | 1047 | bio_get(s->op.cache_bio); |
1057 | 1048 | ||
1058 | if ((bio->bi_rw & REQ_DISCARD) && | 1049 | if ((bio->bi_rw & REQ_DISCARD) && |
1059 | !blk_queue_discard(bdev_get_queue(dc->bdev))) | 1050 | !blk_queue_discard(bdev_get_queue(dc->bdev))) |
1060 | goto out; | 1051 | goto out; |
1061 | 1052 | ||
1062 | closure_bio_submit(bio, cl, s->d); | 1053 | closure_bio_submit(bio, cl, s->d); |
1063 | goto out; | 1054 | goto out; |
1064 | } | 1055 | } |
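
The should_writeback() helper removed above looked only at the cache mode and the overall amount of dirty data; the new call in request_write() additionally passes the bio, the cache mode, and whether the request would otherwise be skipped, so that writes to an already-dirty stripe can be forced into writeback mode (per the commit description). The replacement lives in another file of this commit, so the following is only a hedged reconstruction from the call site; in particular the stripe-dirty helper name is an assumption.

/*
 * Hypothetical sketch of the reworked helper, reconstructed from the new
 * call site in request_write() above. bcache_dev_stripe_dirty() is an
 * assumed name for the per-stripe dirty check added by this commit.
 */
static bool should_writeback(struct cached_dev *dc, struct bio *bio,
			     unsigned cache_mode, bool would_skip)
{
	unsigned in_use = dc->disk.c->gc_stats.in_use;

	if (cache_mode != CACHE_MODE_WRITEBACK ||
	    atomic_read(&dc->disk.detaching) ||
	    in_use > CUTOFF_WRITEBACK_SYNC)
		return false;

	/* assumed: force writeback if the target stripe already holds dirty data */
	if (bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector,
				    bio_sectors(bio)))
		return true;

	if (would_skip)
		return false;

	return (bio->bi_rw & REQ_SYNC) ||
	       in_use <= CUTOFF_WRITEBACK;
}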
1065 | 1056 | ||
1066 | static void request_nodata(struct cached_dev *dc, struct search *s) | 1057 | static void request_nodata(struct cached_dev *dc, struct search *s) |
1067 | { | 1058 | { |
1068 | struct closure *cl = &s->cl; | 1059 | struct closure *cl = &s->cl; |
1069 | struct bio *bio = &s->bio.bio; | 1060 | struct bio *bio = &s->bio.bio; |
1070 | 1061 | ||
1071 | if (bio->bi_rw & REQ_DISCARD) { | 1062 | if (bio->bi_rw & REQ_DISCARD) { |
1072 | request_write(dc, s); | 1063 | request_write(dc, s); |
1073 | return; | 1064 | return; |
1074 | } | 1065 | } |
1075 | 1066 | ||
1076 | if (s->op.flush_journal) | 1067 | if (s->op.flush_journal) |
1077 | bch_journal_meta(s->op.c, cl); | 1068 | bch_journal_meta(s->op.c, cl); |
1078 | 1069 | ||
1079 | closure_bio_submit(bio, cl, s->d); | 1070 | closure_bio_submit(bio, cl, s->d); |
1080 | 1071 | ||
1081 | continue_at(cl, cached_dev_bio_complete, NULL); | 1072 | continue_at(cl, cached_dev_bio_complete, NULL); |
1082 | } | 1073 | } |
1083 | 1074 | ||
1084 | /* Cached devices - read & write stuff */ | 1075 | /* Cached devices - read & write stuff */ |
1085 | 1076 | ||
1086 | unsigned bch_get_congested(struct cache_set *c) | 1077 | unsigned bch_get_congested(struct cache_set *c) |
1087 | { | 1078 | { |
1088 | int i; | 1079 | int i; |
1089 | long rand; | 1080 | long rand; |
1090 | 1081 | ||
1091 | if (!c->congested_read_threshold_us && | 1082 | if (!c->congested_read_threshold_us && |
1092 | !c->congested_write_threshold_us) | 1083 | !c->congested_write_threshold_us) |
1093 | return 0; | 1084 | return 0; |
1094 | 1085 | ||
1095 | i = (local_clock_us() - c->congested_last_us) / 1024; | 1086 | i = (local_clock_us() - c->congested_last_us) / 1024; |
1096 | if (i < 0) | 1087 | if (i < 0) |
1097 | return 0; | 1088 | return 0; |
1098 | 1089 | ||
1099 | i += atomic_read(&c->congested); | 1090 | i += atomic_read(&c->congested); |
1100 | if (i >= 0) | 1091 | if (i >= 0) |
1101 | return 0; | 1092 | return 0; |
1102 | 1093 | ||
1103 | i += CONGESTED_MAX; | 1094 | i += CONGESTED_MAX; |
1104 | 1095 | ||
1105 | if (i > 0) | 1096 | if (i > 0) |
1106 | i = fract_exp_two(i, 6); | 1097 | i = fract_exp_two(i, 6); |
1107 | 1098 | ||
1108 | rand = get_random_int(); | 1099 | rand = get_random_int(); |
1109 | i -= bitmap_weight(&rand, BITS_PER_LONG); | 1100 | i -= bitmap_weight(&rand, BITS_PER_LONG); |
1110 | 1101 | ||
1111 | return i > 0 ? i : 1; | 1102 | return i > 0 ? i : 1; |
1112 | } | 1103 | } |
1113 | 1104 | ||
1114 | static void add_sequential(struct task_struct *t) | 1105 | static void add_sequential(struct task_struct *t) |
1115 | { | 1106 | { |
1116 | ewma_add(t->sequential_io_avg, | 1107 | ewma_add(t->sequential_io_avg, |
1117 | t->sequential_io, 8, 0); | 1108 | t->sequential_io, 8, 0); |
1118 | 1109 | ||
1119 | t->sequential_io = 0; | 1110 | t->sequential_io = 0; |
1120 | } | 1111 | } |
1121 | 1112 | ||
1122 | static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) | 1113 | static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) |
1123 | { | 1114 | { |
1124 | return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; | 1115 | return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; |
1125 | } | 1116 | } |
1126 | 1117 | ||
1127 | static void check_should_skip(struct cached_dev *dc, struct search *s) | 1118 | static void check_should_skip(struct cached_dev *dc, struct search *s) |
1128 | { | 1119 | { |
1129 | struct cache_set *c = s->op.c; | 1120 | struct cache_set *c = s->op.c; |
1130 | struct bio *bio = &s->bio.bio; | 1121 | struct bio *bio = &s->bio.bio; |
1131 | unsigned mode = cache_mode(dc, bio); | 1122 | unsigned mode = cache_mode(dc, bio); |
1132 | unsigned sectors, congested = bch_get_congested(c); | 1123 | unsigned sectors, congested = bch_get_congested(c); |
1133 | 1124 | ||
1134 | if (atomic_read(&dc->disk.detaching) || | 1125 | if (atomic_read(&dc->disk.detaching) || |
1135 | c->gc_stats.in_use > CUTOFF_CACHE_ADD || | 1126 | c->gc_stats.in_use > CUTOFF_CACHE_ADD || |
1136 | (bio->bi_rw & REQ_DISCARD)) | 1127 | (bio->bi_rw & REQ_DISCARD)) |
1137 | goto skip; | 1128 | goto skip; |
1138 | 1129 | ||
1139 | if (mode == CACHE_MODE_NONE || | 1130 | if (mode == CACHE_MODE_NONE || |
1140 | (mode == CACHE_MODE_WRITEAROUND && | 1131 | (mode == CACHE_MODE_WRITEAROUND && |
1141 | (bio->bi_rw & REQ_WRITE))) | 1132 | (bio->bi_rw & REQ_WRITE))) |
1142 | goto skip; | 1133 | goto skip; |
1143 | 1134 | ||
1144 | if (bio->bi_sector & (c->sb.block_size - 1) || | 1135 | if (bio->bi_sector & (c->sb.block_size - 1) || |
1145 | bio_sectors(bio) & (c->sb.block_size - 1)) { | 1136 | bio_sectors(bio) & (c->sb.block_size - 1)) { |
1146 | pr_debug("skipping unaligned io"); | 1137 | pr_debug("skipping unaligned io"); |
1147 | goto skip; | 1138 | goto skip; |
1148 | } | 1139 | } |
1149 | 1140 | ||
1150 | if (!congested && !dc->sequential_cutoff) | 1141 | if (!congested && !dc->sequential_cutoff) |
1151 | goto rescale; | 1142 | goto rescale; |
1152 | 1143 | ||
1153 | if (!congested && | 1144 | if (!congested && |
1154 | mode == CACHE_MODE_WRITEBACK && | 1145 | mode == CACHE_MODE_WRITEBACK && |
1155 | (bio->bi_rw & REQ_WRITE) && | 1146 | (bio->bi_rw & REQ_WRITE) && |
1156 | (bio->bi_rw & REQ_SYNC)) | 1147 | (bio->bi_rw & REQ_SYNC)) |
1157 | goto rescale; | 1148 | goto rescale; |
1158 | 1149 | ||
1159 | if (dc->sequential_merge) { | 1150 | if (dc->sequential_merge) { |
1160 | struct io *i; | 1151 | struct io *i; |
1161 | 1152 | ||
1162 | spin_lock(&dc->io_lock); | 1153 | spin_lock(&dc->io_lock); |
1163 | 1154 | ||
1164 | hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) | 1155 | hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) |
1165 | if (i->last == bio->bi_sector && | 1156 | if (i->last == bio->bi_sector && |
1166 | time_before(jiffies, i->jiffies)) | 1157 | time_before(jiffies, i->jiffies)) |
1167 | goto found; | 1158 | goto found; |
1168 | 1159 | ||
1169 | i = list_first_entry(&dc->io_lru, struct io, lru); | 1160 | i = list_first_entry(&dc->io_lru, struct io, lru); |
1170 | 1161 | ||
1171 | add_sequential(s->task); | 1162 | add_sequential(s->task); |
1172 | i->sequential = 0; | 1163 | i->sequential = 0; |
1173 | found: | 1164 | found: |
1174 | if (i->sequential + bio->bi_size > i->sequential) | 1165 | if (i->sequential + bio->bi_size > i->sequential) |
1175 | i->sequential += bio->bi_size; | 1166 | i->sequential += bio->bi_size; |
1176 | 1167 | ||
1177 | i->last = bio_end(bio); | 1168 | i->last = bio_end(bio); |
1178 | i->jiffies = jiffies + msecs_to_jiffies(5000); | 1169 | i->jiffies = jiffies + msecs_to_jiffies(5000); |
1179 | s->task->sequential_io = i->sequential; | 1170 | s->task->sequential_io = i->sequential; |
1180 | 1171 | ||
1181 | hlist_del(&i->hash); | 1172 | hlist_del(&i->hash); |
1182 | hlist_add_head(&i->hash, iohash(dc, i->last)); | 1173 | hlist_add_head(&i->hash, iohash(dc, i->last)); |
1183 | list_move_tail(&i->lru, &dc->io_lru); | 1174 | list_move_tail(&i->lru, &dc->io_lru); |
1184 | 1175 | ||
1185 | spin_unlock(&dc->io_lock); | 1176 | spin_unlock(&dc->io_lock); |
1186 | } else { | 1177 | } else { |
1187 | s->task->sequential_io = bio->bi_size; | 1178 | s->task->sequential_io = bio->bi_size; |
1188 | 1179 | ||
1189 | add_sequential(s->task); | 1180 | add_sequential(s->task); |
1190 | } | 1181 | } |
1191 | 1182 | ||
1192 | sectors = max(s->task->sequential_io, | 1183 | sectors = max(s->task->sequential_io, |
1193 | s->task->sequential_io_avg) >> 9; | 1184 | s->task->sequential_io_avg) >> 9; |
1194 | 1185 | ||
1195 | if (dc->sequential_cutoff && | 1186 | if (dc->sequential_cutoff && |
1196 | sectors >= dc->sequential_cutoff >> 9) { | 1187 | sectors >= dc->sequential_cutoff >> 9) { |
1197 | trace_bcache_bypass_sequential(s->orig_bio); | 1188 | trace_bcache_bypass_sequential(s->orig_bio); |
1198 | goto skip; | 1189 | goto skip; |
1199 | } | 1190 | } |
1200 | 1191 | ||
1201 | if (congested && sectors >= congested) { | 1192 | if (congested && sectors >= congested) { |
1202 | trace_bcache_bypass_congested(s->orig_bio); | 1193 | trace_bcache_bypass_congested(s->orig_bio); |
1203 | goto skip; | 1194 | goto skip; |
1204 | } | 1195 | } |
1205 | 1196 | ||
1206 | rescale: | 1197 | rescale: |
1207 | bch_rescale_priorities(c, bio_sectors(bio)); | 1198 | bch_rescale_priorities(c, bio_sectors(bio)); |
1208 | return; | 1199 | return; |
1209 | skip: | 1200 | skip: |
1210 | bch_mark_sectors_bypassed(s, bio_sectors(bio)); | 1201 | bch_mark_sectors_bypassed(s, bio_sectors(bio)); |
1211 | s->op.skip = true; | 1202 | s->op.skip = true; |
1212 | } | 1203 | } |
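
check_should_skip() above detects sequential streams by hashing recent IOs on the sector where they ended and re-hashing each entry as the run grows; once the accumulated run passes sequential_cutoff the request bypasses the cache. Below is a minimal standalone model of that idea, assuming a fixed-size slot array instead of the kernel's hlist/LRU and omitting the 5-second expiry; all names are illustrative.

/* Standalone model of sequential-IO detection (not kernel code). */
#include <stdbool.h>
#include <stdint.h>

#define RECENT_SLOTS 8

struct recent_io {
	uint64_t last_end;	/* sector where a tracked run currently ends */
	uint64_t sequential;	/* accumulated bytes of that run */
};

static struct recent_io recent[RECENT_SLOTS];

static bool model_is_sequential(uint64_t sector, uint64_t bytes,
				uint64_t cutoff_bytes)
{
	/* look up by start sector: does some tracked run end exactly here? */
	struct recent_io *io = &recent[sector % RECENT_SLOTS];
	uint64_t run = bytes;

	if (io->last_end && io->last_end == sector)
		run += io->sequential;		/* continuation of a known run */

	/* store the grown run keyed by its new end sector, like the kernel's re-hash */
	uint64_t end = sector + (bytes >> 9);
	struct recent_io *next = &recent[end % RECENT_SLOTS];
	next->last_end = end;
	next->sequential = run;

	return cutoff_bytes && run >= cutoff_bytes;
}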
1213 | 1204 | ||
1214 | static void cached_dev_make_request(struct request_queue *q, struct bio *bio) | 1205 | static void cached_dev_make_request(struct request_queue *q, struct bio *bio) |
1215 | { | 1206 | { |
1216 | struct search *s; | 1207 | struct search *s; |
1217 | struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; | 1208 | struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; |
1218 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); | 1209 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); |
1219 | int cpu, rw = bio_data_dir(bio); | 1210 | int cpu, rw = bio_data_dir(bio); |
1220 | 1211 | ||
1221 | cpu = part_stat_lock(); | 1212 | cpu = part_stat_lock(); |
1222 | part_stat_inc(cpu, &d->disk->part0, ios[rw]); | 1213 | part_stat_inc(cpu, &d->disk->part0, ios[rw]); |
1223 | part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); | 1214 | part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); |
1224 | part_stat_unlock(); | 1215 | part_stat_unlock(); |
1225 | 1216 | ||
1226 | bio->bi_bdev = dc->bdev; | 1217 | bio->bi_bdev = dc->bdev; |
1227 | bio->bi_sector += dc->sb.data_offset; | 1218 | bio->bi_sector += dc->sb.data_offset; |
1228 | 1219 | ||
1229 | if (cached_dev_get(dc)) { | 1220 | if (cached_dev_get(dc)) { |
1230 | s = search_alloc(bio, d); | 1221 | s = search_alloc(bio, d); |
1231 | trace_bcache_request_start(s, bio); | 1222 | trace_bcache_request_start(s, bio); |
1232 | 1223 | ||
1233 | if (!bio_has_data(bio)) | 1224 | if (!bio_has_data(bio)) |
1234 | request_nodata(dc, s); | 1225 | request_nodata(dc, s); |
1235 | else if (rw) | 1226 | else if (rw) |
1236 | request_write(dc, s); | 1227 | request_write(dc, s); |
1237 | else | 1228 | else |
1238 | request_read(dc, s); | 1229 | request_read(dc, s); |
1239 | } else { | 1230 | } else { |
1240 | if ((bio->bi_rw & REQ_DISCARD) && | 1231 | if ((bio->bi_rw & REQ_DISCARD) && |
1241 | !blk_queue_discard(bdev_get_queue(dc->bdev))) | 1232 | !blk_queue_discard(bdev_get_queue(dc->bdev))) |
1242 | bio_endio(bio, 0); | 1233 | bio_endio(bio, 0); |
1243 | else | 1234 | else |
1244 | bch_generic_make_request(bio, &d->bio_split_hook); | 1235 | bch_generic_make_request(bio, &d->bio_split_hook); |
1245 | } | 1236 | } |
1246 | } | 1237 | } |
1247 | 1238 | ||
1248 | static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, | 1239 | static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, |
1249 | unsigned int cmd, unsigned long arg) | 1240 | unsigned int cmd, unsigned long arg) |
1250 | { | 1241 | { |
1251 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); | 1242 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); |
1252 | return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); | 1243 | return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); |
1253 | } | 1244 | } |
1254 | 1245 | ||
1255 | static int cached_dev_congested(void *data, int bits) | 1246 | static int cached_dev_congested(void *data, int bits) |
1256 | { | 1247 | { |
1257 | struct bcache_device *d = data; | 1248 | struct bcache_device *d = data; |
1258 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); | 1249 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); |
1259 | struct request_queue *q = bdev_get_queue(dc->bdev); | 1250 | struct request_queue *q = bdev_get_queue(dc->bdev); |
1260 | int ret = 0; | 1251 | int ret = 0; |
1261 | 1252 | ||
1262 | if (bdi_congested(&q->backing_dev_info, bits)) | 1253 | if (bdi_congested(&q->backing_dev_info, bits)) |
1263 | return 1; | 1254 | return 1; |
1264 | 1255 | ||
1265 | if (cached_dev_get(dc)) { | 1256 | if (cached_dev_get(dc)) { |
1266 | unsigned i; | 1257 | unsigned i; |
1267 | struct cache *ca; | 1258 | struct cache *ca; |
1268 | 1259 | ||
1269 | for_each_cache(ca, d->c, i) { | 1260 | for_each_cache(ca, d->c, i) { |
1270 | q = bdev_get_queue(ca->bdev); | 1261 | q = bdev_get_queue(ca->bdev); |
1271 | ret |= bdi_congested(&q->backing_dev_info, bits); | 1262 | ret |= bdi_congested(&q->backing_dev_info, bits); |
1272 | } | 1263 | } |
1273 | 1264 | ||
1274 | cached_dev_put(dc); | 1265 | cached_dev_put(dc); |
1275 | } | 1266 | } |
1276 | 1267 | ||
1277 | return ret; | 1268 | return ret; |
1278 | } | 1269 | } |
1279 | 1270 | ||
1280 | void bch_cached_dev_request_init(struct cached_dev *dc) | 1271 | void bch_cached_dev_request_init(struct cached_dev *dc) |
1281 | { | 1272 | { |
1282 | struct gendisk *g = dc->disk.disk; | 1273 | struct gendisk *g = dc->disk.disk; |
1283 | 1274 | ||
1284 | g->queue->make_request_fn = cached_dev_make_request; | 1275 | g->queue->make_request_fn = cached_dev_make_request; |
1285 | g->queue->backing_dev_info.congested_fn = cached_dev_congested; | 1276 | g->queue->backing_dev_info.congested_fn = cached_dev_congested; |
1286 | dc->disk.cache_miss = cached_dev_cache_miss; | 1277 | dc->disk.cache_miss = cached_dev_cache_miss; |
1287 | dc->disk.ioctl = cached_dev_ioctl; | 1278 | dc->disk.ioctl = cached_dev_ioctl; |
1288 | } | 1279 | } |
1289 | 1280 | ||
1290 | /* Flash backed devices */ | 1281 | /* Flash backed devices */ |
1291 | 1282 | ||
1292 | static int flash_dev_cache_miss(struct btree *b, struct search *s, | 1283 | static int flash_dev_cache_miss(struct btree *b, struct search *s, |
1293 | struct bio *bio, unsigned sectors) | 1284 | struct bio *bio, unsigned sectors) |
1294 | { | 1285 | { |
1295 | /* Zero fill bio */ | 1286 | /* Zero fill bio */ |
1296 | 1287 | ||
1297 | while (bio->bi_idx != bio->bi_vcnt) { | 1288 | while (bio->bi_idx != bio->bi_vcnt) { |
1298 | struct bio_vec *bv = bio_iovec(bio); | 1289 | struct bio_vec *bv = bio_iovec(bio); |
1299 | unsigned j = min(bv->bv_len >> 9, sectors); | 1290 | unsigned j = min(bv->bv_len >> 9, sectors); |
1300 | 1291 | ||
1301 | void *p = kmap(bv->bv_page); | 1292 | void *p = kmap(bv->bv_page); |
1302 | memset(p + bv->bv_offset, 0, j << 9); | 1293 | memset(p + bv->bv_offset, 0, j << 9); |
1303 | kunmap(bv->bv_page); | 1294 | kunmap(bv->bv_page); |
1304 | 1295 | ||
1305 | bv->bv_len -= j << 9; | 1296 | bv->bv_len -= j << 9; |
1306 | bv->bv_offset += j << 9; | 1297 | bv->bv_offset += j << 9; |
1307 | 1298 | ||
1308 | if (bv->bv_len) | 1299 | if (bv->bv_len) |
1309 | return 0; | 1300 | return 0; |
1310 | 1301 | ||
1311 | bio->bi_sector += j; | 1302 | bio->bi_sector += j; |
1312 | bio->bi_size -= j << 9; | 1303 | bio->bi_size -= j << 9; |
1313 | 1304 | ||
1314 | bio->bi_idx++; | 1305 | bio->bi_idx++; |
1315 | sectors -= j; | 1306 | sectors -= j; |
1316 | } | 1307 | } |
1317 | 1308 | ||
1318 | s->op.lookup_done = true; | 1309 | s->op.lookup_done = true; |
1319 | 1310 | ||
1320 | return 0; | 1311 | return 0; |
1321 | } | 1312 | } |
1322 | 1313 | ||
1323 | static void flash_dev_make_request(struct request_queue *q, struct bio *bio) | 1314 | static void flash_dev_make_request(struct request_queue *q, struct bio *bio) |
1324 | { | 1315 | { |
1325 | struct search *s; | 1316 | struct search *s; |
1326 | struct closure *cl; | 1317 | struct closure *cl; |
1327 | struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; | 1318 | struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; |
1328 | int cpu, rw = bio_data_dir(bio); | 1319 | int cpu, rw = bio_data_dir(bio); |
1329 | 1320 | ||
1330 | cpu = part_stat_lock(); | 1321 | cpu = part_stat_lock(); |
1331 | part_stat_inc(cpu, &d->disk->part0, ios[rw]); | 1322 | part_stat_inc(cpu, &d->disk->part0, ios[rw]); |
1332 | part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); | 1323 | part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); |
1333 | part_stat_unlock(); | 1324 | part_stat_unlock(); |
1334 | 1325 | ||
1335 | s = search_alloc(bio, d); | 1326 | s = search_alloc(bio, d); |
1336 | cl = &s->cl; | 1327 | cl = &s->cl; |
1337 | bio = &s->bio.bio; | 1328 | bio = &s->bio.bio; |
1338 | 1329 | ||
1339 | trace_bcache_request_start(s, bio); | 1330 | trace_bcache_request_start(s, bio); |
1340 | 1331 | ||
1341 | if (bio_has_data(bio) && !rw) { | 1332 | if (bio_has_data(bio) && !rw) { |
1342 | closure_call(&s->op.cl, btree_read_async, NULL, cl); | 1333 | closure_call(&s->op.cl, btree_read_async, NULL, cl); |
1343 | } else if (bio_has_data(bio) || s->op.skip) { | 1334 | } else if (bio_has_data(bio) || s->op.skip) { |
1344 | bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, | 1335 | bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, |
1345 | &KEY(d->id, bio->bi_sector, 0), | 1336 | &KEY(d->id, bio->bi_sector, 0), |
1346 | &KEY(d->id, bio_end(bio), 0)); | 1337 | &KEY(d->id, bio_end(bio), 0)); |
1347 | 1338 | ||
1348 | s->writeback = true; | 1339 | s->writeback = true; |
1349 | s->op.cache_bio = bio; | 1340 | s->op.cache_bio = bio; |
1350 | 1341 | ||
1351 | closure_call(&s->op.cl, bch_insert_data, NULL, cl); | 1342 | closure_call(&s->op.cl, bch_insert_data, NULL, cl); |
1352 | } else { | 1343 | } else { |
1353 | /* No data - probably a cache flush */ | 1344 | /* No data - probably a cache flush */ |
1354 | if (s->op.flush_journal) | 1345 | if (s->op.flush_journal) |
1355 | bch_journal_meta(s->op.c, cl); | 1346 | bch_journal_meta(s->op.c, cl); |
1356 | } | 1347 | } |
1357 | 1348 | ||
1358 | continue_at(cl, search_free, NULL); | 1349 | continue_at(cl, search_free, NULL); |
1359 | } | 1350 | } |
1360 | 1351 | ||
1361 | static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode, | 1352 | static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode, |
1362 | unsigned int cmd, unsigned long arg) | 1353 | unsigned int cmd, unsigned long arg) |
1363 | { | 1354 | { |
1364 | return -ENOTTY; | 1355 | return -ENOTTY; |
1365 | } | 1356 | } |
1366 | 1357 | ||
1367 | static int flash_dev_congested(void *data, int bits) | 1358 | static int flash_dev_congested(void *data, int bits) |
1368 | { | 1359 | { |
1369 | struct bcache_device *d = data; | 1360 | struct bcache_device *d = data; |
1370 | struct request_queue *q; | 1361 | struct request_queue *q; |
1371 | struct cache *ca; | 1362 | struct cache *ca; |
1372 | unsigned i; | 1363 | unsigned i; |
1373 | int ret = 0; | 1364 | int ret = 0; |
1374 | 1365 | ||
1375 | for_each_cache(ca, d->c, i) { | 1366 | for_each_cache(ca, d->c, i) { |
1376 | q = bdev_get_queue(ca->bdev); | 1367 | q = bdev_get_queue(ca->bdev); |
1377 | ret |= bdi_congested(&q->backing_dev_info, bits); | 1368 | ret |= bdi_congested(&q->backing_dev_info, bits); |
1378 | } | 1369 | } |
1379 | 1370 | ||
1380 | return ret; | 1371 | return ret; |
1381 | } | 1372 | } |
1382 | 1373 | ||
1383 | void bch_flash_dev_request_init(struct bcache_device *d) | 1374 | void bch_flash_dev_request_init(struct bcache_device *d) |
1384 | { | 1375 | { |
1385 | struct gendisk *g = d->disk; | 1376 | struct gendisk *g = d->disk; |
1386 | 1377 | ||
1387 | g->queue->make_request_fn = flash_dev_make_request; | 1378 | g->queue->make_request_fn = flash_dev_make_request; |
1388 | g->queue->backing_dev_info.congested_fn = flash_dev_congested; | 1379 | g->queue->backing_dev_info.congested_fn = flash_dev_congested; |
1389 | d->cache_miss = flash_dev_cache_miss; | 1380 | d->cache_miss = flash_dev_cache_miss; |
1390 | d->ioctl = flash_dev_ioctl; | 1381 | d->ioctl = flash_dev_ioctl; |
1391 | } | 1382 | } |
1392 | 1383 | ||
1393 | void bch_request_exit(void) | 1384 | void bch_request_exit(void) |
1394 | { | 1385 | { |
1395 | #ifdef CONFIG_CGROUP_BCACHE | 1386 | #ifdef CONFIG_CGROUP_BCACHE |
1396 | cgroup_unload_subsys(&bcache_subsys); | 1387 | cgroup_unload_subsys(&bcache_subsys); |
1397 | #endif | 1388 | #endif |
1398 | if (bch_search_cache) | 1389 | if (bch_search_cache) |
1399 | kmem_cache_destroy(bch_search_cache); | 1390 | kmem_cache_destroy(bch_search_cache); |
1400 | } | 1391 | } |
1401 | 1392 | ||
1402 | int __init bch_request_init(void) | 1393 | int __init bch_request_init(void) |
1403 | { | 1394 | { |
1404 | bch_search_cache = KMEM_CACHE(search, 0); | 1395 | bch_search_cache = KMEM_CACHE(search, 0); |
1405 | if (!bch_search_cache) | 1396 | if (!bch_search_cache) |
1406 | return -ENOMEM; | 1397 | return -ENOMEM; |
1407 | 1398 | ||
1408 | #ifdef CONFIG_CGROUP_BCACHE | 1399 | #ifdef CONFIG_CGROUP_BCACHE |
1409 | cgroup_load_subsys(&bcache_subsys); | 1400 | cgroup_load_subsys(&bcache_subsys); |
drivers/md/bcache/sysfs.c
1 | /* | 1 | /* |
2 | * bcache sysfs interfaces | 2 | * bcache sysfs interfaces |
3 | * | 3 | * |
4 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | 4 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> |
5 | * Copyright 2012 Google, Inc. | 5 | * Copyright 2012 Google, Inc. |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include "bcache.h" | 8 | #include "bcache.h" |
9 | #include "sysfs.h" | 9 | #include "sysfs.h" |
10 | #include "btree.h" | 10 | #include "btree.h" |
11 | #include "request.h" | 11 | #include "request.h" |
12 | #include "writeback.h" | 12 | #include "writeback.h" |
13 | 13 | ||
14 | #include <linux/blkdev.h> | 14 | #include <linux/blkdev.h> |
15 | #include <linux/sort.h> | 15 | #include <linux/sort.h> |
16 | 16 | ||
17 | static const char * const cache_replacement_policies[] = { | 17 | static const char * const cache_replacement_policies[] = { |
18 | "lru", | 18 | "lru", |
19 | "fifo", | 19 | "fifo", |
20 | "random", | 20 | "random", |
21 | NULL | 21 | NULL |
22 | }; | 22 | }; |
23 | 23 | ||
24 | write_attribute(attach); | 24 | write_attribute(attach); |
25 | write_attribute(detach); | 25 | write_attribute(detach); |
26 | write_attribute(unregister); | 26 | write_attribute(unregister); |
27 | write_attribute(stop); | 27 | write_attribute(stop); |
28 | write_attribute(clear_stats); | 28 | write_attribute(clear_stats); |
29 | write_attribute(trigger_gc); | 29 | write_attribute(trigger_gc); |
30 | write_attribute(prune_cache); | 30 | write_attribute(prune_cache); |
31 | write_attribute(flash_vol_create); | 31 | write_attribute(flash_vol_create); |
32 | 32 | ||
33 | read_attribute(bucket_size); | 33 | read_attribute(bucket_size); |
34 | read_attribute(block_size); | 34 | read_attribute(block_size); |
35 | read_attribute(nbuckets); | 35 | read_attribute(nbuckets); |
36 | read_attribute(tree_depth); | 36 | read_attribute(tree_depth); |
37 | read_attribute(root_usage_percent); | 37 | read_attribute(root_usage_percent); |
38 | read_attribute(priority_stats); | 38 | read_attribute(priority_stats); |
39 | read_attribute(btree_cache_size); | 39 | read_attribute(btree_cache_size); |
40 | read_attribute(btree_cache_max_chain); | 40 | read_attribute(btree_cache_max_chain); |
41 | read_attribute(cache_available_percent); | 41 | read_attribute(cache_available_percent); |
42 | read_attribute(written); | 42 | read_attribute(written); |
43 | read_attribute(btree_written); | 43 | read_attribute(btree_written); |
44 | read_attribute(metadata_written); | 44 | read_attribute(metadata_written); |
45 | read_attribute(active_journal_entries); | 45 | read_attribute(active_journal_entries); |
46 | 46 | ||
47 | sysfs_time_stats_attribute(btree_gc, sec, ms); | 47 | sysfs_time_stats_attribute(btree_gc, sec, ms); |
48 | sysfs_time_stats_attribute(btree_split, sec, us); | 48 | sysfs_time_stats_attribute(btree_split, sec, us); |
49 | sysfs_time_stats_attribute(btree_sort, ms, us); | 49 | sysfs_time_stats_attribute(btree_sort, ms, us); |
50 | sysfs_time_stats_attribute(btree_read, ms, us); | 50 | sysfs_time_stats_attribute(btree_read, ms, us); |
51 | sysfs_time_stats_attribute(try_harder, ms, us); | 51 | sysfs_time_stats_attribute(try_harder, ms, us); |
52 | 52 | ||
53 | read_attribute(btree_nodes); | 53 | read_attribute(btree_nodes); |
54 | read_attribute(btree_used_percent); | 54 | read_attribute(btree_used_percent); |
55 | read_attribute(average_key_size); | 55 | read_attribute(average_key_size); |
56 | read_attribute(dirty_data); | 56 | read_attribute(dirty_data); |
57 | read_attribute(bset_tree_stats); | 57 | read_attribute(bset_tree_stats); |
58 | 58 | ||
59 | read_attribute(state); | 59 | read_attribute(state); |
60 | read_attribute(cache_read_races); | 60 | read_attribute(cache_read_races); |
61 | read_attribute(writeback_keys_done); | 61 | read_attribute(writeback_keys_done); |
62 | read_attribute(writeback_keys_failed); | 62 | read_attribute(writeback_keys_failed); |
63 | read_attribute(io_errors); | 63 | read_attribute(io_errors); |
64 | read_attribute(congested); | 64 | read_attribute(congested); |
65 | rw_attribute(congested_read_threshold_us); | 65 | rw_attribute(congested_read_threshold_us); |
66 | rw_attribute(congested_write_threshold_us); | 66 | rw_attribute(congested_write_threshold_us); |
67 | 67 | ||
68 | rw_attribute(sequential_cutoff); | 68 | rw_attribute(sequential_cutoff); |
69 | rw_attribute(sequential_merge); | 69 | rw_attribute(sequential_merge); |
70 | rw_attribute(data_csum); | 70 | rw_attribute(data_csum); |
71 | rw_attribute(cache_mode); | 71 | rw_attribute(cache_mode); |
72 | rw_attribute(writeback_metadata); | 72 | rw_attribute(writeback_metadata); |
73 | rw_attribute(writeback_running); | 73 | rw_attribute(writeback_running); |
74 | rw_attribute(writeback_percent); | 74 | rw_attribute(writeback_percent); |
75 | rw_attribute(writeback_delay); | 75 | rw_attribute(writeback_delay); |
76 | rw_attribute(writeback_rate); | 76 | rw_attribute(writeback_rate); |
77 | 77 | ||
78 | rw_attribute(writeback_rate_update_seconds); | 78 | rw_attribute(writeback_rate_update_seconds); |
79 | rw_attribute(writeback_rate_d_term); | 79 | rw_attribute(writeback_rate_d_term); |
80 | rw_attribute(writeback_rate_p_term_inverse); | 80 | rw_attribute(writeback_rate_p_term_inverse); |
81 | rw_attribute(writeback_rate_d_smooth); | 81 | rw_attribute(writeback_rate_d_smooth); |
82 | read_attribute(writeback_rate_debug); | 82 | read_attribute(writeback_rate_debug); |
83 | 83 | ||
84 | read_attribute(stripe_size); | ||
85 | read_attribute(partial_stripes_expensive); | ||
86 | |||
84 | rw_attribute(synchronous); | 87 | rw_attribute(synchronous); |
85 | rw_attribute(journal_delay_ms); | 88 | rw_attribute(journal_delay_ms); |
86 | rw_attribute(discard); | 89 | rw_attribute(discard); |
87 | rw_attribute(running); | 90 | rw_attribute(running); |
88 | rw_attribute(label); | 91 | rw_attribute(label); |
89 | rw_attribute(readahead); | 92 | rw_attribute(readahead); |
90 | rw_attribute(io_error_limit); | 93 | rw_attribute(io_error_limit); |
91 | rw_attribute(io_error_halflife); | 94 | rw_attribute(io_error_halflife); |
92 | rw_attribute(verify); | 95 | rw_attribute(verify); |
93 | rw_attribute(key_merging_disabled); | 96 | rw_attribute(key_merging_disabled); |
94 | rw_attribute(gc_always_rewrite); | 97 | rw_attribute(gc_always_rewrite); |
95 | rw_attribute(freelist_percent); | 98 | rw_attribute(freelist_percent); |
96 | rw_attribute(cache_replacement_policy); | 99 | rw_attribute(cache_replacement_policy); |
97 | rw_attribute(btree_shrinker_disabled); | 100 | rw_attribute(btree_shrinker_disabled); |
98 | rw_attribute(copy_gc_enabled); | 101 | rw_attribute(copy_gc_enabled); |
99 | rw_attribute(size); | 102 | rw_attribute(size); |
100 | 103 | ||
101 | SHOW(__bch_cached_dev) | 104 | SHOW(__bch_cached_dev) |
102 | { | 105 | { |
103 | struct cached_dev *dc = container_of(kobj, struct cached_dev, | 106 | struct cached_dev *dc = container_of(kobj, struct cached_dev, |
104 | disk.kobj); | 107 | disk.kobj); |
105 | const char *states[] = { "no cache", "clean", "dirty", "inconsistent" }; | 108 | const char *states[] = { "no cache", "clean", "dirty", "inconsistent" }; |
106 | 109 | ||
107 | #define var(stat) (dc->stat) | 110 | #define var(stat) (dc->stat) |
108 | 111 | ||
109 | if (attr == &sysfs_cache_mode) | 112 | if (attr == &sysfs_cache_mode) |
110 | return bch_snprint_string_list(buf, PAGE_SIZE, | 113 | return bch_snprint_string_list(buf, PAGE_SIZE, |
111 | bch_cache_modes + 1, | 114 | bch_cache_modes + 1, |
112 | BDEV_CACHE_MODE(&dc->sb)); | 115 | BDEV_CACHE_MODE(&dc->sb)); |
113 | 116 | ||
114 | sysfs_printf(data_csum, "%i", dc->disk.data_csum); | 117 | sysfs_printf(data_csum, "%i", dc->disk.data_csum); |
115 | var_printf(verify, "%i"); | 118 | var_printf(verify, "%i"); |
116 | var_printf(writeback_metadata, "%i"); | 119 | var_printf(writeback_metadata, "%i"); |
117 | var_printf(writeback_running, "%i"); | 120 | var_printf(writeback_running, "%i"); |
118 | var_print(writeback_delay); | 121 | var_print(writeback_delay); |
119 | var_print(writeback_percent); | 122 | var_print(writeback_percent); |
120 | sysfs_print(writeback_rate, dc->writeback_rate.rate); | 123 | sysfs_print(writeback_rate, dc->writeback_rate.rate); |
121 | 124 | ||
122 | var_print(writeback_rate_update_seconds); | 125 | var_print(writeback_rate_update_seconds); |
123 | var_print(writeback_rate_d_term); | 126 | var_print(writeback_rate_d_term); |
124 | var_print(writeback_rate_p_term_inverse); | 127 | var_print(writeback_rate_p_term_inverse); |
125 | var_print(writeback_rate_d_smooth); | 128 | var_print(writeback_rate_d_smooth); |
126 | 129 | ||
127 | if (attr == &sysfs_writeback_rate_debug) { | 130 | if (attr == &sysfs_writeback_rate_debug) { |
128 | char dirty[20]; | 131 | char dirty[20]; |
129 | char derivative[20]; | 132 | char derivative[20]; |
130 | char target[20]; | 133 | char target[20]; |
131 | bch_hprint(dirty, | 134 | bch_hprint(dirty, |
132 | bcache_dev_sectors_dirty(&dc->disk) << 9); | 135 | bcache_dev_sectors_dirty(&dc->disk) << 9); |
133 | bch_hprint(derivative, dc->writeback_rate_derivative << 9); | 136 | bch_hprint(derivative, dc->writeback_rate_derivative << 9); |
134 | bch_hprint(target, dc->writeback_rate_target << 9); | 137 | bch_hprint(target, dc->writeback_rate_target << 9); |
135 | 138 | ||
136 | return sprintf(buf, | 139 | return sprintf(buf, |
137 | "rate:\t\t%u\n" | 140 | "rate:\t\t%u\n" |
138 | "change:\t\t%i\n" | 141 | "change:\t\t%i\n" |
139 | "dirty:\t\t%s\n" | 142 | "dirty:\t\t%s\n" |
140 | "derivative:\t%s\n" | 143 | "derivative:\t%s\n" |
141 | "target:\t\t%s\n", | 144 | "target:\t\t%s\n", |
142 | dc->writeback_rate.rate, | 145 | dc->writeback_rate.rate, |
143 | dc->writeback_rate_change, | 146 | dc->writeback_rate_change, |
144 | dirty, derivative, target); | 147 | dirty, derivative, target); |
145 | } | 148 | } |
146 | 149 | ||
147 | sysfs_hprint(dirty_data, | 150 | sysfs_hprint(dirty_data, |
148 | bcache_dev_sectors_dirty(&dc->disk) << 9); | 151 | bcache_dev_sectors_dirty(&dc->disk) << 9); |
149 | 152 | ||
153 | sysfs_hprint(stripe_size, (1 << dc->disk.stripe_size_bits) << 9); | ||
154 | var_printf(partial_stripes_expensive, "%u"); | ||
155 | |||
150 | var_printf(sequential_merge, "%i"); | 156 | var_printf(sequential_merge, "%i"); |
151 | var_hprint(sequential_cutoff); | 157 | var_hprint(sequential_cutoff); |
152 | var_hprint(readahead); | 158 | var_hprint(readahead); |
153 | 159 | ||
154 | sysfs_print(running, atomic_read(&dc->running)); | 160 | sysfs_print(running, atomic_read(&dc->running)); |
155 | sysfs_print(state, states[BDEV_STATE(&dc->sb)]); | 161 | sysfs_print(state, states[BDEV_STATE(&dc->sb)]); |
156 | 162 | ||
157 | if (attr == &sysfs_label) { | 163 | if (attr == &sysfs_label) { |
158 | memcpy(buf, dc->sb.label, SB_LABEL_SIZE); | 164 | memcpy(buf, dc->sb.label, SB_LABEL_SIZE); |
159 | buf[SB_LABEL_SIZE + 1] = '\0'; | 165 | buf[SB_LABEL_SIZE + 1] = '\0'; |
160 | strcat(buf, "\n"); | 166 | strcat(buf, "\n"); |
161 | return strlen(buf); | 167 | return strlen(buf); |
162 | } | 168 | } |
163 | 169 | ||
164 | #undef var | 170 | #undef var |
165 | return 0; | 171 | return 0; |
166 | } | 172 | } |
167 | SHOW_LOCKED(bch_cached_dev) | 173 | SHOW_LOCKED(bch_cached_dev) |
168 | 174 | ||
169 | STORE(__cached_dev) | 175 | STORE(__cached_dev) |
170 | { | 176 | { |
171 | struct cached_dev *dc = container_of(kobj, struct cached_dev, | 177 | struct cached_dev *dc = container_of(kobj, struct cached_dev, |
172 | disk.kobj); | 178 | disk.kobj); |
173 | unsigned v = size; | 179 | unsigned v = size; |
174 | struct cache_set *c; | 180 | struct cache_set *c; |
175 | 181 | ||
176 | #define d_strtoul(var) sysfs_strtoul(var, dc->var) | 182 | #define d_strtoul(var) sysfs_strtoul(var, dc->var) |
177 | #define d_strtoi_h(var) sysfs_hatoi(var, dc->var) | 183 | #define d_strtoi_h(var) sysfs_hatoi(var, dc->var) |
178 | 184 | ||
179 | sysfs_strtoul(data_csum, dc->disk.data_csum); | 185 | sysfs_strtoul(data_csum, dc->disk.data_csum); |
180 | d_strtoul(verify); | 186 | d_strtoul(verify); |
181 | d_strtoul(writeback_metadata); | 187 | d_strtoul(writeback_metadata); |
182 | d_strtoul(writeback_running); | 188 | d_strtoul(writeback_running); |
183 | d_strtoul(writeback_delay); | 189 | d_strtoul(writeback_delay); |
184 | sysfs_strtoul_clamp(writeback_rate, | 190 | sysfs_strtoul_clamp(writeback_rate, |
185 | dc->writeback_rate.rate, 1, 1000000); | 191 | dc->writeback_rate.rate, 1, 1000000); |
186 | sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); | 192 | sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); |
187 | 193 | ||
188 | d_strtoul(writeback_rate_update_seconds); | 194 | d_strtoul(writeback_rate_update_seconds); |
189 | d_strtoul(writeback_rate_d_term); | 195 | d_strtoul(writeback_rate_d_term); |
190 | d_strtoul(writeback_rate_p_term_inverse); | 196 | d_strtoul(writeback_rate_p_term_inverse); |
191 | sysfs_strtoul_clamp(writeback_rate_p_term_inverse, | 197 | sysfs_strtoul_clamp(writeback_rate_p_term_inverse, |
192 | dc->writeback_rate_p_term_inverse, 1, INT_MAX); | 198 | dc->writeback_rate_p_term_inverse, 1, INT_MAX); |
193 | d_strtoul(writeback_rate_d_smooth); | 199 | d_strtoul(writeback_rate_d_smooth); |
194 | 200 | ||
195 | d_strtoul(sequential_merge); | 201 | d_strtoul(sequential_merge); |
196 | d_strtoi_h(sequential_cutoff); | 202 | d_strtoi_h(sequential_cutoff); |
197 | d_strtoi_h(readahead); | 203 | d_strtoi_h(readahead); |
198 | 204 | ||
199 | if (attr == &sysfs_clear_stats) | 205 | if (attr == &sysfs_clear_stats) |
200 | bch_cache_accounting_clear(&dc->accounting); | 206 | bch_cache_accounting_clear(&dc->accounting); |
201 | 207 | ||
202 | if (attr == &sysfs_running && | 208 | if (attr == &sysfs_running && |
203 | strtoul_or_return(buf)) | 209 | strtoul_or_return(buf)) |
204 | bch_cached_dev_run(dc); | 210 | bch_cached_dev_run(dc); |
205 | 211 | ||
206 | if (attr == &sysfs_cache_mode) { | 212 | if (attr == &sysfs_cache_mode) { |
207 | ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1); | 213 | ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1); |
208 | 214 | ||
209 | if (v < 0) | 215 | if (v < 0) |
210 | return v; | 216 | return v; |
211 | 217 | ||
212 | if ((unsigned) v != BDEV_CACHE_MODE(&dc->sb)) { | 218 | if ((unsigned) v != BDEV_CACHE_MODE(&dc->sb)) { |
213 | SET_BDEV_CACHE_MODE(&dc->sb, v); | 219 | SET_BDEV_CACHE_MODE(&dc->sb, v); |
214 | bch_write_bdev_super(dc, NULL); | 220 | bch_write_bdev_super(dc, NULL); |
215 | } | 221 | } |
216 | } | 222 | } |
217 | 223 | ||
218 | if (attr == &sysfs_label) { | 224 | if (attr == &sysfs_label) { |
219 | memcpy(dc->sb.label, buf, SB_LABEL_SIZE); | 225 | memcpy(dc->sb.label, buf, SB_LABEL_SIZE); |
220 | bch_write_bdev_super(dc, NULL); | 226 | bch_write_bdev_super(dc, NULL); |
221 | if (dc->disk.c) { | 227 | if (dc->disk.c) { |
222 | memcpy(dc->disk.c->uuids[dc->disk.id].label, | 228 | memcpy(dc->disk.c->uuids[dc->disk.id].label, |
223 | buf, SB_LABEL_SIZE); | 229 | buf, SB_LABEL_SIZE); |
224 | bch_uuid_write(dc->disk.c); | 230 | bch_uuid_write(dc->disk.c); |
225 | } | 231 | } |
226 | } | 232 | } |
227 | 233 | ||
228 | if (attr == &sysfs_attach) { | 234 | if (attr == &sysfs_attach) { |
229 | if (bch_parse_uuid(buf, dc->sb.set_uuid) < 16) | 235 | if (bch_parse_uuid(buf, dc->sb.set_uuid) < 16) |
230 | return -EINVAL; | 236 | return -EINVAL; |
231 | 237 | ||
232 | list_for_each_entry(c, &bch_cache_sets, list) { | 238 | list_for_each_entry(c, &bch_cache_sets, list) { |
233 | v = bch_cached_dev_attach(dc, c); | 239 | v = bch_cached_dev_attach(dc, c); |
234 | if (!v) | 240 | if (!v) |
235 | return size; | 241 | return size; |
236 | } | 242 | } |
237 | 243 | ||
238 | pr_err("Can't attach %s: cache set not found", buf); | 244 | pr_err("Can't attach %s: cache set not found", buf); |
239 | size = v; | 245 | size = v; |
240 | } | 246 | } |
241 | 247 | ||
242 | if (attr == &sysfs_detach && dc->disk.c) | 248 | if (attr == &sysfs_detach && dc->disk.c) |
243 | bch_cached_dev_detach(dc); | 249 | bch_cached_dev_detach(dc); |
244 | 250 | ||
245 | if (attr == &sysfs_stop) | 251 | if (attr == &sysfs_stop) |
246 | bcache_device_stop(&dc->disk); | 252 | bcache_device_stop(&dc->disk); |
247 | 253 | ||
248 | return size; | 254 | return size; |
249 | } | 255 | } |
250 | 256 | ||
251 | STORE(bch_cached_dev) | 257 | STORE(bch_cached_dev) |
252 | { | 258 | { |
253 | struct cached_dev *dc = container_of(kobj, struct cached_dev, | 259 | struct cached_dev *dc = container_of(kobj, struct cached_dev, |
254 | disk.kobj); | 260 | disk.kobj); |
255 | 261 | ||
256 | mutex_lock(&bch_register_lock); | 262 | mutex_lock(&bch_register_lock); |
257 | size = __cached_dev_store(kobj, attr, buf, size); | 263 | size = __cached_dev_store(kobj, attr, buf, size); |
258 | 264 | ||
259 | if (attr == &sysfs_writeback_running) | 265 | if (attr == &sysfs_writeback_running) |
260 | bch_writeback_queue(dc); | 266 | bch_writeback_queue(dc); |
261 | 267 | ||
262 | if (attr == &sysfs_writeback_percent) | 268 | if (attr == &sysfs_writeback_percent) |
263 | schedule_delayed_work(&dc->writeback_rate_update, | 269 | schedule_delayed_work(&dc->writeback_rate_update, |
264 | dc->writeback_rate_update_seconds * HZ); | 270 | dc->writeback_rate_update_seconds * HZ); |
265 | 271 | ||
266 | mutex_unlock(&bch_register_lock); | 272 | mutex_unlock(&bch_register_lock); |
267 | return size; | 273 | return size; |
268 | } | 274 | } |
269 | 275 | ||
270 | static struct attribute *bch_cached_dev_files[] = { | 276 | static struct attribute *bch_cached_dev_files[] = { |
271 | &sysfs_attach, | 277 | &sysfs_attach, |
272 | &sysfs_detach, | 278 | &sysfs_detach, |
273 | &sysfs_stop, | 279 | &sysfs_stop, |
274 | #if 0 | 280 | #if 0 |
275 | &sysfs_data_csum, | 281 | &sysfs_data_csum, |
276 | #endif | 282 | #endif |
277 | &sysfs_cache_mode, | 283 | &sysfs_cache_mode, |
278 | &sysfs_writeback_metadata, | 284 | &sysfs_writeback_metadata, |
279 | &sysfs_writeback_running, | 285 | &sysfs_writeback_running, |
280 | &sysfs_writeback_delay, | 286 | &sysfs_writeback_delay, |
281 | &sysfs_writeback_percent, | 287 | &sysfs_writeback_percent, |
282 | &sysfs_writeback_rate, | 288 | &sysfs_writeback_rate, |
283 | &sysfs_writeback_rate_update_seconds, | 289 | &sysfs_writeback_rate_update_seconds, |
284 | &sysfs_writeback_rate_d_term, | 290 | &sysfs_writeback_rate_d_term, |
285 | &sysfs_writeback_rate_p_term_inverse, | 291 | &sysfs_writeback_rate_p_term_inverse, |
286 | &sysfs_writeback_rate_d_smooth, | 292 | &sysfs_writeback_rate_d_smooth, |
287 | &sysfs_writeback_rate_debug, | 293 | &sysfs_writeback_rate_debug, |
288 | &sysfs_dirty_data, | 294 | &sysfs_dirty_data, |
295 | &sysfs_stripe_size, | ||
296 | &sysfs_partial_stripes_expensive, | ||
289 | &sysfs_sequential_cutoff, | 297 | &sysfs_sequential_cutoff, |
290 | &sysfs_sequential_merge, | 298 | &sysfs_sequential_merge, |
291 | &sysfs_clear_stats, | 299 | &sysfs_clear_stats, |
292 | &sysfs_running, | 300 | &sysfs_running, |
293 | &sysfs_state, | 301 | &sysfs_state, |
294 | &sysfs_label, | 302 | &sysfs_label, |
295 | &sysfs_readahead, | 303 | &sysfs_readahead, |
296 | #ifdef CONFIG_BCACHE_DEBUG | 304 | #ifdef CONFIG_BCACHE_DEBUG |
297 | &sysfs_verify, | 305 | &sysfs_verify, |
298 | #endif | 306 | #endif |
299 | NULL | 307 | NULL |
300 | }; | 308 | }; |
301 | KTYPE(bch_cached_dev); | 309 | KTYPE(bch_cached_dev); |
302 | 310 | ||
303 | SHOW(bch_flash_dev) | 311 | SHOW(bch_flash_dev) |
304 | { | 312 | { |
305 | struct bcache_device *d = container_of(kobj, struct bcache_device, | 313 | struct bcache_device *d = container_of(kobj, struct bcache_device, |
306 | kobj); | 314 | kobj); |
307 | struct uuid_entry *u = &d->c->uuids[d->id]; | 315 | struct uuid_entry *u = &d->c->uuids[d->id]; |
308 | 316 | ||
309 | sysfs_printf(data_csum, "%i", d->data_csum); | 317 | sysfs_printf(data_csum, "%i", d->data_csum); |
310 | sysfs_hprint(size, u->sectors << 9); | 318 | sysfs_hprint(size, u->sectors << 9); |
311 | 319 | ||
312 | if (attr == &sysfs_label) { | 320 | if (attr == &sysfs_label) { |
313 | memcpy(buf, u->label, SB_LABEL_SIZE); | 321 | memcpy(buf, u->label, SB_LABEL_SIZE); |
314 | buf[SB_LABEL_SIZE + 1] = '\0'; | 322 | buf[SB_LABEL_SIZE + 1] = '\0'; |
315 | strcat(buf, "\n"); | 323 | strcat(buf, "\n"); |
316 | return strlen(buf); | 324 | return strlen(buf); |
317 | } | 325 | } |
318 | 326 | ||
319 | return 0; | 327 | return 0; |
320 | } | 328 | } |
321 | 329 | ||
322 | STORE(__bch_flash_dev) | 330 | STORE(__bch_flash_dev) |
323 | { | 331 | { |
324 | struct bcache_device *d = container_of(kobj, struct bcache_device, | 332 | struct bcache_device *d = container_of(kobj, struct bcache_device, |
325 | kobj); | 333 | kobj); |
326 | struct uuid_entry *u = &d->c->uuids[d->id]; | 334 | struct uuid_entry *u = &d->c->uuids[d->id]; |
327 | 335 | ||
328 | sysfs_strtoul(data_csum, d->data_csum); | 336 | sysfs_strtoul(data_csum, d->data_csum); |
329 | 337 | ||
330 | if (attr == &sysfs_size) { | 338 | if (attr == &sysfs_size) { |
331 | uint64_t v; | 339 | uint64_t v; |
332 | strtoi_h_or_return(buf, v); | 340 | strtoi_h_or_return(buf, v); |
333 | 341 | ||
334 | u->sectors = v >> 9; | 342 | u->sectors = v >> 9; |
335 | bch_uuid_write(d->c); | 343 | bch_uuid_write(d->c); |
336 | set_capacity(d->disk, u->sectors); | 344 | set_capacity(d->disk, u->sectors); |
337 | } | 345 | } |
338 | 346 | ||
339 | if (attr == &sysfs_label) { | 347 | if (attr == &sysfs_label) { |
340 | memcpy(u->label, buf, SB_LABEL_SIZE); | 348 | memcpy(u->label, buf, SB_LABEL_SIZE); |
341 | bch_uuid_write(d->c); | 349 | bch_uuid_write(d->c); |
342 | } | 350 | } |
343 | 351 | ||
344 | if (attr == &sysfs_unregister) { | 352 | if (attr == &sysfs_unregister) { |
345 | atomic_set(&d->detaching, 1); | 353 | atomic_set(&d->detaching, 1); |
346 | bcache_device_stop(d); | 354 | bcache_device_stop(d); |
347 | } | 355 | } |
348 | 356 | ||
349 | return size; | 357 | return size; |
350 | } | 358 | } |
351 | STORE_LOCKED(bch_flash_dev) | 359 | STORE_LOCKED(bch_flash_dev) |
352 | 360 | ||
353 | static struct attribute *bch_flash_dev_files[] = { | 361 | static struct attribute *bch_flash_dev_files[] = { |
354 | &sysfs_unregister, | 362 | &sysfs_unregister, |
355 | #if 0 | 363 | #if 0 |
356 | &sysfs_data_csum, | 364 | &sysfs_data_csum, |
357 | #endif | 365 | #endif |
358 | &sysfs_label, | 366 | &sysfs_label, |
359 | &sysfs_size, | 367 | &sysfs_size, |
360 | NULL | 368 | NULL |
361 | }; | 369 | }; |
362 | KTYPE(bch_flash_dev); | 370 | KTYPE(bch_flash_dev); |
363 | 371 | ||
364 | SHOW(__bch_cache_set) | 372 | SHOW(__bch_cache_set) |
365 | { | 373 | { |
366 | unsigned root_usage(struct cache_set *c) | 374 | unsigned root_usage(struct cache_set *c) |
367 | { | 375 | { |
368 | unsigned bytes = 0; | 376 | unsigned bytes = 0; |
369 | struct bkey *k; | 377 | struct bkey *k; |
370 | struct btree *b; | 378 | struct btree *b; |
371 | struct btree_iter iter; | 379 | struct btree_iter iter; |
372 | 380 | ||
373 | goto lock_root; | 381 | goto lock_root; |
374 | 382 | ||
375 | do { | 383 | do { |
376 | rw_unlock(false, b); | 384 | rw_unlock(false, b); |
377 | lock_root: | 385 | lock_root: |
378 | b = c->root; | 386 | b = c->root; |
379 | rw_lock(false, b, b->level); | 387 | rw_lock(false, b, b->level); |
380 | } while (b != c->root); | 388 | } while (b != c->root); |
381 | 389 | ||
382 | for_each_key_filter(b, k, &iter, bch_ptr_bad) | 390 | for_each_key_filter(b, k, &iter, bch_ptr_bad) |
383 | bytes += bkey_bytes(k); | 391 | bytes += bkey_bytes(k); |
384 | 392 | ||
385 | rw_unlock(false, b); | 393 | rw_unlock(false, b); |
386 | 394 | ||
387 | return (bytes * 100) / btree_bytes(c); | 395 | return (bytes * 100) / btree_bytes(c); |
388 | } | 396 | } |
389 | 397 | ||
390 | size_t cache_size(struct cache_set *c) | 398 | size_t cache_size(struct cache_set *c) |
391 | { | 399 | { |
392 | size_t ret = 0; | 400 | size_t ret = 0; |
393 | struct btree *b; | 401 | struct btree *b; |
394 | 402 | ||
395 | mutex_lock(&c->bucket_lock); | 403 | mutex_lock(&c->bucket_lock); |
396 | list_for_each_entry(b, &c->btree_cache, list) | 404 | list_for_each_entry(b, &c->btree_cache, list) |
397 | ret += 1 << (b->page_order + PAGE_SHIFT); | 405 | ret += 1 << (b->page_order + PAGE_SHIFT); |
398 | 406 | ||
399 | mutex_unlock(&c->bucket_lock); | 407 | mutex_unlock(&c->bucket_lock); |
400 | return ret; | 408 | return ret; |
401 | } | 409 | } |
402 | 410 | ||
403 | unsigned cache_max_chain(struct cache_set *c) | 411 | unsigned cache_max_chain(struct cache_set *c) |
404 | { | 412 | { |
405 | unsigned ret = 0; | 413 | unsigned ret = 0; |
406 | struct hlist_head *h; | 414 | struct hlist_head *h; |
407 | 415 | ||
408 | mutex_lock(&c->bucket_lock); | 416 | mutex_lock(&c->bucket_lock); |
409 | 417 | ||
410 | for (h = c->bucket_hash; | 418 | for (h = c->bucket_hash; |
411 | h < c->bucket_hash + (1 << BUCKET_HASH_BITS); | 419 | h < c->bucket_hash + (1 << BUCKET_HASH_BITS); |
412 | h++) { | 420 | h++) { |
413 | unsigned i = 0; | 421 | unsigned i = 0; |
414 | struct hlist_node *p; | 422 | struct hlist_node *p; |
415 | 423 | ||
416 | hlist_for_each(p, h) | 424 | hlist_for_each(p, h) |
417 | i++; | 425 | i++; |
418 | 426 | ||
419 | ret = max(ret, i); | 427 | ret = max(ret, i); |
420 | } | 428 | } |
421 | 429 | ||
422 | mutex_unlock(&c->bucket_lock); | 430 | mutex_unlock(&c->bucket_lock); |
423 | return ret; | 431 | return ret; |
424 | } | 432 | } |
425 | 433 | ||
426 | unsigned btree_used(struct cache_set *c) | 434 | unsigned btree_used(struct cache_set *c) |
427 | { | 435 | { |
428 | return div64_u64(c->gc_stats.key_bytes * 100, | 436 | return div64_u64(c->gc_stats.key_bytes * 100, |
429 | (c->gc_stats.nodes ?: 1) * btree_bytes(c)); | 437 | (c->gc_stats.nodes ?: 1) * btree_bytes(c)); |
430 | } | 438 | } |
431 | 439 | ||
432 | unsigned average_key_size(struct cache_set *c) | 440 | unsigned average_key_size(struct cache_set *c) |
433 | { | 441 | { |
434 | return c->gc_stats.nkeys | 442 | return c->gc_stats.nkeys |
435 | ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys) | 443 | ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys) |
436 | : 0; | 444 | : 0; |
437 | } | 445 | } |
438 | 446 | ||
439 | struct cache_set *c = container_of(kobj, struct cache_set, kobj); | 447 | struct cache_set *c = container_of(kobj, struct cache_set, kobj); |
440 | 448 | ||
441 | sysfs_print(synchronous, CACHE_SYNC(&c->sb)); | 449 | sysfs_print(synchronous, CACHE_SYNC(&c->sb)); |
442 | sysfs_print(journal_delay_ms, c->journal_delay_ms); | 450 | sysfs_print(journal_delay_ms, c->journal_delay_ms); |
443 | sysfs_hprint(bucket_size, bucket_bytes(c)); | 451 | sysfs_hprint(bucket_size, bucket_bytes(c)); |
444 | sysfs_hprint(block_size, block_bytes(c)); | 452 | sysfs_hprint(block_size, block_bytes(c)); |
445 | sysfs_print(tree_depth, c->root->level); | 453 | sysfs_print(tree_depth, c->root->level); |
446 | sysfs_print(root_usage_percent, root_usage(c)); | 454 | sysfs_print(root_usage_percent, root_usage(c)); |
447 | 455 | ||
448 | sysfs_hprint(btree_cache_size, cache_size(c)); | 456 | sysfs_hprint(btree_cache_size, cache_size(c)); |
449 | sysfs_print(btree_cache_max_chain, cache_max_chain(c)); | 457 | sysfs_print(btree_cache_max_chain, cache_max_chain(c)); |
450 | sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use); | 458 | sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use); |
451 | 459 | ||
452 | sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); | 460 | sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); |
453 | sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); | 461 | sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); |
454 | sysfs_print_time_stats(&c->sort_time, btree_sort, ms, us); | 462 | sysfs_print_time_stats(&c->sort_time, btree_sort, ms, us); |
455 | sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); | 463 | sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); |
456 | sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us); | 464 | sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us); |
457 | 465 | ||
458 | sysfs_print(btree_used_percent, btree_used(c)); | 466 | sysfs_print(btree_used_percent, btree_used(c)); |
459 | sysfs_print(btree_nodes, c->gc_stats.nodes); | 467 | sysfs_print(btree_nodes, c->gc_stats.nodes); |
460 | sysfs_hprint(dirty_data, c->gc_stats.dirty); | 468 | sysfs_hprint(dirty_data, c->gc_stats.dirty); |
461 | sysfs_hprint(average_key_size, average_key_size(c)); | 469 | sysfs_hprint(average_key_size, average_key_size(c)); |
462 | 470 | ||
463 | sysfs_print(cache_read_races, | 471 | sysfs_print(cache_read_races, |
464 | atomic_long_read(&c->cache_read_races)); | 472 | atomic_long_read(&c->cache_read_races)); |
465 | 473 | ||
466 | sysfs_print(writeback_keys_done, | 474 | sysfs_print(writeback_keys_done, |
467 | atomic_long_read(&c->writeback_keys_done)); | 475 | atomic_long_read(&c->writeback_keys_done)); |
468 | sysfs_print(writeback_keys_failed, | 476 | sysfs_print(writeback_keys_failed, |
469 | atomic_long_read(&c->writeback_keys_failed)); | 477 | atomic_long_read(&c->writeback_keys_failed)); |
470 | 478 | ||
471 | /* See count_io_errors for why 88 */ | 479 | /* See count_io_errors for why 88 */ |
472 | sysfs_print(io_error_halflife, c->error_decay * 88); | 480 | sysfs_print(io_error_halflife, c->error_decay * 88); |
473 | sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT); | 481 | sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT); |
474 | 482 | ||
475 | sysfs_hprint(congested, | 483 | sysfs_hprint(congested, |
476 | ((uint64_t) bch_get_congested(c)) << 9); | 484 | ((uint64_t) bch_get_congested(c)) << 9); |
477 | sysfs_print(congested_read_threshold_us, | 485 | sysfs_print(congested_read_threshold_us, |
478 | c->congested_read_threshold_us); | 486 | c->congested_read_threshold_us); |
479 | sysfs_print(congested_write_threshold_us, | 487 | sysfs_print(congested_write_threshold_us, |
480 | c->congested_write_threshold_us); | 488 | c->congested_write_threshold_us); |
481 | 489 | ||
482 | sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); | 490 | sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); |
483 | sysfs_printf(verify, "%i", c->verify); | 491 | sysfs_printf(verify, "%i", c->verify); |
484 | sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled); | 492 | sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled); |
485 | sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); | 493 | sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); |
486 | sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); | 494 | sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); |
487 | sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); | 495 | sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); |
488 | 496 | ||
489 | if (attr == &sysfs_bset_tree_stats) | 497 | if (attr == &sysfs_bset_tree_stats) |
490 | return bch_bset_print_stats(c, buf); | 498 | return bch_bset_print_stats(c, buf); |
491 | 499 | ||
492 | return 0; | 500 | return 0; |
493 | } | 501 | } |
494 | SHOW_LOCKED(bch_cache_set) | 502 | SHOW_LOCKED(bch_cache_set) |
495 | 503 | ||
496 | STORE(__bch_cache_set) | 504 | STORE(__bch_cache_set) |
497 | { | 505 | { |
498 | struct cache_set *c = container_of(kobj, struct cache_set, kobj); | 506 | struct cache_set *c = container_of(kobj, struct cache_set, kobj); |
499 | 507 | ||
500 | if (attr == &sysfs_unregister) | 508 | if (attr == &sysfs_unregister) |
501 | bch_cache_set_unregister(c); | 509 | bch_cache_set_unregister(c); |
502 | 510 | ||
503 | if (attr == &sysfs_stop) | 511 | if (attr == &sysfs_stop) |
504 | bch_cache_set_stop(c); | 512 | bch_cache_set_stop(c); |
505 | 513 | ||
506 | if (attr == &sysfs_synchronous) { | 514 | if (attr == &sysfs_synchronous) { |
507 | bool sync = strtoul_or_return(buf); | 515 | bool sync = strtoul_or_return(buf); |
508 | 516 | ||
509 | if (sync != CACHE_SYNC(&c->sb)) { | 517 | if (sync != CACHE_SYNC(&c->sb)) { |
510 | SET_CACHE_SYNC(&c->sb, sync); | 518 | SET_CACHE_SYNC(&c->sb, sync); |
511 | bcache_write_super(c); | 519 | bcache_write_super(c); |
512 | } | 520 | } |
513 | } | 521 | } |
514 | 522 | ||
515 | if (attr == &sysfs_flash_vol_create) { | 523 | if (attr == &sysfs_flash_vol_create) { |
516 | int r; | 524 | int r; |
517 | uint64_t v; | 525 | uint64_t v; |
518 | strtoi_h_or_return(buf, v); | 526 | strtoi_h_or_return(buf, v); |
519 | 527 | ||
520 | r = bch_flash_dev_create(c, v); | 528 | r = bch_flash_dev_create(c, v); |
521 | if (r) | 529 | if (r) |
522 | return r; | 530 | return r; |
523 | } | 531 | } |
524 | 532 | ||
525 | if (attr == &sysfs_clear_stats) { | 533 | if (attr == &sysfs_clear_stats) { |
526 | atomic_long_set(&c->writeback_keys_done, 0); | 534 | atomic_long_set(&c->writeback_keys_done, 0); |
527 | atomic_long_set(&c->writeback_keys_failed, 0); | 535 | atomic_long_set(&c->writeback_keys_failed, 0); |
528 | 536 | ||
529 | memset(&c->gc_stats, 0, sizeof(struct gc_stat)); | 537 | memset(&c->gc_stats, 0, sizeof(struct gc_stat)); |
530 | bch_cache_accounting_clear(&c->accounting); | 538 | bch_cache_accounting_clear(&c->accounting); |
531 | } | 539 | } |
532 | 540 | ||
533 | if (attr == &sysfs_trigger_gc) | 541 | if (attr == &sysfs_trigger_gc) |
534 | bch_queue_gc(c); | 542 | bch_queue_gc(c); |
535 | 543 | ||
536 | if (attr == &sysfs_prune_cache) { | 544 | if (attr == &sysfs_prune_cache) { |
537 | struct shrink_control sc; | 545 | struct shrink_control sc; |
538 | sc.gfp_mask = GFP_KERNEL; | 546 | sc.gfp_mask = GFP_KERNEL; |
539 | sc.nr_to_scan = strtoul_or_return(buf); | 547 | sc.nr_to_scan = strtoul_or_return(buf); |
540 | c->shrink.shrink(&c->shrink, &sc); | 548 | c->shrink.shrink(&c->shrink, &sc); |
541 | } | 549 | } |
542 | 550 | ||
543 | sysfs_strtoul(congested_read_threshold_us, | 551 | sysfs_strtoul(congested_read_threshold_us, |
544 | c->congested_read_threshold_us); | 552 | c->congested_read_threshold_us); |
545 | sysfs_strtoul(congested_write_threshold_us, | 553 | sysfs_strtoul(congested_write_threshold_us, |
546 | c->congested_write_threshold_us); | 554 | c->congested_write_threshold_us); |
547 | 555 | ||
548 | if (attr == &sysfs_io_error_limit) | 556 | if (attr == &sysfs_io_error_limit) |
549 | c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; | 557 | c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; |
550 | 558 | ||
551 | /* See count_io_errors() for why 88 */ | 559 | /* See count_io_errors() for why 88 */ |
552 | if (attr == &sysfs_io_error_halflife) | 560 | if (attr == &sysfs_io_error_halflife) |
553 | c->error_decay = strtoul_or_return(buf) / 88; | 561 | c->error_decay = strtoul_or_return(buf) / 88; |
554 | 562 | ||
555 | sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); | 563 | sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); |
556 | sysfs_strtoul(verify, c->verify); | 564 | sysfs_strtoul(verify, c->verify); |
557 | sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); | 565 | sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); |
558 | sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); | 566 | sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); |
559 | sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); | 567 | sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); |
560 | sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); | 568 | sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); |
561 | 569 | ||
562 | return size; | 570 | return size; |
563 | } | 571 | } |
564 | STORE_LOCKED(bch_cache_set) | 572 | STORE_LOCKED(bch_cache_set) |
565 | 573 | ||
566 | SHOW(bch_cache_set_internal) | 574 | SHOW(bch_cache_set_internal) |
567 | { | 575 | { |
568 | struct cache_set *c = container_of(kobj, struct cache_set, internal); | 576 | struct cache_set *c = container_of(kobj, struct cache_set, internal); |
569 | return bch_cache_set_show(&c->kobj, attr, buf); | 577 | return bch_cache_set_show(&c->kobj, attr, buf); |
570 | } | 578 | } |
571 | 579 | ||
572 | STORE(bch_cache_set_internal) | 580 | STORE(bch_cache_set_internal) |
573 | { | 581 | { |
574 | struct cache_set *c = container_of(kobj, struct cache_set, internal); | 582 | struct cache_set *c = container_of(kobj, struct cache_set, internal); |
575 | return bch_cache_set_store(&c->kobj, attr, buf, size); | 583 | return bch_cache_set_store(&c->kobj, attr, buf, size); |
576 | } | 584 | } |
577 | 585 | ||
578 | static void bch_cache_set_internal_release(struct kobject *k) | 586 | static void bch_cache_set_internal_release(struct kobject *k) |
579 | { | 587 | { |
580 | } | 588 | } |
581 | 589 | ||
582 | static struct attribute *bch_cache_set_files[] = { | 590 | static struct attribute *bch_cache_set_files[] = { |
583 | &sysfs_unregister, | 591 | &sysfs_unregister, |
584 | &sysfs_stop, | 592 | &sysfs_stop, |
585 | &sysfs_synchronous, | 593 | &sysfs_synchronous, |
586 | &sysfs_journal_delay_ms, | 594 | &sysfs_journal_delay_ms, |
587 | &sysfs_flash_vol_create, | 595 | &sysfs_flash_vol_create, |
588 | 596 | ||
589 | &sysfs_bucket_size, | 597 | &sysfs_bucket_size, |
590 | &sysfs_block_size, | 598 | &sysfs_block_size, |
591 | &sysfs_tree_depth, | 599 | &sysfs_tree_depth, |
592 | &sysfs_root_usage_percent, | 600 | &sysfs_root_usage_percent, |
593 | &sysfs_btree_cache_size, | 601 | &sysfs_btree_cache_size, |
594 | &sysfs_cache_available_percent, | 602 | &sysfs_cache_available_percent, |
595 | 603 | ||
596 | &sysfs_average_key_size, | 604 | &sysfs_average_key_size, |
597 | &sysfs_dirty_data, | 605 | &sysfs_dirty_data, |
598 | 606 | ||
599 | &sysfs_io_error_limit, | 607 | &sysfs_io_error_limit, |
600 | &sysfs_io_error_halflife, | 608 | &sysfs_io_error_halflife, |
601 | &sysfs_congested, | 609 | &sysfs_congested, |
602 | &sysfs_congested_read_threshold_us, | 610 | &sysfs_congested_read_threshold_us, |
603 | &sysfs_congested_write_threshold_us, | 611 | &sysfs_congested_write_threshold_us, |
604 | &sysfs_clear_stats, | 612 | &sysfs_clear_stats, |
605 | NULL | 613 | NULL |
606 | }; | 614 | }; |
607 | KTYPE(bch_cache_set); | 615 | KTYPE(bch_cache_set); |
608 | 616 | ||
609 | static struct attribute *bch_cache_set_internal_files[] = { | 617 | static struct attribute *bch_cache_set_internal_files[] = { |
610 | &sysfs_active_journal_entries, | 618 | &sysfs_active_journal_entries, |
611 | 619 | ||
612 | sysfs_time_stats_attribute_list(btree_gc, sec, ms) | 620 | sysfs_time_stats_attribute_list(btree_gc, sec, ms) |
613 | sysfs_time_stats_attribute_list(btree_split, sec, us) | 621 | sysfs_time_stats_attribute_list(btree_split, sec, us) |
614 | sysfs_time_stats_attribute_list(btree_sort, ms, us) | 622 | sysfs_time_stats_attribute_list(btree_sort, ms, us) |
615 | sysfs_time_stats_attribute_list(btree_read, ms, us) | 623 | sysfs_time_stats_attribute_list(btree_read, ms, us) |
616 | sysfs_time_stats_attribute_list(try_harder, ms, us) | 624 | sysfs_time_stats_attribute_list(try_harder, ms, us) |
617 | 625 | ||
618 | &sysfs_btree_nodes, | 626 | &sysfs_btree_nodes, |
619 | &sysfs_btree_used_percent, | 627 | &sysfs_btree_used_percent, |
620 | &sysfs_btree_cache_max_chain, | 628 | &sysfs_btree_cache_max_chain, |
621 | 629 | ||
622 | &sysfs_bset_tree_stats, | 630 | &sysfs_bset_tree_stats, |
623 | &sysfs_cache_read_races, | 631 | &sysfs_cache_read_races, |
624 | &sysfs_writeback_keys_done, | 632 | &sysfs_writeback_keys_done, |
625 | &sysfs_writeback_keys_failed, | 633 | &sysfs_writeback_keys_failed, |
626 | 634 | ||
627 | &sysfs_trigger_gc, | 635 | &sysfs_trigger_gc, |
628 | &sysfs_prune_cache, | 636 | &sysfs_prune_cache, |
629 | #ifdef CONFIG_BCACHE_DEBUG | 637 | #ifdef CONFIG_BCACHE_DEBUG |
630 | &sysfs_verify, | 638 | &sysfs_verify, |
631 | &sysfs_key_merging_disabled, | 639 | &sysfs_key_merging_disabled, |
632 | #endif | 640 | #endif |
633 | &sysfs_gc_always_rewrite, | 641 | &sysfs_gc_always_rewrite, |
634 | &sysfs_btree_shrinker_disabled, | 642 | &sysfs_btree_shrinker_disabled, |
635 | &sysfs_copy_gc_enabled, | 643 | &sysfs_copy_gc_enabled, |
636 | NULL | 644 | NULL |
637 | }; | 645 | }; |
638 | KTYPE(bch_cache_set_internal); | 646 | KTYPE(bch_cache_set_internal); |
639 | 647 | ||
640 | SHOW(__bch_cache) | 648 | SHOW(__bch_cache) |
641 | { | 649 | { |
642 | struct cache *ca = container_of(kobj, struct cache, kobj); | 650 | struct cache *ca = container_of(kobj, struct cache, kobj); |
643 | 651 | ||
644 | sysfs_hprint(bucket_size, bucket_bytes(ca)); | 652 | sysfs_hprint(bucket_size, bucket_bytes(ca)); |
645 | sysfs_hprint(block_size, block_bytes(ca)); | 653 | sysfs_hprint(block_size, block_bytes(ca)); |
646 | sysfs_print(nbuckets, ca->sb.nbuckets); | 654 | sysfs_print(nbuckets, ca->sb.nbuckets); |
647 | sysfs_print(discard, ca->discard); | 655 | sysfs_print(discard, ca->discard); |
648 | sysfs_hprint(written, atomic_long_read(&ca->sectors_written) << 9); | 656 | sysfs_hprint(written, atomic_long_read(&ca->sectors_written) << 9); |
649 | sysfs_hprint(btree_written, | 657 | sysfs_hprint(btree_written, |
650 | atomic_long_read(&ca->btree_sectors_written) << 9); | 658 | atomic_long_read(&ca->btree_sectors_written) << 9); |
651 | sysfs_hprint(metadata_written, | 659 | sysfs_hprint(metadata_written, |
652 | (atomic_long_read(&ca->meta_sectors_written) + | 660 | (atomic_long_read(&ca->meta_sectors_written) + |
653 | atomic_long_read(&ca->btree_sectors_written)) << 9); | 661 | atomic_long_read(&ca->btree_sectors_written)) << 9); |
654 | 662 | ||
655 | sysfs_print(io_errors, | 663 | sysfs_print(io_errors, |
656 | atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT); | 664 | atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT); |
657 | 665 | ||
658 | sysfs_print(freelist_percent, ca->free.size * 100 / | 666 | sysfs_print(freelist_percent, ca->free.size * 100 / |
659 | ((size_t) ca->sb.nbuckets)); | 667 | ((size_t) ca->sb.nbuckets)); |
660 | 668 | ||
661 | if (attr == &sysfs_cache_replacement_policy) | 669 | if (attr == &sysfs_cache_replacement_policy) |
662 | return bch_snprint_string_list(buf, PAGE_SIZE, | 670 | return bch_snprint_string_list(buf, PAGE_SIZE, |
663 | cache_replacement_policies, | 671 | cache_replacement_policies, |
664 | CACHE_REPLACEMENT(&ca->sb)); | 672 | CACHE_REPLACEMENT(&ca->sb)); |
665 | 673 | ||
666 | if (attr == &sysfs_priority_stats) { | 674 | if (attr == &sysfs_priority_stats) { |
667 | int cmp(const void *l, const void *r) | 675 | int cmp(const void *l, const void *r) |
668 | { return *((uint16_t *) r) - *((uint16_t *) l); } | 676 | { return *((uint16_t *) r) - *((uint16_t *) l); } |
669 | 677 | ||
670 | size_t n = ca->sb.nbuckets, i, unused, btree; | 678 | size_t n = ca->sb.nbuckets, i, unused, btree; |
671 | uint64_t sum = 0; | 679 | uint64_t sum = 0; |
672 | /* Compute 31 quantiles */ | 680 | /* Compute 31 quantiles */ |
673 | uint16_t q[31], *p, *cached; | 681 | uint16_t q[31], *p, *cached; |
674 | ssize_t ret; | 682 | ssize_t ret; |
675 | 683 | ||
676 | cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t)); | 684 | cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t)); |
677 | if (!p) | 685 | if (!p) |
678 | return -ENOMEM; | 686 | return -ENOMEM; |
679 | 687 | ||
680 | mutex_lock(&ca->set->bucket_lock); | 688 | mutex_lock(&ca->set->bucket_lock); |
681 | for (i = ca->sb.first_bucket; i < n; i++) | 689 | for (i = ca->sb.first_bucket; i < n; i++) |
682 | p[i] = ca->buckets[i].prio; | 690 | p[i] = ca->buckets[i].prio; |
683 | mutex_unlock(&ca->set->bucket_lock); | 691 | mutex_unlock(&ca->set->bucket_lock); |
684 | 692 | ||
685 | sort(p, n, sizeof(uint16_t), cmp, NULL); | 693 | sort(p, n, sizeof(uint16_t), cmp, NULL); |
686 | 694 | ||
687 | while (n && | 695 | while (n && |
688 | !cached[n - 1]) | 696 | !cached[n - 1]) |
689 | --n; | 697 | --n; |
690 | 698 | ||
691 | unused = ca->sb.nbuckets - n; | 699 | unused = ca->sb.nbuckets - n; |
692 | 700 | ||
693 | while (cached < p + n && | 701 | while (cached < p + n && |
694 | *cached == BTREE_PRIO) | 702 | *cached == BTREE_PRIO) |
695 | cached++; | 703 | cached++; |
696 | 704 | ||
697 | btree = cached - p; | 705 | btree = cached - p; |
698 | n -= btree; | 706 | n -= btree; |
699 | 707 | ||
700 | for (i = 0; i < n; i++) | 708 | for (i = 0; i < n; i++) |
701 | sum += INITIAL_PRIO - cached[i]; | 709 | sum += INITIAL_PRIO - cached[i]; |
702 | 710 | ||
703 | if (n) | 711 | if (n) |
704 | do_div(sum, n); | 712 | do_div(sum, n); |
705 | 713 | ||
706 | for (i = 0; i < ARRAY_SIZE(q); i++) | 714 | for (i = 0; i < ARRAY_SIZE(q); i++) |
707 | q[i] = INITIAL_PRIO - cached[n * (i + 1) / | 715 | q[i] = INITIAL_PRIO - cached[n * (i + 1) / |
708 | (ARRAY_SIZE(q) + 1)]; | 716 | (ARRAY_SIZE(q) + 1)]; |
709 | 717 | ||
710 | vfree(p); | 718 | vfree(p); |
711 | 719 | ||
712 | ret = scnprintf(buf, PAGE_SIZE, | 720 | ret = scnprintf(buf, PAGE_SIZE, |
713 | "Unused: %zu%%\n" | 721 | "Unused: %zu%%\n" |
714 | "Metadata: %zu%%\n" | 722 | "Metadata: %zu%%\n" |
715 | "Average: %llu\n" | 723 | "Average: %llu\n" |
716 | "Sectors per Q: %zu\n" | 724 | "Sectors per Q: %zu\n" |
717 | "Quantiles: [", | 725 | "Quantiles: [", |
718 | unused * 100 / (size_t) ca->sb.nbuckets, | 726 | unused * 100 / (size_t) ca->sb.nbuckets, |
719 | btree * 100 / (size_t) ca->sb.nbuckets, sum, | 727 | btree * 100 / (size_t) ca->sb.nbuckets, sum, |
720 | n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1)); | 728 | n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1)); |
721 | 729 | ||
722 | for (i = 0; i < ARRAY_SIZE(q); i++) | 730 | for (i = 0; i < ARRAY_SIZE(q); i++) |
723 | ret += scnprintf(buf + ret, PAGE_SIZE - ret, | 731 | ret += scnprintf(buf + ret, PAGE_SIZE - ret, |
724 | "%u ", q[i]); | 732 | "%u ", q[i]); |
725 | ret--; | 733 | ret--; |
726 | 734 | ||
727 | ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n"); | 735 | ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n"); |
728 | 736 | ||
729 | return ret; | 737 | return ret; |
730 | } | 738 | } |
731 | 739 | ||
732 | return 0; | 740 | return 0; |
733 | } | 741 | } |
734 | SHOW_LOCKED(bch_cache) | 742 | SHOW_LOCKED(bch_cache) |
735 | 743 | ||
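(Illustration, not part of the patch.) The priority_stats read above snapshots every bucket's priority, sorts the snapshot in descending order, trims zero-priority (unused) buckets off the tail, skips the buckets pinned at BTREE_PRIO (counted as metadata), and then samples 31 evenly spaced quantiles of "age" (INITIAL_PRIO minus the priority) from what remains. A minimal standalone sketch of just the sort-and-sample step, with made-up priorities and a stand-in constant in place of the kernel's values:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define NQUANTILES	31
#define INITIAL_PRIO	32768	/* stand-in, not the kernel definition */

static int cmp_desc(const void *l, const void *r)
{
	return *(const uint16_t *)r - *(const uint16_t *)l;
}

int main(void)
{
	uint16_t prio[1024], q[NQUANTILES];
	size_t n = sizeof(prio) / sizeof(prio[0]), i;

	for (i = 0; i < n; i++)			/* fake bucket priorities */
		prio[i] = (uint16_t)(rand() % INITIAL_PRIO);

	qsort(prio, n, sizeof(uint16_t), cmp_desc);

	for (i = 0; i < NQUANTILES; i++)	/* same indexing as the show path */
		q[i] = INITIAL_PRIO - prio[n * (i + 1) / (NQUANTILES + 1)];

	printf("Quantiles: [");
	for (i = 0; i < NQUANTILES; i++)
		printf("%u ", q[i]);
	printf("]\n");
	return 0;
}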
736 | STORE(__bch_cache) | 744 | STORE(__bch_cache) |
737 | { | 745 | { |
738 | struct cache *ca = container_of(kobj, struct cache, kobj); | 746 | struct cache *ca = container_of(kobj, struct cache, kobj); |
739 | 747 | ||
740 | if (attr == &sysfs_discard) { | 748 | if (attr == &sysfs_discard) { |
741 | bool v = strtoul_or_return(buf); | 749 | bool v = strtoul_or_return(buf); |
742 | 750 | ||
743 | if (blk_queue_discard(bdev_get_queue(ca->bdev))) | 751 | if (blk_queue_discard(bdev_get_queue(ca->bdev))) |
744 | ca->discard = v; | 752 | ca->discard = v; |
745 | 753 | ||
746 | if (v != CACHE_DISCARD(&ca->sb)) { | 754 | if (v != CACHE_DISCARD(&ca->sb)) { |
747 | SET_CACHE_DISCARD(&ca->sb, v); | 755 | SET_CACHE_DISCARD(&ca->sb, v); |
748 | bcache_write_super(ca->set); | 756 | bcache_write_super(ca->set); |
749 | } | 757 | } |
750 | } | 758 | } |
751 | 759 | ||
752 | if (attr == &sysfs_cache_replacement_policy) { | 760 | if (attr == &sysfs_cache_replacement_policy) { |
753 | ssize_t v = bch_read_string_list(buf, cache_replacement_policies); | 761 | ssize_t v = bch_read_string_list(buf, cache_replacement_policies); |
754 | 762 | ||
755 | if (v < 0) | 763 | if (v < 0) |
756 | return v; | 764 | return v; |
757 | 765 | ||
758 | if ((unsigned) v != CACHE_REPLACEMENT(&ca->sb)) { | 766 | if ((unsigned) v != CACHE_REPLACEMENT(&ca->sb)) { |
759 | mutex_lock(&ca->set->bucket_lock); | 767 | mutex_lock(&ca->set->bucket_lock); |
760 | SET_CACHE_REPLACEMENT(&ca->sb, v); | 768 | SET_CACHE_REPLACEMENT(&ca->sb, v); |
761 | mutex_unlock(&ca->set->bucket_lock); | 769 | mutex_unlock(&ca->set->bucket_lock); |
762 | 770 | ||
763 | bcache_write_super(ca->set); | 771 | bcache_write_super(ca->set); |
764 | } | 772 | } |
765 | } | 773 | } |
766 | 774 | ||
767 | if (attr == &sysfs_freelist_percent) { | 775 | if (attr == &sysfs_freelist_percent) { |
768 | DECLARE_FIFO(long, free); | 776 | DECLARE_FIFO(long, free); |
769 | long i; | 777 | long i; |
770 | size_t p = strtoul_or_return(buf); | 778 | size_t p = strtoul_or_return(buf); |
771 | 779 | ||
772 | p = clamp_t(size_t, | 780 | p = clamp_t(size_t, |
773 | ((size_t) ca->sb.nbuckets * p) / 100, | 781 | ((size_t) ca->sb.nbuckets * p) / 100, |
774 | roundup_pow_of_two(ca->sb.nbuckets) >> 9, | 782 | roundup_pow_of_two(ca->sb.nbuckets) >> 9, |
775 | ca->sb.nbuckets / 2); | 783 | ca->sb.nbuckets / 2); |
776 | 784 | ||
777 | if (!init_fifo_exact(&free, p, GFP_KERNEL)) | 785 | if (!init_fifo_exact(&free, p, GFP_KERNEL)) |
778 | return -ENOMEM; | 786 | return -ENOMEM; |
779 | 787 | ||
780 | mutex_lock(&ca->set->bucket_lock); | 788 | mutex_lock(&ca->set->bucket_lock); |
781 | 789 | ||
782 | fifo_move(&free, &ca->free); | 790 | fifo_move(&free, &ca->free); |
783 | fifo_swap(&free, &ca->free); | 791 | fifo_swap(&free, &ca->free); |
784 | 792 | ||
785 | mutex_unlock(&ca->set->bucket_lock); | 793 | mutex_unlock(&ca->set->bucket_lock); |
786 | 794 | ||
787 | while (fifo_pop(&free, i)) | 795 | while (fifo_pop(&free, i)) |
788 | atomic_dec(&ca->buckets[i].pin); | 796 | atomic_dec(&ca->buckets[i].pin); |
789 | 797 | ||
790 | free_fifo(&free); | 798 | free_fifo(&free); |
791 | } | 799 | } |
792 | 800 | ||
793 | if (attr == &sysfs_clear_stats) { | 801 | if (attr == &sysfs_clear_stats) { |
794 | atomic_long_set(&ca->sectors_written, 0); | 802 | atomic_long_set(&ca->sectors_written, 0); |
795 | atomic_long_set(&ca->btree_sectors_written, 0); | 803 | atomic_long_set(&ca->btree_sectors_written, 0); |
796 | atomic_long_set(&ca->meta_sectors_written, 0); | 804 | atomic_long_set(&ca->meta_sectors_written, 0); |
797 | atomic_set(&ca->io_count, 0); | 805 | atomic_set(&ca->io_count, 0); |
798 | atomic_set(&ca->io_errors, 0); | 806 | atomic_set(&ca->io_errors, 0); |
799 | } | 807 | } |
800 | 808 | ||
801 | return size; | 809 | return size; |
802 | } | 810 | } |
803 | STORE_LOCKED(bch_cache) | 811 | STORE_LOCKED(bch_cache) |
804 | 812 | ||
805 | static struct attribute *bch_cache_files[] = { | 813 | static struct attribute *bch_cache_files[] = { |
806 | &sysfs_bucket_size, | 814 | &sysfs_bucket_size, |
807 | &sysfs_block_size, | 815 | &sysfs_block_size, |
808 | &sysfs_nbuckets, | 816 | &sysfs_nbuckets, |
809 | &sysfs_priority_stats, | 817 | &sysfs_priority_stats, |
810 | &sysfs_discard, | 818 | &sysfs_discard, |
811 | &sysfs_written, | 819 | &sysfs_written, |
812 | &sysfs_btree_written, | 820 | &sysfs_btree_written, |
813 | &sysfs_metadata_written, | 821 | &sysfs_metadata_written, |
814 | &sysfs_io_errors, | 822 | &sysfs_io_errors, |
815 | &sysfs_clear_stats, | 823 | &sysfs_clear_stats, |
816 | &sysfs_freelist_percent, | 824 | &sysfs_freelist_percent, |
817 | &sysfs_cache_replacement_policy, | 825 | &sysfs_cache_replacement_policy, |
818 | NULL | 826 | NULL |
819 | }; | 827 | }; |
820 | KTYPE(bch_cache); | 828 | KTYPE(bch_cache); |
821 | 829 |
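The stripe_size and partial_stripes_expensive attributes added above expose the stripe geometry now tracked per backing device; judging by the name, the latter marks devices where writing less than a full stripe is costly. In the show path, (1 << stripe_size_bits) is the stripe size in 512-byte sectors and the additional << 9 converts sectors to bytes. A hypothetical helper (not in the patch) spelling out that conversion:

#include <stdio.h>
#include <stdint.h>

/* stripe_size_bits -> sectors (1 << bits) -> bytes (<< 9, 512 B per sector) */
static uint64_t stripe_size_bytes(unsigned stripe_size_bits)
{
	return ((uint64_t)1 << stripe_size_bits) << 9;
}

int main(void)
{
	/* e.g. stripe_size_bits = 10: 1024 sectors = 512 KiB per stripe */
	printf("%llu\n", (unsigned long long)stripe_size_bytes(10));
	return 0;
}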
drivers/md/bcache/writeback.c
1 | /* | 1 | /* |
2 | * background writeback - scan btree for dirty data and write it to the backing | 2 | * background writeback - scan btree for dirty data and write it to the backing |
3 | * device | 3 | * device |
4 | * | 4 | * |
5 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | 5 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> |
6 | * Copyright 2012 Google, Inc. | 6 | * Copyright 2012 Google, Inc. |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include "bcache.h" | 9 | #include "bcache.h" |
10 | #include "btree.h" | 10 | #include "btree.h" |
11 | #include "debug.h" | 11 | #include "debug.h" |
12 | #include "writeback.h" | 12 | #include "writeback.h" |
13 | 13 | ||
14 | #include <trace/events/bcache.h> | 14 | #include <trace/events/bcache.h> |
15 | 15 | ||
16 | static struct workqueue_struct *dirty_wq; | 16 | static struct workqueue_struct *dirty_wq; |
17 | 17 | ||
18 | static void read_dirty(struct closure *); | 18 | static void read_dirty(struct closure *); |
19 | 19 | ||
20 | struct dirty_io { | 20 | struct dirty_io { |
21 | struct closure cl; | 21 | struct closure cl; |
22 | struct cached_dev *dc; | 22 | struct cached_dev *dc; |
23 | struct bio bio; | 23 | struct bio bio; |
24 | }; | 24 | }; |
25 | 25 | ||
26 | /* Rate limiting */ | 26 | /* Rate limiting */ |
27 | 27 | ||
28 | static void __update_writeback_rate(struct cached_dev *dc) | 28 | static void __update_writeback_rate(struct cached_dev *dc) |
29 | { | 29 | { |
30 | struct cache_set *c = dc->disk.c; | 30 | struct cache_set *c = dc->disk.c; |
31 | uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size; | 31 | uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size; |
32 | uint64_t cache_dirty_target = | 32 | uint64_t cache_dirty_target = |
33 | div_u64(cache_sectors * dc->writeback_percent, 100); | 33 | div_u64(cache_sectors * dc->writeback_percent, 100); |
34 | 34 | ||
35 | int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), | 35 | int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), |
36 | c->cached_dev_sectors); | 36 | c->cached_dev_sectors); |
37 | 37 | ||
38 | /* PD controller */ | 38 | /* PD controller */ |
39 | 39 | ||
40 | int change = 0; | 40 | int change = 0; |
41 | int64_t error; | 41 | int64_t error; |
42 | int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); | 42 | int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); |
43 | int64_t derivative = dirty - dc->disk.sectors_dirty_last; | 43 | int64_t derivative = dirty - dc->disk.sectors_dirty_last; |
44 | 44 | ||
45 | dc->disk.sectors_dirty_last = dirty; | 45 | dc->disk.sectors_dirty_last = dirty; |
46 | 46 | ||
47 | derivative *= dc->writeback_rate_d_term; | 47 | derivative *= dc->writeback_rate_d_term; |
48 | derivative = clamp(derivative, -dirty, dirty); | 48 | derivative = clamp(derivative, -dirty, dirty); |
49 | 49 | ||
50 | derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, | 50 | derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, |
51 | dc->writeback_rate_d_smooth, 0); | 51 | dc->writeback_rate_d_smooth, 0); |
52 | 52 | ||
53 | /* Avoid divide by zero */ | 53 | /* Avoid divide by zero */ |
54 | if (!target) | 54 | if (!target) |
55 | goto out; | 55 | goto out; |
56 | 56 | ||
57 | error = div64_s64((dirty + derivative - target) << 8, target); | 57 | error = div64_s64((dirty + derivative - target) << 8, target); |
58 | 58 | ||
59 | change = div_s64((dc->writeback_rate.rate * error) >> 8, | 59 | change = div_s64((dc->writeback_rate.rate * error) >> 8, |
60 | dc->writeback_rate_p_term_inverse); | 60 | dc->writeback_rate_p_term_inverse); |
61 | 61 | ||
62 | /* Don't increase writeback rate if the device isn't keeping up */ | 62 | /* Don't increase writeback rate if the device isn't keeping up */ |
63 | if (change > 0 && | 63 | if (change > 0 && |
64 | time_after64(local_clock(), | 64 | time_after64(local_clock(), |
65 | dc->writeback_rate.next + 10 * NSEC_PER_MSEC)) | 65 | dc->writeback_rate.next + 10 * NSEC_PER_MSEC)) |
66 | change = 0; | 66 | change = 0; |
67 | 67 | ||
68 | dc->writeback_rate.rate = | 68 | dc->writeback_rate.rate = |
69 | clamp_t(int64_t, dc->writeback_rate.rate + change, | 69 | clamp_t(int64_t, dc->writeback_rate.rate + change, |
70 | 1, NSEC_PER_MSEC); | 70 | 1, NSEC_PER_MSEC); |
71 | out: | 71 | out: |
72 | dc->writeback_rate_derivative = derivative; | 72 | dc->writeback_rate_derivative = derivative; |
73 | dc->writeback_rate_change = change; | 73 | dc->writeback_rate_change = change; |
74 | dc->writeback_rate_target = target; | 74 | dc->writeback_rate_target = target; |
75 | 75 | ||
76 | schedule_delayed_work(&dc->writeback_rate_update, | 76 | schedule_delayed_work(&dc->writeback_rate_update, |
77 | dc->writeback_rate_update_seconds * HZ); | 77 | dc->writeback_rate_update_seconds * HZ); |
78 | } | 78 | } |
79 | 79 | ||
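In __update_writeback_rate() above, target is this backing device's share of the dirty budget (writeback_percent of the cache's sectors, split between attached devices in proportion to their size), error is how far the current dirty count, nudged by a smoothed derivative, sits from that target (in 1/256ths of the target), and change scales the current rate by that relative error, divided by writeback_rate_p_term_inverse. A rough standalone sketch of the proportional step only, with invented numbers and without the EWMA smoothing, the clamps, or the "device not keeping up" check:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int64_t target = 1000000;	/* dirty sectors we want to hold */
	int64_t dirty = 1200000;	/* dirty sectors right now */
	int64_t derivative = 0;		/* pretend dirty data is steady */
	int64_t rate = 512;		/* current rate, arbitrary units */
	int64_t p_term_inverse = 64;	/* larger -> gentler corrections */

	int64_t error = ((dirty + derivative - target) << 8) / target;
	int64_t change = (rate * error >> 8) / p_term_inverse;

	/* 20% over target: error = 51/256, change = 512*51/256/64 = 1 */
	printf("error=%lld change=%lld new_rate=%lld\n",
	       (long long)error, (long long)change, (long long)(rate + change));
	return 0;
}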
80 | static void update_writeback_rate(struct work_struct *work) | 80 | static void update_writeback_rate(struct work_struct *work) |
81 | { | 81 | { |
82 | struct cached_dev *dc = container_of(to_delayed_work(work), | 82 | struct cached_dev *dc = container_of(to_delayed_work(work), |
83 | struct cached_dev, | 83 | struct cached_dev, |
84 | writeback_rate_update); | 84 | writeback_rate_update); |
85 | 85 | ||
86 | down_read(&dc->writeback_lock); | 86 | down_read(&dc->writeback_lock); |
87 | 87 | ||
88 | if (atomic_read(&dc->has_dirty) && | 88 | if (atomic_read(&dc->has_dirty) && |
89 | dc->writeback_percent) | 89 | dc->writeback_percent) |
90 | __update_writeback_rate(dc); | 90 | __update_writeback_rate(dc); |
91 | 91 | ||
92 | up_read(&dc->writeback_lock); | 92 | up_read(&dc->writeback_lock); |
93 | } | 93 | } |
94 | 94 | ||
95 | static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) | 95 | static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) |
96 | { | 96 | { |
97 | if (atomic_read(&dc->disk.detaching) || | 97 | if (atomic_read(&dc->disk.detaching) || |
98 | !dc->writeback_percent) | 98 | !dc->writeback_percent) |
99 | return 0; | 99 | return 0; |
100 | 100 | ||
101 | return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); | 101 | return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); |
102 | } | 102 | } |
103 | 103 | ||
104 | /* Background writeback */ | 104 | /* Background writeback */ |
105 | 105 | ||
106 | static bool dirty_pred(struct keybuf *buf, struct bkey *k) | 106 | static bool dirty_pred(struct keybuf *buf, struct bkey *k) |
107 | { | 107 | { |
108 | return KEY_DIRTY(k); | 108 | return KEY_DIRTY(k); |
109 | } | 109 | } |
110 | 110 | ||
111 | static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) | ||
112 | { | ||
113 | uint64_t stripe; | ||
114 | unsigned nr_sectors = KEY_SIZE(k); | ||
115 | struct cached_dev *dc = container_of(buf, struct cached_dev, | ||
116 | writeback_keys); | ||
117 | unsigned stripe_size = 1 << dc->disk.stripe_size_bits; | ||
118 | |||
119 | if (!KEY_DIRTY(k)) | ||
120 | return false; | ||
121 | |||
122 | stripe = KEY_START(k) >> dc->disk.stripe_size_bits; | ||
123 | while (1) { | ||
124 | if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) != | ||
125 | stripe_size) | ||
126 | return false; | ||
127 | |||
128 | if (nr_sectors <= stripe_size) | ||
129 | return true; | ||
130 | |||
131 | nr_sectors -= stripe_size; | ||
132 | stripe++; | ||
133 | } | ||
134 | } | ||
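dirty_full_stripe_pred() is the new keybuf predicate this commit adds: a dirty key qualifies for writeback only if every stripe it overlaps is completely dirty. A user-space sketch of that stripe walk, with a plain counter array standing in for the per-device atomic_t array (names are hypothetical):

    /*
     * User-space sketch of the full-stripe check in dirty_full_stripe_pred().
     * stripe_sectors_dirty[] stands in for d->stripe_sectors_dirty.
     */
    #include <stdbool.h>
    #include <stdint.h>

    static bool key_covers_only_full_stripes(const unsigned *stripe_sectors_dirty,
                                             unsigned stripe_size_bits,
                                             uint64_t key_start,
                                             unsigned nr_sectors)
    {
        unsigned stripe_size = 1U << stripe_size_bits;
        uint64_t stripe = key_start >> stripe_size_bits;

        while (1) {
            /* Every stripe the key overlaps must be 100% dirty. */
            if (stripe_sectors_dirty[stripe] != stripe_size)
                return false;

            if (nr_sectors <= stripe_size)
                return true;

            nr_sectors -= stripe_size;
            stripe++;
        }
    }

refill_dirty() below decides when this predicate is used instead of the plain dirty_pred().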
135 | |||
111 | static void dirty_init(struct keybuf_key *w) | 136 | static void dirty_init(struct keybuf_key *w) |
112 | { | 137 | { |
113 | struct dirty_io *io = w->private; | 138 | struct dirty_io *io = w->private; |
114 | struct bio *bio = &io->bio; | 139 | struct bio *bio = &io->bio; |
115 | 140 | ||
116 | bio_init(bio); | 141 | bio_init(bio); |
117 | if (!io->dc->writeback_percent) | 142 | if (!io->dc->writeback_percent) |
118 | bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); | 143 | bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); |
119 | 144 | ||
120 | bio->bi_size = KEY_SIZE(&w->key) << 9; | 145 | bio->bi_size = KEY_SIZE(&w->key) << 9; |
121 | bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS); | 146 | bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS); |
122 | bio->bi_private = w; | 147 | bio->bi_private = w; |
123 | bio->bi_io_vec = bio->bi_inline_vecs; | 148 | bio->bi_io_vec = bio->bi_inline_vecs; |
124 | bch_bio_map(bio, NULL); | 149 | bch_bio_map(bio, NULL); |
125 | } | 150 | } |
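dirty_init() sizes the bio directly from the key: KEY_SIZE() is in 512-byte sectors, so bi_size is the key size shifted left by 9 and bi_max_vecs is one bio_vec per page's worth of sectors. For example, with 4 KiB pages (PAGE_SECTORS == 8) a 24-sector key becomes a 12288-byte bio needing 3 bio_vecs; a quick stand-alone check of that arithmetic:

    /* Stand-alone check of the bio sizing arithmetic in dirty_init(). */
    #include <assert.h>
    #include <stdio.h>

    #define SECTOR_SHIFT        9
    #define PAGE_SECTORS        8    /* 4096-byte pages / 512-byte sectors */
    #define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

    int main(void)
    {
        unsigned key_size = 24;                           /* sectors */
        unsigned bi_size = key_size << SECTOR_SHIFT;      /* bytes   */
        unsigned bi_max_vecs = DIV_ROUND_UP(key_size, PAGE_SECTORS);

        assert(bi_size == 12288);
        assert(bi_max_vecs == 3);
        printf("%u sectors -> %u bytes, %u bio_vecs\n",
               key_size, bi_size, bi_max_vecs);
        return 0;
    }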
126 | 151 | ||
127 | static void refill_dirty(struct closure *cl) | 152 | static void refill_dirty(struct closure *cl) |
128 | { | 153 | { |
129 | struct cached_dev *dc = container_of(cl, struct cached_dev, | 154 | struct cached_dev *dc = container_of(cl, struct cached_dev, |
130 | writeback.cl); | 155 | writeback.cl); |
131 | struct keybuf *buf = &dc->writeback_keys; | 156 | struct keybuf *buf = &dc->writeback_keys; |
132 | bool searched_from_start = false; | 157 | bool searched_from_start = false; |
133 | struct bkey end = MAX_KEY; | 158 | struct bkey end = MAX_KEY; |
134 | SET_KEY_INODE(&end, dc->disk.id); | 159 | SET_KEY_INODE(&end, dc->disk.id); |
135 | 160 | ||
136 | if (!atomic_read(&dc->disk.detaching) && | 161 | if (!atomic_read(&dc->disk.detaching) && |
137 | !dc->writeback_running) | 162 | !dc->writeback_running) |
138 | closure_return(cl); | 163 | closure_return(cl); |
139 | 164 | ||
140 | down_write(&dc->writeback_lock); | 165 | down_write(&dc->writeback_lock); |
141 | 166 | ||
142 | if (!atomic_read(&dc->has_dirty)) { | 167 | if (!atomic_read(&dc->has_dirty)) { |
143 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); | 168 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); |
144 | bch_write_bdev_super(dc, NULL); | 169 | bch_write_bdev_super(dc, NULL); |
145 | 170 | ||
146 | up_write(&dc->writeback_lock); | 171 | up_write(&dc->writeback_lock); |
147 | closure_return(cl); | 172 | closure_return(cl); |
148 | } | 173 | } |
149 | 174 | ||
150 | if (bkey_cmp(&buf->last_scanned, &end) >= 0) { | 175 | if (bkey_cmp(&buf->last_scanned, &end) >= 0) { |
151 | buf->last_scanned = KEY(dc->disk.id, 0, 0); | 176 | buf->last_scanned = KEY(dc->disk.id, 0, 0); |
152 | searched_from_start = true; | 177 | searched_from_start = true; |
153 | } | 178 | } |
154 | 179 | ||
155 | bch_refill_keybuf(dc->disk.c, buf, &end); | 180 | if (dc->partial_stripes_expensive) { |
181 | uint64_t i; | ||
156 | 182 | ||
183 | for (i = 0; i < dc->disk.nr_stripes; i++) | ||
184 | if (atomic_read(dc->disk.stripe_sectors_dirty + i) == | ||
185 | 1 << dc->disk.stripe_size_bits) | ||
186 | goto full_stripes; | ||
187 | |||
188 | goto normal_refill; | ||
189 | full_stripes: | ||
190 | bch_refill_keybuf(dc->disk.c, buf, &end, | ||
191 | dirty_full_stripe_pred); | ||
192 | } else { | ||
193 | normal_refill: | ||
194 | bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); | ||
195 | } | ||
196 | |||
157 | if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { | 197 | if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { |
158 | /* Searched the entire btree - delay awhile */ | 198 | /* Searched the entire btree - delay awhile */ |
159 | 199 | ||
160 | if (RB_EMPTY_ROOT(&buf->keys)) { | 200 | if (RB_EMPTY_ROOT(&buf->keys)) { |
161 | atomic_set(&dc->has_dirty, 0); | 201 | atomic_set(&dc->has_dirty, 0); |
162 | cached_dev_put(dc); | 202 | cached_dev_put(dc); |
163 | } | 203 | } |
164 | 204 | ||
165 | if (!atomic_read(&dc->disk.detaching)) | 205 | if (!atomic_read(&dc->disk.detaching)) |
166 | closure_delay(&dc->writeback, dc->writeback_delay * HZ); | 206 | closure_delay(&dc->writeback, dc->writeback_delay * HZ); |
167 | } | 207 | } |
168 | 208 | ||
169 | up_write(&dc->writeback_lock); | 209 | up_write(&dc->writeback_lock); |
170 | 210 | ||
171 | ratelimit_reset(&dc->writeback_rate); | 211 | ratelimit_reset(&dc->writeback_rate); |
172 | 212 | ||
173 | /* Punt to workqueue only so we don't recurse and blow the stack */ | 213 | /* Punt to workqueue only so we don't recurse and blow the stack */ |
174 | continue_at(cl, read_dirty, dirty_wq); | 214 | continue_at(cl, read_dirty, dirty_wq); |
175 | } | 215 | } |
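refill_dirty() is where the "preferentially write out full stripes" half of the commit lands: when the backing device reports that partial stripes are expensive (raid5/6), it scans the stripe counters and, if at least one stripe is completely dirty, refills the keybuf with dirty_full_stripe_pred; otherwise it falls back to the plain dirty_pred scan. A sketch of that choice with the goto-based control flow flattened into helpers (the array and names are hypothetical user-space stand-ins):

    /*
     * Sketch of the predicate choice in refill_dirty() above.
     * stripe_sectors_dirty[] stands in for d->stripe_sectors_dirty.
     */
    #include <stdbool.h>
    #include <stdint.h>

    static bool any_full_stripe(const unsigned *stripe_sectors_dirty,
                                uint64_t nr_stripes, unsigned stripe_size)
    {
        for (uint64_t i = 0; i < nr_stripes; i++)
            if (stripe_sectors_dirty[i] == stripe_size)
                return true;
        return false;
    }

    /* Prefer full stripes only when partial writes are costly (raid5/6)
     * and at least one stripe is completely dirty. */
    static bool use_full_stripe_pred(bool partial_stripes_expensive,
                                     const unsigned *stripe_sectors_dirty,
                                     uint64_t nr_stripes, unsigned stripe_size)
    {
        return partial_stripes_expensive &&
               any_full_stripe(stripe_sectors_dirty, nr_stripes, stripe_size);
    }

In the kernel code that same decision selects the predicate now passed as a third argument to bch_refill_keybuf().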
176 | 216 | ||
177 | void bch_writeback_queue(struct cached_dev *dc) | 217 | void bch_writeback_queue(struct cached_dev *dc) |
178 | { | 218 | { |
179 | if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) { | 219 | if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) { |
180 | if (!atomic_read(&dc->disk.detaching)) | 220 | if (!atomic_read(&dc->disk.detaching)) |
181 | closure_delay(&dc->writeback, dc->writeback_delay * HZ); | 221 | closure_delay(&dc->writeback, dc->writeback_delay * HZ); |
182 | 222 | ||
183 | continue_at(&dc->writeback.cl, refill_dirty, dirty_wq); | 223 | continue_at(&dc->writeback.cl, refill_dirty, dirty_wq); |
184 | } | 224 | } |
185 | } | 225 | } |
186 | 226 | ||
187 | void bch_writeback_add(struct cached_dev *dc) | 227 | void bch_writeback_add(struct cached_dev *dc) |
188 | { | 228 | { |
189 | if (!atomic_read(&dc->has_dirty) && | 229 | if (!atomic_read(&dc->has_dirty) && |
190 | !atomic_xchg(&dc->has_dirty, 1)) { | 230 | !atomic_xchg(&dc->has_dirty, 1)) { |
191 | atomic_inc(&dc->count); | 231 | atomic_inc(&dc->count); |
192 | 232 | ||
193 | if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { | 233 | if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { |
194 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); | 234 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); |
195 | /* XXX: should do this synchronously */ | 235 | /* XXX: should do this synchronously */ |
196 | bch_write_bdev_super(dc, NULL); | 236 | bch_write_bdev_super(dc, NULL); |
197 | } | 237 | } |
198 | 238 | ||
199 | bch_writeback_queue(dc); | 239 | bch_writeback_queue(dc); |
200 | 240 | ||
201 | if (dc->writeback_percent) | 241 | if (dc->writeback_percent) |
202 | schedule_delayed_work(&dc->writeback_rate_update, | 242 | schedule_delayed_work(&dc->writeback_rate_update, |
203 | dc->writeback_rate_update_seconds * HZ); | 243 | dc->writeback_rate_update_seconds * HZ); |
204 | } | 244 | } |
205 | } | 245 | } |
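bch_writeback_add() uses a read-then-exchange idiom so that only the first writer to dirty the device takes the refcount, flips the superblock to BDEV_STATE_DIRTY, and kicks the writeback machinery. A user-space illustration of that idiom with C11 atomics (names are hypothetical):

    /*
     * The "check, then atomic exchange" idiom from bch_writeback_add(),
     * shown with C11 atomics.  first_dirty_write() returns true for
     * exactly one caller even under concurrency.
     */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_int has_dirty;

    static bool first_dirty_write(void)
    {
        /* The plain load is just a fast path; the xchg is what decides. */
        return !atomic_load(&has_dirty) &&
               !atomic_exchange(&has_dirty, 1);
    }

    int main(void)
    {
        printf("%d %d\n", first_dirty_write(), first_dirty_write()); /* 1 0 */
        return 0;
    }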
206 | 246 | ||
207 | void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, | 247 | void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, |
208 | uint64_t offset, int nr_sectors) | 248 | uint64_t offset, int nr_sectors) |
209 | { | 249 | { |
210 | struct bcache_device *d = c->devices[inode]; | 250 | struct bcache_device *d = c->devices[inode]; |
211 | unsigned stripe_size, stripe_offset; | 251 | unsigned stripe_size, stripe_offset; |
212 | uint64_t stripe; | 252 | uint64_t stripe; |
213 | 253 | ||
214 | if (!d) | 254 | if (!d) |
215 | return; | 255 | return; |
216 | 256 | ||
217 | stripe_size = 1 << d->stripe_size_bits; | 257 | stripe_size = 1 << d->stripe_size_bits; |
218 | stripe = offset >> d->stripe_size_bits; | 258 | stripe = offset >> d->stripe_size_bits; |
219 | stripe_offset = offset & (stripe_size - 1); | 259 | stripe_offset = offset & (stripe_size - 1); |
220 | 260 | ||
221 | while (nr_sectors) { | 261 | while (nr_sectors) { |
222 | int s = min_t(unsigned, abs(nr_sectors), | 262 | int s = min_t(unsigned, abs(nr_sectors), |
223 | stripe_size - stripe_offset); | 263 | stripe_size - stripe_offset); |
224 | 264 | ||
225 | if (nr_sectors < 0) | 265 | if (nr_sectors < 0) |
226 | s = -s; | 266 | s = -s; |
227 | 267 | ||
228 | atomic_add(s, d->stripe_sectors_dirty + stripe); | 268 | atomic_add(s, d->stripe_sectors_dirty + stripe); |
229 | nr_sectors -= s; | 269 | nr_sectors -= s; |
230 | stripe_offset = 0; | 270 | stripe_offset = 0; |
231 | stripe++; | 271 | stripe++; |
232 | } | 272 | } |
233 | } | 273 | } |
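bcache_dev_sectors_dirty_add() is the accounting side of the feature: a signed sector count (positive when a key becomes dirty, negative when it is cleaned) is split across the stripes it covers and applied to each stripe's counter. A user-space sketch with a plain array in place of the atomic_t array (names are hypothetical):

    /*
     * User-space sketch of the per-stripe accounting in
     * bcache_dev_sectors_dirty_add().
     */
    #include <stdint.h>
    #include <stdlib.h>

    static void stripes_dirty_add(int *stripe_sectors_dirty,
                                  unsigned stripe_size_bits,
                                  uint64_t offset, int nr_sectors)
    {
        unsigned stripe_size = 1U << stripe_size_bits;
        uint64_t stripe = offset >> stripe_size_bits;
        unsigned stripe_offset = offset & (stripe_size - 1);

        while (nr_sectors) {
            unsigned room = stripe_size - stripe_offset;
            int s = (unsigned)abs(nr_sectors) < room ? abs(nr_sectors)
                                                     : (int)room;

            if (nr_sectors < 0)
                s = -s;              /* sectors became clean */

            stripe_sectors_dirty[stripe] += s;
            nr_sectors -= s;
            stripe_offset = 0;       /* later stripes start at offset 0 */
            stripe++;
        }
    }

For example, marking 20 sectors dirty at offset 6 with 8-sector stripes adds 2, 8, 8, and 2 to four consecutive stripe counters.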
234 | 274 | ||
235 | /* Background writeback - IO loop */ | 275 | /* Background writeback - IO loop */ |
236 | 276 | ||
237 | static void dirty_io_destructor(struct closure *cl) | 277 | static void dirty_io_destructor(struct closure *cl) |
238 | { | 278 | { |
239 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | 279 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); |
240 | kfree(io); | 280 | kfree(io); |
241 | } | 281 | } |
242 | 282 | ||
243 | static void write_dirty_finish(struct closure *cl) | 283 | static void write_dirty_finish(struct closure *cl) |
244 | { | 284 | { |
245 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | 285 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); |
246 | struct keybuf_key *w = io->bio.bi_private; | 286 | struct keybuf_key *w = io->bio.bi_private; |
247 | struct cached_dev *dc = io->dc; | 287 | struct cached_dev *dc = io->dc; |
248 | struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt); | 288 | struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt); |
249 | 289 | ||
250 | while (bv-- != io->bio.bi_io_vec) | 290 | while (bv-- != io->bio.bi_io_vec) |
251 | __free_page(bv->bv_page); | 291 | __free_page(bv->bv_page); |
252 | 292 | ||
253 | /* This is kind of a dumb way of signalling errors. */ | 293 | /* This is kind of a dumb way of signalling errors. */ |
254 | if (KEY_DIRTY(&w->key)) { | 294 | if (KEY_DIRTY(&w->key)) { |
255 | unsigned i; | 295 | unsigned i; |
256 | struct btree_op op; | 296 | struct btree_op op; |
257 | bch_btree_op_init_stack(&op); | 297 | bch_btree_op_init_stack(&op); |
258 | 298 | ||
259 | op.type = BTREE_REPLACE; | 299 | op.type = BTREE_REPLACE; |
260 | bkey_copy(&op.replace, &w->key); | 300 | bkey_copy(&op.replace, &w->key); |
261 | 301 | ||
262 | SET_KEY_DIRTY(&w->key, false); | 302 | SET_KEY_DIRTY(&w->key, false); |
263 | bch_keylist_add(&op.keys, &w->key); | 303 | bch_keylist_add(&op.keys, &w->key); |
264 | 304 | ||
265 | for (i = 0; i < KEY_PTRS(&w->key); i++) | 305 | for (i = 0; i < KEY_PTRS(&w->key); i++) |
266 | atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); | 306 | atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); |
267 | 307 | ||
268 | bch_btree_insert(&op, dc->disk.c); | 308 | bch_btree_insert(&op, dc->disk.c); |
269 | closure_sync(&op.cl); | 309 | closure_sync(&op.cl); |
270 | 310 | ||
271 | if (op.insert_collision) | 311 | if (op.insert_collision) |
272 | trace_bcache_writeback_collision(&w->key); | 312 | trace_bcache_writeback_collision(&w->key); |
273 | 313 | ||
274 | atomic_long_inc(op.insert_collision | 314 | atomic_long_inc(op.insert_collision |
275 | ? &dc->disk.c->writeback_keys_failed | 315 | ? &dc->disk.c->writeback_keys_failed |
276 | : &dc->disk.c->writeback_keys_done); | 316 | : &dc->disk.c->writeback_keys_done); |
277 | } | 317 | } |
278 | 318 | ||
279 | bch_keybuf_del(&dc->writeback_keys, w); | 319 | bch_keybuf_del(&dc->writeback_keys, w); |
280 | atomic_dec_bug(&dc->in_flight); | 320 | atomic_dec_bug(&dc->in_flight); |
281 | 321 | ||
282 | closure_wake_up(&dc->writeback_wait); | 322 | closure_wake_up(&dc->writeback_wait); |
283 | 323 | ||
284 | closure_return_with_destructor(cl, dirty_io_destructor); | 324 | closure_return_with_destructor(cl, dirty_io_destructor); |
285 | } | 325 | } |
286 | 326 | ||
287 | static void dirty_endio(struct bio *bio, int error) | 327 | static void dirty_endio(struct bio *bio, int error) |
288 | { | 328 | { |
289 | struct keybuf_key *w = bio->bi_private; | 329 | struct keybuf_key *w = bio->bi_private; |
290 | struct dirty_io *io = w->private; | 330 | struct dirty_io *io = w->private; |
291 | 331 | ||
292 | if (error) | 332 | if (error) |
293 | SET_KEY_DIRTY(&w->key, false); | 333 | SET_KEY_DIRTY(&w->key, false); |
294 | 334 | ||
295 | closure_put(&io->cl); | 335 | closure_put(&io->cl); |
296 | } | 336 | } |
297 | 337 | ||
298 | static void write_dirty(struct closure *cl) | 338 | static void write_dirty(struct closure *cl) |
299 | { | 339 | { |
300 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | 340 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); |
301 | struct keybuf_key *w = io->bio.bi_private; | 341 | struct keybuf_key *w = io->bio.bi_private; |
302 | 342 | ||
303 | dirty_init(w); | 343 | dirty_init(w); |
304 | io->bio.bi_rw = WRITE; | 344 | io->bio.bi_rw = WRITE; |
305 | io->bio.bi_sector = KEY_START(&w->key); | 345 | io->bio.bi_sector = KEY_START(&w->key); |
306 | io->bio.bi_bdev = io->dc->bdev; | 346 | io->bio.bi_bdev = io->dc->bdev; |
307 | io->bio.bi_end_io = dirty_endio; | 347 | io->bio.bi_end_io = dirty_endio; |
308 | 348 | ||
309 | closure_bio_submit(&io->bio, cl, &io->dc->disk); | 349 | closure_bio_submit(&io->bio, cl, &io->dc->disk); |
310 | 350 | ||
311 | continue_at(cl, write_dirty_finish, dirty_wq); | 351 | continue_at(cl, write_dirty_finish, dirty_wq); |
312 | } | 352 | } |
313 | 353 | ||
314 | static void read_dirty_endio(struct bio *bio, int error) | 354 | static void read_dirty_endio(struct bio *bio, int error) |
315 | { | 355 | { |
316 | struct keybuf_key *w = bio->bi_private; | 356 | struct keybuf_key *w = bio->bi_private; |
317 | struct dirty_io *io = w->private; | 357 | struct dirty_io *io = w->private; |
318 | 358 | ||
319 | bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), | 359 | bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), |
320 | error, "reading dirty data from cache"); | 360 | error, "reading dirty data from cache"); |
321 | 361 | ||
322 | dirty_endio(bio, error); | 362 | dirty_endio(bio, error); |
323 | } | 363 | } |
324 | 364 | ||
325 | static void read_dirty_submit(struct closure *cl) | 365 | static void read_dirty_submit(struct closure *cl) |
326 | { | 366 | { |
327 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | 367 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); |
328 | 368 | ||
329 | closure_bio_submit(&io->bio, cl, &io->dc->disk); | 369 | closure_bio_submit(&io->bio, cl, &io->dc->disk); |
330 | 370 | ||
331 | continue_at(cl, write_dirty, dirty_wq); | 371 | continue_at(cl, write_dirty, dirty_wq); |
332 | } | 372 | } |
333 | 373 | ||
334 | static void read_dirty(struct closure *cl) | 374 | static void read_dirty(struct closure *cl) |
335 | { | 375 | { |
336 | struct cached_dev *dc = container_of(cl, struct cached_dev, | 376 | struct cached_dev *dc = container_of(cl, struct cached_dev, |
337 | writeback.cl); | 377 | writeback.cl); |
338 | unsigned delay = writeback_delay(dc, 0); | 378 | unsigned delay = writeback_delay(dc, 0); |
339 | struct keybuf_key *w; | 379 | struct keybuf_key *w; |
340 | struct dirty_io *io; | 380 | struct dirty_io *io; |
341 | 381 | ||
342 | /* | 382 | /* |
343 | * XXX: if we error, background writeback just spins. Should use some | 383 | * XXX: if we error, background writeback just spins. Should use some |
344 | * mempools. | 384 | * mempools. |
345 | */ | 385 | */ |
346 | 386 | ||
347 | while (1) { | 387 | while (1) { |
348 | w = bch_keybuf_next(&dc->writeback_keys); | 388 | w = bch_keybuf_next(&dc->writeback_keys); |
349 | if (!w) | 389 | if (!w) |
350 | break; | 390 | break; |
351 | 391 | ||
352 | BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); | 392 | BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); |
353 | 393 | ||
354 | if (delay > 0 && | 394 | if (delay > 0 && |
355 | (KEY_START(&w->key) != dc->last_read || | 395 | (KEY_START(&w->key) != dc->last_read || |
356 | jiffies_to_msecs(delay) > 50)) { | 396 | jiffies_to_msecs(delay) > 50)) { |
357 | w->private = NULL; | 397 | w->private = NULL; |
358 | 398 | ||
359 | closure_delay(&dc->writeback, delay); | 399 | closure_delay(&dc->writeback, delay); |
360 | continue_at(cl, read_dirty, dirty_wq); | 400 | continue_at(cl, read_dirty, dirty_wq); |
361 | } | 401 | } |
362 | 402 | ||
363 | dc->last_read = KEY_OFFSET(&w->key); | 403 | dc->last_read = KEY_OFFSET(&w->key); |
364 | 404 | ||
365 | io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) | 405 | io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) |
366 | * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), | 406 | * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), |
367 | GFP_KERNEL); | 407 | GFP_KERNEL); |
368 | if (!io) | 408 | if (!io) |
369 | goto err; | 409 | goto err; |
370 | 410 | ||
371 | w->private = io; | 411 | w->private = io; |
372 | io->dc = dc; | 412 | io->dc = dc; |
373 | 413 | ||
374 | dirty_init(w); | 414 | dirty_init(w); |
375 | io->bio.bi_sector = PTR_OFFSET(&w->key, 0); | 415 | io->bio.bi_sector = PTR_OFFSET(&w->key, 0); |
376 | io->bio.bi_bdev = PTR_CACHE(dc->disk.c, | 416 | io->bio.bi_bdev = PTR_CACHE(dc->disk.c, |
377 | &w->key, 0)->bdev; | 417 | &w->key, 0)->bdev; |
378 | io->bio.bi_rw = READ; | 418 | io->bio.bi_rw = READ; |
379 | io->bio.bi_end_io = read_dirty_endio; | 419 | io->bio.bi_end_io = read_dirty_endio; |
380 | 420 | ||
381 | if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) | 421 | if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) |
382 | goto err_free; | 422 | goto err_free; |
383 | 423 | ||
384 | trace_bcache_writeback(&w->key); | 424 | trace_bcache_writeback(&w->key); |
385 | 425 | ||
386 | closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); | 426 | closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); |
387 | 427 | ||
388 | delay = writeback_delay(dc, KEY_SIZE(&w->key)); | 428 | delay = writeback_delay(dc, KEY_SIZE(&w->key)); |
389 | 429 | ||
390 | atomic_inc(&dc->in_flight); | 430 | atomic_inc(&dc->in_flight); |
391 | 431 | ||
392 | if (!closure_wait_event(&dc->writeback_wait, cl, | 432 | if (!closure_wait_event(&dc->writeback_wait, cl, |
393 | atomic_read(&dc->in_flight) < 64)) | 433 | atomic_read(&dc->in_flight) < 64)) |
394 | continue_at(cl, read_dirty, dirty_wq); | 434 | continue_at(cl, read_dirty, dirty_wq); |
395 | } | 435 | } |
396 | 436 | ||
397 | if (0) { | 437 | if (0) { |
398 | err_free: | 438 | err_free: |
399 | kfree(w->private); | 439 | kfree(w->private); |
400 | err: | 440 | err: |
401 | bch_keybuf_del(&dc->writeback_keys, w); | 441 | bch_keybuf_del(&dc->writeback_keys, w); |
402 | } | 442 | } |
403 | 443 | ||
404 | refill_dirty(cl); | 444 | refill_dirty(cl); |
405 | } | 445 | } |
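Inside read_dirty()'s loop, the pacing delay computed by writeback_delay() is only honoured when the next key is not contiguous with the previous writeback or the delay has grown beyond 50 ms; contiguous extents are streamed back-to-back. That test, pulled out as plain C (names are hypothetical stand-ins for the key and cached_dev fields):

    /*
     * The throttling decision inside read_dirty()'s loop: sleep only if
     * the delay is non-zero and the extent is either non-contiguous with
     * the previous one or the delay exceeds 50 ms.
     */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static bool should_sleep_before(uint64_t key_start, uint64_t last_read_end,
                                    unsigned delay_ms)
    {
        return delay_ms > 0 &&
               (key_start != last_read_end || delay_ms > 50);
    }

    int main(void)
    {
        /* Contiguous with the last writeback, small delay: keep streaming. */
        printf("%d\n", should_sleep_before(1024, 1024, 10));    /* 0 */
        /* A gap in the backing device: honour the pacing delay. */
        printf("%d\n", should_sleep_before(4096, 1024, 10));    /* 1 */
        return 0;
    }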
406 | 446 | ||
407 | /* Init */ | 447 | /* Init */ |
408 | 448 | ||
409 | static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op, | 449 | static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op, |
410 | struct cached_dev *dc) | 450 | struct cached_dev *dc) |
411 | { | 451 | { |
412 | struct bkey *k; | 452 | struct bkey *k; |
413 | struct btree_iter iter; | 453 | struct btree_iter iter; |
414 | 454 | ||
415 | bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0)); | 455 | bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0)); |
416 | while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) | 456 | while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) |
417 | if (!b->level) { | 457 | if (!b->level) { |
418 | if (KEY_INODE(k) > dc->disk.id) | 458 | if (KEY_INODE(k) > dc->disk.id) |
419 | break; | 459 | break; |
420 | 460 | ||
421 | if (KEY_DIRTY(k)) | 461 | if (KEY_DIRTY(k)) |
422 | bcache_dev_sectors_dirty_add(b->c, dc->disk.id, | 462 | bcache_dev_sectors_dirty_add(b->c, dc->disk.id, |
423 | KEY_START(k), | 463 | KEY_START(k), |
424 | KEY_SIZE(k)); | 464 | KEY_SIZE(k)); |
425 | } else { | 465 | } else { |
426 | btree(sectors_dirty_init, k, b, op, dc); | 466 | btree(sectors_dirty_init, k, b, op, dc); |
427 | if (KEY_INODE(k) > dc->disk.id) | 467 | if (KEY_INODE(k) > dc->disk.id) |
428 | break; | 468 | break; |
429 | 469 | ||
430 | cond_resched(); | 470 | cond_resched(); |
431 | } | 471 | } |
432 | 472 | ||
433 | return 0; | 473 | return 0; |
434 | } | 474 | } |
435 | 475 | ||
436 | void bch_sectors_dirty_init(struct cached_dev *dc) | 476 | void bch_sectors_dirty_init(struct cached_dev *dc) |
437 | { | 477 | { |
438 | struct btree_op op; | 478 | struct btree_op op; |
439 | 479 | ||
440 | bch_btree_op_init_stack(&op); | 480 | bch_btree_op_init_stack(&op); |
441 | btree_root(sectors_dirty_init, dc->disk.c, &op, dc); | 481 | btree_root(sectors_dirty_init, dc->disk.c, &op, dc); |
442 | } | 482 | } |
443 | 483 | ||
444 | void bch_cached_dev_writeback_init(struct cached_dev *dc) | 484 | void bch_cached_dev_writeback_init(struct cached_dev *dc) |
445 | { | 485 | { |
446 | closure_init_unlocked(&dc->writeback); | 486 | closure_init_unlocked(&dc->writeback); |
447 | init_rwsem(&dc->writeback_lock); | 487 | init_rwsem(&dc->writeback_lock); |
448 | 488 | ||
449 | bch_keybuf_init(&dc->writeback_keys, dirty_pred); | 489 | bch_keybuf_init(&dc->writeback_keys); |
450 | 490 | ||
451 | dc->writeback_metadata = true; | 491 | dc->writeback_metadata = true; |
452 | dc->writeback_running = true; | 492 | dc->writeback_running = true; |
453 | dc->writeback_percent = 10; | 493 | dc->writeback_percent = 10; |
454 | dc->writeback_delay = 30; | 494 | dc->writeback_delay = 30; |
455 | dc->writeback_rate.rate = 1024; | 495 | dc->writeback_rate.rate = 1024; |
456 | 496 | ||
457 | dc->writeback_rate_update_seconds = 30; | 497 | dc->writeback_rate_update_seconds = 30; |
458 | dc->writeback_rate_d_term = 16; | 498 | dc->writeback_rate_d_term = 16; |
459 | dc->writeback_rate_p_term_inverse = 64; | 499 | dc->writeback_rate_p_term_inverse = 64; |
460 | dc->writeback_rate_d_smooth = 8; | 500 | dc->writeback_rate_d_smooth = 8; |
461 | 501 | ||
462 | INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); | 502 | INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); |
463 | schedule_delayed_work(&dc->writeback_rate_update, | 503 | schedule_delayed_work(&dc->writeback_rate_update, |
464 | dc->writeback_rate_update_seconds * HZ); | 504 | dc->writeback_rate_update_seconds * HZ); |
465 | } | 505 | } |
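bch_cached_dev_writeback_init() seeds the controller with its default tunables (a 10% dirty target, 30-second update period, D term 16, 1/P term 64, D smoothing 8) and an initial rate of 1024. As a worked example of what the 10% default means for the target computed at the top of __update_writeback_rate(): for a 100 GiB cache shared equally by two backing devices, each device's dirty target works out to about 5 GiB. The sizes below are made up purely for illustration:

    /* Worked example of the dirty-target arithmetic with the default
     * writeback_percent of 10; device sizes are illustrative only. */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t cache_sectors = (100ULL << 30) >> 9;   /* 100 GiB in sectors */
        uint64_t writeback_percent = 10;
        uint64_t cache_dirty_target = cache_sectors * writeback_percent / 100;

        /* Two equally sized backing devices share the cache. */
        uint64_t bdev_sectors = (500ULL << 30) >> 9;
        uint64_t cached_dev_sectors = 2 * bdev_sectors;
        uint64_t target = cache_dirty_target * bdev_sectors / cached_dev_sectors;

        assert(target == (5ULL << 30) >> 9);            /* 5 GiB in sectors */
        printf("per-device dirty target: %llu sectors\n",
               (unsigned long long)target);
        return 0;
    }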
466 | 506 | ||
467 | void bch_writeback_exit(void) | 507 | void bch_writeback_exit(void) |
468 | { | 508 | { |
469 | if (dirty_wq) | 509 | if (dirty_wq) |
470 | destroy_workqueue(dirty_wq); | 510 | destroy_workqueue(dirty_wq); |
471 | } | 511 | } |
472 | 512 | ||
473 | int __init bch_writeback_init(void) | 513 | int __init bch_writeback_init(void) |
474 | { | 514 | { |
475 | dirty_wq = create_singlethread_workqueue("bcache_writeback"); | 515 | dirty_wq = create_singlethread_workqueue("bcache_writeback"); |
476 | if (!dirty_wq) | 516 | if (!dirty_wq) |
477 | return -ENOMEM; | 517 | return -ENOMEM; |
478 | 518 | ||
479 | return 0; | 519 | return 0; |
480 | } | 520 | } |
481 | 521 |
drivers/md/bcache/writeback.h
1 | #ifndef _BCACHE_WRITEBACK_H | 1 | #ifndef _BCACHE_WRITEBACK_H |
2 | #define _BCACHE_WRITEBACK_H | 2 | #define _BCACHE_WRITEBACK_H |
3 | 3 | ||
4 | #define CUTOFF_WRITEBACK 40 | ||
5 | #define CUTOFF_WRITEBACK_SYNC 70 | ||
6 | |||
4 | static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) | 7 | static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) |
5 | { | 8 | { |
6 | uint64_t i, ret = 0; | 9 | uint64_t i, ret = 0; |
7 | 10 | ||
8 | for (i = 0; i < d->nr_stripes; i++) | 11 | for (i = 0; i < d->nr_stripes; i++) |
9 | ret += atomic_read(d->stripe_sectors_dirty + i); | 12 | ret += atomic_read(d->stripe_sectors_dirty + i); |
10 | 13 | ||
11 | return ret; | 14 | return ret; |
15 | } | ||
16 | |||
17 | static inline bool bcache_dev_stripe_dirty(struct bcache_device *d, | ||
18 | uint64_t offset, | ||
19 | unsigned nr_sectors) | ||
20 | { | ||
21 | uint64_t stripe = offset >> d->stripe_size_bits; | ||
22 | |||
23 | while (1) { | ||
24 | if (atomic_read(d->stripe_sectors_dirty + stripe)) | ||
25 | return true; | ||
26 | |||
27 | if (nr_sectors <= 1 << d->stripe_size_bits) | ||
28 | return false; | ||
29 | |||
30 | nr_sectors -= 1 << d->stripe_size_bits; | ||
31 | stripe++; | ||
32 | } | ||
33 | } | ||
34 | |||
35 | static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, | ||
36 | unsigned cache_mode, bool would_skip) | ||
37 | { | ||
38 | unsigned in_use = dc->disk.c->gc_stats.in_use; | ||
39 | |||
40 | if (cache_mode != CACHE_MODE_WRITEBACK || | ||
41 | atomic_read(&dc->disk.detaching) || | ||
42 | in_use > CUTOFF_WRITEBACK_SYNC) | ||
43 | return false; | ||
44 | |||
45 | if (dc->partial_stripes_expensive && | ||
46 | bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector, | ||
47 | bio_sectors(bio))) | ||
48 | return true; | ||
49 | |||
50 | if (would_skip) | ||
51 | return false; | ||
52 | |||
53 | return bio->bi_rw & REQ_SYNC || | ||
54 | in_use <= CUTOFF_WRITEBACK; | ||
12 | } | 55 | } |
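should_writeback() is the request-path half of the optimization: with partial_stripes_expensive set, a write that lands on an already-dirty stripe is forced into writeback mode even if it would otherwise be skipped, which helps dirty data accumulate into full stripes; the CUTOFF_WRITEBACK / CUTOFF_WRITEBACK_SYNC percentages still gate everything on how full the cache is. A user-space sketch of the decision, with booleans replacing the cache-mode enum and the REQ_SYNC flag (names are hypothetical):

    /*
     * User-space sketch of the decision in should_writeback().  The
     * stripe_already_dirty flag stands in for bcache_dev_stripe_dirty();
     * the cutoffs mirror the header above.
     */
    #include <stdbool.h>

    #define CUTOFF_WRITEBACK        40    /* % of cache in use */
    #define CUTOFF_WRITEBACK_SYNC   70

    static bool want_writeback(bool writeback_mode, bool detaching,
                               unsigned pct_in_use,
                               bool partial_stripes_expensive,
                               bool stripe_already_dirty,
                               bool would_skip, bool sync_write)
    {
        if (!writeback_mode || detaching ||
            pct_in_use > CUTOFF_WRITEBACK_SYNC)
            return false;

        /* Build up full stripes: writes to dirty stripes go to writeback. */
        if (partial_stripes_expensive && stripe_already_dirty)
            return true;

        if (would_skip)
            return false;

        return sync_write || pct_in_use <= CUTOFF_WRITEBACK;
    }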
13 | 56 | ||
14 | void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); | 57 | void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); |
15 | void bch_writeback_queue(struct cached_dev *); | 58 | void bch_writeback_queue(struct cached_dev *); |
16 | void bch_writeback_add(struct cached_dev *); | 59 | void bch_writeback_add(struct cached_dev *); |
17 | 60 | ||
18 | void bch_sectors_dirty_init(struct cached_dev *dc); | 61 | void bch_sectors_dirty_init(struct cached_dev *dc); |
19 | void bch_cached_dev_writeback_init(struct cached_dev *); | 62 | void bch_cached_dev_writeback_init(struct cached_dev *); |
20 | 63 | ||
21 | #endif | 64 | #endif |
22 | 65 |