Commit eb84ae039e10f1f80443d846ba1350122bbdc753
1 parent: 54641bd17d
Exists in master and in 7 other branches
Btrfs: Cleanup and comment ordered-data.c
Signed-off-by: Chris Mason <chris.mason@oracle.com>
Showing 3 changed files with 121 additions and 70 deletions (side-by-side diff)
fs/btrfs/inode.c
... | ... | @@ -1343,7 +1343,7 @@ |
1343 | 1343 | unlock_extent(io_tree, page_start, page_end, GFP_NOFS); |
1344 | 1344 | unlock_page(page); |
1345 | 1345 | page_cache_release(page); |
1346 | - btrfs_wait_ordered_extent(inode, ordered); | |
1346 | + btrfs_start_ordered_extent(inode, ordered, 1); | |
1347 | 1347 | btrfs_put_ordered_extent(ordered); |
1348 | 1348 | goto again; |
1349 | 1349 | } |
... | ... | @@ -2660,6 +2660,10 @@ |
2660 | 2660 | ordered = btrfs_lookup_ordered_extent(page->mapping->host, |
2661 | 2661 | page_offset(page)); |
2662 | 2662 | if (ordered) { |
2663 | + /* | |
2664 | + * IO on this page will never be started, so we need | |
2665 | + * to account for any ordered extents now | |
2666 | + */ | |
2663 | 2667 | clear_extent_bit(tree, page_start, page_end, |
2664 | 2668 | EXTENT_DIRTY | EXTENT_DELALLOC | |
2665 | 2669 | EXTENT_LOCKED, 1, 0, GFP_NOFS); |
2666 | 2670 | |
... | ... | @@ -2732,11 +2736,15 @@ |
2732 | 2736 | lock_extent(io_tree, page_start, page_end, GFP_NOFS); |
2733 | 2737 | set_page_extent_mapped(page); |
2734 | 2738 | |
2739 | + /* | |
2740 | + * we can't set the delalloc bits if there are pending ordered | |
2741 | + * extents. Drop our locks and wait for them to finish | |
2742 | + */ | |
2735 | 2743 | ordered = btrfs_lookup_ordered_extent(inode, page_start); |
2736 | 2744 | if (ordered) { |
2737 | 2745 | unlock_extent(io_tree, page_start, page_end, GFP_NOFS); |
2738 | 2746 | unlock_page(page); |
2739 | - btrfs_wait_ordered_extent(inode, ordered); | |
2747 | + btrfs_start_ordered_extent(inode, ordered, 1); | |
2740 | 2748 | btrfs_put_ordered_extent(ordered); |
2741 | 2749 | goto again; |
2742 | 2750 | } |
fs/btrfs/ordered-data.c
... | ... | @@ -136,6 +136,19 @@ |
136 | 136 | return ret; |
137 | 137 | } |
138 | 138 | |
139 | +/* allocate and add a new ordered_extent into the per-inode tree. | |
140 | + * file_offset is the logical offset in the file | |
141 | + * | |
142 | + * start is the disk block number of an extent already reserved in the | |
143 | + * extent allocation tree | |
144 | + * | |
145 | + * len is the length of the extent | |
146 | + * | |
147 | + * This also sets the EXTENT_ORDERED bit on the range in the inode. | |
148 | + * | |
149 | + * The tree is given a single reference on the ordered extent that was | |
150 | + * inserted. | |
151 | + */ | |
139 | 152 | int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, |
140 | 153 | u64 start, u64 len) |
141 | 154 | { |
... | ... | @@ -152,7 +165,6 @@ |
152 | 165 | entry->file_offset = file_offset; |
153 | 166 | entry->start = start; |
154 | 167 | entry->len = len; |
155 | - entry->inode = inode; | |
156 | 168 | /* one ref for the tree */ |
157 | 169 | atomic_set(&entry->refs, 1); |
158 | 170 | init_waitqueue_head(&entry->wait); |
159 | 171 | |
... | ... | @@ -167,12 +179,15 @@ |
167 | 179 | set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset, |
168 | 180 | entry_end(entry) - 1, GFP_NOFS); |
169 | 181 | |
170 | - set_bit(BTRFS_ORDERED_START, &entry->flags); | |
171 | 182 | mutex_unlock(&tree->mutex); |
172 | 183 | BUG_ON(node); |
173 | 184 | return 0; |
174 | 185 | } |
175 | 186 | |
187 | +/* | |
188 | + * Add a struct btrfs_ordered_sum into the list of checksums to be inserted | |
189 | + * when an ordered extent is finished. | |
190 | + */ | |
176 | 191 | int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum) |
177 | 192 | { |
178 | 193 | struct btrfs_ordered_inode_tree *tree; |
179 | 194 | |
180 | 195 | |
... | ... | @@ -182,29 +197,25 @@ |
182 | 197 | tree = &BTRFS_I(inode)->ordered_tree; |
183 | 198 | mutex_lock(&tree->mutex); |
184 | 199 | node = tree_search(tree, sum->file_offset); |
185 | - if (!node) { | |
186 | -search_fail: | |
187 | -printk("add ordered sum failed to find a node for inode %lu offset %Lu\n", inode->i_ino, sum->file_offset); | |
188 | - node = rb_first(&tree->tree); | |
189 | - while(node) { | |
190 | - entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); | |
191 | - printk("entry %Lu %Lu %Lu\n", entry->file_offset, entry->file_offset + entry->len, entry->start); | |
192 | - node = rb_next(node); | |
193 | - } | |
194 | - BUG(); | |
195 | - } | |
196 | 200 | BUG_ON(!node); |
197 | 201 | |
198 | 202 | entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); |
199 | - if (!offset_in_entry(entry, sum->file_offset)) { | |
200 | - goto search_fail; | |
201 | - } | |
203 | + BUG_ON(!offset_in_entry(entry, sum->file_offset)); | |
202 | 204 | |
203 | 205 | list_add_tail(&sum->list, &entry->list); |
204 | 206 | mutex_unlock(&tree->mutex); |
205 | 207 | return 0; |
206 | 208 | } |
207 | 209 | |
210 | +/* | |
211 | + * this is used to account for finished IO across a given range | |
212 | + * of the file. The IO should not span ordered extents. If | |
213 | + * a given ordered_extent is completely done, 1 is returned, otherwise | |
214 | + * 0. | |
215 | + * | |
216 | + * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used | |
217 | + * to make sure this function only returns 1 once for a given ordered extent. | |
218 | + */ | |
208 | 219 | int btrfs_dec_test_ordered_pending(struct inode *inode, |
209 | 220 | u64 file_offset, u64 io_size) |
210 | 221 | { |
... | ... | @@ -233,9 +244,6 @@ |
233 | 244 | ret = test_range_bit(io_tree, entry->file_offset, |
234 | 245 | entry->file_offset + entry->len - 1, |
235 | 246 | EXTENT_ORDERED, 0); |
236 | - if (!test_bit(BTRFS_ORDERED_START, &entry->flags)) { | |
237 | -printk("inode %lu not ready yet for extent %Lu %Lu\n", inode->i_ino, entry->file_offset, entry_end(entry)); | |
238 | - } | |
239 | 247 | if (ret == 0) |
240 | 248 | ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); |
241 | 249 | out: |
... | ... | @@ -243,6 +251,10 @@ |
243 | 251 | return ret == 0; |
244 | 252 | } |
245 | 253 | |
254 | +/* | |
255 | + * used to drop a reference on an ordered extent. This will free | |
256 | + * the extent if the last reference is dropped | |
257 | + */ | |
246 | 258 | int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) |
247 | 259 | { |
248 | 260 | struct list_head *cur; |
... | ... | @@ -260,6 +272,10 @@ |
260 | 272 | return 0; |
261 | 273 | } |
262 | 274 | |
275 | +/* | |
276 | + * remove an ordered extent from the tree. No references are dropped | |
277 | + * but, anyone waiting on this extent is woken up. | |
278 | + */ | |
263 | 279 | int btrfs_remove_ordered_extent(struct inode *inode, |
264 | 280 | struct btrfs_ordered_extent *entry) |
265 | 281 | { |
266 | 282 | |
267 | 283 | |
... | ... | @@ -277,27 +293,25 @@ |
277 | 293 | return 0; |
278 | 294 | } |
279 | 295 | |
280 | -void btrfs_wait_ordered_extent(struct inode *inode, | |
281 | - struct btrfs_ordered_extent *entry) | |
296 | +/* | |
297 | + * Used to start IO or wait for a given ordered extent to finish. | |
298 | + * | |
299 | + * If wait is one, this effectively waits on page writeback for all the pages | |
300 | + * in the extent, and it waits on the io completion code to insert | |
301 | + * metadata into the btree corresponding to the extent | |
302 | + */ | |
303 | +void btrfs_start_ordered_extent(struct inode *inode, | |
304 | + struct btrfs_ordered_extent *entry, | |
305 | + int wait) | |
282 | 306 | { |
283 | 307 | u64 start = entry->file_offset; |
284 | 308 | u64 end = start + entry->len - 1; |
285 | -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) | |
286 | - do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE); | |
287 | -#else | |
288 | - do_sync_mapping_range(inode->i_mapping, start, end, | |
289 | - SYNC_FILE_RANGE_WRITE); | |
290 | -#endif | |
291 | - wait_event(entry->wait, | |
292 | - test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); | |
293 | -} | |
294 | 309 | |
295 | -static void btrfs_start_ordered_extent(struct inode *inode, | |
296 | - struct btrfs_ordered_extent *entry, int wait) | |
297 | -{ | |
298 | - u64 start = entry->file_offset; | |
299 | - u64 end = start + entry->len - 1; | |
300 | - | |
310 | + /* | |
311 | + * pages in the range can be dirty, clean or writeback. We | |
312 | + * start IO on any dirty ones so the wait doesn't stall waiting | |
313 | + * for pdflush to find them | |
314 | + */ | |
301 | 315 | #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) |
302 | 316 | do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE); |
303 | 317 | #else |
... | ... | @@ -309,6 +323,9 @@ |
309 | 323 | &entry->flags)); |
310 | 324 | } |
311 | 325 | |
326 | +/* | |
327 | + * Used to wait on ordered extents across a large range of bytes. | |
328 | + */ | |
312 | 329 | void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) |
313 | 330 | { |
314 | 331 | u64 end; |
315 | 332 | |
... | ... | @@ -349,31 +366,11 @@ |
349 | 366 | } |
350 | 367 | } |
351 | 368 | |
352 | -int btrfs_add_ordered_pending(struct inode *inode, | |
353 | - struct btrfs_ordered_extent *ordered, | |
354 | - u64 start, u64 len) | |
355 | -{ | |
356 | - WARN_ON(1); | |
357 | - return 0; | |
358 | -#if 0 | |
359 | - int ret; | |
360 | - struct btrfs_ordered_inode_tree *tree; | |
361 | - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | |
362 | 369 | |
363 | - tree = &BTRFS_I(inode)->ordered_tree; | |
364 | - mutex_lock(&tree->mutex); | |
365 | - if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) { | |
366 | - ret = -EAGAIN; | |
367 | - goto out; | |
368 | - } | |
369 | - set_extent_ordered(io_tree, start, start + len - 1, GFP_NOFS); | |
370 | - ret = 0; | |
371 | -out: | |
372 | - mutex_unlock(&tree->mutex); | |
373 | - return ret; | |
374 | -#endif | |
375 | -} | |
376 | - | |
370 | +/* | |
371 | + * find an ordered extent corresponding to file_offset. return NULL if | |
372 | + * nothing is found, otherwise take a reference on the extent and return it | |
373 | + */ | |
377 | 374 | struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, |
378 | 375 | u64 file_offset) |
379 | 376 | { |
... | ... | @@ -397,6 +394,10 @@ |
397 | 394 | return entry; |
398 | 395 | } |
399 | 396 | |
397 | +/* | |
398 | + * lookup and return any extent before 'file_offset'. NULL is returned | |
399 | + * if none is found | |
400 | + */ | |
400 | 401 | struct btrfs_ordered_extent * |
401 | 402 | btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset) |
402 | 403 | { |
... | ... | @@ -417,6 +418,10 @@ |
417 | 418 | return entry; |
418 | 419 | } |
419 | 420 | |
421 | +/* | |
422 | + * After an extent is done, call this to conditionally update the on disk | |
423 | + * i_size. i_size is updated to cover any fully written part of the file. | |
424 | + */ | |
420 | 425 | int btrfs_ordered_update_i_size(struct inode *inode, |
421 | 426 | struct btrfs_ordered_extent *ordered) |
422 | 427 | { |
... | ... | @@ -507,6 +512,11 @@ |
507 | 512 | return 0; |
508 | 513 | } |
509 | 514 | |
515 | +/* | |
516 | + * search the ordered extents for one corresponding to 'offset' and | |
517 | + * try to find a checksum. This is used because we allow pages to | |
518 | + * be reclaimed before their checksum is actually put into the btree | |
519 | + */ | |
510 | 520 | int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum) |
511 | 521 | { |
512 | 522 | struct btrfs_ordered_sum *ordered_sum; |
fs/btrfs/ordered-data.h
... | ... | @@ -19,12 +19,19 @@ |
19 | 19 | #ifndef __BTRFS_ORDERED_DATA__ |
20 | 20 | #define __BTRFS_ORDERED_DATA__ |
21 | 21 | |
22 | +/* one of these per inode */ | |
22 | 23 | struct btrfs_ordered_inode_tree { |
23 | 24 | struct mutex mutex; |
24 | 25 | struct rb_root tree; |
25 | 26 | struct rb_node *last; |
26 | 27 | }; |
27 | 28 | |
29 | +/* | |
30 | + * these are used to collect checksums done just before bios submission. | |
31 | + * They are attached via a list into the ordered extent, and | |
32 | + * checksum items are inserted into the tree after all the blocks in | |
33 | + * the ordered extent are on disk | |
34 | + */ | |
28 | 35 | struct btrfs_sector_sum { |
29 | 36 | u64 offset; |
30 | 37 | u32 sum; |
31 | 38 | |
32 | 39 | |
33 | 40 | |
34 | 41 | |
35 | 42 | |
36 | 43 | |
37 | 44 | |
38 | 45 | |
39 | 46 | |
40 | 47 | |
41 | 48 | |
42 | 49 | |
... | ... | @@ -34,27 +41,56 @@ |
34 | 41 | u64 file_offset; |
35 | 42 | u64 len; |
36 | 43 | struct list_head list; |
44 | + /* last field is a variable length array of btrfs_sector_sums */ | |
37 | 45 | struct btrfs_sector_sum sums; |
38 | 46 | }; |
39 | 47 | |
40 | -/* bits for the flags field */ | |
48 | +/* | |
49 | + * bits for the flags field: | |
50 | + * | |
51 | + * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written. | |
52 | + * It is used to make sure metadata is inserted into the tree only once | |
53 | + * per extent. | |
54 | + * | |
55 | + * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the | |
56 | + * rbtree, just before waking any waiters. It is used to indicate the | |
57 | + * IO is done and any metadata is inserted into the tree. | |
58 | + */ | |
41 | 59 | #define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */ |
60 | + | |
42 | 61 | #define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */ |
43 | -#define BTRFS_ORDERED_START 2 /* set when tree setup */ | |
44 | 62 | |
45 | 63 | struct btrfs_ordered_extent { |
64 | + /* logical offset in the file */ | |
46 | 65 | u64 file_offset; |
66 | + | |
67 | + /* disk byte number */ | |
47 | 68 | u64 start; |
69 | + | |
70 | + /* length of the extent in bytes */ | |
48 | 71 | u64 len; |
72 | + | |
73 | + /* flags (described above) */ | |
49 | 74 | unsigned long flags; |
75 | + | |
76 | + /* reference count */ | |
50 | 77 | atomic_t refs; |
78 | + | |
79 | + /* list of checksums for insertion when the extent io is done */ | |
51 | 80 | struct list_head list; |
52 | - struct inode *inode; | |
81 | + | |
82 | + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ | |
53 | 83 | wait_queue_head_t wait; |
84 | + | |
85 | + /* our friendly rbtree entry */ | |
54 | 86 | struct rb_node rb_node; |
55 | 87 | }; |
56 | 88 | |
57 | 89 | |
90 | +/* | |
91 | + * calculates the total size you need to allocate for an ordered sum | |
92 | + * structure spanning 'bytes' in the file | |
93 | + */ | |
58 | 94 | static inline int btrfs_ordered_sum_size(struct btrfs_root *root, u64 bytes) |
59 | 95 | { |
60 | 96 | unsigned long num_sectors = (bytes + root->sectorsize - 1) / |
61 | 97 | |
... | ... | @@ -81,14 +117,11 @@ |
81 | 117 | int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum); |
82 | 118 | struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, |
83 | 119 | u64 file_offset); |
84 | -void btrfs_wait_ordered_extent(struct inode *inode, | |
85 | - struct btrfs_ordered_extent *entry); | |
120 | +void btrfs_start_ordered_extent(struct inode *inode, | |
121 | + struct btrfs_ordered_extent *entry, int wait); | |
86 | 122 | void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); |
87 | 123 | struct btrfs_ordered_extent * |
88 | 124 | btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); |
89 | -int btrfs_add_ordered_pending(struct inode *inode, | |
90 | - struct btrfs_ordered_extent *ordered, | |
91 | - u64 start, u64 len); | |
92 | 125 | int btrfs_ordered_update_i_size(struct inode *inode, |
93 | 126 | struct btrfs_ordered_extent *ordered); |
94 | 127 | int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum); |