Commit eb84ae039e10f1f80443d846ba1350122bbdc753

Authored by Chris Mason
1 parent 54641bd17d

Btrfs: Cleanup and comment ordered-data.c

Signed-off-by: Chris Mason <chris.mason@oracle.com>

Showing 3 changed files with 121 additions and 70 deletions Side-by-side Diff

... ... @@ -1343,7 +1343,7 @@
1343 1343 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
1344 1344 unlock_page(page);
1345 1345 page_cache_release(page);
1346   - btrfs_wait_ordered_extent(inode, ordered);
  1346 + btrfs_start_ordered_extent(inode, ordered, 1);
1347 1347 btrfs_put_ordered_extent(ordered);
1348 1348 goto again;
1349 1349 }
... ... @@ -2660,6 +2660,10 @@
2660 2660 ordered = btrfs_lookup_ordered_extent(page->mapping->host,
2661 2661 page_offset(page));
2662 2662 if (ordered) {
  2663 + /*
  2664 + * IO on this page will never be started, so we need
  2665 + * to account for any ordered extents now
  2666 + */
2663 2667 clear_extent_bit(tree, page_start, page_end,
2664 2668 EXTENT_DIRTY | EXTENT_DELALLOC |
2665 2669 EXTENT_LOCKED, 1, 0, GFP_NOFS);
2666 2670  
... ... @@ -2732,11 +2736,15 @@
2732 2736 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
2733 2737 set_page_extent_mapped(page);
2734 2738  
  2739 + /*
  2740 + * we can't set the delalloc bits if there are pending ordered
  2741 + * extents. Drop our locks and wait for them to finish
  2742 + */
2735 2743 ordered = btrfs_lookup_ordered_extent(inode, page_start);
2736 2744 if (ordered) {
2737 2745 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2738 2746 unlock_page(page);
2739   - btrfs_wait_ordered_extent(inode, ordered);
  2747 + btrfs_start_ordered_extent(inode, ordered, 1);
2740 2748 btrfs_put_ordered_extent(ordered);
2741 2749 goto again;
2742 2750 }
fs/btrfs/ordered-data.c
... ... @@ -136,6 +136,19 @@
136 136 return ret;
137 137 }
138 138  
  139 +/* allocate and add a new ordered_extent into the per-inode tree.
  140 + * file_offset is the logical offset in the file
  141 + *
  142 + * start is the disk block number of an extent already reserved in the
  143 + * extent allocation tree
  144 + *
  145 + * len is the length of the extent
  146 + *
  147 + * This also sets the EXTENT_ORDERED bit on the range in the inode.
  148 + *
  149 + * The tree is given a single reference on the ordered extent that was
  150 + * inserted.
  151 + */
139 152 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
140 153 u64 start, u64 len)
141 154 {
... ... @@ -152,7 +165,6 @@
152 165 entry->file_offset = file_offset;
153 166 entry->start = start;
154 167 entry->len = len;
155   - entry->inode = inode;
156 168 /* one ref for the tree */
157 169 atomic_set(&entry->refs, 1);
158 170 init_waitqueue_head(&entry->wait);
159 171  
... ... @@ -167,12 +179,15 @@
167 179 set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
168 180 entry_end(entry) - 1, GFP_NOFS);
169 181  
170   - set_bit(BTRFS_ORDERED_START, &entry->flags);
171 182 mutex_unlock(&tree->mutex);
172 183 BUG_ON(node);
173 184 return 0;
174 185 }
175 186  
  187 +/*
  188 + * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
  189 + * when an ordered extent is finished.
  190 + */
176 191 int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum)
177 192 {
178 193 struct btrfs_ordered_inode_tree *tree;
179 194  
180 195  
... ... @@ -182,29 +197,25 @@
182 197 tree = &BTRFS_I(inode)->ordered_tree;
183 198 mutex_lock(&tree->mutex);
184 199 node = tree_search(tree, sum->file_offset);
185   - if (!node) {
186   -search_fail:
187   -printk("add ordered sum failed to find a node for inode %lu offset %Lu\n", inode->i_ino, sum->file_offset);
188   - node = rb_first(&tree->tree);
189   - while(node) {
190   - entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
191   - printk("entry %Lu %Lu %Lu\n", entry->file_offset, entry->file_offset + entry->len, entry->start);
192   - node = rb_next(node);
193   - }
194   - BUG();
195   - }
196 200 BUG_ON(!node);
197 201  
198 202 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
199   - if (!offset_in_entry(entry, sum->file_offset)) {
200   - goto search_fail;
201   - }
  203 + BUG_ON(!offset_in_entry(entry, sum->file_offset));
202 204  
203 205 list_add_tail(&sum->list, &entry->list);
204 206 mutex_unlock(&tree->mutex);
205 207 return 0;
206 208 }
207 209  
  210 +/*
  211 + * this is used to account for finished IO across a given range
  212 + * of the file. The IO should not span ordered extents. If
  213 + * a given ordered_extent is completely done, 1 is returned, otherwise
  214 + * 0.
  215 + *
  216 + * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
  217 + * to make sure this function only returns 1 once for a given ordered extent.
  218 + */
208 219 int btrfs_dec_test_ordered_pending(struct inode *inode,
209 220 u64 file_offset, u64 io_size)
210 221 {
... ... @@ -233,9 +244,6 @@
233 244 ret = test_range_bit(io_tree, entry->file_offset,
234 245 entry->file_offset + entry->len - 1,
235 246 EXTENT_ORDERED, 0);
236   - if (!test_bit(BTRFS_ORDERED_START, &entry->flags)) {
237   -printk("inode %lu not ready yet for extent %Lu %Lu\n", inode->i_ino, entry->file_offset, entry_end(entry));
238   - }
239 247 if (ret == 0)
240 248 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
241 249 out:
... ... @@ -243,6 +251,10 @@
243 251 return ret == 0;
244 252 }
245 253  
  254 +/*
  255 + * used to drop a reference on an ordered extent. This will free
  256 + * the extent if the last reference is dropped
  257 + */
246 258 int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
247 259 {
248 260 struct list_head *cur;
... ... @@ -260,6 +272,10 @@
260 272 return 0;
261 273 }
262 274  
  275 +/*
  276 + * remove an ordered extent from the tree. No references are dropped,
  277 + * but anyone waiting on this extent is woken up.
  278 + */
263 279 int btrfs_remove_ordered_extent(struct inode *inode,
264 280 struct btrfs_ordered_extent *entry)
265 281 {
266 282  
267 283  
... ... @@ -277,27 +293,25 @@
277 293 return 0;
278 294 }
279 295  
280   -void btrfs_wait_ordered_extent(struct inode *inode,
281   - struct btrfs_ordered_extent *entry)
  296 +/*
  297 + * Used to start IO or wait for a given ordered extent to finish.
  298 + *
  299 + * If wait is one, this effectively waits on page writeback for all the pages
  300 + * in the extent, and it waits on the io completion code to insert
  301 + * metadata into the btree corresponding to the extent
  302 + */
  303 +void btrfs_start_ordered_extent(struct inode *inode,
  304 + struct btrfs_ordered_extent *entry,
  305 + int wait)
282 306 {
283 307 u64 start = entry->file_offset;
284 308 u64 end = start + entry->len - 1;
285   -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
286   - do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE);
287   -#else
288   - do_sync_mapping_range(inode->i_mapping, start, end,
289   - SYNC_FILE_RANGE_WRITE);
290   -#endif
291   - wait_event(entry->wait,
292   - test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags));
293   -}
294 309  
295   -static void btrfs_start_ordered_extent(struct inode *inode,
296   - struct btrfs_ordered_extent *entry, int wait)
297   -{
298   - u64 start = entry->file_offset;
299   - u64 end = start + entry->len - 1;
300   -
  310 + /*
  311 + * pages in the range can be dirty, clean or writeback. We
  312 + * start IO on any dirty ones so the wait doesn't stall waiting
  313 + * for pdflush to find them
  314 + */
301 315 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
302 316 do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE);
303 317 #else
... ... @@ -309,6 +323,9 @@
309 323 &entry->flags));
310 324 }
311 325  
  326 +/*
  327 + * Used to wait on ordered extents across a large range of bytes.
  328 + */
312 329 void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
313 330 {
314 331 u64 end;
315 332  
... ... @@ -349,31 +366,11 @@
349 366 }
350 367 }
351 368  
352   -int btrfs_add_ordered_pending(struct inode *inode,
353   - struct btrfs_ordered_extent *ordered,
354   - u64 start, u64 len)
355   -{
356   - WARN_ON(1);
357   - return 0;
358   -#if 0
359   - int ret;
360   - struct btrfs_ordered_inode_tree *tree;
361   - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
362 369  
363   - tree = &BTRFS_I(inode)->ordered_tree;
364   - mutex_lock(&tree->mutex);
365   - if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) {
366   - ret = -EAGAIN;
367   - goto out;
368   - }
369   - set_extent_ordered(io_tree, start, start + len - 1, GFP_NOFS);
370   - ret = 0;
371   -out:
372   - mutex_unlock(&tree->mutex);
373   - return ret;
374   -#endif
375   -}
376   -
  370 +/*
  371 + * find an ordered extent corresponding to file_offset. return NULL if
  372 + * nothing is found, otherwise take a reference on the extent and return it
  373 + */
377 374 struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
378 375 u64 file_offset)
379 376 {
... ... @@ -397,6 +394,10 @@
397 394 return entry;
398 395 }
399 396  
  397 +/*
  398 + * lookup and return any extent before 'file_offset'. NULL is returned
  399 + * if none is found
  400 + */
400 401 struct btrfs_ordered_extent *
401 402 btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset)
402 403 {
... ... @@ -417,6 +418,10 @@
417 418 return entry;
418 419 }
419 420  
  421 +/*
  422 + * After an extent is done, call this to conditionally update the on disk
  423 + * i_size. i_size is updated to cover any fully written part of the file.
  424 + */
420 425 int btrfs_ordered_update_i_size(struct inode *inode,
421 426 struct btrfs_ordered_extent *ordered)
422 427 {
... ... @@ -507,6 +512,11 @@
507 512 return 0;
508 513 }
509 514  
  515 +/*
  516 + * search the ordered extents for one corresponding to 'offset' and
  517 + * try to find a checksum. This is used because we allow pages to
  518 + * be reclaimed before their checksum is actually put into the btree
  519 + */
510 520 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum)
511 521 {
512 522 struct btrfs_ordered_sum *ordered_sum;
fs/btrfs/ordered-data.h
... ... @@ -19,12 +19,19 @@
19 19 #ifndef __BTRFS_ORDERED_DATA__
20 20 #define __BTRFS_ORDERED_DATA__
21 21  
  22 +/* one of these per inode */
22 23 struct btrfs_ordered_inode_tree {
23 24 struct mutex mutex;
24 25 struct rb_root tree;
25 26 struct rb_node *last;
26 27 };
27 28  
  29 +/*
  30 + * these are used to collect checksums done just before bio submission.
  31 + * They are attached via a list into the ordered extent, and
  32 + * checksum items are inserted into the tree after all the blocks in
  33 + * the ordered extent are on disk
  34 + */
28 35 struct btrfs_sector_sum {
29 36 u64 offset;
30 37 u32 sum;
31 38  
32 39  
33 40  
34 41  
35 42  
36 43  
37 44  
38 45  
39 46  
40 47  
41 48  
42 49  
... ... @@ -34,27 +41,56 @@
34 41 u64 file_offset;
35 42 u64 len;
36 43 struct list_head list;
  44 + /* last field is a variable length array of btrfs_sector_sums */
37 45 struct btrfs_sector_sum sums;
38 46 };
39 47  
40   -/* bits for the flags field */
  48 +/*
  49 + * bits for the flags field:
  50 + *
  51 + * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written.
  52 + * It is used to make sure metadata is inserted into the tree only once
  53 + * per extent.
  54 + *
  55 + * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the
  56 + * rbtree, just before waking any waiters. It is used to indicate the
  57 + * IO is done and any metadata is inserted into the tree.
  58 + */
41 59 #define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */
  60 +
42 61 #define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */
43   -#define BTRFS_ORDERED_START 2 /* set when tree setup */
44 62  
45 63 struct btrfs_ordered_extent {
  64 + /* logical offset in the file */
46 65 u64 file_offset;
  66 +
  67 + /* disk byte number */
47 68 u64 start;
  69 +
  70 + /* length of the extent in bytes */
48 71 u64 len;
  72 +
  73 + /* flags (described above) */
49 74 unsigned long flags;
  75 +
  76 + /* reference count */
50 77 atomic_t refs;
  78 +
  79 + /* list of checksums for insertion when the extent io is done */
51 80 struct list_head list;
52   - struct inode *inode;
  81 +
  82 + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */
53 83 wait_queue_head_t wait;
  84 +
  85 + /* our friendly rbtree entry */
54 86 struct rb_node rb_node;
55 87 };
56 88  
57 89  
  90 +/*
  91 + * calculates the total size you need to allocate for an ordered sum
  92 + * structure spanning 'bytes' in the file
  93 + */
58 94 static inline int btrfs_ordered_sum_size(struct btrfs_root *root, u64 bytes)
59 95 {
60 96 unsigned long num_sectors = (bytes + root->sectorsize - 1) /
61 97  
... ... @@ -81,14 +117,11 @@
81 117 int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum);
82 118 struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
83 119 u64 file_offset);
84   -void btrfs_wait_ordered_extent(struct inode *inode,
85   - struct btrfs_ordered_extent *entry);
  120 +void btrfs_start_ordered_extent(struct inode *inode,
  121 + struct btrfs_ordered_extent *entry, int wait);
86 122 void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
87 123 struct btrfs_ordered_extent *
88 124 btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
89   -int btrfs_add_ordered_pending(struct inode *inode,
90   - struct btrfs_ordered_extent *ordered,
91   - u64 start, u64 len);
92 125 int btrfs_ordered_update_i_size(struct inode *inode,
93 126 struct btrfs_ordered_extent *ordered);
94 127 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum);