Commit ff023aac31198e88507d626825379b28ea481d4d

Authored by Stefan Behrens
Committed by Josef Bacik
1 parent 618919236b

Btrfs: add code to scrub to copy read data to another disk

The device replace procedure makes use of the scrub code. The scrub
code is the most efficient way to read the allocated data of a disk:
it reads sequentially in order to avoid disk head movements, it
skips unallocated blocks, it uses read ahead mechanisms, and it
contains all the code to detect and repair defects.
This commit adds code to scrub that allows it to copy the read data
to another disk.
One goal is to perform as fast as possible. Therefore the write
requests are collected until huge bios are built, and the write
process is decoupled from the read process with some kind of flow
control, of course, in order to limit the allocated memory.
The best performance on spinning disks can be reached when head
movements are avoided as much as possible. Therefore a single
worker is used to interface the read process with the write process.
The regular scrub operation works as fast as before; it is not
negatively influenced and actually is more or less unchanged.
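
As an illustration only (not part of the patch), the following is a
minimal userspace C sketch of the write-batching idea: pages are
appended to the current write batch and the batch is submitted when
it is full or the next page is not physically contiguous, so each
submitted write stays one large sequential request. The names
wr_batch, batch_add_page and batch_submit are invented for the
example and merely mirror what scrub_add_page_to_wr_bio() and
scrub_wr_submit() do with real bios.

    #include <stdio.h>
    #include <stdint.h>

    #define PAGES_PER_WR_BIO 32              /* cap per collected write, as in the patch */

    struct wr_page {                         /* stands in for struct scrub_page */
            uint64_t physical;               /* target offset on the new disk */
            char data[16];                   /* payload placeholder */
    };

    struct wr_batch {                        /* stands in for the current write bio */
            uint64_t start_physical;         /* physical offset of the first page */
            int page_count;
            struct wr_page pages[PAGES_PER_WR_BIO];
    };

    /* "submit" the batch: here we only print it; the patch queues a real bio */
    static void batch_submit(struct wr_batch *b)
    {
            if (b->page_count == 0)
                    return;
            printf("submit write: physical=%llu, %d pages\n",
                   (unsigned long long)b->start_physical, b->page_count);
            b->page_count = 0;
    }

    /*
     * Collect one page into the current batch. The batch is flushed when it
     * is full or when the next page is not physically contiguous.
     */
    static void batch_add_page(struct wr_batch *b, const struct wr_page *p,
                               size_t page_size)
    {
            if (b->page_count == PAGES_PER_WR_BIO ||
                (b->page_count > 0 &&
                 b->start_physical + (uint64_t)b->page_count * page_size !=
                 p->physical))
                    batch_submit(b);

            if (b->page_count == 0)
                    b->start_physical = p->physical;
            b->pages[b->page_count++] = *p;
    }

    int main(void)
    {
            struct wr_batch batch = { 0 };
            struct wr_page p = { 0, "" };
            const size_t page_size = 4096;
            int i;

            /* 40 contiguous pages: one full 32-page write plus a partial one */
            for (i = 0; i < 40; i++) {
                    p.physical = (uint64_t)i * page_size;
                    batch_add_page(&batch, &p, page_size);
            }
            batch_submit(&batch);            /* flush the partially filled batch */
            return 0;
    }

In the real code the flush of a partially filled batch happens under
wr_lock via scrub_wr_submit(), e.g. when flush_all_writes is set.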

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>

Showing 5 changed files with 851 additions and 73 deletions

... ... @@ -1483,6 +1483,8 @@
1483 1483 struct rw_semaphore scrub_super_lock;
1484 1484 int scrub_workers_refcnt;
1485 1485 struct btrfs_workers scrub_workers;
  1486 + struct btrfs_workers scrub_wr_completion_workers;
  1487 + struct btrfs_workers scrub_nocow_workers;
1486 1488  
1487 1489 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1488 1490 u32 check_integrity_print_mask;
fs/btrfs/dev-replace.h
  1 +/*
  2 + * Copyright (C) STRATO AG 2012. All rights reserved.
  3 + *
  4 + * This program is free software; you can redistribute it and/or
  5 + * modify it under the terms of the GNU General Public
  6 + * License v2 as published by the Free Software Foundation.
  7 + *
  8 + * This program is distributed in the hope that it will be useful,
  9 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11 + * General Public License for more details.
  12 + *
  13 + * You should have received a copy of the GNU General Public
  14 + * License along with this program; if not, write to the
  15 + * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 + * Boston, MA 021110-1307, USA.
  17 + */
  18 +
  19 +#if !defined(__BTRFS_DEV_REPLACE__)
  20 +#define __BTRFS_DEV_REPLACE__
  21 +
  22 +static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
  23 +{
  24 + atomic64_inc(stat_value);
  25 +}
  26 +#endif
... ... @@ -418,12 +418,17 @@
418 418 */
419 419 continue;
420 420 }
  421 + if (!dev->bdev) {
  422 + /* cannot read ahead on missing device */
  423 + continue;
  424 + }
421 425 prev_dev = dev;
422 426 ret = radix_tree_insert(&dev->reada_extents, index, re);
423 427 if (ret) {
424 428 while (--i >= 0) {
425 429 dev = bbio->stripes[i].dev;
426 430 BUG_ON(dev == NULL);
  431 + /* ignore whether the entry was inserted */
427 432 radix_tree_delete(&dev->reada_extents, index);
428 433 }
429 434 BUG_ON(fs_info == NULL);
... ... @@ -914,7 +919,10 @@
914 919 generation = btrfs_header_generation(node);
915 920 free_extent_buffer(node);
916 921  
917   - reada_add_block(rc, start, &max_key, level, generation);
  922 + if (reada_add_block(rc, start, &max_key, level, generation)) {
  923 + kfree(rc);
  924 + return ERR_PTR(-ENOMEM);
  925 + }
918 926  
919 927 reada_start_machine(root->fs_info);
920 928  
... ... @@ -25,6 +25,7 @@
25 25 #include "transaction.h"
26 26 #include "backref.h"
27 27 #include "extent_io.h"
  28 +#include "dev-replace.h"
28 29 #include "check-integrity.h"
29 30 #include "rcu-string.h"
30 31  
... ... @@ -44,8 +45,15 @@
44 45 struct scrub_block;
45 46 struct scrub_ctx;
46 47  
47   -#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */
48   -#define SCRUB_BIOS_PER_CTX 16 /* 1 MB per device in flight */
  48 +/*
  49 + * the following three values only influence the performance.
  50 + * The last one configures the number of parallel and outstanding I/O
  51 + * operations. The first two values configure an upper limit for the number
  52 + * of (dynamically allocated) pages that are added to a bio.
  53 + */
  54 +#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */
  55 +#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */
  56 +#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */
49 57  
50 58 /*
51 59 * the following value times PAGE_SIZE needs to be large enough to match the
... ... @@ -62,6 +70,7 @@
62 70 u64 generation;
63 71 u64 logical;
64 72 u64 physical;
  73 + u64 physical_for_dev_replace;
65 74 atomic_t ref_count;
66 75 struct {
67 76 unsigned int mirror_num:8;
... ... @@ -79,7 +88,11 @@
79 88 int err;
80 89 u64 logical;
81 90 u64 physical;
82   - struct scrub_page *pagev[SCRUB_PAGES_PER_BIO];
  91 +#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
  92 + struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
  93 +#else
  94 + struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
  95 +#endif
83 96 int page_count;
84 97 int next_free;
85 98 struct btrfs_work work;
86 99  
... ... @@ -99,8 +112,16 @@
99 112 };
100 113 };
101 114  
  115 +struct scrub_wr_ctx {
  116 + struct scrub_bio *wr_curr_bio;
  117 + struct btrfs_device *tgtdev;
  118 + int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
  119 + atomic_t flush_all_writes;
  120 + struct mutex wr_lock;
  121 +};
  122 +
102 123 struct scrub_ctx {
103   - struct scrub_bio *bios[SCRUB_BIOS_PER_CTX];
  124 + struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
104 125 struct btrfs_root *dev_root;
105 126 int first_free;
106 127 int curr;
107 128  
... ... @@ -112,12 +133,13 @@
112 133 struct list_head csum_list;
113 134 atomic_t cancel_req;
114 135 int readonly;
115   - int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */
  136 + int pages_per_rd_bio;
116 137 u32 sectorsize;
117 138 u32 nodesize;
118 139 u32 leafsize;
119 140  
120 141 int is_dev_replace;
  142 + struct scrub_wr_ctx wr_ctx;
121 143  
122 144 /*
123 145 * statistics
... ... @@ -135,6 +157,15 @@
135 157 int mirror_num;
136 158 };
137 159  
  160 +struct scrub_copy_nocow_ctx {
  161 + struct scrub_ctx *sctx;
  162 + u64 logical;
  163 + u64 len;
  164 + int mirror_num;
  165 + u64 physical_for_dev_replace;
  166 + struct btrfs_work work;
  167 +};
  168 +
138 169 struct scrub_warning {
139 170 struct btrfs_path *path;
140 171 u64 extent_item_size;
141 172  
... ... @@ -156,8 +187,9 @@
156 187 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
157 188 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
158 189 struct btrfs_fs_info *fs_info,
  190 + struct scrub_block *original_sblock,
159 191 u64 length, u64 logical,
160   - struct scrub_block *sblock);
  192 + struct scrub_block *sblocks_for_recheck);
161 193 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
162 194 struct scrub_block *sblock, int is_metadata,
163 195 int have_csum, u8 *csum, u64 generation,
... ... @@ -174,6 +206,9 @@
174 206 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
175 207 struct scrub_block *sblock_good,
176 208 int page_num, int force_write);
  209 +static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
  210 +static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
  211 + int page_num);
177 212 static int scrub_checksum_data(struct scrub_block *sblock);
178 213 static int scrub_checksum_tree_block(struct scrub_block *sblock);
179 214 static int scrub_checksum_super(struct scrub_block *sblock);
180 215  
181 216  
... ... @@ -181,14 +216,38 @@
181 216 static void scrub_block_put(struct scrub_block *sblock);
182 217 static void scrub_page_get(struct scrub_page *spage);
183 218 static void scrub_page_put(struct scrub_page *spage);
184   -static int scrub_add_page_to_bio(struct scrub_ctx *sctx,
185   - struct scrub_page *spage);
  219 +static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
  220 + struct scrub_page *spage);
186 221 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
187 222 u64 physical, struct btrfs_device *dev, u64 flags,
188   - u64 gen, int mirror_num, u8 *csum, int force);
  223 + u64 gen, int mirror_num, u8 *csum, int force,
  224 + u64 physical_for_dev_replace);
189 225 static void scrub_bio_end_io(struct bio *bio, int err);
190 226 static void scrub_bio_end_io_worker(struct btrfs_work *work);
191 227 static void scrub_block_complete(struct scrub_block *sblock);
  228 +static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
  229 + u64 extent_logical, u64 extent_len,
  230 + u64 *extent_physical,
  231 + struct btrfs_device **extent_dev,
  232 + int *extent_mirror_num);
  233 +static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
  234 + struct scrub_wr_ctx *wr_ctx,
  235 + struct btrfs_fs_info *fs_info,
  236 + struct btrfs_device *dev,
  237 + int is_dev_replace);
  238 +static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
  239 +static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
  240 + struct scrub_page *spage);
  241 +static void scrub_wr_submit(struct scrub_ctx *sctx);
  242 +static void scrub_wr_bio_end_io(struct bio *bio, int err);
  243 +static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
  244 +static int write_page_nocow(struct scrub_ctx *sctx,
  245 + u64 physical_for_dev_replace, struct page *page);
  246 +static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
  247 + void *ctx);
  248 +static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
  249 + int mirror_num, u64 physical_for_dev_replace);
  250 +static void copy_nocow_pages_worker(struct btrfs_work *work);
192 251  
193 252  
194 253 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
195 254  
196 255  
... ... @@ -262,19 +321,20 @@
262 321 if (!sctx)
263 322 return;
264 323  
  324 + scrub_free_wr_ctx(&sctx->wr_ctx);
  325 +
265 326 /* this can happen when scrub is cancelled */
266 327 if (sctx->curr != -1) {
267 328 struct scrub_bio *sbio = sctx->bios[sctx->curr];
268 329  
269 330 for (i = 0; i < sbio->page_count; i++) {
270   - BUG_ON(!sbio->pagev[i]);
271   - BUG_ON(!sbio->pagev[i]->page);
  331 + WARN_ON(!sbio->pagev[i]->page);
272 332 scrub_block_put(sbio->pagev[i]->sblock);
273 333 }
274 334 bio_put(sbio->bio);
275 335 }
276 336  
277   - for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) {
  337 + for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
278 338 struct scrub_bio *sbio = sctx->bios[i];
279 339  
280 340 if (!sbio)
281 341  
282 342  
283 343  
... ... @@ -292,18 +352,29 @@
292 352 struct scrub_ctx *sctx;
293 353 int i;
294 354 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
295   - int pages_per_bio;
  355 + int pages_per_rd_bio;
  356 + int ret;
296 357  
297   - pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO,
298   - bio_get_nr_vecs(dev->bdev));
  358 + /*
  359 + * the setting of pages_per_rd_bio is correct for scrub but might
  360 + * be wrong for the dev_replace code where we might read from
  361 + * different devices in the initial huge bios. However, that
  362 + * code is able to correctly handle the case when adding a page
  363 + * to a bio fails.
  364 + */
  365 + if (dev->bdev)
  366 + pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
  367 + bio_get_nr_vecs(dev->bdev));
  368 + else
  369 + pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
299 370 sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
300 371 if (!sctx)
301 372 goto nomem;
302 373 sctx->is_dev_replace = is_dev_replace;
303   - sctx->pages_per_bio = pages_per_bio;
  374 + sctx->pages_per_rd_bio = pages_per_rd_bio;
304 375 sctx->curr = -1;
305 376 sctx->dev_root = dev->dev_root;
306   - for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) {
  377 + for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
307 378 struct scrub_bio *sbio;
308 379  
309 380 sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
... ... @@ -316,7 +387,7 @@
316 387 sbio->page_count = 0;
317 388 sbio->work.func = scrub_bio_end_io_worker;
318 389  
319   - if (i != SCRUB_BIOS_PER_CTX - 1)
  390 + if (i != SCRUB_BIOS_PER_SCTX - 1)
320 391 sctx->bios[i]->next_free = i + 1;
321 392 else
322 393 sctx->bios[i]->next_free = -1;
... ... @@ -334,6 +405,13 @@
334 405 spin_lock_init(&sctx->list_lock);
335 406 spin_lock_init(&sctx->stat_lock);
336 407 init_waitqueue_head(&sctx->list_wait);
  408 +
  409 + ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
  410 + fs_info->dev_replace.tgtdev, is_dev_replace);
  411 + if (ret) {
  412 + scrub_free_ctx(sctx);
  413 + return ERR_PTR(ret);
  414 + }
337 415 return sctx;
338 416  
339 417 nomem:
... ... @@ -341,7 +419,8 @@
341 419 return ERR_PTR(-ENOMEM);
342 420 }
343 421  
344   -static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
  422 +static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
  423 + void *warn_ctx)
345 424 {
346 425 u64 isize;
347 426 u32 nlink;
... ... @@ -349,7 +428,7 @@
349 428 int i;
350 429 struct extent_buffer *eb;
351 430 struct btrfs_inode_item *inode_item;
352   - struct scrub_warning *swarn = ctx;
  431 + struct scrub_warning *swarn = warn_ctx;
353 432 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
354 433 struct inode_fs_paths *ipath = NULL;
355 434 struct btrfs_root *local_root;
356 435  
... ... @@ -492,11 +571,11 @@
492 571 kfree(swarn.msg_buf);
493 572 }
494 573  
495   -static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
  574 +static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
496 575 {
497 576 struct page *page = NULL;
498 577 unsigned long index;
499   - struct scrub_fixup_nodatasum *fixup = ctx;
  578 + struct scrub_fixup_nodatasum *fixup = fixup_ctx;
500 579 int ret;
501 580 int corrected = 0;
502 581 struct btrfs_key key;
... ... @@ -660,7 +739,9 @@
660 739 spin_lock(&sctx->stat_lock);
661 740 ++sctx->stat.uncorrectable_errors;
662 741 spin_unlock(&sctx->stat_lock);
663   -
  742 + btrfs_dev_replace_stats_inc(
  743 + &sctx->dev_root->fs_info->dev_replace.
  744 + num_uncorrectable_read_errors);
664 745 printk_ratelimited_in_rcu(KERN_ERR
665 746 "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
666 747 (unsigned long long)fixup->logical,
... ... @@ -715,6 +796,11 @@
715 796 csum = sblock_to_check->pagev[0]->csum;
716 797 dev = sblock_to_check->pagev[0]->dev;
717 798  
  799 + if (sctx->is_dev_replace && !is_metadata && !have_csum) {
  800 + sblocks_for_recheck = NULL;
  801 + goto nodatasum_case;
  802 + }
  803 +
718 804 /*
719 805 * read all mirrors one after the other. This includes to
720 806 * re-read the extent or metadata block that failed (that was
... ... @@ -758,7 +844,7 @@
758 844 }
759 845  
760 846 /* setup the context, map the logical blocks and alloc the pages */
761   - ret = scrub_setup_recheck_block(sctx, fs_info, length,
  847 + ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
762 848 logical, sblocks_for_recheck);
763 849 if (ret) {
764 850 spin_lock(&sctx->stat_lock);
... ... @@ -789,6 +875,8 @@
789 875 sctx->stat.unverified_errors++;
790 876 spin_unlock(&sctx->stat_lock);
791 877  
  878 + if (sctx->is_dev_replace)
  879 + scrub_write_block_to_dev_replace(sblock_bad);
792 880 goto out;
793 881 }
794 882  
795 883  
... ... @@ -822,12 +910,15 @@
822 910 BTRFS_DEV_STAT_CORRUPTION_ERRS);
823 911 }
824 912  
825   - if (sctx->readonly)
  913 + if (sctx->readonly && !sctx->is_dev_replace)
826 914 goto did_not_correct_error;
827 915  
828 916 if (!is_metadata && !have_csum) {
829 917 struct scrub_fixup_nodatasum *fixup_nodatasum;
830 918  
  919 +nodatasum_case:
  920 + WARN_ON(sctx->is_dev_replace);
  921 +
831 922 /*
832 923 * !is_metadata and !have_csum, this means that the data
833 924 * might not be COW'ed, that it might be modified
834 925  
835 926  
... ... @@ -883,18 +974,79 @@
883 974 if (!sblock_other->header_error &&
884 975 !sblock_other->checksum_error &&
885 976 sblock_other->no_io_error_seen) {
886   - int force_write = is_metadata || have_csum;
  977 + if (sctx->is_dev_replace) {
  978 + scrub_write_block_to_dev_replace(sblock_other);
  979 + } else {
  980 + int force_write = is_metadata || have_csum;
887 981  
888   - ret = scrub_repair_block_from_good_copy(sblock_bad,
889   - sblock_other,
890   - force_write);
  982 + ret = scrub_repair_block_from_good_copy(
  983 + sblock_bad, sblock_other,
  984 + force_write);
  985 + }
891 986 if (0 == ret)
892 987 goto corrected_error;
893 988 }
894 989 }
895 990  
896 991 /*
897   - * in case of I/O errors in the area that is supposed to be
  992 + * for dev_replace, pick good pages and write to the target device.
  993 + */
  994 + if (sctx->is_dev_replace) {
  995 + success = 1;
  996 + for (page_num = 0; page_num < sblock_bad->page_count;
  997 + page_num++) {
  998 + int sub_success;
  999 +
  1000 + sub_success = 0;
  1001 + for (mirror_index = 0;
  1002 + mirror_index < BTRFS_MAX_MIRRORS &&
  1003 + sblocks_for_recheck[mirror_index].page_count > 0;
  1004 + mirror_index++) {
  1005 + struct scrub_block *sblock_other =
  1006 + sblocks_for_recheck + mirror_index;
  1007 + struct scrub_page *page_other =
  1008 + sblock_other->pagev[page_num];
  1009 +
  1010 + if (!page_other->io_error) {
  1011 + ret = scrub_write_page_to_dev_replace(
  1012 + sblock_other, page_num);
  1013 + if (ret == 0) {
  1014 + /* succeeded for this page */
  1015 + sub_success = 1;
  1016 + break;
  1017 + } else {
  1018 + btrfs_dev_replace_stats_inc(
  1019 + &sctx->dev_root->
  1020 + fs_info->dev_replace.
  1021 + num_write_errors);
  1022 + }
  1023 + }
  1024 + }
  1025 +
  1026 + if (!sub_success) {
  1027 + /*
  1028 + * did not find a mirror to fetch the page
  1029 + * from. scrub_write_page_to_dev_replace()
  1030 + * handles this case (page->io_error), by
  1031 + * filling the block with zeros before
  1032 + * submitting the write request
  1033 + */
  1034 + success = 0;
  1035 + ret = scrub_write_page_to_dev_replace(
  1036 + sblock_bad, page_num);
  1037 + if (ret)
  1038 + btrfs_dev_replace_stats_inc(
  1039 + &sctx->dev_root->fs_info->
  1040 + dev_replace.num_write_errors);
  1041 + }
  1042 + }
  1043 +
  1044 + goto out;
  1045 + }
  1046 +
  1047 + /*
  1048 + * for regular scrub, repair those pages that are errored.
  1049 + * In case of I/O errors in the area that is supposed to be
898 1050 * repaired, continue by picking good copies of those pages.
899 1051 * Select the good pages from mirrors to rewrite bad pages from
900 1052 * the area to fix. Afterwards verify the checksum of the block
... ... @@ -1017,6 +1169,7 @@
1017 1169  
1018 1170 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
1019 1171 struct btrfs_fs_info *fs_info,
  1172 + struct scrub_block *original_sblock,
1020 1173 u64 length, u64 logical,
1021 1174 struct scrub_block *sblocks_for_recheck)
1022 1175 {
... ... @@ -1047,7 +1200,7 @@
1047 1200 return -EIO;
1048 1201 }
1049 1202  
1050   - BUG_ON(page_index >= SCRUB_PAGES_PER_BIO);
  1203 + BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1051 1204 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
1052 1205 mirror_index++) {
1053 1206 struct scrub_block *sblock;
... ... @@ -1071,6 +1224,10 @@
1071 1224 sblock->pagev[page_index] = page;
1072 1225 page->logical = logical;
1073 1226 page->physical = bbio->stripes[mirror_index].physical;
  1227 + BUG_ON(page_index >= original_sblock->page_count);
  1228 + page->physical_for_dev_replace =
  1229 + original_sblock->pagev[page_index]->
  1230 + physical_for_dev_replace;
1074 1231 /* for missing devices, dev->bdev is NULL */
1075 1232 page->dev = bbio->stripes[mirror_index].dev;
1076 1233 page->mirror_num = mirror_index + 1;
... ... @@ -1249,6 +1406,12 @@
1249 1406 int ret;
1250 1407 DECLARE_COMPLETION_ONSTACK(complete);
1251 1408  
  1409 + if (!page_bad->dev->bdev) {
  1410 + printk_ratelimited(KERN_WARNING
  1411 + "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
  1412 + return -EIO;
  1413 + }
  1414 +
1252 1415 bio = bio_alloc(GFP_NOFS, 1);
1253 1416 if (!bio)
1254 1417 return -EIO;
... ... @@ -1269,6 +1432,9 @@
1269 1432 if (!bio_flagged(bio, BIO_UPTODATE)) {
1270 1433 btrfs_dev_stat_inc_and_print(page_bad->dev,
1271 1434 BTRFS_DEV_STAT_WRITE_ERRS);
  1435 + btrfs_dev_replace_stats_inc(
  1436 + &sblock_bad->sctx->dev_root->fs_info->
  1437 + dev_replace.num_write_errors);
1272 1438 bio_put(bio);
1273 1439 return -EIO;
1274 1440 }
1275 1441  
... ... @@ -1278,8 +1444,169 @@
1278 1444 return 0;
1279 1445 }
1280 1446  
1281   -static void scrub_checksum(struct scrub_block *sblock)
  1447 +static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1282 1448 {
  1449 + int page_num;
  1450 +
  1451 + for (page_num = 0; page_num < sblock->page_count; page_num++) {
  1452 + int ret;
  1453 +
  1454 + ret = scrub_write_page_to_dev_replace(sblock, page_num);
  1455 + if (ret)
  1456 + btrfs_dev_replace_stats_inc(
  1457 + &sblock->sctx->dev_root->fs_info->dev_replace.
  1458 + num_write_errors);
  1459 + }
  1460 +}
  1461 +
  1462 +static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
  1463 + int page_num)
  1464 +{
  1465 + struct scrub_page *spage = sblock->pagev[page_num];
  1466 +
  1467 + BUG_ON(spage->page == NULL);
  1468 + if (spage->io_error) {
  1469 + void *mapped_buffer = kmap_atomic(spage->page);
  1470 +
  1471 + memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
  1472 + flush_dcache_page(spage->page);
  1473 + kunmap_atomic(mapped_buffer);
  1474 + }
  1475 + return scrub_add_page_to_wr_bio(sblock->sctx, spage);
  1476 +}
  1477 +
  1478 +static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
  1479 + struct scrub_page *spage)
  1480 +{
  1481 + struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
  1482 + struct scrub_bio *sbio;
  1483 + int ret;
  1484 +
  1485 + mutex_lock(&wr_ctx->wr_lock);
  1486 +again:
  1487 + if (!wr_ctx->wr_curr_bio) {
  1488 + wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
  1489 + GFP_NOFS);
  1490 + if (!wr_ctx->wr_curr_bio) {
  1491 + mutex_unlock(&wr_ctx->wr_lock);
  1492 + return -ENOMEM;
  1493 + }
  1494 + wr_ctx->wr_curr_bio->sctx = sctx;
  1495 + wr_ctx->wr_curr_bio->page_count = 0;
  1496 + }
  1497 + sbio = wr_ctx->wr_curr_bio;
  1498 + if (sbio->page_count == 0) {
  1499 + struct bio *bio;
  1500 +
  1501 + sbio->physical = spage->physical_for_dev_replace;
  1502 + sbio->logical = spage->logical;
  1503 + sbio->dev = wr_ctx->tgtdev;
  1504 + bio = sbio->bio;
  1505 + if (!bio) {
  1506 + bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
  1507 + if (!bio) {
  1508 + mutex_unlock(&wr_ctx->wr_lock);
  1509 + return -ENOMEM;
  1510 + }
  1511 + sbio->bio = bio;
  1512 + }
  1513 +
  1514 + bio->bi_private = sbio;
  1515 + bio->bi_end_io = scrub_wr_bio_end_io;
  1516 + bio->bi_bdev = sbio->dev->bdev;
  1517 + bio->bi_sector = sbio->physical >> 9;
  1518 + sbio->err = 0;
  1519 + } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
  1520 + spage->physical_for_dev_replace ||
  1521 + sbio->logical + sbio->page_count * PAGE_SIZE !=
  1522 + spage->logical) {
  1523 + scrub_wr_submit(sctx);
  1524 + goto again;
  1525 + }
  1526 +
  1527 + ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
  1528 + if (ret != PAGE_SIZE) {
  1529 + if (sbio->page_count < 1) {
  1530 + bio_put(sbio->bio);
  1531 + sbio->bio = NULL;
  1532 + mutex_unlock(&wr_ctx->wr_lock);
  1533 + return -EIO;
  1534 + }
  1535 + scrub_wr_submit(sctx);
  1536 + goto again;
  1537 + }
  1538 +
  1539 + sbio->pagev[sbio->page_count] = spage;
  1540 + scrub_page_get(spage);
  1541 + sbio->page_count++;
  1542 + if (sbio->page_count == wr_ctx->pages_per_wr_bio)
  1543 + scrub_wr_submit(sctx);
  1544 + mutex_unlock(&wr_ctx->wr_lock);
  1545 +
  1546 + return 0;
  1547 +}
  1548 +
  1549 +static void scrub_wr_submit(struct scrub_ctx *sctx)
  1550 +{
  1551 + struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
  1552 + struct scrub_bio *sbio;
  1553 +
  1554 + if (!wr_ctx->wr_curr_bio)
  1555 + return;
  1556 +
  1557 + sbio = wr_ctx->wr_curr_bio;
  1558 + wr_ctx->wr_curr_bio = NULL;
  1559 + WARN_ON(!sbio->bio->bi_bdev);
  1560 + scrub_pending_bio_inc(sctx);
  1561 + /* process all writes in a single worker thread. Then the block layer
  1562 + * orders the requests before sending them to the driver which
  1563 + * doubled the write performance on spinning disks when measured
  1564 + * with Linux 3.5 */
  1565 + btrfsic_submit_bio(WRITE, sbio->bio);
  1566 +}
  1567 +
  1568 +static void scrub_wr_bio_end_io(struct bio *bio, int err)
  1569 +{
  1570 + struct scrub_bio *sbio = bio->bi_private;
  1571 + struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
  1572 +
  1573 + sbio->err = err;
  1574 + sbio->bio = bio;
  1575 +
  1576 + sbio->work.func = scrub_wr_bio_end_io_worker;
  1577 + btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
  1578 +}
  1579 +
  1580 +static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
  1581 +{
  1582 + struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
  1583 + struct scrub_ctx *sctx = sbio->sctx;
  1584 + int i;
  1585 +
  1586 + WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
  1587 + if (sbio->err) {
  1588 + struct btrfs_dev_replace *dev_replace =
  1589 + &sbio->sctx->dev_root->fs_info->dev_replace;
  1590 +
  1591 + for (i = 0; i < sbio->page_count; i++) {
  1592 + struct scrub_page *spage = sbio->pagev[i];
  1593 +
  1594 + spage->io_error = 1;
  1595 + btrfs_dev_replace_stats_inc(&dev_replace->
  1596 + num_write_errors);
  1597 + }
  1598 + }
  1599 +
  1600 + for (i = 0; i < sbio->page_count; i++)
  1601 + scrub_page_put(sbio->pagev[i]);
  1602 +
  1603 + bio_put(sbio->bio);
  1604 + kfree(sbio);
  1605 + scrub_pending_bio_dec(sctx);
  1606 +}
  1607 +
  1608 +static int scrub_checksum(struct scrub_block *sblock)
  1609 +{
1283 1610 u64 flags;
1284 1611 int ret;
1285 1612  
... ... @@ -1296,6 +1623,8 @@
1296 1623 WARN_ON(1);
1297 1624 if (ret)
1298 1625 scrub_handle_errored_block(sblock);
  1626 +
  1627 + return ret;
1299 1628 }
1300 1629  
1301 1630 static int scrub_checksum_data(struct scrub_block *sblock)
... ... @@ -1386,7 +1715,7 @@
1386 1715 BTRFS_UUID_SIZE))
1387 1716 ++fail;
1388 1717  
1389   - BUG_ON(sctx->nodesize != sctx->leafsize);
  1718 + WARN_ON(sctx->nodesize != sctx->leafsize);
1390 1719 len = sctx->nodesize - BTRFS_CSUM_SIZE;
1391 1720 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1392 1721 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1393 1722  
... ... @@ -1534,11 +1863,24 @@
1534 1863 sctx->curr = -1;
1535 1864 scrub_pending_bio_inc(sctx);
1536 1865  
1537   - btrfsic_submit_bio(READ, sbio->bio);
  1866 + if (!sbio->bio->bi_bdev) {
  1867 + /*
  1868 + * this case should not happen. If btrfs_map_block() is
  1869 + * wrong, it could happen for dev-replace operations on
  1870 + * missing devices when no mirrors are available, but in
  1871 + * this case it should already fail the mount.
  1872 + * This case is handled correctly (but _very_ slowly).
  1873 + */
  1874 + printk_ratelimited(KERN_WARNING
  1875 + "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
  1876 + bio_endio(sbio->bio, -EIO);
  1877 + } else {
  1878 + btrfsic_submit_bio(READ, sbio->bio);
  1879 + }
1538 1880 }
1539 1881  
1540   -static int scrub_add_page_to_bio(struct scrub_ctx *sctx,
1541   - struct scrub_page *spage)
  1882 +static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
  1883 + struct scrub_page *spage)
1542 1884 {
1543 1885 struct scrub_block *sblock = spage->sblock;
1544 1886 struct scrub_bio *sbio;
... ... @@ -1570,7 +1912,7 @@
1570 1912 sbio->dev = spage->dev;
1571 1913 bio = sbio->bio;
1572 1914 if (!bio) {
1573   - bio = bio_alloc(GFP_NOFS, sctx->pages_per_bio);
  1915 + bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
1574 1916 if (!bio)
1575 1917 return -ENOMEM;
1576 1918 sbio->bio = bio;
1577 1919  
... ... @@ -1602,10 +1944,10 @@
1602 1944 goto again;
1603 1945 }
1604 1946  
1605   - scrub_block_get(sblock); /* one for the added page */
  1947 + scrub_block_get(sblock); /* one for the page added to the bio */
1606 1948 atomic_inc(&sblock->outstanding_pages);
1607 1949 sbio->page_count++;
1608   - if (sbio->page_count == sctx->pages_per_bio)
  1950 + if (sbio->page_count == sctx->pages_per_rd_bio)
1609 1951 scrub_submit(sctx);
1610 1952  
1611 1953 return 0;
... ... @@ -1613,7 +1955,8 @@
1613 1955  
1614 1956 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
1615 1957 u64 physical, struct btrfs_device *dev, u64 flags,
1616   - u64 gen, int mirror_num, u8 *csum, int force)
  1958 + u64 gen, int mirror_num, u8 *csum, int force,
  1959 + u64 physical_for_dev_replace)
1617 1960 {
1618 1961 struct scrub_block *sblock;
1619 1962 int index;
... ... @@ -1654,6 +1997,7 @@
1654 1997 spage->generation = gen;
1655 1998 spage->logical = logical;
1656 1999 spage->physical = physical;
  2000 + spage->physical_for_dev_replace = physical_for_dev_replace;
1657 2001 spage->mirror_num = mirror_num;
1658 2002 if (csum) {
1659 2003 spage->have_csum = 1;
... ... @@ -1668,6 +2012,7 @@
1668 2012 len -= l;
1669 2013 logical += l;
1670 2014 physical += l;
  2015 + physical_for_dev_replace += l;
1671 2016 }
1672 2017  
1673 2018 WARN_ON(sblock->page_count == 0);
... ... @@ -1675,7 +2020,7 @@
1675 2020 struct scrub_page *spage = sblock->pagev[index];
1676 2021 int ret;
1677 2022  
1678   - ret = scrub_add_page_to_bio(sctx, spage);
  2023 + ret = scrub_add_page_to_rd_bio(sctx, spage);
1679 2024 if (ret) {
1680 2025 scrub_block_put(sblock);
1681 2026 return ret;
... ... @@ -1707,7 +2052,7 @@
1707 2052 struct scrub_ctx *sctx = sbio->sctx;
1708 2053 int i;
1709 2054  
1710   - BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO);
  2055 + BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
1711 2056 if (sbio->err) {
1712 2057 for (i = 0; i < sbio->page_count; i++) {
1713 2058 struct scrub_page *spage = sbio->pagev[i];
1714 2059  
1715 2060  
... ... @@ -1733,15 +2078,30 @@
1733 2078 sbio->next_free = sctx->first_free;
1734 2079 sctx->first_free = sbio->index;
1735 2080 spin_unlock(&sctx->list_lock);
  2081 +
  2082 + if (sctx->is_dev_replace &&
  2083 + atomic_read(&sctx->wr_ctx.flush_all_writes)) {
  2084 + mutex_lock(&sctx->wr_ctx.wr_lock);
  2085 + scrub_wr_submit(sctx);
  2086 + mutex_unlock(&sctx->wr_ctx.wr_lock);
  2087 + }
  2088 +
1736 2089 scrub_pending_bio_dec(sctx);
1737 2090 }
1738 2091  
1739 2092 static void scrub_block_complete(struct scrub_block *sblock)
1740 2093 {
1741   - if (!sblock->no_io_error_seen)
  2094 + if (!sblock->no_io_error_seen) {
1742 2095 scrub_handle_errored_block(sblock);
1743   - else
1744   - scrub_checksum(sblock);
  2096 + } else {
  2097 + /*
  2098 + * if has checksum error, write via repair mechanism in
  2099 + * dev replace case, otherwise write here in dev replace
  2100 + * case.
  2101 + */
  2102 + if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
  2103 + scrub_write_block_to_dev_replace(sblock);
  2104 + }
1745 2105 }
1746 2106  
1747 2107 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
... ... @@ -1786,7 +2146,7 @@
1786 2146 /* scrub extent tries to collect up to 64 kB for each bio */
1787 2147 static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
1788 2148 u64 physical, struct btrfs_device *dev, u64 flags,
1789   - u64 gen, int mirror_num)
  2149 + u64 gen, int mirror_num, u64 physical_for_dev_replace)
1790 2150 {
1791 2151 int ret;
1792 2152 u8 csum[BTRFS_CSUM_SIZE];
... ... @@ -1799,7 +2159,7 @@
1799 2159 sctx->stat.data_bytes_scrubbed += len;
1800 2160 spin_unlock(&sctx->stat_lock);
1801 2161 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1802   - BUG_ON(sctx->nodesize != sctx->leafsize);
  2162 + WARN_ON(sctx->nodesize != sctx->leafsize);
1803 2163 blocksize = sctx->nodesize;
1804 2164 spin_lock(&sctx->stat_lock);
1805 2165 sctx->stat.tree_extents_scrubbed++;
... ... @@ -1807,7 +2167,7 @@
1807 2167 spin_unlock(&sctx->stat_lock);
1808 2168 } else {
1809 2169 blocksize = sctx->sectorsize;
1810   - BUG_ON(1);
  2170 + WARN_ON(1);
1811 2171 }
1812 2172  
1813 2173 while (len) {
1814 2174  
1815 2175  
... ... @@ -1819,14 +2179,23 @@
1819 2179 have_csum = scrub_find_csum(sctx, logical, l, csum);
1820 2180 if (have_csum == 0)
1821 2181 ++sctx->stat.no_csum;
  2182 + if (sctx->is_dev_replace && !have_csum) {
  2183 + ret = copy_nocow_pages(sctx, logical, l,
  2184 + mirror_num,
  2185 + physical_for_dev_replace);
  2186 + goto behind_scrub_pages;
  2187 + }
1822 2188 }
1823 2189 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
1824   - mirror_num, have_csum ? csum : NULL, 0);
  2190 + mirror_num, have_csum ? csum : NULL, 0,
  2191 + physical_for_dev_replace);
  2192 +behind_scrub_pages:
1825 2193 if (ret)
1826 2194 return ret;
1827 2195 len -= l;
1828 2196 logical += l;
1829 2197 physical += l;
  2198 + physical_for_dev_replace += l;
1830 2199 }
1831 2200 return 0;
1832 2201 }
... ... @@ -1834,7 +2203,8 @@
1834 2203 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
1835 2204 struct map_lookup *map,
1836 2205 struct btrfs_device *scrub_dev,
1837   - int num, u64 base, u64 length)
  2206 + int num, u64 base, u64 length,
  2207 + int is_dev_replace)
1838 2208 {
1839 2209 struct btrfs_path *path;
1840 2210 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
... ... @@ -1859,6 +2229,11 @@
1859 2229 struct btrfs_key key_end;
1860 2230 u64 increment = map->stripe_len;
1861 2231 u64 offset;
  2232 + u64 extent_logical;
  2233 + u64 extent_physical;
  2234 + u64 extent_len;
  2235 + struct btrfs_device *extent_dev;
  2236 + int extent_mirror_num;
1862 2237  
1863 2238 nstripes = length;
1864 2239 offset = 0;
1865 2240  
1866 2241  
... ... @@ -1966,9 +2341,14 @@
1966 2341 */
1967 2342 if (atomic_read(&fs_info->scrub_pause_req)) {
1968 2343 /* push queued extents */
  2344 + atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
1969 2345 scrub_submit(sctx);
  2346 + mutex_lock(&sctx->wr_ctx.wr_lock);
  2347 + scrub_wr_submit(sctx);
  2348 + mutex_unlock(&sctx->wr_ctx.wr_lock);
1970 2349 wait_event(sctx->list_wait,
1971 2350 atomic_read(&sctx->bios_in_flight) == 0);
  2351 + atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
1972 2352 atomic_inc(&fs_info->scrubs_paused);
1973 2353 wake_up(&fs_info->scrub_pause_wait);
1974 2354 mutex_lock(&fs_info->scrub_lock);
... ... @@ -2063,10 +2443,20 @@
2063 2443 key.objectid;
2064 2444 }
2065 2445  
2066   - ret = scrub_extent(sctx, key.objectid, key.offset,
2067   - key.objectid - logical + physical,
2068   - scrub_dev, flags, generation,
2069   - mirror_num);
  2446 + extent_logical = key.objectid;
  2447 + extent_physical = key.objectid - logical + physical;
  2448 + extent_len = key.offset;
  2449 + extent_dev = scrub_dev;
  2450 + extent_mirror_num = mirror_num;
  2451 + if (is_dev_replace)
  2452 + scrub_remap_extent(fs_info, extent_logical,
  2453 + extent_len, &extent_physical,
  2454 + &extent_dev,
  2455 + &extent_mirror_num);
  2456 + ret = scrub_extent(sctx, extent_logical, extent_len,
  2457 + extent_physical, extent_dev, flags,
  2458 + generation, extent_mirror_num,
  2459 + key.objectid - logical + physical);
2070 2460 if (ret)
2071 2461 goto out;
2072 2462  
2073 2463  
2074 2464  
... ... @@ -2080,10 +2470,13 @@
2080 2470 sctx->stat.last_physical = physical;
2081 2471 spin_unlock(&sctx->stat_lock);
2082 2472 }
  2473 +out:
2083 2474 /* push queued extents */
2084 2475 scrub_submit(sctx);
  2476 + mutex_lock(&sctx->wr_ctx.wr_lock);
  2477 + scrub_wr_submit(sctx);
  2478 + mutex_unlock(&sctx->wr_ctx.wr_lock);
2085 2479  
2086   -out:
2087 2480 blk_finish_plug(&plug);
2088 2481 btrfs_free_path(path);
2089 2482 return ret < 0 ? ret : 0;
2090 2483  
... ... @@ -2093,14 +2486,14 @@
2093 2486 struct btrfs_device *scrub_dev,
2094 2487 u64 chunk_tree, u64 chunk_objectid,
2095 2488 u64 chunk_offset, u64 length,
2096   - u64 dev_offset)
  2489 + u64 dev_offset, int is_dev_replace)
2097 2490 {
2098 2491 struct btrfs_mapping_tree *map_tree =
2099 2492 &sctx->dev_root->fs_info->mapping_tree;
2100 2493 struct map_lookup *map;
2101 2494 struct extent_map *em;
2102 2495 int i;
2103   - int ret = -EINVAL;
  2496 + int ret = 0;
2104 2497  
2105 2498 read_lock(&map_tree->map_tree.lock);
2106 2499 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
... ... @@ -2120,7 +2513,8 @@
2120 2513 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
2121 2514 map->stripes[i].physical == dev_offset) {
2122 2515 ret = scrub_stripe(sctx, map, scrub_dev, i,
2123   - chunk_offset, length);
  2516 + chunk_offset, length,
  2517 + is_dev_replace);
2124 2518 if (ret)
2125 2519 goto out;
2126 2520 }
... ... @@ -2133,7 +2527,8 @@
2133 2527  
2134 2528 static noinline_for_stack
2135 2529 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2136   - struct btrfs_device *scrub_dev, u64 start, u64 end)
  2530 + struct btrfs_device *scrub_dev, u64 start, u64 end,
  2531 + int is_dev_replace)
2137 2532 {
2138 2533 struct btrfs_dev_extent *dev_extent = NULL;
2139 2534 struct btrfs_path *path;
... ... @@ -2149,6 +2544,7 @@
2149 2544 struct btrfs_key key;
2150 2545 struct btrfs_key found_key;
2151 2546 struct btrfs_block_group_cache *cache;
  2547 + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
2152 2548  
2153 2549 path = btrfs_alloc_path();
2154 2550 if (!path)
2155 2551  
2156 2552  
... ... @@ -2214,11 +2610,61 @@
2214 2610 ret = -ENOENT;
2215 2611 break;
2216 2612 }
  2613 + dev_replace->cursor_right = found_key.offset + length;
  2614 + dev_replace->cursor_left = found_key.offset;
  2615 + dev_replace->item_needs_writeback = 1;
2217 2616 ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
2218   - chunk_offset, length, found_key.offset);
  2617 + chunk_offset, length, found_key.offset,
  2618 + is_dev_replace);
  2619 +
  2620 + /*
  2621 + * flush, submit all pending read and write bios, afterwards
  2622 + * wait for them.
  2623 + * Note that in the dev replace case, a read request causes
  2624 + * write requests that are submitted in the read completion
  2625 + * worker. Therefore in the current situation, it is required
  2626 + * that all write requests are flushed, so that all read and
  2627 + * write requests are really completed when bios_in_flight
  2628 + * changes to 0.
  2629 + */
  2630 + atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
  2631 + scrub_submit(sctx);
  2632 + mutex_lock(&sctx->wr_ctx.wr_lock);
  2633 + scrub_wr_submit(sctx);
  2634 + mutex_unlock(&sctx->wr_ctx.wr_lock);
  2635 +
  2636 + wait_event(sctx->list_wait,
  2637 + atomic_read(&sctx->bios_in_flight) == 0);
  2638 + atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
  2639 + atomic_inc(&fs_info->scrubs_paused);
  2640 + wake_up(&fs_info->scrub_pause_wait);
  2641 + wait_event(sctx->list_wait,
  2642 + atomic_read(&sctx->workers_pending) == 0);
  2643 +
  2644 + mutex_lock(&fs_info->scrub_lock);
  2645 + while (atomic_read(&fs_info->scrub_pause_req)) {
  2646 + mutex_unlock(&fs_info->scrub_lock);
  2647 + wait_event(fs_info->scrub_pause_wait,
  2648 + atomic_read(&fs_info->scrub_pause_req) == 0);
  2649 + mutex_lock(&fs_info->scrub_lock);
  2650 + }
  2651 + atomic_dec(&fs_info->scrubs_paused);
  2652 + mutex_unlock(&fs_info->scrub_lock);
  2653 + wake_up(&fs_info->scrub_pause_wait);
  2654 +
  2655 + dev_replace->cursor_left = dev_replace->cursor_right;
  2656 + dev_replace->item_needs_writeback = 1;
2219 2657 btrfs_put_block_group(cache);
2220 2658 if (ret)
2221 2659 break;
  2660 + if (atomic64_read(&dev_replace->num_write_errors) > 0) {
  2661 + ret = -EIO;
  2662 + break;
  2663 + }
  2664 + if (sctx->stat.malloc_errors > 0) {
  2665 + ret = -ENOMEM;
  2666 + break;
  2667 + }
2222 2668  
2223 2669 key.offset = found_key.offset + length;
2224 2670 btrfs_release_path(path);
... ... @@ -2254,7 +2700,7 @@
2254 2700  
2255 2701 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
2256 2702 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
2257   - NULL, 1);
  2703 + NULL, 1, bytenr);
2258 2704 if (ret)
2259 2705 return ret;
2260 2706 }
2261 2707  
2262 2708  
... ... @@ -2266,18 +2712,38 @@
2266 2712 /*
2267 2713 * get a reference count on fs_info->scrub_workers. start worker if necessary
2268 2714 */
2269   -static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info)
  2715 +static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
  2716 + int is_dev_replace)
2270 2717 {
2271 2718 int ret = 0;
2272 2719  
2273 2720 mutex_lock(&fs_info->scrub_lock);
2274 2721 if (fs_info->scrub_workers_refcnt == 0) {
2275   - btrfs_init_workers(&fs_info->scrub_workers, "scrub",
2276   - fs_info->thread_pool_size, &fs_info->generic_worker);
  2722 + if (is_dev_replace)
  2723 + btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
  2724 + &fs_info->generic_worker);
  2725 + else
  2726 + btrfs_init_workers(&fs_info->scrub_workers, "scrub",
  2727 + fs_info->thread_pool_size,
  2728 + &fs_info->generic_worker);
2277 2729 fs_info->scrub_workers.idle_thresh = 4;
2278 2730 ret = btrfs_start_workers(&fs_info->scrub_workers);
2279 2731 if (ret)
2280 2732 goto out;
  2733 + btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
  2734 + "scrubwrc",
  2735 + fs_info->thread_pool_size,
  2736 + &fs_info->generic_worker);
  2737 + fs_info->scrub_wr_completion_workers.idle_thresh = 2;
  2738 + ret = btrfs_start_workers(
  2739 + &fs_info->scrub_wr_completion_workers);
  2740 + if (ret)
  2741 + goto out;
  2742 + btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
  2743 + &fs_info->generic_worker);
  2744 + ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
  2745 + if (ret)
  2746 + goto out;
2281 2747 }
2282 2748 ++fs_info->scrub_workers_refcnt;
2283 2749 out:
2284 2750  
... ... @@ -2289,8 +2755,11 @@
2289 2755 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2290 2756 {
2291 2757 mutex_lock(&fs_info->scrub_lock);
2292   - if (--fs_info->scrub_workers_refcnt == 0)
  2758 + if (--fs_info->scrub_workers_refcnt == 0) {
2293 2759 btrfs_stop_workers(&fs_info->scrub_workers);
  2760 + btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
  2761 + btrfs_stop_workers(&fs_info->scrub_nocow_workers);
  2762 + }
2294 2763 WARN_ON(fs_info->scrub_workers_refcnt < 0);
2295 2764 mutex_unlock(&fs_info->scrub_lock);
2296 2765 }
... ... @@ -2354,7 +2823,7 @@
2354 2823 return -EINVAL;
2355 2824 }
2356 2825  
2357   - ret = scrub_workers_get(fs_info);
  2826 + ret = scrub_workers_get(fs_info, is_dev_replace);
2358 2827 if (ret)
2359 2828 return ret;
2360 2829  
2361 2830  
... ... @@ -2394,12 +2863,15 @@
2394 2863 mutex_unlock(&fs_info->scrub_lock);
2395 2864 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2396 2865  
2397   - down_read(&fs_info->scrub_super_lock);
2398   - ret = scrub_supers(sctx, dev);
2399   - up_read(&fs_info->scrub_super_lock);
  2866 + if (!is_dev_replace) {
  2867 + down_read(&fs_info->scrub_super_lock);
  2868 + ret = scrub_supers(sctx, dev);
  2869 + up_read(&fs_info->scrub_super_lock);
  2870 + }
2400 2871  
2401 2872 if (!ret)
2402   - ret = scrub_enumerate_chunks(sctx, dev, start, end);
  2873 + ret = scrub_enumerate_chunks(sctx, dev, start, end,
  2874 + is_dev_replace);
2403 2875  
2404 2876 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2405 2877 atomic_dec(&fs_info->scrubs_running);
... ... @@ -2536,5 +3008,274 @@
2536 3008 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2537 3009  
2538 3010 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
  3011 +}
  3012 +
  3013 +static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
  3014 + u64 extent_logical, u64 extent_len,
  3015 + u64 *extent_physical,
  3016 + struct btrfs_device **extent_dev,
  3017 + int *extent_mirror_num)
  3018 +{
  3019 + u64 mapped_length;
  3020 + struct btrfs_bio *bbio = NULL;
  3021 + int ret;
  3022 +
  3023 + mapped_length = extent_len;
  3024 + ret = btrfs_map_block(fs_info, READ, extent_logical,
  3025 + &mapped_length, &bbio, 0);
  3026 + if (ret || !bbio || mapped_length < extent_len ||
  3027 + !bbio->stripes[0].dev->bdev) {
  3028 + kfree(bbio);
  3029 + return;
  3030 + }
  3031 +
  3032 + *extent_physical = bbio->stripes[0].physical;
  3033 + *extent_mirror_num = bbio->mirror_num;
  3034 + *extent_dev = bbio->stripes[0].dev;
  3035 + kfree(bbio);
  3036 +}
  3037 +
  3038 +static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
  3039 + struct scrub_wr_ctx *wr_ctx,
  3040 + struct btrfs_fs_info *fs_info,
  3041 + struct btrfs_device *dev,
  3042 + int is_dev_replace)
  3043 +{
  3044 + WARN_ON(wr_ctx->wr_curr_bio != NULL);
  3045 +
  3046 + mutex_init(&wr_ctx->wr_lock);
  3047 + wr_ctx->wr_curr_bio = NULL;
  3048 + if (!is_dev_replace)
  3049 + return 0;
  3050 +
  3051 + WARN_ON(!dev->bdev);
  3052 + wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
  3053 + bio_get_nr_vecs(dev->bdev));
  3054 + wr_ctx->tgtdev = dev;
  3055 + atomic_set(&wr_ctx->flush_all_writes, 0);
  3056 + return 0;
  3057 +}
  3058 +
  3059 +static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
  3060 +{
  3061 + mutex_lock(&wr_ctx->wr_lock);
  3062 + kfree(wr_ctx->wr_curr_bio);
  3063 + wr_ctx->wr_curr_bio = NULL;
  3064 + mutex_unlock(&wr_ctx->wr_lock);
  3065 +}
  3066 +
  3067 +static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
  3068 + int mirror_num, u64 physical_for_dev_replace)
  3069 +{
  3070 + struct scrub_copy_nocow_ctx *nocow_ctx;
  3071 + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
  3072 +
  3073 + nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
  3074 + if (!nocow_ctx) {
  3075 + spin_lock(&sctx->stat_lock);
  3076 + sctx->stat.malloc_errors++;
  3077 + spin_unlock(&sctx->stat_lock);
  3078 + return -ENOMEM;
  3079 + }
  3080 +
  3081 + scrub_pending_trans_workers_inc(sctx);
  3082 +
  3083 + nocow_ctx->sctx = sctx;
  3084 + nocow_ctx->logical = logical;
  3085 + nocow_ctx->len = len;
  3086 + nocow_ctx->mirror_num = mirror_num;
  3087 + nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
  3088 + nocow_ctx->work.func = copy_nocow_pages_worker;
  3089 + btrfs_queue_worker(&fs_info->scrub_nocow_workers,
  3090 + &nocow_ctx->work);
  3091 +
  3092 + return 0;
  3093 +}
  3094 +
  3095 +static void copy_nocow_pages_worker(struct btrfs_work *work)
  3096 +{
  3097 + struct scrub_copy_nocow_ctx *nocow_ctx =
  3098 + container_of(work, struct scrub_copy_nocow_ctx, work);
  3099 + struct scrub_ctx *sctx = nocow_ctx->sctx;
  3100 + u64 logical = nocow_ctx->logical;
  3101 + u64 len = nocow_ctx->len;
  3102 + int mirror_num = nocow_ctx->mirror_num;
  3103 + u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
  3104 + int ret;
  3105 + struct btrfs_trans_handle *trans = NULL;
  3106 + struct btrfs_fs_info *fs_info;
  3107 + struct btrfs_path *path;
  3108 + struct btrfs_root *root;
  3109 + int not_written = 0;
  3110 +
  3111 + fs_info = sctx->dev_root->fs_info;
  3112 + root = fs_info->extent_root;
  3113 +
  3114 + path = btrfs_alloc_path();
  3115 + if (!path) {
  3116 + spin_lock(&sctx->stat_lock);
  3117 + sctx->stat.malloc_errors++;
  3118 + spin_unlock(&sctx->stat_lock);
  3119 + not_written = 1;
  3120 + goto out;
  3121 + }
  3122 +
  3123 + trans = btrfs_join_transaction(root);
  3124 + if (IS_ERR(trans)) {
  3125 + not_written = 1;
  3126 + goto out;
  3127 + }
  3128 +
  3129 + ret = iterate_inodes_from_logical(logical, fs_info, path,
  3130 + copy_nocow_pages_for_inode,
  3131 + nocow_ctx);
  3132 + if (ret != 0 && ret != -ENOENT) {
  3133 + pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n",
  3134 + (unsigned long long)logical,
  3135 + (unsigned long long)physical_for_dev_replace,
  3136 + (unsigned long long)len,
  3137 + (unsigned long long)mirror_num, ret);
  3138 + not_written = 1;
  3139 + goto out;
  3140 + }
  3141 +
  3142 +out:
  3143 + if (trans && !IS_ERR(trans))
  3144 + btrfs_end_transaction(trans, root);
  3145 + if (not_written)
  3146 + btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
  3147 + num_uncorrectable_read_errors);
  3148 +
  3149 + btrfs_free_path(path);
  3150 + kfree(nocow_ctx);
  3151 +
  3152 + scrub_pending_trans_workers_dec(sctx);
  3153 +}
  3154 +
  3155 +static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
  3156 +{
  3157 + unsigned long index;
  3158 + struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
  3159 + int ret = 0;
  3160 + struct btrfs_key key;
  3161 + struct inode *inode = NULL;
  3162 + struct btrfs_root *local_root;
  3163 + u64 physical_for_dev_replace;
  3164 + u64 len;
  3165 + struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
  3166 +
  3167 + key.objectid = root;
  3168 + key.type = BTRFS_ROOT_ITEM_KEY;
  3169 + key.offset = (u64)-1;
  3170 + local_root = btrfs_read_fs_root_no_name(fs_info, &key);
  3171 + if (IS_ERR(local_root))
  3172 + return PTR_ERR(local_root);
  3173 +
  3174 + key.type = BTRFS_INODE_ITEM_KEY;
  3175 + key.objectid = inum;
  3176 + key.offset = 0;
  3177 + inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
  3178 + if (IS_ERR(inode))
  3179 + return PTR_ERR(inode);
  3180 +
  3181 + physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
  3182 + len = nocow_ctx->len;
  3183 + while (len >= PAGE_CACHE_SIZE) {
  3184 + struct page *page = NULL;
  3185 + int ret_sub;
  3186 +
  3187 + index = offset >> PAGE_CACHE_SHIFT;
  3188 +
  3189 + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
  3190 + if (!page) {
  3191 + pr_err("find_or_create_page() failed\n");
  3192 + ret = -ENOMEM;
  3193 + goto next_page;
  3194 + }
  3195 +
  3196 + if (PageUptodate(page)) {
  3197 + if (PageDirty(page))
  3198 + goto next_page;
  3199 + } else {
  3200 + ClearPageError(page);
  3201 + ret_sub = extent_read_full_page(&BTRFS_I(inode)->
  3202 + io_tree,
  3203 + page, btrfs_get_extent,
  3204 + nocow_ctx->mirror_num);
  3205 + if (ret_sub) {
  3206 + ret = ret_sub;
  3207 + goto next_page;
  3208 + }
  3209 + wait_on_page_locked(page);
  3210 + if (!PageUptodate(page)) {
  3211 + ret = -EIO;
  3212 + goto next_page;
  3213 + }
  3214 + }
  3215 + ret_sub = write_page_nocow(nocow_ctx->sctx,
  3216 + physical_for_dev_replace, page);
  3217 + if (ret_sub) {
  3218 + ret = ret_sub;
  3219 + goto next_page;
  3220 + }
  3221 +
  3222 +next_page:
  3223 + if (page) {
  3224 + unlock_page(page);
  3225 + put_page(page);
  3226 + }
  3227 + offset += PAGE_CACHE_SIZE;
  3228 + physical_for_dev_replace += PAGE_CACHE_SIZE;
  3229 + len -= PAGE_CACHE_SIZE;
  3230 + }
  3231 +
  3232 + if (inode)
  3233 + iput(inode);
  3234 + return ret;
  3235 +}
  3236 +
  3237 +static int write_page_nocow(struct scrub_ctx *sctx,
  3238 + u64 physical_for_dev_replace, struct page *page)
  3239 +{
  3240 + struct bio *bio;
  3241 + struct btrfs_device *dev;
  3242 + int ret;
  3243 + DECLARE_COMPLETION_ONSTACK(compl);
  3244 +
  3245 + dev = sctx->wr_ctx.tgtdev;
  3246 + if (!dev)
  3247 + return -EIO;
  3248 + if (!dev->bdev) {
  3249 + printk_ratelimited(KERN_WARNING
  3250 + "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
  3251 + return -EIO;
  3252 + }
  3253 + bio = bio_alloc(GFP_NOFS, 1);
  3254 + if (!bio) {
  3255 + spin_lock(&sctx->stat_lock);
  3256 + sctx->stat.malloc_errors++;
  3257 + spin_unlock(&sctx->stat_lock);
  3258 + return -ENOMEM;
  3259 + }
  3260 + bio->bi_private = &compl;
  3261 + bio->bi_end_io = scrub_complete_bio_end_io;
  3262 + bio->bi_size = 0;
  3263 + bio->bi_sector = physical_for_dev_replace >> 9;
  3264 + bio->bi_bdev = dev->bdev;
  3265 + ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
  3266 + if (ret != PAGE_CACHE_SIZE) {
  3267 +leave_with_eio:
  3268 + bio_put(bio);
  3269 + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
  3270 + return -EIO;
  3271 + }
  3272 + btrfsic_submit_bio(WRITE_SYNC, bio);
  3273 + wait_for_completion(&compl);
  3274 +
  3275 + if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
  3276 + goto leave_with_eio;
  3277 +
  3278 + bio_put(bio);
  3279 + return 0;
2539 3280 }
... ... @@ -1195,7 +1195,8 @@
1195 1195 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
1196 1196 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
1197 1197 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
1198   - btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size);
  1198 + btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers,
  1199 + new_pool_size);
1199 1200 }
1200 1201  
1201 1202 static int btrfs_remount(struct super_block *sb, int *flags, char *data)