Commit 71cdd40fd498f12679070def668f6a4719ddbd1c
Committed by
Trond Myklebust
1 parent
31e6306a40
Exists in
master
and in
4 other branches
pnfsblock: write_pagelist handle zero invalid extents
For invalid extents, find other pages in the same fsblock and write them out. [pnfsblock: write_begin] Signed-off-by: Fred Isaman <iisaman@citi.umich.edu> Signed-off-by: Benny Halevy <bhalevy@panasas.com> Signed-off-by: Benny Halevy <bhalevy@tonian.com> Signed-off-by: Peng Tao <peng_tao@emc.com> Signed-off-by: Jim Rees <rees@umich.edu> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Showing 1 changed file with 233 additions and 42 deletions Side-by-side Diff
fs/nfs/blocklayout/blocklayout.c
... | ... | @@ -35,6 +35,7 @@ |
35 | 35 | #include <linux/mount.h> |
36 | 36 | #include <linux/namei.h> |
37 | 37 | #include <linux/bio.h> /* struct bio */ |
38 | +#include <linux/buffer_head.h> /* various write calls */ | |
38 | 39 | |
39 | 40 | #include "blocklayout.h" |
40 | 41 | |
... | ... | @@ -79,12 +80,8 @@ |
79 | 80 | */ |
80 | 81 | static int is_writable(struct pnfs_block_extent *be, sector_t isect) |
81 | 82 | { |
82 | - if (be->be_state == PNFS_BLOCK_READWRITE_DATA) | |
83 | - return 1; | |
84 | - else if (be->be_state != PNFS_BLOCK_INVALID_DATA) | |
85 | - return 0; | |
86 | - else | |
87 | - return bl_is_sector_init(be->be_inval, isect); | |
83 | + return (be->be_state == PNFS_BLOCK_READWRITE_DATA || | |
84 | + be->be_state == PNFS_BLOCK_INVALID_DATA); | |
88 | 85 | } |
89 | 86 | |
90 | 87 | /* The data we are handed might be spread across several bios. We need |
... | ... | @@ -353,6 +350,31 @@ |
353 | 350 | } |
354 | 351 | } |
355 | 352 | |
353 | +static void bl_end_io_write_zero(struct bio *bio, int err) | |
354 | +{ | |
355 | + struct parallel_io *par = bio->bi_private; | |
356 | + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | |
357 | + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; | |
358 | + struct nfs_write_data *wdata = (struct nfs_write_data *)par->data; | |
359 | + | |
360 | + do { | |
361 | + struct page *page = bvec->bv_page; | |
362 | + | |
363 | + if (--bvec >= bio->bi_io_vec) | |
364 | + prefetchw(&bvec->bv_page->flags); | |
365 | + /* This is the zeroing page we added */ | |
366 | + end_page_writeback(page); | |
367 | + page_cache_release(page); | |
368 | + } while (bvec >= bio->bi_io_vec); | |
369 | + if (!uptodate) { | |
370 | + if (!wdata->pnfs_error) | |
371 | + wdata->pnfs_error = -EIO; | |
372 | + bl_set_lo_fail(wdata->lseg); | |
373 | + } | |
374 | + bio_put(bio); | |
375 | + put_parallel(par); | |
376 | +} | |
377 | + | |
356 | 378 | /* This is basically copied from mpage_end_io_read */ |
357 | 379 | static void bl_end_io_write(struct bio *bio, int err) |
358 | 380 | { |
359 | 381 | |
... | ... | @@ -379,11 +401,8 @@ |
379 | 401 | dprintk("%s enter\n", __func__); |
380 | 402 | task = container_of(work, struct rpc_task, u.tk_work); |
381 | 403 | wdata = container_of(task, struct nfs_write_data, task); |
382 | - if (!wdata->task.tk_status) { | |
404 | + if (!wdata->pnfs_error) { | |
383 | 405 | /* Marks for LAYOUTCOMMIT */ |
384 | - /* BUG - this should be called after each bio, not after | |
385 | - * all finish, unless have some way of storing success/failure | |
386 | - */ | |
387 | 406 | mark_extents_written(BLK_LSEG2EXT(wdata->lseg), |
388 | 407 | wdata->args.offset, wdata->args.count); |
389 | 408 | } |
390 | 409 | |
391 | 410 | |
392 | 411 | |
393 | 412 | |
394 | 413 | |
395 | 414 | |
... | ... | @@ -391,38 +410,110 @@ |
391 | 410 | } |
392 | 411 | |
393 | 412 | /* Called when last of bios associated with a bl_write_pagelist call finishes */ |
394 | -static void | |
395 | -bl_end_par_io_write(void *data) | |
413 | +static void bl_end_par_io_write(void *data) | |
396 | 414 | { |
397 | 415 | struct nfs_write_data *wdata = data; |
398 | 416 | |
399 | - /* STUB - ignoring error handling */ | |
400 | 417 | wdata->task.tk_status = 0; |
401 | 418 | wdata->verf.committed = NFS_FILE_SYNC; |
402 | 419 | INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); |
403 | 420 | schedule_work(&wdata->task.u.tk_work); |
404 | 421 | } |
405 | 422 | |
423 | +/* FIXME STUB - mark intersection of layout and page as bad, so is not | |
424 | + * used again. | |
425 | + */ | |
426 | +static void mark_bad_read(void) | |
427 | +{ | |
428 | + return; | |
429 | +} | |
430 | + | |
431 | +/* | |
432 | + * map_block: map a requested I/O block (isect) into an offset in the LVM | |
433 | + * block_device | |
434 | + */ | |
435 | +static void | |
436 | +map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be) | |
437 | +{ | |
438 | + dprintk("%s enter be=%p\n", __func__, be); | |
439 | + | |
440 | + set_buffer_mapped(bh); | |
441 | + bh->b_bdev = be->be_mdev; | |
442 | + bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >> | |
443 | + (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT); | |
444 | + | |
445 | + dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n", | |
446 | + __func__, (unsigned long long)isect, (long)bh->b_blocknr, | |
447 | + bh->b_size); | |
448 | + return; | |
449 | +} | |
450 | + | |
451 | +/* Given an unmapped page, zero it or read in page for COW, page is locked | |
452 | + * by caller. | |
453 | + */ | |
454 | +static int | |
455 | +init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read) | |
456 | +{ | |
457 | + struct buffer_head *bh = NULL; | |
458 | + int ret = 0; | |
459 | + sector_t isect; | |
460 | + | |
461 | + dprintk("%s enter, %p\n", __func__, page); | |
462 | + BUG_ON(PageUptodate(page)); | |
463 | + if (!cow_read) { | |
464 | + zero_user_segment(page, 0, PAGE_SIZE); | |
465 | + SetPageUptodate(page); | |
466 | + goto cleanup; | |
467 | + } | |
468 | + | |
469 | + bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0); | |
470 | + if (!bh) { | |
471 | + ret = -ENOMEM; | |
472 | + goto cleanup; | |
473 | + } | |
474 | + | |
475 | + isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT; | |
476 | + map_block(bh, isect, cow_read); | |
477 | + if (!bh_uptodate_or_lock(bh)) | |
478 | + ret = bh_submit_read(bh); | |
479 | + if (ret) | |
480 | + goto cleanup; | |
481 | + SetPageUptodate(page); | |
482 | + | |
483 | +cleanup: | |
484 | + bl_put_extent(cow_read); | |
485 | + if (bh) | |
486 | + free_buffer_head(bh); | |
487 | + if (ret) { | |
488 | + /* Need to mark layout with bad read...should now | |
489 | + * just use nfs4 for reads and writes. | |
490 | + */ | |
491 | + mark_bad_read(); | |
492 | + } | |
493 | + return ret; | |
494 | +} | |
495 | + | |
406 | 496 | static enum pnfs_try_status |
407 | 497 | bl_write_pagelist(struct nfs_write_data *wdata, int sync) |
408 | 498 | { |
409 | - int i; | |
499 | + int i, ret, npg_zero, pg_index, last = 0; | |
410 | 500 | struct bio *bio = NULL; |
411 | - struct pnfs_block_extent *be = NULL; | |
412 | - sector_t isect, extent_length = 0; | |
501 | + struct pnfs_block_extent *be = NULL, *cow_read = NULL; | |
502 | + sector_t isect, last_isect = 0, extent_length = 0; | |
413 | 503 | struct parallel_io *par; |
414 | 504 | loff_t offset = wdata->args.offset; |
415 | 505 | size_t count = wdata->args.count; |
416 | 506 | struct page **pages = wdata->args.pages; |
417 | - int pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; | |
507 | + struct page *page; | |
508 | + pgoff_t index; | |
509 | + u64 temp; | |
510 | + int npg_per_block = | |
511 | + NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT; | |
418 | 512 | |
419 | 513 | dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); |
420 | 514 | /* At this point, wdata->pages is a (sequential) list of nfs_pages. |
421 | - * We want to write each, and if there is an error remove it from | |
422 | - * list and call | |
423 | - * nfs_retry_request(req) to have it redone using nfs. | |
424 | - * QUEST? Do as block or per req? Think have to do per block | |
425 | - * as part of end_bio | |
515 | + * We want to write each, and if there is an error set pnfs_error | |
516 | + * to have it redone using nfs. | |
426 | 517 | */ |
427 | 518 | par = alloc_parallel(wdata); |
428 | 519 | if (!par) |
... | ... | @@ -433,7 +524,91 @@ |
433 | 524 | /* At this point, have to be more careful with error handling */ |
434 | 525 | |
435 | 526 | isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); |
436 | - for (i = pg_index; i < wdata->npages ; i++) { | |
527 | + be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read); | |
528 | + if (!be || !is_writable(be, isect)) { | |
529 | + dprintk("%s no matching extents!\n", __func__); | |
530 | + wdata->pnfs_error = -EINVAL; | |
531 | + goto out; | |
532 | + } | |
533 | + | |
534 | + /* First page inside INVALID extent */ | |
535 | + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { | |
536 | + temp = offset >> PAGE_CACHE_SHIFT; | |
537 | + npg_zero = do_div(temp, npg_per_block); | |
538 | + isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) & | |
539 | + (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); | |
540 | + extent_length = be->be_length - (isect - be->be_f_offset); | |
541 | + | |
542 | +fill_invalid_ext: | |
543 | + dprintk("%s need to zero %d pages\n", __func__, npg_zero); | |
544 | + for (;npg_zero > 0; npg_zero--) { | |
545 | + /* page ref released in bl_end_io_write_zero */ | |
546 | + index = isect >> PAGE_CACHE_SECTOR_SHIFT; | |
547 | + dprintk("%s zero %dth page: index %lu isect %llu\n", | |
548 | + __func__, npg_zero, index, | |
549 | + (unsigned long long)isect); | |
550 | + page = | |
551 | + find_or_create_page(wdata->inode->i_mapping, index, | |
552 | + GFP_NOFS); | |
553 | + if (!page) { | |
554 | + dprintk("%s oom\n", __func__); | |
555 | + wdata->pnfs_error = -ENOMEM; | |
556 | + goto out; | |
557 | + } | |
558 | + | |
559 | + /* PageDirty: Other will write this out | |
560 | + * PageWriteback: Other is writing this out | |
561 | + * PageUptodate: It was read before | |
562 | + * sector_initialized: already written out | |
563 | + */ | |
564 | + if (PageDirty(page) || PageWriteback(page) || | |
565 | + bl_is_sector_init(be->be_inval, isect)) { | |
566 | + print_page(page); | |
567 | + unlock_page(page); | |
568 | + page_cache_release(page); | |
569 | + goto next_page; | |
570 | + } | |
571 | + if (!PageUptodate(page)) { | |
572 | + /* New page, readin or zero it */ | |
573 | + init_page_for_write(page, cow_read); | |
574 | + } | |
575 | + set_page_writeback(page); | |
576 | + unlock_page(page); | |
577 | + | |
578 | + ret = bl_mark_sectors_init(be->be_inval, isect, | |
579 | + PAGE_CACHE_SECTORS, | |
580 | + NULL); | |
581 | + if (unlikely(ret)) { | |
582 | + dprintk("%s bl_mark_sectors_init fail %d\n", | |
583 | + __func__, ret); | |
584 | + end_page_writeback(page); | |
585 | + page_cache_release(page); | |
586 | + wdata->pnfs_error = ret; | |
587 | + goto out; | |
588 | + } | |
589 | + bio = bl_add_page_to_bio(bio, npg_zero, WRITE, | |
590 | + isect, page, be, | |
591 | + bl_end_io_write_zero, par); | |
592 | + if (IS_ERR(bio)) { | |
593 | + wdata->pnfs_error = PTR_ERR(bio); | |
594 | + goto out; | |
595 | + } | |
596 | + /* FIXME: This should be done in bi_end_io */ | |
597 | + mark_extents_written(BLK_LSEG2EXT(wdata->lseg), | |
598 | + page->index << PAGE_CACHE_SHIFT, | |
599 | + PAGE_CACHE_SIZE); | |
600 | +next_page: | |
601 | + isect += PAGE_CACHE_SECTORS; | |
602 | + extent_length -= PAGE_CACHE_SECTORS; | |
603 | + } | |
604 | + if (last) | |
605 | + goto write_done; | |
606 | + } | |
607 | + bio = bl_submit_bio(WRITE, bio); | |
608 | + | |
609 | + /* Middle pages */ | |
610 | + pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; | |
611 | + for (i = pg_index; i < wdata->npages; i++) { | |
437 | 612 | if (!extent_length) { |
438 | 613 | /* We've used up the previous extent */ |
439 | 614 | bl_put_extent(be); |
440 | 615 | |
441 | 616 | |
442 | 617 | |
443 | 618 | |
444 | 619 | |
445 | 620 | |
446 | 621 | |
... | ... | @@ -442,35 +617,51 @@ |
442 | 617 | be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), |
443 | 618 | isect, NULL); |
444 | 619 | if (!be || !is_writable(be, isect)) { |
445 | - wdata->pnfs_error = -ENOMEM; | |
620 | + wdata->pnfs_error = -EINVAL; | |
446 | 621 | goto out; |
447 | 622 | } |
448 | 623 | extent_length = be->be_length - |
449 | - (isect - be->be_f_offset); | |
624 | + (isect - be->be_f_offset); | |
450 | 625 | } |
451 | - for (;;) { | |
452 | - if (!bio) { | |
453 | - bio = bio_alloc(GFP_NOIO, wdata->npages - i); | |
454 | - if (!bio) { | |
455 | - wdata->pnfs_error = -ENOMEM; | |
456 | - goto out; | |
457 | - } | |
458 | - bio->bi_sector = isect - be->be_f_offset + | |
459 | - be->be_v_offset; | |
460 | - bio->bi_bdev = be->be_mdev; | |
461 | - bio->bi_end_io = bl_end_io_write; | |
462 | - bio->bi_private = par; | |
626 | + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { | |
627 | + ret = bl_mark_sectors_init(be->be_inval, isect, | |
628 | + PAGE_CACHE_SECTORS, | |
629 | + NULL); | |
630 | + if (unlikely(ret)) { | |
631 | + dprintk("%s bl_mark_sectors_init fail %d\n", | |
632 | + __func__, ret); | |
633 | + wdata->pnfs_error = ret; | |
634 | + goto out; | |
463 | 635 | } |
464 | - if (bio_add_page(bio, pages[i], PAGE_SIZE, 0)) | |
465 | - break; | |
466 | - bio = bl_submit_bio(WRITE, bio); | |
467 | 636 | } |
637 | + bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE, | |
638 | + isect, pages[i], be, | |
639 | + bl_end_io_write, par); | |
640 | + if (IS_ERR(bio)) { | |
641 | + wdata->pnfs_error = PTR_ERR(bio); | |
642 | + goto out; | |
643 | + } | |
468 | 644 | isect += PAGE_CACHE_SECTORS; |
645 | + last_isect = isect; | |
469 | 646 | extent_length -= PAGE_CACHE_SECTORS; |
470 | 647 | } |
471 | - wdata->res.count = (isect << SECTOR_SHIFT) - (offset); | |
472 | - if (count < wdata->res.count) | |
648 | + | |
649 | + /* Last page inside INVALID extent */ | |
650 | + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { | |
651 | + bio = bl_submit_bio(WRITE, bio); | |
652 | + temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT; | |
653 | + npg_zero = npg_per_block - do_div(temp, npg_per_block); | |
654 | + if (npg_zero < npg_per_block) { | |
655 | + last = 1; | |
656 | + goto fill_invalid_ext; | |
657 | + } | |
658 | + } | |
659 | + | |
660 | +write_done: | |
661 | + wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset); | |
662 | + if (count < wdata->res.count) { | |
473 | 663 | wdata->res.count = count; |
664 | + } | |
474 | 665 | out: |
475 | 666 | bl_put_extent(be); |
476 | 667 | bl_submit_bio(WRITE, bio); |