Commit 71cdd40fd498f12679070def668f6a4719ddbd1c

Authored by Peng Tao
Committed by Trond Myklebust
1 parent 31e6306a40

pnfsblock: write_pagelist handle zero invalid extents

For invalid extents, find other pages in the same fsblock and write them out.

[pnfsblock: write_begin]
Signed-off-by: Fred Isaman <iisaman@citi.umich.edu>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Benny Halevy <bhalevy@tonian.com>
Signed-off-by: Peng Tao <peng_tao@emc.com>
Signed-off-by: Jim Rees <rees@umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>

Showing 1 changed file with 233 additions and 42 deletions
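The core of the change is the page arithmetic around the fill_invalid_ext label below: when a write lands inside a PNFS_BLOCK_INVALID_DATA extent, the remaining pages of the same filesystem block are zero-filled (or read from a copy-on-write extent) and written out alongside the caller's pages, so the whole block can be marked initialized. The following is a minimal userspace sketch of that head/tail page-count arithmetic only; the names head_zero_pages/tail_zero_pages, the 4 KiB page size and the 16-page block are illustrative assumptions, not identifiers or values taken from the patch.

/* zero_range_sketch.c - model of the fill_invalid_ext page counting.
 * Not kernel code; compiles with any C99 compiler and runs standalone.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12                   /* assume 4 KiB pages */
#define PAGE_SIZE  (1u << PAGE_SHIFT)

/* Pages that precede the first written page inside its fsblock.
 * The patch computes the same remainder with
 * do_div(offset >> PAGE_CACHE_SHIFT, npg_per_block). */
static unsigned head_zero_pages(uint64_t offset, unsigned pages_per_block)
{
	return (unsigned)((offset >> PAGE_SHIFT) % pages_per_block);
}

/* Pages that follow the last written page up to the end of its fsblock.
 * A result equal to pages_per_block means the write ended exactly on a
 * block boundary, so there is nothing left to zero. */
static unsigned tail_zero_pages(uint64_t end_offset, unsigned pages_per_block)
{
	uint64_t next_page = (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
	return (unsigned)(pages_per_block - (next_page % pages_per_block));
}

int main(void)
{
	unsigned ppb = 16;                  /* e.g. a 64 KiB pnfs_blksize */
	uint64_t off = 3 * PAGE_SIZE;       /* write starts at the 4th page */
	uint64_t end = 5 * PAGE_SIZE;       /* and ends after the 5th page  */

	printf("zero %u page(s) before and %u page(s) after the write\n",
	       head_zero_pages(off, ppb), tail_zero_pages(end, ppb));
	return 0;
}

In the patch the same remainders are taken with do_div() because the offsets are 64-bit, and npg_per_block comes from NFS_SERVER(inode)->pnfs_blksize >> PAGE_CACHE_SHIFT.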

fs/nfs/blocklayout/blocklayout.c
... ... @@ -35,6 +35,7 @@
35 35 #include <linux/mount.h>
36 36 #include <linux/namei.h>
37 37 #include <linux/bio.h> /* struct bio */
  38 +#include <linux/buffer_head.h> /* various write calls */
38 39  
39 40 #include "blocklayout.h"
40 41  
... ... @@ -79,12 +80,8 @@
79 80 */
80 81 static int is_writable(struct pnfs_block_extent *be, sector_t isect)
81 82 {
82   - if (be->be_state == PNFS_BLOCK_READWRITE_DATA)
83   - return 1;
84   - else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
85   - return 0;
86   - else
87   - return bl_is_sector_init(be->be_inval, isect);
  83 + return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
  84 + be->be_state == PNFS_BLOCK_INVALID_DATA);
88 85 }
89 86  
90 87 /* The data we are handed might be spread across several bios. We need
... ... @@ -353,6 +350,31 @@
353 350 }
354 351 }
355 352  
  353 +static void bl_end_io_write_zero(struct bio *bio, int err)
  354 +{
  355 + struct parallel_io *par = bio->bi_private;
  356 + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
  357 + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
  358 + struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
  359 +
  360 + do {
  361 + struct page *page = bvec->bv_page;
  362 +
  363 + if (--bvec >= bio->bi_io_vec)
  364 + prefetchw(&bvec->bv_page->flags);
  365 + /* This is the zeroing page we added */
  366 + end_page_writeback(page);
  367 + page_cache_release(page);
  368 + } while (bvec >= bio->bi_io_vec);
  369 + if (!uptodate) {
  370 + if (!wdata->pnfs_error)
  371 + wdata->pnfs_error = -EIO;
  372 + bl_set_lo_fail(wdata->lseg);
  373 + }
  374 + bio_put(bio);
  375 + put_parallel(par);
  376 +}
  377 +
356 378 /* This is basically copied from mpage_end_io_read */
357 379 static void bl_end_io_write(struct bio *bio, int err)
358 380 {
359 381  
... ... @@ -379,11 +401,8 @@
379 401 dprintk("%s enter\n", __func__);
380 402 task = container_of(work, struct rpc_task, u.tk_work);
381 403 wdata = container_of(task, struct nfs_write_data, task);
382   - if (!wdata->task.tk_status) {
  404 + if (!wdata->pnfs_error) {
383 405 /* Marks for LAYOUTCOMMIT */
384   - /* BUG - this should be called after each bio, not after
385   - * all finish, unless have some way of storing success/failure
386   - */
387 406 mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
388 407 wdata->args.offset, wdata->args.count);
389 408 }
... ... @@ -391,38 +410,110 @@
391 410 }
392 411  
393 412 /* Called when last of bios associated with a bl_write_pagelist call finishes */
394   -static void
395   -bl_end_par_io_write(void *data)
  413 +static void bl_end_par_io_write(void *data)
396 414 {
397 415 struct nfs_write_data *wdata = data;
398 416  
399   - /* STUB - ignoring error handling */
400 417 wdata->task.tk_status = 0;
401 418 wdata->verf.committed = NFS_FILE_SYNC;
402 419 INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
403 420 schedule_work(&wdata->task.u.tk_work);
404 421 }
405 422  
  423 +/* FIXME STUB - mark intersection of layout and page as bad, so is not
  424 + * used again.
  425 + */
  426 +static void mark_bad_read(void)
  427 +{
  428 + return;
  429 +}
  430 +
  431 +/*
  432 + * map_block: map a requested I/O block (isect) into an offset in the LVM
  433 + * block_device
  434 + */
  435 +static void
  436 +map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
  437 +{
  438 + dprintk("%s enter be=%p\n", __func__, be);
  439 +
  440 + set_buffer_mapped(bh);
  441 + bh->b_bdev = be->be_mdev;
  442 + bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
  443 + (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
  444 +
  445 + dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
  446 + __func__, (unsigned long long)isect, (long)bh->b_blocknr,
  447 + bh->b_size);
  448 + return;
  449 +}
  450 +
  451 +/* Given an unmapped page, zero it or read in page for COW, page is locked
  452 + * by caller.
  453 + */
  454 +static int
  455 +init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
  456 +{
  457 + struct buffer_head *bh = NULL;
  458 + int ret = 0;
  459 + sector_t isect;
  460 +
  461 + dprintk("%s enter, %p\n", __func__, page);
  462 + BUG_ON(PageUptodate(page));
  463 + if (!cow_read) {
  464 + zero_user_segment(page, 0, PAGE_SIZE);
  465 + SetPageUptodate(page);
  466 + goto cleanup;
  467 + }
  468 +
  469 + bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
  470 + if (!bh) {
  471 + ret = -ENOMEM;
  472 + goto cleanup;
  473 + }
  474 +
  475 + isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
  476 + map_block(bh, isect, cow_read);
  477 + if (!bh_uptodate_or_lock(bh))
  478 + ret = bh_submit_read(bh);
  479 + if (ret)
  480 + goto cleanup;
  481 + SetPageUptodate(page);
  482 +
  483 +cleanup:
  484 + bl_put_extent(cow_read);
  485 + if (bh)
  486 + free_buffer_head(bh);
  487 + if (ret) {
  488 + /* Need to mark layout with bad read...should now
  489 + * just use nfs4 for reads and writes.
  490 + */
  491 + mark_bad_read();
  492 + }
  493 + return ret;
  494 +}
  495 +
406 496 static enum pnfs_try_status
407 497 bl_write_pagelist(struct nfs_write_data *wdata, int sync)
408 498 {
409   - int i;
  499 + int i, ret, npg_zero, pg_index, last = 0;
410 500 struct bio *bio = NULL;
411   - struct pnfs_block_extent *be = NULL;
412   - sector_t isect, extent_length = 0;
  501 + struct pnfs_block_extent *be = NULL, *cow_read = NULL;
  502 + sector_t isect, last_isect = 0, extent_length = 0;
413 503 struct parallel_io *par;
414 504 loff_t offset = wdata->args.offset;
415 505 size_t count = wdata->args.count;
416 506 struct page **pages = wdata->args.pages;
417   - int pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
  507 + struct page *page;
  508 + pgoff_t index;
  509 + u64 temp;
  510 + int npg_per_block =
  511 + NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
418 512  
419 513 dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
420 514 /* At this point, wdata->pages is a (sequential) list of nfs_pages.
421   - * We want to write each, and if there is an error remove it from
422   - * list and call
423   - * nfs_retry_request(req) to have it redone using nfs.
424   - * QUEST? Do as block or per req? Think have to do per block
425   - * as part of end_bio
  515 + * We want to write each, and if there is an error set pnfs_error
  516 + * to have it redone using nfs.
426 517 */
427 518 par = alloc_parallel(wdata);
428 519 if (!par)
... ... @@ -433,7 +524,91 @@
433 524 /* At this point, have to be more careful with error handling */
434 525  
435 526 isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
436   - for (i = pg_index; i < wdata->npages ; i++) {
  527 + be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
  528 + if (!be || !is_writable(be, isect)) {
  529 + dprintk("%s no matching extents!\n", __func__);
  530 + wdata->pnfs_error = -EINVAL;
  531 + goto out;
  532 + }
  533 +
  534 + /* First page inside INVALID extent */
  535 + if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
  536 + temp = offset >> PAGE_CACHE_SHIFT;
  537 + npg_zero = do_div(temp, npg_per_block);
  538 + isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
  539 + (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
  540 + extent_length = be->be_length - (isect - be->be_f_offset);
  541 +
  542 +fill_invalid_ext:
  543 + dprintk("%s need to zero %d pages\n", __func__, npg_zero);
  544 + for (;npg_zero > 0; npg_zero--) {
  545 + /* page ref released in bl_end_io_write_zero */
  546 + index = isect >> PAGE_CACHE_SECTOR_SHIFT;
  547 + dprintk("%s zero %dth page: index %lu isect %llu\n",
  548 + __func__, npg_zero, index,
  549 + (unsigned long long)isect);
  550 + page =
  551 + find_or_create_page(wdata->inode->i_mapping, index,
  552 + GFP_NOFS);
  553 + if (!page) {
  554 + dprintk("%s oom\n", __func__);
  555 + wdata->pnfs_error = -ENOMEM;
  556 + goto out;
  557 + }
  558 +
  559 + /* PageDirty: Other will write this out
  560 + * PageWriteback: Other is writing this out
  561 + * PageUptodate: It was read before
  562 + * sector_initialized: already written out
  563 + */
  564 + if (PageDirty(page) || PageWriteback(page) ||
  565 + bl_is_sector_init(be->be_inval, isect)) {
  566 + print_page(page);
  567 + unlock_page(page);
  568 + page_cache_release(page);
  569 + goto next_page;
  570 + }
  571 + if (!PageUptodate(page)) {
  572 + /* New page, readin or zero it */
  573 + init_page_for_write(page, cow_read);
  574 + }
  575 + set_page_writeback(page);
  576 + unlock_page(page);
  577 +
  578 + ret = bl_mark_sectors_init(be->be_inval, isect,
  579 + PAGE_CACHE_SECTORS,
  580 + NULL);
  581 + if (unlikely(ret)) {
  582 + dprintk("%s bl_mark_sectors_init fail %d\n",
  583 + __func__, ret);
  584 + end_page_writeback(page);
  585 + page_cache_release(page);
  586 + wdata->pnfs_error = ret;
  587 + goto out;
  588 + }
  589 + bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
  590 + isect, page, be,
  591 + bl_end_io_write_zero, par);
  592 + if (IS_ERR(bio)) {
  593 + wdata->pnfs_error = PTR_ERR(bio);
  594 + goto out;
  595 + }
  596 + /* FIXME: This should be done in bi_end_io */
  597 + mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
  598 + page->index << PAGE_CACHE_SHIFT,
  599 + PAGE_CACHE_SIZE);
  600 +next_page:
  601 + isect += PAGE_CACHE_SECTORS;
  602 + extent_length -= PAGE_CACHE_SECTORS;
  603 + }
  604 + if (last)
  605 + goto write_done;
  606 + }
  607 + bio = bl_submit_bio(WRITE, bio);
  608 +
  609 + /* Middle pages */
  610 + pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
  611 + for (i = pg_index; i < wdata->npages; i++) {
437 612 if (!extent_length) {
438 613 /* We've used up the previous extent */
439 614 bl_put_extent(be);
... ... @@ -442,35 +617,51 @@
442 617 be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg),
443 618 isect, NULL);
444 619 if (!be || !is_writable(be, isect)) {
445   - wdata->pnfs_error = -ENOMEM;
  620 + wdata->pnfs_error = -EINVAL;
446 621 goto out;
447 622 }
448 623 extent_length = be->be_length -
449   - (isect - be->be_f_offset);
  624 + (isect - be->be_f_offset);
450 625 }
451   - for (;;) {
452   - if (!bio) {
453   - bio = bio_alloc(GFP_NOIO, wdata->npages - i);
454   - if (!bio) {
455   - wdata->pnfs_error = -ENOMEM;
456   - goto out;
457   - }
458   - bio->bi_sector = isect - be->be_f_offset +
459   - be->be_v_offset;
460   - bio->bi_bdev = be->be_mdev;
461   - bio->bi_end_io = bl_end_io_write;
462   - bio->bi_private = par;
  626 + if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
  627 + ret = bl_mark_sectors_init(be->be_inval, isect,
  628 + PAGE_CACHE_SECTORS,
  629 + NULL);
  630 + if (unlikely(ret)) {
  631 + dprintk("%s bl_mark_sectors_init fail %d\n",
  632 + __func__, ret);
  633 + wdata->pnfs_error = ret;
  634 + goto out;
463 635 }
464   - if (bio_add_page(bio, pages[i], PAGE_SIZE, 0))
465   - break;
466   - bio = bl_submit_bio(WRITE, bio);
467 636 }
  637 + bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE,
  638 + isect, pages[i], be,
  639 + bl_end_io_write, par);
  640 + if (IS_ERR(bio)) {
  641 + wdata->pnfs_error = PTR_ERR(bio);
  642 + goto out;
  643 + }
468 644 isect += PAGE_CACHE_SECTORS;
  645 + last_isect = isect;
469 646 extent_length -= PAGE_CACHE_SECTORS;
470 647 }
471   - wdata->res.count = (isect << SECTOR_SHIFT) - (offset);
472   - if (count < wdata->res.count)
  648 +
  649 + /* Last page inside INVALID extent */
  650 + if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
  651 + bio = bl_submit_bio(WRITE, bio);
  652 + temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
  653 + npg_zero = npg_per_block - do_div(temp, npg_per_block);
  654 + if (npg_zero < npg_per_block) {
  655 + last = 1;
  656 + goto fill_invalid_ext;
  657 + }
  658 + }
  659 +
  660 +write_done:
  661 + wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
  662 + if (count < wdata->res.count) {
473 663 wdata->res.count = count;
  664 + }
474 665 out:
475 666 bl_put_extent(be);
476 667 bl_submit_bio(WRITE, bio);
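
For reference, a rough model of the decision the fill_invalid_ext loop above makes for each surrounding page of an invalid block before it is queued with bl_add_page_to_bio(). Only the branch structure mirrors the patch; the struct and predicate names below are stand-ins, not kernel interfaces.

/* fill_invalid_sketch.c - per-page decision model, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

struct model_page {
	bool dirty;        /* PageDirty: someone else will write it out    */
	bool writeback;    /* PageWriteback: someone else is writing it    */
	bool initialized;  /* bl_is_sector_init: already written to disk   */
	bool uptodate;     /* PageUptodate: cached contents already valid  */
	bool has_cow_src;  /* a read extent covers this sector (cow_read)  */
};

enum action { SKIP, WRITE_AS_IS, ZERO_THEN_WRITE, COW_READ_THEN_WRITE };

static enum action classify(const struct model_page *p)
{
	if (p->dirty || p->writeback || p->initialized)
		return SKIP;                    /* another path owns this page */
	if (p->uptodate)
		return WRITE_AS_IS;             /* contents are already valid  */
	return p->has_cow_src ? COW_READ_THEN_WRITE : ZERO_THEN_WRITE;
}

int main(void)
{
	struct model_page fresh = { 0 };                 /* brand-new page */
	struct model_page cow   = { .has_cow_src = true };
	struct model_page busy  = { .writeback = true };

	printf("fresh=%d cow=%d busy=%d\n",
	       classify(&fresh), classify(&cow), classify(&busy));
	return 0;
}

Pages that fall into the SKIP case are left to the regular writeback path; the others are zeroed or read in by init_page_for_write() before set_page_writeback() and submission, and their sectors are marked initialized with bl_mark_sectors_init().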