Commit 0e5b88cd9975dca6c191cc9bd11f233fac4ca882

Authored by Linus Torvalds

Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable

* git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable:
  Btrfs: break out of shrink_delalloc earlier
  btrfs: fix not enough reserved space
  btrfs: fix dip leak
  Btrfs: make sure not to return overlapping extents to fiemap
  Btrfs: deal with short returns from copy_from_user
  Btrfs: fix regressions in copy_from_user handling

Showing 5 changed files

fs/btrfs/ctree.h
... ... @@ -729,6 +729,15 @@
729 729 u64 disk_total; /* total bytes on disk, takes mirrors into
730 730 account */
731 731  
  732 + /*
  733 + * we bump reservation progress every time we decrement
  734 + * bytes_reserved. This way people waiting for reservations
  735 + * know something good has happened and they can check
  736 + * for progress. The number here isn't to be trusted, it
  737 + * just shows reclaim activity
  738 + */
  739 + unsigned long reservation_progress;
  740 +
732 741 int full; /* indicates that we cannot allocate any more
733 742 chunks for this space */
734 743 int force_alloc; /* set if we need to force a chunk alloc for
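The comment above describes a deliberately loose signal: the counter's absolute value means nothing, only a change does, so waiters can snapshot it, sleep, and re-check. A minimal userspace sketch of that idiom (hypothetical names, not the kernel code; in btrfs the bump happens under space_info->lock and readers pair it with smp_mb()):

    struct space_pool {
            unsigned long bytes_reserved;
            unsigned long reservation_progress;     /* bumped on every release */
    };

    static void release_bytes(struct space_pool *pool, unsigned long n)
    {
            pool->bytes_reserved -= n;
            pool->reservation_progress++;           /* wake hint for waiters */
    }

    /* a waiter compares against the snapshot it took before sleeping */
    static int reclaim_made_progress(const struct space_pool *pool,
                                     unsigned long snapshot)
    {
            return pool->reservation_progress != snapshot;
    }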
fs/btrfs/extent-tree.c
... ... @@ -3342,15 +3342,16 @@
3342 3342 u64 max_reclaim;
3343 3343 u64 reclaimed = 0;
3344 3344 long time_left;
3345   - int pause = 1;
3346 3345 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3347 3346 int loops = 0;
  3347 + unsigned long progress;
3348 3348  
3349 3349 block_rsv = &root->fs_info->delalloc_block_rsv;
3350 3350 space_info = block_rsv->space_info;
3351 3351  
3352 3352 smp_mb();
3353 3353 reserved = space_info->bytes_reserved;
  3354 + progress = space_info->reservation_progress;
3354 3355  
3355 3356 if (reserved == 0)
3356 3357 return 0;
... ... @@ -3365,32 +3366,37 @@
3365 3366 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
3366 3367  
3367 3368 spin_lock(&space_info->lock);
3368   - if (reserved > space_info->bytes_reserved) {
3369   - loops = 0;
  3369 + if (reserved > space_info->bytes_reserved)
3370 3370 reclaimed += reserved - space_info->bytes_reserved;
3371   - } else {
3372   - loops++;
3373   - }
3374 3371 reserved = space_info->bytes_reserved;
3375 3372 spin_unlock(&space_info->lock);
3376 3373  
  3374 + loops++;
  3375 +
3377 3376 if (reserved == 0 || reclaimed >= max_reclaim)
3378 3377 break;
3379 3378  
3380 3379 if (trans && trans->transaction->blocked)
3381 3380 return -EAGAIN;
3382 3381  
3383   - __set_current_state(TASK_INTERRUPTIBLE);
3384   - time_left = schedule_timeout(pause);
  3382 + time_left = schedule_timeout_interruptible(1);
3385 3383  
3386 3384 /* We were interrupted, exit */
3387 3385 if (time_left)
3388 3386 break;
3389 3387  
3390   - pause <<= 1;
3391   - if (pause > HZ / 10)
3392   - pause = HZ / 10;
  3388 + /* we've kicked the IO a few times, if anything has been freed,
  3389 + * exit. There is no sense in looping here for a long time
  3390 + * when we really need to commit the transaction, or there are
  3391 + * just too many writers without enough free space
  3392 + */
3393 3393  
  3394 + if (loops > 3) {
  3395 + smp_mb();
  3396 + if (progress != space_info->reservation_progress)
  3397 + break;
  3398 + }
  3399 +
3394 3400 }
3395 3401 return reclaimed >= to_reclaim;
3396 3402 }
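shrink_delalloc now counts every pass through the loop and, after three one-jiffy sleeps, bails out as soon as the snapshot taken at entry differs from reservation_progress: someone has freed space, so there is no point spinning when the caller really needs to commit the transaction. A standalone model of that retry policy (userspace sketch, hypothetical names):

    #include <stdbool.h>

    struct reclaim_state {
            unsigned long progress;         /* mirrors reservation_progress */
    };

    static bool should_keep_waiting(const struct reclaim_state *s,
                                    unsigned long snapshot, int loops)
    {
            /* the first few sleeps are unconditional; after that, stop
             * as soon as anyone has released space since the snapshot
             * was taken */
            if (loops <= 3)
                    return true;
            return s->progress == snapshot;
    }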
... ... @@ -3612,6 +3618,7 @@
3612 3618 if (num_bytes) {
3613 3619 spin_lock(&space_info->lock);
3614 3620 space_info->bytes_reserved -= num_bytes;
  3621 + space_info->reservation_progress++;
3615 3622 spin_unlock(&space_info->lock);
3616 3623 }
3617 3624 }
... ... @@ -3844,6 +3851,7 @@
3844 3851 if (block_rsv->reserved >= block_rsv->size) {
3845 3852 num_bytes = block_rsv->reserved - block_rsv->size;
3846 3853 sinfo->bytes_reserved -= num_bytes;
  3854 + sinfo->reservation_progress++;
3847 3855 block_rsv->reserved = block_rsv->size;
3848 3856 block_rsv->full = 1;
3849 3857 }
... ... @@ -4005,7 +4013,6 @@
4005 4013 to_reserve = 0;
4006 4014 }
4007 4015 spin_unlock(&BTRFS_I(inode)->accounting_lock);
4008   -
4009 4016 to_reserve += calc_csum_metadata_size(inode, num_bytes);
4010 4017 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
4011 4018 if (ret)
... ... @@ -4133,6 +4140,7 @@
4133 4140 btrfs_set_block_group_used(&cache->item, old_val);
4134 4141 cache->reserved -= num_bytes;
4135 4142 cache->space_info->bytes_reserved -= num_bytes;
  4143 + cache->space_info->reservation_progress++;
4136 4144 cache->space_info->bytes_used += num_bytes;
4137 4145 cache->space_info->disk_used += num_bytes * factor;
4138 4146 spin_unlock(&cache->lock);
... ... @@ -4184,6 +4192,7 @@
4184 4192 if (reserved) {
4185 4193 cache->reserved -= num_bytes;
4186 4194 cache->space_info->bytes_reserved -= num_bytes;
  4195 + cache->space_info->reservation_progress++;
4187 4196 }
4188 4197 spin_unlock(&cache->lock);
4189 4198 spin_unlock(&cache->space_info->lock);
... ... @@ -4234,6 +4243,7 @@
4234 4243 space_info->bytes_readonly += num_bytes;
4235 4244 cache->reserved -= num_bytes;
4236 4245 space_info->bytes_reserved -= num_bytes;
  4246 + space_info->reservation_progress++;
4237 4247 }
4238 4248 spin_unlock(&cache->lock);
4239 4249 spin_unlock(&space_info->lock);
... ... @@ -4712,6 +4722,7 @@
4712 4722 if (ret) {
4713 4723 spin_lock(&cache->space_info->lock);
4714 4724 cache->space_info->bytes_reserved -= buf->len;
  4725 + cache->space_info->reservation_progress++;
4715 4726 spin_unlock(&cache->space_info->lock);
4716 4727 }
4717 4728 goto out;
fs/btrfs/extent_io.c
... ... @@ -3046,17 +3046,38 @@
3046 3046 }
3047 3047  
3048 3048 while (!end) {
3049   - off = extent_map_end(em);
3050   - if (off >= max)
3051   - end = 1;
  3049 + u64 offset_in_extent;
3052 3050  
3053   - em_start = em->start;
3054   - em_len = em->len;
  3051 + /* break if the extent we found is outside the range */
  3052 + if (em->start >= max || extent_map_end(em) < off)
  3053 + break;
  3054 +
  3055 + /*
  3056 + * get_extent may return an extent that starts before our
  3057 + * requested range. We have to make sure the ranges
  3058 + * we return to fiemap always move forward and don't
  3059 + * overlap, so adjust the offsets here
  3060 + */
  3061 + em_start = max(em->start, off);
  3062 +
  3063 + /*
  3064 + * record the offset from the start of the extent
  3065 + * for adjusting the disk offset below
  3066 + */
  3067 + offset_in_extent = em_start - em->start;
3055 3068 em_end = extent_map_end(em);
  3069 + em_len = em_end - em_start;
3056 3070 emflags = em->flags;
3057 3071 disko = 0;
3058 3072 flags = 0;
3059 3073  
  3074 + /*
  3075 + * bump off for our next call to get_extent
  3076 + */
  3077 + off = extent_map_end(em);
  3078 + if (off >= max)
  3079 + end = 1;
  3080 +
3060 3081 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
3061 3082 end = 1;
3062 3083 flags |= FIEMAP_EXTENT_LAST;
... ... @@ -3067,7 +3088,7 @@
3067 3088 flags |= (FIEMAP_EXTENT_DELALLOC |
3068 3089 FIEMAP_EXTENT_UNKNOWN);
3069 3090 } else {
3070   - disko = em->block_start;
  3091 + disko = em->block_start + offset_in_extent;
3071 3092 }
3072 3093 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
3073 3094 flags |= FIEMAP_EXTENT_ENCODED;
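The fiemap change above handles get_extent returning an extent that begins before the requested offset: the reported start is clamped forward to the query offset and the disk address is shifted by the same amount, so consecutive fiemap entries always move forward and never overlap. A compilable model of that clamping (hypothetical helper, not the kernel function):

    #include <stdint.h>

    static void clamp_extent(uint64_t em_start, uint64_t em_end,
                             uint64_t disk_start, uint64_t off,
                             uint64_t *out_start, uint64_t *out_len,
                             uint64_t *out_disk)
    {
            uint64_t start = em_start > off ? em_start : off;
            uint64_t offset_in_extent = start - em_start;

            *out_start = start;
            *out_len   = em_end - start;
            *out_disk  = disk_start + offset_in_extent;  /* matches disko above */
    }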
fs/btrfs/file.c
... ... @@ -70,6 +70,19 @@
70 70  
71 71 /* Flush processor's dcache for this page */
72 72 flush_dcache_page(page);
  73 +
  74 + /*
  75 + * if we get a partial write, we can end up with
  76 + * partially up to date pages. These add
  77 + * a lot of complexity, so make sure they don't
  78 + * happen by forcing this copy to be retried.
  79 + *
  80 + * The rest of the btrfs_file_write code will fall
  81 + * back to page at a time copies after we return 0.
  82 + */
  83 + if (!PageUptodate(page) && copied < count)
  84 + copied = 0;
  85 +
73 86 iov_iter_advance(i, copied);
74 87 write_bytes -= copied;
75 88 total_copied += copied;
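The check added above enforces a simple rule: a short copy into a page that was not already up to date leaves a partially valid page, so the copy is discarded entirely rather than tracked. Because iov_iter_advance(i, 0) then advances nothing, the caller re-attempts the same range, falling back to page-at-a-time copies. A minimal model of the rule (hypothetical standalone helper):

    #include <stdbool.h>
    #include <stddef.h>

    /* returns how many bytes the caller may account as written */
    static size_t account_copy(bool page_uptodate, size_t copied, size_t count)
    {
            /* a short copy into a not-up-to-date page would leave the
             * page partially valid; report zero so the write path
             * retries instead */
            if (!page_uptodate && copied < count)
                    return 0;
            return copied;
    }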
... ... @@ -763,6 +776,27 @@
763 776 }
764 777  
765 778 /*
  779 + * on error we return an unlocked page and the error value
  780 + * on success we return a locked page and 0
  781 + */
  782 +static int prepare_uptodate_page(struct page *page, u64 pos)
  783 +{
  784 + int ret = 0;
  785 +
  786 + if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) {
  787 + ret = btrfs_readpage(NULL, page);
  788 + if (ret)
  789 + return ret;
  790 + lock_page(page);
  791 + if (!PageUptodate(page)) {
  792 + unlock_page(page);
  793 + return -EIO;
  794 + }
  795 + }
  796 + return 0;
  797 +}
  798 +
  799 +/*
766 800 * this gets pages into the page cache and locks them down, it also properly
767 801 * waits for data=ordered extents to finish before allowing the pages to be
768 802 * modified.
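prepare_uptodate_page only issues a read when the write boundary falls inside the page, i.e. when pos is not page-aligned; a write that covers the whole page needs no read before being overwritten. The caller applies it to just the first and last pages of the range, the only two that can be partially written. A tiny model of that test (userspace sketch; the 4 KiB page size is an assumption):

    #include <stdbool.h>
    #include <stdint.h>

    #define PAGE_SIZE_ASSUMED 4096u         /* assumption: 4 KiB pages */

    static bool needs_readpage(uint64_t pos, bool page_uptodate)
    {
            /* read only if the write edge lands mid-page and the page
             * does not already hold valid data */
            return (pos & (PAGE_SIZE_ASSUMED - 1)) != 0 && !page_uptodate;
    }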
... ... @@ -777,6 +811,7 @@
777 811 unsigned long index = pos >> PAGE_CACHE_SHIFT;
778 812 struct inode *inode = fdentry(file)->d_inode;
779 813 int err = 0;
  814 + int faili = 0;
780 815 u64 start_pos;
781 816 u64 last_pos;
782 817  
... ... @@ -794,15 +829,24 @@
794 829 for (i = 0; i < num_pages; i++) {
795 830 pages[i] = grab_cache_page(inode->i_mapping, index + i);
796 831 if (!pages[i]) {
797   - int c;
798   - for (c = i - 1; c >= 0; c--) {
799   - unlock_page(pages[c]);
800   - page_cache_release(pages[c]);
801   - }
802   - return -ENOMEM;
  832 + faili = i - 1;
  833 + err = -ENOMEM;
  834 + goto fail;
803 835 }
  836 +
  837 + if (i == 0)
  838 + err = prepare_uptodate_page(pages[i], pos);
  839 + if (i == num_pages - 1)
  840 + err = prepare_uptodate_page(pages[i],
  841 + pos + write_bytes);
  842 + if (err) {
  843 + page_cache_release(pages[i]);
  844 + faili = i - 1;
  845 + goto fail;
  846 + }
804 847 wait_on_page_writeback(pages[i]);
805 848 }
  849 + err = 0;
806 850 if (start_pos < inode->i_size) {
807 851 struct btrfs_ordered_extent *ordered;
808 852 lock_extent_bits(&BTRFS_I(inode)->io_tree,
... ... @@ -842,6 +886,14 @@
842 886 WARN_ON(!PageLocked(pages[i]));
843 887 }
844 888 return 0;
  889 +fail:
  890 + while (faili >= 0) {
  891 + unlock_page(pages[faili]);
  892 + page_cache_release(pages[faili]);
  893 + faili--;
  894 + }
  895 + return err;
  896 +
845 897 }
846 898  
847 899 static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
... ... @@ -851,7 +903,6 @@
851 903 struct file *file = iocb->ki_filp;
852 904 struct inode *inode = fdentry(file)->d_inode;
853 905 struct btrfs_root *root = BTRFS_I(inode)->root;
854   - struct page *pinned[2];
855 906 struct page **pages = NULL;
856 907 struct iov_iter i;
857 908 loff_t *ppos = &iocb->ki_pos;
... ... @@ -872,9 +923,6 @@
872 923 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
873 924 (file->f_flags & O_DIRECT));
874 925  
875   - pinned[0] = NULL;
876   - pinned[1] = NULL;
877   -
878 926 start_pos = pos;
879 927  
880 928 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
... ... @@ -962,32 +1010,6 @@
962 1010 first_index = pos >> PAGE_CACHE_SHIFT;
963 1011 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
964 1012  
965   - /*
966   - * there are lots of better ways to do this, but this code
967   - * makes sure the first and last page in the file range are
968   - * up to date and ready for cow
969   - */
970   - if ((pos & (PAGE_CACHE_SIZE - 1))) {
971   - pinned[0] = grab_cache_page(inode->i_mapping, first_index);
972   - if (!PageUptodate(pinned[0])) {
973   - ret = btrfs_readpage(NULL, pinned[0]);
974   - BUG_ON(ret);
975   - wait_on_page_locked(pinned[0]);
976   - } else {
977   - unlock_page(pinned[0]);
978   - }
979   - }
980   - if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
981   - pinned[1] = grab_cache_page(inode->i_mapping, last_index);
982   - if (!PageUptodate(pinned[1])) {
983   - ret = btrfs_readpage(NULL, pinned[1]);
984   - BUG_ON(ret);
985   - wait_on_page_locked(pinned[1]);
986   - } else {
987   - unlock_page(pinned[1]);
988   - }
989   - }
990   -
991 1013 while (iov_iter_count(&i) > 0) {
992 1014 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
993 1015 size_t write_bytes = min(iov_iter_count(&i),
... ... @@ -1024,9 +1046,21 @@
1024 1046  
1025 1047 copied = btrfs_copy_from_user(pos, num_pages,
1026 1048 write_bytes, pages, &i);
1027   - dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >>
1028   - PAGE_CACHE_SHIFT;
1029 1049  
  1050 + /*
  1051 + * if we have trouble faulting in the pages, fall
  1052 + * back to one page at a time
  1053 + */
  1054 + if (copied < write_bytes)
  1055 + nrptrs = 1;
  1056 +
  1057 + if (copied == 0)
  1058 + dirty_pages = 0;
  1059 + else
  1060 + dirty_pages = (copied + offset +
  1061 + PAGE_CACHE_SIZE - 1) >>
  1062 + PAGE_CACHE_SHIFT;
  1063 +
1030 1064 if (num_pages > dirty_pages) {
1031 1065 if (copied > 0)
1032 1066 atomic_inc(
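The dirty-page computation above rounds the copied span up to whole pages but now special-cases copied == 0. With 4 KiB pages, offset = 1000 and copied = 5000 gives (5000 + 1000 + 4095) >> 12 = 2 dirty pages; with copied = 0 the old expression still yielded (1000 + 4095) >> 12 = 1, wrongly dirtying a page after a failed copy. A compilable model (4 KiB page size assumed):

    #include <stddef.h>

    #define PAGE_SHIFT_ASSUMED 12           /* assumption: 4 KiB pages */
    #define PAGE_SIZE_ASSUMED  (1u << PAGE_SHIFT_ASSUMED)

    static size_t count_dirty_pages(size_t copied, size_t offset)
    {
            if (copied == 0)                /* failed copy dirties nothing */
                    return 0;
            return (copied + offset + PAGE_SIZE_ASSUMED - 1)
                            >> PAGE_SHIFT_ASSUMED;
    }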
... ... @@ -1069,10 +1103,6 @@
1069 1103 err = ret;
1070 1104  
1071 1105 kfree(pages);
1072   - if (pinned[0])
1073   - page_cache_release(pinned[0]);
1074   - if (pinned[1])
1075   - page_cache_release(pinned[1]);
1076 1106 *ppos = pos;
1077 1107  
1078 1108 /*
fs/btrfs/inode.c
... ... @@ -4821,10 +4821,11 @@
4821 4821 goto fail;
4822 4822  
4823 4823 /*
4824   - * 1 item for inode ref
  4824 + * 2 items for inode and inode ref
4825 4825 * 2 items for dir items
  4826 + * 1 item for parent inode
4826 4827 */
4827   - trans = btrfs_start_transaction(root, 3);
  4828 + trans = btrfs_start_transaction(root, 5);
4828 4829 if (IS_ERR(trans)) {
4829 4830 err = PTR_ERR(trans);
4830 4831 goto fail;
... ... @@ -6056,6 +6057,7 @@
6056 6057 if (!skip_sum) {
6057 6058 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
6058 6059 if (!dip->csums) {
  6060 + kfree(dip);
6059 6061 ret = -ENOMEM;
6060 6062 goto free_ordered;
6061 6063 }
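The one-line fix above plugs a leak: dip had just been allocated, and the free_ordered error path did not release it, so a failed csums allocation leaked the dio_private. The general shape, allocations released in reverse order on failure (hypothetical standalone structs, not the btrfs types):

    #include <stdlib.h>

    struct dio_private_model { unsigned int *csums; };

    static struct dio_private_model *dio_private_alloc(size_t nr_csums)
    {
            struct dio_private_model *dip = malloc(sizeof(*dip));
            if (!dip)
                    return NULL;

            dip->csums = malloc(nr_csums * sizeof(*dip->csums));
            if (!dip->csums) {
                    free(dip);      /* the missing kfree(dip) was the leak */
                    return NULL;
            }
            return dip;
    }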