Commit 0e5b88cd9975dca6c191cc9bd11f233fac4ca882
Exists in master and in 7 other branches
Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable
* git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable:
  Btrfs: break out of shrink_delalloc earlier
  btrfs: fix not enough reserved space
  btrfs: fix dip leak
  Btrfs: make sure not to return overlapping extents to fiemap
  Btrfs: deal with short returns from copy_from_user
  Btrfs: fix regressions in copy_from_user handling
Showing 5 changed files Side-by-side Diff
fs/btrfs/ctree.h
... | ... | @@ -729,6 +729,15 @@ |
729 | 729 | u64 disk_total; /* total bytes on disk, takes mirrors into |
730 | 730 | account */ |
731 | 731 | |
732 | + /* | |
733 | + * we bump reservation progress every time we decrement | |
734 | + * bytes_reserved. This way people waiting for reservations | |
735 | + * know something good has happened and they can check | |
736 | + * for progress. The number here isn't to be trusted, it | |
737 | + * just shows reclaim activity | |
738 | + */ | |
739 | + unsigned long reservation_progress; | |
740 | + | |
732 | 741 | int full; /* indicates that we cannot allocate any more |
733 | 742 | chunks for this space */ |
734 | 743 | int force_alloc; /* set if we need to force a chunk alloc for |
fs/btrfs/extent-tree.c
... | ... | @@ -3342,15 +3342,16 @@ |
3342 | 3342 | u64 max_reclaim; |
3343 | 3343 | u64 reclaimed = 0; |
3344 | 3344 | long time_left; |
3345 | - int pause = 1; | |
3346 | 3345 | int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; |
3347 | 3346 | int loops = 0; |
3347 | + unsigned long progress; | |
3348 | 3348 | |
3349 | 3349 | block_rsv = &root->fs_info->delalloc_block_rsv; |
3350 | 3350 | space_info = block_rsv->space_info; |
3351 | 3351 | |
3352 | 3352 | smp_mb(); |
3353 | 3353 | reserved = space_info->bytes_reserved; |
3354 | + progress = space_info->reservation_progress; | |
3354 | 3355 | |
3355 | 3356 | if (reserved == 0) |
3356 | 3357 | return 0; |
3357 | 3358 | |
3358 | 3359 | |
3359 | 3360 | |
3360 | 3361 | |
3361 | 3362 | |
... | ... | @@ -3365,32 +3366,37 @@ |
3365 | 3366 | writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); |
3366 | 3367 | |
3367 | 3368 | spin_lock(&space_info->lock); |
3368 | - if (reserved > space_info->bytes_reserved) { | |
3369 | - loops = 0; | |
3369 | + if (reserved > space_info->bytes_reserved) | |
3370 | 3370 | reclaimed += reserved - space_info->bytes_reserved; |
3371 | - } else { | |
3372 | - loops++; | |
3373 | - } | |
3374 | 3371 | reserved = space_info->bytes_reserved; |
3375 | 3372 | spin_unlock(&space_info->lock); |
3376 | 3373 | |
3374 | + loops++; | |
3375 | + | |
3377 | 3376 | if (reserved == 0 || reclaimed >= max_reclaim) |
3378 | 3377 | break; |
3379 | 3378 | |
3380 | 3379 | if (trans && trans->transaction->blocked) |
3381 | 3380 | return -EAGAIN; |
3382 | 3381 | |
3383 | - __set_current_state(TASK_INTERRUPTIBLE); | |
3384 | - time_left = schedule_timeout(pause); | |
3382 | + time_left = schedule_timeout_interruptible(1); | |
3385 | 3383 | |
3386 | 3384 | /* We were interrupted, exit */ |
3387 | 3385 | if (time_left) |
3388 | 3386 | break; |
3389 | 3387 | |
3390 | - pause <<= 1; | |
3391 | - if (pause > HZ / 10) | |
3392 | - pause = HZ / 10; | |
3388 | + /* we've kicked the IO a few times, if anything has been freed, | |
3389 | + * exit. There is no sense in looping here for a long time | |
3390 | + * when we really need to commit the transaction, or there are | |
3391 | + * just too many writers without enough free space | |
3392 | + */ | |
3393 | 3393 | |
3394 | + if (loops > 3) { | |
3395 | + smp_mb(); | |
3396 | + if (progress != space_info->reservation_progress) | |
3397 | + break; | |
3398 | + } | |
3399 | + | |
3394 | 3400 | } |
3395 | 3401 | return reclaimed >= to_reclaim; |
3396 | 3402 | } |
... | ... | @@ -3612,6 +3618,7 @@ |
3612 | 3618 | if (num_bytes) { |
3613 | 3619 | spin_lock(&space_info->lock); |
3614 | 3620 | space_info->bytes_reserved -= num_bytes; |
3621 | + space_info->reservation_progress++; | |
3615 | 3622 | spin_unlock(&space_info->lock); |
3616 | 3623 | } |
3617 | 3624 | } |
... | ... | @@ -3844,6 +3851,7 @@ |
3844 | 3851 | if (block_rsv->reserved >= block_rsv->size) { |
3845 | 3852 | num_bytes = block_rsv->reserved - block_rsv->size; |
3846 | 3853 | sinfo->bytes_reserved -= num_bytes; |
3854 | + sinfo->reservation_progress++; | |
3847 | 3855 | block_rsv->reserved = block_rsv->size; |
3848 | 3856 | block_rsv->full = 1; |
3849 | 3857 | } |
... | ... | @@ -4005,7 +4013,6 @@ |
4005 | 4013 | to_reserve = 0; |
4006 | 4014 | } |
4007 | 4015 | spin_unlock(&BTRFS_I(inode)->accounting_lock); |
4008 | - | |
4009 | 4016 | to_reserve += calc_csum_metadata_size(inode, num_bytes); |
4010 | 4017 | ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); |
4011 | 4018 | if (ret) |
... | ... | @@ -4133,6 +4140,7 @@ |
4133 | 4140 | btrfs_set_block_group_used(&cache->item, old_val); |
4134 | 4141 | cache->reserved -= num_bytes; |
4135 | 4142 | cache->space_info->bytes_reserved -= num_bytes; |
4143 | + cache->space_info->reservation_progress++; | |
4136 | 4144 | cache->space_info->bytes_used += num_bytes; |
4137 | 4145 | cache->space_info->disk_used += num_bytes * factor; |
4138 | 4146 | spin_unlock(&cache->lock); |
... | ... | @@ -4184,6 +4192,7 @@ |
4184 | 4192 | if (reserved) { |
4185 | 4193 | cache->reserved -= num_bytes; |
4186 | 4194 | cache->space_info->bytes_reserved -= num_bytes; |
4195 | + cache->space_info->reservation_progress++; | |
4187 | 4196 | } |
4188 | 4197 | spin_unlock(&cache->lock); |
4189 | 4198 | spin_unlock(&cache->space_info->lock); |
... | ... | @@ -4234,6 +4243,7 @@ |
4234 | 4243 | space_info->bytes_readonly += num_bytes; |
4235 | 4244 | cache->reserved -= num_bytes; |
4236 | 4245 | space_info->bytes_reserved -= num_bytes; |
4246 | + space_info->reservation_progress++; | |
4237 | 4247 | } |
4238 | 4248 | spin_unlock(&cache->lock); |
4239 | 4249 | spin_unlock(&space_info->lock); |
... | ... | @@ -4712,6 +4722,7 @@ |
4712 | 4722 | if (ret) { |
4713 | 4723 | spin_lock(&cache->space_info->lock); |
4714 | 4724 | cache->space_info->bytes_reserved -= buf->len; |
4725 | + cache->space_info->reservation_progress++; | |
4715 | 4726 | spin_unlock(&cache->space_info->lock); |
4716 | 4727 | } |
4717 | 4728 | goto out; |
fs/btrfs/extent_io.c
... | ... | @@ -3046,17 +3046,38 @@ |
3046 | 3046 | } |
3047 | 3047 | |
3048 | 3048 | while (!end) { |
3049 | - off = extent_map_end(em); | |
3050 | - if (off >= max) | |
3051 | - end = 1; | |
3049 | + u64 offset_in_extent; | |
3052 | 3050 | |
3053 | - em_start = em->start; | |
3054 | - em_len = em->len; | |
3051 | + /* break if the extent we found is outside the range */ | |
3052 | + if (em->start >= max || extent_map_end(em) < off) | |
3053 | + break; | |
3054 | + | |
3055 | + /* | |
3056 | + * get_extent may return an extent that starts before our | |
3057 | + * requested range. We have to make sure the ranges | |
3058 | + * we return to fiemap always move forward and don't | |
3059 | + * overlap, so adjust the offsets here | |
3060 | + */ | |
3061 | + em_start = max(em->start, off); | |
3062 | + | |
3063 | + /* | |
3064 | + * record the offset from the start of the extent | |
3065 | + * for adjusting the disk offset below | |
3066 | + */ | |
3067 | + offset_in_extent = em_start - em->start; | |
3055 | 3068 | em_end = extent_map_end(em); |
3069 | + em_len = em_end - em_start; | |
3056 | 3070 | emflags = em->flags; |
3057 | 3071 | disko = 0; |
3058 | 3072 | flags = 0; |
3059 | 3073 | |
3074 | + /* | |
3075 | + * bump off for our next call to get_extent | |
3076 | + */ | |
3077 | + off = extent_map_end(em); | |
3078 | + if (off >= max) | |
3079 | + end = 1; | |
3080 | + | |
3060 | 3081 | if (em->block_start == EXTENT_MAP_LAST_BYTE) { |
3061 | 3082 | end = 1; |
3062 | 3083 | flags |= FIEMAP_EXTENT_LAST; |
... | ... | @@ -3067,7 +3088,7 @@ |
3067 | 3088 | flags |= (FIEMAP_EXTENT_DELALLOC | |
3068 | 3089 | FIEMAP_EXTENT_UNKNOWN); |
3069 | 3090 | } else { |
3070 | - disko = em->block_start; | |
3091 | + disko = em->block_start + offset_in_extent; | |
3071 | 3092 | } |
3072 | 3093 | if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) |
3073 | 3094 | flags |= FIEMAP_EXTENT_ENCODED; |
fs/btrfs/file.c
... | ... | @@ -70,6 +70,19 @@ |
70 | 70 | |
71 | 71 | /* Flush processor's dcache for this page */ |
72 | 72 | flush_dcache_page(page); |
73 | + | |
74 | + /* | |
75 | + * if we get a partial write, we can end up with | |
76 | + * partially up to date pages. These add | |
77 | + * a lot of complexity, so make sure they don't | |
78 | + * happen by forcing this copy to be retried. | |
79 | + * | |
80 | + * The rest of the btrfs_file_write code will fall | |
81 | + * back to page at a time copies after we return 0. | |
82 | + */ | |
83 | + if (!PageUptodate(page) && copied < count) | |
84 | + copied = 0; | |
85 | + | |
73 | 86 | iov_iter_advance(i, copied); |
74 | 87 | write_bytes -= copied; |
75 | 88 | total_copied += copied; |
... | ... | @@ -763,6 +776,27 @@ |
763 | 776 | } |
764 | 777 | |
765 | 778 | /* |
779 | + * on error we return an unlocked page and the error value | |
780 | + * on success we return a locked page and 0 | |
781 | + */ | |
782 | +static int prepare_uptodate_page(struct page *page, u64 pos) | |
783 | +{ | |
784 | + int ret = 0; | |
785 | + | |
786 | + if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) { | |
787 | + ret = btrfs_readpage(NULL, page); | |
788 | + if (ret) | |
789 | + return ret; | |
790 | + lock_page(page); | |
791 | + if (!PageUptodate(page)) { | |
792 | + unlock_page(page); | |
793 | + return -EIO; | |
794 | + } | |
795 | + } | |
796 | + return 0; | |
797 | +} | |
798 | + | |
799 | +/* | |
766 | 800 | * this gets pages into the page cache and locks them down, it also properly |
767 | 801 | * waits for data=ordered extents to finish before allowing the pages to be |
768 | 802 | * modified. |
... | ... | @@ -777,6 +811,7 @@ |
777 | 811 | unsigned long index = pos >> PAGE_CACHE_SHIFT; |
778 | 812 | struct inode *inode = fdentry(file)->d_inode; |
779 | 813 | int err = 0; |
814 | + int faili = 0; | |
780 | 815 | u64 start_pos; |
781 | 816 | u64 last_pos; |
782 | 817 | |
783 | 818 | |
784 | 819 | |
... | ... | @@ -794,15 +829,24 @@ |
794 | 829 | for (i = 0; i < num_pages; i++) { |
795 | 830 | pages[i] = grab_cache_page(inode->i_mapping, index + i); |
796 | 831 | if (!pages[i]) { |
797 | - int c; | |
798 | - for (c = i - 1; c >= 0; c--) { | |
799 | - unlock_page(pages[c]); | |
800 | - page_cache_release(pages[c]); | |
801 | - } | |
802 | - return -ENOMEM; | |
832 | + faili = i - 1; | |
833 | + err = -ENOMEM; | |
834 | + goto fail; | |
803 | 835 | } |
836 | + | |
837 | + if (i == 0) | |
838 | + err = prepare_uptodate_page(pages[i], pos); | |
839 | + if (i == num_pages - 1) | |
840 | + err = prepare_uptodate_page(pages[i], | |
841 | + pos + write_bytes); | |
842 | + if (err) { | |
843 | + page_cache_release(pages[i]); | |
844 | + faili = i - 1; | |
845 | + goto fail; | |
846 | + } | |
804 | 847 | wait_on_page_writeback(pages[i]); |
805 | 848 | } |
849 | + err = 0; | |
806 | 850 | if (start_pos < inode->i_size) { |
807 | 851 | struct btrfs_ordered_extent *ordered; |
808 | 852 | lock_extent_bits(&BTRFS_I(inode)->io_tree, |
... | ... | @@ -842,6 +886,14 @@ |
842 | 886 | WARN_ON(!PageLocked(pages[i])); |
843 | 887 | } |
844 | 888 | return 0; |
889 | +fail: | |
890 | + while (faili >= 0) { | |
891 | + unlock_page(pages[faili]); | |
892 | + page_cache_release(pages[faili]); | |
893 | + faili--; | |
894 | + } | |
895 | + return err; | |
896 | + | |
845 | 897 | } |
846 | 898 | |
847 | 899 | static ssize_t btrfs_file_aio_write(struct kiocb *iocb, |
... | ... | @@ -851,7 +903,6 @@ |
851 | 903 | struct file *file = iocb->ki_filp; |
852 | 904 | struct inode *inode = fdentry(file)->d_inode; |
853 | 905 | struct btrfs_root *root = BTRFS_I(inode)->root; |
854 | - struct page *pinned[2]; | |
855 | 906 | struct page **pages = NULL; |
856 | 907 | struct iov_iter i; |
857 | 908 | loff_t *ppos = &iocb->ki_pos; |
... | ... | @@ -872,9 +923,6 @@ |
872 | 923 | will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || |
873 | 924 | (file->f_flags & O_DIRECT)); |
874 | 925 | |
875 | - pinned[0] = NULL; | |
876 | - pinned[1] = NULL; | |
877 | - | |
878 | 926 | start_pos = pos; |
879 | 927 | |
880 | 928 | vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); |
... | ... | @@ -962,32 +1010,6 @@ |
962 | 1010 | first_index = pos >> PAGE_CACHE_SHIFT; |
963 | 1011 | last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; |
964 | 1012 | |
965 | - /* | |
966 | - * there are lots of better ways to do this, but this code | |
967 | - * makes sure the first and last page in the file range are | |
968 | - * up to date and ready for cow | |
969 | - */ | |
970 | - if ((pos & (PAGE_CACHE_SIZE - 1))) { | |
971 | - pinned[0] = grab_cache_page(inode->i_mapping, first_index); | |
972 | - if (!PageUptodate(pinned[0])) { | |
973 | - ret = btrfs_readpage(NULL, pinned[0]); | |
974 | - BUG_ON(ret); | |
975 | - wait_on_page_locked(pinned[0]); | |
976 | - } else { | |
977 | - unlock_page(pinned[0]); | |
978 | - } | |
979 | - } | |
980 | - if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) { | |
981 | - pinned[1] = grab_cache_page(inode->i_mapping, last_index); | |
982 | - if (!PageUptodate(pinned[1])) { | |
983 | - ret = btrfs_readpage(NULL, pinned[1]); | |
984 | - BUG_ON(ret); | |
985 | - wait_on_page_locked(pinned[1]); | |
986 | - } else { | |
987 | - unlock_page(pinned[1]); | |
988 | - } | |
989 | - } | |
990 | - | |
991 | 1013 | while (iov_iter_count(&i) > 0) { |
992 | 1014 | size_t offset = pos & (PAGE_CACHE_SIZE - 1); |
993 | 1015 | size_t write_bytes = min(iov_iter_count(&i), |
994 | 1016 | |
... | ... | @@ -1024,9 +1046,21 @@ |
1024 | 1046 | |
1025 | 1047 | copied = btrfs_copy_from_user(pos, num_pages, |
1026 | 1048 | write_bytes, pages, &i); |
1027 | - dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >> | |
1028 | - PAGE_CACHE_SHIFT; | |
1029 | 1049 | |
1050 | + /* | |
1051 | + * if we have trouble faulting in the pages, fall | |
1052 | + * back to one page at a time | |
1053 | + */ | |
1054 | + if (copied < write_bytes) | |
1055 | + nrptrs = 1; | |
1056 | + | |
1057 | + if (copied == 0) | |
1058 | + dirty_pages = 0; | |
1059 | + else | |
1060 | + dirty_pages = (copied + offset + | |
1061 | + PAGE_CACHE_SIZE - 1) >> | |
1062 | + PAGE_CACHE_SHIFT; | |
1063 | + | |
1030 | 1064 | if (num_pages > dirty_pages) { |
1031 | 1065 | if (copied > 0) |
1032 | 1066 | atomic_inc( |
... | ... | @@ -1069,10 +1103,6 @@ |
1069 | 1103 | err = ret; |
1070 | 1104 | |
1071 | 1105 | kfree(pages); |
1072 | - if (pinned[0]) | |
1073 | - page_cache_release(pinned[0]); | |
1074 | - if (pinned[1]) | |
1075 | - page_cache_release(pinned[1]); | |
1076 | 1106 | *ppos = pos; |
1077 | 1107 | |
1078 | 1108 | /* |
fs/btrfs/inode.c
... | ... | @@ -4821,10 +4821,11 @@ |
4821 | 4821 | goto fail; |
4822 | 4822 | |
4823 | 4823 | /* |
4824 | - * 1 item for inode ref | |
4824 | + * 2 items for inode and inode ref | |
4825 | 4825 | * 2 items for dir items |
4826 | + * 1 item for parent inode | |
4826 | 4827 | */ |
4827 | - trans = btrfs_start_transaction(root, 3); | |
4828 | + trans = btrfs_start_transaction(root, 5); | |
4828 | 4829 | if (IS_ERR(trans)) { |
4829 | 4830 | err = PTR_ERR(trans); |
4830 | 4831 | goto fail; |
... | ... | @@ -6056,6 +6057,7 @@ |
6056 | 6057 | if (!skip_sum) { |
6057 | 6058 | dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); |
6058 | 6059 | if (!dip->csums) { |
6060 | + kfree(dip); | |
6059 | 6061 | ret = -ENOMEM; |
6060 | 6062 | goto free_ordered; |
6061 | 6063 | } |