Commit 3a307ffc2730bfa1a4dfa94537be9d412338aad2
1 parent
2e89b2e48e
ocfs2: rework ocfs2_buffered_write_cluster()
Use some ideas from the new-aops patch series and turn ocfs2_buffered_write_cluster() into a two-stage operation, with the caller copying data in between. The code now understands multi-cluster writes as a result of having to deal with a full page write for page sizes greater than 4k. This sets us up to easily call into the write path during ->page_mkwrite(). Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Showing 3 changed files with 551 additions and 438 deletions Side-by-side Diff
fs/ocfs2/aops.c
Changes suppressed. Click to show
... | ... | @@ -684,6 +684,8 @@ |
684 | 684 | bh = bh->b_this_page, block_start += bsize) { |
685 | 685 | block_end = block_start + bsize; |
686 | 686 | |
687 | + clear_buffer_new(bh); | |
688 | + | |
687 | 689 | /* |
688 | 690 | * Ignore blocks outside of our i/o range - |
689 | 691 | * they may belong to unallocated clusters. |
690 | 692 | |
... | ... | @@ -698,10 +700,9 @@ |
698 | 700 | * For an allocating write with cluster size >= page |
699 | 701 | * size, we always write the entire page. |
700 | 702 | */ |
703 | + if (new) | |
704 | + set_buffer_new(bh); | |
701 | 705 | |
702 | - if (buffer_new(bh)) | |
703 | - clear_buffer_new(bh); | |
704 | - | |
705 | 706 | if (!buffer_mapped(bh)) { |
706 | 707 | map_bh(bh, inode->i_sb, *p_blkno); |
707 | 708 | unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); |
708 | 709 | |
709 | 710 | |
710 | 711 | |
711 | 712 | |
712 | 713 | |
713 | 714 | |
714 | 715 | |
715 | 716 | |
716 | 717 | |
717 | 718 | |
718 | 719 | |
719 | 720 | |
720 | 721 | |
721 | 722 | |
722 | 723 | |
723 | 724 | |
724 | 725 | |
725 | 726 | |
726 | 727 | |
727 | 728 | |
728 | 729 | |
729 | 730 | |
730 | 731 | |
731 | 732 | |
732 | 733 | |
733 | 734 | |
734 | 735 | |
735 | 736 | |
736 | 737 | |
737 | 738 | |
738 | 739 | |
739 | 740 | |
740 | 741 | |
741 | 742 | |
... | ... | @@ -761,217 +762,232 @@ |
761 | 762 | return ret; |
762 | 763 | } |
763 | 764 | |
765 | +#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE) | |
766 | +#define OCFS2_MAX_CTXT_PAGES 1 | |
767 | +#else | |
768 | +#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE) | |
769 | +#endif | |
770 | + | |
771 | +#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) | |
772 | + | |
764 | 773 | /* |
765 | - * This will copy user data from the buffer page in the splice | |
766 | - * context. | |
767 | - * | |
768 | - * For now, we ignore SPLICE_F_MOVE as that would require some extra | |
769 | - * communication out all the way to ocfs2_write(). | |
774 | + * Describe the state of a single cluster to be written to. | |
770 | 775 | */ |
771 | -int ocfs2_map_and_write_splice_data(struct inode *inode, | |
772 | - struct ocfs2_write_ctxt *wc, u64 *p_blkno, | |
773 | - unsigned int *ret_from, unsigned int *ret_to) | |
774 | -{ | |
775 | - int ret; | |
776 | - unsigned int to, from, cluster_start, cluster_end; | |
777 | - char *src, *dst; | |
778 | - struct ocfs2_splice_write_priv *sp = wc->w_private; | |
779 | - struct pipe_buffer *buf = sp->s_buf; | |
780 | - unsigned long bytes, src_from; | |
781 | - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | |
776 | +struct ocfs2_write_cluster_desc { | |
777 | + u32 c_cpos; | |
778 | + u32 c_phys; | |
779 | + /* | |
780 | + * Give this a unique field because c_phys eventually gets | |
781 | + * filled. | |
782 | + */ | |
783 | + unsigned c_new; | |
784 | +}; | |
782 | 785 | |
783 | - ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, | |
784 | - &cluster_end); | |
786 | +struct ocfs2_write_ctxt { | |
787 | + /* Logical cluster position / len of write */ | |
788 | + u32 w_cpos; | |
789 | + u32 w_clen; | |
785 | 790 | |
786 | - from = sp->s_offset; | |
787 | - src_from = sp->s_buf_offset; | |
788 | - bytes = wc->w_count; | |
791 | + struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; | |
789 | 792 | |
790 | - if (wc->w_large_pages) { | |
791 | - /* | |
792 | - * For cluster size < page size, we have to | |
793 | - * calculate pos within the cluster and obey | |
794 | - * the rightmost boundary. | |
795 | - */ | |
796 | - bytes = min(bytes, (unsigned long)(osb->s_clustersize | |
797 | - - (wc->w_pos & (osb->s_clustersize - 1)))); | |
798 | - } | |
799 | - to = from + bytes; | |
793 | + /* | |
794 | + * This is true if page_size > cluster_size. | |
795 | + * | |
796 | + * It triggers a set of special cases during write which might | |
797 | + * have to deal with allocating writes to partial pages. | |
798 | + */ | |
799 | + unsigned int w_large_pages; | |
800 | 800 | |
801 | - BUG_ON(from > PAGE_CACHE_SIZE); | |
802 | - BUG_ON(to > PAGE_CACHE_SIZE); | |
803 | - BUG_ON(from < cluster_start); | |
804 | - BUG_ON(to > cluster_end); | |
801 | + /* | |
802 | + * Pages involved in this write. | |
803 | + * | |
804 | + * w_target_page is the page being written to by the user. | |
805 | + * | |
806 | + * w_pages is an array of pages which always contains | |
807 | + * w_target_page, and in the case of an allocating write with | |
808 | + * page_size < cluster size, it will contain zero'd and mapped | |
809 | + * pages adjacent to w_target_page which need to be written | |
810 | + * out so that future reads from that region will get | |
811 | + * zeros. | |
812 | + */ | |
813 | + struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; | |
814 | + unsigned int w_num_pages; | |
815 | + struct page *w_target_page; | |
805 | 816 | |
806 | - if (wc->w_this_page_new) | |
807 | - ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | |
808 | - cluster_start, cluster_end, 1); | |
809 | - else | |
810 | - ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | |
811 | - from, to, 0); | |
812 | - if (ret) { | |
813 | - mlog_errno(ret); | |
814 | - goto out; | |
817 | + /* | |
818 | + * ocfs2_write_end() uses this to know what the real range to | |
819 | + * write in the target should be. | |
820 | + */ | |
821 | + unsigned int w_target_from; | |
822 | + unsigned int w_target_to; | |
823 | + | |
824 | + /* | |
825 | + * We could use journal_current_handle() but this is cleaner, | |
826 | + * IMHO -Mark | |
827 | + */ | |
828 | + handle_t *w_handle; | |
829 | + | |
830 | + struct buffer_head *w_di_bh; | |
831 | +}; | |
832 | + | |
833 | +static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) | |
834 | +{ | |
835 | + int i; | |
836 | + | |
837 | + for(i = 0; i < wc->w_num_pages; i++) { | |
838 | + if (wc->w_pages[i] == NULL) | |
839 | + continue; | |
840 | + | |
841 | + unlock_page(wc->w_pages[i]); | |
842 | + mark_page_accessed(wc->w_pages[i]); | |
843 | + page_cache_release(wc->w_pages[i]); | |
815 | 844 | } |
816 | 845 | |
817 | - src = buf->ops->map(sp->s_pipe, buf, 1); | |
818 | - dst = kmap_atomic(wc->w_this_page, KM_USER1); | |
819 | - memcpy(dst + from, src + src_from, bytes); | |
820 | - kunmap_atomic(wc->w_this_page, KM_USER1); | |
821 | - buf->ops->unmap(sp->s_pipe, buf, src); | |
846 | + brelse(wc->w_di_bh); | |
847 | + kfree(wc); | |
848 | +} | |
822 | 849 | |
823 | - wc->w_finished_copy = 1; | |
850 | +static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, | |
851 | + struct ocfs2_super *osb, loff_t pos, | |
852 | + unsigned len) | |
853 | +{ | |
854 | + struct ocfs2_write_ctxt *wc; | |
824 | 855 | |
825 | - *ret_from = from; | |
826 | - *ret_to = to; | |
827 | -out: | |
856 | + wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS); | |
857 | + if (!wc) | |
858 | + return -ENOMEM; | |
828 | 859 | |
829 | - return bytes ? (unsigned int)bytes : ret; | |
860 | + wc->w_cpos = pos >> osb->s_clustersize_bits; | |
861 | + wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len); | |
862 | + | |
863 | + if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) | |
864 | + wc->w_large_pages = 1; | |
865 | + else | |
866 | + wc->w_large_pages = 0; | |
867 | + | |
868 | + *wcp = wc; | |
869 | + | |
870 | + return 0; | |
830 | 871 | } |
831 | 872 | |
832 | 873 | /* |
833 | - * This will copy user data from the iovec in the buffered write | |
834 | - * context. | |
874 | + * If a page has any new buffers, zero them out here, and mark them uptodate | |
875 | + * and dirty so they'll be written out (in order to prevent uninitialised | |
876 | + * block data from leaking). And clear the new bit. | |
835 | 877 | */ |
836 | -int ocfs2_map_and_write_user_data(struct inode *inode, | |
837 | - struct ocfs2_write_ctxt *wc, u64 *p_blkno, | |
838 | - unsigned int *ret_from, unsigned int *ret_to) | |
878 | +static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to) | |
839 | 879 | { |
840 | - int ret; | |
841 | - unsigned int to, from, cluster_start, cluster_end; | |
842 | - unsigned long bytes, src_from; | |
843 | - char *dst; | |
844 | - struct ocfs2_buffered_write_priv *bp = wc->w_private; | |
845 | - const struct iovec *cur_iov = bp->b_cur_iov; | |
846 | - char __user *buf; | |
847 | - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | |
880 | + unsigned int block_start, block_end; | |
881 | + struct buffer_head *head, *bh; | |
848 | 882 | |
849 | - ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, | |
850 | - &cluster_end); | |
883 | + BUG_ON(!PageLocked(page)); | |
884 | + if (!page_has_buffers(page)) | |
885 | + return; | |
851 | 886 | |
852 | - buf = cur_iov->iov_base + bp->b_cur_off; | |
853 | - src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; | |
887 | + bh = head = page_buffers(page); | |
888 | + block_start = 0; | |
889 | + do { | |
890 | + block_end = block_start + bh->b_size; | |
854 | 891 | |
855 | - from = wc->w_pos & (PAGE_CACHE_SIZE - 1); | |
892 | + if (buffer_new(bh)) { | |
893 | + if (block_end > from && block_start < to) { | |
894 | + if (!PageUptodate(page)) { | |
895 | + unsigned start, end; | |
896 | + void *kaddr; | |
856 | 897 | |
857 | - /* | |
858 | - * This is a lot of comparisons, but it reads quite | |
859 | - * easily, which is important here. | |
860 | - */ | |
861 | - /* Stay within the src page */ | |
862 | - bytes = PAGE_SIZE - src_from; | |
863 | - /* Stay within the vector */ | |
864 | - bytes = min(bytes, | |
865 | - (unsigned long)(cur_iov->iov_len - bp->b_cur_off)); | |
866 | - /* Stay within count */ | |
867 | - bytes = min(bytes, (unsigned long)wc->w_count); | |
868 | - /* | |
869 | - * For clustersize > page size, just stay within | |
870 | - * target page, otherwise we have to calculate pos | |
871 | - * within the cluster and obey the rightmost | |
872 | - * boundary. | |
873 | - */ | |
874 | - if (wc->w_large_pages) { | |
875 | - /* | |
876 | - * For cluster size < page size, we have to | |
877 | - * calculate pos within the cluster and obey | |
878 | - * the rightmost boundary. | |
879 | - */ | |
880 | - bytes = min(bytes, (unsigned long)(osb->s_clustersize | |
881 | - - (wc->w_pos & (osb->s_clustersize - 1)))); | |
882 | - } else { | |
883 | - /* | |
884 | - * cluster size > page size is the most common | |
885 | - * case - we just stay within the target page | |
886 | - * boundary. | |
887 | - */ | |
888 | - bytes = min(bytes, PAGE_CACHE_SIZE - from); | |
889 | - } | |
898 | + start = max(from, block_start); | |
899 | + end = min(to, block_end); | |
890 | 900 | |
891 | - to = from + bytes; | |
901 | + kaddr = kmap_atomic(page, KM_USER0); | |
902 | + memset(kaddr+start, 0, end - start); | |
903 | + flush_dcache_page(page); | |
904 | + kunmap_atomic(kaddr, KM_USER0); | |
905 | + set_buffer_uptodate(bh); | |
906 | + } | |
892 | 907 | |
893 | - BUG_ON(from > PAGE_CACHE_SIZE); | |
894 | - BUG_ON(to > PAGE_CACHE_SIZE); | |
895 | - BUG_ON(from < cluster_start); | |
896 | - BUG_ON(to > cluster_end); | |
908 | + clear_buffer_new(bh); | |
909 | + mark_buffer_dirty(bh); | |
910 | + } | |
911 | + } | |
897 | 912 | |
898 | - if (wc->w_this_page_new) | |
899 | - ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | |
900 | - cluster_start, cluster_end, 1); | |
901 | - else | |
902 | - ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | |
903 | - from, to, 0); | |
904 | - if (ret) { | |
905 | - mlog_errno(ret); | |
906 | - goto out; | |
907 | - } | |
913 | + block_start = block_end; | |
914 | + bh = bh->b_this_page; | |
915 | + } while (bh != head); | |
916 | +} | |
908 | 917 | |
909 | - dst = kmap(wc->w_this_page); | |
910 | - memcpy(dst + from, bp->b_src_buf + src_from, bytes); | |
911 | - kunmap(wc->w_this_page); | |
918 | +/* | |
919 | + * Only called when we have a failure during an allocating write, to write | |
920 | + * zeros to the newly allocated region. | |
921 | + */ | |
922 | +static void ocfs2_write_failure(struct inode *inode, | |
923 | + struct ocfs2_write_ctxt *wc, | |
924 | + loff_t user_pos, unsigned user_len) | |
925 | +{ | |
926 | + int i; | |
927 | + unsigned from, to; | |
928 | + struct page *tmppage; | |
912 | 929 | |
913 | - /* | |
914 | - * XXX: This is slow, but simple. The caller of | |
915 | - * ocfs2_buffered_write_cluster() is responsible for | |
916 | - * passing through the iovecs, so it's difficult to | |
917 | - * predict what our next step is in here after our | |
918 | - * initial write. A future version should be pushing | |
919 | - * that iovec manipulation further down. | |
920 | - * | |
921 | - * By setting this, we indicate that a copy from user | |
922 | - * data was done, and subsequent calls for this | |
923 | - * cluster will skip copying more data. | |
924 | - */ | |
925 | - wc->w_finished_copy = 1; | |
930 | + ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len); | |
926 | 931 | |
927 | - *ret_from = from; | |
928 | - *ret_to = to; | |
929 | -out: | |
932 | + if (wc->w_large_pages) { | |
933 | + from = wc->w_target_from; | |
934 | + to = wc->w_target_to; | |
935 | + } else { | |
936 | + from = 0; | |
937 | + to = PAGE_CACHE_SIZE; | |
938 | + } | |
930 | 939 | |
931 | - return bytes ? (unsigned int)bytes : ret; | |
940 | + for(i = 0; i < wc->w_num_pages; i++) { | |
941 | + tmppage = wc->w_pages[i]; | |
942 | + | |
943 | + if (ocfs2_should_order_data(inode)) | |
944 | + walk_page_buffers(wc->w_handle, page_buffers(tmppage), | |
945 | + from, to, NULL, | |
946 | + ocfs2_journal_dirty_data); | |
947 | + | |
948 | + block_commit_write(tmppage, from, to); | |
949 | + } | |
932 | 950 | } |
933 | 951 | |
934 | -/* | |
935 | - * Map, fill and write a page to disk. | |
936 | - * | |
937 | - * The work of copying data is done via callback. Newly allocated | |
938 | - * pages which don't take user data will be zero'd (set 'new' to | |
939 | - * indicate an allocating write) | |
940 | - * | |
941 | - * Returns a negative error code or the number of bytes copied into | |
942 | - * the page. | |
943 | - */ | |
944 | -static int ocfs2_write_data_page(struct inode *inode, handle_t *handle, | |
945 | - u64 *p_blkno, struct page *page, | |
946 | - struct ocfs2_write_ctxt *wc, int new) | |
952 | +static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, | |
953 | + struct ocfs2_write_ctxt *wc, | |
954 | + struct page *page, u32 cpos, | |
955 | + loff_t user_pos, unsigned user_len, | |
956 | + int new) | |
947 | 957 | { |
948 | - int ret, copied = 0; | |
949 | - unsigned int from = 0, to = 0; | |
958 | + int ret; | |
959 | + unsigned int map_from = 0, map_to = 0; | |
950 | 960 | unsigned int cluster_start, cluster_end; |
951 | - unsigned int zero_from = 0, zero_to = 0; | |
961 | + unsigned int user_data_from = 0, user_data_to = 0; | |
952 | 962 | |
953 | - ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, | |
963 | + ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos, | |
954 | 964 | &cluster_start, &cluster_end); |
955 | 965 | |
956 | - if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index | |
957 | - && !wc->w_finished_copy) { | |
966 | + if (page == wc->w_target_page) { | |
967 | + map_from = user_pos & (PAGE_CACHE_SIZE - 1); | |
968 | + map_to = map_from + user_len; | |
958 | 969 | |
959 | - wc->w_this_page = page; | |
960 | - wc->w_this_page_new = new; | |
961 | - ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); | |
962 | - if (ret < 0) { | |
970 | + if (new) | |
971 | + ret = ocfs2_map_page_blocks(page, p_blkno, inode, | |
972 | + cluster_start, cluster_end, | |
973 | + new); | |
974 | + else | |
975 | + ret = ocfs2_map_page_blocks(page, p_blkno, inode, | |
976 | + map_from, map_to, new); | |
977 | + if (ret) { | |
963 | 978 | mlog_errno(ret); |
964 | 979 | goto out; |
965 | 980 | } |
966 | 981 | |
967 | - copied = ret; | |
968 | - | |
969 | - zero_from = from; | |
970 | - zero_to = to; | |
982 | + user_data_from = map_from; | |
983 | + user_data_to = map_to; | |
971 | 984 | if (new) { |
972 | - from = cluster_start; | |
973 | - to = cluster_end; | |
985 | + map_from = cluster_start; | |
986 | + map_to = cluster_end; | |
974 | 987 | } |
988 | + | |
989 | + wc->w_target_from = map_from; | |
990 | + wc->w_target_to = map_to; | |
975 | 991 | } else { |
976 | 992 | /* |
977 | 993 | * If we haven't allocated the new page yet, we |
978 | 994 | |
... | ... | @@ -980,11 +996,11 @@ |
980 | 996 | */ |
981 | 997 | BUG_ON(!new); |
982 | 998 | |
983 | - from = cluster_start; | |
984 | - to = cluster_end; | |
999 | + map_from = cluster_start; | |
1000 | + map_to = cluster_end; | |
985 | 1001 | |
986 | 1002 | ret = ocfs2_map_page_blocks(page, p_blkno, inode, |
987 | - cluster_start, cluster_end, 1); | |
1003 | + cluster_start, cluster_end, new); | |
988 | 1004 | if (ret) { |
989 | 1005 | mlog_errno(ret); |
990 | 1006 | goto out; |
991 | 1007 | |
992 | 1008 | |
993 | 1009 | |
994 | 1010 | |
995 | 1011 | |
996 | 1012 | |
997 | 1013 | |
998 | 1014 | |
999 | 1015 | |
1000 | 1016 | |
1001 | 1017 | |
1002 | 1018 | |
1003 | 1019 | |
1004 | 1020 | |
1005 | 1021 | |
1006 | 1022 | |
1007 | 1023 | |
1008 | 1024 | |
... | ... | @@ -1003,108 +1019,84 @@ |
1003 | 1019 | */ |
1004 | 1020 | if (new && !PageUptodate(page)) |
1005 | 1021 | ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), |
1006 | - wc->w_cpos, zero_from, zero_to); | |
1022 | + cpos, user_data_from, user_data_to); | |
1007 | 1023 | |
1008 | 1024 | flush_dcache_page(page); |
1009 | 1025 | |
1010 | - if (ocfs2_should_order_data(inode)) { | |
1011 | - ret = walk_page_buffers(handle, | |
1012 | - page_buffers(page), | |
1013 | - from, to, NULL, | |
1014 | - ocfs2_journal_dirty_data); | |
1015 | - if (ret < 0) | |
1016 | - mlog_errno(ret); | |
1017 | - } | |
1018 | - | |
1019 | - /* | |
1020 | - * We don't use generic_commit_write() because we need to | |
1021 | - * handle our own i_size update. | |
1022 | - */ | |
1023 | - ret = block_commit_write(page, from, to); | |
1024 | - if (ret) | |
1025 | - mlog_errno(ret); | |
1026 | 1026 | out: |
1027 | - | |
1028 | - return copied ? copied : ret; | |
1027 | + return ret; | |
1029 | 1028 | } |
1030 | 1029 | |
1031 | 1030 | /* |
1032 | - * Do the actual write of some data into an inode. Optionally allocate | |
1033 | - * in order to fulfill the write. | |
1034 | - * | |
1035 | - * cpos is the logical cluster offset within the file to write at | |
1036 | - * | |
1037 | - * 'phys' is the physical mapping of that offset. a 'phys' value of | |
1038 | - * zero indicates that allocation is required. In this case, data_ac | |
1039 | - * and meta_ac should be valid (meta_ac can be null if metadata | |
1040 | - * allocation isn't required). | |
1031 | + * This function will only grab one cluster's worth of pages. | |
1041 | 1032 | */ |
1042 | -static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, | |
1043 | - struct buffer_head *di_bh, | |
1044 | - struct ocfs2_alloc_context *data_ac, | |
1045 | - struct ocfs2_alloc_context *meta_ac, | |
1046 | - struct ocfs2_write_ctxt *wc) | |
1033 | +static int ocfs2_grab_pages_for_write(struct address_space *mapping, | |
1034 | + struct ocfs2_write_ctxt *wc, | |
1035 | + u32 cpos, loff_t user_pos, int new) | |
1047 | 1036 | { |
1048 | - int ret, i, numpages = 1, new; | |
1049 | - unsigned int copied = 0; | |
1050 | - u32 tmp_pos; | |
1051 | - u64 v_blkno, p_blkno; | |
1052 | - struct address_space *mapping = file->f_mapping; | |
1037 | + int ret = 0, i; | |
1038 | + unsigned long start, target_index, index; | |
1053 | 1039 | struct inode *inode = mapping->host; |
1054 | - unsigned long index, start; | |
1055 | - struct page **cpages; | |
1056 | 1040 | |
1057 | - new = phys == 0 ? 1 : 0; | |
1041 | + target_index = user_pos >> PAGE_CACHE_SHIFT; | |
1058 | 1042 | |
1059 | 1043 | /* |
1060 | 1044 | * Figure out how many pages we'll be manipulating here. For |
1061 | 1045 | * non allocating write, we just change the one |
1062 | 1047 | * page. Otherwise, we'll need a whole cluster's worth. |
1063 | 1047 | */ |
1064 | - if (new) | |
1065 | - numpages = ocfs2_pages_per_cluster(inode->i_sb); | |
1066 | - | |
1067 | - cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS); | |
1068 | - if (!cpages) { | |
1069 | - ret = -ENOMEM; | |
1070 | - mlog_errno(ret); | |
1071 | - return ret; | |
1072 | - } | |
1073 | - | |
1074 | - /* | |
1075 | - * Fill our page array first. That way we've grabbed enough so | |
1076 | - * that we can zero and flush if we error after adding the | |
1077 | - * extent. | |
1078 | - */ | |
1079 | 1048 | if (new) { |
1080 | - start = ocfs2_align_clusters_to_page_index(inode->i_sb, | |
1081 | - wc->w_cpos); | |
1082 | - v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos); | |
1049 | + wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); | |
1050 | + start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); | |
1083 | 1051 | } else { |
1084 | - start = wc->w_pos >> PAGE_CACHE_SHIFT; | |
1085 | - v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; | |
1052 | + wc->w_num_pages = 1; | |
1053 | + start = target_index; | |
1086 | 1054 | } |
1087 | 1055 | |
1088 | - for(i = 0; i < numpages; i++) { | |
1056 | + for(i = 0; i < wc->w_num_pages; i++) { | |
1089 | 1057 | index = start + i; |
1090 | 1058 | |
1091 | - cpages[i] = find_or_create_page(mapping, index, GFP_NOFS); | |
1092 | - if (!cpages[i]) { | |
1059 | + wc->w_pages[i] = find_or_create_page(mapping, index, GFP_NOFS); | |
1060 | + if (!wc->w_pages[i]) { | |
1093 | 1061 | ret = -ENOMEM; |
1094 | 1062 | mlog_errno(ret); |
1095 | 1063 | goto out; |
1096 | 1064 | } |
1065 | + | |
1066 | + if (index == target_index) | |
1067 | + wc->w_target_page = wc->w_pages[i]; | |
1097 | 1068 | } |
1069 | +out: | |
1070 | + return ret; | |
1071 | +} | |
1098 | 1072 | |
1073 | +/* | |
1074 | + * Prepare a single cluster for write into the file. | |
1075 | + */ | |
1076 | +static int ocfs2_write_cluster(struct address_space *mapping, | |
1077 | + u32 phys, struct ocfs2_alloc_context *data_ac, | |
1078 | + struct ocfs2_alloc_context *meta_ac, | |
1079 | + struct ocfs2_write_ctxt *wc, u32 cpos, | |
1080 | + loff_t user_pos, unsigned user_len) | |
1081 | +{ | |
1082 | + int ret, i, new; | |
1083 | + u64 v_blkno, p_blkno; | |
1084 | + struct inode *inode = mapping->host; | |
1085 | + | |
1086 | + new = phys == 0 ? 1 : 0; | |
1087 | + | |
1099 | 1088 | if (new) { |
1089 | + u32 tmp_pos; | |
1090 | + | |
1100 | 1091 | /* |
1101 | 1092 | * This is safe to call with the page locks - it won't take |
1102 | 1093 | * any additional semaphores or cluster locks. |
1103 | 1094 | */ |
1104 | - tmp_pos = wc->w_cpos; | |
1095 | + tmp_pos = cpos; | |
1105 | 1096 | ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, |
1106 | - &tmp_pos, 1, di_bh, handle, | |
1107 | - data_ac, meta_ac, NULL); | |
1097 | + &tmp_pos, 1, wc->w_di_bh, | |
1098 | + wc->w_handle, data_ac, | |
1099 | + meta_ac, NULL); | |
1108 | 1100 | /* |
1109 | 1101 | * This shouldn't happen because we must have already |
1110 | 1102 | * calculated the correct meta data allocation required. The |
1111 | 1103 | |
1112 | 1104 | |
1113 | 1105 | |
1114 | 1106 | |
1115 | 1107 | |
1116 | 1108 | |
1117 | 1109 | |
1118 | 1110 | |
1119 | 1111 | |
1120 | 1112 | |
1121 | 1113 | |
1122 | 1114 | |
1123 | 1115 | |
1124 | 1116 | |
1125 | 1117 | |
1126 | 1118 | |
1127 | 1119 | |
1128 | 1120 | |
... | ... | @@ -1121,103 +1113,132 @@ |
1121 | 1113 | mlog_errno(ret); |
1122 | 1114 | goto out; |
1123 | 1115 | } |
1116 | + | |
1117 | + v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos); | |
1118 | + } else { | |
1119 | + v_blkno = user_pos >> inode->i_sb->s_blocksize_bits; | |
1124 | 1120 | } |
1125 | 1121 | |
1122 | + /* | |
1123 | + * The only reason this should fail is due to an inability to | |
1124 | + * find the extent added. | |
1125 | + */ | |
1126 | 1126 | ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, |
1127 | 1127 | NULL); |
1128 | 1128 | if (ret < 0) { |
1129 | - | |
1130 | - /* | |
1131 | - * XXX: Should we go readonly here? | |
1132 | - */ | |
1133 | - | |
1134 | - mlog_errno(ret); | |
1129 | + ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, " | |
1130 | + "at logical block %llu", | |
1131 | + (unsigned long long)OCFS2_I(inode)->ip_blkno, | |
1132 | + (unsigned long long)v_blkno); | |
1135 | 1133 | goto out; |
1136 | 1134 | } |
1137 | 1135 | |
1138 | 1136 | BUG_ON(p_blkno == 0); |
1139 | 1137 | |
1140 | - for(i = 0; i < numpages; i++) { | |
1141 | - ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], | |
1142 | - wc, new); | |
1143 | - if (ret < 0) { | |
1144 | - mlog_errno(ret); | |
1145 | - goto out; | |
1146 | - } | |
1138 | + for(i = 0; i < wc->w_num_pages; i++) { | |
1139 | + int tmpret; | |
1147 | 1140 | |
1148 | - copied += ret; | |
1141 | + tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, | |
1142 | + wc->w_pages[i], cpos, | |
1143 | + user_pos, user_len, new); | |
1144 | + if (tmpret) { | |
1145 | + mlog_errno(tmpret); | |
1146 | + if (ret == 0) | |
1147 | + tmpret = ret; | |
1148 | + } | |
1149 | 1149 | } |
1150 | 1150 | |
1151 | + /* | |
1152 | + * We only have cleanup to do in case of allocating write. | |
1153 | + */ | |
1154 | + if (ret && new) | |
1155 | + ocfs2_write_failure(inode, wc, user_pos, user_len); | |
1156 | + | |
1151 | 1157 | out: |
1152 | - for(i = 0; i < numpages; i++) { | |
1153 | - unlock_page(cpages[i]); | |
1154 | - mark_page_accessed(cpages[i]); | |
1155 | - page_cache_release(cpages[i]); | |
1156 | - } | |
1157 | - kfree(cpages); | |
1158 | 1158 | |
1159 | - return copied ? copied : ret; | |
1159 | + return ret; | |
1160 | 1160 | } |
1161 | 1161 | |
1162 | -static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, | |
1163 | - struct ocfs2_super *osb, loff_t pos, | |
1164 | - size_t count, ocfs2_page_writer *cb, | |
1165 | - void *cb_priv) | |
1162 | +/* | |
1163 | + * ocfs2_write_end() wants to know which parts of the target page it | |
1164 | + * should complete the write on. It's easiest to compute them ahead of | |
1165 | + * time when a more complete view of the write is available. | |
1166 | + */ | |
1167 | +static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, | |
1168 | + struct ocfs2_write_ctxt *wc, | |
1169 | + loff_t pos, unsigned len, int alloc) | |
1166 | 1170 | { |
1167 | - wc->w_count = count; | |
1168 | - wc->w_pos = pos; | |
1169 | - wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits; | |
1170 | - wc->w_finished_copy = 0; | |
1171 | + struct ocfs2_write_cluster_desc *desc; | |
1171 | 1172 | |
1172 | - if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) | |
1173 | - wc->w_large_pages = 1; | |
1174 | - else | |
1175 | - wc->w_large_pages = 0; | |
1173 | + wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1); | |
1174 | + wc->w_target_to = wc->w_target_from + len; | |
1176 | 1175 | |
1177 | - wc->w_write_data_page = cb; | |
1178 | - wc->w_private = cb_priv; | |
1176 | + if (alloc == 0) | |
1177 | + return; | |
1178 | + | |
1179 | + /* | |
1180 | + * Allocating write - we may have different boundaries based | |
1181 | + * on page size and cluster size. | |
1182 | + * | |
1183 | + * NOTE: We can no longer compute one value from the other as | |
1184 | + * the actual write length and user provided length may be | |
1185 | + * different. | |
1186 | + */ | |
1187 | + | |
1188 | + if (wc->w_large_pages) { | |
1189 | + /* | |
1190 | + * We only care about the 1st and last cluster within | |
1191 | + * our range and whether they are holes or not. Either | |
1192 | + * value may be extended out to the start/end of a | |
1193 | + * newly allocated cluster. | |
1194 | + */ | |
1195 | + desc = &wc->w_desc[0]; | |
1196 | + if (desc->c_new) | |
1197 | + ocfs2_figure_cluster_boundaries(osb, | |
1198 | + desc->c_cpos, | |
1199 | + &wc->w_target_from, | |
1200 | + NULL); | |
1201 | + | |
1202 | + desc = &wc->w_desc[wc->w_clen - 1]; | |
1203 | + if (desc->c_new) | |
1204 | + ocfs2_figure_cluster_boundaries(osb, | |
1205 | + desc->c_cpos, | |
1206 | + NULL, | |
1207 | + &wc->w_target_to); | |
1208 | + } else { | |
1209 | + wc->w_target_from = 0; | |
1210 | + wc->w_target_to = PAGE_CACHE_SIZE; | |
1211 | + } | |
1179 | 1212 | } |
1180 | 1213 | |
1181 | -/* | |
1182 | - * Write a cluster to an inode. The cluster may not be allocated yet, | |
1183 | - * in which case it will be. This only exists for buffered writes - | |
1184 | - * O_DIRECT takes a more "traditional" path through the kernel. | |
1185 | - * | |
1186 | - * The caller is responsible for incrementing pos, written counts, etc | |
1187 | - * | |
1188 | - * For file systems that don't support sparse files, pre-allocation | |
1189 | - * and page zeroing up until cpos should be done prior to this | |
1190 | - * function call. | |
1191 | - * | |
1192 | - * Callers should be holding i_sem, and the rw cluster lock. | |
1193 | - * | |
1194 | - * Returns the number of user bytes written, or less than zero for | |
1195 | - * error. | |
1196 | - */ | |
1197 | -ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, | |
1198 | - size_t count, ocfs2_page_writer *actor, | |
1199 | - void *priv) | |
1214 | +int ocfs2_write_begin(struct file *file, struct address_space *mapping, | |
1215 | + loff_t pos, unsigned len, unsigned flags, | |
1216 | + struct page **pagep, void **fsdata) | |
1200 | 1217 | { |
1201 | - int ret, credits = OCFS2_INODE_UPDATE_CREDITS; | |
1202 | - ssize_t written = 0; | |
1203 | - u32 phys; | |
1204 | - struct inode *inode = file->f_mapping->host; | |
1218 | + int ret, i, credits = OCFS2_INODE_UPDATE_CREDITS; | |
1219 | + unsigned int num_clusters = 0, clusters_to_alloc = 0; | |
1220 | + u32 phys = 0; | |
1221 | + struct ocfs2_write_ctxt *wc; | |
1222 | + struct inode *inode = mapping->host; | |
1205 | 1223 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1206 | - struct buffer_head *di_bh = NULL; | |
1207 | 1224 | struct ocfs2_dinode *di; |
1208 | 1225 | struct ocfs2_alloc_context *data_ac = NULL; |
1209 | 1226 | struct ocfs2_alloc_context *meta_ac = NULL; |
1210 | 1227 | handle_t *handle; |
1211 | - struct ocfs2_write_ctxt wc; | |
1228 | + struct ocfs2_write_cluster_desc *desc; | |
1212 | 1229 | |
1213 | - ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv); | |
1230 | + ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len); | |
1231 | + if (ret) { | |
1232 | + mlog_errno(ret); | |
1233 | + return ret; | |
1234 | + } | |
1214 | 1235 | |
1215 | - ret = ocfs2_meta_lock(inode, &di_bh, 1); | |
1236 | + ret = ocfs2_meta_lock(inode, &wc->w_di_bh, 1); | |
1216 | 1237 | if (ret) { |
1217 | 1238 | mlog_errno(ret); |
1218 | 1239 | goto out; |
1219 | 1240 | } |
1220 | - di = (struct ocfs2_dinode *)di_bh->b_data; | |
1241 | + di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; | |
1221 | 1242 | |
1222 | 1243 | /* |
1223 | 1244 | * Take alloc sem here to prevent concurrent lookups. That way |
1224 | 1245 | |
1225 | 1246 | |
1226 | 1247 | |
... | ... | @@ -1228,23 +1249,60 @@ |
1228 | 1249 | */ |
1229 | 1250 | down_write(&OCFS2_I(inode)->ip_alloc_sem); |
1230 | 1251 | |
1231 | - ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL); | |
1232 | - if (ret) { | |
1233 | - mlog_errno(ret); | |
1234 | - goto out_meta; | |
1252 | + for (i = 0; i < wc->w_clen; i++) { | |
1253 | + desc = &wc->w_desc[i]; | |
1254 | + desc->c_cpos = wc->w_cpos + i; | |
1255 | + | |
1256 | + if (num_clusters == 0) { | |
1257 | + ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys, | |
1258 | + &num_clusters, NULL); | |
1259 | + if (ret) { | |
1260 | + mlog_errno(ret); | |
1261 | + goto out_meta; | |
1262 | + } | |
1263 | + } else if (phys) { | |
1264 | + /* | |
1265 | + * Only increment phys if it doesn't describe | |
1266 | + * a hole. | |
1267 | + */ | |
1268 | + phys++; | |
1269 | + } | |
1270 | + | |
1271 | + desc->c_phys = phys; | |
1272 | + if (phys == 0) { | |
1273 | + desc->c_new = 1; | |
1274 | + clusters_to_alloc++; | |
1275 | + } | |
1276 | + | |
1277 | + num_clusters--; | |
1235 | 1278 | } |
1236 | 1279 | |
1237 | - /* phys == 0 means that allocation is required. */ | |
1238 | - if (phys == 0) { | |
1239 | - ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); | |
1280 | + /* | |
1281 | + * We set w_target_from, w_target_to here so that | |
1282 | + * ocfs2_write_end() knows which range in the target page to | |
1283 | + * write out. An allocation requires that we write the entire | |
1284 | + * cluster range. | |
1285 | + */ | |
1286 | + if (clusters_to_alloc > 0) { | |
1287 | + /* | |
1288 | + * XXX: We are stretching the limits of | |
1289 | + * ocfs2_lock_allocators(). It greatly over-estimates | |
1290 | + * the work to be done. | |
1291 | + */ | |
1292 | + ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc, | |
1293 | + &data_ac, &meta_ac); | |
1240 | 1294 | if (ret) { |
1241 | 1295 | mlog_errno(ret); |
1242 | 1296 | goto out_meta; |
1243 | 1297 | } |
1244 | 1298 | |
1245 | - credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); | |
1299 | + credits = ocfs2_calc_extend_credits(inode->i_sb, di, | |
1300 | + clusters_to_alloc); | |
1301 | + | |
1246 | 1302 | } |
1247 | 1303 | |
1304 | + ocfs2_set_target_boundaries(osb, wc, pos, len, clusters_to_alloc); | |
1305 | + | |
1248 | 1306 | ret = ocfs2_data_lock(inode, 1); |
1249 | 1307 | if (ret) { |
1250 | 1308 | mlog_errno(ret); |
1251 | 1309 | |
1252 | 1310 | |
1253 | 1311 | |
1254 | 1312 | |
1255 | 1313 | |
... | ... | @@ -1258,36 +1316,50 @@ |
1258 | 1316 | goto out_data; |
1259 | 1317 | } |
1260 | 1318 | |
1261 | - written = ocfs2_write(file, phys, handle, di_bh, data_ac, | |
1262 | - meta_ac, &wc); | |
1263 | - if (written < 0) { | |
1264 | - ret = written; | |
1319 | + wc->w_handle = handle; | |
1320 | + | |
1321 | + /* | |
1322 | + * We don't want this to fail in ocfs2_write_end(), so do it | |
1323 | + * here. | |
1324 | + */ | |
1325 | + ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, | |
1326 | + OCFS2_JOURNAL_ACCESS_WRITE); | |
1327 | + if (ret) { | |
1265 | 1328 | mlog_errno(ret); |
1266 | 1329 | goto out_commit; |
1267 | 1330 | } |
1268 | 1331 | |
1269 | - ret = ocfs2_journal_access(handle, inode, di_bh, | |
1270 | - OCFS2_JOURNAL_ACCESS_WRITE); | |
1332 | + /* | |
1333 | + * Fill our page array first. That way we've grabbed enough so | |
1334 | + * that we can zero and flush if we error after adding the | |
1335 | + * extent. | |
1336 | + */ | |
1337 | + ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, | |
1338 | + clusters_to_alloc); | |
1271 | 1339 | if (ret) { |
1272 | 1340 | mlog_errno(ret); |
1273 | 1341 | goto out_commit; |
1274 | 1342 | } |
1275 | 1343 | |
1276 | - pos += written; | |
1277 | - if (pos > inode->i_size) { | |
1278 | - i_size_write(inode, pos); | |
1279 | - mark_inode_dirty(inode); | |
1344 | + for (i = 0; i < wc->w_clen; i++) { | |
1345 | + desc = &wc->w_desc[i]; | |
1346 | + | |
1347 | + ret = ocfs2_write_cluster(mapping, desc->c_phys, data_ac, | |
1348 | + meta_ac, wc, desc->c_cpos, pos, len); | |
1349 | + if (ret) { | |
1350 | + mlog_errno(ret); | |
1351 | + goto out_commit; | |
1352 | + } | |
1280 | 1353 | } |
1281 | - inode->i_blocks = ocfs2_inode_sector_count(inode); | |
1282 | - di->i_size = cpu_to_le64((u64)i_size_read(inode)); | |
1283 | - inode->i_mtime = inode->i_ctime = CURRENT_TIME; | |
1284 | - di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); | |
1285 | - di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | |
1286 | 1354 | |
1287 | - ret = ocfs2_journal_dirty(handle, di_bh); | |
1288 | - if (ret) | |
1289 | - mlog_errno(ret); | |
1355 | + if (data_ac) | |
1356 | + ocfs2_free_alloc_context(data_ac); | |
1357 | + if (meta_ac) | |
1358 | + ocfs2_free_alloc_context(meta_ac); | |
1290 | 1359 | |
1360 | + *pagep = wc->w_target_page; | |
1361 | + *fsdata = wc; | |
1362 | + return 0; | |
1291 | 1363 | out_commit: |
1292 | 1364 | ocfs2_commit_trans(osb, handle); |
1293 | 1365 | |
1294 | 1366 | |
1295 | 1367 | |
... | ... | @@ -1299,13 +1371,85 @@ |
1299 | 1371 | ocfs2_meta_unlock(inode, 1); |
1300 | 1372 | |
1301 | 1373 | out: |
1302 | - brelse(di_bh); | |
1374 | + ocfs2_free_write_ctxt(wc); | |
1375 | + | |
1303 | 1376 | if (data_ac) |
1304 | 1377 | ocfs2_free_alloc_context(data_ac); |
1305 | 1378 | if (meta_ac) |
1306 | 1379 | ocfs2_free_alloc_context(meta_ac); |
1380 | + return ret; | |
1381 | +} | |
1307 | 1382 | |
1308 | - return written ? written : ret; | |
1383 | +int ocfs2_write_end(struct file *file, struct address_space *mapping, | |
1384 | + loff_t pos, unsigned len, unsigned copied, | |
1385 | + struct page *page, void *fsdata) | |
1386 | +{ | |
1387 | + int i; | |
1388 | + unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); | |
1389 | + struct inode *inode = mapping->host; | |
1390 | + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | |
1391 | + struct ocfs2_write_ctxt *wc = fsdata; | |
1392 | + struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; | |
1393 | + handle_t *handle = wc->w_handle; | |
1394 | + struct page *tmppage; | |
1395 | + | |
1396 | + if (unlikely(copied < len)) { | |
1397 | + if (!PageUptodate(wc->w_target_page)) | |
1398 | + copied = 0; | |
1399 | + | |
1400 | + ocfs2_zero_new_buffers(wc->w_target_page, start+copied, | |
1401 | + start+len); | |
1402 | + } | |
1403 | + flush_dcache_page(wc->w_target_page); | |
1404 | + | |
1405 | + for(i = 0; i < wc->w_num_pages; i++) { | |
1406 | + tmppage = wc->w_pages[i]; | |
1407 | + | |
1408 | + if (tmppage == wc->w_target_page) { | |
1409 | + from = wc->w_target_from; | |
1410 | + to = wc->w_target_to; | |
1411 | + | |
1412 | + BUG_ON(from > PAGE_CACHE_SIZE || | |
1413 | + to > PAGE_CACHE_SIZE || | |
1414 | + to < from); | |
1415 | + } else { | |
1416 | + /* | |
1417 | + * Pages adjacent to the target (if any) imply | |
1418 | + * a hole-filling write in which case we want | |
1419 | + * to flush their entire range. | |
1420 | + */ | |
1421 | + from = 0; | |
1422 | + to = PAGE_CACHE_SIZE; | |
1423 | + } | |
1424 | + | |
1425 | + if (ocfs2_should_order_data(inode)) | |
1426 | + walk_page_buffers(wc->w_handle, page_buffers(tmppage), | |
1427 | + from, to, NULL, | |
1428 | + ocfs2_journal_dirty_data); | |
1429 | + | |
1430 | + block_commit_write(tmppage, from, to); | |
1431 | + } | |
1432 | + | |
1433 | + pos += copied; | |
1434 | + if (pos > inode->i_size) { | |
1435 | + i_size_write(inode, pos); | |
1436 | + mark_inode_dirty(inode); | |
1437 | + } | |
1438 | + inode->i_blocks = ocfs2_inode_sector_count(inode); | |
1439 | + di->i_size = cpu_to_le64((u64)i_size_read(inode)); | |
1440 | + inode->i_mtime = inode->i_ctime = CURRENT_TIME; | |
1441 | + di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); | |
1442 | + di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | |
1443 | + | |
1444 | + ocfs2_journal_dirty(handle, wc->w_di_bh); | |
1445 | + | |
1446 | + ocfs2_commit_trans(osb, handle); | |
1447 | + ocfs2_data_unlock(inode, 1); | |
1448 | + up_write(&OCFS2_I(inode)->ip_alloc_sem); | |
1449 | + ocfs2_meta_unlock(inode, 1); | |
1450 | + ocfs2_free_write_ctxt(wc); | |
1451 | + | |
1452 | + return copied; | |
1309 | 1453 | } |
1310 | 1454 | |
1311 | 1455 | const struct address_space_operations ocfs2_aops = { |
fs/ocfs2/aops.h
... | ... | @@ -42,57 +42,13 @@ |
42 | 42 | int (*fn)( handle_t *handle, |
43 | 43 | struct buffer_head *bh)); |
44 | 44 | |
45 | -struct ocfs2_write_ctxt; | |
46 | -typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *, | |
47 | - u64 *, unsigned int *, unsigned int *); | |
45 | +int ocfs2_write_begin(struct file *file, struct address_space *mapping, | |
46 | + loff_t pos, unsigned len, unsigned flags, | |
47 | + struct page **pagep, void **fsdata); | |
48 | 48 | |
49 | -ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, | |
50 | - size_t count, ocfs2_page_writer *actor, | |
51 | - void *priv); | |
52 | - | |
53 | -struct ocfs2_write_ctxt { | |
54 | - size_t w_count; | |
55 | - loff_t w_pos; | |
56 | - u32 w_cpos; | |
57 | - unsigned int w_finished_copy; | |
58 | - | |
59 | - /* This is true if page_size > cluster_size */ | |
60 | - unsigned int w_large_pages; | |
61 | - | |
62 | - /* Filler callback and private data */ | |
63 | - ocfs2_page_writer *w_write_data_page; | |
64 | - void *w_private; | |
65 | - | |
66 | - /* Only valid for the filler callback */ | |
67 | - struct page *w_this_page; | |
68 | - unsigned int w_this_page_new; | |
69 | -}; | |
70 | - | |
71 | -struct ocfs2_buffered_write_priv { | |
72 | - char *b_src_buf; | |
73 | - const struct iovec *b_cur_iov; /* Current iovec */ | |
74 | - size_t b_cur_off; /* Offset in the | |
75 | - * current iovec */ | |
76 | -}; | |
77 | -int ocfs2_map_and_write_user_data(struct inode *inode, | |
78 | - struct ocfs2_write_ctxt *wc, | |
79 | - u64 *p_blkno, | |
80 | - unsigned int *ret_from, | |
81 | - unsigned int *ret_to); | |
82 | - | |
83 | -struct ocfs2_splice_write_priv { | |
84 | - struct splice_desc *s_sd; | |
85 | - struct pipe_buffer *s_buf; | |
86 | - struct pipe_inode_info *s_pipe; | |
87 | - /* Neither offset value is ever larger than one page */ | |
88 | - unsigned int s_offset; | |
89 | - unsigned int s_buf_offset; | |
90 | -}; | |
91 | -int ocfs2_map_and_write_splice_data(struct inode *inode, | |
92 | - struct ocfs2_write_ctxt *wc, | |
93 | - u64 *p_blkno, | |
94 | - unsigned int *ret_from, | |
95 | - unsigned int *ret_to); | |
49 | +int ocfs2_write_end(struct file *file, struct address_space *mapping, | |
50 | + loff_t pos, unsigned len, unsigned copied, | |
51 | + struct page *page, void *fsdata); | |
96 | 52 | |
97 | 53 | /* all ocfs2_dio_end_io()'s fault */ |
98 | 54 | #define ocfs2_iocb_is_rw_locked(iocb) \ |
fs/ocfs2/file.c
... | ... | @@ -1335,15 +1335,16 @@ |
1335 | 1335 | *basep = base; |
1336 | 1336 | } |
1337 | 1337 | |
1338 | -static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp, | |
1338 | +static struct page * ocfs2_get_write_source(char **ret_src_buf, | |
1339 | 1339 | const struct iovec *cur_iov, |
1340 | 1340 | size_t iov_offset) |
1341 | 1341 | { |
1342 | 1342 | int ret; |
1343 | - char *buf; | |
1343 | + char *buf = cur_iov->iov_base + iov_offset; | |
1344 | 1344 | struct page *src_page = NULL; |
1345 | + unsigned long off; | |
1345 | 1346 | |
1346 | - buf = cur_iov->iov_base + iov_offset; | |
1347 | + off = (unsigned long)(buf) & ~PAGE_CACHE_MASK; | |
1347 | 1348 | |
1348 | 1349 | if (!segment_eq(get_fs(), KERNEL_DS)) { |
1349 | 1350 | /* |
1350 | 1351 | |
1351 | 1352 | |
... | ... | @@ -1355,18 +1356,17 @@ |
1355 | 1356 | (unsigned long)buf & PAGE_CACHE_MASK, 1, |
1356 | 1357 | 0, 0, &src_page, NULL); |
1357 | 1358 | if (ret == 1) |
1358 | - bp->b_src_buf = kmap(src_page); | |
1359 | + *ret_src_buf = kmap(src_page) + off; | |
1359 | 1360 | else |
1360 | 1361 | src_page = ERR_PTR(-EFAULT); |
1361 | 1362 | } else { |
1362 | - bp->b_src_buf = buf; | |
1363 | + *ret_src_buf = buf; | |
1363 | 1364 | } |
1364 | 1365 | |
1365 | 1366 | return src_page; |
1366 | 1367 | } |
1367 | 1368 | |
1368 | -static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp, | |
1369 | - struct page *page) | |
1369 | +static void ocfs2_put_write_source(struct page *page) | |
1370 | 1370 | { |
1371 | 1371 | if (page) { |
1372 | 1372 | kunmap(page); |
1373 | 1373 | |
... | ... | @@ -1382,10 +1382,12 @@ |
1382 | 1382 | { |
1383 | 1383 | int ret = 0; |
1384 | 1384 | ssize_t copied, total = 0; |
1385 | - size_t iov_offset = 0; | |
1385 | + size_t iov_offset = 0, bytes; | |
1386 | + loff_t pos; | |
1386 | 1387 | const struct iovec *cur_iov = iov; |
1387 | - struct ocfs2_buffered_write_priv bp; | |
1388 | - struct page *page; | |
1388 | + struct page *user_page, *page; | |
1389 | + char *buf, *dst; | |
1390 | + void *fsdata; | |
1389 | 1391 | |
1390 | 1392 | /* |
1391 | 1393 | * handle partial DIO write. Adjust cur_iov if needed. |
1392 | 1394 | |
1393 | 1395 | |
1394 | 1396 | |
1395 | 1397 | |
... | ... | @@ -1393,21 +1395,38 @@ |
1393 | 1395 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); |
1394 | 1396 | |
1395 | 1397 | do { |
1396 | - bp.b_cur_off = iov_offset; | |
1397 | - bp.b_cur_iov = cur_iov; | |
1398 | + pos = *ppos; | |
1398 | 1399 | |
1399 | - page = ocfs2_get_write_source(&bp, cur_iov, iov_offset); | |
1400 | - if (IS_ERR(page)) { | |
1401 | - ret = PTR_ERR(page); | |
1400 | + user_page = ocfs2_get_write_source(&buf, cur_iov, iov_offset); | |
1401 | + if (IS_ERR(user_page)) { | |
1402 | + ret = PTR_ERR(user_page); | |
1402 | 1403 | goto out; |
1403 | 1404 | } |
1404 | 1405 | |
1405 | - copied = ocfs2_buffered_write_cluster(file, *ppos, count, | |
1406 | - ocfs2_map_and_write_user_data, | |
1407 | - &bp); | |
1406 | + /* Stay within our page boundaries */ | |
1407 | + bytes = min((PAGE_CACHE_SIZE - ((unsigned long)pos & ~PAGE_CACHE_MASK)), | |
1408 | + (PAGE_CACHE_SIZE - ((unsigned long)buf & ~PAGE_CACHE_MASK))); | |
1409 | + /* Stay within the vector boundary */ | |
1410 | + bytes = min_t(size_t, bytes, cur_iov->iov_len - iov_offset); | |
1411 | + /* Stay within count */ | |
1412 | + bytes = min(bytes, count); | |
1408 | 1413 | |
1409 | - ocfs2_put_write_source(&bp, page); | |
1414 | + page = NULL; | |
1415 | + ret = ocfs2_write_begin(file, file->f_mapping, pos, bytes, 0, | |
1416 | + &page, &fsdata); | |
1417 | + if (ret) { | |
1418 | + mlog_errno(ret); | |
1419 | + goto out; | |
1420 | + } | |
1410 | 1421 | |
1422 | + dst = kmap_atomic(page, KM_USER0); | |
1423 | + memcpy(dst + (pos & (PAGE_CACHE_SIZE - 1)), buf, bytes); | |
1424 | + kunmap_atomic(dst, KM_USER0); | |
1425 | + flush_dcache_page(page); | |
1426 | + ocfs2_put_write_source(user_page); | |
1427 | + | |
1428 | + copied = ocfs2_write_end(file, file->f_mapping, pos, bytes, | |
1429 | + bytes, page, fsdata); | |
1411 | 1430 | if (copied < 0) { |
1412 | 1431 | mlog_errno(copied); |
1413 | 1432 | ret = copied; |
... | ... | @@ -1415,7 +1434,7 @@ |
1415 | 1434 | } |
1416 | 1435 | |
1417 | 1436 | total += copied; |
1418 | - *ppos = *ppos + copied; | |
1437 | + *ppos = pos + copied; | |
1419 | 1438 | count -= copied; |
1420 | 1439 | |
1421 | 1440 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); |
1422 | 1441 | |
1423 | 1442 | |
1424 | 1443 | |
1425 | 1444 | |
1426 | 1445 | |
1427 | 1446 | |
1428 | 1447 | |
... | ... | @@ -1585,52 +1604,46 @@ |
1585 | 1604 | struct pipe_buffer *buf, |
1586 | 1605 | struct splice_desc *sd) |
1587 | 1606 | { |
1588 | - int ret, count, total = 0; | |
1607 | + int ret, count; | |
1589 | 1608 | ssize_t copied = 0; |
1590 | - struct ocfs2_splice_write_priv sp; | |
1609 | + struct file *file = sd->u.file; | |
1610 | + unsigned int offset; | |
1611 | + struct page *page = NULL; | |
1612 | + void *fsdata; | |
1613 | + char *src, *dst; | |
1591 | 1614 | |
1592 | 1615 | ret = buf->ops->confirm(pipe, buf); |
1593 | 1616 | if (ret) |
1594 | 1617 | goto out; |
1595 | 1618 | |
1596 | - sp.s_sd = sd; | |
1597 | - sp.s_buf = buf; | |
1598 | - sp.s_pipe = pipe; | |
1599 | - sp.s_offset = sd->pos & ~PAGE_CACHE_MASK; | |
1600 | - sp.s_buf_offset = buf->offset; | |
1601 | - | |
1619 | + offset = sd->pos & ~PAGE_CACHE_MASK; | |
1602 | 1620 | count = sd->len; |
1603 | - if (count + sp.s_offset > PAGE_CACHE_SIZE) | |
1604 | - count = PAGE_CACHE_SIZE - sp.s_offset; | |
1621 | + if (count + offset > PAGE_CACHE_SIZE) | |
1622 | + count = PAGE_CACHE_SIZE - offset; | |
1605 | 1623 | |
1606 | - do { | |
1607 | - /* | |
1608 | - * splice wants us to copy up to one page at a | |
1609 | - * time. For pagesize > cluster size, this means we | |
1610 | - * might enter ocfs2_buffered_write_cluster() more | |
1611 | - * than once, so keep track of our progress here. | |
1612 | - */ | |
1613 | - copied = ocfs2_buffered_write_cluster(sd->u.file, | |
1614 | - (loff_t)sd->pos + total, | |
1615 | - count, | |
1616 | - ocfs2_map_and_write_splice_data, | |
1617 | - &sp); | |
1618 | - if (copied < 0) { | |
1619 | - mlog_errno(copied); | |
1620 | - ret = copied; | |
1621 | - goto out; | |
1622 | - } | |
1624 | + ret = ocfs2_write_begin(file, file->f_mapping, sd->pos, count, 0, | |
1625 | + &page, &fsdata); | |
1626 | + if (ret) { | |
1627 | + mlog_errno(ret); | |
1628 | + goto out; | |
1629 | + } | |
1623 | 1630 | |
1624 | - count -= copied; | |
1625 | - sp.s_offset += copied; | |
1626 | - sp.s_buf_offset += copied; | |
1627 | - total += copied; | |
1628 | - } while (count); | |
1631 | + src = buf->ops->map(pipe, buf, 1); | |
1632 | + dst = kmap_atomic(page, KM_USER1); | |
1633 | + memcpy(dst + offset, src + buf->offset, count); | |
1634 | + kunmap_atomic(page, KM_USER1); | |
1635 | + buf->ops->unmap(pipe, buf, src); | |
1629 | 1636 | |
1630 | - ret = 0; | |
1637 | + copied = ocfs2_write_end(file, file->f_mapping, sd->pos, count, count, | |
1638 | + page, fsdata); | |
1639 | + if (copied < 0) { | |
1640 | + mlog_errno(copied); | |
1641 | + ret = copied; | |
1642 | + goto out; | |
1643 | + } | |
1631 | 1644 | out: |
1632 | 1645 | |
1633 | - return total ? total : ret; | |
1646 | + return copied ? copied : ret; | |
1634 | 1647 | } |
1635 | 1648 | |
1636 | 1649 | static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, |