Commit 3a307ffc2730bfa1a4dfa94537be9d412338aad2

Authored by Mark Fasheh
1 parent 2e89b2e48e

ocfs2: rework ocfs2_buffered_write_cluster()

Use some ideas from the new-aops patch series and turn
ocfs2_buffered_write_cluster() into a 2 stage operation with the caller
copying data in between. The code now understands multiple cluster writes as
a result of having to deal with a full page write for greater than 4k pages.

This sets us up to easily call into the write path during ->page_mkwrite().

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>

Showing 3 changed files with 551 additions and 438 deletions Side-by-side Diff

Changes suppressed. Click to show
... ... @@ -684,6 +684,8 @@
684 684 bh = bh->b_this_page, block_start += bsize) {
685 685 block_end = block_start + bsize;
686 686  
  687 + clear_buffer_new(bh);
  688 +
687 689 /*
688 690 * Ignore blocks outside of our i/o range -
689 691 * they may belong to unallocated clusters.
690 692  
... ... @@ -698,10 +700,9 @@
698 700 * For an allocating write with cluster size >= page
699 701 * size, we always write the entire page.
700 702 */
  703 + if (new)
  704 + set_buffer_new(bh);
701 705  
702   - if (buffer_new(bh))
703   - clear_buffer_new(bh);
704   -
705 706 if (!buffer_mapped(bh)) {
706 707 map_bh(bh, inode->i_sb, *p_blkno);
707 708 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
708 709  
709 710  
710 711  
711 712  
712 713  
713 714  
714 715  
715 716  
716 717  
717 718  
718 719  
719 720  
720 721  
721 722  
722 723  
723 724  
724 725  
725 726  
726 727  
727 728  
728 729  
729 730  
730 731  
731 732  
732 733  
733 734  
734 735  
735 736  
736 737  
737 738  
738 739  
739 740  
740 741  
741 742  
... ... @@ -761,217 +762,232 @@
761 762 return ret;
762 763 }
763 764  
  765 +#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
  766 +#define OCFS2_MAX_CTXT_PAGES 1
  767 +#else
  768 +#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
  769 +#endif
  770 +
  771 +#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
  772 +
764 773 /*
765   - * This will copy user data from the buffer page in the splice
766   - * context.
767   - *
768   - * For now, we ignore SPLICE_F_MOVE as that would require some extra
769   - * communication out all the way to ocfs2_write().
  774 + * Describe the state of a single cluster to be written to.
770 775 */
771   -int ocfs2_map_and_write_splice_data(struct inode *inode,
772   - struct ocfs2_write_ctxt *wc, u64 *p_blkno,
773   - unsigned int *ret_from, unsigned int *ret_to)
774   -{
775   - int ret;
776   - unsigned int to, from, cluster_start, cluster_end;
777   - char *src, *dst;
778   - struct ocfs2_splice_write_priv *sp = wc->w_private;
779   - struct pipe_buffer *buf = sp->s_buf;
780   - unsigned long bytes, src_from;
781   - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
  776 +struct ocfs2_write_cluster_desc {
  777 + u32 c_cpos;
  778 + u32 c_phys;
  779 + /*
  780 + * Give this a unique field because c_phys eventually gets
  781 + * filled.
  782 + */
  783 + unsigned c_new;
  784 +};
782 785  
783   - ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
784   - &cluster_end);
  786 +struct ocfs2_write_ctxt {
  787 + /* Logical cluster position / len of write */
  788 + u32 w_cpos;
  789 + u32 w_clen;
785 790  
786   - from = sp->s_offset;
787   - src_from = sp->s_buf_offset;
788   - bytes = wc->w_count;
  791 + struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
789 792  
790   - if (wc->w_large_pages) {
791   - /*
792   - * For cluster size < page size, we have to
793   - * calculate pos within the cluster and obey
794   - * the rightmost boundary.
795   - */
796   - bytes = min(bytes, (unsigned long)(osb->s_clustersize
797   - - (wc->w_pos & (osb->s_clustersize - 1))));
798   - }
799   - to = from + bytes;
  793 + /*
  794 + * This is true if page_size > cluster_size.
  795 + *
  796 + * It triggers a set of special cases during write which might
  797 + * have to deal with allocating writes to partial pages.
  798 + */
  799 + unsigned int w_large_pages;
800 800  
801   - BUG_ON(from > PAGE_CACHE_SIZE);
802   - BUG_ON(to > PAGE_CACHE_SIZE);
803   - BUG_ON(from < cluster_start);
804   - BUG_ON(to > cluster_end);
  801 + /*
  802 + * Pages involved in this write.
  803 + *
  804 + * w_target_page is the page being written to by the user.
  805 + *
  806 + * w_pages is an array of pages which always contains
  807 + * w_target_page, and in the case of an allocating write with
  808 + * page_size < cluster size, it will contain zero'd and mapped
  809 + * pages adjacent to w_target_page which need to be written
  810 + * out so that future reads from that region will get
  811 + * zeros.
  812 + */
  813 + struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
  814 + unsigned int w_num_pages;
  815 + struct page *w_target_page;
805 816  
806   - if (wc->w_this_page_new)
807   - ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
808   - cluster_start, cluster_end, 1);
809   - else
810   - ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
811   - from, to, 0);
812   - if (ret) {
813   - mlog_errno(ret);
814   - goto out;
  817 + /*
  818 + * ocfs2_write_end() uses this to know what the real range to
  819 + * write in the target should be.
  820 + */
  821 + unsigned int w_target_from;
  822 + unsigned int w_target_to;
  823 +
  824 + /*
  825 + * We could use journal_current_handle() but this is cleaner,
  826 + * IMHO -Mark
  827 + */
  828 + handle_t *w_handle;
  829 +
  830 + struct buffer_head *w_di_bh;
  831 +};
  832 +
  833 +static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
  834 +{
  835 + int i;
  836 +
  837 + for(i = 0; i < wc->w_num_pages; i++) {
  838 + if (wc->w_pages[i] == NULL)
  839 + continue;
  840 +
  841 + unlock_page(wc->w_pages[i]);
  842 + mark_page_accessed(wc->w_pages[i]);
  843 + page_cache_release(wc->w_pages[i]);
815 844 }
816 845  
817   - src = buf->ops->map(sp->s_pipe, buf, 1);
818   - dst = kmap_atomic(wc->w_this_page, KM_USER1);
819   - memcpy(dst + from, src + src_from, bytes);
820   - kunmap_atomic(wc->w_this_page, KM_USER1);
821   - buf->ops->unmap(sp->s_pipe, buf, src);
  846 + brelse(wc->w_di_bh);
  847 + kfree(wc);
  848 +}
822 849  
823   - wc->w_finished_copy = 1;
  850 +static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
  851 + struct ocfs2_super *osb, loff_t pos,
  852 + unsigned len)
  853 +{
  854 + struct ocfs2_write_ctxt *wc;
824 855  
825   - *ret_from = from;
826   - *ret_to = to;
827   -out:
  856 + wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS);
  857 + if (!wc)
  858 + return -ENOMEM;
828 859  
829   - return bytes ? (unsigned int)bytes : ret;
  860 + wc->w_cpos = pos >> osb->s_clustersize_bits;
  861 + wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len);
  862 +
  863 + if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
  864 + wc->w_large_pages = 1;
  865 + else
  866 + wc->w_large_pages = 0;
  867 +
  868 + *wcp = wc;
  869 +
  870 + return 0;
830 871 }
831 872  
832 873 /*
833   - * This will copy user data from the iovec in the buffered write
834   - * context.
  874 + * If a page has any new buffers, zero them out here, and mark them uptodate
  875 + * and dirty so they'll be written out (in order to prevent uninitialised
  876 + * block data from leaking). And clear the new bit.
835 877 */
836   -int ocfs2_map_and_write_user_data(struct inode *inode,
837   - struct ocfs2_write_ctxt *wc, u64 *p_blkno,
838   - unsigned int *ret_from, unsigned int *ret_to)
  878 +static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to)
839 879 {
840   - int ret;
841   - unsigned int to, from, cluster_start, cluster_end;
842   - unsigned long bytes, src_from;
843   - char *dst;
844   - struct ocfs2_buffered_write_priv *bp = wc->w_private;
845   - const struct iovec *cur_iov = bp->b_cur_iov;
846   - char __user *buf;
847   - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
  880 + unsigned int block_start, block_end;
  881 + struct buffer_head *head, *bh;
848 882  
849   - ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
850   - &cluster_end);
  883 + BUG_ON(!PageLocked(page));
  884 + if (!page_has_buffers(page))
  885 + return;
851 886  
852   - buf = cur_iov->iov_base + bp->b_cur_off;
853   - src_from = (unsigned long)buf & ~PAGE_CACHE_MASK;
  887 + bh = head = page_buffers(page);
  888 + block_start = 0;
  889 + do {
  890 + block_end = block_start + bh->b_size;
854 891  
855   - from = wc->w_pos & (PAGE_CACHE_SIZE - 1);
  892 + if (buffer_new(bh)) {
  893 + if (block_end > from && block_start < to) {
  894 + if (!PageUptodate(page)) {
  895 + unsigned start, end;
  896 + void *kaddr;
856 897  
857   - /*
858   - * This is a lot of comparisons, but it reads quite
859   - * easily, which is important here.
860   - */
861   - /* Stay within the src page */
862   - bytes = PAGE_SIZE - src_from;
863   - /* Stay within the vector */
864   - bytes = min(bytes,
865   - (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
866   - /* Stay within count */
867   - bytes = min(bytes, (unsigned long)wc->w_count);
868   - /*
869   - * For clustersize > page size, just stay within
870   - * target page, otherwise we have to calculate pos
871   - * within the cluster and obey the rightmost
872   - * boundary.
873   - */
874   - if (wc->w_large_pages) {
875   - /*
876   - * For cluster size < page size, we have to
877   - * calculate pos within the cluster and obey
878   - * the rightmost boundary.
879   - */
880   - bytes = min(bytes, (unsigned long)(osb->s_clustersize
881   - - (wc->w_pos & (osb->s_clustersize - 1))));
882   - } else {
883   - /*
884   - * cluster size > page size is the most common
885   - * case - we just stay within the target page
886   - * boundary.
887   - */
888   - bytes = min(bytes, PAGE_CACHE_SIZE - from);
889   - }
  898 + start = max(from, block_start);
  899 + end = min(to, block_end);
890 900  
891   - to = from + bytes;
  901 + kaddr = kmap_atomic(page, KM_USER0);
  902 + memset(kaddr+start, 0, end - start);
  903 + flush_dcache_page(page);
  904 + kunmap_atomic(kaddr, KM_USER0);
  905 + set_buffer_uptodate(bh);
  906 + }
892 907  
893   - BUG_ON(from > PAGE_CACHE_SIZE);
894   - BUG_ON(to > PAGE_CACHE_SIZE);
895   - BUG_ON(from < cluster_start);
896   - BUG_ON(to > cluster_end);
  908 + clear_buffer_new(bh);
  909 + mark_buffer_dirty(bh);
  910 + }
  911 + }
897 912  
898   - if (wc->w_this_page_new)
899   - ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
900   - cluster_start, cluster_end, 1);
901   - else
902   - ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
903   - from, to, 0);
904   - if (ret) {
905   - mlog_errno(ret);
906   - goto out;
907   - }
  913 + block_start = block_end;
  914 + bh = bh->b_this_page;
  915 + } while (bh != head);
  916 +}
908 917  
909   - dst = kmap(wc->w_this_page);
910   - memcpy(dst + from, bp->b_src_buf + src_from, bytes);
911   - kunmap(wc->w_this_page);
  918 +/*
  919 + * Only called when we have a failure during allocating write to write
  920 + * zero's to the newly allocated region.
  921 + */
  922 +static void ocfs2_write_failure(struct inode *inode,
  923 + struct ocfs2_write_ctxt *wc,
  924 + loff_t user_pos, unsigned user_len)
  925 +{
  926 + int i;
  927 + unsigned from, to;
  928 + struct page *tmppage;
912 929  
913   - /*
914   - * XXX: This is slow, but simple. The caller of
915   - * ocfs2_buffered_write_cluster() is responsible for
916   - * passing through the iovecs, so it's difficult to
917   - * predict what our next step is in here after our
918   - * initial write. A future version should be pushing
919   - * that iovec manipulation further down.
920   - *
921   - * By setting this, we indicate that a copy from user
922   - * data was done, and subsequent calls for this
923   - * cluster will skip copying more data.
924   - */
925   - wc->w_finished_copy = 1;
  930 + ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len);
926 931  
927   - *ret_from = from;
928   - *ret_to = to;
929   -out:
  932 + if (wc->w_large_pages) {
  933 + from = wc->w_target_from;
  934 + to = wc->w_target_to;
  935 + } else {
  936 + from = 0;
  937 + to = PAGE_CACHE_SIZE;
  938 + }
930 939  
931   - return bytes ? (unsigned int)bytes : ret;
  940 + for(i = 0; i < wc->w_num_pages; i++) {
  941 + tmppage = wc->w_pages[i];
  942 +
  943 + if (ocfs2_should_order_data(inode))
  944 + walk_page_buffers(wc->w_handle, page_buffers(tmppage),
  945 + from, to, NULL,
  946 + ocfs2_journal_dirty_data);
  947 +
  948 + block_commit_write(tmppage, from, to);
  949 + }
932 950 }
933 951  
934   -/*
935   - * Map, fill and write a page to disk.
936   - *
937   - * The work of copying data is done via callback. Newly allocated
938   - * pages which don't take user data will be zero'd (set 'new' to
939   - * indicate an allocating write)
940   - *
941   - * Returns a negative error code or the number of bytes copied into
942   - * the page.
943   - */
944   -static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
945   - u64 *p_blkno, struct page *page,
946   - struct ocfs2_write_ctxt *wc, int new)
  952 +static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
  953 + struct ocfs2_write_ctxt *wc,
  954 + struct page *page, u32 cpos,
  955 + loff_t user_pos, unsigned user_len,
  956 + int new)
947 957 {
948   - int ret, copied = 0;
949   - unsigned int from = 0, to = 0;
  958 + int ret;
  959 + unsigned int map_from = 0, map_to = 0;
950 960 unsigned int cluster_start, cluster_end;
951   - unsigned int zero_from = 0, zero_to = 0;
  961 + unsigned int user_data_from = 0, user_data_to = 0;
952 962  
953   - ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos,
  963 + ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
954 964 &cluster_start, &cluster_end);
955 965  
956   - if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index
957   - && !wc->w_finished_copy) {
  966 + if (page == wc->w_target_page) {
  967 + map_from = user_pos & (PAGE_CACHE_SIZE - 1);
  968 + map_to = map_from + user_len;
958 969  
959   - wc->w_this_page = page;
960   - wc->w_this_page_new = new;
961   - ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to);
962   - if (ret < 0) {
  970 + if (new)
  971 + ret = ocfs2_map_page_blocks(page, p_blkno, inode,
  972 + cluster_start, cluster_end,
  973 + new);
  974 + else
  975 + ret = ocfs2_map_page_blocks(page, p_blkno, inode,
  976 + map_from, map_to, new);
  977 + if (ret) {
963 978 mlog_errno(ret);
964 979 goto out;
965 980 }
966 981  
967   - copied = ret;
968   -
969   - zero_from = from;
970   - zero_to = to;
  982 + user_data_from = map_from;
  983 + user_data_to = map_to;
971 984 if (new) {
972   - from = cluster_start;
973   - to = cluster_end;
  985 + map_from = cluster_start;
  986 + map_to = cluster_end;
974 987 }
  988 +
  989 + wc->w_target_from = map_from;
  990 + wc->w_target_to = map_to;
975 991 } else {
976 992 /*
977 993 * If we haven't allocated the new page yet, we
978 994  
... ... @@ -980,11 +996,11 @@
980 996 */
981 997 BUG_ON(!new);
982 998  
983   - from = cluster_start;
984   - to = cluster_end;
  999 + map_from = cluster_start;
  1000 + map_to = cluster_end;
985 1001  
986 1002 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
987   - cluster_start, cluster_end, 1);
  1003 + cluster_start, cluster_end, new);
988 1004 if (ret) {
989 1005 mlog_errno(ret);
990 1006 goto out;
991 1007  
992 1008  
993 1009  
994 1010  
995 1011  
996 1012  
997 1013  
998 1014  
999 1015  
1000 1016  
1001 1017  
1002 1018  
1003 1019  
1004 1020  
1005 1021  
1006 1022  
1007 1023  
1008 1024  
... ... @@ -1003,108 +1019,84 @@
1003 1019 */
1004 1020 if (new && !PageUptodate(page))
1005 1021 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
1006   - wc->w_cpos, zero_from, zero_to);
  1022 + cpos, user_data_from, user_data_to);
1007 1023  
1008 1024 flush_dcache_page(page);
1009 1025  
1010   - if (ocfs2_should_order_data(inode)) {
1011   - ret = walk_page_buffers(handle,
1012   - page_buffers(page),
1013   - from, to, NULL,
1014   - ocfs2_journal_dirty_data);
1015   - if (ret < 0)
1016   - mlog_errno(ret);
1017   - }
1018   -
1019   - /*
1020   - * We don't use generic_commit_write() because we need to
1021   - * handle our own i_size update.
1022   - */
1023   - ret = block_commit_write(page, from, to);
1024   - if (ret)
1025   - mlog_errno(ret);
1026 1026 out:
1027   -
1028   - return copied ? copied : ret;
  1027 + return ret;
1029 1028 }
1030 1029  
1031 1030 /*
1032   - * Do the actual write of some data into an inode. Optionally allocate
1033   - * in order to fulfill the write.
1034   - *
1035   - * cpos is the logical cluster offset within the file to write at
1036   - *
1037   - * 'phys' is the physical mapping of that offset. a 'phys' value of
1038   - * zero indicates that allocation is required. In this case, data_ac
1039   - * and meta_ac should be valid (meta_ac can be null if metadata
1040   - * allocation isn't required).
  1031 + * This function will only grab one clusters worth of pages.
1041 1032 */
1042   -static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
1043   - struct buffer_head *di_bh,
1044   - struct ocfs2_alloc_context *data_ac,
1045   - struct ocfs2_alloc_context *meta_ac,
1046   - struct ocfs2_write_ctxt *wc)
  1033 +static int ocfs2_grab_pages_for_write(struct address_space *mapping,
  1034 + struct ocfs2_write_ctxt *wc,
  1035 + u32 cpos, loff_t user_pos, int new)
1047 1036 {
1048   - int ret, i, numpages = 1, new;
1049   - unsigned int copied = 0;
1050   - u32 tmp_pos;
1051   - u64 v_blkno, p_blkno;
1052   - struct address_space *mapping = file->f_mapping;
  1037 + int ret = 0, i;
  1038 + unsigned long start, target_index, index;
1053 1039 struct inode *inode = mapping->host;
1054   - unsigned long index, start;
1055   - struct page **cpages;
1056 1040  
1057   - new = phys == 0 ? 1 : 0;
  1041 + target_index = user_pos >> PAGE_CACHE_SHIFT;
1058 1042  
1059 1043 /*
1060 1044 * Figure out how many pages we'll be manipulating here. For
1061 1045 * non allocating write, we just change the one
1062 1046 * page. Otherwise, we'll need a whole clusters worth.
1063 1047 */
1064   - if (new)
1065   - numpages = ocfs2_pages_per_cluster(inode->i_sb);
1066   -
1067   - cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
1068   - if (!cpages) {
1069   - ret = -ENOMEM;
1070   - mlog_errno(ret);
1071   - return ret;
1072   - }
1073   -
1074   - /*
1075   - * Fill our page array first. That way we've grabbed enough so
1076   - * that we can zero and flush if we error after adding the
1077   - * extent.
1078   - */
1079 1048 if (new) {
1080   - start = ocfs2_align_clusters_to_page_index(inode->i_sb,
1081   - wc->w_cpos);
1082   - v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
  1049 + wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
  1050 + start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
1083 1051 } else {
1084   - start = wc->w_pos >> PAGE_CACHE_SHIFT;
1085   - v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits;
  1052 + wc->w_num_pages = 1;
  1053 + start = target_index;
1086 1054 }
1087 1055  
1088   - for(i = 0; i < numpages; i++) {
  1056 + for(i = 0; i < wc->w_num_pages; i++) {
1089 1057 index = start + i;
1090 1058  
1091   - cpages[i] = find_or_create_page(mapping, index, GFP_NOFS);
1092   - if (!cpages[i]) {
  1059 + wc->w_pages[i] = find_or_create_page(mapping, index, GFP_NOFS);
  1060 + if (!wc->w_pages[i]) {
1093 1061 ret = -ENOMEM;
1094 1062 mlog_errno(ret);
1095 1063 goto out;
1096 1064 }
  1065 +
  1066 + if (index == target_index)
  1067 + wc->w_target_page = wc->w_pages[i];
1097 1068 }
  1069 +out:
  1070 + return ret;
  1071 +}
1098 1072  
  1073 +/*
  1074 + * Prepare one cluster of the file for writing.
  1075 + */
  1076 +static int ocfs2_write_cluster(struct address_space *mapping,
  1077 + u32 phys, struct ocfs2_alloc_context *data_ac,
  1078 + struct ocfs2_alloc_context *meta_ac,
  1079 + struct ocfs2_write_ctxt *wc, u32 cpos,
  1080 + loff_t user_pos, unsigned user_len)
  1081 +{
  1082 + int ret, i, new;
  1083 + u64 v_blkno, p_blkno;
  1084 + struct inode *inode = mapping->host;
  1085 +
  1086 + new = phys == 0 ? 1 : 0;
  1087 +
1099 1088 if (new) {
  1089 + u32 tmp_pos;
  1090 +
1100 1091 /*
1101 1092 * This is safe to call with the page locks - it won't take
1102 1093 * any additional semaphores or cluster locks.
1103 1094 */
1104   - tmp_pos = wc->w_cpos;
  1095 + tmp_pos = cpos;
1105 1096 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
1106   - &tmp_pos, 1, di_bh, handle,
1107   - data_ac, meta_ac, NULL);
  1097 + &tmp_pos, 1, wc->w_di_bh,
  1098 + wc->w_handle, data_ac,
  1099 + meta_ac, NULL);
1108 1100 /*
1109 1101 * This shouldn't happen because we must have already
1110 1102 * calculated the correct meta data allocation required. The
1111 1103  
1112 1104  
1113 1105  
1114 1106  
1115 1107  
1116 1108  
1117 1109  
1118 1110  
1119 1111  
1120 1112  
1121 1113  
1122 1114  
1123 1115  
1124 1116  
1125 1117  
1126 1118  
1127 1119  
1128 1120  
... ... @@ -1121,103 +1113,132 @@
1121 1113 mlog_errno(ret);
1122 1114 goto out;
1123 1115 }
  1116 +
  1117 + v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
  1118 + } else {
  1119 + v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
1124 1120 }
1125 1121  
  1122 + /*
  1123 + * The only reason this should fail is due to an inability to
  1124 + * find the extent added.
  1125 + */
1126 1126 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
1127 1127 NULL);
1128 1128 if (ret < 0) {
1129   -
1130   - /*
1131   - * XXX: Should we go readonly here?
1132   - */
1133   -
1134   - mlog_errno(ret);
  1129 + ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, "
  1130 + "at logical block %llu",
  1131 + (unsigned long long)OCFS2_I(inode)->ip_blkno,
  1132 + (unsigned long long)v_blkno);
1135 1133 goto out;
1136 1134 }
1137 1135  
1138 1136 BUG_ON(p_blkno == 0);
1139 1137  
1140   - for(i = 0; i < numpages; i++) {
1141   - ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i],
1142   - wc, new);
1143   - if (ret < 0) {
1144   - mlog_errno(ret);
1145   - goto out;
1146   - }
  1138 + for(i = 0; i < wc->w_num_pages; i++) {
  1139 + int tmpret;
1147 1140  
1148   - copied += ret;
  1141 + tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
  1142 + wc->w_pages[i], cpos,
  1143 + user_pos, user_len, new);
  1144 + if (tmpret) {
  1145 + mlog_errno(tmpret);
  1146 + if (ret == 0)
  1147 + ret = tmpret;
  1148 + }
1149 1149 }
1150 1150  
  1151 + /*
  1152 + * We only have cleanup to do in case of allocating write.
  1153 + */
  1154 + if (ret && new)
  1155 + ocfs2_write_failure(inode, wc, user_pos, user_len);
  1156 +
1151 1157 out:
1152   - for(i = 0; i < numpages; i++) {
1153   - unlock_page(cpages[i]);
1154   - mark_page_accessed(cpages[i]);
1155   - page_cache_release(cpages[i]);
1156   - }
1157   - kfree(cpages);
1158 1158  
1159   - return copied ? copied : ret;
  1159 + return ret;
1160 1160 }
1161 1161  
1162   -static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc,
1163   - struct ocfs2_super *osb, loff_t pos,
1164   - size_t count, ocfs2_page_writer *cb,
1165   - void *cb_priv)
  1162 +/*
  1163 + * ocfs2_write_end() wants to know which parts of the target page it
  1164 + * should complete the write on. It's easiest to compute them ahead of
  1165 + * time when a more complete view of the write is available.
  1166 + */
  1167 +static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
  1168 + struct ocfs2_write_ctxt *wc,
  1169 + loff_t pos, unsigned len, int alloc)
1166 1170 {
1167   - wc->w_count = count;
1168   - wc->w_pos = pos;
1169   - wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
1170   - wc->w_finished_copy = 0;
  1171 + struct ocfs2_write_cluster_desc *desc;
1171 1172  
1172   - if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
1173   - wc->w_large_pages = 1;
1174   - else
1175   - wc->w_large_pages = 0;
  1173 + wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
  1174 + wc->w_target_to = wc->w_target_from + len;
1176 1175  
1177   - wc->w_write_data_page = cb;
1178   - wc->w_private = cb_priv;
  1176 + if (alloc == 0)
  1177 + return;
  1178 +
  1179 + /*
  1180 + * Allocating write - we may have different boundaries based
  1181 + * on page size and cluster size.
  1182 + *
  1183 + * NOTE: We can no longer compute one value from the other as
  1184 + * the actual write length and user provided length may be
  1185 + * different.
  1186 + */
  1187 +
  1188 + if (wc->w_large_pages) {
  1189 + /*
  1190 + * We only care about the 1st and last cluster within
  1191 + * our range and whether they are holes or not. Either
  1192 + * value may be extended out to the start/end of a
  1193 + * newly allocated cluster.
  1194 + */
  1195 + desc = &wc->w_desc[0];
  1196 + if (desc->c_new)
  1197 + ocfs2_figure_cluster_boundaries(osb,
  1198 + desc->c_cpos,
  1199 + &wc->w_target_from,
  1200 + NULL);
  1201 +
  1202 + desc = &wc->w_desc[wc->w_clen - 1];
  1203 + if (desc->c_new)
  1204 + ocfs2_figure_cluster_boundaries(osb,
  1205 + desc->c_cpos,
  1206 + NULL,
  1207 + &wc->w_target_to);
  1208 + } else {
  1209 + wc->w_target_from = 0;
  1210 + wc->w_target_to = PAGE_CACHE_SIZE;
  1211 + }
1179 1212 }
1180 1213  
1181   -/*
1182   - * Write a cluster to an inode. The cluster may not be allocated yet,
1183   - * in which case it will be. This only exists for buffered writes -
1184   - * O_DIRECT takes a more "traditional" path through the kernel.
1185   - *
1186   - * The caller is responsible for incrementing pos, written counts, etc
1187   - *
1188   - * For file systems that don't support sparse files, pre-allocation
1189   - * and page zeroing up until cpos should be done prior to this
1190   - * function call.
1191   - *
1192   - * Callers should be holding i_sem, and the rw cluster lock.
1193   - *
1194   - * Returns the number of user bytes written, or less than zero for
1195   - * error.
1196   - */
1197   -ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
1198   - size_t count, ocfs2_page_writer *actor,
1199   - void *priv)
  1214 +int ocfs2_write_begin(struct file *file, struct address_space *mapping,
  1215 + loff_t pos, unsigned len, unsigned flags,
  1216 + struct page **pagep, void **fsdata)
1200 1217 {
1201   - int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
1202   - ssize_t written = 0;
1203   - u32 phys;
1204   - struct inode *inode = file->f_mapping->host;
  1218 + int ret, i, credits = OCFS2_INODE_UPDATE_CREDITS;
  1219 + unsigned int num_clusters = 0, clusters_to_alloc = 0;
  1220 + u32 phys = 0;
  1221 + struct ocfs2_write_ctxt *wc;
  1222 + struct inode *inode = mapping->host;
1205 1223 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1206   - struct buffer_head *di_bh = NULL;
1207 1224 struct ocfs2_dinode *di;
1208 1225 struct ocfs2_alloc_context *data_ac = NULL;
1209 1226 struct ocfs2_alloc_context *meta_ac = NULL;
1210 1227 handle_t *handle;
1211   - struct ocfs2_write_ctxt wc;
  1228 + struct ocfs2_write_cluster_desc *desc;
1212 1229  
1213   - ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
  1230 + ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len);
  1231 + if (ret) {
  1232 + mlog_errno(ret);
  1233 + return ret;
  1234 + }
1214 1235  
1215   - ret = ocfs2_meta_lock(inode, &di_bh, 1);
  1236 + ret = ocfs2_meta_lock(inode, &wc->w_di_bh, 1);
1216 1237 if (ret) {
1217 1238 mlog_errno(ret);
1218 1239 goto out;
1219 1240 }
1220   - di = (struct ocfs2_dinode *)di_bh->b_data;
  1241 + di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1221 1242  
1222 1243 /*
1223 1244 * Take alloc sem here to prevent concurrent lookups. That way
1224 1245  
1225 1246  
1226 1247  
... ... @@ -1228,23 +1249,60 @@
1228 1249 */
1229 1250 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1230 1251  
1231   - ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL);
1232   - if (ret) {
1233   - mlog_errno(ret);
1234   - goto out_meta;
  1252 + for (i = 0; i < wc->w_clen; i++) {
  1253 + desc = &wc->w_desc[i];
  1254 + desc->c_cpos = wc->w_cpos + i;
  1255 +
  1256 + if (num_clusters == 0) {
  1257 + ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys,
  1258 + &num_clusters, NULL);
  1259 + if (ret) {
  1260 + mlog_errno(ret);
  1261 + goto out_meta;
  1262 + }
  1263 + } else if (phys) {
  1264 + /*
  1265 + * Only increment phys if it doesn't describe
  1266 + * a hole.
  1267 + */
  1268 + phys++;
  1269 + }
  1270 +
  1271 + desc->c_phys = phys;
  1272 + if (phys == 0) {
  1273 + desc->c_new = 1;
  1274 + clusters_to_alloc++;
  1275 + }
  1276 +
  1277 + num_clusters--;
1235 1278 }
1236 1279  
1237   - /* phys == 0 means that allocation is required. */
1238   - if (phys == 0) {
1239   - ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac);
  1280 + /*
  1281 + * We set w_target_from, w_target_to here so that
  1282 + * ocfs2_write_end() knows which range in the target page to
  1283 + * write out. An allocation requires that we write the entire
  1284 + * cluster range.
  1285 + */
  1286 + if (clusters_to_alloc > 0) {
  1287 + /*
  1288 + * XXX: We are stretching the limits of
  1289 + * ocfs2_lock_allocators(). It greately over-estimates
  1290 + * the work to be done.
  1291 + */
  1292 + ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc,
  1293 + &data_ac, &meta_ac);
1240 1294 if (ret) {
1241 1295 mlog_errno(ret);
1242 1296 goto out_meta;
1243 1297 }
1244 1298  
1245   - credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1);
  1299 + credits = ocfs2_calc_extend_credits(inode->i_sb, di,
  1300 + clusters_to_alloc);
  1301 +
1246 1302 }
1247 1303  
  1304 + ocfs2_set_target_boundaries(osb, wc, pos, len, clusters_to_alloc);
  1305 +
1248 1306 ret = ocfs2_data_lock(inode, 1);
1249 1307 if (ret) {
1250 1308 mlog_errno(ret);
1251 1309  
1252 1310  
1253 1311  
1254 1312  
1255 1313  
... ... @@ -1258,36 +1316,50 @@
1258 1316 goto out_data;
1259 1317 }
1260 1318  
1261   - written = ocfs2_write(file, phys, handle, di_bh, data_ac,
1262   - meta_ac, &wc);
1263   - if (written < 0) {
1264   - ret = written;
  1319 + wc->w_handle = handle;
  1320 +
  1321 + /*
  1322 + * We don't want this to fail in ocfs2_write_end(), so do it
  1323 + * here.
  1324 + */
  1325 + ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
  1326 + OCFS2_JOURNAL_ACCESS_WRITE);
  1327 + if (ret) {
1265 1328 mlog_errno(ret);
1266 1329 goto out_commit;
1267 1330 }
1268 1331  
1269   - ret = ocfs2_journal_access(handle, inode, di_bh,
1270   - OCFS2_JOURNAL_ACCESS_WRITE);
  1332 + /*
  1333 + * Fill our page array first. That way we've grabbed enough so
  1334 + * that we can zero and flush if we error after adding the
  1335 + * extent.
  1336 + */
  1337 + ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
  1338 + clusters_to_alloc);
1271 1339 if (ret) {
1272 1340 mlog_errno(ret);
1273 1341 goto out_commit;
1274 1342 }
1275 1343  
1276   - pos += written;
1277   - if (pos > inode->i_size) {
1278   - i_size_write(inode, pos);
1279   - mark_inode_dirty(inode);
  1344 + for (i = 0; i < wc->w_clen; i++) {
  1345 + desc = &wc->w_desc[i];
  1346 +
  1347 + ret = ocfs2_write_cluster(mapping, desc->c_phys, data_ac,
  1348 + meta_ac, wc, desc->c_cpos, pos, len);
  1349 + if (ret) {
  1350 + mlog_errno(ret);
  1351 + goto out_commit;
  1352 + }
1280 1353 }
1281   - inode->i_blocks = ocfs2_inode_sector_count(inode);
1282   - di->i_size = cpu_to_le64((u64)i_size_read(inode));
1283   - inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1284   - di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
1285   - di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1286 1354  
1287   - ret = ocfs2_journal_dirty(handle, di_bh);
1288   - if (ret)
1289   - mlog_errno(ret);
  1355 + if (data_ac)
  1356 + ocfs2_free_alloc_context(data_ac);
  1357 + if (meta_ac)
  1358 + ocfs2_free_alloc_context(meta_ac);
1290 1359  
  1360 + *pagep = wc->w_target_page;
  1361 + *fsdata = wc;
  1362 + return 0;
1291 1363 out_commit:
1292 1364 ocfs2_commit_trans(osb, handle);
1293 1365  
1294 1366  
1295 1367  
... ... @@ -1299,13 +1371,85 @@
1299 1371 ocfs2_meta_unlock(inode, 1);
1300 1372  
1301 1373 out:
1302   - brelse(di_bh);
  1374 + ocfs2_free_write_ctxt(wc);
  1375 +
1303 1376 if (data_ac)
1304 1377 ocfs2_free_alloc_context(data_ac);
1305 1378 if (meta_ac)
1306 1379 ocfs2_free_alloc_context(meta_ac);
  1380 + return ret;
  1381 +}
1307 1382  
1308   - return written ? written : ret;
  1383 +int ocfs2_write_end(struct file *file, struct address_space *mapping,
  1384 + loff_t pos, unsigned len, unsigned copied,
  1385 + struct page *page, void *fsdata)
  1386 +{
  1387 + int i;
  1388 + unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
  1389 + struct inode *inode = mapping->host;
  1390 + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
  1391 + struct ocfs2_write_ctxt *wc = fsdata;
  1392 + struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
  1393 + handle_t *handle = wc->w_handle;
  1394 + struct page *tmppage;
  1395 +
  1396 + if (unlikely(copied < len)) {
  1397 + if (!PageUptodate(wc->w_target_page))
  1398 + copied = 0;
  1399 +
  1400 + ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
  1401 + start+len);
  1402 + }
  1403 + flush_dcache_page(wc->w_target_page);
  1404 +
  1405 + for(i = 0; i < wc->w_num_pages; i++) {
  1406 + tmppage = wc->w_pages[i];
  1407 +
  1408 + if (tmppage == wc->w_target_page) {
  1409 + from = wc->w_target_from;
  1410 + to = wc->w_target_to;
  1411 +
  1412 + BUG_ON(from > PAGE_CACHE_SIZE ||
  1413 + to > PAGE_CACHE_SIZE ||
  1414 + to < from);
  1415 + } else {
  1416 + /*
  1417 + * Pages adjacent to the target (if any) imply
  1418 + * a hole-filling write in which case we want
  1419 + * to flush their entire range.
  1420 + */
  1421 + from = 0;
  1422 + to = PAGE_CACHE_SIZE;
  1423 + }
  1424 +
  1425 + if (ocfs2_should_order_data(inode))
  1426 + walk_page_buffers(wc->w_handle, page_buffers(tmppage),
  1427 + from, to, NULL,
  1428 + ocfs2_journal_dirty_data);
  1429 +
  1430 + block_commit_write(tmppage, from, to);
  1431 + }
  1432 +
  1433 + pos += copied;
  1434 + if (pos > inode->i_size) {
  1435 + i_size_write(inode, pos);
  1436 + mark_inode_dirty(inode);
  1437 + }
  1438 + inode->i_blocks = ocfs2_inode_sector_count(inode);
  1439 + di->i_size = cpu_to_le64((u64)i_size_read(inode));
  1440 + inode->i_mtime = inode->i_ctime = CURRENT_TIME;
  1441 + di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
  1442 + di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
  1443 +
  1444 + ocfs2_journal_dirty(handle, wc->w_di_bh);
  1445 +
  1446 + ocfs2_commit_trans(osb, handle);
  1447 + ocfs2_data_unlock(inode, 1);
  1448 + up_write(&OCFS2_I(inode)->ip_alloc_sem);
  1449 + ocfs2_meta_unlock(inode, 1);
  1450 + ocfs2_free_write_ctxt(wc);
  1451 +
  1452 + return copied;
1309 1453 }
1310 1454  
1311 1455 const struct address_space_operations ocfs2_aops = {
... ... @@ -42,57 +42,13 @@
42 42 int (*fn)( handle_t *handle,
43 43 struct buffer_head *bh));
44 44  
45   -struct ocfs2_write_ctxt;
46   -typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *,
47   - u64 *, unsigned int *, unsigned int *);
  45 +int ocfs2_write_begin(struct file *file, struct address_space *mapping,
  46 + loff_t pos, unsigned len, unsigned flags,
  47 + struct page **pagep, void **fsdata);
48 48  
49   -ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
50   - size_t count, ocfs2_page_writer *actor,
51   - void *priv);
52   -
53   -struct ocfs2_write_ctxt {
54   - size_t w_count;
55   - loff_t w_pos;
56   - u32 w_cpos;
57   - unsigned int w_finished_copy;
58   -
59   - /* This is true if page_size > cluster_size */
60   - unsigned int w_large_pages;
61   -
62   - /* Filler callback and private data */
63   - ocfs2_page_writer *w_write_data_page;
64   - void *w_private;
65   -
66   - /* Only valid for the filler callback */
67   - struct page *w_this_page;
68   - unsigned int w_this_page_new;
69   -};
70   -
71   -struct ocfs2_buffered_write_priv {
72   - char *b_src_buf;
73   - const struct iovec *b_cur_iov; /* Current iovec */
74   - size_t b_cur_off; /* Offset in the
75   - * current iovec */
76   -};
77   -int ocfs2_map_and_write_user_data(struct inode *inode,
78   - struct ocfs2_write_ctxt *wc,
79   - u64 *p_blkno,
80   - unsigned int *ret_from,
81   - unsigned int *ret_to);
82   -
83   -struct ocfs2_splice_write_priv {
84   - struct splice_desc *s_sd;
85   - struct pipe_buffer *s_buf;
86   - struct pipe_inode_info *s_pipe;
87   - /* Neither offset value is ever larger than one page */
88   - unsigned int s_offset;
89   - unsigned int s_buf_offset;
90   -};
91   -int ocfs2_map_and_write_splice_data(struct inode *inode,
92   - struct ocfs2_write_ctxt *wc,
93   - u64 *p_blkno,
94   - unsigned int *ret_from,
95   - unsigned int *ret_to);
  49 +int ocfs2_write_end(struct file *file, struct address_space *mapping,
  50 + loff_t pos, unsigned len, unsigned copied,
  51 + struct page *page, void *fsdata);
96 52  
97 53 /* all ocfs2_dio_end_io()'s fault */
98 54 #define ocfs2_iocb_is_rw_locked(iocb) \
... ... @@ -1335,15 +1335,16 @@
1335 1335 *basep = base;
1336 1336 }
1337 1337  
1338   -static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp,
  1338 +static struct page * ocfs2_get_write_source(char **ret_src_buf,
1339 1339 const struct iovec *cur_iov,
1340 1340 size_t iov_offset)
1341 1341 {
1342 1342 int ret;
1343   - char *buf;
  1343 + char *buf = cur_iov->iov_base + iov_offset;
1344 1344 struct page *src_page = NULL;
  1345 + unsigned long off;
1345 1346  
1346   - buf = cur_iov->iov_base + iov_offset;
  1347 + off = (unsigned long)(buf) & ~PAGE_CACHE_MASK;
1347 1348  
1348 1349 if (!segment_eq(get_fs(), KERNEL_DS)) {
1349 1350 /*
1350 1351  
1351 1352  
... ... @@ -1355,18 +1356,17 @@
1355 1356 (unsigned long)buf & PAGE_CACHE_MASK, 1,
1356 1357 0, 0, &src_page, NULL);
1357 1358 if (ret == 1)
1358   - bp->b_src_buf = kmap(src_page);
  1359 + *ret_src_buf = kmap(src_page) + off;
1359 1360 else
1360 1361 src_page = ERR_PTR(-EFAULT);
1361 1362 } else {
1362   - bp->b_src_buf = buf;
  1363 + *ret_src_buf = buf;
1363 1364 }
1364 1365  
1365 1366 return src_page;
1366 1367 }
1367 1368  
1368   -static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp,
1369   - struct page *page)
  1369 +static void ocfs2_put_write_source(struct page *page)
1370 1370 {
1371 1371 if (page) {
1372 1372 kunmap(page);
1373 1373  
... ... @@ -1382,10 +1382,12 @@
1382 1382 {
1383 1383 int ret = 0;
1384 1384 ssize_t copied, total = 0;
1385   - size_t iov_offset = 0;
  1385 + size_t iov_offset = 0, bytes;
  1386 + loff_t pos;
1386 1387 const struct iovec *cur_iov = iov;
1387   - struct ocfs2_buffered_write_priv bp;
1388   - struct page *page;
  1388 + struct page *user_page, *page;
  1389 + char *buf, *dst;
  1390 + void *fsdata;
1389 1391  
1390 1392 /*
1391 1393 * handle partial DIO write. Adjust cur_iov if needed.
1392 1394  
1393 1395  
1394 1396  
1395 1397  
... ... @@ -1393,21 +1395,38 @@
1393 1395 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
1394 1396  
1395 1397 do {
1396   - bp.b_cur_off = iov_offset;
1397   - bp.b_cur_iov = cur_iov;
  1398 + pos = *ppos;
1398 1399  
1399   - page = ocfs2_get_write_source(&bp, cur_iov, iov_offset);
1400   - if (IS_ERR(page)) {
1401   - ret = PTR_ERR(page);
  1400 + user_page = ocfs2_get_write_source(&buf, cur_iov, iov_offset);
  1401 + if (IS_ERR(user_page)) {
  1402 + ret = PTR_ERR(user_page);
1402 1403 goto out;
1403 1404 }
1404 1405  
1405   - copied = ocfs2_buffered_write_cluster(file, *ppos, count,
1406   - ocfs2_map_and_write_user_data,
1407   - &bp);
  1406 + /* Stay within our page boundaries */
  1407 + bytes = min((PAGE_CACHE_SIZE - ((unsigned long)pos & ~PAGE_CACHE_MASK)),
  1408 + (PAGE_CACHE_SIZE - ((unsigned long)buf & ~PAGE_CACHE_MASK)));
  1409 + /* Stay within the vector boundary */
  1410 + bytes = min_t(size_t, bytes, cur_iov->iov_len - iov_offset);
  1411 + /* Stay within count */
  1412 + bytes = min(bytes, count);
1408 1413  
1409   - ocfs2_put_write_source(&bp, page);
  1414 + page = NULL;
  1415 + ret = ocfs2_write_begin(file, file->f_mapping, pos, bytes, 0,
  1416 + &page, &fsdata);
  1417 + if (ret) {
  1418 + mlog_errno(ret);
  1419 + goto out;
  1420 + }
1410 1421  
  1422 + dst = kmap_atomic(page, KM_USER0);
  1423 + memcpy(dst + (pos & (PAGE_CACHE_SIZE - 1)), buf, bytes);
  1424 + kunmap_atomic(dst, KM_USER0);
  1425 + flush_dcache_page(page);
  1426 + ocfs2_put_write_source(user_page);
  1427 +
  1428 + copied = ocfs2_write_end(file, file->f_mapping, pos, bytes,
  1429 + bytes, page, fsdata);
1411 1430 if (copied < 0) {
1412 1431 mlog_errno(copied);
1413 1432 ret = copied;
... ... @@ -1415,7 +1434,7 @@
1415 1434 }
1416 1435  
1417 1436 total += copied;
1418   - *ppos = *ppos + copied;
  1437 + *ppos = pos + copied;
1419 1438 count -= copied;
1420 1439  
1421 1440 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
1422 1441  
1423 1442  
1424 1443  
1425 1444  
1426 1445  
1427 1446  
1428 1447  
... ... @@ -1585,52 +1604,46 @@
1585 1604 struct pipe_buffer *buf,
1586 1605 struct splice_desc *sd)
1587 1606 {
1588   - int ret, count, total = 0;
  1607 + int ret, count;
1589 1608 ssize_t copied = 0;
1590   - struct ocfs2_splice_write_priv sp;
  1609 + struct file *file = sd->u.file;
  1610 + unsigned int offset;
  1611 + struct page *page = NULL;
  1612 + void *fsdata;
  1613 + char *src, *dst;
1591 1614  
1592 1615 ret = buf->ops->confirm(pipe, buf);
1593 1616 if (ret)
1594 1617 goto out;
1595 1618  
1596   - sp.s_sd = sd;
1597   - sp.s_buf = buf;
1598   - sp.s_pipe = pipe;
1599   - sp.s_offset = sd->pos & ~PAGE_CACHE_MASK;
1600   - sp.s_buf_offset = buf->offset;
1601   -
  1619 + offset = sd->pos & ~PAGE_CACHE_MASK;
1602 1620 count = sd->len;
1603   - if (count + sp.s_offset > PAGE_CACHE_SIZE)
1604   - count = PAGE_CACHE_SIZE - sp.s_offset;
  1621 + if (count + offset > PAGE_CACHE_SIZE)
  1622 + count = PAGE_CACHE_SIZE - offset;
1605 1623  
1606   - do {
1607   - /*
1608   - * splice wants us to copy up to one page at a
1609   - * time. For pagesize > cluster size, this means we
1610   - * might enter ocfs2_buffered_write_cluster() more
1611   - * than once, so keep track of our progress here.
1612   - */
1613   - copied = ocfs2_buffered_write_cluster(sd->u.file,
1614   - (loff_t)sd->pos + total,
1615   - count,
1616   - ocfs2_map_and_write_splice_data,
1617   - &sp);
1618   - if (copied < 0) {
1619   - mlog_errno(copied);
1620   - ret = copied;
1621   - goto out;
1622   - }
  1624 + ret = ocfs2_write_begin(file, file->f_mapping, sd->pos, count, 0,
  1625 + &page, &fsdata);
  1626 + if (ret) {
  1627 + mlog_errno(ret);
  1628 + goto out;
  1629 + }
1623 1630  
1624   - count -= copied;
1625   - sp.s_offset += copied;
1626   - sp.s_buf_offset += copied;
1627   - total += copied;
1628   - } while (count);
  1631 + src = buf->ops->map(pipe, buf, 1);
  1632 + dst = kmap_atomic(page, KM_USER1);
  1633 + memcpy(dst + offset, src + buf->offset, count);
  1634 + kunmap_atomic(page, KM_USER1);
  1635 + buf->ops->unmap(pipe, buf, src);
1629 1636  
1630   - ret = 0;
  1637 + copied = ocfs2_write_end(file, file->f_mapping, sd->pos, count, count,
  1638 + page, fsdata);
  1639 + if (copied < 0) {
  1640 + mlog_errno(copied);
  1641 + ret = copied;
  1642 + goto out;
  1643 + }
1631 1644 out:
1632 1645  
1633   - return total ? total : ret;
  1646 + return copied ? copied : ret;
1634 1647 }
1635 1648  
1636 1649 static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,