Commit 7307de80510a70e5e5aa98de1e80ccbb7d90a3a8

Authored by Mark Fasheh
1 parent 607d44aa3f

ocfs2: shared writeable mmap

Implement cluster consistent shared writeable mappings using the
->page_mkwrite() callback.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>

Showing 4 changed files with 200 additions and 39 deletions.

fs/ocfs2/aops.c:
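
For orientation, here is a minimal sketch - not taken from this commit, and with hypothetical example_* names - of how a filesystem of this era hooks into shared writeable mmap. The VM calls ->page_mkwrite() before allowing a write-protected shared-mapping page to become writeable, which gives the filesystem a chance to take its locks and allocate space; that callback is the hook this commit wires up for ocfs2 (the real handler lives in fs/ocfs2/mmap.c, one of the four changed files).

#include <linux/fs.h>
#include <linux/mm.h>

/* Hypothetical sketch: a real handler must take the filesystem's
 * (here: cluster) locks and make sure the page has backing blocks
 * before returning 0.  A nonzero return fails the fault (SIGBUS). */
static int example_page_mkwrite(struct vm_area_struct *vma,
				struct page *page)
{
	/* take locks, allocate/map blocks for 'page' ... */
	return 0;
}

static struct vm_operations_struct example_file_vm_ops = {
	.nopage		= filemap_nopage,	/* 2.6.22-era fault handler */
	.page_mkwrite	= example_page_mkwrite,
};

static int example_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);
	vma->vm_ops = &example_file_vm_ops;
	return 0;
}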

/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <asm/byteorder.h>
#include <linux/swap.h>
#include <linux/pipe_fs_i.h>

#define MLOG_MASK_PREFIX ML_FILE_IO
#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "aops.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "file.h"
#include "inode.h"
#include "journal.h"
#include "suballoc.h"
#include "super.h"
#include "symlink.h"

#include "buffer_head_io.h"

static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
				   struct buffer_head *bh_result, int create)
{
	int err = -EIO;
	int status;
	struct ocfs2_dinode *fe = NULL;
	struct buffer_head *bh = NULL;
	struct buffer_head *buffer_cache_bh = NULL;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	void *kaddr;

	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
		   (unsigned long long)iblock, bh_result, create);

	BUG_ON(ocfs2_inode_is_fast_symlink(inode));

	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
		mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
		     (unsigned long long)iblock);
		goto bail;
	}

	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
				  OCFS2_I(inode)->ip_blkno,
				  &bh, OCFS2_BH_CACHED, inode);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	fe = (struct ocfs2_dinode *) bh->b_data;

	if (!OCFS2_IS_VALID_DINODE(fe)) {
		mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
		     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
		     fe->i_signature);
		goto bail;
	}

	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
						    le32_to_cpu(fe->i_clusters))) {
		mlog(ML_ERROR, "block offset is outside the allocated size: "
		     "%llu\n", (unsigned long long)iblock);
		goto bail;
	}

	/* We don't use the page cache to create symlink data, so if
	 * need be, copy it over from the buffer cache. */
	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
		u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
			    iblock;
		buffer_cache_bh = sb_getblk(osb->sb, blkno);
		if (!buffer_cache_bh) {
			mlog(ML_ERROR, "couldn't getblock for symlink!\n");
			goto bail;
		}

		/* we haven't locked out transactions, so a commit
		 * could've happened. Since we've got a reference on
		 * the bh, even if it commits while we're doing the
		 * copy, the data is still good. */
		if (buffer_jbd(buffer_cache_bh)
		    && ocfs2_inode_is_new(inode)) {
			kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
			if (!kaddr) {
				mlog(ML_ERROR, "couldn't kmap!\n");
				goto bail;
			}
			memcpy(kaddr + (bh_result->b_size * iblock),
			       buffer_cache_bh->b_data,
			       bh_result->b_size);
			kunmap_atomic(kaddr, KM_USER0);
			set_buffer_uptodate(bh_result);
		}
		brelse(buffer_cache_bh);
	}

	map_bh(bh_result, inode->i_sb,
	       le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);

	err = 0;

bail:
	if (bh)
		brelse(bh);

	mlog_exit(err);
	return err;
}

static int ocfs2_get_block(struct inode *inode, sector_t iblock,
			   struct buffer_head *bh_result, int create)
{
	int err = 0;
	unsigned int ext_flags;
	u64 p_blkno, past_eof;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
		   (unsigned long long)iblock, bh_result, create);

	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
		     inode, inode->i_ino);

	if (S_ISLNK(inode->i_mode)) {
		/* this always does I/O for some reason. */
		err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
		goto bail;
	}

	err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL,
					  &ext_flags);
	if (err) {
		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
		     "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
		     (unsigned long long)p_blkno);
		goto bail;
	}

	/*
	 * ocfs2 never allocates in this function - the only time we
	 * need to use BH_New is when we're extending i_size on a file
	 * system which doesn't support holes, in which case BH_New
	 * allows block_prepare_write() to zero.
	 */
	mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb),
			"ino %lu, iblock %llu\n", inode->i_ino,
			(unsigned long long)iblock);

	/* Treat the unwritten extent as a hole for zeroing purposes. */
	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
		map_bh(bh_result, inode->i_sb, p_blkno);

	if (!ocfs2_sparse_alloc(osb)) {
		if (p_blkno == 0) {
			err = -EIO;
			mlog(ML_ERROR,
			     "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
			     (unsigned long long)iblock,
			     (unsigned long long)p_blkno,
			     (unsigned long long)OCFS2_I(inode)->ip_blkno);
			mlog(ML_ERROR, "Size %llu, clusters %u\n",
			     (unsigned long long)i_size_read(inode),
			     OCFS2_I(inode)->ip_clusters);
			dump_stack();
		}

		past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
		mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
		     (unsigned long long)past_eof);

		if (create && (iblock >= past_eof))
			set_buffer_new(bh_result);
	}

bail:
	if (err < 0)
		err = -EIO;

	mlog_exit(err);
	return err;
}

static int ocfs2_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
	int ret, unlock = 1;

	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));

	ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
	if (ret != 0) {
		if (ret == AOP_TRUNCATED_PAGE)
			unlock = 0;
		mlog_errno(ret);
		goto out;
	}

	if (down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem) == 0) {
		ret = AOP_TRUNCATED_PAGE;
		goto out_meta_unlock;
	}

	/*
	 * i_size might have just been updated as we grabbed the meta lock.  We
	 * might now be discovering a truncate that hit on another node.
	 * block_read_full_page->get_block freaks out if it is asked to read
	 * beyond the end of a file, so we check here.  Callers
	 * (generic_file_read, fault->nopage) are clever enough to check i_size
	 * and notice that the page they just read isn't needed.
	 *
	 * XXX sys_readahead() seems to get that wrong?
	 */
	if (start >= i_size_read(inode)) {
		zero_user_page(page, 0, PAGE_SIZE, KM_USER0);
		SetPageUptodate(page);
		ret = 0;
		goto out_alloc;
	}

	ret = ocfs2_data_lock_with_page(inode, 0, page);
	if (ret != 0) {
		if (ret == AOP_TRUNCATED_PAGE)
			unlock = 0;
		mlog_errno(ret);
		goto out_alloc;
	}

	ret = block_read_full_page(page, ocfs2_get_block);
	unlock = 0;

	ocfs2_data_unlock(inode, 0);
out_alloc:
	up_read(&OCFS2_I(inode)->ip_alloc_sem);
out_meta_unlock:
	ocfs2_meta_unlock(inode, 0);
out:
	if (unlock)
		unlock_page(page);
	mlog_exit(ret);
	return ret;
}
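
/*
 * For reference, the ordering above is: cluster meta lock ->
 * ip_alloc_sem -> cluster data lock -> page I/O.  An
 * AOP_TRUNCATED_PAGE return tells the caller that the page was
 * unlocked out from under us and the operation should be retried.
 */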

/* Note: Because we don't support holes, our allocation has
 * already happened (allocation writes zeros to the file data)
 * so we don't have to worry about ordered writes in
 * ocfs2_writepage.
 *
 * ->writepage is called during the process of invalidating the page cache
 * during blocked lock processing.  It can't block on any cluster locks
 * during block mapping.  It's relying on the fact that the block
 * mapping can't have disappeared under the dirty pages that it is
 * being asked to write back.
 */
static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
{
	int ret;

	mlog_entry("(0x%p)\n", page);

	ret = block_write_full_page(page, ocfs2_get_block, wbc);

	mlog_exit(ret);

	return ret;
}

/*
 * This is called from ocfs2_write_zero_page() which has handled its
 * own cluster locking and has ensured allocation exists for those
 * blocks to be written.
 */
int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
			       unsigned from, unsigned to)
{
	int ret;

	down_read(&OCFS2_I(inode)->ip_alloc_sem);

	ret = block_prepare_write(page, from, to, ocfs2_get_block);

	up_read(&OCFS2_I(inode)->ip_alloc_sem);

	return ret;
}

/* Taken from ext3. We don't necessarily need the full blown
 * functionality yet, but IMHO it's better to cut and paste the whole
 * thing so we can avoid introducing our own bugs (and easily pick up
 * their fixes when they happen) --Mark */
int walk_page_buffers(handle_t *handle,
		      struct buffer_head *head,
		      unsigned from,
		      unsigned to,
		      int *partial,
		      int (*fn)(handle_t *handle,
				struct buffer_head *bh))
{
	struct buffer_head *bh;
	unsigned block_start, block_end;
	unsigned blocksize = head->b_size;
	int err, ret = 0;
	struct buffer_head *next;

	for (bh = head, block_start = 0;
	     ret == 0 && (bh != head || !block_start);
	     block_start = block_end, bh = next)
	{
		next = bh->b_this_page;
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (partial && !buffer_uptodate(bh))
				*partial = 1;
			continue;
		}
		err = (*fn)(handle, bh);
		if (!ret)
			ret = err;
	}
	return ret;
}
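
/*
 * Typical use in this file: both ocfs2_start_walk_page_trans() and
 * ocfs2_write_failure() below pass ocfs2_journal_dirty_data as 'fn',
 * so every buffer in the written range gets filed on the journal's
 * ordered-data list before the transaction commits.
 */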
348 348
349 handle_t *ocfs2_start_walk_page_trans(struct inode *inode, 349 handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
350 struct page *page, 350 struct page *page,
351 unsigned from, 351 unsigned from,
352 unsigned to) 352 unsigned to)
353 { 353 {
354 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 354 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
355 handle_t *handle = NULL; 355 handle_t *handle = NULL;
356 int ret = 0; 356 int ret = 0;
357 357
358 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 358 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
359 if (!handle) { 359 if (!handle) {
360 ret = -ENOMEM; 360 ret = -ENOMEM;
361 mlog_errno(ret); 361 mlog_errno(ret);
362 goto out; 362 goto out;
363 } 363 }
364 364
365 if (ocfs2_should_order_data(inode)) { 365 if (ocfs2_should_order_data(inode)) {
366 ret = walk_page_buffers(handle, 366 ret = walk_page_buffers(handle,
367 page_buffers(page), 367 page_buffers(page),
368 from, to, NULL, 368 from, to, NULL,
369 ocfs2_journal_dirty_data); 369 ocfs2_journal_dirty_data);
370 if (ret < 0) 370 if (ret < 0)
371 mlog_errno(ret); 371 mlog_errno(ret);
372 } 372 }
373 out: 373 out:
374 if (ret) { 374 if (ret) {
375 if (handle) 375 if (handle)
376 ocfs2_commit_trans(osb, handle); 376 ocfs2_commit_trans(osb, handle);
377 handle = ERR_PTR(ret); 377 handle = ERR_PTR(ret);
378 } 378 }
379 return handle; 379 return handle;
380 } 380 }
381 381
382 static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) 382 static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
383 { 383 {
384 sector_t status; 384 sector_t status;
385 u64 p_blkno = 0; 385 u64 p_blkno = 0;
386 int err = 0; 386 int err = 0;
387 struct inode *inode = mapping->host; 387 struct inode *inode = mapping->host;
388 388
389 mlog_entry("(block = %llu)\n", (unsigned long long)block); 389 mlog_entry("(block = %llu)\n", (unsigned long long)block);
390 390
391 /* We don't need to lock journal system files, since they aren't 391 /* We don't need to lock journal system files, since they aren't
392 * accessed concurrently from multiple nodes. 392 * accessed concurrently from multiple nodes.
393 */ 393 */
394 if (!INODE_JOURNAL(inode)) { 394 if (!INODE_JOURNAL(inode)) {
395 err = ocfs2_meta_lock(inode, NULL, 0); 395 err = ocfs2_meta_lock(inode, NULL, 0);
396 if (err) { 396 if (err) {
397 if (err != -ENOENT) 397 if (err != -ENOENT)
398 mlog_errno(err); 398 mlog_errno(err);
399 goto bail; 399 goto bail;
400 } 400 }
401 down_read(&OCFS2_I(inode)->ip_alloc_sem); 401 down_read(&OCFS2_I(inode)->ip_alloc_sem);
402 } 402 }
403 403
404 err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL); 404 err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL);
405 405
406 if (!INODE_JOURNAL(inode)) { 406 if (!INODE_JOURNAL(inode)) {
407 up_read(&OCFS2_I(inode)->ip_alloc_sem); 407 up_read(&OCFS2_I(inode)->ip_alloc_sem);
408 ocfs2_meta_unlock(inode, 0); 408 ocfs2_meta_unlock(inode, 0);
409 } 409 }
410 410
411 if (err) { 411 if (err) {
412 mlog(ML_ERROR, "get_blocks() failed, block = %llu\n", 412 mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
413 (unsigned long long)block); 413 (unsigned long long)block);
414 mlog_errno(err); 414 mlog_errno(err);
415 goto bail; 415 goto bail;
416 } 416 }
417 417
418 418
419 bail: 419 bail:
420 status = err ? 0 : p_blkno; 420 status = err ? 0 : p_blkno;
421 421
422 mlog_exit((int)status); 422 mlog_exit((int)status);
423 423
424 return status; 424 return status;
425 } 425 }
426 426
427 /* 427 /*
428 * TODO: Make this into a generic get_blocks function. 428 * TODO: Make this into a generic get_blocks function.
429 * 429 *
430 * From do_direct_io in direct-io.c: 430 * From do_direct_io in direct-io.c:
431 * "So what we do is to permit the ->get_blocks function to populate 431 * "So what we do is to permit the ->get_blocks function to populate
432 * bh.b_size with the size of IO which is permitted at this offset and 432 * bh.b_size with the size of IO which is permitted at this offset and
433 * this i_blkbits." 433 * this i_blkbits."
434 * 434 *
435 * This function is called directly from get_more_blocks in direct-io.c. 435 * This function is called directly from get_more_blocks in direct-io.c.
436 * 436 *
437 * called like this: dio->get_blocks(dio->inode, fs_startblk, 437 * called like this: dio->get_blocks(dio->inode, fs_startblk,
438 * fs_count, map_bh, dio->rw == WRITE); 438 * fs_count, map_bh, dio->rw == WRITE);
439 */ 439 */
440 static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, 440 static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
441 struct buffer_head *bh_result, int create) 441 struct buffer_head *bh_result, int create)
442 { 442 {
443 int ret; 443 int ret;
444 u64 p_blkno, inode_blocks, contig_blocks; 444 u64 p_blkno, inode_blocks, contig_blocks;
445 unsigned int ext_flags; 445 unsigned int ext_flags;
446 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 446 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
447 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; 447 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
448 448
449 /* This function won't even be called if the request isn't all 449 /* This function won't even be called if the request isn't all
450 * nicely aligned and of the right size, so there's no need 450 * nicely aligned and of the right size, so there's no need
451 * for us to check any of that. */ 451 * for us to check any of that. */
452 452
453 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 453 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
454 454
455 /* 455 /*
456 * Any write past EOF is not allowed because we'd be extending. 456 * Any write past EOF is not allowed because we'd be extending.
457 */ 457 */
458 if (create && (iblock + max_blocks) > inode_blocks) { 458 if (create && (iblock + max_blocks) > inode_blocks) {
459 ret = -EIO; 459 ret = -EIO;
460 goto bail; 460 goto bail;
461 } 461 }
462 462
463 /* This figures out the size of the next contiguous block, and 463 /* This figures out the size of the next contiguous block, and
464 * our logical offset */ 464 * our logical offset */
465 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, 465 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
466 &contig_blocks, &ext_flags); 466 &contig_blocks, &ext_flags);
467 if (ret) { 467 if (ret) {
468 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", 468 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
469 (unsigned long long)iblock); 469 (unsigned long long)iblock);
470 ret = -EIO; 470 ret = -EIO;
471 goto bail; 471 goto bail;
472 } 472 }
473 473
474 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) { 474 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) {
475 ocfs2_error(inode->i_sb, 475 ocfs2_error(inode->i_sb,
476 "Inode %llu has a hole at block %llu\n", 476 "Inode %llu has a hole at block %llu\n",
477 (unsigned long long)OCFS2_I(inode)->ip_blkno, 477 (unsigned long long)OCFS2_I(inode)->ip_blkno,
478 (unsigned long long)iblock); 478 (unsigned long long)iblock);
479 ret = -EROFS; 479 ret = -EROFS;
480 goto bail; 480 goto bail;
481 } 481 }
482 482
483 /* 483 /*
484 * get_more_blocks() expects us to describe a hole by clearing 484 * get_more_blocks() expects us to describe a hole by clearing
485 * the mapped bit on bh_result(). 485 * the mapped bit on bh_result().
486 * 486 *
487 * Consider an unwritten extent as a hole. 487 * Consider an unwritten extent as a hole.
488 */ 488 */
489 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 489 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
490 map_bh(bh_result, inode->i_sb, p_blkno); 490 map_bh(bh_result, inode->i_sb, p_blkno);
491 else { 491 else {
492 /* 492 /*
493 * ocfs2_prepare_inode_for_write() should have caught 493 * ocfs2_prepare_inode_for_write() should have caught
494 * the case where we'd be filling a hole and triggered 494 * the case where we'd be filling a hole and triggered
495 * a buffered write instead. 495 * a buffered write instead.
496 */ 496 */
497 if (create) { 497 if (create) {
498 ret = -EIO; 498 ret = -EIO;
499 mlog_errno(ret); 499 mlog_errno(ret);
500 goto bail; 500 goto bail;
501 } 501 }
502 502
503 clear_buffer_mapped(bh_result); 503 clear_buffer_mapped(bh_result);
504 } 504 }
505 505
506 /* make sure we don't map more than max_blocks blocks here as 506 /* make sure we don't map more than max_blocks blocks here as
507 that's all the kernel will handle at this point. */ 507 that's all the kernel will handle at this point. */
508 if (max_blocks < contig_blocks) 508 if (max_blocks < contig_blocks)
509 contig_blocks = max_blocks; 509 contig_blocks = max_blocks;
510 bh_result->b_size = contig_blocks << blocksize_bits; 510 bh_result->b_size = contig_blocks << blocksize_bits;
511 bail: 511 bail:
512 return ret; 512 return ret;
513 } 513 }
514 514
515 /* 515 /*
516 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're 516 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
517 * particularly interested in the aio/dio case. Like the core uses 517 * particularly interested in the aio/dio case. Like the core uses
518 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from 518 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
519 * truncation on another. 519 * truncation on another.
520 */ 520 */
521 static void ocfs2_dio_end_io(struct kiocb *iocb, 521 static void ocfs2_dio_end_io(struct kiocb *iocb,
522 loff_t offset, 522 loff_t offset,
523 ssize_t bytes, 523 ssize_t bytes,
524 void *private) 524 void *private)
525 { 525 {
526 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 526 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
527 int level; 527 int level;
528 528
529 /* this io's submitter should not have unlocked this before we could */ 529 /* this io's submitter should not have unlocked this before we could */
530 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 530 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
531 531
532 ocfs2_iocb_clear_rw_locked(iocb); 532 ocfs2_iocb_clear_rw_locked(iocb);
533 533
534 level = ocfs2_iocb_rw_locked_level(iocb); 534 level = ocfs2_iocb_rw_locked_level(iocb);
535 if (!level) 535 if (!level)
536 up_read(&inode->i_alloc_sem); 536 up_read(&inode->i_alloc_sem);
537 ocfs2_rw_unlock(inode, level); 537 ocfs2_rw_unlock(inode, level);
538 } 538 }
539 539
540 /* 540 /*
541 * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen 541 * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen
542 * from ext3. PageChecked() bits have been removed as OCFS2 does not 542 * from ext3. PageChecked() bits have been removed as OCFS2 does not
543 * do journalled data. 543 * do journalled data.
544 */ 544 */
545 static void ocfs2_invalidatepage(struct page *page, unsigned long offset) 545 static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
546 { 546 {
547 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; 547 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
548 548
549 journal_invalidatepage(journal, page, offset); 549 journal_invalidatepage(journal, page, offset);
550 } 550 }
551 551
552 static int ocfs2_releasepage(struct page *page, gfp_t wait) 552 static int ocfs2_releasepage(struct page *page, gfp_t wait)
553 { 553 {
554 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; 554 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
555 555
556 if (!page_has_buffers(page)) 556 if (!page_has_buffers(page))
557 return 0; 557 return 0;
558 return journal_try_to_free_buffers(journal, page, wait); 558 return journal_try_to_free_buffers(journal, page, wait);
559 } 559 }
560 560
561 static ssize_t ocfs2_direct_IO(int rw, 561 static ssize_t ocfs2_direct_IO(int rw,
562 struct kiocb *iocb, 562 struct kiocb *iocb,
563 const struct iovec *iov, 563 const struct iovec *iov,
564 loff_t offset, 564 loff_t offset,
565 unsigned long nr_segs) 565 unsigned long nr_segs)
566 { 566 {
567 struct file *file = iocb->ki_filp; 567 struct file *file = iocb->ki_filp;
568 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; 568 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
569 int ret; 569 int ret;
570 570
571 mlog_entry_void(); 571 mlog_entry_void();
572 572
573 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { 573 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
574 /* 574 /*
575 * We get PR data locks even for O_DIRECT. This 575 * We get PR data locks even for O_DIRECT. This
576 * allows concurrent O_DIRECT I/O but doesn't let 576 * allows concurrent O_DIRECT I/O but doesn't let
577 * O_DIRECT with extending and buffered zeroing writes 577 * O_DIRECT with extending and buffered zeroing writes
578 * race. If they did race then the buffered zeroing 578 * race. If they did race then the buffered zeroing
579 * could be written back after the O_DIRECT I/O. It's 579 * could be written back after the O_DIRECT I/O. It's
580 * one thing to tell people not to mix buffered and 580 * one thing to tell people not to mix buffered and
581 * O_DIRECT writes, but expecting them to understand 581 * O_DIRECT writes, but expecting them to understand
582 * that file extension is also an implicit buffered 582 * that file extension is also an implicit buffered
583 * write is too much. By getting the PR we force 583 * write is too much. By getting the PR we force
584 * writeback of the buffered zeroing before 584 * writeback of the buffered zeroing before
585 * proceeding. 585 * proceeding.
586 */ 586 */
587 ret = ocfs2_data_lock(inode, 0); 587 ret = ocfs2_data_lock(inode, 0);
588 if (ret < 0) { 588 if (ret < 0) {
589 mlog_errno(ret); 589 mlog_errno(ret);
590 goto out; 590 goto out;
591 } 591 }
592 ocfs2_data_unlock(inode, 0); 592 ocfs2_data_unlock(inode, 0);
593 } 593 }
594 594
595 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 595 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
596 inode->i_sb->s_bdev, iov, offset, 596 inode->i_sb->s_bdev, iov, offset,
597 nr_segs, 597 nr_segs,
598 ocfs2_direct_IO_get_blocks, 598 ocfs2_direct_IO_get_blocks,
599 ocfs2_dio_end_io); 599 ocfs2_dio_end_io);
600 out: 600 out:
601 mlog_exit(ret); 601 mlog_exit(ret);
602 return ret; 602 return ret;
603 } 603 }
604 604
605 static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, 605 static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
606 u32 cpos, 606 u32 cpos,
607 unsigned int *start, 607 unsigned int *start,
608 unsigned int *end) 608 unsigned int *end)
609 { 609 {
610 unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE; 610 unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
611 611
612 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) { 612 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
613 unsigned int cpp; 613 unsigned int cpp;
614 614
615 cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits); 615 cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
616 616
617 cluster_start = cpos % cpp; 617 cluster_start = cpos % cpp;
618 cluster_start = cluster_start << osb->s_clustersize_bits; 618 cluster_start = cluster_start << osb->s_clustersize_bits;
619 619
620 cluster_end = cluster_start + osb->s_clustersize; 620 cluster_end = cluster_start + osb->s_clustersize;
621 } 621 }
622 622
623 BUG_ON(cluster_start > PAGE_SIZE); 623 BUG_ON(cluster_start > PAGE_SIZE);
624 BUG_ON(cluster_end > PAGE_SIZE); 624 BUG_ON(cluster_end > PAGE_SIZE);
625 625
626 if (start) 626 if (start)
627 *start = cluster_start; 627 *start = cluster_start;
628 if (end) 628 if (end)
629 *end = cluster_end; 629 *end = cluster_end;
630 } 630 }
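
/*
 * Worked example for the math above (hypothetical geometry): with
 * 64KB pages (PAGE_CACHE_SHIFT == 16) and 4KB clusters
 * (s_clustersize_bits == 12), cpp == 1 << 4 == 16 clusters per page.
 * For cpos == 21, cpos % cpp == 5, so cluster_start == 5 << 12 ==
 * 20480 and cluster_end == 24576, i.e. the zeroing done below is
 * confined to that one 4KB window within the page.
 */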

/*
 * 'from' and 'to' are the region in the page to avoid zeroing.
 *
 * If pagesize > clustersize, this function will avoid zeroing outside
 * of the cluster boundary.
 *
 * from == to == 0 is code for "zero the entire cluster region"
 */
static void ocfs2_clear_page_regions(struct page *page,
				     struct ocfs2_super *osb, u32 cpos,
				     unsigned from, unsigned to)
{
	void *kaddr;
	unsigned int cluster_start, cluster_end;

	ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);

	kaddr = kmap_atomic(page, KM_USER0);

	if (from || to) {
		if (from > cluster_start)
			memset(kaddr + cluster_start, 0, from - cluster_start);
		if (to < cluster_end)
			memset(kaddr + to, 0, cluster_end - to);
	} else {
		memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
	}

	kunmap_atomic(kaddr, KM_USER0);
}

/*
 * Some of this taken from block_prepare_write().  We already have our
 * mapping by now though, and the entire write will be allocating or
 * it won't, so not much need to use BH_New.
 *
 * This will also skip zeroing, which is handled externally.
 */
int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
			  struct inode *inode, unsigned int from,
			  unsigned int to, int new)
{
	int ret = 0;
	struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
	unsigned int block_end, block_start;
	unsigned int bsize = 1 << inode->i_blkbits;

	if (!page_has_buffers(page))
		create_empty_buffers(page, bsize, 0);

	head = page_buffers(page);
	for (bh = head, block_start = 0; bh != head || !block_start;
	     bh = bh->b_this_page, block_start += bsize) {
		block_end = block_start + bsize;

		clear_buffer_new(bh);

		/*
		 * Ignore blocks outside of our i/o range -
		 * they may belong to unallocated clusters.
		 */
		if (block_start >= to || block_end <= from) {
			if (PageUptodate(page))
				set_buffer_uptodate(bh);
			continue;
		}

		/*
		 * For an allocating write with cluster size >= page
		 * size, we always write the entire page.
		 */
		if (new)
			set_buffer_new(bh);

		if (!buffer_mapped(bh)) {
			map_bh(bh, inode->i_sb, *p_blkno);
			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
		}

		if (PageUptodate(page)) {
			if (!buffer_uptodate(bh))
				set_buffer_uptodate(bh);
		} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
			   (block_start < from || block_end > to)) {
			ll_rw_block(READ, 1, &bh);
			*wait_bh++=bh;
		}

		*p_blkno = *p_blkno + 1;
	}

	/*
	 * If we issued read requests - let them complete.
	 */
	while(wait_bh > wait) {
		wait_on_buffer(*--wait_bh);
		if (!buffer_uptodate(*wait_bh))
			ret = -EIO;
	}

	if (ret == 0 || !new)
		return ret;

	/*
	 * If we get -EIO above, zero out any newly allocated blocks
	 * to avoid exposing stale data.
	 */
	bh = head;
	block_start = 0;
	do {
		void *kaddr;

		block_end = block_start + bsize;
		if (block_end <= from)
			goto next_bh;
		if (block_start >= to)
			break;

		kaddr = kmap_atomic(page, KM_USER0);
		memset(kaddr+block_start, 0, bh->b_size);
		flush_dcache_page(page);
		kunmap_atomic(kaddr, KM_USER0);
		set_buffer_uptodate(bh);
		mark_buffer_dirty(bh);

next_bh:
		block_start = block_end;
		bh = bh->b_this_page;
	} while (bh != head);

	return ret;
}

#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
#define OCFS2_MAX_CTXT_PAGES	1
#else
#define OCFS2_MAX_CTXT_PAGES	(OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
#endif

#define OCFS2_MAX_CLUSTERS_PER_PAGE	(PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)

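/*
 * Example of the arithmetic above (constants from ocfs2_fs.h, where
 * OCFS2_MAX_CLUSTERSIZE is 1MB and OCFS2_MIN_CLUSTERSIZE is 4KB):
 * with 4KB pages, OCFS2_MAX_CTXT_PAGES is 1MB / 4KB == 256 and
 * OCFS2_MAX_CLUSTERS_PER_PAGE is 4KB / 4KB == 1; with 64KB pages the
 * figures become 16 and 16 respectively.
 */
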
773 /* 773 /*
774 * Describe the state of a single cluster to be written to. 774 * Describe the state of a single cluster to be written to.
775 */ 775 */
776 struct ocfs2_write_cluster_desc { 776 struct ocfs2_write_cluster_desc {
777 u32 c_cpos; 777 u32 c_cpos;
778 u32 c_phys; 778 u32 c_phys;
779 /* 779 /*
780 * Give this a unique field because c_phys eventually gets 780 * Give this a unique field because c_phys eventually gets
781 * filled. 781 * filled.
782 */ 782 */
783 unsigned c_new; 783 unsigned c_new;
784 }; 784 };
785 785
786 struct ocfs2_write_ctxt { 786 struct ocfs2_write_ctxt {
787 /* Logical cluster position / len of write */ 787 /* Logical cluster position / len of write */
788 u32 w_cpos; 788 u32 w_cpos;
789 u32 w_clen; 789 u32 w_clen;
790 790
791 struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; 791 struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
792 792
793 /* 793 /*
794 * This is true if page_size > cluster_size. 794 * This is true if page_size > cluster_size.
795 * 795 *
796 * It triggers a set of special cases during write which might 796 * It triggers a set of special cases during write which might
797 * have to deal with allocating writes to partial pages. 797 * have to deal with allocating writes to partial pages.
798 */ 798 */
799 unsigned int w_large_pages; 799 unsigned int w_large_pages;
800 800
801 /* 801 /*
802 * Pages involved in this write. 802 * Pages involved in this write.
803 * 803 *
804 * w_target_page is the page being written to by the user. 804 * w_target_page is the page being written to by the user.
805 * 805 *
806 * w_pages is an array of pages which always contains 806 * w_pages is an array of pages which always contains
807 * w_target_page, and in the case of an allocating write with 807 * w_target_page, and in the case of an allocating write with
808 * page_size < cluster size, it will contain zero'd and mapped 808 * page_size < cluster size, it will contain zero'd and mapped
809 * pages adjacent to w_target_page which need to be written 809 * pages adjacent to w_target_page which need to be written
810 * out in so that future reads from that region will get 810 * out in so that future reads from that region will get
811 * zero's. 811 * zero's.
812 */ 812 */
813 struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; 813 struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
814 unsigned int w_num_pages; 814 unsigned int w_num_pages;
815 struct page *w_target_page; 815 struct page *w_target_page;
816 816
817 /* 817 /*
818 * ocfs2_write_end() uses this to know what the real range to 818 * ocfs2_write_end() uses this to know what the real range to
819 * write in the target should be. 819 * write in the target should be.
820 */ 820 */
821 unsigned int w_target_from; 821 unsigned int w_target_from;
822 unsigned int w_target_to; 822 unsigned int w_target_to;
823 823
824 /* 824 /*
825 * We could use journal_current_handle() but this is cleaner, 825 * We could use journal_current_handle() but this is cleaner,
826 * IMHO -Mark 826 * IMHO -Mark
827 */ 827 */
828 handle_t *w_handle; 828 handle_t *w_handle;
829 829
830 struct buffer_head *w_di_bh; 830 struct buffer_head *w_di_bh;
831 }; 831 };
832 832
833 static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) 833 static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
834 { 834 {
835 int i; 835 int i;
836 836
837 for(i = 0; i < wc->w_num_pages; i++) { 837 for(i = 0; i < wc->w_num_pages; i++) {
838 if (wc->w_pages[i] == NULL) 838 if (wc->w_pages[i] == NULL)
839 continue; 839 continue;
840 840
841 unlock_page(wc->w_pages[i]); 841 unlock_page(wc->w_pages[i]);
842 mark_page_accessed(wc->w_pages[i]); 842 mark_page_accessed(wc->w_pages[i]);
843 page_cache_release(wc->w_pages[i]); 843 page_cache_release(wc->w_pages[i]);
844 } 844 }
845 845
846 brelse(wc->w_di_bh); 846 brelse(wc->w_di_bh);
847 kfree(wc); 847 kfree(wc);
848 } 848 }
849 849
850 static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, 850 static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
851 struct ocfs2_super *osb, loff_t pos, 851 struct ocfs2_super *osb, loff_t pos,
852 unsigned len, struct buffer_head *di_bh) 852 unsigned len, struct buffer_head *di_bh)
853 { 853 {
854 struct ocfs2_write_ctxt *wc; 854 struct ocfs2_write_ctxt *wc;
855 855
856 wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS); 856 wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS);
857 if (!wc) 857 if (!wc)
858 return -ENOMEM; 858 return -ENOMEM;
859 859
860 wc->w_cpos = pos >> osb->s_clustersize_bits; 860 wc->w_cpos = pos >> osb->s_clustersize_bits;
861 wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len); 861 wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len);
862 get_bh(di_bh); 862 get_bh(di_bh);
863 wc->w_di_bh = di_bh; 863 wc->w_di_bh = di_bh;
864 864
865 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) 865 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
866 wc->w_large_pages = 1; 866 wc->w_large_pages = 1;
867 else 867 else
868 wc->w_large_pages = 0; 868 wc->w_large_pages = 0;
869 869
870 *wcp = wc; 870 *wcp = wc;
871 871
872 return 0; 872 return 0;
873 } 873 }
874 874
875 /* 875 /*
876 * If a page has any new buffers, zero them out here, and mark them uptodate 876 * If a page has any new buffers, zero them out here, and mark them uptodate
877 * and dirty so they'll be written out (in order to prevent uninitialised 877 * and dirty so they'll be written out (in order to prevent uninitialised
878 * block data from leaking). And clear the new bit. 878 * block data from leaking). And clear the new bit.
879 */ 879 */
880 static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to) 880 static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to)
881 { 881 {
882 unsigned int block_start, block_end; 882 unsigned int block_start, block_end;
883 struct buffer_head *head, *bh; 883 struct buffer_head *head, *bh;
884 884
885 BUG_ON(!PageLocked(page)); 885 BUG_ON(!PageLocked(page));
886 if (!page_has_buffers(page)) 886 if (!page_has_buffers(page))
887 return; 887 return;
888 888
889 bh = head = page_buffers(page); 889 bh = head = page_buffers(page);
890 block_start = 0; 890 block_start = 0;
891 do { 891 do {
892 block_end = block_start + bh->b_size; 892 block_end = block_start + bh->b_size;
893 893
894 if (buffer_new(bh)) { 894 if (buffer_new(bh)) {
895 if (block_end > from && block_start < to) { 895 if (block_end > from && block_start < to) {
896 if (!PageUptodate(page)) { 896 if (!PageUptodate(page)) {
897 unsigned start, end; 897 unsigned start, end;
898 void *kaddr; 898 void *kaddr;
899 899
900 start = max(from, block_start); 900 start = max(from, block_start);
901 end = min(to, block_end); 901 end = min(to, block_end);
902 902
903 kaddr = kmap_atomic(page, KM_USER0); 903 kaddr = kmap_atomic(page, KM_USER0);
904 memset(kaddr+start, 0, end - start); 904 memset(kaddr+start, 0, end - start);
905 flush_dcache_page(page); 905 flush_dcache_page(page);
906 kunmap_atomic(kaddr, KM_USER0); 906 kunmap_atomic(kaddr, KM_USER0);
907 set_buffer_uptodate(bh); 907 set_buffer_uptodate(bh);
908 } 908 }
909 909
910 clear_buffer_new(bh); 910 clear_buffer_new(bh);
911 mark_buffer_dirty(bh); 911 mark_buffer_dirty(bh);
912 } 912 }
913 } 913 }
914 914
915 block_start = block_end; 915 block_start = block_end;
916 bh = bh->b_this_page; 916 bh = bh->b_this_page;
917 } while (bh != head); 917 } while (bh != head);
918 } 918 }
919 919
920 /* 920 /*
921 * Only called when we have a failure during allocating write to write 921 * Only called when we have a failure during allocating write to write
922 * zero's to the newly allocated region. 922 * zero's to the newly allocated region.
923 */ 923 */
924 static void ocfs2_write_failure(struct inode *inode, 924 static void ocfs2_write_failure(struct inode *inode,
925 struct ocfs2_write_ctxt *wc, 925 struct ocfs2_write_ctxt *wc,
926 loff_t user_pos, unsigned user_len) 926 loff_t user_pos, unsigned user_len)
927 { 927 {
928 int i; 928 int i;
929 unsigned from, to; 929 unsigned from, to;
930 struct page *tmppage; 930 struct page *tmppage;
931 931
932 ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len); 932 ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len);
933 933
934 if (wc->w_large_pages) { 934 if (wc->w_large_pages) {
935 from = wc->w_target_from; 935 from = wc->w_target_from;
936 to = wc->w_target_to; 936 to = wc->w_target_to;
937 } else { 937 } else {
938 from = 0; 938 from = 0;
939 to = PAGE_CACHE_SIZE; 939 to = PAGE_CACHE_SIZE;
940 } 940 }
941 941
942 for(i = 0; i < wc->w_num_pages; i++) { 942 for(i = 0; i < wc->w_num_pages; i++) {
943 tmppage = wc->w_pages[i]; 943 tmppage = wc->w_pages[i];
944 944
945 if (ocfs2_should_order_data(inode)) 945 if (ocfs2_should_order_data(inode))
946 walk_page_buffers(wc->w_handle, page_buffers(tmppage), 946 walk_page_buffers(wc->w_handle, page_buffers(tmppage),
947 from, to, NULL, 947 from, to, NULL,
948 ocfs2_journal_dirty_data); 948 ocfs2_journal_dirty_data);
949 949
950 block_commit_write(tmppage, from, to); 950 block_commit_write(tmppage, from, to);
951 } 951 }
952 } 952 }
953 953
954 static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, 954 static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
955 struct ocfs2_write_ctxt *wc, 955 struct ocfs2_write_ctxt *wc,
956 struct page *page, u32 cpos, 956 struct page *page, u32 cpos,
957 loff_t user_pos, unsigned user_len, 957 loff_t user_pos, unsigned user_len,
958 int new) 958 int new)
959 { 959 {
960 int ret; 960 int ret;
961 unsigned int map_from = 0, map_to = 0; 961 unsigned int map_from = 0, map_to = 0;
962 unsigned int cluster_start, cluster_end; 962 unsigned int cluster_start, cluster_end;
963 unsigned int user_data_from = 0, user_data_to = 0; 963 unsigned int user_data_from = 0, user_data_to = 0;
964 964
965 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos, 965 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
966 &cluster_start, &cluster_end); 966 &cluster_start, &cluster_end);
967 967
968 if (page == wc->w_target_page) { 968 if (page == wc->w_target_page) {
969 map_from = user_pos & (PAGE_CACHE_SIZE - 1); 969 map_from = user_pos & (PAGE_CACHE_SIZE - 1);
970 map_to = map_from + user_len; 970 map_to = map_from + user_len;
971 971
972 if (new) 972 if (new)
973 ret = ocfs2_map_page_blocks(page, p_blkno, inode, 973 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
974 cluster_start, cluster_end, 974 cluster_start, cluster_end,
975 new); 975 new);
976 else 976 else
977 ret = ocfs2_map_page_blocks(page, p_blkno, inode, 977 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
978 map_from, map_to, new); 978 map_from, map_to, new);
979 if (ret) { 979 if (ret) {
980 mlog_errno(ret); 980 mlog_errno(ret);
981 goto out; 981 goto out;
982 } 982 }
983 983
984 user_data_from = map_from; 984 user_data_from = map_from;
985 user_data_to = map_to; 985 user_data_to = map_to;
986 if (new) { 986 if (new) {
987 map_from = cluster_start; 987 map_from = cluster_start;
988 map_to = cluster_end; 988 map_to = cluster_end;
989 } 989 }
990 990
991 wc->w_target_from = map_from; 991 wc->w_target_from = map_from;
992 wc->w_target_to = map_to; 992 wc->w_target_to = map_to;
993 } else { 993 } else {
994 /* 994 /*
995 * If we haven't allocated the new page yet, we 995 * If we haven't allocated the new page yet, we
996 * shouldn't be writing it out without copying user 996 * shouldn't be writing it out without copying user
997 * data. This is likely a math error from the caller. 997 * data. This is likely a math error from the caller.
998 */ 998 */
999 BUG_ON(!new); 999 BUG_ON(!new);
1000 1000
1001 map_from = cluster_start; 1001 map_from = cluster_start;
1002 map_to = cluster_end; 1002 map_to = cluster_end;
1003 1003
1004 ret = ocfs2_map_page_blocks(page, p_blkno, inode, 1004 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
1005 cluster_start, cluster_end, new); 1005 cluster_start, cluster_end, new);
1006 if (ret) { 1006 if (ret) {
1007 mlog_errno(ret); 1007 mlog_errno(ret);
1008 goto out; 1008 goto out;
1009 } 1009 }
1010 } 1010 }
1011 1011
1012 /* 1012 /*
1013 * Parts of newly allocated pages need to be zero'd. 1013 * Parts of newly allocated pages need to be zero'd.
1014 * 1014 *
1015 * Above, we have also rewritten 'to' and 'from' - as far as 1015 * Above, we have also rewritten 'to' and 'from' - as far as
1016 * the rest of the function is concerned, the entire cluster 1016 * the rest of the function is concerned, the entire cluster
1017 * range inside of a page needs to be written. 1017 * range inside of a page needs to be written.
1018 * 1018 *
1019 * We can skip this if the page is up to date - it's already 1019 * We can skip this if the page is up to date - it's already
1020 * been zero'd from being read in as a hole. 1020 * been zero'd from being read in as a hole.
1021 */ 1021 */
1022 if (new && !PageUptodate(page)) 1022 if (new && !PageUptodate(page))
1023 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), 1023 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
1024 cpos, user_data_from, user_data_to); 1024 cpos, user_data_from, user_data_to);
1025 1025
1026 flush_dcache_page(page); 1026 flush_dcache_page(page);
1027 1027
1028 out: 1028 out:
1029 return ret; 1029 return ret;
1030 } 1030 }
1031 1031
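The target-page branch above is mostly offset arithmetic. Below is a minimal, runnable userspace sketch of the same masking and cluster-widening logic, assuming 4K pages, a cluster at least page sized, and hypothetical write values; it only illustrates the math and is not ocfs2 code.

#include <stdio.h>

#define PAGE_CACHE_SIZE 4096u	/* assumed 4K pages */

int main(void)
{
	unsigned long long user_pos = 10000;	/* hypothetical write offset */
	unsigned int user_len = 100;		/* hypothetical write length */
	unsigned int cluster_start = 0;		/* cluster covers whole page */
	unsigned int cluster_end = PAGE_CACHE_SIZE;
	int is_new = 1;				/* allocating write */
	unsigned int map_from, map_to;

	/* Same masking as ocfs2_prepare_page_for_write(): the offset of
	 * the user data within the target page. */
	map_from = user_pos & (PAGE_CACHE_SIZE - 1);
	map_to = map_from + user_len;

	/* An allocating write widens the mapped range to the cluster
	 * boundaries so the entire new cluster gets written out. */
	if (is_new) {
		map_from = cluster_start;
		map_to = cluster_end;
	}

	printf("map [%u, %u) in page\n", map_from, map_to);	/* [0, 4096) */
	return 0;
}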
1032 /* 1032 /*
1033 * This function will only grab one cluster's worth of pages. 1033 * This function will only grab one cluster's worth of pages.
1034 */ 1034 */
1035 static int ocfs2_grab_pages_for_write(struct address_space *mapping, 1035 static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1036 struct ocfs2_write_ctxt *wc, 1036 struct ocfs2_write_ctxt *wc,
1037 u32 cpos, loff_t user_pos, int new) 1037 u32 cpos, loff_t user_pos, int new,
1038 struct page *mmap_page)
1038 { 1039 {
1039 int ret = 0, i; 1040 int ret = 0, i;
1040 unsigned long start, target_index, index; 1041 unsigned long start, target_index, index;
1041 struct inode *inode = mapping->host; 1042 struct inode *inode = mapping->host;
1042 1043
1043 target_index = user_pos >> PAGE_CACHE_SHIFT; 1044 target_index = user_pos >> PAGE_CACHE_SHIFT;
1044 1045
1045 /* 1046 /*
1046 * Figure out how many pages we'll be manipulating here. For 1047 * Figure out how many pages we'll be manipulating here. For
1047 * a non-allocating write, we just change the one 1048 * a non-allocating write, we just change the one
1048 * page. Otherwise, we'll need a whole cluster's worth. 1049 * page. Otherwise, we'll need a whole cluster's worth.
1049 */ 1050 */
1050 if (new) { 1051 if (new) {
1051 wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); 1052 wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
1052 start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); 1053 start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
1053 } else { 1054 } else {
1054 wc->w_num_pages = 1; 1055 wc->w_num_pages = 1;
1055 start = target_index; 1056 start = target_index;
1056 } 1057 }
1057 1058
1058 for(i = 0; i < wc->w_num_pages; i++) { 1059 for(i = 0; i < wc->w_num_pages; i++) {
1059 index = start + i; 1060 index = start + i;
1060 1061
1061 wc->w_pages[i] = find_or_create_page(mapping, index, GFP_NOFS); 1062 if (index == target_index && mmap_page) {
1062 if (!wc->w_pages[i]) { 1063 /*
1063 ret = -ENOMEM; 1064 * ocfs2_page_mkwrite() is a little different
1064 mlog_errno(ret); 1065 * and wants us to directly use the page
1065 goto out; 1066 * passed in.
1067 */
1068 lock_page(mmap_page);
1069
1070 if (mmap_page->mapping != mapping) {
1071 unlock_page(mmap_page);
1072 /*
1073 * Sanity check - the locking in
1074 * ocfs2_page_mkwrite() should ensure
1075 * that this code doesn't trigger.
1076 */
1077 ret = -EINVAL;
1078 mlog_errno(ret);
1079 goto out;
1080 }
1081
1082 page_cache_get(mmap_page);
1083 wc->w_pages[i] = mmap_page;
1084 } else {
1085 wc->w_pages[i] = find_or_create_page(mapping, index,
1086 GFP_NOFS);
1087 if (!wc->w_pages[i]) {
1088 ret = -ENOMEM;
1089 mlog_errno(ret);
1090 goto out;
1091 }
1066 } 1092 }
1067 1093
1068 if (index == target_index) 1094 if (index == target_index)
1069 wc->w_target_page = wc->w_pages[i]; 1095 wc->w_target_page = wc->w_pages[i];
1070 } 1096 }
1071 out: 1097 out:
1072 return ret; 1098 return ret;
1073 } 1099 }
1074 1100
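The mmap_page branch matters because ->page_mkwrite() hands us the very page the fault path is working on; allocating a second page for the same index would disconnect the write from the fault. The lock-then-revalidate step defends against a racing truncate. A generic sketch of that idiom in kernel context (the helper name is hypothetical):

static int use_faulted_page(struct address_space *mapping,
			    struct page *page)
{
	lock_page(page);
	if (page->mapping != mapping) {
		/* Lost a race with truncate/invalidate: the page was
		 * removed from this mapping while we waited. */
		unlock_page(page);
		return -EINVAL;
	}
	page_cache_get(page);	/* take our own reference */
	return 0;
}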
1075 /* 1101 /*
1076 * Prepare a single cluster for writing into the file. 1102 * Prepare a single cluster for writing into the file.
1077 */ 1103 */
1078 static int ocfs2_write_cluster(struct address_space *mapping, 1104 static int ocfs2_write_cluster(struct address_space *mapping,
1079 u32 phys, struct ocfs2_alloc_context *data_ac, 1105 u32 phys, struct ocfs2_alloc_context *data_ac,
1080 struct ocfs2_alloc_context *meta_ac, 1106 struct ocfs2_alloc_context *meta_ac,
1081 struct ocfs2_write_ctxt *wc, u32 cpos, 1107 struct ocfs2_write_ctxt *wc, u32 cpos,
1082 loff_t user_pos, unsigned user_len) 1108 loff_t user_pos, unsigned user_len)
1083 { 1109 {
1084 int ret, i, new; 1110 int ret, i, new;
1085 u64 v_blkno, p_blkno; 1111 u64 v_blkno, p_blkno;
1086 struct inode *inode = mapping->host; 1112 struct inode *inode = mapping->host;
1087 1113
1088 new = phys == 0 ? 1 : 0; 1114 new = phys == 0 ? 1 : 0;
1089 1115
1090 if (new) { 1116 if (new) {
1091 u32 tmp_pos; 1117 u32 tmp_pos;
1092 1118
1093 /* 1119 /*
1094 * This is safe to call with the page locks - it won't take 1120 * This is safe to call with the page locks - it won't take
1095 * any additional semaphores or cluster locks. 1121 * any additional semaphores or cluster locks.
1096 */ 1122 */
1097 tmp_pos = cpos; 1123 tmp_pos = cpos;
1098 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, 1124 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
1099 &tmp_pos, 1, wc->w_di_bh, 1125 &tmp_pos, 1, wc->w_di_bh,
1100 wc->w_handle, data_ac, 1126 wc->w_handle, data_ac,
1101 meta_ac, NULL); 1127 meta_ac, NULL);
1102 /* 1128 /*
1103 * This shouldn't happen because we must have already 1129 * This shouldn't happen because we must have already
1104 * calculated the correct meta data allocation required. The 1130 * calculated the correct meta data allocation required. The
1105 * internal tree allocation code should know how to increase 1131 * internal tree allocation code should know how to increase
1106 * transaction credits itself. 1132 * transaction credits itself.
1107 * 1133 *
1108 * If need be, we could handle -EAGAIN for a 1134 * If need be, we could handle -EAGAIN for a
1109 * RESTART_TRANS here. 1135 * RESTART_TRANS here.
1110 */ 1136 */
1111 mlog_bug_on_msg(ret == -EAGAIN, 1137 mlog_bug_on_msg(ret == -EAGAIN,
1112 "Inode %llu: EAGAIN return during allocation.\n", 1138 "Inode %llu: EAGAIN return during allocation.\n",
1113 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1139 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1114 if (ret < 0) { 1140 if (ret < 0) {
1115 mlog_errno(ret); 1141 mlog_errno(ret);
1116 goto out; 1142 goto out;
1117 } 1143 }
1118 1144
1119 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos); 1145 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
1120 } else { 1146 } else {
1121 v_blkno = user_pos >> inode->i_sb->s_blocksize_bits; 1147 v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
1122 } 1148 }
1123 1149
1124 /* 1150 /*
1125 * The only reason this should fail is due to an inability to 1151 * The only reason this should fail is due to an inability to
1126 * find the extent added. 1152 * find the extent added.
1127 */ 1153 */
1128 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, 1154 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
1129 NULL); 1155 NULL);
1130 if (ret < 0) { 1156 if (ret < 0) {
1131 ocfs2_error(inode->i_sb, "Corrupting extent for inode %llu, " 1157 ocfs2_error(inode->i_sb, "Corrupting extent for inode %llu, "
1132 "at logical block %llu", 1158 "at logical block %llu",
1133 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1159 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1134 (unsigned long long)v_blkno); 1160 (unsigned long long)v_blkno);
1135 goto out; 1161 goto out;
1136 } 1162 }
1137 1163
1138 BUG_ON(p_blkno == 0); 1164 BUG_ON(p_blkno == 0);
1139 1165
1140 for(i = 0; i < wc->w_num_pages; i++) { 1166 for(i = 0; i < wc->w_num_pages; i++) {
1141 int tmpret; 1167 int tmpret;
1142 1168
1143 tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, 1169 tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
1144 wc->w_pages[i], cpos, 1170 wc->w_pages[i], cpos,
1145 user_pos, user_len, new); 1171 user_pos, user_len, new);
1146 if (tmpret) { 1172 if (tmpret) {
1147 mlog_errno(tmpret); 1173 mlog_errno(tmpret);
1148 if (ret == 0) 1174 if (ret == 0)
1149 ret = tmpret; 1175 ret = tmpret;
1150 } 1176 }
1151 } 1177 }
1152 1178
1153 /* 1179 /*
1154 * We only have cleanup to do in case of allocating write. 1180 * We only have cleanup to do in case of allocating write.
1155 */ 1181 */
1156 if (ret && new) 1182 if (ret && new)
1157 ocfs2_write_failure(inode, wc, user_pos, user_len); 1183 ocfs2_write_failure(inode, wc, user_pos, user_len);
1158 1184
1159 out: 1185 out:
1160 1186
1161 return ret; 1187 return ret;
1162 } 1188 }
1163 1189
1164 /* 1190 /*
1165 * ocfs2_write_end() wants to know which parts of the target page it 1191 * ocfs2_write_end() wants to know which parts of the target page it
1166 * should complete the write on. It's easiest to compute them ahead of 1192 * should complete the write on. It's easiest to compute them ahead of
1167 * time when a more complete view of the write is available. 1193 * time when a more complete view of the write is available.
1168 */ 1194 */
1169 static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, 1195 static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
1170 struct ocfs2_write_ctxt *wc, 1196 struct ocfs2_write_ctxt *wc,
1171 loff_t pos, unsigned len, int alloc) 1197 loff_t pos, unsigned len, int alloc)
1172 { 1198 {
1173 struct ocfs2_write_cluster_desc *desc; 1199 struct ocfs2_write_cluster_desc *desc;
1174 1200
1175 wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1); 1201 wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
1176 wc->w_target_to = wc->w_target_from + len; 1202 wc->w_target_to = wc->w_target_from + len;
1177 1203
1178 if (alloc == 0) 1204 if (alloc == 0)
1179 return; 1205 return;
1180 1206
1181 /* 1207 /*
1182 * Allocating write - we may have different boundaries based 1208 * Allocating write - we may have different boundaries based
1183 * on page size and cluster size. 1209 * on page size and cluster size.
1184 * 1210 *
1185 * NOTE: We can no longer compute one value from the other as 1211 * NOTE: We can no longer compute one value from the other as
1186 * the actual write length and user provided length may be 1212 * the actual write length and user provided length may be
1187 * different. 1213 * different.
1188 */ 1214 */
1189 1215
1190 if (wc->w_large_pages) { 1216 if (wc->w_large_pages) {
1191 /* 1217 /*
1192 * We only care about the 1st and last cluster within 1218 * We only care about the 1st and last cluster within
1193 * our range and whether they are holes or not. Either 1219 * our range and whether they are holes or not. Either
1194 * value may be extended out to the start/end of a 1220 * value may be extended out to the start/end of a
1195 * newly allocated cluster. 1221 * newly allocated cluster.
1196 */ 1222 */
1197 desc = &wc->w_desc[0]; 1223 desc = &wc->w_desc[0];
1198 if (desc->c_new) 1224 if (desc->c_new)
1199 ocfs2_figure_cluster_boundaries(osb, 1225 ocfs2_figure_cluster_boundaries(osb,
1200 desc->c_cpos, 1226 desc->c_cpos,
1201 &wc->w_target_from, 1227 &wc->w_target_from,
1202 NULL); 1228 NULL);
1203 1229
1204 desc = &wc->w_desc[wc->w_clen - 1]; 1230 desc = &wc->w_desc[wc->w_clen - 1];
1205 if (desc->c_new) 1231 if (desc->c_new)
1206 ocfs2_figure_cluster_boundaries(osb, 1232 ocfs2_figure_cluster_boundaries(osb,
1207 desc->c_cpos, 1233 desc->c_cpos,
1208 NULL, 1234 NULL,
1209 &wc->w_target_to); 1235 &wc->w_target_to);
1210 } else { 1236 } else {
1211 wc->w_target_from = 0; 1237 wc->w_target_from = 0;
1212 wc->w_target_to = PAGE_CACHE_SIZE; 1238 wc->w_target_to = PAGE_CACHE_SIZE;
1213 } 1239 }
1214 } 1240 }
1215 1241
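To make the two branches concrete, a worked example with hypothetical geometries:

/*
 * Worked example of the target-boundary logic:
 *
 * PAGE_CACHE_SIZE = 64K, cluster size = 4K  ->  w_large_pages is set.
 *   A 100-byte write at pos = 10000 gives
 *     w_target_from = 10000 & 0xFFFF = 10000, w_target_to = 10100.
 *   If the first cluster of the range (bytes 8192..12287 of the page)
 *   is newly allocated, w_target_from is pulled back to 8192; if the
 *   last cluster is new, w_target_to is pushed out to 12288.
 *
 * PAGE_CACHE_SIZE = 4K, cluster size = 32K  ->  w_large_pages is clear.
 *   Any allocating write flushes whole pages: [0, 4096).
 */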
1216 static int ocfs2_write_begin_nolock(struct address_space *mapping, 1242 int ocfs2_write_begin_nolock(struct address_space *mapping,
1217 loff_t pos, unsigned len, unsigned flags, 1243 loff_t pos, unsigned len, unsigned flags,
1218 struct page **pagep, void **fsdata, 1244 struct page **pagep, void **fsdata,
1219 struct buffer_head *di_bh) 1245 struct buffer_head *di_bh, struct page *mmap_page)
1220 { 1246 {
1221 int ret, i, credits = OCFS2_INODE_UPDATE_CREDITS; 1247 int ret, i, credits = OCFS2_INODE_UPDATE_CREDITS;
1222 unsigned int num_clusters = 0, clusters_to_alloc = 0; 1248 unsigned int num_clusters = 0, clusters_to_alloc = 0;
1223 u32 phys = 0; 1249 u32 phys = 0;
1224 struct ocfs2_write_ctxt *wc; 1250 struct ocfs2_write_ctxt *wc;
1225 struct inode *inode = mapping->host; 1251 struct inode *inode = mapping->host;
1226 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1252 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1227 struct ocfs2_dinode *di; 1253 struct ocfs2_dinode *di;
1228 struct ocfs2_alloc_context *data_ac = NULL; 1254 struct ocfs2_alloc_context *data_ac = NULL;
1229 struct ocfs2_alloc_context *meta_ac = NULL; 1255 struct ocfs2_alloc_context *meta_ac = NULL;
1230 handle_t *handle; 1256 handle_t *handle;
1231 struct ocfs2_write_cluster_desc *desc; 1257 struct ocfs2_write_cluster_desc *desc;
1232 1258
1233 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); 1259 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
1234 if (ret) { 1260 if (ret) {
1235 mlog_errno(ret); 1261 mlog_errno(ret);
1236 return ret; 1262 return ret;
1237 } 1263 }
1238 1264
1239 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; 1265 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1240 1266
1241 for (i = 0; i < wc->w_clen; i++) { 1267 for (i = 0; i < wc->w_clen; i++) {
1242 desc = &wc->w_desc[i]; 1268 desc = &wc->w_desc[i];
1243 desc->c_cpos = wc->w_cpos + i; 1269 desc->c_cpos = wc->w_cpos + i;
1244 1270
1245 if (num_clusters == 0) { 1271 if (num_clusters == 0) {
1246 ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys, 1272 ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys,
1247 &num_clusters, NULL); 1273 &num_clusters, NULL);
1248 if (ret) { 1274 if (ret) {
1249 mlog_errno(ret); 1275 mlog_errno(ret);
1250 goto out; 1276 goto out;
1251 } 1277 }
1252 } else if (phys) { 1278 } else if (phys) {
1253 /* 1279 /*
1254 * Only increment phys if it doesn't describe 1280 * Only increment phys if it doesn't describe
1255 * a hole. 1281 * a hole.
1256 */ 1282 */
1257 phys++; 1283 phys++;
1258 } 1284 }
1259 1285
1260 desc->c_phys = phys; 1286 desc->c_phys = phys;
1261 if (phys == 0) { 1287 if (phys == 0) {
1262 desc->c_new = 1; 1288 desc->c_new = 1;
1263 clusters_to_alloc++; 1289 clusters_to_alloc++;
1264 } 1290 }
1265 1291
1266 num_clusters--; 1292 num_clusters--;
1267 } 1293 }
1268 1294
1269 /* 1295 /*
1270 * We set w_target_from, w_target_to here so that 1296 * We set w_target_from, w_target_to here so that
1271 * ocfs2_write_end() knows which range in the target page to 1297 * ocfs2_write_end() knows which range in the target page to
1272 * write out. An allocation requires that we write the entire 1298 * write out. An allocation requires that we write the entire
1273 * cluster range. 1299 * cluster range.
1274 */ 1300 */
1275 if (clusters_to_alloc > 0) { 1301 if (clusters_to_alloc > 0) {
1276 /* 1302 /*
1277 * XXX: We are stretching the limits of 1303 * XXX: We are stretching the limits of
1278 * ocfs2_lock_allocators(). It greatly over-estimates 1304 * ocfs2_lock_allocators(). It greatly over-estimates
1279 * the work to be done. 1305 * the work to be done.
1280 */ 1306 */
1281 ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc, 1307 ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc,
1282 &data_ac, &meta_ac); 1308 &data_ac, &meta_ac);
1283 if (ret) { 1309 if (ret) {
1284 mlog_errno(ret); 1310 mlog_errno(ret);
1285 goto out; 1311 goto out;
1286 } 1312 }
1287 1313
1288 credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1314 credits = ocfs2_calc_extend_credits(inode->i_sb, di,
1289 clusters_to_alloc); 1315 clusters_to_alloc);
1290 1316
1291 } 1317 }
1292 1318
1293 ocfs2_set_target_boundaries(osb, wc, pos, len, clusters_to_alloc); 1319 ocfs2_set_target_boundaries(osb, wc, pos, len, clusters_to_alloc);
1294 1320
1295 handle = ocfs2_start_trans(osb, credits); 1321 handle = ocfs2_start_trans(osb, credits);
1296 if (IS_ERR(handle)) { 1322 if (IS_ERR(handle)) {
1297 ret = PTR_ERR(handle); 1323 ret = PTR_ERR(handle);
1298 mlog_errno(ret); 1324 mlog_errno(ret);
1299 goto out; 1325 goto out;
1300 } 1326 }
1301 1327
1302 wc->w_handle = handle; 1328 wc->w_handle = handle;
1303 1329
1304 /* 1330 /*
1305 * We don't want this to fail in ocfs2_write_end(), so do it 1331 * We don't want this to fail in ocfs2_write_end(), so do it
1306 * here. 1332 * here.
1307 */ 1333 */
1308 ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, 1334 ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
1309 OCFS2_JOURNAL_ACCESS_WRITE); 1335 OCFS2_JOURNAL_ACCESS_WRITE);
1310 if (ret) { 1336 if (ret) {
1311 mlog_errno(ret); 1337 mlog_errno(ret);
1312 goto out_commit; 1338 goto out_commit;
1313 } 1339 }
1314 1340
1315 /* 1341 /*
1316 * Fill our page array first. That way we've grabbed enough so 1342 * Fill our page array first. That way we've grabbed enough so
1317 * that we can zero and flush if we error after adding the 1343 * that we can zero and flush if we error after adding the
1318 * extent. 1344 * extent.
1319 */ 1345 */
1320 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, 1346 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
1321 clusters_to_alloc); 1347 clusters_to_alloc, mmap_page);
1322 if (ret) { 1348 if (ret) {
1323 mlog_errno(ret); 1349 mlog_errno(ret);
1324 goto out_commit; 1350 goto out_commit;
1325 } 1351 }
1326 1352
1327 for (i = 0; i < wc->w_clen; i++) { 1353 for (i = 0; i < wc->w_clen; i++) {
1328 desc = &wc->w_desc[i]; 1354 desc = &wc->w_desc[i];
1329 1355
1330 ret = ocfs2_write_cluster(mapping, desc->c_phys, data_ac, 1356 ret = ocfs2_write_cluster(mapping, desc->c_phys, data_ac,
1331 meta_ac, wc, desc->c_cpos, pos, len); 1357 meta_ac, wc, desc->c_cpos, pos, len);
1332 if (ret) { 1358 if (ret) {
1333 mlog_errno(ret); 1359 mlog_errno(ret);
1334 goto out_commit; 1360 goto out_commit;
1335 } 1361 }
1336 } 1362 }
1337 1363
1338 if (data_ac) 1364 if (data_ac)
1339 ocfs2_free_alloc_context(data_ac); 1365 ocfs2_free_alloc_context(data_ac);
1340 if (meta_ac) 1366 if (meta_ac)
1341 ocfs2_free_alloc_context(meta_ac); 1367 ocfs2_free_alloc_context(meta_ac);
1342 1368
1343 *pagep = wc->w_target_page; 1369 *pagep = wc->w_target_page;
1344 *fsdata = wc; 1370 *fsdata = wc;
1345 return 0; 1371 return 0;
1346 out_commit: 1372 out_commit:
1347 ocfs2_commit_trans(osb, handle); 1373 ocfs2_commit_trans(osb, handle);
1348 1374
1349 out: 1375 out:
1350 ocfs2_free_write_ctxt(wc); 1376 ocfs2_free_write_ctxt(wc);
1351 1377
1352 if (data_ac) 1378 if (data_ac)
1353 ocfs2_free_alloc_context(data_ac); 1379 ocfs2_free_alloc_context(data_ac);
1354 if (meta_ac) 1380 if (meta_ac)
1355 ocfs2_free_alloc_context(meta_ac); 1381 ocfs2_free_alloc_context(meta_ac);
1356 return ret; 1382 return ret;
1357 } 1383 }
1358 1384
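The descriptor loop above pays for one ocfs2_get_clusters() lookup per physical extent run: num_clusters counts down through the run while phys steps forward one cluster per iteration, and phys == 0 marks a hole that needs allocation. A standalone sketch of that consumption pattern, where lookup_extent() is a hypothetical stand-in for the real lookup:

/* Hypothetical stand-in: returns the physical start (0 == hole) and
 * remaining length of the extent run containing virtual cluster cpos. */
int lookup_extent(unsigned int cpos, unsigned int *phys,
		  unsigned int *len);

static void walk_clusters(unsigned int w_cpos, unsigned int w_clen)
{
	unsigned int i, phys = 0, num_clusters = 0;

	for (i = 0; i < w_clen; i++) {
		unsigned int cpos = w_cpos + i;

		if (num_clusters == 0)	/* ran off the previous run */
			lookup_extent(cpos, &phys, &num_clusters);
		else if (phys)		/* still inside a mapped run */
			phys++;

		/* phys == 0 here means cluster cpos is a hole and must
		 * be allocated (desc->c_new in the code above). */
		num_clusters--;
	}
}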
1359 int ocfs2_write_begin(struct file *file, struct address_space *mapping, 1385 int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1360 loff_t pos, unsigned len, unsigned flags, 1386 loff_t pos, unsigned len, unsigned flags,
1361 struct page **pagep, void **fsdata) 1387 struct page **pagep, void **fsdata)
1362 { 1388 {
1363 int ret; 1389 int ret;
1364 struct buffer_head *di_bh = NULL; 1390 struct buffer_head *di_bh = NULL;
1365 struct inode *inode = mapping->host; 1391 struct inode *inode = mapping->host;
1366 1392
1367 ret = ocfs2_meta_lock(inode, &di_bh, 1); 1393 ret = ocfs2_meta_lock(inode, &di_bh, 1);
1368 if (ret) { 1394 if (ret) {
1369 mlog_errno(ret); 1395 mlog_errno(ret);
1370 return ret; 1396 return ret;
1371 } 1397 }
1372 1398
1373 /* 1399 /*
1374 * Take alloc sem here to prevent concurrent lookups. That way 1400 * Take alloc sem here to prevent concurrent lookups. That way
1375 * the mapping, zeroing and tree manipulation within 1401 * the mapping, zeroing and tree manipulation within
1376 * ocfs2_write() will be safe against ->readpage(). This 1402 * ocfs2_write() will be safe against ->readpage(). This
1377 * should also serve to lock out allocation from a shared 1403 * should also serve to lock out allocation from a shared
1378 * writeable region. 1404 * writeable region.
1379 */ 1405 */
1380 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1406 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1381 1407
1382 ret = ocfs2_data_lock(inode, 1); 1408 ret = ocfs2_data_lock(inode, 1);
1383 if (ret) { 1409 if (ret) {
1384 mlog_errno(ret); 1410 mlog_errno(ret);
1385 goto out_fail; 1411 goto out_fail;
1386 } 1412 }
1387 1413
1388 ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, 1414 ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
1389 fsdata, di_bh); 1415 fsdata, di_bh, NULL);
1390 if (ret) { 1416 if (ret) {
1391 mlog_errno(ret); 1417 mlog_errno(ret);
1392 goto out_fail_data; 1418 goto out_fail_data;
1393 } 1419 }
1394 1420
1395 brelse(di_bh); 1421 brelse(di_bh);
1396 1422
1397 return 0; 1423 return 0;
1398 1424
1399 out_fail_data: 1425 out_fail_data:
1400 ocfs2_data_unlock(inode, 1); 1426 ocfs2_data_unlock(inode, 1);
1401 out_fail: 1427 out_fail:
1402 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1428 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1403 1429
1404 brelse(di_bh); 1430 brelse(di_bh);
1405 ocfs2_meta_unlock(inode, 1); 1431 ocfs2_meta_unlock(inode, 1);
1406 1432
1407 return ret; 1433 return ret;
1408 } 1434 }
1409 1435
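For reference, the lock ordering established here, condensed from the calls above; ocfs2_write_end() releases them in reverse:

/*
 * Lock ordering for the buffered write path (taken top to bottom,
 * dropped bottom to top in ocfs2_write_end()):
 *
 *	ocfs2_meta_lock(inode, &di_bh, 1)	    - cluster metadata lock
 *	down_write(&OCFS2_I(inode)->ip_alloc_sem)   - local allocation sem
 *	ocfs2_data_lock(inode, 1)		    - cluster data lock
 *		ocfs2_write_begin_nolock() ... ocfs2_write_end_nolock()
 */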
1410 static int ocfs2_write_end_nolock(struct address_space *mapping, 1436 int ocfs2_write_end_nolock(struct address_space *mapping,
1411 loff_t pos, unsigned len, unsigned copied, 1437 loff_t pos, unsigned len, unsigned copied,
1412 struct page *page, void *fsdata) 1438 struct page *page, void *fsdata)
1413 { 1439 {
1414 int i; 1440 int i;
1415 unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); 1441 unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
1416 struct inode *inode = mapping->host; 1442 struct inode *inode = mapping->host;
1417 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1443 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1418 struct ocfs2_write_ctxt *wc = fsdata; 1444 struct ocfs2_write_ctxt *wc = fsdata;
1419 struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; 1445 struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1420 handle_t *handle = wc->w_handle; 1446 handle_t *handle = wc->w_handle;
1421 struct page *tmppage; 1447 struct page *tmppage;
1422 1448
1423 if (unlikely(copied < len)) { 1449 if (unlikely(copied < len)) {
1424 if (!PageUptodate(wc->w_target_page)) 1450 if (!PageUptodate(wc->w_target_page))
1425 copied = 0; 1451 copied = 0;
1426 1452
1427 ocfs2_zero_new_buffers(wc->w_target_page, start+copied, 1453 ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
1428 start+len); 1454 start+len);
1429 } 1455 }
1430 flush_dcache_page(wc->w_target_page); 1456 flush_dcache_page(wc->w_target_page);
1431 1457
1432 for(i = 0; i < wc->w_num_pages; i++) { 1458 for(i = 0; i < wc->w_num_pages; i++) {
1433 tmppage = wc->w_pages[i]; 1459 tmppage = wc->w_pages[i];
1434 1460
1435 if (tmppage == wc->w_target_page) { 1461 if (tmppage == wc->w_target_page) {
1436 from = wc->w_target_from; 1462 from = wc->w_target_from;
1437 to = wc->w_target_to; 1463 to = wc->w_target_to;
1438 1464
1439 BUG_ON(from > PAGE_CACHE_SIZE || 1465 BUG_ON(from > PAGE_CACHE_SIZE ||
1440 to > PAGE_CACHE_SIZE || 1466 to > PAGE_CACHE_SIZE ||
1441 to < from); 1467 to < from);
1442 } else { 1468 } else {
1443 /* 1469 /*
1444 * Pages adjacent to the target (if any) imply 1470 * Pages adjacent to the target (if any) imply
1445 * a hole-filling write in which case we want 1471 * a hole-filling write in which case we want
1446 * to flush their entire range. 1472 * to flush their entire range.
1447 */ 1473 */
1448 from = 0; 1474 from = 0;
1449 to = PAGE_CACHE_SIZE; 1475 to = PAGE_CACHE_SIZE;
1450 } 1476 }
1451 1477
1452 if (ocfs2_should_order_data(inode)) 1478 if (ocfs2_should_order_data(inode))
1453 walk_page_buffers(wc->w_handle, page_buffers(tmppage), 1479 walk_page_buffers(wc->w_handle, page_buffers(tmppage),
1454 from, to, NULL, 1480 from, to, NULL,
1455 ocfs2_journal_dirty_data); 1481 ocfs2_journal_dirty_data);
1456 1482
1457 block_commit_write(tmppage, from, to); 1483 block_commit_write(tmppage, from, to);
1458 } 1484 }
1459 1485
1460 pos += copied; 1486 pos += copied;
1461 if (pos > inode->i_size) { 1487 if (pos > inode->i_size) {
1462 i_size_write(inode, pos); 1488 i_size_write(inode, pos);
1463 mark_inode_dirty(inode); 1489 mark_inode_dirty(inode);
1464 } 1490 }
1465 inode->i_blocks = ocfs2_inode_sector_count(inode); 1491 inode->i_blocks = ocfs2_inode_sector_count(inode);
1466 di->i_size = cpu_to_le64((u64)i_size_read(inode)); 1492 di->i_size = cpu_to_le64((u64)i_size_read(inode));
1467 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1493 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1468 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); 1494 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
1469 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 1495 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1470 1496
1471 ocfs2_journal_dirty(handle, wc->w_di_bh); 1497 ocfs2_journal_dirty(handle, wc->w_di_bh);
1472 1498
1473 ocfs2_commit_trans(osb, handle); 1499 ocfs2_commit_trans(osb, handle);
1474 ocfs2_free_write_ctxt(wc); 1500 ocfs2_free_write_ctxt(wc);
1475 1501
1476 return copied; 1502 return copied;
1477 } 1503 }
1478 1504
1479 int ocfs2_write_end(struct file *file, struct address_space *mapping, 1505 int ocfs2_write_end(struct file *file, struct address_space *mapping,
1480 loff_t pos, unsigned len, unsigned copied, 1506 loff_t pos, unsigned len, unsigned copied,
1481 struct page *page, void *fsdata) 1507 struct page *page, void *fsdata)
1482 { 1508 {
1483 int ret; 1509 int ret;
1484 struct inode *inode = mapping->host; 1510 struct inode *inode = mapping->host;
1485 1511
1486 ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); 1512 ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
1487 1513
1488 ocfs2_data_unlock(inode, 1); 1514 ocfs2_data_unlock(inode, 1);
1489 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1515 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1490 ocfs2_meta_unlock(inode, 1); 1516 ocfs2_meta_unlock(inode, 1);
1491 1517
1492 return ret; 1518 return ret;
1493 } 1519 }
1494 1520
1495 const struct address_space_operations ocfs2_aops = { 1521 const struct address_space_operations ocfs2_aops = {
1496 .readpage = ocfs2_readpage, 1522 .readpage = ocfs2_readpage,
1497 .writepage = ocfs2_writepage, 1523 .writepage = ocfs2_writepage,
1498 .bmap = ocfs2_bmap, 1524 .bmap = ocfs2_bmap,
1499 .sync_page = block_sync_page, 1525 .sync_page = block_sync_page,
1500 .direct_IO = ocfs2_direct_IO, 1526 .direct_IO = ocfs2_direct_IO,
1501 .invalidatepage = ocfs2_invalidatepage, 1527 .invalidatepage = ocfs2_invalidatepage,
1502 .releasepage = ocfs2_releasepage, 1528 .releasepage = ocfs2_releasepage,
1503 .migratepage = buffer_migrate_page, 1529 .migratepage = buffer_migrate_page,
1504 }; 1530 };
1505 1531
1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. 4 * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public 7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either 8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version. 9 * version 2 of the License, or (at your option) any later version.
10 * 10 *
11 * This program is distributed in the hope that it will be useful, 11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details. 14 * General Public License for more details.
15 * 15 *
16 * You should have received a copy of the GNU General Public 16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the 17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA. 19 * Boston, MA 021110-1307, USA.
20 */ 20 */
21 21
22 #ifndef OCFS2_AOPS_H 22 #ifndef OCFS2_AOPS_H
23 #define OCFS2_AOPS_H 23 #define OCFS2_AOPS_H
24 24
25 int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, 25 int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
26 unsigned from, unsigned to); 26 unsigned from, unsigned to);
27 27
28 handle_t *ocfs2_start_walk_page_trans(struct inode *inode, 28 handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
29 struct page *page, 29 struct page *page,
30 unsigned from, 30 unsigned from,
31 unsigned to); 31 unsigned to);
32 32
33 int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, 33 int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
34 struct inode *inode, unsigned int from, 34 struct inode *inode, unsigned int from,
35 unsigned int to, int new); 35 unsigned int to, int new);
36 36
37 int walk_page_buffers( handle_t *handle, 37 int walk_page_buffers( handle_t *handle,
38 struct buffer_head *head, 38 struct buffer_head *head,
39 unsigned from, 39 unsigned from,
40 unsigned to, 40 unsigned to,
41 int *partial, 41 int *partial,
42 int (*fn)( handle_t *handle, 42 int (*fn)( handle_t *handle,
43 struct buffer_head *bh)); 43 struct buffer_head *bh));
44 44
45 int ocfs2_write_begin(struct file *file, struct address_space *mapping, 45 int ocfs2_write_begin(struct file *file, struct address_space *mapping,
46 loff_t pos, unsigned len, unsigned flags, 46 loff_t pos, unsigned len, unsigned flags,
47 struct page **pagep, void **fsdata); 47 struct page **pagep, void **fsdata);
48 48
49 int ocfs2_write_end(struct file *file, struct address_space *mapping, 49 int ocfs2_write_end(struct file *file, struct address_space *mapping,
50 loff_t pos, unsigned len, unsigned copied, 50 loff_t pos, unsigned len, unsigned copied,
51 struct page *page, void *fsdata); 51 struct page *page, void *fsdata);
52 52
53 int ocfs2_write_end_nolock(struct address_space *mapping,
54 loff_t pos, unsigned len, unsigned copied,
55 struct page *page, void *fsdata);
56
57 int ocfs2_write_begin_nolock(struct address_space *mapping,
58 loff_t pos, unsigned len, unsigned flags,
59 struct page **pagep, void **fsdata,
60 struct buffer_head *di_bh, struct page *mmap_page);
61
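The two _nolock variants exported here are what a ->page_mkwrite() handler builds on: it takes the cluster locks itself and passes the faulted page in as mmap_page. The real handler lives in mmap.c and is not shown in this hunk; the following is only a rough sketch of the calling pattern, with a hypothetical function name, simplified length handling, and no i_size checks:

static int sketch_mkwrite_locked(struct inode *inode,
				 struct buffer_head *di_bh,
				 struct page *page)
{
	int ret;
	loff_t pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
	unsigned int len = PAGE_CACHE_SIZE;	/* whole page, simplified */
	struct page *target;
	void *fsdata;

	/* Assumes the caller already holds the meta lock, ip_alloc_sem
	 * and the data lock, mirroring ocfs2_write_begin(). */
	ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len, 0,
				       &target, &fsdata, di_bh, page);
	if (ret)
		return ret;

	/* The fault already placed the data, so "copied" == len. */
	ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len,
				     target, fsdata);
	return ret < 0 ? ret : 0;
}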
53 /* all ocfs2_dio_end_io()'s fault */ 62 /* all ocfs2_dio_end_io()'s fault */
54 #define ocfs2_iocb_is_rw_locked(iocb) \ 63 #define ocfs2_iocb_is_rw_locked(iocb) \
55 test_bit(0, (unsigned long *)&iocb->private) 64 test_bit(0, (unsigned long *)&iocb->private)
56 static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) 65 static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
57 { 66 {
58 set_bit(0, (unsigned long *)&iocb->private); 67 set_bit(0, (unsigned long *)&iocb->private);
59 if (level) 68 if (level)
60 set_bit(1, (unsigned long *)&iocb->private); 69 set_bit(1, (unsigned long *)&iocb->private);
61 else 70 else
62 clear_bit(1, (unsigned long *)&iocb->private); 71 clear_bit(1, (unsigned long *)&iocb->private);
63 } 72 }
64 #define ocfs2_iocb_clear_rw_locked(iocb) \ 73 #define ocfs2_iocb_clear_rw_locked(iocb) \
65 clear_bit(0, (unsigned long *)&iocb->private) 74 clear_bit(0, (unsigned long *)&iocb->private)
66 #define ocfs2_iocb_rw_locked_level(iocb) \ 75 #define ocfs2_iocb_rw_locked_level(iocb) \
67 test_bit(1, (unsigned long *)&iocb->private) 76 test_bit(1, (unsigned long *)&iocb->private)
68 #endif /* OCFS2_AOPS_H */ 77 #endif /* OCFS2_AOPS_H */
69 78
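These helpers pack the rw-lock state into iocb->private: bit 0 says the lock is held, bit 1 remembers the level so completion can unlock correctly. A hedged sketch of a call site, assuming the cluster rw-lock helpers ocfs2_rw_lock()/ocfs2_rw_unlock(), which are not part of this diff:

ocfs2_rw_lock(inode, rw_level);
ocfs2_iocb_set_rw_locked(iocb, rw_level);	/* record bit 0 + bit 1 */

/* ... later, when the direct I/O completes ... */
if (ocfs2_iocb_is_rw_locked(iocb)) {
	ocfs2_rw_unlock(inode, ocfs2_iocb_rw_locked_level(iocb));
	ocfs2_iocb_clear_rw_locked(iocb);
}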
1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * file.c 4 * file.c
5 * 5 *
6 * File open, close, extend, truncate 6 * File open, close, extend, truncate
7 * 7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public 11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either 12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version. 13 * version 2 of the License, or (at your option) any later version.
14 * 14 *
15 * This program is distributed in the hope that it will be useful, 15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details. 18 * General Public License for more details.
19 * 19 *
20 * You should have received a copy of the GNU General Public 20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the 21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA. 23 * Boston, MA 021110-1307, USA.
24 */ 24 */
25 25
26 #include <linux/capability.h> 26 #include <linux/capability.h>
27 #include <linux/fs.h> 27 #include <linux/fs.h>
28 #include <linux/types.h> 28 #include <linux/types.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/highmem.h> 30 #include <linux/highmem.h>
31 #include <linux/pagemap.h> 31 #include <linux/pagemap.h>
32 #include <linux/uio.h> 32 #include <linux/uio.h>
33 #include <linux/sched.h> 33 #include <linux/sched.h>
34 #include <linux/splice.h> 34 #include <linux/splice.h>
35 #include <linux/mount.h> 35 #include <linux/mount.h>
36 #include <linux/writeback.h> 36 #include <linux/writeback.h>
37 37
38 #define MLOG_MASK_PREFIX ML_INODE 38 #define MLOG_MASK_PREFIX ML_INODE
39 #include <cluster/masklog.h> 39 #include <cluster/masklog.h>
40 40
41 #include "ocfs2.h" 41 #include "ocfs2.h"
42 42
43 #include "alloc.h" 43 #include "alloc.h"
44 #include "aops.h" 44 #include "aops.h"
45 #include "dir.h" 45 #include "dir.h"
46 #include "dlmglue.h" 46 #include "dlmglue.h"
47 #include "extent_map.h" 47 #include "extent_map.h"
48 #include "file.h" 48 #include "file.h"
49 #include "sysfile.h" 49 #include "sysfile.h"
50 #include "inode.h" 50 #include "inode.h"
51 #include "ioctl.h" 51 #include "ioctl.h"
52 #include "journal.h" 52 #include "journal.h"
53 #include "mmap.h" 53 #include "mmap.h"
54 #include "suballoc.h" 54 #include "suballoc.h"
55 #include "super.h" 55 #include "super.h"
56 56
57 #include "buffer_head_io.h" 57 #include "buffer_head_io.h"
58 58
59 static int ocfs2_sync_inode(struct inode *inode) 59 static int ocfs2_sync_inode(struct inode *inode)
60 { 60 {
61 filemap_fdatawrite(inode->i_mapping); 61 filemap_fdatawrite(inode->i_mapping);
62 return sync_mapping_buffers(inode->i_mapping); 62 return sync_mapping_buffers(inode->i_mapping);
63 } 63 }
64 64
65 static int ocfs2_file_open(struct inode *inode, struct file *file) 65 static int ocfs2_file_open(struct inode *inode, struct file *file)
66 { 66 {
67 int status; 67 int status;
68 int mode = file->f_flags; 68 int mode = file->f_flags;
69 struct ocfs2_inode_info *oi = OCFS2_I(inode); 69 struct ocfs2_inode_info *oi = OCFS2_I(inode);
70 70
71 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 71 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
72 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); 72 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
73 73
74 spin_lock(&oi->ip_lock); 74 spin_lock(&oi->ip_lock);
75 75
76 /* Check that the inode hasn't been wiped from disk by another 76 /* Check that the inode hasn't been wiped from disk by another
77 * node. If it hasn't then we're safe as long as we hold the 77 * node. If it hasn't then we're safe as long as we hold the
78 * spin lock until our increment of open count. */ 78 * spin lock until our increment of open count. */
79 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 79 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
80 spin_unlock(&oi->ip_lock); 80 spin_unlock(&oi->ip_lock);
81 81
82 status = -ENOENT; 82 status = -ENOENT;
83 goto leave; 83 goto leave;
84 } 84 }
85 85
86 if (mode & O_DIRECT) 86 if (mode & O_DIRECT)
87 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; 87 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
88 88
89 oi->ip_open_count++; 89 oi->ip_open_count++;
90 spin_unlock(&oi->ip_lock); 90 spin_unlock(&oi->ip_lock);
91 status = 0; 91 status = 0;
92 leave: 92 leave:
93 mlog_exit(status); 93 mlog_exit(status);
94 return status; 94 return status;
95 } 95 }
96 96
97 static int ocfs2_file_release(struct inode *inode, struct file *file) 97 static int ocfs2_file_release(struct inode *inode, struct file *file)
98 { 98 {
99 struct ocfs2_inode_info *oi = OCFS2_I(inode); 99 struct ocfs2_inode_info *oi = OCFS2_I(inode);
100 100
101 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 101 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
102 file->f_path.dentry->d_name.len, 102 file->f_path.dentry->d_name.len,
103 file->f_path.dentry->d_name.name); 103 file->f_path.dentry->d_name.name);
104 104
105 spin_lock(&oi->ip_lock); 105 spin_lock(&oi->ip_lock);
106 if (!--oi->ip_open_count) 106 if (!--oi->ip_open_count)
107 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 107 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
108 spin_unlock(&oi->ip_lock); 108 spin_unlock(&oi->ip_lock);
109 109
110 mlog_exit(0); 110 mlog_exit(0);
111 111
112 return 0; 112 return 0;
113 } 113 }
114 114
115 static int ocfs2_sync_file(struct file *file, 115 static int ocfs2_sync_file(struct file *file,
116 struct dentry *dentry, 116 struct dentry *dentry,
117 int datasync) 117 int datasync)
118 { 118 {
119 int err = 0; 119 int err = 0;
120 journal_t *journal; 120 journal_t *journal;
121 struct inode *inode = dentry->d_inode; 121 struct inode *inode = dentry->d_inode;
122 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 122 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
123 123
124 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 124 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
125 dentry->d_name.len, dentry->d_name.name); 125 dentry->d_name.len, dentry->d_name.name);
126 126
127 err = ocfs2_sync_inode(dentry->d_inode); 127 err = ocfs2_sync_inode(dentry->d_inode);
128 if (err) 128 if (err)
129 goto bail; 129 goto bail;
130 130
131 journal = osb->journal->j_journal; 131 journal = osb->journal->j_journal;
132 err = journal_force_commit(journal); 132 err = journal_force_commit(journal);
133 133
134 bail: 134 bail:
135 mlog_exit(err); 135 mlog_exit(err);
136 136
137 return (err < 0) ? -EIO : 0; 137 return (err < 0) ? -EIO : 0;
138 } 138 }
139 139
140 int ocfs2_should_update_atime(struct inode *inode, 140 int ocfs2_should_update_atime(struct inode *inode,
141 struct vfsmount *vfsmnt) 141 struct vfsmount *vfsmnt)
142 { 142 {
143 struct timespec now; 143 struct timespec now;
144 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 144 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
145 145
146 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 146 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
147 return 0; 147 return 0;
148 148
149 if ((inode->i_flags & S_NOATIME) || 149 if ((inode->i_flags & S_NOATIME) ||
150 ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) 150 ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
151 return 0; 151 return 0;
152 152
153 /* 153 /*
154 * We can be called with no vfsmnt structure - NFSD will 154 * We can be called with no vfsmnt structure - NFSD will
155 * sometimes do this. 155 * sometimes do this.
156 * 156 *
157 * Note that our action here is different than touch_atime() - 157 * Note that our action here is different than touch_atime() -
158 * if we can't tell whether this is a noatime mount, then we 158 * if we can't tell whether this is a noatime mount, then we
159 * don't know whether to trust the value of s_atime_quantum. 159 * don't know whether to trust the value of s_atime_quantum.
160 */ 160 */
161 if (vfsmnt == NULL) 161 if (vfsmnt == NULL)
162 return 0; 162 return 0;
163 163
164 if ((vfsmnt->mnt_flags & MNT_NOATIME) || 164 if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
165 ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) 165 ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
166 return 0; 166 return 0;
167 167
168 if (vfsmnt->mnt_flags & MNT_RELATIME) { 168 if (vfsmnt->mnt_flags & MNT_RELATIME) {
169 if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || 169 if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
170 (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) 170 (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0))
171 return 1; 171 return 1;
172 172
173 return 0; 173 return 0;
174 } 174 }
175 175
176 now = CURRENT_TIME; 176 now = CURRENT_TIME;
177 if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) 177 if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
178 return 0; 178 return 0;
179 else 179 else
180 return 1; 180 return 1;
181 } 181 }
182 182
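To make the quantum check concrete, a worked example with hypothetical values:

/*
 * With osb->s_atime_quantum = 60 seconds and i_atime last set at
 * t = 1000: a read at t = 1030 gives 1030 - 1000 = 30 <= 60, so the
 * on-disk atime is left alone; a read at t = 1070 gives 70 > 60 and
 * returns 1, triggering ocfs2_update_inode_atime().
 */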
183 int ocfs2_update_inode_atime(struct inode *inode, 183 int ocfs2_update_inode_atime(struct inode *inode,
184 struct buffer_head *bh) 184 struct buffer_head *bh)
185 { 185 {
186 int ret; 186 int ret;
187 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 187 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
188 handle_t *handle; 188 handle_t *handle;
189 189
190 mlog_entry_void(); 190 mlog_entry_void();
191 191
192 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 192 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
193 if (IS_ERR(handle)) { 193 if (IS_ERR(handle)) {
194 ret = PTR_ERR(handle); 194 ret = PTR_ERR(handle);
195 mlog_errno(ret); 195 mlog_errno(ret);
196 goto out; 196 goto out;
197 } 197 }
198 198
199 inode->i_atime = CURRENT_TIME; 199 inode->i_atime = CURRENT_TIME;
200 ret = ocfs2_mark_inode_dirty(handle, inode, bh); 200 ret = ocfs2_mark_inode_dirty(handle, inode, bh);
201 if (ret < 0) 201 if (ret < 0)
202 mlog_errno(ret); 202 mlog_errno(ret);
203 203
204 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 204 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
205 out: 205 out:
206 mlog_exit(ret); 206 mlog_exit(ret);
207 return ret; 207 return ret;
208 } 208 }
209 209
210 static int ocfs2_set_inode_size(handle_t *handle, 210 static int ocfs2_set_inode_size(handle_t *handle,
211 struct inode *inode, 211 struct inode *inode,
212 struct buffer_head *fe_bh, 212 struct buffer_head *fe_bh,
213 u64 new_i_size) 213 u64 new_i_size)
214 { 214 {
215 int status; 215 int status;
216 216
217 mlog_entry_void(); 217 mlog_entry_void();
218 i_size_write(inode, new_i_size); 218 i_size_write(inode, new_i_size);
219 inode->i_blocks = ocfs2_inode_sector_count(inode); 219 inode->i_blocks = ocfs2_inode_sector_count(inode);
220 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 220 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
221 221
222 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 222 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
223 if (status < 0) { 223 if (status < 0) {
224 mlog_errno(status); 224 mlog_errno(status);
225 goto bail; 225 goto bail;
226 } 226 }
227 227
228 bail: 228 bail:
229 mlog_exit(status); 229 mlog_exit(status);
230 return status; 230 return status;
231 } 231 }
232 232
233 static int ocfs2_simple_size_update(struct inode *inode, 233 static int ocfs2_simple_size_update(struct inode *inode,
234 struct buffer_head *di_bh, 234 struct buffer_head *di_bh,
235 u64 new_i_size) 235 u64 new_i_size)
236 { 236 {
237 int ret; 237 int ret;
238 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 238 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
239 handle_t *handle = NULL; 239 handle_t *handle = NULL;
240 240
241 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 241 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
242 if (IS_ERR(handle)) { 242 if (IS_ERR(handle)) {
243 ret = PTR_ERR(handle); 243 ret = PTR_ERR(handle);
244 mlog_errno(ret); 244 mlog_errno(ret);
245 goto out; 245 goto out;
246 } 246 }
247 247
248 ret = ocfs2_set_inode_size(handle, inode, di_bh, 248 ret = ocfs2_set_inode_size(handle, inode, di_bh,
249 new_i_size); 249 new_i_size);
250 if (ret < 0) 250 if (ret < 0)
251 mlog_errno(ret); 251 mlog_errno(ret);
252 252
253 ocfs2_commit_trans(osb, handle); 253 ocfs2_commit_trans(osb, handle);
254 out: 254 out:
255 return ret; 255 return ret;
256 } 256 }
257 257
258 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, 258 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
259 struct inode *inode, 259 struct inode *inode,
260 struct buffer_head *fe_bh, 260 struct buffer_head *fe_bh,
261 u64 new_i_size) 261 u64 new_i_size)
262 { 262 {
263 int status; 263 int status;
264 handle_t *handle; 264 handle_t *handle;
265 struct ocfs2_dinode *di; 265 struct ocfs2_dinode *di;
266 266
267 mlog_entry_void(); 267 mlog_entry_void();
268 268
269 /* TODO: This needs to actually orphan the inode in this 269 /* TODO: This needs to actually orphan the inode in this
270 * transaction. */ 270 * transaction. */
271 271
272 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 272 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
273 if (IS_ERR(handle)) { 273 if (IS_ERR(handle)) {
274 status = PTR_ERR(handle); 274 status = PTR_ERR(handle);
275 mlog_errno(status); 275 mlog_errno(status);
276 goto out; 276 goto out;
277 } 277 }
278 278
279 status = ocfs2_journal_access(handle, inode, fe_bh, 279 status = ocfs2_journal_access(handle, inode, fe_bh,
280 OCFS2_JOURNAL_ACCESS_WRITE); 280 OCFS2_JOURNAL_ACCESS_WRITE);
281 if (status < 0) { 281 if (status < 0) {
282 mlog_errno(status); 282 mlog_errno(status);
283 goto out_commit; 283 goto out_commit;
284 } 284 }
285 285
286 /* 286 /*
287 * Do this before setting i_size. 287 * Do this before setting i_size.
288 */ 288 */
289 status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size); 289 status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
290 if (status) { 290 if (status) {
291 mlog_errno(status); 291 mlog_errno(status);
292 goto out_commit; 292 goto out_commit;
293 } 293 }
294 294
295 i_size_write(inode, new_i_size); 295 i_size_write(inode, new_i_size);
296 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); 296 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
297 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 297 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
298 298
299 di = (struct ocfs2_dinode *) fe_bh->b_data; 299 di = (struct ocfs2_dinode *) fe_bh->b_data;
300 di->i_size = cpu_to_le64(new_i_size); 300 di->i_size = cpu_to_le64(new_i_size);
301 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 301 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
302 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 302 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
303 303
304 status = ocfs2_journal_dirty(handle, fe_bh); 304 status = ocfs2_journal_dirty(handle, fe_bh);
305 if (status < 0) 305 if (status < 0)
306 mlog_errno(status); 306 mlog_errno(status);
307 307
308 out_commit: 308 out_commit:
309 ocfs2_commit_trans(osb, handle); 309 ocfs2_commit_trans(osb, handle);
310 out: 310 out:
311 311
312 mlog_exit(status); 312 mlog_exit(status);
313 return status; 313 return status;
314 } 314 }
315 315
316 static int ocfs2_truncate_file(struct inode *inode, 316 static int ocfs2_truncate_file(struct inode *inode,
317 struct buffer_head *di_bh, 317 struct buffer_head *di_bh,
318 u64 new_i_size) 318 u64 new_i_size)
319 { 319 {
320 int status = 0; 320 int status = 0;
321 struct ocfs2_dinode *fe = NULL; 321 struct ocfs2_dinode *fe = NULL;
322 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 322 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
323 struct ocfs2_truncate_context *tc = NULL; 323 struct ocfs2_truncate_context *tc = NULL;
324 324
325 mlog_entry("(inode = %llu, new_i_size = %llu)\n", 325 mlog_entry("(inode = %llu, new_i_size = %llu)\n",
326 (unsigned long long)OCFS2_I(inode)->ip_blkno, 326 (unsigned long long)OCFS2_I(inode)->ip_blkno,
327 (unsigned long long)new_i_size); 327 (unsigned long long)new_i_size);
328 328
329 fe = (struct ocfs2_dinode *) di_bh->b_data; 329 fe = (struct ocfs2_dinode *) di_bh->b_data;
330 if (!OCFS2_IS_VALID_DINODE(fe)) { 330 if (!OCFS2_IS_VALID_DINODE(fe)) {
331 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 331 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
332 status = -EIO; 332 status = -EIO;
333 goto bail; 333 goto bail;
334 } 334 }
335 335
336 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), 336 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
337 "Inode %llu, inode i_size = %lld != di " 337 "Inode %llu, inode i_size = %lld != di "
338 "i_size = %llu, i_flags = 0x%x\n", 338 "i_size = %llu, i_flags = 0x%x\n",
339 (unsigned long long)OCFS2_I(inode)->ip_blkno, 339 (unsigned long long)OCFS2_I(inode)->ip_blkno,
340 i_size_read(inode), 340 i_size_read(inode),
341 (unsigned long long)le64_to_cpu(fe->i_size), 341 (unsigned long long)le64_to_cpu(fe->i_size),
342 le32_to_cpu(fe->i_flags)); 342 le32_to_cpu(fe->i_flags));
343 343
344 if (new_i_size > le64_to_cpu(fe->i_size)) { 344 if (new_i_size > le64_to_cpu(fe->i_size)) {
345 mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", 345 mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",
346 (unsigned long long)le64_to_cpu(fe->i_size), 346 (unsigned long long)le64_to_cpu(fe->i_size),
347 (unsigned long long)new_i_size); 347 (unsigned long long)new_i_size);
348 status = -EINVAL; 348 status = -EINVAL;
349 mlog_errno(status); 349 mlog_errno(status);
350 goto bail; 350 goto bail;
351 } 351 }
352 352
353 mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n", 353 mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
354 (unsigned long long)le64_to_cpu(fe->i_blkno), 354 (unsigned long long)le64_to_cpu(fe->i_blkno),
355 (unsigned long long)le64_to_cpu(fe->i_size), 355 (unsigned long long)le64_to_cpu(fe->i_size),
356 (unsigned long long)new_i_size); 356 (unsigned long long)new_i_size);
357 357
358 /* let's handle the simple truncate cases before doing any more 358 /* let's handle the simple truncate cases before doing any more
359 * cluster locking. */ 359 * cluster locking. */
360 if (new_i_size == le64_to_cpu(fe->i_size)) 360 if (new_i_size == le64_to_cpu(fe->i_size))
361 goto bail; 361 goto bail;
362 362
363 down_write(&OCFS2_I(inode)->ip_alloc_sem); 363 down_write(&OCFS2_I(inode)->ip_alloc_sem);
364 364
365 /* This forces other nodes to sync and drop their pages. Do 365 /* This forces other nodes to sync and drop their pages. Do
366 * this even if we have a truncate without allocation change - 366 * this even if we have a truncate without allocation change -
367 * ocfs2 cluster sizes can be much greater than page size, so 367 * ocfs2 cluster sizes can be much greater than page size, so
368 * we have to truncate them anyway. */ 368 * we have to truncate them anyway. */
369 status = ocfs2_data_lock(inode, 1); 369 status = ocfs2_data_lock(inode, 1);
370 if (status < 0) { 370 if (status < 0) {
371 up_write(&OCFS2_I(inode)->ip_alloc_sem); 371 up_write(&OCFS2_I(inode)->ip_alloc_sem);
372 372
373 mlog_errno(status); 373 mlog_errno(status);
374 goto bail; 374 goto bail;
375 } 375 }
376 376
377 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); 377 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
378 truncate_inode_pages(inode->i_mapping, new_i_size); 378 truncate_inode_pages(inode->i_mapping, new_i_size);
379 379
380 /* alright, we're going to need to do a full blown alloc size 380 /* alright, we're going to need to do a full blown alloc size
381 * change. Orphan the inode so that recovery can complete the 381 * change. Orphan the inode so that recovery can complete the
382 * truncate if necessary. This does the task of marking 382 * truncate if necessary. This does the task of marking
383 * i_size. */ 383 * i_size. */
384 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 384 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
385 if (status < 0) { 385 if (status < 0) {
386 mlog_errno(status); 386 mlog_errno(status);
387 goto bail_unlock_data; 387 goto bail_unlock_data;
388 } 388 }
389 389
390 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 390 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
391 if (status < 0) { 391 if (status < 0) {
392 mlog_errno(status); 392 mlog_errno(status);
393 goto bail_unlock_data; 393 goto bail_unlock_data;
394 } 394 }
395 395
396 status = ocfs2_commit_truncate(osb, inode, di_bh, tc); 396 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
397 if (status < 0) { 397 if (status < 0) {
398 mlog_errno(status); 398 mlog_errno(status);
399 goto bail_unlock_data; 399 goto bail_unlock_data;
400 } 400 }
401 401
402 /* TODO: orphan dir cleanup here. */ 402 /* TODO: orphan dir cleanup here. */
403 bail_unlock_data: 403 bail_unlock_data:
404 ocfs2_data_unlock(inode, 1); 404 ocfs2_data_unlock(inode, 1);
405 405
406 up_write(&OCFS2_I(inode)->ip_alloc_sem); 406 up_write(&OCFS2_I(inode)->ip_alloc_sem);
407 407
408 bail: 408 bail:
409 409
410 mlog_exit(status); 410 mlog_exit(status);
411 return status; 411 return status;
412 } 412 }
413 413
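Condensed, the shrinking-truncate path above runs in this order (a restatement of the calls shown, not new code):

/*
 *	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 *	ocfs2_data_lock(inode, 1);        - other nodes sync and drop pages
 *	unmap_mapping_range(...);         - shoot down local mmaps past EOF
 *	truncate_inode_pages(...);        - drop local page cache past EOF
 *	ocfs2_orphan_for_truncate(...);   - zero the tail, persist new i_size
 *	ocfs2_prepare_truncate(...);
 *	ocfs2_commit_truncate(...);       - actually free the clusters
 *	ocfs2_data_unlock(inode, 1);
 *	up_write(&OCFS2_I(inode)->ip_alloc_sem);
 */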
414 /* 414 /*
415 * extend allocation only here. 415 * extend allocation only here.
416 * we'll update all the disk stuff, and oip->alloc_size 416 * we'll update all the disk stuff, and oip->alloc_size
417 * 417 *
418 * expect stuff to be locked, a transaction started and enough data / 418 * expect stuff to be locked, a transaction started and enough data /
419 * metadata reservations in the contexts. 419 * metadata reservations in the contexts.
420 * 420 *
421 * Will return -EAGAIN, and a reason if a restart is needed. 421 * Will return -EAGAIN, and a reason if a restart is needed.
422 * If passed in, *reason will always be set, even in error. 422 * If passed in, *reason will always be set, even in error.
423 */ 423 */
424 int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 424 int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
425 struct inode *inode, 425 struct inode *inode,
426 u32 *logical_offset, 426 u32 *logical_offset,
427 u32 clusters_to_add, 427 u32 clusters_to_add,
428 struct buffer_head *fe_bh, 428 struct buffer_head *fe_bh,
429 handle_t *handle, 429 handle_t *handle,
430 struct ocfs2_alloc_context *data_ac, 430 struct ocfs2_alloc_context *data_ac,
431 struct ocfs2_alloc_context *meta_ac, 431 struct ocfs2_alloc_context *meta_ac,
432 enum ocfs2_alloc_restarted *reason_ret) 432 enum ocfs2_alloc_restarted *reason_ret)
433 { 433 {
434 int status = 0; 434 int status = 0;
435 int free_extents; 435 int free_extents;
436 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; 436 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
437 enum ocfs2_alloc_restarted reason = RESTART_NONE; 437 enum ocfs2_alloc_restarted reason = RESTART_NONE;
438 u32 bit_off, num_bits; 438 u32 bit_off, num_bits;
439 u64 block; 439 u64 block;
440 440
441 BUG_ON(!clusters_to_add); 441 BUG_ON(!clusters_to_add);
442 442
443 free_extents = ocfs2_num_free_extents(osb, inode, fe); 443 free_extents = ocfs2_num_free_extents(osb, inode, fe);
444 if (free_extents < 0) { 444 if (free_extents < 0) {
445 status = free_extents; 445 status = free_extents;
446 mlog_errno(status); 446 mlog_errno(status);
447 goto leave; 447 goto leave;
448 } 448 }
449 449
450 /* there are two cases which could cause us to EAGAIN in the 450 /* there are two cases which could cause us to EAGAIN in the
451 * we-need-more-metadata case: 451 * we-need-more-metadata case:
452 * 1) we haven't reserved *any* 452 * 1) we haven't reserved *any*
453 * 2) we are so fragmented, we've needed to add metadata too 453 * 2) we are so fragmented, we've needed to add metadata too
454 * many times. */ 454 * many times. */
455 if (!free_extents && !meta_ac) { 455 if (!free_extents && !meta_ac) {
456 mlog(0, "we haven't reserved any metadata!\n"); 456 mlog(0, "we haven't reserved any metadata!\n");
457 status = -EAGAIN; 457 status = -EAGAIN;
458 reason = RESTART_META; 458 reason = RESTART_META;
459 goto leave; 459 goto leave;
460 } else if ((!free_extents) 460 } else if ((!free_extents)
461 && (ocfs2_alloc_context_bits_left(meta_ac) 461 && (ocfs2_alloc_context_bits_left(meta_ac)
462 < ocfs2_extend_meta_needed(fe))) { 462 < ocfs2_extend_meta_needed(fe))) {
463 mlog(0, "filesystem is really fragmented...\n"); 463 mlog(0, "filesystem is really fragmented...\n");
464 status = -EAGAIN; 464 status = -EAGAIN;
465 reason = RESTART_META; 465 reason = RESTART_META;
466 goto leave; 466 goto leave;
467 } 467 }
468 468
469 status = ocfs2_claim_clusters(osb, handle, data_ac, 1, 469 status = ocfs2_claim_clusters(osb, handle, data_ac, 1,
470 &bit_off, &num_bits); 470 &bit_off, &num_bits);
471 if (status < 0) { 471 if (status < 0) {
472 if (status != -ENOSPC) 472 if (status != -ENOSPC)
473 mlog_errno(status); 473 mlog_errno(status);
474 goto leave; 474 goto leave;
475 } 475 }
476 476
477 BUG_ON(num_bits > clusters_to_add); 477 BUG_ON(num_bits > clusters_to_add);
478 478
479 /* reserve our write early -- insert_extent may update the inode */ 479 /* reserve our write early -- insert_extent may update the inode */
480 status = ocfs2_journal_access(handle, inode, fe_bh, 480 status = ocfs2_journal_access(handle, inode, fe_bh,
481 OCFS2_JOURNAL_ACCESS_WRITE); 481 OCFS2_JOURNAL_ACCESS_WRITE);
482 if (status < 0) { 482 if (status < 0) {
483 mlog_errno(status); 483 mlog_errno(status);
484 goto leave; 484 goto leave;
485 } 485 }
486 486
487 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 487 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
488 mlog(0, "Allocating %u clusters at block %u for inode %llu\n", 488 mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
489 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 489 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
490 status = ocfs2_insert_extent(osb, handle, inode, fe_bh, 490 status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
491 *logical_offset, block, num_bits, 491 *logical_offset, block, num_bits,
492 meta_ac); 492 meta_ac);
493 if (status < 0) { 493 if (status < 0) {
494 mlog_errno(status); 494 mlog_errno(status);
495 goto leave; 495 goto leave;
496 } 496 }
497 497
498 status = ocfs2_journal_dirty(handle, fe_bh); 498 status = ocfs2_journal_dirty(handle, fe_bh);
499 if (status < 0) { 499 if (status < 0) {
500 mlog_errno(status); 500 mlog_errno(status);
501 goto leave; 501 goto leave;
502 } 502 }
503 503
504 clusters_to_add -= num_bits; 504 clusters_to_add -= num_bits;
505 *logical_offset += num_bits; 505 *logical_offset += num_bits;
506 506
507 if (clusters_to_add) { 507 if (clusters_to_add) {
508 mlog(0, "need to alloc once more, clusters = %u, wanted = " 508 mlog(0, "need to alloc once more, clusters = %u, wanted = "
509 "%u\n", fe->i_clusters, clusters_to_add); 509 "%u\n", fe->i_clusters, clusters_to_add);
510 status = -EAGAIN; 510 status = -EAGAIN;
511 reason = RESTART_TRANS; 511 reason = RESTART_TRANS;
512 } 512 }
513 513
514 leave: 514 leave:
515 mlog_exit(status); 515 mlog_exit(status);
516 if (reason_ret) 516 if (reason_ret)
517 *reason_ret = reason; 517 *reason_ret = reason;
518 return status; 518 return status;
519 } 519 }
520 520
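The -EAGAIN contract above deserves a condensed sketch of the expected caller pattern (names are taken from this file; journal access, dirtying and error handling are elided, and the full loop appears in ocfs2_extend_allocation() below):

    why = RESTART_NONE;
    status = ocfs2_do_extend_allocation(osb, inode, &logical_start,
                                        clusters_to_add, bh, handle,
                                        data_ac, meta_ac, &why);
    if (status == -EAGAIN) {
            if (why == RESTART_TRANS) {
                    /* same handle: extend its credits and retry */
                    status = ocfs2_extend_trans(handle, credits);
            } else {
                    /* RESTART_META: commit, reserve more metadata
                     * blocks, then start the whole function over */
            }
    }
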
521 /* 521 /*
522 * For a given allocation, determine which allocators will need to be 522 * For a given allocation, determine which allocators will need to be
523 * accessed, and lock them, reserving the appropriate number of bits. 523 * accessed, and lock them, reserving the appropriate number of bits.
524 * 524 *
525 * Called from ocfs2_extend_allocation() for file systems which don't 525 * Called from ocfs2_extend_allocation() for file systems which don't
526 * support holes, and from ocfs2_write() for file systems which 526 * support holes, and from ocfs2_write() for file systems which
527 * understand sparse inodes. 527 * understand sparse inodes.
528 */ 528 */
529 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, 529 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
530 u32 clusters_to_add, 530 u32 clusters_to_add,
531 struct ocfs2_alloc_context **data_ac, 531 struct ocfs2_alloc_context **data_ac,
532 struct ocfs2_alloc_context **meta_ac) 532 struct ocfs2_alloc_context **meta_ac)
533 { 533 {
534 int ret, num_free_extents; 534 int ret, num_free_extents;
535 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 535 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
536 536
537 *meta_ac = NULL; 537 *meta_ac = NULL;
538 *data_ac = NULL; 538 *data_ac = NULL;
539 539
540 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " 540 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
541 "clusters_to_add = %u\n", 541 "clusters_to_add = %u\n",
542 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), 542 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
543 le32_to_cpu(di->i_clusters), clusters_to_add); 543 le32_to_cpu(di->i_clusters), clusters_to_add);
544 544
545 num_free_extents = ocfs2_num_free_extents(osb, inode, di); 545 num_free_extents = ocfs2_num_free_extents(osb, inode, di);
546 if (num_free_extents < 0) { 546 if (num_free_extents < 0) {
547 ret = num_free_extents; 547 ret = num_free_extents;
548 mlog_errno(ret); 548 mlog_errno(ret);
549 goto out; 549 goto out;
550 } 550 }
551 551
552 /* 552 /*
553 * Sparse allocation file systems need to be more conservative 553 * Sparse allocation file systems need to be more conservative
554 * with reserving room for expansion - the actual allocation 554 * with reserving room for expansion - the actual allocation
555 * happens while we've got a journal handle open so re-taking 555 * happens while we've got a journal handle open so re-taking
556 * a cluster lock (because we ran out of room for another 556 * a cluster lock (because we ran out of room for another
557 * extent) will violate ordering rules. 557 * extent) will violate ordering rules.
558 * 558 *
559 * Most of the time we'll only be seeing this 1 cluster at a time 559 * Most of the time we'll only be seeing this 1 cluster at a time
560 * anyway. 560 * anyway.
561 */ 561 */
562 if (!num_free_extents || 562 if (!num_free_extents ||
563 (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) { 563 (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) {
564 ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); 564 ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
565 if (ret < 0) { 565 if (ret < 0) {
566 if (ret != -ENOSPC) 566 if (ret != -ENOSPC)
567 mlog_errno(ret); 567 mlog_errno(ret);
568 goto out; 568 goto out;
569 } 569 }
570 } 570 }
571 571
572 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); 572 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
573 if (ret < 0) { 573 if (ret < 0) {
574 if (ret != -ENOSPC) 574 if (ret != -ENOSPC)
575 mlog_errno(ret); 575 mlog_errno(ret);
576 goto out; 576 goto out;
577 } 577 }
578 578
579 out: 579 out:
580 if (ret) { 580 if (ret) {
581 if (*meta_ac) { 581 if (*meta_ac) {
582 ocfs2_free_alloc_context(*meta_ac); 582 ocfs2_free_alloc_context(*meta_ac);
583 *meta_ac = NULL; 583 *meta_ac = NULL;
584 } 584 }
585 585
586 /* 586 /*
587 * We cannot have an error and a non null *data_ac. 587 * We cannot have an error and a non null *data_ac.
588 */ 588 */
589 } 589 }
590 590
591 return ret; 591 return ret;
592 } 592 }
593 593
594 static int ocfs2_extend_allocation(struct inode *inode, 594 static int ocfs2_extend_allocation(struct inode *inode,
595 u32 clusters_to_add) 595 u32 clusters_to_add)
596 { 596 {
597 int status = 0; 597 int status = 0;
598 int restart_func = 0; 598 int restart_func = 0;
599 int drop_alloc_sem = 0; 599 int drop_alloc_sem = 0;
600 int credits; 600 int credits;
601 u32 prev_clusters, logical_start; 601 u32 prev_clusters, logical_start;
602 struct buffer_head *bh = NULL; 602 struct buffer_head *bh = NULL;
603 struct ocfs2_dinode *fe = NULL; 603 struct ocfs2_dinode *fe = NULL;
604 handle_t *handle = NULL; 604 handle_t *handle = NULL;
605 struct ocfs2_alloc_context *data_ac = NULL; 605 struct ocfs2_alloc_context *data_ac = NULL;
606 struct ocfs2_alloc_context *meta_ac = NULL; 606 struct ocfs2_alloc_context *meta_ac = NULL;
607 enum ocfs2_alloc_restarted why; 607 enum ocfs2_alloc_restarted why;
608 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 608 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
609 609
610 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); 610 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
611 611
612 /* 612 /*
613 * This function only exists for file systems which don't 613 * This function only exists for file systems which don't
614 * support holes. 614 * support holes.
615 */ 615 */
616 BUG_ON(ocfs2_sparse_alloc(osb)); 616 BUG_ON(ocfs2_sparse_alloc(osb));
617 617
618 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 618 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
619 OCFS2_BH_CACHED, inode); 619 OCFS2_BH_CACHED, inode);
620 if (status < 0) { 620 if (status < 0) {
621 mlog_errno(status); 621 mlog_errno(status);
622 goto leave; 622 goto leave;
623 } 623 }
624 624
625 fe = (struct ocfs2_dinode *) bh->b_data; 625 fe = (struct ocfs2_dinode *) bh->b_data;
626 if (!OCFS2_IS_VALID_DINODE(fe)) { 626 if (!OCFS2_IS_VALID_DINODE(fe)) {
627 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 627 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
628 status = -EIO; 628 status = -EIO;
629 goto leave; 629 goto leave;
630 } 630 }
631 631
632 logical_start = OCFS2_I(inode)->ip_clusters; 632 logical_start = OCFS2_I(inode)->ip_clusters;
633 633
634 restart_all: 634 restart_all:
635 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 635 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
636 636
637 /* blocks people in read/write from reading our allocation 637 /* blocks people in read/write from reading our allocation
638 * until we're done changing it. We depend on i_mutex to block 638 * until we're done changing it. We depend on i_mutex to block
639 * other extend/truncate calls while we're here. Ordering wrt 639 * other extend/truncate calls while we're here. Ordering wrt
640 * start_trans is important here -- always do it before! */ 640 * start_trans is important here -- always do it before! */
641 down_write(&OCFS2_I(inode)->ip_alloc_sem); 641 down_write(&OCFS2_I(inode)->ip_alloc_sem);
642 drop_alloc_sem = 1; 642 drop_alloc_sem = 1;
643 643
644 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac, 644 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
645 &meta_ac); 645 &meta_ac);
646 if (status) { 646 if (status) {
647 mlog_errno(status); 647 mlog_errno(status);
648 goto leave; 648 goto leave;
649 } 649 }
650 650
651 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); 651 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
652 handle = ocfs2_start_trans(osb, credits); 652 handle = ocfs2_start_trans(osb, credits);
653 if (IS_ERR(handle)) { 653 if (IS_ERR(handle)) {
654 status = PTR_ERR(handle); 654 status = PTR_ERR(handle);
655 handle = NULL; 655 handle = NULL;
656 mlog_errno(status); 656 mlog_errno(status);
657 goto leave; 657 goto leave;
658 } 658 }
659 659
660 restarted_transaction: 660 restarted_transaction:
661 /* reserve a write to the file entry early on - so that if we 661 /* reserve a write to the file entry early on - so that if we
662 * run out of credits in the allocation path, we can still 662 * run out of credits in the allocation path, we can still
663 * update i_size. */ 663 * update i_size. */
664 status = ocfs2_journal_access(handle, inode, bh, 664 status = ocfs2_journal_access(handle, inode, bh,
665 OCFS2_JOURNAL_ACCESS_WRITE); 665 OCFS2_JOURNAL_ACCESS_WRITE);
666 if (status < 0) { 666 if (status < 0) {
667 mlog_errno(status); 667 mlog_errno(status);
668 goto leave; 668 goto leave;
669 } 669 }
670 670
671 prev_clusters = OCFS2_I(inode)->ip_clusters; 671 prev_clusters = OCFS2_I(inode)->ip_clusters;
672 672
673 status = ocfs2_do_extend_allocation(osb, 673 status = ocfs2_do_extend_allocation(osb,
674 inode, 674 inode,
675 &logical_start, 675 &logical_start,
676 clusters_to_add, 676 clusters_to_add,
677 bh, 677 bh,
678 handle, 678 handle,
679 data_ac, 679 data_ac,
680 meta_ac, 680 meta_ac,
681 &why); 681 &why);
682 if ((status < 0) && (status != -EAGAIN)) { 682 if ((status < 0) && (status != -EAGAIN)) {
683 if (status != -ENOSPC) 683 if (status != -ENOSPC)
684 mlog_errno(status); 684 mlog_errno(status);
685 goto leave; 685 goto leave;
686 } 686 }
687 687
688 status = ocfs2_journal_dirty(handle, bh); 688 status = ocfs2_journal_dirty(handle, bh);
689 if (status < 0) { 689 if (status < 0) {
690 mlog_errno(status); 690 mlog_errno(status);
691 goto leave; 691 goto leave;
692 } 692 }
693 693
694 spin_lock(&OCFS2_I(inode)->ip_lock); 694 spin_lock(&OCFS2_I(inode)->ip_lock);
695 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 695 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
696 spin_unlock(&OCFS2_I(inode)->ip_lock); 696 spin_unlock(&OCFS2_I(inode)->ip_lock);
697 697
698 if (why != RESTART_NONE && clusters_to_add) { 698 if (why != RESTART_NONE && clusters_to_add) {
699 if (why == RESTART_META) { 699 if (why == RESTART_META) {
700 mlog(0, "restarting function.\n"); 700 mlog(0, "restarting function.\n");
701 restart_func = 1; 701 restart_func = 1;
702 } else { 702 } else {
703 BUG_ON(why != RESTART_TRANS); 703 BUG_ON(why != RESTART_TRANS);
704 704
705 mlog(0, "restarting transaction.\n"); 705 mlog(0, "restarting transaction.\n");
706 /* TODO: This can be more intelligent. */ 706 /* TODO: This can be more intelligent. */
707 credits = ocfs2_calc_extend_credits(osb->sb, 707 credits = ocfs2_calc_extend_credits(osb->sb,
708 fe, 708 fe,
709 clusters_to_add); 709 clusters_to_add);
710 status = ocfs2_extend_trans(handle, credits); 710 status = ocfs2_extend_trans(handle, credits);
711 if (status < 0) { 711 if (status < 0) {
712 /* handle still has to be committed at 712 /* handle still has to be committed at
713 * this point. */ 713 * this point. */
714 status = -ENOMEM; 714 status = -ENOMEM;
715 mlog_errno(status); 715 mlog_errno(status);
716 goto leave; 716 goto leave;
717 } 717 }
718 goto restarted_transaction; 718 goto restarted_transaction;
719 } 719 }
720 } 720 }
721 721
722 mlog(0, "fe: i_clusters = %u, i_size=%llu\n", 722 mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
723 le32_to_cpu(fe->i_clusters), 723 le32_to_cpu(fe->i_clusters),
724 (unsigned long long)le64_to_cpu(fe->i_size)); 724 (unsigned long long)le64_to_cpu(fe->i_size));
725 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", 725 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
726 OCFS2_I(inode)->ip_clusters, i_size_read(inode)); 726 OCFS2_I(inode)->ip_clusters, i_size_read(inode));
727 727
728 leave: 728 leave:
729 if (drop_alloc_sem) { 729 if (drop_alloc_sem) {
730 up_write(&OCFS2_I(inode)->ip_alloc_sem); 730 up_write(&OCFS2_I(inode)->ip_alloc_sem);
731 drop_alloc_sem = 0; 731 drop_alloc_sem = 0;
732 } 732 }
733 if (handle) { 733 if (handle) {
734 ocfs2_commit_trans(osb, handle); 734 ocfs2_commit_trans(osb, handle);
735 handle = NULL; 735 handle = NULL;
736 } 736 }
737 if (data_ac) { 737 if (data_ac) {
738 ocfs2_free_alloc_context(data_ac); 738 ocfs2_free_alloc_context(data_ac);
739 data_ac = NULL; 739 data_ac = NULL;
740 } 740 }
741 if (meta_ac) { 741 if (meta_ac) {
742 ocfs2_free_alloc_context(meta_ac); 742 ocfs2_free_alloc_context(meta_ac);
743 meta_ac = NULL; 743 meta_ac = NULL;
744 } 744 }
745 if ((!status) && restart_func) { 745 if ((!status) && restart_func) {
746 restart_func = 0; 746 restart_func = 0;
747 goto restart_all; 747 goto restart_all;
748 } 748 }
749 if (bh) { 749 if (bh) {
750 brelse(bh); 750 brelse(bh);
751 bh = NULL; 751 bh = NULL;
752 } 752 }
753 753
754 mlog_exit(status); 754 mlog_exit(status);
755 return status; 755 return status;
756 } 756 }
757 757
758 /* Some parts of this taken from generic_cont_expand, which turned out 758 /* Some parts of this taken from generic_cont_expand, which turned out
759 * to be too fragile to do exactly what we need without us having to 759 * to be too fragile to do exactly what we need without us having to
760 * worry about recursive locking in ->prepare_write() and 760 * worry about recursive locking in ->prepare_write() and
761 * ->commit_write(). */ 761 * ->commit_write(). */
762 static int ocfs2_write_zero_page(struct inode *inode, 762 static int ocfs2_write_zero_page(struct inode *inode,
763 u64 size) 763 u64 size)
764 { 764 {
765 struct address_space *mapping = inode->i_mapping; 765 struct address_space *mapping = inode->i_mapping;
766 struct page *page; 766 struct page *page;
767 unsigned long index; 767 unsigned long index;
768 unsigned int offset; 768 unsigned int offset;
769 handle_t *handle = NULL; 769 handle_t *handle = NULL;
770 int ret; 770 int ret;
771 771
772 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ 772 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
773 /* ugh. in prepare/commit_write, if from==to==start of block, we 773 /* ugh. in prepare/commit_write, if from==to==start of block, we
774 ** skip the prepare. make sure we never send an offset for the start 774 ** skip the prepare. make sure we never send an offset for the start
775 ** of a block 775 ** of a block
776 */ 776 */
777 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { 777 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
778 offset++; 778 offset++;
779 } 779 }
780 index = size >> PAGE_CACHE_SHIFT; 780 index = size >> PAGE_CACHE_SHIFT;
781 781
782 page = grab_cache_page(mapping, index); 782 page = grab_cache_page(mapping, index);
783 if (!page) { 783 if (!page) {
784 ret = -ENOMEM; 784 ret = -ENOMEM;
785 mlog_errno(ret); 785 mlog_errno(ret);
786 goto out; 786 goto out;
787 } 787 }
788 788
789 ret = ocfs2_prepare_write_nolock(inode, page, offset, offset); 789 ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
790 if (ret < 0) { 790 if (ret < 0) {
791 mlog_errno(ret); 791 mlog_errno(ret);
792 goto out_unlock; 792 goto out_unlock;
793 } 793 }
794 794
795 if (ocfs2_should_order_data(inode)) { 795 if (ocfs2_should_order_data(inode)) {
796 handle = ocfs2_start_walk_page_trans(inode, page, offset, 796 handle = ocfs2_start_walk_page_trans(inode, page, offset,
797 offset); 797 offset);
798 if (IS_ERR(handle)) { 798 if (IS_ERR(handle)) {
799 ret = PTR_ERR(handle); 799 ret = PTR_ERR(handle);
800 handle = NULL; 800 handle = NULL;
801 goto out_unlock; 801 goto out_unlock;
802 } 802 }
803 } 803 }
804 804
805 /* must not update i_size! */ 805 /* must not update i_size! */
806 ret = block_commit_write(page, offset, offset); 806 ret = block_commit_write(page, offset, offset);
807 if (ret < 0) 807 if (ret < 0)
808 mlog_errno(ret); 808 mlog_errno(ret);
809 else 809 else
810 ret = 0; 810 ret = 0;
811 811
812 if (handle) 812 if (handle)
813 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 813 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
814 out_unlock: 814 out_unlock:
815 unlock_page(page); 815 unlock_page(page);
816 page_cache_release(page); 816 page_cache_release(page);
817 out: 817 out:
818 return ret; 818 return ret;
819 } 819 }
820 820
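A quick worked example of the offset/index math above (4K pages and a 512-byte blocksize are assumptions for illustration):

    /* size = 8192: offset = 8192 & (4096 - 1) = 0, index = 8192 >> 12 = 2.
     * offset 0 sits on a block boundary, so it is bumped to 1; per the
     * comment above, a from == to offset at the start of a block would
     * make the prepare/commit_write pair skip the zeroing work entirely. */
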
821 static int ocfs2_zero_extend(struct inode *inode, 821 static int ocfs2_zero_extend(struct inode *inode,
822 u64 zero_to_size) 822 u64 zero_to_size)
823 { 823 {
824 int ret = 0; 824 int ret = 0;
825 u64 start_off; 825 u64 start_off;
826 struct super_block *sb = inode->i_sb; 826 struct super_block *sb = inode->i_sb;
827 827
828 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 828 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
829 while (start_off < zero_to_size) { 829 while (start_off < zero_to_size) {
830 ret = ocfs2_write_zero_page(inode, start_off); 830 ret = ocfs2_write_zero_page(inode, start_off);
831 if (ret < 0) { 831 if (ret < 0) {
832 mlog_errno(ret); 832 mlog_errno(ret);
833 goto out; 833 goto out;
834 } 834 }
835 835
836 start_off += sb->s_blocksize; 836 start_off += sb->s_blocksize;
837 837
838 /* 838 /*
839 * Very large extends have the potential to lock up 839 * Very large extends have the potential to lock up
840 * the cpu for extended periods of time. 840 * the cpu for extended periods of time.
841 */ 841 */
842 cond_resched(); 842 cond_resched();
843 } 843 }
844 844
845 out: 845 out:
846 return ret; 846 return ret;
847 } 847 }
848 848
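The loop above steps through the region one block at a time. A worked example, assuming a 4K blocksize and ocfs2_align_bytes_to_blocks() rounding up to the next block boundary:

    /* i_size = 10000, zero_to_size = 20480:
     *   start_off = align_up(10000) = 12288
     * ocfs2_write_zero_page() is called for 12288 and 16384, then
     * start_off reaches 20480 and the loop exits -- i_size itself is
     * never touched here. */
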
849 /* 849 /*
850 * A tail_to_skip value > 0 indicates that we're being called from 850 * A tail_to_skip value > 0 indicates that we're being called from
851 * ocfs2_file_aio_write(). This has the following implications: 851 * ocfs2_file_aio_write(). This has the following implications:
852 * 852 *
853 * - we don't want to update i_size 853 * - we don't want to update i_size
854 * - di_bh will be NULL, which is fine because it's only used in the 854 * - di_bh will be NULL, which is fine because it's only used in the
855 * case where we want to update i_size. 855 * case where we want to update i_size.
856 * - ocfs2_zero_extend() will then only be filling the hole created 856 * - ocfs2_zero_extend() will then only be filling the hole created
857 * between i_size and the start of the write. 857 * between i_size and the start of the write.
858 */ 858 */
859 static int ocfs2_extend_file(struct inode *inode, 859 static int ocfs2_extend_file(struct inode *inode,
860 struct buffer_head *di_bh, 860 struct buffer_head *di_bh,
861 u64 new_i_size, 861 u64 new_i_size,
862 size_t tail_to_skip) 862 size_t tail_to_skip)
863 { 863 {
864 int ret = 0; 864 int ret = 0;
865 u32 clusters_to_add = 0; 865 u32 clusters_to_add = 0;
866 866
867 BUG_ON(!tail_to_skip && !di_bh); 867 BUG_ON(!tail_to_skip && !di_bh);
868 868
869 /* setattr sometimes calls us like this. */ 869 /* setattr sometimes calls us like this. */
870 if (new_i_size == 0) 870 if (new_i_size == 0)
871 goto out; 871 goto out;
872 872
873 if (i_size_read(inode) == new_i_size) 873 if (i_size_read(inode) == new_i_size)
874 goto out; 874 goto out;
875 BUG_ON(new_i_size < i_size_read(inode)); 875 BUG_ON(new_i_size < i_size_read(inode));
876 876
877 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { 877 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
878 BUG_ON(tail_to_skip != 0); 878 BUG_ON(tail_to_skip != 0);
879 goto out_update_size; 879 goto out_update_size;
880 } 880 }
881 881
882 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 882 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
883 OCFS2_I(inode)->ip_clusters; 883 OCFS2_I(inode)->ip_clusters;
884 884
885 /* 885 /*
886 * protect the pages that ocfs2_zero_extend is going to be 886 * protect the pages that ocfs2_zero_extend is going to be
887 * pulling into the page cache. We do this before the 887 * pulling into the page cache. We do this before the
888 * metadata extend so that we don't get into the situation 888 * metadata extend so that we don't get into the situation
889 * where we've extended the metadata but can't get the data 889 * where we've extended the metadata but can't get the data
890 * lock to zero. 890 * lock to zero.
891 */ 891 */
892 ret = ocfs2_data_lock(inode, 1); 892 ret = ocfs2_data_lock(inode, 1);
893 if (ret < 0) { 893 if (ret < 0) {
894 mlog_errno(ret); 894 mlog_errno(ret);
895 goto out; 895 goto out;
896 } 896 }
897 897
898 if (clusters_to_add) { 898 if (clusters_to_add) {
899 ret = ocfs2_extend_allocation(inode, clusters_to_add); 899 ret = ocfs2_extend_allocation(inode, clusters_to_add);
900 if (ret < 0) { 900 if (ret < 0) {
901 mlog_errno(ret); 901 mlog_errno(ret);
902 goto out_unlock; 902 goto out_unlock;
903 } 903 }
904 } 904 }
905 905
906 /* 906 /*
907 * Call this even if we don't add any clusters to the tree. We 907 * Call this even if we don't add any clusters to the tree. We
908 * still need to zero the area between the old i_size and the 908 * still need to zero the area between the old i_size and the
909 * new i_size. 909 * new i_size.
910 */ 910 */
911 ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip); 911 ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip);
912 if (ret < 0) { 912 if (ret < 0) {
913 mlog_errno(ret); 913 mlog_errno(ret);
914 goto out_unlock; 914 goto out_unlock;
915 } 915 }
916 916
917 out_update_size: 917 out_update_size:
918 if (!tail_to_skip) { 918 if (!tail_to_skip) {
919 /* We're being called from ocfs2_setattr() which wants 919 /* We're being called from ocfs2_setattr() which wants
920 * us to update i_size */ 920 * us to update i_size */
921 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); 921 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
922 if (ret < 0) 922 if (ret < 0)
923 mlog_errno(ret); 923 mlog_errno(ret);
924 } 924 }
925 925
926 out_unlock: 926 out_unlock:
927 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 927 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
928 ocfs2_data_unlock(inode, 1); 928 ocfs2_data_unlock(inode, 1);
929 929
930 out: 930 out:
931 return ret; 931 return ret;
932 } 932 }
933 933
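Concretely, for the write-path caller further down in this file (ocfs2_prepare_inode_for_write() passes new_i_size = pos + count and tail_to_skip = count), the zeroing target works out to:

    /* zero_to = new_i_size - tail_to_skip
     *         = (pos + count) - count
     *         = pos
     * Only the hole between the old i_size and the start of the write
     * gets zeroed; the write itself covers [pos, pos + count), and
     * i_size is updated at the end of the write, not here. */
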
934 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) 934 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
935 { 935 {
936 int status = 0, size_change; 936 int status = 0, size_change;
937 struct inode *inode = dentry->d_inode; 937 struct inode *inode = dentry->d_inode;
938 struct super_block *sb = inode->i_sb; 938 struct super_block *sb = inode->i_sb;
939 struct ocfs2_super *osb = OCFS2_SB(sb); 939 struct ocfs2_super *osb = OCFS2_SB(sb);
940 struct buffer_head *bh = NULL; 940 struct buffer_head *bh = NULL;
941 handle_t *handle = NULL; 941 handle_t *handle = NULL;
942 942
943 mlog_entry("(0x%p, '%.*s')\n", dentry, 943 mlog_entry("(0x%p, '%.*s')\n", dentry,
944 dentry->d_name.len, dentry->d_name.name); 944 dentry->d_name.len, dentry->d_name.name);
945 945
946 if (attr->ia_valid & ATTR_MODE) 946 if (attr->ia_valid & ATTR_MODE)
947 mlog(0, "mode change: %d\n", attr->ia_mode); 947 mlog(0, "mode change: %d\n", attr->ia_mode);
948 if (attr->ia_valid & ATTR_UID) 948 if (attr->ia_valid & ATTR_UID)
949 mlog(0, "uid change: %d\n", attr->ia_uid); 949 mlog(0, "uid change: %d\n", attr->ia_uid);
950 if (attr->ia_valid & ATTR_GID) 950 if (attr->ia_valid & ATTR_GID)
951 mlog(0, "gid change: %d\n", attr->ia_gid); 951 mlog(0, "gid change: %d\n", attr->ia_gid);
952 if (attr->ia_valid & ATTR_SIZE) 952 if (attr->ia_valid & ATTR_SIZE)
953 mlog(0, "size change...\n"); 953 mlog(0, "size change...\n");
954 if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) 954 if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
955 mlog(0, "time change...\n"); 955 mlog(0, "time change...\n");
956 956
957 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ 957 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
958 | ATTR_GID | ATTR_UID | ATTR_MODE) 958 | ATTR_GID | ATTR_UID | ATTR_MODE)
959 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { 959 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
960 mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); 960 mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
961 return 0; 961 return 0;
962 } 962 }
963 963
964 status = inode_change_ok(inode, attr); 964 status = inode_change_ok(inode, attr);
965 if (status) 965 if (status)
966 return status; 966 return status;
967 967
968 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 968 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
969 if (size_change) { 969 if (size_change) {
970 status = ocfs2_rw_lock(inode, 1); 970 status = ocfs2_rw_lock(inode, 1);
971 if (status < 0) { 971 if (status < 0) {
972 mlog_errno(status); 972 mlog_errno(status);
973 goto bail; 973 goto bail;
974 } 974 }
975 } 975 }
976 976
977 status = ocfs2_meta_lock(inode, &bh, 1); 977 status = ocfs2_meta_lock(inode, &bh, 1);
978 if (status < 0) { 978 if (status < 0) {
979 if (status != -ENOENT) 979 if (status != -ENOENT)
980 mlog_errno(status); 980 mlog_errno(status);
981 goto bail_unlock_rw; 981 goto bail_unlock_rw;
982 } 982 }
983 983
984 if (size_change && attr->ia_size != i_size_read(inode)) { 984 if (size_change && attr->ia_size != i_size_read(inode)) {
985 if (i_size_read(inode) > attr->ia_size) 985 if (i_size_read(inode) > attr->ia_size)
986 status = ocfs2_truncate_file(inode, bh, attr->ia_size); 986 status = ocfs2_truncate_file(inode, bh, attr->ia_size);
987 else 987 else
988 status = ocfs2_extend_file(inode, bh, attr->ia_size, 0); 988 status = ocfs2_extend_file(inode, bh, attr->ia_size, 0);
989 if (status < 0) { 989 if (status < 0) {
990 if (status != -ENOSPC) 990 if (status != -ENOSPC)
991 mlog_errno(status); 991 mlog_errno(status);
992 status = -ENOSPC; 992 status = -ENOSPC;
993 goto bail_unlock; 993 goto bail_unlock;
994 } 994 }
995 } 995 }
996 996
997 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 997 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
998 if (IS_ERR(handle)) { 998 if (IS_ERR(handle)) {
999 status = PTR_ERR(handle); 999 status = PTR_ERR(handle);
1000 mlog_errno(status); 1000 mlog_errno(status);
1001 goto bail_unlock; 1001 goto bail_unlock;
1002 } 1002 }
1003 1003
1004 /*
1005 * This will intentionally not wind up calling vmtruncate(),
1006 * since all the work for a size change has been done above.
1007 * Otherwise, we could get into problems with truncate as
1008 * ip_alloc_sem is used there to protect against i_size
1009 * changes.
1010 */
1004 status = inode_setattr(inode, attr); 1011 status = inode_setattr(inode, attr);
1005 if (status < 0) { 1012 if (status < 0) {
1006 mlog_errno(status); 1013 mlog_errno(status);
1007 goto bail_commit; 1014 goto bail_commit;
1008 } 1015 }
1009 1016
1010 status = ocfs2_mark_inode_dirty(handle, inode, bh); 1017 status = ocfs2_mark_inode_dirty(handle, inode, bh);
1011 if (status < 0) 1018 if (status < 0)
1012 mlog_errno(status); 1019 mlog_errno(status);
1013 1020
1014 bail_commit: 1021 bail_commit:
1015 ocfs2_commit_trans(osb, handle); 1022 ocfs2_commit_trans(osb, handle);
1016 bail_unlock: 1023 bail_unlock:
1017 ocfs2_meta_unlock(inode, 1); 1024 ocfs2_meta_unlock(inode, 1);
1018 bail_unlock_rw: 1025 bail_unlock_rw:
1019 if (size_change) 1026 if (size_change)
1020 ocfs2_rw_unlock(inode, 1); 1027 ocfs2_rw_unlock(inode, 1);
1021 bail: 1028 bail:
1022 if (bh) 1029 if (bh)
1023 brelse(bh); 1030 brelse(bh);
1024 1031
1025 mlog_exit(status); 1032 mlog_exit(status);
1026 return status; 1033 return status;
1027 } 1034 }
1028 1035
1029 int ocfs2_getattr(struct vfsmount *mnt, 1036 int ocfs2_getattr(struct vfsmount *mnt,
1030 struct dentry *dentry, 1037 struct dentry *dentry,
1031 struct kstat *stat) 1038 struct kstat *stat)
1032 { 1039 {
1033 struct inode *inode = dentry->d_inode; 1040 struct inode *inode = dentry->d_inode;
1034 struct super_block *sb = dentry->d_inode->i_sb; 1041 struct super_block *sb = dentry->d_inode->i_sb;
1035 struct ocfs2_super *osb = sb->s_fs_info; 1042 struct ocfs2_super *osb = sb->s_fs_info;
1036 int err; 1043 int err;
1037 1044
1038 mlog_entry_void(); 1045 mlog_entry_void();
1039 1046
1040 err = ocfs2_inode_revalidate(dentry); 1047 err = ocfs2_inode_revalidate(dentry);
1041 if (err) { 1048 if (err) {
1042 if (err != -ENOENT) 1049 if (err != -ENOENT)
1043 mlog_errno(err); 1050 mlog_errno(err);
1044 goto bail; 1051 goto bail;
1045 } 1052 }
1046 1053
1047 generic_fillattr(inode, stat); 1054 generic_fillattr(inode, stat);
1048 1055
1049 /* We set the blksize from the cluster size for performance */ 1056 /* We set the blksize from the cluster size for performance */
1050 stat->blksize = osb->s_clustersize; 1057 stat->blksize = osb->s_clustersize;
1051 1058
1052 bail: 1059 bail:
1053 mlog_exit(err); 1060 mlog_exit(err);
1054 1061
1055 return err; 1062 return err;
1056 } 1063 }
1057 1064
1058 int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) 1065 int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
1059 { 1066 {
1060 int ret; 1067 int ret;
1061 1068
1062 mlog_entry_void(); 1069 mlog_entry_void();
1063 1070
1064 ret = ocfs2_meta_lock(inode, NULL, 0); 1071 ret = ocfs2_meta_lock(inode, NULL, 0);
1065 if (ret) { 1072 if (ret) {
1066 if (ret != -ENOENT) 1073 if (ret != -ENOENT)
1067 mlog_errno(ret); 1074 mlog_errno(ret);
1068 goto out; 1075 goto out;
1069 } 1076 }
1070 1077
1071 ret = generic_permission(inode, mask, NULL); 1078 ret = generic_permission(inode, mask, NULL);
1072 1079
1073 ocfs2_meta_unlock(inode, 0); 1080 ocfs2_meta_unlock(inode, 0);
1074 out: 1081 out:
1075 mlog_exit(ret); 1082 mlog_exit(ret);
1076 return ret; 1083 return ret;
1077 } 1084 }
1078 1085
1079 static int ocfs2_write_remove_suid(struct inode *inode) 1086 static int ocfs2_write_remove_suid(struct inode *inode)
1080 { 1087 {
1081 int ret; 1088 int ret;
1082 struct buffer_head *bh = NULL; 1089 struct buffer_head *bh = NULL;
1083 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1090 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1084 handle_t *handle; 1091 handle_t *handle;
1085 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1092 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1086 struct ocfs2_dinode *di; 1093 struct ocfs2_dinode *di;
1087 1094
1088 mlog_entry("(Inode %llu, mode 0%o)\n", 1095 mlog_entry("(Inode %llu, mode 0%o)\n",
1089 (unsigned long long)oi->ip_blkno, inode->i_mode); 1096 (unsigned long long)oi->ip_blkno, inode->i_mode);
1090 1097
1091 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1098 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1092 if (IS_ERR(handle)) { 1099 if (IS_ERR(handle)) {
1093 ret = PTR_ERR(handle); 1100 ret = PTR_ERR(handle);
1094 mlog_errno(ret); 1101 mlog_errno(ret);
1095 goto out; 1102 goto out;
1096 } 1103 }
1097 1104
1098 ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); 1105 ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
1099 if (ret < 0) { 1106 if (ret < 0) {
1100 mlog_errno(ret); 1107 mlog_errno(ret);
1101 goto out_trans; 1108 goto out_trans;
1102 } 1109 }
1103 1110
1104 ret = ocfs2_journal_access(handle, inode, bh, 1111 ret = ocfs2_journal_access(handle, inode, bh,
1105 OCFS2_JOURNAL_ACCESS_WRITE); 1112 OCFS2_JOURNAL_ACCESS_WRITE);
1106 if (ret < 0) { 1113 if (ret < 0) {
1107 mlog_errno(ret); 1114 mlog_errno(ret);
1108 goto out_bh; 1115 goto out_bh;
1109 } 1116 }
1110 1117
1111 inode->i_mode &= ~S_ISUID; 1118 inode->i_mode &= ~S_ISUID;
1112 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) 1119 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
1113 inode->i_mode &= ~S_ISGID; 1120 inode->i_mode &= ~S_ISGID;
1114 1121
1115 di = (struct ocfs2_dinode *) bh->b_data; 1122 di = (struct ocfs2_dinode *) bh->b_data;
1116 di->i_mode = cpu_to_le16(inode->i_mode); 1123 di->i_mode = cpu_to_le16(inode->i_mode);
1117 1124
1118 ret = ocfs2_journal_dirty(handle, bh); 1125 ret = ocfs2_journal_dirty(handle, bh);
1119 if (ret < 0) 1126 if (ret < 0)
1120 mlog_errno(ret); 1127 mlog_errno(ret);
1121 out_bh: 1128 out_bh:
1122 brelse(bh); 1129 brelse(bh);
1123 out_trans: 1130 out_trans:
1124 ocfs2_commit_trans(osb, handle); 1131 ocfs2_commit_trans(osb, handle);
1125 out: 1132 out:
1126 mlog_exit(ret); 1133 mlog_exit(ret);
1127 return ret; 1134 return ret;
1128 } 1135 }
1129 1136
1130 /* 1137 /*
1131 * Will look for holes and unwritten extents in the range starting at 1138 * Will look for holes and unwritten extents in the range starting at
1132 * pos for count bytes (inclusive). 1139 * pos for count bytes (inclusive).
1133 */ 1140 */
1134 static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, 1141 static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
1135 size_t count) 1142 size_t count)
1136 { 1143 {
1137 int ret = 0; 1144 int ret = 0;
1138 unsigned int extent_flags; 1145 unsigned int extent_flags;
1139 u32 cpos, clusters, extent_len, phys_cpos; 1146 u32 cpos, clusters, extent_len, phys_cpos;
1140 struct super_block *sb = inode->i_sb; 1147 struct super_block *sb = inode->i_sb;
1141 1148
1142 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; 1149 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1143 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; 1150 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1144 1151
1145 while (clusters) { 1152 while (clusters) {
1146 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, 1153 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1147 &extent_flags); 1154 &extent_flags);
1148 if (ret < 0) { 1155 if (ret < 0) {
1149 mlog_errno(ret); 1156 mlog_errno(ret);
1150 goto out; 1157 goto out;
1151 } 1158 }
1152 1159
1153 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { 1160 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
1154 ret = 1; 1161 ret = 1;
1155 break; 1162 break;
1156 } 1163 }
1157 1164
1158 if (extent_len > clusters) 1165 if (extent_len > clusters)
1159 extent_len = clusters; 1166 extent_len = clusters;
1160 1167
1161 clusters -= extent_len; 1168 clusters -= extent_len;
1162 cpos += extent_len; 1169 cpos += extent_len;
1163 } 1170 }
1164 out: 1171 out:
1165 return ret; 1172 return ret;
1166 } 1173 }
1167 1174
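The byte-to-cluster conversion at the top of this function covers any partially touched cluster. A worked example (a 64K cluster size, i.e. s_clustersize_bits = 16, is an assumption for illustration):

    /* pos = 70000, count = 100:
     *   cpos     = 70000 >> 16                          = 1
     *   clusters = ocfs2_clusters_for_bytes(sb, 70100) - 1
     *            = 2 - 1                                = 1
     * so the loop walks exactly the one cluster holding [70000, 70100). */
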
1168 static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 1175 static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1169 loff_t *ppos, 1176 loff_t *ppos,
1170 size_t count, 1177 size_t count,
1171 int appending, 1178 int appending,
1172 int *direct_io) 1179 int *direct_io)
1173 { 1180 {
1174 int ret = 0, meta_level = appending; 1181 int ret = 0, meta_level = appending;
1175 struct inode *inode = dentry->d_inode; 1182 struct inode *inode = dentry->d_inode;
1176 u32 clusters; 1183 u32 clusters;
1177 loff_t newsize, saved_pos; 1184 loff_t newsize, saved_pos;
1178 1185
1179 /* 1186 /*
1180 * We sample i_size under a read level meta lock to see if our write 1187 * We sample i_size under a read level meta lock to see if our write
1181 * is extending the file, if it is we back off and get a write level 1188 * is extending the file, if it is we back off and get a write level
1182 * meta lock. 1189 * meta lock.
1183 */ 1190 */
1184 for(;;) { 1191 for(;;) {
1185 ret = ocfs2_meta_lock(inode, NULL, meta_level); 1192 ret = ocfs2_meta_lock(inode, NULL, meta_level);
1186 if (ret < 0) { 1193 if (ret < 0) {
1187 meta_level = -1; 1194 meta_level = -1;
1188 mlog_errno(ret); 1195 mlog_errno(ret);
1189 goto out; 1196 goto out;
1190 } 1197 }
1191 1198
1192 /* Clear suid / sgid if necessary. We do this here 1199 /* Clear suid / sgid if necessary. We do this here
1193 * instead of later in the write path because 1200 * instead of later in the write path because
1194 * remove_suid() calls ->setattr without any hint that 1201 * remove_suid() calls ->setattr without any hint that
1195 * we may have already done our cluster locking. Since 1202 * we may have already done our cluster locking. Since
1196 * ocfs2_setattr() *must* take cluster locks to 1203 * ocfs2_setattr() *must* take cluster locks to
1197 * proceed, this will lead us to recursively lock the 1204 * proceed, this will lead us to recursively lock the
1198 * inode. There's also the dinode i_size state which 1205 * inode. There's also the dinode i_size state which
1199 * can be lost via setattr during extending writes (we 1206 * can be lost via setattr during extending writes (we
1200 * set inode->i_size at the end of a write). */ 1207 * set inode->i_size at the end of a write). */
1201 if (should_remove_suid(dentry)) { 1208 if (should_remove_suid(dentry)) {
1202 if (meta_level == 0) { 1209 if (meta_level == 0) {
1203 ocfs2_meta_unlock(inode, meta_level); 1210 ocfs2_meta_unlock(inode, meta_level);
1204 meta_level = 1; 1211 meta_level = 1;
1205 continue; 1212 continue;
1206 } 1213 }
1207 1214
1208 ret = ocfs2_write_remove_suid(inode); 1215 ret = ocfs2_write_remove_suid(inode);
1209 if (ret < 0) { 1216 if (ret < 0) {
1210 mlog_errno(ret); 1217 mlog_errno(ret);
1211 goto out_unlock; 1218 goto out_unlock;
1212 } 1219 }
1213 } 1220 }
1214 1221
1215 /* work on a copy of ppos until we're sure that we won't have 1222 /* work on a copy of ppos until we're sure that we won't have
1216 * to recalculate it due to relocking. */ 1223 * to recalculate it due to relocking. */
1217 if (appending) { 1224 if (appending) {
1218 saved_pos = i_size_read(inode); 1225 saved_pos = i_size_read(inode);
1219 mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); 1226 mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
1220 } else { 1227 } else {
1221 saved_pos = *ppos; 1228 saved_pos = *ppos;
1222 } 1229 }
1223 1230
1224 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { 1231 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
1225 loff_t end = saved_pos + count; 1232 loff_t end = saved_pos + count;
1226 1233
1227 /* 1234 /*
1228 * Skip the O_DIRECT checks if we don't need 1235 * Skip the O_DIRECT checks if we don't need
1229 * them. 1236 * them.
1230 */ 1237 */
1231 if (!direct_io || !(*direct_io)) 1238 if (!direct_io || !(*direct_io))
1232 break; 1239 break;
1233 1240
1234 /* 1241 /*
1235 * Allowing concurrent direct writes means 1242 * Allowing concurrent direct writes means
1236 * i_size changes wouldn't be synchronized, so 1243 * i_size changes wouldn't be synchronized, so
1237 * one node could wind up truncating another 1244 * one node could wind up truncating another
1238 * node's writes. 1245 * node's writes.
1239 */ 1246 */
1240 if (end > i_size_read(inode)) { 1247 if (end > i_size_read(inode)) {
1241 *direct_io = 0; 1248 *direct_io = 0;
1242 break; 1249 break;
1243 } 1250 }
1244 1251
1245 /* 1252 /*
1246 * We don't fill holes during direct io, so 1253 * We don't fill holes during direct io, so
1247 * check for them here. If any are found, the 1254 * check for them here. If any are found, the
1248 * caller will have to retake some cluster 1255 * caller will have to retake some cluster
1249 * locks and initiate the io as buffered. 1256 * locks and initiate the io as buffered.
1250 */ 1257 */
1251 ret = ocfs2_check_range_for_holes(inode, saved_pos, 1258 ret = ocfs2_check_range_for_holes(inode, saved_pos,
1252 count); 1259 count);
1253 if (ret == 1) { 1260 if (ret == 1) {
1254 *direct_io = 0; 1261 *direct_io = 0;
1255 ret = 0; 1262 ret = 0;
1256 } else if (ret < 0) 1263 } else if (ret < 0)
1257 mlog_errno(ret); 1264 mlog_errno(ret);
1258 break; 1265 break;
1259 } 1266 }
1260 1267
1261 /* 1268 /*
1262 * The rest of this loop is concerned with legacy file 1269 * The rest of this loop is concerned with legacy file
1263 * systems which don't support sparse files. 1270 * systems which don't support sparse files.
1264 */ 1271 */
1265 1272
1266 newsize = count + saved_pos; 1273 newsize = count + saved_pos;
1267 1274
1268 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", 1275 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
1269 (long long) saved_pos, (long long) newsize, 1276 (long long) saved_pos, (long long) newsize,
1270 (long long) i_size_read(inode)); 1277 (long long) i_size_read(inode));
1271 1278
1272 /* No need for a higher level metadata lock if we're 1279 /* No need for a higher level metadata lock if we're
1273 * never going past i_size. */ 1280 * never going past i_size. */
1274 if (newsize <= i_size_read(inode)) 1281 if (newsize <= i_size_read(inode))
1275 break; 1282 break;
1276 1283
1277 if (meta_level == 0) { 1284 if (meta_level == 0) {
1278 ocfs2_meta_unlock(inode, meta_level); 1285 ocfs2_meta_unlock(inode, meta_level);
1279 meta_level = 1; 1286 meta_level = 1;
1280 continue; 1287 continue;
1281 } 1288 }
1282 1289
1283 spin_lock(&OCFS2_I(inode)->ip_lock); 1290 spin_lock(&OCFS2_I(inode)->ip_lock);
1284 clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - 1291 clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -
1285 OCFS2_I(inode)->ip_clusters; 1292 OCFS2_I(inode)->ip_clusters;
1286 spin_unlock(&OCFS2_I(inode)->ip_lock); 1293 spin_unlock(&OCFS2_I(inode)->ip_lock);
1287 1294
1288 mlog(0, "Writing at EOF, may need more allocation: " 1295 mlog(0, "Writing at EOF, may need more allocation: "
1289 "i_size = %lld, newsize = %lld, need %u clusters\n", 1296 "i_size = %lld, newsize = %lld, need %u clusters\n",
1290 (long long) i_size_read(inode), (long long) newsize, 1297 (long long) i_size_read(inode), (long long) newsize,
1291 clusters); 1298 clusters);
1292 1299
1293 /* We only want to continue the rest of this loop if 1300 /* We only want to continue the rest of this loop if
1294 * our extend will actually require more 1301 * our extend will actually require more
1295 * allocation. */ 1302 * allocation. */
1296 if (!clusters) 1303 if (!clusters)
1297 break; 1304 break;
1298 1305
1299 ret = ocfs2_extend_file(inode, NULL, newsize, count); 1306 ret = ocfs2_extend_file(inode, NULL, newsize, count);
1300 if (ret < 0) { 1307 if (ret < 0) {
1301 if (ret != -ENOSPC) 1308 if (ret != -ENOSPC)
1302 mlog_errno(ret); 1309 mlog_errno(ret);
1303 goto out_unlock; 1310 goto out_unlock;
1304 } 1311 }
1305 break; 1312 break;
1306 } 1313 }
1307 1314
1308 if (appending) 1315 if (appending)
1309 *ppos = saved_pos; 1316 *ppos = saved_pos;
1310 1317
1311 out_unlock: 1318 out_unlock:
1312 ocfs2_meta_unlock(inode, meta_level); 1319 ocfs2_meta_unlock(inode, meta_level);
1313 1320
1314 out: 1321 out:
1315 return ret; 1322 return ret;
1316 } 1323 }
1317 1324
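The meta_level dance above is a common cluster-locking pattern: sample state under a read-level lock, and if the operation turns out to extend the file, drop the lock and re-take it at write level before re-sampling. Stripped to its skeleton (declarations and error handling elided):

    for (;;) {
            ocfs2_meta_lock(inode, NULL, meta_level);
            /* ... sample i_size, compute newsize ... */
            if (newsize <= i_size_read(inode))
                    break;                  /* read level suffices */
            if (meta_level == 0) {
                    ocfs2_meta_unlock(inode, meta_level);
                    meta_level = 1;         /* escalate, re-sample */
                    continue;
            }
            /* write-level lock held: safe to extend the file */
            break;
    }
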
1318 static inline void 1325 static inline void
1319 ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) 1326 ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
1320 { 1327 {
1321 const struct iovec *iov = *iovp; 1328 const struct iovec *iov = *iovp;
1322 size_t base = *basep; 1329 size_t base = *basep;
1323 1330
1324 do { 1331 do {
1325 int copy = min(bytes, iov->iov_len - base); 1332 int copy = min(bytes, iov->iov_len - base);
1326 1333
1327 bytes -= copy; 1334 bytes -= copy;
1328 base += copy; 1335 base += copy;
1329 if (iov->iov_len == base) { 1336 if (iov->iov_len == base) {
1330 iov++; 1337 iov++;
1331 base = 0; 1338 base = 0;
1332 } 1339 }
1333 } while (bytes); 1340 } while (bytes);
1334 *iovp = iov; 1341 *iovp = iov;
1335 *basep = base; 1342 *basep = base;
1336 } 1343 }
1337 1344
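ocfs2_set_next_iovec() advances an (iovec, offset) cursor by a byte count, spilling into the following vectors as each one is exhausted. A standalone userspace sketch of the same cursor logic (a hypothetical demo, not part of this patch):

    #include <stdio.h>
    #include <sys/uio.h>

    /* Userspace copy of the cursor-advance logic in ocfs2_set_next_iovec():
     * move (iov, base) forward by `bytes`, hopping to the next vector each
     * time the current one is exhausted. */
    static void advance_iovec(const struct iovec **iovp, size_t *basep,
                              size_t bytes)
    {
            const struct iovec *iov = *iovp;
            size_t base = *basep;

            while (bytes) {
                    size_t avail = iov->iov_len - base;
                    size_t copy = bytes < avail ? bytes : avail;

                    bytes -= copy;
                    base += copy;
                    if (base == iov->iov_len) {
                            iov++;          /* current vector exhausted */
                            base = 0;
                    }
            }
            *iovp = iov;
            *basep = base;
    }

    int main(void)
    {
            char a[10], b[10];
            struct iovec vec[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
            const struct iovec *cur = vec;
            size_t off = 0;

            /* consume all of a[] plus the first 3 bytes of b[] */
            advance_iovec(&cur, &off, 13);
            printf("vector %td, offset %zu\n", cur - vec, off); /* 1, 3 */
            return 0;
    }
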
1338 static struct page * ocfs2_get_write_source(char **ret_src_buf, 1345 static struct page * ocfs2_get_write_source(char **ret_src_buf,
1339 const struct iovec *cur_iov, 1346 const struct iovec *cur_iov,
1340 size_t iov_offset) 1347 size_t iov_offset)
1341 { 1348 {
1342 int ret; 1349 int ret;
1343 char *buf = cur_iov->iov_base + iov_offset; 1350 char *buf = cur_iov->iov_base + iov_offset;
1344 struct page *src_page = NULL; 1351 struct page *src_page = NULL;
1345 unsigned long off; 1352 unsigned long off;
1346 1353
1347 off = (unsigned long)(buf) & ~PAGE_CACHE_MASK; 1354 off = (unsigned long)(buf) & ~PAGE_CACHE_MASK;
1348 1355
1349 if (!segment_eq(get_fs(), KERNEL_DS)) { 1356 if (!segment_eq(get_fs(), KERNEL_DS)) {
1350 /* 1357 /*
1351 * Pull in the user page. We want to do this outside 1358 * Pull in the user page. We want to do this outside
1352 * of the meta data locks in order to preserve locking 1359 * of the meta data locks in order to preserve locking
1353 * order in case of page fault. 1360 * order in case of page fault.
1354 */ 1361 */
1355 ret = get_user_pages(current, current->mm, 1362 ret = get_user_pages(current, current->mm,
1356 (unsigned long)buf & PAGE_CACHE_MASK, 1, 1363 (unsigned long)buf & PAGE_CACHE_MASK, 1,
1357 0, 0, &src_page, NULL); 1364 0, 0, &src_page, NULL);
1358 if (ret == 1) 1365 if (ret == 1)
1359 *ret_src_buf = kmap(src_page) + off; 1366 *ret_src_buf = kmap(src_page) + off;
1360 else 1367 else
1361 src_page = ERR_PTR(-EFAULT); 1368 src_page = ERR_PTR(-EFAULT);
1362 } else { 1369 } else {
1363 *ret_src_buf = buf; 1370 *ret_src_buf = buf;
1364 } 1371 }
1365 1372
1366 return src_page; 1373 return src_page;
1367 } 1374 }
1368 1375
1369 static void ocfs2_put_write_source(struct page *page) 1376 static void ocfs2_put_write_source(struct page *page)
1370 { 1377 {
1371 if (page) { 1378 if (page) {
1372 kunmap(page); 1379 kunmap(page);
1373 page_cache_release(page); 1380 page_cache_release(page);
1374 } 1381 }
1375 } 1382 }
1376 1383
1377 static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, 1384 static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1378 const struct iovec *iov, 1385 const struct iovec *iov,
1379 unsigned long nr_segs, 1386 unsigned long nr_segs,
1380 size_t count, 1387 size_t count,
1381 ssize_t o_direct_written) 1388 ssize_t o_direct_written)
1382 { 1389 {
1383 int ret = 0; 1390 int ret = 0;
1384 ssize_t copied, total = 0; 1391 ssize_t copied, total = 0;
1385 size_t iov_offset = 0, bytes; 1392 size_t iov_offset = 0, bytes;
1386 loff_t pos; 1393 loff_t pos;
1387 const struct iovec *cur_iov = iov; 1394 const struct iovec *cur_iov = iov;
1388 struct page *user_page, *page; 1395 struct page *user_page, *page;
1389 char *buf, *dst; 1396 char *buf, *dst;
1390 void *fsdata; 1397 void *fsdata;
1391 1398
1392 /* 1399 /*
1393 * handle partial DIO write. Adjust cur_iov if needed. 1400 * handle partial DIO write. Adjust cur_iov if needed.
1394 */ 1401 */
1395 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); 1402 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
1396 1403
1397 do { 1404 do {
1398 pos = *ppos; 1405 pos = *ppos;
1399 1406
1400 user_page = ocfs2_get_write_source(&buf, cur_iov, iov_offset); 1407 user_page = ocfs2_get_write_source(&buf, cur_iov, iov_offset);
1401 if (IS_ERR(user_page)) { 1408 if (IS_ERR(user_page)) {
1402 ret = PTR_ERR(user_page); 1409 ret = PTR_ERR(user_page);
1403 goto out; 1410 goto out;
1404 } 1411 }
1405 1412
1406 /* Stay within our page boundaries */ 1413 /* Stay within our page boundaries */
1407 bytes = min((PAGE_CACHE_SIZE - ((unsigned long)pos & ~PAGE_CACHE_MASK)), 1414 bytes = min((PAGE_CACHE_SIZE - ((unsigned long)pos & ~PAGE_CACHE_MASK)),
1408 (PAGE_CACHE_SIZE - ((unsigned long)buf & ~PAGE_CACHE_MASK))); 1415 (PAGE_CACHE_SIZE - ((unsigned long)buf & ~PAGE_CACHE_MASK)));
1409 /* Stay within the vector boundary */ 1416 /* Stay within the vector boundary */
1410 bytes = min_t(size_t, bytes, cur_iov->iov_len - iov_offset); 1417 bytes = min_t(size_t, bytes, cur_iov->iov_len - iov_offset);
1411 /* Stay within count */ 1418 /* Stay within count */
1412 bytes = min(bytes, count); 1419 bytes = min(bytes, count);
1413 1420
1414 page = NULL; 1421 page = NULL;
1415 ret = ocfs2_write_begin(file, file->f_mapping, pos, bytes, 0, 1422 ret = ocfs2_write_begin(file, file->f_mapping, pos, bytes, 0,
1416 &page, &fsdata); 1423 &page, &fsdata);
1417 if (ret) { 1424 if (ret) {
1418 mlog_errno(ret); 1425 mlog_errno(ret);
1419 goto out; 1426 goto out;
1420 } 1427 }
1421 1428
1422 dst = kmap_atomic(page, KM_USER0); 1429 dst = kmap_atomic(page, KM_USER0);
1423 memcpy(dst + (pos & (PAGE_CACHE_SIZE - 1)), buf, bytes); 1430 memcpy(dst + (pos & (PAGE_CACHE_SIZE - 1)), buf, bytes);
1424 kunmap_atomic(dst, KM_USER0); 1431 kunmap_atomic(dst, KM_USER0);
1425 flush_dcache_page(page); 1432 flush_dcache_page(page);
1426 ocfs2_put_write_source(user_page); 1433 ocfs2_put_write_source(user_page);
1427 1434
1428 copied = ocfs2_write_end(file, file->f_mapping, pos, bytes, 1435 copied = ocfs2_write_end(file, file->f_mapping, pos, bytes,
1429 bytes, page, fsdata); 1436 bytes, page, fsdata);
1430 if (copied < 0) { 1437 if (copied < 0) {
1431 mlog_errno(copied); 1438 mlog_errno(copied);
1432 ret = copied; 1439 ret = copied;
1433 goto out; 1440 goto out;
1434 } 1441 }
1435 1442
1436 total += copied; 1443 total += copied;
1437 *ppos = pos + copied; 1444 *ppos = pos + copied;
1438 count -= copied; 1445 count -= copied;
1439 1446
1440 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); 1447 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
1441 } while(count); 1448 } while(count);
1442 1449
1443 out: 1450 out:
1444 return total ? total : ret; 1451 return total ? total : ret;
1445 } 1452 }
1446 1453
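Each pass through the loop above clamps the copy size three ways before touching the page cache. A worked example (all numbers are illustrative):

    /* pos = 4090, user buffer offset within its page = 100, count = 20,
     * cur_iov->iov_len - iov_offset = 1000 (assumed):
     *   bytes = min(4096 - 4090, 4096 - 100) = 6   (page boundaries)
     *   bytes = min(6, 1000)                 = 6   (vector boundary)
     *   bytes = min(6, 20)                   = 6   (bytes remaining)
     * This iteration copies 6 bytes, finishing the destination page;
     * the next iteration starts at pos = 4096. */
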
1447 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, 1454 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1448 const struct iovec *iov, 1455 const struct iovec *iov,
1449 unsigned long nr_segs, 1456 unsigned long nr_segs,
1450 loff_t pos) 1457 loff_t pos)
1451 { 1458 {
1452 int ret, direct_io, appending, rw_level, have_alloc_sem = 0; 1459 int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
1453 int can_do_direct, sync = 0; 1460 int can_do_direct, sync = 0;
1454 ssize_t written = 0; 1461 ssize_t written = 0;
1455 size_t ocount; /* original count */ 1462 size_t ocount; /* original count */
1456 size_t count; /* after file limit checks */ 1463 size_t count; /* after file limit checks */
1457 loff_t *ppos = &iocb->ki_pos; 1464 loff_t *ppos = &iocb->ki_pos;
1458 struct file *file = iocb->ki_filp; 1465 struct file *file = iocb->ki_filp;
1459 struct inode *inode = file->f_path.dentry->d_inode; 1466 struct inode *inode = file->f_path.dentry->d_inode;
1460 1467
1461 mlog_entry("(0x%p, %u, '%.*s')\n", file, 1468 mlog_entry("(0x%p, %u, '%.*s')\n", file,
1462 (unsigned int)nr_segs, 1469 (unsigned int)nr_segs,
1463 file->f_path.dentry->d_name.len, 1470 file->f_path.dentry->d_name.len,
1464 file->f_path.dentry->d_name.name); 1471 file->f_path.dentry->d_name.name);
1465 1472
1466 if (iocb->ki_left == 0) 1473 if (iocb->ki_left == 0)
1467 return 0; 1474 return 0;
1468 1475
1469 ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); 1476 ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
1470 if (ret) 1477 if (ret)
1471 return ret; 1478 return ret;
1472 1479
1473 count = ocount; 1480 count = ocount;
1474 1481
1475 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 1482 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1476 1483
1477 appending = file->f_flags & O_APPEND ? 1 : 0; 1484 appending = file->f_flags & O_APPEND ? 1 : 0;
1478 direct_io = file->f_flags & O_DIRECT ? 1 : 0; 1485 direct_io = file->f_flags & O_DIRECT ? 1 : 0;
1479 1486
1480 mutex_lock(&inode->i_mutex); 1487 mutex_lock(&inode->i_mutex);
1481 1488
1482 relock: 1489 relock:
1483 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 1490 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
1484 if (direct_io) { 1491 if (direct_io) {
1485 down_read(&inode->i_alloc_sem); 1492 down_read(&inode->i_alloc_sem);
1486 have_alloc_sem = 1; 1493 have_alloc_sem = 1;
1487 } 1494 }
1488 1495
1489 /* concurrent O_DIRECT writes are allowed */ 1496 /* concurrent O_DIRECT writes are allowed */
1490 rw_level = !direct_io; 1497 rw_level = !direct_io;
1491 ret = ocfs2_rw_lock(inode, rw_level); 1498 ret = ocfs2_rw_lock(inode, rw_level);
1492 if (ret < 0) { 1499 if (ret < 0) {
1493 mlog_errno(ret); 1500 mlog_errno(ret);
1494 goto out_sems; 1501 goto out_sems;
1495 } 1502 }
1496 1503
1497 can_do_direct = direct_io; 1504 can_do_direct = direct_io;
1498 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, 1505 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
1499 iocb->ki_left, appending, 1506 iocb->ki_left, appending,
1500 &can_do_direct); 1507 &can_do_direct);
1501 if (ret < 0) { 1508 if (ret < 0) {
1502 mlog_errno(ret); 1509 mlog_errno(ret);
1503 goto out; 1510 goto out;
1504 } 1511 }
1505 1512
1506 /* 1513 /*
1507 * We can't complete the direct I/O as requested, fall back to 1514 * We can't complete the direct I/O as requested, fall back to
1508 * buffered I/O. 1515 * buffered I/O.
1509 */ 1516 */
1510 if (direct_io && !can_do_direct) { 1517 if (direct_io && !can_do_direct) {
1511 ocfs2_rw_unlock(inode, rw_level); 1518 ocfs2_rw_unlock(inode, rw_level);
1512 up_read(&inode->i_alloc_sem); 1519 up_read(&inode->i_alloc_sem);
1513 1520
1514 have_alloc_sem = 0; 1521 have_alloc_sem = 0;
1515 rw_level = -1; 1522 rw_level = -1;
1516 1523
1517 direct_io = 0; 1524 direct_io = 0;
1518 sync = 1; 1525 sync = 1;
1519 goto relock; 1526 goto relock;
1520 } 1527 }
1521 1528
1522 if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) 1529 if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode)))
1523 sync = 1; 1530 sync = 1;
1524 1531
1525 /* 1532 /*
1526 * XXX: Is it ok to execute these checks a second time? 1533 * XXX: Is it ok to execute these checks a second time?
1527 */ 1534 */
1528 ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); 1535 ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode));
1529 if (ret) 1536 if (ret)
1530 goto out; 1537 goto out;
1531 1538
1532 /* 1539 /*
1533 * Set pos so that sync_page_range_nolock() below understands 1540 * Set pos so that sync_page_range_nolock() below understands
1534 * where to start from. We might've moved it around via the 1541 * where to start from. We might've moved it around via the
1535 * calls above. The range we want to actually sync starts from 1542 * calls above. The range we want to actually sync starts from
1536 * *ppos here. 1543 * *ppos here.
1537 * 1544 *
1538 */ 1545 */
1539 pos = *ppos; 1546 pos = *ppos;
1540 1547
1541 /* communicate with ocfs2_dio_end_io */ 1548 /* communicate with ocfs2_dio_end_io */
1542 ocfs2_iocb_set_rw_locked(iocb, rw_level); 1549 ocfs2_iocb_set_rw_locked(iocb, rw_level);
1543 1550
1544 if (direct_io) { 1551 if (direct_io) {
1545 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, 1552 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
1546 ppos, count, ocount); 1553 ppos, count, ocount);
1547 if (written < 0) { 1554 if (written < 0) {
1548 ret = written; 1555 ret = written;
1549 goto out_dio; 1556 goto out_dio;
1550 } 1557 }
1551 } else { 1558 } else {
1552 written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs, 1559 written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs,
1553 count, written); 1560 count, written);
1554 if (written < 0) { 1561 if (written < 0) {
1555 ret = written; 1562 ret = written;
1556 if (ret != -EFAULT && ret != -ENOSPC) 1563 if (ret != -EFAULT && ret != -ENOSPC)
1557 mlog_errno(ret); 1564 mlog_errno(ret);
1558 goto out; 1565 goto out;
1559 } 1566 }
1560 } 1567 }
1561 1568
1562 out_dio: 1569 out_dio:
1563 /* buffered aio wouldn't have proper lock coverage today */ 1570 /* buffered aio wouldn't have proper lock coverage today */
1564 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 1571 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
1565 1572
1566 /* 1573 /*
1567 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io 1574 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io
1568 * function pointer which is called when o_direct io completes so that 1575 * function pointer which is called when o_direct io completes so that
1569 * it can unlock our rw lock. (it's the clustered equivalent of 1576 * it can unlock our rw lock. (it's the clustered equivalent of
1570 * i_alloc_sem; protects truncate from racing with pending ios). 1577 * i_alloc_sem; protects truncate from racing with pending ios).
1571 * Unfortunately there are error cases which call end_io and others 1578 * Unfortunately there are error cases which call end_io and others
1572 * that don't, so we don't have to unlock the rw_lock if either an 1579 * that don't, so we don't have to unlock the rw_lock if either an
1573 * async dio is going to do it in the future or an end_io after an 1580 * async dio is going to do it in the future or an end_io after an
1574 * error has already done it. 1581 * error has already done it.
1575 */ 1582 */
1576 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 1583 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
1577 rw_level = -1; 1584 rw_level = -1;
1578 have_alloc_sem = 0; 1585 have_alloc_sem = 0;
1579 } 1586 }
1580 1587
1581 out: 1588 out:
1582 if (rw_level != -1) 1589 if (rw_level != -1)
1583 ocfs2_rw_unlock(inode, rw_level); 1590 ocfs2_rw_unlock(inode, rw_level);
1584 1591
1585 out_sems: 1592 out_sems:
1586 if (have_alloc_sem) 1593 if (have_alloc_sem)
1587 up_read(&inode->i_alloc_sem); 1594 up_read(&inode->i_alloc_sem);
1588 1595
1589 if (written > 0 && sync) { 1596 if (written > 0 && sync) {
1590 ssize_t err; 1597 ssize_t err;
1591 1598
1592 err = sync_page_range_nolock(inode, file->f_mapping, pos, count); 1599 err = sync_page_range_nolock(inode, file->f_mapping, pos, count);
1593 if (err < 0) 1600 if (err < 0)
1594 written = err; 1601 written = err;
1595 } 1602 }
1596 1603
1597 mutex_unlock(&inode->i_mutex); 1604 mutex_unlock(&inode->i_mutex);
1598 1605
1599 mlog_exit(ret); 1606 mlog_exit(ret);
1600 return written ? written : ret; 1607 return written ? written : ret;
1601 } 1608 }
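
For illustration only (not part of this patch): the write path above accepts O_DIRECT writes but may drop back to buffered I/O through the relock path when ocfs2_prepare_inode_for_write() clears can_do_direct. A minimal userspace sketch that drives this entry point with an O_DIRECT write follows; the 4096-byte alignment is an assumption about the device's logical block size, and the file name comes from argv.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define ALIGN_SIZE 4096

int main(int argc, char **argv)
{
	void *buf;
	int fd;
	ssize_t written;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}

	/* O_DIRECT is the flag the kernel path above tests via f_flags. */
	fd = open(argv[1], O_WRONLY | O_CREAT | O_DIRECT, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Direct I/O buffers generally must be block aligned (assumed 4096). */
	if (posix_memalign(&buf, ALIGN_SIZE, ALIGN_SIZE)) {
		perror("posix_memalign");
		return 1;
	}
	memset(buf, 'x', ALIGN_SIZE);

	written = write(fd, buf, ALIGN_SIZE);
	if (written < 0)
		perror("write");
	else
		printf("wrote %zd bytes\n", written);

	free(buf);
	close(fd);
	return 0;
}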
1602 1609
1603 static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe, 1610 static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
1604 struct pipe_buffer *buf, 1611 struct pipe_buffer *buf,
1605 struct splice_desc *sd) 1612 struct splice_desc *sd)
1606 { 1613 {
1607 int ret, count; 1614 int ret, count;
1608 ssize_t copied = 0; 1615 ssize_t copied = 0;
1609 struct file *file = sd->u.file; 1616 struct file *file = sd->u.file;
1610 unsigned int offset; 1617 unsigned int offset;
1611 struct page *page = NULL; 1618 struct page *page = NULL;
1612 void *fsdata; 1619 void *fsdata;
1613 char *src, *dst; 1620 char *src, *dst;
1614 1621
1615 ret = buf->ops->confirm(pipe, buf); 1622 ret = buf->ops->confirm(pipe, buf);
1616 if (ret) 1623 if (ret)
1617 goto out; 1624 goto out;
1618 1625
1619 offset = sd->pos & ~PAGE_CACHE_MASK; 1626 offset = sd->pos & ~PAGE_CACHE_MASK;
1620 count = sd->len; 1627 count = sd->len;
1621 if (count + offset > PAGE_CACHE_SIZE) 1628 if (count + offset > PAGE_CACHE_SIZE)
1622 count = PAGE_CACHE_SIZE - offset; 1629 count = PAGE_CACHE_SIZE - offset;
1623 1630
1624 ret = ocfs2_write_begin(file, file->f_mapping, sd->pos, count, 0, 1631 ret = ocfs2_write_begin(file, file->f_mapping, sd->pos, count, 0,
1625 &page, &fsdata); 1632 &page, &fsdata);
1626 if (ret) { 1633 if (ret) {
1627 mlog_errno(ret); 1634 mlog_errno(ret);
1628 goto out; 1635 goto out;
1629 } 1636 }
1630 1637
1631 src = buf->ops->map(pipe, buf, 1); 1638 src = buf->ops->map(pipe, buf, 1);
1632 dst = kmap_atomic(page, KM_USER1); 1639 dst = kmap_atomic(page, KM_USER1);
1633 memcpy(dst + offset, src + buf->offset, count); 1640 memcpy(dst + offset, src + buf->offset, count);
1634 kunmap_atomic(dst, KM_USER1); 1641 kunmap_atomic(dst, KM_USER1);
1635 buf->ops->unmap(pipe, buf, src); 1642 buf->ops->unmap(pipe, buf, src);
1636 1643
1637 copied = ocfs2_write_end(file, file->f_mapping, sd->pos, count, count, 1644 copied = ocfs2_write_end(file, file->f_mapping, sd->pos, count, count,
1638 page, fsdata); 1645 page, fsdata);
1639 if (copied < 0) { 1646 if (copied < 0) {
1640 mlog_errno(copied); 1647 mlog_errno(copied);
1641 ret = copied; 1648 ret = copied;
1642 goto out; 1649 goto out;
1643 } 1650 }
1644 out: 1651 out:
1645 1652
1646 return copied ? copied : ret; 1653 return copied ? copied : ret;
1647 } 1654 }
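
As a worked example of the clamping at the top of the actor above, here is a standalone sketch; the 4096-byte PAGE_CACHE_SIZE and the pos/len values are made-up illustration values, not anything the patch specifies.

#include <stdio.h>

#define PAGE_CACHE_SIZE 4096UL
#define PAGE_CACHE_MASK (~(PAGE_CACHE_SIZE - 1))

int main(void)
{
	unsigned long long pos = 10000;	/* sd->pos: a mid-page file position */
	unsigned long count = 4096;	/* sd->len: bytes in the pipe buffer */
	unsigned long offset = pos & ~PAGE_CACHE_MASK;

	/* One write_begin/write_end cycle never crosses a page boundary. */
	if (count + offset > PAGE_CACHE_SIZE)
		count = PAGE_CACHE_SIZE - offset;

	/* Prints: offset in page = 1808, clamped count = 2288 */
	printf("offset in page = %lu, clamped count = %lu\n", offset, count);
	return 0;
}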
1648 1655
1649 static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, 1656 static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1650 struct file *out, 1657 struct file *out,
1651 loff_t *ppos, 1658 loff_t *ppos,
1652 size_t len, 1659 size_t len,
1653 unsigned int flags) 1660 unsigned int flags)
1654 { 1661 {
1655 int ret, err; 1662 int ret, err;
1656 struct address_space *mapping = out->f_mapping; 1663 struct address_space *mapping = out->f_mapping;
1657 struct inode *inode = mapping->host; 1664 struct inode *inode = mapping->host;
1658 struct splice_desc sd = { 1665 struct splice_desc sd = {
1659 .total_len = len, 1666 .total_len = len,
1660 .flags = flags, 1667 .flags = flags,
1661 .pos = *ppos, 1668 .pos = *ppos,
1662 .u.file = out, 1669 .u.file = out,
1663 }; 1670 };
1664 1671
1665 ret = __splice_from_pipe(pipe, &sd, ocfs2_splice_write_actor); 1672 ret = __splice_from_pipe(pipe, &sd, ocfs2_splice_write_actor);
1666 if (ret > 0) { 1673 if (ret > 0) {
1667 *ppos += ret; 1674 *ppos += ret;
1668 1675
1669 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { 1676 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
1670 err = generic_osync_inode(inode, mapping, 1677 err = generic_osync_inode(inode, mapping,
1671 OSYNC_METADATA|OSYNC_DATA); 1678 OSYNC_METADATA|OSYNC_DATA);
1672 if (err) 1679 if (err)
1673 ret = err; 1680 ret = err;
1674 } 1681 }
1675 } 1682 }
1676 1683
1677 return ret; 1684 return ret;
1678 } 1685 }
1679 1686
1680 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, 1687 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1681 struct file *out, 1688 struct file *out,
1682 loff_t *ppos, 1689 loff_t *ppos,
1683 size_t len, 1690 size_t len,
1684 unsigned int flags) 1691 unsigned int flags)
1685 { 1692 {
1686 int ret; 1693 int ret;
1687 struct inode *inode = out->f_path.dentry->d_inode; 1694 struct inode *inode = out->f_path.dentry->d_inode;
1688 1695
1689 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, 1696 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
1690 (unsigned int)len, 1697 (unsigned int)len,
1691 out->f_path.dentry->d_name.len, 1698 out->f_path.dentry->d_name.len,
1692 out->f_path.dentry->d_name.name); 1699 out->f_path.dentry->d_name.name);
1693 1700
1694 inode_double_lock(inode, pipe->inode); 1701 inode_double_lock(inode, pipe->inode);
1695 1702
1696 ret = ocfs2_rw_lock(inode, 1); 1703 ret = ocfs2_rw_lock(inode, 1);
1697 if (ret < 0) { 1704 if (ret < 0) {
1698 mlog_errno(ret); 1705 mlog_errno(ret);
1699 goto out; 1706 goto out;
1700 } 1707 }
1701 1708
1702 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, 1709 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0,
1703 NULL); 1710 NULL);
1704 if (ret < 0) { 1711 if (ret < 0) {
1705 mlog_errno(ret); 1712 mlog_errno(ret);
1706 goto out_unlock; 1713 goto out_unlock;
1707 } 1714 }
1708 1715
1709 /* ok, we're done with i_size and alloc work */ 1716 /* ok, we're done with i_size and alloc work */
1710 ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags); 1717 ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags);
1711 1718
1712 out_unlock: 1719 out_unlock:
1713 ocfs2_rw_unlock(inode, 1); 1720 ocfs2_rw_unlock(inode, 1);
1714 out: 1721 out:
1715 inode_double_unlock(inode, pipe->inode); 1722 inode_double_unlock(inode, pipe->inode);
1716 1723
1717 mlog_exit(ret); 1724 mlog_exit(ret);
1718 return ret; 1725 return ret;
1719 } 1726 }
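
A runnable userspace sketch (not from the patch) that exercises the ->splice_write hook above by splicing from a pipe into a file; the file name and 14-byte payload are arbitrary illustration values.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int pipefd[2];
	int fd;
	ssize_t n;

	if (pipe(pipefd) < 0) {
		perror("pipe");
		return 1;
	}

	fd = open("spliced.dat", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Fill the pipe; write(2) is fine for a small payload. */
	if (write(pipefd[1], "hello, splice\n", 14) != 14) {
		perror("write");
		return 1;
	}

	/* Kernel-side, this lands in ocfs2_file_splice_write() on ocfs2. */
	n = splice(pipefd[0], NULL, fd, NULL, 14, 0);
	if (n < 0)
		perror("splice");
	else
		printf("spliced %zd bytes\n", n);

	close(fd);
	close(pipefd[0]);
	close(pipefd[1]);
	return 0;
}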
1720 1727
1721 static ssize_t ocfs2_file_splice_read(struct file *in, 1728 static ssize_t ocfs2_file_splice_read(struct file *in,
1722 loff_t *ppos, 1729 loff_t *ppos,
1723 struct pipe_inode_info *pipe, 1730 struct pipe_inode_info *pipe,
1724 size_t len, 1731 size_t len,
1725 unsigned int flags) 1732 unsigned int flags)
1726 { 1733 {
1727 int ret = 0; 1734 int ret = 0;
1728 struct inode *inode = in->f_path.dentry->d_inode; 1735 struct inode *inode = in->f_path.dentry->d_inode;
1729 1736
1730 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe, 1737 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
1731 (unsigned int)len, 1738 (unsigned int)len,
1732 in->f_path.dentry->d_name.len, 1739 in->f_path.dentry->d_name.len,
1733 in->f_path.dentry->d_name.name); 1740 in->f_path.dentry->d_name.name);
1734 1741
1735 /* 1742 /*
1736 * See the comment in ocfs2_file_aio_read() 1743 * See the comment in ocfs2_file_aio_read()
1737 */ 1744 */
1738 ret = ocfs2_meta_lock(inode, NULL, 0); 1745 ret = ocfs2_meta_lock(inode, NULL, 0);
1739 if (ret < 0) { 1746 if (ret < 0) {
1740 mlog_errno(ret); 1747 mlog_errno(ret);
1741 goto bail; 1748 goto bail;
1742 } 1749 }
1743 ocfs2_meta_unlock(inode, 0); 1750 ocfs2_meta_unlock(inode, 0);
1744 1751
1745 ret = generic_file_splice_read(in, ppos, pipe, len, flags); 1752 ret = generic_file_splice_read(in, ppos, pipe, len, flags);
1746 1753
1747 bail: 1754 bail:
1748 mlog_exit(ret); 1755 mlog_exit(ret);
1749 return ret; 1756 return ret;
1750 } 1757 }
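
The read side can be driven the same way. A userspace sketch (illustrative only, file name from argv): splice the file into a pipe, which on ocfs2 enters ->splice_read above after the take-and-drop meta lock refreshes i_size.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char buf[4096];
	int pipefd[2], fd;
	ssize_t n;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	if (pipe(pipefd) < 0) {
		perror("pipe");
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* file -> pipe; on ocfs2 this enters ocfs2_file_splice_read(). */
	n = splice(fd, NULL, pipefd[1], NULL, sizeof(buf), 0);
	if (n > 0) {
		/* Drain the pipe conventionally so stdout can be anything. */
		n = read(pipefd[0], buf, (size_t)n);
		if (n > 0)
			fwrite(buf, 1, (size_t)n, stdout);
	}

	close(fd);
	close(pipefd[0]);
	close(pipefd[1]);
	return n < 0;
}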
1751 1758
1752 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, 1759 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
1753 const struct iovec *iov, 1760 const struct iovec *iov,
1754 unsigned long nr_segs, 1761 unsigned long nr_segs,
1755 loff_t pos) 1762 loff_t pos)
1756 { 1763 {
1757 int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; 1764 int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
1758 struct file *filp = iocb->ki_filp; 1765 struct file *filp = iocb->ki_filp;
1759 struct inode *inode = filp->f_path.dentry->d_inode; 1766 struct inode *inode = filp->f_path.dentry->d_inode;
1760 1767
1761 mlog_entry("(0x%p, %u, '%.*s')\n", filp, 1768 mlog_entry("(0x%p, %u, '%.*s')\n", filp,
1762 (unsigned int)nr_segs, 1769 (unsigned int)nr_segs,
1763 filp->f_path.dentry->d_name.len, 1770 filp->f_path.dentry->d_name.len,
1764 filp->f_path.dentry->d_name.name); 1771 filp->f_path.dentry->d_name.name);
1765 1772
1766 if (!inode) { 1773 if (!inode) {
1767 ret = -EINVAL; 1774 ret = -EINVAL;
1768 mlog_errno(ret); 1775 mlog_errno(ret);
1769 goto bail; 1776 goto bail;
1770 } 1777 }
1771 1778
1772 /* 1779 /*
1773 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 1780 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
1774 * need locks to protect pending reads from racing with truncate. 1781 * need locks to protect pending reads from racing with truncate.
1775 */ 1782 */
1776 if (filp->f_flags & O_DIRECT) { 1783 if (filp->f_flags & O_DIRECT) {
1777 down_read(&inode->i_alloc_sem); 1784 down_read(&inode->i_alloc_sem);
1778 have_alloc_sem = 1; 1785 have_alloc_sem = 1;
1779 1786
1780 ret = ocfs2_rw_lock(inode, 0); 1787 ret = ocfs2_rw_lock(inode, 0);
1781 if (ret < 0) { 1788 if (ret < 0) {
1782 mlog_errno(ret); 1789 mlog_errno(ret);
1783 goto bail; 1790 goto bail;
1784 } 1791 }
1785 rw_level = 0; 1792 rw_level = 0;
1786 /* communicate with ocfs2_dio_end_io */ 1793 /* communicate with ocfs2_dio_end_io */
1787 ocfs2_iocb_set_rw_locked(iocb, rw_level); 1794 ocfs2_iocb_set_rw_locked(iocb, rw_level);
1788 } 1795 }
1789 1796
1790 /* 1797 /*
1791 * We're fine letting folks race truncates and extending 1798 * We're fine letting folks race truncates and extending
1792 * writes with reads across the cluster, just like they can 1799 * writes with reads across the cluster, just like they can
1793 * locally. Hence no rw_lock during read. 1800 * locally. Hence no rw_lock during read.
1794 * 1801 *
1795 * Take and drop the meta data lock to update inode fields 1802 * Take and drop the meta data lock to update inode fields
1796 * like i_size. This gives the checks down in 1803 * like i_size. This gives the checks down in
1797 * generic_file_aio_read() a chance of actually working. 1804 * generic_file_aio_read() a chance of actually working.
1798 */ 1805 */
1799 ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); 1806 ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level);
1800 if (ret < 0) { 1807 if (ret < 0) {
1801 mlog_errno(ret); 1808 mlog_errno(ret);
1802 goto bail; 1809 goto bail;
1803 } 1810 }
1804 ocfs2_meta_unlock(inode, lock_level); 1811 ocfs2_meta_unlock(inode, lock_level);
1805 1812
1806 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); 1813 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
1807 if (ret == -EINVAL) 1814 if (ret == -EINVAL)
1808 mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); 1815 mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
1809 1816
1810 /* buffered aio wouldn't have proper lock coverage today */ 1817 /* buffered aio wouldn't have proper lock coverage today */
1811 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 1818 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
1812 1819
1813 /* see ocfs2_file_aio_write */ 1820 /* see ocfs2_file_aio_write */
1814 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 1821 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
1815 rw_level = -1; 1822 rw_level = -1;
1816 have_alloc_sem = 0; 1823 have_alloc_sem = 0;
1817 } 1824 }
1818 1825
1819 bail: 1826 bail:
1820 if (have_alloc_sem) 1827 if (have_alloc_sem)
1821 up_read(&inode->i_alloc_sem); 1828 up_read(&inode->i_alloc_sem);
1822 if (rw_level != -1) 1829 if (rw_level != -1)
1823 ocfs2_rw_unlock(inode, rw_level); 1830 ocfs2_rw_unlock(inode, rw_level);
1824 mlog_exit(ret); 1831 mlog_exit(ret);
1825 1832
1826 return ret; 1833 return ret;
1827 } 1834 }
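
For context (not part of the patch): a vectored read reaches the filesystem through an iovec/nr_segs pair exactly like the one ocfs2_file_aio_read() above receives. A minimal userspace sketch, with buffer sizes chosen arbitrarily:

#include <stdio.h>
#include <sys/uio.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char hdr[16], body[64];
	struct iovec iov[2] = {
		{ .iov_base = hdr,  .iov_len = sizeof(hdr) },
		{ .iov_base = body, .iov_len = sizeof(body) },
	};
	int fd;
	ssize_t n;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Two segments -> nr_segs == 2 at the aio_read layer. */
	n = readv(fd, iov, 2);
	if (n < 0)
		perror("readv");
	else
		printf("read %zd bytes across 2 segments\n", n);

	close(fd);
	return 0;
}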
1828 1835
1829 const struct inode_operations ocfs2_file_iops = { 1836 const struct inode_operations ocfs2_file_iops = {
1830 .setattr = ocfs2_setattr, 1837 .setattr = ocfs2_setattr,
1831 .getattr = ocfs2_getattr, 1838 .getattr = ocfs2_getattr,
1832 .permission = ocfs2_permission, 1839 .permission = ocfs2_permission,
1833 }; 1840 };
1834 1841
1835 const struct inode_operations ocfs2_special_file_iops = { 1842 const struct inode_operations ocfs2_special_file_iops = {
1836 .setattr = ocfs2_setattr, 1843 .setattr = ocfs2_setattr,
1837 .getattr = ocfs2_getattr, 1844 .getattr = ocfs2_getattr,
1838 .permission = ocfs2_permission, 1845 .permission = ocfs2_permission,
1839 }; 1846 };
1840 1847
1841 const struct file_operations ocfs2_fops = { 1848 const struct file_operations ocfs2_fops = {
1842 .read = do_sync_read, 1849 .read = do_sync_read,
1843 .write = do_sync_write, 1850 .write = do_sync_write,
1844 .mmap = ocfs2_mmap, 1851 .mmap = ocfs2_mmap,
1845 .fsync = ocfs2_sync_file, 1852 .fsync = ocfs2_sync_file,
1846 .release = ocfs2_file_release, 1853 .release = ocfs2_file_release,
1847 .open = ocfs2_file_open, 1854 .open = ocfs2_file_open,
1848 .aio_read = ocfs2_file_aio_read, 1855 .aio_read = ocfs2_file_aio_read,
1849 .aio_write = ocfs2_file_aio_write, 1856 .aio_write = ocfs2_file_aio_write,
1850 .ioctl = ocfs2_ioctl, 1857 .ioctl = ocfs2_ioctl,
1851 #ifdef CONFIG_COMPAT 1858 #ifdef CONFIG_COMPAT
1852 .compat_ioctl = ocfs2_compat_ioctl, 1859 .compat_ioctl = ocfs2_compat_ioctl,
1853 #endif 1860 #endif
1854 .splice_read = ocfs2_file_splice_read, 1861 .splice_read = ocfs2_file_splice_read,
1855 .splice_write = ocfs2_file_splice_write, 1862 .splice_write = ocfs2_file_splice_write,
1856 }; 1863 };
1857 1864
1858 const struct file_operations ocfs2_dops = { 1865 const struct file_operations ocfs2_dops = {
1859 .read = generic_read_dir, 1866 .read = generic_read_dir,
1860 .readdir = ocfs2_readdir, 1867 .readdir = ocfs2_readdir,
1861 .fsync = ocfs2_sync_file, 1868 .fsync = ocfs2_sync_file,
1862 .ioctl = ocfs2_ioctl, 1869 .ioctl = ocfs2_ioctl,
1863 #ifdef CONFIG_COMPAT 1870 #ifdef CONFIG_COMPAT
1864 .compat_ioctl = ocfs2_compat_ioctl, 1871 .compat_ioctl = ocfs2_compat_ioctl,
1865 #endif 1872 #endif
1866 }; 1873 };
1867 1874
1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * mmap.c 4 * mmap.c
5 * 5 *
6 * Code to deal with the mess that is clustered mmap. 6 * Code to deal with the mess that is clustered mmap.
7 * 7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public 11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either 12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version. 13 * version 2 of the License, or (at your option) any later version.
14 * 14 *
15 * This program is distributed in the hope that it will be useful, 15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details. 18 * General Public License for more details.
19 * 19 *
20 * You should have received a copy of the GNU General Public 20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the 21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA. 23 * Boston, MA 021110-1307, USA.
24 */ 24 */
25 25
26 #include <linux/fs.h> 26 #include <linux/fs.h>
27 #include <linux/types.h> 27 #include <linux/types.h>
28 #include <linux/slab.h> 28 #include <linux/slab.h>
29 #include <linux/highmem.h> 29 #include <linux/highmem.h>
30 #include <linux/pagemap.h> 30 #include <linux/pagemap.h>
31 #include <linux/uio.h> 31 #include <linux/uio.h>
32 #include <linux/signal.h> 32 #include <linux/signal.h>
33 #include <linux/rbtree.h> 33 #include <linux/rbtree.h>
34 34
35 #define MLOG_MASK_PREFIX ML_FILE_IO 35 #define MLOG_MASK_PREFIX ML_FILE_IO
36 #include <cluster/masklog.h> 36 #include <cluster/masklog.h>
37 37
38 #include "ocfs2.h" 38 #include "ocfs2.h"
39 39
40 #include "aops.h"
40 #include "dlmglue.h" 41 #include "dlmglue.h"
41 #include "file.h" 42 #include "file.h"
42 #include "inode.h" 43 #include "inode.h"
43 #include "mmap.h" 44 #include "mmap.h"
44 45
46 static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
47 {
48 /* The best way to deal with signals in the vm path is
49 * to block them upfront, rather than allowing the
50 * locking paths to return -ERESTARTSYS. */
51 sigfillset(blocked);
52
53 /* We should technically never get a bad return value
54 * from sigprocmask */
55 return sigprocmask(SIG_BLOCK, blocked, oldset);
56 }
57
58 static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
59 {
60 return sigprocmask(SIG_SETMASK, oldset, NULL);
61 }
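
The helper pair above factors out the mask save/restore used by every ocfs2 vm operation. The same pattern as a runnable userspace analogue (the kernel's sigprocmask() differs in signature, but the SIG_BLOCK/SIG_SETMASK dance is the same):

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	sigset_t blocked, oldset;

	sigfillset(&blocked);
	if (sigprocmask(SIG_BLOCK, &blocked, &oldset) < 0) {
		perror("sigprocmask");
		return 1;
	}

	/* Critical section: delivery of blockable signals is deferred,
	 * so a slow operation here is not interrupted mid-way. */
	sleep(1);

	if (sigprocmask(SIG_SETMASK, &oldset, NULL) < 0) {
		perror("sigprocmask");
		return 1;
	}
	puts("signals restored");
	return 0;
}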
62
45 static struct page *ocfs2_nopage(struct vm_area_struct * area, 63 static struct page *ocfs2_nopage(struct vm_area_struct * area,
46 unsigned long address, 64 unsigned long address,
47 int *type) 65 int *type)
48 { 66 {
49 struct page *page = NOPAGE_SIGBUS; 67 struct page *page = NOPAGE_SIGBUS;
50 sigset_t blocked, oldset; 68 sigset_t blocked, oldset;
51 int ret; 69 int ret;
52 70
53 mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address, 71 mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address,
54 type); 72 type);
55 73
56 /* The best way to deal with signals in this path is 74 ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
57 * to block them upfront, rather than allowing the
58 * locking paths to return -ERESTARTSYS. */
59 sigfillset(&blocked);
60
61 /* We should technically never get a bad ret return
62 * from sigprocmask */
63 ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
64 if (ret < 0) { 75 if (ret < 0) {
65 mlog_errno(ret); 76 mlog_errno(ret);
66 goto out; 77 goto out;
67 } 78 }
68 79
69 page = filemap_nopage(area, address, type); 80 page = filemap_nopage(area, address, type);
70 81
71 ret = sigprocmask(SIG_SETMASK, &oldset, NULL); 82 ret = ocfs2_vm_op_unblock_sigs(&oldset);
72 if (ret < 0) 83 if (ret < 0)
73 mlog_errno(ret); 84 mlog_errno(ret);
74 out: 85 out:
75 mlog_exit_ptr(page); 86 mlog_exit_ptr(page);
76 return page; 87 return page;
77 } 88 }
78 89
79 static struct vm_operations_struct ocfs2_file_vm_ops = { 90 static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
80 .nopage = ocfs2_nopage, 91 struct page *page)
81 }; 92 {
93 int ret;
94 struct address_space *mapping = inode->i_mapping;
95 loff_t pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
96 unsigned int len = PAGE_CACHE_SIZE;
97 pgoff_t last_index;
98 struct page *locked_page = NULL;
99 void *fsdata;
100 loff_t size = i_size_read(inode);
82 101
83 int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) 102 /*
103 * Another node might have truncated while we were waiting on
104 * cluster locks.
105 */
106 last_index = size >> PAGE_CACHE_SHIFT;
107 if (page->index > last_index) {
108 ret = -EINVAL;
109 goto out;
110 }
111
112 /*
113 * The i_size check above doesn't catch the case where another
114 * node truncated and then re-extended the file. We'll re-check the
115 * page mapping after taking the page lock inside of
116 * ocfs2_write_begin_nolock().
117 */
118 if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
119 ret = -EINVAL;
120 goto out;
121 }
122
123 /*
124 * Call ocfs2_write_begin_nolock() and ocfs2_write_end_nolock() to
125 * take advantage of the allocation code there. We pass a write
126 * length of the whole page (chopped to i_size) to make sure
127 * the whole thing is allocated.
128 *
129 * Since we know the page is up to date, we don't have to
130 * worry about ocfs2_write_begin() skipping some buffer reads
131 * because the "write" would invalidate their data.
132 */
133 if (page->index == last_index)
134 len = size & ~PAGE_CACHE_MASK;
135
136 ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
137 &fsdata, di_bh, page);
138 if (ret) {
139 if (ret != -ENOSPC)
140 mlog_errno(ret);
141 goto out;
142 }
143
144 ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
145 fsdata);
146 if (ret < 0) {
147 mlog_errno(ret);
148 goto out;
149 }
150 BUG_ON(ret != len);
151 ret = 0;
152 out:
153 return ret;
154 }
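
A worked example of the EOF-page length chop above, as a standalone sketch; the 4096-byte page size and the i_size value are illustration assumptions.

#include <stdio.h>

#define PAGE_CACHE_SIZE  4096ULL
#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_MASK  (~(PAGE_CACHE_SIZE - 1))

int main(void)
{
	unsigned long long size = 10000;			/* i_size */
	unsigned long long last_index = size >> PAGE_CACHE_SHIFT; /* 2 */
	unsigned long long index = 2;				/* faulting page */
	unsigned long long len = PAGE_CACHE_SIZE;

	/* Interior pages get a full-page write; the EOF page is chopped. */
	if (index == last_index)
		len = size & ~PAGE_CACHE_MASK;	/* 10000 - 8192 = 1808 */

	printf("last_index=%llu, write length for page %llu = %llu\n",
	       last_index, index, len);
	return 0;
}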
155
156 static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
84 { 157 {
85 int ret = 0, lock_level = 0; 158 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
86 struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); 159 struct buffer_head *di_bh = NULL;
160 sigset_t blocked, oldset;
161 int ret, ret2;
87 162
163 ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
164 if (ret < 0) {
165 mlog_errno(ret);
166 return ret;
167 }
168
88 /* 169 /*
89 * Only support shared writeable mmap for local mounts which 170 * The cluster locks taken will block a truncate from another
90 * don't know about holes. 171 * node. Taking the data lock will also ensure that we don't
172 * attempt page truncation as part of a downconvert.
91 */ 173 */
92 if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) && 174 ret = ocfs2_meta_lock(inode, &di_bh, 1);
93 ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && 175 if (ret < 0) {
94 ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { 176 mlog_errno(ret);
95 mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); 177 goto out;
96 /* This is -EINVAL because generic_file_readonly_mmap
97 * returns it in a similar situation. */
98 return -EINVAL;
99 } 178 }
179
180 /*
181 * The alloc sem should be enough to serialize with
182 * ocfs2_truncate_file() changing i_size as well as any thread
183 * modifying the inode btree.
184 */
185 down_write(&OCFS2_I(inode)->ip_alloc_sem);
186
187 ret = ocfs2_data_lock(inode, 1);
188 if (ret < 0) {
189 mlog_errno(ret);
190 goto out_meta_unlock;
191 }
192
193 ret = __ocfs2_page_mkwrite(inode, di_bh, page);
194
195 ocfs2_data_unlock(inode, 1);
196
197 out_meta_unlock:
198 up_write(&OCFS2_I(inode)->ip_alloc_sem);
199
200 brelse(di_bh);
201 ocfs2_meta_unlock(inode, 1);
202
203 out:
204 ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
205 if (ret2 < 0)
206 mlog_errno(ret2);
207
208 return ret;
209 }
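
To keep the ordering above easy to audit against the i_mutex -> i_alloc_sem -> rw_lock comment earlier in the patch, a summary as read from the code (descriptive only, not normative documentation):

/*
 * ocfs2_page_mkwrite() lock ordering, as read from the code above:
 *
 *   ocfs2_meta_lock(inode, &di_bh, 1)        cluster metadata lock;
 *                                            blocks truncate on other nodes
 *     down_write(&OCFS2_I(inode)->ip_alloc_sem)
 *                                            serializes local i_size and
 *                                            btree changes
 *       ocfs2_data_lock(inode, 1)            cluster data lock; prevents a
 *                                            downconvert from truncating
 *                                            pages under us
 *         __ocfs2_page_mkwrite(inode, di_bh, page)
 *       ocfs2_data_unlock(inode, 1)
 *     up_write(&OCFS2_I(inode)->ip_alloc_sem)
 *   ocfs2_meta_unlock(inode, 1)
 */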
210
211 static struct vm_operations_struct ocfs2_file_vm_ops = {
212 .nopage = ocfs2_nopage,
213 .page_mkwrite = ocfs2_page_mkwrite,
214 };
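
Finally, a userspace sketch (not from the patch) of the shared writeable mapping this commit makes cluster-consistent; the first store into a clean page faults through ->page_mkwrite() above. The 4096-byte length is an assumption for the example.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd;
	char *map;
	size_t len = 4096;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file on ocfs2>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ftruncate(fd, len) < 0) {
		perror("ftruncate");
		return 1;
	}

	map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* First write to a clean page triggers ->page_mkwrite(). */
	strcpy(map, "written through a shared mapping\n");

	if (msync(map, len, MS_SYNC) < 0)
		perror("msync");

	munmap(map, len);
	close(fd);
	return 0;
}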
215
216 int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
217 {
218 int ret = 0, lock_level = 0;
100 219
101 ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode, 220 ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode,
102 file->f_vfsmnt, &lock_level); 221 file->f_vfsmnt, &lock_level);
103 if (ret < 0) { 222 if (ret < 0) {