Commit 4f902c37727bbedbc0508a1477874c58ddcc9af8

Authored by Mark Fasheh
1 parent 49cb8d2d49

ocfs2: Fix extent lookup to return true size of holes

Initially, we had wired things up to return a size of '1' for holes. Cook up
a small amount of code to find the next extent and calculate the number of
clusters between the virtual offset and the next allocated extent.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>

Showing 5 changed files with 109 additions and 12 deletions Inline Diff

1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 4 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public 7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either 8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version. 9 * version 2 of the License, or (at your option) any later version.
10 * 10 *
11 * This program is distributed in the hope that it will be useful, 11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details. 14 * General Public License for more details.
15 * 15 *
16 * You should have received a copy of the GNU General Public 16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the 17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA. 19 * Boston, MA 021110-1307, USA.
20 */ 20 */
21 21
22 #include <linux/fs.h> 22 #include <linux/fs.h>
23 #include <linux/slab.h> 23 #include <linux/slab.h>
24 #include <linux/highmem.h> 24 #include <linux/highmem.h>
25 #include <linux/pagemap.h> 25 #include <linux/pagemap.h>
26 #include <asm/byteorder.h> 26 #include <asm/byteorder.h>
27 #include <linux/swap.h> 27 #include <linux/swap.h>
28 #include <linux/pipe_fs_i.h> 28 #include <linux/pipe_fs_i.h>
29 29
30 #define MLOG_MASK_PREFIX ML_FILE_IO 30 #define MLOG_MASK_PREFIX ML_FILE_IO
31 #include <cluster/masklog.h> 31 #include <cluster/masklog.h>
32 32
33 #include "ocfs2.h" 33 #include "ocfs2.h"
34 34
35 #include "alloc.h" 35 #include "alloc.h"
36 #include "aops.h" 36 #include "aops.h"
37 #include "dlmglue.h" 37 #include "dlmglue.h"
38 #include "extent_map.h" 38 #include "extent_map.h"
39 #include "file.h" 39 #include "file.h"
40 #include "inode.h" 40 #include "inode.h"
41 #include "journal.h" 41 #include "journal.h"
42 #include "suballoc.h" 42 #include "suballoc.h"
43 #include "super.h" 43 #include "super.h"
44 #include "symlink.h" 44 #include "symlink.h"
45 45
46 #include "buffer_head_io.h" 46 #include "buffer_head_io.h"
47 47
48 static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, 48 static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
49 struct buffer_head *bh_result, int create) 49 struct buffer_head *bh_result, int create)
50 { 50 {
51 int err = -EIO; 51 int err = -EIO;
52 int status; 52 int status;
53 struct ocfs2_dinode *fe = NULL; 53 struct ocfs2_dinode *fe = NULL;
54 struct buffer_head *bh = NULL; 54 struct buffer_head *bh = NULL;
55 struct buffer_head *buffer_cache_bh = NULL; 55 struct buffer_head *buffer_cache_bh = NULL;
56 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 56 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
57 void *kaddr; 57 void *kaddr;
58 58
59 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, 59 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
60 (unsigned long long)iblock, bh_result, create); 60 (unsigned long long)iblock, bh_result, create);
61 61
62 BUG_ON(ocfs2_inode_is_fast_symlink(inode)); 62 BUG_ON(ocfs2_inode_is_fast_symlink(inode));
63 63
64 if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) { 64 if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
65 mlog(ML_ERROR, "block offset > PATH_MAX: %llu", 65 mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
66 (unsigned long long)iblock); 66 (unsigned long long)iblock);
67 goto bail; 67 goto bail;
68 } 68 }
69 69
70 status = ocfs2_read_block(OCFS2_SB(inode->i_sb), 70 status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
71 OCFS2_I(inode)->ip_blkno, 71 OCFS2_I(inode)->ip_blkno,
72 &bh, OCFS2_BH_CACHED, inode); 72 &bh, OCFS2_BH_CACHED, inode);
73 if (status < 0) { 73 if (status < 0) {
74 mlog_errno(status); 74 mlog_errno(status);
75 goto bail; 75 goto bail;
76 } 76 }
77 fe = (struct ocfs2_dinode *) bh->b_data; 77 fe = (struct ocfs2_dinode *) bh->b_data;
78 78
79 if (!OCFS2_IS_VALID_DINODE(fe)) { 79 if (!OCFS2_IS_VALID_DINODE(fe)) {
80 mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", 80 mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
81 (unsigned long long)fe->i_blkno, 7, fe->i_signature); 81 (unsigned long long)fe->i_blkno, 7, fe->i_signature);
82 goto bail; 82 goto bail;
83 } 83 }
84 84
85 if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, 85 if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
86 le32_to_cpu(fe->i_clusters))) { 86 le32_to_cpu(fe->i_clusters))) {
87 mlog(ML_ERROR, "block offset is outside the allocated size: " 87 mlog(ML_ERROR, "block offset is outside the allocated size: "
88 "%llu\n", (unsigned long long)iblock); 88 "%llu\n", (unsigned long long)iblock);
89 goto bail; 89 goto bail;
90 } 90 }
91 91
92 /* We don't use the page cache to create symlink data, so if 92 /* We don't use the page cache to create symlink data, so if
93 * need be, copy it over from the buffer cache. */ 93 * need be, copy it over from the buffer cache. */
94 if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) { 94 if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
95 u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + 95 u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
96 iblock; 96 iblock;
97 buffer_cache_bh = sb_getblk(osb->sb, blkno); 97 buffer_cache_bh = sb_getblk(osb->sb, blkno);
98 if (!buffer_cache_bh) { 98 if (!buffer_cache_bh) {
99 mlog(ML_ERROR, "couldn't getblock for symlink!\n"); 99 mlog(ML_ERROR, "couldn't getblock for symlink!\n");
100 goto bail; 100 goto bail;
101 } 101 }
102 102
103 /* we haven't locked out transactions, so a commit 103 /* we haven't locked out transactions, so a commit
104 * could've happened. Since we've got a reference on 104 * could've happened. Since we've got a reference on
105 * the bh, even if it commits while we're doing the 105 * the bh, even if it commits while we're doing the
106 * copy, the data is still good. */ 106 * copy, the data is still good. */
107 if (buffer_jbd(buffer_cache_bh) 107 if (buffer_jbd(buffer_cache_bh)
108 && ocfs2_inode_is_new(inode)) { 108 && ocfs2_inode_is_new(inode)) {
109 kaddr = kmap_atomic(bh_result->b_page, KM_USER0); 109 kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
110 if (!kaddr) { 110 if (!kaddr) {
111 mlog(ML_ERROR, "couldn't kmap!\n"); 111 mlog(ML_ERROR, "couldn't kmap!\n");
112 goto bail; 112 goto bail;
113 } 113 }
114 memcpy(kaddr + (bh_result->b_size * iblock), 114 memcpy(kaddr + (bh_result->b_size * iblock),
115 buffer_cache_bh->b_data, 115 buffer_cache_bh->b_data,
116 bh_result->b_size); 116 bh_result->b_size);
117 kunmap_atomic(kaddr, KM_USER0); 117 kunmap_atomic(kaddr, KM_USER0);
118 set_buffer_uptodate(bh_result); 118 set_buffer_uptodate(bh_result);
119 } 119 }
120 brelse(buffer_cache_bh); 120 brelse(buffer_cache_bh);
121 } 121 }
122 122
123 map_bh(bh_result, inode->i_sb, 123 map_bh(bh_result, inode->i_sb,
124 le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock); 124 le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
125 125
126 err = 0; 126 err = 0;
127 127
128 bail: 128 bail:
129 if (bh) 129 if (bh)
130 brelse(bh); 130 brelse(bh);
131 131
132 mlog_exit(err); 132 mlog_exit(err);
133 return err; 133 return err;
134 } 134 }
135 135
136 static int ocfs2_get_block(struct inode *inode, sector_t iblock, 136 static int ocfs2_get_block(struct inode *inode, sector_t iblock,
137 struct buffer_head *bh_result, int create) 137 struct buffer_head *bh_result, int create)
138 { 138 {
139 int err = 0; 139 int err = 0;
140 unsigned int ext_flags; 140 unsigned int ext_flags;
141 u64 p_blkno, past_eof; 141 u64 p_blkno, past_eof;
142 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 142 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
143 143
144 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, 144 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
145 (unsigned long long)iblock, bh_result, create); 145 (unsigned long long)iblock, bh_result, create);
146 146
147 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) 147 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
148 mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n", 148 mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
149 inode, inode->i_ino); 149 inode, inode->i_ino);
150 150
151 if (S_ISLNK(inode->i_mode)) { 151 if (S_ISLNK(inode->i_mode)) {
152 /* this always does I/O for some reason. */ 152 /* this always does I/O for some reason. */
153 err = ocfs2_symlink_get_block(inode, iblock, bh_result, create); 153 err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
154 goto bail; 154 goto bail;
155 } 155 }
156 156
157 err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL, 157 err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL,
158 &ext_flags); 158 &ext_flags);
159 if (err) { 159 if (err) {
160 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " 160 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
161 "%llu, NULL)\n", err, inode, (unsigned long long)iblock, 161 "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
162 (unsigned long long)p_blkno); 162 (unsigned long long)p_blkno);
163 goto bail; 163 goto bail;
164 } 164 }
165 165
166 /* 166 /*
167 * ocfs2 never allocates in this function - the only time we 167 * ocfs2 never allocates in this function - the only time we
168 * need to use BH_New is when we're extending i_size on a file 168 * need to use BH_New is when we're extending i_size on a file
169 * system which doesn't support holes, in which case BH_New 169 * system which doesn't support holes, in which case BH_New
170 * allows block_prepare_write() to zero. 170 * allows block_prepare_write() to zero.
171 */ 171 */
172 mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb), 172 mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb),
173 "ino %lu, iblock %llu\n", inode->i_ino, 173 "ino %lu, iblock %llu\n", inode->i_ino,
174 (unsigned long long)iblock); 174 (unsigned long long)iblock);
175 175
176 /* Treat the unwritten extent as a hole for zeroing purposes. */ 176 /* Treat the unwritten extent as a hole for zeroing purposes. */
177 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 177 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
178 map_bh(bh_result, inode->i_sb, p_blkno); 178 map_bh(bh_result, inode->i_sb, p_blkno);
179 179
180 if (!ocfs2_sparse_alloc(osb)) { 180 if (!ocfs2_sparse_alloc(osb)) {
181 if (p_blkno == 0) { 181 if (p_blkno == 0) {
182 err = -EIO; 182 err = -EIO;
183 mlog(ML_ERROR, 183 mlog(ML_ERROR,
184 "iblock = %llu p_blkno = %llu blkno=(%llu)\n", 184 "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
185 (unsigned long long)iblock, 185 (unsigned long long)iblock,
186 (unsigned long long)p_blkno, 186 (unsigned long long)p_blkno,
187 (unsigned long long)OCFS2_I(inode)->ip_blkno); 187 (unsigned long long)OCFS2_I(inode)->ip_blkno);
188 mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters); 188 mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
189 dump_stack(); 189 dump_stack();
190 } 190 }
191 191
192 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 192 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
193 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, 193 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
194 (unsigned long long)past_eof); 194 (unsigned long long)past_eof);
195 195
196 if (create && (iblock >= past_eof)) 196 if (create && (iblock >= past_eof))
197 set_buffer_new(bh_result); 197 set_buffer_new(bh_result);
198 } 198 }
199 199
200 bail: 200 bail:
201 if (err < 0) 201 if (err < 0)
202 err = -EIO; 202 err = -EIO;
203 203
204 mlog_exit(err); 204 mlog_exit(err);
205 return err; 205 return err;
206 } 206 }
207 207
208 static int ocfs2_readpage(struct file *file, struct page *page) 208 static int ocfs2_readpage(struct file *file, struct page *page)
209 { 209 {
210 struct inode *inode = page->mapping->host; 210 struct inode *inode = page->mapping->host;
211 loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT; 211 loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
212 int ret, unlock = 1; 212 int ret, unlock = 1;
213 213
214 mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0)); 214 mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
215 215
216 ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page); 216 ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
217 if (ret != 0) { 217 if (ret != 0) {
218 if (ret == AOP_TRUNCATED_PAGE) 218 if (ret == AOP_TRUNCATED_PAGE)
219 unlock = 0; 219 unlock = 0;
220 mlog_errno(ret); 220 mlog_errno(ret);
221 goto out; 221 goto out;
222 } 222 }
223 223
224 down_read(&OCFS2_I(inode)->ip_alloc_sem); 224 down_read(&OCFS2_I(inode)->ip_alloc_sem);
225 225
226 /* 226 /*
227 * i_size might have just been updated as we grabbed the meta lock. We 227 * i_size might have just been updated as we grabbed the meta lock. We
228 * might now be discovering a truncate that hit on another node. 228 * might now be discovering a truncate that hit on another node.
229 * block_read_full_page->get_block freaks out if it is asked to read 229 * block_read_full_page->get_block freaks out if it is asked to read
230 * beyond the end of a file, so we check here. Callers 230 * beyond the end of a file, so we check here. Callers
231 * (generic_file_read, fault->nopage) are clever enough to check i_size 231 * (generic_file_read, fault->nopage) are clever enough to check i_size
232 * and notice that the page they just read isn't needed. 232 * and notice that the page they just read isn't needed.
233 * 233 *
234 * XXX sys_readahead() seems to get that wrong? 234 * XXX sys_readahead() seems to get that wrong?
235 */ 235 */
236 if (start >= i_size_read(inode)) { 236 if (start >= i_size_read(inode)) {
237 char *addr = kmap(page); 237 char *addr = kmap(page);
238 memset(addr, 0, PAGE_SIZE); 238 memset(addr, 0, PAGE_SIZE);
239 flush_dcache_page(page); 239 flush_dcache_page(page);
240 kunmap(page); 240 kunmap(page);
241 SetPageUptodate(page); 241 SetPageUptodate(page);
242 ret = 0; 242 ret = 0;
243 goto out_alloc; 243 goto out_alloc;
244 } 244 }
245 245
246 ret = ocfs2_data_lock_with_page(inode, 0, page); 246 ret = ocfs2_data_lock_with_page(inode, 0, page);
247 if (ret != 0) { 247 if (ret != 0) {
248 if (ret == AOP_TRUNCATED_PAGE) 248 if (ret == AOP_TRUNCATED_PAGE)
249 unlock = 0; 249 unlock = 0;
250 mlog_errno(ret); 250 mlog_errno(ret);
251 goto out_alloc; 251 goto out_alloc;
252 } 252 }
253 253
254 ret = block_read_full_page(page, ocfs2_get_block); 254 ret = block_read_full_page(page, ocfs2_get_block);
255 unlock = 0; 255 unlock = 0;
256 256
257 ocfs2_data_unlock(inode, 0); 257 ocfs2_data_unlock(inode, 0);
258 out_alloc: 258 out_alloc:
259 up_read(&OCFS2_I(inode)->ip_alloc_sem); 259 up_read(&OCFS2_I(inode)->ip_alloc_sem);
260 ocfs2_meta_unlock(inode, 0); 260 ocfs2_meta_unlock(inode, 0);
261 out: 261 out:
262 if (unlock) 262 if (unlock)
263 unlock_page(page); 263 unlock_page(page);
264 mlog_exit(ret); 264 mlog_exit(ret);
265 return ret; 265 return ret;
266 } 266 }
267 267
268 /* Note: Because we don't support holes, our allocation has 268 /* Note: Because we don't support holes, our allocation has
269 * already happened (allocation writes zeros to the file data) 269 * already happened (allocation writes zeros to the file data)
270 * so we don't have to worry about ordered writes in 270 * so we don't have to worry about ordered writes in
271 * ocfs2_writepage. 271 * ocfs2_writepage.
272 * 272 *
273 * ->writepage is called during the process of invalidating the page cache 273 * ->writepage is called during the process of invalidating the page cache
274 * during blocked lock processing. It can't block on any cluster locks 274 * during blocked lock processing. It can't block on any cluster locks
275 * during block mapping. It's relying on the fact that the block 275 * during block mapping. It's relying on the fact that the block
276 * mapping can't have disappeared under the dirty pages that it is 276 * mapping can't have disappeared under the dirty pages that it is
277 * being asked to write back. 277 * being asked to write back.
278 */ 278 */
279 static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) 279 static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
280 { 280 {
281 int ret; 281 int ret;
282 282
283 mlog_entry("(0x%p)\n", page); 283 mlog_entry("(0x%p)\n", page);
284 284
285 ret = block_write_full_page(page, ocfs2_get_block, wbc); 285 ret = block_write_full_page(page, ocfs2_get_block, wbc);
286 286
287 mlog_exit(ret); 287 mlog_exit(ret);
288 288
289 return ret; 289 return ret;
290 } 290 }
291 291
292 /* 292 /*
293 * This is called from ocfs2_write_zero_page() which has handled its 293 * This is called from ocfs2_write_zero_page() which has handled its
294 * own cluster locking and has ensured allocation exists for those 294 * own cluster locking and has ensured allocation exists for those
295 * blocks to be written. 295 * blocks to be written.
296 */ 296 */
297 int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, 297 int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
298 unsigned from, unsigned to) 298 unsigned from, unsigned to)
299 { 299 {
300 int ret; 300 int ret;
301 301
302 down_read(&OCFS2_I(inode)->ip_alloc_sem); 302 down_read(&OCFS2_I(inode)->ip_alloc_sem);
303 303
304 ret = block_prepare_write(page, from, to, ocfs2_get_block); 304 ret = block_prepare_write(page, from, to, ocfs2_get_block);
305 305
306 up_read(&OCFS2_I(inode)->ip_alloc_sem); 306 up_read(&OCFS2_I(inode)->ip_alloc_sem);
307 307
308 return ret; 308 return ret;
309 } 309 }
310 310
311 /* Taken from ext3. We don't necessarily need the full blown 311 /* Taken from ext3. We don't necessarily need the full blown
312 * functionality yet, but IMHO it's better to cut and paste the whole 312 * functionality yet, but IMHO it's better to cut and paste the whole
313 * thing so we can avoid introducing our own bugs (and easily pick up 313 * thing so we can avoid introducing our own bugs (and easily pick up
314 * their fixes when they happen) --Mark */ 314 * their fixes when they happen) --Mark */
315 int walk_page_buffers( handle_t *handle, 315 int walk_page_buffers( handle_t *handle,
316 struct buffer_head *head, 316 struct buffer_head *head,
317 unsigned from, 317 unsigned from,
318 unsigned to, 318 unsigned to,
319 int *partial, 319 int *partial,
320 int (*fn)( handle_t *handle, 320 int (*fn)( handle_t *handle,
321 struct buffer_head *bh)) 321 struct buffer_head *bh))
322 { 322 {
323 struct buffer_head *bh; 323 struct buffer_head *bh;
324 unsigned block_start, block_end; 324 unsigned block_start, block_end;
325 unsigned blocksize = head->b_size; 325 unsigned blocksize = head->b_size;
326 int err, ret = 0; 326 int err, ret = 0;
327 struct buffer_head *next; 327 struct buffer_head *next;
328 328
329 for ( bh = head, block_start = 0; 329 for ( bh = head, block_start = 0;
330 ret == 0 && (bh != head || !block_start); 330 ret == 0 && (bh != head || !block_start);
331 block_start = block_end, bh = next) 331 block_start = block_end, bh = next)
332 { 332 {
333 next = bh->b_this_page; 333 next = bh->b_this_page;
334 block_end = block_start + blocksize; 334 block_end = block_start + blocksize;
335 if (block_end <= from || block_start >= to) { 335 if (block_end <= from || block_start >= to) {
336 if (partial && !buffer_uptodate(bh)) 336 if (partial && !buffer_uptodate(bh))
337 *partial = 1; 337 *partial = 1;
338 continue; 338 continue;
339 } 339 }
340 err = (*fn)(handle, bh); 340 err = (*fn)(handle, bh);
341 if (!ret) 341 if (!ret)
342 ret = err; 342 ret = err;
343 } 343 }
344 return ret; 344 return ret;
345 } 345 }
346 346
347 handle_t *ocfs2_start_walk_page_trans(struct inode *inode, 347 handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
348 struct page *page, 348 struct page *page,
349 unsigned from, 349 unsigned from,
350 unsigned to) 350 unsigned to)
351 { 351 {
352 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 352 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
353 handle_t *handle = NULL; 353 handle_t *handle = NULL;
354 int ret = 0; 354 int ret = 0;
355 355
356 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 356 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
357 if (!handle) { 357 if (!handle) {
358 ret = -ENOMEM; 358 ret = -ENOMEM;
359 mlog_errno(ret); 359 mlog_errno(ret);
360 goto out; 360 goto out;
361 } 361 }
362 362
363 if (ocfs2_should_order_data(inode)) { 363 if (ocfs2_should_order_data(inode)) {
364 ret = walk_page_buffers(handle, 364 ret = walk_page_buffers(handle,
365 page_buffers(page), 365 page_buffers(page),
366 from, to, NULL, 366 from, to, NULL,
367 ocfs2_journal_dirty_data); 367 ocfs2_journal_dirty_data);
368 if (ret < 0) 368 if (ret < 0)
369 mlog_errno(ret); 369 mlog_errno(ret);
370 } 370 }
371 out: 371 out:
372 if (ret) { 372 if (ret) {
373 if (handle) 373 if (handle)
374 ocfs2_commit_trans(osb, handle); 374 ocfs2_commit_trans(osb, handle);
375 handle = ERR_PTR(ret); 375 handle = ERR_PTR(ret);
376 } 376 }
377 return handle; 377 return handle;
378 } 378 }
379 379
380 static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) 380 static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
381 { 381 {
382 sector_t status; 382 sector_t status;
383 u64 p_blkno = 0; 383 u64 p_blkno = 0;
384 int err = 0; 384 int err = 0;
385 struct inode *inode = mapping->host; 385 struct inode *inode = mapping->host;
386 386
387 mlog_entry("(block = %llu)\n", (unsigned long long)block); 387 mlog_entry("(block = %llu)\n", (unsigned long long)block);
388 388
389 /* We don't need to lock journal system files, since they aren't 389 /* We don't need to lock journal system files, since they aren't
390 * accessed concurrently from multiple nodes. 390 * accessed concurrently from multiple nodes.
391 */ 391 */
392 if (!INODE_JOURNAL(inode)) { 392 if (!INODE_JOURNAL(inode)) {
393 err = ocfs2_meta_lock(inode, NULL, 0); 393 err = ocfs2_meta_lock(inode, NULL, 0);
394 if (err) { 394 if (err) {
395 if (err != -ENOENT) 395 if (err != -ENOENT)
396 mlog_errno(err); 396 mlog_errno(err);
397 goto bail; 397 goto bail;
398 } 398 }
399 down_read(&OCFS2_I(inode)->ip_alloc_sem); 399 down_read(&OCFS2_I(inode)->ip_alloc_sem);
400 } 400 }
401 401
402 err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL); 402 err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL);
403 403
404 if (!INODE_JOURNAL(inode)) { 404 if (!INODE_JOURNAL(inode)) {
405 up_read(&OCFS2_I(inode)->ip_alloc_sem); 405 up_read(&OCFS2_I(inode)->ip_alloc_sem);
406 ocfs2_meta_unlock(inode, 0); 406 ocfs2_meta_unlock(inode, 0);
407 } 407 }
408 408
409 if (err) { 409 if (err) {
410 mlog(ML_ERROR, "get_blocks() failed, block = %llu\n", 410 mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
411 (unsigned long long)block); 411 (unsigned long long)block);
412 mlog_errno(err); 412 mlog_errno(err);
413 goto bail; 413 goto bail;
414 } 414 }
415 415
416 416
417 bail: 417 bail:
418 status = err ? 0 : p_blkno; 418 status = err ? 0 : p_blkno;
419 419
420 mlog_exit((int)status); 420 mlog_exit((int)status);
421 421
422 return status; 422 return status;
423 } 423 }
424 424
425 /* 425 /*
426 * TODO: Make this into a generic get_blocks function. 426 * TODO: Make this into a generic get_blocks function.
427 * 427 *
428 * From do_direct_io in direct-io.c: 428 * From do_direct_io in direct-io.c:
429 * "So what we do is to permit the ->get_blocks function to populate 429 * "So what we do is to permit the ->get_blocks function to populate
430 * bh.b_size with the size of IO which is permitted at this offset and 430 * bh.b_size with the size of IO which is permitted at this offset and
431 * this i_blkbits." 431 * this i_blkbits."
432 * 432 *
433 * This function is called directly from get_more_blocks in direct-io.c. 433 * This function is called directly from get_more_blocks in direct-io.c.
434 * 434 *
435 * called like this: dio->get_blocks(dio->inode, fs_startblk, 435 * called like this: dio->get_blocks(dio->inode, fs_startblk,
436 * fs_count, map_bh, dio->rw == WRITE); 436 * fs_count, map_bh, dio->rw == WRITE);
437 */ 437 */
438 static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, 438 static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
439 struct buffer_head *bh_result, int create) 439 struct buffer_head *bh_result, int create)
440 { 440 {
441 int ret; 441 int ret;
442 u64 p_blkno, inode_blocks; 442 u64 p_blkno, inode_blocks, contig_blocks;
443 int contig_blocks;
444 unsigned int ext_flags; 443 unsigned int ext_flags;
445 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 444 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
446 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; 445 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
447 446
448 /* This function won't even be called if the request isn't all 447 /* This function won't even be called if the request isn't all
449 * nicely aligned and of the right size, so there's no need 448 * nicely aligned and of the right size, so there's no need
450 * for us to check any of that. */ 449 * for us to check any of that. */
451 450
452 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 451 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
453 452
454 /* 453 /*
455 * Any write past EOF is not allowed because we'd be extending. 454 * Any write past EOF is not allowed because we'd be extending.
456 */ 455 */
457 if (create && (iblock + max_blocks) > inode_blocks) { 456 if (create && (iblock + max_blocks) > inode_blocks) {
458 ret = -EIO; 457 ret = -EIO;
459 goto bail; 458 goto bail;
460 } 459 }
461 460
462 /* This figures out the size of the next contiguous block, and 461 /* This figures out the size of the next contiguous block, and
463 * our logical offset */ 462 * our logical offset */
464 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, 463 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
465 &contig_blocks, &ext_flags); 464 &contig_blocks, &ext_flags);
466 if (ret) { 465 if (ret) {
467 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", 466 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
468 (unsigned long long)iblock); 467 (unsigned long long)iblock);
469 ret = -EIO; 468 ret = -EIO;
470 goto bail; 469 goto bail;
471 } 470 }
472 471
473 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) { 472 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) {
474 ocfs2_error(inode->i_sb, 473 ocfs2_error(inode->i_sb,
475 "Inode %llu has a hole at block %llu\n", 474 "Inode %llu has a hole at block %llu\n",
476 (unsigned long long)OCFS2_I(inode)->ip_blkno, 475 (unsigned long long)OCFS2_I(inode)->ip_blkno,
477 (unsigned long long)iblock); 476 (unsigned long long)iblock);
478 ret = -EROFS; 477 ret = -EROFS;
479 goto bail; 478 goto bail;
480 } 479 }
481 480
482 /* 481 /*
483 * get_more_blocks() expects us to describe a hole by clearing 482 * get_more_blocks() expects us to describe a hole by clearing
484 * the mapped bit on bh_result(). 483 * the mapped bit on bh_result().
485 * 484 *
486 * Consider an unwritten extent as a hole. 485 * Consider an unwritten extent as a hole.
487 */ 486 */
488 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 487 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
489 map_bh(bh_result, inode->i_sb, p_blkno); 488 map_bh(bh_result, inode->i_sb, p_blkno);
490 else { 489 else {
491 /* 490 /*
492 * ocfs2_prepare_inode_for_write() should have caught 491 * ocfs2_prepare_inode_for_write() should have caught
493 * the case where we'd be filling a hole and triggered 492 * the case where we'd be filling a hole and triggered
494 * a buffered write instead. 493 * a buffered write instead.
495 */ 494 */
496 if (create) { 495 if (create) {
497 ret = -EIO; 496 ret = -EIO;
498 mlog_errno(ret); 497 mlog_errno(ret);
499 goto bail; 498 goto bail;
500 } 499 }
501 500
502 clear_buffer_mapped(bh_result); 501 clear_buffer_mapped(bh_result);
503 } 502 }
504 503
505 /* make sure we don't map more than max_blocks blocks here as 504 /* make sure we don't map more than max_blocks blocks here as
506 that's all the kernel will handle at this point. */ 505 that's all the kernel will handle at this point. */
507 if (max_blocks < contig_blocks) 506 if (max_blocks < contig_blocks)
508 contig_blocks = max_blocks; 507 contig_blocks = max_blocks;
509 bh_result->b_size = contig_blocks << blocksize_bits; 508 bh_result->b_size = contig_blocks << blocksize_bits;
510 bail: 509 bail:
511 return ret; 510 return ret;
512 } 511 }
513 512
514 /* 513 /*
515 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're 514 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
516 * particularly interested in the aio/dio case. Like the core uses 515 * particularly interested in the aio/dio case. Like the core uses
517 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from 516 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
518 * truncation on another. 517 * truncation on another.
519 */ 518 */
520 static void ocfs2_dio_end_io(struct kiocb *iocb, 519 static void ocfs2_dio_end_io(struct kiocb *iocb,
521 loff_t offset, 520 loff_t offset,
522 ssize_t bytes, 521 ssize_t bytes,
523 void *private) 522 void *private)
524 { 523 {
525 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 524 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
526 525
527 /* this io's submitter should not have unlocked this before we could */ 526 /* this io's submitter should not have unlocked this before we could */
528 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 527 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
529 ocfs2_iocb_clear_rw_locked(iocb); 528 ocfs2_iocb_clear_rw_locked(iocb);
530 up_read(&inode->i_alloc_sem); 529 up_read(&inode->i_alloc_sem);
531 ocfs2_rw_unlock(inode, 0); 530 ocfs2_rw_unlock(inode, 0);
532 } 531 }
533 532
534 /* 533 /*
535 * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen 534 * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen
536 * from ext3. PageChecked() bits have been removed as OCFS2 does not 535 * from ext3. PageChecked() bits have been removed as OCFS2 does not
537 * do journalled data. 536 * do journalled data.
538 */ 537 */
539 static void ocfs2_invalidatepage(struct page *page, unsigned long offset) 538 static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
540 { 539 {
541 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; 540 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
542 541
543 journal_invalidatepage(journal, page, offset); 542 journal_invalidatepage(journal, page, offset);
544 } 543 }
545 544
546 static int ocfs2_releasepage(struct page *page, gfp_t wait) 545 static int ocfs2_releasepage(struct page *page, gfp_t wait)
547 { 546 {
548 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; 547 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
549 548
550 if (!page_has_buffers(page)) 549 if (!page_has_buffers(page))
551 return 0; 550 return 0;
552 return journal_try_to_free_buffers(journal, page, wait); 551 return journal_try_to_free_buffers(journal, page, wait);
553 } 552 }
554 553
555 static ssize_t ocfs2_direct_IO(int rw, 554 static ssize_t ocfs2_direct_IO(int rw,
556 struct kiocb *iocb, 555 struct kiocb *iocb,
557 const struct iovec *iov, 556 const struct iovec *iov,
558 loff_t offset, 557 loff_t offset,
559 unsigned long nr_segs) 558 unsigned long nr_segs)
560 { 559 {
561 struct file *file = iocb->ki_filp; 560 struct file *file = iocb->ki_filp;
562 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; 561 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
563 int ret; 562 int ret;
564 563
565 mlog_entry_void(); 564 mlog_entry_void();
566 565
567 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { 566 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
568 /* 567 /*
569 * We get PR data locks even for O_DIRECT. This 568 * We get PR data locks even for O_DIRECT. This
570 * allows concurrent O_DIRECT I/O but doesn't let 569 * allows concurrent O_DIRECT I/O but doesn't let
571 * O_DIRECT with extending and buffered zeroing writes 570 * O_DIRECT with extending and buffered zeroing writes
572 * race. If they did race then the buffered zeroing 571 * race. If they did race then the buffered zeroing
573 * could be written back after the O_DIRECT I/O. It's 572 * could be written back after the O_DIRECT I/O. It's
574 * one thing to tell people not to mix buffered and 573 * one thing to tell people not to mix buffered and
575 * O_DIRECT writes, but expecting them to understand 574 * O_DIRECT writes, but expecting them to understand
576 * that file extension is also an implicit buffered 575 * that file extension is also an implicit buffered
577 * write is too much. By getting the PR we force 576 * write is too much. By getting the PR we force
578 * writeback of the buffered zeroing before 577 * writeback of the buffered zeroing before
579 * proceeding. 578 * proceeding.
580 */ 579 */
581 ret = ocfs2_data_lock(inode, 0); 580 ret = ocfs2_data_lock(inode, 0);
582 if (ret < 0) { 581 if (ret < 0) {
583 mlog_errno(ret); 582 mlog_errno(ret);
584 goto out; 583 goto out;
585 } 584 }
586 ocfs2_data_unlock(inode, 0); 585 ocfs2_data_unlock(inode, 0);
587 } 586 }
588 587
589 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 588 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
590 inode->i_sb->s_bdev, iov, offset, 589 inode->i_sb->s_bdev, iov, offset,
591 nr_segs, 590 nr_segs,
592 ocfs2_direct_IO_get_blocks, 591 ocfs2_direct_IO_get_blocks,
593 ocfs2_dio_end_io); 592 ocfs2_dio_end_io);
594 out: 593 out:
595 mlog_exit(ret); 594 mlog_exit(ret);
596 return ret; 595 return ret;
597 } 596 }
598 597
599 static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, 598 static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
600 u32 cpos, 599 u32 cpos,
601 unsigned int *start, 600 unsigned int *start,
602 unsigned int *end) 601 unsigned int *end)
603 { 602 {
604 unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE; 603 unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
605 604
606 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) { 605 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
607 unsigned int cpp; 606 unsigned int cpp;
608 607
609 cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits); 608 cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
610 609
611 cluster_start = cpos % cpp; 610 cluster_start = cpos % cpp;
612 cluster_start = cluster_start << osb->s_clustersize_bits; 611 cluster_start = cluster_start << osb->s_clustersize_bits;
613 612
614 cluster_end = cluster_start + osb->s_clustersize; 613 cluster_end = cluster_start + osb->s_clustersize;
615 } 614 }
616 615
617 BUG_ON(cluster_start > PAGE_SIZE); 616 BUG_ON(cluster_start > PAGE_SIZE);
618 BUG_ON(cluster_end > PAGE_SIZE); 617 BUG_ON(cluster_end > PAGE_SIZE);
619 618
620 if (start) 619 if (start)
621 *start = cluster_start; 620 *start = cluster_start;
622 if (end) 621 if (end)
623 *end = cluster_end; 622 *end = cluster_end;
624 } 623 }
625 624
626 /* 625 /*
627 * 'from' and 'to' are the region in the page to avoid zeroing. 626 * 'from' and 'to' are the region in the page to avoid zeroing.
628 * 627 *
629 * If pagesize > clustersize, this function will avoid zeroing outside 628 * If pagesize > clustersize, this function will avoid zeroing outside
630 * of the cluster boundary. 629 * of the cluster boundary.
631 * 630 *
632 * from == to == 0 is code for "zero the entire cluster region" 631 * from == to == 0 is code for "zero the entire cluster region"
633 */ 632 */
634 static void ocfs2_clear_page_regions(struct page *page, 633 static void ocfs2_clear_page_regions(struct page *page,
635 struct ocfs2_super *osb, u32 cpos, 634 struct ocfs2_super *osb, u32 cpos,
636 unsigned from, unsigned to) 635 unsigned from, unsigned to)
637 { 636 {
638 void *kaddr; 637 void *kaddr;
639 unsigned int cluster_start, cluster_end; 638 unsigned int cluster_start, cluster_end;
640 639
641 ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end); 640 ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
642 641
643 kaddr = kmap_atomic(page, KM_USER0); 642 kaddr = kmap_atomic(page, KM_USER0);
644 643
645 if (from || to) { 644 if (from || to) {
646 if (from > cluster_start) 645 if (from > cluster_start)
647 memset(kaddr + cluster_start, 0, from - cluster_start); 646 memset(kaddr + cluster_start, 0, from - cluster_start);
648 if (to < cluster_end) 647 if (to < cluster_end)
649 memset(kaddr + to, 0, cluster_end - to); 648 memset(kaddr + to, 0, cluster_end - to);
650 } else { 649 } else {
651 memset(kaddr + cluster_start, 0, cluster_end - cluster_start); 650 memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
652 } 651 }
653 652
654 kunmap_atomic(kaddr, KM_USER0); 653 kunmap_atomic(kaddr, KM_USER0);
655 } 654 }
656 655
657 /* 656 /*
658 * Some of this taken from block_prepare_write(). We already have our 657 * Some of this taken from block_prepare_write(). We already have our
659 * mapping by now though, and the entire write will be allocating or 658 * mapping by now though, and the entire write will be allocating or
660 * it won't, so not much need to use BH_New. 659 * it won't, so not much need to use BH_New.
661 * 660 *
662 * This will also skip zeroing, which is handled externally. 661 * This will also skip zeroing, which is handled externally.
663 */ 662 */
664 int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, 663 int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
665 struct inode *inode, unsigned int from, 664 struct inode *inode, unsigned int from,
666 unsigned int to, int new) 665 unsigned int to, int new)
667 { 666 {
668 int ret = 0; 667 int ret = 0;
669 struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; 668 struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
670 unsigned int block_end, block_start; 669 unsigned int block_end, block_start;
671 unsigned int bsize = 1 << inode->i_blkbits; 670 unsigned int bsize = 1 << inode->i_blkbits;
672 671
673 if (!page_has_buffers(page)) 672 if (!page_has_buffers(page))
674 create_empty_buffers(page, bsize, 0); 673 create_empty_buffers(page, bsize, 0);
675 674
676 head = page_buffers(page); 675 head = page_buffers(page);
677 for (bh = head, block_start = 0; bh != head || !block_start; 676 for (bh = head, block_start = 0; bh != head || !block_start;
678 bh = bh->b_this_page, block_start += bsize) { 677 bh = bh->b_this_page, block_start += bsize) {
679 block_end = block_start + bsize; 678 block_end = block_start + bsize;
680 679
681 /* 680 /*
682 * Ignore blocks outside of our i/o range - 681 * Ignore blocks outside of our i/o range -
683 * they may belong to unallocated clusters. 682 * they may belong to unallocated clusters.
684 */ 683 */
685 if (block_start >= to || block_end <= from) { 684 if (block_start >= to || block_end <= from) {
686 if (PageUptodate(page)) 685 if (PageUptodate(page))
687 set_buffer_uptodate(bh); 686 set_buffer_uptodate(bh);
688 continue; 687 continue;
689 } 688 }
690 689
691 /* 690 /*
692 * For an allocating write with cluster size >= page 691 * For an allocating write with cluster size >= page
693 * size, we always write the entire page. 692 * size, we always write the entire page.
694 */ 693 */
695 694
696 if (buffer_new(bh)) 695 if (buffer_new(bh))
697 clear_buffer_new(bh); 696 clear_buffer_new(bh);
698 697
699 if (!buffer_mapped(bh)) { 698 if (!buffer_mapped(bh)) {
700 map_bh(bh, inode->i_sb, *p_blkno); 699 map_bh(bh, inode->i_sb, *p_blkno);
701 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); 700 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
702 } 701 }
703 702
704 if (PageUptodate(page)) { 703 if (PageUptodate(page)) {
705 if (!buffer_uptodate(bh)) 704 if (!buffer_uptodate(bh))
706 set_buffer_uptodate(bh); 705 set_buffer_uptodate(bh);
707 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && 706 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
708 (block_start < from || block_end > to)) { 707 (block_start < from || block_end > to)) {
709 ll_rw_block(READ, 1, &bh); 708 ll_rw_block(READ, 1, &bh);
710 *wait_bh++=bh; 709 *wait_bh++=bh;
711 } 710 }
712 711
713 *p_blkno = *p_blkno + 1; 712 *p_blkno = *p_blkno + 1;
714 } 713 }
715 714
716 /* 715 /*
717 * If we issued read requests - let them complete. 716 * If we issued read requests - let them complete.
718 */ 717 */
719 while(wait_bh > wait) { 718 while(wait_bh > wait) {
720 wait_on_buffer(*--wait_bh); 719 wait_on_buffer(*--wait_bh);
721 if (!buffer_uptodate(*wait_bh)) 720 if (!buffer_uptodate(*wait_bh))
722 ret = -EIO; 721 ret = -EIO;
723 } 722 }
724 723
725 if (ret == 0 || !new) 724 if (ret == 0 || !new)
726 return ret; 725 return ret;
727 726
728 /* 727 /*
729 * If we get -EIO above, zero out any newly allocated blocks 728 * If we get -EIO above, zero out any newly allocated blocks
730 * to avoid exposing stale data. 729 * to avoid exposing stale data.
731 */ 730 */
732 bh = head; 731 bh = head;
733 block_start = 0; 732 block_start = 0;
734 do { 733 do {
735 void *kaddr; 734 void *kaddr;
736 735
737 block_end = block_start + bsize; 736 block_end = block_start + bsize;
738 if (block_end <= from) 737 if (block_end <= from)
739 goto next_bh; 738 goto next_bh;
740 if (block_start >= to) 739 if (block_start >= to)
741 break; 740 break;
742 741
743 kaddr = kmap_atomic(page, KM_USER0); 742 kaddr = kmap_atomic(page, KM_USER0);
744 memset(kaddr+block_start, 0, bh->b_size); 743 memset(kaddr+block_start, 0, bh->b_size);
745 flush_dcache_page(page); 744 flush_dcache_page(page);
746 kunmap_atomic(kaddr, KM_USER0); 745 kunmap_atomic(kaddr, KM_USER0);
747 set_buffer_uptodate(bh); 746 set_buffer_uptodate(bh);
748 mark_buffer_dirty(bh); 747 mark_buffer_dirty(bh);
749 748
750 next_bh: 749 next_bh:
751 block_start = block_end; 750 block_start = block_end;
752 bh = bh->b_this_page; 751 bh = bh->b_this_page;
753 } while (bh != head); 752 } while (bh != head);
754 753
755 return ret; 754 return ret;
756 } 755 }
757 756
758 /* 757 /*
759 * This will copy user data from the buffer page in the splice 758 * This will copy user data from the buffer page in the splice
760 * context. 759 * context.
761 * 760 *
762 * For now, we ignore SPLICE_F_MOVE as that would require some extra 761 * For now, we ignore SPLICE_F_MOVE as that would require some extra
763 * communication out all the way to ocfs2_write(). 762 * communication out all the way to ocfs2_write().
764 */ 763 */
765 int ocfs2_map_and_write_splice_data(struct inode *inode, 764 int ocfs2_map_and_write_splice_data(struct inode *inode,
766 struct ocfs2_write_ctxt *wc, u64 *p_blkno, 765 struct ocfs2_write_ctxt *wc, u64 *p_blkno,
767 unsigned int *ret_from, unsigned int *ret_to) 766 unsigned int *ret_from, unsigned int *ret_to)
768 { 767 {
769 int ret; 768 int ret;
770 unsigned int to, from, cluster_start, cluster_end; 769 unsigned int to, from, cluster_start, cluster_end;
771 char *src, *dst; 770 char *src, *dst;
772 struct ocfs2_splice_write_priv *sp = wc->w_private; 771 struct ocfs2_splice_write_priv *sp = wc->w_private;
773 struct pipe_buffer *buf = sp->s_buf; 772 struct pipe_buffer *buf = sp->s_buf;
774 unsigned long bytes, src_from; 773 unsigned long bytes, src_from;
775 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 774 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
776 775
777 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, 776 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
778 &cluster_end); 777 &cluster_end);
779 778
780 from = sp->s_offset; 779 from = sp->s_offset;
781 src_from = sp->s_buf_offset; 780 src_from = sp->s_buf_offset;
782 bytes = wc->w_count; 781 bytes = wc->w_count;
783 782
784 if (wc->w_large_pages) { 783 if (wc->w_large_pages) {
785 /* 784 /*
786 * For cluster size < page size, we have to 785 * For cluster size < page size, we have to
787 * calculate pos within the cluster and obey 786 * calculate pos within the cluster and obey
788 * the rightmost boundary. 787 * the rightmost boundary.
789 */ 788 */
790 bytes = min(bytes, (unsigned long)(osb->s_clustersize 789 bytes = min(bytes, (unsigned long)(osb->s_clustersize
791 - (wc->w_pos & (osb->s_clustersize - 1)))); 790 - (wc->w_pos & (osb->s_clustersize - 1))));
792 } 791 }
793 to = from + bytes; 792 to = from + bytes;
794 793
795 if (wc->w_this_page_new) 794 if (wc->w_this_page_new)
796 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 795 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
797 cluster_start, cluster_end, 1); 796 cluster_start, cluster_end, 1);
798 else 797 else
799 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 798 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
800 from, to, 0); 799 from, to, 0);
801 if (ret) { 800 if (ret) {
802 mlog_errno(ret); 801 mlog_errno(ret);
803 goto out; 802 goto out;
804 } 803 }
805 804
806 BUG_ON(from > PAGE_CACHE_SIZE); 805 BUG_ON(from > PAGE_CACHE_SIZE);
807 BUG_ON(to > PAGE_CACHE_SIZE); 806 BUG_ON(to > PAGE_CACHE_SIZE);
808 BUG_ON(from > osb->s_clustersize); 807 BUG_ON(from > osb->s_clustersize);
809 BUG_ON(to > osb->s_clustersize); 808 BUG_ON(to > osb->s_clustersize);
810 809
811 src = buf->ops->map(sp->s_pipe, buf, 1); 810 src = buf->ops->map(sp->s_pipe, buf, 1);
812 dst = kmap_atomic(wc->w_this_page, KM_USER1); 811 dst = kmap_atomic(wc->w_this_page, KM_USER1);
813 memcpy(dst + from, src + src_from, bytes); 812 memcpy(dst + from, src + src_from, bytes);
814 kunmap_atomic(wc->w_this_page, KM_USER1); 813 kunmap_atomic(wc->w_this_page, KM_USER1);
815 buf->ops->unmap(sp->s_pipe, buf, src); 814 buf->ops->unmap(sp->s_pipe, buf, src);
816 815
817 wc->w_finished_copy = 1; 816 wc->w_finished_copy = 1;
818 817
819 *ret_from = from; 818 *ret_from = from;
820 *ret_to = to; 819 *ret_to = to;
821 out: 820 out:
822 821
823 return bytes ? (unsigned int)bytes : ret; 822 return bytes ? (unsigned int)bytes : ret;
824 } 823 }
825 824
826 /* 825 /*
827 * This will copy user data from the iovec in the buffered write 826 * This will copy user data from the iovec in the buffered write
828 * context. 827 * context.
829 */ 828 */
830 int ocfs2_map_and_write_user_data(struct inode *inode, 829 int ocfs2_map_and_write_user_data(struct inode *inode,
831 struct ocfs2_write_ctxt *wc, u64 *p_blkno, 830 struct ocfs2_write_ctxt *wc, u64 *p_blkno,
832 unsigned int *ret_from, unsigned int *ret_to) 831 unsigned int *ret_from, unsigned int *ret_to)
833 { 832 {
834 int ret; 833 int ret;
835 unsigned int to, from, cluster_start, cluster_end; 834 unsigned int to, from, cluster_start, cluster_end;
836 unsigned long bytes, src_from; 835 unsigned long bytes, src_from;
837 char *dst; 836 char *dst;
838 struct ocfs2_buffered_write_priv *bp = wc->w_private; 837 struct ocfs2_buffered_write_priv *bp = wc->w_private;
839 const struct iovec *cur_iov = bp->b_cur_iov; 838 const struct iovec *cur_iov = bp->b_cur_iov;
840 char __user *buf; 839 char __user *buf;
841 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 840 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
842 841
843 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, 842 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
844 &cluster_end); 843 &cluster_end);
845 844
846 buf = cur_iov->iov_base + bp->b_cur_off; 845 buf = cur_iov->iov_base + bp->b_cur_off;
847 src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; 846 src_from = (unsigned long)buf & ~PAGE_CACHE_MASK;
848 847
849 from = wc->w_pos & (PAGE_CACHE_SIZE - 1); 848 from = wc->w_pos & (PAGE_CACHE_SIZE - 1);
850 849
851 /* 850 /*
852 * This is a lot of comparisons, but it reads quite 851 * This is a lot of comparisons, but it reads quite
853 * easily, which is important here. 852 * easily, which is important here.
854 */ 853 */
855 /* Stay within the src page */ 854 /* Stay within the src page */
856 bytes = PAGE_SIZE - src_from; 855 bytes = PAGE_SIZE - src_from;
857 /* Stay within the vector */ 856 /* Stay within the vector */
858 bytes = min(bytes, 857 bytes = min(bytes,
859 (unsigned long)(cur_iov->iov_len - bp->b_cur_off)); 858 (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
860 /* Stay within count */ 859 /* Stay within count */
861 bytes = min(bytes, (unsigned long)wc->w_count); 860 bytes = min(bytes, (unsigned long)wc->w_count);
862 /* 861 /*
863 * For clustersize > page size, just stay within 862 * For clustersize > page size, just stay within
864 * target page, otherwise we have to calculate pos 863 * target page, otherwise we have to calculate pos
865 * within the cluster and obey the rightmost 864 * within the cluster and obey the rightmost
866 * boundary. 865 * boundary.
867 */ 866 */
868 if (wc->w_large_pages) { 867 if (wc->w_large_pages) {
869 /* 868 /*
870 * For cluster size < page size, we have to 869 * For cluster size < page size, we have to
871 * calculate pos within the cluster and obey 870 * calculate pos within the cluster and obey
872 * the rightmost boundary. 871 * the rightmost boundary.
873 */ 872 */
874 bytes = min(bytes, (unsigned long)(osb->s_clustersize 873 bytes = min(bytes, (unsigned long)(osb->s_clustersize
875 - (wc->w_pos & (osb->s_clustersize - 1)))); 874 - (wc->w_pos & (osb->s_clustersize - 1))));
876 } else { 875 } else {
877 /* 876 /*
878 * cluster size > page size is the most common 877 * cluster size > page size is the most common
879 * case - we just stay within the target page 878 * case - we just stay within the target page
880 * boundary. 879 * boundary.
881 */ 880 */
882 bytes = min(bytes, PAGE_CACHE_SIZE - from); 881 bytes = min(bytes, PAGE_CACHE_SIZE - from);
883 } 882 }
884 883
885 to = from + bytes; 884 to = from + bytes;
886 885
887 if (wc->w_this_page_new) 886 if (wc->w_this_page_new)
888 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 887 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
889 cluster_start, cluster_end, 1); 888 cluster_start, cluster_end, 1);
890 else 889 else
891 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 890 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
892 from, to, 0); 891 from, to, 0);
893 if (ret) { 892 if (ret) {
894 mlog_errno(ret); 893 mlog_errno(ret);
895 goto out; 894 goto out;
896 } 895 }
897 896
898 BUG_ON(from > PAGE_CACHE_SIZE); 897 BUG_ON(from > PAGE_CACHE_SIZE);
899 BUG_ON(to > PAGE_CACHE_SIZE); 898 BUG_ON(to > PAGE_CACHE_SIZE);
900 BUG_ON(from > osb->s_clustersize); 899 BUG_ON(from > osb->s_clustersize);
901 BUG_ON(to > osb->s_clustersize); 900 BUG_ON(to > osb->s_clustersize);
902 901
903 dst = kmap(wc->w_this_page); 902 dst = kmap(wc->w_this_page);
904 memcpy(dst + from, bp->b_src_buf + src_from, bytes); 903 memcpy(dst + from, bp->b_src_buf + src_from, bytes);
905 kunmap(wc->w_this_page); 904 kunmap(wc->w_this_page);
906 905
907 /* 906 /*
908 * XXX: This is slow, but simple. The caller of 907 * XXX: This is slow, but simple. The caller of
909 * ocfs2_buffered_write_cluster() is responsible for 908 * ocfs2_buffered_write_cluster() is responsible for
910 * passing through the iovecs, so it's difficult to 909 * passing through the iovecs, so it's difficult to
911 * predict what our next step is in here after our 910 * predict what our next step is in here after our
912 * initial write. A future version should be pushing 911 * initial write. A future version should be pushing
913 * that iovec manipulation further down. 912 * that iovec manipulation further down.
914 * 913 *
915 * By setting this, we indicate that a copy from user 914 * By setting this, we indicate that a copy from user
916 * data was done, and subsequent calls for this 915 * data was done, and subsequent calls for this
917 * cluster will skip copying more data. 916 * cluster will skip copying more data.
918 */ 917 */
919 wc->w_finished_copy = 1; 918 wc->w_finished_copy = 1;
920 919
921 *ret_from = from; 920 *ret_from = from;
922 *ret_to = to; 921 *ret_to = to;
923 out: 922 out:
924 923
925 return bytes ? (unsigned int)bytes : ret; 924 return bytes ? (unsigned int)bytes : ret;
926 } 925 }
927 926
928 /* 927 /*
929 * Map, fill and write a page to disk. 928 * Map, fill and write a page to disk.
930 * 929 *
931 * The work of copying data is done via callback. Newly allocated 930 * The work of copying data is done via callback. Newly allocated
932 * pages which don't take user data will be zero'd (set 'new' to 931 * pages which don't take user data will be zero'd (set 'new' to
933 * indicate an allocating write) 932 * indicate an allocating write)
934 * 933 *
935 * Returns a negative error code or the number of bytes copied into 934 * Returns a negative error code or the number of bytes copied into
936 * the page. 935 * the page.
937 */ 936 */
938 int ocfs2_write_data_page(struct inode *inode, handle_t *handle, 937 int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
939 u64 *p_blkno, struct page *page, 938 u64 *p_blkno, struct page *page,
940 struct ocfs2_write_ctxt *wc, int new) 939 struct ocfs2_write_ctxt *wc, int new)
941 { 940 {
942 int ret, copied = 0; 941 int ret, copied = 0;
943 unsigned int from = 0, to = 0; 942 unsigned int from = 0, to = 0;
944 unsigned int cluster_start, cluster_end; 943 unsigned int cluster_start, cluster_end;
945 unsigned int zero_from = 0, zero_to = 0; 944 unsigned int zero_from = 0, zero_to = 0;
946 945
947 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, 946 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos,
948 &cluster_start, &cluster_end); 947 &cluster_start, &cluster_end);
949 948
950 if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index 949 if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index
951 && !wc->w_finished_copy) { 950 && !wc->w_finished_copy) {
952 951
953 wc->w_this_page = page; 952 wc->w_this_page = page;
954 wc->w_this_page_new = new; 953 wc->w_this_page_new = new;
955 ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); 954 ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to);
956 if (ret < 0) { 955 if (ret < 0) {
957 mlog_errno(ret); 956 mlog_errno(ret);
958 goto out; 957 goto out;
959 } 958 }
960 959
961 copied = ret; 960 copied = ret;
962 961
963 zero_from = from; 962 zero_from = from;
964 zero_to = to; 963 zero_to = to;
965 if (new) { 964 if (new) {
966 from = cluster_start; 965 from = cluster_start;
967 to = cluster_end; 966 to = cluster_end;
968 } 967 }
969 } else { 968 } else {
970 /* 969 /*
971 * If we haven't allocated the new page yet, we 970 * If we haven't allocated the new page yet, we
972 * shouldn't be writing it out without copying user 971 * shouldn't be writing it out without copying user
973 * data. This is likely a math error from the caller. 972 * data. This is likely a math error from the caller.
974 */ 973 */
975 BUG_ON(!new); 974 BUG_ON(!new);
976 975
977 from = cluster_start; 976 from = cluster_start;
978 to = cluster_end; 977 to = cluster_end;
979 978
980 ret = ocfs2_map_page_blocks(page, p_blkno, inode, 979 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
981 cluster_start, cluster_end, 1); 980 cluster_start, cluster_end, 1);
982 if (ret) { 981 if (ret) {
983 mlog_errno(ret); 982 mlog_errno(ret);
984 goto out; 983 goto out;
985 } 984 }
986 } 985 }
987 986
988 /* 987 /*
989 * Parts of newly allocated pages need to be zero'd. 988 * Parts of newly allocated pages need to be zero'd.
990 * 989 *
991 * Above, we have also rewritten 'to' and 'from' - as far as 990 * Above, we have also rewritten 'to' and 'from' - as far as
992 * the rest of the function is concerned, the entire cluster 991 * the rest of the function is concerned, the entire cluster
993 * range inside of a page needs to be written. 992 * range inside of a page needs to be written.
994 * 993 *
995 * We can skip this if the page is up to date - it's already 994 * We can skip this if the page is up to date - it's already
996 * been zero'd from being read in as a hole. 995 * been zero'd from being read in as a hole.
997 */ 996 */
998 if (new && !PageUptodate(page)) 997 if (new && !PageUptodate(page))
999 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), 998 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
1000 wc->w_cpos, zero_from, zero_to); 999 wc->w_cpos, zero_from, zero_to);
1001 1000
1002 flush_dcache_page(page); 1001 flush_dcache_page(page);
1003 1002
1004 if (ocfs2_should_order_data(inode)) { 1003 if (ocfs2_should_order_data(inode)) {
1005 ret = walk_page_buffers(handle, 1004 ret = walk_page_buffers(handle,
1006 page_buffers(page), 1005 page_buffers(page),
1007 from, to, NULL, 1006 from, to, NULL,
1008 ocfs2_journal_dirty_data); 1007 ocfs2_journal_dirty_data);
1009 if (ret < 0) 1008 if (ret < 0)
1010 mlog_errno(ret); 1009 mlog_errno(ret);
1011 } 1010 }
1012 1011
1013 /* 1012 /*
1014 * We don't use generic_commit_write() because we need to 1013 * We don't use generic_commit_write() because we need to
1015 * handle our own i_size update. 1014 * handle our own i_size update.
1016 */ 1015 */
1017 ret = block_commit_write(page, from, to); 1016 ret = block_commit_write(page, from, to);
1018 if (ret) 1017 if (ret)
1019 mlog_errno(ret); 1018 mlog_errno(ret);
1020 out: 1019 out:
1021 1020
1022 return copied ? copied : ret; 1021 return copied ? copied : ret;
1023 } 1022 }
1024 1023
1025 /* 1024 /*
1026 * Do the actual write of some data into an inode. Optionally allocate 1025 * Do the actual write of some data into an inode. Optionally allocate
1027 * in order to fulfill the write. 1026 * in order to fulfill the write.
1028 * 1027 *
1029 * cpos is the logical cluster offset within the file to write at 1028 * cpos is the logical cluster offset within the file to write at
1030 * 1029 *
1031 * 'phys' is the physical mapping of that offset. a 'phys' value of 1030 * 'phys' is the physical mapping of that offset. a 'phys' value of
1032 * zero indicates that allocation is required. In this case, data_ac 1031 * zero indicates that allocation is required. In this case, data_ac
1033 * and meta_ac should be valid (meta_ac can be null if metadata 1032 * and meta_ac should be valid (meta_ac can be null if metadata
1034 * allocation isn't required). 1033 * allocation isn't required).
1035 */ 1034 */
1036 static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, 1035 static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
1037 struct buffer_head *di_bh, 1036 struct buffer_head *di_bh,
1038 struct ocfs2_alloc_context *data_ac, 1037 struct ocfs2_alloc_context *data_ac,
1039 struct ocfs2_alloc_context *meta_ac, 1038 struct ocfs2_alloc_context *meta_ac,
1040 struct ocfs2_write_ctxt *wc) 1039 struct ocfs2_write_ctxt *wc)
1041 { 1040 {
1042 int ret, i, numpages = 1, new; 1041 int ret, i, numpages = 1, new;
1043 unsigned int copied = 0; 1042 unsigned int copied = 0;
1044 u32 tmp_pos; 1043 u32 tmp_pos;
1045 u64 v_blkno, p_blkno; 1044 u64 v_blkno, p_blkno;
1046 struct address_space *mapping = file->f_mapping; 1045 struct address_space *mapping = file->f_mapping;
1047 struct inode *inode = mapping->host; 1046 struct inode *inode = mapping->host;
1048 unsigned long index, start; 1047 unsigned long index, start;
1049 struct page **cpages; 1048 struct page **cpages;
1050 1049
1051 new = phys == 0 ? 1 : 0; 1050 new = phys == 0 ? 1 : 0;
1052 1051
1053 /* 1052 /*
1054 * Figure out how many pages we'll be manipulating here. For 1053 * Figure out how many pages we'll be manipulating here. For
1055 * non allocating write, we just change the one 1054 * non allocating write, we just change the one
1056 * page. Otherwise, we'll need a whole clusters worth. 1055 * page. Otherwise, we'll need a whole clusters worth.
1057 */ 1056 */
1058 if (new) 1057 if (new)
1059 numpages = ocfs2_pages_per_cluster(inode->i_sb); 1058 numpages = ocfs2_pages_per_cluster(inode->i_sb);
1060 1059
1061 cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS); 1060 cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
1062 if (!cpages) { 1061 if (!cpages) {
1063 ret = -ENOMEM; 1062 ret = -ENOMEM;
1064 mlog_errno(ret); 1063 mlog_errno(ret);
1065 return ret; 1064 return ret;
1066 } 1065 }
1067 1066
1068 /* 1067 /*
1069 * Fill our page array first. That way we've grabbed enough so 1068 * Fill our page array first. That way we've grabbed enough so
1070 * that we can zero and flush if we error after adding the 1069 * that we can zero and flush if we error after adding the
1071 * extent. 1070 * extent.
1072 */ 1071 */
1073 if (new) { 1072 if (new) {
1074 start = ocfs2_align_clusters_to_page_index(inode->i_sb, 1073 start = ocfs2_align_clusters_to_page_index(inode->i_sb,
1075 wc->w_cpos); 1074 wc->w_cpos);
1076 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos); 1075 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
1077 } else { 1076 } else {
1078 start = wc->w_pos >> PAGE_CACHE_SHIFT; 1077 start = wc->w_pos >> PAGE_CACHE_SHIFT;
1079 v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; 1078 v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits;
1080 } 1079 }
1081 1080
1082 for(i = 0; i < numpages; i++) { 1081 for(i = 0; i < numpages; i++) {
1083 index = start + i; 1082 index = start + i;
1084 1083
1085 cpages[i] = grab_cache_page(mapping, index); 1084 cpages[i] = grab_cache_page(mapping, index);
1086 if (!cpages[i]) { 1085 if (!cpages[i]) {
1087 ret = -ENOMEM; 1086 ret = -ENOMEM;
1088 mlog_errno(ret); 1087 mlog_errno(ret);
1089 goto out; 1088 goto out;
1090 } 1089 }
1091 } 1090 }
1092 1091
1093 if (new) { 1092 if (new) {
1094 /* 1093 /*
1095 * This is safe to call with the page locks - it won't take 1094 * This is safe to call with the page locks - it won't take
1096 * any additional semaphores or cluster locks. 1095 * any additional semaphores or cluster locks.
1097 */ 1096 */
1098 tmp_pos = wc->w_cpos; 1097 tmp_pos = wc->w_cpos;
1099 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, 1098 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
1100 &tmp_pos, 1, di_bh, handle, 1099 &tmp_pos, 1, di_bh, handle,
1101 data_ac, meta_ac, NULL); 1100 data_ac, meta_ac, NULL);
1102 /* 1101 /*
1103 * This shouldn't happen because we must have already 1102 * This shouldn't happen because we must have already
1104 * calculated the correct meta data allocation required. The 1103 * calculated the correct meta data allocation required. The
1105 * internal tree allocation code should know how to increase 1104 * internal tree allocation code should know how to increase
1106 * transaction credits itself. 1105 * transaction credits itself.
1107 * 1106 *
1108 * If need be, we could handle -EAGAIN for a 1107 * If need be, we could handle -EAGAIN for a
1109 * RESTART_TRANS here. 1108 * RESTART_TRANS here.
1110 */ 1109 */
1111 mlog_bug_on_msg(ret == -EAGAIN, 1110 mlog_bug_on_msg(ret == -EAGAIN,
1112 "Inode %llu: EAGAIN return during allocation.\n", 1111 "Inode %llu: EAGAIN return during allocation.\n",
1113 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1112 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1114 if (ret < 0) { 1113 if (ret < 0) {
1115 mlog_errno(ret); 1114 mlog_errno(ret);
1116 goto out; 1115 goto out;
1117 } 1116 }
1118 } 1117 }
1119 1118
1120 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, 1119 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
1121 NULL); 1120 NULL);
1122 if (ret < 0) { 1121 if (ret < 0) {
1123 1122
1124 /* 1123 /*
1125 * XXX: Should we go readonly here? 1124 * XXX: Should we go readonly here?
1126 */ 1125 */
1127 1126
1128 mlog_errno(ret); 1127 mlog_errno(ret);
1129 goto out; 1128 goto out;
1130 } 1129 }
1131 1130
1132 BUG_ON(p_blkno == 0); 1131 BUG_ON(p_blkno == 0);
1133 1132
1134 for(i = 0; i < numpages; i++) { 1133 for(i = 0; i < numpages; i++) {
1135 ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], 1134 ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i],
1136 wc, new); 1135 wc, new);
1137 if (ret < 0) { 1136 if (ret < 0) {
1138 mlog_errno(ret); 1137 mlog_errno(ret);
1139 goto out; 1138 goto out;
1140 } 1139 }
1141 1140
1142 copied += ret; 1141 copied += ret;
1143 } 1142 }
1144 1143
1145 out: 1144 out:
1146 for(i = 0; i < numpages; i++) { 1145 for(i = 0; i < numpages; i++) {
1147 unlock_page(cpages[i]); 1146 unlock_page(cpages[i]);
1148 mark_page_accessed(cpages[i]); 1147 mark_page_accessed(cpages[i]);
1149 page_cache_release(cpages[i]); 1148 page_cache_release(cpages[i]);
1150 } 1149 }
1151 kfree(cpages); 1150 kfree(cpages);
1152 1151
1153 return copied ? copied : ret; 1152 return copied ? copied : ret;
1154 } 1153 }
1155 1154
1156 static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, 1155 static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc,
1157 struct ocfs2_super *osb, loff_t pos, 1156 struct ocfs2_super *osb, loff_t pos,
1158 size_t count, ocfs2_page_writer *cb, 1157 size_t count, ocfs2_page_writer *cb,
1159 void *cb_priv) 1158 void *cb_priv)
1160 { 1159 {
1161 wc->w_count = count; 1160 wc->w_count = count;
1162 wc->w_pos = pos; 1161 wc->w_pos = pos;
1163 wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits; 1162 wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
1164 wc->w_finished_copy = 0; 1163 wc->w_finished_copy = 0;
1165 1164
1166 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) 1165 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
1167 wc->w_large_pages = 1; 1166 wc->w_large_pages = 1;
1168 else 1167 else
1169 wc->w_large_pages = 0; 1168 wc->w_large_pages = 0;
1170 1169
1171 wc->w_write_data_page = cb; 1170 wc->w_write_data_page = cb;
1172 wc->w_private = cb_priv; 1171 wc->w_private = cb_priv;
1173 } 1172 }
1174 1173
1175 /* 1174 /*
1176 * Write a cluster to an inode. The cluster may not be allocated yet, 1175 * Write a cluster to an inode. The cluster may not be allocated yet,
1177 * in which case it will be. This only exists for buffered writes - 1176 * in which case it will be. This only exists for buffered writes -
1178 * O_DIRECT takes a more "traditional" path through the kernel. 1177 * O_DIRECT takes a more "traditional" path through the kernel.
1179 * 1178 *
1180 * The caller is responsible for incrementing pos, written counts, etc 1179 * The caller is responsible for incrementing pos, written counts, etc
1181 * 1180 *
1182 * For file systems that don't support sparse files, pre-allocation 1181 * For file systems that don't support sparse files, pre-allocation
1183 * and page zeroing up until cpos should be done prior to this 1182 * and page zeroing up until cpos should be done prior to this
1184 * function call. 1183 * function call.
1185 * 1184 *
1186 * Callers should be holding i_sem, and the rw cluster lock. 1185 * Callers should be holding i_sem, and the rw cluster lock.
1187 * 1186 *
1188 * Returns the number of user bytes written, or less than zero for 1187 * Returns the number of user bytes written, or less than zero for
1189 * error. 1188 * error.
1190 */ 1189 */
1191 ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, 1190 ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
1192 size_t count, ocfs2_page_writer *actor, 1191 size_t count, ocfs2_page_writer *actor,
1193 void *priv) 1192 void *priv)
1194 { 1193 {
1195 int ret, credits = OCFS2_INODE_UPDATE_CREDITS; 1194 int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
1196 ssize_t written = 0; 1195 ssize_t written = 0;
1197 u32 phys; 1196 u32 phys;
1198 struct inode *inode = file->f_mapping->host; 1197 struct inode *inode = file->f_mapping->host;
1199 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1198 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1200 struct buffer_head *di_bh = NULL; 1199 struct buffer_head *di_bh = NULL;
1201 struct ocfs2_dinode *di; 1200 struct ocfs2_dinode *di;
1202 struct ocfs2_alloc_context *data_ac = NULL; 1201 struct ocfs2_alloc_context *data_ac = NULL;
1203 struct ocfs2_alloc_context *meta_ac = NULL; 1202 struct ocfs2_alloc_context *meta_ac = NULL;
1204 handle_t *handle; 1203 handle_t *handle;
1205 struct ocfs2_write_ctxt wc; 1204 struct ocfs2_write_ctxt wc;
1206 1205
1207 ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv); 1206 ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
1208 1207
1209 ret = ocfs2_meta_lock(inode, &di_bh, 1); 1208 ret = ocfs2_meta_lock(inode, &di_bh, 1);
1210 if (ret) { 1209 if (ret) {
1211 mlog_errno(ret); 1210 mlog_errno(ret);
1212 goto out; 1211 goto out;
1213 } 1212 }
1214 di = (struct ocfs2_dinode *)di_bh->b_data; 1213 di = (struct ocfs2_dinode *)di_bh->b_data;
1215 1214
1216 /* 1215 /*
1217 * Take alloc sem here to prevent concurrent lookups. That way 1216 * Take alloc sem here to prevent concurrent lookups. That way
1218 * the mapping, zeroing and tree manipulation within 1217 * the mapping, zeroing and tree manipulation within
1219 * ocfs2_write() will be safe against ->readpage(). This 1218 * ocfs2_write() will be safe against ->readpage(). This
1220 * should also serve to lock out allocation from a shared 1219 * should also serve to lock out allocation from a shared
1221 * writeable region. 1220 * writeable region.
1222 */ 1221 */
1223 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1222 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1224 1223
1225 ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL); 1224 ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL);
1226 if (ret) { 1225 if (ret) {
1227 mlog_errno(ret); 1226 mlog_errno(ret);
1228 goto out_meta; 1227 goto out_meta;
1229 } 1228 }
1230 1229
1231 /* phys == 0 means that allocation is required. */ 1230 /* phys == 0 means that allocation is required. */
1232 if (phys == 0) { 1231 if (phys == 0) {
1233 ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); 1232 ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac);
1234 if (ret) { 1233 if (ret) {
1235 mlog_errno(ret); 1234 mlog_errno(ret);
1236 goto out_meta; 1235 goto out_meta;
1237 } 1236 }
1238 1237
1239 credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); 1238 credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1);
1240 } 1239 }
1241 1240
1242 ret = ocfs2_data_lock(inode, 1); 1241 ret = ocfs2_data_lock(inode, 1);
1243 if (ret) { 1242 if (ret) {
1244 mlog_errno(ret); 1243 mlog_errno(ret);
1245 goto out_meta; 1244 goto out_meta;
1246 } 1245 }
1247 1246
1248 handle = ocfs2_start_trans(osb, credits); 1247 handle = ocfs2_start_trans(osb, credits);
1249 if (IS_ERR(handle)) { 1248 if (IS_ERR(handle)) {
1250 ret = PTR_ERR(handle); 1249 ret = PTR_ERR(handle);
1251 mlog_errno(ret); 1250 mlog_errno(ret);
1252 goto out_data; 1251 goto out_data;
1253 } 1252 }
1254 1253
1255 written = ocfs2_write(file, phys, handle, di_bh, data_ac, 1254 written = ocfs2_write(file, phys, handle, di_bh, data_ac,
1256 meta_ac, &wc); 1255 meta_ac, &wc);
1257 if (written < 0) { 1256 if (written < 0) {
1258 ret = written; 1257 ret = written;
1259 mlog_errno(ret); 1258 mlog_errno(ret);
1260 goto out_commit; 1259 goto out_commit;
1261 } 1260 }
1262 1261
1263 ret = ocfs2_journal_access(handle, inode, di_bh, 1262 ret = ocfs2_journal_access(handle, inode, di_bh,
1264 OCFS2_JOURNAL_ACCESS_WRITE); 1263 OCFS2_JOURNAL_ACCESS_WRITE);
1265 if (ret) { 1264 if (ret) {
1266 mlog_errno(ret); 1265 mlog_errno(ret);
1267 goto out_commit; 1266 goto out_commit;
1268 } 1267 }
1269 1268
1270 pos += written; 1269 pos += written;
1271 if (pos > inode->i_size) { 1270 if (pos > inode->i_size) {
1272 i_size_write(inode, pos); 1271 i_size_write(inode, pos);
1273 mark_inode_dirty(inode); 1272 mark_inode_dirty(inode);
1274 } 1273 }
1275 inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode))); 1274 inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode)));
1276 di->i_size = cpu_to_le64((u64)i_size_read(inode)); 1275 di->i_size = cpu_to_le64((u64)i_size_read(inode));
1277 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1276 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1278 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); 1277 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
1279 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 1278 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1280 1279
1281 ret = ocfs2_journal_dirty(handle, di_bh); 1280 ret = ocfs2_journal_dirty(handle, di_bh);
1282 if (ret) 1281 if (ret)
1283 mlog_errno(ret); 1282 mlog_errno(ret);
1284 1283
1285 out_commit: 1284 out_commit:
1286 ocfs2_commit_trans(osb, handle); 1285 ocfs2_commit_trans(osb, handle);
1287 1286
1288 out_data: 1287 out_data:
1289 ocfs2_data_unlock(inode, 1); 1288 ocfs2_data_unlock(inode, 1);
1290 1289
1291 out_meta: 1290 out_meta:
1292 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1291 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1293 ocfs2_meta_unlock(inode, 1); 1292 ocfs2_meta_unlock(inode, 1);
1294 1293
1295 out: 1294 out:
1296 brelse(di_bh); 1295 brelse(di_bh);
1297 if (data_ac) 1296 if (data_ac)
1298 ocfs2_free_alloc_context(data_ac); 1297 ocfs2_free_alloc_context(data_ac);
1299 if (meta_ac) 1298 if (meta_ac)
1300 ocfs2_free_alloc_context(meta_ac); 1299 ocfs2_free_alloc_context(meta_ac);
1301 1300
1302 return written ? written : ret; 1301 return written ? written : ret;
1303 } 1302 }
1304 1303
1305 const struct address_space_operations ocfs2_aops = { 1304 const struct address_space_operations ocfs2_aops = {
1306 .readpage = ocfs2_readpage, 1305 .readpage = ocfs2_readpage,
1307 .writepage = ocfs2_writepage, 1306 .writepage = ocfs2_writepage,
1308 .bmap = ocfs2_bmap, 1307 .bmap = ocfs2_bmap,
1309 .sync_page = block_sync_page, 1308 .sync_page = block_sync_page,
1310 .direct_IO = ocfs2_direct_IO, 1309 .direct_IO = ocfs2_direct_IO,
1311 .invalidatepage = ocfs2_invalidatepage, 1310 .invalidatepage = ocfs2_invalidatepage,
1312 .releasepage = ocfs2_releasepage, 1311 .releasepage = ocfs2_releasepage,
1313 .migratepage = buffer_migrate_page, 1312 .migratepage = buffer_migrate_page,
1314 }; 1313 };
1315 1314
fs/ocfs2/extent_map.c
1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * extent_map.c 4 * extent_map.c
5 * 5 *
6 * Block/Cluster mapping functions 6 * Block/Cluster mapping functions
7 * 7 *
8 * Copyright (C) 2004 Oracle. All rights reserved. 8 * Copyright (C) 2004 Oracle. All rights reserved.
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public 11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation. 12 * License, version 2, as published by the Free Software Foundation.
13 * 13 *
14 * This program is distributed in the hope that it will be useful, 14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details. 17 * General Public License for more details.
18 * 18 *
19 * You should have received a copy of the GNU General Public 19 * You should have received a copy of the GNU General Public
20 * License along with this program; if not, write to the 20 * License along with this program; if not, write to the
21 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 21 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 * Boston, MA 021110-1307, USA. 22 * Boston, MA 021110-1307, USA.
23 */ 23 */
24 24
25 #include <linux/fs.h> 25 #include <linux/fs.h>
26 #include <linux/init.h> 26 #include <linux/init.h>
27 #include <linux/types.h> 27 #include <linux/types.h>
28 28
29 #define MLOG_MASK_PREFIX ML_EXTENT_MAP 29 #define MLOG_MASK_PREFIX ML_EXTENT_MAP
30 #include <cluster/masklog.h> 30 #include <cluster/masklog.h>
31 31
32 #include "ocfs2.h" 32 #include "ocfs2.h"
33 33
34 #include "alloc.h" 34 #include "alloc.h"
35 #include "extent_map.h" 35 #include "extent_map.h"
36 #include "inode.h" 36 #include "inode.h"
37 #include "super.h" 37 #include "super.h"
38 38
39 #include "buffer_head_io.h" 39 #include "buffer_head_io.h"
40 40
41 /* 41 /*
42 * Return the 1st index within el which contains an extent start
43 * larger than v_cluster.
44 */
45 static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
46 u32 v_cluster)
47 {
48 int i;
49 struct ocfs2_extent_rec *rec;
50
51 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
52 rec = &el->l_recs[i];
53
54 if (v_cluster < le32_to_cpu(rec->e_cpos))
55 break;
56 }
57
58 return i;
59 }
60
61 /*
62 * Figure out the size of a hole which starts at v_cluster within the given
63 * extent list.
64 *
65 * If there is no more allocation past v_cluster, we return the maximum
66 * cluster size minus v_cluster.
67 *
68 * If we have in-inode extents, then el points to the dinode list and
69 * eb_bh is NULL. Otherwise, eb_bh should point to the extent block
70 * containing el.
71 */
72 static int ocfs2_figure_hole_clusters(struct inode *inode,
73 struct ocfs2_extent_list *el,
74 struct buffer_head *eb_bh,
75 u32 v_cluster,
76 u32 *num_clusters)
77 {
78 int ret, i;
79 struct buffer_head *next_eb_bh = NULL;
80 struct ocfs2_extent_block *eb, *next_eb;
81
82 i = ocfs2_search_for_hole_index(el, v_cluster);
83
84 if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) {
85 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
86
87 /*
88 * Check the next leaf for any extents.
89 */
90
91 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
92 goto no_more_extents;
93
94 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
95 le64_to_cpu(eb->h_next_leaf_blk),
96 &next_eb_bh, OCFS2_BH_CACHED, inode);
97 if (ret) {
98 mlog_errno(ret);
99 goto out;
100 }
101 next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
102
103 if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
104 ret = -EROFS;
105 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
106 goto out;
107 }
108
109 el = &next_eb->h_list;
110
111 i = ocfs2_search_for_hole_index(el, v_cluster);
112 }
113
114 no_more_extents:
115 if (i == le16_to_cpu(el->l_next_free_rec)) {
116 /*
117 * We're at the end of our existing allocation. Just
118 * return the maximum number of clusters we could
119 * possibly allocate.
120 */
121 *num_clusters = UINT_MAX - v_cluster;
122 } else {
123 *num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster;
124 }
125
126 ret = 0;
127 out:
128 brelse(next_eb_bh);
129 return ret;
130 }
131
132 /*
42 * Return the index of the extent record which contains cluster #v_cluster. 133 * Return the index of the extent record which contains cluster #v_cluster.
43 * -1 is returned if it was not found. 134 * -1 is returned if it was not found.
44 * 135 *
45 * Should work fine on interior and exterior nodes. 136 * Should work fine on interior and exterior nodes.
46 */ 137 */
47 static int ocfs2_search_extent_list(struct ocfs2_extent_list *el, 138 static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
48 u32 v_cluster) 139 u32 v_cluster)
49 { 140 {
50 int ret = -1; 141 int ret = -1;
51 int i; 142 int i;
52 struct ocfs2_extent_rec *rec; 143 struct ocfs2_extent_rec *rec;
53 u32 rec_end, rec_start, clusters; 144 u32 rec_end, rec_start, clusters;
54 145
55 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 146 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
56 rec = &el->l_recs[i]; 147 rec = &el->l_recs[i];
57 148
58 rec_start = le32_to_cpu(rec->e_cpos); 149 rec_start = le32_to_cpu(rec->e_cpos);
59 clusters = ocfs2_rec_clusters(el, rec); 150 clusters = ocfs2_rec_clusters(el, rec);
60 151
61 rec_end = rec_start + clusters; 152 rec_end = rec_start + clusters;
62 153
63 if (v_cluster >= rec_start && v_cluster < rec_end) { 154 if (v_cluster >= rec_start && v_cluster < rec_end) {
64 ret = i; 155 ret = i;
65 break; 156 break;
66 } 157 }
67 } 158 }
68 159
69 return ret; 160 return ret;
70 } 161 }
71 162
72 int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, 163 int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
73 u32 *p_cluster, u32 *num_clusters, 164 u32 *p_cluster, u32 *num_clusters,
74 unsigned int *extent_flags) 165 unsigned int *extent_flags)
75 { 166 {
76 int ret, i; 167 int ret, i;
77 unsigned int flags = 0; 168 unsigned int flags = 0;
78 struct buffer_head *di_bh = NULL; 169 struct buffer_head *di_bh = NULL;
79 struct buffer_head *eb_bh = NULL; 170 struct buffer_head *eb_bh = NULL;
80 struct ocfs2_dinode *di; 171 struct ocfs2_dinode *di;
81 struct ocfs2_extent_block *eb; 172 struct ocfs2_extent_block *eb;
82 struct ocfs2_extent_list *el; 173 struct ocfs2_extent_list *el;
83 struct ocfs2_extent_rec *rec; 174 struct ocfs2_extent_rec *rec;
84 u32 coff; 175 u32 coff;
85 176
86 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, 177 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
87 &di_bh, OCFS2_BH_CACHED, inode); 178 &di_bh, OCFS2_BH_CACHED, inode);
88 if (ret) { 179 if (ret) {
89 mlog_errno(ret); 180 mlog_errno(ret);
90 goto out; 181 goto out;
91 } 182 }
92 183
93 di = (struct ocfs2_dinode *) di_bh->b_data; 184 di = (struct ocfs2_dinode *) di_bh->b_data;
94 el = &di->id2.i_list; 185 el = &di->id2.i_list;
95 186
96 if (el->l_tree_depth) { 187 if (el->l_tree_depth) {
97 ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); 188 ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
98 if (ret) { 189 if (ret) {
99 mlog_errno(ret); 190 mlog_errno(ret);
100 goto out; 191 goto out;
101 } 192 }
102 193
103 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 194 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
104 el = &eb->h_list; 195 el = &eb->h_list;
105 196
106 if (el->l_tree_depth) { 197 if (el->l_tree_depth) {
107 ocfs2_error(inode->i_sb, 198 ocfs2_error(inode->i_sb,
108 "Inode %lu has non zero tree depth in " 199 "Inode %lu has non zero tree depth in "
109 "leaf block %llu\n", inode->i_ino, 200 "leaf block %llu\n", inode->i_ino,
110 (unsigned long long)eb_bh->b_blocknr); 201 (unsigned long long)eb_bh->b_blocknr);
111 ret = -EROFS; 202 ret = -EROFS;
112 goto out; 203 goto out;
113 } 204 }
114 } 205 }
115 206
116 i = ocfs2_search_extent_list(el, v_cluster); 207 i = ocfs2_search_extent_list(el, v_cluster);
117 if (i == -1) { 208 if (i == -1) {
118 /* 209 /*
119 * A hole was found. Return some canned values that 210 * A hole was found. Return some canned values that
120 * callers can key on. 211 * callers can key on. If asked for, num_clusters will
212 * be populated with the size of the hole.
121 */ 213 */
122 *p_cluster = 0; 214 *p_cluster = 0;
123 if (num_clusters) 215 if (num_clusters) {
124 *num_clusters = 1; 216 ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
217 v_cluster,
218 num_clusters);
219 if (ret) {
220 mlog_errno(ret);
221 goto out;
222 }
223 }
125 } else { 224 } else {
126 rec = &el->l_recs[i]; 225 rec = &el->l_recs[i];
127 226
128 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); 227 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
129 228
130 if (!rec->e_blkno) { 229 if (!rec->e_blkno) {
131 ocfs2_error(inode->i_sb, "Inode %lu has bad extent " 230 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
132 "record (%u, %u, 0)", inode->i_ino, 231 "record (%u, %u, 0)", inode->i_ino,
133 le32_to_cpu(rec->e_cpos), 232 le32_to_cpu(rec->e_cpos),
134 ocfs2_rec_clusters(el, rec)); 233 ocfs2_rec_clusters(el, rec));
135 ret = -EROFS; 234 ret = -EROFS;
136 goto out; 235 goto out;
137 } 236 }
138 237
139 coff = v_cluster - le32_to_cpu(rec->e_cpos); 238 coff = v_cluster - le32_to_cpu(rec->e_cpos);
140 239
141 *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, 240 *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
142 le64_to_cpu(rec->e_blkno)); 241 le64_to_cpu(rec->e_blkno));
143 *p_cluster = *p_cluster + coff; 242 *p_cluster = *p_cluster + coff;
144 243
145 if (num_clusters) 244 if (num_clusters)
146 *num_clusters = ocfs2_rec_clusters(el, rec) - coff; 245 *num_clusters = ocfs2_rec_clusters(el, rec) - coff;
147 246
148 flags = rec->e_flags; 247 flags = rec->e_flags;
149 } 248 }
150 249
151 if (extent_flags) 250 if (extent_flags)
152 *extent_flags = flags; 251 *extent_flags = flags;
153 252
154 out: 253 out:
155 brelse(di_bh); 254 brelse(di_bh);
156 brelse(eb_bh); 255 brelse(eb_bh);
157 return ret; 256 return ret;
158 } 257 }
159 258
160 /* 259 /*
161 * This expects alloc_sem to be held. The allocation cannot change at 260 * This expects alloc_sem to be held. The allocation cannot change at
162 * all while the map is in the process of being updated. 261 * all while the map is in the process of being updated.
163 */ 262 */
164 int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, 263 int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
165 int *ret_count, unsigned int *extent_flags) 264 u64 *ret_count, unsigned int *extent_flags)
166 { 265 {
167 int ret; 266 int ret;
168 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 267 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
169 u32 cpos, num_clusters, p_cluster; 268 u32 cpos, num_clusters, p_cluster;
170 u64 boff = 0; 269 u64 boff = 0;
171 270
172 cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); 271 cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
173 272
174 ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters, 273 ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters,
175 extent_flags); 274 extent_flags);
176 if (ret) { 275 if (ret) {
177 mlog_errno(ret); 276 mlog_errno(ret);
178 goto out; 277 goto out;
179 } 278 }
180 279
181 /* 280 /*
182 * p_cluster == 0 indicates a hole. 281 * p_cluster == 0 indicates a hole.
183 */ 282 */
184 if (p_cluster) { 283 if (p_cluster) {
185 boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); 284 boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
186 boff += (v_blkno & (u64)(bpc - 1)); 285 boff += (v_blkno & (u64)(bpc - 1));
187 } 286 }
188 287
189 *p_blkno = boff; 288 *p_blkno = boff;
190 289
191 if (ret_count) { 290 if (ret_count) {
192 *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); 291 *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
193 *ret_count -= v_blkno & (u64)(bpc - 1); 292 *ret_count -= v_blkno & (u64)(bpc - 1);
194 } 293 }
195 294
196 out: 295 out:
197 return ret; 296 return ret;
198 } 297 }
199 298
fs/ocfs2/extent_map.h
1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * extent_map.h 4 * extent_map.h
5 * 5 *
6 * In-memory file extent mappings for OCFS2. 6 * In-memory file extent mappings for OCFS2.
7 * 7 *
8 * Copyright (C) 2004 Oracle. All rights reserved. 8 * Copyright (C) 2004 Oracle. All rights reserved.
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public 11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation. 12 * License, version 2, as published by the Free Software Foundation.
13 * 13 *
14 * This program is distributed in the hope that it will be useful, 14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details. 17 * General Public License for more details.
18 * 18 *
19 * You should have received a copy of the GNU General Public 19 * You should have received a copy of the GNU General Public
20 * License along with this program; if not, write to the 20 * License along with this program; if not, write to the
21 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 21 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 * Boston, MA 021110-1307, USA. 22 * Boston, MA 021110-1307, USA.
23 */ 23 */
24 24
25 #ifndef _EXTENT_MAP_H 25 #ifndef _EXTENT_MAP_H
26 #define _EXTENT_MAP_H 26 #define _EXTENT_MAP_H
27 27
28 int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, 28 int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
29 u32 *num_clusters, unsigned int *extent_flags); 29 u32 *num_clusters, unsigned int *extent_flags);
30 int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, 30 int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
31 int *ret_count, unsigned int *extent_flags); 31 u64 *ret_count, unsigned int *extent_flags);
32 32
33 #endif /* _EXTENT_MAP_H */ 33 #endif /* _EXTENT_MAP_H */
34 34
1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * journal.c 4 * journal.c
5 * 5 *
6 * Defines functions of journalling api 6 * Defines functions of journalling api
7 * 7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved. 8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public 11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either 12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version. 13 * version 2 of the License, or (at your option) any later version.
14 * 14 *
15 * This program is distributed in the hope that it will be useful, 15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details. 18 * General Public License for more details.
19 * 19 *
20 * You should have received a copy of the GNU General Public 20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the 21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA. 23 * Boston, MA 021110-1307, USA.
24 */ 24 */
25 25
26 #include <linux/fs.h> 26 #include <linux/fs.h>
27 #include <linux/types.h> 27 #include <linux/types.h>
28 #include <linux/slab.h> 28 #include <linux/slab.h>
29 #include <linux/highmem.h> 29 #include <linux/highmem.h>
30 #include <linux/kthread.h> 30 #include <linux/kthread.h>
31 31
32 #define MLOG_MASK_PREFIX ML_JOURNAL 32 #define MLOG_MASK_PREFIX ML_JOURNAL
33 #include <cluster/masklog.h> 33 #include <cluster/masklog.h>
34 34
35 #include "ocfs2.h" 35 #include "ocfs2.h"
36 36
37 #include "alloc.h" 37 #include "alloc.h"
38 #include "dlmglue.h" 38 #include "dlmglue.h"
39 #include "extent_map.h" 39 #include "extent_map.h"
40 #include "heartbeat.h" 40 #include "heartbeat.h"
41 #include "inode.h" 41 #include "inode.h"
42 #include "journal.h" 42 #include "journal.h"
43 #include "localalloc.h" 43 #include "localalloc.h"
44 #include "namei.h" 44 #include "namei.h"
45 #include "slot_map.h" 45 #include "slot_map.h"
46 #include "super.h" 46 #include "super.h"
47 #include "vote.h" 47 #include "vote.h"
48 #include "sysfile.h" 48 #include "sysfile.h"
49 49
50 #include "buffer_head_io.h" 50 #include "buffer_head_io.h"
51 51
52 DEFINE_SPINLOCK(trans_inc_lock); 52 DEFINE_SPINLOCK(trans_inc_lock);
53 53
54 static int ocfs2_force_read_journal(struct inode *inode); 54 static int ocfs2_force_read_journal(struct inode *inode);
55 static int ocfs2_recover_node(struct ocfs2_super *osb, 55 static int ocfs2_recover_node(struct ocfs2_super *osb,
56 int node_num); 56 int node_num);
57 static int __ocfs2_recovery_thread(void *arg); 57 static int __ocfs2_recovery_thread(void *arg);
58 static int ocfs2_commit_cache(struct ocfs2_super *osb); 58 static int ocfs2_commit_cache(struct ocfs2_super *osb);
59 static int ocfs2_wait_on_mount(struct ocfs2_super *osb); 59 static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
60 static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, 60 static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
61 int dirty); 61 int dirty);
62 static int ocfs2_trylock_journal(struct ocfs2_super *osb, 62 static int ocfs2_trylock_journal(struct ocfs2_super *osb,
63 int slot_num); 63 int slot_num);
64 static int ocfs2_recover_orphans(struct ocfs2_super *osb, 64 static int ocfs2_recover_orphans(struct ocfs2_super *osb,
65 int slot); 65 int slot);
66 static int ocfs2_commit_thread(void *arg); 66 static int ocfs2_commit_thread(void *arg);
67 67
68 static int ocfs2_commit_cache(struct ocfs2_super *osb) 68 static int ocfs2_commit_cache(struct ocfs2_super *osb)
69 { 69 {
70 int status = 0; 70 int status = 0;
71 unsigned int flushed; 71 unsigned int flushed;
72 unsigned long old_id; 72 unsigned long old_id;
73 struct ocfs2_journal *journal = NULL; 73 struct ocfs2_journal *journal = NULL;
74 74
75 mlog_entry_void(); 75 mlog_entry_void();
76 76
77 journal = osb->journal; 77 journal = osb->journal;
78 78
79 /* Flush all pending commits and checkpoint the journal. */ 79 /* Flush all pending commits and checkpoint the journal. */
80 down_write(&journal->j_trans_barrier); 80 down_write(&journal->j_trans_barrier);
81 81
82 if (atomic_read(&journal->j_num_trans) == 0) { 82 if (atomic_read(&journal->j_num_trans) == 0) {
83 up_write(&journal->j_trans_barrier); 83 up_write(&journal->j_trans_barrier);
84 mlog(0, "No transactions for me to flush!\n"); 84 mlog(0, "No transactions for me to flush!\n");
85 goto finally; 85 goto finally;
86 } 86 }
87 87
88 journal_lock_updates(journal->j_journal); 88 journal_lock_updates(journal->j_journal);
89 status = journal_flush(journal->j_journal); 89 status = journal_flush(journal->j_journal);
90 journal_unlock_updates(journal->j_journal); 90 journal_unlock_updates(journal->j_journal);
91 if (status < 0) { 91 if (status < 0) {
92 up_write(&journal->j_trans_barrier); 92 up_write(&journal->j_trans_barrier);
93 mlog_errno(status); 93 mlog_errno(status);
94 goto finally; 94 goto finally;
95 } 95 }
96 96
97 old_id = ocfs2_inc_trans_id(journal); 97 old_id = ocfs2_inc_trans_id(journal);
98 98
99 flushed = atomic_read(&journal->j_num_trans); 99 flushed = atomic_read(&journal->j_num_trans);
100 atomic_set(&journal->j_num_trans, 0); 100 atomic_set(&journal->j_num_trans, 0);
101 up_write(&journal->j_trans_barrier); 101 up_write(&journal->j_trans_barrier);
102 102
103 mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n", 103 mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
104 journal->j_trans_id, flushed); 104 journal->j_trans_id, flushed);
105 105
106 ocfs2_kick_vote_thread(osb); 106 ocfs2_kick_vote_thread(osb);
107 wake_up(&journal->j_checkpointed); 107 wake_up(&journal->j_checkpointed);
108 finally: 108 finally:
109 mlog_exit(status); 109 mlog_exit(status);
110 return status; 110 return status;
111 } 111 }
112 112
113 /* pass it NULL and it will allocate a new handle object for you. If 113 /* pass it NULL and it will allocate a new handle object for you. If
114 * you pass it a handle however, it may still return error, in which 114 * you pass it a handle however, it may still return error, in which
115 * case it has free'd the passed handle for you. */ 115 * case it has free'd the passed handle for you. */
116 handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) 116 handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
117 { 117 {
118 journal_t *journal = osb->journal->j_journal; 118 journal_t *journal = osb->journal->j_journal;
119 handle_t *handle; 119 handle_t *handle;
120 120
121 BUG_ON(!osb || !osb->journal->j_journal); 121 BUG_ON(!osb || !osb->journal->j_journal);
122 122
123 if (ocfs2_is_hard_readonly(osb)) 123 if (ocfs2_is_hard_readonly(osb))
124 return ERR_PTR(-EROFS); 124 return ERR_PTR(-EROFS);
125 125
126 BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE); 126 BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
127 BUG_ON(max_buffs <= 0); 127 BUG_ON(max_buffs <= 0);
128 128
129 /* JBD might support this, but our journalling code doesn't yet. */ 129 /* JBD might support this, but our journalling code doesn't yet. */
130 if (journal_current_handle()) { 130 if (journal_current_handle()) {
131 mlog(ML_ERROR, "Recursive transaction attempted!\n"); 131 mlog(ML_ERROR, "Recursive transaction attempted!\n");
132 BUG(); 132 BUG();
133 } 133 }
134 134
135 down_read(&osb->journal->j_trans_barrier); 135 down_read(&osb->journal->j_trans_barrier);
136 136
137 handle = journal_start(journal, max_buffs); 137 handle = journal_start(journal, max_buffs);
138 if (IS_ERR(handle)) { 138 if (IS_ERR(handle)) {
139 up_read(&osb->journal->j_trans_barrier); 139 up_read(&osb->journal->j_trans_barrier);
140 140
141 mlog_errno(PTR_ERR(handle)); 141 mlog_errno(PTR_ERR(handle));
142 142
143 if (is_journal_aborted(journal)) { 143 if (is_journal_aborted(journal)) {
144 ocfs2_abort(osb->sb, "Detected aborted journal"); 144 ocfs2_abort(osb->sb, "Detected aborted journal");
145 handle = ERR_PTR(-EROFS); 145 handle = ERR_PTR(-EROFS);
146 } 146 }
147 } else { 147 } else {
148 if (!ocfs2_mount_local(osb)) 148 if (!ocfs2_mount_local(osb))
149 atomic_inc(&(osb->journal->j_num_trans)); 149 atomic_inc(&(osb->journal->j_num_trans));
150 } 150 }
151 151
152 return handle; 152 return handle;
153 } 153 }
154 154
155 int ocfs2_commit_trans(struct ocfs2_super *osb, 155 int ocfs2_commit_trans(struct ocfs2_super *osb,
156 handle_t *handle) 156 handle_t *handle)
157 { 157 {
158 int ret; 158 int ret;
159 struct ocfs2_journal *journal = osb->journal; 159 struct ocfs2_journal *journal = osb->journal;
160 160
161 BUG_ON(!handle); 161 BUG_ON(!handle);
162 162
163 ret = journal_stop(handle); 163 ret = journal_stop(handle);
164 if (ret < 0) 164 if (ret < 0)
165 mlog_errno(ret); 165 mlog_errno(ret);
166 166
167 up_read(&journal->j_trans_barrier); 167 up_read(&journal->j_trans_barrier);
168 168
169 return ret; 169 return ret;
170 } 170 }
171 171
172 /* 172 /*
173 * 'nblocks' is what you want to add to the current 173 * 'nblocks' is what you want to add to the current
174 * transaction. extend_trans will either extend the current handle by 174 * transaction. extend_trans will either extend the current handle by
175 * nblocks, or commit it and start a new one with nblocks credits. 175 * nblocks, or commit it and start a new one with nblocks credits.
176 * 176 *
177 * WARNING: This will not release any semaphores or disk locks taken 177 * WARNING: This will not release any semaphores or disk locks taken
178 * during the transaction, so make sure they were taken *before* 178 * during the transaction, so make sure they were taken *before*
179 * start_trans or we'll have ordering deadlocks. 179 * start_trans or we'll have ordering deadlocks.
180 * 180 *
181 * WARNING2: Note that we do *not* drop j_trans_barrier here. This is 181 * WARNING2: Note that we do *not* drop j_trans_barrier here. This is
182 * good because transaction ids haven't yet been recorded on the 182 * good because transaction ids haven't yet been recorded on the
183 * cluster locks associated with this handle. 183 * cluster locks associated with this handle.
184 */ 184 */
185 int ocfs2_extend_trans(handle_t *handle, int nblocks) 185 int ocfs2_extend_trans(handle_t *handle, int nblocks)
186 { 186 {
187 int status; 187 int status;
188 188
189 BUG_ON(!handle); 189 BUG_ON(!handle);
190 BUG_ON(!nblocks); 190 BUG_ON(!nblocks);
191 191
192 mlog_entry_void(); 192 mlog_entry_void();
193 193
194 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); 194 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
195 195
196 status = journal_extend(handle, nblocks); 196 status = journal_extend(handle, nblocks);
197 if (status < 0) { 197 if (status < 0) {
198 mlog_errno(status); 198 mlog_errno(status);
199 goto bail; 199 goto bail;
200 } 200 }
201 201
202 if (status > 0) { 202 if (status > 0) {
203 mlog(0, "journal_extend failed, trying journal_restart\n"); 203 mlog(0, "journal_extend failed, trying journal_restart\n");
204 status = journal_restart(handle, nblocks); 204 status = journal_restart(handle, nblocks);
205 if (status < 0) { 205 if (status < 0) {
206 mlog_errno(status); 206 mlog_errno(status);
207 goto bail; 207 goto bail;
208 } 208 }
209 } 209 }
210 210
211 status = 0; 211 status = 0;
212 bail: 212 bail:
213 213
214 mlog_exit(status); 214 mlog_exit(status);
215 return status; 215 return status;
216 } 216 }
217 217
218 int ocfs2_journal_access(handle_t *handle, 218 int ocfs2_journal_access(handle_t *handle,
219 struct inode *inode, 219 struct inode *inode,
220 struct buffer_head *bh, 220 struct buffer_head *bh,
221 int type) 221 int type)
222 { 222 {
223 int status; 223 int status;
224 224
225 BUG_ON(!inode); 225 BUG_ON(!inode);
226 BUG_ON(!handle); 226 BUG_ON(!handle);
227 BUG_ON(!bh); 227 BUG_ON(!bh);
228 228
229 mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %zu\n", 229 mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %zu\n",
230 (unsigned long long)bh->b_blocknr, type, 230 (unsigned long long)bh->b_blocknr, type,
231 (type == OCFS2_JOURNAL_ACCESS_CREATE) ? 231 (type == OCFS2_JOURNAL_ACCESS_CREATE) ?
232 "OCFS2_JOURNAL_ACCESS_CREATE" : 232 "OCFS2_JOURNAL_ACCESS_CREATE" :
233 "OCFS2_JOURNAL_ACCESS_WRITE", 233 "OCFS2_JOURNAL_ACCESS_WRITE",
234 bh->b_size); 234 bh->b_size);
235 235
236 /* we can safely remove this assertion after testing. */ 236 /* we can safely remove this assertion after testing. */
237 if (!buffer_uptodate(bh)) { 237 if (!buffer_uptodate(bh)) {
238 mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); 238 mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
239 mlog(ML_ERROR, "b_blocknr=%llu\n", 239 mlog(ML_ERROR, "b_blocknr=%llu\n",
240 (unsigned long long)bh->b_blocknr); 240 (unsigned long long)bh->b_blocknr);
241 BUG(); 241 BUG();
242 } 242 }
243 243
244 /* Set the current transaction information on the inode so 244 /* Set the current transaction information on the inode so
245 * that the locking code knows whether it can drop it's locks 245 * that the locking code knows whether it can drop it's locks
246 * on this inode or not. We're protected from the commit 246 * on this inode or not. We're protected from the commit
247 * thread updating the current transaction id until 247 * thread updating the current transaction id until
248 * ocfs2_commit_trans() because ocfs2_start_trans() took 248 * ocfs2_commit_trans() because ocfs2_start_trans() took
249 * j_trans_barrier for us. */ 249 * j_trans_barrier for us. */
250 ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode); 250 ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode);
251 251
252 mutex_lock(&OCFS2_I(inode)->ip_io_mutex); 252 mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
253 switch (type) { 253 switch (type) {
254 case OCFS2_JOURNAL_ACCESS_CREATE: 254 case OCFS2_JOURNAL_ACCESS_CREATE:
255 case OCFS2_JOURNAL_ACCESS_WRITE: 255 case OCFS2_JOURNAL_ACCESS_WRITE:
256 status = journal_get_write_access(handle, bh); 256 status = journal_get_write_access(handle, bh);
257 break; 257 break;
258 258
259 case OCFS2_JOURNAL_ACCESS_UNDO: 259 case OCFS2_JOURNAL_ACCESS_UNDO:
260 status = journal_get_undo_access(handle, bh); 260 status = journal_get_undo_access(handle, bh);
261 break; 261 break;
262 262
263 default: 263 default:
264 status = -EINVAL; 264 status = -EINVAL;
265 mlog(ML_ERROR, "Uknown access type!\n"); 265 mlog(ML_ERROR, "Uknown access type!\n");
266 } 266 }
267 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); 267 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
268 268
269 if (status < 0) 269 if (status < 0)
270 mlog(ML_ERROR, "Error %d getting %d access to buffer!\n", 270 mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
271 status, type); 271 status, type);
272 272
273 mlog_exit(status); 273 mlog_exit(status);
274 return status; 274 return status;
275 } 275 }
276 276
277 int ocfs2_journal_dirty(handle_t *handle, 277 int ocfs2_journal_dirty(handle_t *handle,
278 struct buffer_head *bh) 278 struct buffer_head *bh)
279 { 279 {
280 int status; 280 int status;
281 281
282 mlog_entry("(bh->b_blocknr=%llu)\n", 282 mlog_entry("(bh->b_blocknr=%llu)\n",
283 (unsigned long long)bh->b_blocknr); 283 (unsigned long long)bh->b_blocknr);
284 284
285 status = journal_dirty_metadata(handle, bh); 285 status = journal_dirty_metadata(handle, bh);
286 if (status < 0) 286 if (status < 0)
287 mlog(ML_ERROR, "Could not dirty metadata buffer. " 287 mlog(ML_ERROR, "Could not dirty metadata buffer. "
288 "(bh->b_blocknr=%llu)\n", 288 "(bh->b_blocknr=%llu)\n",
289 (unsigned long long)bh->b_blocknr); 289 (unsigned long long)bh->b_blocknr);
290 290
291 mlog_exit(status); 291 mlog_exit(status);
292 return status; 292 return status;
293 } 293 }
294 294
295 int ocfs2_journal_dirty_data(handle_t *handle, 295 int ocfs2_journal_dirty_data(handle_t *handle,
296 struct buffer_head *bh) 296 struct buffer_head *bh)
297 { 297 {
298 int err = journal_dirty_data(handle, bh); 298 int err = journal_dirty_data(handle, bh);
299 if (err) 299 if (err)
300 mlog_errno(err); 300 mlog_errno(err);
301 /* TODO: When we can handle it, abort the handle and go RO on 301 /* TODO: When we can handle it, abort the handle and go RO on
302 * error here. */ 302 * error here. */
303 303
304 return err; 304 return err;
305 } 305 }
306 306
307 #define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5) 307 #define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5)
308 308
309 void ocfs2_set_journal_params(struct ocfs2_super *osb) 309 void ocfs2_set_journal_params(struct ocfs2_super *osb)
310 { 310 {
311 journal_t *journal = osb->journal->j_journal; 311 journal_t *journal = osb->journal->j_journal;
312 312
313 spin_lock(&journal->j_state_lock); 313 spin_lock(&journal->j_state_lock);
314 journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL; 314 journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
315 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) 315 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
316 journal->j_flags |= JFS_BARRIER; 316 journal->j_flags |= JFS_BARRIER;
317 else 317 else
318 journal->j_flags &= ~JFS_BARRIER; 318 journal->j_flags &= ~JFS_BARRIER;
319 spin_unlock(&journal->j_state_lock); 319 spin_unlock(&journal->j_state_lock);
320 } 320 }
321 321
322 int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) 322 int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
323 { 323 {
324 int status = -1; 324 int status = -1;
325 struct inode *inode = NULL; /* the journal inode */ 325 struct inode *inode = NULL; /* the journal inode */
326 journal_t *j_journal = NULL; 326 journal_t *j_journal = NULL;
327 struct ocfs2_dinode *di = NULL; 327 struct ocfs2_dinode *di = NULL;
328 struct buffer_head *bh = NULL; 328 struct buffer_head *bh = NULL;
329 struct ocfs2_super *osb; 329 struct ocfs2_super *osb;
330 int meta_lock = 0; 330 int meta_lock = 0;
331 331
332 mlog_entry_void(); 332 mlog_entry_void();
333 333
334 BUG_ON(!journal); 334 BUG_ON(!journal);
335 335
336 osb = journal->j_osb; 336 osb = journal->j_osb;
337 337
338 /* already have the inode for our journal */ 338 /* already have the inode for our journal */
339 inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, 339 inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
340 osb->slot_num); 340 osb->slot_num);
341 if (inode == NULL) { 341 if (inode == NULL) {
342 status = -EACCES; 342 status = -EACCES;
343 mlog_errno(status); 343 mlog_errno(status);
344 goto done; 344 goto done;
345 } 345 }
346 if (is_bad_inode(inode)) { 346 if (is_bad_inode(inode)) {
347 mlog(ML_ERROR, "access error (bad inode)\n"); 347 mlog(ML_ERROR, "access error (bad inode)\n");
348 iput(inode); 348 iput(inode);
349 inode = NULL; 349 inode = NULL;
350 status = -EACCES; 350 status = -EACCES;
351 goto done; 351 goto done;
352 } 352 }
353 353
354 SET_INODE_JOURNAL(inode); 354 SET_INODE_JOURNAL(inode);
355 OCFS2_I(inode)->ip_open_count++; 355 OCFS2_I(inode)->ip_open_count++;
356 356
357 /* Skip recovery waits here - journal inode metadata never 357 /* Skip recovery waits here - journal inode metadata never
358 * changes in a live cluster so it can be considered an 358 * changes in a live cluster so it can be considered an
359 * exception to the rule. */ 359 * exception to the rule. */
360 status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); 360 status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
361 if (status < 0) { 361 if (status < 0) {
362 if (status != -ERESTARTSYS) 362 if (status != -ERESTARTSYS)
363 mlog(ML_ERROR, "Could not get lock on journal!\n"); 363 mlog(ML_ERROR, "Could not get lock on journal!\n");
364 goto done; 364 goto done;
365 } 365 }
366 366
367 meta_lock = 1; 367 meta_lock = 1;
368 di = (struct ocfs2_dinode *)bh->b_data; 368 di = (struct ocfs2_dinode *)bh->b_data;
369 369
370 if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) { 370 if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) {
371 mlog(ML_ERROR, "Journal file size (%lld) is too small!\n", 371 mlog(ML_ERROR, "Journal file size (%lld) is too small!\n",
372 inode->i_size); 372 inode->i_size);
373 status = -EINVAL; 373 status = -EINVAL;
374 goto done; 374 goto done;
375 } 375 }
376 376
377 mlog(0, "inode->i_size = %lld\n", inode->i_size); 377 mlog(0, "inode->i_size = %lld\n", inode->i_size);
378 mlog(0, "inode->i_blocks = %llu\n", 378 mlog(0, "inode->i_blocks = %llu\n",
379 (unsigned long long)inode->i_blocks); 379 (unsigned long long)inode->i_blocks);
380 mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters); 380 mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
381 381
382 /* call the kernels journal init function now */ 382 /* call the kernels journal init function now */
383 j_journal = journal_init_inode(inode); 383 j_journal = journal_init_inode(inode);
384 if (j_journal == NULL) { 384 if (j_journal == NULL) {
385 mlog(ML_ERROR, "Linux journal layer error\n"); 385 mlog(ML_ERROR, "Linux journal layer error\n");
386 status = -EINVAL; 386 status = -EINVAL;
387 goto done; 387 goto done;
388 } 388 }
389 389
390 mlog(0, "Returned from journal_init_inode\n"); 390 mlog(0, "Returned from journal_init_inode\n");
391 mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen); 391 mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
392 392
393 *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & 393 *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
394 OCFS2_JOURNAL_DIRTY_FL); 394 OCFS2_JOURNAL_DIRTY_FL);
395 395
396 journal->j_journal = j_journal; 396 journal->j_journal = j_journal;
397 journal->j_inode = inode; 397 journal->j_inode = inode;
398 journal->j_bh = bh; 398 journal->j_bh = bh;
399 399
400 ocfs2_set_journal_params(osb); 400 ocfs2_set_journal_params(osb);
401 401
402 journal->j_state = OCFS2_JOURNAL_LOADED; 402 journal->j_state = OCFS2_JOURNAL_LOADED;
403 403
404 status = 0; 404 status = 0;
405 done: 405 done:
406 if (status < 0) { 406 if (status < 0) {
407 if (meta_lock) 407 if (meta_lock)
408 ocfs2_meta_unlock(inode, 1); 408 ocfs2_meta_unlock(inode, 1);
409 if (bh != NULL) 409 if (bh != NULL)
410 brelse(bh); 410 brelse(bh);
411 if (inode) { 411 if (inode) {
412 OCFS2_I(inode)->ip_open_count--; 412 OCFS2_I(inode)->ip_open_count--;
413 iput(inode); 413 iput(inode);
414 } 414 }
415 } 415 }
416 416
417 mlog_exit(status); 417 mlog_exit(status);
418 return status; 418 return status;
419 } 419 }
420 420
421 static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, 421 static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
422 int dirty) 422 int dirty)
423 { 423 {
424 int status; 424 int status;
425 unsigned int flags; 425 unsigned int flags;
426 struct ocfs2_journal *journal = osb->journal; 426 struct ocfs2_journal *journal = osb->journal;
427 struct buffer_head *bh = journal->j_bh; 427 struct buffer_head *bh = journal->j_bh;
428 struct ocfs2_dinode *fe; 428 struct ocfs2_dinode *fe;
429 429
430 mlog_entry_void(); 430 mlog_entry_void();
431 431
432 fe = (struct ocfs2_dinode *)bh->b_data; 432 fe = (struct ocfs2_dinode *)bh->b_data;
433 if (!OCFS2_IS_VALID_DINODE(fe)) { 433 if (!OCFS2_IS_VALID_DINODE(fe)) {
434 /* This is called from startup/shutdown which will 434 /* This is called from startup/shutdown which will
435 * handle the errors in a specific manner, so no need 435 * handle the errors in a specific manner, so no need
436 * to call ocfs2_error() here. */ 436 * to call ocfs2_error() here. */
437 mlog(ML_ERROR, "Journal dinode %llu has invalid " 437 mlog(ML_ERROR, "Journal dinode %llu has invalid "
438 "signature: %.*s", (unsigned long long)fe->i_blkno, 7, 438 "signature: %.*s", (unsigned long long)fe->i_blkno, 7,
439 fe->i_signature); 439 fe->i_signature);
440 status = -EIO; 440 status = -EIO;
441 goto out; 441 goto out;
442 } 442 }
443 443
444 flags = le32_to_cpu(fe->id1.journal1.ij_flags); 444 flags = le32_to_cpu(fe->id1.journal1.ij_flags);
445 if (dirty) 445 if (dirty)
446 flags |= OCFS2_JOURNAL_DIRTY_FL; 446 flags |= OCFS2_JOURNAL_DIRTY_FL;
447 else 447 else
448 flags &= ~OCFS2_JOURNAL_DIRTY_FL; 448 flags &= ~OCFS2_JOURNAL_DIRTY_FL;
449 fe->id1.journal1.ij_flags = cpu_to_le32(flags); 449 fe->id1.journal1.ij_flags = cpu_to_le32(flags);
450 450
451 status = ocfs2_write_block(osb, bh, journal->j_inode); 451 status = ocfs2_write_block(osb, bh, journal->j_inode);
452 if (status < 0) 452 if (status < 0)
453 mlog_errno(status); 453 mlog_errno(status);
454 454
455 out: 455 out:
456 mlog_exit(status); 456 mlog_exit(status);
457 return status; 457 return status;
458 } 458 }
459 459
460 /* 460 /*
461 * If the journal has been kmalloc'd it needs to be freed after this 461 * If the journal has been kmalloc'd it needs to be freed after this
462 * call. 462 * call.
463 */ 463 */
464 void ocfs2_journal_shutdown(struct ocfs2_super *osb) 464 void ocfs2_journal_shutdown(struct ocfs2_super *osb)
465 { 465 {
466 struct ocfs2_journal *journal = NULL; 466 struct ocfs2_journal *journal = NULL;
467 int status = 0; 467 int status = 0;
468 struct inode *inode = NULL; 468 struct inode *inode = NULL;
469 int num_running_trans = 0; 469 int num_running_trans = 0;
470 470
471 mlog_entry_void(); 471 mlog_entry_void();
472 472
473 BUG_ON(!osb); 473 BUG_ON(!osb);
474 474
475 journal = osb->journal; 475 journal = osb->journal;
476 if (!journal) 476 if (!journal)
477 goto done; 477 goto done;
478 478
479 inode = journal->j_inode; 479 inode = journal->j_inode;
480 480
481 if (journal->j_state != OCFS2_JOURNAL_LOADED) 481 if (journal->j_state != OCFS2_JOURNAL_LOADED)
482 goto done; 482 goto done;
483 483
484 /* need to inc inode use count as journal_destroy will iput. */ 484 /* need to inc inode use count as journal_destroy will iput. */
485 if (!igrab(inode)) 485 if (!igrab(inode))
486 BUG(); 486 BUG();
487 487
488 num_running_trans = atomic_read(&(osb->journal->j_num_trans)); 488 num_running_trans = atomic_read(&(osb->journal->j_num_trans));
489 if (num_running_trans > 0) 489 if (num_running_trans > 0)
490 mlog(0, "Shutting down journal: must wait on %d " 490 mlog(0, "Shutting down journal: must wait on %d "
491 "running transactions!\n", 491 "running transactions!\n",
492 num_running_trans); 492 num_running_trans);
493 493
494 /* Do a commit_cache here. It will flush our journal, *and* 494 /* Do a commit_cache here. It will flush our journal, *and*
495 * release any locks that are still held. 495 * release any locks that are still held.
496 * set the SHUTDOWN flag and release the trans lock. 496 * set the SHUTDOWN flag and release the trans lock.
497 * the commit thread will take the trans lock for us below. */ 497 * the commit thread will take the trans lock for us below. */
498 journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN; 498 journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN;
499 499
500 /* The OCFS2_JOURNAL_IN_SHUTDOWN will signal to commit_cache to not 500 /* The OCFS2_JOURNAL_IN_SHUTDOWN will signal to commit_cache to not
501 * drop the trans_lock (which we want to hold until we 501 * drop the trans_lock (which we want to hold until we
502 * completely destroy the journal. */ 502 * completely destroy the journal. */
503 if (osb->commit_task) { 503 if (osb->commit_task) {
504 /* Wait for the commit thread */ 504 /* Wait for the commit thread */
505 mlog(0, "Waiting for ocfs2commit to exit....\n"); 505 mlog(0, "Waiting for ocfs2commit to exit....\n");
506 kthread_stop(osb->commit_task); 506 kthread_stop(osb->commit_task);
507 osb->commit_task = NULL; 507 osb->commit_task = NULL;
508 } 508 }
509 509
510 BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0); 510 BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
511 511
512 if (ocfs2_mount_local(osb)) { 512 if (ocfs2_mount_local(osb)) {
513 journal_lock_updates(journal->j_journal); 513 journal_lock_updates(journal->j_journal);
514 status = journal_flush(journal->j_journal); 514 status = journal_flush(journal->j_journal);
515 journal_unlock_updates(journal->j_journal); 515 journal_unlock_updates(journal->j_journal);
516 if (status < 0) 516 if (status < 0)
517 mlog_errno(status); 517 mlog_errno(status);
518 } 518 }
519 519
520 if (status == 0) { 520 if (status == 0) {
521 /* 521 /*
522 * Do not toggle if flush was unsuccessful otherwise 522 * Do not toggle if flush was unsuccessful otherwise
523 * will leave dirty metadata in a "clean" journal 523 * will leave dirty metadata in a "clean" journal
524 */ 524 */
525 status = ocfs2_journal_toggle_dirty(osb, 0); 525 status = ocfs2_journal_toggle_dirty(osb, 0);
526 if (status < 0) 526 if (status < 0)
527 mlog_errno(status); 527 mlog_errno(status);
528 } 528 }
529 529
530 /* Shutdown the kernel journal system */ 530 /* Shutdown the kernel journal system */
531 journal_destroy(journal->j_journal); 531 journal_destroy(journal->j_journal);
532 532
533 OCFS2_I(inode)->ip_open_count--; 533 OCFS2_I(inode)->ip_open_count--;
534 534
535 /* unlock our journal */ 535 /* unlock our journal */
536 ocfs2_meta_unlock(inode, 1); 536 ocfs2_meta_unlock(inode, 1);
537 537
538 brelse(journal->j_bh); 538 brelse(journal->j_bh);
539 journal->j_bh = NULL; 539 journal->j_bh = NULL;
540 540
541 journal->j_state = OCFS2_JOURNAL_FREE; 541 journal->j_state = OCFS2_JOURNAL_FREE;
542 542
543 // up_write(&journal->j_trans_barrier); 543 // up_write(&journal->j_trans_barrier);
544 done: 544 done:
545 if (inode) 545 if (inode)
546 iput(inode); 546 iput(inode);
547 mlog_exit_void(); 547 mlog_exit_void();
548 } 548 }
549 549
550 static void ocfs2_clear_journal_error(struct super_block *sb, 550 static void ocfs2_clear_journal_error(struct super_block *sb,
551 journal_t *journal, 551 journal_t *journal,
552 int slot) 552 int slot)
553 { 553 {
554 int olderr; 554 int olderr;
555 555
556 olderr = journal_errno(journal); 556 olderr = journal_errno(journal);
557 if (olderr) { 557 if (olderr) {
558 mlog(ML_ERROR, "File system error %d recorded in " 558 mlog(ML_ERROR, "File system error %d recorded in "
559 "journal %u.\n", olderr, slot); 559 "journal %u.\n", olderr, slot);
560 mlog(ML_ERROR, "File system on device %s needs checking.\n", 560 mlog(ML_ERROR, "File system on device %s needs checking.\n",
561 sb->s_id); 561 sb->s_id);
562 562
563 journal_ack_err(journal); 563 journal_ack_err(journal);
564 journal_clear_err(journal); 564 journal_clear_err(journal);
565 } 565 }
566 } 566 }
567 567
568 int ocfs2_journal_load(struct ocfs2_journal *journal, int local) 568 int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
569 { 569 {
570 int status = 0; 570 int status = 0;
571 struct ocfs2_super *osb; 571 struct ocfs2_super *osb;
572 572
573 mlog_entry_void(); 573 mlog_entry_void();
574 574
575 if (!journal) 575 if (!journal)
576 BUG(); 576 BUG();
577 577
578 osb = journal->j_osb; 578 osb = journal->j_osb;
579 579
580 status = journal_load(journal->j_journal); 580 status = journal_load(journal->j_journal);
581 if (status < 0) { 581 if (status < 0) {
582 mlog(ML_ERROR, "Failed to load journal!\n"); 582 mlog(ML_ERROR, "Failed to load journal!\n");
583 goto done; 583 goto done;
584 } 584 }
585 585
586 ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num); 586 ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
587 587
588 status = ocfs2_journal_toggle_dirty(osb, 1); 588 status = ocfs2_journal_toggle_dirty(osb, 1);
589 if (status < 0) { 589 if (status < 0) {
590 mlog_errno(status); 590 mlog_errno(status);
591 goto done; 591 goto done;
592 } 592 }
593 593
594 /* Launch the commit thread */ 594 /* Launch the commit thread */
595 if (!local) { 595 if (!local) {
596 osb->commit_task = kthread_run(ocfs2_commit_thread, osb, 596 osb->commit_task = kthread_run(ocfs2_commit_thread, osb,
597 "ocfs2cmt"); 597 "ocfs2cmt");
598 if (IS_ERR(osb->commit_task)) { 598 if (IS_ERR(osb->commit_task)) {
599 status = PTR_ERR(osb->commit_task); 599 status = PTR_ERR(osb->commit_task);
600 osb->commit_task = NULL; 600 osb->commit_task = NULL;
601 mlog(ML_ERROR, "unable to launch ocfs2commit thread, " 601 mlog(ML_ERROR, "unable to launch ocfs2commit thread, "
602 "error=%d", status); 602 "error=%d", status);
603 goto done; 603 goto done;
604 } 604 }
605 } else 605 } else
606 osb->commit_task = NULL; 606 osb->commit_task = NULL;
607 607
608 done: 608 done:
609 mlog_exit(status); 609 mlog_exit(status);
610 return status; 610 return status;
611 } 611 }
612 612
613 613
614 /* 'full' flag tells us whether we clear out all blocks or if we just 614 /* 'full' flag tells us whether we clear out all blocks or if we just
615 * mark the journal clean */ 615 * mark the journal clean */
616 int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full) 616 int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
617 { 617 {
618 int status; 618 int status;
619 619
620 mlog_entry_void(); 620 mlog_entry_void();
621 621
622 BUG_ON(!journal); 622 BUG_ON(!journal);
623 623
624 status = journal_wipe(journal->j_journal, full); 624 status = journal_wipe(journal->j_journal, full);
625 if (status < 0) { 625 if (status < 0) {
626 mlog_errno(status); 626 mlog_errno(status);
627 goto bail; 627 goto bail;
628 } 628 }
629 629
630 status = ocfs2_journal_toggle_dirty(journal->j_osb, 0); 630 status = ocfs2_journal_toggle_dirty(journal->j_osb, 0);
631 if (status < 0) 631 if (status < 0)
632 mlog_errno(status); 632 mlog_errno(status);
633 633
634 bail: 634 bail:
635 mlog_exit(status); 635 mlog_exit(status);
636 return status; 636 return status;
637 } 637 }
638 638
639 /* 639 /*
640 * JBD Might read a cached version of another nodes journal file. We 640 * JBD Might read a cached version of another nodes journal file. We
641 * don't want this as this file changes often and we get no 641 * don't want this as this file changes often and we get no
642 * notification on those changes. The only way to be sure that we've 642 * notification on those changes. The only way to be sure that we've
643 * got the most up to date version of those blocks then is to force 643 * got the most up to date version of those blocks then is to force
644 * read them off disk. Just searching through the buffer cache won't 644 * read them off disk. Just searching through the buffer cache won't
645 * work as there may be pages backing this file which are still marked 645 * work as there may be pages backing this file which are still marked
646 * up to date. We know things can't change on this file underneath us 646 * up to date. We know things can't change on this file underneath us
647 * as we have the lock by now :) 647 * as we have the lock by now :)
648 */ 648 */
649 static int ocfs2_force_read_journal(struct inode *inode) 649 static int ocfs2_force_read_journal(struct inode *inode)
650 { 650 {
651 int status = 0; 651 int status = 0;
652 int i, p_blocks; 652 int i;
653 u64 v_blkno, p_blkno; 653 u64 v_blkno, p_blkno, p_blocks;
654 #define CONCURRENT_JOURNAL_FILL 32 654 #define CONCURRENT_JOURNAL_FILL 32ULL
655 struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; 655 struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
656 656
657 mlog_entry_void(); 657 mlog_entry_void();
658 658
659 BUG_ON(inode->i_blocks != 659 BUG_ON(inode->i_blocks !=
660 ocfs2_align_bytes_to_sectors(i_size_read(inode))); 660 ocfs2_align_bytes_to_sectors(i_size_read(inode)));
661 661
662 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); 662 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
663 663
664 mlog(0, "Force reading %llu blocks\n", 664 mlog(0, "Force reading %llu blocks\n",
665 (unsigned long long)(inode->i_blocks >> 665 (unsigned long long)(inode->i_blocks >>
666 (inode->i_sb->s_blocksize_bits - 9))); 666 (inode->i_sb->s_blocksize_bits - 9)));
667 667
668 v_blkno = 0; 668 v_blkno = 0;
669 while (v_blkno < 669 while (v_blkno <
670 (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) { 670 (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
671 671
672 status = ocfs2_extent_map_get_blocks(inode, v_blkno, 672 status = ocfs2_extent_map_get_blocks(inode, v_blkno,
673 &p_blkno, &p_blocks, NULL); 673 &p_blkno, &p_blocks, NULL);
674 if (status < 0) { 674 if (status < 0) {
675 mlog_errno(status); 675 mlog_errno(status);
676 goto bail; 676 goto bail;
677 } 677 }
678 678
679 if (p_blocks > CONCURRENT_JOURNAL_FILL) 679 if (p_blocks > CONCURRENT_JOURNAL_FILL)
680 p_blocks = CONCURRENT_JOURNAL_FILL; 680 p_blocks = CONCURRENT_JOURNAL_FILL;
681 681
682 /* We are reading journal data which should not 682 /* We are reading journal data which should not
683 * be put in the uptodate cache */ 683 * be put in the uptodate cache */
684 status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb), 684 status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
685 p_blkno, p_blocks, bhs, 0, 685 p_blkno, p_blocks, bhs, 0,
686 NULL); 686 NULL);
687 if (status < 0) { 687 if (status < 0) {
688 mlog_errno(status); 688 mlog_errno(status);
689 goto bail; 689 goto bail;
690 } 690 }
691 691
692 for(i = 0; i < p_blocks; i++) { 692 for(i = 0; i < p_blocks; i++) {
693 brelse(bhs[i]); 693 brelse(bhs[i]);
694 bhs[i] = NULL; 694 bhs[i] = NULL;
695 } 695 }
696 696
697 v_blkno += p_blocks; 697 v_blkno += p_blocks;
698 } 698 }
699 699
700 bail: 700 bail:
701 for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) 701 for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
702 if (bhs[i]) 702 if (bhs[i])
703 brelse(bhs[i]); 703 brelse(bhs[i]);
704 mlog_exit(status); 704 mlog_exit(status);
705 return status; 705 return status;
706 } 706 }
707 707
708 struct ocfs2_la_recovery_item { 708 struct ocfs2_la_recovery_item {
709 struct list_head lri_list; 709 struct list_head lri_list;
710 int lri_slot; 710 int lri_slot;
711 struct ocfs2_dinode *lri_la_dinode; 711 struct ocfs2_dinode *lri_la_dinode;
712 struct ocfs2_dinode *lri_tl_dinode; 712 struct ocfs2_dinode *lri_tl_dinode;
713 }; 713 };
714 714
715 /* Does the second half of the recovery process. By this point, the 715 /* Does the second half of the recovery process. By this point, the
716 * node is marked clean and can actually be considered recovered, 716 * node is marked clean and can actually be considered recovered,
717 * hence it's no longer in the recovery map, but there's still some 717 * hence it's no longer in the recovery map, but there's still some
718 * cleanup we can do which shouldn't happen within the recovery thread 718 * cleanup we can do which shouldn't happen within the recovery thread
719 * as locking in that context becomes very difficult if we are to take 719 * as locking in that context becomes very difficult if we are to take
720 * recovering nodes into account. 720 * recovering nodes into account.
721 * 721 *
722 * NOTE: This function can and will sleep on recovery of other nodes 722 * NOTE: This function can and will sleep on recovery of other nodes
723 * during cluster locking, just like any other ocfs2 process. 723 * during cluster locking, just like any other ocfs2 process.
724 */ 724 */
725 void ocfs2_complete_recovery(struct work_struct *work) 725 void ocfs2_complete_recovery(struct work_struct *work)
726 { 726 {
727 int ret; 727 int ret;
728 struct ocfs2_journal *journal = 728 struct ocfs2_journal *journal =
729 container_of(work, struct ocfs2_journal, j_recovery_work); 729 container_of(work, struct ocfs2_journal, j_recovery_work);
730 struct ocfs2_super *osb = journal->j_osb; 730 struct ocfs2_super *osb = journal->j_osb;
731 struct ocfs2_dinode *la_dinode, *tl_dinode; 731 struct ocfs2_dinode *la_dinode, *tl_dinode;
732 struct ocfs2_la_recovery_item *item; 732 struct ocfs2_la_recovery_item *item;
733 struct list_head *p, *n; 733 struct list_head *p, *n;
734 LIST_HEAD(tmp_la_list); 734 LIST_HEAD(tmp_la_list);
735 735
736 mlog_entry_void(); 736 mlog_entry_void();
737 737
738 mlog(0, "completing recovery from keventd\n"); 738 mlog(0, "completing recovery from keventd\n");
739 739
740 spin_lock(&journal->j_lock); 740 spin_lock(&journal->j_lock);
741 list_splice_init(&journal->j_la_cleanups, &tmp_la_list); 741 list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
742 spin_unlock(&journal->j_lock); 742 spin_unlock(&journal->j_lock);
743 743
744 list_for_each_safe(p, n, &tmp_la_list) { 744 list_for_each_safe(p, n, &tmp_la_list) {
745 item = list_entry(p, struct ocfs2_la_recovery_item, lri_list); 745 item = list_entry(p, struct ocfs2_la_recovery_item, lri_list);
746 list_del_init(&item->lri_list); 746 list_del_init(&item->lri_list);
747 747
748 mlog(0, "Complete recovery for slot %d\n", item->lri_slot); 748 mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
749 749
750 la_dinode = item->lri_la_dinode; 750 la_dinode = item->lri_la_dinode;
751 if (la_dinode) { 751 if (la_dinode) {
752 mlog(0, "Clean up local alloc %llu\n", 752 mlog(0, "Clean up local alloc %llu\n",
753 (unsigned long long)la_dinode->i_blkno); 753 (unsigned long long)la_dinode->i_blkno);
754 754
755 ret = ocfs2_complete_local_alloc_recovery(osb, 755 ret = ocfs2_complete_local_alloc_recovery(osb,
756 la_dinode); 756 la_dinode);
757 if (ret < 0) 757 if (ret < 0)
758 mlog_errno(ret); 758 mlog_errno(ret);
759 759
760 kfree(la_dinode); 760 kfree(la_dinode);
761 } 761 }
762 762
763 tl_dinode = item->lri_tl_dinode; 763 tl_dinode = item->lri_tl_dinode;
764 if (tl_dinode) { 764 if (tl_dinode) {
765 mlog(0, "Clean up truncate log %llu\n", 765 mlog(0, "Clean up truncate log %llu\n",
766 (unsigned long long)tl_dinode->i_blkno); 766 (unsigned long long)tl_dinode->i_blkno);
767 767
768 ret = ocfs2_complete_truncate_log_recovery(osb, 768 ret = ocfs2_complete_truncate_log_recovery(osb,
769 tl_dinode); 769 tl_dinode);
770 if (ret < 0) 770 if (ret < 0)
771 mlog_errno(ret); 771 mlog_errno(ret);
772 772
773 kfree(tl_dinode); 773 kfree(tl_dinode);
774 } 774 }
775 775
776 ret = ocfs2_recover_orphans(osb, item->lri_slot); 776 ret = ocfs2_recover_orphans(osb, item->lri_slot);
777 if (ret < 0) 777 if (ret < 0)
778 mlog_errno(ret); 778 mlog_errno(ret);
779 779
780 kfree(item); 780 kfree(item);
781 } 781 }
782 782
783 mlog(0, "Recovery completion\n"); 783 mlog(0, "Recovery completion\n");
784 mlog_exit_void(); 784 mlog_exit_void();
785 } 785 }
786 786
787 /* NOTE: This function always eats your references to la_dinode and 787 /* NOTE: This function always eats your references to la_dinode and
788 * tl_dinode, either manually on error, or by passing them to 788 * tl_dinode, either manually on error, or by passing them to
789 * ocfs2_complete_recovery */ 789 * ocfs2_complete_recovery */
790 static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, 790 static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
791 int slot_num, 791 int slot_num,
792 struct ocfs2_dinode *la_dinode, 792 struct ocfs2_dinode *la_dinode,
793 struct ocfs2_dinode *tl_dinode) 793 struct ocfs2_dinode *tl_dinode)
794 { 794 {
795 struct ocfs2_la_recovery_item *item; 795 struct ocfs2_la_recovery_item *item;
796 796
797 item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_NOFS); 797 item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_NOFS);
798 if (!item) { 798 if (!item) {
799 /* Though we wish to avoid it, we are in fact safe in 799 /* Though we wish to avoid it, we are in fact safe in
800 * skipping local alloc cleanup as fsck.ocfs2 is more 800 * skipping local alloc cleanup as fsck.ocfs2 is more
801 * than capable of reclaiming unused space. */ 801 * than capable of reclaiming unused space. */
802 if (la_dinode) 802 if (la_dinode)
803 kfree(la_dinode); 803 kfree(la_dinode);
804 804
805 if (tl_dinode) 805 if (tl_dinode)
806 kfree(tl_dinode); 806 kfree(tl_dinode);
807 807
808 mlog_errno(-ENOMEM); 808 mlog_errno(-ENOMEM);
809 return; 809 return;
810 } 810 }
811 811
812 INIT_LIST_HEAD(&item->lri_list); 812 INIT_LIST_HEAD(&item->lri_list);
813 item->lri_la_dinode = la_dinode; 813 item->lri_la_dinode = la_dinode;
814 item->lri_slot = slot_num; 814 item->lri_slot = slot_num;
815 item->lri_tl_dinode = tl_dinode; 815 item->lri_tl_dinode = tl_dinode;
816 816
817 spin_lock(&journal->j_lock); 817 spin_lock(&journal->j_lock);
818 list_add_tail(&item->lri_list, &journal->j_la_cleanups); 818 list_add_tail(&item->lri_list, &journal->j_la_cleanups);
819 queue_work(ocfs2_wq, &journal->j_recovery_work); 819 queue_work(ocfs2_wq, &journal->j_recovery_work);
820 spin_unlock(&journal->j_lock); 820 spin_unlock(&journal->j_lock);
821 } 821 }
822 822
823 /* Called by the mount code to queue recovery the last part of 823 /* Called by the mount code to queue recovery the last part of
824 * recovery for it's own slot. */ 824 * recovery for it's own slot. */
825 void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) 825 void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
826 { 826 {
827 struct ocfs2_journal *journal = osb->journal; 827 struct ocfs2_journal *journal = osb->journal;
828 828
829 if (osb->dirty) { 829 if (osb->dirty) {
830 /* No need to queue up our truncate_log as regular 830 /* No need to queue up our truncate_log as regular
831 * cleanup will catch that. */ 831 * cleanup will catch that. */
832 ocfs2_queue_recovery_completion(journal, 832 ocfs2_queue_recovery_completion(journal,
833 osb->slot_num, 833 osb->slot_num,
834 osb->local_alloc_copy, 834 osb->local_alloc_copy,
835 NULL); 835 NULL);
836 ocfs2_schedule_truncate_log_flush(osb, 0); 836 ocfs2_schedule_truncate_log_flush(osb, 0);
837 837
838 osb->local_alloc_copy = NULL; 838 osb->local_alloc_copy = NULL;
839 osb->dirty = 0; 839 osb->dirty = 0;
840 } 840 }
841 } 841 }
842 842
843 static int __ocfs2_recovery_thread(void *arg) 843 static int __ocfs2_recovery_thread(void *arg)
844 { 844 {
845 int status, node_num; 845 int status, node_num;
846 struct ocfs2_super *osb = arg; 846 struct ocfs2_super *osb = arg;
847 847
848 mlog_entry_void(); 848 mlog_entry_void();
849 849
850 status = ocfs2_wait_on_mount(osb); 850 status = ocfs2_wait_on_mount(osb);
851 if (status < 0) { 851 if (status < 0) {
852 goto bail; 852 goto bail;
853 } 853 }
854 854
855 restart: 855 restart:
856 status = ocfs2_super_lock(osb, 1); 856 status = ocfs2_super_lock(osb, 1);
857 if (status < 0) { 857 if (status < 0) {
858 mlog_errno(status); 858 mlog_errno(status);
859 goto bail; 859 goto bail;
860 } 860 }
861 861
862 while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { 862 while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
863 node_num = ocfs2_node_map_first_set_bit(osb, 863 node_num = ocfs2_node_map_first_set_bit(osb,
864 &osb->recovery_map); 864 &osb->recovery_map);
865 if (node_num == O2NM_INVALID_NODE_NUM) { 865 if (node_num == O2NM_INVALID_NODE_NUM) {
866 mlog(0, "Out of nodes to recover.\n"); 866 mlog(0, "Out of nodes to recover.\n");
867 break; 867 break;
868 } 868 }
869 869
870 status = ocfs2_recover_node(osb, node_num); 870 status = ocfs2_recover_node(osb, node_num);
871 if (status < 0) { 871 if (status < 0) {
872 mlog(ML_ERROR, 872 mlog(ML_ERROR,
873 "Error %d recovering node %d on device (%u,%u)!\n", 873 "Error %d recovering node %d on device (%u,%u)!\n",
874 status, node_num, 874 status, node_num,
875 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 875 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
876 mlog(ML_ERROR, "Volume requires unmount.\n"); 876 mlog(ML_ERROR, "Volume requires unmount.\n");
877 continue; 877 continue;
878 } 878 }
879 879
880 ocfs2_recovery_map_clear(osb, node_num); 880 ocfs2_recovery_map_clear(osb, node_num);
881 } 881 }
882 ocfs2_super_unlock(osb, 1); 882 ocfs2_super_unlock(osb, 1);
883 883
884 /* We always run recovery on our own orphan dir - the dead 884 /* We always run recovery on our own orphan dir - the dead
885 * node(s) may have voted "no" on an inode delete earlier. A 885 * node(s) may have voted "no" on an inode delete earlier. A
886 * revote is therefore required. */ 886 * revote is therefore required. */
887 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, 887 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
888 NULL); 888 NULL);
889 889
890 bail: 890 bail:
891 mutex_lock(&osb->recovery_lock); 891 mutex_lock(&osb->recovery_lock);
892 if (!status && 892 if (!status &&
893 !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { 893 !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
894 mutex_unlock(&osb->recovery_lock); 894 mutex_unlock(&osb->recovery_lock);
895 goto restart; 895 goto restart;
896 } 896 }
897 897
898 osb->recovery_thread_task = NULL; 898 osb->recovery_thread_task = NULL;
899 mb(); /* sync with ocfs2_recovery_thread_running */ 899 mb(); /* sync with ocfs2_recovery_thread_running */
900 wake_up(&osb->recovery_event); 900 wake_up(&osb->recovery_event);
901 901
902 mutex_unlock(&osb->recovery_lock); 902 mutex_unlock(&osb->recovery_lock);
903 903
904 mlog_exit(status); 904 mlog_exit(status);
905 /* no one is callint kthread_stop() for us so the kthread() api 905 /* no one is callint kthread_stop() for us so the kthread() api
906 * requires that we call do_exit(). And it isn't exported, but 906 * requires that we call do_exit(). And it isn't exported, but
907 * complete_and_exit() seems to be a minimal wrapper around it. */ 907 * complete_and_exit() seems to be a minimal wrapper around it. */
908 complete_and_exit(NULL, status); 908 complete_and_exit(NULL, status);
909 return status; 909 return status;
910 } 910 }
911 911
912 void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) 912 void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
913 { 913 {
914 mlog_entry("(node_num=%d, osb->node_num = %d)\n", 914 mlog_entry("(node_num=%d, osb->node_num = %d)\n",
915 node_num, osb->node_num); 915 node_num, osb->node_num);
916 916
917 mutex_lock(&osb->recovery_lock); 917 mutex_lock(&osb->recovery_lock);
918 if (osb->disable_recovery) 918 if (osb->disable_recovery)
919 goto out; 919 goto out;
920 920
921 /* People waiting on recovery will wait on 921 /* People waiting on recovery will wait on
922 * the recovery map to empty. */ 922 * the recovery map to empty. */
923 if (!ocfs2_recovery_map_set(osb, node_num)) 923 if (!ocfs2_recovery_map_set(osb, node_num))
924 mlog(0, "node %d already be in recovery.\n", node_num); 924 mlog(0, "node %d already be in recovery.\n", node_num);
925 925
926 mlog(0, "starting recovery thread...\n"); 926 mlog(0, "starting recovery thread...\n");
927 927
928 if (osb->recovery_thread_task) 928 if (osb->recovery_thread_task)
929 goto out; 929 goto out;
930 930
931 osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb, 931 osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb,
932 "ocfs2rec"); 932 "ocfs2rec");
933 if (IS_ERR(osb->recovery_thread_task)) { 933 if (IS_ERR(osb->recovery_thread_task)) {
934 mlog_errno((int)PTR_ERR(osb->recovery_thread_task)); 934 mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
935 osb->recovery_thread_task = NULL; 935 osb->recovery_thread_task = NULL;
936 } 936 }
937 937
938 out: 938 out:
939 mutex_unlock(&osb->recovery_lock); 939 mutex_unlock(&osb->recovery_lock);
940 wake_up(&osb->recovery_event); 940 wake_up(&osb->recovery_event);
941 941
942 mlog_exit_void(); 942 mlog_exit_void();
943 } 943 }
944 944
945 /* Does the actual journal replay and marks the journal inode as 945 /* Does the actual journal replay and marks the journal inode as
946 * clean. Will only replay if the journal inode is marked dirty. */ 946 * clean. Will only replay if the journal inode is marked dirty. */
947 static int ocfs2_replay_journal(struct ocfs2_super *osb, 947 static int ocfs2_replay_journal(struct ocfs2_super *osb,
948 int node_num, 948 int node_num,
949 int slot_num) 949 int slot_num)
950 { 950 {
951 int status; 951 int status;
952 int got_lock = 0; 952 int got_lock = 0;
953 unsigned int flags; 953 unsigned int flags;
954 struct inode *inode = NULL; 954 struct inode *inode = NULL;
955 struct ocfs2_dinode *fe; 955 struct ocfs2_dinode *fe;
956 journal_t *journal = NULL; 956 journal_t *journal = NULL;
957 struct buffer_head *bh = NULL; 957 struct buffer_head *bh = NULL;
958 958
959 inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, 959 inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
960 slot_num); 960 slot_num);
961 if (inode == NULL) { 961 if (inode == NULL) {
962 status = -EACCES; 962 status = -EACCES;
963 mlog_errno(status); 963 mlog_errno(status);
964 goto done; 964 goto done;
965 } 965 }
966 if (is_bad_inode(inode)) { 966 if (is_bad_inode(inode)) {
967 status = -EACCES; 967 status = -EACCES;
968 iput(inode); 968 iput(inode);
969 inode = NULL; 969 inode = NULL;
970 mlog_errno(status); 970 mlog_errno(status);
971 goto done; 971 goto done;
972 } 972 }
973 SET_INODE_JOURNAL(inode); 973 SET_INODE_JOURNAL(inode);
974 974
975 status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); 975 status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
976 if (status < 0) { 976 if (status < 0) {
977 mlog(0, "status returned from ocfs2_meta_lock=%d\n", status); 977 mlog(0, "status returned from ocfs2_meta_lock=%d\n", status);
978 if (status != -ERESTARTSYS) 978 if (status != -ERESTARTSYS)
979 mlog(ML_ERROR, "Could not lock journal!\n"); 979 mlog(ML_ERROR, "Could not lock journal!\n");
980 goto done; 980 goto done;
981 } 981 }
982 got_lock = 1; 982 got_lock = 1;
983 983
984 fe = (struct ocfs2_dinode *) bh->b_data; 984 fe = (struct ocfs2_dinode *) bh->b_data;
985 985
986 flags = le32_to_cpu(fe->id1.journal1.ij_flags); 986 flags = le32_to_cpu(fe->id1.journal1.ij_flags);
987 987
988 if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) { 988 if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
989 mlog(0, "No recovery required for node %d\n", node_num); 989 mlog(0, "No recovery required for node %d\n", node_num);
990 goto done; 990 goto done;
991 } 991 }
992 992
993 mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n", 993 mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
994 node_num, slot_num, 994 node_num, slot_num,
995 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 995 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
996 996
997 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 997 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
998 998
999 status = ocfs2_force_read_journal(inode); 999 status = ocfs2_force_read_journal(inode);
1000 if (status < 0) { 1000 if (status < 0) {
1001 mlog_errno(status); 1001 mlog_errno(status);
1002 goto done; 1002 goto done;
1003 } 1003 }
1004 1004
1005 mlog(0, "calling journal_init_inode\n"); 1005 mlog(0, "calling journal_init_inode\n");
1006 journal = journal_init_inode(inode); 1006 journal = journal_init_inode(inode);
1007 if (journal == NULL) { 1007 if (journal == NULL) {
1008 mlog(ML_ERROR, "Linux journal layer error\n"); 1008 mlog(ML_ERROR, "Linux journal layer error\n");
1009 status = -EIO; 1009 status = -EIO;
1010 goto done; 1010 goto done;
1011 } 1011 }
1012 1012
1013 status = journal_load(journal); 1013 status = journal_load(journal);
1014 if (status < 0) { 1014 if (status < 0) {
1015 mlog_errno(status); 1015 mlog_errno(status);
1016 if (!igrab(inode)) 1016 if (!igrab(inode))
1017 BUG(); 1017 BUG();
1018 journal_destroy(journal); 1018 journal_destroy(journal);
1019 goto done; 1019 goto done;
1020 } 1020 }
1021 1021
1022 ocfs2_clear_journal_error(osb->sb, journal, slot_num); 1022 ocfs2_clear_journal_error(osb->sb, journal, slot_num);
1023 1023
1024 /* wipe the journal */ 1024 /* wipe the journal */
1025 mlog(0, "flushing the journal.\n"); 1025 mlog(0, "flushing the journal.\n");
1026 journal_lock_updates(journal); 1026 journal_lock_updates(journal);
1027 status = journal_flush(journal); 1027 status = journal_flush(journal);
1028 journal_unlock_updates(journal); 1028 journal_unlock_updates(journal);
1029 if (status < 0) 1029 if (status < 0)
1030 mlog_errno(status); 1030 mlog_errno(status);
1031 1031
1032 /* This will mark the node clean */ 1032 /* This will mark the node clean */
1033 flags = le32_to_cpu(fe->id1.journal1.ij_flags); 1033 flags = le32_to_cpu(fe->id1.journal1.ij_flags);
1034 flags &= ~OCFS2_JOURNAL_DIRTY_FL; 1034 flags &= ~OCFS2_JOURNAL_DIRTY_FL;
1035 fe->id1.journal1.ij_flags = cpu_to_le32(flags); 1035 fe->id1.journal1.ij_flags = cpu_to_le32(flags);
1036 1036
1037 status = ocfs2_write_block(osb, bh, inode); 1037 status = ocfs2_write_block(osb, bh, inode);
1038 if (status < 0) 1038 if (status < 0)
1039 mlog_errno(status); 1039 mlog_errno(status);
1040 1040
1041 if (!igrab(inode)) 1041 if (!igrab(inode))
1042 BUG(); 1042 BUG();
1043 1043
1044 journal_destroy(journal); 1044 journal_destroy(journal);
1045 1045
1046 done: 1046 done:
1047 /* drop the lock on this nodes journal */ 1047 /* drop the lock on this nodes journal */
1048 if (got_lock) 1048 if (got_lock)
1049 ocfs2_meta_unlock(inode, 1); 1049 ocfs2_meta_unlock(inode, 1);
1050 1050
1051 if (inode) 1051 if (inode)
1052 iput(inode); 1052 iput(inode);
1053 1053
1054 if (bh) 1054 if (bh)
1055 brelse(bh); 1055 brelse(bh);
1056 1056
1057 mlog_exit(status); 1057 mlog_exit(status);
1058 return status; 1058 return status;
1059 } 1059 }
1060 1060
1061 /* 1061 /*
1062 * Do the most important parts of node recovery: 1062 * Do the most important parts of node recovery:
1063 * - Replay it's journal 1063 * - Replay it's journal
1064 * - Stamp a clean local allocator file 1064 * - Stamp a clean local allocator file
1065 * - Stamp a clean truncate log 1065 * - Stamp a clean truncate log
1066 * - Mark the node clean 1066 * - Mark the node clean
1067 * 1067 *
1068 * If this function completes without error, a node in OCFS2 can be 1068 * If this function completes without error, a node in OCFS2 can be
1069 * said to have been safely recovered. As a result, failure during the 1069 * said to have been safely recovered. As a result, failure during the
1070 * second part of a nodes recovery process (local alloc recovery) is 1070 * second part of a nodes recovery process (local alloc recovery) is
1071 * far less concerning. 1071 * far less concerning.
1072 */ 1072 */
1073 static int ocfs2_recover_node(struct ocfs2_super *osb, 1073 static int ocfs2_recover_node(struct ocfs2_super *osb,
1074 int node_num) 1074 int node_num)
1075 { 1075 {
1076 int status = 0; 1076 int status = 0;
1077 int slot_num; 1077 int slot_num;
1078 struct ocfs2_slot_info *si = osb->slot_info; 1078 struct ocfs2_slot_info *si = osb->slot_info;
1079 struct ocfs2_dinode *la_copy = NULL; 1079 struct ocfs2_dinode *la_copy = NULL;
1080 struct ocfs2_dinode *tl_copy = NULL; 1080 struct ocfs2_dinode *tl_copy = NULL;
1081 1081
1082 mlog_entry("(node_num=%d, osb->node_num = %d)\n", 1082 mlog_entry("(node_num=%d, osb->node_num = %d)\n",
1083 node_num, osb->node_num); 1083 node_num, osb->node_num);
1084 1084
1085 mlog(0, "checking node %d\n", node_num); 1085 mlog(0, "checking node %d\n", node_num);
1086 1086
1087 /* Should not ever be called to recover ourselves -- in that 1087 /* Should not ever be called to recover ourselves -- in that
1088 * case we should've called ocfs2_journal_load instead. */ 1088 * case we should've called ocfs2_journal_load instead. */
1089 BUG_ON(osb->node_num == node_num); 1089 BUG_ON(osb->node_num == node_num);
1090 1090
1091 slot_num = ocfs2_node_num_to_slot(si, node_num); 1091 slot_num = ocfs2_node_num_to_slot(si, node_num);
1092 if (slot_num == OCFS2_INVALID_SLOT) { 1092 if (slot_num == OCFS2_INVALID_SLOT) {
1093 status = 0; 1093 status = 0;
1094 mlog(0, "no slot for this node, so no recovery required.\n"); 1094 mlog(0, "no slot for this node, so no recovery required.\n");
1095 goto done; 1095 goto done;
1096 } 1096 }
1097 1097
1098 mlog(0, "node %d was using slot %d\n", node_num, slot_num); 1098 mlog(0, "node %d was using slot %d\n", node_num, slot_num);
1099 1099
1100 status = ocfs2_replay_journal(osb, node_num, slot_num); 1100 status = ocfs2_replay_journal(osb, node_num, slot_num);
1101 if (status < 0) { 1101 if (status < 0) {
1102 mlog_errno(status); 1102 mlog_errno(status);
1103 goto done; 1103 goto done;
1104 } 1104 }
1105 1105
1106 /* Stamp a clean local alloc file AFTER recovering the journal... */ 1106 /* Stamp a clean local alloc file AFTER recovering the journal... */
1107 status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy); 1107 status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy);
1108 if (status < 0) { 1108 if (status < 0) {
1109 mlog_errno(status); 1109 mlog_errno(status);
1110 goto done; 1110 goto done;
1111 } 1111 }
1112 1112
1113 /* An error from begin_truncate_log_recovery is not 1113 /* An error from begin_truncate_log_recovery is not
1114 * serious enough to warrant halting the rest of 1114 * serious enough to warrant halting the rest of
1115 * recovery. */ 1115 * recovery. */
1116 status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy); 1116 status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy);
1117 if (status < 0) 1117 if (status < 0)
1118 mlog_errno(status); 1118 mlog_errno(status);
1119 1119
1120 /* Likewise, this would be a strange but ultimately not so 1120 /* Likewise, this would be a strange but ultimately not so
1121 * harmful place to get an error... */ 1121 * harmful place to get an error... */
1122 ocfs2_clear_slot(si, slot_num); 1122 ocfs2_clear_slot(si, slot_num);
1123 status = ocfs2_update_disk_slots(osb, si); 1123 status = ocfs2_update_disk_slots(osb, si);
1124 if (status < 0) 1124 if (status < 0)
1125 mlog_errno(status); 1125 mlog_errno(status);
1126 1126
1127 /* This will kfree the memory pointed to by la_copy and tl_copy */ 1127 /* This will kfree the memory pointed to by la_copy and tl_copy */
1128 ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, 1128 ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
1129 tl_copy); 1129 tl_copy);
1130 1130
1131 status = 0; 1131 status = 0;
1132 done: 1132 done:
1133 1133
1134 mlog_exit(status); 1134 mlog_exit(status);
1135 return status; 1135 return status;
1136 } 1136 }
1137 1137
1138 /* Test node liveness by trylocking his journal. If we get the lock, 1138 /* Test node liveness by trylocking his journal. If we get the lock,
1139 * we drop it here. Return 0 if we got the lock, -EAGAIN if node is 1139 * we drop it here. Return 0 if we got the lock, -EAGAIN if node is
1140 * still alive (we couldn't get the lock) and < 0 on error. */ 1140 * still alive (we couldn't get the lock) and < 0 on error. */
1141 static int ocfs2_trylock_journal(struct ocfs2_super *osb, 1141 static int ocfs2_trylock_journal(struct ocfs2_super *osb,
1142 int slot_num) 1142 int slot_num)
1143 { 1143 {
1144 int status, flags; 1144 int status, flags;
1145 struct inode *inode = NULL; 1145 struct inode *inode = NULL;
1146 1146
1147 inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, 1147 inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
1148 slot_num); 1148 slot_num);
1149 if (inode == NULL) { 1149 if (inode == NULL) {
1150 mlog(ML_ERROR, "access error\n"); 1150 mlog(ML_ERROR, "access error\n");
1151 status = -EACCES; 1151 status = -EACCES;
1152 goto bail; 1152 goto bail;
1153 } 1153 }
1154 if (is_bad_inode(inode)) { 1154 if (is_bad_inode(inode)) {
1155 mlog(ML_ERROR, "access error (bad inode)\n"); 1155 mlog(ML_ERROR, "access error (bad inode)\n");
1156 iput(inode); 1156 iput(inode);
1157 inode = NULL; 1157 inode = NULL;
1158 status = -EACCES; 1158 status = -EACCES;
1159 goto bail; 1159 goto bail;
1160 } 1160 }
1161 SET_INODE_JOURNAL(inode); 1161 SET_INODE_JOURNAL(inode);
1162 1162
1163 flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE; 1163 flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE;
1164 status = ocfs2_meta_lock_full(inode, NULL, 1, flags); 1164 status = ocfs2_meta_lock_full(inode, NULL, 1, flags);
1165 if (status < 0) { 1165 if (status < 0) {
1166 if (status != -EAGAIN) 1166 if (status != -EAGAIN)
1167 mlog_errno(status); 1167 mlog_errno(status);
1168 goto bail; 1168 goto bail;
1169 } 1169 }
1170 1170
1171 ocfs2_meta_unlock(inode, 1); 1171 ocfs2_meta_unlock(inode, 1);
1172 bail: 1172 bail:
1173 if (inode) 1173 if (inode)
1174 iput(inode); 1174 iput(inode);
1175 1175
1176 return status; 1176 return status;
1177 } 1177 }
1178 1178
1179 /* Call this underneath ocfs2_super_lock. It also assumes that the 1179 /* Call this underneath ocfs2_super_lock. It also assumes that the
1180 * slot info struct has been updated from disk. */ 1180 * slot info struct has been updated from disk. */
1181 int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) 1181 int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1182 { 1182 {
1183 int status, i, node_num; 1183 int status, i, node_num;
1184 struct ocfs2_slot_info *si = osb->slot_info; 1184 struct ocfs2_slot_info *si = osb->slot_info;
1185 1185
1186 /* This is called with the super block cluster lock, so we 1186 /* This is called with the super block cluster lock, so we
1187 * know that the slot map can't change underneath us. */ 1187 * know that the slot map can't change underneath us. */
1188 1188
1189 spin_lock(&si->si_lock); 1189 spin_lock(&si->si_lock);
1190 for(i = 0; i < si->si_num_slots; i++) { 1190 for(i = 0; i < si->si_num_slots; i++) {
1191 if (i == osb->slot_num) 1191 if (i == osb->slot_num)
1192 continue; 1192 continue;
1193 if (ocfs2_is_empty_slot(si, i)) 1193 if (ocfs2_is_empty_slot(si, i))
1194 continue; 1194 continue;
1195 1195
1196 node_num = si->si_global_node_nums[i]; 1196 node_num = si->si_global_node_nums[i];
1197 if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num)) 1197 if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num))
1198 continue; 1198 continue;
1199 spin_unlock(&si->si_lock); 1199 spin_unlock(&si->si_lock);
1200 1200
1201 /* Ok, we have a slot occupied by another node which 1201 /* Ok, we have a slot occupied by another node which
1202 * is not in the recovery map. We trylock his journal 1202 * is not in the recovery map. We trylock his journal
1203 * file here to test if he's alive. */ 1203 * file here to test if he's alive. */
1204 status = ocfs2_trylock_journal(osb, i); 1204 status = ocfs2_trylock_journal(osb, i);
1205 if (!status) { 1205 if (!status) {
1206 /* Since we're called from mount, we know that 1206 /* Since we're called from mount, we know that
1207 * the recovery thread can't race us on 1207 * the recovery thread can't race us on
1208 * setting / checking the recovery bits. */ 1208 * setting / checking the recovery bits. */
1209 ocfs2_recovery_thread(osb, node_num); 1209 ocfs2_recovery_thread(osb, node_num);
1210 } else if ((status < 0) && (status != -EAGAIN)) { 1210 } else if ((status < 0) && (status != -EAGAIN)) {
1211 mlog_errno(status); 1211 mlog_errno(status);
1212 goto bail; 1212 goto bail;
1213 } 1213 }
1214 1214
1215 spin_lock(&si->si_lock); 1215 spin_lock(&si->si_lock);
1216 } 1216 }
1217 spin_unlock(&si->si_lock); 1217 spin_unlock(&si->si_lock);
1218 1218
1219 status = 0; 1219 status = 0;
1220 bail: 1220 bail:
1221 mlog_exit(status); 1221 mlog_exit(status);
1222 return status; 1222 return status;
1223 } 1223 }
1224 1224
1225 static int ocfs2_queue_orphans(struct ocfs2_super *osb, 1225 static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1226 int slot, 1226 int slot,
1227 struct inode **head) 1227 struct inode **head)
1228 { 1228 {
1229 int status; 1229 int status;
1230 struct inode *orphan_dir_inode = NULL; 1230 struct inode *orphan_dir_inode = NULL;
1231 struct inode *iter; 1231 struct inode *iter;
1232 unsigned long offset, blk, local; 1232 unsigned long offset, blk, local;
1233 struct buffer_head *bh = NULL; 1233 struct buffer_head *bh = NULL;
1234 struct ocfs2_dir_entry *de; 1234 struct ocfs2_dir_entry *de;
1235 struct super_block *sb = osb->sb; 1235 struct super_block *sb = osb->sb;
1236 1236
1237 orphan_dir_inode = ocfs2_get_system_file_inode(osb, 1237 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
1238 ORPHAN_DIR_SYSTEM_INODE, 1238 ORPHAN_DIR_SYSTEM_INODE,
1239 slot); 1239 slot);
1240 if (!orphan_dir_inode) { 1240 if (!orphan_dir_inode) {
1241 status = -ENOENT; 1241 status = -ENOENT;
1242 mlog_errno(status); 1242 mlog_errno(status);
1243 return status; 1243 return status;
1244 } 1244 }
1245 1245
1246 mutex_lock(&orphan_dir_inode->i_mutex); 1246 mutex_lock(&orphan_dir_inode->i_mutex);
1247 status = ocfs2_meta_lock(orphan_dir_inode, NULL, 0); 1247 status = ocfs2_meta_lock(orphan_dir_inode, NULL, 0);
1248 if (status < 0) { 1248 if (status < 0) {
1249 mlog_errno(status); 1249 mlog_errno(status);
1250 goto out; 1250 goto out;
1251 } 1251 }
1252 1252
1253 offset = 0; 1253 offset = 0;
1254 iter = NULL; 1254 iter = NULL;
1255 while(offset < i_size_read(orphan_dir_inode)) { 1255 while(offset < i_size_read(orphan_dir_inode)) {
1256 blk = offset >> sb->s_blocksize_bits; 1256 blk = offset >> sb->s_blocksize_bits;
1257 1257
1258 bh = ocfs2_bread(orphan_dir_inode, blk, &status, 0); 1258 bh = ocfs2_bread(orphan_dir_inode, blk, &status, 0);
1259 if (!bh) 1259 if (!bh)
1260 status = -EINVAL; 1260 status = -EINVAL;
1261 if (status < 0) { 1261 if (status < 0) {
1262 if (bh) 1262 if (bh)
1263 brelse(bh); 1263 brelse(bh);
1264 mlog_errno(status); 1264 mlog_errno(status);
1265 goto out_unlock; 1265 goto out_unlock;
1266 } 1266 }
1267 1267
1268 local = 0; 1268 local = 0;
1269 while(offset < i_size_read(orphan_dir_inode) 1269 while(offset < i_size_read(orphan_dir_inode)
1270 && local < sb->s_blocksize) { 1270 && local < sb->s_blocksize) {
1271 de = (struct ocfs2_dir_entry *) (bh->b_data + local); 1271 de = (struct ocfs2_dir_entry *) (bh->b_data + local);
1272 1272
1273 if (!ocfs2_check_dir_entry(orphan_dir_inode, 1273 if (!ocfs2_check_dir_entry(orphan_dir_inode,
1274 de, bh, local)) { 1274 de, bh, local)) {
1275 status = -EINVAL; 1275 status = -EINVAL;
1276 mlog_errno(status); 1276 mlog_errno(status);
1277 brelse(bh); 1277 brelse(bh);
1278 goto out_unlock; 1278 goto out_unlock;
1279 } 1279 }
1280 1280
1281 local += le16_to_cpu(de->rec_len); 1281 local += le16_to_cpu(de->rec_len);
1282 offset += le16_to_cpu(de->rec_len); 1282 offset += le16_to_cpu(de->rec_len);
1283 1283
1284 /* I guess we silently fail on no inode? */ 1284 /* I guess we silently fail on no inode? */
1285 if (!le64_to_cpu(de->inode)) 1285 if (!le64_to_cpu(de->inode))
1286 continue; 1286 continue;
1287 if (de->file_type > OCFS2_FT_MAX) { 1287 if (de->file_type > OCFS2_FT_MAX) {
1288 mlog(ML_ERROR, 1288 mlog(ML_ERROR,
1289 "block %llu contains invalid de: " 1289 "block %llu contains invalid de: "
1290 "inode = %llu, rec_len = %u, " 1290 "inode = %llu, rec_len = %u, "
1291 "name_len = %u, file_type = %u, " 1291 "name_len = %u, file_type = %u, "
1292 "name='%.*s'\n", 1292 "name='%.*s'\n",
1293 (unsigned long long)bh->b_blocknr, 1293 (unsigned long long)bh->b_blocknr,
1294 (unsigned long long)le64_to_cpu(de->inode), 1294 (unsigned long long)le64_to_cpu(de->inode),
1295 le16_to_cpu(de->rec_len), 1295 le16_to_cpu(de->rec_len),
1296 de->name_len, 1296 de->name_len,
1297 de->file_type, 1297 de->file_type,
1298 de->name_len, 1298 de->name_len,
1299 de->name); 1299 de->name);
1300 continue; 1300 continue;
1301 } 1301 }
1302 if (de->name_len == 1 && !strncmp(".", de->name, 1)) 1302 if (de->name_len == 1 && !strncmp(".", de->name, 1))
1303 continue; 1303 continue;
1304 if (de->name_len == 2 && !strncmp("..", de->name, 2)) 1304 if (de->name_len == 2 && !strncmp("..", de->name, 2))
1305 continue; 1305 continue;
1306 1306
1307 iter = ocfs2_iget(osb, le64_to_cpu(de->inode), 1307 iter = ocfs2_iget(osb, le64_to_cpu(de->inode),
1308 OCFS2_FI_FLAG_ORPHAN_RECOVERY); 1308 OCFS2_FI_FLAG_ORPHAN_RECOVERY);
1309 if (IS_ERR(iter)) 1309 if (IS_ERR(iter))
1310 continue; 1310 continue;
1311 1311
1312 mlog(0, "queue orphan %llu\n", 1312 mlog(0, "queue orphan %llu\n",
1313 (unsigned long long)OCFS2_I(iter)->ip_blkno); 1313 (unsigned long long)OCFS2_I(iter)->ip_blkno);
1314 /* No locking is required for the next_orphan 1314 /* No locking is required for the next_orphan
1315 * queue as there is only ever a single 1315 * queue as there is only ever a single
1316 * process doing orphan recovery. */ 1316 * process doing orphan recovery. */
1317 OCFS2_I(iter)->ip_next_orphan = *head; 1317 OCFS2_I(iter)->ip_next_orphan = *head;
1318 *head = iter; 1318 *head = iter;
1319 } 1319 }
1320 brelse(bh); 1320 brelse(bh);
1321 } 1321 }
1322 1322
1323 out_unlock: 1323 out_unlock:
1324 ocfs2_meta_unlock(orphan_dir_inode, 0); 1324 ocfs2_meta_unlock(orphan_dir_inode, 0);
1325 out: 1325 out:
1326 mutex_unlock(&orphan_dir_inode->i_mutex); 1326 mutex_unlock(&orphan_dir_inode->i_mutex);
1327 iput(orphan_dir_inode); 1327 iput(orphan_dir_inode);
1328 return status; 1328 return status;
1329 } 1329 }
1330 1330
1331 static int ocfs2_orphan_recovery_can_continue(struct ocfs2_super *osb, 1331 static int ocfs2_orphan_recovery_can_continue(struct ocfs2_super *osb,
1332 int slot) 1332 int slot)
1333 { 1333 {
1334 int ret; 1334 int ret;
1335 1335
1336 spin_lock(&osb->osb_lock); 1336 spin_lock(&osb->osb_lock);
1337 ret = !osb->osb_orphan_wipes[slot]; 1337 ret = !osb->osb_orphan_wipes[slot];
1338 spin_unlock(&osb->osb_lock); 1338 spin_unlock(&osb->osb_lock);
1339 return ret; 1339 return ret;
1340 } 1340 }
1341 1341
1342 static void ocfs2_mark_recovering_orphan_dir(struct ocfs2_super *osb, 1342 static void ocfs2_mark_recovering_orphan_dir(struct ocfs2_super *osb,
1343 int slot) 1343 int slot)
1344 { 1344 {
1345 spin_lock(&osb->osb_lock); 1345 spin_lock(&osb->osb_lock);
1346 /* Mark ourselves such that new processes in delete_inode() 1346 /* Mark ourselves such that new processes in delete_inode()
1347 * know to quit early. */ 1347 * know to quit early. */
1348 ocfs2_node_map_set_bit(osb, &osb->osb_recovering_orphan_dirs, slot); 1348 ocfs2_node_map_set_bit(osb, &osb->osb_recovering_orphan_dirs, slot);
1349 while (osb->osb_orphan_wipes[slot]) { 1349 while (osb->osb_orphan_wipes[slot]) {
1350 /* If any processes are already in the middle of an 1350 /* If any processes are already in the middle of an
1351 * orphan wipe on this dir, then we need to wait for 1351 * orphan wipe on this dir, then we need to wait for
1352 * them. */ 1352 * them. */
1353 spin_unlock(&osb->osb_lock); 1353 spin_unlock(&osb->osb_lock);
1354 wait_event_interruptible(osb->osb_wipe_event, 1354 wait_event_interruptible(osb->osb_wipe_event,
1355 ocfs2_orphan_recovery_can_continue(osb, slot)); 1355 ocfs2_orphan_recovery_can_continue(osb, slot));
1356 spin_lock(&osb->osb_lock); 1356 spin_lock(&osb->osb_lock);
1357 } 1357 }
1358 spin_unlock(&osb->osb_lock); 1358 spin_unlock(&osb->osb_lock);
1359 } 1359 }
1360 1360
1361 static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb, 1361 static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb,
1362 int slot) 1362 int slot)
1363 { 1363 {
1364 ocfs2_node_map_clear_bit(osb, &osb->osb_recovering_orphan_dirs, slot); 1364 ocfs2_node_map_clear_bit(osb, &osb->osb_recovering_orphan_dirs, slot);
1365 } 1365 }
1366 1366
1367 /* 1367 /*
1368 * Orphan recovery. Each mounted node has it's own orphan dir which we 1368 * Orphan recovery. Each mounted node has it's own orphan dir which we
1369 * must run during recovery. Our strategy here is to build a list of 1369 * must run during recovery. Our strategy here is to build a list of
1370 * the inodes in the orphan dir and iget/iput them. The VFS does 1370 * the inodes in the orphan dir and iget/iput them. The VFS does
1371 * (most) of the rest of the work. 1371 * (most) of the rest of the work.
1372 * 1372 *
1373 * Orphan recovery can happen at any time, not just mount so we have a 1373 * Orphan recovery can happen at any time, not just mount so we have a
1374 * couple of extra considerations. 1374 * couple of extra considerations.
1375 * 1375 *
1376 * - We grab as many inodes as we can under the orphan dir lock - 1376 * - We grab as many inodes as we can under the orphan dir lock -
1377 * doing iget() outside the orphan dir risks getting a reference on 1377 * doing iget() outside the orphan dir risks getting a reference on
1378 * an invalid inode. 1378 * an invalid inode.
1379 * - We must be sure not to deadlock with other processes on the 1379 * - We must be sure not to deadlock with other processes on the
1380 * system wanting to run delete_inode(). This can happen when they go 1380 * system wanting to run delete_inode(). This can happen when they go
1381 * to lock the orphan dir and the orphan recovery process attempts to 1381 * to lock the orphan dir and the orphan recovery process attempts to
1382 * iget() inside the orphan dir lock. This can be avoided by 1382 * iget() inside the orphan dir lock. This can be avoided by
1383 * advertising our state to ocfs2_delete_inode(). 1383 * advertising our state to ocfs2_delete_inode().
1384 */ 1384 */
1385 static int ocfs2_recover_orphans(struct ocfs2_super *osb, 1385 static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1386 int slot) 1386 int slot)
1387 { 1387 {
1388 int ret = 0; 1388 int ret = 0;
1389 struct inode *inode = NULL; 1389 struct inode *inode = NULL;
1390 struct inode *iter; 1390 struct inode *iter;
1391 struct ocfs2_inode_info *oi; 1391 struct ocfs2_inode_info *oi;
1392 1392
1393 mlog(0, "Recover inodes from orphan dir in slot %d\n", slot); 1393 mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
1394 1394
1395 ocfs2_mark_recovering_orphan_dir(osb, slot); 1395 ocfs2_mark_recovering_orphan_dir(osb, slot);
1396 ret = ocfs2_queue_orphans(osb, slot, &inode); 1396 ret = ocfs2_queue_orphans(osb, slot, &inode);
1397 ocfs2_clear_recovering_orphan_dir(osb, slot); 1397 ocfs2_clear_recovering_orphan_dir(osb, slot);
1398 1398
1399 /* Error here should be noted, but we want to continue with as 1399 /* Error here should be noted, but we want to continue with as
1400 * many queued inodes as we've got. */ 1400 * many queued inodes as we've got. */
1401 if (ret) 1401 if (ret)
1402 mlog_errno(ret); 1402 mlog_errno(ret);
1403 1403
1404 while (inode) { 1404 while (inode) {
1405 oi = OCFS2_I(inode); 1405 oi = OCFS2_I(inode);
1406 mlog(0, "iput orphan %llu\n", (unsigned long long)oi->ip_blkno); 1406 mlog(0, "iput orphan %llu\n", (unsigned long long)oi->ip_blkno);
1407 1407
1408 iter = oi->ip_next_orphan; 1408 iter = oi->ip_next_orphan;
1409 1409
1410 spin_lock(&oi->ip_lock); 1410 spin_lock(&oi->ip_lock);
1411 /* Delete voting may have set these on the assumption 1411 /* Delete voting may have set these on the assumption
1412 * that the other node would wipe them successfully. 1412 * that the other node would wipe them successfully.
1413 * If they are still in the node's orphan dir, we need 1413 * If they are still in the node's orphan dir, we need
1414 * to reset that state. */ 1414 * to reset that state. */
1415 oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE); 1415 oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
1416 1416
1417 /* Set the proper information to get us going into 1417 /* Set the proper information to get us going into
1418 * ocfs2_delete_inode. */ 1418 * ocfs2_delete_inode. */
1419 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; 1419 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
1420 spin_unlock(&oi->ip_lock); 1420 spin_unlock(&oi->ip_lock);
1421 1421
1422 iput(inode); 1422 iput(inode);
1423 1423
1424 inode = iter; 1424 inode = iter;
1425 } 1425 }
1426 1426
1427 return ret; 1427 return ret;
1428 } 1428 }
1429 1429
1430 static int ocfs2_wait_on_mount(struct ocfs2_super *osb) 1430 static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
1431 { 1431 {
1432 /* This check is good because ocfs2 will wait on our recovery 1432 /* This check is good because ocfs2 will wait on our recovery
1433 * thread before changing it to something other than MOUNTED 1433 * thread before changing it to something other than MOUNTED
1434 * or DISABLED. */ 1434 * or DISABLED. */
1435 wait_event(osb->osb_mount_event, 1435 wait_event(osb->osb_mount_event,
1436 atomic_read(&osb->vol_state) == VOLUME_MOUNTED || 1436 atomic_read(&osb->vol_state) == VOLUME_MOUNTED ||
1437 atomic_read(&osb->vol_state) == VOLUME_DISABLED); 1437 atomic_read(&osb->vol_state) == VOLUME_DISABLED);
1438 1438
1439 /* If there's an error on mount, then we may never get to the 1439 /* If there's an error on mount, then we may never get to the
1440 * MOUNTED flag, but this is set right before 1440 * MOUNTED flag, but this is set right before
1441 * dismount_volume() so we can trust it. */ 1441 * dismount_volume() so we can trust it. */
1442 if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) { 1442 if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) {
1443 mlog(0, "mount error, exiting!\n"); 1443 mlog(0, "mount error, exiting!\n");
1444 return -EBUSY; 1444 return -EBUSY;
1445 } 1445 }
1446 1446
1447 return 0; 1447 return 0;
1448 } 1448 }
1449 1449
1450 static int ocfs2_commit_thread(void *arg) 1450 static int ocfs2_commit_thread(void *arg)
1451 { 1451 {
1452 int status; 1452 int status;
1453 struct ocfs2_super *osb = arg; 1453 struct ocfs2_super *osb = arg;
1454 struct ocfs2_journal *journal = osb->journal; 1454 struct ocfs2_journal *journal = osb->journal;
1455 1455
1456 /* we can trust j_num_trans here because _should_stop() is only set in 1456 /* we can trust j_num_trans here because _should_stop() is only set in
1457 * shutdown and nobody other than ourselves should be able to start 1457 * shutdown and nobody other than ourselves should be able to start
1458 * transactions. committing on shutdown might take a few iterations 1458 * transactions. committing on shutdown might take a few iterations
1459 * as final transactions put deleted inodes on the list */ 1459 * as final transactions put deleted inodes on the list */
1460 while (!(kthread_should_stop() && 1460 while (!(kthread_should_stop() &&
1461 atomic_read(&journal->j_num_trans) == 0)) { 1461 atomic_read(&journal->j_num_trans) == 0)) {
1462 1462
1463 wait_event_interruptible(osb->checkpoint_event, 1463 wait_event_interruptible(osb->checkpoint_event,
1464 atomic_read(&journal->j_num_trans) 1464 atomic_read(&journal->j_num_trans)
1465 || kthread_should_stop()); 1465 || kthread_should_stop());
1466 1466
1467 status = ocfs2_commit_cache(osb); 1467 status = ocfs2_commit_cache(osb);
1468 if (status < 0) 1468 if (status < 0)
1469 mlog_errno(status); 1469 mlog_errno(status);
1470 1470
1471 if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){ 1471 if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){
1472 mlog(ML_KTHREAD, 1472 mlog(ML_KTHREAD,
1473 "commit_thread: %u transactions pending on " 1473 "commit_thread: %u transactions pending on "
1474 "shutdown\n", 1474 "shutdown\n",
1475 atomic_read(&journal->j_num_trans)); 1475 atomic_read(&journal->j_num_trans));
1476 } 1476 }
1477 } 1477 }
1478 1478
1479 return 0; 1479 return 0;
1480 } 1480 }
1481 1481
1482 /* Look for a dirty journal without taking any cluster locks. Used for 1482 /* Look for a dirty journal without taking any cluster locks. Used for
1483 * hard readonly access to determine whether the file system journals 1483 * hard readonly access to determine whether the file system journals
1484 * require recovery. */ 1484 * require recovery. */
1485 int ocfs2_check_journals_nolocks(struct ocfs2_super *osb) 1485 int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)
1486 { 1486 {
1487 int ret = 0; 1487 int ret = 0;
1488 unsigned int slot; 1488 unsigned int slot;
1489 struct buffer_head *di_bh; 1489 struct buffer_head *di_bh;
1490 struct ocfs2_dinode *di; 1490 struct ocfs2_dinode *di;
1491 struct inode *journal = NULL; 1491 struct inode *journal = NULL;
1492 1492
1493 for(slot = 0; slot < osb->max_slots; slot++) { 1493 for(slot = 0; slot < osb->max_slots; slot++) {
1494 journal = ocfs2_get_system_file_inode(osb, 1494 journal = ocfs2_get_system_file_inode(osb,
1495 JOURNAL_SYSTEM_INODE, 1495 JOURNAL_SYSTEM_INODE,
1496 slot); 1496 slot);
1497 if (!journal || is_bad_inode(journal)) { 1497 if (!journal || is_bad_inode(journal)) {
1498 ret = -EACCES; 1498 ret = -EACCES;
1499 mlog_errno(ret); 1499 mlog_errno(ret);
1500 goto out; 1500 goto out;
1501 } 1501 }
1502 1502
1503 di_bh = NULL; 1503 di_bh = NULL;
1504 ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh, 1504 ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh,
1505 0, journal); 1505 0, journal);
1506 if (ret < 0) { 1506 if (ret < 0) {
1507 mlog_errno(ret); 1507 mlog_errno(ret);
1508 goto out; 1508 goto out;
1509 } 1509 }
1510 1510
1511 di = (struct ocfs2_dinode *) di_bh->b_data; 1511 di = (struct ocfs2_dinode *) di_bh->b_data;
1512 1512
1513 if (le32_to_cpu(di->id1.journal1.ij_flags) & 1513 if (le32_to_cpu(di->id1.journal1.ij_flags) &
1514 OCFS2_JOURNAL_DIRTY_FL) 1514 OCFS2_JOURNAL_DIRTY_FL)
1515 ret = -EROFS; 1515 ret = -EROFS;
1516 1516
1517 brelse(di_bh); 1517 brelse(di_bh);
1518 if (ret) 1518 if (ret)
1519 break; 1519 break;
1520 } 1520 }
1521 1521
1522 out: 1522 out:
1523 if (journal) 1523 if (journal)
1524 iput(journal); 1524 iput(journal);
1525 1525
1526 return ret; 1526 return ret;
1527 } 1527 }
1528 1528
1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * namei.c 4 * namei.c
5 * 5 *
6 * Create and rename file, directory, symlinks 6 * Create and rename file, directory, symlinks
7 * 7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 * 9 *
10 * Portions of this code from linux/fs/ext3/dir.c 10 * Portions of this code from linux/fs/ext3/dir.c
11 * 11 *
12 * Copyright (C) 1992, 1993, 1994, 1995 12 * Copyright (C) 1992, 1993, 1994, 1995
13 * Remy Card (card@masi.ibp.fr) 13 * Remy Card (card@masi.ibp.fr)
14 * Laboratoire MASI - Institut Blaise pascal 14 * Laboratoire MASI - Institut Blaise pascal
15 * Universite Pierre et Marie Curie (Paris VI) 15 * Universite Pierre et Marie Curie (Paris VI)
16 * 16 *
17 * from 17 * from
18 * 18 *
19 * linux/fs/minix/dir.c 19 * linux/fs/minix/dir.c
20 * 20 *
21 * Copyright (C) 1991, 1992 Linus Torvalds 21 * Copyright (C) 1991, 1992 Linus Torvalds
22 * 22 *
23 * This program is free software; you can redistribute it and/or 23 * This program is free software; you can redistribute it and/or
24 * modify it under the terms of the GNU General Public 24 * modify it under the terms of the GNU General Public
25 * License as published by the Free Software Foundation; either 25 * License as published by the Free Software Foundation; either
26 * version 2 of the License, or (at your option) any later version. 26 * version 2 of the License, or (at your option) any later version.
27 * 27 *
28 * This program is distributed in the hope that it will be useful, 28 * This program is distributed in the hope that it will be useful,
29 * but WITHOUT ANY WARRANTY; without even the implied warranty of 29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
31 * General Public License for more details. 31 * General Public License for more details.
32 * 32 *
33 * You should have received a copy of the GNU General Public 33 * You should have received a copy of the GNU General Public
34 * License along with this program; if not, write to the 34 * License along with this program; if not, write to the
35 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 35 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
36 * Boston, MA 02111-1307, USA. 36 * Boston, MA 02111-1307, USA.
37 */ 37 */
38 38
39 #include <linux/fs.h> 39 #include <linux/fs.h>
40 #include <linux/types.h> 40 #include <linux/types.h>
41 #include <linux/slab.h> 41 #include <linux/slab.h>
42 #include <linux/highmem.h> 42 #include <linux/highmem.h>
43 43
44 #define MLOG_MASK_PREFIX ML_NAMEI 44 #define MLOG_MASK_PREFIX ML_NAMEI
45 #include <cluster/masklog.h> 45 #include <cluster/masklog.h>
46 46
47 #include "ocfs2.h" 47 #include "ocfs2.h"
48 48
49 #include "alloc.h" 49 #include "alloc.h"
50 #include "dcache.h" 50 #include "dcache.h"
51 #include "dir.h" 51 #include "dir.h"
52 #include "dlmglue.h" 52 #include "dlmglue.h"
53 #include "extent_map.h" 53 #include "extent_map.h"
54 #include "file.h" 54 #include "file.h"
55 #include "inode.h" 55 #include "inode.h"
56 #include "journal.h" 56 #include "journal.h"
57 #include "namei.h" 57 #include "namei.h"
58 #include "suballoc.h" 58 #include "suballoc.h"
59 #include "super.h" 59 #include "super.h"
60 #include "symlink.h" 60 #include "symlink.h"
61 #include "sysfile.h" 61 #include "sysfile.h"
62 #include "uptodate.h" 62 #include "uptodate.h"
63 #include "vote.h" 63 #include "vote.h"
64 64
65 #include "buffer_head_io.h" 65 #include "buffer_head_io.h"
66 66
67 #define NAMEI_RA_CHUNKS 2 67 #define NAMEI_RA_CHUNKS 2
68 #define NAMEI_RA_BLOCKS 4 68 #define NAMEI_RA_BLOCKS 4
69 #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) 69 #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
70 #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) 70 #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
71 71
72 static int inline ocfs2_search_dirblock(struct buffer_head *bh, 72 static int inline ocfs2_search_dirblock(struct buffer_head *bh,
73 struct inode *dir, 73 struct inode *dir,
74 const char *name, int namelen, 74 const char *name, int namelen,
75 unsigned long offset, 75 unsigned long offset,
76 struct ocfs2_dir_entry **res_dir); 76 struct ocfs2_dir_entry **res_dir);
77 77
78 static int ocfs2_delete_entry(handle_t *handle, 78 static int ocfs2_delete_entry(handle_t *handle,
79 struct inode *dir, 79 struct inode *dir,
80 struct ocfs2_dir_entry *de_del, 80 struct ocfs2_dir_entry *de_del,
81 struct buffer_head *bh); 81 struct buffer_head *bh);
82 82
83 static int __ocfs2_add_entry(handle_t *handle, 83 static int __ocfs2_add_entry(handle_t *handle,
84 struct inode *dir, 84 struct inode *dir,
85 const char *name, int namelen, 85 const char *name, int namelen,
86 struct inode *inode, u64 blkno, 86 struct inode *inode, u64 blkno,
87 struct buffer_head *parent_fe_bh, 87 struct buffer_head *parent_fe_bh,
88 struct buffer_head *insert_bh); 88 struct buffer_head *insert_bh);
89 89
90 static int ocfs2_mknod_locked(struct ocfs2_super *osb, 90 static int ocfs2_mknod_locked(struct ocfs2_super *osb,
91 struct inode *dir, 91 struct inode *dir,
92 struct dentry *dentry, int mode, 92 struct dentry *dentry, int mode,
93 dev_t dev, 93 dev_t dev,
94 struct buffer_head **new_fe_bh, 94 struct buffer_head **new_fe_bh,
95 struct buffer_head *parent_fe_bh, 95 struct buffer_head *parent_fe_bh,
96 handle_t *handle, 96 handle_t *handle,
97 struct inode **ret_inode, 97 struct inode **ret_inode,
98 struct ocfs2_alloc_context *inode_ac); 98 struct ocfs2_alloc_context *inode_ac);
99 99
100 static int ocfs2_fill_new_dir(struct ocfs2_super *osb, 100 static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
101 handle_t *handle, 101 handle_t *handle,
102 struct inode *parent, 102 struct inode *parent,
103 struct inode *inode, 103 struct inode *inode,
104 struct buffer_head *fe_bh, 104 struct buffer_head *fe_bh,
105 struct ocfs2_alloc_context *data_ac); 105 struct ocfs2_alloc_context *data_ac);
106 106
107 static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, 107 static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
108 struct inode **ret_orphan_dir, 108 struct inode **ret_orphan_dir,
109 struct inode *inode, 109 struct inode *inode,
110 char *name, 110 char *name,
111 struct buffer_head **de_bh); 111 struct buffer_head **de_bh);
112 112
113 static int ocfs2_orphan_add(struct ocfs2_super *osb, 113 static int ocfs2_orphan_add(struct ocfs2_super *osb,
114 handle_t *handle, 114 handle_t *handle,
115 struct inode *inode, 115 struct inode *inode,
116 struct ocfs2_dinode *fe, 116 struct ocfs2_dinode *fe,
117 char *name, 117 char *name,
118 struct buffer_head *de_bh, 118 struct buffer_head *de_bh,
119 struct inode *orphan_dir_inode); 119 struct inode *orphan_dir_inode);
120 120
121 static int ocfs2_create_symlink_data(struct ocfs2_super *osb, 121 static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
122 handle_t *handle, 122 handle_t *handle,
123 struct inode *inode, 123 struct inode *inode,
124 const char *symname); 124 const char *symname);
125 125
126 static inline int ocfs2_add_entry(handle_t *handle, 126 static inline int ocfs2_add_entry(handle_t *handle,
127 struct dentry *dentry, 127 struct dentry *dentry,
128 struct inode *inode, u64 blkno, 128 struct inode *inode, u64 blkno,
129 struct buffer_head *parent_fe_bh, 129 struct buffer_head *parent_fe_bh,
130 struct buffer_head *insert_bh) 130 struct buffer_head *insert_bh)
131 { 131 {
132 return __ocfs2_add_entry(handle, dentry->d_parent->d_inode, 132 return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
133 dentry->d_name.name, dentry->d_name.len, 133 dentry->d_name.name, dentry->d_name.len,
134 inode, blkno, parent_fe_bh, insert_bh); 134 inode, blkno, parent_fe_bh, insert_bh);
135 } 135 }
136 136
137 /* An orphan dir name is an 8 byte value, printed as a hex string */ 137 /* An orphan dir name is an 8 byte value, printed as a hex string */
138 #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) 138 #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
139 139
140 static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, 140 static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
141 struct nameidata *nd) 141 struct nameidata *nd)
142 { 142 {
143 int status; 143 int status;
144 u64 blkno; 144 u64 blkno;
145 struct buffer_head *dirent_bh = NULL; 145 struct buffer_head *dirent_bh = NULL;
146 struct inode *inode = NULL; 146 struct inode *inode = NULL;
147 struct dentry *ret; 147 struct dentry *ret;
148 struct ocfs2_dir_entry *dirent; 148 struct ocfs2_dir_entry *dirent;
149 struct ocfs2_inode_info *oi; 149 struct ocfs2_inode_info *oi;
150 150
151 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, 151 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
152 dentry->d_name.len, dentry->d_name.name); 152 dentry->d_name.len, dentry->d_name.name);
153 153
154 if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) { 154 if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) {
155 ret = ERR_PTR(-ENAMETOOLONG); 155 ret = ERR_PTR(-ENAMETOOLONG);
156 goto bail; 156 goto bail;
157 } 157 }
158 158
159 mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len, 159 mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len,
160 dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno); 160 dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno);
161 161
162 status = ocfs2_meta_lock(dir, NULL, 0); 162 status = ocfs2_meta_lock(dir, NULL, 0);
163 if (status < 0) { 163 if (status < 0) {
164 if (status != -ENOENT) 164 if (status != -ENOENT)
165 mlog_errno(status); 165 mlog_errno(status);
166 ret = ERR_PTR(status); 166 ret = ERR_PTR(status);
167 goto bail; 167 goto bail;
168 } 168 }
169 169
170 status = ocfs2_find_files_on_disk(dentry->d_name.name, 170 status = ocfs2_find_files_on_disk(dentry->d_name.name,
171 dentry->d_name.len, &blkno, 171 dentry->d_name.len, &blkno,
172 dir, &dirent_bh, &dirent); 172 dir, &dirent_bh, &dirent);
173 if (status < 0) 173 if (status < 0)
174 goto bail_add; 174 goto bail_add;
175 175
176 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); 176 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
177 if (IS_ERR(inode)) { 177 if (IS_ERR(inode)) {
178 ret = ERR_PTR(-EACCES); 178 ret = ERR_PTR(-EACCES);
179 goto bail_unlock; 179 goto bail_unlock;
180 } 180 }
181 181
182 oi = OCFS2_I(inode); 182 oi = OCFS2_I(inode);
183 /* Clear any orphaned state... If we were able to look up the 183 /* Clear any orphaned state... If we were able to look up the
184 * inode from a directory, it certainly can't be orphaned. We 184 * inode from a directory, it certainly can't be orphaned. We
185 * might have the bad state from a node which intended to 185 * might have the bad state from a node which intended to
186 * orphan this inode but crashed before it could commit the 186 * orphan this inode but crashed before it could commit the
187 * unlink. */ 187 * unlink. */
188 spin_lock(&oi->ip_lock); 188 spin_lock(&oi->ip_lock);
189 oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED; 189 oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
190 spin_unlock(&oi->ip_lock); 190 spin_unlock(&oi->ip_lock);
191 191
192 bail_add: 192 bail_add:
193 dentry->d_op = &ocfs2_dentry_ops; 193 dentry->d_op = &ocfs2_dentry_ops;
194 ret = d_splice_alias(inode, dentry); 194 ret = d_splice_alias(inode, dentry);
195 195
196 if (inode) { 196 if (inode) {
197 /* 197 /*
198 * If d_splice_alias() finds a DCACHE_DISCONNECTED 198 * If d_splice_alias() finds a DCACHE_DISCONNECTED
199 * dentry, it will d_move() it on top of ourse. The 199 * dentry, it will d_move() it on top of ourse. The
200 * return value will indicate this however, so in 200 * return value will indicate this however, so in
201 * those cases, we switch them around for the locking 201 * those cases, we switch them around for the locking
202 * code. 202 * code.
203 * 203 *
204 * NOTE: This dentry already has ->d_op set from 204 * NOTE: This dentry already has ->d_op set from
205 * ocfs2_get_parent() and ocfs2_get_dentry() 205 * ocfs2_get_parent() and ocfs2_get_dentry()
206 */ 206 */
207 if (ret) 207 if (ret)
208 dentry = ret; 208 dentry = ret;
209 209
210 status = ocfs2_dentry_attach_lock(dentry, inode, 210 status = ocfs2_dentry_attach_lock(dentry, inode,
211 OCFS2_I(dir)->ip_blkno); 211 OCFS2_I(dir)->ip_blkno);
212 if (status) { 212 if (status) {
213 mlog_errno(status); 213 mlog_errno(status);
214 ret = ERR_PTR(status); 214 ret = ERR_PTR(status);
215 goto bail_unlock; 215 goto bail_unlock;
216 } 216 }
217 } 217 }
218 218
219 bail_unlock: 219 bail_unlock:
220 /* Don't drop the cluster lock until *after* the d_add -- 220 /* Don't drop the cluster lock until *after* the d_add --
221 * unlink on another node will message us to remove that 221 * unlink on another node will message us to remove that
222 * dentry under this lock so otherwise we can race this with 222 * dentry under this lock so otherwise we can race this with
223 * the vote thread and have a stale dentry. */ 223 * the vote thread and have a stale dentry. */
224 ocfs2_meta_unlock(dir, 0); 224 ocfs2_meta_unlock(dir, 0);
225 225
226 bail: 226 bail:
227 if (dirent_bh) 227 if (dirent_bh)
228 brelse(dirent_bh); 228 brelse(dirent_bh);
229 229
230 mlog_exit_ptr(ret); 230 mlog_exit_ptr(ret);
231 231
232 return ret; 232 return ret;
233 } 233 }
234 234
235 static int ocfs2_fill_new_dir(struct ocfs2_super *osb, 235 static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
236 handle_t *handle, 236 handle_t *handle,
237 struct inode *parent, 237 struct inode *parent,
238 struct inode *inode, 238 struct inode *inode,
239 struct buffer_head *fe_bh, 239 struct buffer_head *fe_bh,
240 struct ocfs2_alloc_context *data_ac) 240 struct ocfs2_alloc_context *data_ac)
241 { 241 {
242 int status; 242 int status;
243 struct buffer_head *new_bh = NULL; 243 struct buffer_head *new_bh = NULL;
244 struct ocfs2_dir_entry *de = NULL; 244 struct ocfs2_dir_entry *de = NULL;
245 245
246 mlog_entry_void(); 246 mlog_entry_void();
247 247
248 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, 248 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
249 data_ac, NULL, &new_bh); 249 data_ac, NULL, &new_bh);
250 if (status < 0) { 250 if (status < 0) {
251 mlog_errno(status); 251 mlog_errno(status);
252 goto bail; 252 goto bail;
253 } 253 }
254 254
255 ocfs2_set_new_buffer_uptodate(inode, new_bh); 255 ocfs2_set_new_buffer_uptodate(inode, new_bh);
256 256
257 status = ocfs2_journal_access(handle, inode, new_bh, 257 status = ocfs2_journal_access(handle, inode, new_bh,
258 OCFS2_JOURNAL_ACCESS_CREATE); 258 OCFS2_JOURNAL_ACCESS_CREATE);
259 if (status < 0) { 259 if (status < 0) {
260 mlog_errno(status); 260 mlog_errno(status);
261 goto bail; 261 goto bail;
262 } 262 }
263 memset(new_bh->b_data, 0, osb->sb->s_blocksize); 263 memset(new_bh->b_data, 0, osb->sb->s_blocksize);
264 264
265 de = (struct ocfs2_dir_entry *) new_bh->b_data; 265 de = (struct ocfs2_dir_entry *) new_bh->b_data;
266 de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); 266 de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
267 de->name_len = 1; 267 de->name_len = 1;
268 de->rec_len = 268 de->rec_len =
269 cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); 269 cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
270 strcpy(de->name, "."); 270 strcpy(de->name, ".");
271 ocfs2_set_de_type(de, S_IFDIR); 271 ocfs2_set_de_type(de, S_IFDIR);
272 de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len)); 272 de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
273 de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno); 273 de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
274 de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize - 274 de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize -
275 OCFS2_DIR_REC_LEN(1)); 275 OCFS2_DIR_REC_LEN(1));
276 de->name_len = 2; 276 de->name_len = 2;
277 strcpy(de->name, ".."); 277 strcpy(de->name, "..");
278 ocfs2_set_de_type(de, S_IFDIR); 278 ocfs2_set_de_type(de, S_IFDIR);
279 279
280 status = ocfs2_journal_dirty(handle, new_bh); 280 status = ocfs2_journal_dirty(handle, new_bh);
281 if (status < 0) { 281 if (status < 0) {
282 mlog_errno(status); 282 mlog_errno(status);
283 goto bail; 283 goto bail;
284 } 284 }
285 285
286 i_size_write(inode, inode->i_sb->s_blocksize); 286 i_size_write(inode, inode->i_sb->s_blocksize);
287 inode->i_nlink = 2; 287 inode->i_nlink = 2;
288 inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize); 288 inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize);
289 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 289 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
290 if (status < 0) { 290 if (status < 0) {
291 mlog_errno(status); 291 mlog_errno(status);
292 goto bail; 292 goto bail;
293 } 293 }
294 294
295 status = 0; 295 status = 0;
296 bail: 296 bail:
297 if (new_bh) 297 if (new_bh)
298 brelse(new_bh); 298 brelse(new_bh);
299 299
300 mlog_exit(status); 300 mlog_exit(status);
301 return status; 301 return status;
302 } 302 }
303 303
304 static int ocfs2_mknod(struct inode *dir, 304 static int ocfs2_mknod(struct inode *dir,
305 struct dentry *dentry, 305 struct dentry *dentry,
306 int mode, 306 int mode,
307 dev_t dev) 307 dev_t dev)
308 { 308 {
309 int status = 0; 309 int status = 0;
310 struct buffer_head *parent_fe_bh = NULL; 310 struct buffer_head *parent_fe_bh = NULL;
311 handle_t *handle = NULL; 311 handle_t *handle = NULL;
312 struct ocfs2_super *osb; 312 struct ocfs2_super *osb;
313 struct ocfs2_dinode *dirfe; 313 struct ocfs2_dinode *dirfe;
314 struct buffer_head *new_fe_bh = NULL; 314 struct buffer_head *new_fe_bh = NULL;
315 struct buffer_head *de_bh = NULL; 315 struct buffer_head *de_bh = NULL;
316 struct inode *inode = NULL; 316 struct inode *inode = NULL;
317 struct ocfs2_alloc_context *inode_ac = NULL; 317 struct ocfs2_alloc_context *inode_ac = NULL;
318 struct ocfs2_alloc_context *data_ac = NULL; 318 struct ocfs2_alloc_context *data_ac = NULL;
319 319
320 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 320 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
321 (unsigned long)dev, dentry->d_name.len, 321 (unsigned long)dev, dentry->d_name.len,
322 dentry->d_name.name); 322 dentry->d_name.name);
323 323
324 /* get our super block */ 324 /* get our super block */
325 osb = OCFS2_SB(dir->i_sb); 325 osb = OCFS2_SB(dir->i_sb);
326 326
327 status = ocfs2_meta_lock(dir, &parent_fe_bh, 1); 327 status = ocfs2_meta_lock(dir, &parent_fe_bh, 1);
328 if (status < 0) { 328 if (status < 0) {
329 if (status != -ENOENT) 329 if (status != -ENOENT)
330 mlog_errno(status); 330 mlog_errno(status);
331 return status; 331 return status;
332 } 332 }
333 333
334 if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) { 334 if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) {
335 status = -EMLINK; 335 status = -EMLINK;
336 goto leave; 336 goto leave;
337 } 337 }
338 338
339 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; 339 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
340 if (!dirfe->i_links_count) { 340 if (!dirfe->i_links_count) {
341 /* can't make a file in a deleted directory. */ 341 /* can't make a file in a deleted directory. */
342 status = -ENOENT; 342 status = -ENOENT;
343 goto leave; 343 goto leave;
344 } 344 }
345 345
346 status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, 346 status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
347 dentry->d_name.len); 347 dentry->d_name.len);
348 if (status) 348 if (status)
349 goto leave; 349 goto leave;
350 350
351 /* get a spot inside the dir. */ 351 /* get a spot inside the dir. */
352 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, 352 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
353 dentry->d_name.name, 353 dentry->d_name.name,
354 dentry->d_name.len, &de_bh); 354 dentry->d_name.len, &de_bh);
355 if (status < 0) { 355 if (status < 0) {
356 mlog_errno(status); 356 mlog_errno(status);
357 goto leave; 357 goto leave;
358 } 358 }
359 359
360 /* reserve an inode spot */ 360 /* reserve an inode spot */
361 status = ocfs2_reserve_new_inode(osb, &inode_ac); 361 status = ocfs2_reserve_new_inode(osb, &inode_ac);
362 if (status < 0) { 362 if (status < 0) {
363 if (status != -ENOSPC) 363 if (status != -ENOSPC)
364 mlog_errno(status); 364 mlog_errno(status);
365 goto leave; 365 goto leave;
366 } 366 }
367 367
368 /* are we making a directory? If so, reserve a cluster for his 368 /* are we making a directory? If so, reserve a cluster for his
369 * 1st extent. */ 369 * 1st extent. */
370 if (S_ISDIR(mode)) { 370 if (S_ISDIR(mode)) {
371 status = ocfs2_reserve_clusters(osb, 1, &data_ac); 371 status = ocfs2_reserve_clusters(osb, 1, &data_ac);
372 if (status < 0) { 372 if (status < 0) {
373 if (status != -ENOSPC) 373 if (status != -ENOSPC)
374 mlog_errno(status); 374 mlog_errno(status);
375 goto leave; 375 goto leave;
376 } 376 }
377 } 377 }
378 378
379 handle = ocfs2_start_trans(osb, OCFS2_MKNOD_CREDITS); 379 handle = ocfs2_start_trans(osb, OCFS2_MKNOD_CREDITS);
380 if (IS_ERR(handle)) { 380 if (IS_ERR(handle)) {
381 status = PTR_ERR(handle); 381 status = PTR_ERR(handle);
382 handle = NULL; 382 handle = NULL;
383 mlog_errno(status); 383 mlog_errno(status);
384 goto leave; 384 goto leave;
385 } 385 }
386 386
387 /* do the real work now. */ 387 /* do the real work now. */
388 status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev, 388 status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev,
389 &new_fe_bh, parent_fe_bh, handle, 389 &new_fe_bh, parent_fe_bh, handle,
390 &inode, inode_ac); 390 &inode, inode_ac);
391 if (status < 0) { 391 if (status < 0) {
392 mlog_errno(status); 392 mlog_errno(status);
393 goto leave; 393 goto leave;
394 } 394 }
395 395
396 if (S_ISDIR(mode)) { 396 if (S_ISDIR(mode)) {
397 status = ocfs2_fill_new_dir(osb, handle, dir, inode, 397 status = ocfs2_fill_new_dir(osb, handle, dir, inode,
398 new_fe_bh, data_ac); 398 new_fe_bh, data_ac);
399 if (status < 0) { 399 if (status < 0) {
400 mlog_errno(status); 400 mlog_errno(status);
401 goto leave; 401 goto leave;
402 } 402 }
403 403
404 status = ocfs2_journal_access(handle, dir, parent_fe_bh, 404 status = ocfs2_journal_access(handle, dir, parent_fe_bh,
405 OCFS2_JOURNAL_ACCESS_WRITE); 405 OCFS2_JOURNAL_ACCESS_WRITE);
406 if (status < 0) { 406 if (status < 0) {
407 mlog_errno(status); 407 mlog_errno(status);
408 goto leave; 408 goto leave;
409 } 409 }
410 le16_add_cpu(&dirfe->i_links_count, 1); 410 le16_add_cpu(&dirfe->i_links_count, 1);
411 status = ocfs2_journal_dirty(handle, parent_fe_bh); 411 status = ocfs2_journal_dirty(handle, parent_fe_bh);
412 if (status < 0) { 412 if (status < 0) {
413 mlog_errno(status); 413 mlog_errno(status);
414 goto leave; 414 goto leave;
415 } 415 }
416 inc_nlink(dir); 416 inc_nlink(dir);
417 } 417 }
418 418
419 status = ocfs2_add_entry(handle, dentry, inode, 419 status = ocfs2_add_entry(handle, dentry, inode,
420 OCFS2_I(inode)->ip_blkno, parent_fe_bh, 420 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
421 de_bh); 421 de_bh);
422 if (status < 0) { 422 if (status < 0) {
423 mlog_errno(status); 423 mlog_errno(status);
424 goto leave; 424 goto leave;
425 } 425 }
426 426
427 status = ocfs2_dentry_attach_lock(dentry, inode, 427 status = ocfs2_dentry_attach_lock(dentry, inode,
428 OCFS2_I(dir)->ip_blkno); 428 OCFS2_I(dir)->ip_blkno);
429 if (status) { 429 if (status) {
430 mlog_errno(status); 430 mlog_errno(status);
431 goto leave; 431 goto leave;
432 } 432 }
433 433
434 insert_inode_hash(inode); 434 insert_inode_hash(inode);
435 dentry->d_op = &ocfs2_dentry_ops; 435 dentry->d_op = &ocfs2_dentry_ops;
436 d_instantiate(dentry, inode); 436 d_instantiate(dentry, inode);
437 status = 0; 437 status = 0;
438 leave: 438 leave:
439 if (handle) 439 if (handle)
440 ocfs2_commit_trans(osb, handle); 440 ocfs2_commit_trans(osb, handle);
441 441
442 ocfs2_meta_unlock(dir, 1); 442 ocfs2_meta_unlock(dir, 1);
443 443
444 if (status == -ENOSPC) 444 if (status == -ENOSPC)
445 mlog(0, "Disk is full\n"); 445 mlog(0, "Disk is full\n");
446 446
447 if (new_fe_bh) 447 if (new_fe_bh)
448 brelse(new_fe_bh); 448 brelse(new_fe_bh);
449 449
450 if (de_bh) 450 if (de_bh)
451 brelse(de_bh); 451 brelse(de_bh);
452 452
453 if (parent_fe_bh) 453 if (parent_fe_bh)
454 brelse(parent_fe_bh); 454 brelse(parent_fe_bh);
455 455
456 if ((status < 0) && inode) 456 if ((status < 0) && inode)
457 iput(inode); 457 iput(inode);
458 458
459 if (inode_ac) 459 if (inode_ac)
460 ocfs2_free_alloc_context(inode_ac); 460 ocfs2_free_alloc_context(inode_ac);
461 461
462 if (data_ac) 462 if (data_ac)
463 ocfs2_free_alloc_context(data_ac); 463 ocfs2_free_alloc_context(data_ac);
464 464
465 mlog_exit(status); 465 mlog_exit(status);
466 466
467 return status; 467 return status;
468 } 468 }
469 469
470 static int ocfs2_mknod_locked(struct ocfs2_super *osb, 470 static int ocfs2_mknod_locked(struct ocfs2_super *osb,
471 struct inode *dir, 471 struct inode *dir,
472 struct dentry *dentry, int mode, 472 struct dentry *dentry, int mode,
473 dev_t dev, 473 dev_t dev,
474 struct buffer_head **new_fe_bh, 474 struct buffer_head **new_fe_bh,
475 struct buffer_head *parent_fe_bh, 475 struct buffer_head *parent_fe_bh,
476 handle_t *handle, 476 handle_t *handle,
477 struct inode **ret_inode, 477 struct inode **ret_inode,
478 struct ocfs2_alloc_context *inode_ac) 478 struct ocfs2_alloc_context *inode_ac)
479 { 479 {
480 int status = 0; 480 int status = 0;
481 struct ocfs2_dinode *fe = NULL; 481 struct ocfs2_dinode *fe = NULL;
482 struct ocfs2_extent_list *fel; 482 struct ocfs2_extent_list *fel;
483 u64 fe_blkno = 0; 483 u64 fe_blkno = 0;
484 u16 suballoc_bit; 484 u16 suballoc_bit;
485 struct inode *inode = NULL; 485 struct inode *inode = NULL;
486 486
487 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 487 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
488 (unsigned long)dev, dentry->d_name.len, 488 (unsigned long)dev, dentry->d_name.len,
489 dentry->d_name.name); 489 dentry->d_name.name);
490 490
491 *new_fe_bh = NULL; 491 *new_fe_bh = NULL;
492 *ret_inode = NULL; 492 *ret_inode = NULL;
493 493
494 status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit, 494 status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
495 &fe_blkno); 495 &fe_blkno);
496 if (status < 0) { 496 if (status < 0) {
497 mlog_errno(status); 497 mlog_errno(status);
498 goto leave; 498 goto leave;
499 } 499 }
500 500
501 inode = new_inode(dir->i_sb); 501 inode = new_inode(dir->i_sb);
502 if (IS_ERR(inode)) { 502 if (IS_ERR(inode)) {
503 status = PTR_ERR(inode); 503 status = PTR_ERR(inode);
504 mlog(ML_ERROR, "new_inode failed!\n"); 504 mlog(ML_ERROR, "new_inode failed!\n");
505 goto leave; 505 goto leave;
506 } 506 }
507 507
508 /* populate as many fields early on as possible - many of 508 /* populate as many fields early on as possible - many of
509 * these are used by the support functions here and in 509 * these are used by the support functions here and in
510 * callers. */ 510 * callers. */
511 inode->i_ino = ino_from_blkno(osb->sb, fe_blkno); 511 inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
512 OCFS2_I(inode)->ip_blkno = fe_blkno; 512 OCFS2_I(inode)->ip_blkno = fe_blkno;
513 if (S_ISDIR(mode)) 513 if (S_ISDIR(mode))
514 inode->i_nlink = 2; 514 inode->i_nlink = 2;
515 else 515 else
516 inode->i_nlink = 1; 516 inode->i_nlink = 1;
517 inode->i_mode = mode; 517 inode->i_mode = mode;
518 spin_lock(&osb->osb_lock); 518 spin_lock(&osb->osb_lock);
519 inode->i_generation = osb->s_next_generation++; 519 inode->i_generation = osb->s_next_generation++;
520 spin_unlock(&osb->osb_lock); 520 spin_unlock(&osb->osb_lock);
521 521
522 *new_fe_bh = sb_getblk(osb->sb, fe_blkno); 522 *new_fe_bh = sb_getblk(osb->sb, fe_blkno);
523 if (!*new_fe_bh) { 523 if (!*new_fe_bh) {
524 status = -EIO; 524 status = -EIO;
525 mlog_errno(status); 525 mlog_errno(status);
526 goto leave; 526 goto leave;
527 } 527 }
528 ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh); 528 ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
529 529
530 status = ocfs2_journal_access(handle, inode, *new_fe_bh, 530 status = ocfs2_journal_access(handle, inode, *new_fe_bh,
531 OCFS2_JOURNAL_ACCESS_CREATE); 531 OCFS2_JOURNAL_ACCESS_CREATE);
532 if (status < 0) { 532 if (status < 0) {
533 mlog_errno(status); 533 mlog_errno(status);
534 goto leave; 534 goto leave;
535 } 535 }
536 536
537 fe = (struct ocfs2_dinode *) (*new_fe_bh)->b_data; 537 fe = (struct ocfs2_dinode *) (*new_fe_bh)->b_data;
538 memset(fe, 0, osb->sb->s_blocksize); 538 memset(fe, 0, osb->sb->s_blocksize);
539 539
540 fe->i_generation = cpu_to_le32(inode->i_generation); 540 fe->i_generation = cpu_to_le32(inode->i_generation);
541 fe->i_fs_generation = cpu_to_le32(osb->fs_generation); 541 fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
542 fe->i_blkno = cpu_to_le64(fe_blkno); 542 fe->i_blkno = cpu_to_le64(fe_blkno);
543 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); 543 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
544 fe->i_suballoc_slot = cpu_to_le16(osb->slot_num); 544 fe->i_suballoc_slot = cpu_to_le16(osb->slot_num);
545 fe->i_uid = cpu_to_le32(current->fsuid); 545 fe->i_uid = cpu_to_le32(current->fsuid);
546 if (dir->i_mode & S_ISGID) { 546 if (dir->i_mode & S_ISGID) {
547 fe->i_gid = cpu_to_le32(dir->i_gid); 547 fe->i_gid = cpu_to_le32(dir->i_gid);
548 if (S_ISDIR(mode)) 548 if (S_ISDIR(mode))
549 mode |= S_ISGID; 549 mode |= S_ISGID;
550 } else 550 } else
551 fe->i_gid = cpu_to_le32(current->fsgid); 551 fe->i_gid = cpu_to_le32(current->fsgid);
552 fe->i_mode = cpu_to_le16(mode); 552 fe->i_mode = cpu_to_le16(mode);
553 if (S_ISCHR(mode) || S_ISBLK(mode)) 553 if (S_ISCHR(mode) || S_ISBLK(mode))
554 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); 554 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
555 555
556 fe->i_links_count = cpu_to_le16(inode->i_nlink); 556 fe->i_links_count = cpu_to_le16(inode->i_nlink);
557 557
558 fe->i_last_eb_blk = 0; 558 fe->i_last_eb_blk = 0;
559 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); 559 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
560 le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL); 560 le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL);
561 fe->i_atime = fe->i_ctime = fe->i_mtime = 561 fe->i_atime = fe->i_ctime = fe->i_mtime =
562 cpu_to_le64(CURRENT_TIME.tv_sec); 562 cpu_to_le64(CURRENT_TIME.tv_sec);
563 fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = 563 fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
564 cpu_to_le32(CURRENT_TIME.tv_nsec); 564 cpu_to_le32(CURRENT_TIME.tv_nsec);
565 fe->i_dtime = 0; 565 fe->i_dtime = 0;
566 566
567 fel = &fe->id2.i_list; 567 fel = &fe->id2.i_list;
568 fel->l_tree_depth = 0; 568 fel->l_tree_depth = 0;
569 fel->l_next_free_rec = 0; 569 fel->l_next_free_rec = 0;
570 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb)); 570 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
571 571
572 status = ocfs2_journal_dirty(handle, *new_fe_bh); 572 status = ocfs2_journal_dirty(handle, *new_fe_bh);
573 if (status < 0) { 573 if (status < 0) {
574 mlog_errno(status); 574 mlog_errno(status);
575 goto leave; 575 goto leave;
576 } 576 }
577 577
578 if (ocfs2_populate_inode(inode, fe, 1) < 0) { 578 if (ocfs2_populate_inode(inode, fe, 1) < 0) {
579 mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, " 579 mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
580 "i_blkno=%llu, i_ino=%lu\n", 580 "i_blkno=%llu, i_ino=%lu\n",
581 (unsigned long long) (*new_fe_bh)->b_blocknr, 581 (unsigned long long) (*new_fe_bh)->b_blocknr,
582 (unsigned long long)fe->i_blkno, inode->i_ino); 582 (unsigned long long)fe->i_blkno, inode->i_ino);
583 BUG(); 583 BUG();
584 } 584 }
585 585
586 ocfs2_inode_set_new(osb, inode); 586 ocfs2_inode_set_new(osb, inode);
587 if (!ocfs2_mount_local(osb)) { 587 if (!ocfs2_mount_local(osb)) {
588 status = ocfs2_create_new_inode_locks(inode); 588 status = ocfs2_create_new_inode_locks(inode);
589 if (status < 0) 589 if (status < 0)
590 mlog_errno(status); 590 mlog_errno(status);
591 } 591 }
592 592
593 status = 0; /* error in ocfs2_create_new_inode_locks is not 593 status = 0; /* error in ocfs2_create_new_inode_locks is not
594 * critical */ 594 * critical */
595 595
596 *ret_inode = inode; 596 *ret_inode = inode;
597 leave: 597 leave:
598 if (status < 0) { 598 if (status < 0) {
599 if (*new_fe_bh) { 599 if (*new_fe_bh) {
600 brelse(*new_fe_bh); 600 brelse(*new_fe_bh);
601 *new_fe_bh = NULL; 601 *new_fe_bh = NULL;
602 } 602 }
603 if (inode) 603 if (inode)
604 iput(inode); 604 iput(inode);
605 } 605 }
606 606
607 mlog_exit(status); 607 mlog_exit(status);
608 return status; 608 return status;
609 } 609 }
610 610
611 static int ocfs2_mkdir(struct inode *dir, 611 static int ocfs2_mkdir(struct inode *dir,
612 struct dentry *dentry, 612 struct dentry *dentry,
613 int mode) 613 int mode)
614 { 614 {
615 int ret; 615 int ret;
616 616
617 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode, 617 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
618 dentry->d_name.len, dentry->d_name.name); 618 dentry->d_name.len, dentry->d_name.name);
619 ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0); 619 ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0);
620 mlog_exit(ret); 620 mlog_exit(ret);
621 621
622 return ret; 622 return ret;
623 } 623 }
624 624
625 static int ocfs2_create(struct inode *dir, 625 static int ocfs2_create(struct inode *dir,
626 struct dentry *dentry, 626 struct dentry *dentry,
627 int mode, 627 int mode,
628 struct nameidata *nd) 628 struct nameidata *nd)
629 { 629 {
630 int ret; 630 int ret;
631 631
632 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode, 632 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
633 dentry->d_name.len, dentry->d_name.name); 633 dentry->d_name.len, dentry->d_name.name);
634 ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0); 634 ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0);
635 mlog_exit(ret); 635 mlog_exit(ret);
636 636
637 return ret; 637 return ret;
638 } 638 }
639 639
640 static int ocfs2_link(struct dentry *old_dentry, 640 static int ocfs2_link(struct dentry *old_dentry,
641 struct inode *dir, 641 struct inode *dir,
642 struct dentry *dentry) 642 struct dentry *dentry)
643 { 643 {
644 handle_t *handle; 644 handle_t *handle;
645 struct inode *inode = old_dentry->d_inode; 645 struct inode *inode = old_dentry->d_inode;
646 int err; 646 int err;
647 struct buffer_head *fe_bh = NULL; 647 struct buffer_head *fe_bh = NULL;
648 struct buffer_head *parent_fe_bh = NULL; 648 struct buffer_head *parent_fe_bh = NULL;
649 struct buffer_head *de_bh = NULL; 649 struct buffer_head *de_bh = NULL;
650 struct ocfs2_dinode *fe = NULL; 650 struct ocfs2_dinode *fe = NULL;
651 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 651 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
652 652
653 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, 653 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
654 old_dentry->d_name.len, old_dentry->d_name.name, 654 old_dentry->d_name.len, old_dentry->d_name.name,
655 dentry->d_name.len, dentry->d_name.name); 655 dentry->d_name.len, dentry->d_name.name);
656 656
657 if (S_ISDIR(inode->i_mode)) 657 if (S_ISDIR(inode->i_mode))
658 return -EPERM; 658 return -EPERM;
659 659
660 err = ocfs2_meta_lock(dir, &parent_fe_bh, 1); 660 err = ocfs2_meta_lock(dir, &parent_fe_bh, 1);
661 if (err < 0) { 661 if (err < 0) {
662 if (err != -ENOENT) 662 if (err != -ENOENT)
663 mlog_errno(err); 663 mlog_errno(err);
664 return err; 664 return err;
665 } 665 }
666 666
667 if (!dir->i_nlink) { 667 if (!dir->i_nlink) {
668 err = -ENOENT; 668 err = -ENOENT;
669 goto out; 669 goto out;
670 } 670 }
671 671
672 err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, 672 err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
673 dentry->d_name.len); 673 dentry->d_name.len);
674 if (err) 674 if (err)
675 goto out; 675 goto out;
676 676
677 err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, 677 err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
678 dentry->d_name.name, 678 dentry->d_name.name,
679 dentry->d_name.len, &de_bh); 679 dentry->d_name.len, &de_bh);
680 if (err < 0) { 680 if (err < 0) {
681 mlog_errno(err); 681 mlog_errno(err);
682 goto out; 682 goto out;
683 } 683 }
684 684
685 err = ocfs2_meta_lock(inode, &fe_bh, 1); 685 err = ocfs2_meta_lock(inode, &fe_bh, 1);
686 if (err < 0) { 686 if (err < 0) {
687 if (err != -ENOENT) 687 if (err != -ENOENT)
688 mlog_errno(err); 688 mlog_errno(err);
689 goto out; 689 goto out;
690 } 690 }
691 691
692 fe = (struct ocfs2_dinode *) fe_bh->b_data; 692 fe = (struct ocfs2_dinode *) fe_bh->b_data;
693 if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) { 693 if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) {
694 err = -EMLINK; 694 err = -EMLINK;
695 goto out_unlock_inode; 695 goto out_unlock_inode;
696 } 696 }
697 697
698 handle = ocfs2_start_trans(osb, OCFS2_LINK_CREDITS); 698 handle = ocfs2_start_trans(osb, OCFS2_LINK_CREDITS);
699 if (IS_ERR(handle)) { 699 if (IS_ERR(handle)) {
700 err = PTR_ERR(handle); 700 err = PTR_ERR(handle);
701 handle = NULL; 701 handle = NULL;
702 mlog_errno(err); 702 mlog_errno(err);
703 goto out_unlock_inode; 703 goto out_unlock_inode;
704 } 704 }
705 705
706 err = ocfs2_journal_access(handle, inode, fe_bh, 706 err = ocfs2_journal_access(handle, inode, fe_bh,
707 OCFS2_JOURNAL_ACCESS_WRITE); 707 OCFS2_JOURNAL_ACCESS_WRITE);
708 if (err < 0) { 708 if (err < 0) {
709 mlog_errno(err); 709 mlog_errno(err);
710 goto out_commit; 710 goto out_commit;
711 } 711 }
712 712
713 inc_nlink(inode); 713 inc_nlink(inode);
714 inode->i_ctime = CURRENT_TIME; 714 inode->i_ctime = CURRENT_TIME;
715 fe->i_links_count = cpu_to_le16(inode->i_nlink); 715 fe->i_links_count = cpu_to_le16(inode->i_nlink);
716 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 716 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
717 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 717 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
718 718
719 err = ocfs2_journal_dirty(handle, fe_bh); 719 err = ocfs2_journal_dirty(handle, fe_bh);
720 if (err < 0) { 720 if (err < 0) {
721 le16_add_cpu(&fe->i_links_count, -1); 721 le16_add_cpu(&fe->i_links_count, -1);
722 drop_nlink(inode); 722 drop_nlink(inode);
723 mlog_errno(err); 723 mlog_errno(err);
724 goto out_commit; 724 goto out_commit;
725 } 725 }
726 726
727 err = ocfs2_add_entry(handle, dentry, inode, 727 err = ocfs2_add_entry(handle, dentry, inode,
728 OCFS2_I(inode)->ip_blkno, 728 OCFS2_I(inode)->ip_blkno,
729 parent_fe_bh, de_bh); 729 parent_fe_bh, de_bh);
730 if (err) { 730 if (err) {
731 le16_add_cpu(&fe->i_links_count, -1); 731 le16_add_cpu(&fe->i_links_count, -1);
732 drop_nlink(inode); 732 drop_nlink(inode);
733 mlog_errno(err); 733 mlog_errno(err);
734 goto out_commit; 734 goto out_commit;
735 } 735 }
736 736
737 err = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno); 737 err = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno);
738 if (err) { 738 if (err) {
739 mlog_errno(err); 739 mlog_errno(err);
740 goto out_commit; 740 goto out_commit;
741 } 741 }
742 742
743 atomic_inc(&inode->i_count); 743 atomic_inc(&inode->i_count);
744 dentry->d_op = &ocfs2_dentry_ops; 744 dentry->d_op = &ocfs2_dentry_ops;
745 d_instantiate(dentry, inode); 745 d_instantiate(dentry, inode);
746 746
747 out_commit: 747 out_commit:
748 ocfs2_commit_trans(osb, handle); 748 ocfs2_commit_trans(osb, handle);
749 out_unlock_inode: 749 out_unlock_inode:
750 ocfs2_meta_unlock(inode, 1); 750 ocfs2_meta_unlock(inode, 1);
751 751
752 out: 752 out:
753 ocfs2_meta_unlock(dir, 1); 753 ocfs2_meta_unlock(dir, 1);
754 754
755 if (de_bh) 755 if (de_bh)
756 brelse(de_bh); 756 brelse(de_bh);
757 if (fe_bh) 757 if (fe_bh)
758 brelse(fe_bh); 758 brelse(fe_bh);
759 if (parent_fe_bh) 759 if (parent_fe_bh)
760 brelse(parent_fe_bh); 760 brelse(parent_fe_bh);
761 761
762 mlog_exit(err); 762 mlog_exit(err);
763 763
764 return err; 764 return err;
765 } 765 }
766 766
/*
 * Acquire the dentry lock exclusively and release it again right
 * away, which forces other nodes to drop it.
 *
 * Returns 0 on success, or the error from ocfs2_dentry_lock() (which
 * is also logged).
 */
static int ocfs2_remote_dentry_delete(struct dentry *dentry)
{
	int status = ocfs2_dentry_lock(dentry, 1);

	if (status) {
		mlog_errno(status);
		return status;
	}

	ocfs2_dentry_unlock(dentry, 1);
	return status;
}
783 783
784 static inline int inode_is_unlinkable(struct inode *inode) 784 static inline int inode_is_unlinkable(struct inode *inode)
785 { 785 {
786 if (S_ISDIR(inode->i_mode)) { 786 if (S_ISDIR(inode->i_mode)) {
787 if (inode->i_nlink == 2) 787 if (inode->i_nlink == 2)
788 return 1; 788 return 1;
789 return 0; 789 return 0;
790 } 790 }
791 791
792 if (inode->i_nlink == 1) 792 if (inode->i_nlink == 1)
793 return 1; 793 return 1;
794 return 0; 794 return 0;
795 } 795 }
796 796
797 static int ocfs2_unlink(struct inode *dir, 797 static int ocfs2_unlink(struct inode *dir,
798 struct dentry *dentry) 798 struct dentry *dentry)
799 { 799 {
800 int status; 800 int status;
801 int child_locked = 0; 801 int child_locked = 0;
802 struct inode *inode = dentry->d_inode; 802 struct inode *inode = dentry->d_inode;
803 struct inode *orphan_dir = NULL; 803 struct inode *orphan_dir = NULL;
804 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 804 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
805 u64 blkno; 805 u64 blkno;
806 struct ocfs2_dinode *fe = NULL; 806 struct ocfs2_dinode *fe = NULL;
807 struct buffer_head *fe_bh = NULL; 807 struct buffer_head *fe_bh = NULL;
808 struct buffer_head *parent_node_bh = NULL; 808 struct buffer_head *parent_node_bh = NULL;
809 handle_t *handle = NULL; 809 handle_t *handle = NULL;
810 struct ocfs2_dir_entry *dirent = NULL; 810 struct ocfs2_dir_entry *dirent = NULL;
811 struct buffer_head *dirent_bh = NULL; 811 struct buffer_head *dirent_bh = NULL;
812 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; 812 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
813 struct buffer_head *orphan_entry_bh = NULL; 813 struct buffer_head *orphan_entry_bh = NULL;
814 814
815 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, 815 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
816 dentry->d_name.len, dentry->d_name.name); 816 dentry->d_name.len, dentry->d_name.name);
817 817
818 BUG_ON(dentry->d_parent->d_inode != dir); 818 BUG_ON(dentry->d_parent->d_inode != dir);
819 819
820 mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); 820 mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
821 821
822 if (inode == osb->root_inode) { 822 if (inode == osb->root_inode) {
823 mlog(0, "Cannot delete the root directory\n"); 823 mlog(0, "Cannot delete the root directory\n");
824 return -EPERM; 824 return -EPERM;
825 } 825 }
826 826
827 status = ocfs2_meta_lock(dir, &parent_node_bh, 1); 827 status = ocfs2_meta_lock(dir, &parent_node_bh, 1);
828 if (status < 0) { 828 if (status < 0) {
829 if (status != -ENOENT) 829 if (status != -ENOENT)
830 mlog_errno(status); 830 mlog_errno(status);
831 return status; 831 return status;
832 } 832 }
833 833
834 status = ocfs2_find_files_on_disk(dentry->d_name.name, 834 status = ocfs2_find_files_on_disk(dentry->d_name.name,
835 dentry->d_name.len, &blkno, 835 dentry->d_name.len, &blkno,
836 dir, &dirent_bh, &dirent); 836 dir, &dirent_bh, &dirent);
837 if (status < 0) { 837 if (status < 0) {
838 if (status != -ENOENT) 838 if (status != -ENOENT)
839 mlog_errno(status); 839 mlog_errno(status);
840 goto leave; 840 goto leave;
841 } 841 }
842 842
843 if (OCFS2_I(inode)->ip_blkno != blkno) { 843 if (OCFS2_I(inode)->ip_blkno != blkno) {
844 status = -ENOENT; 844 status = -ENOENT;
845 845
846 mlog(0, "ip_blkno %llu != dirent blkno %llu ip_flags = %x\n", 846 mlog(0, "ip_blkno %llu != dirent blkno %llu ip_flags = %x\n",
847 (unsigned long long)OCFS2_I(inode)->ip_blkno, 847 (unsigned long long)OCFS2_I(inode)->ip_blkno,
848 (unsigned long long)blkno, OCFS2_I(inode)->ip_flags); 848 (unsigned long long)blkno, OCFS2_I(inode)->ip_flags);
849 goto leave; 849 goto leave;
850 } 850 }
851 851
852 status = ocfs2_meta_lock(inode, &fe_bh, 1); 852 status = ocfs2_meta_lock(inode, &fe_bh, 1);
853 if (status < 0) { 853 if (status < 0) {
854 if (status != -ENOENT) 854 if (status != -ENOENT)
855 mlog_errno(status); 855 mlog_errno(status);
856 goto leave; 856 goto leave;
857 } 857 }
858 child_locked = 1; 858 child_locked = 1;
859 859
860 if (S_ISDIR(inode->i_mode)) { 860 if (S_ISDIR(inode->i_mode)) {
861 if (!ocfs2_empty_dir(inode)) { 861 if (!ocfs2_empty_dir(inode)) {
862 status = -ENOTEMPTY; 862 status = -ENOTEMPTY;
863 goto leave; 863 goto leave;
864 } else if (inode->i_nlink != 2) { 864 } else if (inode->i_nlink != 2) {
865 status = -ENOTEMPTY; 865 status = -ENOTEMPTY;
866 goto leave; 866 goto leave;
867 } 867 }
868 } 868 }
869 869
870 status = ocfs2_remote_dentry_delete(dentry); 870 status = ocfs2_remote_dentry_delete(dentry);
871 if (status < 0) { 871 if (status < 0) {
872 /* This vote should succeed under all normal 872 /* This vote should succeed under all normal
873 * circumstances. */ 873 * circumstances. */
874 mlog_errno(status); 874 mlog_errno(status);
875 goto leave; 875 goto leave;
876 } 876 }
877 877
878 if (inode_is_unlinkable(inode)) { 878 if (inode_is_unlinkable(inode)) {
879 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode, 879 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode,
880 orphan_name, 880 orphan_name,
881 &orphan_entry_bh); 881 &orphan_entry_bh);
882 if (status < 0) { 882 if (status < 0) {
883 mlog_errno(status); 883 mlog_errno(status);
884 goto leave; 884 goto leave;
885 } 885 }
886 } 886 }
887 887
888 handle = ocfs2_start_trans(osb, OCFS2_UNLINK_CREDITS); 888 handle = ocfs2_start_trans(osb, OCFS2_UNLINK_CREDITS);
889 if (IS_ERR(handle)) { 889 if (IS_ERR(handle)) {
890 status = PTR_ERR(handle); 890 status = PTR_ERR(handle);
891 handle = NULL; 891 handle = NULL;
892 mlog_errno(status); 892 mlog_errno(status);
893 goto leave; 893 goto leave;
894 } 894 }
895 895
896 status = ocfs2_journal_access(handle, inode, fe_bh, 896 status = ocfs2_journal_access(handle, inode, fe_bh,
897 OCFS2_JOURNAL_ACCESS_WRITE); 897 OCFS2_JOURNAL_ACCESS_WRITE);
898 if (status < 0) { 898 if (status < 0) {
899 mlog_errno(status); 899 mlog_errno(status);
900 goto leave; 900 goto leave;
901 } 901 }
902 902
903 fe = (struct ocfs2_dinode *) fe_bh->b_data; 903 fe = (struct ocfs2_dinode *) fe_bh->b_data;
904 904
905 if (inode_is_unlinkable(inode)) { 905 if (inode_is_unlinkable(inode)) {
906 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name, 906 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
907 orphan_entry_bh, orphan_dir); 907 orphan_entry_bh, orphan_dir);
908 if (status < 0) { 908 if (status < 0) {
909 mlog_errno(status); 909 mlog_errno(status);
910 goto leave; 910 goto leave;
911 } 911 }
912 } 912 }
913 913
914 /* delete the name from the parent dir */ 914 /* delete the name from the parent dir */
915 status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh); 915 status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh);
916 if (status < 0) { 916 if (status < 0) {
917 mlog_errno(status); 917 mlog_errno(status);
918 goto leave; 918 goto leave;
919 } 919 }
920 920
921 if (S_ISDIR(inode->i_mode)) 921 if (S_ISDIR(inode->i_mode))
922 drop_nlink(inode); 922 drop_nlink(inode);
923 drop_nlink(inode); 923 drop_nlink(inode);
924 fe->i_links_count = cpu_to_le16(inode->i_nlink); 924 fe->i_links_count = cpu_to_le16(inode->i_nlink);
925 925
926 status = ocfs2_journal_dirty(handle, fe_bh); 926 status = ocfs2_journal_dirty(handle, fe_bh);
927 if (status < 0) { 927 if (status < 0) {
928 mlog_errno(status); 928 mlog_errno(status);
929 goto leave; 929 goto leave;
930 } 930 }
931 931
932 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 932 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
933 if (S_ISDIR(inode->i_mode)) 933 if (S_ISDIR(inode->i_mode))
934 drop_nlink(dir); 934 drop_nlink(dir);
935 935
936 status = ocfs2_mark_inode_dirty(handle, dir, parent_node_bh); 936 status = ocfs2_mark_inode_dirty(handle, dir, parent_node_bh);
937 if (status < 0) { 937 if (status < 0) {
938 mlog_errno(status); 938 mlog_errno(status);
939 if (S_ISDIR(inode->i_mode)) 939 if (S_ISDIR(inode->i_mode))
940 inc_nlink(dir); 940 inc_nlink(dir);
941 } 941 }
942 942
943 leave: 943 leave:
944 if (handle) 944 if (handle)
945 ocfs2_commit_trans(osb, handle); 945 ocfs2_commit_trans(osb, handle);
946 946
947 if (child_locked) 947 if (child_locked)
948 ocfs2_meta_unlock(inode, 1); 948 ocfs2_meta_unlock(inode, 1);
949 949
950 ocfs2_meta_unlock(dir, 1); 950 ocfs2_meta_unlock(dir, 1);
951 951
952 if (orphan_dir) { 952 if (orphan_dir) {
953 /* This was locked for us in ocfs2_prepare_orphan_dir() */ 953 /* This was locked for us in ocfs2_prepare_orphan_dir() */
954 ocfs2_meta_unlock(orphan_dir, 1); 954 ocfs2_meta_unlock(orphan_dir, 1);
955 mutex_unlock(&orphan_dir->i_mutex); 955 mutex_unlock(&orphan_dir->i_mutex);
956 iput(orphan_dir); 956 iput(orphan_dir);
957 } 957 }
958 958
959 if (fe_bh) 959 if (fe_bh)
960 brelse(fe_bh); 960 brelse(fe_bh);
961 961
962 if (dirent_bh) 962 if (dirent_bh)
963 brelse(dirent_bh); 963 brelse(dirent_bh);
964 964
965 if (parent_node_bh) 965 if (parent_node_bh)
966 brelse(parent_node_bh); 966 brelse(parent_node_bh);
967 967
968 if (orphan_entry_bh) 968 if (orphan_entry_bh)
969 brelse(orphan_entry_bh); 969 brelse(orphan_entry_bh);
970 970
971 mlog_exit(status); 971 mlog_exit(status);
972 972
973 return status; 973 return status;
974 } 974 }
975 975
976 /* 976 /*
977 * The only place this should be used is rename! 977 * The only place this should be used is rename!
978 * if they have the same id, then the 1st one is the only one locked. 978 * if they have the same id, then the 1st one is the only one locked.
979 */ 979 */
980 static int ocfs2_double_lock(struct ocfs2_super *osb, 980 static int ocfs2_double_lock(struct ocfs2_super *osb,
981 struct buffer_head **bh1, 981 struct buffer_head **bh1,
982 struct inode *inode1, 982 struct inode *inode1,
983 struct buffer_head **bh2, 983 struct buffer_head **bh2,
984 struct inode *inode2) 984 struct inode *inode2)
985 { 985 {
986 int status; 986 int status;
987 struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); 987 struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
988 struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); 988 struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
989 struct buffer_head **tmpbh; 989 struct buffer_head **tmpbh;
990 struct inode *tmpinode; 990 struct inode *tmpinode;
991 991
992 mlog_entry("(inode1 = %llu, inode2 = %llu)\n", 992 mlog_entry("(inode1 = %llu, inode2 = %llu)\n",
993 (unsigned long long)oi1->ip_blkno, 993 (unsigned long long)oi1->ip_blkno,
994 (unsigned long long)oi2->ip_blkno); 994 (unsigned long long)oi2->ip_blkno);
995 995
996 if (*bh1) 996 if (*bh1)
997 *bh1 = NULL; 997 *bh1 = NULL;
998 if (*bh2) 998 if (*bh2)
999 *bh2 = NULL; 999 *bh2 = NULL;
1000 1000
1001 /* we always want to lock the one with the lower lockid first. */ 1001 /* we always want to lock the one with the lower lockid first. */
1002 if (oi1->ip_blkno != oi2->ip_blkno) { 1002 if (oi1->ip_blkno != oi2->ip_blkno) {
1003 if (oi1->ip_blkno < oi2->ip_blkno) { 1003 if (oi1->ip_blkno < oi2->ip_blkno) {
1004 /* switch id1 and id2 around */ 1004 /* switch id1 and id2 around */
1005 mlog(0, "switching them around...\n"); 1005 mlog(0, "switching them around...\n");
1006 tmpbh = bh2; 1006 tmpbh = bh2;
1007 bh2 = bh1; 1007 bh2 = bh1;
1008 bh1 = tmpbh; 1008 bh1 = tmpbh;
1009 1009
1010 tmpinode = inode2; 1010 tmpinode = inode2;
1011 inode2 = inode1; 1011 inode2 = inode1;
1012 inode1 = tmpinode; 1012 inode1 = tmpinode;
1013 } 1013 }
1014 /* lock id2 */ 1014 /* lock id2 */
1015 status = ocfs2_meta_lock(inode2, bh2, 1); 1015 status = ocfs2_meta_lock(inode2, bh2, 1);
1016 if (status < 0) { 1016 if (status < 0) {
1017 if (status != -ENOENT) 1017 if (status != -ENOENT)
1018 mlog_errno(status); 1018 mlog_errno(status);
1019 goto bail; 1019 goto bail;
1020 } 1020 }
1021 } 1021 }
1022 1022
1023 /* lock id1 */ 1023 /* lock id1 */
1024 status = ocfs2_meta_lock(inode1, bh1, 1); 1024 status = ocfs2_meta_lock(inode1, bh1, 1);
1025 if (status < 0) { 1025 if (status < 0) {
1026 /* 1026 /*
1027 * An error return must mean that no cluster locks 1027 * An error return must mean that no cluster locks
1028 * were held on function exit. 1028 * were held on function exit.
1029 */ 1029 */
1030 if (oi1->ip_blkno != oi2->ip_blkno) 1030 if (oi1->ip_blkno != oi2->ip_blkno)
1031 ocfs2_meta_unlock(inode2, 1); 1031 ocfs2_meta_unlock(inode2, 1);
1032 1032
1033 if (status != -ENOENT) 1033 if (status != -ENOENT)
1034 mlog_errno(status); 1034 mlog_errno(status);
1035 } 1035 }
1036 1036
1037 bail: 1037 bail:
1038 mlog_exit(status); 1038 mlog_exit(status);
1039 return status; 1039 return status;
1040 } 1040 }
1041 1041
/*
 * Release the exclusive meta locks on a pair of inodes. When both
 * pointers refer to the same inode, it is unlocked only once.
 */
static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2)
{
	const int same_inode = (inode1 == inode2);

	ocfs2_meta_unlock(inode1, 1);
	if (same_inode)
		return;

	ocfs2_meta_unlock(inode2, 1);
}
1049 1049
/*
 * Yield (as an lvalue) the 'inode' field of the directory entry that
 * starts rec_len bytes after the entry at 'buffer', i.e. the second
 * entry in the block -- presumably the ".." entry; verify against the
 * on-disk directory layout.
 *
 * The field is accessed raw, so callers convert with le64_to_cpu() /
 * cpu_to_le64(). 'buffer' is evaluated twice; it is parenthesized so
 * that expressions such as 'base + off' are not mis-associated with
 * the casts.
 */
#define PARENT_INO(buffer) \
	((struct ocfs2_dir_entry *) \
	 ((char *)(buffer) + \
	  le16_to_cpu(((struct ocfs2_dir_entry *)(buffer))->rec_len)))->inode
1054 1054
1055 static int ocfs2_rename(struct inode *old_dir, 1055 static int ocfs2_rename(struct inode *old_dir,
1056 struct dentry *old_dentry, 1056 struct dentry *old_dentry,
1057 struct inode *new_dir, 1057 struct inode *new_dir,
1058 struct dentry *new_dentry) 1058 struct dentry *new_dentry)
1059 { 1059 {
1060 int status = 0, rename_lock = 0, parents_locked = 0; 1060 int status = 0, rename_lock = 0, parents_locked = 0;
1061 int old_child_locked = 0, new_child_locked = 0; 1061 int old_child_locked = 0, new_child_locked = 0;
1062 struct inode *old_inode = old_dentry->d_inode; 1062 struct inode *old_inode = old_dentry->d_inode;
1063 struct inode *new_inode = new_dentry->d_inode; 1063 struct inode *new_inode = new_dentry->d_inode;
1064 struct inode *orphan_dir = NULL; 1064 struct inode *orphan_dir = NULL;
1065 struct ocfs2_dinode *newfe = NULL; 1065 struct ocfs2_dinode *newfe = NULL;
1066 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; 1066 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
1067 struct buffer_head *orphan_entry_bh = NULL; 1067 struct buffer_head *orphan_entry_bh = NULL;
1068 struct buffer_head *newfe_bh = NULL; 1068 struct buffer_head *newfe_bh = NULL;
1069 struct buffer_head *old_inode_bh = NULL; 1069 struct buffer_head *old_inode_bh = NULL;
1070 struct buffer_head *insert_entry_bh = NULL; 1070 struct buffer_head *insert_entry_bh = NULL;
1071 struct ocfs2_super *osb = NULL; 1071 struct ocfs2_super *osb = NULL;
1072 u64 newfe_blkno; 1072 u64 newfe_blkno;
1073 handle_t *handle = NULL; 1073 handle_t *handle = NULL;
1074 struct buffer_head *old_dir_bh = NULL; 1074 struct buffer_head *old_dir_bh = NULL;
1075 struct buffer_head *new_dir_bh = NULL; 1075 struct buffer_head *new_dir_bh = NULL;
1076 struct ocfs2_dir_entry *old_de = NULL, *new_de = NULL; // dirent for old_dentry 1076 struct ocfs2_dir_entry *old_de = NULL, *new_de = NULL; // dirent for old_dentry
1077 // and new_dentry 1077 // and new_dentry
1078 struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above 1078 struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
1079 struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir, 1079 struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
1080 // this is the 1st dirent bh 1080 // this is the 1st dirent bh
1081 nlink_t old_dir_nlink = old_dir->i_nlink; 1081 nlink_t old_dir_nlink = old_dir->i_nlink;
1082 1082
1083 /* At some point it might be nice to break this function up a 1083 /* At some point it might be nice to break this function up a
1084 * bit. */ 1084 * bit. */
1085 1085
1086 mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n", 1086 mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n",
1087 old_dir, old_dentry, new_dir, new_dentry, 1087 old_dir, old_dentry, new_dir, new_dentry,
1088 old_dentry->d_name.len, old_dentry->d_name.name, 1088 old_dentry->d_name.len, old_dentry->d_name.name,
1089 new_dentry->d_name.len, new_dentry->d_name.name); 1089 new_dentry->d_name.len, new_dentry->d_name.name);
1090 1090
1091 osb = OCFS2_SB(old_dir->i_sb); 1091 osb = OCFS2_SB(old_dir->i_sb);
1092 1092
1093 if (new_inode) { 1093 if (new_inode) {
1094 if (!igrab(new_inode)) 1094 if (!igrab(new_inode))
1095 BUG(); 1095 BUG();
1096 } 1096 }
1097 1097
1098 /* Assume a directory hierarchy thusly: 1098 /* Assume a directory hierarchy thusly:
1099 * a/b/c 1099 * a/b/c
1100 * a/d 1100 * a/d
1101 * a,b,c, and d are all directories. 1101 * a,b,c, and d are all directories.
1102 * 1102 *
1103 * from cwd of 'a' on both nodes: 1103 * from cwd of 'a' on both nodes:
1104 * node1: mv b/c d 1104 * node1: mv b/c d
1105 * node2: mv d b/c 1105 * node2: mv d b/c
1106 * 1106 *
1107 * And that's why, just like the VFS, we need a file system 1107 * And that's why, just like the VFS, we need a file system
1108 * rename lock. */ 1108 * rename lock. */
1109 if (old_dentry != new_dentry) { 1109 if (old_dentry != new_dentry) {
1110 status = ocfs2_rename_lock(osb); 1110 status = ocfs2_rename_lock(osb);
1111 if (status < 0) { 1111 if (status < 0) {
1112 mlog_errno(status); 1112 mlog_errno(status);
1113 goto bail; 1113 goto bail;
1114 } 1114 }
1115 rename_lock = 1; 1115 rename_lock = 1;
1116 } 1116 }
1117 1117
1118 /* if old and new are the same, this'll just do one lock. */ 1118 /* if old and new are the same, this'll just do one lock. */
1119 status = ocfs2_double_lock(osb, &old_dir_bh, old_dir, 1119 status = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
1120 &new_dir_bh, new_dir); 1120 &new_dir_bh, new_dir);
1121 if (status < 0) { 1121 if (status < 0) {
1122 mlog_errno(status); 1122 mlog_errno(status);
1123 goto bail; 1123 goto bail;
1124 } 1124 }
1125 parents_locked = 1; 1125 parents_locked = 1;
1126 1126
1127 /* make sure both dirs have bhs 1127 /* make sure both dirs have bhs
1128 * get an extra ref on old_dir_bh if old==new */ 1128 * get an extra ref on old_dir_bh if old==new */
1129 if (!new_dir_bh) { 1129 if (!new_dir_bh) {
1130 if (old_dir_bh) { 1130 if (old_dir_bh) {
1131 new_dir_bh = old_dir_bh; 1131 new_dir_bh = old_dir_bh;
1132 get_bh(new_dir_bh); 1132 get_bh(new_dir_bh);
1133 } else { 1133 } else {
1134 mlog(ML_ERROR, "no old_dir_bh!\n"); 1134 mlog(ML_ERROR, "no old_dir_bh!\n");
1135 status = -EIO; 1135 status = -EIO;
1136 goto bail; 1136 goto bail;
1137 } 1137 }
1138 } 1138 }
1139 1139
1140 /* 1140 /*
1141 * Aside from allowing a meta data update, the locking here 1141 * Aside from allowing a meta data update, the locking here
1142 * also ensures that the vote thread on other nodes won't have 1142 * also ensures that the vote thread on other nodes won't have
1143 * to concurrently downconvert the inode and the dentry locks. 1143 * to concurrently downconvert the inode and the dentry locks.
1144 */ 1144 */
1145 status = ocfs2_meta_lock(old_inode, &old_inode_bh, 1); 1145 status = ocfs2_meta_lock(old_inode, &old_inode_bh, 1);
1146 if (status < 0) { 1146 if (status < 0) {
1147 if (status != -ENOENT) 1147 if (status != -ENOENT)
1148 mlog_errno(status); 1148 mlog_errno(status);
1149 goto bail; 1149 goto bail;
1150 } 1150 }
1151 old_child_locked = 1; 1151 old_child_locked = 1;
1152 1152
1153 status = ocfs2_remote_dentry_delete(old_dentry); 1153 status = ocfs2_remote_dentry_delete(old_dentry);
1154 if (status < 0) { 1154 if (status < 0) {
1155 mlog_errno(status); 1155 mlog_errno(status);
1156 goto bail; 1156 goto bail;
1157 } 1157 }
1158 1158
1159 if (S_ISDIR(old_inode->i_mode)) { 1159 if (S_ISDIR(old_inode->i_mode)) {
1160 status = -EIO; 1160 status = -EIO;
1161 old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0); 1161 old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0);
1162 if (!old_inode_de_bh) 1162 if (!old_inode_de_bh)
1163 goto bail; 1163 goto bail;
1164 1164
1165 status = -EIO; 1165 status = -EIO;
1166 if (le64_to_cpu(PARENT_INO(old_inode_de_bh->b_data)) != 1166 if (le64_to_cpu(PARENT_INO(old_inode_de_bh->b_data)) !=
1167 OCFS2_I(old_dir)->ip_blkno) 1167 OCFS2_I(old_dir)->ip_blkno)
1168 goto bail; 1168 goto bail;
1169 status = -EMLINK; 1169 status = -EMLINK;
1170 if (!new_inode && new_dir!=old_dir && 1170 if (!new_inode && new_dir!=old_dir &&
1171 new_dir->i_nlink >= OCFS2_LINK_MAX) 1171 new_dir->i_nlink >= OCFS2_LINK_MAX)
1172 goto bail; 1172 goto bail;
1173 } 1173 }
1174 1174
1175 status = -ENOENT; 1175 status = -ENOENT;
1176 old_de_bh = ocfs2_find_entry(old_dentry->d_name.name, 1176 old_de_bh = ocfs2_find_entry(old_dentry->d_name.name,
1177 old_dentry->d_name.len, 1177 old_dentry->d_name.len,
1178 old_dir, &old_de); 1178 old_dir, &old_de);
1179 if (!old_de_bh) 1179 if (!old_de_bh)
1180 goto bail; 1180 goto bail;
1181 1181
1182 /* 1182 /*
1183 * Check for inode number is _not_ due to possible IO errors. 1183 * Check for inode number is _not_ due to possible IO errors.
1184 * We might rmdir the source, keep it as pwd of some process 1184 * We might rmdir the source, keep it as pwd of some process
1185 * and merrily kill the link to whatever was created under the 1185 * and merrily kill the link to whatever was created under the
1186 * same name. Goodbye sticky bit ;-< 1186 * same name. Goodbye sticky bit ;-<
1187 */ 1187 */
1188 if (le64_to_cpu(old_de->inode) != OCFS2_I(old_inode)->ip_blkno) 1188 if (le64_to_cpu(old_de->inode) != OCFS2_I(old_inode)->ip_blkno)
1189 goto bail; 1189 goto bail;
1190 1190
1191 /* check if the target already exists (in which case we need 1191 /* check if the target already exists (in which case we need
1192 * to delete it */ 1192 * to delete it */
1193 status = ocfs2_find_files_on_disk(new_dentry->d_name.name, 1193 status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
1194 new_dentry->d_name.len, 1194 new_dentry->d_name.len,
1195 &newfe_blkno, new_dir, &new_de_bh, 1195 &newfe_blkno, new_dir, &new_de_bh,
1196 &new_de); 1196 &new_de);
1197 /* The only error we allow here is -ENOENT because the new 1197 /* The only error we allow here is -ENOENT because the new
1198 * file not existing is perfectly valid. */ 1198 * file not existing is perfectly valid. */
1199 if ((status < 0) && (status != -ENOENT)) { 1199 if ((status < 0) && (status != -ENOENT)) {
1200 /* If we cannot find the file specified we should just */ 1200 /* If we cannot find the file specified we should just */
1201 /* return the error... */ 1201 /* return the error... */
1202 mlog_errno(status); 1202 mlog_errno(status);
1203 goto bail; 1203 goto bail;
1204 } 1204 }
1205 1205
1206 if (!new_de && new_inode) 1206 if (!new_de && new_inode)
1207 mlog(ML_ERROR, "inode %lu does not exist in it's parent " 1207 mlog(ML_ERROR, "inode %lu does not exist in it's parent "
1208 "directory!", new_inode->i_ino); 1208 "directory!", new_inode->i_ino);
1209 1209
1210 /* In case we need to overwrite an existing file, we blow it 1210 /* In case we need to overwrite an existing file, we blow it
1211 * away first */ 1211 * away first */
1212 if (new_de) { 1212 if (new_de) {
1213 /* VFS didn't think there existed an inode here, but 1213 /* VFS didn't think there existed an inode here, but
1214 * someone else in the cluster must have raced our 1214 * someone else in the cluster must have raced our
1215 * rename to create one. Today we error cleanly, in 1215 * rename to create one. Today we error cleanly, in
1216 * the future we should consider calling iget to build 1216 * the future we should consider calling iget to build
1217 * a new struct inode for this entry. */ 1217 * a new struct inode for this entry. */
1218 if (!new_inode) { 1218 if (!new_inode) {
1219 status = -EACCES; 1219 status = -EACCES;
1220 1220
1221 mlog(0, "We found an inode for name %.*s but VFS " 1221 mlog(0, "We found an inode for name %.*s but VFS "
1222 "didn't give us one.\n", new_dentry->d_name.len, 1222 "didn't give us one.\n", new_dentry->d_name.len,
1223 new_dentry->d_name.name); 1223 new_dentry->d_name.name);
1224 goto bail; 1224 goto bail;
1225 } 1225 }
1226 1226
1227 if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) { 1227 if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) {
1228 status = -EACCES; 1228 status = -EACCES;
1229 1229
1230 mlog(0, "Inode %llu and dir %llu disagree. flags = %x\n", 1230 mlog(0, "Inode %llu and dir %llu disagree. flags = %x\n",
1231 (unsigned long long)OCFS2_I(new_inode)->ip_blkno, 1231 (unsigned long long)OCFS2_I(new_inode)->ip_blkno,
1232 (unsigned long long)newfe_blkno, 1232 (unsigned long long)newfe_blkno,
1233 OCFS2_I(new_inode)->ip_flags); 1233 OCFS2_I(new_inode)->ip_flags);
1234 goto bail; 1234 goto bail;
1235 } 1235 }
1236 1236
1237 status = ocfs2_meta_lock(new_inode, &newfe_bh, 1); 1237 status = ocfs2_meta_lock(new_inode, &newfe_bh, 1);
1238 if (status < 0) { 1238 if (status < 0) {
1239 if (status != -ENOENT) 1239 if (status != -ENOENT)
1240 mlog_errno(status); 1240 mlog_errno(status);
1241 goto bail; 1241 goto bail;
1242 } 1242 }
1243 new_child_locked = 1; 1243 new_child_locked = 1;
1244 1244
1245 status = ocfs2_remote_dentry_delete(new_dentry); 1245 status = ocfs2_remote_dentry_delete(new_dentry);
1246 if (status < 0) { 1246 if (status < 0) {
1247 mlog_errno(status); 1247 mlog_errno(status);
1248 goto bail; 1248 goto bail;
1249 } 1249 }
1250 1250
1251 newfe = (struct ocfs2_dinode *) newfe_bh->b_data; 1251 newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
1252 1252
1253 mlog(0, "aha rename over existing... new_de=%p new_blkno=%llu " 1253 mlog(0, "aha rename over existing... new_de=%p new_blkno=%llu "
1254 "newfebh=%p bhblocknr=%llu\n", new_de, 1254 "newfebh=%p bhblocknr=%llu\n", new_de,
1255 (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ? 1255 (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ?
1256 (unsigned long long)newfe_bh->b_blocknr : 0ULL); 1256 (unsigned long long)newfe_bh->b_blocknr : 0ULL);
1257 1257
1258 if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { 1258 if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
1259 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, 1259 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
1260 new_inode, 1260 new_inode,
1261 orphan_name, 1261 orphan_name,
1262 &orphan_entry_bh); 1262 &orphan_entry_bh);
1263 if (status < 0) { 1263 if (status < 0) {
1264 mlog_errno(status); 1264 mlog_errno(status);
1265 goto bail; 1265 goto bail;
1266 } 1266 }
1267 } 1267 }
1268 } else { 1268 } else {
1269 BUG_ON(new_dentry->d_parent->d_inode != new_dir); 1269 BUG_ON(new_dentry->d_parent->d_inode != new_dir);
1270 1270
1271 status = ocfs2_check_dir_for_entry(new_dir, 1271 status = ocfs2_check_dir_for_entry(new_dir,
1272 new_dentry->d_name.name, 1272 new_dentry->d_name.name,
1273 new_dentry->d_name.len); 1273 new_dentry->d_name.len);
1274 if (status) 1274 if (status)
1275 goto bail; 1275 goto bail;
1276 1276
1277 status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh, 1277 status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh,
1278 new_dentry->d_name.name, 1278 new_dentry->d_name.name,
1279 new_dentry->d_name.len, 1279 new_dentry->d_name.len,
1280 &insert_entry_bh); 1280 &insert_entry_bh);
1281 if (status < 0) { 1281 if (status < 0) {
1282 mlog_errno(status); 1282 mlog_errno(status);
1283 goto bail; 1283 goto bail;
1284 } 1284 }
1285 } 1285 }
1286 1286
1287 handle = ocfs2_start_trans(osb, OCFS2_RENAME_CREDITS); 1287 handle = ocfs2_start_trans(osb, OCFS2_RENAME_CREDITS);
1288 if (IS_ERR(handle)) { 1288 if (IS_ERR(handle)) {
1289 status = PTR_ERR(handle); 1289 status = PTR_ERR(handle);
1290 handle = NULL; 1290 handle = NULL;
1291 mlog_errno(status); 1291 mlog_errno(status);
1292 goto bail; 1292 goto bail;
1293 } 1293 }
1294 1294
1295 if (new_de) { 1295 if (new_de) {
1296 if (S_ISDIR(new_inode->i_mode)) { 1296 if (S_ISDIR(new_inode->i_mode)) {
1297 if (!ocfs2_empty_dir(new_inode) || 1297 if (!ocfs2_empty_dir(new_inode) ||
1298 new_inode->i_nlink != 2) { 1298 new_inode->i_nlink != 2) {
1299 status = -ENOTEMPTY; 1299 status = -ENOTEMPTY;
1300 goto bail; 1300 goto bail;
1301 } 1301 }
1302 } 1302 }
1303 status = ocfs2_journal_access(handle, new_inode, newfe_bh, 1303 status = ocfs2_journal_access(handle, new_inode, newfe_bh,
1304 OCFS2_JOURNAL_ACCESS_WRITE); 1304 OCFS2_JOURNAL_ACCESS_WRITE);
1305 if (status < 0) { 1305 if (status < 0) {
1306 mlog_errno(status); 1306 mlog_errno(status);
1307 goto bail; 1307 goto bail;
1308 } 1308 }
1309 1309
1310 if (S_ISDIR(new_inode->i_mode) || 1310 if (S_ISDIR(new_inode->i_mode) ||
1311 (newfe->i_links_count == cpu_to_le16(1))){ 1311 (newfe->i_links_count == cpu_to_le16(1))){
1312 status = ocfs2_orphan_add(osb, handle, new_inode, 1312 status = ocfs2_orphan_add(osb, handle, new_inode,
1313 newfe, orphan_name, 1313 newfe, orphan_name,
1314 orphan_entry_bh, orphan_dir); 1314 orphan_entry_bh, orphan_dir);
1315 if (status < 0) { 1315 if (status < 0) {
1316 mlog_errno(status); 1316 mlog_errno(status);
1317 goto bail; 1317 goto bail;
1318 } 1318 }
1319 } 1319 }
1320 1320
1321 /* change the dirent to point to the correct inode */ 1321 /* change the dirent to point to the correct inode */
1322 status = ocfs2_journal_access(handle, new_dir, new_de_bh, 1322 status = ocfs2_journal_access(handle, new_dir, new_de_bh,
1323 OCFS2_JOURNAL_ACCESS_WRITE); 1323 OCFS2_JOURNAL_ACCESS_WRITE);
1324 if (status < 0) { 1324 if (status < 0) {
1325 mlog_errno(status); 1325 mlog_errno(status);
1326 goto bail; 1326 goto bail;
1327 } 1327 }
1328 new_de->inode = cpu_to_le64(OCFS2_I(old_inode)->ip_blkno); 1328 new_de->inode = cpu_to_le64(OCFS2_I(old_inode)->ip_blkno);
1329 new_de->file_type = old_de->file_type; 1329 new_de->file_type = old_de->file_type;
1330 new_dir->i_version++; 1330 new_dir->i_version++;
1331 status = ocfs2_journal_dirty(handle, new_de_bh); 1331 status = ocfs2_journal_dirty(handle, new_de_bh);
1332 if (status < 0) { 1332 if (status < 0) {
1333 mlog_errno(status); 1333 mlog_errno(status);
1334 goto bail; 1334 goto bail;
1335 } 1335 }
1336 1336
1337 if (S_ISDIR(new_inode->i_mode)) 1337 if (S_ISDIR(new_inode->i_mode))
1338 newfe->i_links_count = 0; 1338 newfe->i_links_count = 0;
1339 else 1339 else
1340 le16_add_cpu(&newfe->i_links_count, -1); 1340 le16_add_cpu(&newfe->i_links_count, -1);
1341 1341
1342 status = ocfs2_journal_dirty(handle, newfe_bh); 1342 status = ocfs2_journal_dirty(handle, newfe_bh);
1343 if (status < 0) { 1343 if (status < 0) {
1344 mlog_errno(status); 1344 mlog_errno(status);
1345 goto bail; 1345 goto bail;
1346 } 1346 }
1347 } else { 1347 } else {
1348 /* if the name was not found in new_dir, add it now */ 1348 /* if the name was not found in new_dir, add it now */
1349 status = ocfs2_add_entry(handle, new_dentry, old_inode, 1349 status = ocfs2_add_entry(handle, new_dentry, old_inode,
1350 OCFS2_I(old_inode)->ip_blkno, 1350 OCFS2_I(old_inode)->ip_blkno,
1351 new_dir_bh, insert_entry_bh); 1351 new_dir_bh, insert_entry_bh);
1352 } 1352 }
1353 1353
1354 old_inode->i_ctime = CURRENT_TIME; 1354 old_inode->i_ctime = CURRENT_TIME;
1355 mark_inode_dirty(old_inode); 1355 mark_inode_dirty(old_inode);
1356 ocfs2_mark_inode_dirty(handle, old_inode, old_inode_bh); 1356 ocfs2_mark_inode_dirty(handle, old_inode, old_inode_bh);
1357 1357
1358 /* now that the name has been added to new_dir, remove the old name */ 1358 /* now that the name has been added to new_dir, remove the old name */
1359 status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh); 1359 status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh);
1360 if (status < 0) { 1360 if (status < 0) {
1361 mlog_errno(status); 1361 mlog_errno(status);
1362 goto bail; 1362 goto bail;
1363 } 1363 }
1364 1364
1365 if (new_inode) { 1365 if (new_inode) {
1366 new_inode->i_nlink--; 1366 new_inode->i_nlink--;
1367 new_inode->i_ctime = CURRENT_TIME; 1367 new_inode->i_ctime = CURRENT_TIME;
1368 } 1368 }
1369 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; 1369 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
1370 if (old_inode_de_bh) { 1370 if (old_inode_de_bh) {
1371 status = ocfs2_journal_access(handle, old_inode, 1371 status = ocfs2_journal_access(handle, old_inode,
1372 old_inode_de_bh, 1372 old_inode_de_bh,
1373 OCFS2_JOURNAL_ACCESS_WRITE); 1373 OCFS2_JOURNAL_ACCESS_WRITE);
1374 PARENT_INO(old_inode_de_bh->b_data) = 1374 PARENT_INO(old_inode_de_bh->b_data) =
1375 cpu_to_le64(OCFS2_I(new_dir)->ip_blkno); 1375 cpu_to_le64(OCFS2_I(new_dir)->ip_blkno);
1376 status = ocfs2_journal_dirty(handle, old_inode_de_bh); 1376 status = ocfs2_journal_dirty(handle, old_inode_de_bh);
1377 old_dir->i_nlink--; 1377 old_dir->i_nlink--;
1378 if (new_inode) { 1378 if (new_inode) {
1379 new_inode->i_nlink--; 1379 new_inode->i_nlink--;
1380 } else { 1380 } else {
1381 inc_nlink(new_dir); 1381 inc_nlink(new_dir);
1382 mark_inode_dirty(new_dir); 1382 mark_inode_dirty(new_dir);
1383 } 1383 }
1384 } 1384 }
1385 mark_inode_dirty(old_dir); 1385 mark_inode_dirty(old_dir);
1386 ocfs2_mark_inode_dirty(handle, old_dir, old_dir_bh); 1386 ocfs2_mark_inode_dirty(handle, old_dir, old_dir_bh);
1387 if (new_inode) { 1387 if (new_inode) {
1388 mark_inode_dirty(new_inode); 1388 mark_inode_dirty(new_inode);
1389 ocfs2_mark_inode_dirty(handle, new_inode, newfe_bh); 1389 ocfs2_mark_inode_dirty(handle, new_inode, newfe_bh);
1390 } 1390 }
1391 1391
1392 if (old_dir != new_dir) { 1392 if (old_dir != new_dir) {
1393 /* Keep the same times on both directories.*/ 1393 /* Keep the same times on both directories.*/
1394 new_dir->i_ctime = new_dir->i_mtime = old_dir->i_ctime; 1394 new_dir->i_ctime = new_dir->i_mtime = old_dir->i_ctime;
1395 1395
1396 /* 1396 /*
1397 * This will also pick up the i_nlink change from the 1397 * This will also pick up the i_nlink change from the
1398 * block above. 1398 * block above.
1399 */ 1399 */
1400 ocfs2_mark_inode_dirty(handle, new_dir, new_dir_bh); 1400 ocfs2_mark_inode_dirty(handle, new_dir, new_dir_bh);
1401 } 1401 }
1402 1402
1403 if (old_dir_nlink != old_dir->i_nlink) { 1403 if (old_dir_nlink != old_dir->i_nlink) {
1404 if (!old_dir_bh) { 1404 if (!old_dir_bh) {
1405 mlog(ML_ERROR, "need to change nlink for old dir " 1405 mlog(ML_ERROR, "need to change nlink for old dir "
1406 "%llu from %d to %d but bh is NULL!\n", 1406 "%llu from %d to %d but bh is NULL!\n",
1407 (unsigned long long)OCFS2_I(old_dir)->ip_blkno, 1407 (unsigned long long)OCFS2_I(old_dir)->ip_blkno,
1408 (int)old_dir_nlink, old_dir->i_nlink); 1408 (int)old_dir_nlink, old_dir->i_nlink);
1409 } else { 1409 } else {
1410 struct ocfs2_dinode *fe; 1410 struct ocfs2_dinode *fe;
1411 status = ocfs2_journal_access(handle, old_dir, 1411 status = ocfs2_journal_access(handle, old_dir,
1412 old_dir_bh, 1412 old_dir_bh,
1413 OCFS2_JOURNAL_ACCESS_WRITE); 1413 OCFS2_JOURNAL_ACCESS_WRITE);
1414 fe = (struct ocfs2_dinode *) old_dir_bh->b_data; 1414 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1415 fe->i_links_count = cpu_to_le16(old_dir->i_nlink); 1415 fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
1416 status = ocfs2_journal_dirty(handle, old_dir_bh); 1416 status = ocfs2_journal_dirty(handle, old_dir_bh);
1417 } 1417 }
1418 } 1418 }
1419 1419
1420 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); 1420 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
1421 status = 0; 1421 status = 0;
1422 bail: 1422 bail:
1423 if (rename_lock) 1423 if (rename_lock)
1424 ocfs2_rename_unlock(osb); 1424 ocfs2_rename_unlock(osb);
1425 1425
1426 if (handle) 1426 if (handle)
1427 ocfs2_commit_trans(osb, handle); 1427 ocfs2_commit_trans(osb, handle);
1428 1428
1429 if (parents_locked) 1429 if (parents_locked)
1430 ocfs2_double_unlock(old_dir, new_dir); 1430 ocfs2_double_unlock(old_dir, new_dir);
1431 1431
1432 if (old_child_locked) 1432 if (old_child_locked)
1433 ocfs2_meta_unlock(old_inode, 1); 1433 ocfs2_meta_unlock(old_inode, 1);
1434 1434
1435 if (new_child_locked) 1435 if (new_child_locked)
1436 ocfs2_meta_unlock(new_inode, 1); 1436 ocfs2_meta_unlock(new_inode, 1);
1437 1437
1438 if (orphan_dir) { 1438 if (orphan_dir) {
1439 /* This was locked for us in ocfs2_prepare_orphan_dir() */ 1439 /* This was locked for us in ocfs2_prepare_orphan_dir() */
1440 ocfs2_meta_unlock(orphan_dir, 1); 1440 ocfs2_meta_unlock(orphan_dir, 1);
1441 mutex_unlock(&orphan_dir->i_mutex); 1441 mutex_unlock(&orphan_dir->i_mutex);
1442 iput(orphan_dir); 1442 iput(orphan_dir);
1443 } 1443 }
1444 1444
1445 if (new_inode) 1445 if (new_inode)
1446 sync_mapping_buffers(old_inode->i_mapping); 1446 sync_mapping_buffers(old_inode->i_mapping);
1447 1447
1448 if (new_inode) 1448 if (new_inode)
1449 iput(new_inode); 1449 iput(new_inode);
1450 if (newfe_bh) 1450 if (newfe_bh)
1451 brelse(newfe_bh); 1451 brelse(newfe_bh);
1452 if (old_inode_bh) 1452 if (old_inode_bh)
1453 brelse(old_inode_bh); 1453 brelse(old_inode_bh);
1454 if (old_dir_bh) 1454 if (old_dir_bh)
1455 brelse(old_dir_bh); 1455 brelse(old_dir_bh);
1456 if (new_dir_bh) 1456 if (new_dir_bh)
1457 brelse(new_dir_bh); 1457 brelse(new_dir_bh);
1458 if (new_de_bh) 1458 if (new_de_bh)
1459 brelse(new_de_bh); 1459 brelse(new_de_bh);
1460 if (old_de_bh) 1460 if (old_de_bh)
1461 brelse(old_de_bh); 1461 brelse(old_de_bh);
1462 if (old_inode_de_bh) 1462 if (old_inode_de_bh)
1463 brelse(old_inode_de_bh); 1463 brelse(old_inode_de_bh);
1464 if (orphan_entry_bh) 1464 if (orphan_entry_bh)
1465 brelse(orphan_entry_bh); 1465 brelse(orphan_entry_bh);
1466 if (insert_entry_bh) 1466 if (insert_entry_bh)
1467 brelse(insert_entry_bh); 1467 brelse(insert_entry_bh);
1468 1468
1469 mlog_exit(status); 1469 mlog_exit(status);
1470 1470
1471 return status; 1471 return status;
1472 } 1472 }
1473 1473
1474 /* 1474 /*
1475 * we expect i_size = strlen(symname). Copy symname into the file 1475 * we expect i_size = strlen(symname). Copy symname into the file
1476 * data, including the null terminator. 1476 * data, including the null terminator.
1477 */ 1477 */
1478 static int ocfs2_create_symlink_data(struct ocfs2_super *osb, 1478 static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1479 handle_t *handle, 1479 handle_t *handle,
1480 struct inode *inode, 1480 struct inode *inode,
1481 const char *symname) 1481 const char *symname)
1482 { 1482 {
1483 struct buffer_head **bhs = NULL; 1483 struct buffer_head **bhs = NULL;
1484 const char *c; 1484 const char *c;
1485 struct super_block *sb = osb->sb; 1485 struct super_block *sb = osb->sb;
1486 u64 p_blkno; 1486 u64 p_blkno, p_blocks;
1487 int p_blocks;
1488 int virtual, blocks, status, i, bytes_left; 1487 int virtual, blocks, status, i, bytes_left;
1489 1488
1490 bytes_left = i_size_read(inode) + 1; 1489 bytes_left = i_size_read(inode) + 1;
1491 /* we can't trust i_blocks because we're actually going to 1490 /* we can't trust i_blocks because we're actually going to
1492 * write i_size + 1 bytes. */ 1491 * write i_size + 1 bytes. */
1493 blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 1492 blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
1494 1493
1495 mlog_entry("i_blocks = %llu, i_size = %llu, blocks = %d\n", 1494 mlog_entry("i_blocks = %llu, i_size = %llu, blocks = %d\n",
1496 (unsigned long long)inode->i_blocks, 1495 (unsigned long long)inode->i_blocks,
1497 i_size_read(inode), blocks); 1496 i_size_read(inode), blocks);
1498 1497
1499 /* Sanity check -- make sure we're going to fit. */ 1498 /* Sanity check -- make sure we're going to fit. */
1500 if (bytes_left > 1499 if (bytes_left >
1501 ocfs2_clusters_to_bytes(sb, OCFS2_I(inode)->ip_clusters)) { 1500 ocfs2_clusters_to_bytes(sb, OCFS2_I(inode)->ip_clusters)) {
1502 status = -EIO; 1501 status = -EIO;
1503 mlog_errno(status); 1502 mlog_errno(status);
1504 goto bail; 1503 goto bail;
1505 } 1504 }
1506 1505
1507 bhs = kcalloc(blocks, sizeof(struct buffer_head *), GFP_KERNEL); 1506 bhs = kcalloc(blocks, sizeof(struct buffer_head *), GFP_KERNEL);
1508 if (!bhs) { 1507 if (!bhs) {
1509 status = -ENOMEM; 1508 status = -ENOMEM;
1510 mlog_errno(status); 1509 mlog_errno(status);
1511 goto bail; 1510 goto bail;
1512 } 1511 }
1513 1512
1514 status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks, 1513 status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks,
1515 NULL); 1514 NULL);
1516 if (status < 0) { 1515 if (status < 0) {
1517 mlog_errno(status); 1516 mlog_errno(status);
1518 goto bail; 1517 goto bail;
1519 } 1518 }
1520 1519
1521 /* links can never be larger than one cluster so we know this 1520 /* links can never be larger than one cluster so we know this
1522 * is all going to be contiguous, but do a sanity check 1521 * is all going to be contiguous, but do a sanity check
1523 * anyway. */ 1522 * anyway. */
1524 if ((p_blocks << sb->s_blocksize_bits) < bytes_left) { 1523 if ((p_blocks << sb->s_blocksize_bits) < bytes_left) {
1525 status = -EIO; 1524 status = -EIO;
1526 mlog_errno(status); 1525 mlog_errno(status);
1527 goto bail; 1526 goto bail;
1528 } 1527 }
1529 1528
1530 virtual = 0; 1529 virtual = 0;
1531 while(bytes_left > 0) { 1530 while(bytes_left > 0) {
1532 c = &symname[virtual * sb->s_blocksize]; 1531 c = &symname[virtual * sb->s_blocksize];
1533 1532
1534 bhs[virtual] = sb_getblk(sb, p_blkno); 1533 bhs[virtual] = sb_getblk(sb, p_blkno);
1535 if (!bhs[virtual]) { 1534 if (!bhs[virtual]) {
1536 status = -ENOMEM; 1535 status = -ENOMEM;
1537 mlog_errno(status); 1536 mlog_errno(status);
1538 goto bail; 1537 goto bail;
1539 } 1538 }
1540 ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]); 1539 ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]);
1541 1540
1542 status = ocfs2_journal_access(handle, inode, bhs[virtual], 1541 status = ocfs2_journal_access(handle, inode, bhs[virtual],
1543 OCFS2_JOURNAL_ACCESS_CREATE); 1542 OCFS2_JOURNAL_ACCESS_CREATE);
1544 if (status < 0) { 1543 if (status < 0) {
1545 mlog_errno(status); 1544 mlog_errno(status);
1546 goto bail; 1545 goto bail;
1547 } 1546 }
1548 1547
1549 memset(bhs[virtual]->b_data, 0, sb->s_blocksize); 1548 memset(bhs[virtual]->b_data, 0, sb->s_blocksize);
1550 1549
1551 memcpy(bhs[virtual]->b_data, c, 1550 memcpy(bhs[virtual]->b_data, c,
1552 (bytes_left > sb->s_blocksize) ? sb->s_blocksize : 1551 (bytes_left > sb->s_blocksize) ? sb->s_blocksize :
1553 bytes_left); 1552 bytes_left);
1554 1553
1555 status = ocfs2_journal_dirty(handle, bhs[virtual]); 1554 status = ocfs2_journal_dirty(handle, bhs[virtual]);
1556 if (status < 0) { 1555 if (status < 0) {
1557 mlog_errno(status); 1556 mlog_errno(status);
1558 goto bail; 1557 goto bail;
1559 } 1558 }
1560 1559
1561 virtual++; 1560 virtual++;
1562 p_blkno++; 1561 p_blkno++;
1563 bytes_left -= sb->s_blocksize; 1562 bytes_left -= sb->s_blocksize;
1564 } 1563 }
1565 1564
1566 status = 0; 1565 status = 0;
1567 bail: 1566 bail:
1568 1567
1569 if (bhs) { 1568 if (bhs) {
1570 for(i = 0; i < blocks; i++) 1569 for(i = 0; i < blocks; i++)
1571 if (bhs[i]) 1570 if (bhs[i])
1572 brelse(bhs[i]); 1571 brelse(bhs[i]);
1573 kfree(bhs); 1572 kfree(bhs);
1574 } 1573 }
1575 1574
1576 mlog_exit(status); 1575 mlog_exit(status);
1577 return status; 1576 return status;
1578 } 1577 }
1579 1578
1580 static int ocfs2_symlink(struct inode *dir, 1579 static int ocfs2_symlink(struct inode *dir,
1581 struct dentry *dentry, 1580 struct dentry *dentry,
1582 const char *symname) 1581 const char *symname)
1583 { 1582 {
1584 int status, l, credits; 1583 int status, l, credits;
1585 u64 newsize; 1584 u64 newsize;
1586 struct ocfs2_super *osb = NULL; 1585 struct ocfs2_super *osb = NULL;
1587 struct inode *inode = NULL; 1586 struct inode *inode = NULL;
1588 struct super_block *sb; 1587 struct super_block *sb;
1589 struct buffer_head *new_fe_bh = NULL; 1588 struct buffer_head *new_fe_bh = NULL;
1590 struct buffer_head *de_bh = NULL; 1589 struct buffer_head *de_bh = NULL;
1591 struct buffer_head *parent_fe_bh = NULL; 1590 struct buffer_head *parent_fe_bh = NULL;
1592 struct ocfs2_dinode *fe = NULL; 1591 struct ocfs2_dinode *fe = NULL;
1593 struct ocfs2_dinode *dirfe; 1592 struct ocfs2_dinode *dirfe;
1594 handle_t *handle = NULL; 1593 handle_t *handle = NULL;
1595 struct ocfs2_alloc_context *inode_ac = NULL; 1594 struct ocfs2_alloc_context *inode_ac = NULL;
1596 struct ocfs2_alloc_context *data_ac = NULL; 1595 struct ocfs2_alloc_context *data_ac = NULL;
1597 1596
1598 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1597 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1599 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1598 dentry, symname, dentry->d_name.len, dentry->d_name.name);
1600 1599
1601 sb = dir->i_sb; 1600 sb = dir->i_sb;
1602 osb = OCFS2_SB(sb); 1601 osb = OCFS2_SB(sb);
1603 1602
1604 l = strlen(symname) + 1; 1603 l = strlen(symname) + 1;
1605 1604
1606 credits = ocfs2_calc_symlink_credits(sb); 1605 credits = ocfs2_calc_symlink_credits(sb);
1607 1606
1608 /* lock the parent directory */ 1607 /* lock the parent directory */
1609 status = ocfs2_meta_lock(dir, &parent_fe_bh, 1); 1608 status = ocfs2_meta_lock(dir, &parent_fe_bh, 1);
1610 if (status < 0) { 1609 if (status < 0) {
1611 if (status != -ENOENT) 1610 if (status != -ENOENT)
1612 mlog_errno(status); 1611 mlog_errno(status);
1613 return status; 1612 return status;
1614 } 1613 }
1615 1614
1616 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; 1615 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
1617 if (!dirfe->i_links_count) { 1616 if (!dirfe->i_links_count) {
1618 /* can't make a file in a deleted directory. */ 1617 /* can't make a file in a deleted directory. */
1619 status = -ENOENT; 1618 status = -ENOENT;
1620 goto bail; 1619 goto bail;
1621 } 1620 }
1622 1621
1623 status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, 1622 status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
1624 dentry->d_name.len); 1623 dentry->d_name.len);
1625 if (status) 1624 if (status)
1626 goto bail; 1625 goto bail;
1627 1626
1628 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, 1627 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
1629 dentry->d_name.name, 1628 dentry->d_name.name,
1630 dentry->d_name.len, &de_bh); 1629 dentry->d_name.len, &de_bh);
1631 if (status < 0) { 1630 if (status < 0) {
1632 mlog_errno(status); 1631 mlog_errno(status);
1633 goto bail; 1632 goto bail;
1634 } 1633 }
1635 1634
1636 status = ocfs2_reserve_new_inode(osb, &inode_ac); 1635 status = ocfs2_reserve_new_inode(osb, &inode_ac);
1637 if (status < 0) { 1636 if (status < 0) {
1638 if (status != -ENOSPC) 1637 if (status != -ENOSPC)
1639 mlog_errno(status); 1638 mlog_errno(status);
1640 goto bail; 1639 goto bail;
1641 } 1640 }
1642 1641
1643 /* don't reserve bitmap space for fast symlinks. */ 1642 /* don't reserve bitmap space for fast symlinks. */
1644 if (l > ocfs2_fast_symlink_chars(sb)) { 1643 if (l > ocfs2_fast_symlink_chars(sb)) {
1645 status = ocfs2_reserve_clusters(osb, 1, &data_ac); 1644 status = ocfs2_reserve_clusters(osb, 1, &data_ac);
1646 if (status < 0) { 1645 if (status < 0) {
1647 if (status != -ENOSPC) 1646 if (status != -ENOSPC)
1648 mlog_errno(status); 1647 mlog_errno(status);
1649 goto bail; 1648 goto bail;
1650 } 1649 }
1651 } 1650 }
1652 1651
1653 handle = ocfs2_start_trans(osb, credits); 1652 handle = ocfs2_start_trans(osb, credits);
1654 if (IS_ERR(handle)) { 1653 if (IS_ERR(handle)) {
1655 status = PTR_ERR(handle); 1654 status = PTR_ERR(handle);
1656 handle = NULL; 1655 handle = NULL;
1657 mlog_errno(status); 1656 mlog_errno(status);
1658 goto bail; 1657 goto bail;
1659 } 1658 }
1660 1659
1661 status = ocfs2_mknod_locked(osb, dir, dentry, 1660 status = ocfs2_mknod_locked(osb, dir, dentry,
1662 S_IFLNK | S_IRWXUGO, 0, 1661 S_IFLNK | S_IRWXUGO, 0,
1663 &new_fe_bh, parent_fe_bh, handle, 1662 &new_fe_bh, parent_fe_bh, handle,
1664 &inode, inode_ac); 1663 &inode, inode_ac);
1665 if (status < 0) { 1664 if (status < 0) {
1666 mlog_errno(status); 1665 mlog_errno(status);
1667 goto bail; 1666 goto bail;
1668 } 1667 }
1669 1668
1670 fe = (struct ocfs2_dinode *) new_fe_bh->b_data; 1669 fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
1671 inode->i_rdev = 0; 1670 inode->i_rdev = 0;
1672 newsize = l - 1; 1671 newsize = l - 1;
1673 if (l > ocfs2_fast_symlink_chars(sb)) { 1672 if (l > ocfs2_fast_symlink_chars(sb)) {
1674 u32 offset = 0; 1673 u32 offset = 0;
1675 1674
1676 inode->i_op = &ocfs2_symlink_inode_operations; 1675 inode->i_op = &ocfs2_symlink_inode_operations;
1677 status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 1676 status = ocfs2_do_extend_allocation(osb, inode, &offset, 1,
1678 new_fe_bh, 1677 new_fe_bh,
1679 handle, data_ac, NULL, 1678 handle, data_ac, NULL,
1680 NULL); 1679 NULL);
1681 if (status < 0) { 1680 if (status < 0) {
1682 if (status != -ENOSPC && status != -EINTR) { 1681 if (status != -ENOSPC && status != -EINTR) {
1683 mlog(ML_ERROR, 1682 mlog(ML_ERROR,
1684 "Failed to extend file to %llu\n", 1683 "Failed to extend file to %llu\n",
1685 (unsigned long long)newsize); 1684 (unsigned long long)newsize);
1686 mlog_errno(status); 1685 mlog_errno(status);
1687 status = -ENOSPC; 1686 status = -ENOSPC;
1688 } 1687 }
1689 goto bail; 1688 goto bail;
1690 } 1689 }
1691 i_size_write(inode, newsize); 1690 i_size_write(inode, newsize);
1692 inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize); 1691 inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize);
1693 } else { 1692 } else {
1694 inode->i_op = &ocfs2_fast_symlink_inode_operations; 1693 inode->i_op = &ocfs2_fast_symlink_inode_operations;
1695 memcpy((char *) fe->id2.i_symlink, symname, l); 1694 memcpy((char *) fe->id2.i_symlink, symname, l);
1696 i_size_write(inode, newsize); 1695 i_size_write(inode, newsize);
1697 inode->i_blocks = 0; 1696 inode->i_blocks = 0;
1698 } 1697 }
1699 1698
1700 status = ocfs2_mark_inode_dirty(handle, inode, new_fe_bh); 1699 status = ocfs2_mark_inode_dirty(handle, inode, new_fe_bh);
1701 if (status < 0) { 1700 if (status < 0) {
1702 mlog_errno(status); 1701 mlog_errno(status);
1703 goto bail; 1702 goto bail;
1704 } 1703 }
1705 1704
1706 if (!ocfs2_inode_is_fast_symlink(inode)) { 1705 if (!ocfs2_inode_is_fast_symlink(inode)) {
1707 status = ocfs2_create_symlink_data(osb, handle, inode, 1706 status = ocfs2_create_symlink_data(osb, handle, inode,
1708 symname); 1707 symname);
1709 if (status < 0) { 1708 if (status < 0) {
1710 mlog_errno(status); 1709 mlog_errno(status);
1711 goto bail; 1710 goto bail;
1712 } 1711 }
1713 } 1712 }
1714 1713
1715 status = ocfs2_add_entry(handle, dentry, inode, 1714 status = ocfs2_add_entry(handle, dentry, inode,
1716 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1715 le64_to_cpu(fe->i_blkno), parent_fe_bh,
1717 de_bh); 1716 de_bh);
1718 if (status < 0) { 1717 if (status < 0) {
1719 mlog_errno(status); 1718 mlog_errno(status);
1720 goto bail; 1719 goto bail;
1721 } 1720 }
1722 1721
1723 status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno); 1722 status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno);
1724 if (status) { 1723 if (status) {
1725 mlog_errno(status); 1724 mlog_errno(status);
1726 goto bail; 1725 goto bail;
1727 } 1726 }
1728 1727
1729 insert_inode_hash(inode); 1728 insert_inode_hash(inode);
1730 dentry->d_op = &ocfs2_dentry_ops; 1729 dentry->d_op = &ocfs2_dentry_ops;
1731 d_instantiate(dentry, inode); 1730 d_instantiate(dentry, inode);
1732 bail: 1731 bail:
1733 if (handle) 1732 if (handle)
1734 ocfs2_commit_trans(osb, handle); 1733 ocfs2_commit_trans(osb, handle);
1735 1734
1736 ocfs2_meta_unlock(dir, 1); 1735 ocfs2_meta_unlock(dir, 1);
1737 1736
1738 if (new_fe_bh) 1737 if (new_fe_bh)
1739 brelse(new_fe_bh); 1738 brelse(new_fe_bh);
1740 if (parent_fe_bh) 1739 if (parent_fe_bh)
1741 brelse(parent_fe_bh); 1740 brelse(parent_fe_bh);
1742 if (de_bh) 1741 if (de_bh)
1743 brelse(de_bh); 1742 brelse(de_bh);
1744 if (inode_ac) 1743 if (inode_ac)
1745 ocfs2_free_alloc_context(inode_ac); 1744 ocfs2_free_alloc_context(inode_ac);
1746 if (data_ac) 1745 if (data_ac)
1747 ocfs2_free_alloc_context(data_ac); 1746 ocfs2_free_alloc_context(data_ac);
1748 if ((status < 0) && inode) 1747 if ((status < 0) && inode)
1749 iput(inode); 1748 iput(inode);
1750 1749
1751 mlog_exit(status); 1750 mlog_exit(status);
1752 1751
1753 return status; 1752 return status;
1754 } 1753 }
1755 1754
1756 int ocfs2_check_dir_entry(struct inode * dir, 1755 int ocfs2_check_dir_entry(struct inode * dir,
1757 struct ocfs2_dir_entry * de, 1756 struct ocfs2_dir_entry * de,
1758 struct buffer_head * bh, 1757 struct buffer_head * bh,
1759 unsigned long offset) 1758 unsigned long offset)
1760 { 1759 {
1761 const char *error_msg = NULL; 1760 const char *error_msg = NULL;
1762 const int rlen = le16_to_cpu(de->rec_len); 1761 const int rlen = le16_to_cpu(de->rec_len);
1763 1762
1764 if (rlen < OCFS2_DIR_REC_LEN(1)) 1763 if (rlen < OCFS2_DIR_REC_LEN(1))
1765 error_msg = "rec_len is smaller than minimal"; 1764 error_msg = "rec_len is smaller than minimal";
1766 else if (rlen % 4 != 0) 1765 else if (rlen % 4 != 0)
1767 error_msg = "rec_len % 4 != 0"; 1766 error_msg = "rec_len % 4 != 0";
1768 else if (rlen < OCFS2_DIR_REC_LEN(de->name_len)) 1767 else if (rlen < OCFS2_DIR_REC_LEN(de->name_len))
1769 error_msg = "rec_len is too small for name_len"; 1768 error_msg = "rec_len is too small for name_len";
1770 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) 1769 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
1771 error_msg = "directory entry across blocks"; 1770 error_msg = "directory entry across blocks";
1772 1771
1773 if (error_msg != NULL) 1772 if (error_msg != NULL)
1774 mlog(ML_ERROR, "bad entry in directory #%llu: %s - " 1773 mlog(ML_ERROR, "bad entry in directory #%llu: %s - "
1775 "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n", 1774 "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n",
1776 (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg, 1775 (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg,
1777 offset, (unsigned long long)le64_to_cpu(de->inode), rlen, 1776 offset, (unsigned long long)le64_to_cpu(de->inode), rlen,
1778 de->name_len); 1777 de->name_len);
1779 return error_msg == NULL ? 1 : 0; 1778 return error_msg == NULL ? 1 : 0;
1780 } 1779 }
1781 1780
1782 /* we don't always have a dentry for what we want to add, so people 1781 /* we don't always have a dentry for what we want to add, so people
1783 * like orphan dir can call this instead. 1782 * like orphan dir can call this instead.
1784 * 1783 *
1785 * If you pass me insert_bh, I'll skip the search of the other dir 1784 * If you pass me insert_bh, I'll skip the search of the other dir
1786 * blocks and put the record in there. 1785 * blocks and put the record in there.
1787 */ 1786 */
1788 static int __ocfs2_add_entry(handle_t *handle, 1787 static int __ocfs2_add_entry(handle_t *handle,
1789 struct inode *dir, 1788 struct inode *dir,
1790 const char *name, int namelen, 1789 const char *name, int namelen,
1791 struct inode *inode, u64 blkno, 1790 struct inode *inode, u64 blkno,
1792 struct buffer_head *parent_fe_bh, 1791 struct buffer_head *parent_fe_bh,
1793 struct buffer_head *insert_bh) 1792 struct buffer_head *insert_bh)
1794 { 1793 {
1795 unsigned long offset; 1794 unsigned long offset;
1796 unsigned short rec_len; 1795 unsigned short rec_len;
1797 struct ocfs2_dir_entry *de, *de1; 1796 struct ocfs2_dir_entry *de, *de1;
1798 struct super_block *sb; 1797 struct super_block *sb;
1799 int retval, status; 1798 int retval, status;
1800 1799
1801 mlog_entry_void(); 1800 mlog_entry_void();
1802 1801
1803 sb = dir->i_sb; 1802 sb = dir->i_sb;
1804 1803
1805 if (!namelen) 1804 if (!namelen)
1806 return -EINVAL; 1805 return -EINVAL;
1807 1806
1808 rec_len = OCFS2_DIR_REC_LEN(namelen); 1807 rec_len = OCFS2_DIR_REC_LEN(namelen);
1809 offset = 0; 1808 offset = 0;
1810 de = (struct ocfs2_dir_entry *) insert_bh->b_data; 1809 de = (struct ocfs2_dir_entry *) insert_bh->b_data;
1811 while (1) { 1810 while (1) {
1812 BUG_ON((char *)de >= sb->s_blocksize + insert_bh->b_data); 1811 BUG_ON((char *)de >= sb->s_blocksize + insert_bh->b_data);
1813 /* These checks should've already been passed by the 1812 /* These checks should've already been passed by the
1814 * prepare function, but I guess we can leave them 1813 * prepare function, but I guess we can leave them
1815 * here anyway. */ 1814 * here anyway. */
1816 if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) { 1815 if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) {
1817 retval = -ENOENT; 1816 retval = -ENOENT;
1818 goto bail; 1817 goto bail;
1819 } 1818 }
1820 if (ocfs2_match(namelen, name, de)) { 1819 if (ocfs2_match(namelen, name, de)) {
1821 retval = -EEXIST; 1820 retval = -EEXIST;
1822 goto bail; 1821 goto bail;
1823 } 1822 }
1824 if (((le64_to_cpu(de->inode) == 0) && 1823 if (((le64_to_cpu(de->inode) == 0) &&
1825 (le16_to_cpu(de->rec_len) >= rec_len)) || 1824 (le16_to_cpu(de->rec_len) >= rec_len)) ||
1826 (le16_to_cpu(de->rec_len) >= 1825 (le16_to_cpu(de->rec_len) >=
1827 (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) { 1826 (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
1828 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 1827 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
1829 retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); 1828 retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
1830 if (retval < 0) { 1829 if (retval < 0) {
1831 mlog_errno(retval); 1830 mlog_errno(retval);
1832 goto bail; 1831 goto bail;
1833 } 1832 }
1834 1833
1835 status = ocfs2_journal_access(handle, dir, insert_bh, 1834 status = ocfs2_journal_access(handle, dir, insert_bh,
1836 OCFS2_JOURNAL_ACCESS_WRITE); 1835 OCFS2_JOURNAL_ACCESS_WRITE);
1837 /* By now the buffer is marked for journaling */ 1836 /* By now the buffer is marked for journaling */
1838 offset += le16_to_cpu(de->rec_len); 1837 offset += le16_to_cpu(de->rec_len);
1839 if (le64_to_cpu(de->inode)) { 1838 if (le64_to_cpu(de->inode)) {
1840 de1 = (struct ocfs2_dir_entry *)((char *) de + 1839 de1 = (struct ocfs2_dir_entry *)((char *) de +
1841 OCFS2_DIR_REC_LEN(de->name_len)); 1840 OCFS2_DIR_REC_LEN(de->name_len));
1842 de1->rec_len = 1841 de1->rec_len =
1843 cpu_to_le16(le16_to_cpu(de->rec_len) - 1842 cpu_to_le16(le16_to_cpu(de->rec_len) -
1844 OCFS2_DIR_REC_LEN(de->name_len)); 1843 OCFS2_DIR_REC_LEN(de->name_len));
1845 de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); 1844 de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
1846 de = de1; 1845 de = de1;
1847 } 1846 }
1848 de->file_type = OCFS2_FT_UNKNOWN; 1847 de->file_type = OCFS2_FT_UNKNOWN;
1849 if (blkno) { 1848 if (blkno) {
1850 de->inode = cpu_to_le64(blkno); 1849 de->inode = cpu_to_le64(blkno);
1851 ocfs2_set_de_type(de, inode->i_mode); 1850 ocfs2_set_de_type(de, inode->i_mode);
1852 } else 1851 } else
1853 de->inode = 0; 1852 de->inode = 0;
1854 de->name_len = namelen; 1853 de->name_len = namelen;
1855 memcpy(de->name, name, namelen); 1854 memcpy(de->name, name, namelen);
1856 1855
1857 dir->i_version++; 1856 dir->i_version++;
1858 status = ocfs2_journal_dirty(handle, insert_bh); 1857 status = ocfs2_journal_dirty(handle, insert_bh);
1859 retval = 0; 1858 retval = 0;
1860 goto bail; 1859 goto bail;
1861 } 1860 }
1862 offset += le16_to_cpu(de->rec_len); 1861 offset += le16_to_cpu(de->rec_len);
1863 de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len)); 1862 de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
1864 } 1863 }
1865 1864
1866 /* when you think about it, the assert above should prevent us 1865 /* when you think about it, the assert above should prevent us
1867 * from ever getting here. */ 1866 * from ever getting here. */
1868 retval = -ENOSPC; 1867 retval = -ENOSPC;
1869 bail: 1868 bail:
1870 1869
1871 mlog_exit(retval); 1870 mlog_exit(retval);
1872 return retval; 1871 return retval;
1873 } 1872 }
1874 1873
1875 1874
1876 /* 1875 /*
1877 * ocfs2_delete_entry deletes a directory entry by merging it with the 1876 * ocfs2_delete_entry deletes a directory entry by merging it with the
1878 * previous entry 1877 * previous entry
1879 */ 1878 */
1880 static int ocfs2_delete_entry(handle_t *handle, 1879 static int ocfs2_delete_entry(handle_t *handle,
1881 struct inode *dir, 1880 struct inode *dir,
1882 struct ocfs2_dir_entry *de_del, 1881 struct ocfs2_dir_entry *de_del,
1883 struct buffer_head *bh) 1882 struct buffer_head *bh)
1884 { 1883 {
1885 struct ocfs2_dir_entry *de, *pde; 1884 struct ocfs2_dir_entry *de, *pde;
1886 int i, status = -ENOENT; 1885 int i, status = -ENOENT;
1887 1886
1888 mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh); 1887 mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
1889 1888
1890 i = 0; 1889 i = 0;
1891 pde = NULL; 1890 pde = NULL;
1892 de = (struct ocfs2_dir_entry *) bh->b_data; 1891 de = (struct ocfs2_dir_entry *) bh->b_data;
1893 while (i < bh->b_size) { 1892 while (i < bh->b_size) {
1894 if (!ocfs2_check_dir_entry(dir, de, bh, i)) { 1893 if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
1895 status = -EIO; 1894 status = -EIO;
1896 mlog_errno(status); 1895 mlog_errno(status);
1897 goto bail; 1896 goto bail;
1898 } 1897 }
1899 if (de == de_del) { 1898 if (de == de_del) {
1900 status = ocfs2_journal_access(handle, dir, bh, 1899 status = ocfs2_journal_access(handle, dir, bh,
1901 OCFS2_JOURNAL_ACCESS_WRITE); 1900 OCFS2_JOURNAL_ACCESS_WRITE);
1902 if (status < 0) { 1901 if (status < 0) {
1903 status = -EIO; 1902 status = -EIO;
1904 mlog_errno(status); 1903 mlog_errno(status);
1905 goto bail; 1904 goto bail;
1906 } 1905 }
1907 if (pde) 1906 if (pde)
1908 pde->rec_len = 1907 pde->rec_len =
1909 cpu_to_le16(le16_to_cpu(pde->rec_len) + 1908 cpu_to_le16(le16_to_cpu(pde->rec_len) +
1910 le16_to_cpu(de->rec_len)); 1909 le16_to_cpu(de->rec_len));
1911 else 1910 else
1912 de->inode = 0; 1911 de->inode = 0;
1913 dir->i_version++; 1912 dir->i_version++;
1914 status = ocfs2_journal_dirty(handle, bh); 1913 status = ocfs2_journal_dirty(handle, bh);
1915 goto bail; 1914 goto bail;
1916 } 1915 }
1917 i += le16_to_cpu(de->rec_len); 1916 i += le16_to_cpu(de->rec_len);
1918 pde = de; 1917 pde = de;
1919 de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len)); 1918 de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
1920 } 1919 }
1921 bail: 1920 bail:
1922 mlog_exit(status); 1921 mlog_exit(status);
1923 return status; 1922 return status;
1924 } 1923 }
1925 1924
1926 /* 1925 /*
1927 * Returns 0 if not found, -1 on failure, and 1 on success 1926 * Returns 0 if not found, -1 on failure, and 1 on success
1928 */ 1927 */
1929 static int inline ocfs2_search_dirblock(struct buffer_head *bh, 1928 static int inline ocfs2_search_dirblock(struct buffer_head *bh,
1930 struct inode *dir, 1929 struct inode *dir,
1931 const char *name, int namelen, 1930 const char *name, int namelen,
1932 unsigned long offset, 1931 unsigned long offset,
1933 struct ocfs2_dir_entry **res_dir) 1932 struct ocfs2_dir_entry **res_dir)
1934 { 1933 {
1935 struct ocfs2_dir_entry *de; 1934 struct ocfs2_dir_entry *de;
1936 char *dlimit, *de_buf; 1935 char *dlimit, *de_buf;
1937 int de_len; 1936 int de_len;
1938 int ret = 0; 1937 int ret = 0;
1939 1938
1940 mlog_entry_void(); 1939 mlog_entry_void();
1941 1940
1942 de_buf = bh->b_data; 1941 de_buf = bh->b_data;
1943 dlimit = de_buf + dir->i_sb->s_blocksize; 1942 dlimit = de_buf + dir->i_sb->s_blocksize;
1944 1943
1945 while (de_buf < dlimit) { 1944 while (de_buf < dlimit) {
1946 /* this code is executed quadratically often */ 1945 /* this code is executed quadratically often */
1947 /* do minimal checking `by hand' */ 1946 /* do minimal checking `by hand' */
1948 1947
1949 de = (struct ocfs2_dir_entry *) de_buf; 1948 de = (struct ocfs2_dir_entry *) de_buf;
1950 1949
1951 if (de_buf + namelen <= dlimit && 1950 if (de_buf + namelen <= dlimit &&
1952 ocfs2_match(namelen, name, de)) { 1951 ocfs2_match(namelen, name, de)) {
1953 /* found a match - just to be sure, do a full check */ 1952 /* found a match - just to be sure, do a full check */
1954 if (!ocfs2_check_dir_entry(dir, de, bh, offset)) { 1953 if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
1955 ret = -1; 1954 ret = -1;
1956 goto bail; 1955 goto bail;
1957 } 1956 }
1958 *res_dir = de; 1957 *res_dir = de;
1959 ret = 1; 1958 ret = 1;
1960 goto bail; 1959 goto bail;
1961 } 1960 }
1962 1961
1963 /* prevent looping on a bad block */ 1962 /* prevent looping on a bad block */
1964 de_len = le16_to_cpu(de->rec_len); 1963 de_len = le16_to_cpu(de->rec_len);
1965 if (de_len <= 0) { 1964 if (de_len <= 0) {
1966 ret = -1; 1965 ret = -1;
1967 goto bail; 1966 goto bail;
1968 } 1967 }
1969 1968
1970 de_buf += de_len; 1969 de_buf += de_len;
1971 offset += de_len; 1970 offset += de_len;
1972 } 1971 }
1973 1972
1974 bail: 1973 bail:
1975 mlog_exit(ret); 1974 mlog_exit(ret);
1976 return ret; 1975 return ret;
1977 } 1976 }
1978 1977
1979 struct buffer_head *ocfs2_find_entry(const char *name, int namelen, 1978 struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
1980 struct inode *dir, 1979 struct inode *dir,
1981 struct ocfs2_dir_entry **res_dir) 1980 struct ocfs2_dir_entry **res_dir)
1982 { 1981 {
1983 struct super_block *sb; 1982 struct super_block *sb;
1984 struct buffer_head *bh_use[NAMEI_RA_SIZE]; 1983 struct buffer_head *bh_use[NAMEI_RA_SIZE];
1985 struct buffer_head *bh, *ret = NULL; 1984 struct buffer_head *bh, *ret = NULL;
1986 unsigned long start, block, b; 1985 unsigned long start, block, b;
1987 int ra_max = 0; /* Number of bh's in the readahead 1986 int ra_max = 0; /* Number of bh's in the readahead
1988 buffer, bh_use[] */ 1987 buffer, bh_use[] */
1989 int ra_ptr = 0; /* Current index into readahead 1988 int ra_ptr = 0; /* Current index into readahead
1990 buffer */ 1989 buffer */
1991 int num = 0; 1990 int num = 0;
1992 int nblocks, i, err; 1991 int nblocks, i, err;
1993 1992
1994 mlog_entry_void(); 1993 mlog_entry_void();
1995 1994
1996 *res_dir = NULL; 1995 *res_dir = NULL;
1997 sb = dir->i_sb; 1996 sb = dir->i_sb;
1998 1997
1999 nblocks = i_size_read(dir) >> sb->s_blocksize_bits; 1998 nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
2000 start = OCFS2_I(dir)->ip_dir_start_lookup; 1999 start = OCFS2_I(dir)->ip_dir_start_lookup;
2001 if (start >= nblocks) 2000 if (start >= nblocks)
2002 start = 0; 2001 start = 0;
2003 block = start; 2002 block = start;
2004 2003
2005 restart: 2004 restart:
2006 do { 2005 do {
2007 /* 2006 /*
2008 * We deal with the read-ahead logic here. 2007 * We deal with the read-ahead logic here.
2009 */ 2008 */
2010 if (ra_ptr >= ra_max) { 2009 if (ra_ptr >= ra_max) {
2011 /* Refill the readahead buffer */ 2010 /* Refill the readahead buffer */
2012 ra_ptr = 0; 2011 ra_ptr = 0;
2013 b = block; 2012 b = block;
2014 for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { 2013 for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
2015 /* 2014 /*
2016 * Terminate if we reach the end of the 2015 * Terminate if we reach the end of the
2017 * directory and must wrap, or if our 2016 * directory and must wrap, or if our
2018 * search has finished at this block. 2017 * search has finished at this block.
2019 */ 2018 */
2020 if (b >= nblocks || (num && block == start)) { 2019 if (b >= nblocks || (num && block == start)) {
2021 bh_use[ra_max] = NULL; 2020 bh_use[ra_max] = NULL;
2022 break; 2021 break;
2023 } 2022 }
2024 num++; 2023 num++;
2025 2024
2026 bh = ocfs2_bread(dir, b++, &err, 1); 2025 bh = ocfs2_bread(dir, b++, &err, 1);
2027 bh_use[ra_max] = bh; 2026 bh_use[ra_max] = bh;
2028 } 2027 }
2029 } 2028 }
2030 if ((bh = bh_use[ra_ptr++]) == NULL) 2029 if ((bh = bh_use[ra_ptr++]) == NULL)
2031 goto next; 2030 goto next;
2032 wait_on_buffer(bh); 2031 wait_on_buffer(bh);
2033 if (!buffer_uptodate(bh)) { 2032 if (!buffer_uptodate(bh)) {
2034 /* read error, skip block & hope for the best */ 2033 /* read error, skip block & hope for the best */
2035 ocfs2_error(dir->i_sb, "reading directory %llu, " 2034 ocfs2_error(dir->i_sb, "reading directory %llu, "
2036 "offset %lu\n", 2035 "offset %lu\n",
2037 (unsigned long long)OCFS2_I(dir)->ip_blkno, 2036 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2038 block); 2037 block);
2039 brelse(bh); 2038 brelse(bh);
2040 goto next; 2039 goto next;
2041 } 2040 }
2042 i = ocfs2_search_dirblock(bh, dir, name, namelen, 2041 i = ocfs2_search_dirblock(bh, dir, name, namelen,
2043 block << sb->s_blocksize_bits, 2042 block << sb->s_blocksize_bits,
2044 res_dir); 2043 res_dir);
2045 if (i == 1) { 2044 if (i == 1) {
2046 OCFS2_I(dir)->ip_dir_start_lookup = block; 2045 OCFS2_I(dir)->ip_dir_start_lookup = block;
2047 ret = bh; 2046 ret = bh;
2048 goto cleanup_and_exit; 2047 goto cleanup_and_exit;
2049 } else { 2048 } else {
2050 brelse(bh); 2049 brelse(bh);
2051 if (i < 0) 2050 if (i < 0)
2052 goto cleanup_and_exit; 2051 goto cleanup_and_exit;
2053 } 2052 }
2054 next: 2053 next:
2055 if (++block >= nblocks) 2054 if (++block >= nblocks)
2056 block = 0; 2055 block = 0;
2057 } while (block != start); 2056 } while (block != start);
2058 2057
2059 /* 2058 /*
2060 * If the directory has grown while we were searching, then 2059 * If the directory has grown while we were searching, then
2061 * search the last part of the directory before giving up. 2060 * search the last part of the directory before giving up.
2062 */ 2061 */
2063 block = nblocks; 2062 block = nblocks;
2064 nblocks = i_size_read(dir) >> sb->s_blocksize_bits; 2063 nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
2065 if (block < nblocks) { 2064 if (block < nblocks) {
2066 start = 0; 2065 start = 0;
2067 goto restart; 2066 goto restart;
2068 } 2067 }
2069 2068
2070 cleanup_and_exit: 2069 cleanup_and_exit:
2071 /* Clean up the read-ahead blocks */ 2070 /* Clean up the read-ahead blocks */
2072 for (; ra_ptr < ra_max; ra_ptr++) 2071 for (; ra_ptr < ra_max; ra_ptr++)
2073 brelse(bh_use[ra_ptr]); 2072 brelse(bh_use[ra_ptr]);
2074 2073
2075 mlog_exit_ptr(ret); 2074 mlog_exit_ptr(ret);
2076 return ret; 2075 return ret;
2077 } 2076 }
2078 2077
2079 static int ocfs2_blkno_stringify(u64 blkno, char *name) 2078 static int ocfs2_blkno_stringify(u64 blkno, char *name)
2080 { 2079 {
2081 int status, namelen; 2080 int status, namelen;
2082 2081
2083 mlog_entry_void(); 2082 mlog_entry_void();
2084 2083
2085 namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016llx", 2084 namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016llx",
2086 (long long)blkno); 2085 (long long)blkno);
2087 if (namelen <= 0) { 2086 if (namelen <= 0) {
2088 if (namelen) 2087 if (namelen)
2089 status = namelen; 2088 status = namelen;
2090 else 2089 else
2091 status = -EINVAL; 2090 status = -EINVAL;
2092 mlog_errno(status); 2091 mlog_errno(status);
2093 goto bail; 2092 goto bail;
2094 } 2093 }
2095 if (namelen != OCFS2_ORPHAN_NAMELEN) { 2094 if (namelen != OCFS2_ORPHAN_NAMELEN) {
2096 status = -EINVAL; 2095 status = -EINVAL;
2097 mlog_errno(status); 2096 mlog_errno(status);
2098 goto bail; 2097 goto bail;
2099 } 2098 }
2100 2099
2101 mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name, 2100 mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name,
2102 namelen); 2101 namelen);
2103 2102
2104 status = 0; 2103 status = 0;
2105 bail: 2104 bail:
2106 mlog_exit(status); 2105 mlog_exit(status);
2107 return status; 2106 return status;
2108 } 2107 }
2109 2108
2110 static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, 2109 static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
2111 struct inode **ret_orphan_dir, 2110 struct inode **ret_orphan_dir,
2112 struct inode *inode, 2111 struct inode *inode,
2113 char *name, 2112 char *name,
2114 struct buffer_head **de_bh) 2113 struct buffer_head **de_bh)
2115 { 2114 {
2116 struct inode *orphan_dir_inode; 2115 struct inode *orphan_dir_inode;
2117 struct buffer_head *orphan_dir_bh = NULL; 2116 struct buffer_head *orphan_dir_bh = NULL;
2118 int status = 0; 2117 int status = 0;
2119 2118
2120 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); 2119 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
2121 if (status < 0) { 2120 if (status < 0) {
2122 mlog_errno(status); 2121 mlog_errno(status);
2123 return status; 2122 return status;
2124 } 2123 }
2125 2124
2126 orphan_dir_inode = ocfs2_get_system_file_inode(osb, 2125 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
2127 ORPHAN_DIR_SYSTEM_INODE, 2126 ORPHAN_DIR_SYSTEM_INODE,
2128 osb->slot_num); 2127 osb->slot_num);
2129 if (!orphan_dir_inode) { 2128 if (!orphan_dir_inode) {
2130 status = -ENOENT; 2129 status = -ENOENT;
2131 mlog_errno(status); 2130 mlog_errno(status);
2132 return status; 2131 return status;
2133 } 2132 }
2134 2133
2135 mutex_lock(&orphan_dir_inode->i_mutex); 2134 mutex_lock(&orphan_dir_inode->i_mutex);
2136 2135
2137 status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1); 2136 status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1);
2138 if (status < 0) { 2137 if (status < 0) {
2139 mlog_errno(status); 2138 mlog_errno(status);
2140 goto leave; 2139 goto leave;
2141 } 2140 }
2142 2141
2143 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, 2142 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
2144 orphan_dir_bh, name, 2143 orphan_dir_bh, name,
2145 OCFS2_ORPHAN_NAMELEN, de_bh); 2144 OCFS2_ORPHAN_NAMELEN, de_bh);
2146 if (status < 0) { 2145 if (status < 0) {
2147 ocfs2_meta_unlock(orphan_dir_inode, 1); 2146 ocfs2_meta_unlock(orphan_dir_inode, 1);
2148 2147
2149 mlog_errno(status); 2148 mlog_errno(status);
2150 goto leave; 2149 goto leave;
2151 } 2150 }
2152 2151
2153 *ret_orphan_dir = orphan_dir_inode; 2152 *ret_orphan_dir = orphan_dir_inode;
2154 2153
2155 leave: 2154 leave:
2156 if (status) { 2155 if (status) {
2157 mutex_unlock(&orphan_dir_inode->i_mutex); 2156 mutex_unlock(&orphan_dir_inode->i_mutex);
2158 iput(orphan_dir_inode); 2157 iput(orphan_dir_inode);
2159 } 2158 }
2160 2159
2161 if (orphan_dir_bh) 2160 if (orphan_dir_bh)
2162 brelse(orphan_dir_bh); 2161 brelse(orphan_dir_bh);
2163 2162
2164 mlog_exit(status); 2163 mlog_exit(status);
2165 return status; 2164 return status;
2166 } 2165 }
2167 2166
2168 static int ocfs2_orphan_add(struct ocfs2_super *osb, 2167 static int ocfs2_orphan_add(struct ocfs2_super *osb,
2169 handle_t *handle, 2168 handle_t *handle,
2170 struct inode *inode, 2169 struct inode *inode,
2171 struct ocfs2_dinode *fe, 2170 struct ocfs2_dinode *fe,
2172 char *name, 2171 char *name,
2173 struct buffer_head *de_bh, 2172 struct buffer_head *de_bh,
2174 struct inode *orphan_dir_inode) 2173 struct inode *orphan_dir_inode)
2175 { 2174 {
2176 struct buffer_head *orphan_dir_bh = NULL; 2175 struct buffer_head *orphan_dir_bh = NULL;
2177 int status = 0; 2176 int status = 0;
2178 struct ocfs2_dinode *orphan_fe; 2177 struct ocfs2_dinode *orphan_fe;
2179 2178
2180 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 2179 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
2181 2180
2182 status = ocfs2_read_block(osb, 2181 status = ocfs2_read_block(osb,
2183 OCFS2_I(orphan_dir_inode)->ip_blkno, 2182 OCFS2_I(orphan_dir_inode)->ip_blkno,
2184 &orphan_dir_bh, OCFS2_BH_CACHED, 2183 &orphan_dir_bh, OCFS2_BH_CACHED,
2185 orphan_dir_inode); 2184 orphan_dir_inode);
2186 if (status < 0) { 2185 if (status < 0) {
2187 mlog_errno(status); 2186 mlog_errno(status);
2188 goto leave; 2187 goto leave;
2189 } 2188 }
2190 2189
2191 status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh, 2190 status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh,
2192 OCFS2_JOURNAL_ACCESS_WRITE); 2191 OCFS2_JOURNAL_ACCESS_WRITE);
2193 if (status < 0) { 2192 if (status < 0) {
2194 mlog_errno(status); 2193 mlog_errno(status);
2195 goto leave; 2194 goto leave;
2196 } 2195 }
2197 2196
2198 /* we're a cluster, and nlink can change on disk from 2197 /* we're a cluster, and nlink can change on disk from
2199 * underneath us... */ 2198 * underneath us... */
2200 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 2199 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
2201 if (S_ISDIR(inode->i_mode)) 2200 if (S_ISDIR(inode->i_mode))
2202 le16_add_cpu(&orphan_fe->i_links_count, 1); 2201 le16_add_cpu(&orphan_fe->i_links_count, 1);
2203 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); 2202 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
2204 2203
2205 status = ocfs2_journal_dirty(handle, orphan_dir_bh); 2204 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2206 if (status < 0) { 2205 if (status < 0) {
2207 mlog_errno(status); 2206 mlog_errno(status);
2208 goto leave; 2207 goto leave;
2209 } 2208 }
2210 2209
2211 status = __ocfs2_add_entry(handle, orphan_dir_inode, name, 2210 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
2212 OCFS2_ORPHAN_NAMELEN, inode, 2211 OCFS2_ORPHAN_NAMELEN, inode,
2213 OCFS2_I(inode)->ip_blkno, 2212 OCFS2_I(inode)->ip_blkno,
2214 orphan_dir_bh, de_bh); 2213 orphan_dir_bh, de_bh);
2215 if (status < 0) { 2214 if (status < 0) {
2216 mlog_errno(status); 2215 mlog_errno(status);
2217 goto leave; 2216 goto leave;
2218 } 2217 }
2219 2218
2220 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); 2219 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
2221 2220
2222 /* Record which orphan dir our inode now resides 2221 /* Record which orphan dir our inode now resides
2223 * in. delete_inode will use this to determine which orphan 2222 * in. delete_inode will use this to determine which orphan
2224 * dir to lock. */ 2223 * dir to lock. */
2225 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); 2224 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
2226 2225
2227 mlog(0, "Inode %llu orphaned in slot %d\n", 2226 mlog(0, "Inode %llu orphaned in slot %d\n",
2228 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); 2227 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
2229 2228
2230 leave: 2229 leave:
2231 if (orphan_dir_bh) 2230 if (orphan_dir_bh)
2232 brelse(orphan_dir_bh); 2231 brelse(orphan_dir_bh);
2233 2232
2234 mlog_exit(status); 2233 mlog_exit(status);
2235 return status; 2234 return status;
2236 } 2235 }
2237 2236
2238 /* unlike orphan_add, we expect the orphan dir to already be locked here. */ 2237 /* unlike orphan_add, we expect the orphan dir to already be locked here. */
2239 int ocfs2_orphan_del(struct ocfs2_super *osb, 2238 int ocfs2_orphan_del(struct ocfs2_super *osb,
2240 handle_t *handle, 2239 handle_t *handle,
2241 struct inode *orphan_dir_inode, 2240 struct inode *orphan_dir_inode,
2242 struct inode *inode, 2241 struct inode *inode,
2243 struct buffer_head *orphan_dir_bh) 2242 struct buffer_head *orphan_dir_bh)
2244 { 2243 {
2245 char name[OCFS2_ORPHAN_NAMELEN + 1]; 2244 char name[OCFS2_ORPHAN_NAMELEN + 1];
2246 struct ocfs2_dinode *orphan_fe; 2245 struct ocfs2_dinode *orphan_fe;
2247 int status = 0; 2246 int status = 0;
2248 struct buffer_head *target_de_bh = NULL; 2247 struct buffer_head *target_de_bh = NULL;
2249 struct ocfs2_dir_entry *target_de = NULL; 2248 struct ocfs2_dir_entry *target_de = NULL;
2250 2249
2251 mlog_entry_void(); 2250 mlog_entry_void();
2252 2251
2253 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); 2252 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
2254 if (status < 0) { 2253 if (status < 0) {
2255 mlog_errno(status); 2254 mlog_errno(status);
2256 goto leave; 2255 goto leave;
2257 } 2256 }
2258 2257
2259 mlog(0, "removing '%s' from orphan dir %llu (namelen=%d)\n", 2258 mlog(0, "removing '%s' from orphan dir %llu (namelen=%d)\n",
2260 name, (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, 2259 name, (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
2261 OCFS2_ORPHAN_NAMELEN); 2260 OCFS2_ORPHAN_NAMELEN);
2262 2261
2263 /* find it's spot in the orphan directory */ 2262 /* find it's spot in the orphan directory */
2264 target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, 2263 target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN,
2265 orphan_dir_inode, &target_de); 2264 orphan_dir_inode, &target_de);
2266 if (!target_de_bh) { 2265 if (!target_de_bh) {
2267 status = -ENOENT; 2266 status = -ENOENT;
2268 mlog_errno(status); 2267 mlog_errno(status);
2269 goto leave; 2268 goto leave;
2270 } 2269 }
2271 2270
2272 /* remove it from the orphan directory */ 2271 /* remove it from the orphan directory */
2273 status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de, 2272 status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de,
2274 target_de_bh); 2273 target_de_bh);
2275 if (status < 0) { 2274 if (status < 0) {
2276 mlog_errno(status); 2275 mlog_errno(status);
2277 goto leave; 2276 goto leave;
2278 } 2277 }
2279 2278
2280 status = ocfs2_journal_access(handle,orphan_dir_inode, orphan_dir_bh, 2279 status = ocfs2_journal_access(handle,orphan_dir_inode, orphan_dir_bh,
2281 OCFS2_JOURNAL_ACCESS_WRITE); 2280 OCFS2_JOURNAL_ACCESS_WRITE);
2282 if (status < 0) { 2281 if (status < 0) {
2283 mlog_errno(status); 2282 mlog_errno(status);
2284 goto leave; 2283 goto leave;
2285 } 2284 }
2286 2285
2287 /* do the i_nlink dance! :) */ 2286 /* do the i_nlink dance! :) */
2288 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 2287 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
2289 if (S_ISDIR(inode->i_mode)) 2288 if (S_ISDIR(inode->i_mode))
2290 le16_add_cpu(&orphan_fe->i_links_count, -1); 2289 le16_add_cpu(&orphan_fe->i_links_count, -1);
2291 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); 2290 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
2292 2291
2293 status = ocfs2_journal_dirty(handle, orphan_dir_bh); 2292 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2294 if (status < 0) { 2293 if (status < 0) {
2295 mlog_errno(status); 2294 mlog_errno(status);
2296 goto leave; 2295 goto leave;
2297 } 2296 }
2298 2297
2299 leave: 2298 leave:
2300 if (target_de_bh) 2299 if (target_de_bh)
2301 brelse(target_de_bh); 2300 brelse(target_de_bh);
2302 2301
2303 mlog_exit(status); 2302 mlog_exit(status);
2304 return status; 2303 return status;
2305 } 2304 }
2306 2305
2307 const struct inode_operations ocfs2_dir_iops = { 2306 const struct inode_operations ocfs2_dir_iops = {
2308 .create = ocfs2_create, 2307 .create = ocfs2_create,
2309 .lookup = ocfs2_lookup, 2308 .lookup = ocfs2_lookup,
2310 .link = ocfs2_link, 2309 .link = ocfs2_link,
2311 .unlink = ocfs2_unlink, 2310 .unlink = ocfs2_unlink,
2312 .rmdir = ocfs2_unlink, 2311 .rmdir = ocfs2_unlink,
2313 .symlink = ocfs2_symlink, 2312 .symlink = ocfs2_symlink,
2314 .mkdir = ocfs2_mkdir, 2313 .mkdir = ocfs2_mkdir,
2315 .mknod = ocfs2_mknod, 2314 .mknod = ocfs2_mknod,
2316 .rename = ocfs2_rename, 2315 .rename = ocfs2_rename,
2317 .setattr = ocfs2_setattr, 2316 .setattr = ocfs2_setattr,
2318 .getattr = ocfs2_getattr, 2317 .getattr = ocfs2_getattr,
2319 .permission = ocfs2_permission, 2318 .permission = ocfs2_permission,
2320 }; 2319 };
2321 2320