Commit d28c91740ae2cd1d963f9e4e3889789894cb6d52
Committed by
Linus Torvalds
1 parent
6db5fc5d53
Exists in
master
and in
20 other branches
[PATCH] struct path: convert ocfs2
Signed-off-by: Josef Sipek <jsipek@fsl.cs.sunysb.edu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Showing 4 changed files with 22 additions and 22 deletions Inline Diff
fs/ocfs2/aops.c
1 | /* -*- mode: c; c-basic-offset: 8; -*- | 1 | /* -*- mode: c; c-basic-offset: 8; -*- |
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | 2 | * vim: noexpandtab sw=8 ts=8 sts=0: |
3 | * | 3 | * |
4 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | 4 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or | 6 | * This program is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU General Public | 7 | * modify it under the terms of the GNU General Public |
8 | * License as published by the Free Software Foundation; either | 8 | * License as published by the Free Software Foundation; either |
9 | * version 2 of the License, or (at your option) any later version. | 9 | * version 2 of the License, or (at your option) any later version. |
10 | * | 10 | * |
11 | * This program is distributed in the hope that it will be useful, | 11 | * This program is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | * General Public License for more details. | 14 | * General Public License for more details. |
15 | * | 15 | * |
16 | * You should have received a copy of the GNU General Public | 16 | * You should have received a copy of the GNU General Public |
17 | * License along with this program; if not, write to the | 17 | * License along with this program; if not, write to the |
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
19 | * Boston, MA 021110-1307, USA. | 19 | * Boston, MA 021110-1307, USA. |
20 | */ | 20 | */ |
21 | 21 | ||
22 | #include <linux/fs.h> | 22 | #include <linux/fs.h> |
23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
24 | #include <linux/highmem.h> | 24 | #include <linux/highmem.h> |
25 | #include <linux/pagemap.h> | 25 | #include <linux/pagemap.h> |
26 | #include <asm/byteorder.h> | 26 | #include <asm/byteorder.h> |
27 | 27 | ||
28 | #define MLOG_MASK_PREFIX ML_FILE_IO | 28 | #define MLOG_MASK_PREFIX ML_FILE_IO |
29 | #include <cluster/masklog.h> | 29 | #include <cluster/masklog.h> |
30 | 30 | ||
31 | #include "ocfs2.h" | 31 | #include "ocfs2.h" |
32 | 32 | ||
33 | #include "alloc.h" | 33 | #include "alloc.h" |
34 | #include "aops.h" | 34 | #include "aops.h" |
35 | #include "dlmglue.h" | 35 | #include "dlmglue.h" |
36 | #include "extent_map.h" | 36 | #include "extent_map.h" |
37 | #include "file.h" | 37 | #include "file.h" |
38 | #include "inode.h" | 38 | #include "inode.h" |
39 | #include "journal.h" | 39 | #include "journal.h" |
40 | #include "super.h" | 40 | #include "super.h" |
41 | #include "symlink.h" | 41 | #include "symlink.h" |
42 | 42 | ||
43 | #include "buffer_head_io.h" | 43 | #include "buffer_head_io.h" |
44 | 44 | ||
45 | static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, | 45 | static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, |
46 | struct buffer_head *bh_result, int create) | 46 | struct buffer_head *bh_result, int create) |
47 | { | 47 | { |
48 | int err = -EIO; | 48 | int err = -EIO; |
49 | int status; | 49 | int status; |
50 | struct ocfs2_dinode *fe = NULL; | 50 | struct ocfs2_dinode *fe = NULL; |
51 | struct buffer_head *bh = NULL; | 51 | struct buffer_head *bh = NULL; |
52 | struct buffer_head *buffer_cache_bh = NULL; | 52 | struct buffer_head *buffer_cache_bh = NULL; |
53 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 53 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
54 | void *kaddr; | 54 | void *kaddr; |
55 | 55 | ||
56 | mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, | 56 | mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, |
57 | (unsigned long long)iblock, bh_result, create); | 57 | (unsigned long long)iblock, bh_result, create); |
58 | 58 | ||
59 | BUG_ON(ocfs2_inode_is_fast_symlink(inode)); | 59 | BUG_ON(ocfs2_inode_is_fast_symlink(inode)); |
60 | 60 | ||
61 | if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) { | 61 | if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) { |
62 | mlog(ML_ERROR, "block offset > PATH_MAX: %llu", | 62 | mlog(ML_ERROR, "block offset > PATH_MAX: %llu", |
63 | (unsigned long long)iblock); | 63 | (unsigned long long)iblock); |
64 | goto bail; | 64 | goto bail; |
65 | } | 65 | } |
66 | 66 | ||
67 | status = ocfs2_read_block(OCFS2_SB(inode->i_sb), | 67 | status = ocfs2_read_block(OCFS2_SB(inode->i_sb), |
68 | OCFS2_I(inode)->ip_blkno, | 68 | OCFS2_I(inode)->ip_blkno, |
69 | &bh, OCFS2_BH_CACHED, inode); | 69 | &bh, OCFS2_BH_CACHED, inode); |
70 | if (status < 0) { | 70 | if (status < 0) { |
71 | mlog_errno(status); | 71 | mlog_errno(status); |
72 | goto bail; | 72 | goto bail; |
73 | } | 73 | } |
74 | fe = (struct ocfs2_dinode *) bh->b_data; | 74 | fe = (struct ocfs2_dinode *) bh->b_data; |
75 | 75 | ||
76 | if (!OCFS2_IS_VALID_DINODE(fe)) { | 76 | if (!OCFS2_IS_VALID_DINODE(fe)) { |
77 | mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", | 77 | mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", |
78 | (unsigned long long)fe->i_blkno, 7, fe->i_signature); | 78 | (unsigned long long)fe->i_blkno, 7, fe->i_signature); |
79 | goto bail; | 79 | goto bail; |
80 | } | 80 | } |
81 | 81 | ||
82 | if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, | 82 | if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, |
83 | le32_to_cpu(fe->i_clusters))) { | 83 | le32_to_cpu(fe->i_clusters))) { |
84 | mlog(ML_ERROR, "block offset is outside the allocated size: " | 84 | mlog(ML_ERROR, "block offset is outside the allocated size: " |
85 | "%llu\n", (unsigned long long)iblock); | 85 | "%llu\n", (unsigned long long)iblock); |
86 | goto bail; | 86 | goto bail; |
87 | } | 87 | } |
88 | 88 | ||
89 | /* We don't use the page cache to create symlink data, so if | 89 | /* We don't use the page cache to create symlink data, so if |
90 | * need be, copy it over from the buffer cache. */ | 90 | * need be, copy it over from the buffer cache. */ |
91 | if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) { | 91 | if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) { |
92 | u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + | 92 | u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + |
93 | iblock; | 93 | iblock; |
94 | buffer_cache_bh = sb_getblk(osb->sb, blkno); | 94 | buffer_cache_bh = sb_getblk(osb->sb, blkno); |
95 | if (!buffer_cache_bh) { | 95 | if (!buffer_cache_bh) { |
96 | mlog(ML_ERROR, "couldn't getblock for symlink!\n"); | 96 | mlog(ML_ERROR, "couldn't getblock for symlink!\n"); |
97 | goto bail; | 97 | goto bail; |
98 | } | 98 | } |
99 | 99 | ||
100 | /* we haven't locked out transactions, so a commit | 100 | /* we haven't locked out transactions, so a commit |
101 | * could've happened. Since we've got a reference on | 101 | * could've happened. Since we've got a reference on |
102 | * the bh, even if it commits while we're doing the | 102 | * the bh, even if it commits while we're doing the |
103 | * copy, the data is still good. */ | 103 | * copy, the data is still good. */ |
104 | if (buffer_jbd(buffer_cache_bh) | 104 | if (buffer_jbd(buffer_cache_bh) |
105 | && ocfs2_inode_is_new(inode)) { | 105 | && ocfs2_inode_is_new(inode)) { |
106 | kaddr = kmap_atomic(bh_result->b_page, KM_USER0); | 106 | kaddr = kmap_atomic(bh_result->b_page, KM_USER0); |
107 | if (!kaddr) { | 107 | if (!kaddr) { |
108 | mlog(ML_ERROR, "couldn't kmap!\n"); | 108 | mlog(ML_ERROR, "couldn't kmap!\n"); |
109 | goto bail; | 109 | goto bail; |
110 | } | 110 | } |
111 | memcpy(kaddr + (bh_result->b_size * iblock), | 111 | memcpy(kaddr + (bh_result->b_size * iblock), |
112 | buffer_cache_bh->b_data, | 112 | buffer_cache_bh->b_data, |
113 | bh_result->b_size); | 113 | bh_result->b_size); |
114 | kunmap_atomic(kaddr, KM_USER0); | 114 | kunmap_atomic(kaddr, KM_USER0); |
115 | set_buffer_uptodate(bh_result); | 115 | set_buffer_uptodate(bh_result); |
116 | } | 116 | } |
117 | brelse(buffer_cache_bh); | 117 | brelse(buffer_cache_bh); |
118 | } | 118 | } |
119 | 119 | ||
120 | map_bh(bh_result, inode->i_sb, | 120 | map_bh(bh_result, inode->i_sb, |
121 | le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock); | 121 | le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock); |
122 | 122 | ||
123 | err = 0; | 123 | err = 0; |
124 | 124 | ||
125 | bail: | 125 | bail: |
126 | if (bh) | 126 | if (bh) |
127 | brelse(bh); | 127 | brelse(bh); |
128 | 128 | ||
129 | mlog_exit(err); | 129 | mlog_exit(err); |
130 | return err; | 130 | return err; |
131 | } | 131 | } |
132 | 132 | ||
/*
 * Map logical file block @iblock of @inode to a physical block for
 * buffered I/O, filling @bh_result.
 *
 * Symlinks are handed off to ocfs2_symlink_get_block().  For regular
 * files the lookup goes through the extent map; a block inside the
 * locally cached ip_clusters count must exist (ocfs2 has no holes).
 *
 * Returns 0 on success.  Every failure is flattened to -EIO at the
 * "bail" label, regardless of the underlying error code.
 */
static int ocfs2_get_block(struct inode *inode, sector_t iblock,
			   struct buffer_head *bh_result, int create)
{
	int err = 0;
	u64 p_blkno, past_eof;

	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
		   (unsigned long long)iblock, bh_result, create);

	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
		     inode, inode->i_ino);

	if (S_ISLNK(inode->i_mode)) {
		/* this always does I/O for some reason. */
		err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
		goto bail;
	}

	/* this can happen if another node truncs after our extend! */
	spin_lock(&OCFS2_I(inode)->ip_lock);
	if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
					       OCFS2_I(inode)->ip_clusters))
		err = -EIO;
	spin_unlock(&OCFS2_I(inode)->ip_lock);
	if (err)
		goto bail;

	err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
					  NULL);
	if (err) {
		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
		     "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
		     (unsigned long long)p_blkno);
		goto bail;
	}

	map_bh(bh_result, inode->i_sb, p_blkno);

	/* A mapping to block 0 is never valid file data; flag it as an
	 * I/O error (the buffer stays mapped, but err forces -EIO). */
	if (bh_result->b_blocknr == 0) {
		err = -EIO;
		mlog(ML_ERROR, "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
		     (unsigned long long)iblock,
		     (unsigned long long)p_blkno,
		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
	}

	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
	mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
	     (unsigned long long)past_eof);

	/* Extending writes get a "new" buffer so generic code zeroes
	 * the unwritten tail properly. */
	if (create && (iblock >= past_eof))
		set_buffer_new(bh_result);

bail:
	if (err < 0)
		err = -EIO;

	mlog_exit(err);
	return err;
}
194 | 194 | ||
/*
 * ->readpage() for ocfs2.
 *
 * Lock order: cluster meta lock -> ip_alloc_sem -> cluster data lock,
 * all shared, around block_read_full_page().
 *
 * The *_lock_with_page() helpers can return AOP_TRUNCATED_PAGE, in
 * which case the page has already been unlocked for us — the "unlock"
 * flag tracks whether we still own the page lock on exit.  Pages that
 * start at or past i_size are zero-filled here instead of going to
 * block_read_full_page(); see the comment in the body.
 */
static int ocfs2_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
	int ret, unlock = 1;

	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));

	ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
	if (ret != 0) {
		if (ret == AOP_TRUNCATED_PAGE)
			unlock = 0;
		mlog_errno(ret);
		goto out;
	}

	down_read(&OCFS2_I(inode)->ip_alloc_sem);

	/*
	 * i_size might have just been updated as we grabed the meta lock. We
	 * might now be discovering a truncate that hit on another node.
	 * block_read_full_page->get_block freaks out if it is asked to read
	 * beyond the end of a file, so we check here. Callers
	 * (generic_file_read, fault->nopage) are clever enough to check i_size
	 * and notice that the page they just read isn't needed.
	 *
	 * XXX sys_readahead() seems to get that wrong?
	 */
	if (start >= i_size_read(inode)) {
		char *addr = kmap(page);
		memset(addr, 0, PAGE_SIZE);
		flush_dcache_page(page);
		kunmap(page);
		SetPageUptodate(page);
		ret = 0;
		goto out_alloc;
	}

	ret = ocfs2_data_lock_with_page(inode, 0, page);
	if (ret != 0) {
		if (ret == AOP_TRUNCATED_PAGE)
			unlock = 0;
		mlog_errno(ret);
		goto out_alloc;
	}

	/* block_read_full_page() takes over the page lock from here. */
	ret = block_read_full_page(page, ocfs2_get_block);
	unlock = 0;

	ocfs2_data_unlock(inode, 0);
out_alloc:
	up_read(&OCFS2_I(inode)->ip_alloc_sem);
	ocfs2_meta_unlock(inode, 0);
out:
	if (unlock)
		unlock_page(page);
	mlog_exit(ret);
	return ret;
}
254 | 254 | ||
255 | /* Note: Because we don't support holes, our allocation has | 255 | /* Note: Because we don't support holes, our allocation has |
256 | * already happened (allocation writes zeros to the file data) | 256 | * already happened (allocation writes zeros to the file data) |
257 | * so we don't have to worry about ordered writes in | 257 | * so we don't have to worry about ordered writes in |
258 | * ocfs2_writepage. | 258 | * ocfs2_writepage. |
259 | * | 259 | * |
260 | * ->writepage is called during the process of invalidating the page cache | 260 | * ->writepage is called during the process of invalidating the page cache |
261 | * during blocked lock processing. It can't block on any cluster locks | 261 | * during blocked lock processing. It can't block on any cluster locks |
262 | * to during block mapping. It's relying on the fact that the block | 262 | * to during block mapping. It's relying on the fact that the block |
263 | * mapping can't have disappeared under the dirty pages that it is | 263 | * mapping can't have disappeared under the dirty pages that it is |
264 | * being asked to write back. | 264 | * being asked to write back. |
265 | */ | 265 | */ |
266 | static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) | 266 | static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) |
267 | { | 267 | { |
268 | int ret; | 268 | int ret; |
269 | 269 | ||
270 | mlog_entry("(0x%p)\n", page); | 270 | mlog_entry("(0x%p)\n", page); |
271 | 271 | ||
272 | ret = block_write_full_page(page, ocfs2_get_block, wbc); | 272 | ret = block_write_full_page(page, ocfs2_get_block, wbc); |
273 | 273 | ||
274 | mlog_exit(ret); | 274 | mlog_exit(ret); |
275 | 275 | ||
276 | return ret; | 276 | return ret; |
277 | } | 277 | } |
278 | 278 | ||
279 | /* This can also be called from ocfs2_write_zero_page() which has done | 279 | /* This can also be called from ocfs2_write_zero_page() which has done |
280 | * it's own cluster locking. */ | 280 | * it's own cluster locking. */ |
281 | int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, | 281 | int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, |
282 | unsigned from, unsigned to) | 282 | unsigned from, unsigned to) |
283 | { | 283 | { |
284 | int ret; | 284 | int ret; |
285 | 285 | ||
286 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | 286 | down_read(&OCFS2_I(inode)->ip_alloc_sem); |
287 | 287 | ||
288 | ret = block_prepare_write(page, from, to, ocfs2_get_block); | 288 | ret = block_prepare_write(page, from, to, ocfs2_get_block); |
289 | 289 | ||
290 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | 290 | up_read(&OCFS2_I(inode)->ip_alloc_sem); |
291 | 291 | ||
292 | return ret; | 292 | return ret; |
293 | } | 293 | } |
294 | 294 | ||
295 | /* | 295 | /* |
296 | * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called | 296 | * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called |
297 | * from loopback. It must be able to perform its own locking around | 297 | * from loopback. It must be able to perform its own locking around |
298 | * ocfs2_get_block(). | 298 | * ocfs2_get_block(). |
299 | */ | 299 | */ |
300 | static int ocfs2_prepare_write(struct file *file, struct page *page, | 300 | static int ocfs2_prepare_write(struct file *file, struct page *page, |
301 | unsigned from, unsigned to) | 301 | unsigned from, unsigned to) |
302 | { | 302 | { |
303 | struct inode *inode = page->mapping->host; | 303 | struct inode *inode = page->mapping->host; |
304 | int ret; | 304 | int ret; |
305 | 305 | ||
306 | mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); | 306 | mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); |
307 | 307 | ||
308 | ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page); | 308 | ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page); |
309 | if (ret != 0) { | 309 | if (ret != 0) { |
310 | mlog_errno(ret); | 310 | mlog_errno(ret); |
311 | goto out; | 311 | goto out; |
312 | } | 312 | } |
313 | 313 | ||
314 | ret = ocfs2_prepare_write_nolock(inode, page, from, to); | 314 | ret = ocfs2_prepare_write_nolock(inode, page, from, to); |
315 | 315 | ||
316 | ocfs2_meta_unlock(inode, 0); | 316 | ocfs2_meta_unlock(inode, 0); |
317 | out: | 317 | out: |
318 | mlog_exit(ret); | 318 | mlog_exit(ret); |
319 | return ret; | 319 | return ret; |
320 | } | 320 | } |
321 | 321 | ||
322 | /* Taken from ext3. We don't necessarily need the full blown | 322 | /* Taken from ext3. We don't necessarily need the full blown |
323 | * functionality yet, but IMHO it's better to cut and paste the whole | 323 | * functionality yet, but IMHO it's better to cut and paste the whole |
324 | * thing so we can avoid introducing our own bugs (and easily pick up | 324 | * thing so we can avoid introducing our own bugs (and easily pick up |
325 | * their fixes when they happen) --Mark */ | 325 | * their fixes when they happen) --Mark */ |
326 | static int walk_page_buffers( handle_t *handle, | 326 | static int walk_page_buffers( handle_t *handle, |
327 | struct buffer_head *head, | 327 | struct buffer_head *head, |
328 | unsigned from, | 328 | unsigned from, |
329 | unsigned to, | 329 | unsigned to, |
330 | int *partial, | 330 | int *partial, |
331 | int (*fn)( handle_t *handle, | 331 | int (*fn)( handle_t *handle, |
332 | struct buffer_head *bh)) | 332 | struct buffer_head *bh)) |
333 | { | 333 | { |
334 | struct buffer_head *bh; | 334 | struct buffer_head *bh; |
335 | unsigned block_start, block_end; | 335 | unsigned block_start, block_end; |
336 | unsigned blocksize = head->b_size; | 336 | unsigned blocksize = head->b_size; |
337 | int err, ret = 0; | 337 | int err, ret = 0; |
338 | struct buffer_head *next; | 338 | struct buffer_head *next; |
339 | 339 | ||
340 | for ( bh = head, block_start = 0; | 340 | for ( bh = head, block_start = 0; |
341 | ret == 0 && (bh != head || !block_start); | 341 | ret == 0 && (bh != head || !block_start); |
342 | block_start = block_end, bh = next) | 342 | block_start = block_end, bh = next) |
343 | { | 343 | { |
344 | next = bh->b_this_page; | 344 | next = bh->b_this_page; |
345 | block_end = block_start + blocksize; | 345 | block_end = block_start + blocksize; |
346 | if (block_end <= from || block_start >= to) { | 346 | if (block_end <= from || block_start >= to) { |
347 | if (partial && !buffer_uptodate(bh)) | 347 | if (partial && !buffer_uptodate(bh)) |
348 | *partial = 1; | 348 | *partial = 1; |
349 | continue; | 349 | continue; |
350 | } | 350 | } |
351 | err = (*fn)(handle, bh); | 351 | err = (*fn)(handle, bh); |
352 | if (!ret) | 352 | if (!ret) |
353 | ret = err; | 353 | ret = err; |
354 | } | 354 | } |
355 | return ret; | 355 | return ret; |
356 | } | 356 | } |
357 | 357 | ||
/*
 * Start a journal transaction for an inode update driven by a page
 * write.  In ordered-data mode, the page's buffers in [@from, @to) are
 * additionally dirtied into the journal via walk_page_buffers() so the
 * data reaches disk before the transaction commits.
 *
 * Returns the running handle on success.  On failure the handle (if it
 * was started) is committed and an ERR_PTR()-encoded error is returned
 * instead.  Note ocfs2_start_trans() signals failure with NULL, not
 * ERR_PTR — hence the explicit !handle check and -ENOMEM mapping.
 */
handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
				      struct page *page,
				      unsigned from,
				      unsigned to)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	handle_t *handle = NULL;
	int ret = 0;

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (!handle) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	if (ocfs2_should_order_data(inode)) {
		ret = walk_page_buffers(handle,
					page_buffers(page),
					from, to, NULL,
					ocfs2_journal_dirty_data);
		if (ret < 0)
			mlog_errno(ret);
	}
out:
	if (ret) {
		if (handle)
			ocfs2_commit_trans(osb, handle);
		handle = ERR_PTR(ret);
	}
	return handle;
}
390 | 390 | ||
391 | static int ocfs2_commit_write(struct file *file, struct page *page, | 391 | static int ocfs2_commit_write(struct file *file, struct page *page, |
392 | unsigned from, unsigned to) | 392 | unsigned from, unsigned to) |
393 | { | 393 | { |
394 | int ret; | 394 | int ret; |
395 | struct buffer_head *di_bh = NULL; | 395 | struct buffer_head *di_bh = NULL; |
396 | struct inode *inode = page->mapping->host; | 396 | struct inode *inode = page->mapping->host; |
397 | handle_t *handle = NULL; | 397 | handle_t *handle = NULL; |
398 | struct ocfs2_dinode *di; | 398 | struct ocfs2_dinode *di; |
399 | 399 | ||
400 | mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); | 400 | mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); |
401 | 401 | ||
402 | /* NOTE: ocfs2_file_aio_write has ensured that it's safe for | 402 | /* NOTE: ocfs2_file_aio_write has ensured that it's safe for |
403 | * us to continue here without rechecking the I/O against | 403 | * us to continue here without rechecking the I/O against |
404 | * changed inode values. | 404 | * changed inode values. |
405 | * | 405 | * |
406 | * 1) We're currently holding the inode alloc lock, so no | 406 | * 1) We're currently holding the inode alloc lock, so no |
407 | * nodes can change it underneath us. | 407 | * nodes can change it underneath us. |
408 | * | 408 | * |
409 | * 2) We've had to take the metadata lock at least once | 409 | * 2) We've had to take the metadata lock at least once |
410 | * already to check for extending writes, suid removal, etc. | 410 | * already to check for extending writes, suid removal, etc. |
411 | * The meta data update code then ensures that we don't get a | 411 | * The meta data update code then ensures that we don't get a |
412 | * stale inode allocation image (i_size, i_clusters, etc). | 412 | * stale inode allocation image (i_size, i_clusters, etc). |
413 | */ | 413 | */ |
414 | 414 | ||
415 | ret = ocfs2_meta_lock_with_page(inode, &di_bh, 1, page); | 415 | ret = ocfs2_meta_lock_with_page(inode, &di_bh, 1, page); |
416 | if (ret != 0) { | 416 | if (ret != 0) { |
417 | mlog_errno(ret); | 417 | mlog_errno(ret); |
418 | goto out; | 418 | goto out; |
419 | } | 419 | } |
420 | 420 | ||
421 | ret = ocfs2_data_lock_with_page(inode, 1, page); | 421 | ret = ocfs2_data_lock_with_page(inode, 1, page); |
422 | if (ret != 0) { | 422 | if (ret != 0) { |
423 | mlog_errno(ret); | 423 | mlog_errno(ret); |
424 | goto out_unlock_meta; | 424 | goto out_unlock_meta; |
425 | } | 425 | } |
426 | 426 | ||
427 | handle = ocfs2_start_walk_page_trans(inode, page, from, to); | 427 | handle = ocfs2_start_walk_page_trans(inode, page, from, to); |
428 | if (IS_ERR(handle)) { | 428 | if (IS_ERR(handle)) { |
429 | ret = PTR_ERR(handle); | 429 | ret = PTR_ERR(handle); |
430 | goto out_unlock_data; | 430 | goto out_unlock_data; |
431 | } | 431 | } |
432 | 432 | ||
433 | /* Mark our buffer early. We'd rather catch this error up here | 433 | /* Mark our buffer early. We'd rather catch this error up here |
434 | * as opposed to after a successful commit_write which would | 434 | * as opposed to after a successful commit_write which would |
435 | * require us to set back inode->i_size. */ | 435 | * require us to set back inode->i_size. */ |
436 | ret = ocfs2_journal_access(handle, inode, di_bh, | 436 | ret = ocfs2_journal_access(handle, inode, di_bh, |
437 | OCFS2_JOURNAL_ACCESS_WRITE); | 437 | OCFS2_JOURNAL_ACCESS_WRITE); |
438 | if (ret < 0) { | 438 | if (ret < 0) { |
439 | mlog_errno(ret); | 439 | mlog_errno(ret); |
440 | goto out_commit; | 440 | goto out_commit; |
441 | } | 441 | } |
442 | 442 | ||
443 | /* might update i_size */ | 443 | /* might update i_size */ |
444 | ret = generic_commit_write(file, page, from, to); | 444 | ret = generic_commit_write(file, page, from, to); |
445 | if (ret < 0) { | 445 | if (ret < 0) { |
446 | mlog_errno(ret); | 446 | mlog_errno(ret); |
447 | goto out_commit; | 447 | goto out_commit; |
448 | } | 448 | } |
449 | 449 | ||
450 | di = (struct ocfs2_dinode *)di_bh->b_data; | 450 | di = (struct ocfs2_dinode *)di_bh->b_data; |
451 | 451 | ||
452 | /* ocfs2_mark_inode_dirty() is too heavy to use here. */ | 452 | /* ocfs2_mark_inode_dirty() is too heavy to use here. */ |
453 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 453 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
454 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); | 454 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); |
455 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | 455 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); |
456 | 456 | ||
457 | inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode))); | 457 | inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode))); |
458 | di->i_size = cpu_to_le64((u64)i_size_read(inode)); | 458 | di->i_size = cpu_to_le64((u64)i_size_read(inode)); |
459 | 459 | ||
460 | ret = ocfs2_journal_dirty(handle, di_bh); | 460 | ret = ocfs2_journal_dirty(handle, di_bh); |
461 | if (ret < 0) { | 461 | if (ret < 0) { |
462 | mlog_errno(ret); | 462 | mlog_errno(ret); |
463 | goto out_commit; | 463 | goto out_commit; |
464 | } | 464 | } |
465 | 465 | ||
466 | out_commit: | 466 | out_commit: |
467 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); | 467 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); |
468 | out_unlock_data: | 468 | out_unlock_data: |
469 | ocfs2_data_unlock(inode, 1); | 469 | ocfs2_data_unlock(inode, 1); |
470 | out_unlock_meta: | 470 | out_unlock_meta: |
471 | ocfs2_meta_unlock(inode, 1); | 471 | ocfs2_meta_unlock(inode, 1); |
472 | out: | 472 | out: |
473 | if (di_bh) | 473 | if (di_bh) |
474 | brelse(di_bh); | 474 | brelse(di_bh); |
475 | 475 | ||
476 | mlog_exit(ret); | 476 | mlog_exit(ret); |
477 | return ret; | 477 | return ret; |
478 | } | 478 | } |
479 | 479 | ||
/*
 * ->bmap() for ocfs2: translate logical @block into a physical block
 * number (FIBMAP-style callers).
 *
 * Journal system files are node-local, so they are mapped without any
 * cluster locking; everything else takes a shared meta lock plus
 * ip_alloc_sem around the extent-map lookup.
 *
 * Returns the physical block number, or 0 on any error — the bmap
 * interface has no way to report an error code.
 */
static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
{
	sector_t status;
	u64 p_blkno = 0;
	int err = 0;
	struct inode *inode = mapping->host;

	mlog_entry("(block = %llu)\n", (unsigned long long)block);

	/* We don't need to lock journal system files, since they aren't
	 * accessed concurrently from multiple nodes.
	 */
	if (!INODE_JOURNAL(inode)) {
		err = ocfs2_meta_lock(inode, NULL, 0);
		if (err) {
			/* -ENOENT just means the inode is being wiped;
			 * not worth logging. */
			if (err != -ENOENT)
				mlog_errno(err);
			goto bail;
		}
		down_read(&OCFS2_I(inode)->ip_alloc_sem);
	}

	err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
					  NULL);

	/* Unlock mirrors the conditional locking above. */
	if (!INODE_JOURNAL(inode)) {
		up_read(&OCFS2_I(inode)->ip_alloc_sem);
		ocfs2_meta_unlock(inode, 0);
	}

	if (err) {
		mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
		     (unsigned long long)block);
		mlog_errno(err);
		goto bail;
	}


bail:
	status = err ? 0 : p_blkno;

	mlog_exit((int)status);

	return status;
}
525 | 525 | ||
526 | /* | 526 | /* |
527 | * TODO: Make this into a generic get_blocks function. | 527 | * TODO: Make this into a generic get_blocks function. |
528 | * | 528 | * |
529 | * From do_direct_io in direct-io.c: | 529 | * From do_direct_io in direct-io.c: |
530 | * "So what we do is to permit the ->get_blocks function to populate | 530 | * "So what we do is to permit the ->get_blocks function to populate |
531 | * bh.b_size with the size of IO which is permitted at this offset and | 531 | * bh.b_size with the size of IO which is permitted at this offset and |
532 | * this i_blkbits." | 532 | * this i_blkbits." |
533 | * | 533 | * |
534 | * This function is called directly from get_more_blocks in direct-io.c. | 534 | * This function is called directly from get_more_blocks in direct-io.c. |
535 | * | 535 | * |
536 | * called like this: dio->get_blocks(dio->inode, fs_startblk, | 536 | * called like this: dio->get_blocks(dio->inode, fs_startblk, |
537 | * fs_count, map_bh, dio->rw == WRITE); | 537 | * fs_count, map_bh, dio->rw == WRITE); |
538 | */ | 538 | */ |
/*
 * ocfs2_direct_IO_get_blocks() - get_block_t callback handed to the
 * direct-io core; maps up to bh_result->b_size worth of blocks at
 * @iblock and shrinks b_size to the contiguous extent actually mapped.
 *
 * Returns 0 on success, -EIO if the request reaches past the inode's
 * allocated clusters or the extent lookup fails.  @create is accepted
 * for the get_block_t signature but never read: O_DIRECT here never
 * allocates.
 */
static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
				      struct buffer_head *bh_result, int create)
{
	int ret;
	u64 vbo_max; /* file offset, max_blocks from iblock */
	u64 p_blkno;
	int contig_blocks;
	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;

	/* This function won't even be called if the request isn't all
	 * nicely aligned and of the right size, so there's no need
	 * for us to check any of that. */

	/* NOTE(review): vbo_max is computed but never read below —
	 * candidate for removal. */
	vbo_max = ((u64)iblock + max_blocks) << blocksize_bits;

	/* Refuse to map past the allocated clusters; ip_clusters is
	 * only stable under ip_lock. */
	spin_lock(&OCFS2_I(inode)->ip_lock);
	if ((iblock + max_blocks) >
	    ocfs2_clusters_to_blocks(inode->i_sb,
				     OCFS2_I(inode)->ip_clusters)) {
		spin_unlock(&OCFS2_I(inode)->ip_lock);
		ret = -EIO;
		goto bail;
	}
	spin_unlock(&OCFS2_I(inode)->ip_lock);

	/* This figures out the size of the next contiguous block, and
	 * our logical offset */
	ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
					  &contig_blocks);
	if (ret) {
		mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
		     (unsigned long long)iblock);
		/* Normalize any lookup error to -EIO for the dio core. */
		ret = -EIO;
		goto bail;
	}

	map_bh(bh_result, inode->i_sb, p_blkno);

	/* make sure we don't map more than max_blocks blocks here as
	   that's all the kernel will handle at this point. */
	if (max_blocks < contig_blocks)
		contig_blocks = max_blocks;
	bh_result->b_size = contig_blocks << blocksize_bits;
bail:
	return ret;
}
586 | 586 | ||
587 | /* | 587 | /* |
588 | * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're | 588 | * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're |
589 | * particularly interested in the aio/dio case. Like the core uses | 589 | * particularly interested in the aio/dio case. Like the core uses |
590 | * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from | 590 | * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from |
591 | * truncation on another. | 591 | * truncation on another. |
592 | */ | 592 | */ |
593 | static void ocfs2_dio_end_io(struct kiocb *iocb, | 593 | static void ocfs2_dio_end_io(struct kiocb *iocb, |
594 | loff_t offset, | 594 | loff_t offset, |
595 | ssize_t bytes, | 595 | ssize_t bytes, |
596 | void *private) | 596 | void *private) |
597 | { | 597 | { |
598 | struct inode *inode = iocb->ki_filp->f_dentry->d_inode; | 598 | struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; |
599 | 599 | ||
600 | /* this io's submitter should not have unlocked this before we could */ | 600 | /* this io's submitter should not have unlocked this before we could */ |
601 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); | 601 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); |
602 | ocfs2_iocb_clear_rw_locked(iocb); | 602 | ocfs2_iocb_clear_rw_locked(iocb); |
603 | up_read(&inode->i_alloc_sem); | 603 | up_read(&inode->i_alloc_sem); |
604 | ocfs2_rw_unlock(inode, 0); | 604 | ocfs2_rw_unlock(inode, 0); |
605 | } | 605 | } |
606 | 606 | ||
607 | static ssize_t ocfs2_direct_IO(int rw, | 607 | static ssize_t ocfs2_direct_IO(int rw, |
608 | struct kiocb *iocb, | 608 | struct kiocb *iocb, |
609 | const struct iovec *iov, | 609 | const struct iovec *iov, |
610 | loff_t offset, | 610 | loff_t offset, |
611 | unsigned long nr_segs) | 611 | unsigned long nr_segs) |
612 | { | 612 | { |
613 | struct file *file = iocb->ki_filp; | 613 | struct file *file = iocb->ki_filp; |
614 | struct inode *inode = file->f_dentry->d_inode->i_mapping->host; | 614 | struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; |
615 | int ret; | 615 | int ret; |
616 | 616 | ||
617 | mlog_entry_void(); | 617 | mlog_entry_void(); |
618 | 618 | ||
619 | /* | 619 | /* |
620 | * We get PR data locks even for O_DIRECT. This allows | 620 | * We get PR data locks even for O_DIRECT. This allows |
621 | * concurrent O_DIRECT I/O but doesn't let O_DIRECT with | 621 | * concurrent O_DIRECT I/O but doesn't let O_DIRECT with |
622 | * extending and buffered zeroing writes race. If they did | 622 | * extending and buffered zeroing writes race. If they did |
623 | * race then the buffered zeroing could be written back after | 623 | * race then the buffered zeroing could be written back after |
624 | * the O_DIRECT I/O. It's one thing to tell people not to mix | 624 | * the O_DIRECT I/O. It's one thing to tell people not to mix |
625 | * buffered and O_DIRECT writes, but expecting them to | 625 | * buffered and O_DIRECT writes, but expecting them to |
626 | * understand that file extension is also an implicit buffered | 626 | * understand that file extension is also an implicit buffered |
627 | * write is too much. By getting the PR we force writeback of | 627 | * write is too much. By getting the PR we force writeback of |
628 | * the buffered zeroing before proceeding. | 628 | * the buffered zeroing before proceeding. |
629 | */ | 629 | */ |
630 | ret = ocfs2_data_lock(inode, 0); | 630 | ret = ocfs2_data_lock(inode, 0); |
631 | if (ret < 0) { | 631 | if (ret < 0) { |
632 | mlog_errno(ret); | 632 | mlog_errno(ret); |
633 | goto out; | 633 | goto out; |
634 | } | 634 | } |
635 | ocfs2_data_unlock(inode, 0); | 635 | ocfs2_data_unlock(inode, 0); |
636 | 636 | ||
637 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, | 637 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, |
638 | inode->i_sb->s_bdev, iov, offset, | 638 | inode->i_sb->s_bdev, iov, offset, |
639 | nr_segs, | 639 | nr_segs, |
640 | ocfs2_direct_IO_get_blocks, | 640 | ocfs2_direct_IO_get_blocks, |
641 | ocfs2_dio_end_io); | 641 | ocfs2_dio_end_io); |
642 | out: | 642 | out: |
643 | mlog_exit(ret); | 643 | mlog_exit(ret); |
644 | return ret; | 644 | return ret; |
645 | } | 645 | } |
646 | 646 | ||
647 | const struct address_space_operations ocfs2_aops = { | 647 | const struct address_space_operations ocfs2_aops = { |
648 | .readpage = ocfs2_readpage, | 648 | .readpage = ocfs2_readpage, |
649 | .writepage = ocfs2_writepage, | 649 | .writepage = ocfs2_writepage, |
650 | .prepare_write = ocfs2_prepare_write, | 650 | .prepare_write = ocfs2_prepare_write, |
651 | .commit_write = ocfs2_commit_write, | 651 | .commit_write = ocfs2_commit_write, |
652 | .bmap = ocfs2_bmap, | 652 | .bmap = ocfs2_bmap, |
653 | .sync_page = block_sync_page, | 653 | .sync_page = block_sync_page, |
654 | .direct_IO = ocfs2_direct_IO | 654 | .direct_IO = ocfs2_direct_IO |
655 | }; | 655 | }; |
656 | 656 |
fs/ocfs2/dir.c
1 | /* -*- mode: c; c-basic-offset: 8; -*- | 1 | /* -*- mode: c; c-basic-offset: 8; -*- |
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | 2 | * vim: noexpandtab sw=8 ts=8 sts=0: |
3 | * | 3 | * |
4 | * dir.c | 4 | * dir.c |
5 | * | 5 | * |
6 | * Creates, reads, walks and deletes directory-nodes | 6 | * Creates, reads, walks and deletes directory-nodes |
7 | * | 7 | * |
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | 8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. |
9 | * | 9 | * |
10 | * Portions of this code from linux/fs/ext3/dir.c | 10 | * Portions of this code from linux/fs/ext3/dir.c |
11 | * | 11 | * |
12 | * Copyright (C) 1992, 1993, 1994, 1995 | 12 | * Copyright (C) 1992, 1993, 1994, 1995 |
13 | * Remy Card (card@masi.ibp.fr) | 13 | * Remy Card (card@masi.ibp.fr) |
14 | * Laboratoire MASI - Institut Blaise pascal | 14 | * Laboratoire MASI - Institut Blaise pascal |
15 | * Universite Pierre et Marie Curie (Paris VI) | 15 | * Universite Pierre et Marie Curie (Paris VI) |
16 | * | 16 | * |
17 | * from | 17 | * from |
18 | * | 18 | * |
19 | * linux/fs/minix/dir.c | 19 | * linux/fs/minix/dir.c |
20 | * | 20 | * |
21 | * Copyright (C) 1991, 1992 Linux Torvalds | 21 | * Copyright (C) 1991, 1992 Linux Torvalds |
22 | * | 22 | * |
23 | * This program is free software; you can redistribute it and/or | 23 | * This program is free software; you can redistribute it and/or |
24 | * modify it under the terms of the GNU General Public | 24 | * modify it under the terms of the GNU General Public |
25 | * License as published by the Free Software Foundation; either | 25 | * License as published by the Free Software Foundation; either |
26 | * version 2 of the License, or (at your option) any later version. | 26 | * version 2 of the License, or (at your option) any later version. |
27 | * | 27 | * |
28 | * This program is distributed in the hope that it will be useful, | 28 | * This program is distributed in the hope that it will be useful, |
29 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 29 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
30 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 30 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
31 | * General Public License for more details. | 31 | * General Public License for more details. |
32 | * | 32 | * |
33 | * You should have received a copy of the GNU General Public | 33 | * You should have received a copy of the GNU General Public |
34 | * License along with this program; if not, write to the | 34 | * License along with this program; if not, write to the |
35 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | 35 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
36 | * Boston, MA 021110-1307, USA. | 36 | * Boston, MA 021110-1307, USA. |
37 | */ | 37 | */ |
38 | 38 | ||
39 | #include <linux/fs.h> | 39 | #include <linux/fs.h> |
40 | #include <linux/types.h> | 40 | #include <linux/types.h> |
41 | #include <linux/slab.h> | 41 | #include <linux/slab.h> |
42 | #include <linux/highmem.h> | 42 | #include <linux/highmem.h> |
43 | 43 | ||
44 | #define MLOG_MASK_PREFIX ML_NAMEI | 44 | #define MLOG_MASK_PREFIX ML_NAMEI |
45 | #include <cluster/masklog.h> | 45 | #include <cluster/masklog.h> |
46 | 46 | ||
47 | #include "ocfs2.h" | 47 | #include "ocfs2.h" |
48 | 48 | ||
49 | #include "alloc.h" | 49 | #include "alloc.h" |
50 | #include "dir.h" | 50 | #include "dir.h" |
51 | #include "dlmglue.h" | 51 | #include "dlmglue.h" |
52 | #include "extent_map.h" | 52 | #include "extent_map.h" |
53 | #include "file.h" | 53 | #include "file.h" |
54 | #include "inode.h" | 54 | #include "inode.h" |
55 | #include "journal.h" | 55 | #include "journal.h" |
56 | #include "namei.h" | 56 | #include "namei.h" |
57 | #include "suballoc.h" | 57 | #include "suballoc.h" |
58 | #include "uptodate.h" | 58 | #include "uptodate.h" |
59 | 59 | ||
60 | #include "buffer_head_io.h" | 60 | #include "buffer_head_io.h" |
61 | 61 | ||
/* Maps on-disk OCFS2_FT_* file-type codes (indexed by de->file_type)
 * to the VFS DT_* values reported through filldir. */
static unsigned char ocfs2_filetype_table[] = {
	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};
65 | 65 | ||
/* Forward declaration: grows @dir and returns the first new block's
 * buffer head via @new_de_bh.  Being static, the definition lives
 * later in this file. */
static int ocfs2_extend_dir(struct ocfs2_super *osb,
			    struct inode *dir,
			    struct buffer_head *parent_fe_bh,
			    struct buffer_head **new_de_bh);
70 | /* | 70 | /* |
71 | * ocfs2_readdir() | 71 | * ocfs2_readdir() |
72 | * | 72 | * |
73 | */ | 73 | */ |
/*
 * ocfs2_readdir()
 *
 * Walk the directory blocks from filp->f_pos, feeding each live dirent
 * to @filldir.  Takes the cluster meta lock (downgrading EX->PR after
 * an atime update) and uses the f_version/i_version stamp to detect
 * concurrent modification, revalidating offset within the block when
 * it changes.  Returns 0 on a completed pass, or a negative error if
 * the initial locking failed before anything was emitted.
 */
int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
{
	int error = 0;
	unsigned long offset, blk, last_ra_blk = 0;
	int i, stored;
	struct buffer_head * bh, * tmp;
	struct ocfs2_dir_entry * de;
	int err;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct super_block * sb = inode->i_sb;
	unsigned int ra_sectors = 16;
	int lock_level = 0;

	mlog_entry("dirino=%llu\n",
		   (unsigned long long)OCFS2_I(inode)->ip_blkno);

	stored = 0;
	bh = NULL;

	error = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level);
	if (lock_level && error >= 0) {
		/* We release EX lock which used to update atime
		 * and get PR lock again to reduce contention
		 * on commonly accessed directories. */
		ocfs2_meta_unlock(inode, 1);
		lock_level = 0;
		error = ocfs2_meta_lock(inode, NULL, 0);
	}
	if (error < 0) {
		if (error != -ENOENT)
			mlog_errno(error);
		/* we haven't got any yet, so propagate the error. */
		stored = error;
		goto bail_nolock;
	}

	/* Byte offset of f_pos within its directory block. */
	offset = filp->f_pos & (sb->s_blocksize - 1);

	while (!error && !stored && filp->f_pos < i_size_read(inode)) {
		blk = (filp->f_pos) >> sb->s_blocksize_bits;
		bh = ocfs2_bread(inode, blk, &err, 0);
		if (!bh) {
			/* Hole in a directory: log it and skip to the
			 * next block rather than aborting. */
			mlog(ML_ERROR,
			     "directory #%llu contains a hole at offset %lld\n",
			     (unsigned long long)OCFS2_I(inode)->ip_blkno,
			     filp->f_pos);
			filp->f_pos += sb->s_blocksize - offset;
			continue;
		}

		/* The idea here is to begin with 8k read-ahead and to stay
		 * 4k ahead of our current position.
		 *
		 * TODO: Use the pagecache for this. We just need to
		 * make sure it's cluster-safe... */
		if (!last_ra_blk
		    || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) {
			for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
			     i > 0; i--) {
				/* Async read-ahead; the bh itself is
				 * dropped immediately. */
				tmp = ocfs2_bread(inode, ++blk, &err, 1);
				if (tmp)
					brelse(tmp);
			}
			last_ra_blk = blk;
			ra_sectors = 8;
		}

revalidate:
		/* If the dir block has changed since the last call to
		 * readdir(2), then we might be pointing to an invalid
		 * dirent right now. Scan from the start of the block
		 * to make sure. */
		if (filp->f_version != inode->i_version) {
			for (i = 0; i < sb->s_blocksize && i < offset; ) {
				de = (struct ocfs2_dir_entry *) (bh->b_data + i);
				/* It's too expensive to do a full
				 * dirent test each time round this
				 * loop, but we do have to test at
				 * least that it is non-zero. A
				 * failure will be detected in the
				 * dirent test below. */
				if (le16_to_cpu(de->rec_len) <
				    OCFS2_DIR_REC_LEN(1))
					break;
				i += le16_to_cpu(de->rec_len);
			}
			offset = i;
			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
				| offset;
			filp->f_version = inode->i_version;
		}

		while (!error && filp->f_pos < i_size_read(inode)
		       && offset < sb->s_blocksize) {
			de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
			if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
				/* On error, skip the f_pos to the
				   next block. */
				filp->f_pos = (filp->f_pos |
					       (sb->s_blocksize - 1)) + 1;
				brelse(bh);
				goto bail;
			}
			offset += le16_to_cpu(de->rec_len);
			if (le64_to_cpu(de->inode)) {
				/* We might block in the next section
				 * if the data destination is
				 * currently swapped out. So, use a
				 * version stamp to detect whether or
				 * not the directory has been modified
				 * during the copy operation.
				 */
				unsigned long version = filp->f_version;
				unsigned char d_type = DT_UNKNOWN;

				if (de->file_type < OCFS2_FT_MAX)
					d_type = ocfs2_filetype_table[de->file_type];
				error = filldir(dirent, de->name,
						de->name_len,
						filp->f_pos,
						ino_from_blkno(sb, le64_to_cpu(de->inode)),
						d_type);
				if (error)
					break;
				if (version != filp->f_version)
					goto revalidate;
				stored ++;
			}
			filp->f_pos += le16_to_cpu(de->rec_len);
		}
		offset = 0;
		brelse(bh);
	}

	/* A completed (or filldir-terminated) pass reports success. */
	stored = 0;
bail:
	ocfs2_meta_unlock(inode, lock_level);

bail_nolock:
	mlog_exit(stored);

	return stored;
}
217 | 217 | ||
218 | /* | 218 | /* |
219 | * NOTE: this should always be called with parent dir i_mutex taken. | 219 | * NOTE: this should always be called with parent dir i_mutex taken. |
220 | */ | 220 | */ |
221 | int ocfs2_find_files_on_disk(const char *name, | 221 | int ocfs2_find_files_on_disk(const char *name, |
222 | int namelen, | 222 | int namelen, |
223 | u64 *blkno, | 223 | u64 *blkno, |
224 | struct inode *inode, | 224 | struct inode *inode, |
225 | struct buffer_head **dirent_bh, | 225 | struct buffer_head **dirent_bh, |
226 | struct ocfs2_dir_entry **dirent) | 226 | struct ocfs2_dir_entry **dirent) |
227 | { | 227 | { |
228 | int status = -ENOENT; | 228 | int status = -ENOENT; |
229 | 229 | ||
230 | mlog_entry("(name=%.*s, blkno=%p, inode=%p, dirent_bh=%p, dirent=%p)\n", | 230 | mlog_entry("(name=%.*s, blkno=%p, inode=%p, dirent_bh=%p, dirent=%p)\n", |
231 | namelen, name, blkno, inode, dirent_bh, dirent); | 231 | namelen, name, blkno, inode, dirent_bh, dirent); |
232 | 232 | ||
233 | *dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent); | 233 | *dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent); |
234 | if (!*dirent_bh || !*dirent) { | 234 | if (!*dirent_bh || !*dirent) { |
235 | status = -ENOENT; | 235 | status = -ENOENT; |
236 | goto leave; | 236 | goto leave; |
237 | } | 237 | } |
238 | 238 | ||
239 | *blkno = le64_to_cpu((*dirent)->inode); | 239 | *blkno = le64_to_cpu((*dirent)->inode); |
240 | 240 | ||
241 | status = 0; | 241 | status = 0; |
242 | leave: | 242 | leave: |
243 | if (status < 0) { | 243 | if (status < 0) { |
244 | *dirent = NULL; | 244 | *dirent = NULL; |
245 | if (*dirent_bh) { | 245 | if (*dirent_bh) { |
246 | brelse(*dirent_bh); | 246 | brelse(*dirent_bh); |
247 | *dirent_bh = NULL; | 247 | *dirent_bh = NULL; |
248 | } | 248 | } |
249 | } | 249 | } |
250 | 250 | ||
251 | mlog_exit(status); | 251 | mlog_exit(status); |
252 | return status; | 252 | return status; |
253 | } | 253 | } |
254 | 254 | ||
255 | /* Check for a name within a directory. | 255 | /* Check for a name within a directory. |
256 | * | 256 | * |
257 | * Return 0 if the name does not exist | 257 | * Return 0 if the name does not exist |
258 | * Return -EEXIST if the directory contains the name | 258 | * Return -EEXIST if the directory contains the name |
259 | * | 259 | * |
260 | * Callers should have i_mutex + a cluster lock on dir | 260 | * Callers should have i_mutex + a cluster lock on dir |
261 | */ | 261 | */ |
262 | int ocfs2_check_dir_for_entry(struct inode *dir, | 262 | int ocfs2_check_dir_for_entry(struct inode *dir, |
263 | const char *name, | 263 | const char *name, |
264 | int namelen) | 264 | int namelen) |
265 | { | 265 | { |
266 | int ret; | 266 | int ret; |
267 | struct buffer_head *dirent_bh = NULL; | 267 | struct buffer_head *dirent_bh = NULL; |
268 | struct ocfs2_dir_entry *dirent = NULL; | 268 | struct ocfs2_dir_entry *dirent = NULL; |
269 | 269 | ||
270 | mlog_entry("dir %llu, name '%.*s'\n", | 270 | mlog_entry("dir %llu, name '%.*s'\n", |
271 | (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); | 271 | (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); |
272 | 272 | ||
273 | ret = -EEXIST; | 273 | ret = -EEXIST; |
274 | dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent); | 274 | dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent); |
275 | if (dirent_bh) | 275 | if (dirent_bh) |
276 | goto bail; | 276 | goto bail; |
277 | 277 | ||
278 | ret = 0; | 278 | ret = 0; |
279 | bail: | 279 | bail: |
280 | if (dirent_bh) | 280 | if (dirent_bh) |
281 | brelse(dirent_bh); | 281 | brelse(dirent_bh); |
282 | 282 | ||
283 | mlog_exit(ret); | 283 | mlog_exit(ret); |
284 | return ret; | 284 | return ret; |
285 | } | 285 | } |
286 | 286 | ||
287 | /* | 287 | /* |
288 | * routine to check that the specified directory is empty (for rmdir) | 288 | * routine to check that the specified directory is empty (for rmdir) |
289 | */ | 289 | */ |
/*
 * routine to check that the specified directory is empty (for rmdir)
 *
 * Returns 1 when the directory holds no live entries beyond "." and
 * ".." — including when the directory is too small or structurally
 * bad, which is logged and treated as "empty" so rmdir can proceed.
 * Returns 0 as soon as any in-use dirent past the first two is found.
 */
int ocfs2_empty_dir(struct inode *inode)
{
	unsigned long offset;
	struct buffer_head * bh;
	struct ocfs2_dir_entry * de, * de1;
	struct super_block * sb;
	int err;

	sb = inode->i_sb;
	/* A valid directory must at least hold "." and "..". */
	if ((i_size_read(inode) <
	     (OCFS2_DIR_REC_LEN(1) + OCFS2_DIR_REC_LEN(2))) ||
	    !(bh = ocfs2_bread(inode, 0, &err, 0))) {
		mlog(ML_ERROR, "bad directory (dir #%llu) - no data block\n",
		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
		return 1;
	}

	/* First two dirents must be "." (pointing at ourselves) and "..". */
	de = (struct ocfs2_dir_entry *) bh->b_data;
	de1 = (struct ocfs2_dir_entry *)
			((char *)de + le16_to_cpu(de->rec_len));
	if ((le64_to_cpu(de->inode) != OCFS2_I(inode)->ip_blkno) ||
	    !le64_to_cpu(de1->inode) ||
	    strcmp(".", de->name) ||
	    strcmp("..", de1->name)) {
		mlog(ML_ERROR, "bad directory (dir #%llu) - no `.' or `..'\n",
		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
		brelse(bh);
		return 1;
	}
	offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
	de = (struct ocfs2_dir_entry *)((char *)de1 + le16_to_cpu(de1->rec_len));
	/* Walk the remaining dirents; any live one means "not empty". */
	while (offset < i_size_read(inode) ) {
		if (!bh || (void *)de >= (void *)(bh->b_data + sb->s_blocksize)) {
			/* Crossed a block boundary: swap in the next block. */
			brelse(bh);
			bh = ocfs2_bread(inode,
					 offset >> sb->s_blocksize_bits, &err, 0);
			if (!bh) {
				/* Hole: log and skip the whole block. */
				mlog(ML_ERROR, "dir %llu has a hole at %lu\n",
				     (unsigned long long)OCFS2_I(inode)->ip_blkno, offset);
				offset += sb->s_blocksize;
				continue;
			}
			de = (struct ocfs2_dir_entry *) bh->b_data;
		}
		if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
			/* Corrupt dirent: give up and call it empty. */
			brelse(bh);
			return 1;
		}
		if (le64_to_cpu(de->inode)) {
			brelse(bh);
			return 0;
		}
		offset += le16_to_cpu(de->rec_len);
		de = (struct ocfs2_dir_entry *)
			((char *)de + le16_to_cpu(de->rec_len));
	}
	brelse(bh);
	return 1;
}
349 | 349 | ||
/* returns a bh of the 1st new block in the allocation. */
int ocfs2_do_extend_dir(struct super_block *sb,
			handle_t *handle,
			struct inode *dir,
			struct buffer_head *parent_fe_bh,
			struct ocfs2_alloc_context *data_ac,
			struct ocfs2_alloc_context *meta_ac,
			struct buffer_head **new_bh)
{
	int status;
	int extend;
	u64 p_blkno;

	/* A new cluster is needed only when i_size has completely
	 * filled the clusters already allocated to this directory.
	 * ip_clusters is protected by ip_lock. */
	spin_lock(&OCFS2_I(dir)->ip_lock);
	extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
	spin_unlock(&OCFS2_I(dir)->ip_lock);

	if (extend) {
		status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1,
						    parent_fe_bh, handle,
						    data_ac, meta_ac, NULL);
		/* The caller reserved everything up front, so a
		 * restart request must never happen here. */
		BUG_ON(status == -EAGAIN);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Map the first logical block past the old end of the
	 * directory to its physical block.  i_blocks counts
	 * 512-byte sectors, hence the shift by
	 * (s_blocksize_bits - 9) to get fs blocks. */
	status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >>
						   (sb->s_blocksize_bits - 9)),
					     1, &p_blkno, NULL);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* Hand back an (uninitialized) buffer for the new block;
	 * the caller fills it in and journals it. */
	*new_bh = sb_getblk(sb, p_blkno);
	if (!*new_bh) {
		status = -EIO;
		mlog_errno(status);
		goto bail;
	}
	status = 0;
bail:
	mlog_exit(status);
	return status;
}
397 | 397 | ||
/* assumes you already have a cluster lock on the directory. */
static int ocfs2_extend_dir(struct ocfs2_super *osb,
			    struct inode *dir,
			    struct buffer_head *parent_fe_bh,
			    struct buffer_head **new_de_bh)
{
	int status = 0;
	int credits, num_free_extents;
	loff_t dir_i_size;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct ocfs2_alloc_context *meta_ac = NULL;
	handle_t *handle = NULL;
	struct buffer_head *new_bh = NULL;
	struct ocfs2_dir_entry * de;
	struct super_block *sb = osb->sb;

	mlog_entry_void();

	dir_i_size = i_size_read(dir);
	mlog(0, "extending dir %llu (i_size = %lld)\n",
	     (unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size);

	/* dir->i_size is always block aligned. */
	spin_lock(&OCFS2_I(dir)->ip_lock);
	if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
		/* The allocation is full, so a new cluster (and
		 * possibly new extent-tree metadata) must be reserved
		 * before starting the transaction. */
		spin_unlock(&OCFS2_I(dir)->ip_lock);
		num_free_extents = ocfs2_num_free_extents(osb, dir, fe);
		if (num_free_extents < 0) {
			status = num_free_extents;
			mlog_errno(status);
			goto bail;
		}

		/* No free extent records left in the inode: reserve
		 * metadata so the extent tree itself can grow. */
		if (!num_free_extents) {
			status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac);
			if (status < 0) {
				/* -ENOSPC is an expected, quiet failure. */
				if (status != -ENOSPC)
					mlog_errno(status);
				goto bail;
			}
		}

		status = ocfs2_reserve_clusters(osb, 1, &data_ac);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}

		credits = ocfs2_calc_extend_credits(sb, fe, 1);
	} else {
		/* Room remains within the allocated clusters; only a
		 * simple extend is needed. */
		spin_unlock(&OCFS2_I(dir)->ip_lock);
		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
	}

	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh,
				     data_ac, meta_ac, &new_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	ocfs2_set_new_buffer_uptodate(dir, new_bh);

	status = ocfs2_journal_access(handle, dir, new_bh,
				      OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	/* Initialize the new block as a single empty dirent spanning
	 * the whole block. */
	memset(new_bh->b_data, 0, sb->s_blocksize);
	de = (struct ocfs2_dir_entry *) new_bh->b_data;
	de->inode = 0;
	de->rec_len = cpu_to_le16(sb->s_blocksize);
	status = ocfs2_journal_dirty(handle, new_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* Grow i_size/i_blocks by one block and push the change to
	 * the on-disk inode within this same transaction. */
	dir_i_size += dir->i_sb->s_blocksize;
	i_size_write(dir, dir_i_size);
	dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size);
	status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* Give the caller its own reference; ours is dropped below. */
	*new_de_bh = new_bh;
	get_bh(*new_de_bh);
bail:
	if (handle)
		ocfs2_commit_trans(osb, handle);

	if (data_ac)
		ocfs2_free_alloc_context(data_ac);
	if (meta_ac)
		ocfs2_free_alloc_context(meta_ac);

	if (new_bh)
		brelse(new_bh);

	mlog_exit(status);
	return status;
}
513 | 513 | ||
/*
 * Search the dir for a good spot, extending it if necessary. The
 * block containing an appropriate record is returned in ret_de_bh.
 */
int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
				 struct inode *dir,
				 struct buffer_head *parent_fe_bh,
				 const char *name,
				 int namelen,
				 struct buffer_head **ret_de_bh)
{
	unsigned long offset;
	struct buffer_head * bh = NULL;
	unsigned short rec_len;
	struct ocfs2_dinode *fe;
	struct ocfs2_dir_entry *de;
	struct super_block *sb;
	int status;

	mlog_entry_void();

	mlog(0, "getting ready to insert namelen %d into dir %llu\n",
	     namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno);

	BUG_ON(!S_ISDIR(dir->i_mode));
	fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
	BUG_ON(le64_to_cpu(fe->i_size) != i_size_read(dir));

	sb = dir->i_sb;

	if (!namelen) {
		status = -EINVAL;
		mlog_errno(status);
		goto bail;
	}

	bh = ocfs2_bread(dir, 0, &status, 0);
	if (!bh) {
		mlog_errno(status);
		goto bail;
	}

	/* Smallest record that could hold the new name. */
	rec_len = OCFS2_DIR_REC_LEN(namelen);
	offset = 0;
	de = (struct ocfs2_dir_entry *) bh->b_data;
	while (1) {
		/* Walked off the end of the current block? */
		if ((char *)de >= sb->s_blocksize + bh->b_data) {
			brelse(bh);
			bh = NULL;

			/* Past EOF with no free slot found: extend
			 * the directory and hand back the fresh
			 * (empty) block. */
			if (i_size_read(dir) <= offset) {
				status = ocfs2_extend_dir(osb,
							  dir,
							  parent_fe_bh,
							  &bh);
				if (status < 0) {
					mlog_errno(status);
					goto bail;
				}
				BUG_ON(!bh);
				*ret_de_bh = bh;
				get_bh(*ret_de_bh);
				goto bail;
			}
			bh = ocfs2_bread(dir,
					 offset >> sb->s_blocksize_bits,
					 &status,
					 0);
			if (!bh) {
				mlog_errno(status);
				goto bail;
			}
			/* move to next block */
			de = (struct ocfs2_dir_entry *) bh->b_data;
		}
		if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
			status = -ENOENT;
			goto bail;
		}
		if (ocfs2_match(namelen, name, de)) {
			status = -EEXIST;
			goto bail;
		}
		/* A record fits here if it is unused and big enough,
		 * or if splitting a live record still leaves room for
		 * both the old entry and the new one. */
		if (((le64_to_cpu(de->inode) == 0) &&
		     (le16_to_cpu(de->rec_len) >= rec_len)) ||
		    (le16_to_cpu(de->rec_len) >=
		     (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
			/* Ok, we found a spot. Return this bh and let
			 * the caller actually fill it in. */
			*ret_de_bh = bh;
			get_bh(*ret_de_bh);
			status = 0;
			goto bail;
		}
		offset += le16_to_cpu(de->rec_len);
		de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
	}

	status = 0;
bail:
	if (bh)
		brelse(bh);

	mlog_exit(status);
	return status;
}
620 | 620 |
fs/ocfs2/dlm/dlmfs.c
1 | /* -*- mode: c; c-basic-offset: 8; -*- | 1 | /* -*- mode: c; c-basic-offset: 8; -*- |
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | 2 | * vim: noexpandtab sw=8 ts=8 sts=0: |
3 | * | 3 | * |
4 | * dlmfs.c | 4 | * dlmfs.c |
5 | * | 5 | * |
6 | * Code which implements the kernel side of a minimal userspace | 6 | * Code which implements the kernel side of a minimal userspace |
7 | * interface to our DLM. This file handles the virtual file system | 7 | * interface to our DLM. This file handles the virtual file system |
8 | * used for communication with userspace. Credit should go to ramfs, | 8 | * used for communication with userspace. Credit should go to ramfs, |
9 | * which was a template for the fs side of this module. | 9 | * which was a template for the fs side of this module. |
10 | * | 10 | * |
11 | * Copyright (C) 2003, 2004 Oracle. All rights reserved. | 11 | * Copyright (C) 2003, 2004 Oracle. All rights reserved. |
12 | * | 12 | * |
13 | * This program is free software; you can redistribute it and/or | 13 | * This program is free software; you can redistribute it and/or |
14 | * modify it under the terms of the GNU General Public | 14 | * modify it under the terms of the GNU General Public |
15 | * License as published by the Free Software Foundation; either | 15 | * License as published by the Free Software Foundation; either |
16 | * version 2 of the License, or (at your option) any later version. | 16 | * version 2 of the License, or (at your option) any later version. |
17 | * | 17 | * |
18 | * This program is distributed in the hope that it will be useful, | 18 | * This program is distributed in the hope that it will be useful, |
19 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 19 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
20 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 20 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
21 | * General Public License for more details. | 21 | * General Public License for more details. |
22 | * | 22 | * |
23 | * You should have received a copy of the GNU General Public | 23 | * You should have received a copy of the GNU General Public |
24 | * License along with this program; if not, write to the | 24 | * License along with this program; if not, write to the |
25 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | 25 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
26 | * Boston, MA 021110-1307, USA. | 26 | * Boston, MA 021110-1307, USA. |
27 | */ | 27 | */ |
28 | 28 | ||
29 | /* Simple VFS hooks based on: */ | 29 | /* Simple VFS hooks based on: */ |
30 | /* | 30 | /* |
31 | * Resizable simple ram filesystem for Linux. | 31 | * Resizable simple ram filesystem for Linux. |
32 | * | 32 | * |
33 | * Copyright (C) 2000 Linus Torvalds. | 33 | * Copyright (C) 2000 Linus Torvalds. |
34 | * 2000 Transmeta Corp. | 34 | * 2000 Transmeta Corp. |
35 | */ | 35 | */ |
36 | 36 | ||
37 | #include <linux/module.h> | 37 | #include <linux/module.h> |
38 | #include <linux/fs.h> | 38 | #include <linux/fs.h> |
39 | #include <linux/pagemap.h> | 39 | #include <linux/pagemap.h> |
40 | #include <linux/types.h> | 40 | #include <linux/types.h> |
41 | #include <linux/slab.h> | 41 | #include <linux/slab.h> |
42 | #include <linux/highmem.h> | 42 | #include <linux/highmem.h> |
43 | #include <linux/init.h> | 43 | #include <linux/init.h> |
44 | #include <linux/string.h> | 44 | #include <linux/string.h> |
45 | #include <linux/smp_lock.h> | 45 | #include <linux/smp_lock.h> |
46 | #include <linux/backing-dev.h> | 46 | #include <linux/backing-dev.h> |
47 | 47 | ||
48 | #include <asm/uaccess.h> | 48 | #include <asm/uaccess.h> |
49 | 49 | ||
50 | 50 | ||
51 | #include "cluster/nodemanager.h" | 51 | #include "cluster/nodemanager.h" |
52 | #include "cluster/heartbeat.h" | 52 | #include "cluster/heartbeat.h" |
53 | #include "cluster/tcp.h" | 53 | #include "cluster/tcp.h" |
54 | 54 | ||
55 | #include "dlmapi.h" | 55 | #include "dlmapi.h" |
56 | 56 | ||
57 | #include "userdlm.h" | 57 | #include "userdlm.h" |
58 | 58 | ||
59 | #include "dlmfsver.h" | 59 | #include "dlmfsver.h" |
60 | 60 | ||
61 | #define MLOG_MASK_PREFIX ML_DLMFS | 61 | #define MLOG_MASK_PREFIX ML_DLMFS |
62 | #include "cluster/masklog.h" | 62 | #include "cluster/masklog.h" |
63 | 63 | ||
64 | static struct super_operations dlmfs_ops; | 64 | static struct super_operations dlmfs_ops; |
65 | static struct file_operations dlmfs_file_operations; | 65 | static struct file_operations dlmfs_file_operations; |
66 | static struct inode_operations dlmfs_dir_inode_operations; | 66 | static struct inode_operations dlmfs_dir_inode_operations; |
67 | static struct inode_operations dlmfs_root_inode_operations; | 67 | static struct inode_operations dlmfs_root_inode_operations; |
68 | static struct inode_operations dlmfs_file_inode_operations; | 68 | static struct inode_operations dlmfs_file_inode_operations; |
69 | static struct kmem_cache *dlmfs_inode_cache; | 69 | static struct kmem_cache *dlmfs_inode_cache; |
70 | 70 | ||
71 | struct workqueue_struct *user_dlm_worker; | 71 | struct workqueue_struct *user_dlm_worker; |
72 | 72 | ||
73 | /* | 73 | /* |
74 | * decodes a set of open flags into a valid lock level and a set of flags. | 74 | * decodes a set of open flags into a valid lock level and a set of flags. |
75 | * returns < 0 if we have invalid flags | 75 | * returns < 0 if we have invalid flags |
76 | * flags which mean something to us: | 76 | * flags which mean something to us: |
77 | * O_RDONLY -> PRMODE level | 77 | * O_RDONLY -> PRMODE level |
78 | * O_WRONLY -> EXMODE level | 78 | * O_WRONLY -> EXMODE level |
79 | * | 79 | * |
80 | * O_NONBLOCK -> LKM_NOQUEUE | 80 | * O_NONBLOCK -> LKM_NOQUEUE |
81 | */ | 81 | */ |
82 | static int dlmfs_decode_open_flags(int open_flags, | 82 | static int dlmfs_decode_open_flags(int open_flags, |
83 | int *level, | 83 | int *level, |
84 | int *flags) | 84 | int *flags) |
85 | { | 85 | { |
86 | if (open_flags & (O_WRONLY|O_RDWR)) | 86 | if (open_flags & (O_WRONLY|O_RDWR)) |
87 | *level = LKM_EXMODE; | 87 | *level = LKM_EXMODE; |
88 | else | 88 | else |
89 | *level = LKM_PRMODE; | 89 | *level = LKM_PRMODE; |
90 | 90 | ||
91 | *flags = 0; | 91 | *flags = 0; |
92 | if (open_flags & O_NONBLOCK) | 92 | if (open_flags & O_NONBLOCK) |
93 | *flags |= LKM_NOQUEUE; | 93 | *flags |= LKM_NOQUEUE; |
94 | 94 | ||
95 | return 0; | 95 | return 0; |
96 | } | 96 | } |
97 | 97 | ||
/*
 * Take a cluster lock matching the open mode and stash the granted
 * level in file->private_data so release can drop it later.
 */
static int dlmfs_file_open(struct inode *inode,
			   struct file *file)
{
	int status, level, flags;
	struct dlmfs_filp_private *fp = NULL;
	struct dlmfs_inode_private *ip;

	/* Only regular lock files are opened through here. */
	if (S_ISDIR(inode->i_mode))
		BUG();

	mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino,
	     file->f_flags);

	status = dlmfs_decode_open_flags(file->f_flags, &level, &flags);
	if (status < 0)
		goto bail;

	/* We don't want to honor O_APPEND at read/write time as it
	 * doesn't make sense for LVB writes. */
	file->f_flags &= ~O_APPEND;

	fp = kmalloc(sizeof(*fp), GFP_NOFS);
	if (!fp) {
		status = -ENOMEM;
		goto bail;
	}
	fp->fp_lock_level = level;

	ip = DLMFS_I(inode);

	status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags);
	if (status < 0) {
		/* this is a strange error to return here but I want
		 * to be able userspace to be able to distinguish a
		 * valid lock request from one that simply couldn't be
		 * granted. */
		if (flags & LKM_NOQUEUE && status == -EAGAIN)
			status = -ETXTBSY;
		/* Lock not taken: free fp so release sees NULL
		 * private_data and does not unlock. */
		kfree(fp);
		goto bail;
	}

	file->private_data = fp;
bail:
	return status;
}
144 | 144 | ||
/*
 * Drop the cluster lock taken at open time (if any) and free the
 * per-open private data.  Always returns 0.
 */
static int dlmfs_file_release(struct inode *inode,
			      struct file *file)
{
	int level, status;
	struct dlmfs_inode_private *ip = DLMFS_I(inode);
	struct dlmfs_filp_private *fp =
		(struct dlmfs_filp_private *) file->private_data;

	if (S_ISDIR(inode->i_mode))
		BUG();

	mlog(0, "close called on inode %lu\n", inode->i_ino);

	status = 0;
	/* fp is NULL when open failed before taking the lock. */
	if (fp) {
		level = fp->fp_lock_level;
		if (level != LKM_IVMODE)
			user_dlm_cluster_unlock(&ip->ip_lockres, level);

		kfree(fp);
		file->private_data = NULL;
	}

	return 0;
}
170 | 170 | ||
171 | static ssize_t dlmfs_file_read(struct file *filp, | 171 | static ssize_t dlmfs_file_read(struct file *filp, |
172 | char __user *buf, | 172 | char __user *buf, |
173 | size_t count, | 173 | size_t count, |
174 | loff_t *ppos) | 174 | loff_t *ppos) |
175 | { | 175 | { |
176 | int bytes_left; | 176 | int bytes_left; |
177 | ssize_t readlen; | 177 | ssize_t readlen; |
178 | char *lvb_buf; | 178 | char *lvb_buf; |
179 | struct inode *inode = filp->f_dentry->d_inode; | 179 | struct inode *inode = filp->f_path.dentry->d_inode; |
180 | 180 | ||
181 | mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", | 181 | mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", |
182 | inode->i_ino, count, *ppos); | 182 | inode->i_ino, count, *ppos); |
183 | 183 | ||
184 | if (*ppos >= i_size_read(inode)) | 184 | if (*ppos >= i_size_read(inode)) |
185 | return 0; | 185 | return 0; |
186 | 186 | ||
187 | if (!count) | 187 | if (!count) |
188 | return 0; | 188 | return 0; |
189 | 189 | ||
190 | if (!access_ok(VERIFY_WRITE, buf, count)) | 190 | if (!access_ok(VERIFY_WRITE, buf, count)) |
191 | return -EFAULT; | 191 | return -EFAULT; |
192 | 192 | ||
193 | /* don't read past the lvb */ | 193 | /* don't read past the lvb */ |
194 | if ((count + *ppos) > i_size_read(inode)) | 194 | if ((count + *ppos) > i_size_read(inode)) |
195 | readlen = i_size_read(inode) - *ppos; | 195 | readlen = i_size_read(inode) - *ppos; |
196 | else | 196 | else |
197 | readlen = count - *ppos; | 197 | readlen = count - *ppos; |
198 | 198 | ||
199 | lvb_buf = kmalloc(readlen, GFP_NOFS); | 199 | lvb_buf = kmalloc(readlen, GFP_NOFS); |
200 | if (!lvb_buf) | 200 | if (!lvb_buf) |
201 | return -ENOMEM; | 201 | return -ENOMEM; |
202 | 202 | ||
203 | user_dlm_read_lvb(inode, lvb_buf, readlen); | 203 | user_dlm_read_lvb(inode, lvb_buf, readlen); |
204 | bytes_left = __copy_to_user(buf, lvb_buf, readlen); | 204 | bytes_left = __copy_to_user(buf, lvb_buf, readlen); |
205 | readlen -= bytes_left; | 205 | readlen -= bytes_left; |
206 | 206 | ||
207 | kfree(lvb_buf); | 207 | kfree(lvb_buf); |
208 | 208 | ||
209 | *ppos = *ppos + readlen; | 209 | *ppos = *ppos + readlen; |
210 | 210 | ||
211 | mlog(0, "read %zd bytes\n", readlen); | 211 | mlog(0, "read %zd bytes\n", readlen); |
212 | return readlen; | 212 | return readlen; |
213 | } | 213 | } |
214 | 214 | ||
215 | static ssize_t dlmfs_file_write(struct file *filp, | 215 | static ssize_t dlmfs_file_write(struct file *filp, |
216 | const char __user *buf, | 216 | const char __user *buf, |
217 | size_t count, | 217 | size_t count, |
218 | loff_t *ppos) | 218 | loff_t *ppos) |
219 | { | 219 | { |
220 | int bytes_left; | 220 | int bytes_left; |
221 | ssize_t writelen; | 221 | ssize_t writelen; |
222 | char *lvb_buf; | 222 | char *lvb_buf; |
223 | struct inode *inode = filp->f_dentry->d_inode; | 223 | struct inode *inode = filp->f_path.dentry->d_inode; |
224 | 224 | ||
225 | mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", | 225 | mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", |
226 | inode->i_ino, count, *ppos); | 226 | inode->i_ino, count, *ppos); |
227 | 227 | ||
228 | if (*ppos >= i_size_read(inode)) | 228 | if (*ppos >= i_size_read(inode)) |
229 | return -ENOSPC; | 229 | return -ENOSPC; |
230 | 230 | ||
231 | if (!count) | 231 | if (!count) |
232 | return 0; | 232 | return 0; |
233 | 233 | ||
234 | if (!access_ok(VERIFY_READ, buf, count)) | 234 | if (!access_ok(VERIFY_READ, buf, count)) |
235 | return -EFAULT; | 235 | return -EFAULT; |
236 | 236 | ||
237 | /* don't write past the lvb */ | 237 | /* don't write past the lvb */ |
238 | if ((count + *ppos) > i_size_read(inode)) | 238 | if ((count + *ppos) > i_size_read(inode)) |
239 | writelen = i_size_read(inode) - *ppos; | 239 | writelen = i_size_read(inode) - *ppos; |
240 | else | 240 | else |
241 | writelen = count - *ppos; | 241 | writelen = count - *ppos; |
242 | 242 | ||
243 | lvb_buf = kmalloc(writelen, GFP_NOFS); | 243 | lvb_buf = kmalloc(writelen, GFP_NOFS); |
244 | if (!lvb_buf) | 244 | if (!lvb_buf) |
245 | return -ENOMEM; | 245 | return -ENOMEM; |
246 | 246 | ||
247 | bytes_left = copy_from_user(lvb_buf, buf, writelen); | 247 | bytes_left = copy_from_user(lvb_buf, buf, writelen); |
248 | writelen -= bytes_left; | 248 | writelen -= bytes_left; |
249 | if (writelen) | 249 | if (writelen) |
250 | user_dlm_write_lvb(inode, lvb_buf, writelen); | 250 | user_dlm_write_lvb(inode, lvb_buf, writelen); |
251 | 251 | ||
252 | kfree(lvb_buf); | 252 | kfree(lvb_buf); |
253 | 253 | ||
254 | *ppos = *ppos + writelen; | 254 | *ppos = *ppos + writelen; |
255 | mlog(0, "wrote %zd bytes\n", writelen); | 255 | mlog(0, "wrote %zd bytes\n", writelen); |
256 | return writelen; | 256 | return writelen; |
257 | } | 257 | } |
258 | 258 | ||
259 | static void dlmfs_init_once(void *foo, | 259 | static void dlmfs_init_once(void *foo, |
260 | struct kmem_cache *cachep, | 260 | struct kmem_cache *cachep, |
261 | unsigned long flags) | 261 | unsigned long flags) |
262 | { | 262 | { |
263 | struct dlmfs_inode_private *ip = | 263 | struct dlmfs_inode_private *ip = |
264 | (struct dlmfs_inode_private *) foo; | 264 | (struct dlmfs_inode_private *) foo; |
265 | 265 | ||
266 | if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == | 266 | if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == |
267 | SLAB_CTOR_CONSTRUCTOR) { | 267 | SLAB_CTOR_CONSTRUCTOR) { |
268 | ip->ip_dlm = NULL; | 268 | ip->ip_dlm = NULL; |
269 | ip->ip_parent = NULL; | 269 | ip->ip_parent = NULL; |
270 | 270 | ||
271 | inode_init_once(&ip->ip_vfs_inode); | 271 | inode_init_once(&ip->ip_vfs_inode); |
272 | } | 272 | } |
273 | } | 273 | } |
274 | 274 | ||
275 | static struct inode *dlmfs_alloc_inode(struct super_block *sb) | 275 | static struct inode *dlmfs_alloc_inode(struct super_block *sb) |
276 | { | 276 | { |
277 | struct dlmfs_inode_private *ip; | 277 | struct dlmfs_inode_private *ip; |
278 | 278 | ||
279 | ip = kmem_cache_alloc(dlmfs_inode_cache, GFP_NOFS); | 279 | ip = kmem_cache_alloc(dlmfs_inode_cache, GFP_NOFS); |
280 | if (!ip) | 280 | if (!ip) |
281 | return NULL; | 281 | return NULL; |
282 | 282 | ||
283 | return &ip->ip_vfs_inode; | 283 | return &ip->ip_vfs_inode; |
284 | } | 284 | } |
285 | 285 | ||
286 | static void dlmfs_destroy_inode(struct inode *inode) | 286 | static void dlmfs_destroy_inode(struct inode *inode) |
287 | { | 287 | { |
288 | kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); | 288 | kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); |
289 | } | 289 | } |
290 | 290 | ||
291 | static void dlmfs_clear_inode(struct inode *inode) | 291 | static void dlmfs_clear_inode(struct inode *inode) |
292 | { | 292 | { |
293 | int status; | 293 | int status; |
294 | struct dlmfs_inode_private *ip; | 294 | struct dlmfs_inode_private *ip; |
295 | 295 | ||
296 | if (!inode) | 296 | if (!inode) |
297 | return; | 297 | return; |
298 | 298 | ||
299 | mlog(0, "inode %lu\n", inode->i_ino); | 299 | mlog(0, "inode %lu\n", inode->i_ino); |
300 | 300 | ||
301 | ip = DLMFS_I(inode); | 301 | ip = DLMFS_I(inode); |
302 | 302 | ||
303 | if (S_ISREG(inode->i_mode)) { | 303 | if (S_ISREG(inode->i_mode)) { |
304 | status = user_dlm_destroy_lock(&ip->ip_lockres); | 304 | status = user_dlm_destroy_lock(&ip->ip_lockres); |
305 | if (status < 0) | 305 | if (status < 0) |
306 | mlog_errno(status); | 306 | mlog_errno(status); |
307 | iput(ip->ip_parent); | 307 | iput(ip->ip_parent); |
308 | goto clear_fields; | 308 | goto clear_fields; |
309 | } | 309 | } |
310 | 310 | ||
311 | mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm); | 311 | mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm); |
312 | /* we must be a directory. If required, lets unregister the | 312 | /* we must be a directory. If required, lets unregister the |
313 | * dlm context now. */ | 313 | * dlm context now. */ |
314 | if (ip->ip_dlm) | 314 | if (ip->ip_dlm) |
315 | user_dlm_unregister_context(ip->ip_dlm); | 315 | user_dlm_unregister_context(ip->ip_dlm); |
316 | clear_fields: | 316 | clear_fields: |
317 | ip->ip_parent = NULL; | 317 | ip->ip_parent = NULL; |
318 | ip->ip_dlm = NULL; | 318 | ip->ip_dlm = NULL; |
319 | } | 319 | } |
320 | 320 | ||
321 | static struct backing_dev_info dlmfs_backing_dev_info = { | 321 | static struct backing_dev_info dlmfs_backing_dev_info = { |
322 | .ra_pages = 0, /* No readahead */ | 322 | .ra_pages = 0, /* No readahead */ |
323 | .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, | 323 | .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, |
324 | }; | 324 | }; |
325 | 325 | ||
326 | static struct inode *dlmfs_get_root_inode(struct super_block *sb) | 326 | static struct inode *dlmfs_get_root_inode(struct super_block *sb) |
327 | { | 327 | { |
328 | struct inode *inode = new_inode(sb); | 328 | struct inode *inode = new_inode(sb); |
329 | int mode = S_IFDIR | 0755; | 329 | int mode = S_IFDIR | 0755; |
330 | struct dlmfs_inode_private *ip; | 330 | struct dlmfs_inode_private *ip; |
331 | 331 | ||
332 | if (inode) { | 332 | if (inode) { |
333 | ip = DLMFS_I(inode); | 333 | ip = DLMFS_I(inode); |
334 | 334 | ||
335 | inode->i_mode = mode; | 335 | inode->i_mode = mode; |
336 | inode->i_uid = current->fsuid; | 336 | inode->i_uid = current->fsuid; |
337 | inode->i_gid = current->fsgid; | 337 | inode->i_gid = current->fsgid; |
338 | inode->i_blocks = 0; | 338 | inode->i_blocks = 0; |
339 | inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; | 339 | inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; |
340 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 340 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
341 | inc_nlink(inode); | 341 | inc_nlink(inode); |
342 | 342 | ||
343 | inode->i_fop = &simple_dir_operations; | 343 | inode->i_fop = &simple_dir_operations; |
344 | inode->i_op = &dlmfs_root_inode_operations; | 344 | inode->i_op = &dlmfs_root_inode_operations; |
345 | } | 345 | } |
346 | 346 | ||
347 | return inode; | 347 | return inode; |
348 | } | 348 | } |
349 | 349 | ||
350 | static struct inode *dlmfs_get_inode(struct inode *parent, | 350 | static struct inode *dlmfs_get_inode(struct inode *parent, |
351 | struct dentry *dentry, | 351 | struct dentry *dentry, |
352 | int mode) | 352 | int mode) |
353 | { | 353 | { |
354 | struct super_block *sb = parent->i_sb; | 354 | struct super_block *sb = parent->i_sb; |
355 | struct inode * inode = new_inode(sb); | 355 | struct inode * inode = new_inode(sb); |
356 | struct dlmfs_inode_private *ip; | 356 | struct dlmfs_inode_private *ip; |
357 | 357 | ||
358 | if (!inode) | 358 | if (!inode) |
359 | return NULL; | 359 | return NULL; |
360 | 360 | ||
361 | inode->i_mode = mode; | 361 | inode->i_mode = mode; |
362 | inode->i_uid = current->fsuid; | 362 | inode->i_uid = current->fsuid; |
363 | inode->i_gid = current->fsgid; | 363 | inode->i_gid = current->fsgid; |
364 | inode->i_blocks = 0; | 364 | inode->i_blocks = 0; |
365 | inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; | 365 | inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; |
366 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 366 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
367 | 367 | ||
368 | ip = DLMFS_I(inode); | 368 | ip = DLMFS_I(inode); |
369 | ip->ip_dlm = DLMFS_I(parent)->ip_dlm; | 369 | ip->ip_dlm = DLMFS_I(parent)->ip_dlm; |
370 | 370 | ||
371 | switch (mode & S_IFMT) { | 371 | switch (mode & S_IFMT) { |
372 | default: | 372 | default: |
373 | /* for now we don't support anything other than | 373 | /* for now we don't support anything other than |
374 | * directories and regular files. */ | 374 | * directories and regular files. */ |
375 | BUG(); | 375 | BUG(); |
376 | break; | 376 | break; |
377 | case S_IFREG: | 377 | case S_IFREG: |
378 | inode->i_op = &dlmfs_file_inode_operations; | 378 | inode->i_op = &dlmfs_file_inode_operations; |
379 | inode->i_fop = &dlmfs_file_operations; | 379 | inode->i_fop = &dlmfs_file_operations; |
380 | 380 | ||
381 | i_size_write(inode, DLM_LVB_LEN); | 381 | i_size_write(inode, DLM_LVB_LEN); |
382 | 382 | ||
383 | user_dlm_lock_res_init(&ip->ip_lockres, dentry); | 383 | user_dlm_lock_res_init(&ip->ip_lockres, dentry); |
384 | 384 | ||
385 | /* released at clear_inode time, this insures that we | 385 | /* released at clear_inode time, this insures that we |
386 | * get to drop the dlm reference on each lock *before* | 386 | * get to drop the dlm reference on each lock *before* |
387 | * we call the unregister code for releasing parent | 387 | * we call the unregister code for releasing parent |
388 | * directories. */ | 388 | * directories. */ |
389 | ip->ip_parent = igrab(parent); | 389 | ip->ip_parent = igrab(parent); |
390 | BUG_ON(!ip->ip_parent); | 390 | BUG_ON(!ip->ip_parent); |
391 | break; | 391 | break; |
392 | case S_IFDIR: | 392 | case S_IFDIR: |
393 | inode->i_op = &dlmfs_dir_inode_operations; | 393 | inode->i_op = &dlmfs_dir_inode_operations; |
394 | inode->i_fop = &simple_dir_operations; | 394 | inode->i_fop = &simple_dir_operations; |
395 | 395 | ||
396 | /* directory inodes start off with i_nlink == | 396 | /* directory inodes start off with i_nlink == |
397 | * 2 (for "." entry) */ | 397 | * 2 (for "." entry) */ |
398 | inc_nlink(inode); | 398 | inc_nlink(inode); |
399 | break; | 399 | break; |
400 | } | 400 | } |
401 | 401 | ||
402 | if (parent->i_mode & S_ISGID) { | 402 | if (parent->i_mode & S_ISGID) { |
403 | inode->i_gid = parent->i_gid; | 403 | inode->i_gid = parent->i_gid; |
404 | if (S_ISDIR(mode)) | 404 | if (S_ISDIR(mode)) |
405 | inode->i_mode |= S_ISGID; | 405 | inode->i_mode |= S_ISGID; |
406 | } | 406 | } |
407 | 407 | ||
408 | return inode; | 408 | return inode; |
409 | } | 409 | } |
410 | 410 | ||
411 | /* | 411 | /* |
412 | * File creation. Allocate an inode, and we're done.. | 412 | * File creation. Allocate an inode, and we're done.. |
413 | */ | 413 | */ |
414 | /* SMP-safe */ | 414 | /* SMP-safe */ |
415 | static int dlmfs_mkdir(struct inode * dir, | 415 | static int dlmfs_mkdir(struct inode * dir, |
416 | struct dentry * dentry, | 416 | struct dentry * dentry, |
417 | int mode) | 417 | int mode) |
418 | { | 418 | { |
419 | int status; | 419 | int status; |
420 | struct inode *inode = NULL; | 420 | struct inode *inode = NULL; |
421 | struct qstr *domain = &dentry->d_name; | 421 | struct qstr *domain = &dentry->d_name; |
422 | struct dlmfs_inode_private *ip; | 422 | struct dlmfs_inode_private *ip; |
423 | struct dlm_ctxt *dlm; | 423 | struct dlm_ctxt *dlm; |
424 | 424 | ||
425 | mlog(0, "mkdir %.*s\n", domain->len, domain->name); | 425 | mlog(0, "mkdir %.*s\n", domain->len, domain->name); |
426 | 426 | ||
427 | /* verify that we have a proper domain */ | 427 | /* verify that we have a proper domain */ |
428 | if (domain->len >= O2NM_MAX_NAME_LEN) { | 428 | if (domain->len >= O2NM_MAX_NAME_LEN) { |
429 | status = -EINVAL; | 429 | status = -EINVAL; |
430 | mlog(ML_ERROR, "invalid domain name for directory.\n"); | 430 | mlog(ML_ERROR, "invalid domain name for directory.\n"); |
431 | goto bail; | 431 | goto bail; |
432 | } | 432 | } |
433 | 433 | ||
434 | inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR); | 434 | inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR); |
435 | if (!inode) { | 435 | if (!inode) { |
436 | status = -ENOMEM; | 436 | status = -ENOMEM; |
437 | mlog_errno(status); | 437 | mlog_errno(status); |
438 | goto bail; | 438 | goto bail; |
439 | } | 439 | } |
440 | 440 | ||
441 | ip = DLMFS_I(inode); | 441 | ip = DLMFS_I(inode); |
442 | 442 | ||
443 | dlm = user_dlm_register_context(domain); | 443 | dlm = user_dlm_register_context(domain); |
444 | if (IS_ERR(dlm)) { | 444 | if (IS_ERR(dlm)) { |
445 | status = PTR_ERR(dlm); | 445 | status = PTR_ERR(dlm); |
446 | mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n", | 446 | mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n", |
447 | status, domain->len, domain->name); | 447 | status, domain->len, domain->name); |
448 | goto bail; | 448 | goto bail; |
449 | } | 449 | } |
450 | ip->ip_dlm = dlm; | 450 | ip->ip_dlm = dlm; |
451 | 451 | ||
452 | inc_nlink(dir); | 452 | inc_nlink(dir); |
453 | d_instantiate(dentry, inode); | 453 | d_instantiate(dentry, inode); |
454 | dget(dentry); /* Extra count - pin the dentry in core */ | 454 | dget(dentry); /* Extra count - pin the dentry in core */ |
455 | 455 | ||
456 | status = 0; | 456 | status = 0; |
457 | bail: | 457 | bail: |
458 | if (status < 0) | 458 | if (status < 0) |
459 | iput(inode); | 459 | iput(inode); |
460 | return status; | 460 | return status; |
461 | } | 461 | } |
462 | 462 | ||
463 | static int dlmfs_create(struct inode *dir, | 463 | static int dlmfs_create(struct inode *dir, |
464 | struct dentry *dentry, | 464 | struct dentry *dentry, |
465 | int mode, | 465 | int mode, |
466 | struct nameidata *nd) | 466 | struct nameidata *nd) |
467 | { | 467 | { |
468 | int status = 0; | 468 | int status = 0; |
469 | struct inode *inode; | 469 | struct inode *inode; |
470 | struct qstr *name = &dentry->d_name; | 470 | struct qstr *name = &dentry->d_name; |
471 | 471 | ||
472 | mlog(0, "create %.*s\n", name->len, name->name); | 472 | mlog(0, "create %.*s\n", name->len, name->name); |
473 | 473 | ||
474 | /* verify name is valid and doesn't contain any dlm reserved | 474 | /* verify name is valid and doesn't contain any dlm reserved |
475 | * characters */ | 475 | * characters */ |
476 | if (name->len >= USER_DLM_LOCK_ID_MAX_LEN || | 476 | if (name->len >= USER_DLM_LOCK_ID_MAX_LEN || |
477 | name->name[0] == '$') { | 477 | name->name[0] == '$') { |
478 | status = -EINVAL; | 478 | status = -EINVAL; |
479 | mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len, | 479 | mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len, |
480 | name->name); | 480 | name->name); |
481 | goto bail; | 481 | goto bail; |
482 | } | 482 | } |
483 | 483 | ||
484 | inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG); | 484 | inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG); |
485 | if (!inode) { | 485 | if (!inode) { |
486 | status = -ENOMEM; | 486 | status = -ENOMEM; |
487 | mlog_errno(status); | 487 | mlog_errno(status); |
488 | goto bail; | 488 | goto bail; |
489 | } | 489 | } |
490 | 490 | ||
491 | d_instantiate(dentry, inode); | 491 | d_instantiate(dentry, inode); |
492 | dget(dentry); /* Extra count - pin the dentry in core */ | 492 | dget(dentry); /* Extra count - pin the dentry in core */ |
493 | bail: | 493 | bail: |
494 | return status; | 494 | return status; |
495 | } | 495 | } |
496 | 496 | ||
497 | static int dlmfs_unlink(struct inode *dir, | 497 | static int dlmfs_unlink(struct inode *dir, |
498 | struct dentry *dentry) | 498 | struct dentry *dentry) |
499 | { | 499 | { |
500 | int status; | 500 | int status; |
501 | struct inode *inode = dentry->d_inode; | 501 | struct inode *inode = dentry->d_inode; |
502 | 502 | ||
503 | mlog(0, "unlink inode %lu\n", inode->i_ino); | 503 | mlog(0, "unlink inode %lu\n", inode->i_ino); |
504 | 504 | ||
505 | /* if there are no current holders, or none that are waiting | 505 | /* if there are no current holders, or none that are waiting |
506 | * to acquire a lock, this basically destroys our lockres. */ | 506 | * to acquire a lock, this basically destroys our lockres. */ |
507 | status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres); | 507 | status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres); |
508 | if (status < 0) { | 508 | if (status < 0) { |
509 | mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n", | 509 | mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n", |
510 | dentry->d_name.len, dentry->d_name.name, status); | 510 | dentry->d_name.len, dentry->d_name.name, status); |
511 | goto bail; | 511 | goto bail; |
512 | } | 512 | } |
513 | status = simple_unlink(dir, dentry); | 513 | status = simple_unlink(dir, dentry); |
514 | bail: | 514 | bail: |
515 | return status; | 515 | return status; |
516 | } | 516 | } |
517 | 517 | ||
518 | static int dlmfs_fill_super(struct super_block * sb, | 518 | static int dlmfs_fill_super(struct super_block * sb, |
519 | void * data, | 519 | void * data, |
520 | int silent) | 520 | int silent) |
521 | { | 521 | { |
522 | struct inode * inode; | 522 | struct inode * inode; |
523 | struct dentry * root; | 523 | struct dentry * root; |
524 | 524 | ||
525 | sb->s_maxbytes = MAX_LFS_FILESIZE; | 525 | sb->s_maxbytes = MAX_LFS_FILESIZE; |
526 | sb->s_blocksize = PAGE_CACHE_SIZE; | 526 | sb->s_blocksize = PAGE_CACHE_SIZE; |
527 | sb->s_blocksize_bits = PAGE_CACHE_SHIFT; | 527 | sb->s_blocksize_bits = PAGE_CACHE_SHIFT; |
528 | sb->s_magic = DLMFS_MAGIC; | 528 | sb->s_magic = DLMFS_MAGIC; |
529 | sb->s_op = &dlmfs_ops; | 529 | sb->s_op = &dlmfs_ops; |
530 | inode = dlmfs_get_root_inode(sb); | 530 | inode = dlmfs_get_root_inode(sb); |
531 | if (!inode) | 531 | if (!inode) |
532 | return -ENOMEM; | 532 | return -ENOMEM; |
533 | 533 | ||
534 | root = d_alloc_root(inode); | 534 | root = d_alloc_root(inode); |
535 | if (!root) { | 535 | if (!root) { |
536 | iput(inode); | 536 | iput(inode); |
537 | return -ENOMEM; | 537 | return -ENOMEM; |
538 | } | 538 | } |
539 | sb->s_root = root; | 539 | sb->s_root = root; |
540 | return 0; | 540 | return 0; |
541 | } | 541 | } |
542 | 542 | ||
543 | static struct file_operations dlmfs_file_operations = { | 543 | static struct file_operations dlmfs_file_operations = { |
544 | .open = dlmfs_file_open, | 544 | .open = dlmfs_file_open, |
545 | .release = dlmfs_file_release, | 545 | .release = dlmfs_file_release, |
546 | .read = dlmfs_file_read, | 546 | .read = dlmfs_file_read, |
547 | .write = dlmfs_file_write, | 547 | .write = dlmfs_file_write, |
548 | }; | 548 | }; |
549 | 549 | ||
550 | static struct inode_operations dlmfs_dir_inode_operations = { | 550 | static struct inode_operations dlmfs_dir_inode_operations = { |
551 | .create = dlmfs_create, | 551 | .create = dlmfs_create, |
552 | .lookup = simple_lookup, | 552 | .lookup = simple_lookup, |
553 | .unlink = dlmfs_unlink, | 553 | .unlink = dlmfs_unlink, |
554 | }; | 554 | }; |
555 | 555 | ||
556 | /* this way we can restrict mkdir to only the toplevel of the fs. */ | 556 | /* this way we can restrict mkdir to only the toplevel of the fs. */ |
557 | static struct inode_operations dlmfs_root_inode_operations = { | 557 | static struct inode_operations dlmfs_root_inode_operations = { |
558 | .lookup = simple_lookup, | 558 | .lookup = simple_lookup, |
559 | .mkdir = dlmfs_mkdir, | 559 | .mkdir = dlmfs_mkdir, |
560 | .rmdir = simple_rmdir, | 560 | .rmdir = simple_rmdir, |
561 | }; | 561 | }; |
562 | 562 | ||
563 | static struct super_operations dlmfs_ops = { | 563 | static struct super_operations dlmfs_ops = { |
564 | .statfs = simple_statfs, | 564 | .statfs = simple_statfs, |
565 | .alloc_inode = dlmfs_alloc_inode, | 565 | .alloc_inode = dlmfs_alloc_inode, |
566 | .destroy_inode = dlmfs_destroy_inode, | 566 | .destroy_inode = dlmfs_destroy_inode, |
567 | .clear_inode = dlmfs_clear_inode, | 567 | .clear_inode = dlmfs_clear_inode, |
568 | .drop_inode = generic_delete_inode, | 568 | .drop_inode = generic_delete_inode, |
569 | }; | 569 | }; |
570 | 570 | ||
571 | static struct inode_operations dlmfs_file_inode_operations = { | 571 | static struct inode_operations dlmfs_file_inode_operations = { |
572 | .getattr = simple_getattr, | 572 | .getattr = simple_getattr, |
573 | }; | 573 | }; |
574 | 574 | ||
575 | static int dlmfs_get_sb(struct file_system_type *fs_type, | 575 | static int dlmfs_get_sb(struct file_system_type *fs_type, |
576 | int flags, const char *dev_name, void *data, struct vfsmount *mnt) | 576 | int flags, const char *dev_name, void *data, struct vfsmount *mnt) |
577 | { | 577 | { |
578 | return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt); | 578 | return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt); |
579 | } | 579 | } |
580 | 580 | ||
581 | static struct file_system_type dlmfs_fs_type = { | 581 | static struct file_system_type dlmfs_fs_type = { |
582 | .owner = THIS_MODULE, | 582 | .owner = THIS_MODULE, |
583 | .name = "ocfs2_dlmfs", | 583 | .name = "ocfs2_dlmfs", |
584 | .get_sb = dlmfs_get_sb, | 584 | .get_sb = dlmfs_get_sb, |
585 | .kill_sb = kill_litter_super, | 585 | .kill_sb = kill_litter_super, |
586 | }; | 586 | }; |
587 | 587 | ||
588 | static int __init init_dlmfs_fs(void) | 588 | static int __init init_dlmfs_fs(void) |
589 | { | 589 | { |
590 | int status; | 590 | int status; |
591 | int cleanup_inode = 0, cleanup_worker = 0; | 591 | int cleanup_inode = 0, cleanup_worker = 0; |
592 | 592 | ||
593 | dlmfs_print_version(); | 593 | dlmfs_print_version(); |
594 | 594 | ||
595 | dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", | 595 | dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", |
596 | sizeof(struct dlmfs_inode_private), | 596 | sizeof(struct dlmfs_inode_private), |
597 | 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| | 597 | 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| |
598 | SLAB_MEM_SPREAD), | 598 | SLAB_MEM_SPREAD), |
599 | dlmfs_init_once, NULL); | 599 | dlmfs_init_once, NULL); |
600 | if (!dlmfs_inode_cache) | 600 | if (!dlmfs_inode_cache) |
601 | return -ENOMEM; | 601 | return -ENOMEM; |
602 | cleanup_inode = 1; | 602 | cleanup_inode = 1; |
603 | 603 | ||
604 | user_dlm_worker = create_singlethread_workqueue("user_dlm"); | 604 | user_dlm_worker = create_singlethread_workqueue("user_dlm"); |
605 | if (!user_dlm_worker) { | 605 | if (!user_dlm_worker) { |
606 | status = -ENOMEM; | 606 | status = -ENOMEM; |
607 | goto bail; | 607 | goto bail; |
608 | } | 608 | } |
609 | cleanup_worker = 1; | 609 | cleanup_worker = 1; |
610 | 610 | ||
611 | status = register_filesystem(&dlmfs_fs_type); | 611 | status = register_filesystem(&dlmfs_fs_type); |
612 | bail: | 612 | bail: |
613 | if (status) { | 613 | if (status) { |
614 | if (cleanup_inode) | 614 | if (cleanup_inode) |
615 | kmem_cache_destroy(dlmfs_inode_cache); | 615 | kmem_cache_destroy(dlmfs_inode_cache); |
616 | if (cleanup_worker) | 616 | if (cleanup_worker) |
617 | destroy_workqueue(user_dlm_worker); | 617 | destroy_workqueue(user_dlm_worker); |
618 | } else | 618 | } else |
619 | printk("OCFS2 User DLM kernel interface loaded\n"); | 619 | printk("OCFS2 User DLM kernel interface loaded\n"); |
620 | return status; | 620 | return status; |
621 | } | 621 | } |
622 | 622 | ||
623 | static void __exit exit_dlmfs_fs(void) | 623 | static void __exit exit_dlmfs_fs(void) |
624 | { | 624 | { |
625 | unregister_filesystem(&dlmfs_fs_type); | 625 | unregister_filesystem(&dlmfs_fs_type); |
626 | 626 | ||
627 | flush_workqueue(user_dlm_worker); | 627 | flush_workqueue(user_dlm_worker); |
628 | destroy_workqueue(user_dlm_worker); | 628 | destroy_workqueue(user_dlm_worker); |
629 | 629 | ||
630 | kmem_cache_destroy(dlmfs_inode_cache); | 630 | kmem_cache_destroy(dlmfs_inode_cache); |
631 | } | 631 | } |
632 | 632 | ||
633 | MODULE_AUTHOR("Oracle"); | 633 | MODULE_AUTHOR("Oracle"); |
634 | MODULE_LICENSE("GPL"); | 634 | MODULE_LICENSE("GPL"); |
635 | 635 | ||
636 | module_init(init_dlmfs_fs) | 636 | module_init(init_dlmfs_fs) |
637 | module_exit(exit_dlmfs_fs) | 637 | module_exit(exit_dlmfs_fs) |
638 | 638 |
fs/ocfs2/file.c
1 | /* -*- mode: c; c-basic-offset: 8; -*- | 1 | /* -*- mode: c; c-basic-offset: 8; -*- |
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | 2 | * vim: noexpandtab sw=8 ts=8 sts=0: |
3 | * | 3 | * |
4 | * file.c | 4 | * file.c |
5 | * | 5 | * |
6 | * File open, close, extend, truncate | 6 | * File open, close, extend, truncate |
7 | * | 7 | * |
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | 8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. |
9 | * | 9 | * |
10 | * This program is free software; you can redistribute it and/or | 10 | * This program is free software; you can redistribute it and/or |
11 | * modify it under the terms of the GNU General Public | 11 | * modify it under the terms of the GNU General Public |
12 | * License as published by the Free Software Foundation; either | 12 | * License as published by the Free Software Foundation; either |
13 | * version 2 of the License, or (at your option) any later version. | 13 | * version 2 of the License, or (at your option) any later version. |
14 | * | 14 | * |
15 | * This program is distributed in the hope that it will be useful, | 15 | * This program is distributed in the hope that it will be useful, |
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
18 | * General Public License for more details. | 18 | * General Public License for more details. |
19 | * | 19 | * |
20 | * You should have received a copy of the GNU General Public | 20 | * You should have received a copy of the GNU General Public |
21 | * License along with this program; if not, write to the | 21 | * License along with this program; if not, write to the |
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | 22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
23 | * Boston, MA 021110-1307, USA. | 23 | * Boston, MA 021110-1307, USA. |
24 | */ | 24 | */ |
25 | 25 | ||
26 | #include <linux/capability.h> | 26 | #include <linux/capability.h> |
27 | #include <linux/fs.h> | 27 | #include <linux/fs.h> |
28 | #include <linux/types.h> | 28 | #include <linux/types.h> |
29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
30 | #include <linux/highmem.h> | 30 | #include <linux/highmem.h> |
31 | #include <linux/pagemap.h> | 31 | #include <linux/pagemap.h> |
32 | #include <linux/uio.h> | 32 | #include <linux/uio.h> |
33 | #include <linux/sched.h> | 33 | #include <linux/sched.h> |
34 | #include <linux/pipe_fs_i.h> | 34 | #include <linux/pipe_fs_i.h> |
35 | #include <linux/mount.h> | 35 | #include <linux/mount.h> |
36 | 36 | ||
37 | #define MLOG_MASK_PREFIX ML_INODE | 37 | #define MLOG_MASK_PREFIX ML_INODE |
38 | #include <cluster/masklog.h> | 38 | #include <cluster/masklog.h> |
39 | 39 | ||
40 | #include "ocfs2.h" | 40 | #include "ocfs2.h" |
41 | 41 | ||
42 | #include "alloc.h" | 42 | #include "alloc.h" |
43 | #include "aops.h" | 43 | #include "aops.h" |
44 | #include "dir.h" | 44 | #include "dir.h" |
45 | #include "dlmglue.h" | 45 | #include "dlmglue.h" |
46 | #include "extent_map.h" | 46 | #include "extent_map.h" |
47 | #include "file.h" | 47 | #include "file.h" |
48 | #include "sysfile.h" | 48 | #include "sysfile.h" |
49 | #include "inode.h" | 49 | #include "inode.h" |
50 | #include "ioctl.h" | 50 | #include "ioctl.h" |
51 | #include "journal.h" | 51 | #include "journal.h" |
52 | #include "mmap.h" | 52 | #include "mmap.h" |
53 | #include "suballoc.h" | 53 | #include "suballoc.h" |
54 | #include "super.h" | 54 | #include "super.h" |
55 | 55 | ||
56 | #include "buffer_head_io.h" | 56 | #include "buffer_head_io.h" |
57 | 57 | ||
58 | static int ocfs2_sync_inode(struct inode *inode) | 58 | static int ocfs2_sync_inode(struct inode *inode) |
59 | { | 59 | { |
60 | filemap_fdatawrite(inode->i_mapping); | 60 | filemap_fdatawrite(inode->i_mapping); |
61 | return sync_mapping_buffers(inode->i_mapping); | 61 | return sync_mapping_buffers(inode->i_mapping); |
62 | } | 62 | } |
63 | 63 | ||
64 | static int ocfs2_file_open(struct inode *inode, struct file *file) | 64 | static int ocfs2_file_open(struct inode *inode, struct file *file) |
65 | { | 65 | { |
66 | int status; | 66 | int status; |
67 | int mode = file->f_flags; | 67 | int mode = file->f_flags; |
68 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | 68 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
69 | 69 | ||
70 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, | 70 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, |
71 | file->f_dentry->d_name.len, file->f_dentry->d_name.name); | 71 | file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); |
72 | 72 | ||
73 | spin_lock(&oi->ip_lock); | 73 | spin_lock(&oi->ip_lock); |
74 | 74 | ||
75 | /* Check that the inode hasn't been wiped from disk by another | 75 | /* Check that the inode hasn't been wiped from disk by another |
76 | * node. If it hasn't then we're safe as long as we hold the | 76 | * node. If it hasn't then we're safe as long as we hold the |
77 | * spin lock until our increment of open count. */ | 77 | * spin lock until our increment of open count. */ |
78 | if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { | 78 | if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { |
79 | spin_unlock(&oi->ip_lock); | 79 | spin_unlock(&oi->ip_lock); |
80 | 80 | ||
81 | status = -ENOENT; | 81 | status = -ENOENT; |
82 | goto leave; | 82 | goto leave; |
83 | } | 83 | } |
84 | 84 | ||
85 | if (mode & O_DIRECT) | 85 | if (mode & O_DIRECT) |
86 | oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; | 86 | oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; |
87 | 87 | ||
88 | oi->ip_open_count++; | 88 | oi->ip_open_count++; |
89 | spin_unlock(&oi->ip_lock); | 89 | spin_unlock(&oi->ip_lock); |
90 | status = 0; | 90 | status = 0; |
91 | leave: | 91 | leave: |
92 | mlog_exit(status); | 92 | mlog_exit(status); |
93 | return status; | 93 | return status; |
94 | } | 94 | } |
95 | 95 | ||
96 | static int ocfs2_file_release(struct inode *inode, struct file *file) | 96 | static int ocfs2_file_release(struct inode *inode, struct file *file) |
97 | { | 97 | { |
98 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | 98 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
99 | 99 | ||
100 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, | 100 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, |
101 | file->f_dentry->d_name.len, | 101 | file->f_path.dentry->d_name.len, |
102 | file->f_dentry->d_name.name); | 102 | file->f_path.dentry->d_name.name); |
103 | 103 | ||
104 | spin_lock(&oi->ip_lock); | 104 | spin_lock(&oi->ip_lock); |
105 | if (!--oi->ip_open_count) | 105 | if (!--oi->ip_open_count) |
106 | oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; | 106 | oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; |
107 | spin_unlock(&oi->ip_lock); | 107 | spin_unlock(&oi->ip_lock); |
108 | 108 | ||
109 | mlog_exit(0); | 109 | mlog_exit(0); |
110 | 110 | ||
111 | return 0; | 111 | return 0; |
112 | } | 112 | } |
113 | 113 | ||
114 | static int ocfs2_sync_file(struct file *file, | 114 | static int ocfs2_sync_file(struct file *file, |
115 | struct dentry *dentry, | 115 | struct dentry *dentry, |
116 | int datasync) | 116 | int datasync) |
117 | { | 117 | { |
118 | int err = 0; | 118 | int err = 0; |
119 | journal_t *journal; | 119 | journal_t *journal; |
120 | struct inode *inode = dentry->d_inode; | 120 | struct inode *inode = dentry->d_inode; |
121 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 121 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
122 | 122 | ||
123 | mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, | 123 | mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, |
124 | dentry->d_name.len, dentry->d_name.name); | 124 | dentry->d_name.len, dentry->d_name.name); |
125 | 125 | ||
126 | err = ocfs2_sync_inode(dentry->d_inode); | 126 | err = ocfs2_sync_inode(dentry->d_inode); |
127 | if (err) | 127 | if (err) |
128 | goto bail; | 128 | goto bail; |
129 | 129 | ||
130 | journal = osb->journal->j_journal; | 130 | journal = osb->journal->j_journal; |
131 | err = journal_force_commit(journal); | 131 | err = journal_force_commit(journal); |
132 | 132 | ||
133 | bail: | 133 | bail: |
134 | mlog_exit(err); | 134 | mlog_exit(err); |
135 | 135 | ||
136 | return (err < 0) ? -EIO : 0; | 136 | return (err < 0) ? -EIO : 0; |
137 | } | 137 | } |
138 | 138 | ||
139 | int ocfs2_should_update_atime(struct inode *inode, | 139 | int ocfs2_should_update_atime(struct inode *inode, |
140 | struct vfsmount *vfsmnt) | 140 | struct vfsmount *vfsmnt) |
141 | { | 141 | { |
142 | struct timespec now; | 142 | struct timespec now; |
143 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 143 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
144 | 144 | ||
145 | if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) | 145 | if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) |
146 | return 0; | 146 | return 0; |
147 | 147 | ||
148 | if ((inode->i_flags & S_NOATIME) || | 148 | if ((inode->i_flags & S_NOATIME) || |
149 | ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) | 149 | ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) |
150 | return 0; | 150 | return 0; |
151 | 151 | ||
152 | if ((vfsmnt->mnt_flags & MNT_NOATIME) || | 152 | if ((vfsmnt->mnt_flags & MNT_NOATIME) || |
153 | ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) | 153 | ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) |
154 | return 0; | 154 | return 0; |
155 | 155 | ||
156 | now = CURRENT_TIME; | 156 | now = CURRENT_TIME; |
157 | if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) | 157 | if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) |
158 | return 0; | 158 | return 0; |
159 | else | 159 | else |
160 | return 1; | 160 | return 1; |
161 | } | 161 | } |
162 | 162 | ||
163 | int ocfs2_update_inode_atime(struct inode *inode, | 163 | int ocfs2_update_inode_atime(struct inode *inode, |
164 | struct buffer_head *bh) | 164 | struct buffer_head *bh) |
165 | { | 165 | { |
166 | int ret; | 166 | int ret; |
167 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 167 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
168 | handle_t *handle; | 168 | handle_t *handle; |
169 | 169 | ||
170 | mlog_entry_void(); | 170 | mlog_entry_void(); |
171 | 171 | ||
172 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 172 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
173 | if (handle == NULL) { | 173 | if (handle == NULL) { |
174 | ret = -ENOMEM; | 174 | ret = -ENOMEM; |
175 | mlog_errno(ret); | 175 | mlog_errno(ret); |
176 | goto out; | 176 | goto out; |
177 | } | 177 | } |
178 | 178 | ||
179 | inode->i_atime = CURRENT_TIME; | 179 | inode->i_atime = CURRENT_TIME; |
180 | ret = ocfs2_mark_inode_dirty(handle, inode, bh); | 180 | ret = ocfs2_mark_inode_dirty(handle, inode, bh); |
181 | if (ret < 0) | 181 | if (ret < 0) |
182 | mlog_errno(ret); | 182 | mlog_errno(ret); |
183 | 183 | ||
184 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); | 184 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); |
185 | out: | 185 | out: |
186 | mlog_exit(ret); | 186 | mlog_exit(ret); |
187 | return ret; | 187 | return ret; |
188 | } | 188 | } |
189 | 189 | ||
190 | int ocfs2_set_inode_size(handle_t *handle, | 190 | int ocfs2_set_inode_size(handle_t *handle, |
191 | struct inode *inode, | 191 | struct inode *inode, |
192 | struct buffer_head *fe_bh, | 192 | struct buffer_head *fe_bh, |
193 | u64 new_i_size) | 193 | u64 new_i_size) |
194 | { | 194 | { |
195 | int status; | 195 | int status; |
196 | 196 | ||
197 | mlog_entry_void(); | 197 | mlog_entry_void(); |
198 | i_size_write(inode, new_i_size); | 198 | i_size_write(inode, new_i_size); |
199 | inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); | 199 | inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); |
200 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 200 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
201 | 201 | ||
202 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); | 202 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); |
203 | if (status < 0) { | 203 | if (status < 0) { |
204 | mlog_errno(status); | 204 | mlog_errno(status); |
205 | goto bail; | 205 | goto bail; |
206 | } | 206 | } |
207 | 207 | ||
208 | bail: | 208 | bail: |
209 | mlog_exit(status); | 209 | mlog_exit(status); |
210 | return status; | 210 | return status; |
211 | } | 211 | } |
212 | 212 | ||
213 | static int ocfs2_simple_size_update(struct inode *inode, | 213 | static int ocfs2_simple_size_update(struct inode *inode, |
214 | struct buffer_head *di_bh, | 214 | struct buffer_head *di_bh, |
215 | u64 new_i_size) | 215 | u64 new_i_size) |
216 | { | 216 | { |
217 | int ret; | 217 | int ret; |
218 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 218 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
219 | handle_t *handle = NULL; | 219 | handle_t *handle = NULL; |
220 | 220 | ||
221 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 221 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
222 | if (handle == NULL) { | 222 | if (handle == NULL) { |
223 | ret = -ENOMEM; | 223 | ret = -ENOMEM; |
224 | mlog_errno(ret); | 224 | mlog_errno(ret); |
225 | goto out; | 225 | goto out; |
226 | } | 226 | } |
227 | 227 | ||
228 | ret = ocfs2_set_inode_size(handle, inode, di_bh, | 228 | ret = ocfs2_set_inode_size(handle, inode, di_bh, |
229 | new_i_size); | 229 | new_i_size); |
230 | if (ret < 0) | 230 | if (ret < 0) |
231 | mlog_errno(ret); | 231 | mlog_errno(ret); |
232 | 232 | ||
233 | ocfs2_commit_trans(osb, handle); | 233 | ocfs2_commit_trans(osb, handle); |
234 | out: | 234 | out: |
235 | return ret; | 235 | return ret; |
236 | } | 236 | } |
237 | 237 | ||
238 | static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, | 238 | static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, |
239 | struct inode *inode, | 239 | struct inode *inode, |
240 | struct buffer_head *fe_bh, | 240 | struct buffer_head *fe_bh, |
241 | u64 new_i_size) | 241 | u64 new_i_size) |
242 | { | 242 | { |
243 | int status; | 243 | int status; |
244 | handle_t *handle; | 244 | handle_t *handle; |
245 | 245 | ||
246 | mlog_entry_void(); | 246 | mlog_entry_void(); |
247 | 247 | ||
248 | /* TODO: This needs to actually orphan the inode in this | 248 | /* TODO: This needs to actually orphan the inode in this |
249 | * transaction. */ | 249 | * transaction. */ |
250 | 250 | ||
251 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 251 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
252 | if (IS_ERR(handle)) { | 252 | if (IS_ERR(handle)) { |
253 | status = PTR_ERR(handle); | 253 | status = PTR_ERR(handle); |
254 | mlog_errno(status); | 254 | mlog_errno(status); |
255 | goto out; | 255 | goto out; |
256 | } | 256 | } |
257 | 257 | ||
258 | status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); | 258 | status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); |
259 | if (status < 0) | 259 | if (status < 0) |
260 | mlog_errno(status); | 260 | mlog_errno(status); |
261 | 261 | ||
262 | ocfs2_commit_trans(osb, handle); | 262 | ocfs2_commit_trans(osb, handle); |
263 | out: | 263 | out: |
264 | mlog_exit(status); | 264 | mlog_exit(status); |
265 | return status; | 265 | return status; |
266 | } | 266 | } |
267 | 267 | ||
268 | static int ocfs2_truncate_file(struct inode *inode, | 268 | static int ocfs2_truncate_file(struct inode *inode, |
269 | struct buffer_head *di_bh, | 269 | struct buffer_head *di_bh, |
270 | u64 new_i_size) | 270 | u64 new_i_size) |
271 | { | 271 | { |
272 | int status = 0; | 272 | int status = 0; |
273 | struct ocfs2_dinode *fe = NULL; | 273 | struct ocfs2_dinode *fe = NULL; |
274 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 274 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
275 | struct ocfs2_truncate_context *tc = NULL; | 275 | struct ocfs2_truncate_context *tc = NULL; |
276 | 276 | ||
277 | mlog_entry("(inode = %llu, new_i_size = %llu\n", | 277 | mlog_entry("(inode = %llu, new_i_size = %llu\n", |
278 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 278 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
279 | (unsigned long long)new_i_size); | 279 | (unsigned long long)new_i_size); |
280 | 280 | ||
281 | truncate_inode_pages(inode->i_mapping, new_i_size); | 281 | truncate_inode_pages(inode->i_mapping, new_i_size); |
282 | 282 | ||
283 | fe = (struct ocfs2_dinode *) di_bh->b_data; | 283 | fe = (struct ocfs2_dinode *) di_bh->b_data; |
284 | if (!OCFS2_IS_VALID_DINODE(fe)) { | 284 | if (!OCFS2_IS_VALID_DINODE(fe)) { |
285 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); | 285 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); |
286 | status = -EIO; | 286 | status = -EIO; |
287 | goto bail; | 287 | goto bail; |
288 | } | 288 | } |
289 | 289 | ||
290 | mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), | 290 | mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), |
291 | "Inode %llu, inode i_size = %lld != di " | 291 | "Inode %llu, inode i_size = %lld != di " |
292 | "i_size = %llu, i_flags = 0x%x\n", | 292 | "i_size = %llu, i_flags = 0x%x\n", |
293 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 293 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
294 | i_size_read(inode), | 294 | i_size_read(inode), |
295 | (unsigned long long)le64_to_cpu(fe->i_size), | 295 | (unsigned long long)le64_to_cpu(fe->i_size), |
296 | le32_to_cpu(fe->i_flags)); | 296 | le32_to_cpu(fe->i_flags)); |
297 | 297 | ||
298 | if (new_i_size > le64_to_cpu(fe->i_size)) { | 298 | if (new_i_size > le64_to_cpu(fe->i_size)) { |
299 | mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", | 299 | mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", |
300 | (unsigned long long)le64_to_cpu(fe->i_size), | 300 | (unsigned long long)le64_to_cpu(fe->i_size), |
301 | (unsigned long long)new_i_size); | 301 | (unsigned long long)new_i_size); |
302 | status = -EINVAL; | 302 | status = -EINVAL; |
303 | mlog_errno(status); | 303 | mlog_errno(status); |
304 | goto bail; | 304 | goto bail; |
305 | } | 305 | } |
306 | 306 | ||
307 | mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n", | 307 | mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n", |
308 | (unsigned long long)le64_to_cpu(fe->i_blkno), | 308 | (unsigned long long)le64_to_cpu(fe->i_blkno), |
309 | (unsigned long long)le64_to_cpu(fe->i_size), | 309 | (unsigned long long)le64_to_cpu(fe->i_size), |
310 | (unsigned long long)new_i_size); | 310 | (unsigned long long)new_i_size); |
311 | 311 | ||
312 | /* lets handle the simple truncate cases before doing any more | 312 | /* lets handle the simple truncate cases before doing any more |
313 | * cluster locking. */ | 313 | * cluster locking. */ |
314 | if (new_i_size == le64_to_cpu(fe->i_size)) | 314 | if (new_i_size == le64_to_cpu(fe->i_size)) |
315 | goto bail; | 315 | goto bail; |
316 | 316 | ||
317 | /* This forces other nodes to sync and drop their pages. Do | 317 | /* This forces other nodes to sync and drop their pages. Do |
318 | * this even if we have a truncate without allocation change - | 318 | * this even if we have a truncate without allocation change - |
319 | * ocfs2 cluster sizes can be much greater than page size, so | 319 | * ocfs2 cluster sizes can be much greater than page size, so |
320 | * we have to truncate them anyway. */ | 320 | * we have to truncate them anyway. */ |
321 | status = ocfs2_data_lock(inode, 1); | 321 | status = ocfs2_data_lock(inode, 1); |
322 | if (status < 0) { | 322 | if (status < 0) { |
323 | mlog_errno(status); | 323 | mlog_errno(status); |
324 | goto bail; | 324 | goto bail; |
325 | } | 325 | } |
326 | ocfs2_data_unlock(inode, 1); | 326 | ocfs2_data_unlock(inode, 1); |
327 | 327 | ||
328 | if (le32_to_cpu(fe->i_clusters) == | 328 | if (le32_to_cpu(fe->i_clusters) == |
329 | ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { | 329 | ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { |
330 | mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", | 330 | mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", |
331 | fe->i_clusters); | 331 | fe->i_clusters); |
332 | /* No allocation change is required, so lets fast path | 332 | /* No allocation change is required, so lets fast path |
333 | * this truncate. */ | 333 | * this truncate. */ |
334 | status = ocfs2_simple_size_update(inode, di_bh, new_i_size); | 334 | status = ocfs2_simple_size_update(inode, di_bh, new_i_size); |
335 | if (status < 0) | 335 | if (status < 0) |
336 | mlog_errno(status); | 336 | mlog_errno(status); |
337 | goto bail; | 337 | goto bail; |
338 | } | 338 | } |
339 | 339 | ||
340 | /* alright, we're going to need to do a full blown alloc size | 340 | /* alright, we're going to need to do a full blown alloc size |
341 | * change. Orphan the inode so that recovery can complete the | 341 | * change. Orphan the inode so that recovery can complete the |
342 | * truncate if necessary. This does the task of marking | 342 | * truncate if necessary. This does the task of marking |
343 | * i_size. */ | 343 | * i_size. */ |
344 | status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); | 344 | status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); |
345 | if (status < 0) { | 345 | if (status < 0) { |
346 | mlog_errno(status); | 346 | mlog_errno(status); |
347 | goto bail; | 347 | goto bail; |
348 | } | 348 | } |
349 | 349 | ||
350 | status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); | 350 | status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); |
351 | if (status < 0) { | 351 | if (status < 0) { |
352 | mlog_errno(status); | 352 | mlog_errno(status); |
353 | goto bail; | 353 | goto bail; |
354 | } | 354 | } |
355 | 355 | ||
356 | status = ocfs2_commit_truncate(osb, inode, di_bh, tc); | 356 | status = ocfs2_commit_truncate(osb, inode, di_bh, tc); |
357 | if (status < 0) { | 357 | if (status < 0) { |
358 | mlog_errno(status); | 358 | mlog_errno(status); |
359 | goto bail; | 359 | goto bail; |
360 | } | 360 | } |
361 | 361 | ||
362 | /* TODO: orphan dir cleanup here. */ | 362 | /* TODO: orphan dir cleanup here. */ |
363 | bail: | 363 | bail: |
364 | 364 | ||
365 | mlog_exit(status); | 365 | mlog_exit(status); |
366 | return status; | 366 | return status; |
367 | } | 367 | } |
368 | 368 | ||
369 | /* | 369 | /* |
370 | * extend allocation only here. | 370 | * extend allocation only here. |
371 | * we'll update all the disk stuff, and oip->alloc_size | 371 | * we'll update all the disk stuff, and oip->alloc_size |
372 | * | 372 | * |
373 | * expect stuff to be locked, a transaction started and enough data / | 373 | * expect stuff to be locked, a transaction started and enough data / |
374 | * metadata reservations in the contexts. | 374 | * metadata reservations in the contexts. |
375 | * | 375 | * |
376 | * Will return -EAGAIN, and a reason if a restart is needed. | 376 | * Will return -EAGAIN, and a reason if a restart is needed. |
377 | * If passed in, *reason will always be set, even in error. | 377 | * If passed in, *reason will always be set, even in error. |
378 | */ | 378 | */ |
379 | int ocfs2_do_extend_allocation(struct ocfs2_super *osb, | 379 | int ocfs2_do_extend_allocation(struct ocfs2_super *osb, |
380 | struct inode *inode, | 380 | struct inode *inode, |
381 | u32 clusters_to_add, | 381 | u32 clusters_to_add, |
382 | struct buffer_head *fe_bh, | 382 | struct buffer_head *fe_bh, |
383 | handle_t *handle, | 383 | handle_t *handle, |
384 | struct ocfs2_alloc_context *data_ac, | 384 | struct ocfs2_alloc_context *data_ac, |
385 | struct ocfs2_alloc_context *meta_ac, | 385 | struct ocfs2_alloc_context *meta_ac, |
386 | enum ocfs2_alloc_restarted *reason_ret) | 386 | enum ocfs2_alloc_restarted *reason_ret) |
387 | { | 387 | { |
388 | int status = 0; | 388 | int status = 0; |
389 | int free_extents; | 389 | int free_extents; |
390 | struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; | 390 | struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; |
391 | enum ocfs2_alloc_restarted reason = RESTART_NONE; | 391 | enum ocfs2_alloc_restarted reason = RESTART_NONE; |
392 | u32 bit_off, num_bits; | 392 | u32 bit_off, num_bits; |
393 | u64 block; | 393 | u64 block; |
394 | 394 | ||
395 | BUG_ON(!clusters_to_add); | 395 | BUG_ON(!clusters_to_add); |
396 | 396 | ||
397 | free_extents = ocfs2_num_free_extents(osb, inode, fe); | 397 | free_extents = ocfs2_num_free_extents(osb, inode, fe); |
398 | if (free_extents < 0) { | 398 | if (free_extents < 0) { |
399 | status = free_extents; | 399 | status = free_extents; |
400 | mlog_errno(status); | 400 | mlog_errno(status); |
401 | goto leave; | 401 | goto leave; |
402 | } | 402 | } |
403 | 403 | ||
404 | /* there are two cases which could cause us to EAGAIN in the | 404 | /* there are two cases which could cause us to EAGAIN in the |
405 | * we-need-more-metadata case: | 405 | * we-need-more-metadata case: |
406 | * 1) we haven't reserved *any* | 406 | * 1) we haven't reserved *any* |
407 | * 2) we are so fragmented, we've needed to add metadata too | 407 | * 2) we are so fragmented, we've needed to add metadata too |
408 | * many times. */ | 408 | * many times. */ |
409 | if (!free_extents && !meta_ac) { | 409 | if (!free_extents && !meta_ac) { |
410 | mlog(0, "we haven't reserved any metadata!\n"); | 410 | mlog(0, "we haven't reserved any metadata!\n"); |
411 | status = -EAGAIN; | 411 | status = -EAGAIN; |
412 | reason = RESTART_META; | 412 | reason = RESTART_META; |
413 | goto leave; | 413 | goto leave; |
414 | } else if ((!free_extents) | 414 | } else if ((!free_extents) |
415 | && (ocfs2_alloc_context_bits_left(meta_ac) | 415 | && (ocfs2_alloc_context_bits_left(meta_ac) |
416 | < ocfs2_extend_meta_needed(fe))) { | 416 | < ocfs2_extend_meta_needed(fe))) { |
417 | mlog(0, "filesystem is really fragmented...\n"); | 417 | mlog(0, "filesystem is really fragmented...\n"); |
418 | status = -EAGAIN; | 418 | status = -EAGAIN; |
419 | reason = RESTART_META; | 419 | reason = RESTART_META; |
420 | goto leave; | 420 | goto leave; |
421 | } | 421 | } |
422 | 422 | ||
423 | status = ocfs2_claim_clusters(osb, handle, data_ac, 1, | 423 | status = ocfs2_claim_clusters(osb, handle, data_ac, 1, |
424 | &bit_off, &num_bits); | 424 | &bit_off, &num_bits); |
425 | if (status < 0) { | 425 | if (status < 0) { |
426 | if (status != -ENOSPC) | 426 | if (status != -ENOSPC) |
427 | mlog_errno(status); | 427 | mlog_errno(status); |
428 | goto leave; | 428 | goto leave; |
429 | } | 429 | } |
430 | 430 | ||
431 | BUG_ON(num_bits > clusters_to_add); | 431 | BUG_ON(num_bits > clusters_to_add); |
432 | 432 | ||
433 | /* reserve our write early -- insert_extent may update the inode */ | 433 | /* reserve our write early -- insert_extent may update the inode */ |
434 | status = ocfs2_journal_access(handle, inode, fe_bh, | 434 | status = ocfs2_journal_access(handle, inode, fe_bh, |
435 | OCFS2_JOURNAL_ACCESS_WRITE); | 435 | OCFS2_JOURNAL_ACCESS_WRITE); |
436 | if (status < 0) { | 436 | if (status < 0) { |
437 | mlog_errno(status); | 437 | mlog_errno(status); |
438 | goto leave; | 438 | goto leave; |
439 | } | 439 | } |
440 | 440 | ||
441 | block = ocfs2_clusters_to_blocks(osb->sb, bit_off); | 441 | block = ocfs2_clusters_to_blocks(osb->sb, bit_off); |
442 | mlog(0, "Allocating %u clusters at block %u for inode %llu\n", | 442 | mlog(0, "Allocating %u clusters at block %u for inode %llu\n", |
443 | num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); | 443 | num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); |
444 | status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, | 444 | status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, |
445 | num_bits, meta_ac); | 445 | num_bits, meta_ac); |
446 | if (status < 0) { | 446 | if (status < 0) { |
447 | mlog_errno(status); | 447 | mlog_errno(status); |
448 | goto leave; | 448 | goto leave; |
449 | } | 449 | } |
450 | 450 | ||
451 | le32_add_cpu(&fe->i_clusters, num_bits); | 451 | le32_add_cpu(&fe->i_clusters, num_bits); |
452 | spin_lock(&OCFS2_I(inode)->ip_lock); | 452 | spin_lock(&OCFS2_I(inode)->ip_lock); |
453 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | 453 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); |
454 | spin_unlock(&OCFS2_I(inode)->ip_lock); | 454 | spin_unlock(&OCFS2_I(inode)->ip_lock); |
455 | 455 | ||
456 | status = ocfs2_journal_dirty(handle, fe_bh); | 456 | status = ocfs2_journal_dirty(handle, fe_bh); |
457 | if (status < 0) { | 457 | if (status < 0) { |
458 | mlog_errno(status); | 458 | mlog_errno(status); |
459 | goto leave; | 459 | goto leave; |
460 | } | 460 | } |
461 | 461 | ||
462 | clusters_to_add -= num_bits; | 462 | clusters_to_add -= num_bits; |
463 | 463 | ||
464 | if (clusters_to_add) { | 464 | if (clusters_to_add) { |
465 | mlog(0, "need to alloc once more, clusters = %u, wanted = " | 465 | mlog(0, "need to alloc once more, clusters = %u, wanted = " |
466 | "%u\n", fe->i_clusters, clusters_to_add); | 466 | "%u\n", fe->i_clusters, clusters_to_add); |
467 | status = -EAGAIN; | 467 | status = -EAGAIN; |
468 | reason = RESTART_TRANS; | 468 | reason = RESTART_TRANS; |
469 | } | 469 | } |
470 | 470 | ||
471 | leave: | 471 | leave: |
472 | mlog_exit(status); | 472 | mlog_exit(status); |
473 | if (reason_ret) | 473 | if (reason_ret) |
474 | *reason_ret = reason; | 474 | *reason_ret = reason; |
475 | return status; | 475 | return status; |
476 | } | 476 | } |
477 | 477 | ||
478 | static int ocfs2_extend_allocation(struct inode *inode, | 478 | static int ocfs2_extend_allocation(struct inode *inode, |
479 | u32 clusters_to_add) | 479 | u32 clusters_to_add) |
480 | { | 480 | { |
481 | int status = 0; | 481 | int status = 0; |
482 | int restart_func = 0; | 482 | int restart_func = 0; |
483 | int drop_alloc_sem = 0; | 483 | int drop_alloc_sem = 0; |
484 | int credits, num_free_extents; | 484 | int credits, num_free_extents; |
485 | u32 prev_clusters; | 485 | u32 prev_clusters; |
486 | struct buffer_head *bh = NULL; | 486 | struct buffer_head *bh = NULL; |
487 | struct ocfs2_dinode *fe = NULL; | 487 | struct ocfs2_dinode *fe = NULL; |
488 | handle_t *handle = NULL; | 488 | handle_t *handle = NULL; |
489 | struct ocfs2_alloc_context *data_ac = NULL; | 489 | struct ocfs2_alloc_context *data_ac = NULL; |
490 | struct ocfs2_alloc_context *meta_ac = NULL; | 490 | struct ocfs2_alloc_context *meta_ac = NULL; |
491 | enum ocfs2_alloc_restarted why; | 491 | enum ocfs2_alloc_restarted why; |
492 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 492 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
493 | 493 | ||
494 | mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); | 494 | mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); |
495 | 495 | ||
496 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, | 496 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, |
497 | OCFS2_BH_CACHED, inode); | 497 | OCFS2_BH_CACHED, inode); |
498 | if (status < 0) { | 498 | if (status < 0) { |
499 | mlog_errno(status); | 499 | mlog_errno(status); |
500 | goto leave; | 500 | goto leave; |
501 | } | 501 | } |
502 | 502 | ||
503 | fe = (struct ocfs2_dinode *) bh->b_data; | 503 | fe = (struct ocfs2_dinode *) bh->b_data; |
504 | if (!OCFS2_IS_VALID_DINODE(fe)) { | 504 | if (!OCFS2_IS_VALID_DINODE(fe)) { |
505 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); | 505 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); |
506 | status = -EIO; | 506 | status = -EIO; |
507 | goto leave; | 507 | goto leave; |
508 | } | 508 | } |
509 | 509 | ||
510 | restart_all: | 510 | restart_all: |
511 | BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); | 511 | BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); |
512 | 512 | ||
513 | mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, " | 513 | mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, " |
514 | "clusters_to_add = %u\n", | 514 | "clusters_to_add = %u\n", |
515 | (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), | 515 | (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), |
516 | fe->i_clusters, clusters_to_add); | 516 | fe->i_clusters, clusters_to_add); |
517 | 517 | ||
518 | num_free_extents = ocfs2_num_free_extents(osb, | 518 | num_free_extents = ocfs2_num_free_extents(osb, |
519 | inode, | 519 | inode, |
520 | fe); | 520 | fe); |
521 | if (num_free_extents < 0) { | 521 | if (num_free_extents < 0) { |
522 | status = num_free_extents; | 522 | status = num_free_extents; |
523 | mlog_errno(status); | 523 | mlog_errno(status); |
524 | goto leave; | 524 | goto leave; |
525 | } | 525 | } |
526 | 526 | ||
527 | if (!num_free_extents) { | 527 | if (!num_free_extents) { |
528 | status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac); | 528 | status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac); |
529 | if (status < 0) { | 529 | if (status < 0) { |
530 | if (status != -ENOSPC) | 530 | if (status != -ENOSPC) |
531 | mlog_errno(status); | 531 | mlog_errno(status); |
532 | goto leave; | 532 | goto leave; |
533 | } | 533 | } |
534 | } | 534 | } |
535 | 535 | ||
536 | status = ocfs2_reserve_clusters(osb, clusters_to_add, &data_ac); | 536 | status = ocfs2_reserve_clusters(osb, clusters_to_add, &data_ac); |
537 | if (status < 0) { | 537 | if (status < 0) { |
538 | if (status != -ENOSPC) | 538 | if (status != -ENOSPC) |
539 | mlog_errno(status); | 539 | mlog_errno(status); |
540 | goto leave; | 540 | goto leave; |
541 | } | 541 | } |
542 | 542 | ||
543 | /* blocks peope in read/write from reading our allocation | 543 | /* blocks peope in read/write from reading our allocation |
544 | * until we're done changing it. We depend on i_mutex to block | 544 | * until we're done changing it. We depend on i_mutex to block |
545 | * other extend/truncate calls while we're here. Ordering wrt | 545 | * other extend/truncate calls while we're here. Ordering wrt |
546 | * start_trans is important here -- always do it before! */ | 546 | * start_trans is important here -- always do it before! */ |
547 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | 547 | down_write(&OCFS2_I(inode)->ip_alloc_sem); |
548 | drop_alloc_sem = 1; | 548 | drop_alloc_sem = 1; |
549 | 549 | ||
550 | credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); | 550 | credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); |
551 | handle = ocfs2_start_trans(osb, credits); | 551 | handle = ocfs2_start_trans(osb, credits); |
552 | if (IS_ERR(handle)) { | 552 | if (IS_ERR(handle)) { |
553 | status = PTR_ERR(handle); | 553 | status = PTR_ERR(handle); |
554 | handle = NULL; | 554 | handle = NULL; |
555 | mlog_errno(status); | 555 | mlog_errno(status); |
556 | goto leave; | 556 | goto leave; |
557 | } | 557 | } |
558 | 558 | ||
559 | restarted_transaction: | 559 | restarted_transaction: |
560 | /* reserve a write to the file entry early on - that we if we | 560 | /* reserve a write to the file entry early on - that we if we |
561 | * run out of credits in the allocation path, we can still | 561 | * run out of credits in the allocation path, we can still |
562 | * update i_size. */ | 562 | * update i_size. */ |
563 | status = ocfs2_journal_access(handle, inode, bh, | 563 | status = ocfs2_journal_access(handle, inode, bh, |
564 | OCFS2_JOURNAL_ACCESS_WRITE); | 564 | OCFS2_JOURNAL_ACCESS_WRITE); |
565 | if (status < 0) { | 565 | if (status < 0) { |
566 | mlog_errno(status); | 566 | mlog_errno(status); |
567 | goto leave; | 567 | goto leave; |
568 | } | 568 | } |
569 | 569 | ||
570 | prev_clusters = OCFS2_I(inode)->ip_clusters; | 570 | prev_clusters = OCFS2_I(inode)->ip_clusters; |
571 | 571 | ||
572 | status = ocfs2_do_extend_allocation(osb, | 572 | status = ocfs2_do_extend_allocation(osb, |
573 | inode, | 573 | inode, |
574 | clusters_to_add, | 574 | clusters_to_add, |
575 | bh, | 575 | bh, |
576 | handle, | 576 | handle, |
577 | data_ac, | 577 | data_ac, |
578 | meta_ac, | 578 | meta_ac, |
579 | &why); | 579 | &why); |
580 | if ((status < 0) && (status != -EAGAIN)) { | 580 | if ((status < 0) && (status != -EAGAIN)) { |
581 | if (status != -ENOSPC) | 581 | if (status != -ENOSPC) |
582 | mlog_errno(status); | 582 | mlog_errno(status); |
583 | goto leave; | 583 | goto leave; |
584 | } | 584 | } |
585 | 585 | ||
586 | status = ocfs2_journal_dirty(handle, bh); | 586 | status = ocfs2_journal_dirty(handle, bh); |
587 | if (status < 0) { | 587 | if (status < 0) { |
588 | mlog_errno(status); | 588 | mlog_errno(status); |
589 | goto leave; | 589 | goto leave; |
590 | } | 590 | } |
591 | 591 | ||
592 | spin_lock(&OCFS2_I(inode)->ip_lock); | 592 | spin_lock(&OCFS2_I(inode)->ip_lock); |
593 | clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); | 593 | clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); |
594 | spin_unlock(&OCFS2_I(inode)->ip_lock); | 594 | spin_unlock(&OCFS2_I(inode)->ip_lock); |
595 | 595 | ||
596 | if (why != RESTART_NONE && clusters_to_add) { | 596 | if (why != RESTART_NONE && clusters_to_add) { |
597 | if (why == RESTART_META) { | 597 | if (why == RESTART_META) { |
598 | mlog(0, "restarting function.\n"); | 598 | mlog(0, "restarting function.\n"); |
599 | restart_func = 1; | 599 | restart_func = 1; |
600 | } else { | 600 | } else { |
601 | BUG_ON(why != RESTART_TRANS); | 601 | BUG_ON(why != RESTART_TRANS); |
602 | 602 | ||
603 | mlog(0, "restarting transaction.\n"); | 603 | mlog(0, "restarting transaction.\n"); |
604 | /* TODO: This can be more intelligent. */ | 604 | /* TODO: This can be more intelligent. */ |
605 | credits = ocfs2_calc_extend_credits(osb->sb, | 605 | credits = ocfs2_calc_extend_credits(osb->sb, |
606 | fe, | 606 | fe, |
607 | clusters_to_add); | 607 | clusters_to_add); |
608 | status = ocfs2_extend_trans(handle, credits); | 608 | status = ocfs2_extend_trans(handle, credits); |
609 | if (status < 0) { | 609 | if (status < 0) { |
610 | /* handle still has to be committed at | 610 | /* handle still has to be committed at |
611 | * this point. */ | 611 | * this point. */ |
612 | status = -ENOMEM; | 612 | status = -ENOMEM; |
613 | mlog_errno(status); | 613 | mlog_errno(status); |
614 | goto leave; | 614 | goto leave; |
615 | } | 615 | } |
616 | goto restarted_transaction; | 616 | goto restarted_transaction; |
617 | } | 617 | } |
618 | } | 618 | } |
619 | 619 | ||
620 | mlog(0, "fe: i_clusters = %u, i_size=%llu\n", | 620 | mlog(0, "fe: i_clusters = %u, i_size=%llu\n", |
621 | fe->i_clusters, (unsigned long long)fe->i_size); | 621 | fe->i_clusters, (unsigned long long)fe->i_size); |
622 | mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", | 622 | mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", |
623 | OCFS2_I(inode)->ip_clusters, i_size_read(inode)); | 623 | OCFS2_I(inode)->ip_clusters, i_size_read(inode)); |
624 | 624 | ||
625 | leave: | 625 | leave: |
626 | if (drop_alloc_sem) { | 626 | if (drop_alloc_sem) { |
627 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | 627 | up_write(&OCFS2_I(inode)->ip_alloc_sem); |
628 | drop_alloc_sem = 0; | 628 | drop_alloc_sem = 0; |
629 | } | 629 | } |
630 | if (handle) { | 630 | if (handle) { |
631 | ocfs2_commit_trans(osb, handle); | 631 | ocfs2_commit_trans(osb, handle); |
632 | handle = NULL; | 632 | handle = NULL; |
633 | } | 633 | } |
634 | if (data_ac) { | 634 | if (data_ac) { |
635 | ocfs2_free_alloc_context(data_ac); | 635 | ocfs2_free_alloc_context(data_ac); |
636 | data_ac = NULL; | 636 | data_ac = NULL; |
637 | } | 637 | } |
638 | if (meta_ac) { | 638 | if (meta_ac) { |
639 | ocfs2_free_alloc_context(meta_ac); | 639 | ocfs2_free_alloc_context(meta_ac); |
640 | meta_ac = NULL; | 640 | meta_ac = NULL; |
641 | } | 641 | } |
642 | if ((!status) && restart_func) { | 642 | if ((!status) && restart_func) { |
643 | restart_func = 0; | 643 | restart_func = 0; |
644 | goto restart_all; | 644 | goto restart_all; |
645 | } | 645 | } |
646 | if (bh) { | 646 | if (bh) { |
647 | brelse(bh); | 647 | brelse(bh); |
648 | bh = NULL; | 648 | bh = NULL; |
649 | } | 649 | } |
650 | 650 | ||
651 | mlog_exit(status); | 651 | mlog_exit(status); |
652 | return status; | 652 | return status; |
653 | } | 653 | } |
654 | 654 | ||
655 | /* Some parts of this taken from generic_cont_expand, which turned out | 655 | /* Some parts of this taken from generic_cont_expand, which turned out |
656 | * to be too fragile to do exactly what we need without us having to | 656 | * to be too fragile to do exactly what we need without us having to |
657 | * worry about recursive locking in ->prepare_write() and | 657 | * worry about recursive locking in ->prepare_write() and |
658 | * ->commit_write(). */ | 658 | * ->commit_write(). */ |
659 | static int ocfs2_write_zero_page(struct inode *inode, | 659 | static int ocfs2_write_zero_page(struct inode *inode, |
660 | u64 size) | 660 | u64 size) |
661 | { | 661 | { |
662 | struct address_space *mapping = inode->i_mapping; | 662 | struct address_space *mapping = inode->i_mapping; |
663 | struct page *page; | 663 | struct page *page; |
664 | unsigned long index; | 664 | unsigned long index; |
665 | unsigned int offset; | 665 | unsigned int offset; |
666 | handle_t *handle = NULL; | 666 | handle_t *handle = NULL; |
667 | int ret; | 667 | int ret; |
668 | 668 | ||
669 | offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ | 669 | offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ |
670 | /* ugh. in prepare/commit_write, if from==to==start of block, we | 670 | /* ugh. in prepare/commit_write, if from==to==start of block, we |
671 | ** skip the prepare. make sure we never send an offset for the start | 671 | ** skip the prepare. make sure we never send an offset for the start |
672 | ** of a block | 672 | ** of a block |
673 | */ | 673 | */ |
674 | if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { | 674 | if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { |
675 | offset++; | 675 | offset++; |
676 | } | 676 | } |
677 | index = size >> PAGE_CACHE_SHIFT; | 677 | index = size >> PAGE_CACHE_SHIFT; |
678 | 678 | ||
679 | page = grab_cache_page(mapping, index); | 679 | page = grab_cache_page(mapping, index); |
680 | if (!page) { | 680 | if (!page) { |
681 | ret = -ENOMEM; | 681 | ret = -ENOMEM; |
682 | mlog_errno(ret); | 682 | mlog_errno(ret); |
683 | goto out; | 683 | goto out; |
684 | } | 684 | } |
685 | 685 | ||
686 | ret = ocfs2_prepare_write_nolock(inode, page, offset, offset); | 686 | ret = ocfs2_prepare_write_nolock(inode, page, offset, offset); |
687 | if (ret < 0) { | 687 | if (ret < 0) { |
688 | mlog_errno(ret); | 688 | mlog_errno(ret); |
689 | goto out_unlock; | 689 | goto out_unlock; |
690 | } | 690 | } |
691 | 691 | ||
692 | if (ocfs2_should_order_data(inode)) { | 692 | if (ocfs2_should_order_data(inode)) { |
693 | handle = ocfs2_start_walk_page_trans(inode, page, offset, | 693 | handle = ocfs2_start_walk_page_trans(inode, page, offset, |
694 | offset); | 694 | offset); |
695 | if (IS_ERR(handle)) { | 695 | if (IS_ERR(handle)) { |
696 | ret = PTR_ERR(handle); | 696 | ret = PTR_ERR(handle); |
697 | handle = NULL; | 697 | handle = NULL; |
698 | goto out_unlock; | 698 | goto out_unlock; |
699 | } | 699 | } |
700 | } | 700 | } |
701 | 701 | ||
702 | /* must not update i_size! */ | 702 | /* must not update i_size! */ |
703 | ret = block_commit_write(page, offset, offset); | 703 | ret = block_commit_write(page, offset, offset); |
704 | if (ret < 0) | 704 | if (ret < 0) |
705 | mlog_errno(ret); | 705 | mlog_errno(ret); |
706 | else | 706 | else |
707 | ret = 0; | 707 | ret = 0; |
708 | 708 | ||
709 | if (handle) | 709 | if (handle) |
710 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); | 710 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); |
711 | out_unlock: | 711 | out_unlock: |
712 | unlock_page(page); | 712 | unlock_page(page); |
713 | page_cache_release(page); | 713 | page_cache_release(page); |
714 | out: | 714 | out: |
715 | return ret; | 715 | return ret; |
716 | } | 716 | } |
717 | 717 | ||
718 | static int ocfs2_zero_extend(struct inode *inode, | 718 | static int ocfs2_zero_extend(struct inode *inode, |
719 | u64 zero_to_size) | 719 | u64 zero_to_size) |
720 | { | 720 | { |
721 | int ret = 0; | 721 | int ret = 0; |
722 | u64 start_off; | 722 | u64 start_off; |
723 | struct super_block *sb = inode->i_sb; | 723 | struct super_block *sb = inode->i_sb; |
724 | 724 | ||
725 | start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); | 725 | start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); |
726 | while (start_off < zero_to_size) { | 726 | while (start_off < zero_to_size) { |
727 | ret = ocfs2_write_zero_page(inode, start_off); | 727 | ret = ocfs2_write_zero_page(inode, start_off); |
728 | if (ret < 0) { | 728 | if (ret < 0) { |
729 | mlog_errno(ret); | 729 | mlog_errno(ret); |
730 | goto out; | 730 | goto out; |
731 | } | 731 | } |
732 | 732 | ||
733 | start_off += sb->s_blocksize; | 733 | start_off += sb->s_blocksize; |
734 | 734 | ||
735 | /* | 735 | /* |
736 | * Very large extends have the potential to lock up | 736 | * Very large extends have the potential to lock up |
737 | * the cpu for extended periods of time. | 737 | * the cpu for extended periods of time. |
738 | */ | 738 | */ |
739 | cond_resched(); | 739 | cond_resched(); |
740 | } | 740 | } |
741 | 741 | ||
742 | out: | 742 | out: |
743 | return ret; | 743 | return ret; |
744 | } | 744 | } |
745 | 745 | ||
746 | /* | 746 | /* |
747 | * A tail_to_skip value > 0 indicates that we're being called from | 747 | * A tail_to_skip value > 0 indicates that we're being called from |
748 | * ocfs2_file_aio_write(). This has the following implications: | 748 | * ocfs2_file_aio_write(). This has the following implications: |
749 | * | 749 | * |
750 | * - we don't want to update i_size | 750 | * - we don't want to update i_size |
751 | * - di_bh will be NULL, which is fine because it's only used in the | 751 | * - di_bh will be NULL, which is fine because it's only used in the |
752 | * case where we want to update i_size. | 752 | * case where we want to update i_size. |
753 | * - ocfs2_zero_extend() will then only be filling the hole created | 753 | * - ocfs2_zero_extend() will then only be filling the hole created |
754 | * between i_size and the start of the write. | 754 | * between i_size and the start of the write. |
755 | */ | 755 | */ |
756 | static int ocfs2_extend_file(struct inode *inode, | 756 | static int ocfs2_extend_file(struct inode *inode, |
757 | struct buffer_head *di_bh, | 757 | struct buffer_head *di_bh, |
758 | u64 new_i_size, | 758 | u64 new_i_size, |
759 | size_t tail_to_skip) | 759 | size_t tail_to_skip) |
760 | { | 760 | { |
761 | int ret = 0; | 761 | int ret = 0; |
762 | u32 clusters_to_add; | 762 | u32 clusters_to_add; |
763 | 763 | ||
764 | BUG_ON(!tail_to_skip && !di_bh); | 764 | BUG_ON(!tail_to_skip && !di_bh); |
765 | 765 | ||
766 | /* setattr sometimes calls us like this. */ | 766 | /* setattr sometimes calls us like this. */ |
767 | if (new_i_size == 0) | 767 | if (new_i_size == 0) |
768 | goto out; | 768 | goto out; |
769 | 769 | ||
770 | if (i_size_read(inode) == new_i_size) | 770 | if (i_size_read(inode) == new_i_size) |
771 | goto out; | 771 | goto out; |
772 | BUG_ON(new_i_size < i_size_read(inode)); | 772 | BUG_ON(new_i_size < i_size_read(inode)); |
773 | 773 | ||
774 | clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - | 774 | clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - |
775 | OCFS2_I(inode)->ip_clusters; | 775 | OCFS2_I(inode)->ip_clusters; |
776 | 776 | ||
777 | /* | 777 | /* |
778 | * protect the pages that ocfs2_zero_extend is going to be | 778 | * protect the pages that ocfs2_zero_extend is going to be |
779 | * pulling into the page cache.. we do this before the | 779 | * pulling into the page cache.. we do this before the |
780 | * metadata extend so that we don't get into the situation | 780 | * metadata extend so that we don't get into the situation |
781 | * where we've extended the metadata but can't get the data | 781 | * where we've extended the metadata but can't get the data |
782 | * lock to zero. | 782 | * lock to zero. |
783 | */ | 783 | */ |
784 | ret = ocfs2_data_lock(inode, 1); | 784 | ret = ocfs2_data_lock(inode, 1); |
785 | if (ret < 0) { | 785 | if (ret < 0) { |
786 | mlog_errno(ret); | 786 | mlog_errno(ret); |
787 | goto out; | 787 | goto out; |
788 | } | 788 | } |
789 | 789 | ||
790 | if (clusters_to_add) { | 790 | if (clusters_to_add) { |
791 | ret = ocfs2_extend_allocation(inode, clusters_to_add); | 791 | ret = ocfs2_extend_allocation(inode, clusters_to_add); |
792 | if (ret < 0) { | 792 | if (ret < 0) { |
793 | mlog_errno(ret); | 793 | mlog_errno(ret); |
794 | goto out_unlock; | 794 | goto out_unlock; |
795 | } | 795 | } |
796 | } | 796 | } |
797 | 797 | ||
798 | /* | 798 | /* |
799 | * Call this even if we don't add any clusters to the tree. We | 799 | * Call this even if we don't add any clusters to the tree. We |
800 | * still need to zero the area between the old i_size and the | 800 | * still need to zero the area between the old i_size and the |
801 | * new i_size. | 801 | * new i_size. |
802 | */ | 802 | */ |
803 | ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip); | 803 | ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip); |
804 | if (ret < 0) { | 804 | if (ret < 0) { |
805 | mlog_errno(ret); | 805 | mlog_errno(ret); |
806 | goto out_unlock; | 806 | goto out_unlock; |
807 | } | 807 | } |
808 | 808 | ||
809 | if (!tail_to_skip) { | 809 | if (!tail_to_skip) { |
810 | /* We're being called from ocfs2_setattr() which wants | 810 | /* We're being called from ocfs2_setattr() which wants |
811 | * us to update i_size */ | 811 | * us to update i_size */ |
812 | ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); | 812 | ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); |
813 | if (ret < 0) | 813 | if (ret < 0) |
814 | mlog_errno(ret); | 814 | mlog_errno(ret); |
815 | } | 815 | } |
816 | 816 | ||
817 | out_unlock: | 817 | out_unlock: |
818 | ocfs2_data_unlock(inode, 1); | 818 | ocfs2_data_unlock(inode, 1); |
819 | 819 | ||
820 | out: | 820 | out: |
821 | return ret; | 821 | return ret; |
822 | } | 822 | } |
823 | 823 | ||
824 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) | 824 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) |
825 | { | 825 | { |
826 | int status = 0, size_change; | 826 | int status = 0, size_change; |
827 | struct inode *inode = dentry->d_inode; | 827 | struct inode *inode = dentry->d_inode; |
828 | struct super_block *sb = inode->i_sb; | 828 | struct super_block *sb = inode->i_sb; |
829 | struct ocfs2_super *osb = OCFS2_SB(sb); | 829 | struct ocfs2_super *osb = OCFS2_SB(sb); |
830 | struct buffer_head *bh = NULL; | 830 | struct buffer_head *bh = NULL; |
831 | handle_t *handle = NULL; | 831 | handle_t *handle = NULL; |
832 | 832 | ||
833 | mlog_entry("(0x%p, '%.*s')\n", dentry, | 833 | mlog_entry("(0x%p, '%.*s')\n", dentry, |
834 | dentry->d_name.len, dentry->d_name.name); | 834 | dentry->d_name.len, dentry->d_name.name); |
835 | 835 | ||
836 | if (attr->ia_valid & ATTR_MODE) | 836 | if (attr->ia_valid & ATTR_MODE) |
837 | mlog(0, "mode change: %d\n", attr->ia_mode); | 837 | mlog(0, "mode change: %d\n", attr->ia_mode); |
838 | if (attr->ia_valid & ATTR_UID) | 838 | if (attr->ia_valid & ATTR_UID) |
839 | mlog(0, "uid change: %d\n", attr->ia_uid); | 839 | mlog(0, "uid change: %d\n", attr->ia_uid); |
840 | if (attr->ia_valid & ATTR_GID) | 840 | if (attr->ia_valid & ATTR_GID) |
841 | mlog(0, "gid change: %d\n", attr->ia_gid); | 841 | mlog(0, "gid change: %d\n", attr->ia_gid); |
842 | if (attr->ia_valid & ATTR_SIZE) | 842 | if (attr->ia_valid & ATTR_SIZE) |
843 | mlog(0, "size change...\n"); | 843 | mlog(0, "size change...\n"); |
844 | if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) | 844 | if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) |
845 | mlog(0, "time change...\n"); | 845 | mlog(0, "time change...\n"); |
846 | 846 | ||
847 | #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ | 847 | #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ |
848 | | ATTR_GID | ATTR_UID | ATTR_MODE) | 848 | | ATTR_GID | ATTR_UID | ATTR_MODE) |
849 | if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { | 849 | if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { |
850 | mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); | 850 | mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); |
851 | return 0; | 851 | return 0; |
852 | } | 852 | } |
853 | 853 | ||
854 | status = inode_change_ok(inode, attr); | 854 | status = inode_change_ok(inode, attr); |
855 | if (status) | 855 | if (status) |
856 | return status; | 856 | return status; |
857 | 857 | ||
858 | size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; | 858 | size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; |
859 | if (size_change) { | 859 | if (size_change) { |
860 | status = ocfs2_rw_lock(inode, 1); | 860 | status = ocfs2_rw_lock(inode, 1); |
861 | if (status < 0) { | 861 | if (status < 0) { |
862 | mlog_errno(status); | 862 | mlog_errno(status); |
863 | goto bail; | 863 | goto bail; |
864 | } | 864 | } |
865 | } | 865 | } |
866 | 866 | ||
867 | status = ocfs2_meta_lock(inode, &bh, 1); | 867 | status = ocfs2_meta_lock(inode, &bh, 1); |
868 | if (status < 0) { | 868 | if (status < 0) { |
869 | if (status != -ENOENT) | 869 | if (status != -ENOENT) |
870 | mlog_errno(status); | 870 | mlog_errno(status); |
871 | goto bail_unlock_rw; | 871 | goto bail_unlock_rw; |
872 | } | 872 | } |
873 | 873 | ||
874 | if (size_change && attr->ia_size != i_size_read(inode)) { | 874 | if (size_change && attr->ia_size != i_size_read(inode)) { |
875 | if (i_size_read(inode) > attr->ia_size) | 875 | if (i_size_read(inode) > attr->ia_size) |
876 | status = ocfs2_truncate_file(inode, bh, attr->ia_size); | 876 | status = ocfs2_truncate_file(inode, bh, attr->ia_size); |
877 | else | 877 | else |
878 | status = ocfs2_extend_file(inode, bh, attr->ia_size, 0); | 878 | status = ocfs2_extend_file(inode, bh, attr->ia_size, 0); |
879 | if (status < 0) { | 879 | if (status < 0) { |
880 | if (status != -ENOSPC) | 880 | if (status != -ENOSPC) |
881 | mlog_errno(status); | 881 | mlog_errno(status); |
882 | status = -ENOSPC; | 882 | status = -ENOSPC; |
883 | goto bail_unlock; | 883 | goto bail_unlock; |
884 | } | 884 | } |
885 | } | 885 | } |
886 | 886 | ||
887 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 887 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
888 | if (IS_ERR(handle)) { | 888 | if (IS_ERR(handle)) { |
889 | status = PTR_ERR(handle); | 889 | status = PTR_ERR(handle); |
890 | mlog_errno(status); | 890 | mlog_errno(status); |
891 | goto bail_unlock; | 891 | goto bail_unlock; |
892 | } | 892 | } |
893 | 893 | ||
894 | status = inode_setattr(inode, attr); | 894 | status = inode_setattr(inode, attr); |
895 | if (status < 0) { | 895 | if (status < 0) { |
896 | mlog_errno(status); | 896 | mlog_errno(status); |
897 | goto bail_commit; | 897 | goto bail_commit; |
898 | } | 898 | } |
899 | 899 | ||
900 | status = ocfs2_mark_inode_dirty(handle, inode, bh); | 900 | status = ocfs2_mark_inode_dirty(handle, inode, bh); |
901 | if (status < 0) | 901 | if (status < 0) |
902 | mlog_errno(status); | 902 | mlog_errno(status); |
903 | 903 | ||
904 | bail_commit: | 904 | bail_commit: |
905 | ocfs2_commit_trans(osb, handle); | 905 | ocfs2_commit_trans(osb, handle); |
906 | bail_unlock: | 906 | bail_unlock: |
907 | ocfs2_meta_unlock(inode, 1); | 907 | ocfs2_meta_unlock(inode, 1); |
908 | bail_unlock_rw: | 908 | bail_unlock_rw: |
909 | if (size_change) | 909 | if (size_change) |
910 | ocfs2_rw_unlock(inode, 1); | 910 | ocfs2_rw_unlock(inode, 1); |
911 | bail: | 911 | bail: |
912 | if (bh) | 912 | if (bh) |
913 | brelse(bh); | 913 | brelse(bh); |
914 | 914 | ||
915 | mlog_exit(status); | 915 | mlog_exit(status); |
916 | return status; | 916 | return status; |
917 | } | 917 | } |
918 | 918 | ||
919 | int ocfs2_getattr(struct vfsmount *mnt, | 919 | int ocfs2_getattr(struct vfsmount *mnt, |
920 | struct dentry *dentry, | 920 | struct dentry *dentry, |
921 | struct kstat *stat) | 921 | struct kstat *stat) |
922 | { | 922 | { |
923 | struct inode *inode = dentry->d_inode; | 923 | struct inode *inode = dentry->d_inode; |
924 | struct super_block *sb = dentry->d_inode->i_sb; | 924 | struct super_block *sb = dentry->d_inode->i_sb; |
925 | struct ocfs2_super *osb = sb->s_fs_info; | 925 | struct ocfs2_super *osb = sb->s_fs_info; |
926 | int err; | 926 | int err; |
927 | 927 | ||
928 | mlog_entry_void(); | 928 | mlog_entry_void(); |
929 | 929 | ||
930 | err = ocfs2_inode_revalidate(dentry); | 930 | err = ocfs2_inode_revalidate(dentry); |
931 | if (err) { | 931 | if (err) { |
932 | if (err != -ENOENT) | 932 | if (err != -ENOENT) |
933 | mlog_errno(err); | 933 | mlog_errno(err); |
934 | goto bail; | 934 | goto bail; |
935 | } | 935 | } |
936 | 936 | ||
937 | generic_fillattr(inode, stat); | 937 | generic_fillattr(inode, stat); |
938 | 938 | ||
939 | /* We set the blksize from the cluster size for performance */ | 939 | /* We set the blksize from the cluster size for performance */ |
940 | stat->blksize = osb->s_clustersize; | 940 | stat->blksize = osb->s_clustersize; |
941 | 941 | ||
942 | bail: | 942 | bail: |
943 | mlog_exit(err); | 943 | mlog_exit(err); |
944 | 944 | ||
945 | return err; | 945 | return err; |
946 | } | 946 | } |
947 | 947 | ||
948 | int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) | 948 | int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) |
949 | { | 949 | { |
950 | int ret; | 950 | int ret; |
951 | 951 | ||
952 | mlog_entry_void(); | 952 | mlog_entry_void(); |
953 | 953 | ||
954 | ret = ocfs2_meta_lock(inode, NULL, 0); | 954 | ret = ocfs2_meta_lock(inode, NULL, 0); |
955 | if (ret) { | 955 | if (ret) { |
956 | mlog_errno(ret); | 956 | mlog_errno(ret); |
957 | goto out; | 957 | goto out; |
958 | } | 958 | } |
959 | 959 | ||
960 | ret = generic_permission(inode, mask, NULL); | 960 | ret = generic_permission(inode, mask, NULL); |
961 | if (ret) | 961 | if (ret) |
962 | mlog_errno(ret); | 962 | mlog_errno(ret); |
963 | 963 | ||
964 | ocfs2_meta_unlock(inode, 0); | 964 | ocfs2_meta_unlock(inode, 0); |
965 | out: | 965 | out: |
966 | mlog_exit(ret); | 966 | mlog_exit(ret); |
967 | return ret; | 967 | return ret; |
968 | } | 968 | } |
969 | 969 | ||
970 | static int ocfs2_write_remove_suid(struct inode *inode) | 970 | static int ocfs2_write_remove_suid(struct inode *inode) |
971 | { | 971 | { |
972 | int ret; | 972 | int ret; |
973 | struct buffer_head *bh = NULL; | 973 | struct buffer_head *bh = NULL; |
974 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | 974 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
975 | handle_t *handle; | 975 | handle_t *handle; |
976 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 976 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
977 | struct ocfs2_dinode *di; | 977 | struct ocfs2_dinode *di; |
978 | 978 | ||
979 | mlog_entry("(Inode %llu, mode 0%o)\n", | 979 | mlog_entry("(Inode %llu, mode 0%o)\n", |
980 | (unsigned long long)oi->ip_blkno, inode->i_mode); | 980 | (unsigned long long)oi->ip_blkno, inode->i_mode); |
981 | 981 | ||
982 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 982 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
983 | if (handle == NULL) { | 983 | if (handle == NULL) { |
984 | ret = -ENOMEM; | 984 | ret = -ENOMEM; |
985 | mlog_errno(ret); | 985 | mlog_errno(ret); |
986 | goto out; | 986 | goto out; |
987 | } | 987 | } |
988 | 988 | ||
989 | ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); | 989 | ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); |
990 | if (ret < 0) { | 990 | if (ret < 0) { |
991 | mlog_errno(ret); | 991 | mlog_errno(ret); |
992 | goto out_trans; | 992 | goto out_trans; |
993 | } | 993 | } |
994 | 994 | ||
995 | ret = ocfs2_journal_access(handle, inode, bh, | 995 | ret = ocfs2_journal_access(handle, inode, bh, |
996 | OCFS2_JOURNAL_ACCESS_WRITE); | 996 | OCFS2_JOURNAL_ACCESS_WRITE); |
997 | if (ret < 0) { | 997 | if (ret < 0) { |
998 | mlog_errno(ret); | 998 | mlog_errno(ret); |
999 | goto out_bh; | 999 | goto out_bh; |
1000 | } | 1000 | } |
1001 | 1001 | ||
1002 | inode->i_mode &= ~S_ISUID; | 1002 | inode->i_mode &= ~S_ISUID; |
1003 | if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) | 1003 | if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) |
1004 | inode->i_mode &= ~S_ISGID; | 1004 | inode->i_mode &= ~S_ISGID; |
1005 | 1005 | ||
1006 | di = (struct ocfs2_dinode *) bh->b_data; | 1006 | di = (struct ocfs2_dinode *) bh->b_data; |
1007 | di->i_mode = cpu_to_le16(inode->i_mode); | 1007 | di->i_mode = cpu_to_le16(inode->i_mode); |
1008 | 1008 | ||
1009 | ret = ocfs2_journal_dirty(handle, bh); | 1009 | ret = ocfs2_journal_dirty(handle, bh); |
1010 | if (ret < 0) | 1010 | if (ret < 0) |
1011 | mlog_errno(ret); | 1011 | mlog_errno(ret); |
1012 | out_bh: | 1012 | out_bh: |
1013 | brelse(bh); | 1013 | brelse(bh); |
1014 | out_trans: | 1014 | out_trans: |
1015 | ocfs2_commit_trans(osb, handle); | 1015 | ocfs2_commit_trans(osb, handle); |
1016 | out: | 1016 | out: |
1017 | mlog_exit(ret); | 1017 | mlog_exit(ret); |
1018 | return ret; | 1018 | return ret; |
1019 | } | 1019 | } |
1020 | 1020 | ||
1021 | static int ocfs2_prepare_inode_for_write(struct dentry *dentry, | 1021 | static int ocfs2_prepare_inode_for_write(struct dentry *dentry, |
1022 | loff_t *ppos, | 1022 | loff_t *ppos, |
1023 | size_t count, | 1023 | size_t count, |
1024 | int appending) | 1024 | int appending) |
1025 | { | 1025 | { |
1026 | int ret = 0, meta_level = appending; | 1026 | int ret = 0, meta_level = appending; |
1027 | struct inode *inode = dentry->d_inode; | 1027 | struct inode *inode = dentry->d_inode; |
1028 | u32 clusters; | 1028 | u32 clusters; |
1029 | loff_t newsize, saved_pos; | 1029 | loff_t newsize, saved_pos; |
1030 | 1030 | ||
1031 | /* | 1031 | /* |
1032 | * We sample i_size under a read level meta lock to see if our write | 1032 | * We sample i_size under a read level meta lock to see if our write |
1033 | * is extending the file, if it is we back off and get a write level | 1033 | * is extending the file, if it is we back off and get a write level |
1034 | * meta lock. | 1034 | * meta lock. |
1035 | */ | 1035 | */ |
1036 | for(;;) { | 1036 | for(;;) { |
1037 | ret = ocfs2_meta_lock(inode, NULL, meta_level); | 1037 | ret = ocfs2_meta_lock(inode, NULL, meta_level); |
1038 | if (ret < 0) { | 1038 | if (ret < 0) { |
1039 | meta_level = -1; | 1039 | meta_level = -1; |
1040 | mlog_errno(ret); | 1040 | mlog_errno(ret); |
1041 | goto out; | 1041 | goto out; |
1042 | } | 1042 | } |
1043 | 1043 | ||
1044 | /* Clear suid / sgid if necessary. We do this here | 1044 | /* Clear suid / sgid if necessary. We do this here |
1045 | * instead of later in the write path because | 1045 | * instead of later in the write path because |
1046 | * remove_suid() calls ->setattr without any hint that | 1046 | * remove_suid() calls ->setattr without any hint that |
1047 | * we may have already done our cluster locking. Since | 1047 | * we may have already done our cluster locking. Since |
1048 | * ocfs2_setattr() *must* take cluster locks to | 1048 | * ocfs2_setattr() *must* take cluster locks to |
1049 | * proceeed, this will lead us to recursively lock the | 1049 | * proceeed, this will lead us to recursively lock the |
1050 | * inode. There's also the dinode i_size state which | 1050 | * inode. There's also the dinode i_size state which |
1051 | * can be lost via setattr during extending writes (we | 1051 | * can be lost via setattr during extending writes (we |
1052 | * set inode->i_size at the end of a write. */ | 1052 | * set inode->i_size at the end of a write. */ |
1053 | if (should_remove_suid(dentry)) { | 1053 | if (should_remove_suid(dentry)) { |
1054 | if (meta_level == 0) { | 1054 | if (meta_level == 0) { |
1055 | ocfs2_meta_unlock(inode, meta_level); | 1055 | ocfs2_meta_unlock(inode, meta_level); |
1056 | meta_level = 1; | 1056 | meta_level = 1; |
1057 | continue; | 1057 | continue; |
1058 | } | 1058 | } |
1059 | 1059 | ||
1060 | ret = ocfs2_write_remove_suid(inode); | 1060 | ret = ocfs2_write_remove_suid(inode); |
1061 | if (ret < 0) { | 1061 | if (ret < 0) { |
1062 | mlog_errno(ret); | 1062 | mlog_errno(ret); |
1063 | goto out_unlock; | 1063 | goto out_unlock; |
1064 | } | 1064 | } |
1065 | } | 1065 | } |
1066 | 1066 | ||
1067 | /* work on a copy of ppos until we're sure that we won't have | 1067 | /* work on a copy of ppos until we're sure that we won't have |
1068 | * to recalculate it due to relocking. */ | 1068 | * to recalculate it due to relocking. */ |
1069 | if (appending) { | 1069 | if (appending) { |
1070 | saved_pos = i_size_read(inode); | 1070 | saved_pos = i_size_read(inode); |
1071 | mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); | 1071 | mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); |
1072 | } else { | 1072 | } else { |
1073 | saved_pos = *ppos; | 1073 | saved_pos = *ppos; |
1074 | } | 1074 | } |
1075 | newsize = count + saved_pos; | 1075 | newsize = count + saved_pos; |
1076 | 1076 | ||
1077 | mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", | 1077 | mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", |
1078 | (long long) saved_pos, (long long) newsize, | 1078 | (long long) saved_pos, (long long) newsize, |
1079 | (long long) i_size_read(inode)); | 1079 | (long long) i_size_read(inode)); |
1080 | 1080 | ||
1081 | /* No need for a higher level metadata lock if we're | 1081 | /* No need for a higher level metadata lock if we're |
1082 | * never going past i_size. */ | 1082 | * never going past i_size. */ |
1083 | if (newsize <= i_size_read(inode)) | 1083 | if (newsize <= i_size_read(inode)) |
1084 | break; | 1084 | break; |
1085 | 1085 | ||
1086 | if (meta_level == 0) { | 1086 | if (meta_level == 0) { |
1087 | ocfs2_meta_unlock(inode, meta_level); | 1087 | ocfs2_meta_unlock(inode, meta_level); |
1088 | meta_level = 1; | 1088 | meta_level = 1; |
1089 | continue; | 1089 | continue; |
1090 | } | 1090 | } |
1091 | 1091 | ||
1092 | spin_lock(&OCFS2_I(inode)->ip_lock); | 1092 | spin_lock(&OCFS2_I(inode)->ip_lock); |
1093 | clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - | 1093 | clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - |
1094 | OCFS2_I(inode)->ip_clusters; | 1094 | OCFS2_I(inode)->ip_clusters; |
1095 | spin_unlock(&OCFS2_I(inode)->ip_lock); | 1095 | spin_unlock(&OCFS2_I(inode)->ip_lock); |
1096 | 1096 | ||
1097 | mlog(0, "Writing at EOF, may need more allocation: " | 1097 | mlog(0, "Writing at EOF, may need more allocation: " |
1098 | "i_size = %lld, newsize = %lld, need %u clusters\n", | 1098 | "i_size = %lld, newsize = %lld, need %u clusters\n", |
1099 | (long long) i_size_read(inode), (long long) newsize, | 1099 | (long long) i_size_read(inode), (long long) newsize, |
1100 | clusters); | 1100 | clusters); |
1101 | 1101 | ||
1102 | /* We only want to continue the rest of this loop if | 1102 | /* We only want to continue the rest of this loop if |
1103 | * our extend will actually require more | 1103 | * our extend will actually require more |
1104 | * allocation. */ | 1104 | * allocation. */ |
1105 | if (!clusters) | 1105 | if (!clusters) |
1106 | break; | 1106 | break; |
1107 | 1107 | ||
1108 | ret = ocfs2_extend_file(inode, NULL, newsize, count); | 1108 | ret = ocfs2_extend_file(inode, NULL, newsize, count); |
1109 | if (ret < 0) { | 1109 | if (ret < 0) { |
1110 | if (ret != -ENOSPC) | 1110 | if (ret != -ENOSPC) |
1111 | mlog_errno(ret); | 1111 | mlog_errno(ret); |
1112 | goto out_unlock; | 1112 | goto out_unlock; |
1113 | } | 1113 | } |
1114 | break; | 1114 | break; |
1115 | } | 1115 | } |
1116 | 1116 | ||
1117 | if (appending) | 1117 | if (appending) |
1118 | *ppos = saved_pos; | 1118 | *ppos = saved_pos; |
1119 | 1119 | ||
1120 | out_unlock: | 1120 | out_unlock: |
1121 | ocfs2_meta_unlock(inode, meta_level); | 1121 | ocfs2_meta_unlock(inode, meta_level); |
1122 | 1122 | ||
1123 | out: | 1123 | out: |
1124 | return ret; | 1124 | return ret; |
1125 | } | 1125 | } |
1126 | 1126 | ||
1127 | static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, | 1127 | static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, |
1128 | const struct iovec *iov, | 1128 | const struct iovec *iov, |
1129 | unsigned long nr_segs, | 1129 | unsigned long nr_segs, |
1130 | loff_t pos) | 1130 | loff_t pos) |
1131 | { | 1131 | { |
1132 | int ret, rw_level, have_alloc_sem = 0; | 1132 | int ret, rw_level, have_alloc_sem = 0; |
1133 | struct file *filp = iocb->ki_filp; | 1133 | struct file *filp = iocb->ki_filp; |
1134 | struct inode *inode = filp->f_dentry->d_inode; | 1134 | struct inode *inode = filp->f_path.dentry->d_inode; |
1135 | int appending = filp->f_flags & O_APPEND ? 1 : 0; | 1135 | int appending = filp->f_flags & O_APPEND ? 1 : 0; |
1136 | 1136 | ||
1137 | mlog_entry("(0x%p, %u, '%.*s')\n", filp, | 1137 | mlog_entry("(0x%p, %u, '%.*s')\n", filp, |
1138 | (unsigned int)nr_segs, | 1138 | (unsigned int)nr_segs, |
1139 | filp->f_dentry->d_name.len, | 1139 | filp->f_path.dentry->d_name.len, |
1140 | filp->f_dentry->d_name.name); | 1140 | filp->f_path.dentry->d_name.name); |
1141 | 1141 | ||
1142 | /* happy write of zero bytes */ | 1142 | /* happy write of zero bytes */ |
1143 | if (iocb->ki_left == 0) | 1143 | if (iocb->ki_left == 0) |
1144 | return 0; | 1144 | return 0; |
1145 | 1145 | ||
1146 | mutex_lock(&inode->i_mutex); | 1146 | mutex_lock(&inode->i_mutex); |
1147 | /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ | 1147 | /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ |
1148 | if (filp->f_flags & O_DIRECT) { | 1148 | if (filp->f_flags & O_DIRECT) { |
1149 | have_alloc_sem = 1; | 1149 | have_alloc_sem = 1; |
1150 | down_read(&inode->i_alloc_sem); | 1150 | down_read(&inode->i_alloc_sem); |
1151 | } | 1151 | } |
1152 | 1152 | ||
1153 | /* concurrent O_DIRECT writes are allowed */ | 1153 | /* concurrent O_DIRECT writes are allowed */ |
1154 | rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; | 1154 | rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; |
1155 | ret = ocfs2_rw_lock(inode, rw_level); | 1155 | ret = ocfs2_rw_lock(inode, rw_level); |
1156 | if (ret < 0) { | 1156 | if (ret < 0) { |
1157 | rw_level = -1; | 1157 | rw_level = -1; |
1158 | mlog_errno(ret); | 1158 | mlog_errno(ret); |
1159 | goto out; | 1159 | goto out; |
1160 | } | 1160 | } |
1161 | 1161 | ||
1162 | ret = ocfs2_prepare_inode_for_write(filp->f_dentry, &iocb->ki_pos, | 1162 | ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos, |
1163 | iocb->ki_left, appending); | 1163 | iocb->ki_left, appending); |
1164 | if (ret < 0) { | 1164 | if (ret < 0) { |
1165 | mlog_errno(ret); | 1165 | mlog_errno(ret); |
1166 | goto out; | 1166 | goto out; |
1167 | } | 1167 | } |
1168 | 1168 | ||
1169 | /* communicate with ocfs2_dio_end_io */ | 1169 | /* communicate with ocfs2_dio_end_io */ |
1170 | ocfs2_iocb_set_rw_locked(iocb); | 1170 | ocfs2_iocb_set_rw_locked(iocb); |
1171 | 1171 | ||
1172 | ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos); | 1172 | ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos); |
1173 | 1173 | ||
1174 | /* buffered aio wouldn't have proper lock coverage today */ | 1174 | /* buffered aio wouldn't have proper lock coverage today */ |
1175 | BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); | 1175 | BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); |
1176 | 1176 | ||
1177 | /* | 1177 | /* |
1178 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io | 1178 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io |
1179 | * function pointer which is called when o_direct io completes so that | 1179 | * function pointer which is called when o_direct io completes so that |
1180 | * it can unlock our rw lock. (it's the clustered equivalent of | 1180 | * it can unlock our rw lock. (it's the clustered equivalent of |
1181 | * i_alloc_sem; protects truncate from racing with pending ios). | 1181 | * i_alloc_sem; protects truncate from racing with pending ios). |
1182 | * Unfortunately there are error cases which call end_io and others | 1182 | * Unfortunately there are error cases which call end_io and others |
1183 | * that don't. so we don't have to unlock the rw_lock if either an | 1183 | * that don't. so we don't have to unlock the rw_lock if either an |
1184 | * async dio is going to do it in the future or an end_io after an | 1184 | * async dio is going to do it in the future or an end_io after an |
1185 | * error has already done it. | 1185 | * error has already done it. |
1186 | */ | 1186 | */ |
1187 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { | 1187 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { |
1188 | rw_level = -1; | 1188 | rw_level = -1; |
1189 | have_alloc_sem = 0; | 1189 | have_alloc_sem = 0; |
1190 | } | 1190 | } |
1191 | 1191 | ||
1192 | out: | 1192 | out: |
1193 | if (have_alloc_sem) | 1193 | if (have_alloc_sem) |
1194 | up_read(&inode->i_alloc_sem); | 1194 | up_read(&inode->i_alloc_sem); |
1195 | if (rw_level != -1) | 1195 | if (rw_level != -1) |
1196 | ocfs2_rw_unlock(inode, rw_level); | 1196 | ocfs2_rw_unlock(inode, rw_level); |
1197 | mutex_unlock(&inode->i_mutex); | 1197 | mutex_unlock(&inode->i_mutex); |
1198 | 1198 | ||
1199 | mlog_exit(ret); | 1199 | mlog_exit(ret); |
1200 | return ret; | 1200 | return ret; |
1201 | } | 1201 | } |
1202 | 1202 | ||
/*
 * Splice data from a pipe into an ocfs2 file.
 *
 * Lock ordering here is: inode_double_lock() (file inode + pipe inode),
 * then the ocfs2 cluster rw lock at level 1.  The rw lock is dropped
 * before inode_double_unlock() on every exit path via the goto unwind.
 *
 * Returns bytes spliced from generic_file_splice_write_nolock(), or a
 * negative errno from lock acquisition / write preparation.
 */
static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
				       struct file *out,
				       loff_t *ppos,
				       size_t len,
				       unsigned int flags)
{
	int ret;
	struct inode *inode = out->f_path.dentry->d_inode;

	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
		   (unsigned int)len,
		   out->f_path.dentry->d_name.len,
		   out->f_path.dentry->d_name.name);

	/* Serialize against the pipe as well as the target inode. */
	inode_double_lock(inode, pipe->inode);

	/* Level 1 is the write side of the cluster rw lock —
	 * NOTE(review): inferred from the splice_read path using 0; confirm. */
	ret = ocfs2_rw_lock(inode, 1);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	/* Validate/extend i_size and allocation before the generic write. */
	ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_unlock;
	}

	/* ok, we're done with i_size and alloc work */
	ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags);

out_unlock:
	ocfs2_rw_unlock(inode, 1);
out:
	inode_double_unlock(inode, pipe->inode);

	mlog_exit(ret);
	return ret;
}
1242 | 1242 | ||
/*
 * Splice data out of an ocfs2 file into a pipe.
 *
 * No rw lock is held across the actual read; the cluster meta lock is
 * taken at level 0 and immediately dropped purely to refresh inode
 * fields (e.g. i_size) from the cluster before the generic splice path
 * inspects them — see the matching comment in ocfs2_file_aio_read().
 *
 * Returns bytes spliced from generic_file_splice_read(), or a negative
 * errno if the meta lock could not be taken.
 */
static ssize_t ocfs2_file_splice_read(struct file *in,
				      loff_t *ppos,
				      struct pipe_inode_info *pipe,
				      size_t len,
				      unsigned int flags)
{
	int ret = 0;
	struct inode *inode = in->f_path.dentry->d_inode;

	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
		   (unsigned int)len,
		   in->f_path.dentry->d_name.len,
		   in->f_path.dentry->d_name.name);

	/*
	 * See the comment in ocfs2_file_aio_read()
	 */
	ret = ocfs2_meta_lock(inode, NULL, 0);
	if (ret < 0) {
		mlog_errno(ret);
		goto bail;
	}
	/* Lock dropped right away — it only served to update inode fields. */
	ocfs2_meta_unlock(inode, 0);

	ret = generic_file_splice_read(in, ppos, pipe, len, flags);

bail:
	mlog_exit(ret);
	return ret;
}
1273 | 1273 | ||
/*
 * AIO read entry point for ocfs2 files.
 *
 * Buffered reads protect themselves in ->readpage(), so only O_DIRECT
 * reads take i_alloc_sem and the cluster rw lock (level 0) here, to
 * keep pending direct I/O from racing with truncate.  rw_level == -1
 * and have_alloc_sem == 0 mean "nothing held" for the bail-out path.
 *
 * Returns bytes read from generic_file_aio_read(), -EIOCBQUEUED for a
 * queued async direct I/O (in which case ocfs2_dio_end_io is expected
 * to drop the locks later), or a negative errno.
 */
static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
				   const struct iovec *iov,
				   unsigned long nr_segs,
				   loff_t pos)
{
	int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;

	mlog_entry("(0x%p, %u, '%.*s')\n", filp,
		   (unsigned int)nr_segs,
		   filp->f_path.dentry->d_name.len,
		   filp->f_path.dentry->d_name.name);

	if (!inode) {
		ret = -EINVAL;
		mlog_errno(ret);
		goto bail;
	}

	/*
	 * buffered reads protect themselves in ->readpage().  O_DIRECT reads
	 * need locks to protect pending reads from racing with truncate.
	 */
	if (filp->f_flags & O_DIRECT) {
		down_read(&inode->i_alloc_sem);
		have_alloc_sem = 1;

		ret = ocfs2_rw_lock(inode, 0);
		if (ret < 0) {
			mlog_errno(ret);
			goto bail;
		}
		rw_level = 0;
		/* communicate with ocfs2_dio_end_io */
		ocfs2_iocb_set_rw_locked(iocb);
	}

	/*
	 * We're fine letting folks race truncates and extending
	 * writes with read across the cluster, just like they can
	 * locally. Hence no rw_lock during read.
	 *
	 * Take and drop the meta data lock to update inode fields
	 * like i_size. This allows the checks down below
	 * generic_file_aio_read() a chance of actually working.
	 */
	ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level);
	if (ret < 0) {
		mlog_errno(ret);
		goto bail;
	}
	ocfs2_meta_unlock(inode, lock_level);

	ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
	if (ret == -EINVAL)
		mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");

	/* buffered aio wouldn't have proper lock coverage today */
	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));

	/* see ocfs2_file_aio_write */
	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
		/* Queued async dio (or an end_io that already fired) will
		 * handle the unlock — don't drop anything here. */
		rw_level = -1;
		have_alloc_sem = 0;
	}

bail:
	if (have_alloc_sem)
		up_read(&inode->i_alloc_sem);
	if (rw_level != -1)
		ocfs2_rw_unlock(inode, rw_level);
	mlog_exit(ret);

	return ret;
}
1350 | 1350 | ||
1351 | struct inode_operations ocfs2_file_iops = { | 1351 | struct inode_operations ocfs2_file_iops = { |
1352 | .setattr = ocfs2_setattr, | 1352 | .setattr = ocfs2_setattr, |
1353 | .getattr = ocfs2_getattr, | 1353 | .getattr = ocfs2_getattr, |
1354 | .permission = ocfs2_permission, | 1354 | .permission = ocfs2_permission, |
1355 | }; | 1355 | }; |
1356 | 1356 | ||
1357 | struct inode_operations ocfs2_special_file_iops = { | 1357 | struct inode_operations ocfs2_special_file_iops = { |
1358 | .setattr = ocfs2_setattr, | 1358 | .setattr = ocfs2_setattr, |
1359 | .getattr = ocfs2_getattr, | 1359 | .getattr = ocfs2_getattr, |
1360 | .permission = ocfs2_permission, | 1360 | .permission = ocfs2_permission, |
1361 | }; | 1361 | }; |
1362 | 1362 | ||
1363 | const struct file_operations ocfs2_fops = { | 1363 | const struct file_operations ocfs2_fops = { |
1364 | .read = do_sync_read, | 1364 | .read = do_sync_read, |
1365 | .write = do_sync_write, | 1365 | .write = do_sync_write, |
1366 | .sendfile = generic_file_sendfile, | 1366 | .sendfile = generic_file_sendfile, |
1367 | .mmap = ocfs2_mmap, | 1367 | .mmap = ocfs2_mmap, |
1368 | .fsync = ocfs2_sync_file, | 1368 | .fsync = ocfs2_sync_file, |
1369 | .release = ocfs2_file_release, | 1369 | .release = ocfs2_file_release, |
1370 | .open = ocfs2_file_open, | 1370 | .open = ocfs2_file_open, |
1371 | .aio_read = ocfs2_file_aio_read, | 1371 | .aio_read = ocfs2_file_aio_read, |
1372 | .aio_write = ocfs2_file_aio_write, | 1372 | .aio_write = ocfs2_file_aio_write, |
1373 | .ioctl = ocfs2_ioctl, | 1373 | .ioctl = ocfs2_ioctl, |
1374 | .splice_read = ocfs2_file_splice_read, | 1374 | .splice_read = ocfs2_file_splice_read, |
1375 | .splice_write = ocfs2_file_splice_write, | 1375 | .splice_write = ocfs2_file_splice_write, |
1376 | }; | 1376 | }; |
1377 | 1377 | ||
1378 | const struct file_operations ocfs2_dops = { | 1378 | const struct file_operations ocfs2_dops = { |
1379 | .read = generic_read_dir, | 1379 | .read = generic_read_dir, |
1380 | .readdir = ocfs2_readdir, | 1380 | .readdir = ocfs2_readdir, |
1381 | .fsync = ocfs2_sync_file, | 1381 | .fsync = ocfs2_sync_file, |
1382 | .ioctl = ocfs2_ioctl, | 1382 | .ioctl = ocfs2_ioctl, |
1383 | }; | 1383 | }; |
1384 | 1384 |