Commit 562c72aa57c36b178eacc3500a0215651eca9429

Authored by Christoph Hellwig
Committed by Al Viro
Parent: 11b80f459a

fs: move inode_dio_wait calls into ->setattr

Let filesystems handle waiting for direct I/O requests themselves instead
of doing it beforehand.  This means that filesystem-specific locks which
prevent new dio references from appearing can be held while waiting.  This
is important to allow generalizing i_dio_count to non-DIO_LOCKING
filesystems.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
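
The per-filesystem side of the change is not visible in the excerpt below; as a
rough sketch of the pattern each converted ->setattr ends up following (the
example_setattr/example_setsize names are hypothetical, modeled on the
ext2-style code path), it looks like this:

static int example_setattr(struct dentry *dentry, struct iattr *iattr)
{
        struct inode *inode = dentry->d_inode;
        int error;

        /* Validate first, as inode_change_ok's kerneldoc requires. */
        error = inode_change_ok(inode, iattr);
        if (error)
                return error;

        if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) {
                /*
                 * Wait for in-flight direct I/O here, after any
                 * filesystem-specific locks that keep new dio references
                 * from appearing are held, instead of relying on
                 * notify_change() to have waited already.
                 */
                inode_dio_wait(inode);
                error = example_setsize(inode, iattr->ia_size); /* hypothetical */
                if (error)
                        return error;
        }
        setattr_copy(inode, iattr);
        mark_inode_dirty(inode);
        return 0;
}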

Showing 12 changed files with 24 additions and 3 deletions

fs/attr.c:

/*
 *  linux/fs/attr.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  changes by Thomas Schoebel-Theuer
 */

#include <linux/module.h>
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/capability.h>
#include <linux/fsnotify.h>
#include <linux/fcntl.h>
#include <linux/security.h>

/**
 * inode_change_ok - check if attribute changes to an inode are allowed
 * @inode:	inode to check
 * @attr:	attributes to change
 *
 * Check if we are allowed to change the attributes contained in @attr
 * in the given inode.  This includes the normal unix access permission
 * checks, as well as checks for rlimits and others.
 *
 * Should be called as the first thing in ->setattr implementations,
 * possibly after taking additional locks.
 */
int inode_change_ok(const struct inode *inode, struct iattr *attr)
{
	unsigned int ia_valid = attr->ia_valid;

	/*
	 * First check size constraints.  These can't be overridden using
	 * ATTR_FORCE.
	 */
	if (ia_valid & ATTR_SIZE) {
		int error = inode_newsize_ok(inode, attr->ia_size);
		if (error)
			return error;
	}

	/* If force is set do it anyway. */
	if (ia_valid & ATTR_FORCE)
		return 0;

	/* Make sure a caller can chown. */
	if ((ia_valid & ATTR_UID) &&
	    (current_fsuid() != inode->i_uid ||
	     attr->ia_uid != inode->i_uid) && !capable(CAP_CHOWN))
		return -EPERM;

	/* Make sure caller can chgrp. */
	if ((ia_valid & ATTR_GID) &&
	    (current_fsuid() != inode->i_uid ||
	    (!in_group_p(attr->ia_gid) && attr->ia_gid != inode->i_gid)) &&
	    !capable(CAP_CHOWN))
		return -EPERM;

	/* Make sure a caller can chmod. */
	if (ia_valid & ATTR_MODE) {
		if (!inode_owner_or_capable(inode))
			return -EPERM;
		/* Also check the setgid bit! */
		if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
				inode->i_gid) && !capable(CAP_FSETID))
			attr->ia_mode &= ~S_ISGID;
	}

	/* Check for setting the inode time. */
	if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) {
		if (!inode_owner_or_capable(inode))
			return -EPERM;
	}

	return 0;
}
EXPORT_SYMBOL(inode_change_ok);

/**
 * inode_newsize_ok - may this inode be truncated to a given size
 * @inode:	the inode to be truncated
 * @offset:	the new size to assign to the inode
 * @Returns:	0 on success, -ve errno on failure
 *
 * inode_newsize_ok must be called with i_mutex held.
 *
 * inode_newsize_ok will check filesystem limits and ulimits to check that the
 * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
 * when necessary. Caller must not proceed with inode size change if failure is
 * returned. @inode must be a file (not directory), with appropriate
 * permissions to allow truncate (inode_newsize_ok does NOT check these
 * conditions).
 */
int inode_newsize_ok(const struct inode *inode, loff_t offset)
{
	if (inode->i_size < offset) {
		unsigned long limit;

		limit = rlimit(RLIMIT_FSIZE);
		if (limit != RLIM_INFINITY && offset > limit)
			goto out_sig;
		if (offset > inode->i_sb->s_maxbytes)
			goto out_big;
	} else {
		/*
		 * truncation of in-use swapfiles is disallowed - it would
		 * cause subsequent swapout to scribble on the now-freed
		 * blocks.
		 */
		if (IS_SWAPFILE(inode))
			return -ETXTBSY;
	}

	return 0;
out_sig:
	send_sig(SIGXFSZ, current, 0);
out_big:
	return -EFBIG;
}
EXPORT_SYMBOL(inode_newsize_ok);

/**
 * setattr_copy - copy simple metadata updates into the generic inode
 * @inode:	the inode to be updated
 * @attr:	the new attributes
 *
 * setattr_copy must be called with i_mutex held.
 *
 * setattr_copy updates the inode's metadata with that specified
 * in attr. Noticeably missing is inode size update, which is more complex
 * as it requires pagecache updates.
 *
 * The inode is not marked as dirty after this operation. The rationale is
 * that for "simple" filesystems, the struct inode is the inode storage.
 * The caller is free to mark the inode dirty afterwards if needed.
 */
void setattr_copy(struct inode *inode, const struct iattr *attr)
{
	unsigned int ia_valid = attr->ia_valid;

	if (ia_valid & ATTR_UID)
		inode->i_uid = attr->ia_uid;
	if (ia_valid & ATTR_GID)
		inode->i_gid = attr->ia_gid;
	if (ia_valid & ATTR_ATIME)
		inode->i_atime = timespec_trunc(attr->ia_atime,
						inode->i_sb->s_time_gran);
	if (ia_valid & ATTR_MTIME)
		inode->i_mtime = timespec_trunc(attr->ia_mtime,
						inode->i_sb->s_time_gran);
	if (ia_valid & ATTR_CTIME)
		inode->i_ctime = timespec_trunc(attr->ia_ctime,
						inode->i_sb->s_time_gran);
	if (ia_valid & ATTR_MODE) {
		umode_t mode = attr->ia_mode;

		if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
			mode &= ~S_ISGID;
		inode->i_mode = mode;
	}
}
EXPORT_SYMBOL(setattr_copy);

int notify_change(struct dentry * dentry, struct iattr * attr)
{
	struct inode *inode = dentry->d_inode;
	mode_t mode = inode->i_mode;
	int error;
	struct timespec now;
	unsigned int ia_valid = attr->ia_valid;

	if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) {
		if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
			return -EPERM;
	}

	if ((ia_valid & ATTR_MODE)) {
		mode_t amode = attr->ia_mode;
		/* Flag setting protected by i_mutex */
		if (is_sxid(amode))
			inode->i_flags &= ~S_NOSEC;
	}

	now = current_fs_time(inode->i_sb);

	attr->ia_ctime = now;
	if (!(ia_valid & ATTR_ATIME_SET))
		attr->ia_atime = now;
	if (!(ia_valid & ATTR_MTIME_SET))
		attr->ia_mtime = now;
	if (ia_valid & ATTR_KILL_PRIV) {
		attr->ia_valid &= ~ATTR_KILL_PRIV;
		ia_valid &= ~ATTR_KILL_PRIV;
		error = security_inode_need_killpriv(dentry);
		if (error > 0)
			error = security_inode_killpriv(dentry);
		if (error)
			return error;
	}

	/*
	 * We now pass ATTR_KILL_S*ID to the lower level setattr function so
	 * that the function has the ability to reinterpret a mode change
	 * that's due to these bits. This adds an implicit restriction that
	 * no function will ever call notify_change with both ATTR_MODE and
	 * ATTR_KILL_S*ID set.
	 */
	if ((ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) &&
	    (ia_valid & ATTR_MODE))
		BUG();

	if (ia_valid & ATTR_KILL_SUID) {
		if (mode & S_ISUID) {
			ia_valid = attr->ia_valid |= ATTR_MODE;
			attr->ia_mode = (inode->i_mode & ~S_ISUID);
		}
	}
	if (ia_valid & ATTR_KILL_SGID) {
		if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
			if (!(ia_valid & ATTR_MODE)) {
				ia_valid = attr->ia_valid |= ATTR_MODE;
				attr->ia_mode = inode->i_mode;
			}
			attr->ia_mode &= ~S_ISGID;
		}
	}
	if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID)))
		return 0;

	error = security_inode_setattr(dentry, attr);
	if (error)
		return error;

-	if (ia_valid & ATTR_SIZE)
-		inode_dio_wait(inode);
-
	if (inode->i_op->setattr)
		error = inode->i_op->setattr(dentry, attr);
	else
		error = simple_setattr(dentry, attr);

	if (!error)
		fsnotify_change(dentry, ia_valid);

	return error;
}

EXPORT_SYMBOL(notify_change);

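When a filesystem provides no ->setattr, notify_change() falls back to
simple_setattr().  For reference, that fallback (in fs/libfs.c at the time) is
roughly the three helpers above glued together, a sketch rather than a verbatim
quote:

int simple_setattr(struct dentry *dentry, struct iattr *iattr)
{
        struct inode *inode = dentry->d_inode;
        int error;

        /* Permission and limit checks first. */
        error = inode_change_ok(inode, iattr);
        if (error)
                return error;

        /* Size changes need pagecache truncation, which setattr_copy skips. */
        if (iattr->ia_valid & ATTR_SIZE)
                truncate_setsize(inode, iattr->ia_size);
        setattr_copy(inode, iattr);
        mark_inode_dirty(inode);
        return 0;
}
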
fs/ext2/inode.c:

/*
 *  linux/fs/ext2/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Goal-directed block allocation by Stephen Tweedie
 *	(sct@dcs.ed.ac.uk), 1993, 1998
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *	David S. Miller (davem@caip.rutgers.edu), 1995
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *	(jj@sunsite.ms.mff.cuni.cz)
 *
 *  Assorted race fixes, rewrite of ext2_get_block() by Al Viro, 2000
 */

#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/fiemap.h>
#include <linux/namei.h>
#include "ext2.h"
#include "acl.h"
#include "xip.h"

MODULE_AUTHOR("Remy Card and others");
MODULE_DESCRIPTION("Second Extended Filesystem");
MODULE_LICENSE("GPL");

static int __ext2_write_inode(struct inode *inode, int do_sync);

/*
 * Test whether an inode is a fast symlink.
 */
static inline int ext2_inode_is_fast_symlink(struct inode *inode)
{
	int ea_blocks = EXT2_I(inode)->i_file_acl ?
		(inode->i_sb->s_blocksize >> 9) : 0;

	return (S_ISLNK(inode->i_mode) &&
		inode->i_blocks - ea_blocks == 0);
}

static void ext2_truncate_blocks(struct inode *inode, loff_t offset);

static void ext2_write_failed(struct address_space *mapping, loff_t to)
{
	struct inode *inode = mapping->host;

	if (to > inode->i_size) {
		truncate_pagecache(inode, to, inode->i_size);
		ext2_truncate_blocks(inode, inode->i_size);
	}
}

/*
 * Called at the last iput() if i_nlink is zero.
 */
void ext2_evict_inode(struct inode * inode)
{
	struct ext2_block_alloc_info *rsv;
	int want_delete = 0;

	if (!inode->i_nlink && !is_bad_inode(inode)) {
		want_delete = 1;
		dquot_initialize(inode);
	} else {
		dquot_drop(inode);
	}

	truncate_inode_pages(&inode->i_data, 0);

	if (want_delete) {
		/* set dtime */
		EXT2_I(inode)->i_dtime = get_seconds();
		mark_inode_dirty(inode);
		__ext2_write_inode(inode, inode_needs_sync(inode));
		/* truncate to 0 */
		inode->i_size = 0;
		if (inode->i_blocks)
			ext2_truncate_blocks(inode, 0);
	}

	invalidate_inode_buffers(inode);
	end_writeback(inode);

	ext2_discard_reservation(inode);
	rsv = EXT2_I(inode)->i_block_alloc_info;
	EXT2_I(inode)->i_block_alloc_info = NULL;
	if (unlikely(rsv))
		kfree(rsv);

	if (want_delete)
		ext2_free_inode(inode);
}

typedef struct {
	__le32	*p;
	__le32	key;
	struct buffer_head *bh;
} Indirect;

static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
{
	p->key = *(p->p = v);
	p->bh = bh;
}

static inline int verify_chain(Indirect *from, Indirect *to)
{
	while (from <= to && from->key == *from->p)
		from++;
	return (from > to);
}

/**
 *	ext2_block_to_path - parse the block number into array of offsets
 *	@inode: inode in question (we are only interested in its superblock)
 *	@i_block: block number to be parsed
 *	@offsets: array to store the offsets in
 *	@boundary: set this non-zero if the referred-to block is likely to be
 *	       followed (on disk) by an indirect block.
 *	To store the locations of file's data ext2 uses a data structure common
 *	for UNIX filesystems - tree of pointers anchored in the inode, with
 *	data blocks at leaves and indirect blocks in intermediate nodes.
 *	This function translates the block number into path in that tree -
 *	return value is the path length and @offsets[n] is the offset of
 *	pointer to (n+1)th node in the nth one. If @block is out of range
 *	(negative or too large) warning is printed and zero returned.
 *
 *	Note: function doesn't find node addresses, so no IO is needed. All
 *	we need to know is the capacity of indirect blocks (taken from the
 *	inode->i_sb).
 */

/*
 * Portability note: the last comparison (check that we fit into triple
 * indirect block) is spelled differently, because otherwise on an
 * architecture with 32-bit longs and 8Kb pages we might get into trouble
 * if our filesystem had 8Kb blocks. We might use long long, but that would
 * kill us on x86. Oh, well, at least the sign propagation does not matter -
 * i_block would have to be negative in the very beginning, so we would not
 * get there at all.
 */

static int ext2_block_to_path(struct inode *inode,
			long i_block, int offsets[4], int *boundary)
{
	int ptrs = EXT2_ADDR_PER_BLOCK(inode->i_sb);
	int ptrs_bits = EXT2_ADDR_PER_BLOCK_BITS(inode->i_sb);
	const long direct_blocks = EXT2_NDIR_BLOCKS,
		indirect_blocks = ptrs,
		double_blocks = (1 << (ptrs_bits * 2));
	int n = 0;
	int final = 0;

	if (i_block < 0) {
		ext2_msg(inode->i_sb, KERN_WARNING,
			"warning: %s: block < 0", __func__);
	} else if (i_block < direct_blocks) {
		offsets[n++] = i_block;
		final = direct_blocks;
	} else if ( (i_block -= direct_blocks) < indirect_blocks) {
		offsets[n++] = EXT2_IND_BLOCK;
		offsets[n++] = i_block;
		final = ptrs;
	} else if ((i_block -= indirect_blocks) < double_blocks) {
		offsets[n++] = EXT2_DIND_BLOCK;
		offsets[n++] = i_block >> ptrs_bits;
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
		offsets[n++] = EXT2_TIND_BLOCK;
		offsets[n++] = i_block >> (ptrs_bits * 2);
		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else {
		ext2_msg(inode->i_sb, KERN_WARNING,
			"warning: %s: block is too big", __func__);
	}
	if (boundary)
		*boundary = final - 1 - (i_block & (ptrs - 1));

	return n;
}
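
/*
 * [Editor's illustration, not part of the file: assuming a 1 KiB block
 *  size, so ptrs = EXT2_ADDR_PER_BLOCK = 256, ptrs_bits = 8 and
 *  direct_blocks = EXT2_NDIR_BLOCKS = 12, ext2_block_to_path() yields:
 *
 *	i_block = 5     -> offsets = { 5 },			depth 1 (direct)
 *	i_block = 12    -> offsets = { EXT2_IND_BLOCK, 0 },	depth 2
 *	i_block = 268   -> offsets = { EXT2_DIND_BLOCK, 0, 0 },	depth 3
 *	i_block = 65804 -> offsets = { EXT2_TIND_BLOCK, 0, 0, 0 }, depth 4]
 */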

/**
 *	ext2_get_branch - read the chain of indirect blocks leading to data
 *	@inode: inode in question
 *	@depth: depth of the chain (1 - direct pointer, etc.)
 *	@offsets: offsets of pointers in inode/indirect blocks
 *	@chain: place to store the result
 *	@err: here we store the error value
 *
 *	Function fills the array of triples <key, p, bh> and returns %NULL
 *	if everything went OK or the pointer to the last filled triple
 *	(incomplete one) otherwise. Upon the return chain[i].key contains
 *	the number of (i+1)-th block in the chain (as it is stored in memory,
 *	i.e. little-endian 32-bit), chain[i].p contains the address of that
 *	number (it points into struct inode for i==0 and into the bh->b_data
 *	for i>0) and chain[i].bh points to the buffer_head of i-th indirect
 *	block for i>0 and NULL for i==0. In other words, it holds the block
 *	numbers of the chain, addresses they were taken from (and where we can
 *	verify that chain did not change) and buffer_heads hosting these
 *	numbers.
 *
 *	Function stops when it stumbles upon zero pointer (absent block)
 *		(pointer to last triple returned, *@err == 0)
 *	or when it gets an IO error reading an indirect block
 *		(ditto, *@err == -EIO)
 *	or when it notices that chain had been changed while it was reading
 *		(ditto, *@err == -EAGAIN)
 *	or when it reads all @depth-1 indirect blocks successfully and finds
 *	the whole chain, all the way to the data (returns %NULL, *err == 0).
 */
static Indirect *ext2_get_branch(struct inode *inode,
				 int depth,
				 int *offsets,
				 Indirect chain[4],
				 int *err)
{
	struct super_block *sb = inode->i_sb;
	Indirect *p = chain;
	struct buffer_head *bh;

	*err = 0;
	/* i_data is not going away, no lock needed */
	add_chain (chain, NULL, EXT2_I(inode)->i_data + *offsets);
	if (!p->key)
		goto no_block;
	while (--depth) {
		bh = sb_bread(sb, le32_to_cpu(p->key));
		if (!bh)
			goto failure;
		read_lock(&EXT2_I(inode)->i_meta_lock);
		if (!verify_chain(chain, p))
			goto changed;
		add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
		read_unlock(&EXT2_I(inode)->i_meta_lock);
		if (!p->key)
			goto no_block;
	}
	return NULL;

changed:
	read_unlock(&EXT2_I(inode)->i_meta_lock);
	brelse(bh);
	*err = -EAGAIN;
	goto no_block;
failure:
	*err = -EIO;
no_block:
	return p;
}

/**
 *	ext2_find_near - find a place for allocation with sufficient locality
 *	@inode: owner
 *	@ind: descriptor of indirect block.
 *
 *	This function returns the preferred place for block allocation.
 *	It is used when heuristic for sequential allocation fails.
 *	Rules are:
 *	  + if there is a block to the left of our position - allocate near it.
 *	  + if pointer will live in indirect block - allocate near that block.
 *	  + if pointer will live in inode - allocate in the same cylinder group.
 *
 * In the latter case we colour the starting block by the caller's PID to
 * prevent it from clashing with concurrent allocations for a different inode
 * in the same block group. The PID is used here so that functionally related
 * files will be close-by on-disk.
 *
 *	Caller must make sure that @ind is valid and will stay that way.
 */
static ext2_fsblk_t ext2_find_near(struct inode *inode, Indirect *ind)
{
	struct ext2_inode_info *ei = EXT2_I(inode);
	__le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
	__le32 *p;
	ext2_fsblk_t bg_start;
	ext2_fsblk_t colour;

	/* Try to find previous block */
	for (p = ind->p - 1; p >= start; p--)
		if (*p)
			return le32_to_cpu(*p);

	/* No such thing, so let's try location of indirect block */
	if (ind->bh)
		return ind->bh->b_blocknr;

	/*
	 * It is going to be referred from inode itself? OK, just put it into
	 * the same cylinder group then.
	 */
	bg_start = ext2_group_first_block_no(inode->i_sb, ei->i_block_group);
	colour = (current->pid % 16) *
			(EXT2_BLOCKS_PER_GROUP(inode->i_sb) / 16);
	return bg_start + colour;
}
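
/*
 * [Editor's illustration: with 8192 blocks per group, a caller with
 *  PID 1234 gets colour = (1234 % 16) * (8192 / 16) = 2 * 512 = 1024,
 *  i.e. an allocation goal 1024 blocks into the inode's block group.]
 */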

/**
 *	ext2_find_goal - find a preferred place for allocation.
 *	@inode: owner
 *	@block: block we want
 *	@partial: pointer to the last triple within a chain
 *
 *	Returns preferred place for a block (the goal).
 */
static inline ext2_fsblk_t ext2_find_goal(struct inode *inode, long block,
					  Indirect *partial)
{
	struct ext2_block_alloc_info *block_i;

	block_i = EXT2_I(inode)->i_block_alloc_info;

	/*
	 * try the heuristic for sequential allocation,
	 * failing that at least try to get decent locality.
	 */
	if (block_i && (block == block_i->last_alloc_logical_block + 1)
		&& (block_i->last_alloc_physical_block != 0)) {
		return block_i->last_alloc_physical_block + 1;
	}

	return ext2_find_near(inode, partial);
}

/**
 *	ext2_blks_to_allocate: Look up the block map and count the number
 *	of direct blocks that need to be allocated for the given branch.
 *
 *	@branch: chain of indirect blocks
 *	@k: number of blocks needed for indirect blocks
 *	@blks: number of data blocks to be mapped.
 *	@blocks_to_boundary: the offset in the indirect block
 *
 *	return the total number of blocks to be allocated, including the
 *	direct and indirect blocks.
 */
static int
ext2_blks_to_allocate(Indirect * branch, int k, unsigned long blks,
		int blocks_to_boundary)
{
	unsigned long count = 0;

	/*
	 * Simple case: the [t,d]indirect block(s) have not been allocated
	 * yet, so clearly the blocks on that path have not been allocated
	 * either.
	 */
	if (k > 0) {
		/* right now we don't handle cross-boundary allocation */
		if (blks < blocks_to_boundary + 1)
			count += blks;
		else
			count += blocks_to_boundary + 1;
		return count;
	}

	count++;
	while (count < blks && count <= blocks_to_boundary
		&& le32_to_cpu(*(branch[0].p + count)) == 0) {
		count++;
	}
	return count;
}
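
/*
 * [Editor's illustration: with k = 1 indirect block still missing,
 *  blks = 5 and blocks_to_boundary = 2, the allocation is clipped at
 *  the boundary and the function returns blocks_to_boundary + 1 = 3
 *  data blocks; the missing indirect block itself is accounted for
 *  separately by the caller.]
 */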
383 383
384 /** 384 /**
385 * ext2_alloc_blocks: multiple allocate blocks needed for a branch 385 * ext2_alloc_blocks: multiple allocate blocks needed for a branch
386 * @indirect_blks: the number of blocks need to allocate for indirect 386 * @indirect_blks: the number of blocks need to allocate for indirect
387 * blocks 387 * blocks
388 * 388 *
389 * @new_blocks: on return it will store the new block numbers for 389 * @new_blocks: on return it will store the new block numbers for
390 * the indirect blocks(if needed) and the first direct block, 390 * the indirect blocks(if needed) and the first direct block,
391 * @blks: on return it will store the total number of allocated 391 * @blks: on return it will store the total number of allocated
392 * direct blocks 392 * direct blocks
393 */ 393 */
394 static int ext2_alloc_blocks(struct inode *inode, 394 static int ext2_alloc_blocks(struct inode *inode,
395 ext2_fsblk_t goal, int indirect_blks, int blks, 395 ext2_fsblk_t goal, int indirect_blks, int blks,
396 ext2_fsblk_t new_blocks[4], int *err) 396 ext2_fsblk_t new_blocks[4], int *err)
397 { 397 {
398 int target, i; 398 int target, i;
399 unsigned long count = 0; 399 unsigned long count = 0;
400 int index = 0; 400 int index = 0;
401 ext2_fsblk_t current_block = 0; 401 ext2_fsblk_t current_block = 0;
402 int ret = 0; 402 int ret = 0;
403 403
404 /* 404 /*
405 * Here we try to allocate the requested multiple blocks at once, 405 * Here we try to allocate the requested multiple blocks at once,
406 * on a best-effort basis. 406 * on a best-effort basis.
407 * To build a branch, we should allocate blocks for 407 * To build a branch, we should allocate blocks for
408 * the indirect blocks(if not allocated yet), and at least 408 * the indirect blocks(if not allocated yet), and at least
409 * the first direct block of this branch. That's the 409 * the first direct block of this branch. That's the
410 * minimum number of blocks need to allocate(required) 410 * minimum number of blocks need to allocate(required)
411 */ 411 */
412 target = blks + indirect_blks; 412 target = blks + indirect_blks;
413 413
414 while (1) { 414 while (1) {
415 count = target; 415 count = target;
416 /* allocating blocks for indirect blocks and direct blocks */ 416 /* allocating blocks for indirect blocks and direct blocks */
417 current_block = ext2_new_blocks(inode,goal,&count,err); 417 current_block = ext2_new_blocks(inode,goal,&count,err);
418 if (*err) 418 if (*err)
419 goto failed_out; 419 goto failed_out;
420 420
421 target -= count; 421 target -= count;
422 /* allocate blocks for indirect blocks */ 422 /* allocate blocks for indirect blocks */
423 while (index < indirect_blks && count) { 423 while (index < indirect_blks && count) {
424 new_blocks[index++] = current_block++; 424 new_blocks[index++] = current_block++;
425 count--; 425 count--;
426 } 426 }
427 427
428 if (count > 0) 428 if (count > 0)
429 break; 429 break;
430 } 430 }
431 431
432 /* save the new block number for the first direct block */ 432 /* save the new block number for the first direct block */
433 new_blocks[index] = current_block; 433 new_blocks[index] = current_block;
434 434
435 /* total number of blocks allocated for direct blocks */ 435 /* total number of blocks allocated for direct blocks */
436 ret = count; 436 ret = count;
437 *err = 0; 437 *err = 0;
438 return ret; 438 return ret;
439 failed_out: 439 failed_out:
440 for (i = 0; i <index; i++) 440 for (i = 0; i <index; i++)
441 ext2_free_blocks(inode, new_blocks[i], 1); 441 ext2_free_blocks(inode, new_blocks[i], 1);
442 if (index) 442 if (index)
443 mark_inode_dirty(inode); 443 mark_inode_dirty(inode);
444 return ret; 444 return ret;
445 } 445 }
446 446
447 /** 447 /**
448 * ext2_alloc_branch - allocate and set up a chain of blocks. 448 * ext2_alloc_branch - allocate and set up a chain of blocks.
449 * @inode: owner 449 * @inode: owner
450 * @num: depth of the chain (number of blocks to allocate) 450 * @num: depth of the chain (number of blocks to allocate)
451 * @offsets: offsets (in the blocks) to store the pointers to next. 451 * @offsets: offsets (in the blocks) to store the pointers to next.
452 * @branch: place to store the chain in. 452 * @branch: place to store the chain in.
453 * 453 *
454 * This function allocates @num blocks, zeroes out all but the last one, 454 * This function allocates @num blocks, zeroes out all but the last one,
455 * links them into chain and (if we are synchronous) writes them to disk. 455 * links them into chain and (if we are synchronous) writes them to disk.
456 * In other words, it prepares a branch that can be spliced onto the 456 * In other words, it prepares a branch that can be spliced onto the
457 * inode. It stores the information about that chain in the branch[], in 457 * inode. It stores the information about that chain in the branch[], in
458 * the same format as ext2_get_branch() would do. We are calling it after 458 * the same format as ext2_get_branch() would do. We are calling it after
459 * we had read the existing part of chain and partial points to the last 459 * we had read the existing part of chain and partial points to the last
460 * triple of that (one with zero ->key). Upon the exit we have the same 460 * triple of that (one with zero ->key). Upon the exit we have the same
461 * picture as after the successful ext2_get_block(), except that in one 461 * picture as after the successful ext2_get_block(), except that in one
462 * place chain is disconnected - *branch->p is still zero (we did not 462 * place chain is disconnected - *branch->p is still zero (we did not
463 * set the last link), but branch->key contains the number that should 463 * set the last link), but branch->key contains the number that should
464 * be placed into *branch->p to fill that gap. 464 * be placed into *branch->p to fill that gap.
465 * 465 *
466 * If allocation fails we free all blocks we've allocated (and forget 466 * If allocation fails we free all blocks we've allocated (and forget
467 * their buffer_heads) and return the error value the from failed 467 * their buffer_heads) and return the error value the from failed
468 * ext2_alloc_block() (normally -ENOSPC). Otherwise we set the chain 468 * ext2_alloc_block() (normally -ENOSPC). Otherwise we set the chain
469 * as described above and return 0. 469 * as described above and return 0.
470 */ 470 */
471 471
472 static int ext2_alloc_branch(struct inode *inode, 472 static int ext2_alloc_branch(struct inode *inode,
473 int indirect_blks, int *blks, ext2_fsblk_t goal, 473 int indirect_blks, int *blks, ext2_fsblk_t goal,
474 int *offsets, Indirect *branch) 474 int *offsets, Indirect *branch)
475 { 475 {
476 int blocksize = inode->i_sb->s_blocksize; 476 int blocksize = inode->i_sb->s_blocksize;
477 int i, n = 0; 477 int i, n = 0;
478 int err = 0; 478 int err = 0;
479 struct buffer_head *bh; 479 struct buffer_head *bh;
480 int num; 480 int num;
481 ext2_fsblk_t new_blocks[4]; 481 ext2_fsblk_t new_blocks[4];
482 ext2_fsblk_t current_block; 482 ext2_fsblk_t current_block;
483 483
484 num = ext2_alloc_blocks(inode, goal, indirect_blks, 484 num = ext2_alloc_blocks(inode, goal, indirect_blks,
485 *blks, new_blocks, &err); 485 *blks, new_blocks, &err);
486 if (err) 486 if (err)
487 return err; 487 return err;
488 488
489 branch[0].key = cpu_to_le32(new_blocks[0]); 489 branch[0].key = cpu_to_le32(new_blocks[0]);
490 /* 490 /*
491 * metadata blocks and data blocks are allocated. 491 * metadata blocks and data blocks are allocated.
492 */ 492 */
493 for (n = 1; n <= indirect_blks; n++) { 493 for (n = 1; n <= indirect_blks; n++) {
494 /* 494 /*
495 * Get buffer_head for parent block, zero it out 495 * Get buffer_head for parent block, zero it out
496 * and set the pointer to new one, then send 496 * and set the pointer to new one, then send
497 * parent to disk. 497 * parent to disk.
498 */ 498 */
499 bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 499 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
500 branch[n].bh = bh; 500 branch[n].bh = bh;
501 lock_buffer(bh); 501 lock_buffer(bh);
502 memset(bh->b_data, 0, blocksize); 502 memset(bh->b_data, 0, blocksize);
503 branch[n].p = (__le32 *) bh->b_data + offsets[n]; 503 branch[n].p = (__le32 *) bh->b_data + offsets[n];
504 branch[n].key = cpu_to_le32(new_blocks[n]); 504 branch[n].key = cpu_to_le32(new_blocks[n]);
505 *branch[n].p = branch[n].key; 505 *branch[n].p = branch[n].key;
506 if ( n == indirect_blks) { 506 if ( n == indirect_blks) {
507 current_block = new_blocks[n]; 507 current_block = new_blocks[n];
508 /* 508 /*
509 * End of chain, update the last new metablock of 509 * End of chain, update the last new metablock of
510 * the chain to point to the new allocated 510 * the chain to point to the new allocated
511 * data blocks numbers 511 * data blocks numbers
512 */ 512 */
513 for (i=1; i < num; i++) 513 for (i=1; i < num; i++)
514 *(branch[n].p + i) = cpu_to_le32(++current_block); 514 *(branch[n].p + i) = cpu_to_le32(++current_block);
515 } 515 }
516 set_buffer_uptodate(bh); 516 set_buffer_uptodate(bh);
517 unlock_buffer(bh); 517 unlock_buffer(bh);
518 mark_buffer_dirty_inode(bh, inode); 518 mark_buffer_dirty_inode(bh, inode);
519 /* We used to sync bh here if IS_SYNC(inode). 519 /* We used to sync bh here if IS_SYNC(inode).
520 * But we now rely upon generic_write_sync() 520 * But we now rely upon generic_write_sync()
521 * and b_inode_buffers. But not for directories. 521 * and b_inode_buffers. But not for directories.
522 */ 522 */
523 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) 523 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
524 sync_dirty_buffer(bh); 524 sync_dirty_buffer(bh);
525 } 525 }
526 *blks = num; 526 *blks = num;
527 return err; 527 return err;
528 } 528 }
529 529
530 /** 530 /**
531 * ext2_splice_branch - splice the allocated branch onto inode. 531 * ext2_splice_branch - splice the allocated branch onto inode.
532 * @inode: owner 532 * @inode: owner
533 * @block: (logical) number of block we are adding 533 * @block: (logical) number of block we are adding
534 * @where: location of missing link 534 * @where: location of missing link
535 * @num: number of indirect blocks we are adding 535 * @num: number of indirect blocks we are adding
536 * @blks: number of direct blocks we are adding 536 * @blks: number of direct blocks we are adding
537 * 537 *
538 * This function fills the missing link and does all housekeeping needed in 538 * This function fills the missing link and does all housekeeping needed in
539 * inode (->i_blocks, etc.). In case of success we end up with the full 539 * inode (->i_blocks, etc.). In case of success we end up with the full
540 * chain to new block and return 0. 540 * chain to new block and return 0.
541 */ 541 */
542 static void ext2_splice_branch(struct inode *inode, 542 static void ext2_splice_branch(struct inode *inode,
543 long block, Indirect *where, int num, int blks) 543 long block, Indirect *where, int num, int blks)
544 { 544 {
545 int i; 545 int i;
546 struct ext2_block_alloc_info *block_i; 546 struct ext2_block_alloc_info *block_i;
547 ext2_fsblk_t current_block; 547 ext2_fsblk_t current_block;
548 548
549 block_i = EXT2_I(inode)->i_block_alloc_info; 549 block_i = EXT2_I(inode)->i_block_alloc_info;
550 550
551 /* XXX LOCKING probably should have i_meta_lock ?*/ 551 /* XXX LOCKING probably should have i_meta_lock ?*/
552 /* That's it */ 552 /* That's it */
553 553
554 *where->p = where->key; 554 *where->p = where->key;
555 555
556 /* 556 /*
557 * Update the host buffer_head or inode to point to more just allocated 557 * Update the host buffer_head or inode to point to more just allocated
558 * direct blocks blocks 558 * direct blocks blocks
559 */ 559 */
560 if (num == 0 && blks > 1) { 560 if (num == 0 && blks > 1) {
561 current_block = le32_to_cpu(where->key) + 1; 561 current_block = le32_to_cpu(where->key) + 1;
562 for (i = 1; i < blks; i++) 562 for (i = 1; i < blks; i++)
563 *(where->p + i ) = cpu_to_le32(current_block++); 563 *(where->p + i ) = cpu_to_le32(current_block++);
564 } 564 }
565 565
566 /* 566 /*
567 * update the most recently allocated logical & physical block 567 * update the most recently allocated logical & physical block
568 * in i_block_alloc_info, to assist find the proper goal block for next 568 * in i_block_alloc_info, to assist find the proper goal block for next
569 * allocation 569 * allocation
570 */ 570 */
571 if (block_i) { 571 if (block_i) {
572 block_i->last_alloc_logical_block = block + blks - 1; 572 block_i->last_alloc_logical_block = block + blks - 1;
573 block_i->last_alloc_physical_block = 573 block_i->last_alloc_physical_block =
574 le32_to_cpu(where[num].key) + blks - 1; 574 le32_to_cpu(where[num].key) + blks - 1;
575 } 575 }
576 576
577 /* We are done with atomic stuff, now do the rest of housekeeping */ 577 /* We are done with atomic stuff, now do the rest of housekeeping */
578 578
579 /* had we spliced it onto indirect block? */ 579 /* had we spliced it onto indirect block? */
580 if (where->bh) 580 if (where->bh)
581 mark_buffer_dirty_inode(where->bh, inode); 581 mark_buffer_dirty_inode(where->bh, inode);
582 582
583 inode->i_ctime = CURRENT_TIME_SEC; 583 inode->i_ctime = CURRENT_TIME_SEC;
584 mark_inode_dirty(inode); 584 mark_inode_dirty(inode);
585 } 585 }
586 586
587 /* 587 /*
588 * Allocation strategy is simple: if we have to allocate something, we will 588 * Allocation strategy is simple: if we have to allocate something, we will
589 * have to go the whole way to leaf. So let's do it before attaching anything 589 * have to go the whole way to leaf. So let's do it before attaching anything
590 * to tree, set linkage between the newborn blocks, write them if sync is 590 * to tree, set linkage between the newborn blocks, write them if sync is
591 * required, recheck the path, free and repeat if check fails, otherwise 591 * required, recheck the path, free and repeat if check fails, otherwise
592 * set the last missing link (that will protect us from any truncate-generated 592 * set the last missing link (that will protect us from any truncate-generated
593 * removals - all blocks on the path are immune now) and possibly force the 593 * removals - all blocks on the path are immune now) and possibly force the
594 * write on the parent block. 594 * write on the parent block.
595 * That has a nice additional property: no special recovery from the failed 595 * That has a nice additional property: no special recovery from the failed
596 * allocations is needed - we simply release blocks and do not touch anything 596 * allocations is needed - we simply release blocks and do not touch anything
597 * reachable from inode. 597 * reachable from inode.
598 * 598 *
599 * `handle' can be NULL if create == 0. 599 * `handle' can be NULL if create == 0.
600 * 600 *
601 * return > 0, # of blocks mapped or allocated. 601 * return > 0, # of blocks mapped or allocated.
602 * return = 0, if plain lookup failed. 602 * return = 0, if plain lookup failed.
603 * return < 0, error case. 603 * return < 0, error case.
604 */ 604 */
605 static int ext2_get_blocks(struct inode *inode, 605 static int ext2_get_blocks(struct inode *inode,
606 sector_t iblock, unsigned long maxblocks, 606 sector_t iblock, unsigned long maxblocks,
607 struct buffer_head *bh_result, 607 struct buffer_head *bh_result,
608 int create) 608 int create)
609 { 609 {
610 int err = -EIO; 610 int err = -EIO;
611 int offsets[4]; 611 int offsets[4];
612 Indirect chain[4]; 612 Indirect chain[4];
613 Indirect *partial; 613 Indirect *partial;
614 ext2_fsblk_t goal; 614 ext2_fsblk_t goal;
615 int indirect_blks; 615 int indirect_blks;
616 int blocks_to_boundary = 0; 616 int blocks_to_boundary = 0;
617 int depth; 617 int depth;
618 struct ext2_inode_info *ei = EXT2_I(inode); 618 struct ext2_inode_info *ei = EXT2_I(inode);
619 int count = 0; 619 int count = 0;
620 ext2_fsblk_t first_block = 0; 620 ext2_fsblk_t first_block = 0;
621 621
622 depth = ext2_block_to_path(inode,iblock,offsets,&blocks_to_boundary); 622 depth = ext2_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
623 623
624 if (depth == 0) 624 if (depth == 0)
625 return (err); 625 return (err);
626 626
627 partial = ext2_get_branch(inode, depth, offsets, chain, &err); 627 partial = ext2_get_branch(inode, depth, offsets, chain, &err);
628 /* Simplest case - block found, no allocation needed */ 628 /* Simplest case - block found, no allocation needed */
629 if (!partial) { 629 if (!partial) {
630 first_block = le32_to_cpu(chain[depth - 1].key); 630 first_block = le32_to_cpu(chain[depth - 1].key);
631 clear_buffer_new(bh_result); /* What's this do? */ 631 clear_buffer_new(bh_result); /* What's this do? */
632 count++; 632 count++;
633 /*map more blocks*/ 633 /*map more blocks*/
634 while (count < maxblocks && count <= blocks_to_boundary) { 634 while (count < maxblocks && count <= blocks_to_boundary) {
635 ext2_fsblk_t blk; 635 ext2_fsblk_t blk;
636 636
637 if (!verify_chain(chain, chain + depth - 1)) { 637 if (!verify_chain(chain, chain + depth - 1)) {
638 /* 638 /*
639 * Indirect block might be removed by 639 * Indirect block might be removed by
640 * truncate while we were reading it. 640 * truncate while we were reading it.
641 * Handling of that case: forget what we've 641 * Handling of that case: forget what we've
642 * got now, go to reread. 642 * got now, go to reread.
643 */ 643 */
644 err = -EAGAIN; 644 err = -EAGAIN;
645 count = 0; 645 count = 0;
646 break; 646 break;
647 } 647 }
648 blk = le32_to_cpu(*(chain[depth-1].p + count)); 648 blk = le32_to_cpu(*(chain[depth-1].p + count));
649 if (blk == first_block + count) 649 if (blk == first_block + count)
650 count++; 650 count++;
651 else 651 else
652 break; 652 break;
653 } 653 }
654 if (err != -EAGAIN) 654 if (err != -EAGAIN)
655 goto got_it; 655 goto got_it;
656 } 656 }
657 657
658 /* Next simple case - plain lookup or failed read of indirect block */ 658 /* Next simple case - plain lookup or failed read of indirect block */
659 if (!create || err == -EIO) 659 if (!create || err == -EIO)
660 goto cleanup; 660 goto cleanup;
661 661
662 mutex_lock(&ei->truncate_mutex); 662 mutex_lock(&ei->truncate_mutex);
663 /* 663 /*
664 * If the indirect block is missing while we are reading 664 * If the indirect block is missing while we are reading
665 * the chain (ext2_get_branch() returns -EAGAIN), or 665 * the chain (ext2_get_branch() returns -EAGAIN), or
666 * if the chain has been changed after we grab the truncate mutex, 666 * if the chain has been changed after we grab the truncate mutex,
667 * (either because another process truncated this branch, or 667 * (either because another process truncated this branch, or
668 * another get_block allocated this branch) re-grab the chain to see if 668 * another get_block allocated this branch) re-grab the chain to see if
669 * the requested block has been allocated or not. 669 * the requested block has been allocated or not.
670 * 670 *
671 * Since we already block the truncate/other get_block 671 * Since we already block the truncate/other get_block
672 * at this point, we will have the current copy of the chain when we 672 * at this point, we will have the current copy of the chain when we
673 * splice the branch into the tree. 673 * splice the branch into the tree.
674 */ 674 */
675 if (err == -EAGAIN || !verify_chain(chain, partial)) { 675 if (err == -EAGAIN || !verify_chain(chain, partial)) {
676 while (partial > chain) { 676 while (partial > chain) {
677 brelse(partial->bh); 677 brelse(partial->bh);
678 partial--; 678 partial--;
679 } 679 }
680 partial = ext2_get_branch(inode, depth, offsets, chain, &err); 680 partial = ext2_get_branch(inode, depth, offsets, chain, &err);
681 if (!partial) { 681 if (!partial) {
682 count++; 682 count++;
683 mutex_unlock(&ei->truncate_mutex); 683 mutex_unlock(&ei->truncate_mutex);
684 if (err) 684 if (err)
685 goto cleanup; 685 goto cleanup;
686 clear_buffer_new(bh_result); 686 clear_buffer_new(bh_result);
687 goto got_it; 687 goto got_it;
688 } 688 }
689 } 689 }
690 690
691 /* 691 /*
692 * Okay, we need to do block allocation. Lazily initialize the block 692 * Okay, we need to do block allocation. Lazily initialize the block
693 * allocation info here if necessary 693 * allocation info here if necessary
694 */ 694 */
695 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) 695 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
696 ext2_init_block_alloc_info(inode); 696 ext2_init_block_alloc_info(inode);
697 697
698 goal = ext2_find_goal(inode, iblock, partial); 698 goal = ext2_find_goal(inode, iblock, partial);
699 699
700 /* the number of blocks we need to allocate for [d,t]indirect blocks */ 700 /* the number of blocks we need to allocate for [d,t]indirect blocks */
701 indirect_blks = (chain + depth) - partial - 1; 701 indirect_blks = (chain + depth) - partial - 1;
702 /* 702 /*
703 * Next look up the indirect map to count the total number of 703 * Next look up the indirect map to count the total number of
704 * direct blocks to allocate for this branch. 704 * direct blocks to allocate for this branch.
705 */ 705 */
706 count = ext2_blks_to_allocate(partial, indirect_blks, 706 count = ext2_blks_to_allocate(partial, indirect_blks,
707 maxblocks, blocks_to_boundary); 707 maxblocks, blocks_to_boundary);
708 /* 708 /*
709 * XXX ???? Block out ext2_truncate while we alter the tree 709 * XXX ???? Block out ext2_truncate while we alter the tree
710 */ 710 */
711 err = ext2_alloc_branch(inode, indirect_blks, &count, goal, 711 err = ext2_alloc_branch(inode, indirect_blks, &count, goal,
712 offsets + (partial - chain), partial); 712 offsets + (partial - chain), partial);
713 713
714 if (err) { 714 if (err) {
715 mutex_unlock(&ei->truncate_mutex); 715 mutex_unlock(&ei->truncate_mutex);
716 goto cleanup; 716 goto cleanup;
717 } 717 }
718 718
719 if (ext2_use_xip(inode->i_sb)) { 719 if (ext2_use_xip(inode->i_sb)) {
720 /* 720 /*
721 * we need to clear the block 721 * we need to clear the block
722 */ 722 */
723 err = ext2_clear_xip_target (inode, 723 err = ext2_clear_xip_target (inode,
724 le32_to_cpu(chain[depth-1].key)); 724 le32_to_cpu(chain[depth-1].key));
725 if (err) { 725 if (err) {
726 mutex_unlock(&ei->truncate_mutex); 726 mutex_unlock(&ei->truncate_mutex);
727 goto cleanup; 727 goto cleanup;
728 } 728 }
729 } 729 }
730 730
731 ext2_splice_branch(inode, iblock, partial, indirect_blks, count); 731 ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
732 mutex_unlock(&ei->truncate_mutex); 732 mutex_unlock(&ei->truncate_mutex);
733 set_buffer_new(bh_result); 733 set_buffer_new(bh_result);
734 got_it: 734 got_it:
735 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 735 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
736 if (count > blocks_to_boundary) 736 if (count > blocks_to_boundary)
737 set_buffer_boundary(bh_result); 737 set_buffer_boundary(bh_result);
738 err = count; 738 err = count;
739 /* Clean up and exit */ 739 /* Clean up and exit */
740 partial = chain + depth - 1; /* the whole chain */ 740 partial = chain + depth - 1; /* the whole chain */
741 cleanup: 741 cleanup:
742 while (partial > chain) { 742 while (partial > chain) {
743 brelse(partial->bh); 743 brelse(partial->bh);
744 partial--; 744 partial--;
745 } 745 }
746 return err; 746 return err;
747 } 747 }
748 748
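The fast path of ext2_get_blocks() walks the indirect chain without holding truncate_mutex and relies on a verify-then-retry pattern: read the chain unlocked, let verify_chain() confirm that no pointer changed while reading, and on a mismatch drop what was read and re-walk (under the mutex in the allocation path). A minimal user-space sketch of the same idea, with a generation counter standing in for verify_chain()'s pointer recheck (all names here are illustrative, not kernel APIs):

    #include <stdio.h>

    struct node { int value; };

    static int generation;            /* bumped by any writer */
    static struct node tree = { 42 };

    /* Optimistic read: snapshot the value, then check that no
     * writer ran in between; on a race, simply read again. */
    static int read_value(void)
    {
        int gen, v;

        do {
            gen = generation;         /* remember the version */
            v = tree.value;           /* unlocked read */
        } while (gen != generation);  /* writer raced us: reread */
        return v;
    }

    int main(void)
    {
        printf("%d\n", read_value());
        return 0;
    }

In the kernel code the recheck is finer-grained: verify_chain() compares each cached key against the slot it was read from, so a truncate racing with the walk forces the -EAGAIN retry seen above.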
749 int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) 749 int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
750 { 750 {
751 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 751 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
752 int ret = ext2_get_blocks(inode, iblock, max_blocks, 752 int ret = ext2_get_blocks(inode, iblock, max_blocks,
753 bh_result, create); 753 bh_result, create);
754 if (ret > 0) { 754 if (ret > 0) {
755 bh_result->b_size = (ret << inode->i_blkbits); 755 bh_result->b_size = (ret << inode->i_blkbits);
756 ret = 0; 756 ret = 0;
757 } 757 }
758 return ret; 758 return ret;
759 759
760 } 760 }
761 761
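The wrapper converts between the two units the get_block contract mixes: bh_result->b_size carries a byte count, while ext2_get_blocks() deals in filesystem blocks, so the shift by i_blkbits goes in both directions. A quick arithmetic check with illustrative values (1 KiB blocks, i.e. i_blkbits == 10):

    #include <stdio.h>

    int main(void)
    {
        unsigned blkbits = 10;                    /* 1 KiB blocks */
        unsigned long b_size = 16384;             /* bytes requested */
        unsigned max_blocks = b_size >> blkbits;  /* 16 blocks */
        int ret = 4;                              /* blocks actually mapped */

        printf("max_blocks=%u mapped_bytes=%lu\n",
               max_blocks, (unsigned long)ret << blkbits); /* 16 4096 */
        return 0;
    }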
762 int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 762 int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
763 u64 start, u64 len) 763 u64 start, u64 len)
764 { 764 {
765 return generic_block_fiemap(inode, fieinfo, start, len, 765 return generic_block_fiemap(inode, fieinfo, start, len,
766 ext2_get_block); 766 ext2_get_block);
767 } 767 }
768 768
769 static int ext2_writepage(struct page *page, struct writeback_control *wbc) 769 static int ext2_writepage(struct page *page, struct writeback_control *wbc)
770 { 770 {
771 return block_write_full_page(page, ext2_get_block, wbc); 771 return block_write_full_page(page, ext2_get_block, wbc);
772 } 772 }
773 773
774 static int ext2_readpage(struct file *file, struct page *page) 774 static int ext2_readpage(struct file *file, struct page *page)
775 { 775 {
776 return mpage_readpage(page, ext2_get_block); 776 return mpage_readpage(page, ext2_get_block);
777 } 777 }
778 778
779 static int 779 static int
780 ext2_readpages(struct file *file, struct address_space *mapping, 780 ext2_readpages(struct file *file, struct address_space *mapping,
781 struct list_head *pages, unsigned nr_pages) 781 struct list_head *pages, unsigned nr_pages)
782 { 782 {
783 return mpage_readpages(mapping, pages, nr_pages, ext2_get_block); 783 return mpage_readpages(mapping, pages, nr_pages, ext2_get_block);
784 } 784 }
785 785
786 static int 786 static int
787 ext2_write_begin(struct file *file, struct address_space *mapping, 787 ext2_write_begin(struct file *file, struct address_space *mapping,
788 loff_t pos, unsigned len, unsigned flags, 788 loff_t pos, unsigned len, unsigned flags,
789 struct page **pagep, void **fsdata) 789 struct page **pagep, void **fsdata)
790 { 790 {
791 int ret; 791 int ret;
792 792
793 ret = block_write_begin(mapping, pos, len, flags, pagep, 793 ret = block_write_begin(mapping, pos, len, flags, pagep,
794 ext2_get_block); 794 ext2_get_block);
795 if (ret < 0) 795 if (ret < 0)
796 ext2_write_failed(mapping, pos + len); 796 ext2_write_failed(mapping, pos + len);
797 return ret; 797 return ret;
798 } 798 }
799 799
800 static int ext2_write_end(struct file *file, struct address_space *mapping, 800 static int ext2_write_end(struct file *file, struct address_space *mapping,
801 loff_t pos, unsigned len, unsigned copied, 801 loff_t pos, unsigned len, unsigned copied,
802 struct page *page, void *fsdata) 802 struct page *page, void *fsdata)
803 { 803 {
804 int ret; 804 int ret;
805 805
806 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 806 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
807 if (ret < len) 807 if (ret < len)
808 ext2_write_failed(mapping, pos + len); 808 ext2_write_failed(mapping, pos + len);
809 return ret; 809 return ret;
810 } 810 }
811 811
812 static int 812 static int
813 ext2_nobh_write_begin(struct file *file, struct address_space *mapping, 813 ext2_nobh_write_begin(struct file *file, struct address_space *mapping,
814 loff_t pos, unsigned len, unsigned flags, 814 loff_t pos, unsigned len, unsigned flags,
815 struct page **pagep, void **fsdata) 815 struct page **pagep, void **fsdata)
816 { 816 {
817 int ret; 817 int ret;
818 818
819 ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata, 819 ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
820 ext2_get_block); 820 ext2_get_block);
821 if (ret < 0) 821 if (ret < 0)
822 ext2_write_failed(mapping, pos + len); 822 ext2_write_failed(mapping, pos + len);
823 return ret; 823 return ret;
824 } 824 }
825 825
826 static int ext2_nobh_writepage(struct page *page, 826 static int ext2_nobh_writepage(struct page *page,
827 struct writeback_control *wbc) 827 struct writeback_control *wbc)
828 { 828 {
829 return nobh_writepage(page, ext2_get_block, wbc); 829 return nobh_writepage(page, ext2_get_block, wbc);
830 } 830 }
831 831
832 static sector_t ext2_bmap(struct address_space *mapping, sector_t block) 832 static sector_t ext2_bmap(struct address_space *mapping, sector_t block)
833 { 833 {
834 return generic_block_bmap(mapping,block,ext2_get_block); 834 return generic_block_bmap(mapping,block,ext2_get_block);
835 } 835 }
836 836
837 static ssize_t 837 static ssize_t
838 ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 838 ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
839 loff_t offset, unsigned long nr_segs) 839 loff_t offset, unsigned long nr_segs)
840 { 840 {
841 struct file *file = iocb->ki_filp; 841 struct file *file = iocb->ki_filp;
842 struct address_space *mapping = file->f_mapping; 842 struct address_space *mapping = file->f_mapping;
843 struct inode *inode = mapping->host; 843 struct inode *inode = mapping->host;
844 ssize_t ret; 844 ssize_t ret;
845 845
846 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, 846 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
847 iov, offset, nr_segs, ext2_get_block, NULL); 847 iov, offset, nr_segs, ext2_get_block, NULL);
848 if (ret < 0 && (rw & WRITE)) 848 if (ret < 0 && (rw & WRITE))
849 ext2_write_failed(mapping, offset + iov_length(iov, nr_segs)); 849 ext2_write_failed(mapping, offset + iov_length(iov, nr_segs));
850 return ret; 850 return ret;
851 } 851 }
852 852
853 static int 853 static int
854 ext2_writepages(struct address_space *mapping, struct writeback_control *wbc) 854 ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
855 { 855 {
856 return mpage_writepages(mapping, wbc, ext2_get_block); 856 return mpage_writepages(mapping, wbc, ext2_get_block);
857 } 857 }
858 858
859 const struct address_space_operations ext2_aops = { 859 const struct address_space_operations ext2_aops = {
860 .readpage = ext2_readpage, 860 .readpage = ext2_readpage,
861 .readpages = ext2_readpages, 861 .readpages = ext2_readpages,
862 .writepage = ext2_writepage, 862 .writepage = ext2_writepage,
863 .write_begin = ext2_write_begin, 863 .write_begin = ext2_write_begin,
864 .write_end = ext2_write_end, 864 .write_end = ext2_write_end,
865 .bmap = ext2_bmap, 865 .bmap = ext2_bmap,
866 .direct_IO = ext2_direct_IO, 866 .direct_IO = ext2_direct_IO,
867 .writepages = ext2_writepages, 867 .writepages = ext2_writepages,
868 .migratepage = buffer_migrate_page, 868 .migratepage = buffer_migrate_page,
869 .is_partially_uptodate = block_is_partially_uptodate, 869 .is_partially_uptodate = block_is_partially_uptodate,
870 .error_remove_page = generic_error_remove_page, 870 .error_remove_page = generic_error_remove_page,
871 }; 871 };
872 872
873 const struct address_space_operations ext2_aops_xip = { 873 const struct address_space_operations ext2_aops_xip = {
874 .bmap = ext2_bmap, 874 .bmap = ext2_bmap,
875 .get_xip_mem = ext2_get_xip_mem, 875 .get_xip_mem = ext2_get_xip_mem,
876 }; 876 };
877 877
878 const struct address_space_operations ext2_nobh_aops = { 878 const struct address_space_operations ext2_nobh_aops = {
879 .readpage = ext2_readpage, 879 .readpage = ext2_readpage,
880 .readpages = ext2_readpages, 880 .readpages = ext2_readpages,
881 .writepage = ext2_nobh_writepage, 881 .writepage = ext2_nobh_writepage,
882 .write_begin = ext2_nobh_write_begin, 882 .write_begin = ext2_nobh_write_begin,
883 .write_end = nobh_write_end, 883 .write_end = nobh_write_end,
884 .bmap = ext2_bmap, 884 .bmap = ext2_bmap,
885 .direct_IO = ext2_direct_IO, 885 .direct_IO = ext2_direct_IO,
886 .writepages = ext2_writepages, 886 .writepages = ext2_writepages,
887 .migratepage = buffer_migrate_page, 887 .migratepage = buffer_migrate_page,
888 .error_remove_page = generic_error_remove_page, 888 .error_remove_page = generic_error_remove_page,
889 }; 889 };
890 890
891 /* 891 /*
892 * Probably it should be a library function... search for first non-zero word 892 * Probably it should be a library function... search for first non-zero word
893 * or memcmp with zero_page, whatever is better for particular architecture. 893 * or memcmp with zero_page, whatever is better for particular architecture.
894 * Linus? 894 * Linus?
895 */ 895 */
896 static inline int all_zeroes(__le32 *p, __le32 *q) 896 static inline int all_zeroes(__le32 *p, __le32 *q)
897 { 897 {
898 while (p < q) 898 while (p < q)
899 if (*p++) 899 if (*p++)
900 return 0; 900 return 0;
901 return 1; 901 return 1;
902 } 902 }
903 903
904 /** 904 /**
905 * ext2_find_shared - find the indirect blocks for partial truncation. 905 * ext2_find_shared - find the indirect blocks for partial truncation.
906 * @inode: inode in question 906 * @inode: inode in question
907 * @depth: depth of the affected branch 907 * @depth: depth of the affected branch
908 * @offsets: offsets of pointers in that branch (see ext2_block_to_path) 908 * @offsets: offsets of pointers in that branch (see ext2_block_to_path)
909 * @chain: place to store the pointers to partial indirect blocks 909 * @chain: place to store the pointers to partial indirect blocks
910 * @top: place to the (detached) top of branch 910 * @top: place to the (detached) top of branch
911 * 911 *
912 * This is a helper function used by ext2_truncate(). 912 * This is a helper function used by ext2_truncate().
913 * 913 *
914 * When we do truncate() we may have to clean the ends of several indirect 914 * When we do truncate() we may have to clean the ends of several indirect
915 * blocks but leave the blocks themselves alive. Block is partially 915 * blocks but leave the blocks themselves alive. Block is partially
916 * truncated if some data below the new i_size is referred from it (and 916 * truncated if some data below the new i_size is referred from it (and
917 * it is on the path to the first completely truncated data block, indeed). 917 * it is on the path to the first completely truncated data block, indeed).
918 * We have to free the top of that path along with everything to the right 918 * We have to free the top of that path along with everything to the right
919 * of the path. Since no allocation past the truncation point is possible 919 * of the path. Since no allocation past the truncation point is possible
920 * until ext2_truncate() finishes, we may safely do the latter, but top 920 * until ext2_truncate() finishes, we may safely do the latter, but top
921 * of branch may require special attention - pageout below the truncation 921 * of branch may require special attention - pageout below the truncation
922 * point might try to populate it. 922 * point might try to populate it.
923 * 923 *
924 * We atomically detach the top of branch from the tree, store the block 924 * We atomically detach the top of branch from the tree, store the block
925 * number of its root in *@top, pointers to buffer_heads of partially 925 * number of its root in *@top, pointers to buffer_heads of partially
926 * truncated blocks - in @chain[].bh and pointers to their last elements 926 * truncated blocks - in @chain[].bh and pointers to their last elements
927 * that should not be removed - in @chain[].p. Return value is the pointer 927 * that should not be removed - in @chain[].p. Return value is the pointer
928 * to last filled element of @chain. 928 * to last filled element of @chain.
929 * 929 *
930 * The work left to the caller is the actual freeing of subtrees: 930 * The work left to the caller is the actual freeing of subtrees:
931 * a) free the subtree starting from *@top 931 * a) free the subtree starting from *@top
932 * b) free the subtrees whose roots are stored in 932 * b) free the subtrees whose roots are stored in
933 * (@chain[i].p+1 .. end of @chain[i].bh->b_data) 933 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
934 * c) free the subtrees growing from the inode past the @chain[0].p 934 * c) free the subtrees growing from the inode past the @chain[0].p
935 * (no partially truncated stuff there). 935 * (no partially truncated stuff there).
936 */ 936 */
937 937
938 static Indirect *ext2_find_shared(struct inode *inode, 938 static Indirect *ext2_find_shared(struct inode *inode,
939 int depth, 939 int depth,
940 int offsets[4], 940 int offsets[4],
941 Indirect chain[4], 941 Indirect chain[4],
942 __le32 *top) 942 __le32 *top)
943 { 943 {
944 Indirect *partial, *p; 944 Indirect *partial, *p;
945 int k, err; 945 int k, err;
946 946
947 *top = 0; 947 *top = 0;
948 for (k = depth; k > 1 && !offsets[k-1]; k--) 948 for (k = depth; k > 1 && !offsets[k-1]; k--)
949 ; 949 ;
950 partial = ext2_get_branch(inode, k, offsets, chain, &err); 950 partial = ext2_get_branch(inode, k, offsets, chain, &err);
951 if (!partial) 951 if (!partial)
952 partial = chain + k-1; 952 partial = chain + k-1;
953 /* 953 /*
954 * If the branch acquired continuation since we've looked at it - 954 * If the branch acquired continuation since we've looked at it -
955 * fine, it should all survive and (new) top doesn't belong to us. 955 * fine, it should all survive and (new) top doesn't belong to us.
956 */ 956 */
957 write_lock(&EXT2_I(inode)->i_meta_lock); 957 write_lock(&EXT2_I(inode)->i_meta_lock);
958 if (!partial->key && *partial->p) { 958 if (!partial->key && *partial->p) {
959 write_unlock(&EXT2_I(inode)->i_meta_lock); 959 write_unlock(&EXT2_I(inode)->i_meta_lock);
960 goto no_top; 960 goto no_top;
961 } 961 }
962 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) 962 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
963 ; 963 ;
964 /* 964 /*
965 * OK, we've found the last block that must survive. The rest of our 965 * OK, we've found the last block that must survive. The rest of our
966 * branch should be detached before unlocking. However, if that rest 966 * branch should be detached before unlocking. However, if that rest
967 * of branch is all ours and does not grow immediately from the inode 967 * of branch is all ours and does not grow immediately from the inode
968 * it's easier to cheat and just decrement partial->p. 968 * it's easier to cheat and just decrement partial->p.
969 */ 969 */
970 if (p == chain + k - 1 && p > chain) { 970 if (p == chain + k - 1 && p > chain) {
971 p->p--; 971 p->p--;
972 } else { 972 } else {
973 *top = *p->p; 973 *top = *p->p;
974 *p->p = 0; 974 *p->p = 0;
975 } 975 }
976 write_unlock(&EXT2_I(inode)->i_meta_lock); 976 write_unlock(&EXT2_I(inode)->i_meta_lock);
977 977
978 while(partial > p) 978 while(partial > p)
979 { 979 {
980 brelse(partial->bh); 980 brelse(partial->bh);
981 partial--; 981 partial--;
982 } 982 }
983 no_top: 983 no_top:
984 return partial; 984 return partial;
985 } 985 }
986 986
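The first loop in ext2_find_shared() shortens the path before anything else: trailing zero offsets mean the truncate point sits at the very start of those lower levels, so the whole subtree below is going away and only the top k levels can possibly be shared with surviving data. A small demo of that trim, with illustrative values for offsets[] as ext2_block_to_path() would produce them:

    #include <stdio.h>

    int main(void)
    {
        /* e.g. a depth-3 path whose last level starts at entry 0 */
        int offsets[4] = { 12, 5, 0, 0 };
        int depth = 3, k;

        /* same trim as ext2_find_shared(): drop trailing zeroes */
        for (k = depth; k > 1 && !offsets[k - 1]; k--)
            ;
        printf("shared branch depth k=%d\n", k);  /* prints 2 */
        return 0;
    }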
987 /** 987 /**
988 * ext2_free_data - free a list of data blocks 988 * ext2_free_data - free a list of data blocks
989 * @inode: inode we are dealing with 989 * @inode: inode we are dealing with
990 * @p: array of block numbers 990 * @p: array of block numbers
991 * @q: points immediately past the end of array 991 * @q: points immediately past the end of array
992 * 992 *
993 * We are freeing all blocks referred from that array (numbers are 993 * We are freeing all blocks referred from that array (numbers are
994 * stored as little-endian 32-bit) and updating @inode->i_blocks 994 * stored as little-endian 32-bit) and updating @inode->i_blocks
995 * appropriately. 995 * appropriately.
996 */ 996 */
997 static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q) 997 static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q)
998 { 998 {
999 unsigned long block_to_free = 0, count = 0; 999 unsigned long block_to_free = 0, count = 0;
1000 unsigned long nr; 1000 unsigned long nr;
1001 1001
1002 for ( ; p < q ; p++) { 1002 for ( ; p < q ; p++) {
1003 nr = le32_to_cpu(*p); 1003 nr = le32_to_cpu(*p);
1004 if (nr) { 1004 if (nr) {
1005 *p = 0; 1005 *p = 0;
1006 /* accumulate blocks to free if they're contiguous */ 1006 /* accumulate blocks to free if they're contiguous */
1007 if (count == 0) 1007 if (count == 0)
1008 goto free_this; 1008 goto free_this;
1009 else if (block_to_free == nr - count) 1009 else if (block_to_free == nr - count)
1010 count++; 1010 count++;
1011 else { 1011 else {
1012 ext2_free_blocks (inode, block_to_free, count); 1012 ext2_free_blocks (inode, block_to_free, count);
1013 mark_inode_dirty(inode); 1013 mark_inode_dirty(inode);
1014 free_this: 1014 free_this:
1015 block_to_free = nr; 1015 block_to_free = nr;
1016 count = 1; 1016 count = 1;
1017 } 1017 }
1018 } 1018 }
1019 } 1019 }
1020 if (count > 0) { 1020 if (count > 0) {
1021 ext2_free_blocks (inode, block_to_free, count); 1021 ext2_free_blocks (inode, block_to_free, count);
1022 mark_inode_dirty(inode); 1022 mark_inode_dirty(inode);
1023 } 1023 }
1024 } 1024 }
1025 1025
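ext2_free_data() batches contiguous block numbers into a single ext2_free_blocks() call rather than freeing one block at a time. The same run-accumulation pattern, extracted into a stand-alone sketch where free_run() stands in for ext2_free_blocks():

    #include <stdio.h>

    static void free_run(unsigned long start, unsigned long count)
    {
        printf("free %lu blocks starting at %lu\n", count, start);
    }

    int main(void)
    {
        unsigned long blocks[] = { 100, 101, 102, 200, 201, 300 };
        unsigned long start = 0, count = 0;
        size_t i;

        for (i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++) {
            if (count && blocks[i] == start + count) {
                count++;                  /* extends the current run */
            } else {
                if (count)
                    free_run(start, count);
                start = blocks[i];        /* begin a new run */
                count = 1;
            }
        }
        if (count)
            free_run(start, count);       /* flush the final run */
        return 0;
    }

This prints one call for 100..102, one for 200..201 and one for 300, mirroring how the kernel code amortizes the bitmap updates.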
1026 /** 1026 /**
1027 * ext2_free_branches - free an array of branches 1027 * ext2_free_branches - free an array of branches
1028 * @inode: inode we are dealing with 1028 * @inode: inode we are dealing with
1029 * @p: array of block numbers 1029 * @p: array of block numbers
1030 * @q: pointer immediately past the end of array 1030 * @q: pointer immediately past the end of array
1031 * @depth: depth of the branches to free 1031 * @depth: depth of the branches to free
1032 * 1032 *
1033 * We are freeing all blocks referred from these branches (numbers are 1033 * We are freeing all blocks referred from these branches (numbers are
1034 * stored as little-endian 32-bit) and updating @inode->i_blocks 1034 * stored as little-endian 32-bit) and updating @inode->i_blocks
1035 * appropriately. 1035 * appropriately.
1036 */ 1036 */
1037 static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int depth) 1037 static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int depth)
1038 { 1038 {
1039 struct buffer_head * bh; 1039 struct buffer_head * bh;
1040 unsigned long nr; 1040 unsigned long nr;
1041 1041
1042 if (depth--) { 1042 if (depth--) {
1043 int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb); 1043 int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb);
1044 for ( ; p < q ; p++) { 1044 for ( ; p < q ; p++) {
1045 nr = le32_to_cpu(*p); 1045 nr = le32_to_cpu(*p);
1046 if (!nr) 1046 if (!nr)
1047 continue; 1047 continue;
1048 *p = 0; 1048 *p = 0;
1049 bh = sb_bread(inode->i_sb, nr); 1049 bh = sb_bread(inode->i_sb, nr);
1050 /* 1050 /*
1051 * A read failure? Report error and clear slot 1051 * A read failure? Report error and clear slot
1052 * (should be rare). 1052 * (should be rare).
1053 */ 1053 */
1054 if (!bh) { 1054 if (!bh) {
1055 ext2_error(inode->i_sb, "ext2_free_branches", 1055 ext2_error(inode->i_sb, "ext2_free_branches",
1056 "Read failure, inode=%ld, block=%ld", 1056 "Read failure, inode=%ld, block=%ld",
1057 inode->i_ino, nr); 1057 inode->i_ino, nr);
1058 continue; 1058 continue;
1059 } 1059 }
1060 ext2_free_branches(inode, 1060 ext2_free_branches(inode,
1061 (__le32*)bh->b_data, 1061 (__le32*)bh->b_data,
1062 (__le32*)bh->b_data + addr_per_block, 1062 (__le32*)bh->b_data + addr_per_block,
1063 depth); 1063 depth);
1064 bforget(bh); 1064 bforget(bh);
1065 ext2_free_blocks(inode, nr, 1); 1065 ext2_free_blocks(inode, nr, 1);
1066 mark_inode_dirty(inode); 1066 mark_inode_dirty(inode);
1067 } 1067 }
1068 } else 1068 } else
1069 ext2_free_data(inode, p, q); 1069 ext2_free_data(inode, p, q);
1070 } 1070 }
1071 1071
1072 static void __ext2_truncate_blocks(struct inode *inode, loff_t offset) 1072 static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
1073 { 1073 {
1074 __le32 *i_data = EXT2_I(inode)->i_data; 1074 __le32 *i_data = EXT2_I(inode)->i_data;
1075 struct ext2_inode_info *ei = EXT2_I(inode); 1075 struct ext2_inode_info *ei = EXT2_I(inode);
1076 int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb); 1076 int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb);
1077 int offsets[4]; 1077 int offsets[4];
1078 Indirect chain[4]; 1078 Indirect chain[4];
1079 Indirect *partial; 1079 Indirect *partial;
1080 __le32 nr = 0; 1080 __le32 nr = 0;
1081 int n; 1081 int n;
1082 long iblock; 1082 long iblock;
1083 unsigned blocksize; 1083 unsigned blocksize;
1084 blocksize = inode->i_sb->s_blocksize; 1084 blocksize = inode->i_sb->s_blocksize;
1085 iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb); 1085 iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
1086 1086
1087 n = ext2_block_to_path(inode, iblock, offsets, NULL); 1087 n = ext2_block_to_path(inode, iblock, offsets, NULL);
1088 if (n == 0) 1088 if (n == 0)
1089 return; 1089 return;
1090 1090
1091 /* 1091 /*
1092 * From here we block out all ext2_get_block() callers who want to 1092 * From here we block out all ext2_get_block() callers who want to
1093 * modify the block allocation tree. 1093 * modify the block allocation tree.
1094 */ 1094 */
1095 mutex_lock(&ei->truncate_mutex); 1095 mutex_lock(&ei->truncate_mutex);
1096 1096
1097 if (n == 1) { 1097 if (n == 1) {
1098 ext2_free_data(inode, i_data+offsets[0], 1098 ext2_free_data(inode, i_data+offsets[0],
1099 i_data + EXT2_NDIR_BLOCKS); 1099 i_data + EXT2_NDIR_BLOCKS);
1100 goto do_indirects; 1100 goto do_indirects;
1101 } 1101 }
1102 1102
1103 partial = ext2_find_shared(inode, n, offsets, chain, &nr); 1103 partial = ext2_find_shared(inode, n, offsets, chain, &nr);
1104 /* Kill the top of shared branch (already detached) */ 1104 /* Kill the top of shared branch (already detached) */
1105 if (nr) { 1105 if (nr) {
1106 if (partial == chain) 1106 if (partial == chain)
1107 mark_inode_dirty(inode); 1107 mark_inode_dirty(inode);
1108 else 1108 else
1109 mark_buffer_dirty_inode(partial->bh, inode); 1109 mark_buffer_dirty_inode(partial->bh, inode);
1110 ext2_free_branches(inode, &nr, &nr+1, (chain+n-1) - partial); 1110 ext2_free_branches(inode, &nr, &nr+1, (chain+n-1) - partial);
1111 } 1111 }
1112 /* Clear the ends of indirect blocks on the shared branch */ 1112 /* Clear the ends of indirect blocks on the shared branch */
1113 while (partial > chain) { 1113 while (partial > chain) {
1114 ext2_free_branches(inode, 1114 ext2_free_branches(inode,
1115 partial->p + 1, 1115 partial->p + 1,
1116 (__le32*)partial->bh->b_data+addr_per_block, 1116 (__le32*)partial->bh->b_data+addr_per_block,
1117 (chain+n-1) - partial); 1117 (chain+n-1) - partial);
1118 mark_buffer_dirty_inode(partial->bh, inode); 1118 mark_buffer_dirty_inode(partial->bh, inode);
1119 brelse (partial->bh); 1119 brelse (partial->bh);
1120 partial--; 1120 partial--;
1121 } 1121 }
1122 do_indirects: 1122 do_indirects:
1123 /* Kill the remaining (whole) subtrees */ 1123 /* Kill the remaining (whole) subtrees */
1124 switch (offsets[0]) { 1124 switch (offsets[0]) {
1125 default: 1125 default:
1126 nr = i_data[EXT2_IND_BLOCK]; 1126 nr = i_data[EXT2_IND_BLOCK];
1127 if (nr) { 1127 if (nr) {
1128 i_data[EXT2_IND_BLOCK] = 0; 1128 i_data[EXT2_IND_BLOCK] = 0;
1129 mark_inode_dirty(inode); 1129 mark_inode_dirty(inode);
1130 ext2_free_branches(inode, &nr, &nr+1, 1); 1130 ext2_free_branches(inode, &nr, &nr+1, 1);
1131 } 1131 }
1132 case EXT2_IND_BLOCK: 1132 case EXT2_IND_BLOCK:
1133 nr = i_data[EXT2_DIND_BLOCK]; 1133 nr = i_data[EXT2_DIND_BLOCK];
1134 if (nr) { 1134 if (nr) {
1135 i_data[EXT2_DIND_BLOCK] = 0; 1135 i_data[EXT2_DIND_BLOCK] = 0;
1136 mark_inode_dirty(inode); 1136 mark_inode_dirty(inode);
1137 ext2_free_branches(inode, &nr, &nr+1, 2); 1137 ext2_free_branches(inode, &nr, &nr+1, 2);
1138 } 1138 }
1139 case EXT2_DIND_BLOCK: 1139 case EXT2_DIND_BLOCK:
1140 nr = i_data[EXT2_TIND_BLOCK]; 1140 nr = i_data[EXT2_TIND_BLOCK];
1141 if (nr) { 1141 if (nr) {
1142 i_data[EXT2_TIND_BLOCK] = 0; 1142 i_data[EXT2_TIND_BLOCK] = 0;
1143 mark_inode_dirty(inode); 1143 mark_inode_dirty(inode);
1144 ext2_free_branches(inode, &nr, &nr+1, 3); 1144 ext2_free_branches(inode, &nr, &nr+1, 3);
1145 } 1145 }
1146 case EXT2_TIND_BLOCK: 1146 case EXT2_TIND_BLOCK:
1147 ; 1147 ;
1148 } 1148 }
1149 1149
1150 ext2_discard_reservation(inode); 1150 ext2_discard_reservation(inode);
1151 1151
1152 mutex_unlock(&ei->truncate_mutex); 1152 mutex_unlock(&ei->truncate_mutex);
1153 } 1153 }
1154 1154
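Note that the do_indirects switch deliberately omits break statements: if truncation begins among the direct blocks (the default label), the single-, double- and triple-indirect trees must all be freed, so control falls through each case; a truncation point inside a later tree skips the earlier ones. The same cumulative-cleanup shape in miniature (stage names are illustrative):

    #include <stdio.h>

    enum stage { DIRECT, IND, DIND, TIND };

    static void cleanup_from(enum stage s)
    {
        switch (s) {                 /* intentional fall-through */
        case DIRECT:
            printf("free single-indirect tree\n");
        case IND:
            printf("free double-indirect tree\n");
        case DIND:
            printf("free triple-indirect tree\n");
        case TIND:
            ;                        /* nothing beyond triple */
        }
    }

    int main(void)
    {
        cleanup_from(DIND);          /* frees only the triple tree */
        return 0;
    }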
1155 static void ext2_truncate_blocks(struct inode *inode, loff_t offset) 1155 static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
1156 { 1156 {
1157 /* 1157 /*
1158 * XXX: it seems like a bug here that we don't allow 1158 * XXX: it seems like a bug here that we don't allow
1159 * IS_APPEND inode to have blocks-past-i_size trimmed off. 1159 * IS_APPEND inode to have blocks-past-i_size trimmed off.
1160 * review and fix this. 1160 * review and fix this.
1161 * 1161 *
1162 * Also would be nice to be able to handle IO errors and such, 1162 * Also would be nice to be able to handle IO errors and such,
1163 * but that's probably too much to ask. 1163 * but that's probably too much to ask.
1164 */ 1164 */
1165 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 1165 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1166 S_ISLNK(inode->i_mode))) 1166 S_ISLNK(inode->i_mode)))
1167 return; 1167 return;
1168 if (ext2_inode_is_fast_symlink(inode)) 1168 if (ext2_inode_is_fast_symlink(inode))
1169 return; 1169 return;
1170 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 1170 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1171 return; 1171 return;
1172 __ext2_truncate_blocks(inode, offset); 1172 __ext2_truncate_blocks(inode, offset);
1173 } 1173 }
1174 1174
1175 static int ext2_setsize(struct inode *inode, loff_t newsize) 1175 static int ext2_setsize(struct inode *inode, loff_t newsize)
1176 { 1176 {
1177 int error; 1177 int error;
1178 1178
1179 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 1179 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1180 S_ISLNK(inode->i_mode))) 1180 S_ISLNK(inode->i_mode)))
1181 return -EINVAL; 1181 return -EINVAL;
1182 if (ext2_inode_is_fast_symlink(inode)) 1182 if (ext2_inode_is_fast_symlink(inode))
1183 return -EINVAL; 1183 return -EINVAL;
1184 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 1184 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1185 return -EPERM; 1185 return -EPERM;
1186 1186
1187 inode_dio_wait(inode);
1188
1187 if (mapping_is_xip(inode->i_mapping)) 1189 if (mapping_is_xip(inode->i_mapping))
1188 error = xip_truncate_page(inode->i_mapping, newsize); 1190 error = xip_truncate_page(inode->i_mapping, newsize);
1189 else if (test_opt(inode->i_sb, NOBH)) 1191 else if (test_opt(inode->i_sb, NOBH))
1190 error = nobh_truncate_page(inode->i_mapping, 1192 error = nobh_truncate_page(inode->i_mapping,
1191 newsize, ext2_get_block); 1193 newsize, ext2_get_block);
1192 else 1194 else
1193 error = block_truncate_page(inode->i_mapping, 1195 error = block_truncate_page(inode->i_mapping,
1194 newsize, ext2_get_block); 1196 newsize, ext2_get_block);
1195 if (error) 1197 if (error)
1196 return error; 1198 return error;
1197 1199
1198 truncate_setsize(inode, newsize); 1200 truncate_setsize(inode, newsize);
1199 __ext2_truncate_blocks(inode, newsize); 1201 __ext2_truncate_blocks(inode, newsize);
1200 1202
1201 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 1203 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
1202 if (inode_needs_sync(inode)) { 1204 if (inode_needs_sync(inode)) {
1203 sync_mapping_buffers(inode->i_mapping); 1205 sync_mapping_buffers(inode->i_mapping);
1204 sync_inode_metadata(inode, 1); 1206 sync_inode_metadata(inode, 1);
1205 } else { 1207 } else {
1206 mark_inode_dirty(inode); 1208 mark_inode_dirty(inode);
1207 } 1209 }
1208 1210
1209 return 0; 1211 return 0;
1210 } 1212 }
1211 1213
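This is the change the commit makes for ext2: ext2_setsize() now calls inode_dio_wait() itself, after the permission checks but before any page is zeroed or the size changed, so direct I/O already in flight drains while the filesystem still controls when new references may appear. Semantically the wait is just "block until the in-flight counter reaches zero"; a rough user-space analogy, where dio_count stands in for the inode's i_dio_count and is not a kernel API:

    #include <stdatomic.h>
    #include <sched.h>
    #include <stdio.h>

    static atomic_int dio_count;     /* outstanding direct-I/O requests */

    /* analogue of inode_dio_wait(): return only once every
     * in-flight request has dropped its reference */
    static void dio_wait(void)
    {
        while (atomic_load(&dio_count) > 0)
            sched_yield();
    }

    int main(void)
    {
        dio_wait();                  /* nothing in flight: returns at once */
        printf("safe to truncate\n");
        return 0;
    }

Doing the wait here, rather than in the generic VFS code, lets ext2 pair it with truncate_mutex so no new allocation can slip in between the drain and the freeing of blocks.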
1212 static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino, 1214 static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino,
1213 struct buffer_head **p) 1215 struct buffer_head **p)
1214 { 1216 {
1215 struct buffer_head * bh; 1217 struct buffer_head * bh;
1216 unsigned long block_group; 1218 unsigned long block_group;
1217 unsigned long block; 1219 unsigned long block;
1218 unsigned long offset; 1220 unsigned long offset;
1219 struct ext2_group_desc * gdp; 1221 struct ext2_group_desc * gdp;
1220 1222
1221 *p = NULL; 1223 *p = NULL;
1222 if ((ino != EXT2_ROOT_INO && ino < EXT2_FIRST_INO(sb)) || 1224 if ((ino != EXT2_ROOT_INO && ino < EXT2_FIRST_INO(sb)) ||
1223 ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count)) 1225 ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count))
1224 goto Einval; 1226 goto Einval;
1225 1227
1226 block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb); 1228 block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb);
1227 gdp = ext2_get_group_desc(sb, block_group, NULL); 1229 gdp = ext2_get_group_desc(sb, block_group, NULL);
1228 if (!gdp) 1230 if (!gdp)
1229 goto Egdp; 1231 goto Egdp;
1230 /* 1232 /*
1231 * Figure out the offset within the block group inode table 1233 * Figure out the offset within the block group inode table
1232 */ 1234 */
1233 offset = ((ino - 1) % EXT2_INODES_PER_GROUP(sb)) * EXT2_INODE_SIZE(sb); 1235 offset = ((ino - 1) % EXT2_INODES_PER_GROUP(sb)) * EXT2_INODE_SIZE(sb);
1234 block = le32_to_cpu(gdp->bg_inode_table) + 1236 block = le32_to_cpu(gdp->bg_inode_table) +
1235 (offset >> EXT2_BLOCK_SIZE_BITS(sb)); 1237 (offset >> EXT2_BLOCK_SIZE_BITS(sb));
1236 if (!(bh = sb_bread(sb, block))) 1238 if (!(bh = sb_bread(sb, block)))
1237 goto Eio; 1239 goto Eio;
1238 1240
1239 *p = bh; 1241 *p = bh;
1240 offset &= (EXT2_BLOCK_SIZE(sb) - 1); 1242 offset &= (EXT2_BLOCK_SIZE(sb) - 1);
1241 return (struct ext2_inode *) (bh->b_data + offset); 1243 return (struct ext2_inode *) (bh->b_data + offset);
1242 1244
1243 Einval: 1245 Einval:
1244 ext2_error(sb, "ext2_get_inode", "bad inode number: %lu", 1246 ext2_error(sb, "ext2_get_inode", "bad inode number: %lu",
1245 (unsigned long) ino); 1247 (unsigned long) ino);
1246 return ERR_PTR(-EINVAL); 1248 return ERR_PTR(-EINVAL);
1247 Eio: 1249 Eio:
1248 ext2_error(sb, "ext2_get_inode", 1250 ext2_error(sb, "ext2_get_inode",
1249 "unable to read inode block - inode=%lu, block=%lu", 1251 "unable to read inode block - inode=%lu, block=%lu",
1250 (unsigned long) ino, block); 1252 (unsigned long) ino, block);
1251 Egdp: 1253 Egdp:
1252 return ERR_PTR(-EIO); 1254 return ERR_PTR(-EIO);
1253 } 1255 }
1254 1256
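The locator arithmetic above is worth tracing once: the group is (ino - 1) / inodes_per_group, the byte offset of the inode within that group's table is ((ino - 1) % inodes_per_group) * inode_size, and the on-disk block is the table's first block plus offset >> block_size_bits. Worked through with plausible mke2fs-style values (illustrative numbers, not read from a real superblock):

    #include <stdio.h>

    int main(void)
    {
        unsigned long ino = 5000;
        unsigned long inodes_per_group = 1976;
        unsigned long inode_size = 128;     /* bytes */
        unsigned long table_block = 231;    /* bg_inode_table of that group */
        unsigned block_bits = 10;           /* 1 KiB blocks */

        unsigned long group  = (ino - 1) / inodes_per_group;          /* 2 */
        unsigned long offset = ((ino - 1) % inodes_per_group) * inode_size;
        unsigned long block  = table_block + (offset >> block_bits);

        printf("group=%lu block=%lu offset-in-block=%lu\n",
               group, block, offset & ((1UL << block_bits) - 1));
        /* prints: group=2 block=361 offset-in-block=896 */
        return 0;
    }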
1255 void ext2_set_inode_flags(struct inode *inode) 1257 void ext2_set_inode_flags(struct inode *inode)
1256 { 1258 {
1257 unsigned int flags = EXT2_I(inode)->i_flags; 1259 unsigned int flags = EXT2_I(inode)->i_flags;
1258 1260
1259 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 1261 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
1260 if (flags & EXT2_SYNC_FL) 1262 if (flags & EXT2_SYNC_FL)
1261 inode->i_flags |= S_SYNC; 1263 inode->i_flags |= S_SYNC;
1262 if (flags & EXT2_APPEND_FL) 1264 if (flags & EXT2_APPEND_FL)
1263 inode->i_flags |= S_APPEND; 1265 inode->i_flags |= S_APPEND;
1264 if (flags & EXT2_IMMUTABLE_FL) 1266 if (flags & EXT2_IMMUTABLE_FL)
1265 inode->i_flags |= S_IMMUTABLE; 1267 inode->i_flags |= S_IMMUTABLE;
1266 if (flags & EXT2_NOATIME_FL) 1268 if (flags & EXT2_NOATIME_FL)
1267 inode->i_flags |= S_NOATIME; 1269 inode->i_flags |= S_NOATIME;
1268 if (flags & EXT2_DIRSYNC_FL) 1270 if (flags & EXT2_DIRSYNC_FL)
1269 inode->i_flags |= S_DIRSYNC; 1271 inode->i_flags |= S_DIRSYNC;
1270 } 1272 }
1271 1273
1272 /* Propagate flags from i_flags to EXT2_I(inode)->i_flags */ 1274 /* Propagate flags from i_flags to EXT2_I(inode)->i_flags */
1273 void ext2_get_inode_flags(struct ext2_inode_info *ei) 1275 void ext2_get_inode_flags(struct ext2_inode_info *ei)
1274 { 1276 {
1275 unsigned int flags = ei->vfs_inode.i_flags; 1277 unsigned int flags = ei->vfs_inode.i_flags;
1276 1278
1277 ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL| 1279 ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL|
1278 EXT2_IMMUTABLE_FL|EXT2_NOATIME_FL|EXT2_DIRSYNC_FL); 1280 EXT2_IMMUTABLE_FL|EXT2_NOATIME_FL|EXT2_DIRSYNC_FL);
1279 if (flags & S_SYNC) 1281 if (flags & S_SYNC)
1280 ei->i_flags |= EXT2_SYNC_FL; 1282 ei->i_flags |= EXT2_SYNC_FL;
1281 if (flags & S_APPEND) 1283 if (flags & S_APPEND)
1282 ei->i_flags |= EXT2_APPEND_FL; 1284 ei->i_flags |= EXT2_APPEND_FL;
1283 if (flags & S_IMMUTABLE) 1285 if (flags & S_IMMUTABLE)
1284 ei->i_flags |= EXT2_IMMUTABLE_FL; 1286 ei->i_flags |= EXT2_IMMUTABLE_FL;
1285 if (flags & S_NOATIME) 1287 if (flags & S_NOATIME)
1286 ei->i_flags |= EXT2_NOATIME_FL; 1288 ei->i_flags |= EXT2_NOATIME_FL;
1287 if (flags & S_DIRSYNC) 1289 if (flags & S_DIRSYNC)
1288 ei->i_flags |= EXT2_DIRSYNC_FL; 1290 ei->i_flags |= EXT2_DIRSYNC_FL;
1289 } 1291 }
1290 1292
1291 struct inode *ext2_iget (struct super_block *sb, unsigned long ino) 1293 struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
1292 { 1294 {
1293 struct ext2_inode_info *ei; 1295 struct ext2_inode_info *ei;
1294 struct buffer_head * bh; 1296 struct buffer_head * bh;
1295 struct ext2_inode *raw_inode; 1297 struct ext2_inode *raw_inode;
1296 struct inode *inode; 1298 struct inode *inode;
1297 long ret = -EIO; 1299 long ret = -EIO;
1298 int n; 1300 int n;
1299 1301
1300 inode = iget_locked(sb, ino); 1302 inode = iget_locked(sb, ino);
1301 if (!inode) 1303 if (!inode)
1302 return ERR_PTR(-ENOMEM); 1304 return ERR_PTR(-ENOMEM);
1303 if (!(inode->i_state & I_NEW)) 1305 if (!(inode->i_state & I_NEW))
1304 return inode; 1306 return inode;
1305 1307
1306 ei = EXT2_I(inode); 1308 ei = EXT2_I(inode);
1307 ei->i_block_alloc_info = NULL; 1309 ei->i_block_alloc_info = NULL;
1308 1310
1309 raw_inode = ext2_get_inode(inode->i_sb, ino, &bh); 1311 raw_inode = ext2_get_inode(inode->i_sb, ino, &bh);
1310 if (IS_ERR(raw_inode)) { 1312 if (IS_ERR(raw_inode)) {
1311 ret = PTR_ERR(raw_inode); 1313 ret = PTR_ERR(raw_inode);
1312 goto bad_inode; 1314 goto bad_inode;
1313 } 1315 }
1314 1316
1315 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 1317 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
1316 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 1318 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
1317 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 1319 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
1318 if (!(test_opt (inode->i_sb, NO_UID32))) { 1320 if (!(test_opt (inode->i_sb, NO_UID32))) {
1319 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 1321 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
1320 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 1322 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
1321 } 1323 }
1322 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 1324 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
1323 inode->i_size = le32_to_cpu(raw_inode->i_size); 1325 inode->i_size = le32_to_cpu(raw_inode->i_size);
1324 inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); 1326 inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
1325 inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); 1327 inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
1326 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); 1328 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
1327 inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0; 1329 inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0;
1328 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 1330 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
1329 /* We now have enough fields to check if the inode was active or not. 1331 /* We now have enough fields to check if the inode was active or not.
1330 * This is needed because nfsd might try to access dead inodes; 1332 * This is needed because nfsd might try to access dead inodes;
1331 * the test is the same one that e2fsck uses. 1333 * the test is the same one that e2fsck uses.
1332 * NeilBrown 1999oct15 1334 * NeilBrown 1999oct15
1333 */ 1335 */
1334 if (inode->i_nlink == 0 && (inode->i_mode == 0 || ei->i_dtime)) { 1336 if (inode->i_nlink == 0 && (inode->i_mode == 0 || ei->i_dtime)) {
1335 /* this inode is deleted */ 1337 /* this inode is deleted */
1336 brelse (bh); 1338 brelse (bh);
1337 ret = -ESTALE; 1339 ret = -ESTALE;
1338 goto bad_inode; 1340 goto bad_inode;
1339 } 1341 }
1340 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); 1342 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
1341 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 1343 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
1342 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); 1344 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
1343 ei->i_frag_no = raw_inode->i_frag; 1345 ei->i_frag_no = raw_inode->i_frag;
1344 ei->i_frag_size = raw_inode->i_fsize; 1346 ei->i_frag_size = raw_inode->i_fsize;
1345 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); 1347 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
1346 ei->i_dir_acl = 0; 1348 ei->i_dir_acl = 0;
1347 if (S_ISREG(inode->i_mode)) 1349 if (S_ISREG(inode->i_mode))
1348 inode->i_size |= ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; 1350 inode->i_size |= ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
1349 else 1351 else
1350 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); 1352 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
1351 ei->i_dtime = 0; 1353 ei->i_dtime = 0;
1352 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 1354 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
1353 ei->i_state = 0; 1355 ei->i_state = 0;
1354 ei->i_block_group = (ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb); 1356 ei->i_block_group = (ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb);
1355 ei->i_dir_start_lookup = 0; 1357 ei->i_dir_start_lookup = 0;
1356 1358
1357 /* 1359 /*
1358 * NOTE! The in-memory inode i_data array is in little-endian order 1360 * NOTE! The in-memory inode i_data array is in little-endian order
1359 * even on big-endian machines: we do NOT byteswap the block numbers! 1361 * even on big-endian machines: we do NOT byteswap the block numbers!
1360 */ 1362 */
1361 for (n = 0; n < EXT2_N_BLOCKS; n++) 1363 for (n = 0; n < EXT2_N_BLOCKS; n++)
1362 ei->i_data[n] = raw_inode->i_block[n]; 1364 ei->i_data[n] = raw_inode->i_block[n];
1363 1365
1364 if (S_ISREG(inode->i_mode)) { 1366 if (S_ISREG(inode->i_mode)) {
1365 inode->i_op = &ext2_file_inode_operations; 1367 inode->i_op = &ext2_file_inode_operations;
1366 if (ext2_use_xip(inode->i_sb)) { 1368 if (ext2_use_xip(inode->i_sb)) {
1367 inode->i_mapping->a_ops = &ext2_aops_xip; 1369 inode->i_mapping->a_ops = &ext2_aops_xip;
1368 inode->i_fop = &ext2_xip_file_operations; 1370 inode->i_fop = &ext2_xip_file_operations;
1369 } else if (test_opt(inode->i_sb, NOBH)) { 1371 } else if (test_opt(inode->i_sb, NOBH)) {
1370 inode->i_mapping->a_ops = &ext2_nobh_aops; 1372 inode->i_mapping->a_ops = &ext2_nobh_aops;
1371 inode->i_fop = &ext2_file_operations; 1373 inode->i_fop = &ext2_file_operations;
1372 } else { 1374 } else {
1373 inode->i_mapping->a_ops = &ext2_aops; 1375 inode->i_mapping->a_ops = &ext2_aops;
1374 inode->i_fop = &ext2_file_operations; 1376 inode->i_fop = &ext2_file_operations;
1375 } 1377 }
1376 } else if (S_ISDIR(inode->i_mode)) { 1378 } else if (S_ISDIR(inode->i_mode)) {
1377 inode->i_op = &ext2_dir_inode_operations; 1379 inode->i_op = &ext2_dir_inode_operations;
1378 inode->i_fop = &ext2_dir_operations; 1380 inode->i_fop = &ext2_dir_operations;
1379 if (test_opt(inode->i_sb, NOBH)) 1381 if (test_opt(inode->i_sb, NOBH))
1380 inode->i_mapping->a_ops = &ext2_nobh_aops; 1382 inode->i_mapping->a_ops = &ext2_nobh_aops;
1381 else 1383 else
1382 inode->i_mapping->a_ops = &ext2_aops; 1384 inode->i_mapping->a_ops = &ext2_aops;
1383 } else if (S_ISLNK(inode->i_mode)) { 1385 } else if (S_ISLNK(inode->i_mode)) {
1384 if (ext2_inode_is_fast_symlink(inode)) { 1386 if (ext2_inode_is_fast_symlink(inode)) {
1385 inode->i_op = &ext2_fast_symlink_inode_operations; 1387 inode->i_op = &ext2_fast_symlink_inode_operations;
1386 nd_terminate_link(ei->i_data, inode->i_size, 1388 nd_terminate_link(ei->i_data, inode->i_size,
1387 sizeof(ei->i_data) - 1); 1389 sizeof(ei->i_data) - 1);
1388 } else { 1390 } else {
1389 inode->i_op = &ext2_symlink_inode_operations; 1391 inode->i_op = &ext2_symlink_inode_operations;
1390 if (test_opt(inode->i_sb, NOBH)) 1392 if (test_opt(inode->i_sb, NOBH))
1391 inode->i_mapping->a_ops = &ext2_nobh_aops; 1393 inode->i_mapping->a_ops = &ext2_nobh_aops;
1392 else 1394 else
1393 inode->i_mapping->a_ops = &ext2_aops; 1395 inode->i_mapping->a_ops = &ext2_aops;
1394 } 1396 }
1395 } else { 1397 } else {
1396 inode->i_op = &ext2_special_inode_operations; 1398 inode->i_op = &ext2_special_inode_operations;
1397 if (raw_inode->i_block[0]) 1399 if (raw_inode->i_block[0])
1398 init_special_inode(inode, inode->i_mode, 1400 init_special_inode(inode, inode->i_mode,
1399 old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); 1401 old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
1400 else 1402 else
1401 init_special_inode(inode, inode->i_mode, 1403 init_special_inode(inode, inode->i_mode,
1402 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 1404 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
1403 } 1405 }
1404 brelse (bh); 1406 brelse (bh);
1405 ext2_set_inode_flags(inode); 1407 ext2_set_inode_flags(inode);
1406 unlock_new_inode(inode); 1408 unlock_new_inode(inode);
1407 return inode; 1409 return inode;
1408 1410
1409 bad_inode: 1411 bad_inode:
1410 iget_failed(inode); 1412 iget_failed(inode);
1411 return ERR_PTR(ret); 1413 return ERR_PTR(ret);
1412 } 1414 }
1413 1415
1414 static int __ext2_write_inode(struct inode *inode, int do_sync) 1416 static int __ext2_write_inode(struct inode *inode, int do_sync)
1415 { 1417 {
1416 struct ext2_inode_info *ei = EXT2_I(inode); 1418 struct ext2_inode_info *ei = EXT2_I(inode);
1417 struct super_block *sb = inode->i_sb; 1419 struct super_block *sb = inode->i_sb;
1418 ino_t ino = inode->i_ino; 1420 ino_t ino = inode->i_ino;
1419 uid_t uid = inode->i_uid; 1421 uid_t uid = inode->i_uid;
1420 gid_t gid = inode->i_gid; 1422 gid_t gid = inode->i_gid;
1421 struct buffer_head * bh; 1423 struct buffer_head * bh;
1422 struct ext2_inode * raw_inode = ext2_get_inode(sb, ino, &bh); 1424 struct ext2_inode * raw_inode = ext2_get_inode(sb, ino, &bh);
1423 int n; 1425 int n;
1424 int err = 0; 1426 int err = 0;
1425 1427
1426 if (IS_ERR(raw_inode)) 1428 if (IS_ERR(raw_inode))
1427 return -EIO; 1429 return -EIO;
1428 1430
1429 /* For fields not tracked in the in-memory inode, 1431 /* For fields not tracked in the in-memory inode,
1430 * initialise them to zero for new inodes. */ 1432 * initialise them to zero for new inodes. */
1431 if (ei->i_state & EXT2_STATE_NEW) 1433 if (ei->i_state & EXT2_STATE_NEW)
1432 memset(raw_inode, 0, EXT2_SB(sb)->s_inode_size); 1434 memset(raw_inode, 0, EXT2_SB(sb)->s_inode_size);
1433 1435
1434 ext2_get_inode_flags(ei); 1436 ext2_get_inode_flags(ei);
1435 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 1437 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
1436 if (!(test_opt(sb, NO_UID32))) { 1438 if (!(test_opt(sb, NO_UID32))) {
1437 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid)); 1439 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid));
1438 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(gid)); 1440 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(gid));
1439 /* 1441 /*
1440 * Fix up interoperability with old kernels. Otherwise, old inodes get 1442 * Fix up interoperability with old kernels. Otherwise, old inodes get
1441 * re-used with the upper 16 bits of the uid/gid intact 1443 * re-used with the upper 16 bits of the uid/gid intact
1442 */ 1444 */
1443 if (!ei->i_dtime) { 1445 if (!ei->i_dtime) {
1444 raw_inode->i_uid_high = cpu_to_le16(high_16_bits(uid)); 1446 raw_inode->i_uid_high = cpu_to_le16(high_16_bits(uid));
1445 raw_inode->i_gid_high = cpu_to_le16(high_16_bits(gid)); 1447 raw_inode->i_gid_high = cpu_to_le16(high_16_bits(gid));
1446 } else { 1448 } else {
1447 raw_inode->i_uid_high = 0; 1449 raw_inode->i_uid_high = 0;
1448 raw_inode->i_gid_high = 0; 1450 raw_inode->i_gid_high = 0;
1449 } 1451 }
1450 } else { 1452 } else {
1451 raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(uid)); 1453 raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(uid));
1452 raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(gid)); 1454 raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(gid));
1453 raw_inode->i_uid_high = 0; 1455 raw_inode->i_uid_high = 0;
1454 raw_inode->i_gid_high = 0; 1456 raw_inode->i_gid_high = 0;
1455 } 1457 }
1456 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 1458 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
1457 raw_inode->i_size = cpu_to_le32(inode->i_size); 1459 raw_inode->i_size = cpu_to_le32(inode->i_size);
1458 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); 1460 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
1459 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 1461 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
1460 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); 1462 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
1461 1463
1462 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); 1464 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
1463 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 1465 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
1464 raw_inode->i_flags = cpu_to_le32(ei->i_flags); 1466 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
1465 raw_inode->i_faddr = cpu_to_le32(ei->i_faddr); 1467 raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
1466 raw_inode->i_frag = ei->i_frag_no; 1468 raw_inode->i_frag = ei->i_frag_no;
1467 raw_inode->i_fsize = ei->i_frag_size; 1469 raw_inode->i_fsize = ei->i_frag_size;
1468 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); 1470 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
1469 if (!S_ISREG(inode->i_mode)) 1471 if (!S_ISREG(inode->i_mode))
1470 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); 1472 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
1471 else { 1473 else {
1472 raw_inode->i_size_high = cpu_to_le32(inode->i_size >> 32); 1474 raw_inode->i_size_high = cpu_to_le32(inode->i_size >> 32);
1473 if (inode->i_size > 0x7fffffffULL) { 1475 if (inode->i_size > 0x7fffffffULL) {
1474 if (!EXT2_HAS_RO_COMPAT_FEATURE(sb, 1476 if (!EXT2_HAS_RO_COMPAT_FEATURE(sb,
1475 EXT2_FEATURE_RO_COMPAT_LARGE_FILE) || 1477 EXT2_FEATURE_RO_COMPAT_LARGE_FILE) ||
1476 EXT2_SB(sb)->s_es->s_rev_level == 1478 EXT2_SB(sb)->s_es->s_rev_level ==
1477 cpu_to_le32(EXT2_GOOD_OLD_REV)) { 1479 cpu_to_le32(EXT2_GOOD_OLD_REV)) {
1478 /* If this is the first large file 1480 /* If this is the first large file
1479 * created, add a flag to the superblock. 1481 * created, add a flag to the superblock.
1480 */ 1482 */
1481 spin_lock(&EXT2_SB(sb)->s_lock); 1483 spin_lock(&EXT2_SB(sb)->s_lock);
1482 ext2_update_dynamic_rev(sb); 1484 ext2_update_dynamic_rev(sb);
1483 EXT2_SET_RO_COMPAT_FEATURE(sb, 1485 EXT2_SET_RO_COMPAT_FEATURE(sb,
1484 EXT2_FEATURE_RO_COMPAT_LARGE_FILE); 1486 EXT2_FEATURE_RO_COMPAT_LARGE_FILE);
1485 spin_unlock(&EXT2_SB(sb)->s_lock); 1487 spin_unlock(&EXT2_SB(sb)->s_lock);
1486 ext2_write_super(sb); 1488 ext2_write_super(sb);
1487 } 1489 }
1488 } 1490 }
1489 } 1491 }
1490 1492
1491 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 1493 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
1492 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 1494 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
1493 if (old_valid_dev(inode->i_rdev)) { 1495 if (old_valid_dev(inode->i_rdev)) {
1494 raw_inode->i_block[0] = 1496 raw_inode->i_block[0] =
1495 cpu_to_le32(old_encode_dev(inode->i_rdev)); 1497 cpu_to_le32(old_encode_dev(inode->i_rdev));
1496 raw_inode->i_block[1] = 0; 1498 raw_inode->i_block[1] = 0;
1497 } else { 1499 } else {
1498 raw_inode->i_block[0] = 0; 1500 raw_inode->i_block[0] = 0;
1499 raw_inode->i_block[1] = 1501 raw_inode->i_block[1] =
1500 cpu_to_le32(new_encode_dev(inode->i_rdev)); 1502 cpu_to_le32(new_encode_dev(inode->i_rdev));
1501 raw_inode->i_block[2] = 0; 1503 raw_inode->i_block[2] = 0;
1502 } 1504 }
1503 } else for (n = 0; n < EXT2_N_BLOCKS; n++) 1505 } else for (n = 0; n < EXT2_N_BLOCKS; n++)
1504 raw_inode->i_block[n] = ei->i_data[n]; 1506 raw_inode->i_block[n] = ei->i_data[n];
1505 mark_buffer_dirty(bh); 1507 mark_buffer_dirty(bh);
1506 if (do_sync) { 1508 if (do_sync) {
1507 sync_dirty_buffer(bh); 1509 sync_dirty_buffer(bh);
1508 if (buffer_req(bh) && !buffer_uptodate(bh)) { 1510 if (buffer_req(bh) && !buffer_uptodate(bh)) {
1509 printk ("IO error syncing ext2 inode [%s:%08lx]\n", 1511 printk ("IO error syncing ext2 inode [%s:%08lx]\n",
1510 sb->s_id, (unsigned long) ino); 1512 sb->s_id, (unsigned long) ino);
1511 err = -EIO; 1513 err = -EIO;
1512 } 1514 }
1513 } 1515 }
1514 ei->i_state &= ~EXT2_STATE_NEW; 1516 ei->i_state &= ~EXT2_STATE_NEW;
1515 brelse (bh); 1517 brelse (bh);
1516 return err; 1518 return err;
1517 } 1519 }
1518 1520
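The high/low split above keeps compatibility with revision-0 filesystems: old kernels only know the 16-bit i_uid_low/i_gid_low fields, so a 32-bit id is stored as two halves, and the high halves are zeroed once an inode is deleted so stale bits cannot leak into a reused slot. The split itself is plain masking and shifting; low_16_bits()/high_16_bits() are reimplemented here purely for illustration:

    #include <stdio.h>
    #include <stdint.h>

    #define low_16_bits(x)  ((x) & 0xFFFF)
    #define high_16_bits(x) (((x) & 0xFFFF0000) >> 16)

    int main(void)
    {
        uint32_t uid = 70000;            /* does not fit in 16 bits */

        printf("low=%u high=%u rebuilt=%u\n",
               (unsigned)low_16_bits(uid),
               (unsigned)high_16_bits(uid),
               (unsigned)((high_16_bits(uid) << 16) | low_16_bits(uid)));
        /* prints: low=4464 high=1 rebuilt=70000 */
        return 0;
    }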
1519 int ext2_write_inode(struct inode *inode, struct writeback_control *wbc) 1521 int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
1520 { 1522 {
1521 return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); 1523 return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1522 } 1524 }
1523 1525
1524 int ext2_setattr(struct dentry *dentry, struct iattr *iattr) 1526 int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1525 { 1527 {
1526 struct inode *inode = dentry->d_inode; 1528 struct inode *inode = dentry->d_inode;
1527 int error; 1529 int error;
1528 1530
1529 error = inode_change_ok(inode, iattr); 1531 error = inode_change_ok(inode, iattr);
1530 if (error) 1532 if (error)
1531 return error; 1533 return error;
1532 1534
1533 if (is_quota_modification(inode, iattr)) 1535 if (is_quota_modification(inode, iattr))
1534 dquot_initialize(inode); 1536 dquot_initialize(inode);
1535 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || 1537 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
1536 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { 1538 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
1537 error = dquot_transfer(inode, iattr); 1539 error = dquot_transfer(inode, iattr);
1538 if (error) 1540 if (error)
1539 return error; 1541 return error;
1540 } 1542 }
1541 if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) { 1543 if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) {
1542 error = ext2_setsize(inode, iattr->ia_size); 1544 error = ext2_setsize(inode, iattr->ia_size);
1543 if (error) 1545 if (error)
1544 return error; 1546 return error;
1545 } 1547 }
1546 setattr_copy(inode, iattr); 1548 setattr_copy(inode, iattr);
1547 if (iattr->ia_valid & ATTR_MODE) 1549 if (iattr->ia_valid & ATTR_MODE)
1548 error = ext2_acl_chmod(inode); 1550 error = ext2_acl_chmod(inode);
1549 mark_inode_dirty(inode); 1551 mark_inode_dirty(inode);
1550 1552
1551 return error; 1553 return error;
1552 } 1554 }
1553 1555
1 /* 1 /*
2 * linux/fs/ext3/inode.c 2 * linux/fs/ext3/inode.c
3 * 3 *
4 * Copyright (C) 1992, 1993, 1994, 1995 4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr) 5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal 6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI) 7 * Universite Pierre et Marie Curie (Paris VI)
8 * 8 *
9 * from 9 * from
10 * 10 *
11 * linux/fs/minix/inode.c 11 * linux/fs/minix/inode.c
12 * 12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds 13 * Copyright (C) 1991, 1992 Linus Torvalds
14 * 14 *
15 * Goal-directed block allocation by Stephen Tweedie 15 * Goal-directed block allocation by Stephen Tweedie
16 * (sct@redhat.com), 1993, 1998 16 * (sct@redhat.com), 1993, 1998
17 * Big-endian to little-endian byte-swapping/bitmaps by 17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995 18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 * 64-bit file support on 64-bit platforms by Jakub Jelinek 19 * 64-bit file support on 64-bit platforms by Jakub Jelinek
20 * (jj@sunsite.ms.mff.cuni.cz) 20 * (jj@sunsite.ms.mff.cuni.cz)
21 * 21 *
22 * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 22 * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
23 */ 23 */
24 24
25 #include <linux/module.h> 25 #include <linux/module.h>
26 #include <linux/fs.h> 26 #include <linux/fs.h>
27 #include <linux/time.h> 27 #include <linux/time.h>
28 #include <linux/ext3_jbd.h> 28 #include <linux/ext3_jbd.h>
29 #include <linux/jbd.h> 29 #include <linux/jbd.h>
30 #include <linux/highuid.h> 30 #include <linux/highuid.h>
31 #include <linux/pagemap.h> 31 #include <linux/pagemap.h>
32 #include <linux/quotaops.h> 32 #include <linux/quotaops.h>
33 #include <linux/string.h> 33 #include <linux/string.h>
34 #include <linux/buffer_head.h> 34 #include <linux/buffer_head.h>
35 #include <linux/writeback.h> 35 #include <linux/writeback.h>
36 #include <linux/mpage.h> 36 #include <linux/mpage.h>
37 #include <linux/uio.h> 37 #include <linux/uio.h>
38 #include <linux/bio.h> 38 #include <linux/bio.h>
39 #include <linux/fiemap.h> 39 #include <linux/fiemap.h>
40 #include <linux/namei.h> 40 #include <linux/namei.h>
41 #include "xattr.h" 41 #include "xattr.h"
42 #include "acl.h" 42 #include "acl.h"
43 43
44 static int ext3_writepage_trans_blocks(struct inode *inode); 44 static int ext3_writepage_trans_blocks(struct inode *inode);
45 45
46 /* 46 /*
47 * Test whether an inode is a fast symlink. 47 * Test whether an inode is a fast symlink.
48 */ 48 */
49 static int ext3_inode_is_fast_symlink(struct inode *inode) 49 static int ext3_inode_is_fast_symlink(struct inode *inode)
50 { 50 {
51 int ea_blocks = EXT3_I(inode)->i_file_acl ? 51 int ea_blocks = EXT3_I(inode)->i_file_acl ?
52 (inode->i_sb->s_blocksize >> 9) : 0; 52 (inode->i_sb->s_blocksize >> 9) : 0;
53 53
54 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); 54 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
55 } 55 }
56 56
57 /* 57 /*
58 * The ext3 forget function must perform a revoke if we are freeing data 58 * The ext3 forget function must perform a revoke if we are freeing data
59 * which has been journaled. Metadata (eg. indirect blocks) must be 59 * which has been journaled. Metadata (eg. indirect blocks) must be
60 * revoked in all cases. 60 * revoked in all cases.
61 * 61 *
62 * "bh" may be NULL: a metadata block may have been freed from memory 62 * "bh" may be NULL: a metadata block may have been freed from memory
63 * but there may still be a record of it in the journal, and that record 63 * but there may still be a record of it in the journal, and that record
64 * still needs to be revoked. 64 * still needs to be revoked.
65 */ 65 */
66 int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, 66 int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
67 struct buffer_head *bh, ext3_fsblk_t blocknr) 67 struct buffer_head *bh, ext3_fsblk_t blocknr)
68 { 68 {
69 int err; 69 int err;
70 70
71 might_sleep(); 71 might_sleep();
72 72
73 BUFFER_TRACE(bh, "enter"); 73 BUFFER_TRACE(bh, "enter");
74 74
75 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " 75 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
76 "data mode %lx\n", 76 "data mode %lx\n",
77 bh, is_metadata, inode->i_mode, 77 bh, is_metadata, inode->i_mode,
78 test_opt(inode->i_sb, DATA_FLAGS)); 78 test_opt(inode->i_sb, DATA_FLAGS));
79 79
80 /* Never use the revoke function if we are doing full data 80 /* Never use the revoke function if we are doing full data
81 * journaling: there is no need to, and a V1 superblock won't 81 * journaling: there is no need to, and a V1 superblock won't
82 * support it. Otherwise, only skip the revoke on un-journaled 82 * support it. Otherwise, only skip the revoke on un-journaled
83 * data blocks. */ 83 * data blocks. */
84 84
85 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA || 85 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
86 (!is_metadata && !ext3_should_journal_data(inode))) { 86 (!is_metadata && !ext3_should_journal_data(inode))) {
87 if (bh) { 87 if (bh) {
88 BUFFER_TRACE(bh, "call journal_forget"); 88 BUFFER_TRACE(bh, "call journal_forget");
89 return ext3_journal_forget(handle, bh); 89 return ext3_journal_forget(handle, bh);
90 } 90 }
91 return 0; 91 return 0;
92 } 92 }
93 93
94 /* 94 /*
95 * data!=journal && (is_metadata || should_journal_data(inode)) 95 * data!=journal && (is_metadata || should_journal_data(inode))
96 */ 96 */
97 BUFFER_TRACE(bh, "call ext3_journal_revoke"); 97 BUFFER_TRACE(bh, "call ext3_journal_revoke");
98 err = ext3_journal_revoke(handle, blocknr, bh); 98 err = ext3_journal_revoke(handle, blocknr, bh);
99 if (err) 99 if (err)
100 ext3_abort(inode->i_sb, __func__, 100 ext3_abort(inode->i_sb, __func__,
101 "error %d when attempting revoke", err); 101 "error %d when attempting revoke", err);
102 BUFFER_TRACE(bh, "exit"); 102 BUFFER_TRACE(bh, "exit");
103 return err; 103 return err;
104 } 104 }
105 105
106 /* 106 /*
107 * Work out how many blocks we need to proceed with the next chunk of a 107 * Work out how many blocks we need to proceed with the next chunk of a
108 * truncate transaction. 108 * truncate transaction.
109 */ 109 */
110 static unsigned long blocks_for_truncate(struct inode *inode) 110 static unsigned long blocks_for_truncate(struct inode *inode)
111 { 111 {
112 unsigned long needed; 112 unsigned long needed;
113 113
114 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); 114 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
115 115
116 /* Give ourselves just enough room to cope with inodes in which 116 /* Give ourselves just enough room to cope with inodes in which
117 * i_blocks is corrupt: we've seen disk corruptions in the past 117 * i_blocks is corrupt: we've seen disk corruptions in the past
118 * which resulted in random data in an inode which looked enough 118 * which resulted in random data in an inode which looked enough
119 * like a regular file for ext3 to try to delete it. Things 119 * like a regular file for ext3 to try to delete it. Things
120 * will go a bit crazy if that happens, but at least we should 120 * will go a bit crazy if that happens, but at least we should
121 * try not to panic the whole kernel. */ 121 * try not to panic the whole kernel. */
122 if (needed < 2) 122 if (needed < 2)
123 needed = 2; 123 needed = 2;
124 124
125 /* But we need to bound the transaction so we don't overflow the 125 /* But we need to bound the transaction so we don't overflow the
126 * journal. */ 126 * journal. */
127 if (needed > EXT3_MAX_TRANS_DATA) 127 if (needed > EXT3_MAX_TRANS_DATA)
128 needed = EXT3_MAX_TRANS_DATA; 128 needed = EXT3_MAX_TRANS_DATA;
129 129
130 return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed; 130 return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
131 } 131 }
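/*
 * Worked example (illustrative, assuming a 4KB block size): with
 * s_blocksize_bits = 12 and i_blocks counted in 512-byte sectors, a 1MB
 * file has i_blocks = 2048, so needed = 2048 >> (12 - 9) = 256
 * filesystem blocks, which is then clamped to EXT3_MAX_TRANS_DATA.
 */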

/*
 * Truncate transactions can be complex and absolutely huge.  So we need to
 * be able to restart the transaction at a convenient checkpoint to make
 * sure we don't overflow the journal.
 *
 * start_transaction gets us a new handle for a truncate transaction,
 * and extend_transaction tries to extend the existing one a bit.  If
 * extend fails, we need to propagate the failure up and restart the
 * transaction in the top-level truncate loop. --sct
 */
static handle_t *start_transaction(struct inode *inode)
{
	handle_t *result;

	result = ext3_journal_start(inode, blocks_for_truncate(inode));
	if (!IS_ERR(result))
		return result;

	ext3_std_error(inode->i_sb, PTR_ERR(result));
	return result;
}

/*
 * Try to extend this transaction for the purposes of truncation.
 *
 * Returns 0 if we managed to create more room.  If we can't create more
 * room, and the transaction must be restarted we return 1.
 */
static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
{
	if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
		return 0;
	if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
		return 0;
	return 1;
}

/*
 * Restart the transaction associated with *handle.  This does a commit,
 * so before we call here everything must be consistently dirtied against
 * this transaction.
 */
static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
{
	int ret;

	jbd_debug(2, "restarting handle %p\n", handle);
	/*
	 * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle.
	 * At this moment, get_block can be called only for blocks inside
	 * i_size since page cache has been already dropped and writes are
	 * blocked by i_mutex.  So we can safely drop the truncate_mutex.
	 */
	mutex_unlock(&EXT3_I(inode)->truncate_mutex);
	ret = ext3_journal_restart(handle, blocks_for_truncate(inode));
	mutex_lock(&EXT3_I(inode)->truncate_mutex);
	return ret;
}

/*
 * Called at inode eviction from icache
 */
void ext3_evict_inode (struct inode *inode)
{
	struct ext3_block_alloc_info *rsv;
	handle_t *handle;
	int want_delete = 0;

	if (!inode->i_nlink && !is_bad_inode(inode)) {
		dquot_initialize(inode);
		want_delete = 1;
	}

	truncate_inode_pages(&inode->i_data, 0);

	ext3_discard_reservation(inode);
	rsv = EXT3_I(inode)->i_block_alloc_info;
	EXT3_I(inode)->i_block_alloc_info = NULL;
	if (unlikely(rsv))
		kfree(rsv);

	if (!want_delete)
		goto no_delete;

	handle = start_transaction(inode);
	if (IS_ERR(handle)) {
		/*
		 * If we're going to skip the normal cleanup, we still need to
		 * make sure that the in-core orphan linked list is properly
		 * cleaned up.
		 */
		ext3_orphan_del(NULL, inode);
		goto no_delete;
	}

	if (IS_SYNC(inode))
		handle->h_sync = 1;
	inode->i_size = 0;
	if (inode->i_blocks)
		ext3_truncate(inode);
	/*
	 * Kill off the orphan record which ext3_truncate created.
	 * AKPM: I think this can be inside the above `if'.
	 * Note that ext3_orphan_del() has to be able to cope with the
	 * deletion of a non-existent orphan - this is because we don't
	 * know if ext3_truncate() actually created an orphan record.
	 * (Well, we could do this if we need to, but heck - it works)
	 */
	ext3_orphan_del(handle, inode);
	EXT3_I(inode)->i_dtime = get_seconds();

	/*
	 * One subtle ordering requirement: if anything has gone wrong
	 * (transaction abort, IO errors, whatever), then we can still
	 * do these next steps (the fs will already have been marked as
	 * having errors), but we can't free the inode if the mark_dirty
	 * fails.
	 */
	if (ext3_mark_inode_dirty(handle, inode)) {
		/* If that failed, just dquot_drop() and be done with that */
		dquot_drop(inode);
		end_writeback(inode);
	} else {
		ext3_xattr_delete_inode(handle, inode);
		dquot_free_inode(inode);
		dquot_drop(inode);
		end_writeback(inode);
		ext3_free_inode(handle, inode);
	}
	ext3_journal_stop(handle);
	return;
no_delete:
	end_writeback(inode);
	dquot_drop(inode);
}

typedef struct {
	__le32	*p;
	__le32	key;
	struct buffer_head *bh;
} Indirect;

static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
{
	p->key = *(p->p = v);
	p->bh = bh;
}

static int verify_chain(Indirect *from, Indirect *to)
{
	while (from <= to && from->key == *from->p)
		from++;
	return (from > to);
}

/**
 * ext3_block_to_path - parse the block number into array of offsets
 * @inode: inode in question (we are only interested in its superblock)
 * @i_block: block number to be parsed
 * @offsets: array to store the offsets in
 * @boundary: set this non-zero if the referred-to block is likely to be
 *	followed (on disk) by an indirect block.
 *
 * To store the locations of a file's data, ext3 uses a data structure
 * common to UNIX filesystems - a tree of pointers anchored in the inode,
 * with data blocks at the leaves and indirect blocks in intermediate
 * nodes.  This function translates the block number into a path in that
 * tree - the return value is the path length and @offsets[n] is the
 * offset of the pointer to the (n+1)th node in the nth one.  If @block
 * is out of range (negative or too large), a warning is printed and
 * zero is returned.
 *
 * Note: function doesn't find node addresses, so no IO is needed.  All
 * we need to know is the capacity of indirect blocks (taken from the
 * inode->i_sb).
 */

/*
 * Portability note: the last comparison (check that we fit into triple
 * indirect block) is spelled differently, because otherwise on an
 * architecture with 32-bit longs and 8Kb pages we might get into trouble
 * if our filesystem had 8Kb blocks. We might use long long, but that would
 * kill us on x86. Oh, well, at least the sign propagation does not matter -
 * i_block would have to be negative in the very beginning, so we would not
 * get there at all.
 */

static int ext3_block_to_path(struct inode *inode,
			long i_block, int offsets[4], int *boundary)
{
	int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
	int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
	const long direct_blocks = EXT3_NDIR_BLOCKS,
		indirect_blocks = ptrs,
		double_blocks = (1 << (ptrs_bits * 2));
	int n = 0;
	int final = 0;

	if (i_block < 0) {
		ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
	} else if (i_block < direct_blocks) {
		offsets[n++] = i_block;
		final = direct_blocks;
	} else if ( (i_block -= direct_blocks) < indirect_blocks) {
		offsets[n++] = EXT3_IND_BLOCK;
		offsets[n++] = i_block;
		final = ptrs;
	} else if ((i_block -= indirect_blocks) < double_blocks) {
		offsets[n++] = EXT3_DIND_BLOCK;
		offsets[n++] = i_block >> ptrs_bits;
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
		offsets[n++] = EXT3_TIND_BLOCK;
		offsets[n++] = i_block >> (ptrs_bits * 2);
		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else {
		ext3_warning(inode->i_sb, "ext3_block_to_path", "block > big");
	}
	if (boundary)
		*boundary = final - 1 - (i_block & (ptrs - 1));
	return n;
}
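/*
 * Worked example (illustrative, assuming a 1KB block size, so
 * ptrs = EXT3_ADDR_PER_BLOCK = 256 and EXT3_NDIR_BLOCKS = 12):
 *
 *	block 11  -> offsets = {11},                    depth 1 (direct)
 *	block 12  -> offsets = {EXT3_IND_BLOCK, 0},     depth 2 (indirect)
 *	block 268 -> offsets = {EXT3_DIND_BLOCK, 0, 0}, depth 3
 *
 * since 268 = 12 direct slots + 256 indirect slots, i.e. the first
 * block reached through the double-indirect tree.
 */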

/**
 * ext3_get_branch - read the chain of indirect blocks leading to data
 * @inode: inode in question
 * @depth: depth of the chain (1 - direct pointer, etc.)
 * @offsets: offsets of pointers in inode/indirect blocks
 * @chain: place to store the result
 * @err: here we store the error value
 *
 * Function fills the array of triples <key, p, bh> and returns %NULL
 * if everything went OK or the pointer to the last filled triple
 * (incomplete one) otherwise.  Upon the return chain[i].key contains
 * the number of (i+1)-th block in the chain (as it is stored in memory,
 * i.e. little-endian 32-bit), chain[i].p contains the address of that
 * number (it points into struct inode for i==0 and into the bh->b_data
 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
 * block for i>0 and NULL for i==0.  In other words, it holds the block
 * numbers of the chain, the addresses they were taken from (and where we
 * can verify that the chain did not change) and the buffer_heads hosting
 * these numbers.
 *
 * Function stops when it stumbles upon a zero pointer (absent block)
 *	(pointer to last triple returned, *@err == 0)
 * or when it gets an IO error reading an indirect block
 *	(ditto, *@err == -EIO)
 * or when it notices that the chain had been changed while it was reading
 *	(ditto, *@err == -EAGAIN)
 * or when it reads all @depth-1 indirect blocks successfully and finds
 * the whole chain, all the way to the data (returns %NULL, *err == 0).
 */
static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
				 Indirect chain[4], int *err)
{
	struct super_block *sb = inode->i_sb;
	Indirect *p = chain;
	struct buffer_head *bh;

	*err = 0;
	/* i_data is not going away, no lock needed */
	add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
	if (!p->key)
		goto no_block;
	while (--depth) {
		bh = sb_bread(sb, le32_to_cpu(p->key));
		if (!bh)
			goto failure;
		/* Reader: pointers */
		if (!verify_chain(chain, p))
			goto changed;
		add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
		/* Reader: end */
		if (!p->key)
			goto no_block;
	}
	return NULL;

changed:
	brelse(bh);
	*err = -EAGAIN;
	goto no_block;
failure:
	*err = -EIO;
no_block:
	return p;
}
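/*
 * Illustrative example: for a depth-2 lookup with offsets = {12, 5}, a
 * fully mapped chain ends up as
 *
 *	chain[0] = { .p = &EXT3_I(inode)->i_data[12], .key = *p, .bh = NULL }
 *	chain[1] = { .p = (__le32 *)bh->b_data + 5,   .key = *p, .bh = bh  }
 *
 * and ext3_get_branch() returns NULL; a hole at either level instead
 * returns a pointer to the first triple whose key is zero.
 */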

/**
 * ext3_find_near - find a place for allocation with sufficient locality
 * @inode: owner
 * @ind: descriptor of indirect block.
 *
 * This function returns the preferred place for block allocation.
 * It is used when the heuristic for sequential allocation fails.
 * Rules are:
 *   + if there is a block to the left of our position - allocate near it.
 *   + if pointer will live in indirect block - allocate near that block.
 *   + if pointer will live in inode - allocate in the same
 *     cylinder group.
 *
 * In the latter case we colour the starting block by the caller's PID to
 * prevent it from clashing with concurrent allocations for a different inode
 * in the same block group.  The PID is used here so that functionally related
 * files will be close-by on-disk.
 *
 * Caller must make sure that @ind is valid and will stay that way.
 */
static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind)
{
	struct ext3_inode_info *ei = EXT3_I(inode);
	__le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
	__le32 *p;
	ext3_fsblk_t bg_start;
	ext3_grpblk_t colour;

	/* Try to find previous block */
	for (p = ind->p - 1; p >= start; p--) {
		if (*p)
			return le32_to_cpu(*p);
	}

	/* No such thing, so let's try location of indirect block */
	if (ind->bh)
		return ind->bh->b_blocknr;

	/*
	 * It is going to be referred to from the inode itself? OK, just put it
	 * into the same cylinder group then.
	 */
	bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group);
	colour = (current->pid % 16) *
			(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
	return bg_start + colour;
}
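/*
 * Worked example (illustrative): with 8192 blocks per group the colour
 * stride is 8192 / 16 = 512 blocks, so a task with pid 1234
 * (1234 % 16 == 2) gets bg_start + 1024 as its allocation hint.
 */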

/**
 * ext3_find_goal - find a preferred place for allocation.
 * @inode: owner
 * @block: block we want
 * @partial: pointer to the last triple within a chain
 *
 * Normally this function finds the preferred place for block allocation
 * and returns it.
 */

static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
				   Indirect *partial)
{
	struct ext3_block_alloc_info *block_i;

	block_i = EXT3_I(inode)->i_block_alloc_info;

	/*
	 * try the heuristic for sequential allocation,
	 * failing that at least try to get decent locality.
	 */
	if (block_i && (block == block_i->last_alloc_logical_block + 1)
		&& (block_i->last_alloc_physical_block != 0)) {
		return block_i->last_alloc_physical_block + 1;
	}

	return ext3_find_near(inode, partial);
}

/**
 * ext3_blks_to_allocate - look up the block map and count the number
 * of direct blocks that need to be allocated for the given branch.
 *
 * @branch: chain of indirect blocks
 * @k: number of blocks needed for indirect blocks
 * @blks: number of data blocks to be mapped.
 * @blocks_to_boundary: the offset in the indirect block
 *
 * return the total number of blocks to be allocated, including the
 * direct and indirect blocks.
 */
static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
		int blocks_to_boundary)
{
	unsigned long count = 0;

	/*
	 * Simple case: the [t,d]indirect block(s) have not been allocated
	 * yet, so clearly no blocks on that path have been allocated either.
	 */
	if (k > 0) {
		/* right now we don't handle cross boundary allocation */
		if (blks < blocks_to_boundary + 1)
			count += blks;
		else
			count += blocks_to_boundary + 1;
		return count;
	}

	count++;
	while (count < blks && count <= blocks_to_boundary &&
		le32_to_cpu(*(branch[0].p + count)) == 0) {
		count++;
	}
	return count;
}
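/*
 * Illustrative example: with blks = 8 and blocks_to_boundary = 5, a
 * branch that still needs indirect blocks (k > 0) yields
 * blocks_to_boundary + 1 = 6, i.e. allocation stops at the indirect
 * block boundary rather than crossing it.
 */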

/**
 * ext3_alloc_blocks - multiple allocate blocks needed for a branch
 * @handle: handle for this transaction
 * @inode: owner
 * @goal: preferred place for allocation
 * @indirect_blks: the number of blocks that need to be allocated for
 *	indirect blocks
 * @blks: number of blocks that need to be allocated for direct blocks
 * @new_blocks: on return it will store the new block numbers for
 *	the indirect blocks (if needed) and the first direct block,
 * @err: here we store the error value
 *
 * return the number of direct blocks allocated
 */
static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
			ext3_fsblk_t goal, int indirect_blks, int blks,
			ext3_fsblk_t new_blocks[4], int *err)
{
	int target, i;
	unsigned long count = 0;
	int index = 0;
	ext3_fsblk_t current_block = 0;
	int ret = 0;

	/*
	 * Here we try to allocate the requested multiple blocks at once,
	 * on a best-effort basis.
	 * To build a branch, we should allocate blocks for
	 * the indirect blocks (if not allocated yet), and at least
	 * the first direct block of this branch.  That's the
	 * minimum number of blocks we need to allocate (required).
	 */
	target = blks + indirect_blks;

	while (1) {
		count = target;
		/* allocating blocks for indirect blocks and direct blocks */
		current_block = ext3_new_blocks(handle, inode, goal, &count, err);
		if (*err)
			goto failed_out;

		target -= count;
		/* allocate blocks for indirect blocks */
		while (index < indirect_blks && count) {
			new_blocks[index++] = current_block++;
			count--;
		}

		if (count > 0)
			break;
	}

	/* save the new block number for the first direct block */
	new_blocks[index] = current_block;

	/* total number of blocks allocated for direct blocks */
	ret = count;
	*err = 0;
	return ret;
failed_out:
	for (i = 0; i < index; i++)
		ext3_free_blocks(handle, inode, new_blocks[i], 1);
	return ret;
}

/**
 * ext3_alloc_branch - allocate and set up a chain of blocks.
 * @handle: handle for this transaction
 * @inode: owner
 * @indirect_blks: number of allocated indirect blocks
 * @blks: number of allocated direct blocks
 * @goal: preferred place for allocation
 * @offsets: offsets (in the blocks) to store the pointers to next.
 * @branch: place to store the chain in.
 *
 * This function allocates blocks, zeroes out all but the last one,
 * links them into chain and (if we are synchronous) writes them to disk.
 * In other words, it prepares a branch that can be spliced onto the
 * inode.  It stores the information about that chain in the branch[], in
 * the same format as ext3_get_branch() would do.  We are calling it after
 * we had read the existing part of chain and partial points to the last
 * triple of that (one with zero ->key).  Upon the exit we have the same
 * picture as after the successful ext3_get_block(), except that in one
 * place chain is disconnected - *branch->p is still zero (we did not
 * set the last link), but branch->key contains the number that should
 * be placed into *branch->p to fill that gap.
 *
 * If allocation fails we free all blocks we've allocated (and forget
 * their buffer_heads) and return the error value from the failed
 * ext3_alloc_block() (normally -ENOSPC).  Otherwise we set the chain
 * as described above and return 0.
 */
static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
			int indirect_blks, int *blks, ext3_fsblk_t goal,
			int *offsets, Indirect *branch)
{
	int blocksize = inode->i_sb->s_blocksize;
	int i, n = 0;
	int err = 0;
	struct buffer_head *bh;
	int num;
	ext3_fsblk_t new_blocks[4];
	ext3_fsblk_t current_block;

	num = ext3_alloc_blocks(handle, inode, goal, indirect_blks,
				*blks, new_blocks, &err);
	if (err)
		return err;

	branch[0].key = cpu_to_le32(new_blocks[0]);
	/*
	 * metadata blocks and data blocks are allocated.
	 */
	for (n = 1; n <= indirect_blks; n++) {
		/*
		 * Get buffer_head for parent block, zero it out
		 * and set the pointer to new one, then send
		 * parent to disk.
		 */
		bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
		branch[n].bh = bh;
		lock_buffer(bh);
		BUFFER_TRACE(bh, "call get_create_access");
		err = ext3_journal_get_create_access(handle, bh);
		if (err) {
			unlock_buffer(bh);
			brelse(bh);
			goto failed;
		}

		memset(bh->b_data, 0, blocksize);
		branch[n].p = (__le32 *) bh->b_data + offsets[n];
		branch[n].key = cpu_to_le32(new_blocks[n]);
		*branch[n].p = branch[n].key;
		if ( n == indirect_blks) {
			current_block = new_blocks[n];
			/*
			 * End of chain, update the last new metablock of
			 * the chain to point to the new allocated
			 * data blocks numbers
			 */
			for (i=1; i < num; i++)
				*(branch[n].p + i) = cpu_to_le32(++current_block);
		}
		BUFFER_TRACE(bh, "marking uptodate");
		set_buffer_uptodate(bh);
		unlock_buffer(bh);

		BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
		err = ext3_journal_dirty_metadata(handle, bh);
		if (err)
			goto failed;
	}
	*blks = num;
	return err;
failed:
	/* Allocation failed, free what we already allocated */
	for (i = 1; i <= n; i++) {
		BUFFER_TRACE(branch[i].bh, "call journal_forget");
		ext3_journal_forget(handle, branch[i].bh);
	}
	for (i = 0; i < indirect_blks; i++)
		ext3_free_blocks(handle, inode, new_blocks[i], 1);

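	/*
	 * Note (added for clarity): after the loop above, i == indirect_blks,
	 * so new_blocks[i] is the first of the 'num' contiguous direct blocks
	 * returned by ext3_alloc_blocks(); free that whole run in one call.
	 */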
	ext3_free_blocks(handle, inode, new_blocks[i], num);

	return err;
}

/**
 * ext3_splice_branch - splice the allocated branch onto inode.
 * @handle: handle for this transaction
 * @inode: owner
 * @block: (logical) number of block we are adding
 * @where: location of missing link
 * @num: number of indirect blocks we are adding
 * @blks: number of direct blocks we are adding
 *
 * This function fills the missing link and does all housekeeping needed in
 * inode (->i_blocks, etc.).  In case of success we end up with the full
 * chain to new block and return 0.
 */
static int ext3_splice_branch(handle_t *handle, struct inode *inode,
			long block, Indirect *where, int num, int blks)
{
	int i;
	int err = 0;
	struct ext3_block_alloc_info *block_i;
	ext3_fsblk_t current_block;
	struct ext3_inode_info *ei = EXT3_I(inode);

	block_i = ei->i_block_alloc_info;
	/*
	 * If we're splicing into a [td]indirect block (as opposed to the
	 * inode) then we need to get write access to the [td]indirect block
	 * before the splice.
	 */
	if (where->bh) {
		BUFFER_TRACE(where->bh, "get_write_access");
		err = ext3_journal_get_write_access(handle, where->bh);
		if (err)
			goto err_out;
	}
	/* That's it */

	*where->p = where->key;

	/*
	 * Update the host buffer_head or inode to point to the just-allocated
	 * direct blocks.
	 */
	if (num == 0 && blks > 1) {
		current_block = le32_to_cpu(where->key) + 1;
		for (i = 1; i < blks; i++)
			*(where->p + i) = cpu_to_le32(current_block++);
	}

	/*
	 * Update the most recently allocated logical & physical block
	 * in i_block_alloc_info, to assist in finding the proper goal block
	 * for the next allocation.
	 */
	if (block_i) {
		block_i->last_alloc_logical_block = block + blks - 1;
		block_i->last_alloc_physical_block =
				le32_to_cpu(where[num].key) + blks - 1;
	}

	/* We are done with atomic stuff, now do the rest of housekeeping */

	inode->i_ctime = CURRENT_TIME_SEC;
	ext3_mark_inode_dirty(handle, inode);
	/* ext3_mark_inode_dirty already updated i_sync_tid */
	atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);

	/* had we spliced it onto indirect block? */
	if (where->bh) {
		/*
		 * If we spliced it onto an indirect block, we haven't
		 * altered the inode.  Note however that if it is being spliced
		 * onto an indirect block at the very end of the file (the
		 * file is growing) then we *will* alter the inode to reflect
		 * the new i_size.  But that is not done here - it is done in
		 * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
		 */
		jbd_debug(5, "splicing indirect only\n");
		BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
		err = ext3_journal_dirty_metadata(handle, where->bh);
		if (err)
			goto err_out;
	} else {
		/*
		 * OK, we spliced it into the inode itself on a direct block.
		 * Inode was dirtied above.
		 */
		jbd_debug(5, "splicing direct\n");
	}
	return err;

err_out:
	for (i = 1; i <= num; i++) {
		BUFFER_TRACE(where[i].bh, "call journal_forget");
		ext3_journal_forget(handle, where[i].bh);
		ext3_free_blocks(handle, inode, le32_to_cpu(where[i-1].key), 1);
	}
	ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);

	return err;
}

/*
 * Allocation strategy is simple: if we have to allocate something, we will
 * have to go the whole way to leaf. So let's do it before attaching anything
 * to tree, set linkage between the newborn blocks, write them if sync is
 * required, recheck the path, free and repeat if check fails, otherwise
 * set the last missing link (that will protect us from any truncate-generated
 * removals - all blocks on the path are immune now) and possibly force the
 * write on the parent block.
 * That has a nice additional property: no special recovery from the failed
 * allocations is needed - we simply release blocks and do not touch anything
 * reachable from inode.
 *
 * `handle' can be NULL if create == 0.
 *
 * The BKL may not be held on entry here.  Be sure to take it early.
 * return > 0, # of blocks mapped or allocated.
 * return = 0, if plain lookup failed.
 * return < 0, error case.
 */
827 int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, 827 int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
828 sector_t iblock, unsigned long maxblocks, 828 sector_t iblock, unsigned long maxblocks,
829 struct buffer_head *bh_result, 829 struct buffer_head *bh_result,
830 int create) 830 int create)
831 { 831 {
832 int err = -EIO; 832 int err = -EIO;
833 int offsets[4]; 833 int offsets[4];
834 Indirect chain[4]; 834 Indirect chain[4];
835 Indirect *partial; 835 Indirect *partial;
836 ext3_fsblk_t goal; 836 ext3_fsblk_t goal;
837 int indirect_blks; 837 int indirect_blks;
838 int blocks_to_boundary = 0; 838 int blocks_to_boundary = 0;
839 int depth; 839 int depth;
840 struct ext3_inode_info *ei = EXT3_I(inode); 840 struct ext3_inode_info *ei = EXT3_I(inode);
841 int count = 0; 841 int count = 0;
842 ext3_fsblk_t first_block = 0; 842 ext3_fsblk_t first_block = 0;
843 843
844 844
845 J_ASSERT(handle != NULL || create == 0); 845 J_ASSERT(handle != NULL || create == 0);
846 depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary); 846 depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
847 847
848 if (depth == 0) 848 if (depth == 0)
849 goto out; 849 goto out;
850 850
851 partial = ext3_get_branch(inode, depth, offsets, chain, &err); 851 partial = ext3_get_branch(inode, depth, offsets, chain, &err);
852 852
853 /* Simplest case - block found, no allocation needed */ 853 /* Simplest case - block found, no allocation needed */
854 if (!partial) { 854 if (!partial) {
855 first_block = le32_to_cpu(chain[depth - 1].key); 855 first_block = le32_to_cpu(chain[depth - 1].key);
856 clear_buffer_new(bh_result); 856 clear_buffer_new(bh_result);
857 count++; 857 count++;
858 /* map more blocks */ 858 /* map more blocks */
859 while (count < maxblocks && count <= blocks_to_boundary) { 859 while (count < maxblocks && count <= blocks_to_boundary) {
860 ext3_fsblk_t blk; 860 ext3_fsblk_t blk;
861 861
862 if (!verify_chain(chain, chain + depth - 1)) { 862 if (!verify_chain(chain, chain + depth - 1)) {
863 /* 863 /*
864 * Indirect block might be removed by 864 * Indirect block might be removed by
865 * truncate while we were reading it. 865 * truncate while we were reading it.
866 * Handling of that case: forget what we've 866 * Handling of that case: forget what we've
867 * got now. Flag the err as EAGAIN, so it 867 * got now. Flag the err as EAGAIN, so it
868 * will reread. 868 * will reread.
869 */ 869 */
870 err = -EAGAIN; 870 err = -EAGAIN;
871 count = 0; 871 count = 0;
872 break; 872 break;
873 } 873 }
874 blk = le32_to_cpu(*(chain[depth-1].p + count)); 874 blk = le32_to_cpu(*(chain[depth-1].p + count));
875 875
876 if (blk == first_block + count) 876 if (blk == first_block + count)
877 count++; 877 count++;
878 else 878 else
879 break; 879 break;
880 } 880 }
881 if (err != -EAGAIN) 881 if (err != -EAGAIN)
882 goto got_it; 882 goto got_it;
883 } 883 }
884 884
885 /* Next simple case - plain lookup or failed read of indirect block */ 885 /* Next simple case - plain lookup or failed read of indirect block */
886 if (!create || err == -EIO) 886 if (!create || err == -EIO)
887 goto cleanup; 887 goto cleanup;
888 888
889 mutex_lock(&ei->truncate_mutex); 889 mutex_lock(&ei->truncate_mutex);
890 890
891 /* 891 /*
892 * If the indirect block is missing while we are reading 892 * If the indirect block is missing while we are reading
893 * the chain (ext3_get_branch() returns the -EAGAIN error), or 893 * the chain (ext3_get_branch() returns the -EAGAIN error), or
894 * if the chain has been changed after we grabbed the mutex, 894 * if the chain has been changed after we grabbed the mutex,
895 * (either because another process truncated this branch, or 895 * (either because another process truncated this branch, or
896 * another get_block allocated this branch), re-grab the chain to see if 896 * another get_block allocated this branch), re-grab the chain to see if
897 * the requested block has been allocated or not. 897 * the requested block has been allocated or not.
898 * 898 *
899 * Since we already block the truncate/other get_block 899 * Since we already block the truncate/other get_block
900 * at this point, we will have the current copy of the chain when we 900 * at this point, we will have the current copy of the chain when we
901 * splice the branch into the tree. 901 * splice the branch into the tree.
902 */ 902 */
903 if (err == -EAGAIN || !verify_chain(chain, partial)) { 903 if (err == -EAGAIN || !verify_chain(chain, partial)) {
904 while (partial > chain) { 904 while (partial > chain) {
905 brelse(partial->bh); 905 brelse(partial->bh);
906 partial--; 906 partial--;
907 } 907 }
908 partial = ext3_get_branch(inode, depth, offsets, chain, &err); 908 partial = ext3_get_branch(inode, depth, offsets, chain, &err);
909 if (!partial) { 909 if (!partial) {
910 count++; 910 count++;
911 mutex_unlock(&ei->truncate_mutex); 911 mutex_unlock(&ei->truncate_mutex);
912 if (err) 912 if (err)
913 goto cleanup; 913 goto cleanup;
914 clear_buffer_new(bh_result); 914 clear_buffer_new(bh_result);
915 goto got_it; 915 goto got_it;
916 } 916 }
917 } 917 }
918 918
919 /* 919 /*
920 * Okay, we need to do block allocation. Lazily initialize the block 920 * Okay, we need to do block allocation. Lazily initialize the block
921 * allocation info here if necessary 921 * allocation info here if necessary
922 */ 922 */
923 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) 923 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
924 ext3_init_block_alloc_info(inode); 924 ext3_init_block_alloc_info(inode);
925 925
926 goal = ext3_find_goal(inode, iblock, partial); 926 goal = ext3_find_goal(inode, iblock, partial);
927 927
928 /* the number of blocks we need to allocate for [d,t]indirect blocks */ 928 /* the number of blocks we need to allocate for [d,t]indirect blocks */
929 indirect_blks = (chain + depth) - partial - 1; 929 indirect_blks = (chain + depth) - partial - 1;
930 930
931 /* 931 /*
932 * Next look up the indirect map to count the total number of 932 * Next look up the indirect map to count the total number of
933 * direct blocks to allocate for this branch. 933 * direct blocks to allocate for this branch.
934 */ 934 */
935 count = ext3_blks_to_allocate(partial, indirect_blks, 935 count = ext3_blks_to_allocate(partial, indirect_blks,
936 maxblocks, blocks_to_boundary); 936 maxblocks, blocks_to_boundary);
937 /* 937 /*
938 * Block out ext3_truncate while we alter the tree 938 * Block out ext3_truncate while we alter the tree
939 */ 939 */
940 err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal, 940 err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal,
941 offsets + (partial - chain), partial); 941 offsets + (partial - chain), partial);
942 942
943 /* 943 /*
944 * The ext3_splice_branch call will free and forget any buffers 944 * The ext3_splice_branch call will free and forget any buffers
945 * on the new chain if there is a failure, but that risks using 945 * on the new chain if there is a failure, but that risks using
946 * up transaction credits, especially for bitmaps where the 946 * up transaction credits, especially for bitmaps where the
947 * credits cannot be returned. Can we handle this somehow? We 947 * credits cannot be returned. Can we handle this somehow? We
948 * may need to return -EAGAIN upwards in the worst case. --sct 948 * may need to return -EAGAIN upwards in the worst case. --sct
949 */ 949 */
950 if (!err) 950 if (!err)
951 err = ext3_splice_branch(handle, inode, iblock, 951 err = ext3_splice_branch(handle, inode, iblock,
952 partial, indirect_blks, count); 952 partial, indirect_blks, count);
953 mutex_unlock(&ei->truncate_mutex); 953 mutex_unlock(&ei->truncate_mutex);
954 if (err) 954 if (err)
955 goto cleanup; 955 goto cleanup;
956 956
957 set_buffer_new(bh_result); 957 set_buffer_new(bh_result);
958 got_it: 958 got_it:
959 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 959 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
960 if (count > blocks_to_boundary) 960 if (count > blocks_to_boundary)
961 set_buffer_boundary(bh_result); 961 set_buffer_boundary(bh_result);
962 err = count; 962 err = count;
963 /* Clean up and exit */ 963 /* Clean up and exit */
964 partial = chain + depth - 1; /* the whole chain */ 964 partial = chain + depth - 1; /* the whole chain */
965 cleanup: 965 cleanup:
966 while (partial > chain) { 966 while (partial > chain) {
967 BUFFER_TRACE(partial->bh, "call brelse"); 967 BUFFER_TRACE(partial->bh, "call brelse");
968 brelse(partial->bh); 968 brelse(partial->bh);
969 partial--; 969 partial--;
970 } 970 }
971 BUFFER_TRACE(bh_result, "returned"); 971 BUFFER_TRACE(bh_result, "returned");
972 out: 972 out:
973 return err; 973 return err;
974 } 974 }
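The return convention documented above (> 0 blocks mapped, 0 for a hole, < 0 on error) is what every caller in this file has to decode. As a minimal, hypothetical sketch of such a caller -- not part of this patch, assuming ordinary kernel context, and using -ENOENT purely as an illustrative hole result:

static int example_lookup_block(struct inode *inode, sector_t lblk)
{
	struct buffer_head dummy;
	int ret;

	dummy.b_state = 0;
	dummy.b_blocknr = -1000;
	dummy.b_size = inode->i_sb->s_blocksize;

	/* handle == NULL with create == 0: plain lookup, no allocation */
	ret = ext3_get_blocks_handle(NULL, inode, lblk, 1, &dummy, 0);
	if (ret < 0)		/* error case */
		return ret;
	if (ret == 0)		/* plain lookup failed: a hole */
		return -ENOENT;
	return 0;		/* one block mapped, at dummy.b_blocknr */
}

Compare ext3_getblk() further down, which wraps exactly this calling pattern.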
975 975
976 /* Maximum number of blocks we map for direct IO at once. */ 976 /* Maximum number of blocks we map for direct IO at once. */
977 #define DIO_MAX_BLOCKS 4096 977 #define DIO_MAX_BLOCKS 4096
978 /* 978 /*
979 * Number of credits we need for writing DIO_MAX_BLOCKS: 979 * Number of credits we need for writing DIO_MAX_BLOCKS:
980 * We need sb + group descriptor + bitmap + inode -> 4 980 * We need sb + group descriptor + bitmap + inode -> 4
981 * For B blocks with A block pointers per block we need: 981 * For B blocks with A block pointers per block we need:
982 * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect). 982 * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
983 * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25. 983 * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
984 */ 984 */
985 #define DIO_CREDITS 25 985 #define DIO_CREDITS 25
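Working the comment's arithmetic through for the 1KB case makes the 25 concrete. A hypothetical standalone sketch (plain userspace C, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned long A = 256;			/* block pointers per 1KB block */
	unsigned long B = 4096;			/* DIO_MAX_BLOCKS */
	unsigned long fixed = 4;		/* sb + group desc + bitmap + inode */
	unsigned long tind = 1;			/* one triple indirect block */
	unsigned long dind = B / A / A + 2;	/* 0 + 2 = 2 doubly indirect */
	unsigned long ind  = B / A + 2;		/* 16 + 2 = 18 indirect */

	printf("%lu\n", fixed + tind + dind + ind);	/* prints 25 */
	return 0;
}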
986 986
987 static int ext3_get_block(struct inode *inode, sector_t iblock, 987 static int ext3_get_block(struct inode *inode, sector_t iblock,
988 struct buffer_head *bh_result, int create) 988 struct buffer_head *bh_result, int create)
989 { 989 {
990 handle_t *handle = ext3_journal_current_handle(); 990 handle_t *handle = ext3_journal_current_handle();
991 int ret = 0, started = 0; 991 int ret = 0, started = 0;
992 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 992 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
993 993
994 if (create && !handle) { /* Direct IO write... */ 994 if (create && !handle) { /* Direct IO write... */
995 if (max_blocks > DIO_MAX_BLOCKS) 995 if (max_blocks > DIO_MAX_BLOCKS)
996 max_blocks = DIO_MAX_BLOCKS; 996 max_blocks = DIO_MAX_BLOCKS;
997 handle = ext3_journal_start(inode, DIO_CREDITS + 997 handle = ext3_journal_start(inode, DIO_CREDITS +
998 EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); 998 EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
999 if (IS_ERR(handle)) { 999 if (IS_ERR(handle)) {
1000 ret = PTR_ERR(handle); 1000 ret = PTR_ERR(handle);
1001 goto out; 1001 goto out;
1002 } 1002 }
1003 started = 1; 1003 started = 1;
1004 } 1004 }
1005 1005
1006 ret = ext3_get_blocks_handle(handle, inode, iblock, 1006 ret = ext3_get_blocks_handle(handle, inode, iblock,
1007 max_blocks, bh_result, create); 1007 max_blocks, bh_result, create);
1008 if (ret > 0) { 1008 if (ret > 0) {
1009 bh_result->b_size = (ret << inode->i_blkbits); 1009 bh_result->b_size = (ret << inode->i_blkbits);
1010 ret = 0; 1010 ret = 0;
1011 } 1011 }
1012 if (started) 1012 if (started)
1013 ext3_journal_stop(handle); 1013 ext3_journal_stop(handle);
1014 out: 1014 out:
1015 return ret; 1015 return ret;
1016 } 1016 }
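Note the in/out use of bh_result->b_size above: the request size goes in, and the size of the mapped extent comes back out. A hypothetical helper, not in the tree, illustrating the contract the direct-IO machinery relies on:

static int example_probe_extent(struct inode *inode, sector_t lblk,
				unsigned nblocks)
{
	struct buffer_head bh;
	int ret;

	memset(&bh, 0, sizeof(bh));
	bh.b_size = nblocks << inode->i_blkbits;	/* request size in */
	ret = ext3_get_block(inode, lblk, &bh, 0);	/* lookup only */
	if (ret < 0)
		return ret;
	if (!buffer_mapped(&bh))
		return 0;				/* a hole */
	return bh.b_size >> inode->i_blkbits;		/* mapped size out */
}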
1017 1017
1018 int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 1018 int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1019 u64 start, u64 len) 1019 u64 start, u64 len)
1020 { 1020 {
1021 return generic_block_fiemap(inode, fieinfo, start, len, 1021 return generic_block_fiemap(inode, fieinfo, start, len,
1022 ext3_get_block); 1022 ext3_get_block);
1023 } 1023 }
1024 1024
1025 /* 1025 /*
1026 * `handle' can be NULL if create is zero 1026 * `handle' can be NULL if create is zero
1027 */ 1027 */
1028 struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode, 1028 struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
1029 long block, int create, int *errp) 1029 long block, int create, int *errp)
1030 { 1030 {
1031 struct buffer_head dummy; 1031 struct buffer_head dummy;
1032 int fatal = 0, err; 1032 int fatal = 0, err;
1033 1033
1034 J_ASSERT(handle != NULL || create == 0); 1034 J_ASSERT(handle != NULL || create == 0);
1035 1035
1036 dummy.b_state = 0; 1036 dummy.b_state = 0;
1037 dummy.b_blocknr = -1000; 1037 dummy.b_blocknr = -1000;
1038 buffer_trace_init(&dummy.b_history); 1038 buffer_trace_init(&dummy.b_history);
1039 err = ext3_get_blocks_handle(handle, inode, block, 1, 1039 err = ext3_get_blocks_handle(handle, inode, block, 1,
1040 &dummy, create); 1040 &dummy, create);
1041 /* 1041 /*
1042 * ext3_get_blocks_handle() returns number of blocks 1042 * ext3_get_blocks_handle() returns number of blocks
1043 * mapped. 0 in case of a HOLE. 1043 * mapped. 0 in case of a HOLE.
1044 */ 1044 */
1045 if (err > 0) { 1045 if (err > 0) {
1046 if (err > 1) 1046 if (err > 1)
1047 WARN_ON(1); 1047 WARN_ON(1);
1048 err = 0; 1048 err = 0;
1049 } 1049 }
1050 *errp = err; 1050 *errp = err;
1051 if (!err && buffer_mapped(&dummy)) { 1051 if (!err && buffer_mapped(&dummy)) {
1052 struct buffer_head *bh; 1052 struct buffer_head *bh;
1053 bh = sb_getblk(inode->i_sb, dummy.b_blocknr); 1053 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
1054 if (!bh) { 1054 if (!bh) {
1055 *errp = -EIO; 1055 *errp = -EIO;
1056 goto err; 1056 goto err;
1057 } 1057 }
1058 if (buffer_new(&dummy)) { 1058 if (buffer_new(&dummy)) {
1059 J_ASSERT(create != 0); 1059 J_ASSERT(create != 0);
1060 J_ASSERT(handle != NULL); 1060 J_ASSERT(handle != NULL);
1061 1061
1062 /* 1062 /*
1063 * Now that we do not always journal data, we should 1063 * Now that we do not always journal data, we should
1064 * keep in mind whether this should always journal the 1064 * keep in mind whether this should always journal the
1065 * new buffer as metadata. For now, regular file 1065 * new buffer as metadata. For now, regular file
1066 * writes use ext3_get_block instead, so it's not a 1066 * writes use ext3_get_block instead, so it's not a
1067 * problem. 1067 * problem.
1068 */ 1068 */
1069 lock_buffer(bh); 1069 lock_buffer(bh);
1070 BUFFER_TRACE(bh, "call get_create_access"); 1070 BUFFER_TRACE(bh, "call get_create_access");
1071 fatal = ext3_journal_get_create_access(handle, bh); 1071 fatal = ext3_journal_get_create_access(handle, bh);
1072 if (!fatal && !buffer_uptodate(bh)) { 1072 if (!fatal && !buffer_uptodate(bh)) {
1073 memset(bh->b_data,0,inode->i_sb->s_blocksize); 1073 memset(bh->b_data,0,inode->i_sb->s_blocksize);
1074 set_buffer_uptodate(bh); 1074 set_buffer_uptodate(bh);
1075 } 1075 }
1076 unlock_buffer(bh); 1076 unlock_buffer(bh);
1077 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 1077 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1078 err = ext3_journal_dirty_metadata(handle, bh); 1078 err = ext3_journal_dirty_metadata(handle, bh);
1079 if (!fatal) 1079 if (!fatal)
1080 fatal = err; 1080 fatal = err;
1081 } else { 1081 } else {
1082 BUFFER_TRACE(bh, "not a new buffer"); 1082 BUFFER_TRACE(bh, "not a new buffer");
1083 } 1083 }
1084 if (fatal) { 1084 if (fatal) {
1085 *errp = fatal; 1085 *errp = fatal;
1086 brelse(bh); 1086 brelse(bh);
1087 bh = NULL; 1087 bh = NULL;
1088 } 1088 }
1089 return bh; 1089 return bh;
1090 } 1090 }
1091 err: 1091 err:
1092 return NULL; 1092 return NULL;
1093 } 1093 }
1094 1094
1095 struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode, 1095 struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode,
1096 int block, int create, int *err) 1096 int block, int create, int *err)
1097 { 1097 {
1098 struct buffer_head * bh; 1098 struct buffer_head * bh;
1099 1099
1100 bh = ext3_getblk(handle, inode, block, create, err); 1100 bh = ext3_getblk(handle, inode, block, create, err);
1101 if (!bh) 1101 if (!bh)
1102 return bh; 1102 return bh;
1103 if (buffer_uptodate(bh)) 1103 if (buffer_uptodate(bh))
1104 return bh; 1104 return bh;
1105 ll_rw_block(READ_META, 1, &bh); 1105 ll_rw_block(READ_META, 1, &bh);
1106 wait_on_buffer(bh); 1106 wait_on_buffer(bh);
1107 if (buffer_uptodate(bh)) 1107 if (buffer_uptodate(bh))
1108 return bh; 1108 return bh;
1109 put_bh(bh); 1109 put_bh(bh);
1110 *err = -EIO; 1110 *err = -EIO;
1111 return NULL; 1111 return NULL;
1112 } 1112 }
1113 1113
1114 static int walk_page_buffers( handle_t *handle, 1114 static int walk_page_buffers( handle_t *handle,
1115 struct buffer_head *head, 1115 struct buffer_head *head,
1116 unsigned from, 1116 unsigned from,
1117 unsigned to, 1117 unsigned to,
1118 int *partial, 1118 int *partial,
1119 int (*fn)( handle_t *handle, 1119 int (*fn)( handle_t *handle,
1120 struct buffer_head *bh)) 1120 struct buffer_head *bh))
1121 { 1121 {
1122 struct buffer_head *bh; 1122 struct buffer_head *bh;
1123 unsigned block_start, block_end; 1123 unsigned block_start, block_end;
1124 unsigned blocksize = head->b_size; 1124 unsigned blocksize = head->b_size;
1125 int err, ret = 0; 1125 int err, ret = 0;
1126 struct buffer_head *next; 1126 struct buffer_head *next;
1127 1127
1128 for ( bh = head, block_start = 0; 1128 for ( bh = head, block_start = 0;
1129 ret == 0 && (bh != head || !block_start); 1129 ret == 0 && (bh != head || !block_start);
1130 block_start = block_end, bh = next) 1130 block_start = block_end, bh = next)
1131 { 1131 {
1132 next = bh->b_this_page; 1132 next = bh->b_this_page;
1133 block_end = block_start + blocksize; 1133 block_end = block_start + blocksize;
1134 if (block_end <= from || block_start >= to) { 1134 if (block_end <= from || block_start >= to) {
1135 if (partial && !buffer_uptodate(bh)) 1135 if (partial && !buffer_uptodate(bh))
1136 *partial = 1; 1136 *partial = 1;
1137 continue; 1137 continue;
1138 } 1138 }
1139 err = (*fn)(handle, bh); 1139 err = (*fn)(handle, bh);
1140 if (!ret) 1140 if (!ret)
1141 ret = err; 1141 ret = err;
1142 } 1142 }
1143 return ret; 1143 return ret;
1144 } 1144 }
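Because the loop above stops at the first nonzero return from *fn, a boolean callback turns walk_page_buffers() into an "any buffer matches?" test; that is exactly how buffer_unmapped() is used further down. A hypothetical illustration along the same lines:

static int buffer_is_dirty_fn(handle_t *handle, struct buffer_head *bh)
{
	return buffer_dirty(bh);
}

static int example_page_has_dirty_buffers(struct page *page)
{
	/* returns 1 at the first dirty buffer, 0 if none are dirty */
	return walk_page_buffers(NULL, page_buffers(page), 0,
				 PAGE_CACHE_SIZE, NULL, buffer_is_dirty_fn);
}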
1145 1145
1146 /* 1146 /*
1147 * To preserve ordering, it is essential that the hole instantiation and 1147 * To preserve ordering, it is essential that the hole instantiation and
1148 * the data write be encapsulated in a single transaction. We cannot 1148 * the data write be encapsulated in a single transaction. We cannot
1149 * close off a transaction and start a new one between the ext3_get_block() 1149 * close off a transaction and start a new one between the ext3_get_block()
1150 * and the commit_write(). So doing the journal_start at the start of 1150 * and the commit_write(). So doing the journal_start at the start of
1151 * prepare_write() is the right place. 1151 * prepare_write() is the right place.
1152 * 1152 *
1153 * Also, this function can nest inside ext3_writepage() -> 1153 * Also, this function can nest inside ext3_writepage() ->
1154 * block_write_full_page(). In that case, we *know* that ext3_writepage() 1154 * block_write_full_page(). In that case, we *know* that ext3_writepage()
1155 * has generated enough buffer credits to do the whole page. So we won't 1155 * has generated enough buffer credits to do the whole page. So we won't
1156 * block on the journal in that case, which is good, because the caller may 1156 * block on the journal in that case, which is good, because the caller may
1157 * be PF_MEMALLOC. 1157 * be PF_MEMALLOC.
1158 * 1158 *
1159 * By accident, ext3 can be reentered when a transaction is open via 1159 * By accident, ext3 can be reentered when a transaction is open via
1160 * quota file writes. If we were to commit the transaction while thus 1160 * quota file writes. If we were to commit the transaction while thus
1161 * reentered, there can be a deadlock - we would be holding a quota 1161 * reentered, there can be a deadlock - we would be holding a quota
1162 * lock, and the commit would never complete if another thread had a 1162 * lock, and the commit would never complete if another thread had a
1163 * transaction open and was blocking on the quota lock - a ranking 1163 * transaction open and was blocking on the quota lock - a ranking
1164 * violation. 1164 * violation.
1165 * 1165 *
1166 * So what we do is to rely on the fact that journal_stop/journal_start 1166 * So what we do is to rely on the fact that journal_stop/journal_start
1167 * will _not_ run commit under these circumstances because handle->h_ref 1167 * will _not_ run commit under these circumstances because handle->h_ref
1168 * is elevated. We'll still have enough credits for the tiny quotafile 1168 * is elevated. We'll still have enough credits for the tiny quotafile
1169 * write. 1169 * write.
1170 */ 1170 */
1171 static int do_journal_get_write_access(handle_t *handle, 1171 static int do_journal_get_write_access(handle_t *handle,
1172 struct buffer_head *bh) 1172 struct buffer_head *bh)
1173 { 1173 {
1174 int dirty = buffer_dirty(bh); 1174 int dirty = buffer_dirty(bh);
1175 int ret; 1175 int ret;
1176 1176
1177 if (!buffer_mapped(bh) || buffer_freed(bh)) 1177 if (!buffer_mapped(bh) || buffer_freed(bh))
1178 return 0; 1178 return 0;
1179 /* 1179 /*
1180 * __block_prepare_write() could have dirtied some buffers. Clean 1180 * __block_prepare_write() could have dirtied some buffers. Clean
1181 * the dirty bit as jbd2_journal_get_write_access() could complain 1181 * the dirty bit as jbd2_journal_get_write_access() could complain
1182 * otherwise about fs integrity issues. Setting of the dirty bit 1182 * otherwise about fs integrity issues. Setting of the dirty bit
1183 * by __block_prepare_write() isn't a real problem here as we clear 1183 * by __block_prepare_write() isn't a real problem here as we clear
1184 * the bit before releasing a page lock and thus writeback cannot 1184 * the bit before releasing a page lock and thus writeback cannot
1185 * ever write the buffer. 1185 * ever write the buffer.
1186 */ 1186 */
1187 if (dirty) 1187 if (dirty)
1188 clear_buffer_dirty(bh); 1188 clear_buffer_dirty(bh);
1189 ret = ext3_journal_get_write_access(handle, bh); 1189 ret = ext3_journal_get_write_access(handle, bh);
1190 if (!ret && dirty) 1190 if (!ret && dirty)
1191 ret = ext3_journal_dirty_metadata(handle, bh); 1191 ret = ext3_journal_dirty_metadata(handle, bh);
1192 return ret; 1192 return ret;
1193 } 1193 }
1194 1194
1195 /* 1195 /*
1196 * Truncate blocks that were not used by write. We have to truncate the 1196 * Truncate blocks that were not used by write. We have to truncate the
1197 * pagecache as well so that corresponding buffers get properly unmapped. 1197 * pagecache as well so that corresponding buffers get properly unmapped.
1198 */ 1198 */
1199 static void ext3_truncate_failed_write(struct inode *inode) 1199 static void ext3_truncate_failed_write(struct inode *inode)
1200 { 1200 {
1201 truncate_inode_pages(inode->i_mapping, inode->i_size); 1201 truncate_inode_pages(inode->i_mapping, inode->i_size);
1202 ext3_truncate(inode); 1202 ext3_truncate(inode);
1203 } 1203 }
1204 1204
1205 static int ext3_write_begin(struct file *file, struct address_space *mapping, 1205 static int ext3_write_begin(struct file *file, struct address_space *mapping,
1206 loff_t pos, unsigned len, unsigned flags, 1206 loff_t pos, unsigned len, unsigned flags,
1207 struct page **pagep, void **fsdata) 1207 struct page **pagep, void **fsdata)
1208 { 1208 {
1209 struct inode *inode = mapping->host; 1209 struct inode *inode = mapping->host;
1210 int ret; 1210 int ret;
1211 handle_t *handle; 1211 handle_t *handle;
1212 int retries = 0; 1212 int retries = 0;
1213 struct page *page; 1213 struct page *page;
1214 pgoff_t index; 1214 pgoff_t index;
1215 unsigned from, to; 1215 unsigned from, to;
1216 /* Reserve one block more for addition to orphan list in case 1216 /* Reserve one block more for addition to orphan list in case
1217 * we allocate blocks but write fails for some reason */ 1217 * we allocate blocks but write fails for some reason */
1218 int needed_blocks = ext3_writepage_trans_blocks(inode) + 1; 1218 int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;
1219 1219
1220 index = pos >> PAGE_CACHE_SHIFT; 1220 index = pos >> PAGE_CACHE_SHIFT;
1221 from = pos & (PAGE_CACHE_SIZE - 1); 1221 from = pos & (PAGE_CACHE_SIZE - 1);
1222 to = from + len; 1222 to = from + len;
1223 1223
1224 retry: 1224 retry:
1225 page = grab_cache_page_write_begin(mapping, index, flags); 1225 page = grab_cache_page_write_begin(mapping, index, flags);
1226 if (!page) 1226 if (!page)
1227 return -ENOMEM; 1227 return -ENOMEM;
1228 *pagep = page; 1228 *pagep = page;
1229 1229
1230 handle = ext3_journal_start(inode, needed_blocks); 1230 handle = ext3_journal_start(inode, needed_blocks);
1231 if (IS_ERR(handle)) { 1231 if (IS_ERR(handle)) {
1232 unlock_page(page); 1232 unlock_page(page);
1233 page_cache_release(page); 1233 page_cache_release(page);
1234 ret = PTR_ERR(handle); 1234 ret = PTR_ERR(handle);
1235 goto out; 1235 goto out;
1236 } 1236 }
1237 ret = __block_write_begin(page, pos, len, ext3_get_block); 1237 ret = __block_write_begin(page, pos, len, ext3_get_block);
1238 if (ret) 1238 if (ret)
1239 goto write_begin_failed; 1239 goto write_begin_failed;
1240 1240
1241 if (ext3_should_journal_data(inode)) { 1241 if (ext3_should_journal_data(inode)) {
1242 ret = walk_page_buffers(handle, page_buffers(page), 1242 ret = walk_page_buffers(handle, page_buffers(page),
1243 from, to, NULL, do_journal_get_write_access); 1243 from, to, NULL, do_journal_get_write_access);
1244 } 1244 }
1245 write_begin_failed: 1245 write_begin_failed:
1246 if (ret) { 1246 if (ret) {
1247 /* 1247 /*
1248 * block_write_begin may have instantiated a few blocks 1248 * block_write_begin may have instantiated a few blocks
1249 * outside i_size. Trim these off again. Don't need 1249 * outside i_size. Trim these off again. Don't need
1250 * i_size_read because we hold i_mutex. 1250 * i_size_read because we hold i_mutex.
1251 * 1251 *
1252 * Add inode to orphan list in case we crash before truncate 1252 * Add inode to orphan list in case we crash before truncate
1253 * finishes. Do this only if ext3_can_truncate() agrees so 1253 * finishes. Do this only if ext3_can_truncate() agrees so
1254 * that orphan processing code is happy. 1254 * that orphan processing code is happy.
1255 */ 1255 */
1256 if (pos + len > inode->i_size && ext3_can_truncate(inode)) 1256 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1257 ext3_orphan_add(handle, inode); 1257 ext3_orphan_add(handle, inode);
1258 ext3_journal_stop(handle); 1258 ext3_journal_stop(handle);
1259 unlock_page(page); 1259 unlock_page(page);
1260 page_cache_release(page); 1260 page_cache_release(page);
1261 if (pos + len > inode->i_size) 1261 if (pos + len > inode->i_size)
1262 ext3_truncate_failed_write(inode); 1262 ext3_truncate_failed_write(inode);
1263 } 1263 }
1264 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) 1264 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1265 goto retry; 1265 goto retry;
1266 out: 1266 out:
1267 return ret; 1267 return ret;
1268 } 1268 }
1269 1269
1270 1270
1271 int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh) 1271 int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1272 { 1272 {
1273 int err = journal_dirty_data(handle, bh); 1273 int err = journal_dirty_data(handle, bh);
1274 if (err) 1274 if (err)
1275 ext3_journal_abort_handle(__func__, __func__, 1275 ext3_journal_abort_handle(__func__, __func__,
1276 bh, handle, err); 1276 bh, handle, err);
1277 return err; 1277 return err;
1278 } 1278 }
1279 1279
1280 /* For ordered writepage and write_end functions */ 1280 /* For ordered writepage and write_end functions */
1281 static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) 1281 static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1282 { 1282 {
1283 /* 1283 /*
1284 * Write could have mapped the buffer but it didn't copy the data in 1284 * Write could have mapped the buffer but it didn't copy the data in
1285 * yet. So avoid filing such a buffer into a transaction. 1285 * yet. So avoid filing such a buffer into a transaction.
1286 */ 1286 */
1287 if (buffer_mapped(bh) && buffer_uptodate(bh)) 1287 if (buffer_mapped(bh) && buffer_uptodate(bh))
1288 return ext3_journal_dirty_data(handle, bh); 1288 return ext3_journal_dirty_data(handle, bh);
1289 return 0; 1289 return 0;
1290 } 1290 }
1291 1291
1292 /* For write_end() in data=journal mode */ 1292 /* For write_end() in data=journal mode */
1293 static int write_end_fn(handle_t *handle, struct buffer_head *bh) 1293 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1294 { 1294 {
1295 if (!buffer_mapped(bh) || buffer_freed(bh)) 1295 if (!buffer_mapped(bh) || buffer_freed(bh))
1296 return 0; 1296 return 0;
1297 set_buffer_uptodate(bh); 1297 set_buffer_uptodate(bh);
1298 return ext3_journal_dirty_metadata(handle, bh); 1298 return ext3_journal_dirty_metadata(handle, bh);
1299 } 1299 }
1300 1300
1301 /* 1301 /*
1302 * This is nasty and subtle: ext3_write_begin() could have allocated blocks 1302 * This is nasty and subtle: ext3_write_begin() could have allocated blocks
1303 * for the whole page but later we failed to copy the data in. Update inode 1303 * for the whole page but later we failed to copy the data in. Update inode
1304 * size according to what we managed to copy. The rest is going to be 1304 * size according to what we managed to copy. The rest is going to be
1305 * truncated in write_end function. 1305 * truncated in write_end function.
1306 */ 1306 */
1307 static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied) 1307 static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied)
1308 { 1308 {
1309 /* What matters to us is i_disksize. We don't write i_size anywhere */ 1309 /* What matters to us is i_disksize. We don't write i_size anywhere */
1310 if (pos + copied > inode->i_size) 1310 if (pos + copied > inode->i_size)
1311 i_size_write(inode, pos + copied); 1311 i_size_write(inode, pos + copied);
1312 if (pos + copied > EXT3_I(inode)->i_disksize) { 1312 if (pos + copied > EXT3_I(inode)->i_disksize) {
1313 EXT3_I(inode)->i_disksize = pos + copied; 1313 EXT3_I(inode)->i_disksize = pos + copied;
1314 mark_inode_dirty(inode); 1314 mark_inode_dirty(inode);
1315 } 1315 }
1316 } 1316 }
1317 1317
1318 /* 1318 /*
1319 * We need to pick up the new inode size which generic_commit_write gave us 1319 * We need to pick up the new inode size which generic_commit_write gave us
1320 * `file' can be NULL - eg, when called from page_symlink(). 1320 * `file' can be NULL - eg, when called from page_symlink().
1321 * 1321 *
1322 * ext3 never places buffers on inode->i_mapping->private_list. Metadata 1322 * ext3 never places buffers on inode->i_mapping->private_list. Metadata
1323 * buffers are managed internally. 1323 * buffers are managed internally.
1324 */ 1324 */
1325 static int ext3_ordered_write_end(struct file *file, 1325 static int ext3_ordered_write_end(struct file *file,
1326 struct address_space *mapping, 1326 struct address_space *mapping,
1327 loff_t pos, unsigned len, unsigned copied, 1327 loff_t pos, unsigned len, unsigned copied,
1328 struct page *page, void *fsdata) 1328 struct page *page, void *fsdata)
1329 { 1329 {
1330 handle_t *handle = ext3_journal_current_handle(); 1330 handle_t *handle = ext3_journal_current_handle();
1331 struct inode *inode = file->f_mapping->host; 1331 struct inode *inode = file->f_mapping->host;
1332 unsigned from, to; 1332 unsigned from, to;
1333 int ret = 0, ret2; 1333 int ret = 0, ret2;
1334 1334
1335 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 1335 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1336 1336
1337 from = pos & (PAGE_CACHE_SIZE - 1); 1337 from = pos & (PAGE_CACHE_SIZE - 1);
1338 to = from + copied; 1338 to = from + copied;
1339 ret = walk_page_buffers(handle, page_buffers(page), 1339 ret = walk_page_buffers(handle, page_buffers(page),
1340 from, to, NULL, journal_dirty_data_fn); 1340 from, to, NULL, journal_dirty_data_fn);
1341 1341
1342 if (ret == 0) 1342 if (ret == 0)
1343 update_file_sizes(inode, pos, copied); 1343 update_file_sizes(inode, pos, copied);
1344 /* 1344 /*
1345 * There may be allocated blocks outside of i_size because 1345 * There may be allocated blocks outside of i_size because
1346 * we failed to copy some data. Prepare for truncate. 1346 * we failed to copy some data. Prepare for truncate.
1347 */ 1347 */
1348 if (pos + len > inode->i_size && ext3_can_truncate(inode)) 1348 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1349 ext3_orphan_add(handle, inode); 1349 ext3_orphan_add(handle, inode);
1350 ret2 = ext3_journal_stop(handle); 1350 ret2 = ext3_journal_stop(handle);
1351 if (!ret) 1351 if (!ret)
1352 ret = ret2; 1352 ret = ret2;
1353 unlock_page(page); 1353 unlock_page(page);
1354 page_cache_release(page); 1354 page_cache_release(page);
1355 1355
1356 if (pos + len > inode->i_size) 1356 if (pos + len > inode->i_size)
1357 ext3_truncate_failed_write(inode); 1357 ext3_truncate_failed_write(inode);
1358 return ret ? ret : copied; 1358 return ret ? ret : copied;
1359 } 1359 }
1360 1360
1361 static int ext3_writeback_write_end(struct file *file, 1361 static int ext3_writeback_write_end(struct file *file,
1362 struct address_space *mapping, 1362 struct address_space *mapping,
1363 loff_t pos, unsigned len, unsigned copied, 1363 loff_t pos, unsigned len, unsigned copied,
1364 struct page *page, void *fsdata) 1364 struct page *page, void *fsdata)
1365 { 1365 {
1366 handle_t *handle = ext3_journal_current_handle(); 1366 handle_t *handle = ext3_journal_current_handle();
1367 struct inode *inode = file->f_mapping->host; 1367 struct inode *inode = file->f_mapping->host;
1368 int ret; 1368 int ret;
1369 1369
1370 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 1370 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1371 update_file_sizes(inode, pos, copied); 1371 update_file_sizes(inode, pos, copied);
1372 /* 1372 /*
1373 * There may be allocated blocks outside of i_size because 1373 * There may be allocated blocks outside of i_size because
1374 * we failed to copy some data. Prepare for truncate. 1374 * we failed to copy some data. Prepare for truncate.
1375 */ 1375 */
1376 if (pos + len > inode->i_size && ext3_can_truncate(inode)) 1376 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1377 ext3_orphan_add(handle, inode); 1377 ext3_orphan_add(handle, inode);
1378 ret = ext3_journal_stop(handle); 1378 ret = ext3_journal_stop(handle);
1379 unlock_page(page); 1379 unlock_page(page);
1380 page_cache_release(page); 1380 page_cache_release(page);
1381 1381
1382 if (pos + len > inode->i_size) 1382 if (pos + len > inode->i_size)
1383 ext3_truncate_failed_write(inode); 1383 ext3_truncate_failed_write(inode);
1384 return ret ? ret : copied; 1384 return ret ? ret : copied;
1385 } 1385 }
1386 1386
1387 static int ext3_journalled_write_end(struct file *file, 1387 static int ext3_journalled_write_end(struct file *file,
1388 struct address_space *mapping, 1388 struct address_space *mapping,
1389 loff_t pos, unsigned len, unsigned copied, 1389 loff_t pos, unsigned len, unsigned copied,
1390 struct page *page, void *fsdata) 1390 struct page *page, void *fsdata)
1391 { 1391 {
1392 handle_t *handle = ext3_journal_current_handle(); 1392 handle_t *handle = ext3_journal_current_handle();
1393 struct inode *inode = mapping->host; 1393 struct inode *inode = mapping->host;
1394 int ret = 0, ret2; 1394 int ret = 0, ret2;
1395 int partial = 0; 1395 int partial = 0;
1396 unsigned from, to; 1396 unsigned from, to;
1397 1397
1398 from = pos & (PAGE_CACHE_SIZE - 1); 1398 from = pos & (PAGE_CACHE_SIZE - 1);
1399 to = from + len; 1399 to = from + len;
1400 1400
1401 if (copied < len) { 1401 if (copied < len) {
1402 if (!PageUptodate(page)) 1402 if (!PageUptodate(page))
1403 copied = 0; 1403 copied = 0;
1404 page_zero_new_buffers(page, from + copied, to); 1404 page_zero_new_buffers(page, from + copied, to);
1405 to = from + copied; 1405 to = from + copied;
1406 } 1406 }
1407 1407
1408 ret = walk_page_buffers(handle, page_buffers(page), from, 1408 ret = walk_page_buffers(handle, page_buffers(page), from,
1409 to, &partial, write_end_fn); 1409 to, &partial, write_end_fn);
1410 if (!partial) 1410 if (!partial)
1411 SetPageUptodate(page); 1411 SetPageUptodate(page);
1412 1412
1413 if (pos + copied > inode->i_size) 1413 if (pos + copied > inode->i_size)
1414 i_size_write(inode, pos + copied); 1414 i_size_write(inode, pos + copied);
1415 /* 1415 /*
1416 * There may be allocated blocks outside of i_size because 1416 * There may be allocated blocks outside of i_size because
1417 * we failed to copy some data. Prepare for truncate. 1417 * we failed to copy some data. Prepare for truncate.
1418 */ 1418 */
1419 if (pos + len > inode->i_size && ext3_can_truncate(inode)) 1419 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1420 ext3_orphan_add(handle, inode); 1420 ext3_orphan_add(handle, inode);
1421 ext3_set_inode_state(inode, EXT3_STATE_JDATA); 1421 ext3_set_inode_state(inode, EXT3_STATE_JDATA);
1422 if (inode->i_size > EXT3_I(inode)->i_disksize) { 1422 if (inode->i_size > EXT3_I(inode)->i_disksize) {
1423 EXT3_I(inode)->i_disksize = inode->i_size; 1423 EXT3_I(inode)->i_disksize = inode->i_size;
1424 ret2 = ext3_mark_inode_dirty(handle, inode); 1424 ret2 = ext3_mark_inode_dirty(handle, inode);
1425 if (!ret) 1425 if (!ret)
1426 ret = ret2; 1426 ret = ret2;
1427 } 1427 }
1428 1428
1429 ret2 = ext3_journal_stop(handle); 1429 ret2 = ext3_journal_stop(handle);
1430 if (!ret) 1430 if (!ret)
1431 ret = ret2; 1431 ret = ret2;
1432 unlock_page(page); 1432 unlock_page(page);
1433 page_cache_release(page); 1433 page_cache_release(page);
1434 1434
1435 if (pos + len > inode->i_size) 1435 if (pos + len > inode->i_size)
1436 ext3_truncate_failed_write(inode); 1436 ext3_truncate_failed_write(inode);
1437 return ret ? ret : copied; 1437 return ret ? ret : copied;
1438 } 1438 }
1439 1439
1440 /* 1440 /*
1441 * bmap() is special. It gets used by applications such as lilo and by 1441 * bmap() is special. It gets used by applications such as lilo and by
1442 * the swapper to find the on-disk block of a specific piece of data. 1442 * the swapper to find the on-disk block of a specific piece of data.
1443 * 1443 *
1444 * Naturally, this is dangerous if the block concerned is still in the 1444 * Naturally, this is dangerous if the block concerned is still in the
1445 * journal. If somebody makes a swapfile on an ext3 data-journaling 1445 * journal. If somebody makes a swapfile on an ext3 data-journaling
1446 * filesystem and enables swap, then they may get a nasty shock when the 1446 * filesystem and enables swap, then they may get a nasty shock when the
1447 * data getting swapped to that swapfile suddenly gets overwritten by 1447 * data getting swapped to that swapfile suddenly gets overwritten by
1448 * the original zeros written out previously to the journal and 1448 * the original zeros written out previously to the journal and
1449 * awaiting writeback in the kernel's buffer cache. 1449 * awaiting writeback in the kernel's buffer cache.
1450 * 1450 *
1451 * So, if we see any bmap calls here on a modified, data-journaled file, 1451 * So, if we see any bmap calls here on a modified, data-journaled file,
1452 * take extra steps to flush any blocks which might be in the cache. 1452 * take extra steps to flush any blocks which might be in the cache.
1453 */ 1453 */
1454 static sector_t ext3_bmap(struct address_space *mapping, sector_t block) 1454 static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1455 { 1455 {
1456 struct inode *inode = mapping->host; 1456 struct inode *inode = mapping->host;
1457 journal_t *journal; 1457 journal_t *journal;
1458 int err; 1458 int err;
1459 1459
1460 if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) { 1460 if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) {
1461 /* 1461 /*
1462 * This is a REALLY heavyweight approach, but the use of 1462 * This is a REALLY heavyweight approach, but the use of
1463 * bmap on dirty files is expected to be extremely rare: 1463 * bmap on dirty files is expected to be extremely rare:
1464 * only if we run lilo or swapon on a freshly made file 1464 * only if we run lilo or swapon on a freshly made file
1465 * do we expect this to happen. 1465 * do we expect this to happen.
1466 * 1466 *
1467 * (bmap requires CAP_SYS_RAWIO so this does not 1467 * (bmap requires CAP_SYS_RAWIO so this does not
1468 * represent an unprivileged user DOS attack --- we'd be 1468 * represent an unprivileged user DOS attack --- we'd be
1469 * in trouble if mortal users could trigger this path at 1469 * in trouble if mortal users could trigger this path at
1470 * will.) 1470 * will.)
1471 * 1471 *
1472 * NB. EXT3_STATE_JDATA is not set on files other than 1472 * NB. EXT3_STATE_JDATA is not set on files other than
1473 * regular files. If somebody wants to bmap a directory 1473 * regular files. If somebody wants to bmap a directory
1474 * or symlink and gets confused because the buffer 1474 * or symlink and gets confused because the buffer
1475 * hasn't yet been flushed to disk, they deserve 1475 * hasn't yet been flushed to disk, they deserve
1476 * everything they get. 1476 * everything they get.
1477 */ 1477 */
1478 1478
1479 ext3_clear_inode_state(inode, EXT3_STATE_JDATA); 1479 ext3_clear_inode_state(inode, EXT3_STATE_JDATA);
1480 journal = EXT3_JOURNAL(inode); 1480 journal = EXT3_JOURNAL(inode);
1481 journal_lock_updates(journal); 1481 journal_lock_updates(journal);
1482 err = journal_flush(journal); 1482 err = journal_flush(journal);
1483 journal_unlock_updates(journal); 1483 journal_unlock_updates(journal);
1484 1484
1485 if (err) 1485 if (err)
1486 return 0; 1486 return 0;
1487 } 1487 }
1488 1488
1489 return generic_block_bmap(mapping,block,ext3_get_block); 1489 return generic_block_bmap(mapping,block,ext3_get_block);
1490 } 1490 }
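For reference, lilo-style users reach this path through the FIBMAP ioctl. A hypothetical userspace sketch (it needs CAP_SYS_RAWIO, as the comment above notes):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	int fd, block = 0;	/* logical block 0 of the file */

	if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, FIBMAP, &block) < 0) {	/* ends up in ->bmap() */
		perror("FIBMAP");
		return 1;
	}
	printf("logical block 0 is on-disk block %d\n", block);
	return 0;
}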
1491 1491
1492 static int bget_one(handle_t *handle, struct buffer_head *bh) 1492 static int bget_one(handle_t *handle, struct buffer_head *bh)
1493 { 1493 {
1494 get_bh(bh); 1494 get_bh(bh);
1495 return 0; 1495 return 0;
1496 } 1496 }
1497 1497
1498 static int bput_one(handle_t *handle, struct buffer_head *bh) 1498 static int bput_one(handle_t *handle, struct buffer_head *bh)
1499 { 1499 {
1500 put_bh(bh); 1500 put_bh(bh);
1501 return 0; 1501 return 0;
1502 } 1502 }
1503 1503
1504 static int buffer_unmapped(handle_t *handle, struct buffer_head *bh) 1504 static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
1505 { 1505 {
1506 return !buffer_mapped(bh); 1506 return !buffer_mapped(bh);
1507 } 1507 }
1508 1508
1509 /* 1509 /*
1510 * Note that we always start a transaction even if we're not journalling 1510 * Note that we always start a transaction even if we're not journalling
1511 * data. This is to preserve ordering: any hole instantiation within 1511 * data. This is to preserve ordering: any hole instantiation within
1512 * __block_write_full_page -> ext3_get_block() should be journalled 1512 * __block_write_full_page -> ext3_get_block() should be journalled
1513 * along with the data so we don't crash and then get metadata which 1513 * along with the data so we don't crash and then get metadata which
1514 * refers to old data. 1514 * refers to old data.
1515 * 1515 *
1516 * In all journalling modes block_write_full_page() will start the I/O. 1516 * In all journalling modes block_write_full_page() will start the I/O.
1517 * 1517 *
1518 * Problem: 1518 * Problem:
1519 * 1519 *
1520 * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> 1520 * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1521 * ext3_writepage() 1521 * ext3_writepage()
1522 * 1522 *
1523 * Similar for: 1523 * Similar for:
1524 * 1524 *
1525 * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ... 1525 * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1526 * 1526 *
1527 * Same applies to ext3_get_block(). We will deadlock on various things like 1527 * Same applies to ext3_get_block(). We will deadlock on various things like
1528 * lock_journal and i_truncate_mutex. 1528 * lock_journal and i_truncate_mutex.
1529 * 1529 *
1530 * Setting PF_MEMALLOC here doesn't work - too many internal memory 1530 * Setting PF_MEMALLOC here doesn't work - too many internal memory
1531 * allocations fail. 1531 * allocations fail.
1532 * 1532 *
1533 * 16May01: If we're reentered then journal_current_handle() will be 1533 * 16May01: If we're reentered then journal_current_handle() will be
1534 * non-zero. We simply *return*. 1534 * non-zero. We simply *return*.
1535 * 1535 *
1536 * 1 July 2001: @@@ FIXME: 1536 * 1 July 2001: @@@ FIXME:
1537 * In journalled data mode, a data buffer may be metadata against the 1537 * In journalled data mode, a data buffer may be metadata against the
1538 * current transaction. But the same file is part of a shared mapping 1538 * current transaction. But the same file is part of a shared mapping
1539 * and someone does a writepage() on it. 1539 * and someone does a writepage() on it.
1540 * 1540 *
1541 * We will move the buffer onto the async_data list, but *after* it has 1541 * We will move the buffer onto the async_data list, but *after* it has
1542 * been dirtied. So there's a small window where we have dirty data on 1542 * been dirtied. So there's a small window where we have dirty data on
1543 * BJ_Metadata. 1543 * BJ_Metadata.
1544 * 1544 *
1545 * Note that this only applies to the last partial page in the file. The 1545 * Note that this only applies to the last partial page in the file. The
1546 * bit which block_write_full_page() uses prepare/commit for. (That's 1546 * bit which block_write_full_page() uses prepare/commit for. (That's
1547 * broken code anyway: it's wrong for msync()). 1547 * broken code anyway: it's wrong for msync()).
1548 * 1548 *
1549 * It's a rare case: affects the final partial page, for journalled data 1549 * It's a rare case: affects the final partial page, for journalled data
1550 * where the file is subject to both write() and writepage() in the same 1550 * where the file is subject to both write() and writepage() in the same
1551 * transaction. To fix it we'll need a custom block_write_full_page(). 1551 * transaction. To fix it we'll need a custom block_write_full_page().
1552 * We'll probably need that anyway for journalling writepage() output. 1552 * We'll probably need that anyway for journalling writepage() output.
1553 * 1553 *
1554 * We don't honour synchronous mounts for writepage(). That would be 1554 * We don't honour synchronous mounts for writepage(). That would be
1555 * disastrous. Any write() or metadata operation will sync the fs for 1555 * disastrous. Any write() or metadata operation will sync the fs for
1556 * us. 1556 * us.
1557 * 1557 *
1558 * AKPM2: if all the page's buffers are mapped to disk and !data=journal, 1558 * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1559 * we don't need to open a transaction here. 1559 * we don't need to open a transaction here.
1560 */ 1560 */
1561 static int ext3_ordered_writepage(struct page *page, 1561 static int ext3_ordered_writepage(struct page *page,
1562 struct writeback_control *wbc) 1562 struct writeback_control *wbc)
1563 { 1563 {
1564 struct inode *inode = page->mapping->host; 1564 struct inode *inode = page->mapping->host;
1565 struct buffer_head *page_bufs; 1565 struct buffer_head *page_bufs;
1566 handle_t *handle = NULL; 1566 handle_t *handle = NULL;
1567 int ret = 0; 1567 int ret = 0;
1568 int err; 1568 int err;
1569 1569
1570 J_ASSERT(PageLocked(page)); 1570 J_ASSERT(PageLocked(page));
1571 WARN_ON_ONCE(IS_RDONLY(inode)); 1571 WARN_ON_ONCE(IS_RDONLY(inode));
1572 1572
1573 /* 1573 /*
1574 * We give up here if we're reentered, because it might be for a 1574 * We give up here if we're reentered, because it might be for a
1575 * different filesystem. 1575 * different filesystem.
1576 */ 1576 */
1577 if (ext3_journal_current_handle()) 1577 if (ext3_journal_current_handle())
1578 goto out_fail; 1578 goto out_fail;
1579 1579
1580 if (!page_has_buffers(page)) { 1580 if (!page_has_buffers(page)) {
1581 create_empty_buffers(page, inode->i_sb->s_blocksize, 1581 create_empty_buffers(page, inode->i_sb->s_blocksize,
1582 (1 << BH_Dirty)|(1 << BH_Uptodate)); 1582 (1 << BH_Dirty)|(1 << BH_Uptodate));
1583 page_bufs = page_buffers(page); 1583 page_bufs = page_buffers(page);
1584 } else { 1584 } else {
1585 page_bufs = page_buffers(page); 1585 page_bufs = page_buffers(page);
1586 if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE, 1586 if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE,
1587 NULL, buffer_unmapped)) { 1587 NULL, buffer_unmapped)) {
1588 /* Provide NULL get_block() to catch bugs if buffers 1588 /* Provide NULL get_block() to catch bugs if buffers
1589 * weren't really mapped */ 1589 * weren't really mapped */
1590 return block_write_full_page(page, NULL, wbc); 1590 return block_write_full_page(page, NULL, wbc);
1591 } 1591 }
1592 } 1592 }
1593 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); 1593 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1594 1594
1595 if (IS_ERR(handle)) { 1595 if (IS_ERR(handle)) {
1596 ret = PTR_ERR(handle); 1596 ret = PTR_ERR(handle);
1597 goto out_fail; 1597 goto out_fail;
1598 } 1598 }
1599 1599
1600 walk_page_buffers(handle, page_bufs, 0, 1600 walk_page_buffers(handle, page_bufs, 0,
1601 PAGE_CACHE_SIZE, NULL, bget_one); 1601 PAGE_CACHE_SIZE, NULL, bget_one);
1602 1602
1603 ret = block_write_full_page(page, ext3_get_block, wbc); 1603 ret = block_write_full_page(page, ext3_get_block, wbc);
1604 1604
1605 /* 1605 /*
1606 * The page can become unlocked at any point now, and 1606 * The page can become unlocked at any point now, and
1607 * truncate can then come in and change things. So we 1607 * truncate can then come in and change things. So we
1608 * can't touch *page from now on. But *page_bufs is 1608 * can't touch *page from now on. But *page_bufs is
1609 * safe due to elevated refcount. 1609 * safe due to elevated refcount.
1610 */ 1610 */
1611 1611
1612 /* 1612 /*
1613 * And attach them to the current transaction. But only if 1613 * And attach them to the current transaction. But only if
1614 * block_write_full_page() succeeded. Otherwise they are unmapped, 1614 * block_write_full_page() succeeded. Otherwise they are unmapped,
1615 * and generally junk. 1615 * and generally junk.
1616 */ 1616 */
1617 if (ret == 0) { 1617 if (ret == 0) {
1618 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, 1618 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1619 NULL, journal_dirty_data_fn); 1619 NULL, journal_dirty_data_fn);
1620 if (!ret) 1620 if (!ret)
1621 ret = err; 1621 ret = err;
1622 } 1622 }
1623 walk_page_buffers(handle, page_bufs, 0, 1623 walk_page_buffers(handle, page_bufs, 0,
1624 PAGE_CACHE_SIZE, NULL, bput_one); 1624 PAGE_CACHE_SIZE, NULL, bput_one);
1625 err = ext3_journal_stop(handle); 1625 err = ext3_journal_stop(handle);
1626 if (!ret) 1626 if (!ret)
1627 ret = err; 1627 ret = err;
1628 return ret; 1628 return ret;
1629 1629
1630 out_fail: 1630 out_fail:
1631 redirty_page_for_writepage(wbc, page); 1631 redirty_page_for_writepage(wbc, page);
1632 unlock_page(page); 1632 unlock_page(page);
1633 return ret; 1633 return ret;
1634 } 1634 }
1635 1635
1636 static int ext3_writeback_writepage(struct page *page, 1636 static int ext3_writeback_writepage(struct page *page,
1637 struct writeback_control *wbc) 1637 struct writeback_control *wbc)
1638 { 1638 {
1639 struct inode *inode = page->mapping->host; 1639 struct inode *inode = page->mapping->host;
1640 handle_t *handle = NULL; 1640 handle_t *handle = NULL;
1641 int ret = 0; 1641 int ret = 0;
1642 int err; 1642 int err;
1643 1643
1644 J_ASSERT(PageLocked(page)); 1644 J_ASSERT(PageLocked(page));
1645 WARN_ON_ONCE(IS_RDONLY(inode)); 1645 WARN_ON_ONCE(IS_RDONLY(inode));
1646 1646
1647 if (ext3_journal_current_handle()) 1647 if (ext3_journal_current_handle())
1648 goto out_fail; 1648 goto out_fail;
1649 1649
1650 if (page_has_buffers(page)) { 1650 if (page_has_buffers(page)) {
1651 if (!walk_page_buffers(NULL, page_buffers(page), 0, 1651 if (!walk_page_buffers(NULL, page_buffers(page), 0,
1652 PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { 1652 PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
1653 /* Provide NULL get_block() to catch bugs if buffers 1653 /* Provide NULL get_block() to catch bugs if buffers
1654 * weren't really mapped */ 1654 * weren't really mapped */
1655 return block_write_full_page(page, NULL, wbc); 1655 return block_write_full_page(page, NULL, wbc);
1656 } 1656 }
1657 } 1657 }
1658 1658
1659 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); 1659 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1660 if (IS_ERR(handle)) { 1660 if (IS_ERR(handle)) {
1661 ret = PTR_ERR(handle); 1661 ret = PTR_ERR(handle);
1662 goto out_fail; 1662 goto out_fail;
1663 } 1663 }
1664 1664
1665 ret = block_write_full_page(page, ext3_get_block, wbc); 1665 ret = block_write_full_page(page, ext3_get_block, wbc);
1666 1666
1667 err = ext3_journal_stop(handle); 1667 err = ext3_journal_stop(handle);
1668 if (!ret) 1668 if (!ret)
1669 ret = err; 1669 ret = err;
1670 return ret; 1670 return ret;
1671 1671
1672 out_fail: 1672 out_fail:
1673 redirty_page_for_writepage(wbc, page); 1673 redirty_page_for_writepage(wbc, page);
1674 unlock_page(page); 1674 unlock_page(page);
1675 return ret; 1675 return ret;
1676 } 1676 }
1677 1677
1678 static int ext3_journalled_writepage(struct page *page, 1678 static int ext3_journalled_writepage(struct page *page,
1679 struct writeback_control *wbc) 1679 struct writeback_control *wbc)
1680 { 1680 {
1681 struct inode *inode = page->mapping->host; 1681 struct inode *inode = page->mapping->host;
1682 handle_t *handle = NULL; 1682 handle_t *handle = NULL;
1683 int ret = 0; 1683 int ret = 0;
1684 int err; 1684 int err;
1685 1685
1686 J_ASSERT(PageLocked(page)); 1686 J_ASSERT(PageLocked(page));
1687 WARN_ON_ONCE(IS_RDONLY(inode)); 1687 WARN_ON_ONCE(IS_RDONLY(inode));
1688 1688
1689 if (ext3_journal_current_handle()) 1689 if (ext3_journal_current_handle())
1690 goto no_write; 1690 goto no_write;
1691 1691
1692 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); 1692 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1693 if (IS_ERR(handle)) { 1693 if (IS_ERR(handle)) {
1694 ret = PTR_ERR(handle); 1694 ret = PTR_ERR(handle);
1695 goto no_write; 1695 goto no_write;
1696 } 1696 }
1697 1697
1698 if (!page_has_buffers(page) || PageChecked(page)) { 1698 if (!page_has_buffers(page) || PageChecked(page)) {
1699 /* 1699 /*
1700 * It's mmapped pagecache. Add buffers and journal it. There 1700 * It's mmapped pagecache. Add buffers and journal it. There
1701 * doesn't seem much point in redirtying the page here. 1701 * doesn't seem much point in redirtying the page here.
1702 */ 1702 */
1703 ClearPageChecked(page); 1703 ClearPageChecked(page);
1704 ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE, 1704 ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE,
1705 ext3_get_block); 1705 ext3_get_block);
1706 if (ret != 0) { 1706 if (ret != 0) {
1707 ext3_journal_stop(handle); 1707 ext3_journal_stop(handle);
1708 goto out_unlock; 1708 goto out_unlock;
1709 } 1709 }
1710 ret = walk_page_buffers(handle, page_buffers(page), 0, 1710 ret = walk_page_buffers(handle, page_buffers(page), 0,
1711 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); 1711 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1712 1712
1713 err = walk_page_buffers(handle, page_buffers(page), 0, 1713 err = walk_page_buffers(handle, page_buffers(page), 0,
1714 PAGE_CACHE_SIZE, NULL, write_end_fn); 1714 PAGE_CACHE_SIZE, NULL, write_end_fn);
1715 if (ret == 0) 1715 if (ret == 0)
1716 ret = err; 1716 ret = err;
1717 ext3_set_inode_state(inode, EXT3_STATE_JDATA); 1717 ext3_set_inode_state(inode, EXT3_STATE_JDATA);
1718 unlock_page(page); 1718 unlock_page(page);
1719 } else { 1719 } else {
1720 /* 1720 /*
1721 * It may be a page full of checkpoint-mode buffers. We don't 1721 * It may be a page full of checkpoint-mode buffers. We don't
1722 * really know unless we go poke around in the buffer_heads. 1722 * really know unless we go poke around in the buffer_heads.
1723 * But block_write_full_page will do the right thing. 1723 * But block_write_full_page will do the right thing.
1724 */ 1724 */
1725 ret = block_write_full_page(page, ext3_get_block, wbc); 1725 ret = block_write_full_page(page, ext3_get_block, wbc);
1726 } 1726 }
1727 err = ext3_journal_stop(handle); 1727 err = ext3_journal_stop(handle);
1728 if (!ret) 1728 if (!ret)
1729 ret = err; 1729 ret = err;
1730 out: 1730 out:
1731 return ret; 1731 return ret;
1732 1732
1733 no_write: 1733 no_write:
1734 redirty_page_for_writepage(wbc, page); 1734 redirty_page_for_writepage(wbc, page);
1735 out_unlock: 1735 out_unlock:
1736 unlock_page(page); 1736 unlock_page(page);
1737 goto out; 1737 goto out;
1738 } 1738 }
1739 1739
1740 static int ext3_readpage(struct file *file, struct page *page) 1740 static int ext3_readpage(struct file *file, struct page *page)
1741 { 1741 {
1742 return mpage_readpage(page, ext3_get_block); 1742 return mpage_readpage(page, ext3_get_block);
1743 } 1743 }
1744 1744
1745 static int 1745 static int
1746 ext3_readpages(struct file *file, struct address_space *mapping, 1746 ext3_readpages(struct file *file, struct address_space *mapping,
1747 struct list_head *pages, unsigned nr_pages) 1747 struct list_head *pages, unsigned nr_pages)
1748 { 1748 {
1749 return mpage_readpages(mapping, pages, nr_pages, ext3_get_block); 1749 return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
1750 } 1750 }
1751 1751
1752 static void ext3_invalidatepage(struct page *page, unsigned long offset) 1752 static void ext3_invalidatepage(struct page *page, unsigned long offset)
1753 { 1753 {
1754 journal_t *journal = EXT3_JOURNAL(page->mapping->host); 1754 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1755 1755
1756 /* 1756 /*
1757 * If it's a full truncate we just forget about the pending dirtying 1757 * If it's a full truncate we just forget about the pending dirtying
1758 */ 1758 */
1759 if (offset == 0) 1759 if (offset == 0)
1760 ClearPageChecked(page); 1760 ClearPageChecked(page);
1761 1761
1762 journal_invalidatepage(journal, page, offset); 1762 journal_invalidatepage(journal, page, offset);
1763 } 1763 }
1764 1764
1765 static int ext3_releasepage(struct page *page, gfp_t wait) 1765 static int ext3_releasepage(struct page *page, gfp_t wait)
1766 { 1766 {
1767 journal_t *journal = EXT3_JOURNAL(page->mapping->host); 1767 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1768 1768
1769 WARN_ON(PageChecked(page)); 1769 WARN_ON(PageChecked(page));
1770 if (!page_has_buffers(page)) 1770 if (!page_has_buffers(page))
1771 return 0; 1771 return 0;
1772 return journal_try_to_free_buffers(journal, page, wait); 1772 return journal_try_to_free_buffers(journal, page, wait);
1773 } 1773 }
1774 1774
1775 /* 1775 /*
1776 * If the O_DIRECT write will extend the file then add this inode to the 1776 * If the O_DIRECT write will extend the file then add this inode to the
1777 * orphan list. So recovery will truncate it back to the original size 1777 * orphan list. So recovery will truncate it back to the original size
1778 * if the machine crashes during the write. 1778 * if the machine crashes during the write.
1779 * 1779 *
1780 * If the O_DIRECT write is instantiating holes inside i_size and the machine 1780 * If the O_DIRECT write is instantiating holes inside i_size and the machine
1781 * crashes then stale disk data _may_ be exposed inside the file. But current 1781 * crashes then stale disk data _may_ be exposed inside the file. But current
1782 * VFS code falls back into buffered path in that case so we are safe. 1782 * VFS code falls back into buffered path in that case so we are safe.
1783 */ 1783 */
1784 static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, 1784 static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1785 const struct iovec *iov, loff_t offset, 1785 const struct iovec *iov, loff_t offset,
1786 unsigned long nr_segs) 1786 unsigned long nr_segs)
1787 { 1787 {
1788 struct file *file = iocb->ki_filp; 1788 struct file *file = iocb->ki_filp;
1789 struct inode *inode = file->f_mapping->host; 1789 struct inode *inode = file->f_mapping->host;
1790 struct ext3_inode_info *ei = EXT3_I(inode); 1790 struct ext3_inode_info *ei = EXT3_I(inode);
1791 handle_t *handle; 1791 handle_t *handle;
1792 ssize_t ret; 1792 ssize_t ret;
1793 int orphan = 0; 1793 int orphan = 0;
1794 size_t count = iov_length(iov, nr_segs); 1794 size_t count = iov_length(iov, nr_segs);
1795 int retries = 0; 1795 int retries = 0;
1796 1796
1797 if (rw == WRITE) { 1797 if (rw == WRITE) {
1798 loff_t final_size = offset + count; 1798 loff_t final_size = offset + count;
1799 1799
1800 if (final_size > inode->i_size) { 1800 if (final_size > inode->i_size) {
1801 /* Credits for sb + inode write */ 1801 /* Credits for sb + inode write */
1802 handle = ext3_journal_start(inode, 2); 1802 handle = ext3_journal_start(inode, 2);
1803 if (IS_ERR(handle)) { 1803 if (IS_ERR(handle)) {
1804 ret = PTR_ERR(handle); 1804 ret = PTR_ERR(handle);
1805 goto out; 1805 goto out;
1806 } 1806 }
1807 ret = ext3_orphan_add(handle, inode); 1807 ret = ext3_orphan_add(handle, inode);
1808 if (ret) { 1808 if (ret) {
1809 ext3_journal_stop(handle); 1809 ext3_journal_stop(handle);
1810 goto out; 1810 goto out;
1811 } 1811 }
1812 orphan = 1; 1812 orphan = 1;
1813 ei->i_disksize = inode->i_size; 1813 ei->i_disksize = inode->i_size;
1814 ext3_journal_stop(handle); 1814 ext3_journal_stop(handle);
1815 } 1815 }
1816 } 1816 }
1817 1817
1818 retry: 1818 retry:
1819 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 1819 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
1820 offset, nr_segs, 1820 offset, nr_segs,
1821 ext3_get_block, NULL); 1821 ext3_get_block, NULL);
1822 /* 1822 /*
1823 * In case of error extending write may have instantiated a few 1823 * In case of error extending write may have instantiated a few
1824 * blocks outside i_size. Trim these off again. 1824 * blocks outside i_size. Trim these off again.
1825 */ 1825 */
1826 if (unlikely((rw & WRITE) && ret < 0)) { 1826 if (unlikely((rw & WRITE) && ret < 0)) {
1827 loff_t isize = i_size_read(inode); 1827 loff_t isize = i_size_read(inode);
1828 loff_t end = offset + iov_length(iov, nr_segs); 1828 loff_t end = offset + iov_length(iov, nr_segs);
1829 1829
1830 if (end > isize) 1830 if (end > isize)
1831 vmtruncate(inode, isize); 1831 vmtruncate(inode, isize);
1832 } 1832 }
1833 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) 1833 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1834 goto retry; 1834 goto retry;
1835 1835
1836 if (orphan) { 1836 if (orphan) {
1837 int err; 1837 int err;
1838 1838
1839 /* Credits for sb + inode write */ 1839 /* Credits for sb + inode write */
1840 handle = ext3_journal_start(inode, 2); 1840 handle = ext3_journal_start(inode, 2);
1841 if (IS_ERR(handle)) { 1841 if (IS_ERR(handle)) {
1842 /* This is really bad luck. We've written the data 1842 /* This is really bad luck. We've written the data
1843 * but cannot extend i_size. Truncate allocated blocks 1843 * but cannot extend i_size. Truncate allocated blocks
1844 * and pretend the write failed... */ 1844 * and pretend the write failed... */
1845 ext3_truncate(inode); 1845 ext3_truncate(inode);
1846 ret = PTR_ERR(handle); 1846 ret = PTR_ERR(handle);
1847 goto out; 1847 goto out;
1848 } 1848 }
1849 if (inode->i_nlink) 1849 if (inode->i_nlink)
1850 ext3_orphan_del(handle, inode); 1850 ext3_orphan_del(handle, inode);
1851 if (ret > 0) { 1851 if (ret > 0) {
1852 loff_t end = offset + ret; 1852 loff_t end = offset + ret;
1853 if (end > inode->i_size) { 1853 if (end > inode->i_size) {
1854 ei->i_disksize = end; 1854 ei->i_disksize = end;
1855 i_size_write(inode, end); 1855 i_size_write(inode, end);
1856 /* 1856 /*
1857 * We're going to return a positive `ret' 1857 * We're going to return a positive `ret'
1858 * here due to non-zero-length I/O, so there's 1858 * here due to non-zero-length I/O, so there's
1859 * no way of reporting error returns from 1859 * no way of reporting error returns from
1860 * ext3_mark_inode_dirty() to userspace. So 1860 * ext3_mark_inode_dirty() to userspace. So
1861 * ignore it. 1861 * ignore it.
1862 */ 1862 */
1863 ext3_mark_inode_dirty(handle, inode); 1863 ext3_mark_inode_dirty(handle, inode);
1864 } 1864 }
1865 } 1865 }
1866 err = ext3_journal_stop(handle); 1866 err = ext3_journal_stop(handle);
1867 if (ret == 0) 1867 if (ret == 0)
1868 ret = err; 1868 ret = err;
1869 } 1869 }
1870 out: 1870 out:
1871 return ret; 1871 return ret;
1872 } 1872 }
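
A minimal userspace sketch of the byte accounting above, mirroring what iov_length() computes over an iovec array (buffer sizes and the 4096 offset are illustrative, not from this commit):

	#include <stdio.h>
	#include <sys/uio.h>

	/* Sum the segment lengths of a vectored write, as iov_length() does. */
	static size_t total_len(const struct iovec *iov, unsigned long nr_segs)
	{
		size_t count = 0;
		unsigned long seg;

		for (seg = 0; seg < nr_segs; seg++)
			count += iov[seg].iov_len;
		return count;
	}

	int main(void)
	{
		char a[100], b[412];
		struct iovec iov[2] = {
			{ .iov_base = a, .iov_len = sizeof(a) },
			{ .iov_base = b, .iov_len = sizeof(b) },
		};

		/*
		 * An O_DIRECT write of these segments at offset 4096 gives
		 * final_size = 4096 + 512; if that exceeds i_size, the inode
		 * is orphan-protected before the I/O is issued.
		 */
		printf("count = %zu\n", total_len(iov, 2));
		return 0;
	}

This prints count = 512; the orphan-list entry only matters for the extending case, which is why it is added and removed around the I/O rather than unconditionally.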
1873 1873
1874 /* 1874 /*
1875 * Pages can be marked dirty completely asynchronously from ext3's journalling 1875 * Pages can be marked dirty completely asynchronously from ext3's journalling
1876 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 1876 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
1877 * much here because ->set_page_dirty is called under VFS locks. The page is 1877 * much here because ->set_page_dirty is called under VFS locks. The page is
1878 * not necessarily locked. 1878 * not necessarily locked.
1879 * 1879 *
1880 * We cannot just dirty the page and leave attached buffers clean, because the 1880 * We cannot just dirty the page and leave attached buffers clean, because the
1881 * buffers' dirty state is "definitive". We cannot just set the buffers dirty 1881 * buffers' dirty state is "definitive". We cannot just set the buffers dirty
1882 * or jbddirty because all the journalling code will explode. 1882 * or jbddirty because all the journalling code will explode.
1883 * 1883 *
1884 * So what we do is to mark the page "pending dirty" and next time writepage 1884 * So what we do is to mark the page "pending dirty" and next time writepage
1885 * is called, propagate that into the buffers appropriately. 1885 * is called, propagate that into the buffers appropriately.
1886 */ 1886 */
1887 static int ext3_journalled_set_page_dirty(struct page *page) 1887 static int ext3_journalled_set_page_dirty(struct page *page)
1888 { 1888 {
1889 SetPageChecked(page); 1889 SetPageChecked(page);
1890 return __set_page_dirty_nobuffers(page); 1890 return __set_page_dirty_nobuffers(page);
1891 } 1891 }
1892 1892
1893 static const struct address_space_operations ext3_ordered_aops = { 1893 static const struct address_space_operations ext3_ordered_aops = {
1894 .readpage = ext3_readpage, 1894 .readpage = ext3_readpage,
1895 .readpages = ext3_readpages, 1895 .readpages = ext3_readpages,
1896 .writepage = ext3_ordered_writepage, 1896 .writepage = ext3_ordered_writepage,
1897 .write_begin = ext3_write_begin, 1897 .write_begin = ext3_write_begin,
1898 .write_end = ext3_ordered_write_end, 1898 .write_end = ext3_ordered_write_end,
1899 .bmap = ext3_bmap, 1899 .bmap = ext3_bmap,
1900 .invalidatepage = ext3_invalidatepage, 1900 .invalidatepage = ext3_invalidatepage,
1901 .releasepage = ext3_releasepage, 1901 .releasepage = ext3_releasepage,
1902 .direct_IO = ext3_direct_IO, 1902 .direct_IO = ext3_direct_IO,
1903 .migratepage = buffer_migrate_page, 1903 .migratepage = buffer_migrate_page,
1904 .is_partially_uptodate = block_is_partially_uptodate, 1904 .is_partially_uptodate = block_is_partially_uptodate,
1905 .error_remove_page = generic_error_remove_page, 1905 .error_remove_page = generic_error_remove_page,
1906 }; 1906 };
1907 1907
1908 static const struct address_space_operations ext3_writeback_aops = { 1908 static const struct address_space_operations ext3_writeback_aops = {
1909 .readpage = ext3_readpage, 1909 .readpage = ext3_readpage,
1910 .readpages = ext3_readpages, 1910 .readpages = ext3_readpages,
1911 .writepage = ext3_writeback_writepage, 1911 .writepage = ext3_writeback_writepage,
1912 .write_begin = ext3_write_begin, 1912 .write_begin = ext3_write_begin,
1913 .write_end = ext3_writeback_write_end, 1913 .write_end = ext3_writeback_write_end,
1914 .bmap = ext3_bmap, 1914 .bmap = ext3_bmap,
1915 .invalidatepage = ext3_invalidatepage, 1915 .invalidatepage = ext3_invalidatepage,
1916 .releasepage = ext3_releasepage, 1916 .releasepage = ext3_releasepage,
1917 .direct_IO = ext3_direct_IO, 1917 .direct_IO = ext3_direct_IO,
1918 .migratepage = buffer_migrate_page, 1918 .migratepage = buffer_migrate_page,
1919 .is_partially_uptodate = block_is_partially_uptodate, 1919 .is_partially_uptodate = block_is_partially_uptodate,
1920 .error_remove_page = generic_error_remove_page, 1920 .error_remove_page = generic_error_remove_page,
1921 }; 1921 };
1922 1922
1923 static const struct address_space_operations ext3_journalled_aops = { 1923 static const struct address_space_operations ext3_journalled_aops = {
1924 .readpage = ext3_readpage, 1924 .readpage = ext3_readpage,
1925 .readpages = ext3_readpages, 1925 .readpages = ext3_readpages,
1926 .writepage = ext3_journalled_writepage, 1926 .writepage = ext3_journalled_writepage,
1927 .write_begin = ext3_write_begin, 1927 .write_begin = ext3_write_begin,
1928 .write_end = ext3_journalled_write_end, 1928 .write_end = ext3_journalled_write_end,
1929 .set_page_dirty = ext3_journalled_set_page_dirty, 1929 .set_page_dirty = ext3_journalled_set_page_dirty,
1930 .bmap = ext3_bmap, 1930 .bmap = ext3_bmap,
1931 .invalidatepage = ext3_invalidatepage, 1931 .invalidatepage = ext3_invalidatepage,
1932 .releasepage = ext3_releasepage, 1932 .releasepage = ext3_releasepage,
1933 .is_partially_uptodate = block_is_partially_uptodate, 1933 .is_partially_uptodate = block_is_partially_uptodate,
1934 .error_remove_page = generic_error_remove_page, 1934 .error_remove_page = generic_error_remove_page,
1935 }; 1935 };
1936 1936
1937 void ext3_set_aops(struct inode *inode) 1937 void ext3_set_aops(struct inode *inode)
1938 { 1938 {
1939 if (ext3_should_order_data(inode)) 1939 if (ext3_should_order_data(inode))
1940 inode->i_mapping->a_ops = &ext3_ordered_aops; 1940 inode->i_mapping->a_ops = &ext3_ordered_aops;
1941 else if (ext3_should_writeback_data(inode)) 1941 else if (ext3_should_writeback_data(inode))
1942 inode->i_mapping->a_ops = &ext3_writeback_aops; 1942 inode->i_mapping->a_ops = &ext3_writeback_aops;
1943 else 1943 else
1944 inode->i_mapping->a_ops = &ext3_journalled_aops; 1944 inode->i_mapping->a_ops = &ext3_journalled_aops;
1945 } 1945 }
1946 1946
1947 /* 1947 /*
1948 * ext3_block_truncate_page() zeroes out a mapping from file offset `from' 1948 * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
1949 * up to the end of the block which corresponds to `from'. 1949 * up to the end of the block which corresponds to `from'.
1950 * This is required during truncate. We need to physically zero the tail end 1950 * This is required during truncate. We need to physically zero the tail end
1951 * of that block so it doesn't yield old data if the file is later grown. 1951 * of that block so it doesn't yield old data if the file is later grown.
1952 */ 1952 */
1953 static int ext3_block_truncate_page(handle_t *handle, struct page *page, 1953 static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1954 struct address_space *mapping, loff_t from) 1954 struct address_space *mapping, loff_t from)
1955 { 1955 {
1956 ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; 1956 ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT;
1957 unsigned offset = from & (PAGE_CACHE_SIZE-1); 1957 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1958 unsigned blocksize, iblock, length, pos; 1958 unsigned blocksize, iblock, length, pos;
1959 struct inode *inode = mapping->host; 1959 struct inode *inode = mapping->host;
1960 struct buffer_head *bh; 1960 struct buffer_head *bh;
1961 int err = 0; 1961 int err = 0;
1962 1962
1963 blocksize = inode->i_sb->s_blocksize; 1963 blocksize = inode->i_sb->s_blocksize;
1964 length = blocksize - (offset & (blocksize - 1)); 1964 length = blocksize - (offset & (blocksize - 1));
1965 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 1965 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1966 1966
1967 if (!page_has_buffers(page)) 1967 if (!page_has_buffers(page))
1968 create_empty_buffers(page, blocksize, 0); 1968 create_empty_buffers(page, blocksize, 0);
1969 1969
1970 /* Find the buffer that contains "offset" */ 1970 /* Find the buffer that contains "offset" */
1971 bh = page_buffers(page); 1971 bh = page_buffers(page);
1972 pos = blocksize; 1972 pos = blocksize;
1973 while (offset >= pos) { 1973 while (offset >= pos) {
1974 bh = bh->b_this_page; 1974 bh = bh->b_this_page;
1975 iblock++; 1975 iblock++;
1976 pos += blocksize; 1976 pos += blocksize;
1977 } 1977 }
1978 1978
1979 err = 0; 1979 err = 0;
1980 if (buffer_freed(bh)) { 1980 if (buffer_freed(bh)) {
1981 BUFFER_TRACE(bh, "freed: skip"); 1981 BUFFER_TRACE(bh, "freed: skip");
1982 goto unlock; 1982 goto unlock;
1983 } 1983 }
1984 1984
1985 if (!buffer_mapped(bh)) { 1985 if (!buffer_mapped(bh)) {
1986 BUFFER_TRACE(bh, "unmapped"); 1986 BUFFER_TRACE(bh, "unmapped");
1987 ext3_get_block(inode, iblock, bh, 0); 1987 ext3_get_block(inode, iblock, bh, 0);
1988 /* unmapped? It's a hole - nothing to do */ 1988 /* unmapped? It's a hole - nothing to do */
1989 if (!buffer_mapped(bh)) { 1989 if (!buffer_mapped(bh)) {
1990 BUFFER_TRACE(bh, "still unmapped"); 1990 BUFFER_TRACE(bh, "still unmapped");
1991 goto unlock; 1991 goto unlock;
1992 } 1992 }
1993 } 1993 }
1994 1994
1995 /* Ok, it's mapped. Make sure it's up-to-date */ 1995 /* Ok, it's mapped. Make sure it's up-to-date */
1996 if (PageUptodate(page)) 1996 if (PageUptodate(page))
1997 set_buffer_uptodate(bh); 1997 set_buffer_uptodate(bh);
1998 1998
1999 if (!buffer_uptodate(bh)) { 1999 if (!buffer_uptodate(bh)) {
2000 err = -EIO; 2000 err = -EIO;
2001 ll_rw_block(READ, 1, &bh); 2001 ll_rw_block(READ, 1, &bh);
2002 wait_on_buffer(bh); 2002 wait_on_buffer(bh);
2003 /* Uhhuh. Read error. Complain and punt. */ 2003 /* Uhhuh. Read error. Complain and punt. */
2004 if (!buffer_uptodate(bh)) 2004 if (!buffer_uptodate(bh))
2005 goto unlock; 2005 goto unlock;
2006 } 2006 }
2007 2007
2008 if (ext3_should_journal_data(inode)) { 2008 if (ext3_should_journal_data(inode)) {
2009 BUFFER_TRACE(bh, "get write access"); 2009 BUFFER_TRACE(bh, "get write access");
2010 err = ext3_journal_get_write_access(handle, bh); 2010 err = ext3_journal_get_write_access(handle, bh);
2011 if (err) 2011 if (err)
2012 goto unlock; 2012 goto unlock;
2013 } 2013 }
2014 2014
2015 zero_user(page, offset, length); 2015 zero_user(page, offset, length);
2016 BUFFER_TRACE(bh, "zeroed end of block"); 2016 BUFFER_TRACE(bh, "zeroed end of block");
2017 2017
2018 err = 0; 2018 err = 0;
2019 if (ext3_should_journal_data(inode)) { 2019 if (ext3_should_journal_data(inode)) {
2020 err = ext3_journal_dirty_metadata(handle, bh); 2020 err = ext3_journal_dirty_metadata(handle, bh);
2021 } else { 2021 } else {
2022 if (ext3_should_order_data(inode)) 2022 if (ext3_should_order_data(inode))
2023 err = ext3_journal_dirty_data(handle, bh); 2023 err = ext3_journal_dirty_data(handle, bh);
2024 mark_buffer_dirty(bh); 2024 mark_buffer_dirty(bh);
2025 } 2025 }
2026 2026
2027 unlock: 2027 unlock:
2028 unlock_page(page); 2028 unlock_page(page);
2029 page_cache_release(page); 2029 page_cache_release(page);
2030 return err; 2030 return err;
2031 } 2031 }
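
The tail-zeroing arithmetic is easiest to see with concrete numbers. A standalone sketch, assuming a 4096-byte page and 1024-byte blocks (both values illustrative):

	#include <stdio.h>

	#define PAGE_SIZE  4096UL
	#define BLOCKSIZE  1024UL

	int main(void)
	{
		unsigned long from = 5000;	/* new i_size, mid-block */
		unsigned long index = from / PAGE_SIZE;		/* page index */
		unsigned long offset = from & (PAGE_SIZE - 1);	/* offset in page */
		unsigned long length = BLOCKSIZE - (offset & (BLOCKSIZE - 1));

		/*
		 * Bytes [offset, offset + length) of page `index' are zeroed,
		 * so the partial final block carries no stale data past i_size.
		 */
		printf("page %lu: zero %lu bytes at offset %lu\n",
		       index, length, offset);
		return 0;
	}

For from = 5000 this prints "page 1: zero 120 bytes at offset 904", i.e. file bytes 5000..5119, the tail of the final 1K block.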
2032 2032
2033 /* 2033 /*
2034 * Probably it should be a library function... search for first non-zero word 2034 * Probably it should be a library function... search for first non-zero word
2035 * or memcmp with zero_page, whatever is better for particular architecture. 2035 * or memcmp with zero_page, whatever is better for particular architecture.
2036 * Linus? 2036 * Linus?
2037 */ 2037 */
2038 static inline int all_zeroes(__le32 *p, __le32 *q) 2038 static inline int all_zeroes(__le32 *p, __le32 *q)
2039 { 2039 {
2040 while (p < q) 2040 while (p < q)
2041 if (*p++) 2041 if (*p++)
2042 return 0; 2042 return 0;
2043 return 1; 2043 return 1;
2044 } 2044 }
2045 2045
2046 /** 2046 /**
2047 * ext3_find_shared - find the indirect blocks for partial truncation. 2047 * ext3_find_shared - find the indirect blocks for partial truncation.
2048 * @inode: inode in question 2048 * @inode: inode in question
2049 * @depth: depth of the affected branch 2049 * @depth: depth of the affected branch
2050 * @offsets: offsets of pointers in that branch (see ext3_block_to_path) 2050 * @offsets: offsets of pointers in that branch (see ext3_block_to_path)
2051 * @chain: place to store the pointers to partial indirect blocks 2051 * @chain: place to store the pointers to partial indirect blocks
2052 * @top: place to the (detached) top of branch 2052 * @top: place to the (detached) top of branch
2053 * 2053 *
2054 * This is a helper function used by ext3_truncate(). 2054 * This is a helper function used by ext3_truncate().
2055 * 2055 *
2056 * When we do truncate() we may have to clean the ends of several 2056 * When we do truncate() we may have to clean the ends of several
2057 * indirect blocks but leave the blocks themselves alive. Block is 2057 * indirect blocks but leave the blocks themselves alive. Block is
2058 * partially truncated if some data below the new i_size is referred 2058 * partially truncated if some data below the new i_size is referred
2059 * from it (and it is on the path to the first completely truncated 2059 * from it (and it is on the path to the first completely truncated
2060 * data block, indeed). We have to free the top of that path along 2060 * data block, indeed). We have to free the top of that path along
2061 * with everything to the right of the path. Since no allocation 2061 * with everything to the right of the path. Since no allocation
2062 * past the truncation point is possible until ext3_truncate() 2062 * past the truncation point is possible until ext3_truncate()
2063 * finishes, we may safely do the latter, but top of branch may 2063 * finishes, we may safely do the latter, but top of branch may
2064 * require special attention - pageout below the truncation point 2064 * require special attention - pageout below the truncation point
2065 * might try to populate it. 2065 * might try to populate it.
2066 * 2066 *
2067 * We atomically detach the top of branch from the tree, store the 2067 * We atomically detach the top of branch from the tree, store the
2068 * block number of its root in *@top, pointers to buffer_heads of 2068 * block number of its root in *@top, pointers to buffer_heads of
2069 * partially truncated blocks - in @chain[].bh and pointers to 2069 * partially truncated blocks - in @chain[].bh and pointers to
2070 * their last elements that should not be removed - in 2070 * their last elements that should not be removed - in
2071 * @chain[].p. Return value is the pointer to last filled element 2071 * @chain[].p. Return value is the pointer to last filled element
2072 * of @chain. 2072 * of @chain.
2073 * 2073 *
2074 * The work left to the caller is the actual freeing of subtrees: 2074 * The work left to the caller is the actual freeing of subtrees:
2075 * a) free the subtree starting from *@top 2075 * a) free the subtree starting from *@top
2076 * b) free the subtrees whose roots are stored in 2076 * b) free the subtrees whose roots are stored in
2077 * (@chain[i].p+1 .. end of @chain[i].bh->b_data) 2077 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
2078 * c) free the subtrees growing from the inode past the @chain[0]. 2078 * c) free the subtrees growing from the inode past the @chain[0].
2079 * (no partially truncated stuff there). */ 2079 * (no partially truncated stuff there). */
2080 2080
2081 static Indirect *ext3_find_shared(struct inode *inode, int depth, 2081 static Indirect *ext3_find_shared(struct inode *inode, int depth,
2082 int offsets[4], Indirect chain[4], __le32 *top) 2082 int offsets[4], Indirect chain[4], __le32 *top)
2083 { 2083 {
2084 Indirect *partial, *p; 2084 Indirect *partial, *p;
2085 int k, err; 2085 int k, err;
2086 2086
2087 *top = 0; 2087 *top = 0;
2088 /* Make k index the deepest non-null offset + 1 */ 2088 /* Make k index the deepest non-null offset + 1 */
2089 for (k = depth; k > 1 && !offsets[k-1]; k--) 2089 for (k = depth; k > 1 && !offsets[k-1]; k--)
2090 ; 2090 ;
2091 partial = ext3_get_branch(inode, k, offsets, chain, &err); 2091 partial = ext3_get_branch(inode, k, offsets, chain, &err);
2092 /* Writer: pointers */ 2092 /* Writer: pointers */
2093 if (!partial) 2093 if (!partial)
2094 partial = chain + k-1; 2094 partial = chain + k-1;
2095 /* 2095 /*
2096 * If the branch acquired continuation since we've looked at it - 2096 * If the branch acquired continuation since we've looked at it -
2097 * fine, it should all survive and (new) top doesn't belong to us. 2097 * fine, it should all survive and (new) top doesn't belong to us.
2098 */ 2098 */
2099 if (!partial->key && *partial->p) 2099 if (!partial->key && *partial->p)
2100 /* Writer: end */ 2100 /* Writer: end */
2101 goto no_top; 2101 goto no_top;
2102 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) 2102 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
2103 ; 2103 ;
2104 /* 2104 /*
2105 * OK, we've found the last block that must survive. The rest of our 2105 * OK, we've found the last block that must survive. The rest of our
2106 * branch should be detached before unlocking. However, if that rest 2106 * branch should be detached before unlocking. However, if that rest
2107 * of branch is all ours and does not grow immediately from the inode 2107 * of branch is all ours and does not grow immediately from the inode
2108 * it's easier to cheat and just decrement partial->p. 2108 * it's easier to cheat and just decrement partial->p.
2109 */ 2109 */
2110 if (p == chain + k - 1 && p > chain) { 2110 if (p == chain + k - 1 && p > chain) {
2111 p->p--; 2111 p->p--;
2112 } else { 2112 } else {
2113 *top = *p->p; 2113 *top = *p->p;
2114 /* Nope, don't do this in ext3. Must leave the tree intact */ 2114 /* Nope, don't do this in ext3. Must leave the tree intact */
2115 #if 0 2115 #if 0
2116 *p->p = 0; 2116 *p->p = 0;
2117 #endif 2117 #endif
2118 } 2118 }
2119 /* Writer: end */ 2119 /* Writer: end */
2120 2120
2121 while(partial > p) { 2121 while(partial > p) {
2122 brelse(partial->bh); 2122 brelse(partial->bh);
2123 partial--; 2123 partial--;
2124 } 2124 }
2125 no_top: 2125 no_top:
2126 return partial; 2126 return partial;
2127 } 2127 }
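
The offsets[] array that ext3_find_shared() walks comes from ext3_block_to_path(). A userspace sketch of that decomposition, assuming 12 direct slots and 256 pointers per indirect block (a 1K block size); the slot constants 12/13/14 stand in for EXT3_IND_BLOCK, EXT3_DIND_BLOCK and EXT3_TIND_BLOCK:

	#include <stdio.h>

	#define NDIR 12UL	/* direct blocks in the inode */
	#define PTRS 256UL	/* block pointers per indirect block (1K bs) */

	static int block_to_path(unsigned long i_block, unsigned long offsets[4])
	{
		int n = 0;

		if (i_block < NDIR) {
			offsets[n++] = i_block;
		} else if ((i_block -= NDIR) < PTRS) {
			offsets[n++] = 12;		/* single indirect */
			offsets[n++] = i_block;
		} else if ((i_block -= PTRS) < PTRS * PTRS) {
			offsets[n++] = 13;		/* double indirect */
			offsets[n++] = i_block / PTRS;
			offsets[n++] = i_block % PTRS;
		} else {
			i_block -= PTRS * PTRS;
			offsets[n++] = 14;		/* triple indirect */
			offsets[n++] = i_block / (PTRS * PTRS);
			offsets[n++] = (i_block / PTRS) % PTRS;
			offsets[n++] = i_block % PTRS;
		}
		return n;	/* depth of the branch */
	}

	int main(void)
	{
		unsigned long offsets[4];
		int i, n = block_to_path(70000, offsets);

		for (i = 0; i < n; i++)
			printf("level %d -> slot %lu\n", i, offsets[i]);
		return 0;
	}

For block 70000 this yields the triple-indirect path 14/0/16/100; @chain[] holds the buffer_heads read while following exactly such a path.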
2128 2128
2129 /* 2129 /*
2130 * Zero a number of block pointers in either an inode or an indirect block. 2130 * Zero a number of block pointers in either an inode or an indirect block.
2131 * If we restart the transaction we must again get write access to the 2131 * If we restart the transaction we must again get write access to the
2132 * indirect block for further modification. 2132 * indirect block for further modification.
2133 * 2133 *
2134 * We release `count' blocks on disk, but (last - first) may be greater 2134 * We release `count' blocks on disk, but (last - first) may be greater
2135 * than `count' because there can be holes in there. 2135 * than `count' because there can be holes in there.
2136 */ 2136 */
2137 static void ext3_clear_blocks(handle_t *handle, struct inode *inode, 2137 static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
2138 struct buffer_head *bh, ext3_fsblk_t block_to_free, 2138 struct buffer_head *bh, ext3_fsblk_t block_to_free,
2139 unsigned long count, __le32 *first, __le32 *last) 2139 unsigned long count, __le32 *first, __le32 *last)
2140 { 2140 {
2141 __le32 *p; 2141 __le32 *p;
2142 if (try_to_extend_transaction(handle, inode)) { 2142 if (try_to_extend_transaction(handle, inode)) {
2143 if (bh) { 2143 if (bh) {
2144 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 2144 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2145 if (ext3_journal_dirty_metadata(handle, bh)) 2145 if (ext3_journal_dirty_metadata(handle, bh))
2146 return; 2146 return;
2147 } 2147 }
2148 ext3_mark_inode_dirty(handle, inode); 2148 ext3_mark_inode_dirty(handle, inode);
2149 truncate_restart_transaction(handle, inode); 2149 truncate_restart_transaction(handle, inode);
2150 if (bh) { 2150 if (bh) {
2151 BUFFER_TRACE(bh, "retaking write access"); 2151 BUFFER_TRACE(bh, "retaking write access");
2152 if (ext3_journal_get_write_access(handle, bh)) 2152 if (ext3_journal_get_write_access(handle, bh))
2153 return; 2153 return;
2154 } 2154 }
2155 } 2155 }
2156 2156
2157 /* 2157 /*
2158 * Any buffers which are on the journal will be in memory. We find 2158 * Any buffers which are on the journal will be in memory. We find
2159 * them on the hash table so journal_revoke() will run journal_forget() 2159 * them on the hash table so journal_revoke() will run journal_forget()
2160 * on them. We've already detached each block from the file, so 2160 * on them. We've already detached each block from the file, so
2161 * bforget() in journal_forget() should be safe. 2161 * bforget() in journal_forget() should be safe.
2162 * 2162 *
2163 * AKPM: turn on bforget in journal_forget()!!! 2163 * AKPM: turn on bforget in journal_forget()!!!
2164 */ 2164 */
2165 for (p = first; p < last; p++) { 2165 for (p = first; p < last; p++) {
2166 u32 nr = le32_to_cpu(*p); 2166 u32 nr = le32_to_cpu(*p);
2167 if (nr) { 2167 if (nr) {
2168 struct buffer_head *bh; 2168 struct buffer_head *bh;
2169 2169
2170 *p = 0; 2170 *p = 0;
2171 bh = sb_find_get_block(inode->i_sb, nr); 2171 bh = sb_find_get_block(inode->i_sb, nr);
2172 ext3_forget(handle, 0, inode, bh, nr); 2172 ext3_forget(handle, 0, inode, bh, nr);
2173 } 2173 }
2174 } 2174 }
2175 2175
2176 ext3_free_blocks(handle, inode, block_to_free, count); 2176 ext3_free_blocks(handle, inode, block_to_free, count);
2177 } 2177 }
2178 2178
2179 /** 2179 /**
2180 * ext3_free_data - free a list of data blocks 2180 * ext3_free_data - free a list of data blocks
2181 * @handle: handle for this transaction 2181 * @handle: handle for this transaction
2182 * @inode: inode we are dealing with 2182 * @inode: inode we are dealing with
2183 * @this_bh: indirect buffer_head which contains *@first and *@last 2183 * @this_bh: indirect buffer_head which contains *@first and *@last
2184 * @first: array of block numbers 2184 * @first: array of block numbers
2185 * @last: points immediately past the end of array 2185 * @last: points immediately past the end of array
2186 * 2186 *
2187 * We are freeing all blocks referred from that array (numbers are stored as 2187 * We are freeing all blocks referred from that array (numbers are stored as
2188 * little-endian 32-bit) and updating @inode->i_blocks appropriately. 2188 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
2189 * 2189 *
2190 * We accumulate contiguous runs of blocks to free. Conveniently, if these 2190 * We accumulate contiguous runs of blocks to free. Conveniently, if these
2191 * blocks are contiguous then releasing them at one time will only affect one 2191 * blocks are contiguous then releasing them at one time will only affect one
2192 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't 2192 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
2193 * actually use a lot of journal space. 2193 * actually use a lot of journal space.
2194 * 2194 *
2195 * @this_bh will be %NULL if @first and @last point into the inode's direct 2195 * @this_bh will be %NULL if @first and @last point into the inode's direct
2196 * block pointers. 2196 * block pointers.
2197 */ 2197 */
2198 static void ext3_free_data(handle_t *handle, struct inode *inode, 2198 static void ext3_free_data(handle_t *handle, struct inode *inode,
2199 struct buffer_head *this_bh, 2199 struct buffer_head *this_bh,
2200 __le32 *first, __le32 *last) 2200 __le32 *first, __le32 *last)
2201 { 2201 {
2202 ext3_fsblk_t block_to_free = 0; /* Starting block # of a run */ 2202 ext3_fsblk_t block_to_free = 0; /* Starting block # of a run */
2203 unsigned long count = 0; /* Number of blocks in the run */ 2203 unsigned long count = 0; /* Number of blocks in the run */
2204 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind 2204 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
2205 corresponding to 2205 corresponding to
2206 block_to_free */ 2206 block_to_free */
2207 ext3_fsblk_t nr; /* Current block # */ 2207 ext3_fsblk_t nr; /* Current block # */
2208 __le32 *p; /* Pointer into inode/ind 2208 __le32 *p; /* Pointer into inode/ind
2209 for current block */ 2209 for current block */
2210 int err; 2210 int err;
2211 2211
2212 if (this_bh) { /* For indirect block */ 2212 if (this_bh) { /* For indirect block */
2213 BUFFER_TRACE(this_bh, "get_write_access"); 2213 BUFFER_TRACE(this_bh, "get_write_access");
2214 err = ext3_journal_get_write_access(handle, this_bh); 2214 err = ext3_journal_get_write_access(handle, this_bh);
2215 /* Important: if we can't update the indirect pointers 2215 /* Important: if we can't update the indirect pointers
2216 * to the blocks, we can't free them. */ 2216 * to the blocks, we can't free them. */
2217 if (err) 2217 if (err)
2218 return; 2218 return;
2219 } 2219 }
2220 2220
2221 for (p = first; p < last; p++) { 2221 for (p = first; p < last; p++) {
2222 nr = le32_to_cpu(*p); 2222 nr = le32_to_cpu(*p);
2223 if (nr) { 2223 if (nr) {
2224 /* accumulate blocks to free if they're contiguous */ 2224 /* accumulate blocks to free if they're contiguous */
2225 if (count == 0) { 2225 if (count == 0) {
2226 block_to_free = nr; 2226 block_to_free = nr;
2227 block_to_free_p = p; 2227 block_to_free_p = p;
2228 count = 1; 2228 count = 1;
2229 } else if (nr == block_to_free + count) { 2229 } else if (nr == block_to_free + count) {
2230 count++; 2230 count++;
2231 } else { 2231 } else {
2232 ext3_clear_blocks(handle, inode, this_bh, 2232 ext3_clear_blocks(handle, inode, this_bh,
2233 block_to_free, 2233 block_to_free,
2234 count, block_to_free_p, p); 2234 count, block_to_free_p, p);
2235 block_to_free = nr; 2235 block_to_free = nr;
2236 block_to_free_p = p; 2236 block_to_free_p = p;
2237 count = 1; 2237 count = 1;
2238 } 2238 }
2239 } 2239 }
2240 } 2240 }
2241 2241
2242 if (count > 0) 2242 if (count > 0)
2243 ext3_clear_blocks(handle, inode, this_bh, block_to_free, 2243 ext3_clear_blocks(handle, inode, this_bh, block_to_free,
2244 count, block_to_free_p, p); 2244 count, block_to_free_p, p);
2245 2245
2246 if (this_bh) { 2246 if (this_bh) {
2247 BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata"); 2247 BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
2248 2248
2249 /* 2249 /*
2250 * The buffer head should have an attached journal head at this 2250 * The buffer head should have an attached journal head at this
2251 * point. However, if the data is corrupted and an indirect 2251 * point. However, if the data is corrupted and an indirect
2252 * block pointed to itself, it would have been detached when 2252 * block pointed to itself, it would have been detached when
2253 * the block was cleared. Check for this instead of OOPSing. 2253 * the block was cleared. Check for this instead of OOPSing.
2254 */ 2254 */
2255 if (bh2jh(this_bh)) 2255 if (bh2jh(this_bh))
2256 ext3_journal_dirty_metadata(handle, this_bh); 2256 ext3_journal_dirty_metadata(handle, this_bh);
2257 else 2257 else
2258 ext3_error(inode->i_sb, "ext3_free_data", 2258 ext3_error(inode->i_sb, "ext3_free_data",
2259 "circular indirect block detected, " 2259 "circular indirect block detected, "
2260 "inode=%lu, block=%llu", 2260 "inode=%lu, block=%llu",
2261 inode->i_ino, 2261 inode->i_ino,
2262 (unsigned long long)this_bh->b_blocknr); 2262 (unsigned long long)this_bh->b_blocknr);
2263 } 2263 }
2264 } 2264 }
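
The run accumulation described above, as a self-contained userspace sketch (the block numbers are made up, and free_run() stands in for ext3_clear_blocks()):

	#include <stdio.h>

	static void free_run(unsigned long start, unsigned long count)
	{
		printf("free %lu block(s) starting at %lu\n", count, start);
	}

	int main(void)
	{
		/* Sample pointer array; zeros are holes and are skipped. */
		unsigned long blocks[] = { 100, 101, 102, 0, 200, 201, 57 };
		unsigned long start = 0, count = 0;
		unsigned int i;

		for (i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++) {
			unsigned long nr = blocks[i];

			if (!nr)
				continue;
			if (count == 0) {
				start = nr;
				count = 1;
			} else if (nr == start + count) {
				count++;	/* extends the current run */
			} else {
				free_run(start, count);
				start = nr;
				count = 1;
			}
		}
		if (count > 0)
			free_run(start, count);
		return 0;
	}

This emits three runs (100..102, 200..201, 57). Batching adjacent blocks means one bitmap update typically covers a whole run, which is why the journal credits stay small.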
2265 2265
2266 /** 2266 /**
2267 * ext3_free_branches - free an array of branches 2267 * ext3_free_branches - free an array of branches
2268 * @handle: JBD handle for this transaction 2268 * @handle: JBD handle for this transaction
2269 * @inode: inode we are dealing with 2269 * @inode: inode we are dealing with
2270 * @parent_bh: the buffer_head which contains *@first and *@last 2270 * @parent_bh: the buffer_head which contains *@first and *@last
2271 * @first: array of block numbers 2271 * @first: array of block numbers
2272 * @last: pointer immediately past the end of array 2272 * @last: pointer immediately past the end of array
2273 * @depth: depth of the branches to free 2273 * @depth: depth of the branches to free
2274 * 2274 *
2275 * We are freeing all blocks referred from these branches (numbers are 2275 * We are freeing all blocks referred from these branches (numbers are
2276 * stored as little-endian 32-bit) and updating @inode->i_blocks 2276 * stored as little-endian 32-bit) and updating @inode->i_blocks
2277 * appropriately. 2277 * appropriately.
2278 */ 2278 */
2279 static void ext3_free_branches(handle_t *handle, struct inode *inode, 2279 static void ext3_free_branches(handle_t *handle, struct inode *inode,
2280 struct buffer_head *parent_bh, 2280 struct buffer_head *parent_bh,
2281 __le32 *first, __le32 *last, int depth) 2281 __le32 *first, __le32 *last, int depth)
2282 { 2282 {
2283 ext3_fsblk_t nr; 2283 ext3_fsblk_t nr;
2284 __le32 *p; 2284 __le32 *p;
2285 2285
2286 if (is_handle_aborted(handle)) 2286 if (is_handle_aborted(handle))
2287 return; 2287 return;
2288 2288
2289 if (depth--) { 2289 if (depth--) {
2290 struct buffer_head *bh; 2290 struct buffer_head *bh;
2291 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); 2291 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2292 p = last; 2292 p = last;
2293 while (--p >= first) { 2293 while (--p >= first) {
2294 nr = le32_to_cpu(*p); 2294 nr = le32_to_cpu(*p);
2295 if (!nr) 2295 if (!nr)
2296 continue; /* A hole */ 2296 continue; /* A hole */
2297 2297
2298 /* Go read the buffer for the next level down */ 2298 /* Go read the buffer for the next level down */
2299 bh = sb_bread(inode->i_sb, nr); 2299 bh = sb_bread(inode->i_sb, nr);
2300 2300
2301 /* 2301 /*
2302 * A read failure? Report error and clear slot 2302 * A read failure? Report error and clear slot
2303 * (should be rare). 2303 * (should be rare).
2304 */ 2304 */
2305 if (!bh) { 2305 if (!bh) {
2306 ext3_error(inode->i_sb, "ext3_free_branches", 2306 ext3_error(inode->i_sb, "ext3_free_branches",
2307 "Read failure, inode=%lu, block="E3FSBLK, 2307 "Read failure, inode=%lu, block="E3FSBLK,
2308 inode->i_ino, nr); 2308 inode->i_ino, nr);
2309 continue; 2309 continue;
2310 } 2310 }
2311 2311
2312 /* This zaps the entire block. Bottom up. */ 2312 /* This zaps the entire block. Bottom up. */
2313 BUFFER_TRACE(bh, "free child branches"); 2313 BUFFER_TRACE(bh, "free child branches");
2314 ext3_free_branches(handle, inode, bh, 2314 ext3_free_branches(handle, inode, bh,
2315 (__le32*)bh->b_data, 2315 (__le32*)bh->b_data,
2316 (__le32*)bh->b_data + addr_per_block, 2316 (__le32*)bh->b_data + addr_per_block,
2317 depth); 2317 depth);
2318 2318
2319 /* 2319 /*
2320 * Everything below this pointer has been 2320 * Everything below this pointer has been
2321 * released. Now let this top-of-subtree go. 2321 * released. Now let this top-of-subtree go.
2322 * 2322 *
2323 * We want the freeing of this indirect block to be 2323 * We want the freeing of this indirect block to be
2324 * atomic in the journal with the updating of the 2324 * atomic in the journal with the updating of the
2325 * bitmap block which owns it. So make some room in 2325 * bitmap block which owns it. So make some room in
2326 * the journal. 2326 * the journal.
2327 * 2327 *
2328 * We zero the parent pointer *after* freeing its 2328 * We zero the parent pointer *after* freeing its
2329 * pointee in the bitmaps, so if extend_transaction() 2329 * pointee in the bitmaps, so if extend_transaction()
2330 * for some reason fails to put the bitmap changes and 2330 * for some reason fails to put the bitmap changes and
2331 * the release into the same transaction, recovery 2331 * the release into the same transaction, recovery
2332 * will merely complain about releasing a free block, 2332 * will merely complain about releasing a free block,
2333 * rather than leaking blocks. 2333 * rather than leaking blocks.
2334 */ 2334 */
2335 if (is_handle_aborted(handle)) 2335 if (is_handle_aborted(handle))
2336 return; 2336 return;
2337 if (try_to_extend_transaction(handle, inode)) { 2337 if (try_to_extend_transaction(handle, inode)) {
2338 ext3_mark_inode_dirty(handle, inode); 2338 ext3_mark_inode_dirty(handle, inode);
2339 truncate_restart_transaction(handle, inode); 2339 truncate_restart_transaction(handle, inode);
2340 } 2340 }
2341 2341
2342 /* 2342 /*
2343 * We've probably journalled the indirect block several 2343 * We've probably journalled the indirect block several
2344 * times during the truncate. But it's no longer 2344 * times during the truncate. But it's no longer
2345 * needed and we now drop it from the transaction via 2345 * needed and we now drop it from the transaction via
2346 * journal_revoke(). 2346 * journal_revoke().
2347 * 2347 *
2348 * That's easy if it's exclusively part of this 2348 * That's easy if it's exclusively part of this
2349 * transaction. But if it's part of the committing 2349 * transaction. But if it's part of the committing
2350 * transaction then journal_forget() will simply 2350 * transaction then journal_forget() will simply
2351 * brelse() it. That means that if the underlying 2351 * brelse() it. That means that if the underlying
2352 * block is reallocated in ext3_get_block(), 2352 * block is reallocated in ext3_get_block(),
2353 * unmap_underlying_metadata() will find this block 2353 * unmap_underlying_metadata() will find this block
2354 * and will try to get rid of it. damn, damn. Thus 2354 * and will try to get rid of it. damn, damn. Thus
2355 * we don't allow a block to be reallocated until 2355 * we don't allow a block to be reallocated until
2356 * a transaction freeing it has fully committed. 2356 * a transaction freeing it has fully committed.
2357 * 2357 *
2358 * We also have to make sure journal replay after a 2358 * We also have to make sure journal replay after a
2359 * crash does not overwrite non-journaled data blocks 2359 * crash does not overwrite non-journaled data blocks
2360 * with old metadata when the block got reallocated for 2360 * with old metadata when the block got reallocated for
2361 * data. Thus we have to store a revoke record for a 2361 * data. Thus we have to store a revoke record for a
2362 * block in the same transaction in which we free the 2362 * block in the same transaction in which we free the
2363 * block. 2363 * block.
2364 */ 2364 */
2365 ext3_forget(handle, 1, inode, bh, bh->b_blocknr); 2365 ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
2366 2366
2367 ext3_free_blocks(handle, inode, nr, 1); 2367 ext3_free_blocks(handle, inode, nr, 1);
2368 2368
2369 if (parent_bh) { 2369 if (parent_bh) {
2370 /* 2370 /*
2371 * The block which we have just freed is 2371 * The block which we have just freed is
2372 * pointed to by an indirect block: journal it 2372 * pointed to by an indirect block: journal it
2373 */ 2373 */
2374 BUFFER_TRACE(parent_bh, "get_write_access"); 2374 BUFFER_TRACE(parent_bh, "get_write_access");
2375 if (!ext3_journal_get_write_access(handle, 2375 if (!ext3_journal_get_write_access(handle,
2376 parent_bh)){ 2376 parent_bh)){
2377 *p = 0; 2377 *p = 0;
2378 BUFFER_TRACE(parent_bh, 2378 BUFFER_TRACE(parent_bh,
2379 "call ext3_journal_dirty_metadata"); 2379 "call ext3_journal_dirty_metadata");
2380 ext3_journal_dirty_metadata(handle, 2380 ext3_journal_dirty_metadata(handle,
2381 parent_bh); 2381 parent_bh);
2382 } 2382 }
2383 } 2383 }
2384 } 2384 }
2385 } else { 2385 } else {
2386 /* We have reached the bottom of the tree. */ 2386 /* We have reached the bottom of the tree. */
2387 BUFFER_TRACE(parent_bh, "free data blocks"); 2387 BUFFER_TRACE(parent_bh, "free data blocks");
2388 ext3_free_data(handle, inode, parent_bh, first, last); 2388 ext3_free_data(handle, inode, parent_bh, first, last);
2389 } 2389 }
2390 } 2390 }
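
A toy userspace model of this bottom-up, right-to-left order: every child of an indirect block is released before the block itself, so a crash mid-truncate never leaves the tree referencing freed blocks:

	#include <stdio.h>

	struct node {
		unsigned long nr;	/* block number */
		struct node *child[2];	/* toy fan-out; NULL is a hole */
	};

	static void free_branches(struct node *n, int depth)
	{
		if (!n)
			return;				/* a hole: nothing to do */
		if (depth) {				/* still an indirect level */
			free_branches(n->child[1], depth - 1);	/* right to left */
			free_branches(n->child[0], depth - 1);
		}
		printf("free block %lu\n", n->nr);	/* parent freed last */
	}

	int main(void)
	{
		struct node d1 = { 10, { NULL, NULL } };
		struct node d2 = { 11, { NULL, NULL } };
		struct node ind = { 5, { &d1, &d2 } };	/* indirect -> two data */

		free_branches(&ind, 1);
		return 0;
	}

Output is "free block 11", "free block 10", then "free block 5": data blocks go first, the indirect block that pointed at them goes last.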
2391 2391
2392 int ext3_can_truncate(struct inode *inode) 2392 int ext3_can_truncate(struct inode *inode)
2393 { 2393 {
2394 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 2394 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2395 return 0; 2395 return 0;
2396 if (S_ISREG(inode->i_mode)) 2396 if (S_ISREG(inode->i_mode))
2397 return 1; 2397 return 1;
2398 if (S_ISDIR(inode->i_mode)) 2398 if (S_ISDIR(inode->i_mode))
2399 return 1; 2399 return 1;
2400 if (S_ISLNK(inode->i_mode)) 2400 if (S_ISLNK(inode->i_mode))
2401 return !ext3_inode_is_fast_symlink(inode); 2401 return !ext3_inode_is_fast_symlink(inode);
2402 return 0; 2402 return 0;
2403 } 2403 }
2404 2404
2405 /* 2405 /*
2406 * ext3_truncate() 2406 * ext3_truncate()
2407 * 2407 *
2408 * We block out ext3_get_block() block instantiations across the entire 2408 * We block out ext3_get_block() block instantiations across the entire
2409 * transaction, and VFS/VM ensures that ext3_truncate() cannot run 2409 * transaction, and VFS/VM ensures that ext3_truncate() cannot run
2410 * simultaneously on behalf of the same inode. 2410 * simultaneously on behalf of the same inode.
2411 * 2411 *
2412 * As we work through the truncate and commit bits of it to the journal there 2412 * As we work through the truncate and commit bits of it to the journal there
2413 * is one core, guiding principle: the file's tree must always be consistent on 2413 * is one core, guiding principle: the file's tree must always be consistent on
2414 * disk. We must be able to restart the truncate after a crash. 2414 * disk. We must be able to restart the truncate after a crash.
2415 * 2415 *
2416 * The file's tree may be transiently inconsistent in memory (although it 2416 * The file's tree may be transiently inconsistent in memory (although it
2417 * probably isn't), but whenever we close off and commit a journal transaction, 2417 * probably isn't), but whenever we close off and commit a journal transaction,
2418 * the contents of (the filesystem + the journal) must be consistent and 2418 * the contents of (the filesystem + the journal) must be consistent and
2419 * restartable. It's pretty simple, really: bottom up, right to left (although 2419 * restartable. It's pretty simple, really: bottom up, right to left (although
2420 * left-to-right works OK too). 2420 * left-to-right works OK too).
2421 * 2421 *
2422 * Note that at recovery time, journal replay occurs *before* the restart of 2422 * Note that at recovery time, journal replay occurs *before* the restart of
2423 * truncate against the orphan inode list. 2423 * truncate against the orphan inode list.
2424 * 2424 *
2425 * The committed inode has the new, desired i_size (which is the same as 2425 * The committed inode has the new, desired i_size (which is the same as
2426 * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see 2426 * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see
2427 * that this inode's truncate did not complete and it will again call 2427 * that this inode's truncate did not complete and it will again call
2428 * ext3_truncate() to have another go. So there will be instantiated blocks 2428 * ext3_truncate() to have another go. So there will be instantiated blocks
2429 * to the right of the truncation point in a crashed ext3 filesystem. But 2429 * to the right of the truncation point in a crashed ext3 filesystem. But
2430 * that's fine - as long as they are linked from the inode, the post-crash 2430 * that's fine - as long as they are linked from the inode, the post-crash
2431 * ext3_truncate() run will find them and release them. 2431 * ext3_truncate() run will find them and release them.
2432 */ 2432 */
2433 void ext3_truncate(struct inode *inode) 2433 void ext3_truncate(struct inode *inode)
2434 { 2434 {
2435 handle_t *handle; 2435 handle_t *handle;
2436 struct ext3_inode_info *ei = EXT3_I(inode); 2436 struct ext3_inode_info *ei = EXT3_I(inode);
2437 __le32 *i_data = ei->i_data; 2437 __le32 *i_data = ei->i_data;
2438 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); 2438 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2439 struct address_space *mapping = inode->i_mapping; 2439 struct address_space *mapping = inode->i_mapping;
2440 int offsets[4]; 2440 int offsets[4];
2441 Indirect chain[4]; 2441 Indirect chain[4];
2442 Indirect *partial; 2442 Indirect *partial;
2443 __le32 nr = 0; 2443 __le32 nr = 0;
2444 int n; 2444 int n;
2445 long last_block; 2445 long last_block;
2446 unsigned blocksize = inode->i_sb->s_blocksize; 2446 unsigned blocksize = inode->i_sb->s_blocksize;
2447 struct page *page; 2447 struct page *page;
2448 2448
2449 if (!ext3_can_truncate(inode)) 2449 if (!ext3_can_truncate(inode))
2450 goto out_notrans; 2450 goto out_notrans;
2451 2451
2452 if (inode->i_size == 0 && ext3_should_writeback_data(inode)) 2452 if (inode->i_size == 0 && ext3_should_writeback_data(inode))
2453 ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); 2453 ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
2454 2454
2455 /* 2455 /*
2456 * We have to lock the EOF page here, because lock_page() nests 2456 * We have to lock the EOF page here, because lock_page() nests
2457 * outside journal_start(). 2457 * outside journal_start().
2458 */ 2458 */
2459 if ((inode->i_size & (blocksize - 1)) == 0) { 2459 if ((inode->i_size & (blocksize - 1)) == 0) {
2460 /* Block boundary? Nothing to do */ 2460 /* Block boundary? Nothing to do */
2461 page = NULL; 2461 page = NULL;
2462 } else { 2462 } else {
2463 page = grab_cache_page(mapping, 2463 page = grab_cache_page(mapping,
2464 inode->i_size >> PAGE_CACHE_SHIFT); 2464 inode->i_size >> PAGE_CACHE_SHIFT);
2465 if (!page) 2465 if (!page)
2466 goto out_notrans; 2466 goto out_notrans;
2467 } 2467 }
2468 2468
2469 handle = start_transaction(inode); 2469 handle = start_transaction(inode);
2470 if (IS_ERR(handle)) { 2470 if (IS_ERR(handle)) {
2471 if (page) { 2471 if (page) {
2472 clear_highpage(page); 2472 clear_highpage(page);
2473 flush_dcache_page(page); 2473 flush_dcache_page(page);
2474 unlock_page(page); 2474 unlock_page(page);
2475 page_cache_release(page); 2475 page_cache_release(page);
2476 } 2476 }
2477 goto out_notrans; 2477 goto out_notrans;
2478 } 2478 }
2479 2479
2480 last_block = (inode->i_size + blocksize-1) 2480 last_block = (inode->i_size + blocksize-1)
2481 >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); 2481 >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
2482 2482
2483 if (page) 2483 if (page)
2484 ext3_block_truncate_page(handle, page, mapping, inode->i_size); 2484 ext3_block_truncate_page(handle, page, mapping, inode->i_size);
2485 2485
2486 n = ext3_block_to_path(inode, last_block, offsets, NULL); 2486 n = ext3_block_to_path(inode, last_block, offsets, NULL);
2487 if (n == 0) 2487 if (n == 0)
2488 goto out_stop; /* error */ 2488 goto out_stop; /* error */
2489 2489
2490 /* 2490 /*
2491 * OK. This truncate is going to happen. We add the inode to the 2491 * OK. This truncate is going to happen. We add the inode to the
2492 * orphan list, so that if this truncate spans multiple transactions, 2492 * orphan list, so that if this truncate spans multiple transactions,
2493 * and we crash, we will resume the truncate when the filesystem 2493 * and we crash, we will resume the truncate when the filesystem
2494 * recovers. It also marks the inode dirty, to catch the new size. 2494 * recovers. It also marks the inode dirty, to catch the new size.
2495 * 2495 *
2496 * Implication: the file must always be in a sane, consistent 2496 * Implication: the file must always be in a sane, consistent
2497 * truncatable state while each transaction commits. 2497 * truncatable state while each transaction commits.
2498 */ 2498 */
2499 if (ext3_orphan_add(handle, inode)) 2499 if (ext3_orphan_add(handle, inode))
2500 goto out_stop; 2500 goto out_stop;
2501 2501
2502 /* 2502 /*
2503 * The orphan list entry will now protect us from any crash which 2503 * The orphan list entry will now protect us from any crash which
2504 * occurs before the truncate completes, so it is now safe to propagate 2504 * occurs before the truncate completes, so it is now safe to propagate
2505 * the new, shorter inode size (held for now in i_size) into the 2505 * the new, shorter inode size (held for now in i_size) into the
2506 * on-disk inode. We do this via i_disksize, which is the value which 2506 * on-disk inode. We do this via i_disksize, which is the value which
2507 * ext3 *really* writes onto the disk inode. 2507 * ext3 *really* writes onto the disk inode.
2508 */ 2508 */
2509 ei->i_disksize = inode->i_size; 2509 ei->i_disksize = inode->i_size;
2510 2510
2511 /* 2511 /*
2512 * From here we block out all ext3_get_block() callers who want to 2512 * From here we block out all ext3_get_block() callers who want to
2513 * modify the block allocation tree. 2513 * modify the block allocation tree.
2514 */ 2514 */
2515 mutex_lock(&ei->truncate_mutex); 2515 mutex_lock(&ei->truncate_mutex);
2516 2516
2517 if (n == 1) { /* direct blocks */ 2517 if (n == 1) { /* direct blocks */
2518 ext3_free_data(handle, inode, NULL, i_data+offsets[0], 2518 ext3_free_data(handle, inode, NULL, i_data+offsets[0],
2519 i_data + EXT3_NDIR_BLOCKS); 2519 i_data + EXT3_NDIR_BLOCKS);
2520 goto do_indirects; 2520 goto do_indirects;
2521 } 2521 }
2522 2522
2523 partial = ext3_find_shared(inode, n, offsets, chain, &nr); 2523 partial = ext3_find_shared(inode, n, offsets, chain, &nr);
2524 /* Kill the top of shared branch (not detached) */ 2524 /* Kill the top of shared branch (not detached) */
2525 if (nr) { 2525 if (nr) {
2526 if (partial == chain) { 2526 if (partial == chain) {
2527 /* Shared branch grows from the inode */ 2527 /* Shared branch grows from the inode */
2528 ext3_free_branches(handle, inode, NULL, 2528 ext3_free_branches(handle, inode, NULL,
2529 &nr, &nr+1, (chain+n-1) - partial); 2529 &nr, &nr+1, (chain+n-1) - partial);
2530 *partial->p = 0; 2530 *partial->p = 0;
2531 /* 2531 /*
2532 * We mark the inode dirty prior to restart, 2532 * We mark the inode dirty prior to restart,
2533 * and prior to stop. No need for it here. 2533 * and prior to stop. No need for it here.
2534 */ 2534 */
2535 } else { 2535 } else {
2536 /* Shared branch grows from an indirect block */ 2536 /* Shared branch grows from an indirect block */
2537 ext3_free_branches(handle, inode, partial->bh, 2537 ext3_free_branches(handle, inode, partial->bh,
2538 partial->p, 2538 partial->p,
2539 partial->p+1, (chain+n-1) - partial); 2539 partial->p+1, (chain+n-1) - partial);
2540 } 2540 }
2541 } 2541 }
2542 /* Clear the ends of indirect blocks on the shared branch */ 2542 /* Clear the ends of indirect blocks on the shared branch */
2543 while (partial > chain) { 2543 while (partial > chain) {
2544 ext3_free_branches(handle, inode, partial->bh, partial->p + 1, 2544 ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
2545 (__le32*)partial->bh->b_data+addr_per_block, 2545 (__le32*)partial->bh->b_data+addr_per_block,
2546 (chain+n-1) - partial); 2546 (chain+n-1) - partial);
2547 BUFFER_TRACE(partial->bh, "call brelse"); 2547 BUFFER_TRACE(partial->bh, "call brelse");
2548 brelse (partial->bh); 2548 brelse (partial->bh);
2549 partial--; 2549 partial--;
2550 } 2550 }
2551 do_indirects: 2551 do_indirects:
2552 /* Kill the remaining (whole) subtrees */ 2552 /* Kill the remaining (whole) subtrees */
2553 switch (offsets[0]) { 2553 switch (offsets[0]) {
2554 default: 2554 default:
2555 nr = i_data[EXT3_IND_BLOCK]; 2555 nr = i_data[EXT3_IND_BLOCK];
2556 if (nr) { 2556 if (nr) {
2557 ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 1); 2557 ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
2558 i_data[EXT3_IND_BLOCK] = 0; 2558 i_data[EXT3_IND_BLOCK] = 0;
2559 } 2559 }
2560 case EXT3_IND_BLOCK: 2560 case EXT3_IND_BLOCK:
2561 nr = i_data[EXT3_DIND_BLOCK]; 2561 nr = i_data[EXT3_DIND_BLOCK];
2562 if (nr) { 2562 if (nr) {
2563 ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 2); 2563 ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
2564 i_data[EXT3_DIND_BLOCK] = 0; 2564 i_data[EXT3_DIND_BLOCK] = 0;
2565 } 2565 }
2566 case EXT3_DIND_BLOCK: 2566 case EXT3_DIND_BLOCK:
2567 nr = i_data[EXT3_TIND_BLOCK]; 2567 nr = i_data[EXT3_TIND_BLOCK];
2568 if (nr) { 2568 if (nr) {
2569 ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 3); 2569 ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
2570 i_data[EXT3_TIND_BLOCK] = 0; 2570 i_data[EXT3_TIND_BLOCK] = 0;
2571 } 2571 }
2572 case EXT3_TIND_BLOCK: 2572 case EXT3_TIND_BLOCK:
2573 ; 2573 ;
2574 } 2574 }
2575 2575
2576 ext3_discard_reservation(inode); 2576 ext3_discard_reservation(inode);
2577 2577
2578 mutex_unlock(&ei->truncate_mutex); 2578 mutex_unlock(&ei->truncate_mutex);
2579 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 2579 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
2580 ext3_mark_inode_dirty(handle, inode); 2580 ext3_mark_inode_dirty(handle, inode);
2581 2581
2582 /* 2582 /*
2583 * In a multi-transaction truncate, we only make the final transaction 2583 * In a multi-transaction truncate, we only make the final transaction
2584 * synchronous 2584 * synchronous
2585 */ 2585 */
2586 if (IS_SYNC(inode)) 2586 if (IS_SYNC(inode))
2587 handle->h_sync = 1; 2587 handle->h_sync = 1;
2588 out_stop: 2588 out_stop:
2589 /* 2589 /*
2590 * If this was a simple ftruncate(), and the file will remain alive 2590 * If this was a simple ftruncate(), and the file will remain alive
2591 * then we need to clear up the orphan record which we created above. 2591 * then we need to clear up the orphan record which we created above.
2592 * However, if this was a real unlink then we were called by 2592 * However, if this was a real unlink then we were called by
2593 * ext3_evict_inode(), and we allow that function to clean up the 2593 * ext3_evict_inode(), and we allow that function to clean up the
2594 * orphan info for us. 2594 * orphan info for us.
2595 */ 2595 */
2596 if (inode->i_nlink) 2596 if (inode->i_nlink)
2597 ext3_orphan_del(handle, inode); 2597 ext3_orphan_del(handle, inode);
2598 2598
2599 ext3_journal_stop(handle); 2599 ext3_journal_stop(handle);
2600 return; 2600 return;
2601 out_notrans: 2601 out_notrans:
2602 /* 2602 /*
2603 * Delete the inode from orphan list so that it doesn't stay there 2603 * Delete the inode from orphan list so that it doesn't stay there
2604 * forever and trigger assertion on umount. 2604 * forever and trigger assertion on umount.
2605 */ 2605 */
2606 if (inode->i_nlink) 2606 if (inode->i_nlink)
2607 ext3_orphan_del(NULL, inode); 2607 ext3_orphan_del(NULL, inode);
2608 } 2608 }
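The do_indirects switch above relies on deliberate case fallthrough: entering at the case that matches the truncate depth frees that indirect tree and then every deeper one. A minimal user-space sketch of the same control pattern (the function name and messages are illustrative, not kernel API):

#include <stdio.h>

/* Mirror of the do_indirects switch: entering at a given depth frees
 * that subtree and falls through to every deeper one. */
static void free_from_depth(int depth)
{
	switch (depth) {
	case 0:
		printf("free single-indirect tree\n");
		/* fall through */
	case 1:
		printf("free double-indirect tree\n");
		/* fall through */
	case 2:
		printf("free triple-indirect tree\n");
	}
}

int main(void)
{
	free_from_depth(1);	/* frees the double- and triple-indirect trees */
	return 0;
}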
2609 2609
2610 static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, 2610 static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
2611 unsigned long ino, struct ext3_iloc *iloc) 2611 unsigned long ino, struct ext3_iloc *iloc)
2612 { 2612 {
2613 unsigned long block_group; 2613 unsigned long block_group;
2614 unsigned long offset; 2614 unsigned long offset;
2615 ext3_fsblk_t block; 2615 ext3_fsblk_t block;
2616 struct ext3_group_desc *gdp; 2616 struct ext3_group_desc *gdp;
2617 2617
2618 if (!ext3_valid_inum(sb, ino)) { 2618 if (!ext3_valid_inum(sb, ino)) {
2619 /* 2619 /*
2620 * This error is already checked for in namei.c unless we are 2620 * This error is already checked for in namei.c unless we are
2621 * looking at an NFS filehandle, in which case no error 2621 * looking at an NFS filehandle, in which case no error
2622 * report is needed 2622 * report is needed
2623 */ 2623 */
2624 return 0; 2624 return 0;
2625 } 2625 }
2626 2626
2627 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); 2627 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
2628 gdp = ext3_get_group_desc(sb, block_group, NULL); 2628 gdp = ext3_get_group_desc(sb, block_group, NULL);
2629 if (!gdp) 2629 if (!gdp)
2630 return 0; 2630 return 0;
2631 /* 2631 /*
2632 * Figure out the offset within the block group inode table 2632 * Figure out the offset within the block group inode table
2633 */ 2633 */
2634 offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) * 2634 offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
2635 EXT3_INODE_SIZE(sb); 2635 EXT3_INODE_SIZE(sb);
2636 block = le32_to_cpu(gdp->bg_inode_table) + 2636 block = le32_to_cpu(gdp->bg_inode_table) +
2637 (offset >> EXT3_BLOCK_SIZE_BITS(sb)); 2637 (offset >> EXT3_BLOCK_SIZE_BITS(sb));
2638 2638
2639 iloc->block_group = block_group; 2639 iloc->block_group = block_group;
2640 iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1); 2640 iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
2641 return block; 2641 return block;
2642 } 2642 }
2643 2643
2644 /* 2644 /*
2645 * ext3_get_inode_loc returns with an extra refcount against the inode's 2645 * ext3_get_inode_loc returns with an extra refcount against the inode's
2646 * underlying buffer_head on success. If 'in_mem' is true, we have all 2646 * underlying buffer_head on success. If 'in_mem' is true, we have all
2647 * data in memory that is needed to recreate the on-disk version of this 2647 * data in memory that is needed to recreate the on-disk version of this
2648 * inode. 2648 * inode.
2649 */ 2649 */
2650 static int __ext3_get_inode_loc(struct inode *inode, 2650 static int __ext3_get_inode_loc(struct inode *inode,
2651 struct ext3_iloc *iloc, int in_mem) 2651 struct ext3_iloc *iloc, int in_mem)
2652 { 2652 {
2653 ext3_fsblk_t block; 2653 ext3_fsblk_t block;
2654 struct buffer_head *bh; 2654 struct buffer_head *bh;
2655 2655
2656 block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc); 2656 block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
2657 if (!block) 2657 if (!block)
2658 return -EIO; 2658 return -EIO;
2659 2659
2660 bh = sb_getblk(inode->i_sb, block); 2660 bh = sb_getblk(inode->i_sb, block);
2661 if (!bh) { 2661 if (!bh) {
2662 ext3_error (inode->i_sb, "ext3_get_inode_loc", 2662 ext3_error (inode->i_sb, "ext3_get_inode_loc",
2663 "unable to read inode block - " 2663 "unable to read inode block - "
2664 "inode=%lu, block="E3FSBLK, 2664 "inode=%lu, block="E3FSBLK,
2665 inode->i_ino, block); 2665 inode->i_ino, block);
2666 return -EIO; 2666 return -EIO;
2667 } 2667 }
2668 if (!buffer_uptodate(bh)) { 2668 if (!buffer_uptodate(bh)) {
2669 lock_buffer(bh); 2669 lock_buffer(bh);
2670 2670
2671 /* 2671 /*
2672 * If the buffer has the write error flag, we have failed 2672 * If the buffer has the write error flag, we have failed
2673 * to write out another inode in the same block. In this 2673 * to write out another inode in the same block. In this
2674 * case, we don't have to read the block because we may 2674 * case, we don't have to read the block because we may
2675 * read the old inode data successfully. 2675 * read the old inode data successfully.
2676 */ 2676 */
2677 if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) 2677 if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
2678 set_buffer_uptodate(bh); 2678 set_buffer_uptodate(bh);
2679 2679
2680 if (buffer_uptodate(bh)) { 2680 if (buffer_uptodate(bh)) {
2681 /* someone brought it uptodate while we waited */ 2681 /* someone brought it uptodate while we waited */
2682 unlock_buffer(bh); 2682 unlock_buffer(bh);
2683 goto has_buffer; 2683 goto has_buffer;
2684 } 2684 }
2685 2685
2686 /* 2686 /*
2687 * If we have all information of the inode in memory and this 2687 * If we have all information of the inode in memory and this
2688 * is the only valid inode in the block, we need not read the 2688 * is the only valid inode in the block, we need not read the
2689 * block. 2689 * block.
2690 */ 2690 */
2691 if (in_mem) { 2691 if (in_mem) {
2692 struct buffer_head *bitmap_bh; 2692 struct buffer_head *bitmap_bh;
2693 struct ext3_group_desc *desc; 2693 struct ext3_group_desc *desc;
2694 int inodes_per_buffer; 2694 int inodes_per_buffer;
2695 int inode_offset, i; 2695 int inode_offset, i;
2696 int block_group; 2696 int block_group;
2697 int start; 2697 int start;
2698 2698
2699 block_group = (inode->i_ino - 1) / 2699 block_group = (inode->i_ino - 1) /
2700 EXT3_INODES_PER_GROUP(inode->i_sb); 2700 EXT3_INODES_PER_GROUP(inode->i_sb);
2701 inodes_per_buffer = bh->b_size / 2701 inodes_per_buffer = bh->b_size /
2702 EXT3_INODE_SIZE(inode->i_sb); 2702 EXT3_INODE_SIZE(inode->i_sb);
2703 inode_offset = ((inode->i_ino - 1) % 2703 inode_offset = ((inode->i_ino - 1) %
2704 EXT3_INODES_PER_GROUP(inode->i_sb)); 2704 EXT3_INODES_PER_GROUP(inode->i_sb));
2705 start = inode_offset & ~(inodes_per_buffer - 1); 2705 start = inode_offset & ~(inodes_per_buffer - 1);
2706 2706
2707 /* Is the inode bitmap in cache? */ 2707 /* Is the inode bitmap in cache? */
2708 desc = ext3_get_group_desc(inode->i_sb, 2708 desc = ext3_get_group_desc(inode->i_sb,
2709 block_group, NULL); 2709 block_group, NULL);
2710 if (!desc) 2710 if (!desc)
2711 goto make_io; 2711 goto make_io;
2712 2712
2713 bitmap_bh = sb_getblk(inode->i_sb, 2713 bitmap_bh = sb_getblk(inode->i_sb,
2714 le32_to_cpu(desc->bg_inode_bitmap)); 2714 le32_to_cpu(desc->bg_inode_bitmap));
2715 if (!bitmap_bh) 2715 if (!bitmap_bh)
2716 goto make_io; 2716 goto make_io;
2717 2717
2718 /* 2718 /*
2719 * If the inode bitmap isn't in cache then the 2719 * If the inode bitmap isn't in cache then the
2720 * optimisation may end up performing two reads instead 2720 * optimisation may end up performing two reads instead
2721 * of one, so skip it. 2721 * of one, so skip it.
2722 */ 2722 */
2723 if (!buffer_uptodate(bitmap_bh)) { 2723 if (!buffer_uptodate(bitmap_bh)) {
2724 brelse(bitmap_bh); 2724 brelse(bitmap_bh);
2725 goto make_io; 2725 goto make_io;
2726 } 2726 }
2727 for (i = start; i < start + inodes_per_buffer; i++) { 2727 for (i = start; i < start + inodes_per_buffer; i++) {
2728 if (i == inode_offset) 2728 if (i == inode_offset)
2729 continue; 2729 continue;
2730 if (ext3_test_bit(i, bitmap_bh->b_data)) 2730 if (ext3_test_bit(i, bitmap_bh->b_data))
2731 break; 2731 break;
2732 } 2732 }
2733 brelse(bitmap_bh); 2733 brelse(bitmap_bh);
2734 if (i == start + inodes_per_buffer) { 2734 if (i == start + inodes_per_buffer) {
2735 /* all other inodes are free, so skip I/O */ 2735 /* all other inodes are free, so skip I/O */
2736 memset(bh->b_data, 0, bh->b_size); 2736 memset(bh->b_data, 0, bh->b_size);
2737 set_buffer_uptodate(bh); 2737 set_buffer_uptodate(bh);
2738 unlock_buffer(bh); 2738 unlock_buffer(bh);
2739 goto has_buffer; 2739 goto has_buffer;
2740 } 2740 }
2741 } 2741 }
2742 2742
2743 make_io: 2743 make_io:
2744 /* 2744 /*
2745 * There are other valid inodes in the buffer, this inode 2745 * There are other valid inodes in the buffer, this inode
2746 * has in-inode xattrs, or we don't have this inode in memory. 2746 * has in-inode xattrs, or we don't have this inode in memory.
2747 * Read the block from disk. 2747 * Read the block from disk.
2748 */ 2748 */
2749 get_bh(bh); 2749 get_bh(bh);
2750 bh->b_end_io = end_buffer_read_sync; 2750 bh->b_end_io = end_buffer_read_sync;
2751 submit_bh(READ_META, bh); 2751 submit_bh(READ_META, bh);
2752 wait_on_buffer(bh); 2752 wait_on_buffer(bh);
2753 if (!buffer_uptodate(bh)) { 2753 if (!buffer_uptodate(bh)) {
2754 ext3_error(inode->i_sb, "ext3_get_inode_loc", 2754 ext3_error(inode->i_sb, "ext3_get_inode_loc",
2755 "unable to read inode block - " 2755 "unable to read inode block - "
2756 "inode=%lu, block="E3FSBLK, 2756 "inode=%lu, block="E3FSBLK,
2757 inode->i_ino, block); 2757 inode->i_ino, block);
2758 brelse(bh); 2758 brelse(bh);
2759 return -EIO; 2759 return -EIO;
2760 } 2760 }
2761 } 2761 }
2762 has_buffer: 2762 has_buffer:
2763 iloc->bh = bh; 2763 iloc->bh = bh;
2764 return 0; 2764 return 0;
2765 } 2765 }
2766 2766
2767 int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc) 2767 int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
2768 { 2768 {
2769 /* We have all inode data except xattrs in memory here. */ 2769 /* We have all inode data except xattrs in memory here. */
2770 return __ext3_get_inode_loc(inode, iloc, 2770 return __ext3_get_inode_loc(inode, iloc,
2771 !ext3_test_inode_state(inode, EXT3_STATE_XATTR)); 2771 !ext3_test_inode_state(inode, EXT3_STATE_XATTR));
2772 } 2772 }
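Because ext3_get_inode_loc() succeeds with an extra reference held on iloc.bh, every caller is obliged to drop it. A minimal sketch of the expected calling convention (caller context and any error handling beyond the lookup are elided):

	struct ext3_iloc iloc;
	int err;

	err = ext3_get_inode_loc(inode, &iloc);
	if (err)
		return err;
	/* ... inspect or modify the raw inode via ext3_raw_inode(&iloc) ... */
	brelse(iloc.bh);	/* drop the reference the lookup took for us */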
2773 2773
2774 void ext3_set_inode_flags(struct inode *inode) 2774 void ext3_set_inode_flags(struct inode *inode)
2775 { 2775 {
2776 unsigned int flags = EXT3_I(inode)->i_flags; 2776 unsigned int flags = EXT3_I(inode)->i_flags;
2777 2777
2778 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 2778 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
2779 if (flags & EXT3_SYNC_FL) 2779 if (flags & EXT3_SYNC_FL)
2780 inode->i_flags |= S_SYNC; 2780 inode->i_flags |= S_SYNC;
2781 if (flags & EXT3_APPEND_FL) 2781 if (flags & EXT3_APPEND_FL)
2782 inode->i_flags |= S_APPEND; 2782 inode->i_flags |= S_APPEND;
2783 if (flags & EXT3_IMMUTABLE_FL) 2783 if (flags & EXT3_IMMUTABLE_FL)
2784 inode->i_flags |= S_IMMUTABLE; 2784 inode->i_flags |= S_IMMUTABLE;
2785 if (flags & EXT3_NOATIME_FL) 2785 if (flags & EXT3_NOATIME_FL)
2786 inode->i_flags |= S_NOATIME; 2786 inode->i_flags |= S_NOATIME;
2787 if (flags & EXT3_DIRSYNC_FL) 2787 if (flags & EXT3_DIRSYNC_FL)
2788 inode->i_flags |= S_DIRSYNC; 2788 inode->i_flags |= S_DIRSYNC;
2789 } 2789 }
2790 2790
2791 /* Propagate flags from i_flags to EXT3_I(inode)->i_flags */ 2791 /* Propagate flags from i_flags to EXT3_I(inode)->i_flags */
2792 void ext3_get_inode_flags(struct ext3_inode_info *ei) 2792 void ext3_get_inode_flags(struct ext3_inode_info *ei)
2793 { 2793 {
2794 unsigned int flags = ei->vfs_inode.i_flags; 2794 unsigned int flags = ei->vfs_inode.i_flags;
2795 2795
2796 ei->i_flags &= ~(EXT3_SYNC_FL|EXT3_APPEND_FL| 2796 ei->i_flags &= ~(EXT3_SYNC_FL|EXT3_APPEND_FL|
2797 EXT3_IMMUTABLE_FL|EXT3_NOATIME_FL|EXT3_DIRSYNC_FL); 2797 EXT3_IMMUTABLE_FL|EXT3_NOATIME_FL|EXT3_DIRSYNC_FL);
2798 if (flags & S_SYNC) 2798 if (flags & S_SYNC)
2799 ei->i_flags |= EXT3_SYNC_FL; 2799 ei->i_flags |= EXT3_SYNC_FL;
2800 if (flags & S_APPEND) 2800 if (flags & S_APPEND)
2801 ei->i_flags |= EXT3_APPEND_FL; 2801 ei->i_flags |= EXT3_APPEND_FL;
2802 if (flags & S_IMMUTABLE) 2802 if (flags & S_IMMUTABLE)
2803 ei->i_flags |= EXT3_IMMUTABLE_FL; 2803 ei->i_flags |= EXT3_IMMUTABLE_FL;
2804 if (flags & S_NOATIME) 2804 if (flags & S_NOATIME)
2805 ei->i_flags |= EXT3_NOATIME_FL; 2805 ei->i_flags |= EXT3_NOATIME_FL;
2806 if (flags & S_DIRSYNC) 2806 if (flags & S_DIRSYNC)
2807 ei->i_flags |= EXT3_DIRSYNC_FL; 2807 ei->i_flags |= EXT3_DIRSYNC_FL;
2808 } 2808 }
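The two translators above are hand-unrolled mirrors of one flag mapping. As a design note, the same idea can be written table-driven; a self-contained user-space sketch with illustrative flag values (not the real EXT3_*_FL/S_* constants):

#include <stdio.h>

struct flag_map { unsigned int fs_flag, vfs_flag; };

/* Stand-ins for pairs such as EXT3_SYNC_FL <-> S_SYNC above. */
static const struct flag_map map[] = {
	{ 0x01, 0x10 },
	{ 0x02, 0x20 },
	{ 0x04, 0x40 },
};

static unsigned int fs_to_vfs(unsigned int fs_flags)
{
	unsigned int vfs_flags = 0;
	size_t i;

	for (i = 0; i < sizeof(map) / sizeof(map[0]); i++)
		if (fs_flags & map[i].fs_flag)
			vfs_flags |= map[i].vfs_flag;
	return vfs_flags;
}

int main(void)
{
	printf("%#x\n", fs_to_vfs(0x05));	/* prints 0x50 */
	return 0;
}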
2809 2809
2810 struct inode *ext3_iget(struct super_block *sb, unsigned long ino) 2810 struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2811 { 2811 {
2812 struct ext3_iloc iloc; 2812 struct ext3_iloc iloc;
2813 struct ext3_inode *raw_inode; 2813 struct ext3_inode *raw_inode;
2814 struct ext3_inode_info *ei; 2814 struct ext3_inode_info *ei;
2815 struct buffer_head *bh; 2815 struct buffer_head *bh;
2816 struct inode *inode; 2816 struct inode *inode;
2817 journal_t *journal = EXT3_SB(sb)->s_journal; 2817 journal_t *journal = EXT3_SB(sb)->s_journal;
2818 transaction_t *transaction; 2818 transaction_t *transaction;
2819 long ret; 2819 long ret;
2820 int block; 2820 int block;
2821 2821
2822 inode = iget_locked(sb, ino); 2822 inode = iget_locked(sb, ino);
2823 if (!inode) 2823 if (!inode)
2824 return ERR_PTR(-ENOMEM); 2824 return ERR_PTR(-ENOMEM);
2825 if (!(inode->i_state & I_NEW)) 2825 if (!(inode->i_state & I_NEW))
2826 return inode; 2826 return inode;
2827 2827
2828 ei = EXT3_I(inode); 2828 ei = EXT3_I(inode);
2829 ei->i_block_alloc_info = NULL; 2829 ei->i_block_alloc_info = NULL;
2830 2830
2831 ret = __ext3_get_inode_loc(inode, &iloc, 0); 2831 ret = __ext3_get_inode_loc(inode, &iloc, 0);
2832 if (ret < 0) 2832 if (ret < 0)
2833 goto bad_inode; 2833 goto bad_inode;
2834 bh = iloc.bh; 2834 bh = iloc.bh;
2835 raw_inode = ext3_raw_inode(&iloc); 2835 raw_inode = ext3_raw_inode(&iloc);
2836 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 2836 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2837 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 2837 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2838 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 2838 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2839 if(!(test_opt (inode->i_sb, NO_UID32))) { 2839 if(!(test_opt (inode->i_sb, NO_UID32))) {
2840 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 2840 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2841 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 2841 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2842 } 2842 }
2843 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 2843 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2844 inode->i_size = le32_to_cpu(raw_inode->i_size); 2844 inode->i_size = le32_to_cpu(raw_inode->i_size);
2845 inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); 2845 inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
2846 inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); 2846 inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
2847 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); 2847 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
2848 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; 2848 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2849 2849
2850 ei->i_state_flags = 0; 2850 ei->i_state_flags = 0;
2851 ei->i_dir_start_lookup = 0; 2851 ei->i_dir_start_lookup = 0;
2852 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 2852 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2853 /* We now have enough fields to check if the inode was active or not. 2853 /* We now have enough fields to check if the inode was active or not.
2854 * This is needed because nfsd might try to access dead inodes 2854 * This is needed because nfsd might try to access dead inodes
2855 * the test is the same one that e2fsck uses 2855 * the test is the same one that e2fsck uses
2856 * NeilBrown 1999oct15 2856 * NeilBrown 1999oct15
2857 */ 2857 */
2858 if (inode->i_nlink == 0) { 2858 if (inode->i_nlink == 0) {
2859 if (inode->i_mode == 0 || 2859 if (inode->i_mode == 0 ||
2860 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) { 2860 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
2861 /* this inode is deleted */ 2861 /* this inode is deleted */
2862 brelse (bh); 2862 brelse (bh);
2863 ret = -ESTALE; 2863 ret = -ESTALE;
2864 goto bad_inode; 2864 goto bad_inode;
2865 } 2865 }
2866 /* The only unlinked inodes we let through here have 2866 /* The only unlinked inodes we let through here have
2867 * valid i_mode and are being read by the orphan 2867 * valid i_mode and are being read by the orphan
2868 * recovery code: that's fine, we're about to complete 2868 * recovery code: that's fine, we're about to complete
2869 * the process of deleting those. */ 2869 * the process of deleting those. */
2870 } 2870 }
2871 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); 2871 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2872 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 2872 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2873 #ifdef EXT3_FRAGMENTS 2873 #ifdef EXT3_FRAGMENTS
2874 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); 2874 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
2875 ei->i_frag_no = raw_inode->i_frag; 2875 ei->i_frag_no = raw_inode->i_frag;
2876 ei->i_frag_size = raw_inode->i_fsize; 2876 ei->i_frag_size = raw_inode->i_fsize;
2877 #endif 2877 #endif
2878 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); 2878 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
2879 if (!S_ISREG(inode->i_mode)) { 2879 if (!S_ISREG(inode->i_mode)) {
2880 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); 2880 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2881 } else { 2881 } else {
2882 inode->i_size |= 2882 inode->i_size |=
2883 ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; 2883 ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2884 } 2884 }
2885 ei->i_disksize = inode->i_size; 2885 ei->i_disksize = inode->i_size;
2886 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 2886 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2887 ei->i_block_group = iloc.block_group; 2887 ei->i_block_group = iloc.block_group;
2888 /* 2888 /*
2889 * NOTE! The in-memory inode i_data array is in little-endian order 2889 * NOTE! The in-memory inode i_data array is in little-endian order
2890 * even on big-endian machines: we do NOT byteswap the block numbers! 2890 * even on big-endian machines: we do NOT byteswap the block numbers!
2891 */ 2891 */
2892 for (block = 0; block < EXT3_N_BLOCKS; block++) 2892 for (block = 0; block < EXT3_N_BLOCKS; block++)
2893 ei->i_data[block] = raw_inode->i_block[block]; 2893 ei->i_data[block] = raw_inode->i_block[block];
2894 INIT_LIST_HEAD(&ei->i_orphan); 2894 INIT_LIST_HEAD(&ei->i_orphan);
2895 2895
2896 /* 2896 /*
2897 * Set transaction id's of transactions that have to be committed 2897 * Set transaction id's of transactions that have to be committed
2898 * to finish f[data]sync. We set them to the currently running transaction 2898 * to finish f[data]sync. We set them to the currently running transaction
2899 * as we cannot be sure that the inode or some of its metadata isn't 2899 * as we cannot be sure that the inode or some of its metadata isn't
2900 * part of the transaction - the inode could have been reclaimed and 2900 * part of the transaction - the inode could have been reclaimed and
2901 * now it is reread from disk. 2901 * now it is reread from disk.
2902 */ 2902 */
2903 if (journal) { 2903 if (journal) {
2904 tid_t tid; 2904 tid_t tid;
2905 2905
2906 spin_lock(&journal->j_state_lock); 2906 spin_lock(&journal->j_state_lock);
2907 if (journal->j_running_transaction) 2907 if (journal->j_running_transaction)
2908 transaction = journal->j_running_transaction; 2908 transaction = journal->j_running_transaction;
2909 else 2909 else
2910 transaction = journal->j_committing_transaction; 2910 transaction = journal->j_committing_transaction;
2911 if (transaction) 2911 if (transaction)
2912 tid = transaction->t_tid; 2912 tid = transaction->t_tid;
2913 else 2913 else
2914 tid = journal->j_commit_sequence; 2914 tid = journal->j_commit_sequence;
2915 spin_unlock(&journal->j_state_lock); 2915 spin_unlock(&journal->j_state_lock);
2916 atomic_set(&ei->i_sync_tid, tid); 2916 atomic_set(&ei->i_sync_tid, tid);
2917 atomic_set(&ei->i_datasync_tid, tid); 2917 atomic_set(&ei->i_datasync_tid, tid);
2918 } 2918 }
2919 2919
2920 if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 && 2920 if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
2921 EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { 2921 EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
2922 /* 2922 /*
2923 * When mke2fs creates big inodes it does not zero out 2923 * When mke2fs creates big inodes it does not zero out
2924 * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE, 2924 * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE,
2925 * so ignore those first few inodes. 2925 * so ignore those first few inodes.
2926 */ 2926 */
2927 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 2927 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
2928 if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 2928 if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
2929 EXT3_INODE_SIZE(inode->i_sb)) { 2929 EXT3_INODE_SIZE(inode->i_sb)) {
2930 brelse (bh); 2930 brelse (bh);
2931 ret = -EIO; 2931 ret = -EIO;
2932 goto bad_inode; 2932 goto bad_inode;
2933 } 2933 }
2934 if (ei->i_extra_isize == 0) { 2934 if (ei->i_extra_isize == 0) {
2935 /* The extra space is currently unused. Use it. */ 2935 /* The extra space is currently unused. Use it. */
2936 ei->i_extra_isize = sizeof(struct ext3_inode) - 2936 ei->i_extra_isize = sizeof(struct ext3_inode) -
2937 EXT3_GOOD_OLD_INODE_SIZE; 2937 EXT3_GOOD_OLD_INODE_SIZE;
2938 } else { 2938 } else {
2939 __le32 *magic = (void *)raw_inode + 2939 __le32 *magic = (void *)raw_inode +
2940 EXT3_GOOD_OLD_INODE_SIZE + 2940 EXT3_GOOD_OLD_INODE_SIZE +
2941 ei->i_extra_isize; 2941 ei->i_extra_isize;
2942 if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC)) 2942 if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
2943 ext3_set_inode_state(inode, EXT3_STATE_XATTR); 2943 ext3_set_inode_state(inode, EXT3_STATE_XATTR);
2944 } 2944 }
2945 } else 2945 } else
2946 ei->i_extra_isize = 0; 2946 ei->i_extra_isize = 0;
2947 2947
2948 if (S_ISREG(inode->i_mode)) { 2948 if (S_ISREG(inode->i_mode)) {
2949 inode->i_op = &ext3_file_inode_operations; 2949 inode->i_op = &ext3_file_inode_operations;
2950 inode->i_fop = &ext3_file_operations; 2950 inode->i_fop = &ext3_file_operations;
2951 ext3_set_aops(inode); 2951 ext3_set_aops(inode);
2952 } else if (S_ISDIR(inode->i_mode)) { 2952 } else if (S_ISDIR(inode->i_mode)) {
2953 inode->i_op = &ext3_dir_inode_operations; 2953 inode->i_op = &ext3_dir_inode_operations;
2954 inode->i_fop = &ext3_dir_operations; 2954 inode->i_fop = &ext3_dir_operations;
2955 } else if (S_ISLNK(inode->i_mode)) { 2955 } else if (S_ISLNK(inode->i_mode)) {
2956 if (ext3_inode_is_fast_symlink(inode)) { 2956 if (ext3_inode_is_fast_symlink(inode)) {
2957 inode->i_op = &ext3_fast_symlink_inode_operations; 2957 inode->i_op = &ext3_fast_symlink_inode_operations;
2958 nd_terminate_link(ei->i_data, inode->i_size, 2958 nd_terminate_link(ei->i_data, inode->i_size,
2959 sizeof(ei->i_data) - 1); 2959 sizeof(ei->i_data) - 1);
2960 } else { 2960 } else {
2961 inode->i_op = &ext3_symlink_inode_operations; 2961 inode->i_op = &ext3_symlink_inode_operations;
2962 ext3_set_aops(inode); 2962 ext3_set_aops(inode);
2963 } 2963 }
2964 } else { 2964 } else {
2965 inode->i_op = &ext3_special_inode_operations; 2965 inode->i_op = &ext3_special_inode_operations;
2966 if (raw_inode->i_block[0]) 2966 if (raw_inode->i_block[0])
2967 init_special_inode(inode, inode->i_mode, 2967 init_special_inode(inode, inode->i_mode,
2968 old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); 2968 old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
2969 else 2969 else
2970 init_special_inode(inode, inode->i_mode, 2970 init_special_inode(inode, inode->i_mode,
2971 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 2971 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
2972 } 2972 }
2973 brelse (iloc.bh); 2973 brelse (iloc.bh);
2974 ext3_set_inode_flags(inode); 2974 ext3_set_inode_flags(inode);
2975 unlock_new_inode(inode); 2975 unlock_new_inode(inode);
2976 return inode; 2976 return inode;
2977 2977
2978 bad_inode: 2978 bad_inode:
2979 iget_failed(inode); 2979 iget_failed(inode);
2980 return ERR_PTR(ret); 2980 return ERR_PTR(ret);
2981 } 2981 }
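ext3_iget() above follows the standard VFS contract for loading an inode: iget_locked() either returns a cached inode or a locked I_NEW one that the filesystem must fill and then publish. Condensed, the shape every such loader takes is (a sketch mirroring the function body, not a separate API):

	inode = iget_locked(sb, ino);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;		/* cache hit: already initialised */

	/* ... read the raw on-disk inode and fill the in-core fields ... */
	if (ret < 0) {
		iget_failed(inode);	/* unlock and discard the half-built inode */
		return ERR_PTR(ret);
	}
	unlock_new_inode(inode);	/* clear I_NEW and wake any waiters */
	return inode;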
2982 2982
2983 /* 2983 /*
2984 * Post the struct inode info into an on-disk inode location in the 2984 * Post the struct inode info into an on-disk inode location in the
2985 * buffer-cache. This gobbles the caller's reference to the 2985 * buffer-cache. This gobbles the caller's reference to the
2986 * buffer_head in the inode location struct. 2986 * buffer_head in the inode location struct.
2987 * 2987 *
2988 * The caller must have write access to iloc->bh. 2988 * The caller must have write access to iloc->bh.
2989 */ 2989 */
2990 static int ext3_do_update_inode(handle_t *handle, 2990 static int ext3_do_update_inode(handle_t *handle,
2991 struct inode *inode, 2991 struct inode *inode,
2992 struct ext3_iloc *iloc) 2992 struct ext3_iloc *iloc)
2993 { 2993 {
2994 struct ext3_inode *raw_inode = ext3_raw_inode(iloc); 2994 struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
2995 struct ext3_inode_info *ei = EXT3_I(inode); 2995 struct ext3_inode_info *ei = EXT3_I(inode);
2996 struct buffer_head *bh = iloc->bh; 2996 struct buffer_head *bh = iloc->bh;
2997 int err = 0, rc, block; 2997 int err = 0, rc, block;
2998 2998
2999 again: 2999 again:
3000 /* we can't allow multiple procs in here at once; it's a bit racy */ 3000 /* we can't allow multiple procs in here at once; it's a bit racy */
3001 lock_buffer(bh); 3001 lock_buffer(bh);
3002 3002
3003 /* For fields not tracked in the in-memory inode, 3003 /* For fields not tracked in the in-memory inode,
3004 * initialise them to zero for new inodes. */ 3004 * initialise them to zero for new inodes. */
3005 if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) 3005 if (ext3_test_inode_state(inode, EXT3_STATE_NEW))
3006 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); 3006 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
3007 3007
3008 ext3_get_inode_flags(ei); 3008 ext3_get_inode_flags(ei);
3009 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 3009 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
3010 if(!(test_opt(inode->i_sb, NO_UID32))) { 3010 if(!(test_opt(inode->i_sb, NO_UID32))) {
3011 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); 3011 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
3012 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); 3012 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
3013 /* 3013 /*
3014 * Fix up interoperability with old kernels. Otherwise, old inodes get 3014 * Fix up interoperability with old kernels. Otherwise, old inodes get
3015 * re-used with the upper 16 bits of the uid/gid intact 3015 * re-used with the upper 16 bits of the uid/gid intact
3016 */ 3016 */
3017 if(!ei->i_dtime) { 3017 if(!ei->i_dtime) {
3018 raw_inode->i_uid_high = 3018 raw_inode->i_uid_high =
3019 cpu_to_le16(high_16_bits(inode->i_uid)); 3019 cpu_to_le16(high_16_bits(inode->i_uid));
3020 raw_inode->i_gid_high = 3020 raw_inode->i_gid_high =
3021 cpu_to_le16(high_16_bits(inode->i_gid)); 3021 cpu_to_le16(high_16_bits(inode->i_gid));
3022 } else { 3022 } else {
3023 raw_inode->i_uid_high = 0; 3023 raw_inode->i_uid_high = 0;
3024 raw_inode->i_gid_high = 0; 3024 raw_inode->i_gid_high = 0;
3025 } 3025 }
3026 } else { 3026 } else {
3027 raw_inode->i_uid_low = 3027 raw_inode->i_uid_low =
3028 cpu_to_le16(fs_high2lowuid(inode->i_uid)); 3028 cpu_to_le16(fs_high2lowuid(inode->i_uid));
3029 raw_inode->i_gid_low = 3029 raw_inode->i_gid_low =
3030 cpu_to_le16(fs_high2lowgid(inode->i_gid)); 3030 cpu_to_le16(fs_high2lowgid(inode->i_gid));
3031 raw_inode->i_uid_high = 0; 3031 raw_inode->i_uid_high = 0;
3032 raw_inode->i_gid_high = 0; 3032 raw_inode->i_gid_high = 0;
3033 } 3033 }
3034 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 3034 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
3035 raw_inode->i_size = cpu_to_le32(ei->i_disksize); 3035 raw_inode->i_size = cpu_to_le32(ei->i_disksize);
3036 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); 3036 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
3037 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 3037 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
3038 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); 3038 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
3039 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); 3039 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
3040 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 3040 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
3041 raw_inode->i_flags = cpu_to_le32(ei->i_flags); 3041 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
3042 #ifdef EXT3_FRAGMENTS 3042 #ifdef EXT3_FRAGMENTS
3043 raw_inode->i_faddr = cpu_to_le32(ei->i_faddr); 3043 raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
3044 raw_inode->i_frag = ei->i_frag_no; 3044 raw_inode->i_frag = ei->i_frag_no;
3045 raw_inode->i_fsize = ei->i_frag_size; 3045 raw_inode->i_fsize = ei->i_frag_size;
3046 #endif 3046 #endif
3047 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); 3047 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
3048 if (!S_ISREG(inode->i_mode)) { 3048 if (!S_ISREG(inode->i_mode)) {
3049 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); 3049 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
3050 } else { 3050 } else {
3051 raw_inode->i_size_high = 3051 raw_inode->i_size_high =
3052 cpu_to_le32(ei->i_disksize >> 32); 3052 cpu_to_le32(ei->i_disksize >> 32);
3053 if (ei->i_disksize > 0x7fffffffULL) { 3053 if (ei->i_disksize > 0x7fffffffULL) {
3054 struct super_block *sb = inode->i_sb; 3054 struct super_block *sb = inode->i_sb;
3055 if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, 3055 if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
3056 EXT3_FEATURE_RO_COMPAT_LARGE_FILE) || 3056 EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
3057 EXT3_SB(sb)->s_es->s_rev_level == 3057 EXT3_SB(sb)->s_es->s_rev_level ==
3058 cpu_to_le32(EXT3_GOOD_OLD_REV)) { 3058 cpu_to_le32(EXT3_GOOD_OLD_REV)) {
3059 /* If this is the first large file 3059 /* If this is the first large file
3060 * created, add a flag to the superblock. 3060 * created, add a flag to the superblock.
3061 */ 3061 */
3062 unlock_buffer(bh); 3062 unlock_buffer(bh);
3063 err = ext3_journal_get_write_access(handle, 3063 err = ext3_journal_get_write_access(handle,
3064 EXT3_SB(sb)->s_sbh); 3064 EXT3_SB(sb)->s_sbh);
3065 if (err) 3065 if (err)
3066 goto out_brelse; 3066 goto out_brelse;
3067 3067
3068 ext3_update_dynamic_rev(sb); 3068 ext3_update_dynamic_rev(sb);
3069 EXT3_SET_RO_COMPAT_FEATURE(sb, 3069 EXT3_SET_RO_COMPAT_FEATURE(sb,
3070 EXT3_FEATURE_RO_COMPAT_LARGE_FILE); 3070 EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
3071 handle->h_sync = 1; 3071 handle->h_sync = 1;
3072 err = ext3_journal_dirty_metadata(handle, 3072 err = ext3_journal_dirty_metadata(handle,
3073 EXT3_SB(sb)->s_sbh); 3073 EXT3_SB(sb)->s_sbh);
3074 /* get our lock and start over */ 3074 /* get our lock and start over */
3075 goto again; 3075 goto again;
3076 } 3076 }
3077 } 3077 }
3078 } 3078 }
3079 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 3079 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
3080 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 3080 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
3081 if (old_valid_dev(inode->i_rdev)) { 3081 if (old_valid_dev(inode->i_rdev)) {
3082 raw_inode->i_block[0] = 3082 raw_inode->i_block[0] =
3083 cpu_to_le32(old_encode_dev(inode->i_rdev)); 3083 cpu_to_le32(old_encode_dev(inode->i_rdev));
3084 raw_inode->i_block[1] = 0; 3084 raw_inode->i_block[1] = 0;
3085 } else { 3085 } else {
3086 raw_inode->i_block[0] = 0; 3086 raw_inode->i_block[0] = 0;
3087 raw_inode->i_block[1] = 3087 raw_inode->i_block[1] =
3088 cpu_to_le32(new_encode_dev(inode->i_rdev)); 3088 cpu_to_le32(new_encode_dev(inode->i_rdev));
3089 raw_inode->i_block[2] = 0; 3089 raw_inode->i_block[2] = 0;
3090 } 3090 }
3091 } else for (block = 0; block < EXT3_N_BLOCKS; block++) 3091 } else for (block = 0; block < EXT3_N_BLOCKS; block++)
3092 raw_inode->i_block[block] = ei->i_data[block]; 3092 raw_inode->i_block[block] = ei->i_data[block];
3093 3093
3094 if (ei->i_extra_isize) 3094 if (ei->i_extra_isize)
3095 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 3095 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
3096 3096
3097 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 3097 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
3098 unlock_buffer(bh); 3098 unlock_buffer(bh);
3099 rc = ext3_journal_dirty_metadata(handle, bh); 3099 rc = ext3_journal_dirty_metadata(handle, bh);
3100 if (!err) 3100 if (!err)
3101 err = rc; 3101 err = rc;
3102 ext3_clear_inode_state(inode, EXT3_STATE_NEW); 3102 ext3_clear_inode_state(inode, EXT3_STATE_NEW);
3103 3103
3104 atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); 3104 atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
3105 out_brelse: 3105 out_brelse:
3106 brelse (bh); 3106 brelse (bh);
3107 ext3_std_error(inode->i_sb, err); 3107 ext3_std_error(inode->i_sb, err);
3108 return err; 3108 return err;
3109 } 3109 }
3110 3110
3111 /* 3111 /*
3112 * ext3_write_inode() 3112 * ext3_write_inode()
3113 * 3113 *
3114 * We are called from a few places: 3114 * We are called from a few places:
3115 * 3115 *
3116 * - Within generic_file_write() for O_SYNC files. 3116 * - Within generic_file_write() for O_SYNC files.
3117 * Here, there will be no transaction running. We wait for any running 3117 * Here, there will be no transaction running. We wait for any running
3118 * transaction to commit. 3118 * transaction to commit.
3119 * 3119 *
3120 * - Within sys_sync(), kupdate and such. 3120 * - Within sys_sync(), kupdate and such.
3121 * We wait on commit, if told to. 3121 * We wait on commit, if told to.
3122 * 3122 *
3123 * - Within prune_icache() (PF_MEMALLOC == true) 3123 * - Within prune_icache() (PF_MEMALLOC == true)
3124 * Here we simply return. We can't afford to block kswapd on the 3124 * Here we simply return. We can't afford to block kswapd on the
3125 * journal commit. 3125 * journal commit.
3126 * 3126 *
3127 * In all cases it is actually safe for us to return without doing anything, 3127 * In all cases it is actually safe for us to return without doing anything,
3128 * because the inode has been copied into a raw inode buffer in 3128 * because the inode has been copied into a raw inode buffer in
3129 * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for 3129 * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
3130 * knfsd. 3130 * knfsd.
3131 * 3131 *
3132 * Note that we are absolutely dependent upon all inode dirtiers doing the 3132 * Note that we are absolutely dependent upon all inode dirtiers doing the
3133 * right thing: they *must* call mark_inode_dirty() after dirtying info in 3133 * right thing: they *must* call mark_inode_dirty() after dirtying info in
3134 * which we are interested. 3134 * which we are interested.
3135 * 3135 *
3136 * It would be a bug for them to not do this. The code: 3136 * It would be a bug for them to not do this. The code:
3137 * 3137 *
3138 * mark_inode_dirty(inode) 3138 * mark_inode_dirty(inode)
3139 * stuff(); 3139 * stuff();
3140 * inode->i_size = expr; 3140 * inode->i_size = expr;
3141 * 3141 *
3142 * is in error because a kswapd-driven write_inode() could occur while 3142 * is in error because a kswapd-driven write_inode() could occur while
3143 * `stuff()' is running, and the new i_size will be lost. Plus the inode 3143 * `stuff()' is running, and the new i_size will be lost. Plus the inode
3144 * will no longer be on the superblock's dirty inode list. 3144 * will no longer be on the superblock's dirty inode list.
3145 */ 3145 */
3146 int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) 3146 int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
3147 { 3147 {
3148 if (current->flags & PF_MEMALLOC) 3148 if (current->flags & PF_MEMALLOC)
3149 return 0; 3149 return 0;
3150 3150
3151 if (ext3_journal_current_handle()) { 3151 if (ext3_journal_current_handle()) {
3152 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); 3152 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
3153 dump_stack(); 3153 dump_stack();
3154 return -EIO; 3154 return -EIO;
3155 } 3155 }
3156 3156
3157 if (wbc->sync_mode != WB_SYNC_ALL) 3157 if (wbc->sync_mode != WB_SYNC_ALL)
3158 return 0; 3158 return 0;
3159 3159
3160 return ext3_force_commit(inode->i_sb); 3160 return ext3_force_commit(inode->i_sb);
3161 } 3161 }
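For contrast with the buggy ordering shown in the comment above, a sketch of the ordering dirtiers must use, so that a concurrent kswapd-driven write_inode() can never snapshot a stale i_size:

	stuff();			/* finish all in-core updates first ... */
	inode->i_size = expr;
	mark_inode_dirty(inode);	/* ... and mark the inode dirty last */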
3162 3162
3163 /* 3163 /*
3164 * ext3_setattr() 3164 * ext3_setattr()
3165 * 3165 *
3166 * Called from notify_change. 3166 * Called from notify_change.
3167 * 3167 *
3168 * We want to trap VFS attempts to truncate the file as soon as 3168 * We want to trap VFS attempts to truncate the file as soon as
3169 * possible. In particular, we want to make sure that when the VFS 3169 * possible. In particular, we want to make sure that when the VFS
3170 * shrinks i_size, we put the inode on the orphan list and modify 3170 * shrinks i_size, we put the inode on the orphan list and modify
3171 * i_disksize immediately, so that during the subsequent flushing of 3171 * i_disksize immediately, so that during the subsequent flushing of
3172 * dirty pages and freeing of disk blocks, we can guarantee that any 3172 * dirty pages and freeing of disk blocks, we can guarantee that any
3173 * commit will leave the blocks being flushed in an unused state on 3173 * commit will leave the blocks being flushed in an unused state on
3174 * disk. (On recovery, the inode will get truncated and the blocks will 3174 * disk. (On recovery, the inode will get truncated and the blocks will
3175 * be freed, so we have a strong guarantee that no future commit will 3175 * be freed, so we have a strong guarantee that no future commit will
3176 * leave these blocks visible to the user.) 3176 * leave these blocks visible to the user.)
3177 * 3177 *
3178 * Called with inode->sem down. 3178 * Called with inode->sem down.
3179 */ 3179 */
3180 int ext3_setattr(struct dentry *dentry, struct iattr *attr) 3180 int ext3_setattr(struct dentry *dentry, struct iattr *attr)
3181 { 3181 {
3182 struct inode *inode = dentry->d_inode; 3182 struct inode *inode = dentry->d_inode;
3183 int error, rc = 0; 3183 int error, rc = 0;
3184 const unsigned int ia_valid = attr->ia_valid; 3184 const unsigned int ia_valid = attr->ia_valid;
3185 3185
3186 error = inode_change_ok(inode, attr); 3186 error = inode_change_ok(inode, attr);
3187 if (error) 3187 if (error)
3188 return error; 3188 return error;
3189 3189
3190 if (is_quota_modification(inode, attr)) 3190 if (is_quota_modification(inode, attr))
3191 dquot_initialize(inode); 3191 dquot_initialize(inode);
3192 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 3192 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
3193 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 3193 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
3194 handle_t *handle; 3194 handle_t *handle;
3195 3195
3196 /* (user+group)*(old+new) structure, inode write (sb, 3196 /* (user+group)*(old+new) structure, inode write (sb,
3197 * inode block, ? - but truncate inode update has it) */ 3197 * inode block, ? - but truncate inode update has it) */
3198 handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ 3198 handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
3199 EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3); 3199 EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3);
3200 if (IS_ERR(handle)) { 3200 if (IS_ERR(handle)) {
3201 error = PTR_ERR(handle); 3201 error = PTR_ERR(handle);
3202 goto err_out; 3202 goto err_out;
3203 } 3203 }
3204 error = dquot_transfer(inode, attr); 3204 error = dquot_transfer(inode, attr);
3205 if (error) { 3205 if (error) {
3206 ext3_journal_stop(handle); 3206 ext3_journal_stop(handle);
3207 return error; 3207 return error;
3208 } 3208 }
3209 /* Update corresponding info in inode so that everything is in 3209 /* Update corresponding info in inode so that everything is in
3210 * one transaction */ 3210 * one transaction */
3211 if (attr->ia_valid & ATTR_UID) 3211 if (attr->ia_valid & ATTR_UID)
3212 inode->i_uid = attr->ia_uid; 3212 inode->i_uid = attr->ia_uid;
3213 if (attr->ia_valid & ATTR_GID) 3213 if (attr->ia_valid & ATTR_GID)
3214 inode->i_gid = attr->ia_gid; 3214 inode->i_gid = attr->ia_gid;
3215 error = ext3_mark_inode_dirty(handle, inode); 3215 error = ext3_mark_inode_dirty(handle, inode);
3216 ext3_journal_stop(handle); 3216 ext3_journal_stop(handle);
3217 } 3217 }
3218 3218
3219 if (attr->ia_valid & ATTR_SIZE)
3220 inode_dio_wait(inode);
3221
3219 if (S_ISREG(inode->i_mode) && 3222 if (S_ISREG(inode->i_mode) &&
3220 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { 3223 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
3221 handle_t *handle; 3224 handle_t *handle;
3222 3225
3223 handle = ext3_journal_start(inode, 3); 3226 handle = ext3_journal_start(inode, 3);
3224 if (IS_ERR(handle)) { 3227 if (IS_ERR(handle)) {
3225 error = PTR_ERR(handle); 3228 error = PTR_ERR(handle);
3226 goto err_out; 3229 goto err_out;
3227 } 3230 }
3228 3231
3229 error = ext3_orphan_add(handle, inode); 3232 error = ext3_orphan_add(handle, inode);
3230 EXT3_I(inode)->i_disksize = attr->ia_size; 3233 EXT3_I(inode)->i_disksize = attr->ia_size;
3231 rc = ext3_mark_inode_dirty(handle, inode); 3234 rc = ext3_mark_inode_dirty(handle, inode);
3232 if (!error) 3235 if (!error)
3233 error = rc; 3236 error = rc;
3234 ext3_journal_stop(handle); 3237 ext3_journal_stop(handle);
3235 } 3238 }
3236 3239
3237 if ((attr->ia_valid & ATTR_SIZE) && 3240 if ((attr->ia_valid & ATTR_SIZE) &&
3238 attr->ia_size != i_size_read(inode)) { 3241 attr->ia_size != i_size_read(inode)) {
3239 rc = vmtruncate(inode, attr->ia_size); 3242 rc = vmtruncate(inode, attr->ia_size);
3240 if (rc) 3243 if (rc)
3241 goto err_out; 3244 goto err_out;
3242 } 3245 }
3243 3246
3244 setattr_copy(inode, attr); 3247 setattr_copy(inode, attr);
3245 mark_inode_dirty(inode); 3248 mark_inode_dirty(inode);
3246 3249
3247 if (ia_valid & ATTR_MODE) 3250 if (ia_valid & ATTR_MODE)
3248 rc = ext3_acl_chmod(inode); 3251 rc = ext3_acl_chmod(inode);
3249 3252
3250 err_out: 3253 err_out:
3251 ext3_std_error(inode->i_sb, error); 3254 ext3_std_error(inode->i_sb, error);
3252 if (!error) 3255 if (!error)
3253 error = rc; 3256 error = rc;
3254 return error; 3257 return error;
3255 } 3258 }
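The two lines added above (new right-hand column only) are the point of this commit: the direct I/O drain now happens inside ->setattr rather than in the caller, so a filesystem can take its own locks first to keep new dio references from appearing. A hedged sketch of the resulting pattern for any filesystem's setattr (example_setattr and the lock placement comment are illustrative, not kernel API):

static int example_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if (attr->ia_valid & ATTR_SIZE) {
		/* Filesystem-specific locks that prevent new direct I/O
		 * from starting may be taken here, before the drain. */
		inode_dio_wait(inode);	/* wait out in-flight direct I/O */
	}

	/* ... perform the size change, then setattr_copy() and
	 * mark_inode_dirty(), as ext3_setattr() does above ... */
	return 0;
}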
3256 3259
3257 3260
3258 /* 3261 /*
3259 * How many blocks doth make a writepage()? 3262 * How many blocks doth make a writepage()?
3260 * 3263 *
3261 * With N blocks per page, it may be: 3264 * With N blocks per page, it may be:
3262 * N data blocks 3265 * N data blocks
3263 * 2 indirect block 3266 * 2 indirect block
3264 * 2 dindirect 3267 * 2 dindirect
3265 * 1 tindirect 3268 * 1 tindirect
3266 * N+5 bitmap blocks (from the above) 3269 * N+5 bitmap blocks (from the above)
3267 * N+5 group descriptor summary blocks 3270 * N+5 group descriptor summary blocks
3268 * 1 inode block 3271 * 1 inode block
3269 * 1 superblock. 3272 * 1 superblock.
3270 * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files 3273 * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
3271 * 3274 *
3272 * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS 3275 * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
3273 * 3276 *
3274 * With ordered or writeback data it's the same, less the N data blocks. 3277 * With ordered or writeback data it's the same, less the N data blocks.
3275 * 3278 *
3276 * If the inode's direct blocks can hold an integral number of pages then a 3279 * If the inode's direct blocks can hold an integral number of pages then a
3277 * page cannot straddle two indirect blocks, and we can only touch one indirect 3280 * page cannot straddle two indirect blocks, and we can only touch one indirect
3278 * and dindirect block, and the "5" above becomes "3". 3281 * and dindirect block, and the "5" above becomes "3".
3279 * 3282 *
3280 * This still overestimates under most circumstances. If we were to pass the 3283 * This still overestimates under most circumstances. If we were to pass the
3281 * start and end offsets in here as well we could do block_to_path() on each 3284 * start and end offsets in here as well we could do block_to_path() on each
3282 * block and work out the exact number of indirects which are touched. Pah. 3285 * block and work out the exact number of indirects which are touched. Pah.
3283 */ 3286 */
3284 3287
3285 static int ext3_writepage_trans_blocks(struct inode *inode) 3288 static int ext3_writepage_trans_blocks(struct inode *inode)
3286 { 3289 {
3287 int bpp = ext3_journal_blocks_per_page(inode); 3290 int bpp = ext3_journal_blocks_per_page(inode);
3288 int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; 3291 int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
3289 int ret; 3292 int ret;
3290 3293
3291 if (ext3_should_journal_data(inode)) 3294 if (ext3_should_journal_data(inode))
3292 ret = 3 * (bpp + indirects) + 2; 3295 ret = 3 * (bpp + indirects) + 2;
3293 else 3296 else
3294 ret = 2 * (bpp + indirects) + indirects + 2; 3297 ret = 2 * (bpp + indirects) + indirects + 2;
3295 3298
3296 #ifdef CONFIG_QUOTA 3299 #ifdef CONFIG_QUOTA
3297 /* We know that structure was already allocated during dquot_initialize so 3300 /* We know that structure was already allocated during dquot_initialize so
3298 * we will be updating only the data blocks + inodes */ 3301 * we will be updating only the data blocks + inodes */
3299 ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); 3302 ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
3300 #endif 3303 #endif
3301 3304
3302 return ret; 3305 return ret;
3303 } 3306 }
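To make the estimate above concrete: with 4096-byte pages and 1024-byte blocks, bpp = 4; EXT3_NDIR_BLOCKS (12) is divisible by 4, so indirects = 3. Data journaling then reserves 3 * (4 + 3) + 2 = 23 blocks per page, while ordered or writeback mode reserves 2 * (4 + 3) + 3 + 2 = 19, plus the EXT3_MAXQUOTAS_TRANS_BLOCKS allowance when CONFIG_QUOTA is enabled.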
3304 3307
3305 /* 3308 /*
3306 * The caller must have previously called ext3_reserve_inode_write(). 3309 * The caller must have previously called ext3_reserve_inode_write().
3307 * Given this, we know that the caller already has write access to iloc->bh. 3310 * Given this, we know that the caller already has write access to iloc->bh.
3308 */ 3311 */
3309 int ext3_mark_iloc_dirty(handle_t *handle, 3312 int ext3_mark_iloc_dirty(handle_t *handle,
3310 struct inode *inode, struct ext3_iloc *iloc) 3313 struct inode *inode, struct ext3_iloc *iloc)
3311 { 3314 {
3312 int err = 0; 3315 int err = 0;
3313 3316
3314 /* the do_update_inode consumes one bh->b_count */ 3317 /* the do_update_inode consumes one bh->b_count */
3315 get_bh(iloc->bh); 3318 get_bh(iloc->bh);
3316 3319
3317 /* ext3_do_update_inode() does journal_dirty_metadata */ 3320 /* ext3_do_update_inode() does journal_dirty_metadata */
3318 err = ext3_do_update_inode(handle, inode, iloc); 3321 err = ext3_do_update_inode(handle, inode, iloc);
3319 put_bh(iloc->bh); 3322 put_bh(iloc->bh);
3320 return err; 3323 return err;
3321 } 3324 }
3322 3325
3323 /* 3326 /*
3324 * On success, we end up with an outstanding reference count against 3327 * On success, we end up with an outstanding reference count against
3325 * iloc->bh. This _must_ be cleaned up later. 3328 * iloc->bh. This _must_ be cleaned up later.
3326 */ 3329 */
3327 3330
3328 int 3331 int
3329 ext3_reserve_inode_write(handle_t *handle, struct inode *inode, 3332 ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
3330 struct ext3_iloc *iloc) 3333 struct ext3_iloc *iloc)
3331 { 3334 {
3332 int err = 0; 3335 int err = 0;
3333 if (handle) { 3336 if (handle) {
3334 err = ext3_get_inode_loc(inode, iloc); 3337 err = ext3_get_inode_loc(inode, iloc);
3335 if (!err) { 3338 if (!err) {
3336 BUFFER_TRACE(iloc->bh, "get_write_access"); 3339 BUFFER_TRACE(iloc->bh, "get_write_access");
3337 err = ext3_journal_get_write_access(handle, iloc->bh); 3340 err = ext3_journal_get_write_access(handle, iloc->bh);
3338 if (err) { 3341 if (err) {
3339 brelse(iloc->bh); 3342 brelse(iloc->bh);
3340 iloc->bh = NULL; 3343 iloc->bh = NULL;
3341 } 3344 }
3342 } 3345 }
3343 } 3346 }
3344 ext3_std_error(inode->i_sb, err); 3347 ext3_std_error(inode->i_sb, err);
3345 return err; 3348 return err;
3346 } 3349 }
3347 3350
3348 /* 3351 /*
3349 * What we do here is to mark the in-core inode as clean with respect to inode 3352 * What we do here is to mark the in-core inode as clean with respect to inode
3350 * dirtiness (it may still be data-dirty). 3353 * dirtiness (it may still be data-dirty).
3351 * This means that the in-core inode may be reaped by prune_icache 3354 * This means that the in-core inode may be reaped by prune_icache
3352 * without having to perform any I/O. This is a very good thing, 3355 * without having to perform any I/O. This is a very good thing,
3353 * because *any* task may call prune_icache - even ones which 3356 * because *any* task may call prune_icache - even ones which
3354 * have a transaction open against a different journal. 3357 * have a transaction open against a different journal.
3355 * 3358 *
3356 * Is this cheating? Not really. Sure, we haven't written the 3359 * Is this cheating? Not really. Sure, we haven't written the
3357 * inode out, but prune_icache isn't a user-visible syncing function. 3360 * inode out, but prune_icache isn't a user-visible syncing function.
3358 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) 3361 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
3359 * we start and wait on commits. 3362 * we start and wait on commits.
3360 * 3363 *
3361 * Is this efficient/effective? Well, we're being nice to the system 3364 * Is this efficient/effective? Well, we're being nice to the system
3362 * by cleaning up our inodes proactively so they can be reaped 3365 * by cleaning up our inodes proactively so they can be reaped
3363 * without I/O. But we are potentially leaving up to five seconds' 3366 * without I/O. But we are potentially leaving up to five seconds'
3364 * worth of inodes floating about which prune_icache wants us to 3367 * worth of inodes floating about which prune_icache wants us to
3365 * write out. One way to fix that would be to get prune_icache() 3368 * write out. One way to fix that would be to get prune_icache()
3366 * to do a write_super() to free up some memory. It has the desired 3369 * to do a write_super() to free up some memory. It has the desired
3367 * effect. 3370 * effect.
3368 */ 3371 */
3369 int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) 3372 int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
3370 { 3373 {
3371 struct ext3_iloc iloc; 3374 struct ext3_iloc iloc;
3372 int err; 3375 int err;
3373 3376
3374 might_sleep(); 3377 might_sleep();
3375 err = ext3_reserve_inode_write(handle, inode, &iloc); 3378 err = ext3_reserve_inode_write(handle, inode, &iloc);
3376 if (!err) 3379 if (!err)
3377 err = ext3_mark_iloc_dirty(handle, inode, &iloc); 3380 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
3378 return err; 3381 return err;
3379 } 3382 }
3380 3383
3381 /* 3384 /*
3382 * ext3_dirty_inode() is called from __mark_inode_dirty() 3385 * ext3_dirty_inode() is called from __mark_inode_dirty()
3383 * 3386 *
3384 * We're really interested in the case where a file is being extended. 3387 * We're really interested in the case where a file is being extended.
3385 * i_size has been changed by generic_commit_write() and we thus need 3388 * i_size has been changed by generic_commit_write() and we thus need
3386 * to include the updated inode in the current transaction. 3389 * to include the updated inode in the current transaction.
3387 * 3390 *
3388 * Also, dquot_alloc_space() will always dirty the inode when blocks 3391 * Also, dquot_alloc_space() will always dirty the inode when blocks
3389 * are allocated to the file. 3392 * are allocated to the file.
3390 * 3393 *
3391 * If the inode is marked synchronous, we don't honour that here - doing 3394 * If the inode is marked synchronous, we don't honour that here - doing
 * so would cause a commit on atime updates, which we don't bother doing.
 * We handle synchronous inodes at the highest possible level.
 */
void ext3_dirty_inode(struct inode *inode, int flags)
{
        handle_t *current_handle = ext3_journal_current_handle();
        handle_t *handle;

        handle = ext3_journal_start(inode, 2);
        if (IS_ERR(handle))
                goto out;
        if (current_handle &&
                current_handle->h_transaction != handle->h_transaction) {
                /* This task has a transaction open against a different fs */
                printk(KERN_EMERG "%s: transactions do not match!\n",
                       __func__);
        } else {
                jbd_debug(5, "marking dirty.  outer handle=%p\n",
                                current_handle);
                ext3_mark_inode_dirty(handle, inode);
        }
        ext3_journal_stop(handle);
out:
        return;
}

#if 0
/*
 * Bind an inode's backing buffer_head into this transaction, to prevent
 * it from being flushed to disk early.  Unlike
 * ext3_reserve_inode_write, this leaves behind no bh reference and
 * returns no iloc structure, so the caller needs to repeat the iloc
 * lookup to mark the inode dirty later.
 */
static int ext3_pin_inode(handle_t *handle, struct inode *inode)
{
        struct ext3_iloc iloc;

        int err = 0;
        if (handle) {
                err = ext3_get_inode_loc(inode, &iloc);
                if (!err) {
                        BUFFER_TRACE(iloc.bh, "get_write_access");
                        err = journal_get_write_access(handle, iloc.bh);
                        if (!err)
                                err = ext3_journal_dirty_metadata(handle,
                                                                  iloc.bh);
                        brelse(iloc.bh);
                }
        }
        ext3_std_error(inode->i_sb, err);
        return err;
}
#endif

int ext3_change_inode_journal_flag(struct inode *inode, int val)
{
        journal_t *journal;
        handle_t *handle;
        int err;

        /*
         * We have to be very careful here: changing a data block's
         * journaling status dynamically is dangerous.  If we write a
         * data block to the journal, change the status and then delete
         * that block, we risk forgetting to revoke the old log record
         * from the journal and so a subsequent replay can corrupt data.
         * So, first we make sure that the journal is empty and that
         * nobody is changing anything.
         */

        journal = EXT3_JOURNAL(inode);
        if (is_journal_aborted(journal))
                return -EROFS;

        journal_lock_updates(journal);
        journal_flush(journal);

        /*
         * OK, there are no updates running now, and all cached data is
         * synced to disk.  We are now in a completely consistent state
         * which doesn't have anything in the journal, and we know that
         * no filesystem updates are running, so it is safe to modify
         * the inode's in-core data-journaling state flag now.
         */

        if (val)
                EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
        else
                EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
        ext3_set_aops(inode);

        journal_unlock_updates(journal);

        /* Finally we can mark the inode as dirty. */

        handle = ext3_journal_start(inode, 1);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        err = ext3_mark_inode_dirty(handle, inode);
        handle->h_sync = 1;
        ext3_journal_stop(handle);
        ext3_std_error(inode->i_sb, err);

        return err;
}

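The in-core flag flip above is what the FS_IOC_SETFLAGS ioctl path (the same
one chattr +j issues) ultimately reaches. A minimal user-space sketch of that
caller side - illustrative only, not part of this commit, and assuming a
Linux <linux/fs.h> that defines FS_JOURNAL_DATA_FL:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
        int fd, flags;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <file>\n", argv[0]);
                return 1;
        }
        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* read the current inode flags */
        if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
                perror("FS_IOC_GETFLAGS");
                close(fd);
                return 1;
        }
        /* set the data-journaling bit, the equivalent of chattr +j */
        flags |= FS_JOURNAL_DATA_FL;
        if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0) {
                perror("FS_IOC_SETFLAGS");
                close(fd);
                return 1;
        }
        close(fd);
        return 0;
}
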
/*
 *  linux/fs/ext4/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Goal-directed block allocation by Stephen Tweedie
 *      (sct@redhat.com), 1993, 1998
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *      David S. Miller (davem@caip.rutgers.edu), 1995
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *      (jj@sunsite.ms.mff.cuni.cz)
 *
 *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
 */

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>

#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "ext4_extents.h"

#include <trace/events/ext4.h>

#define MPAGE_DA_EXTENT_TAIL 0x01

static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                              loff_t new_size)
{
        trace_ext4_begin_ordered_truncate(inode, new_size);
        /*
         * If jinode is zero, then we never opened the file for
         * writing, so there's no need to call
         * jbd2_journal_begin_ordered_truncate() since there are no
         * outstanding writes we need to flush.
         */
        if (!EXT4_I(inode)->jinode)
                return 0;
        return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
                                                   EXT4_I(inode)->jinode,
                                                   new_size);
}

static void ext4_invalidatepage(struct page *page, unsigned long offset);
static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
                                   struct buffer_head *bh_result, int create);
static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
static int __ext4_journalled_writepage(struct page *page, unsigned int len);
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);

/*
 * Test whether an inode is a fast symlink.
 */
static int ext4_inode_is_fast_symlink(struct inode *inode)
{
        int ea_blocks = EXT4_I(inode)->i_file_acl ?
                (inode->i_sb->s_blocksize >> 9) : 0;

        return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
}
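
/*
 * Illustrative note (assuming a 4KB block size): a short symlink target
 * is stored in i_data itself, so i_blocks is 0 unless the inode also
 * carries an xattr block; that block accounts for blocksize >> 9 == 8
 * 512-byte sectors, which is exactly what ea_blocks subtracts above.
 */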

/*
 * Work out how many blocks we need to proceed with the next chunk of a
 * truncate transaction.
 */
static unsigned long blocks_for_truncate(struct inode *inode)
{
        ext4_lblk_t needed;

        needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);

        /* Give ourselves just enough room to cope with inodes in which
         * i_blocks is corrupt: we've seen disk corruptions in the past
         * which resulted in random data in an inode which looked enough
         * like a regular file for ext4 to try to delete it.  Things
         * will go a bit crazy if that happens, but at least we should
         * try not to panic the whole kernel. */
        if (needed < 2)
                needed = 2;

        /* But we need to bound the transaction so we don't overflow the
         * journal. */
        if (needed > EXT4_MAX_TRANS_DATA)
                needed = EXT4_MAX_TRANS_DATA;

        return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
}
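
/*
 * Worked example (illustrative, assuming a 4KB block size, i.e.
 * s_blocksize_bits == 12): i_blocks counts 512-byte sectors, so an
 * inode with i_blocks == 80 yields 80 >> (12 - 9) == 10 blocks; that
 * is clamped into [2, EXT4_MAX_TRANS_DATA] and the fixed
 * EXT4_DATA_TRANS_BLOCKS(sb) overhead is added on top.
 */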

/*
 * Truncate transactions can be complex and absolutely huge.  So we need to
 * be able to restart the transaction at a convenient checkpoint to make
 * sure we don't overflow the journal.
 *
 * start_transaction gets us a new handle for a truncate transaction,
 * and extend_transaction tries to extend the existing one a bit.  If
 * extend fails, we need to propagate the failure up and restart the
 * transaction in the top-level truncate loop. --sct
 */
static handle_t *start_transaction(struct inode *inode)
{
        handle_t *result;

        result = ext4_journal_start(inode, blocks_for_truncate(inode));
        if (!IS_ERR(result))
                return result;

        ext4_std_error(inode->i_sb, PTR_ERR(result));
        return result;
}

/*
 * Try to extend this transaction for the purposes of truncation.
 *
 * Returns 0 if we managed to create more room.  If we can't create more
 * room, and the transaction must be restarted, we return 1.
 */
static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
{
        if (!ext4_handle_valid(handle))
                return 0;
        if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
                return 0;
        if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
                return 0;
        return 1;
}

/*
 * Restart the transaction associated with *handle.  This does a commit,
 * so before we call here everything must be consistently dirtied against
 * this transaction.
 */
int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
                                int nblocks)
{
        int ret;

        /*
         * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
         * moment, get_block can be called only for blocks inside i_size since
         * page cache has been already dropped and writes are blocked by
         * i_mutex.  So we can safely drop the i_data_sem here.
         */
        BUG_ON(EXT4_JOURNAL(inode) == NULL);
        jbd_debug(2, "restarting handle %p\n", handle);
        up_write(&EXT4_I(inode)->i_data_sem);
        ret = ext4_journal_restart(handle, nblocks);
        down_write(&EXT4_I(inode)->i_data_sem);
        ext4_discard_preallocations(inode);

        return ret;
}

/*
 * Called at the last iput() if i_nlink is zero.
 */
void ext4_evict_inode(struct inode *inode)
{
        handle_t *handle;
        int err;

        trace_ext4_evict_inode(inode);
        if (inode->i_nlink) {
                truncate_inode_pages(&inode->i_data, 0);
                goto no_delete;
        }

        if (!is_bad_inode(inode))
                dquot_initialize(inode);

        if (ext4_should_order_data(inode))
                ext4_begin_ordered_truncate(inode, 0);
        truncate_inode_pages(&inode->i_data, 0);

        if (is_bad_inode(inode))
                goto no_delete;

        handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
        if (IS_ERR(handle)) {
                ext4_std_error(inode->i_sb, PTR_ERR(handle));
                /*
                 * If we're going to skip the normal cleanup, we still need to
                 * make sure that the in-core orphan linked list is properly
                 * cleaned up.
                 */
                ext4_orphan_del(NULL, inode);
                goto no_delete;
        }

        if (IS_SYNC(inode))
                ext4_handle_sync(handle);
        inode->i_size = 0;
        err = ext4_mark_inode_dirty(handle, inode);
        if (err) {
                ext4_warning(inode->i_sb,
                             "couldn't mark inode dirty (err %d)", err);
                goto stop_handle;
        }
        if (inode->i_blocks)
                ext4_truncate(inode);

        /*
         * ext4_ext_truncate() doesn't reserve any slop when it
         * restarts journal transactions; therefore there may not be
         * enough credits left in the handle to remove the inode from
         * the orphan list and set the dtime field.
         */
        if (!ext4_handle_has_enough_credits(handle, 3)) {
                err = ext4_journal_extend(handle, 3);
                if (err > 0)
                        err = ext4_journal_restart(handle, 3);
                if (err != 0) {
                        ext4_warning(inode->i_sb,
                                     "couldn't extend journal (err %d)", err);
                stop_handle:
                        ext4_journal_stop(handle);
                        ext4_orphan_del(NULL, inode);
                        goto no_delete;
                }
        }

        /*
         * Kill off the orphan record which ext4_truncate created.
         * AKPM: I think this can be inside the above `if'.
         * Note that ext4_orphan_del() has to be able to cope with the
         * deletion of a non-existent orphan - this is because we don't
         * know if ext4_truncate() actually created an orphan record.
         * (Well, we could do this if we need to, but heck - it works)
         */
        ext4_orphan_del(handle, inode);
        EXT4_I(inode)->i_dtime = get_seconds();

        /*
         * One subtle ordering requirement: if anything has gone wrong
         * (transaction abort, IO errors, whatever), then we can still
         * do these next steps (the fs will already have been marked as
         * having errors), but we can't free the inode if the mark_dirty
         * fails.
         */
        if (ext4_mark_inode_dirty(handle, inode))
                /* If that failed, just do the required in-core inode clear. */
                ext4_clear_inode(inode);
        else
                ext4_free_inode(handle, inode);
        ext4_journal_stop(handle);
        return;
no_delete:
        ext4_clear_inode(inode);        /* We must guarantee clearing of inode... */
}

typedef struct {
        __le32  *p;
        __le32  key;
        struct buffer_head *bh;
} Indirect;

static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
{
        p->key = *(p->p = v);
        p->bh = bh;
}

/**
 * ext4_block_to_path - parse the block number into array of offsets
 * @inode: inode in question (we are only interested in its superblock)
 * @i_block: block number to be parsed
 * @offsets: array to store the offsets in
 * @boundary: set this non-zero if the referred-to block is likely to be
 *        followed (on disk) by an indirect block.
 *
 * To store the locations of a file's data ext4 uses a data structure
 * common for UNIX filesystems - a tree of pointers anchored in the inode,
 * with data blocks at the leaves and indirect blocks in intermediate
 * nodes.  This function translates the block number into a path in that
 * tree - the return value is the path length and @offsets[n] is the
 * offset of the pointer to the (n+1)th node in the nth one.  If @block is
 * out of range (negative or too large) a warning is printed and zero is
 * returned.
 *
 * Note: function doesn't find node addresses, so no IO is needed.  All
 * we need to know is the capacity of indirect blocks (taken from the
 * inode->i_sb).
 */

/*
 * Portability note: the last comparison (check that we fit into triple
 * indirect block) is spelled differently, because otherwise on an
 * architecture with 32-bit longs and 8Kb pages we might get into trouble
 * if our filesystem had 8Kb blocks.  We might use long long, but that would
 * kill us on x86.  Oh, well, at least the sign propagation does not matter -
 * i_block would have to be negative in the very beginning, so we would not
 * get there at all.
 */

static int ext4_block_to_path(struct inode *inode,
                              ext4_lblk_t i_block,
                              ext4_lblk_t offsets[4], int *boundary)
{
        int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
        int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
        const long direct_blocks = EXT4_NDIR_BLOCKS,
                indirect_blocks = ptrs,
                double_blocks = (1 << (ptrs_bits * 2));
        int n = 0;
        int final = 0;

        if (i_block < direct_blocks) {
                offsets[n++] = i_block;
                final = direct_blocks;
        } else if ((i_block -= direct_blocks) < indirect_blocks) {
                offsets[n++] = EXT4_IND_BLOCK;
                offsets[n++] = i_block;
                final = ptrs;
        } else if ((i_block -= indirect_blocks) < double_blocks) {
                offsets[n++] = EXT4_DIND_BLOCK;
                offsets[n++] = i_block >> ptrs_bits;
                offsets[n++] = i_block & (ptrs - 1);
                final = ptrs;
        } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
                offsets[n++] = EXT4_TIND_BLOCK;
                offsets[n++] = i_block >> (ptrs_bits * 2);
                offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
                offsets[n++] = i_block & (ptrs - 1);
                final = ptrs;
        } else {
                ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
                             i_block + direct_blocks +
                             indirect_blocks + double_blocks, inode->i_ino);
        }
        if (boundary)
                *boundary = final - 1 - (i_block & (ptrs - 1));
        return n;
}
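
/*
 * Worked example (illustrative, assuming 4KB blocks, so ptrs == 1024):
 * blocks 0-11 are direct, so i_block == 5 gives offsets == {5}, n == 1.
 * i_block == 2000 lands in the double-indirect range: after the two
 * subtractions i_block == 2000 - 12 - 1024 == 964, so offsets ==
 * {EXT4_DIND_BLOCK, 964 >> 10, 964 & 1023} == {EXT4_DIND_BLOCK, 0, 964}
 * and n == 3.
 */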

static int __ext4_check_blockref(const char *function, unsigned int line,
                                 struct inode *inode,
                                 __le32 *p, unsigned int max)
{
        struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
        __le32 *bref = p;
        unsigned int blk;

        while (bref < p+max) {
                blk = le32_to_cpu(*bref++);
                if (blk &&
                    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
                                                    blk, 1))) {
                        es->s_last_error_block = cpu_to_le64(blk);
                        ext4_error_inode(inode, function, line, blk,
                                         "invalid block");
                        return -EIO;
                }
        }
        return 0;
}


#define ext4_check_indirect_blockref(inode, bh)                         \
        __ext4_check_blockref(__func__, __LINE__, inode,                \
                              (__le32 *)(bh)->b_data,                   \
                              EXT4_ADDR_PER_BLOCK((inode)->i_sb))

#define ext4_check_inode_blockref(inode)                                \
        __ext4_check_blockref(__func__, __LINE__, inode,                \
                              EXT4_I(inode)->i_data,                    \
                              EXT4_NDIR_BLOCKS)

/**
 * ext4_get_branch - read the chain of indirect blocks leading to data
 * @inode: inode in question
 * @depth: depth of the chain (1 - direct pointer, etc.)
 * @offsets: offsets of pointers in inode/indirect blocks
 * @chain: place to store the result
 * @err: here we store the error value
 *
 * Function fills the array of triples <key, p, bh> and returns %NULL
 * if everything went OK or the pointer to the last filled triple
 * (incomplete one) otherwise.  Upon the return chain[i].key contains
 * the number of (i+1)-th block in the chain (as it is stored in memory,
 * i.e. little-endian 32-bit), chain[i].p contains the address of that
 * number (it points into struct inode for i==0 and into the bh->b_data
 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
 * block for i>0 and NULL for i==0.  In other words, it holds the block
 * numbers of the chain, the addresses they were taken from (and where
 * we can verify that the chain did not change) and the buffer_heads
 * hosting these numbers.
 *
 * Function stops when it stumbles upon a zero pointer (absent block)
 *      (pointer to last triple returned, *@err == 0)
 * or when it gets an IO error reading an indirect block
 *      (ditto, *@err == -EIO)
 * or when it reads all @depth-1 indirect blocks successfully and finds
 * the whole chain, all the way to the data (returns %NULL, *err == 0).
 *
 * Needs to be called with
 *      down_read(&EXT4_I(inode)->i_data_sem)
 */
static Indirect *ext4_get_branch(struct inode *inode, int depth,
                                 ext4_lblk_t *offsets,
                                 Indirect chain[4], int *err)
{
        struct super_block *sb = inode->i_sb;
        Indirect *p = chain;
        struct buffer_head *bh;

        *err = 0;
        /* i_data is not going away, no lock needed */
        add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
        if (!p->key)
                goto no_block;
        while (--depth) {
                bh = sb_getblk(sb, le32_to_cpu(p->key));
                if (unlikely(!bh))
                        goto failure;

                if (!bh_uptodate_or_lock(bh)) {
                        if (bh_submit_read(bh) < 0) {
                                put_bh(bh);
                                goto failure;
                        }
                        /* validate block references */
                        if (ext4_check_indirect_blockref(inode, bh)) {
                                put_bh(bh);
                                goto failure;
                        }
                }

                add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
                /* Reader: end */
                if (!p->key)
                        goto no_block;
        }
        return NULL;

failure:
        *err = -EIO;
no_block:
        return p;
}
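
/*
 * Illustrative result for a successful depth-3 (double-indirect)
 * lookup: chain[0].p points into EXT4_I(inode)->i_data (bh == NULL),
 * chain[1].p points into the double-indirect block's b_data and
 * chain[2].p into the indirect block's b_data; the function returns
 * NULL with *err == 0.
 */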

/**
 * ext4_find_near - find a place for allocation with sufficient locality
 * @inode: owner
 * @ind: descriptor of indirect block.
 *
 * This function returns the preferred place for block allocation.
 * It is used when the heuristic for sequential allocation fails.
 * Rules are:
 *   + if there is a block to the left of our position - allocate near it.
 *   + if pointer will live in indirect block - allocate near that block.
 *   + if pointer will live in inode - allocate in the same
 *     cylinder group.
 *
 * In the latter case we colour the starting block by the caller's PID to
 * prevent it from clashing with concurrent allocations for a different inode
 * in the same block group.  The PID is used here so that functionally related
 * files will be close-by on-disk.
 *
 * Caller must make sure that @ind is valid and will stay that way.
 */
static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
        __le32 *p;
        ext4_fsblk_t bg_start;
        ext4_fsblk_t last_block;
        ext4_grpblk_t colour;
        ext4_group_t block_group;
        int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));

        /* Try to find previous block */
        for (p = ind->p - 1; p >= start; p--) {
                if (*p)
                        return le32_to_cpu(*p);
        }

        /* No such thing, so let's try location of indirect block */
        if (ind->bh)
                return ind->bh->b_blocknr;

        /*
         * It is going to be referred to from the inode itself? OK, just put it
         * into the same cylinder group then.
         */
        block_group = ei->i_block_group;
        if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
                block_group &= ~(flex_size-1);
                if (S_ISREG(inode->i_mode))
                        block_group++;
        }
        bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
        last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

        /*
         * If we are doing delayed allocation, we don't need to take
         * colour into account.
         */
        if (test_opt(inode->i_sb, DELALLOC))
                return bg_start;

        if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
                colour = (current->pid % 16) *
                        (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
        else
                colour = (current->pid % 16) * ((last_block - bg_start) / 16);
        return bg_start + colour;
}
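
/*
 * Illustrative colour arithmetic (assuming 32768 blocks per group and
 * no delayed allocation): a task with current->pid == 4321 gets
 * colour == (4321 % 16) * (32768 / 16) == 1 * 2048, so its allocations
 * start 2048 blocks into the group, spaced away from other tasks'
 * starting points.
 */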

/**
 * ext4_find_goal - find a preferred place for allocation.
 * @inode: owner
 * @block: block we want
 * @partial: pointer to the last triple within a chain
 *
 * Normally this function finds the preferred place for block allocation
 * and returns it.
 * Because this is only used for non-extent files, we limit the block nr
 * to 32 bits.
 */
static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
                                   Indirect *partial)
{
        ext4_fsblk_t goal;

        /*
         * XXX need to get goal block from mballoc's data structures
         */

        goal = ext4_find_near(inode, partial);
        goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
        return goal;
}

/**
 * ext4_blks_to_allocate - look up the block map and count the number
 * of direct blocks that need to be allocated for the given branch.
 *
 * @branch: chain of indirect blocks
 * @k: number of blocks needed for indirect blocks
 * @blks: number of data blocks to be mapped.
 * @blocks_to_boundary: the offset in the indirect block
 *
 * return the total number of blocks to be allocated, including the
 * direct and indirect blocks.
 */
static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
                                 int blocks_to_boundary)
{
        unsigned int count = 0;

        /*
         * Simple case: the [t,d]indirect block(s) have not been allocated
         * yet, so it is clear that the blocks on that path have not been
         * allocated either.
         */
        if (k > 0) {
                /* right now we don't handle cross boundary allocation */
                if (blks < blocks_to_boundary + 1)
                        count += blks;
                else
                        count += blocks_to_boundary + 1;
                return count;
        }

        count++;
        while (count < blks && count <= blocks_to_boundary &&
                le32_to_cpu(*(branch[0].p + count)) == 0) {
                count++;
        }
        return count;
}
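
/*
 * Illustrative: with k == 0, blks == 4 and blocks_to_boundary == 10,
 * the loop scans branch[0].p for contiguous zero entries, so if the
 * next four pointers are unallocated the function returns 4.  With
 * k > 0 it simply returns min(blks, blocks_to_boundary + 1).
 */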

/**
 * ext4_alloc_blocks: allocate the multiple blocks needed for a branch
 * @handle: handle for this transaction
 * @inode: inode which needs the allocated blocks
 * @iblock: the logical block to start allocating at
 * @goal: preferred physical block of allocation
 * @indirect_blks: the number of blocks that need to be allocated for
 *                 indirect blocks
 * @blks: number of desired blocks
 * @new_blocks: on return it will store the new block numbers for
 *              the indirect blocks (if needed) and the first direct block
 * @err: on return it will store the error code
 *
 * This function will return the number of blocks allocated as
 * requested by the passed-in parameters.
 */
static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
                             ext4_lblk_t iblock, ext4_fsblk_t goal,
                             int indirect_blks, int blks,
                             ext4_fsblk_t new_blocks[4], int *err)
{
        struct ext4_allocation_request ar;
        int target, i;
        unsigned long count = 0, blk_allocated = 0;
        int index = 0;
        ext4_fsblk_t current_block = 0;
        int ret = 0;

        /*
         * Here we try to allocate the requested multiple blocks at once,
         * on a best-effort basis.
         * To build a branch, we should allocate blocks for
         * the indirect blocks (if not allocated yet), and at least
         * the first direct block of this branch.  That's the
         * minimum number of blocks we need to allocate (required).
         */
        /* first we try to allocate the indirect blocks */
        target = indirect_blks;
        while (target > 0) {
                count = target;
                /* allocating blocks for indirect blocks and direct blocks */
                current_block = ext4_new_meta_blocks(handle, inode, goal,
                                                     0, &count, err);
                if (*err)
                        goto failed_out;

                if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
                        EXT4_ERROR_INODE(inode,
                                         "current_block %llu + count %lu > %d!",
                                         current_block, count,
                                         EXT4_MAX_BLOCK_FILE_PHYS);
                        *err = -EIO;
                        goto failed_out;
                }

                target -= count;
                /* allocate blocks for indirect blocks */
                while (index < indirect_blks && count) {
                        new_blocks[index++] = current_block++;
                        count--;
                }
                if (count > 0) {
                        /*
                         * save the new block number
                         * for the first direct block
                         */
                        new_blocks[index] = current_block;
                        printk(KERN_INFO "%s returned more blocks than "
                                                "requested\n", __func__);
                        WARN_ON(1);
                        break;
                }
        }

        target = blks - count;
        blk_allocated = count;
        if (!target)
                goto allocated;
        /* Now allocate data blocks */
        memset(&ar, 0, sizeof(ar));
        ar.inode = inode;
        ar.goal = goal;
        ar.len = target;
        ar.logical = iblock;
        if (S_ISREG(inode->i_mode))
                /* enable in-core preallocation only for regular files */
                ar.flags = EXT4_MB_HINT_DATA;

        current_block = ext4_mb_new_blocks(handle, &ar, err);
        if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
                EXT4_ERROR_INODE(inode,
                                 "current_block %llu + ar.len %d > %d!",
                                 current_block, ar.len,
                                 EXT4_MAX_BLOCK_FILE_PHYS);
                *err = -EIO;
                goto failed_out;
        }

        if (*err && (target == blks)) {
                /*
                 * if the allocation failed and we didn't allocate
                 * any blocks before
                 */
                goto failed_out;
        }
        if (!*err) {
                if (target == blks) {
                        /*
                         * save the new block number
                         * for the first direct block
                         */
                        new_blocks[index] = current_block;
                }
                blk_allocated += ar.len;
        }
allocated:
        /* total number of blocks allocated for direct blocks */
        ret = blk_allocated;
        *err = 0;
        return ret;
failed_out:
        for (i = 0; i < index; i++)
                ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
        return ret;
}
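
/*
 * Illustrative run: with indirect_blks == 2 and blks == 5, the first
 * loop obtains the two metadata blocks via ext4_new_meta_blocks();
 * the five data blocks are then requested in a single
 * ext4_mb_new_blocks() call (best effort - ar.len may come back
 * smaller), and the return value is the number of direct blocks
 * actually allocated.
 */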

/**
 * ext4_alloc_branch - allocate and set up a chain of blocks.
 * @handle: handle for this transaction
 * @inode: owner
 * @indirect_blks: number of allocated indirect blocks
 * @blks: number of allocated direct blocks
 * @goal: preferred place for allocation
 * @offsets: offsets (in the blocks) to store the pointers to next.
 * @branch: place to store the chain in.
 *
 * This function allocates blocks, zeroes out all but the last one,
 * links them into a chain and (if we are synchronous) writes them to
 * disk.  In other words, it prepares a branch that can be spliced onto
 * the inode.  It stores the information about that chain in the
 * branch[], in the same format as ext4_get_branch() would do.  We are
 * calling it after we had read the existing part of the chain and
 * partial points to the last triple of that (one with zero ->key).
 * Upon exit we have the same picture as after the successful
 * ext4_get_block(), except that in one place the chain is disconnected -
 * *branch->p is still zero (we did not set the last link), but
 * branch->key contains the number that should be placed into *branch->p
 * to fill that gap.
 *
 * If allocation fails we free all blocks we've allocated (and forget
 * their buffer_heads) and return the error value from the failed
 * ext4_alloc_block() (normally -ENOSPC).  Otherwise we set the chain
 * as described above and return 0.
 */
754 static int ext4_alloc_branch(handle_t *handle, struct inode *inode, 754 static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
755 ext4_lblk_t iblock, int indirect_blks, 755 ext4_lblk_t iblock, int indirect_blks,
756 int *blks, ext4_fsblk_t goal, 756 int *blks, ext4_fsblk_t goal,
757 ext4_lblk_t *offsets, Indirect *branch) 757 ext4_lblk_t *offsets, Indirect *branch)
758 { 758 {
759 int blocksize = inode->i_sb->s_blocksize; 759 int blocksize = inode->i_sb->s_blocksize;
760 int i, n = 0; 760 int i, n = 0;
761 int err = 0; 761 int err = 0;
762 struct buffer_head *bh; 762 struct buffer_head *bh;
763 int num; 763 int num;
764 ext4_fsblk_t new_blocks[4]; 764 ext4_fsblk_t new_blocks[4];
765 ext4_fsblk_t current_block; 765 ext4_fsblk_t current_block;
766 766
767 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, 767 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
768 *blks, new_blocks, &err); 768 *blks, new_blocks, &err);
769 if (err) 769 if (err)
770 return err; 770 return err;
771 771
772 branch[0].key = cpu_to_le32(new_blocks[0]); 772 branch[0].key = cpu_to_le32(new_blocks[0]);
773 /* 773 /*
774 * metadata blocks and data blocks are allocated. 774 * metadata blocks and data blocks are allocated.
775 */ 775 */
776 for (n = 1; n <= indirect_blks; n++) { 776 for (n = 1; n <= indirect_blks; n++) {
777 /* 777 /*
778 * Get buffer_head for parent block, zero it out 778 * Get buffer_head for parent block, zero it out
779 * and set the pointer to new one, then send 779 * and set the pointer to new one, then send
780 * parent to disk. 780 * parent to disk.
781 */ 781 */
782 bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 782 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
783 if (unlikely(!bh)) { 783 if (unlikely(!bh)) {
784 err = -EIO; 784 err = -EIO;
785 goto failed; 785 goto failed;
786 } 786 }
787 787
788 branch[n].bh = bh; 788 branch[n].bh = bh;
789 lock_buffer(bh); 789 lock_buffer(bh);
790 BUFFER_TRACE(bh, "call get_create_access"); 790 BUFFER_TRACE(bh, "call get_create_access");
791 err = ext4_journal_get_create_access(handle, bh); 791 err = ext4_journal_get_create_access(handle, bh);
792 if (err) { 792 if (err) {
793 /* Don't brelse(bh) here; it's done in 793 /* Don't brelse(bh) here; it's done in
794 * ext4_journal_forget() below */ 794 * ext4_journal_forget() below */
795 unlock_buffer(bh); 795 unlock_buffer(bh);
796 goto failed; 796 goto failed;
797 } 797 }
798 798
799 memset(bh->b_data, 0, blocksize); 799 memset(bh->b_data, 0, blocksize);
800 branch[n].p = (__le32 *) bh->b_data + offsets[n]; 800 branch[n].p = (__le32 *) bh->b_data + offsets[n];
801 branch[n].key = cpu_to_le32(new_blocks[n]); 801 branch[n].key = cpu_to_le32(new_blocks[n]);
802 *branch[n].p = branch[n].key; 802 *branch[n].p = branch[n].key;
803 if (n == indirect_blks) { 803 if (n == indirect_blks) {
804 current_block = new_blocks[n]; 804 current_block = new_blocks[n];
805 /* 805 /*
806 * End of chain, update the last new metablock of 806 * End of chain, update the last new metablock of
807 * the chain to point to the new allocated 807 * the chain to point to the new allocated
808 * data blocks numbers 808 * data blocks numbers
809 */ 809 */
810 for (i = 1; i < num; i++) 810 for (i = 1; i < num; i++)
811 *(branch[n].p + i) = cpu_to_le32(++current_block); 811 *(branch[n].p + i) = cpu_to_le32(++current_block);
812 } 812 }
813 BUFFER_TRACE(bh, "marking uptodate"); 813 BUFFER_TRACE(bh, "marking uptodate");
814 set_buffer_uptodate(bh); 814 set_buffer_uptodate(bh);
815 unlock_buffer(bh); 815 unlock_buffer(bh);
816 816
817 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 817 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
818 err = ext4_handle_dirty_metadata(handle, inode, bh); 818 err = ext4_handle_dirty_metadata(handle, inode, bh);
819 if (err) 819 if (err)
820 goto failed; 820 goto failed;
821 } 821 }
822 *blks = num; 822 *blks = num;
823 return err; 823 return err;
824 failed: 824 failed:
825 /* Allocation failed, free what we already allocated */ 825 /* Allocation failed, free what we already allocated */
826 ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); 826 ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
827 for (i = 1; i <= n ; i++) { 827 for (i = 1; i <= n ; i++) {
828 /* 828 /*
829 * branch[i].bh is newly allocated, so there is no 829 * branch[i].bh is newly allocated, so there is no
830 * need to revoke the block, which is why we don't 830 * need to revoke the block, which is why we don't
831 * need to set EXT4_FREE_BLOCKS_METADATA. 831 * need to set EXT4_FREE_BLOCKS_METADATA.
832 */ 832 */
833 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 833 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
834 EXT4_FREE_BLOCKS_FORGET); 834 EXT4_FREE_BLOCKS_FORGET);
835 } 835 }
836 for (i = n+1; i < indirect_blks; i++) 836 for (i = n+1; i < indirect_blks; i++)
837 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); 837 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
838 838
839 ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); 839 ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
840 840
841 return err; 841 return err;
842 } 842 }
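
The loop near the end of ext4_alloc_branch() that fills the last metadata block relies on ext4_alloc_blocks() handing back the direct blocks as a single contiguous run starting at new_blocks[indirect_blks]. Below is a minimal userspace sketch of that fill; the block numbers and run length are made up, and glibc's htole32()/le32toh() stand in for the kernel's cpu_to_le32()/le32_to_cpu():

    #include <endian.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t slots[8] = { 0 };      /* tail of the last indirect block */
            uint32_t first_data = 5000;     /* hypothetical first data block */
            int num = 4;                    /* contiguous data blocks allocated */
            uint32_t current_block = first_data;
            int i;

            /* Block pointers are stored little-endian on disk. */
            slots[0] = htole32(current_block);
            for (i = 1; i < num; i++)
                    slots[i] = htole32(++current_block);

            for (i = 0; i < num; i++)
                    printf("slot %d -> physical block %u\n", i, le32toh(slots[i]));
            return 0;
    }
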
843 843
844 /** 844 /**
845 * ext4_splice_branch - splice the allocated branch onto inode. 845 * ext4_splice_branch - splice the allocated branch onto inode.
846 * @handle: handle for this transaction 846 * @handle: handle for this transaction
847 * @inode: owner 847 * @inode: owner
848 * @block: (logical) number of block we are adding 848 * @block: (logical) number of block we are adding
849 * @chain: chain of indirect blocks (with a missing link - see 849 * @chain: chain of indirect blocks (with a missing link - see
850 * ext4_alloc_branch) 850 * ext4_alloc_branch)
851 * @where: location of missing link 851 * @where: location of missing link
852 * @num: number of indirect blocks we are adding 852 * @num: number of indirect blocks we are adding
853 * @blks: number of direct blocks we are adding 853 * @blks: number of direct blocks we are adding
854 * 854 *
855 * This function fills in the missing link and does all housekeeping needed 855 * This function fills in the missing link and does all housekeeping needed
856 * in the inode (->i_blocks, etc.). On success we end up with the full 856 * in the inode (->i_blocks, etc.). On success we end up with the full
857 * chain to the new block and return 0. 857 * chain to the new block and return 0.
858 */ 858 */
859 static int ext4_splice_branch(handle_t *handle, struct inode *inode, 859 static int ext4_splice_branch(handle_t *handle, struct inode *inode,
860 ext4_lblk_t block, Indirect *where, int num, 860 ext4_lblk_t block, Indirect *where, int num,
861 int blks) 861 int blks)
862 { 862 {
863 int i; 863 int i;
864 int err = 0; 864 int err = 0;
865 ext4_fsblk_t current_block; 865 ext4_fsblk_t current_block;
866 866
867 /* 867 /*
868 * If we're splicing into a [td]indirect block (as opposed to the 868 * If we're splicing into a [td]indirect block (as opposed to the
869 * inode) then we need to get write access to the [td]indirect block 869 * inode) then we need to get write access to the [td]indirect block
870 * before the splice. 870 * before the splice.
871 */ 871 */
872 if (where->bh) { 872 if (where->bh) {
873 BUFFER_TRACE(where->bh, "get_write_access"); 873 BUFFER_TRACE(where->bh, "get_write_access");
874 err = ext4_journal_get_write_access(handle, where->bh); 874 err = ext4_journal_get_write_access(handle, where->bh);
875 if (err) 875 if (err)
876 goto err_out; 876 goto err_out;
877 } 877 }
878 /* That's it */ 878 /* That's it */
879 879
880 *where->p = where->key; 880 *where->p = where->key;
881 881
882 /* 882 /*
883 * Update the host buffer_head or inode to point to the just-allocated 883 * Update the host buffer_head or inode to point to the just-allocated
884 * direct blocks 884 * direct blocks
885 */ 885 */
886 if (num == 0 && blks > 1) { 886 if (num == 0 && blks > 1) {
887 current_block = le32_to_cpu(where->key) + 1; 887 current_block = le32_to_cpu(where->key) + 1;
888 for (i = 1; i < blks; i++) 888 for (i = 1; i < blks; i++)
889 *(where->p + i) = cpu_to_le32(current_block++); 889 *(where->p + i) = cpu_to_le32(current_block++);
890 } 890 }
891 891
892 /* We are done with atomic stuff, now do the rest of housekeeping */ 892 /* We are done with atomic stuff, now do the rest of housekeeping */
893 /* Did we splice it onto an indirect block? */ 893 /* Did we splice it onto an indirect block? */
894 if (where->bh) { 894 if (where->bh) {
895 /* 895 /*
896 * If we spliced it onto an indirect block, we haven't 896 * If we spliced it onto an indirect block, we haven't
897 * altered the inode. Note however that if it is being spliced 897 * altered the inode. Note however that if it is being spliced
898 * onto an indirect block at the very end of the file (the 898 * onto an indirect block at the very end of the file (the
899 * file is growing) then we *will* alter the inode to reflect 899 * file is growing) then we *will* alter the inode to reflect
900 * the new i_size. But that is not done here - it is done in 900 * the new i_size. But that is not done here - it is done in
901 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. 901 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
902 */ 902 */
903 jbd_debug(5, "splicing indirect only\n"); 903 jbd_debug(5, "splicing indirect only\n");
904 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); 904 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
905 err = ext4_handle_dirty_metadata(handle, inode, where->bh); 905 err = ext4_handle_dirty_metadata(handle, inode, where->bh);
906 if (err) 906 if (err)
907 goto err_out; 907 goto err_out;
908 } else { 908 } else {
909 /* 909 /*
910 * OK, we spliced it into the inode itself on a direct block. 910 * OK, we spliced it into the inode itself on a direct block.
911 */ 911 */
912 ext4_mark_inode_dirty(handle, inode); 912 ext4_mark_inode_dirty(handle, inode);
913 jbd_debug(5, "splicing direct\n"); 913 jbd_debug(5, "splicing direct\n");
914 } 914 }
915 return err; 915 return err;
916 916
917 err_out: 917 err_out:
918 for (i = 1; i <= num; i++) { 918 for (i = 1; i <= num; i++) {
919 /* 919 /*
920 * branch[i].bh is newly allocated, so there is no 920 * branch[i].bh is newly allocated, so there is no
921 * need to revoke the block, which is why we don't 921 * need to revoke the block, which is why we don't
922 * need to set EXT4_FREE_BLOCKS_METADATA. 922 * need to set EXT4_FREE_BLOCKS_METADATA.
923 */ 923 */
924 ext4_free_blocks(handle, inode, where[i].bh, 0, 1, 924 ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
925 EXT4_FREE_BLOCKS_FORGET); 925 EXT4_FREE_BLOCKS_FORGET);
926 } 926 }
927 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), 927 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
928 blks, 0); 928 blks, 0);
929 929
930 return err; 930 return err;
931 } 931 }
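
What makes ext4_splice_branch() safe against concurrent readers is that the whole branch is built off-tree first; the splice itself is the single store *where->p = where->key that publishes it. A toy model of that publish step, with an invented parent array and block number:

    #include <stdint.h>
    #include <stdio.h>

    struct indirect {
            uint32_t *p;    /* where the missing link goes */
            uint32_t key;   /* block number to publish there */
    };

    int main(void)
    {
            uint32_t parent[4] = { 0 };     /* models i_data or an indirect block */
            struct indirect where = { &parent[2], 1234 };

            /*
             * Everything reachable from 'key' is assumed fully initialized
             * already; readers either see 0 (a hole) or the complete branch.
             */
            *where.p = where.key;

            printf("parent slot 2 now points at block %u\n", parent[2]);
            return 0;
    }
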
932 932
933 /* 933 /*
934 * The ext4_ind_map_blocks() function handles non-extents inodes 934 * The ext4_ind_map_blocks() function handles non-extents inodes
935 * (i.e., using the traditional indirect/double-indirect i_blocks 935 * (i.e., using the traditional indirect/double-indirect i_blocks
936 * scheme) for ext4_map_blocks(). 936 * scheme) for ext4_map_blocks().
937 * 937 *
938 * Allocation strategy is simple: if we have to allocate something, we will 938 * Allocation strategy is simple: if we have to allocate something, we will
939 * have to go the whole way to the leaf. So let's do it before attaching 939 * have to go the whole way to the leaf. So let's do it before attaching
940 * anything to the tree, set linkage between the newborn blocks, write them 940 * anything to the tree, set linkage between the newborn blocks, write them
941 * if sync is required, recheck the path, free and repeat if the check fails, otherwise 941 * if sync is required, recheck the path, free and repeat if the check fails, otherwise
942 * set the last missing link (that will protect us from any truncate-generated 942 * set the last missing link (that will protect us from any truncate-generated
943 * removals - all blocks on the path are immune now) and possibly force the 943 * removals - all blocks on the path are immune now) and possibly force the
944 * write on the parent block. 944 * write on the parent block.
945 * That has a nice additional property: no special recovery from the failed 945 * That has a nice additional property: no special recovery from the failed
946 * allocations is needed - we simply release blocks and do not touch anything 946 * allocations is needed - we simply release blocks and do not touch anything
947 * reachable from inode. 947 * reachable from inode.
948 * 948 *
949 * `handle' can be NULL if create == 0. 949 * `handle' can be NULL if create == 0.
950 * 950 *
951 * return > 0, # of blocks mapped or allocated. 951 * return > 0, # of blocks mapped or allocated.
952 * return = 0, if plain lookup failed. 952 * return = 0, if plain lookup failed.
953 * return < 0, error case. 953 * return < 0, error case.
954 * 954 *
955 * The ext4_ind_map_blocks() function should be called with 955 * The ext4_ind_map_blocks() function should be called with
956 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem 956 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
957 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or 957 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
958 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system 958 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
959 * blocks. 959 * blocks.
960 */ 960 */
961 static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, 961 static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
962 struct ext4_map_blocks *map, 962 struct ext4_map_blocks *map,
963 int flags) 963 int flags)
964 { 964 {
965 int err = -EIO; 965 int err = -EIO;
966 ext4_lblk_t offsets[4]; 966 ext4_lblk_t offsets[4];
967 Indirect chain[4]; 967 Indirect chain[4];
968 Indirect *partial; 968 Indirect *partial;
969 ext4_fsblk_t goal; 969 ext4_fsblk_t goal;
970 int indirect_blks; 970 int indirect_blks;
971 int blocks_to_boundary = 0; 971 int blocks_to_boundary = 0;
972 int depth; 972 int depth;
973 int count = 0; 973 int count = 0;
974 ext4_fsblk_t first_block = 0; 974 ext4_fsblk_t first_block = 0;
975 975
976 trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 976 trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
977 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); 977 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
978 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 978 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
979 depth = ext4_block_to_path(inode, map->m_lblk, offsets, 979 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
980 &blocks_to_boundary); 980 &blocks_to_boundary);
981 981
982 if (depth == 0) 982 if (depth == 0)
983 goto out; 983 goto out;
984 984
985 partial = ext4_get_branch(inode, depth, offsets, chain, &err); 985 partial = ext4_get_branch(inode, depth, offsets, chain, &err);
986 986
987 /* Simplest case - block found, no allocation needed */ 987 /* Simplest case - block found, no allocation needed */
988 if (!partial) { 988 if (!partial) {
989 first_block = le32_to_cpu(chain[depth - 1].key); 989 first_block = le32_to_cpu(chain[depth - 1].key);
990 count++; 990 count++;
991 /* map more blocks */ 991 /* map more blocks */
992 while (count < map->m_len && count <= blocks_to_boundary) { 992 while (count < map->m_len && count <= blocks_to_boundary) {
993 ext4_fsblk_t blk; 993 ext4_fsblk_t blk;
994 994
995 blk = le32_to_cpu(*(chain[depth-1].p + count)); 995 blk = le32_to_cpu(*(chain[depth-1].p + count));
996 996
997 if (blk == first_block + count) 997 if (blk == first_block + count)
998 count++; 998 count++;
999 else 999 else
1000 break; 1000 break;
1001 } 1001 }
1002 goto got_it; 1002 goto got_it;
1003 } 1003 }
1004 1004
1005 /* Next simple case - plain lookup or failed read of indirect block */ 1005 /* Next simple case - plain lookup or failed read of indirect block */
1006 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) 1006 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
1007 goto cleanup; 1007 goto cleanup;
1008 1008
1009 /* 1009 /*
1010 * Okay, we need to do block allocation. 1010 * Okay, we need to do block allocation.
1011 */ 1011 */
1012 goal = ext4_find_goal(inode, map->m_lblk, partial); 1012 goal = ext4_find_goal(inode, map->m_lblk, partial);
1013 1013
1014 /* the number of blocks we need to allocate for [d,t]indirect blocks */ 1014 /* the number of blocks we need to allocate for [d,t]indirect blocks */
1015 indirect_blks = (chain + depth) - partial - 1; 1015 indirect_blks = (chain + depth) - partial - 1;
1016 1016
1017 /* 1017 /*
1018 * Next, look up the indirect map to count the total number of 1018 * Next, look up the indirect map to count the total number of
1019 * direct blocks to allocate for this branch. 1019 * direct blocks to allocate for this branch.
1020 */ 1020 */
1021 count = ext4_blks_to_allocate(partial, indirect_blks, 1021 count = ext4_blks_to_allocate(partial, indirect_blks,
1022 map->m_len, blocks_to_boundary); 1022 map->m_len, blocks_to_boundary);
1023 /* 1023 /*
1024 * Block out ext4_truncate while we alter the tree 1024 * Block out ext4_truncate while we alter the tree
1025 */ 1025 */
1026 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, 1026 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
1027 &count, goal, 1027 &count, goal,
1028 offsets + (partial - chain), partial); 1028 offsets + (partial - chain), partial);
1029 1029
1030 /* 1030 /*
1031 * The ext4_splice_branch call will free and forget any buffers 1031 * The ext4_splice_branch call will free and forget any buffers
1032 * on the new chain if there is a failure, but that risks using 1032 * on the new chain if there is a failure, but that risks using
1033 * up transaction credits, especially for bitmaps where the 1033 * up transaction credits, especially for bitmaps where the
1034 * credits cannot be returned. Can we handle this somehow? We 1034 * credits cannot be returned. Can we handle this somehow? We
1035 * may need to return -EAGAIN upwards in the worst case. --sct 1035 * may need to return -EAGAIN upwards in the worst case. --sct
1036 */ 1036 */
1037 if (!err) 1037 if (!err)
1038 err = ext4_splice_branch(handle, inode, map->m_lblk, 1038 err = ext4_splice_branch(handle, inode, map->m_lblk,
1039 partial, indirect_blks, count); 1039 partial, indirect_blks, count);
1040 if (err) 1040 if (err)
1041 goto cleanup; 1041 goto cleanup;
1042 1042
1043 map->m_flags |= EXT4_MAP_NEW; 1043 map->m_flags |= EXT4_MAP_NEW;
1044 1044
1045 ext4_update_inode_fsync_trans(handle, inode, 1); 1045 ext4_update_inode_fsync_trans(handle, inode, 1);
1046 got_it: 1046 got_it:
1047 map->m_flags |= EXT4_MAP_MAPPED; 1047 map->m_flags |= EXT4_MAP_MAPPED;
1048 map->m_pblk = le32_to_cpu(chain[depth-1].key); 1048 map->m_pblk = le32_to_cpu(chain[depth-1].key);
1049 map->m_len = count; 1049 map->m_len = count;
1050 if (count > blocks_to_boundary) 1050 if (count > blocks_to_boundary)
1051 map->m_flags |= EXT4_MAP_BOUNDARY; 1051 map->m_flags |= EXT4_MAP_BOUNDARY;
1052 err = count; 1052 err = count;
1053 /* Clean up and exit */ 1053 /* Clean up and exit */
1054 partial = chain + depth - 1; /* the whole chain */ 1054 partial = chain + depth - 1; /* the whole chain */
1055 cleanup: 1055 cleanup:
1056 while (partial > chain) { 1056 while (partial > chain) {
1057 BUFFER_TRACE(partial->bh, "call brelse"); 1057 BUFFER_TRACE(partial->bh, "call brelse");
1058 brelse(partial->bh); 1058 brelse(partial->bh);
1059 partial--; 1059 partial--;
1060 } 1060 }
1061 out: 1061 out:
1062 trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, 1062 trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
1063 map->m_pblk, map->m_len, err); 1063 map->m_pblk, map->m_len, err);
1064 return err; 1064 return err;
1065 } 1065 }
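
ext4_ind_map_blocks() leans on ext4_block_to_path() (not shown in this hunk) to turn a logical block into a depth and per-level offsets. Here is a rough userspace model of that decomposition, assuming 4 KiB blocks (12 direct slots, 1024 addresses per indirect block); the slot indices mirror EXT4_IND_BLOCK/EXT4_DIND_BLOCK/EXT4_TIND_BLOCK:

    #include <stdio.h>

    #define NDIR 12                 /* EXT4_NDIR_BLOCKS */
    #define PTRS 1024               /* addresses per 4 KiB block */

    /* Rough model of ext4_block_to_path(): returns depth, fills offsets[]. */
    static int block_to_path(unsigned long iblock, int offsets[4])
    {
            if (iblock < NDIR) {
                    offsets[0] = iblock;
                    return 1;
            }
            iblock -= NDIR;
            if (iblock < PTRS) {
                    offsets[0] = NDIR;              /* EXT4_IND_BLOCK slot */
                    offsets[1] = iblock;
                    return 2;
            }
            iblock -= PTRS;
            if (iblock < (unsigned long)PTRS * PTRS) {
                    offsets[0] = NDIR + 1;          /* EXT4_DIND_BLOCK slot */
                    offsets[1] = iblock / PTRS;
                    offsets[2] = iblock % PTRS;
                    return 3;
            }
            iblock -= (unsigned long)PTRS * PTRS;
            offsets[0] = NDIR + 2;                  /* EXT4_TIND_BLOCK slot */
            offsets[1] = iblock / ((unsigned long)PTRS * PTRS);
            offsets[2] = (iblock / PTRS) % PTRS;
            offsets[3] = iblock % PTRS;
            return 4;
    }

    int main(void)
    {
            unsigned long blocks[] = { 5, 700, 2000000 };
            int offsets[4], i, j;

            for (i = 0; i < 3; i++) {
                    int depth = block_to_path(blocks[i], offsets);

                    printf("lblock %lu: depth %d, offsets", blocks[i], depth);
                    for (j = 0; j < depth; j++)
                            printf(" %d", offsets[j]);
                    printf("\n");
            }
            return 0;
    }
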
1066 1066
1067 #ifdef CONFIG_QUOTA 1067 #ifdef CONFIG_QUOTA
1068 qsize_t *ext4_get_reserved_space(struct inode *inode) 1068 qsize_t *ext4_get_reserved_space(struct inode *inode)
1069 { 1069 {
1070 return &EXT4_I(inode)->i_reserved_quota; 1070 return &EXT4_I(inode)->i_reserved_quota;
1071 } 1071 }
1072 #endif 1072 #endif
1073 1073
1074 /* 1074 /*
1075 * Calculate the number of metadata blocks we need to reserve 1075 * Calculate the number of metadata blocks we need to reserve
1076 * to allocate a new block at @lblock for a non-extent-based file 1076 * to allocate a new block at @lblock for a non-extent-based file
1077 */ 1077 */
1078 static int ext4_indirect_calc_metadata_amount(struct inode *inode, 1078 static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1079 sector_t lblock) 1079 sector_t lblock)
1080 { 1080 {
1081 struct ext4_inode_info *ei = EXT4_I(inode); 1081 struct ext4_inode_info *ei = EXT4_I(inode);
1082 sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); 1082 sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
1083 int blk_bits; 1083 int blk_bits;
1084 1084
1085 if (lblock < EXT4_NDIR_BLOCKS) 1085 if (lblock < EXT4_NDIR_BLOCKS)
1086 return 0; 1086 return 0;
1087 1087
1088 lblock -= EXT4_NDIR_BLOCKS; 1088 lblock -= EXT4_NDIR_BLOCKS;
1089 1089
1090 if (ei->i_da_metadata_calc_len && 1090 if (ei->i_da_metadata_calc_len &&
1091 (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { 1091 (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
1092 ei->i_da_metadata_calc_len++; 1092 ei->i_da_metadata_calc_len++;
1093 return 0; 1093 return 0;
1094 } 1094 }
1095 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; 1095 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
1096 ei->i_da_metadata_calc_len = 1; 1096 ei->i_da_metadata_calc_len = 1;
1097 blk_bits = order_base_2(lblock); 1097 blk_bits = order_base_2(lblock);
1098 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; 1098 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
1099 } 1099 }
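
The return value above is a worst-case estimate: order_base_2(lblock) / EXT4_ADDR_PER_BLOCK_BITS + 1 counts how many levels of the indirect tree a block that far out could need (the i_da_metadata_calc_* caching per dindirect region is omitted below). A sketch of the arithmetic, assuming 4 KiB blocks (10 address bits, 12 direct blocks):

    #include <stdio.h>

    /* Smallest b with 2^b >= n, like the kernel's order_base_2(). */
    static int order_base_2(unsigned long n)
    {
            int b = 0;

            while ((1UL << b) < n)
                    b++;
            return b;
    }

    int main(void)
    {
            const int addr_bits = 10;       /* log2(1024 addresses per block) */
            const int ndir = 12;            /* EXT4_NDIR_BLOCKS */
            unsigned long lblocks[] = { 5, 700, 2000000 };
            int i;

            for (i = 0; i < 3; i++) {
                    unsigned long lblock = lblocks[i];
                    int meta;

                    if (lblock < ndir) {
                            meta = 0;       /* direct block: no metadata */
                    } else {
                            lblock -= ndir;
                            meta = order_base_2(lblock) / addr_bits + 1;
                    }
                    printf("lblock %lu -> reserve up to %d metadata block(s)\n",
                           lblocks[i], meta);
            }
            return 0;
    }

For lblock 700 this prints 2 even though a single indirect block would strictly suffice; the estimate is deliberately pessimistic near level boundaries, and unused reservations are given back later (see ext4_da_update_reserve_space() below).
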
1100 1100
1101 /* 1101 /*
1102 * Calculate the number of metadata blocks we need to reserve 1102 * Calculate the number of metadata blocks we need to reserve
1103 * to allocate a block located at @lblock 1103 * to allocate a block located at @lblock
1104 */ 1104 */
1105 static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) 1105 static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
1106 { 1106 {
1107 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 1107 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1108 return ext4_ext_calc_metadata_amount(inode, lblock); 1108 return ext4_ext_calc_metadata_amount(inode, lblock);
1109 1109
1110 return ext4_indirect_calc_metadata_amount(inode, lblock); 1110 return ext4_indirect_calc_metadata_amount(inode, lblock);
1111 } 1111 }
1112 1112
1113 /* 1113 /*
1114 * Called with i_data_sem down, which is important since we can call 1114 * Called with i_data_sem down, which is important since we can call
1115 * ext4_discard_preallocations() from here. 1115 * ext4_discard_preallocations() from here.
1116 */ 1116 */
1117 void ext4_da_update_reserve_space(struct inode *inode, 1117 void ext4_da_update_reserve_space(struct inode *inode,
1118 int used, int quota_claim) 1118 int used, int quota_claim)
1119 { 1119 {
1120 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1120 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1121 struct ext4_inode_info *ei = EXT4_I(inode); 1121 struct ext4_inode_info *ei = EXT4_I(inode);
1122 1122
1123 spin_lock(&ei->i_block_reservation_lock); 1123 spin_lock(&ei->i_block_reservation_lock);
1124 trace_ext4_da_update_reserve_space(inode, used); 1124 trace_ext4_da_update_reserve_space(inode, used);
1125 if (unlikely(used > ei->i_reserved_data_blocks)) { 1125 if (unlikely(used > ei->i_reserved_data_blocks)) {
1126 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " 1126 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
1127 "with only %d reserved data blocks\n", 1127 "with only %d reserved data blocks\n",
1128 __func__, inode->i_ino, used, 1128 __func__, inode->i_ino, used,
1129 ei->i_reserved_data_blocks); 1129 ei->i_reserved_data_blocks);
1130 WARN_ON(1); 1130 WARN_ON(1);
1131 used = ei->i_reserved_data_blocks; 1131 used = ei->i_reserved_data_blocks;
1132 } 1132 }
1133 1133
1134 /* Update per-inode reservations */ 1134 /* Update per-inode reservations */
1135 ei->i_reserved_data_blocks -= used; 1135 ei->i_reserved_data_blocks -= used;
1136 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; 1136 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
1137 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 1137 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1138 used + ei->i_allocated_meta_blocks); 1138 used + ei->i_allocated_meta_blocks);
1139 ei->i_allocated_meta_blocks = 0; 1139 ei->i_allocated_meta_blocks = 0;
1140 1140
1141 if (ei->i_reserved_data_blocks == 0) { 1141 if (ei->i_reserved_data_blocks == 0) {
1142 /* 1142 /*
1143 * We can release all of the reserved metadata blocks 1143 * We can release all of the reserved metadata blocks
1144 * only when we have written all of the delayed 1144 * only when we have written all of the delayed
1145 * allocation blocks. 1145 * allocation blocks.
1146 */ 1146 */
1147 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 1147 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1148 ei->i_reserved_meta_blocks); 1148 ei->i_reserved_meta_blocks);
1149 ei->i_reserved_meta_blocks = 0; 1149 ei->i_reserved_meta_blocks = 0;
1150 ei->i_da_metadata_calc_len = 0; 1150 ei->i_da_metadata_calc_len = 0;
1151 } 1151 }
1152 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1152 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1153 1153
1154 /* Update quota subsystem for data blocks */ 1154 /* Update quota subsystem for data blocks */
1155 if (quota_claim) 1155 if (quota_claim)
1156 dquot_claim_block(inode, used); 1156 dquot_claim_block(inode, used);
1157 else { 1157 else {
1158 /* 1158 /*
1159 * We did fallocate with an offset that is already delayed 1159 * We did fallocate with an offset that is already delayed
1160 * allocated. So on delayed allocated writeback we should 1160 * allocated. So on delayed allocated writeback we should
1161 * not re-claim the quota for fallocated blocks. 1161 * not re-claim the quota for fallocated blocks.
1162 */ 1162 */
1163 dquot_release_reservation_block(inode, used); 1163 dquot_release_reservation_block(inode, used);
1164 } 1164 }
1165 1165
1166 /* 1166 /*
1167 * If we have done all the pending block allocations and if 1167 * If we have done all the pending block allocations and if
1168 * there aren't any writers on the inode, we can discard the 1168 * there aren't any writers on the inode, we can discard the
1169 * inode's preallocations. 1169 * inode's preallocations.
1170 */ 1170 */
1171 if ((ei->i_reserved_data_blocks == 0) && 1171 if ((ei->i_reserved_data_blocks == 0) &&
1172 (atomic_read(&inode->i_writecount) == 0)) 1172 (atomic_read(&inode->i_writecount) == 0))
1173 ext4_discard_preallocations(inode); 1173 ext4_discard_preallocations(inode);
1174 } 1174 }
1175 1175
1176 static int __check_block_validity(struct inode *inode, const char *func, 1176 static int __check_block_validity(struct inode *inode, const char *func,
1177 unsigned int line, 1177 unsigned int line,
1178 struct ext4_map_blocks *map) 1178 struct ext4_map_blocks *map)
1179 { 1179 {
1180 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, 1180 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
1181 map->m_len)) { 1181 map->m_len)) {
1182 ext4_error_inode(inode, func, line, map->m_pblk, 1182 ext4_error_inode(inode, func, line, map->m_pblk,
1183 "lblock %lu mapped to illegal pblock " 1183 "lblock %lu mapped to illegal pblock "
1184 "(length %d)", (unsigned long) map->m_lblk, 1184 "(length %d)", (unsigned long) map->m_lblk,
1185 map->m_len); 1185 map->m_len);
1186 return -EIO; 1186 return -EIO;
1187 } 1187 }
1188 return 0; 1188 return 0;
1189 } 1189 }
1190 1190
1191 #define check_block_validity(inode, map) \ 1191 #define check_block_validity(inode, map) \
1192 __check_block_validity((inode), __func__, __LINE__, (map)) 1192 __check_block_validity((inode), __func__, __LINE__, (map))
1193 1193
1194 /* 1194 /*
1195 * Return the number of contiguous dirty pages in a given inode 1195 * Return the number of contiguous dirty pages in a given inode
1196 * starting at page frame idx. 1196 * starting at page frame idx.
1197 */ 1197 */
1198 static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, 1198 static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1199 unsigned int max_pages) 1199 unsigned int max_pages)
1200 { 1200 {
1201 struct address_space *mapping = inode->i_mapping; 1201 struct address_space *mapping = inode->i_mapping;
1202 pgoff_t index; 1202 pgoff_t index;
1203 struct pagevec pvec; 1203 struct pagevec pvec;
1204 pgoff_t num = 0; 1204 pgoff_t num = 0;
1205 int i, nr_pages, done = 0; 1205 int i, nr_pages, done = 0;
1206 1206
1207 if (max_pages == 0) 1207 if (max_pages == 0)
1208 return 0; 1208 return 0;
1209 pagevec_init(&pvec, 0); 1209 pagevec_init(&pvec, 0);
1210 while (!done) { 1210 while (!done) {
1211 index = idx; 1211 index = idx;
1212 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 1212 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1213 PAGECACHE_TAG_DIRTY, 1213 PAGECACHE_TAG_DIRTY,
1214 (pgoff_t)PAGEVEC_SIZE); 1214 (pgoff_t)PAGEVEC_SIZE);
1215 if (nr_pages == 0) 1215 if (nr_pages == 0)
1216 break; 1216 break;
1217 for (i = 0; i < nr_pages; i++) { 1217 for (i = 0; i < nr_pages; i++) {
1218 struct page *page = pvec.pages[i]; 1218 struct page *page = pvec.pages[i];
1219 struct buffer_head *bh, *head; 1219 struct buffer_head *bh, *head;
1220 1220
1221 lock_page(page); 1221 lock_page(page);
1222 if (unlikely(page->mapping != mapping) || 1222 if (unlikely(page->mapping != mapping) ||
1223 !PageDirty(page) || 1223 !PageDirty(page) ||
1224 PageWriteback(page) || 1224 PageWriteback(page) ||
1225 page->index != idx) { 1225 page->index != idx) {
1226 done = 1; 1226 done = 1;
1227 unlock_page(page); 1227 unlock_page(page);
1228 break; 1228 break;
1229 } 1229 }
1230 if (page_has_buffers(page)) { 1230 if (page_has_buffers(page)) {
1231 bh = head = page_buffers(page); 1231 bh = head = page_buffers(page);
1232 do { 1232 do {
1233 if (!buffer_delay(bh) && 1233 if (!buffer_delay(bh) &&
1234 !buffer_unwritten(bh)) 1234 !buffer_unwritten(bh))
1235 done = 1; 1235 done = 1;
1236 bh = bh->b_this_page; 1236 bh = bh->b_this_page;
1237 } while (!done && (bh != head)); 1237 } while (!done && (bh != head));
1238 } 1238 }
1239 unlock_page(page); 1239 unlock_page(page);
1240 if (done) 1240 if (done)
1241 break; 1241 break;
1242 idx++; 1242 idx++;
1243 num++; 1243 num++;
1244 if (num >= max_pages) { 1244 if (num >= max_pages) {
1245 done = 1; 1245 done = 1;
1246 break; 1246 break;
1247 } 1247 }
1248 } 1248 }
1249 pagevec_release(&pvec); 1249 pagevec_release(&pvec);
1250 } 1250 }
1251 return num; 1251 return num;
1252 } 1252 }
1253 1253
1254 /* 1254 /*
1255 * The ext4_map_blocks() function tries to look up the requested blocks, 1255 * The ext4_map_blocks() function tries to look up the requested blocks,
1256 * and returns them if they are already mapped. 1256 * and returns them if they are already mapped.
1257 * 1257 *
1258 * Otherwise it takes the write lock of i_data_sem, allocates blocks, 1258 * Otherwise it takes the write lock of i_data_sem, allocates blocks,
1259 * stores the allocated blocks in the result buffer head and marks it 1259 * stores the allocated blocks in the result buffer head and marks it
1260 * mapped. 1260 * mapped.
1261 * 1261 *
1262 * If the file is extent based, it calls ext4_ext_map_blocks(); 1262 * If the file is extent based, it calls ext4_ext_map_blocks();
1263 * otherwise it calls ext4_ind_map_blocks() to handle indirect-mapped 1263 * otherwise it calls ext4_ind_map_blocks() to handle indirect-mapped
1264 * files. 1264 * files.
1265 * 1265 *
1266 * On success, it returns the number of blocks mapped or allocated. 1266 * On success, it returns the number of blocks mapped or allocated.
1267 * If create == 0 and the blocks are pre-allocated and uninitialized, 1267 * If create == 0 and the blocks are pre-allocated and uninitialized,
1268 * the result buffer head is unmapped. If create == 1, it makes sure 1268 * the result buffer head is unmapped. If create == 1, it makes sure
1269 * the buffer head is mapped. 1269 * the buffer head is mapped.
1270 * 1270 *
1271 * It returns 0 if a plain lookup failed (blocks have not been allocated); 1271 * It returns 0 if a plain lookup failed (blocks have not been allocated);
1272 * in that case the buffer head is unmapped. 1272 * in that case the buffer head is unmapped.
1273 * 1273 *
1274 * It returns the error in case of allocation failure. 1274 * It returns the error in case of allocation failure.
1275 */ 1275 */
1276 int ext4_map_blocks(handle_t *handle, struct inode *inode, 1276 int ext4_map_blocks(handle_t *handle, struct inode *inode,
1277 struct ext4_map_blocks *map, int flags) 1277 struct ext4_map_blocks *map, int flags)
1278 { 1278 {
1279 int retval; 1279 int retval;
1280 1280
1281 map->m_flags = 0; 1281 map->m_flags = 0;
1282 ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," 1282 ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
1283 "logical block %lu\n", inode->i_ino, flags, map->m_len, 1283 "logical block %lu\n", inode->i_ino, flags, map->m_len,
1284 (unsigned long) map->m_lblk); 1284 (unsigned long) map->m_lblk);
1285 /* 1285 /*
1286 * Try to see if we can get the block without requesting a new 1286 * Try to see if we can get the block without requesting a new
1287 * file system block. 1287 * file system block.
1288 */ 1288 */
1289 down_read((&EXT4_I(inode)->i_data_sem)); 1289 down_read((&EXT4_I(inode)->i_data_sem));
1290 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 1290 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1291 retval = ext4_ext_map_blocks(handle, inode, map, 0); 1291 retval = ext4_ext_map_blocks(handle, inode, map, 0);
1292 } else { 1292 } else {
1293 retval = ext4_ind_map_blocks(handle, inode, map, 0); 1293 retval = ext4_ind_map_blocks(handle, inode, map, 0);
1294 } 1294 }
1295 up_read((&EXT4_I(inode)->i_data_sem)); 1295 up_read((&EXT4_I(inode)->i_data_sem));
1296 1296
1297 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 1297 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1298 int ret = check_block_validity(inode, map); 1298 int ret = check_block_validity(inode, map);
1299 if (ret != 0) 1299 if (ret != 0)
1300 return ret; 1300 return ret;
1301 } 1301 }
1302 1302
1303 /* If it is only a block(s) lookup */ 1303 /* If it is only a block(s) lookup */
1304 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) 1304 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
1305 return retval; 1305 return retval;
1306 1306
1307 /* 1307 /*
1308 * Return if the blocks have already been allocated. 1308 * Return if the blocks have already been allocated.
1309 * 1309 *
1310 * Note that if blocks have been preallocated, 1310 * Note that if blocks have been preallocated,
1311 * ext4_ext_get_block() returns as it does for create == 0, 1311 * ext4_ext_get_block() returns as it does for create == 0,
1312 * with the buffer head unmapped. 1312 * with the buffer head unmapped.
1313 */ 1313 */
1314 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) 1314 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
1315 return retval; 1315 return retval;
1316 1316
1317 /* 1317 /*
1318 * When we call get_blocks without the create flag, the 1318 * When we call get_blocks without the create flag, the
1319 * BH_Unwritten flag could have gotten set if the blocks 1319 * BH_Unwritten flag could have gotten set if the blocks
1320 * requested were part of an uninitialized extent. We need to 1320 * requested were part of an uninitialized extent. We need to
1321 * clear this flag now that we are committed to convert all or 1321 * clear this flag now that we are committed to convert all or
1322 * part of the uninitialized extent to be an initialized 1322 * part of the uninitialized extent to be an initialized
1323 * extent. This is because we need to avoid the combination 1323 * extent. This is because we need to avoid the combination
1324 * of BH_Unwritten and BH_Mapped flags being simultaneously 1324 * of BH_Unwritten and BH_Mapped flags being simultaneously
1325 * set on the buffer_head. 1325 * set on the buffer_head.
1326 */ 1326 */
1327 map->m_flags &= ~EXT4_MAP_UNWRITTEN; 1327 map->m_flags &= ~EXT4_MAP_UNWRITTEN;
1328 1328
1329 /* 1329 /*
1330 * Allocating new blocks and/or writing to an uninitialized extent 1330 * Allocating new blocks and/or writing to an uninitialized extent
1331 * may result in updating i_data, so we take 1331 * may result in updating i_data, so we take
1332 * the write lock of i_data_sem and call get_blocks() 1332 * the write lock of i_data_sem and call get_blocks()
1333 * with the create == 1 flag. 1333 * with the create == 1 flag.
1334 */ 1334 */
1335 down_write((&EXT4_I(inode)->i_data_sem)); 1335 down_write((&EXT4_I(inode)->i_data_sem));
1336 1336
1337 /* 1337 /*
1338 * If the caller is from the delayed allocation writeout path, 1338 * If the caller is from the delayed allocation writeout path,
1339 * we have already reserved fs blocks for allocation; 1339 * we have already reserved fs blocks for allocation;
1340 * let the underlying get_block() function know to 1340 * let the underlying get_block() function know to
1341 * avoid double accounting. 1341 * avoid double accounting.
1342 */ 1342 */
1343 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1343 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1344 ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); 1344 ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1345 /* 1345 /*
1346 * We need to check for EXT4 here because migrate 1346 * We need to check for EXT4 here because migrate
1347 * could have changed the inode type in between 1347 * could have changed the inode type in between
1348 */ 1348 */
1349 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 1349 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1350 retval = ext4_ext_map_blocks(handle, inode, map, flags); 1350 retval = ext4_ext_map_blocks(handle, inode, map, flags);
1351 } else { 1351 } else {
1352 retval = ext4_ind_map_blocks(handle, inode, map, flags); 1352 retval = ext4_ind_map_blocks(handle, inode, map, flags);
1353 1353
1354 if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { 1354 if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
1355 /* 1355 /*
1356 * We allocated new blocks which will result in 1356 * We allocated new blocks which will result in
1357 * i_data's format changing. Force the migrate 1357 * i_data's format changing. Force the migrate
1358 * to fail by clearing migrate flags 1358 * to fail by clearing migrate flags
1359 */ 1359 */
1360 ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); 1360 ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
1361 } 1361 }
1362 1362
1363 /* 1363 /*
1364 * Update reserved blocks/metadata blocks after successful 1364 * Update reserved blocks/metadata blocks after successful
1365 * block allocation which had been deferred till now. We don't 1365 * block allocation which had been deferred till now. We don't
1366 * support fallocate for non extent files. So we can update 1366 * support fallocate for non extent files. So we can update
1367 * reserve space here. 1367 * reserve space here.
1368 */ 1368 */
1369 if ((retval > 0) && 1369 if ((retval > 0) &&
1370 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) 1370 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
1371 ext4_da_update_reserve_space(inode, retval, 1); 1371 ext4_da_update_reserve_space(inode, retval, 1);
1372 } 1372 }
1373 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1373 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1374 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); 1374 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1375 1375
1376 up_write((&EXT4_I(inode)->i_data_sem)); 1376 up_write((&EXT4_I(inode)->i_data_sem));
1377 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 1377 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1378 int ret = check_block_validity(inode, map); 1378 int ret = check_block_validity(inode, map);
1379 if (ret != 0) 1379 if (ret != 0)
1380 return ret; 1380 return ret;
1381 } 1381 }
1382 return retval; 1382 return retval;
1383 } 1383 }
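
The contract a caller of ext4_map_blocks() sees is: ret > 0 with EXT4_MAP_MAPPED set means ret blocks are mapped starting at m_pblk; ret == 0 is a lookup miss (a hole when create == 0); ret < 0 is an error. A toy userspace model of handling that contract follows; the stub, flag values and block numbers are invented, not ext4's:

    #include <stdio.h>

    #define MAP_MAPPED 0x1
    #define MAP_NEW    0x2

    struct map_blocks {
            unsigned long m_lblk;   /* logical block requested */
            unsigned long m_pblk;   /* first physical block found */
            unsigned int m_len;     /* blocks requested / mapped */
            unsigned int m_flags;
    };

    /* Stub: pretend logical blocks 0-99 are mapped 1:1 at physical 8192. */
    static int map_blocks_stub(struct map_blocks *map, int create)
    {
            if (map->m_lblk < 100) {
                    map->m_pblk = 8192 + map->m_lblk;
                    map->m_flags |= MAP_MAPPED;
                    return map->m_len;
            }
            if (!create)
                    return 0;               /* plain lookup miss: a hole */
            map->m_pblk = 16384;            /* "allocated" somewhere new */
            map->m_flags |= MAP_MAPPED | MAP_NEW;
            return map->m_len;
    }

    int main(void)
    {
            struct map_blocks map = { .m_lblk = 42, .m_len = 8 };
            int ret = map_blocks_stub(&map, 0);

            if (ret > 0 && (map.m_flags & MAP_MAPPED))
                    printf("mapped %d block(s) at pblk %lu\n", ret, map.m_pblk);
            else if (ret == 0)
                    printf("hole: nothing allocated\n");
            else
                    printf("error %d\n", ret);
            return 0;
    }
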
1384 1384
1385 /* Maximum number of blocks we map for direct IO at once. */ 1385 /* Maximum number of blocks we map for direct IO at once. */
1386 #define DIO_MAX_BLOCKS 4096 1386 #define DIO_MAX_BLOCKS 4096
1387 1387
1388 static int _ext4_get_block(struct inode *inode, sector_t iblock, 1388 static int _ext4_get_block(struct inode *inode, sector_t iblock,
1389 struct buffer_head *bh, int flags) 1389 struct buffer_head *bh, int flags)
1390 { 1390 {
1391 handle_t *handle = ext4_journal_current_handle(); 1391 handle_t *handle = ext4_journal_current_handle();
1392 struct ext4_map_blocks map; 1392 struct ext4_map_blocks map;
1393 int ret = 0, started = 0; 1393 int ret = 0, started = 0;
1394 int dio_credits; 1394 int dio_credits;
1395 1395
1396 map.m_lblk = iblock; 1396 map.m_lblk = iblock;
1397 map.m_len = bh->b_size >> inode->i_blkbits; 1397 map.m_len = bh->b_size >> inode->i_blkbits;
1398 1398
1399 if (flags && !handle) { 1399 if (flags && !handle) {
1400 /* Direct IO write... */ 1400 /* Direct IO write... */
1401 if (map.m_len > DIO_MAX_BLOCKS) 1401 if (map.m_len > DIO_MAX_BLOCKS)
1402 map.m_len = DIO_MAX_BLOCKS; 1402 map.m_len = DIO_MAX_BLOCKS;
1403 dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); 1403 dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
1404 handle = ext4_journal_start(inode, dio_credits); 1404 handle = ext4_journal_start(inode, dio_credits);
1405 if (IS_ERR(handle)) { 1405 if (IS_ERR(handle)) {
1406 ret = PTR_ERR(handle); 1406 ret = PTR_ERR(handle);
1407 return ret; 1407 return ret;
1408 } 1408 }
1409 started = 1; 1409 started = 1;
1410 } 1410 }
1411 1411
1412 ret = ext4_map_blocks(handle, inode, &map, flags); 1412 ret = ext4_map_blocks(handle, inode, &map, flags);
1413 if (ret > 0) { 1413 if (ret > 0) {
1414 map_bh(bh, inode->i_sb, map.m_pblk); 1414 map_bh(bh, inode->i_sb, map.m_pblk);
1415 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; 1415 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
1416 bh->b_size = inode->i_sb->s_blocksize * map.m_len; 1416 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
1417 ret = 0; 1417 ret = 0;
1418 } 1418 }
1419 if (started) 1419 if (started)
1420 ext4_journal_stop(handle); 1420 ext4_journal_stop(handle);
1421 return ret; 1421 return ret;
1422 } 1422 }
1423 1423
1424 int ext4_get_block(struct inode *inode, sector_t iblock, 1424 int ext4_get_block(struct inode *inode, sector_t iblock,
1425 struct buffer_head *bh, int create) 1425 struct buffer_head *bh, int create)
1426 { 1426 {
1427 return _ext4_get_block(inode, iblock, bh, 1427 return _ext4_get_block(inode, iblock, bh,
1428 create ? EXT4_GET_BLOCKS_CREATE : 0); 1428 create ? EXT4_GET_BLOCKS_CREATE : 0);
1429 } 1429 }
1430 1430
1431 /* 1431 /*
1432 * `handle' can be NULL if create is zero 1432 * `handle' can be NULL if create is zero
1433 */ 1433 */
1434 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 1434 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1435 ext4_lblk_t block, int create, int *errp) 1435 ext4_lblk_t block, int create, int *errp)
1436 { 1436 {
1437 struct ext4_map_blocks map; 1437 struct ext4_map_blocks map;
1438 struct buffer_head *bh; 1438 struct buffer_head *bh;
1439 int fatal = 0, err; 1439 int fatal = 0, err;
1440 1440
1441 J_ASSERT(handle != NULL || create == 0); 1441 J_ASSERT(handle != NULL || create == 0);
1442 1442
1443 map.m_lblk = block; 1443 map.m_lblk = block;
1444 map.m_len = 1; 1444 map.m_len = 1;
1445 err = ext4_map_blocks(handle, inode, &map, 1445 err = ext4_map_blocks(handle, inode, &map,
1446 create ? EXT4_GET_BLOCKS_CREATE : 0); 1446 create ? EXT4_GET_BLOCKS_CREATE : 0);
1447 1447
1448 if (err < 0) 1448 if (err < 0)
1449 *errp = err; 1449 *errp = err;
1450 if (err <= 0) 1450 if (err <= 0)
1451 return NULL; 1451 return NULL;
1452 *errp = 0; 1452 *errp = 0;
1453 1453
1454 bh = sb_getblk(inode->i_sb, map.m_pblk); 1454 bh = sb_getblk(inode->i_sb, map.m_pblk);
1455 if (!bh) { 1455 if (!bh) {
1456 *errp = -EIO; 1456 *errp = -EIO;
1457 return NULL; 1457 return NULL;
1458 } 1458 }
1459 if (map.m_flags & EXT4_MAP_NEW) { 1459 if (map.m_flags & EXT4_MAP_NEW) {
1460 J_ASSERT(create != 0); 1460 J_ASSERT(create != 0);
1461 J_ASSERT(handle != NULL); 1461 J_ASSERT(handle != NULL);
1462 1462
1463 /* 1463 /*
1464 * Now that we do not always journal data, we should 1464 * Now that we do not always journal data, we should
1465 * keep in mind whether this should always journal the 1465 * keep in mind whether this should always journal the
1466 * new buffer as metadata. For now, regular file 1466 * new buffer as metadata. For now, regular file
1467 * writes use ext4_get_block instead, so it's not a 1467 * writes use ext4_get_block instead, so it's not a
1468 * problem. 1468 * problem.
1469 */ 1469 */
1470 lock_buffer(bh); 1470 lock_buffer(bh);
1471 BUFFER_TRACE(bh, "call get_create_access"); 1471 BUFFER_TRACE(bh, "call get_create_access");
1472 fatal = ext4_journal_get_create_access(handle, bh); 1472 fatal = ext4_journal_get_create_access(handle, bh);
1473 if (!fatal && !buffer_uptodate(bh)) { 1473 if (!fatal && !buffer_uptodate(bh)) {
1474 memset(bh->b_data, 0, inode->i_sb->s_blocksize); 1474 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1475 set_buffer_uptodate(bh); 1475 set_buffer_uptodate(bh);
1476 } 1476 }
1477 unlock_buffer(bh); 1477 unlock_buffer(bh);
1478 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 1478 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1479 err = ext4_handle_dirty_metadata(handle, inode, bh); 1479 err = ext4_handle_dirty_metadata(handle, inode, bh);
1480 if (!fatal) 1480 if (!fatal)
1481 fatal = err; 1481 fatal = err;
1482 } else { 1482 } else {
1483 BUFFER_TRACE(bh, "not a new buffer"); 1483 BUFFER_TRACE(bh, "not a new buffer");
1484 } 1484 }
1485 if (fatal) { 1485 if (fatal) {
1486 *errp = fatal; 1486 *errp = fatal;
1487 brelse(bh); 1487 brelse(bh);
1488 bh = NULL; 1488 bh = NULL;
1489 } 1489 }
1490 return bh; 1490 return bh;
1491 } 1491 }
1492 1492
1493 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1493 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1494 ext4_lblk_t block, int create, int *err) 1494 ext4_lblk_t block, int create, int *err)
1495 { 1495 {
1496 struct buffer_head *bh; 1496 struct buffer_head *bh;
1497 1497
1498 bh = ext4_getblk(handle, inode, block, create, err); 1498 bh = ext4_getblk(handle, inode, block, create, err);
1499 if (!bh) 1499 if (!bh)
1500 return bh; 1500 return bh;
1501 if (buffer_uptodate(bh)) 1501 if (buffer_uptodate(bh))
1502 return bh; 1502 return bh;
1503 ll_rw_block(READ_META, 1, &bh); 1503 ll_rw_block(READ_META, 1, &bh);
1504 wait_on_buffer(bh); 1504 wait_on_buffer(bh);
1505 if (buffer_uptodate(bh)) 1505 if (buffer_uptodate(bh))
1506 return bh; 1506 return bh;
1507 put_bh(bh); 1507 put_bh(bh);
1508 *err = -EIO; 1508 *err = -EIO;
1509 return NULL; 1509 return NULL;
1510 } 1510 }
1511 1511
1512 static int walk_page_buffers(handle_t *handle, 1512 static int walk_page_buffers(handle_t *handle,
1513 struct buffer_head *head, 1513 struct buffer_head *head,
1514 unsigned from, 1514 unsigned from,
1515 unsigned to, 1515 unsigned to,
1516 int *partial, 1516 int *partial,
1517 int (*fn)(handle_t *handle, 1517 int (*fn)(handle_t *handle,
1518 struct buffer_head *bh)) 1518 struct buffer_head *bh))
1519 { 1519 {
1520 struct buffer_head *bh; 1520 struct buffer_head *bh;
1521 unsigned block_start, block_end; 1521 unsigned block_start, block_end;
1522 unsigned blocksize = head->b_size; 1522 unsigned blocksize = head->b_size;
1523 int err, ret = 0; 1523 int err, ret = 0;
1524 struct buffer_head *next; 1524 struct buffer_head *next;
1525 1525
1526 for (bh = head, block_start = 0; 1526 for (bh = head, block_start = 0;
1527 ret == 0 && (bh != head || !block_start); 1527 ret == 0 && (bh != head || !block_start);
1528 block_start = block_end, bh = next) { 1528 block_start = block_end, bh = next) {
1529 next = bh->b_this_page; 1529 next = bh->b_this_page;
1530 block_end = block_start + blocksize; 1530 block_end = block_start + blocksize;
1531 if (block_end <= from || block_start >= to) { 1531 if (block_end <= from || block_start >= to) {
1532 if (partial && !buffer_uptodate(bh)) 1532 if (partial && !buffer_uptodate(bh))
1533 *partial = 1; 1533 *partial = 1;
1534 continue; 1534 continue;
1535 } 1535 }
1536 err = (*fn)(handle, bh); 1536 err = (*fn)(handle, bh);
1537 if (!ret) 1537 if (!ret)
1538 ret = err; 1538 ret = err;
1539 } 1539 }
1540 return ret; 1540 return ret;
1541 } 1541 }
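
walk_page_buffers() simply visits each block-sized buffer of the page that overlaps the byte range [from, to) and applies fn to it; buffers wholly outside the range are only checked for uptodate-ness. A self-contained model of that range test, assuming 1 KiB buffers in a 4 KiB page and an invented write range:

    #include <stdio.h>

    int main(void)
    {
            const unsigned int blocksize = 1024, pagesize = 4096;
            unsigned int from = 1500, to = 3000;    /* hypothetical write range */
            unsigned int start, end;

            for (start = 0; start < pagesize; start += blocksize) {
                    end = start + blocksize;
                    if (end <= from || start >= to) {
                            printf("buffer %4u-%4u: outside write, skipped\n",
                                   start, end);
                            continue;
                    }
                    printf("buffer %4u-%4u: fn() applied\n", start, end);
            }
            return 0;
    }
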
1542 1542
1543 /* 1543 /*
1544 * To preserve ordering, it is essential that the hole instantiation and 1544 * To preserve ordering, it is essential that the hole instantiation and
1545 * the data write be encapsulated in a single transaction. We cannot 1545 * the data write be encapsulated in a single transaction. We cannot
1546 * close off a transaction and start a new one between the ext4_get_block() 1546 * close off a transaction and start a new one between the ext4_get_block()
1547 * and the commit_write(). So doing the jbd2_journal_start at the start of 1547 * and the commit_write(). So doing the jbd2_journal_start at the start of
1548 * prepare_write() is the right place. 1548 * prepare_write() is the right place.
1549 * 1549 *
1550 * Also, this function can nest inside ext4_writepage() -> 1550 * Also, this function can nest inside ext4_writepage() ->
1551 * block_write_full_page(). In that case, we *know* that ext4_writepage() 1551 * block_write_full_page(). In that case, we *know* that ext4_writepage()
1552 * has generated enough buffer credits to do the whole page. So we won't 1552 * has generated enough buffer credits to do the whole page. So we won't
1553 * block on the journal in that case, which is good, because the caller may 1553 * block on the journal in that case, which is good, because the caller may
1554 * be PF_MEMALLOC. 1554 * be PF_MEMALLOC.
1555 * 1555 *
1556 * By accident, ext4 can be reentered when a transaction is open via 1556 * By accident, ext4 can be reentered when a transaction is open via
1557 * quota file writes. If we were to commit the transaction while thus 1557 * quota file writes. If we were to commit the transaction while thus
1558 * reentered, there can be a deadlock - we would be holding a quota 1558 * reentered, there can be a deadlock - we would be holding a quota
1559 * lock, and the commit would never complete if another thread had a 1559 * lock, and the commit would never complete if another thread had a
1560 * transaction open and was blocking on the quota lock - a ranking 1560 * transaction open and was blocking on the quota lock - a ranking
1561 * violation. 1561 * violation.
1562 * 1562 *
1563 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start 1563 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
1564 * will _not_ run commit under these circumstances because handle->h_ref 1564 * will _not_ run commit under these circumstances because handle->h_ref
1565 * is elevated. We'll still have enough credits for the tiny quotafile 1565 * is elevated. We'll still have enough credits for the tiny quotafile
1566 * write. 1566 * write.
1567 */ 1567 */
1568 static int do_journal_get_write_access(handle_t *handle, 1568 static int do_journal_get_write_access(handle_t *handle,
1569 struct buffer_head *bh) 1569 struct buffer_head *bh)
1570 { 1570 {
1571 int dirty = buffer_dirty(bh); 1571 int dirty = buffer_dirty(bh);
1572 int ret; 1572 int ret;
1573 1573
1574 if (!buffer_mapped(bh) || buffer_freed(bh)) 1574 if (!buffer_mapped(bh) || buffer_freed(bh))
1575 return 0; 1575 return 0;
1576 /* 1576 /*
1577 * __block_write_begin() could have dirtied some buffers. Clean 1577 * __block_write_begin() could have dirtied some buffers. Clean
1578 * the dirty bit as jbd2_journal_get_write_access() could complain 1578 * the dirty bit as jbd2_journal_get_write_access() could complain
1579 * otherwise about fs integrity issues. Setting of the dirty bit 1579 * otherwise about fs integrity issues. Setting of the dirty bit
1580 * by __block_write_begin() isn't a real problem here as we clear 1580 * by __block_write_begin() isn't a real problem here as we clear
1581 * the bit before releasing a page lock and thus writeback cannot 1581 * the bit before releasing a page lock and thus writeback cannot
1582 * ever write the buffer. 1582 * ever write the buffer.
1583 */ 1583 */
1584 if (dirty) 1584 if (dirty)
1585 clear_buffer_dirty(bh); 1585 clear_buffer_dirty(bh);
1586 ret = ext4_journal_get_write_access(handle, bh); 1586 ret = ext4_journal_get_write_access(handle, bh);
1587 if (!ret && dirty) 1587 if (!ret && dirty)
1588 ret = ext4_handle_dirty_metadata(handle, NULL, bh); 1588 ret = ext4_handle_dirty_metadata(handle, NULL, bh);
1589 return ret; 1589 return ret;
1590 } 1590 }
1591 1591
1592 /* 1592 /*
1593 * Truncate blocks that were not used by write. We have to truncate the 1593 * Truncate blocks that were not used by write. We have to truncate the
1594 * pagecache as well so that corresponding buffers get properly unmapped. 1594 * pagecache as well so that corresponding buffers get properly unmapped.
1595 */ 1595 */
1596 static void ext4_truncate_failed_write(struct inode *inode) 1596 static void ext4_truncate_failed_write(struct inode *inode)
1597 { 1597 {
1598 truncate_inode_pages(inode->i_mapping, inode->i_size); 1598 truncate_inode_pages(inode->i_mapping, inode->i_size);
1599 ext4_truncate(inode); 1599 ext4_truncate(inode);
1600 } 1600 }
1601 1601
1602 static int ext4_get_block_write(struct inode *inode, sector_t iblock, 1602 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
1603 struct buffer_head *bh_result, int create); 1603 struct buffer_head *bh_result, int create);
1604 static int ext4_write_begin(struct file *file, struct address_space *mapping, 1604 static int ext4_write_begin(struct file *file, struct address_space *mapping,
1605 loff_t pos, unsigned len, unsigned flags, 1605 loff_t pos, unsigned len, unsigned flags,
1606 struct page **pagep, void **fsdata) 1606 struct page **pagep, void **fsdata)
1607 { 1607 {
1608 struct inode *inode = mapping->host; 1608 struct inode *inode = mapping->host;
1609 int ret, needed_blocks; 1609 int ret, needed_blocks;
1610 handle_t *handle; 1610 handle_t *handle;
1611 int retries = 0; 1611 int retries = 0;
1612 struct page *page; 1612 struct page *page;
1613 pgoff_t index; 1613 pgoff_t index;
1614 unsigned from, to; 1614 unsigned from, to;
1615 1615
1616 trace_ext4_write_begin(inode, pos, len, flags); 1616 trace_ext4_write_begin(inode, pos, len, flags);
1617 /* 1617 /*
1618 * Reserve one block more for addition to orphan list in case 1618 * Reserve one block more for addition to orphan list in case
1619 * we allocate blocks but write fails for some reason 1619 * we allocate blocks but write fails for some reason
1620 */ 1620 */
1621 needed_blocks = ext4_writepage_trans_blocks(inode) + 1; 1621 needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
1622 index = pos >> PAGE_CACHE_SHIFT; 1622 index = pos >> PAGE_CACHE_SHIFT;
1623 from = pos & (PAGE_CACHE_SIZE - 1); 1623 from = pos & (PAGE_CACHE_SIZE - 1);
1624 to = from + len; 1624 to = from + len;
1625 1625
1626 retry: 1626 retry:
1627 handle = ext4_journal_start(inode, needed_blocks); 1627 handle = ext4_journal_start(inode, needed_blocks);
1628 if (IS_ERR(handle)) { 1628 if (IS_ERR(handle)) {
1629 ret = PTR_ERR(handle); 1629 ret = PTR_ERR(handle);
1630 goto out; 1630 goto out;
1631 } 1631 }
1632 1632
1633 /* We cannot recurse into the filesystem as the transaction is already 1633 /* We cannot recurse into the filesystem as the transaction is already
1634 * started */ 1634 * started */
1635 flags |= AOP_FLAG_NOFS; 1635 flags |= AOP_FLAG_NOFS;
1636 1636
1637 page = grab_cache_page_write_begin(mapping, index, flags); 1637 page = grab_cache_page_write_begin(mapping, index, flags);
1638 if (!page) { 1638 if (!page) {
1639 ext4_journal_stop(handle); 1639 ext4_journal_stop(handle);
1640 ret = -ENOMEM; 1640 ret = -ENOMEM;
1641 goto out; 1641 goto out;
1642 } 1642 }
1643 *pagep = page; 1643 *pagep = page;
1644 1644
1645 if (ext4_should_dioread_nolock(inode)) 1645 if (ext4_should_dioread_nolock(inode))
1646 ret = __block_write_begin(page, pos, len, ext4_get_block_write); 1646 ret = __block_write_begin(page, pos, len, ext4_get_block_write);
1647 else 1647 else
1648 ret = __block_write_begin(page, pos, len, ext4_get_block); 1648 ret = __block_write_begin(page, pos, len, ext4_get_block);
1649 1649
1650 if (!ret && ext4_should_journal_data(inode)) { 1650 if (!ret && ext4_should_journal_data(inode)) {
1651 ret = walk_page_buffers(handle, page_buffers(page), 1651 ret = walk_page_buffers(handle, page_buffers(page),
1652 from, to, NULL, do_journal_get_write_access); 1652 from, to, NULL, do_journal_get_write_access);
1653 } 1653 }
1654 1654
1655 if (ret) { 1655 if (ret) {
1656 unlock_page(page); 1656 unlock_page(page);
1657 page_cache_release(page); 1657 page_cache_release(page);
1658 /* 1658 /*
1659 * __block_write_begin may have instantiated a few blocks 1659 * __block_write_begin may have instantiated a few blocks
1660 * outside i_size. Trim these off again. Don't need 1660 * outside i_size. Trim these off again. Don't need
1661 * i_size_read because we hold i_mutex. 1661 * i_size_read because we hold i_mutex.
1662 * 1662 *
1663 * Add inode to orphan list in case we crash before 1663 * Add inode to orphan list in case we crash before
1664 * truncate finishes 1664 * truncate finishes
1665 */ 1665 */
1666 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1666 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1667 ext4_orphan_add(handle, inode); 1667 ext4_orphan_add(handle, inode);
1668 1668
1669 ext4_journal_stop(handle); 1669 ext4_journal_stop(handle);
1670 if (pos + len > inode->i_size) { 1670 if (pos + len > inode->i_size) {
1671 ext4_truncate_failed_write(inode); 1671 ext4_truncate_failed_write(inode);
1672 /* 1672 /*
1673 * If truncate failed early the inode might 1673 * If truncate failed early the inode might
1674 * still be on the orphan list; we need to 1674 * still be on the orphan list; we need to
1675 * make sure the inode is removed from the 1675 * make sure the inode is removed from the
1676 * orphan list in that case. 1676 * orphan list in that case.
1677 */ 1677 */
1678 if (inode->i_nlink) 1678 if (inode->i_nlink)
1679 ext4_orphan_del(NULL, inode); 1679 ext4_orphan_del(NULL, inode);
1680 } 1680 }
1681 } 1681 }
1682 1682
1683 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 1683 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
1684 goto retry; 1684 goto retry;
1685 out: 1685 out:
1686 return ret; 1686 return ret;
1687 } 1687 }
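
The retry: label above implements a common ext4 pattern: if block allocation fails with ENOSPC, ask ext4_should_retry_alloc() whether committing the journal might free reserved space, and if so start over with a fresh handle. A stripped-down model of just that control flow; both stubs are invented, and the real helper also forces a journal commit:

    #include <errno.h>
    #include <stdio.h>

    /* Stub allocation attempt: fail with ENOSPC twice, then succeed. */
    static int try_write(void)
    {
            static int calls;

            return ++calls < 3 ? -ENOSPC : 0;
    }

    /* Stub for ext4_should_retry_alloc(): allow up to 3 retries. */
    static int should_retry(int *retries)
    {
            return (*retries)++ < 3;
    }

    int main(void)
    {
            int retries = 0, ret;

            do {
                    ret = try_write();      /* start handle, write, stop handle */
            } while (ret == -ENOSPC && should_retry(&retries));

            printf("final ret = %d after %d retries\n", ret, retries);
            return 0;
    }
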
1688 1688
1689 /* For write_end() in data=journal mode */ 1689 /* For write_end() in data=journal mode */
1690 static int write_end_fn(handle_t *handle, struct buffer_head *bh) 1690 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1691 { 1691 {
1692 if (!buffer_mapped(bh) || buffer_freed(bh)) 1692 if (!buffer_mapped(bh) || buffer_freed(bh))
1693 return 0; 1693 return 0;
1694 set_buffer_uptodate(bh); 1694 set_buffer_uptodate(bh);
1695 return ext4_handle_dirty_metadata(handle, NULL, bh); 1695 return ext4_handle_dirty_metadata(handle, NULL, bh);
1696 } 1696 }
1697 1697
1698 static int ext4_generic_write_end(struct file *file, 1698 static int ext4_generic_write_end(struct file *file,
1699 struct address_space *mapping, 1699 struct address_space *mapping,
1700 loff_t pos, unsigned len, unsigned copied, 1700 loff_t pos, unsigned len, unsigned copied,
1701 struct page *page, void *fsdata) 1701 struct page *page, void *fsdata)
1702 { 1702 {
1703 int i_size_changed = 0; 1703 int i_size_changed = 0;
1704 struct inode *inode = mapping->host; 1704 struct inode *inode = mapping->host;
1705 handle_t *handle = ext4_journal_current_handle(); 1705 handle_t *handle = ext4_journal_current_handle();
1706 1706
1707 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 1707 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1708 1708
1709 /* 1709 /*
1710 * No need to use i_size_read() here, the i_size 1710 * No need to use i_size_read() here, the i_size
1711 * cannot change under us because we hold i_mutex. 1711 * cannot change under us because we hold i_mutex.
1712 * 1712 *
1713 * But it's important to update i_size while still holding page lock: 1713 * But it's important to update i_size while still holding page lock:
1714 * page writeout could otherwise come in and zero beyond i_size. 1714 * page writeout could otherwise come in and zero beyond i_size.
1715 */ 1715 */
1716 if (pos + copied > inode->i_size) { 1716 if (pos + copied > inode->i_size) {
1717 i_size_write(inode, pos + copied); 1717 i_size_write(inode, pos + copied);
1718 i_size_changed = 1; 1718 i_size_changed = 1;
1719 } 1719 }
1720 1720
1721 if (pos + copied > EXT4_I(inode)->i_disksize) { 1721 if (pos + copied > EXT4_I(inode)->i_disksize) {
1722 /* We need to mark inode dirty even if 1722 /* We need to mark inode dirty even if
1723 * new_i_size is less than inode->i_size 1723 * new_i_size is less than inode->i_size
1724 * but greater than i_disksize (hint: delalloc) 1724 * but greater than i_disksize (hint: delalloc)
1725 */ 1725 */
1726 ext4_update_i_disksize(inode, (pos + copied)); 1726 ext4_update_i_disksize(inode, (pos + copied));
1727 i_size_changed = 1; 1727 i_size_changed = 1;
1728 } 1728 }
1729 unlock_page(page); 1729 unlock_page(page);
1730 page_cache_release(page); 1730 page_cache_release(page);
1731 1731
1732 /* 1732 /*
1733 * Don't mark the inode dirty under page lock. First, it unnecessarily 1733 * Don't mark the inode dirty under page lock. First, it unnecessarily
1734 * makes the holding time of page lock longer. Second, it forces lock 1734 * makes the holding time of page lock longer. Second, it forces lock
1735 * ordering of page lock and transaction start for journaling 1735 * ordering of page lock and transaction start for journaling
1736 * filesystems. 1736 * filesystems.
1737 */ 1737 */
1738 if (i_size_changed) 1738 if (i_size_changed)
1739 ext4_mark_inode_dirty(handle, inode); 1739 ext4_mark_inode_dirty(handle, inode);
1740 1740
1741 return copied; 1741 return copied;
1742 } 1742 }
1743 1743
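ext4 tracks two sizes per inode: the in-core i_size and i_disksize, the size the on-disk inode was last journalled with. ext4_generic_write_end() bumps whichever lags behind pos + copied and defers ext4_mark_inode_dirty() until the page lock is dropped. A toy model of just that size bookkeeping (all names invented for illustration):

    /* Toy model of the two-size update in ext4_generic_write_end(). */
    struct toy_inode {
        long long i_size;       /* in-core size, updated under page lock */
        long long i_disksize;   /* size the on-disk inode claims */
    };

    /* Returns 1 if the caller must mark the inode dirty afterwards. */
    static int update_sizes(struct toy_inode *inode, long long pos,
                            long long copied)
    {
        long long end = pos + copied;
        int dirty = 0;

        if (end > inode->i_size) {
            inode->i_size = end;        /* i_size_write() under page lock */
            dirty = 1;
        }
        if (end > inode->i_disksize) {
            inode->i_disksize = end;    /* delalloc can leave this behind */
            dirty = 1;
        }
        return dirty;                   /* dirtying happens after unlock_page() */
    }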
1744 /* 1744 /*
1745 * We need to pick up the new inode size which generic_commit_write gave us. 1745 * We need to pick up the new inode size which generic_commit_write gave us.
1746 * `file' can be NULL - eg, when called from page_symlink(). 1746 * `file' can be NULL - eg, when called from page_symlink().
1747 * 1747 *
1748 * ext4 never places buffers on inode->i_mapping->private_list. Metadata 1748 * ext4 never places buffers on inode->i_mapping->private_list. Metadata
1749 * buffers are managed internally. 1749 * buffers are managed internally.
1750 */ 1750 */
1751 static int ext4_ordered_write_end(struct file *file, 1751 static int ext4_ordered_write_end(struct file *file,
1752 struct address_space *mapping, 1752 struct address_space *mapping,
1753 loff_t pos, unsigned len, unsigned copied, 1753 loff_t pos, unsigned len, unsigned copied,
1754 struct page *page, void *fsdata) 1754 struct page *page, void *fsdata)
1755 { 1755 {
1756 handle_t *handle = ext4_journal_current_handle(); 1756 handle_t *handle = ext4_journal_current_handle();
1757 struct inode *inode = mapping->host; 1757 struct inode *inode = mapping->host;
1758 int ret = 0, ret2; 1758 int ret = 0, ret2;
1759 1759
1760 trace_ext4_ordered_write_end(inode, pos, len, copied); 1760 trace_ext4_ordered_write_end(inode, pos, len, copied);
1761 ret = ext4_jbd2_file_inode(handle, inode); 1761 ret = ext4_jbd2_file_inode(handle, inode);
1762 1762
1763 if (ret == 0) { 1763 if (ret == 0) {
1764 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, 1764 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1765 page, fsdata); 1765 page, fsdata);
1766 copied = ret2; 1766 copied = ret2;
1767 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1767 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1768 /* if we have allocated more blocks and copied 1768 /* if we have allocated more blocks and copied
1769 * less, we will have blocks allocated outside 1769 * less, we will have blocks allocated outside
1770 * inode->i_size, so truncate them 1770 * inode->i_size, so truncate them
1771 */ 1771 */
1772 ext4_orphan_add(handle, inode); 1772 ext4_orphan_add(handle, inode);
1773 if (ret2 < 0) 1773 if (ret2 < 0)
1774 ret = ret2; 1774 ret = ret2;
1775 } 1775 }
1776 ret2 = ext4_journal_stop(handle); 1776 ret2 = ext4_journal_stop(handle);
1777 if (!ret) 1777 if (!ret)
1778 ret = ret2; 1778 ret = ret2;
1779 1779
1780 if (pos + len > inode->i_size) { 1780 if (pos + len > inode->i_size) {
1781 ext4_truncate_failed_write(inode); 1781 ext4_truncate_failed_write(inode);
1782 /* 1782 /*
1783 * If truncate failed early the inode might still be 1783 * If truncate failed early the inode might still be
1784 * on the orphan list; we need to make sure the inode 1784 * on the orphan list; we need to make sure the inode
1785 * is removed from the orphan list in that case. 1785 * is removed from the orphan list in that case.
1786 */ 1786 */
1787 if (inode->i_nlink) 1787 if (inode->i_nlink)
1788 ext4_orphan_del(NULL, inode); 1788 ext4_orphan_del(NULL, inode);
1789 } 1789 }
1790 1790
1791 1791
1792 return ret ? ret : copied; 1792 return ret ? ret : copied;
1793 } 1793 }
1794 1794
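All the write_end variants above share the same error idiom: every cleanup step (journal stop, orphan processing) must run regardless of earlier failures, but only the first error is reported. A compact sketch of the pattern, with hypothetical step functions standing in for the real calls:

    /* Hypothetical steps standing in for journal stop, orphan handling, etc. */
    static int step_one(void)   { return 0; }
    static int step_two(void)   { return -5; }  /* pretend -EIO */
    static int step_three(void) { return 0; }

    /* Run every step unconditionally, but report only the first failure. */
    static int first_error_demo(void)
    {
        int ret = 0, ret2;

        ret = step_one();
        ret2 = step_two();          /* must run even if step_one failed */
        if (!ret)
            ret = ret2;
        ret2 = step_three();
        if (!ret)
            ret = ret2;
        return ret;                 /* here: -5, from step_two */
    }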
1795 static int ext4_writeback_write_end(struct file *file, 1795 static int ext4_writeback_write_end(struct file *file,
1796 struct address_space *mapping, 1796 struct address_space *mapping,
1797 loff_t pos, unsigned len, unsigned copied, 1797 loff_t pos, unsigned len, unsigned copied,
1798 struct page *page, void *fsdata) 1798 struct page *page, void *fsdata)
1799 { 1799 {
1800 handle_t *handle = ext4_journal_current_handle(); 1800 handle_t *handle = ext4_journal_current_handle();
1801 struct inode *inode = mapping->host; 1801 struct inode *inode = mapping->host;
1802 int ret = 0, ret2; 1802 int ret = 0, ret2;
1803 1803
1804 trace_ext4_writeback_write_end(inode, pos, len, copied); 1804 trace_ext4_writeback_write_end(inode, pos, len, copied);
1805 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, 1805 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1806 page, fsdata); 1806 page, fsdata);
1807 copied = ret2; 1807 copied = ret2;
1808 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1808 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1809 /* if we have allocated more blocks and copied 1809 /* if we have allocated more blocks and copied
1810 * less, we will have blocks allocated outside 1810 * less, we will have blocks allocated outside
1811 * inode->i_size, so truncate them 1811 * inode->i_size, so truncate them
1812 */ 1812 */
1813 ext4_orphan_add(handle, inode); 1813 ext4_orphan_add(handle, inode);
1814 1814
1815 if (ret2 < 0) 1815 if (ret2 < 0)
1816 ret = ret2; 1816 ret = ret2;
1817 1817
1818 ret2 = ext4_journal_stop(handle); 1818 ret2 = ext4_journal_stop(handle);
1819 if (!ret) 1819 if (!ret)
1820 ret = ret2; 1820 ret = ret2;
1821 1821
1822 if (pos + len > inode->i_size) { 1822 if (pos + len > inode->i_size) {
1823 ext4_truncate_failed_write(inode); 1823 ext4_truncate_failed_write(inode);
1824 /* 1824 /*
1825 * If truncate failed early the inode might still be 1825 * If truncate failed early the inode might still be
1826 * on the orphan list; we need to make sure the inode 1826 * on the orphan list; we need to make sure the inode
1827 * is removed from the orphan list in that case. 1827 * is removed from the orphan list in that case.
1828 */ 1828 */
1829 if (inode->i_nlink) 1829 if (inode->i_nlink)
1830 ext4_orphan_del(NULL, inode); 1830 ext4_orphan_del(NULL, inode);
1831 } 1831 }
1832 1832
1833 return ret ? ret : copied; 1833 return ret ? ret : copied;
1834 } 1834 }
1835 1835
1836 static int ext4_journalled_write_end(struct file *file, 1836 static int ext4_journalled_write_end(struct file *file,
1837 struct address_space *mapping, 1837 struct address_space *mapping,
1838 loff_t pos, unsigned len, unsigned copied, 1838 loff_t pos, unsigned len, unsigned copied,
1839 struct page *page, void *fsdata) 1839 struct page *page, void *fsdata)
1840 { 1840 {
1841 handle_t *handle = ext4_journal_current_handle(); 1841 handle_t *handle = ext4_journal_current_handle();
1842 struct inode *inode = mapping->host; 1842 struct inode *inode = mapping->host;
1843 int ret = 0, ret2; 1843 int ret = 0, ret2;
1844 int partial = 0; 1844 int partial = 0;
1845 unsigned from, to; 1845 unsigned from, to;
1846 loff_t new_i_size; 1846 loff_t new_i_size;
1847 1847
1848 trace_ext4_journalled_write_end(inode, pos, len, copied); 1848 trace_ext4_journalled_write_end(inode, pos, len, copied);
1849 from = pos & (PAGE_CACHE_SIZE - 1); 1849 from = pos & (PAGE_CACHE_SIZE - 1);
1850 to = from + len; 1850 to = from + len;
1851 1851
1852 if (copied < len) { 1852 if (copied < len) {
1853 if (!PageUptodate(page)) 1853 if (!PageUptodate(page))
1854 copied = 0; 1854 copied = 0;
1855 page_zero_new_buffers(page, from+copied, to); 1855 page_zero_new_buffers(page, from+copied, to);
1856 } 1856 }
1857 1857
1858 ret = walk_page_buffers(handle, page_buffers(page), from, 1858 ret = walk_page_buffers(handle, page_buffers(page), from,
1859 to, &partial, write_end_fn); 1859 to, &partial, write_end_fn);
1860 if (!partial) 1860 if (!partial)
1861 SetPageUptodate(page); 1861 SetPageUptodate(page);
1862 new_i_size = pos + copied; 1862 new_i_size = pos + copied;
1863 if (new_i_size > inode->i_size) 1863 if (new_i_size > inode->i_size)
1864 i_size_write(inode, pos+copied); 1864 i_size_write(inode, pos+copied);
1865 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 1865 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1866 if (new_i_size > EXT4_I(inode)->i_disksize) { 1866 if (new_i_size > EXT4_I(inode)->i_disksize) {
1867 ext4_update_i_disksize(inode, new_i_size); 1867 ext4_update_i_disksize(inode, new_i_size);
1868 ret2 = ext4_mark_inode_dirty(handle, inode); 1868 ret2 = ext4_mark_inode_dirty(handle, inode);
1869 if (!ret) 1869 if (!ret)
1870 ret = ret2; 1870 ret = ret2;
1871 } 1871 }
1872 1872
1873 unlock_page(page); 1873 unlock_page(page);
1874 page_cache_release(page); 1874 page_cache_release(page);
1875 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1875 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1876 /* if we have allocated more blocks and copied 1876 /* if we have allocated more blocks and copied
1877 * less, we will have blocks allocated outside 1877 * less, we will have blocks allocated outside
1878 * inode->i_size, so truncate them 1878 * inode->i_size, so truncate them
1879 */ 1879 */
1880 ext4_orphan_add(handle, inode); 1880 ext4_orphan_add(handle, inode);
1881 1881
1882 ret2 = ext4_journal_stop(handle); 1882 ret2 = ext4_journal_stop(handle);
1883 if (!ret) 1883 if (!ret)
1884 ret = ret2; 1884 ret = ret2;
1885 if (pos + len > inode->i_size) { 1885 if (pos + len > inode->i_size) {
1886 ext4_truncate_failed_write(inode); 1886 ext4_truncate_failed_write(inode);
1887 /* 1887 /*
1888 * If truncate failed early the inode might still be 1888 * If truncate failed early the inode might still be
1889 * on the orphan list; we need to make sure the inode 1889 * on the orphan list; we need to make sure the inode
1890 * is removed from the orphan list in that case. 1890 * is removed from the orphan list in that case.
1891 */ 1891 */
1892 if (inode->i_nlink) 1892 if (inode->i_nlink)
1893 ext4_orphan_del(NULL, inode); 1893 ext4_orphan_del(NULL, inode);
1894 } 1894 }
1895 1895
1896 return ret ? ret : copied; 1896 return ret ? ret : copied;
1897 } 1897 }
1898 1898
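One subtlety in the journalled path above: if the copy from userspace was short and the page was not already uptodate, the partially copied data cannot be exposed, so copied is clamped to zero before page_zero_new_buffers() zeroes the rest. A toy model of that clamp (page_uptodate stands in for PageUptodate(page)):

    /* Toy model of the short-copy clamp in ext4_journalled_write_end(). */
    static unsigned int clamp_copied(unsigned int len, unsigned int copied,
                                     int page_uptodate)
    {
        if (copied < len && !page_uptodate)
            copied = 0;     /* can't expose a half-copied, half-stale page */
        return copied;      /* caller then zeroes [from + copied, to) */
    }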
1899 /* 1899 /*
1900 * Reserve a single block located at lblock 1900 * Reserve a single block located at lblock
1901 */ 1901 */
1902 static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) 1902 static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1903 { 1903 {
1904 int retries = 0; 1904 int retries = 0;
1905 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1905 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1906 struct ext4_inode_info *ei = EXT4_I(inode); 1906 struct ext4_inode_info *ei = EXT4_I(inode);
1907 unsigned long md_needed; 1907 unsigned long md_needed;
1908 int ret; 1908 int ret;
1909 1909
1910 /* 1910 /*
1911 * recalculate the number of metadata blocks to reserve 1911 * recalculate the number of metadata blocks to reserve
1912 * in order to allocate nrblocks 1912 * in order to allocate nrblocks
1913 * worst case is one extent per block 1913 * worst case is one extent per block
1914 */ 1914 */
1915 repeat: 1915 repeat:
1916 spin_lock(&ei->i_block_reservation_lock); 1916 spin_lock(&ei->i_block_reservation_lock);
1917 md_needed = ext4_calc_metadata_amount(inode, lblock); 1917 md_needed = ext4_calc_metadata_amount(inode, lblock);
1918 trace_ext4_da_reserve_space(inode, md_needed); 1918 trace_ext4_da_reserve_space(inode, md_needed);
1919 spin_unlock(&ei->i_block_reservation_lock); 1919 spin_unlock(&ei->i_block_reservation_lock);
1920 1920
1921 /* 1921 /*
1922 * We will charge metadata quota at writeout time; this saves 1922 * We will charge metadata quota at writeout time; this saves
1923 * us from metadata over-estimation, though we may go over by 1923 * us from metadata over-estimation, though we may go over by
1924 * a small amount in the end. Here we just reserve for data. 1924 * a small amount in the end. Here we just reserve for data.
1925 */ 1925 */
1926 ret = dquot_reserve_block(inode, 1); 1926 ret = dquot_reserve_block(inode, 1);
1927 if (ret) 1927 if (ret)
1928 return ret; 1928 return ret;
1929 /* 1929 /*
1930 * We do still charge estimated metadata to the sb though; 1930 * We do still charge estimated metadata to the sb though;
1931 * we cannot afford to run out of free blocks. 1931 * we cannot afford to run out of free blocks.
1932 */ 1932 */
1933 if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) { 1933 if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
1934 dquot_release_reservation_block(inode, 1); 1934 dquot_release_reservation_block(inode, 1);
1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1936 yield(); 1936 yield();
1937 goto repeat; 1937 goto repeat;
1938 } 1938 }
1939 return -ENOSPC; 1939 return -ENOSPC;
1940 } 1940 }
1941 spin_lock(&ei->i_block_reservation_lock); 1941 spin_lock(&ei->i_block_reservation_lock);
1942 ei->i_reserved_data_blocks++; 1942 ei->i_reserved_data_blocks++;
1943 ei->i_reserved_meta_blocks += md_needed; 1943 ei->i_reserved_meta_blocks += md_needed;
1944 spin_unlock(&ei->i_block_reservation_lock); 1944 spin_unlock(&ei->i_block_reservation_lock);
1945 1945
1946 return 0; /* success */ 1946 return 0; /* success */
1947 } 1947 }
1948 1948
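The reservation above is two-phase: the quota reservation is taken first, and if claiming free blocks from the superblock counters fails, the quota is rolled back before retrying (ext4_should_retry_alloc() waits for a journal commit that may free blocks). A userspace sketch of the shape of that dance, with invented helpers and a bounded retry standing in for the journal wait:

    /* Invented helpers; the real code uses dquot_reserve_block() and
     * ext4_claim_free_blocks(). */
    static int quota_reserve(int n)     { (void)n; return 0; }
    static void quota_release(int n)    { (void)n; }
    static int claim_free_blocks(int n) { (void)n; return 0; }

    static int reserve_one_block(void)
    {
        int retries = 0;

    repeat:
        if (quota_reserve(1))
            return -1;                  /* quota exhausted */
        if (claim_free_blocks(1)) {
            quota_release(1);           /* undo in reverse order */
            if (retries++ < 3)          /* ext4 waits on a journal commit here */
                goto repeat;
            return -1;                  /* -ENOSPC in the real code */
        }
        return 0;
    }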
1949 static void ext4_da_release_space(struct inode *inode, int to_free) 1949 static void ext4_da_release_space(struct inode *inode, int to_free)
1950 { 1950 {
1951 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1951 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1952 struct ext4_inode_info *ei = EXT4_I(inode); 1952 struct ext4_inode_info *ei = EXT4_I(inode);
1953 1953
1954 if (!to_free) 1954 if (!to_free)
1955 return; /* Nothing to release, exit */ 1955 return; /* Nothing to release, exit */
1956 1956
1957 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1957 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1958 1958
1959 trace_ext4_da_release_space(inode, to_free); 1959 trace_ext4_da_release_space(inode, to_free);
1960 if (unlikely(to_free > ei->i_reserved_data_blocks)) { 1960 if (unlikely(to_free > ei->i_reserved_data_blocks)) {
1961 /* 1961 /*
1962 * if there aren't enough reserved blocks, then the 1962 * if there aren't enough reserved blocks, then the
1963 * counter is messed up somewhere. Since this 1963 * counter is messed up somewhere. Since this
1964 * function is called from invalidatepage, it's 1964 * function is called from invalidatepage, it's
1965 * harmless to return without any action. 1965 * harmless to return without any action.
1966 */ 1966 */
1967 ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " 1967 ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
1968 "ino %lu, to_free %d with only %d reserved " 1968 "ino %lu, to_free %d with only %d reserved "
1969 "data blocks\n", inode->i_ino, to_free, 1969 "data blocks\n", inode->i_ino, to_free,
1970 ei->i_reserved_data_blocks); 1970 ei->i_reserved_data_blocks);
1971 WARN_ON(1); 1971 WARN_ON(1);
1972 to_free = ei->i_reserved_data_blocks; 1972 to_free = ei->i_reserved_data_blocks;
1973 } 1973 }
1974 ei->i_reserved_data_blocks -= to_free; 1974 ei->i_reserved_data_blocks -= to_free;
1975 1975
1976 if (ei->i_reserved_data_blocks == 0) { 1976 if (ei->i_reserved_data_blocks == 0) {
1977 /* 1977 /*
1978 * We can release all of the reserved metadata blocks 1978 * We can release all of the reserved metadata blocks
1979 * only when we have written all of the delayed 1979 * only when we have written all of the delayed
1980 * allocation blocks. 1980 * allocation blocks.
1981 */ 1981 */
1982 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 1982 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1983 ei->i_reserved_meta_blocks); 1983 ei->i_reserved_meta_blocks);
1984 ei->i_reserved_meta_blocks = 0; 1984 ei->i_reserved_meta_blocks = 0;
1985 ei->i_da_metadata_calc_len = 0; 1985 ei->i_da_metadata_calc_len = 0;
1986 } 1986 }
1987 1987
1988 /* update fs dirty data blocks counter */ 1988 /* update fs dirty data blocks counter */
1989 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); 1989 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
1990 1990
1991 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1991 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1992 1992
1993 dquot_release_reservation_block(inode, to_free); 1993 dquot_release_reservation_block(inode, to_free);
1994 } 1994 }
1995 1995
1996 static void ext4_da_page_release_reservation(struct page *page, 1996 static void ext4_da_page_release_reservation(struct page *page,
1997 unsigned long offset) 1997 unsigned long offset)
1998 { 1998 {
1999 int to_release = 0; 1999 int to_release = 0;
2000 struct buffer_head *head, *bh; 2000 struct buffer_head *head, *bh;
2001 unsigned int curr_off = 0; 2001 unsigned int curr_off = 0;
2002 2002
2003 head = page_buffers(page); 2003 head = page_buffers(page);
2004 bh = head; 2004 bh = head;
2005 do { 2005 do {
2006 unsigned int next_off = curr_off + bh->b_size; 2006 unsigned int next_off = curr_off + bh->b_size;
2007 2007
2008 if ((offset <= curr_off) && (buffer_delay(bh))) { 2008 if ((offset <= curr_off) && (buffer_delay(bh))) {
2009 to_release++; 2009 to_release++;
2010 clear_buffer_delay(bh); 2010 clear_buffer_delay(bh);
2011 } 2011 }
2012 curr_off = next_off; 2012 curr_off = next_off;
2013 } while ((bh = bh->b_this_page) != head); 2013 } while ((bh = bh->b_this_page) != head);
2014 ext4_da_release_space(page->mapping->host, to_release); 2014 ext4_da_release_space(page->mapping->host, to_release);
2015 } 2015 }
2016 2016
2017 /* 2017 /*
2018 * Delayed allocation stuff 2018 * Delayed allocation stuff
2019 */ 2019 */
2020 2020
2021 /* 2021 /*
2022 * mpage_da_submit_io - walks through the extent of pages and tries to write 2022 * mpage_da_submit_io - walks through the extent of pages and tries to write
2023 * them with the writepage() callback 2023 * them with the writepage() callback
2024 * 2024 *
2025 * @mpd->inode: inode 2025 * @mpd->inode: inode
2026 * @mpd->first_page: first page of the extent 2026 * @mpd->first_page: first page of the extent
2027 * @mpd->next_page: page after the last page of the extent 2027 * @mpd->next_page: page after the last page of the extent
2028 * 2028 *
2029 * By the time mpage_da_submit_io() is called we expect all blocks 2029 * By the time mpage_da_submit_io() is called we expect all blocks
2030 * to be allocated. This may be wrong if allocation failed. 2030 * to be allocated. This may be wrong if allocation failed.
2031 * 2031 *
2032 * As pages are already locked by write_cache_pages(), we can't use it 2032 * As pages are already locked by write_cache_pages(), we can't use it
2033 */ 2033 */
2034 static int mpage_da_submit_io(struct mpage_da_data *mpd, 2034 static int mpage_da_submit_io(struct mpage_da_data *mpd,
2035 struct ext4_map_blocks *map) 2035 struct ext4_map_blocks *map)
2036 { 2036 {
2037 struct pagevec pvec; 2037 struct pagevec pvec;
2038 unsigned long index, end; 2038 unsigned long index, end;
2039 int ret = 0, err, nr_pages, i; 2039 int ret = 0, err, nr_pages, i;
2040 struct inode *inode = mpd->inode; 2040 struct inode *inode = mpd->inode;
2041 struct address_space *mapping = inode->i_mapping; 2041 struct address_space *mapping = inode->i_mapping;
2042 loff_t size = i_size_read(inode); 2042 loff_t size = i_size_read(inode);
2043 unsigned int len, block_start; 2043 unsigned int len, block_start;
2044 struct buffer_head *bh, *page_bufs = NULL; 2044 struct buffer_head *bh, *page_bufs = NULL;
2045 int journal_data = ext4_should_journal_data(inode); 2045 int journal_data = ext4_should_journal_data(inode);
2046 sector_t pblock = 0, cur_logical = 0; 2046 sector_t pblock = 0, cur_logical = 0;
2047 struct ext4_io_submit io_submit; 2047 struct ext4_io_submit io_submit;
2048 2048
2049 BUG_ON(mpd->next_page <= mpd->first_page); 2049 BUG_ON(mpd->next_page <= mpd->first_page);
2050 memset(&io_submit, 0, sizeof(io_submit)); 2050 memset(&io_submit, 0, sizeof(io_submit));
2051 /* 2051 /*
2052 * We need to start from the first_page to the next_page - 1 2052 * We need to start from the first_page to the next_page - 1
2053 * to make sure we also write the mapped dirty buffer_heads. 2053 * to make sure we also write the mapped dirty buffer_heads.
2054 * If we look at mpd->b_blocknr we would only be looking 2054 * If we look at mpd->b_blocknr we would only be looking
2055 * at the currently mapped buffer_heads. 2055 * at the currently mapped buffer_heads.
2056 */ 2056 */
2057 index = mpd->first_page; 2057 index = mpd->first_page;
2058 end = mpd->next_page - 1; 2058 end = mpd->next_page - 1;
2059 2059
2060 pagevec_init(&pvec, 0); 2060 pagevec_init(&pvec, 0);
2061 while (index <= end) { 2061 while (index <= end) {
2062 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 2062 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2063 if (nr_pages == 0) 2063 if (nr_pages == 0)
2064 break; 2064 break;
2065 for (i = 0; i < nr_pages; i++) { 2065 for (i = 0; i < nr_pages; i++) {
2066 int commit_write = 0, skip_page = 0; 2066 int commit_write = 0, skip_page = 0;
2067 struct page *page = pvec.pages[i]; 2067 struct page *page = pvec.pages[i];
2068 2068
2069 index = page->index; 2069 index = page->index;
2070 if (index > end) 2070 if (index > end)
2071 break; 2071 break;
2072 2072
2073 if (index == size >> PAGE_CACHE_SHIFT) 2073 if (index == size >> PAGE_CACHE_SHIFT)
2074 len = size & ~PAGE_CACHE_MASK; 2074 len = size & ~PAGE_CACHE_MASK;
2075 else 2075 else
2076 len = PAGE_CACHE_SIZE; 2076 len = PAGE_CACHE_SIZE;
2077 if (map) { 2077 if (map) {
2078 cur_logical = index << (PAGE_CACHE_SHIFT - 2078 cur_logical = index << (PAGE_CACHE_SHIFT -
2079 inode->i_blkbits); 2079 inode->i_blkbits);
2080 pblock = map->m_pblk + (cur_logical - 2080 pblock = map->m_pblk + (cur_logical -
2081 map->m_lblk); 2081 map->m_lblk);
2082 } 2082 }
2083 index++; 2083 index++;
2084 2084
2085 BUG_ON(!PageLocked(page)); 2085 BUG_ON(!PageLocked(page));
2086 BUG_ON(PageWriteback(page)); 2086 BUG_ON(PageWriteback(page));
2087 2087
2088 /* 2088 /*
2089 * If the page does not have buffers (for 2089 * If the page does not have buffers (for
2090 * whatever reason), try to create them using 2090 * whatever reason), try to create them using
2091 * __block_write_begin. If this fails, 2091 * __block_write_begin. If this fails,
2092 * skip the page and move on. 2092 * skip the page and move on.
2093 */ 2093 */
2094 if (!page_has_buffers(page)) { 2094 if (!page_has_buffers(page)) {
2095 if (__block_write_begin(page, 0, len, 2095 if (__block_write_begin(page, 0, len,
2096 noalloc_get_block_write)) { 2096 noalloc_get_block_write)) {
2097 skip_page: 2097 skip_page:
2098 unlock_page(page); 2098 unlock_page(page);
2099 continue; 2099 continue;
2100 } 2100 }
2101 commit_write = 1; 2101 commit_write = 1;
2102 } 2102 }
2103 2103
2104 bh = page_bufs = page_buffers(page); 2104 bh = page_bufs = page_buffers(page);
2105 block_start = 0; 2105 block_start = 0;
2106 do { 2106 do {
2107 if (!bh) 2107 if (!bh)
2108 goto skip_page; 2108 goto skip_page;
2109 if (map && (cur_logical >= map->m_lblk) && 2109 if (map && (cur_logical >= map->m_lblk) &&
2110 (cur_logical <= (map->m_lblk + 2110 (cur_logical <= (map->m_lblk +
2111 (map->m_len - 1)))) { 2111 (map->m_len - 1)))) {
2112 if (buffer_delay(bh)) { 2112 if (buffer_delay(bh)) {
2113 clear_buffer_delay(bh); 2113 clear_buffer_delay(bh);
2114 bh->b_blocknr = pblock; 2114 bh->b_blocknr = pblock;
2115 } 2115 }
2116 if (buffer_unwritten(bh) || 2116 if (buffer_unwritten(bh) ||
2117 buffer_mapped(bh)) 2117 buffer_mapped(bh))
2118 BUG_ON(bh->b_blocknr != pblock); 2118 BUG_ON(bh->b_blocknr != pblock);
2119 if (map->m_flags & EXT4_MAP_UNINIT) 2119 if (map->m_flags & EXT4_MAP_UNINIT)
2120 set_buffer_uninit(bh); 2120 set_buffer_uninit(bh);
2121 clear_buffer_unwritten(bh); 2121 clear_buffer_unwritten(bh);
2122 } 2122 }
2123 2123
2124 /* skip page if block allocation undone */ 2124 /* skip page if block allocation undone */
2125 if (buffer_delay(bh) || buffer_unwritten(bh)) 2125 if (buffer_delay(bh) || buffer_unwritten(bh))
2126 skip_page = 1; 2126 skip_page = 1;
2127 bh = bh->b_this_page; 2127 bh = bh->b_this_page;
2128 block_start += bh->b_size; 2128 block_start += bh->b_size;
2129 cur_logical++; 2129 cur_logical++;
2130 pblock++; 2130 pblock++;
2131 } while (bh != page_bufs); 2131 } while (bh != page_bufs);
2132 2132
2133 if (skip_page) 2133 if (skip_page)
2134 goto skip_page; 2134 goto skip_page;
2135 2135
2136 if (commit_write) 2136 if (commit_write)
2137 /* mark the buffer_heads as dirty & uptodate */ 2137 /* mark the buffer_heads as dirty & uptodate */
2138 block_commit_write(page, 0, len); 2138 block_commit_write(page, 0, len);
2139 2139
2140 clear_page_dirty_for_io(page); 2140 clear_page_dirty_for_io(page);
2141 /* 2141 /*
2142 * Delalloc doesn't support data journalling, 2142 * Delalloc doesn't support data journalling,
2143 * but eventually maybe we'll lift this 2143 * but eventually maybe we'll lift this
2144 * restriction. 2144 * restriction.
2145 */ 2145 */
2146 if (unlikely(journal_data && PageChecked(page))) 2146 if (unlikely(journal_data && PageChecked(page)))
2147 err = __ext4_journalled_writepage(page, len); 2147 err = __ext4_journalled_writepage(page, len);
2148 else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) 2148 else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
2149 err = ext4_bio_write_page(&io_submit, page, 2149 err = ext4_bio_write_page(&io_submit, page,
2150 len, mpd->wbc); 2150 len, mpd->wbc);
2151 else 2151 else
2152 err = block_write_full_page(page, 2152 err = block_write_full_page(page,
2153 noalloc_get_block_write, mpd->wbc); 2153 noalloc_get_block_write, mpd->wbc);
2154 2154
2155 if (!err) 2155 if (!err)
2156 mpd->pages_written++; 2156 mpd->pages_written++;
2157 /* 2157 /*
2158 * In error case, we have to continue because 2158 * In error case, we have to continue because
2159 * remaining pages are still locked 2159 * remaining pages are still locked
2160 */ 2160 */
2161 if (ret == 0) 2161 if (ret == 0)
2162 ret = err; 2162 ret = err;
2163 } 2163 }
2164 pagevec_release(&pvec); 2164 pagevec_release(&pvec);
2165 } 2165 }
2166 ext4_io_submit(&io_submit); 2166 ext4_io_submit(&io_submit);
2167 return ret; 2167 return ret;
2168 } 2168 }
2169 2169
2170 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) 2170 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
2171 { 2171 {
2172 int nr_pages, i; 2172 int nr_pages, i;
2173 pgoff_t index, end; 2173 pgoff_t index, end;
2174 struct pagevec pvec; 2174 struct pagevec pvec;
2175 struct inode *inode = mpd->inode; 2175 struct inode *inode = mpd->inode;
2176 struct address_space *mapping = inode->i_mapping; 2176 struct address_space *mapping = inode->i_mapping;
2177 2177
2178 index = mpd->first_page; 2178 index = mpd->first_page;
2179 end = mpd->next_page - 1; 2179 end = mpd->next_page - 1;
2180 while (index <= end) { 2180 while (index <= end) {
2181 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 2181 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2182 if (nr_pages == 0) 2182 if (nr_pages == 0)
2183 break; 2183 break;
2184 for (i = 0; i < nr_pages; i++) { 2184 for (i = 0; i < nr_pages; i++) {
2185 struct page *page = pvec.pages[i]; 2185 struct page *page = pvec.pages[i];
2186 if (page->index > end) 2186 if (page->index > end)
2187 break; 2187 break;
2188 BUG_ON(!PageLocked(page)); 2188 BUG_ON(!PageLocked(page));
2189 BUG_ON(PageWriteback(page)); 2189 BUG_ON(PageWriteback(page));
2190 block_invalidatepage(page, 0); 2190 block_invalidatepage(page, 0);
2191 ClearPageUptodate(page); 2191 ClearPageUptodate(page);
2192 unlock_page(page); 2192 unlock_page(page);
2193 } 2193 }
2194 index = pvec.pages[nr_pages - 1]->index + 1; 2194 index = pvec.pages[nr_pages - 1]->index + 1;
2195 pagevec_release(&pvec); 2195 pagevec_release(&pvec);
2196 } 2196 }
2197 return; 2197 return;
2198 } 2198 }
2199 2199
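Both mpage_da_submit_io() and ext4_da_block_invalidatepages() walk a page range in fixed-size batches, resuming the lookup after the last page returned. A toy version of the batched walk; lookup_batch() is a hypothetical stand-in for pagevec_lookup(), and the batch size mirrors what PAGEVEC_SIZE was in kernels of this vintage:

    #define BATCH 14    /* assumed PAGEVEC_SIZE for this era */

    static unsigned long lookup_batch(unsigned long index, unsigned long n)
    {
        (void)index; (void)n;
        return 0;       /* stub: pretend the range is empty */
    }

    static void walk_range(unsigned long index, unsigned long end)
    {
        while (index <= end) {
            unsigned long got = lookup_batch(index, BATCH);
            if (got == 0)
                break;          /* nothing left in the mapping */
            /* ... process the batch, stopping at any page beyond 'end' ... */
            index += got;       /* real code resumes after the last page found */
        }
    }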
2200 static void ext4_print_free_blocks(struct inode *inode) 2200 static void ext4_print_free_blocks(struct inode *inode)
2201 { 2201 {
2202 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2202 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2203 printk(KERN_CRIT "Total free blocks count %lld\n", 2203 printk(KERN_CRIT "Total free blocks count %lld\n",
2204 ext4_count_free_blocks(inode->i_sb)); 2204 ext4_count_free_blocks(inode->i_sb));
2205 printk(KERN_CRIT "Free/Dirty block details\n"); 2205 printk(KERN_CRIT "Free/Dirty block details\n");
2206 printk(KERN_CRIT "free_blocks=%lld\n", 2206 printk(KERN_CRIT "free_blocks=%lld\n",
2207 (long long) percpu_counter_sum(&sbi->s_freeblocks_counter)); 2207 (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
2208 printk(KERN_CRIT "dirty_blocks=%lld\n", 2208 printk(KERN_CRIT "dirty_blocks=%lld\n",
2209 (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 2209 (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
2210 printk(KERN_CRIT "Block reservation details\n"); 2210 printk(KERN_CRIT "Block reservation details\n");
2211 printk(KERN_CRIT "i_reserved_data_blocks=%u\n", 2211 printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
2212 EXT4_I(inode)->i_reserved_data_blocks); 2212 EXT4_I(inode)->i_reserved_data_blocks);
2213 printk(KERN_CRIT "i_reserved_meta_blocks=%u\n", 2213 printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
2214 EXT4_I(inode)->i_reserved_meta_blocks); 2214 EXT4_I(inode)->i_reserved_meta_blocks);
2215 return; 2215 return;
2216 } 2216 }
2217 2217
2218 /* 2218 /*
2219 * mpage_da_map_and_submit - go through the given space, map it 2219 * mpage_da_map_and_submit - go through the given space, map it
2220 * if necessary, and then submit it for I/O 2220 * if necessary, and then submit it for I/O
2221 * 2221 *
2222 * @mpd - bh describing space 2222 * @mpd - bh describing space
2223 * 2223 *
2224 * The function skips space we know is already mapped to disk blocks. 2224 * The function skips space we know is already mapped to disk blocks.
2225 * 2225 *
2226 */ 2226 */
2227 static void mpage_da_map_and_submit(struct mpage_da_data *mpd) 2227 static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
2228 { 2228 {
2229 int err, blks, get_blocks_flags; 2229 int err, blks, get_blocks_flags;
2230 struct ext4_map_blocks map, *mapp = NULL; 2230 struct ext4_map_blocks map, *mapp = NULL;
2231 sector_t next = mpd->b_blocknr; 2231 sector_t next = mpd->b_blocknr;
2232 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; 2232 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2233 loff_t disksize = EXT4_I(mpd->inode)->i_disksize; 2233 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
2234 handle_t *handle = NULL; 2234 handle_t *handle = NULL;
2235 2235
2236 /* 2236 /*
2237 * If the blocks are mapped already, or we couldn't accumulate 2237 * If the blocks are mapped already, or we couldn't accumulate
2238 * any blocks, then proceed immediately to the submission stage. 2238 * any blocks, then proceed immediately to the submission stage.
2239 */ 2239 */
2240 if ((mpd->b_size == 0) || 2240 if ((mpd->b_size == 0) ||
2241 ((mpd->b_state & (1 << BH_Mapped)) && 2241 ((mpd->b_state & (1 << BH_Mapped)) &&
2242 !(mpd->b_state & (1 << BH_Delay)) && 2242 !(mpd->b_state & (1 << BH_Delay)) &&
2243 !(mpd->b_state & (1 << BH_Unwritten)))) 2243 !(mpd->b_state & (1 << BH_Unwritten))))
2244 goto submit_io; 2244 goto submit_io;
2245 2245
2246 handle = ext4_journal_current_handle(); 2246 handle = ext4_journal_current_handle();
2247 BUG_ON(!handle); 2247 BUG_ON(!handle);
2248 2248
2249 /* 2249 /*
2250 * Call ext4_map_blocks() to allocate any delayed allocation 2250 * Call ext4_map_blocks() to allocate any delayed allocation
2251 * blocks, or to convert an uninitialized extent to be 2251 * blocks, or to convert an uninitialized extent to be
2252 * initialized (in the case where we have written into 2252 * initialized (in the case where we have written into
2253 * one or more preallocated blocks). 2253 * one or more preallocated blocks).
2254 * 2254 *
2255 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to 2255 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
2256 * indicate that we are on the delayed allocation path. This 2256 * indicate that we are on the delayed allocation path. This
2257 * affects functions in many different parts of the allocation 2257 * affects functions in many different parts of the allocation
2258 * call path. This flag exists primarily because we don't 2258 * call path. This flag exists primarily because we don't
2259 * want to change *many* call functions, so ext4_map_blocks() 2259 * want to change *many* call functions, so ext4_map_blocks()
2260 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the 2260 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
2261 * inode's allocation semaphore is taken. 2261 * inode's allocation semaphore is taken.
2262 * 2262 *
2263 * If the blocks in question were delalloc blocks, set 2263 * If the blocks in question were delalloc blocks, set
2264 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting 2264 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
2265 * variables are updated after the blocks have been allocated. 2265 * variables are updated after the blocks have been allocated.
2266 */ 2266 */
2267 map.m_lblk = next; 2267 map.m_lblk = next;
2268 map.m_len = max_blocks; 2268 map.m_len = max_blocks;
2269 get_blocks_flags = EXT4_GET_BLOCKS_CREATE; 2269 get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
2270 if (ext4_should_dioread_nolock(mpd->inode)) 2270 if (ext4_should_dioread_nolock(mpd->inode))
2271 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; 2271 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2272 if (mpd->b_state & (1 << BH_Delay)) 2272 if (mpd->b_state & (1 << BH_Delay))
2273 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 2273 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2274 2274
2275 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); 2275 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
2276 if (blks < 0) { 2276 if (blks < 0) {
2277 struct super_block *sb = mpd->inode->i_sb; 2277 struct super_block *sb = mpd->inode->i_sb;
2278 2278
2279 err = blks; 2279 err = blks;
2280 /* 2280 /*
2281 * If get block returns EAGAIN or ENOSPC and there 2281 * If get block returns EAGAIN or ENOSPC and there
2282 * appear to be free blocks, we will just let 2282 * appear to be free blocks, we will just let
2283 * mpage_da_submit_io() unlock all of the pages. 2283 * mpage_da_submit_io() unlock all of the pages.
2284 */ 2284 */
2285 if (err == -EAGAIN) 2285 if (err == -EAGAIN)
2286 goto submit_io; 2286 goto submit_io;
2287 2287
2288 if (err == -ENOSPC && 2288 if (err == -ENOSPC &&
2289 ext4_count_free_blocks(sb)) { 2289 ext4_count_free_blocks(sb)) {
2290 mpd->retval = err; 2290 mpd->retval = err;
2291 goto submit_io; 2291 goto submit_io;
2292 } 2292 }
2293 2293
2294 /* 2294 /*
2295 * A get block failure will cause us to loop in 2295 * A get block failure will cause us to loop in
2296 * writepages, because a_ops->writepage won't be able 2296 * writepages, because a_ops->writepage won't be able
2297 * to make progress. The page will be redirtied by 2297 * to make progress. The page will be redirtied by
2298 * writepage and writepages will again try to write 2298 * writepage and writepages will again try to write
2299 * the same page. 2299 * the same page.
2300 */ 2300 */
2301 if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) { 2301 if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
2302 ext4_msg(sb, KERN_CRIT, 2302 ext4_msg(sb, KERN_CRIT,
2303 "delayed block allocation failed for inode %lu " 2303 "delayed block allocation failed for inode %lu "
2304 "at logical offset %llu with max blocks %zd " 2304 "at logical offset %llu with max blocks %zd "
2305 "with error %d", mpd->inode->i_ino, 2305 "with error %d", mpd->inode->i_ino,
2306 (unsigned long long) next, 2306 (unsigned long long) next,
2307 mpd->b_size >> mpd->inode->i_blkbits, err); 2307 mpd->b_size >> mpd->inode->i_blkbits, err);
2308 ext4_msg(sb, KERN_CRIT, 2308 ext4_msg(sb, KERN_CRIT,
2309 "This should not happen!! Data will be lost\n"); 2309 "This should not happen!! Data will be lost\n");
2310 if (err == -ENOSPC) 2310 if (err == -ENOSPC)
2311 ext4_print_free_blocks(mpd->inode); 2311 ext4_print_free_blocks(mpd->inode);
2312 } 2312 }
2313 /* invalidate all the pages */ 2313 /* invalidate all the pages */
2314 ext4_da_block_invalidatepages(mpd); 2314 ext4_da_block_invalidatepages(mpd);
2315 2315
2316 /* Mark this page range as having been completed */ 2316 /* Mark this page range as having been completed */
2317 mpd->io_done = 1; 2317 mpd->io_done = 1;
2318 return; 2318 return;
2319 } 2319 }
2320 BUG_ON(blks == 0); 2320 BUG_ON(blks == 0);
2321 2321
2322 mapp = &map; 2322 mapp = &map;
2323 if (map.m_flags & EXT4_MAP_NEW) { 2323 if (map.m_flags & EXT4_MAP_NEW) {
2324 struct block_device *bdev = mpd->inode->i_sb->s_bdev; 2324 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
2325 int i; 2325 int i;
2326 2326
2327 for (i = 0; i < map.m_len; i++) 2327 for (i = 0; i < map.m_len; i++)
2328 unmap_underlying_metadata(bdev, map.m_pblk + i); 2328 unmap_underlying_metadata(bdev, map.m_pblk + i);
2329 } 2329 }
2330 2330
2331 if (ext4_should_order_data(mpd->inode)) { 2331 if (ext4_should_order_data(mpd->inode)) {
2332 err = ext4_jbd2_file_inode(handle, mpd->inode); 2332 err = ext4_jbd2_file_inode(handle, mpd->inode);
2333 if (err) 2333 if (err)
2334 /* This only happens if the journal is aborted */ 2334 /* This only happens if the journal is aborted */
2335 return; 2335 return;
2336 } 2336 }
2337 2337
2338 /* 2338 /*
2339 * Update on-disk size along with block allocation. 2339 * Update on-disk size along with block allocation.
2340 */ 2340 */
2341 disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; 2341 disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
2342 if (disksize > i_size_read(mpd->inode)) 2342 if (disksize > i_size_read(mpd->inode))
2343 disksize = i_size_read(mpd->inode); 2343 disksize = i_size_read(mpd->inode);
2344 if (disksize > EXT4_I(mpd->inode)->i_disksize) { 2344 if (disksize > EXT4_I(mpd->inode)->i_disksize) {
2345 ext4_update_i_disksize(mpd->inode, disksize); 2345 ext4_update_i_disksize(mpd->inode, disksize);
2346 err = ext4_mark_inode_dirty(handle, mpd->inode); 2346 err = ext4_mark_inode_dirty(handle, mpd->inode);
2347 if (err) 2347 if (err)
2348 ext4_error(mpd->inode->i_sb, 2348 ext4_error(mpd->inode->i_sb,
2349 "Failed to mark inode %lu dirty", 2349 "Failed to mark inode %lu dirty",
2350 mpd->inode->i_ino); 2350 mpd->inode->i_ino);
2351 } 2351 }
2352 2352
2353 submit_io: 2353 submit_io:
2354 mpage_da_submit_io(mpd, mapp); 2354 mpage_da_submit_io(mpd, mapp);
2355 mpd->io_done = 1; 2355 mpd->io_done = 1;
2356 } 2356 }
2357 2357
2358 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ 2358 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
2359 (1 << BH_Delay) | (1 << BH_Unwritten)) 2359 (1 << BH_Delay) | (1 << BH_Unwritten))
2360 2360
2361 /* 2361 /*
2362 * mpage_add_bh_to_extent - try to add one more block to extent of blocks 2362 * mpage_add_bh_to_extent - try to add one more block to extent of blocks
2363 * 2363 *
2364 * @mpd->lbh - extent of blocks 2364 * @mpd->lbh - extent of blocks
2365 * @logical - logical number of the block in the file 2365 * @logical - logical number of the block in the file
2366 * @bh - bh of the block (used to access block's state) 2366 * @bh - bh of the block (used to access block's state)
2367 * 2367 *
2368 * The function is used to collect contiguous blocks in the same state 2368 * The function is used to collect contiguous blocks in the same state
2369 */ 2369 */
2370 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, 2370 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2371 sector_t logical, size_t b_size, 2371 sector_t logical, size_t b_size,
2372 unsigned long b_state) 2372 unsigned long b_state)
2373 { 2373 {
2374 sector_t next; 2374 sector_t next;
2375 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; 2375 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
2376 2376
2377 /* 2377 /*
2378 * XXX Don't go larger than mballoc is willing to allocate. 2378 * XXX Don't go larger than mballoc is willing to allocate.
2379 * This is a stopgap solution. We eventually need to fold 2379 * This is a stopgap solution. We eventually need to fold
2380 * mpage_da_submit_io() into this function and then call 2380 * mpage_da_submit_io() into this function and then call
2381 * ext4_map_blocks() multiple times in a loop 2381 * ext4_map_blocks() multiple times in a loop
2382 */ 2382 */
2383 if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize) 2383 if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
2384 goto flush_it; 2384 goto flush_it;
2385 2385
2386 /* check if the reserved journal credits might overflow */ 2386 /* check if the reserved journal credits might overflow */
2387 if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) { 2387 if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
2388 if (nrblocks >= EXT4_MAX_TRANS_DATA) { 2388 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
2389 /* 2389 /*
2390 * With non-extent format we are limited by the journal 2390 * With non-extent format we are limited by the journal
2391 * credit available. Total credit needed to insert 2391 * credit available. Total credit needed to insert
2392 * nrblocks contiguous blocks depends on 2392 * nrblocks contiguous blocks depends on
2393 * nrblocks. So limit nrblocks. 2393 * nrblocks. So limit nrblocks.
2394 */ 2394 */
2395 goto flush_it; 2395 goto flush_it;
2396 } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > 2396 } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
2397 EXT4_MAX_TRANS_DATA) { 2397 EXT4_MAX_TRANS_DATA) {
2398 /* 2398 /*
2399 * Adding the new buffer_head would make it cross the 2399 * Adding the new buffer_head would make it cross the
2400 * allowed limit for which we have journal credit 2400 * allowed limit for which we have journal credit
2401 * reserved. So limit the new bh->b_size 2401 * reserved. So limit the new bh->b_size
2402 */ 2402 */
2403 b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << 2403 b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
2404 mpd->inode->i_blkbits; 2404 mpd->inode->i_blkbits;
2405 /* we will do mpage_da_submit_io in the next loop */ 2405 /* we will do mpage_da_submit_io in the next loop */
2406 } 2406 }
2407 } 2407 }
2408 /* 2408 /*
2409 * First block in the extent 2409 * First block in the extent
2410 */ 2410 */
2411 if (mpd->b_size == 0) { 2411 if (mpd->b_size == 0) {
2412 mpd->b_blocknr = logical; 2412 mpd->b_blocknr = logical;
2413 mpd->b_size = b_size; 2413 mpd->b_size = b_size;
2414 mpd->b_state = b_state & BH_FLAGS; 2414 mpd->b_state = b_state & BH_FLAGS;
2415 return; 2415 return;
2416 } 2416 }
2417 2417
2418 next = mpd->b_blocknr + nrblocks; 2418 next = mpd->b_blocknr + nrblocks;
2419 /* 2419 /*
2420 * Can we merge the block to our big extent? 2420 * Can we merge the block to our big extent?
2421 */ 2421 */
2422 if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { 2422 if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
2423 mpd->b_size += b_size; 2423 mpd->b_size += b_size;
2424 return; 2424 return;
2425 } 2425 }
2426 2426
2427 flush_it: 2427 flush_it:
2428 /* 2428 /*
2429 * We couldn't merge the block to our extent, so we 2429 * We couldn't merge the block to our extent, so we
2430 * need to flush current extent and start new one 2430 * need to flush current extent and start new one
2431 */ 2431 */
2432 mpage_da_map_and_submit(mpd); 2432 mpage_da_map_and_submit(mpd);
2433 return; 2433 return;
2434 } 2434 }
2435 2435
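The merge test in mpage_add_bh_to_extent() is: the new block must be the next logical block, and its BH_FLAGS state bits must match the extent's; otherwise the accumulated extent is flushed. A compact model of that test (toy struct, illustrative only):

    /* Toy merge test mirroring mpage_add_bh_to_extent()'s core logic. */
    struct toy_extent {
        unsigned long start;    /* first logical block of the extent */
        unsigned long len;      /* blocks accumulated so far */
        unsigned long state;    /* BH_FLAGS-style state bits */
    };

    static int can_merge(const struct toy_extent *ext,
                         unsigned long logical, unsigned long state)
    {
        if (ext->len == 0)
            return 1;           /* empty extent: the block starts a new one */
        return logical == ext->start + ext->len     /* logically contiguous */
            && state == ext->state;                 /* same delay/unwritten bits */
    }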
2436 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) 2436 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
2437 { 2437 {
2438 return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); 2438 return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
2439 } 2439 }
2440 2440
2441 /* 2441 /*
2442 * This is a special get_blocks_t callback which is used by 2442 * This is a special get_blocks_t callback which is used by
2443 * ext4_da_write_begin(). It will either return mapped block or 2443 * ext4_da_write_begin(). It will either return mapped block or
2444 * reserve space for a single block. 2444 * reserve space for a single block.
2445 * 2445 *
2446 * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. 2446 * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
2447 * We also have b_blocknr = -1 and b_bdev initialized properly 2447 * We also have b_blocknr = -1 and b_bdev initialized properly
2448 * 2448 *
2449 * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. 2449 * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
2450 * We also have b_blocknr set to the physical block of the unwritten extent and b_bdev 2450 * We also have b_blocknr set to the physical block of the unwritten extent and b_bdev
2451 * initialized properly. 2451 * initialized properly.
2452 */ 2452 */
2453 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2453 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2454 struct buffer_head *bh, int create) 2454 struct buffer_head *bh, int create)
2455 { 2455 {
2456 struct ext4_map_blocks map; 2456 struct ext4_map_blocks map;
2457 int ret = 0; 2457 int ret = 0;
2458 sector_t invalid_block = ~((sector_t) 0xffff); 2458 sector_t invalid_block = ~((sector_t) 0xffff);
2459 2459
2460 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) 2460 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
2461 invalid_block = ~0; 2461 invalid_block = ~0;
2462 2462
2463 BUG_ON(create == 0); 2463 BUG_ON(create == 0);
2464 BUG_ON(bh->b_size != inode->i_sb->s_blocksize); 2464 BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
2465 2465
2466 map.m_lblk = iblock; 2466 map.m_lblk = iblock;
2467 map.m_len = 1; 2467 map.m_len = 1;
2468 2468
2469 /* 2469 /*
2470 * first, we need to know whether the block is allocated already; 2470 * first, we need to know whether the block is allocated already;
2471 * preallocated blocks are unmapped but should be treated 2471 * preallocated blocks are unmapped but should be treated
2472 * the same as allocated blocks. 2472 * the same as allocated blocks.
2473 */ 2473 */
2474 ret = ext4_map_blocks(NULL, inode, &map, 0); 2474 ret = ext4_map_blocks(NULL, inode, &map, 0);
2475 if (ret < 0) 2475 if (ret < 0)
2476 return ret; 2476 return ret;
2477 if (ret == 0) { 2477 if (ret == 0) {
2478 if (buffer_delay(bh)) 2478 if (buffer_delay(bh))
2479 return 0; /* Not sure this could or should happen */ 2479 return 0; /* Not sure this could or should happen */
2480 /* 2480 /*
2481 * XXX: __block_write_begin() unmaps passed block, is it OK? 2481 * XXX: __block_write_begin() unmaps passed block, is it OK?
2482 */ 2482 */
2483 ret = ext4_da_reserve_space(inode, iblock); 2483 ret = ext4_da_reserve_space(inode, iblock);
2484 if (ret) 2484 if (ret)
2485 /* not enough space to reserve */ 2485 /* not enough space to reserve */
2486 return ret; 2486 return ret;
2487 2487
2488 map_bh(bh, inode->i_sb, invalid_block); 2488 map_bh(bh, inode->i_sb, invalid_block);
2489 set_buffer_new(bh); 2489 set_buffer_new(bh);
2490 set_buffer_delay(bh); 2490 set_buffer_delay(bh);
2491 return 0; 2491 return 0;
2492 } 2492 }
2493 2493
2494 map_bh(bh, inode->i_sb, map.m_pblk); 2494 map_bh(bh, inode->i_sb, map.m_pblk);
2495 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; 2495 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
2496 2496
2497 if (buffer_unwritten(bh)) { 2497 if (buffer_unwritten(bh)) {
2498 /* A delayed write to unwritten bh should be marked 2498 /* A delayed write to unwritten bh should be marked
2499 * new and mapped. Mapped ensures that we don't do 2499 * new and mapped. Mapped ensures that we don't do
2500 * get_block multiple times when we write to the same 2500 * get_block multiple times when we write to the same
2501 * offset and new ensures that we do proper zero out 2501 * offset and new ensures that we do proper zero out
2502 * for partial write. 2502 * for partial write.
2503 */ 2503 */
2504 set_buffer_new(bh); 2504 set_buffer_new(bh);
2505 set_buffer_mapped(bh); 2505 set_buffer_mapped(bh);
2506 } 2506 }
2507 return 0; 2507 return 0;
2508 } 2508 }
2509 2509
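Per the get_block_t contract, ext4_da_get_block_prep() leaves the buffer_head in one of two states: really mapped (physical block known), or new+delay with a poison block number so later writeback knows allocation is still pending. A toy statement of the two outcomes (types and names invented for illustration):

    enum bh_kind { BH_REAL, BH_DELAYED };

    struct toy_map {
        enum bh_kind kind;
        unsigned long long blocknr;     /* meaningful only for BH_REAL */
    };

    static struct toy_map prep_block(int already_allocated,
                                     unsigned long long pblk)
    {
        struct toy_map m;

        if (already_allocated) {
            m.kind = BH_REAL;           /* map_bh(bh, sb, map.m_pblk) */
            m.blocknr = pblk;
        } else {
            m.kind = BH_DELAYED;        /* reserve space, set new + delay, */
            m.blocknr = ~0ULL;          /* poison like invalid_block above */
        }
        return m;
    }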
2510 /* 2510 /*
2511 * This function is used as a standard get_block_t callback function 2511 * This function is used as a standard get_block_t callback function
2512 * when there is no desire to allocate any blocks. It is used as a 2512 * when there is no desire to allocate any blocks. It is used as a
2513 * callback function for block_write_begin() and block_write_full_page(). 2513 * callback function for block_write_begin() and block_write_full_page().
2514 * These functions should only try to map a single block at a time. 2514 * These functions should only try to map a single block at a time.
2515 * 2515 *
2516 * Since this function doesn't do block allocations even if the caller 2516 * Since this function doesn't do block allocations even if the caller
2517 * requests it by passing in create=1, it is critically important that 2517 * requests it by passing in create=1, it is critically important that
2518 * any caller checks to make sure that any buffer heads returned 2518 * any caller checks to make sure that any buffer heads returned
2519 * by this function are either all already mapped or marked for 2519 * by this function are either all already mapped or marked for
2520 * delayed allocation before calling block_write_full_page(). Otherwise, 2520 * delayed allocation before calling block_write_full_page(). Otherwise,
2521 * b_blocknr could be left uninitialized, and the page write functions will 2521 * b_blocknr could be left uninitialized, and the page write functions will
2522 * be taken by surprise. 2522 * be taken by surprise.
2523 */ 2523 */
2524 static int noalloc_get_block_write(struct inode *inode, sector_t iblock, 2524 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
2525 struct buffer_head *bh_result, int create) 2525 struct buffer_head *bh_result, int create)
2526 { 2526 {
2527 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2527 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
2528 return _ext4_get_block(inode, iblock, bh_result, 0); 2528 return _ext4_get_block(inode, iblock, bh_result, 0);
2529 } 2529 }
2530 2530
2531 static int bget_one(handle_t *handle, struct buffer_head *bh) 2531 static int bget_one(handle_t *handle, struct buffer_head *bh)
2532 { 2532 {
2533 get_bh(bh); 2533 get_bh(bh);
2534 return 0; 2534 return 0;
2535 } 2535 }
2536 2536
2537 static int bput_one(handle_t *handle, struct buffer_head *bh) 2537 static int bput_one(handle_t *handle, struct buffer_head *bh)
2538 { 2538 {
2539 put_bh(bh); 2539 put_bh(bh);
2540 return 0; 2540 return 0;
2541 } 2541 }
2542 2542
2543 static int __ext4_journalled_writepage(struct page *page, 2543 static int __ext4_journalled_writepage(struct page *page,
2544 unsigned int len) 2544 unsigned int len)
2545 { 2545 {
2546 struct address_space *mapping = page->mapping; 2546 struct address_space *mapping = page->mapping;
2547 struct inode *inode = mapping->host; 2547 struct inode *inode = mapping->host;
2548 struct buffer_head *page_bufs; 2548 struct buffer_head *page_bufs;
2549 handle_t *handle = NULL; 2549 handle_t *handle = NULL;
2550 int ret = 0; 2550 int ret = 0;
2551 int err; 2551 int err;
2552 2552
2553 ClearPageChecked(page); 2553 ClearPageChecked(page);
2554 page_bufs = page_buffers(page); 2554 page_bufs = page_buffers(page);
2555 BUG_ON(!page_bufs); 2555 BUG_ON(!page_bufs);
2556 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); 2556 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
2557 /* As soon as we unlock the page, it can go away, but we have 2557 /* As soon as we unlock the page, it can go away, but we have
2558 * references to buffers so we are safe */ 2558 * references to buffers so we are safe */
2559 unlock_page(page); 2559 unlock_page(page);
2560 2560
2561 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 2561 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
2562 if (IS_ERR(handle)) { 2562 if (IS_ERR(handle)) {
2563 ret = PTR_ERR(handle); 2563 ret = PTR_ERR(handle);
2564 goto out; 2564 goto out;
2565 } 2565 }
2566 2566
2567 ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, 2567 ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
2568 do_journal_get_write_access); 2568 do_journal_get_write_access);
2569 2569
2570 err = walk_page_buffers(handle, page_bufs, 0, len, NULL, 2570 err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
2571 write_end_fn); 2571 write_end_fn);
2572 if (ret == 0) 2572 if (ret == 0)
2573 ret = err; 2573 ret = err;
2574 err = ext4_journal_stop(handle); 2574 err = ext4_journal_stop(handle);
2575 if (!ret) 2575 if (!ret)
2576 ret = err; 2576 ret = err;
2577 2577
2578 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); 2578 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
2579 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 2579 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
2580 out: 2580 out:
2581 return ret; 2581 return ret;
2582 } 2582 }
2583 2583
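__ext4_journalled_writepage() exists to solve the lock-ordering problem described in the comment below: transaction start ranks above the page lock, so the function pins the page's buffers, unlocks the page, and only then opens a handle. A sketch of just that ordering, with invented helpers:

    /* Invented helpers; only the ordering matters here. */
    static void pin_page_buffers(void)    { }  /* bget_one on each bh */
    static void unlock_the_page(void)     { }
    static int  start_transaction(void)   { return 0; }
    static void journal_the_buffers(void) { }
    static void stop_transaction(void)    { }
    static void unpin_page_buffers(void)  { }  /* bput_one on each bh */

    static int journalled_writepage_order(void)
    {
        pin_page_buffers();       /* page may go away once unlocked, but the
                                   * buffer references keep its data alive */
        unlock_the_page();        /* drop the page lock first... */
        if (start_transaction())  /* ...then take the higher-ranked handle */
            goto out;
        journal_the_buffers();
        stop_transaction();
    out:
        unpin_page_buffers();
        return 0;
    }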
2584 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); 2584 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
2585 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); 2585 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2586 2586
2587 /* 2587 /*
2588 * Note that we don't need to start a transaction unless we're journaling data 2588 * Note that we don't need to start a transaction unless we're journaling data
2589 * because we should have holes filled from ext4_page_mkwrite(). We don't even 2589 * because we should have holes filled from ext4_page_mkwrite(). We don't even
2590 * need to file the inode to the transaction's list in ordered mode because if 2590 * need to file the inode to the transaction's list in ordered mode because if
2591 * we are writing back data added by write(), the inode is already there and if 2591 * we are writing back data added by write(), the inode is already there and if
2592 * we are writing back data modified via mmap(), no one guarantees in which 2592 * we are writing back data modified via mmap(), no one guarantees in which
2593 * transaction the data will hit the disk. In case we are journaling data, we 2593 * transaction the data will hit the disk. In case we are journaling data, we
2594 * cannot start transaction directly because transaction start ranks above page 2594 * cannot start transaction directly because transaction start ranks above page
2595 * lock so we have to do some magic. 2595 * lock so we have to do some magic.
2596 * 2596 *
2597 * This function can get called via... 2597 * This function can get called via...
2598 * - ext4_da_writepages after taking page lock (have journal handle) 2598 * - ext4_da_writepages after taking page lock (have journal handle)
2599 * - journal_submit_inode_data_buffers (no journal handle) 2599 * - journal_submit_inode_data_buffers (no journal handle)
2600 * - shrink_page_list via pdflush (no journal handle) 2600 * - shrink_page_list via pdflush (no journal handle)
2601 * - grab_page_cache when doing write_begin (have journal handle) 2601 * - grab_page_cache when doing write_begin (have journal handle)
2602 * 2602 *
2603 * We don't do any block allocation in this function. If we have page with 2603 * We don't do any block allocation in this function. If we have page with
2604 * multiple blocks we need to write those buffer_heads that are mapped. This 2604 * multiple blocks we need to write those buffer_heads that are mapped. This
2605 * is important for mmaped based write. So if we do with blocksize 1K 2605 * is important for mmaped based write. So if we do with blocksize 1K
2606 * truncate(f, 1024); 2606 * truncate(f, 1024);
2607 * a = mmap(f, 0, 4096); 2607 * a = mmap(f, 0, 4096);
2608 * a[0] = 'a'; 2608 * a[0] = 'a';
2609 * truncate(f, 4096); 2609 * truncate(f, 4096);
2610 * we have in the page first buffer_head mapped via page_mkwrite call back 2610 * we have in the page first buffer_head mapped via page_mkwrite call back
2611 * but other bufer_heads would be unmapped but dirty(dirty done via the 2611 * but other bufer_heads would be unmapped but dirty(dirty done via the
2612 * do_wp_page). So writepage should write the first block. If we modify 2612 * do_wp_page). So writepage should write the first block. If we modify
2613 * the mmap area beyond 1024 we will again get a page_fault and the 2613 * the mmap area beyond 1024 we will again get a page_fault and the
2614 * page_mkwrite callback will do the block allocation and mark the 2614 * page_mkwrite callback will do the block allocation and mark the
2615 * buffer_heads mapped. 2615 * buffer_heads mapped.
2616 * 2616 *
2617 * We redirty the page if we have any buffer_heads that is either delay or 2617 * We redirty the page if we have any buffer_heads that is either delay or
2618 * unwritten in the page. 2618 * unwritten in the page.
2619 * 2619 *
2620 * We can get recursively called as show below. 2620 * We can get recursively called as show below.
2621 * 2621 *
2622 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> 2622 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
2623 * ext4_writepage() 2623 * ext4_writepage()
2624 * 2624 *
2625 * But since we don't do any block allocation we should not deadlock. 2625 * But since we don't do any block allocation we should not deadlock.
2626 * Page also have the dirty flag cleared so we don't get recurive page_lock. 2626 * Page also have the dirty flag cleared so we don't get recurive page_lock.
2627 */ 2627 */
static int ext4_writepage(struct page *page,
			  struct writeback_control *wbc)
{
	int ret = 0, commit_write = 0;
	loff_t size;
	unsigned int len;
	struct buffer_head *page_bufs = NULL;
	struct inode *inode = page->mapping->host;

	trace_ext4_writepage(page);
	size = i_size_read(inode);
	if (page->index == size >> PAGE_CACHE_SHIFT)
		len = size & ~PAGE_CACHE_MASK;
	else
		len = PAGE_CACHE_SIZE;

	/*
	 * If the page does not have buffers (for whatever reason),
	 * try to create them using __block_write_begin.  If this
	 * fails, redirty the page and move on.
	 */
	if (!page_has_buffers(page)) {
		if (__block_write_begin(page, 0, len,
					noalloc_get_block_write)) {
		redirty_page:
			redirty_page_for_writepage(wbc, page);
			unlock_page(page);
			return 0;
		}
		commit_write = 1;
	}
	page_bufs = page_buffers(page);
	if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
			      ext4_bh_delay_or_unwritten)) {
		/*
		 * We don't want to do block allocation, so redirty
		 * the page and return.  We may reach here when we do
		 * a journal commit via journal_submit_inode_data_buffers.
		 * We can also reach here via shrink_page_list
		 */
		goto redirty_page;
	}
	if (commit_write)
		/* now mark the buffer_heads as dirty and uptodate */
		block_commit_write(page, 0, len);

	if (PageChecked(page) && ext4_should_journal_data(inode))
		/*
		 * It's mmapped pagecache.  Add buffers and journal it.  There
		 * doesn't seem much point in redirtying the page here.
		 */
		return __ext4_journalled_writepage(page, len);

	if (buffer_uninit(page_bufs)) {
		ext4_set_bh_endio(page_bufs, inode);
		ret = block_write_full_page_endio(page, noalloc_get_block_write,
					    wbc, ext4_end_io_buffer_write);
	} else
		ret = block_write_full_page(page, noalloc_get_block_write,
					    wbc);

	return ret;
}

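/*
 * Example (userspace, illustrative): the truncate/mmap/truncate sequence
 * from the comment before ext4_writepage(), written out as a runnable
 * program. It assumes a filesystem with a 1K block size; "f" is a
 * placeholder file name.
 */
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("f", O_RDWR | O_CREAT, 0644);
	char *a;

	if (fd < 0)
		return 1;
	ftruncate(fd, 1024);		/* only the first 1K block exists */
	a = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (a == MAP_FAILED)
		return 1;
	a[0] = 'a';			/* page_mkwrite maps buffer 0 only */
	ftruncate(fd, 4096);		/* rest of the page: dirty but unmapped */
	munmap(a, 4096);
	close(fd);
	return 0;
}
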
/*
 * This is called via ext4_da_writepages() to
 * calculate the total number of credits to reserve to fit
 * a single extent allocation into a single transaction;
 * ext4_da_writepages() will loop calling this before
 * the block allocation.
 */

static int ext4_da_writepages_trans_blocks(struct inode *inode)
{
	int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;

	/*
	 * With non-extent format the journal credit needed to
	 * insert nrblocks contiguous blocks is dependent on the
	 * number of contiguous blocks. So we will limit the
	 * number of contiguous blocks to a sane value.
	 */
	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
	    (max_blocks > EXT4_MAX_TRANS_DATA))
		max_blocks = EXT4_MAX_TRANS_DATA;

	return ext4_chunk_trans_blocks(inode, max_blocks);
}

/*
 * write_cache_pages_da - walk the list of dirty pages of the given
 * address space and accumulate pages that need writing, and call
 * mpage_da_map_and_submit to map a single contiguous memory region
 * and then write them.
 */
static int write_cache_pages_da(struct address_space *mapping,
				struct writeback_control *wbc,
				struct mpage_da_data *mpd,
				pgoff_t *done_index)
{
	struct buffer_head *bh, *head;
	struct inode *inode = mapping->host;
	struct pagevec pvec;
	unsigned int nr_pages;
	sector_t logical;
	pgoff_t index, end;
	long nr_to_write = wbc->nr_to_write;
	int i, tag, ret = 0;

	memset(mpd, 0, sizeof(struct mpage_da_data));
	mpd->wbc = wbc;
	mpd->inode = inode;
	pagevec_init(&pvec, 0);
	index = wbc->range_start >> PAGE_CACHE_SHIFT;
	end = wbc->range_end >> PAGE_CACHE_SHIFT;

	if (wbc->sync_mode == WB_SYNC_ALL)
		tag = PAGECACHE_TAG_TOWRITE;
	else
		tag = PAGECACHE_TAG_DIRTY;

	*done_index = index;
	while (index <= end) {
		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
		if (nr_pages == 0)
			return 0;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/*
			 * At this point, the page may be truncated or
			 * invalidated (changing page->mapping to NULL), or
			 * even swizzled back from swapper_space to tmpfs file
			 * mapping. However, page->index will not change
			 * because we have a reference on the page.
			 */
			if (page->index > end)
				goto out;

			*done_index = page->index + 1;

			/*
			 * If we can't merge this page, and we have
			 * accumulated a contiguous region, write it
			 */
			if ((mpd->next_page != page->index) &&
			    (mpd->next_page != mpd->first_page)) {
				mpage_da_map_and_submit(mpd);
				goto ret_extent_tail;
			}

			lock_page(page);

			/*
			 * If the page is no longer dirty, or its
			 * mapping no longer corresponds to the inode we
			 * are writing (which means it has been
			 * truncated or invalidated), or the page is
			 * already under writeback and we are not
			 * doing a data integrity writeback, skip the page
			 */
			if (!PageDirty(page) ||
			    (PageWriteback(page) &&
			     (wbc->sync_mode == WB_SYNC_NONE)) ||
			    unlikely(page->mapping != mapping)) {
				unlock_page(page);
				continue;
			}

			wait_on_page_writeback(page);
			BUG_ON(PageWriteback(page));

			if (mpd->next_page != page->index)
				mpd->first_page = page->index;
			mpd->next_page = page->index + 1;
			logical = (sector_t) page->index <<
				(PAGE_CACHE_SHIFT - inode->i_blkbits);

			if (!page_has_buffers(page)) {
				mpage_add_bh_to_extent(mpd, logical,
						       PAGE_CACHE_SIZE,
						       (1 << BH_Dirty) | (1 << BH_Uptodate));
				if (mpd->io_done)
					goto ret_extent_tail;
			} else {
				/*
				 * Page with regular buffer heads,
				 * just add all dirty ones
				 */
				head = page_buffers(page);
				bh = head;
				do {
					BUG_ON(buffer_locked(bh));
					/*
					 * We need to try to allocate
					 * unmapped blocks in the same page.
					 * Otherwise we won't make progress
					 * with the page in ext4_writepage
					 */
					if (ext4_bh_delay_or_unwritten(NULL, bh)) {
						mpage_add_bh_to_extent(mpd, logical,
								       bh->b_size,
								       bh->b_state);
						if (mpd->io_done)
							goto ret_extent_tail;
					} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
						/*
						 * mapped dirty buffer. We need
						 * to update the b_state
						 * because we look at b_state
						 * in mpage_da_map_blocks.  We
						 * don't update b_size because
						 * if we find an unmapped
						 * buffer_head later we need to
						 * use the b_state flag of that
						 * buffer_head.
						 */
						if (mpd->b_size == 0)
							mpd->b_state = bh->b_state & BH_FLAGS;
					}
					logical++;
				} while ((bh = bh->b_this_page) != head);
			}

			if (nr_to_write > 0) {
				nr_to_write--;
				if (nr_to_write == 0 &&
				    wbc->sync_mode == WB_SYNC_NONE)
					/*
					 * We stop writing back only if we are
					 * not doing integrity sync. In case of
					 * integrity sync we have to keep going
					 * because someone may be concurrently
					 * dirtying pages, and we might have
					 * synced a lot of newly appeared dirty
					 * pages, but have not synced all of the
					 * old dirty pages.
					 */
					goto out;
			}
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	return 0;
ret_extent_tail:
	ret = MPAGE_DA_EXTENT_TAIL;
out:
	pagevec_release(&pvec);
	cond_resched();
	return ret;
}

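/*
 * Example (illustrative sketch of the iteration skeleton above): walk a
 * mapping's pages by radix-tree tag in PAGEVEC_SIZE batches, releasing
 * each batch and rescheduling between rounds. All of the mpd/extent
 * bookkeeping is omitted; process_page() is a hypothetical callback.
 */
static void walk_tagged_pages_sketch(struct address_space *mapping,
				     pgoff_t index, pgoff_t end, int tag,
				     void (*process_page)(struct page *))
{
	struct pagevec pvec;
	unsigned int nr_pages, i;

	pagevec_init(&pvec, 0);
	while (index <= end) {
		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			      min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1);
		if (nr_pages == 0)
			break;
		for (i = 0; i < nr_pages; i++)
			process_page(pvec.pages[i]);
		pagevec_release(&pvec);
		cond_resched();
	}
}
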

static int ext4_da_writepages(struct address_space *mapping,
			      struct writeback_control *wbc)
{
	pgoff_t index;
	int range_whole = 0;
	handle_t *handle = NULL;
	struct mpage_da_data mpd;
	struct inode *inode = mapping->host;
	int pages_written = 0;
	unsigned int max_pages;
	int range_cyclic, cycled = 1, io_done = 0;
	int needed_blocks, ret = 0;
	long desired_nr_to_write, nr_to_writebump = 0;
	loff_t range_start = wbc->range_start;
	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
	pgoff_t done_index = 0;
	pgoff_t end;

	trace_ext4_da_writepages(inode, wbc);

	/*
	 * No pages to write? This is mainly a kludge to avoid starting
	 * a transaction for special inodes like the journal inode on last
	 * iput() because that could violate lock ordering on umount
	 */
	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	/*
	 * If the filesystem has aborted, it is read-only, so return
	 * right away instead of dumping stack traces later on that
	 * will obscure the real source of the problem.  We test
	 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
	 * the latter could be true if the filesystem is mounted
	 * read-only, and in that case, ext4_da_writepages should
	 * *never* be called, so if that ever happens, we would want
	 * the stack trace.
	 */
	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
		return -EROFS;

	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
		range_whole = 1;

	range_cyclic = wbc->range_cyclic;
	if (wbc->range_cyclic) {
		index = mapping->writeback_index;
		if (index)
			cycled = 0;
		wbc->range_start = index << PAGE_CACHE_SHIFT;
		wbc->range_end = LLONG_MAX;
		wbc->range_cyclic = 0;
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_CACHE_SHIFT;
		end = wbc->range_end >> PAGE_CACHE_SHIFT;
	}

	/*
	 * This works around two forms of stupidity.  The first is in
	 * the writeback code, which caps the maximum number of pages
	 * written to be 1024 pages.  This is wrong on multiple
	 * levels; different architectures have a different page size,
	 * which changes the maximum amount of data which gets
	 * written.  Secondly, 4 megabytes is way too small.  XFS
	 * forces this value to be 16 megabytes by multiplying the
	 * nr_to_write parameter by four, and then relies on its
	 * allocator to allocate larger extents to make them
	 * contiguous.  Unfortunately this brings us to the second
	 * stupidity, which is that ext4's mballoc code only allocates
	 * at most 2048 blocks.  So we force contiguous writes up to
	 * the number of dirty blocks in the inode, or
	 * sbi->s_max_writeback_mb_bump, whichever is smaller.
	 */
	max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
	if (!range_cyclic && range_whole) {
		if (wbc->nr_to_write == LONG_MAX)
			desired_nr_to_write = wbc->nr_to_write;
		else
			desired_nr_to_write = wbc->nr_to_write * 8;
	} else
		desired_nr_to_write = ext4_num_dirty_pages(inode, index,
							   max_pages);
	if (desired_nr_to_write > max_pages)
		desired_nr_to_write = max_pages;

	if (wbc->nr_to_write < desired_nr_to_write) {
		nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
		wbc->nr_to_write = desired_nr_to_write;
	}

retry:
	if (wbc->sync_mode == WB_SYNC_ALL)
		tag_pages_for_writeback(mapping, index, end);

	while (!ret && wbc->nr_to_write > 0) {

		/*
		 * We insert one extent at a time, so we need the
		 * credits for a single extent allocation.
		 * Journalled mode is currently not supported
		 * by delalloc.
		 */
		BUG_ON(ext4_should_journal_data(inode));
		needed_blocks = ext4_da_writepages_trans_blocks(inode);

		/* start a new transaction */
		handle = ext4_journal_start(inode, needed_blocks);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
			       "%ld pages, ino %lu; err %d", __func__,
				wbc->nr_to_write, inode->i_ino, ret);
			goto out_writepages;
		}

		/*
		 * Now call write_cache_pages_da() to find the next
		 * contiguous region of logical blocks that need
		 * blocks to be allocated by ext4 and submit them.
		 */
		ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
		/*
		 * If we have a contiguous extent of pages and we
		 * haven't done the I/O yet, map the blocks and submit
		 * them for I/O.
		 */
		if (!mpd.io_done && mpd.next_page != mpd.first_page) {
			mpage_da_map_and_submit(&mpd);
			ret = MPAGE_DA_EXTENT_TAIL;
		}
		trace_ext4_da_write_pages(inode, &mpd);
		wbc->nr_to_write -= mpd.pages_written;

		ext4_journal_stop(handle);

		if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
			/* commit the transaction which would
			 * free blocks released in the transaction
			 * and try again */
			jbd2_journal_force_commit_nested(sbi->s_journal);
			ret = 0;
		} else if (ret == MPAGE_DA_EXTENT_TAIL) {
			/*
			 * got one extent now try with
			 * rest of the pages
			 */
			pages_written += mpd.pages_written;
			ret = 0;
			io_done = 1;
		} else if (wbc->nr_to_write)
			/*
			 * There is no more writeout needed, or we
			 * requested a non-blocking writeout and
			 * found the device congested
			 */
			break;
	}
	if (!io_done && !cycled) {
		cycled = 1;
		index = 0;
		wbc->range_start = index << PAGE_CACHE_SHIFT;
		wbc->range_end = mapping->writeback_index - 1;
		goto retry;
	}

	/* Update index */
	wbc->range_cyclic = range_cyclic;
	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		/*
		 * set the writeback_index so that range_cyclic
		 * mode will write it back later
		 */
		mapping->writeback_index = done_index;

out_writepages:
	wbc->nr_to_write -= nr_to_writebump;
	wbc->range_start = range_start;
	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
	return ret;
}

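/*
 * Worked example (illustrative) of the cyclic retry above: suppose
 * mapping->writeback_index is 100 when a range_cyclic writeback starts.
 * cycled is set to 0 and the first pass covers pages 100..EOF. If that
 * pass finishes without doing any I/O (io_done == 0), the retry resets
 * index to 0 and scans pages 0..99, so every dirty page gets one chance
 * per invocation.
 */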
#define FALL_BACK_TO_NONDELALLOC 1
static int ext4_nonda_switch(struct super_block *sb)
{
	s64 free_blocks, dirty_blocks;
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	/*
	 * Switch to non-delalloc mode if we are running low on free
	 * blocks. The free block accounting via percpu counters can
	 * get slightly wrong with percpu_counter_batch getting
	 * accumulated on each CPU without updating global counters,
	 * and delalloc needs accurate free block accounting. So switch
	 * to non-delalloc when we are near the error range.
	 */
	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
	dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
	if (2 * free_blocks < 3 * dirty_blocks ||
		free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
		/*
		 * The free block count is less than 150% of the dirty
		 * blocks, or the free blocks are below the watermark.
		 */
		return 1;
	}
	/*
	 * Even if we don't switch but are nearing capacity,
	 * start pushing delalloc when 1/2 of free blocks are dirty.
	 */
	if (free_blocks < 2 * dirty_blocks)
		writeback_inodes_sb_if_idle(sb);

	return 0;
}

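/*
 * Worked example (illustrative): with free_blocks = 1200 and
 * dirty_blocks = 1000, the first test fires because 2 * 1200 = 2400 is
 * below 3 * 1000 = 3000 (free is under 150% of dirty), so writes fall
 * back to the non-delalloc path. With free_blocks = 1900 neither the
 * 150% test nor the watermark test fires (assuming a small
 * EXT4_FREEBLOCKS_WATERMARK), but 1900 < 2 * 1000, so background
 * writeback is still nudged via writeback_inodes_sb_if_idle().
 */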
static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
			       loff_t pos, unsigned len, unsigned flags,
			       struct page **pagep, void **fsdata)
{
	int ret, retries = 0;
	struct page *page;
	pgoff_t index;
	struct inode *inode = mapping->host;
	handle_t *handle;

	index = pos >> PAGE_CACHE_SHIFT;

	if (ext4_nonda_switch(inode->i_sb)) {
		*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
		return ext4_write_begin(file, mapping, pos,
					len, flags, pagep, fsdata);
	}
	*fsdata = (void *)0;
	trace_ext4_da_write_begin(inode, pos, len, flags);
retry:
	/*
	 * With delayed allocation, we don't log the i_disksize update
	 * if there is delayed block allocation. But we still need to
	 * journal the i_disksize update if a write to the end of the
	 * file lands on an already mapped buffer.
	 */
	handle = ext4_journal_start(inode, 1);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}
	/* We cannot recurse into the filesystem as the transaction is already
	 * started */
	flags |= AOP_FLAG_NOFS;

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page) {
		ext4_journal_stop(handle);
		ret = -ENOMEM;
		goto out;
	}
	*pagep = page;

	ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
	if (ret < 0) {
		unlock_page(page);
		ext4_journal_stop(handle);
		page_cache_release(page);
		/*
		 * block_write_begin may have instantiated a few blocks
		 * outside i_size.  Trim these off again. Don't need
		 * i_size_read because we hold i_mutex.
		 */
		if (pos + len > inode->i_size)
			ext4_truncate_failed_write(inode);
	}

	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
out:
	return ret;
}

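/*
 * Example (illustrative sketch): the -ENOSPC retry pattern used above.
 * ext4_should_retry_alloc() forces a journal commit to free recently
 * released blocks and returns nonzero for a bounded number of attempts,
 * tracked through 'retries'. try_alloc is a hypothetical callback.
 */
static int alloc_with_retry_sketch(struct inode *inode,
				   int (*try_alloc)(struct inode *))
{
	int ret, retries = 0;

	do {
		ret = try_alloc(inode);
	} while (ret == -ENOSPC &&
		 ext4_should_retry_alloc(inode->i_sb, &retries));
	return ret;
}
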
/*
 * Check if we should update i_disksize
 * when the write is to the end of file but does not require block allocation.
 */
static int ext4_da_should_update_i_disksize(struct page *page,
					    unsigned long offset)
{
	struct buffer_head *bh;
	struct inode *inode = page->mapping->host;
	unsigned int idx;
	int i;

	bh = page_buffers(page);
	idx = offset >> inode->i_blkbits;

	for (i = 0; i < idx; i++)
		bh = bh->b_this_page;

	if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
		return 0;
	return 1;
}

static int ext4_da_write_end(struct file *file,
			     struct address_space *mapping,
			     loff_t pos, unsigned len, unsigned copied,
			     struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	int ret = 0, ret2;
	handle_t *handle = ext4_journal_current_handle();
	loff_t new_i_size;
	unsigned long start, end;
	int write_mode = (int)(unsigned long)fsdata;

	if (write_mode == FALL_BACK_TO_NONDELALLOC) {
		if (ext4_should_order_data(inode)) {
			return ext4_ordered_write_end(file, mapping, pos,
					len, copied, page, fsdata);
		} else if (ext4_should_writeback_data(inode)) {
			return ext4_writeback_write_end(file, mapping, pos,
					len, copied, page, fsdata);
		} else {
			BUG();
		}
	}

	trace_ext4_da_write_end(inode, pos, len, copied);
	start = pos & (PAGE_CACHE_SIZE - 1);
	end = start + copied - 1;

	/*
	 * generic_write_end() will run mark_inode_dirty() if i_size
	 * changes.  So let's piggyback the i_disksize mark_inode_dirty
	 * into that.
	 */

	new_i_size = pos + copied;
	if (new_i_size > EXT4_I(inode)->i_disksize) {
		if (ext4_da_should_update_i_disksize(page, end)) {
			down_write(&EXT4_I(inode)->i_data_sem);
			if (new_i_size > EXT4_I(inode)->i_disksize) {
				/*
				 * Updating i_disksize when extending file
				 * without needing block allocation
				 */
				if (ext4_should_order_data(inode))
					ret = ext4_jbd2_file_inode(handle,
								   inode);

				EXT4_I(inode)->i_disksize = new_i_size;
			}
			up_write(&EXT4_I(inode)->i_data_sem);
			/* We need to mark the inode dirty even if
			 * new_i_size is less than inode->i_size
			 * but greater than i_disksize. (hint: delalloc)
			 */
			ext4_mark_inode_dirty(handle, inode);
		}
	}
	ret2 = generic_write_end(file, mapping, pos, len, copied,
				 page, fsdata);
	copied = ret2;
	if (ret2 < 0)
		ret = ret2;
	ret2 = ext4_journal_stop(handle);
	if (!ret)
		ret = ret2;

	return ret ? ret : copied;
}

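/*
 * Worked example (illustrative): take i_size = i_disksize = 4000 with
 * the enclosing 4K block already allocated. A 90-byte append makes
 * new_i_size = 4090 > i_disksize, and the buffer under 'end' is mapped
 * (neither delayed nor unwritten), so ext4_da_should_update_i_disksize()
 * returns 1 and i_disksize is raised to 4090 under i_data_sem with no
 * block allocation; marking the inode dirty then gets the update into
 * the journal.
 */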
static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
{
	/*
	 * Drop reserved blocks
	 */
	BUG_ON(!PageLocked(page));
	if (!page_has_buffers(page))
		goto out;

	ext4_da_page_release_reservation(page, offset);

out:
	ext4_invalidatepage(page, offset);

	return;
}

/*
 * Force all delayed allocation blocks to be allocated for a given inode.
 */
int ext4_alloc_da_blocks(struct inode *inode)
{
	trace_ext4_alloc_da_blocks(inode);

	if (!EXT4_I(inode)->i_reserved_data_blocks &&
	    !EXT4_I(inode)->i_reserved_meta_blocks)
		return 0;

	/*
	 * We do something simple for now.  The filemap_flush() will
	 * also start triggering a write of the data blocks, which is
	 * not strictly speaking necessary (and for users of
	 * laptop_mode, not even desirable).  However, to do otherwise
	 * would require replicating code paths in:
	 *
	 * ext4_da_writepages() ->
	 *    write_cache_pages() ---> (via passed in callback function)
	 *       __mpage_da_writepage() -->
	 *          mpage_add_bh_to_extent()
	 *          mpage_da_map_blocks()
	 *
	 * The problem is that write_cache_pages(), located in
	 * mm/page-writeback.c, marks pages clean in preparation for
	 * doing I/O, which is not desirable if we're not planning on
	 * doing I/O at all.
	 *
	 * We could call write_cache_pages(), and then redirty all of
	 * the pages by calling redirty_page_for_writepage() but that
	 * would be ugly in the extreme.  So instead we would need to
	 * replicate parts of the code in the above functions,
	 * simplifying them because we wouldn't actually intend to
	 * write out the pages, but rather only collect contiguous
	 * logical block extents, call the multi-block allocator, and
	 * then update the buffer heads with the block allocations.
	 *
	 * For now, though, we'll cheat by calling filemap_flush(),
	 * which will map the blocks, and start the I/O, but not
	 * actually wait for the I/O to complete.
	 */
	return filemap_flush(inode->i_mapping);
}

/*
 * bmap() is special.  It gets used by applications such as lilo and by
 * the swapper to find the on-disk block of a specific piece of data.
 *
 * Naturally, this is dangerous if the block concerned is still in the
 * journal.  If somebody makes a swapfile on an ext4 data-journaling
 * filesystem and enables swap, then they may get a nasty shock when the
 * data getting swapped to that swapfile suddenly gets overwritten by
 * the original zeros written out previously to the journal and
 * awaiting writeback in the kernel's buffer cache.
 *
 * So, if we see any bmap calls here on a modified, data-journaled file,
 * take extra steps to flush any blocks which might be in the cache.
 */
static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
{
	struct inode *inode = mapping->host;
	journal_t *journal;
	int err;

	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
			test_opt(inode->i_sb, DELALLOC)) {
		/*
		 * With delalloc we want to sync the file
		 * so that we can make sure we allocate
		 * blocks for the file
		 */
		filemap_write_and_wait(mapping);
	}

	if (EXT4_JOURNAL(inode) &&
	    ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
		/*
		 * This is a REALLY heavyweight approach, but the use of
		 * bmap on dirty files is expected to be extremely rare:
		 * only if we run lilo or swapon on a freshly made file
		 * do we expect this to happen.
		 *
		 * (bmap requires CAP_SYS_RAWIO so this does not
		 * represent an unprivileged user DOS attack --- we'd be
		 * in trouble if mortal users could trigger this path at
		 * will.)
		 *
		 * NB. EXT4_STATE_JDATA is not set on files other than
		 * regular files.  If somebody wants to bmap a directory
		 * or symlink and gets confused because the buffer
		 * hasn't yet been flushed to disk, they deserve
		 * everything they get.
		 */

		ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
		journal = EXT4_JOURNAL(inode);
		jbd2_journal_lock_updates(journal);
		err = jbd2_journal_flush(journal);
		jbd2_journal_unlock_updates(journal);

		if (err)
			return 0;
	}

	return generic_block_bmap(mapping, block, ext4_get_block);
}

static int ext4_readpage(struct file *file, struct page *page)
{
	trace_ext4_readpage(page);
	return mpage_readpage(page, ext4_get_block);
}

static int
ext4_readpages(struct file *file, struct address_space *mapping,
	       struct list_head *pages, unsigned nr_pages)
{
	return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
}

static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
{
	struct buffer_head *head, *bh;
	unsigned int curr_off = 0;

	if (!page_has_buffers(page))
		return;
	head = bh = page_buffers(page);
	do {
		if (offset <= curr_off && test_clear_buffer_uninit(bh)
					&& bh->b_private) {
			ext4_free_io_end(bh->b_private);
			bh->b_private = NULL;
			bh->b_end_io = NULL;
		}
		curr_off = curr_off + bh->b_size;
		bh = bh->b_this_page;
	} while (bh != head);
}

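/*
 * Example (illustrative sketch): buffer heads hang off a page as a
 * circular singly linked list via b_this_page, which is why the walk
 * above must compare against the saved head rather than NULL. The
 * helper below is hypothetical.
 */
static unsigned int count_dirty_buffers_sketch(struct page *page)
{
	struct buffer_head *head, *bh;
	unsigned int nr = 0;

	head = bh = page_buffers(page);
	do {
		if (buffer_dirty(bh))
			nr++;
		bh = bh->b_this_page;
	} while (bh != head);
	return nr;
}
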
static void ext4_invalidatepage(struct page *page, unsigned long offset)
{
	journal_t *journal = EXT4_JOURNAL(page->mapping->host);

	trace_ext4_invalidatepage(page, offset);

	/*
	 * free any io_end structure allocated for buffers to be discarded
	 */
	if (ext4_should_dioread_nolock(page->mapping->host))
		ext4_invalidatepage_free_endio(page, offset);
	/*
	 * If it's a full truncate we just forget about the pending dirtying
	 */
	if (offset == 0)
		ClearPageChecked(page);

	if (journal)
		jbd2_journal_invalidatepage(journal, page, offset);
	else
		block_invalidatepage(page, offset);
}

static int ext4_releasepage(struct page *page, gfp_t wait)
{
	journal_t *journal = EXT4_JOURNAL(page->mapping->host);

	trace_ext4_releasepage(page);

	WARN_ON(PageChecked(page));
	if (!page_has_buffers(page))
		return 0;
	if (journal)
		return jbd2_journal_try_to_free_buffers(journal, page, wait);
	else
		return try_to_free_buffers(page);
}

/*
 * O_DIRECT for ext3 (or indirect map) based files
 *
 * If the O_DIRECT write will extend the file then add this inode to the
 * orphan list.  So recovery will truncate it back to the original size
 * if the machine crashes during the write.
 *
 * If the O_DIRECT write is instantiating holes inside i_size and the machine
 * crashes then stale disk data _may_ be exposed inside the file. But current
 * VFS code falls back to the buffered path in that case so we are safe.
 */
static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
				  const struct iovec *iov, loff_t offset,
				  unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct ext4_inode_info *ei = EXT4_I(inode);
	handle_t *handle;
	ssize_t ret;
	int orphan = 0;
	size_t count = iov_length(iov, nr_segs);
	int retries = 0;

	if (rw == WRITE) {
		loff_t final_size = offset + count;

		if (final_size > inode->i_size) {
			/* Credits for sb + inode write */
			handle = ext4_journal_start(inode, 2);
			if (IS_ERR(handle)) {
				ret = PTR_ERR(handle);
				goto out;
			}
			ret = ext4_orphan_add(handle, inode);
			if (ret) {
				ext4_journal_stop(handle);
				goto out;
			}
			orphan = 1;
			ei->i_disksize = inode->i_size;
			ext4_journal_stop(handle);
		}
	}

retry:
	if (rw == READ && ext4_should_dioread_nolock(inode))
		ret = __blockdev_direct_IO(rw, iocb, inode,
					   inode->i_sb->s_bdev, iov,
					   offset, nr_segs,
					   ext4_get_block, NULL, NULL, 0);
	else {
		ret = blockdev_direct_IO(rw, iocb, inode,
					 inode->i_sb->s_bdev, iov,
					 offset, nr_segs,
					 ext4_get_block, NULL);

		if (unlikely((rw & WRITE) && ret < 0)) {
			loff_t isize = i_size_read(inode);
			loff_t end = offset + iov_length(iov, nr_segs);

			if (end > isize)
				ext4_truncate_failed_write(inode);
		}
	}
	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;

	if (orphan) {
		int err;

		/* Credits for sb + inode write */
		handle = ext4_journal_start(inode, 2);
		if (IS_ERR(handle)) {
			/* This is really bad luck. We've written the data
			 * but cannot extend i_size. Bail out and pretend
			 * the write failed... */
			ret = PTR_ERR(handle);
			if (inode->i_nlink)
				ext4_orphan_del(NULL, inode);

			goto out;
		}
		if (inode->i_nlink)
			ext4_orphan_del(handle, inode);
		if (ret > 0) {
			loff_t end = offset + ret;
			if (end > inode->i_size) {
3540 ei->i_disksize = end; 3540 ei->i_disksize = end;
3541 i_size_write(inode, end); 3541 i_size_write(inode, end);
3542 /* 3542 /*
3543 * We're going to return a positive `ret' 3543 * We're going to return a positive `ret'
3544 * here due to non-zero-length I/O, so there's 3544 * here due to non-zero-length I/O, so there's
3545 * no way of reporting error returns from 3545 * no way of reporting error returns from
3546 * ext4_mark_inode_dirty() to userspace. So 3546 * ext4_mark_inode_dirty() to userspace. So
3547 * ignore it. 3547 * ignore it.
3548 */ 3548 */
3549 ext4_mark_inode_dirty(handle, inode); 3549 ext4_mark_inode_dirty(handle, inode);
3550 } 3550 }
3551 } 3551 }
3552 err = ext4_journal_stop(handle); 3552 err = ext4_journal_stop(handle);
3553 if (ret == 0) 3553 if (ret == 0)
3554 ret = err; 3554 ret = err;
3555 } 3555 }
3556 out: 3556 out:
3557 return ret; 3557 return ret;
3558 } 3558 }
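
The orphan-list protocol in ext4_ind_direct_IO() is easier to see stripped of the journalling machinery. Below is a minimal userspace sketch of the same control flow, under invented names (fake_inode, do_write, dio_extending_write are illustrative stand-ins, not kernel APIs): record crash-recovery intent before an extending write, retry on ENOSPC, and publish the new size only once the write has succeeded.

    #include <stdio.h>
    #include <errno.h>

    struct fake_inode { long i_size; long i_disksize; int on_orphan_list; };

    static int attempts;

    static long do_write(long count)            /* fails once with ENOSPC */
    {
            if (attempts++ == 0) {
                    errno = ENOSPC;
                    return -1;
            }
            return count;                       /* bytes written */
    }

    static long dio_extending_write(struct fake_inode *inode,
                                    long offset, long count)
    {
            long ret;
            int retries = 0;

            if (offset + count > inode->i_size) {
                    /* log recovery intent before touching the file */
                    inode->on_orphan_list = 1;
                    printf("orphan add: recovery truncates back to %ld\n",
                           inode->i_size);
            }
    retry:
            ret = do_write(count);
            if (ret < 0 && errno == ENOSPC && retries++ < 3)
                    goto retry;         /* ext4_should_retry_alloc() stand-in */

            if (inode->on_orphan_list) {
                    inode->on_orphan_list = 0;          /* orphan del */
                    if (ret > 0 && offset + ret > inode->i_size)
                            inode->i_size = inode->i_disksize = offset + ret;
            }
            return ret;
    }

    int main(void)
    {
            struct fake_inode inode = { 100, 100, 0 };
            long ret = dio_extending_write(&inode, 100, 50);

            printf("ret=%ld i_size=%ld\n", ret, inode.i_size);  /* ret=50 i_size=150 */
            return 0;
    }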
3559 3559
3560 /* 3560 /*
3561 * ext4_get_block used when preparing for a DIO write or buffer write. 3561 * ext4_get_block used when preparing for a DIO write or buffer write.
3562 * We allocate an uninitialized extent if blocks haven't been allocated. 3562 * We allocate an uninitialized extent if blocks haven't been allocated.
3563 * The extent will be converted to initialized after the IO is complete. 3563 * The extent will be converted to initialized after the IO is complete.
3564 */ 3564 */
3565 static int ext4_get_block_write(struct inode *inode, sector_t iblock, 3565 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3566 struct buffer_head *bh_result, int create) 3566 struct buffer_head *bh_result, int create)
3567 { 3567 {
3568 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", 3568 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3569 inode->i_ino, create); 3569 inode->i_ino, create);
3570 return _ext4_get_block(inode, iblock, bh_result, 3570 return _ext4_get_block(inode, iblock, bh_result,
3571 EXT4_GET_BLOCKS_IO_CREATE_EXT); 3571 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3572 } 3572 }
3573 3573
3574 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 3574 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3575 ssize_t size, void *private, int ret, 3575 ssize_t size, void *private, int ret,
3576 bool is_async) 3576 bool is_async)
3577 { 3577 {
3578 ext4_io_end_t *io_end = iocb->private; 3578 ext4_io_end_t *io_end = iocb->private;
3579 struct workqueue_struct *wq; 3579 struct workqueue_struct *wq;
3580 unsigned long flags; 3580 unsigned long flags;
3581 struct ext4_inode_info *ei; 3581 struct ext4_inode_info *ei;
3582 3582
3583 /* if not async direct IO, or a dio write of 0 bytes, just return */ 3583 /* if not async direct IO, or a dio write of 0 bytes, just return */
3584 if (!io_end || !size) 3584 if (!io_end || !size)
3585 goto out; 3585 goto out;
3586 3586
3587 ext_debug("ext4_end_io_dio(): io_end 0x%p" 3587 ext_debug("ext4_end_io_dio(): io_end 0x%p"
3588 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", 3588 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
3589 iocb->private, io_end->inode->i_ino, iocb, offset, 3589 iocb->private, io_end->inode->i_ino, iocb, offset,
3590 size); 3590 size);
3591 3591
3592 /* if not aio dio with unwritten extents, just free io and return */ 3592 /* if not aio dio with unwritten extents, just free io and return */
3593 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 3593 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
3594 ext4_free_io_end(io_end); 3594 ext4_free_io_end(io_end);
3595 iocb->private = NULL; 3595 iocb->private = NULL;
3596 out: 3596 out:
3597 if (is_async) 3597 if (is_async)
3598 aio_complete(iocb, ret, 0); 3598 aio_complete(iocb, ret, 0);
3599 return; 3599 return;
3600 } 3600 }
3601 3601
3602 io_end->offset = offset; 3602 io_end->offset = offset;
3603 io_end->size = size; 3603 io_end->size = size;
3604 if (is_async) { 3604 if (is_async) {
3605 io_end->iocb = iocb; 3605 io_end->iocb = iocb;
3606 io_end->result = ret; 3606 io_end->result = ret;
3607 } 3607 }
3608 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3608 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3609 3609
3610 /* Add the io_end to the per-inode completed aio dio list */ 3610 /* Add the io_end to the per-inode completed aio dio list */
3611 ei = EXT4_I(io_end->inode); 3611 ei = EXT4_I(io_end->inode);
3612 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 3612 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3613 list_add_tail(&io_end->list, &ei->i_completed_io_list); 3613 list_add_tail(&io_end->list, &ei->i_completed_io_list);
3614 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 3614 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3615 3615
3616 /* queue the work to convert unwritten extents to written */ 3616 /* queue the work to convert unwritten extents to written */
3617 queue_work(wq, &io_end->work); 3617 queue_work(wq, &io_end->work);
3618 iocb->private = NULL; 3618 iocb->private = NULL;
3619 } 3619 }
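
The callback above is deliberately cheap: it only links the io_end onto a locked per-inode list and hands the expensive unwritten-to-written conversion to a workqueue. A hedged pthreads sketch of that deferral pattern (the io_end, end_io, and worker below are illustrative, not the kernel's types; build with cc -pthread):

    #include <pthread.h>
    #include <stdio.h>

    struct io_end { long offset, size; struct io_end *next; };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct io_end *completed_list;

    static void end_io(struct io_end *io)       /* "interrupt" side: cheap */
    {
            pthread_mutex_lock(&list_lock);
            io->next = completed_list;
            completed_list = io;
            pthread_mutex_unlock(&list_lock);
    }

    static void *worker(void *arg)              /* workqueue side: slow */
    {
            pthread_mutex_lock(&list_lock);
            for (struct io_end *io = completed_list; io; io = io->next)
                    printf("convert extents at %ld..%ld to written\n",
                           io->offset, io->offset + io->size);
            pthread_mutex_unlock(&list_lock);
            return NULL;
    }

    int main(void)
    {
            struct io_end a = { 0, 4096 }, b = { 8192, 4096 };
            pthread_t t;

            end_io(&a);
            end_io(&b);
            pthread_create(&t, NULL, worker, NULL);
            pthread_join(t, NULL);
            return 0;
    }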
3620 3620
3621 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) 3621 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
3622 { 3622 {
3623 ext4_io_end_t *io_end = bh->b_private; 3623 ext4_io_end_t *io_end = bh->b_private;
3624 struct workqueue_struct *wq; 3624 struct workqueue_struct *wq;
3625 struct inode *inode; 3625 struct inode *inode;
3626 unsigned long flags; 3626 unsigned long flags;
3627 3627
3628 if (!test_clear_buffer_uninit(bh) || !io_end) 3628 if (!test_clear_buffer_uninit(bh) || !io_end)
3629 goto out; 3629 goto out;
3630 3630
3631 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { 3631 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
3632 printk("sb umounted, discard end_io request for inode %lu\n", 3632 printk("sb umounted, discard end_io request for inode %lu\n",
3633 io_end->inode->i_ino); 3633 io_end->inode->i_ino);
3634 ext4_free_io_end(io_end); 3634 ext4_free_io_end(io_end);
3635 goto out; 3635 goto out;
3636 } 3636 }
3637 3637
3638 io_end->flag = EXT4_IO_END_UNWRITTEN; 3638 io_end->flag = EXT4_IO_END_UNWRITTEN;
3639 inode = io_end->inode; 3639 inode = io_end->inode;
3640 3640
3641 /* Add the io_end to the per-inode completed io list */ 3641 /* Add the io_end to the per-inode completed io list */
3642 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); 3642 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3643 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); 3643 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
3644 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); 3644 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3645 3645
3646 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; 3646 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
3647 /* queue the work to convert unwritten extents to written */ 3647 /* queue the work to convert unwritten extents to written */
3648 queue_work(wq, &io_end->work); 3648 queue_work(wq, &io_end->work);
3649 out: 3649 out:
3650 bh->b_private = NULL; 3650 bh->b_private = NULL;
3651 bh->b_end_io = NULL; 3651 bh->b_end_io = NULL;
3652 clear_buffer_uninit(bh); 3652 clear_buffer_uninit(bh);
3653 end_buffer_async_write(bh, uptodate); 3653 end_buffer_async_write(bh, uptodate);
3654 } 3654 }
3655 3655
3656 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) 3656 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
3657 { 3657 {
3658 ext4_io_end_t *io_end; 3658 ext4_io_end_t *io_end;
3659 struct page *page = bh->b_page; 3659 struct page *page = bh->b_page;
3660 loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT; 3660 loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
3661 size_t size = bh->b_size; 3661 size_t size = bh->b_size;
3662 3662
3663 retry: 3663 retry:
3664 io_end = ext4_init_io_end(inode, GFP_ATOMIC); 3664 io_end = ext4_init_io_end(inode, GFP_ATOMIC);
3665 if (!io_end) { 3665 if (!io_end) {
3666 pr_warn_ratelimited("%s: allocation fail\n", __func__); 3666 pr_warn_ratelimited("%s: allocation fail\n", __func__);
3667 schedule(); 3667 schedule();
3668 goto retry; 3668 goto retry;
3669 } 3669 }
3670 io_end->offset = offset; 3670 io_end->offset = offset;
3671 io_end->size = size; 3671 io_end->size = size;
3672 /* 3672 /*
3673 * We need to hold a reference to the page to make sure it 3673 * We need to hold a reference to the page to make sure it
3674 * doesn't get evicted before ext4_end_io_work() has a chance 3674 * doesn't get evicted before ext4_end_io_work() has a chance
3675 * to convert the extent from unwritten to written. 3675 * to convert the extent from unwritten to written.
3676 */ 3676 */
3677 io_end->page = page; 3677 io_end->page = page;
3678 get_page(io_end->page); 3678 get_page(io_end->page);
3679 3679
3680 bh->b_private = io_end; 3680 bh->b_private = io_end;
3681 bh->b_end_io = ext4_end_io_buffer_write; 3681 bh->b_end_io = ext4_end_io_buffer_write;
3682 return 0; 3682 return 0;
3683 } 3683 }
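
The b_private/b_end_io pairing used here is a general "attach context plus a completion callback to an object" idiom. A self-contained sketch under invented types (struct buffer, io_ctx, and my_end_io are stand-ins, not kernel structures; the kernel version retries the allocation instead of failing):

    #include <stdio.h>
    #include <stdlib.h>

    struct buffer;
    typedef void (*end_io_fn)(struct buffer *, int uptodate);

    struct buffer { end_io_fn end_io; void *private; };
    struct io_ctx { long offset, size; };

    static void my_end_io(struct buffer *bh, int uptodate)
    {
            struct io_ctx *ctx = bh->private;

            printf("IO done (uptodate=%d) for range %ld..%ld\n",
                   uptodate, ctx->offset, ctx->offset + ctx->size);
            free(ctx);
            bh->private = NULL;         /* detach, as the kernel code does */
            bh->end_io = NULL;
    }

    static int set_bh_endio(struct buffer *bh, long offset, long size)
    {
            struct io_ctx *ctx = malloc(sizeof(*ctx));

            if (!ctx)
                    return -1;          /* kernel version loops and retries */
            ctx->offset = offset;
            ctx->size = size;
            bh->private = ctx;
            bh->end_io = my_end_io;
            return 0;
    }

    int main(void)
    {
            struct buffer bh = { 0 };

            if (set_bh_endio(&bh, 4096, 512) == 0)
                    bh.end_io(&bh, 1);  /* simulate IO completion */
            return 0;
    }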
3684 3684
3685 /* 3685 /*
3686 * For ext4 extent-mapped files, ext4 can do direct-IO writes to holes, 3686 * For ext4 extent-mapped files, ext4 can do direct-IO writes to holes,
3687 * preallocated extents, and writes that extend the file, with no need to 3687 * preallocated extents, and writes that extend the file, with no need to
3688 * fall back to buffered IO. 3688 * fall back to buffered IO.
3689 * 3689 *
3690 * For holes, we allocate those blocks and mark them as uninitialized. 3690 * For holes, we allocate those blocks and mark them as uninitialized.
3691 * If those blocks were preallocated, we make sure they are split, but 3691 * If those blocks were preallocated, we make sure they are split, but
3692 * still keep the range to write as uninitialized. 3692 * still keep the range to write as uninitialized.
3693 * 3693 *
3694 * The unwritten extents will be converted to written when the DIO is 3694 * The unwritten extents will be converted to written when the DIO is
3695 * completed. For async direct IO, since the IO may still be pending on 3695 * completed. For async direct IO, since the IO may still be pending on
3696 * return, we set up an end_io callback function, which will do the 3696 * return, we set up an end_io callback function, which will do the
3697 * conversion when the async direct IO is completed. 3697 * conversion when the async direct IO is completed.
3698 * 3698 *
3699 * If the O_DIRECT write will extend the file then add this inode to the 3699 * If the O_DIRECT write will extend the file then add this inode to the
3700 * orphan list. So recovery will truncate it back to the original size 3700 * orphan list. So recovery will truncate it back to the original size
3701 * if the machine crashes during the write. 3701 * if the machine crashes during the write.
3702 * 3702 *
3703 */ 3703 */
3704 static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, 3704 static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3705 const struct iovec *iov, loff_t offset, 3705 const struct iovec *iov, loff_t offset,
3706 unsigned long nr_segs) 3706 unsigned long nr_segs)
3707 { 3707 {
3708 struct file *file = iocb->ki_filp; 3708 struct file *file = iocb->ki_filp;
3709 struct inode *inode = file->f_mapping->host; 3709 struct inode *inode = file->f_mapping->host;
3710 ssize_t ret; 3710 ssize_t ret;
3711 size_t count = iov_length(iov, nr_segs); 3711 size_t count = iov_length(iov, nr_segs);
3712 3712
3713 loff_t final_size = offset + count; 3713 loff_t final_size = offset + count;
3714 if (rw == WRITE && final_size <= inode->i_size) { 3714 if (rw == WRITE && final_size <= inode->i_size) {
3715 /* 3715 /*
3716 * We could direct write to holes and fallocate. 3716 * We could direct write to holes and fallocate.
3717 * 3717 *
3718 * Allocated blocks to fill the hole are marked as uninitialized 3718 * Allocated blocks to fill the hole are marked as uninitialized
3719 * to prevent a parallel buffered read from exposing stale data 3719 * to prevent a parallel buffered read from exposing stale data
3720 * before the DIO completes the data IO. 3720 * before the DIO completes the data IO.
3721 * 3721 *
3722 * For previously fallocated extents, ext4's get_block 3722 * For previously fallocated extents, ext4's get_block
3723 * will simply mark the buffer mapped but still 3723 * will simply mark the buffer mapped but still
3724 * keep the extents uninitialized. 3724 * keep the extents uninitialized.
3725 * 3725 *
3726 * For the non-AIO case, we will convert those unwritten extents 3726 * For the non-AIO case, we will convert those unwritten extents
3727 * to written after returning from blockdev_direct_IO. 3727 * to written after returning from blockdev_direct_IO.
3728 * 3728 *
3729 * For async DIO, the conversion needs to be deferred until 3729 * For async DIO, the conversion needs to be deferred until
3730 * the IO is completed. The ext4 end_io callback function 3730 * the IO is completed. The ext4 end_io callback function
3731 * will be called to take care of the conversion work. 3731 * will be called to take care of the conversion work.
3732 * Here, for the async case, we allocate an io_end structure to 3732 * Here, for the async case, we allocate an io_end structure to
3733 * hook to the iocb. 3733 * hook to the iocb.
3734 */ 3734 */
3735 iocb->private = NULL; 3735 iocb->private = NULL;
3736 EXT4_I(inode)->cur_aio_dio = NULL; 3736 EXT4_I(inode)->cur_aio_dio = NULL;
3737 if (!is_sync_kiocb(iocb)) { 3737 if (!is_sync_kiocb(iocb)) {
3738 iocb->private = ext4_init_io_end(inode, GFP_NOFS); 3738 iocb->private = ext4_init_io_end(inode, GFP_NOFS);
3739 if (!iocb->private) 3739 if (!iocb->private)
3740 return -ENOMEM; 3740 return -ENOMEM;
3741 /* 3741 /*
3742 * We save the io structure for the current async 3742 * We save the io structure for the current async
3743 * direct IO, so that ext4_map_blocks() can later 3743 * direct IO, so that ext4_map_blocks() can later
3744 * flag the io structure if there are 3744 * flag the io structure if there are
3745 * unwritten extents that need to be converted 3745 * unwritten extents that need to be converted
3746 * when the IO is completed. 3746 * when the IO is completed.
3747 */ 3747 */
3748 EXT4_I(inode)->cur_aio_dio = iocb->private; 3748 EXT4_I(inode)->cur_aio_dio = iocb->private;
3749 } 3749 }
3750 3750
3751 ret = blockdev_direct_IO(rw, iocb, inode, 3751 ret = blockdev_direct_IO(rw, iocb, inode,
3752 inode->i_sb->s_bdev, iov, 3752 inode->i_sb->s_bdev, iov,
3753 offset, nr_segs, 3753 offset, nr_segs,
3754 ext4_get_block_write, 3754 ext4_get_block_write,
3755 ext4_end_io_dio); 3755 ext4_end_io_dio);
3756 if (iocb->private) 3756 if (iocb->private)
3757 EXT4_I(inode)->cur_aio_dio = NULL; 3757 EXT4_I(inode)->cur_aio_dio = NULL;
3758 /* 3758 /*
3759 * The io_end structure takes a reference to the inode; 3759 * The io_end structure takes a reference to the inode;
3760 * that structure needs to be destroyed and the 3760 * that structure needs to be destroyed and the
3761 * reference to the inode needs to be dropped when the IO is 3761 * reference to the inode needs to be dropped when the IO is
3762 * complete, even for a 0-byte write or a failure. 3762 * complete, even for a 0-byte write or a failure.
3763 * 3763 *
3764 * In the successful AIO DIO case, the io_end structure will be 3764 * In the successful AIO DIO case, the io_end structure will be
3765 * destroyed and the reference to the inode will be dropped 3765 * destroyed and the reference to the inode will be dropped
3766 * after the end_io callback function is called. 3766 * after the end_io callback function is called.
3767 * 3767 *
3768 * In the 0-byte write or error case, since 3768 * In the 0-byte write or error case, since
3769 * VFS direct IO won't invoke the end_io callback function, 3769 * VFS direct IO won't invoke the end_io callback function,
3770 * we need to free the end_io structure here. 3770 * we need to free the end_io structure here.
3771 */ 3771 */
3772 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3772 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3773 ext4_free_io_end(iocb->private); 3773 ext4_free_io_end(iocb->private);
3774 iocb->private = NULL; 3774 iocb->private = NULL;
3775 } else if (ret > 0 && ext4_test_inode_state(inode, 3775 } else if (ret > 0 && ext4_test_inode_state(inode,
3776 EXT4_STATE_DIO_UNWRITTEN)) { 3776 EXT4_STATE_DIO_UNWRITTEN)) {
3777 int err; 3777 int err;
3778 /* 3778 /*
3779 * For the non-AIO case, since the IO is already 3779 * For the non-AIO case, since the IO is already
3780 * completed, we can do the conversion right here. 3780 * completed, we can do the conversion right here.
3781 */ 3781 */
3782 err = ext4_convert_unwritten_extents(inode, 3782 err = ext4_convert_unwritten_extents(inode,
3783 offset, ret); 3783 offset, ret);
3784 if (err < 0) 3784 if (err < 0)
3785 ret = err; 3785 ret = err;
3786 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3786 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3787 } 3787 }
3788 return ret; 3788 return ret;
3789 } 3789 }
3790 3790
3791 /* for writes extending past the end of file, we fall back to the old way */ 3791 /* for writes extending past the end of file, we fall back to the old way */
3792 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 3792 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3793 } 3793 }
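
The tail of ext4_ext_direct_IO() distinguishes three outcomes of the blockdev_direct_IO() call: error or zero-byte sync completion (free the io_end ourselves), successful sync completion with unwritten extents (convert immediately), and queued async IO (leave cleanup to the callback). A minimal sketch of just that decision; the classify helper is an invented illustration, and EIOCBQUEUED's value is assumed from include/linux/errno.h:

    #include <stdio.h>

    #define EIOCBQUEUED 529     /* assumed kernel value of -EIOCBQUEUED */

    enum dio_outcome { FREE_IO_END, CONVERT_NOW, LEAVE_FOR_CALLBACK };

    static enum dio_outcome classify(long ret, int have_io_end, int unwritten)
    {
            if (ret != -EIOCBQUEUED && ret <= 0 && have_io_end)
                    return FREE_IO_END;     /* 0-byte write or error: no callback */
            if (ret > 0 && unwritten)
                    return CONVERT_NOW;     /* sync IO finished: convert here */
            return LEAVE_FOR_CALLBACK;      /* async: end_io cleans up */
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   classify(-5, 1, 0),            /* error     -> FREE_IO_END */
                   classify(4096, 1, 1),          /* sync done -> CONVERT_NOW */
                   classify(-EIOCBQUEUED, 1, 0)); /* queued    -> LEAVE_FOR_CALLBACK */
            return 0;
    }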
3794 3794
3795 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3795 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3796 const struct iovec *iov, loff_t offset, 3796 const struct iovec *iov, loff_t offset,
3797 unsigned long nr_segs) 3797 unsigned long nr_segs)
3798 { 3798 {
3799 struct file *file = iocb->ki_filp; 3799 struct file *file = iocb->ki_filp;
3800 struct inode *inode = file->f_mapping->host; 3800 struct inode *inode = file->f_mapping->host;
3801 ssize_t ret; 3801 ssize_t ret;
3802 3802
3803 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); 3803 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
3804 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3804 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3805 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 3805 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3806 else 3806 else
3807 ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 3807 ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3808 trace_ext4_direct_IO_exit(inode, offset, 3808 trace_ext4_direct_IO_exit(inode, offset,
3809 iov_length(iov, nr_segs), rw, ret); 3809 iov_length(iov, nr_segs), rw, ret);
3810 return ret; 3810 return ret;
3811 } 3811 }
3812 3812
3813 /* 3813 /*
3814 * Pages can be marked dirty completely asynchronously from ext4's journalling 3814 * Pages can be marked dirty completely asynchronously from ext4's journalling
3815 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 3815 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
3816 * much here because ->set_page_dirty is called under VFS locks. The page is 3816 * much here because ->set_page_dirty is called under VFS locks. The page is
3817 * not necessarily locked. 3817 * not necessarily locked.
3818 * 3818 *
3819 * We cannot just dirty the page and leave attached buffers clean, because the 3819 * We cannot just dirty the page and leave attached buffers clean, because the
3820 * buffers' dirty state is "definitive". We cannot just set the buffers dirty 3820 * buffers' dirty state is "definitive". We cannot just set the buffers dirty
3821 * or jbddirty because all the journalling code will explode. 3821 * or jbddirty because all the journalling code will explode.
3822 * 3822 *
3823 * So what we do is to mark the page "pending dirty" and next time writepage 3823 * So what we do is to mark the page "pending dirty" and next time writepage
3824 * is called, propagate that into the buffers appropriately. 3824 * is called, propagate that into the buffers appropriately.
3825 */ 3825 */
3826 static int ext4_journalled_set_page_dirty(struct page *page) 3826 static int ext4_journalled_set_page_dirty(struct page *page)
3827 { 3827 {
3828 SetPageChecked(page); 3828 SetPageChecked(page);
3829 return __set_page_dirty_nobuffers(page); 3829 return __set_page_dirty_nobuffers(page);
3830 } 3830 }
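
A hedged sketch of the two-phase "pending dirty" pattern the comment above describes, with plain ints standing in for the kernel's page flags: the constrained path only sets a flag, and the later writepage call reads it back and does the real propagation.

    #include <stdio.h>

    struct page { int checked; int dirty; };

    static void set_page_dirty(struct page *pg)  /* may run under VFS locks */
    {
            pg->checked = 1;                     /* mark "pending dirty" */
            pg->dirty = 1;                       /* dirty the page, not buffers */
    }

    static void writepage(struct page *pg)       /* runs later, may take locks */
    {
            if (pg->checked) {
                    pg->checked = 0;
                    puts("propagating dirty state into buffers");
            }
            pg->dirty = 0;
    }

    int main(void)
    {
            struct page pg = { 0, 0 };

            set_page_dirty(&pg);
            writepage(&pg);
            return 0;
    }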
3831 3831
3832 static const struct address_space_operations ext4_ordered_aops = { 3832 static const struct address_space_operations ext4_ordered_aops = {
3833 .readpage = ext4_readpage, 3833 .readpage = ext4_readpage,
3834 .readpages = ext4_readpages, 3834 .readpages = ext4_readpages,
3835 .writepage = ext4_writepage, 3835 .writepage = ext4_writepage,
3836 .write_begin = ext4_write_begin, 3836 .write_begin = ext4_write_begin,
3837 .write_end = ext4_ordered_write_end, 3837 .write_end = ext4_ordered_write_end,
3838 .bmap = ext4_bmap, 3838 .bmap = ext4_bmap,
3839 .invalidatepage = ext4_invalidatepage, 3839 .invalidatepage = ext4_invalidatepage,
3840 .releasepage = ext4_releasepage, 3840 .releasepage = ext4_releasepage,
3841 .direct_IO = ext4_direct_IO, 3841 .direct_IO = ext4_direct_IO,
3842 .migratepage = buffer_migrate_page, 3842 .migratepage = buffer_migrate_page,
3843 .is_partially_uptodate = block_is_partially_uptodate, 3843 .is_partially_uptodate = block_is_partially_uptodate,
3844 .error_remove_page = generic_error_remove_page, 3844 .error_remove_page = generic_error_remove_page,
3845 }; 3845 };
3846 3846
3847 static const struct address_space_operations ext4_writeback_aops = { 3847 static const struct address_space_operations ext4_writeback_aops = {
3848 .readpage = ext4_readpage, 3848 .readpage = ext4_readpage,
3849 .readpages = ext4_readpages, 3849 .readpages = ext4_readpages,
3850 .writepage = ext4_writepage, 3850 .writepage = ext4_writepage,
3851 .write_begin = ext4_write_begin, 3851 .write_begin = ext4_write_begin,
3852 .write_end = ext4_writeback_write_end, 3852 .write_end = ext4_writeback_write_end,
3853 .bmap = ext4_bmap, 3853 .bmap = ext4_bmap,
3854 .invalidatepage = ext4_invalidatepage, 3854 .invalidatepage = ext4_invalidatepage,
3855 .releasepage = ext4_releasepage, 3855 .releasepage = ext4_releasepage,
3856 .direct_IO = ext4_direct_IO, 3856 .direct_IO = ext4_direct_IO,
3857 .migratepage = buffer_migrate_page, 3857 .migratepage = buffer_migrate_page,
3858 .is_partially_uptodate = block_is_partially_uptodate, 3858 .is_partially_uptodate = block_is_partially_uptodate,
3859 .error_remove_page = generic_error_remove_page, 3859 .error_remove_page = generic_error_remove_page,
3860 }; 3860 };
3861 3861
3862 static const struct address_space_operations ext4_journalled_aops = { 3862 static const struct address_space_operations ext4_journalled_aops = {
3863 .readpage = ext4_readpage, 3863 .readpage = ext4_readpage,
3864 .readpages = ext4_readpages, 3864 .readpages = ext4_readpages,
3865 .writepage = ext4_writepage, 3865 .writepage = ext4_writepage,
3866 .write_begin = ext4_write_begin, 3866 .write_begin = ext4_write_begin,
3867 .write_end = ext4_journalled_write_end, 3867 .write_end = ext4_journalled_write_end,
3868 .set_page_dirty = ext4_journalled_set_page_dirty, 3868 .set_page_dirty = ext4_journalled_set_page_dirty,
3869 .bmap = ext4_bmap, 3869 .bmap = ext4_bmap,
3870 .invalidatepage = ext4_invalidatepage, 3870 .invalidatepage = ext4_invalidatepage,
3871 .releasepage = ext4_releasepage, 3871 .releasepage = ext4_releasepage,
3872 .is_partially_uptodate = block_is_partially_uptodate, 3872 .is_partially_uptodate = block_is_partially_uptodate,
3873 .error_remove_page = generic_error_remove_page, 3873 .error_remove_page = generic_error_remove_page,
3874 }; 3874 };
3875 3875
3876 static const struct address_space_operations ext4_da_aops = { 3876 static const struct address_space_operations ext4_da_aops = {
3877 .readpage = ext4_readpage, 3877 .readpage = ext4_readpage,
3878 .readpages = ext4_readpages, 3878 .readpages = ext4_readpages,
3879 .writepage = ext4_writepage, 3879 .writepage = ext4_writepage,
3880 .writepages = ext4_da_writepages, 3880 .writepages = ext4_da_writepages,
3881 .write_begin = ext4_da_write_begin, 3881 .write_begin = ext4_da_write_begin,
3882 .write_end = ext4_da_write_end, 3882 .write_end = ext4_da_write_end,
3883 .bmap = ext4_bmap, 3883 .bmap = ext4_bmap,
3884 .invalidatepage = ext4_da_invalidatepage, 3884 .invalidatepage = ext4_da_invalidatepage,
3885 .releasepage = ext4_releasepage, 3885 .releasepage = ext4_releasepage,
3886 .direct_IO = ext4_direct_IO, 3886 .direct_IO = ext4_direct_IO,
3887 .migratepage = buffer_migrate_page, 3887 .migratepage = buffer_migrate_page,
3888 .is_partially_uptodate = block_is_partially_uptodate, 3888 .is_partially_uptodate = block_is_partially_uptodate,
3889 .error_remove_page = generic_error_remove_page, 3889 .error_remove_page = generic_error_remove_page,
3890 }; 3890 };
3891 3891
3892 void ext4_set_aops(struct inode *inode) 3892 void ext4_set_aops(struct inode *inode)
3893 { 3893 {
3894 if (ext4_should_order_data(inode) && 3894 if (ext4_should_order_data(inode) &&
3895 test_opt(inode->i_sb, DELALLOC)) 3895 test_opt(inode->i_sb, DELALLOC))
3896 inode->i_mapping->a_ops = &ext4_da_aops; 3896 inode->i_mapping->a_ops = &ext4_da_aops;
3897 else if (ext4_should_order_data(inode)) 3897 else if (ext4_should_order_data(inode))
3898 inode->i_mapping->a_ops = &ext4_ordered_aops; 3898 inode->i_mapping->a_ops = &ext4_ordered_aops;
3899 else if (ext4_should_writeback_data(inode) && 3899 else if (ext4_should_writeback_data(inode) &&
3900 test_opt(inode->i_sb, DELALLOC)) 3900 test_opt(inode->i_sb, DELALLOC))
3901 inode->i_mapping->a_ops = &ext4_da_aops; 3901 inode->i_mapping->a_ops = &ext4_da_aops;
3902 else if (ext4_should_writeback_data(inode)) 3902 else if (ext4_should_writeback_data(inode))
3903 inode->i_mapping->a_ops = &ext4_writeback_aops; 3903 inode->i_mapping->a_ops = &ext4_writeback_aops;
3904 else 3904 else
3905 inode->i_mapping->a_ops = &ext4_journalled_aops; 3905 inode->i_mapping->a_ops = &ext4_journalled_aops;
3906 } 3906 }
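
Selecting one of several function-pointer tables per mode, as ext4_set_aops() does above, is a common C idiom. A toy, runnable reduction (the modes and aops fields here are invented for illustration and omit the delalloc variants):

    #include <stdio.h>

    enum data_mode { ORDERED, WRITEBACK, JOURNALLED };

    struct aops {
            const char *name;
            void (*writepage)(void);
    };

    static void wp_ordered(void)    { puts("ordered writepage"); }
    static void wp_writeback(void)  { puts("writeback writepage"); }
    static void wp_journalled(void) { puts("journalled writepage"); }

    static const struct aops ordered_aops    = { "ordered",    wp_ordered };
    static const struct aops writeback_aops  = { "writeback",  wp_writeback };
    static const struct aops journalled_aops = { "journalled", wp_journalled };

    static const struct aops *select_aops(enum data_mode mode)
    {
            switch (mode) {
            case ORDERED:   return &ordered_aops;
            case WRITEBACK: return &writeback_aops;
            default:        return &journalled_aops;
            }
    }

    int main(void)
    {
            const struct aops *a = select_aops(ORDERED);

            printf("using %s aops\n", a->name);
            a->writepage();
            return 0;
    }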
3907 3907
3908 /* 3908 /*
3909 * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 3909 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3910 * up to the end of the block which corresponds to `from'. 3910 * up to the end of the block which corresponds to `from'.
3911 * This is required during truncate. We need to physically zero the tail end 3911 * This is required during truncate. We need to physically zero the tail end
3912 * of that block so it doesn't yield old data if the file is later grown. 3912 * of that block so it doesn't yield old data if the file is later grown.
3913 */ 3913 */
3914 int ext4_block_truncate_page(handle_t *handle, 3914 int ext4_block_truncate_page(handle_t *handle,
3915 struct address_space *mapping, loff_t from) 3915 struct address_space *mapping, loff_t from)
3916 { 3916 {
3917 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3917 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3918 unsigned length; 3918 unsigned length;
3919 unsigned blocksize; 3919 unsigned blocksize;
3920 struct inode *inode = mapping->host; 3920 struct inode *inode = mapping->host;
3921 3921
3922 blocksize = inode->i_sb->s_blocksize; 3922 blocksize = inode->i_sb->s_blocksize;
3923 length = blocksize - (offset & (blocksize - 1)); 3923 length = blocksize - (offset & (blocksize - 1));
3924 3924
3925 return ext4_block_zero_page_range(handle, mapping, from, length); 3925 return ext4_block_zero_page_range(handle, mapping, from, length);
3926 } 3926 }
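
The two mask operations above are easier to trust with concrete numbers. A standalone sketch, assuming 4K pages and 1K blocks purely as example values:

    #include <stdio.h>

    int main(void)
    {
            const unsigned page_size = 4096, blocksize = 1024;
            long from = 6500;   /* example new i_size: not block-aligned */

            unsigned offset = from & (page_size - 1);   /* 2404: byte within page */
            unsigned length = blocksize - (offset & (blocksize - 1));   /* 668 */

            /* 2404 + 668 = 3072: exactly the next block boundary in the page */
            printf("zero %u bytes at page offset %u\n", length, offset);
            return 0;
    }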
3927 3927
3928 /* 3928 /*
3929 * ext4_block_zero_page_range() zeros out a mapping of length 'length' 3929 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3930 * starting from file offset 'from'. The range to be zeroed must 3930 * starting from file offset 'from'. The range to be zeroed must
3931 * be contained within one block. If the specified range exceeds 3931 * be contained within one block. If the specified range exceeds
3932 * the end of the block, it will be shortened to the end of the block 3932 * the end of the block, it will be shortened to the end of the block
3933 * that corresponds to 'from'. 3933 * that corresponds to 'from'.
3934 */ 3934 */
3935 int ext4_block_zero_page_range(handle_t *handle, 3935 int ext4_block_zero_page_range(handle_t *handle,
3936 struct address_space *mapping, loff_t from, loff_t length) 3936 struct address_space *mapping, loff_t from, loff_t length)
3937 { 3937 {
3938 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3938 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3939 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3939 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3940 unsigned blocksize, max, pos; 3940 unsigned blocksize, max, pos;
3941 ext4_lblk_t iblock; 3941 ext4_lblk_t iblock;
3942 struct inode *inode = mapping->host; 3942 struct inode *inode = mapping->host;
3943 struct buffer_head *bh; 3943 struct buffer_head *bh;
3944 struct page *page; 3944 struct page *page;
3945 int err = 0; 3945 int err = 0;
3946 3946
3947 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, 3947 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3948 mapping_gfp_mask(mapping) & ~__GFP_FS); 3948 mapping_gfp_mask(mapping) & ~__GFP_FS);
3949 if (!page) 3949 if (!page)
3950 return -EINVAL; 3950 return -EINVAL;
3951 3951
3952 blocksize = inode->i_sb->s_blocksize; 3952 blocksize = inode->i_sb->s_blocksize;
3953 max = blocksize - (offset & (blocksize - 1)); 3953 max = blocksize - (offset & (blocksize - 1));
3954 3954
3955 /* 3955 /*
3956 * correct length if it does not fall between 3956 * correct length if it does not fall between
3957 * 'from' and the end of the block 3957 * 'from' and the end of the block
3958 */ 3958 */
3959 if (length > max || length < 0) 3959 if (length > max || length < 0)
3960 length = max; 3960 length = max;
3961 3961
3962 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3962 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3963 3963
3964 if (!page_has_buffers(page)) 3964 if (!page_has_buffers(page))
3965 create_empty_buffers(page, blocksize, 0); 3965 create_empty_buffers(page, blocksize, 0);
3966 3966
3967 /* Find the buffer that contains "offset" */ 3967 /* Find the buffer that contains "offset" */
3968 bh = page_buffers(page); 3968 bh = page_buffers(page);
3969 pos = blocksize; 3969 pos = blocksize;
3970 while (offset >= pos) { 3970 while (offset >= pos) {
3971 bh = bh->b_this_page; 3971 bh = bh->b_this_page;
3972 iblock++; 3972 iblock++;
3973 pos += blocksize; 3973 pos += blocksize;
3974 } 3974 }
3975 3975
3976 err = 0; 3976 err = 0;
3977 if (buffer_freed(bh)) { 3977 if (buffer_freed(bh)) {
3978 BUFFER_TRACE(bh, "freed: skip"); 3978 BUFFER_TRACE(bh, "freed: skip");
3979 goto unlock; 3979 goto unlock;
3980 } 3980 }
3981 3981
3982 if (!buffer_mapped(bh)) { 3982 if (!buffer_mapped(bh)) {
3983 BUFFER_TRACE(bh, "unmapped"); 3983 BUFFER_TRACE(bh, "unmapped");
3984 ext4_get_block(inode, iblock, bh, 0); 3984 ext4_get_block(inode, iblock, bh, 0);
3985 /* unmapped? It's a hole - nothing to do */ 3985 /* unmapped? It's a hole - nothing to do */
3986 if (!buffer_mapped(bh)) { 3986 if (!buffer_mapped(bh)) {
3987 BUFFER_TRACE(bh, "still unmapped"); 3987 BUFFER_TRACE(bh, "still unmapped");
3988 goto unlock; 3988 goto unlock;
3989 } 3989 }
3990 } 3990 }
3991 3991
3992 /* Ok, it's mapped. Make sure it's up-to-date */ 3992 /* Ok, it's mapped. Make sure it's up-to-date */
3993 if (PageUptodate(page)) 3993 if (PageUptodate(page))
3994 set_buffer_uptodate(bh); 3994 set_buffer_uptodate(bh);
3995 3995
3996 if (!buffer_uptodate(bh)) { 3996 if (!buffer_uptodate(bh)) {
3997 err = -EIO; 3997 err = -EIO;
3998 ll_rw_block(READ, 1, &bh); 3998 ll_rw_block(READ, 1, &bh);
3999 wait_on_buffer(bh); 3999 wait_on_buffer(bh);
4000 /* Uhhuh. Read error. Complain and punt. */ 4000 /* Uhhuh. Read error. Complain and punt. */
4001 if (!buffer_uptodate(bh)) 4001 if (!buffer_uptodate(bh))
4002 goto unlock; 4002 goto unlock;
4003 } 4003 }
4004 4004
4005 if (ext4_should_journal_data(inode)) { 4005 if (ext4_should_journal_data(inode)) {
4006 BUFFER_TRACE(bh, "get write access"); 4006 BUFFER_TRACE(bh, "get write access");
4007 err = ext4_journal_get_write_access(handle, bh); 4007 err = ext4_journal_get_write_access(handle, bh);
4008 if (err) 4008 if (err)
4009 goto unlock; 4009 goto unlock;
4010 } 4010 }
4011 4011
4012 zero_user(page, offset, length); 4012 zero_user(page, offset, length);
4013 4013
4014 BUFFER_TRACE(bh, "zeroed end of block"); 4014 BUFFER_TRACE(bh, "zeroed end of block");
4015 4015
4016 err = 0; 4016 err = 0;
4017 if (ext4_should_journal_data(inode)) { 4017 if (ext4_should_journal_data(inode)) {
4018 err = ext4_handle_dirty_metadata(handle, inode, bh); 4018 err = ext4_handle_dirty_metadata(handle, inode, bh);
4019 } else { 4019 } else {
4020 if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode) 4020 if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
4021 err = ext4_jbd2_file_inode(handle, inode); 4021 err = ext4_jbd2_file_inode(handle, inode);
4022 mark_buffer_dirty(bh); 4022 mark_buffer_dirty(bh);
4023 } 4023 }
4024 4024
4025 unlock: 4025 unlock:
4026 unlock_page(page); 4026 unlock_page(page);
4027 page_cache_release(page); 4027 page_cache_release(page);
4028 return err; 4028 return err;
4029 } 4029 }
4030 4030
4031 /* 4031 /*
4032 * Probably it should be a library function... search for first non-zero word 4032 * Probably it should be a library function... search for first non-zero word
4033 * or memcmp with zero_page, whatever is better for particular architecture. 4033 * or memcmp with zero_page, whatever is better for particular architecture.
4034 * Linus? 4034 * Linus?
4035 */ 4035 */
4036 static inline int all_zeroes(__le32 *p, __le32 *q) 4036 static inline int all_zeroes(__le32 *p, __le32 *q)
4037 { 4037 {
4038 while (p < q) 4038 while (p < q)
4039 if (*p++) 4039 if (*p++)
4040 return 0; 4040 return 0;
4041 return 1; 4041 return 1;
4042 } 4042 }
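
all_zeroes() is small enough to exercise in userspace directly. A quick sketch with uint32_t standing in for __le32 (byte order is irrelevant when testing against zero):

    #include <stdio.h>
    #include <stdint.h>

    static int all_zeroes(uint32_t *p, uint32_t *q)
    {
            while (p < q)
                    if (*p++)
                            return 0;
            return 1;
    }

    int main(void)
    {
            uint32_t a[4] = { 0, 0, 0, 0 };
            uint32_t b[4] = { 0, 7, 0, 0 };

            printf("%d %d\n", all_zeroes(a, a + 4), all_zeroes(b, b + 4));  /* 1 0 */
            return 0;
    }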
4043 4043
4044 /** 4044 /**
4045 * ext4_find_shared - find the indirect blocks for partial truncation. 4045 * ext4_find_shared - find the indirect blocks for partial truncation.
4046 * @inode: inode in question 4046 * @inode: inode in question
4047 * @depth: depth of the affected branch 4047 * @depth: depth of the affected branch
4048 * @offsets: offsets of pointers in that branch (see ext4_block_to_path) 4048 * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
4049 * @chain: place to store the pointers to partial indirect blocks 4049 * @chain: place to store the pointers to partial indirect blocks
4050 * @top: place to store the (detached) top of the branch 4050 * @top: place to store the (detached) top of the branch
4051 * 4051 *
4052 * This is a helper function used by ext4_truncate(). 4052 * This is a helper function used by ext4_truncate().
4053 * 4053 *
4054 * When we do truncate() we may have to clean the ends of several 4054 * When we do truncate() we may have to clean the ends of several
4055 * indirect blocks but leave the blocks themselves alive. Block is 4055 * indirect blocks but leave the blocks themselves alive. Block is
4056 * partially truncated if some data below the new i_size is referred 4056 * partially truncated if some data below the new i_size is referred
4057 * from it (and it is on the path to the first completely truncated 4057 * from it (and it is on the path to the first completely truncated
4058 * data block, indeed). We have to free the top of that path along 4058 * data block, indeed). We have to free the top of that path along
4059 * with everything to the right of the path. Since no allocation 4059 * with everything to the right of the path. Since no allocation
4060 * past the truncation point is possible until ext4_truncate() 4060 * past the truncation point is possible until ext4_truncate()
4061 * finishes, we may safely do the latter, but top of branch may 4061 * finishes, we may safely do the latter, but top of branch may
4062 * require special attention - pageout below the truncation point 4062 * require special attention - pageout below the truncation point
4063 * might try to populate it. 4063 * might try to populate it.
4064 * 4064 *
4065 * We atomically detach the top of branch from the tree, store the 4065 * We atomically detach the top of branch from the tree, store the
4066 * block number of its root in *@top, pointers to buffer_heads of 4066 * block number of its root in *@top, pointers to buffer_heads of
4067 * partially truncated blocks - in @chain[].bh and pointers to 4067 * partially truncated blocks - in @chain[].bh and pointers to
4068 * their last elements that should not be removed - in 4068 * their last elements that should not be removed - in
4069 * @chain[].p. Return value is the pointer to last filled element 4069 * @chain[].p. Return value is the pointer to last filled element
4070 * of @chain. 4070 * of @chain.
4071 * 4071 *
4072 * The work left to the caller is to do the actual freeing of subtrees: 4072 * The work left to the caller is to do the actual freeing of subtrees:
4073 * a) free the subtree starting from *@top 4073 * a) free the subtree starting from *@top
4074 * b) free the subtrees whose roots are stored in 4074 * b) free the subtrees whose roots are stored in
4075 * (@chain[i].p+1 .. end of @chain[i].bh->b_data) 4075 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
4076 * c) free the subtrees growing from the inode past the @chain[0]. 4076 * c) free the subtrees growing from the inode past the @chain[0].
4077 * (no partially truncated stuff there). */ 4077 * (no partially truncated stuff there). */
4078 4078
4079 static Indirect *ext4_find_shared(struct inode *inode, int depth, 4079 static Indirect *ext4_find_shared(struct inode *inode, int depth,
4080 ext4_lblk_t offsets[4], Indirect chain[4], 4080 ext4_lblk_t offsets[4], Indirect chain[4],
4081 __le32 *top) 4081 __le32 *top)
4082 { 4082 {
4083 Indirect *partial, *p; 4083 Indirect *partial, *p;
4084 int k, err; 4084 int k, err;
4085 4085
4086 *top = 0; 4086 *top = 0;
4087 /* Make k index the deepest non-null offset + 1 */ 4087 /* Make k index the deepest non-null offset + 1 */
4088 for (k = depth; k > 1 && !offsets[k-1]; k--) 4088 for (k = depth; k > 1 && !offsets[k-1]; k--)
4089 ; 4089 ;
4090 partial = ext4_get_branch(inode, k, offsets, chain, &err); 4090 partial = ext4_get_branch(inode, k, offsets, chain, &err);
4091 /* Writer: pointers */ 4091 /* Writer: pointers */
4092 if (!partial) 4092 if (!partial)
4093 partial = chain + k-1; 4093 partial = chain + k-1;
4094 /* 4094 /*
4095 * If the branch acquired continuation since we've looked at it - 4095 * If the branch acquired continuation since we've looked at it -
4096 * fine, it should all survive and (new) top doesn't belong to us. 4096 * fine, it should all survive and (new) top doesn't belong to us.
4097 */ 4097 */
4098 if (!partial->key && *partial->p) 4098 if (!partial->key && *partial->p)
4099 /* Writer: end */ 4099 /* Writer: end */
4100 goto no_top; 4100 goto no_top;
4101 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) 4101 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
4102 ; 4102 ;
4103 /* 4103 /*
4104 * OK, we've found the last block that must survive. The rest of our 4104 * OK, we've found the last block that must survive. The rest of our
4105 * branch should be detached before unlocking. However, if that rest 4105 * branch should be detached before unlocking. However, if that rest
4106 * of branch is all ours and does not grow immediately from the inode 4106 * of branch is all ours and does not grow immediately from the inode
4107 * it's easier to cheat and just decrement partial->p. 4107 * it's easier to cheat and just decrement partial->p.
4108 */ 4108 */
4109 if (p == chain + k - 1 && p > chain) { 4109 if (p == chain + k - 1 && p > chain) {
4110 p->p--; 4110 p->p--;
4111 } else { 4111 } else {
4112 *top = *p->p; 4112 *top = *p->p;
4113 /* Nope, don't do this in ext4. Must leave the tree intact */ 4113 /* Nope, don't do this in ext4. Must leave the tree intact */
4114 #if 0 4114 #if 0
4115 *p->p = 0; 4115 *p->p = 0;
4116 #endif 4116 #endif
4117 } 4117 }
4118 /* Writer: end */ 4118 /* Writer: end */
4119 4119
4120 while (partial > p) { 4120 while (partial > p) {
4121 brelse(partial->bh); 4121 brelse(partial->bh);
4122 partial--; 4122 partial--;
4123 } 4123 }
4124 no_top: 4124 no_top:
4125 return partial; 4125 return partial;
4126 } 4126 }
4127 4127
4128 /* 4128 /*
4129 * Zero a number of block pointers in either an inode or an indirect block. 4129 * Zero a number of block pointers in either an inode or an indirect block.
4130 * If we restart the transaction we must again get write access to the 4130 * If we restart the transaction we must again get write access to the
4131 * indirect block for further modification. 4131 * indirect block for further modification.
4132 * 4132 *
4133 * We release `count' blocks on disk, but (last - first) may be greater 4133 * We release `count' blocks on disk, but (last - first) may be greater
4134 * than `count' because there can be holes in there. 4134 * than `count' because there can be holes in there.
4135 * 4135 *
4136 * Return 0 on success, 1 on invalid block range 4136 * Return 0 on success, 1 on invalid block range
4137 * and < 0 on fatal error. 4137 * and < 0 on fatal error.
4138 */ 4138 */
4139 static int ext4_clear_blocks(handle_t *handle, struct inode *inode, 4139 static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4140 struct buffer_head *bh, 4140 struct buffer_head *bh,
4141 ext4_fsblk_t block_to_free, 4141 ext4_fsblk_t block_to_free,
4142 unsigned long count, __le32 *first, 4142 unsigned long count, __le32 *first,
4143 __le32 *last) 4143 __le32 *last)
4144 { 4144 {
4145 __le32 *p; 4145 __le32 *p;
4146 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; 4146 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
4147 int err; 4147 int err;
4148 4148
4149 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 4149 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4150 flags |= EXT4_FREE_BLOCKS_METADATA; 4150 flags |= EXT4_FREE_BLOCKS_METADATA;
4151 4151
4152 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, 4152 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4153 count)) { 4153 count)) {
4154 EXT4_ERROR_INODE(inode, "attempt to clear invalid " 4154 EXT4_ERROR_INODE(inode, "attempt to clear invalid "
4155 "blocks %llu len %lu", 4155 "blocks %llu len %lu",
4156 (unsigned long long) block_to_free, count); 4156 (unsigned long long) block_to_free, count);
4157 return 1; 4157 return 1;
4158 } 4158 }
4159 4159
4160 if (try_to_extend_transaction(handle, inode)) { 4160 if (try_to_extend_transaction(handle, inode)) {
4161 if (bh) { 4161 if (bh) {
4162 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4162 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4163 err = ext4_handle_dirty_metadata(handle, inode, bh); 4163 err = ext4_handle_dirty_metadata(handle, inode, bh);
4164 if (unlikely(err)) 4164 if (unlikely(err))
4165 goto out_err; 4165 goto out_err;
4166 } 4166 }
4167 err = ext4_mark_inode_dirty(handle, inode); 4167 err = ext4_mark_inode_dirty(handle, inode);
4168 if (unlikely(err)) 4168 if (unlikely(err))
4169 goto out_err; 4169 goto out_err;
4170 err = ext4_truncate_restart_trans(handle, inode, 4170 err = ext4_truncate_restart_trans(handle, inode,
4171 blocks_for_truncate(inode)); 4171 blocks_for_truncate(inode));
4172 if (unlikely(err)) 4172 if (unlikely(err))
4173 goto out_err; 4173 goto out_err;
4174 if (bh) { 4174 if (bh) {
4175 BUFFER_TRACE(bh, "retaking write access"); 4175 BUFFER_TRACE(bh, "retaking write access");
4176 err = ext4_journal_get_write_access(handle, bh); 4176 err = ext4_journal_get_write_access(handle, bh);
4177 if (unlikely(err)) 4177 if (unlikely(err))
4178 goto out_err; 4178 goto out_err;
4179 } 4179 }
4180 } 4180 }
4181 4181
4182 for (p = first; p < last; p++) 4182 for (p = first; p < last; p++)
4183 *p = 0; 4183 *p = 0;
4184 4184
4185 ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); 4185 ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
4186 return 0; 4186 return 0;
4187 out_err: 4187 out_err:
4188 ext4_std_error(inode->i_sb, err); 4188 ext4_std_error(inode->i_sb, err);
4189 return err; 4189 return err;
4190 } 4190 }
4191 4191
4192 /** 4192 /**
4193 * ext4_free_data - free a list of data blocks 4193 * ext4_free_data - free a list of data blocks
4194 * @handle: handle for this transaction 4194 * @handle: handle for this transaction
4195 * @inode: inode we are dealing with 4195 * @inode: inode we are dealing with
4196 * @this_bh: indirect buffer_head which contains *@first and *@last 4196 * @this_bh: indirect buffer_head which contains *@first and *@last
4197 * @first: array of block numbers 4197 * @first: array of block numbers
4198 * @last: points immediately past the end of array 4198 * @last: points immediately past the end of array
4199 * 4199 *
4200 * We are freeing all blocks referred from that array (numbers are stored as 4200 * We are freeing all blocks referred from that array (numbers are stored as
4201 * little-endian 32-bit) and updating @inode->i_blocks appropriately. 4201 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
4202 * 4202 *
4203 * We accumulate contiguous runs of blocks to free. Conveniently, if these 4203 * We accumulate contiguous runs of blocks to free. Conveniently, if these
4204 * blocks are contiguous then releasing them at one time will only affect one 4204 * blocks are contiguous then releasing them at one time will only affect one
4205 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't 4205 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
4206 * actually use a lot of journal space. 4206 * actually use a lot of journal space.
4207 * 4207 *
4208 * @this_bh will be %NULL if @first and @last point into the inode's direct 4208 * @this_bh will be %NULL if @first and @last point into the inode's direct
4209 * block pointers. 4209 * block pointers.
4210 */ 4210 */
4211 static void ext4_free_data(handle_t *handle, struct inode *inode, 4211 static void ext4_free_data(handle_t *handle, struct inode *inode,
4212 struct buffer_head *this_bh, 4212 struct buffer_head *this_bh,
4213 __le32 *first, __le32 *last) 4213 __le32 *first, __le32 *last)
4214 { 4214 {
4215 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ 4215 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */
4216 unsigned long count = 0; /* Number of blocks in the run */ 4216 unsigned long count = 0; /* Number of blocks in the run */
4217 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind 4217 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
4218 corresponding to 4218 corresponding to
4219 block_to_free */ 4219 block_to_free */
4220 ext4_fsblk_t nr; /* Current block # */ 4220 ext4_fsblk_t nr; /* Current block # */
4221 __le32 *p; /* Pointer into inode/ind 4221 __le32 *p; /* Pointer into inode/ind
4222 for current block */ 4222 for current block */
4223 int err = 0; 4223 int err = 0;
4224 4224
4225 if (this_bh) { /* For indirect block */ 4225 if (this_bh) { /* For indirect block */
4226 BUFFER_TRACE(this_bh, "get_write_access"); 4226 BUFFER_TRACE(this_bh, "get_write_access");
4227 err = ext4_journal_get_write_access(handle, this_bh); 4227 err = ext4_journal_get_write_access(handle, this_bh);
4228 /* Important: if we can't update the indirect pointers 4228 /* Important: if we can't update the indirect pointers
4229 * to the blocks, we can't free them. */ 4229 * to the blocks, we can't free them. */
4230 if (err) 4230 if (err)
4231 return; 4231 return;
4232 } 4232 }
4233 4233
4234 for (p = first; p < last; p++) { 4234 for (p = first; p < last; p++) {
4235 nr = le32_to_cpu(*p); 4235 nr = le32_to_cpu(*p);
4236 if (nr) { 4236 if (nr) {
4237 /* accumulate blocks to free if they're contiguous */ 4237 /* accumulate blocks to free if they're contiguous */
4238 if (count == 0) { 4238 if (count == 0) {
4239 block_to_free = nr; 4239 block_to_free = nr;
4240 block_to_free_p = p; 4240 block_to_free_p = p;
4241 count = 1; 4241 count = 1;
4242 } else if (nr == block_to_free + count) { 4242 } else if (nr == block_to_free + count) {
4243 count++; 4243 count++;
4244 } else { 4244 } else {
4245 err = ext4_clear_blocks(handle, inode, this_bh, 4245 err = ext4_clear_blocks(handle, inode, this_bh,
4246 block_to_free, count, 4246 block_to_free, count,
4247 block_to_free_p, p); 4247 block_to_free_p, p);
4248 if (err) 4248 if (err)
4249 break; 4249 break;
4250 block_to_free = nr; 4250 block_to_free = nr;
4251 block_to_free_p = p; 4251 block_to_free_p = p;
4252 count = 1; 4252 count = 1;
4253 } 4253 }
4254 } 4254 }
4255 } 4255 }
4256 4256
4257 if (!err && count > 0) 4257 if (!err && count > 0)
4258 err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, 4258 err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
4259 count, block_to_free_p, p); 4259 count, block_to_free_p, p);
4260 if (err < 0) 4260 if (err < 0)
4261 /* fatal error */ 4261 /* fatal error */
4262 return; 4262 return;
4263 4263
4264 if (this_bh) { 4264 if (this_bh) {
4265 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); 4265 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
4266 4266
4267 /* 4267 /*
4268 * The buffer head should have an attached journal head at this 4268 * The buffer head should have an attached journal head at this
4269 * point. However, if the data is corrupted and an indirect 4269 * point. However, if the data is corrupted and an indirect
4270 * block pointed to itself, it would have been detached when 4270 * block pointed to itself, it would have been detached when
4271 * the block was cleared. Check for this instead of OOPSing. 4271 * the block was cleared. Check for this instead of OOPSing.
4272 */ 4272 */
4273 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 4273 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4274 ext4_handle_dirty_metadata(handle, inode, this_bh); 4274 ext4_handle_dirty_metadata(handle, inode, this_bh);
4275 else 4275 else
4276 EXT4_ERROR_INODE(inode, 4276 EXT4_ERROR_INODE(inode,
4277 "circular indirect block detected at " 4277 "circular indirect block detected at "
4278 "block %llu", 4278 "block %llu",
4279 (unsigned long long) this_bh->b_blocknr); 4279 (unsigned long long) this_bh->b_blocknr);
4280 } 4280 }
4281 } 4281 }
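
The run-accumulation loop is the heart of ext4_free_data(): holes are skipped without terminating a run, and a run is flushed only when the next block number breaks contiguity. A standalone sketch of that logic, with free_run as a printf stand-in for ext4_free_blocks():

    #include <stdio.h>
    #include <stdint.h>

    static void free_run(uint64_t start, unsigned long count)
    {
            printf("free %lu block(s) starting at %llu\n",
                   count, (unsigned long long)start);
    }

    static void free_data(const uint32_t *first, const uint32_t *last)
    {
            uint64_t block_to_free = 0;
            unsigned long count = 0;

            for (const uint32_t *p = first; p < last; p++) {
                    uint64_t nr = *p;

                    if (!nr)
                            continue;   /* a hole: the run can continue past it */
                    if (count == 0) {
                            block_to_free = nr;             /* start a new run */
                            count = 1;
                    } else if (nr == block_to_free + count) {
                            count++;                        /* extends the run */
                    } else {
                            free_run(block_to_free, count); /* flush old run */
                            block_to_free = nr;
                            count = 1;
                    }
            }
            if (count > 0)
                    free_run(block_to_free, count);         /* final run */
    }

    int main(void)
    {
            /* 103 still extends the 100..102 run: the hole only skips a slot */
            uint32_t blocks[] = { 100, 101, 102, 0, 103, 200, 201, 0, 0, 500 };

            free_data(blocks, blocks + sizeof(blocks) / sizeof(blocks[0]));
            return 0;   /* frees 4 @ 100, 2 @ 200, 1 @ 500 */
    }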
4282 4282
4283 /** 4283 /**
4284 * ext4_free_branches - free an array of branches 4284 * ext4_free_branches - free an array of branches
4285 * @handle: JBD handle for this transaction 4285 * @handle: JBD handle for this transaction
4286 * @inode: inode we are dealing with 4286 * @inode: inode we are dealing with
4287 * @parent_bh: the buffer_head which contains *@first and *@last 4287 * @parent_bh: the buffer_head which contains *@first and *@last
4288 * @first: array of block numbers 4288 * @first: array of block numbers
4289 * @last: pointer immediately past the end of array 4289 * @last: pointer immediately past the end of array
4290 * @depth: depth of the branches to free 4290 * @depth: depth of the branches to free
4291 * 4291 *
4292 * We are freeing all blocks referred from these branches (numbers are 4292 * We are freeing all blocks referred from these branches (numbers are
4293 * stored as little-endian 32-bit) and updating @inode->i_blocks 4293 * stored as little-endian 32-bit) and updating @inode->i_blocks
4294 * appropriately. 4294 * appropriately.
4295 */ 4295 */
4296 static void ext4_free_branches(handle_t *handle, struct inode *inode, 4296 static void ext4_free_branches(handle_t *handle, struct inode *inode,
4297 struct buffer_head *parent_bh, 4297 struct buffer_head *parent_bh,
4298 __le32 *first, __le32 *last, int depth) 4298 __le32 *first, __le32 *last, int depth)
4299 { 4299 {
4300 ext4_fsblk_t nr; 4300 ext4_fsblk_t nr;
4301 __le32 *p; 4301 __le32 *p;
4302 4302
4303 if (ext4_handle_is_aborted(handle)) 4303 if (ext4_handle_is_aborted(handle))
4304 return; 4304 return;
4305 4305
4306 if (depth--) { 4306 if (depth--) {
4307 struct buffer_head *bh; 4307 struct buffer_head *bh;
4308 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 4308 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
4309 p = last; 4309 p = last;
4310 while (--p >= first) { 4310 while (--p >= first) {
4311 nr = le32_to_cpu(*p); 4311 nr = le32_to_cpu(*p);
4312 if (!nr) 4312 if (!nr)
4313 continue; /* A hole */ 4313 continue; /* A hole */
4314 4314
4315 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), 4315 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4316 nr, 1)) { 4316 nr, 1)) {
4317 EXT4_ERROR_INODE(inode, 4317 EXT4_ERROR_INODE(inode,
4318 "invalid indirect mapped " 4318 "invalid indirect mapped "
4319 "block %lu (level %d)", 4319 "block %lu (level %d)",
4320 (unsigned long) nr, depth); 4320 (unsigned long) nr, depth);
4321 break; 4321 break;
4322 } 4322 }
4323 4323
4324 /* Go read the buffer for the next level down */ 4324 /* Go read the buffer for the next level down */
4325 bh = sb_bread(inode->i_sb, nr); 4325 bh = sb_bread(inode->i_sb, nr);
4326 4326
4327 /* 4327 /*
4328 * A read failure? Report error and clear slot 4328 * A read failure? Report error and clear slot
4329 * (should be rare). 4329 * (should be rare).
4330 */ 4330 */
4331 if (!bh) { 4331 if (!bh) {
4332 EXT4_ERROR_INODE_BLOCK(inode, nr, 4332 EXT4_ERROR_INODE_BLOCK(inode, nr,
4333 "Read failure"); 4333 "Read failure");
4334 continue; 4334 continue;
4335 } 4335 }
4336 4336
4337 /* This zaps the entire block. Bottom up. */ 4337 /* This zaps the entire block. Bottom up. */
4338 BUFFER_TRACE(bh, "free child branches"); 4338 BUFFER_TRACE(bh, "free child branches");
4339 ext4_free_branches(handle, inode, bh, 4339 ext4_free_branches(handle, inode, bh,
4340 (__le32 *) bh->b_data, 4340 (__le32 *) bh->b_data,
4341 (__le32 *) bh->b_data + addr_per_block, 4341 (__le32 *) bh->b_data + addr_per_block,
4342 depth); 4342 depth);
4343 brelse(bh); 4343 brelse(bh);
4344 4344
4345 /* 4345 /*
4346 * Everything below this pointer has been 4346 * Everything below this pointer has been
4347 * released. Now let this top-of-subtree go. 4347 * released. Now let this top-of-subtree go.
4348 * 4348 *
4349 * We want the freeing of this indirect block to be 4349 * We want the freeing of this indirect block to be
4350 * atomic in the journal with the updating of the 4350 * atomic in the journal with the updating of the
4351 * bitmap block which owns it. So make some room in 4351 * bitmap block which owns it. So make some room in
4352 * the journal. 4352 * the journal.
4353 * 4353 *
4354 * We zero the parent pointer *after* freeing its 4354 * We zero the parent pointer *after* freeing its
4355 * pointee in the bitmaps, so if extend_transaction() 4355 * pointee in the bitmaps, so if extend_transaction()
4356 * for some reason fails to put the bitmap changes and 4356 * for some reason fails to put the bitmap changes and
4357 * the release into the same transaction, recovery 4357 * the release into the same transaction, recovery
4358 * will merely complain about releasing a free block, 4358 * will merely complain about releasing a free block,
4359 * rather than leaking blocks. 4359 * rather than leaking blocks.
4360 */ 4360 */
4361 if (ext4_handle_is_aborted(handle)) 4361 if (ext4_handle_is_aborted(handle))
4362 return; 4362 return;
4363 if (try_to_extend_transaction(handle, inode)) { 4363 if (try_to_extend_transaction(handle, inode)) {
4364 ext4_mark_inode_dirty(handle, inode); 4364 ext4_mark_inode_dirty(handle, inode);
4365 ext4_truncate_restart_trans(handle, inode, 4365 ext4_truncate_restart_trans(handle, inode,
4366 blocks_for_truncate(inode)); 4366 blocks_for_truncate(inode));
4367 } 4367 }
4368 4368
4369 /* 4369 /*
4370 * The forget flag here is critical because if 4370 * The forget flag here is critical because if
4371 * we are journaling (and not doing data 4371 * we are journaling (and not doing data
4372 * journaling), we have to make sure a revoke 4372 * journaling), we have to make sure a revoke
4373 * record is written to prevent the journal 4373 * record is written to prevent the journal
4374 * replay from overwriting the (former) 4374 * replay from overwriting the (former)
4375 * indirect block if it gets reallocated as a 4375 * indirect block if it gets reallocated as a
4376 * data block. This must happen in the same 4376 * data block. This must happen in the same
4377 * transaction where the data blocks are 4377 * transaction where the data blocks are
4378 * actually freed. 4378 * actually freed.
4379 */ 4379 */
4380 ext4_free_blocks(handle, inode, NULL, nr, 1, 4380 ext4_free_blocks(handle, inode, NULL, nr, 1,
4381 EXT4_FREE_BLOCKS_METADATA| 4381 EXT4_FREE_BLOCKS_METADATA|
4382 EXT4_FREE_BLOCKS_FORGET); 4382 EXT4_FREE_BLOCKS_FORGET);
4383 4383
4384 if (parent_bh) { 4384 if (parent_bh) {
4385 /* 4385 /*
4386 * The block which we have just freed is 4386 * The block which we have just freed is
4387 * pointed to by an indirect block: journal it 4387 * pointed to by an indirect block: journal it
4388 */ 4388 */
4389 BUFFER_TRACE(parent_bh, "get_write_access"); 4389 BUFFER_TRACE(parent_bh, "get_write_access");
4390 if (!ext4_journal_get_write_access(handle, 4390 if (!ext4_journal_get_write_access(handle,
4391 parent_bh)){ 4391 parent_bh)){
4392 *p = 0; 4392 *p = 0;
4393 BUFFER_TRACE(parent_bh, 4393 BUFFER_TRACE(parent_bh,
4394 "call ext4_handle_dirty_metadata"); 4394 "call ext4_handle_dirty_metadata");
4395 ext4_handle_dirty_metadata(handle, 4395 ext4_handle_dirty_metadata(handle,
4396 inode, 4396 inode,
4397 parent_bh); 4397 parent_bh);
4398 } 4398 }
4399 } 4399 }
4400 } 4400 }
4401 } else { 4401 } else {
4402 /* We have reached the bottom of the tree. */ 4402 /* We have reached the bottom of the tree. */
4403 BUFFER_TRACE(parent_bh, "free data blocks"); 4403 BUFFER_TRACE(parent_bh, "free data blocks");
4404 ext4_free_data(handle, inode, parent_bh, first, last); 4404 ext4_free_data(handle, inode, parent_bh, first, last);
4405 } 4405 }
4406 } 4406 }
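
For a feel of the fan-out this recursion walks, here is a stand-alone sketch (not part of this commit; the 4 KiB block size is an assumption) computing the address count per indirect block and how many data blocks a branch of each depth can reference:

    #include <stdio.h>

    int main(void)
    {
            unsigned blocksize = 4096;                 /* assumed fs block size */
            unsigned addr_per_block = blocksize / 4;   /* __le32 entries: 1024 */
            unsigned long long span = 1;
            int depth;

            for (depth = 1; depth <= 3; depth++) {
                    span *= addr_per_block;
                    /* depth 1: 1024, depth 2: ~1M, depth 3: ~1G data blocks */
                    printf("depth %d branch spans %llu data blocks\n", depth, span);
            }
            return 0;
    }
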
4407 4407
4408 int ext4_can_truncate(struct inode *inode) 4408 int ext4_can_truncate(struct inode *inode)
4409 { 4409 {
4410 if (S_ISREG(inode->i_mode)) 4410 if (S_ISREG(inode->i_mode))
4411 return 1; 4411 return 1;
4412 if (S_ISDIR(inode->i_mode)) 4412 if (S_ISDIR(inode->i_mode))
4413 return 1; 4413 return 1;
4414 if (S_ISLNK(inode->i_mode)) 4414 if (S_ISLNK(inode->i_mode))
4415 return !ext4_inode_is_fast_symlink(inode); 4415 return !ext4_inode_is_fast_symlink(inode);
4416 return 0; 4416 return 0;
4417 } 4417 }
4418 4418
4419 /* 4419 /*
4420 * ext4_punch_hole: punches a hole in a file by releasing the blocks 4420 * ext4_punch_hole: punches a hole in a file by releasing the blocks
4421 * associated with the given offset and length 4421 * associated with the given offset and length
4422 * 4422 *
4423 * @inode: File inode 4423 * @inode: File inode
4424 * @offset: The offset where the hole will begin 4424 * @offset: The offset where the hole will begin
4425 * @len: The length of the hole 4425 * @len: The length of the hole
4426 * 4426 *
4427 * Returns: 0 on success or negative on failure 4427 * Returns: 0 on success or negative on failure
4428 */ 4428 */
4429 4429
4430 int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) 4430 int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
4431 { 4431 {
4432 struct inode *inode = file->f_path.dentry->d_inode; 4432 struct inode *inode = file->f_path.dentry->d_inode;
4433 if (!S_ISREG(inode->i_mode)) 4433 if (!S_ISREG(inode->i_mode))
4434 return -ENOTSUPP; 4434 return -ENOTSUPP;
4435 4435
4436 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 4436 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4437 /* TODO: Add support for non-extent hole punching */ 4437 /* TODO: Add support for non-extent hole punching */
4438 return -ENOTSUPP; 4438 return -ENOTSUPP;
4439 } 4439 }
4440 4440
4441 return ext4_ext_punch_hole(file, offset, length); 4441 return ext4_ext_punch_hole(file, offset, length);
4442 } 4442 }
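
ext4_punch_hole() is reached from the fallocate(2) path; a minimal userspace trigger looks like the sketch below (the filename is an assumption, and punching requires FALLOC_FL_KEEP_SIZE alongside FALLOC_FL_PUNCH_HOLE):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <linux/falloc.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("testfile", O_RDWR);          /* assumed existing file */
            if (fd < 0)
                    return 1;
            /* Punch a 1 MiB hole at offset 4 MiB, keeping the file size. */
            if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                          4 << 20, 1 << 20) < 0)
                    perror("fallocate");        /* fails on non-extent files here */
            close(fd);
            return 0;
    }
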
4443 4443
4444 /* 4444 /*
4445 * ext4_truncate() 4445 * ext4_truncate()
4446 * 4446 *
4447 * We block out ext4_get_block() block instantiations across the entire 4447 * We block out ext4_get_block() block instantiations across the entire
4448 * transaction, and VFS/VM ensures that ext4_truncate() cannot run 4448 * transaction, and VFS/VM ensures that ext4_truncate() cannot run
4449 * simultaneously on behalf of the same inode. 4449 * simultaneously on behalf of the same inode.
4450 * 4450 *
4451 * As we work through the truncate and commit bits of it to the journal there 4451 * As we work through the truncate and commit bits of it to the journal there
4452 * is one core, guiding principle: the file's tree must always be consistent on 4452 * is one core, guiding principle: the file's tree must always be consistent on
4453 * disk. We must be able to restart the truncate after a crash. 4453 * disk. We must be able to restart the truncate after a crash.
4454 * 4454 *
4455 * The file's tree may be transiently inconsistent in memory (although it 4455 * The file's tree may be transiently inconsistent in memory (although it
4456 * probably isn't), but whenever we close off and commit a journal transaction, 4456 * probably isn't), but whenever we close off and commit a journal transaction,
4457 * the contents of (the filesystem + the journal) must be consistent and 4457 * the contents of (the filesystem + the journal) must be consistent and
4458 * restartable. It's pretty simple, really: bottom up, right to left (although 4458 * restartable. It's pretty simple, really: bottom up, right to left (although
4459 * left-to-right works OK too). 4459 * left-to-right works OK too).
4460 * 4460 *
4461 * Note that at recovery time, journal replay occurs *before* the restart of 4461 * Note that at recovery time, journal replay occurs *before* the restart of
4462 * truncate against the orphan inode list. 4462 * truncate against the orphan inode list.
4463 * 4463 *
4464 * The committed inode has the new, desired i_size (which is the same as 4464 * The committed inode has the new, desired i_size (which is the same as
4465 * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see 4465 * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see
4466 * that this inode's truncate did not complete and it will again call 4466 * that this inode's truncate did not complete and it will again call
4467 * ext4_truncate() to have another go. So there will be instantiated blocks 4467 * ext4_truncate() to have another go. So there will be instantiated blocks
4468 * to the right of the truncation point in a crashed ext4 filesystem. But 4468 * to the right of the truncation point in a crashed ext4 filesystem. But
4469 * that's fine - as long as they are linked from the inode, the post-crash 4469 * that's fine - as long as they are linked from the inode, the post-crash
4470 * ext4_truncate() run will find them and release them. 4470 * ext4_truncate() run will find them and release them.
4471 */ 4471 */
4472 void ext4_truncate(struct inode *inode) 4472 void ext4_truncate(struct inode *inode)
4473 { 4473 {
4474 handle_t *handle; 4474 handle_t *handle;
4475 struct ext4_inode_info *ei = EXT4_I(inode); 4475 struct ext4_inode_info *ei = EXT4_I(inode);
4476 __le32 *i_data = ei->i_data; 4476 __le32 *i_data = ei->i_data;
4477 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 4477 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
4478 struct address_space *mapping = inode->i_mapping; 4478 struct address_space *mapping = inode->i_mapping;
4479 ext4_lblk_t offsets[4]; 4479 ext4_lblk_t offsets[4];
4480 Indirect chain[4]; 4480 Indirect chain[4];
4481 Indirect *partial; 4481 Indirect *partial;
4482 __le32 nr = 0; 4482 __le32 nr = 0;
4483 int n = 0; 4483 int n = 0;
4484 ext4_lblk_t last_block, max_block; 4484 ext4_lblk_t last_block, max_block;
4485 unsigned blocksize = inode->i_sb->s_blocksize; 4485 unsigned blocksize = inode->i_sb->s_blocksize;
4486 4486
4487 trace_ext4_truncate_enter(inode); 4487 trace_ext4_truncate_enter(inode);
4488 4488
4489 if (!ext4_can_truncate(inode)) 4489 if (!ext4_can_truncate(inode))
4490 return; 4490 return;
4491 4491
4492 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 4492 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4493 4493
4494 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4494 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4495 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 4495 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4496 4496
4497 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 4497 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4498 ext4_ext_truncate(inode); 4498 ext4_ext_truncate(inode);
4499 trace_ext4_truncate_exit(inode); 4499 trace_ext4_truncate_exit(inode);
4500 return; 4500 return;
4501 } 4501 }
4502 4502
4503 handle = start_transaction(inode); 4503 handle = start_transaction(inode);
4504 if (IS_ERR(handle)) 4504 if (IS_ERR(handle))
4505 return; /* AKPM: return what? */ 4505 return; /* AKPM: return what? */
4506 4506
4507 last_block = (inode->i_size + blocksize-1) 4507 last_block = (inode->i_size + blocksize-1)
4508 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 4508 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4509 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) 4509 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
4510 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 4510 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4511 4511
4512 if (inode->i_size & (blocksize - 1)) 4512 if (inode->i_size & (blocksize - 1))
4513 if (ext4_block_truncate_page(handle, mapping, inode->i_size)) 4513 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
4514 goto out_stop; 4514 goto out_stop;
4515 4515
4516 if (last_block != max_block) { 4516 if (last_block != max_block) {
4517 n = ext4_block_to_path(inode, last_block, offsets, NULL); 4517 n = ext4_block_to_path(inode, last_block, offsets, NULL);
4518 if (n == 0) 4518 if (n == 0)
4519 goto out_stop; /* error */ 4519 goto out_stop; /* error */
4520 } 4520 }
4521 4521
4522 /* 4522 /*
4523 * OK. This truncate is going to happen. We add the inode to the 4523 * OK. This truncate is going to happen. We add the inode to the
4524 * orphan list, so that if this truncate spans multiple transactions, 4524 * orphan list, so that if this truncate spans multiple transactions,
4525 * and we crash, we will resume the truncate when the filesystem 4525 * and we crash, we will resume the truncate when the filesystem
4526 * recovers. It also marks the inode dirty, to catch the new size. 4526 * recovers. It also marks the inode dirty, to catch the new size.
4527 * 4527 *
4528 * Implication: the file must always be in a sane, consistent 4528 * Implication: the file must always be in a sane, consistent
4529 * truncatable state while each transaction commits. 4529 * truncatable state while each transaction commits.
4530 */ 4530 */
4531 if (ext4_orphan_add(handle, inode)) 4531 if (ext4_orphan_add(handle, inode))
4532 goto out_stop; 4532 goto out_stop;
4533 4533
4534 /* 4534 /*
4535 * From here we block out all ext4_get_block() callers who want to 4535 * From here we block out all ext4_get_block() callers who want to
4536 * modify the block allocation tree. 4536 * modify the block allocation tree.
4537 */ 4537 */
4538 down_write(&ei->i_data_sem); 4538 down_write(&ei->i_data_sem);
4539 4539
4540 ext4_discard_preallocations(inode); 4540 ext4_discard_preallocations(inode);
4541 4541
4542 /* 4542 /*
4543 * The orphan list entry will now protect us from any crash which 4543 * The orphan list entry will now protect us from any crash which
4544 * occurs before the truncate completes, so it is now safe to propagate 4544 * occurs before the truncate completes, so it is now safe to propagate
4545 * the new, shorter inode size (held for now in i_size) into the 4545 * the new, shorter inode size (held for now in i_size) into the
4546 * on-disk inode. We do this via i_disksize, which is the value which 4546 * on-disk inode. We do this via i_disksize, which is the value which
4547 * ext4 *really* writes onto the disk inode. 4547 * ext4 *really* writes onto the disk inode.
4548 */ 4548 */
4549 ei->i_disksize = inode->i_size; 4549 ei->i_disksize = inode->i_size;
4550 4550
4551 if (last_block == max_block) { 4551 if (last_block == max_block) {
4552 /* 4552 /*
4553 * It is unnecessary to free any data blocks if last_block is 4553 * It is unnecessary to free any data blocks if last_block is
4554 * equal to the indirect block limit. 4554 * equal to the indirect block limit.
4555 */ 4555 */
4556 goto out_unlock; 4556 goto out_unlock;
4557 } else if (n == 1) { /* direct blocks */ 4557 } else if (n == 1) { /* direct blocks */
4558 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 4558 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
4559 i_data + EXT4_NDIR_BLOCKS); 4559 i_data + EXT4_NDIR_BLOCKS);
4560 goto do_indirects; 4560 goto do_indirects;
4561 } 4561 }
4562 4562
4563 partial = ext4_find_shared(inode, n, offsets, chain, &nr); 4563 partial = ext4_find_shared(inode, n, offsets, chain, &nr);
4564 /* Kill the top of shared branch (not detached) */ 4564 /* Kill the top of shared branch (not detached) */
4565 if (nr) { 4565 if (nr) {
4566 if (partial == chain) { 4566 if (partial == chain) {
4567 /* Shared branch grows from the inode */ 4567 /* Shared branch grows from the inode */
4568 ext4_free_branches(handle, inode, NULL, 4568 ext4_free_branches(handle, inode, NULL,
4569 &nr, &nr+1, (chain+n-1) - partial); 4569 &nr, &nr+1, (chain+n-1) - partial);
4570 *partial->p = 0; 4570 *partial->p = 0;
4571 /* 4571 /*
4572 * We mark the inode dirty prior to restart, 4572 * We mark the inode dirty prior to restart,
4573 * and prior to stop. No need for it here. 4573 * and prior to stop. No need for it here.
4574 */ 4574 */
4575 } else { 4575 } else {
4576 /* Shared branch grows from an indirect block */ 4576 /* Shared branch grows from an indirect block */
4577 BUFFER_TRACE(partial->bh, "get_write_access"); 4577 BUFFER_TRACE(partial->bh, "get_write_access");
4578 ext4_free_branches(handle, inode, partial->bh, 4578 ext4_free_branches(handle, inode, partial->bh,
4579 partial->p, 4579 partial->p,
4580 partial->p+1, (chain+n-1) - partial); 4580 partial->p+1, (chain+n-1) - partial);
4581 } 4581 }
4582 } 4582 }
4583 /* Clear the ends of indirect blocks on the shared branch */ 4583 /* Clear the ends of indirect blocks on the shared branch */
4584 while (partial > chain) { 4584 while (partial > chain) {
4585 ext4_free_branches(handle, inode, partial->bh, partial->p + 1, 4585 ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
4586 (__le32*)partial->bh->b_data+addr_per_block, 4586 (__le32*)partial->bh->b_data+addr_per_block,
4587 (chain+n-1) - partial); 4587 (chain+n-1) - partial);
4588 BUFFER_TRACE(partial->bh, "call brelse"); 4588 BUFFER_TRACE(partial->bh, "call brelse");
4589 brelse(partial->bh); 4589 brelse(partial->bh);
4590 partial--; 4590 partial--;
4591 } 4591 }
4592 do_indirects: 4592 do_indirects:
4593 /* Kill the remaining (whole) subtrees */ 4593 /* Kill the remaining (whole) subtrees */
4594 switch (offsets[0]) { 4594 switch (offsets[0]) {
4595 default: 4595 default:
4596 nr = i_data[EXT4_IND_BLOCK]; 4596 nr = i_data[EXT4_IND_BLOCK];
4597 if (nr) { 4597 if (nr) {
4598 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); 4598 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
4599 i_data[EXT4_IND_BLOCK] = 0; 4599 i_data[EXT4_IND_BLOCK] = 0;
4600 } 4600 }
4601 case EXT4_IND_BLOCK: 4601 case EXT4_IND_BLOCK:
4602 nr = i_data[EXT4_DIND_BLOCK]; 4602 nr = i_data[EXT4_DIND_BLOCK];
4603 if (nr) { 4603 if (nr) {
4604 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); 4604 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
4605 i_data[EXT4_DIND_BLOCK] = 0; 4605 i_data[EXT4_DIND_BLOCK] = 0;
4606 } 4606 }
4607 case EXT4_DIND_BLOCK: 4607 case EXT4_DIND_BLOCK:
4608 nr = i_data[EXT4_TIND_BLOCK]; 4608 nr = i_data[EXT4_TIND_BLOCK];
4609 if (nr) { 4609 if (nr) {
4610 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); 4610 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
4611 i_data[EXT4_TIND_BLOCK] = 0; 4611 i_data[EXT4_TIND_BLOCK] = 0;
4612 } 4612 }
4613 case EXT4_TIND_BLOCK: 4613 case EXT4_TIND_BLOCK:
4614 ; 4614 ;
4615 } 4615 }
4616 4616
4617 out_unlock: 4617 out_unlock:
4618 up_write(&ei->i_data_sem); 4618 up_write(&ei->i_data_sem);
4619 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4619 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4620 ext4_mark_inode_dirty(handle, inode); 4620 ext4_mark_inode_dirty(handle, inode);
4621 4621
4622 /* 4622 /*
4623 * In a multi-transaction truncate, we only make the final transaction 4623 * In a multi-transaction truncate, we only make the final transaction
4624 * synchronous 4624 * synchronous
4625 */ 4625 */
4626 if (IS_SYNC(inode)) 4626 if (IS_SYNC(inode))
4627 ext4_handle_sync(handle); 4627 ext4_handle_sync(handle);
4628 out_stop: 4628 out_stop:
4629 /* 4629 /*
4630 * If this was a simple ftruncate(), and the file will remain alive 4630 * If this was a simple ftruncate(), and the file will remain alive
4631 * then we need to clear up the orphan record which we created above. 4631 * then we need to clear up the orphan record which we created above.
4632 * However, if this was a real unlink then we were called by 4632 * However, if this was a real unlink then we were called by
4633 * ext4_delete_inode(), and we allow that function to clean up the 4633 * ext4_delete_inode(), and we allow that function to clean up the
4634 * orphan info for us. 4634 * orphan info for us.
4635 */ 4635 */
4636 if (inode->i_nlink) 4636 if (inode->i_nlink)
4637 ext4_orphan_del(handle, inode); 4637 ext4_orphan_del(handle, inode);
4638 4638
4639 ext4_journal_stop(handle); 4639 ext4_journal_stop(handle);
4640 trace_ext4_truncate_exit(inode); 4640 trace_ext4_truncate_exit(inode);
4641 } 4641 }
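
The offsets[4]/n pair computed by ext4_block_to_path() decides which branches get freed above; a simplified re-derivation of the depth boundaries (assuming 4 KiB blocks, so 1024 addresses per indirect block; this is an illustration, not the kernel helper itself):

    #include <stdio.h>

    #define NDIR 12          /* EXT4_NDIR_BLOCKS */
    #define APB  1024        /* addresses per 4 KiB indirect block */

    /* Return tree depth for a logical block, mirroring ext4_block_to_path(). */
    static int depth_for_block(unsigned long b)
    {
            if (b < NDIR)
                    return 1;                               /* direct */
            if ((b -= NDIR) < APB)
                    return 2;                               /* indirect */
            if ((b -= APB) < (unsigned long)APB * APB)
                    return 3;                               /* double indirect */
            return 4;                                       /* triple indirect */
    }

    int main(void)
    {
            printf("%d %d %d %d\n",
                   depth_for_block(11),                     /* 1 */
                   depth_for_block(12),                     /* 2 */
                   depth_for_block(12 + 1024),              /* 3 */
                   depth_for_block(12 + 1024 + 1024UL * 1024)); /* 4 */
            return 0;
    }
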
4642 4642
4643 /* 4643 /*
4644 * ext4_get_inode_loc returns with an extra refcount against the inode's 4644 * ext4_get_inode_loc returns with an extra refcount against the inode's
4645 * underlying buffer_head on success. If 'in_mem' is true, we have all 4645 * underlying buffer_head on success. If 'in_mem' is true, we have all
4646 * data in memory that is needed to recreate the on-disk version of this 4646 * data in memory that is needed to recreate the on-disk version of this
4647 * inode. 4647 * inode.
4648 */ 4648 */
4649 static int __ext4_get_inode_loc(struct inode *inode, 4649 static int __ext4_get_inode_loc(struct inode *inode,
4650 struct ext4_iloc *iloc, int in_mem) 4650 struct ext4_iloc *iloc, int in_mem)
4651 { 4651 {
4652 struct ext4_group_desc *gdp; 4652 struct ext4_group_desc *gdp;
4653 struct buffer_head *bh; 4653 struct buffer_head *bh;
4654 struct super_block *sb = inode->i_sb; 4654 struct super_block *sb = inode->i_sb;
4655 ext4_fsblk_t block; 4655 ext4_fsblk_t block;
4656 int inodes_per_block, inode_offset; 4656 int inodes_per_block, inode_offset;
4657 4657
4658 iloc->bh = NULL; 4658 iloc->bh = NULL;
4659 if (!ext4_valid_inum(sb, inode->i_ino)) 4659 if (!ext4_valid_inum(sb, inode->i_ino))
4660 return -EIO; 4660 return -EIO;
4661 4661
4662 iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); 4662 iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
4663 gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); 4663 gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
4664 if (!gdp) 4664 if (!gdp)
4665 return -EIO; 4665 return -EIO;
4666 4666
4667 /* 4667 /*
4668 * Figure out the offset within the block group inode table 4668 * Figure out the offset within the block group inode table
4669 */ 4669 */
4670 inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; 4670 inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
4671 inode_offset = ((inode->i_ino - 1) % 4671 inode_offset = ((inode->i_ino - 1) %
4672 EXT4_INODES_PER_GROUP(sb)); 4672 EXT4_INODES_PER_GROUP(sb));
4673 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); 4673 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
4674 iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); 4674 iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
4675 4675
4676 bh = sb_getblk(sb, block); 4676 bh = sb_getblk(sb, block);
4677 if (!bh) { 4677 if (!bh) {
4678 EXT4_ERROR_INODE_BLOCK(inode, block, 4678 EXT4_ERROR_INODE_BLOCK(inode, block,
4679 "unable to read itable block"); 4679 "unable to read itable block");
4680 return -EIO; 4680 return -EIO;
4681 } 4681 }
4682 if (!buffer_uptodate(bh)) { 4682 if (!buffer_uptodate(bh)) {
4683 lock_buffer(bh); 4683 lock_buffer(bh);
4684 4684
4685 /* 4685 /*
4686 * If the buffer has the write error flag, we have failed 4686 * If the buffer has the write error flag, we have failed
4687 * to write out another inode in the same block. In this 4687 * to write out another inode in the same block. In this
4688 * case, we don't have to read the block because we may 4688 * case, we don't have to read the block because we may
4689 * read the old inode data successfully. 4689 * read the old inode data successfully.
4690 */ 4690 */
4691 if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) 4691 if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
4692 set_buffer_uptodate(bh); 4692 set_buffer_uptodate(bh);
4693 4693
4694 if (buffer_uptodate(bh)) { 4694 if (buffer_uptodate(bh)) {
4695 /* someone brought it uptodate while we waited */ 4695 /* someone brought it uptodate while we waited */
4696 unlock_buffer(bh); 4696 unlock_buffer(bh);
4697 goto has_buffer; 4697 goto has_buffer;
4698 } 4698 }
4699 4699
4700 /* 4700 /*
4701 * If we have all information of the inode in memory and this 4701 * If we have all information of the inode in memory and this
4702 * is the only valid inode in the block, we need not read the 4702 * is the only valid inode in the block, we need not read the
4703 * block. 4703 * block.
4704 */ 4704 */
4705 if (in_mem) { 4705 if (in_mem) {
4706 struct buffer_head *bitmap_bh; 4706 struct buffer_head *bitmap_bh;
4707 int i, start; 4707 int i, start;
4708 4708
4709 start = inode_offset & ~(inodes_per_block - 1); 4709 start = inode_offset & ~(inodes_per_block - 1);
4710 4710
4711 /* Is the inode bitmap in cache? */ 4711 /* Is the inode bitmap in cache? */
4712 bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); 4712 bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
4713 if (!bitmap_bh) 4713 if (!bitmap_bh)
4714 goto make_io; 4714 goto make_io;
4715 4715
4716 /* 4716 /*
4717 * If the inode bitmap isn't in cache then the 4717 * If the inode bitmap isn't in cache then the
4718 * optimisation may end up performing two reads instead 4718 * optimisation may end up performing two reads instead
4719 * of one, so skip it. 4719 * of one, so skip it.
4720 */ 4720 */
4721 if (!buffer_uptodate(bitmap_bh)) { 4721 if (!buffer_uptodate(bitmap_bh)) {
4722 brelse(bitmap_bh); 4722 brelse(bitmap_bh);
4723 goto make_io; 4723 goto make_io;
4724 } 4724 }
4725 for (i = start; i < start + inodes_per_block; i++) { 4725 for (i = start; i < start + inodes_per_block; i++) {
4726 if (i == inode_offset) 4726 if (i == inode_offset)
4727 continue; 4727 continue;
4728 if (ext4_test_bit(i, bitmap_bh->b_data)) 4728 if (ext4_test_bit(i, bitmap_bh->b_data))
4729 break; 4729 break;
4730 } 4730 }
4731 brelse(bitmap_bh); 4731 brelse(bitmap_bh);
4732 if (i == start + inodes_per_block) { 4732 if (i == start + inodes_per_block) {
4733 /* all other inodes are free, so skip I/O */ 4733 /* all other inodes are free, so skip I/O */
4734 memset(bh->b_data, 0, bh->b_size); 4734 memset(bh->b_data, 0, bh->b_size);
4735 set_buffer_uptodate(bh); 4735 set_buffer_uptodate(bh);
4736 unlock_buffer(bh); 4736 unlock_buffer(bh);
4737 goto has_buffer; 4737 goto has_buffer;
4738 } 4738 }
4739 } 4739 }
4740 4740
4741 make_io: 4741 make_io:
4742 /* 4742 /*
4743 * If we need to do any I/O, try to pre-readahead extra 4743 * If we need to do any I/O, try to pre-readahead extra
4744 * blocks from the inode table. 4744 * blocks from the inode table.
4745 */ 4745 */
4746 if (EXT4_SB(sb)->s_inode_readahead_blks) { 4746 if (EXT4_SB(sb)->s_inode_readahead_blks) {
4747 ext4_fsblk_t b, end, table; 4747 ext4_fsblk_t b, end, table;
4748 unsigned num; 4748 unsigned num;
4749 4749
4750 table = ext4_inode_table(sb, gdp); 4750 table = ext4_inode_table(sb, gdp);
4751 /* s_inode_readahead_blks is always a power of 2 */ 4751 /* s_inode_readahead_blks is always a power of 2 */
4752 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); 4752 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
4753 if (table > b) 4753 if (table > b)
4754 b = table; 4754 b = table;
4755 end = b + EXT4_SB(sb)->s_inode_readahead_blks; 4755 end = b + EXT4_SB(sb)->s_inode_readahead_blks;
4756 num = EXT4_INODES_PER_GROUP(sb); 4756 num = EXT4_INODES_PER_GROUP(sb);
4757 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4757 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
4758 EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) 4758 EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
4759 num -= ext4_itable_unused_count(sb, gdp); 4759 num -= ext4_itable_unused_count(sb, gdp);
4760 table += num / inodes_per_block; 4760 table += num / inodes_per_block;
4761 if (end > table) 4761 if (end > table)
4762 end = table; 4762 end = table;
4763 while (b <= end) 4763 while (b <= end)
4764 sb_breadahead(sb, b++); 4764 sb_breadahead(sb, b++);
4765 } 4765 }
4766 4766
4767 /* 4767 /*
4768 * There are other valid inodes in the buffer, this inode 4768 * There are other valid inodes in the buffer, this inode
4769 * has in-inode xattrs, or we don't have this inode in memory. 4769 * has in-inode xattrs, or we don't have this inode in memory.
4770 * Read the block from disk. 4770 * Read the block from disk.
4771 */ 4771 */
4772 trace_ext4_load_inode(inode); 4772 trace_ext4_load_inode(inode);
4773 get_bh(bh); 4773 get_bh(bh);
4774 bh->b_end_io = end_buffer_read_sync; 4774 bh->b_end_io = end_buffer_read_sync;
4775 submit_bh(READ_META, bh); 4775 submit_bh(READ_META, bh);
4776 wait_on_buffer(bh); 4776 wait_on_buffer(bh);
4777 if (!buffer_uptodate(bh)) { 4777 if (!buffer_uptodate(bh)) {
4778 EXT4_ERROR_INODE_BLOCK(inode, block, 4778 EXT4_ERROR_INODE_BLOCK(inode, block,
4779 "unable to read itable block"); 4779 "unable to read itable block");
4780 brelse(bh); 4780 brelse(bh);
4781 return -EIO; 4781 return -EIO;
4782 } 4782 }
4783 } 4783 }
4784 has_buffer: 4784 has_buffer:
4785 iloc->bh = bh; 4785 iloc->bh = bh;
4786 return 0; 4786 return 0;
4787 } 4787 }
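
The inode-location arithmetic above reduces to a few divisions and modulos; a stand-alone sketch (the group geometry and inode-table start are assumed values, not read from a real superblock):

    #include <stdio.h>

    int main(void)
    {
            unsigned long ino = 12345;              /* assumed inode number */
            unsigned inodes_per_group = 8192;       /* assumed geometry */
            unsigned inodes_per_block = 16;         /* 4096 / 256-byte inodes */
            unsigned inode_size = 256;
            unsigned long long itable_start = 1058; /* assumed table block */

            unsigned long group = (ino - 1) / inodes_per_group;
            unsigned offset     = (ino - 1) % inodes_per_group;
            unsigned long long block = itable_start + offset / inodes_per_block;
            unsigned byte_off   = (offset % inodes_per_block) * inode_size;

            printf("group %lu, block %llu, offset %u\n", group, block, byte_off);
            return 0;
    }
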
4788 4788
4789 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) 4789 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
4790 { 4790 {
4791 /* We have all inode data except xattrs in memory here. */ 4791 /* We have all inode data except xattrs in memory here. */
4792 return __ext4_get_inode_loc(inode, iloc, 4792 return __ext4_get_inode_loc(inode, iloc,
4793 !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); 4793 !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
4794 } 4794 }
4795 4795
4796 void ext4_set_inode_flags(struct inode *inode) 4796 void ext4_set_inode_flags(struct inode *inode)
4797 { 4797 {
4798 unsigned int flags = EXT4_I(inode)->i_flags; 4798 unsigned int flags = EXT4_I(inode)->i_flags;
4799 4799
4800 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 4800 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
4801 if (flags & EXT4_SYNC_FL) 4801 if (flags & EXT4_SYNC_FL)
4802 inode->i_flags |= S_SYNC; 4802 inode->i_flags |= S_SYNC;
4803 if (flags & EXT4_APPEND_FL) 4803 if (flags & EXT4_APPEND_FL)
4804 inode->i_flags |= S_APPEND; 4804 inode->i_flags |= S_APPEND;
4805 if (flags & EXT4_IMMUTABLE_FL) 4805 if (flags & EXT4_IMMUTABLE_FL)
4806 inode->i_flags |= S_IMMUTABLE; 4806 inode->i_flags |= S_IMMUTABLE;
4807 if (flags & EXT4_NOATIME_FL) 4807 if (flags & EXT4_NOATIME_FL)
4808 inode->i_flags |= S_NOATIME; 4808 inode->i_flags |= S_NOATIME;
4809 if (flags & EXT4_DIRSYNC_FL) 4809 if (flags & EXT4_DIRSYNC_FL)
4810 inode->i_flags |= S_DIRSYNC; 4810 inode->i_flags |= S_DIRSYNC;
4811 } 4811 }
4812 4812
4813 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 4813 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
4814 void ext4_get_inode_flags(struct ext4_inode_info *ei) 4814 void ext4_get_inode_flags(struct ext4_inode_info *ei)
4815 { 4815 {
4816 unsigned int vfs_fl; 4816 unsigned int vfs_fl;
4817 unsigned long old_fl, new_fl; 4817 unsigned long old_fl, new_fl;
4818 4818
4819 do { 4819 do {
4820 vfs_fl = ei->vfs_inode.i_flags; 4820 vfs_fl = ei->vfs_inode.i_flags;
4821 old_fl = ei->i_flags; 4821 old_fl = ei->i_flags;
4822 new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL| 4822 new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
4823 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| 4823 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
4824 EXT4_DIRSYNC_FL); 4824 EXT4_DIRSYNC_FL);
4825 if (vfs_fl & S_SYNC) 4825 if (vfs_fl & S_SYNC)
4826 new_fl |= EXT4_SYNC_FL; 4826 new_fl |= EXT4_SYNC_FL;
4827 if (vfs_fl & S_APPEND) 4827 if (vfs_fl & S_APPEND)
4828 new_fl |= EXT4_APPEND_FL; 4828 new_fl |= EXT4_APPEND_FL;
4829 if (vfs_fl & S_IMMUTABLE) 4829 if (vfs_fl & S_IMMUTABLE)
4830 new_fl |= EXT4_IMMUTABLE_FL; 4830 new_fl |= EXT4_IMMUTABLE_FL;
4831 if (vfs_fl & S_NOATIME) 4831 if (vfs_fl & S_NOATIME)
4832 new_fl |= EXT4_NOATIME_FL; 4832 new_fl |= EXT4_NOATIME_FL;
4833 if (vfs_fl & S_DIRSYNC) 4833 if (vfs_fl & S_DIRSYNC)
4834 new_fl |= EXT4_DIRSYNC_FL; 4834 new_fl |= EXT4_DIRSYNC_FL;
4835 } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl); 4835 } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
4836 } 4836 }
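
The do/while-cmpxchg loop above is the standard lock-free read-modify-write pattern; a userspace rendering with C11 atomics behaves the same way (the field name is illustrative):

    #include <stdatomic.h>
    #include <stdio.h>

    static _Atomic unsigned long i_flags;

    /* Set bits in i_flags without taking a lock; retry if we raced. */
    static void set_flag_bits(unsigned long bits)
    {
            unsigned long old_fl, new_fl;

            old_fl = atomic_load(&i_flags);
            do {
                    new_fl = old_fl | bits;
                    /* On failure, old_fl is reloaded with the current value. */
            } while (!atomic_compare_exchange_weak(&i_flags, &old_fl, new_fl));
    }

    int main(void)
    {
            set_flag_bits(0x8);
            printf("%lx\n", (unsigned long)atomic_load(&i_flags));
            return 0;
    }
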
4837 4837
4838 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, 4838 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
4839 struct ext4_inode_info *ei) 4839 struct ext4_inode_info *ei)
4840 { 4840 {
4841 blkcnt_t i_blocks ; 4841 blkcnt_t i_blocks ;
4842 struct inode *inode = &(ei->vfs_inode); 4842 struct inode *inode = &(ei->vfs_inode);
4843 struct super_block *sb = inode->i_sb; 4843 struct super_block *sb = inode->i_sb;
4844 4844
4845 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4845 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
4846 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { 4846 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
4847 /* we are using combined 48 bit field */ 4847 /* we are using combined 48 bit field */
4848 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | 4848 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
4849 le32_to_cpu(raw_inode->i_blocks_lo); 4849 le32_to_cpu(raw_inode->i_blocks_lo);
4850 if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { 4850 if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
4851 /* i_blocks is in units of the file system block size */ 4851 /* i_blocks is in units of the file system block size */
4852 return i_blocks << (inode->i_blkbits - 9); 4852 return i_blocks << (inode->i_blkbits - 9);
4853 } else { 4853 } else {
4854 return i_blocks; 4854 return i_blocks;
4855 } 4855 }
4856 } else { 4856 } else {
4857 return le32_to_cpu(raw_inode->i_blocks_lo); 4857 return le32_to_cpu(raw_inode->i_blocks_lo);
4858 } 4858 }
4859 } 4859 }
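
To make the 48-bit combination concrete, here is a small decode sketch (the raw field values are made up): with the huge_file feature and the HUGE_FILE inode flag set, the count is in fs-block units and gets scaled back to 512-byte sectors.

    #include <stdio.h>

    int main(void)
    {
            unsigned short i_blocks_high = 0x0001;   /* assumed raw fields */
            unsigned int   i_blocks_lo   = 0x00000000;
            int huge_file_flag = 1;                  /* EXT4_INODE_HUGE_FILE set */
            int blkbits = 12;                        /* 4 KiB blocks */

            unsigned long long blocks =
                    ((unsigned long long)i_blocks_high << 32) | i_blocks_lo;

            if (huge_file_flag)
                    blocks <<= (blkbits - 9);        /* fs blocks -> 512B sectors */

            /* 2^32 fs blocks of 4 KiB = 2^35 sectors = 16 TiB of data */
            printf("%llu sectors (%llu GiB)\n", blocks, blocks >> 21);
            return 0;
    }
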
4860 4860
4861 struct inode *ext4_iget(struct super_block *sb, unsigned long ino) 4861 struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4862 { 4862 {
4863 struct ext4_iloc iloc; 4863 struct ext4_iloc iloc;
4864 struct ext4_inode *raw_inode; 4864 struct ext4_inode *raw_inode;
4865 struct ext4_inode_info *ei; 4865 struct ext4_inode_info *ei;
4866 struct inode *inode; 4866 struct inode *inode;
4867 journal_t *journal = EXT4_SB(sb)->s_journal; 4867 journal_t *journal = EXT4_SB(sb)->s_journal;
4868 long ret; 4868 long ret;
4869 int block; 4869 int block;
4870 4870
4871 inode = iget_locked(sb, ino); 4871 inode = iget_locked(sb, ino);
4872 if (!inode) 4872 if (!inode)
4873 return ERR_PTR(-ENOMEM); 4873 return ERR_PTR(-ENOMEM);
4874 if (!(inode->i_state & I_NEW)) 4874 if (!(inode->i_state & I_NEW))
4875 return inode; 4875 return inode;
4876 4876
4877 ei = EXT4_I(inode); 4877 ei = EXT4_I(inode);
4878 iloc.bh = NULL; 4878 iloc.bh = NULL;
4879 4879
4880 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4880 ret = __ext4_get_inode_loc(inode, &iloc, 0);
4881 if (ret < 0) 4881 if (ret < 0)
4882 goto bad_inode; 4882 goto bad_inode;
4883 raw_inode = ext4_raw_inode(&iloc); 4883 raw_inode = ext4_raw_inode(&iloc);
4884 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4884 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
4885 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 4885 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
4886 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 4886 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
4887 if (!(test_opt(inode->i_sb, NO_UID32))) { 4887 if (!(test_opt(inode->i_sb, NO_UID32))) {
4888 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 4888 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
4889 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 4889 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
4890 } 4890 }
4891 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4891 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
4892 4892
4893 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ 4893 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
4894 ei->i_dir_start_lookup = 0; 4894 ei->i_dir_start_lookup = 0;
4895 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4895 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
4896 /* We now have enough fields to check if the inode was active or not. 4896 /* We now have enough fields to check if the inode was active or not.
4897 * This is needed because nfsd might try to access dead inodes 4897 * This is needed because nfsd might try to access dead inodes
4898 * the test is the same one that e2fsck uses 4898 * the test is the same one that e2fsck uses
4899 * NeilBrown 1999oct15 4899 * NeilBrown 1999oct15
4900 */ 4900 */
4901 if (inode->i_nlink == 0) { 4901 if (inode->i_nlink == 0) {
4902 if (inode->i_mode == 0 || 4902 if (inode->i_mode == 0 ||
4903 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 4903 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
4904 /* this inode is deleted */ 4904 /* this inode is deleted */
4905 ret = -ESTALE; 4905 ret = -ESTALE;
4906 goto bad_inode; 4906 goto bad_inode;
4907 } 4907 }
4908 /* The only unlinked inodes we let through here have 4908 /* The only unlinked inodes we let through here have
4909 * valid i_mode and are being read by the orphan 4909 * valid i_mode and are being read by the orphan
4910 * recovery code: that's fine, we're about to complete 4910 * recovery code: that's fine, we're about to complete
4911 * the process of deleting those. */ 4911 * the process of deleting those. */
4912 } 4912 }
4913 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 4913 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
4914 inode->i_blocks = ext4_inode_blocks(raw_inode, ei); 4914 inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
4915 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); 4915 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
4916 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) 4916 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
4917 ei->i_file_acl |= 4917 ei->i_file_acl |=
4918 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 4918 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
4919 inode->i_size = ext4_isize(raw_inode); 4919 inode->i_size = ext4_isize(raw_inode);
4920 ei->i_disksize = inode->i_size; 4920 ei->i_disksize = inode->i_size;
4921 #ifdef CONFIG_QUOTA 4921 #ifdef CONFIG_QUOTA
4922 ei->i_reserved_quota = 0; 4922 ei->i_reserved_quota = 0;
4923 #endif 4923 #endif
4924 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 4924 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
4925 ei->i_block_group = iloc.block_group; 4925 ei->i_block_group = iloc.block_group;
4926 ei->i_last_alloc_group = ~0; 4926 ei->i_last_alloc_group = ~0;
4927 /* 4927 /*
4928 * NOTE! The in-memory inode i_data array is in little-endian order 4928 * NOTE! The in-memory inode i_data array is in little-endian order
4929 * even on big-endian machines: we do NOT byteswap the block numbers! 4929 * even on big-endian machines: we do NOT byteswap the block numbers!
4930 */ 4930 */
4931 for (block = 0; block < EXT4_N_BLOCKS; block++) 4931 for (block = 0; block < EXT4_N_BLOCKS; block++)
4932 ei->i_data[block] = raw_inode->i_block[block]; 4932 ei->i_data[block] = raw_inode->i_block[block];
4933 INIT_LIST_HEAD(&ei->i_orphan); 4933 INIT_LIST_HEAD(&ei->i_orphan);
4934 4934
4935 /* 4935 /*
4936 * Set transaction id's of transactions that have to be committed 4936 * Set transaction id's of transactions that have to be committed
4937 * to finish f[data]sync. We set them to currently running transaction 4937 * to finish f[data]sync. We set them to currently running transaction
4938 * as we cannot be sure that the inode or some of its metadata isn't 4938 * as we cannot be sure that the inode or some of its metadata isn't
4939 * part of the transaction - the inode could have been reclaimed and 4939 * part of the transaction - the inode could have been reclaimed and
4940 * now it is reread from disk. 4940 * now it is reread from disk.
4941 */ 4941 */
4942 if (journal) { 4942 if (journal) {
4943 transaction_t *transaction; 4943 transaction_t *transaction;
4944 tid_t tid; 4944 tid_t tid;
4945 4945
4946 read_lock(&journal->j_state_lock); 4946 read_lock(&journal->j_state_lock);
4947 if (journal->j_running_transaction) 4947 if (journal->j_running_transaction)
4948 transaction = journal->j_running_transaction; 4948 transaction = journal->j_running_transaction;
4949 else 4949 else
4950 transaction = journal->j_committing_transaction; 4950 transaction = journal->j_committing_transaction;
4951 if (transaction) 4951 if (transaction)
4952 tid = transaction->t_tid; 4952 tid = transaction->t_tid;
4953 else 4953 else
4954 tid = journal->j_commit_sequence; 4954 tid = journal->j_commit_sequence;
4955 read_unlock(&journal->j_state_lock); 4955 read_unlock(&journal->j_state_lock);
4956 ei->i_sync_tid = tid; 4956 ei->i_sync_tid = tid;
4957 ei->i_datasync_tid = tid; 4957 ei->i_datasync_tid = tid;
4958 } 4958 }
4959 4959
4960 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4960 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4961 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 4961 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
4962 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 4962 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
4963 EXT4_INODE_SIZE(inode->i_sb)) { 4963 EXT4_INODE_SIZE(inode->i_sb)) {
4964 ret = -EIO; 4964 ret = -EIO;
4965 goto bad_inode; 4965 goto bad_inode;
4966 } 4966 }
4967 if (ei->i_extra_isize == 0) { 4967 if (ei->i_extra_isize == 0) {
4968 /* The extra space is currently unused. Use it. */ 4968 /* The extra space is currently unused. Use it. */
4969 ei->i_extra_isize = sizeof(struct ext4_inode) - 4969 ei->i_extra_isize = sizeof(struct ext4_inode) -
4970 EXT4_GOOD_OLD_INODE_SIZE; 4970 EXT4_GOOD_OLD_INODE_SIZE;
4971 } else { 4971 } else {
4972 __le32 *magic = (void *)raw_inode + 4972 __le32 *magic = (void *)raw_inode +
4973 EXT4_GOOD_OLD_INODE_SIZE + 4973 EXT4_GOOD_OLD_INODE_SIZE +
4974 ei->i_extra_isize; 4974 ei->i_extra_isize;
4975 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) 4975 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
4976 ext4_set_inode_state(inode, EXT4_STATE_XATTR); 4976 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
4977 } 4977 }
4978 } else 4978 } else
4979 ei->i_extra_isize = 0; 4979 ei->i_extra_isize = 0;
4980 4980
4981 EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); 4981 EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
4982 EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); 4982 EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
4983 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); 4983 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
4984 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); 4984 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
4985 4985
4986 inode->i_version = le32_to_cpu(raw_inode->i_disk_version); 4986 inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
4987 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4987 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4988 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4988 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
4989 inode->i_version |= 4989 inode->i_version |=
4990 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 4990 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
4991 } 4991 }
4992 4992
4993 ret = 0; 4993 ret = 0;
4994 if (ei->i_file_acl && 4994 if (ei->i_file_acl &&
4995 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { 4995 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
4996 EXT4_ERROR_INODE(inode, "bad extended attribute block %llu", 4996 EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
4997 ei->i_file_acl); 4997 ei->i_file_acl);
4998 ret = -EIO; 4998 ret = -EIO;
4999 goto bad_inode; 4999 goto bad_inode;
5000 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 5000 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5001 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 5001 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
5002 (S_ISLNK(inode->i_mode) && 5002 (S_ISLNK(inode->i_mode) &&
5003 !ext4_inode_is_fast_symlink(inode))) 5003 !ext4_inode_is_fast_symlink(inode)))
5004 /* Validate extent which is part of inode */ 5004 /* Validate extent which is part of inode */
5005 ret = ext4_ext_check_inode(inode); 5005 ret = ext4_ext_check_inode(inode);
5006 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 5006 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
5007 (S_ISLNK(inode->i_mode) && 5007 (S_ISLNK(inode->i_mode) &&
5008 !ext4_inode_is_fast_symlink(inode))) { 5008 !ext4_inode_is_fast_symlink(inode))) {
5009 /* Validate block references which are part of inode */ 5009 /* Validate block references which are part of inode */
5010 ret = ext4_check_inode_blockref(inode); 5010 ret = ext4_check_inode_blockref(inode);
5011 } 5011 }
5012 if (ret) 5012 if (ret)
5013 goto bad_inode; 5013 goto bad_inode;
5014 5014
5015 if (S_ISREG(inode->i_mode)) { 5015 if (S_ISREG(inode->i_mode)) {
5016 inode->i_op = &ext4_file_inode_operations; 5016 inode->i_op = &ext4_file_inode_operations;
5017 inode->i_fop = &ext4_file_operations; 5017 inode->i_fop = &ext4_file_operations;
5018 ext4_set_aops(inode); 5018 ext4_set_aops(inode);
5019 } else if (S_ISDIR(inode->i_mode)) { 5019 } else if (S_ISDIR(inode->i_mode)) {
5020 inode->i_op = &ext4_dir_inode_operations; 5020 inode->i_op = &ext4_dir_inode_operations;
5021 inode->i_fop = &ext4_dir_operations; 5021 inode->i_fop = &ext4_dir_operations;
5022 } else if (S_ISLNK(inode->i_mode)) { 5022 } else if (S_ISLNK(inode->i_mode)) {
5023 if (ext4_inode_is_fast_symlink(inode)) { 5023 if (ext4_inode_is_fast_symlink(inode)) {
5024 inode->i_op = &ext4_fast_symlink_inode_operations; 5024 inode->i_op = &ext4_fast_symlink_inode_operations;
5025 nd_terminate_link(ei->i_data, inode->i_size, 5025 nd_terminate_link(ei->i_data, inode->i_size,
5026 sizeof(ei->i_data) - 1); 5026 sizeof(ei->i_data) - 1);
5027 } else { 5027 } else {
5028 inode->i_op = &ext4_symlink_inode_operations; 5028 inode->i_op = &ext4_symlink_inode_operations;
5029 ext4_set_aops(inode); 5029 ext4_set_aops(inode);
5030 } 5030 }
5031 } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || 5031 } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
5032 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { 5032 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
5033 inode->i_op = &ext4_special_inode_operations; 5033 inode->i_op = &ext4_special_inode_operations;
5034 if (raw_inode->i_block[0]) 5034 if (raw_inode->i_block[0])
5035 init_special_inode(inode, inode->i_mode, 5035 init_special_inode(inode, inode->i_mode,
5036 old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); 5036 old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
5037 else 5037 else
5038 init_special_inode(inode, inode->i_mode, 5038 init_special_inode(inode, inode->i_mode,
5039 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 5039 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
5040 } else { 5040 } else {
5041 ret = -EIO; 5041 ret = -EIO;
5042 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); 5042 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
5043 goto bad_inode; 5043 goto bad_inode;
5044 } 5044 }
5045 brelse(iloc.bh); 5045 brelse(iloc.bh);
5046 ext4_set_inode_flags(inode); 5046 ext4_set_inode_flags(inode);
5047 unlock_new_inode(inode); 5047 unlock_new_inode(inode);
5048 return inode; 5048 return inode;
5049 5049
5050 bad_inode: 5050 bad_inode:
5051 brelse(iloc.bh); 5051 brelse(iloc.bh);
5052 iget_failed(inode); 5052 iget_failed(inode);
5053 return ERR_PTR(ret); 5053 return ERR_PTR(ret);
5054 } 5054 }
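
The iget_locked()/unlock_new_inode()/iget_failed() dance above is the generic VFS pattern for a filesystem's iget routine; stripped to its skeleton it reads as below (myfs_fill_inode is a hypothetical helper standing in for the on-disk read):

    #include <linux/err.h>
    #include <linux/fs.h>

    /* Hypothetical helper: read on-disk data into a locked, new inode. */
    extern int myfs_fill_inode(struct inode *inode);

    struct inode *myfs_iget(struct super_block *sb, unsigned long ino)
    {
            struct inode *inode = iget_locked(sb, ino);
            int err;

            if (!inode)
                    return ERR_PTR(-ENOMEM);
            if (!(inode->i_state & I_NEW))
                    return inode;            /* cache hit: already initialised */

            err = myfs_fill_inode(inode);
            if (err) {
                    iget_failed(inode);      /* unlocks and drops the bad inode */
                    return ERR_PTR(err);
            }
            unlock_new_inode(inode);         /* publish the now-valid inode */
            return inode;
    }
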
5055 5055
5056 static int ext4_inode_blocks_set(handle_t *handle, 5056 static int ext4_inode_blocks_set(handle_t *handle,
5057 struct ext4_inode *raw_inode, 5057 struct ext4_inode *raw_inode,
5058 struct ext4_inode_info *ei) 5058 struct ext4_inode_info *ei)
5059 { 5059 {
5060 struct inode *inode = &(ei->vfs_inode); 5060 struct inode *inode = &(ei->vfs_inode);
5061 u64 i_blocks = inode->i_blocks; 5061 u64 i_blocks = inode->i_blocks;
5062 struct super_block *sb = inode->i_sb; 5062 struct super_block *sb = inode->i_sb;
5063 5063
5064 if (i_blocks <= ~0U) { 5064 if (i_blocks <= ~0U) {
5065 /* 5065 /*
5066 * i_blocks can be represented in a 32 bit variable 5066 * i_blocks can be represented in a 32 bit variable
5067 * as a multiple of 512 bytes 5067 * as a multiple of 512 bytes
5068 */ 5068 */
5069 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5069 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
5070 raw_inode->i_blocks_high = 0; 5070 raw_inode->i_blocks_high = 0;
5071 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); 5071 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5072 return 0; 5072 return 0;
5073 } 5073 }
5074 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) 5074 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
5075 return -EFBIG; 5075 return -EFBIG;
5076 5076
5077 if (i_blocks <= 0xffffffffffffULL) { 5077 if (i_blocks <= 0xffffffffffffULL) {
5078 /* 5078 /*
5079 * i_blocks can be represented in a 48 bit variable 5079 * i_blocks can be represented in a 48 bit variable
5080 * as a multiple of 512 bytes 5080 * as a multiple of 512 bytes
5081 */ 5081 */
5082 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5082 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
5083 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 5083 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
5084 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); 5084 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5085 } else { 5085 } else {
5086 ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); 5086 ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5087 /* i_blocks is stored in units of the file system block size */ 5087 /* i_blocks is stored in units of the file system block size */
5088 i_blocks = i_blocks >> (inode->i_blkbits - 9); 5088 i_blocks = i_blocks >> (inode->i_blkbits - 9);
5089 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5089 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
5090 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 5090 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
5091 } 5091 }
5092 return 0; 5092 return 0;
5093 } 5093 }
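
The three encodings above cap out at very different file sizes; assuming 4 KiB blocks, the limits work out as follows (pure arithmetic, printed by this sketch):

    #include <stdio.h>

    int main(void)
    {
            /* 32-bit count of 512B sectors: 2^32 * 2^9 = 2 TiB */
            unsigned long long cap32 = (1ULL << 32) << 9;
            /* 48-bit count of 512B sectors: 2^48 * 2^9 = 128 PiB */
            unsigned long long cap48 = (1ULL << 48) << 9;

            printf("32-bit sectors: %llu TiB\n", cap32 >> 40);
            printf("48-bit sectors: %llu PiB\n", cap48 >> 50);
            /* 48-bit count of 4 KiB blocks: 2^48 * 2^12 = 2^60 = 1 EiB */
            printf("48-bit fs blocks: 1 EiB\n");
            return 0;
    }
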
5094 5094
5095 /* 5095 /*
5096 * Post the struct inode info into an on-disk inode location in the 5096 * Post the struct inode info into an on-disk inode location in the
5097 * buffer-cache. This gobbles the caller's reference to the 5097 * buffer-cache. This gobbles the caller's reference to the
5098 * buffer_head in the inode location struct. 5098 * buffer_head in the inode location struct.
5099 * 5099 *
5100 * The caller must have write access to iloc->bh. 5100 * The caller must have write access to iloc->bh.
5101 */ 5101 */
5102 static int ext4_do_update_inode(handle_t *handle, 5102 static int ext4_do_update_inode(handle_t *handle,
5103 struct inode *inode, 5103 struct inode *inode,
5104 struct ext4_iloc *iloc) 5104 struct ext4_iloc *iloc)
5105 { 5105 {
5106 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 5106 struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
5107 struct ext4_inode_info *ei = EXT4_I(inode); 5107 struct ext4_inode_info *ei = EXT4_I(inode);
5108 struct buffer_head *bh = iloc->bh; 5108 struct buffer_head *bh = iloc->bh;
5109 int err = 0, rc, block; 5109 int err = 0, rc, block;
5110 5110
5111 /* For fields not tracked in the in-memory inode, 5111 /* For fields not tracked in the in-memory inode,
5112 * initialise them to zero for new inodes. */ 5112 * initialise them to zero for new inodes. */
5113 if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) 5113 if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
5114 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 5114 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
5115 5115
5116 ext4_get_inode_flags(ei); 5116 ext4_get_inode_flags(ei);
5117 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 5117 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
5118 if (!(test_opt(inode->i_sb, NO_UID32))) { 5118 if (!(test_opt(inode->i_sb, NO_UID32))) {
5119 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); 5119 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
5120 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); 5120 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
5121 /* 5121 /*
5122 * Fix up interoperability with old kernels. Otherwise, old inodes get 5122 * Fix up interoperability with old kernels. Otherwise, old inodes get
5123 * re-used with the upper 16 bits of the uid/gid intact 5123 * re-used with the upper 16 bits of the uid/gid intact
5124 */ 5124 */
5125 if (!ei->i_dtime) { 5125 if (!ei->i_dtime) {
5126 raw_inode->i_uid_high = 5126 raw_inode->i_uid_high =
5127 cpu_to_le16(high_16_bits(inode->i_uid)); 5127 cpu_to_le16(high_16_bits(inode->i_uid));
5128 raw_inode->i_gid_high = 5128 raw_inode->i_gid_high =
5129 cpu_to_le16(high_16_bits(inode->i_gid)); 5129 cpu_to_le16(high_16_bits(inode->i_gid));
5130 } else { 5130 } else {
5131 raw_inode->i_uid_high = 0; 5131 raw_inode->i_uid_high = 0;
5132 raw_inode->i_gid_high = 0; 5132 raw_inode->i_gid_high = 0;
5133 } 5133 }
5134 } else { 5134 } else {
5135 raw_inode->i_uid_low = 5135 raw_inode->i_uid_low =
5136 cpu_to_le16(fs_high2lowuid(inode->i_uid)); 5136 cpu_to_le16(fs_high2lowuid(inode->i_uid));
5137 raw_inode->i_gid_low = 5137 raw_inode->i_gid_low =
5138 cpu_to_le16(fs_high2lowgid(inode->i_gid)); 5138 cpu_to_le16(fs_high2lowgid(inode->i_gid));
5139 raw_inode->i_uid_high = 0; 5139 raw_inode->i_uid_high = 0;
5140 raw_inode->i_gid_high = 0; 5140 raw_inode->i_gid_high = 0;
5141 } 5141 }
5142 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 5142 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
5143 5143
5144 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); 5144 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
5145 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); 5145 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
5146 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); 5146 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
5147 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); 5147 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
5148 5148
5149 if (ext4_inode_blocks_set(handle, raw_inode, ei)) 5149 if (ext4_inode_blocks_set(handle, raw_inode, ei))
5150 goto out_brelse; 5150 goto out_brelse;
5151 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 5151 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
5152 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); 5152 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
5153 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 5153 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
5154 cpu_to_le32(EXT4_OS_HURD)) 5154 cpu_to_le32(EXT4_OS_HURD))
5155 raw_inode->i_file_acl_high = 5155 raw_inode->i_file_acl_high =
5156 cpu_to_le16(ei->i_file_acl >> 32); 5156 cpu_to_le16(ei->i_file_acl >> 32);
5157 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); 5157 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
5158 ext4_isize_set(raw_inode, ei->i_disksize); 5158 ext4_isize_set(raw_inode, ei->i_disksize);
5159 if (ei->i_disksize > 0x7fffffffULL) { 5159 if (ei->i_disksize > 0x7fffffffULL) {
5160 struct super_block *sb = inode->i_sb; 5160 struct super_block *sb = inode->i_sb;
5161 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 5161 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
5162 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || 5162 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
5163 EXT4_SB(sb)->s_es->s_rev_level == 5163 EXT4_SB(sb)->s_es->s_rev_level ==
5164 cpu_to_le32(EXT4_GOOD_OLD_REV)) { 5164 cpu_to_le32(EXT4_GOOD_OLD_REV)) {
5165 /* If this is the first large file 5165 /* If this is the first large file
5166 * created, add a flag to the superblock. 5166 * created, add a flag to the superblock.
5167 */ 5167 */
5168 err = ext4_journal_get_write_access(handle, 5168 err = ext4_journal_get_write_access(handle,
5169 EXT4_SB(sb)->s_sbh); 5169 EXT4_SB(sb)->s_sbh);
5170 if (err) 5170 if (err)
5171 goto out_brelse; 5171 goto out_brelse;
5172 ext4_update_dynamic_rev(sb); 5172 ext4_update_dynamic_rev(sb);
5173 EXT4_SET_RO_COMPAT_FEATURE(sb, 5173 EXT4_SET_RO_COMPAT_FEATURE(sb,
5174 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 5174 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
5175 sb->s_dirt = 1; 5175 sb->s_dirt = 1;
5176 ext4_handle_sync(handle); 5176 ext4_handle_sync(handle);
5177 err = ext4_handle_dirty_metadata(handle, NULL, 5177 err = ext4_handle_dirty_metadata(handle, NULL,
5178 EXT4_SB(sb)->s_sbh); 5178 EXT4_SB(sb)->s_sbh);
5179 } 5179 }
5180 } 5180 }
5181 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 5181 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
5182 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 5182 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
5183 if (old_valid_dev(inode->i_rdev)) { 5183 if (old_valid_dev(inode->i_rdev)) {
5184 raw_inode->i_block[0] = 5184 raw_inode->i_block[0] =
5185 cpu_to_le32(old_encode_dev(inode->i_rdev)); 5185 cpu_to_le32(old_encode_dev(inode->i_rdev));
5186 raw_inode->i_block[1] = 0; 5186 raw_inode->i_block[1] = 0;
5187 } else { 5187 } else {
5188 raw_inode->i_block[0] = 0; 5188 raw_inode->i_block[0] = 0;
5189 raw_inode->i_block[1] = 5189 raw_inode->i_block[1] =
5190 cpu_to_le32(new_encode_dev(inode->i_rdev)); 5190 cpu_to_le32(new_encode_dev(inode->i_rdev));
5191 raw_inode->i_block[2] = 0; 5191 raw_inode->i_block[2] = 0;
5192 } 5192 }
5193 } else 5193 } else
5194 for (block = 0; block < EXT4_N_BLOCKS; block++) 5194 for (block = 0; block < EXT4_N_BLOCKS; block++)
5195 raw_inode->i_block[block] = ei->i_data[block]; 5195 raw_inode->i_block[block] = ei->i_data[block];
5196 5196
5197 raw_inode->i_disk_version = cpu_to_le32(inode->i_version); 5197 raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
5198 if (ei->i_extra_isize) { 5198 if (ei->i_extra_isize) {
5199 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 5199 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
5200 raw_inode->i_version_hi = 5200 raw_inode->i_version_hi =
5201 cpu_to_le32(inode->i_version >> 32); 5201 cpu_to_le32(inode->i_version >> 32);
5202 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 5202 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
5203 } 5203 }
5204 5204
5205 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 5205 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
5206 rc = ext4_handle_dirty_metadata(handle, NULL, bh); 5206 rc = ext4_handle_dirty_metadata(handle, NULL, bh);
5207 if (!err) 5207 if (!err)
5208 err = rc; 5208 err = rc;
5209 ext4_clear_inode_state(inode, EXT4_STATE_NEW); 5209 ext4_clear_inode_state(inode, EXT4_STATE_NEW);
5210 5210
5211 ext4_update_inode_fsync_trans(handle, inode, 0); 5211 ext4_update_inode_fsync_trans(handle, inode, 0);
5212 out_brelse: 5212 out_brelse:
5213 brelse(bh); 5213 brelse(bh);
5214 ext4_std_error(inode->i_sb, err); 5214 ext4_std_error(inode->i_sb, err);
5215 return err; 5215 return err;
5216 } 5216 }
5217 5217
5218 /* 5218 /*
5219 * ext4_write_inode() 5219 * ext4_write_inode()
5220 * 5220 *
5221 * We are called from a few places: 5221 * We are called from a few places:
5222 * 5222 *
5223 * - Within generic_file_write() for O_SYNC files. 5223 * - Within generic_file_write() for O_SYNC files.
5224 * Here, there will be no transaction running. We wait for any running 5224 * Here, there will be no transaction running. We wait for any running
5225 * transaction to commit. 5225 * transaction to commit.
5226 * 5226 *
5227 * - Within sys_sync(), kupdate and such. 5227 * - Within sys_sync(), kupdate and such.
5228 * We wait on commit, if told to. 5228 * We wait on commit, if told to.
5229 * 5229 *
5230 * - Within prune_icache() (PF_MEMALLOC == true) 5230 * - Within prune_icache() (PF_MEMALLOC == true)
5231 * Here we simply return. We can't afford to block kswapd on the 5231 * Here we simply return. We can't afford to block kswapd on the
5232 * journal commit. 5232 * journal commit.
5233 * 5233 *
5234 * In all cases it is actually safe for us to return without doing anything, 5234 * In all cases it is actually safe for us to return without doing anything,
5235 * because the inode has been copied into a raw inode buffer in 5235 * because the inode has been copied into a raw inode buffer in
5236 * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for 5236 * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
5237 * knfsd. 5237 * knfsd.
5238 * 5238 *
5239 * Note that we are absolutely dependent upon all inode dirtiers doing the 5239 * Note that we are absolutely dependent upon all inode dirtiers doing the
5240 * right thing: they *must* call mark_inode_dirty() after dirtying info in 5240 * right thing: they *must* call mark_inode_dirty() after dirtying info in
5241 * which we are interested. 5241 * which we are interested.
5242 * 5242 *
5243 * It would be a bug for them to not do this. The code: 5243 * It would be a bug for them to not do this. The code:
5244 * 5244 *
5245 * mark_inode_dirty(inode) 5245 * mark_inode_dirty(inode)
5246 * stuff(); 5246 * stuff();
5247 * inode->i_size = expr; 5247 * inode->i_size = expr;
5248 * 5248 *
5249 * is in error because a kswapd-driven write_inode() could occur while 5249 * is in error because a kswapd-driven write_inode() could occur while
5250 * `stuff()' is running, and the new i_size will be lost. Plus the inode 5250 * `stuff()' is running, and the new i_size will be lost. Plus the inode
5251 * will no longer be on the superblock's dirty inode list. 5251 * will no longer be on the superblock's dirty inode list.
5252 */ 5252 */
5253 int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) 5253 int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5254 { 5254 {
5255 int err; 5255 int err;
5256 5256
5257 if (current->flags & PF_MEMALLOC) 5257 if (current->flags & PF_MEMALLOC)
5258 return 0; 5258 return 0;
5259 5259
5260 if (EXT4_SB(inode->i_sb)->s_journal) { 5260 if (EXT4_SB(inode->i_sb)->s_journal) {
5261 if (ext4_journal_current_handle()) { 5261 if (ext4_journal_current_handle()) {
5262 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); 5262 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
5263 dump_stack(); 5263 dump_stack();
5264 return -EIO; 5264 return -EIO;
5265 } 5265 }
5266 5266
5267 if (wbc->sync_mode != WB_SYNC_ALL) 5267 if (wbc->sync_mode != WB_SYNC_ALL)
5268 return 0; 5268 return 0;
5269 5269
5270 err = ext4_force_commit(inode->i_sb); 5270 err = ext4_force_commit(inode->i_sb);
5271 } else { 5271 } else {
5272 struct ext4_iloc iloc; 5272 struct ext4_iloc iloc;
5273 5273
5274 err = __ext4_get_inode_loc(inode, &iloc, 0); 5274 err = __ext4_get_inode_loc(inode, &iloc, 0);
5275 if (err) 5275 if (err)
5276 return err; 5276 return err;
5277 if (wbc->sync_mode == WB_SYNC_ALL) 5277 if (wbc->sync_mode == WB_SYNC_ALL)
5278 sync_dirty_buffer(iloc.bh); 5278 sync_dirty_buffer(iloc.bh);
5279 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5279 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5280 EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, 5280 EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
5281 "IO error syncing inode"); 5281 "IO error syncing inode");
5282 err = -EIO; 5282 err = -EIO;
5283 } 5283 }
5284 brelse(iloc.bh); 5284 brelse(iloc.bh);
5285 } 5285 }
5286 return err; 5286 return err;
5287 } 5287 }
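
The comment above ext4_write_inode() shows the buggy ordering; for contrast, a minimal sketch of the rule stated there, with expr and stuff() as the same placeholders (an illustration, not ext4 code):

	inode->i_size = expr;		/* dirty the in-core fields first */
	stuff();
	mark_inode_dirty(inode);	/* then mark dirty, so a concurrent
					 * write_inode() never snapshots the
					 * inode before i_size is updated */
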
5288 5288
5289 /* 5289 /*
5290 * ext4_setattr() 5290 * ext4_setattr()
5291 * 5291 *
5292 * Called from notify_change. 5292 * Called from notify_change.
5293 * 5293 *
5294 * We want to trap VFS attempts to truncate the file as soon as 5294 * We want to trap VFS attempts to truncate the file as soon as
5295 * possible. In particular, we want to make sure that when the VFS 5295 * possible. In particular, we want to make sure that when the VFS
5296 * shrinks i_size, we put the inode on the orphan list and modify 5296 * shrinks i_size, we put the inode on the orphan list and modify
5297 * i_disksize immediately, so that during the subsequent flushing of 5297 * i_disksize immediately, so that during the subsequent flushing of
5298 * dirty pages and freeing of disk blocks, we can guarantee that any 5298 * dirty pages and freeing of disk blocks, we can guarantee that any
5299 * commit will leave the blocks being flushed in an unused state on 5299 * commit will leave the blocks being flushed in an unused state on
5300 * disk. (On recovery, the inode will get truncated and the blocks will 5300 * disk. (On recovery, the inode will get truncated and the blocks will
5301 * be freed, so we have a strong guarantee that no future commit will 5301 * be freed, so we have a strong guarantee that no future commit will
5302 * leave these blocks visible to the user.) 5302 * leave these blocks visible to the user.)
5303 * 5303 *
5304 * Another thing we have to ensure is that if we are in ordered mode 5304 * Another thing we have to ensure is that if we are in ordered mode
5305 * and the inode is still attached to the committing transaction, we must 5305 * and the inode is still attached to the committing transaction, we must
5306 * start writeout of all the dirty pages which are being truncated. 5306 * start writeout of all the dirty pages which are being truncated.
5307 * This way we are sure that all the data written in the previous 5307 * This way we are sure that all the data written in the previous
5308 * transaction are already on disk (truncate waits for pages under 5308 * transaction are already on disk (truncate waits for pages under
5309 * writeback). 5309 * writeback).
5310 * 5310 *
5311 * Called with inode->i_mutex down. 5311 * Called with inode->i_mutex down.
5312 */ 5312 */
5313 int ext4_setattr(struct dentry *dentry, struct iattr *attr) 5313 int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5314 { 5314 {
5315 struct inode *inode = dentry->d_inode; 5315 struct inode *inode = dentry->d_inode;
5316 int error, rc = 0; 5316 int error, rc = 0;
5317 int orphan = 0; 5317 int orphan = 0;
5318 const unsigned int ia_valid = attr->ia_valid; 5318 const unsigned int ia_valid = attr->ia_valid;
5319 5319
5320 error = inode_change_ok(inode, attr); 5320 error = inode_change_ok(inode, attr);
5321 if (error) 5321 if (error)
5322 return error; 5322 return error;
5323 5323
5324 if (is_quota_modification(inode, attr)) 5324 if (is_quota_modification(inode, attr))
5325 dquot_initialize(inode); 5325 dquot_initialize(inode);
5326 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 5326 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
5327 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 5327 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
5328 handle_t *handle; 5328 handle_t *handle;
5329 5329
5330 /* (user+group)*(old+new) structure, inode write (sb, 5330 /* (user+group)*(old+new) structure, inode write (sb,
5331 * inode block, ? - but truncate inode update has it) */ 5331 * inode block, ? - but truncate inode update has it) */
5332 handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ 5332 handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
5333 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); 5333 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
5334 if (IS_ERR(handle)) { 5334 if (IS_ERR(handle)) {
5335 error = PTR_ERR(handle); 5335 error = PTR_ERR(handle);
5336 goto err_out; 5336 goto err_out;
5337 } 5337 }
5338 error = dquot_transfer(inode, attr); 5338 error = dquot_transfer(inode, attr);
5339 if (error) { 5339 if (error) {
5340 ext4_journal_stop(handle); 5340 ext4_journal_stop(handle);
5341 return error; 5341 return error;
5342 } 5342 }
5343 /* Update corresponding info in inode so that everything is in 5343 /* Update corresponding info in inode so that everything is in
5344 * one transaction */ 5344 * one transaction */
5345 if (attr->ia_valid & ATTR_UID) 5345 if (attr->ia_valid & ATTR_UID)
5346 inode->i_uid = attr->ia_uid; 5346 inode->i_uid = attr->ia_uid;
5347 if (attr->ia_valid & ATTR_GID) 5347 if (attr->ia_valid & ATTR_GID)
5348 inode->i_gid = attr->ia_gid; 5348 inode->i_gid = attr->ia_gid;
5349 error = ext4_mark_inode_dirty(handle, inode); 5349 error = ext4_mark_inode_dirty(handle, inode);
5350 ext4_journal_stop(handle); 5350 ext4_journal_stop(handle);
5351 } 5351 }
5352 5352
5353 if (attr->ia_valid & ATTR_SIZE) { 5353 if (attr->ia_valid & ATTR_SIZE) {
5354 inode_dio_wait(inode);
5355
5354 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 5356 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
5355 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5357 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5356 5358
5357 if (attr->ia_size > sbi->s_bitmap_maxbytes) 5359 if (attr->ia_size > sbi->s_bitmap_maxbytes)
5358 return -EFBIG; 5360 return -EFBIG;
5359 } 5361 }
5360 } 5362 }
5361 5363
5362 if (S_ISREG(inode->i_mode) && 5364 if (S_ISREG(inode->i_mode) &&
5363 attr->ia_valid & ATTR_SIZE && 5365 attr->ia_valid & ATTR_SIZE &&
5364 (attr->ia_size < inode->i_size)) { 5366 (attr->ia_size < inode->i_size)) {
5365 handle_t *handle; 5367 handle_t *handle;
5366 5368
5367 handle = ext4_journal_start(inode, 3); 5369 handle = ext4_journal_start(inode, 3);
5368 if (IS_ERR(handle)) { 5370 if (IS_ERR(handle)) {
5369 error = PTR_ERR(handle); 5371 error = PTR_ERR(handle);
5370 goto err_out; 5372 goto err_out;
5371 } 5373 }
5372 if (ext4_handle_valid(handle)) { 5374 if (ext4_handle_valid(handle)) {
5373 error = ext4_orphan_add(handle, inode); 5375 error = ext4_orphan_add(handle, inode);
5374 orphan = 1; 5376 orphan = 1;
5375 } 5377 }
5376 EXT4_I(inode)->i_disksize = attr->ia_size; 5378 EXT4_I(inode)->i_disksize = attr->ia_size;
5377 rc = ext4_mark_inode_dirty(handle, inode); 5379 rc = ext4_mark_inode_dirty(handle, inode);
5378 if (!error) 5380 if (!error)
5379 error = rc; 5381 error = rc;
5380 ext4_journal_stop(handle); 5382 ext4_journal_stop(handle);
5381 5383
5382 if (ext4_should_order_data(inode)) { 5384 if (ext4_should_order_data(inode)) {
5383 error = ext4_begin_ordered_truncate(inode, 5385 error = ext4_begin_ordered_truncate(inode,
5384 attr->ia_size); 5386 attr->ia_size);
5385 if (error) { 5387 if (error) {
5386 /* Do as much error cleanup as possible */ 5388 /* Do as much error cleanup as possible */
5387 handle = ext4_journal_start(inode, 3); 5389 handle = ext4_journal_start(inode, 3);
5388 if (IS_ERR(handle)) { 5390 if (IS_ERR(handle)) {
5389 ext4_orphan_del(NULL, inode); 5391 ext4_orphan_del(NULL, inode);
5390 goto err_out; 5392 goto err_out;
5391 } 5393 }
5392 ext4_orphan_del(handle, inode); 5394 ext4_orphan_del(handle, inode);
5393 orphan = 0; 5395 orphan = 0;
5394 ext4_journal_stop(handle); 5396 ext4_journal_stop(handle);
5395 goto err_out; 5397 goto err_out;
5396 } 5398 }
5397 } 5399 }
5398 } 5400 }
5399 5401
5400 if (attr->ia_valid & ATTR_SIZE) { 5402 if (attr->ia_valid & ATTR_SIZE) {
5401 if (attr->ia_size != i_size_read(inode)) { 5403 if (attr->ia_size != i_size_read(inode)) {
5402 truncate_setsize(inode, attr->ia_size); 5404 truncate_setsize(inode, attr->ia_size);
5403 ext4_truncate(inode); 5405 ext4_truncate(inode);
5404 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) 5406 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
5405 ext4_truncate(inode); 5407 ext4_truncate(inode);
5406 } 5408 }
5407 5409
5408 if (!rc) { 5410 if (!rc) {
5409 setattr_copy(inode, attr); 5411 setattr_copy(inode, attr);
5410 mark_inode_dirty(inode); 5412 mark_inode_dirty(inode);
5411 } 5413 }
5412 5414
5413 /* 5415 /*
5414 * If the call to ext4_truncate failed to get a transaction handle at 5416 * If the call to ext4_truncate failed to get a transaction handle at
5415 * all, we need to clean up the in-core orphan list manually. 5417 * all, we need to clean up the in-core orphan list manually.
5416 */ 5418 */
5417 if (orphan && inode->i_nlink) 5419 if (orphan && inode->i_nlink)
5418 ext4_orphan_del(NULL, inode); 5420 ext4_orphan_del(NULL, inode);
5419 5421
5420 if (!rc && (ia_valid & ATTR_MODE)) 5422 if (!rc && (ia_valid & ATTR_MODE))
5421 rc = ext4_acl_chmod(inode); 5423 rc = ext4_acl_chmod(inode);
5422 5424
5423 err_out: 5425 err_out:
5424 ext4_std_error(inode->i_sb, error); 5426 ext4_std_error(inode->i_sb, error);
5425 if (!error) 5427 if (!error)
5426 error = rc; 5428 error = rc;
5427 return error; 5429 return error;
5428 } 5430 }
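
The inode_dio_wait() call added above is the point of this commit: the wait for in-flight direct I/O moves out of the VFS and into ->setattr, where it can sit under filesystem-specific locking. A minimal sketch of the resulting pattern for a hypothetical filesystem (the examplefs_* name is invented; the helpers are the kernel ones used above):

static int examplefs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if (attr->ia_valid & ATTR_SIZE) {
		/* i_mutex is already held by the VFS, so no new direct
		 * I/O can be issued; drain whatever is still in flight
		 * before changing the size */
		inode_dio_wait(inode);
		truncate_setsize(inode, attr->ia_size);
	}

	setattr_copy(inode, attr);
	mark_inode_dirty(inode);
	return 0;
}
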
5429 5431
5430 int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, 5432 int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
5431 struct kstat *stat) 5433 struct kstat *stat)
5432 { 5434 {
5433 struct inode *inode; 5435 struct inode *inode;
5434 unsigned long delalloc_blocks; 5436 unsigned long delalloc_blocks;
5435 5437
5436 inode = dentry->d_inode; 5438 inode = dentry->d_inode;
5437 generic_fillattr(inode, stat); 5439 generic_fillattr(inode, stat);
5438 5440
5439 /* 5441 /*
5440 * We can't update i_blocks if the block allocation is delayed; 5442 * We can't update i_blocks if the block allocation is delayed;
5441 * otherwise, in the case of a system crash before the real block 5443 * otherwise, in the case of a system crash before the real block
5442 * allocation is done, we would have i_blocks inconsistent with 5444 * allocation is done, we would have i_blocks inconsistent with
5443 * on-disk file blocks. 5445 * on-disk file blocks.
5444 * We always keep i_blocks updated together with real 5446 * We always keep i_blocks updated together with real
5445 * allocation. But so as not to confuse userspace, stat 5447 * allocation. But so as not to confuse userspace, stat
5446 * will return the blocks that include the delayed allocation 5448 * will return the blocks that include the delayed allocation
5447 * blocks for this file. 5449 * blocks for this file.
5448 */ 5450 */
5449 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; 5451 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
5450 5452
5451 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 5453 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
5452 return 0; 5454 return 0;
5453 } 5455 }
5454 5456
5455 static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, 5457 static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5456 int chunk) 5458 int chunk)
5457 { 5459 {
5458 int indirects; 5460 int indirects;
5459 5461
5460 /* if nrblocks are contiguous */ 5462 /* if nrblocks are contiguous */
5461 if (chunk) { 5463 if (chunk) {
5462 /* 5464 /*
5463 * With N contiguous data blocks, we need at most 5465 * With N contiguous data blocks, we need at most
5464 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, 5466 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
5465 * 2 dindirect blocks, and 1 tindirect block 5467 * 2 dindirect blocks, and 1 tindirect block
5466 */ 5468 */
5467 return DIV_ROUND_UP(nrblocks, 5469 return DIV_ROUND_UP(nrblocks,
5468 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; 5470 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
5469 } 5471 }
5470 /* 5472 /*
5471 * if nrblocks are not contiguous, then in the worst case each block touches 5473 * if nrblocks are not contiguous, then in the worst case each block touches
5472 * an indirect block, and each indirect block touches a double indirect 5474 * an indirect block, and each indirect block touches a double indirect
5473 * block, plus a triple indirect block 5475 * block, plus a triple indirect block
5474 */ 5476 */
5475 indirects = nrblocks * 2 + 1; 5477 indirects = nrblocks * 2 + 1;
5476 return indirects; 5478 return indirects;
5477 } 5479 }
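
As a worked example, assuming a 4 KiB block size (so EXT4_ADDR_PER_BLOCK(inode->i_sb) is 1024): mapping 100 contiguous blocks is charged DIV_ROUND_UP(100, 1024) + 4 = 5 index blocks, while 100 discontiguous blocks are charged 100 * 2 + 1 = 201, reflecting the pessimistic assumption that every data block touches its own indirect and double indirect block.
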
5478 5480
5479 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5481 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5480 { 5482 {
5481 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 5483 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5482 return ext4_indirect_trans_blocks(inode, nrblocks, chunk); 5484 return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
5483 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 5485 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
5484 } 5486 }
5485 5487
5486 /* 5488 /*
5487 * Account for index blocks, block group bitmaps and block group 5489 * Account for index blocks, block group bitmaps and block group
5488 * descriptor blocks if data blocks and index blocks are modified; 5490 * descriptor blocks if data blocks and index blocks are modified;
5489 * in the worst case, the index blocks spread over different block groups 5491 * in the worst case, the index blocks spread over different block groups
5490 * 5492 *
5491 * If data blocks are discontiguous, they may spread over 5493 * If data blocks are discontiguous, they may spread over
5492 * different block groups too. If they are contiguous, with flexbg 5494 * different block groups too. If they are contiguous, with flexbg
5493 * they could still cross a block group boundary. 5495 * they could still cross a block group boundary.
5494 * 5496 *
5495 * Also account for superblock, inode, quota and xattr blocks 5497 * Also account for superblock, inode, quota and xattr blocks
5496 */ 5498 */
5497 static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5499 static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5498 { 5500 {
5499 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); 5501 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
5500 int gdpblocks; 5502 int gdpblocks;
5501 int idxblocks; 5503 int idxblocks;
5502 int ret = 0; 5504 int ret = 0;
5503 5505
5504 /* 5506 /*
5505 * How many index blocks do we need to touch to modify nrblocks? 5507 * How many index blocks do we need to touch to modify nrblocks?
5506 * The "chunk" flag indicates whether the nrblocks are 5508 * The "chunk" flag indicates whether the nrblocks are
5507 * physically contiguous on disk. 5509 * physically contiguous on disk.
5508 * 5510 *
5509 * Direct IO and fallocate call get_block to allocate 5511 * Direct IO and fallocate call get_block to allocate
5510 * a single extent at a time, so they can set the "chunk" flag 5512 * a single extent at a time, so they can set the "chunk" flag
5511 */ 5513 */
5512 idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); 5514 idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
5513 5515
5514 ret = idxblocks; 5516 ret = idxblocks;
5515 5517
5516 /* 5518 /*
5517 * Now let's see how many group bitmaps and group descriptors need 5519 * Now let's see how many group bitmaps and group descriptors need
5518 * to be accounted for 5520 * to be accounted for
5519 */ 5521 */
5520 groups = idxblocks; 5522 groups = idxblocks;
5521 if (chunk) 5523 if (chunk)
5522 groups += 1; 5524 groups += 1;
5523 else 5525 else
5524 groups += nrblocks; 5526 groups += nrblocks;
5525 5527
5526 gdpblocks = groups; 5528 gdpblocks = groups;
5527 if (groups > ngroups) 5529 if (groups > ngroups)
5528 groups = ngroups; 5530 groups = ngroups;
5529 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) 5531 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
5530 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; 5532 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
5531 5533
5532 /* bitmaps and block group descriptor blocks */ 5534 /* bitmaps and block group descriptor blocks */
5533 ret += groups + gdpblocks; 5535 ret += groups + gdpblocks;
5534 5536
5535 /* Blocks for super block, inode, quota and xattr blocks */ 5537 /* Blocks for super block, inode, quota and xattr blocks */
5536 ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); 5538 ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
5537 5539
5538 return ret; 5540 return ret;
5539 } 5541 }
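
Continuing the example above for an indirect-mapped inode (chunk set, idxblocks = 5), and assuming the filesystem has more than six block groups and group descriptor blocks: groups = 5 + 1 = 6, gdpblocks = 6, so the total is 5 + 6 + 6 + EXT4_META_TRANS_BLOCKS(inode->i_sb) credits.
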
5540 5542
5541 /* 5543 /*
5542 * Calculate the total number of credits to reserve to fit 5544 * Calculate the total number of credits to reserve to fit
5544 * the modification of a single page into a single transaction, 5546 * the modification of a single page into a single transaction,
5544 * which may include multiple chunks of block allocations. 5546 * which may include multiple chunks of block allocations.
5545 * 5547 *
5546 * This could be called via ext4_write_begin() 5548 * This could be called via ext4_write_begin()
5547 * 5549 *
5548 * We need to consider the worst case, when 5550 * We need to consider the worst case, when
5549 * there is one new block per extent. 5551 * there is one new block per extent.
5550 */ 5552 */
5551 int ext4_writepage_trans_blocks(struct inode *inode) 5553 int ext4_writepage_trans_blocks(struct inode *inode)
5552 { 5554 {
5553 int bpp = ext4_journal_blocks_per_page(inode); 5555 int bpp = ext4_journal_blocks_per_page(inode);
5554 int ret; 5556 int ret;
5555 5557
5556 ret = ext4_meta_trans_blocks(inode, bpp, 0); 5558 ret = ext4_meta_trans_blocks(inode, bpp, 0);
5557 5559
5558 /* Account for data blocks for journalled mode */ 5560 /* Account for data blocks for journalled mode */
5559 if (ext4_should_journal_data(inode)) 5561 if (ext4_should_journal_data(inode))
5560 ret += bpp; 5562 ret += bpp;
5561 return ret; 5563 return ret;
5562 } 5564 }
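
As a rough illustration: with 4 KiB pages on a 4 KiB-block journaled filesystem, ext4_journal_blocks_per_page() returns 1, so the reservation is ext4_meta_trans_blocks(inode, 1, 0) credits, plus one extra credit in data=journal mode for the data block itself.
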
5563 5565
5564 /* 5566 /*
5565 * Calculate the journal credits for a chunk of data modification. 5567 * Calculate the journal credits for a chunk of data modification.
5566 * 5568 *
5567 * This is called from DIO, fallocate or whoever else calls 5569 * This is called from DIO, fallocate or whoever else calls
5568 * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. 5570 * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
5569 * 5571 *
5570 * Journal buffers for data blocks are not included here, as DIO 5572 * Journal buffers for data blocks are not included here, as DIO
5571 * and fallocate do not need to journal data buffers. 5573 * and fallocate do not need to journal data buffers.
5572 */ 5574 */
5573 int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) 5575 int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
5574 { 5576 {
5575 return ext4_meta_trans_blocks(inode, nrblocks, 1); 5577 return ext4_meta_trans_blocks(inode, nrblocks, 1);
5576 } 5578 }
5577 5579
5578 /* 5580 /*
5579 * The caller must have previously called ext4_reserve_inode_write(). 5581 * The caller must have previously called ext4_reserve_inode_write().
5580 * Given this, we know that the caller already has write access to iloc->bh. 5582 * Given this, we know that the caller already has write access to iloc->bh.
5581 */ 5583 */
5582 int ext4_mark_iloc_dirty(handle_t *handle, 5584 int ext4_mark_iloc_dirty(handle_t *handle,
5583 struct inode *inode, struct ext4_iloc *iloc) 5585 struct inode *inode, struct ext4_iloc *iloc)
5584 { 5586 {
5585 int err = 0; 5587 int err = 0;
5586 5588
5587 if (test_opt(inode->i_sb, I_VERSION)) 5589 if (test_opt(inode->i_sb, I_VERSION))
5588 inode_inc_iversion(inode); 5590 inode_inc_iversion(inode);
5589 5591
5590 /* the do_update_inode consumes one bh->b_count */ 5592 /* the do_update_inode consumes one bh->b_count */
5591 get_bh(iloc->bh); 5593 get_bh(iloc->bh);
5592 5594
5593 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ 5595 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
5594 err = ext4_do_update_inode(handle, inode, iloc); 5596 err = ext4_do_update_inode(handle, inode, iloc);
5595 put_bh(iloc->bh); 5597 put_bh(iloc->bh);
5596 return err; 5598 return err;
5597 } 5599 }
5598 5600
5599 /* 5601 /*
5600 * On success, we end up with an outstanding reference count against 5602 * On success, we end up with an outstanding reference count against
5601 * iloc->bh. This _must_ be cleaned up later. 5603 * iloc->bh. This _must_ be cleaned up later.
5602 */ 5604 */
5603 5605
5604 int 5606 int
5605 ext4_reserve_inode_write(handle_t *handle, struct inode *inode, 5607 ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
5606 struct ext4_iloc *iloc) 5608 struct ext4_iloc *iloc)
5607 { 5609 {
5608 int err; 5610 int err;
5609 5611
5610 err = ext4_get_inode_loc(inode, iloc); 5612 err = ext4_get_inode_loc(inode, iloc);
5611 if (!err) { 5613 if (!err) {
5612 BUFFER_TRACE(iloc->bh, "get_write_access"); 5614 BUFFER_TRACE(iloc->bh, "get_write_access");
5613 err = ext4_journal_get_write_access(handle, iloc->bh); 5615 err = ext4_journal_get_write_access(handle, iloc->bh);
5614 if (err) { 5616 if (err) {
5615 brelse(iloc->bh); 5617 brelse(iloc->bh);
5616 iloc->bh = NULL; 5618 iloc->bh = NULL;
5617 } 5619 }
5618 } 5620 }
5619 ext4_std_error(inode->i_sb, err); 5621 ext4_std_error(inode->i_sb, err);
5620 return err; 5622 return err;
5621 } 5623 }
5622 5624
5623 /* 5625 /*
5624 * Expand an inode by new_extra_isize bytes. 5626 * Expand an inode by new_extra_isize bytes.
5625 * Returns 0 on success or negative error number on failure. 5627 * Returns 0 on success or negative error number on failure.
5626 */ 5628 */
5627 static int ext4_expand_extra_isize(struct inode *inode, 5629 static int ext4_expand_extra_isize(struct inode *inode,
5628 unsigned int new_extra_isize, 5630 unsigned int new_extra_isize,
5629 struct ext4_iloc iloc, 5631 struct ext4_iloc iloc,
5630 handle_t *handle) 5632 handle_t *handle)
5631 { 5633 {
5632 struct ext4_inode *raw_inode; 5634 struct ext4_inode *raw_inode;
5633 struct ext4_xattr_ibody_header *header; 5635 struct ext4_xattr_ibody_header *header;
5634 5636
5635 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) 5637 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
5636 return 0; 5638 return 0;
5637 5639
5638 raw_inode = ext4_raw_inode(&iloc); 5640 raw_inode = ext4_raw_inode(&iloc);
5639 5641
5640 header = IHDR(inode, raw_inode); 5642 header = IHDR(inode, raw_inode);
5641 5643
5642 /* No extended attributes present */ 5644 /* No extended attributes present */
5643 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || 5645 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
5644 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { 5646 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
5645 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, 5647 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
5646 new_extra_isize); 5648 new_extra_isize);
5647 EXT4_I(inode)->i_extra_isize = new_extra_isize; 5649 EXT4_I(inode)->i_extra_isize = new_extra_isize;
5648 return 0; 5650 return 0;
5649 } 5651 }
5650 5652
5651 /* try to expand with EAs present */ 5653 /* try to expand with EAs present */
5652 return ext4_expand_extra_isize_ea(inode, new_extra_isize, 5654 return ext4_expand_extra_isize_ea(inode, new_extra_isize,
5653 raw_inode, handle); 5655 raw_inode, handle);
5654 } 5656 }
5655 5657
5656 /* 5658 /*
5657 * What we do here is to mark the in-core inode as clean with respect to inode 5659 * What we do here is to mark the in-core inode as clean with respect to inode
5658 * dirtiness (it may still be data-dirty). 5660 * dirtiness (it may still be data-dirty).
5659 * This means that the in-core inode may be reaped by prune_icache 5661 * This means that the in-core inode may be reaped by prune_icache
5660 * without having to perform any I/O. This is a very good thing, 5662 * without having to perform any I/O. This is a very good thing,
5661 * because *any* task may call prune_icache - even ones which 5663 * because *any* task may call prune_icache - even ones which
5662 * have a transaction open against a different journal. 5664 * have a transaction open against a different journal.
5663 * 5665 *
5664 * Is this cheating? Not really. Sure, we haven't written the 5666 * Is this cheating? Not really. Sure, we haven't written the
5665 * inode out, but prune_icache isn't a user-visible syncing function. 5667 * inode out, but prune_icache isn't a user-visible syncing function.
5666 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) 5668 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
5667 * we start and wait on commits. 5669 * we start and wait on commits.
5668 * 5670 *
5669 * Is this efficient/effective? Well, we're being nice to the system 5671 * Is this efficient/effective? Well, we're being nice to the system
5670 * by cleaning up our inodes proactively so they can be reaped 5672 * by cleaning up our inodes proactively so they can be reaped
5671 * without I/O. But we are potentially leaving up to five seconds' 5673 * without I/O. But we are potentially leaving up to five seconds'
5672 * worth of inodes floating about which prune_icache wants us to 5674 * worth of inodes floating about which prune_icache wants us to
5673 * write out. One way to fix that would be to get prune_icache() 5675 * write out. One way to fix that would be to get prune_icache()
5674 * to do a write_super() to free up some memory. It has the desired 5676 * to do a write_super() to free up some memory. It has the desired
5675 * effect. 5677 * effect.
5676 */ 5678 */
5677 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) 5679 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5678 { 5680 {
5679 struct ext4_iloc iloc; 5681 struct ext4_iloc iloc;
5680 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5682 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5681 static unsigned int mnt_count; 5683 static unsigned int mnt_count;
5682 int err, ret; 5684 int err, ret;
5683 5685
5684 might_sleep(); 5686 might_sleep();
5685 trace_ext4_mark_inode_dirty(inode, _RET_IP_); 5687 trace_ext4_mark_inode_dirty(inode, _RET_IP_);
5686 err = ext4_reserve_inode_write(handle, inode, &iloc); 5688 err = ext4_reserve_inode_write(handle, inode, &iloc);
5687 if (ext4_handle_valid(handle) && 5689 if (ext4_handle_valid(handle) &&
5688 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5690 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
5689 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { 5691 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
5690 /* 5692 /*
5691 * We need extra buffer credits since we may write into EA block 5693 * We need extra buffer credits since we may write into EA block
5692 * with this same handle. If journal_extend fails, then it will 5694 * with this same handle. If journal_extend fails, then it will
5693 * only result in a minor loss of functionality for that inode. 5695 * only result in a minor loss of functionality for that inode.
5694 * If this is felt to be critical, then e2fsck should be run to 5696 * If this is felt to be critical, then e2fsck should be run to
5695 * force a large enough s_min_extra_isize. 5697 * force a large enough s_min_extra_isize.
5696 */ 5698 */
5697 if ((jbd2_journal_extend(handle, 5699 if ((jbd2_journal_extend(handle,
5698 EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { 5700 EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
5699 ret = ext4_expand_extra_isize(inode, 5701 ret = ext4_expand_extra_isize(inode,
5700 sbi->s_want_extra_isize, 5702 sbi->s_want_extra_isize,
5701 iloc, handle); 5703 iloc, handle);
5702 if (ret) { 5704 if (ret) {
5703 ext4_set_inode_state(inode, 5705 ext4_set_inode_state(inode,
5704 EXT4_STATE_NO_EXPAND); 5706 EXT4_STATE_NO_EXPAND);
5705 if (mnt_count != 5707 if (mnt_count !=
5706 le16_to_cpu(sbi->s_es->s_mnt_count)) { 5708 le16_to_cpu(sbi->s_es->s_mnt_count)) {
5707 ext4_warning(inode->i_sb, 5709 ext4_warning(inode->i_sb,
5708 "Unable to expand inode %lu. Delete" 5710 "Unable to expand inode %lu. Delete"
5709 " some EAs or run e2fsck.", 5711 " some EAs or run e2fsck.",
5710 inode->i_ino); 5712 inode->i_ino);
5711 mnt_count = 5713 mnt_count =
5712 le16_to_cpu(sbi->s_es->s_mnt_count); 5714 le16_to_cpu(sbi->s_es->s_mnt_count);
5713 } 5715 }
5714 } 5716 }
5715 } 5717 }
5716 } 5718 }
5717 if (!err) 5719 if (!err)
5718 err = ext4_mark_iloc_dirty(handle, inode, &iloc); 5720 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
5719 return err; 5721 return err;
5720 } 5722 }
5721 5723
5722 /* 5724 /*
5723 * ext4_dirty_inode() is called from __mark_inode_dirty() 5725 * ext4_dirty_inode() is called from __mark_inode_dirty()
5724 * 5726 *
5725 * We're really interested in the case where a file is being extended. 5727 * We're really interested in the case where a file is being extended.
5726 * i_size has been changed by generic_commit_write() and we thus need 5728 * i_size has been changed by generic_commit_write() and we thus need
5727 * to include the updated inode in the current transaction. 5729 * to include the updated inode in the current transaction.
5728 * 5730 *
5729 * Also, dquot_alloc_block() will always dirty the inode when blocks 5731 * Also, dquot_alloc_block() will always dirty the inode when blocks
5730 * are allocated to the file. 5732 * are allocated to the file.
5731 * 5733 *
5732 * If the inode is marked synchronous, we don't honour that here - doing 5734 * If the inode is marked synchronous, we don't honour that here - doing
5733 * so would cause a commit on atime updates, which we don't bother doing. 5735 * so would cause a commit on atime updates, which we don't bother doing.
5734 * We handle synchronous inodes at the highest possible level. 5736 * We handle synchronous inodes at the highest possible level.
5735 */ 5737 */
5736 void ext4_dirty_inode(struct inode *inode, int flags) 5738 void ext4_dirty_inode(struct inode *inode, int flags)
5737 { 5739 {
5738 handle_t *handle; 5740 handle_t *handle;
5739 5741
5740 handle = ext4_journal_start(inode, 2); 5742 handle = ext4_journal_start(inode, 2);
5741 if (IS_ERR(handle)) 5743 if (IS_ERR(handle))
5742 goto out; 5744 goto out;
5743 5745
5744 ext4_mark_inode_dirty(handle, inode); 5746 ext4_mark_inode_dirty(handle, inode);
5745 5747
5746 ext4_journal_stop(handle); 5748 ext4_journal_stop(handle);
5747 out: 5749 out:
5748 return; 5750 return;
5749 } 5751 }
5750 5752
5751 #if 0 5753 #if 0
5752 /* 5754 /*
5753 * Bind an inode's backing buffer_head into this transaction, to prevent 5755 * Bind an inode's backing buffer_head into this transaction, to prevent
5754 * it from being flushed to disk early. Unlike 5756 * it from being flushed to disk early. Unlike
5755 * ext4_reserve_inode_write, this leaves behind no bh reference and 5757 * ext4_reserve_inode_write, this leaves behind no bh reference and
5756 * returns no iloc structure, so the caller needs to repeat the iloc 5758 * returns no iloc structure, so the caller needs to repeat the iloc
5757 * lookup to mark the inode dirty later. 5759 * lookup to mark the inode dirty later.
5758 */ 5760 */
5759 static int ext4_pin_inode(handle_t *handle, struct inode *inode) 5761 static int ext4_pin_inode(handle_t *handle, struct inode *inode)
5760 { 5762 {
5761 struct ext4_iloc iloc; 5763 struct ext4_iloc iloc;
5762 5764
5763 int err = 0; 5765 int err = 0;
5764 if (handle) { 5766 if (handle) {
5765 err = ext4_get_inode_loc(inode, &iloc); 5767 err = ext4_get_inode_loc(inode, &iloc);
5766 if (!err) { 5768 if (!err) {
5767 BUFFER_TRACE(iloc.bh, "get_write_access"); 5769 BUFFER_TRACE(iloc.bh, "get_write_access");
5768 err = jbd2_journal_get_write_access(handle, iloc.bh); 5770 err = jbd2_journal_get_write_access(handle, iloc.bh);
5769 if (!err) 5771 if (!err)
5770 err = ext4_handle_dirty_metadata(handle, 5772 err = ext4_handle_dirty_metadata(handle,
5771 NULL, 5773 NULL,
5772 iloc.bh); 5774 iloc.bh);
5773 brelse(iloc.bh); 5775 brelse(iloc.bh);
5774 } 5776 }
5775 } 5777 }
5776 ext4_std_error(inode->i_sb, err); 5778 ext4_std_error(inode->i_sb, err);
5777 return err; 5779 return err;
5778 } 5780 }
5779 #endif 5781 #endif
5780 5782
5781 int ext4_change_inode_journal_flag(struct inode *inode, int val) 5783 int ext4_change_inode_journal_flag(struct inode *inode, int val)
5782 { 5784 {
5783 journal_t *journal; 5785 journal_t *journal;
5784 handle_t *handle; 5786 handle_t *handle;
5785 int err; 5787 int err;
5786 5788
5787 /* 5789 /*
5788 * We have to be very careful here: changing a data block's 5790 * We have to be very careful here: changing a data block's
5789 * journaling status dynamically is dangerous. If we write a 5791 * journaling status dynamically is dangerous. If we write a
5790 * data block to the journal, change the status and then delete 5792 * data block to the journal, change the status and then delete
5791 * that block, we risk forgetting to revoke the old log record 5793 * that block, we risk forgetting to revoke the old log record
5792 * from the journal and so a subsequent replay can corrupt data. 5794 * from the journal and so a subsequent replay can corrupt data.
5793 * So, first we make sure that the journal is empty and that 5795 * So, first we make sure that the journal is empty and that
5794 * nobody is changing anything. 5796 * nobody is changing anything.
5795 */ 5797 */
5796 5798
5797 journal = EXT4_JOURNAL(inode); 5799 journal = EXT4_JOURNAL(inode);
5798 if (!journal) 5800 if (!journal)
5799 return 0; 5801 return 0;
5800 if (is_journal_aborted(journal)) 5802 if (is_journal_aborted(journal))
5801 return -EROFS; 5803 return -EROFS;
5802 5804
5803 jbd2_journal_lock_updates(journal); 5805 jbd2_journal_lock_updates(journal);
5804 jbd2_journal_flush(journal); 5806 jbd2_journal_flush(journal);
5805 5807
5806 /* 5808 /*
5807 * OK, there are no updates running now, and all cached data is 5809 * OK, there are no updates running now, and all cached data is
5808 * synced to disk. We are now in a completely consistent state 5810 * synced to disk. We are now in a completely consistent state
5809 * which doesn't have anything in the journal, and we know that 5811 * which doesn't have anything in the journal, and we know that
5810 * no filesystem updates are running, so it is safe to modify 5812 * no filesystem updates are running, so it is safe to modify
5811 * the inode's in-core data-journaling state flag now. 5813 * the inode's in-core data-journaling state flag now.
5812 */ 5814 */
5813 5815
5814 if (val) 5816 if (val)
5815 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 5817 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5816 else 5818 else
5817 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 5819 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5818 ext4_set_aops(inode); 5820 ext4_set_aops(inode);
5819 5821
5820 jbd2_journal_unlock_updates(journal); 5822 jbd2_journal_unlock_updates(journal);
5821 5823
5822 /* Finally we can mark the inode as dirty. */ 5824 /* Finally we can mark the inode as dirty. */
5823 5825
5824 handle = ext4_journal_start(inode, 1); 5826 handle = ext4_journal_start(inode, 1);
5825 if (IS_ERR(handle)) 5827 if (IS_ERR(handle))
5826 return PTR_ERR(handle); 5828 return PTR_ERR(handle);
5827 5829
5828 err = ext4_mark_inode_dirty(handle, inode); 5830 err = ext4_mark_inode_dirty(handle, inode);
5829 ext4_handle_sync(handle); 5831 ext4_handle_sync(handle);
5830 ext4_journal_stop(handle); 5832 ext4_journal_stop(handle);
5831 ext4_std_error(inode->i_sb, err); 5833 ext4_std_error(inode->i_sb, err);
5832 5834
5833 return err; 5835 return err;
5834 } 5836 }
5835 5837
5836 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) 5838 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
5837 { 5839 {
5838 return !buffer_mapped(bh); 5840 return !buffer_mapped(bh);
5839 } 5841 }
5840 5842
5841 int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 5843 int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5842 { 5844 {
5843 struct page *page = vmf->page; 5845 struct page *page = vmf->page;
5844 loff_t size; 5846 loff_t size;
5845 unsigned long len; 5847 unsigned long len;
5846 int ret; 5848 int ret;
5847 struct file *file = vma->vm_file; 5849 struct file *file = vma->vm_file;
5848 struct inode *inode = file->f_path.dentry->d_inode; 5850 struct inode *inode = file->f_path.dentry->d_inode;
5849 struct address_space *mapping = inode->i_mapping; 5851 struct address_space *mapping = inode->i_mapping;
5850 handle_t *handle; 5852 handle_t *handle;
5851 get_block_t *get_block; 5853 get_block_t *get_block;
5852 int retries = 0; 5854 int retries = 0;
5853 5855
5854 /* 5856 /*
5855 * This check is racy but catches the common case. We rely on 5857 * This check is racy but catches the common case. We rely on
5856 * __block_page_mkwrite() to do a reliable check. 5858 * __block_page_mkwrite() to do a reliable check.
5857 */ 5859 */
5858 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 5860 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
5859 /* Delalloc case is easy... */ 5861 /* Delalloc case is easy... */
5860 if (test_opt(inode->i_sb, DELALLOC) && 5862 if (test_opt(inode->i_sb, DELALLOC) &&
5861 !ext4_should_journal_data(inode) && 5863 !ext4_should_journal_data(inode) &&
5862 !ext4_nonda_switch(inode->i_sb)) { 5864 !ext4_nonda_switch(inode->i_sb)) {
5863 do { 5865 do {
5864 ret = __block_page_mkwrite(vma, vmf, 5866 ret = __block_page_mkwrite(vma, vmf,
5865 ext4_da_get_block_prep); 5867 ext4_da_get_block_prep);
5866 } while (ret == -ENOSPC && 5868 } while (ret == -ENOSPC &&
5867 ext4_should_retry_alloc(inode->i_sb, &retries)); 5869 ext4_should_retry_alloc(inode->i_sb, &retries));
5868 goto out_ret; 5870 goto out_ret;
5869 } 5871 }
5870 5872
5871 lock_page(page); 5873 lock_page(page);
5872 size = i_size_read(inode); 5874 size = i_size_read(inode);
5873 /* Page got truncated from under us? */ 5875 /* Page got truncated from under us? */
5874 if (page->mapping != mapping || page_offset(page) > size) { 5876 if (page->mapping != mapping || page_offset(page) > size) {
5875 unlock_page(page); 5877 unlock_page(page);
5876 ret = VM_FAULT_NOPAGE; 5878 ret = VM_FAULT_NOPAGE;
5877 goto out; 5879 goto out;
5878 } 5880 }
5879 5881
5880 if (page->index == size >> PAGE_CACHE_SHIFT) 5882 if (page->index == size >> PAGE_CACHE_SHIFT)
5881 len = size & ~PAGE_CACHE_MASK; 5883 len = size & ~PAGE_CACHE_MASK;
5882 else 5884 else
5883 len = PAGE_CACHE_SIZE; 5885 len = PAGE_CACHE_SIZE;
5884 /* 5886 /*
5885 * Return if we have all the buffers mapped. This avoids the need to do 5887 * Return if we have all the buffers mapped. This avoids the need to do
5886 * journal_start/journal_stop which can block and take a long time 5888 * journal_start/journal_stop which can block and take a long time
5887 */ 5889 */
5888 if (page_has_buffers(page)) { 5890 if (page_has_buffers(page)) {
5889 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 5891 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
5890 ext4_bh_unmapped)) { 5892 ext4_bh_unmapped)) {
5891 /* Wait so that we don't change page under IO */ 5893 /* Wait so that we don't change page under IO */
5892 wait_on_page_writeback(page); 5894 wait_on_page_writeback(page);
5893 ret = VM_FAULT_LOCKED; 5895 ret = VM_FAULT_LOCKED;
5894 goto out; 5896 goto out;
5895 } 5897 }
5896 } 5898 }
5897 unlock_page(page); 5899 unlock_page(page);
5898 /* OK, we need to fill the hole... */ 5900 /* OK, we need to fill the hole... */
5899 if (ext4_should_dioread_nolock(inode)) 5901 if (ext4_should_dioread_nolock(inode))
5900 get_block = ext4_get_block_write; 5902 get_block = ext4_get_block_write;
5901 else 5903 else
5902 get_block = ext4_get_block; 5904 get_block = ext4_get_block;
5903 retry_alloc: 5905 retry_alloc:
5904 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 5906 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
5905 if (IS_ERR(handle)) { 5907 if (IS_ERR(handle)) {
5906 ret = VM_FAULT_SIGBUS; 5908 ret = VM_FAULT_SIGBUS;
5907 goto out; 5909 goto out;
5908 } 5910 }
5909 ret = __block_page_mkwrite(vma, vmf, get_block); 5911 ret = __block_page_mkwrite(vma, vmf, get_block);
5910 if (!ret && ext4_should_journal_data(inode)) { 5912 if (!ret && ext4_should_journal_data(inode)) {
5911 if (walk_page_buffers(handle, page_buffers(page), 0, 5913 if (walk_page_buffers(handle, page_buffers(page), 0,
5912 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { 5914 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
5913 unlock_page(page); 5915 unlock_page(page);
5914 ret = VM_FAULT_SIGBUS; 5916 ret = VM_FAULT_SIGBUS;
5915 goto out; 5917 goto out;
5916 } 5918 }
5917 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 5919 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
5918 } 5920 }
5919 ext4_journal_stop(handle); 5921 ext4_journal_stop(handle);
5920 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 5922 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
5921 goto retry_alloc; 5923 goto retry_alloc;
5922 out_ret: 5924 out_ret:
5923 ret = block_page_mkwrite_return(ret); 5925 ret = block_page_mkwrite_return(ret);
5924 out: 5926 out:
5925 return ret; 5927 return ret;
5926 } 5928 }
5927 5929
1 /* 1 /*
2 * linux/fs/fat/file.c 2 * linux/fs/fat/file.c
3 * 3 *
4 * Written 1992,1993 by Werner Almesberger 4 * Written 1992,1993 by Werner Almesberger
5 * 5 *
6 * regular file handling primitives for fat-based filesystems 6 * regular file handling primitives for fat-based filesystems
7 */ 7 */
8 8
9 #include <linux/capability.h> 9 #include <linux/capability.h>
10 #include <linux/module.h> 10 #include <linux/module.h>
11 #include <linux/compat.h> 11 #include <linux/compat.h>
12 #include <linux/mount.h> 12 #include <linux/mount.h>
13 #include <linux/time.h> 13 #include <linux/time.h>
14 #include <linux/buffer_head.h> 14 #include <linux/buffer_head.h>
15 #include <linux/writeback.h> 15 #include <linux/writeback.h>
16 #include <linux/backing-dev.h> 16 #include <linux/backing-dev.h>
17 #include <linux/blkdev.h> 17 #include <linux/blkdev.h>
18 #include <linux/fsnotify.h> 18 #include <linux/fsnotify.h>
19 #include <linux/security.h> 19 #include <linux/security.h>
20 #include "fat.h" 20 #include "fat.h"
21 21
22 static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) 22 static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
23 { 23 {
24 u32 attr; 24 u32 attr;
25 25
26 mutex_lock(&inode->i_mutex); 26 mutex_lock(&inode->i_mutex);
27 attr = fat_make_attrs(inode); 27 attr = fat_make_attrs(inode);
28 mutex_unlock(&inode->i_mutex); 28 mutex_unlock(&inode->i_mutex);
29 29
30 return put_user(attr, user_attr); 30 return put_user(attr, user_attr);
31 } 31 }
32 32
33 static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) 33 static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
34 { 34 {
35 struct inode *inode = file->f_path.dentry->d_inode; 35 struct inode *inode = file->f_path.dentry->d_inode;
36 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); 36 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
37 int is_dir = S_ISDIR(inode->i_mode); 37 int is_dir = S_ISDIR(inode->i_mode);
38 u32 attr, oldattr; 38 u32 attr, oldattr;
39 struct iattr ia; 39 struct iattr ia;
40 int err; 40 int err;
41 41
42 err = get_user(attr, user_attr); 42 err = get_user(attr, user_attr);
43 if (err) 43 if (err)
44 goto out; 44 goto out;
45 45
46 mutex_lock(&inode->i_mutex); 46 mutex_lock(&inode->i_mutex);
47 err = mnt_want_write(file->f_path.mnt); 47 err = mnt_want_write(file->f_path.mnt);
48 if (err) 48 if (err)
49 goto out_unlock_inode; 49 goto out_unlock_inode;
50 50
51 /* 51 /*
52 * ATTR_VOLUME and ATTR_DIR cannot be changed; this also 52 * ATTR_VOLUME and ATTR_DIR cannot be changed; this also
53 * prevents the user from turning us into a VFAT 53 * prevents the user from turning us into a VFAT
54 * longname entry. Also, we obviously can't set 54 * longname entry. Also, we obviously can't set
55 * any of the NTFS attributes in the high 24 bits. 55 * any of the NTFS attributes in the high 24 bits.
56 */ 56 */
57 attr &= 0xff & ~(ATTR_VOLUME | ATTR_DIR); 57 attr &= 0xff & ~(ATTR_VOLUME | ATTR_DIR);
58 /* Merge in ATTR_VOLUME and ATTR_DIR */ 58 /* Merge in ATTR_VOLUME and ATTR_DIR */
59 attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) | 59 attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) |
60 (is_dir ? ATTR_DIR : 0); 60 (is_dir ? ATTR_DIR : 0);
61 oldattr = fat_make_attrs(inode); 61 oldattr = fat_make_attrs(inode);
62 62
63 /* Equivalent to a chmod() */ 63 /* Equivalent to a chmod() */
64 ia.ia_valid = ATTR_MODE | ATTR_CTIME; 64 ia.ia_valid = ATTR_MODE | ATTR_CTIME;
65 ia.ia_ctime = current_fs_time(inode->i_sb); 65 ia.ia_ctime = current_fs_time(inode->i_sb);
66 if (is_dir) 66 if (is_dir)
67 ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO); 67 ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO);
68 else { 68 else {
69 ia.ia_mode = fat_make_mode(sbi, attr, 69 ia.ia_mode = fat_make_mode(sbi, attr,
70 S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO)); 70 S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO));
71 } 71 }
72 72
73 /* The root directory has no attributes */ 73 /* The root directory has no attributes */
74 if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) { 74 if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) {
75 err = -EINVAL; 75 err = -EINVAL;
76 goto out_drop_write; 76 goto out_drop_write;
77 } 77 }
78 78
79 if (sbi->options.sys_immutable && 79 if (sbi->options.sys_immutable &&
80 ((attr | oldattr) & ATTR_SYS) && 80 ((attr | oldattr) & ATTR_SYS) &&
81 !capable(CAP_LINUX_IMMUTABLE)) { 81 !capable(CAP_LINUX_IMMUTABLE)) {
82 err = -EPERM; 82 err = -EPERM;
83 goto out_drop_write; 83 goto out_drop_write;
84 } 84 }
85 85
86 /* 86 /*
87 * The security check is questionable... We single 87 * The security check is questionable... We single
88 * out the RO attribute for checking by the security 88 * out the RO attribute for checking by the security
89 * module, just because it maps to a file mode. 89 * module, just because it maps to a file mode.
90 */ 90 */
91 err = security_inode_setattr(file->f_path.dentry, &ia); 91 err = security_inode_setattr(file->f_path.dentry, &ia);
92 if (err) 92 if (err)
93 goto out_drop_write; 93 goto out_drop_write;
94 94
95 /* This MUST be done before doing anything irreversible... */ 95 /* This MUST be done before doing anything irreversible... */
96 err = fat_setattr(file->f_path.dentry, &ia); 96 err = fat_setattr(file->f_path.dentry, &ia);
97 if (err) 97 if (err)
98 goto out_drop_write; 98 goto out_drop_write;
99 99
100 fsnotify_change(file->f_path.dentry, ia.ia_valid); 100 fsnotify_change(file->f_path.dentry, ia.ia_valid);
101 if (sbi->options.sys_immutable) { 101 if (sbi->options.sys_immutable) {
102 if (attr & ATTR_SYS) 102 if (attr & ATTR_SYS)
103 inode->i_flags |= S_IMMUTABLE; 103 inode->i_flags |= S_IMMUTABLE;
104 else 104 else
105 inode->i_flags &= ~S_IMMUTABLE; 105 inode->i_flags &= ~S_IMMUTABLE;
106 } 106 }
107 107
108 fat_save_attrs(inode, attr); 108 fat_save_attrs(inode, attr);
109 mark_inode_dirty(inode); 109 mark_inode_dirty(inode);
110 out_drop_write: 110 out_drop_write:
111 mnt_drop_write(file->f_path.mnt); 111 mnt_drop_write(file->f_path.mnt);
112 out_unlock_inode: 112 out_unlock_inode:
113 mutex_unlock(&inode->i_mutex); 113 mutex_unlock(&inode->i_mutex);
114 out: 114 out:
115 return err; 115 return err;
116 } 116 }
117 117
118 long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 118 long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
119 { 119 {
120 struct inode *inode = filp->f_path.dentry->d_inode; 120 struct inode *inode = filp->f_path.dentry->d_inode;
121 u32 __user *user_attr = (u32 __user *)arg; 121 u32 __user *user_attr = (u32 __user *)arg;
122 122
123 switch (cmd) { 123 switch (cmd) {
124 case FAT_IOCTL_GET_ATTRIBUTES: 124 case FAT_IOCTL_GET_ATTRIBUTES:
125 return fat_ioctl_get_attributes(inode, user_attr); 125 return fat_ioctl_get_attributes(inode, user_attr);
126 case FAT_IOCTL_SET_ATTRIBUTES: 126 case FAT_IOCTL_SET_ATTRIBUTES:
127 return fat_ioctl_set_attributes(filp, user_attr); 127 return fat_ioctl_set_attributes(filp, user_attr);
128 default: 128 default:
129 return -ENOTTY; /* Inappropriate ioctl for device */ 129 return -ENOTTY; /* Inappropriate ioctl for device */
130 } 130 }
131 } 131 }
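
The two ioctls dispatched above are reachable directly from userspace; a minimal sketch (the mount path is hypothetical, while the ioctl numbers and ATTR_RO come from <linux/msdos_fs.h>):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/msdos_fs.h>

int main(void)
{
	__u32 attr;
	int fd = open("/mnt/fat/file", O_RDONLY);	/* hypothetical path */

	if (fd < 0)
		return 1;
	if (ioctl(fd, FAT_IOCTL_GET_ATTRIBUTES, &attr) == 0) {
		attr |= ATTR_RO;	/* set the DOS read-only bit */
		ioctl(fd, FAT_IOCTL_SET_ATTRIBUTES, &attr);
	}
	close(fd);
	return 0;
}
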
132 132
133 #ifdef CONFIG_COMPAT 133 #ifdef CONFIG_COMPAT
134 static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd, 134 static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd,
135 unsigned long arg) 135 unsigned long arg)
136 136
137 { 137 {
138 return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); 138 return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
139 } 139 }
140 #endif 140 #endif
141 141
142 static int fat_file_release(struct inode *inode, struct file *filp) 142 static int fat_file_release(struct inode *inode, struct file *filp)
143 { 143 {
144 if ((filp->f_mode & FMODE_WRITE) && 144 if ((filp->f_mode & FMODE_WRITE) &&
145 MSDOS_SB(inode->i_sb)->options.flush) { 145 MSDOS_SB(inode->i_sb)->options.flush) {
146 fat_flush_inodes(inode->i_sb, inode, NULL); 146 fat_flush_inodes(inode->i_sb, inode, NULL);
147 congestion_wait(BLK_RW_ASYNC, HZ/10); 147 congestion_wait(BLK_RW_ASYNC, HZ/10);
148 } 148 }
149 return 0; 149 return 0;
150 } 150 }
151 151
152 int fat_file_fsync(struct file *filp, int datasync) 152 int fat_file_fsync(struct file *filp, int datasync)
153 { 153 {
154 struct inode *inode = filp->f_mapping->host; 154 struct inode *inode = filp->f_mapping->host;
155 int res, err; 155 int res, err;
156 156
157 res = generic_file_fsync(filp, datasync); 157 res = generic_file_fsync(filp, datasync);
158 err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); 158 err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
159 159
160 return res ? res : err; 160 return res ? res : err;
161 } 161 }
162 162
163 163
164 const struct file_operations fat_file_operations = { 164 const struct file_operations fat_file_operations = {
165 .llseek = generic_file_llseek, 165 .llseek = generic_file_llseek,
166 .read = do_sync_read, 166 .read = do_sync_read,
167 .write = do_sync_write, 167 .write = do_sync_write,
168 .aio_read = generic_file_aio_read, 168 .aio_read = generic_file_aio_read,
169 .aio_write = generic_file_aio_write, 169 .aio_write = generic_file_aio_write,
170 .mmap = generic_file_mmap, 170 .mmap = generic_file_mmap,
171 .release = fat_file_release, 171 .release = fat_file_release,
172 .unlocked_ioctl = fat_generic_ioctl, 172 .unlocked_ioctl = fat_generic_ioctl,
173 #ifdef CONFIG_COMPAT 173 #ifdef CONFIG_COMPAT
174 .compat_ioctl = fat_generic_compat_ioctl, 174 .compat_ioctl = fat_generic_compat_ioctl,
175 #endif 175 #endif
176 .fsync = fat_file_fsync, 176 .fsync = fat_file_fsync,
177 .splice_read = generic_file_splice_read, 177 .splice_read = generic_file_splice_read,
178 }; 178 };
179 179
180 static int fat_cont_expand(struct inode *inode, loff_t size) 180 static int fat_cont_expand(struct inode *inode, loff_t size)
181 { 181 {
182 struct address_space *mapping = inode->i_mapping; 182 struct address_space *mapping = inode->i_mapping;
183 loff_t start = inode->i_size, count = size - inode->i_size; 183 loff_t start = inode->i_size, count = size - inode->i_size;
184 int err; 184 int err;
185 185
186 err = generic_cont_expand_simple(inode, size); 186 err = generic_cont_expand_simple(inode, size);
187 if (err) 187 if (err)
188 goto out; 188 goto out;
189 189
190 inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; 190 inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
191 mark_inode_dirty(inode); 191 mark_inode_dirty(inode);
192 if (IS_SYNC(inode)) { 192 if (IS_SYNC(inode)) {
193 int err2; 193 int err2;
194 194
195 /* 195 /*
196 * Open-code the syncing since we don't have a file open to use 196 * Open-code the syncing since we don't have a file open to use
197 * the standard fsync path. 197 * the standard fsync path.
198 */ 198 */
199 err = filemap_fdatawrite_range(mapping, start, 199 err = filemap_fdatawrite_range(mapping, start,
200 start + count - 1); 200 start + count - 1);
201 err2 = sync_mapping_buffers(mapping); 201 err2 = sync_mapping_buffers(mapping);
202 if (!err) 202 if (!err)
203 err = err2; 203 err = err2;
204 err2 = write_inode_now(inode, 1); 204 err2 = write_inode_now(inode, 1);
205 if (!err) 205 if (!err)
206 err = err2; 206 err = err2;
207 if (!err) { 207 if (!err) {
208 err = filemap_fdatawait_range(mapping, start, 208 err = filemap_fdatawait_range(mapping, start,
209 start + count - 1); 209 start + count - 1);
210 } 210 }
211 } 211 }
212 out: 212 out:
213 return err; 213 return err;
214 } 214 }
215 215
216 /* Free all clusters after the skip'th cluster. */ 216 /* Free all clusters after the skip'th cluster. */
217 static int fat_free(struct inode *inode, int skip) 217 static int fat_free(struct inode *inode, int skip)
218 { 218 {
219 struct super_block *sb = inode->i_sb; 219 struct super_block *sb = inode->i_sb;
220 int err, wait, free_start, i_start, i_logstart; 220 int err, wait, free_start, i_start, i_logstart;
221 221
222 if (MSDOS_I(inode)->i_start == 0) 222 if (MSDOS_I(inode)->i_start == 0)
223 return 0; 223 return 0;
224 224
225 fat_cache_inval_inode(inode); 225 fat_cache_inval_inode(inode);
226 226
227 wait = IS_DIRSYNC(inode); 227 wait = IS_DIRSYNC(inode);
228 i_start = free_start = MSDOS_I(inode)->i_start; 228 i_start = free_start = MSDOS_I(inode)->i_start;
229 i_logstart = MSDOS_I(inode)->i_logstart; 229 i_logstart = MSDOS_I(inode)->i_logstart;
230 230
231 /* First, we write the new file size. */ 231 /* First, we write the new file size. */
232 if (!skip) { 232 if (!skip) {
233 MSDOS_I(inode)->i_start = 0; 233 MSDOS_I(inode)->i_start = 0;
234 MSDOS_I(inode)->i_logstart = 0; 234 MSDOS_I(inode)->i_logstart = 0;
235 } 235 }
236 MSDOS_I(inode)->i_attrs |= ATTR_ARCH; 236 MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
237 inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; 237 inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
238 if (wait) { 238 if (wait) {
239 err = fat_sync_inode(inode); 239 err = fat_sync_inode(inode);
240 if (err) { 240 if (err) {
241 MSDOS_I(inode)->i_start = i_start; 241 MSDOS_I(inode)->i_start = i_start;
242 MSDOS_I(inode)->i_logstart = i_logstart; 242 MSDOS_I(inode)->i_logstart = i_logstart;
243 return err; 243 return err;
244 } 244 }
245 } else 245 } else
246 mark_inode_dirty(inode); 246 mark_inode_dirty(inode);
247 247
248 /* Write a new EOF, and get the remaining cluster chain for freeing. */ 248 /* Write a new EOF, and get the remaining cluster chain for freeing. */
249 if (skip) { 249 if (skip) {
250 struct fat_entry fatent; 250 struct fat_entry fatent;
251 int ret, fclus, dclus; 251 int ret, fclus, dclus;
252 252
253 ret = fat_get_cluster(inode, skip - 1, &fclus, &dclus); 253 ret = fat_get_cluster(inode, skip - 1, &fclus, &dclus);
254 if (ret < 0) 254 if (ret < 0)
255 return ret; 255 return ret;
256 else if (ret == FAT_ENT_EOF) 256 else if (ret == FAT_ENT_EOF)
257 return 0; 257 return 0;
258 258
259 fatent_init(&fatent); 259 fatent_init(&fatent);
260 ret = fat_ent_read(inode, &fatent, dclus); 260 ret = fat_ent_read(inode, &fatent, dclus);
261 if (ret == FAT_ENT_EOF) { 261 if (ret == FAT_ENT_EOF) {
262 fatent_brelse(&fatent); 262 fatent_brelse(&fatent);
263 return 0; 263 return 0;
264 } else if (ret == FAT_ENT_FREE) { 264 } else if (ret == FAT_ENT_FREE) {
265 fat_fs_error(sb, 265 fat_fs_error(sb,
266 "%s: invalid cluster chain (i_pos %lld)", 266 "%s: invalid cluster chain (i_pos %lld)",
267 __func__, MSDOS_I(inode)->i_pos); 267 __func__, MSDOS_I(inode)->i_pos);
268 ret = -EIO; 268 ret = -EIO;
269 } else if (ret > 0) { 269 } else if (ret > 0) {
270 err = fat_ent_write(inode, &fatent, FAT_ENT_EOF, wait); 270 err = fat_ent_write(inode, &fatent, FAT_ENT_EOF, wait);
271 if (err) 271 if (err)
272 ret = err; 272 ret = err;
273 } 273 }
274 fatent_brelse(&fatent); 274 fatent_brelse(&fatent);
275 if (ret < 0) 275 if (ret < 0)
276 return ret; 276 return ret;
277 277
278 free_start = ret; 278 free_start = ret;
279 } 279 }
280 inode->i_blocks = skip << (MSDOS_SB(sb)->cluster_bits - 9); 280 inode->i_blocks = skip << (MSDOS_SB(sb)->cluster_bits - 9);
281 281
282 /* Free the remaining cluster chain. */ 282 /* Free the remaining cluster chain. */
283 return fat_free_clusters(inode, free_start); 283 return fat_free_clusters(inode, free_start);
284 } 284 }
285 285
286 void fat_truncate_blocks(struct inode *inode, loff_t offset) 286 void fat_truncate_blocks(struct inode *inode, loff_t offset)
287 { 287 {
288 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); 288 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
289 const unsigned int cluster_size = sbi->cluster_size; 289 const unsigned int cluster_size = sbi->cluster_size;
290 int nr_clusters; 290 int nr_clusters;
291 291
292 /* 292 /*
293 * This protects against truncating a file bigger than it was and 293 * This protects against truncating a file bigger than it was and
294 * then trying to write into the hole. 294 * then trying to write into the hole.
295 */ 295 */
296 if (MSDOS_I(inode)->mmu_private > offset) 296 if (MSDOS_I(inode)->mmu_private > offset)
297 MSDOS_I(inode)->mmu_private = offset; 297 MSDOS_I(inode)->mmu_private = offset;
298 298
299 nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits; 299 nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits;
300 300
301 fat_free(inode, nr_clusters); 301 fat_free(inode, nr_clusters);
302 fat_flush_inodes(inode->i_sb, inode, NULL); 302 fat_flush_inodes(inode->i_sb, inode, NULL);
303 } 303 }
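
The nr_clusters computation above rounds the new size up to a whole number of clusters before anything past it is freed. A standalone sketch of the same arithmetic, assuming 4 KiB clusters (cluster_bits = 12); the values are illustrative only:

#include <stdio.h>

int main(void)
{
	unsigned int cluster_bits = 12;			/* assumed: 4 KiB clusters */
	unsigned long long cluster_size = 1ULL << cluster_bits;
	unsigned long long offset = 5000;		/* hypothetical new size */
	int nr_clusters = (offset + (cluster_size - 1)) >> cluster_bits;

	/* 5000 bytes still occupy two 4096-byte clusters. */
	printf("keep %d cluster(s)\n", nr_clusters);
	return 0;
}
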
304 304
305 int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) 305 int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
306 { 306 {
307 struct inode *inode = dentry->d_inode; 307 struct inode *inode = dentry->d_inode;
308 generic_fillattr(inode, stat); 308 generic_fillattr(inode, stat);
309 stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size; 309 stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size;
310 return 0; 310 return 0;
311 } 311 }
312 EXPORT_SYMBOL_GPL(fat_getattr); 312 EXPORT_SYMBOL_GPL(fat_getattr);
313 313
314 static int fat_sanitize_mode(const struct msdos_sb_info *sbi, 314 static int fat_sanitize_mode(const struct msdos_sb_info *sbi,
315 struct inode *inode, umode_t *mode_ptr) 315 struct inode *inode, umode_t *mode_ptr)
316 { 316 {
317 mode_t mask, perm; 317 mode_t mask, perm;
318 318
319 /* 319 /*
320 * Note: the basic (attr->ia_mode & ~FAT_VALID_MODE) check is 320 * Note: the basic (attr->ia_mode & ~FAT_VALID_MODE) check is
321 * already done by the caller. 321 * already done by the caller.
322 */ 322 */
323 323
324 if (S_ISREG(inode->i_mode)) 324 if (S_ISREG(inode->i_mode))
325 mask = sbi->options.fs_fmask; 325 mask = sbi->options.fs_fmask;
326 else 326 else
327 mask = sbi->options.fs_dmask; 327 mask = sbi->options.fs_dmask;
328 328
329 perm = *mode_ptr & ~(S_IFMT | mask); 329 perm = *mode_ptr & ~(S_IFMT | mask);
330 330
331 /* 331 /*
332 * Of the r and x bits, all (subject to umask) must be present. Of the 332 * Of the r and x bits, all (subject to umask) must be present. Of the
333 * w bits, either all (subject to umask) or none must be present. 333 * w bits, either all (subject to umask) or none must be present.
334 * 334 *
335 * If fat_mode_can_hold_ro(inode) is false, can't change w bits. 335 * If fat_mode_can_hold_ro(inode) is false, can't change w bits.
336 */ 336 */
337 if ((perm & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO))) 337 if ((perm & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO)))
338 return -EPERM; 338 return -EPERM;
339 if (fat_mode_can_hold_ro(inode)) { 339 if (fat_mode_can_hold_ro(inode)) {
340 if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask))) 340 if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask)))
341 return -EPERM; 341 return -EPERM;
342 } else { 342 } else {
343 if ((perm & S_IWUGO) != (S_IWUGO & ~mask)) 343 if ((perm & S_IWUGO) != (S_IWUGO & ~mask))
344 return -EPERM; 344 return -EPERM;
345 } 345 }
346 346
347 *mode_ptr &= S_IFMT | perm; 347 *mode_ptr &= S_IFMT | perm;
348 348
349 return 0; 349 return 0;
350 } 350 }
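
To make the all-or-none rule above concrete, here is a userspace re-statement of the same checks (S_IRUGO and friends are redefined because they are kernel-only macros). With no umask, FAT can only store "writable by everyone" or "read-only for everyone", so a mode with partial w bits is rejected:

#include <stdio.h>
#include <sys/stat.h>

#define S_IRUGO 0444
#define S_IWUGO 0222
#define S_IXUGO 0111

/* The same checks as fat_sanitize_mode(), minus the inode plumbing. */
static int sanitize(mode_t cur, mode_t req, mode_t mask, int can_hold_ro)
{
	mode_t perm = req & ~(S_IFMT | mask);

	/* r and x bits must match what the inode already has. */
	if ((perm & (S_IRUGO | S_IXUGO)) != (cur & (S_IRUGO | S_IXUGO)))
		return -1;
	if (can_hold_ro) {
		/* w bits: either none, or all of them (subject to umask). */
		if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask)))
			return -1;
	} else {
		if ((perm & S_IWUGO) != (S_IWUGO & ~mask))
			return -1;
	}
	return 0;
}

int main(void)
{
	mode_t mask = 0, cur = 0777;

	/* chmod 0777 -> ok; chmod 0555 -> ok (no w bits at all);
	 * chmod 0755 -> rejected (partial w bits); prints "0 0 -1". */
	printf("%d %d %d\n", sanitize(cur, 0777, mask, 1),
	       sanitize(cur, 0555, mask, 1), sanitize(cur, 0755, mask, 1));
	return 0;
}
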
351 351
352 static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode) 352 static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
353 { 353 {
354 mode_t allow_utime = sbi->options.allow_utime; 354 mode_t allow_utime = sbi->options.allow_utime;
355 355
356 if (current_fsuid() != inode->i_uid) { 356 if (current_fsuid() != inode->i_uid) {
357 if (in_group_p(inode->i_gid)) 357 if (in_group_p(inode->i_gid))
358 allow_utime >>= 3; 358 allow_utime >>= 3;
359 if (allow_utime & MAY_WRITE) 359 if (allow_utime & MAY_WRITE)
360 return 1; 360 return 1;
361 } 361 }
362 362
363 /* use a default check */ 363 /* use a default check */
364 return 0; 364 return 0;
365 } 365 }
366 366
367 #define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) 367 #define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
368 /* valid file mode bits */ 368 /* valid file mode bits */
369 #define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO) 369 #define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO)
370 370
371 int fat_setattr(struct dentry *dentry, struct iattr *attr) 371 int fat_setattr(struct dentry *dentry, struct iattr *attr)
372 { 372 {
373 struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); 373 struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
374 struct inode *inode = dentry->d_inode; 374 struct inode *inode = dentry->d_inode;
375 unsigned int ia_valid; 375 unsigned int ia_valid;
376 int error; 376 int error;
377 377
378 /* Check for setting the inode time. */ 378 /* Check for setting the inode time. */
379 ia_valid = attr->ia_valid; 379 ia_valid = attr->ia_valid;
380 if (ia_valid & TIMES_SET_FLAGS) { 380 if (ia_valid & TIMES_SET_FLAGS) {
381 if (fat_allow_set_time(sbi, inode)) 381 if (fat_allow_set_time(sbi, inode))
382 attr->ia_valid &= ~TIMES_SET_FLAGS; 382 attr->ia_valid &= ~TIMES_SET_FLAGS;
383 } 383 }
384 384
385 error = inode_change_ok(inode, attr); 385 error = inode_change_ok(inode, attr);
386 attr->ia_valid = ia_valid; 386 attr->ia_valid = ia_valid;
387 if (error) { 387 if (error) {
388 if (sbi->options.quiet) 388 if (sbi->options.quiet)
389 error = 0; 389 error = 0;
390 goto out; 390 goto out;
391 } 391 }
392 392
393 /* 393 /*
394 * Expand the file. inode_setattr() updates ->i_size before 394 * Expand the file. inode_setattr() updates ->i_size before
395 * calling ->truncate(), but FAT needs to fill the hole before 395 * calling ->truncate(), but FAT needs to fill the hole before
396 * that. XXX: this is no longer true with the new truncate 396 * that. XXX: this is no longer true with the new truncate
397 * sequence. 397 * sequence.
398 */ 398 */
399 if (attr->ia_valid & ATTR_SIZE) { 399 if (attr->ia_valid & ATTR_SIZE) {
400 inode_dio_wait(inode);
401
400 if (attr->ia_size > inode->i_size) { 402 if (attr->ia_size > inode->i_size) {
401 error = fat_cont_expand(inode, attr->ia_size); 403 error = fat_cont_expand(inode, attr->ia_size);
402 if (error || attr->ia_valid == ATTR_SIZE) 404 if (error || attr->ia_valid == ATTR_SIZE)
403 goto out; 405 goto out;
404 attr->ia_valid &= ~ATTR_SIZE; 406 attr->ia_valid &= ~ATTR_SIZE;
405 } 407 }
406 } 408 }
407 409
408 if (((attr->ia_valid & ATTR_UID) && 410 if (((attr->ia_valid & ATTR_UID) &&
409 (attr->ia_uid != sbi->options.fs_uid)) || 411 (attr->ia_uid != sbi->options.fs_uid)) ||
410 ((attr->ia_valid & ATTR_GID) && 412 ((attr->ia_valid & ATTR_GID) &&
411 (attr->ia_gid != sbi->options.fs_gid)) || 413 (attr->ia_gid != sbi->options.fs_gid)) ||
412 ((attr->ia_valid & ATTR_MODE) && 414 ((attr->ia_valid & ATTR_MODE) &&
413 (attr->ia_mode & ~FAT_VALID_MODE))) 415 (attr->ia_mode & ~FAT_VALID_MODE)))
414 error = -EPERM; 416 error = -EPERM;
415 417
416 if (error) { 418 if (error) {
417 if (sbi->options.quiet) 419 if (sbi->options.quiet)
418 error = 0; 420 error = 0;
419 goto out; 421 goto out;
420 } 422 }
421 423
422 /* 424 /*
423 * We don't return -EPERM here. Yes, strange, but this behavior 425 * We don't return -EPERM here. Yes, strange, but this behavior
424 * is too old to change. 426 * is too old to change.
425 */ 427 */
426 if (attr->ia_valid & ATTR_MODE) { 428 if (attr->ia_valid & ATTR_MODE) {
427 if (fat_sanitize_mode(sbi, inode, &attr->ia_mode) < 0) 429 if (fat_sanitize_mode(sbi, inode, &attr->ia_mode) < 0)
428 attr->ia_valid &= ~ATTR_MODE; 430 attr->ia_valid &= ~ATTR_MODE;
429 } 431 }
430 432
431 if (attr->ia_valid & ATTR_SIZE) { 433 if (attr->ia_valid & ATTR_SIZE) {
432 down_write(&MSDOS_I(inode)->truncate_lock); 434 down_write(&MSDOS_I(inode)->truncate_lock);
433 truncate_setsize(inode, attr->ia_size); 435 truncate_setsize(inode, attr->ia_size);
434 fat_truncate_blocks(inode, attr->ia_size); 436 fat_truncate_blocks(inode, attr->ia_size);
435 up_write(&MSDOS_I(inode)->truncate_lock); 437 up_write(&MSDOS_I(inode)->truncate_lock);
436 } 438 }
437 439
438 setattr_copy(inode, attr); 440 setattr_copy(inode, attr);
439 mark_inode_dirty(inode); 441 mark_inode_dirty(inode);
440 out: 442 out:
441 return error; 443 return error;
442 } 444 }
443 EXPORT_SYMBOL_GPL(fat_setattr); 445 EXPORT_SYMBOL_GPL(fat_setattr);
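
The inode_dio_wait() call added above is the pattern this commit establishes: each ->setattr drains in-flight direct I/O itself, after the lock that prevents new dio references from appearing is already held (for FAT, i_mutex taken by the VFS caller). A minimal sketch of that shape, with a hypothetical example_setattr; this is not code from any filesystem in this commit:

static int example_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if (attr->ia_valid & ATTR_SIZE) {
		/* i_mutex is already held, so no new dio can start;
		 * wait out the requests already in flight. */
		inode_dio_wait(inode);
		truncate_setsize(inode, attr->ia_size);
	}

	setattr_copy(inode, attr);
	mark_inode_dirty(inode);
	return 0;
}
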
444 446
445 const struct inode_operations fat_file_inode_operations = { 447 const struct inode_operations fat_file_inode_operations = {
446 .setattr = fat_setattr, 448 .setattr = fat_setattr,
447 .getattr = fat_getattr, 449 .getattr = fat_getattr,
448 }; 450 };
449 451
1 /* 1 /*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10 #include <linux/spinlock.h> 10 #include <linux/spinlock.h>
11 #include <linux/completion.h> 11 #include <linux/completion.h>
12 #include <linux/buffer_head.h> 12 #include <linux/buffer_head.h>
13 #include <linux/gfs2_ondisk.h> 13 #include <linux/gfs2_ondisk.h>
14 #include <linux/crc32.h> 14 #include <linux/crc32.h>
15 15
16 #include "gfs2.h" 16 #include "gfs2.h"
17 #include "incore.h" 17 #include "incore.h"
18 #include "bmap.h" 18 #include "bmap.h"
19 #include "glock.h" 19 #include "glock.h"
20 #include "inode.h" 20 #include "inode.h"
21 #include "meta_io.h" 21 #include "meta_io.h"
22 #include "quota.h" 22 #include "quota.h"
23 #include "rgrp.h" 23 #include "rgrp.h"
24 #include "super.h" 24 #include "super.h"
25 #include "trans.h" 25 #include "trans.h"
26 #include "dir.h" 26 #include "dir.h"
27 #include "util.h" 27 #include "util.h"
28 #include "trace_gfs2.h" 28 #include "trace_gfs2.h"
29 29
30 /* This doesn't need to be that large, as the maximum number of 64-bit 30 /* This doesn't need to be that large, as the maximum number of 64-bit
31 * pointers in a 4k block is 512, so a __u16 is fine. It saves stack space to 31 * pointers in a 4k block is 512, so a __u16 is fine. It saves stack space to
32 * keep it small. 32 * keep it small.
33 */ 33 */
34 struct metapath { 34 struct metapath {
35 struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT]; 35 struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
36 __u16 mp_list[GFS2_MAX_META_HEIGHT]; 36 __u16 mp_list[GFS2_MAX_META_HEIGHT];
37 }; 37 };
38 38
39 typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh, 39 typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh,
40 struct buffer_head *bh, __be64 *top, 40 struct buffer_head *bh, __be64 *top,
41 __be64 *bottom, unsigned int height, 41 __be64 *bottom, unsigned int height,
42 void *data); 42 void *data);
43 43
44 struct strip_mine { 44 struct strip_mine {
45 int sm_first; 45 int sm_first;
46 unsigned int sm_height; 46 unsigned int sm_height;
47 }; 47 };
48 48
49 /** 49 /**
50 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page 50 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
51 * @ip: the inode 51 * @ip: the inode
52 * @dibh: the dinode buffer 52 * @dibh: the dinode buffer
53 * @block: the block number that was allocated 53 * @block: the block number that was allocated
54 * @page: The (optional) page. This is looked up if @page is NULL 54 * @page: The (optional) page. This is looked up if @page is NULL
55 * 55 *
56 * Returns: errno 56 * Returns: errno
57 */ 57 */
58 58
59 static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, 59 static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
60 u64 block, struct page *page) 60 u64 block, struct page *page)
61 { 61 {
62 struct inode *inode = &ip->i_inode; 62 struct inode *inode = &ip->i_inode;
63 struct buffer_head *bh; 63 struct buffer_head *bh;
64 int release = 0; 64 int release = 0;
65 65
66 if (!page || page->index) { 66 if (!page || page->index) {
67 page = grab_cache_page(inode->i_mapping, 0); 67 page = grab_cache_page(inode->i_mapping, 0);
68 if (!page) 68 if (!page)
69 return -ENOMEM; 69 return -ENOMEM;
70 release = 1; 70 release = 1;
71 } 71 }
72 72
73 if (!PageUptodate(page)) { 73 if (!PageUptodate(page)) {
74 void *kaddr = kmap(page); 74 void *kaddr = kmap(page);
75 u64 dsize = i_size_read(inode); 75 u64 dsize = i_size_read(inode);
76 76
77 if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode))) 77 if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
78 dsize = dibh->b_size - sizeof(struct gfs2_dinode); 78 dsize = dibh->b_size - sizeof(struct gfs2_dinode);
79 79
80 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); 80 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
81 memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize); 81 memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
82 kunmap(page); 82 kunmap(page);
83 83
84 SetPageUptodate(page); 84 SetPageUptodate(page);
85 } 85 }
86 86
87 if (!page_has_buffers(page)) 87 if (!page_has_buffers(page))
88 create_empty_buffers(page, 1 << inode->i_blkbits, 88 create_empty_buffers(page, 1 << inode->i_blkbits,
89 (1 << BH_Uptodate)); 89 (1 << BH_Uptodate));
90 90
91 bh = page_buffers(page); 91 bh = page_buffers(page);
92 92
93 if (!buffer_mapped(bh)) 93 if (!buffer_mapped(bh))
94 map_bh(bh, inode->i_sb, block); 94 map_bh(bh, inode->i_sb, block);
95 95
96 set_buffer_uptodate(bh); 96 set_buffer_uptodate(bh);
97 if (!gfs2_is_jdata(ip)) 97 if (!gfs2_is_jdata(ip))
98 mark_buffer_dirty(bh); 98 mark_buffer_dirty(bh);
99 if (!gfs2_is_writeback(ip)) 99 if (!gfs2_is_writeback(ip))
100 gfs2_trans_add_bh(ip->i_gl, bh, 0); 100 gfs2_trans_add_bh(ip->i_gl, bh, 0);
101 101
102 if (release) { 102 if (release) {
103 unlock_page(page); 103 unlock_page(page);
104 page_cache_release(page); 104 page_cache_release(page);
105 } 105 }
106 106
107 return 0; 107 return 0;
108 } 108 }
109 109
110 /** 110 /**
111 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big 111 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
112 * @ip: The GFS2 inode to unstuff 112 * @ip: The GFS2 inode to unstuff
113 * @page: The (optional) page. This is looked up if the @page is NULL 113 * @page: The (optional) page. This is looked up if the @page is NULL
114 * 114 *
115 * This routine unstuffs a dinode and returns it to a "normal" state such 115 * This routine unstuffs a dinode and returns it to a "normal" state such
116 * that the height can be grown in the traditional way. 116 * that the height can be grown in the traditional way.
117 * 117 *
118 * Returns: errno 118 * Returns: errno
119 */ 119 */
120 120
121 int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) 121 int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
122 { 122 {
123 struct buffer_head *bh, *dibh; 123 struct buffer_head *bh, *dibh;
124 struct gfs2_dinode *di; 124 struct gfs2_dinode *di;
125 u64 block = 0; 125 u64 block = 0;
126 int isdir = gfs2_is_dir(ip); 126 int isdir = gfs2_is_dir(ip);
127 int error; 127 int error;
128 128
129 down_write(&ip->i_rw_mutex); 129 down_write(&ip->i_rw_mutex);
130 130
131 error = gfs2_meta_inode_buffer(ip, &dibh); 131 error = gfs2_meta_inode_buffer(ip, &dibh);
132 if (error) 132 if (error)
133 goto out; 133 goto out;
134 134
135 if (i_size_read(&ip->i_inode)) { 135 if (i_size_read(&ip->i_inode)) {
136 /* Get a free block, fill it with the stuffed data, 136 /* Get a free block, fill it with the stuffed data,
137 and write it out to disk */ 137 and write it out to disk */
138 138
139 unsigned int n = 1; 139 unsigned int n = 1;
140 error = gfs2_alloc_block(ip, &block, &n); 140 error = gfs2_alloc_block(ip, &block, &n);
141 if (error) 141 if (error)
142 goto out_brelse; 142 goto out_brelse;
143 if (isdir) { 143 if (isdir) {
144 gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1); 144 gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
145 error = gfs2_dir_get_new_buffer(ip, block, &bh); 145 error = gfs2_dir_get_new_buffer(ip, block, &bh);
146 if (error) 146 if (error)
147 goto out_brelse; 147 goto out_brelse;
148 gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header), 148 gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
149 dibh, sizeof(struct gfs2_dinode)); 149 dibh, sizeof(struct gfs2_dinode));
150 brelse(bh); 150 brelse(bh);
151 } else { 151 } else {
152 error = gfs2_unstuffer_page(ip, dibh, block, page); 152 error = gfs2_unstuffer_page(ip, dibh, block, page);
153 if (error) 153 if (error)
154 goto out_brelse; 154 goto out_brelse;
155 } 155 }
156 } 156 }
157 157
158 /* Set up the pointer to the new block */ 158 /* Set up the pointer to the new block */
159 159
160 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 160 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
161 di = (struct gfs2_dinode *)dibh->b_data; 161 di = (struct gfs2_dinode *)dibh->b_data;
162 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 162 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
163 163
164 if (i_size_read(&ip->i_inode)) { 164 if (i_size_read(&ip->i_inode)) {
165 *(__be64 *)(di + 1) = cpu_to_be64(block); 165 *(__be64 *)(di + 1) = cpu_to_be64(block);
166 gfs2_add_inode_blocks(&ip->i_inode, 1); 166 gfs2_add_inode_blocks(&ip->i_inode, 1);
167 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 167 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
168 } 168 }
169 169
170 ip->i_height = 1; 170 ip->i_height = 1;
171 di->di_height = cpu_to_be16(1); 171 di->di_height = cpu_to_be16(1);
172 172
173 out_brelse: 173 out_brelse:
174 brelse(dibh); 174 brelse(dibh);
175 out: 175 out:
176 up_write(&ip->i_rw_mutex); 176 up_write(&ip->i_rw_mutex);
177 return error; 177 return error;
178 } 178 }
179 179
180 180
181 /** 181 /**
182 * find_metapath - Find path through the metadata tree 182 * find_metapath - Find path through the metadata tree
183 * @sdp: The superblock 183 * @sdp: The superblock
184 * @mp: The metapath to return the result in 184 * @mp: The metapath to return the result in
185 * @block: The disk block to look up 185 * @block: The disk block to look up
186 * @height: The pre-calculated height of the metadata tree 186 * @height: The pre-calculated height of the metadata tree
187 * 187 *
188 * This routine returns a struct metapath structure that defines a path 188 * This routine returns a struct metapath structure that defines a path
189 * through the metadata of inode "ip" to get to block "block". 189 * through the metadata of inode "ip" to get to block "block".
190 * 190 *
191 * Example: 191 * Example:
192 * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a 192 * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a
193 * filesystem with a blocksize of 4096. 193 * filesystem with a blocksize of 4096.
194 * 194 *
195 * find_metapath() would return a struct metapath structure set to: 195 * find_metapath() would return a struct metapath structure set to:
196 * mp_offset = 101342453, mp_height = 3, mp_list[0] = 0, mp_list[1] = 48, 196 * mp_offset = 101342453, mp_height = 3, mp_list[0] = 0, mp_list[1] = 48,
197 * and mp_list[2] = 165. 197 * and mp_list[2] = 165.
198 * 198 *
199 * That means that in order to get to the block containing the byte at 199 * That means that in order to get to the block containing the byte at
200 * offset 101342453, we would load the indirect block pointed to by pointer 200 * offset 101342453, we would load the indirect block pointed to by pointer
201 * 0 in the dinode. We would then load the indirect block pointed to by 201 * 0 in the dinode. We would then load the indirect block pointed to by
202 * pointer 48 in that indirect block. We would then load the data block 202 * pointer 48 in that indirect block. We would then load the data block
203 * pointed to by pointer 165 in that indirect block. 203 * pointed to by pointer 165 in that indirect block.
204 * 204 *
205 * ---------------------------------------- 205 * ----------------------------------------
206 * | Dinode | | 206 * | Dinode | |
207 * | | 4| 207 * | | 4|
208 * | |0 1 2 3 4 5 9| 208 * | |0 1 2 3 4 5 9|
209 * | | 6| 209 * | | 6|
210 * ---------------------------------------- 210 * ----------------------------------------
211 * | 211 * |
212 * | 212 * |
213 * V 213 * V
214 * ---------------------------------------- 214 * ----------------------------------------
215 * | Indirect Block | 215 * | Indirect Block |
216 * | 5| 216 * | 5|
217 * | 4 4 4 4 4 5 5 1| 217 * | 4 4 4 4 4 5 5 1|
218 * |0 5 6 7 8 9 0 1 2| 218 * |0 5 6 7 8 9 0 1 2|
219 * ---------------------------------------- 219 * ----------------------------------------
220 * | 220 * |
221 * | 221 * |
222 * V 222 * V
223 * ---------------------------------------- 223 * ----------------------------------------
224 * | Indirect Block | 224 * | Indirect Block |
225 * | 1 1 1 1 1 5| 225 * | 1 1 1 1 1 5|
226 * | 6 6 6 6 6 1| 226 * | 6 6 6 6 6 1|
227 * |0 3 4 5 6 7 2| 227 * |0 3 4 5 6 7 2|
228 * ---------------------------------------- 228 * ----------------------------------------
229 * | 229 * |
230 * | 230 * |
231 * V 231 * V
232 * ---------------------------------------- 232 * ----------------------------------------
233 * | Data block containing offset | 233 * | Data block containing offset |
234 * | 101342453 | 234 * | 101342453 |
235 * | | 235 * | |
236 * | | 236 * | |
237 * ---------------------------------------- 237 * ----------------------------------------
238 * 238 *
239 */ 239 */
240 240
241 static void find_metapath(const struct gfs2_sbd *sdp, u64 block, 241 static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
242 struct metapath *mp, unsigned int height) 242 struct metapath *mp, unsigned int height)
243 { 243 {
244 unsigned int i; 244 unsigned int i;
245 245
246 for (i = height; i--;) 246 for (i = height; i--;)
247 mp->mp_list[i] = do_div(block, sdp->sd_inptrs); 247 mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
248 248
249 } 249 }
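
find_metapath() is just repeated division: do_div() stores the remainder into mp_list[i] and leaves the quotient in block, peeling off the least significant index first. A userspace sketch with assumed values (the real sd_inptrs depends on the block size):

#include <stdio.h>

int main(void)
{
	unsigned long long block = 24741;	/* hypothetical logical block */
	unsigned int inptrs = 509;		/* assumed pointers per indirect block */
	unsigned int height = 3;
	unsigned int mp_list[3];
	unsigned int i;

	for (i = height; i--;) {		/* same shape as find_metapath() */
		mp_list[i] = block % inptrs;	/* do_div(): remainder out... */
		block /= inptrs;		/* ...quotient back into block */
	}

	for (i = 0; i < height; i++)
		printf("mp_list[%u] = %u\n", i, mp_list[i]);
	return 0;
}
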
250 250
251 static inline unsigned int metapath_branch_start(const struct metapath *mp) 251 static inline unsigned int metapath_branch_start(const struct metapath *mp)
252 { 252 {
253 if (mp->mp_list[0] == 0) 253 if (mp->mp_list[0] == 0)
254 return 2; 254 return 2;
255 return 1; 255 return 1;
256 } 256 }
257 257
258 /** 258 /**
259 * metapointer - Return pointer to start of metadata in a buffer 259 * metapointer - Return pointer to start of metadata in a buffer
260 * @height: The metadata height (0 = dinode) 260 * @height: The metadata height (0 = dinode)
261 * @mp: The metapath 261 * @mp: The metapath
262 * 262 *
263 * Return a pointer to the block number of the next height of the metadata 263 * Return a pointer to the block number of the next height of the metadata
264 * tree given a buffer containing the pointer to the current height of the 264 * tree given a buffer containing the pointer to the current height of the
265 * metadata tree. 265 * metadata tree.
266 */ 266 */
267 267
268 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp) 268 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
269 { 269 {
270 struct buffer_head *bh = mp->mp_bh[height]; 270 struct buffer_head *bh = mp->mp_bh[height];
271 unsigned int head_size = (height > 0) ? 271 unsigned int head_size = (height > 0) ?
272 sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode); 272 sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
273 return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height]; 273 return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height];
274 } 274 }
275 275
276 /** 276 /**
277 * lookup_metapath - Walk the metadata tree to a specific point 277 * lookup_metapath - Walk the metadata tree to a specific point
278 * @ip: The inode 278 * @ip: The inode
279 * @mp: The metapath 279 * @mp: The metapath
280 * 280 *
281 * Assumes that the inode's buffer has already been looked up and 281 * Assumes that the inode's buffer has already been looked up and
282 * hooked onto mp->mp_bh[0] and that the metapath has been initialised 282 * hooked onto mp->mp_bh[0] and that the metapath has been initialised
283 * by find_metapath(). 283 * by find_metapath().
284 * 284 *
285 * If this function encounters part of the tree which has not been 285 * If this function encounters part of the tree which has not been
286 * allocated, it returns the current height of the tree at the point 286 * allocated, it returns the current height of the tree at the point
287 * at which it found the unallocated block. Blocks which are found are 287 * at which it found the unallocated block. Blocks which are found are
288 * added to the mp->mp_bh[] list. 288 * added to the mp->mp_bh[] list.
289 * 289 *
290 * Returns: error or height of metadata tree 290 * Returns: error or height of metadata tree
291 */ 291 */
292 292
293 static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp) 293 static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
294 { 294 {
295 unsigned int end_of_metadata = ip->i_height - 1; 295 unsigned int end_of_metadata = ip->i_height - 1;
296 unsigned int x; 296 unsigned int x;
297 __be64 *ptr; 297 __be64 *ptr;
298 u64 dblock; 298 u64 dblock;
299 int ret; 299 int ret;
300 300
301 for (x = 0; x < end_of_metadata; x++) { 301 for (x = 0; x < end_of_metadata; x++) {
302 ptr = metapointer(x, mp); 302 ptr = metapointer(x, mp);
303 dblock = be64_to_cpu(*ptr); 303 dblock = be64_to_cpu(*ptr);
304 if (!dblock) 304 if (!dblock)
305 return x + 1; 305 return x + 1;
306 306
307 ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, 0, &mp->mp_bh[x+1]); 307 ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, 0, &mp->mp_bh[x+1]);
308 if (ret) 308 if (ret)
309 return ret; 309 return ret;
310 } 310 }
311 311
312 return ip->i_height; 312 return ip->i_height;
313 } 313 }
314 314
315 static inline void release_metapath(struct metapath *mp) 315 static inline void release_metapath(struct metapath *mp)
316 { 316 {
317 int i; 317 int i;
318 318
319 for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) { 319 for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
320 if (mp->mp_bh[i] == NULL) 320 if (mp->mp_bh[i] == NULL)
321 break; 321 break;
322 brelse(mp->mp_bh[i]); 322 brelse(mp->mp_bh[i]);
323 } 323 }
324 } 324 }
325 325
326 /** 326 /**
327 * gfs2_extent_length - Returns length of an extent of blocks 327 * gfs2_extent_length - Returns length of an extent of blocks
328 * @start: Start of the buffer 328 * @start: Start of the buffer
329 * @len: Length of the buffer in bytes 329 * @len: Length of the buffer in bytes
330 * @ptr: Current position in the buffer 330 * @ptr: Current position in the buffer
331 * @limit: Max extent length to return (0 = unlimited) 331 * @limit: Max extent length to return (0 = unlimited)
332 * @eob: Set to 1 if we hit "end of block" 332 * @eob: Set to 1 if we hit "end of block"
333 * 333 *
334 * If the first block is zero (unallocated) it will return the number of 334 * If the first block is zero (unallocated) it will return the number of
335 * unallocated blocks in the extent, otherwise it will return the number 335 * unallocated blocks in the extent, otherwise it will return the number
336 * of contiguous blocks in the extent. 336 * of contiguous blocks in the extent.
337 * 337 *
338 * Returns: The length of the extent (minimum of one block) 338 * Returns: The length of the extent (minimum of one block)
339 */ 339 */
340 340
341 static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, unsigned limit, int *eob) 341 static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, unsigned limit, int *eob)
342 { 342 {
343 const __be64 *end = (start + len); 343 const __be64 *end = (start + len);
344 const __be64 *first = ptr; 344 const __be64 *first = ptr;
345 u64 d = be64_to_cpu(*ptr); 345 u64 d = be64_to_cpu(*ptr);
346 346
347 *eob = 0; 347 *eob = 0;
348 do { 348 do {
349 ptr++; 349 ptr++;
350 if (ptr >= end) 350 if (ptr >= end)
351 break; 351 break;
352 if (limit && --limit == 0) 352 if (limit && --limit == 0)
353 break; 353 break;
354 if (d) 354 if (d)
355 d++; 355 d++;
356 } while(be64_to_cpu(*ptr) == d); 356 } while(be64_to_cpu(*ptr) == d);
357 if (ptr >= end) 357 if (ptr >= end)
358 *eob = 1; 358 *eob = 1;
359 return (ptr - first); 359 return (ptr - first);
360 } 360 }
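
Put differently, the run continues while each successive pointer decodes to exactly one more than the previous one, or stays zero for an unallocated run. A userspace sketch of the same counting rule on illustrative pointer values:

#include <stdio.h>

int main(void)
{
	unsigned long long ptrs[] = { 100, 101, 102, 105, 106 };
	unsigned int n = sizeof(ptrs) / sizeof(ptrs[0]);
	unsigned int len = 1;
	unsigned long long d = ptrs[0];

	while (len < n) {
		if (d)
			d++;		/* expect the next contiguous block */
		if (ptrs[len] != d)
			break;		/* run ends at the first gap */
		len++;
	}
	printf("extent length: %u blocks\n", len);	/* prints 3 */
	return 0;
}
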
361 361
362 static inline void bmap_lock(struct gfs2_inode *ip, int create) 362 static inline void bmap_lock(struct gfs2_inode *ip, int create)
363 { 363 {
364 if (create) 364 if (create)
365 down_write(&ip->i_rw_mutex); 365 down_write(&ip->i_rw_mutex);
366 else 366 else
367 down_read(&ip->i_rw_mutex); 367 down_read(&ip->i_rw_mutex);
368 } 368 }
369 369
370 static inline void bmap_unlock(struct gfs2_inode *ip, int create) 370 static inline void bmap_unlock(struct gfs2_inode *ip, int create)
371 { 371 {
372 if (create) 372 if (create)
373 up_write(&ip->i_rw_mutex); 373 up_write(&ip->i_rw_mutex);
374 else 374 else
375 up_read(&ip->i_rw_mutex); 375 up_read(&ip->i_rw_mutex);
376 } 376 }
377 377
378 static inline __be64 *gfs2_indirect_init(struct metapath *mp, 378 static inline __be64 *gfs2_indirect_init(struct metapath *mp,
379 struct gfs2_glock *gl, unsigned int i, 379 struct gfs2_glock *gl, unsigned int i,
380 unsigned offset, u64 bn) 380 unsigned offset, u64 bn)
381 { 381 {
382 __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data + 382 __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
383 ((i > 1) ? sizeof(struct gfs2_meta_header) : 383 ((i > 1) ? sizeof(struct gfs2_meta_header) :
384 sizeof(struct gfs2_dinode))); 384 sizeof(struct gfs2_dinode)));
385 BUG_ON(i < 1); 385 BUG_ON(i < 1);
386 BUG_ON(mp->mp_bh[i] != NULL); 386 BUG_ON(mp->mp_bh[i] != NULL);
387 mp->mp_bh[i] = gfs2_meta_new(gl, bn); 387 mp->mp_bh[i] = gfs2_meta_new(gl, bn);
388 gfs2_trans_add_bh(gl, mp->mp_bh[i], 1); 388 gfs2_trans_add_bh(gl, mp->mp_bh[i], 1);
389 gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN); 389 gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
390 gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header)); 390 gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
391 ptr += offset; 391 ptr += offset;
392 *ptr = cpu_to_be64(bn); 392 *ptr = cpu_to_be64(bn);
393 return ptr; 393 return ptr;
394 } 394 }
395 395
396 enum alloc_state { 396 enum alloc_state {
397 ALLOC_DATA = 0, 397 ALLOC_DATA = 0,
398 ALLOC_GROW_DEPTH = 1, 398 ALLOC_GROW_DEPTH = 1,
399 ALLOC_GROW_HEIGHT = 2, 399 ALLOC_GROW_HEIGHT = 2,
400 /* ALLOC_UNSTUFF = 3, TBD and rather complicated */ 400 /* ALLOC_UNSTUFF = 3, TBD and rather complicated */
401 }; 401 };
402 402
403 /** 403 /**
404 * gfs2_bmap_alloc - Build a metadata tree of the requested height 404 * gfs2_bmap_alloc - Build a metadata tree of the requested height
405 * @inode: The GFS2 inode 405 * @inode: The GFS2 inode
406 * @lblock: The logical starting block of the extent 406 * @lblock: The logical starting block of the extent
407 * @bh_map: This is used to return the mapping details 407 * @bh_map: This is used to return the mapping details
408 * @mp: The metapath 408 * @mp: The metapath
409 * @sheight: The starting height (i.e. what's already mapped) 409 * @sheight: The starting height (i.e. what's already mapped)
410 * @height: The height to build to 410 * @height: The height to build to
411 * @maxlen: The max number of data blocks to alloc 411 * @maxlen: The max number of data blocks to alloc
412 * 412 *
413 * In this routine we may have to alloc: 413 * In this routine we may have to alloc:
414 * i) Indirect blocks to grow the metadata tree height 414 * i) Indirect blocks to grow the metadata tree height
415 * ii) Indirect blocks to fill in lower part of the metadata tree 415 * ii) Indirect blocks to fill in lower part of the metadata tree
416 * iii) Data blocks 416 * iii) Data blocks
417 * 417 *
418 * The function is in two parts. The first part works out the total 418 * The function is in two parts. The first part works out the total
419 * number of blocks which we need. The second part does the actual 419 * number of blocks which we need. The second part does the actual
420 * allocation asking for an extent at a time (if enough contiguous free 420 * allocation asking for an extent at a time (if enough contiguous free
421 * blocks are available, there will only be one request per bmap call) 421 * blocks are available, there will only be one request per bmap call)
422 * and uses the state machine to initialise the blocks in order. 422 * and uses the state machine to initialise the blocks in order.
423 * 423 *
424 * Returns: errno on error 424 * Returns: errno on error
425 */ 425 */
426 426
427 static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, 427 static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
428 struct buffer_head *bh_map, struct metapath *mp, 428 struct buffer_head *bh_map, struct metapath *mp,
429 const unsigned int sheight, 429 const unsigned int sheight,
430 const unsigned int height, 430 const unsigned int height,
431 const unsigned int maxlen) 431 const unsigned int maxlen)
432 { 432 {
433 struct gfs2_inode *ip = GFS2_I(inode); 433 struct gfs2_inode *ip = GFS2_I(inode);
434 struct gfs2_sbd *sdp = GFS2_SB(inode); 434 struct gfs2_sbd *sdp = GFS2_SB(inode);
435 struct buffer_head *dibh = mp->mp_bh[0]; 435 struct buffer_head *dibh = mp->mp_bh[0];
436 u64 bn, dblock = 0; 436 u64 bn, dblock = 0;
437 unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; 437 unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
438 unsigned dblks = 0; 438 unsigned dblks = 0;
439 unsigned ptrs_per_blk; 439 unsigned ptrs_per_blk;
440 const unsigned end_of_metadata = height - 1; 440 const unsigned end_of_metadata = height - 1;
441 int eob = 0; 441 int eob = 0;
442 enum alloc_state state; 442 enum alloc_state state;
443 __be64 *ptr; 443 __be64 *ptr;
444 __be64 zero_bn = 0; 444 __be64 zero_bn = 0;
445 445
446 BUG_ON(sheight < 1); 446 BUG_ON(sheight < 1);
447 BUG_ON(dibh == NULL); 447 BUG_ON(dibh == NULL);
448 448
449 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 449 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
450 450
451 if (height == sheight) { 451 if (height == sheight) {
452 struct buffer_head *bh; 452 struct buffer_head *bh;
453 /* Bottom indirect block exists, find unalloced extent size */ 453 /* Bottom indirect block exists, find unalloced extent size */
454 ptr = metapointer(end_of_metadata, mp); 454 ptr = metapointer(end_of_metadata, mp);
455 bh = mp->mp_bh[end_of_metadata]; 455 bh = mp->mp_bh[end_of_metadata];
456 dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, 456 dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen,
457 &eob); 457 &eob);
458 BUG_ON(dblks < 1); 458 BUG_ON(dblks < 1);
459 state = ALLOC_DATA; 459 state = ALLOC_DATA;
460 } else { 460 } else {
461 /* Need to allocate indirect blocks */ 461 /* Need to allocate indirect blocks */
462 ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs; 462 ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs;
463 dblks = min(maxlen, ptrs_per_blk - mp->mp_list[end_of_metadata]); 463 dblks = min(maxlen, ptrs_per_blk - mp->mp_list[end_of_metadata]);
464 if (height == ip->i_height) { 464 if (height == ip->i_height) {
465 /* Writing into existing tree, extend tree down */ 465 /* Writing into existing tree, extend tree down */
466 iblks = height - sheight; 466 iblks = height - sheight;
467 state = ALLOC_GROW_DEPTH; 467 state = ALLOC_GROW_DEPTH;
468 } else { 468 } else {
469 /* Building up tree height */ 469 /* Building up tree height */
470 state = ALLOC_GROW_HEIGHT; 470 state = ALLOC_GROW_HEIGHT;
471 iblks = height - ip->i_height; 471 iblks = height - ip->i_height;
472 branch_start = metapath_branch_start(mp); 472 branch_start = metapath_branch_start(mp);
473 iblks += (height - branch_start); 473 iblks += (height - branch_start);
474 } 474 }
475 } 475 }
476 476
477 /* start of the second part of the function (state machine) */ 477 /* start of the second part of the function (state machine) */
478 478
479 blks = dblks + iblks; 479 blks = dblks + iblks;
480 i = sheight; 480 i = sheight;
481 do { 481 do {
482 int error; 482 int error;
483 n = blks - alloced; 483 n = blks - alloced;
484 error = gfs2_alloc_block(ip, &bn, &n); 484 error = gfs2_alloc_block(ip, &bn, &n);
485 if (error) 485 if (error)
486 return error; 486 return error;
487 alloced += n; 487 alloced += n;
488 if (state != ALLOC_DATA || gfs2_is_jdata(ip)) 488 if (state != ALLOC_DATA || gfs2_is_jdata(ip))
489 gfs2_trans_add_unrevoke(sdp, bn, n); 489 gfs2_trans_add_unrevoke(sdp, bn, n);
490 switch (state) { 490 switch (state) {
491 /* Growing height of tree */ 491 /* Growing height of tree */
492 case ALLOC_GROW_HEIGHT: 492 case ALLOC_GROW_HEIGHT:
493 if (i == 1) { 493 if (i == 1) {
494 ptr = (__be64 *)(dibh->b_data + 494 ptr = (__be64 *)(dibh->b_data +
495 sizeof(struct gfs2_dinode)); 495 sizeof(struct gfs2_dinode));
496 zero_bn = *ptr; 496 zero_bn = *ptr;
497 } 497 }
498 for (; i - 1 < height - ip->i_height && n > 0; i++, n--) 498 for (; i - 1 < height - ip->i_height && n > 0; i++, n--)
499 gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++); 499 gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
500 if (i - 1 == height - ip->i_height) { 500 if (i - 1 == height - ip->i_height) {
501 i--; 501 i--;
502 gfs2_buffer_copy_tail(mp->mp_bh[i], 502 gfs2_buffer_copy_tail(mp->mp_bh[i],
503 sizeof(struct gfs2_meta_header), 503 sizeof(struct gfs2_meta_header),
504 dibh, sizeof(struct gfs2_dinode)); 504 dibh, sizeof(struct gfs2_dinode));
505 gfs2_buffer_clear_tail(dibh, 505 gfs2_buffer_clear_tail(dibh,
506 sizeof(struct gfs2_dinode) + 506 sizeof(struct gfs2_dinode) +
507 sizeof(__be64)); 507 sizeof(__be64));
508 ptr = (__be64 *)(mp->mp_bh[i]->b_data + 508 ptr = (__be64 *)(mp->mp_bh[i]->b_data +
509 sizeof(struct gfs2_meta_header)); 509 sizeof(struct gfs2_meta_header));
510 *ptr = zero_bn; 510 *ptr = zero_bn;
511 state = ALLOC_GROW_DEPTH; 511 state = ALLOC_GROW_DEPTH;
512 for(i = branch_start; i < height; i++) { 512 for(i = branch_start; i < height; i++) {
513 if (mp->mp_bh[i] == NULL) 513 if (mp->mp_bh[i] == NULL)
514 break; 514 break;
515 brelse(mp->mp_bh[i]); 515 brelse(mp->mp_bh[i]);
516 mp->mp_bh[i] = NULL; 516 mp->mp_bh[i] = NULL;
517 } 517 }
518 i = branch_start; 518 i = branch_start;
519 } 519 }
520 if (n == 0) 520 if (n == 0)
521 break; 521 break;
522 /* Branching from existing tree */ 522 /* Branching from existing tree */
523 case ALLOC_GROW_DEPTH: 523 case ALLOC_GROW_DEPTH:
524 if (i > 1 && i < height) 524 if (i > 1 && i < height)
525 gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i-1], 1); 525 gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i-1], 1);
526 for (; i < height && n > 0; i++, n--) 526 for (; i < height && n > 0; i++, n--)
527 gfs2_indirect_init(mp, ip->i_gl, i, 527 gfs2_indirect_init(mp, ip->i_gl, i,
528 mp->mp_list[i-1], bn++); 528 mp->mp_list[i-1], bn++);
529 if (i == height) 529 if (i == height)
530 state = ALLOC_DATA; 530 state = ALLOC_DATA;
531 if (n == 0) 531 if (n == 0)
532 break; 532 break;
533 /* Tree complete, adding data blocks */ 533 /* Tree complete, adding data blocks */
534 case ALLOC_DATA: 534 case ALLOC_DATA:
535 BUG_ON(n > dblks); 535 BUG_ON(n > dblks);
536 BUG_ON(mp->mp_bh[end_of_metadata] == NULL); 536 BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
537 gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[end_of_metadata], 1); 537 gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[end_of_metadata], 1);
538 dblks = n; 538 dblks = n;
539 ptr = metapointer(end_of_metadata, mp); 539 ptr = metapointer(end_of_metadata, mp);
540 dblock = bn; 540 dblock = bn;
541 while (n-- > 0) 541 while (n-- > 0)
542 *ptr++ = cpu_to_be64(bn++); 542 *ptr++ = cpu_to_be64(bn++);
543 break; 543 break;
544 } 544 }
545 } while ((state != ALLOC_DATA) || !dblock); 545 } while ((state != ALLOC_DATA) || !dblock);
546 546
547 ip->i_height = height; 547 ip->i_height = height;
548 gfs2_add_inode_blocks(&ip->i_inode, alloced); 548 gfs2_add_inode_blocks(&ip->i_inode, alloced);
549 gfs2_dinode_out(ip, mp->mp_bh[0]->b_data); 549 gfs2_dinode_out(ip, mp->mp_bh[0]->b_data);
550 map_bh(bh_map, inode->i_sb, dblock); 550 map_bh(bh_map, inode->i_sb, dblock);
551 bh_map->b_size = dblks << inode->i_blkbits; 551 bh_map->b_size = dblks << inode->i_blkbits;
552 set_buffer_new(bh_map); 552 set_buffer_new(bh_map);
553 return 0; 553 return 0;
554 } 554 }
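
The switch in the loop above depends on deliberate fall-through: a single allocated extent may enter at ALLOC_GROW_HEIGHT and continue straight through ALLOC_GROW_DEPTH into ALLOC_DATA. A toy userspace reduction of that control flow (the block counts are made up):

#include <stdio.h>

enum alloc_state { ALLOC_DATA = 0, ALLOC_GROW_DEPTH = 1, ALLOC_GROW_HEIGHT = 2 };

int main(void)
{
	enum alloc_state state = ALLOC_GROW_HEIGHT;
	int n = 5;	/* blocks granted in this (single) extent */

	switch (state) {
	case ALLOC_GROW_HEIGHT:
		printf("consume 2 blocks growing height\n");
		n -= 2;
		state = ALLOC_GROW_DEPTH;
		if (n == 0)
			break;
		/* fall through */
	case ALLOC_GROW_DEPTH:
		printf("consume 1 block growing depth\n");
		n -= 1;
		state = ALLOC_DATA;
		if (n == 0)
			break;
		/* fall through */
	case ALLOC_DATA:
		printf("map %d data block(s)\n", n);
		break;
	}
	return 0;
}
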
555 555
556 /** 556 /**
557 * gfs2_block_map - Map a block from an inode to a disk block 557 * gfs2_block_map - Map a block from an inode to a disk block
558 * @inode: The inode 558 * @inode: The inode
559 * @lblock: The logical block number 559 * @lblock: The logical block number
560 * @bh_map: The bh to be mapped 560 * @bh_map: The bh to be mapped
561 * @create: True if it's ok to alloc blocks to satisfy the request 561 * @create: True if it's ok to alloc blocks to satisfy the request
562 * 562 *
563 * Sets buffer_mapped() if successful, sets buffer_boundary() if a 563 * Sets buffer_mapped() if successful, sets buffer_boundary() if a
564 * read of metadata will be required before the next block can be 564 * read of metadata will be required before the next block can be
565 * mapped. Sets buffer_new() if new blocks were allocated. 565 * mapped. Sets buffer_new() if new blocks were allocated.
566 * 566 *
567 * Returns: errno 567 * Returns: errno
568 */ 568 */
569 569
570 int gfs2_block_map(struct inode *inode, sector_t lblock, 570 int gfs2_block_map(struct inode *inode, sector_t lblock,
571 struct buffer_head *bh_map, int create) 571 struct buffer_head *bh_map, int create)
572 { 572 {
573 struct gfs2_inode *ip = GFS2_I(inode); 573 struct gfs2_inode *ip = GFS2_I(inode);
574 struct gfs2_sbd *sdp = GFS2_SB(inode); 574 struct gfs2_sbd *sdp = GFS2_SB(inode);
575 unsigned int bsize = sdp->sd_sb.sb_bsize; 575 unsigned int bsize = sdp->sd_sb.sb_bsize;
576 const unsigned int maxlen = bh_map->b_size >> inode->i_blkbits; 576 const unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
577 const u64 *arr = sdp->sd_heightsize; 577 const u64 *arr = sdp->sd_heightsize;
578 __be64 *ptr; 578 __be64 *ptr;
579 u64 size; 579 u64 size;
580 struct metapath mp; 580 struct metapath mp;
581 int ret; 581 int ret;
582 int eob; 582 int eob;
583 unsigned int len; 583 unsigned int len;
584 struct buffer_head *bh; 584 struct buffer_head *bh;
585 u8 height; 585 u8 height;
586 586
587 BUG_ON(maxlen == 0); 587 BUG_ON(maxlen == 0);
588 588
589 memset(mp.mp_bh, 0, sizeof(mp.mp_bh)); 589 memset(mp.mp_bh, 0, sizeof(mp.mp_bh));
590 bmap_lock(ip, create); 590 bmap_lock(ip, create);
591 clear_buffer_mapped(bh_map); 591 clear_buffer_mapped(bh_map);
592 clear_buffer_new(bh_map); 592 clear_buffer_new(bh_map);
593 clear_buffer_boundary(bh_map); 593 clear_buffer_boundary(bh_map);
594 trace_gfs2_bmap(ip, bh_map, lblock, create, 1); 594 trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
595 if (gfs2_is_dir(ip)) { 595 if (gfs2_is_dir(ip)) {
596 bsize = sdp->sd_jbsize; 596 bsize = sdp->sd_jbsize;
597 arr = sdp->sd_jheightsize; 597 arr = sdp->sd_jheightsize;
598 } 598 }
599 599
600 ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]); 600 ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
601 if (ret) 601 if (ret)
602 goto out; 602 goto out;
603 603
604 height = ip->i_height; 604 height = ip->i_height;
605 size = (lblock + 1) * bsize; 605 size = (lblock + 1) * bsize;
606 while (size > arr[height]) 606 while (size > arr[height])
607 height++; 607 height++;
608 find_metapath(sdp, lblock, &mp, height); 608 find_metapath(sdp, lblock, &mp, height);
609 ret = 1; 609 ret = 1;
610 if (height > ip->i_height || gfs2_is_stuffed(ip)) 610 if (height > ip->i_height || gfs2_is_stuffed(ip))
611 goto do_alloc; 611 goto do_alloc;
612 ret = lookup_metapath(ip, &mp); 612 ret = lookup_metapath(ip, &mp);
613 if (ret < 0) 613 if (ret < 0)
614 goto out; 614 goto out;
615 if (ret != ip->i_height) 615 if (ret != ip->i_height)
616 goto do_alloc; 616 goto do_alloc;
617 ptr = metapointer(ip->i_height - 1, &mp); 617 ptr = metapointer(ip->i_height - 1, &mp);
618 if (*ptr == 0) 618 if (*ptr == 0)
619 goto do_alloc; 619 goto do_alloc;
620 map_bh(bh_map, inode->i_sb, be64_to_cpu(*ptr)); 620 map_bh(bh_map, inode->i_sb, be64_to_cpu(*ptr));
621 bh = mp.mp_bh[ip->i_height - 1]; 621 bh = mp.mp_bh[ip->i_height - 1];
622 len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, &eob); 622 len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, &eob);
623 bh_map->b_size = (len << inode->i_blkbits); 623 bh_map->b_size = (len << inode->i_blkbits);
624 if (eob) 624 if (eob)
625 set_buffer_boundary(bh_map); 625 set_buffer_boundary(bh_map);
626 ret = 0; 626 ret = 0;
627 out: 627 out:
628 release_metapath(&mp); 628 release_metapath(&mp);
629 trace_gfs2_bmap(ip, bh_map, lblock, create, ret); 629 trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
630 bmap_unlock(ip, create); 630 bmap_unlock(ip, create);
631 return ret; 631 return ret;
632 632
633 do_alloc: 633 do_alloc:
634 /* All allocations are done here; first check the create flag */ 634 /* All allocations are done here; first check the create flag */
635 if (!create) { 635 if (!create) {
636 BUG_ON(gfs2_is_stuffed(ip)); 636 BUG_ON(gfs2_is_stuffed(ip));
637 ret = 0; 637 ret = 0;
638 goto out; 638 goto out;
639 } 639 }
640 640
641 /* At this point ret is the tree depth of already allocated blocks */ 641 /* At this point ret is the tree depth of already allocated blocks */
642 ret = gfs2_bmap_alloc(inode, lblock, bh_map, &mp, ret, height, maxlen); 642 ret = gfs2_bmap_alloc(inode, lblock, bh_map, &mp, ret, height, maxlen);
643 goto out; 643 goto out;
644 } 644 }
645 645
646 /* 646 /*
647 * Deprecated: do not use in new code 647 * Deprecated: do not use in new code
648 */ 648 */
649 int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen) 649 int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
650 { 650 {
651 struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 }; 651 struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
652 int ret; 652 int ret;
653 int create = *new; 653 int create = *new;
654 654
655 BUG_ON(!extlen); 655 BUG_ON(!extlen);
656 BUG_ON(!dblock); 656 BUG_ON(!dblock);
657 BUG_ON(!new); 657 BUG_ON(!new);
658 658
659 bh.b_size = 1 << (inode->i_blkbits + (create ? 0 : 5)); 659 bh.b_size = 1 << (inode->i_blkbits + (create ? 0 : 5));
660 ret = gfs2_block_map(inode, lblock, &bh, create); 660 ret = gfs2_block_map(inode, lblock, &bh, create);
661 *extlen = bh.b_size >> inode->i_blkbits; 661 *extlen = bh.b_size >> inode->i_blkbits;
662 *dblock = bh.b_blocknr; 662 *dblock = bh.b_blocknr;
663 if (buffer_new(&bh)) 663 if (buffer_new(&bh))
664 *new = 1; 664 *new = 1;
665 else 665 else
666 *new = 0; 666 *new = 0;
667 return ret; 667 return ret;
668 } 668 }
669 669
670 /** 670 /**
671 * recursive_scan - recursively scan through the end of a file 671 * recursive_scan - recursively scan through the end of a file
672 * @ip: the inode 672 * @ip: the inode
673 * @dibh: the dinode buffer 673 * @dibh: the dinode buffer
674 * @mp: the path through the metadata to the point to start 674 * @mp: the path through the metadata to the point to start
675 * @height: the height the recursion is at 675 * @height: the height the recursion is at
676 * @block: the indirect block to look at 676 * @block: the indirect block to look at
677 * @first: 1 if this is the first block 677 * @first: 1 if this is the first block
678 * @bc: the call to make for each piece of metadata 678 * @bc: the call to make for each piece of metadata
679 * @data: data opaque to this function to pass to @bc 679 * @data: data opaque to this function to pass to @bc
680 * 680 *
681 * When this is first called @height and @block should be zero and 681 * When this is first called @height and @block should be zero and
682 * @first should be 1. 682 * @first should be 1.
683 * 683 *
684 * Returns: errno 684 * Returns: errno
685 */ 685 */
686 686
687 static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, 687 static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
688 struct metapath *mp, unsigned int height, 688 struct metapath *mp, unsigned int height,
689 u64 block, int first, block_call_t bc, 689 u64 block, int first, block_call_t bc,
690 void *data) 690 void *data)
691 { 691 {
692 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 692 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
693 struct buffer_head *bh = NULL; 693 struct buffer_head *bh = NULL;
694 __be64 *top, *bottom; 694 __be64 *top, *bottom;
695 u64 bn; 695 u64 bn;
696 int error; 696 int error;
697 int mh_size = sizeof(struct gfs2_meta_header); 697 int mh_size = sizeof(struct gfs2_meta_header);
698 698
699 if (!height) { 699 if (!height) {
700 error = gfs2_meta_inode_buffer(ip, &bh); 700 error = gfs2_meta_inode_buffer(ip, &bh);
701 if (error) 701 if (error)
702 return error; 702 return error;
703 dibh = bh; 703 dibh = bh;
704 704
705 top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0]; 705 top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
706 bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs; 706 bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
707 } else { 707 } else {
708 error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh); 708 error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
709 if (error) 709 if (error)
710 return error; 710 return error;
711 711
712 top = (__be64 *)(bh->b_data + mh_size) + 712 top = (__be64 *)(bh->b_data + mh_size) +
713 (first ? mp->mp_list[height] : 0); 713 (first ? mp->mp_list[height] : 0);
714 714
715 bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs; 715 bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
716 } 716 }
717 717
718 error = bc(ip, dibh, bh, top, bottom, height, data); 718 error = bc(ip, dibh, bh, top, bottom, height, data);
719 if (error) 719 if (error)
720 goto out; 720 goto out;
721 721
722 if (height < ip->i_height - 1) 722 if (height < ip->i_height - 1)
723 for (; top < bottom; top++, first = 0) { 723 for (; top < bottom; top++, first = 0) {
724 if (!*top) 724 if (!*top)
725 continue; 725 continue;
726 726
727 bn = be64_to_cpu(*top); 727 bn = be64_to_cpu(*top);
728 728
729 error = recursive_scan(ip, dibh, mp, height + 1, bn, 729 error = recursive_scan(ip, dibh, mp, height + 1, bn,
730 first, bc, data); 730 first, bc, data);
731 if (error) 731 if (error)
732 break; 732 break;
733 } 733 }
734 734
735 out: 735 out:
736 brelse(bh); 736 brelse(bh);
737 return error; 737 return error;
738 } 738 }
739 739
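The shape of recursive_scan() is easier to see without the buffer-head and metapath plumbing. Below is a minimal standalone C model of the same walk (everything here is invented for illustration: the four-pointer blocks, the tiny in-memory "disk", and the names): visit each block of pointers, hand it to the callback, skip holes, and recurse only while above the bottom layer, just as the height check in the function does.

#include <stdint.h>
#include <stdio.h>

#define PTRS 4

/* tiny in-memory "disk": block 0 is the root of a height-2 tree */
static uint64_t blk[16][PTRS] = {
        [0] = { 1, 0, 2, 0 },           /* root: two children, two holes */
        [1] = { 10, 11, 0, 0 },         /* bottom-layer pointer blocks   */
        [2] = { 0, 12, 0, 0 },
};

typedef int (*block_cb)(uint64_t *top, uint64_t *bottom,
                        unsigned int height, void *data);

static int scan(uint64_t bn, unsigned int height, unsigned int tree_height,
                block_cb cb, void *data)
{
        uint64_t *top = blk[bn], *bottom = blk[bn] + PTRS;
        int error = cb(top, bottom, height, data);

        if (error || height >= tree_height - 1)
                return error;           /* bottom layer: do not recurse */

        for (; top < bottom; top++) {
                if (!*top)
                        continue;       /* a hole: nothing beneath it */
                error = scan(*top, height + 1, tree_height, cb, data);
                if (error)
                        break;
        }
        return error;
}

static int print_cb(uint64_t *top, uint64_t *bottom,
                    unsigned int height, void *data)
{
        (void)data;
        for (; top < bottom; top++)
                if (*top)
                        printf("height %u: block %llu\n", height,
                               (unsigned long long)*top);
        return 0;
}

int main(void)
{
        return scan(0, 0, 2, print_cb, NULL);
}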
740 /** 740 /**
741 * do_strip - Look for a particular layer of the file and strip it off 741 * do_strip - Look for a particular layer of the file and strip it off
742 * @ip: the inode 742 * @ip: the inode
743 * @dibh: the dinode buffer 743 * @dibh: the dinode buffer
744 * @bh: A buffer of pointers 744 * @bh: A buffer of pointers
745 * @top: The first pointer in the buffer 745 * @top: The first pointer in the buffer
746 * @bottom: One more than the last pointer 746 * @bottom: One more than the last pointer
747 * @height: the height this buffer is at 747 * @height: the height this buffer is at
748 * @data: a pointer to a struct strip_mine 748 * @data: a pointer to a struct strip_mine
749 * 749 *
750 * Returns: errno 750 * Returns: errno
751 */ 751 */
752 752
753 static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, 753 static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
754 struct buffer_head *bh, __be64 *top, __be64 *bottom, 754 struct buffer_head *bh, __be64 *top, __be64 *bottom,
755 unsigned int height, void *data) 755 unsigned int height, void *data)
756 { 756 {
757 struct strip_mine *sm = data; 757 struct strip_mine *sm = data;
758 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 758 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
759 struct gfs2_rgrp_list rlist; 759 struct gfs2_rgrp_list rlist;
760 u64 bn, bstart; 760 u64 bn, bstart;
761 u32 blen, btotal; 761 u32 blen, btotal;
762 __be64 *p; 762 __be64 *p;
763 unsigned int rg_blocks = 0; 763 unsigned int rg_blocks = 0;
764 int metadata; 764 int metadata;
765 unsigned int revokes = 0; 765 unsigned int revokes = 0;
766 int x; 766 int x;
767 int error = 0; 767 int error = 0;
768 768
769 if (!*top) 769 if (!*top)
770 sm->sm_first = 0; 770 sm->sm_first = 0;
771 771
772 if (height != sm->sm_height) 772 if (height != sm->sm_height)
773 return 0; 773 return 0;
774 774
775 if (sm->sm_first) { 775 if (sm->sm_first) {
776 top++; 776 top++;
777 sm->sm_first = 0; 777 sm->sm_first = 0;
778 } 778 }
779 779
780 metadata = (height != ip->i_height - 1); 780 metadata = (height != ip->i_height - 1);
781 if (metadata) 781 if (metadata)
782 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; 782 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
783 else if (ip->i_depth) 783 else if (ip->i_depth)
784 revokes = sdp->sd_inptrs; 784 revokes = sdp->sd_inptrs;
785 785
786 if (ip != GFS2_I(sdp->sd_rindex)) 786 if (ip != GFS2_I(sdp->sd_rindex))
787 error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); 787 error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
788 else if (!sdp->sd_rgrps) 788 else if (!sdp->sd_rgrps)
789 error = gfs2_ri_update(ip); 789 error = gfs2_ri_update(ip);
790 790
791 if (error) 791 if (error)
792 return error; 792 return error;
793 793
794 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); 794 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
795 bstart = 0; 795 bstart = 0;
796 blen = 0; 796 blen = 0;
797 797
798 for (p = top; p < bottom; p++) { 798 for (p = top; p < bottom; p++) {
799 if (!*p) 799 if (!*p)
800 continue; 800 continue;
801 801
802 bn = be64_to_cpu(*p); 802 bn = be64_to_cpu(*p);
803 803
804 if (bstart + blen == bn) 804 if (bstart + blen == bn)
805 blen++; 805 blen++;
806 else { 806 else {
807 if (bstart) 807 if (bstart)
808 gfs2_rlist_add(sdp, &rlist, bstart); 808 gfs2_rlist_add(sdp, &rlist, bstart);
809 809
810 bstart = bn; 810 bstart = bn;
811 blen = 1; 811 blen = 1;
812 } 812 }
813 } 813 }
814 814
815 if (bstart) 815 if (bstart)
816 gfs2_rlist_add(sdp, &rlist, bstart); 816 gfs2_rlist_add(sdp, &rlist, bstart);
817 else 817 else
818 goto out; /* Nothing to do */ 818 goto out; /* Nothing to do */
819 819
820 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); 820 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
821 821
822 for (x = 0; x < rlist.rl_rgrps; x++) { 822 for (x = 0; x < rlist.rl_rgrps; x++) {
823 struct gfs2_rgrpd *rgd; 823 struct gfs2_rgrpd *rgd;
824 rgd = rlist.rl_ghs[x].gh_gl->gl_object; 824 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
825 rg_blocks += rgd->rd_length; 825 rg_blocks += rgd->rd_length;
826 } 826 }
827 827
828 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); 828 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
829 if (error) 829 if (error)
830 goto out_rlist; 830 goto out_rlist;
831 831
832 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + 832 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
833 RES_INDIRECT + RES_STATFS + RES_QUOTA, 833 RES_INDIRECT + RES_STATFS + RES_QUOTA,
834 revokes); 834 revokes);
835 if (error) 835 if (error)
836 goto out_rg_gunlock; 836 goto out_rg_gunlock;
837 837
838 down_write(&ip->i_rw_mutex); 838 down_write(&ip->i_rw_mutex);
839 839
840 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 840 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
841 gfs2_trans_add_bh(ip->i_gl, bh, 1); 841 gfs2_trans_add_bh(ip->i_gl, bh, 1);
842 842
843 bstart = 0; 843 bstart = 0;
844 blen = 0; 844 blen = 0;
845 btotal = 0; 845 btotal = 0;
846 846
847 for (p = top; p < bottom; p++) { 847 for (p = top; p < bottom; p++) {
848 if (!*p) 848 if (!*p)
849 continue; 849 continue;
850 850
851 bn = be64_to_cpu(*p); 851 bn = be64_to_cpu(*p);
852 852
853 if (bstart + blen == bn) 853 if (bstart + blen == bn)
854 blen++; 854 blen++;
855 else { 855 else {
856 if (bstart) { 856 if (bstart) {
857 if (metadata) 857 if (metadata)
858 __gfs2_free_meta(ip, bstart, blen); 858 __gfs2_free_meta(ip, bstart, blen);
859 else 859 else
860 __gfs2_free_data(ip, bstart, blen); 860 __gfs2_free_data(ip, bstart, blen);
861 861
862 btotal += blen; 862 btotal += blen;
863 } 863 }
864 864
865 bstart = bn; 865 bstart = bn;
866 blen = 1; 866 blen = 1;
867 } 867 }
868 868
869 *p = 0; 869 *p = 0;
870 gfs2_add_inode_blocks(&ip->i_inode, -1); 870 gfs2_add_inode_blocks(&ip->i_inode, -1);
871 } 871 }
872 if (bstart) { 872 if (bstart) {
873 if (metadata) 873 if (metadata)
874 __gfs2_free_meta(ip, bstart, blen); 874 __gfs2_free_meta(ip, bstart, blen);
875 else 875 else
876 __gfs2_free_data(ip, bstart, blen); 876 __gfs2_free_data(ip, bstart, blen);
877 877
878 btotal += blen; 878 btotal += blen;
879 } 879 }
880 880
881 gfs2_statfs_change(sdp, 0, +btotal, 0); 881 gfs2_statfs_change(sdp, 0, +btotal, 0);
882 gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid, 882 gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
883 ip->i_inode.i_gid); 883 ip->i_inode.i_gid);
884 884
885 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 885 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
886 886
887 gfs2_dinode_out(ip, dibh->b_data); 887 gfs2_dinode_out(ip, dibh->b_data);
888 888
889 up_write(&ip->i_rw_mutex); 889 up_write(&ip->i_rw_mutex);
890 890
891 gfs2_trans_end(sdp); 891 gfs2_trans_end(sdp);
892 892
893 out_rg_gunlock: 893 out_rg_gunlock:
894 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); 894 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
895 out_rlist: 895 out_rlist:
896 gfs2_rlist_free(&rlist); 896 gfs2_rlist_free(&rlist);
897 out: 897 out:
898 if (ip != GFS2_I(sdp->sd_rindex)) 898 if (ip != GFS2_I(sdp->sd_rindex))
899 gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); 899 gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
900 return error; 900 return error;
901 } 901 }
902 902
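Both passes over the pointer block in do_strip() rely on the same run-length trick: block number 0 never occurs on disk, so bstart == 0 can double as "no run open", and a pointer equal to bstart + blen extends the current run by one. A standalone sketch of just that coalescing follows (the sample values and the printf stand-in for gfs2_rlist_add()/__gfs2_free_meta()/__gfs2_free_data() are mine):

#include <stdint.h>
#include <stdio.h>

static void flush_run(uint64_t bstart, uint32_t blen)
{
        /* stands in for gfs2_rlist_add() or __gfs2_free_meta/data() */
        printf("extent: start %llu, len %u\n",
               (unsigned long long)bstart, blen);
}

static void coalesce(const uint64_t *p, const uint64_t *bottom)
{
        uint64_t bstart = 0;
        uint32_t blen = 0;

        for (; p < bottom; p++) {
                if (!*p)
                        continue;               /* skip holes */
                if (bstart + blen == *p) {
                        blen++;                 /* extends the open run */
                } else {
                        if (bstart)
                                flush_run(bstart, blen);
                        bstart = *p;            /* open a new run */
                        blen = 1;
                }
        }
        if (bstart)
                flush_run(bstart, blen);        /* close the last run */
}

int main(void)
{
        uint64_t ptrs[] = { 100, 101, 102, 0, 200, 201, 0, 300 };
        coalesce(ptrs, ptrs + sizeof(ptrs) / sizeof(ptrs[0]));
        return 0;   /* prints extents (100,3), (200,2), (300,1) */
}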
903 /** 903 /**
904 * gfs2_block_truncate_page - Deal with zeroing out data for truncate 904 * gfs2_block_truncate_page - Deal with zeroing out data for truncate
905 * 905 *
906 * This is partly borrowed from ext3. 906 * This is partly borrowed from ext3.
907 */ 907 */
908 static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from) 908 static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
909 { 909 {
910 struct inode *inode = mapping->host; 910 struct inode *inode = mapping->host;
911 struct gfs2_inode *ip = GFS2_I(inode); 911 struct gfs2_inode *ip = GFS2_I(inode);
912 unsigned long index = from >> PAGE_CACHE_SHIFT; 912 unsigned long index = from >> PAGE_CACHE_SHIFT;
913 unsigned offset = from & (PAGE_CACHE_SIZE-1); 913 unsigned offset = from & (PAGE_CACHE_SIZE-1);
914 unsigned blocksize, iblock, length, pos; 914 unsigned blocksize, iblock, length, pos;
915 struct buffer_head *bh; 915 struct buffer_head *bh;
916 struct page *page; 916 struct page *page;
917 int err; 917 int err;
918 918
919 page = grab_cache_page(mapping, index); 919 page = grab_cache_page(mapping, index);
920 if (!page) 920 if (!page)
921 return 0; 921 return 0;
922 922
923 blocksize = inode->i_sb->s_blocksize; 923 blocksize = inode->i_sb->s_blocksize;
924 length = blocksize - (offset & (blocksize - 1)); 924 length = blocksize - (offset & (blocksize - 1));
925 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 925 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
926 926
927 if (!page_has_buffers(page)) 927 if (!page_has_buffers(page))
928 create_empty_buffers(page, blocksize, 0); 928 create_empty_buffers(page, blocksize, 0);
929 929
930 /* Find the buffer that contains "offset" */ 930 /* Find the buffer that contains "offset" */
931 bh = page_buffers(page); 931 bh = page_buffers(page);
932 pos = blocksize; 932 pos = blocksize;
933 while (offset >= pos) { 933 while (offset >= pos) {
934 bh = bh->b_this_page; 934 bh = bh->b_this_page;
935 iblock++; 935 iblock++;
936 pos += blocksize; 936 pos += blocksize;
937 } 937 }
938 938
939 err = 0; 939 err = 0;
940 940
941 if (!buffer_mapped(bh)) { 941 if (!buffer_mapped(bh)) {
942 gfs2_block_map(inode, iblock, bh, 0); 942 gfs2_block_map(inode, iblock, bh, 0);
943 /* unmapped? It's a hole - nothing to do */ 943 /* unmapped? It's a hole - nothing to do */
944 if (!buffer_mapped(bh)) 944 if (!buffer_mapped(bh))
945 goto unlock; 945 goto unlock;
946 } 946 }
947 947
948 /* Ok, it's mapped. Make sure it's up-to-date */ 948 /* Ok, it's mapped. Make sure it's up-to-date */
949 if (PageUptodate(page)) 949 if (PageUptodate(page))
950 set_buffer_uptodate(bh); 950 set_buffer_uptodate(bh);
951 951
952 if (!buffer_uptodate(bh)) { 952 if (!buffer_uptodate(bh)) {
953 err = -EIO; 953 err = -EIO;
954 ll_rw_block(READ, 1, &bh); 954 ll_rw_block(READ, 1, &bh);
955 wait_on_buffer(bh); 955 wait_on_buffer(bh);
956 /* Uhhuh. Read error. Complain and punt. */ 956 /* Uhhuh. Read error. Complain and punt. */
957 if (!buffer_uptodate(bh)) 957 if (!buffer_uptodate(bh))
958 goto unlock; 958 goto unlock;
959 err = 0; 959 err = 0;
960 } 960 }
961 961
962 if (!gfs2_is_writeback(ip)) 962 if (!gfs2_is_writeback(ip))
963 gfs2_trans_add_bh(ip->i_gl, bh, 0); 963 gfs2_trans_add_bh(ip->i_gl, bh, 0);
964 964
965 zero_user(page, offset, length); 965 zero_user(page, offset, length);
966 mark_buffer_dirty(bh); 966 mark_buffer_dirty(bh);
967 unlock: 967 unlock:
968 unlock_page(page); 968 unlock_page(page);
969 page_cache_release(page); 969 page_cache_release(page);
970 return err; 970 return err;
971 } 971 }
972 972
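The arithmetic at the top of gfs2_block_truncate_page() is the standard page-cache decomposition: "from" selects a page and a byte offset within it, and "length" runs from that offset to the end of the containing filesystem block. A tiny standalone check of the numbers (the page and block sizes here are arbitrary examples, not read from any superblock):

#include <stdio.h>

int main(void)
{
        unsigned long page_size = 4096, blocksize = 1024;
        unsigned long long from = 10000;        /* new EOF, mid-block */

        unsigned long index  = from / page_size;        /* page 2 */
        unsigned long offset = from & (page_size - 1);
        unsigned long length = blocksize - (offset & (blocksize - 1));

        printf("page %lu: zero %lu bytes at offset %lu\n",
               index, length, offset);          /* zero 240 at 1808 */
        return 0;
}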
973 static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) 973 static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
974 { 974 {
975 struct gfs2_inode *ip = GFS2_I(inode); 975 struct gfs2_inode *ip = GFS2_I(inode);
976 struct gfs2_sbd *sdp = GFS2_SB(inode); 976 struct gfs2_sbd *sdp = GFS2_SB(inode);
977 struct address_space *mapping = inode->i_mapping; 977 struct address_space *mapping = inode->i_mapping;
978 struct buffer_head *dibh; 978 struct buffer_head *dibh;
979 int journaled = gfs2_is_jdata(ip); 979 int journaled = gfs2_is_jdata(ip);
980 int error; 980 int error;
981 981
982 error = gfs2_trans_begin(sdp, 982 error = gfs2_trans_begin(sdp,
983 RES_DINODE + (journaled ? RES_JDATA : 0), 0); 983 RES_DINODE + (journaled ? RES_JDATA : 0), 0);
984 if (error) 984 if (error)
985 return error; 985 return error;
986 986
987 error = gfs2_meta_inode_buffer(ip, &dibh); 987 error = gfs2_meta_inode_buffer(ip, &dibh);
988 if (error) 988 if (error)
989 goto out; 989 goto out;
990 990
991 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 991 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
992 992
993 if (gfs2_is_stuffed(ip)) { 993 if (gfs2_is_stuffed(ip)) {
994 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize); 994 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
995 } else { 995 } else {
996 if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) { 996 if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) {
997 error = gfs2_block_truncate_page(mapping, newsize); 997 error = gfs2_block_truncate_page(mapping, newsize);
998 if (error) 998 if (error)
999 goto out_brelse; 999 goto out_brelse;
1000 } 1000 }
1001 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; 1001 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1002 } 1002 }
1003 1003
1004 i_size_write(inode, newsize); 1004 i_size_write(inode, newsize);
1005 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1005 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1006 gfs2_dinode_out(ip, dibh->b_data); 1006 gfs2_dinode_out(ip, dibh->b_data);
1007 1007
1008 truncate_pagecache(inode, oldsize, newsize); 1008 truncate_pagecache(inode, oldsize, newsize);
1009 out_brelse: 1009 out_brelse:
1010 brelse(dibh); 1010 brelse(dibh);
1011 out: 1011 out:
1012 gfs2_trans_end(sdp); 1012 gfs2_trans_end(sdp);
1013 return error; 1013 return error;
1014 } 1014 }
1015 1015
1016 static int trunc_dealloc(struct gfs2_inode *ip, u64 size) 1016 static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
1017 { 1017 {
1018 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1018 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1019 unsigned int height = ip->i_height; 1019 unsigned int height = ip->i_height;
1020 u64 lblock; 1020 u64 lblock;
1021 struct metapath mp; 1021 struct metapath mp;
1022 int error; 1022 int error;
1023 1023
1024 if (!size) 1024 if (!size)
1025 lblock = 0; 1025 lblock = 0;
1026 else 1026 else
1027 lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift; 1027 lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift;
1028 1028
1029 find_metapath(sdp, lblock, &mp, ip->i_height); 1029 find_metapath(sdp, lblock, &mp, ip->i_height);
1030 if (!gfs2_alloc_get(ip)) 1030 if (!gfs2_alloc_get(ip))
1031 return -ENOMEM; 1031 return -ENOMEM;
1032 1032
1033 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); 1033 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1034 if (error) 1034 if (error)
1035 goto out; 1035 goto out;
1036 1036
1037 while (height--) { 1037 while (height--) {
1038 struct strip_mine sm; 1038 struct strip_mine sm;
1039 sm.sm_first = !!size; 1039 sm.sm_first = !!size;
1040 sm.sm_height = height; 1040 sm.sm_height = height;
1041 1041
1042 error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm); 1042 error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm);
1043 if (error) 1043 if (error)
1044 break; 1044 break;
1045 } 1045 }
1046 1046
1047 gfs2_quota_unhold(ip); 1047 gfs2_quota_unhold(ip);
1048 1048
1049 out: 1049 out:
1050 gfs2_alloc_put(ip); 1050 gfs2_alloc_put(ip);
1051 return error; 1051 return error;
1052 } 1052 }
1053 1053
1054 static int trunc_end(struct gfs2_inode *ip) 1054 static int trunc_end(struct gfs2_inode *ip)
1055 { 1055 {
1056 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1056 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1057 struct buffer_head *dibh; 1057 struct buffer_head *dibh;
1058 int error; 1058 int error;
1059 1059
1060 error = gfs2_trans_begin(sdp, RES_DINODE, 0); 1060 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1061 if (error) 1061 if (error)
1062 return error; 1062 return error;
1063 1063
1064 down_write(&ip->i_rw_mutex); 1064 down_write(&ip->i_rw_mutex);
1065 1065
1066 error = gfs2_meta_inode_buffer(ip, &dibh); 1066 error = gfs2_meta_inode_buffer(ip, &dibh);
1067 if (error) 1067 if (error)
1068 goto out; 1068 goto out;
1069 1069
1070 if (!i_size_read(&ip->i_inode)) { 1070 if (!i_size_read(&ip->i_inode)) {
1071 ip->i_height = 0; 1071 ip->i_height = 0;
1072 ip->i_goal = ip->i_no_addr; 1072 ip->i_goal = ip->i_no_addr;
1073 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 1073 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1074 } 1074 }
1075 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1075 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1076 ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; 1076 ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
1077 1077
1078 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1078 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1079 gfs2_dinode_out(ip, dibh->b_data); 1079 gfs2_dinode_out(ip, dibh->b_data);
1080 brelse(dibh); 1080 brelse(dibh);
1081 1081
1082 out: 1082 out:
1083 up_write(&ip->i_rw_mutex); 1083 up_write(&ip->i_rw_mutex);
1084 gfs2_trans_end(sdp); 1084 gfs2_trans_end(sdp);
1085 return error; 1085 return error;
1086 } 1086 }
1087 1087
1088 /** 1088 /**
1089 * do_shrink - make a file smaller 1089 * do_shrink - make a file smaller
1090 * @inode: the inode 1090 * @inode: the inode
1091 * @oldsize: the current inode size 1091 * @oldsize: the current inode size
1092 * @newsize: the size to make the file 1092 * @newsize: the size to make the file
1093 * 1093 *
1094 * Called with an exclusive lock on @inode. @newsize must 1094 * Called with an exclusive lock on @inode. @newsize must
1095 * be equal to or smaller than the current inode size. 1095 * be equal to or smaller than the current inode size.
1096 * 1096 *
1097 * Returns: errno 1097 * Returns: errno
1098 */ 1098 */
1099 1099
1100 static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize) 1100 static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
1101 { 1101 {
1102 struct gfs2_inode *ip = GFS2_I(inode); 1102 struct gfs2_inode *ip = GFS2_I(inode);
1103 int error; 1103 int error;
1104 1104
1105 error = trunc_start(inode, oldsize, newsize); 1105 error = trunc_start(inode, oldsize, newsize);
1106 if (error < 0) 1106 if (error < 0)
1107 return error; 1107 return error;
1108 if (gfs2_is_stuffed(ip)) 1108 if (gfs2_is_stuffed(ip))
1109 return 0; 1109 return 0;
1110 1110
1111 error = trunc_dealloc(ip, newsize); 1111 error = trunc_dealloc(ip, newsize);
1112 if (error == 0) 1112 if (error == 0)
1113 error = trunc_end(ip); 1113 error = trunc_end(ip);
1114 1114
1115 return error; 1115 return error;
1116 } 1116 }
1117 1117
1118 void gfs2_trim_blocks(struct inode *inode) 1118 void gfs2_trim_blocks(struct inode *inode)
1119 { 1119 {
1120 u64 size = inode->i_size; 1120 u64 size = inode->i_size;
1121 int ret; 1121 int ret;
1122 1122
1123 ret = do_shrink(inode, size, size); 1123 ret = do_shrink(inode, size, size);
1124 WARN_ON(ret != 0); 1124 WARN_ON(ret != 0);
1125 } 1125 }
1126 1126
1127 /** 1127 /**
1128 * do_grow - Touch and update inode size 1128 * do_grow - Touch and update inode size
1129 * @inode: The inode 1129 * @inode: The inode
1130 * @size: The new size 1130 * @size: The new size
1131 * 1131 *
1132 * This function updates the timestamps on the inode and 1132 * This function updates the timestamps on the inode and
1133 * may also increase the size of the inode. This function 1133 * may also increase the size of the inode. This function
1134 * must not be called with @size any smaller than the current 1134 * must not be called with @size any smaller than the current
1135 * inode size. 1135 * inode size.
1136 * 1136 *
1137 * Although it is not strictly required to unstuff files here, 1137 * Although it is not strictly required to unstuff files here,
1138 * earlier versions of GFS2 have a bug in the stuffed file reading 1138 * earlier versions of GFS2 have a bug in the stuffed file reading
1139 * code which will result in a buffer overrun if the size is larger 1139 * code which will result in a buffer overrun if the size is larger
1140 * than the max stuffed file size. In order to prevent this from 1140 * than the max stuffed file size. In order to prevent this from
1141 * occurring, such files are unstuffed, but in other cases we can 1141 * occurring, such files are unstuffed, but in other cases we can
1142 * just update the inode size directly. 1142 * just update the inode size directly.
1143 * 1143 *
1144 * Returns: 0 on success, or -ve on error 1144 * Returns: 0 on success, or -ve on error
1145 */ 1145 */
1146 1146
1147 static int do_grow(struct inode *inode, u64 size) 1147 static int do_grow(struct inode *inode, u64 size)
1148 { 1148 {
1149 struct gfs2_inode *ip = GFS2_I(inode); 1149 struct gfs2_inode *ip = GFS2_I(inode);
1150 struct gfs2_sbd *sdp = GFS2_SB(inode); 1150 struct gfs2_sbd *sdp = GFS2_SB(inode);
1151 struct buffer_head *dibh; 1151 struct buffer_head *dibh;
1152 struct gfs2_alloc *al = NULL; 1152 struct gfs2_alloc *al = NULL;
1153 int error; 1153 int error;
1154 1154
1155 if (gfs2_is_stuffed(ip) && 1155 if (gfs2_is_stuffed(ip) &&
1156 (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) { 1156 (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
1157 al = gfs2_alloc_get(ip); 1157 al = gfs2_alloc_get(ip);
1158 if (al == NULL) 1158 if (al == NULL)
1159 return -ENOMEM; 1159 return -ENOMEM;
1160 1160
1161 error = gfs2_quota_lock_check(ip); 1161 error = gfs2_quota_lock_check(ip);
1162 if (error) 1162 if (error)
1163 goto do_grow_alloc_put; 1163 goto do_grow_alloc_put;
1164 1164
1165 al->al_requested = 1; 1165 al->al_requested = 1;
1166 error = gfs2_inplace_reserve(ip); 1166 error = gfs2_inplace_reserve(ip);
1167 if (error) 1167 if (error)
1168 goto do_grow_qunlock; 1168 goto do_grow_qunlock;
1169 } 1169 }
1170 1170
1171 error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0); 1171 error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0);
1172 if (error) 1172 if (error)
1173 goto do_grow_release; 1173 goto do_grow_release;
1174 1174
1175 if (al) { 1175 if (al) {
1176 error = gfs2_unstuff_dinode(ip, NULL); 1176 error = gfs2_unstuff_dinode(ip, NULL);
1177 if (error) 1177 if (error)
1178 goto do_end_trans; 1178 goto do_end_trans;
1179 } 1179 }
1180 1180
1181 error = gfs2_meta_inode_buffer(ip, &dibh); 1181 error = gfs2_meta_inode_buffer(ip, &dibh);
1182 if (error) 1182 if (error)
1183 goto do_end_trans; 1183 goto do_end_trans;
1184 1184
1185 i_size_write(inode, size); 1185 i_size_write(inode, size);
1186 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1186 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1187 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1187 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1188 gfs2_dinode_out(ip, dibh->b_data); 1188 gfs2_dinode_out(ip, dibh->b_data);
1189 brelse(dibh); 1189 brelse(dibh);
1190 1190
1191 do_end_trans: 1191 do_end_trans:
1192 gfs2_trans_end(sdp); 1192 gfs2_trans_end(sdp);
1193 do_grow_release: 1193 do_grow_release:
1194 if (al) { 1194 if (al) {
1195 gfs2_inplace_release(ip); 1195 gfs2_inplace_release(ip);
1196 do_grow_qunlock: 1196 do_grow_qunlock:
1197 gfs2_quota_unlock(ip); 1197 gfs2_quota_unlock(ip);
1198 do_grow_alloc_put: 1198 do_grow_alloc_put:
1199 gfs2_alloc_put(ip); 1199 gfs2_alloc_put(ip);
1200 } 1200 }
1201 return error; 1201 return error;
1202 } 1202 }
1203 1203
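The unstuff threshold the comment above refers to is simply the filesystem block size minus the on-disk dinode header, since a stuffed file's data shares the dinode's own block. A toy illustration (the 232-byte header and 4 KiB block size below are stand-ins, not values read from disk):

#include <stdio.h>

int main(void)
{
        unsigned int bsize = 4096;              /* example block size */
        unsigned int dinode_hdr = 232;          /* illustrative only  */
        unsigned long long max_stuffed = bsize - dinode_hdr;

        for (unsigned long long size = 3800; size <= 3900; size += 100)
                printf("grow to %llu: %s\n", size,
                       size > max_stuffed ? "unstuff first" : "stays stuffed");
        return 0;
}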
1204 /** 1204 /**
1205 * gfs2_setattr_size - make a file a given size 1205 * gfs2_setattr_size - make a file a given size
1206 * @inode: the inode 1206 * @inode: the inode
1207 * @newsize: the size to make the file 1207 * @newsize: the size to make the file
1208 * 1208 *
1209 * The file size can grow, shrink, or stay the same size. This 1209 * The file size can grow, shrink, or stay the same size. This
1210 * is called holding i_mutex and an exclusive glock on the inode 1210 * is called holding i_mutex and an exclusive glock on the inode
1211 * in question. 1211 * in question.
1212 * 1212 *
1213 * Returns: errno 1213 * Returns: errno
1214 */ 1214 */
1215 1215
1216 int gfs2_setattr_size(struct inode *inode, u64 newsize) 1216 int gfs2_setattr_size(struct inode *inode, u64 newsize)
1217 { 1217 {
1218 int ret; 1218 int ret;
1219 u64 oldsize; 1219 u64 oldsize;
1220 1220
1221 BUG_ON(!S_ISREG(inode->i_mode)); 1221 BUG_ON(!S_ISREG(inode->i_mode));
1222 1222
1223 ret = inode_newsize_ok(inode, newsize); 1223 ret = inode_newsize_ok(inode, newsize);
1224 if (ret) 1224 if (ret)
1225 return ret; 1225 return ret;
1226 1226
1227 inode_dio_wait(inode);
1228
1227 oldsize = inode->i_size; 1229 oldsize = inode->i_size;
1228 if (newsize >= oldsize) 1230 if (newsize >= oldsize)
1229 return do_grow(inode, newsize); 1231 return do_grow(inode, newsize);
1230 1232
1231 return do_shrink(inode, oldsize, newsize); 1233 return do_shrink(inode, oldsize, newsize);
1232 } 1234 }
1233 1235
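This hunk is the GFS2 half of the change: inode_dio_wait() now runs inside gfs2_setattr_size(), after inode_newsize_ok() and under the i_mutex and exclusive glock the caller already holds, so no new direct I/O can be issued while the old requests drain. A rough userspace model of the idea (pthreads; the single mutex standing in for both the filesystem lock and i_dio_count's waitqueue is my simplification, not how the kernel structures it):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; /* the "glock" */
static pthread_cond_t  drained = PTHREAD_COND_INITIALIZER;
static int dio_count;                   /* models inode->i_dio_count */

static void dio_begin(void)             /* called as each DIO starts */
{
        pthread_mutex_lock(&lock);      /* blocked while setattr runs */
        dio_count++;
        pthread_mutex_unlock(&lock);
}

static void dio_end(void)               /* called as each DIO completes */
{
        pthread_mutex_lock(&lock);
        if (--dio_count == 0)
                pthread_cond_broadcast(&drained);
        pthread_mutex_unlock(&lock);
}

static void setattr_size(void)          /* models gfs2_setattr_size() */
{
        pthread_mutex_lock(&lock);      /* new DIO can no longer start */
        while (dio_count > 0)           /* the inode_dio_wait() step   */
                pthread_cond_wait(&drained, &lock);
        /* ... now safe to grow or shrink the file ... */
        pthread_mutex_unlock(&lock);
}

static void *dio_thread(void *arg)
{
        (void)arg;
        dio_begin();
        /* ... direct I/O in flight ... */
        dio_end();
        return NULL;
}

int main(void)
{
        pthread_t t;
        pthread_create(&t, NULL, dio_thread, NULL);
        setattr_size();                 /* waits out any in-flight DIO */
        pthread_join(t, NULL);
        return 0;
}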
1234 int gfs2_truncatei_resume(struct gfs2_inode *ip) 1236 int gfs2_truncatei_resume(struct gfs2_inode *ip)
1235 { 1237 {
1236 int error; 1238 int error;
1237 error = trunc_dealloc(ip, i_size_read(&ip->i_inode)); 1239 error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
1238 if (!error) 1240 if (!error)
1239 error = trunc_end(ip); 1241 error = trunc_end(ip);
1240 return error; 1242 return error;
1241 } 1243 }
1242 1244
1243 int gfs2_file_dealloc(struct gfs2_inode *ip) 1245 int gfs2_file_dealloc(struct gfs2_inode *ip)
1244 { 1246 {
1245 return trunc_dealloc(ip, 0); 1247 return trunc_dealloc(ip, 0);
1246 } 1248 }
1247 1249
1248 /** 1250 /**
1249 * gfs2_write_alloc_required - figure out if a write will require an allocation 1251 * gfs2_write_alloc_required - figure out if a write will require an allocation
1250 * @ip: the file being written to 1252 * @ip: the file being written to
1251 * @offset: the offset to write to 1253 * @offset: the offset to write to
1252 * @len: the number of bytes being written 1254 * @len: the number of bytes being written
1253 * 1255 *
1254 * Returns: 1 if an alloc is required, 0 otherwise 1256 * Returns: 1 if an alloc is required, 0 otherwise
1255 */ 1257 */
1256 1258
1257 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, 1259 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1258 unsigned int len) 1260 unsigned int len)
1259 { 1261 {
1260 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1262 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1261 struct buffer_head bh; 1263 struct buffer_head bh;
1262 unsigned int shift; 1264 unsigned int shift;
1263 u64 lblock, lblock_stop, size; 1265 u64 lblock, lblock_stop, size;
1264 u64 end_of_file; 1266 u64 end_of_file;
1265 1267
1266 if (!len) 1268 if (!len)
1267 return 0; 1269 return 0;
1268 1270
1269 if (gfs2_is_stuffed(ip)) { 1271 if (gfs2_is_stuffed(ip)) {
1270 if (offset + len > 1272 if (offset + len >
1271 sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) 1273 sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
1272 return 1; 1274 return 1;
1273 return 0; 1275 return 0;
1274 } 1276 }
1275 1277
1276 shift = sdp->sd_sb.sb_bsize_shift; 1278 shift = sdp->sd_sb.sb_bsize_shift;
1277 BUG_ON(gfs2_is_dir(ip)); 1279 BUG_ON(gfs2_is_dir(ip));
1278 end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift; 1280 end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
1279 lblock = offset >> shift; 1281 lblock = offset >> shift;
1280 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; 1282 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1281 if (lblock_stop > end_of_file) 1283 if (lblock_stop > end_of_file)
1282 return 1; 1284 return 1;
1283 1285
1284 size = (lblock_stop - lblock) << shift; 1286 size = (lblock_stop - lblock) << shift;
1285 do { 1287 do {
1286 bh.b_state = 0; 1288 bh.b_state = 0;
1287 bh.b_size = size; 1289 bh.b_size = size;
1288 gfs2_block_map(&ip->i_inode, lblock, &bh, 0); 1290 gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
1289 if (!buffer_mapped(&bh)) 1291 if (!buffer_mapped(&bh))
1290 return 1; 1292 return 1;
1291 size -= bh.b_size; 1293 size -= bh.b_size;
1292 lblock += (bh.b_size >> ip->i_inode.i_blkbits); 1294 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
1293 } while(size > 0); 1295 } while(size > 0);
1294 1296
1295 return 0; 1297 return 0;
1296 } 1298 }
1297 1299
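The shift arithmetic above converts a byte range into the half-open range of blocks it touches: the start rounds down, the end rounds up. Worked standalone with arbitrary values:

#include <stdio.h>

int main(void)
{
        unsigned int shift = 12;                /* 4 KiB blocks */
        unsigned long long bsize = 1ULL << shift;
        unsigned long long offset = 5000, len = 10000;

        unsigned long long lblock = offset >> shift;
        unsigned long long lblock_stop = (offset + len + bsize - 1) >> shift;

        /* bytes [5000, 15000) touch blocks 1, 2 and 3 */
        printf("write covers blocks [%llu, %llu)\n", lblock, lblock_stop);
        return 0;
}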
1298 1300
1 /* 1 /*
2 * linux/fs/hfs/inode.c 2 * linux/fs/hfs/inode.c
3 * 3 *
4 * Copyright (C) 1995-1997 Paul H. Hargrove 4 * Copyright (C) 1995-1997 Paul H. Hargrove
5 * (C) 2003 Ardis Technologies <roman@ardistech.com> 5 * (C) 2003 Ardis Technologies <roman@ardistech.com>
6 * This file may be distributed under the terms of the GNU General Public License. 6 * This file may be distributed under the terms of the GNU General Public License.
7 * 7 *
8 * This file contains inode-related functions which do not depend on 8 * This file contains inode-related functions which do not depend on
9 * which scheme is being used to represent forks. 9 * which scheme is being used to represent forks.
10 * 10 *
11 * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds 11 * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds
12 */ 12 */
13 13
14 #include <linux/pagemap.h> 14 #include <linux/pagemap.h>
15 #include <linux/mpage.h> 15 #include <linux/mpage.h>
16 #include <linux/sched.h> 16 #include <linux/sched.h>
17 17
18 #include "hfs_fs.h" 18 #include "hfs_fs.h"
19 #include "btree.h" 19 #include "btree.h"
20 20
21 static const struct file_operations hfs_file_operations; 21 static const struct file_operations hfs_file_operations;
22 static const struct inode_operations hfs_file_inode_operations; 22 static const struct inode_operations hfs_file_inode_operations;
23 23
24 /*================ Variable-like macros ================*/ 24 /*================ Variable-like macros ================*/
25 25
26 #define HFS_VALID_MODE_BITS (S_IFREG | S_IFDIR | S_IRWXUGO) 26 #define HFS_VALID_MODE_BITS (S_IFREG | S_IFDIR | S_IRWXUGO)
27 27
28 static int hfs_writepage(struct page *page, struct writeback_control *wbc) 28 static int hfs_writepage(struct page *page, struct writeback_control *wbc)
29 { 29 {
30 return block_write_full_page(page, hfs_get_block, wbc); 30 return block_write_full_page(page, hfs_get_block, wbc);
31 } 31 }
32 32
33 static int hfs_readpage(struct file *file, struct page *page) 33 static int hfs_readpage(struct file *file, struct page *page)
34 { 34 {
35 return block_read_full_page(page, hfs_get_block); 35 return block_read_full_page(page, hfs_get_block);
36 } 36 }
37 37
38 static int hfs_write_begin(struct file *file, struct address_space *mapping, 38 static int hfs_write_begin(struct file *file, struct address_space *mapping,
39 loff_t pos, unsigned len, unsigned flags, 39 loff_t pos, unsigned len, unsigned flags,
40 struct page **pagep, void **fsdata) 40 struct page **pagep, void **fsdata)
41 { 41 {
42 int ret; 42 int ret;
43 43
44 *pagep = NULL; 44 *pagep = NULL;
45 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 45 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
46 hfs_get_block, 46 hfs_get_block,
47 &HFS_I(mapping->host)->phys_size); 47 &HFS_I(mapping->host)->phys_size);
48 if (unlikely(ret)) { 48 if (unlikely(ret)) {
49 loff_t isize = mapping->host->i_size; 49 loff_t isize = mapping->host->i_size;
50 if (pos + len > isize) 50 if (pos + len > isize)
51 vmtruncate(mapping->host, isize); 51 vmtruncate(mapping->host, isize);
52 } 52 }
53 53
54 return ret; 54 return ret;
55 } 55 }
56 56
57 static sector_t hfs_bmap(struct address_space *mapping, sector_t block) 57 static sector_t hfs_bmap(struct address_space *mapping, sector_t block)
58 { 58 {
59 return generic_block_bmap(mapping, block, hfs_get_block); 59 return generic_block_bmap(mapping, block, hfs_get_block);
60 } 60 }
61 61
62 static int hfs_releasepage(struct page *page, gfp_t mask) 62 static int hfs_releasepage(struct page *page, gfp_t mask)
63 { 63 {
64 struct inode *inode = page->mapping->host; 64 struct inode *inode = page->mapping->host;
65 struct super_block *sb = inode->i_sb; 65 struct super_block *sb = inode->i_sb;
66 struct hfs_btree *tree; 66 struct hfs_btree *tree;
67 struct hfs_bnode *node; 67 struct hfs_bnode *node;
68 u32 nidx; 68 u32 nidx;
69 int i, res = 1; 69 int i, res = 1;
70 70
71 switch (inode->i_ino) { 71 switch (inode->i_ino) {
72 case HFS_EXT_CNID: 72 case HFS_EXT_CNID:
73 tree = HFS_SB(sb)->ext_tree; 73 tree = HFS_SB(sb)->ext_tree;
74 break; 74 break;
75 case HFS_CAT_CNID: 75 case HFS_CAT_CNID:
76 tree = HFS_SB(sb)->cat_tree; 76 tree = HFS_SB(sb)->cat_tree;
77 break; 77 break;
78 default: 78 default:
79 BUG(); 79 BUG();
80 return 0; 80 return 0;
81 } 81 }
82 82
83 if (!tree) 83 if (!tree)
84 return 0; 84 return 0;
85 85
86 if (tree->node_size >= PAGE_CACHE_SIZE) { 86 if (tree->node_size >= PAGE_CACHE_SIZE) {
87 nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT); 87 nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT);
88 spin_lock(&tree->hash_lock); 88 spin_lock(&tree->hash_lock);
89 node = hfs_bnode_findhash(tree, nidx); 89 node = hfs_bnode_findhash(tree, nidx);
90 if (!node) 90 if (!node)
91 ; 91 ;
92 else if (atomic_read(&node->refcnt)) 92 else if (atomic_read(&node->refcnt))
93 res = 0; 93 res = 0;
94 if (res && node) { 94 if (res && node) {
95 hfs_bnode_unhash(node); 95 hfs_bnode_unhash(node);
96 hfs_bnode_free(node); 96 hfs_bnode_free(node);
97 } 97 }
98 spin_unlock(&tree->hash_lock); 98 spin_unlock(&tree->hash_lock);
99 } else { 99 } else {
100 nidx = page->index << (PAGE_CACHE_SHIFT - tree->node_size_shift); 100 nidx = page->index << (PAGE_CACHE_SHIFT - tree->node_size_shift);
101 i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift); 101 i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift);
102 spin_lock(&tree->hash_lock); 102 spin_lock(&tree->hash_lock);
103 do { 103 do {
104 node = hfs_bnode_findhash(tree, nidx++); 104 node = hfs_bnode_findhash(tree, nidx++);
105 if (!node) 105 if (!node)
106 continue; 106 continue;
107 if (atomic_read(&node->refcnt)) { 107 if (atomic_read(&node->refcnt)) {
108 res = 0; 108 res = 0;
109 break; 109 break;
110 } 110 }
111 hfs_bnode_unhash(node); 111 hfs_bnode_unhash(node);
112 hfs_bnode_free(node); 112 hfs_bnode_free(node);
113 } while (--i && nidx < tree->node_count); 113 } while (--i && nidx < tree->node_count);
114 spin_unlock(&tree->hash_lock); 114 spin_unlock(&tree->hash_lock);
115 } 115 }
116 return res ? try_to_free_buffers(page) : 0; 116 return res ? try_to_free_buffers(page) : 0;
117 } 117 }
118 118
119 static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb, 119 static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
120 const struct iovec *iov, loff_t offset, unsigned long nr_segs) 120 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
121 { 121 {
122 struct file *file = iocb->ki_filp; 122 struct file *file = iocb->ki_filp;
123 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; 123 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
124 ssize_t ret; 124 ssize_t ret;
125 125
126 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 126 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
127 offset, nr_segs, hfs_get_block, NULL); 127 offset, nr_segs, hfs_get_block, NULL);
128 128
129 /* 129 /*
130 * In case of error, an extending write may have instantiated a few 130 * In case of error, an extending write may have instantiated a few
131 * blocks outside i_size. Trim these off again. 131 * blocks outside i_size. Trim these off again.
132 */ 132 */
133 if (unlikely((rw & WRITE) && ret < 0)) { 133 if (unlikely((rw & WRITE) && ret < 0)) {
134 loff_t isize = i_size_read(inode); 134 loff_t isize = i_size_read(inode);
135 loff_t end = offset + iov_length(iov, nr_segs); 135 loff_t end = offset + iov_length(iov, nr_segs);
136 136
137 if (end > isize) 137 if (end > isize)
138 vmtruncate(inode, isize); 138 vmtruncate(inode, isize);
139 } 139 }
140 140
141 return ret; 141 return ret;
142 } 142 }
143 143
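The error path above only needs the write's end offset, which for a vectored request is the starting offset plus the total iovec length. A standalone sketch of that computation (buffers and sizes invented; iov_total() is my stand-in for the kernel's iov_length()):

#include <stdio.h>
#include <sys/uio.h>

static unsigned long long iov_total(const struct iovec *iov, int nr_segs)
{
        unsigned long long len = 0;
        for (int i = 0; i < nr_segs; i++)
                len += iov[i].iov_len;
        return len;
}

int main(void)
{
        char a[100], b[200];
        struct iovec iov[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
        unsigned long long offset = 1000, isize = 1100;
        unsigned long long end = offset + iov_total(iov, 2);

        if (end > isize)        /* blocks past EOF may have been allocated */
                printf("failed write reached %llu, trim back to %llu\n",
                       end, isize);
        return 0;
}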
144 static int hfs_writepages(struct address_space *mapping, 144 static int hfs_writepages(struct address_space *mapping,
145 struct writeback_control *wbc) 145 struct writeback_control *wbc)
146 { 146 {
147 return mpage_writepages(mapping, wbc, hfs_get_block); 147 return mpage_writepages(mapping, wbc, hfs_get_block);
148 } 148 }
149 149
150 const struct address_space_operations hfs_btree_aops = { 150 const struct address_space_operations hfs_btree_aops = {
151 .readpage = hfs_readpage, 151 .readpage = hfs_readpage,
152 .writepage = hfs_writepage, 152 .writepage = hfs_writepage,
153 .write_begin = hfs_write_begin, 153 .write_begin = hfs_write_begin,
154 .write_end = generic_write_end, 154 .write_end = generic_write_end,
155 .bmap = hfs_bmap, 155 .bmap = hfs_bmap,
156 .releasepage = hfs_releasepage, 156 .releasepage = hfs_releasepage,
157 }; 157 };
158 158
159 const struct address_space_operations hfs_aops = { 159 const struct address_space_operations hfs_aops = {
160 .readpage = hfs_readpage, 160 .readpage = hfs_readpage,
161 .writepage = hfs_writepage, 161 .writepage = hfs_writepage,
162 .write_begin = hfs_write_begin, 162 .write_begin = hfs_write_begin,
163 .write_end = generic_write_end, 163 .write_end = generic_write_end,
164 .bmap = hfs_bmap, 164 .bmap = hfs_bmap,
165 .direct_IO = hfs_direct_IO, 165 .direct_IO = hfs_direct_IO,
166 .writepages = hfs_writepages, 166 .writepages = hfs_writepages,
167 }; 167 };
168 168
169 /* 169 /*
170 * hfs_new_inode 170 * hfs_new_inode
171 */ 171 */
172 struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode) 172 struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode)
173 { 173 {
174 struct super_block *sb = dir->i_sb; 174 struct super_block *sb = dir->i_sb;
175 struct inode *inode = new_inode(sb); 175 struct inode *inode = new_inode(sb);
176 if (!inode) 176 if (!inode)
177 return NULL; 177 return NULL;
178 178
179 mutex_init(&HFS_I(inode)->extents_lock); 179 mutex_init(&HFS_I(inode)->extents_lock);
180 INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); 180 INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list);
181 hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name); 181 hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name);
182 inode->i_ino = HFS_SB(sb)->next_id++; 182 inode->i_ino = HFS_SB(sb)->next_id++;
183 inode->i_mode = mode; 183 inode->i_mode = mode;
184 inode->i_uid = current_fsuid(); 184 inode->i_uid = current_fsuid();
185 inode->i_gid = current_fsgid(); 185 inode->i_gid = current_fsgid();
186 inode->i_nlink = 1; 186 inode->i_nlink = 1;
187 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 187 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
188 HFS_I(inode)->flags = 0; 188 HFS_I(inode)->flags = 0;
189 HFS_I(inode)->rsrc_inode = NULL; 189 HFS_I(inode)->rsrc_inode = NULL;
190 HFS_I(inode)->fs_blocks = 0; 190 HFS_I(inode)->fs_blocks = 0;
191 if (S_ISDIR(mode)) { 191 if (S_ISDIR(mode)) {
192 inode->i_size = 2; 192 inode->i_size = 2;
193 HFS_SB(sb)->folder_count++; 193 HFS_SB(sb)->folder_count++;
194 if (dir->i_ino == HFS_ROOT_CNID) 194 if (dir->i_ino == HFS_ROOT_CNID)
195 HFS_SB(sb)->root_dirs++; 195 HFS_SB(sb)->root_dirs++;
196 inode->i_op = &hfs_dir_inode_operations; 196 inode->i_op = &hfs_dir_inode_operations;
197 inode->i_fop = &hfs_dir_operations; 197 inode->i_fop = &hfs_dir_operations;
198 inode->i_mode |= S_IRWXUGO; 198 inode->i_mode |= S_IRWXUGO;
199 inode->i_mode &= ~HFS_SB(inode->i_sb)->s_dir_umask; 199 inode->i_mode &= ~HFS_SB(inode->i_sb)->s_dir_umask;
200 } else if (S_ISREG(mode)) { 200 } else if (S_ISREG(mode)) {
201 HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks; 201 HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks;
202 HFS_SB(sb)->file_count++; 202 HFS_SB(sb)->file_count++;
203 if (dir->i_ino == HFS_ROOT_CNID) 203 if (dir->i_ino == HFS_ROOT_CNID)
204 HFS_SB(sb)->root_files++; 204 HFS_SB(sb)->root_files++;
205 inode->i_op = &hfs_file_inode_operations; 205 inode->i_op = &hfs_file_inode_operations;
206 inode->i_fop = &hfs_file_operations; 206 inode->i_fop = &hfs_file_operations;
207 inode->i_mapping->a_ops = &hfs_aops; 207 inode->i_mapping->a_ops = &hfs_aops;
208 inode->i_mode |= S_IRUGO|S_IXUGO; 208 inode->i_mode |= S_IRUGO|S_IXUGO;
209 if (mode & S_IWUSR) 209 if (mode & S_IWUSR)
210 inode->i_mode |= S_IWUGO; 210 inode->i_mode |= S_IWUGO;
211 inode->i_mode &= ~HFS_SB(inode->i_sb)->s_file_umask; 211 inode->i_mode &= ~HFS_SB(inode->i_sb)->s_file_umask;
212 HFS_I(inode)->phys_size = 0; 212 HFS_I(inode)->phys_size = 0;
213 HFS_I(inode)->alloc_blocks = 0; 213 HFS_I(inode)->alloc_blocks = 0;
214 HFS_I(inode)->first_blocks = 0; 214 HFS_I(inode)->first_blocks = 0;
215 HFS_I(inode)->cached_start = 0; 215 HFS_I(inode)->cached_start = 0;
216 HFS_I(inode)->cached_blocks = 0; 216 HFS_I(inode)->cached_blocks = 0;
217 memset(HFS_I(inode)->first_extents, 0, sizeof(hfs_extent_rec)); 217 memset(HFS_I(inode)->first_extents, 0, sizeof(hfs_extent_rec));
218 memset(HFS_I(inode)->cached_extents, 0, sizeof(hfs_extent_rec)); 218 memset(HFS_I(inode)->cached_extents, 0, sizeof(hfs_extent_rec));
219 } 219 }
220 insert_inode_hash(inode); 220 insert_inode_hash(inode);
221 mark_inode_dirty(inode); 221 mark_inode_dirty(inode);
222 set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); 222 set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags);
223 sb->s_dirt = 1; 223 sb->s_dirt = 1;
224 224
225 return inode; 225 return inode;
226 } 226 }
227 227
228 void hfs_delete_inode(struct inode *inode) 228 void hfs_delete_inode(struct inode *inode)
229 { 229 {
230 struct super_block *sb = inode->i_sb; 230 struct super_block *sb = inode->i_sb;
231 231
232 dprint(DBG_INODE, "delete_inode: %lu\n", inode->i_ino); 232 dprint(DBG_INODE, "delete_inode: %lu\n", inode->i_ino);
233 if (S_ISDIR(inode->i_mode)) { 233 if (S_ISDIR(inode->i_mode)) {
234 HFS_SB(sb)->folder_count--; 234 HFS_SB(sb)->folder_count--;
235 if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) 235 if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID))
236 HFS_SB(sb)->root_dirs--; 236 HFS_SB(sb)->root_dirs--;
237 set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); 237 set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags);
238 sb->s_dirt = 1; 238 sb->s_dirt = 1;
239 return; 239 return;
240 } 240 }
241 HFS_SB(sb)->file_count--; 241 HFS_SB(sb)->file_count--;
242 if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) 242 if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID))
243 HFS_SB(sb)->root_files--; 243 HFS_SB(sb)->root_files--;
244 if (S_ISREG(inode->i_mode)) { 244 if (S_ISREG(inode->i_mode)) {
245 if (!inode->i_nlink) { 245 if (!inode->i_nlink) {
246 inode->i_size = 0; 246 inode->i_size = 0;
247 hfs_file_truncate(inode); 247 hfs_file_truncate(inode);
248 } 248 }
249 } 249 }
250 set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); 250 set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags);
251 sb->s_dirt = 1; 251 sb->s_dirt = 1;
252 } 252 }
253 253
254 void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, 254 void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext,
255 __be32 __log_size, __be32 phys_size, u32 clump_size) 255 __be32 __log_size, __be32 phys_size, u32 clump_size)
256 { 256 {
257 struct super_block *sb = inode->i_sb; 257 struct super_block *sb = inode->i_sb;
258 u32 log_size = be32_to_cpu(__log_size); 258 u32 log_size = be32_to_cpu(__log_size);
259 u16 count; 259 u16 count;
260 int i; 260 int i;
261 261
262 memcpy(HFS_I(inode)->first_extents, ext, sizeof(hfs_extent_rec)); 262 memcpy(HFS_I(inode)->first_extents, ext, sizeof(hfs_extent_rec));
263 for (count = 0, i = 0; i < 3; i++) 263 for (count = 0, i = 0; i < 3; i++)
264 count += be16_to_cpu(ext[i].count); 264 count += be16_to_cpu(ext[i].count);
265 HFS_I(inode)->first_blocks = count; 265 HFS_I(inode)->first_blocks = count;
266 266
267 inode->i_size = HFS_I(inode)->phys_size = log_size; 267 inode->i_size = HFS_I(inode)->phys_size = log_size;
268 HFS_I(inode)->fs_blocks = (log_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 268 HFS_I(inode)->fs_blocks = (log_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
269 inode_set_bytes(inode, HFS_I(inode)->fs_blocks << sb->s_blocksize_bits); 269 inode_set_bytes(inode, HFS_I(inode)->fs_blocks << sb->s_blocksize_bits);
270 HFS_I(inode)->alloc_blocks = be32_to_cpu(phys_size) / 270 HFS_I(inode)->alloc_blocks = be32_to_cpu(phys_size) /
271 HFS_SB(sb)->alloc_blksz; 271 HFS_SB(sb)->alloc_blksz;
272 HFS_I(inode)->clump_blocks = clump_size / HFS_SB(sb)->alloc_blksz; 272 HFS_I(inode)->clump_blocks = clump_size / HFS_SB(sb)->alloc_blksz;
273 if (!HFS_I(inode)->clump_blocks) 273 if (!HFS_I(inode)->clump_blocks)
274 HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks; 274 HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks;
275 } 275 }
276 276
277 struct hfs_iget_data { 277 struct hfs_iget_data {
278 struct hfs_cat_key *key; 278 struct hfs_cat_key *key;
279 hfs_cat_rec *rec; 279 hfs_cat_rec *rec;
280 }; 280 };
281 281
282 static int hfs_test_inode(struct inode *inode, void *data) 282 static int hfs_test_inode(struct inode *inode, void *data)
283 { 283 {
284 struct hfs_iget_data *idata = data; 284 struct hfs_iget_data *idata = data;
285 hfs_cat_rec *rec; 285 hfs_cat_rec *rec;
286 286
287 rec = idata->rec; 287 rec = idata->rec;
288 switch (rec->type) { 288 switch (rec->type) {
289 case HFS_CDR_DIR: 289 case HFS_CDR_DIR:
290 return inode->i_ino == be32_to_cpu(rec->dir.DirID); 290 return inode->i_ino == be32_to_cpu(rec->dir.DirID);
291 case HFS_CDR_FIL: 291 case HFS_CDR_FIL:
292 return inode->i_ino == be32_to_cpu(rec->file.FlNum); 292 return inode->i_ino == be32_to_cpu(rec->file.FlNum);
293 default: 293 default:
294 BUG(); 294 BUG();
295 return 1; 295 return 1;
296 } 296 }
297 } 297 }
298 298
299 /* 299 /*
300 * hfs_read_inode 300 * hfs_read_inode
301 */ 301 */
302 static int hfs_read_inode(struct inode *inode, void *data) 302 static int hfs_read_inode(struct inode *inode, void *data)
303 { 303 {
304 struct hfs_iget_data *idata = data; 304 struct hfs_iget_data *idata = data;
305 struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); 305 struct hfs_sb_info *hsb = HFS_SB(inode->i_sb);
306 hfs_cat_rec *rec; 306 hfs_cat_rec *rec;
307 307
308 HFS_I(inode)->flags = 0; 308 HFS_I(inode)->flags = 0;
309 HFS_I(inode)->rsrc_inode = NULL; 309 HFS_I(inode)->rsrc_inode = NULL;
310 mutex_init(&HFS_I(inode)->extents_lock); 310 mutex_init(&HFS_I(inode)->extents_lock);
311 INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); 311 INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list);
312 312
313 /* Initialize the inode */ 313 /* Initialize the inode */
314 inode->i_uid = hsb->s_uid; 314 inode->i_uid = hsb->s_uid;
315 inode->i_gid = hsb->s_gid; 315 inode->i_gid = hsb->s_gid;
316 inode->i_nlink = 1; 316 inode->i_nlink = 1;
317 317
318 if (idata->key) 318 if (idata->key)
319 HFS_I(inode)->cat_key = *idata->key; 319 HFS_I(inode)->cat_key = *idata->key;
320 else 320 else
321 HFS_I(inode)->flags |= HFS_FLG_RSRC; 321 HFS_I(inode)->flags |= HFS_FLG_RSRC;
322 HFS_I(inode)->tz_secondswest = sys_tz.tz_minuteswest * 60; 322 HFS_I(inode)->tz_secondswest = sys_tz.tz_minuteswest * 60;
323 323
324 rec = idata->rec; 324 rec = idata->rec;
325 switch (rec->type) { 325 switch (rec->type) {
326 case HFS_CDR_FIL: 326 case HFS_CDR_FIL:
327 if (!HFS_IS_RSRC(inode)) { 327 if (!HFS_IS_RSRC(inode)) {
328 hfs_inode_read_fork(inode, rec->file.ExtRec, rec->file.LgLen, 328 hfs_inode_read_fork(inode, rec->file.ExtRec, rec->file.LgLen,
329 rec->file.PyLen, be16_to_cpu(rec->file.ClpSize)); 329 rec->file.PyLen, be16_to_cpu(rec->file.ClpSize));
330 } else { 330 } else {
331 hfs_inode_read_fork(inode, rec->file.RExtRec, rec->file.RLgLen, 331 hfs_inode_read_fork(inode, rec->file.RExtRec, rec->file.RLgLen,
332 rec->file.RPyLen, be16_to_cpu(rec->file.ClpSize)); 332 rec->file.RPyLen, be16_to_cpu(rec->file.ClpSize));
333 } 333 }
334 334
335 inode->i_ino = be32_to_cpu(rec->file.FlNum); 335 inode->i_ino = be32_to_cpu(rec->file.FlNum);
336 inode->i_mode = S_IRUGO | S_IXUGO; 336 inode->i_mode = S_IRUGO | S_IXUGO;
337 if (!(rec->file.Flags & HFS_FIL_LOCK)) 337 if (!(rec->file.Flags & HFS_FIL_LOCK))
338 inode->i_mode |= S_IWUGO; 338 inode->i_mode |= S_IWUGO;
339 inode->i_mode &= ~hsb->s_file_umask; 339 inode->i_mode &= ~hsb->s_file_umask;
340 inode->i_mode |= S_IFREG; 340 inode->i_mode |= S_IFREG;
341 inode->i_ctime = inode->i_atime = inode->i_mtime = 341 inode->i_ctime = inode->i_atime = inode->i_mtime =
342 hfs_m_to_utime(rec->file.MdDat); 342 hfs_m_to_utime(rec->file.MdDat);
343 inode->i_op = &hfs_file_inode_operations; 343 inode->i_op = &hfs_file_inode_operations;
344 inode->i_fop = &hfs_file_operations; 344 inode->i_fop = &hfs_file_operations;
345 inode->i_mapping->a_ops = &hfs_aops; 345 inode->i_mapping->a_ops = &hfs_aops;
346 break; 346 break;
347 case HFS_CDR_DIR: 347 case HFS_CDR_DIR:
348 inode->i_ino = be32_to_cpu(rec->dir.DirID); 348 inode->i_ino = be32_to_cpu(rec->dir.DirID);
349 inode->i_size = be16_to_cpu(rec->dir.Val) + 2; 349 inode->i_size = be16_to_cpu(rec->dir.Val) + 2;
350 HFS_I(inode)->fs_blocks = 0; 350 HFS_I(inode)->fs_blocks = 0;
351 inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask); 351 inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask);
352 inode->i_ctime = inode->i_atime = inode->i_mtime = 352 inode->i_ctime = inode->i_atime = inode->i_mtime =
353 hfs_m_to_utime(rec->dir.MdDat); 353 hfs_m_to_utime(rec->dir.MdDat);
354 inode->i_op = &hfs_dir_inode_operations; 354 inode->i_op = &hfs_dir_inode_operations;
355 inode->i_fop = &hfs_dir_operations; 355 inode->i_fop = &hfs_dir_operations;
356 break; 356 break;
357 default: 357 default:
358 make_bad_inode(inode); 358 make_bad_inode(inode);
359 } 359 }
360 return 0; 360 return 0;
361 } 361 }
362 362
363 /* 363 /*
364 * __hfs_iget() 364 * __hfs_iget()
365 * 365 *
366 * Given the MDB for an HFS filesystem, a 'key' and an 'entry' in 366 * Given the MDB for an HFS filesystem, a 'key' and an 'entry' in
367 * the catalog B-tree and the 'type' of the desired file, return the 367 * the catalog B-tree and the 'type' of the desired file, return the
368 * inode for that file/directory or NULL. Note that 'type' indicates 368 * inode for that file/directory or NULL. Note that 'type' indicates
369 * whether we want the actual file or directory, or the corresponding 369 * whether we want the actual file or directory, or the corresponding
370 * metadata (AppleDouble header file or CAP metadata file). 370 * metadata (AppleDouble header file or CAP metadata file).
371 */ 371 */
372 struct inode *hfs_iget(struct super_block *sb, struct hfs_cat_key *key, hfs_cat_rec *rec) 372 struct inode *hfs_iget(struct super_block *sb, struct hfs_cat_key *key, hfs_cat_rec *rec)
373 { 373 {
374 struct hfs_iget_data data = { key, rec }; 374 struct hfs_iget_data data = { key, rec };
375 struct inode *inode; 375 struct inode *inode;
376 u32 cnid; 376 u32 cnid;
377 377
378 switch (rec->type) { 378 switch (rec->type) {
379 case HFS_CDR_DIR: 379 case HFS_CDR_DIR:
380 cnid = be32_to_cpu(rec->dir.DirID); 380 cnid = be32_to_cpu(rec->dir.DirID);
381 break; 381 break;
382 case HFS_CDR_FIL: 382 case HFS_CDR_FIL:
383 cnid = be32_to_cpu(rec->file.FlNum); 383 cnid = be32_to_cpu(rec->file.FlNum);
384 break; 384 break;
385 default: 385 default:
386 return NULL; 386 return NULL;
387 } 387 }
388 inode = iget5_locked(sb, cnid, hfs_test_inode, hfs_read_inode, &data); 388 inode = iget5_locked(sb, cnid, hfs_test_inode, hfs_read_inode, &data);
389 if (inode && (inode->i_state & I_NEW)) 389 if (inode && (inode->i_state & I_NEW))
390 unlock_new_inode(inode); 390 unlock_new_inode(inode);
391 return inode; 391 return inode;
392 } 392 }
393 393
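The lookup-or-create idiom hfs_iget() delegates to iget5_locked() (search by key; on a miss, allocate and initialize; repeat lookups return the same in-core inode) can be modeled in a few lines of plain C. Everything below is illustrative: no hashing, locking, or reference counting, and the cache is a fixed array.

#include <stdio.h>

struct inode { unsigned long ino; int used; };

static struct inode cache[8];   /* stand-in for the inode hash table */

static struct inode *iget5(int (*test)(struct inode *, void *),
                           int (*set)(struct inode *, void *), void *data)
{
        struct inode *spare = NULL;

        for (int i = 0; i < 8; i++) {
                if (cache[i].used && test(&cache[i], data))
                        return &cache[i];       /* hit: existing inode */
                if (!cache[i].used && !spare)
                        spare = &cache[i];
        }
        if (!spare)
                return NULL;                    /* cache full */
        spare->used = 1;
        set(spare, data);                       /* like hfs_read_inode() */
        return spare;
}

static int test_ino(struct inode *inode, void *data)
{
        return inode->ino == *(unsigned long *)data;
}

static int set_ino(struct inode *inode, void *data)
{
        inode->ino = *(unsigned long *)data;
        return 0;
}

int main(void)
{
        unsigned long cnid = 42;
        struct inode *a = iget5(test_ino, set_ino, &cnid);
        struct inode *b = iget5(test_ino, set_ino, &cnid);
        printf("second lookup returned the same inode: %s\n",
               a == b ? "yes" : "no");
        return 0;
}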
394 void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext, 394 void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext,
395 __be32 *log_size, __be32 *phys_size) 395 __be32 *log_size, __be32 *phys_size)
396 { 396 {
397 memcpy(ext, HFS_I(inode)->first_extents, sizeof(hfs_extent_rec)); 397 memcpy(ext, HFS_I(inode)->first_extents, sizeof(hfs_extent_rec));
398 398
399 if (log_size) 399 if (log_size)
400 *log_size = cpu_to_be32(inode->i_size); 400 *log_size = cpu_to_be32(inode->i_size);
401 if (phys_size) 401 if (phys_size)
402 *phys_size = cpu_to_be32(HFS_I(inode)->alloc_blocks * 402 *phys_size = cpu_to_be32(HFS_I(inode)->alloc_blocks *
403 HFS_SB(inode->i_sb)->alloc_blksz); 403 HFS_SB(inode->i_sb)->alloc_blksz);
404 } 404 }
405 405
406 int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) 406 int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
407 { 407 {
408 struct inode *main_inode = inode; 408 struct inode *main_inode = inode;
409 struct hfs_find_data fd; 409 struct hfs_find_data fd;
410 hfs_cat_rec rec; 410 hfs_cat_rec rec;
411 411
412 dprint(DBG_INODE, "hfs_write_inode: %lu\n", inode->i_ino); 412 dprint(DBG_INODE, "hfs_write_inode: %lu\n", inode->i_ino);
413 hfs_ext_write_extent(inode); 413 hfs_ext_write_extent(inode);
414 414
415 if (inode->i_ino < HFS_FIRSTUSER_CNID) { 415 if (inode->i_ino < HFS_FIRSTUSER_CNID) {
416 switch (inode->i_ino) { 416 switch (inode->i_ino) {
417 case HFS_ROOT_CNID: 417 case HFS_ROOT_CNID:
418 break; 418 break;
419 case HFS_EXT_CNID: 419 case HFS_EXT_CNID:
420 hfs_btree_write(HFS_SB(inode->i_sb)->ext_tree); 420 hfs_btree_write(HFS_SB(inode->i_sb)->ext_tree);
421 return 0; 421 return 0;
422 case HFS_CAT_CNID: 422 case HFS_CAT_CNID:
423 hfs_btree_write(HFS_SB(inode->i_sb)->cat_tree); 423 hfs_btree_write(HFS_SB(inode->i_sb)->cat_tree);
424 return 0; 424 return 0;
425 default: 425 default:
426 BUG(); 426 BUG();
427 return -EIO; 427 return -EIO;
428 } 428 }
429 } 429 }
430 430
431 if (HFS_IS_RSRC(inode)) 431 if (HFS_IS_RSRC(inode))
432 main_inode = HFS_I(inode)->rsrc_inode; 432 main_inode = HFS_I(inode)->rsrc_inode;
433 433
434 if (!main_inode->i_nlink) 434 if (!main_inode->i_nlink)
435 return 0; 435 return 0;
436 436
437 if (hfs_find_init(HFS_SB(main_inode->i_sb)->cat_tree, &fd)) 437 if (hfs_find_init(HFS_SB(main_inode->i_sb)->cat_tree, &fd))
438 /* panic? */ 438 /* panic? */
439 return -EIO; 439 return -EIO;
440 440
441 fd.search_key->cat = HFS_I(main_inode)->cat_key; 441 fd.search_key->cat = HFS_I(main_inode)->cat_key;
442 if (hfs_brec_find(&fd)) 442 if (hfs_brec_find(&fd))
443 /* panic? */ 443 /* panic? */
444 goto out; 444 goto out;
445 445
446 if (S_ISDIR(main_inode->i_mode)) { 446 if (S_ISDIR(main_inode->i_mode)) {
447 if (fd.entrylength < sizeof(struct hfs_cat_dir)) 447 if (fd.entrylength < sizeof(struct hfs_cat_dir))
448 /* panic? */; 448 /* panic? */;
449 hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, 449 hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
450 sizeof(struct hfs_cat_dir)); 450 sizeof(struct hfs_cat_dir));
451 if (rec.type != HFS_CDR_DIR || 451 if (rec.type != HFS_CDR_DIR ||
452 be32_to_cpu(rec.dir.DirID) != inode->i_ino) { 452 be32_to_cpu(rec.dir.DirID) != inode->i_ino) {
453 } 453 }
454 454
455 rec.dir.MdDat = hfs_u_to_mtime(inode->i_mtime); 455 rec.dir.MdDat = hfs_u_to_mtime(inode->i_mtime);
456 rec.dir.Val = cpu_to_be16(inode->i_size - 2); 456 rec.dir.Val = cpu_to_be16(inode->i_size - 2);
457 457
458 hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, 458 hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
459 sizeof(struct hfs_cat_dir)); 459 sizeof(struct hfs_cat_dir));
460 } else if (HFS_IS_RSRC(inode)) { 460 } else if (HFS_IS_RSRC(inode)) {
461 hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, 461 hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
462 sizeof(struct hfs_cat_file)); 462 sizeof(struct hfs_cat_file));
463 hfs_inode_write_fork(inode, rec.file.RExtRec, 463 hfs_inode_write_fork(inode, rec.file.RExtRec,
464 &rec.file.RLgLen, &rec.file.RPyLen); 464 &rec.file.RLgLen, &rec.file.RPyLen);
465 hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, 465 hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
466 sizeof(struct hfs_cat_file)); 466 sizeof(struct hfs_cat_file));
467 } else { 467 } else {
468 if (fd.entrylength < sizeof(struct hfs_cat_file)) 468 if (fd.entrylength < sizeof(struct hfs_cat_file))
469 /* panic? */; 469 /* panic? */;
470 hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, 470 hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
471 sizeof(struct hfs_cat_file)); 471 sizeof(struct hfs_cat_file));
472 if (rec.type != HFS_CDR_FIL || 472 if (rec.type != HFS_CDR_FIL ||
473 be32_to_cpu(rec.file.FlNum) != inode->i_ino) { 473 be32_to_cpu(rec.file.FlNum) != inode->i_ino) {
474 } 474 }
475 475
476 if (inode->i_mode & S_IWUSR) 476 if (inode->i_mode & S_IWUSR)
477 rec.file.Flags &= ~HFS_FIL_LOCK; 477 rec.file.Flags &= ~HFS_FIL_LOCK;
478 else 478 else
479 rec.file.Flags |= HFS_FIL_LOCK; 479 rec.file.Flags |= HFS_FIL_LOCK;
480 hfs_inode_write_fork(inode, rec.file.ExtRec, &rec.file.LgLen, &rec.file.PyLen); 480 hfs_inode_write_fork(inode, rec.file.ExtRec, &rec.file.LgLen, &rec.file.PyLen);
481 rec.file.MdDat = hfs_u_to_mtime(inode->i_mtime); 481 rec.file.MdDat = hfs_u_to_mtime(inode->i_mtime);
482 482
483 hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, 483 hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
484 sizeof(struct hfs_cat_file)); 484 sizeof(struct hfs_cat_file));
485 } 485 }
486 out: 486 out:
487 hfs_find_exit(&fd); 487 hfs_find_exit(&fd);
488 return 0; 488 return 0;
489 } 489 }
490 490
491 static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry, 491 static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry,
492 struct nameidata *nd) 492 struct nameidata *nd)
493 { 493 {
494 struct inode *inode = NULL; 494 struct inode *inode = NULL;
495 hfs_cat_rec rec; 495 hfs_cat_rec rec;
496 struct hfs_find_data fd; 496 struct hfs_find_data fd;
497 int res; 497 int res;
498 498
499 if (HFS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) 499 if (HFS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc"))
500 goto out; 500 goto out;
501 501
502 inode = HFS_I(dir)->rsrc_inode; 502 inode = HFS_I(dir)->rsrc_inode;
503 if (inode) 503 if (inode)
504 goto out; 504 goto out;
505 505
506 inode = new_inode(dir->i_sb); 506 inode = new_inode(dir->i_sb);
507 if (!inode) 507 if (!inode)
508 return ERR_PTR(-ENOMEM); 508 return ERR_PTR(-ENOMEM);
509 509
510 hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); 510 hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
511 fd.search_key->cat = HFS_I(dir)->cat_key; 511 fd.search_key->cat = HFS_I(dir)->cat_key;
512 res = hfs_brec_read(&fd, &rec, sizeof(rec)); 512 res = hfs_brec_read(&fd, &rec, sizeof(rec));
513 if (!res) { 513 if (!res) {
514 struct hfs_iget_data idata = { NULL, &rec }; 514 struct hfs_iget_data idata = { NULL, &rec };
515 hfs_read_inode(inode, &idata); 515 hfs_read_inode(inode, &idata);
516 } 516 }
517 hfs_find_exit(&fd); 517 hfs_find_exit(&fd);
518 if (res) { 518 if (res) {
519 iput(inode); 519 iput(inode);
520 return ERR_PTR(res); 520 return ERR_PTR(res);
521 } 521 }
522 HFS_I(inode)->rsrc_inode = dir; 522 HFS_I(inode)->rsrc_inode = dir;
523 HFS_I(dir)->rsrc_inode = inode; 523 HFS_I(dir)->rsrc_inode = inode;
524 igrab(dir); 524 igrab(dir);
525 hlist_add_fake(&inode->i_hash); 525 hlist_add_fake(&inode->i_hash);
526 mark_inode_dirty(inode); 526 mark_inode_dirty(inode);
527 out: 527 out:
528 d_add(dentry, inode); 528 d_add(dentry, inode);
529 return NULL; 529 return NULL;
530 } 530 }
531 531
532 void hfs_evict_inode(struct inode *inode) 532 void hfs_evict_inode(struct inode *inode)
533 { 533 {
534 truncate_inode_pages(&inode->i_data, 0); 534 truncate_inode_pages(&inode->i_data, 0);
535 end_writeback(inode); 535 end_writeback(inode);
536 if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) { 536 if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) {
537 HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL; 537 HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
538 iput(HFS_I(inode)->rsrc_inode); 538 iput(HFS_I(inode)->rsrc_inode);
539 } 539 }
540 } 540 }
541 541
542 static int hfs_file_open(struct inode *inode, struct file *file) 542 static int hfs_file_open(struct inode *inode, struct file *file)
543 { 543 {
544 if (HFS_IS_RSRC(inode)) 544 if (HFS_IS_RSRC(inode))
545 inode = HFS_I(inode)->rsrc_inode; 545 inode = HFS_I(inode)->rsrc_inode;
546 atomic_inc(&HFS_I(inode)->opencnt); 546 atomic_inc(&HFS_I(inode)->opencnt);
547 return 0; 547 return 0;
548 } 548 }
549 549
550 static int hfs_file_release(struct inode *inode, struct file *file) 550 static int hfs_file_release(struct inode *inode, struct file *file)
551 { 551 {
552 //struct super_block *sb = inode->i_sb; 552 //struct super_block *sb = inode->i_sb;
553 553
554 if (HFS_IS_RSRC(inode)) 554 if (HFS_IS_RSRC(inode))
555 inode = HFS_I(inode)->rsrc_inode; 555 inode = HFS_I(inode)->rsrc_inode;
556 if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) { 556 if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) {
557 mutex_lock(&inode->i_mutex); 557 mutex_lock(&inode->i_mutex);
558 hfs_file_truncate(inode); 558 hfs_file_truncate(inode);
559 //if (inode->i_flags & S_DEAD) { 559 //if (inode->i_flags & S_DEAD) {
560 // hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); 560 // hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL);
561 // hfs_delete_inode(inode); 561 // hfs_delete_inode(inode);
562 //} 562 //}
563 mutex_unlock(&inode->i_mutex); 563 mutex_unlock(&inode->i_mutex);
564 } 564 }
565 return 0; 565 return 0;
566 } 566 }
567 567
568 /* 568 /*
569 * hfs_inode_setattr() 569 * hfs_inode_setattr()
570 * 570 *
571 * Based very closely on fs/msdos/inode.c by Werner Almesberger 571 * Based very closely on fs/msdos/inode.c by Werner Almesberger
572 * 572 *
573 * This is the setattr() field in the inode_operations structure 573 * This is the setattr() field in the inode_operations structure
574 * for HFS file systems. The purpose is to take the changes made to 574 * for HFS file systems. The purpose is to take the changes made to
575 * an inode and apply them in a filesystem-dependent manner. In this 575 * an inode and apply them in a filesystem-dependent manner. In this
576 * case the process has a few tasks to do: 576 * case the process has a few tasks to do:
577 * 1) prevent changes to the i_uid and i_gid fields. 577 * 1) prevent changes to the i_uid and i_gid fields.
578 * 2) map file permissions to the closest allowable permissions 578 * 2) map file permissions to the closest allowable permissions
579 * 3) Since multiple Linux files can share the same on-disk inode under 579 * 3) Since multiple Linux files can share the same on-disk inode under
580 * HFS (for instance the data and resource forks of a file) a change 580 * HFS (for instance the data and resource forks of a file) a change
581 * to permissions must be applied to all other in-core inodes which 581 * to permissions must be applied to all other in-core inodes which
582 * correspond to the same HFS file. 582 * correspond to the same HFS file.
583 */ 583 */
584 584
585 int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr) 585 int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
586 { 586 {
587 struct inode *inode = dentry->d_inode; 587 struct inode *inode = dentry->d_inode;
588 struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); 588 struct hfs_sb_info *hsb = HFS_SB(inode->i_sb);
589 int error; 589 int error;
590 590
591 error = inode_change_ok(inode, attr); /* basic permission checks */ 591 error = inode_change_ok(inode, attr); /* basic permission checks */
592 if (error) 592 if (error)
593 return error; 593 return error;
594 594
595 /* no uid/gid changes and limit which mode bits can be set */ 595 /* no uid/gid changes and limit which mode bits can be set */
596 if (((attr->ia_valid & ATTR_UID) && 596 if (((attr->ia_valid & ATTR_UID) &&
597 (attr->ia_uid != hsb->s_uid)) || 597 (attr->ia_uid != hsb->s_uid)) ||
598 ((attr->ia_valid & ATTR_GID) && 598 ((attr->ia_valid & ATTR_GID) &&
599 (attr->ia_gid != hsb->s_gid)) || 599 (attr->ia_gid != hsb->s_gid)) ||
600 ((attr->ia_valid & ATTR_MODE) && 600 ((attr->ia_valid & ATTR_MODE) &&
601 ((S_ISDIR(inode->i_mode) && 601 ((S_ISDIR(inode->i_mode) &&
602 (attr->ia_mode != inode->i_mode)) || 602 (attr->ia_mode != inode->i_mode)) ||
603 (attr->ia_mode & ~HFS_VALID_MODE_BITS)))) { 603 (attr->ia_mode & ~HFS_VALID_MODE_BITS)))) {
604 return hsb->s_quiet ? 0 : -EPERM; 604 return hsb->s_quiet ? 0 : -EPERM;
605 } 605 }
606 606
607 if (attr->ia_valid & ATTR_MODE) { 607 if (attr->ia_valid & ATTR_MODE) {
608 /* Only the 'w' bits can ever change and only all together. */ 608 /* Only the 'w' bits can ever change and only all together. */
609 if (attr->ia_mode & S_IWUSR) 609 if (attr->ia_mode & S_IWUSR)
610 attr->ia_mode = inode->i_mode | S_IWUGO; 610 attr->ia_mode = inode->i_mode | S_IWUGO;
611 else 611 else
612 attr->ia_mode = inode->i_mode & ~S_IWUGO; 612 attr->ia_mode = inode->i_mode & ~S_IWUGO;
613 attr->ia_mode &= S_ISDIR(inode->i_mode) ? ~hsb->s_dir_umask: ~hsb->s_file_umask; 613 attr->ia_mode &= S_ISDIR(inode->i_mode) ? ~hsb->s_dir_umask: ~hsb->s_file_umask;
614 } 614 }
615 615
616 if ((attr->ia_valid & ATTR_SIZE) && 616 if ((attr->ia_valid & ATTR_SIZE) &&
617 attr->ia_size != i_size_read(inode)) { 617 attr->ia_size != i_size_read(inode)) {
618 inode_dio_wait(inode);
619
618 error = vmtruncate(inode, attr->ia_size); 620 error = vmtruncate(inode, attr->ia_size);
619 if (error) 621 if (error)
620 return error; 622 return error;
621 } 623 }
622 624
623 setattr_copy(inode, attr); 625 setattr_copy(inode, attr);
624 mark_inode_dirty(inode); 626 mark_inode_dirty(inode);
625 return 0; 627 return 0;
626 } 628 }
627 629
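The hunk above is the whole HFS side of this commit: when ->setattr changes i_size, inode_dio_wait() now runs before vmtruncate(). The counterpart, per the commit message, is that the generic VFS code no longer waits beforehand. The sketch below shows the call site that moves out of notify_change() in fs/attr.c; those removed lines are not shown in this section, so treat them as a reconstruction:

	/* fs/attr.c, notify_change() -- reconstructed, not part of this hunk */
	if (ia_valid & ATTR_SIZE)
		inode_dio_wait(inode);	/* now done inside each ->setattr,
					 * where fs-specific locks can be held */
	error = inode->i_op->setattr(dentry, attr);
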
628 static int hfs_file_fsync(struct file *filp, int datasync) 630 static int hfs_file_fsync(struct file *filp, int datasync)
629 { 631 {
630 struct inode *inode = filp->f_mapping->host; 632 struct inode *inode = filp->f_mapping->host;
631 struct super_block * sb; 633 struct super_block * sb;
632 int ret, err; 634 int ret, err;
633 635
634 /* sync the inode to buffers */ 636 /* sync the inode to buffers */
635 ret = write_inode_now(inode, 0); 637 ret = write_inode_now(inode, 0);
636 638
637 /* sync the superblock to buffers */ 639 /* sync the superblock to buffers */
638 sb = inode->i_sb; 640 sb = inode->i_sb;
639 if (sb->s_dirt) { 641 if (sb->s_dirt) {
640 lock_super(sb); 642 lock_super(sb);
641 sb->s_dirt = 0; 643 sb->s_dirt = 0;
642 if (!(sb->s_flags & MS_RDONLY)) 644 if (!(sb->s_flags & MS_RDONLY))
643 hfs_mdb_commit(sb); 645 hfs_mdb_commit(sb);
644 unlock_super(sb); 646 unlock_super(sb);
645 } 647 }
646 /* .. finally sync the buffers to disk */ 648 /* .. finally sync the buffers to disk */
647 err = sync_blockdev(sb->s_bdev); 649 err = sync_blockdev(sb->s_bdev);
648 if (!ret) 650 if (!ret)
649 ret = err; 651 ret = err;
650 return ret; 652 return ret;
651 } 653 }
652 654
653 static const struct file_operations hfs_file_operations = { 655 static const struct file_operations hfs_file_operations = {
654 .llseek = generic_file_llseek, 656 .llseek = generic_file_llseek,
655 .read = do_sync_read, 657 .read = do_sync_read,
656 .aio_read = generic_file_aio_read, 658 .aio_read = generic_file_aio_read,
657 .write = do_sync_write, 659 .write = do_sync_write,
658 .aio_write = generic_file_aio_write, 660 .aio_write = generic_file_aio_write,
659 .mmap = generic_file_mmap, 661 .mmap = generic_file_mmap,
660 .splice_read = generic_file_splice_read, 662 .splice_read = generic_file_splice_read,
661 .fsync = hfs_file_fsync, 663 .fsync = hfs_file_fsync,
662 .open = hfs_file_open, 664 .open = hfs_file_open,
663 .release = hfs_file_release, 665 .release = hfs_file_release,
664 }; 666 };
665 667
666 static const struct inode_operations hfs_file_inode_operations = { 668 static const struct inode_operations hfs_file_inode_operations = {
667 .lookup = hfs_file_lookup, 669 .lookup = hfs_file_lookup,
668 .truncate = hfs_file_truncate, 670 .truncate = hfs_file_truncate,
669 .setattr = hfs_inode_setattr, 671 .setattr = hfs_inode_setattr,
670 .setxattr = hfs_setxattr, 672 .setxattr = hfs_setxattr,
671 .getxattr = hfs_getxattr, 673 .getxattr = hfs_getxattr,
672 .listxattr = hfs_listxattr, 674 .listxattr = hfs_listxattr,
673 }; 675 };
674 676
1 /* 1 /*
2 * linux/fs/hfsplus/inode.c 2 * linux/fs/hfsplus/inode.c
3 * 3 *
4 * Copyright (C) 2001 4 * Copyright (C) 2001
5 * Brad Boyer (flar@allandria.com) 5 * Brad Boyer (flar@allandria.com)
6 * (C) 2003 Ardis Technologies <roman@ardistech.com> 6 * (C) 2003 Ardis Technologies <roman@ardistech.com>
7 * 7 *
8 * Inode handling routines 8 * Inode handling routines
9 */ 9 */
10 10
11 #include <linux/blkdev.h> 11 #include <linux/blkdev.h>
12 #include <linux/mm.h> 12 #include <linux/mm.h>
13 #include <linux/fs.h> 13 #include <linux/fs.h>
14 #include <linux/pagemap.h> 14 #include <linux/pagemap.h>
15 #include <linux/mpage.h> 15 #include <linux/mpage.h>
16 #include <linux/sched.h> 16 #include <linux/sched.h>
17 17
18 #include "hfsplus_fs.h" 18 #include "hfsplus_fs.h"
19 #include "hfsplus_raw.h" 19 #include "hfsplus_raw.h"
20 20
21 static int hfsplus_readpage(struct file *file, struct page *page) 21 static int hfsplus_readpage(struct file *file, struct page *page)
22 { 22 {
23 return block_read_full_page(page, hfsplus_get_block); 23 return block_read_full_page(page, hfsplus_get_block);
24 } 24 }
25 25
26 static int hfsplus_writepage(struct page *page, struct writeback_control *wbc) 26 static int hfsplus_writepage(struct page *page, struct writeback_control *wbc)
27 { 27 {
28 return block_write_full_page(page, hfsplus_get_block, wbc); 28 return block_write_full_page(page, hfsplus_get_block, wbc);
29 } 29 }
30 30
31 static int hfsplus_write_begin(struct file *file, struct address_space *mapping, 31 static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
32 loff_t pos, unsigned len, unsigned flags, 32 loff_t pos, unsigned len, unsigned flags,
33 struct page **pagep, void **fsdata) 33 struct page **pagep, void **fsdata)
34 { 34 {
35 int ret; 35 int ret;
36 36
37 *pagep = NULL; 37 *pagep = NULL;
38 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 38 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
39 hfsplus_get_block, 39 hfsplus_get_block,
40 &HFSPLUS_I(mapping->host)->phys_size); 40 &HFSPLUS_I(mapping->host)->phys_size);
41 if (unlikely(ret)) { 41 if (unlikely(ret)) {
42 loff_t isize = mapping->host->i_size; 42 loff_t isize = mapping->host->i_size;
43 if (pos + len > isize) 43 if (pos + len > isize)
44 vmtruncate(mapping->host, isize); 44 vmtruncate(mapping->host, isize);
45 } 45 }
46 46
47 return ret; 47 return ret;
48 } 48 }
49 49
50 static sector_t hfsplus_bmap(struct address_space *mapping, sector_t block) 50 static sector_t hfsplus_bmap(struct address_space *mapping, sector_t block)
51 { 51 {
52 return generic_block_bmap(mapping, block, hfsplus_get_block); 52 return generic_block_bmap(mapping, block, hfsplus_get_block);
53 } 53 }
54 54
55 static int hfsplus_releasepage(struct page *page, gfp_t mask) 55 static int hfsplus_releasepage(struct page *page, gfp_t mask)
56 { 56 {
57 struct inode *inode = page->mapping->host; 57 struct inode *inode = page->mapping->host;
58 struct super_block *sb = inode->i_sb; 58 struct super_block *sb = inode->i_sb;
59 struct hfs_btree *tree; 59 struct hfs_btree *tree;
60 struct hfs_bnode *node; 60 struct hfs_bnode *node;
61 u32 nidx; 61 u32 nidx;
62 int i, res = 1; 62 int i, res = 1;
63 63
64 switch (inode->i_ino) { 64 switch (inode->i_ino) {
65 case HFSPLUS_EXT_CNID: 65 case HFSPLUS_EXT_CNID:
66 tree = HFSPLUS_SB(sb)->ext_tree; 66 tree = HFSPLUS_SB(sb)->ext_tree;
67 break; 67 break;
68 case HFSPLUS_CAT_CNID: 68 case HFSPLUS_CAT_CNID:
69 tree = HFSPLUS_SB(sb)->cat_tree; 69 tree = HFSPLUS_SB(sb)->cat_tree;
70 break; 70 break;
71 case HFSPLUS_ATTR_CNID: 71 case HFSPLUS_ATTR_CNID:
72 tree = HFSPLUS_SB(sb)->attr_tree; 72 tree = HFSPLUS_SB(sb)->attr_tree;
73 break; 73 break;
74 default: 74 default:
75 BUG(); 75 BUG();
76 return 0; 76 return 0;
77 } 77 }
78 if (!tree) 78 if (!tree)
79 return 0; 79 return 0;
80 if (tree->node_size >= PAGE_CACHE_SIZE) { 80 if (tree->node_size >= PAGE_CACHE_SIZE) {
81 nidx = page->index >> 81 nidx = page->index >>
82 (tree->node_size_shift - PAGE_CACHE_SHIFT); 82 (tree->node_size_shift - PAGE_CACHE_SHIFT);
83 spin_lock(&tree->hash_lock); 83 spin_lock(&tree->hash_lock);
84 node = hfs_bnode_findhash(tree, nidx); 84 node = hfs_bnode_findhash(tree, nidx);
85 if (!node) 85 if (!node)
86 ; 86 ;
87 else if (atomic_read(&node->refcnt)) 87 else if (atomic_read(&node->refcnt))
88 res = 0; 88 res = 0;
89 if (res && node) { 89 if (res && node) {
90 hfs_bnode_unhash(node); 90 hfs_bnode_unhash(node);
91 hfs_bnode_free(node); 91 hfs_bnode_free(node);
92 } 92 }
93 spin_unlock(&tree->hash_lock); 93 spin_unlock(&tree->hash_lock);
94 } else { 94 } else {
95 nidx = page->index << 95 nidx = page->index <<
96 (PAGE_CACHE_SHIFT - tree->node_size_shift); 96 (PAGE_CACHE_SHIFT - tree->node_size_shift);
97 i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift); 97 i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift);
98 spin_lock(&tree->hash_lock); 98 spin_lock(&tree->hash_lock);
99 do { 99 do {
100 node = hfs_bnode_findhash(tree, nidx++); 100 node = hfs_bnode_findhash(tree, nidx++);
101 if (!node) 101 if (!node)
102 continue; 102 continue;
103 if (atomic_read(&node->refcnt)) { 103 if (atomic_read(&node->refcnt)) {
104 res = 0; 104 res = 0;
105 break; 105 break;
106 } 106 }
107 hfs_bnode_unhash(node); 107 hfs_bnode_unhash(node);
108 hfs_bnode_free(node); 108 hfs_bnode_free(node);
109 } while (--i && nidx < tree->node_count); 109 } while (--i && nidx < tree->node_count);
110 spin_unlock(&tree->hash_lock); 110 spin_unlock(&tree->hash_lock);
111 } 111 }
112 return res ? try_to_free_buffers(page) : 0; 112 return res ? try_to_free_buffers(page) : 0;
113 } 113 }
114 114
115 static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb, 115 static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
116 const struct iovec *iov, loff_t offset, unsigned long nr_segs) 116 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
117 { 117 {
118 struct file *file = iocb->ki_filp; 118 struct file *file = iocb->ki_filp;
119 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; 119 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
120 ssize_t ret; 120 ssize_t ret;
121 121
122 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 122 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
123 offset, nr_segs, hfsplus_get_block, NULL); 123 offset, nr_segs, hfsplus_get_block, NULL);
124 124
125 /* 125 /*
126 * In case of error, an extending write may have instantiated a few 126 * In case of error, an extending write may have instantiated a few
127 * blocks outside i_size. Trim these off again. 127 * blocks outside i_size. Trim these off again.
128 */ 128 */
129 if (unlikely((rw & WRITE) && ret < 0)) { 129 if (unlikely((rw & WRITE) && ret < 0)) {
130 loff_t isize = i_size_read(inode); 130 loff_t isize = i_size_read(inode);
131 loff_t end = offset + iov_length(iov, nr_segs); 131 loff_t end = offset + iov_length(iov, nr_segs);
132 132
133 if (end > isize) 133 if (end > isize)
134 vmtruncate(inode, isize); 134 vmtruncate(inode, isize);
135 } 135 }
136 136
137 return ret; 137 return ret;
138 } 138 }
139 139
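hfsplus_direct_IO() is the reason hfsplus_setattr() below must wait: each direct I/O request holds a reference on the inode's in-flight dio count for its entire lifetime. A sketch of that counting, introduced earlier in this patch series (reconstructed here for reference, not part of this hunk):

/* fs/direct-io.c takes the reference when a request is set up: */
	atomic_inc(&inode->i_dio_count);

/* fs/inode.c drops it on completion and wakes anyone sleeping
 * in inode_dio_wait(): */
void inode_dio_done(struct inode *inode)
{
	if (atomic_dec_and_test(&inode->i_dio_count))
		wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
}
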
140 static int hfsplus_writepages(struct address_space *mapping, 140 static int hfsplus_writepages(struct address_space *mapping,
141 struct writeback_control *wbc) 141 struct writeback_control *wbc)
142 { 142 {
143 return mpage_writepages(mapping, wbc, hfsplus_get_block); 143 return mpage_writepages(mapping, wbc, hfsplus_get_block);
144 } 144 }
145 145
146 const struct address_space_operations hfsplus_btree_aops = { 146 const struct address_space_operations hfsplus_btree_aops = {
147 .readpage = hfsplus_readpage, 147 .readpage = hfsplus_readpage,
148 .writepage = hfsplus_writepage, 148 .writepage = hfsplus_writepage,
149 .write_begin = hfsplus_write_begin, 149 .write_begin = hfsplus_write_begin,
150 .write_end = generic_write_end, 150 .write_end = generic_write_end,
151 .bmap = hfsplus_bmap, 151 .bmap = hfsplus_bmap,
152 .releasepage = hfsplus_releasepage, 152 .releasepage = hfsplus_releasepage,
153 }; 153 };
154 154
155 const struct address_space_operations hfsplus_aops = { 155 const struct address_space_operations hfsplus_aops = {
156 .readpage = hfsplus_readpage, 156 .readpage = hfsplus_readpage,
157 .writepage = hfsplus_writepage, 157 .writepage = hfsplus_writepage,
158 .write_begin = hfsplus_write_begin, 158 .write_begin = hfsplus_write_begin,
159 .write_end = generic_write_end, 159 .write_end = generic_write_end,
160 .bmap = hfsplus_bmap, 160 .bmap = hfsplus_bmap,
161 .direct_IO = hfsplus_direct_IO, 161 .direct_IO = hfsplus_direct_IO,
162 .writepages = hfsplus_writepages, 162 .writepages = hfsplus_writepages,
163 }; 163 };
164 164
165 const struct dentry_operations hfsplus_dentry_operations = { 165 const struct dentry_operations hfsplus_dentry_operations = {
166 .d_hash = hfsplus_hash_dentry, 166 .d_hash = hfsplus_hash_dentry,
167 .d_compare = hfsplus_compare_dentry, 167 .d_compare = hfsplus_compare_dentry,
168 }; 168 };
169 169
170 static struct dentry *hfsplus_file_lookup(struct inode *dir, 170 static struct dentry *hfsplus_file_lookup(struct inode *dir,
171 struct dentry *dentry, struct nameidata *nd) 171 struct dentry *dentry, struct nameidata *nd)
172 { 172 {
173 struct hfs_find_data fd; 173 struct hfs_find_data fd;
174 struct super_block *sb = dir->i_sb; 174 struct super_block *sb = dir->i_sb;
175 struct inode *inode = NULL; 175 struct inode *inode = NULL;
176 struct hfsplus_inode_info *hip; 176 struct hfsplus_inode_info *hip;
177 int err; 177 int err;
178 178
179 if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) 179 if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc"))
180 goto out; 180 goto out;
181 181
182 inode = HFSPLUS_I(dir)->rsrc_inode; 182 inode = HFSPLUS_I(dir)->rsrc_inode;
183 if (inode) 183 if (inode)
184 goto out; 184 goto out;
185 185
186 inode = new_inode(sb); 186 inode = new_inode(sb);
187 if (!inode) 187 if (!inode)
188 return ERR_PTR(-ENOMEM); 188 return ERR_PTR(-ENOMEM);
189 189
190 hip = HFSPLUS_I(inode); 190 hip = HFSPLUS_I(inode);
191 inode->i_ino = dir->i_ino; 191 inode->i_ino = dir->i_ino;
192 INIT_LIST_HEAD(&hip->open_dir_list); 192 INIT_LIST_HEAD(&hip->open_dir_list);
193 mutex_init(&hip->extents_lock); 193 mutex_init(&hip->extents_lock);
194 hip->extent_state = 0; 194 hip->extent_state = 0;
195 hip->flags = 0; 195 hip->flags = 0;
196 set_bit(HFSPLUS_I_RSRC, &hip->flags); 196 set_bit(HFSPLUS_I_RSRC, &hip->flags);
197 197
198 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); 198 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
199 err = hfsplus_find_cat(sb, dir->i_ino, &fd); 199 err = hfsplus_find_cat(sb, dir->i_ino, &fd);
200 if (!err) 200 if (!err)
201 err = hfsplus_cat_read_inode(inode, &fd); 201 err = hfsplus_cat_read_inode(inode, &fd);
202 hfs_find_exit(&fd); 202 hfs_find_exit(&fd);
203 if (err) { 203 if (err) {
204 iput(inode); 204 iput(inode);
205 return ERR_PTR(err); 205 return ERR_PTR(err);
206 } 206 }
207 hip->rsrc_inode = dir; 207 hip->rsrc_inode = dir;
208 HFSPLUS_I(dir)->rsrc_inode = inode; 208 HFSPLUS_I(dir)->rsrc_inode = inode;
209 igrab(dir); 209 igrab(dir);
210 210
211 /* 211 /*
212 * __mark_inode_dirty expects inodes to be hashed. Since we don't 212 * __mark_inode_dirty expects inodes to be hashed. Since we don't
213 * want resource fork inodes in the regular inode space, we make them 213 * want resource fork inodes in the regular inode space, we make them
214 * appear hashed, but do not put them on any lists. hlist_del() 214 * appear hashed, but do not put them on any lists. hlist_del()
215 * will work fine and require no locking. 215 * will work fine and require no locking.
216 */ 216 */
217 hlist_add_fake(&inode->i_hash); 217 hlist_add_fake(&inode->i_hash);
218 218
219 mark_inode_dirty(inode); 219 mark_inode_dirty(inode);
220 out: 220 out:
221 d_add(dentry, inode); 221 d_add(dentry, inode);
222 return NULL; 222 return NULL;
223 } 223 }
224 224
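For reference, hlist_add_fake() as used above is a one-liner in <linux/list.h>: it points the node's pprev at its own next field, so the inode looks hashed to __mark_inode_dirty() and a later hlist_del() degenerates into a harmless self-unlink that needs no locking:

static inline void hlist_add_fake(struct hlist_node *n)
{
	n->pprev = &n->next;
}
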
225 static void hfsplus_get_perms(struct inode *inode, 225 static void hfsplus_get_perms(struct inode *inode,
226 struct hfsplus_perm *perms, int dir) 226 struct hfsplus_perm *perms, int dir)
227 { 227 {
228 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); 228 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
229 u16 mode; 229 u16 mode;
230 230
231 mode = be16_to_cpu(perms->mode); 231 mode = be16_to_cpu(perms->mode);
232 232
233 inode->i_uid = be32_to_cpu(perms->owner); 233 inode->i_uid = be32_to_cpu(perms->owner);
234 if (!inode->i_uid && !mode) 234 if (!inode->i_uid && !mode)
235 inode->i_uid = sbi->uid; 235 inode->i_uid = sbi->uid;
236 236
237 inode->i_gid = be32_to_cpu(perms->group); 237 inode->i_gid = be32_to_cpu(perms->group);
238 if (!inode->i_gid && !mode) 238 if (!inode->i_gid && !mode)
239 inode->i_gid = sbi->gid; 239 inode->i_gid = sbi->gid;
240 240
241 if (dir) { 241 if (dir) {
242 mode = mode ? (mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask)); 242 mode = mode ? (mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask));
243 mode |= S_IFDIR; 243 mode |= S_IFDIR;
244 } else if (!mode) 244 } else if (!mode)
245 mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask)); 245 mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask));
246 inode->i_mode = mode; 246 inode->i_mode = mode;
247 247
248 HFSPLUS_I(inode)->userflags = perms->userflags; 248 HFSPLUS_I(inode)->userflags = perms->userflags;
249 if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE) 249 if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE)
250 inode->i_flags |= S_IMMUTABLE; 250 inode->i_flags |= S_IMMUTABLE;
251 else 251 else
252 inode->i_flags &= ~S_IMMUTABLE; 252 inode->i_flags &= ~S_IMMUTABLE;
253 if (perms->rootflags & HFSPLUS_FLG_APPEND) 253 if (perms->rootflags & HFSPLUS_FLG_APPEND)
254 inode->i_flags |= S_APPEND; 254 inode->i_flags |= S_APPEND;
255 else 255 else
256 inode->i_flags &= ~S_APPEND; 256 inode->i_flags &= ~S_APPEND;
257 } 257 }
258 258
259 static int hfsplus_file_open(struct inode *inode, struct file *file) 259 static int hfsplus_file_open(struct inode *inode, struct file *file)
260 { 260 {
261 if (HFSPLUS_IS_RSRC(inode)) 261 if (HFSPLUS_IS_RSRC(inode))
262 inode = HFSPLUS_I(inode)->rsrc_inode; 262 inode = HFSPLUS_I(inode)->rsrc_inode;
263 if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) 263 if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
264 return -EOVERFLOW; 264 return -EOVERFLOW;
265 atomic_inc(&HFSPLUS_I(inode)->opencnt); 265 atomic_inc(&HFSPLUS_I(inode)->opencnt);
266 return 0; 266 return 0;
267 } 267 }
268 268
269 static int hfsplus_file_release(struct inode *inode, struct file *file) 269 static int hfsplus_file_release(struct inode *inode, struct file *file)
270 { 270 {
271 struct super_block *sb = inode->i_sb; 271 struct super_block *sb = inode->i_sb;
272 272
273 if (HFSPLUS_IS_RSRC(inode)) 273 if (HFSPLUS_IS_RSRC(inode))
274 inode = HFSPLUS_I(inode)->rsrc_inode; 274 inode = HFSPLUS_I(inode)->rsrc_inode;
275 if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) { 275 if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) {
276 mutex_lock(&inode->i_mutex); 276 mutex_lock(&inode->i_mutex);
277 hfsplus_file_truncate(inode); 277 hfsplus_file_truncate(inode);
278 if (inode->i_flags & S_DEAD) { 278 if (inode->i_flags & S_DEAD) {
279 hfsplus_delete_cat(inode->i_ino, 279 hfsplus_delete_cat(inode->i_ino,
280 HFSPLUS_SB(sb)->hidden_dir, NULL); 280 HFSPLUS_SB(sb)->hidden_dir, NULL);
281 hfsplus_delete_inode(inode); 281 hfsplus_delete_inode(inode);
282 } 282 }
283 mutex_unlock(&inode->i_mutex); 283 mutex_unlock(&inode->i_mutex);
284 } 284 }
285 return 0; 285 return 0;
286 } 286 }
287 287
288 static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) 288 static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
289 { 289 {
290 struct inode *inode = dentry->d_inode; 290 struct inode *inode = dentry->d_inode;
291 int error; 291 int error;
292 292
293 error = inode_change_ok(inode, attr); 293 error = inode_change_ok(inode, attr);
294 if (error) 294 if (error)
295 return error; 295 return error;
296 296
297 if ((attr->ia_valid & ATTR_SIZE) && 297 if ((attr->ia_valid & ATTR_SIZE) &&
298 attr->ia_size != i_size_read(inode)) { 298 attr->ia_size != i_size_read(inode)) {
299 inode_dio_wait(inode);
300
299 error = vmtruncate(inode, attr->ia_size); 301 error = vmtruncate(inode, attr->ia_size);
300 if (error) 302 if (error)
301 return error; 303 return error;
302 } 304 }
303 305
304 setattr_copy(inode, attr); 306 setattr_copy(inode, attr);
305 mark_inode_dirty(inode); 307 mark_inode_dirty(inode);
306 return 0; 308 return 0;
307 } 309 }
308 310
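hfsplus_setattr() gains the same inode_dio_wait() call as the HFS variant above. The helper itself lives in fs/inode.c and sleeps until i_dio_count drops to zero; the version below is reconstructed from the same patch series for reference and is not part of this diff:

static void __inode_dio_wait(struct inode *inode)
{
	wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
	DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);

	do {
		prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE);
		if (atomic_read(&inode->i_dio_count))
			schedule();
	} while (atomic_read(&inode->i_dio_count));
	finish_wait(wq, &q.wait);
}

void inode_dio_wait(struct inode *inode)
{
	if (atomic_read(&inode->i_dio_count))
		__inode_dio_wait(inode);
}
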
309 int hfsplus_file_fsync(struct file *file, int datasync) 311 int hfsplus_file_fsync(struct file *file, int datasync)
310 { 312 {
311 struct inode *inode = file->f_mapping->host; 313 struct inode *inode = file->f_mapping->host;
312 struct hfsplus_inode_info *hip = HFSPLUS_I(inode); 314 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
313 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); 315 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
314 int error = 0, error2; 316 int error = 0, error2;
315 317
316 /* 318 /*
317 * Sync inode metadata into the catalog and extent trees. 319 * Sync inode metadata into the catalog and extent trees.
318 */ 320 */
319 sync_inode_metadata(inode, 1); 321 sync_inode_metadata(inode, 1);
320 322
321 /* 323 /*
322 * And explicitly write out the btrees. 324 * And explicitly write out the btrees.
323 */ 325 */
324 if (test_and_clear_bit(HFSPLUS_I_CAT_DIRTY, &hip->flags)) 326 if (test_and_clear_bit(HFSPLUS_I_CAT_DIRTY, &hip->flags))
325 error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping); 327 error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping);
326 328
327 if (test_and_clear_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags)) { 329 if (test_and_clear_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags)) {
328 error2 = 330 error2 =
329 filemap_write_and_wait(sbi->ext_tree->inode->i_mapping); 331 filemap_write_and_wait(sbi->ext_tree->inode->i_mapping);
330 if (!error) 332 if (!error)
331 error = error2; 333 error = error2;
332 } 334 }
333 335
334 if (test_and_clear_bit(HFSPLUS_I_ALLOC_DIRTY, &hip->flags)) { 336 if (test_and_clear_bit(HFSPLUS_I_ALLOC_DIRTY, &hip->flags)) {
335 error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping); 337 error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);
336 if (!error) 338 if (!error)
337 error = error2; 339 error = error2;
338 } 340 }
339 341
340 if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) 342 if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
341 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); 343 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
342 344
343 return error; 345 return error;
344 } 346 }
345 347
346 static const struct inode_operations hfsplus_file_inode_operations = { 348 static const struct inode_operations hfsplus_file_inode_operations = {
347 .lookup = hfsplus_file_lookup, 349 .lookup = hfsplus_file_lookup,
348 .truncate = hfsplus_file_truncate, 350 .truncate = hfsplus_file_truncate,
349 .setattr = hfsplus_setattr, 351 .setattr = hfsplus_setattr,
350 .setxattr = hfsplus_setxattr, 352 .setxattr = hfsplus_setxattr,
351 .getxattr = hfsplus_getxattr, 353 .getxattr = hfsplus_getxattr,
352 .listxattr = hfsplus_listxattr, 354 .listxattr = hfsplus_listxattr,
353 }; 355 };
354 356
355 static const struct file_operations hfsplus_file_operations = { 357 static const struct file_operations hfsplus_file_operations = {
356 .llseek = generic_file_llseek, 358 .llseek = generic_file_llseek,
357 .read = do_sync_read, 359 .read = do_sync_read,
358 .aio_read = generic_file_aio_read, 360 .aio_read = generic_file_aio_read,
359 .write = do_sync_write, 361 .write = do_sync_write,
360 .aio_write = generic_file_aio_write, 362 .aio_write = generic_file_aio_write,
361 .mmap = generic_file_mmap, 363 .mmap = generic_file_mmap,
362 .splice_read = generic_file_splice_read, 364 .splice_read = generic_file_splice_read,
363 .fsync = hfsplus_file_fsync, 365 .fsync = hfsplus_file_fsync,
364 .open = hfsplus_file_open, 366 .open = hfsplus_file_open,
365 .release = hfsplus_file_release, 367 .release = hfsplus_file_release,
366 .unlocked_ioctl = hfsplus_ioctl, 368 .unlocked_ioctl = hfsplus_ioctl,
367 }; 369 };
368 370
369 struct inode *hfsplus_new_inode(struct super_block *sb, int mode) 371 struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
370 { 372 {
371 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); 373 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
372 struct inode *inode = new_inode(sb); 374 struct inode *inode = new_inode(sb);
373 struct hfsplus_inode_info *hip; 375 struct hfsplus_inode_info *hip;
374 376
375 if (!inode) 377 if (!inode)
376 return NULL; 378 return NULL;
377 379
378 inode->i_ino = sbi->next_cnid++; 380 inode->i_ino = sbi->next_cnid++;
379 inode->i_mode = mode; 381 inode->i_mode = mode;
380 inode->i_uid = current_fsuid(); 382 inode->i_uid = current_fsuid();
381 inode->i_gid = current_fsgid(); 383 inode->i_gid = current_fsgid();
382 inode->i_nlink = 1; 384 inode->i_nlink = 1;
383 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 385 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
384 386
385 hip = HFSPLUS_I(inode); 387 hip = HFSPLUS_I(inode);
386 INIT_LIST_HEAD(&hip->open_dir_list); 388 INIT_LIST_HEAD(&hip->open_dir_list);
387 mutex_init(&hip->extents_lock); 389 mutex_init(&hip->extents_lock);
388 atomic_set(&hip->opencnt, 0); 390 atomic_set(&hip->opencnt, 0);
389 hip->extent_state = 0; 391 hip->extent_state = 0;
390 hip->flags = 0; 392 hip->flags = 0;
391 memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec)); 393 memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
392 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); 394 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
393 hip->alloc_blocks = 0; 395 hip->alloc_blocks = 0;
394 hip->first_blocks = 0; 396 hip->first_blocks = 0;
395 hip->cached_start = 0; 397 hip->cached_start = 0;
396 hip->cached_blocks = 0; 398 hip->cached_blocks = 0;
397 hip->phys_size = 0; 399 hip->phys_size = 0;
398 hip->fs_blocks = 0; 400 hip->fs_blocks = 0;
399 hip->rsrc_inode = NULL; 401 hip->rsrc_inode = NULL;
400 if (S_ISDIR(inode->i_mode)) { 402 if (S_ISDIR(inode->i_mode)) {
401 inode->i_size = 2; 403 inode->i_size = 2;
402 sbi->folder_count++; 404 sbi->folder_count++;
403 inode->i_op = &hfsplus_dir_inode_operations; 405 inode->i_op = &hfsplus_dir_inode_operations;
404 inode->i_fop = &hfsplus_dir_operations; 406 inode->i_fop = &hfsplus_dir_operations;
405 } else if (S_ISREG(inode->i_mode)) { 407 } else if (S_ISREG(inode->i_mode)) {
406 sbi->file_count++; 408 sbi->file_count++;
407 inode->i_op = &hfsplus_file_inode_operations; 409 inode->i_op = &hfsplus_file_inode_operations;
408 inode->i_fop = &hfsplus_file_operations; 410 inode->i_fop = &hfsplus_file_operations;
409 inode->i_mapping->a_ops = &hfsplus_aops; 411 inode->i_mapping->a_ops = &hfsplus_aops;
410 hip->clump_blocks = sbi->data_clump_blocks; 412 hip->clump_blocks = sbi->data_clump_blocks;
411 } else if (S_ISLNK(inode->i_mode)) { 413 } else if (S_ISLNK(inode->i_mode)) {
412 sbi->file_count++; 414 sbi->file_count++;
413 inode->i_op = &page_symlink_inode_operations; 415 inode->i_op = &page_symlink_inode_operations;
414 inode->i_mapping->a_ops = &hfsplus_aops; 416 inode->i_mapping->a_ops = &hfsplus_aops;
415 hip->clump_blocks = 1; 417 hip->clump_blocks = 1;
416 } else 418 } else
417 sbi->file_count++; 419 sbi->file_count++;
418 insert_inode_hash(inode); 420 insert_inode_hash(inode);
419 mark_inode_dirty(inode); 421 mark_inode_dirty(inode);
420 sb->s_dirt = 1; 422 sb->s_dirt = 1;
421 423
422 return inode; 424 return inode;
423 } 425 }
424 426
425 void hfsplus_delete_inode(struct inode *inode) 427 void hfsplus_delete_inode(struct inode *inode)
426 { 428 {
427 struct super_block *sb = inode->i_sb; 429 struct super_block *sb = inode->i_sb;
428 430
429 if (S_ISDIR(inode->i_mode)) { 431 if (S_ISDIR(inode->i_mode)) {
430 HFSPLUS_SB(sb)->folder_count--; 432 HFSPLUS_SB(sb)->folder_count--;
431 sb->s_dirt = 1; 433 sb->s_dirt = 1;
432 return; 434 return;
433 } 435 }
434 HFSPLUS_SB(sb)->file_count--; 436 HFSPLUS_SB(sb)->file_count--;
435 if (S_ISREG(inode->i_mode)) { 437 if (S_ISREG(inode->i_mode)) {
436 if (!inode->i_nlink) { 438 if (!inode->i_nlink) {
437 inode->i_size = 0; 439 inode->i_size = 0;
438 hfsplus_file_truncate(inode); 440 hfsplus_file_truncate(inode);
439 } 441 }
440 } else if (S_ISLNK(inode->i_mode)) { 442 } else if (S_ISLNK(inode->i_mode)) {
441 inode->i_size = 0; 443 inode->i_size = 0;
442 hfsplus_file_truncate(inode); 444 hfsplus_file_truncate(inode);
443 } 445 }
444 sb->s_dirt = 1; 446 sb->s_dirt = 1;
445 } 447 }
446 448
447 void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork) 449 void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
448 { 450 {
449 struct super_block *sb = inode->i_sb; 451 struct super_block *sb = inode->i_sb;
450 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); 452 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
451 struct hfsplus_inode_info *hip = HFSPLUS_I(inode); 453 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
452 u32 count; 454 u32 count;
453 int i; 455 int i;
454 456
455 memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec)); 457 memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec));
456 for (count = 0, i = 0; i < 8; i++) 458 for (count = 0, i = 0; i < 8; i++)
457 count += be32_to_cpu(fork->extents[i].block_count); 459 count += be32_to_cpu(fork->extents[i].block_count);
458 hip->first_blocks = count; 460 hip->first_blocks = count;
459 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); 461 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
460 hip->cached_start = 0; 462 hip->cached_start = 0;
461 hip->cached_blocks = 0; 463 hip->cached_blocks = 0;
462 464
463 hip->alloc_blocks = be32_to_cpu(fork->total_blocks); 465 hip->alloc_blocks = be32_to_cpu(fork->total_blocks);
464 hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size); 466 hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size);
465 hip->fs_blocks = 467 hip->fs_blocks =
466 (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 468 (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
467 inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits); 469 inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
468 hip->clump_blocks = 470 hip->clump_blocks =
469 be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift; 471 be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift;
470 if (!hip->clump_blocks) { 472 if (!hip->clump_blocks) {
471 hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ? 473 hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ?
472 sbi->rsrc_clump_blocks : 474 sbi->rsrc_clump_blocks :
473 sbi->data_clump_blocks; 475 sbi->data_clump_blocks;
474 } 476 }
475 } 477 }
476 478
477 void hfsplus_inode_write_fork(struct inode *inode, 479 void hfsplus_inode_write_fork(struct inode *inode,
478 struct hfsplus_fork_raw *fork) 480 struct hfsplus_fork_raw *fork)
479 { 481 {
480 memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents, 482 memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents,
481 sizeof(hfsplus_extent_rec)); 483 sizeof(hfsplus_extent_rec));
482 fork->total_size = cpu_to_be64(inode->i_size); 484 fork->total_size = cpu_to_be64(inode->i_size);
483 fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks); 485 fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks);
484 } 486 }
485 487
486 int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) 488 int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
487 { 489 {
488 hfsplus_cat_entry entry; 490 hfsplus_cat_entry entry;
489 int res = 0; 491 int res = 0;
490 u16 type; 492 u16 type;
491 493
492 type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset); 494 type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset);
493 495
494 HFSPLUS_I(inode)->linkid = 0; 496 HFSPLUS_I(inode)->linkid = 0;
495 if (type == HFSPLUS_FOLDER) { 497 if (type == HFSPLUS_FOLDER) {
496 struct hfsplus_cat_folder *folder = &entry.folder; 498 struct hfsplus_cat_folder *folder = &entry.folder;
497 499
498 if (fd->entrylength < sizeof(struct hfsplus_cat_folder)) 500 if (fd->entrylength < sizeof(struct hfsplus_cat_folder))
499 /* panic? */; 501 /* panic? */;
500 hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, 502 hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
501 sizeof(struct hfsplus_cat_folder)); 503 sizeof(struct hfsplus_cat_folder));
502 hfsplus_get_perms(inode, &folder->permissions, 1); 504 hfsplus_get_perms(inode, &folder->permissions, 1);
503 inode->i_nlink = 1; 505 inode->i_nlink = 1;
504 inode->i_size = 2 + be32_to_cpu(folder->valence); 506 inode->i_size = 2 + be32_to_cpu(folder->valence);
505 inode->i_atime = hfsp_mt2ut(folder->access_date); 507 inode->i_atime = hfsp_mt2ut(folder->access_date);
506 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); 508 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
507 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); 509 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date);
508 HFSPLUS_I(inode)->create_date = folder->create_date; 510 HFSPLUS_I(inode)->create_date = folder->create_date;
509 HFSPLUS_I(inode)->fs_blocks = 0; 511 HFSPLUS_I(inode)->fs_blocks = 0;
510 inode->i_op = &hfsplus_dir_inode_operations; 512 inode->i_op = &hfsplus_dir_inode_operations;
511 inode->i_fop = &hfsplus_dir_operations; 513 inode->i_fop = &hfsplus_dir_operations;
512 } else if (type == HFSPLUS_FILE) { 514 } else if (type == HFSPLUS_FILE) {
513 struct hfsplus_cat_file *file = &entry.file; 515 struct hfsplus_cat_file *file = &entry.file;
514 516
515 if (fd->entrylength < sizeof(struct hfsplus_cat_file)) 517 if (fd->entrylength < sizeof(struct hfsplus_cat_file))
516 /* panic? */; 518 /* panic? */;
517 hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, 519 hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
518 sizeof(struct hfsplus_cat_file)); 520 sizeof(struct hfsplus_cat_file));
519 521
520 hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ? 522 hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ?
521 &file->rsrc_fork : &file->data_fork); 523 &file->rsrc_fork : &file->data_fork);
522 hfsplus_get_perms(inode, &file->permissions, 0); 524 hfsplus_get_perms(inode, &file->permissions, 0);
523 inode->i_nlink = 1; 525 inode->i_nlink = 1;
524 if (S_ISREG(inode->i_mode)) { 526 if (S_ISREG(inode->i_mode)) {
525 if (file->permissions.dev) 527 if (file->permissions.dev)
526 inode->i_nlink = 528 inode->i_nlink =
527 be32_to_cpu(file->permissions.dev); 529 be32_to_cpu(file->permissions.dev);
528 inode->i_op = &hfsplus_file_inode_operations; 530 inode->i_op = &hfsplus_file_inode_operations;
529 inode->i_fop = &hfsplus_file_operations; 531 inode->i_fop = &hfsplus_file_operations;
530 inode->i_mapping->a_ops = &hfsplus_aops; 532 inode->i_mapping->a_ops = &hfsplus_aops;
531 } else if (S_ISLNK(inode->i_mode)) { 533 } else if (S_ISLNK(inode->i_mode)) {
532 inode->i_op = &page_symlink_inode_operations; 534 inode->i_op = &page_symlink_inode_operations;
533 inode->i_mapping->a_ops = &hfsplus_aops; 535 inode->i_mapping->a_ops = &hfsplus_aops;
534 } else { 536 } else {
535 init_special_inode(inode, inode->i_mode, 537 init_special_inode(inode, inode->i_mode,
536 be32_to_cpu(file->permissions.dev)); 538 be32_to_cpu(file->permissions.dev));
537 } 539 }
538 inode->i_atime = hfsp_mt2ut(file->access_date); 540 inode->i_atime = hfsp_mt2ut(file->access_date);
539 inode->i_mtime = hfsp_mt2ut(file->content_mod_date); 541 inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
540 inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); 542 inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date);
541 HFSPLUS_I(inode)->create_date = file->create_date; 543 HFSPLUS_I(inode)->create_date = file->create_date;
542 } else { 544 } else {
543 printk(KERN_ERR "hfs: bad catalog entry used to create inode\n"); 545 printk(KERN_ERR "hfs: bad catalog entry used to create inode\n");
544 res = -EIO; 546 res = -EIO;
545 } 547 }
546 return res; 548 return res;
547 } 549 }
548 550
549 int hfsplus_cat_write_inode(struct inode *inode) 551 int hfsplus_cat_write_inode(struct inode *inode)
550 { 552 {
551 struct inode *main_inode = inode; 553 struct inode *main_inode = inode;
552 struct hfs_find_data fd; 554 struct hfs_find_data fd;
553 hfsplus_cat_entry entry; 555 hfsplus_cat_entry entry;
554 556
555 if (HFSPLUS_IS_RSRC(inode)) 557 if (HFSPLUS_IS_RSRC(inode))
556 main_inode = HFSPLUS_I(inode)->rsrc_inode; 558 main_inode = HFSPLUS_I(inode)->rsrc_inode;
557 559
558 if (!main_inode->i_nlink) 560 if (!main_inode->i_nlink)
559 return 0; 561 return 0;
560 562
561 if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd)) 563 if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd))
562 /* panic? */ 564 /* panic? */
563 return -EIO; 565 return -EIO;
564 566
565 if (hfsplus_find_cat(main_inode->i_sb, main_inode->i_ino, &fd)) 567 if (hfsplus_find_cat(main_inode->i_sb, main_inode->i_ino, &fd))
566 /* panic? */ 568 /* panic? */
567 goto out; 569 goto out;
568 570
569 if (S_ISDIR(main_inode->i_mode)) { 571 if (S_ISDIR(main_inode->i_mode)) {
570 struct hfsplus_cat_folder *folder = &entry.folder; 572 struct hfsplus_cat_folder *folder = &entry.folder;
571 573
572 if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) 574 if (fd.entrylength < sizeof(struct hfsplus_cat_folder))
573 /* panic? */; 575 /* panic? */;
574 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, 576 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
575 sizeof(struct hfsplus_cat_folder)); 577 sizeof(struct hfsplus_cat_folder));
576 /* simple node checks? */ 578 /* simple node checks? */
577 hfsplus_cat_set_perms(inode, &folder->permissions); 579 hfsplus_cat_set_perms(inode, &folder->permissions);
578 folder->access_date = hfsp_ut2mt(inode->i_atime); 580 folder->access_date = hfsp_ut2mt(inode->i_atime);
579 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime); 581 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
580 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); 582 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
581 folder->valence = cpu_to_be32(inode->i_size - 2); 583 folder->valence = cpu_to_be32(inode->i_size - 2);
582 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, 584 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
583 sizeof(struct hfsplus_cat_folder)); 585 sizeof(struct hfsplus_cat_folder));
584 } else if (HFSPLUS_IS_RSRC(inode)) { 586 } else if (HFSPLUS_IS_RSRC(inode)) {
585 struct hfsplus_cat_file *file = &entry.file; 587 struct hfsplus_cat_file *file = &entry.file;
586 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, 588 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
587 sizeof(struct hfsplus_cat_file)); 589 sizeof(struct hfsplus_cat_file));
588 hfsplus_inode_write_fork(inode, &file->rsrc_fork); 590 hfsplus_inode_write_fork(inode, &file->rsrc_fork);
589 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, 591 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
590 sizeof(struct hfsplus_cat_file)); 592 sizeof(struct hfsplus_cat_file));
591 } else { 593 } else {
592 struct hfsplus_cat_file *file = &entry.file; 594 struct hfsplus_cat_file *file = &entry.file;
593 595
594 if (fd.entrylength < sizeof(struct hfsplus_cat_file)) 596 if (fd.entrylength < sizeof(struct hfsplus_cat_file))
595 /* panic? */; 597 /* panic? */;
596 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, 598 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
597 sizeof(struct hfsplus_cat_file)); 599 sizeof(struct hfsplus_cat_file));
598 hfsplus_inode_write_fork(inode, &file->data_fork); 600 hfsplus_inode_write_fork(inode, &file->data_fork);
599 hfsplus_cat_set_perms(inode, &file->permissions); 601 hfsplus_cat_set_perms(inode, &file->permissions);
600 if (HFSPLUS_FLG_IMMUTABLE & 602 if (HFSPLUS_FLG_IMMUTABLE &
601 (file->permissions.rootflags | 603 (file->permissions.rootflags |
602 file->permissions.userflags)) 604 file->permissions.userflags))
603 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); 605 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
604 else 606 else
605 file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED); 607 file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED);
606 file->access_date = hfsp_ut2mt(inode->i_atime); 608 file->access_date = hfsp_ut2mt(inode->i_atime);
607 file->content_mod_date = hfsp_ut2mt(inode->i_mtime); 609 file->content_mod_date = hfsp_ut2mt(inode->i_mtime);
608 file->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); 610 file->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
609 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, 611 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
610 sizeof(struct hfsplus_cat_file)); 612 sizeof(struct hfsplus_cat_file));
611 } 613 }
612 614
613 set_bit(HFSPLUS_I_CAT_DIRTY, &HFSPLUS_I(inode)->flags); 615 set_bit(HFSPLUS_I_CAT_DIRTY, &HFSPLUS_I(inode)->flags);
614 out: 616 out:
615 hfs_find_exit(&fd); 617 hfs_find_exit(&fd);
616 return 0; 618 return 0;
617 } 619 }
618 620
1 /* 1 /*
2 * Copyright (C) International Business Machines Corp., 2000-2002 2 * Copyright (C) International Business Machines Corp., 2000-2002
3 * Portions Copyright (C) Christoph Hellwig, 2001-2002 3 * Portions Copyright (C) Christoph Hellwig, 2001-2002
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or 7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version. 8 * (at your option) any later version.
9 * 9 *
10 * This program is distributed in the hope that it will be useful, 10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details. 13 * the GNU General Public License for more details.
14 * 14 *
15 * You should have received a copy of the GNU General Public License 15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software 16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */ 18 */
19 19
20 #include <linux/mm.h> 20 #include <linux/mm.h>
21 #include <linux/fs.h> 21 #include <linux/fs.h>
22 #include <linux/quotaops.h> 22 #include <linux/quotaops.h>
23 #include "jfs_incore.h" 23 #include "jfs_incore.h"
24 #include "jfs_inode.h" 24 #include "jfs_inode.h"
25 #include "jfs_dmap.h" 25 #include "jfs_dmap.h"
26 #include "jfs_txnmgr.h" 26 #include "jfs_txnmgr.h"
27 #include "jfs_xattr.h" 27 #include "jfs_xattr.h"
28 #include "jfs_acl.h" 28 #include "jfs_acl.h"
29 #include "jfs_debug.h" 29 #include "jfs_debug.h"
30 30
31 int jfs_fsync(struct file *file, int datasync) 31 int jfs_fsync(struct file *file, int datasync)
32 { 32 {
33 struct inode *inode = file->f_mapping->host; 33 struct inode *inode = file->f_mapping->host;
34 int rc = 0; 34 int rc = 0;
35 35
36 if (!(inode->i_state & I_DIRTY) || 36 if (!(inode->i_state & I_DIRTY) ||
37 (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { 37 (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
38 /* Make sure committed changes hit the disk */ 38 /* Make sure committed changes hit the disk */
39 jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); 39 jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
40 return rc; 40 return rc;
41 } 41 }
42 42
43 rc |= jfs_commit_inode(inode, 1); 43 rc |= jfs_commit_inode(inode, 1);
44 44
45 return rc ? -EIO : 0; 45 return rc ? -EIO : 0;
46 } 46 }
47 47
48 static int jfs_open(struct inode *inode, struct file *file) 48 static int jfs_open(struct inode *inode, struct file *file)
49 { 49 {
50 int rc; 50 int rc;
51 51
52 if ((rc = dquot_file_open(inode, file))) 52 if ((rc = dquot_file_open(inode, file)))
53 return rc; 53 return rc;
54 54
55 /* 55 /*
56 * We attempt to allow only one "active" file open per aggregate 56 * We attempt to allow only one "active" file open per aggregate
57 * group. Otherwise, appending to files in parallel can cause 57 * group. Otherwise, appending to files in parallel can cause
58 * fragmentation within the files. 58 * fragmentation within the files.
59 * 59 *
60 * If the file is empty, it was probably just created and going 60 * If the file is empty, it was probably just created and going
61 * to be written to. If it has a size, we'll hold off until the 61 * to be written to. If it has a size, we'll hold off until the
62 * file is actually grown. 62 * file is actually grown.
63 */ 63 */
64 if (S_ISREG(inode->i_mode) && file->f_mode & FMODE_WRITE && 64 if (S_ISREG(inode->i_mode) && file->f_mode & FMODE_WRITE &&
65 (inode->i_size == 0)) { 65 (inode->i_size == 0)) {
66 struct jfs_inode_info *ji = JFS_IP(inode); 66 struct jfs_inode_info *ji = JFS_IP(inode);
67 spin_lock_irq(&ji->ag_lock); 67 spin_lock_irq(&ji->ag_lock);
68 if (ji->active_ag == -1) { 68 if (ji->active_ag == -1) {
69 struct jfs_sb_info *jfs_sb = JFS_SBI(inode->i_sb); 69 struct jfs_sb_info *jfs_sb = JFS_SBI(inode->i_sb);
70 ji->active_ag = BLKTOAG(addressPXD(&ji->ixpxd), jfs_sb); 70 ji->active_ag = BLKTOAG(addressPXD(&ji->ixpxd), jfs_sb);
71 atomic_inc( &jfs_sb->bmap->db_active[ji->active_ag]); 71 atomic_inc( &jfs_sb->bmap->db_active[ji->active_ag]);
72 } 72 }
73 spin_unlock_irq(&ji->ag_lock); 73 spin_unlock_irq(&ji->ag_lock);
74 } 74 }
75 75
76 return 0; 76 return 0;
77 } 77 }
78 static int jfs_release(struct inode *inode, struct file *file) 78 static int jfs_release(struct inode *inode, struct file *file)
79 { 79 {
80 struct jfs_inode_info *ji = JFS_IP(inode); 80 struct jfs_inode_info *ji = JFS_IP(inode);
81 81
82 spin_lock_irq(&ji->ag_lock); 82 spin_lock_irq(&ji->ag_lock);
83 if (ji->active_ag != -1) { 83 if (ji->active_ag != -1) {
84 struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap; 84 struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap;
85 atomic_dec(&bmap->db_active[ji->active_ag]); 85 atomic_dec(&bmap->db_active[ji->active_ag]);
86 ji->active_ag = -1; 86 ji->active_ag = -1;
87 } 87 }
88 spin_unlock_irq(&ji->ag_lock); 88 spin_unlock_irq(&ji->ag_lock);
89 89
90 return 0; 90 return 0;
91 } 91 }
92 92
93 int jfs_setattr(struct dentry *dentry, struct iattr *iattr) 93 int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
94 { 94 {
95 struct inode *inode = dentry->d_inode; 95 struct inode *inode = dentry->d_inode;
96 int rc; 96 int rc;
97 97
98 rc = inode_change_ok(inode, iattr); 98 rc = inode_change_ok(inode, iattr);
99 if (rc) 99 if (rc)
100 return rc; 100 return rc;
101 101
102 if (is_quota_modification(inode, iattr)) 102 if (is_quota_modification(inode, iattr))
103 dquot_initialize(inode); 103 dquot_initialize(inode);
104 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || 104 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
105 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { 105 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
106 rc = dquot_transfer(inode, iattr); 106 rc = dquot_transfer(inode, iattr);
107 if (rc) 107 if (rc)
108 return rc; 108 return rc;
109 } 109 }
110 110
111 if ((iattr->ia_valid & ATTR_SIZE) && 111 if ((iattr->ia_valid & ATTR_SIZE) &&
112 iattr->ia_size != i_size_read(inode)) { 112 iattr->ia_size != i_size_read(inode)) {
113 inode_dio_wait(inode);
114
113 rc = vmtruncate(inode, iattr->ia_size); 115 rc = vmtruncate(inode, iattr->ia_size);
114 if (rc) 116 if (rc)
115 return rc; 117 return rc;
116 } 118 }
117 119
118 setattr_copy(inode, iattr); 120 setattr_copy(inode, iattr);
119 mark_inode_dirty(inode); 121 mark_inode_dirty(inode);
120 122
121 if (iattr->ia_valid & ATTR_MODE) 123 if (iattr->ia_valid & ATTR_MODE)
122 rc = jfs_acl_chmod(inode); 124 rc = jfs_acl_chmod(inode);
123 return rc; 125 return rc;
124 } 126 }
125 127
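jfs_setattr() is the third ->setattr in this section to gain the wait. The payoff of moving it out of the generic path is that a filesystem can drain direct I/O while already holding a lock that its own dio submission path also takes, closing the window in which a new dio reference could appear before truncation. A hypothetical illustration (EXAMPLE_I and io_lock are invented names, not taken from any of these filesystems):

static int example_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error = 0;

	mutex_lock(&EXAMPLE_I(inode)->io_lock);	/* blocks new dio submitters */
	inode_dio_wait(inode);			/* drain outstanding requests */
	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size != i_size_read(inode))
		error = vmtruncate(inode, attr->ia_size);
	mutex_unlock(&EXAMPLE_I(inode)->io_lock);
	return error;
}
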
126 const struct inode_operations jfs_file_inode_operations = { 128 const struct inode_operations jfs_file_inode_operations = {
127 .truncate = jfs_truncate, 129 .truncate = jfs_truncate,
128 .setxattr = jfs_setxattr, 130 .setxattr = jfs_setxattr,
129 .getxattr = jfs_getxattr, 131 .getxattr = jfs_getxattr,
130 .listxattr = jfs_listxattr, 132 .listxattr = jfs_listxattr,
131 .removexattr = jfs_removexattr, 133 .removexattr = jfs_removexattr,
132 .setattr = jfs_setattr, 134 .setattr = jfs_setattr,
133 #ifdef CONFIG_JFS_POSIX_ACL 135 #ifdef CONFIG_JFS_POSIX_ACL
134 .check_acl = jfs_check_acl, 136 .check_acl = jfs_check_acl,
135 #endif 137 #endif
136 }; 138 };
137 139
138 const struct file_operations jfs_file_operations = { 140 const struct file_operations jfs_file_operations = {
139 .open = jfs_open, 141 .open = jfs_open,
140 .llseek = generic_file_llseek, 142 .llseek = generic_file_llseek,
141 .write = do_sync_write, 143 .write = do_sync_write,
142 .read = do_sync_read, 144 .read = do_sync_read,
143 .aio_read = generic_file_aio_read, 145 .aio_read = generic_file_aio_read,
144 .aio_write = generic_file_aio_write, 146 .aio_write = generic_file_aio_write,
145 .mmap = generic_file_mmap, 147 .mmap = generic_file_mmap,
146 .splice_read = generic_file_splice_read, 148 .splice_read = generic_file_splice_read,
147 .splice_write = generic_file_splice_write, 149 .splice_write = generic_file_splice_write,
148 .fsync = jfs_fsync, 150 .fsync = jfs_fsync,
149 .release = jfs_release, 151 .release = jfs_release,
150 .unlocked_ioctl = jfs_ioctl, 152 .unlocked_ioctl = jfs_ioctl,
151 #ifdef CONFIG_COMPAT 153 #ifdef CONFIG_COMPAT
152 .compat_ioctl = jfs_compat_ioctl, 154 .compat_ioctl = jfs_compat_ioctl,
153 #endif 155 #endif
154 }; 156 };
155 157
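The jfs hunk above is representative of what this commit does in every affected filesystem: the inode_dio_wait() call moves out of the VFS and into ->setattr, immediately before the size change. A minimal sketch of the resulting shape, modelled on jfs_setattr() above (the function name and the bare use of vmtruncate() are illustrative; each filesystem keeps its own quota, ACL and locking details around this core):

	static int example_setattr(struct dentry *dentry, struct iattr *iattr)
	{
		struct inode *inode = dentry->d_inode;
		int rc;

		rc = inode_change_ok(inode, iattr);
		if (rc)
			return rc;

		if ((iattr->ia_valid & ATTR_SIZE) &&
		    iattr->ia_size != i_size_read(inode)) {
			/*
			 * Wait for in-flight direct I/O while any locks
			 * taken by this ->setattr are still held; this is
			 * what moving the call out of the VFS allows.
			 */
			inode_dio_wait(inode);

			rc = vmtruncate(inode, iattr->ia_size);
			if (rc)
				return rc;
		}

		setattr_copy(inode, iattr);
		mark_inode_dirty(inode);
		return 0;
	}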
1 /* 1 /*
2 * inode.c - NILFS inode operations. 2 * inode.c - NILFS inode operations.
3 * 3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. 4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or 8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version. 9 * (at your option) any later version.
10 * 10 *
11 * This program is distributed in the hope that it will be useful, 11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details. 14 * GNU General Public License for more details.
15 * 15 *
16 * You should have received a copy of the GNU General Public License 16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 * 19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net> 20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 * 21 *
22 */ 22 */
23 23
24 #include <linux/buffer_head.h> 24 #include <linux/buffer_head.h>
25 #include <linux/gfp.h> 25 #include <linux/gfp.h>
26 #include <linux/mpage.h> 26 #include <linux/mpage.h>
27 #include <linux/writeback.h> 27 #include <linux/writeback.h>
28 #include <linux/uio.h> 28 #include <linux/uio.h>
29 #include "nilfs.h" 29 #include "nilfs.h"
30 #include "btnode.h" 30 #include "btnode.h"
31 #include "segment.h" 31 #include "segment.h"
32 #include "page.h" 32 #include "page.h"
33 #include "mdt.h" 33 #include "mdt.h"
34 #include "cpfile.h" 34 #include "cpfile.h"
35 #include "ifile.h" 35 #include "ifile.h"
36 36
37 struct nilfs_iget_args { 37 struct nilfs_iget_args {
38 u64 ino; 38 u64 ino;
39 __u64 cno; 39 __u64 cno;
40 struct nilfs_root *root; 40 struct nilfs_root *root;
41 int for_gc; 41 int for_gc;
42 }; 42 };
43 43
44 void nilfs_inode_add_blocks(struct inode *inode, int n) 44 void nilfs_inode_add_blocks(struct inode *inode, int n)
45 { 45 {
46 struct nilfs_root *root = NILFS_I(inode)->i_root; 46 struct nilfs_root *root = NILFS_I(inode)->i_root;
47 47
48 inode_add_bytes(inode, (1 << inode->i_blkbits) * n); 48 inode_add_bytes(inode, (1 << inode->i_blkbits) * n);
49 if (root) 49 if (root)
50 atomic_add(n, &root->blocks_count); 50 atomic_add(n, &root->blocks_count);
51 } 51 }
52 52
53 void nilfs_inode_sub_blocks(struct inode *inode, int n) 53 void nilfs_inode_sub_blocks(struct inode *inode, int n)
54 { 54 {
55 struct nilfs_root *root = NILFS_I(inode)->i_root; 55 struct nilfs_root *root = NILFS_I(inode)->i_root;
56 56
57 inode_sub_bytes(inode, (1 << inode->i_blkbits) * n); 57 inode_sub_bytes(inode, (1 << inode->i_blkbits) * n);
58 if (root) 58 if (root)
59 atomic_sub(n, &root->blocks_count); 59 atomic_sub(n, &root->blocks_count);
60 } 60 }
61 61
62 /** 62 /**
63 * nilfs_get_block() - get a file block on the filesystem (callback function) 63 * nilfs_get_block() - get a file block on the filesystem (callback function)
64 * @inode - inode struct of the target file 64 * @inode - inode struct of the target file
65 * @blkoff - file block number 65 * @blkoff - file block number
66 * @bh_result - buffer head to be mapped on 66 * @bh_result - buffer head to be mapped on
67 * @create - indicate whether allocating the block or not when it has not 67 * @create - indicate whether allocating the block or not when it has not
68 * been allocated yet. 68 * been allocated yet.
69 * 69 *
70 * This function does not issue an actual read request for the specified 70 * This function does not issue an actual read request for the specified
71 * data block; that is done by the VFS. 71 * data block; that is done by the VFS.
72 */ 72 */
73 int nilfs_get_block(struct inode *inode, sector_t blkoff, 73 int nilfs_get_block(struct inode *inode, sector_t blkoff,
74 struct buffer_head *bh_result, int create) 74 struct buffer_head *bh_result, int create)
75 { 75 {
76 struct nilfs_inode_info *ii = NILFS_I(inode); 76 struct nilfs_inode_info *ii = NILFS_I(inode);
77 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 77 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
78 __u64 blknum = 0; 78 __u64 blknum = 0;
79 int err = 0, ret; 79 int err = 0, ret;
80 unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; 80 unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
81 81
82 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 82 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
83 ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks); 83 ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks);
84 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 84 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
85 if (ret >= 0) { /* found */ 85 if (ret >= 0) { /* found */
86 map_bh(bh_result, inode->i_sb, blknum); 86 map_bh(bh_result, inode->i_sb, blknum);
87 if (ret > 0) 87 if (ret > 0)
88 bh_result->b_size = (ret << inode->i_blkbits); 88 bh_result->b_size = (ret << inode->i_blkbits);
89 goto out; 89 goto out;
90 } 90 }
91 /* data block was not found */ 91 /* data block was not found */
92 if (ret == -ENOENT && create) { 92 if (ret == -ENOENT && create) {
93 struct nilfs_transaction_info ti; 93 struct nilfs_transaction_info ti;
94 94
95 bh_result->b_blocknr = 0; 95 bh_result->b_blocknr = 0;
96 err = nilfs_transaction_begin(inode->i_sb, &ti, 1); 96 err = nilfs_transaction_begin(inode->i_sb, &ti, 1);
97 if (unlikely(err)) 97 if (unlikely(err))
98 goto out; 98 goto out;
99 err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff, 99 err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff,
100 (unsigned long)bh_result); 100 (unsigned long)bh_result);
101 if (unlikely(err != 0)) { 101 if (unlikely(err != 0)) {
102 if (err == -EEXIST) { 102 if (err == -EEXIST) {
103 /* 103 /*
104 * The get_block() function can be called 104 * The get_block() function can be called
105 * concurrently for the same inode. In that 105 * concurrently for the same inode. In that
106 * case, however, the page that owns this 106 * case, however, the page that owns this
107 * block must already be locked. 107 * block must already be locked.
108 */ 108 */
109 printk(KERN_WARNING 109 printk(KERN_WARNING
110 "nilfs_get_block: a race condition " 110 "nilfs_get_block: a race condition "
111 "while inserting a data block. " 111 "while inserting a data block. "
112 "(inode number=%lu, file block " 112 "(inode number=%lu, file block "
113 "offset=%llu)\n", 113 "offset=%llu)\n",
114 inode->i_ino, 114 inode->i_ino,
115 (unsigned long long)blkoff); 115 (unsigned long long)blkoff);
116 err = 0; 116 err = 0;
117 } 117 }
118 nilfs_transaction_abort(inode->i_sb); 118 nilfs_transaction_abort(inode->i_sb);
119 goto out; 119 goto out;
120 } 120 }
121 nilfs_mark_inode_dirty(inode); 121 nilfs_mark_inode_dirty(inode);
122 nilfs_transaction_commit(inode->i_sb); /* never fails */ 122 nilfs_transaction_commit(inode->i_sb); /* never fails */
123 /* Error handling should be detailed */ 123 /* Error handling should be detailed */
124 set_buffer_new(bh_result); 124 set_buffer_new(bh_result);
125 set_buffer_delay(bh_result); 125 set_buffer_delay(bh_result);
126 map_bh(bh_result, inode->i_sb, 0); /* disk block number must 126 map_bh(bh_result, inode->i_sb, 0); /* disk block number must
127 be set to the proper value later */ 127 be set to the proper value later */
128 } else if (ret == -ENOENT) { 128 } else if (ret == -ENOENT) {
129 /* a missing block is not an error (e.g. a hole); return 129 /* a missing block is not an error (e.g. a hole); return
130 without setting the mapped state flag. */ 130 without setting the mapped state flag. */
131 ; 131 ;
132 } else { 132 } else {
133 err = ret; 133 err = ret;
134 } 134 }
135 135
136 out: 136 out:
137 return err; 137 return err;
138 } 138 }
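/*
 * How a nilfs_get_block() result is consumed (a hedged sketch; the
 * caller below is illustrative, not code from this file): the caller
 * sets bh_result->b_size to the maximum mapping length it wants, and
 * on return reads the mapping back out of the buffer_head.
 *
 *	struct buffer_head bh = { .b_size = PAGE_SIZE };
 *
 *	if (!nilfs_get_block(inode, blkoff, &bh, 0) && buffer_mapped(&bh)) {
 *		bh.b_blocknr holds the starting disk block and bh.b_size
 *		the byte length of the contiguous run that was found;
 *		an unmapped result with no error is a hole.
 *	}
 */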
139 139
140 /** 140 /**
141 * nilfs_readpage() - implement readpage() method of nilfs_aops {} 141 * nilfs_readpage() - implement readpage() method of nilfs_aops {}
142 * address_space_operations. 142 * address_space_operations.
143 * @file - file struct of the file to be read 143 * @file - file struct of the file to be read
144 * @page - the page to be read 144 * @page - the page to be read
145 */ 145 */
146 static int nilfs_readpage(struct file *file, struct page *page) 146 static int nilfs_readpage(struct file *file, struct page *page)
147 { 147 {
148 return mpage_readpage(page, nilfs_get_block); 148 return mpage_readpage(page, nilfs_get_block);
149 } 149 }
150 150
151 /** 151 /**
152 * nilfs_readpages() - implement readpages() method of nilfs_aops {} 152 * nilfs_readpages() - implement readpages() method of nilfs_aops {}
153 * address_space_operations. 153 * address_space_operations.
154 * @file - file struct of the file to be read 154 * @file - file struct of the file to be read
155 * @mapping - address_space struct used for reading multiple pages 155 * @mapping - address_space struct used for reading multiple pages
156 * @pages - the pages to be read 156 * @pages - the pages to be read
157 * @nr_pages - number of pages to be read 157 * @nr_pages - number of pages to be read
158 */ 158 */
159 static int nilfs_readpages(struct file *file, struct address_space *mapping, 159 static int nilfs_readpages(struct file *file, struct address_space *mapping,
160 struct list_head *pages, unsigned nr_pages) 160 struct list_head *pages, unsigned nr_pages)
161 { 161 {
162 return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block); 162 return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block);
163 } 163 }
164 164
165 static int nilfs_writepages(struct address_space *mapping, 165 static int nilfs_writepages(struct address_space *mapping,
166 struct writeback_control *wbc) 166 struct writeback_control *wbc)
167 { 167 {
168 struct inode *inode = mapping->host; 168 struct inode *inode = mapping->host;
169 int err = 0; 169 int err = 0;
170 170
171 if (wbc->sync_mode == WB_SYNC_ALL) 171 if (wbc->sync_mode == WB_SYNC_ALL)
172 err = nilfs_construct_dsync_segment(inode->i_sb, inode, 172 err = nilfs_construct_dsync_segment(inode->i_sb, inode,
173 wbc->range_start, 173 wbc->range_start,
174 wbc->range_end); 174 wbc->range_end);
175 return err; 175 return err;
176 } 176 }
177 177
178 static int nilfs_writepage(struct page *page, struct writeback_control *wbc) 178 static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
179 { 179 {
180 struct inode *inode = page->mapping->host; 180 struct inode *inode = page->mapping->host;
181 int err; 181 int err;
182 182
183 redirty_page_for_writepage(wbc, page); 183 redirty_page_for_writepage(wbc, page);
184 unlock_page(page); 184 unlock_page(page);
185 185
186 if (wbc->sync_mode == WB_SYNC_ALL) { 186 if (wbc->sync_mode == WB_SYNC_ALL) {
187 err = nilfs_construct_segment(inode->i_sb); 187 err = nilfs_construct_segment(inode->i_sb);
188 if (unlikely(err)) 188 if (unlikely(err))
189 return err; 189 return err;
190 } else if (wbc->for_reclaim) 190 } else if (wbc->for_reclaim)
191 nilfs_flush_segment(inode->i_sb, inode->i_ino); 191 nilfs_flush_segment(inode->i_sb, inode->i_ino);
192 192
193 return 0; 193 return 0;
194 } 194 }
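/*
 * Note that this ->writepage never writes the page directly: it is
 * redirtied and handed to the segment constructor, which writes data
 * strictly in log order. WB_SYNC_ALL writeback triggers a full
 * segment construction, while writeback for reclaim merely kicks the
 * segment flusher for this inode.
 */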
195 195
196 static int nilfs_set_page_dirty(struct page *page) 196 static int nilfs_set_page_dirty(struct page *page)
197 { 197 {
198 int ret = __set_page_dirty_buffers(page); 198 int ret = __set_page_dirty_buffers(page);
199 199
200 if (ret) { 200 if (ret) {
201 struct inode *inode = page->mapping->host; 201 struct inode *inode = page->mapping->host;
202 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); 202 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
203 203
204 nilfs_set_file_dirty(inode, nr_dirty); 204 nilfs_set_file_dirty(inode, nr_dirty);
205 } 205 }
206 return ret; 206 return ret;
207 } 207 }
208 208
209 static int nilfs_write_begin(struct file *file, struct address_space *mapping, 209 static int nilfs_write_begin(struct file *file, struct address_space *mapping,
210 loff_t pos, unsigned len, unsigned flags, 210 loff_t pos, unsigned len, unsigned flags,
211 struct page **pagep, void **fsdata) 211 struct page **pagep, void **fsdata)
212 212
213 { 213 {
214 struct inode *inode = mapping->host; 214 struct inode *inode = mapping->host;
215 int err = nilfs_transaction_begin(inode->i_sb, NULL, 1); 215 int err = nilfs_transaction_begin(inode->i_sb, NULL, 1);
216 216
217 if (unlikely(err)) 217 if (unlikely(err))
218 return err; 218 return err;
219 219
220 err = block_write_begin(mapping, pos, len, flags, pagep, 220 err = block_write_begin(mapping, pos, len, flags, pagep,
221 nilfs_get_block); 221 nilfs_get_block);
222 if (unlikely(err)) { 222 if (unlikely(err)) {
223 loff_t isize = mapping->host->i_size; 223 loff_t isize = mapping->host->i_size;
224 if (pos + len > isize) 224 if (pos + len > isize)
225 vmtruncate(mapping->host, isize); 225 vmtruncate(mapping->host, isize);
226 226
227 nilfs_transaction_abort(inode->i_sb); 227 nilfs_transaction_abort(inode->i_sb);
228 } 228 }
229 return err; 229 return err;
230 } 230 }
231 231
232 static int nilfs_write_end(struct file *file, struct address_space *mapping, 232 static int nilfs_write_end(struct file *file, struct address_space *mapping,
233 loff_t pos, unsigned len, unsigned copied, 233 loff_t pos, unsigned len, unsigned copied,
234 struct page *page, void *fsdata) 234 struct page *page, void *fsdata)
235 { 235 {
236 struct inode *inode = mapping->host; 236 struct inode *inode = mapping->host;
237 unsigned start = pos & (PAGE_CACHE_SIZE - 1); 237 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
238 unsigned nr_dirty; 238 unsigned nr_dirty;
239 int err; 239 int err;
240 240
241 nr_dirty = nilfs_page_count_clean_buffers(page, start, 241 nr_dirty = nilfs_page_count_clean_buffers(page, start,
242 start + copied); 242 start + copied);
243 copied = generic_write_end(file, mapping, pos, len, copied, page, 243 copied = generic_write_end(file, mapping, pos, len, copied, page,
244 fsdata); 244 fsdata);
245 nilfs_set_file_dirty(inode, nr_dirty); 245 nilfs_set_file_dirty(inode, nr_dirty);
246 err = nilfs_transaction_commit(inode->i_sb); 246 err = nilfs_transaction_commit(inode->i_sb);
247 return err ? : copied; 247 return err ? : copied;
248 } 248 }
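/*
 * The two hooks above pair up: each successful
 * nilfs_transaction_begin() in nilfs_write_begin() is balanced either
 * by the nilfs_transaction_commit() here in nilfs_write_end() or by
 * the nilfs_transaction_abort() in write_begin's own error path, so a
 * buffered write never leaves a transaction open.
 */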
249 249
250 static ssize_t 250 static ssize_t
251 nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 251 nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
252 loff_t offset, unsigned long nr_segs) 252 loff_t offset, unsigned long nr_segs)
253 { 253 {
254 struct file *file = iocb->ki_filp; 254 struct file *file = iocb->ki_filp;
255 struct inode *inode = file->f_mapping->host; 255 struct inode *inode = file->f_mapping->host;
256 ssize_t size; 256 ssize_t size;
257 257
258 if (rw == WRITE) 258 if (rw == WRITE)
259 return 0; 259 return 0;
260 260
261 /* Needs synchronization with the cleaner */ 261 /* Needs synchronization with the cleaner */
262 size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 262 size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
263 offset, nr_segs, nilfs_get_block, NULL); 263 offset, nr_segs, nilfs_get_block, NULL);
264 264
265 /* 265 /*
266 * In case of error, an extending write may have instantiated a 266 * In case of error, an extending write may have instantiated a
267 * few blocks outside i_size. Trim these off again. 267 * few blocks outside i_size. Trim these off again.
268 */ 268 */
269 if (unlikely((rw & WRITE) && size < 0)) { 269 if (unlikely((rw & WRITE) && size < 0)) {
270 loff_t isize = i_size_read(inode); 270 loff_t isize = i_size_read(inode);
271 loff_t end = offset + iov_length(iov, nr_segs); 271 loff_t end = offset + iov_length(iov, nr_segs);
272 272
273 if (end > isize) 273 if (end > isize)
274 vmtruncate(inode, isize); 274 vmtruncate(inode, isize);
275 } 275 }
276 276
277 return size; 277 return size;
278 } 278 }
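/*
 * Since the WRITE case returns 0 early, the error-trimming code above
 * is effectively unreachable here and mirrors the generic pattern. A
 * ->direct_IO that returns 0 for a write makes the generic write path
 * fall back to buffered I/O, which is how nilfs services O_DIRECT
 * writes.
 */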
279 279
280 const struct address_space_operations nilfs_aops = { 280 const struct address_space_operations nilfs_aops = {
281 .writepage = nilfs_writepage, 281 .writepage = nilfs_writepage,
282 .readpage = nilfs_readpage, 282 .readpage = nilfs_readpage,
283 .writepages = nilfs_writepages, 283 .writepages = nilfs_writepages,
284 .set_page_dirty = nilfs_set_page_dirty, 284 .set_page_dirty = nilfs_set_page_dirty,
285 .readpages = nilfs_readpages, 285 .readpages = nilfs_readpages,
286 .write_begin = nilfs_write_begin, 286 .write_begin = nilfs_write_begin,
287 .write_end = nilfs_write_end, 287 .write_end = nilfs_write_end,
288 /* .releasepage = nilfs_releasepage, */ 288 /* .releasepage = nilfs_releasepage, */
289 .invalidatepage = block_invalidatepage, 289 .invalidatepage = block_invalidatepage,
290 .direct_IO = nilfs_direct_IO, 290 .direct_IO = nilfs_direct_IO,
291 .is_partially_uptodate = block_is_partially_uptodate, 291 .is_partially_uptodate = block_is_partially_uptodate,
292 }; 292 };
293 293
294 struct inode *nilfs_new_inode(struct inode *dir, int mode) 294 struct inode *nilfs_new_inode(struct inode *dir, int mode)
295 { 295 {
296 struct super_block *sb = dir->i_sb; 296 struct super_block *sb = dir->i_sb;
297 struct the_nilfs *nilfs = sb->s_fs_info; 297 struct the_nilfs *nilfs = sb->s_fs_info;
298 struct inode *inode; 298 struct inode *inode;
299 struct nilfs_inode_info *ii; 299 struct nilfs_inode_info *ii;
300 struct nilfs_root *root; 300 struct nilfs_root *root;
301 int err = -ENOMEM; 301 int err = -ENOMEM;
302 ino_t ino; 302 ino_t ino;
303 303
304 inode = new_inode(sb); 304 inode = new_inode(sb);
305 if (unlikely(!inode)) 305 if (unlikely(!inode))
306 goto failed; 306 goto failed;
307 307
308 mapping_set_gfp_mask(inode->i_mapping, 308 mapping_set_gfp_mask(inode->i_mapping,
309 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 309 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
310 310
311 root = NILFS_I(dir)->i_root; 311 root = NILFS_I(dir)->i_root;
312 ii = NILFS_I(inode); 312 ii = NILFS_I(inode);
313 ii->i_state = 1 << NILFS_I_NEW; 313 ii->i_state = 1 << NILFS_I_NEW;
314 ii->i_root = root; 314 ii->i_root = root;
315 315
316 err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh); 316 err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh);
317 if (unlikely(err)) 317 if (unlikely(err))
318 goto failed_ifile_create_inode; 318 goto failed_ifile_create_inode;
319 /* reference count of i_bh inherits from nilfs_mdt_read_block() */ 319 /* reference count of i_bh inherits from nilfs_mdt_read_block() */
320 320
321 atomic_inc(&root->inodes_count); 321 atomic_inc(&root->inodes_count);
322 inode_init_owner(inode, dir, mode); 322 inode_init_owner(inode, dir, mode);
323 inode->i_ino = ino; 323 inode->i_ino = ino;
324 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 324 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
325 325
326 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { 326 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
327 err = nilfs_bmap_read(ii->i_bmap, NULL); 327 err = nilfs_bmap_read(ii->i_bmap, NULL);
328 if (err < 0) 328 if (err < 0)
329 goto failed_bmap; 329 goto failed_bmap;
330 330
331 set_bit(NILFS_I_BMAP, &ii->i_state); 331 set_bit(NILFS_I_BMAP, &ii->i_state);
332 /* No lock is needed; iget() ensures it. */ 332 /* No lock is needed; iget() ensures it. */
333 } 333 }
334 334
335 ii->i_flags = nilfs_mask_flags( 335 ii->i_flags = nilfs_mask_flags(
336 mode, NILFS_I(dir)->i_flags & NILFS_FL_INHERITED); 336 mode, NILFS_I(dir)->i_flags & NILFS_FL_INHERITED);
337 337
338 /* ii->i_file_acl = 0; */ 338 /* ii->i_file_acl = 0; */
339 /* ii->i_dir_acl = 0; */ 339 /* ii->i_dir_acl = 0; */
340 ii->i_dir_start_lookup = 0; 340 ii->i_dir_start_lookup = 0;
341 nilfs_set_inode_flags(inode); 341 nilfs_set_inode_flags(inode);
342 spin_lock(&nilfs->ns_next_gen_lock); 342 spin_lock(&nilfs->ns_next_gen_lock);
343 inode->i_generation = nilfs->ns_next_generation++; 343 inode->i_generation = nilfs->ns_next_generation++;
344 spin_unlock(&nilfs->ns_next_gen_lock); 344 spin_unlock(&nilfs->ns_next_gen_lock);
345 insert_inode_hash(inode); 345 insert_inode_hash(inode);
346 346
347 err = nilfs_init_acl(inode, dir); 347 err = nilfs_init_acl(inode, dir);
348 if (unlikely(err)) 348 if (unlikely(err))
349 goto failed_acl; /* never occurs. When nilfs_init_acl() 349 goto failed_acl; /* never occurs. When nilfs_init_acl()
350 gains real support, proper cancellation 350 gains real support, proper cancellation
351 of the jobs above should be considered */ 351 of the jobs above should be considered */
352 352
353 return inode; 353 return inode;
354 354
355 failed_acl: 355 failed_acl:
356 failed_bmap: 356 failed_bmap:
357 inode->i_nlink = 0; 357 inode->i_nlink = 0;
358 iput(inode); /* raw_inode will be deleted through 358 iput(inode); /* raw_inode will be deleted through
359 generic_delete_inode() */ 359 generic_delete_inode() */
360 goto failed; 360 goto failed;
361 361
362 failed_ifile_create_inode: 362 failed_ifile_create_inode:
363 make_bad_inode(inode); 363 make_bad_inode(inode);
364 iput(inode); /* if i_nlink == 1, generic_forget_inode() will be 364 iput(inode); /* if i_nlink == 1, generic_forget_inode() will be
365 called */ 365 called */
366 failed: 366 failed:
367 return ERR_PTR(err); 367 return ERR_PTR(err);
368 } 368 }
369 369
370 void nilfs_set_inode_flags(struct inode *inode) 370 void nilfs_set_inode_flags(struct inode *inode)
371 { 371 {
372 unsigned int flags = NILFS_I(inode)->i_flags; 372 unsigned int flags = NILFS_I(inode)->i_flags;
373 373
374 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | 374 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
375 S_DIRSYNC); 375 S_DIRSYNC);
376 if (flags & FS_SYNC_FL) 376 if (flags & FS_SYNC_FL)
377 inode->i_flags |= S_SYNC; 377 inode->i_flags |= S_SYNC;
378 if (flags & FS_APPEND_FL) 378 if (flags & FS_APPEND_FL)
379 inode->i_flags |= S_APPEND; 379 inode->i_flags |= S_APPEND;
380 if (flags & FS_IMMUTABLE_FL) 380 if (flags & FS_IMMUTABLE_FL)
381 inode->i_flags |= S_IMMUTABLE; 381 inode->i_flags |= S_IMMUTABLE;
382 if (flags & FS_NOATIME_FL) 382 if (flags & FS_NOATIME_FL)
383 inode->i_flags |= S_NOATIME; 383 inode->i_flags |= S_NOATIME;
384 if (flags & FS_DIRSYNC_FL) 384 if (flags & FS_DIRSYNC_FL)
385 inode->i_flags |= S_DIRSYNC; 385 inode->i_flags |= S_DIRSYNC;
386 mapping_set_gfp_mask(inode->i_mapping, 386 mapping_set_gfp_mask(inode->i_mapping,
387 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 387 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
388 } 388 }
389 389
390 int nilfs_read_inode_common(struct inode *inode, 390 int nilfs_read_inode_common(struct inode *inode,
391 struct nilfs_inode *raw_inode) 391 struct nilfs_inode *raw_inode)
392 { 392 {
393 struct nilfs_inode_info *ii = NILFS_I(inode); 393 struct nilfs_inode_info *ii = NILFS_I(inode);
394 int err; 394 int err;
395 395
396 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 396 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
397 inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid); 397 inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid);
398 inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid); 398 inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid);
399 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 399 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
400 inode->i_size = le64_to_cpu(raw_inode->i_size); 400 inode->i_size = le64_to_cpu(raw_inode->i_size);
401 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); 401 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
402 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); 402 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
403 inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); 403 inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
404 inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); 404 inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
405 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); 405 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
406 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); 406 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
407 if (inode->i_nlink == 0 && inode->i_mode == 0) 407 if (inode->i_nlink == 0 && inode->i_mode == 0)
408 return -EINVAL; /* this inode is deleted */ 408 return -EINVAL; /* this inode is deleted */
409 409
410 inode->i_blocks = le64_to_cpu(raw_inode->i_blocks); 410 inode->i_blocks = le64_to_cpu(raw_inode->i_blocks);
411 ii->i_flags = le32_to_cpu(raw_inode->i_flags); 411 ii->i_flags = le32_to_cpu(raw_inode->i_flags);
412 #if 0 412 #if 0
413 ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); 413 ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
414 ii->i_dir_acl = S_ISREG(inode->i_mode) ? 414 ii->i_dir_acl = S_ISREG(inode->i_mode) ?
415 0 : le32_to_cpu(raw_inode->i_dir_acl); 415 0 : le32_to_cpu(raw_inode->i_dir_acl);
416 #endif 416 #endif
417 ii->i_dir_start_lookup = 0; 417 ii->i_dir_start_lookup = 0;
418 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 418 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
419 419
420 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 420 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
421 S_ISLNK(inode->i_mode)) { 421 S_ISLNK(inode->i_mode)) {
422 err = nilfs_bmap_read(ii->i_bmap, raw_inode); 422 err = nilfs_bmap_read(ii->i_bmap, raw_inode);
423 if (err < 0) 423 if (err < 0)
424 return err; 424 return err;
425 set_bit(NILFS_I_BMAP, &ii->i_state); 425 set_bit(NILFS_I_BMAP, &ii->i_state);
426 /* No lock is needed; iget() ensures it. */ 426 /* No lock is needed; iget() ensures it. */
427 } 427 }
428 return 0; 428 return 0;
429 } 429 }
430 430
431 static int __nilfs_read_inode(struct super_block *sb, 431 static int __nilfs_read_inode(struct super_block *sb,
432 struct nilfs_root *root, unsigned long ino, 432 struct nilfs_root *root, unsigned long ino,
433 struct inode *inode) 433 struct inode *inode)
434 { 434 {
435 struct the_nilfs *nilfs = sb->s_fs_info; 435 struct the_nilfs *nilfs = sb->s_fs_info;
436 struct buffer_head *bh; 436 struct buffer_head *bh;
437 struct nilfs_inode *raw_inode; 437 struct nilfs_inode *raw_inode;
438 int err; 438 int err;
439 439
440 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 440 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
441 err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh); 441 err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh);
442 if (unlikely(err)) 442 if (unlikely(err))
443 goto bad_inode; 443 goto bad_inode;
444 444
445 raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh); 445 raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh);
446 446
447 err = nilfs_read_inode_common(inode, raw_inode); 447 err = nilfs_read_inode_common(inode, raw_inode);
448 if (err) 448 if (err)
449 goto failed_unmap; 449 goto failed_unmap;
450 450
451 if (S_ISREG(inode->i_mode)) { 451 if (S_ISREG(inode->i_mode)) {
452 inode->i_op = &nilfs_file_inode_operations; 452 inode->i_op = &nilfs_file_inode_operations;
453 inode->i_fop = &nilfs_file_operations; 453 inode->i_fop = &nilfs_file_operations;
454 inode->i_mapping->a_ops = &nilfs_aops; 454 inode->i_mapping->a_ops = &nilfs_aops;
455 } else if (S_ISDIR(inode->i_mode)) { 455 } else if (S_ISDIR(inode->i_mode)) {
456 inode->i_op = &nilfs_dir_inode_operations; 456 inode->i_op = &nilfs_dir_inode_operations;
457 inode->i_fop = &nilfs_dir_operations; 457 inode->i_fop = &nilfs_dir_operations;
458 inode->i_mapping->a_ops = &nilfs_aops; 458 inode->i_mapping->a_ops = &nilfs_aops;
459 } else if (S_ISLNK(inode->i_mode)) { 459 } else if (S_ISLNK(inode->i_mode)) {
460 inode->i_op = &nilfs_symlink_inode_operations; 460 inode->i_op = &nilfs_symlink_inode_operations;
461 inode->i_mapping->a_ops = &nilfs_aops; 461 inode->i_mapping->a_ops = &nilfs_aops;
462 } else { 462 } else {
463 inode->i_op = &nilfs_special_inode_operations; 463 inode->i_op = &nilfs_special_inode_operations;
464 init_special_inode( 464 init_special_inode(
465 inode, inode->i_mode, 465 inode, inode->i_mode,
466 huge_decode_dev(le64_to_cpu(raw_inode->i_device_code))); 466 huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
467 } 467 }
468 nilfs_ifile_unmap_inode(root->ifile, ino, bh); 468 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
469 brelse(bh); 469 brelse(bh);
470 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 470 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
471 nilfs_set_inode_flags(inode); 471 nilfs_set_inode_flags(inode);
472 return 0; 472 return 0;
473 473
474 failed_unmap: 474 failed_unmap:
475 nilfs_ifile_unmap_inode(root->ifile, ino, bh); 475 nilfs_ifile_unmap_inode(root->ifile, ino, bh);
476 brelse(bh); 476 brelse(bh);
477 477
478 bad_inode: 478 bad_inode:
479 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 479 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
480 return err; 480 return err;
481 } 481 }
482 482
483 static int nilfs_iget_test(struct inode *inode, void *opaque) 483 static int nilfs_iget_test(struct inode *inode, void *opaque)
484 { 484 {
485 struct nilfs_iget_args *args = opaque; 485 struct nilfs_iget_args *args = opaque;
486 struct nilfs_inode_info *ii; 486 struct nilfs_inode_info *ii;
487 487
488 if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root) 488 if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root)
489 return 0; 489 return 0;
490 490
491 ii = NILFS_I(inode); 491 ii = NILFS_I(inode);
492 if (!test_bit(NILFS_I_GCINODE, &ii->i_state)) 492 if (!test_bit(NILFS_I_GCINODE, &ii->i_state))
493 return !args->for_gc; 493 return !args->for_gc;
494 494
495 return args->for_gc && args->cno == ii->i_cno; 495 return args->for_gc && args->cno == ii->i_cno;
496 } 496 }
497 497
498 static int nilfs_iget_set(struct inode *inode, void *opaque) 498 static int nilfs_iget_set(struct inode *inode, void *opaque)
499 { 499 {
500 struct nilfs_iget_args *args = opaque; 500 struct nilfs_iget_args *args = opaque;
501 501
502 inode->i_ino = args->ino; 502 inode->i_ino = args->ino;
503 if (args->for_gc) { 503 if (args->for_gc) {
504 NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE; 504 NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE;
505 NILFS_I(inode)->i_cno = args->cno; 505 NILFS_I(inode)->i_cno = args->cno;
506 NILFS_I(inode)->i_root = NULL; 506 NILFS_I(inode)->i_root = NULL;
507 } else { 507 } else {
508 if (args->root && args->ino == NILFS_ROOT_INO) 508 if (args->root && args->ino == NILFS_ROOT_INO)
509 nilfs_get_root(args->root); 509 nilfs_get_root(args->root);
510 NILFS_I(inode)->i_root = args->root; 510 NILFS_I(inode)->i_root = args->root;
511 } 511 }
512 return 0; 512 return 0;
513 } 513 }
514 514
515 struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root, 515 struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
516 unsigned long ino) 516 unsigned long ino)
517 { 517 {
518 struct nilfs_iget_args args = { 518 struct nilfs_iget_args args = {
519 .ino = ino, .root = root, .cno = 0, .for_gc = 0 519 .ino = ino, .root = root, .cno = 0, .for_gc = 0
520 }; 520 };
521 521
522 return ilookup5(sb, ino, nilfs_iget_test, &args); 522 return ilookup5(sb, ino, nilfs_iget_test, &args);
523 } 523 }
524 524
525 struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root, 525 struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
526 unsigned long ino) 526 unsigned long ino)
527 { 527 {
528 struct nilfs_iget_args args = { 528 struct nilfs_iget_args args = {
529 .ino = ino, .root = root, .cno = 0, .for_gc = 0 529 .ino = ino, .root = root, .cno = 0, .for_gc = 0
530 }; 530 };
531 531
532 return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); 532 return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
533 } 533 }
534 534
535 struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, 535 struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
536 unsigned long ino) 536 unsigned long ino)
537 { 537 {
538 struct inode *inode; 538 struct inode *inode;
539 int err; 539 int err;
540 540
541 inode = nilfs_iget_locked(sb, root, ino); 541 inode = nilfs_iget_locked(sb, root, ino);
542 if (unlikely(!inode)) 542 if (unlikely(!inode))
543 return ERR_PTR(-ENOMEM); 543 return ERR_PTR(-ENOMEM);
544 if (!(inode->i_state & I_NEW)) 544 if (!(inode->i_state & I_NEW))
545 return inode; 545 return inode;
546 546
547 err = __nilfs_read_inode(sb, root, ino, inode); 547 err = __nilfs_read_inode(sb, root, ino, inode);
548 if (unlikely(err)) { 548 if (unlikely(err)) {
549 iget_failed(inode); 549 iget_failed(inode);
550 return ERR_PTR(err); 550 return ERR_PTR(err);
551 } 551 }
552 unlock_new_inode(inode); 552 unlock_new_inode(inode);
553 return inode; 553 return inode;
554 } 554 }
555 555
556 struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, 556 struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
557 __u64 cno) 557 __u64 cno)
558 { 558 {
559 struct nilfs_iget_args args = { 559 struct nilfs_iget_args args = {
560 .ino = ino, .root = NULL, .cno = cno, .for_gc = 1 560 .ino = ino, .root = NULL, .cno = cno, .for_gc = 1
561 }; 561 };
562 struct inode *inode; 562 struct inode *inode;
563 int err; 563 int err;
564 564
565 inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); 565 inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
566 if (unlikely(!inode)) 566 if (unlikely(!inode))
567 return ERR_PTR(-ENOMEM); 567 return ERR_PTR(-ENOMEM);
568 if (!(inode->i_state & I_NEW)) 568 if (!(inode->i_state & I_NEW))
569 return inode; 569 return inode;
570 570
571 err = nilfs_init_gcinode(inode); 571 err = nilfs_init_gcinode(inode);
572 if (unlikely(err)) { 572 if (unlikely(err)) {
573 iget_failed(inode); 573 iget_failed(inode);
574 return ERR_PTR(err); 574 return ERR_PTR(err);
575 } 575 }
576 unlock_new_inode(inode); 576 unlock_new_inode(inode);
577 return inode; 577 return inode;
578 } 578 }
579 579
580 void nilfs_write_inode_common(struct inode *inode, 580 void nilfs_write_inode_common(struct inode *inode,
581 struct nilfs_inode *raw_inode, int has_bmap) 581 struct nilfs_inode *raw_inode, int has_bmap)
582 { 582 {
583 struct nilfs_inode_info *ii = NILFS_I(inode); 583 struct nilfs_inode_info *ii = NILFS_I(inode);
584 584
585 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 585 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
586 raw_inode->i_uid = cpu_to_le32(inode->i_uid); 586 raw_inode->i_uid = cpu_to_le32(inode->i_uid);
587 raw_inode->i_gid = cpu_to_le32(inode->i_gid); 587 raw_inode->i_gid = cpu_to_le32(inode->i_gid);
588 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 588 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
589 raw_inode->i_size = cpu_to_le64(inode->i_size); 589 raw_inode->i_size = cpu_to_le64(inode->i_size);
590 raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 590 raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
591 raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); 591 raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
592 raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 592 raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
593 raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 593 raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
594 raw_inode->i_blocks = cpu_to_le64(inode->i_blocks); 594 raw_inode->i_blocks = cpu_to_le64(inode->i_blocks);
595 595
596 raw_inode->i_flags = cpu_to_le32(ii->i_flags); 596 raw_inode->i_flags = cpu_to_le32(ii->i_flags);
597 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 597 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
598 598
599 if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) { 599 if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) {
600 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 600 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
601 601
602 /* zero-fill unused portion in the case of super root block */ 602 /* zero-fill unused portion in the case of super root block */
603 raw_inode->i_xattr = 0; 603 raw_inode->i_xattr = 0;
604 raw_inode->i_pad = 0; 604 raw_inode->i_pad = 0;
605 memset((void *)raw_inode + sizeof(*raw_inode), 0, 605 memset((void *)raw_inode + sizeof(*raw_inode), 0,
606 nilfs->ns_inode_size - sizeof(*raw_inode)); 606 nilfs->ns_inode_size - sizeof(*raw_inode));
607 } 607 }
608 608
609 if (has_bmap) 609 if (has_bmap)
610 nilfs_bmap_write(ii->i_bmap, raw_inode); 610 nilfs_bmap_write(ii->i_bmap, raw_inode);
611 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) 611 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
612 raw_inode->i_device_code = 612 raw_inode->i_device_code =
613 cpu_to_le64(huge_encode_dev(inode->i_rdev)); 613 cpu_to_le64(huge_encode_dev(inode->i_rdev));
614 /* When extending the inode format, nilfs->ns_inode_size should 614 /* When extending the inode format, nilfs->ns_inode_size should
615 be checked before substituting appended fields */ 615 be checked before substituting appended fields */
616 } 616 }
617 617
618 void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh) 618 void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
619 { 619 {
620 ino_t ino = inode->i_ino; 620 ino_t ino = inode->i_ino;
621 struct nilfs_inode_info *ii = NILFS_I(inode); 621 struct nilfs_inode_info *ii = NILFS_I(inode);
622 struct inode *ifile = ii->i_root->ifile; 622 struct inode *ifile = ii->i_root->ifile;
623 struct nilfs_inode *raw_inode; 623 struct nilfs_inode *raw_inode;
624 624
625 raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh); 625 raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh);
626 626
627 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state)) 627 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
628 memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size); 628 memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size);
629 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state); 629 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
630 630
631 nilfs_write_inode_common(inode, raw_inode, 0); 631 nilfs_write_inode_common(inode, raw_inode, 0);
632 /* XXX: calling with has_bmap = 0 is a workaround to avoid 632 /* XXX: calling with has_bmap = 0 is a workaround to avoid
633 a bmap deadlock; it delays the i_bmap update until just 633 a bmap deadlock; it delays the i_bmap update until just
634 before writing */ 634 before writing */
635 nilfs_ifile_unmap_inode(ifile, ino, ibh); 635 nilfs_ifile_unmap_inode(ifile, ino, ibh);
636 } 636 }
637 637
638 #define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */ 638 #define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */
639 639
640 static void nilfs_truncate_bmap(struct nilfs_inode_info *ii, 640 static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
641 unsigned long from) 641 unsigned long from)
642 { 642 {
643 unsigned long b; 643 unsigned long b;
644 int ret; 644 int ret;
645 645
646 if (!test_bit(NILFS_I_BMAP, &ii->i_state)) 646 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
647 return; 647 return;
648 repeat: 648 repeat:
649 ret = nilfs_bmap_last_key(ii->i_bmap, &b); 649 ret = nilfs_bmap_last_key(ii->i_bmap, &b);
650 if (ret == -ENOENT) 650 if (ret == -ENOENT)
651 return; 651 return;
652 else if (ret < 0) 652 else if (ret < 0)
653 goto failed; 653 goto failed;
654 654
655 if (b < from) 655 if (b < from)
656 return; 656 return;
657 657
658 b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from); 658 b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from);
659 ret = nilfs_bmap_truncate(ii->i_bmap, b); 659 ret = nilfs_bmap_truncate(ii->i_bmap, b);
660 nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb); 660 nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb);
661 if (!ret || (ret == -ENOMEM && 661 if (!ret || (ret == -ENOMEM &&
662 nilfs_bmap_truncate(ii->i_bmap, b) == 0)) 662 nilfs_bmap_truncate(ii->i_bmap, b) == 0))
663 goto repeat; 663 goto repeat;
664 664
665 failed: 665 failed:
666 nilfs_warning(ii->vfs_inode.i_sb, __func__, 666 nilfs_warning(ii->vfs_inode.i_sb, __func__,
667 "failed to truncate bmap (ino=%lu, err=%d)", 667 "failed to truncate bmap (ino=%lu, err=%d)",
668 ii->vfs_inode.i_ino, ret); 668 ii->vfs_inode.i_ino, ret);
669 } 669 }
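/*
 * The loop above tears the bmap down from the top in bounded chunks:
 * each pass frees at most NILFS_MAX_TRUNCATE_BLOCKS keys (64MB of
 * data with 4KB blocks) and then calls nilfs_relax_pressure_in_lock(),
 * so a very large truncate becomes a series of short bmap operations
 * with the segment constructor given a chance to run in between,
 * instead of one long-running operation.
 */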
670 670
671 void nilfs_truncate(struct inode *inode) 671 void nilfs_truncate(struct inode *inode)
672 { 672 {
673 unsigned long blkoff; 673 unsigned long blkoff;
674 unsigned int blocksize; 674 unsigned int blocksize;
675 struct nilfs_transaction_info ti; 675 struct nilfs_transaction_info ti;
676 struct super_block *sb = inode->i_sb; 676 struct super_block *sb = inode->i_sb;
677 struct nilfs_inode_info *ii = NILFS_I(inode); 677 struct nilfs_inode_info *ii = NILFS_I(inode);
678 678
679 if (!test_bit(NILFS_I_BMAP, &ii->i_state)) 679 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
680 return; 680 return;
681 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 681 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
682 return; 682 return;
683 683
684 blocksize = sb->s_blocksize; 684 blocksize = sb->s_blocksize;
685 blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits; 685 blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits;
686 nilfs_transaction_begin(sb, &ti, 0); /* never fails */ 686 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
687 687
688 block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block); 688 block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block);
689 689
690 nilfs_truncate_bmap(ii, blkoff); 690 nilfs_truncate_bmap(ii, blkoff);
691 691
692 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 692 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
693 if (IS_SYNC(inode)) 693 if (IS_SYNC(inode))
694 nilfs_set_transaction_flag(NILFS_TI_SYNC); 694 nilfs_set_transaction_flag(NILFS_TI_SYNC);
695 695
696 nilfs_mark_inode_dirty(inode); 696 nilfs_mark_inode_dirty(inode);
697 nilfs_set_file_dirty(inode, 0); 697 nilfs_set_file_dirty(inode, 0);
698 nilfs_transaction_commit(sb); 698 nilfs_transaction_commit(sb);
699 /* May construct a logical segment and may fail in sync mode. 699 /* May construct a logical segment and may fail in sync mode.
700 But truncate has no return value. */ 700 But truncate has no return value. */
701 } 701 }
702 702
703 static void nilfs_clear_inode(struct inode *inode) 703 static void nilfs_clear_inode(struct inode *inode)
704 { 704 {
705 struct nilfs_inode_info *ii = NILFS_I(inode); 705 struct nilfs_inode_info *ii = NILFS_I(inode);
706 struct nilfs_mdt_info *mdi = NILFS_MDT(inode); 706 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
707 707
708 /* 708 /*
709 * Free resources allocated in nilfs_read_inode(), here. 709 * Free resources allocated in nilfs_read_inode(), here.
710 */ 710 */
711 BUG_ON(!list_empty(&ii->i_dirty)); 711 BUG_ON(!list_empty(&ii->i_dirty));
712 brelse(ii->i_bh); 712 brelse(ii->i_bh);
713 ii->i_bh = NULL; 713 ii->i_bh = NULL;
714 714
715 if (mdi && mdi->mi_palloc_cache) 715 if (mdi && mdi->mi_palloc_cache)
716 nilfs_palloc_destroy_cache(inode); 716 nilfs_palloc_destroy_cache(inode);
717 717
718 if (test_bit(NILFS_I_BMAP, &ii->i_state)) 718 if (test_bit(NILFS_I_BMAP, &ii->i_state))
719 nilfs_bmap_clear(ii->i_bmap); 719 nilfs_bmap_clear(ii->i_bmap);
720 720
721 nilfs_btnode_cache_clear(&ii->i_btnode_cache); 721 nilfs_btnode_cache_clear(&ii->i_btnode_cache);
722 722
723 if (ii->i_root && inode->i_ino == NILFS_ROOT_INO) 723 if (ii->i_root && inode->i_ino == NILFS_ROOT_INO)
724 nilfs_put_root(ii->i_root); 724 nilfs_put_root(ii->i_root);
725 } 725 }
726 726
727 void nilfs_evict_inode(struct inode *inode) 727 void nilfs_evict_inode(struct inode *inode)
728 { 728 {
729 struct nilfs_transaction_info ti; 729 struct nilfs_transaction_info ti;
730 struct super_block *sb = inode->i_sb; 730 struct super_block *sb = inode->i_sb;
731 struct nilfs_inode_info *ii = NILFS_I(inode); 731 struct nilfs_inode_info *ii = NILFS_I(inode);
732 int ret; 732 int ret;
733 733
734 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { 734 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
735 if (inode->i_data.nrpages) 735 if (inode->i_data.nrpages)
736 truncate_inode_pages(&inode->i_data, 0); 736 truncate_inode_pages(&inode->i_data, 0);
737 end_writeback(inode); 737 end_writeback(inode);
738 nilfs_clear_inode(inode); 738 nilfs_clear_inode(inode);
739 return; 739 return;
740 } 740 }
741 nilfs_transaction_begin(sb, &ti, 0); /* never fails */ 741 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
742 742
743 if (inode->i_data.nrpages) 743 if (inode->i_data.nrpages)
744 truncate_inode_pages(&inode->i_data, 0); 744 truncate_inode_pages(&inode->i_data, 0);
745 745
746 /* TODO: some of the following operations may fail. */ 746 /* TODO: some of the following operations may fail. */
747 nilfs_truncate_bmap(ii, 0); 747 nilfs_truncate_bmap(ii, 0);
748 nilfs_mark_inode_dirty(inode); 748 nilfs_mark_inode_dirty(inode);
749 end_writeback(inode); 749 end_writeback(inode);
750 750
751 ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino); 751 ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
752 if (!ret) 752 if (!ret)
753 atomic_dec(&ii->i_root->inodes_count); 753 atomic_dec(&ii->i_root->inodes_count);
754 754
755 nilfs_clear_inode(inode); 755 nilfs_clear_inode(inode);
756 756
757 if (IS_SYNC(inode)) 757 if (IS_SYNC(inode))
758 nilfs_set_transaction_flag(NILFS_TI_SYNC); 758 nilfs_set_transaction_flag(NILFS_TI_SYNC);
759 nilfs_transaction_commit(sb); 759 nilfs_transaction_commit(sb);
760 /* May construct a logical segment and may fail in sync mode. 760 /* May construct a logical segment and may fail in sync mode.
761 But delete_inode has no return value. */ 761 But delete_inode has no return value. */
762 } 762 }
763 763
764 int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) 764 int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
765 { 765 {
766 struct nilfs_transaction_info ti; 766 struct nilfs_transaction_info ti;
767 struct inode *inode = dentry->d_inode; 767 struct inode *inode = dentry->d_inode;
768 struct super_block *sb = inode->i_sb; 768 struct super_block *sb = inode->i_sb;
769 int err; 769 int err;
770 770
771 err = inode_change_ok(inode, iattr); 771 err = inode_change_ok(inode, iattr);
772 if (err) 772 if (err)
773 return err; 773 return err;
774 774
775 err = nilfs_transaction_begin(sb, &ti, 0); 775 err = nilfs_transaction_begin(sb, &ti, 0);
776 if (unlikely(err)) 776 if (unlikely(err))
777 return err; 777 return err;
778 778
779 if ((iattr->ia_valid & ATTR_SIZE) && 779 if ((iattr->ia_valid & ATTR_SIZE) &&
780 iattr->ia_size != i_size_read(inode)) { 780 iattr->ia_size != i_size_read(inode)) {
781 inode_dio_wait(inode);
782
781 err = vmtruncate(inode, iattr->ia_size); 783 err = vmtruncate(inode, iattr->ia_size);
782 if (unlikely(err)) 784 if (unlikely(err))
783 goto out_err; 785 goto out_err;
784 } 786 }
785 787
786 setattr_copy(inode, iattr); 788 setattr_copy(inode, iattr);
787 mark_inode_dirty(inode); 789 mark_inode_dirty(inode);
788 790
789 if (iattr->ia_valid & ATTR_MODE) { 791 if (iattr->ia_valid & ATTR_MODE) {
790 err = nilfs_acl_chmod(inode); 792 err = nilfs_acl_chmod(inode);
791 if (unlikely(err)) 793 if (unlikely(err))
792 goto out_err; 794 goto out_err;
793 } 795 }
794 796
795 return nilfs_transaction_commit(sb); 797 return nilfs_transaction_commit(sb);
796 798
797 out_err: 799 out_err:
798 nilfs_transaction_abort(sb); 800 nilfs_transaction_abort(sb);
799 return err; 801 return err;
800 } 802 }
801 803
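/*
 * The inode_dio_wait() added above sleeps until the inode's count of
 * in-flight direct I/O requests drains to zero. A rough, hedged
 * sketch of the accounting it pairs with in this kernel series (the
 * names follow the generic direct I/O code of the time and are shown
 * for illustration only):
 *
 *	atomic_inc(&inode->i_dio_count);     taken before a DIO request
 *	... submit and complete the request ...
 *	inode_dio_done(inode);               drop the count, wake waiters
 *
 * Waiting inside ->setattr, with the filesystem's own serialization
 * for the size change already in place, is precisely what this commit
 * enables.
 */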
802 int nilfs_permission(struct inode *inode, int mask) 804 int nilfs_permission(struct inode *inode, int mask)
803 { 805 {
804 struct nilfs_root *root = NILFS_I(inode)->i_root; 806 struct nilfs_root *root = NILFS_I(inode)->i_root;
805 if ((mask & MAY_WRITE) && root && 807 if ((mask & MAY_WRITE) && root &&
806 root->cno != NILFS_CPTREE_CURRENT_CNO) 808 root->cno != NILFS_CPTREE_CURRENT_CNO)
807 return -EROFS; /* snapshot is not writable */ 809 return -EROFS; /* snapshot is not writable */
808 810
809 return generic_permission(inode, mask); 811 return generic_permission(inode, mask);
810 } 812 }
811 813
812 int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) 814 int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
813 { 815 {
814 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 816 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
815 struct nilfs_inode_info *ii = NILFS_I(inode); 817 struct nilfs_inode_info *ii = NILFS_I(inode);
816 int err; 818 int err;
817 819
818 spin_lock(&nilfs->ns_inode_lock); 820 spin_lock(&nilfs->ns_inode_lock);
819 if (ii->i_bh == NULL) { 821 if (ii->i_bh == NULL) {
820 spin_unlock(&nilfs->ns_inode_lock); 822 spin_unlock(&nilfs->ns_inode_lock);
821 err = nilfs_ifile_get_inode_block(ii->i_root->ifile, 823 err = nilfs_ifile_get_inode_block(ii->i_root->ifile,
822 inode->i_ino, pbh); 824 inode->i_ino, pbh);
823 if (unlikely(err)) 825 if (unlikely(err))
824 return err; 826 return err;
825 spin_lock(&nilfs->ns_inode_lock); 827 spin_lock(&nilfs->ns_inode_lock);
826 if (ii->i_bh == NULL) 828 if (ii->i_bh == NULL)
827 ii->i_bh = *pbh; 829 ii->i_bh = *pbh;
828 else { 830 else {
829 brelse(*pbh); 831 brelse(*pbh);
830 *pbh = ii->i_bh; 832 *pbh = ii->i_bh;
831 } 833 }
832 } else 834 } else
833 *pbh = ii->i_bh; 835 *pbh = ii->i_bh;
834 836
835 get_bh(*pbh); 837 get_bh(*pbh);
836 spin_unlock(&nilfs->ns_inode_lock); 838 spin_unlock(&nilfs->ns_inode_lock);
837 return 0; 839 return 0;
838 } 840 }
839 841
840 int nilfs_inode_dirty(struct inode *inode) 842 int nilfs_inode_dirty(struct inode *inode)
841 { 843 {
842 struct nilfs_inode_info *ii = NILFS_I(inode); 844 struct nilfs_inode_info *ii = NILFS_I(inode);
843 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 845 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
844 int ret = 0; 846 int ret = 0;
845 847
846 if (!list_empty(&ii->i_dirty)) { 848 if (!list_empty(&ii->i_dirty)) {
847 spin_lock(&nilfs->ns_inode_lock); 849 spin_lock(&nilfs->ns_inode_lock);
848 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) || 850 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) ||
849 test_bit(NILFS_I_BUSY, &ii->i_state); 851 test_bit(NILFS_I_BUSY, &ii->i_state);
850 spin_unlock(&nilfs->ns_inode_lock); 852 spin_unlock(&nilfs->ns_inode_lock);
851 } 853 }
852 return ret; 854 return ret;
853 } 855 }
854 856
855 int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) 857 int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
856 { 858 {
857 struct nilfs_inode_info *ii = NILFS_I(inode); 859 struct nilfs_inode_info *ii = NILFS_I(inode);
858 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 860 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
859 861
860 atomic_add(nr_dirty, &nilfs->ns_ndirtyblks); 862 atomic_add(nr_dirty, &nilfs->ns_ndirtyblks);
861 863
862 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state)) 864 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state))
863 return 0; 865 return 0;
864 866
865 spin_lock(&nilfs->ns_inode_lock); 867 spin_lock(&nilfs->ns_inode_lock);
866 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && 868 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
867 !test_bit(NILFS_I_BUSY, &ii->i_state)) { 869 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
868 /* Because this routine may race with nilfs_dispose_list(), 870 /* Because this routine may race with nilfs_dispose_list(),
869 we have to check NILFS_I_QUEUED here, too. */ 871 we have to check NILFS_I_QUEUED here, too. */
870 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) { 872 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) {
871 /* This will happen when somebody is freeing 873 /* This will happen when somebody is freeing
872 this inode. */ 874 this inode. */
873 nilfs_warning(inode->i_sb, __func__, 875 nilfs_warning(inode->i_sb, __func__,
874 "cannot get inode (ino=%lu)\n", 876 "cannot get inode (ino=%lu)\n",
875 inode->i_ino); 877 inode->i_ino);
876 spin_unlock(&nilfs->ns_inode_lock); 878 spin_unlock(&nilfs->ns_inode_lock);
877 return -EINVAL; /* NILFS_I_DIRTY may remain set 879 return -EINVAL; /* NILFS_I_DIRTY may remain set
878 on the inode being freed */ 880 on the inode being freed */
879 } 881 }
880 list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files); 882 list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files);
881 set_bit(NILFS_I_QUEUED, &ii->i_state); 883 set_bit(NILFS_I_QUEUED, &ii->i_state);
882 } 884 }
883 spin_unlock(&nilfs->ns_inode_lock); 885 spin_unlock(&nilfs->ns_inode_lock);
884 return 0; 886 return 0;
885 } 887 }
886 888
887 int nilfs_mark_inode_dirty(struct inode *inode) 889 int nilfs_mark_inode_dirty(struct inode *inode)
888 { 890 {
889 struct buffer_head *ibh; 891 struct buffer_head *ibh;
890 int err; 892 int err;
891 893
892 err = nilfs_load_inode_block(inode, &ibh); 894 err = nilfs_load_inode_block(inode, &ibh);
893 if (unlikely(err)) { 895 if (unlikely(err)) {
894 nilfs_warning(inode->i_sb, __func__, 896 nilfs_warning(inode->i_sb, __func__,
895 "failed to reget inode block.\n"); 897 "failed to reget inode block.\n");
896 return err; 898 return err;
897 } 899 }
898 nilfs_update_inode(inode, ibh); 900 nilfs_update_inode(inode, ibh);
899 mark_buffer_dirty(ibh); 901 mark_buffer_dirty(ibh);
900 nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile); 902 nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile);
901 brelse(ibh); 903 brelse(ibh);
902 return 0; 904 return 0;
903 } 905 }
904 906
905 /** 907 /**
906 * nilfs_dirty_inode - reflect changes on given inode to an inode block. 908 * nilfs_dirty_inode - reflect changes on given inode to an inode block.
907 * @inode: inode of the file to be registered. 909 * @inode: inode of the file to be registered.
908 * 910 *
909 * nilfs_dirty_inode() loads an inode block containing the specified 911 * nilfs_dirty_inode() loads an inode block containing the specified
910 * @inode and copies data from a nilfs_inode to a corresponding inode 912 * @inode and copies data from a nilfs_inode to a corresponding inode
911 * entry in the inode block. This operation is excluded from the segment 913 * entry in the inode block. This operation is excluded from the segment
912 * construction. This function can be called both as a single operation 914 * construction. This function can be called both as a single operation
913 * and as a part of indivisible file operations. 915 * and as a part of indivisible file operations.
914 */ 916 */
915 void nilfs_dirty_inode(struct inode *inode, int flags) 917 void nilfs_dirty_inode(struct inode *inode, int flags)
916 { 918 {
917 struct nilfs_transaction_info ti; 919 struct nilfs_transaction_info ti;
918 struct nilfs_mdt_info *mdi = NILFS_MDT(inode); 920 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
919 921
920 if (is_bad_inode(inode)) { 922 if (is_bad_inode(inode)) {
921 nilfs_warning(inode->i_sb, __func__, 923 nilfs_warning(inode->i_sb, __func__,
922 "tried to mark bad_inode dirty. ignored.\n"); 924 "tried to mark bad_inode dirty. ignored.\n");
923 dump_stack(); 925 dump_stack();
924 return; 926 return;
925 } 927 }
926 if (mdi) { 928 if (mdi) {
927 nilfs_mdt_mark_dirty(inode); 929 nilfs_mdt_mark_dirty(inode);
928 return; 930 return;
929 } 931 }
930 nilfs_transaction_begin(inode->i_sb, &ti, 0); 932 nilfs_transaction_begin(inode->i_sb, &ti, 0);
931 nilfs_mark_inode_dirty(inode); 933 nilfs_mark_inode_dirty(inode);
932 nilfs_transaction_commit(inode->i_sb); /* never fails */ 934 nilfs_transaction_commit(inode->i_sb); /* never fails */
933 } 935 }
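/*
 * nilfs_dirty_inode() is nilfs's super_operations ->dirty_inode hook,
 * so the VFS calls it whenever one of our inodes is marked dirty. A
 * hedged sketch of the assumed wiring (it lives in super.c, outside
 * this diff):
 *
 *	static const struct super_operations nilfs_sops = {
 *		...
 *		.dirty_inode	= nilfs_dirty_inode,
 *		...
 *	};
 */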
934 936
935 int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 937 int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
936 __u64 start, __u64 len) 938 __u64 start, __u64 len)
937 { 939 {
938 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 940 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
939 __u64 logical = 0, phys = 0, size = 0; 941 __u64 logical = 0, phys = 0, size = 0;
940 __u32 flags = 0; 942 __u32 flags = 0;
941 loff_t isize; 943 loff_t isize;
942 sector_t blkoff, end_blkoff; 944 sector_t blkoff, end_blkoff;
943 sector_t delalloc_blkoff; 945 sector_t delalloc_blkoff;
944 unsigned long delalloc_blklen; 946 unsigned long delalloc_blklen;
945 unsigned int blkbits = inode->i_blkbits; 947 unsigned int blkbits = inode->i_blkbits;
946 int ret, n; 948 int ret, n;
947 949
948 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); 950 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
949 if (ret) 951 if (ret)
950 return ret; 952 return ret;
951 953
952 mutex_lock(&inode->i_mutex); 954 mutex_lock(&inode->i_mutex);
953 955
954 isize = i_size_read(inode); 956 isize = i_size_read(inode);
955 957
956 blkoff = start >> blkbits; 958 blkoff = start >> blkbits;
957 end_blkoff = (start + len - 1) >> blkbits; 959 end_blkoff = (start + len - 1) >> blkbits;
958 960
959 delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff, 961 delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff,
960 &delalloc_blkoff); 962 &delalloc_blkoff);
961 963
962 do { 964 do {
963 __u64 blkphy; 965 __u64 blkphy;
964 unsigned int maxblocks; 966 unsigned int maxblocks;
965 967
966 if (delalloc_blklen && blkoff == delalloc_blkoff) { 968 if (delalloc_blklen && blkoff == delalloc_blkoff) {
967 if (size) { 969 if (size) {
968 /* End of the current extent */ 970 /* End of the current extent */
969 ret = fiemap_fill_next_extent( 971 ret = fiemap_fill_next_extent(
970 fieinfo, logical, phys, size, flags); 972 fieinfo, logical, phys, size, flags);
971 if (ret) 973 if (ret)
972 break; 974 break;
973 } 975 }
974 if (blkoff > end_blkoff) 976 if (blkoff > end_blkoff)
975 break; 977 break;
976 978
977 flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC; 979 flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC;
978 logical = blkoff << blkbits; 980 logical = blkoff << blkbits;
979 phys = 0; 981 phys = 0;
980 size = delalloc_blklen << blkbits; 982 size = delalloc_blklen << blkbits;
981 983
982 blkoff = delalloc_blkoff + delalloc_blklen; 984 blkoff = delalloc_blkoff + delalloc_blklen;
983 delalloc_blklen = nilfs_find_uncommitted_extent( 985 delalloc_blklen = nilfs_find_uncommitted_extent(
984 inode, blkoff, &delalloc_blkoff); 986 inode, blkoff, &delalloc_blkoff);
985 continue; 987 continue;
986 } 988 }
987 989
988 /* 990 /*
989 * Limit the number of blocks that we look up so as 991 * Limit the number of blocks that we look up so as
990 * not to get into the next delayed allocation extent. 992 * not to get into the next delayed allocation extent.
991 */ 993 */
992 maxblocks = INT_MAX; 994 maxblocks = INT_MAX;
993 if (delalloc_blklen) 995 if (delalloc_blklen)
994 maxblocks = min_t(sector_t, delalloc_blkoff - blkoff, 996 maxblocks = min_t(sector_t, delalloc_blkoff - blkoff,
995 maxblocks); 997 maxblocks);
996 blkphy = 0; 998 blkphy = 0;
997 999
998 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 1000 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
999 n = nilfs_bmap_lookup_contig( 1001 n = nilfs_bmap_lookup_contig(
1000 NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks); 1002 NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks);
1001 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); 1003 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
1002 1004
1003 if (n < 0) { 1005 if (n < 0) {
1004 int past_eof; 1006 int past_eof;
1005 1007
1006 if (unlikely(n != -ENOENT)) 1008 if (unlikely(n != -ENOENT))
1007 break; /* error */ 1009 break; /* error */
1008 1010
1009 /* HOLE */ 1011 /* HOLE */
1010 blkoff++; 1012 blkoff++;
1011 past_eof = ((blkoff << blkbits) >= isize); 1013 past_eof = ((blkoff << blkbits) >= isize);
1012 1014
1013 if (size) { 1015 if (size) {
1014 /* End of the current extent */ 1016 /* End of the current extent */
1015 1017
1016 if (past_eof) 1018 if (past_eof)
1017 flags |= FIEMAP_EXTENT_LAST; 1019 flags |= FIEMAP_EXTENT_LAST;
1018 1020
1019 ret = fiemap_fill_next_extent( 1021 ret = fiemap_fill_next_extent(
1020 fieinfo, logical, phys, size, flags); 1022 fieinfo, logical, phys, size, flags);
1021 if (ret) 1023 if (ret)
1022 break; 1024 break;
1023 size = 0; 1025 size = 0;
1024 } 1026 }
1025 if (blkoff > end_blkoff || past_eof) 1027 if (blkoff > end_blkoff || past_eof)
1026 break; 1028 break;
1027 } else { 1029 } else {
1028 if (size) { 1030 if (size) {
1029 if (phys && blkphy << blkbits == phys + size) { 1031 if (phys && blkphy << blkbits == phys + size) {
1030 /* The current extent goes on */ 1032 /* The current extent goes on */
1031 size += n << blkbits; 1033 size += n << blkbits;
1032 } else { 1034 } else {
1033 /* Terminate the current extent */ 1035 /* Terminate the current extent */
1034 ret = fiemap_fill_next_extent( 1036 ret = fiemap_fill_next_extent(
1035 fieinfo, logical, phys, size, 1037 fieinfo, logical, phys, size,
1036 flags); 1038 flags);
1037 if (ret || blkoff > end_blkoff) 1039 if (ret || blkoff > end_blkoff)
1038 break; 1040 break;
1039 1041
1040 /* Start another extent */ 1042 /* Start another extent */
1041 flags = FIEMAP_EXTENT_MERGED; 1043 flags = FIEMAP_EXTENT_MERGED;
1042 logical = blkoff << blkbits; 1044 logical = blkoff << blkbits;
1043 phys = blkphy << blkbits; 1045 phys = blkphy << blkbits;
1044 size = n << blkbits; 1046 size = n << blkbits;
1045 } 1047 }
1046 } else { 1048 } else {
1047 /* Start a new extent */ 1049 /* Start a new extent */
1048 flags = FIEMAP_EXTENT_MERGED; 1050 flags = FIEMAP_EXTENT_MERGED;
1049 logical = blkoff << blkbits; 1051 logical = blkoff << blkbits;
1050 phys = blkphy << blkbits; 1052 phys = blkphy << blkbits;
1051 size = n << blkbits; 1053 size = n << blkbits;
1052 } 1054 }
1053 blkoff += n; 1055 blkoff += n;
1054 } 1056 }
1055 cond_resched(); 1057 cond_resched();
1056 } while (true); 1058 } while (true);
1057 1059
1058 /* If ret is 1 then we just hit the end of the extent array */ 1060 /* If ret is 1 then we just hit the end of the extent array */
1059 if (ret == 1) 1061 if (ret == 1)
1060 ret = 0; 1062 ret = 0;
1061 1063
1062 mutex_unlock(&inode->i_mutex); 1064 mutex_unlock(&inode->i_mutex);
1063 return ret; 1065 return ret;
1064 } 1066 }
1065 1067
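For context on what the nilfs_fiemap() implementation above feeds, here is a minimal user-space caller of the FIEMAP ioctl. This is an illustrative sketch, not part of this commit; it assumes a Linux system providing <linux/fiemap.h> and FS_IOC_FIEMAP, and the extent-buffer size of 32 is an arbitrary choice.

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i, count = 32;	/* arbitrary extent buffer size */
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	fm = calloc(1, sizeof(*fm) + count * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;	/* map the whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* the flag checked above */
	fm->fm_extent_count = count;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		perror("FS_IOC_FIEMAP");
		return 1;
	}
	for (i = 0; i < fm->fm_mapped_extents; i++)
		printf("logical %llu physical %llu length %llu flags 0x%x\n",
		       (unsigned long long)fm->fm_extents[i].fe_logical,
		       (unsigned long long)fm->fm_extents[i].fe_physical,
		       (unsigned long long)fm->fm_extents[i].fe_length,
		       fm->fm_extents[i].fe_flags);
	close(fd);
	free(fm);
	return 0;
}

The FIEMAP_EXTENT_MERGED, FIEMAP_EXTENT_DELALLOC and FIEMAP_EXTENT_LAST flags set by the kernel code above show up in fe_flags here.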
1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * file.c 4 * file.c
5 * 5 *
6 * File open, close, extend, truncate 6 * File open, close, extend, truncate
7 * 7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public 11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either 12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version. 13 * version 2 of the License, or (at your option) any later version.
14 * 14 *
15 * This program is distributed in the hope that it will be useful, 15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details. 18 * General Public License for more details.
19 * 19 *
20 * You should have received a copy of the GNU General Public 20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the 21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA. 23 * Boston, MA 021110-1307, USA.
24 */ 24 */
25 25
26 #include <linux/capability.h> 26 #include <linux/capability.h>
27 #include <linux/fs.h> 27 #include <linux/fs.h>
28 #include <linux/types.h> 28 #include <linux/types.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/highmem.h> 30 #include <linux/highmem.h>
31 #include <linux/pagemap.h> 31 #include <linux/pagemap.h>
32 #include <linux/uio.h> 32 #include <linux/uio.h>
33 #include <linux/sched.h> 33 #include <linux/sched.h>
34 #include <linux/splice.h> 34 #include <linux/splice.h>
35 #include <linux/mount.h> 35 #include <linux/mount.h>
36 #include <linux/writeback.h> 36 #include <linux/writeback.h>
37 #include <linux/falloc.h> 37 #include <linux/falloc.h>
38 #include <linux/quotaops.h> 38 #include <linux/quotaops.h>
39 #include <linux/blkdev.h> 39 #include <linux/blkdev.h>
40 40
41 #include <cluster/masklog.h> 41 #include <cluster/masklog.h>
42 42
43 #include "ocfs2.h" 43 #include "ocfs2.h"
44 44
45 #include "alloc.h" 45 #include "alloc.h"
46 #include "aops.h" 46 #include "aops.h"
47 #include "dir.h" 47 #include "dir.h"
48 #include "dlmglue.h" 48 #include "dlmglue.h"
49 #include "extent_map.h" 49 #include "extent_map.h"
50 #include "file.h" 50 #include "file.h"
51 #include "sysfile.h" 51 #include "sysfile.h"
52 #include "inode.h" 52 #include "inode.h"
53 #include "ioctl.h" 53 #include "ioctl.h"
54 #include "journal.h" 54 #include "journal.h"
55 #include "locks.h" 55 #include "locks.h"
56 #include "mmap.h" 56 #include "mmap.h"
57 #include "suballoc.h" 57 #include "suballoc.h"
58 #include "super.h" 58 #include "super.h"
59 #include "xattr.h" 59 #include "xattr.h"
60 #include "acl.h" 60 #include "acl.h"
61 #include "quota.h" 61 #include "quota.h"
62 #include "refcounttree.h" 62 #include "refcounttree.h"
63 #include "ocfs2_trace.h" 63 #include "ocfs2_trace.h"
64 64
65 #include "buffer_head_io.h" 65 #include "buffer_head_io.h"
66 66
67 static int ocfs2_init_file_private(struct inode *inode, struct file *file) 67 static int ocfs2_init_file_private(struct inode *inode, struct file *file)
68 { 68 {
69 struct ocfs2_file_private *fp; 69 struct ocfs2_file_private *fp;
70 70
71 fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL); 71 fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
72 if (!fp) 72 if (!fp)
73 return -ENOMEM; 73 return -ENOMEM;
74 74
75 fp->fp_file = file; 75 fp->fp_file = file;
76 mutex_init(&fp->fp_mutex); 76 mutex_init(&fp->fp_mutex);
77 ocfs2_file_lock_res_init(&fp->fp_flock, fp); 77 ocfs2_file_lock_res_init(&fp->fp_flock, fp);
78 file->private_data = fp; 78 file->private_data = fp;
79 79
80 return 0; 80 return 0;
81 } 81 }
82 82
83 static void ocfs2_free_file_private(struct inode *inode, struct file *file) 83 static void ocfs2_free_file_private(struct inode *inode, struct file *file)
84 { 84 {
85 struct ocfs2_file_private *fp = file->private_data; 85 struct ocfs2_file_private *fp = file->private_data;
86 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 86 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
87 87
88 if (fp) { 88 if (fp) {
89 ocfs2_simple_drop_lockres(osb, &fp->fp_flock); 89 ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
90 ocfs2_lock_res_free(&fp->fp_flock); 90 ocfs2_lock_res_free(&fp->fp_flock);
91 kfree(fp); 91 kfree(fp);
92 file->private_data = NULL; 92 file->private_data = NULL;
93 } 93 }
94 } 94 }
95 95
96 static int ocfs2_file_open(struct inode *inode, struct file *file) 96 static int ocfs2_file_open(struct inode *inode, struct file *file)
97 { 97 {
98 int status; 98 int status;
99 int mode = file->f_flags; 99 int mode = file->f_flags;
100 struct ocfs2_inode_info *oi = OCFS2_I(inode); 100 struct ocfs2_inode_info *oi = OCFS2_I(inode);
101 101
102 trace_ocfs2_file_open(inode, file, file->f_path.dentry, 102 trace_ocfs2_file_open(inode, file, file->f_path.dentry,
103 (unsigned long long)OCFS2_I(inode)->ip_blkno, 103 (unsigned long long)OCFS2_I(inode)->ip_blkno,
104 file->f_path.dentry->d_name.len, 104 file->f_path.dentry->d_name.len,
105 file->f_path.dentry->d_name.name, mode); 105 file->f_path.dentry->d_name.name, mode);
106 106
107 if (file->f_mode & FMODE_WRITE) 107 if (file->f_mode & FMODE_WRITE)
108 dquot_initialize(inode); 108 dquot_initialize(inode);
109 109
110 spin_lock(&oi->ip_lock); 110 spin_lock(&oi->ip_lock);
111 111
112 /* Check that the inode hasn't been wiped from disk by another 112 /* Check that the inode hasn't been wiped from disk by another
113 * node. If it hasn't then we're safe as long as we hold the 113 * node. If it hasn't then we're safe as long as we hold the
114 * spin lock until our increment of open count. */ 114 * spin lock until our increment of open count. */
115 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 115 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
116 spin_unlock(&oi->ip_lock); 116 spin_unlock(&oi->ip_lock);
117 117
118 status = -ENOENT; 118 status = -ENOENT;
119 goto leave; 119 goto leave;
120 } 120 }
121 121
122 if (mode & O_DIRECT) 122 if (mode & O_DIRECT)
123 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; 123 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
124 124
125 oi->ip_open_count++; 125 oi->ip_open_count++;
126 spin_unlock(&oi->ip_lock); 126 spin_unlock(&oi->ip_lock);
127 127
128 status = ocfs2_init_file_private(inode, file); 128 status = ocfs2_init_file_private(inode, file);
129 if (status) { 129 if (status) {
130 /* 130 /*
131 * We want to set open count back if we're failing the 131 * We want to set open count back if we're failing the
132 * open. 132 * open.
133 */ 133 */
134 spin_lock(&oi->ip_lock); 134 spin_lock(&oi->ip_lock);
135 oi->ip_open_count--; 135 oi->ip_open_count--;
136 spin_unlock(&oi->ip_lock); 136 spin_unlock(&oi->ip_lock);
137 } 137 }
138 138
139 leave: 139 leave:
140 return status; 140 return status;
141 } 141 }
142 142
143 static int ocfs2_file_release(struct inode *inode, struct file *file) 143 static int ocfs2_file_release(struct inode *inode, struct file *file)
144 { 144 {
145 struct ocfs2_inode_info *oi = OCFS2_I(inode); 145 struct ocfs2_inode_info *oi = OCFS2_I(inode);
146 146
147 spin_lock(&oi->ip_lock); 147 spin_lock(&oi->ip_lock);
148 if (!--oi->ip_open_count) 148 if (!--oi->ip_open_count)
149 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 149 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
150 150
151 trace_ocfs2_file_release(inode, file, file->f_path.dentry, 151 trace_ocfs2_file_release(inode, file, file->f_path.dentry,
152 oi->ip_blkno, 152 oi->ip_blkno,
153 file->f_path.dentry->d_name.len, 153 file->f_path.dentry->d_name.len,
154 file->f_path.dentry->d_name.name, 154 file->f_path.dentry->d_name.name,
155 oi->ip_open_count); 155 oi->ip_open_count);
156 spin_unlock(&oi->ip_lock); 156 spin_unlock(&oi->ip_lock);
157 157
158 ocfs2_free_file_private(inode, file); 158 ocfs2_free_file_private(inode, file);
159 159
160 return 0; 160 return 0;
161 } 161 }
162 162
163 static int ocfs2_dir_open(struct inode *inode, struct file *file) 163 static int ocfs2_dir_open(struct inode *inode, struct file *file)
164 { 164 {
165 return ocfs2_init_file_private(inode, file); 165 return ocfs2_init_file_private(inode, file);
166 } 166 }
167 167
168 static int ocfs2_dir_release(struct inode *inode, struct file *file) 168 static int ocfs2_dir_release(struct inode *inode, struct file *file)
169 { 169 {
170 ocfs2_free_file_private(inode, file); 170 ocfs2_free_file_private(inode, file);
171 return 0; 171 return 0;
172 } 172 }
173 173
174 static int ocfs2_sync_file(struct file *file, int datasync) 174 static int ocfs2_sync_file(struct file *file, int datasync)
175 { 175 {
176 int err = 0; 176 int err = 0;
177 journal_t *journal; 177 journal_t *journal;
178 struct inode *inode = file->f_mapping->host; 178 struct inode *inode = file->f_mapping->host;
179 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 179 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
180 180
181 trace_ocfs2_sync_file(inode, file, file->f_path.dentry, 181 trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
182 OCFS2_I(inode)->ip_blkno, 182 OCFS2_I(inode)->ip_blkno,
183 file->f_path.dentry->d_name.len, 183 file->f_path.dentry->d_name.len,
184 file->f_path.dentry->d_name.name, 184 file->f_path.dentry->d_name.name,
185 (unsigned long long)datasync); 185 (unsigned long long)datasync);
186 186
187 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { 187 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
188 /* 188 /*
189 * We still have to flush drive's caches to get data to the 189 * We still have to flush drive's caches to get data to the
190 * platter 190 * platter
191 */ 191 */
192 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) 192 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
193 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); 193 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
194 goto bail; 194 goto bail;
195 } 195 }
196 196
197 journal = osb->journal->j_journal; 197 journal = osb->journal->j_journal;
198 err = jbd2_journal_force_commit(journal); 198 err = jbd2_journal_force_commit(journal);
199 199
200 bail: 200 bail:
201 if (err) 201 if (err)
202 mlog_errno(err); 202 mlog_errno(err);
203 203
204 return (err < 0) ? -EIO : 0; 204 return (err < 0) ? -EIO : 0;
205 } 205 }
206 206
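The datasync shortcut above is the kernel side of the fsync(2)/fdatasync(2) split: fdatasync() must make the data and any metadata required to retrieve it durable, but may skip changes such as pure timestamp updates, which is why a bare disk-cache flush can suffice on that path. A small hedged user-space illustration, not part of this commit:

#include <unistd.h>

/* Make an appended record durable. fdatasync() flushes the data and
 * whatever metadata is needed to read it back (e.g. the new size);
 * unlike fsync() it may skip pure timestamp updates, matching the
 * cheaper no-journal-commit path in ocfs2_sync_file() above. */
static int append_durable(int fd, const void *buf, size_t len)
{
	if (write(fd, buf, len) != (ssize_t)len)
		return -1;
	return fdatasync(fd);
}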
207 int ocfs2_should_update_atime(struct inode *inode, 207 int ocfs2_should_update_atime(struct inode *inode,
208 struct vfsmount *vfsmnt) 208 struct vfsmount *vfsmnt)
209 { 209 {
210 struct timespec now; 210 struct timespec now;
211 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 211 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
212 212
213 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 213 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
214 return 0; 214 return 0;
215 215
216 if ((inode->i_flags & S_NOATIME) || 216 if ((inode->i_flags & S_NOATIME) ||
217 ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) 217 ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
218 return 0; 218 return 0;
219 219
220 /* 220 /*
221 * We can be called with no vfsmnt structure - NFSD will 221 * We can be called with no vfsmnt structure - NFSD will
222 * sometimes do this. 222 * sometimes do this.
223 * 223 *
224 * Note that our action here is different than touch_atime() - 224 * Note that our action here is different than touch_atime() -
225 * if we can't tell whether this is a noatime mount, then we 225 * if we can't tell whether this is a noatime mount, then we
226 * don't know whether to trust the value of s_atime_quantum. 226 * don't know whether to trust the value of s_atime_quantum.
227 */ 227 */
228 if (vfsmnt == NULL) 228 if (vfsmnt == NULL)
229 return 0; 229 return 0;
230 230
231 if ((vfsmnt->mnt_flags & MNT_NOATIME) || 231 if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
232 ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) 232 ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
233 return 0; 233 return 0;
234 234
235 if (vfsmnt->mnt_flags & MNT_RELATIME) { 235 if (vfsmnt->mnt_flags & MNT_RELATIME) {
236 if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || 236 if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
237 (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) 237 (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0))
238 return 1; 238 return 1;
239 239
240 return 0; 240 return 0;
241 } 241 }
242 242
243 now = CURRENT_TIME; 243 now = CURRENT_TIME;
244 if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) 244 if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
245 return 0; 245 return 0;
246 else 246 else
247 return 1; 247 return 1;
248 } 248 }
249 249
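The MNT_RELATIME branch above updates atime only when the stored atime is not strictly newer than both mtime and ctime; that keeps the "accessed since last modification" signal alive while avoiding a write on every read. A standalone restatement of that predicate, offered purely as an illustration:

#include <time.h>

/* Returns nonzero when a relatime mount should refresh atime,
 * mirroring the timespec_compare() checks in the code above. */
static int relatime_needs_update(const struct timespec *atime,
				 const struct timespec *mtime,
				 const struct timespec *ctim)
{
	/* atime <= mtime: the file was modified since last access */
	if (atime->tv_sec < mtime->tv_sec ||
	    (atime->tv_sec == mtime->tv_sec &&
	     atime->tv_nsec <= mtime->tv_nsec))
		return 1;
	/* atime <= ctime: the inode was changed since last access */
	if (atime->tv_sec < ctim->tv_sec ||
	    (atime->tv_sec == ctim->tv_sec &&
	     atime->tv_nsec <= ctim->tv_nsec))
		return 1;
	return 0;
}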
250 int ocfs2_update_inode_atime(struct inode *inode, 250 int ocfs2_update_inode_atime(struct inode *inode,
251 struct buffer_head *bh) 251 struct buffer_head *bh)
252 { 252 {
253 int ret; 253 int ret;
254 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 254 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
255 handle_t *handle; 255 handle_t *handle;
256 struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data; 256 struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
257 257
258 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 258 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
259 if (IS_ERR(handle)) { 259 if (IS_ERR(handle)) {
260 ret = PTR_ERR(handle); 260 ret = PTR_ERR(handle);
261 mlog_errno(ret); 261 mlog_errno(ret);
262 goto out; 262 goto out;
263 } 263 }
264 264
265 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, 265 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
266 OCFS2_JOURNAL_ACCESS_WRITE); 266 OCFS2_JOURNAL_ACCESS_WRITE);
267 if (ret) { 267 if (ret) {
268 mlog_errno(ret); 268 mlog_errno(ret);
269 goto out_commit; 269 goto out_commit;
270 } 270 }
271 271
272 /* 272 /*
273 * Don't use ocfs2_mark_inode_dirty() here as we don't always 273 * Don't use ocfs2_mark_inode_dirty() here as we don't always
274 * have i_mutex to guard against concurrent changes to other 274 * have i_mutex to guard against concurrent changes to other
275 * inode fields. 275 * inode fields.
276 */ 276 */
277 inode->i_atime = CURRENT_TIME; 277 inode->i_atime = CURRENT_TIME;
278 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); 278 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
279 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); 279 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
280 ocfs2_journal_dirty(handle, bh); 280 ocfs2_journal_dirty(handle, bh);
281 281
282 out_commit: 282 out_commit:
283 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 283 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
284 out: 284 out:
285 return ret; 285 return ret;
286 } 286 }
287 287
288 static int ocfs2_set_inode_size(handle_t *handle, 288 static int ocfs2_set_inode_size(handle_t *handle,
289 struct inode *inode, 289 struct inode *inode,
290 struct buffer_head *fe_bh, 290 struct buffer_head *fe_bh,
291 u64 new_i_size) 291 u64 new_i_size)
292 { 292 {
293 int status; 293 int status;
294 294
295 i_size_write(inode, new_i_size); 295 i_size_write(inode, new_i_size);
296 inode->i_blocks = ocfs2_inode_sector_count(inode); 296 inode->i_blocks = ocfs2_inode_sector_count(inode);
297 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 297 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
298 298
299 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 299 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
300 if (status < 0) { 300 if (status < 0) {
301 mlog_errno(status); 301 mlog_errno(status);
302 goto bail; 302 goto bail;
303 } 303 }
304 304
305 bail: 305 bail:
306 return status; 306 return status;
307 } 307 }
308 308
309 int ocfs2_simple_size_update(struct inode *inode, 309 int ocfs2_simple_size_update(struct inode *inode,
310 struct buffer_head *di_bh, 310 struct buffer_head *di_bh,
311 u64 new_i_size) 311 u64 new_i_size)
312 { 312 {
313 int ret; 313 int ret;
314 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 314 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
315 handle_t *handle = NULL; 315 handle_t *handle = NULL;
316 316
317 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 317 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
318 if (IS_ERR(handle)) { 318 if (IS_ERR(handle)) {
319 ret = PTR_ERR(handle); 319 ret = PTR_ERR(handle);
320 mlog_errno(ret); 320 mlog_errno(ret);
321 goto out; 321 goto out;
322 } 322 }
323 323
324 ret = ocfs2_set_inode_size(handle, inode, di_bh, 324 ret = ocfs2_set_inode_size(handle, inode, di_bh,
325 new_i_size); 325 new_i_size);
326 if (ret < 0) 326 if (ret < 0)
327 mlog_errno(ret); 327 mlog_errno(ret);
328 328
329 ocfs2_commit_trans(osb, handle); 329 ocfs2_commit_trans(osb, handle);
330 out: 330 out:
331 return ret; 331 return ret;
332 } 332 }
333 333
334 static int ocfs2_cow_file_pos(struct inode *inode, 334 static int ocfs2_cow_file_pos(struct inode *inode,
335 struct buffer_head *fe_bh, 335 struct buffer_head *fe_bh,
336 u64 offset) 336 u64 offset)
337 { 337 {
338 int status; 338 int status;
339 u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 339 u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
340 unsigned int num_clusters = 0; 340 unsigned int num_clusters = 0;
341 unsigned int ext_flags = 0; 341 unsigned int ext_flags = 0;
342 342
343 /* 343 /*
344 * If the new offset is aligned to the range of the cluster, there is 344 * If the new offset is aligned to the range of the cluster, there is
345 * no space for ocfs2_zero_range_for_truncate to fill, so no need to 345 * no space for ocfs2_zero_range_for_truncate to fill, so no need to
346 * CoW either. 346 * CoW either.
347 */ 347 */
348 if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0) 348 if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
349 return 0; 349 return 0;
350 350
351 status = ocfs2_get_clusters(inode, cpos, &phys, 351 status = ocfs2_get_clusters(inode, cpos, &phys,
352 &num_clusters, &ext_flags); 352 &num_clusters, &ext_flags);
353 if (status) { 353 if (status) {
354 mlog_errno(status); 354 mlog_errno(status);
355 goto out; 355 goto out;
356 } 356 }
357 357
358 if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) 358 if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
359 goto out; 359 goto out;
360 360
361 return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1); 361 return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1);
362 362
363 out: 363 out:
364 return status; 364 return status;
365 } 365 }
366 366
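The alignment check in ocfs2_cow_file_pos() above is plain power-of-two arithmetic: an offset is cluster-aligned exactly when its low log2(clustersize) bits are zero, in which case there is no cluster tail left for ocfs2_zero_range_for_truncate() to fill. A tiny worked example (64 KB is merely an assumed cluster size):

#include <assert.h>

int main(void)
{
	unsigned long clustersize = 64 * 1024;	/* assumed: 64 KB clusters */

	/* aligned offset: masking with (clustersize - 1) yields 0 */
	assert((131072UL & (clustersize - 1)) == 0);
	/* mid-cluster offset: the mask yields the distance into the
	 * cluster, i.e. the tail that truncate-zeroing must fill */
	assert(((131072UL + 5000) & (clustersize - 1)) == 5000);
	return 0;
}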
367 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, 367 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
368 struct inode *inode, 368 struct inode *inode,
369 struct buffer_head *fe_bh, 369 struct buffer_head *fe_bh,
370 u64 new_i_size) 370 u64 new_i_size)
371 { 371 {
372 int status; 372 int status;
373 handle_t *handle; 373 handle_t *handle;
374 struct ocfs2_dinode *di; 374 struct ocfs2_dinode *di;
375 u64 cluster_bytes; 375 u64 cluster_bytes;
376 376
377 /* 377 /*
378 * We need to CoW the cluster that contains the offset if it is reflinked 378 * We need to CoW the cluster that contains the offset if it is reflinked
379 * since we will call ocfs2_zero_range_for_truncate later which will 379 * since we will call ocfs2_zero_range_for_truncate later which will
380 * write "0" from offset to the end of the cluster. 380 * write "0" from offset to the end of the cluster.
381 */ 381 */
382 status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size); 382 status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
383 if (status) { 383 if (status) {
384 mlog_errno(status); 384 mlog_errno(status);
385 return status; 385 return status;
386 } 386 }
387 387
388 /* TODO: This needs to actually orphan the inode in this 388 /* TODO: This needs to actually orphan the inode in this
389 * transaction. */ 389 * transaction. */
390 390
391 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 391 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
392 if (IS_ERR(handle)) { 392 if (IS_ERR(handle)) {
393 status = PTR_ERR(handle); 393 status = PTR_ERR(handle);
394 mlog_errno(status); 394 mlog_errno(status);
395 goto out; 395 goto out;
396 } 396 }
397 397
398 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, 398 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
399 OCFS2_JOURNAL_ACCESS_WRITE); 399 OCFS2_JOURNAL_ACCESS_WRITE);
400 if (status < 0) { 400 if (status < 0) {
401 mlog_errno(status); 401 mlog_errno(status);
402 goto out_commit; 402 goto out_commit;
403 } 403 }
404 404
405 /* 405 /*
406 * Do this before setting i_size. 406 * Do this before setting i_size.
407 */ 407 */
408 cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); 408 cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
409 status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size, 409 status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
410 cluster_bytes); 410 cluster_bytes);
411 if (status) { 411 if (status) {
412 mlog_errno(status); 412 mlog_errno(status);
413 goto out_commit; 413 goto out_commit;
414 } 414 }
415 415
416 i_size_write(inode, new_i_size); 416 i_size_write(inode, new_i_size);
417 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 417 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
418 418
419 di = (struct ocfs2_dinode *) fe_bh->b_data; 419 di = (struct ocfs2_dinode *) fe_bh->b_data;
420 di->i_size = cpu_to_le64(new_i_size); 420 di->i_size = cpu_to_le64(new_i_size);
421 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 421 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
422 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 422 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
423 423
424 ocfs2_journal_dirty(handle, fe_bh); 424 ocfs2_journal_dirty(handle, fe_bh);
425 425
426 out_commit: 426 out_commit:
427 ocfs2_commit_trans(osb, handle); 427 ocfs2_commit_trans(osb, handle);
428 out: 428 out:
429 return status; 429 return status;
430 } 430 }
431 431
432 static int ocfs2_truncate_file(struct inode *inode, 432 static int ocfs2_truncate_file(struct inode *inode,
433 struct buffer_head *di_bh, 433 struct buffer_head *di_bh,
434 u64 new_i_size) 434 u64 new_i_size)
435 { 435 {
436 int status = 0; 436 int status = 0;
437 struct ocfs2_dinode *fe = NULL; 437 struct ocfs2_dinode *fe = NULL;
438 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 438 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
439 439
440 /* We trust di_bh because it comes from ocfs2_inode_lock(), which 440 /* We trust di_bh because it comes from ocfs2_inode_lock(), which
441 * already validated it */ 441 * already validated it */
442 fe = (struct ocfs2_dinode *) di_bh->b_data; 442 fe = (struct ocfs2_dinode *) di_bh->b_data;
443 443
444 trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno, 444 trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
445 (unsigned long long)le64_to_cpu(fe->i_size), 445 (unsigned long long)le64_to_cpu(fe->i_size),
446 (unsigned long long)new_i_size); 446 (unsigned long long)new_i_size);
447 447
448 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), 448 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
449 "Inode %llu, inode i_size = %lld != di " 449 "Inode %llu, inode i_size = %lld != di "
450 "i_size = %llu, i_flags = 0x%x\n", 450 "i_size = %llu, i_flags = 0x%x\n",
451 (unsigned long long)OCFS2_I(inode)->ip_blkno, 451 (unsigned long long)OCFS2_I(inode)->ip_blkno,
452 i_size_read(inode), 452 i_size_read(inode),
453 (unsigned long long)le64_to_cpu(fe->i_size), 453 (unsigned long long)le64_to_cpu(fe->i_size),
454 le32_to_cpu(fe->i_flags)); 454 le32_to_cpu(fe->i_flags));
455 455
456 if (new_i_size > le64_to_cpu(fe->i_size)) { 456 if (new_i_size > le64_to_cpu(fe->i_size)) {
457 trace_ocfs2_truncate_file_error( 457 trace_ocfs2_truncate_file_error(
458 (unsigned long long)le64_to_cpu(fe->i_size), 458 (unsigned long long)le64_to_cpu(fe->i_size),
459 (unsigned long long)new_i_size); 459 (unsigned long long)new_i_size);
460 status = -EINVAL; 460 status = -EINVAL;
461 mlog_errno(status); 461 mlog_errno(status);
462 goto bail; 462 goto bail;
463 } 463 }
464 464
465 /* let's handle the simple truncate cases before doing any more 465 /* let's handle the simple truncate cases before doing any more
466 * cluster locking. */ 466 * cluster locking. */
467 if (new_i_size == le64_to_cpu(fe->i_size)) 467 if (new_i_size == le64_to_cpu(fe->i_size))
468 goto bail; 468 goto bail;
469 469
470 down_write(&OCFS2_I(inode)->ip_alloc_sem); 470 down_write(&OCFS2_I(inode)->ip_alloc_sem);
471 471
472 ocfs2_resv_discard(&osb->osb_la_resmap, 472 ocfs2_resv_discard(&osb->osb_la_resmap,
473 &OCFS2_I(inode)->ip_la_data_resv); 473 &OCFS2_I(inode)->ip_la_data_resv);
474 474
475 /* 475 /*
476 * The inode lock forced other nodes to sync and drop their 476 * The inode lock forced other nodes to sync and drop their
477 * pages, which (correctly) happens even if we have a truncate 477 * pages, which (correctly) happens even if we have a truncate
478 * without allocation change - ocfs2 cluster sizes can be much 478 * without allocation change - ocfs2 cluster sizes can be much
479 * greater than page size, so we have to truncate them 479 * greater than page size, so we have to truncate them
480 * anyway. 480 * anyway.
481 */ 481 */
482 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); 482 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
483 truncate_inode_pages(inode->i_mapping, new_i_size); 483 truncate_inode_pages(inode->i_mapping, new_i_size);
484 484
485 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 485 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
486 status = ocfs2_truncate_inline(inode, di_bh, new_i_size, 486 status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
487 i_size_read(inode), 1); 487 i_size_read(inode), 1);
488 if (status) 488 if (status)
489 mlog_errno(status); 489 mlog_errno(status);
490 490
491 goto bail_unlock_sem; 491 goto bail_unlock_sem;
492 } 492 }
493 493
494 /* alright, we're going to need to do a full blown alloc size 494 /* alright, we're going to need to do a full blown alloc size
495 * change. Orphan the inode so that recovery can complete the 495 * change. Orphan the inode so that recovery can complete the
496 * truncate if necessary. This does the task of marking 496 * truncate if necessary. This does the task of marking
497 * i_size. */ 497 * i_size. */
498 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 498 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
499 if (status < 0) { 499 if (status < 0) {
500 mlog_errno(status); 500 mlog_errno(status);
501 goto bail_unlock_sem; 501 goto bail_unlock_sem;
502 } 502 }
503 503
504 status = ocfs2_commit_truncate(osb, inode, di_bh); 504 status = ocfs2_commit_truncate(osb, inode, di_bh);
505 if (status < 0) { 505 if (status < 0) {
506 mlog_errno(status); 506 mlog_errno(status);
507 goto bail_unlock_sem; 507 goto bail_unlock_sem;
508 } 508 }
509 509
510 /* TODO: orphan dir cleanup here. */ 510 /* TODO: orphan dir cleanup here. */
511 bail_unlock_sem: 511 bail_unlock_sem:
512 up_write(&OCFS2_I(inode)->ip_alloc_sem); 512 up_write(&OCFS2_I(inode)->ip_alloc_sem);
513 513
514 bail: 514 bail:
515 if (!status && OCFS2_I(inode)->ip_clusters == 0) 515 if (!status && OCFS2_I(inode)->ip_clusters == 0)
516 status = ocfs2_try_remove_refcount_tree(inode, di_bh); 516 status = ocfs2_try_remove_refcount_tree(inode, di_bh);
517 517
518 return status; 518 return status;
519 } 519 }
520 520
521 /* 521 /*
522 * extend file allocation only here. 522 * extend file allocation only here.
523 * we'll update all the disk stuff, and oip->alloc_size 523 * we'll update all the disk stuff, and oip->alloc_size
524 * 524 *
525 * expect stuff to be locked, a transaction started and enough data / 525 * expect stuff to be locked, a transaction started and enough data /
526 * metadata reservations in the contexts. 526 * metadata reservations in the contexts.
527 * 527 *
528 * Will return -EAGAIN, and a reason if a restart is needed. 528 * Will return -EAGAIN, and a reason if a restart is needed.
529 * If passed in, *reason will always be set, even in error. 529 * If passed in, *reason will always be set, even in error.
530 */ 530 */
531 int ocfs2_add_inode_data(struct ocfs2_super *osb, 531 int ocfs2_add_inode_data(struct ocfs2_super *osb,
532 struct inode *inode, 532 struct inode *inode,
533 u32 *logical_offset, 533 u32 *logical_offset,
534 u32 clusters_to_add, 534 u32 clusters_to_add,
535 int mark_unwritten, 535 int mark_unwritten,
536 struct buffer_head *fe_bh, 536 struct buffer_head *fe_bh,
537 handle_t *handle, 537 handle_t *handle,
538 struct ocfs2_alloc_context *data_ac, 538 struct ocfs2_alloc_context *data_ac,
539 struct ocfs2_alloc_context *meta_ac, 539 struct ocfs2_alloc_context *meta_ac,
540 enum ocfs2_alloc_restarted *reason_ret) 540 enum ocfs2_alloc_restarted *reason_ret)
541 { 541 {
542 int ret; 542 int ret;
543 struct ocfs2_extent_tree et; 543 struct ocfs2_extent_tree et;
544 544
545 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh); 545 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
546 ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset, 546 ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
547 clusters_to_add, mark_unwritten, 547 clusters_to_add, mark_unwritten,
548 data_ac, meta_ac, reason_ret); 548 data_ac, meta_ac, reason_ret);
549 549
550 return ret; 550 return ret;
551 } 551 }
552 552
553 static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, 553 static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
554 u32 clusters_to_add, int mark_unwritten) 554 u32 clusters_to_add, int mark_unwritten)
555 { 555 {
556 int status = 0; 556 int status = 0;
557 int restart_func = 0; 557 int restart_func = 0;
558 int credits; 558 int credits;
559 u32 prev_clusters; 559 u32 prev_clusters;
560 struct buffer_head *bh = NULL; 560 struct buffer_head *bh = NULL;
561 struct ocfs2_dinode *fe = NULL; 561 struct ocfs2_dinode *fe = NULL;
562 handle_t *handle = NULL; 562 handle_t *handle = NULL;
563 struct ocfs2_alloc_context *data_ac = NULL; 563 struct ocfs2_alloc_context *data_ac = NULL;
564 struct ocfs2_alloc_context *meta_ac = NULL; 564 struct ocfs2_alloc_context *meta_ac = NULL;
565 enum ocfs2_alloc_restarted why; 565 enum ocfs2_alloc_restarted why;
566 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 566 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
567 struct ocfs2_extent_tree et; 567 struct ocfs2_extent_tree et;
568 int did_quota = 0; 568 int did_quota = 0;
569 569
570 /* 570 /*
571 * This function only exists for file systems which don't 571 * This function only exists for file systems which don't
572 * support holes. 572 * support holes.
573 */ 573 */
574 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); 574 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
575 575
576 status = ocfs2_read_inode_block(inode, &bh); 576 status = ocfs2_read_inode_block(inode, &bh);
577 if (status < 0) { 577 if (status < 0) {
578 mlog_errno(status); 578 mlog_errno(status);
579 goto leave; 579 goto leave;
580 } 580 }
581 fe = (struct ocfs2_dinode *) bh->b_data; 581 fe = (struct ocfs2_dinode *) bh->b_data;
582 582
583 restart_all: 583 restart_all:
584 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 584 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
585 585
586 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh); 586 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
587 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, 587 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
588 &data_ac, &meta_ac); 588 &data_ac, &meta_ac);
589 if (status) { 589 if (status) {
590 mlog_errno(status); 590 mlog_errno(status);
591 goto leave; 591 goto leave;
592 } 592 }
593 593
594 credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list, 594 credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
595 clusters_to_add); 595 clusters_to_add);
596 handle = ocfs2_start_trans(osb, credits); 596 handle = ocfs2_start_trans(osb, credits);
597 if (IS_ERR(handle)) { 597 if (IS_ERR(handle)) {
598 status = PTR_ERR(handle); 598 status = PTR_ERR(handle);
599 handle = NULL; 599 handle = NULL;
600 mlog_errno(status); 600 mlog_errno(status);
601 goto leave; 601 goto leave;
602 } 602 }
603 603
604 restarted_transaction: 604 restarted_transaction:
605 trace_ocfs2_extend_allocation( 605 trace_ocfs2_extend_allocation(
606 (unsigned long long)OCFS2_I(inode)->ip_blkno, 606 (unsigned long long)OCFS2_I(inode)->ip_blkno,
607 (unsigned long long)i_size_read(inode), 607 (unsigned long long)i_size_read(inode),
608 le32_to_cpu(fe->i_clusters), clusters_to_add, 608 le32_to_cpu(fe->i_clusters), clusters_to_add,
609 why, restart_func); 609 why, restart_func);
610 610
611 status = dquot_alloc_space_nodirty(inode, 611 status = dquot_alloc_space_nodirty(inode,
612 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 612 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
613 if (status) 613 if (status)
614 goto leave; 614 goto leave;
615 did_quota = 1; 615 did_quota = 1;
616 616
617 /* reserve a write to the file entry early on - that way if we 617 /* reserve a write to the file entry early on - that way if we
618 * run out of credits in the allocation path, we can still 618 * run out of credits in the allocation path, we can still
619 * update i_size. */ 619 * update i_size. */
620 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, 620 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
621 OCFS2_JOURNAL_ACCESS_WRITE); 621 OCFS2_JOURNAL_ACCESS_WRITE);
622 if (status < 0) { 622 if (status < 0) {
623 mlog_errno(status); 623 mlog_errno(status);
624 goto leave; 624 goto leave;
625 } 625 }
626 626
627 prev_clusters = OCFS2_I(inode)->ip_clusters; 627 prev_clusters = OCFS2_I(inode)->ip_clusters;
628 628
629 status = ocfs2_add_inode_data(osb, 629 status = ocfs2_add_inode_data(osb,
630 inode, 630 inode,
631 &logical_start, 631 &logical_start,
632 clusters_to_add, 632 clusters_to_add,
633 mark_unwritten, 633 mark_unwritten,
634 bh, 634 bh,
635 handle, 635 handle,
636 data_ac, 636 data_ac,
637 meta_ac, 637 meta_ac,
638 &why); 638 &why);
639 if ((status < 0) && (status != -EAGAIN)) { 639 if ((status < 0) && (status != -EAGAIN)) {
640 if (status != -ENOSPC) 640 if (status != -ENOSPC)
641 mlog_errno(status); 641 mlog_errno(status);
642 goto leave; 642 goto leave;
643 } 643 }
644 644
645 ocfs2_journal_dirty(handle, bh); 645 ocfs2_journal_dirty(handle, bh);
646 646
647 spin_lock(&OCFS2_I(inode)->ip_lock); 647 spin_lock(&OCFS2_I(inode)->ip_lock);
648 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 648 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
649 spin_unlock(&OCFS2_I(inode)->ip_lock); 649 spin_unlock(&OCFS2_I(inode)->ip_lock);
650 /* Release unused quota reservation */ 650 /* Release unused quota reservation */
651 dquot_free_space(inode, 651 dquot_free_space(inode,
652 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 652 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
653 did_quota = 0; 653 did_quota = 0;
654 654
655 if (why != RESTART_NONE && clusters_to_add) { 655 if (why != RESTART_NONE && clusters_to_add) {
656 if (why == RESTART_META) { 656 if (why == RESTART_META) {
657 restart_func = 1; 657 restart_func = 1;
658 status = 0; 658 status = 0;
659 } else { 659 } else {
660 BUG_ON(why != RESTART_TRANS); 660 BUG_ON(why != RESTART_TRANS);
661 661
662 /* TODO: This can be more intelligent. */ 662 /* TODO: This can be more intelligent. */
663 credits = ocfs2_calc_extend_credits(osb->sb, 663 credits = ocfs2_calc_extend_credits(osb->sb,
664 &fe->id2.i_list, 664 &fe->id2.i_list,
665 clusters_to_add); 665 clusters_to_add);
666 status = ocfs2_extend_trans(handle, credits); 666 status = ocfs2_extend_trans(handle, credits);
667 if (status < 0) { 667 if (status < 0) {
668 /* handle still has to be committed at 668 /* handle still has to be committed at
669 * this point. */ 669 * this point. */
670 status = -ENOMEM; 670 status = -ENOMEM;
671 mlog_errno(status); 671 mlog_errno(status);
672 goto leave; 672 goto leave;
673 } 673 }
674 goto restarted_transaction; 674 goto restarted_transaction;
675 } 675 }
676 } 676 }
677 677
678 trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno, 678 trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
679 le32_to_cpu(fe->i_clusters), 679 le32_to_cpu(fe->i_clusters),
680 (unsigned long long)le64_to_cpu(fe->i_size), 680 (unsigned long long)le64_to_cpu(fe->i_size),
681 OCFS2_I(inode)->ip_clusters, 681 OCFS2_I(inode)->ip_clusters,
682 (unsigned long long)i_size_read(inode)); 682 (unsigned long long)i_size_read(inode));
683 683
684 leave: 684 leave:
685 if (status < 0 && did_quota) 685 if (status < 0 && did_quota)
686 dquot_free_space(inode, 686 dquot_free_space(inode,
687 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 687 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
688 if (handle) { 688 if (handle) {
689 ocfs2_commit_trans(osb, handle); 689 ocfs2_commit_trans(osb, handle);
690 handle = NULL; 690 handle = NULL;
691 } 691 }
692 if (data_ac) { 692 if (data_ac) {
693 ocfs2_free_alloc_context(data_ac); 693 ocfs2_free_alloc_context(data_ac);
694 data_ac = NULL; 694 data_ac = NULL;
695 } 695 }
696 if (meta_ac) { 696 if (meta_ac) {
697 ocfs2_free_alloc_context(meta_ac); 697 ocfs2_free_alloc_context(meta_ac);
698 meta_ac = NULL; 698 meta_ac = NULL;
699 } 699 }
700 if ((!status) && restart_func) { 700 if ((!status) && restart_func) {
701 restart_func = 0; 701 restart_func = 0;
702 goto restart_all; 702 goto restart_all;
703 } 703 }
704 brelse(bh); 704 brelse(bh);
705 bh = NULL; 705 bh = NULL;
706 706
707 return status; 707 return status;
708 } 708 }
709 709
710 /* 710 /*
711 * While a write will already be ordering the data, a truncate will not. 711 * While a write will already be ordering the data, a truncate will not.
712 * Thus, we need to explicitly order the zeroed pages. 712 * Thus, we need to explicitly order the zeroed pages.
713 */ 713 */
714 static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode) 714 static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
715 { 715 {
716 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 716 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
717 handle_t *handle = NULL; 717 handle_t *handle = NULL;
718 int ret = 0; 718 int ret = 0;
719 719
720 if (!ocfs2_should_order_data(inode)) 720 if (!ocfs2_should_order_data(inode))
721 goto out; 721 goto out;
722 722
723 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 723 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
724 if (IS_ERR(handle)) { 724 if (IS_ERR(handle)) {
725 ret = -ENOMEM; 725 ret = -ENOMEM;
726 mlog_errno(ret); 726 mlog_errno(ret);
727 goto out; 727 goto out;
728 } 728 }
729 729
730 ret = ocfs2_jbd2_file_inode(handle, inode); 730 ret = ocfs2_jbd2_file_inode(handle, inode);
731 if (ret < 0) 731 if (ret < 0)
732 mlog_errno(ret); 732 mlog_errno(ret);
733 733
734 out: 734 out:
735 if (ret) { 735 if (ret) {
736 if (!IS_ERR(handle)) 736 if (!IS_ERR(handle))
737 ocfs2_commit_trans(osb, handle); 737 ocfs2_commit_trans(osb, handle);
738 handle = ERR_PTR(ret); 738 handle = ERR_PTR(ret);
739 } 739 }
740 return handle; 740 return handle;
741 } 741 }
742 742
743 /* Some parts of this taken from generic_cont_expand, which turned out 743 /* Some parts of this taken from generic_cont_expand, which turned out
744 * to be too fragile to do exactly what we need without us having to 744 * to be too fragile to do exactly what we need without us having to
745 * worry about recursive locking in ->write_begin() and ->write_end(). */ 745 * worry about recursive locking in ->write_begin() and ->write_end(). */
746 static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, 746 static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
747 u64 abs_to) 747 u64 abs_to)
748 { 748 {
749 struct address_space *mapping = inode->i_mapping; 749 struct address_space *mapping = inode->i_mapping;
750 struct page *page; 750 struct page *page;
751 unsigned long index = abs_from >> PAGE_CACHE_SHIFT; 751 unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
752 handle_t *handle = NULL; 752 handle_t *handle = NULL;
753 int ret = 0; 753 int ret = 0;
754 unsigned zero_from, zero_to, block_start, block_end; 754 unsigned zero_from, zero_to, block_start, block_end;
755 755
756 BUG_ON(abs_from >= abs_to); 756 BUG_ON(abs_from >= abs_to);
757 BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); 757 BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
758 BUG_ON(abs_from & (inode->i_blkbits - 1)); 758 BUG_ON(abs_from & (inode->i_blkbits - 1));
759 759
760 page = find_or_create_page(mapping, index, GFP_NOFS); 760 page = find_or_create_page(mapping, index, GFP_NOFS);
761 if (!page) { 761 if (!page) {
762 ret = -ENOMEM; 762 ret = -ENOMEM;
763 mlog_errno(ret); 763 mlog_errno(ret);
764 goto out; 764 goto out;
765 } 765 }
766 766
767 /* Get the offsets within the page that we want to zero */ 767 /* Get the offsets within the page that we want to zero */
768 zero_from = abs_from & (PAGE_CACHE_SIZE - 1); 768 zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
769 zero_to = abs_to & (PAGE_CACHE_SIZE - 1); 769 zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
770 if (!zero_to) 770 if (!zero_to)
771 zero_to = PAGE_CACHE_SIZE; 771 zero_to = PAGE_CACHE_SIZE;
772 772
773 trace_ocfs2_write_zero_page( 773 trace_ocfs2_write_zero_page(
774 (unsigned long long)OCFS2_I(inode)->ip_blkno, 774 (unsigned long long)OCFS2_I(inode)->ip_blkno,
775 (unsigned long long)abs_from, 775 (unsigned long long)abs_from,
776 (unsigned long long)abs_to, 776 (unsigned long long)abs_to,
777 index, zero_from, zero_to); 777 index, zero_from, zero_to);
778 778
779 /* We know that zero_from is block aligned */ 779 /* We know that zero_from is block aligned */
780 for (block_start = zero_from; block_start < zero_to; 780 for (block_start = zero_from; block_start < zero_to;
781 block_start = block_end) { 781 block_start = block_end) {
782 block_end = block_start + (1 << inode->i_blkbits); 782 block_end = block_start + (1 << inode->i_blkbits);
783 783
784 /* 784 /*
785 * block_start is block-aligned. Bump it by one to force 785 * block_start is block-aligned. Bump it by one to force
786 * __block_write_begin and block_commit_write to zero the 786 * __block_write_begin and block_commit_write to zero the
787 * whole block. 787 * whole block.
788 */ 788 */
789 ret = __block_write_begin(page, block_start + 1, 0, 789 ret = __block_write_begin(page, block_start + 1, 0,
790 ocfs2_get_block); 790 ocfs2_get_block);
791 if (ret < 0) { 791 if (ret < 0) {
792 mlog_errno(ret); 792 mlog_errno(ret);
793 goto out_unlock; 793 goto out_unlock;
794 } 794 }
795 795
796 if (!handle) { 796 if (!handle) {
797 handle = ocfs2_zero_start_ordered_transaction(inode); 797 handle = ocfs2_zero_start_ordered_transaction(inode);
798 if (IS_ERR(handle)) { 798 if (IS_ERR(handle)) {
799 ret = PTR_ERR(handle); 799 ret = PTR_ERR(handle);
800 handle = NULL; 800 handle = NULL;
801 break; 801 break;
802 } 802 }
803 } 803 }
804 804
805 /* must not update i_size! */ 805 /* must not update i_size! */
806 ret = block_commit_write(page, block_start + 1, 806 ret = block_commit_write(page, block_start + 1,
807 block_start + 1); 807 block_start + 1);
808 if (ret < 0) 808 if (ret < 0)
809 mlog_errno(ret); 809 mlog_errno(ret);
810 else 810 else
811 ret = 0; 811 ret = 0;
812 } 812 }
813 813
814 if (handle) 814 if (handle)
815 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 815 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
816 816
817 out_unlock: 817 out_unlock:
818 unlock_page(page); 818 unlock_page(page);
819 page_cache_release(page); 819 page_cache_release(page);
820 out: 820 out:
821 return ret; 821 return ret;
822 } 822 }
823 823
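ocfs2_write_zero_page() above works through the page cache one block at a time; passing block_start + 1 is just a one-byte-wide dirty range that makes __block_write_begin()/block_commit_write() zero and write out the enclosing block. At the file level the net effect is the same as writing zeros over the byte range, which the following hedged user-space sketch (not part of this commit) does directly:

#include <string.h>
#include <unistd.h>
#include <sys/types.h>

/* Write zeros over [from, to) with pwrite(); a user-space analogue
 * of the page-at-a-time zeroing done by the kernel helper above. */
static int zero_byte_range(int fd, off_t from, off_t to)
{
	char zeros[4096];
	ssize_t n;

	memset(zeros, 0, sizeof(zeros));
	while (from < to) {
		size_t chunk = sizeof(zeros);

		if ((off_t)chunk > to - from)
			chunk = to - from;
		n = pwrite(fd, zeros, chunk, from);
		if (n < 0)
			return -1;
		from += n;
	}
	return 0;
}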
824 /* 824 /*
825 * Find the next range to zero. We do this in terms of bytes because 825 * Find the next range to zero. We do this in terms of bytes because
826 * that's what ocfs2_zero_extend() wants, and it is dealing with the 826 * that's what ocfs2_zero_extend() wants, and it is dealing with the
827 * pagecache. We may return multiple extents. 827 * pagecache. We may return multiple extents.
828 * 828 *
829 * zero_start and zero_end are ocfs2_zero_extend()'s current idea of what 829 * zero_start and zero_end are ocfs2_zero_extend()'s current idea of what
830 * needs to be zeroed. range_start and range_end return the next zeroing 830 * needs to be zeroed. range_start and range_end return the next zeroing
831 * range. A subsequent call should pass the previous range_end as its 831 * range. A subsequent call should pass the previous range_end as its
832 * zero_start. If range_end is 0, there's nothing to do. 832 * zero_start. If range_end is 0, there's nothing to do.
833 * 833 *
834 * Unwritten extents are skipped over. Refcounted extents are CoW'd. 834 * Unwritten extents are skipped over. Refcounted extents are CoW'd.
835 */ 835 */
836 static int ocfs2_zero_extend_get_range(struct inode *inode, 836 static int ocfs2_zero_extend_get_range(struct inode *inode,
837 struct buffer_head *di_bh, 837 struct buffer_head *di_bh,
838 u64 zero_start, u64 zero_end, 838 u64 zero_start, u64 zero_end,
839 u64 *range_start, u64 *range_end) 839 u64 *range_start, u64 *range_end)
840 { 840 {
841 int rc = 0, needs_cow = 0; 841 int rc = 0, needs_cow = 0;
842 u32 p_cpos, zero_clusters = 0; 842 u32 p_cpos, zero_clusters = 0;
843 u32 zero_cpos = 843 u32 zero_cpos =
844 zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 844 zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
845 u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end); 845 u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
846 unsigned int num_clusters = 0; 846 unsigned int num_clusters = 0;
847 unsigned int ext_flags = 0; 847 unsigned int ext_flags = 0;
848 848
849 while (zero_cpos < last_cpos) { 849 while (zero_cpos < last_cpos) {
850 rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos, 850 rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
851 &num_clusters, &ext_flags); 851 &num_clusters, &ext_flags);
852 if (rc) { 852 if (rc) {
853 mlog_errno(rc); 853 mlog_errno(rc);
854 goto out; 854 goto out;
855 } 855 }
856 856
857 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { 857 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
858 zero_clusters = num_clusters; 858 zero_clusters = num_clusters;
859 if (ext_flags & OCFS2_EXT_REFCOUNTED) 859 if (ext_flags & OCFS2_EXT_REFCOUNTED)
860 needs_cow = 1; 860 needs_cow = 1;
861 break; 861 break;
862 } 862 }
863 863
864 zero_cpos += num_clusters; 864 zero_cpos += num_clusters;
865 } 865 }
866 if (!zero_clusters) { 866 if (!zero_clusters) {
867 *range_end = 0; 867 *range_end = 0;
868 goto out; 868 goto out;
869 } 869 }
870 870
871 while ((zero_cpos + zero_clusters) < last_cpos) { 871 while ((zero_cpos + zero_clusters) < last_cpos) {
872 rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters, 872 rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
873 &p_cpos, &num_clusters, 873 &p_cpos, &num_clusters,
874 &ext_flags); 874 &ext_flags);
875 if (rc) { 875 if (rc) {
876 mlog_errno(rc); 876 mlog_errno(rc);
877 goto out; 877 goto out;
878 } 878 }
879 879
880 if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)) 880 if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
881 break; 881 break;
882 if (ext_flags & OCFS2_EXT_REFCOUNTED) 882 if (ext_flags & OCFS2_EXT_REFCOUNTED)
883 needs_cow = 1; 883 needs_cow = 1;
884 zero_clusters += num_clusters; 884 zero_clusters += num_clusters;
885 } 885 }
886 if ((zero_cpos + zero_clusters) > last_cpos) 886 if ((zero_cpos + zero_clusters) > last_cpos)
887 zero_clusters = last_cpos - zero_cpos; 887 zero_clusters = last_cpos - zero_cpos;
888 888
889 if (needs_cow) { 889 if (needs_cow) {
890 rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos, 890 rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
891 zero_clusters, UINT_MAX); 891 zero_clusters, UINT_MAX);
892 if (rc) { 892 if (rc) {
893 mlog_errno(rc); 893 mlog_errno(rc);
894 goto out; 894 goto out;
895 } 895 }
896 } 896 }
897 897
898 *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos); 898 *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
899 *range_end = ocfs2_clusters_to_bytes(inode->i_sb, 899 *range_end = ocfs2_clusters_to_bytes(inode->i_sb,
900 zero_cpos + zero_clusters); 900 zero_cpos + zero_clusters);
901 901
902 out: 902 out:
903 return rc; 903 return rc;
904 } 904 }
905 905
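ocfs2_zero_extend_get_range() above walks the extent map to find the next written region that actually needs zeroing, skipping holes and unwritten extents. User space can approximate the same walk with lseek(2)'s SEEK_DATA/SEEK_HOLE; a hedged sketch, illustrative only and not part of this commit:

#define _GNU_SOURCE	/* for SEEK_DATA/SEEK_HOLE */
#include <unistd.h>
#include <stdio.h>
#include <fcntl.h>

/* Print the written (data) ranges of a file up to 'end', the rough
 * analogue of repeatedly asking "where is the next range to zero?". */
static int print_data_ranges(int fd, off_t end)
{
	off_t data = 0, hole;

	while (data < end) {
		data = lseek(fd, data, SEEK_DATA);
		if (data < 0)
			break;	/* no more data (or not supported) */
		hole = lseek(fd, data, SEEK_HOLE);
		if (hole < 0)
			return -1;
		printf("data: [%lld, %lld)\n",
		       (long long)data, (long long)hole);
		data = hole;
	}
	return 0;
}

A caller would pass the file size (e.g. from fstat()) as end; each printed range corresponds to what the kernel routine above reports through *range_start/*range_end.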
906 /* 906 /*
907 * Zero one range returned from ocfs2_zero_extend_get_range(). The caller 907 * Zero one range returned from ocfs2_zero_extend_get_range(). The caller
908 * has made sure that the entire range needs zeroing. 908 * has made sure that the entire range needs zeroing.
909 */ 909 */
910 static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, 910 static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
911 u64 range_end) 911 u64 range_end)
912 { 912 {
913 int rc = 0; 913 int rc = 0;
914 u64 next_pos; 914 u64 next_pos;
915 u64 zero_pos = range_start; 915 u64 zero_pos = range_start;
916 916
917 trace_ocfs2_zero_extend_range( 917 trace_ocfs2_zero_extend_range(
918 (unsigned long long)OCFS2_I(inode)->ip_blkno, 918 (unsigned long long)OCFS2_I(inode)->ip_blkno,
919 (unsigned long long)range_start, 919 (unsigned long long)range_start,
920 (unsigned long long)range_end); 920 (unsigned long long)range_end);
921 BUG_ON(range_start >= range_end); 921 BUG_ON(range_start >= range_end);
922 922
923 while (zero_pos < range_end) { 923 while (zero_pos < range_end) {
924 next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; 924 next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
925 if (next_pos > range_end) 925 if (next_pos > range_end)
926 next_pos = range_end; 926 next_pos = range_end;
927 rc = ocfs2_write_zero_page(inode, zero_pos, next_pos); 927 rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
928 if (rc < 0) { 928 if (rc < 0) {
929 mlog_errno(rc); 929 mlog_errno(rc);
930 break; 930 break;
931 } 931 }
932 zero_pos = next_pos; 932 zero_pos = next_pos;
933 933
934 /* 934 /*
935 * Very large extends have the potential to lock up 935 * Very large extends have the potential to lock up
936 * the CPU for extended periods of time. 936 * the CPU for extended periods of time.
937 */ 937 */
938 cond_resched(); 938 cond_resched();
939 } 939 }
940 940
941 return rc; 941 return rc;
942 } 942 }
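
ocfs2_zero_extend_range() walks the range one page at a time, clamping the final chunk to range_end and rescheduling between pages so a huge extend cannot monopolize the CPU. The chunking arithmetic is easy to verify in isolation; here is a minimal userspace sketch, assuming a hypothetical 4096-byte page and a stub zero_page() standing in for ocfs2_write_zero_page():

	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SIZE 4096ULL
	#define PAGE_MASK (~(PAGE_SIZE - 1))

	/* Stub in place of ocfs2_write_zero_page(); just reports the chunk. */
	static int zero_page(uint64_t start, uint64_t end)
	{
		printf("zeroing [%llu, %llu)\n",
		       (unsigned long long)start, (unsigned long long)end);
		return 0;
	}

	/* Walk [range_start, range_end) in page-aligned steps, as above. */
	static int zero_range(uint64_t range_start, uint64_t range_end)
	{
		uint64_t zero_pos = range_start, next_pos;
		int rc = 0;

		while (zero_pos < range_end) {
			/* Round up to the start of the next page... */
			next_pos = (zero_pos & PAGE_MASK) + PAGE_SIZE;
			/* ...but never past the end of the range. */
			if (next_pos > range_end)
				next_pos = range_end;
			rc = zero_page(zero_pos, next_pos);
			if (rc < 0)
				break;
			zero_pos = next_pos;
		}
		return rc;
	}

	int main(void)
	{
		return zero_range(1000, 9000);	/* unaligned on both ends */
	}

Running this prints [1000, 4096), [4096, 8192), [8192, 9000): only the first and last chunks are partial pages.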
943 943
944 int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, 944 int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
945 loff_t zero_to_size) 945 loff_t zero_to_size)
946 { 946 {
947 int ret = 0; 947 int ret = 0;
948 u64 zero_start, range_start = 0, range_end = 0; 948 u64 zero_start, range_start = 0, range_end = 0;
949 struct super_block *sb = inode->i_sb; 949 struct super_block *sb = inode->i_sb;
950 950
951 zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 951 zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
952 trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno, 952 trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
953 (unsigned long long)zero_start, 953 (unsigned long long)zero_start,
954 (unsigned long long)i_size_read(inode)); 954 (unsigned long long)i_size_read(inode));
955 while (zero_start < zero_to_size) { 955 while (zero_start < zero_to_size) {
956 ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start, 956 ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
957 zero_to_size, 957 zero_to_size,
958 &range_start, 958 &range_start,
959 &range_end); 959 &range_end);
960 if (ret) { 960 if (ret) {
961 mlog_errno(ret); 961 mlog_errno(ret);
962 break; 962 break;
963 } 963 }
964 if (!range_end) 964 if (!range_end)
965 break; 965 break;
966 /* Trim the ends */ 966 /* Trim the ends */
967 if (range_start < zero_start) 967 if (range_start < zero_start)
968 range_start = zero_start; 968 range_start = zero_start;
969 if (range_end > zero_to_size) 969 if (range_end > zero_to_size)
970 range_end = zero_to_size; 970 range_end = zero_to_size;
971 971
972 ret = ocfs2_zero_extend_range(inode, range_start, 972 ret = ocfs2_zero_extend_range(inode, range_start,
973 range_end); 973 range_end);
974 if (ret) { 974 if (ret) {
975 mlog_errno(ret); 975 mlog_errno(ret);
976 break; 976 break;
977 } 977 }
978 zero_start = range_end; 978 zero_start = range_end;
979 } 979 }
980 980
981 return ret; 981 return ret;
982 } 982 }
983 983
984 int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, 984 int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
985 u64 new_i_size, u64 zero_to) 985 u64 new_i_size, u64 zero_to)
986 { 986 {
987 int ret; 987 int ret;
988 u32 clusters_to_add; 988 u32 clusters_to_add;
989 struct ocfs2_inode_info *oi = OCFS2_I(inode); 989 struct ocfs2_inode_info *oi = OCFS2_I(inode);
990 990
991 /* 991 /*
992 * Only quota files call this without a bh, and they can't be 992 * Only quota files call this without a bh, and they can't be
993 * refcounted. 993 * refcounted.
994 */ 994 */
995 BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 995 BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
996 BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE)); 996 BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
997 997
998 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); 998 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
999 if (clusters_to_add < oi->ip_clusters) 999 if (clusters_to_add < oi->ip_clusters)
1000 clusters_to_add = 0; 1000 clusters_to_add = 0;
1001 else 1001 else
1002 clusters_to_add -= oi->ip_clusters; 1002 clusters_to_add -= oi->ip_clusters;
1003 1003
1004 if (clusters_to_add) { 1004 if (clusters_to_add) {
1005 ret = __ocfs2_extend_allocation(inode, oi->ip_clusters, 1005 ret = __ocfs2_extend_allocation(inode, oi->ip_clusters,
1006 clusters_to_add, 0); 1006 clusters_to_add, 0);
1007 if (ret) { 1007 if (ret) {
1008 mlog_errno(ret); 1008 mlog_errno(ret);
1009 goto out; 1009 goto out;
1010 } 1010 }
1011 } 1011 }
1012 1012
1013 /* 1013 /*
1014 * Call this even if we don't add any clusters to the tree. We 1014 * Call this even if we don't add any clusters to the tree. We
1015 * still need to zero the area between the old i_size and the 1015 * still need to zero the area between the old i_size and the
1016 * new i_size. 1016 * new i_size.
1017 */ 1017 */
1018 ret = ocfs2_zero_extend(inode, di_bh, zero_to); 1018 ret = ocfs2_zero_extend(inode, di_bh, zero_to);
1019 if (ret < 0) 1019 if (ret < 0)
1020 mlog_errno(ret); 1020 mlog_errno(ret);
1021 1021
1022 out: 1022 out:
1023 return ret; 1023 return ret;
1024 } 1024 }
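
The clusters_to_add computation above rounds the target size up to whole clusters and subtracts what the inode already holds, clamping at zero so a target smaller than the current allocation never underflows. A self-contained sketch of that arithmetic, assuming a hypothetical 32 KiB cluster size and the usual round-up definition of ocfs2_clusters_for_bytes():

	#include <stdint.h>
	#include <stdio.h>

	/* Hypothetical 32 KiB cluster, i.e. s_clustersize_bits == 15. */
	#define CLUSTERSIZE_BITS 15

	/* Round a byte count up to whole clusters. */
	static uint32_t clusters_for_bytes(uint64_t bytes)
	{
		return (uint32_t)((bytes + (1ULL << CLUSTERSIZE_BITS) - 1)
				  >> CLUSTERSIZE_BITS);
	}

	/* How many clusters must be added to grow to new_i_size bytes? */
	static uint32_t clusters_to_add(uint64_t new_i_size,
					uint32_t ip_clusters)
	{
		uint32_t needed = clusters_for_bytes(new_i_size);

		return needed < ip_clusters ? 0 : needed - ip_clusters;
	}

	int main(void)
	{
		/* 100000 bytes needs 4 clusters; 2 already held -> add 2. */
		printf("%u\n", clusters_to_add(100000, 2));
		return 0;
	}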
1025 1025
1026 static int ocfs2_extend_file(struct inode *inode, 1026 static int ocfs2_extend_file(struct inode *inode,
1027 struct buffer_head *di_bh, 1027 struct buffer_head *di_bh,
1028 u64 new_i_size) 1028 u64 new_i_size)
1029 { 1029 {
1030 int ret = 0; 1030 int ret = 0;
1031 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1031 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1032 1032
1033 BUG_ON(!di_bh); 1033 BUG_ON(!di_bh);
1034 1034
1035 /* setattr sometimes calls us like this. */ 1035 /* setattr sometimes calls us like this. */
1036 if (new_i_size == 0) 1036 if (new_i_size == 0)
1037 goto out; 1037 goto out;
1038 1038
1039 if (i_size_read(inode) == new_i_size) 1039 if (i_size_read(inode) == new_i_size)
1040 goto out; 1040 goto out;
1041 BUG_ON(new_i_size < i_size_read(inode)); 1041 BUG_ON(new_i_size < i_size_read(inode));
1042 1042
1043 /* 1043 /*
1044 * The alloc sem blocks people in read/write from reading our 1044 * The alloc sem blocks people in read/write from reading our
1045 * allocation until we're done changing it. We depend on 1045 * allocation until we're done changing it. We depend on
1046 * i_mutex to block other extend/truncate calls while we're 1046 * i_mutex to block other extend/truncate calls while we're
1047 * here. We even have to hold it for sparse files because there 1047 * here. We even have to hold it for sparse files because there
1048 * might be some tail zeroing. 1048 * might be some tail zeroing.
1049 */ 1049 */
1050 down_write(&oi->ip_alloc_sem); 1050 down_write(&oi->ip_alloc_sem);
1051 1051
1052 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1052 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1053 /* 1053 /*
1054 * We can optimize small extends by keeping the inode's 1054 * We can optimize small extends by keeping the inode's
1055 * inline data. 1055 * inline data.
1056 */ 1056 */
1057 if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) { 1057 if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
1058 up_write(&oi->ip_alloc_sem); 1058 up_write(&oi->ip_alloc_sem);
1059 goto out_update_size; 1059 goto out_update_size;
1060 } 1060 }
1061 1061
1062 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); 1062 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
1063 if (ret) { 1063 if (ret) {
1064 up_write(&oi->ip_alloc_sem); 1064 up_write(&oi->ip_alloc_sem);
1065 mlog_errno(ret); 1065 mlog_errno(ret);
1066 goto out; 1066 goto out;
1067 } 1067 }
1068 } 1068 }
1069 1069
1070 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 1070 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
1071 ret = ocfs2_zero_extend(inode, di_bh, new_i_size); 1071 ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
1072 else 1072 else
1073 ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size, 1073 ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
1074 new_i_size); 1074 new_i_size);
1075 1075
1076 up_write(&oi->ip_alloc_sem); 1076 up_write(&oi->ip_alloc_sem);
1077 1077
1078 if (ret < 0) { 1078 if (ret < 0) {
1079 mlog_errno(ret); 1079 mlog_errno(ret);
1080 goto out; 1080 goto out;
1081 } 1081 }
1082 1082
1083 out_update_size: 1083 out_update_size:
1084 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); 1084 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
1085 if (ret < 0) 1085 if (ret < 0)
1086 mlog_errno(ret); 1086 mlog_errno(ret);
1087 1087
1088 out: 1088 out:
1089 return ret; 1089 return ret;
1090 } 1090 }
1091 1091
1092 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) 1092 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1093 { 1093 {
1094 int status = 0, size_change; 1094 int status = 0, size_change;
1095 struct inode *inode = dentry->d_inode; 1095 struct inode *inode = dentry->d_inode;
1096 struct super_block *sb = inode->i_sb; 1096 struct super_block *sb = inode->i_sb;
1097 struct ocfs2_super *osb = OCFS2_SB(sb); 1097 struct ocfs2_super *osb = OCFS2_SB(sb);
1098 struct buffer_head *bh = NULL; 1098 struct buffer_head *bh = NULL;
1099 handle_t *handle = NULL; 1099 handle_t *handle = NULL;
1100 struct dquot *transfer_to[MAXQUOTAS] = { }; 1100 struct dquot *transfer_to[MAXQUOTAS] = { };
1101 int qtype; 1101 int qtype;
1102 1102
1103 trace_ocfs2_setattr(inode, dentry, 1103 trace_ocfs2_setattr(inode, dentry,
1104 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1104 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1105 dentry->d_name.len, dentry->d_name.name, 1105 dentry->d_name.len, dentry->d_name.name,
1106 attr->ia_valid, attr->ia_mode, 1106 attr->ia_valid, attr->ia_mode,
1107 attr->ia_uid, attr->ia_gid); 1107 attr->ia_uid, attr->ia_gid);
1108 1108
1109 /* ensuring we don't even attempt to truncate a symlink */ 1109 /* ensuring we don't even attempt to truncate a symlink */
1110 if (S_ISLNK(inode->i_mode)) 1110 if (S_ISLNK(inode->i_mode))
1111 attr->ia_valid &= ~ATTR_SIZE; 1111 attr->ia_valid &= ~ATTR_SIZE;
1112 1112
1113 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ 1113 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
1114 | ATTR_GID | ATTR_UID | ATTR_MODE) 1114 | ATTR_GID | ATTR_UID | ATTR_MODE)
1115 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) 1115 if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
1116 return 0; 1116 return 0;
1117 1117
1118 status = inode_change_ok(inode, attr); 1118 status = inode_change_ok(inode, attr);
1119 if (status) 1119 if (status)
1120 return status; 1120 return status;
1121 1121
1122 if (is_quota_modification(inode, attr)) 1122 if (is_quota_modification(inode, attr))
1123 dquot_initialize(inode); 1123 dquot_initialize(inode);
1124 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 1124 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
1125 if (size_change) { 1125 if (size_change) {
1126 status = ocfs2_rw_lock(inode, 1); 1126 status = ocfs2_rw_lock(inode, 1);
1127 if (status < 0) { 1127 if (status < 0) {
1128 mlog_errno(status); 1128 mlog_errno(status);
1129 goto bail; 1129 goto bail;
1130 } 1130 }
1131 } 1131 }
1132 1132
1133 status = ocfs2_inode_lock(inode, &bh, 1); 1133 status = ocfs2_inode_lock(inode, &bh, 1);
1134 if (status < 0) { 1134 if (status < 0) {
1135 if (status != -ENOENT) 1135 if (status != -ENOENT)
1136 mlog_errno(status); 1136 mlog_errno(status);
1137 goto bail_unlock_rw; 1137 goto bail_unlock_rw;
1138 } 1138 }
1139 1139
1140 if (size_change && attr->ia_size != i_size_read(inode)) { 1140 if (size_change && attr->ia_size != i_size_read(inode)) {
1141 status = inode_newsize_ok(inode, attr->ia_size); 1141 status = inode_newsize_ok(inode, attr->ia_size);
1142 if (status) 1142 if (status)
1143 goto bail_unlock; 1143 goto bail_unlock;
1144 1144
1145 inode_dio_wait(inode);
1146
1145 if (i_size_read(inode) > attr->ia_size) { 1147 if (i_size_read(inode) > attr->ia_size) {
1146 if (ocfs2_should_order_data(inode)) { 1148 if (ocfs2_should_order_data(inode)) {
1147 status = ocfs2_begin_ordered_truncate(inode, 1149 status = ocfs2_begin_ordered_truncate(inode,
1148 attr->ia_size); 1150 attr->ia_size);
1149 if (status) 1151 if (status)
1150 goto bail_unlock; 1152 goto bail_unlock;
1151 } 1153 }
1152 status = ocfs2_truncate_file(inode, bh, attr->ia_size); 1154 status = ocfs2_truncate_file(inode, bh, attr->ia_size);
1153 } else 1155 } else
1154 status = ocfs2_extend_file(inode, bh, attr->ia_size); 1156 status = ocfs2_extend_file(inode, bh, attr->ia_size);
1155 if (status < 0) { 1157 if (status < 0) {
1156 if (status != -ENOSPC) 1158 if (status != -ENOSPC)
1157 mlog_errno(status); 1159 mlog_errno(status);
1158 status = -ENOSPC; 1160 status = -ENOSPC;
1159 goto bail_unlock; 1161 goto bail_unlock;
1160 } 1162 }
1161 } 1163 }
1162 1164
1163 if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 1165 if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
1164 (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 1166 (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
1165 /* 1167 /*
1166 * Gather pointers to quota structures so that allocation / 1168 * Gather pointers to quota structures so that allocation /
1167 * freeing of quota structures happens here and not inside 1169 * freeing of quota structures happens here and not inside
1168 * dquot_transfer() where we have problems with lock ordering 1170 * dquot_transfer() where we have problems with lock ordering
1169 */ 1171 */
1170 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid 1172 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
1171 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1173 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1172 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { 1174 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1173 transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid, 1175 transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid,
1174 USRQUOTA); 1176 USRQUOTA);
1175 if (!transfer_to[USRQUOTA]) { 1177 if (!transfer_to[USRQUOTA]) {
1176 status = -ESRCH; 1178 status = -ESRCH;
1177 goto bail_unlock; 1179 goto bail_unlock;
1178 } 1180 }
1179 } 1181 }
1180 if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid 1182 if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
1181 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1183 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1182 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { 1184 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1183 transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid, 1185 transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid,
1184 GRPQUOTA); 1186 GRPQUOTA);
1185 if (!transfer_to[GRPQUOTA]) { 1187 if (!transfer_to[GRPQUOTA]) {
1186 status = -ESRCH; 1188 status = -ESRCH;
1187 goto bail_unlock; 1189 goto bail_unlock;
1188 } 1190 }
1189 } 1191 }
1190 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS + 1192 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
1191 2 * ocfs2_quota_trans_credits(sb)); 1193 2 * ocfs2_quota_trans_credits(sb));
1192 if (IS_ERR(handle)) { 1194 if (IS_ERR(handle)) {
1193 status = PTR_ERR(handle); 1195 status = PTR_ERR(handle);
1194 mlog_errno(status); 1196 mlog_errno(status);
1195 goto bail_unlock; 1197 goto bail_unlock;
1196 } 1198 }
1197 status = __dquot_transfer(inode, transfer_to); 1199 status = __dquot_transfer(inode, transfer_to);
1198 if (status < 0) 1200 if (status < 0)
1199 goto bail_commit; 1201 goto bail_commit;
1200 } else { 1202 } else {
1201 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1203 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1202 if (IS_ERR(handle)) { 1204 if (IS_ERR(handle)) {
1203 status = PTR_ERR(handle); 1205 status = PTR_ERR(handle);
1204 mlog_errno(status); 1206 mlog_errno(status);
1205 goto bail_unlock; 1207 goto bail_unlock;
1206 } 1208 }
1207 } 1209 }
1208 1210
1209 /* 1211 /*
1210 * This will intentionally not wind up calling truncate_setsize(), 1212 * This will intentionally not wind up calling truncate_setsize(),
1211 * since all the work for a size change has been done above. 1213 * since all the work for a size change has been done above.
1212 * Otherwise, we could get into problems with truncate as 1214 * Otherwise, we could get into problems with truncate as
1213 * ip_alloc_sem is used there to protect against i_size 1215 * ip_alloc_sem is used there to protect against i_size
1214 * changes. 1216 * changes.
1215 * 1217 *
1216 * XXX: this means the conditional below can probably be removed. 1218 * XXX: this means the conditional below can probably be removed.
1217 */ 1219 */
1218 if ((attr->ia_valid & ATTR_SIZE) && 1220 if ((attr->ia_valid & ATTR_SIZE) &&
1219 attr->ia_size != i_size_read(inode)) { 1221 attr->ia_size != i_size_read(inode)) {
1220 status = vmtruncate(inode, attr->ia_size); 1222 status = vmtruncate(inode, attr->ia_size);
1221 if (status) { 1223 if (status) {
1222 mlog_errno(status); 1224 mlog_errno(status);
1223 goto bail_commit; 1225 goto bail_commit;
1224 } 1226 }
1225 } 1227 }
1226 1228
1227 setattr_copy(inode, attr); 1229 setattr_copy(inode, attr);
1228 mark_inode_dirty(inode); 1230 mark_inode_dirty(inode);
1229 1231
1230 status = ocfs2_mark_inode_dirty(handle, inode, bh); 1232 status = ocfs2_mark_inode_dirty(handle, inode, bh);
1231 if (status < 0) 1233 if (status < 0)
1232 mlog_errno(status); 1234 mlog_errno(status);
1233 1235
1234 bail_commit: 1236 bail_commit:
1235 ocfs2_commit_trans(osb, handle); 1237 ocfs2_commit_trans(osb, handle);
1236 bail_unlock: 1238 bail_unlock:
1237 ocfs2_inode_unlock(inode, 1); 1239 ocfs2_inode_unlock(inode, 1);
1238 bail_unlock_rw: 1240 bail_unlock_rw:
1239 if (size_change) 1241 if (size_change)
1240 ocfs2_rw_unlock(inode, 1); 1242 ocfs2_rw_unlock(inode, 1);
1241 bail: 1243 bail:
1242 brelse(bh); 1244 brelse(bh);
1243 1245
1244 /* Release quota pointers in case we acquired them */ 1246 /* Release quota pointers in case we acquired them */
1245 for (qtype = 0; qtype < MAXQUOTAS; qtype++) 1247 for (qtype = 0; qtype < MAXQUOTAS; qtype++)
1246 dqput(transfer_to[qtype]); 1248 dqput(transfer_to[qtype]);
1247 1249
1248 if (!status && attr->ia_valid & ATTR_MODE) { 1250 if (!status && attr->ia_valid & ATTR_MODE) {
1249 status = ocfs2_acl_chmod(inode); 1251 status = ocfs2_acl_chmod(inode);
1250 if (status < 0) 1252 if (status < 0)
1251 mlog_errno(status); 1253 mlog_errno(status);
1252 } 1254 }
1253 1255
1254 return status; 1256 return status;
1255 } 1257 }
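
For a size change, ocfs2_setattr() layers its protection in a fixed order: the rw lock first, then the cluster inode lock, then inode_dio_wait() once no new direct I/O can start, and only then the truncate or extend. A toy userspace model of that ordering, with print stubs standing in for the real (failable, cluster-aware) calls:

	#include <stdio.h>

	/* Stubs modelling the locking steps in ocfs2_setattr(). */
	static void rw_lock(void)      { puts("ocfs2_rw_lock(inode, 1)"); }
	static void inode_lock(void)   { puts("ocfs2_inode_lock(inode, &bh, 1)"); }
	static void dio_wait(void)     { puts("inode_dio_wait(inode)"); }
	static void change_size(void)  { puts("truncate or extend file"); }
	static void inode_unlock(void) { puts("ocfs2_inode_unlock(inode, 1)"); }
	static void rw_unlock(void)    { puts("ocfs2_rw_unlock(inode, 1)"); }

	int main(void)
	{
		rw_lock();	/* blocks cluster-wide writers, incl. new O_DIRECT */
		inode_lock();	/* takes the cluster lock, refreshes the dinode */
		dio_wait();	/* drains direct I/O already in flight */
		change_size();	/* safe: no dio can race with the size change */
		inode_unlock();
		rw_unlock();
		return 0;
	}

Because the wait happens after both locks are held, no new direct I/O reference can appear between the drain and the size change.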
1256 1258
1257 int ocfs2_getattr(struct vfsmount *mnt, 1259 int ocfs2_getattr(struct vfsmount *mnt,
1258 struct dentry *dentry, 1260 struct dentry *dentry,
1259 struct kstat *stat) 1261 struct kstat *stat)
1260 { 1262 {
1261 struct inode *inode = dentry->d_inode; 1263 struct inode *inode = dentry->d_inode;
1262 struct super_block *sb = dentry->d_inode->i_sb; 1264 struct super_block *sb = dentry->d_inode->i_sb;
1263 struct ocfs2_super *osb = sb->s_fs_info; 1265 struct ocfs2_super *osb = sb->s_fs_info;
1264 int err; 1266 int err;
1265 1267
1266 err = ocfs2_inode_revalidate(dentry); 1268 err = ocfs2_inode_revalidate(dentry);
1267 if (err) { 1269 if (err) {
1268 if (err != -ENOENT) 1270 if (err != -ENOENT)
1269 mlog_errno(err); 1271 mlog_errno(err);
1270 goto bail; 1272 goto bail;
1271 } 1273 }
1272 1274
1273 generic_fillattr(inode, stat); 1275 generic_fillattr(inode, stat);
1274 1276
1275 /* We set the blksize from the cluster size for performance */ 1277 /* We set the blksize from the cluster size for performance */
1276 stat->blksize = osb->s_clustersize; 1278 stat->blksize = osb->s_clustersize;
1277 1279
1278 bail: 1280 bail:
1279 return err; 1281 return err;
1280 } 1282 }
1281 1283
1282 int ocfs2_permission(struct inode *inode, int mask) 1284 int ocfs2_permission(struct inode *inode, int mask)
1283 { 1285 {
1284 int ret; 1286 int ret;
1285 1287
1286 if (mask & MAY_NOT_BLOCK) 1288 if (mask & MAY_NOT_BLOCK)
1287 return -ECHILD; 1289 return -ECHILD;
1288 1290
1289 ret = ocfs2_inode_lock(inode, NULL, 0); 1291 ret = ocfs2_inode_lock(inode, NULL, 0);
1290 if (ret) { 1292 if (ret) {
1291 if (ret != -ENOENT) 1293 if (ret != -ENOENT)
1292 mlog_errno(ret); 1294 mlog_errno(ret);
1293 goto out; 1295 goto out;
1294 } 1296 }
1295 1297
1296 ret = generic_permission(inode, mask); 1298 ret = generic_permission(inode, mask);
1297 1299
1298 ocfs2_inode_unlock(inode, 0); 1300 ocfs2_inode_unlock(inode, 0);
1299 out: 1301 out:
1300 return ret; 1302 return ret;
1301 } 1303 }
1302 1304
1303 static int __ocfs2_write_remove_suid(struct inode *inode, 1305 static int __ocfs2_write_remove_suid(struct inode *inode,
1304 struct buffer_head *bh) 1306 struct buffer_head *bh)
1305 { 1307 {
1306 int ret; 1308 int ret;
1307 handle_t *handle; 1309 handle_t *handle;
1308 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1310 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1309 struct ocfs2_dinode *di; 1311 struct ocfs2_dinode *di;
1310 1312
1311 trace_ocfs2_write_remove_suid( 1313 trace_ocfs2_write_remove_suid(
1312 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1314 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1313 inode->i_mode); 1315 inode->i_mode);
1314 1316
1315 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1317 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1316 if (IS_ERR(handle)) { 1318 if (IS_ERR(handle)) {
1317 ret = PTR_ERR(handle); 1319 ret = PTR_ERR(handle);
1318 mlog_errno(ret); 1320 mlog_errno(ret);
1319 goto out; 1321 goto out;
1320 } 1322 }
1321 1323
1322 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, 1324 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
1323 OCFS2_JOURNAL_ACCESS_WRITE); 1325 OCFS2_JOURNAL_ACCESS_WRITE);
1324 if (ret < 0) { 1326 if (ret < 0) {
1325 mlog_errno(ret); 1327 mlog_errno(ret);
1326 goto out_trans; 1328 goto out_trans;
1327 } 1329 }
1328 1330
1329 inode->i_mode &= ~S_ISUID; 1331 inode->i_mode &= ~S_ISUID;
1330 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) 1332 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
1331 inode->i_mode &= ~S_ISGID; 1333 inode->i_mode &= ~S_ISGID;
1332 1334
1333 di = (struct ocfs2_dinode *) bh->b_data; 1335 di = (struct ocfs2_dinode *) bh->b_data;
1334 di->i_mode = cpu_to_le16(inode->i_mode); 1336 di->i_mode = cpu_to_le16(inode->i_mode);
1335 1337
1336 ocfs2_journal_dirty(handle, bh); 1338 ocfs2_journal_dirty(handle, bh);
1337 1339
1338 out_trans: 1340 out_trans:
1339 ocfs2_commit_trans(osb, handle); 1341 ocfs2_commit_trans(osb, handle);
1340 out: 1342 out:
1341 return ret; 1343 return ret;
1342 } 1344 }
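
__ocfs2_write_remove_suid() always drops the setuid bit but drops setgid only when group-execute is also set; setgid without group-exec conventionally marks mandatory locking rather than a privilege, so it is left alone. A standalone sketch of just the mode-bit rule:

	#include <assert.h>
	#include <sys/stat.h>

	/* Mirror the mode-bit logic above: always clear setuid; clear
	 * setgid only when it actually grants privileges. */
	static mode_t remove_suid(mode_t mode)
	{
		mode &= ~S_ISUID;
		if ((mode & S_ISGID) && (mode & S_IXGRP))
			mode &= ~S_ISGID;
		return mode;
	}

	int main(void)
	{
		assert(remove_suid(S_ISUID | S_ISGID | 0755) == 0755);
		/* setgid without group-exec is left alone */
		assert(remove_suid(S_ISGID | 0644) == (S_ISGID | 0644));
		return 0;
	}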
1343 1345
1344 /* 1346 /*
1345 * Will look for holes and unwritten extents in the range starting at 1347 * Will look for holes and unwritten extents in the range starting at
1346 * pos for count bytes (inclusive). 1348 * pos for count bytes (inclusive).
1347 */ 1349 */
1348 static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, 1350 static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
1349 size_t count) 1351 size_t count)
1350 { 1352 {
1351 int ret = 0; 1353 int ret = 0;
1352 unsigned int extent_flags; 1354 unsigned int extent_flags;
1353 u32 cpos, clusters, extent_len, phys_cpos; 1355 u32 cpos, clusters, extent_len, phys_cpos;
1354 struct super_block *sb = inode->i_sb; 1356 struct super_block *sb = inode->i_sb;
1355 1357
1356 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; 1358 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1357 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; 1359 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1358 1360
1359 while (clusters) { 1361 while (clusters) {
1360 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, 1362 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1361 &extent_flags); 1363 &extent_flags);
1362 if (ret < 0) { 1364 if (ret < 0) {
1363 mlog_errno(ret); 1365 mlog_errno(ret);
1364 goto out; 1366 goto out;
1365 } 1367 }
1366 1368
1367 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { 1369 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
1368 ret = 1; 1370 ret = 1;
1369 break; 1371 break;
1370 } 1372 }
1371 1373
1372 if (extent_len > clusters) 1374 if (extent_len > clusters)
1373 extent_len = clusters; 1375 extent_len = clusters;
1374 1376
1375 clusters -= extent_len; 1377 clusters -= extent_len;
1376 cpos += extent_len; 1378 cpos += extent_len;
1377 } 1379 }
1378 out: 1380 out:
1379 return ret; 1381 return ret;
1380 } 1382 }
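
ocfs2_check_range_for_holes() first converts the byte range into a cluster range: round pos down to find the first cluster touched, and round pos + count up to get one past the last. A minimal sketch of that conversion, again assuming hypothetical 32 KiB clusters:

	#include <stdint.h>
	#include <stdio.h>

	#define CBITS 15		/* hypothetical 32 KiB clusters */
	#define CSIZE (1ULL << CBITS)

	int main(void)
	{
		uint64_t pos = 40000, count = 100000;

		/* First cluster touched: round pos down... */
		uint32_t cpos = (uint32_t)(pos >> CBITS);
		/* ...then count clusters to the end, rounding up. */
		uint32_t clusters =
			(uint32_t)((pos + count + CSIZE - 1) >> CBITS) - cpos;

		printf("scan clusters [%u, %u)\n", cpos, cpos + clusters);
		return 0;
	}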
1381 1383
1382 static int ocfs2_write_remove_suid(struct inode *inode) 1384 static int ocfs2_write_remove_suid(struct inode *inode)
1383 { 1385 {
1384 int ret; 1386 int ret;
1385 struct buffer_head *bh = NULL; 1387 struct buffer_head *bh = NULL;
1386 1388
1387 ret = ocfs2_read_inode_block(inode, &bh); 1389 ret = ocfs2_read_inode_block(inode, &bh);
1388 if (ret < 0) { 1390 if (ret < 0) {
1389 mlog_errno(ret); 1391 mlog_errno(ret);
1390 goto out; 1392 goto out;
1391 } 1393 }
1392 1394
1393 ret = __ocfs2_write_remove_suid(inode, bh); 1395 ret = __ocfs2_write_remove_suid(inode, bh);
1394 out: 1396 out:
1395 brelse(bh); 1397 brelse(bh);
1396 return ret; 1398 return ret;
1397 } 1399 }
1398 1400
1399 /* 1401 /*
1400 * Allocate enough extents to cover the region starting at byte offset 1402 * Allocate enough extents to cover the region starting at byte offset
1401 * start for len bytes. Existing extents are skipped, any extents 1403 * start for len bytes. Existing extents are skipped, any extents
1402 * added are marked as "unwritten". 1404 * added are marked as "unwritten".
1403 */ 1405 */
1404 static int ocfs2_allocate_unwritten_extents(struct inode *inode, 1406 static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1405 u64 start, u64 len) 1407 u64 start, u64 len)
1406 { 1408 {
1407 int ret; 1409 int ret;
1408 u32 cpos, phys_cpos, clusters, alloc_size; 1410 u32 cpos, phys_cpos, clusters, alloc_size;
1409 u64 end = start + len; 1411 u64 end = start + len;
1410 struct buffer_head *di_bh = NULL; 1412 struct buffer_head *di_bh = NULL;
1411 1413
1412 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1414 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1413 ret = ocfs2_read_inode_block(inode, &di_bh); 1415 ret = ocfs2_read_inode_block(inode, &di_bh);
1414 if (ret) { 1416 if (ret) {
1415 mlog_errno(ret); 1417 mlog_errno(ret);
1416 goto out; 1418 goto out;
1417 } 1419 }
1418 1420
1419 /* 1421 /*
1420 * Nothing to do if the requested reservation range 1422 * Nothing to do if the requested reservation range
1421 * fits within the inode. 1423 * fits within the inode.
1422 */ 1424 */
1423 if (ocfs2_size_fits_inline_data(di_bh, end)) 1425 if (ocfs2_size_fits_inline_data(di_bh, end))
1424 goto out; 1426 goto out;
1425 1427
1426 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); 1428 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
1427 if (ret) { 1429 if (ret) {
1428 mlog_errno(ret); 1430 mlog_errno(ret);
1429 goto out; 1431 goto out;
1430 } 1432 }
1431 } 1433 }
1432 1434
1433 /* 1435 /*
1434 * We consider both start and len to be inclusive. 1436 * We consider both start and len to be inclusive.
1435 */ 1437 */
1436 cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 1438 cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
1437 clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len); 1439 clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
1438 clusters -= cpos; 1440 clusters -= cpos;
1439 1441
1440 while (clusters) { 1442 while (clusters) {
1441 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, 1443 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
1442 &alloc_size, NULL); 1444 &alloc_size, NULL);
1443 if (ret) { 1445 if (ret) {
1444 mlog_errno(ret); 1446 mlog_errno(ret);
1445 goto out; 1447 goto out;
1446 } 1448 }
1447 1449
1448 /* 1450 /*
1449 * Hole or existing extent len can be arbitrary, so 1451 * Hole or existing extent len can be arbitrary, so
1450 * cap it to our own allocation request. 1452 * cap it to our own allocation request.
1451 */ 1453 */
1452 if (alloc_size > clusters) 1454 if (alloc_size > clusters)
1453 alloc_size = clusters; 1455 alloc_size = clusters;
1454 1456
1455 if (phys_cpos) { 1457 if (phys_cpos) {
1456 /* 1458 /*
1457 * We already have an allocation at this 1459 * We already have an allocation at this
1458 * region so we can safely skip it. 1460 * region so we can safely skip it.
1459 */ 1461 */
1460 goto next; 1462 goto next;
1461 } 1463 }
1462 1464
1463 ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1); 1465 ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
1464 if (ret) { 1466 if (ret) {
1465 if (ret != -ENOSPC) 1467 if (ret != -ENOSPC)
1466 mlog_errno(ret); 1468 mlog_errno(ret);
1467 goto out; 1469 goto out;
1468 } 1470 }
1469 1471
1470 next: 1472 next:
1471 cpos += alloc_size; 1473 cpos += alloc_size;
1472 clusters -= alloc_size; 1474 clusters -= alloc_size;
1473 } 1475 }
1474 1476
1475 ret = 0; 1477 ret = 0;
1476 out: 1478 out:
1477 1479
1478 brelse(di_bh); 1480 brelse(di_bh);
1479 return ret; 1481 return ret;
1480 } 1482 }
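
The allocation loop above walks the cluster range run by run, skipping runs that already have a physical mapping and extending allocation (as unwritten) only over holes. A toy userspace model with a fake extent map in place of ocfs2_get_clusters():

	#include <stdint.h>
	#include <stdio.h>

	/* Toy extent map: phys[i] != 0 means cluster i is allocated. */
	static uint32_t phys[16] = { 1, 1, 0, 0, 0, 1, 0, 0,
				     1, 1, 1, 0, 0, 0, 0, 0 };

	/* Length of the contiguous run at cpos and whether it is a hole
	 * (stands in for ocfs2_get_clusters()). */
	static uint32_t run_len(uint32_t cpos, uint32_t max, int *is_hole)
	{
		uint32_t n = 1;

		*is_hole = (phys[cpos] == 0);
		while (n < max && (phys[cpos + n] == 0) == *is_hole)
			n++;
		return n;
	}

	int main(void)
	{
		uint32_t cpos = 0, clusters = 16, alloc_size;
		int is_hole;

		while (clusters) {
			alloc_size = run_len(cpos, clusters, &is_hole);
			if (is_hole)	/* only holes get unwritten extents */
				printf("allocate unwritten [%u, %u)\n",
				       cpos, cpos + alloc_size);
			cpos += alloc_size;
			clusters -= alloc_size;
		}
		return 0;
	}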
1481 1483
1482 /* 1484 /*
1483 * Truncate a byte range, avoiding pages within partial clusters. This 1485 * Truncate a byte range, avoiding pages within partial clusters. This
1484 * preserves those pages for the zeroing code to write to. 1486 * preserves those pages for the zeroing code to write to.
1485 */ 1487 */
1486 static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start, 1488 static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
1487 u64 byte_len) 1489 u64 byte_len)
1488 { 1490 {
1489 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1491 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1490 loff_t start, end; 1492 loff_t start, end;
1491 struct address_space *mapping = inode->i_mapping; 1493 struct address_space *mapping = inode->i_mapping;
1492 1494
1493 start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start); 1495 start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
1494 end = byte_start + byte_len; 1496 end = byte_start + byte_len;
1495 end = end & ~(osb->s_clustersize - 1); 1497 end = end & ~(osb->s_clustersize - 1);
1496 1498
1497 if (start < end) { 1499 if (start < end) {
1498 unmap_mapping_range(mapping, start, end - start, 0); 1500 unmap_mapping_range(mapping, start, end - start, 0);
1499 truncate_inode_pages_range(mapping, start, end - 1); 1501 truncate_inode_pages_range(mapping, start, end - 1);
1500 } 1502 }
1501 } 1503 }
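
Note the asymmetric rounding: the start of the range is rounded up to a cluster boundary and the end rounded down, so pages in the partial edge clusters survive for the zeroing code; if the whole range lies within partial clusters, nothing is dropped at all. A sketch of that alignment arithmetic, assuming a hypothetical 32 KiB cluster:

	#include <stdint.h>
	#include <stdio.h>

	#define CSIZE 32768ULL	/* hypothetical 32 KiB cluster */

	int main(void)
	{
		uint64_t byte_start = 40000, byte_len = 100000;

		/* Round the start UP and the end DOWN to cluster
		 * boundaries, preserving pages in partial edge clusters. */
		uint64_t start = (byte_start + CSIZE - 1) & ~(CSIZE - 1);
		uint64_t end = (byte_start + byte_len) & ~(CSIZE - 1);

		if (start < end)
			printf("drop pages in [%llu, %llu)\n",
			       (unsigned long long)start,
			       (unsigned long long)end);
		else
			puts("range within partial clusters; nothing to drop");
		return 0;
	}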
1502 1504
1503 static int ocfs2_zero_partial_clusters(struct inode *inode, 1505 static int ocfs2_zero_partial_clusters(struct inode *inode,
1504 u64 start, u64 len) 1506 u64 start, u64 len)
1505 { 1507 {
1506 int ret = 0; 1508 int ret = 0;
1507 u64 tmpend, end = start + len; 1509 u64 tmpend, end = start + len;
1508 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1510 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1509 unsigned int csize = osb->s_clustersize; 1511 unsigned int csize = osb->s_clustersize;
1510 handle_t *handle; 1512 handle_t *handle;
1511 1513
1512 /* 1514 /*
1513 * The "start" and "end" values are NOT necessarily part of 1515 * The "start" and "end" values are NOT necessarily part of
1514 * the range whose allocation is being deleted. Rather, this 1516 * the range whose allocation is being deleted. Rather, this
1515 * is what the user passed in with the request. We must zero 1517 * is what the user passed in with the request. We must zero
1516 * partial clusters here. There's no need to worry about 1518 * partial clusters here. There's no need to worry about
1517 * physical allocation - the zeroing code knows to skip holes. 1519 * physical allocation - the zeroing code knows to skip holes.
1518 */ 1520 */
1519 trace_ocfs2_zero_partial_clusters( 1521 trace_ocfs2_zero_partial_clusters(
1520 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1522 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1521 (unsigned long long)start, (unsigned long long)end); 1523 (unsigned long long)start, (unsigned long long)end);
1522 1524
1523 /* 1525 /*
1524 * If both edges are on a cluster boundary then there's no 1526 * If both edges are on a cluster boundary then there's no
1525 * zeroing required as the region is part of the allocation to 1527 * zeroing required as the region is part of the allocation to
1526 * be truncated. 1528 * be truncated.
1527 */ 1529 */
1528 if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0) 1530 if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
1529 goto out; 1531 goto out;
1530 1532
1531 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1533 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1532 if (IS_ERR(handle)) { 1534 if (IS_ERR(handle)) {
1533 ret = PTR_ERR(handle); 1535 ret = PTR_ERR(handle);
1534 mlog_errno(ret); 1536 mlog_errno(ret);
1535 goto out; 1537 goto out;
1536 } 1538 }
1537 1539
1538 /* 1540 /*
1539 * We want to get the byte offset of the end of the 1st cluster. 1541 * We want to get the byte offset of the end of the 1st cluster.
1540 */ 1542 */
1541 tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); 1543 tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
1542 if (tmpend > end) 1544 if (tmpend > end)
1543 tmpend = end; 1545 tmpend = end;
1544 1546
1545 trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start, 1547 trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start,
1546 (unsigned long long)tmpend); 1548 (unsigned long long)tmpend);
1547 1549
1548 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); 1550 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
1549 if (ret) 1551 if (ret)
1550 mlog_errno(ret); 1552 mlog_errno(ret);
1551 1553
1552 if (tmpend < end) { 1554 if (tmpend < end) {
1553 /* 1555 /*
1554 * This may make start and end equal, but the zeroing 1556 * This may make start and end equal, but the zeroing
1555 * code will skip any work in that case so there's no 1557 * code will skip any work in that case so there's no
1556 * need to catch it up here. 1558 * need to catch it up here.
1557 */ 1559 */
1558 start = end & ~(osb->s_clustersize - 1); 1560 start = end & ~(osb->s_clustersize - 1);
1559 1561
1560 trace_ocfs2_zero_partial_clusters_range2( 1562 trace_ocfs2_zero_partial_clusters_range2(
1561 (unsigned long long)start, (unsigned long long)end); 1563 (unsigned long long)start, (unsigned long long)end);
1562 1564
1563 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end); 1565 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
1564 if (ret) 1566 if (ret)
1565 mlog_errno(ret); 1567 mlog_errno(ret);
1566 } 1568 }
1567 1569
1568 ocfs2_commit_trans(osb, handle); 1570 ocfs2_commit_trans(osb, handle);
1569 out: 1571 out:
1570 return ret; 1572 return ret;
1571 } 1573 }
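
So at most two subranges get zeroed: from start to the end of its cluster (capped at end), and, if anything remains, from the last cluster boundary up to end. A standalone rerun of that edge computation under the same hypothetical 32 KiB cluster size:

	#include <stdint.h>
	#include <stdio.h>

	#define CSIZE 32768ULL	/* hypothetical 32 KiB cluster */

	int main(void)
	{
		uint64_t start = 40000, len = 100000, end = start + len;
		uint64_t tmpend;

		/* Fully aligned ranges need no edge zeroing at all. */
		if ((start & (CSIZE - 1)) == 0 && (end & (CSIZE - 1)) == 0) {
			puts("aligned: nothing to zero");
			return 0;
		}

		/* First pass: start to the end of its cluster (or end). */
		tmpend = (start & ~(CSIZE - 1)) + CSIZE;
		if (tmpend > end)
			tmpend = end;
		printf("zero head [%llu, %llu)\n",
		       (unsigned long long)start, (unsigned long long)tmpend);

		/* Second pass: last cluster boundary up to end. */
		if (tmpend < end)
			printf("zero tail [%llu, %llu)\n",
			       (unsigned long long)(end & ~(CSIZE - 1)),
			       (unsigned long long)end);
		return 0;
	}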
1572 1574
1573 static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos) 1575 static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
1574 { 1576 {
1575 int i; 1577 int i;
1576 struct ocfs2_extent_rec *rec = NULL; 1578 struct ocfs2_extent_rec *rec = NULL;
1577 1579
1578 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { 1580 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1579 1581
1580 rec = &el->l_recs[i]; 1582 rec = &el->l_recs[i];
1581 1583
1582 if (le32_to_cpu(rec->e_cpos) < pos) 1584 if (le32_to_cpu(rec->e_cpos) < pos)
1583 break; 1585 break;
1584 } 1586 }
1585 1587
1586 return i; 1588 return i;
1587 } 1589 }
1588 1590
1589 /* 1591 /*
1590 * Helper to calculate the punching pos and length in one run. We handle 1592 * Helper to calculate the punching pos and length in one run. We handle
1591 * the following three cases in order: 1593 * the following three cases in order:
1592 * 1594 *
1593 * - remove the entire record 1595 * - remove the entire record
1594 * - remove a partial record 1596 * - remove a partial record
1595 * - no record needs to be removed (hole-punching completed) 1597 * - no record needs to be removed (hole-punching completed)
1596 */ 1598 */
1597 static void ocfs2_calc_trunc_pos(struct inode *inode, 1599 static void ocfs2_calc_trunc_pos(struct inode *inode,
1598 struct ocfs2_extent_list *el, 1600 struct ocfs2_extent_list *el,
1599 struct ocfs2_extent_rec *rec, 1601 struct ocfs2_extent_rec *rec,
1600 u32 trunc_start, u32 *trunc_cpos, 1602 u32 trunc_start, u32 *trunc_cpos,
1601 u32 *trunc_len, u32 *trunc_end, 1603 u32 *trunc_len, u32 *trunc_end,
1602 u64 *blkno, int *done) 1604 u64 *blkno, int *done)
1603 { 1605 {
1604 int ret = 0; 1606 int ret = 0;
1605 u32 coff, range; 1607 u32 coff, range;
1606 1608
1607 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); 1609 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1608 1610
1609 if (le32_to_cpu(rec->e_cpos) >= trunc_start) { 1611 if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
1610 /* 1612 /*
1611 * remove an entire extent record. 1613 * remove an entire extent record.
1612 */ 1614 */
1613 *trunc_cpos = le32_to_cpu(rec->e_cpos); 1615 *trunc_cpos = le32_to_cpu(rec->e_cpos);
1614 /* 1616 /*
1615 * Skip holes if any. 1617 * Skip holes if any.
1616 */ 1618 */
1617 if (range < *trunc_end) 1619 if (range < *trunc_end)
1618 *trunc_end = range; 1620 *trunc_end = range;
1619 *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos); 1621 *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
1620 *blkno = le64_to_cpu(rec->e_blkno); 1622 *blkno = le64_to_cpu(rec->e_blkno);
1621 *trunc_end = le32_to_cpu(rec->e_cpos); 1623 *trunc_end = le32_to_cpu(rec->e_cpos);
1622 } else if (range > trunc_start) { 1624 } else if (range > trunc_start) {
1623 /* 1625 /*
1624 * remove a partial extent record, which means we're 1626 * remove a partial extent record, which means we're
1625 * removing the last extent record. 1627 * removing the last extent record.
1626 */ 1628 */
1627 *trunc_cpos = trunc_start; 1629 *trunc_cpos = trunc_start;
1628 /* 1630 /*
1629 * skip hole if any. 1631 * skip hole if any.
1630 */ 1632 */
1631 if (range < *trunc_end) 1633 if (range < *trunc_end)
1632 *trunc_end = range; 1634 *trunc_end = range;
1633 *trunc_len = *trunc_end - trunc_start; 1635 *trunc_len = *trunc_end - trunc_start;
1634 coff = trunc_start - le32_to_cpu(rec->e_cpos); 1636 coff = trunc_start - le32_to_cpu(rec->e_cpos);
1635 *blkno = le64_to_cpu(rec->e_blkno) + 1637 *blkno = le64_to_cpu(rec->e_blkno) +
1636 ocfs2_clusters_to_blocks(inode->i_sb, coff); 1638 ocfs2_clusters_to_blocks(inode->i_sb, coff);
1637 *trunc_end = trunc_start; 1639 *trunc_end = trunc_start;
1638 } else { 1640 } else {
1639 /* 1641 /*
1640 * There are two possibilities here: 1642 * There are two possibilities here:
1641 * 1643 *
1642 * - last record has been removed 1644 * - last record has been removed
1643 * - trunc_start was within a hole 1645 * - trunc_start was within a hole
1644 * 1646 *
1645 * Either case means hole punching is complete. 1647 * Either case means hole punching is complete.
1646 */ 1648 */
1647 ret = 1; 1649 ret = 1;
1648 } 1650 }
1649 1651
1650 *done = ret; 1652 *done = ret;
1651 } 1653 }
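
The same three-way case split can be exercised on plain integers. Below is a userspace sketch of ocfs2_calc_trunc_pos()'s logic for one record [e_cpos, e_cpos + e_len), with the extent-list and block-number plumbing stripped away:

	#include <stdint.h>
	#include <stdio.h>

	/* Pure-integer rerun of the three cases above for a record
	 * [e_cpos, e_cpos + e_len) against a punch ending at *trunc_end. */
	static void calc_trunc_pos(uint32_t e_cpos, uint32_t e_len,
				   uint32_t trunc_start, uint32_t *trunc_end,
				   uint32_t *cpos, uint32_t *len, int *done)
	{
		uint32_t range = e_cpos + e_len;

		*done = 0;
		if (e_cpos >= trunc_start) {		/* whole record goes */
			*cpos = e_cpos;
			if (range < *trunc_end)		/* skip trailing hole */
				*trunc_end = range;
			*len = *trunc_end - e_cpos;
			*trunc_end = e_cpos;
		} else if (range > trunc_start) {	/* record tail goes */
			*cpos = trunc_start;
			if (range < *trunc_end)
				*trunc_end = range;
			*len = *trunc_end - trunc_start;
			*trunc_end = trunc_start;
		} else {				/* punching complete */
			*done = 1;
		}
	}

	int main(void)
	{
		uint32_t trunc_end = 10, cpos, len;
		int done;

		calc_trunc_pos(4, 4, 2, &trunc_end, &cpos, &len, &done);
		/* prints cpos=4 len=4 trunc_end=4 done=0 */
		printf("cpos=%u len=%u trunc_end=%u done=%d\n",
		       cpos, len, trunc_end, done);
		return 0;
	}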
1652 1654
1653 static int ocfs2_remove_inode_range(struct inode *inode, 1655 static int ocfs2_remove_inode_range(struct inode *inode,
1654 struct buffer_head *di_bh, u64 byte_start, 1656 struct buffer_head *di_bh, u64 byte_start,
1655 u64 byte_len) 1657 u64 byte_len)
1656 { 1658 {
1657 int ret = 0, flags = 0, done = 0, i; 1659 int ret = 0, flags = 0, done = 0, i;
1658 u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos; 1660 u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
1659 u32 cluster_in_el; 1661 u32 cluster_in_el;
1660 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1662 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1661 struct ocfs2_cached_dealloc_ctxt dealloc; 1663 struct ocfs2_cached_dealloc_ctxt dealloc;
1662 struct address_space *mapping = inode->i_mapping; 1664 struct address_space *mapping = inode->i_mapping;
1663 struct ocfs2_extent_tree et; 1665 struct ocfs2_extent_tree et;
1664 struct ocfs2_path *path = NULL; 1666 struct ocfs2_path *path = NULL;
1665 struct ocfs2_extent_list *el = NULL; 1667 struct ocfs2_extent_list *el = NULL;
1666 struct ocfs2_extent_rec *rec = NULL; 1668 struct ocfs2_extent_rec *rec = NULL;
1667 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1669 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1668 u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc); 1670 u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
1669 1671
1670 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); 1672 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
1671 ocfs2_init_dealloc_ctxt(&dealloc); 1673 ocfs2_init_dealloc_ctxt(&dealloc);
1672 1674
1673 trace_ocfs2_remove_inode_range( 1675 trace_ocfs2_remove_inode_range(
1674 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1676 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1675 (unsigned long long)byte_start, 1677 (unsigned long long)byte_start,
1676 (unsigned long long)byte_len); 1678 (unsigned long long)byte_len);
1677 1679
1678 if (byte_len == 0) 1680 if (byte_len == 0)
1679 return 0; 1681 return 0;
1680 1682
1681 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1683 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1682 ret = ocfs2_truncate_inline(inode, di_bh, byte_start, 1684 ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
1683 byte_start + byte_len, 0); 1685 byte_start + byte_len, 0);
1684 if (ret) { 1686 if (ret) {
1685 mlog_errno(ret); 1687 mlog_errno(ret);
1686 goto out; 1688 goto out;
1687 } 1689 }
1688 /* 1690 /*
1689 * There's no need to get fancy with the page cache 1691 * There's no need to get fancy with the page cache
1690 * truncate of an inline-data inode. We're talking 1692 * truncate of an inline-data inode. We're talking
1691 * about less than a page here, which will be cached 1693 * about less than a page here, which will be cached
1692 * in the dinode buffer anyway. 1694 * in the dinode buffer anyway.
1693 */ 1695 */
1694 unmap_mapping_range(mapping, 0, 0, 0); 1696 unmap_mapping_range(mapping, 0, 0, 0);
1695 truncate_inode_pages(mapping, 0); 1697 truncate_inode_pages(mapping, 0);
1696 goto out; 1698 goto out;
1697 } 1699 }
1698 1700
1699 /* 1701 /*
1700 * For reflinks, we may need to CoW two clusters which might be 1702 * For reflinks, we may need to CoW two clusters which might be
1701 * partially zeroed later, if the hole's start and end offsets fall 1703 * partially zeroed later, if the hole's start and end offsets fall
1702 * within one cluster (i.e. are not aligned to the cluster size). 1704 * within one cluster (i.e. are not aligned to the cluster size).
1703 */ 1705 */
1704 1706
1705 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { 1707 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
1706 1708
1707 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start); 1709 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
1708 if (ret) { 1710 if (ret) {
1709 mlog_errno(ret); 1711 mlog_errno(ret);
1710 goto out; 1712 goto out;
1711 } 1713 }
1712 1714
1713 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len); 1715 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
1714 if (ret) { 1716 if (ret) {
1715 mlog_errno(ret); 1717 mlog_errno(ret);
1716 goto out; 1718 goto out;
1717 } 1719 }
1718 } 1720 }
1719 1721
1720 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); 1722 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1721 trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits; 1723 trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
1722 cluster_in_el = trunc_end; 1724 cluster_in_el = trunc_end;
1723 1725
1724 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); 1726 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1725 if (ret) { 1727 if (ret) {
1726 mlog_errno(ret); 1728 mlog_errno(ret);
1727 goto out; 1729 goto out;
1728 } 1730 }
1729 1731
1730 path = ocfs2_new_path_from_et(&et); 1732 path = ocfs2_new_path_from_et(&et);
1731 if (!path) { 1733 if (!path) {
1732 ret = -ENOMEM; 1734 ret = -ENOMEM;
1733 mlog_errno(ret); 1735 mlog_errno(ret);
1734 goto out; 1736 goto out;
1735 } 1737 }
1736 1738
1737 while (trunc_end > trunc_start) { 1739 while (trunc_end > trunc_start) {
1738 1740
1739 ret = ocfs2_find_path(INODE_CACHE(inode), path, 1741 ret = ocfs2_find_path(INODE_CACHE(inode), path,
1740 cluster_in_el); 1742 cluster_in_el);
1741 if (ret) { 1743 if (ret) {
1742 mlog_errno(ret); 1744 mlog_errno(ret);
1743 goto out; 1745 goto out;
1744 } 1746 }
1745 1747
1746 el = path_leaf_el(path); 1748 el = path_leaf_el(path);
1747 1749
1748 i = ocfs2_find_rec(el, trunc_end); 1750 i = ocfs2_find_rec(el, trunc_end);
1749 /* 1751 /*
1750 * Need to go to previous extent block. 1752 * Need to go to previous extent block.
1751 */ 1753 */
1752 if (i < 0) { 1754 if (i < 0) {
1753 if (path->p_tree_depth == 0) 1755 if (path->p_tree_depth == 0)
1754 break; 1756 break;
1755 1757
1756 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, 1758 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
1757 path, 1759 path,
1758 &cluster_in_el); 1760 &cluster_in_el);
1759 if (ret) { 1761 if (ret) {
1760 mlog_errno(ret); 1762 mlog_errno(ret);
1761 goto out; 1763 goto out;
1762 } 1764 }
1763 1765
1764 /* 1766 /*
1765 * We've reached the leftmost extent block; 1767 * We've reached the leftmost extent block;
1766 * it's safe to leave. 1768 * it's safe to leave.
1767 */ 1769 */
1768 if (cluster_in_el == 0) 1770 if (cluster_in_el == 0)
1769 break; 1771 break;
1770 1772
1771 /* 1773 /*
1772 * The 'pos' used to search for the previous extent block is 1774 * The 'pos' used to search for the previous extent block is
1773 * always one cluster less than the actual trunc_end. 1775 * always one cluster less than the actual trunc_end.
1774 */ 1776 */
1775 trunc_end = cluster_in_el + 1; 1777 trunc_end = cluster_in_el + 1;
1776 1778
1777 ocfs2_reinit_path(path, 1); 1779 ocfs2_reinit_path(path, 1);
1778 1780
1779 continue; 1781 continue;
1780 1782
1781 } else 1783 } else
1782 rec = &el->l_recs[i]; 1784 rec = &el->l_recs[i];
1783 1785
1784 ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos, 1786 ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
1785 &trunc_len, &trunc_end, &blkno, &done); 1787 &trunc_len, &trunc_end, &blkno, &done);
1786 if (done) 1788 if (done)
1787 break; 1789 break;
1788 1790
1789 flags = rec->e_flags; 1791 flags = rec->e_flags;
1790 phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno); 1792 phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
1791 1793
1792 ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos, 1794 ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
1793 phys_cpos, trunc_len, flags, 1795 phys_cpos, trunc_len, flags,
1794 &dealloc, refcount_loc); 1796 &dealloc, refcount_loc);
1795 if (ret < 0) { 1797 if (ret < 0) {
1796 mlog_errno(ret); 1798 mlog_errno(ret);
1797 goto out; 1799 goto out;
1798 } 1800 }
1799 1801
1800 cluster_in_el = trunc_end; 1802 cluster_in_el = trunc_end;
1801 1803
1802 ocfs2_reinit_path(path, 1); 1804 ocfs2_reinit_path(path, 1);
1803 } 1805 }
1804 1806
1805 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); 1807 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
1806 1808
1807 out: 1809 out:
1808 ocfs2_schedule_truncate_log_flush(osb, 1); 1810 ocfs2_schedule_truncate_log_flush(osb, 1);
1809 ocfs2_run_deallocs(osb, &dealloc); 1811 ocfs2_run_deallocs(osb, &dealloc);
1810 1812
1811 return ret; 1813 return ret;
1812 } 1814 }
1813 1815
1814 /* 1816 /*
1815 * Parts of this function taken from xfs_change_file_space() 1817 * Parts of this function taken from xfs_change_file_space()
1816 */ 1818 */
1817 static int __ocfs2_change_file_space(struct file *file, struct inode *inode, 1819 static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1818 loff_t f_pos, unsigned int cmd, 1820 loff_t f_pos, unsigned int cmd,
1819 struct ocfs2_space_resv *sr, 1821 struct ocfs2_space_resv *sr,
1820 int change_size) 1822 int change_size)
1821 { 1823 {
1822 int ret; 1824 int ret;
1823 s64 llen; 1825 s64 llen;
1824 loff_t size; 1826 loff_t size;
1825 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1827 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1826 struct buffer_head *di_bh = NULL; 1828 struct buffer_head *di_bh = NULL;
1827 handle_t *handle; 1829 handle_t *handle;
1828 unsigned long long max_off = inode->i_sb->s_maxbytes; 1830 unsigned long long max_off = inode->i_sb->s_maxbytes;
1829 1831
1830 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 1832 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
1831 return -EROFS; 1833 return -EROFS;
1832 1834
1833 mutex_lock(&inode->i_mutex); 1835 mutex_lock(&inode->i_mutex);
1834 1836
1835 /* 1837 /*
1836 * This prevents concurrent writes on other nodes 1838 * This prevents concurrent writes on other nodes
1837 */ 1839 */
1838 ret = ocfs2_rw_lock(inode, 1); 1840 ret = ocfs2_rw_lock(inode, 1);
1839 if (ret) { 1841 if (ret) {
1840 mlog_errno(ret); 1842 mlog_errno(ret);
1841 goto out; 1843 goto out;
1842 } 1844 }
1843 1845
1844 ret = ocfs2_inode_lock(inode, &di_bh, 1); 1846 ret = ocfs2_inode_lock(inode, &di_bh, 1);
1845 if (ret) { 1847 if (ret) {
1846 mlog_errno(ret); 1848 mlog_errno(ret);
1847 goto out_rw_unlock; 1849 goto out_rw_unlock;
1848 } 1850 }
1849 1851
1850 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { 1852 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1851 ret = -EPERM; 1853 ret = -EPERM;
1852 goto out_inode_unlock; 1854 goto out_inode_unlock;
1853 } 1855 }
1854 1856
1855 switch (sr->l_whence) { 1857 switch (sr->l_whence) {
1856 case 0: /*SEEK_SET*/ 1858 case 0: /*SEEK_SET*/
1857 break; 1859 break;
1858 case 1: /*SEEK_CUR*/ 1860 case 1: /*SEEK_CUR*/
1859 sr->l_start += f_pos; 1861 sr->l_start += f_pos;
1860 break; 1862 break;
1861 case 2: /*SEEK_END*/ 1863 case 2: /*SEEK_END*/
1862 sr->l_start += i_size_read(inode); 1864 sr->l_start += i_size_read(inode);
1863 break; 1865 break;
1864 default: 1866 default:
1865 ret = -EINVAL; 1867 ret = -EINVAL;
1866 goto out_inode_unlock; 1868 goto out_inode_unlock;
1867 } 1869 }
1868 sr->l_whence = 0; 1870 sr->l_whence = 0;
1869 1871
1870 llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len; 1872 llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
1871 1873
1872 if (sr->l_start < 0 1874 if (sr->l_start < 0
1873 || sr->l_start > max_off 1875 || sr->l_start > max_off
1874 || (sr->l_start + llen) < 0 1876 || (sr->l_start + llen) < 0
1875 || (sr->l_start + llen) > max_off) { 1877 || (sr->l_start + llen) > max_off) {
1876 ret = -EINVAL; 1878 ret = -EINVAL;
1877 goto out_inode_unlock; 1879 goto out_inode_unlock;
1878 } 1880 }
1879 size = sr->l_start + sr->l_len; 1881 size = sr->l_start + sr->l_len;
1880 1882
1881 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { 1883 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
1882 if (sr->l_len <= 0) { 1884 if (sr->l_len <= 0) {
1883 ret = -EINVAL; 1885 ret = -EINVAL;
1884 goto out_inode_unlock; 1886 goto out_inode_unlock;
1885 } 1887 }
1886 } 1888 }
1887 1889
1888 if (file && should_remove_suid(file->f_path.dentry)) { 1890 if (file && should_remove_suid(file->f_path.dentry)) {
1889 ret = __ocfs2_write_remove_suid(inode, di_bh); 1891 ret = __ocfs2_write_remove_suid(inode, di_bh);
1890 if (ret) { 1892 if (ret) {
1891 mlog_errno(ret); 1893 mlog_errno(ret);
1892 goto out_inode_unlock; 1894 goto out_inode_unlock;
1893 } 1895 }
1894 } 1896 }
1895 1897
1896 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1898 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1897 switch (cmd) { 1899 switch (cmd) {
1898 case OCFS2_IOC_RESVSP: 1900 case OCFS2_IOC_RESVSP:
1899 case OCFS2_IOC_RESVSP64: 1901 case OCFS2_IOC_RESVSP64:
1900 /* 1902 /*
1901 * This takes unsigned offsets, but the signed ones we 1903 * This takes unsigned offsets, but the signed ones we
1902 * pass have been checked against overflow above. 1904 * pass have been checked against overflow above.
1903 */ 1905 */
1904 ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start, 1906 ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
1905 sr->l_len); 1907 sr->l_len);
1906 break; 1908 break;
1907 case OCFS2_IOC_UNRESVSP: 1909 case OCFS2_IOC_UNRESVSP:
1908 case OCFS2_IOC_UNRESVSP64: 1910 case OCFS2_IOC_UNRESVSP64:
1909 ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start, 1911 ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
1910 sr->l_len); 1912 sr->l_len);
1911 break; 1913 break;
1912 default: 1914 default:
1913 ret = -EINVAL; 1915 ret = -EINVAL;
1914 } 1916 }
1915 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1917 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1916 if (ret) { 1918 if (ret) {
1917 mlog_errno(ret); 1919 mlog_errno(ret);
1918 goto out_inode_unlock; 1920 goto out_inode_unlock;
1919 } 1921 }
1920 1922
1921 /* 1923 /*
1922 * We update c/mtime for these changes 1924 * We update c/mtime for these changes
1923 */ 1925 */
1924 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1926 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1925 if (IS_ERR(handle)) { 1927 if (IS_ERR(handle)) {
1926 ret = PTR_ERR(handle); 1928 ret = PTR_ERR(handle);
1927 mlog_errno(ret); 1929 mlog_errno(ret);
1928 goto out_inode_unlock; 1930 goto out_inode_unlock;
1929 } 1931 }
1930 1932
1931 if (change_size && i_size_read(inode) < size) 1933 if (change_size && i_size_read(inode) < size)
1932 i_size_write(inode, size); 1934 i_size_write(inode, size);
1933 1935
1934 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 1936 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1935 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); 1937 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
1936 if (ret < 0) 1938 if (ret < 0)
1937 mlog_errno(ret); 1939 mlog_errno(ret);
1938 1940
1939 ocfs2_commit_trans(osb, handle); 1941 ocfs2_commit_trans(osb, handle);
1940 1942
1941 out_inode_unlock: 1943 out_inode_unlock:
1942 brelse(di_bh); 1944 brelse(di_bh);
1943 ocfs2_inode_unlock(inode, 1); 1945 ocfs2_inode_unlock(inode, 1);
1944 out_rw_unlock: 1946 out_rw_unlock:
1945 ocfs2_rw_unlock(inode, 1); 1947 ocfs2_rw_unlock(inode, 1);
1946 1948
1947 out: 1949 out:
1948 mutex_unlock(&inode->i_mutex); 1950 mutex_unlock(&inode->i_mutex);
1949 return ret; 1951 return ret;
1950 } 1952 }
1951 1953
1952 int ocfs2_change_file_space(struct file *file, unsigned int cmd, 1954 int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1953 struct ocfs2_space_resv *sr) 1955 struct ocfs2_space_resv *sr)
1954 { 1956 {
1955 struct inode *inode = file->f_path.dentry->d_inode; 1957 struct inode *inode = file->f_path.dentry->d_inode;
1956 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1958 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1957 1959
1958 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && 1960 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
1959 !ocfs2_writes_unwritten_extents(osb)) 1961 !ocfs2_writes_unwritten_extents(osb))
1960 return -ENOTTY; 1962 return -ENOTTY;
1961 else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) && 1963 else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
1962 !ocfs2_sparse_alloc(osb)) 1964 !ocfs2_sparse_alloc(osb))
1963 return -ENOTTY; 1965 return -ENOTTY;
1964 1966
1965 if (!S_ISREG(inode->i_mode)) 1967 if (!S_ISREG(inode->i_mode))
1966 return -EINVAL; 1968 return -EINVAL;
1967 1969
1968 if (!(file->f_mode & FMODE_WRITE)) 1970 if (!(file->f_mode & FMODE_WRITE))
1969 return -EBADF; 1971 return -EBADF;
1970 1972
1971 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); 1973 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
1972 } 1974 }
1973 1975
1974 static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, 1976 static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
1975 loff_t len) 1977 loff_t len)
1976 { 1978 {
1977 struct inode *inode = file->f_path.dentry->d_inode; 1979 struct inode *inode = file->f_path.dentry->d_inode;
1978 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1980 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1979 struct ocfs2_space_resv sr; 1981 struct ocfs2_space_resv sr;
1980 int change_size = 1; 1982 int change_size = 1;
1981 int cmd = OCFS2_IOC_RESVSP64; 1983 int cmd = OCFS2_IOC_RESVSP64;
1982 1984
1983 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 1985 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
1984 return -EOPNOTSUPP; 1986 return -EOPNOTSUPP;
1985 if (!ocfs2_writes_unwritten_extents(osb)) 1987 if (!ocfs2_writes_unwritten_extents(osb))
1986 return -EOPNOTSUPP; 1988 return -EOPNOTSUPP;
1987 1989
1988 if (mode & FALLOC_FL_KEEP_SIZE) 1990 if (mode & FALLOC_FL_KEEP_SIZE)
1989 change_size = 0; 1991 change_size = 0;
1990 1992
1991 if (mode & FALLOC_FL_PUNCH_HOLE) 1993 if (mode & FALLOC_FL_PUNCH_HOLE)
1992 cmd = OCFS2_IOC_UNRESVSP64; 1994 cmd = OCFS2_IOC_UNRESVSP64;
1993 1995
1994 sr.l_whence = 0; 1996 sr.l_whence = 0;
1995 sr.l_start = (s64)offset; 1997 sr.l_start = (s64)offset;
1996 sr.l_len = (s64)len; 1998 sr.l_len = (s64)len;
1997 1999
1998 return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr, 2000 return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
1999 change_size); 2001 change_size);
2000 } 2002 }
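
ocfs2_fallocate() is a thin mapping from the VFS fallocate flags onto the pre-existing ocfs2 space reservation commands: FALLOC_FL_KEEP_SIZE suppresses the i_size update (change_size = 0) and FALLOC_FL_PUNCH_HOLE selects OCFS2_IOC_UNRESVSP64 in place of OCFS2_IOC_RESVSP64. A minimal userspace sketch of the two modes as a caller would drive them; the mount point path is hypothetical:

/* Hedged userspace sketch: exercises the two fallocate(2) modes that
 * ocfs2_fallocate() translates into RESVSP64/UNRESVSP64. Linux-only;
 * /mnt/ocfs2/demo is a hypothetical path. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/ocfs2/demo", O_RDWR | O_CREAT, 0644);
	if (fd < 0) { perror("open"); return 1; }

	/* Preallocate 1 MiB without changing i_size
	 * (change_size = 0 path, cmd = OCFS2_IOC_RESVSP64). */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) < 0)
		perror("preallocate");

	/* Punch a 64 KiB hole at offset 128 KiB
	 * (cmd = OCFS2_IOC_UNRESVSP64). */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      128 << 10, 64 << 10) < 0)
		perror("punch hole");

	close(fd);
	return 0;
}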
2001 2003
2002 int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, 2004 int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
2003 size_t count) 2005 size_t count)
2004 { 2006 {
2005 int ret = 0; 2007 int ret = 0;
2006 unsigned int extent_flags; 2008 unsigned int extent_flags;
2007 u32 cpos, clusters, extent_len, phys_cpos; 2009 u32 cpos, clusters, extent_len, phys_cpos;
2008 struct super_block *sb = inode->i_sb; 2010 struct super_block *sb = inode->i_sb;
2009 2011
2010 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) || 2012 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
2011 !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) || 2013 !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) ||
2012 OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 2014 OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
2013 return 0; 2015 return 0;
2014 2016
2015 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; 2017 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
2016 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; 2018 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
2017 2019
2018 while (clusters) { 2020 while (clusters) {
2019 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, 2021 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
2020 &extent_flags); 2022 &extent_flags);
2021 if (ret < 0) { 2023 if (ret < 0) {
2022 mlog_errno(ret); 2024 mlog_errno(ret);
2023 goto out; 2025 goto out;
2024 } 2026 }
2025 2027
2026 if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) { 2028 if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
2027 ret = 1; 2029 ret = 1;
2028 break; 2030 break;
2029 } 2031 }
2030 2032
2031 if (extent_len > clusters) 2033 if (extent_len > clusters)
2032 extent_len = clusters; 2034 extent_len = clusters;
2033 2035
2034 clusters -= extent_len; 2036 clusters -= extent_len;
2035 cpos += extent_len; 2037 cpos += extent_len;
2036 } 2038 }
2037 out: 2039 out:
2038 return ret; 2040 return ret;
2039 } 2041 }
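
The extent walk above converts the byte range [pos, pos + count) into a cluster range first: cpos is the cluster containing pos, and clusters counts every cluster through the rounded-up end. A self-contained sketch of that arithmetic, assuming a 64 KiB cluster size; clusters_for_bytes() mirrors the round-up that ocfs2_clusters_for_bytes() performs:

/* Hedged sketch of the byte-range -> cluster-range conversion used by
 * ocfs2_check_range_for_refcount(). Assumes 64 KiB clusters. */
#include <stdint.h>
#include <stdio.h>

#define CLUSTERSIZE_BITS 16	/* 64 KiB clusters, an assumption */

static uint32_t clusters_for_bytes(uint64_t bytes)
{
	/* round up to a whole number of clusters */
	return (uint32_t)((bytes + (1ULL << CLUSTERSIZE_BITS) - 1)
			  >> CLUSTERSIZE_BITS);
}

int main(void)
{
	uint64_t pos = 100000, count = 200000;
	uint32_t cpos = (uint32_t)(pos >> CLUSTERSIZE_BITS);
	uint32_t clusters = clusters_for_bytes(pos + count) - cpos;

	/* pos 100000 falls in cluster 1; pos + count = 300000 needs
	 * ceil(300000/65536) = 5 clusters, so the walk covers
	 * clusters 1..4 inclusive (clusters = 4). */
	printf("cpos=%u clusters=%u\n", cpos, clusters);
	return 0;
}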
2040 2042
2041 static int ocfs2_prepare_inode_for_refcount(struct inode *inode, 2043 static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2042 struct file *file, 2044 struct file *file,
2043 loff_t pos, size_t count, 2045 loff_t pos, size_t count,
2044 int *meta_level) 2046 int *meta_level)
2045 { 2047 {
2046 int ret; 2048 int ret;
2047 struct buffer_head *di_bh = NULL; 2049 struct buffer_head *di_bh = NULL;
2048 u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 2050 u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
2049 u32 clusters = 2051 u32 clusters =
2050 ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; 2052 ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
2051 2053
2052 ret = ocfs2_inode_lock(inode, &di_bh, 1); 2054 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2053 if (ret) { 2055 if (ret) {
2054 mlog_errno(ret); 2056 mlog_errno(ret);
2055 goto out; 2057 goto out;
2056 } 2058 }
2057 2059
2058 *meta_level = 1; 2060 *meta_level = 1;
2059 2061
2060 ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX); 2062 ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX);
2061 if (ret) 2063 if (ret)
2062 mlog_errno(ret); 2064 mlog_errno(ret);
2063 out: 2065 out:
2064 brelse(di_bh); 2066 brelse(di_bh);
2065 return ret; 2067 return ret;
2066 } 2068 }
2067 2069
2068 static int ocfs2_prepare_inode_for_write(struct file *file, 2070 static int ocfs2_prepare_inode_for_write(struct file *file,
2069 loff_t *ppos, 2071 loff_t *ppos,
2070 size_t count, 2072 size_t count,
2071 int appending, 2073 int appending,
2072 int *direct_io, 2074 int *direct_io,
2073 int *has_refcount) 2075 int *has_refcount)
2074 { 2076 {
2075 int ret = 0, meta_level = 0; 2077 int ret = 0, meta_level = 0;
2076 struct dentry *dentry = file->f_path.dentry; 2078 struct dentry *dentry = file->f_path.dentry;
2077 struct inode *inode = dentry->d_inode; 2079 struct inode *inode = dentry->d_inode;
2078 loff_t saved_pos = 0, end; 2080 loff_t saved_pos = 0, end;
2079 2081
2080 /* 2082 /*
2081 * We start with a read level meta lock and only jump to an 2083 * We start with a read level meta lock and only jump to an
2082 * exclusive lock if we need to make modifications here. 2084 * exclusive lock if we need to make modifications here.
2083 */ 2085 */
2084 for(;;) { 2086 for(;;) {
2085 ret = ocfs2_inode_lock(inode, NULL, meta_level); 2087 ret = ocfs2_inode_lock(inode, NULL, meta_level);
2086 if (ret < 0) { 2088 if (ret < 0) {
2087 meta_level = -1; 2089 meta_level = -1;
2088 mlog_errno(ret); 2090 mlog_errno(ret);
2089 goto out; 2091 goto out;
2090 } 2092 }
2091 2093
2092 /* Clear suid / sgid if necessary. We do this here 2094 /* Clear suid / sgid if necessary. We do this here
2093 * instead of later in the write path because 2095 * instead of later in the write path because
2094 * remove_suid() calls ->setattr without any hint that 2096 * remove_suid() calls ->setattr without any hint that
2095 * we may have already done our cluster locking. Since 2097 * we may have already done our cluster locking. Since
2096 * ocfs2_setattr() *must* take cluster locks to 2098 * ocfs2_setattr() *must* take cluster locks to
2097 * proceed, this will lead us to recursively lock the 2099 * proceed, this will lead us to recursively lock the
2098 * inode. There's also the dinode i_size state which 2100 * inode. There's also the dinode i_size state which
2099 * can be lost via setattr during extending writes (we 2101 * can be lost via setattr during extending writes (we
2100 * set inode->i_size at the end of a write). */ 2102 * set inode->i_size at the end of a write). */
2101 if (should_remove_suid(dentry)) { 2103 if (should_remove_suid(dentry)) {
2102 if (meta_level == 0) { 2104 if (meta_level == 0) {
2103 ocfs2_inode_unlock(inode, meta_level); 2105 ocfs2_inode_unlock(inode, meta_level);
2104 meta_level = 1; 2106 meta_level = 1;
2105 continue; 2107 continue;
2106 } 2108 }
2107 2109
2108 ret = ocfs2_write_remove_suid(inode); 2110 ret = ocfs2_write_remove_suid(inode);
2109 if (ret < 0) { 2111 if (ret < 0) {
2110 mlog_errno(ret); 2112 mlog_errno(ret);
2111 goto out_unlock; 2113 goto out_unlock;
2112 } 2114 }
2113 } 2115 }
2114 2116
2115 /* work on a copy of ppos until we're sure that we won't have 2117 /* work on a copy of ppos until we're sure that we won't have
2116 * to recalculate it due to relocking. */ 2118 * to recalculate it due to relocking. */
2117 if (appending) 2119 if (appending)
2118 saved_pos = i_size_read(inode); 2120 saved_pos = i_size_read(inode);
2119 else 2121 else
2120 saved_pos = *ppos; 2122 saved_pos = *ppos;
2121 2123
2122 end = saved_pos + count; 2124 end = saved_pos + count;
2123 2125
2124 ret = ocfs2_check_range_for_refcount(inode, saved_pos, count); 2126 ret = ocfs2_check_range_for_refcount(inode, saved_pos, count);
2125 if (ret == 1) { 2127 if (ret == 1) {
2126 ocfs2_inode_unlock(inode, meta_level); 2128 ocfs2_inode_unlock(inode, meta_level);
2127 meta_level = -1; 2129 meta_level = -1;
2128 2130
2129 ret = ocfs2_prepare_inode_for_refcount(inode, 2131 ret = ocfs2_prepare_inode_for_refcount(inode,
2130 file, 2132 file,
2131 saved_pos, 2133 saved_pos,
2132 count, 2134 count,
2133 &meta_level); 2135 &meta_level);
2134 if (has_refcount) 2136 if (has_refcount)
2135 *has_refcount = 1; 2137 *has_refcount = 1;
2136 if (direct_io) 2138 if (direct_io)
2137 *direct_io = 0; 2139 *direct_io = 0;
2138 } 2140 }
2139 2141
2140 if (ret < 0) { 2142 if (ret < 0) {
2141 mlog_errno(ret); 2143 mlog_errno(ret);
2142 goto out_unlock; 2144 goto out_unlock;
2143 } 2145 }
2144 2146
2145 /* 2147 /*
2146 * Skip the O_DIRECT checks if we don't need 2148 * Skip the O_DIRECT checks if we don't need
2147 * them. 2149 * them.
2148 */ 2150 */
2149 if (!direct_io || !(*direct_io)) 2151 if (!direct_io || !(*direct_io))
2150 break; 2152 break;
2151 2153
2152 /* 2154 /*
2153 * There's no sane way to do direct writes to an inode 2155 * There's no sane way to do direct writes to an inode
2154 * with inline data. 2156 * with inline data.
2155 */ 2157 */
2156 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 2158 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
2157 *direct_io = 0; 2159 *direct_io = 0;
2158 break; 2160 break;
2159 } 2161 }
2160 2162
2161 /* 2163 /*
2162 * Allowing concurrent direct writes means 2164 * Allowing concurrent direct writes means
2163 * i_size changes wouldn't be synchronized, so 2165 * i_size changes wouldn't be synchronized, so
2164 * one node could wind up truncating another 2166 * one node could wind up truncating another
2165 * node's writes. 2167 * node's writes.
2166 */ 2168 */
2167 if (end > i_size_read(inode)) { 2169 if (end > i_size_read(inode)) {
2168 *direct_io = 0; 2170 *direct_io = 0;
2169 break; 2171 break;
2170 } 2172 }
2171 2173
2172 /* 2174 /*
2173 * We don't fill holes during direct io, so 2175 * We don't fill holes during direct io, so
2174 * check for them here. If any are found, the 2176 * check for them here. If any are found, the
2175 * caller will have to retake some cluster 2177 * caller will have to retake some cluster
2176 * locks and initiate the io as buffered. 2178 * locks and initiate the io as buffered.
2177 */ 2179 */
2178 ret = ocfs2_check_range_for_holes(inode, saved_pos, count); 2180 ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
2179 if (ret == 1) { 2181 if (ret == 1) {
2180 *direct_io = 0; 2182 *direct_io = 0;
2181 ret = 0; 2183 ret = 0;
2182 } else if (ret < 0) 2184 } else if (ret < 0)
2183 mlog_errno(ret); 2185 mlog_errno(ret);
2184 break; 2186 break;
2185 } 2187 }
2186 2188
2187 if (appending) 2189 if (appending)
2188 *ppos = saved_pos; 2190 *ppos = saved_pos;
2189 2191
2190 out_unlock: 2192 out_unlock:
2191 trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, 2193 trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
2192 saved_pos, appending, count, 2194 saved_pos, appending, count,
2193 direct_io, has_refcount); 2195 direct_io, has_refcount);
2194 2196
2195 if (meta_level >= 0) 2197 if (meta_level >= 0)
2196 ocfs2_inode_unlock(inode, meta_level); 2198 ocfs2_inode_unlock(inode, meta_level);
2197 2199
2198 out: 2200 out:
2199 return ret; 2201 return ret;
2200 } 2202 }
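
ocfs2_prepare_inode_for_write() upgrades its cluster lock by retry: it starts at read level (meta_level 0), and when a modification such as suid clearing turns out to be needed it drops the lock, retakes it exclusive (meta_level 1), and loops to redo every check, since another node may have changed state in the window. A hedged pthread sketch of the same drop-and-retake pattern; needs_modification() is a hypothetical stand-in for should_remove_suid():

/* Hedged sketch of the lock-upgrade-by-retry loop in
 * ocfs2_prepare_inode_for_write(). pthread rwlocks stand in for the
 * cluster lock. Compile with -lpthread. */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t meta_lock = PTHREAD_RWLOCK_INITIALIZER;

static int needs_modification(void)
{
	return 1;	/* pretend we must clear suid */
}

static void prepare_for_write(void)
{
	int exclusive = 0;

	for (;;) {
		if (exclusive)
			pthread_rwlock_wrlock(&meta_lock);
		else
			pthread_rwlock_rdlock(&meta_lock);

		if (needs_modification() && !exclusive) {
			/* Can't upgrade in place: drop the shared lock,
			 * retake exclusive, and redo every check, since
			 * someone may have raced in between. */
			pthread_rwlock_unlock(&meta_lock);
			exclusive = 1;
			continue;
		}

		/* ... all checks passed at the right lock level ... */
		break;
	}
	pthread_rwlock_unlock(&meta_lock);
	printf("prepared at %s level\n", exclusive ? "exclusive" : "shared");
}

int main(void)
{
	prepare_for_write();
	return 0;
}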
2201 2203
2202 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, 2204 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2203 const struct iovec *iov, 2205 const struct iovec *iov,
2204 unsigned long nr_segs, 2206 unsigned long nr_segs,
2205 loff_t pos) 2207 loff_t pos)
2206 { 2208 {
2207 int ret, direct_io, appending, rw_level, have_alloc_sem = 0; 2209 int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
2208 int can_do_direct, has_refcount = 0; 2210 int can_do_direct, has_refcount = 0;
2209 ssize_t written = 0; 2211 ssize_t written = 0;
2210 size_t ocount; /* original count */ 2212 size_t ocount; /* original count */
2211 size_t count; /* after file limit checks */ 2213 size_t count; /* after file limit checks */
2212 loff_t old_size, *ppos = &iocb->ki_pos; 2214 loff_t old_size, *ppos = &iocb->ki_pos;
2213 u32 old_clusters; 2215 u32 old_clusters;
2214 struct file *file = iocb->ki_filp; 2216 struct file *file = iocb->ki_filp;
2215 struct inode *inode = file->f_path.dentry->d_inode; 2217 struct inode *inode = file->f_path.dentry->d_inode;
2216 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2218 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2217 int full_coherency = !(osb->s_mount_opt & 2219 int full_coherency = !(osb->s_mount_opt &
2218 OCFS2_MOUNT_COHERENCY_BUFFERED); 2220 OCFS2_MOUNT_COHERENCY_BUFFERED);
2219 2221
2220 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, 2222 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
2221 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2223 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2222 file->f_path.dentry->d_name.len, 2224 file->f_path.dentry->d_name.len,
2223 file->f_path.dentry->d_name.name, 2225 file->f_path.dentry->d_name.name,
2224 (unsigned int)nr_segs); 2226 (unsigned int)nr_segs);
2225 2227
2226 if (iocb->ki_left == 0) 2228 if (iocb->ki_left == 0)
2227 return 0; 2229 return 0;
2228 2230
2229 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 2231 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2230 2232
2231 appending = file->f_flags & O_APPEND ? 1 : 0; 2233 appending = file->f_flags & O_APPEND ? 1 : 0;
2232 direct_io = file->f_flags & O_DIRECT ? 1 : 0; 2234 direct_io = file->f_flags & O_DIRECT ? 1 : 0;
2233 2235
2234 mutex_lock(&inode->i_mutex); 2236 mutex_lock(&inode->i_mutex);
2235 2237
2236 ocfs2_iocb_clear_sem_locked(iocb); 2238 ocfs2_iocb_clear_sem_locked(iocb);
2237 2239
2238 relock: 2240 relock:
2239 /* to match setattr's i_mutex -> rw_lock ordering */ 2241 /* to match setattr's i_mutex -> rw_lock ordering */
2240 if (direct_io) { 2242 if (direct_io) {
2241 atomic_inc(&inode->i_dio_count); 2243 atomic_inc(&inode->i_dio_count);
2242 have_alloc_sem = 1; 2244 have_alloc_sem = 1;
2243 /* communicate with ocfs2_dio_end_io */ 2245 /* communicate with ocfs2_dio_end_io */
2244 ocfs2_iocb_set_sem_locked(iocb); 2246 ocfs2_iocb_set_sem_locked(iocb);
2245 } 2247 }
2246 2248
2247 /* 2249 /*
2248 * Concurrent O_DIRECT writes are allowed with 2250 * Concurrent O_DIRECT writes are allowed with
2249 * the mount option "coherency=buffered". 2251 * the mount option "coherency=buffered".
2250 */ 2252 */
2251 rw_level = (!direct_io || full_coherency); 2253 rw_level = (!direct_io || full_coherency);
2252 2254
2253 ret = ocfs2_rw_lock(inode, rw_level); 2255 ret = ocfs2_rw_lock(inode, rw_level);
2254 if (ret < 0) { 2256 if (ret < 0) {
2255 mlog_errno(ret); 2257 mlog_errno(ret);
2256 goto out_sems; 2258 goto out_sems;
2257 } 2259 }
2258 2260
2259 /* 2261 /*
2260 * O_DIRECT writes with "coherency=full" need to take EX cluster 2262 * O_DIRECT writes with "coherency=full" need to take EX cluster
2261 * inode_lock to guarantee coherency. 2263 * inode_lock to guarantee coherency.
2262 */ 2264 */
2263 if (direct_io && full_coherency) { 2265 if (direct_io && full_coherency) {
2264 /* 2266 /*
2265 * We need to take and drop the inode lock to force 2267 * We need to take and drop the inode lock to force
2266 * other nodes to drop their caches. Buffered I/O 2268 * other nodes to drop their caches. Buffered I/O
2267 * already does this in write_begin(). 2269 * already does this in write_begin().
2268 */ 2270 */
2269 ret = ocfs2_inode_lock(inode, NULL, 1); 2271 ret = ocfs2_inode_lock(inode, NULL, 1);
2270 if (ret < 0) { 2272 if (ret < 0) {
2271 mlog_errno(ret); 2273 mlog_errno(ret);
2272 goto out_sems; 2274 goto out_sems;
2273 } 2275 }
2274 2276
2275 ocfs2_inode_unlock(inode, 1); 2277 ocfs2_inode_unlock(inode, 1);
2276 } 2278 }
2277 2279
2278 can_do_direct = direct_io; 2280 can_do_direct = direct_io;
2279 ret = ocfs2_prepare_inode_for_write(file, ppos, 2281 ret = ocfs2_prepare_inode_for_write(file, ppos,
2280 iocb->ki_left, appending, 2282 iocb->ki_left, appending,
2281 &can_do_direct, &has_refcount); 2283 &can_do_direct, &has_refcount);
2282 if (ret < 0) { 2284 if (ret < 0) {
2283 mlog_errno(ret); 2285 mlog_errno(ret);
2284 goto out; 2286 goto out;
2285 } 2287 }
2286 2288
2287 /* 2289 /*
2288 * We can't complete the direct I/O as requested, fall back to 2290 * We can't complete the direct I/O as requested, fall back to
2289 * buffered I/O. 2291 * buffered I/O.
2290 */ 2292 */
2291 if (direct_io && !can_do_direct) { 2293 if (direct_io && !can_do_direct) {
2292 ocfs2_rw_unlock(inode, rw_level); 2294 ocfs2_rw_unlock(inode, rw_level);
2293 inode_dio_done(inode); 2295 inode_dio_done(inode);
2294 2296
2295 have_alloc_sem = 0; 2297 have_alloc_sem = 0;
2296 rw_level = -1; 2298 rw_level = -1;
2297 2299
2298 direct_io = 0; 2300 direct_io = 0;
2299 goto relock; 2301 goto relock;
2300 } 2302 }
2301 2303
2302 /* 2304 /*
2303 * To later detect whether a journal commit for sync writes is 2305 * To later detect whether a journal commit for sync writes is
2304 * necessary, we sample i_size and cluster count here. 2306 * necessary, we sample i_size and cluster count here.
2305 */ 2307 */
2306 old_size = i_size_read(inode); 2308 old_size = i_size_read(inode);
2307 old_clusters = OCFS2_I(inode)->ip_clusters; 2309 old_clusters = OCFS2_I(inode)->ip_clusters;
2308 2310
2309 /* communicate with ocfs2_dio_end_io */ 2311 /* communicate with ocfs2_dio_end_io */
2310 ocfs2_iocb_set_rw_locked(iocb, rw_level); 2312 ocfs2_iocb_set_rw_locked(iocb, rw_level);
2311 2313
2312 ret = generic_segment_checks(iov, &nr_segs, &ocount, 2314 ret = generic_segment_checks(iov, &nr_segs, &ocount,
2313 VERIFY_READ); 2315 VERIFY_READ);
2314 if (ret) 2316 if (ret)
2315 goto out_dio; 2317 goto out_dio;
2316 2318
2317 count = ocount; 2319 count = ocount;
2318 ret = generic_write_checks(file, ppos, &count, 2320 ret = generic_write_checks(file, ppos, &count,
2319 S_ISBLK(inode->i_mode)); 2321 S_ISBLK(inode->i_mode));
2320 if (ret) 2322 if (ret)
2321 goto out_dio; 2323 goto out_dio;
2322 2324
2323 if (direct_io) { 2325 if (direct_io) {
2324 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, 2326 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
2325 ppos, count, ocount); 2327 ppos, count, ocount);
2326 if (written < 0) { 2328 if (written < 0) {
2327 ret = written; 2329 ret = written;
2328 goto out_dio; 2330 goto out_dio;
2329 } 2331 }
2330 } else { 2332 } else {
2331 current->backing_dev_info = file->f_mapping->backing_dev_info; 2333 current->backing_dev_info = file->f_mapping->backing_dev_info;
2332 written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos, 2334 written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos,
2333 ppos, count, 0); 2335 ppos, count, 0);
2334 current->backing_dev_info = NULL; 2336 current->backing_dev_info = NULL;
2335 } 2337 }
2336 2338
2337 out_dio: 2339 out_dio:
2338 /* buffered aio wouldn't have proper lock coverage today */ 2340 /* buffered aio wouldn't have proper lock coverage today */
2339 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 2341 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
2340 2342
2341 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || 2343 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
2342 ((file->f_flags & O_DIRECT) && !direct_io)) { 2344 ((file->f_flags & O_DIRECT) && !direct_io)) {
2343 ret = filemap_fdatawrite_range(file->f_mapping, pos, 2345 ret = filemap_fdatawrite_range(file->f_mapping, pos,
2344 pos + count - 1); 2346 pos + count - 1);
2345 if (ret < 0) 2347 if (ret < 0)
2346 written = ret; 2348 written = ret;
2347 2349
2348 if (!ret && ((old_size != i_size_read(inode)) || 2350 if (!ret && ((old_size != i_size_read(inode)) ||
2349 (old_clusters != OCFS2_I(inode)->ip_clusters) || 2351 (old_clusters != OCFS2_I(inode)->ip_clusters) ||
2350 has_refcount)) { 2352 has_refcount)) {
2351 ret = jbd2_journal_force_commit(osb->journal->j_journal); 2353 ret = jbd2_journal_force_commit(osb->journal->j_journal);
2352 if (ret < 0) 2354 if (ret < 0)
2353 written = ret; 2355 written = ret;
2354 } 2356 }
2355 2357
2356 if (!ret) 2358 if (!ret)
2357 ret = filemap_fdatawait_range(file->f_mapping, pos, 2359 ret = filemap_fdatawait_range(file->f_mapping, pos,
2358 pos + count - 1); 2360 pos + count - 1);
2359 } 2361 }
2360 2362
2361 /* 2363 /*
2362 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io 2364 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io
2363 * function pointer which is called when o_direct io completes so that 2365 * function pointer which is called when o_direct io completes so that
2364 * it can unlock our rw lock. 2366 * it can unlock our rw lock.
2365 * Unfortunately there are error cases which call end_io and others 2367 * Unfortunately there are error cases which call end_io and others
2366 * that don't. So we don't have to unlock the rw_lock if either an 2368 * that don't. So we don't have to unlock the rw_lock if either an
2367 * async dio is going to do it in the future or an end_io after an 2369 * async dio is going to do it in the future or an end_io after an
2368 * error has already done it. 2370 * error has already done it.
2369 */ 2371 */
2370 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { 2372 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2371 rw_level = -1; 2373 rw_level = -1;
2372 have_alloc_sem = 0; 2374 have_alloc_sem = 0;
2373 } 2375 }
2374 2376
2375 out: 2377 out:
2376 if (rw_level != -1) 2378 if (rw_level != -1)
2377 ocfs2_rw_unlock(inode, rw_level); 2379 ocfs2_rw_unlock(inode, rw_level);
2378 2380
2379 out_sems: 2381 out_sems:
2380 if (have_alloc_sem) { 2382 if (have_alloc_sem) {
2381 inode_dio_done(inode); 2383 inode_dio_done(inode);
2382 ocfs2_iocb_clear_sem_locked(iocb); 2384 ocfs2_iocb_clear_sem_locked(iocb);
2383 } 2385 }
2384 2386
2385 mutex_unlock(&inode->i_mutex); 2387 mutex_unlock(&inode->i_mutex);
2386 2388
2387 if (written) 2389 if (written)
2388 ret = written; 2390 ret = written;
2389 return ret; 2391 return ret;
2390 } 2392 }
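
The direct-I/O reference counting in this path is the point of the series: the write path bumps inode->i_dio_count itself, under its own cluster locks, and drops it with inode_dio_done(), so a later inode_dio_wait() in ->setattr blocks until all direct I/O drains. A hedged C11 sketch of just the counter protocol; the real kernel sleeps on a wait-queue bit (__I_DIO_WAKEUP) where this sketch busy-waits:

/* Hedged sketch of the i_dio_count reference protocol. Illustrative
 * only; compile with -std=c11. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int i_dio_count;

static void dio_begin(void)      { atomic_fetch_add(&i_dio_count, 1); }
static void inode_dio_done(void) { atomic_fetch_sub(&i_dio_count, 1); }

static void inode_dio_wait(void)
{
	/* A busy-wait keeps the sketch self-contained; the kernel
	 * sleeps until the count reaches zero. */
	while (atomic_load(&i_dio_count) > 0)
		;
}

int main(void)
{
	dio_begin();		/* O_DIRECT write in flight */
	inode_dio_done();	/* completion, e.g. ocfs2_dio_end_io */
	inode_dio_wait();	/* e.g. ->setattr before truncating */
	printf("no direct I/O pending\n");
	return 0;
}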
2391 2393
2392 static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, 2394 static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
2393 struct file *out, 2395 struct file *out,
2394 struct splice_desc *sd) 2396 struct splice_desc *sd)
2395 { 2397 {
2396 int ret; 2398 int ret;
2397 2399
2398 ret = ocfs2_prepare_inode_for_write(out, &sd->pos, 2400 ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
2399 sd->total_len, 0, NULL, NULL); 2401 sd->total_len, 0, NULL, NULL);
2400 if (ret < 0) { 2402 if (ret < 0) {
2401 mlog_errno(ret); 2403 mlog_errno(ret);
2402 return ret; 2404 return ret;
2403 } 2405 }
2404 2406
2405 return splice_from_pipe_feed(pipe, sd, pipe_to_file); 2407 return splice_from_pipe_feed(pipe, sd, pipe_to_file);
2406 } 2408 }
2407 2409
2408 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, 2410 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
2409 struct file *out, 2411 struct file *out,
2410 loff_t *ppos, 2412 loff_t *ppos,
2411 size_t len, 2413 size_t len,
2412 unsigned int flags) 2414 unsigned int flags)
2413 { 2415 {
2414 int ret; 2416 int ret;
2415 struct address_space *mapping = out->f_mapping; 2417 struct address_space *mapping = out->f_mapping;
2416 struct inode *inode = mapping->host; 2418 struct inode *inode = mapping->host;
2417 struct splice_desc sd = { 2419 struct splice_desc sd = {
2418 .total_len = len, 2420 .total_len = len,
2419 .flags = flags, 2421 .flags = flags,
2420 .pos = *ppos, 2422 .pos = *ppos,
2421 .u.file = out, 2423 .u.file = out,
2422 }; 2424 };
2423 2425
2424 2426
2425 trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry, 2427 trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry,
2426 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2428 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2427 out->f_path.dentry->d_name.len, 2429 out->f_path.dentry->d_name.len,
2428 out->f_path.dentry->d_name.name, len); 2430 out->f_path.dentry->d_name.name, len);
2429 2431
2430 if (pipe->inode) 2432 if (pipe->inode)
2431 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); 2433 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
2432 2434
2433 splice_from_pipe_begin(&sd); 2435 splice_from_pipe_begin(&sd);
2434 do { 2436 do {
2435 ret = splice_from_pipe_next(pipe, &sd); 2437 ret = splice_from_pipe_next(pipe, &sd);
2436 if (ret <= 0) 2438 if (ret <= 0)
2437 break; 2439 break;
2438 2440
2439 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 2441 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
2440 ret = ocfs2_rw_lock(inode, 1); 2442 ret = ocfs2_rw_lock(inode, 1);
2441 if (ret < 0) 2443 if (ret < 0)
2442 mlog_errno(ret); 2444 mlog_errno(ret);
2443 else { 2445 else {
2444 ret = ocfs2_splice_to_file(pipe, out, &sd); 2446 ret = ocfs2_splice_to_file(pipe, out, &sd);
2445 ocfs2_rw_unlock(inode, 1); 2447 ocfs2_rw_unlock(inode, 1);
2446 } 2448 }
2447 mutex_unlock(&inode->i_mutex); 2449 mutex_unlock(&inode->i_mutex);
2448 } while (ret > 0); 2450 } while (ret > 0);
2449 splice_from_pipe_end(pipe, &sd); 2451 splice_from_pipe_end(pipe, &sd);
2450 2452
2451 if (pipe->inode) 2453 if (pipe->inode)
2452 mutex_unlock(&pipe->inode->i_mutex); 2454 mutex_unlock(&pipe->inode->i_mutex);
2453 2455
2454 if (sd.num_spliced) 2456 if (sd.num_spliced)
2455 ret = sd.num_spliced; 2457 ret = sd.num_spliced;
2456 2458
2457 if (ret > 0) { 2459 if (ret > 0) {
2458 unsigned long nr_pages; 2460 unsigned long nr_pages;
2459 int err; 2461 int err;
2460 2462
2461 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 2463 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
2462 2464
2463 err = generic_write_sync(out, *ppos, ret); 2465 err = generic_write_sync(out, *ppos, ret);
2464 if (err) 2466 if (err)
2465 ret = err; 2467 ret = err;
2466 else 2468 else
2467 *ppos += ret; 2469 *ppos += ret;
2468 2470
2469 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 2471 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
2470 } 2472 }
2471 2473
2472 return ret; 2474 return ret;
2473 } 2475 }
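
After a successful splice the byte count is converted to a page count with a round-up shift before throttling via balance_dirty_pages_ratelimited_nr(). A small worked sketch of that conversion, assuming 4 KiB pages:

/* Hedged sketch of the bytes -> pages round-up used above. */
#include <stdio.h>

#define PAGE_CACHE_SHIFT 12	/* 4 KiB pages, an assumption */
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

int main(void)
{
	long ret = 5000;	/* bytes spliced */
	unsigned long nr_pages =
		(ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	/* 5000 bytes touch two 4096-byte pages: ceil(5000/4096) = 2. */
	printf("%lu page(s) dirtied\n", nr_pages);
	return 0;
}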
2474 2476
2475 static ssize_t ocfs2_file_splice_read(struct file *in, 2477 static ssize_t ocfs2_file_splice_read(struct file *in,
2476 loff_t *ppos, 2478 loff_t *ppos,
2477 struct pipe_inode_info *pipe, 2479 struct pipe_inode_info *pipe,
2478 size_t len, 2480 size_t len,
2479 unsigned int flags) 2481 unsigned int flags)
2480 { 2482 {
2481 int ret = 0, lock_level = 0; 2483 int ret = 0, lock_level = 0;
2482 struct inode *inode = in->f_path.dentry->d_inode; 2484 struct inode *inode = in->f_path.dentry->d_inode;
2483 2485
2484 trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry, 2486 trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,
2485 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2487 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2486 in->f_path.dentry->d_name.len, 2488 in->f_path.dentry->d_name.len,
2487 in->f_path.dentry->d_name.name, len); 2489 in->f_path.dentry->d_name.name, len);
2488 2490
2489 /* 2491 /*
2490 * See the comment in ocfs2_file_aio_read() 2492 * See the comment in ocfs2_file_aio_read()
2491 */ 2493 */
2492 ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level); 2494 ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level);
2493 if (ret < 0) { 2495 if (ret < 0) {
2494 mlog_errno(ret); 2496 mlog_errno(ret);
2495 goto bail; 2497 goto bail;
2496 } 2498 }
2497 ocfs2_inode_unlock(inode, lock_level); 2499 ocfs2_inode_unlock(inode, lock_level);
2498 2500
2499 ret = generic_file_splice_read(in, ppos, pipe, len, flags); 2501 ret = generic_file_splice_read(in, ppos, pipe, len, flags);
2500 2502
2501 bail: 2503 bail:
2502 return ret; 2504 return ret;
2503 } 2505 }
2504 2506
2505 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, 2507 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2506 const struct iovec *iov, 2508 const struct iovec *iov,
2507 unsigned long nr_segs, 2509 unsigned long nr_segs,
2508 loff_t pos) 2510 loff_t pos)
2509 { 2511 {
2510 int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; 2512 int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
2511 struct file *filp = iocb->ki_filp; 2513 struct file *filp = iocb->ki_filp;
2512 struct inode *inode = filp->f_path.dentry->d_inode; 2514 struct inode *inode = filp->f_path.dentry->d_inode;
2513 2515
2514 trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry, 2516 trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
2515 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2517 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2516 filp->f_path.dentry->d_name.len, 2518 filp->f_path.dentry->d_name.len,
2517 filp->f_path.dentry->d_name.name, nr_segs); 2519 filp->f_path.dentry->d_name.name, nr_segs);
2518 2520
2519 2521
2520 if (!inode) { 2522 if (!inode) {
2521 ret = -EINVAL; 2523 ret = -EINVAL;
2522 mlog_errno(ret); 2524 mlog_errno(ret);
2523 goto bail; 2525 goto bail;
2524 } 2526 }
2525 2527
2526 ocfs2_iocb_clear_sem_locked(iocb); 2528 ocfs2_iocb_clear_sem_locked(iocb);
2527 2529
2528 /* 2530 /*
2529 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 2531 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
2530 * need locks to protect pending reads from racing with truncate. 2532 * need locks to protect pending reads from racing with truncate.
2531 */ 2533 */
2532 if (filp->f_flags & O_DIRECT) { 2534 if (filp->f_flags & O_DIRECT) {
2533 have_alloc_sem = 1; 2535 have_alloc_sem = 1;
2534 atomic_inc(&inode->i_dio_count); 2536 atomic_inc(&inode->i_dio_count);
2535 ocfs2_iocb_set_sem_locked(iocb); 2537 ocfs2_iocb_set_sem_locked(iocb);
2536 2538
2537 ret = ocfs2_rw_lock(inode, 0); 2539 ret = ocfs2_rw_lock(inode, 0);
2538 if (ret < 0) { 2540 if (ret < 0) {
2539 mlog_errno(ret); 2541 mlog_errno(ret);
2540 goto bail; 2542 goto bail;
2541 } 2543 }
2542 rw_level = 0; 2544 rw_level = 0;
2543 /* communicate with ocfs2_dio_end_io */ 2545 /* communicate with ocfs2_dio_end_io */
2544 ocfs2_iocb_set_rw_locked(iocb, rw_level); 2546 ocfs2_iocb_set_rw_locked(iocb, rw_level);
2545 } 2547 }
2546 2548
2547 /* 2549 /*
2548 * We're fine letting folks race truncates and extending 2550 * We're fine letting folks race truncates and extending
2549 * writes with read across the cluster, just like they can 2551 * writes with read across the cluster, just like they can
2550 * locally. Hence no rw_lock during read. 2552 * locally. Hence no rw_lock during read.
2551 * 2553 *
2552 * Take and drop the meta data lock to update inode fields 2554 * Take and drop the meta data lock to update inode fields
2553 * like i_size. This allows the checks down below 2555 * like i_size. This allows the checks down below
2554 * generic_file_aio_read() a chance of actually working. 2556 * generic_file_aio_read() a chance of actually working.
2555 */ 2557 */
2556 ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); 2558 ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
2557 if (ret < 0) { 2559 if (ret < 0) {
2558 mlog_errno(ret); 2560 mlog_errno(ret);
2559 goto bail; 2561 goto bail;
2560 } 2562 }
2561 ocfs2_inode_unlock(inode, lock_level); 2563 ocfs2_inode_unlock(inode, lock_level);
2562 2564
2563 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); 2565 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
2564 trace_generic_file_aio_read_ret(ret); 2566 trace_generic_file_aio_read_ret(ret);
2565 2567
2566 /* buffered aio wouldn't have proper lock coverage today */ 2568 /* buffered aio wouldn't have proper lock coverage today */
2567 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 2569 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
2568 2570
2569 /* see ocfs2_file_aio_write */ 2571 /* see ocfs2_file_aio_write */
2570 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 2572 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
2571 rw_level = -1; 2573 rw_level = -1;
2572 have_alloc_sem = 0; 2574 have_alloc_sem = 0;
2573 } 2575 }
2574 2576
2575 bail: 2577 bail:
2576 if (have_alloc_sem) { 2578 if (have_alloc_sem) {
2577 inode_dio_done(inode); 2579 inode_dio_done(inode);
2578 ocfs2_iocb_clear_sem_locked(iocb); 2580 ocfs2_iocb_clear_sem_locked(iocb);
2579 } 2581 }
2580 if (rw_level != -1) 2582 if (rw_level != -1)
2581 ocfs2_rw_unlock(inode, rw_level); 2583 ocfs2_rw_unlock(inode, rw_level);
2582 2584
2583 return ret; 2585 return ret;
2584 } 2586 }
2585 2587
2586 const struct inode_operations ocfs2_file_iops = { 2588 const struct inode_operations ocfs2_file_iops = {
2587 .setattr = ocfs2_setattr, 2589 .setattr = ocfs2_setattr,
2588 .getattr = ocfs2_getattr, 2590 .getattr = ocfs2_getattr,
2589 .permission = ocfs2_permission, 2591 .permission = ocfs2_permission,
2590 .setxattr = generic_setxattr, 2592 .setxattr = generic_setxattr,
2591 .getxattr = generic_getxattr, 2593 .getxattr = generic_getxattr,
2592 .listxattr = ocfs2_listxattr, 2594 .listxattr = ocfs2_listxattr,
2593 .removexattr = generic_removexattr, 2595 .removexattr = generic_removexattr,
2594 .fiemap = ocfs2_fiemap, 2596 .fiemap = ocfs2_fiemap,
2595 .check_acl = ocfs2_check_acl, 2597 .check_acl = ocfs2_check_acl,
2596 }; 2598 };
2597 2599
2598 const struct inode_operations ocfs2_special_file_iops = { 2600 const struct inode_operations ocfs2_special_file_iops = {
2599 .setattr = ocfs2_setattr, 2601 .setattr = ocfs2_setattr,
2600 .getattr = ocfs2_getattr, 2602 .getattr = ocfs2_getattr,
2601 .permission = ocfs2_permission, 2603 .permission = ocfs2_permission,
2602 .check_acl = ocfs2_check_acl, 2604 .check_acl = ocfs2_check_acl,
2603 }; 2605 };
2604 2606
2605 /* 2607 /*
2606 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with 2608 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
2607 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! 2609 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
2608 */ 2610 */
2609 const struct file_operations ocfs2_fops = { 2611 const struct file_operations ocfs2_fops = {
2610 .llseek = generic_file_llseek, 2612 .llseek = generic_file_llseek,
2611 .read = do_sync_read, 2613 .read = do_sync_read,
2612 .write = do_sync_write, 2614 .write = do_sync_write,
2613 .mmap = ocfs2_mmap, 2615 .mmap = ocfs2_mmap,
2614 .fsync = ocfs2_sync_file, 2616 .fsync = ocfs2_sync_file,
2615 .release = ocfs2_file_release, 2617 .release = ocfs2_file_release,
2616 .open = ocfs2_file_open, 2618 .open = ocfs2_file_open,
2617 .aio_read = ocfs2_file_aio_read, 2619 .aio_read = ocfs2_file_aio_read,
2618 .aio_write = ocfs2_file_aio_write, 2620 .aio_write = ocfs2_file_aio_write,
2619 .unlocked_ioctl = ocfs2_ioctl, 2621 .unlocked_ioctl = ocfs2_ioctl,
2620 #ifdef CONFIG_COMPAT 2622 #ifdef CONFIG_COMPAT
2621 .compat_ioctl = ocfs2_compat_ioctl, 2623 .compat_ioctl = ocfs2_compat_ioctl,
2622 #endif 2624 #endif
2623 .lock = ocfs2_lock, 2625 .lock = ocfs2_lock,
2624 .flock = ocfs2_flock, 2626 .flock = ocfs2_flock,
2625 .splice_read = ocfs2_file_splice_read, 2627 .splice_read = ocfs2_file_splice_read,
2626 .splice_write = ocfs2_file_splice_write, 2628 .splice_write = ocfs2_file_splice_write,
2627 .fallocate = ocfs2_fallocate, 2629 .fallocate = ocfs2_fallocate,
2628 }; 2630 };
2629 2631
2630 const struct file_operations ocfs2_dops = { 2632 const struct file_operations ocfs2_dops = {
2631 .llseek = generic_file_llseek, 2633 .llseek = generic_file_llseek,
2632 .read = generic_read_dir, 2634 .read = generic_read_dir,
2633 .readdir = ocfs2_readdir, 2635 .readdir = ocfs2_readdir,
2634 .fsync = ocfs2_sync_file, 2636 .fsync = ocfs2_sync_file,
2635 .release = ocfs2_dir_release, 2637 .release = ocfs2_dir_release,
2636 .open = ocfs2_dir_open, 2638 .open = ocfs2_dir_open,
2637 .unlocked_ioctl = ocfs2_ioctl, 2639 .unlocked_ioctl = ocfs2_ioctl,
2638 #ifdef CONFIG_COMPAT 2640 #ifdef CONFIG_COMPAT
2639 .compat_ioctl = ocfs2_compat_ioctl, 2641 .compat_ioctl = ocfs2_compat_ioctl,
2640 #endif 2642 #endif
2641 .lock = ocfs2_lock, 2643 .lock = ocfs2_lock,
2642 .flock = ocfs2_flock, 2644 .flock = ocfs2_flock,
2643 }; 2645 };
2644 2646
2645 /* 2647 /*
2646 * POSIX-lockless variants of our file_operations. 2648 * POSIX-lockless variants of our file_operations.
2647 * 2649 *
2648 * These will be used if the underlying cluster stack does not support 2650 * These will be used if the underlying cluster stack does not support
2649 * posix file locking, if the user passes the "localflocks" mount 2651 * posix file locking, if the user passes the "localflocks" mount
2650 * option, or if we have a local-only fs. 2652 * option, or if we have a local-only fs.
2651 * 2653 *
2652 * ocfs2_flock is in here because all stacks handle UNIX file locks, 2654 * ocfs2_flock is in here because all stacks handle UNIX file locks,
2653 * so we still want it in the case of no stack support for 2655 * so we still want it in the case of no stack support for
2654 * plocks. Internally, it will do the right thing when asked to ignore 2656 * plocks. Internally, it will do the right thing when asked to ignore
2655 * the cluster. 2657 * the cluster.
2656 */ 2658 */
2657 const struct file_operations ocfs2_fops_no_plocks = { 2659 const struct file_operations ocfs2_fops_no_plocks = {
2658 .llseek = generic_file_llseek, 2660 .llseek = generic_file_llseek,
2659 .read = do_sync_read, 2661 .read = do_sync_read,
2660 .write = do_sync_write, 2662 .write = do_sync_write,
2661 .mmap = ocfs2_mmap, 2663 .mmap = ocfs2_mmap,
2662 .fsync = ocfs2_sync_file, 2664 .fsync = ocfs2_sync_file,
2663 .release = ocfs2_file_release, 2665 .release = ocfs2_file_release,
2664 .open = ocfs2_file_open, 2666 .open = ocfs2_file_open,
2665 .aio_read = ocfs2_file_aio_read, 2667 .aio_read = ocfs2_file_aio_read,
2666 .aio_write = ocfs2_file_aio_write, 2668 .aio_write = ocfs2_file_aio_write,
2667 .unlocked_ioctl = ocfs2_ioctl, 2669 .unlocked_ioctl = ocfs2_ioctl,
2668 #ifdef CONFIG_COMPAT 2670 #ifdef CONFIG_COMPAT
2669 .compat_ioctl = ocfs2_compat_ioctl, 2671 .compat_ioctl = ocfs2_compat_ioctl,
2670 #endif 2672 #endif
2671 .flock = ocfs2_flock, 2673 .flock = ocfs2_flock,
2672 .splice_read = ocfs2_file_splice_read, 2674 .splice_read = ocfs2_file_splice_read,
2673 .splice_write = ocfs2_file_splice_write, 2675 .splice_write = ocfs2_file_splice_write,
2674 .fallocate = ocfs2_fallocate, 2676 .fallocate = ocfs2_fallocate,
2675 }; 2677 };
2676 2678
2677 const struct file_operations ocfs2_dops_no_plocks = { 2679 const struct file_operations ocfs2_dops_no_plocks = {
2678 .llseek = generic_file_llseek, 2680 .llseek = generic_file_llseek,
2679 .read = generic_read_dir, 2681 .read = generic_read_dir,
2680 .readdir = ocfs2_readdir, 2682 .readdir = ocfs2_readdir,
2681 .fsync = ocfs2_sync_file, 2683 .fsync = ocfs2_sync_file,
2682 .release = ocfs2_dir_release, 2684 .release = ocfs2_dir_release,
2683 .open = ocfs2_dir_open, 2685 .open = ocfs2_dir_open,
2684 .unlocked_ioctl = ocfs2_ioctl, 2686 .unlocked_ioctl = ocfs2_ioctl,
2685 #ifdef CONFIG_COMPAT 2687 #ifdef CONFIG_COMPAT
2686 .compat_ioctl = ocfs2_compat_ioctl, 2688 .compat_ioctl = ocfs2_compat_ioctl,
2687 #endif 2689 #endif
2688 .flock = ocfs2_flock, 2690 .flock = ocfs2_flock,
2689 }; 2691 };
2690 2692
1 /* 1 /*
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README 2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */ 3 */
4 4
5 #include <linux/time.h> 5 #include <linux/time.h>
6 #include <linux/fs.h> 6 #include <linux/fs.h>
7 #include <linux/reiserfs_fs.h> 7 #include <linux/reiserfs_fs.h>
8 #include <linux/reiserfs_acl.h> 8 #include <linux/reiserfs_acl.h>
9 #include <linux/reiserfs_xattr.h> 9 #include <linux/reiserfs_xattr.h>
10 #include <linux/exportfs.h> 10 #include <linux/exportfs.h>
11 #include <linux/pagemap.h> 11 #include <linux/pagemap.h>
12 #include <linux/highmem.h> 12 #include <linux/highmem.h>
13 #include <linux/slab.h> 13 #include <linux/slab.h>
14 #include <asm/uaccess.h> 14 #include <asm/uaccess.h>
15 #include <asm/unaligned.h> 15 #include <asm/unaligned.h>
16 #include <linux/buffer_head.h> 16 #include <linux/buffer_head.h>
17 #include <linux/mpage.h> 17 #include <linux/mpage.h>
18 #include <linux/writeback.h> 18 #include <linux/writeback.h>
19 #include <linux/quotaops.h> 19 #include <linux/quotaops.h>
20 #include <linux/swap.h> 20 #include <linux/swap.h>
21 21
22 int reiserfs_commit_write(struct file *f, struct page *page, 22 int reiserfs_commit_write(struct file *f, struct page *page,
23 unsigned from, unsigned to); 23 unsigned from, unsigned to);
24 24
25 void reiserfs_evict_inode(struct inode *inode) 25 void reiserfs_evict_inode(struct inode *inode)
26 { 26 {
27 /* We need blocks for transaction + (user+group) quota update (possibly delete) */ 27 /* We need blocks for transaction + (user+group) quota update (possibly delete) */
28 int jbegin_count = 28 int jbegin_count =
29 JOURNAL_PER_BALANCE_CNT * 2 + 29 JOURNAL_PER_BALANCE_CNT * 2 +
30 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb); 30 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
31 struct reiserfs_transaction_handle th; 31 struct reiserfs_transaction_handle th;
32 int depth; 32 int depth;
33 int err; 33 int err;
34 34
35 if (!inode->i_nlink && !is_bad_inode(inode)) 35 if (!inode->i_nlink && !is_bad_inode(inode))
36 dquot_initialize(inode); 36 dquot_initialize(inode);
37 37
38 truncate_inode_pages(&inode->i_data, 0); 38 truncate_inode_pages(&inode->i_data, 0);
39 if (inode->i_nlink) 39 if (inode->i_nlink)
40 goto no_delete; 40 goto no_delete;
41 41
42 depth = reiserfs_write_lock_once(inode->i_sb); 42 depth = reiserfs_write_lock_once(inode->i_sb);
43 43
44 /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ 44 /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
45 if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ 45 if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */
46 reiserfs_delete_xattrs(inode); 46 reiserfs_delete_xattrs(inode);
47 47
48 if (journal_begin(&th, inode->i_sb, jbegin_count)) 48 if (journal_begin(&th, inode->i_sb, jbegin_count))
49 goto out; 49 goto out;
50 reiserfs_update_inode_transaction(inode); 50 reiserfs_update_inode_transaction(inode);
51 51
52 reiserfs_discard_prealloc(&th, inode); 52 reiserfs_discard_prealloc(&th, inode);
53 53
54 err = reiserfs_delete_object(&th, inode); 54 err = reiserfs_delete_object(&th, inode);
55 55
56 /* Do quota update inside a transaction for journaled quotas. We must do that 56 /* Do quota update inside a transaction for journaled quotas. We must do that
57 * after delete_object so that quota updates go into the same transaction as 57 * after delete_object so that quota updates go into the same transaction as
58 * stat data deletion */ 58 * stat data deletion */
59 if (!err) 59 if (!err)
60 dquot_free_inode(inode); 60 dquot_free_inode(inode);
61 61
62 if (journal_end(&th, inode->i_sb, jbegin_count)) 62 if (journal_end(&th, inode->i_sb, jbegin_count))
63 goto out; 63 goto out;
64 64
65 /* check return value from reiserfs_delete_object after 65 /* check return value from reiserfs_delete_object after
66 * ending the transaction 66 * ending the transaction
67 */ 67 */
68 if (err) 68 if (err)
69 goto out; 69 goto out;
70 70
71 /* all items of file are deleted, so we can remove "save" link */ 71 /* all items of file are deleted, so we can remove "save" link */
72 remove_save_link(inode, 0 /* not truncate */ ); /* we can't do anything 72 remove_save_link(inode, 0 /* not truncate */ ); /* we can't do anything
73 * about an error here */ 73 * about an error here */
74 } else { 74 } else {
75 /* no object items are in the tree */ 75 /* no object items are in the tree */
76 ; 76 ;
77 } 77 }
78 out: 78 out:
79 end_writeback(inode); /* note this must go after the journal_end to prevent deadlock */ 79 end_writeback(inode); /* note this must go after the journal_end to prevent deadlock */
80 dquot_drop(inode); 80 dquot_drop(inode);
81 inode->i_blocks = 0; 81 inode->i_blocks = 0;
82 reiserfs_write_unlock_once(inode->i_sb, depth); 82 reiserfs_write_unlock_once(inode->i_sb, depth);
83 return; 83 return;
84 84
85 no_delete: 85 no_delete:
86 end_writeback(inode); 86 end_writeback(inode);
87 dquot_drop(inode); 87 dquot_drop(inode);
88 } 88 }
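
Note the ordering in the delete path above: dquot_free_inode() runs inside the same transaction as reiserfs_delete_object() so journaled quota updates stay atomic with the stat data deletion, and the delete error is acted on only after journal_end() so the transaction is always closed. A hedged sketch of that control-flow shape; every function here is a stand-in:

/* Hedged sketch of the transaction bracketing in
 * reiserfs_evict_inode(): always close the transaction, act on the
 * delete error only afterwards. */
#include <stdio.h>

static int journal_begin(void) { return 0; }
static int journal_end(void)   { return 0; }
static int delete_object(void) { return 0; }
static void free_quota(void)   { puts("quota freed in-transaction"); }

static void evict(void)
{
	int err;

	if (journal_begin())
		return;

	err = delete_object();
	if (!err)
		free_quota();	/* same transaction as the delete */

	if (journal_end())
		return;

	if (err)		/* checked only once the journal is closed */
		return;

	puts("object gone; safe to remove the save link");
}

int main(void)
{
	evict();
	return 0;
}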
89 89
90 static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid, 90 static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
91 __u32 objectid, loff_t offset, int type, int length) 91 __u32 objectid, loff_t offset, int type, int length)
92 { 92 {
93 key->version = version; 93 key->version = version;
94 94
95 key->on_disk_key.k_dir_id = dirid; 95 key->on_disk_key.k_dir_id = dirid;
96 key->on_disk_key.k_objectid = objectid; 96 key->on_disk_key.k_objectid = objectid;
97 set_cpu_key_k_offset(key, offset); 97 set_cpu_key_k_offset(key, offset);
98 set_cpu_key_k_type(key, type); 98 set_cpu_key_k_type(key, type);
99 key->key_length = length; 99 key->key_length = length;
100 } 100 }
101 101
102 /* take the base of inode_key (dirid, objectid; it always comes from the inode) and the version from an inode, then set 102 /* take the base of inode_key (dirid, objectid; it always comes from the inode) and the version from an inode, then set
103 the offset and type of the key */ 103 the offset and type of the key */
104 void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset, 104 void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset,
105 int type, int length) 105 int type, int length)
106 { 106 {
107 _make_cpu_key(key, get_inode_item_key_version(inode), 107 _make_cpu_key(key, get_inode_item_key_version(inode),
108 le32_to_cpu(INODE_PKEY(inode)->k_dir_id), 108 le32_to_cpu(INODE_PKEY(inode)->k_dir_id),
109 le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type, 109 le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type,
110 length); 110 length);
111 } 111 }
112 112
113 // 113 //
114 // when key is 0, do not set version and short key 114 // when key is 0, do not set version and short key
115 // 115 //
116 inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key, 116 inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
117 int version, 117 int version,
118 loff_t offset, int type, int length, 118 loff_t offset, int type, int length,
119 int entry_count /*or ih_free_space */ ) 119 int entry_count /*or ih_free_space */ )
120 { 120 {
121 if (key) { 121 if (key) {
122 ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id); 122 ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id);
123 ih->ih_key.k_objectid = 123 ih->ih_key.k_objectid =
124 cpu_to_le32(key->on_disk_key.k_objectid); 124 cpu_to_le32(key->on_disk_key.k_objectid);
125 } 125 }
126 put_ih_version(ih, version); 126 put_ih_version(ih, version);
127 set_le_ih_k_offset(ih, offset); 127 set_le_ih_k_offset(ih, offset);
128 set_le_ih_k_type(ih, type); 128 set_le_ih_k_type(ih, type);
129 put_ih_item_len(ih, length); 129 put_ih_item_len(ih, length);
130 /* set_ih_free_space (ih, 0); */ 130 /* set_ih_free_space (ih, 0); */
131 // for directory items it is entry count, for directs and stat 131 // for directory items it is entry count, for directs and stat
132 // datas - 0xffff, for indirects - 0 132 // datas - 0xffff, for indirects - 0
133 put_ih_entry_count(ih, entry_count); 133 put_ih_entry_count(ih, entry_count);
134 } 134 }
135 135
136 // 136 //
137 // FIXME: we might cache a recently accessed indirect item 137 // FIXME: we might cache a recently accessed indirect item
138 138
139 // Ugh. Not too eager for that.... 139 // Ugh. Not too eager for that....
140 // I cut the code until such time as I see a convincing argument (benchmark). 140 // I cut the code until such time as I see a convincing argument (benchmark).
141 // I don't want a bloated inode struct..., and I don't like code complexity.... 141 // I don't want a bloated inode struct..., and I don't like code complexity....
142 142
143 /* cutting the code is fine, since it really isn't in use yet and is easy 143 /* cutting the code is fine, since it really isn't in use yet and is easy
144 ** to add back in. But, Vladimir has a really good idea here. Think 144 ** to add back in. But, Vladimir has a really good idea here. Think
145 ** about what happens for reading a file. For each page, 145 ** about what happens for reading a file. For each page,
146 ** The VFS layer calls reiserfs_readpage, who searches the tree to find 146 ** The VFS layer calls reiserfs_readpage, who searches the tree to find
147 ** an indirect item. This indirect item has X number of pointers, where 147 ** an indirect item. This indirect item has X number of pointers, where
148 ** X is a big number if we've done the block allocation right. But, 148 ** X is a big number if we've done the block allocation right. But,
149 ** we only use one or two of these pointers during each call to readpage, 149 ** we only use one or two of these pointers during each call to readpage,
150 ** needlessly searching the tree again later on. 150 ** needlessly searching the tree again later on.
151 ** 151 **
152 ** The size of the cache could be dynamic based on the size of the file. 152 ** The size of the cache could be dynamic based on the size of the file.
153 ** 153 **
154 ** I'd also like to see us cache the location of the stat data item, since 154 ** I'd also like to see us cache the location of the stat data item, since
155 ** we are needlessly searching for that frequently. 155 ** we are needlessly searching for that frequently.
156 ** 156 **
157 ** --chris 157 ** --chris
158 */ 158 */
159 159
160 /* If this page has a file tail in it, and 160 /* If this page has a file tail in it, and
161 ** it was read in by get_block_create_0, the page data is valid, 161 ** it was read in by get_block_create_0, the page data is valid,
162 ** but tail is still sitting in a direct item, and we can't write to 162 ** but tail is still sitting in a direct item, and we can't write to
163 ** it. So, look through this page, and check all the mapped buffers 163 ** it. So, look through this page, and check all the mapped buffers
164 ** to make sure they have valid block numbers. Any that don't are 164 ** to make sure they have valid block numbers. Any that don't are
165 ** unmapped, so that __block_write_begin will correctly call 165 ** unmapped, so that __block_write_begin will correctly call
166 ** reiserfs_get_block to convert the tail into an unformatted node 166 ** reiserfs_get_block to convert the tail into an unformatted node
167 */ 167 */
168 static inline void fix_tail_page_for_writing(struct page *page) 168 static inline void fix_tail_page_for_writing(struct page *page)
169 { 169 {
170 struct buffer_head *head, *next, *bh; 170 struct buffer_head *head, *next, *bh;
171 171
172 if (page && page_has_buffers(page)) { 172 if (page && page_has_buffers(page)) {
173 head = page_buffers(page); 173 head = page_buffers(page);
174 bh = head; 174 bh = head;
175 do { 175 do {
176 next = bh->b_this_page; 176 next = bh->b_this_page;
177 if (buffer_mapped(bh) && bh->b_blocknr == 0) { 177 if (buffer_mapped(bh) && bh->b_blocknr == 0) {
178 reiserfs_unmap_buffer(bh); 178 reiserfs_unmap_buffer(bh);
179 } 179 }
180 bh = next; 180 bh = next;
181 } while (bh != head); 181 } while (bh != head);
182 } 182 }
183 } 183 }
184 184
185 /* reiserfs_get_block does not need to allocate a block if one has already 185 /* reiserfs_get_block does not need to allocate a block if one has already
186 been allocated or a non-hole position has been found in the indirect item */ 186 been allocated or a non-hole position has been found in the indirect item */
187 static inline int allocation_needed(int retval, b_blocknr_t allocated, 187 static inline int allocation_needed(int retval, b_blocknr_t allocated,
188 struct item_head *ih, 188 struct item_head *ih,
189 __le32 * item, int pos_in_item) 189 __le32 * item, int pos_in_item)
190 { 190 {
191 if (allocated) 191 if (allocated)
192 return 0; 192 return 0;
193 if (retval == POSITION_FOUND && is_indirect_le_ih(ih) && 193 if (retval == POSITION_FOUND && is_indirect_le_ih(ih) &&
194 get_block_num(item, pos_in_item)) 194 get_block_num(item, pos_in_item))
195 return 0; 195 return 0;
196 return 1; 196 return 1;
197 } 197 }
198 198
199 static inline int indirect_item_found(int retval, struct item_head *ih) 199 static inline int indirect_item_found(int retval, struct item_head *ih)
200 { 200 {
201 return (retval == POSITION_FOUND) && is_indirect_le_ih(ih); 201 return (retval == POSITION_FOUND) && is_indirect_le_ih(ih);
202 } 202 }
203 203
204 static inline void set_block_dev_mapped(struct buffer_head *bh, 204 static inline void set_block_dev_mapped(struct buffer_head *bh,
205 b_blocknr_t block, struct inode *inode) 205 b_blocknr_t block, struct inode *inode)
206 { 206 {
207 map_bh(bh, inode->i_sb, block); 207 map_bh(bh, inode->i_sb, block);
208 } 208 }
209 209
210 // 210 //
211 // files which were created with the earlier (3.5) format cannot be 211 // files which were created with the earlier (3.5) format cannot be
212 // longer than 2 GB 212 // longer than 2 GB
213 // 213 //
214 static int file_capable(struct inode *inode, sector_t block) 214 static int file_capable(struct inode *inode, sector_t block)
215 { 215 {
216 if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 || // it is a new file. 216 if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 || // it is a new file.
217 block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old file, but 'block' is within 2 GB 217 block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old file, but 'block' is within 2 GB
218 return 1; 218 return 1;
219 219
220 return 0; 220 return 0;
221 } 221 }
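
The bound in file_capable() keeps old-format byte offsets within 31 bits: an old (KEY_FORMAT_3_5) file may only address logical blocks below 1 << (31 - s_blocksize_bits). A worked sketch of the arithmetic, assuming 4 KiB blocks:

/* Hedged sketch of the file_capable() limit. With 4 KiB blocks
 * (s_blocksize_bits = 12) the 3.5 key format tops out at
 * 1 << (31 - 12) = 524288 blocks, i.e. 2 GiB of file data. */
#include <stdio.h>

int main(void)
{
	int blocksize_bits = 12;	/* 4 KiB blocks, an assumption */
	long max_blocks = 1L << (31 - blocksize_bits);
	long long max_bytes = (long long)max_blocks << blocksize_bits;

	printf("limit: %ld blocks = %lld bytes (2 GiB)\n",
	       max_blocks, max_bytes);
	return 0;
}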
222 222
223 static int restart_transaction(struct reiserfs_transaction_handle *th, 223 static int restart_transaction(struct reiserfs_transaction_handle *th,
224 struct inode *inode, struct treepath *path) 224 struct inode *inode, struct treepath *path)
225 { 225 {
226 struct super_block *s = th->t_super; 226 struct super_block *s = th->t_super;
227 int len = th->t_blocks_allocated; 227 int len = th->t_blocks_allocated;
228 int err; 228 int err;
229 229
230 BUG_ON(!th->t_trans_id); 230 BUG_ON(!th->t_trans_id);
231 BUG_ON(!th->t_refcount); 231 BUG_ON(!th->t_refcount);
232 232
233 pathrelse(path); 233 pathrelse(path);
234 234
235 /* we cannot restart while nested */ 235 /* we cannot restart while nested */
236 if (th->t_refcount > 1) { 236 if (th->t_refcount > 1) {
237 return 0; 237 return 0;
238 } 238 }
239 reiserfs_update_sd(th, inode); 239 reiserfs_update_sd(th, inode);
240 err = journal_end(th, s, len); 240 err = journal_end(th, s, len);
241 if (!err) { 241 if (!err) {
242 err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6); 242 err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6);
243 if (!err) 243 if (!err)
244 reiserfs_update_inode_transaction(inode); 244 reiserfs_update_inode_transaction(inode);
245 } 245 }
246 return err; 246 return err;
247 } 247 }
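
restart_transaction() may end and reopen the journal handle only when it is the outermost holder; with t_refcount > 1 a nested caller still expects the transaction open, so the restart is silently skipped. A hedged sketch of that guard with a stand-in handle type:

/* Hedged sketch of the nested-handle guard in restart_transaction():
 * only the outermost holder may close and reopen the handle. */
#include <stdio.h>

struct handle { int refcount; };

static int restart(struct handle *th)
{
	if (th->refcount > 1)	/* nested: an outer caller owns it */
		return 0;	/* report success, keep it open */

	puts("journal_end + journal_begin");
	return 0;
}

int main(void)
{
	struct handle th = { .refcount = 2 };
	restart(&th);		/* no-op: handle is nested */
	th.refcount = 1;
	restart(&th);		/* actually restarts the transaction */
	return 0;
}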
248 248
249 // it is called by get_block when create == 0. Returns block number 249 // it is called by get_block when create == 0. Returns block number
250 // for 'block'-th logical block of file. When it hits direct item it 250 // for 'block'-th logical block of file. When it hits direct item it
251 // returns 0 (being called from bmap) or reads the direct item into a piece 251 // returns 0 (being called from bmap) or reads the direct item into a piece
252 // of page (bh_result) 252 // of page (bh_result)
253 253
254 // Please improve the english/clarity in the comment above, as it is 254 // Please improve the english/clarity in the comment above, as it is
255 // hard to understand. 255 // hard to understand.
256 256
257 static int _get_block_create_0(struct inode *inode, sector_t block, 257 static int _get_block_create_0(struct inode *inode, sector_t block,
258 struct buffer_head *bh_result, int args) 258 struct buffer_head *bh_result, int args)
259 { 259 {
260 INITIALIZE_PATH(path); 260 INITIALIZE_PATH(path);
261 struct cpu_key key; 261 struct cpu_key key;
262 struct buffer_head *bh; 262 struct buffer_head *bh;
263 struct item_head *ih, tmp_ih; 263 struct item_head *ih, tmp_ih;
264 b_blocknr_t blocknr; 264 b_blocknr_t blocknr;
265 char *p = NULL; 265 char *p = NULL;
266 int chars; 266 int chars;
267 int ret; 267 int ret;
268 int result; 268 int result;
269 int done = 0; 269 int done = 0;
270 unsigned long offset; 270 unsigned long offset;
271 271
272 // prepare the key to look for the 'block'-th block of file 272 // prepare the key to look for the 'block'-th block of file
273 make_cpu_key(&key, inode, 273 make_cpu_key(&key, inode,
274 (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY, 274 (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
275 3); 275 3);
276 276
277 result = search_for_position_by_key(inode->i_sb, &key, &path); 277 result = search_for_position_by_key(inode->i_sb, &key, &path);
278 if (result != POSITION_FOUND) { 278 if (result != POSITION_FOUND) {
279 pathrelse(&path); 279 pathrelse(&path);
280 if (p) 280 if (p)
281 kunmap(bh_result->b_page); 281 kunmap(bh_result->b_page);
282 if (result == IO_ERROR) 282 if (result == IO_ERROR)
283 return -EIO; 283 return -EIO;
284 // We do not return -ENOENT if there is a hole but the page is uptodate, because it means 284 // We do not return -ENOENT if there is a hole but the page is uptodate, because it means
285 // that there is some mmapped data associated with it that is yet to be written to disk. 285 // that there is some mmapped data associated with it that is yet to be written to disk.
286 if ((args & GET_BLOCK_NO_HOLE) 286 if ((args & GET_BLOCK_NO_HOLE)
287 && !PageUptodate(bh_result->b_page)) { 287 && !PageUptodate(bh_result->b_page)) {
288 return -ENOENT; 288 return -ENOENT;
289 } 289 }
290 return 0; 290 return 0;
291 } 291 }
292 // 292 //
293 bh = get_last_bh(&path); 293 bh = get_last_bh(&path);
294 ih = get_ih(&path); 294 ih = get_ih(&path);
295 if (is_indirect_le_ih(ih)) { 295 if (is_indirect_le_ih(ih)) {
296 __le32 *ind_item = (__le32 *) B_I_PITEM(bh, ih); 296 __le32 *ind_item = (__le32 *) B_I_PITEM(bh, ih);
297 297
298 /* FIXME: here we could cache indirect item or part of it in 298 /* FIXME: here we could cache indirect item or part of it in
299 the inode to avoid search_by_key in case of subsequent 299 the inode to avoid search_by_key in case of subsequent
300 access to file */ 300 access to file */
301 blocknr = get_block_num(ind_item, path.pos_in_item); 301 blocknr = get_block_num(ind_item, path.pos_in_item);
302 ret = 0; 302 ret = 0;
303 if (blocknr) { 303 if (blocknr) {
304 map_bh(bh_result, inode->i_sb, blocknr); 304 map_bh(bh_result, inode->i_sb, blocknr);
305 if (path.pos_in_item == 305 if (path.pos_in_item ==
306 ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) { 306 ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) {
307 set_buffer_boundary(bh_result); 307 set_buffer_boundary(bh_result);
308 } 308 }
309 } else 309 } else
310 // We do not return -ENOENT if there is a hole but the page is uptodate, because 310 // We do not return -ENOENT if there is a hole but the page is uptodate, because
311 // that means there is some mmapped data associated with it that has yet to be written to disk. 311 // that means there is some mmapped data associated with it that has yet to be written to disk.
312 if ((args & GET_BLOCK_NO_HOLE) 312 if ((args & GET_BLOCK_NO_HOLE)
313 && !PageUptodate(bh_result->b_page)) { 313 && !PageUptodate(bh_result->b_page)) {
314 ret = -ENOENT; 314 ret = -ENOENT;
315 } 315 }
316 316
317 pathrelse(&path); 317 pathrelse(&path);
318 if (p) 318 if (p)
319 kunmap(bh_result->b_page); 319 kunmap(bh_result->b_page);
320 return ret; 320 return ret;
321 } 321 }
322 // requested data are in direct item(s) 322 // requested data are in direct item(s)
323 if (!(args & GET_BLOCK_READ_DIRECT)) { 323 if (!(args & GET_BLOCK_READ_DIRECT)) {
324 // we are called by bmap. FIXME: we can not map block of file 324 // we are called by bmap. FIXME: we can not map block of file
325 // when it is stored in direct item(s) 325 // when it is stored in direct item(s)
326 pathrelse(&path); 326 pathrelse(&path);
327 if (p) 327 if (p)
328 kunmap(bh_result->b_page); 328 kunmap(bh_result->b_page);
329 return -ENOENT; 329 return -ENOENT;
330 } 330 }
331 331
332 /* if we've got a direct item, and the buffer or page was uptodate, 332 /* if we've got a direct item, and the buffer or page was uptodate,
333 ** we don't want to pull data off disk again. Skip to the 333 ** we don't want to pull data off disk again. Skip to the
334 ** end, where we map the buffer and return 334 ** end, where we map the buffer and return
335 */ 335 */
336 if (buffer_uptodate(bh_result)) { 336 if (buffer_uptodate(bh_result)) {
337 goto finished; 337 goto finished;
338 } else 338 } else
339 /* 339 /*
340 ** grab_tail_page can trigger calls to reiserfs_get_block on up to date 340 ** grab_tail_page can trigger calls to reiserfs_get_block on up to date
341 ** pages without any buffers. If the page is up to date, we don't want 341 ** pages without any buffers. If the page is up to date, we don't want
342 ** to read old data off disk. Set the uptodate bit on the buffer instead 342 ** to read old data off disk. Set the uptodate bit on the buffer instead
343 ** and jump to the end 343 ** and jump to the end
344 */ 344 */
345 if (!bh_result->b_page || PageUptodate(bh_result->b_page)) { 345 if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
346 set_buffer_uptodate(bh_result); 346 set_buffer_uptodate(bh_result);
347 goto finished; 347 goto finished;
348 } 348 }
349 // read file tail into part of page 349 // read file tail into part of page
350 offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1); 350 offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
351 copy_item_head(&tmp_ih, ih); 351 copy_item_head(&tmp_ih, ih);
352 352
353 /* we only want to kmap if we are reading the tail into the page. 353 /* we only want to kmap if we are reading the tail into the page.
354 ** this is not the common case, so we don't kmap until we are 354 ** this is not the common case, so we don't kmap until we are
355 ** sure we need to. But, this means the item might move if 355 ** sure we need to. But, this means the item might move if
356 ** kmap schedules 356 ** kmap schedules
357 */ 357 */
358 if (!p) 358 if (!p)
359 p = (char *)kmap(bh_result->b_page); 359 p = (char *)kmap(bh_result->b_page);
360 360
361 p += offset; 361 p += offset;
362 memset(p, 0, inode->i_sb->s_blocksize); 362 memset(p, 0, inode->i_sb->s_blocksize);
363 do { 363 do {
364 if (!is_direct_le_ih(ih)) { 364 if (!is_direct_le_ih(ih)) {
365 BUG(); 365 BUG();
366 } 366 }
367 /* make sure we don't read more bytes than actually exist in 367 /* make sure we don't read more bytes than actually exist in
368 ** the file. This can happen in odd cases where i_size isn't 368 ** the file. This can happen in odd cases where i_size isn't
369 ** correct, and when direct item padding results in a few 369 ** correct, and when direct item padding results in a few
370 ** extra bytes at the end of the direct item 370 ** extra bytes at the end of the direct item
371 */ 371 */
372 if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size) 372 if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
373 break; 373 break;
374 if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) { 374 if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) {
375 chars = 375 chars =
376 inode->i_size - (le_ih_k_offset(ih) - 1) - 376 inode->i_size - (le_ih_k_offset(ih) - 1) -
377 path.pos_in_item; 377 path.pos_in_item;
378 done = 1; 378 done = 1;
379 } else { 379 } else {
380 chars = ih_item_len(ih) - path.pos_in_item; 380 chars = ih_item_len(ih) - path.pos_in_item;
381 } 381 }
382 memcpy(p, B_I_PITEM(bh, ih) + path.pos_in_item, chars); 382 memcpy(p, B_I_PITEM(bh, ih) + path.pos_in_item, chars);
383 383
384 if (done) 384 if (done)
385 break; 385 break;
386 386
387 p += chars; 387 p += chars;
388 388
389 if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1)) 389 if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1))
390 // we are done if the direct item we just read is not the last item in 390 // we are done if the direct item we just read is not the last item in
391 // the node. FIXME: we could check the right delimiting key to see 391 // the node. FIXME: we could check the right delimiting key to see
392 // whether the direct item continues in the right neighbor, or rely 392 // whether the direct item continues in the right neighbor, or rely
393 // on i_size 393 // on i_size
394 break; 394 break;
395 395
396 // update key to look for the next piece 396 // update key to look for the next piece
397 set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars); 397 set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars);
398 result = search_for_position_by_key(inode->i_sb, &key, &path); 398 result = search_for_position_by_key(inode->i_sb, &key, &path);
399 if (result != POSITION_FOUND) 399 if (result != POSITION_FOUND)
400 // i/o error most likely 400 // i/o error most likely
401 break; 401 break;
402 bh = get_last_bh(&path); 402 bh = get_last_bh(&path);
403 ih = get_ih(&path); 403 ih = get_ih(&path);
404 } while (1); 404 } while (1);
405 405
406 flush_dcache_page(bh_result->b_page); 406 flush_dcache_page(bh_result->b_page);
407 kunmap(bh_result->b_page); 407 kunmap(bh_result->b_page);
408 408
409 finished: 409 finished:
410 pathrelse(&path); 410 pathrelse(&path);
411 411
412 if (result == IO_ERROR) 412 if (result == IO_ERROR)
413 return -EIO; 413 return -EIO;
414 414
415 /* this buffer has valid data, but isn't valid for io. mapping it to 415 /* this buffer has valid data, but isn't valid for io. mapping it to
416 * block #0 tells the rest of reiserfs it just has a tail in it 416 * block #0 tells the rest of reiserfs it just has a tail in it
417 */ 417 */
418 map_bh(bh_result, inode->i_sb, 0); 418 map_bh(bh_result, inode->i_sb, 0);
419 set_buffer_uptodate(bh_result); 419 set_buffer_uptodate(bh_result);
420 return 0; 420 return 0;
421 } 421 }
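
One detail worth calling out: reiserfs item keys address file bytes starting at 1, which is why the key above is built from block * blocksize + 1. A small illustrative helper (not in the source) for that mapping:

	/* illustration only: 1-based key offset of the 'block'-th logical block;
	 * block 3 with a 4096-byte block size maps to key offset 12289 */
	static inline loff_t block_to_key_offset(sector_t block, unsigned int blocksize)
	{
		return (loff_t)block * blocksize + 1;
	}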
422 422
423 // this is called to create file map. So, _get_block_create_0 will not 423 // this is called to create file map. So, _get_block_create_0 will not
424 // read direct item 424 // read direct item
425 static int reiserfs_bmap(struct inode *inode, sector_t block, 425 static int reiserfs_bmap(struct inode *inode, sector_t block,
426 struct buffer_head *bh_result, int create) 426 struct buffer_head *bh_result, int create)
427 { 427 {
428 if (!file_capable(inode, block)) 428 if (!file_capable(inode, block))
429 return -EFBIG; 429 return -EFBIG;
430 430
431 reiserfs_write_lock(inode->i_sb); 431 reiserfs_write_lock(inode->i_sb);
432 /* do not read the direct item */ 432 /* do not read the direct item */
433 _get_block_create_0(inode, block, bh_result, 0); 433 _get_block_create_0(inode, block, bh_result, 0);
434 reiserfs_write_unlock(inode->i_sb); 434 reiserfs_write_unlock(inode->i_sb);
435 return 0; 435 return 0;
436 } 436 }
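
reiserfs_bmap() always returns 0 here: for data stored in direct items, _get_block_create_0() leaves bh_result unmapped, and the generic bmap code reports an unmapped buffer to userspace as block 0. A plausible address_space wrapper, assuming the kernel's generic_block_bmap() helper (a sketch, not necessarily this file's exact aop):

	static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block)
	{
		return generic_block_bmap(as, block, reiserfs_bmap);
	}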
437 437
438 /* special version of get_block that is only used by grab_tail_page right 438 /* special version of get_block that is only used by grab_tail_page right
439 ** now. It is sent to __block_write_begin, and when you try to get a 439 ** now. It is sent to __block_write_begin, and when you try to get a
440 ** block past the end of the file (or a block from a hole) it returns 440 ** block past the end of the file (or a block from a hole) it returns
441 ** -ENOENT instead of a valid buffer. __block_write_begin expects to 441 ** -ENOENT instead of a valid buffer. __block_write_begin expects to
442 ** be able to do i/o on the buffers returned, unless an error value 442 ** be able to do i/o on the buffers returned, unless an error value
443 ** is also returned. 443 ** is also returned.
444 ** 444 **
445 ** So, this allows __block_write_begin to be used for reading a single block 445 ** So, this allows __block_write_begin to be used for reading a single block
446 ** in a page, without producing valid buffers for holes or for blocks past 446 ** in a page, without producing valid buffers for holes or for blocks past
447 ** the end of the file. This turns out to be exactly what we need for reading 447 ** the end of the file. This turns out to be exactly what we need for reading
448 ** tails for conversion. 448 ** tails for conversion.
449 ** 449 **
450 ** The point of the wrapper is forcing a certain value for create, even 450 ** The point of the wrapper is forcing a certain value for create, even
451 ** though the VFS layer is calling this function with create==1. If you 451 ** though the VFS layer is calling this function with create==1. If you
452 ** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block, 452 ** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
453 ** don't use this function. 453 ** don't use this function.
454 */ 454 */
455 static int reiserfs_get_block_create_0(struct inode *inode, sector_t block, 455 static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
456 struct buffer_head *bh_result, 456 struct buffer_head *bh_result,
457 int create) 457 int create)
458 { 458 {
459 return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE); 459 return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE);
460 } 460 }
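
The create/args word threaded through these functions is a bit mask. For reference, a summary of the flags used in this file (values as defined in reiserfs_fs.h of this kernel generation; treat the exact numbers as an assumption):

	/* GET_BLOCK_CREATE       1   allocate whatever is needed to map the block
	 * GET_BLOCK_NO_HOLE      2   return -ENOENT for holes instead of mapping them
	 * GET_BLOCK_READ_DIRECT  4   read tail data from direct items into the page
	 * GET_BLOCK_NO_IMUX      8   i_mutex is not held, skip preallocation
	 * GET_BLOCK_NO_DANGLE   16   do not leave a transaction running on return
	 */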
461 461
462 /* This is special helper for reiserfs_get_block in case we are executing 462 /* This is special helper for reiserfs_get_block in case we are executing
463 direct_IO request. */ 463 direct_IO request. */
464 static int reiserfs_get_blocks_direct_io(struct inode *inode, 464 static int reiserfs_get_blocks_direct_io(struct inode *inode,
465 sector_t iblock, 465 sector_t iblock,
466 struct buffer_head *bh_result, 466 struct buffer_head *bh_result,
467 int create) 467 int create)
468 { 468 {
469 int ret; 469 int ret;
470 470
471 bh_result->b_page = NULL; 471 bh_result->b_page = NULL;
472 472
473 /* We set the b_size before reiserfs_get_block call since it is 473 /* We set the b_size before reiserfs_get_block call since it is
474 referenced in convert_tail_for_hole() that may be called from 474 referenced in convert_tail_for_hole() that may be called from
475 reiserfs_get_block() */ 475 reiserfs_get_block() */
476 bh_result->b_size = (1 << inode->i_blkbits); 476 bh_result->b_size = (1 << inode->i_blkbits);
477 477
478 ret = reiserfs_get_block(inode, iblock, bh_result, 478 ret = reiserfs_get_block(inode, iblock, bh_result,
479 create | GET_BLOCK_NO_DANGLE); 479 create | GET_BLOCK_NO_DANGLE);
480 if (ret) 480 if (ret)
481 goto out; 481 goto out;
482 482
483 /* don't allow direct io onto tail pages */ 483 /* don't allow direct io onto tail pages */
484 if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { 484 if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
485 /* make sure future calls to the direct io funcs for this offset 485 /* make sure future calls to the direct io funcs for this offset
486 ** in the file fail by unmapping the buffer 486 ** in the file fail by unmapping the buffer
487 */ 487 */
488 clear_buffer_mapped(bh_result); 488 clear_buffer_mapped(bh_result);
489 ret = -EINVAL; 489 ret = -EINVAL;
490 } 490 }
491 /* Possible unpacked tail. Flush the data before pages have 491 /* Possible unpacked tail. Flush the data before pages have
492 disappeared */ 492 disappeared */
493 if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { 493 if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
494 int err; 494 int err;
495 495
496 reiserfs_write_lock(inode->i_sb); 496 reiserfs_write_lock(inode->i_sb);
497 497
498 err = reiserfs_commit_for_inode(inode); 498 err = reiserfs_commit_for_inode(inode);
499 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; 499 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
500 500
501 reiserfs_write_unlock(inode->i_sb); 501 reiserfs_write_unlock(inode->i_sb);
502 502
503 if (err < 0) 503 if (err < 0)
504 ret = err; 504 ret = err;
505 } 505 }
506 out: 506 out:
507 return ret; 507 return ret;
508 } 508 }
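
For orientation, this helper is the get_block callback handed to the generic direct-I/O machinery. A sketch of the consumer, assuming the 3.x-era blockdev_direct_IO() prototype and eliding the cleanup the real caller performs after a short write:

	static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
					  const struct iovec *iov, loff_t offset,
					  unsigned long nr_segs)
	{
		struct inode *inode = iocb->ki_filp->f_mapping->host;

		/* hand block mapping off to the helper above */
		return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
					  reiserfs_get_blocks_direct_io);
	}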
509 509
510 /* 510 /*
511 ** helper function for when reiserfs_get_block is called for a hole 511 ** helper function for when reiserfs_get_block is called for a hole
512 ** but the file tail is still in a direct item 512 ** but the file tail is still in a direct item
513 ** bh_result is the buffer head for the hole 513 ** bh_result is the buffer head for the hole
514 ** tail_offset is the offset of the start of the tail in the file 514 ** tail_offset is the offset of the start of the tail in the file
515 ** 515 **
516 ** This calls __reiserfs_write_begin, which will start a new transaction; 516 ** This calls __reiserfs_write_begin, which will start a new transaction;
517 ** you should not be in a transaction or have any paths held when you 517 ** you should not be in a transaction or have any paths held when you
518 ** call this. 518 ** call this.
519 */ 519 */
520 static int convert_tail_for_hole(struct inode *inode, 520 static int convert_tail_for_hole(struct inode *inode,
521 struct buffer_head *bh_result, 521 struct buffer_head *bh_result,
522 loff_t tail_offset) 522 loff_t tail_offset)
523 { 523 {
524 unsigned long index; 524 unsigned long index;
525 unsigned long tail_end; 525 unsigned long tail_end;
526 unsigned long tail_start; 526 unsigned long tail_start;
527 struct page *tail_page; 527 struct page *tail_page;
528 struct page *hole_page = bh_result->b_page; 528 struct page *hole_page = bh_result->b_page;
529 int retval = 0; 529 int retval = 0;
530 530
531 if ((tail_offset & (bh_result->b_size - 1)) != 1) 531 if ((tail_offset & (bh_result->b_size - 1)) != 1)
532 return -EIO; 532 return -EIO;
533 533
534 /* always try to read until the end of the block */ 534 /* always try to read until the end of the block */
535 tail_start = tail_offset & (PAGE_CACHE_SIZE - 1); 535 tail_start = tail_offset & (PAGE_CACHE_SIZE - 1);
536 tail_end = (tail_start | (bh_result->b_size - 1)) + 1; 536 tail_end = (tail_start | (bh_result->b_size - 1)) + 1;
537 537
538 index = tail_offset >> PAGE_CACHE_SHIFT; 538 index = tail_offset >> PAGE_CACHE_SHIFT;
539 /* hole_page can be NULL in the direct_io case; we know we 539 /* hole_page can be NULL in the direct_io case; we know we
540 cannot get here when writing with O_DIRECT into a 540 cannot get here when writing with O_DIRECT into a
541 tail page */ 541 tail page */
542 if (!hole_page || index != hole_page->index) { 542 if (!hole_page || index != hole_page->index) {
543 tail_page = grab_cache_page(inode->i_mapping, index); 543 tail_page = grab_cache_page(inode->i_mapping, index);
544 retval = -ENOMEM; 544 retval = -ENOMEM;
545 if (!tail_page) { 545 if (!tail_page) {
546 goto out; 546 goto out;
547 } 547 }
548 } else { 548 } else {
549 tail_page = hole_page; 549 tail_page = hole_page;
550 } 550 }
551 551
552 /* we don't have to make sure the conversion did not happen while 552 /* we don't have to make sure the conversion did not happen while
553 ** we were locking the page because anyone that could convert 553 ** we were locking the page because anyone that could convert
554 ** must first take i_mutex. 554 ** must first take i_mutex.
555 ** 555 **
556 ** We must fix the tail page for writing because it might have buffers 556 ** We must fix the tail page for writing because it might have buffers
557 ** that are mapped, but have a block number of 0. This indicates tail 557 ** that are mapped, but have a block number of 0. This indicates tail
558 ** data that has been read directly into the page, and 558 ** data that has been read directly into the page, and
559 ** __block_write_begin won't trigger a get_block in this case. 559 ** __block_write_begin won't trigger a get_block in this case.
560 */ 560 */
561 fix_tail_page_for_writing(tail_page); 561 fix_tail_page_for_writing(tail_page);
562 retval = __reiserfs_write_begin(tail_page, tail_start, 562 retval = __reiserfs_write_begin(tail_page, tail_start,
563 tail_end - tail_start); 563 tail_end - tail_start);
564 if (retval) 564 if (retval)
565 goto unlock; 565 goto unlock;
566 566
567 /* tail conversion might change the data in the page */ 567 /* tail conversion might change the data in the page */
568 flush_dcache_page(tail_page); 568 flush_dcache_page(tail_page);
569 569
570 retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end); 570 retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end);
571 571
572 unlock: 572 unlock:
573 if (tail_page != hole_page) { 573 if (tail_page != hole_page) {
574 unlock_page(tail_page); 574 unlock_page(tail_page);
575 page_cache_release(tail_page); 575 page_cache_release(tail_page);
576 } 576 }
577 out: 577 out:
578 return retval; 578 return retval;
579 } 579 }
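
The tail_start/tail_end arithmetic above is easiest to see with concrete numbers (illustrative values only):

	/* PAGE_CACHE_SIZE = 4096, block size (bh_result->b_size) = 1024,
	 * tail_offset = 5121, a 1-based key offset (i.e. file byte 5120):
	 *
	 *   tail_start = 5121 & 4095       = 1025  offset within the page
	 *   tail_end   = (1025 | 1023) + 1 = 2048  read to the end of the block
	 *   index      = 5121 >> 12        = 1     second page of the file
	 *
	 * and 5121 & 1023 == 1, satisfying the alignment check at the top.
	 */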
580 580
581 static inline int _allocate_block(struct reiserfs_transaction_handle *th, 581 static inline int _allocate_block(struct reiserfs_transaction_handle *th,
582 sector_t block, 582 sector_t block,
583 struct inode *inode, 583 struct inode *inode,
584 b_blocknr_t * allocated_block_nr, 584 b_blocknr_t * allocated_block_nr,
585 struct treepath *path, int flags) 585 struct treepath *path, int flags)
586 { 586 {
587 BUG_ON(!th->t_trans_id); 587 BUG_ON(!th->t_trans_id);
588 588
589 #ifdef REISERFS_PREALLOCATE 589 #ifdef REISERFS_PREALLOCATE
590 if (!(flags & GET_BLOCK_NO_IMUX)) { 590 if (!(flags & GET_BLOCK_NO_IMUX)) {
591 return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr, 591 return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr,
592 path, block); 592 path, block);
593 } 593 }
594 #endif 594 #endif
595 return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path, 595 return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path,
596 block); 596 block);
597 } 597 }
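
In other words, when REISERFS_PREALLOCATE is configured and the caller holds i_mutex (GET_BLOCK_NO_IMUX clear), allocation goes through the preallocating variant; otherwise a single block is allocated. Hypothetical call sites showing the two paths:

	/* illustration (hypothetical callers):
	 *   _allocate_block(th, block, inode, &nr, &path, GET_BLOCK_CREATE)
	 *	-> reiserfs_new_unf_blocknrs2(): may preallocate a cluster
	 *   _allocate_block(th, block, inode, &nr, &path,
	 *		     GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX)
	 *	-> reiserfs_new_unf_blocknrs(): one block, no preallocation
	 */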
598 598
599 int reiserfs_get_block(struct inode *inode, sector_t block, 599 int reiserfs_get_block(struct inode *inode, sector_t block,
600 struct buffer_head *bh_result, int create) 600 struct buffer_head *bh_result, int create)
601 { 601 {
602 int repeat, retval = 0; 602 int repeat, retval = 0;
603 b_blocknr_t allocated_block_nr = 0; // b_blocknr_t is (unsigned) 32 bit int 603 b_blocknr_t allocated_block_nr = 0; // b_blocknr_t is (unsigned) 32 bit int
604 INITIALIZE_PATH(path); 604 INITIALIZE_PATH(path);
605 int pos_in_item; 605 int pos_in_item;
606 struct cpu_key key; 606 struct cpu_key key;
607 struct buffer_head *bh, *unbh = NULL; 607 struct buffer_head *bh, *unbh = NULL;
608 struct item_head *ih, tmp_ih; 608 struct item_head *ih, tmp_ih;
609 __le32 *item; 609 __le32 *item;
610 int done; 610 int done;
611 int fs_gen; 611 int fs_gen;
612 int lock_depth; 612 int lock_depth;
613 struct reiserfs_transaction_handle *th = NULL; 613 struct reiserfs_transaction_handle *th = NULL;
614 /* space reserved in transaction batch: 614 /* space reserved in transaction batch:
615 . 3 balancings in direct->indirect conversion 615 . 3 balancings in direct->indirect conversion
616 . 1 block involved in reiserfs_update_sd() 616 . 1 block involved in reiserfs_update_sd()
617 . quota updates for user and group 617 . quota updates for user and group
618 XXX: in the practically impossible worst case, direct2indirect() 618 XXX: in the practically impossible worst case, direct2indirect()
619 can incur (much) more than 3 balancings. */ 619 can incur (much) more than 3 balancings. */
620 int jbegin_count = 620 int jbegin_count =
621 JOURNAL_PER_BALANCE_CNT * 3 + 1 + 621 JOURNAL_PER_BALANCE_CNT * 3 + 1 +
622 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); 622 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
623 int version; 623 int version;
624 int dangle = 1; 624 int dangle = 1;
625 loff_t new_offset = 625 loff_t new_offset =
626 (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1; 626 (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
627 627
628 lock_depth = reiserfs_write_lock_once(inode->i_sb); 628 lock_depth = reiserfs_write_lock_once(inode->i_sb);
629 version = get_inode_item_key_version(inode); 629 version = get_inode_item_key_version(inode);
630 630
631 if (!file_capable(inode, block)) { 631 if (!file_capable(inode, block)) {
632 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 632 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
633 return -EFBIG; 633 return -EFBIG;
634 } 634 }
635 635
636 /* if !create, we aren't changing the FS, so we don't need to 636 /* if !create, we aren't changing the FS, so we don't need to
637 ** log anything, so we don't need to start a transaction 637 ** log anything, so we don't need to start a transaction
638 */ 638 */
639 if (!(create & GET_BLOCK_CREATE)) { 639 if (!(create & GET_BLOCK_CREATE)) {
640 int ret; 640 int ret;
641 /* find number of block-th logical block of the file */ 641 /* find number of block-th logical block of the file */
642 ret = _get_block_create_0(inode, block, bh_result, 642 ret = _get_block_create_0(inode, block, bh_result,
643 create | GET_BLOCK_READ_DIRECT); 643 create | GET_BLOCK_READ_DIRECT);
644 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 644 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
645 return ret; 645 return ret;
646 } 646 }
647 /* 647 /*
648 * if we're already in a transaction, make sure to close 648 * if we're already in a transaction, make sure to close
649 * any new transactions we start in this func 649 * any new transactions we start in this func
650 */ 650 */
651 if ((create & GET_BLOCK_NO_DANGLE) || 651 if ((create & GET_BLOCK_NO_DANGLE) ||
652 reiserfs_transaction_running(inode->i_sb)) 652 reiserfs_transaction_running(inode->i_sb))
653 dangle = 0; 653 dangle = 0;
654 654
655 /* If the file is small enough that it might have a tail, and tails are 655 /* If the file is small enough that it might have a tail, and tails are
656 ** enabled, we should mark it as possibly needing tail packing on close 656 ** enabled, we should mark it as possibly needing tail packing on close
657 */ 657 */
658 if ((have_large_tails(inode->i_sb) 658 if ((have_large_tails(inode->i_sb)
659 && inode->i_size < i_block_size(inode) * 4) 659 && inode->i_size < i_block_size(inode) * 4)
660 || (have_small_tails(inode->i_sb) 660 || (have_small_tails(inode->i_sb)
661 && inode->i_size < i_block_size(inode))) 661 && inode->i_size < i_block_size(inode)))
662 REISERFS_I(inode)->i_flags |= i_pack_on_close_mask; 662 REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;
663 663
664 /* set the key of the first byte in the 'block'-th block of file */ 664 /* set the key of the first byte in the 'block'-th block of file */
665 make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ ); 665 make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ );
666 if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) { 666 if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
667 start_trans: 667 start_trans:
668 th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count); 668 th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
669 if (!th) { 669 if (!th) {
670 retval = -ENOMEM; 670 retval = -ENOMEM;
671 goto failure; 671 goto failure;
672 } 672 }
673 reiserfs_update_inode_transaction(inode); 673 reiserfs_update_inode_transaction(inode);
674 } 674 }
675 research: 675 research:
676 676
677 retval = search_for_position_by_key(inode->i_sb, &key, &path); 677 retval = search_for_position_by_key(inode->i_sb, &key, &path);
678 if (retval == IO_ERROR) { 678 if (retval == IO_ERROR) {
679 retval = -EIO; 679 retval = -EIO;
680 goto failure; 680 goto failure;
681 } 681 }
682 682
683 bh = get_last_bh(&path); 683 bh = get_last_bh(&path);
684 ih = get_ih(&path); 684 ih = get_ih(&path);
685 item = get_item(&path); 685 item = get_item(&path);
686 pos_in_item = path.pos_in_item; 686 pos_in_item = path.pos_in_item;
687 687
688 fs_gen = get_generation(inode->i_sb); 688 fs_gen = get_generation(inode->i_sb);
689 copy_item_head(&tmp_ih, ih); 689 copy_item_head(&tmp_ih, ih);
690 690
691 if (allocation_needed 691 if (allocation_needed
692 (retval, allocated_block_nr, ih, item, pos_in_item)) { 692 (retval, allocated_block_nr, ih, item, pos_in_item)) {
693 /* we have to allocate block for the unformatted node */ 693 /* we have to allocate block for the unformatted node */
694 if (!th) { 694 if (!th) {
695 pathrelse(&path); 695 pathrelse(&path);
696 goto start_trans; 696 goto start_trans;
697 } 697 }
698 698
699 repeat = 699 repeat =
700 _allocate_block(th, block, inode, &allocated_block_nr, 700 _allocate_block(th, block, inode, &allocated_block_nr,
701 &path, create); 701 &path, create);
702 702
703 if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) { 703 if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
704 /* restart the transaction to give the journal a chance to free 704 /* restart the transaction to give the journal a chance to free
705 ** some blocks. This releases the path, so we have to go back to 705 ** some blocks. This releases the path, so we have to go back to
706 ** research if we succeed on the second try 706 ** research if we succeed on the second try
707 */ 707 */
708 SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1; 708 SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
709 retval = restart_transaction(th, inode, &path); 709 retval = restart_transaction(th, inode, &path);
710 if (retval) 710 if (retval)
711 goto failure; 711 goto failure;
712 repeat = 712 repeat =
713 _allocate_block(th, block, inode, 713 _allocate_block(th, block, inode,
714 &allocated_block_nr, NULL, create); 714 &allocated_block_nr, NULL, create);
715 715
716 if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) { 716 if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
717 goto research; 717 goto research;
718 } 718 }
719 if (repeat == QUOTA_EXCEEDED) 719 if (repeat == QUOTA_EXCEEDED)
720 retval = -EDQUOT; 720 retval = -EDQUOT;
721 else 721 else
722 retval = -ENOSPC; 722 retval = -ENOSPC;
723 goto failure; 723 goto failure;
724 } 724 }
725 725
726 if (fs_changed(fs_gen, inode->i_sb) 726 if (fs_changed(fs_gen, inode->i_sb)
727 && item_moved(&tmp_ih, &path)) { 727 && item_moved(&tmp_ih, &path)) {
728 goto research; 728 goto research;
729 } 729 }
730 } 730 }
731 731
732 if (indirect_item_found(retval, ih)) { 732 if (indirect_item_found(retval, ih)) {
733 b_blocknr_t unfm_ptr; 733 b_blocknr_t unfm_ptr;
734 /* 'block'-th block is in the file already (there is 734 /* 'block'-th block is in the file already (there is
735 corresponding cell in some indirect item). But it may be 735 corresponding cell in some indirect item). But it may be
736 zero unformatted node pointer (hole) */ 736 zero unformatted node pointer (hole) */
737 unfm_ptr = get_block_num(item, pos_in_item); 737 unfm_ptr = get_block_num(item, pos_in_item);
738 if (unfm_ptr == 0) { 738 if (unfm_ptr == 0) {
739 /* use allocated block to plug the hole */ 739 /* use allocated block to plug the hole */
740 reiserfs_prepare_for_journal(inode->i_sb, bh, 1); 740 reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
741 if (fs_changed(fs_gen, inode->i_sb) 741 if (fs_changed(fs_gen, inode->i_sb)
742 && item_moved(&tmp_ih, &path)) { 742 && item_moved(&tmp_ih, &path)) {
743 reiserfs_restore_prepared_buffer(inode->i_sb, 743 reiserfs_restore_prepared_buffer(inode->i_sb,
744 bh); 744 bh);
745 goto research; 745 goto research;
746 } 746 }
747 set_buffer_new(bh_result); 747 set_buffer_new(bh_result);
748 if (buffer_dirty(bh_result) 748 if (buffer_dirty(bh_result)
749 && reiserfs_data_ordered(inode->i_sb)) 749 && reiserfs_data_ordered(inode->i_sb))
750 reiserfs_add_ordered_list(inode, bh_result); 750 reiserfs_add_ordered_list(inode, bh_result);
751 put_block_num(item, pos_in_item, allocated_block_nr); 751 put_block_num(item, pos_in_item, allocated_block_nr);
752 unfm_ptr = allocated_block_nr; 752 unfm_ptr = allocated_block_nr;
753 journal_mark_dirty(th, inode->i_sb, bh); 753 journal_mark_dirty(th, inode->i_sb, bh);
754 reiserfs_update_sd(th, inode); 754 reiserfs_update_sd(th, inode);
755 } 755 }
756 set_block_dev_mapped(bh_result, unfm_ptr, inode); 756 set_block_dev_mapped(bh_result, unfm_ptr, inode);
757 pathrelse(&path); 757 pathrelse(&path);
758 retval = 0; 758 retval = 0;
759 if (!dangle && th) 759 if (!dangle && th)
760 retval = reiserfs_end_persistent_transaction(th); 760 retval = reiserfs_end_persistent_transaction(th);
761 761
762 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 762 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
763 763
764 /* the item was found, so new blocks were not added to the file; 764 /* the item was found, so new blocks were not added to the file;
765 ** there is no need to make sure the inode is updated with this 765 ** there is no need to make sure the inode is updated with this
766 ** transaction 766 ** transaction
767 */ 767 */
768 return retval; 768 return retval;
769 } 769 }
770 770
771 if (!th) { 771 if (!th) {
772 pathrelse(&path); 772 pathrelse(&path);
773 goto start_trans; 773 goto start_trans;
774 } 774 }
775 775
776 /* desired position is not found or is in the direct item. We have 776 /* desired position is not found or is in the direct item. We have
777 to append file with holes up to 'block'-th block converting 777 to append file with holes up to 'block'-th block converting
778 direct items to indirect one if necessary */ 778 direct items to indirect one if necessary */
779 done = 0; 779 done = 0;
780 do { 780 do {
781 if (is_statdata_le_ih(ih)) { 781 if (is_statdata_le_ih(ih)) {
782 __le32 unp = 0; 782 __le32 unp = 0;
783 struct cpu_key tmp_key; 783 struct cpu_key tmp_key;
784 784
785 /* indirect item has to be inserted */ 785 /* indirect item has to be inserted */
786 make_le_item_head(&tmp_ih, &key, version, 1, 786 make_le_item_head(&tmp_ih, &key, version, 1,
787 TYPE_INDIRECT, UNFM_P_SIZE, 787 TYPE_INDIRECT, UNFM_P_SIZE,
788 0 /* free_space */ ); 788 0 /* free_space */ );
789 789
790 if (cpu_key_k_offset(&key) == 1) { 790 if (cpu_key_k_offset(&key) == 1) {
791 /* we are going to add 'block'-th block to the file. Use 791 /* we are going to add 'block'-th block to the file. Use
792 allocated block for that */ 792 allocated block for that */
793 unp = cpu_to_le32(allocated_block_nr); 793 unp = cpu_to_le32(allocated_block_nr);
794 set_block_dev_mapped(bh_result, 794 set_block_dev_mapped(bh_result,
795 allocated_block_nr, inode); 795 allocated_block_nr, inode);
796 set_buffer_new(bh_result); 796 set_buffer_new(bh_result);
797 done = 1; 797 done = 1;
798 } 798 }
799 tmp_key = key; // structure copy 799 tmp_key = key; // structure copy
800 set_cpu_key_k_offset(&tmp_key, 1); 800 set_cpu_key_k_offset(&tmp_key, 1);
801 PATH_LAST_POSITION(&path)++; 801 PATH_LAST_POSITION(&path)++;
802 802
803 retval = 803 retval =
804 reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih, 804 reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih,
805 inode, (char *)&unp); 805 inode, (char *)&unp);
806 if (retval) { 806 if (retval) {
807 reiserfs_free_block(th, inode, 807 reiserfs_free_block(th, inode,
808 allocated_block_nr, 1); 808 allocated_block_nr, 1);
809 goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST 809 goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST
810 } 810 }
811 //mark_tail_converted (inode); 811 //mark_tail_converted (inode);
812 } else if (is_direct_le_ih(ih)) { 812 } else if (is_direct_le_ih(ih)) {
813 /* direct item has to be converted */ 813 /* direct item has to be converted */
814 loff_t tail_offset; 814 loff_t tail_offset;
815 815
816 tail_offset = 816 tail_offset =
817 ((le_ih_k_offset(ih) - 817 ((le_ih_k_offset(ih) -
818 1) & ~(inode->i_sb->s_blocksize - 1)) + 1; 818 1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
819 if (tail_offset == cpu_key_k_offset(&key)) { 819 if (tail_offset == cpu_key_k_offset(&key)) {
820 /* direct item we just found fits into block we have 820 /* direct item we just found fits into block we have
821 to map. Convert it into unformatted node: use 821 to map. Convert it into unformatted node: use
822 bh_result for the conversion */ 822 bh_result for the conversion */
823 set_block_dev_mapped(bh_result, 823 set_block_dev_mapped(bh_result,
824 allocated_block_nr, inode); 824 allocated_block_nr, inode);
825 unbh = bh_result; 825 unbh = bh_result;
826 done = 1; 826 done = 1;
827 } else { 827 } else {
828 /* we have to pad the file tail stored in direct item(s) 828 /* we have to pad the file tail stored in direct item(s)
829 up to block size and convert it to an unformatted 829 up to block size and convert it to an unformatted
830 node. FIXME: this should also go into the page cache */ 830 node. FIXME: this should also go into the page cache */
831 831
832 pathrelse(&path); 832 pathrelse(&path);
833 /* 833 /*
834 * ugly, but we can only end the transaction if 834 * ugly, but we can only end the transaction if
835 * we aren't nested 835 * we aren't nested
836 */ 836 */
837 BUG_ON(!th->t_refcount); 837 BUG_ON(!th->t_refcount);
838 if (th->t_refcount == 1) { 838 if (th->t_refcount == 1) {
839 retval = 839 retval =
840 reiserfs_end_persistent_transaction 840 reiserfs_end_persistent_transaction
841 (th); 841 (th);
842 th = NULL; 842 th = NULL;
843 if (retval) 843 if (retval)
844 goto failure; 844 goto failure;
845 } 845 }
846 846
847 retval = 847 retval =
848 convert_tail_for_hole(inode, bh_result, 848 convert_tail_for_hole(inode, bh_result,
849 tail_offset); 849 tail_offset);
850 if (retval) { 850 if (retval) {
851 if (retval != -ENOSPC) 851 if (retval != -ENOSPC)
852 reiserfs_error(inode->i_sb, 852 reiserfs_error(inode->i_sb,
853 "clm-6004", 853 "clm-6004",
854 "convert tail failed " 854 "convert tail failed "
855 "inode %lu, error %d", 855 "inode %lu, error %d",
856 inode->i_ino, 856 inode->i_ino,
857 retval); 857 retval);
858 if (allocated_block_nr) { 858 if (allocated_block_nr) {
859 /* the bitmap, the super, and the stat data == 3 */ 859 /* the bitmap, the super, and the stat data == 3 */
860 if (!th) 860 if (!th)
861 th = reiserfs_persistent_transaction(inode->i_sb, 3); 861 th = reiserfs_persistent_transaction(inode->i_sb, 3);
862 if (th) 862 if (th)
863 reiserfs_free_block(th, 863 reiserfs_free_block(th,
864 inode, 864 inode,
865 allocated_block_nr, 865 allocated_block_nr,
866 1); 866 1);
867 } 867 }
868 goto failure; 868 goto failure;
869 } 869 }
870 goto research; 870 goto research;
871 } 871 }
872 retval = 872 retval =
873 direct2indirect(th, inode, &path, unbh, 873 direct2indirect(th, inode, &path, unbh,
874 tail_offset); 874 tail_offset);
875 if (retval) { 875 if (retval) {
876 reiserfs_unmap_buffer(unbh); 876 reiserfs_unmap_buffer(unbh);
877 reiserfs_free_block(th, inode, 877 reiserfs_free_block(th, inode,
878 allocated_block_nr, 1); 878 allocated_block_nr, 1);
879 goto failure; 879 goto failure;
880 } 880 }
881 /* it is important the set_buffer_uptodate is done after 881 /* it is important the set_buffer_uptodate is done after
882 ** the direct2indirect. The buffer might contain valid 882 ** the direct2indirect. The buffer might contain valid
883 ** data newer than the data on disk (read by readpage, changed, 883 ** data newer than the data on disk (read by readpage, changed,
884 ** and then sent here by writepage). direct2indirect needs 884 ** and then sent here by writepage). direct2indirect needs
885 ** to know if unbh was already up to date, so it can decide 885 ** to know if unbh was already up to date, so it can decide
886 ** if the data in unbh needs to be replaced with data from 886 ** if the data in unbh needs to be replaced with data from
887 ** the disk 887 ** the disk
888 */ 888 */
889 set_buffer_uptodate(unbh); 889 set_buffer_uptodate(unbh);
890 890
891 /* unbh->b_page == NULL in case of a DIRECT_IO request; this means 891 /* unbh->b_page == NULL in case of a DIRECT_IO request; this means
892 the buffer will disappear shortly, so it should not be added to 892 the buffer will disappear shortly, so it should not be added to
893 the tail list */ 893 the tail list */
894 if (unbh->b_page) { 894 if (unbh->b_page) {
895 /* we've converted the tail, so we must 895 /* we've converted the tail, so we must
896 ** flush unbh before the transaction commits 896 ** flush unbh before the transaction commits
897 */ 897 */
898 reiserfs_add_tail_list(inode, unbh); 898 reiserfs_add_tail_list(inode, unbh);
899 899
900 /* mark it dirty now to prevent commit_write from adding 900 /* mark it dirty now to prevent commit_write from adding
901 ** this buffer to the inode's dirty buffer list 901 ** this buffer to the inode's dirty buffer list
902 */ 902 */
903 /* 903 /*
904 * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty(). 904 * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty().
905 * It's still atomic, but it sets the page dirty too, 905 * It's still atomic, but it sets the page dirty too,
906 * which makes it eligible for writeback at any time by the 906 * which makes it eligible for writeback at any time by the
907 * VM (which was also the case with __mark_buffer_dirty()) 907 * VM (which was also the case with __mark_buffer_dirty())
908 */ 908 */
909 mark_buffer_dirty(unbh); 909 mark_buffer_dirty(unbh);
910 } 910 }
911 } else { 911 } else {
912 /* append the indirect item with holes if needed; when appending 912 /* append the indirect item with holes if needed; when appending
913 the pointer for the 'block'-th block, use the block that is 913 the pointer for the 'block'-th block, use the block that is
914 already allocated */ 914 already allocated */
915 struct cpu_key tmp_key; 915 struct cpu_key tmp_key;
916 unp_t unf_single = 0; // We use this in case we need to allocate only 916 unp_t unf_single = 0; // We use this in case we need to allocate only
917 // one block which is a fastpath 917 // one block which is a fastpath
918 unp_t *un; 918 unp_t *un;
919 __u64 max_to_insert = 919 __u64 max_to_insert =
920 MAX_ITEM_LEN(inode->i_sb->s_blocksize) / 920 MAX_ITEM_LEN(inode->i_sb->s_blocksize) /
921 UNFM_P_SIZE; 921 UNFM_P_SIZE;
922 __u64 blocks_needed; 922 __u64 blocks_needed;
923 923
924 RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE, 924 RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
925 "vs-804: invalid position for append"); 925 "vs-804: invalid position for append");
926 /* indirect item has to be appended, set up key of that position */ 926 /* indirect item has to be appended, set up key of that position */
927 make_cpu_key(&tmp_key, inode, 927 make_cpu_key(&tmp_key, inode,
928 le_key_k_offset(version, 928 le_key_k_offset(version,
929 &(ih->ih_key)) + 929 &(ih->ih_key)) +
930 op_bytes_number(ih, 930 op_bytes_number(ih,
931 inode->i_sb->s_blocksize), 931 inode->i_sb->s_blocksize),
932 //pos_in_item * inode->i_sb->s_blocksize, 932 //pos_in_item * inode->i_sb->s_blocksize,
933 TYPE_INDIRECT, 3); // key type is unimportant 933 TYPE_INDIRECT, 3); // key type is unimportant
934 934
935 RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key), 935 RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key),
936 "green-805: invalid offset"); 936 "green-805: invalid offset");
937 blocks_needed = 937 blocks_needed =
938 1 + 938 1 +
939 ((cpu_key_k_offset(&key) - 939 ((cpu_key_k_offset(&key) -
940 cpu_key_k_offset(&tmp_key)) >> inode->i_sb-> 940 cpu_key_k_offset(&tmp_key)) >> inode->i_sb->
941 s_blocksize_bits); 941 s_blocksize_bits);
942 942
943 if (blocks_needed == 1) { 943 if (blocks_needed == 1) {
944 un = &unf_single; 944 un = &unf_single;
945 } else { 945 } else {
946 un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_NOFS); 946 un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_NOFS);
947 if (!un) { 947 if (!un) {
948 un = &unf_single; 948 un = &unf_single;
949 blocks_needed = 1; 949 blocks_needed = 1;
950 max_to_insert = 0; 950 max_to_insert = 0;
951 } 951 }
952 } 952 }
953 if (blocks_needed <= max_to_insert) { 953 if (blocks_needed <= max_to_insert) {
954 /* we are going to add target block to the file. Use allocated 954 /* we are going to add target block to the file. Use allocated
955 block for that */ 955 block for that */
956 un[blocks_needed - 1] = 956 un[blocks_needed - 1] =
957 cpu_to_le32(allocated_block_nr); 957 cpu_to_le32(allocated_block_nr);
958 set_block_dev_mapped(bh_result, 958 set_block_dev_mapped(bh_result,
959 allocated_block_nr, inode); 959 allocated_block_nr, inode);
960 set_buffer_new(bh_result); 960 set_buffer_new(bh_result);
961 done = 1; 961 done = 1;
962 } else { 962 } else {
963 /* paste hole to the indirect item */ 963 /* paste hole to the indirect item */
964 /* If kzalloc failed, max_to_insert becomes zero and it means we 964 /* If kzalloc failed, max_to_insert becomes zero and it means we
965 only have space for one block */ 965 only have space for one block */
966 blocks_needed = 966 blocks_needed =
967 max_to_insert ? max_to_insert : 1; 967 max_to_insert ? max_to_insert : 1;
968 } 968 }
969 retval = 969 retval =
970 reiserfs_paste_into_item(th, &path, &tmp_key, inode, 970 reiserfs_paste_into_item(th, &path, &tmp_key, inode,
971 (char *)un, 971 (char *)un,
972 UNFM_P_SIZE * 972 UNFM_P_SIZE *
973 blocks_needed); 973 blocks_needed);
974 974
975 if (blocks_needed != 1) 975 if (blocks_needed != 1)
976 kfree(un); 976 kfree(un);
977 977
978 if (retval) { 978 if (retval) {
979 reiserfs_free_block(th, inode, 979 reiserfs_free_block(th, inode,
980 allocated_block_nr, 1); 980 allocated_block_nr, 1);
981 goto failure; 981 goto failure;
982 } 982 }
983 if (!done) { 983 if (!done) {
984 /* We need to record the new file size in case this function is 984 /* We need to record the new file size in case this function is
985 interrupted/aborted later on. We may do this only for 985 interrupted/aborted later on. We may do this only for
986 holes. */ 986 holes. */
987 inode->i_size += 987 inode->i_size +=
988 inode->i_sb->s_blocksize * blocks_needed; 988 inode->i_sb->s_blocksize * blocks_needed;
989 } 989 }
990 } 990 }
991 991
992 if (done == 1) 992 if (done == 1)
993 break; 993 break;
994 994
995 /* this loop could log more blocks than we had originally asked 995 /* this loop could log more blocks than we had originally asked
996 ** for. So, we have to allow the transaction to end if it is 996 ** for. So, we have to allow the transaction to end if it is
997 ** too big or too full. Update the inode so things are 997 ** too big or too full. Update the inode so things are
998 ** consistent if we crash before the function returns 998 ** consistent if we crash before the function returns
999 ** 999 **
1000 ** release the path so that anybody waiting on the path before 1000 ** release the path so that anybody waiting on the path before
1001 ** ending their transaction will be able to continue. 1001 ** ending their transaction will be able to continue.
1002 */ 1002 */
1003 if (journal_transaction_should_end(th, th->t_blocks_allocated)) { 1003 if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
1004 retval = restart_transaction(th, inode, &path); 1004 retval = restart_transaction(th, inode, &path);
1005 if (retval) 1005 if (retval)
1006 goto failure; 1006 goto failure;
1007 } 1007 }
1008 /* 1008 /*
1009 * inserting indirect pointers for a hole can take a 1009 * inserting indirect pointers for a hole can take a
1010 * long time. reschedule if needed and also release the write 1010 * long time. reschedule if needed and also release the write
1011 * lock for others. 1011 * lock for others.
1012 */ 1012 */
1013 if (need_resched()) { 1013 if (need_resched()) {
1014 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 1014 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
1015 schedule(); 1015 schedule();
1016 lock_depth = reiserfs_write_lock_once(inode->i_sb); 1016 lock_depth = reiserfs_write_lock_once(inode->i_sb);
1017 } 1017 }
1018 1018
1019 retval = search_for_position_by_key(inode->i_sb, &key, &path); 1019 retval = search_for_position_by_key(inode->i_sb, &key, &path);
1020 if (retval == IO_ERROR) { 1020 if (retval == IO_ERROR) {
1021 retval = -EIO; 1021 retval = -EIO;
1022 goto failure; 1022 goto failure;
1023 } 1023 }
1024 if (retval == POSITION_FOUND) { 1024 if (retval == POSITION_FOUND) {
1025 reiserfs_warning(inode->i_sb, "vs-825", 1025 reiserfs_warning(inode->i_sb, "vs-825",
1026 "%K should not be found", &key); 1026 "%K should not be found", &key);
1027 retval = -EEXIST; 1027 retval = -EEXIST;
1028 if (allocated_block_nr) 1028 if (allocated_block_nr)
1029 reiserfs_free_block(th, inode, 1029 reiserfs_free_block(th, inode,
1030 allocated_block_nr, 1); 1030 allocated_block_nr, 1);
1031 pathrelse(&path); 1031 pathrelse(&path);
1032 goto failure; 1032 goto failure;
1033 } 1033 }
1034 bh = get_last_bh(&path); 1034 bh = get_last_bh(&path);
1035 ih = get_ih(&path); 1035 ih = get_ih(&path);
1036 item = get_item(&path); 1036 item = get_item(&path);
1037 pos_in_item = path.pos_in_item; 1037 pos_in_item = path.pos_in_item;
1038 } while (1); 1038 } while (1);
1039 1039
1040 retval = 0; 1040 retval = 0;
1041 1041
1042 failure: 1042 failure:
1043 if (th && (!dangle || (retval && !th->t_trans_id))) { 1043 if (th && (!dangle || (retval && !th->t_trans_id))) {
1044 int err; 1044 int err;
1045 if (th->t_trans_id) 1045 if (th->t_trans_id)
1046 reiserfs_update_sd(th, inode); 1046 reiserfs_update_sd(th, inode);
1047 err = reiserfs_end_persistent_transaction(th); 1047 err = reiserfs_end_persistent_transaction(th);
1048 if (err) 1048 if (err)
1049 retval = err; 1049 retval = err;
1050 } 1050 }
1051 1051
1052 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 1052 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
1053 reiserfs_check_path(&path); 1053 reiserfs_check_path(&path);
1054 return retval; 1054 return retval;
1055 } 1055 }
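
Two flag combinations are worth keeping in mind when reading the dangle logic above (hypothetical caller lines; only the flag combinations themselves are taken from this file):

	/* illustration:
	 *   buffered write path - a transaction may be left running
	 *   ("dangling") for the journal to batch with the caller's own:
	 *	reiserfs_get_block(inode, block, bh, GET_BLOCK_CREATE);
	 *
	 *   direct I/O path (see reiserfs_get_blocks_direct_io above) -
	 *   never leave a transaction running when returning to dio code:
	 *	reiserfs_get_block(inode, block, bh,
	 *			   GET_BLOCK_CREATE | GET_BLOCK_NO_DANGLE);
	 */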
1056 1056
1057 static int 1057 static int
1058 reiserfs_readpages(struct file *file, struct address_space *mapping, 1058 reiserfs_readpages(struct file *file, struct address_space *mapping,
1059 struct list_head *pages, unsigned nr_pages) 1059 struct list_head *pages, unsigned nr_pages)
1060 { 1060 {
1061 return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block); 1061 return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
1062 } 1062 }
1063 1063
1064 /* Compute the real number of bytes used by a file. 1064 /* Compute the real number of bytes used by a file.
1065 * The following three functions can go away when we have enough space in the stat item 1065 * The following three functions can go away when we have enough space in the stat item
1066 */ 1066 */
1067 static int real_space_diff(struct inode *inode, int sd_size) 1067 static int real_space_diff(struct inode *inode, int sd_size)
1068 { 1068 {
1069 int bytes; 1069 int bytes;
1070 loff_t blocksize = inode->i_sb->s_blocksize; 1070 loff_t blocksize = inode->i_sb->s_blocksize;
1071 1071
1072 if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) 1072 if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
1073 return sd_size; 1073 return sd_size;
1074 1074
1075 /* The end of the file also sits in a full block with an indirect 1075 /* The end of the file also sits in a full block with an indirect
1076 ** reference, so round up to the next block. 1076 ** reference, so round up to the next block.
1077 ** 1077 **
1078 ** There is just no way to know whether the file's tail is actually 1078 ** There is just no way to know whether the file's tail is actually
1079 ** packed, so we have to assume it isn't. When we pack the 1079 ** packed, so we have to assume it isn't. When we pack the
1080 ** tail, we add 4 bytes to pretend there really is an unformatted 1080 ** tail, we add 4 bytes to pretend there really is an unformatted
1081 ** node pointer 1081 ** node pointer
1082 */ 1082 */
1083 bytes = 1083 bytes =
1084 ((inode->i_size + 1084 ((inode->i_size +
1085 (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + 1085 (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE +
1086 sd_size; 1086 sd_size;
1087 return bytes; 1087 return bytes;
1088 } 1088 }
1089 1089
1090 static inline loff_t to_real_used_space(struct inode *inode, ulong blocks, 1090 static inline loff_t to_real_used_space(struct inode *inode, ulong blocks,
1091 int sd_size) 1091 int sd_size)
1092 { 1092 {
1093 if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { 1093 if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
1094 return inode->i_size + 1094 return inode->i_size +
1095 (loff_t) (real_space_diff(inode, sd_size)); 1095 (loff_t) (real_space_diff(inode, sd_size));
1096 } 1096 }
1097 return ((loff_t) real_space_diff(inode, sd_size)) + 1097 return ((loff_t) real_space_diff(inode, sd_size)) +
1098 (((loff_t) blocks) << 9); 1098 (((loff_t) blocks) << 9);
1099 } 1099 }
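
A worked example of this accounting (illustrative numbers; UNFM_P_SIZE is the 4-byte size of one unformatted-node pointer, sd_size the on-disk stat-data size):

	/* regular file, 4096-byte blocks, i_size = 10000, i_blocks = 24 sectors:
	 *
	 *   real_space_diff()    = ceil(10000 / 4096) * 4 + sd_size
	 *                        = 3 * 4 + sd_size = 12 + sd_size
	 *   to_real_used_space() = (12 + sd_size) + (24 << 9)
	 *                        = pointer/SD overhead + 12288 bytes of data
	 */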
1100 1100
1101 /* Compute number of blocks used by file in ReiserFS counting */ 1101 /* Compute number of blocks used by file in ReiserFS counting */
1102 static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size) 1102 static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
1103 { 1103 {
1104 loff_t bytes = inode_get_bytes(inode); 1104 loff_t bytes = inode_get_bytes(inode);
1105 loff_t real_space = real_space_diff(inode, sd_size); 1105 loff_t real_space = real_space_diff(inode, sd_size);
1106 1106
1107 /* keeps fsck and non-quota versions of reiserfs happy */ 1107 /* keeps fsck and non-quota versions of reiserfs happy */
1108 if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { 1108 if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
1109 bytes += (loff_t) 511; 1109 bytes += (loff_t) 511;
1110 } 1110 }
1111 1111
1112 /* files from before the quota patch might have i_blocks such that 1112 /* files from before the quota patch might have i_blocks such that
1113 ** bytes < real_space. Deal with that here to prevent it from 1113 ** bytes < real_space. Deal with that here to prevent it from
1114 ** going negative. 1114 ** going negative.
1115 */ 1115 */
1116 if (bytes < real_space) 1116 if (bytes < real_space)
1117 return 0; 1117 return 0;
1118 return (bytes - real_space) >> 9; 1118 return (bytes - real_space) >> 9;
1119 } 1119 }
1120 1120
1121 // 1121 //
1122 // BAD: new directories have stat data of the new type while all other 1122 // BAD: new directories have stat data of the new type while all other
1123 // items are of the old type. The version stored in the inode describes 1123 // items are of the old type. The version stored in the inode describes
1124 // the body items, so in update_stat_data we cannot rely on the inode 1124 // the body items, so in update_stat_data we cannot rely on the inode
1125 // and have to check the item version directly 1125 // and have to check the item version directly
1126 // 1126 //
1127 1127
1128 // called by read_locked_inode 1128 // called by read_locked_inode
1129 static void init_inode(struct inode *inode, struct treepath *path) 1129 static void init_inode(struct inode *inode, struct treepath *path)
1130 { 1130 {
1131 struct buffer_head *bh; 1131 struct buffer_head *bh;
1132 struct item_head *ih; 1132 struct item_head *ih;
1133 __u32 rdev; 1133 __u32 rdev;
1134 //int version = ITEM_VERSION_1; 1134 //int version = ITEM_VERSION_1;
1135 1135
1136 bh = PATH_PLAST_BUFFER(path); 1136 bh = PATH_PLAST_BUFFER(path);
1137 ih = PATH_PITEM_HEAD(path); 1137 ih = PATH_PITEM_HEAD(path);
1138 1138
1139 copy_key(INODE_PKEY(inode), &(ih->ih_key)); 1139 copy_key(INODE_PKEY(inode), &(ih->ih_key));
1140 1140
1141 INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list)); 1141 INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
1142 REISERFS_I(inode)->i_flags = 0; 1142 REISERFS_I(inode)->i_flags = 0;
1143 REISERFS_I(inode)->i_prealloc_block = 0; 1143 REISERFS_I(inode)->i_prealloc_block = 0;
1144 REISERFS_I(inode)->i_prealloc_count = 0; 1144 REISERFS_I(inode)->i_prealloc_count = 0;
1145 REISERFS_I(inode)->i_trans_id = 0; 1145 REISERFS_I(inode)->i_trans_id = 0;
1146 REISERFS_I(inode)->i_jl = NULL; 1146 REISERFS_I(inode)->i_jl = NULL;
1147 reiserfs_init_xattr_rwsem(inode); 1147 reiserfs_init_xattr_rwsem(inode);
1148 1148
1149 if (stat_data_v1(ih)) { 1149 if (stat_data_v1(ih)) {
1150 struct stat_data_v1 *sd = 1150 struct stat_data_v1 *sd =
1151 (struct stat_data_v1 *)B_I_PITEM(bh, ih); 1151 (struct stat_data_v1 *)B_I_PITEM(bh, ih);
1152 unsigned long blocks; 1152 unsigned long blocks;
1153 1153
1154 set_inode_item_key_version(inode, KEY_FORMAT_3_5); 1154 set_inode_item_key_version(inode, KEY_FORMAT_3_5);
1155 set_inode_sd_version(inode, STAT_DATA_V1); 1155 set_inode_sd_version(inode, STAT_DATA_V1);
1156 inode->i_mode = sd_v1_mode(sd); 1156 inode->i_mode = sd_v1_mode(sd);
1157 inode->i_nlink = sd_v1_nlink(sd); 1157 inode->i_nlink = sd_v1_nlink(sd);
1158 inode->i_uid = sd_v1_uid(sd); 1158 inode->i_uid = sd_v1_uid(sd);
1159 inode->i_gid = sd_v1_gid(sd); 1159 inode->i_gid = sd_v1_gid(sd);
1160 inode->i_size = sd_v1_size(sd); 1160 inode->i_size = sd_v1_size(sd);
1161 inode->i_atime.tv_sec = sd_v1_atime(sd); 1161 inode->i_atime.tv_sec = sd_v1_atime(sd);
1162 inode->i_mtime.tv_sec = sd_v1_mtime(sd); 1162 inode->i_mtime.tv_sec = sd_v1_mtime(sd);
1163 inode->i_ctime.tv_sec = sd_v1_ctime(sd); 1163 inode->i_ctime.tv_sec = sd_v1_ctime(sd);
1164 inode->i_atime.tv_nsec = 0; 1164 inode->i_atime.tv_nsec = 0;
1165 inode->i_ctime.tv_nsec = 0; 1165 inode->i_ctime.tv_nsec = 0;
1166 inode->i_mtime.tv_nsec = 0; 1166 inode->i_mtime.tv_nsec = 0;
1167 1167
1168 inode->i_blocks = sd_v1_blocks(sd); 1168 inode->i_blocks = sd_v1_blocks(sd);
1169 inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); 1169 inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
1170 blocks = (inode->i_size + 511) >> 9; 1170 blocks = (inode->i_size + 511) >> 9;
1171 blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9); 1171 blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9);
1172 if (inode->i_blocks > blocks) { 1172 if (inode->i_blocks > blocks) {
1173 // there was a bug in <=3.5.23 where i_blocks could take negative 1173 // there was a bug in <=3.5.23 where i_blocks could take negative
1174 // values. Starting from 3.5.17 this value could even be stored in the 1174 // values. Starting from 3.5.17 this value could even be stored in the
1175 // stat data. For such files we set i_blocks based on the file 1175 // stat data. For such files we set i_blocks based on the file
1176 // size. Two notes: this can be wrong for sparse files, and the on-disk 1176 // size. Two notes: this can be wrong for sparse files, and the on-disk
1177 // value is only updated if the file's inode ever changes 1177 // value is only updated if the file's inode ever changes
1178 inode->i_blocks = blocks; 1178 inode->i_blocks = blocks;
1179 } 1179 }
1180 1180
1181 rdev = sd_v1_rdev(sd); 1181 rdev = sd_v1_rdev(sd);
1182 REISERFS_I(inode)->i_first_direct_byte = 1182 REISERFS_I(inode)->i_first_direct_byte =
1183 sd_v1_first_direct_byte(sd); 1183 sd_v1_first_direct_byte(sd);
1184 /* an early bug in the quota code can give us an odd number for the 1184 /* an early bug in the quota code can give us an odd number for the
1185 ** block count. This is incorrect; fix it here. 1185 ** block count. This is incorrect; fix it here.
1186 */ 1186 */
1187 if (inode->i_blocks & 1) { 1187 if (inode->i_blocks & 1) {
1188 inode->i_blocks++; 1188 inode->i_blocks++;
1189 } 1189 }
1190 inode_set_bytes(inode, 1190 inode_set_bytes(inode,
1191 to_real_used_space(inode, inode->i_blocks, 1191 to_real_used_space(inode, inode->i_blocks,
1192 SD_V1_SIZE)); 1192 SD_V1_SIZE));
1193 /* nopack is initially zero for v1 objects. For v2 objects, 1193 /* nopack is initially zero for v1 objects. For v2 objects,
1194 nopack is initialised from sd_attrs */ 1194 nopack is initialised from sd_attrs */
1195 REISERFS_I(inode)->i_flags &= ~i_nopack_mask; 1195 REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
1196 } else { 1196 } else {
1197 // new stat data found, but object may have old items 1197 // new stat data found, but object may have old items
1198 // (directories and symlinks) 1198 // (directories and symlinks)
1199 struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih); 1199 struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih);
1200 1200
1201 inode->i_mode = sd_v2_mode(sd); 1201 inode->i_mode = sd_v2_mode(sd);
1202 inode->i_nlink = sd_v2_nlink(sd); 1202 inode->i_nlink = sd_v2_nlink(sd);
1203 inode->i_uid = sd_v2_uid(sd); 1203 inode->i_uid = sd_v2_uid(sd);
1204 inode->i_size = sd_v2_size(sd); 1204 inode->i_size = sd_v2_size(sd);
1205 inode->i_gid = sd_v2_gid(sd); 1205 inode->i_gid = sd_v2_gid(sd);
1206 inode->i_mtime.tv_sec = sd_v2_mtime(sd); 1206 inode->i_mtime.tv_sec = sd_v2_mtime(sd);
1207 inode->i_atime.tv_sec = sd_v2_atime(sd); 1207 inode->i_atime.tv_sec = sd_v2_atime(sd);
1208 inode->i_ctime.tv_sec = sd_v2_ctime(sd); 1208 inode->i_ctime.tv_sec = sd_v2_ctime(sd);
1209 inode->i_ctime.tv_nsec = 0; 1209 inode->i_ctime.tv_nsec = 0;
1210 inode->i_mtime.tv_nsec = 0; 1210 inode->i_mtime.tv_nsec = 0;
1211 inode->i_atime.tv_nsec = 0; 1211 inode->i_atime.tv_nsec = 0;
1212 inode->i_blocks = sd_v2_blocks(sd); 1212 inode->i_blocks = sd_v2_blocks(sd);
1213 rdev = sd_v2_rdev(sd); 1213 rdev = sd_v2_rdev(sd);
1214 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) 1214 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1215 inode->i_generation = 1215 inode->i_generation =
1216 le32_to_cpu(INODE_PKEY(inode)->k_dir_id); 1216 le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
1217 else 1217 else
1218 inode->i_generation = sd_v2_generation(sd); 1218 inode->i_generation = sd_v2_generation(sd);
1219 1219
1220 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 1220 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
1221 set_inode_item_key_version(inode, KEY_FORMAT_3_5); 1221 set_inode_item_key_version(inode, KEY_FORMAT_3_5);
1222 else 1222 else
1223 set_inode_item_key_version(inode, KEY_FORMAT_3_6); 1223 set_inode_item_key_version(inode, KEY_FORMAT_3_6);
1224 REISERFS_I(inode)->i_first_direct_byte = 0; 1224 REISERFS_I(inode)->i_first_direct_byte = 0;
1225 set_inode_sd_version(inode, STAT_DATA_V2); 1225 set_inode_sd_version(inode, STAT_DATA_V2);
1226 inode_set_bytes(inode, 1226 inode_set_bytes(inode,
1227 to_real_used_space(inode, inode->i_blocks, 1227 to_real_used_space(inode, inode->i_blocks,
1228 SD_V2_SIZE)); 1228 SD_V2_SIZE));
1229 /* read persistent inode attributes from sd and initialise 1229 /* read persistent inode attributes from sd and initialise
1230 generic inode flags from them */ 1230 generic inode flags from them */
1231 REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd); 1231 REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
1232 sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode); 1232 sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
1233 } 1233 }
1234 1234
1235 pathrelse(path); 1235 pathrelse(path);
1236 if (S_ISREG(inode->i_mode)) { 1236 if (S_ISREG(inode->i_mode)) {
1237 inode->i_op = &reiserfs_file_inode_operations; 1237 inode->i_op = &reiserfs_file_inode_operations;
1238 inode->i_fop = &reiserfs_file_operations; 1238 inode->i_fop = &reiserfs_file_operations;
1239 inode->i_mapping->a_ops = &reiserfs_address_space_operations; 1239 inode->i_mapping->a_ops = &reiserfs_address_space_operations;
1240 } else if (S_ISDIR(inode->i_mode)) { 1240 } else if (S_ISDIR(inode->i_mode)) {
1241 inode->i_op = &reiserfs_dir_inode_operations; 1241 inode->i_op = &reiserfs_dir_inode_operations;
1242 inode->i_fop = &reiserfs_dir_operations; 1242 inode->i_fop = &reiserfs_dir_operations;
1243 } else if (S_ISLNK(inode->i_mode)) { 1243 } else if (S_ISLNK(inode->i_mode)) {
1244 inode->i_op = &reiserfs_symlink_inode_operations; 1244 inode->i_op = &reiserfs_symlink_inode_operations;
1245 inode->i_mapping->a_ops = &reiserfs_address_space_operations; 1245 inode->i_mapping->a_ops = &reiserfs_address_space_operations;
1246 } else { 1246 } else {
1247 inode->i_blocks = 0; 1247 inode->i_blocks = 0;
1248 inode->i_op = &reiserfs_special_inode_operations; 1248 inode->i_op = &reiserfs_special_inode_operations;
1249 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); 1249 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
1250 } 1250 }
1251 } 1251 }
1252 1252
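The sanity check in init_inode above caps i_blocks at the largest value a file of that size could legitimately occupy: the size in 512-byte sectors, rounded up to a whole filesystem block. A minimal standalone sketch of that arithmetic (max_sectors is a hypothetical helper, not a reiserfs function):

    #include <stdio.h>

    /* Largest plausible 512-byte sector count for a file of the given
     * size, rounded up to a whole fs block, mirroring the shift and
     * _ROUND_UP in init_inode above. */
    static unsigned long max_sectors(unsigned long long size,
                                     unsigned long blocksize)
    {
            unsigned long sectors = (size + 511) >> 9;  /* 512-byte units */
            unsigned long per_block = blocksize >> 9;   /* sectors/block  */

            return (sectors + per_block - 1) / per_block * per_block;
    }

    int main(void)
    {
            /* a 5000-byte file on a 4 KiB-block fs: 2 blocks = 16 sectors */
            printf("%lu\n", max_sectors(5000, 4096));
            return 0;
    }

Anything the buggy 3.5-era code stored above this bound is treated as garbage and replaced by the bound itself.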
1253 // update new stat data with inode fields 1253 // update new stat data with inode fields
1254 static void inode2sd(void *sd, struct inode *inode, loff_t size) 1254 static void inode2sd(void *sd, struct inode *inode, loff_t size)
1255 { 1255 {
1256 struct stat_data *sd_v2 = (struct stat_data *)sd; 1256 struct stat_data *sd_v2 = (struct stat_data *)sd;
1257 __u16 flags; 1257 __u16 flags;
1258 1258
1259 set_sd_v2_mode(sd_v2, inode->i_mode); 1259 set_sd_v2_mode(sd_v2, inode->i_mode);
1260 set_sd_v2_nlink(sd_v2, inode->i_nlink); 1260 set_sd_v2_nlink(sd_v2, inode->i_nlink);
1261 set_sd_v2_uid(sd_v2, inode->i_uid); 1261 set_sd_v2_uid(sd_v2, inode->i_uid);
1262 set_sd_v2_size(sd_v2, size); 1262 set_sd_v2_size(sd_v2, size);
1263 set_sd_v2_gid(sd_v2, inode->i_gid); 1263 set_sd_v2_gid(sd_v2, inode->i_gid);
1264 set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec); 1264 set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec);
1265 set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec); 1265 set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec);
1266 set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec); 1266 set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec);
1267 set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE)); 1267 set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
1268 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) 1268 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1269 set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev)); 1269 set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
1270 else 1270 else
1271 set_sd_v2_generation(sd_v2, inode->i_generation); 1271 set_sd_v2_generation(sd_v2, inode->i_generation);
1272 flags = REISERFS_I(inode)->i_attrs; 1272 flags = REISERFS_I(inode)->i_attrs;
1273 i_attrs_to_sd_attrs(inode, &flags); 1273 i_attrs_to_sd_attrs(inode, &flags);
1274 set_sd_v2_attrs(sd_v2, flags); 1274 set_sd_v2_attrs(sd_v2, flags);
1275 } 1275 }
1276 1276
1277 // used to copy inode's fields to old stat data 1277 // used to copy inode's fields to old stat data
1278 static void inode2sd_v1(void *sd, struct inode *inode, loff_t size) 1278 static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
1279 { 1279 {
1280 struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd; 1280 struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd;
1281 1281
1282 set_sd_v1_mode(sd_v1, inode->i_mode); 1282 set_sd_v1_mode(sd_v1, inode->i_mode);
1283 set_sd_v1_uid(sd_v1, inode->i_uid); 1283 set_sd_v1_uid(sd_v1, inode->i_uid);
1284 set_sd_v1_gid(sd_v1, inode->i_gid); 1284 set_sd_v1_gid(sd_v1, inode->i_gid);
1285 set_sd_v1_nlink(sd_v1, inode->i_nlink); 1285 set_sd_v1_nlink(sd_v1, inode->i_nlink);
1286 set_sd_v1_size(sd_v1, size); 1286 set_sd_v1_size(sd_v1, size);
1287 set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec); 1287 set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec);
1288 set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec); 1288 set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec);
1289 set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec); 1289 set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec);
1290 1290
1291 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) 1291 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1292 set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev)); 1292 set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
1293 else 1293 else
1294 set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE)); 1294 set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
1295 1295
1296 // Sigh. i_first_direct_byte is back 1296 // Sigh. i_first_direct_byte is back
1297 set_sd_v1_first_direct_byte(sd_v1, 1297 set_sd_v1_first_direct_byte(sd_v1,
1298 REISERFS_I(inode)->i_first_direct_byte); 1298 REISERFS_I(inode)->i_first_direct_byte);
1299 } 1299 }
1300 1300
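One detail worth noting in inode2sd_v1 above: the v1 stat data keeps rdev and the block count in the same on-disk slot (a union in the stat_data_v1 layout), which is why the function sets exactly one of the two. A toy illustration of that overload; the struct here is illustrative, not the real reiserfs on-disk layout:

    #include <stdint.h>
    #include <sys/stat.h>           /* S_ISCHR, S_ISBLK, S_IFBLK */

    /* Illustrative only: a device node carries no meaningful block
     * count of its own, so the two fields can share space. */
    struct sd_v1_tail {
            union {
                    uint32_t rdev;      /* character/block devices  */
                    uint32_t blocks;    /* all other object types   */
            } u;
    };

    static void fill_tail(struct sd_v1_tail *t, mode_t mode,
                          uint32_t rdev, uint32_t blocks)
    {
            if (S_ISCHR(mode) || S_ISBLK(mode))
                    t->u.rdev = rdev;       /* like set_sd_v1_rdev()   */
            else
                    t->u.blocks = blocks;   /* like set_sd_v1_blocks() */
    }

    int main(void)
    {
            struct sd_v1_tail t;

            fill_tail(&t, S_IFBLK, 0x0801, 0);  /* a hypothetical dev_t */
            return 0;
    }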
1301 /* NOTE: you must prepare the buffer head before passing it here, 1301 /* NOTE: you must prepare the buffer head before passing it here,
1302 ** and then log it after the call 1302 ** and then log it after the call
1303 */ 1303 */
1304 static void update_stat_data(struct treepath *path, struct inode *inode, 1304 static void update_stat_data(struct treepath *path, struct inode *inode,
1305 loff_t size) 1305 loff_t size)
1306 { 1306 {
1307 struct buffer_head *bh; 1307 struct buffer_head *bh;
1308 struct item_head *ih; 1308 struct item_head *ih;
1309 1309
1310 bh = PATH_PLAST_BUFFER(path); 1310 bh = PATH_PLAST_BUFFER(path);
1311 ih = PATH_PITEM_HEAD(path); 1311 ih = PATH_PITEM_HEAD(path);
1312 1312
1313 if (!is_statdata_le_ih(ih)) 1313 if (!is_statdata_le_ih(ih))
1314 reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h", 1314 reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h",
1315 INODE_PKEY(inode), ih); 1315 INODE_PKEY(inode), ih);
1316 1316
1317 if (stat_data_v1(ih)) { 1317 if (stat_data_v1(ih)) {
1318 // path points to old stat data 1318 // path points to old stat data
1319 inode2sd_v1(B_I_PITEM(bh, ih), inode, size); 1319 inode2sd_v1(B_I_PITEM(bh, ih), inode, size);
1320 } else { 1320 } else {
1321 inode2sd(B_I_PITEM(bh, ih), inode, size); 1321 inode2sd(B_I_PITEM(bh, ih), inode, size);
1322 } 1322 }
1323 1323
1324 return; 1324 return;
1325 } 1325 }
1326 1326
1327 void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th, 1327 void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
1328 struct inode *inode, loff_t size) 1328 struct inode *inode, loff_t size)
1329 { 1329 {
1330 struct cpu_key key; 1330 struct cpu_key key;
1331 INITIALIZE_PATH(path); 1331 INITIALIZE_PATH(path);
1332 struct buffer_head *bh; 1332 struct buffer_head *bh;
1333 int fs_gen; 1333 int fs_gen;
1334 struct item_head *ih, tmp_ih; 1334 struct item_head *ih, tmp_ih;
1335 int retval; 1335 int retval;
1336 1336
1337 BUG_ON(!th->t_trans_id); 1337 BUG_ON(!th->t_trans_id);
1338 1338
1339 make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3); // key type is unimportant 1339 make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3); // key type is unimportant
1340 1340
1341 for (;;) { 1341 for (;;) {
1342 int pos; 1342 int pos;
1343 /* look for the object's stat data */ 1343 /* look for the object's stat data */
1344 retval = search_item(inode->i_sb, &key, &path); 1344 retval = search_item(inode->i_sb, &key, &path);
1345 if (retval == IO_ERROR) { 1345 if (retval == IO_ERROR) {
1346 reiserfs_error(inode->i_sb, "vs-13050", 1346 reiserfs_error(inode->i_sb, "vs-13050",
1347 "i/o failure occurred trying to " 1347 "i/o failure occurred trying to "
1348 "update %K stat data", &key); 1348 "update %K stat data", &key);
1349 return; 1349 return;
1350 } 1350 }
1351 if (retval == ITEM_NOT_FOUND) { 1351 if (retval == ITEM_NOT_FOUND) {
1352 pos = PATH_LAST_POSITION(&path); 1352 pos = PATH_LAST_POSITION(&path);
1353 pathrelse(&path); 1353 pathrelse(&path);
1354 if (inode->i_nlink == 0) { 1354 if (inode->i_nlink == 0) {
1355 /*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */ 1355 /*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */
1356 return; 1356 return;
1357 } 1357 }
1358 reiserfs_warning(inode->i_sb, "vs-13060", 1358 reiserfs_warning(inode->i_sb, "vs-13060",
1359 "stat data of object %k (nlink == %d) " 1359 "stat data of object %k (nlink == %d) "
1360 "not found (pos %d)", 1360 "not found (pos %d)",
1361 INODE_PKEY(inode), inode->i_nlink, 1361 INODE_PKEY(inode), inode->i_nlink,
1362 pos); 1362 pos);
1363 reiserfs_check_path(&path); 1363 reiserfs_check_path(&path);
1364 return; 1364 return;
1365 } 1365 }
1366 1366
1367 /* sigh, prepare_for_journal might schedule. When it schedules, the 1367 /* sigh, prepare_for_journal might schedule. When it schedules, the
1368 ** FS might change. We have to detect that and loop back to the 1368 ** FS might change. We have to detect that and loop back to the
1369 ** search if the stat data item has moved. 1369 ** search if the stat data item has moved.
1370 */ 1370 */
1371 bh = get_last_bh(&path); 1371 bh = get_last_bh(&path);
1372 ih = get_ih(&path); 1372 ih = get_ih(&path);
1373 copy_item_head(&tmp_ih, ih); 1373 copy_item_head(&tmp_ih, ih);
1374 fs_gen = get_generation(inode->i_sb); 1374 fs_gen = get_generation(inode->i_sb);
1375 reiserfs_prepare_for_journal(inode->i_sb, bh, 1); 1375 reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
1376 if (fs_changed(fs_gen, inode->i_sb) 1376 if (fs_changed(fs_gen, inode->i_sb)
1377 && item_moved(&tmp_ih, &path)) { 1377 && item_moved(&tmp_ih, &path)) {
1378 reiserfs_restore_prepared_buffer(inode->i_sb, bh); 1378 reiserfs_restore_prepared_buffer(inode->i_sb, bh);
1379 continue; /* Stat_data item has been moved after scheduling. */ 1379 continue; /* Stat_data item has been moved after scheduling. */
1380 } 1380 }
1381 break; 1381 break;
1382 } 1382 }
1383 update_stat_data(&path, inode, size); 1383 update_stat_data(&path, inode, size);
1384 journal_mark_dirty(th, th->t_super, bh); 1384 journal_mark_dirty(th, th->t_super, bh);
1385 pathrelse(&path); 1385 pathrelse(&path);
1386 return; 1386 return;
1387 } 1387 }
1388 1388
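reiserfs_update_sd_size above is a compact example of a generation-checked retry loop: sample a generation counter, perform the call that may sleep, and redo the search if the tree was reshaped underneath. A self-contained toy of the same idiom; the tree, lookup(), prepare(), and moved() here are stand-ins, not reiserfs APIs:

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical in-memory "tree": gen is bumped on every reshape. */
    struct tree { unsigned gen; int item; };

    static int  lookup(struct tree *t)          { return t->item; }
    static bool moved(struct tree *t, int item) { return t->item != item; }

    /* Stand-in for a call that may sleep (like
     * reiserfs_prepare_for_journal); pretend it reshapes the tree once. */
    static void prepare(struct tree *t)
    {
            static bool reshaped;

            if (!reshaped) {
                    reshaped = true;
                    t->gen++;
                    t->item++;
            }
    }

    int main(void)
    {
            struct tree t = { .gen = 0, .item = 42 };
            int item;

            for (;;) {
                    unsigned gen;

                    item = lookup(&t);      /* search for the item        */
                    gen = t.gen;            /* sample the generation      */
                    prepare(&t);            /* blocking call; may reshape */
                    if (t.gen != gen && moved(&t, item))
                            continue;       /* it moved: redo the search  */
                    break;                  /* position is still valid    */
            }
            printf("updating item %d\n", item);
            return 0;
    }

In the real function the roles are played by get_generation(), reiserfs_prepare_for_journal(), fs_changed() and item_moved(), with reiserfs_restore_prepared_buffer() as the undo step before retrying.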
1389 /* reiserfs_read_locked_inode is called to read the inode off disk, and it 1389 /* reiserfs_read_locked_inode is called to read the inode off disk, and it
1390 ** does a make_bad_inode when things go wrong. But we need to make 1390 ** does a make_bad_inode when things go wrong. But we need to make
1391 ** sure to clear the key in the private portion of the inode; otherwise a 1391 ** sure to clear the key in the private portion of the inode; otherwise a
1392 ** corresponding iput might try to delete whatever object the inode last 1392 ** corresponding iput might try to delete whatever object the inode last
1393 ** represented. 1393 ** represented.
1394 */ 1394 */
1395 static void reiserfs_make_bad_inode(struct inode *inode) 1395 static void reiserfs_make_bad_inode(struct inode *inode)
1396 { 1396 {
1397 memset(INODE_PKEY(inode), 0, KEY_SIZE); 1397 memset(INODE_PKEY(inode), 0, KEY_SIZE);
1398 make_bad_inode(inode); 1398 make_bad_inode(inode);
1399 } 1399 }
1400 1400
1401 // 1401 //
1402 // initially this function was derived from minix or ext2's analog and 1402 // initially this function was derived from minix or ext2's analog and
1403 // evolved as the prototype did 1403 // evolved as the prototype did
1404 // 1404 //
1405 1405
1406 int reiserfs_init_locked_inode(struct inode *inode, void *p) 1406 int reiserfs_init_locked_inode(struct inode *inode, void *p)
1407 { 1407 {
1408 struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p; 1408 struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p;
1409 inode->i_ino = args->objectid; 1409 inode->i_ino = args->objectid;
1410 INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid); 1410 INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid);
1411 return 0; 1411 return 0;
1412 } 1412 }
1413 1413
1414 /* looks for the stat data item in the tree, and fills in the fields 1414 /* looks for the stat data item in the tree, and fills in the fields
1415 of the in-core inode from it */ 1415 of the in-core inode from it */
1416 void reiserfs_read_locked_inode(struct inode *inode, 1416 void reiserfs_read_locked_inode(struct inode *inode,
1417 struct reiserfs_iget_args *args) 1417 struct reiserfs_iget_args *args)
1418 { 1418 {
1419 INITIALIZE_PATH(path_to_sd); 1419 INITIALIZE_PATH(path_to_sd);
1420 struct cpu_key key; 1420 struct cpu_key key;
1421 unsigned long dirino; 1421 unsigned long dirino;
1422 int retval; 1422 int retval;
1423 1423
1424 dirino = args->dirid; 1424 dirino = args->dirid;
1425 1425
1426 /* set version 1; version 2 could be used too, because the stat data 1426 /* set version 1; version 2 could be used too, because the stat data
1427 key is the same in both versions */ 1427 key is the same in both versions */
1428 key.version = KEY_FORMAT_3_5; 1428 key.version = KEY_FORMAT_3_5;
1429 key.on_disk_key.k_dir_id = dirino; 1429 key.on_disk_key.k_dir_id = dirino;
1430 key.on_disk_key.k_objectid = inode->i_ino; 1430 key.on_disk_key.k_objectid = inode->i_ino;
1431 key.on_disk_key.k_offset = 0; 1431 key.on_disk_key.k_offset = 0;
1432 key.on_disk_key.k_type = 0; 1432 key.on_disk_key.k_type = 0;
1433 1433
1434 /* look for the object's stat data */ 1434 /* look for the object's stat data */
1435 retval = search_item(inode->i_sb, &key, &path_to_sd); 1435 retval = search_item(inode->i_sb, &key, &path_to_sd);
1436 if (retval == IO_ERROR) { 1436 if (retval == IO_ERROR) {
1437 reiserfs_error(inode->i_sb, "vs-13070", 1437 reiserfs_error(inode->i_sb, "vs-13070",
1438 "i/o failure occurred trying to find " 1438 "i/o failure occurred trying to find "
1439 "stat data of %K", &key); 1439 "stat data of %K", &key);
1440 reiserfs_make_bad_inode(inode); 1440 reiserfs_make_bad_inode(inode);
1441 return; 1441 return;
1442 } 1442 }
1443 if (retval != ITEM_FOUND) { 1443 if (retval != ITEM_FOUND) {
1444 /* a stale NFS handle can trigger this without it being an error */ 1444 /* a stale NFS handle can trigger this without it being an error */
1445 pathrelse(&path_to_sd); 1445 pathrelse(&path_to_sd);
1446 reiserfs_make_bad_inode(inode); 1446 reiserfs_make_bad_inode(inode);
1447 inode->i_nlink = 0; 1447 inode->i_nlink = 0;
1448 return; 1448 return;
1449 } 1449 }
1450 1450
1451 init_inode(inode, &path_to_sd); 1451 init_inode(inode, &path_to_sd);
1452 1452
1453 /* It is possible that knfsd is trying to access the inode of a file 1453 /* It is possible that knfsd is trying to access the inode of a file
1454 that is being removed from the disk by some other thread. As we 1454 that is being removed from the disk by some other thread. As we
1455 update the sd on unlink, all that is required is to check for nlink 1455 update the sd on unlink, all that is required is to check for nlink
1456 here. This bug was first found by Sizif when debugging 1456 here. This bug was first found by Sizif when debugging
1457 SquidNG/Butterfly, forgotten, and found again after Philippe 1457 SquidNG/Butterfly, forgotten, and found again after Philippe
1458 Gramoulle <philippe.gramoulle@mmania.com> reproduced it. 1458 Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
1459 1459
1460 A more logical fix would require changes in fs/inode.c:iput() to 1460 A more logical fix would require changes in fs/inode.c:iput() to
1461 remove inode from hash-table _after_ fs cleaned disk stuff up and 1461 remove inode from hash-table _after_ fs cleaned disk stuff up and
1462 in iget() to return NULL if I_FREEING inode is found in 1462 in iget() to return NULL if I_FREEING inode is found in
1463 hash-table. */ 1463 hash-table. */
1464 /* Currently there is one place where it's ok to meet inode with 1464 /* Currently there is one place where it's ok to meet inode with
1465 nlink==0: processing of open-unlinked and half-truncated files 1465 nlink==0: processing of open-unlinked and half-truncated files
1466 during mount (fs/reiserfs/super.c:finish_unfinished()). */ 1466 during mount (fs/reiserfs/super.c:finish_unfinished()). */
1467 if ((inode->i_nlink == 0) && 1467 if ((inode->i_nlink == 0) &&
1468 !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) { 1468 !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) {
1469 reiserfs_warning(inode->i_sb, "vs-13075", 1469 reiserfs_warning(inode->i_sb, "vs-13075",
1470 "dead inode read from disk %K. " 1470 "dead inode read from disk %K. "
1471 "This is likely to be race with knfsd. Ignore", 1471 "This is likely to be race with knfsd. Ignore",
1472 &key); 1472 &key);
1473 reiserfs_make_bad_inode(inode); 1473 reiserfs_make_bad_inode(inode);
1474 } 1474 }
1475 1475
1476 reiserfs_check_path(&path_to_sd); /* init_inode should be releasing */ 1476 reiserfs_check_path(&path_to_sd); /* init_inode should be releasing */
1477 1477
1478 } 1478 }
1479 1479
1480 /** 1480 /**
1481 * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked(). 1481 * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked().
1482 * 1482 *
1483 * @inode: inode from hash table to check 1483 * @inode: inode from hash table to check
1484 * @opaque: "cookie" passed to iget5_locked(). This is &reiserfs_iget_args. 1484 * @opaque: "cookie" passed to iget5_locked(). This is &reiserfs_iget_args.
1485 * 1485 *
1486 * This function is called by iget5_locked() to distinguish reiserfs inodes 1486 * This function is called by iget5_locked() to distinguish reiserfs inodes
1487 * having the same inode numbers. Such inodes can only exist due to some 1487 * having the same inode numbers. Such inodes can only exist due to some
1488 * error condition. One of them should be bad. Inodes with identical 1488 * error condition. One of them should be bad. Inodes with identical
1489 * inode numbers (objectids) are distinguished by parent directory ids. 1489 * inode numbers (objectids) are distinguished by parent directory ids.
1490 * 1490 *
1491 */ 1491 */
1492 int reiserfs_find_actor(struct inode *inode, void *opaque) 1492 int reiserfs_find_actor(struct inode *inode, void *opaque)
1493 { 1493 {
1494 struct reiserfs_iget_args *args; 1494 struct reiserfs_iget_args *args;
1495 1495
1496 args = opaque; 1496 args = opaque;
1497 /* args is already in CPU order */ 1497 /* args is already in CPU order */
1498 return (inode->i_ino == args->objectid) && 1498 return (inode->i_ino == args->objectid) &&
1499 (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid); 1499 (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid);
1500 } 1500 }
1501 1501
1502 struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key) 1502 struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key)
1503 { 1503 {
1504 struct inode *inode; 1504 struct inode *inode;
1505 struct reiserfs_iget_args args; 1505 struct reiserfs_iget_args args;
1506 1506
1507 args.objectid = key->on_disk_key.k_objectid; 1507 args.objectid = key->on_disk_key.k_objectid;
1508 args.dirid = key->on_disk_key.k_dir_id; 1508 args.dirid = key->on_disk_key.k_dir_id;
1509 reiserfs_write_unlock(s); 1509 reiserfs_write_unlock(s);
1510 inode = iget5_locked(s, key->on_disk_key.k_objectid, 1510 inode = iget5_locked(s, key->on_disk_key.k_objectid,
1511 reiserfs_find_actor, reiserfs_init_locked_inode, 1511 reiserfs_find_actor, reiserfs_init_locked_inode,
1512 (void *)(&args)); 1512 (void *)(&args));
1513 reiserfs_write_lock(s); 1513 reiserfs_write_lock(s);
1514 if (!inode) 1514 if (!inode)
1515 return ERR_PTR(-ENOMEM); 1515 return ERR_PTR(-ENOMEM);
1516 1516
1517 if (inode->i_state & I_NEW) { 1517 if (inode->i_state & I_NEW) {
1518 reiserfs_read_locked_inode(inode, &args); 1518 reiserfs_read_locked_inode(inode, &args);
1519 unlock_new_inode(inode); 1519 unlock_new_inode(inode);
1520 } 1520 }
1521 1521
1522 if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) { 1522 if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) {
1523 /* either due to i/o error or a stale NFS handle */ 1523 /* either due to i/o error or a stale NFS handle */
1524 iput(inode); 1524 iput(inode);
1525 inode = NULL; 1525 inode = NULL;
1526 } 1526 }
1527 return inode; 1527 return inode;
1528 } 1528 }
1529 1529
1530 static struct dentry *reiserfs_get_dentry(struct super_block *sb, 1530 static struct dentry *reiserfs_get_dentry(struct super_block *sb,
1531 u32 objectid, u32 dir_id, u32 generation) 1531 u32 objectid, u32 dir_id, u32 generation)
1532 1532
1533 { 1533 {
1534 struct cpu_key key; 1534 struct cpu_key key;
1535 struct inode *inode; 1535 struct inode *inode;
1536 1536
1537 key.on_disk_key.k_objectid = objectid; 1537 key.on_disk_key.k_objectid = objectid;
1538 key.on_disk_key.k_dir_id = dir_id; 1538 key.on_disk_key.k_dir_id = dir_id;
1539 reiserfs_write_lock(sb); 1539 reiserfs_write_lock(sb);
1540 inode = reiserfs_iget(sb, &key); 1540 inode = reiserfs_iget(sb, &key);
1541 if (inode && !IS_ERR(inode) && generation != 0 && 1541 if (inode && !IS_ERR(inode) && generation != 0 &&
1542 generation != inode->i_generation) { 1542 generation != inode->i_generation) {
1543 iput(inode); 1543 iput(inode);
1544 inode = NULL; 1544 inode = NULL;
1545 } 1545 }
1546 reiserfs_write_unlock(sb); 1546 reiserfs_write_unlock(sb);
1547 1547
1548 return d_obtain_alias(inode); 1548 return d_obtain_alias(inode);
1549 } 1549 }
1550 1550
1551 struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid, 1551 struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
1552 int fh_len, int fh_type) 1552 int fh_len, int fh_type)
1553 { 1553 {
1554 /* fhtype happens to reflect the number of u32s encoded. 1554 /* fhtype happens to reflect the number of u32s encoded.
1555 * due to a bug in earlier code, fhtype might indicate there 1555 * due to a bug in earlier code, fhtype might indicate there
1556 * are more u32s than were actually stored. 1556 * are more u32s than were actually stored.
1557 * So if fhtype seems to be more than len, reduce fhtype. 1557 * So if fhtype seems to be more than len, reduce fhtype.
1558 * Valid types are: 1558 * Valid types are:
1559 * 2 - objectid + dir_id - legacy support 1559 * 2 - objectid + dir_id - legacy support
1560 * 3 - objectid + dir_id + generation 1560 * 3 - objectid + dir_id + generation
1561 * 4 - objectid + dir_id + objectid and dirid of parent - legacy 1561 * 4 - objectid + dir_id + objectid and dirid of parent - legacy
1562 * 5 - objectid + dir_id + generation + objectid and dirid of parent 1562 * 5 - objectid + dir_id + generation + objectid and dirid of parent
1563 * 6 - as above plus generation of directory 1563 * 6 - as above plus generation of directory
1564 * 6 does not fit in NFSv2 handles 1564 * 6 does not fit in NFSv2 handles
1565 */ 1565 */
1566 if (fh_type > fh_len) { 1566 if (fh_type > fh_len) {
1567 if (fh_type != 6 || fh_len != 5) 1567 if (fh_type != 6 || fh_len != 5)
1568 reiserfs_warning(sb, "reiserfs-13077", 1568 reiserfs_warning(sb, "reiserfs-13077",
1569 "nfsd/reiserfs, fhtype=%d, len=%d - odd", 1569 "nfsd/reiserfs, fhtype=%d, len=%d - odd",
1570 fh_type, fh_len); 1570 fh_type, fh_len);
1571 fh_type = 5; 1571 fh_type = 5;
1572 } 1572 }
1573 1573
1574 return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1], 1574 return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1],
1575 (fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0); 1575 (fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0);
1576 } 1576 }
1577 1577
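The fh_type table in the comment above fixes where the generation and the parent key live in the raw handle words; reiserfs_fh_to_dentry and reiserfs_fh_to_parent each pick their slots from it. A compact sketch of that layout as a decoder (struct decoded and decode() are illustrative, not kernel structures):

    #include <stdint.h>
    #include <stdio.h>

    struct decoded {
            uint32_t objectid, dir_id, generation;        /* the object itself */
            uint32_t p_objectid, p_dir_id, p_generation;  /* parent, if any    */
    };

    /* Map the raw u32 words onto fields according to fh_type (2..6),
     * following the table documented above. */
    static void decode(const uint32_t *raw, int fh_type, struct decoded *d)
    {
            int base = (fh_type >= 5) ? 3 : 2;  /* where the parent starts */

            d->objectid   = raw[0];
            d->dir_id     = raw[1];
            d->generation = (fh_type == 3 || fh_type >= 5) ? raw[2] : 0;
            d->p_objectid = d->p_dir_id = d->p_generation = 0;
            if (fh_type >= 4) {                 /* parent is present */
                    d->p_objectid   = raw[base];
                    d->p_dir_id     = raw[base + 1];
                    d->p_generation = (fh_type == 6) ? raw[5] : 0;
            }
    }

    int main(void)
    {
            uint32_t raw[6] = { 10, 20, 30, 40, 50, 60 };
            struct decoded d;

            decode(raw, 6, &d);                 /* type 6: everything present */
            printf("obj %u gen %u parent %u pgen %u\n",
                   d.objectid, d.generation, d.p_objectid, d.p_generation);
            return 0;
    }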
1578 struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid, 1578 struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
1579 int fh_len, int fh_type) 1579 int fh_len, int fh_type)
1580 { 1580 {
1581 if (fh_type < 4) 1581 if (fh_type < 4)
1582 return NULL; 1582 return NULL;
1583 1583
1584 return reiserfs_get_dentry(sb, 1584 return reiserfs_get_dentry(sb,
1585 (fh_type >= 5) ? fid->raw[3] : fid->raw[2], 1585 (fh_type >= 5) ? fid->raw[3] : fid->raw[2],
1586 (fh_type >= 5) ? fid->raw[4] : fid->raw[3], 1586 (fh_type >= 5) ? fid->raw[4] : fid->raw[3],
1587 (fh_type == 6) ? fid->raw[5] : 0); 1587 (fh_type == 6) ? fid->raw[5] : 0);
1588 } 1588 }
1589 1589
1590 int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, 1590 int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
1591 int need_parent) 1591 int need_parent)
1592 { 1592 {
1593 struct inode *inode = dentry->d_inode; 1593 struct inode *inode = dentry->d_inode;
1594 int maxlen = *lenp; 1594 int maxlen = *lenp;
1595 1595
1596 if (need_parent && (maxlen < 5)) { 1596 if (need_parent && (maxlen < 5)) {
1597 *lenp = 5; 1597 *lenp = 5;
1598 return 255; 1598 return 255;
1599 } else if (maxlen < 3) { 1599 } else if (maxlen < 3) {
1600 *lenp = 3; 1600 *lenp = 3;
1601 return 255; 1601 return 255;
1602 } 1602 }
1603 1603
1604 data[0] = inode->i_ino; 1604 data[0] = inode->i_ino;
1605 data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); 1605 data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
1606 data[2] = inode->i_generation; 1606 data[2] = inode->i_generation;
1607 *lenp = 3; 1607 *lenp = 3;
1608 /* no room for directory info? return what we've stored so far */ 1608 /* no room for directory info? return what we've stored so far */
1609 if (maxlen < 5 || !need_parent) 1609 if (maxlen < 5 || !need_parent)
1610 return 3; 1610 return 3;
1611 1611
1612 spin_lock(&dentry->d_lock); 1612 spin_lock(&dentry->d_lock);
1613 inode = dentry->d_parent->d_inode; 1613 inode = dentry->d_parent->d_inode;
1614 data[3] = inode->i_ino; 1614 data[3] = inode->i_ino;
1615 data[4] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); 1615 data[4] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
1616 *lenp = 5; 1616 *lenp = 5;
1617 if (maxlen >= 6) { 1617 if (maxlen >= 6) {
1618 data[5] = inode->i_generation; 1618 data[5] = inode->i_generation;
1619 *lenp = 6; 1619 *lenp = 6;
1620 } 1620 }
1621 spin_unlock(&dentry->d_lock); 1621 spin_unlock(&dentry->d_lock);
1622 return *lenp; 1622 return *lenp;
1623 } 1623 }
1624 1624
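On a kernel with open-by-handle support (name_to_handle_at(2), added in roughly the same era as this code), the output of reiserfs_encode_fh can be observed from userspace. A minimal sketch, assuming Linux and glibc with _GNU_SOURCE; on reiserfs the reported handle type should correspond to the fh_type values enumerated above:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(int argc, char **argv)
    {
            struct file_handle *fh;
            int mount_id;

            if (argc != 2) {
                    fprintf(stderr, "usage: %s <path>\n", argv[0]);
                    return 1;
            }
            fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
            fh->handle_bytes = MAX_HANDLE_SZ;   /* our buffer capacity */
            if (name_to_handle_at(AT_FDCWD, argv[1], fh, &mount_id, 0) == -1) {
                    perror("name_to_handle_at");
                    return 1;
            }
            /* handle_type is fs-specific; the handle body holds the
             * __u32 words filled in by the fs's encode_fh */
            printf("type=%d bytes=%u\n", fh->handle_type, fh->handle_bytes);
            free(fh);
            return 0;
    }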
1625 /* looks for stat data, then copies fields to it, marks the buffer 1625 /* looks for stat data, then copies fields to it, marks the buffer
1626 containing stat data as dirty */ 1626 containing stat data as dirty */
1627 /* reiserfs inodes are never really dirty, since the dirty inode call 1627 /* reiserfs inodes are never really dirty, since the dirty inode call
1628 ** always logs them. This call allows the VFS inode marking routines 1628 ** always logs them. This call allows the VFS inode marking routines
1629 ** to properly mark inodes for datasync and such, but only actually 1629 ** to properly mark inodes for datasync and such, but only actually
1630 ** does something when called for a synchronous update. 1630 ** does something when called for a synchronous update.
1631 */ 1631 */
1632 int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc) 1632 int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1633 { 1633 {
1634 struct reiserfs_transaction_handle th; 1634 struct reiserfs_transaction_handle th;
1635 int jbegin_count = 1; 1635 int jbegin_count = 1;
1636 1636
1637 if (inode->i_sb->s_flags & MS_RDONLY) 1637 if (inode->i_sb->s_flags & MS_RDONLY)
1638 return -EROFS; 1638 return -EROFS;
1639 /* memory pressure can sometimes initiate write_inode calls with sync == 1; 1639 /* memory pressure can sometimes initiate write_inode calls with sync == 1;
1640 ** these cases arise when the system needs RAM, not when the 1640 ** these cases arise when the system needs RAM, not when the
1641 ** inode needs to reach disk for safety, and they can safely be 1641 ** inode needs to reach disk for safety, and they can safely be
1642 ** ignored because the altered inode has already been logged. 1642 ** ignored because the altered inode has already been logged.
1643 */ 1643 */
1644 if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) { 1644 if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) {
1645 reiserfs_write_lock(inode->i_sb); 1645 reiserfs_write_lock(inode->i_sb);
1646 if (!journal_begin(&th, inode->i_sb, jbegin_count)) { 1646 if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
1647 reiserfs_update_sd(&th, inode); 1647 reiserfs_update_sd(&th, inode);
1648 journal_end_sync(&th, inode->i_sb, jbegin_count); 1648 journal_end_sync(&th, inode->i_sb, jbegin_count);
1649 } 1649 }
1650 reiserfs_write_unlock(inode->i_sb); 1650 reiserfs_write_unlock(inode->i_sb);
1651 } 1651 }
1652 return 0; 1652 return 0;
1653 } 1653 }
1654 1654
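The guard at the top of reiserfs_write_inode boils down to a small predicate: do journalled work only for genuinely synchronous writeback, and never from memory reclaim, because the logged inode is already safe. A toy sketch of that decision; the enum and the boolean are stand-ins for wbc->sync_mode and current->flags & PF_MEMALLOC:

    #include <stdbool.h>
    #include <stdio.h>

    enum sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };

    /* Push the inode through the journal only for a real sync request
     * that did not originate from memory reclaim. */
    static bool must_sync_inode(enum sync_mode mode, bool in_reclaim)
    {
            return mode == WB_SYNC_ALL && !in_reclaim;
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   must_sync_inode(WB_SYNC_ALL, false),   /* 1: do the sync  */
                   must_sync_inode(WB_SYNC_ALL, true),    /* 0: reclaim path */
                   must_sync_inode(WB_SYNC_NONE, false)); /* 0: async flush  */
            return 0;
    }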
1655 /* stat data of new object is inserted already, this inserts the item 1655 /* stat data of new object is inserted already, this inserts the item
1656 containing "." and ".." entries */ 1656 containing "." and ".." entries */
1657 static int reiserfs_new_directory(struct reiserfs_transaction_handle *th, 1657 static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
1658 struct inode *inode, 1658 struct inode *inode,
1659 struct item_head *ih, struct treepath *path, 1659 struct item_head *ih, struct treepath *path,
1660 struct inode *dir) 1660 struct inode *dir)
1661 { 1661 {
1662 struct super_block *sb = th->t_super; 1662 struct super_block *sb = th->t_super;
1663 char empty_dir[EMPTY_DIR_SIZE]; 1663 char empty_dir[EMPTY_DIR_SIZE];
1664 char *body = empty_dir; 1664 char *body = empty_dir;
1665 struct cpu_key key; 1665 struct cpu_key key;
1666 int retval; 1666 int retval;
1667 1667
1668 BUG_ON(!th->t_trans_id); 1668 BUG_ON(!th->t_trans_id);
1669 1669
1670 _make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id), 1670 _make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id),
1671 le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET, 1671 le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET,
1672 TYPE_DIRENTRY, 3 /*key length */ ); 1672 TYPE_DIRENTRY, 3 /*key length */ );
1673 1673
1674 /* compose item head for new item. Directories consist of items of 1674 /* compose item head for new item. Directories consist of items of
1675 old type (ITEM_VERSION_1). Do not set the key (second arg is 0); 1675 old type (ITEM_VERSION_1). Do not set the key (second arg is 0);
1676 that is done by reiserfs_new_inode */ 1676 that is done by reiserfs_new_inode */
1677 if (old_format_only(sb)) { 1677 if (old_format_only(sb)) {
1678 make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, 1678 make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
1679 TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2); 1679 TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
1680 1680
1681 make_empty_dir_item_v1(body, ih->ih_key.k_dir_id, 1681 make_empty_dir_item_v1(body, ih->ih_key.k_dir_id,
1682 ih->ih_key.k_objectid, 1682 ih->ih_key.k_objectid,
1683 INODE_PKEY(dir)->k_dir_id, 1683 INODE_PKEY(dir)->k_dir_id,
1684 INODE_PKEY(dir)->k_objectid); 1684 INODE_PKEY(dir)->k_objectid);
1685 } else { 1685 } else {
1686 make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, 1686 make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
1687 TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2); 1687 TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2);
1688 1688
1689 make_empty_dir_item(body, ih->ih_key.k_dir_id, 1689 make_empty_dir_item(body, ih->ih_key.k_dir_id,
1690 ih->ih_key.k_objectid, 1690 ih->ih_key.k_objectid,
1691 INODE_PKEY(dir)->k_dir_id, 1691 INODE_PKEY(dir)->k_dir_id,
1692 INODE_PKEY(dir)->k_objectid); 1692 INODE_PKEY(dir)->k_objectid);
1693 } 1693 }
1694 1694
1695 /* look for place in the tree for new item */ 1695 /* look for place in the tree for new item */
1696 retval = search_item(sb, &key, path); 1696 retval = search_item(sb, &key, path);
1697 if (retval == IO_ERROR) { 1697 if (retval == IO_ERROR) {
1698 reiserfs_error(sb, "vs-13080", 1698 reiserfs_error(sb, "vs-13080",
1699 "i/o failure occurred creating new directory"); 1699 "i/o failure occurred creating new directory");
1700 return -EIO; 1700 return -EIO;
1701 } 1701 }
1702 if (retval == ITEM_FOUND) { 1702 if (retval == ITEM_FOUND) {
1703 pathrelse(path); 1703 pathrelse(path);
1704 reiserfs_warning(sb, "vs-13070", 1704 reiserfs_warning(sb, "vs-13070",
1705 "object with this key exists (%k)", 1705 "object with this key exists (%k)",
1706 &(ih->ih_key)); 1706 &(ih->ih_key));
1707 return -EEXIST; 1707 return -EEXIST;
1708 } 1708 }
1709 1709
1710 /* insert the item, that is, the empty directory item */ 1710 /* insert the item, that is, the empty directory item */
1711 return reiserfs_insert_item(th, path, &key, ih, inode, body); 1711 return reiserfs_insert_item(th, path, &key, ih, inode, body);
1712 } 1712 }
1713 1713
1714 /* stat data of object has been inserted, this inserts the item 1714 /* stat data of object has been inserted, this inserts the item
1715 containing the body of symlink */ 1715 containing the body of symlink */
1716 static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct inode *inode, /* Inode of symlink */ 1716 static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct inode *inode, /* Inode of symlink */
1717 struct item_head *ih, 1717 struct item_head *ih,
1718 struct treepath *path, const char *symname, 1718 struct treepath *path, const char *symname,
1719 int item_len) 1719 int item_len)
1720 { 1720 {
1721 struct super_block *sb = th->t_super; 1721 struct super_block *sb = th->t_super;
1722 struct cpu_key key; 1722 struct cpu_key key;
1723 int retval; 1723 int retval;
1724 1724
1725 BUG_ON(!th->t_trans_id); 1725 BUG_ON(!th->t_trans_id);
1726 1726
1727 _make_cpu_key(&key, KEY_FORMAT_3_5, 1727 _make_cpu_key(&key, KEY_FORMAT_3_5,
1728 le32_to_cpu(ih->ih_key.k_dir_id), 1728 le32_to_cpu(ih->ih_key.k_dir_id),
1729 le32_to_cpu(ih->ih_key.k_objectid), 1729 le32_to_cpu(ih->ih_key.k_objectid),
1730 1, TYPE_DIRECT, 3 /*key length */ ); 1730 1, TYPE_DIRECT, 3 /*key length */ );
1731 1731
1732 make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len, 1732 make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len,
1733 0 /*free_space */ ); 1733 0 /*free_space */ );
1734 1734
1735 /* look for place in the tree for new item */ 1735 /* look for place in the tree for new item */
1736 retval = search_item(sb, &key, path); 1736 retval = search_item(sb, &key, path);
1737 if (retval == IO_ERROR) { 1737 if (retval == IO_ERROR) {
1738 reiserfs_error(sb, "vs-13080", 1738 reiserfs_error(sb, "vs-13080",
1739 "i/o failure occurred creating new symlink"); 1739 "i/o failure occurred creating new symlink");
1740 return -EIO; 1740 return -EIO;
1741 } 1741 }
1742 if (retval == ITEM_FOUND) { 1742 if (retval == ITEM_FOUND) {
1743 pathrelse(path); 1743 pathrelse(path);
1744 reiserfs_warning(sb, "vs-13080", 1744 reiserfs_warning(sb, "vs-13080",
1745 "object with this key exists (%k)", 1745 "object with this key exists (%k)",
1746 &(ih->ih_key)); 1746 &(ih->ih_key));
1747 return -EEXIST; 1747 return -EEXIST;
1748 } 1748 }
1749 1749
1750 /* insert the item, that is, the body of the symlink */ 1750 /* insert the item, that is, the body of the symlink */
1751 return reiserfs_insert_item(th, path, &key, ih, inode, symname); 1751 return reiserfs_insert_item(th, path, &key, ih, inode, symname);
1752 } 1752 }
1753 1753
1754 /* inserts the stat data into the tree, and then calls 1754 /* inserts the stat data into the tree, and then calls
1755 reiserfs_new_directory (to insert ".", ".." item if new object is 1755 reiserfs_new_directory (to insert ".", ".." item if new object is
1756 directory) or reiserfs_new_symlink (to insert symlink body if new 1756 directory) or reiserfs_new_symlink (to insert symlink body if new
1757 object is symlink) or nothing (if new object is regular file) 1757 object is symlink) or nothing (if new object is regular file)
1758 1758
1759 NOTE! uid and gid must already be set in the inode. If we return 1759 NOTE! uid and gid must already be set in the inode. If we return
1760 non-zero due to an error, we have to drop the quota previously allocated 1760 non-zero due to an error, we have to drop the quota previously allocated
1761 for the fresh inode. This can only be done outside a transaction, so 1761 for the fresh inode. This can only be done outside a transaction, so
1762 if we return non-zero, we also end the transaction. */ 1762 if we return non-zero, we also end the transaction. */
1763 int reiserfs_new_inode(struct reiserfs_transaction_handle *th, 1763 int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1764 struct inode *dir, int mode, const char *symname, 1764 struct inode *dir, int mode, const char *symname,
1765 /* 0 for regular, EMPTY_DIR_SIZE for dirs, 1765 /* 0 for regular, EMPTY_DIR_SIZE for dirs,
1766 strlen(symname) for symlinks */ 1766 strlen(symname) for symlinks */
1767 loff_t i_size, struct dentry *dentry, 1767 loff_t i_size, struct dentry *dentry,
1768 struct inode *inode, 1768 struct inode *inode,
1769 struct reiserfs_security_handle *security) 1769 struct reiserfs_security_handle *security)
1770 { 1770 {
1771 struct super_block *sb; 1771 struct super_block *sb;
1772 struct reiserfs_iget_args args; 1772 struct reiserfs_iget_args args;
1773 INITIALIZE_PATH(path_to_key); 1773 INITIALIZE_PATH(path_to_key);
1774 struct cpu_key key; 1774 struct cpu_key key;
1775 struct item_head ih; 1775 struct item_head ih;
1776 struct stat_data sd; 1776 struct stat_data sd;
1777 int retval; 1777 int retval;
1778 int err; 1778 int err;
1779 1779
1780 BUG_ON(!th->t_trans_id); 1780 BUG_ON(!th->t_trans_id);
1781 1781
1782 dquot_initialize(inode); 1782 dquot_initialize(inode);
1783 err = dquot_alloc_inode(inode); 1783 err = dquot_alloc_inode(inode);
1784 if (err) 1784 if (err)
1785 goto out_end_trans; 1785 goto out_end_trans;
1786 if (!dir->i_nlink) { 1786 if (!dir->i_nlink) {
1787 err = -EPERM; 1787 err = -EPERM;
1788 goto out_bad_inode; 1788 goto out_bad_inode;
1789 } 1789 }
1790 1790
1791 sb = dir->i_sb; 1791 sb = dir->i_sb;
1792 1792
1793 /* item head of new item */ 1793 /* item head of new item */
1794 ih.ih_key.k_dir_id = reiserfs_choose_packing(dir); 1794 ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
1795 ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th)); 1795 ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th));
1796 if (!ih.ih_key.k_objectid) { 1796 if (!ih.ih_key.k_objectid) {
1797 err = -ENOMEM; 1797 err = -ENOMEM;
1798 goto out_bad_inode; 1798 goto out_bad_inode;
1799 } 1799 }
1800 args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid); 1800 args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
1801 if (old_format_only(sb)) 1801 if (old_format_only(sb))
1802 make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, 1802 make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
1803 TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT); 1803 TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
1804 else 1804 else
1805 make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, 1805 make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
1806 TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); 1806 TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
1807 memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); 1807 memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
1808 args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); 1808 args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
1809 if (insert_inode_locked4(inode, args.objectid, 1809 if (insert_inode_locked4(inode, args.objectid,
1810 reiserfs_find_actor, &args) < 0) { 1810 reiserfs_find_actor, &args) < 0) {
1811 err = -EINVAL; 1811 err = -EINVAL;
1812 goto out_bad_inode; 1812 goto out_bad_inode;
1813 } 1813 }
1814 if (old_format_only(sb)) 1814 if (old_format_only(sb))
1815 /* not a perfect generation count, as object ids can be reused, but 1815 /* not a perfect generation count, as object ids can be reused, but
1816 ** this is as good as reiserfs can do right now. 1816 ** this is as good as reiserfs can do right now.
1817 ** note that the private part of the inode isn't filled in yet; we have 1817 ** note that the private part of the inode isn't filled in yet; we have
1818 ** to use the directory. 1818 ** to use the directory.
1819 */ 1819 */
1820 inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid); 1820 inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid);
1821 else 1821 else
1822 #if defined( USE_INODE_GENERATION_COUNTER ) 1822 #if defined( USE_INODE_GENERATION_COUNTER )
1823 inode->i_generation = 1823 inode->i_generation =
1824 le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation); 1824 le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation);
1825 #else 1825 #else
1826 inode->i_generation = ++event; 1826 inode->i_generation = ++event;
1827 #endif 1827 #endif
1828 1828
1829 /* fill stat data */ 1829 /* fill stat data */
1830 inode->i_nlink = (S_ISDIR(mode) ? 2 : 1); 1830 inode->i_nlink = (S_ISDIR(mode) ? 2 : 1);
1831 1831
1832 /* uid and gid must already be set by the caller for quota init */ 1832 /* uid and gid must already be set by the caller for quota init */
1833 1833
1834 /* symlink cannot be immutable or append only, right? */ 1834 /* symlink cannot be immutable or append only, right? */
1835 if (S_ISLNK(inode->i_mode)) 1835 if (S_ISLNK(inode->i_mode))
1836 inode->i_flags &= ~(S_IMMUTABLE | S_APPEND); 1836 inode->i_flags &= ~(S_IMMUTABLE | S_APPEND);
1837 1837
1838 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 1838 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
1839 inode->i_size = i_size; 1839 inode->i_size = i_size;
1840 inode->i_blocks = 0; 1840 inode->i_blocks = 0;
1841 inode->i_bytes = 0; 1841 inode->i_bytes = 0;
1842 REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 : 1842 REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
1843 U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ; 1843 U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ;
1844 1844
1845 INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list)); 1845 INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
1846 REISERFS_I(inode)->i_flags = 0; 1846 REISERFS_I(inode)->i_flags = 0;
1847 REISERFS_I(inode)->i_prealloc_block = 0; 1847 REISERFS_I(inode)->i_prealloc_block = 0;
1848 REISERFS_I(inode)->i_prealloc_count = 0; 1848 REISERFS_I(inode)->i_prealloc_count = 0;
1849 REISERFS_I(inode)->i_trans_id = 0; 1849 REISERFS_I(inode)->i_trans_id = 0;
1850 REISERFS_I(inode)->i_jl = NULL; 1850 REISERFS_I(inode)->i_jl = NULL;
1851 REISERFS_I(inode)->i_attrs = 1851 REISERFS_I(inode)->i_attrs =
1852 REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; 1852 REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
1853 sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode); 1853 sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
1854 reiserfs_init_xattr_rwsem(inode); 1854 reiserfs_init_xattr_rwsem(inode);
1855 1855
1856 /* key to search for correct place for new stat data */ 1856 /* key to search for correct place for new stat data */
1857 _make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id), 1857 _make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
1858 le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET, 1858 le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
1859 TYPE_STAT_DATA, 3 /*key length */ ); 1859 TYPE_STAT_DATA, 3 /*key length */ );
1860 1860
1861 /* find proper place for inserting of stat data */ 1861 /* find proper place for inserting of stat data */
1862 retval = search_item(sb, &key, &path_to_key); 1862 retval = search_item(sb, &key, &path_to_key);
1863 if (retval == IO_ERROR) { 1863 if (retval == IO_ERROR) {
1864 err = -EIO; 1864 err = -EIO;
1865 goto out_bad_inode; 1865 goto out_bad_inode;
1866 } 1866 }
1867 if (retval == ITEM_FOUND) { 1867 if (retval == ITEM_FOUND) {
1868 pathrelse(&path_to_key); 1868 pathrelse(&path_to_key);
1869 err = -EEXIST; 1869 err = -EEXIST;
1870 goto out_bad_inode; 1870 goto out_bad_inode;
1871 } 1871 }
1872 if (old_format_only(sb)) { 1872 if (old_format_only(sb)) {
1873 if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) { 1873 if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) {
1874 pathrelse(&path_to_key); 1874 pathrelse(&path_to_key);
1875 /* i_uid or i_gid is too big to be stored in stat data v3.5 */ 1875 /* i_uid or i_gid is too big to be stored in stat data v3.5 */
1876 err = -EINVAL; 1876 err = -EINVAL;
1877 goto out_bad_inode; 1877 goto out_bad_inode;
1878 } 1878 }
1879 inode2sd_v1(&sd, inode, inode->i_size); 1879 inode2sd_v1(&sd, inode, inode->i_size);
1880 } else { 1880 } else {
1881 inode2sd(&sd, inode, inode->i_size); 1881 inode2sd(&sd, inode, inode->i_size);
1882 } 1882 }
1883 // store in the in-core inode the stat data key and the version that 1883 // store in the in-core inode the stat data key and the version that
1884 // all object items will have (directory items keep the old offset 1884 // all object items will have (directory items keep the old offset
1885 // format; other new objects will consist of new items) 1885 // format; other new objects will consist of new items)
1886 if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode)) 1886 if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
1887 set_inode_item_key_version(inode, KEY_FORMAT_3_5); 1887 set_inode_item_key_version(inode, KEY_FORMAT_3_5);
1888 else 1888 else
1889 set_inode_item_key_version(inode, KEY_FORMAT_3_6); 1889 set_inode_item_key_version(inode, KEY_FORMAT_3_6);
1890 if (old_format_only(sb)) 1890 if (old_format_only(sb))
1891 set_inode_sd_version(inode, STAT_DATA_V1); 1891 set_inode_sd_version(inode, STAT_DATA_V1);
1892 else 1892 else
1893 set_inode_sd_version(inode, STAT_DATA_V2); 1893 set_inode_sd_version(inode, STAT_DATA_V2);
1894 1894
1895 /* insert the stat data into the tree */ 1895 /* insert the stat data into the tree */
1896 #ifdef DISPLACE_NEW_PACKING_LOCALITIES 1896 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
1897 if (REISERFS_I(dir)->new_packing_locality) 1897 if (REISERFS_I(dir)->new_packing_locality)
1898 th->displace_new_blocks = 1; 1898 th->displace_new_blocks = 1;
1899 #endif 1899 #endif
1900 retval = 1900 retval =
1901 reiserfs_insert_item(th, &path_to_key, &key, &ih, inode, 1901 reiserfs_insert_item(th, &path_to_key, &key, &ih, inode,
1902 (char *)(&sd)); 1902 (char *)(&sd));
1903 if (retval) { 1903 if (retval) {
1904 err = retval; 1904 err = retval;
1905 reiserfs_check_path(&path_to_key); 1905 reiserfs_check_path(&path_to_key);
1906 goto out_bad_inode; 1906 goto out_bad_inode;
1907 } 1907 }
1908 #ifdef DISPLACE_NEW_PACKING_LOCALITIES 1908 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
1909 if (!th->displace_new_blocks) 1909 if (!th->displace_new_blocks)
1910 REISERFS_I(dir)->new_packing_locality = 0; 1910 REISERFS_I(dir)->new_packing_locality = 0;
1911 #endif 1911 #endif
1912 if (S_ISDIR(mode)) { 1912 if (S_ISDIR(mode)) {
1913 /* insert item with "." and ".." */ 1913 /* insert item with "." and ".." */
1914 retval = 1914 retval =
1915 reiserfs_new_directory(th, inode, &ih, &path_to_key, dir); 1915 reiserfs_new_directory(th, inode, &ih, &path_to_key, dir);
1916 } 1916 }
1917 1917
1918 if (S_ISLNK(mode)) { 1918 if (S_ISLNK(mode)) {
1919 /* insert body of symlink */ 1919 /* insert body of symlink */
1920 if (!old_format_only(sb)) 1920 if (!old_format_only(sb))
1921 i_size = ROUND_UP(i_size); 1921 i_size = ROUND_UP(i_size);
1922 retval = 1922 retval =
1923 reiserfs_new_symlink(th, inode, &ih, &path_to_key, symname, 1923 reiserfs_new_symlink(th, inode, &ih, &path_to_key, symname,
1924 i_size); 1924 i_size);
1925 } 1925 }
1926 if (retval) { 1926 if (retval) {
1927 err = retval; 1927 err = retval;
1928 reiserfs_check_path(&path_to_key); 1928 reiserfs_check_path(&path_to_key);
1929 journal_end(th, th->t_super, th->t_blocks_allocated); 1929 journal_end(th, th->t_super, th->t_blocks_allocated);
1930 goto out_inserted_sd; 1930 goto out_inserted_sd;
1931 } 1931 }
1932 1932
1933 if (reiserfs_posixacl(inode->i_sb)) { 1933 if (reiserfs_posixacl(inode->i_sb)) {
1934 retval = reiserfs_inherit_default_acl(th, dir, dentry, inode); 1934 retval = reiserfs_inherit_default_acl(th, dir, dentry, inode);
1935 if (retval) { 1935 if (retval) {
1936 err = retval; 1936 err = retval;
1937 reiserfs_check_path(&path_to_key); 1937 reiserfs_check_path(&path_to_key);
1938 journal_end(th, th->t_super, th->t_blocks_allocated); 1938 journal_end(th, th->t_super, th->t_blocks_allocated);
1939 goto out_inserted_sd; 1939 goto out_inserted_sd;
1940 } 1940 }
1941 } else if (inode->i_sb->s_flags & MS_POSIXACL) { 1941 } else if (inode->i_sb->s_flags & MS_POSIXACL) {
1942 reiserfs_warning(inode->i_sb, "jdm-13090", 1942 reiserfs_warning(inode->i_sb, "jdm-13090",
1943 "ACLs aren't enabled in the fs, " 1943 "ACLs aren't enabled in the fs, "
1944 "but vfs thinks they are!"); 1944 "but vfs thinks they are!");
1945 } else if (IS_PRIVATE(dir)) 1945 } else if (IS_PRIVATE(dir))
1946 inode->i_flags |= S_PRIVATE; 1946 inode->i_flags |= S_PRIVATE;
1947 1947
1948 if (security->name) { 1948 if (security->name) {
1949 retval = reiserfs_security_write(th, inode, security); 1949 retval = reiserfs_security_write(th, inode, security);
1950 if (retval) { 1950 if (retval) {
1951 err = retval; 1951 err = retval;
1952 reiserfs_check_path(&path_to_key); 1952 reiserfs_check_path(&path_to_key);
1953 retval = journal_end(th, th->t_super, 1953 retval = journal_end(th, th->t_super,
1954 th->t_blocks_allocated); 1954 th->t_blocks_allocated);
1955 if (retval) 1955 if (retval)
1956 err = retval; 1956 err = retval;
1957 goto out_inserted_sd; 1957 goto out_inserted_sd;
1958 } 1958 }
1959 } 1959 }
1960 1960
1961 reiserfs_update_sd(th, inode); 1961 reiserfs_update_sd(th, inode);
1962 reiserfs_check_path(&path_to_key); 1962 reiserfs_check_path(&path_to_key);
1963 1963
1964 return 0; 1964 return 0;
1965 1965
1966 /* it looks like you can easily compress these two goto targets into 1966 /* it looks like you can easily compress these two goto targets into
1967 * one. Keeping it like this doesn't actually hurt anything, and they 1967 * one. Keeping it like this doesn't actually hurt anything, and they
1968 * are placeholders for what the quota code actually needs. 1968 * are placeholders for what the quota code actually needs.
1969 */ 1969 */
1970 out_bad_inode: 1970 out_bad_inode:
1971 /* Invalidate the object, nothing was inserted yet */ 1971 /* Invalidate the object, nothing was inserted yet */
1972 INODE_PKEY(inode)->k_objectid = 0; 1972 INODE_PKEY(inode)->k_objectid = 0;
1973 1973
1974 /* Quota change must be inside a transaction for journaling */ 1974 /* Quota change must be inside a transaction for journaling */
1975 dquot_free_inode(inode); 1975 dquot_free_inode(inode);
1976 1976
1977 out_end_trans: 1977 out_end_trans:
1978 journal_end(th, th->t_super, th->t_blocks_allocated); 1978 journal_end(th, th->t_super, th->t_blocks_allocated);
1979 /* The drop can happen outside the transaction, and it needs more credits, so it is better kept outside */ 1979 /* The drop can happen outside the transaction, and it needs more credits, so it is better kept outside */
1980 dquot_drop(inode); 1980 dquot_drop(inode);
1981 inode->i_flags |= S_NOQUOTA; 1981 inode->i_flags |= S_NOQUOTA;
1982 make_bad_inode(inode); 1982 make_bad_inode(inode);
1983 1983
1984 out_inserted_sd: 1984 out_inserted_sd:
1985 inode->i_nlink = 0; 1985 inode->i_nlink = 0;
1986 th->t_trans_id = 0; /* so the caller can't use this handle later */ 1986 th->t_trans_id = 0; /* so the caller can't use this handle later */
1987 unlock_new_inode(inode); /* OK to do even if we hadn't locked it */ 1987 unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
1988 iput(inode); 1988 iput(inode);
1989 return err; 1989 return err;
1990 } 1990 }
1991 1991
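The error unwinding in reiserfs_new_inode above is a textbook layered goto ladder: each later failure point jumps to a label that undoes strictly less, and the labels fall through into one another in reverse order of setup (free the quota inside the transaction, end the transaction and drop quota references, then iput the half-built inode). A toy, self-contained illustration of the shape; create() and its failure points are hypothetical, not reiserfs code:

    #include <stdio.h>

    static int create(int fail_at)
    {
            int err = 0;

            if (fail_at == 1) { err = -1; goto out_end_trans; }   /* quota alloc failed */
            if (fail_at == 2) { err = -1; goto out_bad_inode; }   /* tree insert failed */
            if (fail_at == 3) { err = -1; goto out_inserted_sd; } /* later item failed  */

            puts("created");
            return 0;

    out_bad_inode:
            puts("free quota (must stay inside the transaction)");
            /* fall through */
    out_end_trans:
            puts("end transaction, drop quota references outside it");
            /* fall through */
    out_inserted_sd:
            puts("clear nlink and iput() the half-built inode");
            return err;
    }

    int main(void)
    {
            create(2);      /* exercise the middle failure point */
            return 0;
    }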
1992 /* 1992 /*
1993 ** finds the tail page in the page cache, 1993 ** finds the tail page in the page cache,
1994 ** reads the last block in. 1994 ** reads the last block in.
1995 ** 1995 **
1996 ** On success, page_result is set to a locked, pinned page, and bh_result 1996 ** On success, page_result is set to a locked, pinned page, and bh_result
1997 ** is set to an up-to-date buffer for the last block in the file. Returns 0. 1997 ** is set to an up-to-date buffer for the last block in the file. Returns 0.
1998 ** 1998 **
1999 ** tail conversion is not done, so bh_result might not be valid for writing; 1999 ** tail conversion is not done, so bh_result might not be valid for writing;
2000 ** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before 2000 ** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
2001 ** trying to write the block. 2001 ** trying to write the block.
2002 ** 2002 **
2003 ** on failure, nonzero is returned, page_result and bh_result are untouched. 2003 ** on failure, nonzero is returned, page_result and bh_result are untouched.
2004 */ 2004 */
2005 static int grab_tail_page(struct inode *inode, 2005 static int grab_tail_page(struct inode *inode,
2006 struct page **page_result, 2006 struct page **page_result,
2007 struct buffer_head **bh_result) 2007 struct buffer_head **bh_result)
2008 { 2008 {
2009 2009
2010 /* we want the page with the last byte in the file, 2010 /* we want the page with the last byte in the file,
2011 ** not the page that will hold the next byte for appending 2011 ** not the page that will hold the next byte for appending
2012 */ 2012 */
2013 unsigned long index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; 2013 unsigned long index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
2014 unsigned long pos = 0; 2014 unsigned long pos = 0;
2015 unsigned long start = 0; 2015 unsigned long start = 0;
2016 unsigned long blocksize = inode->i_sb->s_blocksize; 2016 unsigned long blocksize = inode->i_sb->s_blocksize;
2017 unsigned long offset = (inode->i_size) & (PAGE_CACHE_SIZE - 1); 2017 unsigned long offset = (inode->i_size) & (PAGE_CACHE_SIZE - 1);
2018 struct buffer_head *bh; 2018 struct buffer_head *bh;
2019 struct buffer_head *head; 2019 struct buffer_head *head;
2020 struct page *page; 2020 struct page *page;
2021 int error; 2021 int error;
2022 2022
2023 /* we know that we are only called with inode->i_size > 0. 2023 /* we know that we are only called with inode->i_size > 0.
2024 ** we also know that a file tail can never be as big as a block 2024 ** we also know that a file tail can never be as big as a block
2025 ** If i_size % blocksize == 0, our file is currently block aligned 2025 ** If i_size % blocksize == 0, our file is currently block aligned
2026 ** and it won't need converting or zeroing after a truncate. 2026 ** and it won't need converting or zeroing after a truncate.
2027 */ 2027 */
2028 if ((offset & (blocksize - 1)) == 0) { 2028 if ((offset & (blocksize - 1)) == 0) {
2029 return -ENOENT; 2029 return -ENOENT;
2030 } 2030 }
2031 page = grab_cache_page(inode->i_mapping, index); 2031 page = grab_cache_page(inode->i_mapping, index);
2032 error = -ENOMEM; 2032 error = -ENOMEM;
2033 if (!page) { 2033 if (!page) {
2034 goto out; 2034 goto out;
2035 } 2035 }
2036 /* start within the page of the last block in the file */ 2036 /* start within the page of the last block in the file */
2037 start = (offset / blocksize) * blocksize; 2037 start = (offset / blocksize) * blocksize;
2038 2038
2039 error = __block_write_begin(page, start, offset - start, 2039 error = __block_write_begin(page, start, offset - start,
2040 reiserfs_get_block_create_0); 2040 reiserfs_get_block_create_0);
2041 if (error) 2041 if (error)
2042 goto unlock; 2042 goto unlock;
2043 2043
2044 head = page_buffers(page); 2044 head = page_buffers(page);
2045 bh = head; 2045 bh = head;
2046 do { 2046 do {
2047 if (pos >= start) { 2047 if (pos >= start) {
2048 break; 2048 break;
2049 } 2049 }
2050 bh = bh->b_this_page; 2050 bh = bh->b_this_page;
2051 pos += blocksize; 2051 pos += blocksize;
2052 } while (bh != head); 2052 } while (bh != head);
2053 2053
2054 if (!buffer_uptodate(bh)) { 2054 if (!buffer_uptodate(bh)) {
2055 /* note, this should never happen, prepare_write should 2055 /* note, this should never happen, prepare_write should
2056 ** be taking care of this for us. If the buffer isn't up to date, 2056 ** be taking care of this for us. If the buffer isn't up to date,
2057 ** I've screwed up the code to find the buffer, or the code to 2057 ** I've screwed up the code to find the buffer, or the code to
2058 ** call prepare_write 2058 ** call prepare_write
2059 */ 2059 */
2060 reiserfs_error(inode->i_sb, "clm-6000", 2060 reiserfs_error(inode->i_sb, "clm-6000",
2061 "error reading block %lu", bh->b_blocknr); 2061 "error reading block %lu", bh->b_blocknr);
2062 error = -EIO; 2062 error = -EIO;
2063 goto unlock; 2063 goto unlock;
2064 } 2064 }
2065 *bh_result = bh; 2065 *bh_result = bh;
2066 *page_result = page; 2066 *page_result = page;
2067 2067
2068 out: 2068 out:
2069 return error; 2069 return error;
2070 2070
2071 unlock: 2071 unlock:
2072 unlock_page(page); 2072 unlock_page(page);
2073 page_cache_release(page); 2073 page_cache_release(page);
2074 return error; 2074 return error;
2075 } 2075 }
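
A sketch of how a caller is expected to consume grab_tail_page's results, per the header comment; this mirrors what reiserfs_truncate_file does below and is illustrative only, not part of the patch:

	struct page *page;
	struct buffer_head *bh;
	int err = grab_tail_page(inode, &page, &bh);

	if (!err) {
		/* write through bh only if it maps a real block; a direct
		 * item comes back unmapped or with b_blocknr == 0 */
		if (buffer_mapped(bh) && bh->b_blocknr != 0)
			mark_buffer_dirty(bh);
		unlock_page(page);		/* page was returned locked */
		page_cache_release(page);	/* and pinned */
	}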

/*
** vfs version of truncate file.  Must NOT be called with
** a transaction already started.
**
** some code taken from block_truncate_page
*/
int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
{
	struct reiserfs_transaction_handle th;
	/* we want the offset for the first byte after the end of the file */
	unsigned long offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
	unsigned blocksize = inode->i_sb->s_blocksize;
	unsigned length;
	struct page *page = NULL;
	int error;
	struct buffer_head *bh = NULL;
	int err2;
	int lock_depth;

	lock_depth = reiserfs_write_lock_once(inode->i_sb);

	if (inode->i_size > 0) {
		error = grab_tail_page(inode, &page, &bh);
		if (error) {
			// -ENOENT means we truncated past the end of the
			// file, and get_block_create_0 could not find a
			// block to read in, which is ok.
			if (error != -ENOENT)
				reiserfs_error(inode->i_sb, "clm-6001",
					       "grab_tail_page failed %d",
					       error);
			page = NULL;
			bh = NULL;
		}
	}

	/* so, if page != NULL, we have a buffer head for the offset at
	 ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
	 ** then we have an unformatted node. Otherwise, we have a direct item,
	 ** and no zeroing is required on disk. We zero after the truncate,
	 ** because the truncate might pack the item anyway
	 ** (it will unmap bh if it packs).
	 */
	/* it is enough to reserve space in transaction for 2 balancings:
	   one for "save" link adding and another for the first
	   cut_from_item. 1 is for update_sd */
	error = journal_begin(&th, inode->i_sb,
			      JOURNAL_PER_BALANCE_CNT * 2 + 1);
	if (error)
		goto out;
	reiserfs_update_inode_transaction(inode);
	if (update_timestamps)
		/* we are doing real truncate: if the system crashes
		   before the last transaction of truncating gets committed
		   - on reboot the file either appears truncated properly
		   or not truncated at all */
		add_save_link(&th, inode, 1);
	err2 = reiserfs_do_truncate(&th, inode, page, update_timestamps);
	error =
	    journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1);
	if (error)
		goto out;

	/* check reiserfs_do_truncate after ending the transaction */
	if (err2) {
		error = err2;
		goto out;
	}

	if (update_timestamps) {
		error = remove_save_link(inode, 1 /* truncate */);
		if (error)
			goto out;
	}

	if (page) {
		length = offset & (blocksize - 1);
		/* if we are not on a block boundary */
		if (length) {
			length = blocksize - length;
			zero_user(page, offset, length);
			if (buffer_mapped(bh) && bh->b_blocknr != 0) {
				mark_buffer_dirty(bh);
			}
		}
		unlock_page(page);
		page_cache_release(page);
	}

	reiserfs_write_unlock_once(inode->i_sb, lock_depth);

	return 0;
      out:
	if (page) {
		unlock_page(page);
		page_cache_release(page);
	}

	reiserfs_write_unlock_once(inode->i_sb, lock_depth);

	return error;
}
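
To restate the calling convention: no transaction may be running on entry, and update_timestamps selects the crash-safe save-link path. A hypothetical caller (illustrative only; the exact call sites live elsewhere) would look like:

	/* real truncate, e.g. driven from the setattr/truncate path: the
	 * save link makes a crash mid-truncate recoverable on reboot */
	err = reiserfs_truncate_file(inode, 1);

	/* cleanup of blocks allocated by a failed write: no timestamp
	 * update, no save link (see reiserfs_truncate_failed_write below) */
	err = reiserfs_truncate_file(inode, 0);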

static int map_block_for_writepage(struct inode *inode,
				   struct buffer_head *bh_result,
				   unsigned long block)
{
	struct reiserfs_transaction_handle th;
	int fs_gen;
	struct item_head tmp_ih;
	struct item_head *ih;
	struct buffer_head *bh;
	__le32 *item;
	struct cpu_key key;
	INITIALIZE_PATH(path);
	int pos_in_item;
	int jbegin_count = JOURNAL_PER_BALANCE_CNT;
	loff_t byte_offset = ((loff_t)block << inode->i_sb->s_blocksize_bits)+1;
	int retval;
	int use_get_block = 0;
	int bytes_copied = 0;
	int copy_size;
	int trans_running = 0;

	/* catch places below that try to log something without starting a
	 * trans */
	th.t_trans_id = 0;

	if (!buffer_uptodate(bh_result)) {
		return -EIO;
	}

	kmap(bh_result->b_page);
      start_over:
	reiserfs_write_lock(inode->i_sb);
	make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3);

      research:
	retval = search_for_position_by_key(inode->i_sb, &key, &path);
	if (retval != POSITION_FOUND) {
		use_get_block = 1;
		goto out;
	}

	bh = get_last_bh(&path);
	ih = get_ih(&path);
	item = get_item(&path);
	pos_in_item = path.pos_in_item;

	/* we've found an unformatted node */
	if (indirect_item_found(retval, ih)) {
		if (bytes_copied > 0) {
			reiserfs_warning(inode->i_sb, "clm-6002",
					 "bytes_copied %d", bytes_copied);
		}
		if (!get_block_num(item, pos_in_item)) {
			/* crap, we are writing to a hole */
			use_get_block = 1;
			goto out;
		}
		set_block_dev_mapped(bh_result,
				     get_block_num(item, pos_in_item), inode);
	} else if (is_direct_le_ih(ih)) {
		char *p;
		p = page_address(bh_result->b_page);
		p += (byte_offset - 1) & (PAGE_CACHE_SIZE - 1);
		copy_size = ih_item_len(ih) - pos_in_item;

		fs_gen = get_generation(inode->i_sb);
		copy_item_head(&tmp_ih, ih);

		if (!trans_running) {
			/* vs-3050 is gone, no need to drop the path */
			retval = journal_begin(&th, inode->i_sb, jbegin_count);
			if (retval)
				goto out;
			reiserfs_update_inode_transaction(inode);
			trans_running = 1;
			if (fs_changed(fs_gen, inode->i_sb)
			    && item_moved(&tmp_ih, &path)) {
				reiserfs_restore_prepared_buffer(inode->i_sb,
								 bh);
				goto research;
			}
		}

		reiserfs_prepare_for_journal(inode->i_sb, bh, 1);

		if (fs_changed(fs_gen, inode->i_sb)
		    && item_moved(&tmp_ih, &path)) {
			reiserfs_restore_prepared_buffer(inode->i_sb, bh);
			goto research;
		}

		memcpy(B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied,
		       copy_size);

		journal_mark_dirty(&th, inode->i_sb, bh);
		bytes_copied += copy_size;
		set_block_dev_mapped(bh_result, 0, inode);

		/* are there still bytes left? */
		if (bytes_copied < bh_result->b_size &&
		    (byte_offset + bytes_copied) < inode->i_size) {
			set_cpu_key_k_offset(&key,
					     cpu_key_k_offset(&key) +
					     copy_size);
			goto research;
		}
	} else {
		reiserfs_warning(inode->i_sb, "clm-6003",
				 "bad item inode %lu", inode->i_ino);
		retval = -EIO;
		goto out;
	}
	retval = 0;

      out:
	pathrelse(&path);
	if (trans_running) {
		int err = journal_end(&th, inode->i_sb, jbegin_count);
		if (err)
			retval = err;
		trans_running = 0;
	}
	reiserfs_write_unlock(inode->i_sb);

	/* this is where we fill in holes in the file. */
	if (use_get_block) {
		retval = reiserfs_get_block(inode, block, bh_result,
					    GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX
					    | GET_BLOCK_NO_DANGLE);
		if (!retval) {
			if (!buffer_mapped(bh_result)
			    || bh_result->b_blocknr == 0) {
				/* get_block failed to find a mapped
				   unformatted node. */
				use_get_block = 0;
				goto start_over;
			}
		}
	}
	kunmap(bh_result->b_page);

	if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
		/* we've copied data from the page into the direct item, so the
		 * buffer in the page is now clean, mark it to reflect that.
		 */
		lock_buffer(bh_result);
		clear_buffer_dirty(bh_result);
		unlock_buffer(bh_result);
	}
	return retval;
}
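
The fs_changed()/item_moved() dance above is the file's standard guard around tree searches that may block: snapshot the generation and the item head, and repeat the search if the tree moved underneath. A sketch of the idiom in isolation, using the same calls as above (illustrative):

	fs_gen = get_generation(inode->i_sb);	/* snapshot before blocking */
	copy_item_head(&tmp_ih, ih);

	/* ... anything that can schedule, e.g. journal_begin() ... */

	if (fs_changed(fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path))
		goto research;	/* tree changed under us: search again */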

/*
 * mason@suse.com: updated in 2.5.54 to follow the same general io
 * start/recovery path as __block_write_full_page, along with special
 * code to handle reiserfs tails.
 */
static int reiserfs_write_full_page(struct page *page,
				    struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
	int error = 0;
	unsigned long block;
	sector_t last_block;
	struct buffer_head *head, *bh;
	int partial = 0;
	int nr = 0;
	int checked = PageChecked(page);
	struct reiserfs_transaction_handle th;
	struct super_block *s = inode->i_sb;
	int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
	th.t_trans_id = 0;

	/* no logging allowed when nonblocking or from PF_MEMALLOC */
	if (checked && (current->flags & PF_MEMALLOC)) {
		redirty_page_for_writepage(wbc, page);
		unlock_page(page);
		return 0;
	}

	/* The page dirty bit is cleared before writepage is called, which
	 * means we have to tell create_empty_buffers to make dirty buffers.
	 * The page really should be up to date at this point, so tossing
	 * in the BH_Uptodate is just a sanity check.
	 */
	if (!page_has_buffers(page)) {
		create_empty_buffers(page, s->s_blocksize,
				     (1 << BH_Dirty) | (1 << BH_Uptodate));
	}
	head = page_buffers(page);

	/* last page in the file, zero out any contents past the
	 ** last byte in the file
	 */
	if (page->index >= end_index) {
		unsigned last_offset;

		last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
		/* no file contents in this page */
		if (page->index >= end_index + 1 || !last_offset) {
			unlock_page(page);
			return 0;
		}
		zero_user_segment(page, last_offset, PAGE_CACHE_SIZE);
	}
	bh = head;
	block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits);
	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
	/* first map all the buffers, logging any direct items we find */
	do {
		if (block > last_block) {
			/*
			 * This can happen when the block size is less than
			 * the page size.  The corresponding bytes in the page
			 * were zero filled above
			 */
			clear_buffer_dirty(bh);
			set_buffer_uptodate(bh);
		} else if ((checked || buffer_dirty(bh)) &&
			   (!buffer_mapped(bh) || (buffer_mapped(bh)
						   && bh->b_blocknr == 0))) {
			/* not mapped yet, or it points to a direct item,
			 * search the btree for the mapping info, and log
			 * any direct items found
			 */
			if ((error = map_block_for_writepage(inode, bh, block))) {
				goto fail;
			}
		}
		bh = bh->b_this_page;
		block++;
	} while (bh != head);

	/*
	 * we start the transaction after map_block_for_writepage,
	 * because it can create holes in the file (an unbounded operation).
	 * starting it here, we can make a reliable estimate for how many
	 * blocks we're going to log
	 */
	if (checked) {
		ClearPageChecked(page);
		reiserfs_write_lock(s);
		error = journal_begin(&th, s, bh_per_page + 1);
		if (error) {
			reiserfs_write_unlock(s);
			goto fail;
		}
		reiserfs_update_inode_transaction(inode);
	}
	/* now go through and lock any dirty buffers on the page */
	do {
		get_bh(bh);
		if (!buffer_mapped(bh))
			continue;
		if (buffer_mapped(bh) && bh->b_blocknr == 0)
			continue;

		if (checked) {
			reiserfs_prepare_for_journal(s, bh, 1);
			journal_mark_dirty(&th, s, bh);
			continue;
		}
		/* from this point on, we know the buffer is mapped to a
		 * real block and not a direct item
		 */
		if (wbc->sync_mode != WB_SYNC_NONE) {
			lock_buffer(bh);
		} else {
			if (!trylock_buffer(bh)) {
				redirty_page_for_writepage(wbc, page);
				continue;
			}
		}
		if (test_clear_buffer_dirty(bh)) {
			mark_buffer_async_write(bh);
		} else {
			unlock_buffer(bh);
		}
	} while ((bh = bh->b_this_page) != head);

	if (checked) {
		error = journal_end(&th, s, bh_per_page + 1);
		reiserfs_write_unlock(s);
		if (error)
			goto fail;
	}
	BUG_ON(PageWriteback(page));
	set_page_writeback(page);
	unlock_page(page);

	/*
	 * since any buffer might be the only dirty buffer on the page,
	 * the first submit_bh can bring the page out of writeback.
	 * be careful with the buffers.
	 */
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			submit_bh(WRITE, bh);
			nr++;
		}
		put_bh(bh);
		bh = next;
	} while (bh != head);

	error = 0;
      done:
	if (nr == 0) {
		/*
		 * if this page only had a direct item, it is very possible
		 * for no io to be required without there being an error.
		 * Or, someone else could have locked them and sent them down
		 * the pipe without locking the page
		 */
		bh = head;
		do {
			if (!buffer_uptodate(bh)) {
				partial = 1;
				break;
			}
			bh = bh->b_this_page;
		} while (bh != head);
		if (!partial)
			SetPageUptodate(page);
		end_page_writeback(page);
	}
	return error;

      fail:
	/* catches various errors, we need to make sure any valid dirty blocks
	 * get to the media.  The page is currently locked and not marked for
	 * writeback
	 */
	ClearPageUptodate(page);
	bh = head;
	do {
		get_bh(bh);
		if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) {
			lock_buffer(bh);
			mark_buffer_async_write(bh);
		} else {
			/*
			 * clear any dirty bits that might have come from
			 * getting attached to a dirty page
			 */
			clear_buffer_dirty(bh);
		}
		bh = bh->b_this_page;
	} while (bh != head);
	SetPageError(page);
	BUG_ON(PageWriteback(page));
	set_page_writeback(page);
	unlock_page(page);
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			clear_buffer_dirty(bh);
			submit_bh(WRITE, bh);
			nr++;
		}
		put_bh(bh);
		bh = next;
	} while (bh != head);
	goto done;
}

static int reiserfs_readpage(struct file *f, struct page *page)
{
	return block_read_full_page(page, reiserfs_get_block);
}

static int reiserfs_writepage(struct page *page, struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	reiserfs_wait_on_write_block(inode->i_sb);
	return reiserfs_write_full_page(page, wbc);
}

static void reiserfs_truncate_failed_write(struct inode *inode)
{
	truncate_inode_pages(inode->i_mapping, inode->i_size);
	reiserfs_truncate_file(inode, 0);
}

static int reiserfs_write_begin(struct file *file,
				struct address_space *mapping,
				loff_t pos, unsigned len, unsigned flags,
				struct page **pagep, void **fsdata)
{
	struct inode *inode;
	struct page *page;
	pgoff_t index;
	int ret;
	int old_ref = 0;

	inode = mapping->host;
	*fsdata = 0;
	if (flags & AOP_FLAG_CONT_EXPAND &&
	    (pos & (inode->i_sb->s_blocksize - 1)) == 0) {
		pos++;
		*fsdata = (void *)(unsigned long)flags;
	}

	index = pos >> PAGE_CACHE_SHIFT;
	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;
	*pagep = page;

	reiserfs_wait_on_write_block(inode->i_sb);
	fix_tail_page_for_writing(page);
	if (reiserfs_transaction_running(inode->i_sb)) {
		struct reiserfs_transaction_handle *th;
		th = (struct reiserfs_transaction_handle *)current->
		    journal_info;
		BUG_ON(!th->t_refcount);
		BUG_ON(!th->t_trans_id);
		old_ref = th->t_refcount;
		th->t_refcount++;
	}
	ret = __block_write_begin(page, pos, len, reiserfs_get_block);
	if (ret && reiserfs_transaction_running(inode->i_sb)) {
		struct reiserfs_transaction_handle *th = current->journal_info;
		/* this gets a little ugly.  If reiserfs_get_block returned an
		 * error and left a transaction running, we've got to close
		 * it, and we've got to free the handle if it was a persistent
		 * transaction.
		 *
		 * But, if we had nested into an existing transaction, we need
		 * to just drop the ref count on the handle.
		 *
		 * If old_ref == 0, the transaction is from reiserfs_get_block,
		 * and it was a persistent trans.  Otherwise, it was nested
		 * above.
		 */
		if (th->t_refcount > old_ref) {
			if (old_ref)
				th->t_refcount--;
			else {
				int err;
				reiserfs_write_lock(inode->i_sb);
				err = reiserfs_end_persistent_transaction(th);
				reiserfs_write_unlock(inode->i_sb);
				if (err)
					ret = err;
			}
		}
	}
	if (ret) {
		unlock_page(page);
		page_cache_release(page);
		/* Truncate allocated blocks */
		reiserfs_truncate_failed_write(inode);
	}
	return ret;
}
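
The handle-cleanup rule in the error path above is repeated verbatim in __reiserfs_write_begin() below; reduced to its core, the rule is (illustrative restatement):

	if (th->t_refcount > old_ref) {		/* get_block left a ref behind */
		if (old_ref)
			th->t_refcount--;	/* nested: just drop our ref */
		else
			reiserfs_end_persistent_transaction(th); /* ours: close it */
	}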

int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
{
	struct inode *inode = page->mapping->host;
	int ret;
	int old_ref = 0;

	reiserfs_write_unlock(inode->i_sb);
	reiserfs_wait_on_write_block(inode->i_sb);
	reiserfs_write_lock(inode->i_sb);

	fix_tail_page_for_writing(page);
	if (reiserfs_transaction_running(inode->i_sb)) {
		struct reiserfs_transaction_handle *th;
		th = (struct reiserfs_transaction_handle *)current->
		    journal_info;
		BUG_ON(!th->t_refcount);
		BUG_ON(!th->t_trans_id);
		old_ref = th->t_refcount;
		th->t_refcount++;
	}

	ret = __block_write_begin(page, from, len, reiserfs_get_block);
	if (ret && reiserfs_transaction_running(inode->i_sb)) {
		struct reiserfs_transaction_handle *th = current->journal_info;
		/* this gets a little ugly.  If reiserfs_get_block returned an
		 * error and left a transaction running, we've got to close
		 * it, and we've got to free the handle if it was a persistent
		 * transaction.
		 *
		 * But, if we had nested into an existing transaction, we need
		 * to just drop the ref count on the handle.
		 *
		 * If old_ref == 0, the transaction is from reiserfs_get_block,
		 * and it was a persistent trans.  Otherwise, it was nested
		 * above.
		 */
		if (th->t_refcount > old_ref) {
			if (old_ref)
				th->t_refcount--;
			else {
				int err;
				reiserfs_write_lock(inode->i_sb);
				err = reiserfs_end_persistent_transaction(th);
				reiserfs_write_unlock(inode->i_sb);
				if (err)
					ret = err;
			}
		}
	}
	return ret;
}

static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block)
{
	return generic_block_bmap(as, block, reiserfs_bmap);
}

static int reiserfs_write_end(struct file *file, struct address_space *mapping,
			      loff_t pos, unsigned len, unsigned copied,
			      struct page *page, void *fsdata)
{
	struct inode *inode = page->mapping->host;
	int ret = 0;
	int update_sd = 0;
	struct reiserfs_transaction_handle *th;
	unsigned start;
	int lock_depth = 0;
	bool locked = false;

	if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND)
		pos++;

	reiserfs_wait_on_write_block(inode->i_sb);
	if (reiserfs_transaction_running(inode->i_sb))
		th = current->journal_info;
	else
		th = NULL;

	start = pos & (PAGE_CACHE_SIZE - 1);
	if (unlikely(copied < len)) {
		if (!PageUptodate(page))
			copied = 0;

		page_zero_new_buffers(page, start + copied, start + len);
	}
	flush_dcache_page(page);

	reiserfs_commit_page(inode, page, start, start + copied);

	/* generic_commit_write does this for us, but does not update the
	 ** transaction tracking stuff when the size changes.  So, we have
	 ** to do the i_size updates here.
	 */
	if (pos + copied > inode->i_size) {
		struct reiserfs_transaction_handle myth;
		lock_depth = reiserfs_write_lock_once(inode->i_sb);
		locked = true;
		/* If the file has grown beyond the border where it
		   can have a tail, unmark it as needing a tail
		   packing */
		if ((have_large_tails(inode->i_sb)
		     && inode->i_size > i_block_size(inode) * 4)
		    || (have_small_tails(inode->i_sb)
			&& inode->i_size > i_block_size(inode)))
			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;

		ret = journal_begin(&myth, inode->i_sb, 1);
		if (ret)
			goto journal_error;

		reiserfs_update_inode_transaction(inode);
		inode->i_size = pos + copied;
		/*
		 * this will just nest into our transaction.  It's important
		 * to use mark_inode_dirty so the inode gets pushed around on
		 * the dirty lists, and so that O_SYNC works as expected
		 */
		mark_inode_dirty(inode);
		reiserfs_update_sd(&myth, inode);
		update_sd = 1;
		ret = journal_end(&myth, inode->i_sb, 1);
		if (ret)
			goto journal_error;
	}
	if (th) {
		if (!locked) {
			lock_depth = reiserfs_write_lock_once(inode->i_sb);
			locked = true;
		}
		if (!update_sd)
			mark_inode_dirty(inode);
		ret = reiserfs_end_persistent_transaction(th);
		if (ret)
			goto out;
	}

      out:
	if (locked)
		reiserfs_write_unlock_once(inode->i_sb, lock_depth);
	unlock_page(page);
	page_cache_release(page);

	if (pos + len > inode->i_size)
		reiserfs_truncate_failed_write(inode);

	return ret == 0 ? copied : ret;

      journal_error:
	reiserfs_write_unlock_once(inode->i_sb, lock_depth);
	locked = false;
	if (th) {
		if (!update_sd)
			reiserfs_update_sd(th, inode);
		ret = reiserfs_end_persistent_transaction(th);
	}
	goto out;
}
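
For orientation, these entry points are the ones plugged into the file's address_space_operations; a sketch of the wiring (the actual struct is defined elsewhere in this file, so treat the exact field list as an assumption):

	static const struct address_space_operations example_aops = {
		.readpage	= reiserfs_readpage,
		.writepage	= reiserfs_writepage,
		.write_begin	= reiserfs_write_begin,
		.write_end	= reiserfs_write_end,
		.bmap		= reiserfs_aop_bmap,
		.invalidatepage	= reiserfs_invalidatepage,
		.set_page_dirty	= reiserfs_set_page_dirty,
	};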

int reiserfs_commit_write(struct file *f, struct page *page,
			  unsigned from, unsigned to)
{
	struct inode *inode = page->mapping->host;
	loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + to;
	int ret = 0;
	int update_sd = 0;
	struct reiserfs_transaction_handle *th = NULL;

	reiserfs_write_unlock(inode->i_sb);
	reiserfs_wait_on_write_block(inode->i_sb);
	reiserfs_write_lock(inode->i_sb);

	if (reiserfs_transaction_running(inode->i_sb)) {
		th = current->journal_info;
	}
	reiserfs_commit_page(inode, page, from, to);

	/* generic_commit_write does this for us, but does not update the
	 ** transaction tracking stuff when the size changes.  So, we have
	 ** to do the i_size updates here.
	 */
	if (pos > inode->i_size) {
		struct reiserfs_transaction_handle myth;
		/* If the file has grown beyond the border where it
		   can have a tail, unmark it as needing a tail
		   packing */
		if ((have_large_tails(inode->i_sb)
		     && inode->i_size > i_block_size(inode) * 4)
		    || (have_small_tails(inode->i_sb)
			&& inode->i_size > i_block_size(inode)))
			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;

		ret = journal_begin(&myth, inode->i_sb, 1);
		if (ret)
			goto journal_error;

		reiserfs_update_inode_transaction(inode);
		inode->i_size = pos;
		/*
		 * this will just nest into our transaction.  It's important
		 * to use mark_inode_dirty so the inode gets pushed around on
		 * the dirty lists, and so that O_SYNC works as expected
		 */
		mark_inode_dirty(inode);
		reiserfs_update_sd(&myth, inode);
		update_sd = 1;
		ret = journal_end(&myth, inode->i_sb, 1);
		if (ret)
			goto journal_error;
	}
	if (th) {
		if (!update_sd)
			mark_inode_dirty(inode);
		ret = reiserfs_end_persistent_transaction(th);
		if (ret)
			goto out;
	}

      out:
	return ret;

      journal_error:
	if (th) {
		if (!update_sd)
			reiserfs_update_sd(th, inode);
		ret = reiserfs_end_persistent_transaction(th);
	}

	return ret;
}

void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode)
{
	if (reiserfs_attrs(inode->i_sb)) {
		if (sd_attrs & REISERFS_SYNC_FL)
			inode->i_flags |= S_SYNC;
		else
			inode->i_flags &= ~S_SYNC;
		if (sd_attrs & REISERFS_IMMUTABLE_FL)
			inode->i_flags |= S_IMMUTABLE;
		else
			inode->i_flags &= ~S_IMMUTABLE;
		if (sd_attrs & REISERFS_APPEND_FL)
			inode->i_flags |= S_APPEND;
		else
			inode->i_flags &= ~S_APPEND;
		if (sd_attrs & REISERFS_NOATIME_FL)
			inode->i_flags |= S_NOATIME;
		else
			inode->i_flags &= ~S_NOATIME;
		if (sd_attrs & REISERFS_NOTAIL_FL)
			REISERFS_I(inode)->i_flags |= i_nopack_mask;
		else
			REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
	}
}

void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs)
{
	if (reiserfs_attrs(inode->i_sb)) {
		if (inode->i_flags & S_IMMUTABLE)
			*sd_attrs |= REISERFS_IMMUTABLE_FL;
		else
			*sd_attrs &= ~REISERFS_IMMUTABLE_FL;
		if (inode->i_flags & S_SYNC)
			*sd_attrs |= REISERFS_SYNC_FL;
		else
			*sd_attrs &= ~REISERFS_SYNC_FL;
		if (inode->i_flags & S_NOATIME)
			*sd_attrs |= REISERFS_NOATIME_FL;
		else
			*sd_attrs &= ~REISERFS_NOATIME_FL;
		if (REISERFS_I(inode)->i_flags & i_nopack_mask)
			*sd_attrs |= REISERFS_NOTAIL_FL;
		else
			*sd_attrs &= ~REISERFS_NOTAIL_FL;
	}
}
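
A short sketch of the intended round trip between the two helpers above (illustrative; sd_attrs would normally come from, or be written back to, the on-disk stat data):

	__u16 sd_attrs = 0;

	i_attrs_to_sd_attrs(inode, &sd_attrs);	/* inode->i_flags -> sd bits */
	sd_attrs_to_i_attrs(sd_attrs, inode);	/* sd bits -> inode->i_flags */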
2910 2910
2911 /* decide if this buffer needs to stay around for data logging or ordered 2911 /* decide if this buffer needs to stay around for data logging or ordered
2912 ** write purposes 2912 ** write purposes
2913 */ 2913 */
2914 static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) 2914 static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
2915 { 2915 {
2916 int ret = 1; 2916 int ret = 1;
2917 struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); 2917 struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
2918 2918
2919 lock_buffer(bh); 2919 lock_buffer(bh);
2920 spin_lock(&j->j_dirty_buffers_lock); 2920 spin_lock(&j->j_dirty_buffers_lock);
2921 if (!buffer_mapped(bh)) { 2921 if (!buffer_mapped(bh)) {
2922 goto free_jh; 2922 goto free_jh;
2923 } 2923 }
2924 /* the page is locked, and the only places that log a data buffer 2924 /* the page is locked, and the only places that log a data buffer
2925 * also lock the page. 2925 * also lock the page.
2926 */ 2926 */
2927 if (reiserfs_file_data_log(inode)) { 2927 if (reiserfs_file_data_log(inode)) {
2928 /* 2928 /*
2929 * very conservative, leave the buffer pinned if 2929 * very conservative, leave the buffer pinned if
2930 * anyone might need it. 2930 * anyone might need it.
2931 */ 2931 */
2932 if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { 2932 if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
2933 ret = 0; 2933 ret = 0;
2934 } 2934 }
2935 } else if (buffer_dirty(bh)) { 2935 } else if (buffer_dirty(bh)) {
2936 struct reiserfs_journal_list *jl; 2936 struct reiserfs_journal_list *jl;
2937 struct reiserfs_jh *jh = bh->b_private; 2937 struct reiserfs_jh *jh = bh->b_private;
2938 2938
2939 /* why is this safe? 2939 /* why is this safe?
2940 * reiserfs_setattr updates i_size in the on disk 2940 * reiserfs_setattr updates i_size in the on disk
2941 * stat data before allowing vmtruncate to be called. 2941 * stat data before allowing vmtruncate to be called.
2942 * 2942 *
2943 * If buffer was put onto the ordered list for this 2943 * If buffer was put onto the ordered list for this
2944 * transaction, we know for sure either this transaction 2944 * transaction, we know for sure either this transaction
2945 * or an older one already has updated i_size on disk, 2945 * or an older one already has updated i_size on disk,
2946 * and this ordered data won't be referenced in the file 2946 * and this ordered data won't be referenced in the file
2947 * if we crash. 2947 * if we crash.
2948 * 2948 *
2949 * if the buffer was put onto the ordered list for an older 2949 * if the buffer was put onto the ordered list for an older
2950 * transaction, we need to leave it around 2950 * transaction, we need to leave it around
2951 */ 2951 */
2952 if (jh && (jl = jh->jl) 2952 if (jh && (jl = jh->jl)
2953 && jl != SB_JOURNAL(inode->i_sb)->j_current_jl) 2953 && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
2954 ret = 0; 2954 ret = 0;
2955 } 2955 }
2956 free_jh: 2956 free_jh:
2957 if (ret && bh->b_private) { 2957 if (ret && bh->b_private) {
2958 reiserfs_free_jh(bh); 2958 reiserfs_free_jh(bh);
2959 } 2959 }
2960 spin_unlock(&j->j_dirty_buffers_lock); 2960 spin_unlock(&j->j_dirty_buffers_lock);
2961 unlock_buffer(bh); 2961 unlock_buffer(bh);
2962 return ret; 2962 return ret;
2963 } 2963 }
2964 2964
2965 /* clm -- taken from fs/buffer.c:block_invalidate_page */ 2965 /* clm -- taken from fs/buffer.c:block_invalidate_page */
2966 static void reiserfs_invalidatepage(struct page *page, unsigned long offset) 2966 static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
2967 { 2967 {
2968 struct buffer_head *head, *bh, *next; 2968 struct buffer_head *head, *bh, *next;
2969 struct inode *inode = page->mapping->host; 2969 struct inode *inode = page->mapping->host;
2970 unsigned int curr_off = 0; 2970 unsigned int curr_off = 0;
2971 int ret = 1; 2971 int ret = 1;
2972 2972
2973 BUG_ON(!PageLocked(page)); 2973 BUG_ON(!PageLocked(page));
2974 2974
2975 if (offset == 0) 2975 if (offset == 0)
2976 ClearPageChecked(page); 2976 ClearPageChecked(page);
2977 2977
2978 if (!page_has_buffers(page)) 2978 if (!page_has_buffers(page))
2979 goto out; 2979 goto out;
2980 2980
2981 head = page_buffers(page); 2981 head = page_buffers(page);
2982 bh = head; 2982 bh = head;
2983 do { 2983 do {
2984 unsigned int next_off = curr_off + bh->b_size; 2984 unsigned int next_off = curr_off + bh->b_size;
2985 next = bh->b_this_page; 2985 next = bh->b_this_page;
2986 2986
2987 /* 2987 /*
2988 * is this block fully invalidated? 2988 * is this block fully invalidated?
2989 */ 2989 */
2990 if (offset <= curr_off) { 2990 if (offset <= curr_off) {
2991 if (invalidatepage_can_drop(inode, bh)) 2991 if (invalidatepage_can_drop(inode, bh))
2992 reiserfs_unmap_buffer(bh); 2992 reiserfs_unmap_buffer(bh);
2993 else 2993 else
2994 ret = 0; 2994 ret = 0;
2995 } 2995 }
2996 curr_off = next_off; 2996 curr_off = next_off;
2997 bh = next; 2997 bh = next;
2998 } while (bh != head); 2998 } while (bh != head);
2999 2999
3000 /* 3000 /*
3001 * We release buffers only if the entire page is being invalidated. 3001 * We release buffers only if the entire page is being invalidated.
3002 * The get_block cached value has been unconditionally invalidated, 3002 * The get_block cached value has been unconditionally invalidated,
3003 * so real IO is not possible anymore. 3003 * so real IO is not possible anymore.
3004 */ 3004 */
3005 if (!offset && ret) { 3005 if (!offset && ret) {
3006 ret = try_to_release_page(page, 0); 3006 ret = try_to_release_page(page, 0);
3007 /* maybe should BUG_ON(!ret); - neilb */ 3007 /* maybe should BUG_ON(!ret); - neilb */
3008 } 3008 }
3009 out: 3009 out:
3010 return; 3010 return;
3011 } 3011 }
3012 3012
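The "offset <= curr_off" test above decides which buffers start at or past the beginning of the invalidated range and can therefore be dropped whole. A small standalone sketch with hypothetical numbers (a 4096-byte page carrying four 1024-byte buffers, invalidated from offset 2048) shows which buffers the loop would drop:

#include <stdio.h>

/* Hypothetical layout: four 1024-byte buffers on one 4096-byte page,
 * invalidation starting at offset 2048.  A buffer is fully
 * invalidated when the invalidation offset is at or before its
 * starting offset, matching the "offset <= curr_off" check above. */
int main(void)
{
	unsigned int offset = 2048, curr_off = 0, b_size = 1024;

	while (curr_off < 4096) {
		printf("buffer at %u: %s\n", curr_off,
		       offset <= curr_off ? "drop" : "keep");
		curr_off += b_size;
	}
	return 0;
}
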
static int reiserfs_set_page_dirty(struct page *page)
{
	struct inode *inode = page->mapping->host;
	if (reiserfs_file_data_log(inode)) {
		SetPageChecked(page);
		return __set_page_dirty_nobuffers(page);
	}
	return __set_page_dirty_buffers(page);
}

/*
 * Returns 1 if the page's buffers were dropped. The page is locked.
 *
 * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
 * in the buffers at page_buffers(page).
 *
 * even in -o notail mode, we can't be sure an old mount without -o notail
 * didn't create files with tails.
 */
static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
{
	struct inode *inode = page->mapping->host;
	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
	struct buffer_head *head;
	struct buffer_head *bh;
	int ret = 1;

	WARN_ON(PageChecked(page));
	spin_lock(&j->j_dirty_buffers_lock);
	head = page_buffers(page);
	bh = head;
	do {
		if (bh->b_private) {
			if (!buffer_dirty(bh) && !buffer_locked(bh)) {
				reiserfs_free_jh(bh);
			} else {
				ret = 0;
				break;
			}
		}
		bh = bh->b_this_page;
	} while (bh != head);
	if (ret)
		ret = try_to_free_buffers(page);
	spin_unlock(&j->j_dirty_buffers_lock);
	return ret;
}

/* We thank Mingming Cao for helping us understand in great detail what
   to do in this section of the code. */
static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
				  const struct iovec *iov, loff_t offset,
				  unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	ssize_t ret;

	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
				 offset, nr_segs,
				 reiserfs_get_blocks_direct_io, NULL);

	/*
	 * In case of error extending write may have instantiated a few
	 * blocks outside i_size. Trim these off again.
	 */
	if (unlikely((rw & WRITE) && ret < 0)) {
		loff_t isize = i_size_read(inode);
		loff_t end = offset + iov_length(iov, nr_segs);

		if (end > isize)
			vmtruncate(inode, isize);
	}

	return ret;
}

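To make the trim in the error path above concrete, here is a minimal userspace sketch with hypothetical numbers (the sizes and offsets are invented for illustration): a direct write meant to extend the file fails after blocks were already instantiated, so everything past the old i_size has to be cut back, which is what the vmtruncate() call does.

#include <stdio.h>

/* Hypothetical numbers for the error path above: the failed direct
 * write started at the old EOF (4096) and covered 8192 bytes, so
 * blocks may have been instantiated out to 12288 while i_size is
 * still 4096. */
int main(void)
{
	long long isize = 4096;		/* i_size_read(inode)        */
	long long offset = 4096;	/* where the write started   */
	long long len = 8192;		/* iov_length(iov, nr_segs)  */
	long long end = offset + len;	/* 12288                     */

	if (end > isize)		/* same check as above       */
		printf("trim back to %lld, dropping %lld stray bytes\n",
		       isize, end - isize);
	return 0;
}
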
int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	unsigned int ia_valid;
	int depth;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	/* must be turned off for recursive notify_change calls */
	ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);

	depth = reiserfs_write_lock_once(inode->i_sb);
	if (is_quota_modification(inode, attr))
		dquot_initialize(inode);

	if (attr->ia_valid & ATTR_SIZE) {
		/* version 2 items will be caught by the s_maxbytes check
		** done for us in vmtruncate
		*/
		if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
		    attr->ia_size > MAX_NON_LFS) {
			error = -EFBIG;
			goto out;
		}

		inode_dio_wait(inode);

		/* fill in hole pointers in the expanding truncate case. */
		if (attr->ia_size > inode->i_size) {
			error = generic_cont_expand_simple(inode, attr->ia_size);
			if (REISERFS_I(inode)->i_prealloc_count > 0) {
				int err;
				struct reiserfs_transaction_handle th;
				/* we're changing at most 2 bitmaps, inode + super */
				err = journal_begin(&th, inode->i_sb, 4);
				if (!err) {
					reiserfs_discard_prealloc(&th, inode);
					err = journal_end(&th, inode->i_sb, 4);
				}
				if (err)
					error = err;
			}
			if (error)
				goto out;
			/*
			 * file size is changed, ctime and mtime are
			 * to be updated
			 */
			attr->ia_valid |= (ATTR_MTIME | ATTR_CTIME);
		}
	}

	if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) ||
	     ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) &&
	    (get_inode_sd_version(inode) == STAT_DATA_V1)) {
		/* stat data of format v3.5 has 16 bit uid and gid */
		error = -EINVAL;
		goto out;
	}

	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
	    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
		struct reiserfs_transaction_handle th;
		int jbegin_count =
		    2 *
		    (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) +
		     REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) +
		    2;

		error = reiserfs_chown_xattrs(inode, attr);

		if (error)
			return error;

		/* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */
		error = journal_begin(&th, inode->i_sb, jbegin_count);
		if (error)
			goto out;
		error = dquot_transfer(inode, attr);
		if (error) {
			journal_end(&th, inode->i_sb, jbegin_count);
			goto out;
		}

		/* Update corresponding info in inode so that everything is in
		 * one transaction */
		if (attr->ia_valid & ATTR_UID)
			inode->i_uid = attr->ia_uid;
		if (attr->ia_valid & ATTR_GID)
			inode->i_gid = attr->ia_gid;
		mark_inode_dirty(inode);
		error = journal_end(&th, inode->i_sb, jbegin_count);
		if (error)
			goto out;
	}

	/*
	 * Relax the lock here, as it might truncate the
	 * inode pages and wait for inode pages locks.
	 * To release such page lock, the owner needs the
	 * reiserfs lock
	 */
	reiserfs_write_unlock_once(inode->i_sb, depth);
	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size != i_size_read(inode))
		error = vmtruncate(inode, attr->ia_size);

	if (!error) {
		setattr_copy(inode, attr);
		mark_inode_dirty(inode);
	}
	depth = reiserfs_write_lock_once(inode->i_sb);

	if (!error && reiserfs_posixacl(inode->i_sb)) {
		if (attr->ia_valid & ATTR_MODE)
			error = reiserfs_acl_chmod(inode);
	}

out:
	reiserfs_write_unlock_once(inode->i_sb, depth);

	return error;
}

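In the ATTR_SIZE branch above, inode_dio_wait() is the line this commit adds to reiserfs_setattr(): it runs with the reiserfs write lock already taken and before any truncation work. The snippet below is a minimal userspace model of the counter that call waits on, not the kernel implementation (the kernel tracks the count with atomics and a wait-queue bit; the dio_begin/dio_end names are invented for the sketch):

#include <pthread.h>
#include <stdio.h>

/* One inode's direct-I/O bookkeeping, modelled with a mutex and a
 * condition variable for clarity. */
struct dio_inode {
	pthread_mutex_t lock;
	pthread_cond_t drained;
	int i_dio_count;		/* in-flight direct I/O requests */
};

/* Submission takes a reference... */
static void dio_begin(struct dio_inode *inode)
{
	pthread_mutex_lock(&inode->lock);
	inode->i_dio_count++;
	pthread_mutex_unlock(&inode->lock);
}

/* ...completion drops it and wakes any waiting truncate. */
static void dio_end(struct dio_inode *inode)
{
	pthread_mutex_lock(&inode->lock);
	if (--inode->i_dio_count == 0)
		pthread_cond_broadcast(&inode->drained);
	pthread_mutex_unlock(&inode->lock);
}

/* What inode_dio_wait() does conceptually: block until every
 * in-flight direct I/O request has drained. */
static void dio_wait(struct dio_inode *inode)
{
	pthread_mutex_lock(&inode->lock);
	while (inode->i_dio_count > 0)
		pthread_cond_wait(&inode->drained, &inode->lock);
	pthread_mutex_unlock(&inode->lock);
}

int main(void)
{
	struct dio_inode inode = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
	};

	dio_begin(&inode);	/* a direct write is submitted */
	dio_end(&inode);	/* ...and completes            */
	dio_wait(&inode);	/* a truncate can now proceed  */
	printf("drained, i_dio_count = %d\n", inode.i_dio_count);
	return 0;
}
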
const struct address_space_operations reiserfs_address_space_operations = {
	.writepage = reiserfs_writepage,
	.readpage = reiserfs_readpage,
	.readpages = reiserfs_readpages,
	.releasepage = reiserfs_releasepage,
	.invalidatepage = reiserfs_invalidatepage,
	.write_begin = reiserfs_write_begin,
	.write_end = reiserfs_write_end,
	.bmap = reiserfs_aop_bmap,
	.direct_IO = reiserfs_direct_IO,
	.set_page_dirty = reiserfs_set_page_dirty,
};
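For completeness, a filesystem makes a table like this take effect by pointing each inode's mapping at it. A one-line sketch of that wiring follows; the surrounding inode-initialization function is not shown, so this fragment is illustrative rather than buildable on its own:

	/* sketch: done in reiserfs's inode setup path */
	inode->i_mapping->a_ops = &reiserfs_address_space_operations;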