Commit 53013cba4118a5cfe8f7c7ea5e5bc1c48b160f76
1 parent
0c056c50a6
Exists in
master
and in
20 other branches
ocfs2: take data locks around extend
We need to take a data lock around extends to protect the pages that ocfs2_zero_extend is going to be pulling into the page cache. Otherwise an extend on one node might populate the page cache with data pages that have no lock coverage. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Showing 3 changed files with 87 additions and 33 deletions Inline Diff
fs/ocfs2/aops.c
1 | /* -*- mode: c; c-basic-offset: 8; -*- | 1 | /* -*- mode: c; c-basic-offset: 8; -*- |
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | 2 | * vim: noexpandtab sw=8 ts=8 sts=0: |
3 | * | 3 | * |
4 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | 4 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or | 6 | * This program is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU General Public | 7 | * modify it under the terms of the GNU General Public |
8 | * License as published by the Free Software Foundation; either | 8 | * License as published by the Free Software Foundation; either |
9 | * version 2 of the License, or (at your option) any later version. | 9 | * version 2 of the License, or (at your option) any later version. |
10 | * | 10 | * |
11 | * This program is distributed in the hope that it will be useful, | 11 | * This program is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | * General Public License for more details. | 14 | * General Public License for more details. |
15 | * | 15 | * |
16 | * You should have received a copy of the GNU General Public | 16 | * You should have received a copy of the GNU General Public |
17 | * License along with this program; if not, write to the | 17 | * License along with this program; if not, write to the |
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
19 | * Boston, MA 021110-1307, USA. | 19 | * Boston, MA 021110-1307, USA. |
20 | */ | 20 | */ |
21 | 21 | ||
22 | #include <linux/fs.h> | 22 | #include <linux/fs.h> |
23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
24 | #include <linux/highmem.h> | 24 | #include <linux/highmem.h> |
25 | #include <linux/pagemap.h> | 25 | #include <linux/pagemap.h> |
26 | #include <asm/byteorder.h> | 26 | #include <asm/byteorder.h> |
27 | 27 | ||
28 | #define MLOG_MASK_PREFIX ML_FILE_IO | 28 | #define MLOG_MASK_PREFIX ML_FILE_IO |
29 | #include <cluster/masklog.h> | 29 | #include <cluster/masklog.h> |
30 | 30 | ||
31 | #include "ocfs2.h" | 31 | #include "ocfs2.h" |
32 | 32 | ||
33 | #include "alloc.h" | 33 | #include "alloc.h" |
34 | #include "aops.h" | 34 | #include "aops.h" |
35 | #include "dlmglue.h" | 35 | #include "dlmglue.h" |
36 | #include "extent_map.h" | 36 | #include "extent_map.h" |
37 | #include "file.h" | 37 | #include "file.h" |
38 | #include "inode.h" | 38 | #include "inode.h" |
39 | #include "journal.h" | 39 | #include "journal.h" |
40 | #include "super.h" | 40 | #include "super.h" |
41 | #include "symlink.h" | 41 | #include "symlink.h" |
42 | 42 | ||
43 | #include "buffer_head_io.h" | 43 | #include "buffer_head_io.h" |
44 | 44 | ||
/*
 * get_block callback for (non-fast) symlinks.
 *
 * Maps @iblock of the symlink inode to its on-disk block and, for a
 * freshly created inode, copies any journaled data out of the buffer
 * cache into the page cache buffer, since symlink data is not written
 * through the page cache.
 *
 * Returns 0 on success, -EIO on any failure.
 */
static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
				   struct buffer_head *bh_result, int create)
{
	int err = -EIO;
	int status;
	struct ocfs2_dinode *fe = NULL;
	struct buffer_head *bh = NULL;
	struct buffer_head *buffer_cache_bh = NULL;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	void *kaddr;

	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
		   (unsigned long long)iblock, bh_result, create);

	/* Fast symlinks keep their target inline in the inode and must
	 * never reach this path. */
	BUG_ON(ocfs2_inode_is_fast_symlink(inode));

	/* A symlink target can never exceed PATH_MAX, so a block offset
	 * beyond that indicates corruption or a bad request. */
	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
		mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
		     (unsigned long long)iblock);
		goto bail;
	}

	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
				  OCFS2_I(inode)->ip_blkno,
				  &bh, OCFS2_BH_CACHED, inode);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	fe = (struct ocfs2_dinode *) bh->b_data;

	if (!OCFS2_IS_VALID_DINODE(fe)) {
		mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
		     (unsigned long long)fe->i_blkno, 7, fe->i_signature);
		goto bail;
	}

	/* Requested block must lie within the inode's allocation. */
	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
						    le32_to_cpu(fe->i_clusters))) {
		mlog(ML_ERROR, "block offset is outside the allocated size: "
		     "%llu\n", (unsigned long long)iblock);
		goto bail;
	}

	/* We don't use the page cache to create symlink data, so if
	 * need be, copy it over from the buffer cache. */
	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
		u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
			    iblock;
		buffer_cache_bh = sb_getblk(osb->sb, blkno);
		if (!buffer_cache_bh) {
			mlog(ML_ERROR, "couldn't getblock for symlink!\n");
			goto bail;
		}

		/* we haven't locked out transactions, so a commit
		 * could've happened. Since we've got a reference on
		 * the bh, even if it commits while we're doing the
		 * copy, the data is still good. */
		if (buffer_jbd(buffer_cache_bh)
		    && ocfs2_inode_is_new(inode)) {
			kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
			if (!kaddr) {
				mlog(ML_ERROR, "couldn't kmap!\n");
				goto bail;
			}
			memcpy(kaddr + (bh_result->b_size * iblock),
			       buffer_cache_bh->b_data,
			       bh_result->b_size);
			kunmap_atomic(kaddr, KM_USER0);
			set_buffer_uptodate(bh_result);
		}
		brelse(buffer_cache_bh);
	}

	map_bh(bh_result, inode->i_sb,
	       le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);

	err = 0;

bail:
	if (bh)
		brelse(bh);

	mlog_exit(err);
	return err;
}
132 | 132 | ||
/*
 * Generic get_block callback for regular files: translate a logical
 * block @iblock into its physical block for buffered I/O.
 *
 * Caller is expected to hold the appropriate cluster locks; this
 * function only takes ip_lock to sample ip_clusters safely.
 *
 * Returns 0 on success; any failure is collapsed to -EIO before
 * returning (the block layer only cares that mapping failed).
 */
static int ocfs2_get_block(struct inode *inode, sector_t iblock,
			   struct buffer_head *bh_result, int create)
{
	int err = 0;
	u64 p_blkno, past_eof;

	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
		   (unsigned long long)iblock, bh_result, create);

	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
		     inode, inode->i_ino);

	if (S_ISLNK(inode->i_mode)) {
		/* this always does I/O for some reason. */
		err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
		goto bail;
	}

	/* this can happen if another node truncs after our extend! */
	spin_lock(&OCFS2_I(inode)->ip_lock);
	if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
					       OCFS2_I(inode)->ip_clusters))
		err = -EIO;
	spin_unlock(&OCFS2_I(inode)->ip_lock);
	if (err)
		goto bail;

	err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
					  NULL);
	if (err) {
		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
		     "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
		     (unsigned long long)p_blkno);
		goto bail;
	}

	map_bh(bh_result, inode->i_sb, p_blkno);

	/* Block 0 is never a valid data block for a file; flag it. */
	if (bh_result->b_blocknr == 0) {
		err = -EIO;
		mlog(ML_ERROR, "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
		     (unsigned long long)iblock,
		     (unsigned long long)p_blkno,
		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
	}

	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
	mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
	     (unsigned long long)past_eof);

	/* Blocks created beyond the current EOF are new to the caller. */
	if (create && (iblock >= past_eof))
		set_buffer_new(bh_result);

bail:
	if (err < 0)
		err = -EIO;

	mlog_exit(err);
	return err;
}
194 | 194 | ||
/*
 * ->readpage for ocfs2. Lock ordering here is significant:
 * meta cluster lock -> ip_alloc_sem (read) -> data cluster lock,
 * all taken before block_read_full_page() is allowed to map blocks.
 * The *_with_page variants may return AOP_TRUNCATED_PAGE, in which
 * case the page has already been unlocked for us.
 */
static int ocfs2_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
	int ret, unlock = 1;

	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));

	ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
	if (ret != 0) {
		if (ret == AOP_TRUNCATED_PAGE)
			unlock = 0;
		mlog_errno(ret);
		goto out;
	}

	down_read(&OCFS2_I(inode)->ip_alloc_sem);

	/*
	 * i_size might have just been updated as we grabbed the meta lock. We
	 * might now be discovering a truncate that hit on another node.
	 * block_read_full_page->get_block freaks out if it is asked to read
	 * beyond the end of a file, so we check here. Callers
	 * (generic_file_read, fault->nopage) are clever enough to check i_size
	 * and notice that the page they just read isn't needed.
	 *
	 * XXX sys_readahead() seems to get that wrong?
	 */
	if (start >= i_size_read(inode)) {
		/* Past EOF: hand back a zeroed, uptodate page. */
		char *addr = kmap(page);
		memset(addr, 0, PAGE_SIZE);
		flush_dcache_page(page);
		kunmap(page);
		SetPageUptodate(page);
		ret = 0;
		goto out_alloc;
	}

	ret = ocfs2_data_lock_with_page(inode, 0, page);
	if (ret != 0) {
		if (ret == AOP_TRUNCATED_PAGE)
			unlock = 0;
		mlog_errno(ret);
		goto out_alloc;
	}

	/* block_read_full_page() unlocks the page for us on completion. */
	ret = block_read_full_page(page, ocfs2_get_block);
	unlock = 0;

	ocfs2_data_unlock(inode, 0);
out_alloc:
	up_read(&OCFS2_I(inode)->ip_alloc_sem);
	ocfs2_meta_unlock(inode, 0);
out:
	if (unlock)
		unlock_page(page);
	mlog_exit(ret);
	return ret;
}
254 | 254 | ||
255 | /* Note: Because we don't support holes, our allocation has | 255 | /* Note: Because we don't support holes, our allocation has |
256 | * already happened (allocation writes zeros to the file data) | 256 | * already happened (allocation writes zeros to the file data) |
257 | * so we don't have to worry about ordered writes in | 257 | * so we don't have to worry about ordered writes in |
258 | * ocfs2_writepage. | 258 | * ocfs2_writepage. |
259 | * | 259 | * |
260 | * ->writepage is called during the process of invalidating the page cache | 260 | * ->writepage is called during the process of invalidating the page cache |
261 | * during blocked lock processing. It can't block on any cluster locks | 261 | * during blocked lock processing. It can't block on any cluster locks |
262 | * to during block mapping. It's relying on the fact that the block | 262 | * to during block mapping. It's relying on the fact that the block |
263 | * mapping can't have disappeared under the dirty pages that it is | 263 | * mapping can't have disappeared under the dirty pages that it is |
264 | * being asked to write back. | 264 | * being asked to write back. |
265 | */ | 265 | */ |
266 | static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) | 266 | static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) |
267 | { | 267 | { |
268 | int ret; | 268 | int ret; |
269 | 269 | ||
270 | mlog_entry("(0x%p)\n", page); | 270 | mlog_entry("(0x%p)\n", page); |
271 | 271 | ||
272 | ret = block_write_full_page(page, ocfs2_get_block, wbc); | 272 | ret = block_write_full_page(page, ocfs2_get_block, wbc); |
273 | 273 | ||
274 | mlog_exit(ret); | 274 | mlog_exit(ret); |
275 | 275 | ||
276 | return ret; | 276 | return ret; |
277 | } | 277 | } |
278 | 278 | ||
279 | /* This can also be called from ocfs2_write_zero_page() which has done | ||
280 | * it's own cluster locking. */ | ||
281 | int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, | ||
282 | unsigned from, unsigned to) | ||
283 | { | ||
284 | int ret; | ||
285 | |||
286 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
287 | |||
288 | ret = block_prepare_write(page, from, to, ocfs2_get_block); | ||
289 | |||
290 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
291 | |||
292 | return ret; | ||
293 | } | ||
294 | |||
279 | /* | 295 | /* |
280 | * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called | 296 | * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called |
281 | * from loopback. It must be able to perform its own locking around | 297 | * from loopback. It must be able to perform its own locking around |
282 | * ocfs2_get_block(). | 298 | * ocfs2_get_block(). |
283 | */ | 299 | */ |
284 | int ocfs2_prepare_write(struct file *file, struct page *page, | 300 | static int ocfs2_prepare_write(struct file *file, struct page *page, |
285 | unsigned from, unsigned to) | 301 | unsigned from, unsigned to) |
286 | { | 302 | { |
287 | struct inode *inode = page->mapping->host; | 303 | struct inode *inode = page->mapping->host; |
288 | int ret; | 304 | int ret; |
289 | 305 | ||
290 | mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); | 306 | mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); |
291 | 307 | ||
292 | ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page); | 308 | ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page); |
293 | if (ret != 0) { | 309 | if (ret != 0) { |
294 | mlog_errno(ret); | 310 | mlog_errno(ret); |
295 | goto out; | 311 | goto out; |
296 | } | 312 | } |
297 | 313 | ||
298 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | 314 | ret = ocfs2_prepare_write_nolock(inode, page, from, to); |
299 | 315 | ||
300 | ret = block_prepare_write(page, from, to, ocfs2_get_block); | ||
301 | |||
302 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
303 | |||
304 | ocfs2_meta_unlock(inode, 0); | 316 | ocfs2_meta_unlock(inode, 0); |
305 | out: | 317 | out: |
306 | mlog_exit(ret); | 318 | mlog_exit(ret); |
307 | return ret; | 319 | return ret; |
308 | } | 320 | } |
309 | 321 | ||
/* Taken from ext3. We don't necessarily need the full blown
 * functionality yet, but IMHO it's better to cut and paste the whole
 * thing so we can avoid introducing our own bugs (and easily pick up
 * their fixes when they happen) --Mark */
/*
 * Apply @fn to every buffer of @head's page that overlaps [from, to).
 * Buffers outside the range are skipped; if any skipped buffer is not
 * uptodate and @partial is non-NULL, *partial is set to 1. Returns the
 * first non-zero value @fn produced, or 0.
 */
static int walk_page_buffers(	handle_t *handle,
				struct buffer_head *head,
				unsigned from,
				unsigned to,
				int *partial,
				int (*fn)(	handle_t *handle,
						struct buffer_head *bh))
{
	struct buffer_head *bh;
	unsigned block_start, block_end;
	unsigned blocksize = head->b_size;
	int err, ret = 0;
	struct buffer_head *next;

	for (	bh = head, block_start = 0;
		ret == 0 && (bh != head || !block_start);
		block_start = block_end, bh = next)
	{
		next = bh->b_this_page;
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (partial && !buffer_uptodate(bh))
				*partial = 1;
			continue;
		}
		err = (*fn)(handle, bh);
		if (!ret)
			ret = err;
	}
	return ret;
}
345 | 357 | ||
/*
 * Start a journal transaction for an inode-update sized write on @page
 * and, when the inode requires ordered data mode, mark the page's
 * buffers in [from, to) dirty against the journal.
 *
 * Returns the started handle on success, or an ERR_PTR on failure (in
 * which case any started transaction has already been committed).
 * Caller owns the returned handle and must commit it.
 */
struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
							 struct page *page,
							 unsigned from,
							 unsigned to)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_journal_handle *handle = NULL;
	int ret = 0;

	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
	if (!handle) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	if (ocfs2_should_order_data(inode)) {
		ret = walk_page_buffers(handle->k_handle,
					page_buffers(page),
					from, to, NULL,
					ocfs2_journal_dirty_data);
		if (ret < 0)
			mlog_errno(ret);
	}
out:
	if (ret) {
		/* Convert the error to the ERR_PTR contract, cleaning up
		 * any transaction we managed to start. */
		if (handle)
			ocfs2_commit_trans(handle);
		handle = ERR_PTR(ret);
	}
	return handle;
}
378 | 390 | ||
/*
 * ->commit_write for ocfs2. For an extending write we take the meta
 * lock exclusively (locklevel 1), journal the dinode, and push the new
 * i_size/timestamps to disk inside the transaction. Lock ordering:
 * meta cluster lock -> data cluster lock -> journal transaction.
 */
static int ocfs2_commit_write(struct file *file, struct page *page,
			      unsigned from, unsigned to)
{
	int ret, extending = 0, locklevel = 0;
	loff_t new_i_size;
	struct buffer_head *di_bh = NULL;
	struct inode *inode = page->mapping->host;
	struct ocfs2_journal_handle *handle = NULL;

	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);

	/* NOTE: ocfs2_file_aio_write has ensured that it's safe for
	 * us to sample inode->i_size here without the metadata lock:
	 *
	 * 1) We're currently holding the inode alloc lock, so no
	 *    nodes can change it underneath us.
	 *
	 * 2) We've had to take the metadata lock at least once
	 *    already to check for extending writes, hence ensuring
	 *    that our current copy is also up to date.
	 */
	new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
	if (new_i_size > i_size_read(inode)) {
		extending = 1;
		locklevel = 1;
	}

	ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page);
	if (ret != 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_data_lock_with_page(inode, 1, page);
	if (ret != 0) {
		mlog_errno(ret);
		goto out_unlock_meta;
	}

	if (extending) {
		handle = ocfs2_start_walk_page_trans(inode, page, from, to);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			handle = NULL;
			goto out_unlock_data;
		}

		/* Mark our buffer early. We'd rather catch this error up here
		 * as opposed to after a successful commit_write which would
		 * require us to set back inode->i_size. */
		ret = ocfs2_journal_access(handle, inode, di_bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret < 0) {
			mlog_errno(ret);
			goto out_commit;
		}
	}

	/* might update i_size */
	ret = generic_commit_write(file, page, from, to);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_commit;
	}

	if (extending) {
		loff_t size = (u64) i_size_read(inode);
		struct ocfs2_dinode *di =
			(struct ocfs2_dinode *)di_bh->b_data;

		/* ocfs2_mark_inode_dirty is too heavy to use here. */
		inode->i_blocks = ocfs2_align_bytes_to_sectors(size);
		inode->i_ctime = inode->i_mtime = CURRENT_TIME;

		di->i_size = cpu_to_le64(size);
		di->i_ctime = di->i_mtime =
			cpu_to_le64(inode->i_mtime.tv_sec);
		di->i_ctime_nsec = di->i_mtime_nsec =
			cpu_to_le32(inode->i_mtime.tv_nsec);

		ret = ocfs2_journal_dirty(handle, di_bh);
		if (ret < 0) {
			mlog_errno(ret);
			goto out_commit;
		}
	}

	/* generic_commit_write() must have grown i_size exactly to
	 * new_i_size on an extending write. */
	BUG_ON(extending && (i_size_read(inode) != new_i_size));

out_commit:
	if (handle)
		ocfs2_commit_trans(handle);
out_unlock_data:
	ocfs2_data_unlock(inode, 1);
out_unlock_meta:
	ocfs2_meta_unlock(inode, locklevel);
out:
	if (di_bh)
		brelse(di_bh);

	mlog_exit(ret);
	return ret;
}
482 | 494 | ||
483 | static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) | 495 | static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) |
484 | { | 496 | { |
485 | sector_t status; | 497 | sector_t status; |
486 | u64 p_blkno = 0; | 498 | u64 p_blkno = 0; |
487 | int err = 0; | 499 | int err = 0; |
488 | struct inode *inode = mapping->host; | 500 | struct inode *inode = mapping->host; |
489 | 501 | ||
490 | mlog_entry("(block = %llu)\n", (unsigned long long)block); | 502 | mlog_entry("(block = %llu)\n", (unsigned long long)block); |
491 | 503 | ||
492 | /* We don't need to lock journal system files, since they aren't | 504 | /* We don't need to lock journal system files, since they aren't |
493 | * accessed concurrently from multiple nodes. | 505 | * accessed concurrently from multiple nodes. |
494 | */ | 506 | */ |
495 | if (!INODE_JOURNAL(inode)) { | 507 | if (!INODE_JOURNAL(inode)) { |
496 | err = ocfs2_meta_lock(inode, NULL, NULL, 0); | 508 | err = ocfs2_meta_lock(inode, NULL, NULL, 0); |
497 | if (err) { | 509 | if (err) { |
498 | if (err != -ENOENT) | 510 | if (err != -ENOENT) |
499 | mlog_errno(err); | 511 | mlog_errno(err); |
500 | goto bail; | 512 | goto bail; |
501 | } | 513 | } |
502 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | 514 | down_read(&OCFS2_I(inode)->ip_alloc_sem); |
503 | } | 515 | } |
504 | 516 | ||
505 | err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno, | 517 | err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno, |
506 | NULL); | 518 | NULL); |
507 | 519 | ||
508 | if (!INODE_JOURNAL(inode)) { | 520 | if (!INODE_JOURNAL(inode)) { |
509 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | 521 | up_read(&OCFS2_I(inode)->ip_alloc_sem); |
510 | ocfs2_meta_unlock(inode, 0); | 522 | ocfs2_meta_unlock(inode, 0); |
511 | } | 523 | } |
512 | 524 | ||
513 | if (err) { | 525 | if (err) { |
514 | mlog(ML_ERROR, "get_blocks() failed, block = %llu\n", | 526 | mlog(ML_ERROR, "get_blocks() failed, block = %llu\n", |
515 | (unsigned long long)block); | 527 | (unsigned long long)block); |
516 | mlog_errno(err); | 528 | mlog_errno(err); |
517 | goto bail; | 529 | goto bail; |
518 | } | 530 | } |
519 | 531 | ||
520 | 532 | ||
521 | bail: | 533 | bail: |
522 | status = err ? 0 : p_blkno; | 534 | status = err ? 0 : p_blkno; |
523 | 535 | ||
524 | mlog_exit((int)status); | 536 | mlog_exit((int)status); |
525 | 537 | ||
526 | return status; | 538 | return status; |
527 | } | 539 | } |
528 | 540 | ||
529 | /* | 541 | /* |
530 | * TODO: Make this into a generic get_blocks function. | 542 | * TODO: Make this into a generic get_blocks function. |
531 | * | 543 | * |
532 | * From do_direct_io in direct-io.c: | 544 | * From do_direct_io in direct-io.c: |
533 | * "So what we do is to permit the ->get_blocks function to populate | 545 | * "So what we do is to permit the ->get_blocks function to populate |
534 | * bh.b_size with the size of IO which is permitted at this offset and | 546 | * bh.b_size with the size of IO which is permitted at this offset and |
535 | * this i_blkbits." | 547 | * this i_blkbits." |
536 | * | 548 | * |
537 | * This function is called directly from get_more_blocks in direct-io.c. | 549 | * This function is called directly from get_more_blocks in direct-io.c. |
538 | * | 550 | * |
539 | * called like this: dio->get_blocks(dio->inode, fs_startblk, | 551 | * called like this: dio->get_blocks(dio->inode, fs_startblk, |
540 | * fs_count, map_bh, dio->rw == WRITE); | 552 | * fs_count, map_bh, dio->rw == WRITE); |
541 | */ | 553 | */ |
542 | static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | 554 | static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, |
543 | struct buffer_head *bh_result, int create) | 555 | struct buffer_head *bh_result, int create) |
544 | { | 556 | { |
545 | int ret; | 557 | int ret; |
546 | u64 vbo_max; /* file offset, max_blocks from iblock */ | 558 | u64 vbo_max; /* file offset, max_blocks from iblock */ |
547 | u64 p_blkno; | 559 | u64 p_blkno; |
548 | int contig_blocks; | 560 | int contig_blocks; |
549 | unsigned char blocksize_bits; | 561 | unsigned char blocksize_bits; |
550 | unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; | 562 | unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; |
551 | 563 | ||
552 | if (!inode || !bh_result) { | 564 | if (!inode || !bh_result) { |
553 | mlog(ML_ERROR, "inode or bh_result is null\n"); | 565 | mlog(ML_ERROR, "inode or bh_result is null\n"); |
554 | return -EIO; | 566 | return -EIO; |
555 | } | 567 | } |
556 | 568 | ||
557 | blocksize_bits = inode->i_sb->s_blocksize_bits; | 569 | blocksize_bits = inode->i_sb->s_blocksize_bits; |
558 | 570 | ||
559 | /* This function won't even be called if the request isn't all | 571 | /* This function won't even be called if the request isn't all |
560 | * nicely aligned and of the right size, so there's no need | 572 | * nicely aligned and of the right size, so there's no need |
561 | * for us to check any of that. */ | 573 | * for us to check any of that. */ |
562 | 574 | ||
563 | vbo_max = ((u64)iblock + max_blocks) << blocksize_bits; | 575 | vbo_max = ((u64)iblock + max_blocks) << blocksize_bits; |
564 | 576 | ||
565 | spin_lock(&OCFS2_I(inode)->ip_lock); | 577 | spin_lock(&OCFS2_I(inode)->ip_lock); |
566 | if ((iblock + max_blocks) > | 578 | if ((iblock + max_blocks) > |
567 | ocfs2_clusters_to_blocks(inode->i_sb, | 579 | ocfs2_clusters_to_blocks(inode->i_sb, |
568 | OCFS2_I(inode)->ip_clusters)) { | 580 | OCFS2_I(inode)->ip_clusters)) { |
569 | spin_unlock(&OCFS2_I(inode)->ip_lock); | 581 | spin_unlock(&OCFS2_I(inode)->ip_lock); |
570 | ret = -EIO; | 582 | ret = -EIO; |
571 | goto bail; | 583 | goto bail; |
572 | } | 584 | } |
573 | spin_unlock(&OCFS2_I(inode)->ip_lock); | 585 | spin_unlock(&OCFS2_I(inode)->ip_lock); |
574 | 586 | ||
575 | /* This figures out the size of the next contiguous block, and | 587 | /* This figures out the size of the next contiguous block, and |
576 | * our logical offset */ | 588 | * our logical offset */ |
577 | ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, | 589 | ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, |
578 | &contig_blocks); | 590 | &contig_blocks); |
579 | if (ret) { | 591 | if (ret) { |
580 | mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", | 592 | mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", |
581 | (unsigned long long)iblock); | 593 | (unsigned long long)iblock); |
582 | ret = -EIO; | 594 | ret = -EIO; |
583 | goto bail; | 595 | goto bail; |
584 | } | 596 | } |
585 | 597 | ||
586 | map_bh(bh_result, inode->i_sb, p_blkno); | 598 | map_bh(bh_result, inode->i_sb, p_blkno); |
587 | 599 | ||
588 | /* make sure we don't map more than max_blocks blocks here as | 600 | /* make sure we don't map more than max_blocks blocks here as |
589 | that's all the kernel will handle at this point. */ | 601 | that's all the kernel will handle at this point. */ |
590 | if (max_blocks < contig_blocks) | 602 | if (max_blocks < contig_blocks) |
591 | contig_blocks = max_blocks; | 603 | contig_blocks = max_blocks; |
592 | bh_result->b_size = contig_blocks << blocksize_bits; | 604 | bh_result->b_size = contig_blocks << blocksize_bits; |
593 | bail: | 605 | bail: |
594 | return ret; | 606 | return ret; |
595 | } | 607 | } |
596 | 608 | ||
597 | /* | 609 | /* |
598 | * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're | 610 | * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're |
599 | * particularly interested in the aio/dio case. Like the core uses | 611 | * particularly interested in the aio/dio case. Like the core uses |
600 | * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from | 612 | * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from |
601 | * truncation on another. | 613 | * truncation on another. |
602 | */ | 614 | */ |
603 | static void ocfs2_dio_end_io(struct kiocb *iocb, | 615 | static void ocfs2_dio_end_io(struct kiocb *iocb, |
604 | loff_t offset, | 616 | loff_t offset, |
605 | ssize_t bytes, | 617 | ssize_t bytes, |
606 | void *private) | 618 | void *private) |
607 | { | 619 | { |
608 | struct inode *inode = iocb->ki_filp->f_dentry->d_inode; | 620 | struct inode *inode = iocb->ki_filp->f_dentry->d_inode; |
609 | 621 | ||
610 | /* this io's submitter should not have unlocked this before we could */ | 622 | /* this io's submitter should not have unlocked this before we could */ |
611 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); | 623 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); |
612 | ocfs2_iocb_clear_rw_locked(iocb); | 624 | ocfs2_iocb_clear_rw_locked(iocb); |
613 | up_read(&inode->i_alloc_sem); | 625 | up_read(&inode->i_alloc_sem); |
614 | ocfs2_rw_unlock(inode, 0); | 626 | ocfs2_rw_unlock(inode, 0); |
615 | } | 627 | } |
616 | 628 | ||
617 | static ssize_t ocfs2_direct_IO(int rw, | 629 | static ssize_t ocfs2_direct_IO(int rw, |
618 | struct kiocb *iocb, | 630 | struct kiocb *iocb, |
619 | const struct iovec *iov, | 631 | const struct iovec *iov, |
620 | loff_t offset, | 632 | loff_t offset, |
621 | unsigned long nr_segs) | 633 | unsigned long nr_segs) |
622 | { | 634 | { |
623 | struct file *file = iocb->ki_filp; | 635 | struct file *file = iocb->ki_filp; |
624 | struct inode *inode = file->f_dentry->d_inode->i_mapping->host; | 636 | struct inode *inode = file->f_dentry->d_inode->i_mapping->host; |
625 | int ret; | 637 | int ret; |
626 | 638 | ||
627 | mlog_entry_void(); | 639 | mlog_entry_void(); |
640 | |||
641 | /* | ||
642 | * We get PR data locks even for O_DIRECT. This allows | ||
643 | * concurrent O_DIRECT I/O but doesn't let O_DIRECT with | ||
644 | * extending and buffered zeroing writes race. If they did | ||
645 | * race then the buffered zeroing could be written back after | ||
646 | * the O_DIRECT I/O. It's one thing to tell people not to mix | ||
647 | * buffered and O_DIRECT writes, but expecting them to | ||
648 | * understand that file extension is also an implicit buffered | ||
649 | * write is too much. By getting the PR we force writeback of | ||
650 | * the buffered zeroing before proceeding. | ||
651 | */ | ||
652 | ret = ocfs2_data_lock(inode, 0); | ||
653 | if (ret < 0) { | ||
654 | mlog_errno(ret); | ||
655 | goto out; | ||
656 | } | ||
657 | ocfs2_data_unlock(inode, 0); | ||
658 | |||
628 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, | 659 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, |
629 | inode->i_sb->s_bdev, iov, offset, | 660 | inode->i_sb->s_bdev, iov, offset, |
630 | nr_segs, | 661 | nr_segs, |
631 | ocfs2_direct_IO_get_blocks, | 662 | ocfs2_direct_IO_get_blocks, |
632 | ocfs2_dio_end_io); | 663 | ocfs2_dio_end_io); |
664 | out: | ||
633 | mlog_exit(ret); | 665 | mlog_exit(ret); |
634 | return ret; | 666 | return ret; |
635 | } | 667 | } |
636 | 668 | ||
637 | struct address_space_operations ocfs2_aops = { | 669 | struct address_space_operations ocfs2_aops = { |
638 | .readpage = ocfs2_readpage, | 670 | .readpage = ocfs2_readpage, |
639 | .writepage = ocfs2_writepage, | 671 | .writepage = ocfs2_writepage, |
640 | .prepare_write = ocfs2_prepare_write, | 672 | .prepare_write = ocfs2_prepare_write, |
641 | .commit_write = ocfs2_commit_write, | 673 | .commit_write = ocfs2_commit_write, |
642 | .bmap = ocfs2_bmap, | 674 | .bmap = ocfs2_bmap, |
fs/ocfs2/aops.h
1 | /* -*- mode: c; c-basic-offset: 8; -*- | 1 | /* -*- mode: c; c-basic-offset: 8; -*- |
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | 2 | * vim: noexpandtab sw=8 ts=8 sts=0: |
3 | * | 3 | * |
4 | * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. | 4 | * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or | 6 | * This program is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU General Public | 7 | * modify it under the terms of the GNU General Public |
8 | * License as published by the Free Software Foundation; either | 8 | * License as published by the Free Software Foundation; either |
9 | * version 2 of the License, or (at your option) any later version. | 9 | * version 2 of the License, or (at your option) any later version. |
10 | * | 10 | * |
11 | * This program is distributed in the hope that it will be useful, | 11 | * This program is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | * General Public License for more details. | 14 | * General Public License for more details. |
15 | * | 15 | * |
16 | * You should have received a copy of the GNU General Public | 16 | * You should have received a copy of the GNU General Public |
17 | * License along with this program; if not, write to the | 17 | * License along with this program; if not, write to the |
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
19 | * Boston, MA 021110-1307, USA. | 19 | * Boston, MA 021110-1307, USA. |
20 | */ | 20 | */ |
21 | 21 | ||
22 | #ifndef OCFS2_AOPS_H | 22 | #ifndef OCFS2_AOPS_H |
23 | #define OCFS2_AOPS_H | 23 | #define OCFS2_AOPS_H |
24 | 24 | ||
25 | int ocfs2_prepare_write(struct file *file, struct page *page, | 25 | int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, |
26 | unsigned from, unsigned to); | 26 | unsigned from, unsigned to); |
27 | 27 | ||
28 | struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode, | 28 | struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode, |
29 | struct page *page, | 29 | struct page *page, |
30 | unsigned from, | 30 | unsigned from, |
31 | unsigned to); | 31 | unsigned to); |
32 | 32 | ||
33 | /* all ocfs2_dio_end_io()'s fault */ | 33 | /* all ocfs2_dio_end_io()'s fault */ |
34 | #define ocfs2_iocb_is_rw_locked(iocb) \ | 34 | #define ocfs2_iocb_is_rw_locked(iocb) \ |
35 | test_bit(0, (unsigned long *)&iocb->private) | 35 | test_bit(0, (unsigned long *)&iocb->private) |
36 | #define ocfs2_iocb_set_rw_locked(iocb) \ | 36 | #define ocfs2_iocb_set_rw_locked(iocb) \ |
37 | set_bit(0, (unsigned long *)&iocb->private) | 37 | set_bit(0, (unsigned long *)&iocb->private) |
38 | #define ocfs2_iocb_clear_rw_locked(iocb) \ | 38 | #define ocfs2_iocb_clear_rw_locked(iocb) \ |
39 | clear_bit(0, (unsigned long *)&iocb->private) | 39 | clear_bit(0, (unsigned long *)&iocb->private) |
40 | 40 | ||
41 | #endif /* OCFS2_FILE_H */ | 41 | #endif /* OCFS2_FILE_H */ |
42 | 42 |
fs/ocfs2/file.c
1 | /* -*- mode: c; c-basic-offset: 8; -*- | 1 | /* -*- mode: c; c-basic-offset: 8; -*- |
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | 2 | * vim: noexpandtab sw=8 ts=8 sts=0: |
3 | * | 3 | * |
4 | * file.c | 4 | * file.c |
5 | * | 5 | * |
6 | * File open, close, extend, truncate | 6 | * File open, close, extend, truncate |
7 | * | 7 | * |
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | 8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. |
9 | * | 9 | * |
10 | * This program is free software; you can redistribute it and/or | 10 | * This program is free software; you can redistribute it and/or |
11 | * modify it under the terms of the GNU General Public | 11 | * modify it under the terms of the GNU General Public |
12 | * License as published by the Free Software Foundation; either | 12 | * License as published by the Free Software Foundation; either |
13 | * version 2 of the License, or (at your option) any later version. | 13 | * version 2 of the License, or (at your option) any later version. |
14 | * | 14 | * |
15 | * This program is distributed in the hope that it will be useful, | 15 | * This program is distributed in the hope that it will be useful, |
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
18 | * General Public License for more details. | 18 | * General Public License for more details. |
19 | * | 19 | * |
20 | * You should have received a copy of the GNU General Public | 20 | * You should have received a copy of the GNU General Public |
21 | * License along with this program; if not, write to the | 21 | * License along with this program; if not, write to the |
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | 22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
23 | * Boston, MA 021110-1307, USA. | 23 | * Boston, MA 021110-1307, USA. |
24 | */ | 24 | */ |
25 | 25 | ||
26 | #include <linux/capability.h> | 26 | #include <linux/capability.h> |
27 | #include <linux/fs.h> | 27 | #include <linux/fs.h> |
28 | #include <linux/types.h> | 28 | #include <linux/types.h> |
29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
30 | #include <linux/highmem.h> | 30 | #include <linux/highmem.h> |
31 | #include <linux/pagemap.h> | 31 | #include <linux/pagemap.h> |
32 | #include <linux/uio.h> | 32 | #include <linux/uio.h> |
33 | 33 | ||
34 | #define MLOG_MASK_PREFIX ML_INODE | 34 | #define MLOG_MASK_PREFIX ML_INODE |
35 | #include <cluster/masklog.h> | 35 | #include <cluster/masklog.h> |
36 | 36 | ||
37 | #include "ocfs2.h" | 37 | #include "ocfs2.h" |
38 | 38 | ||
39 | #include "alloc.h" | 39 | #include "alloc.h" |
40 | #include "aops.h" | 40 | #include "aops.h" |
41 | #include "dir.h" | 41 | #include "dir.h" |
42 | #include "dlmglue.h" | 42 | #include "dlmglue.h" |
43 | #include "extent_map.h" | 43 | #include "extent_map.h" |
44 | #include "file.h" | 44 | #include "file.h" |
45 | #include "sysfile.h" | 45 | #include "sysfile.h" |
46 | #include "inode.h" | 46 | #include "inode.h" |
47 | #include "journal.h" | 47 | #include "journal.h" |
48 | #include "mmap.h" | 48 | #include "mmap.h" |
49 | #include "suballoc.h" | 49 | #include "suballoc.h" |
50 | #include "super.h" | 50 | #include "super.h" |
51 | 51 | ||
52 | #include "buffer_head_io.h" | 52 | #include "buffer_head_io.h" |
53 | 53 | ||
54 | static int ocfs2_sync_inode(struct inode *inode) | 54 | static int ocfs2_sync_inode(struct inode *inode) |
55 | { | 55 | { |
56 | filemap_fdatawrite(inode->i_mapping); | 56 | filemap_fdatawrite(inode->i_mapping); |
57 | return sync_mapping_buffers(inode->i_mapping); | 57 | return sync_mapping_buffers(inode->i_mapping); |
58 | } | 58 | } |
59 | 59 | ||
60 | static int ocfs2_file_open(struct inode *inode, struct file *file) | 60 | static int ocfs2_file_open(struct inode *inode, struct file *file) |
61 | { | 61 | { |
62 | int status; | 62 | int status; |
63 | int mode = file->f_flags; | 63 | int mode = file->f_flags; |
64 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | 64 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
65 | 65 | ||
66 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, | 66 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, |
67 | file->f_dentry->d_name.len, file->f_dentry->d_name.name); | 67 | file->f_dentry->d_name.len, file->f_dentry->d_name.name); |
68 | 68 | ||
69 | spin_lock(&oi->ip_lock); | 69 | spin_lock(&oi->ip_lock); |
70 | 70 | ||
71 | /* Check that the inode hasn't been wiped from disk by another | 71 | /* Check that the inode hasn't been wiped from disk by another |
72 | * node. If it hasn't then we're safe as long as we hold the | 72 | * node. If it hasn't then we're safe as long as we hold the |
73 | * spin lock until our increment of open count. */ | 73 | * spin lock until our increment of open count. */ |
74 | if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { | 74 | if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { |
75 | spin_unlock(&oi->ip_lock); | 75 | spin_unlock(&oi->ip_lock); |
76 | 76 | ||
77 | status = -ENOENT; | 77 | status = -ENOENT; |
78 | goto leave; | 78 | goto leave; |
79 | } | 79 | } |
80 | 80 | ||
81 | if (mode & O_DIRECT) | 81 | if (mode & O_DIRECT) |
82 | oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; | 82 | oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; |
83 | 83 | ||
84 | oi->ip_open_count++; | 84 | oi->ip_open_count++; |
85 | spin_unlock(&oi->ip_lock); | 85 | spin_unlock(&oi->ip_lock); |
86 | status = 0; | 86 | status = 0; |
87 | leave: | 87 | leave: |
88 | mlog_exit(status); | 88 | mlog_exit(status); |
89 | return status; | 89 | return status; |
90 | } | 90 | } |
91 | 91 | ||
92 | static int ocfs2_file_release(struct inode *inode, struct file *file) | 92 | static int ocfs2_file_release(struct inode *inode, struct file *file) |
93 | { | 93 | { |
94 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | 94 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
95 | 95 | ||
96 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, | 96 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, |
97 | file->f_dentry->d_name.len, | 97 | file->f_dentry->d_name.len, |
98 | file->f_dentry->d_name.name); | 98 | file->f_dentry->d_name.name); |
99 | 99 | ||
100 | spin_lock(&oi->ip_lock); | 100 | spin_lock(&oi->ip_lock); |
101 | if (!--oi->ip_open_count) | 101 | if (!--oi->ip_open_count) |
102 | oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; | 102 | oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; |
103 | spin_unlock(&oi->ip_lock); | 103 | spin_unlock(&oi->ip_lock); |
104 | 104 | ||
105 | mlog_exit(0); | 105 | mlog_exit(0); |
106 | 106 | ||
107 | return 0; | 107 | return 0; |
108 | } | 108 | } |
109 | 109 | ||
110 | static int ocfs2_sync_file(struct file *file, | 110 | static int ocfs2_sync_file(struct file *file, |
111 | struct dentry *dentry, | 111 | struct dentry *dentry, |
112 | int datasync) | 112 | int datasync) |
113 | { | 113 | { |
114 | int err = 0; | 114 | int err = 0; |
115 | journal_t *journal; | 115 | journal_t *journal; |
116 | struct inode *inode = dentry->d_inode; | 116 | struct inode *inode = dentry->d_inode; |
117 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 117 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
118 | 118 | ||
119 | mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, | 119 | mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, |
120 | dentry->d_name.len, dentry->d_name.name); | 120 | dentry->d_name.len, dentry->d_name.name); |
121 | 121 | ||
122 | err = ocfs2_sync_inode(dentry->d_inode); | 122 | err = ocfs2_sync_inode(dentry->d_inode); |
123 | if (err) | 123 | if (err) |
124 | goto bail; | 124 | goto bail; |
125 | 125 | ||
126 | journal = osb->journal->j_journal; | 126 | journal = osb->journal->j_journal; |
127 | err = journal_force_commit(journal); | 127 | err = journal_force_commit(journal); |
128 | 128 | ||
129 | bail: | 129 | bail: |
130 | mlog_exit(err); | 130 | mlog_exit(err); |
131 | 131 | ||
132 | return (err < 0) ? -EIO : 0; | 132 | return (err < 0) ? -EIO : 0; |
133 | } | 133 | } |
134 | 134 | ||
135 | int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle, | 135 | int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle, |
136 | struct inode *inode, | 136 | struct inode *inode, |
137 | struct buffer_head *fe_bh, | 137 | struct buffer_head *fe_bh, |
138 | u64 new_i_size) | 138 | u64 new_i_size) |
139 | { | 139 | { |
140 | int status; | 140 | int status; |
141 | 141 | ||
142 | mlog_entry_void(); | 142 | mlog_entry_void(); |
143 | i_size_write(inode, new_i_size); | 143 | i_size_write(inode, new_i_size); |
144 | inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); | 144 | inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); |
145 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 145 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
146 | 146 | ||
147 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); | 147 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); |
148 | if (status < 0) { | 148 | if (status < 0) { |
149 | mlog_errno(status); | 149 | mlog_errno(status); |
150 | goto bail; | 150 | goto bail; |
151 | } | 151 | } |
152 | 152 | ||
153 | bail: | 153 | bail: |
154 | mlog_exit(status); | 154 | mlog_exit(status); |
155 | return status; | 155 | return status; |
156 | } | 156 | } |
157 | 157 | ||
158 | static int ocfs2_simple_size_update(struct inode *inode, | 158 | static int ocfs2_simple_size_update(struct inode *inode, |
159 | struct buffer_head *di_bh, | 159 | struct buffer_head *di_bh, |
160 | u64 new_i_size) | 160 | u64 new_i_size) |
161 | { | 161 | { |
162 | int ret; | 162 | int ret; |
163 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 163 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
164 | struct ocfs2_journal_handle *handle = NULL; | 164 | struct ocfs2_journal_handle *handle = NULL; |
165 | 165 | ||
166 | handle = ocfs2_start_trans(osb, NULL, | 166 | handle = ocfs2_start_trans(osb, NULL, |
167 | OCFS2_INODE_UPDATE_CREDITS); | 167 | OCFS2_INODE_UPDATE_CREDITS); |
168 | if (handle == NULL) { | 168 | if (handle == NULL) { |
169 | ret = -ENOMEM; | 169 | ret = -ENOMEM; |
170 | mlog_errno(ret); | 170 | mlog_errno(ret); |
171 | goto out; | 171 | goto out; |
172 | } | 172 | } |
173 | 173 | ||
174 | ret = ocfs2_set_inode_size(handle, inode, di_bh, | 174 | ret = ocfs2_set_inode_size(handle, inode, di_bh, |
175 | new_i_size); | 175 | new_i_size); |
176 | if (ret < 0) | 176 | if (ret < 0) |
177 | mlog_errno(ret); | 177 | mlog_errno(ret); |
178 | 178 | ||
179 | ocfs2_commit_trans(handle); | 179 | ocfs2_commit_trans(handle); |
180 | out: | 180 | out: |
181 | return ret; | 181 | return ret; |
182 | } | 182 | } |
183 | 183 | ||
184 | static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, | 184 | static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, |
185 | struct inode *inode, | 185 | struct inode *inode, |
186 | struct buffer_head *fe_bh, | 186 | struct buffer_head *fe_bh, |
187 | u64 new_i_size) | 187 | u64 new_i_size) |
188 | { | 188 | { |
189 | int status; | 189 | int status; |
190 | struct ocfs2_journal_handle *handle; | 190 | struct ocfs2_journal_handle *handle; |
191 | 191 | ||
192 | mlog_entry_void(); | 192 | mlog_entry_void(); |
193 | 193 | ||
194 | /* TODO: This needs to actually orphan the inode in this | 194 | /* TODO: This needs to actually orphan the inode in this |
195 | * transaction. */ | 195 | * transaction. */ |
196 | 196 | ||
197 | handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); | 197 | handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); |
198 | if (IS_ERR(handle)) { | 198 | if (IS_ERR(handle)) { |
199 | status = PTR_ERR(handle); | 199 | status = PTR_ERR(handle); |
200 | mlog_errno(status); | 200 | mlog_errno(status); |
201 | goto out; | 201 | goto out; |
202 | } | 202 | } |
203 | 203 | ||
204 | status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); | 204 | status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); |
205 | if (status < 0) | 205 | if (status < 0) |
206 | mlog_errno(status); | 206 | mlog_errno(status); |
207 | 207 | ||
208 | ocfs2_commit_trans(handle); | 208 | ocfs2_commit_trans(handle); |
209 | out: | 209 | out: |
210 | mlog_exit(status); | 210 | mlog_exit(status); |
211 | return status; | 211 | return status; |
212 | } | 212 | } |
213 | 213 | ||
214 | static int ocfs2_truncate_file(struct inode *inode, | 214 | static int ocfs2_truncate_file(struct inode *inode, |
215 | struct buffer_head *di_bh, | 215 | struct buffer_head *di_bh, |
216 | u64 new_i_size) | 216 | u64 new_i_size) |
217 | { | 217 | { |
218 | int status = 0; | 218 | int status = 0; |
219 | struct ocfs2_dinode *fe = NULL; | 219 | struct ocfs2_dinode *fe = NULL; |
220 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 220 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
221 | struct ocfs2_truncate_context *tc = NULL; | 221 | struct ocfs2_truncate_context *tc = NULL; |
222 | 222 | ||
223 | mlog_entry("(inode = %llu, new_i_size = %llu\n", | 223 | mlog_entry("(inode = %llu, new_i_size = %llu\n", |
224 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 224 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
225 | (unsigned long long)new_i_size); | 225 | (unsigned long long)new_i_size); |
226 | 226 | ||
227 | truncate_inode_pages(inode->i_mapping, new_i_size); | 227 | truncate_inode_pages(inode->i_mapping, new_i_size); |
228 | 228 | ||
229 | fe = (struct ocfs2_dinode *) di_bh->b_data; | 229 | fe = (struct ocfs2_dinode *) di_bh->b_data; |
230 | if (!OCFS2_IS_VALID_DINODE(fe)) { | 230 | if (!OCFS2_IS_VALID_DINODE(fe)) { |
231 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); | 231 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); |
232 | status = -EIO; | 232 | status = -EIO; |
233 | goto bail; | 233 | goto bail; |
234 | } | 234 | } |
235 | 235 | ||
236 | mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), | 236 | mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), |
237 | "Inode %llu, inode i_size = %lld != di " | 237 | "Inode %llu, inode i_size = %lld != di " |
238 | "i_size = %llu, i_flags = 0x%x\n", | 238 | "i_size = %llu, i_flags = 0x%x\n", |
239 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 239 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
240 | i_size_read(inode), | 240 | i_size_read(inode), |
241 | (unsigned long long)le64_to_cpu(fe->i_size), | 241 | (unsigned long long)le64_to_cpu(fe->i_size), |
242 | le32_to_cpu(fe->i_flags)); | 242 | le32_to_cpu(fe->i_flags)); |
243 | 243 | ||
244 | if (new_i_size > le64_to_cpu(fe->i_size)) { | 244 | if (new_i_size > le64_to_cpu(fe->i_size)) { |
245 | mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", | 245 | mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", |
246 | (unsigned long long)le64_to_cpu(fe->i_size), | 246 | (unsigned long long)le64_to_cpu(fe->i_size), |
247 | (unsigned long long)new_i_size); | 247 | (unsigned long long)new_i_size); |
248 | status = -EINVAL; | 248 | status = -EINVAL; |
249 | mlog_errno(status); | 249 | mlog_errno(status); |
250 | goto bail; | 250 | goto bail; |
251 | } | 251 | } |
252 | 252 | ||
253 | mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n", | 253 | mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n", |
254 | (unsigned long long)le64_to_cpu(fe->i_blkno), | 254 | (unsigned long long)le64_to_cpu(fe->i_blkno), |
255 | (unsigned long long)le64_to_cpu(fe->i_size), | 255 | (unsigned long long)le64_to_cpu(fe->i_size), |
256 | (unsigned long long)new_i_size); | 256 | (unsigned long long)new_i_size); |
257 | 257 | ||
258 | /* lets handle the simple truncate cases before doing any more | 258 | /* lets handle the simple truncate cases before doing any more |
259 | * cluster locking. */ | 259 | * cluster locking. */ |
260 | if (new_i_size == le64_to_cpu(fe->i_size)) | 260 | if (new_i_size == le64_to_cpu(fe->i_size)) |
261 | goto bail; | 261 | goto bail; |
262 | 262 | ||
263 | /* This forces other nodes to sync and drop their pages. Do | 263 | /* This forces other nodes to sync and drop their pages. Do |
264 | * this even if we have a truncate without allocation change - | 264 | * this even if we have a truncate without allocation change - |
265 | * ocfs2 cluster sizes can be much greater than page size, so | 265 | * ocfs2 cluster sizes can be much greater than page size, so |
266 | * we have to truncate them anyway. */ | 266 | * we have to truncate them anyway. */ |
267 | status = ocfs2_data_lock(inode, 1); | 267 | status = ocfs2_data_lock(inode, 1); |
268 | if (status < 0) { | 268 | if (status < 0) { |
269 | mlog_errno(status); | 269 | mlog_errno(status); |
270 | goto bail; | 270 | goto bail; |
271 | } | 271 | } |
272 | ocfs2_data_unlock(inode, 1); | 272 | ocfs2_data_unlock(inode, 1); |
273 | 273 | ||
274 | if (le32_to_cpu(fe->i_clusters) == | 274 | if (le32_to_cpu(fe->i_clusters) == |
275 | ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { | 275 | ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { |
276 | mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", | 276 | mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", |
277 | fe->i_clusters); | 277 | fe->i_clusters); |
278 | /* No allocation change is required, so lets fast path | 278 | /* No allocation change is required, so lets fast path |
279 | * this truncate. */ | 279 | * this truncate. */ |
280 | status = ocfs2_simple_size_update(inode, di_bh, new_i_size); | 280 | status = ocfs2_simple_size_update(inode, di_bh, new_i_size); |
281 | if (status < 0) | 281 | if (status < 0) |
282 | mlog_errno(status); | 282 | mlog_errno(status); |
283 | goto bail; | 283 | goto bail; |
284 | } | 284 | } |
285 | 285 | ||
286 | /* alright, we're going to need to do a full blown alloc size | 286 | /* alright, we're going to need to do a full blown alloc size |
287 | * change. Orphan the inode so that recovery can complete the | 287 | * change. Orphan the inode so that recovery can complete the |
288 | * truncate if necessary. This does the task of marking | 288 | * truncate if necessary. This does the task of marking |
289 | * i_size. */ | 289 | * i_size. */ |
290 | status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); | 290 | status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); |
291 | if (status < 0) { | 291 | if (status < 0) { |
292 | mlog_errno(status); | 292 | mlog_errno(status); |
293 | goto bail; | 293 | goto bail; |
294 | } | 294 | } |
295 | 295 | ||
296 | status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); | 296 | status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); |
297 | if (status < 0) { | 297 | if (status < 0) { |
298 | mlog_errno(status); | 298 | mlog_errno(status); |
299 | goto bail; | 299 | goto bail; |
300 | } | 300 | } |
301 | 301 | ||
302 | status = ocfs2_commit_truncate(osb, inode, di_bh, tc); | 302 | status = ocfs2_commit_truncate(osb, inode, di_bh, tc); |
303 | if (status < 0) { | 303 | if (status < 0) { |
304 | mlog_errno(status); | 304 | mlog_errno(status); |
305 | goto bail; | 305 | goto bail; |
306 | } | 306 | } |
307 | 307 | ||
308 | /* TODO: orphan dir cleanup here. */ | 308 | /* TODO: orphan dir cleanup here. */ |
309 | bail: | 309 | bail: |
310 | 310 | ||
311 | mlog_exit(status); | 311 | mlog_exit(status); |
312 | return status; | 312 | return status; |
313 | } | 313 | } |
314 | 314 | ||
315 | /* | 315 | /* |
316 | * extend allocation only here. | 316 | * extend allocation only here. |
317 | * we'll update all the disk stuff, and oip->alloc_size | 317 | * we'll update all the disk stuff, and oip->alloc_size |
318 | * | 318 | * |
319 | * expect stuff to be locked, a transaction started and enough data / | 319 | * expect stuff to be locked, a transaction started and enough data / |
320 | * metadata reservations in the contexts. | 320 | * metadata reservations in the contexts. |
321 | * | 321 | * |
322 | * Will return -EAGAIN, and a reason if a restart is needed. | 322 | * Will return -EAGAIN, and a reason if a restart is needed. |
323 | * If passed in, *reason will always be set, even in error. | 323 | * If passed in, *reason will always be set, even in error. |
324 | */ | 324 | */ |
325 | int ocfs2_do_extend_allocation(struct ocfs2_super *osb, | 325 | int ocfs2_do_extend_allocation(struct ocfs2_super *osb, |
326 | struct inode *inode, | 326 | struct inode *inode, |
327 | u32 clusters_to_add, | 327 | u32 clusters_to_add, |
328 | struct buffer_head *fe_bh, | 328 | struct buffer_head *fe_bh, |
329 | struct ocfs2_journal_handle *handle, | 329 | struct ocfs2_journal_handle *handle, |
330 | struct ocfs2_alloc_context *data_ac, | 330 | struct ocfs2_alloc_context *data_ac, |
331 | struct ocfs2_alloc_context *meta_ac, | 331 | struct ocfs2_alloc_context *meta_ac, |
332 | enum ocfs2_alloc_restarted *reason_ret) | 332 | enum ocfs2_alloc_restarted *reason_ret) |
333 | { | 333 | { |
334 | int status = 0; | 334 | int status = 0; |
335 | int free_extents; | 335 | int free_extents; |
336 | struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; | 336 | struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; |
337 | enum ocfs2_alloc_restarted reason = RESTART_NONE; | 337 | enum ocfs2_alloc_restarted reason = RESTART_NONE; |
338 | u32 bit_off, num_bits; | 338 | u32 bit_off, num_bits; |
339 | u64 block; | 339 | u64 block; |
340 | 340 | ||
341 | BUG_ON(!clusters_to_add); | 341 | BUG_ON(!clusters_to_add); |
342 | 342 | ||
343 | free_extents = ocfs2_num_free_extents(osb, inode, fe); | 343 | free_extents = ocfs2_num_free_extents(osb, inode, fe); |
344 | if (free_extents < 0) { | 344 | if (free_extents < 0) { |
345 | status = free_extents; | 345 | status = free_extents; |
346 | mlog_errno(status); | 346 | mlog_errno(status); |
347 | goto leave; | 347 | goto leave; |
348 | } | 348 | } |
349 | 349 | ||
350 | /* there are two cases which could cause us to EAGAIN in the | 350 | /* there are two cases which could cause us to EAGAIN in the |
351 | * we-need-more-metadata case: | 351 | * we-need-more-metadata case: |
352 | * 1) we haven't reserved *any* | 352 | * 1) we haven't reserved *any* |
353 | * 2) we are so fragmented, we've needed to add metadata too | 353 | * 2) we are so fragmented, we've needed to add metadata too |
354 | * many times. */ | 354 | * many times. */ |
355 | if (!free_extents && !meta_ac) { | 355 | if (!free_extents && !meta_ac) { |
356 | mlog(0, "we haven't reserved any metadata!\n"); | 356 | mlog(0, "we haven't reserved any metadata!\n"); |
357 | status = -EAGAIN; | 357 | status = -EAGAIN; |
358 | reason = RESTART_META; | 358 | reason = RESTART_META; |
359 | goto leave; | 359 | goto leave; |
360 | } else if ((!free_extents) | 360 | } else if ((!free_extents) |
361 | && (ocfs2_alloc_context_bits_left(meta_ac) | 361 | && (ocfs2_alloc_context_bits_left(meta_ac) |
362 | < ocfs2_extend_meta_needed(fe))) { | 362 | < ocfs2_extend_meta_needed(fe))) { |
363 | mlog(0, "filesystem is really fragmented...\n"); | 363 | mlog(0, "filesystem is really fragmented...\n"); |
364 | status = -EAGAIN; | 364 | status = -EAGAIN; |
365 | reason = RESTART_META; | 365 | reason = RESTART_META; |
366 | goto leave; | 366 | goto leave; |
367 | } | 367 | } |
368 | 368 | ||
369 | status = ocfs2_claim_clusters(osb, handle, data_ac, 1, | 369 | status = ocfs2_claim_clusters(osb, handle, data_ac, 1, |
370 | &bit_off, &num_bits); | 370 | &bit_off, &num_bits); |
371 | if (status < 0) { | 371 | if (status < 0) { |
372 | if (status != -ENOSPC) | 372 | if (status != -ENOSPC) |
373 | mlog_errno(status); | 373 | mlog_errno(status); |
374 | goto leave; | 374 | goto leave; |
375 | } | 375 | } |
376 | 376 | ||
377 | BUG_ON(num_bits > clusters_to_add); | 377 | BUG_ON(num_bits > clusters_to_add); |
378 | 378 | ||
379 | /* reserve our write early -- insert_extent may update the inode */ | 379 | /* reserve our write early -- insert_extent may update the inode */ |
380 | status = ocfs2_journal_access(handle, inode, fe_bh, | 380 | status = ocfs2_journal_access(handle, inode, fe_bh, |
381 | OCFS2_JOURNAL_ACCESS_WRITE); | 381 | OCFS2_JOURNAL_ACCESS_WRITE); |
382 | if (status < 0) { | 382 | if (status < 0) { |
383 | mlog_errno(status); | 383 | mlog_errno(status); |
384 | goto leave; | 384 | goto leave; |
385 | } | 385 | } |
386 | 386 | ||
387 | block = ocfs2_clusters_to_blocks(osb->sb, bit_off); | 387 | block = ocfs2_clusters_to_blocks(osb->sb, bit_off); |
388 | mlog(0, "Allocating %u clusters at block %u for inode %llu\n", | 388 | mlog(0, "Allocating %u clusters at block %u for inode %llu\n", |
389 | num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); | 389 | num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); |
390 | status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, | 390 | status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, |
391 | num_bits, meta_ac); | 391 | num_bits, meta_ac); |
392 | if (status < 0) { | 392 | if (status < 0) { |
393 | mlog_errno(status); | 393 | mlog_errno(status); |
394 | goto leave; | 394 | goto leave; |
395 | } | 395 | } |
396 | 396 | ||
397 | le32_add_cpu(&fe->i_clusters, num_bits); | 397 | le32_add_cpu(&fe->i_clusters, num_bits); |
398 | spin_lock(&OCFS2_I(inode)->ip_lock); | 398 | spin_lock(&OCFS2_I(inode)->ip_lock); |
399 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | 399 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); |
400 | spin_unlock(&OCFS2_I(inode)->ip_lock); | 400 | spin_unlock(&OCFS2_I(inode)->ip_lock); |
401 | 401 | ||
402 | status = ocfs2_journal_dirty(handle, fe_bh); | 402 | status = ocfs2_journal_dirty(handle, fe_bh); |
403 | if (status < 0) { | 403 | if (status < 0) { |
404 | mlog_errno(status); | 404 | mlog_errno(status); |
405 | goto leave; | 405 | goto leave; |
406 | } | 406 | } |
407 | 407 | ||
408 | clusters_to_add -= num_bits; | 408 | clusters_to_add -= num_bits; |
409 | 409 | ||
410 | if (clusters_to_add) { | 410 | if (clusters_to_add) { |
411 | mlog(0, "need to alloc once more, clusters = %u, wanted = " | 411 | mlog(0, "need to alloc once more, clusters = %u, wanted = " |
412 | "%u\n", fe->i_clusters, clusters_to_add); | 412 | "%u\n", fe->i_clusters, clusters_to_add); |
413 | status = -EAGAIN; | 413 | status = -EAGAIN; |
414 | reason = RESTART_TRANS; | 414 | reason = RESTART_TRANS; |
415 | } | 415 | } |
416 | 416 | ||
417 | leave: | 417 | leave: |
418 | mlog_exit(status); | 418 | mlog_exit(status); |
419 | if (reason_ret) | 419 | if (reason_ret) |
420 | *reason_ret = reason; | 420 | *reason_ret = reason; |
421 | return status; | 421 | return status; |
422 | } | 422 | } |
423 | 423 | ||
424 | static int ocfs2_extend_allocation(struct inode *inode, | 424 | static int ocfs2_extend_allocation(struct inode *inode, |
425 | u32 clusters_to_add) | 425 | u32 clusters_to_add) |
426 | { | 426 | { |
427 | int status = 0; | 427 | int status = 0; |
428 | int restart_func = 0; | 428 | int restart_func = 0; |
429 | int drop_alloc_sem = 0; | 429 | int drop_alloc_sem = 0; |
430 | int credits, num_free_extents; | 430 | int credits, num_free_extents; |
431 | u32 prev_clusters; | 431 | u32 prev_clusters; |
432 | struct buffer_head *bh = NULL; | 432 | struct buffer_head *bh = NULL; |
433 | struct ocfs2_dinode *fe = NULL; | 433 | struct ocfs2_dinode *fe = NULL; |
434 | struct ocfs2_journal_handle *handle = NULL; | 434 | struct ocfs2_journal_handle *handle = NULL; |
435 | struct ocfs2_alloc_context *data_ac = NULL; | 435 | struct ocfs2_alloc_context *data_ac = NULL; |
436 | struct ocfs2_alloc_context *meta_ac = NULL; | 436 | struct ocfs2_alloc_context *meta_ac = NULL; |
437 | enum ocfs2_alloc_restarted why; | 437 | enum ocfs2_alloc_restarted why; |
438 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 438 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
439 | 439 | ||
440 | mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); | 440 | mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); |
441 | 441 | ||
442 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, | 442 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, |
443 | OCFS2_BH_CACHED, inode); | 443 | OCFS2_BH_CACHED, inode); |
444 | if (status < 0) { | 444 | if (status < 0) { |
445 | mlog_errno(status); | 445 | mlog_errno(status); |
446 | goto leave; | 446 | goto leave; |
447 | } | 447 | } |
448 | 448 | ||
449 | fe = (struct ocfs2_dinode *) bh->b_data; | 449 | fe = (struct ocfs2_dinode *) bh->b_data; |
450 | if (!OCFS2_IS_VALID_DINODE(fe)) { | 450 | if (!OCFS2_IS_VALID_DINODE(fe)) { |
451 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); | 451 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); |
452 | status = -EIO; | 452 | status = -EIO; |
453 | goto leave; | 453 | goto leave; |
454 | } | 454 | } |
455 | 455 | ||
456 | restart_all: | 456 | restart_all: |
457 | BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); | 457 | BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); |
458 | 458 | ||
459 | mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, " | 459 | mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, " |
460 | "clusters_to_add = %u\n", | 460 | "clusters_to_add = %u\n", |
461 | (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), | 461 | (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), |
462 | fe->i_clusters, clusters_to_add); | 462 | fe->i_clusters, clusters_to_add); |
463 | 463 | ||
464 | handle = ocfs2_alloc_handle(osb); | 464 | handle = ocfs2_alloc_handle(osb); |
465 | if (handle == NULL) { | 465 | if (handle == NULL) { |
466 | status = -ENOMEM; | 466 | status = -ENOMEM; |
467 | mlog_errno(status); | 467 | mlog_errno(status); |
468 | goto leave; | 468 | goto leave; |
469 | } | 469 | } |
470 | 470 | ||
471 | num_free_extents = ocfs2_num_free_extents(osb, | 471 | num_free_extents = ocfs2_num_free_extents(osb, |
472 | inode, | 472 | inode, |
473 | fe); | 473 | fe); |
474 | if (num_free_extents < 0) { | 474 | if (num_free_extents < 0) { |
475 | status = num_free_extents; | 475 | status = num_free_extents; |
476 | mlog_errno(status); | 476 | mlog_errno(status); |
477 | goto leave; | 477 | goto leave; |
478 | } | 478 | } |
479 | 479 | ||
480 | if (!num_free_extents) { | 480 | if (!num_free_extents) { |
481 | status = ocfs2_reserve_new_metadata(osb, | 481 | status = ocfs2_reserve_new_metadata(osb, |
482 | handle, | 482 | handle, |
483 | fe, | 483 | fe, |
484 | &meta_ac); | 484 | &meta_ac); |
485 | if (status < 0) { | 485 | if (status < 0) { |
486 | if (status != -ENOSPC) | 486 | if (status != -ENOSPC) |
487 | mlog_errno(status); | 487 | mlog_errno(status); |
488 | goto leave; | 488 | goto leave; |
489 | } | 489 | } |
490 | } | 490 | } |
491 | 491 | ||
492 | status = ocfs2_reserve_clusters(osb, | 492 | status = ocfs2_reserve_clusters(osb, |
493 | handle, | 493 | handle, |
494 | clusters_to_add, | 494 | clusters_to_add, |
495 | &data_ac); | 495 | &data_ac); |
496 | if (status < 0) { | 496 | if (status < 0) { |
497 | if (status != -ENOSPC) | 497 | if (status != -ENOSPC) |
498 | mlog_errno(status); | 498 | mlog_errno(status); |
499 | goto leave; | 499 | goto leave; |
500 | } | 500 | } |
501 | 501 | ||
502 | /* blocks people in read/write from reading our allocation | 502 | /* blocks people in read/write from reading our allocation |
503 | * until we're done changing it. We depend on i_mutex to block | 503 | * until we're done changing it. We depend on i_mutex to block |
504 | * other extend/truncate calls while we're here. Ordering wrt | 504 | * other extend/truncate calls while we're here. Ordering wrt |
505 | * start_trans is important here -- always do it before! */ | 505 | * start_trans is important here -- always do it before! */ |
506 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | 506 | down_write(&OCFS2_I(inode)->ip_alloc_sem); |
507 | drop_alloc_sem = 1; | 507 | drop_alloc_sem = 1; |
508 | 508 | ||
509 | credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); | 509 | credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); |
510 | handle = ocfs2_start_trans(osb, handle, credits); | 510 | handle = ocfs2_start_trans(osb, handle, credits); |
511 | if (IS_ERR(handle)) { | 511 | if (IS_ERR(handle)) { |
512 | status = PTR_ERR(handle); | 512 | status = PTR_ERR(handle); |
513 | handle = NULL; | 513 | handle = NULL; |
514 | mlog_errno(status); | 514 | mlog_errno(status); |
515 | goto leave; | 515 | goto leave; |
516 | } | 516 | } |
517 | 517 | ||
518 | restarted_transaction: | 518 | restarted_transaction: |
519 | /* reserve a write to the file entry early on - that way if we | 519 | /* reserve a write to the file entry early on - that way if we |
520 | * run out of credits in the allocation path, we can still | 520 | * run out of credits in the allocation path, we can still |
521 | * update i_size. */ | 521 | * update i_size. */ |
522 | status = ocfs2_journal_access(handle, inode, bh, | 522 | status = ocfs2_journal_access(handle, inode, bh, |
523 | OCFS2_JOURNAL_ACCESS_WRITE); | 523 | OCFS2_JOURNAL_ACCESS_WRITE); |
524 | if (status < 0) { | 524 | if (status < 0) { |
525 | mlog_errno(status); | 525 | mlog_errno(status); |
526 | goto leave; | 526 | goto leave; |
527 | } | 527 | } |
528 | 528 | ||
529 | prev_clusters = OCFS2_I(inode)->ip_clusters; | 529 | prev_clusters = OCFS2_I(inode)->ip_clusters; |
530 | 530 | ||
531 | status = ocfs2_do_extend_allocation(osb, | 531 | status = ocfs2_do_extend_allocation(osb, |
532 | inode, | 532 | inode, |
533 | clusters_to_add, | 533 | clusters_to_add, |
534 | bh, | 534 | bh, |
535 | handle, | 535 | handle, |
536 | data_ac, | 536 | data_ac, |
537 | meta_ac, | 537 | meta_ac, |
538 | &why); | 538 | &why); |
539 | if ((status < 0) && (status != -EAGAIN)) { | 539 | if ((status < 0) && (status != -EAGAIN)) { |
540 | if (status != -ENOSPC) | 540 | if (status != -ENOSPC) |
541 | mlog_errno(status); | 541 | mlog_errno(status); |
542 | goto leave; | 542 | goto leave; |
543 | } | 543 | } |
544 | 544 | ||
545 | status = ocfs2_journal_dirty(handle, bh); | 545 | status = ocfs2_journal_dirty(handle, bh); |
546 | if (status < 0) { | 546 | if (status < 0) { |
547 | mlog_errno(status); | 547 | mlog_errno(status); |
548 | goto leave; | 548 | goto leave; |
549 | } | 549 | } |
550 | 550 | ||
551 | spin_lock(&OCFS2_I(inode)->ip_lock); | 551 | spin_lock(&OCFS2_I(inode)->ip_lock); |
552 | clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); | 552 | clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); |
553 | spin_unlock(&OCFS2_I(inode)->ip_lock); | 553 | spin_unlock(&OCFS2_I(inode)->ip_lock); |
554 | 554 | ||
555 | if (why != RESTART_NONE && clusters_to_add) { | 555 | if (why != RESTART_NONE && clusters_to_add) { |
556 | if (why == RESTART_META) { | 556 | if (why == RESTART_META) { |
557 | mlog(0, "restarting function.\n"); | 557 | mlog(0, "restarting function.\n"); |
558 | restart_func = 1; | 558 | restart_func = 1; |
559 | } else { | 559 | } else { |
560 | BUG_ON(why != RESTART_TRANS); | 560 | BUG_ON(why != RESTART_TRANS); |
561 | 561 | ||
562 | mlog(0, "restarting transaction.\n"); | 562 | mlog(0, "restarting transaction.\n"); |
563 | /* TODO: This can be more intelligent. */ | 563 | /* TODO: This can be more intelligent. */ |
564 | credits = ocfs2_calc_extend_credits(osb->sb, | 564 | credits = ocfs2_calc_extend_credits(osb->sb, |
565 | fe, | 565 | fe, |
566 | clusters_to_add); | 566 | clusters_to_add); |
567 | status = ocfs2_extend_trans(handle, credits); | 567 | status = ocfs2_extend_trans(handle, credits); |
568 | if (status < 0) { | 568 | if (status < 0) { |
569 | /* handle still has to be committed at | 569 | /* handle still has to be committed at |
570 | * this point. */ | 570 | * this point. */ |
571 | status = -ENOMEM; | 571 | status = -ENOMEM; |
572 | mlog_errno(status); | 572 | mlog_errno(status); |
573 | goto leave; | 573 | goto leave; |
574 | } | 574 | } |
575 | goto restarted_transaction; | 575 | goto restarted_transaction; |
576 | } | 576 | } |
577 | } | 577 | } |
578 | 578 | ||
579 | mlog(0, "fe: i_clusters = %u, i_size=%llu\n", | 579 | mlog(0, "fe: i_clusters = %u, i_size=%llu\n", |
580 | fe->i_clusters, (unsigned long long)fe->i_size); | 580 | fe->i_clusters, (unsigned long long)fe->i_size); |
581 | mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", | 581 | mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", |
582 | OCFS2_I(inode)->ip_clusters, i_size_read(inode)); | 582 | OCFS2_I(inode)->ip_clusters, i_size_read(inode)); |
583 | 583 | ||
584 | leave: | 584 | leave: |
585 | if (drop_alloc_sem) { | 585 | if (drop_alloc_sem) { |
586 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | 586 | up_write(&OCFS2_I(inode)->ip_alloc_sem); |
587 | drop_alloc_sem = 0; | 587 | drop_alloc_sem = 0; |
588 | } | 588 | } |
589 | if (handle) { | 589 | if (handle) { |
590 | ocfs2_commit_trans(handle); | 590 | ocfs2_commit_trans(handle); |
591 | handle = NULL; | 591 | handle = NULL; |
592 | } | 592 | } |
593 | if (data_ac) { | 593 | if (data_ac) { |
594 | ocfs2_free_alloc_context(data_ac); | 594 | ocfs2_free_alloc_context(data_ac); |
595 | data_ac = NULL; | 595 | data_ac = NULL; |
596 | } | 596 | } |
597 | if (meta_ac) { | 597 | if (meta_ac) { |
598 | ocfs2_free_alloc_context(meta_ac); | 598 | ocfs2_free_alloc_context(meta_ac); |
599 | meta_ac = NULL; | 599 | meta_ac = NULL; |
600 | } | 600 | } |
601 | if ((!status) && restart_func) { | 601 | if ((!status) && restart_func) { |
602 | restart_func = 0; | 602 | restart_func = 0; |
603 | goto restart_all; | 603 | goto restart_all; |
604 | } | 604 | } |
605 | if (bh) { | 605 | if (bh) { |
606 | brelse(bh); | 606 | brelse(bh); |
607 | bh = NULL; | 607 | bh = NULL; |
608 | } | 608 | } |
609 | 609 | ||
610 | mlog_exit(status); | 610 | mlog_exit(status); |
611 | return status; | 611 | return status; |
612 | } | 612 | } |
613 | 613 | ||
614 | /* Some parts of this taken from generic_cont_expand, which turned out | 614 | /* Some parts of this taken from generic_cont_expand, which turned out |
615 | * to be too fragile to do exactly what we need without us having to | 615 | * to be too fragile to do exactly what we need without us having to |
616 | * worry about recursive locking in ->commit_write(). */ | 616 | * worry about recursive locking in ->prepare_write() and |
617 | * ->commit_write(). */ | ||
617 | static int ocfs2_write_zero_page(struct inode *inode, | 618 | static int ocfs2_write_zero_page(struct inode *inode, |
618 | u64 size) | 619 | u64 size) |
619 | { | 620 | { |
620 | struct address_space *mapping = inode->i_mapping; | 621 | struct address_space *mapping = inode->i_mapping; |
621 | struct page *page; | 622 | struct page *page; |
622 | unsigned long index; | 623 | unsigned long index; |
623 | unsigned int offset; | 624 | unsigned int offset; |
624 | struct ocfs2_journal_handle *handle = NULL; | 625 | struct ocfs2_journal_handle *handle = NULL; |
625 | int ret; | 626 | int ret; |
626 | 627 | ||
627 | offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ | 628 | offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ |
628 | /* ugh. in prepare/commit_write, if from==to==start of block, we | 629 | /* ugh. in prepare/commit_write, if from==to==start of block, we |
629 | ** skip the prepare. make sure we never send an offset for the start | 630 | ** skip the prepare. make sure we never send an offset for the start |
630 | ** of a block | 631 | ** of a block |
631 | */ | 632 | */ |
632 | if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { | 633 | if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { |
633 | offset++; | 634 | offset++; |
634 | } | 635 | } |
635 | index = size >> PAGE_CACHE_SHIFT; | 636 | index = size >> PAGE_CACHE_SHIFT; |
636 | 637 | ||
637 | page = grab_cache_page(mapping, index); | 638 | page = grab_cache_page(mapping, index); |
638 | if (!page) { | 639 | if (!page) { |
639 | ret = -ENOMEM; | 640 | ret = -ENOMEM; |
640 | mlog_errno(ret); | 641 | mlog_errno(ret); |
641 | goto out; | 642 | goto out; |
642 | } | 643 | } |
643 | 644 | ||
644 | ret = ocfs2_prepare_write(NULL, page, offset, offset); | 645 | ret = ocfs2_prepare_write_nolock(inode, page, offset, offset); |
645 | if (ret < 0) { | 646 | if (ret < 0) { |
646 | mlog_errno(ret); | 647 | mlog_errno(ret); |
647 | goto out_unlock; | 648 | goto out_unlock; |
648 | } | 649 | } |
649 | 650 | ||
650 | if (ocfs2_should_order_data(inode)) { | 651 | if (ocfs2_should_order_data(inode)) { |
651 | handle = ocfs2_start_walk_page_trans(inode, page, offset, | 652 | handle = ocfs2_start_walk_page_trans(inode, page, offset, |
652 | offset); | 653 | offset); |
653 | if (IS_ERR(handle)) { | 654 | if (IS_ERR(handle)) { |
654 | ret = PTR_ERR(handle); | 655 | ret = PTR_ERR(handle); |
655 | handle = NULL; | 656 | handle = NULL; |
656 | goto out_unlock; | 657 | goto out_unlock; |
657 | } | 658 | } |
658 | } | 659 | } |
659 | 660 | ||
660 | /* must not update i_size! */ | 661 | /* must not update i_size! */ |
661 | ret = block_commit_write(page, offset, offset); | 662 | ret = block_commit_write(page, offset, offset); |
662 | if (ret < 0) | 663 | if (ret < 0) |
663 | mlog_errno(ret); | 664 | mlog_errno(ret); |
664 | else | 665 | else |
665 | ret = 0; | 666 | ret = 0; |
666 | 667 | ||
667 | if (handle) | 668 | if (handle) |
668 | ocfs2_commit_trans(handle); | 669 | ocfs2_commit_trans(handle); |
669 | out_unlock: | 670 | out_unlock: |
670 | unlock_page(page); | 671 | unlock_page(page); |
671 | page_cache_release(page); | 672 | page_cache_release(page); |
672 | out: | 673 | out: |
673 | return ret; | 674 | return ret; |
674 | } | 675 | } |
675 | 676 | ||
676 | static int ocfs2_zero_extend(struct inode *inode, | 677 | static int ocfs2_zero_extend(struct inode *inode, |
677 | u64 zero_to_size) | 678 | u64 zero_to_size) |
678 | { | 679 | { |
679 | int ret = 0; | 680 | int ret = 0; |
680 | u64 start_off; | 681 | u64 start_off; |
681 | struct super_block *sb = inode->i_sb; | 682 | struct super_block *sb = inode->i_sb; |
682 | 683 | ||
683 | start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); | 684 | start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); |
684 | while (start_off < zero_to_size) { | 685 | while (start_off < zero_to_size) { |
685 | ret = ocfs2_write_zero_page(inode, start_off); | 686 | ret = ocfs2_write_zero_page(inode, start_off); |
686 | if (ret < 0) { | 687 | if (ret < 0) { |
687 | mlog_errno(ret); | 688 | mlog_errno(ret); |
688 | goto out; | 689 | goto out; |
689 | } | 690 | } |
690 | 691 | ||
691 | start_off += sb->s_blocksize; | 692 | start_off += sb->s_blocksize; |
692 | } | 693 | } |
693 | 694 | ||
694 | out: | 695 | out: |
695 | return ret; | 696 | return ret; |
696 | } | 697 | } |
697 | 698 | ||
699 | /* | ||
700 | * A tail_to_skip value > 0 indicates that we're being called from | ||
701 | * ocfs2_file_aio_write(). This has the following implications: | ||
702 | * | ||
703 | * - we don't want to update i_size | ||
704 | * - di_bh will be NULL, which is fine because it's only used in the | ||
705 | * case where we want to update i_size. | ||
706 | * - ocfs2_zero_extend() will then only be filling the hole created | ||
707 | * between i_size and the start of the write. | ||
708 | */ | ||
698 | static int ocfs2_extend_file(struct inode *inode, | 709 | static int ocfs2_extend_file(struct inode *inode, |
699 | struct buffer_head *di_bh, | 710 | struct buffer_head *di_bh, |
700 | u64 new_i_size) | 711 | u64 new_i_size, |
712 | size_t tail_to_skip) | ||
701 | { | 713 | { |
702 | int ret = 0; | 714 | int ret = 0; |
703 | u32 clusters_to_add; | 715 | u32 clusters_to_add; |
704 | 716 | ||
717 | BUG_ON(!tail_to_skip && !di_bh); | ||
718 | |||
705 | /* setattr sometimes calls us like this. */ | 719 | /* setattr sometimes calls us like this. */ |
706 | if (new_i_size == 0) | 720 | if (new_i_size == 0) |
707 | goto out; | 721 | goto out; |
708 | 722 | ||
709 | if (i_size_read(inode) == new_i_size) | 723 | if (i_size_read(inode) == new_i_size) |
710 | goto out; | 724 | goto out; |
711 | BUG_ON(new_i_size < i_size_read(inode)); | 725 | BUG_ON(new_i_size < i_size_read(inode)); |
712 | 726 | ||
713 | clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - | 727 | clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - |
714 | OCFS2_I(inode)->ip_clusters; | 728 | OCFS2_I(inode)->ip_clusters; |
715 | 729 | ||
716 | if (clusters_to_add) { | 730 | if (clusters_to_add) { |
717 | ret = ocfs2_extend_allocation(inode, clusters_to_add); | 731 | /* |
732 | * protect the pages that ocfs2_zero_extend is going to | ||
733 | * be pulling into the page cache.. we do this before the | ||
734 | * metadata extend so that we don't get into the situation | ||
735 | * where we've extended the metadata but can't get the data | ||
736 | * lock to zero. | ||
737 | */ | ||
738 | ret = ocfs2_data_lock(inode, 1); | ||
718 | if (ret < 0) { | 739 | if (ret < 0) { |
719 | mlog_errno(ret); | 740 | mlog_errno(ret); |
720 | goto out; | 741 | goto out; |
721 | } | 742 | } |
722 | 743 | ||
723 | ret = ocfs2_zero_extend(inode, new_i_size); | 744 | ret = ocfs2_extend_allocation(inode, clusters_to_add); |
724 | if (ret < 0) { | 745 | if (ret < 0) { |
725 | mlog_errno(ret); | 746 | mlog_errno(ret); |
726 | goto out; | 747 | goto out_unlock; |
727 | } | 748 | } |
728 | } | ||
729 | 749 | ||
730 | /* No allocation required, we just use this helper to | 750 | ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip); |
731 | * do a trivial update of i_size. */ | 751 | if (ret < 0) { |
732 | ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); | 752 | mlog_errno(ret); |
733 | if (ret < 0) { | 753 | goto out_unlock; |
734 | mlog_errno(ret); | 754 | } |
735 | goto out; | ||
736 | } | 755 | } |
737 | 756 | ||
757 | if (!tail_to_skip) { | ||
758 | /* We're being called from ocfs2_setattr() which wants | ||
759 | * us to update i_size */ | ||
760 | ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); | ||
761 | if (ret < 0) | ||
762 | mlog_errno(ret); | ||
763 | } | ||
764 | |||
765 | out_unlock: | ||
766 | if (clusters_to_add) /* this is the only case in which we lock */ | ||
767 | ocfs2_data_unlock(inode, 1); | ||
768 | |||
738 | out: | 769 | out: |
739 | return ret; | 770 | return ret; |
740 | } | 771 | } |
741 | 772 | ||
742 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) | 773 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) |
743 | { | 774 | { |
744 | int status = 0, size_change; | 775 | int status = 0, size_change; |
745 | struct inode *inode = dentry->d_inode; | 776 | struct inode *inode = dentry->d_inode; |
746 | struct super_block *sb = inode->i_sb; | 777 | struct super_block *sb = inode->i_sb; |
747 | struct ocfs2_super *osb = OCFS2_SB(sb); | 778 | struct ocfs2_super *osb = OCFS2_SB(sb); |
748 | struct buffer_head *bh = NULL; | 779 | struct buffer_head *bh = NULL; |
749 | struct ocfs2_journal_handle *handle = NULL; | 780 | struct ocfs2_journal_handle *handle = NULL; |
750 | 781 | ||
751 | mlog_entry("(0x%p, '%.*s')\n", dentry, | 782 | mlog_entry("(0x%p, '%.*s')\n", dentry, |
752 | dentry->d_name.len, dentry->d_name.name); | 783 | dentry->d_name.len, dentry->d_name.name); |
753 | 784 | ||
754 | if (attr->ia_valid & ATTR_MODE) | 785 | if (attr->ia_valid & ATTR_MODE) |
755 | mlog(0, "mode change: %d\n", attr->ia_mode); | 786 | mlog(0, "mode change: %d\n", attr->ia_mode); |
756 | if (attr->ia_valid & ATTR_UID) | 787 | if (attr->ia_valid & ATTR_UID) |
757 | mlog(0, "uid change: %d\n", attr->ia_uid); | 788 | mlog(0, "uid change: %d\n", attr->ia_uid); |
758 | if (attr->ia_valid & ATTR_GID) | 789 | if (attr->ia_valid & ATTR_GID) |
759 | mlog(0, "gid change: %d\n", attr->ia_gid); | 790 | mlog(0, "gid change: %d\n", attr->ia_gid); |
760 | if (attr->ia_valid & ATTR_SIZE) | 791 | if (attr->ia_valid & ATTR_SIZE) |
761 | mlog(0, "size change...\n"); | 792 | mlog(0, "size change...\n"); |
762 | if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) | 793 | if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) |
763 | mlog(0, "time change...\n"); | 794 | mlog(0, "time change...\n"); |
764 | 795 | ||
765 | #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ | 796 | #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ |
766 | | ATTR_GID | ATTR_UID | ATTR_MODE) | 797 | | ATTR_GID | ATTR_UID | ATTR_MODE) |
767 | if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { | 798 | if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { |
768 | mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); | 799 | mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); |
769 | return 0; | 800 | return 0; |
770 | } | 801 | } |
771 | 802 | ||
772 | status = inode_change_ok(inode, attr); | 803 | status = inode_change_ok(inode, attr); |
773 | if (status) | 804 | if (status) |
774 | return status; | 805 | return status; |
775 | 806 | ||
776 | size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; | 807 | size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; |
777 | if (size_change) { | 808 | if (size_change) { |
778 | status = ocfs2_rw_lock(inode, 1); | 809 | status = ocfs2_rw_lock(inode, 1); |
779 | if (status < 0) { | 810 | if (status < 0) { |
780 | mlog_errno(status); | 811 | mlog_errno(status); |
781 | goto bail; | 812 | goto bail; |
782 | } | 813 | } |
783 | } | 814 | } |
784 | 815 | ||
785 | status = ocfs2_meta_lock(inode, NULL, &bh, 1); | 816 | status = ocfs2_meta_lock(inode, NULL, &bh, 1); |
786 | if (status < 0) { | 817 | if (status < 0) { |
787 | if (status != -ENOENT) | 818 | if (status != -ENOENT) |
788 | mlog_errno(status); | 819 | mlog_errno(status); |
789 | goto bail_unlock_rw; | 820 | goto bail_unlock_rw; |
790 | } | 821 | } |
791 | 822 | ||
792 | if (size_change && attr->ia_size != i_size_read(inode)) { | 823 | if (size_change && attr->ia_size != i_size_read(inode)) { |
793 | if (i_size_read(inode) > attr->ia_size) | 824 | if (i_size_read(inode) > attr->ia_size) |
794 | status = ocfs2_truncate_file(inode, bh, attr->ia_size); | 825 | status = ocfs2_truncate_file(inode, bh, attr->ia_size); |
795 | else | 826 | else |
796 | status = ocfs2_extend_file(inode, bh, attr->ia_size); | 827 | status = ocfs2_extend_file(inode, bh, attr->ia_size, 0); |
797 | if (status < 0) { | 828 | if (status < 0) { |
798 | if (status != -ENOSPC) | 829 | if (status != -ENOSPC) |
799 | mlog_errno(status); | 830 | mlog_errno(status); |
800 | status = -ENOSPC; | 831 | status = -ENOSPC; |
801 | goto bail_unlock; | 832 | goto bail_unlock; |
802 | } | 833 | } |
803 | } | 834 | } |
804 | 835 | ||
805 | handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); | 836 | handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); |
806 | if (IS_ERR(handle)) { | 837 | if (IS_ERR(handle)) { |
807 | status = PTR_ERR(handle); | 838 | status = PTR_ERR(handle); |
808 | mlog_errno(status); | 839 | mlog_errno(status); |
809 | goto bail_unlock; | 840 | goto bail_unlock; |
810 | } | 841 | } |
811 | 842 | ||
812 | status = inode_setattr(inode, attr); | 843 | status = inode_setattr(inode, attr); |
813 | if (status < 0) { | 844 | if (status < 0) { |
814 | mlog_errno(status); | 845 | mlog_errno(status); |
815 | goto bail_commit; | 846 | goto bail_commit; |
816 | } | 847 | } |
817 | 848 | ||
818 | status = ocfs2_mark_inode_dirty(handle, inode, bh); | 849 | status = ocfs2_mark_inode_dirty(handle, inode, bh); |
819 | if (status < 0) | 850 | if (status < 0) |
820 | mlog_errno(status); | 851 | mlog_errno(status); |
821 | 852 | ||
822 | bail_commit: | 853 | bail_commit: |
823 | ocfs2_commit_trans(handle); | 854 | ocfs2_commit_trans(handle); |
824 | bail_unlock: | 855 | bail_unlock: |
825 | ocfs2_meta_unlock(inode, 1); | 856 | ocfs2_meta_unlock(inode, 1); |
826 | bail_unlock_rw: | 857 | bail_unlock_rw: |
827 | if (size_change) | 858 | if (size_change) |
828 | ocfs2_rw_unlock(inode, 1); | 859 | ocfs2_rw_unlock(inode, 1); |
829 | bail: | 860 | bail: |
830 | if (bh) | 861 | if (bh) |
831 | brelse(bh); | 862 | brelse(bh); |
832 | 863 | ||
833 | mlog_exit(status); | 864 | mlog_exit(status); |
834 | return status; | 865 | return status; |
835 | } | 866 | } |
836 | 867 | ||
837 | int ocfs2_getattr(struct vfsmount *mnt, | 868 | int ocfs2_getattr(struct vfsmount *mnt, |
838 | struct dentry *dentry, | 869 | struct dentry *dentry, |
839 | struct kstat *stat) | 870 | struct kstat *stat) |
840 | { | 871 | { |
841 | struct inode *inode = dentry->d_inode; | 872 | struct inode *inode = dentry->d_inode; |
842 | struct super_block *sb = dentry->d_inode->i_sb; | 873 | struct super_block *sb = dentry->d_inode->i_sb; |
843 | struct ocfs2_super *osb = sb->s_fs_info; | 874 | struct ocfs2_super *osb = sb->s_fs_info; |
844 | int err; | 875 | int err; |
845 | 876 | ||
846 | mlog_entry_void(); | 877 | mlog_entry_void(); |
847 | 878 | ||
848 | err = ocfs2_inode_revalidate(dentry); | 879 | err = ocfs2_inode_revalidate(dentry); |
849 | if (err) { | 880 | if (err) { |
850 | if (err != -ENOENT) | 881 | if (err != -ENOENT) |
851 | mlog_errno(err); | 882 | mlog_errno(err); |
852 | goto bail; | 883 | goto bail; |
853 | } | 884 | } |
854 | 885 | ||
855 | generic_fillattr(inode, stat); | 886 | generic_fillattr(inode, stat); |
856 | 887 | ||
857 | /* We set the blksize from the cluster size for performance */ | 888 | /* We set the blksize from the cluster size for performance */ |
858 | stat->blksize = osb->s_clustersize; | 889 | stat->blksize = osb->s_clustersize; |
859 | 890 | ||
860 | bail: | 891 | bail: |
861 | mlog_exit(err); | 892 | mlog_exit(err); |
862 | 893 | ||
863 | return err; | 894 | return err; |
864 | } | 895 | } |
865 | 896 | ||
866 | static int ocfs2_write_remove_suid(struct inode *inode) | 897 | static int ocfs2_write_remove_suid(struct inode *inode) |
867 | { | 898 | { |
868 | int ret; | 899 | int ret; |
869 | struct buffer_head *bh = NULL; | 900 | struct buffer_head *bh = NULL; |
870 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | 901 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
871 | struct ocfs2_journal_handle *handle; | 902 | struct ocfs2_journal_handle *handle; |
872 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 903 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
873 | struct ocfs2_dinode *di; | 904 | struct ocfs2_dinode *di; |
874 | 905 | ||
875 | mlog_entry("(Inode %llu, mode 0%o)\n", | 906 | mlog_entry("(Inode %llu, mode 0%o)\n", |
876 | (unsigned long long)oi->ip_blkno, inode->i_mode); | 907 | (unsigned long long)oi->ip_blkno, inode->i_mode); |
877 | 908 | ||
878 | handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); | 909 | handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); |
879 | if (handle == NULL) { | 910 | if (handle == NULL) { |
880 | ret = -ENOMEM; | 911 | ret = -ENOMEM; |
881 | mlog_errno(ret); | 912 | mlog_errno(ret); |
882 | goto out; | 913 | goto out; |
883 | } | 914 | } |
884 | 915 | ||
885 | ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); | 916 | ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); |
886 | if (ret < 0) { | 917 | if (ret < 0) { |
887 | mlog_errno(ret); | 918 | mlog_errno(ret); |
888 | goto out_trans; | 919 | goto out_trans; |
889 | } | 920 | } |
890 | 921 | ||
891 | ret = ocfs2_journal_access(handle, inode, bh, | 922 | ret = ocfs2_journal_access(handle, inode, bh, |
892 | OCFS2_JOURNAL_ACCESS_WRITE); | 923 | OCFS2_JOURNAL_ACCESS_WRITE); |
893 | if (ret < 0) { | 924 | if (ret < 0) { |
894 | mlog_errno(ret); | 925 | mlog_errno(ret); |
895 | goto out_bh; | 926 | goto out_bh; |
896 | } | 927 | } |
897 | 928 | ||
898 | inode->i_mode &= ~S_ISUID; | 929 | inode->i_mode &= ~S_ISUID; |
899 | if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) | 930 | if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) |
900 | inode->i_mode &= ~S_ISGID; | 931 | inode->i_mode &= ~S_ISGID; |
901 | 932 | ||
902 | di = (struct ocfs2_dinode *) bh->b_data; | 933 | di = (struct ocfs2_dinode *) bh->b_data; |
903 | di->i_mode = cpu_to_le16(inode->i_mode); | 934 | di->i_mode = cpu_to_le16(inode->i_mode); |
904 | 935 | ||
905 | ret = ocfs2_journal_dirty(handle, bh); | 936 | ret = ocfs2_journal_dirty(handle, bh); |
906 | if (ret < 0) | 937 | if (ret < 0) |
907 | mlog_errno(ret); | 938 | mlog_errno(ret); |
908 | out_bh: | 939 | out_bh: |
909 | brelse(bh); | 940 | brelse(bh); |
910 | out_trans: | 941 | out_trans: |
911 | ocfs2_commit_trans(handle); | 942 | ocfs2_commit_trans(handle); |
912 | out: | 943 | out: |
913 | mlog_exit(ret); | 944 | mlog_exit(ret); |
914 | return ret; | 945 | return ret; |
915 | } | 946 | } |
916 | 947 | ||
917 | static inline int ocfs2_write_should_remove_suid(struct inode *inode) | 948 | static inline int ocfs2_write_should_remove_suid(struct inode *inode) |
918 | { | 949 | { |
919 | mode_t mode = inode->i_mode; | 950 | mode_t mode = inode->i_mode; |
920 | 951 | ||
921 | if (!capable(CAP_FSETID)) { | 952 | if (!capable(CAP_FSETID)) { |
922 | if (unlikely(mode & S_ISUID)) | 953 | if (unlikely(mode & S_ISUID)) |
923 | return 1; | 954 | return 1; |
924 | 955 | ||
925 | if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) | 956 | if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) |
926 | return 1; | 957 | return 1; |
927 | } | 958 | } |
928 | return 0; | 959 | return 0; |
929 | } | 960 | } |
930 | 961 | ||
931 | static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, | 962 | static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, |
932 | const char __user *buf, | 963 | const char __user *buf, |
933 | size_t count, | 964 | size_t count, |
934 | loff_t pos) | 965 | loff_t pos) |
935 | { | 966 | { |
936 | struct iovec local_iov = { .iov_base = (void __user *)buf, | 967 | struct iovec local_iov = { .iov_base = (void __user *)buf, |
937 | .iov_len = count }; | 968 | .iov_len = count }; |
938 | int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0; | 969 | int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0; |
939 | u32 clusters; | 970 | u32 clusters; |
940 | struct file *filp = iocb->ki_filp; | 971 | struct file *filp = iocb->ki_filp; |
941 | struct inode *inode = filp->f_dentry->d_inode; | 972 | struct inode *inode = filp->f_dentry->d_inode; |
942 | loff_t newsize, saved_pos; | 973 | loff_t newsize, saved_pos; |
943 | 974 | ||
944 | mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, | 975 | mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, |
945 | (unsigned int)count, | 976 | (unsigned int)count, |
946 | filp->f_dentry->d_name.len, | 977 | filp->f_dentry->d_name.len, |
947 | filp->f_dentry->d_name.name); | 978 | filp->f_dentry->d_name.name); |
948 | 979 | ||
949 | /* happy write of zero bytes */ | 980 | /* happy write of zero bytes */ |
950 | if (count == 0) | 981 | if (count == 0) |
951 | return 0; | 982 | return 0; |
952 | 983 | ||
953 | if (!inode) { | 984 | if (!inode) { |
954 | mlog(0, "bad inode\n"); | 985 | mlog(0, "bad inode\n"); |
955 | return -EIO; | 986 | return -EIO; |
956 | } | 987 | } |
957 | 988 | ||
958 | mutex_lock(&inode->i_mutex); | 989 | mutex_lock(&inode->i_mutex); |
959 | /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ | 990 | /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ |
960 | if (filp->f_flags & O_DIRECT) { | 991 | if (filp->f_flags & O_DIRECT) { |
961 | have_alloc_sem = 1; | 992 | have_alloc_sem = 1; |
962 | down_read(&inode->i_alloc_sem); | 993 | down_read(&inode->i_alloc_sem); |
963 | } | 994 | } |
964 | 995 | ||
965 | /* concurrent O_DIRECT writes are allowed */ | 996 | /* concurrent O_DIRECT writes are allowed */ |
966 | rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; | 997 | rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; |
967 | ret = ocfs2_rw_lock(inode, rw_level); | 998 | ret = ocfs2_rw_lock(inode, rw_level); |
968 | if (ret < 0) { | 999 | if (ret < 0) { |
969 | rw_level = -1; | 1000 | rw_level = -1; |
970 | mlog_errno(ret); | 1001 | mlog_errno(ret); |
971 | goto out; | 1002 | goto out; |
972 | } | 1003 | } |
973 | 1004 | ||
974 | /* | 1005 | /* |
975 | * We sample i_size under a read level meta lock to see if our write | 1006 | * We sample i_size under a read level meta lock to see if our write |
976 | * is extending the file, if it is we back off and get a write level | 1007 | * is extending the file, if it is we back off and get a write level |
977 | * meta lock. | 1008 | * meta lock. |
978 | */ | 1009 | */ |
979 | meta_level = (filp->f_flags & O_APPEND) ? 1 : 0; | 1010 | meta_level = (filp->f_flags & O_APPEND) ? 1 : 0; |
980 | for(;;) { | 1011 | for(;;) { |
981 | ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level); | 1012 | ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level); |
982 | if (ret < 0) { | 1013 | if (ret < 0) { |
983 | meta_level = -1; | 1014 | meta_level = -1; |
984 | mlog_errno(ret); | 1015 | mlog_errno(ret); |
985 | goto out; | 1016 | goto out; |
986 | } | 1017 | } |
987 | 1018 | ||
988 | /* Clear suid / sgid if necessary. We do this here | 1019 | /* Clear suid / sgid if necessary. We do this here |
989 | * instead of later in the write path because | 1020 | * instead of later in the write path because |
990 | * remove_suid() calls ->setattr without any hint that | 1021 | * remove_suid() calls ->setattr without any hint that |
991 | * we may have already done our cluster locking. Since | 1022 | * we may have already done our cluster locking. Since |
992 | * ocfs2_setattr() *must* take cluster locks to | 1023 | * ocfs2_setattr() *must* take cluster locks to |
993 | * proceeed, this will lead us to recursively lock the | 1024 | * proceeed, this will lead us to recursively lock the |
994 | * inode. There's also the dinode i_size state which | 1025 | * inode. There's also the dinode i_size state which |
995 | * can be lost via setattr during extending writes (we | 1026 | * can be lost via setattr during extending writes (we |
996 | * set inode->i_size at the end of a write. */ | 1027 | * set inode->i_size at the end of a write. */ |
997 | if (ocfs2_write_should_remove_suid(inode)) { | 1028 | if (ocfs2_write_should_remove_suid(inode)) { |
998 | if (meta_level == 0) { | 1029 | if (meta_level == 0) { |
999 | ocfs2_meta_unlock(inode, meta_level); | 1030 | ocfs2_meta_unlock(inode, meta_level); |
1000 | meta_level = 1; | 1031 | meta_level = 1; |
1001 | continue; | 1032 | continue; |
1002 | } | 1033 | } |
1003 | 1034 | ||
1004 | ret = ocfs2_write_remove_suid(inode); | 1035 | ret = ocfs2_write_remove_suid(inode); |
1005 | if (ret < 0) { | 1036 | if (ret < 0) { |
1006 | mlog_errno(ret); | 1037 | mlog_errno(ret); |
1007 | goto out; | 1038 | goto out; |
1008 | } | 1039 | } |
1009 | } | 1040 | } |
1010 | 1041 | ||
1011 | /* work on a copy of ppos until we're sure that we won't have | 1042 | /* work on a copy of ppos until we're sure that we won't have |
1012 | * to recalculate it due to relocking. */ | 1043 | * to recalculate it due to relocking. */ |
1013 | if (filp->f_flags & O_APPEND) { | 1044 | if (filp->f_flags & O_APPEND) { |
1014 | saved_pos = i_size_read(inode); | 1045 | saved_pos = i_size_read(inode); |
1015 | mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); | 1046 | mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); |
1016 | } else { | 1047 | } else { |
1017 | saved_pos = iocb->ki_pos; | 1048 | saved_pos = iocb->ki_pos; |
1018 | } | 1049 | } |
1019 | newsize = count + saved_pos; | 1050 | newsize = count + saved_pos; |
1020 | 1051 | ||
1021 | mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", | 1052 | mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", |
1022 | (long long) saved_pos, (long long) newsize, | 1053 | (long long) saved_pos, (long long) newsize, |
1023 | (long long) i_size_read(inode)); | 1054 | (long long) i_size_read(inode)); |
1024 | 1055 | ||
1025 | /* No need for a higher level metadata lock if we're | 1056 | /* No need for a higher level metadata lock if we're |
1026 | * never going past i_size. */ | 1057 | * never going past i_size. */ |
1027 | if (newsize <= i_size_read(inode)) | 1058 | if (newsize <= i_size_read(inode)) |
1028 | break; | 1059 | break; |
1029 | 1060 | ||
1030 | if (meta_level == 0) { | 1061 | if (meta_level == 0) { |
1031 | ocfs2_meta_unlock(inode, meta_level); | 1062 | ocfs2_meta_unlock(inode, meta_level); |
1032 | meta_level = 1; | 1063 | meta_level = 1; |
1033 | continue; | 1064 | continue; |
1034 | } | 1065 | } |
1035 | 1066 | ||
1036 | spin_lock(&OCFS2_I(inode)->ip_lock); | 1067 | spin_lock(&OCFS2_I(inode)->ip_lock); |
1037 | clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - | 1068 | clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - |
1038 | OCFS2_I(inode)->ip_clusters; | 1069 | OCFS2_I(inode)->ip_clusters; |
1039 | spin_unlock(&OCFS2_I(inode)->ip_lock); | 1070 | spin_unlock(&OCFS2_I(inode)->ip_lock); |
1040 | 1071 | ||
1041 | mlog(0, "Writing at EOF, may need more allocation: " | 1072 | mlog(0, "Writing at EOF, may need more allocation: " |
1042 | "i_size = %lld, newsize = %lld, need %u clusters\n", | 1073 | "i_size = %lld, newsize = %lld, need %u clusters\n", |
1043 | (long long) i_size_read(inode), (long long) newsize, | 1074 | (long long) i_size_read(inode), (long long) newsize, |
1044 | clusters); | 1075 | clusters); |
1045 | 1076 | ||
1046 | /* We only want to continue the rest of this loop if | 1077 | /* We only want to continue the rest of this loop if |
1047 | * our extend will actually require more | 1078 | * our extend will actually require more |
1048 | * allocation. */ | 1079 | * allocation. */ |
1049 | if (!clusters) | 1080 | if (!clusters) |
1050 | break; | 1081 | break; |
1051 | 1082 | ||
1052 | ret = ocfs2_extend_allocation(inode, clusters); | 1083 | ret = ocfs2_extend_file(inode, NULL, newsize, count); |
1053 | if (ret < 0) { | 1084 | if (ret < 0) { |
1054 | if (ret != -ENOSPC) | 1085 | if (ret != -ENOSPC) |
1055 | mlog_errno(ret); | 1086 | mlog_errno(ret); |
1056 | goto out; | ||
1057 | } | ||
1058 | |||
1059 | /* Fill any holes which would've been created by this | ||
1060 | * write. If we're O_APPEND, this will wind up | ||
1061 | * (correctly) being a noop. */ | ||
1062 | ret = ocfs2_zero_extend(inode, (u64) newsize - count); | ||
1063 | if (ret < 0) { | ||
1064 | mlog_errno(ret); | ||
1065 | goto out; | 1087 | goto out; |
1066 | } | 1088 | } |
1067 | break; | 1089 | break; |
1068 | } | 1090 | } |
1069 | 1091 | ||
1070 | /* ok, we're done with i_size and alloc work */ | 1092 | /* ok, we're done with i_size and alloc work */ |
1071 | iocb->ki_pos = saved_pos; | 1093 | iocb->ki_pos = saved_pos; |
1072 | ocfs2_meta_unlock(inode, meta_level); | 1094 | ocfs2_meta_unlock(inode, meta_level); |
1073 | meta_level = -1; | 1095 | meta_level = -1; |
1074 | 1096 | ||
1075 | /* communicate with ocfs2_dio_end_io */ | 1097 | /* communicate with ocfs2_dio_end_io */ |
1076 | ocfs2_iocb_set_rw_locked(iocb); | 1098 | ocfs2_iocb_set_rw_locked(iocb); |
1077 | 1099 | ||
1078 | ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); | 1100 | ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); |
1079 | 1101 | ||
1080 | /* buffered aio wouldn't have proper lock coverage today */ | 1102 | /* buffered aio wouldn't have proper lock coverage today */ |
1081 | BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); | 1103 | BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); |
1082 | 1104 | ||
1083 | /* | 1105 | /* |
1084 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io | 1106 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io |
1085 | * function pointer which is called when o_direct io completes so that | 1107 | * function pointer which is called when o_direct io completes so that |
1086 | * it can unlock our rw lock. (it's the clustered equivalent of | 1108 | * it can unlock our rw lock. (it's the clustered equivalent of |
1087 | * i_alloc_sem; protects truncate from racing with pending ios). | 1109 | * i_alloc_sem; protects truncate from racing with pending ios). |
1088 | * Unfortunately there are error cases which call end_io and others | 1110 | * Unfortunately there are error cases which call end_io and others |
1089 | * that don't. so we don't have to unlock the rw_lock if either an | 1111 | * that don't. so we don't have to unlock the rw_lock if either an |
1090 | * async dio is going to do it in the future or an end_io after an | 1112 | * async dio is going to do it in the future or an end_io after an |
1091 | * error has already done it. | 1113 | * error has already done it. |
1092 | */ | 1114 | */ |
1093 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { | 1115 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { |
1094 | rw_level = -1; | 1116 | rw_level = -1; |
1095 | have_alloc_sem = 0; | 1117 | have_alloc_sem = 0; |
1096 | } | 1118 | } |
1097 | 1119 | ||
1098 | out: | 1120 | out: |
1099 | if (meta_level != -1) | 1121 | if (meta_level != -1) |
1100 | ocfs2_meta_unlock(inode, meta_level); | 1122 | ocfs2_meta_unlock(inode, meta_level); |
1101 | if (have_alloc_sem) | 1123 | if (have_alloc_sem) |
1102 | up_read(&inode->i_alloc_sem); | 1124 | up_read(&inode->i_alloc_sem); |
1103 | if (rw_level != -1) | 1125 | if (rw_level != -1) |
1104 | ocfs2_rw_unlock(inode, rw_level); | 1126 | ocfs2_rw_unlock(inode, rw_level); |
1105 | mutex_unlock(&inode->i_mutex); | 1127 | mutex_unlock(&inode->i_mutex); |
1106 | 1128 | ||
1107 | mlog_exit(ret); | 1129 | mlog_exit(ret); |
1108 | return ret; | 1130 | return ret; |
1109 | } | 1131 | } |
1110 | 1132 | ||
1111 | static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, | 1133 | static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, |
1112 | char __user *buf, | 1134 | char __user *buf, |
1113 | size_t count, | 1135 | size_t count, |
1114 | loff_t pos) | 1136 | loff_t pos) |
1115 | { | 1137 | { |
1116 | int ret = 0, rw_level = -1, have_alloc_sem = 0; | 1138 | int ret = 0, rw_level = -1, have_alloc_sem = 0; |
1117 | struct file *filp = iocb->ki_filp; | 1139 | struct file *filp = iocb->ki_filp; |
1118 | struct inode *inode = filp->f_dentry->d_inode; | 1140 | struct inode *inode = filp->f_dentry->d_inode; |
1119 | 1141 | ||
1120 | mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, | 1142 | mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, |
1121 | (unsigned int)count, | 1143 | (unsigned int)count, |
1122 | filp->f_dentry->d_name.len, | 1144 | filp->f_dentry->d_name.len, |
1123 | filp->f_dentry->d_name.name); | 1145 | filp->f_dentry->d_name.name); |
1124 | 1146 | ||
1125 | if (!inode) { | 1147 | if (!inode) { |
1126 | ret = -EINVAL; | 1148 | ret = -EINVAL; |
1127 | mlog_errno(ret); | 1149 | mlog_errno(ret); |
1128 | goto bail; | 1150 | goto bail; |
1129 | } | 1151 | } |
1130 | 1152 | ||
1131 | /* | 1153 | /* |
1132 | * buffered reads protect themselves in ->readpage(). O_DIRECT reads | 1154 | * buffered reads protect themselves in ->readpage(). O_DIRECT reads |
1133 | * need locks to protect pending reads from racing with truncate. | 1155 | * need locks to protect pending reads from racing with truncate. |
1134 | */ | 1156 | */ |
1135 | if (filp->f_flags & O_DIRECT) { | 1157 | if (filp->f_flags & O_DIRECT) { |
1136 | down_read(&inode->i_alloc_sem); | 1158 | down_read(&inode->i_alloc_sem); |
1137 | have_alloc_sem = 1; | 1159 | have_alloc_sem = 1; |
1138 | 1160 | ||
1139 | ret = ocfs2_rw_lock(inode, 0); | 1161 | ret = ocfs2_rw_lock(inode, 0); |
1140 | if (ret < 0) { | 1162 | if (ret < 0) { |
1141 | mlog_errno(ret); | 1163 | mlog_errno(ret); |
1142 | goto bail; | 1164 | goto bail; |
1143 | } | 1165 | } |
1144 | rw_level = 0; | 1166 | rw_level = 0; |
1145 | /* communicate with ocfs2_dio_end_io */ | 1167 | /* communicate with ocfs2_dio_end_io */ |
1146 | ocfs2_iocb_set_rw_locked(iocb); | 1168 | ocfs2_iocb_set_rw_locked(iocb); |
1147 | } | 1169 | } |
1148 | 1170 | ||
1149 | ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos); | 1171 | ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos); |
1150 | if (ret == -EINVAL) | 1172 | if (ret == -EINVAL) |
1151 | mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); | 1173 | mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); |
1152 | 1174 | ||
1153 | /* buffered aio wouldn't have proper lock coverage today */ | 1175 | /* buffered aio wouldn't have proper lock coverage today */ |
1154 | BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); | 1176 | BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); |
1155 | 1177 | ||
1156 | /* see ocfs2_file_aio_write */ | 1178 | /* see ocfs2_file_aio_write */ |
1157 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { | 1179 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { |
1158 | rw_level = -1; | 1180 | rw_level = -1; |
1159 | have_alloc_sem = 0; | 1181 | have_alloc_sem = 0; |
1160 | } | 1182 | } |
1161 | 1183 | ||
1162 | bail: | 1184 | bail: |
1163 | if (have_alloc_sem) | 1185 | if (have_alloc_sem) |
1164 | up_read(&inode->i_alloc_sem); | 1186 | up_read(&inode->i_alloc_sem); |
1165 | if (rw_level != -1) | 1187 | if (rw_level != -1) |
1166 | ocfs2_rw_unlock(inode, rw_level); | 1188 | ocfs2_rw_unlock(inode, rw_level); |
1167 | mlog_exit(ret); | 1189 | mlog_exit(ret); |
1168 | 1190 | ||
1169 | return ret; | 1191 | return ret; |
1170 | } | 1192 | } |
1171 | 1193 | ||
1172 | struct inode_operations ocfs2_file_iops = { | 1194 | struct inode_operations ocfs2_file_iops = { |
1173 | .setattr = ocfs2_setattr, | 1195 | .setattr = ocfs2_setattr, |
1174 | .getattr = ocfs2_getattr, | 1196 | .getattr = ocfs2_getattr, |
1175 | }; | 1197 | }; |
1176 | 1198 | ||
1177 | struct inode_operations ocfs2_special_file_iops = { | 1199 | struct inode_operations ocfs2_special_file_iops = { |
1178 | .setattr = ocfs2_setattr, | 1200 | .setattr = ocfs2_setattr, |
1179 | .getattr = ocfs2_getattr, | 1201 | .getattr = ocfs2_getattr, |
1180 | }; | 1202 | }; |
1181 | 1203 | ||
1182 | const struct file_operations ocfs2_fops = { | 1204 | const struct file_operations ocfs2_fops = { |
1183 | .read = do_sync_read, | 1205 | .read = do_sync_read, |
1184 | .write = do_sync_write, | 1206 | .write = do_sync_write, |
1185 | .sendfile = generic_file_sendfile, | 1207 | .sendfile = generic_file_sendfile, |
1186 | .mmap = ocfs2_mmap, | 1208 | .mmap = ocfs2_mmap, |
1187 | .fsync = ocfs2_sync_file, | 1209 | .fsync = ocfs2_sync_file, |
1188 | .release = ocfs2_file_release, | 1210 | .release = ocfs2_file_release, |