Commit 2501c4a066e633524791e8ce8dbfe615aca071cf

Authored by Filipe Manana
Committed by Greg Kroah-Hartman
1 parent bddf0faccf

Btrfs: read inode size after acquiring the mutex when punching a hole

commit a1a50f60a6bf4f861eb94793420274bc1ccd409a upstream.

In a previous change, commit 12870f1c9b2de7d475d22e73fd7db1b418599725,
I accidentally moved the roundup of inode->i_size to outside of the
critical section delimited by the inode mutex. This is not atomic and
not correct, since the size can be changed by another task before we
acquire the mutex. Therefore fix it.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
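
The race the message describes is easiest to see in outline. Below is a
minimal sketch of the corrected ordering in the hole-punching path of
fs/btrfs/file.c; it is illustrative only, not the verbatim upstream hunk
(the function name, the elided body, and the exact helpers around the
roundup are assumptions for the sketch):

	/*
	 * Sketch: compute the rounded-up i_size only after taking the
	 * inode mutex, so no other task can change i_size between the
	 * read and the hole-punching work that depends on it.
	 */
	static int punch_hole_sketch(struct inode *inode, loff_t offset,
				     loff_t len)
	{
		u64 ino_size;	/* no longer initialized before the lock */
		int ret;

		ret = btrfs_wait_ordered_range(inode, offset, len);
		if (ret)
			return ret;

		mutex_lock(&inode->i_mutex);
		/* read i_size inside the critical section */
		ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);

		/* ... drop extents / zero partial pages using ino_size ... */

		mutex_unlock(&inode->i_mutex);
		return ret;
	}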

Showing 1 changed file with 2 additions and 1 deletion

1 /* 1 /*
2 * Copyright (C) 2007 Oracle. All rights reserved. 2 * Copyright (C) 2007 Oracle. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or 4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public 5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation. 6 * License v2 as published by the Free Software Foundation.
7 * 7 *
8 * This program is distributed in the hope that it will be useful, 8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details. 11 * General Public License for more details.
12 * 12 *
13 * You should have received a copy of the GNU General Public 13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the 14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19 #include <linux/fs.h> 19 #include <linux/fs.h>
20 #include <linux/pagemap.h> 20 #include <linux/pagemap.h>
21 #include <linux/highmem.h> 21 #include <linux/highmem.h>
22 #include <linux/time.h> 22 #include <linux/time.h>
23 #include <linux/init.h> 23 #include <linux/init.h>
24 #include <linux/string.h> 24 #include <linux/string.h>
25 #include <linux/backing-dev.h> 25 #include <linux/backing-dev.h>
26 #include <linux/mpage.h> 26 #include <linux/mpage.h>
27 #include <linux/aio.h> 27 #include <linux/aio.h>
28 #include <linux/falloc.h> 28 #include <linux/falloc.h>
29 #include <linux/swap.h> 29 #include <linux/swap.h>
30 #include <linux/writeback.h> 30 #include <linux/writeback.h>
31 #include <linux/statfs.h> 31 #include <linux/statfs.h>
32 #include <linux/compat.h> 32 #include <linux/compat.h>
33 #include <linux/slab.h> 33 #include <linux/slab.h>
34 #include <linux/btrfs.h> 34 #include <linux/btrfs.h>
35 #include "ctree.h" 35 #include "ctree.h"
36 #include "disk-io.h" 36 #include "disk-io.h"
37 #include "transaction.h" 37 #include "transaction.h"
38 #include "btrfs_inode.h" 38 #include "btrfs_inode.h"
39 #include "print-tree.h" 39 #include "print-tree.h"
40 #include "tree-log.h" 40 #include "tree-log.h"
41 #include "locking.h" 41 #include "locking.h"
42 #include "volumes.h" 42 #include "volumes.h"
43 43
44 static struct kmem_cache *btrfs_inode_defrag_cachep; 44 static struct kmem_cache *btrfs_inode_defrag_cachep;
45 /* 45 /*
46 * when auto defrag is enabled we 46 * when auto defrag is enabled we
47 * queue up these defrag structs to remember which 47 * queue up these defrag structs to remember which
48 * inodes need defragging passes 48 * inodes need defragging passes
49 */ 49 */
50 struct inode_defrag { 50 struct inode_defrag {
51 struct rb_node rb_node; 51 struct rb_node rb_node;
52 /* objectid */ 52 /* objectid */
53 u64 ino; 53 u64 ino;
54 /* 54 /*
55 * transid where the defrag was added, we search for 55 * transid where the defrag was added, we search for
56 * extents newer than this 56 * extents newer than this
57 */ 57 */
58 u64 transid; 58 u64 transid;
59 59
60 /* root objectid */ 60 /* root objectid */
61 u64 root; 61 u64 root;
62 62
63 /* last offset we were able to defrag */ 63 /* last offset we were able to defrag */
64 u64 last_offset; 64 u64 last_offset;
65 65
66 /* if we've wrapped around back to zero once already */ 66 /* if we've wrapped around back to zero once already */
67 int cycled; 67 int cycled;
68 }; 68 };
69 69
70 static int __compare_inode_defrag(struct inode_defrag *defrag1, 70 static int __compare_inode_defrag(struct inode_defrag *defrag1,
71 struct inode_defrag *defrag2) 71 struct inode_defrag *defrag2)
72 { 72 {
73 if (defrag1->root > defrag2->root) 73 if (defrag1->root > defrag2->root)
74 return 1; 74 return 1;
75 else if (defrag1->root < defrag2->root) 75 else if (defrag1->root < defrag2->root)
76 return -1; 76 return -1;
77 else if (defrag1->ino > defrag2->ino) 77 else if (defrag1->ino > defrag2->ino)
78 return 1; 78 return 1;
79 else if (defrag1->ino < defrag2->ino) 79 else if (defrag1->ino < defrag2->ino)
80 return -1; 80 return -1;
81 else 81 else
82 return 0; 82 return 0;
83 } 83 }
84 84
85 /* pop a record for an inode into the defrag tree. The lock 85 /* pop a record for an inode into the defrag tree. The lock
86 * must be held already 86 * must be held already
87 * 87 *
88 * If you're inserting a record for an older transid than an 88 * If you're inserting a record for an older transid than an
89 * existing record, the transid already in the tree is lowered 89 * existing record, the transid already in the tree is lowered
90 * 90 *
91 * If an existing record is found the defrag item you 91 * If an existing record is found the defrag item you
92 * pass in is freed 92 * pass in is freed
93 */ 93 */
94 static int __btrfs_add_inode_defrag(struct inode *inode, 94 static int __btrfs_add_inode_defrag(struct inode *inode,
95 struct inode_defrag *defrag) 95 struct inode_defrag *defrag)
96 { 96 {
97 struct btrfs_root *root = BTRFS_I(inode)->root; 97 struct btrfs_root *root = BTRFS_I(inode)->root;
98 struct inode_defrag *entry; 98 struct inode_defrag *entry;
99 struct rb_node **p; 99 struct rb_node **p;
100 struct rb_node *parent = NULL; 100 struct rb_node *parent = NULL;
101 int ret; 101 int ret;
102 102
103 p = &root->fs_info->defrag_inodes.rb_node; 103 p = &root->fs_info->defrag_inodes.rb_node;
104 while (*p) { 104 while (*p) {
105 parent = *p; 105 parent = *p;
106 entry = rb_entry(parent, struct inode_defrag, rb_node); 106 entry = rb_entry(parent, struct inode_defrag, rb_node);
107 107
108 ret = __compare_inode_defrag(defrag, entry); 108 ret = __compare_inode_defrag(defrag, entry);
109 if (ret < 0) 109 if (ret < 0)
110 p = &parent->rb_left; 110 p = &parent->rb_left;
111 else if (ret > 0) 111 else if (ret > 0)
112 p = &parent->rb_right; 112 p = &parent->rb_right;
113 else { 113 else {
114 /* if we're reinserting an entry for 114 /* if we're reinserting an entry for
115 * an old defrag run, make sure to 115 * an old defrag run, make sure to
116 * lower the transid of our existing record 116 * lower the transid of our existing record
117 */ 117 */
118 if (defrag->transid < entry->transid) 118 if (defrag->transid < entry->transid)
119 entry->transid = defrag->transid; 119 entry->transid = defrag->transid;
120 if (defrag->last_offset > entry->last_offset) 120 if (defrag->last_offset > entry->last_offset)
121 entry->last_offset = defrag->last_offset; 121 entry->last_offset = defrag->last_offset;
122 return -EEXIST; 122 return -EEXIST;
123 } 123 }
124 } 124 }
125 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 125 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
126 rb_link_node(&defrag->rb_node, parent, p); 126 rb_link_node(&defrag->rb_node, parent, p);
127 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); 127 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
128 return 0; 128 return 0;
129 } 129 }
130 130
131 static inline int __need_auto_defrag(struct btrfs_root *root) 131 static inline int __need_auto_defrag(struct btrfs_root *root)
132 { 132 {
133 if (!btrfs_test_opt(root, AUTO_DEFRAG)) 133 if (!btrfs_test_opt(root, AUTO_DEFRAG))
134 return 0; 134 return 0;
135 135
136 if (btrfs_fs_closing(root->fs_info)) 136 if (btrfs_fs_closing(root->fs_info))
137 return 0; 137 return 0;
138 138
139 return 1; 139 return 1;
140 } 140 }
141 141
142 /* 142 /*
143 * insert a defrag record for this inode if auto defrag is 143 * insert a defrag record for this inode if auto defrag is
144 * enabled 144 * enabled
145 */ 145 */
146 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, 146 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
147 struct inode *inode) 147 struct inode *inode)
148 { 148 {
149 struct btrfs_root *root = BTRFS_I(inode)->root; 149 struct btrfs_root *root = BTRFS_I(inode)->root;
150 struct inode_defrag *defrag; 150 struct inode_defrag *defrag;
151 u64 transid; 151 u64 transid;
152 int ret; 152 int ret;
153 153
154 if (!__need_auto_defrag(root)) 154 if (!__need_auto_defrag(root))
155 return 0; 155 return 0;
156 156
157 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 157 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
158 return 0; 158 return 0;
159 159
160 if (trans) 160 if (trans)
161 transid = trans->transid; 161 transid = trans->transid;
162 else 162 else
163 transid = BTRFS_I(inode)->root->last_trans; 163 transid = BTRFS_I(inode)->root->last_trans;
164 164
165 defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); 165 defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
166 if (!defrag) 166 if (!defrag)
167 return -ENOMEM; 167 return -ENOMEM;
168 168
169 defrag->ino = btrfs_ino(inode); 169 defrag->ino = btrfs_ino(inode);
170 defrag->transid = transid; 170 defrag->transid = transid;
171 defrag->root = root->root_key.objectid; 171 defrag->root = root->root_key.objectid;
172 172
173 spin_lock(&root->fs_info->defrag_inodes_lock); 173 spin_lock(&root->fs_info->defrag_inodes_lock);
174 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) { 174 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
175 /* 175 /*
176 * If we set IN_DEFRAG flag and evict the inode from memory, 176 * If we set IN_DEFRAG flag and evict the inode from memory,
177 * and then re-read this inode, this new inode doesn't have 177 * and then re-read this inode, this new inode doesn't have
178 * IN_DEFRAG flag. At the case, we may find the existed defrag. 178 * IN_DEFRAG flag. At the case, we may find the existed defrag.
179 */ 179 */
180 ret = __btrfs_add_inode_defrag(inode, defrag); 180 ret = __btrfs_add_inode_defrag(inode, defrag);
181 if (ret) 181 if (ret)
182 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 182 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
183 } else { 183 } else {
184 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 184 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
185 } 185 }
186 spin_unlock(&root->fs_info->defrag_inodes_lock); 186 spin_unlock(&root->fs_info->defrag_inodes_lock);
187 return 0; 187 return 0;
188 } 188 }
189 189
190 /* 190 /*
191 * Requeue the defrag object. If there is a defrag object that points to 191 * Requeue the defrag object. If there is a defrag object that points to
192 * the same inode in the tree, we will merge them together (by 192 * the same inode in the tree, we will merge them together (by
193 * __btrfs_add_inode_defrag()) and free the one that we want to requeue. 193 * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
194 */ 194 */
195 static void btrfs_requeue_inode_defrag(struct inode *inode, 195 static void btrfs_requeue_inode_defrag(struct inode *inode,
196 struct inode_defrag *defrag) 196 struct inode_defrag *defrag)
197 { 197 {
198 struct btrfs_root *root = BTRFS_I(inode)->root; 198 struct btrfs_root *root = BTRFS_I(inode)->root;
199 int ret; 199 int ret;
200 200
201 if (!__need_auto_defrag(root)) 201 if (!__need_auto_defrag(root))
202 goto out; 202 goto out;
203 203
204 /* 204 /*
205 * Here we don't check the IN_DEFRAG flag, because we need merge 205 * Here we don't check the IN_DEFRAG flag, because we need merge
206 * them together. 206 * them together.
207 */ 207 */
208 spin_lock(&root->fs_info->defrag_inodes_lock); 208 spin_lock(&root->fs_info->defrag_inodes_lock);
209 ret = __btrfs_add_inode_defrag(inode, defrag); 209 ret = __btrfs_add_inode_defrag(inode, defrag);
210 spin_unlock(&root->fs_info->defrag_inodes_lock); 210 spin_unlock(&root->fs_info->defrag_inodes_lock);
211 if (ret) 211 if (ret)
212 goto out; 212 goto out;
213 return; 213 return;
214 out: 214 out:
215 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 215 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
216 } 216 }
217 217
218 /* 218 /*
219 * pick the defragable inode that we want, if it doesn't exist, we will get 219 * pick the defragable inode that we want, if it doesn't exist, we will get
220 * the next one. 220 * the next one.
221 */ 221 */
222 static struct inode_defrag * 222 static struct inode_defrag *
223 btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino) 223 btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
224 { 224 {
225 struct inode_defrag *entry = NULL; 225 struct inode_defrag *entry = NULL;
226 struct inode_defrag tmp; 226 struct inode_defrag tmp;
227 struct rb_node *p; 227 struct rb_node *p;
228 struct rb_node *parent = NULL; 228 struct rb_node *parent = NULL;
229 int ret; 229 int ret;
230 230
231 tmp.ino = ino; 231 tmp.ino = ino;
232 tmp.root = root; 232 tmp.root = root;
233 233
234 spin_lock(&fs_info->defrag_inodes_lock); 234 spin_lock(&fs_info->defrag_inodes_lock);
235 p = fs_info->defrag_inodes.rb_node; 235 p = fs_info->defrag_inodes.rb_node;
236 while (p) { 236 while (p) {
237 parent = p; 237 parent = p;
238 entry = rb_entry(parent, struct inode_defrag, rb_node); 238 entry = rb_entry(parent, struct inode_defrag, rb_node);
239 239
240 ret = __compare_inode_defrag(&tmp, entry); 240 ret = __compare_inode_defrag(&tmp, entry);
241 if (ret < 0) 241 if (ret < 0)
242 p = parent->rb_left; 242 p = parent->rb_left;
243 else if (ret > 0) 243 else if (ret > 0)
244 p = parent->rb_right; 244 p = parent->rb_right;
245 else 245 else
246 goto out; 246 goto out;
247 } 247 }
248 248
249 if (parent && __compare_inode_defrag(&tmp, entry) > 0) { 249 if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
250 parent = rb_next(parent); 250 parent = rb_next(parent);
251 if (parent) 251 if (parent)
252 entry = rb_entry(parent, struct inode_defrag, rb_node); 252 entry = rb_entry(parent, struct inode_defrag, rb_node);
253 else 253 else
254 entry = NULL; 254 entry = NULL;
255 } 255 }
256 out: 256 out:
257 if (entry) 257 if (entry)
258 rb_erase(parent, &fs_info->defrag_inodes); 258 rb_erase(parent, &fs_info->defrag_inodes);
259 spin_unlock(&fs_info->defrag_inodes_lock); 259 spin_unlock(&fs_info->defrag_inodes_lock);
260 return entry; 260 return entry;
261 } 261 }
262 262
263 void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) 263 void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
264 { 264 {
265 struct inode_defrag *defrag; 265 struct inode_defrag *defrag;
266 struct rb_node *node; 266 struct rb_node *node;
267 267
268 spin_lock(&fs_info->defrag_inodes_lock); 268 spin_lock(&fs_info->defrag_inodes_lock);
269 node = rb_first(&fs_info->defrag_inodes); 269 node = rb_first(&fs_info->defrag_inodes);
270 while (node) { 270 while (node) {
271 rb_erase(node, &fs_info->defrag_inodes); 271 rb_erase(node, &fs_info->defrag_inodes);
272 defrag = rb_entry(node, struct inode_defrag, rb_node); 272 defrag = rb_entry(node, struct inode_defrag, rb_node);
273 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 273 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
274 274
275 if (need_resched()) { 275 if (need_resched()) {
276 spin_unlock(&fs_info->defrag_inodes_lock); 276 spin_unlock(&fs_info->defrag_inodes_lock);
277 cond_resched(); 277 cond_resched();
278 spin_lock(&fs_info->defrag_inodes_lock); 278 spin_lock(&fs_info->defrag_inodes_lock);
279 } 279 }
280 280
281 node = rb_first(&fs_info->defrag_inodes); 281 node = rb_first(&fs_info->defrag_inodes);
282 } 282 }
283 spin_unlock(&fs_info->defrag_inodes_lock); 283 spin_unlock(&fs_info->defrag_inodes_lock);
284 } 284 }
285 285
286 #define BTRFS_DEFRAG_BATCH 1024 286 #define BTRFS_DEFRAG_BATCH 1024
287 287
288 static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, 288 static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
289 struct inode_defrag *defrag) 289 struct inode_defrag *defrag)
290 { 290 {
291 struct btrfs_root *inode_root; 291 struct btrfs_root *inode_root;
292 struct inode *inode; 292 struct inode *inode;
293 struct btrfs_key key; 293 struct btrfs_key key;
294 struct btrfs_ioctl_defrag_range_args range; 294 struct btrfs_ioctl_defrag_range_args range;
295 int num_defrag; 295 int num_defrag;
296 int index; 296 int index;
297 int ret; 297 int ret;
298 298
299 /* get the inode */ 299 /* get the inode */
300 key.objectid = defrag->root; 300 key.objectid = defrag->root;
301 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 301 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
302 key.offset = (u64)-1; 302 key.offset = (u64)-1;
303 303
304 index = srcu_read_lock(&fs_info->subvol_srcu); 304 index = srcu_read_lock(&fs_info->subvol_srcu);
305 305
306 inode_root = btrfs_read_fs_root_no_name(fs_info, &key); 306 inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
307 if (IS_ERR(inode_root)) { 307 if (IS_ERR(inode_root)) {
308 ret = PTR_ERR(inode_root); 308 ret = PTR_ERR(inode_root);
309 goto cleanup; 309 goto cleanup;
310 } 310 }
311 311
312 key.objectid = defrag->ino; 312 key.objectid = defrag->ino;
313 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 313 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
314 key.offset = 0; 314 key.offset = 0;
315 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); 315 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
316 if (IS_ERR(inode)) { 316 if (IS_ERR(inode)) {
317 ret = PTR_ERR(inode); 317 ret = PTR_ERR(inode);
318 goto cleanup; 318 goto cleanup;
319 } 319 }
320 srcu_read_unlock(&fs_info->subvol_srcu, index); 320 srcu_read_unlock(&fs_info->subvol_srcu, index);
321 321
322 /* do a chunk of defrag */ 322 /* do a chunk of defrag */
323 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 323 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
324 memset(&range, 0, sizeof(range)); 324 memset(&range, 0, sizeof(range));
325 range.len = (u64)-1; 325 range.len = (u64)-1;
326 range.start = defrag->last_offset; 326 range.start = defrag->last_offset;
327 327
328 sb_start_write(fs_info->sb); 328 sb_start_write(fs_info->sb);
329 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, 329 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
330 BTRFS_DEFRAG_BATCH); 330 BTRFS_DEFRAG_BATCH);
331 sb_end_write(fs_info->sb); 331 sb_end_write(fs_info->sb);
332 /* 332 /*
333 * if we filled the whole defrag batch, there 333 * if we filled the whole defrag batch, there
334 * must be more work to do. Queue this defrag 334 * must be more work to do. Queue this defrag
335 * again 335 * again
336 */ 336 */
337 if (num_defrag == BTRFS_DEFRAG_BATCH) { 337 if (num_defrag == BTRFS_DEFRAG_BATCH) {
338 defrag->last_offset = range.start; 338 defrag->last_offset = range.start;
339 btrfs_requeue_inode_defrag(inode, defrag); 339 btrfs_requeue_inode_defrag(inode, defrag);
340 } else if (defrag->last_offset && !defrag->cycled) { 340 } else if (defrag->last_offset && !defrag->cycled) {
341 /* 341 /*
342 * we didn't fill our defrag batch, but 342 * we didn't fill our defrag batch, but
343 * we didn't start at zero. Make sure we loop 343 * we didn't start at zero. Make sure we loop
344 * around to the start of the file. 344 * around to the start of the file.
345 */ 345 */
346 defrag->last_offset = 0; 346 defrag->last_offset = 0;
347 defrag->cycled = 1; 347 defrag->cycled = 1;
348 btrfs_requeue_inode_defrag(inode, defrag); 348 btrfs_requeue_inode_defrag(inode, defrag);
349 } else { 349 } else {
350 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 350 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
351 } 351 }
352 352
353 iput(inode); 353 iput(inode);
354 return 0; 354 return 0;
355 cleanup: 355 cleanup:
356 srcu_read_unlock(&fs_info->subvol_srcu, index); 356 srcu_read_unlock(&fs_info->subvol_srcu, index);
357 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 357 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
358 return ret; 358 return ret;
359 } 359 }
360 360
361 /* 361 /*
362 * run through the list of inodes in the FS that need 362 * run through the list of inodes in the FS that need
363 * defragging 363 * defragging
364 */ 364 */
365 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) 365 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
366 { 366 {
367 struct inode_defrag *defrag; 367 struct inode_defrag *defrag;
368 u64 first_ino = 0; 368 u64 first_ino = 0;
369 u64 root_objectid = 0; 369 u64 root_objectid = 0;
370 370
371 atomic_inc(&fs_info->defrag_running); 371 atomic_inc(&fs_info->defrag_running);
372 while (1) { 372 while (1) {
373 /* Pause the auto defragger. */ 373 /* Pause the auto defragger. */
374 if (test_bit(BTRFS_FS_STATE_REMOUNTING, 374 if (test_bit(BTRFS_FS_STATE_REMOUNTING,
375 &fs_info->fs_state)) 375 &fs_info->fs_state))
376 break; 376 break;
377 377
378 if (!__need_auto_defrag(fs_info->tree_root)) 378 if (!__need_auto_defrag(fs_info->tree_root))
379 break; 379 break;
380 380
381 /* find an inode to defrag */ 381 /* find an inode to defrag */
382 defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, 382 defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
383 first_ino); 383 first_ino);
384 if (!defrag) { 384 if (!defrag) {
385 if (root_objectid || first_ino) { 385 if (root_objectid || first_ino) {
386 root_objectid = 0; 386 root_objectid = 0;
387 first_ino = 0; 387 first_ino = 0;
388 continue; 388 continue;
389 } else { 389 } else {
390 break; 390 break;
391 } 391 }
392 } 392 }
393 393
394 first_ino = defrag->ino + 1; 394 first_ino = defrag->ino + 1;
395 root_objectid = defrag->root; 395 root_objectid = defrag->root;
396 396
397 __btrfs_run_defrag_inode(fs_info, defrag); 397 __btrfs_run_defrag_inode(fs_info, defrag);
398 } 398 }
399 atomic_dec(&fs_info->defrag_running); 399 atomic_dec(&fs_info->defrag_running);
400 400
401 /* 401 /*
402 * during unmount, we use the transaction_wait queue to 402 * during unmount, we use the transaction_wait queue to
403 * wait for the defragger to stop 403 * wait for the defragger to stop
404 */ 404 */
405 wake_up(&fs_info->transaction_wait); 405 wake_up(&fs_info->transaction_wait);
406 return 0; 406 return 0;
407 } 407 }
408 408
409 /* simple helper to fault in pages and copy. This should go away 409 /* simple helper to fault in pages and copy. This should go away
410 * and be replaced with calls into generic code. 410 * and be replaced with calls into generic code.
411 */ 411 */
412 static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, 412 static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
413 size_t write_bytes, 413 size_t write_bytes,
414 struct page **prepared_pages, 414 struct page **prepared_pages,
415 struct iov_iter *i) 415 struct iov_iter *i)
416 { 416 {
417 size_t copied = 0; 417 size_t copied = 0;
418 size_t total_copied = 0; 418 size_t total_copied = 0;
419 int pg = 0; 419 int pg = 0;
420 int offset = pos & (PAGE_CACHE_SIZE - 1); 420 int offset = pos & (PAGE_CACHE_SIZE - 1);
421 421
422 while (write_bytes > 0) { 422 while (write_bytes > 0) {
423 size_t count = min_t(size_t, 423 size_t count = min_t(size_t,
424 PAGE_CACHE_SIZE - offset, write_bytes); 424 PAGE_CACHE_SIZE - offset, write_bytes);
425 struct page *page = prepared_pages[pg]; 425 struct page *page = prepared_pages[pg];
426 /* 426 /*
427 * Copy data from userspace to the current page 427 * Copy data from userspace to the current page
428 */ 428 */
429 copied = iov_iter_copy_from_user_atomic(page, i, offset, count); 429 copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
430 430
431 /* Flush processor's dcache for this page */ 431 /* Flush processor's dcache for this page */
432 flush_dcache_page(page); 432 flush_dcache_page(page);
433 433
434 /* 434 /*
435 * if we get a partial write, we can end up with 435 * if we get a partial write, we can end up with
436 * partially up to date pages. These add 436 * partially up to date pages. These add
437 * a lot of complexity, so make sure they don't 437 * a lot of complexity, so make sure they don't
438 * happen by forcing this copy to be retried. 438 * happen by forcing this copy to be retried.
439 * 439 *
440 * The rest of the btrfs_file_write code will fall 440 * The rest of the btrfs_file_write code will fall
441 * back to page at a time copies after we return 0. 441 * back to page at a time copies after we return 0.
442 */ 442 */
443 if (!PageUptodate(page) && copied < count) 443 if (!PageUptodate(page) && copied < count)
444 copied = 0; 444 copied = 0;
445 445
446 iov_iter_advance(i, copied); 446 iov_iter_advance(i, copied);
447 write_bytes -= copied; 447 write_bytes -= copied;
448 total_copied += copied; 448 total_copied += copied;
449 449
450 /* Return to btrfs_file_aio_write to fault page */ 450 /* Return to btrfs_file_aio_write to fault page */
451 if (unlikely(copied == 0)) 451 if (unlikely(copied == 0))
452 break; 452 break;
453 453
454 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { 454 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
455 offset += copied; 455 offset += copied;
456 } else { 456 } else {
457 pg++; 457 pg++;
458 offset = 0; 458 offset = 0;
459 } 459 }
460 } 460 }
461 return total_copied; 461 return total_copied;
462 } 462 }
463 463
464 /* 464 /*
465 * unlocks pages after btrfs_file_write is done with them 465 * unlocks pages after btrfs_file_write is done with them
466 */ 466 */
467 static void btrfs_drop_pages(struct page **pages, size_t num_pages) 467 static void btrfs_drop_pages(struct page **pages, size_t num_pages)
468 { 468 {
469 size_t i; 469 size_t i;
470 for (i = 0; i < num_pages; i++) { 470 for (i = 0; i < num_pages; i++) {
471 /* page checked is some magic around finding pages that 471 /* page checked is some magic around finding pages that
472 * have been modified without going through btrfs_set_page_dirty 472 * have been modified without going through btrfs_set_page_dirty
473 * clear it here 473 * clear it here
474 */ 474 */
475 ClearPageChecked(pages[i]); 475 ClearPageChecked(pages[i]);
476 unlock_page(pages[i]); 476 unlock_page(pages[i]);
477 mark_page_accessed(pages[i]); 477 mark_page_accessed(pages[i]);
478 page_cache_release(pages[i]); 478 page_cache_release(pages[i]);
479 } 479 }
480 } 480 }
481 481
482 /* 482 /*
483 * after copy_from_user, pages need to be dirtied and we need to make 483 * after copy_from_user, pages need to be dirtied and we need to make
484 * sure holes are created between the current EOF and the start of 484 * sure holes are created between the current EOF and the start of
485 * any next extents (if required). 485 * any next extents (if required).
486 * 486 *
487 * this also makes the decision about creating an inline extent vs 487 * this also makes the decision about creating an inline extent vs
488 * doing real data extents, marking pages dirty and delalloc as required. 488 * doing real data extents, marking pages dirty and delalloc as required.
489 */ 489 */
490 int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, 490 int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
491 struct page **pages, size_t num_pages, 491 struct page **pages, size_t num_pages,
492 loff_t pos, size_t write_bytes, 492 loff_t pos, size_t write_bytes,
493 struct extent_state **cached) 493 struct extent_state **cached)
494 { 494 {
495 int err = 0; 495 int err = 0;
496 int i; 496 int i;
497 u64 num_bytes; 497 u64 num_bytes;
498 u64 start_pos; 498 u64 start_pos;
499 u64 end_of_last_block; 499 u64 end_of_last_block;
500 u64 end_pos = pos + write_bytes; 500 u64 end_pos = pos + write_bytes;
501 loff_t isize = i_size_read(inode); 501 loff_t isize = i_size_read(inode);
502 502
503 start_pos = pos & ~((u64)root->sectorsize - 1); 503 start_pos = pos & ~((u64)root->sectorsize - 1);
504 num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize); 504 num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize);
505 505
506 end_of_last_block = start_pos + num_bytes - 1; 506 end_of_last_block = start_pos + num_bytes - 1;
507 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 507 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
508 cached); 508 cached);
509 if (err) 509 if (err)
510 return err; 510 return err;
511 511
512 for (i = 0; i < num_pages; i++) { 512 for (i = 0; i < num_pages; i++) {
513 struct page *p = pages[i]; 513 struct page *p = pages[i];
514 SetPageUptodate(p); 514 SetPageUptodate(p);
515 ClearPageChecked(p); 515 ClearPageChecked(p);
516 set_page_dirty(p); 516 set_page_dirty(p);
517 } 517 }
518 518
519 /* 519 /*
520 * we've only changed i_size in ram, and we haven't updated 520 * we've only changed i_size in ram, and we haven't updated
521 * the disk i_size. There is no need to log the inode 521 * the disk i_size. There is no need to log the inode
522 * at this time. 522 * at this time.
523 */ 523 */
524 if (end_pos > isize) 524 if (end_pos > isize)
525 i_size_write(inode, end_pos); 525 i_size_write(inode, end_pos);
526 return 0; 526 return 0;
527 } 527 }
528 528
529 /* 529 /*
530 * this drops all the extents in the cache that intersect the range 530 * this drops all the extents in the cache that intersect the range
531 * [start, end]. Existing extents are split as required. 531 * [start, end]. Existing extents are split as required.
532 */ 532 */
533 void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 533 void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
534 int skip_pinned) 534 int skip_pinned)
535 { 535 {
536 struct extent_map *em; 536 struct extent_map *em;
537 struct extent_map *split = NULL; 537 struct extent_map *split = NULL;
538 struct extent_map *split2 = NULL; 538 struct extent_map *split2 = NULL;
539 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 539 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
540 u64 len = end - start + 1; 540 u64 len = end - start + 1;
541 u64 gen; 541 u64 gen;
542 int ret; 542 int ret;
543 int testend = 1; 543 int testend = 1;
544 unsigned long flags; 544 unsigned long flags;
545 int compressed = 0; 545 int compressed = 0;
546 bool modified; 546 bool modified;
547 547
548 WARN_ON(end < start); 548 WARN_ON(end < start);
549 if (end == (u64)-1) { 549 if (end == (u64)-1) {
550 len = (u64)-1; 550 len = (u64)-1;
551 testend = 0; 551 testend = 0;
552 } 552 }
553 while (1) { 553 while (1) {
554 int no_splits = 0; 554 int no_splits = 0;
555 555
556 modified = false; 556 modified = false;
557 if (!split) 557 if (!split)
558 split = alloc_extent_map(); 558 split = alloc_extent_map();
559 if (!split2) 559 if (!split2)
560 split2 = alloc_extent_map(); 560 split2 = alloc_extent_map();
561 if (!split || !split2) 561 if (!split || !split2)
562 no_splits = 1; 562 no_splits = 1;
563 563
564 write_lock(&em_tree->lock); 564 write_lock(&em_tree->lock);
565 em = lookup_extent_mapping(em_tree, start, len); 565 em = lookup_extent_mapping(em_tree, start, len);
566 if (!em) { 566 if (!em) {
567 write_unlock(&em_tree->lock); 567 write_unlock(&em_tree->lock);
568 break; 568 break;
569 } 569 }
570 flags = em->flags; 570 flags = em->flags;
571 gen = em->generation; 571 gen = em->generation;
572 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { 572 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
573 if (testend && em->start + em->len >= start + len) { 573 if (testend && em->start + em->len >= start + len) {
574 free_extent_map(em); 574 free_extent_map(em);
575 write_unlock(&em_tree->lock); 575 write_unlock(&em_tree->lock);
576 break; 576 break;
577 } 577 }
578 start = em->start + em->len; 578 start = em->start + em->len;
579 if (testend) 579 if (testend)
580 len = start + len - (em->start + em->len); 580 len = start + len - (em->start + em->len);
581 free_extent_map(em); 581 free_extent_map(em);
582 write_unlock(&em_tree->lock); 582 write_unlock(&em_tree->lock);
583 continue; 583 continue;
584 } 584 }
585 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 585 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
586 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 586 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
587 clear_bit(EXTENT_FLAG_LOGGING, &flags); 587 clear_bit(EXTENT_FLAG_LOGGING, &flags);
588 modified = !list_empty(&em->list); 588 modified = !list_empty(&em->list);
589 if (no_splits) 589 if (no_splits)
590 goto next; 590 goto next;
591 591
592 if (em->start < start) { 592 if (em->start < start) {
593 split->start = em->start; 593 split->start = em->start;
594 split->len = start - em->start; 594 split->len = start - em->start;
595 595
596 if (em->block_start < EXTENT_MAP_LAST_BYTE) { 596 if (em->block_start < EXTENT_MAP_LAST_BYTE) {
597 split->orig_start = em->orig_start; 597 split->orig_start = em->orig_start;
598 split->block_start = em->block_start; 598 split->block_start = em->block_start;
599 599
600 if (compressed) 600 if (compressed)
601 split->block_len = em->block_len; 601 split->block_len = em->block_len;
602 else 602 else
603 split->block_len = split->len; 603 split->block_len = split->len;
604 split->orig_block_len = max(split->block_len, 604 split->orig_block_len = max(split->block_len,
605 em->orig_block_len); 605 em->orig_block_len);
606 split->ram_bytes = em->ram_bytes; 606 split->ram_bytes = em->ram_bytes;
607 } else { 607 } else {
608 split->orig_start = split->start; 608 split->orig_start = split->start;
609 split->block_len = 0; 609 split->block_len = 0;
610 split->block_start = em->block_start; 610 split->block_start = em->block_start;
611 split->orig_block_len = 0; 611 split->orig_block_len = 0;
612 split->ram_bytes = split->len; 612 split->ram_bytes = split->len;
613 } 613 }
614 614
615 split->generation = gen; 615 split->generation = gen;
616 split->bdev = em->bdev; 616 split->bdev = em->bdev;
617 split->flags = flags; 617 split->flags = flags;
618 split->compress_type = em->compress_type; 618 split->compress_type = em->compress_type;
619 replace_extent_mapping(em_tree, em, split, modified); 619 replace_extent_mapping(em_tree, em, split, modified);
620 free_extent_map(split); 620 free_extent_map(split);
621 split = split2; 621 split = split2;
622 split2 = NULL; 622 split2 = NULL;
623 } 623 }
624 if (testend && em->start + em->len > start + len) { 624 if (testend && em->start + em->len > start + len) {
625 u64 diff = start + len - em->start; 625 u64 diff = start + len - em->start;
626 626
627 split->start = start + len; 627 split->start = start + len;
628 split->len = em->start + em->len - (start + len); 628 split->len = em->start + em->len - (start + len);
629 split->bdev = em->bdev; 629 split->bdev = em->bdev;
630 split->flags = flags; 630 split->flags = flags;
631 split->compress_type = em->compress_type; 631 split->compress_type = em->compress_type;
632 split->generation = gen; 632 split->generation = gen;
633 633
634 if (em->block_start < EXTENT_MAP_LAST_BYTE) { 634 if (em->block_start < EXTENT_MAP_LAST_BYTE) {
635 split->orig_block_len = max(em->block_len, 635 split->orig_block_len = max(em->block_len,
636 em->orig_block_len); 636 em->orig_block_len);
637 637
638 split->ram_bytes = em->ram_bytes; 638 split->ram_bytes = em->ram_bytes;
639 if (compressed) { 639 if (compressed) {
640 split->block_len = em->block_len; 640 split->block_len = em->block_len;
641 split->block_start = em->block_start; 641 split->block_start = em->block_start;
642 split->orig_start = em->orig_start; 642 split->orig_start = em->orig_start;
643 } else { 643 } else {
644 split->block_len = split->len; 644 split->block_len = split->len;
645 split->block_start = em->block_start 645 split->block_start = em->block_start
646 + diff; 646 + diff;
647 split->orig_start = em->orig_start; 647 split->orig_start = em->orig_start;
648 } 648 }
649 } else { 649 } else {
650 split->ram_bytes = split->len; 650 split->ram_bytes = split->len;
651 split->orig_start = split->start; 651 split->orig_start = split->start;
652 split->block_len = 0; 652 split->block_len = 0;
653 split->block_start = em->block_start; 653 split->block_start = em->block_start;
654 split->orig_block_len = 0; 654 split->orig_block_len = 0;
655 } 655 }
656 656
657 if (extent_map_in_tree(em)) { 657 if (extent_map_in_tree(em)) {
658 replace_extent_mapping(em_tree, em, split, 658 replace_extent_mapping(em_tree, em, split,
659 modified); 659 modified);
660 } else { 660 } else {
661 ret = add_extent_mapping(em_tree, split, 661 ret = add_extent_mapping(em_tree, split,
662 modified); 662 modified);
663 ASSERT(ret == 0); /* Logic error */ 663 ASSERT(ret == 0); /* Logic error */
664 } 664 }
665 free_extent_map(split); 665 free_extent_map(split);
666 split = NULL; 666 split = NULL;
667 } 667 }
668 next: 668 next:
669 if (extent_map_in_tree(em)) 669 if (extent_map_in_tree(em))
670 remove_extent_mapping(em_tree, em); 670 remove_extent_mapping(em_tree, em);
671 write_unlock(&em_tree->lock); 671 write_unlock(&em_tree->lock);
672 672
673 /* once for us */ 673 /* once for us */
674 free_extent_map(em); 674 free_extent_map(em);
675 /* once for the tree*/ 675 /* once for the tree*/
676 free_extent_map(em); 676 free_extent_map(em);
677 } 677 }
678 if (split) 678 if (split)
679 free_extent_map(split); 679 free_extent_map(split);
680 if (split2) 680 if (split2)
681 free_extent_map(split2); 681 free_extent_map(split2);
682 } 682 }
683 683
684 /* 684 /*
685 * this is very complex, but the basic idea is to drop all extents 685 * this is very complex, but the basic idea is to drop all extents
686 * in the range start - end. hint_block is filled in with a block number 686 * in the range start - end. hint_block is filled in with a block number
687 * that would be a good hint to the block allocator for this file. 687 * that would be a good hint to the block allocator for this file.
688 * 688 *
689 * If an extent intersects the range but is not entirely inside the range 689 * If an extent intersects the range but is not entirely inside the range
690 * it is either truncated or split. Anything entirely inside the range 690 * it is either truncated or split. Anything entirely inside the range
691 * is deleted from the tree. 691 * is deleted from the tree.
692 */ 692 */
693 int __btrfs_drop_extents(struct btrfs_trans_handle *trans, 693 int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
694 struct btrfs_root *root, struct inode *inode, 694 struct btrfs_root *root, struct inode *inode,
695 struct btrfs_path *path, u64 start, u64 end, 695 struct btrfs_path *path, u64 start, u64 end,
696 u64 *drop_end, int drop_cache, 696 u64 *drop_end, int drop_cache,
697 int replace_extent, 697 int replace_extent,
698 u32 extent_item_size, 698 u32 extent_item_size,
699 int *key_inserted) 699 int *key_inserted)
700 { 700 {
701 struct extent_buffer *leaf; 701 struct extent_buffer *leaf;
702 struct btrfs_file_extent_item *fi; 702 struct btrfs_file_extent_item *fi;
703 struct btrfs_key key; 703 struct btrfs_key key;
704 struct btrfs_key new_key; 704 struct btrfs_key new_key;
705 u64 ino = btrfs_ino(inode); 705 u64 ino = btrfs_ino(inode);
706 u64 search_start = start; 706 u64 search_start = start;
707 u64 disk_bytenr = 0; 707 u64 disk_bytenr = 0;
708 u64 num_bytes = 0; 708 u64 num_bytes = 0;
709 u64 extent_offset = 0; 709 u64 extent_offset = 0;
710 u64 extent_end = 0; 710 u64 extent_end = 0;
711 int del_nr = 0; 711 int del_nr = 0;
712 int del_slot = 0; 712 int del_slot = 0;
713 int extent_type; 713 int extent_type;
714 int recow; 714 int recow;
715 int ret; 715 int ret;
716 int modify_tree = -1; 716 int modify_tree = -1;
717 int update_refs = (root->ref_cows || root == root->fs_info->tree_root); 717 int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
718 int found = 0; 718 int found = 0;
719 int leafs_visited = 0; 719 int leafs_visited = 0;
720 720
721 if (drop_cache) 721 if (drop_cache)
722 btrfs_drop_extent_cache(inode, start, end - 1, 0); 722 btrfs_drop_extent_cache(inode, start, end - 1, 0);
723 723
724 if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent) 724 if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
725 modify_tree = 0; 725 modify_tree = 0;
726 726
727 while (1) { 727 while (1) {
728 recow = 0; 728 recow = 0;
729 ret = btrfs_lookup_file_extent(trans, root, path, ino, 729 ret = btrfs_lookup_file_extent(trans, root, path, ino,
730 search_start, modify_tree); 730 search_start, modify_tree);
731 if (ret < 0) 731 if (ret < 0)
732 break; 732 break;
733 if (ret > 0 && path->slots[0] > 0 && search_start == start) { 733 if (ret > 0 && path->slots[0] > 0 && search_start == start) {
734 leaf = path->nodes[0]; 734 leaf = path->nodes[0];
735 btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); 735 btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
736 if (key.objectid == ino && 736 if (key.objectid == ino &&
737 key.type == BTRFS_EXTENT_DATA_KEY) 737 key.type == BTRFS_EXTENT_DATA_KEY)
738 path->slots[0]--; 738 path->slots[0]--;
739 } 739 }
740 ret = 0; 740 ret = 0;
741 leafs_visited++; 741 leafs_visited++;
742 next_slot: 742 next_slot:
743 leaf = path->nodes[0]; 743 leaf = path->nodes[0];
744 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 744 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
745 BUG_ON(del_nr > 0); 745 BUG_ON(del_nr > 0);
746 ret = btrfs_next_leaf(root, path); 746 ret = btrfs_next_leaf(root, path);
747 if (ret < 0) 747 if (ret < 0)
748 break; 748 break;
749 if (ret > 0) { 749 if (ret > 0) {
750 ret = 0; 750 ret = 0;
751 break; 751 break;
752 } 752 }
753 leafs_visited++; 753 leafs_visited++;
754 leaf = path->nodes[0]; 754 leaf = path->nodes[0];
755 recow = 1; 755 recow = 1;
756 } 756 }
757 757
758 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 758 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
759 if (key.objectid > ino || 759 if (key.objectid > ino ||
760 key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) 760 key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
761 break; 761 break;
762 762
763 fi = btrfs_item_ptr(leaf, path->slots[0], 763 fi = btrfs_item_ptr(leaf, path->slots[0],
764 struct btrfs_file_extent_item); 764 struct btrfs_file_extent_item);
765 extent_type = btrfs_file_extent_type(leaf, fi); 765 extent_type = btrfs_file_extent_type(leaf, fi);
766 766
767 if (extent_type == BTRFS_FILE_EXTENT_REG || 767 if (extent_type == BTRFS_FILE_EXTENT_REG ||
768 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 768 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
769 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 769 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
770 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 770 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
771 extent_offset = btrfs_file_extent_offset(leaf, fi); 771 extent_offset = btrfs_file_extent_offset(leaf, fi);
772 extent_end = key.offset + 772 extent_end = key.offset +
773 btrfs_file_extent_num_bytes(leaf, fi); 773 btrfs_file_extent_num_bytes(leaf, fi);
774 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 774 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
775 extent_end = key.offset + 775 extent_end = key.offset +
776 btrfs_file_extent_inline_len(leaf, 776 btrfs_file_extent_inline_len(leaf,
777 path->slots[0], fi); 777 path->slots[0], fi);
778 } else { 778 } else {
779 WARN_ON(1); 779 WARN_ON(1);
780 extent_end = search_start; 780 extent_end = search_start;
781 } 781 }
782 782
783 /* 783 /*
784 * Don't skip extent items representing 0 byte lengths. They 784 * Don't skip extent items representing 0 byte lengths. They
785 * used to be created (bug) if while punching holes we hit 785 * used to be created (bug) if while punching holes we hit
786 * -ENOSPC condition. So if we find one here, just ensure we 786 * -ENOSPC condition. So if we find one here, just ensure we
787 * delete it, otherwise we would insert a new file extent item 787 * delete it, otherwise we would insert a new file extent item
788 * with the same key (offset) as that 0 bytes length file 788 * with the same key (offset) as that 0 bytes length file
789 * extent item in the call to setup_items_for_insert() later 789 * extent item in the call to setup_items_for_insert() later
790 * in this function. 790 * in this function.
791 */ 791 */
792 if (extent_end == key.offset && extent_end >= search_start) 792 if (extent_end == key.offset && extent_end >= search_start)
793 goto delete_extent_item; 793 goto delete_extent_item;
794 794
795 if (extent_end <= search_start) { 795 if (extent_end <= search_start) {
796 path->slots[0]++; 796 path->slots[0]++;
797 goto next_slot; 797 goto next_slot;
798 } 798 }
799 799
800 found = 1; 800 found = 1;
801 search_start = max(key.offset, start); 801 search_start = max(key.offset, start);
802 if (recow || !modify_tree) { 802 if (recow || !modify_tree) {
803 modify_tree = -1; 803 modify_tree = -1;
804 btrfs_release_path(path); 804 btrfs_release_path(path);
805 continue; 805 continue;
806 } 806 }
807 807
808 /* 808 /*
809 * | - range to drop - | 809 * | - range to drop - |
810 * | -------- extent -------- | 810 * | -------- extent -------- |
811 */ 811 */
812 if (start > key.offset && end < extent_end) { 812 if (start > key.offset && end < extent_end) {
813 BUG_ON(del_nr > 0); 813 BUG_ON(del_nr > 0);
814 if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 814 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
815 ret = -EOPNOTSUPP; 815 ret = -EOPNOTSUPP;
816 break; 816 break;
817 } 817 }
818 818
819 memcpy(&new_key, &key, sizeof(new_key)); 819 memcpy(&new_key, &key, sizeof(new_key));
820 new_key.offset = start; 820 new_key.offset = start;
821 ret = btrfs_duplicate_item(trans, root, path, 821 ret = btrfs_duplicate_item(trans, root, path,
822 &new_key); 822 &new_key);
823 if (ret == -EAGAIN) { 823 if (ret == -EAGAIN) {
824 btrfs_release_path(path); 824 btrfs_release_path(path);
825 continue; 825 continue;
826 } 826 }
827 if (ret < 0) 827 if (ret < 0)
828 break; 828 break;
829 829
830 leaf = path->nodes[0]; 830 leaf = path->nodes[0];
831 fi = btrfs_item_ptr(leaf, path->slots[0] - 1, 831 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
832 struct btrfs_file_extent_item); 832 struct btrfs_file_extent_item);
833 btrfs_set_file_extent_num_bytes(leaf, fi, 833 btrfs_set_file_extent_num_bytes(leaf, fi,
834 start - key.offset); 834 start - key.offset);
835 835
836 fi = btrfs_item_ptr(leaf, path->slots[0], 836 fi = btrfs_item_ptr(leaf, path->slots[0],
837 struct btrfs_file_extent_item); 837 struct btrfs_file_extent_item);
838 838
839 extent_offset += start - key.offset; 839 extent_offset += start - key.offset;
840 btrfs_set_file_extent_offset(leaf, fi, extent_offset); 840 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
841 btrfs_set_file_extent_num_bytes(leaf, fi, 841 btrfs_set_file_extent_num_bytes(leaf, fi,
842 extent_end - start); 842 extent_end - start);
843 btrfs_mark_buffer_dirty(leaf); 843 btrfs_mark_buffer_dirty(leaf);
844 844
845 if (update_refs && disk_bytenr > 0) { 845 if (update_refs && disk_bytenr > 0) {
846 ret = btrfs_inc_extent_ref(trans, root, 846 ret = btrfs_inc_extent_ref(trans, root,
847 disk_bytenr, num_bytes, 0, 847 disk_bytenr, num_bytes, 0,
848 root->root_key.objectid, 848 root->root_key.objectid,
849 new_key.objectid, 849 new_key.objectid,
850 start - extent_offset, 0); 850 start - extent_offset, 0);
851 BUG_ON(ret); /* -ENOMEM */ 851 BUG_ON(ret); /* -ENOMEM */
852 } 852 }
853 key.offset = start; 853 key.offset = start;
854 } 854 }
855 /* 855 /*
856 * | ---- range to drop ----- | 856 * | ---- range to drop ----- |
857 * | -------- extent -------- | 857 * | -------- extent -------- |
858 */ 858 */
859 if (start <= key.offset && end < extent_end) { 859 if (start <= key.offset && end < extent_end) {
860 if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 860 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
861 ret = -EOPNOTSUPP; 861 ret = -EOPNOTSUPP;
862 break; 862 break;
863 } 863 }
864 864
865 memcpy(&new_key, &key, sizeof(new_key)); 865 memcpy(&new_key, &key, sizeof(new_key));
866 new_key.offset = end; 866 new_key.offset = end;
867 btrfs_set_item_key_safe(root, path, &new_key); 867 btrfs_set_item_key_safe(root, path, &new_key);
868 868
869 extent_offset += end - key.offset; 869 extent_offset += end - key.offset;
870 btrfs_set_file_extent_offset(leaf, fi, extent_offset); 870 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
871 btrfs_set_file_extent_num_bytes(leaf, fi, 871 btrfs_set_file_extent_num_bytes(leaf, fi,
872 extent_end - end); 872 extent_end - end);
873 btrfs_mark_buffer_dirty(leaf); 873 btrfs_mark_buffer_dirty(leaf);
874 if (update_refs && disk_bytenr > 0) 874 if (update_refs && disk_bytenr > 0)
875 inode_sub_bytes(inode, end - key.offset); 875 inode_sub_bytes(inode, end - key.offset);
876 break; 876 break;
877 } 877 }
878 878
879 search_start = extent_end; 879 search_start = extent_end;
880 /* 880 /*
881 * | ---- range to drop ----- | 881 * | ---- range to drop ----- |
882 * | -------- extent -------- | 882 * | -------- extent -------- |
883 */ 883 */
884 if (start > key.offset && end >= extent_end) { 884 if (start > key.offset && end >= extent_end) {
885 BUG_ON(del_nr > 0); 885 BUG_ON(del_nr > 0);
886 if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 886 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
887 ret = -EOPNOTSUPP; 887 ret = -EOPNOTSUPP;
888 break; 888 break;
889 } 889 }
890 890
891 btrfs_set_file_extent_num_bytes(leaf, fi, 891 btrfs_set_file_extent_num_bytes(leaf, fi,
892 start - key.offset); 892 start - key.offset);
893 btrfs_mark_buffer_dirty(leaf); 893 btrfs_mark_buffer_dirty(leaf);
894 if (update_refs && disk_bytenr > 0) 894 if (update_refs && disk_bytenr > 0)
895 inode_sub_bytes(inode, extent_end - start); 895 inode_sub_bytes(inode, extent_end - start);
896 if (end == extent_end) 896 if (end == extent_end)
897 break; 897 break;
898 898
899 path->slots[0]++; 899 path->slots[0]++;
900 goto next_slot; 900 goto next_slot;
901 } 901 }
902 902
903 /* 903 /*
904 * | ---- range to drop ----- | 904 * | ---- range to drop ----- |
905 * | ------ extent ------ | 905 * | ------ extent ------ |
906 */ 906 */
907 if (start <= key.offset && end >= extent_end) { 907 if (start <= key.offset && end >= extent_end) {
908 delete_extent_item: 908 delete_extent_item:
909 if (del_nr == 0) { 909 if (del_nr == 0) {
910 del_slot = path->slots[0]; 910 del_slot = path->slots[0];
911 del_nr = 1; 911 del_nr = 1;
912 } else { 912 } else {
913 BUG_ON(del_slot + del_nr != path->slots[0]); 913 BUG_ON(del_slot + del_nr != path->slots[0]);
914 del_nr++; 914 del_nr++;
915 } 915 }
916 916
917 if (update_refs && 917 if (update_refs &&
918 extent_type == BTRFS_FILE_EXTENT_INLINE) { 918 extent_type == BTRFS_FILE_EXTENT_INLINE) {
919 inode_sub_bytes(inode, 919 inode_sub_bytes(inode,
920 extent_end - key.offset); 920 extent_end - key.offset);
921 extent_end = ALIGN(extent_end, 921 extent_end = ALIGN(extent_end,
922 root->sectorsize); 922 root->sectorsize);
923 } else if (update_refs && disk_bytenr > 0) { 923 } else if (update_refs && disk_bytenr > 0) {
924 ret = btrfs_free_extent(trans, root, 924 ret = btrfs_free_extent(trans, root,
925 disk_bytenr, num_bytes, 0, 925 disk_bytenr, num_bytes, 0,
926 root->root_key.objectid, 926 root->root_key.objectid,
927 key.objectid, key.offset - 927 key.objectid, key.offset -
928 extent_offset, 0); 928 extent_offset, 0);
929 BUG_ON(ret); /* -ENOMEM */ 929 BUG_ON(ret); /* -ENOMEM */
930 inode_sub_bytes(inode, 930 inode_sub_bytes(inode,
931 extent_end - key.offset); 931 extent_end - key.offset);
932 } 932 }
933 933
934 if (end == extent_end) 934 if (end == extent_end)
935 break; 935 break;
936 936
937 if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) { 937 if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
938 path->slots[0]++; 938 path->slots[0]++;
939 goto next_slot; 939 goto next_slot;
940 } 940 }
941 941
942 ret = btrfs_del_items(trans, root, path, del_slot, 942 ret = btrfs_del_items(trans, root, path, del_slot,
943 del_nr); 943 del_nr);
944 if (ret) { 944 if (ret) {
945 btrfs_abort_transaction(trans, root, ret); 945 btrfs_abort_transaction(trans, root, ret);
946 break; 946 break;
947 } 947 }
948 948
949 del_nr = 0; 949 del_nr = 0;
950 del_slot = 0; 950 del_slot = 0;
951 951
952 btrfs_release_path(path); 952 btrfs_release_path(path);
953 continue; 953 continue;
954 } 954 }
955 955
956 BUG_ON(1); 956 BUG_ON(1);
957 } 957 }
958 958
959 if (!ret && del_nr > 0) { 959 if (!ret && del_nr > 0) {
960 /* 960 /*
961 * Set path->slots[0] to first slot, so that after the delete 961 * Set path->slots[0] to first slot, so that after the delete
962 * if items are move off from our leaf to its immediate left or 962 * if items are move off from our leaf to its immediate left or
963 * right neighbor leafs, we end up with a correct and adjusted 963 * right neighbor leafs, we end up with a correct and adjusted
964 * path->slots[0] for our insertion (if replace_extent != 0). 964 * path->slots[0] for our insertion (if replace_extent != 0).
965 */ 965 */
966 path->slots[0] = del_slot; 966 path->slots[0] = del_slot;
967 ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 967 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
968 if (ret) 968 if (ret)
969 btrfs_abort_transaction(trans, root, ret); 969 btrfs_abort_transaction(trans, root, ret);
970 } 970 }
971 971
972 leaf = path->nodes[0]; 972 leaf = path->nodes[0];
973 /* 973 /*
974 * If btrfs_del_items() was called, it might have deleted a leaf, in 974 * If btrfs_del_items() was called, it might have deleted a leaf, in
975 * which case it unlocked our path, so check path->locks[0] matches a 975 * which case it unlocked our path, so check path->locks[0] matches a
976 * write lock. 976 * write lock.
977 */ 977 */
978 if (!ret && replace_extent && leafs_visited == 1 && 978 if (!ret && replace_extent && leafs_visited == 1 &&
979 (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING || 979 (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
980 path->locks[0] == BTRFS_WRITE_LOCK) && 980 path->locks[0] == BTRFS_WRITE_LOCK) &&
981 btrfs_leaf_free_space(root, leaf) >= 981 btrfs_leaf_free_space(root, leaf) >=
982 sizeof(struct btrfs_item) + extent_item_size) { 982 sizeof(struct btrfs_item) + extent_item_size) {
983 983
984 key.objectid = ino; 984 key.objectid = ino;
985 key.type = BTRFS_EXTENT_DATA_KEY; 985 key.type = BTRFS_EXTENT_DATA_KEY;
986 key.offset = start; 986 key.offset = start;
987 if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) { 987 if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
988 struct btrfs_key slot_key; 988 struct btrfs_key slot_key;
989 989
990 btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]); 990 btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
991 if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) 991 if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
992 path->slots[0]++; 992 path->slots[0]++;
993 } 993 }
994 setup_items_for_insert(root, path, &key, 994 setup_items_for_insert(root, path, &key,
995 &extent_item_size, 995 &extent_item_size,
996 extent_item_size, 996 extent_item_size,
997 sizeof(struct btrfs_item) + 997 sizeof(struct btrfs_item) +
998 extent_item_size, 1); 998 extent_item_size, 1);
999 *key_inserted = 1; 999 *key_inserted = 1;
1000 } 1000 }
1001 1001
1002 if (!replace_extent || !(*key_inserted)) 1002 if (!replace_extent || !(*key_inserted))
1003 btrfs_release_path(path); 1003 btrfs_release_path(path);
1004 if (drop_end) 1004 if (drop_end)
1005 *drop_end = found ? min(end, extent_end) : end; 1005 *drop_end = found ? min(end, extent_end) : end;
1006 return ret; 1006 return ret;
1007 } 1007 }

int btrfs_drop_extents(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct inode *inode, u64 start,
		       u64 end, int drop_cache)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
				   drop_cache, 0, 0, NULL);
	btrfs_free_path(path);
	return ret;
}

static int extent_mergeable(struct extent_buffer *leaf, int slot,
			    u64 objectid, u64 bytenr, u64 orig_offset,
			    u64 *start, u64 *end)
{
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 extent_end;

	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
		return 0;

	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
		return 0;

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
	    btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		return 0;

	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if ((*start && *start != key.offset) || (*end && *end != extent_end))
		return 0;

	*start = key.offset;
	*end = extent_end;
	return 1;
}
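
/*
 * A minimal usage sketch (mirroring the callers below): seed *start/*end
 * with the boundary to merge across, and on success read the neighbor's
 * range back out through the same pointers.
 *
 *	u64 other_start = 0, other_end = start;
 *
 *	if (extent_mergeable(leaf, path->slots[0] - 1, ino, bytenr,
 *			     orig_offset, &other_start, &other_end)) {
 *		// left neighbor is a regular, uncompressed extent backed
 *		// by the same disk extent and ending exactly at 'start'
 *	}
 */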

/*
 * Mark extent in the range start - end as written.
 *
 * This changes extent type from 'pre-allocated' to 'regular'. If only
 * part of the extent is marked as written, the extent will be split into
 * two or three.
 */
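/*
 * Pictorially, the cases handled below (for a prealloc extent spanning
 * [key.offset, extent_end)):
 *
 *	start == key.offset && end <  extent_end: [written |prealloc]
 *	start >  key.offset && end == extent_end: [prealloc| written]
 *	start >  key.offset && end <  extent_end: [prealloc|written|prealloc]
 *
 * Split pieces are carved out with btrfs_duplicate_item(), and neighbors
 * that share the same backing disk extent are merged back together via
 * extent_mergeable().
 */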
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
			      struct inode *inode, u64 start, u64 end)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 bytenr;
	u64 num_bytes;
	u64 extent_end;
	u64 orig_offset;
	u64 other_start;
	u64 other_end;
	u64 split;
	int del_nr = 0;
	int del_slot = 0;
	int recow;
	int ret;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
again:
	recow = 0;
	split = start;
	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = split;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY);
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	BUG_ON(btrfs_file_extent_type(leaf, fi) !=
	       BTRFS_FILE_EXTENT_PREALLOC);
	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	BUG_ON(key.offset > start || extent_end < end);

	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
	memcpy(&new_key, &key, sizeof(new_key));

	if (start == key.offset && end < extent_end) {
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			new_key.offset = end;
			btrfs_set_item_key_safe(root, path, &new_key);
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_set_file_extent_offset(leaf, fi,
						     end - orig_offset);
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							end - other_start);
			btrfs_mark_buffer_dirty(leaf);
			goto out;
		}
	}

	if (start > key.offset && end == extent_end) {
		other_start = end;
		other_end = 0;
		if (extent_mergeable(leaf, path->slots[0] + 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			path->slots[0]++;
			new_key.offset = start;
			btrfs_set_item_key_safe(root, path, &new_key);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							other_end - start);
			btrfs_set_file_extent_offset(leaf, fi,
						     start - orig_offset);
			btrfs_mark_buffer_dirty(leaf);
			goto out;
		}
	}

	while (start > key.offset || end < extent_end) {
		if (key.offset == start)
			split = end;

		new_key.offset = split;
		ret = btrfs_duplicate_item(trans, root, path, &new_key);
		if (ret == -EAGAIN) {
			btrfs_release_path(path);
			goto again;
		}
		if (ret < 0) {
			btrfs_abort_transaction(trans, root, ret);
			goto out;
		}

		leaf = path->nodes[0];
		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						split - key.offset);

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);

		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - split);
		btrfs_mark_buffer_dirty(leaf);

		ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
					   root->root_key.objectid,
					   ino, orig_offset, 0);
		BUG_ON(ret); /* -ENOMEM */

		if (split == start) {
			key.offset = start;
		} else {
			BUG_ON(start != key.offset);
			path->slots[0]--;
			extent_end = end;
		}
		recow = 1;
	}
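
	/*
	 * At this point the item at path->slots[0] covers exactly
	 * [start, end). Try to merge it with its right, then its left
	 * neighbor; if the path went stale (recow), restart from 'again'
	 * with a fresh search.
	 */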
	other_start = end;
	other_end = 0;
	if (extent_mergeable(leaf, path->slots[0] + 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		extent_end = other_end;
		del_slot = path->slots[0] + 1;
		del_nr++;
		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
					0, root->root_key.objectid,
					ino, orig_offset, 0);
		BUG_ON(ret); /* -ENOMEM */
	}
	other_start = 0;
	other_end = start;
	if (extent_mergeable(leaf, path->slots[0] - 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		key.offset = other_start;
		del_slot = path->slots[0];
		del_nr++;
		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
					0, root->root_key.objectid,
					ino, orig_offset, 0);
		BUG_ON(ret); /* -ENOMEM */
	}
	if (del_nr == 0) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_mark_buffer_dirty(leaf);
	} else {
		fi = btrfs_item_ptr(leaf, del_slot - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - key.offset);
		btrfs_mark_buffer_dirty(leaf);

		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		if (ret < 0) {
			btrfs_abort_transaction(trans, root, ret);
			goto out;
		}
	}
out:
	btrfs_free_path(path);
	return 0;
}

/*
 * on error we return an unlocked page and the error value
 * on success we return a locked page and 0
 */
static int prepare_uptodate_page(struct page *page, u64 pos,
				 bool force_uptodate)
{
	int ret = 0;

	if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) &&
	    !PageUptodate(page)) {
		ret = btrfs_readpage(NULL, page);
		if (ret)
			return ret;
		lock_page(page);
		if (!PageUptodate(page)) {
			unlock_page(page);
			return -EIO;
		}
	}
	return 0;
}
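
/*
 * For example, with the common 4K PAGE_CACHE_SIZE: a write starting at
 * pos 8192 is page aligned, so no read is needed unless force_uptodate is
 * set; a write starting at pos 8200 only partially covers its first page,
 * which therefore has to be read in first (read-modify-write).
 */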

/*
 * this just gets pages into the page cache and locks them down.
 */
static noinline int prepare_pages(struct inode *inode, struct page **pages,
				  size_t num_pages, loff_t pos,
				  size_t write_bytes, bool force_uptodate)
{
	int i;
	unsigned long index = pos >> PAGE_CACHE_SHIFT;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	int err = 0;
	int faili;

	for (i = 0; i < num_pages; i++) {
		pages[i] = find_or_create_page(inode->i_mapping, index + i,
					       mask | __GFP_WRITE);
		if (!pages[i]) {
			faili = i - 1;
			err = -ENOMEM;
			goto fail;
		}

		if (i == 0)
			err = prepare_uptodate_page(pages[i], pos,
						    force_uptodate);
		if (i == num_pages - 1)
			err = prepare_uptodate_page(pages[i],
						    pos + write_bytes, false);
		if (err) {
			page_cache_release(pages[i]);
			faili = i - 1;
			goto fail;
		}
		wait_on_page_writeback(pages[i]);
	}

	return 0;
fail:
	while (faili >= 0) {
		unlock_page(pages[faili]);
		page_cache_release(pages[faili]);
		faili--;
	}
	return err;
}

/*
 * This function locks the extent and properly waits for data=ordered extents
 * to finish before allowing the pages to be modified if needed.
 *
 * The return value:
 * 1 - the extent is locked
 * 0 - the extent is not locked, and everything is OK
 * -EAGAIN - need to re-prepare the pages
 * any other < 0 value - something went wrong
 */
static noinline int
lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
				size_t num_pages, loff_t pos,
				u64 *lockstart, u64 *lockend,
				struct extent_state **cached_state)
{
	u64 start_pos;
	u64 last_pos;
	int i;
	int ret = 0;

	start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
	last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1;

	if (start_pos < inode->i_size) {
		struct btrfs_ordered_extent *ordered;
		lock_extent_bits(&BTRFS_I(inode)->io_tree,
				 start_pos, last_pos, 0, cached_state);
		ordered = btrfs_lookup_ordered_range(inode, start_pos,
						     last_pos - start_pos + 1);
		if (ordered &&
		    ordered->file_offset + ordered->len > start_pos &&
		    ordered->file_offset <= last_pos) {
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     start_pos, last_pos,
					     cached_state, GFP_NOFS);
			for (i = 0; i < num_pages; i++) {
				unlock_page(pages[i]);
				page_cache_release(pages[i]);
			}
			btrfs_start_ordered_extent(inode, ordered, 1);
			btrfs_put_ordered_extent(ordered);
			return -EAGAIN;
		}
		if (ordered)
			btrfs_put_ordered_extent(ordered);

		clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
				 last_pos, EXTENT_DIRTY | EXTENT_DELALLOC |
				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
				 0, 0, cached_state, GFP_NOFS);
		*lockstart = start_pos;
		*lockend = last_pos;
		ret = 1;
	}

	for (i = 0; i < num_pages; i++) {
		if (clear_page_dirty_for_io(pages[i]))
			account_page_redirty(pages[i]);
		set_page_extent_mapped(pages[i]);
		WARN_ON(!PageLocked(pages[i]));
	}

	return ret;
}

static noinline int check_can_nocow(struct inode *inode, loff_t pos,
				    size_t *write_bytes)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_ordered_extent *ordered;
	u64 lockstart, lockend;
	u64 num_bytes;
	int ret;

	ret = btrfs_start_nocow_write(root);
	if (!ret)
		return -ENOSPC;

	lockstart = round_down(pos, root->sectorsize);
	lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;

	while (1) {
		lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
		ordered = btrfs_lookup_ordered_range(inode, lockstart,
						     lockend - lockstart + 1);
		if (!ordered) {
			break;
		}
		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
	}

	num_bytes = lockend - lockstart + 1;
	ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
	if (ret <= 0) {
		ret = 0;
		btrfs_end_nocow_write(root);
	} else {
		*write_bytes = min_t(size_t, *write_bytes,
				     num_bytes - pos + lockstart);
	}

	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);

	return ret;
}
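
/*
 * A worked example of the clamp above, assuming 4K sectors: for pos = 6000
 * and *write_bytes = 10000, the locked range is [4096, 16383]. If
 * can_nocow_extent() reports that only num_bytes = 8192 of it can be written
 * in place, the nocow region ends at byte 12288, so *write_bytes is trimmed
 * to 8192 - 6000 + 4096 = 6288 bytes.
 */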

static noinline ssize_t __btrfs_buffered_write(struct file *file,
					       struct iov_iter *i,
					       loff_t pos)
{
	struct inode *inode = file_inode(file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct page **pages = NULL;
	struct extent_state *cached_state = NULL;
	u64 release_bytes = 0;
	u64 lockstart;
	u64 lockend;
	unsigned long first_index;
	size_t num_written = 0;
	int nrptrs;
	int ret = 0;
	bool only_release_metadata = false;
	bool force_page_uptodate = false;
	bool need_unlock;

	nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
		     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
		     (sizeof(struct page *)));
	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
	nrptrs = max(nrptrs, 8);
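	/*
	 * Illustrative numbers for a 64-bit kernel with 4K pages: at most
	 * 4096 / sizeof(struct page *) = 512 page pointers fit in one page,
	 * further capped by the task's dirty-throttling budget
	 * (nr_dirtied_pause - nr_dirtied) and floored at 8.
	 */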
	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	first_index = pos >> PAGE_CACHE_SHIFT;

	while (iov_iter_count(i) > 0) {
		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
		size_t write_bytes = min(iov_iter_count(i),
					 nrptrs * (size_t)PAGE_CACHE_SIZE -
					 offset);
		size_t num_pages = (write_bytes + offset +
				    PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
		size_t reserve_bytes;
		size_t dirty_pages;
		size_t copied;

		WARN_ON(num_pages > nrptrs);

		/*
		 * Fault pages before locking them in prepare_pages
		 * to avoid a recursive lock.
		 */
		if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
			ret = -EFAULT;
			break;
		}

		reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
		ret = btrfs_check_data_free_space(inode, reserve_bytes);
		if (ret == -ENOSPC &&
		    (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
					      BTRFS_INODE_PREALLOC))) {
			ret = check_can_nocow(inode, pos, &write_bytes);
			if (ret > 0) {
				only_release_metadata = true;
				/*
				 * our prealloc extent may be smaller than
				 * write_bytes, so scale down.
				 */
				num_pages = (write_bytes + offset +
					     PAGE_CACHE_SIZE - 1) >>
					     PAGE_CACHE_SHIFT;
				reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
				ret = 0;
			} else {
				ret = -ENOSPC;
			}
		}

		if (ret)
			break;

		ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
		if (ret) {
			if (!only_release_metadata)
				btrfs_free_reserved_data_space(inode,
							       reserve_bytes);
			else
				btrfs_end_nocow_write(root);
			break;
		}

		release_bytes = reserve_bytes;
		need_unlock = false;
again:
		/*
		 * This is going to set up the pages array with the number of
		 * pages we want, so we don't really need to worry about the
		 * contents of pages from loop to loop
		 */
		ret = prepare_pages(inode, pages, num_pages,
				    pos, write_bytes,
				    force_page_uptodate);
		if (ret)
			break;

		ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
						      pos, &lockstart,
						      &lockend, &cached_state);
		if (ret < 0) {
			if (ret == -EAGAIN)
				goto again;
			break;
		} else if (ret > 0) {
			need_unlock = true;
			ret = 0;
		}

		copied = btrfs_copy_from_user(pos, num_pages,
					      write_bytes, pages, i);

		/*
		 * if we have trouble faulting in the pages, fall
		 * back to one page at a time
		 */
		if (copied < write_bytes)
			nrptrs = 1;

		if (copied == 0) {
			force_page_uptodate = true;
			dirty_pages = 0;
		} else {
			force_page_uptodate = false;
			dirty_pages = (copied + offset +
				       PAGE_CACHE_SIZE - 1) >>
				       PAGE_CACHE_SHIFT;
		}

		/*
		 * If we had a short copy we need to release the excess
		 * delalloc bytes we reserved.  We need to increment
		 * outstanding_extents because btrfs_delalloc_release_space
		 * will decrement it, but we still have an outstanding extent
		 * for the chunk we actually managed to copy.
		 */
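		/*
		 * For instance, with 4K pages: if 3 pages were reserved but
		 * the copy only dirtied 1, then dirty_pages = 1 and the
		 * reservation for the remaining (3 - 1) << PAGE_CACHE_SHIFT
		 * = 8192 bytes is released below.
		 */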
		if (num_pages > dirty_pages) {
			release_bytes = (num_pages - dirty_pages) <<
					PAGE_CACHE_SHIFT;
			if (copied > 0) {
				spin_lock(&BTRFS_I(inode)->lock);
				BTRFS_I(inode)->outstanding_extents++;
				spin_unlock(&BTRFS_I(inode)->lock);
			}
			if (only_release_metadata)
				btrfs_delalloc_release_metadata(inode,
								release_bytes);
			else
				btrfs_delalloc_release_space(inode,
							     release_bytes);
		}

		release_bytes = dirty_pages << PAGE_CACHE_SHIFT;

		if (copied > 0)
			ret = btrfs_dirty_pages(root, inode, pages,
						dirty_pages, pos, copied,
						NULL);
		if (need_unlock)
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     lockstart, lockend, &cached_state,
					     GFP_NOFS);
		if (ret) {
			btrfs_drop_pages(pages, num_pages);
			break;
		}

		release_bytes = 0;
		if (only_release_metadata)
			btrfs_end_nocow_write(root);

		if (only_release_metadata && copied > 0) {
			u64 lockstart = round_down(pos, root->sectorsize);
			u64 lockend = lockstart +
				(dirty_pages << PAGE_CACHE_SHIFT) - 1;

			set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
				       lockend, EXTENT_NORESERVE, NULL,
				       NULL, GFP_NOFS);
			only_release_metadata = false;
		}

		btrfs_drop_pages(pages, num_pages);

		cond_resched();

		balance_dirty_pages_ratelimited(inode->i_mapping);
		if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
			btrfs_btree_balance_dirty(root);

		pos += copied;
		num_written += copied;
	}

	kfree(pages);

	if (release_bytes) {
		if (only_release_metadata) {
			btrfs_end_nocow_write(root);
			btrfs_delalloc_release_metadata(inode, release_bytes);
		} else {
			btrfs_delalloc_release_space(inode, release_bytes);
		}
	}

	return num_written ? num_written : ret;
}

static ssize_t __btrfs_direct_write(struct kiocb *iocb,
				    const struct iovec *iov,
				    unsigned long nr_segs, loff_t pos,
				    size_t count, size_t ocount)
{
	struct file *file = iocb->ki_filp;
	struct iov_iter i;
	ssize_t written;
	ssize_t written_buffered;
	loff_t endbyte;
	int err;

	written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
					    count, ocount);

	if (written < 0 || written == count)
		return written;

	pos += written;
	count -= written;
	iov_iter_init(&i, iov, nr_segs, count, written);
	written_buffered = __btrfs_buffered_write(file, &i, pos);
	if (written_buffered < 0) {
		err = written_buffered;
		goto out;
	}
	endbyte = pos + written_buffered - 1;
	err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
	if (err)
		goto out;
	written += written_buffered;
	iocb->ki_pos = pos + written_buffered;
	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
				 endbyte >> PAGE_CACHE_SHIFT);
out:
	return written ? written : err;
}
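
/*
 * Note on the fallback above: when generic_file_direct_write() completes
 * only part of the request (e.g. part of the range could not be written
 * directly), the tail goes through the buffered path and is then written
 * back and invalidated from the page cache, so the file looks as if it
 * had been written with O_DIRECT end to end.
 */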

static void update_time_for_write(struct inode *inode)
{
	struct timespec now;

	if (IS_NOCMTIME(inode))
		return;

	now = current_fs_time(inode->i_sb);
	if (!timespec_equal(&inode->i_mtime, &now))
		inode->i_mtime = now;

	if (!timespec_equal(&inode->i_ctime, &now))
		inode->i_ctime = now;

	if (IS_I_VERSION(inode))
		inode_inc_iversion(inode);
}

static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
				    const struct iovec *iov,
				    unsigned long nr_segs, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 start_pos;
	u64 end_pos;
	ssize_t num_written = 0;
	ssize_t err = 0;
	size_t count, ocount;
	bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);

	mutex_lock(&inode->i_mutex);

	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
	if (err) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}
	count = ocount;

	current->backing_dev_info = inode->i_mapping->backing_dev_info;
	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
	if (err) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	if (count == 0) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	err = file_remove_suid(file);
	if (err) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	/*
	 * If BTRFS flips readonly due to some impossible error
	 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
	 * although we have opened a file as writable, we have
	 * to stop this write operation to ensure FS consistency.
	 */
	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
		mutex_unlock(&inode->i_mutex);
		err = -EROFS;
		goto out;
	}

	/*
	 * We reserve space for updating the inode when we reserve space for
	 * the extent we are going to write, so we will get ENOSPC out there.
	 * We don't need to start yet another transaction to update the inode
	 * as we will update the inode when we finish writing whatever data
	 * we write.
	 */
	update_time_for_write(inode);

	start_pos = round_down(pos, root->sectorsize);
	if (start_pos > i_size_read(inode)) {
		/* Expand hole size to cover write data, preventing empty gap */
		end_pos = round_up(pos + count, root->sectorsize);
		err = btrfs_cont_expand(inode, i_size_read(inode), end_pos);
		if (err) {
			mutex_unlock(&inode->i_mutex);
			goto out;
		}
	}

	if (sync)
		atomic_inc(&BTRFS_I(inode)->sync_writers);

	if (unlikely(file->f_flags & O_DIRECT)) {
		num_written = __btrfs_direct_write(iocb, iov, nr_segs,
						   pos, count, ocount);
	} else {
		struct iov_iter i;

		iov_iter_init(&i, iov, nr_segs, count, num_written);

		num_written = __btrfs_buffered_write(file, &i, pos);
		if (num_written > 0)
			iocb->ki_pos = pos + num_written;
	}

	mutex_unlock(&inode->i_mutex);

	/*
	 * we want to make sure fsync finds this change
	 * but we haven't joined a transaction running right now.
	 *
	 * Later on, someone is sure to update the inode and get the
	 * real transid recorded.
	 *
	 * We set last_trans now to the fs_info generation + 1,
	 * this will either be one more than the running transaction
	 * or the generation used for the next transaction if there isn't
	 * one running right now.
	 *
	 * We also have to set last_sub_trans to the current log transid,
	 * otherwise subsequent syncs to a file that's been synced in this
	 * transaction will appear to have already occurred.
	 */
	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
	BTRFS_I(inode)->last_sub_trans = root->log_transid;
	if (num_written > 0) {
		err = generic_write_sync(file, pos, num_written);
		if (err < 0)
			num_written = err;
	}

	if (sync)
		atomic_dec(&BTRFS_I(inode)->sync_writers);
out:
	current->backing_dev_info = NULL;
	return num_written ? num_written : err;
}

int btrfs_release_file(struct inode *inode, struct file *filp)
{
	/*
	 * ordered_data_close is set by setattr when we are about to truncate
	 * a file from a non-zero size to a zero size.  This tries to
	 * flush down new bytes that may have been written if the
	 * application were using truncate to replace a file in place.
	 */
	if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
			       &BTRFS_I(inode)->runtime_flags)) {
		struct btrfs_trans_handle *trans;
		struct btrfs_root *root = BTRFS_I(inode)->root;

		/*
		 * We need to block on a committing transaction to keep us from
		 * throwing an ordered operation on to the list and causing
		 * something like sync to deadlock trying to flush out this
		 * inode.
		 */
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans))
			return PTR_ERR(trans);
		btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
		btrfs_end_transaction(trans, root);
		if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
			filemap_flush(inode->i_mapping);
	}
	if (filp->private_data)
		btrfs_ioctl_trans_end(filp);
	return 0;
}

/*
 * fsync call for both files and directories.  This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to call filemap_fdatawait so that all ordered extent updates in
 * the metadata btree are up to date for copying to the log.
 *
 * It drops the inode mutex before doing the tree log commit.  This is an
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct dentry *dentry = file->f_path.dentry;
	struct inode *inode = dentry->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	struct btrfs_log_ctx ctx;
	int ret = 0;
	bool full_sync = 0;

	trace_btrfs_sync_file(file, datasync);

	/*
	 * We write the dirty pages in the range and wait until they complete
	 * outside of the ->i_mutex, so the flushing can be done by multiple
	 * tasks and performance improves.  See btrfs_wait_ordered_range for
	 * an explanation of the ASYNC check.
	 */
	atomic_inc(&BTRFS_I(inode)->sync_writers);
	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
			     &BTRFS_I(inode)->runtime_flags))
		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
	atomic_dec(&BTRFS_I(inode)->sync_writers);
	if (ret)
		return ret;

	mutex_lock(&inode->i_mutex);

	/*
	 * We flush the dirty pages again to avoid leaving any dirty pages
	 * in the range behind.
	 */
	atomic_inc(&root->log_batch);
	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			     &BTRFS_I(inode)->runtime_flags);
	if (full_sync) {
		ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
		if (ret) {
			mutex_unlock(&inode->i_mutex);
			goto out;
		}
	}
	atomic_inc(&root->log_batch);

	/*
	 * check the transaction that last modified this inode
	 * and see if it's already been committed
	 */
	if (!BTRFS_I(inode)->last_trans) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	/*
	 * if the last transaction that changed this file was before
	 * the current transaction, we can bail out now without any
	 * syncing
	 */
	smp_mb();
	if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
	    BTRFS_I(inode)->last_trans <=
	    root->fs_info->last_trans_committed) {
		BTRFS_I(inode)->last_trans = 0;

		/*
		 * We've had everything committed since the last time we were
		 * modified so clear this flag in case it was set for whatever
		 * reason, it's no longer relevant.
		 */
		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			  &BTRFS_I(inode)->runtime_flags);
		mutex_unlock(&inode->i_mutex);
		goto out;
	}
1968 1968
1969 /* 1969 /*
1970 * ok, we haven't committed the transaction yet, let's do a commit 1970 * ok, we haven't committed the transaction yet, let's do a commit
1971 */ 1971 */
1972 if (file->private_data) 1972 if (file->private_data)
1973 btrfs_ioctl_trans_end(file); 1973 btrfs_ioctl_trans_end(file);
1974 1974
1975 /* 1975 /*
1976 * We use start here because we will need to wait on the IO to complete 1976 * We use start here because we will need to wait on the IO to complete
1977 * in btrfs_sync_log, which could require joining a transaction (for 1977 * in btrfs_sync_log, which could require joining a transaction (for
1978 * example checking cross references in the nocow path). If we use join 1978 * example checking cross references in the nocow path). If we use join
1979 * here we could get into a situation where we're waiting on IO to 1979 * here we could get into a situation where we're waiting on IO to
1980 * happen that is blocked on a transaction trying to commit. With start 1980 * happen that is blocked on a transaction trying to commit. With start
1981 * we inc the extwriter counter, so we wait for all extwriters to exit 1981 * we inc the extwriter counter, so we wait for all extwriters to exit
1982 * before we start blocking join'ers. This comment is to keep somebody 1982 * before we start blocking join'ers. This comment is to keep somebody
1983 * from thinking they are super smart and changing this to 1983 * from thinking they are super smart and changing this to
1984 * btrfs_join_transaction *cough*Josef*cough*. 1984 * btrfs_join_transaction *cough*Josef*cough*.
1985 */ 1985 */
1986 trans = btrfs_start_transaction(root, 0); 1986 trans = btrfs_start_transaction(root, 0);
1987 if (IS_ERR(trans)) { 1987 if (IS_ERR(trans)) {
1988 ret = PTR_ERR(trans); 1988 ret = PTR_ERR(trans);
1989 mutex_unlock(&inode->i_mutex); 1989 mutex_unlock(&inode->i_mutex);
1990 goto out; 1990 goto out;
1991 } 1991 }
1992 trans->sync = true; 1992 trans->sync = true;
1993 1993
1994 btrfs_init_log_ctx(&ctx); 1994 btrfs_init_log_ctx(&ctx);
1995 1995
1996 ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx); 1996 ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
1997 if (ret < 0) { 1997 if (ret < 0) {
1998 /* Fallthrough and commit/free transaction. */ 1998 /* Fallthrough and commit/free transaction. */
1999 ret = 1; 1999 ret = 1;
2000 } 2000 }
2001 2001
2002 /* we've logged all the items and now have a consistent 2002 /* we've logged all the items and now have a consistent
2003 * version of the file in the log. It is possible that 2003 * version of the file in the log. It is possible that
2004 * someone will come in and modify the file, but that's 2004 * someone will come in and modify the file, but that's
2005 * fine because the log is consistent on disk, and we 2005 * fine because the log is consistent on disk, and we
2006 * have references to all of the file's extents 2006 * have references to all of the file's extents
2007 * 2007 *
2008 * It is possible that someone will come in and log the 2008 * It is possible that someone will come in and log the
2009 * file again, but that will end up using the synchronization 2009 * file again, but that will end up using the synchronization
2010 * inside btrfs_sync_log to keep things safe. 2010 * inside btrfs_sync_log to keep things safe.
2011 */ 2011 */
2012 mutex_unlock(&inode->i_mutex); 2012 mutex_unlock(&inode->i_mutex);
2013 2013
2014 if (ret != BTRFS_NO_LOG_SYNC) { 2014 if (ret != BTRFS_NO_LOG_SYNC) {
2015 if (!ret) { 2015 if (!ret) {
2016 ret = btrfs_sync_log(trans, root, &ctx); 2016 ret = btrfs_sync_log(trans, root, &ctx);
2017 if (!ret) { 2017 if (!ret) {
2018 ret = btrfs_end_transaction(trans, root); 2018 ret = btrfs_end_transaction(trans, root);
2019 goto out; 2019 goto out;
2020 } 2020 }
2021 } 2021 }
2022 if (!full_sync) { 2022 if (!full_sync) {
2023 ret = btrfs_wait_ordered_range(inode, start, 2023 ret = btrfs_wait_ordered_range(inode, start,
2024 end - start + 1); 2024 end - start + 1);
2025 if (ret) 2025 if (ret)
2026 goto out; 2026 goto out;
2027 } 2027 }
2028 ret = btrfs_commit_transaction(trans, root); 2028 ret = btrfs_commit_transaction(trans, root);
2029 } else { 2029 } else {
2030 ret = btrfs_end_transaction(trans, root); 2030 ret = btrfs_end_transaction(trans, root);
2031 } 2031 }
2032 out: 2032 out:
2033 return ret > 0 ? -EIO : ret; 2033 return ret > 0 ? -EIO : ret;
2034 } 2034 }
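
btrfs_sync_file() above is installed as the ->fsync handler in btrfs_file_operations at the end of this file, so the whole path above runs in response to an ordinary fsync(2) or fdatasync(2) call. A minimal userspace sketch of that trigger (the mount point and file name are assumptions, not part of this change):

/* Hypothetical sketch: drives the ->fsync path above on a btrfs mount. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* /mnt/btrfs/testfile is an assumed path on a btrfs filesystem. */
	int fd = open("/mnt/btrfs/testfile", O_CREAT | O_WRONLY, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	const char buf[] = "hello";
	if (write(fd, buf, strlen(buf)) < 0)
		perror("write");
	/* fsync(2) reaches btrfs_sync_file() via f_op->fsync. */
	if (fsync(fd) < 0)
		perror("fsync");
	close(fd);
	return 0;
}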
2035 2035
2036 static const struct vm_operations_struct btrfs_file_vm_ops = { 2036 static const struct vm_operations_struct btrfs_file_vm_ops = {
2037 .fault = filemap_fault, 2037 .fault = filemap_fault,
2038 .map_pages = filemap_map_pages, 2038 .map_pages = filemap_map_pages,
2039 .page_mkwrite = btrfs_page_mkwrite, 2039 .page_mkwrite = btrfs_page_mkwrite,
2040 .remap_pages = generic_file_remap_pages, 2040 .remap_pages = generic_file_remap_pages,
2041 }; 2041 };
2042 2042
2043 static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) 2043 static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
2044 { 2044 {
2045 struct address_space *mapping = filp->f_mapping; 2045 struct address_space *mapping = filp->f_mapping;
2046 2046
2047 if (!mapping->a_ops->readpage) 2047 if (!mapping->a_ops->readpage)
2048 return -ENOEXEC; 2048 return -ENOEXEC;
2049 2049
2050 file_accessed(filp); 2050 file_accessed(filp);
2051 vma->vm_ops = &btrfs_file_vm_ops; 2051 vma->vm_ops = &btrfs_file_vm_ops;
2052 2052
2053 return 0; 2053 return 0;
2054 } 2054 }
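
btrfs_file_mmap() installs btrfs_file_vm_ops, so the first store to a writable shared mapping faults into btrfs_page_mkwrite(). A minimal userspace sketch (the path is an assumption; the file must already be at least one page long, otherwise the access raises SIGBUS):

/* Hypothetical sketch: a write through a MAP_SHARED mapping faults
 * into btrfs_page_mkwrite() through the vm_ops installed above. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/btrfs/mapped", O_RDWR);	/* assumed path */
	if (fd < 0) {
		perror("open");
		return 1;
	}
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return 1;
	}
	p[0] = 'x';	/* first store triggers ->page_mkwrite */
	munmap(p, 4096);
	close(fd);
	return 0;
}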
2055 2055
2056 static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf, 2056 static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
2057 int slot, u64 start, u64 end) 2057 int slot, u64 start, u64 end)
2058 { 2058 {
2059 struct btrfs_file_extent_item *fi; 2059 struct btrfs_file_extent_item *fi;
2060 struct btrfs_key key; 2060 struct btrfs_key key;
2061 2061
2062 if (slot < 0 || slot >= btrfs_header_nritems(leaf)) 2062 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
2063 return 0; 2063 return 0;
2064 2064
2065 btrfs_item_key_to_cpu(leaf, &key, slot); 2065 btrfs_item_key_to_cpu(leaf, &key, slot);
2066 if (key.objectid != btrfs_ino(inode) || 2066 if (key.objectid != btrfs_ino(inode) ||
2067 key.type != BTRFS_EXTENT_DATA_KEY) 2067 key.type != BTRFS_EXTENT_DATA_KEY)
2068 return 0; 2068 return 0;
2069 2069
2070 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 2070 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2071 2071
2072 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) 2072 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2073 return 0; 2073 return 0;
2074 2074
2075 if (btrfs_file_extent_disk_bytenr(leaf, fi)) 2075 if (btrfs_file_extent_disk_bytenr(leaf, fi))
2076 return 0; 2076 return 0;
2077 2077
2078 if (key.offset == end) 2078 if (key.offset == end)
2079 return 1; 2079 return 1;
2080 if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start) 2080 if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
2081 return 1; 2081 return 1;
2082 return 0; 2082 return 0;
2083 } 2083 }
2084 2084
2085 static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, 2085 static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
2086 struct btrfs_path *path, u64 offset, u64 end) 2086 struct btrfs_path *path, u64 offset, u64 end)
2087 { 2087 {
2088 struct btrfs_root *root = BTRFS_I(inode)->root; 2088 struct btrfs_root *root = BTRFS_I(inode)->root;
2089 struct extent_buffer *leaf; 2089 struct extent_buffer *leaf;
2090 struct btrfs_file_extent_item *fi; 2090 struct btrfs_file_extent_item *fi;
2091 struct extent_map *hole_em; 2091 struct extent_map *hole_em;
2092 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 2092 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2093 struct btrfs_key key; 2093 struct btrfs_key key;
2094 int ret; 2094 int ret;
2095 2095
2096 if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) 2096 if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
2097 goto out; 2097 goto out;
2098 2098
2099 key.objectid = btrfs_ino(inode); 2099 key.objectid = btrfs_ino(inode);
2100 key.type = BTRFS_EXTENT_DATA_KEY; 2100 key.type = BTRFS_EXTENT_DATA_KEY;
2101 key.offset = offset; 2101 key.offset = offset;
2102 2102
2103 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2103 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2104 if (ret < 0) 2104 if (ret < 0)
2105 return ret; 2105 return ret;
2106 BUG_ON(!ret); 2106 BUG_ON(!ret);
2107 2107
2108 leaf = path->nodes[0]; 2108 leaf = path->nodes[0];
2109 if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) { 2109 if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
2110 u64 num_bytes; 2110 u64 num_bytes;
2111 2111
2112 path->slots[0]--; 2112 path->slots[0]--;
2113 fi = btrfs_item_ptr(leaf, path->slots[0], 2113 fi = btrfs_item_ptr(leaf, path->slots[0],
2114 struct btrfs_file_extent_item); 2114 struct btrfs_file_extent_item);
2115 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + 2115 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2116 end - offset; 2116 end - offset;
2117 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 2117 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2118 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); 2118 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2119 btrfs_set_file_extent_offset(leaf, fi, 0); 2119 btrfs_set_file_extent_offset(leaf, fi, 0);
2120 btrfs_mark_buffer_dirty(leaf); 2120 btrfs_mark_buffer_dirty(leaf);
2121 goto out; 2121 goto out;
2122 } 2122 }
2123 2123
2124 if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) { 2124 if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
2125 u64 num_bytes; 2125 u64 num_bytes;
2126 2126
2127 path->slots[0]++; 2127 path->slots[0]++;
2128 key.offset = offset; 2128 key.offset = offset;
2129 btrfs_set_item_key_safe(root, path, &key); 2129 btrfs_set_item_key_safe(root, path, &key);
2130 fi = btrfs_item_ptr(leaf, path->slots[0], 2130 fi = btrfs_item_ptr(leaf, path->slots[0],
2131 struct btrfs_file_extent_item); 2131 struct btrfs_file_extent_item);
2132 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - 2132 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2133 offset; 2133 offset;
2134 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 2134 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2135 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); 2135 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2136 btrfs_set_file_extent_offset(leaf, fi, 0); 2136 btrfs_set_file_extent_offset(leaf, fi, 0);
2137 btrfs_mark_buffer_dirty(leaf); 2137 btrfs_mark_buffer_dirty(leaf);
2138 goto out; 2138 goto out;
2139 } 2139 }
2140 btrfs_release_path(path); 2140 btrfs_release_path(path);
2141 2141
2142 ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, 2142 ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
2143 0, 0, end - offset, 0, end - offset, 2143 0, 0, end - offset, 0, end - offset,
2144 0, 0, 0); 2144 0, 0, 0);
2145 if (ret) 2145 if (ret)
2146 return ret; 2146 return ret;
2147 2147
2148 out: 2148 out:
2149 btrfs_release_path(path); 2149 btrfs_release_path(path);
2150 2150
2151 hole_em = alloc_extent_map(); 2151 hole_em = alloc_extent_map();
2152 if (!hole_em) { 2152 if (!hole_em) {
2153 btrfs_drop_extent_cache(inode, offset, end - 1, 0); 2153 btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2154 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 2154 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2155 &BTRFS_I(inode)->runtime_flags); 2155 &BTRFS_I(inode)->runtime_flags);
2156 } else { 2156 } else {
2157 hole_em->start = offset; 2157 hole_em->start = offset;
2158 hole_em->len = end - offset; 2158 hole_em->len = end - offset;
2159 hole_em->ram_bytes = hole_em->len; 2159 hole_em->ram_bytes = hole_em->len;
2160 hole_em->orig_start = offset; 2160 hole_em->orig_start = offset;
2161 2161
2162 hole_em->block_start = EXTENT_MAP_HOLE; 2162 hole_em->block_start = EXTENT_MAP_HOLE;
2163 hole_em->block_len = 0; 2163 hole_em->block_len = 0;
2164 hole_em->orig_block_len = 0; 2164 hole_em->orig_block_len = 0;
2165 hole_em->bdev = root->fs_info->fs_devices->latest_bdev; 2165 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
2166 hole_em->compress_type = BTRFS_COMPRESS_NONE; 2166 hole_em->compress_type = BTRFS_COMPRESS_NONE;
2167 hole_em->generation = trans->transid; 2167 hole_em->generation = trans->transid;
2168 2168
2169 do { 2169 do {
2170 btrfs_drop_extent_cache(inode, offset, end - 1, 0); 2170 btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2171 write_lock(&em_tree->lock); 2171 write_lock(&em_tree->lock);
2172 ret = add_extent_mapping(em_tree, hole_em, 1); 2172 ret = add_extent_mapping(em_tree, hole_em, 1);
2173 write_unlock(&em_tree->lock); 2173 write_unlock(&em_tree->lock);
2174 } while (ret == -EEXIST); 2174 } while (ret == -EEXIST);
2175 free_extent_map(hole_em); 2175 free_extent_map(hole_em);
2176 if (ret) 2176 if (ret)
2177 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 2177 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2178 &BTRFS_I(inode)->runtime_flags); 2178 &BTRFS_I(inode)->runtime_flags);
2179 } 2179 }
2180 2180
2181 return 0; 2181 return 0;
2182 } 2182 }
2183 2183
2184 static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) 2184 static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2185 { 2185 {
2186 struct btrfs_root *root = BTRFS_I(inode)->root; 2186 struct btrfs_root *root = BTRFS_I(inode)->root;
2187 struct extent_state *cached_state = NULL; 2187 struct extent_state *cached_state = NULL;
2188 struct btrfs_path *path; 2188 struct btrfs_path *path;
2189 struct btrfs_block_rsv *rsv; 2189 struct btrfs_block_rsv *rsv;
2190 struct btrfs_trans_handle *trans; 2190 struct btrfs_trans_handle *trans;
2191 u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); 2191 u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
2192 u64 lockend = round_down(offset + len, 2192 u64 lockend = round_down(offset + len,
2193 BTRFS_I(inode)->root->sectorsize) - 1; 2193 BTRFS_I(inode)->root->sectorsize) - 1;
2194 u64 cur_offset = lockstart; 2194 u64 cur_offset = lockstart;
2195 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 2195 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
2196 u64 drop_end; 2196 u64 drop_end;
2197 int ret = 0; 2197 int ret = 0;
2198 int err = 0; 2198 int err = 0;
2199 int rsv_count; 2199 int rsv_count;
2200 bool same_page = ((offset >> PAGE_CACHE_SHIFT) == 2200 bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
2201 ((offset + len - 1) >> PAGE_CACHE_SHIFT)); 2201 ((offset + len - 1) >> PAGE_CACHE_SHIFT));
2202 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); 2202 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
2203 u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); 2203 u64 ino_size;
2204 2204
2205 ret = btrfs_wait_ordered_range(inode, offset, len); 2205 ret = btrfs_wait_ordered_range(inode, offset, len);
2206 if (ret) 2206 if (ret)
2207 return ret; 2207 return ret;
2208 2208
2209 mutex_lock(&inode->i_mutex); 2209 mutex_lock(&inode->i_mutex);
2210 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
2210 /* 2211 /*
2211 * We needn't truncate any page which is beyond the end of the file 2212 * We needn't truncate any page which is beyond the end of the file
2212 * because we are sure there is no data there. 2213 * because we are sure there is no data there.
2213 * 2214 *
2214 * Only do this if we are in the same page and we aren't doing the 2215 * Only do this if we are in the same page and we aren't doing the
2215 * entire page. 2216 * entire page.
2216 */ 2217 */
2218 if (same_page && len < PAGE_CACHE_SIZE) { 2219 if (same_page && len < PAGE_CACHE_SIZE) {
2219 if (offset < ino_size) 2220 if (offset < ino_size)
2220 ret = btrfs_truncate_page(inode, offset, len, 0); 2221 ret = btrfs_truncate_page(inode, offset, len, 0);
2221 mutex_unlock(&inode->i_mutex); 2222 mutex_unlock(&inode->i_mutex);
2222 return ret; 2223 return ret;
2223 } 2224 }
2224 2225
2225 /* zero back part of the first page */ 2226 /* zero back part of the first page */
2226 if (offset < ino_size) { 2227 if (offset < ino_size) {
2227 ret = btrfs_truncate_page(inode, offset, 0, 0); 2228 ret = btrfs_truncate_page(inode, offset, 0, 0);
2228 if (ret) { 2229 if (ret) {
2229 mutex_unlock(&inode->i_mutex); 2230 mutex_unlock(&inode->i_mutex);
2230 return ret; 2231 return ret;
2231 } 2232 }
2232 } 2233 }
2233 2234
2234 /* zero the front end of the last page */ 2235 /* zero the front end of the last page */
2235 if (offset + len < ino_size) { 2236 if (offset + len < ino_size) {
2236 ret = btrfs_truncate_page(inode, offset + len, 0, 1); 2237 ret = btrfs_truncate_page(inode, offset + len, 0, 1);
2237 if (ret) { 2238 if (ret) {
2238 mutex_unlock(&inode->i_mutex); 2239 mutex_unlock(&inode->i_mutex);
2239 return ret; 2240 return ret;
2240 } 2241 }
2241 } 2242 }
2242 2243
2243 if (lockend < lockstart) { 2244 if (lockend < lockstart) {
2244 mutex_unlock(&inode->i_mutex); 2245 mutex_unlock(&inode->i_mutex);
2245 return 0; 2246 return 0;
2246 } 2247 }
2247 2248
2248 while (1) { 2249 while (1) {
2249 struct btrfs_ordered_extent *ordered; 2250 struct btrfs_ordered_extent *ordered;
2250 2251
2251 truncate_pagecache_range(inode, lockstart, lockend); 2252 truncate_pagecache_range(inode, lockstart, lockend);
2252 2253
2253 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2254 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2254 0, &cached_state); 2255 0, &cached_state);
2255 ordered = btrfs_lookup_first_ordered_extent(inode, lockend); 2256 ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
2256 2257
2257 /* 2258 /*
2258 * We need to make sure we have no ordered extents in this range 2259 * We need to make sure we have no ordered extents in this range
2259 * and that nobody raced in and read a page in this range; if 2260 * and that nobody raced in and read a page in this range; if
2260 * either happened, we need to try again. 2261 * either happened, we need to try again.
2261 */ 2262 */
2262 if ((!ordered || 2263 if ((!ordered ||
2263 (ordered->file_offset + ordered->len <= lockstart || 2264 (ordered->file_offset + ordered->len <= lockstart ||
2264 ordered->file_offset > lockend)) && 2265 ordered->file_offset > lockend)) &&
2265 !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart, 2266 !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
2266 lockend, EXTENT_UPTODATE, 0, 2267 lockend, EXTENT_UPTODATE, 0,
2267 cached_state)) { 2268 cached_state)) {
2268 if (ordered) 2269 if (ordered)
2269 btrfs_put_ordered_extent(ordered); 2270 btrfs_put_ordered_extent(ordered);
2270 break; 2271 break;
2271 } 2272 }
2272 if (ordered) 2273 if (ordered)
2273 btrfs_put_ordered_extent(ordered); 2274 btrfs_put_ordered_extent(ordered);
2274 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, 2275 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
2275 lockend, &cached_state, GFP_NOFS); 2276 lockend, &cached_state, GFP_NOFS);
2276 ret = btrfs_wait_ordered_range(inode, lockstart, 2277 ret = btrfs_wait_ordered_range(inode, lockstart,
2277 lockend - lockstart + 1); 2278 lockend - lockstart + 1);
2278 if (ret) { 2279 if (ret) {
2279 mutex_unlock(&inode->i_mutex); 2280 mutex_unlock(&inode->i_mutex);
2280 return ret; 2281 return ret;
2281 } 2282 }
2282 } 2283 }
2283 2284
2284 path = btrfs_alloc_path(); 2285 path = btrfs_alloc_path();
2285 if (!path) { 2286 if (!path) {
2286 ret = -ENOMEM; 2287 ret = -ENOMEM;
2287 goto out; 2288 goto out;
2288 } 2289 }
2289 2290
2290 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); 2291 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
2291 if (!rsv) { 2292 if (!rsv) {
2292 ret = -ENOMEM; 2293 ret = -ENOMEM;
2293 goto out_free; 2294 goto out_free;
2294 } 2295 }
2295 rsv->size = btrfs_calc_trunc_metadata_size(root, 1); 2296 rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
2296 rsv->failfast = 1; 2297 rsv->failfast = 1;
2297 2298
2298 /* 2299 /*
2299 * 1 - update the inode 2300 * 1 - update the inode
2300 * 1 - removing the extents in the range 2301 * 1 - removing the extents in the range
2301 * 1 - adding the hole extent if no_holes isn't set 2302 * 1 - adding the hole extent if no_holes isn't set
2302 */ 2303 */
2303 rsv_count = no_holes ? 2 : 3; 2304 rsv_count = no_holes ? 2 : 3;
2304 trans = btrfs_start_transaction(root, rsv_count); 2305 trans = btrfs_start_transaction(root, rsv_count);
2305 if (IS_ERR(trans)) { 2306 if (IS_ERR(trans)) {
2306 err = PTR_ERR(trans); 2307 err = PTR_ERR(trans);
2307 goto out_free; 2308 goto out_free;
2308 } 2309 }
2309 2310
2310 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, 2311 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
2311 min_size); 2312 min_size);
2312 BUG_ON(ret); 2313 BUG_ON(ret);
2313 trans->block_rsv = rsv; 2314 trans->block_rsv = rsv;
2314 2315
2315 while (cur_offset < lockend) { 2316 while (cur_offset < lockend) {
2316 ret = __btrfs_drop_extents(trans, root, inode, path, 2317 ret = __btrfs_drop_extents(trans, root, inode, path,
2317 cur_offset, lockend + 1, 2318 cur_offset, lockend + 1,
2318 &drop_end, 1, 0, 0, NULL); 2319 &drop_end, 1, 0, 0, NULL);
2319 if (ret != -ENOSPC) 2320 if (ret != -ENOSPC)
2320 break; 2321 break;
2321 2322
2322 trans->block_rsv = &root->fs_info->trans_block_rsv; 2323 trans->block_rsv = &root->fs_info->trans_block_rsv;
2323 2324
2324 if (cur_offset < ino_size) { 2325 if (cur_offset < ino_size) {
2325 ret = fill_holes(trans, inode, path, cur_offset, 2326 ret = fill_holes(trans, inode, path, cur_offset,
2326 drop_end); 2327 drop_end);
2327 if (ret) { 2328 if (ret) {
2328 err = ret; 2329 err = ret;
2329 break; 2330 break;
2330 } 2331 }
2331 } 2332 }
2332 2333
2333 cur_offset = drop_end; 2334 cur_offset = drop_end;
2334 2335
2335 ret = btrfs_update_inode(trans, root, inode); 2336 ret = btrfs_update_inode(trans, root, inode);
2336 if (ret) { 2337 if (ret) {
2337 err = ret; 2338 err = ret;
2338 break; 2339 break;
2339 } 2340 }
2340 2341
2341 btrfs_end_transaction(trans, root); 2342 btrfs_end_transaction(trans, root);
2342 btrfs_btree_balance_dirty(root); 2343 btrfs_btree_balance_dirty(root);
2343 2344
2344 trans = btrfs_start_transaction(root, rsv_count); 2345 trans = btrfs_start_transaction(root, rsv_count);
2345 if (IS_ERR(trans)) { 2346 if (IS_ERR(trans)) {
2346 ret = PTR_ERR(trans); 2347 ret = PTR_ERR(trans);
2347 trans = NULL; 2348 trans = NULL;
2348 break; 2349 break;
2349 } 2350 }
2350 2351
2351 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, 2352 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
2352 rsv, min_size); 2353 rsv, min_size);
2353 BUG_ON(ret); /* shouldn't happen */ 2354 BUG_ON(ret); /* shouldn't happen */
2354 trans->block_rsv = rsv; 2355 trans->block_rsv = rsv;
2355 } 2356 }
2356 2357
2357 if (ret) { 2358 if (ret) {
2358 err = ret; 2359 err = ret;
2359 goto out_trans; 2360 goto out_trans;
2360 } 2361 }
2361 2362
2362 trans->block_rsv = &root->fs_info->trans_block_rsv; 2363 trans->block_rsv = &root->fs_info->trans_block_rsv;
2363 /* 2364 /*
2364 * Don't insert a file hole extent item if it's for a range beyond eof 2365 * Don't insert a file hole extent item if it's for a range beyond eof
2365 * (because it's useless) or if it represents a zero-length range (when 2366 * (because it's useless) or if it represents a zero-length range (when
2366 * cur_offset == drop_end). 2367 * cur_offset == drop_end).
2367 */ 2368 */
2368 if (cur_offset < ino_size && cur_offset < drop_end) { 2369 if (cur_offset < ino_size && cur_offset < drop_end) {
2369 ret = fill_holes(trans, inode, path, cur_offset, drop_end); 2370 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
2370 if (ret) { 2371 if (ret) {
2371 err = ret; 2372 err = ret;
2372 goto out_trans; 2373 goto out_trans;
2373 } 2374 }
2374 } 2375 }
2375 2376
2376 out_trans: 2377 out_trans:
2377 if (!trans) 2378 if (!trans)
2378 goto out_free; 2379 goto out_free;
2379 2380
2380 inode_inc_iversion(inode); 2381 inode_inc_iversion(inode);
2381 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2382 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2382 2383
2383 trans->block_rsv = &root->fs_info->trans_block_rsv; 2384 trans->block_rsv = &root->fs_info->trans_block_rsv;
2384 ret = btrfs_update_inode(trans, root, inode); 2385 ret = btrfs_update_inode(trans, root, inode);
2385 btrfs_end_transaction(trans, root); 2386 btrfs_end_transaction(trans, root);
2386 btrfs_btree_balance_dirty(root); 2387 btrfs_btree_balance_dirty(root);
2387 out_free: 2388 out_free:
2388 btrfs_free_path(path); 2389 btrfs_free_path(path);
2389 btrfs_free_block_rsv(root, rsv); 2390 btrfs_free_block_rsv(root, rsv);
2390 out: 2391 out:
2391 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2392 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2392 &cached_state, GFP_NOFS); 2393 &cached_state, GFP_NOFS);
2393 mutex_unlock(&inode->i_mutex); 2394 mutex_unlock(&inode->i_mutex);
2394 if (ret && !err) 2395 if (ret && !err)
2395 err = ret; 2396 err = ret;
2396 return err; 2397 return err;
2397 } 2398 }
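
btrfs_punch_hole() is only reachable from btrfs_fallocate() below when userspace passes FALLOC_FL_PUNCH_HOLE, and the VFS additionally requires FALLOC_FL_KEEP_SIZE alongside it. A minimal userspace sketch (the path and offsets are illustrative assumptions):

/* Hypothetical sketch: punch a hole in an existing file. The kernel
 * requires FALLOC_FL_KEEP_SIZE together with FALLOC_FL_PUNCH_HOLE. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/btrfs/bigfile", O_RDWR);	/* assumed path */
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Drop 1 MiB of data starting at offset 4096; i_size is unchanged. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      4096, 1024 * 1024) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}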
2398 2399
2399 static long btrfs_fallocate(struct file *file, int mode, 2400 static long btrfs_fallocate(struct file *file, int mode,
2400 loff_t offset, loff_t len) 2401 loff_t offset, loff_t len)
2401 { 2402 {
2402 struct inode *inode = file_inode(file); 2403 struct inode *inode = file_inode(file);
2403 struct extent_state *cached_state = NULL; 2404 struct extent_state *cached_state = NULL;
2404 struct btrfs_root *root = BTRFS_I(inode)->root; 2405 struct btrfs_root *root = BTRFS_I(inode)->root;
2405 u64 cur_offset; 2406 u64 cur_offset;
2406 u64 last_byte; 2407 u64 last_byte;
2407 u64 alloc_start; 2408 u64 alloc_start;
2408 u64 alloc_end; 2409 u64 alloc_end;
2409 u64 alloc_hint = 0; 2410 u64 alloc_hint = 0;
2410 u64 locked_end; 2411 u64 locked_end;
2411 struct extent_map *em; 2412 struct extent_map *em;
2412 int blocksize = BTRFS_I(inode)->root->sectorsize; 2413 int blocksize = BTRFS_I(inode)->root->sectorsize;
2413 int ret; 2414 int ret;
2414 2415
2415 alloc_start = round_down(offset, blocksize); 2416 alloc_start = round_down(offset, blocksize);
2416 alloc_end = round_up(offset + len, blocksize); 2417 alloc_end = round_up(offset + len, blocksize);
2417 2418
2418 /* Make sure we aren't being given some crap mode */ 2419 /* Make sure we aren't being given some crap mode */
2419 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 2420 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2420 return -EOPNOTSUPP; 2421 return -EOPNOTSUPP;
2421 2422
2422 if (mode & FALLOC_FL_PUNCH_HOLE) 2423 if (mode & FALLOC_FL_PUNCH_HOLE)
2423 return btrfs_punch_hole(inode, offset, len); 2424 return btrfs_punch_hole(inode, offset, len);
2424 2425
2425 /* 2426 /*
2426 * Make sure we have enough space before we do the 2427 * Make sure we have enough space before we do the
2427 * allocation. 2428 * allocation.
2428 */ 2429 */
2429 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); 2430 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
2430 if (ret) 2431 if (ret)
2431 return ret; 2432 return ret;
2432 if (root->fs_info->quota_enabled) { 2433 if (root->fs_info->quota_enabled) {
2433 ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start); 2434 ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start);
2434 if (ret) 2435 if (ret)
2435 goto out_reserve_fail; 2436 goto out_reserve_fail;
2436 } 2437 }
2437 2438
2438 mutex_lock(&inode->i_mutex); 2439 mutex_lock(&inode->i_mutex);
2439 ret = inode_newsize_ok(inode, alloc_end); 2440 ret = inode_newsize_ok(inode, alloc_end);
2440 if (ret) 2441 if (ret)
2441 goto out; 2442 goto out;
2442 2443
2443 if (alloc_start > inode->i_size) { 2444 if (alloc_start > inode->i_size) {
2444 ret = btrfs_cont_expand(inode, i_size_read(inode), 2445 ret = btrfs_cont_expand(inode, i_size_read(inode),
2445 alloc_start); 2446 alloc_start);
2446 if (ret) 2447 if (ret)
2447 goto out; 2448 goto out;
2448 } else { 2449 } else {
2449 /* 2450 /*
2450 * If we are fallocating from the end of the file onward we 2451 * If we are fallocating from the end of the file onward we
2451 * need to zero out the end of the page if i_size lands in the 2452 * need to zero out the end of the page if i_size lands in the
2452 * middle of a page. 2453 * middle of a page.
2453 */ 2454 */
2454 ret = btrfs_truncate_page(inode, inode->i_size, 0, 0); 2455 ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
2455 if (ret) 2456 if (ret)
2456 goto out; 2457 goto out;
2457 } 2458 }
2458 2459
2459 /* 2460 /*
2460 * Wait for ordered IO before we take any locks. We'll loop again 2461 * Wait for ordered IO before we take any locks. We'll loop again
2461 * below with the locks held. 2462 * below with the locks held.
2462 */ 2463 */
2463 ret = btrfs_wait_ordered_range(inode, alloc_start, 2464 ret = btrfs_wait_ordered_range(inode, alloc_start,
2464 alloc_end - alloc_start); 2465 alloc_end - alloc_start);
2465 if (ret) 2466 if (ret)
2466 goto out; 2467 goto out;
2467 2468
2468 locked_end = alloc_end - 1; 2469 locked_end = alloc_end - 1;
2469 while (1) { 2470 while (1) {
2470 struct btrfs_ordered_extent *ordered; 2471 struct btrfs_ordered_extent *ordered;
2471 2472
2472 /* the extent lock is ordered inside the running 2473 /* the extent lock is ordered inside the running
2473 * transaction 2474 * transaction
2474 */ 2475 */
2475 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, 2476 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
2476 locked_end, 0, &cached_state); 2477 locked_end, 0, &cached_state);
2477 ordered = btrfs_lookup_first_ordered_extent(inode, 2478 ordered = btrfs_lookup_first_ordered_extent(inode,
2478 alloc_end - 1); 2479 alloc_end - 1);
2479 if (ordered && 2480 if (ordered &&
2480 ordered->file_offset + ordered->len > alloc_start && 2481 ordered->file_offset + ordered->len > alloc_start &&
2481 ordered->file_offset < alloc_end) { 2482 ordered->file_offset < alloc_end) {
2482 btrfs_put_ordered_extent(ordered); 2483 btrfs_put_ordered_extent(ordered);
2483 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 2484 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
2484 alloc_start, locked_end, 2485 alloc_start, locked_end,
2485 &cached_state, GFP_NOFS); 2486 &cached_state, GFP_NOFS);
2486 /* 2487 /*
2487 * we can't wait on the range with the transaction 2488 * we can't wait on the range with the transaction
2488 * running or with the extent lock held 2489 * running or with the extent lock held
2489 */ 2490 */
2490 ret = btrfs_wait_ordered_range(inode, alloc_start, 2491 ret = btrfs_wait_ordered_range(inode, alloc_start,
2491 alloc_end - alloc_start); 2492 alloc_end - alloc_start);
2492 if (ret) 2493 if (ret)
2493 goto out; 2494 goto out;
2494 } else { 2495 } else {
2495 if (ordered) 2496 if (ordered)
2496 btrfs_put_ordered_extent(ordered); 2497 btrfs_put_ordered_extent(ordered);
2497 break; 2498 break;
2498 } 2499 }
2499 } 2500 }
2500 2501
2501 cur_offset = alloc_start; 2502 cur_offset = alloc_start;
2502 while (1) { 2503 while (1) {
2503 u64 actual_end; 2504 u64 actual_end;
2504 2505
2505 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 2506 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
2506 alloc_end - cur_offset, 0); 2507 alloc_end - cur_offset, 0);
2507 if (IS_ERR_OR_NULL(em)) { 2508 if (IS_ERR_OR_NULL(em)) {
2508 if (!em) 2509 if (!em)
2509 ret = -ENOMEM; 2510 ret = -ENOMEM;
2510 else 2511 else
2511 ret = PTR_ERR(em); 2512 ret = PTR_ERR(em);
2512 break; 2513 break;
2513 } 2514 }
2514 last_byte = min(extent_map_end(em), alloc_end); 2515 last_byte = min(extent_map_end(em), alloc_end);
2515 actual_end = min_t(u64, extent_map_end(em), offset + len); 2516 actual_end = min_t(u64, extent_map_end(em), offset + len);
2516 last_byte = ALIGN(last_byte, blocksize); 2517 last_byte = ALIGN(last_byte, blocksize);
2517 2518
2518 if (em->block_start == EXTENT_MAP_HOLE || 2519 if (em->block_start == EXTENT_MAP_HOLE ||
2519 (cur_offset >= inode->i_size && 2520 (cur_offset >= inode->i_size &&
2520 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 2521 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
2521 ret = btrfs_prealloc_file_range(inode, mode, cur_offset, 2522 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
2522 last_byte - cur_offset, 2523 last_byte - cur_offset,
2523 1 << inode->i_blkbits, 2524 1 << inode->i_blkbits,
2524 offset + len, 2525 offset + len,
2525 &alloc_hint); 2526 &alloc_hint);
2526 2527
2527 if (ret < 0) { 2528 if (ret < 0) {
2528 free_extent_map(em); 2529 free_extent_map(em);
2529 break; 2530 break;
2530 } 2531 }
2531 } else if (actual_end > inode->i_size && 2532 } else if (actual_end > inode->i_size &&
2532 !(mode & FALLOC_FL_KEEP_SIZE)) { 2533 !(mode & FALLOC_FL_KEEP_SIZE)) {
2533 /* 2534 /*
2534 * We didn't need to allocate any more space, but we 2535 * We didn't need to allocate any more space, but we
2535 * still extended the size of the file so we need to 2536 * still extended the size of the file so we need to
2536 * update i_size. 2537 * update i_size.
2537 */ 2538 */
2538 inode->i_ctime = CURRENT_TIME; 2539 inode->i_ctime = CURRENT_TIME;
2539 i_size_write(inode, actual_end); 2540 i_size_write(inode, actual_end);
2540 btrfs_ordered_update_i_size(inode, actual_end, NULL); 2541 btrfs_ordered_update_i_size(inode, actual_end, NULL);
2541 } 2542 }
2542 free_extent_map(em); 2543 free_extent_map(em);
2543 2544
2544 cur_offset = last_byte; 2545 cur_offset = last_byte;
2545 if (cur_offset >= alloc_end) { 2546 if (cur_offset >= alloc_end) {
2546 ret = 0; 2547 ret = 0;
2547 break; 2548 break;
2548 } 2549 }
2549 } 2550 }
2550 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 2551 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
2551 &cached_state, GFP_NOFS); 2552 &cached_state, GFP_NOFS);
2552 out: 2553 out:
2553 mutex_unlock(&inode->i_mutex); 2554 mutex_unlock(&inode->i_mutex);
2554 if (root->fs_info->quota_enabled) 2555 if (root->fs_info->quota_enabled)
2555 btrfs_qgroup_free(root, alloc_end - alloc_start); 2556 btrfs_qgroup_free(root, alloc_end - alloc_start);
2556 out_reserve_fail: 2557 out_reserve_fail:
2557 /* Let go of our reservation. */ 2558 /* Let go of our reservation. */
2558 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); 2559 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
2559 return ret; 2560 return ret;
2560 } 2561 }
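
Without FALLOC_FL_PUNCH_HOLE, the loop above preallocates extents for the requested range; with mode 0 the actual_end branch also extends i_size when the range goes past the end of the file. A minimal userspace sketch of both cases (the path is an assumption):

/* Hypothetical sketch: preallocate 16 MiB, first without changing
 * i_size, then with mode 0 so the file size is extended too. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/btrfs/prealloc", O_CREAT | O_RDWR, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Reserve space only: extents marked PREALLOC, i_size untouched. */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 * 1024 * 1024) < 0)
		perror("fallocate keep-size");
	/* Reserve space and extend i_size to 16 MiB. */
	if (fallocate(fd, 0, 0, 16 * 1024 * 1024) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}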
2561 2562
2562 static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) 2563 static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
2563 { 2564 {
2564 struct btrfs_root *root = BTRFS_I(inode)->root; 2565 struct btrfs_root *root = BTRFS_I(inode)->root;
2565 struct extent_map *em = NULL; 2566 struct extent_map *em = NULL;
2566 struct extent_state *cached_state = NULL; 2567 struct extent_state *cached_state = NULL;
2567 u64 lockstart = *offset; 2568 u64 lockstart = *offset;
2568 u64 lockend = i_size_read(inode); 2569 u64 lockend = i_size_read(inode);
2569 u64 start = *offset; 2570 u64 start = *offset;
2570 u64 len = i_size_read(inode); 2571 u64 len = i_size_read(inode);
2571 int ret = 0; 2572 int ret = 0;
2572 2573
2573 lockend = max_t(u64, root->sectorsize, lockend); 2574 lockend = max_t(u64, root->sectorsize, lockend);
2574 if (lockend <= lockstart) 2575 if (lockend <= lockstart)
2575 lockend = lockstart + root->sectorsize; 2576 lockend = lockstart + root->sectorsize;
2576 2577
2577 lockend--; 2578 lockend--;
2578 len = lockend - lockstart + 1; 2579 len = lockend - lockstart + 1;
2579 2580
2580 len = max_t(u64, len, root->sectorsize); 2581 len = max_t(u64, len, root->sectorsize);
2581 if (inode->i_size == 0) 2582 if (inode->i_size == 0)
2582 return -ENXIO; 2583 return -ENXIO;
2583 2584
2584 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, 2585 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
2585 &cached_state); 2586 &cached_state);
2586 2587
2587 while (start < inode->i_size) { 2588 while (start < inode->i_size) {
2588 em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0); 2589 em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
2589 if (IS_ERR(em)) { 2590 if (IS_ERR(em)) {
2590 ret = PTR_ERR(em); 2591 ret = PTR_ERR(em);
2591 em = NULL; 2592 em = NULL;
2592 break; 2593 break;
2593 } 2594 }
2594 2595
2595 if (whence == SEEK_HOLE && 2596 if (whence == SEEK_HOLE &&
2596 (em->block_start == EXTENT_MAP_HOLE || 2597 (em->block_start == EXTENT_MAP_HOLE ||
2597 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) 2598 test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
2598 break; 2599 break;
2599 else if (whence == SEEK_DATA && 2600 else if (whence == SEEK_DATA &&
2600 (em->block_start != EXTENT_MAP_HOLE && 2601 (em->block_start != EXTENT_MAP_HOLE &&
2601 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) 2602 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
2602 break; 2603 break;
2603 2604
2604 start = em->start + em->len; 2605 start = em->start + em->len;
2605 free_extent_map(em); 2606 free_extent_map(em);
2606 em = NULL; 2607 em = NULL;
2607 cond_resched(); 2608 cond_resched();
2608 } 2609 }
2609 free_extent_map(em); 2610 free_extent_map(em);
2610 if (!ret) { 2611 if (!ret) {
2611 if (whence == SEEK_DATA && start >= inode->i_size) 2612 if (whence == SEEK_DATA && start >= inode->i_size)
2612 ret = -ENXIO; 2613 ret = -ENXIO;
2613 else 2614 else
2614 *offset = min_t(loff_t, start, inode->i_size); 2615 *offset = min_t(loff_t, start, inode->i_size);
2615 } 2616 }
2616 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2617 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2617 &cached_state, GFP_NOFS); 2618 &cached_state, GFP_NOFS);
2618 return ret; 2619 return ret;
2619 } 2620 }
2620 2621
2621 static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) 2622 static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
2622 { 2623 {
2623 struct inode *inode = file->f_mapping->host; 2624 struct inode *inode = file->f_mapping->host;
2624 int ret; 2625 int ret;
2625 2626
2626 mutex_lock(&inode->i_mutex); 2627 mutex_lock(&inode->i_mutex);
2627 switch (whence) { 2628 switch (whence) {
2628 case SEEK_END: 2629 case SEEK_END:
2629 case SEEK_CUR: 2630 case SEEK_CUR:
2630 offset = generic_file_llseek(file, offset, whence); 2631 offset = generic_file_llseek(file, offset, whence);
2631 goto out; 2632 goto out;
2632 case SEEK_DATA: 2633 case SEEK_DATA:
2633 case SEEK_HOLE: 2634 case SEEK_HOLE:
2634 if (offset >= i_size_read(inode)) { 2635 if (offset >= i_size_read(inode)) {
2635 mutex_unlock(&inode->i_mutex); 2636 mutex_unlock(&inode->i_mutex);
2636 return -ENXIO; 2637 return -ENXIO;
2637 } 2638 }
2638 2639
2639 ret = find_desired_extent(inode, &offset, whence); 2640 ret = find_desired_extent(inode, &offset, whence);
2640 if (ret) { 2641 if (ret) {
2641 mutex_unlock(&inode->i_mutex); 2642 mutex_unlock(&inode->i_mutex);
2642 return ret; 2643 return ret;
2643 } 2644 }
2644 } 2645 }
2645 2646
2646 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); 2647 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
2647 out: 2648 out:
2648 mutex_unlock(&inode->i_mutex); 2649 mutex_unlock(&inode->i_mutex);
2649 return offset; 2650 return offset;
2650 } 2651 }
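
SEEK_DATA and SEEK_HOLE from userspace land in find_desired_extent() above, and -ENXIO signals an offset at or beyond i_size. A minimal userspace sketch that walks the data extents of a sparse file (the path is an assumption):

/* Hypothetical sketch: enumerate the data ranges of a sparse file
 * with SEEK_DATA/SEEK_HOLE, served by find_desired_extent() above. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/btrfs/sparse", O_RDONLY);	/* assumed path */
	if (fd < 0) {
		perror("open");
		return 1;
	}
	off_t data = 0;
	for (;;) {
		data = lseek(fd, data, SEEK_DATA);
		if (data < 0)
			break;	/* ENXIO: no more data past this offset */
		off_t hole = lseek(fd, data, SEEK_HOLE);
		if (hole < 0)
			break;
		printf("data: [%lld, %lld)\n", (long long)data,
		       (long long)hole);
		data = hole;
	}
	close(fd);
	return 0;
}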
2651 2652
2652 const struct file_operations btrfs_file_operations = { 2653 const struct file_operations btrfs_file_operations = {
2653 .llseek = btrfs_file_llseek, 2654 .llseek = btrfs_file_llseek,
2654 .read = do_sync_read, 2655 .read = do_sync_read,
2655 .write = do_sync_write, 2656 .write = do_sync_write,
2656 .aio_read = generic_file_aio_read, 2657 .aio_read = generic_file_aio_read,
2657 .splice_read = generic_file_splice_read, 2658 .splice_read = generic_file_splice_read,
2658 .aio_write = btrfs_file_aio_write, 2659 .aio_write = btrfs_file_aio_write,
2659 .mmap = btrfs_file_mmap, 2660 .mmap = btrfs_file_mmap,
2660 .open = generic_file_open, 2661 .open = generic_file_open,
2661 .release = btrfs_release_file, 2662 .release = btrfs_release_file,
2662 .fsync = btrfs_sync_file, 2663 .fsync = btrfs_sync_file,
2663 .fallocate = btrfs_fallocate, 2664 .fallocate = btrfs_fallocate,
2664 .unlocked_ioctl = btrfs_ioctl, 2665 .unlocked_ioctl = btrfs_ioctl,
2665 #ifdef CONFIG_COMPAT 2666 #ifdef CONFIG_COMPAT
2666 .compat_ioctl = btrfs_ioctl, 2667 .compat_ioctl = btrfs_ioctl,
2667 #endif 2668 #endif
2668 }; 2669 };
2669 2670
2670 void btrfs_auto_defrag_exit(void) 2671 void btrfs_auto_defrag_exit(void)
2671 { 2672 {
2672 if (btrfs_inode_defrag_cachep) 2673 if (btrfs_inode_defrag_cachep)
2673 kmem_cache_destroy(btrfs_inode_defrag_cachep); 2674 kmem_cache_destroy(btrfs_inode_defrag_cachep);
2674 } 2675 }
2675 2676
2676 int btrfs_auto_defrag_init(void) 2677 int btrfs_auto_defrag_init(void)
2677 { 2678 {
2678 btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", 2679 btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
2679 sizeof(struct inode_defrag), 0, 2680 sizeof(struct inode_defrag), 0,
2680 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, 2681 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
2681 NULL); 2682 NULL);
2682 if (!btrfs_inode_defrag_cachep) 2683 if (!btrfs_inode_defrag_cachep)
2683 return -ENOMEM; 2684 return -ENOMEM;
2684 2685
2685 return 0; 2686 return 0;
2686 } 2687 }
2687 2688