Commit 2501c4a066e633524791e8ce8dbfe615aca071cf

Authored by Filipe Manana
Committed by Greg Kroah-Hartman
1 parent bddf0faccf

Btrfs: read inode size after acquiring the mutex when punching a hole

commit a1a50f60a6bf4f861eb94793420274bc1ccd409a upstream.

In a previous change, commit 12870f1c9b2de7d475d22e73fd7db1b418599725,
I accidentally moved the roundup of inode->i_size to outside of the
critical section delimited by the inode mutex. This is not atomic and
not correct, since the size can be changed by another task before we
acquire the mutex. Therefore fix it.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
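
The race the message describes is easiest to see in outline. Below is a
minimal sketch of the corrected ordering in the hole-punching path of
fs/btrfs/file.c; it is illustrative only, not the verbatim upstream hunk
(the function name, the elided body, and the exact helpers around the
roundup are assumptions for the sketch):

	/*
	 * Sketch: compute the rounded-up i_size only after taking the
	 * inode mutex, so no other task can change i_size between the
	 * read and the hole-punching work that depends on it.
	 */
	static int punch_hole_sketch(struct inode *inode, loff_t offset,
				     loff_t len)
	{
		u64 ino_size;	/* no longer initialized before the lock */
		int ret;

		ret = btrfs_wait_ordered_range(inode, offset, len);
		if (ret)
			return ret;

		mutex_lock(&inode->i_mutex);
		/* read i_size inside the critical section */
		ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);

		/* ... drop extents / zero partial pages using ino_size ... */

		mutex_unlock(&inode->i_mutex);
		return ret;
	}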

Showing 1 changed file with 2 additions and 1 deletion

1 /* 1 /*
2 * Copyright (C) 2007 Oracle. All rights reserved. 2 * Copyright (C) 2007 Oracle. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or 4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public 5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation. 6 * License v2 as published by the Free Software Foundation.
7 * 7 *
8 * This program is distributed in the hope that it will be useful, 8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details. 11 * General Public License for more details.
12 * 12 *
13 * You should have received a copy of the GNU General Public 13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the 14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19 #include <linux/fs.h> 19 #include <linux/fs.h>
20 #include <linux/pagemap.h> 20 #include <linux/pagemap.h>
21 #include <linux/highmem.h> 21 #include <linux/highmem.h>
22 #include <linux/time.h> 22 #include <linux/time.h>
23 #include <linux/init.h> 23 #include <linux/init.h>
24 #include <linux/string.h> 24 #include <linux/string.h>
25 #include <linux/backing-dev.h> 25 #include <linux/backing-dev.h>
26 #include <linux/mpage.h> 26 #include <linux/mpage.h>
27 #include <linux/aio.h> 27 #include <linux/aio.h>
28 #include <linux/falloc.h> 28 #include <linux/falloc.h>
29 #include <linux/swap.h> 29 #include <linux/swap.h>
30 #include <linux/writeback.h> 30 #include <linux/writeback.h>
31 #include <linux/statfs.h> 31 #include <linux/statfs.h>
32 #include <linux/compat.h> 32 #include <linux/compat.h>
33 #include <linux/slab.h> 33 #include <linux/slab.h>
34 #include <linux/btrfs.h> 34 #include <linux/btrfs.h>
35 #include "ctree.h" 35 #include "ctree.h"
36 #include "disk-io.h" 36 #include "disk-io.h"
37 #include "transaction.h" 37 #include "transaction.h"
38 #include "btrfs_inode.h" 38 #include "btrfs_inode.h"
39 #include "print-tree.h" 39 #include "print-tree.h"
40 #include "tree-log.h" 40 #include "tree-log.h"
41 #include "locking.h" 41 #include "locking.h"
42 #include "volumes.h" 42 #include "volumes.h"
43 43
44 static struct kmem_cache *btrfs_inode_defrag_cachep; 44 static struct kmem_cache *btrfs_inode_defrag_cachep;
45 /* 45 /*
46 * when auto defrag is enabled we 46 * when auto defrag is enabled we
47 * queue up these defrag structs to remember which 47 * queue up these defrag structs to remember which
48 * inodes need defragging passes 48 * inodes need defragging passes
49 */ 49 */
50 struct inode_defrag { 50 struct inode_defrag {
51 struct rb_node rb_node; 51 struct rb_node rb_node;
52 /* objectid */ 52 /* objectid */
53 u64 ino; 53 u64 ino;
54 /* 54 /*
55 * transid where the defrag was added, we search for 55 * transid where the defrag was added, we search for
56 * extents newer than this 56 * extents newer than this
57 */ 57 */
58 u64 transid; 58 u64 transid;
59 59
60 /* root objectid */ 60 /* root objectid */
61 u64 root; 61 u64 root;
62 62
63 /* last offset we were able to defrag */ 63 /* last offset we were able to defrag */
64 u64 last_offset; 64 u64 last_offset;
65 65
66 /* if we've wrapped around back to zero once already */ 66 /* if we've wrapped around back to zero once already */
67 int cycled; 67 int cycled;
68 }; 68 };
69 69
70 static int __compare_inode_defrag(struct inode_defrag *defrag1, 70 static int __compare_inode_defrag(struct inode_defrag *defrag1,
71 struct inode_defrag *defrag2) 71 struct inode_defrag *defrag2)
72 { 72 {
73 if (defrag1->root > defrag2->root) 73 if (defrag1->root > defrag2->root)
74 return 1; 74 return 1;
75 else if (defrag1->root < defrag2->root) 75 else if (defrag1->root < defrag2->root)
76 return -1; 76 return -1;
77 else if (defrag1->ino > defrag2->ino) 77 else if (defrag1->ino > defrag2->ino)
78 return 1; 78 return 1;
79 else if (defrag1->ino < defrag2->ino) 79 else if (defrag1->ino < defrag2->ino)
80 return -1; 80 return -1;
81 else 81 else
82 return 0; 82 return 0;
83 } 83 }
84 84
85 /* pop a record for an inode into the defrag tree. The lock 85 /* pop a record for an inode into the defrag tree. The lock
86 * must be held already 86 * must be held already
87 * 87 *
88 * If you're inserting a record for an older transid than an 88 * If you're inserting a record for an older transid than an
89 * existing record, the transid already in the tree is lowered 89 * existing record, the transid already in the tree is lowered
90 * 90 *
91 * If an existing record is found the defrag item you 91 * If an existing record is found the defrag item you
92 * pass in is freed 92 * pass in is freed
93 */ 93 */
94 static int __btrfs_add_inode_defrag(struct inode *inode, 94 static int __btrfs_add_inode_defrag(struct inode *inode,
95 struct inode_defrag *defrag) 95 struct inode_defrag *defrag)
96 { 96 {
97 struct btrfs_root *root = BTRFS_I(inode)->root; 97 struct btrfs_root *root = BTRFS_I(inode)->root;
98 struct inode_defrag *entry; 98 struct inode_defrag *entry;
99 struct rb_node **p; 99 struct rb_node **p;
100 struct rb_node *parent = NULL; 100 struct rb_node *parent = NULL;
101 int ret; 101 int ret;
102 102
103 p = &root->fs_info->defrag_inodes.rb_node; 103 p = &root->fs_info->defrag_inodes.rb_node;
104 while (*p) { 104 while (*p) {
105 parent = *p; 105 parent = *p;
106 entry = rb_entry(parent, struct inode_defrag, rb_node); 106 entry = rb_entry(parent, struct inode_defrag, rb_node);
107 107
108 ret = __compare_inode_defrag(defrag, entry); 108 ret = __compare_inode_defrag(defrag, entry);
109 if (ret < 0) 109 if (ret < 0)
110 p = &parent->rb_left; 110 p = &parent->rb_left;
111 else if (ret > 0) 111 else if (ret > 0)
112 p = &parent->rb_right; 112 p = &parent->rb_right;
113 else { 113 else {
114 /* if we're reinserting an entry for 114 /* if we're reinserting an entry for
115 * an old defrag run, make sure to 115 * an old defrag run, make sure to
116 * lower the transid of our existing record 116 * lower the transid of our existing record
117 */ 117 */
118 if (defrag->transid < entry->transid) 118 if (defrag->transid < entry->transid)
119 entry->transid = defrag->transid; 119 entry->transid = defrag->transid;
120 if (defrag->last_offset > entry->last_offset) 120 if (defrag->last_offset > entry->last_offset)
121 entry->last_offset = defrag->last_offset; 121 entry->last_offset = defrag->last_offset;
122 return -EEXIST; 122 return -EEXIST;
123 } 123 }
124 } 124 }
125 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 125 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
126 rb_link_node(&defrag->rb_node, parent, p); 126 rb_link_node(&defrag->rb_node, parent, p);
127 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); 127 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
128 return 0; 128 return 0;
129 } 129 }
130 130
131 static inline int __need_auto_defrag(struct btrfs_root *root) 131 static inline int __need_auto_defrag(struct btrfs_root *root)
132 { 132 {
133 if (!btrfs_test_opt(root, AUTO_DEFRAG)) 133 if (!btrfs_test_opt(root, AUTO_DEFRAG))
134 return 0; 134 return 0;
135 135
136 if (btrfs_fs_closing(root->fs_info)) 136 if (btrfs_fs_closing(root->fs_info))
137 return 0; 137 return 0;
138 138
139 return 1; 139 return 1;
140 } 140 }
141 141
142 /* 142 /*
143 * insert a defrag record for this inode if auto defrag is 143 * insert a defrag record for this inode if auto defrag is
144 * enabled 144 * enabled
145 */ 145 */
146 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, 146 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
147 struct inode *inode) 147 struct inode *inode)
148 { 148 {
149 struct btrfs_root *root = BTRFS_I(inode)->root; 149 struct btrfs_root *root = BTRFS_I(inode)->root;
150 struct inode_defrag *defrag; 150 struct inode_defrag *defrag;
151 u64 transid; 151 u64 transid;
152 int ret; 152 int ret;
153 153
154 if (!__need_auto_defrag(root)) 154 if (!__need_auto_defrag(root))
155 return 0; 155 return 0;
156 156
157 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 157 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
158 return 0; 158 return 0;
159 159
160 if (trans) 160 if (trans)
161 transid = trans->transid; 161 transid = trans->transid;
162 else 162 else
163 transid = BTRFS_I(inode)->root->last_trans; 163 transid = BTRFS_I(inode)->root->last_trans;
164 164
165 defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); 165 defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
166 if (!defrag) 166 if (!defrag)
167 return -ENOMEM; 167 return -ENOMEM;
168 168
169 defrag->ino = btrfs_ino(inode); 169 defrag->ino = btrfs_ino(inode);
170 defrag->transid = transid; 170 defrag->transid = transid;
171 defrag->root = root->root_key.objectid; 171 defrag->root = root->root_key.objectid;
172 172
173 spin_lock(&root->fs_info->defrag_inodes_lock); 173 spin_lock(&root->fs_info->defrag_inodes_lock);
174 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) { 174 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
175 /* 175 /*
176 * If we set IN_DEFRAG flag and evict the inode from memory, 176 * If we set IN_DEFRAG flag and evict the inode from memory,
177 * and then re-read this inode, this new inode doesn't have 177 * and then re-read this inode, this new inode doesn't have
178 * IN_DEFRAG flag. At the case, we may find the existed defrag. 178 * IN_DEFRAG flag. At the case, we may find the existed defrag.
179 */ 179 */
180 ret = __btrfs_add_inode_defrag(inode, defrag); 180 ret = __btrfs_add_inode_defrag(inode, defrag);
181 if (ret) 181 if (ret)
182 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 182 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
183 } else { 183 } else {
184 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 184 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
185 } 185 }
186 spin_unlock(&root->fs_info->defrag_inodes_lock); 186 spin_unlock(&root->fs_info->defrag_inodes_lock);
187 return 0; 187 return 0;
188 } 188 }
189 189
190 /* 190 /*
191 * Requeue the defrag object. If there is a defrag object that points to 191 * Requeue the defrag object. If there is a defrag object that points to
192 * the same inode in the tree, we will merge them together (by 192 * the same inode in the tree, we will merge them together (by
193 * __btrfs_add_inode_defrag()) and free the one that we want to requeue. 193 * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
194 */ 194 */
195 static void btrfs_requeue_inode_defrag(struct inode *inode, 195 static void btrfs_requeue_inode_defrag(struct inode *inode,
196 struct inode_defrag *defrag) 196 struct inode_defrag *defrag)
197 { 197 {
198 struct btrfs_root *root = BTRFS_I(inode)->root; 198 struct btrfs_root *root = BTRFS_I(inode)->root;
199 int ret; 199 int ret;
200 200
201 if (!__need_auto_defrag(root)) 201 if (!__need_auto_defrag(root))
202 goto out; 202 goto out;
203 203
204 /* 204 /*
205 * Here we don't check the IN_DEFRAG flag, because we need merge 205 * Here we don't check the IN_DEFRAG flag, because we need merge
206 * them together. 206 * them together.
207 */ 207 */
208 spin_lock(&root->fs_info->defrag_inodes_lock); 208 spin_lock(&root->fs_info->defrag_inodes_lock);
209 ret = __btrfs_add_inode_defrag(inode, defrag); 209 ret = __btrfs_add_inode_defrag(inode, defrag);
210 spin_unlock(&root->fs_info->defrag_inodes_lock); 210 spin_unlock(&root->fs_info->defrag_inodes_lock);
211 if (ret) 211 if (ret)
212 goto out; 212 goto out;
213 return; 213 return;
214 out: 214 out:
215 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 215 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
216 } 216 }
217 217
218 /* 218 /*
219 * pick the defragable inode that we want, if it doesn't exist, we will get 219 * pick the defragable inode that we want, if it doesn't exist, we will get
220 * the next one. 220 * the next one.
221 */ 221 */
222 static struct inode_defrag * 222 static struct inode_defrag *
223 btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino) 223 btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
224 { 224 {
225 struct inode_defrag *entry = NULL; 225 struct inode_defrag *entry = NULL;
226 struct inode_defrag tmp; 226 struct inode_defrag tmp;
227 struct rb_node *p; 227 struct rb_node *p;
228 struct rb_node *parent = NULL; 228 struct rb_node *parent = NULL;
229 int ret; 229 int ret;
230 230
231 tmp.ino = ino; 231 tmp.ino = ino;
232 tmp.root = root; 232 tmp.root = root;
233 233
234 spin_lock(&fs_info->defrag_inodes_lock); 234 spin_lock(&fs_info->defrag_inodes_lock);
235 p = fs_info->defrag_inodes.rb_node; 235 p = fs_info->defrag_inodes.rb_node;
236 while (p) { 236 while (p) {
237 parent = p; 237 parent = p;
238 entry = rb_entry(parent, struct inode_defrag, rb_node); 238 entry = rb_entry(parent, struct inode_defrag, rb_node);
239 239
240 ret = __compare_inode_defrag(&tmp, entry); 240 ret = __compare_inode_defrag(&tmp, entry);
241 if (ret < 0) 241 if (ret < 0)
242 p = parent->rb_left; 242 p = parent->rb_left;
243 else if (ret > 0) 243 else if (ret > 0)
244 p = parent->rb_right; 244 p = parent->rb_right;
245 else 245 else
246 goto out; 246 goto out;
247 } 247 }
248 248
249 if (parent && __compare_inode_defrag(&tmp, entry) > 0) { 249 if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
250 parent = rb_next(parent); 250 parent = rb_next(parent);
251 if (parent) 251 if (parent)
252 entry = rb_entry(parent, struct inode_defrag, rb_node); 252 entry = rb_entry(parent, struct inode_defrag, rb_node);
253 else 253 else
254 entry = NULL; 254 entry = NULL;
255 } 255 }
256 out: 256 out:
257 if (entry) 257 if (entry)
258 rb_erase(parent, &fs_info->defrag_inodes); 258 rb_erase(parent, &fs_info->defrag_inodes);
259 spin_unlock(&fs_info->defrag_inodes_lock); 259 spin_unlock(&fs_info->defrag_inodes_lock);
260 return entry; 260 return entry;
261 } 261 }
262 262
263 void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) 263 void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
264 { 264 {
265 struct inode_defrag *defrag; 265 struct inode_defrag *defrag;
266 struct rb_node *node; 266 struct rb_node *node;
267 267
268 spin_lock(&fs_info->defrag_inodes_lock); 268 spin_lock(&fs_info->defrag_inodes_lock);
269 node = rb_first(&fs_info->defrag_inodes); 269 node = rb_first(&fs_info->defrag_inodes);
270 while (node) { 270 while (node) {
271 rb_erase(node, &fs_info->defrag_inodes); 271 rb_erase(node, &fs_info->defrag_inodes);
272 defrag = rb_entry(node, struct inode_defrag, rb_node); 272 defrag = rb_entry(node, struct inode_defrag, rb_node);
273 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 273 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
274 274
275 if (need_resched()) { 275 if (need_resched()) {
276 spin_unlock(&fs_info->defrag_inodes_lock); 276 spin_unlock(&fs_info->defrag_inodes_lock);
277 cond_resched(); 277 cond_resched();
278 spin_lock(&fs_info->defrag_inodes_lock); 278 spin_lock(&fs_info->defrag_inodes_lock);
279 } 279 }
280 280
281 node = rb_first(&fs_info->defrag_inodes); 281 node = rb_first(&fs_info->defrag_inodes);
282 } 282 }
283 spin_unlock(&fs_info->defrag_inodes_lock); 283 spin_unlock(&fs_info->defrag_inodes_lock);
284 } 284 }
285 285
286 #define BTRFS_DEFRAG_BATCH 1024 286 #define BTRFS_DEFRAG_BATCH 1024
287 287
288 static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, 288 static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
289 struct inode_defrag *defrag) 289 struct inode_defrag *defrag)
290 { 290 {
291 struct btrfs_root *inode_root; 291 struct btrfs_root *inode_root;
292 struct inode *inode; 292 struct inode *inode;
293 struct btrfs_key key; 293 struct btrfs_key key;
294 struct btrfs_ioctl_defrag_range_args range; 294 struct btrfs_ioctl_defrag_range_args range;
295 int num_defrag; 295 int num_defrag;
296 int index; 296 int index;
297 int ret; 297 int ret;
298 298
299 /* get the inode */ 299 /* get the inode */
300 key.objectid = defrag->root; 300 key.objectid = defrag->root;
301 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 301 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
302 key.offset = (u64)-1; 302 key.offset = (u64)-1;
303 303
304 index = srcu_read_lock(&fs_info->subvol_srcu); 304 index = srcu_read_lock(&fs_info->subvol_srcu);
305 305
306 inode_root = btrfs_read_fs_root_no_name(fs_info, &key); 306 inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
307 if (IS_ERR(inode_root)) { 307 if (IS_ERR(inode_root)) {
308 ret = PTR_ERR(inode_root); 308 ret = PTR_ERR(inode_root);
309 goto cleanup; 309 goto cleanup;
310 } 310 }
311 311
312 key.objectid = defrag->ino; 312 key.objectid = defrag->ino;
313 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 313 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
314 key.offset = 0; 314 key.offset = 0;
315 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); 315 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
316 if (IS_ERR(inode)) { 316 if (IS_ERR(inode)) {
317 ret = PTR_ERR(inode); 317 ret = PTR_ERR(inode);
318 goto cleanup; 318 goto cleanup;
319 } 319 }
320 srcu_read_unlock(&fs_info->subvol_srcu, index); 320 srcu_read_unlock(&fs_info->subvol_srcu, index);
321 321
322 /* do a chunk of defrag */ 322 /* do a chunk of defrag */
323 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 323 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
324 memset(&range, 0, sizeof(range)); 324 memset(&range, 0, sizeof(range));
325 range.len = (u64)-1; 325 range.len = (u64)-1;
326 range.start = defrag->last_offset; 326 range.start = defrag->last_offset;
327 327
328 sb_start_write(fs_info->sb); 328 sb_start_write(fs_info->sb);
329 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, 329 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
330 BTRFS_DEFRAG_BATCH); 330 BTRFS_DEFRAG_BATCH);
331 sb_end_write(fs_info->sb); 331 sb_end_write(fs_info->sb);
332 /* 332 /*
333 * if we filled the whole defrag batch, there 333 * if we filled the whole defrag batch, there
334 * must be more work to do. Queue this defrag 334 * must be more work to do. Queue this defrag
335 * again 335 * again
336 */ 336 */
337 if (num_defrag == BTRFS_DEFRAG_BATCH) { 337 if (num_defrag == BTRFS_DEFRAG_BATCH) {
338 defrag->last_offset = range.start; 338 defrag->last_offset = range.start;
339 btrfs_requeue_inode_defrag(inode, defrag); 339 btrfs_requeue_inode_defrag(inode, defrag);
340 } else if (defrag->last_offset && !defrag->cycled) { 340 } else if (defrag->last_offset && !defrag->cycled) {
341 /* 341 /*
342 * we didn't fill our defrag batch, but 342 * we didn't fill our defrag batch, but
343 * we didn't start at zero. Make sure we loop 343 * we didn't start at zero. Make sure we loop
344 * around to the start of the file. 344 * around to the start of the file.
345 */ 345 */
346 defrag->last_offset = 0; 346 defrag->last_offset = 0;
347 defrag->cycled = 1; 347 defrag->cycled = 1;
348 btrfs_requeue_inode_defrag(inode, defrag); 348 btrfs_requeue_inode_defrag(inode, defrag);
349 } else { 349 } else {
350 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 350 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
351 } 351 }
352 352
353 iput(inode); 353 iput(inode);
354 return 0; 354 return 0;
355 cleanup: 355 cleanup:
356 srcu_read_unlock(&fs_info->subvol_srcu, index); 356 srcu_read_unlock(&fs_info->subvol_srcu, index);
357 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 357 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
358 return ret; 358 return ret;
359 } 359 }
360 360
361 /* 361 /*
362 * run through the list of inodes in the FS that need 362 * run through the list of inodes in the FS that need
363 * defragging 363 * defragging
364 */ 364 */
365 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) 365 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
366 { 366 {
367 struct inode_defrag *defrag; 367 struct inode_defrag *defrag;
368 u64 first_ino = 0; 368 u64 first_ino = 0;
369 u64 root_objectid = 0; 369 u64 root_objectid = 0;
370 370
371 atomic_inc(&fs_info->defrag_running); 371 atomic_inc(&fs_info->defrag_running);
372 while (1) { 372 while (1) {
373 /* Pause the auto defragger. */ 373 /* Pause the auto defragger. */
374 if (test_bit(BTRFS_FS_STATE_REMOUNTING, 374 if (test_bit(BTRFS_FS_STATE_REMOUNTING,
375 &fs_info->fs_state)) 375 &fs_info->fs_state))
376 break; 376 break;
377 377
378 if (!__need_auto_defrag(fs_info->tree_root)) 378 if (!__need_auto_defrag(fs_info->tree_root))
379 break; 379 break;
380 380
381 /* find an inode to defrag */ 381 /* find an inode to defrag */
382 defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, 382 defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
383 first_ino); 383 first_ino);
384 if (!defrag) { 384 if (!defrag) {
385 if (root_objectid || first_ino) { 385 if (root_objectid || first_ino) {
386 root_objectid = 0; 386 root_objectid = 0;
387 first_ino = 0; 387 first_ino = 0;
388 continue; 388 continue;
389 } else { 389 } else {
390 break; 390 break;
391 } 391 }
392 } 392 }
393 393
394 first_ino = defrag->ino + 1; 394 first_ino = defrag->ino + 1;
395 root_objectid = defrag->root; 395 root_objectid = defrag->root;
396 396
397 __btrfs_run_defrag_inode(fs_info, defrag); 397 __btrfs_run_defrag_inode(fs_info, defrag);
398 } 398 }
399 atomic_dec(&fs_info->defrag_running); 399 atomic_dec(&fs_info->defrag_running);
400 400
401 /* 401 /*
402 * during unmount, we use the transaction_wait queue to 402 * during unmount, we use the transaction_wait queue to
403 * wait for the defragger to stop 403 * wait for the defragger to stop
404 */ 404 */
405 wake_up(&fs_info->transaction_wait); 405 wake_up(&fs_info->transaction_wait);
406 return 0; 406 return 0;
407 } 407 }
408 408
409 /* simple helper to fault in pages and copy. This should go away 409 /* simple helper to fault in pages and copy. This should go away
410 * and be replaced with calls into generic code. 410 * and be replaced with calls into generic code.
411 */ 411 */
412 static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, 412 static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
413 size_t write_bytes, 413 size_t write_bytes,
414 struct page **prepared_pages, 414 struct page **prepared_pages,
415 struct iov_iter *i) 415 struct iov_iter *i)
416 { 416 {
417 size_t copied = 0; 417 size_t copied = 0;
418 size_t total_copied = 0; 418 size_t total_copied = 0;
419 int pg = 0; 419 int pg = 0;
420 int offset = pos & (PAGE_CACHE_SIZE - 1); 420 int offset = pos & (PAGE_CACHE_SIZE - 1);
421 421
422 while (write_bytes > 0) { 422 while (write_bytes > 0) {
423 size_t count = min_t(size_t, 423 size_t count = min_t(size_t,
424 PAGE_CACHE_SIZE - offset, write_bytes); 424 PAGE_CACHE_SIZE - offset, write_bytes);
425 struct page *page = prepared_pages[pg]; 425 struct page *page = prepared_pages[pg];
426 /* 426 /*
427 * Copy data from userspace to the current page 427 * Copy data from userspace to the current page
428 */ 428 */
429 copied = iov_iter_copy_from_user_atomic(page, i, offset, count); 429 copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
430 430
431 /* Flush processor's dcache for this page */ 431 /* Flush processor's dcache for this page */
432 flush_dcache_page(page); 432 flush_dcache_page(page);
433 433
434 /* 434 /*
435 * if we get a partial write, we can end up with 435 * if we get a partial write, we can end up with
436 * partially up to date pages. These add 436 * partially up to date pages. These add
437 * a lot of complexity, so make sure they don't 437 * a lot of complexity, so make sure they don't
438 * happen by forcing this copy to be retried. 438 * happen by forcing this copy to be retried.
439 * 439 *
440 * The rest of the btrfs_file_write code will fall 440 * The rest of the btrfs_file_write code will fall
441 * back to page at a time copies after we return 0. 441 * back to page at a time copies after we return 0.
442 */ 442 */
443 if (!PageUptodate(page) && copied < count) 443 if (!PageUptodate(page) && copied < count)
444 copied = 0; 444 copied = 0;
445 445
446 iov_iter_advance(i, copied); 446 iov_iter_advance(i, copied);
447 write_bytes -= copied; 447 write_bytes -= copied;
448 total_copied += copied; 448 total_copied += copied;
449 449
450 /* Return to btrfs_file_aio_write to fault page */ 450 /* Return to btrfs_file_aio_write to fault page */
451 if (unlikely(copied == 0)) 451 if (unlikely(copied == 0))
452 break; 452 break;
453 453
454 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { 454 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
455 offset += copied; 455 offset += copied;
456 } else { 456 } else {
457 pg++; 457 pg++;
458 offset = 0; 458 offset = 0;
459 } 459 }
460 } 460 }
461 return total_copied; 461 return total_copied;
462 } 462 }
463 463
464 /* 464 /*
465 * unlocks pages after btrfs_file_write is done with them 465 * unlocks pages after btrfs_file_write is done with them
466 */ 466 */
467 static void btrfs_drop_pages(struct page **pages, size_t num_pages) 467 static void btrfs_drop_pages(struct page **pages, size_t num_pages)
468 { 468 {
469 size_t i; 469 size_t i;
470 for (i = 0; i < num_pages; i++) { 470 for (i = 0; i < num_pages; i++) {
471 /* page checked is some magic around finding pages that 471 /* page checked is some magic around finding pages that
472 * have been modified without going through btrfs_set_page_dirty 472 * have been modified without going through btrfs_set_page_dirty
473 * clear it here 473 * clear it here
474 */ 474 */
475 ClearPageChecked(pages[i]); 475 ClearPageChecked(pages[i]);
476 unlock_page(pages[i]); 476 unlock_page(pages[i]);
477 mark_page_accessed(pages[i]); 477 mark_page_accessed(pages[i]);
478 page_cache_release(pages[i]); 478 page_cache_release(pages[i]);
479 } 479 }
480 } 480 }
481 481
482 /* 482 /*
483 * after copy_from_user, pages need to be dirtied and we need to make 483 * after copy_from_user, pages need to be dirtied and we need to make
484 * sure holes are created between the current EOF and the start of 484 * sure holes are created between the current EOF and the start of
485 * any next extents (if required). 485 * any next extents (if required).
486 * 486 *
487 * this also makes the decision about creating an inline extent vs 487 * this also makes the decision about creating an inline extent vs
488 * doing real data extents, marking pages dirty and delalloc as required. 488 * doing real data extents, marking pages dirty and delalloc as required.
489 */ 489 */
490 int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, 490 int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
491 struct page **pages, size_t num_pages, 491 struct page **pages, size_t num_pages,
492 loff_t pos, size_t write_bytes, 492 loff_t pos, size_t write_bytes,
493 struct extent_state **cached) 493 struct extent_state **cached)
494 { 494 {
495 int err = 0; 495 int err = 0;
496 int i; 496 int i;
497 u64 num_bytes; 497 u64 num_bytes;
498 u64 start_pos; 498 u64 start_pos;
499 u64 end_of_last_block; 499 u64 end_of_last_block;
500 u64 end_pos = pos + write_bytes; 500 u64 end_pos = pos + write_bytes;
501 loff_t isize = i_size_read(inode); 501 loff_t isize = i_size_read(inode);
502 502
503 start_pos = pos & ~((u64)root->sectorsize - 1); 503 start_pos = pos & ~((u64)root->sectorsize - 1);
504 num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize); 504 num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize);
505 505
506 end_of_last_block = start_pos + num_bytes - 1; 506 end_of_last_block = start_pos + num_bytes - 1;
507 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 507 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
508 cached); 508 cached);
509 if (err) 509 if (err)
510 return err; 510 return err;
511 511
512 for (i = 0; i < num_pages; i++) { 512 for (i = 0; i < num_pages; i++) {
513 struct page *p = pages[i]; 513 struct page *p = pages[i];
514 SetPageUptodate(p); 514 SetPageUptodate(p);
515 ClearPageChecked(p); 515 ClearPageChecked(p);
516 set_page_dirty(p); 516 set_page_dirty(p);
517 } 517 }
518 518
519 /* 519 /*
520 * we've only changed i_size in ram, and we haven't updated 520 * we've only changed i_size in ram, and we haven't updated
521 * the disk i_size. There is no need to log the inode 521 * the disk i_size. There is no need to log the inode
522 * at this time. 522 * at this time.
523 */ 523 */
524 if (end_pos > isize) 524 if (end_pos > isize)
525 i_size_write(inode, end_pos); 525 i_size_write(inode, end_pos);
526 return 0; 526 return 0;
527 } 527 }
528 528
529 /* 529 /*
530 * this drops all the extents in the cache that intersect the range 530 * this drops all the extents in the cache that intersect the range
531 * [start, end]. Existing extents are split as required. 531 * [start, end]. Existing extents are split as required.
532 */ 532 */
533 void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 533 void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
534 int skip_pinned) 534 int skip_pinned)
535 { 535 {
536 struct extent_map *em; 536 struct extent_map *em;
537 struct extent_map *split = NULL; 537 struct extent_map *split = NULL;
538 struct extent_map *split2 = NULL; 538 struct extent_map *split2 = NULL;
539 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 539 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
540 u64 len = end - start + 1; 540 u64 len = end - start + 1;
541 u64 gen; 541 u64 gen;
542 int ret; 542 int ret;
543 int testend = 1; 543 int testend = 1;
544 unsigned long flags; 544 unsigned long flags;
545 int compressed = 0; 545 int compressed = 0;
546 bool modified; 546 bool modified;
547 547
548 WARN_ON(end < start); 548 WARN_ON(end < start);
549 if (end == (u64)-1) { 549 if (end == (u64)-1) {
550 len = (u64)-1; 550 len = (u64)-1;
551 testend = 0; 551 testend = 0;
552 } 552 }
553 while (1) { 553 while (1) {
554 int no_splits = 0; 554 int no_splits = 0;
555 555
556 modified = false; 556 modified = false;
557 if (!split) 557 if (!split)
558 split = alloc_extent_map(); 558 split = alloc_extent_map();
559 if (!split2) 559 if (!split2)
560 split2 = alloc_extent_map(); 560 split2 = alloc_extent_map();
561 if (!split || !split2) 561 if (!split || !split2)
562 no_splits = 1; 562 no_splits = 1;
563 563
564 write_lock(&em_tree->lock); 564 write_lock(&em_tree->lock);
565 em = lookup_extent_mapping(em_tree, start, len); 565 em = lookup_extent_mapping(em_tree, start, len);
566 if (!em) { 566 if (!em) {
567 write_unlock(&em_tree->lock); 567 write_unlock(&em_tree->lock);
568 break; 568 break;
569 } 569 }
570 flags = em->flags; 570 flags = em->flags;
571 gen = em->generation; 571 gen = em->generation;
572 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { 572 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
573 if (testend && em->start + em->len >= start + len) { 573 if (testend && em->start + em->len >= start + len) {
574 free_extent_map(em); 574 free_extent_map(em);
575 write_unlock(&em_tree->lock); 575 write_unlock(&em_tree->lock);
576 break; 576 break;
577 } 577 }
578 start = em->start + em->len; 578 start = em->start + em->len;
579 if (testend) 579 if (testend)
580 len = start + len - (em->start + em->len); 580 len = start + len - (em->start + em->len);
581 free_extent_map(em); 581 free_extent_map(em);
582 write_unlock(&em_tree->lock); 582 write_unlock(&em_tree->lock);
583 continue; 583 continue;
584 } 584 }
585 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 585 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
586 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 586 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
587 clear_bit(EXTENT_FLAG_LOGGING, &flags); 587 clear_bit(EXTENT_FLAG_LOGGING, &flags);
588 modified = !list_empty(&em->list); 588 modified = !list_empty(&em->list);
589 if (no_splits) 589 if (no_splits)
590 goto next; 590 goto next;
591 591
592 if (em->start < start) { 592 if (em->start < start) {
593 split->start = em->start; 593 split->start = em->start;
594 split->len = start - em->start; 594 split->len = start - em->start;
595 595
596 if (em->block_start < EXTENT_MAP_LAST_BYTE) { 596 if (em->block_start < EXTENT_MAP_LAST_BYTE) {
597 split->orig_start = em->orig_start; 597 split->orig_start = em->orig_start;
598 split->block_start = em->block_start; 598 split->block_start = em->block_start;
599 599
600 if (compressed) 600 if (compressed)
601 split->block_len = em->block_len; 601 split->block_len = em->block_len;
602 else 602 else
603 split->block_len = split->len; 603 split->block_len = split->len;
604 split->orig_block_len = max(split->block_len, 604 split->orig_block_len = max(split->block_len,
605 em->orig_block_len); 605 em->orig_block_len);
606 split->ram_bytes = em->ram_bytes; 606 split->ram_bytes = em->ram_bytes;
607 } else { 607 } else {
608 split->orig_start = split->start; 608 split->orig_start = split->start;
609 split->block_len = 0; 609 split->block_len = 0;
610 split->block_start = em->block_start; 610 split->block_start = em->block_start;
611 split->orig_block_len = 0; 611 split->orig_block_len = 0;
612 split->ram_bytes = split->len; 612 split->ram_bytes = split->len;
613 } 613 }
614 614
615 split->generation = gen; 615 split->generation = gen;
616 split->bdev = em->bdev; 616 split->bdev = em->bdev;
617 split->flags = flags; 617 split->flags = flags;
618 split->compress_type = em->compress_type; 618 split->compress_type = em->compress_type;
619 replace_extent_mapping(em_tree, em, split, modified); 619 replace_extent_mapping(em_tree, em, split, modified);
620 free_extent_map(split); 620 free_extent_map(split);
621 split = split2; 621 split = split2;
622 split2 = NULL; 622 split2 = NULL;
623 } 623 }
624 if (testend && em->start + em->len > start + len) { 624 if (testend && em->start + em->len > start + len) {
625 u64 diff = start + len - em->start; 625 u64 diff = start + len - em->start;
626 626
627 split->start = start + len; 627 split->start = start + len;
628 split->len = em->start + em->len - (start + len); 628 split->len = em->start + em->len - (start + len);
629 split->bdev = em->bdev; 629 split->bdev = em->bdev;
630 split->flags = flags; 630 split->flags = flags;
631 split->compress_type = em->compress_type; 631 split->compress_type = em->compress_type;
632 split->generation = gen; 632 split->generation = gen;
633 633
634 if (em->block_start < EXTENT_MAP_LAST_BYTE) { 634 if (em->block_start < EXTENT_MAP_LAST_BYTE) {
635 split->orig_block_len = max(em->block_len, 635 split->orig_block_len = max(em->block_len,
636 em->orig_block_len); 636 em->orig_block_len);
637 637
638 split->ram_bytes = em->ram_bytes; 638 split->ram_bytes = em->ram_bytes;
639 if (compressed) { 639 if (compressed) {
640 split->block_len = em->block_len; 640 split->block_len = em->block_len;
641 split->block_start = em->block_start; 641 split->block_start = em->block_start;
642 split->orig_start = em->orig_start; 642 split->orig_start = em->orig_start;
643 } else { 643 } else {
644 split->block_len = split->len; 644 split->block_len = split->len;
645 split->block_start = em->block_start 645 split->block_start = em->block_start
646 + diff; 646 + diff;
647 split->orig_start = em->orig_start; 647 split->orig_start = em->orig_start;
648 } 648 }
649 } else { 649 } else {
650 split->ram_bytes = split->len; 650 split->ram_bytes = split->len;
651 split->orig_start = split->start; 651 split->orig_start = split->start;
652 split->block_len = 0; 652 split->block_len = 0;
653 split->block_start = em->block_start; 653 split->block_start = em->block_start;
654 split->orig_block_len = 0; 654 split->orig_block_len = 0;
655 } 655 }
656 656
657 if (extent_map_in_tree(em)) { 657 if (extent_map_in_tree(em)) {
658 replace_extent_mapping(em_tree, em, split, 658 replace_extent_mapping(em_tree, em, split,
659 modified); 659 modified);
660 } else { 660 } else {
661 ret = add_extent_mapping(em_tree, split, 661 ret = add_extent_mapping(em_tree, split,
662 modified); 662 modified);
663 ASSERT(ret == 0); /* Logic error */ 663 ASSERT(ret == 0); /* Logic error */
664 } 664 }
665 free_extent_map(split); 665 free_extent_map(split);
666 split = NULL; 666 split = NULL;
667 } 667 }
668 next: 668 next:
669 if (extent_map_in_tree(em)) 669 if (extent_map_in_tree(em))
670 remove_extent_mapping(em_tree, em); 670 remove_extent_mapping(em_tree, em);
671 write_unlock(&em_tree->lock); 671 write_unlock(&em_tree->lock);
672 672
673 /* once for us */ 673 /* once for us */
674 free_extent_map(em); 674 free_extent_map(em);
675 /* once for the tree*/ 675 /* once for the tree*/
676 free_extent_map(em); 676 free_extent_map(em);
677 } 677 }
678 if (split) 678 if (split)
679 free_extent_map(split); 679 free_extent_map(split);
680 if (split2) 680 if (split2)
681 free_extent_map(split2); 681 free_extent_map(split2);
682 } 682 }
683 683
684 /* 684 /*
685 * this is very complex, but the basic idea is to drop all extents 685 * this is very complex, but the basic idea is to drop all extents
686 * in the range start - end. hint_block is filled in with a block number 686 * in the range start - end. hint_block is filled in with a block number
687 * that would be a good hint to the block allocator for this file. 687 * that would be a good hint to the block allocator for this file.
688 * 688 *
689 * If an extent intersects the range but is not entirely inside the range 689 * If an extent intersects the range but is not entirely inside the range
690 * it is either truncated or split. Anything entirely inside the range 690 * it is either truncated or split. Anything entirely inside the range
691 * is deleted from the tree. 691 * is deleted from the tree.
692 */ 692 */
693 int __btrfs_drop_extents(struct btrfs_trans_handle *trans, 693 int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
694 struct btrfs_root *root, struct inode *inode, 694 struct btrfs_root *root, struct inode *inode,
695 struct btrfs_path *path, u64 start, u64 end, 695 struct btrfs_path *path, u64 start, u64 end,
696 u64 *drop_end, int drop_cache, 696 u64 *drop_end, int drop_cache,
697 int replace_extent, 697 int replace_extent,
698 u32 extent_item_size, 698 u32 extent_item_size,
699 int *key_inserted) 699 int *key_inserted)
700 { 700 {
701 struct extent_buffer *leaf; 701 struct extent_buffer *leaf;
702 struct btrfs_file_extent_item *fi; 702 struct btrfs_file_extent_item *fi;
703 struct btrfs_key key; 703 struct btrfs_key key;
704 struct btrfs_key new_key; 704 struct btrfs_key new_key;
705 u64 ino = btrfs_ino(inode); 705 u64 ino = btrfs_ino(inode);
706 u64 search_start = start; 706 u64 search_start = start;
707 u64 disk_bytenr = 0; 707 u64 disk_bytenr = 0;
708 u64 num_bytes = 0; 708 u64 num_bytes = 0;
709 u64 extent_offset = 0; 709 u64 extent_offset = 0;
710 u64 extent_end = 0; 710 u64 extent_end = 0;
711 int del_nr = 0; 711 int del_nr = 0;
712 int del_slot = 0; 712 int del_slot = 0;
713 int extent_type; 713 int extent_type;
714 int recow; 714 int recow;
715 int ret; 715 int ret;
716 int modify_tree = -1; 716 int modify_tree = -1;
717 int update_refs = (root->ref_cows || root == root->fs_info->tree_root); 717 int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
718 int found = 0; 718 int found = 0;
719 int leafs_visited = 0; 719 int leafs_visited = 0;
720 720
721 if (drop_cache) 721 if (drop_cache)
722 btrfs_drop_extent_cache(inode, start, end - 1, 0); 722 btrfs_drop_extent_cache(inode, start, end - 1, 0);
723 723
724 if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent) 724 if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
725 modify_tree = 0; 725 modify_tree = 0;
726 726
727 while (1) { 727 while (1) {
728 recow = 0; 728 recow = 0;
729 ret = btrfs_lookup_file_extent(trans, root, path, ino, 729 ret = btrfs_lookup_file_extent(trans, root, path, ino,
730 search_start, modify_tree); 730 search_start, modify_tree);
731 if (ret < 0) 731 if (ret < 0)
732 break; 732 break;
733 if (ret > 0 && path->slots[0] > 0 && search_start == start) { 733 if (ret > 0 && path->slots[0] > 0 && search_start == start) {
734 leaf = path->nodes[0]; 734 leaf = path->nodes[0];
735 btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); 735 btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
736 if (key.objectid == ino && 736 if (key.objectid == ino &&
737 key.type == BTRFS_EXTENT_DATA_KEY) 737 key.type == BTRFS_EXTENT_DATA_KEY)
738 path->slots[0]--; 738 path->slots[0]--;
739 } 739 }
740 ret = 0; 740 ret = 0;
741 leafs_visited++; 741 leafs_visited++;
742 next_slot: 742 next_slot:
743 leaf = path->nodes[0]; 743 leaf = path->nodes[0];
744 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 744 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
745 BUG_ON(del_nr > 0); 745 BUG_ON(del_nr > 0);
746 ret = btrfs_next_leaf(root, path); 746 ret = btrfs_next_leaf(root, path);
747 if (ret < 0) 747 if (ret < 0)
748 break; 748 break;
749 if (ret > 0) { 749 if (ret > 0) {
750 ret = 0; 750 ret = 0;
751 break; 751 break;
752 } 752 }
753 leafs_visited++; 753 leafs_visited++;
754 leaf = path->nodes[0]; 754 leaf = path->nodes[0];
755 recow = 1; 755 recow = 1;
756 } 756 }
757 757
758 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 758 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
759 if (key.objectid > ino || 759 if (key.objectid > ino ||
760 key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) 760 key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
761 break; 761 break;
762 762
763 fi = btrfs_item_ptr(leaf, path->slots[0], 763 fi = btrfs_item_ptr(leaf, path->slots[0],
764 struct btrfs_file_extent_item); 764 struct btrfs_file_extent_item);
765 extent_type = btrfs_file_extent_type(leaf, fi); 765 extent_type = btrfs_file_extent_type(leaf, fi);
766 766
767 if (extent_type == BTRFS_FILE_EXTENT_REG || 767 if (extent_type == BTRFS_FILE_EXTENT_REG ||
768 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 768 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
769 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 769 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
770 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 770 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
771 extent_offset = btrfs_file_extent_offset(leaf, fi); 771 extent_offset = btrfs_file_extent_offset(leaf, fi);
772 extent_end = key.offset + 772 extent_end = key.offset +
773 btrfs_file_extent_num_bytes(leaf, fi); 773 btrfs_file_extent_num_bytes(leaf, fi);
774 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 774 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
775 extent_end = key.offset + 775 extent_end = key.offset +
776 btrfs_file_extent_inline_len(leaf, 776 btrfs_file_extent_inline_len(leaf,
777 path->slots[0], fi); 777 path->slots[0], fi);
778 } else { 778 } else {
779 WARN_ON(1); 779 WARN_ON(1);
780 extent_end = search_start; 780 extent_end = search_start;
781 } 781 }
782 782
783 /* 783 /*
784 * Don't skip extent items representing 0 byte lengths. They 784 * Don't skip extent items representing 0 byte lengths. They
785 * used to be created (bug) if while punching holes we hit 785 * used to be created (bug) if while punching holes we hit
786 * -ENOSPC condition. So if we find one here, just ensure we 786 * -ENOSPC condition. So if we find one here, just ensure we
787 * delete it, otherwise we would insert a new file extent item 787 * delete it, otherwise we would insert a new file extent item
788 * with the same key (offset) as that 0 bytes length file 788 * with the same key (offset) as that 0 bytes length file
789 * extent item in the call to setup_items_for_insert() later 789 * extent item in the call to setup_items_for_insert() later
790 * in this function. 790 * in this function.
791 */ 791 */
792 if (extent_end == key.offset && extent_end >= search_start) 792 if (extent_end == key.offset && extent_end >= search_start)
793 goto delete_extent_item; 793 goto delete_extent_item;
794 794
795 if (extent_end <= search_start) { 795 if (extent_end <= search_start) {
796 path->slots[0]++; 796 path->slots[0]++;
797 goto next_slot; 797 goto next_slot;
798 } 798 }
799 799
800 found = 1; 800 found = 1;
801 search_start = max(key.offset, start); 801 search_start = max(key.offset, start);
802 if (recow || !modify_tree) { 802 if (recow || !modify_tree) {
803 modify_tree = -1; 803 modify_tree = -1;
804 btrfs_release_path(path); 804 btrfs_release_path(path);
805 continue; 805 continue;
806 } 806 }
807 807
808 /* 808 /*
809 * | - range to drop - | 809 * | - range to drop - |
810 * | -------- extent -------- | 810 * | -------- extent -------- |
811 */ 811 */
812 if (start > key.offset && end < extent_end) { 812 if (start > key.offset && end < extent_end) {
813 BUG_ON(del_nr > 0); 813 BUG_ON(del_nr > 0);
814 if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 814 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
815 ret = -EOPNOTSUPP; 815 ret = -EOPNOTSUPP;
816 break; 816 break;
817 } 817 }
818 818
819 memcpy(&new_key, &key, sizeof(new_key)); 819 memcpy(&new_key, &key, sizeof(new_key));
820 new_key.offset = start; 820 new_key.offset = start;
821 ret = btrfs_duplicate_item(trans, root, path, 821 ret = btrfs_duplicate_item(trans, root, path,
822 &new_key); 822 &new_key);
823 if (ret == -EAGAIN) { 823 if (ret == -EAGAIN) {
824 btrfs_release_path(path); 824 btrfs_release_path(path);
825 continue; 825 continue;
826 } 826 }
827 if (ret < 0) 827 if (ret < 0)
828 break; 828 break;
829 829
830 leaf = path->nodes[0]; 830 leaf = path->nodes[0];
831 fi = btrfs_item_ptr(leaf, path->slots[0] - 1, 831 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
832 struct btrfs_file_extent_item); 832 struct btrfs_file_extent_item);
833 btrfs_set_file_extent_num_bytes(leaf, fi, 833 btrfs_set_file_extent_num_bytes(leaf, fi,
834 start - key.offset); 834 start - key.offset);
835 835
836 fi = btrfs_item_ptr(leaf, path->slots[0], 836 fi = btrfs_item_ptr(leaf, path->slots[0],
837 struct btrfs_file_extent_item); 837 struct btrfs_file_extent_item);
838 838
839 extent_offset += start - key.offset; 839 extent_offset += start - key.offset;
840 btrfs_set_file_extent_offset(leaf, fi, extent_offset); 840 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
841 btrfs_set_file_extent_num_bytes(leaf, fi, 841 btrfs_set_file_extent_num_bytes(leaf, fi,
842 extent_end - start); 842 extent_end - start);
843 btrfs_mark_buffer_dirty(leaf); 843 btrfs_mark_buffer_dirty(leaf);
844 844
845 if (update_refs && disk_bytenr > 0) { 845 if (update_refs && disk_bytenr > 0) {
846 ret = btrfs_inc_extent_ref(trans, root, 846 ret = btrfs_inc_extent_ref(trans, root,
847 disk_bytenr, num_bytes, 0, 847 disk_bytenr, num_bytes, 0,
848 root->root_key.objectid, 848 root->root_key.objectid,
849 new_key.objectid, 849 new_key.objectid,
850 start - extent_offset, 0); 850 start - extent_offset, 0);
851 BUG_ON(ret); /* -ENOMEM */ 851 BUG_ON(ret); /* -ENOMEM */
852 } 852 }
853 key.offset = start; 853 key.offset = start;
854 } 854 }
855 /* 855 /*
856 * | ---- range to drop ----- | 856 * | ---- range to drop ----- |
857 * | -------- extent -------- | 857 * | -------- extent -------- |
858 */ 858 */
859 if (start <= key.offset && end < extent_end) { 859 if (start <= key.offset && end < extent_end) {
860 if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 860 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
861 ret = -EOPNOTSUPP; 861 ret = -EOPNOTSUPP;
862 break; 862 break;
863 } 863 }
864 864
865 memcpy(&new_key, &key, sizeof(new_key)); 865 memcpy(&new_key, &key, sizeof(new_key));
866 new_key.offset = end; 866 new_key.offset = end;
867 btrfs_set_item_key_safe(root, path, &new_key); 867 btrfs_set_item_key_safe(root, path, &new_key);
868 868
869 extent_offset += end - key.offset; 869 extent_offset += end - key.offset;
870 btrfs_set_file_extent_offset(leaf, fi, extent_offset); 870 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
871 btrfs_set_file_extent_num_bytes(leaf, fi, 871 btrfs_set_file_extent_num_bytes(leaf, fi,
872 extent_end - end); 872 extent_end - end);
873 btrfs_mark_buffer_dirty(leaf); 873 btrfs_mark_buffer_dirty(leaf);
874 if (update_refs && disk_bytenr > 0) 874 if (update_refs && disk_bytenr > 0)
875 inode_sub_bytes(inode, end - key.offset); 875 inode_sub_bytes(inode, end - key.offset);
876 break; 876 break;
877 } 877 }
878 878
879 search_start = extent_end; 879 search_start = extent_end;
880 /* 880 /*
881 * | ---- range to drop ----- | 881 * | ---- range to drop ----- |
882 * | -------- extent -------- | 882 * | -------- extent -------- |
883 */ 883 */
884 if (start > key.offset && end >= extent_end) { 884 if (start > key.offset && end >= extent_end) {
885 BUG_ON(del_nr > 0); 885 BUG_ON(del_nr > 0);
886 if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 886 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
887 ret = -EOPNOTSUPP; 887 ret = -EOPNOTSUPP;
888 break; 888 break;
889 } 889 }
890 890
891 btrfs_set_file_extent_num_bytes(leaf, fi, 891 btrfs_set_file_extent_num_bytes(leaf, fi,
892 start - key.offset); 892 start - key.offset);
893 btrfs_mark_buffer_dirty(leaf); 893 btrfs_mark_buffer_dirty(leaf);
894 if (update_refs && disk_bytenr > 0) 894 if (update_refs && disk_bytenr > 0)
895 inode_sub_bytes(inode, extent_end - start); 895 inode_sub_bytes(inode, extent_end - start);
896 if (end == extent_end) 896 if (end == extent_end)
897 break; 897 break;
898 898
899 path->slots[0]++; 899 path->slots[0]++;
900 goto next_slot; 900 goto next_slot;
901 } 901 }
902 902
903 /* 903 /*
904 * | ---- range to drop ----- | 904 * | ---- range to drop ----- |
905 * | ------ extent ------ | 905 * | ------ extent ------ |
906 */ 906 */
907 if (start <= key.offset && end >= extent_end) { 907 if (start <= key.offset && end >= extent_end) {
908 delete_extent_item: 908 delete_extent_item:
909 if (del_nr == 0) { 909 if (del_nr == 0) {
910 del_slot = path->slots[0]; 910 del_slot = path->slots[0];
911 del_nr = 1; 911 del_nr = 1;
912 } else { 912 } else {
913 BUG_ON(del_slot + del_nr != path->slots[0]); 913 BUG_ON(del_slot + del_nr != path->slots[0]);
914 del_nr++; 914 del_nr++;
915 } 915 }
916 916
917 if (update_refs && 917 if (update_refs &&
918 extent_type == BTRFS_FILE_EXTENT_INLINE) { 918 extent_type == BTRFS_FILE_EXTENT_INLINE) {
919 inode_sub_bytes(inode, 919 inode_sub_bytes(inode,
920 extent_end - key.offset); 920 extent_end - key.offset);
921 extent_end = ALIGN(extent_end, 921 extent_end = ALIGN(extent_end,
922 root->sectorsize); 922 root->sectorsize);
923 } else if (update_refs && disk_bytenr > 0) { 923 } else if (update_refs && disk_bytenr > 0) {
924 ret = btrfs_free_extent(trans, root, 924 ret = btrfs_free_extent(trans, root,
925 disk_bytenr, num_bytes, 0, 925 disk_bytenr, num_bytes, 0,
926 root->root_key.objectid, 926 root->root_key.objectid,
927 key.objectid, key.offset - 927 key.objectid, key.offset -
928 extent_offset, 0); 928 extent_offset, 0);
929 BUG_ON(ret); /* -ENOMEM */ 929 BUG_ON(ret); /* -ENOMEM */
930 inode_sub_bytes(inode, 930 inode_sub_bytes(inode,
931 extent_end - key.offset); 931 extent_end - key.offset);
932 } 932 }
933 933
934 if (end == extent_end) 934 if (end == extent_end)
935 break; 935 break;
936 936
937 if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) { 937 if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
938 path->slots[0]++; 938 path->slots[0]++;
939 goto next_slot; 939 goto next_slot;
940 } 940 }
941 941
942 ret = btrfs_del_items(trans, root, path, del_slot, 942 ret = btrfs_del_items(trans, root, path, del_slot,
943 del_nr); 943 del_nr);
944 if (ret) { 944 if (ret) {
945 btrfs_abort_transaction(trans, root, ret); 945 btrfs_abort_transaction(trans, root, ret);
946 break; 946 break;
947 } 947 }
948 948
949 del_nr = 0; 949 del_nr = 0;
950 del_slot = 0; 950 del_slot = 0;
951 951
952 btrfs_release_path(path); 952 btrfs_release_path(path);
953 continue; 953 continue;
954 } 954 }
955 955
956 BUG_ON(1); 956 BUG_ON(1);
957 } 957 }
958 958
959 if (!ret && del_nr > 0) { 959 if (!ret && del_nr > 0) {
960 /* 960 /*
961 * Set path->slots[0] to first slot, so that after the delete 961 * Set path->slots[0] to first slot, so that after the delete
962 * if items are move off from our leaf to its immediate left or 962 * if items are move off from our leaf to its immediate left or
963 * right neighbor leafs, we end up with a correct and adjusted 963 * right neighbor leafs, we end up with a correct and adjusted
964 * path->slots[0] for our insertion (if replace_extent != 0). 964 * path->slots[0] for our insertion (if replace_extent != 0).
965 */ 965 */
966 path->slots[0] = del_slot; 966 path->slots[0] = del_slot;
967 ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 967 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
968 if (ret) 968 if (ret)
969 btrfs_abort_transaction(trans, root, ret); 969 btrfs_abort_transaction(trans, root, ret);
970 } 970 }
971 971
972 leaf = path->nodes[0]; 972 leaf = path->nodes[0];
973 /* 973 /*
974 * If btrfs_del_items() was called, it might have deleted a leaf, in 974 * If btrfs_del_items() was called, it might have deleted a leaf, in
975 * which case it unlocked our path, so check path->locks[0] matches a 975 * which case it unlocked our path, so check path->locks[0] matches a
976 * write lock. 976 * write lock.
977 */ 977 */
978 if (!ret && replace_extent && leafs_visited == 1 && 978 if (!ret && replace_extent && leafs_visited == 1 &&
979 (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING || 979 (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
980 path->locks[0] == BTRFS_WRITE_LOCK) && 980 path->locks[0] == BTRFS_WRITE_LOCK) &&
981 btrfs_leaf_free_space(root, leaf) >= 981 btrfs_leaf_free_space(root, leaf) >=
982 sizeof(struct btrfs_item) + extent_item_size) { 982 sizeof(struct btrfs_item) + extent_item_size) {
983 983
984 key.objectid = ino; 984 key.objectid = ino;
985 key.type = BTRFS_EXTENT_DATA_KEY; 985 key.type = BTRFS_EXTENT_DATA_KEY;
986 key.offset = start; 986 key.offset = start;
987 if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) { 987 if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
988 struct btrfs_key slot_key; 988 struct btrfs_key slot_key;
989 989
990 btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]); 990 btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
991 if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) 991 if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
992 path->slots[0]++; 992 path->slots[0]++;
993 } 993 }
994 setup_items_for_insert(root, path, &key, 994 setup_items_for_insert(root, path, &key,
995 &extent_item_size, 995 &extent_item_size,
996 extent_item_size, 996 extent_item_size,
997 sizeof(struct btrfs_item) + 997 sizeof(struct btrfs_item) +
998 extent_item_size, 1); 998 extent_item_size, 1);
999 *key_inserted = 1; 999 *key_inserted = 1;
1000 } 1000 }
1001 1001
1002 if (!replace_extent || !(*key_inserted)) 1002 if (!replace_extent || !(*key_inserted))
1003 btrfs_release_path(path); 1003 btrfs_release_path(path);
1004 if (drop_end) 1004 if (drop_end)
1005 *drop_end = found ? min(end, extent_end) : end; 1005 *drop_end = found ? min(end, extent_end) : end;
1006 return ret; 1006 return ret;
1007 } 1007 }

int btrfs_drop_extents(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct inode *inode, u64 start,
		       u64 end, int drop_cache)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
				   drop_cache, 0, 0, NULL);
	btrfs_free_path(path);
	return ret;
}

static int extent_mergeable(struct extent_buffer *leaf, int slot,
			    u64 objectid, u64 bytenr, u64 orig_offset,
			    u64 *start, u64 *end)
{
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 extent_end;

	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
		return 0;

	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
		return 0;

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
	    btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		return 0;

	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if ((*start && *start != key.offset) || (*end && *end != extent_end))
		return 0;

	*start = key.offset;
	*end = extent_end;
	return 1;
}
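
/*
 * A minimal usage sketch (mirroring the callers below): seed *start/*end
 * with the boundary to merge across, and on success read the neighbor's
 * range back out through the same pointers.
 *
 *	u64 other_start = 0, other_end = start;
 *
 *	if (extent_mergeable(leaf, path->slots[0] - 1, ino, bytenr,
 *			     orig_offset, &other_start, &other_end)) {
 *		// left neighbor is a regular, uncompressed extent backed
 *		// by the same disk extent and ending exactly at 'start'
 *	}
 */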

/*
 * Mark extent in the range start - end as written.
 *
 * This changes extent type from 'pre-allocated' to 'regular'. If only
 * part of the extent is marked as written, the extent will be split into
 * two or three.
 */
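/*
 * Pictorially, the cases handled below (for a prealloc extent spanning
 * [key.offset, extent_end)):
 *
 *	start == key.offset && end <  extent_end: [written |prealloc]
 *	start >  key.offset && end == extent_end: [prealloc| written]
 *	start >  key.offset && end <  extent_end: [prealloc|written|prealloc]
 *
 * Split pieces are carved out with btrfs_duplicate_item(), and neighbors
 * that share the same backing disk extent are merged back together via
 * extent_mergeable().
 */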
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
			      struct inode *inode, u64 start, u64 end)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 bytenr;
	u64 num_bytes;
	u64 extent_end;
	u64 orig_offset;
	u64 other_start;
	u64 other_end;
	u64 split;
	int del_nr = 0;
	int del_slot = 0;
	int recow;
	int ret;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
again:
	recow = 0;
	split = start;
	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = split;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY);
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	BUG_ON(btrfs_file_extent_type(leaf, fi) !=
	       BTRFS_FILE_EXTENT_PREALLOC);
	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	BUG_ON(key.offset > start || extent_end < end);

	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
	memcpy(&new_key, &key, sizeof(new_key));

	if (start == key.offset && end < extent_end) {
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			new_key.offset = end;
			btrfs_set_item_key_safe(root, path, &new_key);
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_set_file_extent_offset(leaf, fi,
						     end - orig_offset);
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							end - other_start);
			btrfs_mark_buffer_dirty(leaf);
			goto out;
		}
	}

	if (start > key.offset && end == extent_end) {
		other_start = end;
		other_end = 0;
		if (extent_mergeable(leaf, path->slots[0] + 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			path->slots[0]++;
			new_key.offset = start;
			btrfs_set_item_key_safe(root, path, &new_key);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							other_end - start);
			btrfs_set_file_extent_offset(leaf, fi,
						     start - orig_offset);
			btrfs_mark_buffer_dirty(leaf);
			goto out;
		}
	}

	while (start > key.offset || end < extent_end) {
		if (key.offset == start)
			split = end;

		new_key.offset = split;
		ret = btrfs_duplicate_item(trans, root, path, &new_key);
		if (ret == -EAGAIN) {
			btrfs_release_path(path);
			goto again;
		}
		if (ret < 0) {
			btrfs_abort_transaction(trans, root, ret);
			goto out;
		}

		leaf = path->nodes[0];
		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						split - key.offset);

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);

		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - split);
		btrfs_mark_buffer_dirty(leaf);

		ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
					   root->root_key.objectid,
					   ino, orig_offset, 0);
		BUG_ON(ret); /* -ENOMEM */

		if (split == start) {
			key.offset = start;
		} else {
			BUG_ON(start != key.offset);
			path->slots[0]--;
			extent_end = end;
		}
		recow = 1;
	}
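
	/*
	 * At this point the item at path->slots[0] covers exactly
	 * [start, end). Try to merge it with its right, then its left
	 * neighbor; if the path went stale (recow), restart from 'again'
	 * with a fresh search.
	 */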
	other_start = end;
	other_end = 0;
	if (extent_mergeable(leaf, path->slots[0] + 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		extent_end = other_end;
		del_slot = path->slots[0] + 1;
		del_nr++;
		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
					0, root->root_key.objectid,
					ino, orig_offset, 0);
		BUG_ON(ret); /* -ENOMEM */
	}
	other_start = 0;
	other_end = start;
	if (extent_mergeable(leaf, path->slots[0] - 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		key.offset = other_start;
		del_slot = path->slots[0];
		del_nr++;
		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
					0, root->root_key.objectid,
					ino, orig_offset, 0);
		BUG_ON(ret); /* -ENOMEM */
	}
	if (del_nr == 0) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_mark_buffer_dirty(leaf);
	} else {
		fi = btrfs_item_ptr(leaf, del_slot - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - key.offset);
		btrfs_mark_buffer_dirty(leaf);

		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		if (ret < 0) {
			btrfs_abort_transaction(trans, root, ret);
			goto out;
		}
	}
out:
	btrfs_free_path(path);
	return 0;
}

/*
 * on error we return an unlocked page and the error value
 * on success we return a locked page and 0
 */
static int prepare_uptodate_page(struct page *page, u64 pos,
				 bool force_uptodate)
{
	int ret = 0;

	if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) &&
	    !PageUptodate(page)) {
		ret = btrfs_readpage(NULL, page);
		if (ret)
			return ret;
		lock_page(page);
		if (!PageUptodate(page)) {
			unlock_page(page);
			return -EIO;
		}
	}
	return 0;
}
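
/*
 * For example, with the common 4K PAGE_CACHE_SIZE: a write starting at
 * pos 8192 is page aligned, so no read is needed unless force_uptodate is
 * set; a write starting at pos 8200 only partially covers its first page,
 * which therefore has to be read in first (read-modify-write).
 */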

/*
 * this just gets pages into the page cache and locks them down.
 */
static noinline int prepare_pages(struct inode *inode, struct page **pages,
				  size_t num_pages, loff_t pos,
				  size_t write_bytes, bool force_uptodate)
{
	int i;
	unsigned long index = pos >> PAGE_CACHE_SHIFT;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	int err = 0;
	int faili;

	for (i = 0; i < num_pages; i++) {
		pages[i] = find_or_create_page(inode->i_mapping, index + i,
					       mask | __GFP_WRITE);
		if (!pages[i]) {
			faili = i - 1;
			err = -ENOMEM;
			goto fail;
		}

		if (i == 0)
			err = prepare_uptodate_page(pages[i], pos,
						    force_uptodate);
		if (i == num_pages - 1)
			err = prepare_uptodate_page(pages[i],
						    pos + write_bytes, false);
		if (err) {
			page_cache_release(pages[i]);
			faili = i - 1;
			goto fail;
		}
		wait_on_page_writeback(pages[i]);
	}

	return 0;
fail:
	while (faili >= 0) {
		unlock_page(pages[faili]);
		page_cache_release(pages[faili]);
		faili--;
	}
	return err;
}

/*
 * This function locks the extent and properly waits for data=ordered extents
 * to finish before allowing the pages to be modified if needed.
 *
 * The return value:
 * 1 - the extent is locked
 * 0 - the extent is not locked, and everything is OK
 * -EAGAIN - need to re-prepare the pages
 * any other < 0 value - something went wrong
 */
static noinline int
lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
				size_t num_pages, loff_t pos,
				u64 *lockstart, u64 *lockend,
				struct extent_state **cached_state)
{
	u64 start_pos;
	u64 last_pos;
	int i;
	int ret = 0;

	start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
	last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1;

	if (start_pos < inode->i_size) {
		struct btrfs_ordered_extent *ordered;
		lock_extent_bits(&BTRFS_I(inode)->io_tree,
				 start_pos, last_pos, 0, cached_state);
		ordered = btrfs_lookup_ordered_range(inode, start_pos,
						     last_pos - start_pos + 1);
		if (ordered &&
		    ordered->file_offset + ordered->len > start_pos &&
		    ordered->file_offset <= last_pos) {
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     start_pos, last_pos,
					     cached_state, GFP_NOFS);
			for (i = 0; i < num_pages; i++) {
				unlock_page(pages[i]);
				page_cache_release(pages[i]);
			}
			btrfs_start_ordered_extent(inode, ordered, 1);
			btrfs_put_ordered_extent(ordered);
			return -EAGAIN;
		}
		if (ordered)
			btrfs_put_ordered_extent(ordered);

		clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
				 last_pos, EXTENT_DIRTY | EXTENT_DELALLOC |
				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
				 0, 0, cached_state, GFP_NOFS);
		*lockstart = start_pos;
		*lockend = last_pos;
		ret = 1;
	}

	for (i = 0; i < num_pages; i++) {
		if (clear_page_dirty_for_io(pages[i]))
			account_page_redirty(pages[i]);
		set_page_extent_mapped(pages[i]);
		WARN_ON(!PageLocked(pages[i]));
	}

	return ret;
}

static noinline int check_can_nocow(struct inode *inode, loff_t pos,
				    size_t *write_bytes)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_ordered_extent *ordered;
	u64 lockstart, lockend;
	u64 num_bytes;
	int ret;

	ret = btrfs_start_nocow_write(root);
	if (!ret)
		return -ENOSPC;

	lockstart = round_down(pos, root->sectorsize);
	lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;

	while (1) {
		lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
		ordered = btrfs_lookup_ordered_range(inode, lockstart,
						     lockend - lockstart + 1);
		if (!ordered) {
			break;
		}
		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
	}

	num_bytes = lockend - lockstart + 1;
	ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
	if (ret <= 0) {
		ret = 0;
		btrfs_end_nocow_write(root);
	} else {
		*write_bytes = min_t(size_t, *write_bytes,
				     num_bytes - pos + lockstart);
	}

	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);

	return ret;
}
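
/*
 * A worked example of the clamp above, assuming 4K sectors: for pos = 6000
 * and *write_bytes = 10000, the locked range is [4096, 16383]. If
 * can_nocow_extent() reports that only num_bytes = 8192 of it can be written
 * in place, the nocow region ends at byte 12288, so *write_bytes is trimmed
 * to 8192 - 6000 + 4096 = 6288 bytes.
 */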

static noinline ssize_t __btrfs_buffered_write(struct file *file,
					       struct iov_iter *i,
					       loff_t pos)
{
	struct inode *inode = file_inode(file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct page **pages = NULL;
	struct extent_state *cached_state = NULL;
	u64 release_bytes = 0;
	u64 lockstart;
	u64 lockend;
	unsigned long first_index;
	size_t num_written = 0;
	int nrptrs;
	int ret = 0;
	bool only_release_metadata = false;
	bool force_page_uptodate = false;
	bool need_unlock;

	nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
		     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
		     (sizeof(struct page *)));
	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
	nrptrs = max(nrptrs, 8);
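	/*
	 * Illustrative numbers for a 64-bit kernel with 4K pages: at most
	 * 4096 / sizeof(struct page *) = 512 page pointers fit in one page,
	 * further capped by the task's dirty-throttling budget
	 * (nr_dirtied_pause - nr_dirtied) and floored at 8.
	 */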
	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	first_index = pos >> PAGE_CACHE_SHIFT;

	while (iov_iter_count(i) > 0) {
		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
		size_t write_bytes = min(iov_iter_count(i),
					 nrptrs * (size_t)PAGE_CACHE_SIZE -
					 offset);
		size_t num_pages = (write_bytes + offset +
				    PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
		size_t reserve_bytes;
		size_t dirty_pages;
		size_t copied;

		WARN_ON(num_pages > nrptrs);

		/*
		 * Fault pages before locking them in prepare_pages
		 * to avoid a recursive lock.
		 */
		if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
			ret = -EFAULT;
			break;
		}

		reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
		ret = btrfs_check_data_free_space(inode, reserve_bytes);
		if (ret == -ENOSPC &&
		    (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
					      BTRFS_INODE_PREALLOC))) {
			ret = check_can_nocow(inode, pos, &write_bytes);
			if (ret > 0) {
				only_release_metadata = true;
				/*
				 * our prealloc extent may be smaller than
				 * write_bytes, so scale down.
				 */
				num_pages = (write_bytes + offset +
					     PAGE_CACHE_SIZE - 1) >>
					     PAGE_CACHE_SHIFT;
				reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
				ret = 0;
			} else {
				ret = -ENOSPC;
			}
		}

		if (ret)
			break;

		ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
		if (ret) {
			if (!only_release_metadata)
				btrfs_free_reserved_data_space(inode,
							       reserve_bytes);
			else
				btrfs_end_nocow_write(root);
			break;
		}

		release_bytes = reserve_bytes;
		need_unlock = false;
again:
		/*
		 * This is going to set up the pages array with the number of
		 * pages we want, so we don't really need to worry about the
		 * contents of pages from loop to loop
		 */
		ret = prepare_pages(inode, pages, num_pages,
				    pos, write_bytes,
				    force_page_uptodate);
		if (ret)
			break;

		ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
						      pos, &lockstart,
						      &lockend, &cached_state);
		if (ret < 0) {
			if (ret == -EAGAIN)
				goto again;
			break;
		} else if (ret > 0) {
			need_unlock = true;
			ret = 0;
		}

		copied = btrfs_copy_from_user(pos, num_pages,
					      write_bytes, pages, i);

		/*
		 * if we have trouble faulting in the pages, fall
		 * back to one page at a time
		 */
		if (copied < write_bytes)
			nrptrs = 1;

		if (copied == 0) {
			force_page_uptodate = true;
			dirty_pages = 0;
		} else {
			force_page_uptodate = false;
			dirty_pages = (copied + offset +
				       PAGE_CACHE_SIZE - 1) >>
				       PAGE_CACHE_SHIFT;
		}

		/*
		 * If we had a short copy we need to release the excess
		 * delalloc bytes we reserved.  We need to increment
		 * outstanding_extents because btrfs_delalloc_release_space
		 * will decrement it, but we still have an outstanding extent
		 * for the chunk we actually managed to copy.
		 */
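		/*
		 * For instance, with 4K pages: if 3 pages were reserved but
		 * the copy only dirtied 1, then dirty_pages = 1 and the
		 * reservation for the remaining (3 - 1) << PAGE_CACHE_SHIFT
		 * = 8192 bytes is released below.
		 */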
		if (num_pages > dirty_pages) {
			release_bytes = (num_pages - dirty_pages) <<
					PAGE_CACHE_SHIFT;
			if (copied > 0) {
				spin_lock(&BTRFS_I(inode)->lock);
				BTRFS_I(inode)->outstanding_extents++;
				spin_unlock(&BTRFS_I(inode)->lock);
			}
			if (only_release_metadata)
				btrfs_delalloc_release_metadata(inode,
								release_bytes);
			else
				btrfs_delalloc_release_space(inode,
							     release_bytes);
		}

		release_bytes = dirty_pages << PAGE_CACHE_SHIFT;

		if (copied > 0)
			ret = btrfs_dirty_pages(root, inode, pages,
						dirty_pages, pos, copied,
						NULL);
		if (need_unlock)
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     lockstart, lockend, &cached_state,
					     GFP_NOFS);
		if (ret) {
			btrfs_drop_pages(pages, num_pages);
			break;
		}

		release_bytes = 0;
		if (only_release_metadata)
			btrfs_end_nocow_write(root);

		if (only_release_metadata && copied > 0) {
			u64 lockstart = round_down(pos, root->sectorsize);
			u64 lockend = lockstart +
				(dirty_pages << PAGE_CACHE_SHIFT) - 1;

			set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
				       lockend, EXTENT_NORESERVE, NULL,
				       NULL, GFP_NOFS);
			only_release_metadata = false;
		}

		btrfs_drop_pages(pages, num_pages);

		cond_resched();

		balance_dirty_pages_ratelimited(inode->i_mapping);
		if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
			btrfs_btree_balance_dirty(root);

		pos += copied;
		num_written += copied;
	}

	kfree(pages);

	if (release_bytes) {
		if (only_release_metadata) {
			btrfs_end_nocow_write(root);
			btrfs_delalloc_release_metadata(inode, release_bytes);
		} else {
			btrfs_delalloc_release_space(inode, release_bytes);
		}
	}

	return num_written ? num_written : ret;
}

static ssize_t __btrfs_direct_write(struct kiocb *iocb,
				    const struct iovec *iov,
				    unsigned long nr_segs, loff_t pos,
				    size_t count, size_t ocount)
{
	struct file *file = iocb->ki_filp;
	struct iov_iter i;
	ssize_t written;
	ssize_t written_buffered;
	loff_t endbyte;
	int err;

	written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
					    count, ocount);

	if (written < 0 || written == count)
		return written;

	pos += written;
	count -= written;
	iov_iter_init(&i, iov, nr_segs, count, written);
	written_buffered = __btrfs_buffered_write(file, &i, pos);
	if (written_buffered < 0) {
		err = written_buffered;
		goto out;
	}
	endbyte = pos + written_buffered - 1;
	err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
	if (err)
		goto out;
	written += written_buffered;
	iocb->ki_pos = pos + written_buffered;
	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
				 endbyte >> PAGE_CACHE_SHIFT);
out:
	return written ? written : err;
}
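
/*
 * Note on the fallback above: when generic_file_direct_write() completes
 * only part of the request (e.g. part of the range could not be written
 * directly), the tail goes through the buffered path and is then written
 * back and invalidated from the page cache, so the file looks as if it
 * had been written with O_DIRECT end to end.
 */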

static void update_time_for_write(struct inode *inode)
{
	struct timespec now;

	if (IS_NOCMTIME(inode))
		return;

	now = current_fs_time(inode->i_sb);
	if (!timespec_equal(&inode->i_mtime, &now))
		inode->i_mtime = now;

	if (!timespec_equal(&inode->i_ctime, &now))
		inode->i_ctime = now;

	if (IS_I_VERSION(inode))
		inode_inc_iversion(inode);
}

static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
				    const struct iovec *iov,
				    unsigned long nr_segs, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 start_pos;
	u64 end_pos;
	ssize_t num_written = 0;
	ssize_t err = 0;
	size_t count, ocount;
	bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);

	mutex_lock(&inode->i_mutex);

	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
	if (err) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}
	count = ocount;

	current->backing_dev_info = inode->i_mapping->backing_dev_info;
	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
	if (err) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	if (count == 0) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	err = file_remove_suid(file);
	if (err) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	/*
	 * If BTRFS flips readonly due to some impossible error
	 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
	 * although we have opened a file as writable, we have
	 * to stop this write operation to ensure FS consistency.
	 */
	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
		mutex_unlock(&inode->i_mutex);
		err = -EROFS;
		goto out;
	}

	/*
	 * We reserve space for updating the inode when we reserve space for
	 * the extent we are going to write, so we will get ENOSPC out there.
	 * We don't need to start yet another transaction to update the inode
	 * as we will update the inode when we finish writing whatever data
	 * we write.
	 */
	update_time_for_write(inode);

	start_pos = round_down(pos, root->sectorsize);
	if (start_pos > i_size_read(inode)) {
		/* Expand hole size to cover write data, preventing empty gap */
		end_pos = round_up(pos + count, root->sectorsize);
		err = btrfs_cont_expand(inode, i_size_read(inode), end_pos);
		if (err) {
			mutex_unlock(&inode->i_mutex);
			goto out;
		}
	}

	if (sync)
		atomic_inc(&BTRFS_I(inode)->sync_writers);

	if (unlikely(file->f_flags & O_DIRECT)) {
		num_written = __btrfs_direct_write(iocb, iov, nr_segs,
						   pos, count, ocount);
	} else {
		struct iov_iter i;

		iov_iter_init(&i, iov, nr_segs, count, num_written);

		num_written = __btrfs_buffered_write(file, &i, pos);
		if (num_written > 0)
			iocb->ki_pos = pos + num_written;
	}

	mutex_unlock(&inode->i_mutex);

	/*
	 * we want to make sure fsync finds this change
	 * but we haven't joined a transaction running right now.
	 *
	 * Later on, someone is sure to update the inode and get the
	 * real transid recorded.
	 *
	 * We set last_trans now to the fs_info generation + 1,
	 * this will either be one more than the running transaction
	 * or the generation used for the next transaction if there isn't
	 * one running right now.
	 *
	 * We also have to set last_sub_trans to the current log transid,
	 * otherwise subsequent syncs to a file that's been synced in this
	 * transaction will appear to have already occurred.
	 */
	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
	BTRFS_I(inode)->last_sub_trans = root->log_transid;
	if (num_written > 0) {
		err = generic_write_sync(file, pos, num_written);
		if (err < 0)
			num_written = err;
	}

	if (sync)
		atomic_dec(&BTRFS_I(inode)->sync_writers);
out:
	current->backing_dev_info = NULL;
	return num_written ? num_written : err;
}

int btrfs_release_file(struct inode *inode, struct file *filp)
{
	/*
	 * ordered_data_close is set by setattr when we are about to truncate
	 * a file from a non-zero size to a zero size.  This tries to
	 * flush down new bytes that may have been written if the
	 * application were using truncate to replace a file in place.
	 */
	if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
			       &BTRFS_I(inode)->runtime_flags)) {
		struct btrfs_trans_handle *trans;
		struct btrfs_root *root = BTRFS_I(inode)->root;

		/*
		 * We need to block on a committing transaction to keep us from
		 * throwing an ordered operation on to the list and causing
		 * something like sync to deadlock trying to flush out this
		 * inode.
		 */
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans))
			return PTR_ERR(trans);
		btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
		btrfs_end_transaction(trans, root);
		if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
			filemap_flush(inode->i_mapping);
	}
	if (filp->private_data)
		btrfs_ioctl_trans_end(filp);
	return 0;
}

/*
 * fsync call for both files and directories.  This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to call filemap_fdatawait so that all ordered extent updates in
 * the metadata btree are up to date for copying to the log.
 *
 * It drops the inode mutex before doing the tree log commit.  This is an
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct dentry *dentry = file->f_path.dentry;
	struct inode *inode = dentry->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	struct btrfs_log_ctx ctx;
	int ret = 0;
	bool full_sync = 0;

	trace_btrfs_sync_file(file, datasync);

	/*
	 * We write the dirty pages in the range and wait until they complete
	 * outside of the ->i_mutex, so the flushing can be done by multiple
	 * tasks and performance improves.  See btrfs_wait_ordered_range for
	 * an explanation of the ASYNC check.
	 */
	atomic_inc(&BTRFS_I(inode)->sync_writers);
	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
			     &BTRFS_I(inode)->runtime_flags))
		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
	atomic_dec(&BTRFS_I(inode)->sync_writers);
	if (ret)
		return ret;

	mutex_lock(&inode->i_mutex);

	/*
	 * We flush the dirty pages again to avoid leaving any dirty pages
	 * in the range behind.
	 */
	atomic_inc(&root->log_batch);
	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			     &BTRFS_I(inode)->runtime_flags);
	if (full_sync) {
		ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
		if (ret) {
			mutex_unlock(&inode->i_mutex);
			goto out;
		}
	}
	atomic_inc(&root->log_batch);

	/*
	 * check the transaction that last modified this inode
	 * and see if it's already been committed
	 */
	if (!BTRFS_I(inode)->last_trans) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	/*
	 * if the last transaction that changed this file was before
	 * the current transaction, we can bail out now without any
	 * syncing
	 */
	smp_mb();
	if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
	    BTRFS_I(inode)->last_trans <=
	    root->fs_info->last_trans_committed) {
		BTRFS_I(inode)->last_trans = 0;

		/*
		 * We've had everything committed since the last time we were
		 * modified so clear this flag in case it was set for whatever
		 * reason, it's no longer relevant.
		 */
		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			  &BTRFS_I(inode)->runtime_flags);
		mutex_unlock(&inode->i_mutex);
		goto out;
	}
1968 1968
1969 /* 1969 /*
1970 * ok, we haven't committed the transaction yet, let's do a commit 1970 * ok, we haven't committed the transaction yet, let's do a commit
1971 */ 1971 */
1972 if (file->private_data) 1972 if (file->private_data)
1973 btrfs_ioctl_trans_end(file); 1973 btrfs_ioctl_trans_end(file);
1974 1974
1975 /* 1975 /*
1976 * We use start here because we will need to wait on the IO to complete 1976 * We use start here because we will need to wait on the IO to complete
1977 * in btrfs_sync_log, which could require joining a transaction (for 1977 * in btrfs_sync_log, which could require joining a transaction (for
1978 * example checking cross references in the nocow path). If we use join 1978 * example checking cross references in the nocow path). If we use join
1979 * here we could get into a situation where we're waiting on IO to 1979 * here we could get into a situation where we're waiting on IO to
1980 * happen that is blocked on a transaction trying to commit. With start 1980 * happen that is blocked on a transaction trying to commit. With start
1981 * we inc the extwriter counter, so we wait for all extwriters to exit 1981 * we inc the extwriter counter, so we wait for all extwriters to exit
1982 * before we start blocking join'ers. This comment is to keep somebody 1982 * before we start blocking join'ers. This comment is to keep somebody
1983 * from thinking they are super smart and changing this to 1983 * from thinking they are super smart and changing this to
1984 * btrfs_join_transaction *cough*Josef*cough*. 1984 * btrfs_join_transaction *cough*Josef*cough*.
1985 */ 1985 */
1986 trans = btrfs_start_transaction(root, 0); 1986 trans = btrfs_start_transaction(root, 0);
1987 if (IS_ERR(trans)) { 1987 if (IS_ERR(trans)) {
1988 ret = PTR_ERR(trans); 1988 ret = PTR_ERR(trans);
1989 mutex_unlock(&inode->i_mutex); 1989 mutex_unlock(&inode->i_mutex);
1990 goto out; 1990 goto out;
1991 } 1991 }
1992 trans->sync = true; 1992 trans->sync = true;
1993 1993
1994 btrfs_init_log_ctx(&ctx); 1994 btrfs_init_log_ctx(&ctx);
1995 1995
1996 ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx); 1996 ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
1997 if (ret < 0) { 1997 if (ret < 0) {
1998 /* Fallthrough and commit/free transaction. */ 1998 /* Fallthrough and commit/free transaction. */
1999 ret = 1; 1999 ret = 1;
2000 } 2000 }
2001 2001
2002 /* we've logged all the items and now have a consistent 2002 /* we've logged all the items and now have a consistent
2003 * version of the file in the log. It is possible that 2003 * version of the file in the log. It is possible that
2004 * someone will come in and modify the file, but that's 2004 * someone will come in and modify the file, but that's
2005 * fine because the log is consistent on disk, and we 2005 * fine because the log is consistent on disk, and we
2006 * have references to all of the file's extents 2006 * have references to all of the file's extents
2007 * 2007 *
2008 * It is possible that someone will come in and log the 2008 * It is possible that someone will come in and log the
2009 * file again, but that will end up using the synchronization 2009 * file again, but that will end up using the synchronization
2010 * inside btrfs_sync_log to keep things safe. 2010 * inside btrfs_sync_log to keep things safe.
2011 */ 2011 */
2012 mutex_unlock(&inode->i_mutex); 2012 mutex_unlock(&inode->i_mutex);
2013 2013
2014 if (ret != BTRFS_NO_LOG_SYNC) { 2014 if (ret != BTRFS_NO_LOG_SYNC) {
2015 if (!ret) { 2015 if (!ret) {
2016 ret = btrfs_sync_log(trans, root, &ctx); 2016 ret = btrfs_sync_log(trans, root, &ctx);
2017 if (!ret) { 2017 if (!ret) {
2018 ret = btrfs_end_transaction(trans, root); 2018 ret = btrfs_end_transaction(trans, root);
2019 goto out; 2019 goto out;
2020 } 2020 }
2021 } 2021 }
2022 if (!full_sync) { 2022 if (!full_sync) {
2023 ret = btrfs_wait_ordered_range(inode, start, 2023 ret = btrfs_wait_ordered_range(inode, start,
2024 end - start + 1); 2024 end - start + 1);
2025 if (ret) 2025 if (ret)
2026 goto out; 2026 goto out;
2027 } 2027 }
2028 ret = btrfs_commit_transaction(trans, root); 2028 ret = btrfs_commit_transaction(trans, root);
2029 } else { 2029 } else {
2030 ret = btrfs_end_transaction(trans, root); 2030 ret = btrfs_end_transaction(trans, root);
2031 } 2031 }
2032 out: 2032 out:
2033 return ret > 0 ? -EIO : ret; 2033 return ret > 0 ? -EIO : ret;
2034 } 2034 }
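
btrfs_sync_file() above is installed as the ->fsync handler in btrfs_file_operations at the end of this file, so the whole path above runs in response to an ordinary fsync(2) or fdatasync(2) call. A minimal userspace sketch of that trigger (the mount point and file name are assumptions, not part of this change):

/* Hypothetical sketch: drives the ->fsync path above on a btrfs mount. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* /mnt/btrfs/testfile is an assumed path on a btrfs filesystem. */
	int fd = open("/mnt/btrfs/testfile", O_CREAT | O_WRONLY, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	const char buf[] = "hello";
	if (write(fd, buf, strlen(buf)) < 0)
		perror("write");
	/* fsync(2) reaches btrfs_sync_file() via f_op->fsync. */
	if (fsync(fd) < 0)
		perror("fsync");
	close(fd);
	return 0;
}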
2035 2035
2036 static const struct vm_operations_struct btrfs_file_vm_ops = { 2036 static const struct vm_operations_struct btrfs_file_vm_ops = {
2037 .fault = filemap_fault, 2037 .fault = filemap_fault,
2038 .map_pages = filemap_map_pages, 2038 .map_pages = filemap_map_pages,
2039 .page_mkwrite = btrfs_page_mkwrite, 2039 .page_mkwrite = btrfs_page_mkwrite,
2040 .remap_pages = generic_file_remap_pages, 2040 .remap_pages = generic_file_remap_pages,
2041 }; 2041 };
2042 2042
2043 static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) 2043 static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
2044 { 2044 {
2045 struct address_space *mapping = filp->f_mapping; 2045 struct address_space *mapping = filp->f_mapping;
2046 2046
2047 if (!mapping->a_ops->readpage) 2047 if (!mapping->a_ops->readpage)
2048 return -ENOEXEC; 2048 return -ENOEXEC;
2049 2049
2050 file_accessed(filp); 2050 file_accessed(filp);
2051 vma->vm_ops = &btrfs_file_vm_ops; 2051 vma->vm_ops = &btrfs_file_vm_ops;
2052 2052
2053 return 0; 2053 return 0;
2054 } 2054 }
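
btrfs_file_mmap() installs btrfs_file_vm_ops, so the first store to a writable shared mapping faults into btrfs_page_mkwrite(). A minimal userspace sketch (the path is an assumption; the file must already be at least one page long, otherwise the access raises SIGBUS):

/* Hypothetical sketch: a write through a MAP_SHARED mapping faults
 * into btrfs_page_mkwrite() through the vm_ops installed above. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/btrfs/mapped", O_RDWR);	/* assumed path */
	if (fd < 0) {
		perror("open");
		return 1;
	}
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return 1;
	}
	p[0] = 'x';	/* first store triggers ->page_mkwrite */
	munmap(p, 4096);
	close(fd);
	return 0;
}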
2055 2055
2056 static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf, 2056 static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
2057 int slot, u64 start, u64 end) 2057 int slot, u64 start, u64 end)
2058 { 2058 {
2059 struct btrfs_file_extent_item *fi; 2059 struct btrfs_file_extent_item *fi;
2060 struct btrfs_key key; 2060 struct btrfs_key key;
2061 2061
2062 if (slot < 0 || slot >= btrfs_header_nritems(leaf)) 2062 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
2063 return 0; 2063 return 0;
2064 2064
2065 btrfs_item_key_to_cpu(leaf, &key, slot); 2065 btrfs_item_key_to_cpu(leaf, &key, slot);
2066 if (key.objectid != btrfs_ino(inode) || 2066 if (key.objectid != btrfs_ino(inode) ||
2067 key.type != BTRFS_EXTENT_DATA_KEY) 2067 key.type != BTRFS_EXTENT_DATA_KEY)
2068 return 0; 2068 return 0;
2069 2069
2070 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 2070 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2071 2071
2072 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) 2072 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2073 return 0; 2073 return 0;
2074 2074
2075 if (btrfs_file_extent_disk_bytenr(leaf, fi)) 2075 if (btrfs_file_extent_disk_bytenr(leaf, fi))
2076 return 0; 2076 return 0;
2077 2077
2078 if (key.offset == end) 2078 if (key.offset == end)
2079 return 1; 2079 return 1;
2080 if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start) 2080 if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
2081 return 1; 2081 return 1;
2082 return 0; 2082 return 0;
2083 } 2083 }
2084 2084
2085 static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, 2085 static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
2086 struct btrfs_path *path, u64 offset, u64 end) 2086 struct btrfs_path *path, u64 offset, u64 end)
2087 { 2087 {
2088 struct btrfs_root *root = BTRFS_I(inode)->root; 2088 struct btrfs_root *root = BTRFS_I(inode)->root;
2089 struct extent_buffer *leaf; 2089 struct extent_buffer *leaf;
2090 struct btrfs_file_extent_item *fi; 2090 struct btrfs_file_extent_item *fi;
2091 struct extent_map *hole_em; 2091 struct extent_map *hole_em;
2092 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 2092 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2093 struct btrfs_key key; 2093 struct btrfs_key key;
2094 int ret; 2094 int ret;
2095 2095
2096 if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) 2096 if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
2097 goto out; 2097 goto out;
2098 2098
2099 key.objectid = btrfs_ino(inode); 2099 key.objectid = btrfs_ino(inode);
2100 key.type = BTRFS_EXTENT_DATA_KEY; 2100 key.type = BTRFS_EXTENT_DATA_KEY;
2101 key.offset = offset; 2101 key.offset = offset;
2102 2102
2103 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2103 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2104 if (ret < 0) 2104 if (ret < 0)
2105 return ret; 2105 return ret;
2106 BUG_ON(!ret); 2106 BUG_ON(!ret);
2107 2107
2108 leaf = path->nodes[0]; 2108 leaf = path->nodes[0];
2109 if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) { 2109 if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
2110 u64 num_bytes; 2110 u64 num_bytes;
2111 2111
2112 path->slots[0]--; 2112 path->slots[0]--;
2113 fi = btrfs_item_ptr(leaf, path->slots[0], 2113 fi = btrfs_item_ptr(leaf, path->slots[0],
2114 struct btrfs_file_extent_item); 2114 struct btrfs_file_extent_item);
2115 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + 2115 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2116 end - offset; 2116 end - offset;
2117 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 2117 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2118 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); 2118 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2119 btrfs_set_file_extent_offset(leaf, fi, 0); 2119 btrfs_set_file_extent_offset(leaf, fi, 0);
2120 btrfs_mark_buffer_dirty(leaf); 2120 btrfs_mark_buffer_dirty(leaf);
2121 goto out; 2121 goto out;
2122 } 2122 }
2123 2123
2124 if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) { 2124 if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
2125 u64 num_bytes; 2125 u64 num_bytes;
2126 2126
2127 path->slots[0]++; 2127 path->slots[0]++;
2128 key.offset = offset; 2128 key.offset = offset;
2129 btrfs_set_item_key_safe(root, path, &key); 2129 btrfs_set_item_key_safe(root, path, &key);
2130 fi = btrfs_item_ptr(leaf, path->slots[0], 2130 fi = btrfs_item_ptr(leaf, path->slots[0],
2131 struct btrfs_file_extent_item); 2131 struct btrfs_file_extent_item);
2132 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - 2132 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2133 offset; 2133 offset;
2134 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 2134 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2135 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); 2135 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2136 btrfs_set_file_extent_offset(leaf, fi, 0); 2136 btrfs_set_file_extent_offset(leaf, fi, 0);
2137 btrfs_mark_buffer_dirty(leaf); 2137 btrfs_mark_buffer_dirty(leaf);
2138 goto out; 2138 goto out;
2139 } 2139 }
2140 btrfs_release_path(path); 2140 btrfs_release_path(path);
2141 2141
2142 ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, 2142 ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
2143 0, 0, end - offset, 0, end - offset, 2143 0, 0, end - offset, 0, end - offset,
2144 0, 0, 0); 2144 0, 0, 0);
2145 if (ret) 2145 if (ret)
2146 return ret; 2146 return ret;
2147 2147
2148 out: 2148 out:
2149 btrfs_release_path(path); 2149 btrfs_release_path(path);
2150 2150
2151 hole_em = alloc_extent_map(); 2151 hole_em = alloc_extent_map();
2152 if (!hole_em) { 2152 if (!hole_em) {
2153 btrfs_drop_extent_cache(inode, offset, end - 1, 0); 2153 btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2154 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 2154 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2155 &BTRFS_I(inode)->runtime_flags); 2155 &BTRFS_I(inode)->runtime_flags);
2156 } else { 2156 } else {
2157 hole_em->start = offset; 2157 hole_em->start = offset;
2158 hole_em->len = end - offset; 2158 hole_em->len = end - offset;
2159 hole_em->ram_bytes = hole_em->len; 2159 hole_em->ram_bytes = hole_em->len;
2160 hole_em->orig_start = offset; 2160 hole_em->orig_start = offset;
2161 2161
2162 hole_em->block_start = EXTENT_MAP_HOLE; 2162 hole_em->block_start = EXTENT_MAP_HOLE;
2163 hole_em->block_len = 0; 2163 hole_em->block_len = 0;
2164 hole_em->orig_block_len = 0; 2164 hole_em->orig_block_len = 0;
2165 hole_em->bdev = root->fs_info->fs_devices->latest_bdev; 2165 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
2166 hole_em->compress_type = BTRFS_COMPRESS_NONE; 2166 hole_em->compress_type = BTRFS_COMPRESS_NONE;
2167 hole_em->generation = trans->transid; 2167 hole_em->generation = trans->transid;
2168 2168
2169 do { 2169 do {
2170 btrfs_drop_extent_cache(inode, offset, end - 1, 0); 2170 btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2171 write_lock(&em_tree->lock); 2171 write_lock(&em_tree->lock);
2172 ret = add_extent_mapping(em_tree, hole_em, 1); 2172 ret = add_extent_mapping(em_tree, hole_em, 1);
2173 write_unlock(&em_tree->lock); 2173 write_unlock(&em_tree->lock);
2174 } while (ret == -EEXIST); 2174 } while (ret == -EEXIST);
2175 free_extent_map(hole_em); 2175 free_extent_map(hole_em);
2176 if (ret) 2176 if (ret)
2177 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 2177 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2178 &BTRFS_I(inode)->runtime_flags); 2178 &BTRFS_I(inode)->runtime_flags);
2179 } 2179 }
2180 2180
2181 return 0; 2181 return 0;
2182 } 2182 }
2183 2183
2184 static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) 2184 static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2185 { 2185 {
2186 struct btrfs_root *root = BTRFS_I(inode)->root; 2186 struct btrfs_root *root = BTRFS_I(inode)->root;
2187 struct extent_state *cached_state = NULL; 2187 struct extent_state *cached_state = NULL;
2188 struct btrfs_path *path; 2188 struct btrfs_path *path;
2189 struct btrfs_block_rsv *rsv; 2189 struct btrfs_block_rsv *rsv;
2190 struct btrfs_trans_handle *trans; 2190 struct btrfs_trans_handle *trans;
2191 u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); 2191 u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
2192 u64 lockend = round_down(offset + len, 2192 u64 lockend = round_down(offset + len,
2193 BTRFS_I(inode)->root->sectorsize) - 1; 2193 BTRFS_I(inode)->root->sectorsize) - 1;
2194 u64 cur_offset = lockstart; 2194 u64 cur_offset = lockstart;
2195 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 2195 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
2196 u64 drop_end; 2196 u64 drop_end;
2197 int ret = 0; 2197 int ret = 0;
2198 int err = 0; 2198 int err = 0;
2199 int rsv_count; 2199 int rsv_count;
2200 bool same_page = ((offset >> PAGE_CACHE_SHIFT) == 2200 bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
2201 ((offset + len - 1) >> PAGE_CACHE_SHIFT)); 2201 ((offset + len - 1) >> PAGE_CACHE_SHIFT));
2202 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); 2202 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
2203 u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); 2203 u64 ino_size;
2204 2204
2205 ret = btrfs_wait_ordered_range(inode, offset, len); 2205 ret = btrfs_wait_ordered_range(inode, offset, len);
2206 if (ret) 2206 if (ret)
2207 return ret; 2207 return ret;
2208 2208
2209 mutex_lock(&inode->i_mutex); 2209 mutex_lock(&inode->i_mutex);
2210 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
2210 /* 2211 /*
2211 * We needn't truncate any page which is beyond the end of the file 2212 * We needn't truncate any page which is beyond the end of the file
2212 * because we are sure there is no data there. 2213 * because we are sure there is no data there.
2213 * 2214 *
2214 * Only do this if we are in the same page and we aren't doing the 2215 * Only do this if we are in the same page and we aren't doing the
2215 * entire page. 2216 * entire page.
2216 */ 2217 */
2218 if (same_page && len < PAGE_CACHE_SIZE) { 2219 if (same_page && len < PAGE_CACHE_SIZE) {
2219 if (offset < ino_size) 2220 if (offset < ino_size)
2220 ret = btrfs_truncate_page(inode, offset, len, 0); 2221 ret = btrfs_truncate_page(inode, offset, len, 0);
2221 mutex_unlock(&inode->i_mutex); 2222 mutex_unlock(&inode->i_mutex);
2222 return ret; 2223 return ret;
2223 } 2224 }
2224 2225
2225 /* zero back part of the first page */ 2226 /* zero back part of the first page */
2226 if (offset < ino_size) { 2227 if (offset < ino_size) {
2227 ret = btrfs_truncate_page(inode, offset, 0, 0); 2228 ret = btrfs_truncate_page(inode, offset, 0, 0);
2228 if (ret) { 2229 if (ret) {
2229 mutex_unlock(&inode->i_mutex); 2230 mutex_unlock(&inode->i_mutex);
2230 return ret; 2231 return ret;
2231 } 2232 }
2232 } 2233 }
2233 2234
2234 /* zero the front end of the last page */ 2235 /* zero the front end of the last page */
2235 if (offset + len < ino_size) { 2236 if (offset + len < ino_size) {
2236 ret = btrfs_truncate_page(inode, offset + len, 0, 1); 2237 ret = btrfs_truncate_page(inode, offset + len, 0, 1);
2237 if (ret) { 2238 if (ret) {
2238 mutex_unlock(&inode->i_mutex); 2239 mutex_unlock(&inode->i_mutex);
2239 return ret; 2240 return ret;
2240 } 2241 }
2241 } 2242 }
2242 2243
2243 if (lockend < lockstart) { 2244 if (lockend < lockstart) {
2244 mutex_unlock(&inode->i_mutex); 2245 mutex_unlock(&inode->i_mutex);
2245 return 0; 2246 return 0;
2246 } 2247 }
2247 2248
2248 while (1) { 2249 while (1) {
2249 struct btrfs_ordered_extent *ordered; 2250 struct btrfs_ordered_extent *ordered;
2250 2251
2251 truncate_pagecache_range(inode, lockstart, lockend); 2252 truncate_pagecache_range(inode, lockstart, lockend);
2252 2253
2253 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2254 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2254 0, &cached_state); 2255 0, &cached_state);
2255 ordered = btrfs_lookup_first_ordered_extent(inode, lockend); 2256 ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
2256 2257
2257 /* 2258 /*
2258 * We need to make sure we have no ordered extents in this range 2259 * We need to make sure we have no ordered extents in this range
2259 * and that nobody raced in and read a page in this range; if 2260 * and that nobody raced in and read a page in this range; if
2260 * either happened, we need to try again. 2261 * either happened, we need to try again.
2261 */ 2262 */
2262 if ((!ordered || 2263 if ((!ordered ||
2263 (ordered->file_offset + ordered->len <= lockstart || 2264 (ordered->file_offset + ordered->len <= lockstart ||
2264 ordered->file_offset > lockend)) && 2265 ordered->file_offset > lockend)) &&
2265 !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart, 2266 !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
2266 lockend, EXTENT_UPTODATE, 0, 2267 lockend, EXTENT_UPTODATE, 0,
2267 cached_state)) { 2268 cached_state)) {
2268 if (ordered) 2269 if (ordered)
2269 btrfs_put_ordered_extent(ordered); 2270 btrfs_put_ordered_extent(ordered);
2270 break; 2271 break;
2271 } 2272 }
2272 if (ordered) 2273 if (ordered)
2273 btrfs_put_ordered_extent(ordered); 2274 btrfs_put_ordered_extent(ordered);
2274 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, 2275 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
2275 lockend, &cached_state, GFP_NOFS); 2276 lockend, &cached_state, GFP_NOFS);
2276 ret = btrfs_wait_ordered_range(inode, lockstart, 2277 ret = btrfs_wait_ordered_range(inode, lockstart,
2277 lockend - lockstart + 1); 2278 lockend - lockstart + 1);
2278 if (ret) { 2279 if (ret) {
2279 mutex_unlock(&inode->i_mutex); 2280 mutex_unlock(&inode->i_mutex);
2280 return ret; 2281 return ret;
2281 } 2282 }
2282 } 2283 }
2283 2284
2284 path = btrfs_alloc_path(); 2285 path = btrfs_alloc_path();
2285 if (!path) { 2286 if (!path) {
2286 ret = -ENOMEM; 2287 ret = -ENOMEM;
2287 goto out; 2288 goto out;
2288 } 2289 }
2289 2290
2290 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); 2291 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
2291 if (!rsv) { 2292 if (!rsv) {
2292 ret = -ENOMEM; 2293 ret = -ENOMEM;
2293 goto out_free; 2294 goto out_free;
2294 } 2295 }
2295 rsv->size = btrfs_calc_trunc_metadata_size(root, 1); 2296 rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
2296 rsv->failfast = 1; 2297 rsv->failfast = 1;
2297 2298
2298 /* 2299 /*
2299 * 1 - update the inode 2300 * 1 - update the inode
2300 * 1 - removing the extents in the range 2301 * 1 - removing the extents in the range
2301 * 1 - adding the hole extent if no_holes isn't set 2302 * 1 - adding the hole extent if no_holes isn't set
2302 */ 2303 */
2303 rsv_count = no_holes ? 2 : 3; 2304 rsv_count = no_holes ? 2 : 3;
2304 trans = btrfs_start_transaction(root, rsv_count); 2305 trans = btrfs_start_transaction(root, rsv_count);
2305 if (IS_ERR(trans)) { 2306 if (IS_ERR(trans)) {
2306 err = PTR_ERR(trans); 2307 err = PTR_ERR(trans);
2307 goto out_free; 2308 goto out_free;
2308 } 2309 }
2309 2310
2310 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, 2311 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
2311 min_size); 2312 min_size);
2312 BUG_ON(ret); 2313 BUG_ON(ret);
2313 trans->block_rsv = rsv; 2314 trans->block_rsv = rsv;
2314 2315
2315 while (cur_offset < lockend) { 2316 while (cur_offset < lockend) {
2316 ret = __btrfs_drop_extents(trans, root, inode, path, 2317 ret = __btrfs_drop_extents(trans, root, inode, path,
2317 cur_offset, lockend + 1, 2318 cur_offset, lockend + 1,
2318 &drop_end, 1, 0, 0, NULL); 2319 &drop_end, 1, 0, 0, NULL);
2319 if (ret != -ENOSPC) 2320 if (ret != -ENOSPC)
2320 break; 2321 break;
2321 2322
2322 trans->block_rsv = &root->fs_info->trans_block_rsv; 2323 trans->block_rsv = &root->fs_info->trans_block_rsv;
2323 2324
2324 if (cur_offset < ino_size) { 2325 if (cur_offset < ino_size) {
2325 ret = fill_holes(trans, inode, path, cur_offset, 2326 ret = fill_holes(trans, inode, path, cur_offset,
2326 drop_end); 2327 drop_end);
2327 if (ret) { 2328 if (ret) {
2328 err = ret; 2329 err = ret;
2329 break; 2330 break;
2330 } 2331 }
2331 } 2332 }
2332 2333
2333 cur_offset = drop_end; 2334 cur_offset = drop_end;
2334 2335
2335 ret = btrfs_update_inode(trans, root, inode); 2336 ret = btrfs_update_inode(trans, root, inode);
2336 if (ret) { 2337 if (ret) {
2337 err = ret; 2338 err = ret;
2338 break; 2339 break;
2339 } 2340 }
2340 2341
2341 btrfs_end_transaction(trans, root); 2342 btrfs_end_transaction(trans, root);
2342 btrfs_btree_balance_dirty(root); 2343 btrfs_btree_balance_dirty(root);
2343 2344
2344 trans = btrfs_start_transaction(root, rsv_count); 2345 trans = btrfs_start_transaction(root, rsv_count);
2345 if (IS_ERR(trans)) { 2346 if (IS_ERR(trans)) {
2346 ret = PTR_ERR(trans); 2347 ret = PTR_ERR(trans);
2347 trans = NULL; 2348 trans = NULL;
2348 break; 2349 break;
2349 } 2350 }
2350 2351
2351 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, 2352 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
2352 rsv, min_size); 2353 rsv, min_size);
2353 BUG_ON(ret); /* shouldn't happen */ 2354 BUG_ON(ret); /* shouldn't happen */
2354 trans->block_rsv = rsv; 2355 trans->block_rsv = rsv;
2355 } 2356 }
2356 2357
2357 if (ret) { 2358 if (ret) {
2358 err = ret; 2359 err = ret;
2359 goto out_trans; 2360 goto out_trans;
2360 } 2361 }
2361 2362
2362 trans->block_rsv = &root->fs_info->trans_block_rsv; 2363 trans->block_rsv = &root->fs_info->trans_block_rsv;
2363 /* 2364 /*
2364 * Don't insert a file hole extent item if it's for a range beyond eof 2365 * Don't insert a file hole extent item if it's for a range beyond eof
2365 * (because it's useless) or if it represents a zero-length range (when 2366 * (because it's useless) or if it represents a zero-length range (when
2366 * cur_offset == drop_end). 2367 * cur_offset == drop_end).
2367 */ 2368 */
2368 if (cur_offset < ino_size && cur_offset < drop_end) { 2369 if (cur_offset < ino_size && cur_offset < drop_end) {
2369 ret = fill_holes(trans, inode, path, cur_offset, drop_end); 2370 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
2370 if (ret) { 2371 if (ret) {
2371 err = ret; 2372 err = ret;
2372 goto out_trans; 2373 goto out_trans;
2373 } 2374 }
2374 } 2375 }
2375 2376
2376 out_trans: 2377 out_trans:
2377 if (!trans) 2378 if (!trans)
2378 goto out_free; 2379 goto out_free;
2379 2380
2380 inode_inc_iversion(inode); 2381 inode_inc_iversion(inode);
2381 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2382 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2382 2383
2383 trans->block_rsv = &root->fs_info->trans_block_rsv; 2384 trans->block_rsv = &root->fs_info->trans_block_rsv;
2384 ret = btrfs_update_inode(trans, root, inode); 2385 ret = btrfs_update_inode(trans, root, inode);
2385 btrfs_end_transaction(trans, root); 2386 btrfs_end_transaction(trans, root);
2386 btrfs_btree_balance_dirty(root); 2387 btrfs_btree_balance_dirty(root);
2387 out_free: 2388 out_free:
2388 btrfs_free_path(path); 2389 btrfs_free_path(path);
2389 btrfs_free_block_rsv(root, rsv); 2390 btrfs_free_block_rsv(root, rsv);
2390 out: 2391 out:
2391 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2392 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2392 &cached_state, GFP_NOFS); 2393 &cached_state, GFP_NOFS);
2393 mutex_unlock(&inode->i_mutex); 2394 mutex_unlock(&inode->i_mutex);
2394 if (ret && !err) 2395 if (ret && !err)
2395 err = ret; 2396 err = ret;
2396 return err; 2397 return err;
2397 } 2398 }
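
btrfs_punch_hole() is only reachable from btrfs_fallocate() below when userspace passes FALLOC_FL_PUNCH_HOLE, and the VFS additionally requires FALLOC_FL_KEEP_SIZE alongside it. A minimal userspace sketch (the path and offsets are illustrative assumptions):

/* Hypothetical sketch: punch a hole in an existing file. The kernel
 * requires FALLOC_FL_KEEP_SIZE together with FALLOC_FL_PUNCH_HOLE. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/btrfs/bigfile", O_RDWR);	/* assumed path */
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Drop 1 MiB of data starting at offset 4096; i_size is unchanged. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      4096, 1024 * 1024) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}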
2398 2399
2399 static long btrfs_fallocate(struct file *file, int mode, 2400 static long btrfs_fallocate(struct file *file, int mode,
2400 loff_t offset, loff_t len) 2401 loff_t offset, loff_t len)
2401 { 2402 {
2402 struct inode *inode = file_inode(file); 2403 struct inode *inode = file_inode(file);
2403 struct extent_state *cached_state = NULL; 2404 struct extent_state *cached_state = NULL;
2404 struct btrfs_root *root = BTRFS_I(inode)->root; 2405 struct btrfs_root *root = BTRFS_I(inode)->root;
2405 u64 cur_offset; 2406 u64 cur_offset;
2406 u64 last_byte; 2407 u64 last_byte;
2407 u64 alloc_start; 2408 u64 alloc_start;
2408 u64 alloc_end; 2409 u64 alloc_end;
2409 u64 alloc_hint = 0; 2410 u64 alloc_hint = 0;
2410 u64 locked_end; 2411 u64 locked_end;
2411 struct extent_map *em; 2412 struct extent_map *em;
2412 int blocksize = BTRFS_I(inode)->root->sectorsize; 2413 int blocksize = BTRFS_I(inode)->root->sectorsize;
2413 int ret; 2414 int ret;
2414 2415
2415 alloc_start = round_down(offset, blocksize); 2416 alloc_start = round_down(offset, blocksize);
2416 alloc_end = round_up(offset + len, blocksize); 2417 alloc_end = round_up(offset + len, blocksize);
2417 2418
2418 /* Make sure we aren't being given some crap mode */ 2419 /* Make sure we aren't being given some crap mode */
2419 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 2420 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2420 return -EOPNOTSUPP; 2421 return -EOPNOTSUPP;
2421 2422
2422 if (mode & FALLOC_FL_PUNCH_HOLE) 2423 if (mode & FALLOC_FL_PUNCH_HOLE)
2423 return btrfs_punch_hole(inode, offset, len); 2424 return btrfs_punch_hole(inode, offset, len);
2424 2425
2425 /* 2426 /*
2426 * Make sure we have enough space before we do the 2427 * Make sure we have enough space before we do the
2427 * allocation. 2428 * allocation.
2428 */ 2429 */
2429 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); 2430 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
2430 if (ret) 2431 if (ret)
2431 return ret; 2432 return ret;
2432 if (root->fs_info->quota_enabled) { 2433 if (root->fs_info->quota_enabled) {
2433 ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start); 2434 ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start);
2434 if (ret) 2435 if (ret)
2435 goto out_reserve_fail; 2436 goto out_reserve_fail;
2436 } 2437 }
2437 2438
2438 mutex_lock(&inode->i_mutex); 2439 mutex_lock(&inode->i_mutex);
2439 ret = inode_newsize_ok(inode, alloc_end); 2440 ret = inode_newsize_ok(inode, alloc_end);
2440 if (ret) 2441 if (ret)
2441 goto out; 2442 goto out;
2442 2443
2443 if (alloc_start > inode->i_size) { 2444 if (alloc_start > inode->i_size) {
2444 ret = btrfs_cont_expand(inode, i_size_read(inode), 2445 ret = btrfs_cont_expand(inode, i_size_read(inode),
2445 alloc_start); 2446 alloc_start);
2446 if (ret) 2447 if (ret)
2447 goto out; 2448 goto out;
2448 } else { 2449 } else {
2449 /* 2450 /*
2450 * If we are fallocating from the end of the file onward we 2451 * If we are fallocating from the end of the file onward we
2451 * need to zero out the end of the page if i_size lands in the 2452 * need to zero out the end of the page if i_size lands in the
2452 * middle of a page. 2453 * middle of a page.
2453 */ 2454 */
2454 ret = btrfs_truncate_page(inode, inode->i_size, 0, 0); 2455 ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
2455 if (ret) 2456 if (ret)
2456 goto out; 2457 goto out;
2457 } 2458 }
2458 2459
2459 /* 2460 /*
2460 * Wait for ordered IO before we take any locks. We'll loop again 2461 * Wait for ordered IO before we take any locks. We'll loop again
2461 * below with the locks held. 2462 * below with the locks held.
2462 */ 2463 */
2463 ret = btrfs_wait_ordered_range(inode, alloc_start, 2464 ret = btrfs_wait_ordered_range(inode, alloc_start,
2464 alloc_end - alloc_start); 2465 alloc_end - alloc_start);
2465 if (ret) 2466 if (ret)
2466 goto out; 2467 goto out;
2467 2468
2468 locked_end = alloc_end - 1; 2469 locked_end = alloc_end - 1;
2469 while (1) { 2470 while (1) {
2470 struct btrfs_ordered_extent *ordered; 2471 struct btrfs_ordered_extent *ordered;
2471 2472
2472 /* the extent lock is ordered inside the running 2473 /* the extent lock is ordered inside the running
2473 * transaction 2474 * transaction
2474 */ 2475 */
2475 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, 2476 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
2476 locked_end, 0, &cached_state); 2477 locked_end, 0, &cached_state);
2477 ordered = btrfs_lookup_first_ordered_extent(inode, 2478 ordered = btrfs_lookup_first_ordered_extent(inode,
2478 alloc_end - 1); 2479 alloc_end - 1);
2479 if (ordered && 2480 if (ordered &&
2480 ordered->file_offset + ordered->len > alloc_start && 2481 ordered->file_offset + ordered->len > alloc_start &&
2481 ordered->file_offset < alloc_end) { 2482 ordered->file_offset < alloc_end) {
2482 btrfs_put_ordered_extent(ordered); 2483 btrfs_put_ordered_extent(ordered);
2483 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 2484 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
2484 alloc_start, locked_end, 2485 alloc_start, locked_end,
2485 &cached_state, GFP_NOFS); 2486 &cached_state, GFP_NOFS);
2486 /* 2487 /*
2487 * we can't wait on the range with the transaction 2488 * we can't wait on the range with the transaction
2488 * running or with the extent lock held 2489 * running or with the extent lock held
2489 */ 2490 */
2490 ret = btrfs_wait_ordered_range(inode, alloc_start, 2491 ret = btrfs_wait_ordered_range(inode, alloc_start,
2491 alloc_end - alloc_start); 2492 alloc_end - alloc_start);
2492 if (ret) 2493 if (ret)
2493 goto out; 2494 goto out;
2494 } else { 2495 } else {
2495 if (ordered) 2496 if (ordered)
2496 btrfs_put_ordered_extent(ordered); 2497 btrfs_put_ordered_extent(ordered);
2497 break; 2498 break;
2498 } 2499 }
2499 } 2500 }
2500 2501
2501 cur_offset = alloc_start; 2502 cur_offset = alloc_start;
2502 while (1) { 2503 while (1) {
2503 u64 actual_end; 2504 u64 actual_end;
2504 2505
2505 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 2506 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
2506 alloc_end - cur_offset, 0); 2507 alloc_end - cur_offset, 0);
2507 if (IS_ERR_OR_NULL(em)) { 2508 if (IS_ERR_OR_NULL(em)) {
2508 if (!em) 2509 if (!em)
2509 ret = -ENOMEM; 2510 ret = -ENOMEM;
2510 else 2511 else
2511 ret = PTR_ERR(em); 2512 ret = PTR_ERR(em);
2512 break; 2513 break;
2513 } 2514 }
2514 last_byte = min(extent_map_end(em), alloc_end); 2515 last_byte = min(extent_map_end(em), alloc_end);
2515 actual_end = min_t(u64, extent_map_end(em), offset + len); 2516 actual_end = min_t(u64, extent_map_end(em), offset + len);
2516 last_byte = ALIGN(last_byte, blocksize); 2517 last_byte = ALIGN(last_byte, blocksize);
2517 2518
2518 if (em->block_start == EXTENT_MAP_HOLE || 2519 if (em->block_start == EXTENT_MAP_HOLE ||
2519 (cur_offset >= inode->i_size && 2520 (cur_offset >= inode->i_size &&
2520 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 2521 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
2521 ret = btrfs_prealloc_file_range(inode, mode, cur_offset, 2522 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
2522 last_byte - cur_offset, 2523 last_byte - cur_offset,
2523 1 << inode->i_blkbits, 2524 1 << inode->i_blkbits,
2524 offset + len, 2525 offset + len,
2525 &alloc_hint); 2526 &alloc_hint);
2526 2527
2527 if (ret < 0) { 2528 if (ret < 0) {
2528 free_extent_map(em); 2529 free_extent_map(em);
2529 break; 2530 break;
2530 } 2531 }
2531 } else if (actual_end > inode->i_size && 2532 } else if (actual_end > inode->i_size &&
2532 !(mode & FALLOC_FL_KEEP_SIZE)) { 2533 !(mode & FALLOC_FL_KEEP_SIZE)) {
2533 /* 2534 /*
2534 * We didn't need to allocate any more space, but we 2535 * We didn't need to allocate any more space, but we
2535 * still extended the size of the file so we need to 2536 * still extended the size of the file so we need to
2536 * update i_size. 2537 * update i_size.
2537 */ 2538 */
2538 inode->i_ctime = CURRENT_TIME; 2539 inode->i_ctime = CURRENT_TIME;
2539 i_size_write(inode, actual_end); 2540 i_size_write(inode, actual_end);
2540 btrfs_ordered_update_i_size(inode, actual_end, NULL); 2541 btrfs_ordered_update_i_size(inode, actual_end, NULL);
2541 } 2542 }
2542 free_extent_map(em); 2543 free_extent_map(em);
2543 2544
2544 cur_offset = last_byte; 2545 cur_offset = last_byte;
2545 if (cur_offset >= alloc_end) { 2546 if (cur_offset >= alloc_end) {
2546 ret = 0; 2547 ret = 0;
2547 break; 2548 break;
2548 } 2549 }
2549 } 2550 }
2550 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 2551 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
2551 &cached_state, GFP_NOFS); 2552 &cached_state, GFP_NOFS);
2552 out: 2553 out:
2553 mutex_unlock(&inode->i_mutex); 2554 mutex_unlock(&inode->i_mutex);
2554 if (root->fs_info->quota_enabled) 2555 if (root->fs_info->quota_enabled)
2555 btrfs_qgroup_free(root, alloc_end - alloc_start); 2556 btrfs_qgroup_free(root, alloc_end - alloc_start);
2556 out_reserve_fail: 2557 out_reserve_fail:
2557 /* Let go of our reservation. */ 2558 /* Let go of our reservation. */
2558 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); 2559 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
2559 return ret; 2560 return ret;
2560 } 2561 }
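
Without FALLOC_FL_PUNCH_HOLE, the loop above preallocates extents for the requested range; with mode 0 the actual_end branch also extends i_size when the range goes past the end of the file. A minimal userspace sketch of both cases (the path is an assumption):

/* Hypothetical sketch: preallocate 16 MiB, first without changing
 * i_size, then with mode 0 so the file size is extended too. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/btrfs/prealloc", O_CREAT | O_RDWR, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Reserve space only: extents marked PREALLOC, i_size untouched. */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 * 1024 * 1024) < 0)
		perror("fallocate keep-size");
	/* Reserve space and extend i_size to 16 MiB. */
	if (fallocate(fd, 0, 0, 16 * 1024 * 1024) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}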
2561 2562
2562 static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) 2563 static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
2563 { 2564 {
2564 struct btrfs_root *root = BTRFS_I(inode)->root; 2565 struct btrfs_root *root = BTRFS_I(inode)->root;
2565 struct extent_map *em = NULL; 2566 struct extent_map *em = NULL;
2566 struct extent_state *cached_state = NULL; 2567 struct extent_state *cached_state = NULL;
2567 u64 lockstart = *offset; 2568 u64 lockstart = *offset;
2568 u64 lockend = i_size_read(inode); 2569 u64 lockend = i_size_read(inode);
2569 u64 start = *offset; 2570 u64 start = *offset;
2570 u64 len = i_size_read(inode); 2571 u64 len = i_size_read(inode);
2571 int ret = 0; 2572 int ret = 0;
2572 2573
2573 lockend = max_t(u64, root->sectorsize, lockend); 2574 lockend = max_t(u64, root->sectorsize, lockend);
2574 if (lockend <= lockstart) 2575 if (lockend <= lockstart)
2575 lockend = lockstart + root->sectorsize; 2576 lockend = lockstart + root->sectorsize;
2576 2577
2577 lockend--; 2578 lockend--;
2578 len = lockend - lockstart + 1; 2579 len = lockend - lockstart + 1;
2579 2580
2580 len = max_t(u64, len, root->sectorsize); 2581 len = max_t(u64, len, root->sectorsize);
2581 if (inode->i_size == 0) 2582 if (inode->i_size == 0)
2582 return -ENXIO; 2583 return -ENXIO;
2583 2584
2584 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, 2585 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
2585 &cached_state); 2586 &cached_state);
2586 2587
2587 while (start < inode->i_size) { 2588 while (start < inode->i_size) {
2588 em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0); 2589 em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
2589 if (IS_ERR(em)) { 2590 if (IS_ERR(em)) {
2590 ret = PTR_ERR(em); 2591 ret = PTR_ERR(em);
2591 em = NULL; 2592 em = NULL;
2592 break; 2593 break;
2593 } 2594 }
2594 2595
2595 if (whence == SEEK_HOLE && 2596 if (whence == SEEK_HOLE &&
2596 (em->block_start == EXTENT_MAP_HOLE || 2597 (em->block_start == EXTENT_MAP_HOLE ||
2597 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) 2598 test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
2598 break; 2599 break;
2599 else if (whence == SEEK_DATA && 2600 else if (whence == SEEK_DATA &&
2600 (em->block_start != EXTENT_MAP_HOLE && 2601 (em->block_start != EXTENT_MAP_HOLE &&
2601 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) 2602 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
2602 break; 2603 break;
2603 2604
2604 start = em->start + em->len; 2605 start = em->start + em->len;
2605 free_extent_map(em); 2606 free_extent_map(em);
2606 em = NULL; 2607 em = NULL;
2607 cond_resched(); 2608 cond_resched();
2608 } 2609 }
2609 free_extent_map(em); 2610 free_extent_map(em);
2610 if (!ret) { 2611 if (!ret) {
2611 if (whence == SEEK_DATA && start >= inode->i_size) 2612 if (whence == SEEK_DATA && start >= inode->i_size)
2612 ret = -ENXIO; 2613 ret = -ENXIO;
2613 else 2614 else
2614 *offset = min_t(loff_t, start, inode->i_size); 2615 *offset = min_t(loff_t, start, inode->i_size);
2615 } 2616 }
2616 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2617 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2617 &cached_state, GFP_NOFS); 2618 &cached_state, GFP_NOFS);
2618 return ret; 2619 return ret;
2619 } 2620 }
2620 2621
2621 static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) 2622 static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
2622 { 2623 {
2623 struct inode *inode = file->f_mapping->host; 2624 struct inode *inode = file->f_mapping->host;
2624 int ret; 2625 int ret;
2625 2626
2626 mutex_lock(&inode->i_mutex); 2627 mutex_lock(&inode->i_mutex);
2627 switch (whence) { 2628 switch (whence) {
2628 case SEEK_END: 2629 case SEEK_END:
2629 case SEEK_CUR: 2630 case SEEK_CUR:
2630 offset = generic_file_llseek(file, offset, whence); 2631 offset = generic_file_llseek(file, offset, whence);
2631 goto out; 2632 goto out;
2632 case SEEK_DATA: 2633 case SEEK_DATA:
2633 case SEEK_HOLE: 2634 case SEEK_HOLE:
2634 if (offset >= i_size_read(inode)) { 2635 if (offset >= i_size_read(inode)) {
2635 mutex_unlock(&inode->i_mutex); 2636 mutex_unlock(&inode->i_mutex);
2636 return -ENXIO; 2637 return -ENXIO;
2637 } 2638 }
2638 2639
2639 ret = find_desired_extent(inode, &offset, whence); 2640 ret = find_desired_extent(inode, &offset, whence);
2640 if (ret) { 2641 if (ret) {
2641 mutex_unlock(&inode->i_mutex); 2642 mutex_unlock(&inode->i_mutex);
2642 return ret; 2643 return ret;
2643 } 2644 }
2644 } 2645 }
2645 2646
2646 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); 2647 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
2647 out: 2648 out:
2648 mutex_unlock(&inode->i_mutex); 2649 mutex_unlock(&inode->i_mutex);
2649 return offset; 2650 return offset;
2650 } 2651 }
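
SEEK_DATA and SEEK_HOLE from userspace land in find_desired_extent() above, and -ENXIO signals an offset at or beyond i_size. A minimal userspace sketch that walks the data extents of a sparse file (the path is an assumption):

/* Hypothetical sketch: enumerate the data ranges of a sparse file
 * with SEEK_DATA/SEEK_HOLE, served by find_desired_extent() above. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/btrfs/sparse", O_RDONLY);	/* assumed path */
	if (fd < 0) {
		perror("open");
		return 1;
	}
	off_t data = 0;
	for (;;) {
		data = lseek(fd, data, SEEK_DATA);
		if (data < 0)
			break;	/* ENXIO: no more data past this offset */
		off_t hole = lseek(fd, data, SEEK_HOLE);
		if (hole < 0)
			break;
		printf("data: [%lld, %lld)\n", (long long)data,
		       (long long)hole);
		data = hole;
	}
	close(fd);
	return 0;
}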
2651 2652
2652 const struct file_operations btrfs_file_operations = { 2653 const struct file_operations btrfs_file_operations = {
2653 .llseek = btrfs_file_llseek, 2654 .llseek = btrfs_file_llseek,
2654 .read = do_sync_read, 2655 .read = do_sync_read,
2655 .write = do_sync_write, 2656 .write = do_sync_write,
2656 .aio_read = generic_file_aio_read, 2657 .aio_read = generic_file_aio_read,
2657 .splice_read = generic_file_splice_read, 2658 .splice_read = generic_file_splice_read,
2658 .aio_write = btrfs_file_aio_write, 2659 .aio_write = btrfs_file_aio_write,
2659 .mmap = btrfs_file_mmap, 2660 .mmap = btrfs_file_mmap,
2660 .open = generic_file_open, 2661 .open = generic_file_open,
2661 .release = btrfs_release_file, 2662 .release = btrfs_release_file,
2662 .fsync = btrfs_sync_file, 2663 .fsync = btrfs_sync_file,
2663 .fallocate = btrfs_fallocate, 2664 .fallocate = btrfs_fallocate,
2664 .unlocked_ioctl = btrfs_ioctl, 2665 .unlocked_ioctl = btrfs_ioctl,
2665 #ifdef CONFIG_COMPAT 2666 #ifdef CONFIG_COMPAT
2666 .compat_ioctl = btrfs_ioctl, 2667 .compat_ioctl = btrfs_ioctl,
2667 #endif 2668 #endif
2668 }; 2669 };
2669 2670
2670 void btrfs_auto_defrag_exit(void) 2671 void btrfs_auto_defrag_exit(void)
2671 { 2672 {
2672 if (btrfs_inode_defrag_cachep) 2673 if (btrfs_inode_defrag_cachep)
2673 kmem_cache_destroy(btrfs_inode_defrag_cachep); 2674 kmem_cache_destroy(btrfs_inode_defrag_cachep);
2674 } 2675 }
2675 2676
2676 int btrfs_auto_defrag_init(void) 2677 int btrfs_auto_defrag_init(void)
2677 { 2678 {
2678 btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", 2679 btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
2679 sizeof(struct inode_defrag), 0, 2680 sizeof(struct inode_defrag), 0,
2680 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, 2681 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
2681 NULL); 2682 NULL);
2682 if (!btrfs_inode_defrag_cachep) 2683 if (!btrfs_inode_defrag_cachep)
2683 return -ENOMEM; 2684 return -ENOMEM;
2684 2685
2685 return 0; 2686 return 0;
2686 } 2687 }
2687 2688