Eric Lee / smarc-fsl-linux-kernel

/*
 * linux/fs/ext4/inode.c
 *
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 * from
 *
 * linux/fs/minix/inode.c
 *
 *
 * Goal-directed block allocation by Stephen Tweedie
 * (sct@redhat.com), 1993, 1998
 * Big-endian to little-endian byte-swapping/bitmaps by
 * David S. Miller (davem@caip.rutgers.edu), 1995
 * 64-bit file support on 64-bit platforms by Jakub Jelinek
 * (jj@sunsite.ms.mff.cuni.cz)
 *
 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
 */

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>

#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "ext4_extents.h"

#include <trace/events/ext4.h>

#define MPAGE_DA_EXTENT_TAIL 0x01

55

static inline int ext4_begin_ordered_truncate(struct inode *inode,

55

static inline int ext4_begin_ordered_truncate(struct inode *inode,

56

loff_t new_size)

56

loff_t new_size)

57

{

57

{

58

trace_ext4_begin_ordered_truncate(inode, new_size);

58

trace_ext4_begin_ordered_truncate(inode, new_size);

59

/*

59

/*

60

* If jinode is zero, then we never opened the file for

60

* If jinode is zero, then we never opened the file for

61

* writing, so there's no need to call

61

* writing, so there's no need to call

62

* jbd2_journal_begin_ordered_truncate() since there's no

62

* jbd2_journal_begin_ordered_truncate() since there's no

63

* outstanding writes we need to flush.

63

* outstanding writes we need to flush.

64

*/

64

*/

65

if (!EXT4_I(inode)->jinode)

65

if (!EXT4_I(inode)->jinode)

66

return 0;

66

return 0;

67

return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),

67

return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),

68

EXT4_I(inode)->jinode,

68

EXT4_I(inode)->jinode,

69

new_size);

69

new_size);

70

}

70

}

71

72

static void ext4_invalidatepage(struct page *page, unsigned long offset);

72

static void ext4_invalidatepage(struct page *page, unsigned long offset);

73

static int noalloc_get_block_write(struct inode *inode, sector_t iblock,

73

static int noalloc_get_block_write(struct inode *inode, sector_t iblock,

74

struct buffer_head *bh_result, int create);

74

struct buffer_head *bh_result, int create);

75

static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);

75

static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);

76

static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);

76

static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);

77

static int __ext4_journalled_writepage(struct page *page, unsigned int len);

77

static int __ext4_journalled_writepage(struct page *page, unsigned int len);

78

static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);

78

static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);

79

80

/*

80

/*

81

* Test whether an inode is a fast symlink.

81

* Test whether an inode is a fast symlink.

82

*/

82

*/

83

static int ext4_inode_is_fast_symlink(struct inode *inode)

83

static int ext4_inode_is_fast_symlink(struct inode *inode)

84

{

84

{

85

int ea_blocks = EXT4_I(inode)->i_file_acl ?

85

int ea_blocks = EXT4_I(inode)->i_file_acl ?

86

(inode->i_sb->s_blocksize >> 9) : 0;

86

(inode->i_sb->s_blocksize >> 9) : 0;

87

88

return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);

88

return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);

89

}

89

}

90

91

/*

91

/*

92

* Work out how many blocks we need to proceed with the next chunk of a

92

* Work out how many blocks we need to proceed with the next chunk of a

93

* truncate transaction.

93

* truncate transaction.

94

*/

94

*/

95

static unsigned long blocks_for_truncate(struct inode *inode)

95

static unsigned long blocks_for_truncate(struct inode *inode)

96

{

96

{

97

ext4_lblk_t needed;

97

ext4_lblk_t needed;

98

99

needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);

99

needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);

100

101

/* Give ourselves just enough room to cope with inodes in which

101

/* Give ourselves just enough room to cope with inodes in which

102

* i_blocks is corrupt: we've seen disk corruptions in the past

102

* i_blocks is corrupt: we've seen disk corruptions in the past

103

* which resulted in random data in an inode which looked enough

103

* which resulted in random data in an inode which looked enough

104

* like a regular file for ext4 to try to delete it. Things

104

* like a regular file for ext4 to try to delete it. Things

105

* will go a bit crazy if that happens, but at least we should

105

* will go a bit crazy if that happens, but at least we should

106

* try not to panic the whole kernel. */

106

* try not to panic the whole kernel. */

107

if (needed < 2)

107

if (needed < 2)

108

needed = 2;

108

needed = 2;

109

110

/* But we need to bound the transaction so we don't overflow the

110

/* But we need to bound the transaction so we don't overflow the

111

* journal. */

111

* journal. */

112

if (needed > EXT4_MAX_TRANS_DATA)

112

if (needed > EXT4_MAX_TRANS_DATA)

113

needed = EXT4_MAX_TRANS_DATA;

113

needed = EXT4_MAX_TRANS_DATA;

114

115

return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;

115

return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;

116

}

116

}

117

118

/*

118

/*

119

* Truncate transactions can be complex and absolutely huge. So we need to

119

* Truncate transactions can be complex and absolutely huge. So we need to

120

* be able to restart the transaction at a conventient checkpoint to make

120

* be able to restart the transaction at a conventient checkpoint to make

121

* sure we don't overflow the journal.

121

* sure we don't overflow the journal.

122

*

122

*

123

* start_transaction gets us a new handle for a truncate transaction,

123

* start_transaction gets us a new handle for a truncate transaction,

124

* and extend_transaction tries to extend the existing one a bit. If

124

* and extend_transaction tries to extend the existing one a bit. If

125

* extend fails, we need to propagate the failure up and restart the

125

* extend fails, we need to propagate the failure up and restart the

126

* transaction in the top-level truncate loop. --sct

126

* transaction in the top-level truncate loop. --sct

127

*/

127

*/

128

static handle_t *start_transaction(struct inode *inode)

128

static handle_t *start_transaction(struct inode *inode)

129

{

129

{

130

handle_t *result;

130

handle_t *result;

131

132

result = ext4_journal_start(inode, blocks_for_truncate(inode));

132

result = ext4_journal_start(inode, blocks_for_truncate(inode));

133

if (!IS_ERR(result))

133

if (!IS_ERR(result))

134

return result;

134

return result;

135

136

ext4_std_error(inode->i_sb, PTR_ERR(result));

136

ext4_std_error(inode->i_sb, PTR_ERR(result));

137

return result;

137

return result;

138

}

138

}

139

140

/*

140

/*

141

* Try to extend this transaction for the purposes of truncation.

141

* Try to extend this transaction for the purposes of truncation.

142

*

142

*

143

* Returns 0 if we managed to create more room. If we can't create more

143

* Returns 0 if we managed to create more room. If we can't create more

144

* room, and the transaction must be restarted we return 1.

144

* room, and the transaction must be restarted we return 1.

145

*/

145

*/

146

static int try_to_extend_transaction(handle_t *handle, struct inode *inode)

146

static int try_to_extend_transaction(handle_t *handle, struct inode *inode)

147

{

147

{

148

if (!ext4_handle_valid(handle))

148

if (!ext4_handle_valid(handle))

149

return 0;

149

return 0;

150

if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))

150

if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))

151

return 0;

151

return 0;

152

if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))

152

if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))

153

return 0;

153

return 0;

154

return 1;

154

return 1;

155

}

155

}

156

157

/*

157

/*

158

* Restart the transaction associated with *handle. This does a commit,

158

* Restart the transaction associated with *handle. This does a commit,

159

* so before we call here everything must be consistently dirtied against

159

* so before we call here everything must be consistently dirtied against

160

* this transaction.

160

* this transaction.

161

*/

161

*/

162

int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,

162

int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,

163

int nblocks)

163

int nblocks)

164

{

164

{

165

int ret;

165

int ret;

166

167

/*

167

/*

168

* Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this

168

* Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this

169

* moment, get_block can be called only for blocks inside i_size since

169

* moment, get_block can be called only for blocks inside i_size since

170

* page cache has been already dropped and writes are blocked by

170

* page cache has been already dropped and writes are blocked by

171

* i_mutex. So we can safely drop the i_data_sem here.

171

* i_mutex. So we can safely drop the i_data_sem here.

172

*/

172

*/

173

BUG_ON(EXT4_JOURNAL(inode) == NULL);

173

BUG_ON(EXT4_JOURNAL(inode) == NULL);

174

jbd_debug(2, "restarting handle %p\n", handle);

174

jbd_debug(2, "restarting handle %p\n", handle);

175

up_write(&EXT4_I(inode)->i_data_sem);

175

up_write(&EXT4_I(inode)->i_data_sem);

176

ret = ext4_journal_restart(handle, nblocks);

176

ret = ext4_journal_restart(handle, nblocks);

177

down_write(&EXT4_I(inode)->i_data_sem);

177

down_write(&EXT4_I(inode)->i_data_sem);

178

ext4_discard_preallocations(inode);

178

ext4_discard_preallocations(inode);

179

180

return ret;

180

return ret;

181

}

181

}

182

183

/*

183

/*

184

* Called at the last iput() if i_nlink is zero.

184

* Called at the last iput() if i_nlink is zero.

185

*/

185

*/

186

void ext4_evict_inode(struct inode *inode)

186

void ext4_evict_inode(struct inode *inode)

187

{

187

{

188

handle_t *handle;

188

handle_t *handle;

189

int err;

189

int err;

190

191

trace_ext4_evict_inode(inode);

191

trace_ext4_evict_inode(inode);

192

if (inode->i_nlink) {

192

if (inode->i_nlink) {

193

truncate_inode_pages(&inode->i_data, 0);

193

truncate_inode_pages(&inode->i_data, 0);

194

goto no_delete;

194

goto no_delete;

195

}

195

}

196

197

if (!is_bad_inode(inode))

197

if (!is_bad_inode(inode))

198

dquot_initialize(inode);

198

dquot_initialize(inode);

199

200

if (ext4_should_order_data(inode))

200

if (ext4_should_order_data(inode))

201

ext4_begin_ordered_truncate(inode, 0);

201

ext4_begin_ordered_truncate(inode, 0);

202

truncate_inode_pages(&inode->i_data, 0);

202

truncate_inode_pages(&inode->i_data, 0);

203

204

if (is_bad_inode(inode))

204

if (is_bad_inode(inode))

205

goto no_delete;

205

goto no_delete;

206

207

handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);

207

handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);

208

if (IS_ERR(handle)) {

208

if (IS_ERR(handle)) {

209

ext4_std_error(inode->i_sb, PTR_ERR(handle));

209

ext4_std_error(inode->i_sb, PTR_ERR(handle));

210

/*

210

/*

211

* If we're going to skip the normal cleanup, we still need to

211

* If we're going to skip the normal cleanup, we still need to

212

* make sure that the in-core orphan linked list is properly

212

* make sure that the in-core orphan linked list is properly

213

* cleaned up.

213

* cleaned up.

214

*/

214

*/

215

ext4_orphan_del(NULL, inode);

215

ext4_orphan_del(NULL, inode);

216

goto no_delete;

216

goto no_delete;

217

}

217

}

218

219

if (IS_SYNC(inode))

219

if (IS_SYNC(inode))

220

ext4_handle_sync(handle);

220

ext4_handle_sync(handle);

221

inode->i_size = 0;

221

inode->i_size = 0;

222

err = ext4_mark_inode_dirty(handle, inode);

222

err = ext4_mark_inode_dirty(handle, inode);

223

if (err) {

223

if (err) {

224

ext4_warning(inode->i_sb,

224

ext4_warning(inode->i_sb,

225

"couldn't mark inode dirty (err %d)", err);

225

"couldn't mark inode dirty (err %d)", err);

226

goto stop_handle;

226

goto stop_handle;

227

}

227

}

228

if (inode->i_blocks)

228

if (inode->i_blocks)

229

ext4_truncate(inode);

229

ext4_truncate(inode);

230

231

/*

231

/*

232

* ext4_ext_truncate() doesn't reserve any slop when it

232

* ext4_ext_truncate() doesn't reserve any slop when it

233

* restarts journal transactions; therefore there may not be

233

* restarts journal transactions; therefore there may not be

234

* enough credits left in the handle to remove the inode from

234

* enough credits left in the handle to remove the inode from

235

* the orphan list and set the dtime field.

235

* the orphan list and set the dtime field.

236

*/

236

*/

237

if (!ext4_handle_has_enough_credits(handle, 3)) {

237

if (!ext4_handle_has_enough_credits(handle, 3)) {

238

err = ext4_journal_extend(handle, 3);

238

err = ext4_journal_extend(handle, 3);

239

if (err > 0)

239

if (err > 0)

240

err = ext4_journal_restart(handle, 3);

240

err = ext4_journal_restart(handle, 3);

241

if (err != 0) {

241

if (err != 0) {

242

ext4_warning(inode->i_sb,

242

ext4_warning(inode->i_sb,

243

"couldn't extend journal (err %d)", err);

243

"couldn't extend journal (err %d)", err);

244

stop_handle:

244

stop_handle:

245

ext4_journal_stop(handle);

245

ext4_journal_stop(handle);

246

ext4_orphan_del(NULL, inode);

246

ext4_orphan_del(NULL, inode);

247

goto no_delete;

247

goto no_delete;

248

}

248

}

249

}

249

}

250

251

/*

251

/*

252

* Kill off the orphan record which ext4_truncate created.

252

* Kill off the orphan record which ext4_truncate created.

253

* AKPM: I think this can be inside the above `if'.

253

* AKPM: I think this can be inside the above `if'.

254

* Note that ext4_orphan_del() has to be able to cope with the

254

* Note that ext4_orphan_del() has to be able to cope with the

255

* deletion of a non-existent orphan - this is because we don't

255

* deletion of a non-existent orphan - this is because we don't

256

* know if ext4_truncate() actually created an orphan record.

256

* know if ext4_truncate() actually created an orphan record.

257

* (Well, we could do this if we need to, but heck - it works)

257

* (Well, we could do this if we need to, but heck - it works)

258

*/

258

*/

259

ext4_orphan_del(handle, inode);

259

ext4_orphan_del(handle, inode);

260

EXT4_I(inode)->i_dtime = get_seconds();

260

EXT4_I(inode)->i_dtime = get_seconds();

261

262

/*

262

/*

263

* One subtle ordering requirement: if anything has gone wrong

263

* One subtle ordering requirement: if anything has gone wrong

264

* (transaction abort, IO errors, whatever), then we can still

264

* (transaction abort, IO errors, whatever), then we can still

265

* do these next steps (the fs will already have been marked as

265

* do these next steps (the fs will already have been marked as

266

* having errors), but we can't free the inode if the mark_dirty

266

* having errors), but we can't free the inode if the mark_dirty

267

* fails.

267

* fails.

268

*/

268

*/

269

if (ext4_mark_inode_dirty(handle, inode))

269

if (ext4_mark_inode_dirty(handle, inode))

270

/* If that failed, just do the required in-core inode clear. */

270

/* If that failed, just do the required in-core inode clear. */

271

ext4_clear_inode(inode);

271

ext4_clear_inode(inode);

272

else

272

else

273

ext4_free_inode(handle, inode);

273

ext4_free_inode(handle, inode);

274

ext4_journal_stop(handle);

274

ext4_journal_stop(handle);

275

return;

275

return;

276

no_delete:

276

no_delete:

277

ext4_clear_inode(inode); /* We must guarantee clearing of inode... */

277

ext4_clear_inode(inode); /* We must guarantee clearing of inode... */

278

}

278

}

279

280

typedef struct {

280

typedef struct {

281

__le32 *p;

281

__le32 *p;

282

__le32 key;

282

__le32 key;

283

struct buffer_head *bh;

283

struct buffer_head *bh;

284

} Indirect;

284

} Indirect;

285

286

static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)

286

static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)

287

{

287

{

288

p->key = *(p->p = v);

288

p->key = *(p->p = v);

289

p->bh = bh;

289

p->bh = bh;

290

}

290

}

291

292

/**

292

/**

293

* ext4_block_to_path - parse the block number into array of offsets

293

* ext4_block_to_path - parse the block number into array of offsets

294

* @inode: inode in question (we are only interested in its superblock)

294

* @inode: inode in question (we are only interested in its superblock)

295

* @i_block: block number to be parsed

295

* @i_block: block number to be parsed

296

* @offsets: array to store the offsets in

296

* @offsets: array to store the offsets in

297

* @boundary: set this non-zero if the referred-to block is likely to be

297

* @boundary: set this non-zero if the referred-to block is likely to be

298

* followed (on disk) by an indirect block.

298

* followed (on disk) by an indirect block.

299

*

299

*

300

* To store the locations of file's data ext4 uses a data structure common

300

* To store the locations of file's data ext4 uses a data structure common

301

* for UNIX filesystems - tree of pointers anchored in the inode, with

301

* for UNIX filesystems - tree of pointers anchored in the inode, with

302

* data blocks at leaves and indirect blocks in intermediate nodes.

302

* data blocks at leaves and indirect blocks in intermediate nodes.

303

* This function translates the block number into path in that tree -

303

* This function translates the block number into path in that tree -

304

* return value is the path length and @offsets[n] is the offset of

304

* return value is the path length and @offsets[n] is the offset of

305

* pointer to (n+1)th node in the nth one. If @block is out of range

305

* pointer to (n+1)th node in the nth one. If @block is out of range

306

* (negative or too large) warning is printed and zero returned.

306

* (negative or too large) warning is printed and zero returned.

307

*

307

*

308

* Note: function doesn't find node addresses, so no IO is needed. All

308

* Note: function doesn't find node addresses, so no IO is needed. All

309

* we need to know is the capacity of indirect blocks (taken from the

309

* we need to know is the capacity of indirect blocks (taken from the

310

* inode->i_sb).

310

* inode->i_sb).

311

*/

311

*/

312

313

/*

313

/*

314

* Portability note: the last comparison (check that we fit into triple

314

* Portability note: the last comparison (check that we fit into triple

315

* indirect block) is spelled differently, because otherwise on an

315

* indirect block) is spelled differently, because otherwise on an

316

* architecture with 32-bit longs and 8Kb pages we might get into trouble

316

* architecture with 32-bit longs and 8Kb pages we might get into trouble

317

* if our filesystem had 8Kb blocks. We might use long long, but that would

317

* if our filesystem had 8Kb blocks. We might use long long, but that would

318

* kill us on x86. Oh, well, at least the sign propagation does not matter -

318

* kill us on x86. Oh, well, at least the sign propagation does not matter -

319

* i_block would have to be negative in the very beginning, so we would not

319

* i_block would have to be negative in the very beginning, so we would not

320

* get there at all.

320

* get there at all.

321

*/

321

*/

322

323

static int ext4_block_to_path(struct inode *inode,

323

static int ext4_block_to_path(struct inode *inode,

324

ext4_lblk_t i_block,

324

ext4_lblk_t i_block,

325

ext4_lblk_t offsets[4], int *boundary)

325

ext4_lblk_t offsets[4], int *boundary)

326

{

326

{

327

int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);

327

int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);

328

int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);

328

int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);

329

const long direct_blocks = EXT4_NDIR_BLOCKS,

329

const long direct_blocks = EXT4_NDIR_BLOCKS,

330

indirect_blocks = ptrs,

330

indirect_blocks = ptrs,

331

double_blocks = (1 << (ptrs_bits * 2));

331

double_blocks = (1 << (ptrs_bits * 2));

332

int n = 0;

332

int n = 0;

333

int final = 0;

333

int final = 0;

334

335

if (i_block < direct_blocks) {

335

if (i_block < direct_blocks) {

336

offsets[n++] = i_block;

336

offsets[n++] = i_block;

337

final = direct_blocks;

337

final = direct_blocks;

338

} else if ((i_block -= direct_blocks) < indirect_blocks) {

338

} else if ((i_block -= direct_blocks) < indirect_blocks) {

339

offsets[n++] = EXT4_IND_BLOCK;

339

offsets[n++] = EXT4_IND_BLOCK;

340

offsets[n++] = i_block;

340

offsets[n++] = i_block;

341

final = ptrs;

341

final = ptrs;

342

} else if ((i_block -= indirect_blocks) < double_blocks) {

342

} else if ((i_block -= indirect_blocks) < double_blocks) {

343

offsets[n++] = EXT4_DIND_BLOCK;

343

offsets[n++] = EXT4_DIND_BLOCK;

344

offsets[n++] = i_block >> ptrs_bits;

344

offsets[n++] = i_block >> ptrs_bits;

345

offsets[n++] = i_block & (ptrs - 1);

345

offsets[n++] = i_block & (ptrs - 1);

346

final = ptrs;

346

final = ptrs;

347

} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {

347

} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {

348

offsets[n++] = EXT4_TIND_BLOCK;

348

offsets[n++] = EXT4_TIND_BLOCK;

349

offsets[n++] = i_block >> (ptrs_bits * 2);

349

offsets[n++] = i_block >> (ptrs_bits * 2);

350

offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);

350

offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);

351

offsets[n++] = i_block & (ptrs - 1);

351

offsets[n++] = i_block & (ptrs - 1);

352

final = ptrs;

352

final = ptrs;

353

} else {

353

} else {

354

ext4_warning(inode->i_sb, "block %lu > max in inode %lu",

354

ext4_warning(inode->i_sb, "block %lu > max in inode %lu",

355

i_block + direct_blocks +

355

i_block + direct_blocks +

356

indirect_blocks + double_blocks, inode->i_ino);

356

indirect_blocks + double_blocks, inode->i_ino);

357

}

357

}

358

if (boundary)

358

if (boundary)

359

*boundary = final - 1 - (i_block & (ptrs - 1));

359

*boundary = final - 1 - (i_block & (ptrs - 1));

360

return n;

360

return n;

361

}

361

}

362

363

static int __ext4_check_blockref(const char *function, unsigned int line,

363

static int __ext4_check_blockref(const char *function, unsigned int line,

364

struct inode *inode,

364

struct inode *inode,

365

__le32 *p, unsigned int max)

365

__le32 *p, unsigned int max)

366

{

366

{

367

struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;

367

struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;

368

__le32 *bref = p;

368

__le32 *bref = p;

369

unsigned int blk;

369

unsigned int blk;

370

371

while (bref < p+max) {

371

while (bref < p+max) {

372

blk = le32_to_cpu(*bref++);

372

blk = le32_to_cpu(*bref++);

373

if (blk &&

373

if (blk &&

374

unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),

374

unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),

375

blk, 1))) {

375

blk, 1))) {

376

es->s_last_error_block = cpu_to_le64(blk);

376

es->s_last_error_block = cpu_to_le64(blk);

377

ext4_error_inode(inode, function, line, blk,

377

ext4_error_inode(inode, function, line, blk,

378

"invalid block");

378

"invalid block");

379

return -EIO;

379

return -EIO;

380

}

380

}

381

}

381

}

382

return 0;

382

return 0;

383

}

383

}

384

385

/* Check every block reference held in the indirect block buffer @bh. */
#define ext4_check_indirect_blockref(inode, bh)                         \
	__ext4_check_blockref(__func__, __LINE__, inode,		\
			      (__le32 *)(bh)->b_data,			\
			      EXT4_ADDR_PER_BLOCK((inode)->i_sb))

/* Check the first EXT4_NDIR_BLOCKS block references in the inode's i_data. */
#define ext4_check_inode_blockref(inode)                                \
	__ext4_check_blockref(__func__, __LINE__, inode,		\
			      EXT4_I(inode)->i_data,			\
			      EXT4_NDIR_BLOCKS)

396

/**

396

/**

397

* ext4_get_branch - read the chain of indirect blocks leading to data

397

* ext4_get_branch - read the chain of indirect blocks leading to data

398

* @inode: inode in question

398

* @inode: inode in question

399

* @depth: depth of the chain (1 - direct pointer, etc.)

399

* @depth: depth of the chain (1 - direct pointer, etc.)

400

* @offsets: offsets of pointers in inode/indirect blocks

400

* @offsets: offsets of pointers in inode/indirect blocks

401

* @chain: place to store the result

401

* @chain: place to store the result

402

* @err: here we store the error value

402

* @err: here we store the error value

403

*

403

*

404

* Function fills the array of triples <key, p, bh> and returns %NULL

404

* Function fills the array of triples <key, p, bh> and returns %NULL

405

* if everything went OK or the pointer to the last filled triple

405

* if everything went OK or the pointer to the last filled triple

406

* (incomplete one) otherwise. Upon the return chain[i].key contains

406

* (incomplete one) otherwise. Upon the return chain[i].key contains

407

* the number of (i+1)-th block in the chain (as it is stored in memory,

407

* the number of (i+1)-th block in the chain (as it is stored in memory,

408

* i.e. little-endian 32-bit), chain[i].p contains the address of that

408

* i.e. little-endian 32-bit), chain[i].p contains the address of that

409

* number (it points into struct inode for i==0 and into the bh->b_data

409

* number (it points into struct inode for i==0 and into the bh->b_data

410

* for i>0) and chain[i].bh points to the buffer_head of i-th indirect

410

* for i>0) and chain[i].bh points to the buffer_head of i-th indirect

411

* block for i>0 and NULL for i==0. In other words, it holds the block

411

* block for i>0 and NULL for i==0. In other words, it holds the block

412

* numbers of the chain, addresses they were taken from (and where we can

412

* numbers of the chain, addresses they were taken from (and where we can

413

* verify that chain did not change) and buffer_heads hosting these

413

* verify that chain did not change) and buffer_heads hosting these

414

* numbers.

414

* numbers.

415

*

415

*

416

* Function stops when it stumbles upon zero pointer (absent block)

416

* Function stops when it stumbles upon zero pointer (absent block)

417

* (pointer to last triple returned, *@err == 0)

417

* (pointer to last triple returned, *@err == 0)

418

* or when it gets an IO error reading an indirect block

418

* or when it gets an IO error reading an indirect block

419

* (ditto, *@err == -EIO)

419

* (ditto, *@err == -EIO)

420

* or when it reads all @depth-1 indirect blocks successfully and finds

420

* or when it reads all @depth-1 indirect blocks successfully and finds

421

* the whole chain, all way to the data (returns %NULL, *err == 0).

421

* the whole chain, all way to the data (returns %NULL, *err == 0).

422

*

422

*

423

* Need to be called with

423

* Need to be called with

424

* down_read(&EXT4_I(inode)->i_data_sem)

424

* down_read(&EXT4_I(inode)->i_data_sem)

425

*/

425

*/

426

static Indirect *ext4_get_branch(struct inode *inode, int depth,

426

static Indirect *ext4_get_branch(struct inode *inode, int depth,

427

ext4_lblk_t *offsets,

427

ext4_lblk_t *offsets,

428

Indirect chain[4], int *err)

428

Indirect chain[4], int *err)

429

{

429

{

430

struct super_block *sb = inode->i_sb;

430

struct super_block *sb = inode->i_sb;

431

Indirect *p = chain;

431

Indirect *p = chain;

432

struct buffer_head *bh;

432

struct buffer_head *bh;

433

434

*err = 0;

434

*err = 0;

435

/* i_data is not going away, no lock needed */

435

/* i_data is not going away, no lock needed */

436

add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);

436

add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);

437

if (!p->key)

437

if (!p->key)

438

goto no_block;

438

goto no_block;

439

while (--depth) {

439

while (--depth) {

440

bh = sb_getblk(sb, le32_to_cpu(p->key));

440

bh = sb_getblk(sb, le32_to_cpu(p->key));

441

if (unlikely(!bh))

441

if (unlikely(!bh))

442

goto failure;

442

goto failure;

443

444

if (!bh_uptodate_or_lock(bh)) {

444

if (!bh_uptodate_or_lock(bh)) {

445

if (bh_submit_read(bh) < 0) {

445

if (bh_submit_read(bh) < 0) {

446

put_bh(bh);

446

put_bh(bh);

447

goto failure;

447

goto failure;

448

}

448

}

449

/* validate block references */

449

/* validate block references */

450

if (ext4_check_indirect_blockref(inode, bh)) {

450

if (ext4_check_indirect_blockref(inode, bh)) {

451

put_bh(bh);

451

put_bh(bh);

452

goto failure;

452

goto failure;

453

}

453

}

454

}

454

}

455

456

add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);

456

add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);

457

/* Reader: end */

457

/* Reader: end */

458

if (!p->key)

458

if (!p->key)

459

goto no_block;

459

goto no_block;

460

}

460

}

461

return NULL;

461

return NULL;

462

463

failure:

463

failure:

464

*err = -EIO;

464

*err = -EIO;

465

no_block:

465

no_block:

466

return p;

466

return p;

467

}

467

}

468

469

/**

469

/**

470

* ext4_find_near - find a place for allocation with sufficient locality

470

* ext4_find_near - find a place for allocation with sufficient locality

471

* @inode: owner

471

* @inode: owner

472

* @ind: descriptor of indirect block.

472

* @ind: descriptor of indirect block.

473

*

473

*

474

* This function returns the preferred place for block allocation.

474

* This function returns the preferred place for block allocation.

475

* It is used when heuristic for sequential allocation fails.

475

* It is used when heuristic for sequential allocation fails.

476

* Rules are:

476

* Rules are:

477

* + if there is a block to the left of our position - allocate near it.

477

* + if there is a block to the left of our position - allocate near it.

478

* + if pointer will live in indirect block - allocate near that block.

478

* + if pointer will live in indirect block - allocate near that block.

479

* + if pointer will live in inode - allocate in the same

479

* + if pointer will live in inode - allocate in the same

480

* cylinder group.

480

* cylinder group.

481

*

481

*

482

* In the latter case we colour the starting block by the callers PID to

482

* In the latter case we colour the starting block by the callers PID to

483

* prevent it from clashing with concurrent allocations for a different inode

483

* prevent it from clashing with concurrent allocations for a different inode

484

* in the same block group. The PID is used here so that functionally related

484

* in the same block group. The PID is used here so that functionally related

485

* files will be close-by on-disk.

485

* files will be close-by on-disk.

486

*

486

*

487

* Caller must make sure that @ind is valid and will stay that way.

487

* Caller must make sure that @ind is valid and will stay that way.

488

*/

488

*/

489

static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)

489

static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)

490

{

490

{

491

struct ext4_inode_info *ei = EXT4_I(inode);

491

struct ext4_inode_info *ei = EXT4_I(inode);

492

__le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;

492

__le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;

493

__le32 *p;

493

__le32 *p;

494

ext4_fsblk_t bg_start;

494

ext4_fsblk_t bg_start;

495

ext4_fsblk_t last_block;

495

ext4_fsblk_t last_block;

496

ext4_grpblk_t colour;

496

ext4_grpblk_t colour;

497

ext4_group_t block_group;

497

ext4_group_t block_group;

498

int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));

498

int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));

499

500

/* Try to find previous block */

500

/* Try to find previous block */

501

for (p = ind->p - 1; p >= start; p--) {

501

for (p = ind->p - 1; p >= start; p--) {

502

if (*p)

502

if (*p)

503

return le32_to_cpu(*p);

503

return le32_to_cpu(*p);

504

}

504

}

505

506

/* No such thing, so let's try location of indirect block */

506

/* No such thing, so let's try location of indirect block */

507

if (ind->bh)

507

if (ind->bh)

508

return ind->bh->b_blocknr;

508

return ind->bh->b_blocknr;

509

510

/*

510

/*

511

* It is going to be referred to from the inode itself? OK, just put it

511

* It is going to be referred to from the inode itself? OK, just put it

512

* into the same cylinder group then.

512

* into the same cylinder group then.

513

*/

513

*/

514

block_group = ei->i_block_group;

514

block_group = ei->i_block_group;

515

if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {

515

if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {

516

block_group &= ~(flex_size-1);

516

block_group &= ~(flex_size-1);

517

if (S_ISREG(inode->i_mode))

517

if (S_ISREG(inode->i_mode))

518

block_group++;

518

block_group++;

519

}

519

}

520

bg_start = ext4_group_first_block_no(inode->i_sb, block_group);

520

bg_start = ext4_group_first_block_no(inode->i_sb, block_group);

521

last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

521

last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

522

523

/*

523

/*

524

* If we are doing delayed allocation, we don't need take

524

* If we are doing delayed allocation, we don't need take

525

* colour into account.

525

* colour into account.

526

*/

526

*/

527

if (test_opt(inode->i_sb, DELALLOC))

527

if (test_opt(inode->i_sb, DELALLOC))

528

return bg_start;

528

return bg_start;

529

530

if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)

530

if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)

531

colour = (current->pid % 16) *

531

colour = (current->pid % 16) *

532

(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);

532

(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);

533

else

533

else

534

colour = (current->pid % 16) * ((last_block - bg_start) / 16);

534

colour = (current->pid % 16) * ((last_block - bg_start) / 16);

535

return bg_start + colour;

535

return bg_start + colour;

536

}

536

}

537

538

/**

538

/**

539

* ext4_find_goal - find a preferred place for allocation.

539

* ext4_find_goal - find a preferred place for allocation.

540

* @inode: owner

540

* @inode: owner

541

* @block: block we want

541

* @block: block we want

542

* @partial: pointer to the last triple within a chain

542

* @partial: pointer to the last triple within a chain

543

*

543

*

544

* Normally this function find the preferred place for block allocation,

544

* Normally this function find the preferred place for block allocation,

545

* returns it.

545

* returns it.

546

* Because this is only used for non-extent files, we limit the block nr

546

* Because this is only used for non-extent files, we limit the block nr

547

* to 32 bits.

547

* to 32 bits.

548

*/

548

*/

549

static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,

549

static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,

550

Indirect *partial)

550

Indirect *partial)

551

{

551

{

552

ext4_fsblk_t goal;

552

ext4_fsblk_t goal;

553

554

/*

554

/*

555

* XXX need to get goal block from mballoc's data structures

555

* XXX need to get goal block from mballoc's data structures

556

*/

556

*/

557

558

goal = ext4_find_near(inode, partial);

558

goal = ext4_find_near(inode, partial);

559

goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;

559

goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;

560

return goal;

560

return goal;

561

}

561

}

562

563

/**

563

/**

564

* ext4_blks_to_allocate - Look up the block map and count the number

564

* ext4_blks_to_allocate - Look up the block map and count the number

565

* of direct blocks need to be allocated for the given branch.

565

* of direct blocks need to be allocated for the given branch.

566

*

566

*

567

* @branch: chain of indirect blocks

567

* @branch: chain of indirect blocks

568

* @k: number of blocks need for indirect blocks

568

* @k: number of blocks need for indirect blocks

569

* @blks: number of data blocks to be mapped.

569

* @blks: number of data blocks to be mapped.

570

* @blocks_to_boundary: the offset in the indirect block

570

* @blocks_to_boundary: the offset in the indirect block

571

*

571

*

572

* return the total number of blocks to be allocate, including the

572

* return the total number of blocks to be allocate, including the

573

* direct and indirect blocks.

573

* direct and indirect blocks.

574

*/

574

*/

575

static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,

575

static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,

576

int blocks_to_boundary)

576

int blocks_to_boundary)

577

{

577

{

578

unsigned int count = 0;

578

unsigned int count = 0;

579

580

/*

580

/*

581

* Simple case, [t,d]Indirect block(s) has not allocated yet

581

* Simple case, [t,d]Indirect block(s) has not allocated yet

582

* then it's clear blocks on that path have not allocated

582

* then it's clear blocks on that path have not allocated

583

*/

583

*/

584

if (k > 0) {

584

if (k > 0) {

585

/* right now we don't handle cross boundary allocation */

585

/* right now we don't handle cross boundary allocation */

586

if (blks < blocks_to_boundary + 1)

586

if (blks < blocks_to_boundary + 1)

587

count += blks;

587

count += blks;

588

else

588

else

589

count += blocks_to_boundary + 1;

589

count += blocks_to_boundary + 1;

590

return count;

590

return count;

591

}

591

}

592

593

count++;

593

count++;

594

while (count < blks && count <= blocks_to_boundary &&

594

while (count < blks && count <= blocks_to_boundary &&

595

le32_to_cpu(*(branch[0].p + count)) == 0) {

595

le32_to_cpu(*(branch[0].p + count)) == 0) {

596

count++;

596

count++;

597

}

597

}

598

return count;

598

return count;

599

}

599

}

600

601

/**

601

/**

602

* ext4_alloc_blocks: multiple allocate blocks needed for a branch

602

* ext4_alloc_blocks: multiple allocate blocks needed for a branch

603

* @handle: handle for this transaction

603

* @handle: handle for this transaction

604

* @inode: inode which needs allocated blocks

604

* @inode: inode which needs allocated blocks

605

* @iblock: the logical block to start allocated at

605

* @iblock: the logical block to start allocated at

606

* @goal: preferred physical block of allocation

606

* @goal: preferred physical block of allocation

607

* @indirect_blks: the number of blocks need to allocate for indirect

607

* @indirect_blks: the number of blocks need to allocate for indirect

608

* blocks

608

* blocks

609

* @blks: number of desired blocks

609

* @blks: number of desired blocks

610

* @new_blocks: on return it will store the new block numbers for

610

* @new_blocks: on return it will store the new block numbers for

611

* the indirect blocks(if needed) and the first direct block,

611

* the indirect blocks(if needed) and the first direct block,

612

* @err: on return it will store the error code

612

* @err: on return it will store the error code

613

*

613

*

614

* This function will return the number of blocks allocated as

614

* This function will return the number of blocks allocated as

615

* requested by the passed-in parameters.

615

* requested by the passed-in parameters.

616

*/

616

*/

617

static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,

617

static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,

618

ext4_lblk_t iblock, ext4_fsblk_t goal,

618

ext4_lblk_t iblock, ext4_fsblk_t goal,

619

int indirect_blks, int blks,

619

int indirect_blks, int blks,

620

ext4_fsblk_t new_blocks[4], int *err)

620

ext4_fsblk_t new_blocks[4], int *err)

621

{

621

{

622

struct ext4_allocation_request ar;

622

struct ext4_allocation_request ar;

623

int target, i;

623

int target, i;

624

unsigned long count = 0, blk_allocated = 0;

624

unsigned long count = 0, blk_allocated = 0;

625

int index = 0;

625

int index = 0;

626

ext4_fsblk_t current_block = 0;

626

ext4_fsblk_t current_block = 0;

627

int ret = 0;

627

int ret = 0;

628

629

/*

629

/*

630

* Here we try to allocate the requested multiple blocks at once,

630

* Here we try to allocate the requested multiple blocks at once,

631

* on a best-effort basis.

631

* on a best-effort basis.

632

* To build a branch, we should allocate blocks for

632

* To build a branch, we should allocate blocks for

633

* the indirect blocks(if not allocated yet), and at least

633

* the indirect blocks(if not allocated yet), and at least

634

* the first direct block of this branch. That's the

634

* the first direct block of this branch. That's the

635

* minimum number of blocks need to allocate(required)

635

* minimum number of blocks need to allocate(required)

636

*/

636

*/

637

/* first we try to allocate the indirect blocks */

637

/* first we try to allocate the indirect blocks */

638

target = indirect_blks;

638

target = indirect_blks;

639

while (target > 0) {

639

while (target > 0) {

640

count = target;

640

count = target;

641

/* allocating blocks for indirect blocks and direct blocks */

641

/* allocating blocks for indirect blocks and direct blocks */

642

current_block = ext4_new_meta_blocks(handle, inode,

642

current_block = ext4_new_meta_blocks(handle, inode,

643

goal, &count, err);

643

goal, &count, err);

644

if (*err)

644

if (*err)

645

goto failed_out;

645

goto failed_out;

646

647

if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {

647

if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {

648

EXT4_ERROR_INODE(inode,

648

EXT4_ERROR_INODE(inode,

649

"current_block %llu + count %lu > %d!",

649

"current_block %llu + count %lu > %d!",

650

current_block, count,

650

current_block, count,

651

EXT4_MAX_BLOCK_FILE_PHYS);

651

EXT4_MAX_BLOCK_FILE_PHYS);

652

*err = -EIO;

652

*err = -EIO;

653

goto failed_out;

653

goto failed_out;

654

}

654

}

655

656

target -= count;

656

target -= count;

657

/* allocate blocks for indirect blocks */

657

/* allocate blocks for indirect blocks */

658

while (index < indirect_blks && count) {

658

while (index < indirect_blks && count) {

659

new_blocks[index++] = current_block++;

659

new_blocks[index++] = current_block++;

660

count--;

660

count--;

661

}

661

}

662

if (count > 0) {

662

if (count > 0) {

663

/*

663

/*

664

* save the new block number

664

* save the new block number

665

* for the first direct block

665

* for the first direct block

666

*/

666

*/

667

new_blocks[index] = current_block;

667

new_blocks[index] = current_block;

668

printk(KERN_INFO "%s returned more blocks than "

668

printk(KERN_INFO "%s returned more blocks than "

669

"requested\n", __func__);

669

"requested\n", __func__);

670

WARN_ON(1);

670

WARN_ON(1);

671

break;

671

break;

672

}

672

}

673

}

673

}

674

675

target = blks - count ;

675

target = blks - count ;

676

blk_allocated = count;

676

blk_allocated = count;

677

if (!target)

677

if (!target)

678

goto allocated;

678

goto allocated;

679

/* Now allocate data blocks */

679

/* Now allocate data blocks */

680

memset(&ar, 0, sizeof(ar));

680

memset(&ar, 0, sizeof(ar));

681

ar.inode = inode;

681

ar.inode = inode;

682

ar.goal = goal;

682

ar.goal = goal;

683

ar.len = target;

683

ar.len = target;

684

ar.logical = iblock;

684

ar.logical = iblock;

685

if (S_ISREG(inode->i_mode))

685

if (S_ISREG(inode->i_mode))

686

/* enable in-core preallocation only for regular files */

686

/* enable in-core preallocation only for regular files */

687

ar.flags = EXT4_MB_HINT_DATA;

687

ar.flags = EXT4_MB_HINT_DATA;

688

689

current_block = ext4_mb_new_blocks(handle, &ar, err);

689

current_block = ext4_mb_new_blocks(handle, &ar, err);

690

if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {

690

if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {

691

EXT4_ERROR_INODE(inode,

691

EXT4_ERROR_INODE(inode,

692

"current_block %llu + ar.len %d > %d!",

692

"current_block %llu + ar.len %d > %d!",

693

current_block, ar.len,

693

current_block, ar.len,

694

EXT4_MAX_BLOCK_FILE_PHYS);

694

EXT4_MAX_BLOCK_FILE_PHYS);

695

*err = -EIO;

695

*err = -EIO;

696

goto failed_out;

696

goto failed_out;

697

}

697

}

698

699

if (*err && (target == blks)) {

699

if (*err && (target == blks)) {

700

/*

700

/*

701

* if the allocation failed and we didn't allocate

701

* if the allocation failed and we didn't allocate

702

* any blocks before

702

* any blocks before

703

*/

703

*/

704

goto failed_out;

704

goto failed_out;

705

}

705

}

706

if (!*err) {

706

if (!*err) {

707

if (target == blks) {

707

if (target == blks) {

708

/*

708

/*

709

* save the new block number

709

* save the new block number

710

* for the first direct block

710

* for the first direct block

711

*/

711

*/

712

new_blocks[index] = current_block;

712

new_blocks[index] = current_block;

713

}

713

}

714

blk_allocated += ar.len;

714

blk_allocated += ar.len;

715

}

715

}

716

allocated:

716

allocated:

717

/* total number of blocks allocated for direct blocks */

717

/* total number of blocks allocated for direct blocks */

718

ret = blk_allocated;

718

ret = blk_allocated;

719

*err = 0;

719

*err = 0;

720

return ret;

720

return ret;

721

failed_out:

721

failed_out:

722

for (i = 0; i < index; i++)

722

for (i = 0; i < index; i++)

723

ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);

723

ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);

724

return ret;

724

return ret;

725

}

725

}

726

727

/**

727

/**

728

* ext4_alloc_branch - allocate and set up a chain of blocks.

728

* ext4_alloc_branch - allocate and set up a chain of blocks.

729

* @handle: handle for this transaction

729

* @handle: handle for this transaction

730

* @inode: owner

730

* @inode: owner

731

* @indirect_blks: number of allocated indirect blocks

731

* @indirect_blks: number of allocated indirect blocks

732

* @blks: number of allocated direct blocks

732

* @blks: number of allocated direct blocks

733

* @goal: preferred place for allocation

733

* @goal: preferred place for allocation

734

* @offsets: offsets (in the blocks) to store the pointers to next.

734

* @offsets: offsets (in the blocks) to store the pointers to next.

735

* @branch: place to store the chain in.

735

* @branch: place to store the chain in.

736

*

736

*

737

* This function allocates blocks, zeroes out all but the last one,

737

* This function allocates blocks, zeroes out all but the last one,

738

* links them into chain and (if we are synchronous) writes them to disk.

738

* links them into chain and (if we are synchronous) writes them to disk.

739

* In other words, it prepares a branch that can be spliced onto the

739

* In other words, it prepares a branch that can be spliced onto the

740

* inode. It stores the information about that chain in the branch[], in

740

* inode. It stores the information about that chain in the branch[], in

741

* the same format as ext4_get_branch() would do. We are calling it after

741

* the same format as ext4_get_branch() would do. We are calling it after

742

* we had read the existing part of chain and partial points to the last

742

* we had read the existing part of chain and partial points to the last

743

* triple of that (one with zero ->key). Upon the exit we have the same

743

* triple of that (one with zero ->key). Upon the exit we have the same

744

* picture as after the successful ext4_get_block(), except that in one

744

* picture as after the successful ext4_get_block(), except that in one

745

* place chain is disconnected - *branch->p is still zero (we did not

745

* place chain is disconnected - *branch->p is still zero (we did not

746

* set the last link), but branch->key contains the number that should

746

* set the last link), but branch->key contains the number that should

747

* be placed into *branch->p to fill that gap.

747

* be placed into *branch->p to fill that gap.

748

*

748

*

749

* If allocation fails we free all blocks we've allocated (and forget

749

* If allocation fails we free all blocks we've allocated (and forget

750

* their buffer_heads) and return the error value the from failed

750

* their buffer_heads) and return the error value the from failed

751

* ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain

751

* ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain

752

* as described above and return 0.

752

* as described above and return 0.

753

*/

753

*/

754

static int ext4_alloc_branch(handle_t *handle, struct inode *inode,

754

static int ext4_alloc_branch(handle_t *handle, struct inode *inode,

755

ext4_lblk_t iblock, int indirect_blks,

755

ext4_lblk_t iblock, int indirect_blks,

756

int *blks, ext4_fsblk_t goal,

756

int *blks, ext4_fsblk_t goal,

757

ext4_lblk_t *offsets, Indirect *branch)

757

ext4_lblk_t *offsets, Indirect *branch)

758

{

758

{

759

int blocksize = inode->i_sb->s_blocksize;

759

int blocksize = inode->i_sb->s_blocksize;

760

int i, n = 0;

760

int i, n = 0;

761

int err = 0;

761

int err = 0;

762

struct buffer_head *bh;

762

struct buffer_head *bh;

763

int num;

763

int num;

764

ext4_fsblk_t new_blocks[4];

764

ext4_fsblk_t new_blocks[4];

765

ext4_fsblk_t current_block;

765

ext4_fsblk_t current_block;

766

767

num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,

767

num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,

768

*blks, new_blocks, &err);

768

*blks, new_blocks, &err);

769

if (err)

769

if (err)

770

return err;

770

return err;

771

772

branch[0].key = cpu_to_le32(new_blocks[0]);

772

branch[0].key = cpu_to_le32(new_blocks[0]);

773

/*

773

/*

774

* metadata blocks and data blocks are allocated.

774

* metadata blocks and data blocks are allocated.

775

*/

775

*/

776

for (n = 1; n <= indirect_blks; n++) {

776

for (n = 1; n <= indirect_blks; n++) {

777

/*

777

/*

778

* Get buffer_head for parent block, zero it out

778

* Get buffer_head for parent block, zero it out

779

* and set the pointer to new one, then send

779

* and set the pointer to new one, then send

780

* parent to disk.

780

* parent to disk.

781

*/

781

*/

782

bh = sb_getblk(inode->i_sb, new_blocks[n-1]);

782

bh = sb_getblk(inode->i_sb, new_blocks[n-1]);

783

if (unlikely(!bh)) {

783

if (unlikely(!bh)) {

784

err = -EIO;

784

err = -EIO;

785

goto failed;

785

goto failed;

786

}

786

}

787

788

branch[n].bh = bh;

788

branch[n].bh = bh;

789

lock_buffer(bh);

789

lock_buffer(bh);

790

BUFFER_TRACE(bh, "call get_create_access");

790

BUFFER_TRACE(bh, "call get_create_access");

791

err = ext4_journal_get_create_access(handle, bh);

791

err = ext4_journal_get_create_access(handle, bh);

792

if (err) {

792

if (err) {

793

/* Don't brelse(bh) here; it's done in

793

/* Don't brelse(bh) here; it's done in

794

* ext4_journal_forget() below */

794

* ext4_journal_forget() below */

795

unlock_buffer(bh);

795

unlock_buffer(bh);

796

goto failed;

796

goto failed;

797

}

797

}

798

799

memset(bh->b_data, 0, blocksize);

799

memset(bh->b_data, 0, blocksize);

800

branch[n].p = (__le32 *) bh->b_data + offsets[n];

800

branch[n].p = (__le32 *) bh->b_data + offsets[n];

801

branch[n].key = cpu_to_le32(new_blocks[n]);

801

branch[n].key = cpu_to_le32(new_blocks[n]);

802

*branch[n].p = branch[n].key;

802

*branch[n].p = branch[n].key;

803

if (n == indirect_blks) {

803

if (n == indirect_blks) {

804

current_block = new_blocks[n];

804

current_block = new_blocks[n];

805

/*

805

/*

806

* End of chain, update the last new metablock of

806

* End of chain, update the last new metablock of

807

* the chain to point to the new allocated

807

* the chain to point to the new allocated

808

* data blocks numbers

808

* data blocks numbers

809

*/

809

*/

810

for (i = 1; i < num; i++)

810

for (i = 1; i < num; i++)

811

*(branch[n].p + i) = cpu_to_le32(++current_block);

811

*(branch[n].p + i) = cpu_to_le32(++current_block);

812

}

812

}

813

BUFFER_TRACE(bh, "marking uptodate");

813

BUFFER_TRACE(bh, "marking uptodate");

814

set_buffer_uptodate(bh);

814

set_buffer_uptodate(bh);

815

unlock_buffer(bh);

815

unlock_buffer(bh);

816

817

BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");

817

BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");

818

err = ext4_handle_dirty_metadata(handle, inode, bh);

818

err = ext4_handle_dirty_metadata(handle, inode, bh);

819

if (err)

819

if (err)

820

goto failed;

820

goto failed;

821

}

821

}

822

*blks = num;

822

*blks = num;

823

return err;

823

return err;

824

failed:

824

failed:

825

/* Allocation failed, free what we already allocated */

825

/* Allocation failed, free what we already allocated */

826

ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);

826

ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);

827

for (i = 1; i <= n ; i++) {

827

for (i = 1; i <= n ; i++) {

828

/*

828

/*

829

* branch[i].bh is newly allocated, so there is no

829

* branch[i].bh is newly allocated, so there is no

830

* need to revoke the block, which is why we don't

830

* need to revoke the block, which is why we don't

831

* need to set EXT4_FREE_BLOCKS_METADATA.

831

* need to set EXT4_FREE_BLOCKS_METADATA.

832

*/

832

*/

833

ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,

833

ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,

834

EXT4_FREE_BLOCKS_FORGET);

834

EXT4_FREE_BLOCKS_FORGET);

835

}

835

}

836

for (i = n+1; i < indirect_blks; i++)

836

for (i = n+1; i < indirect_blks; i++)

837

ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);

837

ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);

838

839

ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);

839

ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);

840

841

return err;

841

return err;

842

}

842

}

843

844

/**

844

/**

845

* ext4_splice_branch - splice the allocated branch onto inode.

845

* ext4_splice_branch - splice the allocated branch onto inode.

846

* @handle: handle for this transaction

846

* @handle: handle for this transaction

847

* @inode: owner

847

* @inode: owner

848

* @block: (logical) number of block we are adding

848

* @block: (logical) number of block we are adding

849

* @chain: chain of indirect blocks (with a missing link - see

849

* @chain: chain of indirect blocks (with a missing link - see

850

* ext4_alloc_branch)

850

* ext4_alloc_branch)

851

* @where: location of missing link

851

* @where: location of missing link

852

* @num: number of indirect blocks we are adding

852

* @num: number of indirect blocks we are adding

853

* @blks: number of direct blocks we are adding

853

* @blks: number of direct blocks we are adding

854

*

854

*

855

* This function fills the missing link and does all housekeeping needed in

855

* This function fills the missing link and does all housekeeping needed in

856

* inode (->i_blocks, etc.). In case of success we end up with the full

856

* inode (->i_blocks, etc.). In case of success we end up with the full

857

* chain to new block and return 0.

857

* chain to new block and return 0.

858

*/

858

*/

859

static int ext4_splice_branch(handle_t *handle, struct inode *inode,

859

static int ext4_splice_branch(handle_t *handle, struct inode *inode,

860

ext4_lblk_t block, Indirect *where, int num,

860

ext4_lblk_t block, Indirect *where, int num,

861

int blks)

861

int blks)

862

{

862

{

863

int i;

863

int i;

864

int err = 0;

864

int err = 0;

865

ext4_fsblk_t current_block;

865

ext4_fsblk_t current_block;

866

867

/*

867

/*

868

* If we're splicing into a [td]indirect block (as opposed to the

868

* If we're splicing into a [td]indirect block (as opposed to the

869

* inode) then we need to get write access to the [td]indirect block

869

* inode) then we need to get write access to the [td]indirect block

870

* before the splice.

870

* before the splice.

871

*/

871

*/

872

if (where->bh) {

872

if (where->bh) {

873

BUFFER_TRACE(where->bh, "get_write_access");

873

BUFFER_TRACE(where->bh, "get_write_access");

874

err = ext4_journal_get_write_access(handle, where->bh);

874

err = ext4_journal_get_write_access(handle, where->bh);

875

if (err)

875

if (err)

876

goto err_out;

876

goto err_out;

877

}

877

}

878

/* That's it */

878

/* That's it */

879

880

*where->p = where->key;

880

*where->p = where->key;

881

882

/*

882

/*

883

* Update the host buffer_head or inode to point to more just allocated

883

* Update the host buffer_head or inode to point to more just allocated

884

* direct blocks blocks

884

* direct blocks blocks

885

*/

885

*/

886

if (num == 0 && blks > 1) {

886

if (num == 0 && blks > 1) {

887

current_block = le32_to_cpu(where->key) + 1;

887

current_block = le32_to_cpu(where->key) + 1;

888

for (i = 1; i < blks; i++)

888

for (i = 1; i < blks; i++)

889

*(where->p + i) = cpu_to_le32(current_block++);

889

*(where->p + i) = cpu_to_le32(current_block++);

890

}

890

}

891

892

/* We are done with atomic stuff, now do the rest of housekeeping */

892

/* We are done with atomic stuff, now do the rest of housekeeping */

893

/* had we spliced it onto indirect block? */

893

/* had we spliced it onto indirect block? */

894

if (where->bh) {

894

if (where->bh) {

895

/*

895

/*

896

* If we spliced it onto an indirect block, we haven't

896

* If we spliced it onto an indirect block, we haven't

897

* altered the inode. Note however that if it is being spliced

897

* altered the inode. Note however that if it is being spliced

898

* onto an indirect block at the very end of the file (the

898

* onto an indirect block at the very end of the file (the

899

* file is growing) then we *will* alter the inode to reflect

899

* file is growing) then we *will* alter the inode to reflect

900

* the new i_size. But that is not done here - it is done in

900

* the new i_size. But that is not done here - it is done in

901

* generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.

901

* generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.

902

*/

902

*/

903

jbd_debug(5, "splicing indirect only\n");

903

jbd_debug(5, "splicing indirect only\n");

904

BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");

904

BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");

905

err = ext4_handle_dirty_metadata(handle, inode, where->bh);

905

err = ext4_handle_dirty_metadata(handle, inode, where->bh);

906

if (err)

906

if (err)

907

goto err_out;

907

goto err_out;

908

} else {

908

} else {

909

/*

909

/*

910

* OK, we spliced it into the inode itself on a direct block.

910

* OK, we spliced it into the inode itself on a direct block.

911

*/

911

*/

912

ext4_mark_inode_dirty(handle, inode);

912

ext4_mark_inode_dirty(handle, inode);

913

jbd_debug(5, "splicing direct\n");

913

jbd_debug(5, "splicing direct\n");

914

}

914

}

915

return err;

915

return err;

916

917

err_out:

917

err_out:

918

for (i = 1; i <= num; i++) {

918

for (i = 1; i <= num; i++) {

919

/*

919

/*

920

* branch[i].bh is newly allocated, so there is no

920

* branch[i].bh is newly allocated, so there is no

921

* need to revoke the block, which is why we don't

921

* need to revoke the block, which is why we don't

922

* need to set EXT4_FREE_BLOCKS_METADATA.

922

* need to set EXT4_FREE_BLOCKS_METADATA.

923

*/

923

*/

924

ext4_free_blocks(handle, inode, where[i].bh, 0, 1,

924

ext4_free_blocks(handle, inode, where[i].bh, 0, 1,

925

EXT4_FREE_BLOCKS_FORGET);

925

EXT4_FREE_BLOCKS_FORGET);

926

}

926

}

927

ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),

927

ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),

928

blks, 0);

928

blks, 0);

929

930

return err;

930

return err;

931

}

931

}

932

933

/*

933

/*

934

* The ext4_ind_map_blocks() function handles non-extents inodes

934

* The ext4_ind_map_blocks() function handles non-extents inodes

935

* (i.e., using the traditional indirect/double-indirect i_blocks

935

* (i.e., using the traditional indirect/double-indirect i_blocks

936

* scheme) for ext4_map_blocks().

936

* scheme) for ext4_map_blocks().

937

*

937

*

938

* Allocation strategy is simple: if we have to allocate something, we will

938

* Allocation strategy is simple: if we have to allocate something, we will

939

* have to go the whole way to leaf. So let's do it before attaching anything

939

* have to go the whole way to leaf. So let's do it before attaching anything

940

* to tree, set linkage between the newborn blocks, write them if sync is

940

* to tree, set linkage between the newborn blocks, write them if sync is

941

* required, recheck the path, free and repeat if check fails, otherwise

941

* required, recheck the path, free and repeat if check fails, otherwise

942

* set the last missing link (that will protect us from any truncate-generated

942

* set the last missing link (that will protect us from any truncate-generated

943

* removals - all blocks on the path are immune now) and possibly force the

943

* removals - all blocks on the path are immune now) and possibly force the

944

* write on the parent block.

944

* write on the parent block.

945

* That has a nice additional property: no special recovery from the failed

945

* That has a nice additional property: no special recovery from the failed

946

* allocations is needed - we simply release blocks and do not touch anything

946

* allocations is needed - we simply release blocks and do not touch anything

947

* reachable from inode.

947

* reachable from inode.

948

*

948

*

949

* `handle' can be NULL if create == 0.

949

* `handle' can be NULL if create == 0.

950

*

950

*

951

* return > 0, # of blocks mapped or allocated.

951

* return > 0, # of blocks mapped or allocated.

952

* return = 0, if plain lookup failed.

952

* return = 0, if plain lookup failed.

953

* return < 0, error case.

953

* return < 0, error case.

954

*

954

*

955

* The ext4_ind_get_blocks() function should be called with

955

* The ext4_ind_get_blocks() function should be called with

956

* down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem

956

* down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem

957

* blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or

957

* blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or

958

* down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system

958

* down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system

959

* blocks.

959

* blocks.

960

*/

960

*/

961

static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,

961

static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,

962

struct ext4_map_blocks *map,

962

struct ext4_map_blocks *map,

963

int flags)

963

int flags)

964

{

964

{

965

int err = -EIO;

965

int err = -EIO;

966

ext4_lblk_t offsets[4];

966

ext4_lblk_t offsets[4];

967

Indirect chain[4];

967

Indirect chain[4];

968

Indirect *partial;

968

Indirect *partial;

969

ext4_fsblk_t goal;

969

ext4_fsblk_t goal;

970

int indirect_blks;

970

int indirect_blks;

971

int blocks_to_boundary = 0;

971

int blocks_to_boundary = 0;

972

int depth;

972

int depth;

973

int count = 0;

973

int count = 0;

974

ext4_fsblk_t first_block = 0;

974

ext4_fsblk_t first_block = 0;

975

976

trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);

976

trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);

977

J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));

977

J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));

978

J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);

978

J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);

979

depth = ext4_block_to_path(inode, map->m_lblk, offsets,

979

depth = ext4_block_to_path(inode, map->m_lblk, offsets,

980

&blocks_to_boundary);

980

&blocks_to_boundary);

981

982

if (depth == 0)

982

if (depth == 0)

983

goto out;

983

goto out;

984

985

partial = ext4_get_branch(inode, depth, offsets, chain, &err);

985

partial = ext4_get_branch(inode, depth, offsets, chain, &err);

986

987

/* Simplest case - block found, no allocation needed */

987

/* Simplest case - block found, no allocation needed */

988

if (!partial) {

988

if (!partial) {

989

first_block = le32_to_cpu(chain[depth - 1].key);

989

first_block = le32_to_cpu(chain[depth - 1].key);

990

count++;

990

count++;

991

/*map more blocks*/

991

/*map more blocks*/

992

while (count < map->m_len && count <= blocks_to_boundary) {

992

while (count < map->m_len && count <= blocks_to_boundary) {

993

ext4_fsblk_t blk;

993

ext4_fsblk_t blk;

994

995

blk = le32_to_cpu(*(chain[depth-1].p + count));

995

blk = le32_to_cpu(*(chain[depth-1].p + count));

996

997

if (blk == first_block + count)

997

if (blk == first_block + count)

998

count++;

998

count++;

999

else

999

else

1000

break;

1000

break;

1001

}

1001

}

1002

goto got_it;

1002

goto got_it;

1003

}

1003

}

1004

1005

/* Next simple case - plain lookup or failed read of indirect block */

1005

/* Next simple case - plain lookup or failed read of indirect block */

1006

if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)

1006

if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)

1007

goto cleanup;

1007

goto cleanup;

1008

1009

/*

1009

/*

1010

* Okay, we need to do block allocation.

1010

* Okay, we need to do block allocation.

1011

*/

1011

*/

1012

goal = ext4_find_goal(inode, map->m_lblk, partial);

1012

goal = ext4_find_goal(inode, map->m_lblk, partial);

1013

1014

/* the number of blocks need to allocate for [d,t]indirect blocks */

1014

/* the number of blocks need to allocate for [d,t]indirect blocks */

1015

indirect_blks = (chain + depth) - partial - 1;

1015

indirect_blks = (chain + depth) - partial - 1;

1016

1017

/*

1017

/*

1018

* Next look up the indirect map to count the totoal number of

1018

* Next look up the indirect map to count the totoal number of

1019

* direct blocks to allocate for this branch.

1019

* direct blocks to allocate for this branch.

1020

*/

1020

*/

1021

count = ext4_blks_to_allocate(partial, indirect_blks,

1021

count = ext4_blks_to_allocate(partial, indirect_blks,

1022

map->m_len, blocks_to_boundary);

1022

map->m_len, blocks_to_boundary);

1023

/*

1023

/*

1024

* Block out ext4_truncate while we alter the tree

1024

* Block out ext4_truncate while we alter the tree

1025

*/

1025

*/

1026

err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,

1026

err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,

1027

&count, goal,

1027

&count, goal,

1028

offsets + (partial - chain), partial);

1028

offsets + (partial - chain), partial);

1029

1030

/*

1030

/*

1031

* The ext4_splice_branch call will free and forget any buffers

1031

* The ext4_splice_branch call will free and forget any buffers

1032

* on the new chain if there is a failure, but that risks using

1032

* on the new chain if there is a failure, but that risks using

1033

* up transaction credits, especially for bitmaps where the

1033

* up transaction credits, especially for bitmaps where the

1034

* credits cannot be returned. Can we handle this somehow? We

1034

* credits cannot be returned. Can we handle this somehow? We

1035

* may need to return -EAGAIN upwards in the worst case. --sct

1035

* may need to return -EAGAIN upwards in the worst case. --sct

1036

*/

1036

*/

1037

if (!err)

1037

if (!err)

1038

err = ext4_splice_branch(handle, inode, map->m_lblk,

1038

err = ext4_splice_branch(handle, inode, map->m_lblk,

1039

partial, indirect_blks, count);

1039

partial, indirect_blks, count);

1040

if (err)

1040

if (err)

1041

goto cleanup;

1041

goto cleanup;

1042

1043

map->m_flags |= EXT4_MAP_NEW;

1043

map->m_flags |= EXT4_MAP_NEW;

1044

1045

ext4_update_inode_fsync_trans(handle, inode, 1);

1045

ext4_update_inode_fsync_trans(handle, inode, 1);

1046

got_it:

1046

got_it:

1047

map->m_flags |= EXT4_MAP_MAPPED;

1047

map->m_flags |= EXT4_MAP_MAPPED;

1048

map->m_pblk = le32_to_cpu(chain[depth-1].key);

1048

map->m_pblk = le32_to_cpu(chain[depth-1].key);

1049

map->m_len = count;

1049

map->m_len = count;

1050

if (count > blocks_to_boundary)

1050

if (count > blocks_to_boundary)

1051

map->m_flags |= EXT4_MAP_BOUNDARY;

1051

map->m_flags |= EXT4_MAP_BOUNDARY;

1052

err = count;

1052

err = count;

1053

/* Clean up and exit */

1053

/* Clean up and exit */

1054

partial = chain + depth - 1; /* the whole chain */

1054

partial = chain + depth - 1; /* the whole chain */

1055

cleanup:

1055

cleanup:

1056

while (partial > chain) {

1056

while (partial > chain) {

1057

BUFFER_TRACE(partial->bh, "call brelse");

1057

BUFFER_TRACE(partial->bh, "call brelse");

1058

brelse(partial->bh);

1058

brelse(partial->bh);

1059

partial--;

1059

partial--;

1060

}

1060

}

1061

out:

1061

out:

1062

trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,

1062

trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,

1063

map->m_pblk, map->m_len, err);

1063

map->m_pblk, map->m_len, err);

1064

return err;

1064

return err;

1065

}

1065

}

1066

1067

#ifdef CONFIG_QUOTA
/*
 * Return a pointer to the i_reserved_quota field of the ext4-private part
 * of @inode.  Only built when CONFIG_QUOTA is enabled.
 */
qsize_t *ext4_get_reserved_space(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	return &ei->i_reserved_quota;
}
#endif

1073

1074

/*

1074

/*

1075

* Calculate the number of metadata blocks need to reserve

1075

* Calculate the number of metadata blocks need to reserve

1076

* to allocate a new block at @lblocks for non extent file based file

1076

* to allocate a new block at @lblocks for non extent file based file

1077

*/

1077

*/

1078

static int ext4_indirect_calc_metadata_amount(struct inode *inode,

1078

static int ext4_indirect_calc_metadata_amount(struct inode *inode,

1079

sector_t lblock)

1079

sector_t lblock)

1080

{

1080

{

1081

struct ext4_inode_info *ei = EXT4_I(inode);

1081

struct ext4_inode_info *ei = EXT4_I(inode);

1082

sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);

1082

sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);

1083

int blk_bits;

1083

int blk_bits;

1084

1085

if (lblock < EXT4_NDIR_BLOCKS)

1085

if (lblock < EXT4_NDIR_BLOCKS)

1086

return 0;

1086

return 0;

1087

1088

lblock -= EXT4_NDIR_BLOCKS;

1088

lblock -= EXT4_NDIR_BLOCKS;

1089

1090

if (ei->i_da_metadata_calc_len &&

1090

if (ei->i_da_metadata_calc_len &&

1091

(lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {

1091

(lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {

1092

ei->i_da_metadata_calc_len++;

1092

ei->i_da_metadata_calc_len++;

1093

return 0;

1093

return 0;

1094

}

1094

}

1095

ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;

1095

ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;

1096

ei->i_da_metadata_calc_len = 1;

1096

ei->i_da_metadata_calc_len = 1;

1097

blk_bits = order_base_2(lblock);

1097

blk_bits = order_base_2(lblock);

1098

return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;

1098

return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;

1099

}

1099

}

1100

1101

/*

1101

/*

1102

* Calculate the number of metadata blocks need to reserve

1102

* Calculate the number of metadata blocks need to reserve

1103

* to allocate a block located at @lblock

1103

* to allocate a block located at @lblock

1104

*/

1104

*/

1105

static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)

1105

static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)

1106

{

1106

{

1107

if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))

1107

if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))

1108

return ext4_ext_calc_metadata_amount(inode, lblock);

1108

return ext4_ext_calc_metadata_amount(inode, lblock);

1109

1110

return ext4_indirect_calc_metadata_amount(inode, lblock);

1110

return ext4_indirect_calc_metadata_amount(inode, lblock);

1111

}

1111

}

1112

1113

/*

1113

/*

1114

* Called with i_data_sem down, which is important since we can call

1114

* Called with i_data_sem down, which is important since we can call

1115

* ext4_discard_preallocations() from here.

1115

* ext4_discard_preallocations() from here.

1116

*/

1116

*/

1117

void ext4_da_update_reserve_space(struct inode *inode,

1117

void ext4_da_update_reserve_space(struct inode *inode,

1118

int used, int quota_claim)

1118

int used, int quota_claim)

1119

{

1119

{

1120

struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

1120

struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

1121

struct ext4_inode_info *ei = EXT4_I(inode);

1121

struct ext4_inode_info *ei = EXT4_I(inode);

1122

1123

spin_lock(&ei->i_block_reservation_lock);

1123

spin_lock(&ei->i_block_reservation_lock);

1124

trace_ext4_da_update_reserve_space(inode, used);

1124

trace_ext4_da_update_reserve_space(inode, used);

1125

if (unlikely(used > ei->i_reserved_data_blocks)) {

1125

if (unlikely(used > ei->i_reserved_data_blocks)) {

1126

ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "

1126

ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "

1127

"with only %d reserved data blocks\n",

1127

"with only %d reserved data blocks\n",

1128

__func__, inode->i_ino, used,

1128

__func__, inode->i_ino, used,

1129

ei->i_reserved_data_blocks);

1129

ei->i_reserved_data_blocks);

1130

WARN_ON(1);

1130

WARN_ON(1);

1131

used = ei->i_reserved_data_blocks;

1131

used = ei->i_reserved_data_blocks;

1132

}

1132

}

1133

1134

/* Update per-inode reservations */

1134

/* Update per-inode reservations */

1135

ei->i_reserved_data_blocks -= used;

1135

ei->i_reserved_data_blocks -= used;

1136

ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;

1136

ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;

1137

percpu_counter_sub(&sbi->s_dirtyblocks_counter,

1137

percpu_counter_sub(&sbi->s_dirtyblocks_counter,

1138

used + ei->i_allocated_meta_blocks);

1138

used + ei->i_allocated_meta_blocks);

1139

ei->i_allocated_meta_blocks = 0;

1139

ei->i_allocated_meta_blocks = 0;

1140

1141

if (ei->i_reserved_data_blocks == 0) {

1141

if (ei->i_reserved_data_blocks == 0) {

1142

/*

1142

/*

1143

* We can release all of the reserved metadata blocks

1143

* We can release all of the reserved metadata blocks

1144

* only when we have written all of the delayed

1144

* only when we have written all of the delayed

1145

* allocation blocks.

1145

* allocation blocks.

1146

*/

1146

*/

1147

percpu_counter_sub(&sbi->s_dirtyblocks_counter,

1147

percpu_counter_sub(&sbi->s_dirtyblocks_counter,

1148

ei->i_reserved_meta_blocks);

1148

ei->i_reserved_meta_blocks);

1149

ei->i_reserved_meta_blocks = 0;

1149

ei->i_reserved_meta_blocks = 0;

1150

ei->i_da_metadata_calc_len = 0;

1150

ei->i_da_metadata_calc_len = 0;

1151

}

1151

}

1152

spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);

1152

spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);

1153

1154

/* Update quota subsystem for data blocks */

1154

/* Update quota subsystem for data blocks */

1155

if (quota_claim)

1155

if (quota_claim)

1156

dquot_claim_block(inode, used);

1156

dquot_claim_block(inode, used);

1157

else {

1157

else {

1158

/*

1158

/*

1159

* We did fallocate with an offset that is already delayed

1159

* We did fallocate with an offset that is already delayed

1160

* allocated. So on delayed allocated writeback we should

1160

* allocated. So on delayed allocated writeback we should

1161

* not re-claim the quota for fallocated blocks.

1161

* not re-claim the quota for fallocated blocks.

1162

*/

1162

*/

1163

dquot_release_reservation_block(inode, used);

1163

dquot_release_reservation_block(inode, used);

1164

}

1164

}

1165

1166

/*

1166

/*

1167

* If we have done all the pending block allocations and if

1167

* If we have done all the pending block allocations and if

1168

* there aren't any writers on the inode, we can discard the

1168

* there aren't any writers on the inode, we can discard the

1169

* inode's preallocations.

1169

* inode's preallocations.

1170

*/

1170

*/

1171

if ((ei->i_reserved_data_blocks == 0) &&

1171

if ((ei->i_reserved_data_blocks == 0) &&

1172

(atomic_read(&inode->i_writecount) == 0))

1172

(atomic_read(&inode->i_writecount) == 0))

1173

ext4_discard_preallocations(inode);

1173

ext4_discard_preallocations(inode);

1174

}

1174

}

1175

1176

static int __check_block_validity(struct inode *inode, const char *func,

1176

static int __check_block_validity(struct inode *inode, const char *func,

1177

unsigned int line,

1177

unsigned int line,

1178

struct ext4_map_blocks *map)

1178

struct ext4_map_blocks *map)

1179

{

1179

{

1180

if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,

1180

if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,

1181

map->m_len)) {

1181

map->m_len)) {

1182

ext4_error_inode(inode, func, line, map->m_pblk,

1182

ext4_error_inode(inode, func, line, map->m_pblk,

1183

"lblock %lu mapped to illegal pblock "

1183

"lblock %lu mapped to illegal pblock "

1184

"(length %d)", (unsigned long) map->m_lblk,

1184

"(length %d)", (unsigned long) map->m_lblk,

1185

map->m_len);

1185

map->m_len);

1186

return -EIO;

1186

return -EIO;

1187

}

1187

}

1188

return 0;

1188

return 0;

1189

}

1189

}

1190

1191

#define check_block_validity(inode, map) \

1191

#define check_block_validity(inode, map) \

1192

__check_block_validity((inode), __func__, __LINE__, (map))

1192

__check_block_validity((inode), __func__, __LINE__, (map))

1193

1194

/*

1194

/*

1195

* Return the number of contiguous dirty pages in a given inode

1195

* Return the number of contiguous dirty pages in a given inode

1196

* starting at page frame idx.

1196

* starting at page frame idx.

1197

*/

1197

*/

1198

static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,

1198

static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,

1199

unsigned int max_pages)

1199

unsigned int max_pages)

1200

{

1200

{

1201

struct address_space *mapping = inode->i_mapping;

1201

struct address_space *mapping = inode->i_mapping;

1202

pgoff_t index;

1202

pgoff_t index;

1203

struct pagevec pvec;

1203

struct pagevec pvec;

1204

pgoff_t num = 0;

1204

pgoff_t num = 0;

1205

int i, nr_pages, done = 0;

1205

int i, nr_pages, done = 0;

1206

1207

if (max_pages == 0)

1207

if (max_pages == 0)

1208

return 0;

1208

return 0;

1209

pagevec_init(&pvec, 0);

1209

pagevec_init(&pvec, 0);

1210

while (!done) {

1210

while (!done) {

1211

index = idx;

1211

index = idx;

1212

nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,

1212

nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,

1213

PAGECACHE_TAG_DIRTY,

1213

PAGECACHE_TAG_DIRTY,

1214

(pgoff_t)PAGEVEC_SIZE);

1214

(pgoff_t)PAGEVEC_SIZE);

1215

if (nr_pages == 0)

1215

if (nr_pages == 0)

1216

break;

1216

break;

1217

for (i = 0; i < nr_pages; i++) {

1217

for (i = 0; i < nr_pages; i++) {

1218

struct page *page = pvec.pages[i];

1218

struct page *page = pvec.pages[i];

1219

struct buffer_head *bh, *head;

1219

struct buffer_head *bh, *head;

1220

1221

lock_page(page);

1221

lock_page(page);

1222

if (unlikely(page->mapping != mapping) ||

1222

if (unlikely(page->mapping != mapping) ||

1223

!PageDirty(page) ||

1223

!PageDirty(page) ||

1224

PageWriteback(page) ||

1224

PageWriteback(page) ||

1225

page->index != idx) {

1225

page->index != idx) {

1226

done = 1;

1226

done = 1;

1227

unlock_page(page);

1227

unlock_page(page);

1228

break;

1228

break;

1229

}

1229

}

1230

if (page_has_buffers(page)) {

1230

if (page_has_buffers(page)) {

1231

bh = head = page_buffers(page);

1231

bh = head = page_buffers(page);

1232

do {

1232

do {

1233

if (!buffer_delay(bh) &&

1233

if (!buffer_delay(bh) &&

1234

!buffer_unwritten(bh))

1234

!buffer_unwritten(bh))

1235

done = 1;

1235

done = 1;

1236

bh = bh->b_this_page;

1236

bh = bh->b_this_page;

1237

} while (!done && (bh != head));

1237

} while (!done && (bh != head));

1238

}

1238

}

1239

unlock_page(page);

1239

unlock_page(page);

1240

if (done)

1240

if (done)

1241

break;

1241

break;

1242

idx++;

1242

idx++;

1243

num++;

1243

num++;

1244

if (num >= max_pages) {

1244

if (num >= max_pages) {

1245

done = 1;

1245

done = 1;

1246

break;

1246

break;

1247

}

1247

}

1248

}

1248

}

1249

pagevec_release(&pvec);

1249

pagevec_release(&pvec);

1250

}

1250

}

1251

return num;

1251

return num;

1252

}

1252

}

1253

1254

/*

1254

/*

1255

* The ext4_map_blocks() function tries to look up the requested blocks,

1255

* The ext4_map_blocks() function tries to look up the requested blocks,

1256

* and returns if the blocks are already mapped.

1256

* and returns if the blocks are already mapped.

1257

*

1257

*

1258

* Otherwise it takes the write lock of the i_data_sem and allocate blocks

1258

* Otherwise it takes the write lock of the i_data_sem and allocate blocks

1259

* and store the allocated blocks in the result buffer head and mark it

1259

* and store the allocated blocks in the result buffer head and mark it

1260

* mapped.

1260

* mapped.

1261

*

1261

*

1262

* If file type is extents based, it will call ext4_ext_map_blocks(),

1262

* If file type is extents based, it will call ext4_ext_map_blocks(),

1263

* Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping

1263

* Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping

1264

* based files

1264

* based files

1265

*

1265

*

1266

* On success, it returns the number of blocks being mapped or allocated.

1266

* On success, it returns the number of blocks being mapped or allocated.

1267

* if create==0 and the blocks are pre-allocated and uninitialized block,

1267

* if create==0 and the blocks are pre-allocated and uninitialized block,

1268

* the result buffer head is unmapped. If the create ==1, it will make sure

1268

* the result buffer head is unmapped. If the create ==1, it will make sure

1269

* the buffer head is mapped.

1269

* the buffer head is mapped.

1270

*

1270

*

1271

* It returns 0 if plain look up failed (blocks have not been allocated), in

1271

* It returns 0 if plain look up failed (blocks have not been allocated), in

1272

* that case, buffer head is unmapped

1272

* that case, buffer head is unmapped

1273

*

1273

*

1274

* It returns the error in case of allocation failure.

1274

* It returns the error in case of allocation failure.

1275

*/

1275

*/

1276

/*
 * ext4_map_blocks - look up, and optionally allocate, the blocks behind @map
 * @handle: journal handle, passed through to the extent/indirect mappers
 * @inode:  inode whose logical blocks are being mapped
 * @map:    in: m_lblk/m_len describe the request; out: m_flags (reset to 0
 *          on entry) and whatever the extent/indirect mapper fills in
 * @flags:  EXT4_GET_BLOCKS_* flags; without EXT4_GET_BLOCKS_CREATE this is a
 *          pure lookup
 *
 * A lookup is always done first under the read side of i_data_sem.  Only if
 * that did not yield a mapped range and creation was requested is the mapper
 * called again, with the caller's flags, under the write side of i_data_sem.
 *
 * Returns the value produced by the mapper (> 0 when blocks were mapped,
 * 0 when the lookup found nothing), or the non-zero result of
 * check_block_validity() when a mapped range fails validation.
 */
int ext4_map_blocks(handle_t *handle, struct inode *inode,
		    struct ext4_map_blocks *map, int flags)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	const int da_reserve = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE;
	int retval;

	map->m_flags = 0;
	ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
		  (unsigned long) map->m_lblk);

	/*
	 * Phase 1: plain lookup (flags == 0), so nothing is allocated and
	 * the read lock on i_data_sem is sufficient.
	 */
	down_read(&ei->i_data_sem);
	retval = ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ?
			ext4_ext_map_blocks(handle, inode, map, 0) :
			ext4_ind_map_blocks(handle, inode, map, 0);
	up_read(&ei->i_data_sem);

	if (retval > 0 && (map->m_flags & EXT4_MAP_MAPPED)) {
		int err = check_block_validity(inode, map);

		if (err != 0)
			return err;
	}

	/*
	 * Done if the caller only wanted a lookup, or if the blocks were
	 * already allocated.  (A lookup over preallocated blocks comes back
	 * unmapped, so such a range falls through to the create path.)
	 */
	if (!(flags & EXT4_GET_BLOCKS_CREATE) ||
	    (retval > 0 && (map->m_flags & EXT4_MAP_MAPPED)))
		return retval;

	/*
	 * The lookup may have reported the range as unwritten (part of an
	 * uninitialized extent).  We are now committed to converting all or
	 * part of it, so drop the flag: unwritten and mapped must never be
	 * set together on the resulting buffer_head.
	 */
	map->m_flags &= ~EXT4_MAP_UNWRITTEN;

	/*
	 * Phase 2: allocating blocks, or writing into an uninitialized
	 * extent, may update i_data, so take i_data_sem for writing.
	 */
	down_write(&ei->i_data_sem);

	/*
	 * On the delayed-allocation writeout path the blocks were reserved
	 * earlier; flag that on the inode so the mapper does not account
	 * for them a second time.
	 */
	if (da_reserve)
		ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);

	/*
	 * The extents flag is re-tested here because a migrate could have
	 * changed the inode's format since the lookup above.
	 */
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		retval = ext4_ext_map_blocks(handle, inode, map, flags);
	} else {
		retval = ext4_ind_map_blocks(handle, inode, map, flags);

		if (retval > 0) {
			/*
			 * Newly allocated blocks change i_data's format;
			 * clear the migrate state to force a concurrent
			 * migrate to fail.
			 */
			if (map->m_flags & EXT4_MAP_NEW)
				ext4_clear_inode_state(inode,
						       EXT4_STATE_EXT_MIGRATE);

			/*
			 * The deferred allocation has now happened, so
			 * update the reserved block/metadata counts.  Safe
			 * to do here because fallocate is not supported for
			 * non-extent files.
			 */
			if (da_reserve)
				ext4_da_update_reserve_space(inode, retval, 1);
		}
	}

	if (da_reserve)
		ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);

	up_write(&ei->i_data_sem);

	if (retval > 0 && (map->m_flags & EXT4_MAP_MAPPED)) {
		int err = check_block_validity(inode, map);

		if (err != 0)
			return err;
	}
	return retval;
}

1384

1385

/* Maximum number of blocks we map for direct IO at once. */

1385

/* Maximum number of blocks we map for direct IO at once. */

1386

#define DIO_MAX_BLOCKS 4096

1386

#define DIO_MAX_BLOCKS 4096

1387

1388

static int _ext4_get_block(struct inode *inode, sector_t iblock,

1388

static int _ext4_get_block(struct inode *inode, sector_t iblock,

1389

struct buffer_head *bh, int flags)

1389

struct buffer_head *bh, int flags)

1390

{

1390

{

1391

handle_t *handle = ext4_journal_current_handle();

1391

handle_t *handle = ext4_journal_current_handle();

1392

struct ext4_map_blocks map;

1392

struct ext4_map_blocks map;

1393

int ret = 0, started = 0;

1393

int ret = 0, started = 0;

1394

int dio_credits;

1394

int dio_credits;

1395

1396

map.m_lblk = iblock;

1396

map.m_lblk = iblock;

1397

map.m_len = bh->b_size >> inode->i_blkbits;

1397

map.m_len = bh->b_size >> inode->i_blkbits;

1398

1399

if (flags && !handle) {

1399

if (flags && !handle) {

1400

/* Direct IO write... */

1400

/* Direct IO write... */

1401

if (map.m_len > DIO_MAX_BLOCKS)

1401

if (map.m_len > DIO_MAX_BLOCKS)

1402

map.m_len = DIO_MAX_BLOCKS;

1402

map.m_len = DIO_MAX_BLOCKS;

1403

dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);

1403

dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);

1404

handle = ext4_journal_start(inode, dio_credits);

1404

handle = ext4_journal_start(inode, dio_credits);

1405

if (IS_ERR(handle)) {

1405

if (IS_ERR(handle)) {

1406

ret = PTR_ERR(handle);

1406

ret = PTR_ERR(handle);

1407

return ret;

1407

return ret;

1408

}

1408

}

1409

started = 1;

1409

started = 1;

1410

}

1410

}

1411

1412

ret = ext4_map_blocks(handle, inode, &map, flags);

1412

ret = ext4_map_blocks(handle, inode, &map, flags);

1413

if (ret > 0) {

1413

if (ret > 0) {

1414

map_bh(bh, inode->i_sb, map.m_pblk);

1414

map_bh(bh, inode->i_sb, map.m_pblk);

1415

bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;

1415

bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;

1416

bh->b_size = inode->i_sb->s_blocksize * map.m_len;

1416

bh->b_size = inode->i_sb->s_blocksize * map.m_len;

1417

ret = 0;

1417

ret = 0;

1418

}

1418

}

1419

if (started)

1419

if (started)

1420

ext4_journal_stop(handle);

1420

ext4_journal_stop(handle);

1421

return ret;

1421

return ret;

1422

}

1422

}

1423

1424

/*
 * ext4_get_block - get_block callback taking a boolean @create
 *
 * Translates @create into EXT4_GET_BLOCKS_CREATE (or no flags at all) and
 * forwards to _ext4_get_block(), whose result is returned unchanged.
 */
int ext4_get_block(struct inode *inode, sector_t iblock,
		   struct buffer_head *bh, int create)
{
	int map_flags = 0;

	if (create)
		map_flags = EXT4_GET_BLOCKS_CREATE;

	return _ext4_get_block(inode, iblock, bh, map_flags);
}

1430

1431

/*

1431

/*

1432

* `handle' can be NULL if create is zero

1432

* `handle' can be NULL if create is zero

1433

*/

1433

*/

1434

struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,

1434

struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,

1435

ext4_lblk_t block, int create, int *errp)

1435

ext4_lblk_t block, int create, int *errp)

1436

{

1436

{

1437

struct ext4_map_blocks map;

1437

struct ext4_map_blocks map;

1438

struct buffer_head *bh;

1438

struct buffer_head *bh;

1439

int fatal = 0, err;

1439

int fatal = 0, err;

1440

1441

J_ASSERT(handle != NULL || create == 0);

1441

J_ASSERT(handle != NULL || create == 0);

1442

1443

map.m_lblk = block;

1443

map.m_lblk = block;

1444

map.m_len = 1;

1444

map.m_len = 1;

1445

err = ext4_map_blocks(handle, inode, &map,

1445

err = ext4_map_blocks(handle, inode, &map,

1446

create ? EXT4_GET_BLOCKS_CREATE : 0);

1446

create ? EXT4_GET_BLOCKS_CREATE : 0);

1447

1448

if (err < 0)

1448

if (err < 0)

1449

*errp = err;

1449

*errp = err;

1450

if (err <= 0)

1450

if (err <= 0)

1451

return NULL;

1451

return NULL;

1452

*errp = 0;

1452

*errp = 0;

1453

1454

bh = sb_getblk(inode->i_sb, map.m_pblk);

1454

bh = sb_getblk(inode->i_sb, map.m_pblk);

1455

if (!bh) {

1455

if (!bh) {

1456

*errp = -EIO;

1456

*errp = -EIO;

1457

return NULL;

1457

return NULL;

1458

}

1458

}

1459

if (map.m_flags & EXT4_MAP_NEW) {

1459

if (map.m_flags & EXT4_MAP_NEW) {

1460

J_ASSERT(create != 0);

1460

J_ASSERT(create != 0);

1461

J_ASSERT(handle != NULL);

1461

J_ASSERT(handle != NULL);

1462

1463

/*

1463

/*

1464

* Now that we do not always journal data, we should

1464

* Now that we do not always journal data, we should

1465

* keep in mind whether this should always journal the

1465

* keep in mind whether this should always journal the

1466

* new buffer as metadata. For now, regular file

1466

* new buffer as metadata. For now, regular file

1467

* writes use ext4_get_block instead, so it's not a

1467

* writes use ext4_get_block instead, so it's not a

1468

* problem.

1468

* problem.

1469

*/

1469

*/

1470

lock_buffer(bh);

1470

lock_buffer(bh);

1471

BUFFER_TRACE(bh, "call get_create_access");

1471

BUFFER_TRACE(bh, "call get_create_access");

1472

fatal = ext4_journal_get_create_access(handle, bh);

1472

fatal = ext4_journal_get_create_access(handle, bh);

1473

if (!fatal && !buffer_uptodate(bh)) {

1473

if (!fatal && !buffer_uptodate(bh)) {

1474

memset(bh->b_data, 0, inode->i_sb->s_blocksize);

1474

memset(bh->b_data, 0, inode->i_sb->s_blocksize);

1475

set_buffer_uptodate(bh);

1475

set_buffer_uptodate(bh);

1476

}

1476

}

1477

unlock_buffer(bh);

1477

unlock_buffer(bh);

1478

BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");

1478

BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");

1479

err = ext4_handle_dirty_metadata(handle, inode, bh);

1479

err = ext4_handle_dirty_metadata(handle, inode, bh);

1480

if (!fatal)

1480

if (!fatal)

1481

fatal = err;

1481

fatal = err;

1482

} else {

1482

} else {

1483

BUFFER_TRACE(bh, "not a new buffer");

1483

BUFFER_TRACE(bh, "not a new buffer");

1484

}

1484

}

1485

if (fatal) {

1485

if (fatal) {

1486

*errp = fatal;

1486

*errp = fatal;

1487

brelse(bh);

1487

brelse(bh);

1488

bh = NULL;

1488

bh = NULL;

1489

}

1489

}

1490

return bh;

1490

return bh;

1491

}

1491

}

1492

1493

struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,

1493

struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,

1494

ext4_lblk_t block, int create, int *err)

1494

ext4_lblk_t block, int create, int *err)

1495

{

1495

{

1496

struct buffer_head *bh;

1496

struct buffer_head *bh;

1497

1498

bh = ext4_getblk(handle, inode, block, create, err);

1498

bh = ext4_getblk(handle, inode, block, create, err);

1499

if (!bh)

1499

if (!bh)

1500

return bh;

1500

return bh;

1501

if (buffer_uptodate(bh))

1501

if (buffer_uptodate(bh))

1502

return bh;

1502

return bh;

1503

ll_rw_block(READ_META, 1, &bh);

1503

ll_rw_block(READ_META, 1, &bh);

1504

wait_on_buffer(bh);

1504

wait_on_buffer(bh);

1505

if (buffer_uptodate(bh))

1505

if (buffer_uptodate(bh))

1506

return bh;

1506

return bh;

1507

put_bh(bh);

1507

put_bh(bh);

1508

*err = -EIO;

1508

*err = -EIO;

1509

return NULL;

1509

return NULL;

1510

}

1510

}

1511

1512

/*
 * walk_page_buffers - call @fn on every buffer overlapping [@from, @to)
 * @handle:  passed through to @fn
 * @head:    first buffer of the circular b_this_page ring; its b_size is
 *           used as the size of every buffer in the ring
 * @from:    start offset of the range of interest
 * @to:      end offset (exclusive) of the range of interest
 * @partial: optional; set to 1 if any buffer outside the range is not
 *           uptodate
 * @fn:      callback invoked for each buffer inside the range
 *
 * The walk starts at offset 0 with @head and stops once the ring wraps back
 * to @head, or as soon as @fn returns non-zero.
 *
 * Returns 0, or the first non-zero value returned by @fn.
 */
static int walk_page_buffers(handle_t *handle,
			     struct buffer_head *head,
			     unsigned from,
			     unsigned to,
			     int *partial,
			     int (*fn)(handle_t *handle,
				       struct buffer_head *bh))
{
	unsigned blocksize = head->b_size;
	struct buffer_head *cur = head;
	unsigned start = 0;
	int ret = 0;

	/*
	 * "cur != head || start == 0" is true on the first pass (start is
	 * 0) and then stays true until the ring leads back to head.
	 */
	while (ret == 0 && (cur != head || start == 0)) {
		/* Fetch the successor before @fn gets to touch the buffer. */
		struct buffer_head *following = cur->b_this_page;
		unsigned end = start + blocksize;

		if (end > from && start < to) {
			/* ret is known to be 0 here, so take fn's result. */
			ret = (*fn)(handle, cur);
		} else if (partial && !buffer_uptodate(cur)) {
			*partial = 1;
		}

		start = end;
		cur = following;
	}
	return ret;
}

1542

1543

/*

1543

/*

1544

* To preserve ordering, it is essential that the hole instantiation and

1544

* To preserve ordering, it is essential that the hole instantiation and

1545

* the data write be encapsulated in a single transaction. We cannot

1545

* the data write be encapsulated in a single transaction. We cannot

1546

* close off a transaction and start a new one between the ext4_get_block()

1546

* close off a transaction and start a new one between the ext4_get_block()

1547

* and the commit_write(). So doing the jbd2_journal_start at the start of

1547

* and the commit_write(). So doing the jbd2_journal_start at the start of

1548

* prepare_write() is the right place.

1548

* prepare_write() is the right place.

1549

*

1549

*

1550

* Also, this function can nest inside ext4_writepage() ->

1550

* Also, this function can nest inside ext4_writepage() ->

1551

* block_write_full_page(). In that case, we *know* that ext4_writepage()

1551

* block_write_full_page(). In that case, we *know* that ext4_writepage()

1552

* has generated enough buffer credits to do the whole page. So we won't

1552

* has generated enough buffer credits to do the whole page. So we won't

1553

* block on the journal in that case, which is good, because the caller may

1553

* block on the journal in that case, which is good, because the caller may

1554

* be PF_MEMALLOC.

1554

* be PF_MEMALLOC.

1555

*

1555

*

1556

* By accident, ext4 can be reentered when a transaction is open via

1556

* By accident, ext4 can be reentered when a transaction is open via

1557

* quota file writes. If we were to commit the transaction while thus

1557

* quota file writes. If we were to commit the transaction while thus

1558

* reentered, there can be a deadlock - we would be holding a quota

1558

* reentered, there can be a deadlock - we would be holding a quota

1559

* lock, and the commit would never complete if another thread had a

1559

* lock, and the commit would never complete if another thread had a

1560

* transaction open and was blocking on the quota lock - a ranking

1560

* transaction open and was blocking on the quota lock - a ranking

1561

* violation.

1561

* violation.

1562

*

1562

*

1563

* So what we do is to rely on the fact that jbd2_journal_stop/journal_start

1563

* So what we do is to rely on the fact that jbd2_journal_stop/journal_start

1564

* will _not_ run commit under these circumstances because handle->h_ref

1564

* will _not_ run commit under these circumstances because handle->h_ref

1565

* is elevated. We'll still have enough credits for the tiny quotafile

1565

* is elevated. We'll still have enough credits for the tiny quotafile

1566

* write.

1566

* write.

1567

*/

1567

*/

1568

/*
 * Per-buffer callback: obtain journal write access to @bh.
 *
 * Buffers that are unmapped or freed are skipped (returns 0).  If the buffer
 * was dirty on entry, the dirty bit is cleared before asking for write
 * access and the buffer is then re-dirtied through the journal instead.
 *
 * Returns 0 on success or the error from the journal helpers.
 */
static int do_journal_get_write_access(handle_t *handle,
				       struct buffer_head *bh)
{
	const int was_dirty = buffer_dirty(bh);
	int err;

	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;

	/*
	 * __block_write_begin() may have dirtied this buffer, and
	 * jbd2_journal_get_write_access() could complain about fs integrity
	 * when handed a dirty buffer.  Clearing the bit here is harmless:
	 * it happens before the page lock is released, so writeback can
	 * never write the buffer in between.
	 */
	if (was_dirty)
		clear_buffer_dirty(bh);

	err = ext4_journal_get_write_access(handle, bh);
	if (err || !was_dirty)
		return err;

	/* Restore the dirty state, this time as journalled metadata. */
	return ext4_handle_dirty_metadata(handle, NULL, bh);
}

1591

1592

/*

1592

/*

1593

* Truncate blocks that were not used by write. We have to truncate the

1593

* Truncate blocks that were not used by write. We have to truncate the

1594

* pagecache as well so that corresponding buffers get properly unmapped.

1594

* pagecache as well so that corresponding buffers get properly unmapped.

1595

*/

1595

*/

1596

static void ext4_truncate_failed_write(struct inode *inode)

1596

static void ext4_truncate_failed_write(struct inode *inode)

1597

{

1597

{

1598

truncate_inode_pages(inode->i_mapping, inode->i_size);

1598

truncate_inode_pages(inode->i_mapping, inode->i_size);

1599

ext4_truncate(inode);

1599

ext4_truncate(inode);

1600

}

1600

}

1601

1602

static int ext4_get_block_write(struct inode *inode, sector_t iblock,

1602

static int ext4_get_block_write(struct inode *inode, sector_t iblock,

1603

struct buffer_head *bh_result, int create);

1603

struct buffer_head *bh_result, int create);

1604

static int ext4_write_begin(struct file *file, struct address_space *mapping,

1604

static int ext4_write_begin(struct file *file, struct address_space *mapping,

1605

loff_t pos, unsigned len, unsigned flags,

1605

loff_t pos, unsigned len, unsigned flags,

1606

struct page **pagep, void **fsdata)

1606

struct page **pagep, void **fsdata)

1607

{

1607

{

1608

struct inode *inode = mapping->host;

1608

struct inode *inode = mapping->host;

1609

int ret, needed_blocks;

1609

int ret, needed_blocks;

1610

handle_t *handle;

1610

handle_t *handle;

1611

int retries = 0;

1611

int retries = 0;

1612

struct page *page;

1612

struct page *page;

1613

pgoff_t index;

1613

pgoff_t index;

1614

unsigned from, to;

1614

unsigned from, to;

1615

1616

trace_ext4_write_begin(inode, pos, len, flags);

1616

trace_ext4_write_begin(inode, pos, len, flags);

1617

/*

1617

/*

1618

* Reserve one block more for addition to orphan list in case

1618

* Reserve one block more for addition to orphan list in case

1619

* we allocate blocks but write fails for some reason

1619

* we allocate blocks but write fails for some reason

1620

*/

1620

*/

1621

needed_blocks = ext4_writepage_trans_blocks(inode) + 1;

1621

needed_blocks = ext4_writepage_trans_blocks(inode) + 1;

1622

index = pos >> PAGE_CACHE_SHIFT;

1622

index = pos >> PAGE_CACHE_SHIFT;

1623

from = pos & (PAGE_CACHE_SIZE - 1);

1623

from = pos & (PAGE_CACHE_SIZE - 1);

1624

to = from + len;

1624

to = from + len;

1625

1626

retry:

1626

retry:

1627

handle = ext4_journal_start(inode, needed_blocks);

1627

handle = ext4_journal_start(inode, needed_blocks);

1628

if (IS_ERR(handle)) {

1628

if (IS_ERR(handle)) {

1629

ret = PTR_ERR(handle);

1629

ret = PTR_ERR(handle);

1630

goto out;

1630

goto out;

1631

}

1631

}

1632

1633

/* We cannot recurse into the filesystem as the transaction is already

1633

/* We cannot recurse into the filesystem as the transaction is already

1634

* started */

1634

* started */

1635

flags |= AOP_FLAG_NOFS;

1635

flags |= AOP_FLAG_NOFS;

1636

1637

page = grab_cache_page_write_begin(mapping, index, flags);

1637

page = grab_cache_page_write_begin(mapping, index, flags);

1638

if (!page) {

1638

if (!page) {

1639

ext4_journal_stop(handle);

1639

ext4_journal_stop(handle);

1640

ret = -ENOMEM;

1640

ret = -ENOMEM;

1641

goto out;

1641

goto out;

1642

}

1642

}

1643

*pagep = page;

1643

*pagep = page;

1644

1645

if (ext4_should_dioread_nolock(inode))

1645

if (ext4_should_dioread_nolock(inode))

1646

ret = __block_write_begin(page, pos, len, ext4_get_block_write);

1646

ret = __block_write_begin(page, pos, len, ext4_get_block_write);

1647

else

1647

else

1648

ret = __block_write_begin(page, pos, len, ext4_get_block);

1648

ret = __block_write_begin(page, pos, len, ext4_get_block);

1649

1650

if (!ret && ext4_should_journal_data(inode)) {

1650

if (!ret && ext4_should_journal_data(inode)) {

1651

ret = walk_page_buffers(handle, page_buffers(page),

1651

ret = walk_page_buffers(handle, page_buffers(page),

1652

from, to, NULL, do_journal_get_write_access);

1652

from, to, NULL, do_journal_get_write_access);

1653

}

1653

}

1654

1655

if (ret) {

1655

if (ret) {

1656

unlock_page(page);

1656

unlock_page(page);

1657

page_cache_release(page);

1657

page_cache_release(page);

1658

/*

1658

/*

1659

* __block_write_begin may have instantiated a few blocks

1659

* __block_write_begin may have instantiated a few blocks

1660

* outside i_size. Trim these off again. Don't need

1660

* outside i_size. Trim these off again. Don't need

1661

* i_size_read because we hold i_mutex.

1661

* i_size_read because we hold i_mutex.

1662

*

1662

*

1663

* Add inode to orphan list in case we crash before

1663

* Add inode to orphan list in case we crash before

1664

* truncate finishes

1664

* truncate finishes

1665

*/

1665

*/

1666

if (pos + len > inode->i_size && ext4_can_truncate(inode))

1666

if (pos + len > inode->i_size && ext4_can_truncate(inode))

1667

ext4_orphan_add(handle, inode);

1667

ext4_orphan_add(handle, inode);

1668

1669

ext4_journal_stop(handle);

1669

ext4_journal_stop(handle);

1670

if (pos + len > inode->i_size) {

1670

if (pos + len > inode->i_size) {

1671

ext4_truncate_failed_write(inode);

1671

ext4_truncate_failed_write(inode);

1672

/*

1672

/*

1673

* If truncate failed early the inode might

1673

* If truncate failed early the inode might

1674

* still be on the orphan list; we need to

1674

* still be on the orphan list; we need to

1675

* make sure the inode is removed from the

1675

* make sure the inode is removed from the

1676

* orphan list in that case.

1676

* orphan list in that case.

1677

*/

1677

*/

1678

if (inode->i_nlink)

1678

if (inode->i_nlink)

1679

ext4_orphan_del(NULL, inode);

1679

ext4_orphan_del(NULL, inode);

1680

}

1680

}

1681

}

1681

}

1682

1683

if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))

1683

if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))

1684

goto retry;

1684

goto retry;

1685

out:

1685

out:

1686

return ret;

1686

return ret;

1687

}

1687

}

1688

1689

/* For write_end() in data=journal mode */

1689

/* For write_end() in data=journal mode */

1690

static int write_end_fn(handle_t *handle, struct buffer_head *bh)

1690

static int write_end_fn(handle_t *handle, struct buffer_head *bh)

1691

{

1691

{

1692

if (!buffer_mapped(bh) || buffer_freed(bh))

1692

if (!buffer_mapped(bh) || buffer_freed(bh))

1693

return 0;

1693

return 0;

1694

set_buffer_uptodate(bh);

1694

set_buffer_uptodate(bh);

1695

return ext4_handle_dirty_metadata(handle, NULL, bh);

1695

return ext4_handle_dirty_metadata(handle, NULL, bh);

1696

}

1696

}

1697

1698

static int ext4_generic_write_end(struct file *file,

1698

static int ext4_generic_write_end(struct file *file,

1699

struct address_space *mapping,

1699

struct address_space *mapping,

1700

loff_t pos, unsigned len, unsigned copied,

1700

loff_t pos, unsigned len, unsigned copied,

1701

struct page *page, void *fsdata)

1701

struct page *page, void *fsdata)

1702

{

1702

{

1703

int i_size_changed = 0;

1703

int i_size_changed = 0;

1704

struct inode *inode = mapping->host;

1704

struct inode *inode = mapping->host;

1705

handle_t *handle = ext4_journal_current_handle();

1705

handle_t *handle = ext4_journal_current_handle();

1706

1707

copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

1707

copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

1708

1709

/*

1709

/*

1710

* No need to use i_size_read() here, the i_size

1710

* No need to use i_size_read() here, the i_size

1711

* cannot change under us because we hold i_mutex.

1711

* cannot change under us because we hold i_mutex.

1712

*

1712

*

1713

* But it's important to update i_size while still holding page lock:

1713

* But it's important to update i_size while still holding page lock:

1714

* page writeout could otherwise come in and zero beyond i_size.

1714

* page writeout could otherwise come in and zero beyond i_size.

1715

*/

1715

*/

1716

if (pos + copied > inode->i_size) {

1716

if (pos + copied > inode->i_size) {

1717

i_size_write(inode, pos + copied);

1717

i_size_write(inode, pos + copied);

1718

i_size_changed = 1;

1718

i_size_changed = 1;

1719

}

1719

}

1720

1721

if (pos + copied > EXT4_I(inode)->i_disksize) {

1721

if (pos + copied > EXT4_I(inode)->i_disksize) {

1722

/* We need to mark inode dirty even if

1722

/* We need to mark inode dirty even if

1723

* new_i_size is less that inode->i_size

1723

* new_i_size is less that inode->i_size

1724

* bu greater than i_disksize.(hint delalloc)

1724

* bu greater than i_disksize.(hint delalloc)

1725

*/

1725

*/

1726

ext4_update_i_disksize(inode, (pos + copied));

1726

ext4_update_i_disksize(inode, (pos + copied));

1727

i_size_changed = 1;

1727

i_size_changed = 1;

1728

}

1728

}

1729

unlock_page(page);

1729

unlock_page(page);

1730

page_cache_release(page);

1730

page_cache_release(page);

1731

1732

/*

1732

/*

1733

* Don't mark the inode dirty under page lock. First, it unnecessarily

1733

* Don't mark the inode dirty under page lock. First, it unnecessarily

1734

* makes the holding time of page lock longer. Second, it forces lock

1734

* makes the holding time of page lock longer. Second, it forces lock

1735

* ordering of page lock and transaction start for journaling

1735

* ordering of page lock and transaction start for journaling

1736

* filesystems.

1736

* filesystems.

1737

*/

1737

*/

1738

if (i_size_changed)

1738

if (i_size_changed)

1739

ext4_mark_inode_dirty(handle, inode);

1739

ext4_mark_inode_dirty(handle, inode);

1740

1741

return copied;

1741

return copied;

1742

}

1742

}

1743

1744

/*

1744

/*

1745

* We need to pick up the new inode size which generic_commit_write gave us

1745

* We need to pick up the new inode size which generic_commit_write gave us

1746

* `file' can be NULL - eg, when called from page_symlink().

1746

* `file' can be NULL - eg, when called from page_symlink().

1747

*

1747

*

1748

* ext4 never places buffers on inode->i_mapping->private_list. metadata

1748

* ext4 never places buffers on inode->i_mapping->private_list. metadata

1749

* buffers are managed internally.

1749

* buffers are managed internally.

1750

*/

1750

*/

1751

static int ext4_ordered_write_end(struct file *file,

1751

static int ext4_ordered_write_end(struct file *file,

1752

struct address_space *mapping,

1752

struct address_space *mapping,

1753

loff_t pos, unsigned len, unsigned copied,

1753

loff_t pos, unsigned len, unsigned copied,

1754

struct page *page, void *fsdata)

1754

struct page *page, void *fsdata)

1755

{

1755

{

1756

handle_t *handle = ext4_journal_current_handle();

1756

handle_t *handle = ext4_journal_current_handle();

1757

struct inode *inode = mapping->host;

1757

struct inode *inode = mapping->host;

1758

int ret = 0, ret2;

1758

int ret = 0, ret2;

1759

1760

trace_ext4_ordered_write_end(inode, pos, len, copied);

1760

trace_ext4_ordered_write_end(inode, pos, len, copied);

1761

ret = ext4_jbd2_file_inode(handle, inode);

1761

ret = ext4_jbd2_file_inode(handle, inode);

1762

1763

if (ret == 0) {

1763

if (ret == 0) {

1764

ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,

1764

ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,

1765

page, fsdata);

1765

page, fsdata);

1766

copied = ret2;

1766

copied = ret2;

1767

if (pos + len > inode->i_size && ext4_can_truncate(inode))

1767

if (pos + len > inode->i_size && ext4_can_truncate(inode))

1768

/* if we have allocated more blocks and copied

1768

/* if we have allocated more blocks and copied

1769

* less. We will have blocks allocated outside

1769

* less. We will have blocks allocated outside

1770

* inode->i_size. So truncate them

1770

* inode->i_size. So truncate them

1771

*/

1771

*/

1772

ext4_orphan_add(handle, inode);

1772

ext4_orphan_add(handle, inode);

1773

if (ret2 < 0)

1773

if (ret2 < 0)

1774

ret = ret2;

1774

ret = ret2;

1775

}

1775

}

1776

ret2 = ext4_journal_stop(handle);

1776

ret2 = ext4_journal_stop(handle);

1777

if (!ret)

1777

if (!ret)

1778

ret = ret2;

1778

ret = ret2;

1779

1780

if (pos + len > inode->i_size) {

1780

if (pos + len > inode->i_size) {

1781

ext4_truncate_failed_write(inode);

1781

ext4_truncate_failed_write(inode);

1782

/*

1782

/*

1783

* If truncate failed early the inode might still be

1783

* If truncate failed early the inode might still be

1784

* on the orphan list; we need to make sure the inode

1784

* on the orphan list; we need to make sure the inode

1785

* is removed from the orphan list in that case.

1785

* is removed from the orphan list in that case.

1786

*/

1786

*/

1787

if (inode->i_nlink)

1787

if (inode->i_nlink)

1788

ext4_orphan_del(NULL, inode);

1788

ext4_orphan_del(NULL, inode);

1789

}

1789

}

1790

1791

1792

return ret ? ret : copied;

1792

return ret ? ret : copied;

1793

}

1793

}

1794

1795

static int ext4_writeback_write_end(struct file *file,

1795

static int ext4_writeback_write_end(struct file *file,

1796

struct address_space *mapping,

1796

struct address_space *mapping,

1797

loff_t pos, unsigned len, unsigned copied,

1797

loff_t pos, unsigned len, unsigned copied,

1798

struct page *page, void *fsdata)

1798

struct page *page, void *fsdata)

1799

{

1799

{

1800

handle_t *handle = ext4_journal_current_handle();

1800

handle_t *handle = ext4_journal_current_handle();

1801

struct inode *inode = mapping->host;

1801

struct inode *inode = mapping->host;

1802

int ret = 0, ret2;

1802

int ret = 0, ret2;

1803

1804

trace_ext4_writeback_write_end(inode, pos, len, copied);

1804

trace_ext4_writeback_write_end(inode, pos, len, copied);

1805

ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,

1805

ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,

1806

page, fsdata);

1806

page, fsdata);

1807

copied = ret2;

1807

copied = ret2;

1808

if (pos + len > inode->i_size && ext4_can_truncate(inode))

1808

if (pos + len > inode->i_size && ext4_can_truncate(inode))

1809

/* if we have allocated more blocks and copied

1809

/* if we have allocated more blocks and copied

1810

* less. We will have blocks allocated outside

1810

* less. We will have blocks allocated outside

1811

* inode->i_size. So truncate them

1811

* inode->i_size. So truncate them

1812

*/

1812

*/

1813

ext4_orphan_add(handle, inode);

1813

ext4_orphan_add(handle, inode);

1814

1815

if (ret2 < 0)

1815

if (ret2 < 0)

1816

ret = ret2;

1816

ret = ret2;

1817

1818

ret2 = ext4_journal_stop(handle);

1818

ret2 = ext4_journal_stop(handle);

1819

if (!ret)

1819

if (!ret)

1820

ret = ret2;

1820

ret = ret2;

1821

1822

if (pos + len > inode->i_size) {

1822

if (pos + len > inode->i_size) {

1823

ext4_truncate_failed_write(inode);

1823

ext4_truncate_failed_write(inode);

1824

/*

1824

/*

1825

* If truncate failed early the inode might still be

1825

* If truncate failed early the inode might still be

1826

* on the orphan list; we need to make sure the inode

1826

* on the orphan list; we need to make sure the inode

1827

* is removed from the orphan list in that case.

1827

* is removed from the orphan list in that case.

1828

*/

1828

*/

1829

if (inode->i_nlink)

1829

if (inode->i_nlink)

1830

ext4_orphan_del(NULL, inode);

1830

ext4_orphan_del(NULL, inode);

1831

}

1831

}

1832

1833

return ret ? ret : copied;

1833

return ret ? ret : copied;

1834

}

1834

}

1835

1836

static int ext4_journalled_write_end(struct file *file,

1836

static int ext4_journalled_write_end(struct file *file,

1837

struct address_space *mapping,

1837

struct address_space *mapping,

1838

loff_t pos, unsigned len, unsigned copied,

1838

loff_t pos, unsigned len, unsigned copied,

1839

struct page *page, void *fsdata)

1839

struct page *page, void *fsdata)

1840

{

1840

{

1841

handle_t *handle = ext4_journal_current_handle();

1841

handle_t *handle = ext4_journal_current_handle();

1842

struct inode *inode = mapping->host;

1842

struct inode *inode = mapping->host;

1843

int ret = 0, ret2;

1843

int ret = 0, ret2;

1844

int partial = 0;

1844

int partial = 0;

1845

unsigned from, to;

1845

unsigned from, to;

1846

loff_t new_i_size;

1846

loff_t new_i_size;

1847

1848

trace_ext4_journalled_write_end(inode, pos, len, copied);

1848

trace_ext4_journalled_write_end(inode, pos, len, copied);

1849

from = pos & (PAGE_CACHE_SIZE - 1);

1849

from = pos & (PAGE_CACHE_SIZE - 1);

1850

to = from + len;

1850

to = from + len;

1851

1852

if (copied < len) {

1852

if (copied < len) {

1853

if (!PageUptodate(page))

1853

if (!PageUptodate(page))

1854

copied = 0;

1854

copied = 0;

1855

page_zero_new_buffers(page, from+copied, to);

1855

page_zero_new_buffers(page, from+copied, to);

1856

}

1856

}

1857

1858

ret = walk_page_buffers(handle, page_buffers(page), from,

1858

ret = walk_page_buffers(handle, page_buffers(page), from,

1859

to, &partial, write_end_fn);

1859

to, &partial, write_end_fn);

1860

if (!partial)

1860

if (!partial)

1861

SetPageUptodate(page);

1861

SetPageUptodate(page);

1862

new_i_size = pos + copied;

1862

new_i_size = pos + copied;

1863

if (new_i_size > inode->i_size)

1863

if (new_i_size > inode->i_size)

1864

i_size_write(inode, pos+copied);

1864

i_size_write(inode, pos+copied);

1865

ext4_set_inode_state(inode, EXT4_STATE_JDATA);

1865

ext4_set_inode_state(inode, EXT4_STATE_JDATA);

1866

if (new_i_size > EXT4_I(inode)->i_disksize) {

1866

if (new_i_size > EXT4_I(inode)->i_disksize) {

1867

ext4_update_i_disksize(inode, new_i_size);

1867

ext4_update_i_disksize(inode, new_i_size);

1868

ret2 = ext4_mark_inode_dirty(handle, inode);

1868

ret2 = ext4_mark_inode_dirty(handle, inode);

1869

if (!ret)

1869

if (!ret)

1870

ret = ret2;

1870

ret = ret2;

1871

}

1871

}

1872

1873

unlock_page(page);

1873

unlock_page(page);

1874

page_cache_release(page);

1874

page_cache_release(page);

1875

if (pos + len > inode->i_size && ext4_can_truncate(inode))

1875

if (pos + len > inode->i_size && ext4_can_truncate(inode))

1876

/* if we have allocated more blocks and copied

1876

/* if we have allocated more blocks and copied

1877

* less. We will have blocks allocated outside

1877

* less. We will have blocks allocated outside

1878

* inode->i_size. So truncate them

1878

* inode->i_size. So truncate them

1879

*/

1879

*/

1880

ext4_orphan_add(handle, inode);

1880

ext4_orphan_add(handle, inode);

1881

1882

ret2 = ext4_journal_stop(handle);

1882

ret2 = ext4_journal_stop(handle);

1883

if (!ret)

1883

if (!ret)

1884

ret = ret2;

1884

ret = ret2;

1885

if (pos + len > inode->i_size) {

1885

if (pos + len > inode->i_size) {

1886

ext4_truncate_failed_write(inode);

1886

ext4_truncate_failed_write(inode);

1887

/*

1887

/*

1888

* If truncate failed early the inode might still be

1888

* If truncate failed early the inode might still be

1889

* on the orphan list; we need to make sure the inode

1889

* on the orphan list; we need to make sure the inode

1890

* is removed from the orphan list in that case.

1890

* is removed from the orphan list in that case.

1891

*/

1891

*/

1892

if (inode->i_nlink)

1892

if (inode->i_nlink)

1893

ext4_orphan_del(NULL, inode);

1893

ext4_orphan_del(NULL, inode);

1894

}

1894

}

1895

1896

return ret ? ret : copied;

1896

return ret ? ret : copied;

1897

}

1897

}

1898

1899

/*

1899

/*

1900

* Reserve a single block located at lblock

1900

* Reserve a single block located at lblock

1901

*/

1901

*/

1902

static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)

1902

static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)

1903

{

1903

{

1904

int retries = 0;

1904

int retries = 0;

1905

struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

1905

struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

1906

struct ext4_inode_info *ei = EXT4_I(inode);

1906

struct ext4_inode_info *ei = EXT4_I(inode);

1907

unsigned long md_needed;

1907

unsigned long md_needed;

1908

int ret;

1908

int ret;

1909

1910

/*

1910

/*

1911

* recalculate the amount of metadata blocks to reserve

1911

* recalculate the amount of metadata blocks to reserve

1912

* in order to allocate nrblocks

1912

* in order to allocate nrblocks

1913

* worse case is one extent per block

1913

* worse case is one extent per block

1914

*/

1914

*/

1915

repeat:

1915

repeat:

1916

spin_lock(&ei->i_block_reservation_lock);

1916

spin_lock(&ei->i_block_reservation_lock);

1917

md_needed = ext4_calc_metadata_amount(inode, lblock);

1917

md_needed = ext4_calc_metadata_amount(inode, lblock);

1918

trace_ext4_da_reserve_space(inode, md_needed);

1918

trace_ext4_da_reserve_space(inode, md_needed);

1919

spin_unlock(&ei->i_block_reservation_lock);

1919

spin_unlock(&ei->i_block_reservation_lock);

1920

1921

/*

1921

/*

1922

* We will charge metadata quota at writeout time; this saves

1922

* We will charge metadata quota at writeout time; this saves

1923

* us from metadata over-estimation, though we may go over by

1923

* us from metadata over-estimation, though we may go over by

1924

* a small amount in the end. Here we just reserve for data.

1924

* a small amount in the end. Here we just reserve for data.

1925

*/

1925

*/

1926

ret = dquot_reserve_block(inode, 1);

1926

ret = dquot_reserve_block(inode, 1);

1927

if (ret)

1927

if (ret)

1928

return ret;

1928

return ret;

1929

/*

1929

/*

1930

* We do still charge estimated metadata to the sb though;

1930

* We do still charge estimated metadata to the sb though;

1931

* we cannot afford to run out of free blocks.

1931

* we cannot afford to run out of free blocks.

1932

*/

1932

*/

1933

if (ext4_claim_free_blocks(sbi, md_needed + 1)) {

1933

if (ext4_claim_free_blocks(sbi, md_needed + 1)) {

1934

dquot_release_reservation_block(inode, 1);

1934

dquot_release_reservation_block(inode, 1);

1935

if (ext4_should_retry_alloc(inode->i_sb, &retries)) {

1935

if (ext4_should_retry_alloc(inode->i_sb, &retries)) {

1936

yield();

1936

yield();

1937

goto repeat;

1937

goto repeat;

1938

}

1938

}

1939

return -ENOSPC;

1939

return -ENOSPC;

1940

}

1940

}

1941

spin_lock(&ei->i_block_reservation_lock);

1941

spin_lock(&ei->i_block_reservation_lock);

1942

ei->i_reserved_data_blocks++;

1942

ei->i_reserved_data_blocks++;

1943

ei->i_reserved_meta_blocks += md_needed;

1943

ei->i_reserved_meta_blocks += md_needed;

1944

spin_unlock(&ei->i_block_reservation_lock);

1944

spin_unlock(&ei->i_block_reservation_lock);

1945

1946

return 0; /* success */

1946

return 0; /* success */

1947

}

1947

}

1948

1949

static void ext4_da_release_space(struct inode *inode, int to_free)

1949

static void ext4_da_release_space(struct inode *inode, int to_free)

1950

{

1950

{

1951

struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

1951

struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

1952

struct ext4_inode_info *ei = EXT4_I(inode);

1952

struct ext4_inode_info *ei = EXT4_I(inode);

1953

1954

if (!to_free)

1954

if (!to_free)

1955

return; /* Nothing to release, exit */

1955

return; /* Nothing to release, exit */

1956

1957

spin_lock(&EXT4_I(inode)->i_block_reservation_lock);

1957

spin_lock(&EXT4_I(inode)->i_block_reservation_lock);

1958

1959

trace_ext4_da_release_space(inode, to_free);

1959

trace_ext4_da_release_space(inode, to_free);

1960

if (unlikely(to_free > ei->i_reserved_data_blocks)) {

1960

if (unlikely(to_free > ei->i_reserved_data_blocks)) {

1961

/*

1961

/*

1962

* if there aren't enough reserved blocks, then the

1962

* if there aren't enough reserved blocks, then the

1963

* counter is messed up somewhere. Since this

1963

* counter is messed up somewhere. Since this

1964

* function is called from invalidate page, it's

1964

* function is called from invalidate page, it's

1965

* harmless to return without any action.

1965

* harmless to return without any action.

1966

*/

1966

*/

1967

ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "

1967

ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "

1968

"ino %lu, to_free %d with only %d reserved "

1968

"ino %lu, to_free %d with only %d reserved "

1969

"data blocks\n", inode->i_ino, to_free,

1969

"data blocks\n", inode->i_ino, to_free,

1970

ei->i_reserved_data_blocks);

1970

ei->i_reserved_data_blocks);

1971

WARN_ON(1);

1971

WARN_ON(1);

1972

to_free = ei->i_reserved_data_blocks;

1972

to_free = ei->i_reserved_data_blocks;

1973

}

1973

}

1974

ei->i_reserved_data_blocks -= to_free;

1974

ei->i_reserved_data_blocks -= to_free;

1975

1976

if (ei->i_reserved_data_blocks == 0) {

1976

if (ei->i_reserved_data_blocks == 0) {

1977

/*

1977

/*

1978

* We can release all of the reserved metadata blocks

1978

* We can release all of the reserved metadata blocks

1979

* only when we have written all of the delayed

1979

* only when we have written all of the delayed

1980

* allocation blocks.

1980

* allocation blocks.

1981

*/

1981

*/

1982

percpu_counter_sub(&sbi->s_dirtyblocks_counter,

1982

percpu_counter_sub(&sbi->s_dirtyblocks_counter,

1983

ei->i_reserved_meta_blocks);

1983

ei->i_reserved_meta_blocks);

1984

ei->i_reserved_meta_blocks = 0;

1984

ei->i_reserved_meta_blocks = 0;

1985

ei->i_da_metadata_calc_len = 0;

1985

ei->i_da_metadata_calc_len = 0;

1986

}

1986

}

1987

1988

/* update fs dirty data blocks counter */

1988

/* update fs dirty data blocks counter */

1989

percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);

1989

percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);

1990

1991

spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);

1991

spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);

1992

1993

dquot_release_reservation_block(inode, to_free);

1993

dquot_release_reservation_block(inode, to_free);

1994

}

1994

}

1995

1996

static void ext4_da_page_release_reservation(struct page *page,

1996

static void ext4_da_page_release_reservation(struct page *page,

1997

unsigned long offset)

1997

unsigned long offset)

1998

{

1998

{

1999

int to_release = 0;

1999

int to_release = 0;

2000

struct buffer_head *head, *bh;

2000

struct buffer_head *head, *bh;

2001

unsigned int curr_off = 0;

2001

unsigned int curr_off = 0;

2002

2003

head = page_buffers(page);

2003

head = page_buffers(page);

2004

bh = head;

2004

bh = head;

2005

do {

2005

do {

2006

unsigned int next_off = curr_off + bh->b_size;

2006

unsigned int next_off = curr_off + bh->b_size;

2007

2008

if ((offset <= curr_off) && (buffer_delay(bh))) {

2008

if ((offset <= curr_off) && (buffer_delay(bh))) {

2009

to_release++;

2009

to_release++;

2010

clear_buffer_delay(bh);

2010

clear_buffer_delay(bh);

2011

}

2011

}

2012

curr_off = next_off;

2012

curr_off = next_off;

2013

} while ((bh = bh->b_this_page) != head);

2013

} while ((bh = bh->b_this_page) != head);

2014

ext4_da_release_space(page->mapping->host, to_release);

2014

ext4_da_release_space(page->mapping->host, to_release);

2015

}

2015

}

2016

2017

/*

2017

/*

2018

* Delayed allocation stuff

2018

* Delayed allocation stuff

2019

*/

2019

*/

2020

2021

/*

2021

/*

2022

* mpage_da_submit_io - walks through extent of pages and try to write

2022

* mpage_da_submit_io - walks through extent of pages and try to write

2023

* them with writepage() call back

2023

* them with writepage() call back

2024

*

2024

*

2025

* @mpd->inode: inode

2025

* @mpd->inode: inode

2026

* @mpd->first_page: first page of the extent

2026

* @mpd->first_page: first page of the extent

2027

* @mpd->next_page: page after the last page of the extent

2027

* @mpd->next_page: page after the last page of the extent

2028

*

2028

*

2029

* By the time mpage_da_submit_io() is called we expect all blocks

2029

* By the time mpage_da_submit_io() is called we expect all blocks

2030

* to be allocated. this may be wrong if allocation failed.

2030

* to be allocated. this may be wrong if allocation failed.

2031

*

2031

*

2032

* As pages are already locked by write_cache_pages(), we can't use it

2032

* As pages are already locked by write_cache_pages(), we can't use it

2033

*/

2033

*/

2034

static int mpage_da_submit_io(struct mpage_da_data *mpd,

2034

static int mpage_da_submit_io(struct mpage_da_data *mpd,

2035

struct ext4_map_blocks *map)

2035

struct ext4_map_blocks *map)

2036

{

2036

{

2037

struct pagevec pvec;

2037

struct pagevec pvec;

2038

unsigned long index, end;

2038

unsigned long index, end;

2039

int ret = 0, err, nr_pages, i;

2039

int ret = 0, err, nr_pages, i;

2040

struct inode *inode = mpd->inode;

2040

struct inode *inode = mpd->inode;

2041

struct address_space *mapping = inode->i_mapping;

2041

struct address_space *mapping = inode->i_mapping;

2042

loff_t size = i_size_read(inode);

2042

loff_t size = i_size_read(inode);

2043

unsigned int len, block_start;

2043

unsigned int len, block_start;

2044

struct buffer_head *bh, *page_bufs = NULL;

2044

struct buffer_head *bh, *page_bufs = NULL;

2045

int journal_data = ext4_should_journal_data(inode);

2045

int journal_data = ext4_should_journal_data(inode);

2046

sector_t pblock = 0, cur_logical = 0;

2046

sector_t pblock = 0, cur_logical = 0;

2047

struct ext4_io_submit io_submit;

2047

struct ext4_io_submit io_submit;

2048

2049

BUG_ON(mpd->next_page <= mpd->first_page);

2049

BUG_ON(mpd->next_page <= mpd->first_page);

2050

memset(&io_submit, 0, sizeof(io_submit));

2050

memset(&io_submit, 0, sizeof(io_submit));

2051

/*

2051

/*

2052

* We need to start from the first_page to the next_page - 1

2052

* We need to start from the first_page to the next_page - 1

2053

* to make sure we also write the mapped dirty buffer_heads.

2053

* to make sure we also write the mapped dirty buffer_heads.

2054

* If we look at mpd->b_blocknr we would only be looking

2054

* If we look at mpd->b_blocknr we would only be looking

2055

* at the currently mapped buffer_heads.

2055

* at the currently mapped buffer_heads.

2056

*/

2056

*/

2057

index = mpd->first_page;

2057

index = mpd->first_page;

2058

end = mpd->next_page - 1;

2058

end = mpd->next_page - 1;

2059

2060

pagevec_init(&pvec, 0);

2060

pagevec_init(&pvec, 0);

2061

while (index <= end) {

2061

while (index <= end) {

2062

nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);

2062

nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);

2063

if (nr_pages == 0)

2063

if (nr_pages == 0)

2064

break;

2064

break;

2065

for (i = 0; i < nr_pages; i++) {

2065

for (i = 0; i < nr_pages; i++) {

2066

int commit_write = 0, skip_page = 0;

2066

int commit_write = 0, skip_page = 0;

2067

struct page *page = pvec.pages[i];

2067

struct page *page = pvec.pages[i];

2068

2069

index = page->index;

2069

index = page->index;

2070

if (index > end)

2070

if (index > end)

2071

break;

2071

break;

2072

2073

if (index == size >> PAGE_CACHE_SHIFT)

2073

if (index == size >> PAGE_CACHE_SHIFT)

2074

len = size & ~PAGE_CACHE_MASK;

2074

len = size & ~PAGE_CACHE_MASK;

2075

else

2075

else

2076

len = PAGE_CACHE_SIZE;

2076

len = PAGE_CACHE_SIZE;

2077

if (map) {

2077

if (map) {

2078

cur_logical = index << (PAGE_CACHE_SHIFT -

2078

cur_logical = index << (PAGE_CACHE_SHIFT -

2079

inode->i_blkbits);

2079

inode->i_blkbits);

2080

pblock = map->m_pblk + (cur_logical -

2080

pblock = map->m_pblk + (cur_logical -

2081

map->m_lblk);

2081

map->m_lblk);

2082

}

2082

}

2083

index++;

2083

index++;

2084

2085

BUG_ON(!PageLocked(page));

2085

BUG_ON(!PageLocked(page));

2086

BUG_ON(PageWriteback(page));

2086

BUG_ON(PageWriteback(page));

2087

2088

/*

2088

/*

2089

* If the page does not have buffers (for

2089

* If the page does not have buffers (for

2090

* whatever reason), try to create them using

2090

* whatever reason), try to create them using

2091

* __block_write_begin. If this fails,

2091

* __block_write_begin. If this fails,

2092

* skip the page and move on.

2092

* skip the page and move on.

2093

*/

2093

*/

2094

if (!page_has_buffers(page)) {

2094

if (!page_has_buffers(page)) {

2095

if (__block_write_begin(page, 0, len,

2095

if (__block_write_begin(page, 0, len,

2096

noalloc_get_block_write)) {

2096

noalloc_get_block_write)) {

2097

skip_page:

2097

skip_page:

2098

unlock_page(page);

2098

unlock_page(page);

2099

continue;

2099

continue;

2100

}

2100

}

2101

commit_write = 1;

2101

commit_write = 1;

2102

}

2102

}

2103

2104

bh = page_bufs = page_buffers(page);

2104

bh = page_bufs = page_buffers(page);

2105

block_start = 0;

2105

block_start = 0;

2106

do {

2106

do {

2107

if (!bh)

2107

if (!bh)

2108

goto skip_page;

2108

goto skip_page;

2109

if (map && (cur_logical >= map->m_lblk) &&

2109

if (map && (cur_logical >= map->m_lblk) &&

2110

(cur_logical <= (map->m_lblk +

2110

(cur_logical <= (map->m_lblk +

2111

(map->m_len - 1)))) {

2111

(map->m_len - 1)))) {

2112

if (buffer_delay(bh)) {

2112

if (buffer_delay(bh)) {

2113

clear_buffer_delay(bh);

2113

clear_buffer_delay(bh);

2114

bh->b_blocknr = pblock;

2114

bh->b_blocknr = pblock;

2115

}

2115

}

2116

if (buffer_unwritten(bh) ||

2116

if (buffer_unwritten(bh) ||

2117

buffer_mapped(bh))

2117

buffer_mapped(bh))

2118

BUG_ON(bh->b_blocknr != pblock);

2118

BUG_ON(bh->b_blocknr != pblock);

2119

if (map->m_flags & EXT4_MAP_UNINIT)

2119

if (map->m_flags & EXT4_MAP_UNINIT)

2120

set_buffer_uninit(bh);

2120

set_buffer_uninit(bh);

2121

clear_buffer_unwritten(bh);

2121

clear_buffer_unwritten(bh);

2122

}

2122

}

2123

2124

/* skip page if block allocation undone */

2124

/* skip page if block allocation undone */

2125

if (buffer_delay(bh) || buffer_unwritten(bh))

2125

if (buffer_delay(bh) || buffer_unwritten(bh))

2126

skip_page = 1;

2126

skip_page = 1;

2127

bh = bh->b_this_page;

2127

bh = bh->b_this_page;

2128

block_start += bh->b_size;

2128

block_start += bh->b_size;

2129

cur_logical++;

2129

cur_logical++;

2130

pblock++;

2130

pblock++;

2131

} while (bh != page_bufs);

2131

} while (bh != page_bufs);

2132

2133

if (skip_page)

2133

if (skip_page)

2134

goto skip_page;

2134

goto skip_page;

2135

2136

if (commit_write)

2136

if (commit_write)

2137

/* mark the buffer_heads as dirty & uptodate */

2137

/* mark the buffer_heads as dirty & uptodate */

2138

block_commit_write(page, 0, len);

2138

block_commit_write(page, 0, len);

2139

2140

clear_page_dirty_for_io(page);

2140

clear_page_dirty_for_io(page);

2141

/*

2141

/*

2142

* Delalloc doesn't support data journalling,

2142

* Delalloc doesn't support data journalling,

2143

* but eventually maybe we'll lift this

2143

* but eventually maybe we'll lift this

2144

* restriction.

2144

* restriction.

2145

*/

2145

*/

2146

if (unlikely(journal_data && PageChecked(page)))

2146

if (unlikely(journal_data && PageChecked(page)))

2147

err = __ext4_journalled_writepage(page, len);

2147

err = __ext4_journalled_writepage(page, len);

2148

else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))

2148

else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))

2149

err = ext4_bio_write_page(&io_submit, page,

2149

err = ext4_bio_write_page(&io_submit, page,

2150

len, mpd->wbc);

2150

len, mpd->wbc);

2151

else

2151

else

2152

err = block_write_full_page(page,

2152

err = block_write_full_page(page,

2153

noalloc_get_block_write, mpd->wbc);

2153

noalloc_get_block_write, mpd->wbc);

2154

2155

if (!err)

2155

if (!err)

2156

mpd->pages_written++;

2156

mpd->pages_written++;

2157

/*

2157

/*

2158

* In error case, we have to continue because

2158

* In error case, we have to continue because

2159

* remaining pages are still locked

2159

* remaining pages are still locked

2160

*/

2160

*/

2161

if (ret == 0)

2161

if (ret == 0)

2162

ret = err;

2162

ret = err;

2163

}

2163

}

2164

pagevec_release(&pvec);

2164

pagevec_release(&pvec);

2165

}

2165

}

2166

ext4_io_submit(&io_submit);

2166

ext4_io_submit(&io_submit);

2167

return ret;

2167

return ret;

2168

}

2168

}

2169

2170

static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)

2170

static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)

2171

{

2171

{

2172

int nr_pages, i;

2172

int nr_pages, i;

2173

pgoff_t index, end;

2173

pgoff_t index, end;

2174

struct pagevec pvec;

2174

struct pagevec pvec;

2175

struct inode *inode = mpd->inode;

2175

struct inode *inode = mpd->inode;

2176

struct address_space *mapping = inode->i_mapping;

2176

struct address_space *mapping = inode->i_mapping;

2177

2178

index = mpd->first_page;

2178

index = mpd->first_page;

2179

end = mpd->next_page - 1;

2179

end = mpd->next_page - 1;

2180

while (index <= end) {

2180

while (index <= end) {

2181

nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);

2181

nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);

2182

if (nr_pages == 0)

2182

if (nr_pages == 0)

2183

break;

2183

break;

2184

for (i = 0; i < nr_pages; i++) {

2184

for (i = 0; i < nr_pages; i++) {

2185

struct page *page = pvec.pages[i];

2185

struct page *page = pvec.pages[i];

2186

if (page->index > end)

2186

if (page->index > end)

2187

break;

2187

break;

2188

BUG_ON(!PageLocked(page));

2188

BUG_ON(!PageLocked(page));

2189

BUG_ON(PageWriteback(page));

2189

BUG_ON(PageWriteback(page));

2190

block_invalidatepage(page, 0);

2190

block_invalidatepage(page, 0);

2191

ClearPageUptodate(page);

2191

ClearPageUptodate(page);

2192

unlock_page(page);

2192

unlock_page(page);

2193

}

2193

}

2194

index = pvec.pages[nr_pages - 1]->index + 1;

2194

index = pvec.pages[nr_pages - 1]->index + 1;

2195

pagevec_release(&pvec);

2195

pagevec_release(&pvec);

2196

}

2196

}

2197

return;

2197

return;

2198

}

2198

}

2199

2200

static void ext4_print_free_blocks(struct inode *inode)

2200

static void ext4_print_free_blocks(struct inode *inode)

2201

{

2201

{

2202

struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

2202

struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

2203

printk(KERN_CRIT "Total free blocks count %lld\n",

2203

printk(KERN_CRIT "Total free blocks count %lld\n",

2204

ext4_count_free_blocks(inode->i_sb));

2204

ext4_count_free_blocks(inode->i_sb));

2205

printk(KERN_CRIT "Free/Dirty block details\n");

2205

printk(KERN_CRIT "Free/Dirty block details\n");

2206

printk(KERN_CRIT "free_blocks=%lld\n",

2206

printk(KERN_CRIT "free_blocks=%lld\n",

2207

(long long) percpu_counter_sum(&sbi->s_freeblocks_counter));

2207

(long long) percpu_counter_sum(&sbi->s_freeblocks_counter));

2208

printk(KERN_CRIT "dirty_blocks=%lld\n",

2208

printk(KERN_CRIT "dirty_blocks=%lld\n",

2209

(long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));

2209

(long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));

2210

printk(KERN_CRIT "Block reservation details\n");

2210

printk(KERN_CRIT "Block reservation details\n");

2211

printk(KERN_CRIT "i_reserved_data_blocks=%u\n",

2211

printk(KERN_CRIT "i_reserved_data_blocks=%u\n",

2212

EXT4_I(inode)->i_reserved_data_blocks);

2212

EXT4_I(inode)->i_reserved_data_blocks);

2213

printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",

2213

printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",

2214

EXT4_I(inode)->i_reserved_meta_blocks);

2214

EXT4_I(inode)->i_reserved_meta_blocks);

2215

return;

2215

return;

2216

}

2216

}

2217

2218

/*

2218

/*

2219

* mpage_da_map_and_submit - go through given space, map them

2219

* mpage_da_map_and_submit - go through given space, map them

2220

* if necessary, and then submit them for I/O

2220

* if necessary, and then submit them for I/O

2221

*

2221

*

2222

* @mpd - bh describing space

2222

* @mpd - bh describing space

2223

*

2223

*

2224

* The function skips space we know is already mapped to disk blocks.

2224

* The function skips space we know is already mapped to disk blocks.

2225

*

2225

*

2226

*/

2226

*/

2227

static void mpage_da_map_and_submit(struct mpage_da_data *mpd)

2227

static void mpage_da_map_and_submit(struct mpage_da_data *mpd)

2228

{

2228

{

2229

int err, blks, get_blocks_flags;

2229

int err, blks, get_blocks_flags;

2230

struct ext4_map_blocks map, *mapp = NULL;

2230

struct ext4_map_blocks map, *mapp = NULL;

2231

sector_t next = mpd->b_blocknr;

2231

sector_t next = mpd->b_blocknr;

2232

unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;

2232

unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;

2233

loff_t disksize = EXT4_I(mpd->inode)->i_disksize;

2233

loff_t disksize = EXT4_I(mpd->inode)->i_disksize;

2234

handle_t *handle = NULL;

2234

handle_t *handle = NULL;

2235

2236

/*

2236

/*

2237

* If the blocks are mapped already, or we couldn't accumulate

2237

* If the blocks are mapped already, or we couldn't accumulate

2238

* any blocks, then proceed immediately to the submission stage.

2238

* any blocks, then proceed immediately to the submission stage.

2239

*/

2239

*/

2240

if ((mpd->b_size == 0) ||

2240

if ((mpd->b_size == 0) ||

2241

((mpd->b_state & (1 << BH_Mapped)) &&

2241

((mpd->b_state & (1 << BH_Mapped)) &&

2242

!(mpd->b_state & (1 << BH_Delay)) &&

2242

!(mpd->b_state & (1 << BH_Delay)) &&

2243

!(mpd->b_state & (1 << BH_Unwritten))))

2243

!(mpd->b_state & (1 << BH_Unwritten))))

2244

goto submit_io;

2244

goto submit_io;

2245

2246

handle = ext4_journal_current_handle();

2246

handle = ext4_journal_current_handle();

2247

BUG_ON(!handle);

2247

BUG_ON(!handle);

2248

2249

/*

2249

/*

2250

* Call ext4_map_blocks() to allocate any delayed allocation

2250

* Call ext4_map_blocks() to allocate any delayed allocation

2251

* blocks, or to convert an uninitialized extent to be

2251

* blocks, or to convert an uninitialized extent to be

2252

* initialized (in the case where we have written into

2252

* initialized (in the case where we have written into

2253

* one or more preallocated blocks).

2253

* one or more preallocated blocks).

2254

*

2254

*

2255

* We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to

2255

* We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to

2256

* indicate that we are on the delayed allocation path. This

2256

* indicate that we are on the delayed allocation path. This

2257

* affects functions in many different parts of the allocation

2257

* affects functions in many different parts of the allocation

2258

* call path. This flag exists primarily because we don't

2258

* call path. This flag exists primarily because we don't

2259

* want to change *many* call functions, so ext4_map_blocks()

2259

* want to change *many* call functions, so ext4_map_blocks()

2260

* will set the EXT4_STATE_DELALLOC_RESERVED flag once the

2260

* will set the EXT4_STATE_DELALLOC_RESERVED flag once the

2261

* inode's allocation semaphore is taken.

2261

* inode's allocation semaphore is taken.

2262

*

2262

*

2263

* If the blocks in questions were delalloc blocks, set

2263

* If the blocks in questions were delalloc blocks, set

2264

* EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting

2264

* EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting

2265

* variables are updated after the blocks have been allocated.

2265

* variables are updated after the blocks have been allocated.

2266

*/

2266

*/

2267

map.m_lblk = next;

2267

map.m_lblk = next;

2268

map.m_len = max_blocks;

2268

map.m_len = max_blocks;

2269

get_blocks_flags = EXT4_GET_BLOCKS_CREATE;

2269

get_blocks_flags = EXT4_GET_BLOCKS_CREATE;

2270

if (ext4_should_dioread_nolock(mpd->inode))

2270

if (ext4_should_dioread_nolock(mpd->inode))

2271

get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;

2271

get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;

2272

if (mpd->b_state & (1 << BH_Delay))

2272

if (mpd->b_state & (1 << BH_Delay))

2273

get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;

2273

get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;

2274

2275

blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);

2275

blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);

2276

if (blks < 0) {

2276

if (blks < 0) {

2277

struct super_block *sb = mpd->inode->i_sb;

2277

struct super_block *sb = mpd->inode->i_sb;

2278

2279

err = blks;

2279

err = blks;

2280

/*

2280

/*

2281

* If get block returns EAGAIN or ENOSPC and there

2281

* If get block returns EAGAIN or ENOSPC and there

2282

* appears to be free blocks we will just let

2282

* appears to be free blocks we will just let

2283

* mpage_da_submit_io() unlock all of the pages.

2283

* mpage_da_submit_io() unlock all of the pages.

2284

*/

2284

*/

2285

if (err == -EAGAIN)

2285

if (err == -EAGAIN)

2286

goto submit_io;

2286

goto submit_io;

2287

2288

if (err == -ENOSPC &&

2288

if (err == -ENOSPC &&

2289

ext4_count_free_blocks(sb)) {

2289

ext4_count_free_blocks(sb)) {

2290

mpd->retval = err;

2290

mpd->retval = err;

2291

goto submit_io;

2291

goto submit_io;

2292

}

2292

}

2293

2294

/*

2294

/*

2295

* get block failure will cause us to loop in

2295

* get block failure will cause us to loop in

2296

* writepages, because a_ops->writepage won't be able

2296

* writepages, because a_ops->writepage won't be able

2297

* to make progress. The page will be redirtied by

2297

* to make progress. The page will be redirtied by

2298

* writepage and writepages will again try to write

2298

* writepage and writepages will again try to write

2299

* the same.

2299

* the same.

2300

*/

2300

*/

2301

if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {

2301

if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {

2302

ext4_msg(sb, KERN_CRIT,

2302

ext4_msg(sb, KERN_CRIT,

2303

"delayed block allocation failed for inode %lu "

2303

"delayed block allocation failed for inode %lu "

2304

"at logical offset %llu with max blocks %zd "

2304

"at logical offset %llu with max blocks %zd "

2305

"with error %d", mpd->inode->i_ino,

2305

"with error %d", mpd->inode->i_ino,

2306

(unsigned long long) next,

2306

(unsigned long long) next,

2307

mpd->b_size >> mpd->inode->i_blkbits, err);

2307

mpd->b_size >> mpd->inode->i_blkbits, err);

2308

ext4_msg(sb, KERN_CRIT,

2308

ext4_msg(sb, KERN_CRIT,

2309

"This should not happen!! Data will be lost\n");

2309

"This should not happen!! Data will be lost\n");

2310

if (err == -ENOSPC)

2310

if (err == -ENOSPC)

2311

ext4_print_free_blocks(mpd->inode);

2311

ext4_print_free_blocks(mpd->inode);

2312

}

2312

}

2313

/* invalidate all the pages */

2313

/* invalidate all the pages */

2314

ext4_da_block_invalidatepages(mpd);

2314

ext4_da_block_invalidatepages(mpd);

2315

2316

/* Mark this page range as having been completed */

2316

/* Mark this page range as having been completed */

2317

mpd->io_done = 1;

2317

mpd->io_done = 1;

2318

return;

2318

return;

2319

}

2319

}

2320

BUG_ON(blks == 0);

2320

BUG_ON(blks == 0);

2321

2322

mapp = &map;

2322

mapp = &map;

2323

if (map.m_flags & EXT4_MAP_NEW) {

2323

if (map.m_flags & EXT4_MAP_NEW) {

2324

struct block_device *bdev = mpd->inode->i_sb->s_bdev;

2324

struct block_device *bdev = mpd->inode->i_sb->s_bdev;

2325

int i;

2325

int i;

2326

2327

for (i = 0; i < map.m_len; i++)

2327

for (i = 0; i < map.m_len; i++)

2328

unmap_underlying_metadata(bdev, map.m_pblk + i);

2328

unmap_underlying_metadata(bdev, map.m_pblk + i);

2329

}

2329

}

2330

2331

if (ext4_should_order_data(mpd->inode)) {

2331

if (ext4_should_order_data(mpd->inode)) {

2332

err = ext4_jbd2_file_inode(handle, mpd->inode);

2332

err = ext4_jbd2_file_inode(handle, mpd->inode);

2333

if (err)

2333

if (err)

2334

/* This only happens if the journal is aborted */

2334

/* This only happens if the journal is aborted */

2335

return;

2335

return;

2336

}

2336

}

2337

2338

/*

2338

/*

2339

* Update on-disk size along with block allocation.

2339

* Update on-disk size along with block allocation.

2340

*/

2340

*/

2341

disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;

2341

disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;

2342

if (disksize > i_size_read(mpd->inode))

2342

if (disksize > i_size_read(mpd->inode))

2343

disksize = i_size_read(mpd->inode);

2343

disksize = i_size_read(mpd->inode);

2344

if (disksize > EXT4_I(mpd->inode)->i_disksize) {

2344

if (disksize > EXT4_I(mpd->inode)->i_disksize) {

2345

ext4_update_i_disksize(mpd->inode, disksize);

2345

ext4_update_i_disksize(mpd->inode, disksize);

2346

err = ext4_mark_inode_dirty(handle, mpd->inode);

2346

err = ext4_mark_inode_dirty(handle, mpd->inode);

2347

if (err)

2347

if (err)

2348

ext4_error(mpd->inode->i_sb,

2348

ext4_error(mpd->inode->i_sb,

2349

"Failed to mark inode %lu dirty",

2349

"Failed to mark inode %lu dirty",

2350

mpd->inode->i_ino);

2350

mpd->inode->i_ino);

2351

}

2351

}

2352

2353

submit_io:

2353

submit_io:

2354

mpage_da_submit_io(mpd, mapp);

2354

mpage_da_submit_io(mpd, mapp);

2355

mpd->io_done = 1;

2355

mpd->io_done = 1;

2356

}

2356

}

2357

2358

/*
 * Buffer state bits that are recorded for the extent being accumulated
 * and that must match before another block is merged into that extent
 * (see mpage_add_bh_to_extent()).
 */
#define BH_FLAGS ((1 << BH_Uptodate) | \
		  (1 << BH_Mapped) | \
		  (1 << BH_Delay) | \
		  (1 << BH_Unwritten))

2360

2361

/*

2361

/*

2362

* mpage_add_bh_to_extent - try to add one more block to extent of blocks

2362

* mpage_add_bh_to_extent - try to add one more block to extent of blocks

2363

*

2363

*

2364

* @mpd->lbh - extent of blocks

2364

* @mpd->lbh - extent of blocks

2365

* @logical - logical number of the block in the file

2365

* @logical - logical number of the block in the file

2366

* @bh - bh of the block (used to access block's state)

2366

* @bh - bh of the block (used to access block's state)

2367

*

2367

*

2368

* the function is used to collect contig. blocks in same state

2368

* the function is used to collect contig. blocks in same state

2369

*/

2369

*/

2370

static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,

2370

static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,

2371

sector_t logical, size_t b_size,

2371

sector_t logical, size_t b_size,

2372

unsigned long b_state)

2372

unsigned long b_state)

2373

{

2373

{

2374

sector_t next;

2374

sector_t next;

2375

int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;

2375

int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;

2376

2377

/*

2377

/*

2378

* XXX Don't go larger than mballoc is willing to allocate

2378

* XXX Don't go larger than mballoc is willing to allocate

2379

* This is a stopgap solution. We eventually need to fold

2379

* This is a stopgap solution. We eventually need to fold

2380

* mpage_da_submit_io() into this function and then call

2380

* mpage_da_submit_io() into this function and then call

2381

* ext4_map_blocks() multiple times in a loop

2381

* ext4_map_blocks() multiple times in a loop

2382

*/

2382

*/

2383

if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)

2383

if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)

2384

goto flush_it;

2384

goto flush_it;

2385

2386

/* check if thereserved journal credits might overflow */

2386

/* check if thereserved journal credits might overflow */

2387

if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {

2387

if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {

2388

if (nrblocks >= EXT4_MAX_TRANS_DATA) {

2388

if (nrblocks >= EXT4_MAX_TRANS_DATA) {

2389

/*

2389

/*

2390

* With non-extent format we are limited by the journal

2390

* With non-extent format we are limited by the journal

2391

* credit available. Total credit needed to insert

2391

* credit available. Total credit needed to insert

2392

* nrblocks contiguous blocks is dependent on the

2392

* nrblocks contiguous blocks is dependent on the

2393

* nrblocks. So limit nrblocks.

2393

* nrblocks. So limit nrblocks.

2394

*/

2394

*/

2395

goto flush_it;

2395

goto flush_it;

2396

} else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >

2396

} else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >

2397

EXT4_MAX_TRANS_DATA) {

2397

EXT4_MAX_TRANS_DATA) {

2398

/*

2398

/*

2399

* Adding the new buffer_head would make it cross the

2399

* Adding the new buffer_head would make it cross the

2400

* allowed limit for which we have journal credit

2400

* allowed limit for which we have journal credit

2401

* reserved. So limit the new bh->b_size

2401

* reserved. So limit the new bh->b_size

2402

*/

2402

*/

2403

b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<

2403

b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<

2404

mpd->inode->i_blkbits;

2404

mpd->inode->i_blkbits;

2405

/* we will do mpage_da_submit_io in the next loop */

2405

/* we will do mpage_da_submit_io in the next loop */

2406

}

2406

}

2407

}

2407

}

2408

/*

2408

/*

2409

* First block in the extent

2409

* First block in the extent

2410

*/

2410

*/

2411

if (mpd->b_size == 0) {

2411

if (mpd->b_size == 0) {

2412

mpd->b_blocknr = logical;

2412

mpd->b_blocknr = logical;

2413

mpd->b_size = b_size;

2413

mpd->b_size = b_size;

2414

mpd->b_state = b_state & BH_FLAGS;

2414

mpd->b_state = b_state & BH_FLAGS;

2415

return;

2415

return;

2416

}

2416

}

2417

2418

next = mpd->b_blocknr + nrblocks;

2418

next = mpd->b_blocknr + nrblocks;

2419

/*

2419

/*

2420

* Can we merge the block to our big extent?

2420

* Can we merge the block to our big extent?

2421

*/

2421

*/

2422

if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {

2422

if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {

2423

mpd->b_size += b_size;

2423

mpd->b_size += b_size;

2424

return;

2424

return;

2425

}

2425

}

2426

2427

flush_it:

2427

flush_it:

2428

/*

2428

/*

2429

* We couldn't merge the block to our extent, so we

2429

* We couldn't merge the block to our extent, so we

2430

* need to flush current extent and start new one

2430

* need to flush current extent and start new one

2431

*/

2431

*/

2432

mpage_da_map_and_submit(mpd);

2432

mpage_da_map_and_submit(mpd);

2433

return;

2433

return;

2434

}

2434

}

2435

2436

/*
 * Buffer-walk callback: returns 1 if @bh is dirty and is either a delayed
 * or an unwritten buffer, 0 otherwise.  @handle is not used.
 */
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
{
	if (!buffer_delay(bh) && !buffer_unwritten(bh))
		return 0;

	return !!buffer_dirty(bh);
}

2440

2441

/*

2441

/*

2442

* This is a special get_blocks_t callback which is used by

2442

* This is a special get_blocks_t callback which is used by

2443

* ext4_da_write_begin(). It will either return mapped block or

2443

* ext4_da_write_begin(). It will either return mapped block or

2444

* reserve space for a single block.

2444

* reserve space for a single block.

2445

*

2445

*

2446

* For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.

2446

* For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.

2447

* We also have b_blocknr = -1 and b_bdev initialized properly

2447

* We also have b_blocknr = -1 and b_bdev initialized properly

2448

*

2448

*

2449

* For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.

2449

* For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.

2450

* We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev

2450

* We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev

2451

* initialized properly.

2451

* initialized properly.

2452

*/

2452

*/

2453

static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,

2453

static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,

2454

struct buffer_head *bh, int create)

2454

struct buffer_head *bh, int create)

2455

{

2455

{

2456

struct ext4_map_blocks map;

2456

struct ext4_map_blocks map;

2457

int ret = 0;

2457

int ret = 0;

2458

sector_t invalid_block = ~((sector_t) 0xffff);

2458

sector_t invalid_block = ~((sector_t) 0xffff);

2459

2460

if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))

2460

if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))

2461

invalid_block = ~0;

2461

invalid_block = ~0;

2462

2463

BUG_ON(create == 0);

2463

BUG_ON(create == 0);

2464

BUG_ON(bh->b_size != inode->i_sb->s_blocksize);

2464

BUG_ON(bh->b_size != inode->i_sb->s_blocksize);

2465

2466

map.m_lblk = iblock;

2466

map.m_lblk = iblock;

2467

map.m_len = 1;

2467

map.m_len = 1;

2468

2469

/*

2469

/*

2470

* first, we need to know whether the block is allocated already

2470

* first, we need to know whether the block is allocated already

2471

* preallocated blocks are unmapped but should treated

2471

* preallocated blocks are unmapped but should treated

2472

* the same as allocated blocks.

2472

* the same as allocated blocks.

2473

*/

2473

*/

2474

ret = ext4_map_blocks(NULL, inode, &map, 0);

2474

ret = ext4_map_blocks(NULL, inode, &map, 0);

2475

if (ret < 0)

2475

if (ret < 0)

2476

return ret;

2476

return ret;

2477

if (ret == 0) {

2477

if (ret == 0) {

2478

if (buffer_delay(bh))

2478

if (buffer_delay(bh))

2479

return 0; /* Not sure this could or should happen */

2479

return 0; /* Not sure this could or should happen */

2480

/*

2480

/*

2481

* XXX: __block_write_begin() unmaps passed block, is it OK?

2481

* XXX: __block_write_begin() unmaps passed block, is it OK?

2482

*/

2482

*/

2483

ret = ext4_da_reserve_space(inode, iblock);

2483

ret = ext4_da_reserve_space(inode, iblock);

2484

if (ret)

2484

if (ret)

2485

/* not enough space to reserve */

2485

/* not enough space to reserve */

2486

return ret;

2486

return ret;

2487

2488

map_bh(bh, inode->i_sb, invalid_block);

2488

map_bh(bh, inode->i_sb, invalid_block);

2489

set_buffer_new(bh);

2489

set_buffer_new(bh);

2490

set_buffer_delay(bh);

2490

set_buffer_delay(bh);

2491

return 0;

2491

return 0;

2492

}

2492

}

2493

2494

map_bh(bh, inode->i_sb, map.m_pblk);

2494

map_bh(bh, inode->i_sb, map.m_pblk);

2495

bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;

2495

bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;

2496

2497

if (buffer_unwritten(bh)) {

2497

if (buffer_unwritten(bh)) {

2498

/* A delayed write to unwritten bh should be marked

2498

/* A delayed write to unwritten bh should be marked

2499

* new and mapped. Mapped ensures that we don't do

2499

* new and mapped. Mapped ensures that we don't do

2500

* get_block multiple times when we write to the same

2500

* get_block multiple times when we write to the same

2501

* offset and new ensures that we do proper zero out

2501

* offset and new ensures that we do proper zero out

2502

* for partial write.

2502

* for partial write.

2503

*/

2503

*/

2504

set_buffer_new(bh);

2504

set_buffer_new(bh);

2505

set_buffer_mapped(bh);

2505

set_buffer_mapped(bh);

2506

}

2506

}

2507

return 0;

2507

return 0;

2508

}

2508

}

2509

2510

/*

2510

/*

2511

* This function is used as a standard get_block_t calback function

2511

* This function is used as a standard get_block_t calback function

2512

* when there is no desire to allocate any blocks. It is used as a

2512

* when there is no desire to allocate any blocks. It is used as a

2513

* callback function for block_write_begin() and block_write_full_page().

2513

* callback function for block_write_begin() and block_write_full_page().

2514

* These functions should only try to map a single block at a time.

2514

* These functions should only try to map a single block at a time.

2515

*

2515

*

2516

* Since this function doesn't do block allocations even if the caller

2516

* Since this function doesn't do block allocations even if the caller

2517

* requests it by passing in create=1, it is critically important that

2517

* requests it by passing in create=1, it is critically important that

2518

* any caller checks to make sure that any buffer heads are returned

2518

* any caller checks to make sure that any buffer heads are returned

2519

* by this function are either all already mapped or marked for

2519

* by this function are either all already mapped or marked for

2520

* delayed allocation before calling block_write_full_page(). Otherwise,

2520

* delayed allocation before calling block_write_full_page(). Otherwise,

2521

* b_blocknr could be left unitialized, and the page write functions will

2521

* b_blocknr could be left unitialized, and the page write functions will

2522

* be taken by surprise.

2522

* be taken by surprise.

2523

*/

2523

*/

2524

static int noalloc_get_block_write(struct inode *inode, sector_t iblock,

2524

static int noalloc_get_block_write(struct inode *inode, sector_t iblock,

2525

struct buffer_head *bh_result, int create)

2525

struct buffer_head *bh_result, int create)

2526

{

2526

{

2527

BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);

2527

BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);

2528

return _ext4_get_block(inode, iblock, bh_result, 0);

2528

return _ext4_get_block(inode, iblock, bh_result, 0);

2529

}

2529

}

2530

2531

/*
 * Buffer-walk callback: take a reference on @bh.
 * @handle is not used.  Always returns 0.
 */
static int bget_one(handle_t *handle, struct buffer_head *bh)
{
	get_bh(bh);

	return 0;
}

2536

2537

/*
 * Buffer-walk callback: drop a reference on @bh (counterpart of bget_one()).
 * @handle is not used.  Always returns 0.
 */
static int bput_one(handle_t *handle, struct buffer_head *bh)
{
	put_bh(bh);

	return 0;
}

2542

2543

static int __ext4_journalled_writepage(struct page *page,

2543

static int __ext4_journalled_writepage(struct page *page,

2544

unsigned int len)

2544

unsigned int len)

2545

{

2545

{

2546

struct address_space *mapping = page->mapping;

2546

struct address_space *mapping = page->mapping;

2547

struct inode *inode = mapping->host;

2547

struct inode *inode = mapping->host;

2548

struct buffer_head *page_bufs;

2548

struct buffer_head *page_bufs;

2549

handle_t *handle = NULL;

2549

handle_t *handle = NULL;

2550

int ret = 0;

2550

int ret = 0;

2551

int err;

2551

int err;

2552

2553

ClearPageChecked(page);

2553

ClearPageChecked(page);

2554

page_bufs = page_buffers(page);

2554

page_bufs = page_buffers(page);

2555

BUG_ON(!page_bufs);

2555

BUG_ON(!page_bufs);

2556

walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);

2556

walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);

2557

/* As soon as we unlock the page, it can go away, but we have

2557

/* As soon as we unlock the page, it can go away, but we have

2558

* references to buffers so we are safe */

2558

* references to buffers so we are safe */

2559

unlock_page(page);

2559

unlock_page(page);

2560

2561

handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));

2561

handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));

2562

if (IS_ERR(handle)) {

2562

if (IS_ERR(handle)) {

2563

ret = PTR_ERR(handle);

2563

ret = PTR_ERR(handle);

2564

goto out;

2564

goto out;

2565

}

2565

}

2566

2567

ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,

2567

ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,

2568

do_journal_get_write_access);

2568

do_journal_get_write_access);

2569

2570

err = walk_page_buffers(handle, page_bufs, 0, len, NULL,

2570

err = walk_page_buffers(handle, page_bufs, 0, len, NULL,

2571

write_end_fn);

2571

write_end_fn);

2572

if (ret == 0)

2572

if (ret == 0)

2573

ret = err;

2573

ret = err;

2574

err = ext4_journal_stop(handle);

2574

err = ext4_journal_stop(handle);

2575

if (!ret)

2575

if (!ret)

2576

ret = err;

2576

ret = err;

2577

2578

walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);

2578

walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);

2579

ext4_set_inode_state(inode, EXT4_STATE_JDATA);

2579

ext4_set_inode_state(inode, EXT4_STATE_JDATA);

2580

out:

2580

out:

2581

return ret;

2581

return ret;

2582

}

2582

}

2583

2584

static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);

2584

static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);

2585

static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);

2585

static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);

2586

2587

/*

2587

/*

2588

* Note that we don't need to start a transaction unless we're journaling data

2588

* Note that we don't need to start a transaction unless we're journaling data

2589

* because we should have holes filled from ext4_page_mkwrite(). We even don't

2589

* because we should have holes filled from ext4_page_mkwrite(). We even don't

2590

* need to file the inode to the transaction's list in ordered mode because if

2590

* need to file the inode to the transaction's list in ordered mode because if

2591

* we are writing back data added by write(), the inode is already there and if

2591

* we are writing back data added by write(), the inode is already there and if

2592

* we are writing back data modified via mmap(), no one guarantees in which

2592

* we are writing back data modified via mmap(), no one guarantees in which

2593

* transaction the data will hit the disk. In case we are journaling data, we

2593

* transaction the data will hit the disk. In case we are journaling data, we

2594

* cannot start transaction directly because transaction start ranks above page

2594

* cannot start transaction directly because transaction start ranks above page

2595

* lock so we have to do some magic.

2595

* lock so we have to do some magic.

2596

*

2596

*

2597

* This function can get called via...

2597

* This function can get called via...

2598

* - ext4_da_writepages after taking page lock (have journal handle)

2598

* - ext4_da_writepages after taking page lock (have journal handle)

2599

* - journal_submit_inode_data_buffers (no journal handle)

2599

* - journal_submit_inode_data_buffers (no journal handle)

2600

* - shrink_page_list via pdflush (no journal handle)

2600

* - shrink_page_list via pdflush (no journal handle)

2601

* - grab_page_cache when doing write_begin (have journal handle)

2601

* - grab_page_cache when doing write_begin (have journal handle)

2602

*

2602

*

2603

* We don't do any block allocation in this function. If we have page with

2603

* We don't do any block allocation in this function. If we have page with

2604

* multiple blocks we need to write those buffer_heads that are mapped. This

2604

* multiple blocks we need to write those buffer_heads that are mapped. This

2605

* is important for mmaped based write. So if we do with blocksize 1K

2605

* is important for mmaped based write. So if we do with blocksize 1K

2606

* truncate(f, 1024);

2606

* truncate(f, 1024);

2607

* a = mmap(f, 0, 4096);

2607

* a = mmap(f, 0, 4096);

2608

* a[0] = 'a';

2608

* a[0] = 'a';

2609

* truncate(f, 4096);

2609

* truncate(f, 4096);

2610

* we have in the page first buffer_head mapped via page_mkwrite call back

2610

* we have in the page first buffer_head mapped via page_mkwrite call back

2611

* but other bufer_heads would be unmapped but dirty(dirty done via the

2611

* but other bufer_heads would be unmapped but dirty(dirty done via the

2612

* do_wp_page). So writepage should write the first block. If we modify

2612

* do_wp_page). So writepage should write the first block. If we modify

2613

* the mmap area beyond 1024 we will again get a page_fault and the

2613

* the mmap area beyond 1024 we will again get a page_fault and the

2614

* page_mkwrite callback will do the block allocation and mark the

2614

* page_mkwrite callback will do the block allocation and mark the

2615

* buffer_heads mapped.

2615

* buffer_heads mapped.

2616

*

2616

*

2617

* We redirty the page if we have any buffer_heads that is either delay or

2617

* We redirty the page if we have any buffer_heads that is either delay or

2618

* unwritten in the page.

2618

* unwritten in the page.

2619

*

2619

*

2620

* We can get recursively called as show below.

2620

* We can get recursively called as show below.

2621

*

2621

*

2622

* ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->

2622

* ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->

2623

* ext4_writepage()

2623

* ext4_writepage()

2624

*

2624

*

2625

* But since we don't do any block allocation we should not deadlock.

2625

* But since we don't do any block allocation we should not deadlock.

2626

* Page also have the dirty flag cleared so we don't get recurive page_lock.

2626

* Page also have the dirty flag cleared so we don't get recurive page_lock.

2627

*/

2627

*/

2628

static int ext4_writepage(struct page *page,

2628

static int ext4_writepage(struct page *page,

2629

struct writeback_control *wbc)

2629

struct writeback_control *wbc)

2630

{

2630

{

2631

int ret = 0, commit_write = 0;

2631

int ret = 0, commit_write = 0;

2632

loff_t size;

2632

loff_t size;

2633

unsigned int len;

2633

unsigned int len;

2634

struct buffer_head *page_bufs = NULL;

2634

struct buffer_head *page_bufs = NULL;

2635

struct inode *inode = page->mapping->host;

2635

struct inode *inode = page->mapping->host;

2636

2637

trace_ext4_writepage(inode, page);

2637

trace_ext4_writepage(inode, page);

2638

size = i_size_read(inode);

2638

size = i_size_read(inode);

2639

if (page->index == size >> PAGE_CACHE_SHIFT)

2639

if (page->index == size >> PAGE_CACHE_SHIFT)

2640

len = size & ~PAGE_CACHE_MASK;

2640

len = size & ~PAGE_CACHE_MASK;

2641

else

2641

else

2642

len = PAGE_CACHE_SIZE;

2642

len = PAGE_CACHE_SIZE;

2643

2644

/*

2644

/*

2645

* If the page does not have buffers (for whatever reason),

2645

* If the page does not have buffers (for whatever reason),

2646

* try to create them using __block_write_begin. If this

2646

* try to create them using __block_write_begin. If this

2647

* fails, redirty the page and move on.

2647

* fails, redirty the page and move on.

2648

*/

2648

*/

2649

if (!page_has_buffers(page)) {

2649

if (!page_has_buffers(page)) {

2650

if (__block_write_begin(page, 0, len,

2650

if (__block_write_begin(page, 0, len,

2651

noalloc_get_block_write)) {

2651

noalloc_get_block_write)) {

2652

redirty_page:

2652

redirty_page:

2653

redirty_page_for_writepage(wbc, page);

2653

redirty_page_for_writepage(wbc, page);

2654

unlock_page(page);

2654

unlock_page(page);

2655

return 0;

2655

return 0;

2656

}

2656

}

2657

commit_write = 1;

2657

commit_write = 1;

2658

}

2658

}

2659

page_bufs = page_buffers(page);

2659

page_bufs = page_buffers(page);

2660

if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,

2660

if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,

2661

ext4_bh_delay_or_unwritten)) {

2661

ext4_bh_delay_or_unwritten)) {

2662

/*

2662

/*

2663

* We don't want to do block allocation, so redirty

2663

* We don't want to do block allocation, so redirty

2664

* the page and return. We may reach here when we do

2664

* the page and return. We may reach here when we do

2665

* a journal commit via journal_submit_inode_data_buffers.

2665

* a journal commit via journal_submit_inode_data_buffers.

2666

* We can also reach here via shrink_page_list

2666

* We can also reach here via shrink_page_list

2667

*/

2667

*/

2668

goto redirty_page;

2668

goto redirty_page;

2669

}

2669

}

2670

if (commit_write)

2670

if (commit_write)

2671

/* now mark the buffer_heads as dirty and uptodate */

2671

/* now mark the buffer_heads as dirty and uptodate */

2672

block_commit_write(page, 0, len);

2672

block_commit_write(page, 0, len);

2673

2674

if (PageChecked(page) && ext4_should_journal_data(inode))

2674

if (PageChecked(page) && ext4_should_journal_data(inode))

2675

/*

2675

/*

2676

* It's mmapped pagecache. Add buffers and journal it. There

2676

* It's mmapped pagecache. Add buffers and journal it. There

2677

* doesn't seem much point in redirtying the page here.

2677

* doesn't seem much point in redirtying the page here.

2678

*/

2678

*/

2679

return __ext4_journalled_writepage(page, len);

2679

return __ext4_journalled_writepage(page, len);

2680

2681

if (buffer_uninit(page_bufs)) {

2681

if (buffer_uninit(page_bufs)) {

2682

ext4_set_bh_endio(page_bufs, inode);

2682

ext4_set_bh_endio(page_bufs, inode);

2683

ret = block_write_full_page_endio(page, noalloc_get_block_write,

2683

ret = block_write_full_page_endio(page, noalloc_get_block_write,

2684

wbc, ext4_end_io_buffer_write);

2684

wbc, ext4_end_io_buffer_write);

2685

} else

2685

} else

2686

ret = block_write_full_page(page, noalloc_get_block_write,

2686

ret = block_write_full_page(page, noalloc_get_block_write,

2687

wbc);

2687

wbc);

2688

2689

return ret;

2689

return ret;

2690

}

2690

}

2691

2692

/*

2692

/*

2693

* This is called via ext4_da_writepages() to

2693

* This is called via ext4_da_writepages() to

2694

* calculate the total number of credits to reserve to fit

2694

* calculate the total number of credits to reserve to fit

2695

* a single extent allocation into a single transaction,

2695

* a single extent allocation into a single transaction,

2696

* ext4_da_writpeages() will loop calling this before

2696

* ext4_da_writpeages() will loop calling this before

2697

* the block allocation.

2697

* the block allocation.

2698

*/

2698

*/

2699

2700

static int ext4_da_writepages_trans_blocks(struct inode *inode)

2700

static int ext4_da_writepages_trans_blocks(struct inode *inode)

2701

{

2701

{

2702

int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;

2702

int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;

2703

2704

/*

2704

/*

2705

* With non-extent format the journal credit needed to

2705

* With non-extent format the journal credit needed to

2706

* insert nrblocks contiguous block is dependent on

2706

* insert nrblocks contiguous block is dependent on

2707

* number of contiguous block. So we will limit

2707

* number of contiguous block. So we will limit

2708

* number of contiguous block to a sane value

2708

* number of contiguous block to a sane value

2709

*/

2709

*/

2710

if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&

2710

if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&

2711

(max_blocks > EXT4_MAX_TRANS_DATA))

2711

(max_blocks > EXT4_MAX_TRANS_DATA))

2712

max_blocks = EXT4_MAX_TRANS_DATA;

2712

max_blocks = EXT4_MAX_TRANS_DATA;

2713

2714

return ext4_chunk_trans_blocks(inode, max_blocks);

2714

return ext4_chunk_trans_blocks(inode, max_blocks);

2715

}

2715

}

2716

2717

/*

2717

/*

2718

* write_cache_pages_da - walk the list of dirty pages of the given

2718

* write_cache_pages_da - walk the list of dirty pages of the given

2719

* address space and accumulate pages that need writing, and call

2719

* address space and accumulate pages that need writing, and call

2720

* mpage_da_map_and_submit to map a single contiguous memory region

2720

* mpage_da_map_and_submit to map a single contiguous memory region

2721

* and then write them.

2721

* and then write them.

2722

*/

2722

*/

2723

static int write_cache_pages_da(struct address_space *mapping,

2723

static int write_cache_pages_da(struct address_space *mapping,

2724

struct writeback_control *wbc,

2724

struct writeback_control *wbc,

2725

struct mpage_da_data *mpd,

2725

struct mpage_da_data *mpd,

2726

pgoff_t *done_index)

2726

pgoff_t *done_index)

2727

{

2727

{

2728

struct buffer_head *bh, *head;

2728

struct buffer_head *bh, *head;

2729

struct inode *inode = mapping->host;

2729

struct inode *inode = mapping->host;

2730

struct pagevec pvec;

2730

struct pagevec pvec;

2731

unsigned int nr_pages;

2731

unsigned int nr_pages;

2732

sector_t logical;

2732

sector_t logical;

2733

pgoff_t index, end;

2733

pgoff_t index, end;

2734

long nr_to_write = wbc->nr_to_write;

2734

long nr_to_write = wbc->nr_to_write;

2735

int i, tag, ret = 0;

2735

int i, tag, ret = 0;

2736

2737

memset(mpd, 0, sizeof(struct mpage_da_data));

2737

memset(mpd, 0, sizeof(struct mpage_da_data));

2738

mpd->wbc = wbc;

2738

mpd->wbc = wbc;

2739

mpd->inode = inode;

2739

mpd->inode = inode;

2740

pagevec_init(&pvec, 0);

2740

pagevec_init(&pvec, 0);

2741

index = wbc->range_start >> PAGE_CACHE_SHIFT;

2741

index = wbc->range_start >> PAGE_CACHE_SHIFT;

2742

end = wbc->range_end >> PAGE_CACHE_SHIFT;

2742

end = wbc->range_end >> PAGE_CACHE_SHIFT;

2743

2744

if (wbc->sync_mode == WB_SYNC_ALL)

2744

if (wbc->sync_mode == WB_SYNC_ALL)

2745

tag = PAGECACHE_TAG_TOWRITE;

2745

tag = PAGECACHE_TAG_TOWRITE;

2746

else

2746

else

2747

tag = PAGECACHE_TAG_DIRTY;

2747

tag = PAGECACHE_TAG_DIRTY;

2748

2749

*done_index = index;

2749

*done_index = index;

2750

while (index <= end) {

2750

while (index <= end) {

2751

nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,

2751

nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,

2752

min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);

2752

min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);

2753

if (nr_pages == 0)

2753

if (nr_pages == 0)

2754

return 0;

2754

return 0;

2755

2756

for (i = 0; i < nr_pages; i++) {

2756

for (i = 0; i < nr_pages; i++) {

2757

struct page *page = pvec.pages[i];

2757

struct page *page = pvec.pages[i];

2758

2759

/*

2759

/*

2760

* At this point, the page may be truncated or

2760

* At this point, the page may be truncated or

2761

* invalidated (changing page->mapping to NULL), or

2761

* invalidated (changing page->mapping to NULL), or

2762

* even swizzled back from swapper_space to tmpfs file

2762

* even swizzled back from swapper_space to tmpfs file

2763

* mapping. However, page->index will not change

2763

* mapping. However, page->index will not change

2764

* because we have a reference on the page.

2764

* because we have a reference on the page.

2765

*/

2765

*/

2766

if (page->index > end)

2766

if (page->index > end)

2767

goto out;

2767

goto out;

2768

2769

*done_index = page->index + 1;

2769

*done_index = page->index + 1;

2770

2771

/*

2771

/*

2772

* If we can't merge this page, and we have

2772

* If we can't merge this page, and we have

2773

* accumulated an contiguous region, write it

2773

* accumulated an contiguous region, write it

2774

*/

2774

*/

2775

if ((mpd->next_page != page->index) &&

2775

if ((mpd->next_page != page->index) &&

2776

(mpd->next_page != mpd->first_page)) {

2776

(mpd->next_page != mpd->first_page)) {

2777

mpage_da_map_and_submit(mpd);

2777

mpage_da_map_and_submit(mpd);

2778

goto ret_extent_tail;

2778

goto ret_extent_tail;

2779

}

2779

}

2780

2781

lock_page(page);

2781

lock_page(page);

2782

2783

/*

2783

/*

2784

* If the page is no longer dirty, or its

2784

* If the page is no longer dirty, or its

2785

* mapping no longer corresponds to inode we

2785

* mapping no longer corresponds to inode we

2786

* are writing (which means it has been

2786

* are writing (which means it has been

2787

* truncated or invalidated), or the page is

2787

* truncated or invalidated), or the page is

2788

* already under writeback and we are not

2788

* already under writeback and we are not

2789

* doing a data integrity writeback, skip the page

2789

* doing a data integrity writeback, skip the page

2790

*/

2790

*/

2791

if (!PageDirty(page) ||

2791

if (!PageDirty(page) ||

2792

(PageWriteback(page) &&

2792

(PageWriteback(page) &&

2793

(wbc->sync_mode == WB_SYNC_NONE)) ||

2793

(wbc->sync_mode == WB_SYNC_NONE)) ||

2794

unlikely(page->mapping != mapping)) {

2794

unlikely(page->mapping != mapping)) {

2795

unlock_page(page);

2795

unlock_page(page);

2796

continue;

2796

continue;

2797

}

2797

}

2798

2799

if (PageWriteback(page))

2799

wait_on_page_writeback(page);

2800

wait_on_page_writeback(page);

2801

2802

BUG_ON(PageWriteback(page));

2800

BUG_ON(PageWriteback(page));

2803

2801

2804

if (mpd->next_page != page->index)

2802

if (mpd->next_page != page->index)

2805

mpd->first_page = page->index;

2803

mpd->first_page = page->index;

2806

mpd->next_page = page->index + 1;

2804

mpd->next_page = page->index + 1;

2807

logical = (sector_t) page->index <<

2805

logical = (sector_t) page->index <<

2808

(PAGE_CACHE_SHIFT - inode->i_blkbits);

2806

(PAGE_CACHE_SHIFT - inode->i_blkbits);

2809

2807

2810

if (!page_has_buffers(page)) {

2808

if (!page_has_buffers(page)) {

2811

mpage_add_bh_to_extent(mpd, logical,

2809

mpage_add_bh_to_extent(mpd, logical,

2812

PAGE_CACHE_SIZE,

2810

PAGE_CACHE_SIZE,

2813

(1 << BH_Dirty) | (1 << BH_Uptodate));

2811

(1 << BH_Dirty) | (1 << BH_Uptodate));

2814

if (mpd->io_done)

2812

if (mpd->io_done)

2815

goto ret_extent_tail;

2813

goto ret_extent_tail;

2816

} else {

2814

} else {

2817

/*

2815

/*

2818

* Page with regular buffer heads,

2816

* Page with regular buffer heads,

2819

* just add all dirty ones

2817

* just add all dirty ones

2820

*/

2818

*/

2821

head = page_buffers(page);

2819

head = page_buffers(page);

2822

bh = head;

2820

bh = head;

2823

do {

2821

do {

2824

BUG_ON(buffer_locked(bh));

2822

BUG_ON(buffer_locked(bh));

2825

/*

2823

/*

2826

* We need to try to allocate

2824

* We need to try to allocate

2827

* unmapped blocks in the same page.

2825

* unmapped blocks in the same page.

2828

* Otherwise we won't make progress

2826

* Otherwise we won't make progress

2829

* with the page in ext4_writepage

2827

* with the page in ext4_writepage

2830

*/

2828

*/

2831

if (ext4_bh_delay_or_unwritten(NULL, bh)) {

2829

if (ext4_bh_delay_or_unwritten(NULL, bh)) {

2832

mpage_add_bh_to_extent(mpd, logical,

2830

mpage_add_bh_to_extent(mpd, logical,

2833

bh->b_size,

2831

bh->b_size,

2834

bh->b_state);

2832

bh->b_state);

2835

if (mpd->io_done)

2833

if (mpd->io_done)

2836

goto ret_extent_tail;

2834

goto ret_extent_tail;

2837

} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {

2835

} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {

2838

/*

2836

/*

2839

* mapped dirty buffer. We need

2837

* mapped dirty buffer. We need

2840

* to update the b_state

2838

* to update the b_state

2841

* because we look at b_state

2839

* because we look at b_state

2842

* in mpage_da_map_blocks. We

2840

* in mpage_da_map_blocks. We

2843

* don't update b_size because

2841

* don't update b_size because

2844

* if we find an unmapped

2842

* if we find an unmapped

2845

* buffer_head later we need to

2843

* buffer_head later we need to

2846

* use the b_state flag of that

2844

* use the b_state flag of that

2847

* buffer_head.

2845

* buffer_head.

2848

*/

2846

*/

2849

if (mpd->b_size == 0)

2847

if (mpd->b_size == 0)

2850

mpd->b_state = bh->b_state & BH_FLAGS;

2848

mpd->b_state = bh->b_state & BH_FLAGS;

2851

}

2849

}

2852

logical++;

2850

logical++;

2853

} while ((bh = bh->b_this_page) != head);

2851

} while ((bh = bh->b_this_page) != head);

2854

}

2852

}

2855

2853

2856

if (nr_to_write > 0) {

2854

if (nr_to_write > 0) {

2857

nr_to_write--;

2855

nr_to_write--;

2858

if (nr_to_write == 0 &&

2856

if (nr_to_write == 0 &&

2859

wbc->sync_mode == WB_SYNC_NONE)

2857

wbc->sync_mode == WB_SYNC_NONE)

2860

/*

2858

/*

2861

* We stop writing back only if we are

2859

* We stop writing back only if we are

2862

* not doing integrity sync. In case of

2860

* not doing integrity sync. In case of

2863

* integrity sync we have to keep going

2861

* integrity sync we have to keep going

2864

* because someone may be concurrently

2862

* because someone may be concurrently

2865

* dirtying pages, and we might have

2863

* dirtying pages, and we might have

2866

* synced a lot of newly appeared dirty

2864

* synced a lot of newly appeared dirty

2867

* pages, but have not synced all of the

2865

* pages, but have not synced all of the

2868

* old dirty pages.

2866

* old dirty pages.

2869

*/

2867

*/

2870

goto out;

2868

goto out;

2871

}

2869

}

2872

}

2870

}

2873

pagevec_release(&pvec);

2871

pagevec_release(&pvec);

2874

cond_resched();

2872

cond_resched();

2875

}

2873

}

2876

return 0;

2874

return 0;

2877

ret_extent_tail:

2875

ret_extent_tail:

2878

ret = MPAGE_DA_EXTENT_TAIL;

2876

ret = MPAGE_DA_EXTENT_TAIL;

2879

out:

2877

out:

2880

pagevec_release(&pvec);

2878

pagevec_release(&pvec);

2881

cond_resched();

2879

cond_resched();

2882

return ret;

2880

return ret;

2883

}

2881

}

2884

2882

2885

2883

2886

static int ext4_da_writepages(struct address_space *mapping,

2884

static int ext4_da_writepages(struct address_space *mapping,

2887

struct writeback_control *wbc)

2885

struct writeback_control *wbc)

2888

{

2886

{

2889

pgoff_t index;

2887

pgoff_t index;

2890

int range_whole = 0;

2888

int range_whole = 0;

2891

handle_t *handle = NULL;

2889

handle_t *handle = NULL;

2892

struct mpage_da_data mpd;

2890

struct mpage_da_data mpd;

2893

struct inode *inode = mapping->host;

2891

struct inode *inode = mapping->host;

2894

int pages_written = 0;

2892

int pages_written = 0;

2895

unsigned int max_pages;

2893

unsigned int max_pages;

2896

int range_cyclic, cycled = 1, io_done = 0;

2894

int range_cyclic, cycled = 1, io_done = 0;

2897

int needed_blocks, ret = 0;

2895

int needed_blocks, ret = 0;

2898

long desired_nr_to_write, nr_to_writebump = 0;

2896

long desired_nr_to_write, nr_to_writebump = 0;

2899

loff_t range_start = wbc->range_start;

2897

loff_t range_start = wbc->range_start;

2900

struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);

2898

struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);

2901

pgoff_t done_index = 0;

2899

pgoff_t done_index = 0;

2902

pgoff_t end;

2900

pgoff_t end;

2903

2901

2904

trace_ext4_da_writepages(inode, wbc);

2902

trace_ext4_da_writepages(inode, wbc);

2905

2903

2906

/*

2904

/*

2907

* No pages to write? This is mainly a kludge to avoid starting

2905

* No pages to write? This is mainly a kludge to avoid starting

2908

* a transaction for special inodes like journal inode on last iput()

2906

* a transaction for special inodes like journal inode on last iput()

2909

* because that could violate lock ordering on umount

2907

* because that could violate lock ordering on umount

2910

*/

2908

*/

2911

if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))

2909

if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))

2912

return 0;

2910

return 0;

2913

2911

2914

/*

2912

/*

2915

* If the filesystem has aborted, it is read-only, so return

2913

* If the filesystem has aborted, it is read-only, so return

2916

* right away instead of dumping stack traces later on that

2914

* right away instead of dumping stack traces later on that

2917

* will obscure the real source of the problem. We test

2915

* will obscure the real source of the problem. We test

2918

* EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because

2916

* EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because

2919

* the latter could be true if the filesystem is mounted

2917

* the latter could be true if the filesystem is mounted

2920

* read-only, and in that case, ext4_da_writepages should

2918

* read-only, and in that case, ext4_da_writepages should

2921

* *never* be called, so if that ever happens, we would want

2919

* *never* be called, so if that ever happens, we would want

2922

* the stack trace.

2920

* the stack trace.

2923

*/

2921

*/

2924

if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))

2922

if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))

2925

return -EROFS;

2923

return -EROFS;

2926

2924

2927

if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)

2925

if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)

2928

range_whole = 1;

2926

range_whole = 1;

2929

2927

2930

range_cyclic = wbc->range_cyclic;

2928

range_cyclic = wbc->range_cyclic;

2931

if (wbc->range_cyclic) {

2929

if (wbc->range_cyclic) {

2932

index = mapping->writeback_index;

2930

index = mapping->writeback_index;

2933

if (index)

2931

if (index)

2934

cycled = 0;

2932

cycled = 0;

2935

wbc->range_start = index << PAGE_CACHE_SHIFT;

2933

wbc->range_start = index << PAGE_CACHE_SHIFT;

2936

wbc->range_end = LLONG_MAX;

2934

wbc->range_end = LLONG_MAX;

2937

wbc->range_cyclic = 0;

2935

wbc->range_cyclic = 0;

2938

end = -1;

2936

end = -1;

2939

} else {

2937

} else {

2940

index = wbc->range_start >> PAGE_CACHE_SHIFT;

2938

index = wbc->range_start >> PAGE_CACHE_SHIFT;

2941

end = wbc->range_end >> PAGE_CACHE_SHIFT;

2939

end = wbc->range_end >> PAGE_CACHE_SHIFT;

2942

}

2940

}

2943

2941

2944

/*

2942

/*

2945

* This works around two forms of stupidity. The first is in

2943

* This works around two forms of stupidity. The first is in

2946

* the writeback code, which caps the maximum number of pages

2944

* the writeback code, which caps the maximum number of pages

2947

* written to be 1024 pages. This is wrong on multiple

2945

* written to be 1024 pages. This is wrong on multiple

2948

* levels; different architectues have a different page size,

2946

* levels; different architectues have a different page size,

2949

* which changes the maximum amount of data which gets

2947

* which changes the maximum amount of data which gets

2950

* written. Secondly, 4 megabytes is way too small. XFS

2948

* written. Secondly, 4 megabytes is way too small. XFS

2951

* forces this value to be 16 megabytes by multiplying

2949

* forces this value to be 16 megabytes by multiplying

2952

* nr_to_write parameter by four, and then relies on its

2950

* nr_to_write parameter by four, and then relies on its

2953

* allocator to allocate larger extents to make them

2951

* allocator to allocate larger extents to make them

2954

* contiguous. Unfortunately this brings us to the second

2952

* contiguous. Unfortunately this brings us to the second

2955

* stupidity, which is that ext4's mballoc code only allocates

2953

* stupidity, which is that ext4's mballoc code only allocates

2956

* at most 2048 blocks. So we force contiguous writes up to

2954

* at most 2048 blocks. So we force contiguous writes up to

2957

* the number of dirty blocks in the inode, or

2955

* the number of dirty blocks in the inode, or

2958

* sbi->max_writeback_mb_bump whichever is smaller.

2956

* sbi->max_writeback_mb_bump whichever is smaller.

2959

*/

2957

*/

2960

max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);

2958

max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);

2961

if (!range_cyclic && range_whole) {

2959

if (!range_cyclic && range_whole) {

2962

if (wbc->nr_to_write == LONG_MAX)

2960

if (wbc->nr_to_write == LONG_MAX)

2963

desired_nr_to_write = wbc->nr_to_write;

2961

desired_nr_to_write = wbc->nr_to_write;

2964

else

2962

else

2965

desired_nr_to_write = wbc->nr_to_write * 8;

2963

desired_nr_to_write = wbc->nr_to_write * 8;

2966

} else

2964

} else

2967

desired_nr_to_write = ext4_num_dirty_pages(inode, index,

2965

desired_nr_to_write = ext4_num_dirty_pages(inode, index,

2968

max_pages);

2966

max_pages);

2969

if (desired_nr_to_write > max_pages)

2967

if (desired_nr_to_write > max_pages)

2970

desired_nr_to_write = max_pages;

2968

desired_nr_to_write = max_pages;

2971

2969

2972

if (wbc->nr_to_write < desired_nr_to_write) {

2970

if (wbc->nr_to_write < desired_nr_to_write) {

2973

nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;

2971

nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;

2974

wbc->nr_to_write = desired_nr_to_write;

2972

wbc->nr_to_write = desired_nr_to_write;

2975

}

2973

}

2976

2974

2977

retry:

2975

retry:

2978

if (wbc->sync_mode == WB_SYNC_ALL)

2976

if (wbc->sync_mode == WB_SYNC_ALL)

2979

tag_pages_for_writeback(mapping, index, end);

2977

tag_pages_for_writeback(mapping, index, end);

2980

2978

2981

while (!ret && wbc->nr_to_write > 0) {

2979

while (!ret && wbc->nr_to_write > 0) {

2982

2980

2983

/*

2981

/*

2984

* we insert one extent at a time. So we need

2982

* we insert one extent at a time. So we need

2985

* credit needed for single extent allocation.

2983

* credit needed for single extent allocation.

2986

* journalled mode is currently not supported

2984

* journalled mode is currently not supported

2987

* by delalloc

2985

* by delalloc

2988

*/

2986

*/

2989

BUG_ON(ext4_should_journal_data(inode));

2987

BUG_ON(ext4_should_journal_data(inode));

2990

needed_blocks = ext4_da_writepages_trans_blocks(inode);

2988

needed_blocks = ext4_da_writepages_trans_blocks(inode);

2991

2989

2992

/* start a new transaction*/

2990

/* start a new transaction*/

2993

handle = ext4_journal_start(inode, needed_blocks);

2991

handle = ext4_journal_start(inode, needed_blocks);

2994

if (IS_ERR(handle)) {

2992

if (IS_ERR(handle)) {

2995

ret = PTR_ERR(handle);

2993

ret = PTR_ERR(handle);

2996

ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "

2994

ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "

2997

"%ld pages, ino %lu; err %d", __func__,

2995

"%ld pages, ino %lu; err %d", __func__,

2998

wbc->nr_to_write, inode->i_ino, ret);

2996

wbc->nr_to_write, inode->i_ino, ret);

2999

goto out_writepages;

2997

goto out_writepages;

3000

}

2998

}

3001

2999

3002

/*

3000

/*

3003

* Now call write_cache_pages_da() to find the next

3001

* Now call write_cache_pages_da() to find the next

3004

* contiguous region of logical blocks that need

3002

* contiguous region of logical blocks that need

3005

* blocks to be allocated by ext4 and submit them.

3003

* blocks to be allocated by ext4 and submit them.

3006

*/

3004

*/

3007

ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);

3005

ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);

3008

/*

3006

/*

3009

* If we have a contiguous extent of pages and we

3007

* If we have a contiguous extent of pages and we

3010

* haven't done the I/O yet, map the blocks and submit

3008

* haven't done the I/O yet, map the blocks and submit

3011

* them for I/O.

3009

* them for I/O.

3012

*/

3010

*/

3013

if (!mpd.io_done && mpd.next_page != mpd.first_page) {

3011

if (!mpd.io_done && mpd.next_page != mpd.first_page) {

3014

mpage_da_map_and_submit(&mpd);

3012

mpage_da_map_and_submit(&mpd);

3015

ret = MPAGE_DA_EXTENT_TAIL;

3013

ret = MPAGE_DA_EXTENT_TAIL;

3016

}

3014

}

3017

trace_ext4_da_write_pages(inode, &mpd);

3015

trace_ext4_da_write_pages(inode, &mpd);

3018

wbc->nr_to_write -= mpd.pages_written;

3016

wbc->nr_to_write -= mpd.pages_written;

3019

3017

3020

ext4_journal_stop(handle);

3018

ext4_journal_stop(handle);

3021

3019

3022

if ((mpd.retval == -ENOSPC) && sbi->s_journal) {

3020

if ((mpd.retval == -ENOSPC) && sbi->s_journal) {

3023

/* commit the transaction which would

3021

/* commit the transaction which would

3024

* free blocks released in the transaction

3022

* free blocks released in the transaction

3025

* and try again

3023

* and try again

3026

*/

3024

*/

3027

jbd2_journal_force_commit_nested(sbi->s_journal);

3025

jbd2_journal_force_commit_nested(sbi->s_journal);

3028

ret = 0;

3026

ret = 0;

3029

} else if (ret == MPAGE_DA_EXTENT_TAIL) {

3027

} else if (ret == MPAGE_DA_EXTENT_TAIL) {

3030

/*

3028

/*

3031

* got one extent now try with

3029

* got one extent now try with

3032

* rest of the pages

3030

* rest of the pages

3033

*/

3031

*/

3034

pages_written += mpd.pages_written;

3032

pages_written += mpd.pages_written;

3035

ret = 0;

3033

ret = 0;

3036

io_done = 1;

3034

io_done = 1;

3037

} else if (wbc->nr_to_write)

3035

} else if (wbc->nr_to_write)

3038

/*

3036

/*

3039

* There is no more writeout needed

3037

* There is no more writeout needed

3040

* or we requested for a noblocking writeout

3038

* or we requested for a noblocking writeout

3041

* and we found the device congested

3039

* and we found the device congested

3042

*/

3040

*/

3043

break;

3041

break;

3044

}

3042

}

3045

if (!io_done && !cycled) {

3043

if (!io_done && !cycled) {

3046

cycled = 1;

3044

cycled = 1;

3047

index = 0;

3045

index = 0;

3048

wbc->range_start = index << PAGE_CACHE_SHIFT;

3046

wbc->range_start = index << PAGE_CACHE_SHIFT;

3049

wbc->range_end = mapping->writeback_index - 1;

3047

wbc->range_end = mapping->writeback_index - 1;

3050

goto retry;

3048

goto retry;

3051

}

3049

}

3052

3050

3053

/* Update index */

3051

/* Update index */

3054

wbc->range_cyclic = range_cyclic;

3052

wbc->range_cyclic = range_cyclic;

3055

if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))

3053

if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))

3056

/*

3054

/*

3057

* set the writeback_index so that range_cyclic

3055

* set the writeback_index so that range_cyclic

3058

* mode will write it back later

3056

* mode will write it back later

3059

*/

3057

*/

3060

mapping->writeback_index = done_index;

3058

mapping->writeback_index = done_index;

3061

3059

3062

out_writepages:

3060

out_writepages:

3063

wbc->nr_to_write -= nr_to_writebump;

3061

wbc->nr_to_write -= nr_to_writebump;

3064

wbc->range_start = range_start;

3062

wbc->range_start = range_start;

3065

trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);

3063

trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);

3066

return ret;

3064

return ret;

3067

}

3065

}

3068

3066

3069

#define FALL_BACK_TO_NONDELALLOC 1

3067

#define FALL_BACK_TO_NONDELALLOC 1

3070

static int ext4_nonda_switch(struct super_block *sb)

3068

static int ext4_nonda_switch(struct super_block *sb)

3071

{

3069

{

3072

s64 free_blocks, dirty_blocks;

3070

s64 free_blocks, dirty_blocks;

3073

struct ext4_sb_info *sbi = EXT4_SB(sb);

3071

struct ext4_sb_info *sbi = EXT4_SB(sb);

3074

3072

3075

/*

3073

/*

3076

* switch to non delalloc mode if we are running low

3074

* switch to non delalloc mode if we are running low

3077

* on free block. The free block accounting via percpu

3075

* on free block. The free block accounting via percpu

3078

* counters can get slightly wrong with percpu_counter_batch getting

3076

* counters can get slightly wrong with percpu_counter_batch getting

3079

* accumulated on each CPU without updating global counters

3077

* accumulated on each CPU without updating global counters

3080

* Delalloc need an accurate free block accounting. So switch

3078

* Delalloc need an accurate free block accounting. So switch

3081

* to non delalloc when we are near to error range.

3079

* to non delalloc when we are near to error range.

3082

*/

3080

*/

3083

free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);

3081

free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);

3084

dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);

3082

dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);

3085

if (2 * free_blocks < 3 * dirty_blocks ||

3083

if (2 * free_blocks < 3 * dirty_blocks ||

3086

free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {

3084

free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {

3087

/*

3085

/*

3088

* free block count is less than 150% of dirty blocks

3086

* free block count is less than 150% of dirty blocks

3089

* or free blocks is less than watermark

3087

* or free blocks is less than watermark

3090

*/

3088

*/

3091

return 1;

3089

return 1;

3092

}

3090

}

3093

/*

3091

/*

3094

* Even if we don't switch but are nearing capacity,

3092

* Even if we don't switch but are nearing capacity,

3095

* start pushing delalloc when 1/2 of free blocks are dirty.

3093

* start pushing delalloc when 1/2 of free blocks are dirty.

3096

*/

3094

*/

3097

if (free_blocks < 2 * dirty_blocks)

3095

if (free_blocks < 2 * dirty_blocks)

3098

writeback_inodes_sb_if_idle(sb);

3096

writeback_inodes_sb_if_idle(sb);

3099

3097

3100

return 0;

3098

return 0;

3101

}

3099

}

3102

3100

3103

static int ext4_da_write_begin(struct file *file, struct address_space *mapping,

3101

static int ext4_da_write_begin(struct file *file, struct address_space *mapping,

3104

loff_t pos, unsigned len, unsigned flags,

3102

loff_t pos, unsigned len, unsigned flags,

3105

struct page **pagep, void **fsdata)

3103

struct page **pagep, void **fsdata)

3106

{

3104

{

3107

int ret, retries = 0;

3105

int ret, retries = 0;

3108

struct page *page;

3106

struct page *page;

3109

pgoff_t index;

3107

pgoff_t index;

3110

struct inode *inode = mapping->host;

3108

struct inode *inode = mapping->host;

3111

handle_t *handle;

3109

handle_t *handle;

3112

3110

3113

index = pos >> PAGE_CACHE_SHIFT;

3111

index = pos >> PAGE_CACHE_SHIFT;

3114

3112

3115

if (ext4_nonda_switch(inode->i_sb)) {

3113

if (ext4_nonda_switch(inode->i_sb)) {

3116

*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;

3114

*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;

3117

return ext4_write_begin(file, mapping, pos,

3115

return ext4_write_begin(file, mapping, pos,

3118

len, flags, pagep, fsdata);

3116

len, flags, pagep, fsdata);

3119

}

3117

}

3120

*fsdata = (void *)0;

3118

*fsdata = (void *)0;

3121

trace_ext4_da_write_begin(inode, pos, len, flags);

3119

trace_ext4_da_write_begin(inode, pos, len, flags);

3122

retry:

3120

retry:

3123

/*

3121

/*

3124

* With delayed allocation, we don't log the i_disksize update

3122

* With delayed allocation, we don't log the i_disksize update

3125

* if there is delayed block allocation. But we still need

3123

* if there is delayed block allocation. But we still need

3126

* to journalling the i_disksize update if writes to the end

3124

* to journalling the i_disksize update if writes to the end

3127

* of file which has an already mapped buffer.

3125

* of file which has an already mapped buffer.

3128

*/

3126

*/

3129

handle = ext4_journal_start(inode, 1);

3127

handle = ext4_journal_start(inode, 1);

3130

if (IS_ERR(handle)) {

3128

if (IS_ERR(handle)) {

3131

ret = PTR_ERR(handle);

3129

ret = PTR_ERR(handle);

3132

goto out;

3130

goto out;

3133

}

3131

}

3134

/* We cannot recurse into the filesystem as the transaction is already

3132

/* We cannot recurse into the filesystem as the transaction is already

3135

* started */

3133

* started */

3136

flags |= AOP_FLAG_NOFS;

3134

flags |= AOP_FLAG_NOFS;

3137

3135

3138

page = grab_cache_page_write_begin(mapping, index, flags);

3136

page = grab_cache_page_write_begin(mapping, index, flags);

3139

if (!page) {

3137

if (!page) {

3140

ext4_journal_stop(handle);

3138

ext4_journal_stop(handle);

3141

ret = -ENOMEM;

3139

ret = -ENOMEM;

3142

goto out;

3140

goto out;

3143

}

3141

}

3144

*pagep = page;

3142

*pagep = page;

3145

3143

3146

ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);

3144

ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);

3147

if (ret < 0) {

3145

if (ret < 0) {

3148

unlock_page(page);

3146

unlock_page(page);

3149

ext4_journal_stop(handle);

3147

ext4_journal_stop(handle);

3150

page_cache_release(page);

3148

page_cache_release(page);

3151

/*

3149

/*

3152

* block_write_begin may have instantiated a few blocks

3150

* block_write_begin may have instantiated a few blocks

3153

* outside i_size. Trim these off again. Don't need

3151

* outside i_size. Trim these off again. Don't need

3154

* i_size_read because we hold i_mutex.

3152

* i_size_read because we hold i_mutex.

3155

*/

3153

*/

3156

if (pos + len > inode->i_size)

3154

if (pos + len > inode->i_size)

3157

ext4_truncate_failed_write(inode);

3155

ext4_truncate_failed_write(inode);

3158

}

3156

}

3159

3157

3160

if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))

3158

if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))

3161

goto retry;

3159

goto retry;

3162

out:

3160

out:

3163

return ret;

3161

return ret;

3164

}

3162

}

3165

3163

3166

/*

3164

/*

3167

* Check if we should update i_disksize

3165

* Check if we should update i_disksize

3168

* when write to the end of file but not require block allocation

3166

* when write to the end of file but not require block allocation

3169

*/

3167

*/

3170

static int ext4_da_should_update_i_disksize(struct page *page,

3168

static int ext4_da_should_update_i_disksize(struct page *page,

3171

unsigned long offset)

3169

unsigned long offset)

3172

{

3170

{

3173

struct buffer_head *bh;

3171

struct buffer_head *bh;

3174

struct inode *inode = page->mapping->host;

3172

struct inode *inode = page->mapping->host;

3175

unsigned int idx;

3173

unsigned int idx;

3176

int i;

3174

int i;

3177

3175

3178

bh = page_buffers(page);

3176

bh = page_buffers(page);

3179

idx = offset >> inode->i_blkbits;

3177

idx = offset >> inode->i_blkbits;

3180

3178

3181

for (i = 0; i < idx; i++)

3179

for (i = 0; i < idx; i++)

3182

bh = bh->b_this_page;

3180

bh = bh->b_this_page;

3183

3181

3184

if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))

3182

if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))

3185

return 0;

3183

return 0;

3186

return 1;

3184

return 1;

3187

}

3185

}

3188

3186

3189

static int ext4_da_write_end(struct file *file,

3187

static int ext4_da_write_end(struct file *file,

3190

struct address_space *mapping,

3188

struct address_space *mapping,

3191

loff_t pos, unsigned len, unsigned copied,

3189

loff_t pos, unsigned len, unsigned copied,

3192

struct page *page, void *fsdata)

3190

struct page *page, void *fsdata)

3193

{

3191

{

3194

struct inode *inode = mapping->host;

3192

struct inode *inode = mapping->host;

3195

int ret = 0, ret2;

3193

int ret = 0, ret2;

3196

handle_t *handle = ext4_journal_current_handle();

3194

handle_t *handle = ext4_journal_current_handle();

3197

loff_t new_i_size;

3195

loff_t new_i_size;

3198

unsigned long start, end;

3196

unsigned long start, end;

3199

int write_mode = (int)(unsigned long)fsdata;

3197

int write_mode = (int)(unsigned long)fsdata;

3200

3198

3201

if (write_mode == FALL_BACK_TO_NONDELALLOC) {

3199

if (write_mode == FALL_BACK_TO_NONDELALLOC) {

3202

if (ext4_should_order_data(inode)) {

3200

if (ext4_should_order_data(inode)) {

3203

return ext4_ordered_write_end(file, mapping, pos,

3201

return ext4_ordered_write_end(file, mapping, pos,

3204

len, copied, page, fsdata);

3202

len, copied, page, fsdata);

3205

} else if (ext4_should_writeback_data(inode)) {

3203

} else if (ext4_should_writeback_data(inode)) {

3206

return ext4_writeback_write_end(file, mapping, pos,

3204

return ext4_writeback_write_end(file, mapping, pos,

3207

len, copied, page, fsdata);

3205

len, copied, page, fsdata);

3208

} else {

3206

} else {

3209

BUG();

3207

BUG();

3210

}

3208

}

3211

}

3209

}

3212

3210

3213

trace_ext4_da_write_end(inode, pos, len, copied);

3211

trace_ext4_da_write_end(inode, pos, len, copied);

3214

start = pos & (PAGE_CACHE_SIZE - 1);

3212

start = pos & (PAGE_CACHE_SIZE - 1);

3215

end = start + copied - 1;

3213

end = start + copied - 1;

3216

3214

3217

/*

3215

/*

3218

* generic_write_end() will run mark_inode_dirty() if i_size

3216

* generic_write_end() will run mark_inode_dirty() if i_size

3219

* changes. So let's piggyback the i_disksize mark_inode_dirty

3217

* changes. So let's piggyback the i_disksize mark_inode_dirty

3220

* into that.

3218

* into that.

3221

*/

3219

*/

3222

3220

3223

new_i_size = pos + copied;

3221

new_i_size = pos + copied;

3224

if (new_i_size > EXT4_I(inode)->i_disksize) {

3222

if (new_i_size > EXT4_I(inode)->i_disksize) {

3225

if (ext4_da_should_update_i_disksize(page, end)) {

3223

if (ext4_da_should_update_i_disksize(page, end)) {

3226

down_write(&EXT4_I(inode)->i_data_sem);

3224

down_write(&EXT4_I(inode)->i_data_sem);

3227

if (new_i_size > EXT4_I(inode)->i_disksize) {

3225

if (new_i_size > EXT4_I(inode)->i_disksize) {

3228

/*

3226

/*

3229

* Updating i_disksize when extending file

3227

* Updating i_disksize when extending file

3230

* without needing block allocation

3228

* without needing block allocation

3231

*/

3229

*/

3232

if (ext4_should_order_data(inode))

3230

if (ext4_should_order_data(inode))

3233

ret = ext4_jbd2_file_inode(handle,

3231

ret = ext4_jbd2_file_inode(handle,

3234

inode);

3232

inode);

3235

3233

3236

EXT4_I(inode)->i_disksize = new_i_size;

3234

EXT4_I(inode)->i_disksize = new_i_size;

3237

}

3235

}

3238

up_write(&EXT4_I(inode)->i_data_sem);

3236

up_write(&EXT4_I(inode)->i_data_sem);

3239

/* We need to mark inode dirty even if

3237

/* We need to mark inode dirty even if

3240

* new_i_size is less that inode->i_size

3238

* new_i_size is less that inode->i_size

3241

* bu greater than i_disksize.(hint delalloc)

3239

* bu greater than i_disksize.(hint delalloc)

3242

*/

3240

*/

3243

ext4_mark_inode_dirty(handle, inode);

3241

ext4_mark_inode_dirty(handle, inode);

3244

}

3242

}

3245

}

3243

}

3246

ret2 = generic_write_end(file, mapping, pos, len, copied,

3244

ret2 = generic_write_end(file, mapping, pos, len, copied,

3247

page, fsdata);

3245

page, fsdata);

3248

copied = ret2;

3246

copied = ret2;

3249

if (ret2 < 0)

3247

if (ret2 < 0)

3250

ret = ret2;

3248

ret = ret2;

3251

ret2 = ext4_journal_stop(handle);

3249

ret2 = ext4_journal_stop(handle);

3252

if (!ret)

3250

if (!ret)

3253

ret = ret2;

3251

ret = ret2;

3254

3252

3255

return ret ? ret : copied;

3253

return ret ? ret : copied;

3256

}

3254

}

3257

3255

3258

/*
 * invalidatepage for the delalloc path: release the page's block
 * reservation (if the page has buffers), then hand over to
 * ext4_invalidatepage().  The page must be locked.
 */
static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
{
	BUG_ON(!PageLocked(page));

	/* Drop reserved blocks */
	if (page_has_buffers(page))
		ext4_da_page_release_reservation(page, offset);

	ext4_invalidatepage(page, offset);
}

3274

3272

3275

/*

3273

/*

3276

* Force all delayed allocation blocks to be allocated for a given inode.

3274

* Force all delayed allocation blocks to be allocated for a given inode.

3277

*/

3275

*/

3278

int ext4_alloc_da_blocks(struct inode *inode)

3276

int ext4_alloc_da_blocks(struct inode *inode)

3279

{

3277

{

3280

trace_ext4_alloc_da_blocks(inode);

3278

trace_ext4_alloc_da_blocks(inode);

3281

3279

3282

if (!EXT4_I(inode)->i_reserved_data_blocks &&

3280

if (!EXT4_I(inode)->i_reserved_data_blocks &&

3283

!EXT4_I(inode)->i_reserved_meta_blocks)

3281

!EXT4_I(inode)->i_reserved_meta_blocks)

3284

return 0;

3282

return 0;

3285

3283

3286

/*

3284

/*

3287

* We do something simple for now. The filemap_flush() will

3285

* We do something simple for now. The filemap_flush() will

3288

* also start triggering a write of the data blocks, which is

3286

* also start triggering a write of the data blocks, which is

3289

* not strictly speaking necessary (and for users of

3287

* not strictly speaking necessary (and for users of

3290

* laptop_mode, not even desirable). However, to do otherwise

3288

* laptop_mode, not even desirable). However, to do otherwise

3291

* would require replicating code paths in:

3289

* would require replicating code paths in:

3292

*

3290

*

3293

* ext4_da_writepages() ->

3291

* ext4_da_writepages() ->

3294

* write_cache_pages() ---> (via passed in callback function)

3292

* write_cache_pages() ---> (via passed in callback function)

3295

* __mpage_da_writepage() -->

3293

* __mpage_da_writepage() -->

3296

* mpage_add_bh_to_extent()

3294

* mpage_add_bh_to_extent()

3297

* mpage_da_map_blocks()

3295

* mpage_da_map_blocks()

3298

*

3296

*

3299

* The problem is that write_cache_pages(), located in

3297

* The problem is that write_cache_pages(), located in

3300

* mm/page-writeback.c, marks pages clean in preparation for

3298

* mm/page-writeback.c, marks pages clean in preparation for

3301

* doing I/O, which is not desirable if we're not planning on

3299

* doing I/O, which is not desirable if we're not planning on

3302

* doing I/O at all.

3300

* doing I/O at all.

3303

*

3301

*

3304

* We could call write_cache_pages(), and then redirty all of

3302

* We could call write_cache_pages(), and then redirty all of

3305

* the pages by calling redirty_page_for_writepage() but that

3303

* the pages by calling redirty_page_for_writepage() but that

3306

* would be ugly in the extreme. So instead we would need to

3304

* would be ugly in the extreme. So instead we would need to

3307

* replicate parts of the code in the above functions,

3305

* replicate parts of the code in the above functions,

3308

* simplifying them because we wouldn't actually intend to

3306

* simplifying them because we wouldn't actually intend to

3309

* write out the pages, but rather only collect contiguous

3307

* write out the pages, but rather only collect contiguous

3310

* logical block extents, call the multi-block allocator, and

3308

* logical block extents, call the multi-block allocator, and

3311

* then update the buffer heads with the block allocations.

3309

* then update the buffer heads with the block allocations.

3312

*

3310

*

3313

* For now, though, we'll cheat by calling filemap_flush(),

3311

* For now, though, we'll cheat by calling filemap_flush(),

3314

* which will map the blocks, and start the I/O, but not

3312

* which will map the blocks, and start the I/O, but not

3315

* actually wait for the I/O to complete.

3313

* actually wait for the I/O to complete.

3316

*/

3314

*/

3317

return filemap_flush(inode->i_mapping);

3315

return filemap_flush(inode->i_mapping);

3318

}

3316

}

3319

3317

3320

/*

3318

/*

3321

* bmap() is special. It gets used by applications such as lilo and by

3319

* bmap() is special. It gets used by applications such as lilo and by

3322

* the swapper to find the on-disk block of a specific piece of data.

3320

* the swapper to find the on-disk block of a specific piece of data.

3323

*

3321

*

3324

* Naturally, this is dangerous if the block concerned is still in the

3322

* Naturally, this is dangerous if the block concerned is still in the

3325

* journal. If somebody makes a swapfile on an ext4 data-journaling

3323

* journal. If somebody makes a swapfile on an ext4 data-journaling

3326

* filesystem and enables swap, then they may get a nasty shock when the

3324

* filesystem and enables swap, then they may get a nasty shock when the

3327

* data getting swapped to that swapfile suddenly gets overwritten by

3325

* data getting swapped to that swapfile suddenly gets overwritten by

3328

* the original zero's written out previously to the journal and

3326

* the original zero's written out previously to the journal and

3329

* awaiting writeback in the kernel's buffer cache.

3327

* awaiting writeback in the kernel's buffer cache.

3330

*

3328

*

3331

* So, if we see any bmap calls here on a modified, data-journaled file,

3329

* So, if we see any bmap calls here on a modified, data-journaled file,

3332

* take extra steps to flush any blocks which might be in the cache.

3330

* take extra steps to flush any blocks which might be in the cache.

3333

*/

3331

*/

3334

static sector_t ext4_bmap(struct address_space *mapping, sector_t block)

3332

static sector_t ext4_bmap(struct address_space *mapping, sector_t block)

3335

{

3333

{

3336

struct inode *inode = mapping->host;

3334

struct inode *inode = mapping->host;

3337

journal_t *journal;

3335

journal_t *journal;

3338

int err;

3336

int err;

3339

3337

3340

if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&

3338

if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&

3341

test_opt(inode->i_sb, DELALLOC)) {

3339

test_opt(inode->i_sb, DELALLOC)) {

3342

/*

3340

/*

3343

* With delalloc we want to sync the file

3341

* With delalloc we want to sync the file

3344

* so that we can make sure we allocate

3342

* so that we can make sure we allocate

3345

* blocks for file

3343

* blocks for file

3346

*/

3344

*/

3347

filemap_write_and_wait(mapping);

3345

filemap_write_and_wait(mapping);

3348

}

3346

}

3349

3347

3350

if (EXT4_JOURNAL(inode) &&

3348

if (EXT4_JOURNAL(inode) &&

3351

ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {

3349

ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {

3352

/*

3350

/*

3353

* This is a REALLY heavyweight approach, but the use of

3351

* This is a REALLY heavyweight approach, but the use of

3354

* bmap on dirty files is expected to be extremely rare:

3352

* bmap on dirty files is expected to be extremely rare:

3355

* only if we run lilo or swapon on a freshly made file

3353

* only if we run lilo or swapon on a freshly made file

3356

* do we expect this to happen.

3354

* do we expect this to happen.

3357

*

3355

*

3358

* (bmap requires CAP_SYS_RAWIO so this does not

3356

* (bmap requires CAP_SYS_RAWIO so this does not

3359

* represent an unprivileged user DOS attack --- we'd be

3357

* represent an unprivileged user DOS attack --- we'd be

3360

* in trouble if mortal users could trigger this path at

3358

* in trouble if mortal users could trigger this path at

3361

* will.)

3359

* will.)

3362

*

3360

*

3363

* NB. EXT4_STATE_JDATA is not set on files other than

3361

* NB. EXT4_STATE_JDATA is not set on files other than

3364

* regular files. If somebody wants to bmap a directory

3362

* regular files. If somebody wants to bmap a directory

3365

* or symlink and gets confused because the buffer

3363

* or symlink and gets confused because the buffer

3366

* hasn't yet been flushed to disk, they deserve

3364

* hasn't yet been flushed to disk, they deserve

3367

* everything they get.

3365

* everything they get.

3368

*/

3366

*/

3369

3367

3370

ext4_clear_inode_state(inode, EXT4_STATE_JDATA);

3368

ext4_clear_inode_state(inode, EXT4_STATE_JDATA);

3371

journal = EXT4_JOURNAL(inode);

3369

journal = EXT4_JOURNAL(inode);

3372

jbd2_journal_lock_updates(journal);

3370

jbd2_journal_lock_updates(journal);

3373

err = jbd2_journal_flush(journal);

3371

err = jbd2_journal_flush(journal);

3374

jbd2_journal_unlock_updates(journal);

3372

jbd2_journal_unlock_updates(journal);

3375

3373

3376

if (err)

3374

if (err)

3377

return 0;

3375

return 0;

3378

}

3376

}

3379

3377

3380

return generic_block_bmap(mapping, block, ext4_get_block);

3378

return generic_block_bmap(mapping, block, ext4_get_block);

3381

}

3379

}

3382

3380

3383

static int ext4_readpage(struct file *file, struct page *page)

3381

static int ext4_readpage(struct file *file, struct page *page)

3384

{

3382

{

3385

trace_ext4_readpage(page);

3383

trace_ext4_readpage(page);

3386

return mpage_readpage(page, ext4_get_block);

3384

return mpage_readpage(page, ext4_get_block);

3387

}

3385

}

3388

3386

3389

static int

3387

static int

3390

ext4_readpages(struct file *file, struct address_space *mapping,

3388

ext4_readpages(struct file *file, struct address_space *mapping,

3391

struct list_head *pages, unsigned nr_pages)

3389

struct list_head *pages, unsigned nr_pages)

3392

{

3390

{

3393

return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);

3391

return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);

3394

}

3392

}

3395

3393

3396

static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)

3394

static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)

3397

{

3395

{

3398

struct buffer_head *head, *bh;

3396

struct buffer_head *head, *bh;

3399

unsigned int curr_off = 0;

3397

unsigned int curr_off = 0;

3400

3398

3401

if (!page_has_buffers(page))

3399

if (!page_has_buffers(page))

3402

return;

3400

return;

3403

head = bh = page_buffers(page);

3401

head = bh = page_buffers(page);

3404

do {

3402

do {

3405

if (offset <= curr_off && test_clear_buffer_uninit(bh)

3403

if (offset <= curr_off && test_clear_buffer_uninit(bh)

3406

&& bh->b_private) {

3404

&& bh->b_private) {

3407

ext4_free_io_end(bh->b_private);

3405

ext4_free_io_end(bh->b_private);

3408

bh->b_private = NULL;

3406

bh->b_private = NULL;

3409

bh->b_end_io = NULL;

3407

bh->b_end_io = NULL;

3410

}

3408

}

3411

curr_off = curr_off + bh->b_size;

3409

curr_off = curr_off + bh->b_size;

3412

bh = bh->b_this_page;

3410

bh = bh->b_this_page;

3413

} while (bh != head);

3411

} while (bh != head);

3414

}

3412

}

3415

3413

3416

static void ext4_invalidatepage(struct page *page, unsigned long offset)

3414

static void ext4_invalidatepage(struct page *page, unsigned long offset)

3417

{

3415

{

3418

journal_t *journal = EXT4_JOURNAL(page->mapping->host);

3416

journal_t *journal = EXT4_JOURNAL(page->mapping->host);

3419

3417

3420

trace_ext4_invalidatepage(page, offset);

3418

trace_ext4_invalidatepage(page, offset);

3421

3419

3422

/*

3420

/*

3423

* free any io_end structure allocated for buffers to be discarded

3421

* free any io_end structure allocated for buffers to be discarded

3424

*/

3422

*/

3425

if (ext4_should_dioread_nolock(page->mapping->host))

3423

if (ext4_should_dioread_nolock(page->mapping->host))

3426

ext4_invalidatepage_free_endio(page, offset);

3424

ext4_invalidatepage_free_endio(page, offset);

3427

/*

3425

/*

3428

* If it's a full truncate we just forget about the pending dirtying

3426

* If it's a full truncate we just forget about the pending dirtying

3429

*/

3427

*/

3430

if (offset == 0)

3428

if (offset == 0)

3431

ClearPageChecked(page);

3429

ClearPageChecked(page);

3432

3430

3433

if (journal)

3431

if (journal)

3434

jbd2_journal_invalidatepage(journal, page, offset);

3432

jbd2_journal_invalidatepage(journal, page, offset);

3435

else

3433

else

3436

block_invalidatepage(page, offset);

3434

block_invalidatepage(page, offset);

3437

}

3435

}

3438

3436

3439

/*
 * Try to release the buffers attached to @page.  Returns 0 when the page
 * has no buffers; otherwise returns whatever the jbd2 helper (journal
 * present) or try_to_free_buffers() (no journal) reports.
 */
static int ext4_releasepage(struct page *page, gfp_t wait)
{
	journal_t *journal = EXT4_JOURNAL(page->mapping->host);

	trace_ext4_releasepage(page);

	WARN_ON(PageChecked(page));
	if (!page_has_buffers(page))
		return 0;

	return journal ?
		jbd2_journal_try_to_free_buffers(journal, page, wait) :
		try_to_free_buffers(page);
}

3453

3451

3454

/*

3452

/*

3455

* O_DIRECT for ext3 (or indirect map) based files

3453

* O_DIRECT for ext3 (or indirect map) based files

3456

*

3454

*

3457

* If the O_DIRECT write will extend the file then add this inode to the

3455

* If the O_DIRECT write will extend the file then add this inode to the

3458

* orphan list. So recovery will truncate it back to the original size

3456

* orphan list. So recovery will truncate it back to the original size

3459

* if the machine crashes during the write.

3457

* if the machine crashes during the write.

3460

*

3458

*

3461

* If the O_DIRECT write is intantiating holes inside i_size and the machine

3459

* If the O_DIRECT write is intantiating holes inside i_size and the machine

3462

* crashes then stale disk data _may_ be exposed inside the file. But current

3460

* crashes then stale disk data _may_ be exposed inside the file. But current

3463

* VFS code falls back into buffered path in that case so we are safe.

3461

* VFS code falls back into buffered path in that case so we are safe.

3464

*/

3462

*/

3465

static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,

3463

static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,

3466

const struct iovec *iov, loff_t offset,

3464

const struct iovec *iov, loff_t offset,

3467

unsigned long nr_segs)

3465

unsigned long nr_segs)

3468

{

3466

{

3469

struct file *file = iocb->ki_filp;

3467

struct file *file = iocb->ki_filp;

3470

struct inode *inode = file->f_mapping->host;

3468

struct inode *inode = file->f_mapping->host;

3471

struct ext4_inode_info *ei = EXT4_I(inode);

3469

struct ext4_inode_info *ei = EXT4_I(inode);

3472

handle_t *handle;

3470

handle_t *handle;

3473

ssize_t ret;

3471

ssize_t ret;

3474

int orphan = 0;

3472

int orphan = 0;

3475

size_t count = iov_length(iov, nr_segs);

3473

size_t count = iov_length(iov, nr_segs);

3476

int retries = 0;

3474

int retries = 0;

3477

3475

3478

if (rw == WRITE) {

3476

if (rw == WRITE) {

3479

loff_t final_size = offset + count;

3477

loff_t final_size = offset + count;

3480

3478

3481

if (final_size > inode->i_size) {

3479

if (final_size > inode->i_size) {

3482

/* Credits for sb + inode write */

3480

/* Credits for sb + inode write */

3483

handle = ext4_journal_start(inode, 2);

3481

handle = ext4_journal_start(inode, 2);

3484

if (IS_ERR(handle)) {

3482

if (IS_ERR(handle)) {

3485

ret = PTR_ERR(handle);

3483

ret = PTR_ERR(handle);

3486

goto out;

3484

goto out;

3487

}

3485

}

3488

ret = ext4_orphan_add(handle, inode);

3486

ret = ext4_orphan_add(handle, inode);

3489

if (ret) {

3487

if (ret) {

3490

ext4_journal_stop(handle);

3488

ext4_journal_stop(handle);

3491

goto out;

3489

goto out;

3492

}

3490

}

3493

orphan = 1;

3491

orphan = 1;

3494

ei->i_disksize = inode->i_size;

3492

ei->i_disksize = inode->i_size;

3495

ext4_journal_stop(handle);

3493

ext4_journal_stop(handle);

3496

}

3494

}

3497

}

3495

}

3498

3496

3499

retry:

3497

retry:

3500

if (rw == READ && ext4_should_dioread_nolock(inode))

3498

if (rw == READ && ext4_should_dioread_nolock(inode))

3501

ret = __blockdev_direct_IO(rw, iocb, inode,

3499

ret = __blockdev_direct_IO(rw, iocb, inode,

3502

inode->i_sb->s_bdev, iov,

3500

inode->i_sb->s_bdev, iov,

3503

offset, nr_segs,

3501

offset, nr_segs,

3504

ext4_get_block, NULL, NULL, 0);

3502

ext4_get_block, NULL, NULL, 0);

3505

else {

3503

else {

3506

ret = blockdev_direct_IO(rw, iocb, inode,

3504

ret = blockdev_direct_IO(rw, iocb, inode,

3507

inode->i_sb->s_bdev, iov,

3505

inode->i_sb->s_bdev, iov,

3508

offset, nr_segs,

3506

offset, nr_segs,

3509

ext4_get_block, NULL);

3507

ext4_get_block, NULL);

3510

3508

3511

if (unlikely((rw & WRITE) && ret < 0)) {

3509

if (unlikely((rw & WRITE) && ret < 0)) {

3512

loff_t isize = i_size_read(inode);

3510

loff_t isize = i_size_read(inode);

3513

loff_t end = offset + iov_length(iov, nr_segs);

3511

loff_t end = offset + iov_length(iov, nr_segs);

3514

3512

3515

if (end > isize)

3513

if (end > isize)

3516

vmtruncate(inode, isize);

3514

vmtruncate(inode, isize);

3517

}

3515

}

3518

}

3516

}

3519

if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))

3517

if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))

3520

goto retry;

3518

goto retry;

3521

3519

3522

if (orphan) {

3520

if (orphan) {

3523

int err;

3521

int err;

3524

3522

3525

/* Credits for sb + inode write */

3523

/* Credits for sb + inode write */

3526

handle = ext4_journal_start(inode, 2);

3524

handle = ext4_journal_start(inode, 2);

3527

if (IS_ERR(handle)) {

3525

if (IS_ERR(handle)) {

3528

/* This is really bad luck. We've written the data

3526

/* This is really bad luck. We've written the data

3529

* but cannot extend i_size. Bail out and pretend

3527

* but cannot extend i_size. Bail out and pretend

3530

* the write failed... */

3528

* the write failed... */

3531

ret = PTR_ERR(handle);

3529

ret = PTR_ERR(handle);

3532

if (inode->i_nlink)

3530

if (inode->i_nlink)

3533

ext4_orphan_del(NULL, inode);

3531

ext4_orphan_del(NULL, inode);

3534

3532

3535

goto out;

3533

goto out;

3536

}

3534

}

3537

if (inode->i_nlink)

3535

if (inode->i_nlink)

3538

ext4_orphan_del(handle, inode);

3536

ext4_orphan_del(handle, inode);

3539

if (ret > 0) {

3537

if (ret > 0) {

3540

loff_t end = offset + ret;

3538

loff_t end = offset + ret;

3541

if (end > inode->i_size) {

3539

if (end > inode->i_size) {

3542

ei->i_disksize = end;

3540

ei->i_disksize = end;

3543

i_size_write(inode, end);

3541

i_size_write(inode, end);

3544

/*

3542

/*

3545

* We're going to return a positive `ret'

3543

* We're going to return a positive `ret'

3546

* here due to non-zero-length I/O, so there's

3544

* here due to non-zero-length I/O, so there's

3547

* no way of reporting error returns from

3545

* no way of reporting error returns from

3548

* ext4_mark_inode_dirty() to userspace. So

3546

* ext4_mark_inode_dirty() to userspace. So

3549

* ignore it.

3547

* ignore it.

3550

*/

3548

*/

3551

ext4_mark_inode_dirty(handle, inode);

3549

ext4_mark_inode_dirty(handle, inode);

3552

}

3550

}

3553

}

3551

}

3554

err = ext4_journal_stop(handle);

3552

err = ext4_journal_stop(handle);

3555

if (ret == 0)

3553

if (ret == 0)

3556

ret = err;

3554

ret = err;

3557

}

3555

}

3558

out:

3556

out:

3559

return ret;

3557

return ret;

3560

}

3558

}

3561

3559

3562

/*

3560

/*

3563

* ext4_get_block used when preparing for a DIO write or buffer write.

3561

* ext4_get_block used when preparing for a DIO write or buffer write.

3564

* We allocate an uinitialized extent if blocks haven't been allocated.

3562

* We allocate an uinitialized extent if blocks haven't been allocated.

3565

* The extent will be converted to initialized after the IO is complete.

3563

* The extent will be converted to initialized after the IO is complete.

3566

*/

3564

*/

3567

static int ext4_get_block_write(struct inode *inode, sector_t iblock,

3565

static int ext4_get_block_write(struct inode *inode, sector_t iblock,

3568

struct buffer_head *bh_result, int create)

3566

struct buffer_head *bh_result, int create)

3569

{

3567

{

3570

ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",

3568

ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",

3571

inode->i_ino, create);

3569

inode->i_ino, create);

3572

return _ext4_get_block(inode, iblock, bh_result,

3570

return _ext4_get_block(inode, iblock, bh_result,

3573

EXT4_GET_BLOCKS_IO_CREATE_EXT);

3571

EXT4_GET_BLOCKS_IO_CREATE_EXT);

3574

}

3572

}

3575

3573

3576

static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,

3574

static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,

3577

ssize_t size, void *private, int ret,

3575

ssize_t size, void *private, int ret,

3578

bool is_async)

3576

bool is_async)

3579

{

3577

{

3580

ext4_io_end_t *io_end = iocb->private;

3578

ext4_io_end_t *io_end = iocb->private;

3581

struct workqueue_struct *wq;

3579

struct workqueue_struct *wq;

3582

unsigned long flags;

3580

unsigned long flags;

3583

struct ext4_inode_info *ei;

3581

struct ext4_inode_info *ei;

3584

3582

3585

/* if not async direct IO or dio with 0 bytes write, just return */

3583

/* if not async direct IO or dio with 0 bytes write, just return */

3586

if (!io_end || !size)

3584

if (!io_end || !size)

3587

goto out;

3585

goto out;

3588

3586

3589

ext_debug("ext4_end_io_dio(): io_end 0x%p"

3587

ext_debug("ext4_end_io_dio(): io_end 0x%p"

3590

"for inode %lu, iocb 0x%p, offset %llu, size %llu\n",

3588

"for inode %lu, iocb 0x%p, offset %llu, size %llu\n",

3591

iocb->private, io_end->inode->i_ino, iocb, offset,

3589

iocb->private, io_end->inode->i_ino, iocb, offset,

3592

size);

3590

size);

3593

3591

3594

/* if not aio dio with unwritten extents, just free io and return */

3592

/* if not aio dio with unwritten extents, just free io and return */

3595

if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {

3593

if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {

3596

ext4_free_io_end(io_end);

3594

ext4_free_io_end(io_end);

3597

iocb->private = NULL;

3595

iocb->private = NULL;

3598

out:

3596

out:

3599

if (is_async)

3597

if (is_async)

3600

aio_complete(iocb, ret, 0);

3598

aio_complete(iocb, ret, 0);

3601

return;

3599

return;

3602

}

3600

}

3603

3601

3604

io_end->offset = offset;

3602

io_end->offset = offset;

3605

io_end->size = size;

3603

io_end->size = size;

3606

if (is_async) {

3604

if (is_async) {

3607

io_end->iocb = iocb;

3605

io_end->iocb = iocb;

3608

io_end->result = ret;

3606

io_end->result = ret;

3609

}

3607

}

3610

wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;

3608

wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;

3611

3609

3612

/* Add the io_end to per-inode completed aio dio list*/

3610

/* Add the io_end to per-inode completed aio dio list*/

3613

ei = EXT4_I(io_end->inode);

3611

ei = EXT4_I(io_end->inode);

3614

spin_lock_irqsave(&ei->i_completed_io_lock, flags);

3612

spin_lock_irqsave(&ei->i_completed_io_lock, flags);

3615

list_add_tail(&io_end->list, &ei->i_completed_io_list);

3613

list_add_tail(&io_end->list, &ei->i_completed_io_list);

3616

spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);

3614

spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);

3617

3615

3618

/* queue the work to convert unwritten extents to written */

3616

/* queue the work to convert unwritten extents to written */

3619

queue_work(wq, &io_end->work);

3617

queue_work(wq, &io_end->work);

3620

iocb->private = NULL;

3618

iocb->private = NULL;

3621

}

3619

}

3622

3620

3623

static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)

3621

static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)

3624

{

3622

{

3625

ext4_io_end_t *io_end = bh->b_private;

3623

ext4_io_end_t *io_end = bh->b_private;

3626

struct workqueue_struct *wq;

3624

struct workqueue_struct *wq;

3627

struct inode *inode;

3625

struct inode *inode;

3628

unsigned long flags;

3626

unsigned long flags;

3629

3627

3630

if (!test_clear_buffer_uninit(bh) || !io_end)

3628

if (!test_clear_buffer_uninit(bh) || !io_end)

3631

goto out;

3629

goto out;

3632

3630

3633

if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {

3631

if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {

3634

printk("sb umounted, discard end_io request for inode %lu\n",

3632

printk("sb umounted, discard end_io request for inode %lu\n",

3635

io_end->inode->i_ino);

3633

io_end->inode->i_ino);

3636

ext4_free_io_end(io_end);

3634

ext4_free_io_end(io_end);

3637

goto out;

3635

goto out;

3638

}

3636

}

3639

3637

3640

io_end->flag = EXT4_IO_END_UNWRITTEN;

3638

io_end->flag = EXT4_IO_END_UNWRITTEN;

3641

inode = io_end->inode;

3639

inode = io_end->inode;

3642

3640

3643

/* Add the io_end to per-inode completed io list*/

3641

/* Add the io_end to per-inode completed io list*/

3644

spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);

3642

spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);

3645

list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);

3643

list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);

3646

spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);

3644

spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);

3647

3645

3648

wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;

3646

wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;

3649

/* queue the work to convert unwritten extents to written */

3647

/* queue the work to convert unwritten extents to written */

3650

queue_work(wq, &io_end->work);

3648

queue_work(wq, &io_end->work);

3651

out:

3649

out:

3652

bh->b_private = NULL;

3650

bh->b_private = NULL;

3653

bh->b_end_io = NULL;

3651

bh->b_end_io = NULL;

3654

clear_buffer_uninit(bh);

3652

clear_buffer_uninit(bh);

3655

end_buffer_async_write(bh, uptodate);

3653

end_buffer_async_write(bh, uptodate);

3656

}

3654

}

3657

3655

3658

static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)

3656

static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)

3659

{

3657

{

3660

ext4_io_end_t *io_end;

3658

ext4_io_end_t *io_end;

3661

struct page *page = bh->b_page;

3659

struct page *page = bh->b_page;

3662

loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;

3660

loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;

3663

size_t size = bh->b_size;

3661

size_t size = bh->b_size;

3664

3662

3665

retry:

3663

retry:

3666

io_end = ext4_init_io_end(inode, GFP_ATOMIC);

3664

io_end = ext4_init_io_end(inode, GFP_ATOMIC);

3667

if (!io_end) {

3665

if (!io_end) {

3668

pr_warn_ratelimited("%s: allocation fail\n", __func__);

3666

pr_warn_ratelimited("%s: allocation fail\n", __func__);

3669

schedule();

3667

schedule();

3670

goto retry;

3668

goto retry;

3671

}

3669

}

3672

io_end->offset = offset;

3670

io_end->offset = offset;

3673

io_end->size = size;

3671

io_end->size = size;

3674

/*

3672

/*

3675

* We need to hold a reference to the page to make sure it

3673

* We need to hold a reference to the page to make sure it

3676

* doesn't get evicted before ext4_end_io_work() has a chance

3674

* doesn't get evicted before ext4_end_io_work() has a chance

3677

* to convert the extent from written to unwritten.

3675

* to convert the extent from written to unwritten.

3678

*/

3676

*/

3679

io_end->page = page;

3677

io_end->page = page;

3680

get_page(io_end->page);

3678

get_page(io_end->page);

3681

3679

3682

bh->b_private = io_end;

3680

bh->b_private = io_end;

3683

bh->b_end_io = ext4_end_io_buffer_write;

3681

bh->b_end_io = ext4_end_io_buffer_write;

3684

return 0;

3682

return 0;

3685

}

3683

}

3686

3684

3687

/*

3685

/*

3688

* For ext4 extent files, ext4 will do direct-io write to holes,

3686

* For ext4 extent files, ext4 will do direct-io write to holes,

3689

* preallocated extents, and those write extend the file, no need to

3687

* preallocated extents, and those write extend the file, no need to

3690

* fall back to buffered IO.

3688

* fall back to buffered IO.

3691

*

3689

*

3692

* For holes, we fallocate those blocks, mark them as uninitialized

3690

* For holes, we fallocate those blocks, mark them as uninitialized

3693

* If those blocks were preallocated, we mark sure they are splited, but

3691

* If those blocks were preallocated, we mark sure they are splited, but

3694

* still keep the range to write as uninitialized.

3692

* still keep the range to write as uninitialized.

3695

*

3693

*

3696

* The unwrritten extents will be converted to written when DIO is completed.

3694

* The unwrritten extents will be converted to written when DIO is completed.

3697

* For async direct IO, since the IO may still pending when return, we

3695

* For async direct IO, since the IO may still pending when return, we

3698

* set up an end_io call back function, which will do the conversion

3696

* set up an end_io call back function, which will do the conversion

3699

* when async direct IO completed.

3697

* when async direct IO completed.

3700

*

3698

*

3701

* If the O_DIRECT write will extend the file then add this inode to the

3699

* If the O_DIRECT write will extend the file then add this inode to the

3702

* orphan list. So recovery will truncate it back to the original size

3700

* orphan list. So recovery will truncate it back to the original size

3703

* if the machine crashes during the write.

3701

* if the machine crashes during the write.

3704

*

3702

*

3705

*/

3703

*/

3706

static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,

3704

static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,

3707

const struct iovec *iov, loff_t offset,

3705

const struct iovec *iov, loff_t offset,

3708

unsigned long nr_segs)

3706

unsigned long nr_segs)

3709

{

3707

{

3710

struct file *file = iocb->ki_filp;

3708

struct file *file = iocb->ki_filp;

3711

struct inode *inode = file->f_mapping->host;

3709

struct inode *inode = file->f_mapping->host;

3712

ssize_t ret;

3710

ssize_t ret;

3713

size_t count = iov_length(iov, nr_segs);

3711

size_t count = iov_length(iov, nr_segs);

3714

3712

3715

loff_t final_size = offset + count;

3713

loff_t final_size = offset + count;

3716

if (rw == WRITE && final_size <= inode->i_size) {

3714

if (rw == WRITE && final_size <= inode->i_size) {

3717

/*

3715

/*

3718

* We could direct write to holes and fallocate.

3716

* We could direct write to holes and fallocate.

3719

*

3717

*

3720

* Allocated blocks to fill the hole are marked as uninitialized

3718

* Allocated blocks to fill the hole are marked as uninitialized

3721

* to prevent parallel buffered read to expose the stale data

3719

* to prevent parallel buffered read to expose the stale data

3722

* before DIO complete the data IO.

3720

* before DIO complete the data IO.

3723

*

3721

*

3724

* As to previously fallocated extents, ext4 get_block

3722

* As to previously fallocated extents, ext4 get_block

3725

* will just simply mark the buffer mapped but still

3723

* will just simply mark the buffer mapped but still

3726

* keep the extents uninitialized.

3724

* keep the extents uninitialized.

3727

*

3725

*

3728

* for non AIO case, we will convert those unwritten extents

3726

* for non AIO case, we will convert those unwritten extents

3729

* to written after return back from blockdev_direct_IO.

3727

* to written after return back from blockdev_direct_IO.

3730

*

3728

*

3731

* for async DIO, the conversion needs to be defered when

3729

* for async DIO, the conversion needs to be defered when

3732

* the IO is completed. The ext4 end_io callback function

3730

* the IO is completed. The ext4 end_io callback function

3733

* will be called to take care of the conversion work.

3731

* will be called to take care of the conversion work.

3734

* Here for async case, we allocate an io_end structure to

3732

* Here for async case, we allocate an io_end structure to

3735

* hook to the iocb.

3733

* hook to the iocb.

3736

*/

3734

*/

3737

iocb->private = NULL;

3735

iocb->private = NULL;

3738

EXT4_I(inode)->cur_aio_dio = NULL;

3736

EXT4_I(inode)->cur_aio_dio = NULL;

3739

if (!is_sync_kiocb(iocb)) {

3737

if (!is_sync_kiocb(iocb)) {

3740

iocb->private = ext4_init_io_end(inode, GFP_NOFS);

3738

iocb->private = ext4_init_io_end(inode, GFP_NOFS);

3741

if (!iocb->private)

3739

if (!iocb->private)

3742

return -ENOMEM;

3740

return -ENOMEM;

3743

/*

3741

/*

3744

* we save the io structure for current async

3742

* we save the io structure for current async

3745

* direct IO, so that later ext4_map_blocks()

3743

* direct IO, so that later ext4_map_blocks()

3746

* could flag the io structure whether there

3744

* could flag the io structure whether there

3747

* is a unwritten extents needs to be converted

3745

* is a unwritten extents needs to be converted

3748

* when IO is completed.

3746

* when IO is completed.

3749

*/

3747

*/

3750

EXT4_I(inode)->cur_aio_dio = iocb->private;

3748

EXT4_I(inode)->cur_aio_dio = iocb->private;

3751

}

3749

}

3752

3750

3753

ret = blockdev_direct_IO(rw, iocb, inode,

3751

ret = blockdev_direct_IO(rw, iocb, inode,

3754

inode->i_sb->s_bdev, iov,

3752

inode->i_sb->s_bdev, iov,

3755

offset, nr_segs,

3753

offset, nr_segs,

3756

ext4_get_block_write,

3754

ext4_get_block_write,

3757

ext4_end_io_dio);

3755

ext4_end_io_dio);

3758

if (iocb->private)

3756

if (iocb->private)

3759

EXT4_I(inode)->cur_aio_dio = NULL;

3757

EXT4_I(inode)->cur_aio_dio = NULL;

3760

/*

3758

/*

3761

* The io_end structure takes a reference to the inode,

3759

* The io_end structure takes a reference to the inode,

3762

* that structure needs to be destroyed and the

3760

* that structure needs to be destroyed and the

3763

* reference to the inode need to be dropped, when IO is

3761

* reference to the inode need to be dropped, when IO is

3764

* complete, even with 0 byte write, or failed.

3762

* complete, even with 0 byte write, or failed.

3765

*

3763

*

3766

* In the successful AIO DIO case, the io_end structure will be

3764

* In the successful AIO DIO case, the io_end structure will be

3767

* desctroyed and the reference to the inode will be dropped

3765

* desctroyed and the reference to the inode will be dropped

3768

* after the end_io call back function is called.

3766

* after the end_io call back function is called.

3769

*

3767

*

3770

* In the case there is 0 byte write, or error case, since

3768

* In the case there is 0 byte write, or error case, since

3771

* VFS direct IO won't invoke the end_io call back function,

3769

* VFS direct IO won't invoke the end_io call back function,

3772

* we need to free the end_io structure here.

3770

* we need to free the end_io structure here.

3773

*/

3771

*/

3774

if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {

3772

if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {

3775

ext4_free_io_end(iocb->private);

3773

ext4_free_io_end(iocb->private);

3776

iocb->private = NULL;

3774

iocb->private = NULL;

3777

} else if (ret > 0 && ext4_test_inode_state(inode,

3775

} else if (ret > 0 && ext4_test_inode_state(inode,

3778

EXT4_STATE_DIO_UNWRITTEN)) {

3776

EXT4_STATE_DIO_UNWRITTEN)) {

3779

int err;

3777

int err;

3780

/*

3778

/*

3781

* for non AIO case, since the IO is already

3779

* for non AIO case, since the IO is already

3782

* completed, we could do the conversion right here

3780

* completed, we could do the conversion right here

3783

*/

3781

*/

3784

err = ext4_convert_unwritten_extents(inode,

3782

err = ext4_convert_unwritten_extents(inode,

3785

offset, ret);

3783

offset, ret);

3786

if (err < 0)

3784

if (err < 0)

3787

ret = err;

3785

ret = err;

3788

ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);

3786

ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);

3789

}

3787

}

3790

return ret;

3788

return ret;

3791

}

3789

}

3792

3790

3793

/* for write the the end of file case, we fall back to old way */

3791

/* for write the the end of file case, we fall back to old way */

3794

return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);

3792

return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);

3795

}

3793

}

3796

3794

3797

static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,

3795

static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,

3798

const struct iovec *iov, loff_t offset,

3796

const struct iovec *iov, loff_t offset,

3799

unsigned long nr_segs)

3797

unsigned long nr_segs)

3800

{

3798

{

3801

struct file *file = iocb->ki_filp;

3799

struct file *file = iocb->ki_filp;

3802

struct inode *inode = file->f_mapping->host;

3800

struct inode *inode = file->f_mapping->host;

3803

ssize_t ret;

3801

ssize_t ret;

3804

3802

3805

trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);

3803

trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);

3806

if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))

3804

if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))

3807

ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);

3805

ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);

3808

else

3806

else

3809

ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);

3807

ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);

3810

trace_ext4_direct_IO_exit(inode, offset,

3808

trace_ext4_direct_IO_exit(inode, offset,

3811

iov_length(iov, nr_segs), rw, ret);

3809

iov_length(iov, nr_segs), rw, ret);

3812

return ret;

3810

return ret;

3813

}

3811

}

3814

3812

3815

/*
 * Pages can be dirtied completely asynchronously with respect to ext4's
 * journalling activity (by filemap_sync_pte(), try_to_unmap_one(), and
 * so on). Little can be done at that point: ->set_page_dirty is called
 * under VFS locks and the page is not necessarily locked.
 *
 * Dirtying the page while leaving its buffers clean is not an option,
 * because the buffers' dirty state is "definitive"; nor can the buffers
 * simply be marked dirty or jbddirty, because the journalling code
 * would explode.
 *
 * So the page is marked "pending dirty" (PageChecked), and the next
 * writepage call is expected to propagate that into the buffers.
 */
static int ext4_journalled_set_page_dirty(struct page *page)
{
	int dirtied;

	/* Flag the page first, then dirty it without touching buffers. */
	SetPageChecked(page);
	dirtied = __set_page_dirty_nobuffers(page);

	return dirtied;
}

3833

3831

3834

static const struct address_space_operations ext4_ordered_aops = {

3832

static const struct address_space_operations ext4_ordered_aops = {

3835

.readpage = ext4_readpage,

3833

.readpage = ext4_readpage,

3836

.readpages = ext4_readpages,

3834

.readpages = ext4_readpages,

3837

.writepage = ext4_writepage,

3835

.writepage = ext4_writepage,

3838

.write_begin = ext4_write_begin,

3836

.write_begin = ext4_write_begin,

3839

.write_end = ext4_ordered_write_end,

3837

.write_end = ext4_ordered_write_end,

3840

.bmap = ext4_bmap,

3838

.bmap = ext4_bmap,

3841

.invalidatepage = ext4_invalidatepage,

3839

.invalidatepage = ext4_invalidatepage,

3842

.releasepage = ext4_releasepage,

3840

.releasepage = ext4_releasepage,

3843

.direct_IO = ext4_direct_IO,

3841

.direct_IO = ext4_direct_IO,

3844

.migratepage = buffer_migrate_page,

3842

.migratepage = buffer_migrate_page,

3845

.is_partially_uptodate = block_is_partially_uptodate,

3843

.is_partially_uptodate = block_is_partially_uptodate,

3846

.error_remove_page = generic_error_remove_page,

3844

.error_remove_page = generic_error_remove_page,

3847

};

3845

};

3848

3846

3849

static const struct address_space_operations ext4_writeback_aops = {

3847

static const struct address_space_operations ext4_writeback_aops = {

3850

.readpage = ext4_readpage,

3848

.readpage = ext4_readpage,

3851

.readpages = ext4_readpages,

3849

.readpages = ext4_readpages,

3852

.writepage = ext4_writepage,

3850

.writepage = ext4_writepage,

3853

.write_begin = ext4_write_begin,

3851

.write_begin = ext4_write_begin,

3854

.write_end = ext4_writeback_write_end,

3852

.write_end = ext4_writeback_write_end,

3855

.bmap = ext4_bmap,

3853

.bmap = ext4_bmap,

3856

.invalidatepage = ext4_invalidatepage,

3854

.invalidatepage = ext4_invalidatepage,

3857

.releasepage = ext4_releasepage,

3855

.releasepage = ext4_releasepage,

3858

.direct_IO = ext4_direct_IO,

3856

.direct_IO = ext4_direct_IO,

3859

.migratepage = buffer_migrate_page,

3857

.migratepage = buffer_migrate_page,

3860

.is_partially_uptodate = block_is_partially_uptodate,

3858

.is_partially_uptodate = block_is_partially_uptodate,

3861

.error_remove_page = generic_error_remove_page,

3859

.error_remove_page = generic_error_remove_page,

3862

};

3860

};

3863

3861

3864

static const struct address_space_operations ext4_journalled_aops = {

3862

static const struct address_space_operations ext4_journalled_aops = {

3865

.readpage = ext4_readpage,

3863

.readpage = ext4_readpage,

3866

.readpages = ext4_readpages,

3864

.readpages = ext4_readpages,

3867

.writepage = ext4_writepage,

3865

.writepage = ext4_writepage,

3868

.write_begin = ext4_write_begin,

3866

.write_begin = ext4_write_begin,

3869

.write_end = ext4_journalled_write_end,

3867

.write_end = ext4_journalled_write_end,

3870

.set_page_dirty = ext4_journalled_set_page_dirty,

3868

.set_page_dirty = ext4_journalled_set_page_dirty,

3871

.bmap = ext4_bmap,

3869

.bmap = ext4_bmap,

3872

.invalidatepage = ext4_invalidatepage,

3870

.invalidatepage = ext4_invalidatepage,

3873

.releasepage = ext4_releasepage,

3871

.releasepage = ext4_releasepage,

3874

.is_partially_uptodate = block_is_partially_uptodate,

3872

.is_partially_uptodate = block_is_partially_uptodate,

3875

.error_remove_page = generic_error_remove_page,

3873

.error_remove_page = generic_error_remove_page,

3876

};

3874

};

3877

3875

3878

static const struct address_space_operations ext4_da_aops = {

3876

static const struct address_space_operations ext4_da_aops = {

3879

.readpage = ext4_readpage,

3877

.readpage = ext4_readpage,

3880

.readpages = ext4_readpages,

3878

.readpages = ext4_readpages,

3881

.writepage = ext4_writepage,

3879

.writepage = ext4_writepage,

3882

.writepages = ext4_da_writepages,

3880

.writepages = ext4_da_writepages,

3883

.write_begin = ext4_da_write_begin,

3881

.write_begin = ext4_da_write_begin,

3884

.write_end = ext4_da_write_end,

3882

.write_end = ext4_da_write_end,

3885

.bmap = ext4_bmap,

3883

.bmap = ext4_bmap,

3886

.invalidatepage = ext4_da_invalidatepage,

3884

.invalidatepage = ext4_da_invalidatepage,

3887

.releasepage = ext4_releasepage,

3885

.releasepage = ext4_releasepage,

3888

.direct_IO = ext4_direct_IO,

3886

.direct_IO = ext4_direct_IO,

3889

.migratepage = buffer_migrate_page,

3887

.migratepage = buffer_migrate_page,

3890

.is_partially_uptodate = block_is_partially_uptodate,

3888

.is_partially_uptodate = block_is_partially_uptodate,

3891

.error_remove_page = generic_error_remove_page,

3889

.error_remove_page = generic_error_remove_page,

3892

};

3890

};

3893

3891

3894

void ext4_set_aops(struct inode *inode)

3892

void ext4_set_aops(struct inode *inode)

3895

{

3893

{

3896

if (ext4_should_order_data(inode) &&

3894

if (ext4_should_order_data(inode) &&

3897

test_opt(inode->i_sb, DELALLOC))

3895

test_opt(inode->i_sb, DELALLOC))

3898

inode->i_mapping->a_ops = &ext4_da_aops;

3896

inode->i_mapping->a_ops = &ext4_da_aops;

3899

else if (ext4_should_order_data(inode))

3897

else if (ext4_should_order_data(inode))

3900

inode->i_mapping->a_ops = &ext4_ordered_aops;

3898

inode->i_mapping->a_ops = &ext4_ordered_aops;

3901

else if (ext4_should_writeback_data(inode) &&

3899

else if (ext4_should_writeback_data(inode) &&

3902

test_opt(inode->i_sb, DELALLOC))

3900

test_opt(inode->i_sb, DELALLOC))

3903

inode->i_mapping->a_ops = &ext4_da_aops;

3901

inode->i_mapping->a_ops = &ext4_da_aops;

3904

else if (ext4_should_writeback_data(inode))

3902

else if (ext4_should_writeback_data(inode))

3905

inode->i_mapping->a_ops = &ext4_writeback_aops;

3903

inode->i_mapping->a_ops = &ext4_writeback_aops;

3906

else

3904

else

3907

inode->i_mapping->a_ops = &ext4_journalled_aops;

3905

inode->i_mapping->a_ops = &ext4_journalled_aops;

3908

}

3906

}

3909

3907

3910

/*

3908

/*

3911

* ext4_block_truncate_page() zeroes out a mapping from file offset `from'

3909

* ext4_block_truncate_page() zeroes out a mapping from file offset `from'

3912

* up to the end of the block which corresponds to `from'.

3910

* up to the end of the block which corresponds to `from'.

3913

* This required during truncate. We need to physically zero the tail end

3911

* This required during truncate. We need to physically zero the tail end

3914

* of that block so it doesn't yield old data if the file is later grown.

3912

* of that block so it doesn't yield old data if the file is later grown.

3915

*/

3913

*/

3916

int ext4_block_truncate_page(handle_t *handle,

3914

int ext4_block_truncate_page(handle_t *handle,

3917

struct address_space *mapping, loff_t from)

3915

struct address_space *mapping, loff_t from)

3918

{

3916

{

3919

ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;

3917

ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;

3920

unsigned offset = from & (PAGE_CACHE_SIZE-1);

3918

unsigned offset = from & (PAGE_CACHE_SIZE-1);

3921

unsigned blocksize, length, pos;

3919

unsigned blocksize, length, pos;

3922

ext4_lblk_t iblock;

3920

ext4_lblk_t iblock;

3923

struct inode *inode = mapping->host;

3921

struct inode *inode = mapping->host;

3924

struct buffer_head *bh;

3922

struct buffer_head *bh;

3925

struct page *page;

3923

struct page *page;

3926

int err = 0;

3924

int err = 0;

3927

3925

3928

page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,

3926

page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,

3929

mapping_gfp_mask(mapping) & ~__GFP_FS);

3927

mapping_gfp_mask(mapping) & ~__GFP_FS);

3930

if (!page)

3928

if (!page)

3931

return -EINVAL;

3929

return -EINVAL;

3932

3930

3933

blocksize = inode->i_sb->s_blocksize;

3931

blocksize = inode->i_sb->s_blocksize;

3934

length = blocksize - (offset & (blocksize - 1));

3932

length = blocksize - (offset & (blocksize - 1));

3935

iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);

3933

iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);

3936

3934

3937

if (!page_has_buffers(page))

3935

if (!page_has_buffers(page))

3938

create_empty_buffers(page, blocksize, 0);

3936

create_empty_buffers(page, blocksize, 0);

3939

3937

3940

/* Find the buffer that contains "offset" */

3938

/* Find the buffer that contains "offset" */

3941

bh = page_buffers(page);

3939

bh = page_buffers(page);

3942

pos = blocksize;

3940

pos = blocksize;

3943

while (offset >= pos) {

3941

while (offset >= pos) {

3944

bh = bh->b_this_page;

3942

bh = bh->b_this_page;

3945

iblock++;

3943

iblock++;

3946

pos += blocksize;

3944

pos += blocksize;

3947

}

3945

}

3948

3946

3949

err = 0;

3947

err = 0;

3950

if (buffer_freed(bh)) {

3948

if (buffer_freed(bh)) {

3951

BUFFER_TRACE(bh, "freed: skip");

3949

BUFFER_TRACE(bh, "freed: skip");

3952

goto unlock;

3950

goto unlock;

3953

}

3951

}

3954

3952

3955

if (!buffer_mapped(bh)) {

3953

if (!buffer_mapped(bh)) {

3956

BUFFER_TRACE(bh, "unmapped");

3954

BUFFER_TRACE(bh, "unmapped");

3957

ext4_get_block(inode, iblock, bh, 0);

3955

ext4_get_block(inode, iblock, bh, 0);

3958

/* unmapped? It's a hole - nothing to do */

3956

/* unmapped? It's a hole - nothing to do */

3959

if (!buffer_mapped(bh)) {

3957

if (!buffer_mapped(bh)) {

3960

BUFFER_TRACE(bh, "still unmapped");

3958

BUFFER_TRACE(bh, "still unmapped");

3961

goto unlock;

3959

goto unlock;

3962

}

3960

}

3963

}

3961

}

3964

3962

3965

/* Ok, it's mapped. Make sure it's up-to-date */

3963

/* Ok, it's mapped. Make sure it's up-to-date */

3966

if (PageUptodate(page))

3964

if (PageUptodate(page))

3967

set_buffer_uptodate(bh);

3965

set_buffer_uptodate(bh);

3968

3966

3969

if (!buffer_uptodate(bh)) {

3967

if (!buffer_uptodate(bh)) {

3970

err = -EIO;

3968

err = -EIO;

3971

ll_rw_block(READ, 1, &bh);

3969

ll_rw_block(READ, 1, &bh);

3972

wait_on_buffer(bh);

3970

wait_on_buffer(bh);

3973

/* Uhhuh. Read error. Complain and punt. */

3971

/* Uhhuh. Read error. Complain and punt. */

3974

if (!buffer_uptodate(bh))

3972

if (!buffer_uptodate(bh))

3975

goto unlock;

3973

goto unlock;

3976

}

3974

}

3977

3975

3978

if (ext4_should_journal_data(inode)) {

3976

if (ext4_should_journal_data(inode)) {

3979

BUFFER_TRACE(bh, "get write access");

3977

BUFFER_TRACE(bh, "get write access");

3980

err = ext4_journal_get_write_access(handle, bh);

3978

err = ext4_journal_get_write_access(handle, bh);

3981

if (err)

3979

if (err)

3982

goto unlock;

3980

goto unlock;

3983

}

3981

}

3984

3982

3985

zero_user(page, offset, length);

3983

zero_user(page, offset, length);

3986

3984

3987

BUFFER_TRACE(bh, "zeroed end of block");

3985

BUFFER_TRACE(bh, "zeroed end of block");

3988

3986

3989

err = 0;

3987

err = 0;

3990

if (ext4_should_journal_data(inode)) {

3988

if (ext4_should_journal_data(inode)) {

3991

err = ext4_handle_dirty_metadata(handle, inode, bh);

3989

err = ext4_handle_dirty_metadata(handle, inode, bh);

3992

} else {

3990

} else {

3993

if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)

3991

if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)

3994

err = ext4_jbd2_file_inode(handle, inode);

3992

err = ext4_jbd2_file_inode(handle, inode);

3995

mark_buffer_dirty(bh);

3993

mark_buffer_dirty(bh);

3996

}

3994

}

3997

3995

3998

unlock:

3996

unlock:

3999

unlock_page(page);

3997

unlock_page(page);

4000

page_cache_release(page);

3998

page_cache_release(page);

4001

return err;

3999

return err;

4002

}

4000

}

4003

4001

4004

/*

4002

/*

4005

* Probably it should be a library function... search for first non-zero word

4003

* Probably it should be a library function... search for first non-zero word

4006

* or memcmp with zero_page, whatever is better for particular architecture.

4004

* or memcmp with zero_page, whatever is better for particular architecture.

4007

* Linus?

4005

* Linus?

4008

*/

4006

*/

4009

static inline int all_zeroes(__le32 *p, __le32 *q)

4007

static inline int all_zeroes(__le32 *p, __le32 *q)

4010

{

4008

{

4011

while (p < q)

4009

while (p < q)

4012

if (*p++)

4010

if (*p++)

4013

return 0;

4011

return 0;

4014

return 1;

4012

return 1;

4015

}

4013

}

4016

4014

4017

/**

4015

/**

4018

* ext4_find_shared - find the indirect blocks for partial truncation.

4016

* ext4_find_shared - find the indirect blocks for partial truncation.

4019

* @inode: inode in question

4017

* @inode: inode in question

4020

* @depth: depth of the affected branch

4018

* @depth: depth of the affected branch

4021

* @offsets: offsets of pointers in that branch (see ext4_block_to_path)

4019

* @offsets: offsets of pointers in that branch (see ext4_block_to_path)

4022

* @chain: place to store the pointers to partial indirect blocks

4020

* @chain: place to store the pointers to partial indirect blocks

4023

* @top: place to the (detached) top of branch

4021

* @top: place to the (detached) top of branch

4024

*

4022

*

4025

* This is a helper function used by ext4_truncate().

4023

* This is a helper function used by ext4_truncate().

4026

*

4024

*

4027

* When we do truncate() we may have to clean the ends of several

4025

* When we do truncate() we may have to clean the ends of several

4028

* indirect blocks but leave the blocks themselves alive. Block is

4026

* indirect blocks but leave the blocks themselves alive. Block is

4029

* partially truncated if some data below the new i_size is referred

4027

* partially truncated if some data below the new i_size is referred

4030

* from it (and it is on the path to the first completely truncated

4028

* from it (and it is on the path to the first completely truncated

4031

* data block, indeed). We have to free the top of that path along

4029

* data block, indeed). We have to free the top of that path along

4032

* with everything to the right of the path. Since no allocation

4030

* with everything to the right of the path. Since no allocation

4033

* past the truncation point is possible until ext4_truncate()

4031

* past the truncation point is possible until ext4_truncate()

4034

* finishes, we may safely do the latter, but top of branch may

4032

* finishes, we may safely do the latter, but top of branch may

4035

* require special attention - pageout below the truncation point

4033

* require special attention - pageout below the truncation point

4036

* might try to populate it.

4034

* might try to populate it.

4037

*

4035

*

4038

* We atomically detach the top of branch from the tree, store the

4036

* We atomically detach the top of branch from the tree, store the

4039

* block number of its root in *@top, pointers to buffer_heads of

4037

* block number of its root in *@top, pointers to buffer_heads of

4040

* partially truncated blocks - in @chain[].bh and pointers to

4038

* partially truncated blocks - in @chain[].bh and pointers to

4041

* their last elements that should not be removed - in

4039

* their last elements that should not be removed - in

4042

* @chain[].p. Return value is the pointer to last filled element

4040

* @chain[].p. Return value is the pointer to last filled element

4043

* of @chain.

4041

* of @chain.

4044

*

4042

*

4045

* The work left to caller to do the actual freeing of subtrees:

4043

* The work left to caller to do the actual freeing of subtrees:

4046

* a) free the subtree starting from *@top

4044

* a) free the subtree starting from *@top

4047

* b) free the subtrees whose roots are stored in

4045

* b) free the subtrees whose roots are stored in

4048

* (@chain[i].p+1 .. end of @chain[i].bh->b_data)

4046

* (@chain[i].p+1 .. end of @chain[i].bh->b_data)

4049

* c) free the subtrees growing from the inode past the @chain[0].

4047

* c) free the subtrees growing from the inode past the @chain[0].

4050

* (no partially truncated stuff there). */

4048

* (no partially truncated stuff there). */

4051

4049

4052

static Indirect *ext4_find_shared(struct inode *inode, int depth,

4050

static Indirect *ext4_find_shared(struct inode *inode, int depth,

4053

ext4_lblk_t offsets[4], Indirect chain[4],

4051

ext4_lblk_t offsets[4], Indirect chain[4],

4054

__le32 *top)

4052

__le32 *top)

4055

{

4053

{

4056

Indirect *partial, *p;

4054

Indirect *partial, *p;

4057

int k, err;

4055

int k, err;

4058

4056

4059

*top = 0;

4057

*top = 0;

4060

/* Make k index the deepest non-null offset + 1 */

4058

/* Make k index the deepest non-null offset + 1 */

4061

for (k = depth; k > 1 && !offsets[k-1]; k--)

4059

for (k = depth; k > 1 && !offsets[k-1]; k--)

4062

;

4060

;

4063

partial = ext4_get_branch(inode, k, offsets, chain, &err);

4061

partial = ext4_get_branch(inode, k, offsets, chain, &err);

4064

/* Writer: pointers */

4062

/* Writer: pointers */

4065

if (!partial)

4063

if (!partial)

4066

partial = chain + k-1;

4064

partial = chain + k-1;

4067

/*

4065

/*

4068

* If the branch acquired continuation since we've looked at it -

4066

* If the branch acquired continuation since we've looked at it -

4069

* fine, it should all survive and (new) top doesn't belong to us.

4067

* fine, it should all survive and (new) top doesn't belong to us.

4070

*/

4068

*/

4071

if (!partial->key && *partial->p)

4069

if (!partial->key && *partial->p)

4072

/* Writer: end */

4070

/* Writer: end */

4073

goto no_top;

4071

goto no_top;

4074

for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)

4072

for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)

4075

;

4073

;

4076

/*

4074

/*

4077

* OK, we've found the last block that must survive. The rest of our

4075

* OK, we've found the last block that must survive. The rest of our

4078

* branch should be detached before unlocking. However, if that rest

4076

* branch should be detached before unlocking. However, if that rest

4079

* of branch is all ours and does not grow immediately from the inode

4077

* of branch is all ours and does not grow immediately from the inode

4080

* it's easier to cheat and just decrement partial->p.

4078

* it's easier to cheat and just decrement partial->p.

4081

*/

4079

*/

4082

if (p == chain + k - 1 && p > chain) {

4080

if (p == chain + k - 1 && p > chain) {

4083

p->p--;

4081

p->p--;

4084

} else {

4082

} else {

4085

*top = *p->p;

4083

*top = *p->p;

4086

/* Nope, don't do this in ext4. Must leave the tree intact */

4084

/* Nope, don't do this in ext4. Must leave the tree intact */

4087

#if 0

4085

#if 0

4088

*p->p = 0;

4086

*p->p = 0;

4089

#endif

4087

#endif

4090

}

4088

}

4091

/* Writer: end */

4089

/* Writer: end */

4092

4090

4093

while (partial > p) {

4091

while (partial > p) {

4094

brelse(partial->bh);

4092

brelse(partial->bh);

4095

partial--;

4093

partial--;

4096

}

4094

}

4097

no_top:

4095

no_top:

4098

return partial;

4096

return partial;

4099

}

4097

}

4100

4098

4101

/*

4099

/*

4102

* Zero a number of block pointers in either an inode or an indirect block.

4100

* Zero a number of block pointers in either an inode or an indirect block.

4103

* If we restart the transaction we must again get write access to the

4101

* If we restart the transaction we must again get write access to the

4104

* indirect block for further modification.

4102

* indirect block for further modification.

4105

*

4103

*

4106

* We release `count' blocks on disk, but (last - first) may be greater

4104

* We release `count' blocks on disk, but (last - first) may be greater

4107

* than `count' because there can be holes in there.

4105

* than `count' because there can be holes in there.

4108

*

4106

*

4109

* Return 0 on success, 1 on invalid block range

4107

* Return 0 on success, 1 on invalid block range

4110

* and < 0 on fatal error.

4108

* and < 0 on fatal error.

4111

*/

4109

*/

4112

static int ext4_clear_blocks(handle_t *handle, struct inode *inode,

4110

static int ext4_clear_blocks(handle_t *handle, struct inode *inode,

4113

struct buffer_head *bh,

4111

struct buffer_head *bh,

4114

ext4_fsblk_t block_to_free,

4112

ext4_fsblk_t block_to_free,

4115

unsigned long count, __le32 *first,

4113

unsigned long count, __le32 *first,

4116

__le32 *last)

4114

__le32 *last)

4117

{

4115

{

4118

__le32 *p;

4116

__le32 *p;

4119

int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;

4117

int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;

4120

int err;

4118

int err;

4121

4119

4122

if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))

4120

if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))

4123

flags |= EXT4_FREE_BLOCKS_METADATA;

4121

flags |= EXT4_FREE_BLOCKS_METADATA;

4124

4122

4125

if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,

4123

if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,

4126

count)) {

4124

count)) {

4127

EXT4_ERROR_INODE(inode, "attempt to clear invalid "

4125

EXT4_ERROR_INODE(inode, "attempt to clear invalid "

4128

"blocks %llu len %lu",

4126

"blocks %llu len %lu",

4129

(unsigned long long) block_to_free, count);

4127

(unsigned long long) block_to_free, count);

4130

return 1;

4128

return 1;

4131

}

4129

}

4132

4130

4133

if (try_to_extend_transaction(handle, inode)) {

4131

if (try_to_extend_transaction(handle, inode)) {

4134

if (bh) {

4132

if (bh) {

4135

BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");

4133

BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");

4136

err = ext4_handle_dirty_metadata(handle, inode, bh);

4134

err = ext4_handle_dirty_metadata(handle, inode, bh);

4137

if (unlikely(err))

4135

if (unlikely(err))

4138

goto out_err;

4136

goto out_err;

4139

}

4137

}

4140

err = ext4_mark_inode_dirty(handle, inode);

4138

err = ext4_mark_inode_dirty(handle, inode);

4141

if (unlikely(err))

4139

if (unlikely(err))

4142

goto out_err;

4140

goto out_err;

4143

err = ext4_truncate_restart_trans(handle, inode,

4141

err = ext4_truncate_restart_trans(handle, inode,

4144

blocks_for_truncate(inode));

4142

blocks_for_truncate(inode));

4145

if (unlikely(err))

4143

if (unlikely(err))

4146

goto out_err;

4144

goto out_err;

4147

if (bh) {

4145

if (bh) {

4148

BUFFER_TRACE(bh, "retaking write access");

4146

BUFFER_TRACE(bh, "retaking write access");

4149

err = ext4_journal_get_write_access(handle, bh);

4147

err = ext4_journal_get_write_access(handle, bh);

4150

if (unlikely(err))

4148

if (unlikely(err))

4151

goto out_err;

4149

goto out_err;

4152

}

4150

}

4153

}

4151

}

4154

4152

4155

for (p = first; p < last; p++)

4153

for (p = first; p < last; p++)

4156

*p = 0;

4154

*p = 0;

4157

4155

4158

ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);

4156

ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);

4159

return 0;

4157

return 0;

4160

out_err:

4158

out_err:

4161

ext4_std_error(inode->i_sb, err);

4159

ext4_std_error(inode->i_sb, err);

4162

return err;

4160

return err;

4163

}

4161

}

4164

4162

4165

/**

4163

/**

4166

* ext4_free_data - free a list of data blocks

4164

* ext4_free_data - free a list of data blocks

4167

* @handle: handle for this transaction

4165

* @handle: handle for this transaction

4168

* @inode: inode we are dealing with

4166

* @inode: inode we are dealing with

4169

* @this_bh: indirect buffer_head which contains *@first and *@last

4167

* @this_bh: indirect buffer_head which contains *@first and *@last

4170

* @first: array of block numbers

4168

* @first: array of block numbers

4171

* @last: points immediately past the end of array

4169

* @last: points immediately past the end of array

4172

*

4170

*

4173

* We are freeing all blocks referred from that array (numbers are stored as

4171

* We are freeing all blocks referred from that array (numbers are stored as

4174

* little-endian 32-bit) and updating @inode->i_blocks appropriately.

4172

* little-endian 32-bit) and updating @inode->i_blocks appropriately.

4175

*

4173

*

4176

* We accumulate contiguous runs of blocks to free. Conveniently, if these

4174

* We accumulate contiguous runs of blocks to free. Conveniently, if these

4177

* blocks are contiguous then releasing them at one time will only affect one

4175

* blocks are contiguous then releasing them at one time will only affect one

4178

* or two bitmap blocks (+ group descriptor(s) and superblock) and we won't

4176

* or two bitmap blocks (+ group descriptor(s) and superblock) and we won't

4179

* actually use a lot of journal space.

4177

* actually use a lot of journal space.

4180

*

4178

*

4181

* @this_bh will be %NULL if @first and @last point into the inode's direct

4179

* @this_bh will be %NULL if @first and @last point into the inode's direct

4182

* block pointers.

4180

* block pointers.

4183

*/

4181

*/

4184

static void ext4_free_data(handle_t *handle, struct inode *inode,

4182

static void ext4_free_data(handle_t *handle, struct inode *inode,

4185

struct buffer_head *this_bh,

4183

struct buffer_head *this_bh,

4186

__le32 *first, __le32 *last)

4184

__le32 *first, __le32 *last)

4187

{

4185

{

4188

ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */

4186

ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */

4189

unsigned long count = 0; /* Number of blocks in the run */

4187

unsigned long count = 0; /* Number of blocks in the run */

4190

__le32 *block_to_free_p = NULL; /* Pointer into inode/ind

4188

__le32 *block_to_free_p = NULL; /* Pointer into inode/ind

4191

corresponding to

4189

corresponding to

4192

block_to_free */

4190

block_to_free */

4193

ext4_fsblk_t nr; /* Current block # */

4191

ext4_fsblk_t nr; /* Current block # */

4194

__le32 *p; /* Pointer into inode/ind

4192

__le32 *p; /* Pointer into inode/ind

4195

for current block */

4193

for current block */

4196

int err = 0;

4194

int err = 0;

4197

4195

4198

if (this_bh) { /* For indirect block */

4196

if (this_bh) { /* For indirect block */

4199

BUFFER_TRACE(this_bh, "get_write_access");

4197

BUFFER_TRACE(this_bh, "get_write_access");

4200

err = ext4_journal_get_write_access(handle, this_bh);

4198

err = ext4_journal_get_write_access(handle, this_bh);

4201

/* Important: if we can't update the indirect pointers

4199

/* Important: if we can't update the indirect pointers

4202

* to the blocks, we can't free them. */

4200

* to the blocks, we can't free them. */

4203

if (err)

4201

if (err)

4204

return;

4202

return;

4205

}

4203

}

4206

4204

4207

for (p = first; p < last; p++) {

4205

for (p = first; p < last; p++) {

4208

nr = le32_to_cpu(*p);

4206

nr = le32_to_cpu(*p);

4209

if (nr) {

4207

if (nr) {

4210

/* accumulate blocks to free if they're contiguous */

4208

/* accumulate blocks to free if they're contiguous */

4211

if (count == 0) {

4209

if (count == 0) {

4212

block_to_free = nr;

4210

block_to_free = nr;

4213

block_to_free_p = p;

4211

block_to_free_p = p;

4214

count = 1;

4212

count = 1;

4215

} else if (nr == block_to_free + count) {

4213

} else if (nr == block_to_free + count) {

4216

count++;

4214

count++;

4217

} else {

4215

} else {

4218

err = ext4_clear_blocks(handle, inode, this_bh,

4216

err = ext4_clear_blocks(handle, inode, this_bh,

4219

block_to_free, count,

4217

block_to_free, count,

4220

block_to_free_p, p);

4218

block_to_free_p, p);

4221

if (err)

4219

if (err)

4222

break;

4220

break;

4223

block_to_free = nr;

4221

block_to_free = nr;

4224

block_to_free_p = p;

4222

block_to_free_p = p;

4225

count = 1;

4223

count = 1;

4226

}

4224

}

4227

}

4225

}

4228

}

4226

}

4229

4227

4230

if (!err && count > 0)

4228

if (!err && count > 0)

4231

err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,

4229

err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,

4232

count, block_to_free_p, p);

4230

count, block_to_free_p, p);

4233

if (err < 0)

4231

if (err < 0)

4234

/* fatal error */

4232

/* fatal error */

4235

return;

4233

return;

4236

4234

4237

if (this_bh) {

4235

if (this_bh) {

4238

BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");

4236

BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");

4239

4237

4240

/*

4238

/*

4241

* The buffer head should have an attached journal head at this

4239

* The buffer head should have an attached journal head at this

4242

* point. However, if the data is corrupted and an indirect

4240

* point. However, if the data is corrupted and an indirect

4243

* block pointed to itself, it would have been detached when

4241

* block pointed to itself, it would have been detached when

4244

* the block was cleared. Check for this instead of OOPSing.

4242

* the block was cleared. Check for this instead of OOPSing.

4245

*/

4243

*/

4246

if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))

4244

if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))

4247

ext4_handle_dirty_metadata(handle, inode, this_bh);

4245

ext4_handle_dirty_metadata(handle, inode, this_bh);

4248

else

4246

else

4249

EXT4_ERROR_INODE(inode,

4247

EXT4_ERROR_INODE(inode,

4250

"circular indirect block detected at "

4248

"circular indirect block detected at "

4251

"block %llu",

4249

"block %llu",

4252

(unsigned long long) this_bh->b_blocknr);

4250

(unsigned long long) this_bh->b_blocknr);

4253

}

4251

}

4254

}

4252

}

4255

4253

4256

/**

4254

/**

4257

* ext4_free_branches - free an array of branches

4255

* ext4_free_branches - free an array of branches

4258

* @handle: JBD handle for this transaction

4256

* @handle: JBD handle for this transaction

4259

* @inode: inode we are dealing with

4257

* @inode: inode we are dealing with

4260

* @parent_bh: the buffer_head which contains *@first and *@last

4258

* @parent_bh: the buffer_head which contains *@first and *@last

4261

* @first: array of block numbers

4259

* @first: array of block numbers

4262

* @last: pointer immediately past the end of array

4260

* @last: pointer immediately past the end of array

4263

* @depth: depth of the branches to free

4261

* @depth: depth of the branches to free

4264

*

4262

*

4265

* We are freeing all blocks referred from these branches (numbers are

4263

* We are freeing all blocks referred from these branches (numbers are

4266

* stored as little-endian 32-bit) and updating @inode->i_blocks

4264

* stored as little-endian 32-bit) and updating @inode->i_blocks

4267

* appropriately.

4265

* appropriately.

4268

*/

4266

*/

4269

static void ext4_free_branches(handle_t *handle, struct inode *inode,

4267

static void ext4_free_branches(handle_t *handle, struct inode *inode,

4270

struct buffer_head *parent_bh,

4268

struct buffer_head *parent_bh,

4271

__le32 *first, __le32 *last, int depth)

4269

__le32 *first, __le32 *last, int depth)

4272

{

4270

{

4273

ext4_fsblk_t nr;

4271

ext4_fsblk_t nr;

4274

__le32 *p;

4272

__le32 *p;

4275

4273

4276

if (ext4_handle_is_aborted(handle))

4274

if (ext4_handle_is_aborted(handle))

4277

return;

4275

return;

4278

4276

4279

if (depth--) {

4277

if (depth--) {

4280

struct buffer_head *bh;

4278

struct buffer_head *bh;

4281

int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);

4279

int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);

4282

p = last;

4280

p = last;

4283

while (--p >= first) {

4281

while (--p >= first) {

4284

nr = le32_to_cpu(*p);

4282

nr = le32_to_cpu(*p);

4285

if (!nr)

4283

if (!nr)

4286

continue; /* A hole */

4284

continue; /* A hole */

4287

4285

4288

if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),

4286

if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),

4289

nr, 1)) {

4287

nr, 1)) {

4290

EXT4_ERROR_INODE(inode,

4288

EXT4_ERROR_INODE(inode,

4291

"invalid indirect mapped "

4289

"invalid indirect mapped "

4292

"block %lu (level %d)",

4290

"block %lu (level %d)",

4293

(unsigned long) nr, depth);

4291

(unsigned long) nr, depth);

4294

break;

4292

break;

4295

}

4293

}

4296

4294

4297

/* Go read the buffer for the next level down */

4295

/* Go read the buffer for the next level down */

4298

bh = sb_bread(inode->i_sb, nr);

4296

bh = sb_bread(inode->i_sb, nr);

4299

4297

4300

/*

4298

/*

4301

* A read failure? Report error and clear slot

4299

* A read failure? Report error and clear slot

4302

* (should be rare).

4300

* (should be rare).

4303

*/

4301

*/

4304

if (!bh) {

4302

if (!bh) {

4305

EXT4_ERROR_INODE_BLOCK(inode, nr,

4303

EXT4_ERROR_INODE_BLOCK(inode, nr,

4306

"Read failure");

4304

"Read failure");

4307

continue;

4305

continue;

4308

}

4306

}

4309

4307

4310

/* This zaps the entire block. Bottom up. */

4308

/* This zaps the entire block. Bottom up. */

4311

BUFFER_TRACE(bh, "free child branches");

4309

BUFFER_TRACE(bh, "free child branches");

4312

ext4_free_branches(handle, inode, bh,

4310

ext4_free_branches(handle, inode, bh,

4313

(__le32 *) bh->b_data,

4311

(__le32 *) bh->b_data,

4314

(__le32 *) bh->b_data + addr_per_block,

4312

(__le32 *) bh->b_data + addr_per_block,

4315

depth);

4313

depth);

4316

brelse(bh);

4314

brelse(bh);

4317

4315

4318

/*

4316

/*

4319

* Everything below this this pointer has been

4317

* Everything below this this pointer has been

4320

* released. Now let this top-of-subtree go.

4318

* released. Now let this top-of-subtree go.

4321

*

4319

*

4322

* We want the freeing of this indirect block to be

4320

* We want the freeing of this indirect block to be

4323

* atomic in the journal with the updating of the

4321

* atomic in the journal with the updating of the

4324

* bitmap block which owns it. So make some room in

4322

* bitmap block which owns it. So make some room in

4325

* the journal.

4323

* the journal.

4326

*

4324

*

4327

* We zero the parent pointer *after* freeing its

4325

* We zero the parent pointer *after* freeing its

4328

* pointee in the bitmaps, so if extend_transaction()

4326

* pointee in the bitmaps, so if extend_transaction()

4329

* for some reason fails to put the bitmap changes and

4327

* for some reason fails to put the bitmap changes and

4330

* the release into the same transaction, recovery

4328

* the release into the same transaction, recovery

4331

* will merely complain about releasing a free block,

4329

* will merely complain about releasing a free block,

4332

* rather than leaking blocks.

4330

* rather than leaking blocks.

4333

*/

4331

*/

4334

if (ext4_handle_is_aborted(handle))

4332

if (ext4_handle_is_aborted(handle))

4335

return;

4333

return;

4336

if (try_to_extend_transaction(handle, inode)) {

4334

if (try_to_extend_transaction(handle, inode)) {

4337

ext4_mark_inode_dirty(handle, inode);

4335

ext4_mark_inode_dirty(handle, inode);

4338

ext4_truncate_restart_trans(handle, inode,

4336

ext4_truncate_restart_trans(handle, inode,

4339

blocks_for_truncate(inode));

4337

blocks_for_truncate(inode));

4340

}

4338

}

4341

4339

4342

/*

4340

/*

4343

* The forget flag here is critical because if

4341

* The forget flag here is critical because if

4344

* we are journaling (and not doing data

4342

* we are journaling (and not doing data

4345

* journaling), we have to make sure a revoke

4343

* journaling), we have to make sure a revoke

4346

* record is written to prevent the journal

4344

* record is written to prevent the journal

4347

* replay from overwriting the (former)

4345

* replay from overwriting the (former)

4348

* indirect block if it gets reallocated as a

4346

* indirect block if it gets reallocated as a

4349

* data block. This must happen in the same

4347

* data block. This must happen in the same

4350

* transaction where the data blocks are

4348

* transaction where the data blocks are

4351

* actually freed.

4349

* actually freed.

4352

*/

4350

*/

4353

ext4_free_blocks(handle, inode, NULL, nr, 1,

4351

ext4_free_blocks(handle, inode, NULL, nr, 1,

4354

EXT4_FREE_BLOCKS_METADATA|

4352

EXT4_FREE_BLOCKS_METADATA|

4355

EXT4_FREE_BLOCKS_FORGET);

4353

EXT4_FREE_BLOCKS_FORGET);

4356

4354

4357

if (parent_bh) {

4355

if (parent_bh) {

4358

/*

4356

/*

4359

* The block which we have just freed is

4357

* The block which we have just freed is

4360

* pointed to by an indirect block: journal it

4358

* pointed to by an indirect block: journal it

4361

*/

4359

*/

4362

BUFFER_TRACE(parent_bh, "get_write_access");

4360

BUFFER_TRACE(parent_bh, "get_write_access");

4363

if (!ext4_journal_get_write_access(handle,

4361

if (!ext4_journal_get_write_access(handle,

4364

parent_bh)){

4362

parent_bh)){

4365

*p = 0;

4363

*p = 0;

4366

BUFFER_TRACE(parent_bh,

4364

BUFFER_TRACE(parent_bh,

4367

"call ext4_handle_dirty_metadata");

4365

"call ext4_handle_dirty_metadata");

4368

ext4_handle_dirty_metadata(handle,

4366

ext4_handle_dirty_metadata(handle,

4369

inode,

4367

inode,

4370

parent_bh);

4368

parent_bh);

4371

}

4369

}

4372

}

4370

}

4373

}

4371

}

4374

} else {

4372

} else {

4375

/* We have reached the bottom of the tree. */

4373

/* We have reached the bottom of the tree. */

4376

BUFFER_TRACE(parent_bh, "free data blocks");

4374

BUFFER_TRACE(parent_bh, "free data blocks");

4377

ext4_free_data(handle, inode, parent_bh, first, last);

4375

ext4_free_data(handle, inode, parent_bh, first, last);

4378

}

4376

}

4379

}

4377

}

4380

4378

4381

int ext4_can_truncate(struct inode *inode)

4379

int ext4_can_truncate(struct inode *inode)

4382

{

4380

{

4383

if (IS_APPEND(inode) || IS_IMMUTABLE(inode))

4381

if (IS_APPEND(inode) || IS_IMMUTABLE(inode))

4384

return 0;

4382

return 0;

4385

if (S_ISREG(inode->i_mode))

4383

if (S_ISREG(inode->i_mode))

4386

return 1;

4384

return 1;

4387

if (S_ISDIR(inode->i_mode))

4385

if (S_ISDIR(inode->i_mode))

4388

return 1;

4386

return 1;

4389

if (S_ISLNK(inode->i_mode))

4387

if (S_ISLNK(inode->i_mode))

4390

return !ext4_inode_is_fast_symlink(inode);

4388

return !ext4_inode_is_fast_symlink(inode);

4391

return 0;

4389

return 0;

4392

}

4390

}

4393

4391

4394

/*

4392

/*

4395

* ext4_truncate()

4393

* ext4_truncate()

4396

*

4394

*

4397

* We block out ext4_get_block() block instantiations across the entire

4395

* We block out ext4_get_block() block instantiations across the entire

4398

* transaction, and VFS/VM ensures that ext4_truncate() cannot run

4396

* transaction, and VFS/VM ensures that ext4_truncate() cannot run

4399

* simultaneously on behalf of the same inode.

4397

* simultaneously on behalf of the same inode.

4400

*

4398

*

4401

* As we work through the truncate and commmit bits of it to the journal there

4399

* As we work through the truncate and commmit bits of it to the journal there

4402

* is one core, guiding principle: the file's tree must always be consistent on

4400

* is one core, guiding principle: the file's tree must always be consistent on

4403

* disk. We must be able to restart the truncate after a crash.

4401

* disk. We must be able to restart the truncate after a crash.

4404

*

4402

*

4405

* The file's tree may be transiently inconsistent in memory (although it

4403

* The file's tree may be transiently inconsistent in memory (although it

4406

* probably isn't), but whenever we close off and commit a journal transaction,

4404

* probably isn't), but whenever we close off and commit a journal transaction,

4407

* the contents of (the filesystem + the journal) must be consistent and

4405

* the contents of (the filesystem + the journal) must be consistent and

4408

* restartable. It's pretty simple, really: bottom up, right to left (although

4406

* restartable. It's pretty simple, really: bottom up, right to left (although

4409

* left-to-right works OK too).

4407

* left-to-right works OK too).

4410

*

4408

*

4411

* Note that at recovery time, journal replay occurs *before* the restart of

4409

* Note that at recovery time, journal replay occurs *before* the restart of

4412

* truncate against the orphan inode list.

4410

* truncate against the orphan inode list.

4413

*

4411

*

4414

* The committed inode has the new, desired i_size (which is the same as

4412

* The committed inode has the new, desired i_size (which is the same as

4415

* i_disksize in this case). After a crash, ext4_orphan_cleanup() will see

4413

* i_disksize in this case). After a crash, ext4_orphan_cleanup() will see

4416

* that this inode's truncate did not complete and it will again call

4414

* that this inode's truncate did not complete and it will again call

4417

* ext4_truncate() to have another go. So there will be instantiated blocks

4415

* ext4_truncate() to have another go. So there will be instantiated blocks

4418

* to the right of the truncation point in a crashed ext4 filesystem. But

4416

* to the right of the truncation point in a crashed ext4 filesystem. But

4419

* that's fine - as long as they are linked from the inode, the post-crash

4417

* that's fine - as long as they are linked from the inode, the post-crash

4420

* ext4_truncate() run will find them and release them.

4418

* ext4_truncate() run will find them and release them.

4421

*/

4419

*/

4422

void ext4_truncate(struct inode *inode)

4420

void ext4_truncate(struct inode *inode)

4423

{

4421

{

4424

handle_t *handle;

4422

handle_t *handle;

4425

struct ext4_inode_info *ei = EXT4_I(inode);

4423

struct ext4_inode_info *ei = EXT4_I(inode);

4426

__le32 *i_data = ei->i_data;

4424

__le32 *i_data = ei->i_data;

4427

int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);

4425

int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);

4428

struct address_space *mapping = inode->i_mapping;

4426

struct address_space *mapping = inode->i_mapping;

4429

ext4_lblk_t offsets[4];

4427

ext4_lblk_t offsets[4];

4430

Indirect chain[4];

4428

Indirect chain[4];

4431

Indirect *partial;

4429

Indirect *partial;

4432

__le32 nr = 0;

4430

__le32 nr = 0;

4433

int n = 0;

4431

int n = 0;

4434

ext4_lblk_t last_block, max_block;

4432

ext4_lblk_t last_block, max_block;

4435

unsigned blocksize = inode->i_sb->s_blocksize;

4433

unsigned blocksize = inode->i_sb->s_blocksize;

4436

4434

4437

trace_ext4_truncate_enter(inode);

4435

trace_ext4_truncate_enter(inode);

4438

4436

4439

if (!ext4_can_truncate(inode))

4437

if (!ext4_can_truncate(inode))

4440

return;

4438

return;

4441

4439

4442

ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);

4440

ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);

4443

4441

4444

if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))

4442

if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))

4445

ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);

4443

ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);

4446

4444

4447

if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {

4445

if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {

4448

ext4_ext_truncate(inode);

4446

ext4_ext_truncate(inode);

4449

trace_ext4_truncate_exit(inode);

4447

trace_ext4_truncate_exit(inode);

4450

return;

4448

return;

4451

}

4449

}

4452

4450

4453

handle = start_transaction(inode);

4451

handle = start_transaction(inode);

4454

if (IS_ERR(handle))

4452

if (IS_ERR(handle))

4455

return; /* AKPM: return what? */

4453

return; /* AKPM: return what? */

4456

4454

4457

last_block = (inode->i_size + blocksize-1)

4455

last_block = (inode->i_size + blocksize-1)

4458

>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);

4456

>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);

4459

max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)

4457

max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)

4460

>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);

4458

>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);

4461

4459

4462

if (inode->i_size & (blocksize - 1))

4460

if (inode->i_size & (blocksize - 1))

4463

if (ext4_block_truncate_page(handle, mapping, inode->i_size))

4461

if (ext4_block_truncate_page(handle, mapping, inode->i_size))

4464

goto out_stop;

4462

goto out_stop;

4465

4463

4466

if (last_block != max_block) {

4464

if (last_block != max_block) {

4467

n = ext4_block_to_path(inode, last_block, offsets, NULL);

4465

n = ext4_block_to_path(inode, last_block, offsets, NULL);

4468

if (n == 0)

4466

if (n == 0)

4469

goto out_stop; /* error */

4467

goto out_stop; /* error */

4470

}

4468

}

4471

4469

4472

/*

4470

/*

4473

* OK. This truncate is going to happen. We add the inode to the

4471

* OK. This truncate is going to happen. We add the inode to the

4474

* orphan list, so that if this truncate spans multiple transactions,

4472

* orphan list, so that if this truncate spans multiple transactions,

4475

* and we crash, we will resume the truncate when the filesystem

4473

* and we crash, we will resume the truncate when the filesystem

4476

* recovers. It also marks the inode dirty, to catch the new size.

4474

* recovers. It also marks the inode dirty, to catch the new size.

4477

*

4475

*

4478

* Implication: the file must always be in a sane, consistent

4476

* Implication: the file must always be in a sane, consistent

4479

* truncatable state while each transaction commits.

4477

* truncatable state while each transaction commits.

4480

*/

4478

*/

4481

if (ext4_orphan_add(handle, inode))

4479

if (ext4_orphan_add(handle, inode))

4482

goto out_stop;

4480

goto out_stop;

4483

4481

4484

/*

4482

/*

4485

* From here we block out all ext4_get_block() callers who want to

4483

* From here we block out all ext4_get_block() callers who want to

4486

* modify the block allocation tree.

4484

* modify the block allocation tree.

4487

*/

4485

*/

4488

down_write(&ei->i_data_sem);

4486

down_write(&ei->i_data_sem);

4489

4487

4490

ext4_discard_preallocations(inode);

4488

ext4_discard_preallocations(inode);

4491

4489

4492

/*

4490

/*

4493

* The orphan list entry will now protect us from any crash which

4491

* The orphan list entry will now protect us from any crash which

4494

* occurs before the truncate completes, so it is now safe to propagate

4492

* occurs before the truncate completes, so it is now safe to propagate

4495

* the new, shorter inode size (held for now in i_size) into the

4493

* the new, shorter inode size (held for now in i_size) into the

4496

* on-disk inode. We do this via i_disksize, which is the value which

4494

* on-disk inode. We do this via i_disksize, which is the value which

4497

* ext4 *really* writes onto the disk inode.

4495

* ext4 *really* writes onto the disk inode.

4498

*/

4496

*/

4499

ei->i_disksize = inode->i_size;

4497

ei->i_disksize = inode->i_size;

4500

4498

4501

if (last_block == max_block) {

4499

if (last_block == max_block) {

4502

/*

4500

/*

4503

* It is unnecessary to free any data blocks if last_block is

4501

* It is unnecessary to free any data blocks if last_block is

4504

* equal to the indirect block limit.

4502

* equal to the indirect block limit.

4505

*/

4503

*/

4506

goto out_unlock;

4504

goto out_unlock;

4507

} else if (n == 1) { /* direct blocks */

4505

} else if (n == 1) { /* direct blocks */

4508

ext4_free_data(handle, inode, NULL, i_data+offsets[0],

4506

ext4_free_data(handle, inode, NULL, i_data+offsets[0],

4509

i_data + EXT4_NDIR_BLOCKS);

4507

i_data + EXT4_NDIR_BLOCKS);

4510

goto do_indirects;

4508

goto do_indirects;

4511

}

4509

}

4512

4510

4513

partial = ext4_find_shared(inode, n, offsets, chain, &nr);

4511

partial = ext4_find_shared(inode, n, offsets, chain, &nr);

4514

/* Kill the top of shared branch (not detached) */

4512

/* Kill the top of shared branch (not detached) */

4515

if (nr) {

4513

if (nr) {

4516

if (partial == chain) {

4514

if (partial == chain) {

4517

/* Shared branch grows from the inode */

4515

/* Shared branch grows from the inode */

4518

ext4_free_branches(handle, inode, NULL,

4516

ext4_free_branches(handle, inode, NULL,

4519

&nr, &nr+1, (chain+n-1) - partial);

4517

&nr, &nr+1, (chain+n-1) - partial);

4520

*partial->p = 0;

4518

*partial->p = 0;

4521

/*

4519

/*

4522

* We mark the inode dirty prior to restart,

4520

* We mark the inode dirty prior to restart,

4523

* and prior to stop. No need for it here.

4521

* and prior to stop. No need for it here.

4524

*/

4522

*/

4525

} else {

4523

} else {

4526

/* Shared branch grows from an indirect block */

4524

/* Shared branch grows from an indirect block */

4527

BUFFER_TRACE(partial->bh, "get_write_access");

4525

BUFFER_TRACE(partial->bh, "get_write_access");

4528

ext4_free_branches(handle, inode, partial->bh,

4526

ext4_free_branches(handle, inode, partial->bh,

4529

partial->p,

4527

partial->p,

4530

partial->p+1, (chain+n-1) - partial);

4528

partial->p+1, (chain+n-1) - partial);

4531

}

4529

}

4532

}

4530

}

4533

/* Clear the ends of indirect blocks on the shared branch */

4531

/* Clear the ends of indirect blocks on the shared branch */

4534

while (partial > chain) {

4532

while (partial > chain) {

4535

ext4_free_branches(handle, inode, partial->bh, partial->p + 1,

4533

ext4_free_branches(handle, inode, partial->bh, partial->p + 1,

4536

(__le32*)partial->bh->b_data+addr_per_block,

4534

(__le32*)partial->bh->b_data+addr_per_block,

4537

(chain+n-1) - partial);

4535

(chain+n-1) - partial);

4538

BUFFER_TRACE(partial->bh, "call brelse");

4536

BUFFER_TRACE(partial->bh, "call brelse");

4539

brelse(partial->bh);

4537

brelse(partial->bh);

4540

partial--;

4538

partial--;

4541

}

4539

}

4542

do_indirects:

4540

do_indirects:

4543

/* Kill the remaining (whole) subtrees */

4541

/* Kill the remaining (whole) subtrees */

4544

switch (offsets[0]) {

4542

switch (offsets[0]) {

4545

default:

4543

default:

4546

nr = i_data[EXT4_IND_BLOCK];

4544

nr = i_data[EXT4_IND_BLOCK];

4547

if (nr) {

4545

if (nr) {

4548

ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);

4546

ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);

4549

i_data[EXT4_IND_BLOCK] = 0;

4547

i_data[EXT4_IND_BLOCK] = 0;

4550

}

4548

}

4551

case EXT4_IND_BLOCK:

4549

case EXT4_IND_BLOCK:

4552

nr = i_data[EXT4_DIND_BLOCK];

4550

nr = i_data[EXT4_DIND_BLOCK];

4553

if (nr) {

4551

if (nr) {

4554

ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);

4552

ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);

4555

i_data[EXT4_DIND_BLOCK] = 0;

4553

i_data[EXT4_DIND_BLOCK] = 0;

4556

}

4554

}

4557

case EXT4_DIND_BLOCK:

4555

case EXT4_DIND_BLOCK:

4558

nr = i_data[EXT4_TIND_BLOCK];

4556

nr = i_data[EXT4_TIND_BLOCK];

4559

if (nr) {

4557

if (nr) {

4560

ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);

4558

ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);

4561

i_data[EXT4_TIND_BLOCK] = 0;

4559

i_data[EXT4_TIND_BLOCK] = 0;

4562

}

4560

}

4563

case EXT4_TIND_BLOCK:

4561

case EXT4_TIND_BLOCK:

4564

;

4562

;

4565

}

4563

}

4566

4564

4567

out_unlock:

4565

out_unlock:

4568

up_write(&ei->i_data_sem);

4566

up_write(&ei->i_data_sem);

4569

inode->i_mtime = inode->i_ctime = ext4_current_time(inode);

4567

inode->i_mtime = inode->i_ctime = ext4_current_time(inode);

4570

ext4_mark_inode_dirty(handle, inode);

4568

ext4_mark_inode_dirty(handle, inode);

4571

4569

4572

/*

4570

/*

4573

* In a multi-transaction truncate, we only make the final transaction

4571

* In a multi-transaction truncate, we only make the final transaction

4574

* synchronous

4572

* synchronous

4575

*/

4573

*/

4576

if (IS_SYNC(inode))

4574

if (IS_SYNC(inode))

4577

ext4_handle_sync(handle);

4575

ext4_handle_sync(handle);

4578

out_stop:

4576

out_stop:

4579

/*

4577

/*

4580

* If this was a simple ftruncate(), and the file will remain alive

4578

* If this was a simple ftruncate(), and the file will remain alive

4581

* then we need to clear up the orphan record which we created above.

4579

* then we need to clear up the orphan record which we created above.

4582

* However, if this was a real unlink then we were called by

4580

* However, if this was a real unlink then we were called by

4583

* ext4_delete_inode(), and we allow that function to clean up the

4581

* ext4_delete_inode(), and we allow that function to clean up the

4584

* orphan info for us.

4582

* orphan info for us.

4585

*/

4583

*/

4586

if (inode->i_nlink)

4584

if (inode->i_nlink)

4587

ext4_orphan_del(handle, inode);

4585

ext4_orphan_del(handle, inode);

4588

4586

4589

ext4_journal_stop(handle);

4587

ext4_journal_stop(handle);

4590

trace_ext4_truncate_exit(inode);

4588

trace_ext4_truncate_exit(inode);

4591

}

4589

}

4592

4590

4593

/*

4591

/*

4594

* ext4_get_inode_loc returns with an extra refcount against the inode's

4592

* ext4_get_inode_loc returns with an extra refcount against the inode's

4595

* underlying buffer_head on success. If 'in_mem' is true, we have all

4593

* underlying buffer_head on success. If 'in_mem' is true, we have all

4596

* data in memory that is needed to recreate the on-disk version of this

4594

* data in memory that is needed to recreate the on-disk version of this

4597

* inode.

4595

* inode.

4598

*/

4596

*/

4599

static int __ext4_get_inode_loc(struct inode *inode,

4597

static int __ext4_get_inode_loc(struct inode *inode,

4600

struct ext4_iloc *iloc, int in_mem)

4598

struct ext4_iloc *iloc, int in_mem)

4601

{

4599

{

4602

struct ext4_group_desc *gdp;

4600

struct ext4_group_desc *gdp;

4603

struct buffer_head *bh;

4601

struct buffer_head *bh;

4604

struct super_block *sb = inode->i_sb;

4602

struct super_block *sb = inode->i_sb;

4605

ext4_fsblk_t block;

4603

ext4_fsblk_t block;

4606

int inodes_per_block, inode_offset;

4604

int inodes_per_block, inode_offset;

4607

4605

4608

iloc->bh = NULL;

4606

iloc->bh = NULL;

4609

if (!ext4_valid_inum(sb, inode->i_ino))

4607

if (!ext4_valid_inum(sb, inode->i_ino))

4610

return -EIO;

4608

return -EIO;

4611

4609

4612

iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);

4610

iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);

4613

gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);

4611

gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);

4614

if (!gdp)

4612

if (!gdp)

4615

return -EIO;

4613

return -EIO;

4616

4614

4617

/*

4615

/*

4618

* Figure out the offset within the block group inode table

4616

* Figure out the offset within the block group inode table

4619

*/

4617

*/

4620

inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;

4618

inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;

4621

inode_offset = ((inode->i_ino - 1) %

4619

inode_offset = ((inode->i_ino - 1) %

4622

EXT4_INODES_PER_GROUP(sb));

4620

EXT4_INODES_PER_GROUP(sb));

4623

block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);

4621

block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);

4624

iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);

4622

iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);

4625

4623

4626

bh = sb_getblk(sb, block);

4624

bh = sb_getblk(sb, block);

4627

if (!bh) {

4625

if (!bh) {

4628

EXT4_ERROR_INODE_BLOCK(inode, block,

4626

EXT4_ERROR_INODE_BLOCK(inode, block,

4629

"unable to read itable block");

4627

"unable to read itable block");

4630

return -EIO;

4628

return -EIO;

4631

}

4629

}

4632

if (!buffer_uptodate(bh)) {

4630

if (!buffer_uptodate(bh)) {

4633

lock_buffer(bh);

4631

lock_buffer(bh);

4634

4632

4635

/*

4633

/*

4636

* If the buffer has the write error flag, we have failed

4634

* If the buffer has the write error flag, we have failed

4637

* to write out another inode in the same block. In this

4635

* to write out another inode in the same block. In this

4638

* case, we don't have to read the block because we may

4636

* case, we don't have to read the block because we may

4639

* read the old inode data successfully.

4637

* read the old inode data successfully.

4640

*/

4638

*/

4641

if (buffer_write_io_error(bh) && !buffer_uptodate(bh))

4639

if (buffer_write_io_error(bh) && !buffer_uptodate(bh))

4642

set_buffer_uptodate(bh);

4640

set_buffer_uptodate(bh);

4643

4641

4644

if (buffer_uptodate(bh)) {

4642

if (buffer_uptodate(bh)) {

4645

/* someone brought it uptodate while we waited */

4643

/* someone brought it uptodate while we waited */

4646

unlock_buffer(bh);

4644

unlock_buffer(bh);

4647

goto has_buffer;

4645

goto has_buffer;

4648

}

4646

}

4649

4647

4650

/*

4648

/*

4651

* If we have all information of the inode in memory and this

4649

* If we have all information of the inode in memory and this

4652

* is the only valid inode in the block, we need not read the

4650

* is the only valid inode in the block, we need not read the

4653

* block.

4651

* block.

4654

*/

4652

*/

4655

if (in_mem) {

4653

if (in_mem) {

4656

struct buffer_head *bitmap_bh;

4654

struct buffer_head *bitmap_bh;

4657

int i, start;

4655

int i, start;

4658

4656

4659

start = inode_offset & ~(inodes_per_block - 1);

4657

start = inode_offset & ~(inodes_per_block - 1);

4660

4658

4661

/* Is the inode bitmap in cache? */

4659

/* Is the inode bitmap in cache? */

4662

bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));

4660

bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));

4663

if (!bitmap_bh)

4661

if (!bitmap_bh)

4664

goto make_io;

4662

goto make_io;

4665

4663

4666

/*

4664

/*

4667

* If the inode bitmap isn't in cache then the

4665

* If the inode bitmap isn't in cache then the

4668

* optimisation may end up performing two reads instead

4666

* optimisation may end up performing two reads instead

4669

* of one, so skip it.

4667

* of one, so skip it.

4670

*/

4668

*/

4671

if (!buffer_uptodate(bitmap_bh)) {

4669

if (!buffer_uptodate(bitmap_bh)) {

4672

brelse(bitmap_bh);

4670

brelse(bitmap_bh);

4673

goto make_io;

4671

goto make_io;

4674

}

4672

}

4675

for (i = start; i < start + inodes_per_block; i++) {

4673

for (i = start; i < start + inodes_per_block; i++) {

4676

if (i == inode_offset)

4674

if (i == inode_offset)

4677

continue;

4675

continue;

4678

if (ext4_test_bit(i, bitmap_bh->b_data))

4676

if (ext4_test_bit(i, bitmap_bh->b_data))

4679

break;

4677

break;

4680

}

4678

}

4681

brelse(bitmap_bh);

4679

brelse(bitmap_bh);

4682

if (i == start + inodes_per_block) {

4680

if (i == start + inodes_per_block) {

4683

/* all other inodes are free, so skip I/O */

4681

/* all other inodes are free, so skip I/O */

4684

memset(bh->b_data, 0, bh->b_size);

4682

memset(bh->b_data, 0, bh->b_size);

4685

set_buffer_uptodate(bh);

4683

set_buffer_uptodate(bh);

4686

unlock_buffer(bh);

4684

unlock_buffer(bh);

4687

goto has_buffer;

4685

goto has_buffer;

4688

}

4686

}

4689

}

4687

}

4690

4688

4691

make_io:

4689

make_io:

4692

/*

4690

/*

4693

* If we need to do any I/O, try to pre-readahead extra

4691

* If we need to do any I/O, try to pre-readahead extra

4694

* blocks from the inode table.

4692

* blocks from the inode table.

4695

*/

4693

*/

4696

if (EXT4_SB(sb)->s_inode_readahead_blks) {

4694

if (EXT4_SB(sb)->s_inode_readahead_blks) {

4697

ext4_fsblk_t b, end, table;

4695

ext4_fsblk_t b, end, table;

4698

unsigned num;

4696

unsigned num;

4699

4697

4700

table = ext4_inode_table(sb, gdp);

4698

table = ext4_inode_table(sb, gdp);

4701

/* s_inode_readahead_blks is always a power of 2 */

4699

/* s_inode_readahead_blks is always a power of 2 */

4702

b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);

4700

b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);

4703

if (table > b)

4701

if (table > b)

4704

b = table;

4702

b = table;

4705

end = b + EXT4_SB(sb)->s_inode_readahead_blks;

4703

end = b + EXT4_SB(sb)->s_inode_readahead_blks;

4706

num = EXT4_INODES_PER_GROUP(sb);

4704

num = EXT4_INODES_PER_GROUP(sb);

4707

if (EXT4_HAS_RO_COMPAT_FEATURE(sb,

4705

if (EXT4_HAS_RO_COMPAT_FEATURE(sb,

4708

EXT4_FEATURE_RO_COMPAT_GDT_CSUM))

4706

EXT4_FEATURE_RO_COMPAT_GDT_CSUM))

4709

num -= ext4_itable_unused_count(sb, gdp);

4707

num -= ext4_itable_unused_count(sb, gdp);

4710

table += num / inodes_per_block;

4708

table += num / inodes_per_block;

4711

if (end > table)

4709

if (end > table)

4712

end = table;

4710

end = table;

4713

while (b <= end)

4711

while (b <= end)

4714

sb_breadahead(sb, b++);

4712

sb_breadahead(sb, b++);

4715

}

4713

}

4716

4714

4717

/*

4715

/*

4718

* There are other valid inodes in the buffer, this inode

4716

* There are other valid inodes in the buffer, this inode

4719

* has in-inode xattrs, or we don't have this inode in memory.

4717

* has in-inode xattrs, or we don't have this inode in memory.

4720

* Read the block from disk.

4718

* Read the block from disk.

4721

*/

4719

*/

4722

trace_ext4_load_inode(inode);

4720

trace_ext4_load_inode(inode);

4723

get_bh(bh);

4721

get_bh(bh);

4724

bh->b_end_io = end_buffer_read_sync;

4722

bh->b_end_io = end_buffer_read_sync;

4725

submit_bh(READ_META, bh);

4723

submit_bh(READ_META, bh);

4726

wait_on_buffer(bh);

4724

wait_on_buffer(bh);

4727

if (!buffer_uptodate(bh)) {

4725

if (!buffer_uptodate(bh)) {

4728

EXT4_ERROR_INODE_BLOCK(inode, block,

4726

EXT4_ERROR_INODE_BLOCK(inode, block,

4729

"unable to read itable block");

4727

"unable to read itable block");

4730

brelse(bh);

4728

brelse(bh);

4731

return -EIO;

4729

return -EIO;

4732

}

4730

}

4733

}

4731

}

4734

has_buffer:

4732

has_buffer:

4735

iloc->bh = bh;

4733

iloc->bh = bh;

4736

return 0;

4734

return 0;

4737

}

4735

}

4738

4736

4739

int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)

4737

int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)

4740

{

4738

{

4741

/* We have all inode data except xattrs in memory here. */

4739

/* We have all inode data except xattrs in memory here. */

4742

return __ext4_get_inode_loc(inode, iloc,

4740

return __ext4_get_inode_loc(inode, iloc,

4743

!ext4_test_inode_state(inode, EXT4_STATE_XATTR));

4741

!ext4_test_inode_state(inode, EXT4_STATE_XATTR));

4744

}

4742

}

4745

4743

4746

void ext4_set_inode_flags(struct inode *inode)

4744

void ext4_set_inode_flags(struct inode *inode)

4747

{

4745

{

4748

unsigned int flags = EXT4_I(inode)->i_flags;

4746

unsigned int flags = EXT4_I(inode)->i_flags;

4749

4747

4750

inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);

4748

inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);

4751

if (flags & EXT4_SYNC_FL)

4749

if (flags & EXT4_SYNC_FL)

4752

inode->i_flags |= S_SYNC;

4750

inode->i_flags |= S_SYNC;

4753

if (flags & EXT4_APPEND_FL)

4751

if (flags & EXT4_APPEND_FL)

4754

inode->i_flags |= S_APPEND;

4752

inode->i_flags |= S_APPEND;

4755

if (flags & EXT4_IMMUTABLE_FL)

4753

if (flags & EXT4_IMMUTABLE_FL)

4756

inode->i_flags |= S_IMMUTABLE;

4754

inode->i_flags |= S_IMMUTABLE;

4757

if (flags & EXT4_NOATIME_FL)

4755

if (flags & EXT4_NOATIME_FL)

4758

inode->i_flags |= S_NOATIME;

4756

inode->i_flags |= S_NOATIME;

4759

if (flags & EXT4_DIRSYNC_FL)

4757

if (flags & EXT4_DIRSYNC_FL)

4760

inode->i_flags |= S_DIRSYNC;

4758

inode->i_flags |= S_DIRSYNC;

4761

}

4759

}

4762

4760

4763

/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */

4761

/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */

4764

void ext4_get_inode_flags(struct ext4_inode_info *ei)

4762

void ext4_get_inode_flags(struct ext4_inode_info *ei)

4765

{

4763

{

4766

unsigned int vfs_fl;

4764

unsigned int vfs_fl;

4767

unsigned long old_fl, new_fl;

4765

unsigned long old_fl, new_fl;

4768

4766

4769

do {

4767

do {

4770

vfs_fl = ei->vfs_inode.i_flags;

4768

vfs_fl = ei->vfs_inode.i_flags;

4771

old_fl = ei->i_flags;

4769

old_fl = ei->i_flags;

4772

new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|

4770

new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|

4773

EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|

4771

EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|

4774

EXT4_DIRSYNC_FL);

4772

EXT4_DIRSYNC_FL);

4775

if (vfs_fl & S_SYNC)

4773

if (vfs_fl & S_SYNC)

4776

new_fl |= EXT4_SYNC_FL;

4774

new_fl |= EXT4_SYNC_FL;

4777

if (vfs_fl & S_APPEND)

4775

if (vfs_fl & S_APPEND)

4778

new_fl |= EXT4_APPEND_FL;

4776

new_fl |= EXT4_APPEND_FL;

4779

if (vfs_fl & S_IMMUTABLE)

4777

if (vfs_fl & S_IMMUTABLE)

4780

new_fl |= EXT4_IMMUTABLE_FL;

4778

new_fl |= EXT4_IMMUTABLE_FL;

4781

if (vfs_fl & S_NOATIME)

4779

if (vfs_fl & S_NOATIME)

4782

new_fl |= EXT4_NOATIME_FL;

4780

new_fl |= EXT4_NOATIME_FL;

4783

if (vfs_fl & S_DIRSYNC)

4781

if (vfs_fl & S_DIRSYNC)

4784

new_fl |= EXT4_DIRSYNC_FL;

4782

new_fl |= EXT4_DIRSYNC_FL;

4785

} while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);

4783

} while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);

4786

}

4784

}

4787

4785

4788

static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,

4786

static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,

4789

struct ext4_inode_info *ei)

4787

struct ext4_inode_info *ei)

4790

{

4788

{

4791

blkcnt_t i_blocks ;

4789

blkcnt_t i_blocks ;

4792

struct inode *inode = &(ei->vfs_inode);

4790

struct inode *inode = &(ei->vfs_inode);

4793

struct super_block *sb = inode->i_sb;

4791

struct super_block *sb = inode->i_sb;

4794

4792

4795

if (EXT4_HAS_RO_COMPAT_FEATURE(sb,

4793

if (EXT4_HAS_RO_COMPAT_FEATURE(sb,

4796

EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {

4794

EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {

4797

/* we are using combined 48 bit field */

4795

/* we are using combined 48 bit field */

4798

i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |

4796

i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |

4799

le32_to_cpu(raw_inode->i_blocks_lo);

4797

le32_to_cpu(raw_inode->i_blocks_lo);

4800

if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {

4798

if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {

4801

/* i_blocks represent file system block size */

4799

/* i_blocks represent file system block size */

4802

return i_blocks << (inode->i_blkbits - 9);

4800

return i_blocks << (inode->i_blkbits - 9);

4803

} else {

4801

} else {

4804

return i_blocks;

4802

return i_blocks;

4805

}

4803

}

4806

} else {

4804

} else {

4807

return le32_to_cpu(raw_inode->i_blocks_lo);

4805

return le32_to_cpu(raw_inode->i_blocks_lo);

4808

}

4806

}

4809

}

4807

}

4810

4808

4811

struct inode *ext4_iget(struct super_block *sb, unsigned long ino)

4809

struct inode *ext4_iget(struct super_block *sb, unsigned long ino)

4812

{

4810

{

4813

struct ext4_iloc iloc;

4811

struct ext4_iloc iloc;

4814

struct ext4_inode *raw_inode;

4812

struct ext4_inode *raw_inode;

4815

struct ext4_inode_info *ei;

4813

struct ext4_inode_info *ei;

4816

struct inode *inode;

4814

struct inode *inode;

4817

journal_t *journal = EXT4_SB(sb)->s_journal;

4815

journal_t *journal = EXT4_SB(sb)->s_journal;

4818

long ret;

4816

long ret;

4819

int block;

4817

int block;

4820

4818

4821

inode = iget_locked(sb, ino);

4819

inode = iget_locked(sb, ino);

4822

if (!inode)

4820

if (!inode)

4823

return ERR_PTR(-ENOMEM);

4821

return ERR_PTR(-ENOMEM);

4824

if (!(inode->i_state & I_NEW))

4822

if (!(inode->i_state & I_NEW))

4825

return inode;

4823

return inode;

4826

4824

4827

ei = EXT4_I(inode);

4825

ei = EXT4_I(inode);

4828

iloc.bh = NULL;

4826

iloc.bh = NULL;

4829

4827

4830

ret = __ext4_get_inode_loc(inode, &iloc, 0);

4828

ret = __ext4_get_inode_loc(inode, &iloc, 0);

4831

if (ret < 0)

4829

if (ret < 0)

4832

goto bad_inode;

4830

goto bad_inode;

4833

raw_inode = ext4_raw_inode(&iloc);

4831

raw_inode = ext4_raw_inode(&iloc);

4834

inode->i_mode = le16_to_cpu(raw_inode->i_mode);

4832

inode->i_mode = le16_to_cpu(raw_inode->i_mode);

4835

inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);

4833

inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);

4836

inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);

4834

inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);

4837

if (!(test_opt(inode->i_sb, NO_UID32))) {

4835

if (!(test_opt(inode->i_sb, NO_UID32))) {

4838

inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;

4836

inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;

4839

inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;

4837

inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;

4840

}

4838

}

4841

inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);

4839

inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);

4842

4840

4843

ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */

4841

ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */

4844

ei->i_dir_start_lookup = 0;

4842

ei->i_dir_start_lookup = 0;

4845

ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);

4843

ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);

4846

/* We now have enough fields to check if the inode was active or not.

4844

/* We now have enough fields to check if the inode was active or not.

4847

* This is needed because nfsd might try to access dead inodes

4845

* This is needed because nfsd might try to access dead inodes

4848

* the test is that same one that e2fsck uses

4846

* the test is that same one that e2fsck uses

4849

* NeilBrown 1999oct15

4847

* NeilBrown 1999oct15

4850

*/

4848

*/

4851

if (inode->i_nlink == 0) {

4849

if (inode->i_nlink == 0) {

4852

if (inode->i_mode == 0 ||

4850

if (inode->i_mode == 0 ||

4853

!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {

4851

!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {

4854

/* this inode is deleted */

4852

/* this inode is deleted */

4855

ret = -ESTALE;

4853

ret = -ESTALE;

4856

goto bad_inode;

4854

goto bad_inode;

4857

}

4855

}

4858

/* The only unlinked inodes we let through here have

4856

/* The only unlinked inodes we let through here have

4859

* valid i_mode and are being read by the orphan

4857

* valid i_mode and are being read by the orphan

4860

* recovery code: that's fine, we're about to complete

4858

* recovery code: that's fine, we're about to complete

4861

* the process of deleting those. */

4859

* the process of deleting those. */

4862

}

4860

}

4863

ei->i_flags = le32_to_cpu(raw_inode->i_flags);

4861

ei->i_flags = le32_to_cpu(raw_inode->i_flags);

4864

inode->i_blocks = ext4_inode_blocks(raw_inode, ei);

4862

inode->i_blocks = ext4_inode_blocks(raw_inode, ei);

4865

ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);

4863

ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);

4866

if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))

4864

if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))

4867

ei->i_file_acl |=

4865

ei->i_file_acl |=

4868

((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;

4866

((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;

4869

inode->i_size = ext4_isize(raw_inode);

4867

inode->i_size = ext4_isize(raw_inode);

4870

ei->i_disksize = inode->i_size;

4868

ei->i_disksize = inode->i_size;

4871

#ifdef CONFIG_QUOTA

4869

#ifdef CONFIG_QUOTA

4872

ei->i_reserved_quota = 0;

4870

ei->i_reserved_quota = 0;

4873

#endif

4871

#endif

4874

inode->i_generation = le32_to_cpu(raw_inode->i_generation);

4872

inode->i_generation = le32_to_cpu(raw_inode->i_generation);

4875

ei->i_block_group = iloc.block_group;

4873

ei->i_block_group = iloc.block_group;

4876

ei->i_last_alloc_group = ~0;

4874

ei->i_last_alloc_group = ~0;

4877

/*

4875

/*

4878

* NOTE! The in-memory inode i_data array is in little-endian order

4876

* NOTE! The in-memory inode i_data array is in little-endian order

4879

* even on big-endian machines: we do NOT byteswap the block numbers!

4877

* even on big-endian machines: we do NOT byteswap the block numbers!

4880

*/

4878

*/

4881

for (block = 0; block < EXT4_N_BLOCKS; block++)

4879

for (block = 0; block < EXT4_N_BLOCKS; block++)

4882

ei->i_data[block] = raw_inode->i_block[block];

4880

ei->i_data[block] = raw_inode->i_block[block];

4883

INIT_LIST_HEAD(&ei->i_orphan);

4881

INIT_LIST_HEAD(&ei->i_orphan);

4884

4882

4885

/*

4883

/*

4886

* Set transaction id's of transactions that have to be committed

4884

* Set transaction id's of transactions that have to be committed

4887

* to finish f[data]sync. We set them to currently running transaction

4885

* to finish f[data]sync. We set them to currently running transaction

4888

* as we cannot be sure that the inode or some of its metadata isn't

4886

* as we cannot be sure that the inode or some of its metadata isn't

4889

* part of the transaction - the inode could have been reclaimed and

4887

* part of the transaction - the inode could have been reclaimed and

4890

* now it is reread from disk.

4888

* now it is reread from disk.

4891

*/

4889

*/

4892

if (journal) {

4890

if (journal) {

4893

transaction_t *transaction;

4891

transaction_t *transaction;

4894

tid_t tid;

4892

tid_t tid;

4895

4893

4896

read_lock(&journal->j_state_lock);

4894

read_lock(&journal->j_state_lock);

4897

if (journal->j_running_transaction)

4895

if (journal->j_running_transaction)

4898

transaction = journal->j_running_transaction;

4896

transaction = journal->j_running_transaction;

4899

else

4897

else

4900

transaction = journal->j_committing_transaction;

4898

transaction = journal->j_committing_transaction;

4901

if (transaction)

4899

if (transaction)

4902

tid = transaction->t_tid;

4900

tid = transaction->t_tid;

4903

else

4901

else

4904

tid = journal->j_commit_sequence;

4902

tid = journal->j_commit_sequence;

4905

read_unlock(&journal->j_state_lock);

4903

read_unlock(&journal->j_state_lock);

4906

ei->i_sync_tid = tid;

4904

ei->i_sync_tid = tid;

4907

ei->i_datasync_tid = tid;

4905

ei->i_datasync_tid = tid;

4908

}

4906

}

4909

4907

4910

if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {

4908

if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {

4911

ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);

4909

ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);

4912

if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >

4910

if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >

4913

EXT4_INODE_SIZE(inode->i_sb)) {

4911

EXT4_INODE_SIZE(inode->i_sb)) {

4914

ret = -EIO;

4912

ret = -EIO;

4915

goto bad_inode;

4913

goto bad_inode;

4916

}

4914

}

4917

if (ei->i_extra_isize == 0) {

4915

if (ei->i_extra_isize == 0) {

4918

/* The extra space is currently unused. Use it. */

4916

/* The extra space is currently unused. Use it. */

4919

ei->i_extra_isize = sizeof(struct ext4_inode) -

4917

ei->i_extra_isize = sizeof(struct ext4_inode) -

4920

EXT4_GOOD_OLD_INODE_SIZE;

4918

EXT4_GOOD_OLD_INODE_SIZE;

4921

} else {

4919

} else {

4922

__le32 *magic = (void *)raw_inode +

4920

__le32 *magic = (void *)raw_inode +

4923

EXT4_GOOD_OLD_INODE_SIZE +

4921

EXT4_GOOD_OLD_INODE_SIZE +

4924

ei->i_extra_isize;

4922

ei->i_extra_isize;

4925

if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))

4923

if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))

4926

ext4_set_inode_state(inode, EXT4_STATE_XATTR);

4924

ext4_set_inode_state(inode, EXT4_STATE_XATTR);

4927

}

4925

}

4928

} else

4926

} else

4929

ei->i_extra_isize = 0;

4927

ei->i_extra_isize = 0;

4930

4928

4931

EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);

4929

EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);

4932

EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);

4930

EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);

4933

EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);

4931

EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);

4934

EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);

4932

EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);

4935

4933

4936

inode->i_version = le32_to_cpu(raw_inode->i_disk_version);

4934

inode->i_version = le32_to_cpu(raw_inode->i_disk_version);

4937

if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {

4935

if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {

4938

if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))

4936

if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))

4939

inode->i_version |=

4937

inode->i_version |=

4940

(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;

4938

(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;

4941

}

4939

}

4942

4940

4943

ret = 0;

4941

ret = 0;

4944

if (ei->i_file_acl &&

4942

if (ei->i_file_acl &&

4945

!ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {

4943

!ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {

4946

EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",

4944

EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",

4947

ei->i_file_acl);

4945

ei->i_file_acl);

4948

ret = -EIO;

4946

ret = -EIO;

4949

goto bad_inode;

4947

goto bad_inode;

4950

} else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {

4948

} else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {

4951

if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||

4949

if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||

4952

(S_ISLNK(inode->i_mode) &&

4950

(S_ISLNK(inode->i_mode) &&

4953

!ext4_inode_is_fast_symlink(inode)))

4951

!ext4_inode_is_fast_symlink(inode)))

4954

/* Validate extent which is part of inode */

4952

/* Validate extent which is part of inode */

4955

ret = ext4_ext_check_inode(inode);

4953

ret = ext4_ext_check_inode(inode);

4956

} else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||

4954

} else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||

4957

(S_ISLNK(inode->i_mode) &&

4955

(S_ISLNK(inode->i_mode) &&

4958

!ext4_inode_is_fast_symlink(inode))) {

4956

!ext4_inode_is_fast_symlink(inode))) {

4959

/* Validate block references which are part of inode */

4957

/* Validate block references which are part of inode */

4960

ret = ext4_check_inode_blockref(inode);

4958

ret = ext4_check_inode_blockref(inode);

4961

}

4959

}

4962

if (ret)

4960

if (ret)

4963

goto bad_inode;

4961

goto bad_inode;

4964

4962

4965

if (S_ISREG(inode->i_mode)) {

4963

if (S_ISREG(inode->i_mode)) {

4966

inode->i_op = &ext4_file_inode_operations;

4964

inode->i_op = &ext4_file_inode_operations;

4967

inode->i_fop = &ext4_file_operations;

4965

inode->i_fop = &ext4_file_operations;

4968

ext4_set_aops(inode);

4966

ext4_set_aops(inode);

4969

} else if (S_ISDIR(inode->i_mode)) {

4967

} else if (S_ISDIR(inode->i_mode)) {

4970

inode->i_op = &ext4_dir_inode_operations;

4968

inode->i_op = &ext4_dir_inode_operations;

4971

inode->i_fop = &ext4_dir_operations;

4969

inode->i_fop = &ext4_dir_operations;

4972

} else if (S_ISLNK(inode->i_mode)) {

4970

} else if (S_ISLNK(inode->i_mode)) {

4973

if (ext4_inode_is_fast_symlink(inode)) {

4971

if (ext4_inode_is_fast_symlink(inode)) {

4974

inode->i_op = &ext4_fast_symlink_inode_operations;

4972

inode->i_op = &ext4_fast_symlink_inode_operations;

4975

nd_terminate_link(ei->i_data, inode->i_size,

4973

nd_terminate_link(ei->i_data, inode->i_size,

4976

sizeof(ei->i_data) - 1);

4974

sizeof(ei->i_data) - 1);

4977

} else {

4975

} else {

4978

inode->i_op = &ext4_symlink_inode_operations;

4976

inode->i_op = &ext4_symlink_inode_operations;

4979

ext4_set_aops(inode);

4977

ext4_set_aops(inode);

4980

}

4978

}

4981

} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||

4979

} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||

4982

S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {

4980

S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {

4983

inode->i_op = &ext4_special_inode_operations;

4981

inode->i_op = &ext4_special_inode_operations;

4984

if (raw_inode->i_block[0])

4982

if (raw_inode->i_block[0])

4985

init_special_inode(inode, inode->i_mode,

4983

init_special_inode(inode, inode->i_mode,

4986

old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));

4984

old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));

4987

else

4985

else

4988

init_special_inode(inode, inode->i_mode,

4986

init_special_inode(inode, inode->i_mode,

4989

new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));

4987

new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));

4990

} else {

4988

} else {

4991

ret = -EIO;

4989

ret = -EIO;

4992

EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);

4990

EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);

4993

goto bad_inode;

4991

goto bad_inode;

4994

}

4992

}

4995

brelse(iloc.bh);

4993

brelse(iloc.bh);

4996

ext4_set_inode_flags(inode);

4994

ext4_set_inode_flags(inode);

4997

unlock_new_inode(inode);

4995

unlock_new_inode(inode);

4998

return inode;

4996

return inode;

4999

4997

5000

bad_inode:

4998

bad_inode:

5001

brelse(iloc.bh);

4999

brelse(iloc.bh);

5002

iget_failed(inode);

5000

iget_failed(inode);

5003

return ERR_PTR(ret);

5001

return ERR_PTR(ret);

5004

}

5002

}

5005

5003

5006

/*
 * Encode inode->i_blocks into the on-disk inode @raw_inode.
 *
 * Three encodings, chosen by magnitude:
 *   - fits in 32 bits: stored in i_blocks_lo, i_blocks_high zeroed;
 *   - fits in 48 bits: split across i_blocks_lo / i_blocks_high;
 *   - larger: shifted right by (i_blkbits - 9) before being split, and
 *     EXT4_INODE_HUGE_FILE is set on the inode.
 * The first two cases clear EXT4_INODE_HUGE_FILE.
 *
 * Returns 0 on success, or -EFBIG when the count needs more than 32 bits
 * but the filesystem lacks the HUGE_FILE ro-compat feature.  @handle is
 * not used by this function.
 */
static int ext4_inode_blocks_set(handle_t *handle,
				struct ext4_inode *raw_inode,
				struct ext4_inode_info *ei)
{
	struct inode *vfs_inode = &(ei->vfs_inode);
	struct super_block *sb = vfs_inode->i_sb;
	u64 count = vfs_inode->i_blocks;

	if (count <= ~0U) {
		/*
		 * i_blocks can be represented in a 32 bit variable
		 * as multiple of 512 bytes
		 */
		raw_inode->i_blocks_lo   = cpu_to_le32(count);
		raw_inode->i_blocks_high = 0;
		ext4_clear_inode_flag(vfs_inode, EXT4_INODE_HUGE_FILE);
		return 0;
	}

	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
		return -EFBIG;

	if (count > 0xffffffffffffULL) {
		ext4_set_inode_flag(vfs_inode, EXT4_INODE_HUGE_FILE);
		/* i_block is stored in file system block size */
		count >>= (vfs_inode->i_blkbits - 9);
		raw_inode->i_blocks_lo   = cpu_to_le32(count);
		raw_inode->i_blocks_high = cpu_to_le16(count >> 32);
	} else {
		/*
		 * i_blocks can be represented in a 48 bit variable
		 * as multiple of 512 bytes
		 */
		raw_inode->i_blocks_lo   = cpu_to_le32(count);
		raw_inode->i_blocks_high = cpu_to_le16(count >> 32);
		ext4_clear_inode_flag(vfs_inode, EXT4_INODE_HUGE_FILE);
	}
	return 0;
}

5044

5042

5045

/*

5043

/*

5046

* Post the struct inode info into an on-disk inode location in the

5044

* Post the struct inode info into an on-disk inode location in the

5047

* buffer-cache. This gobbles the caller's reference to the

5045

* buffer-cache. This gobbles the caller's reference to the

5048

* buffer_head in the inode location struct.

5046

* buffer_head in the inode location struct.

5049

*

5047

*

5050

* The caller must have write access to iloc->bh.

5048

* The caller must have write access to iloc->bh.

5051

*/

5049

*/

5052

static int ext4_do_update_inode(handle_t *handle,

5050

static int ext4_do_update_inode(handle_t *handle,

5053

struct inode *inode,

5051

struct inode *inode,

5054

struct ext4_iloc *iloc)

5052

struct ext4_iloc *iloc)

5055

{

5053

{

5056

struct ext4_inode *raw_inode = ext4_raw_inode(iloc);

5054

struct ext4_inode *raw_inode = ext4_raw_inode(iloc);

5057

struct ext4_inode_info *ei = EXT4_I(inode);

5055

struct ext4_inode_info *ei = EXT4_I(inode);

5058

struct buffer_head *bh = iloc->bh;

5056

struct buffer_head *bh = iloc->bh;

5059

int err = 0, rc, block;

5057

int err = 0, rc, block;

5060

5058

5061

/* For fields not not tracking in the in-memory inode,

5059

/* For fields not not tracking in the in-memory inode,

5062

* initialise them to zero for new inodes. */

5060

* initialise them to zero for new inodes. */

5063

if (ext4_test_inode_state(inode, EXT4_STATE_NEW))

5061

if (ext4_test_inode_state(inode, EXT4_STATE_NEW))

5064

memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);

5062

memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);

5065

5063

5066

ext4_get_inode_flags(ei);

5064

ext4_get_inode_flags(ei);

5067

raw_inode->i_mode = cpu_to_le16(inode->i_mode);

5065

raw_inode->i_mode = cpu_to_le16(inode->i_mode);

5068

if (!(test_opt(inode->i_sb, NO_UID32))) {

5066

if (!(test_opt(inode->i_sb, NO_UID32))) {

5069

raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));

5067

raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));

5070

raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));

5068

raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));

5071

/*

5069

/*

5072

* Fix up interoperability with old kernels. Otherwise, old inodes get

5070

* Fix up interoperability with old kernels. Otherwise, old inodes get

5073

* re-used with the upper 16 bits of the uid/gid intact

5071

* re-used with the upper 16 bits of the uid/gid intact

5074

*/

5072

*/

5075

if (!ei->i_dtime) {

5073

if (!ei->i_dtime) {

5076

raw_inode->i_uid_high =

5074

raw_inode->i_uid_high =

5077

cpu_to_le16(high_16_bits(inode->i_uid));

5075

cpu_to_le16(high_16_bits(inode->i_uid));

5078

raw_inode->i_gid_high =

5076

raw_inode->i_gid_high =

5079

cpu_to_le16(high_16_bits(inode->i_gid));

5077

cpu_to_le16(high_16_bits(inode->i_gid));

5080

} else {

5078

} else {

5081

raw_inode->i_uid_high = 0;

5079

raw_inode->i_uid_high = 0;

5082

raw_inode->i_gid_high = 0;

5080

raw_inode->i_gid_high = 0;

5083

}

5081

}

5084

} else {

5082

} else {

5085

raw_inode->i_uid_low =

5083

raw_inode->i_uid_low =

5086

cpu_to_le16(fs_high2lowuid(inode->i_uid));

5084

cpu_to_le16(fs_high2lowuid(inode->i_uid));

5087

raw_inode->i_gid_low =

5085

raw_inode->i_gid_low =

5088

cpu_to_le16(fs_high2lowgid(inode->i_gid));

5086

cpu_to_le16(fs_high2lowgid(inode->i_gid));

5089

raw_inode->i_uid_high = 0;

5087

raw_inode->i_uid_high = 0;

5090

raw_inode->i_gid_high = 0;

5088

raw_inode->i_gid_high = 0;

5091

}

5089

}

5092

raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);

5090

raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);

5093

5091

5094

EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);

5092

EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);

5095

EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);

5093

EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);

5096

EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);

5094

EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);

5097

EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);

5095

EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);

5098

5096

5099

if (ext4_inode_blocks_set(handle, raw_inode, ei))

5097

if (ext4_inode_blocks_set(handle, raw_inode, ei))

5100

goto out_brelse;

5098

goto out_brelse;

5101

raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);

5099

raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);

5102

raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);

5100

raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);

5103

if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=

5101

if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=

5104

cpu_to_le32(EXT4_OS_HURD))

5102

cpu_to_le32(EXT4_OS_HURD))

5105

raw_inode->i_file_acl_high =

5103

raw_inode->i_file_acl_high =

5106

cpu_to_le16(ei->i_file_acl >> 32);

5104

cpu_to_le16(ei->i_file_acl >> 32);

5107

raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);

5105

raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);

5108

ext4_isize_set(raw_inode, ei->i_disksize);

5106

ext4_isize_set(raw_inode, ei->i_disksize);

5109

if (ei->i_disksize > 0x7fffffffULL) {

5107

if (ei->i_disksize > 0x7fffffffULL) {

5110

struct super_block *sb = inode->i_sb;

5108

struct super_block *sb = inode->i_sb;

5111

if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,

5109

if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,

5112

EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||

5110

EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||

5113

EXT4_SB(sb)->s_es->s_rev_level ==

5111

EXT4_SB(sb)->s_es->s_rev_level ==

5114

cpu_to_le32(EXT4_GOOD_OLD_REV)) {

5112

cpu_to_le32(EXT4_GOOD_OLD_REV)) {

5115

/* If this is the first large file

5113

/* If this is the first large file

5116

* created, add a flag to the superblock.

5114

* created, add a flag to the superblock.

5117

*/

5115

*/

5118

err = ext4_journal_get_write_access(handle,

5116

err = ext4_journal_get_write_access(handle,

5119

EXT4_SB(sb)->s_sbh);

5117

EXT4_SB(sb)->s_sbh);

5120

if (err)

5118

if (err)

5121

goto out_brelse;

5119

goto out_brelse;

5122

ext4_update_dynamic_rev(sb);

5120

ext4_update_dynamic_rev(sb);

5123

EXT4_SET_RO_COMPAT_FEATURE(sb,

5121

EXT4_SET_RO_COMPAT_FEATURE(sb,

5124

EXT4_FEATURE_RO_COMPAT_LARGE_FILE);

5122

EXT4_FEATURE_RO_COMPAT_LARGE_FILE);

5125

sb->s_dirt = 1;

5123

sb->s_dirt = 1;

5126

ext4_handle_sync(handle);

5124

ext4_handle_sync(handle);

5127

err = ext4_handle_dirty_metadata(handle, NULL,

5125

err = ext4_handle_dirty_metadata(handle, NULL,

5128

EXT4_SB(sb)->s_sbh);

5126

EXT4_SB(sb)->s_sbh);

5129

}

5127

}

5130

}

5128

}

5131

raw_inode->i_generation = cpu_to_le32(inode->i_generation);

5129

raw_inode->i_generation = cpu_to_le32(inode->i_generation);

5132

if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {

5130

if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {

5133

if (old_valid_dev(inode->i_rdev)) {

5131

if (old_valid_dev(inode->i_rdev)) {

5134

raw_inode->i_block[0] =

5132

raw_inode->i_block[0] =

5135

cpu_to_le32(old_encode_dev(inode->i_rdev));

5133

cpu_to_le32(old_encode_dev(inode->i_rdev));

5136

raw_inode->i_block[1] = 0;

5134

raw_inode->i_block[1] = 0;

5137

} else {

5135

} else {

5138

raw_inode->i_block[0] = 0;

5136

raw_inode->i_block[0] = 0;

5139

raw_inode->i_block[1] =

5137

raw_inode->i_block[1] =

5140

cpu_to_le32(new_encode_dev(inode->i_rdev));

5138

cpu_to_le32(new_encode_dev(inode->i_rdev));

5141

raw_inode->i_block[2] = 0;

5139

raw_inode->i_block[2] = 0;

5142

}

5140

}

5143

} else

5141

} else

5144

for (block = 0; block < EXT4_N_BLOCKS; block++)

5142

for (block = 0; block < EXT4_N_BLOCKS; block++)

5145

raw_inode->i_block[block] = ei->i_data[block];

5143

raw_inode->i_block[block] = ei->i_data[block];

5146

5144

5147

raw_inode->i_disk_version = cpu_to_le32(inode->i_version);

5145

raw_inode->i_disk_version = cpu_to_le32(inode->i_version);

5148

if (ei->i_extra_isize) {

5146

if (ei->i_extra_isize) {

5149

if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))

5147

if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))

5150

raw_inode->i_version_hi =

5148

raw_inode->i_version_hi =

5151

cpu_to_le32(inode->i_version >> 32);

5149

cpu_to_le32(inode->i_version >> 32);

5152

raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);

5150

raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);

5153

}

5151

}

5154

5152

5155

BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");

5153

BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");

5156

rc = ext4_handle_dirty_metadata(handle, NULL, bh);

5154

rc = ext4_handle_dirty_metadata(handle, NULL, bh);

5157

if (!err)

5155

if (!err)

5158

err = rc;

5156

err = rc;

5159

ext4_clear_inode_state(inode, EXT4_STATE_NEW);

5157

ext4_clear_inode_state(inode, EXT4_STATE_NEW);

5160

5158

5161

ext4_update_inode_fsync_trans(handle, inode, 0);

5159

ext4_update_inode_fsync_trans(handle, inode, 0);

5162

out_brelse:

5160

out_brelse:

5163

brelse(bh);

5161

brelse(bh);

5164

ext4_std_error(inode->i_sb, err);

5162

ext4_std_error(inode->i_sb, err);

5165

return err;

5163

return err;

5166

}

5164

}

5167

5165

5168

/*

5166

/*

5169

* ext4_write_inode()

5167

* ext4_write_inode()

5170

*

5168

*

5171

* We are called from a few places:

5169

* We are called from a few places:

5172

*

5170

*

5173

* - Within generic_file_write() for O_SYNC files.

5171

* - Within generic_file_write() for O_SYNC files.

5174

* Here, there will be no transaction running. We wait for any running

5172

* Here, there will be no transaction running. We wait for any running

5175

* trasnaction to commit.

5173

* trasnaction to commit.

5176

*

5174

*

5177

* - Within sys_sync(), kupdate and such.

5175

* - Within sys_sync(), kupdate and such.

5178

* We wait on commit, if tol to.

5176

* We wait on commit, if tol to.

5179

*

5177

*

5180

* - Within prune_icache() (PF_MEMALLOC == true)

5178

* - Within prune_icache() (PF_MEMALLOC == true)

5181

* Here we simply return. We can't afford to block kswapd on the

5179

* Here we simply return. We can't afford to block kswapd on the

5182

* journal commit.

5180

* journal commit.

5183

*

5181

*

5184

* In all cases it is actually safe for us to return without doing anything,

5182

* In all cases it is actually safe for us to return without doing anything,

5185

* because the inode has been copied into a raw inode buffer in

5183

* because the inode has been copied into a raw inode buffer in

5186

* ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for

5184

* ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for

5187

* knfsd.

5185

* knfsd.

5188

*

5186

*

5189

* Note that we are absolutely dependent upon all inode dirtiers doing the

5187

* Note that we are absolutely dependent upon all inode dirtiers doing the

5190

* right thing: they *must* call mark_inode_dirty() after dirtying info in

5188

* right thing: they *must* call mark_inode_dirty() after dirtying info in

5191

* which we are interested.

5189

* which we are interested.

5192

*

5190

*

5193

* It would be a bug for them to not do this. The code:

5191

* It would be a bug for them to not do this. The code:

5194

*

5192

*

5195

* mark_inode_dirty(inode)

5193

* mark_inode_dirty(inode)

5196

* stuff();

5194

* stuff();

5197

* inode->i_size = expr;

5195

* inode->i_size = expr;

5198

*

5196

*

5199

* is in error because a kswapd-driven write_inode() could occur while

5197

* is in error because a kswapd-driven write_inode() could occur while

5200

* `stuff()' is running, and the new i_size will be lost. Plus the inode

5198

* `stuff()' is running, and the new i_size will be lost. Plus the inode

5201

* will no longer be on the superblock's dirty inode list.

5199

* will no longer be on the superblock's dirty inode list.

5202

*/

5200

*/

5203

int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)

5201

int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)

5204

{

5202

{

5205

int err;

5203

int err;

5206

5204

5207

if (current->flags & PF_MEMALLOC)

5205

if (current->flags & PF_MEMALLOC)

5208

return 0;

5206

return 0;

5209

5207

5210

if (EXT4_SB(inode->i_sb)->s_journal) {

5208

if (EXT4_SB(inode->i_sb)->s_journal) {

5211

if (ext4_journal_current_handle()) {

5209

if (ext4_journal_current_handle()) {

5212

jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");

5210

jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");

5213

dump_stack();

5211

dump_stack();

5214

return -EIO;

5212

return -EIO;

5215

}

5213

}

5216

5214

5217

if (wbc->sync_mode != WB_SYNC_ALL)

5215

if (wbc->sync_mode != WB_SYNC_ALL)

5218

return 0;

5216

return 0;

5219

5217

5220

err = ext4_force_commit(inode->i_sb);

5218

err = ext4_force_commit(inode->i_sb);

5221

} else {

5219

} else {

5222

struct ext4_iloc iloc;

5220

struct ext4_iloc iloc;

5223

5221

5224

err = __ext4_get_inode_loc(inode, &iloc, 0);

5222

err = __ext4_get_inode_loc(inode, &iloc, 0);

5225

if (err)

5223

if (err)

5226

return err;

5224

return err;

5227

if (wbc->sync_mode == WB_SYNC_ALL)

5225

if (wbc->sync_mode == WB_SYNC_ALL)

5228

sync_dirty_buffer(iloc.bh);

5226

sync_dirty_buffer(iloc.bh);

5229

if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {

5227

if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {

5230

EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,

5228

EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,

5231

"IO error syncing inode");

5229

"IO error syncing inode");

5232

err = -EIO;

5230

err = -EIO;

5233

}

5231

}

5234

brelse(iloc.bh);

5232

brelse(iloc.bh);

5235

}

5233

}

5236

return err;

5234

return err;

5237

}

5235

}

5238

5236

5239

/*

5237

/*

5240

* ext4_setattr()

5238

* ext4_setattr()

5241

*

5239

*

5242

* Called from notify_change.

5240

* Called from notify_change.

5243

*

5241

*

5244

* We want to trap VFS attempts to truncate the file as soon as

5242

* We want to trap VFS attempts to truncate the file as soon as

5245

* possible. In particular, we want to make sure that when the VFS

5243

* possible. In particular, we want to make sure that when the VFS

5246

* shrinks i_size, we put the inode on the orphan list and modify

5244

* shrinks i_size, we put the inode on the orphan list and modify

5247

* i_disksize immediately, so that during the subsequent flushing of

5245

* i_disksize immediately, so that during the subsequent flushing of

5248

* dirty pages and freeing of disk blocks, we can guarantee that any

5246

* dirty pages and freeing of disk blocks, we can guarantee that any

5249

* commit will leave the blocks being flushed in an unused state on

5247

* commit will leave the blocks being flushed in an unused state on

5250

* disk. (On recovery, the inode will get truncated and the blocks will

5248

* disk. (On recovery, the inode will get truncated and the blocks will

5251

* be freed, so we have a strong guarantee that no future commit will

5249

* be freed, so we have a strong guarantee that no future commit will

5252

* leave these blocks visible to the user.)

5250

* leave these blocks visible to the user.)

5253

*

5251

*

5254

* Another thing we have to assure is that if we are in ordered mode

5252

* Another thing we have to assure is that if we are in ordered mode

5255

* and inode is still attached to the committing transaction, we must

5253

* and inode is still attached to the committing transaction, we must

5256

* we start writeout of all the dirty pages which are being truncated.

5254

* we start writeout of all the dirty pages which are being truncated.

5257

* This way we are sure that all the data written in the previous

5255

* This way we are sure that all the data written in the previous

5258

* transaction are already on disk (truncate waits for pages under

5256

* transaction are already on disk (truncate waits for pages under

5259

* writeback).

5257

* writeback).

5260

*

5258

*

5261

* Called with inode->i_mutex down.

5259

* Called with inode->i_mutex down.

5262

*/

5260

*/

5263

int ext4_setattr(struct dentry *dentry, struct iattr *attr)

5261

int ext4_setattr(struct dentry *dentry, struct iattr *attr)

5264

{

5262

{

5265

struct inode *inode = dentry->d_inode;

5263

struct inode *inode = dentry->d_inode;

5266

int error, rc = 0;

5264

int error, rc = 0;

5267

int orphan = 0;

5265

int orphan = 0;

5268

const unsigned int ia_valid = attr->ia_valid;

5266

const unsigned int ia_valid = attr->ia_valid;

5269

5267

5270

error = inode_change_ok(inode, attr);

5268

error = inode_change_ok(inode, attr);

5271

if (error)

5269

if (error)

5272

return error;

5270

return error;

5273

5271

5274

if (is_quota_modification(inode, attr))

5272

if (is_quota_modification(inode, attr))

5275

dquot_initialize(inode);

5273

dquot_initialize(inode);

5276

if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||

5274

if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||

5277

(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {

5275

(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {

5278

handle_t *handle;

5276

handle_t *handle;

5279

5277

5280

/* (user+group)*(old+new) structure, inode write (sb,

5278

/* (user+group)*(old+new) structure, inode write (sb,

5281

* inode block, ? - but truncate inode update has it) */

5279

* inode block, ? - but truncate inode update has it) */

5282

handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+

5280

handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+

5283

EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);

5281

EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);

5284

if (IS_ERR(handle)) {

5282

if (IS_ERR(handle)) {

5285

error = PTR_ERR(handle);

5283

error = PTR_ERR(handle);

5286

goto err_out;

5284

goto err_out;

5287

}

5285

}

5288

error = dquot_transfer(inode, attr);

5286

error = dquot_transfer(inode, attr);

5289

if (error) {

5287

if (error) {

5290

ext4_journal_stop(handle);

5288

ext4_journal_stop(handle);

5291

return error;

5289

return error;

5292

}

5290

}

5293

/* Update corresponding info in inode so that everything is in

5291

/* Update corresponding info in inode so that everything is in

5294

* one transaction */

5292

* one transaction */

5295

if (attr->ia_valid & ATTR_UID)

5293

if (attr->ia_valid & ATTR_UID)

5296

inode->i_uid = attr->ia_uid;

5294

inode->i_uid = attr->ia_uid;

5297

if (attr->ia_valid & ATTR_GID)

5295

if (attr->ia_valid & ATTR_GID)

5298

inode->i_gid = attr->ia_gid;

5296

inode->i_gid = attr->ia_gid;

5299

error = ext4_mark_inode_dirty(handle, inode);

5297

error = ext4_mark_inode_dirty(handle, inode);

5300

ext4_journal_stop(handle);

5298

ext4_journal_stop(handle);

5301

}

5299

}

5302

5300

5303

if (attr->ia_valid & ATTR_SIZE) {

5301

if (attr->ia_valid & ATTR_SIZE) {

5304

if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {

5302

if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {

5305

struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

5303

struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

5306

5304

5307

if (attr->ia_size > sbi->s_bitmap_maxbytes)

5305

if (attr->ia_size > sbi->s_bitmap_maxbytes)

5308

return -EFBIG;

5306

return -EFBIG;

5309

}

5307

}

5310

}

5308

}

5311

5309

5312

if (S_ISREG(inode->i_mode) &&

5310

if (S_ISREG(inode->i_mode) &&

5313

attr->ia_valid & ATTR_SIZE &&

5311

attr->ia_valid & ATTR_SIZE &&

5314

(attr->ia_size < inode->i_size ||

5312

(attr->ia_size < inode->i_size ||

5315

(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {

5313

(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {

5316

handle_t *handle;

5314

handle_t *handle;

5317

5315

5318

handle = ext4_journal_start(inode, 3);

5316

handle = ext4_journal_start(inode, 3);

5319

if (IS_ERR(handle)) {

5317

if (IS_ERR(handle)) {

5320

error = PTR_ERR(handle);

5318

error = PTR_ERR(handle);

5321

goto err_out;

5319

goto err_out;

5322

}

5320

}

5323

if (ext4_handle_valid(handle)) {

5321

if (ext4_handle_valid(handle)) {

5324

error = ext4_orphan_add(handle, inode);

5322

error = ext4_orphan_add(handle, inode);

5325

orphan = 1;

5323

orphan = 1;

5326

}

5324

}

5327

EXT4_I(inode)->i_disksize = attr->ia_size;

5325

EXT4_I(inode)->i_disksize = attr->ia_size;

5328

rc = ext4_mark_inode_dirty(handle, inode);

5326

rc = ext4_mark_inode_dirty(handle, inode);

5329

if (!error)

5327

if (!error)

5330

error = rc;

5328

error = rc;

5331

ext4_journal_stop(handle);

5329

ext4_journal_stop(handle);

5332

5330

5333

if (ext4_should_order_data(inode)) {

5331

if (ext4_should_order_data(inode)) {

5334

error = ext4_begin_ordered_truncate(inode,

5332

error = ext4_begin_ordered_truncate(inode,

5335

attr->ia_size);

5333

attr->ia_size);

5336

if (error) {

5334

if (error) {

5337

/* Do as much error cleanup as possible */

5335

/* Do as much error cleanup as possible */

5338

handle = ext4_journal_start(inode, 3);

5336

handle = ext4_journal_start(inode, 3);

5339

if (IS_ERR(handle)) {

5337

if (IS_ERR(handle)) {

5340

ext4_orphan_del(NULL, inode);

5338

ext4_orphan_del(NULL, inode);

5341

goto err_out;

5339

goto err_out;

5342

}

5340

}

5343

ext4_orphan_del(handle, inode);

5341

ext4_orphan_del(handle, inode);

5344

orphan = 0;

5342

orphan = 0;

5345

ext4_journal_stop(handle);

5343

ext4_journal_stop(handle);

5346

goto err_out;

5344

goto err_out;

5347

}

5345

}

5348

}

5346

}

5349

/* ext4_truncate will clear the flag */

5347

/* ext4_truncate will clear the flag */

5350

if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))

5348

if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))

5351

ext4_truncate(inode);

5349

ext4_truncate(inode);

5352

}

5350

}

5353

5351

5354

if ((attr->ia_valid & ATTR_SIZE) &&

5352

if ((attr->ia_valid & ATTR_SIZE) &&

5355

attr->ia_size != i_size_read(inode))

5353

attr->ia_size != i_size_read(inode))

5356

rc = vmtruncate(inode, attr->ia_size);

5354

rc = vmtruncate(inode, attr->ia_size);

5357

5355

5358

if (!rc) {

5356

if (!rc) {

5359

setattr_copy(inode, attr);

5357

setattr_copy(inode, attr);

5360

mark_inode_dirty(inode);

5358

mark_inode_dirty(inode);

5361

}

5359

}

5362

5360

5363

/*

5361

/*

5364

* If the call to ext4_truncate failed to get a transaction handle at

5362

* If the call to ext4_truncate failed to get a transaction handle at

5365

* all, we need to clean up the in-core orphan list manually.

5363

* all, we need to clean up the in-core orphan list manually.

5366

*/

5364

*/

5367

if (orphan && inode->i_nlink)

5365

if (orphan && inode->i_nlink)

5368

ext4_orphan_del(NULL, inode);

5366

ext4_orphan_del(NULL, inode);

5369

5367

5370

if (!rc && (ia_valid & ATTR_MODE))

5368

if (!rc && (ia_valid & ATTR_MODE))

5371

rc = ext4_acl_chmod(inode);

5369

rc = ext4_acl_chmod(inode);

5372

5370

5373

err_out:

5371

err_out:

5374

ext4_std_error(inode->i_sb, error);

5372

ext4_std_error(inode->i_sb, error);

5375

if (!error)

5373

if (!error)

5376

error = rc;

5374

error = rc;

5377

return error;

5375

return error;

5378

}

5376

}

5379

5377

5380

int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,

5378

int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,

5381

struct kstat *stat)

5379

struct kstat *stat)

5382

{

5380

{

5383

struct inode *inode;

5381

struct inode *inode;

5384

unsigned long delalloc_blocks;

5382

unsigned long delalloc_blocks;

5385

5383

5386

inode = dentry->d_inode;

5384

inode = dentry->d_inode;

5387

generic_fillattr(inode, stat);

5385

generic_fillattr(inode, stat);

5388

5386

5389

/*

5387

/*

5390

* We can't update i_blocks if the block allocation is delayed

5388

* We can't update i_blocks if the block allocation is delayed

5391

* otherwise in the case of system crash before the real block

5389

* otherwise in the case of system crash before the real block

5392

* allocation is done, we will have i_blocks inconsistent with

5390

* allocation is done, we will have i_blocks inconsistent with

5393

* on-disk file blocks.

5391

* on-disk file blocks.

5394

* We always keep i_blocks updated together with real

5392

* We always keep i_blocks updated together with real

5395

* allocation. But to not confuse with user, stat

5393

* allocation. But to not confuse with user, stat

5396

* will return the blocks that include the delayed allocation

5394

* will return the blocks that include the delayed allocation

5397

* blocks for this file.

5395

* blocks for this file.

5398

*/

5396

*/

5399

delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;

5397

delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;

5400

5398

5401

stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;

5399

stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;

5402

return 0;

5400

return 0;

5403

}

5401

}

5404

5402

5405

static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,

5403

static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,

5406

int chunk)

5404

int chunk)

5407

{

5405

{

5408

int indirects;

5406

int indirects;

5409

5407

5410

/* if nrblocks are contiguous */

5408

/* if nrblocks are contiguous */

5411

if (chunk) {

5409

if (chunk) {

5412

/*

5410

/*

5413

* With N contiguous data blocks, we need at most

5411

* With N contiguous data blocks, we need at most

5414

* N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,

5412

* N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,

5415

* 2 dindirect blocks, and 1 tindirect block

5413

* 2 dindirect blocks, and 1 tindirect block

5416

*/

5414

*/

5417

return DIV_ROUND_UP(nrblocks,

5415

return DIV_ROUND_UP(nrblocks,

5418

EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;

5416

EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;

5419

}

5417

}

5420

/*

5418

/*

5421

* if nrblocks are not contiguous, worse case, each block touch

5419

* if nrblocks are not contiguous, worse case, each block touch

5422

* a indirect block, and each indirect block touch a double indirect

5420

* a indirect block, and each indirect block touch a double indirect

5423

* block, plus a triple indirect block

5421

* block, plus a triple indirect block

5424

*/

5422

*/

5425

indirects = nrblocks * 2 + 1;

5423

indirects = nrblocks * 2 + 1;

5426

return indirects;

5424

return indirects;

5427

}

5425

}

5428

5426

5429

static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)

5427

static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)

5430

{

5428

{

5431

if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))

5429

if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))

5432

return ext4_indirect_trans_blocks(inode, nrblocks, chunk);

5430

return ext4_indirect_trans_blocks(inode, nrblocks, chunk);

5433

return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);

5431

return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);

5434

}

5432

}

5435

5433

5436

/*

5434

/*

5437

* Account for index blocks, block groups bitmaps and block group

5435

* Account for index blocks, block groups bitmaps and block group

5438

* descriptor blocks if modify datablocks and index blocks

5436

* descriptor blocks if modify datablocks and index blocks

5439

* worse case, the indexs blocks spread over different block groups

5437

* worse case, the indexs blocks spread over different block groups

5440

*

5438

*

5441

* If datablocks are discontiguous, they are possible to spread over

5439

* If datablocks are discontiguous, they are possible to spread over

5442

* different block groups too. If they are contiuguous, with flexbg,

5440

* different block groups too. If they are contiuguous, with flexbg,

5443

* they could still across block group boundary.

5441

* they could still across block group boundary.

5444

*

5442

*

5445

* Also account for superblock, inode, quota and xattr blocks

5443

* Also account for superblock, inode, quota and xattr blocks

5446

*/

5444

*/

5447

static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)

5445

static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)

5448

{

5446

{

5449

ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);

5447

ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);

5450

int gdpblocks;

5448

int gdpblocks;

5451

int idxblocks;

5449

int idxblocks;

5452

int ret = 0;

5450

int ret = 0;

5453

5451

5454

/*

5452

/*

5455

* How many index blocks need to touch to modify nrblocks?

5453

* How many index blocks need to touch to modify nrblocks?

5456

* The "Chunk" flag indicating whether the nrblocks is

5454

* The "Chunk" flag indicating whether the nrblocks is

5457

* physically contiguous on disk

5455

* physically contiguous on disk

5458

*

5456

*

5459

* For Direct IO and fallocate, they calls get_block to allocate

5457

* For Direct IO and fallocate, they calls get_block to allocate

5460

* one single extent at a time, so they could set the "Chunk" flag

5458

* one single extent at a time, so they could set the "Chunk" flag

5461

*/

5459

*/

5462

idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);

5460

idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);

5463

5461

5464

ret = idxblocks;

5462

ret = idxblocks;

5465

5463

5466

/*

5464

/*

5467

* Now let's see how many group bitmaps and group descriptors need

5465

* Now let's see how many group bitmaps and group descriptors need

5468

* to account

5466

* to account

5469

*/

5467

*/

5470

groups = idxblocks;

5468

groups = idxblocks;

5471

if (chunk)

5469

if (chunk)

5472

groups += 1;

5470

groups += 1;

5473

else

5471

else

5474

groups += nrblocks;

5472

groups += nrblocks;

5475

5473

5476

gdpblocks = groups;

5474

gdpblocks = groups;

5477

if (groups > ngroups)

5475

if (groups > ngroups)

5478

groups = ngroups;

5476

groups = ngroups;

5479

if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)

5477

if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)

5480

gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;

5478

gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;

5481

5479

5482

/* bitmaps and block group descriptor blocks */

5480

/* bitmaps and block group descriptor blocks */

5483

ret += groups + gdpblocks;

5481

ret += groups + gdpblocks;

5484

5482

5485

/* Blocks for super block, inode, quota and xattr blocks */

5483

/* Blocks for super block, inode, quota and xattr blocks */

5486

ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);

5484

ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);

5487

5485

5488

return ret;

5486

return ret;

5489

}

5487

}

5490

5488

5491

/*
 * Calculate the total number of credits to reserve to fit
 * the modification of a single page into a single transaction,
 * which may include multiple chunks of block allocations.
 *
 * This could be called via ext4_write_begin()
 *
 * We need to consider the worst case, when there is
 * one new block per extent.
 */
int ext4_writepage_trans_blocks(struct inode *inode)
{
	int blocks_per_page = ext4_journal_blocks_per_page(inode);
	/* chunk == 0: the page's blocks are treated as non-contiguous. */
	int credits = ext4_meta_trans_blocks(inode, blocks_per_page, 0);

	if (!ext4_should_journal_data(inode))
		return credits;

	/* Account for data blocks for journalled mode */
	return credits + blocks_per_page;
}

5513

5511

5514

/*
 * Calculate the journal credits for a chunk of data modification.
 *
 * This is called from DIO, fallocate or whoever is calling
 * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
 *
 * Journal buffers for data blocks are not included here, as DIO
 * and fallocate do not need to journal data buffers.
 */
int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
{
	/* Non-zero "chunk" argument: the blocks are contiguous. */
	const int contiguous = 1;

	return ext4_meta_trans_blocks(inode, nrblocks, contiguous);
}

5527

5525

5528

/*

5526

/*

5529

* The caller must have previously called ext4_reserve_inode_write().

5527

* The caller must have previously called ext4_reserve_inode_write().

5530

* Give this, we know that the caller already has write access to iloc->bh.

5528

* Give this, we know that the caller already has write access to iloc->bh.

5531

*/

5529

*/

5532

int ext4_mark_iloc_dirty(handle_t *handle,

5530

int ext4_mark_iloc_dirty(handle_t *handle,

5533

struct inode *inode, struct ext4_iloc *iloc)

5531

struct inode *inode, struct ext4_iloc *iloc)

5534

{

5532

{

5535

int err = 0;

5533

int err = 0;

5536

5534

5537

if (test_opt(inode->i_sb, I_VERSION))

5535

if (test_opt(inode->i_sb, I_VERSION))

5538

inode_inc_iversion(inode);

5536

inode_inc_iversion(inode);

5539

5537

5540

/* the do_update_inode consumes one bh->b_count */

5538

/* the do_update_inode consumes one bh->b_count */

5541

get_bh(iloc->bh);

5539

get_bh(iloc->bh);

5542

5540

5543

/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */

5541

/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */

5544

err = ext4_do_update_inode(handle, inode, iloc);

5542

err = ext4_do_update_inode(handle, inode, iloc);

5545

put_bh(iloc->bh);

5543

put_bh(iloc->bh);

5546

return err;

5544

return err;

5547

}

5545

}

5548

5546

5549

/*

5547

/*

5550

* On success, We end up with an outstanding reference count against

5548

* On success, We end up with an outstanding reference count against

5551

* iloc->bh. This _must_ be cleaned up later.

5549

* iloc->bh. This _must_ be cleaned up later.

5552

*/

5550

*/

5553

5551

5554

int

5552

int

5555

ext4_reserve_inode_write(handle_t *handle, struct inode *inode,

5553

ext4_reserve_inode_write(handle_t *handle, struct inode *inode,

5556

struct ext4_iloc *iloc)

5554

struct ext4_iloc *iloc)

5557

{

5555

{

5558

int err;

5556

int err;

5559

5557

5560

err = ext4_get_inode_loc(inode, iloc);

5558

err = ext4_get_inode_loc(inode, iloc);

5561

if (!err) {

5559

if (!err) {

5562

BUFFER_TRACE(iloc->bh, "get_write_access");

5560

BUFFER_TRACE(iloc->bh, "get_write_access");

5563

err = ext4_journal_get_write_access(handle, iloc->bh);

5561

err = ext4_journal_get_write_access(handle, iloc->bh);

5564

if (err) {

5562

if (err) {

5565

brelse(iloc->bh);

5563

brelse(iloc->bh);

5566

iloc->bh = NULL;

5564

iloc->bh = NULL;

5567

}

5565

}

5568

}

5566

}

5569

ext4_std_error(inode->i_sb, err);

5567

ext4_std_error(inode->i_sb, err);

5570

return err;

5568

return err;

5571

}

5569

}

5572

5570

5573

/*

5571

/*

5574

* Expand an inode by new_extra_isize bytes.

5572

* Expand an inode by new_extra_isize bytes.

5575

* Returns 0 on success or negative error number on failure.

5573

* Returns 0 on success or negative error number on failure.

5576

*/

5574

*/

5577

static int ext4_expand_extra_isize(struct inode *inode,

5575

static int ext4_expand_extra_isize(struct inode *inode,

5578

unsigned int new_extra_isize,

5576

unsigned int new_extra_isize,

5579

struct ext4_iloc iloc,

5577

struct ext4_iloc iloc,

5580

handle_t *handle)

5578

handle_t *handle)

5581

{

5579

{

5582

struct ext4_inode *raw_inode;

5580

struct ext4_inode *raw_inode;

5583

struct ext4_xattr_ibody_header *header;

5581

struct ext4_xattr_ibody_header *header;

5584

5582

5585

if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)

5583

if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)

5586

return 0;

5584

return 0;

5587

5585

5588

raw_inode = ext4_raw_inode(&iloc);

5586

raw_inode = ext4_raw_inode(&iloc);

5589

5587

5590

header = IHDR(inode, raw_inode);

5588

header = IHDR(inode, raw_inode);

5591

5589

5592

/* No extended attributes present */

5590

/* No extended attributes present */

5593

if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||

5591

if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||

5594

header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {

5592

header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {

5595

memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,

5593

memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,

5596

new_extra_isize);

5594

new_extra_isize);

5597

EXT4_I(inode)->i_extra_isize = new_extra_isize;

5595

EXT4_I(inode)->i_extra_isize = new_extra_isize;

5598

return 0;

5596

return 0;

5599

}

5597

}

5600

5598

5601

/* try to expand with EAs present */

5599

/* try to expand with EAs present */

5602

return ext4_expand_extra_isize_ea(inode, new_extra_isize,

5600

return ext4_expand_extra_isize_ea(inode, new_extra_isize,

5603

raw_inode, handle);

5601

raw_inode, handle);

5604

}

5602

}

5605

5603

5606

/*

5604

/*

5607

* What we do here is to mark the in-core inode as clean with respect to inode

5605

* What we do here is to mark the in-core inode as clean with respect to inode

5608

* dirtiness (it may still be data-dirty).

5606

* dirtiness (it may still be data-dirty).

5609

* This means that the in-core inode may be reaped by prune_icache

5607

* This means that the in-core inode may be reaped by prune_icache

5610

* without having to perform any I/O. This is a very good thing,

5608

* without having to perform any I/O. This is a very good thing,

5611

* because *any* task may call prune_icache - even ones which

5609

* because *any* task may call prune_icache - even ones which

5612

* have a transaction open against a different journal.

5610

* have a transaction open against a different journal.

5613

*

5611

*

5614

* Is this cheating? Not really. Sure, we haven't written the

5612

* Is this cheating? Not really. Sure, we haven't written the

5615

* inode out, but prune_icache isn't a user-visible syncing function.

5613

* inode out, but prune_icache isn't a user-visible syncing function.

5616

* Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)

5614

* Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)

5617

* we start and wait on commits.

5615

* we start and wait on commits.

5618

*

5616

*

5619

* Is this efficient/effective? Well, we're being nice to the system

5617

* Is this efficient/effective? Well, we're being nice to the system

5620

* by cleaning up our inodes proactively so they can be reaped

5618

* by cleaning up our inodes proactively so they can be reaped

5621

* without I/O. But we are potentially leaving up to five seconds'

5619

* without I/O. But we are potentially leaving up to five seconds'

5622

* worth of inodes floating about which prune_icache wants us to

5620

* worth of inodes floating about which prune_icache wants us to

5623

* write out. One way to fix that would be to get prune_icache()

5621

* write out. One way to fix that would be to get prune_icache()

5624

* to do a write_super() to free up some memory. It has the desired

5622

* to do a write_super() to free up some memory. It has the desired

5625

* effect.

5623

* effect.

5626

*/

5624

*/

5627

int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)

5625

int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)

5628

{

5626

{

5629

struct ext4_iloc iloc;

5627

struct ext4_iloc iloc;

5630

struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

5628

struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

5631

static unsigned int mnt_count;

5629

static unsigned int mnt_count;

5632

int err, ret;

5630

int err, ret;

5633

5631

5634

might_sleep();

5632

might_sleep();

5635

trace_ext4_mark_inode_dirty(inode, _RET_IP_);

5633

trace_ext4_mark_inode_dirty(inode, _RET_IP_);

5636

err = ext4_reserve_inode_write(handle, inode, &iloc);

5634

err = ext4_reserve_inode_write(handle, inode, &iloc);

5637

if (ext4_handle_valid(handle) &&

5635

if (ext4_handle_valid(handle) &&

5638

EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&

5636

EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&

5639

!ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {

5637

!ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {

5640

/*

5638

/*

5641

* We need extra buffer credits since we may write into EA block

5639

* We need extra buffer credits since we may write into EA block

5642

* with this same handle. If journal_extend fails, then it will

5640

* with this same handle. If journal_extend fails, then it will

5643

* only result in a minor loss of functionality for that inode.

5641

* only result in a minor loss of functionality for that inode.

5644

* If this is felt to be critical, then e2fsck should be run to

5642

* If this is felt to be critical, then e2fsck should be run to

5645

* force a large enough s_min_extra_isize.

5643

* force a large enough s_min_extra_isize.

5646

*/

5644

*/

5647

if ((jbd2_journal_extend(handle,

5645

if ((jbd2_journal_extend(handle,

5648

EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {

5646

EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {

5649

ret = ext4_expand_extra_isize(inode,

5647

ret = ext4_expand_extra_isize(inode,

5650

sbi->s_want_extra_isize,

5648

sbi->s_want_extra_isize,

5651

iloc, handle);

5649

iloc, handle);

5652

if (ret) {

5650

if (ret) {

5653

ext4_set_inode_state(inode,

5651

ext4_set_inode_state(inode,

5654

EXT4_STATE_NO_EXPAND);

5652

EXT4_STATE_NO_EXPAND);

5655

if (mnt_count !=

5653

if (mnt_count !=

5656

le16_to_cpu(sbi->s_es->s_mnt_count)) {

5654

le16_to_cpu(sbi->s_es->s_mnt_count)) {

5657

ext4_warning(inode->i_sb,

5655

ext4_warning(inode->i_sb,

5658

"Unable to expand inode %lu. Delete"

5656

"Unable to expand inode %lu. Delete"

5659

" some EAs or run e2fsck.",

5657

" some EAs or run e2fsck.",

5660

inode->i_ino);

5658

inode->i_ino);

5661

mnt_count =

5659

mnt_count =

5662

le16_to_cpu(sbi->s_es->s_mnt_count);

5660

le16_to_cpu(sbi->s_es->s_mnt_count);

5663

}

5661

}

5664

}

5662

}

5665

}

5663

}

5666

}

5664

}

5667

if (!err)

5665

if (!err)

5668

err = ext4_mark_iloc_dirty(handle, inode, &iloc);

5666

err = ext4_mark_iloc_dirty(handle, inode, &iloc);

5669

return err;

5667

return err;

5670

}

5668

}

5671

5669

5672

/*

5670

/*

5673

* ext4_dirty_inode() is called from __mark_inode_dirty()

5671

* ext4_dirty_inode() is called from __mark_inode_dirty()

5674

*

5672

*

5675

* We're really interested in the case where a file is being extended.

5673

* We're really interested in the case where a file is being extended.

5676

* i_size has been changed by generic_commit_write() and we thus need

5674

* i_size has been changed by generic_commit_write() and we thus need

5677

* to include the updated inode in the current transaction.

5675

* to include the updated inode in the current transaction.

5678

*

5676

*

5679

* Also, dquot_alloc_block() will always dirty the inode when blocks

5677

* Also, dquot_alloc_block() will always dirty the inode when blocks

5680

* are allocated to the file.

5678

* are allocated to the file.

5681

*

5679

*

5682

* If the inode is marked synchronous, we don't honour that here - doing

5680

* If the inode is marked synchronous, we don't honour that here - doing

5683

* so would cause a commit on atime updates, which we don't bother doing.

5681

* so would cause a commit on atime updates, which we don't bother doing.

5684

* We handle synchronous inodes at the highest possible level.

5682

* We handle synchronous inodes at the highest possible level.

5685

*/

5683

*/

5686

void ext4_dirty_inode(struct inode *inode)

5684

void ext4_dirty_inode(struct inode *inode)

5687

{

5685

{

5688

handle_t *handle;

5686

handle_t *handle;

5689

5687

5690

handle = ext4_journal_start(inode, 2);

5688

handle = ext4_journal_start(inode, 2);

5691

if (IS_ERR(handle))

5689

if (IS_ERR(handle))

5692

goto out;

5690

goto out;

5693

5691

5694

ext4_mark_inode_dirty(handle, inode);

5692

ext4_mark_inode_dirty(handle, inode);

5695

5693

5696

ext4_journal_stop(handle);

5694

ext4_journal_stop(handle);

5697

out:

5695

out:

5698

return;

5696

return;

5699

}

5697

}

5700

5698

5701

#if 0

5699

#if 0

5702

/*

5700

/*

5703

* Bind an inode's backing buffer_head into this transaction, to prevent

5701

* Bind an inode's backing buffer_head into this transaction, to prevent

5704

* it from being flushed to disk early. Unlike

5702

* it from being flushed to disk early. Unlike

5705

* ext4_reserve_inode_write, this leaves behind no bh reference and

5703

* ext4_reserve_inode_write, this leaves behind no bh reference and

5706

* returns no iloc structure, so the caller needs to repeat the iloc

5704

* returns no iloc structure, so the caller needs to repeat the iloc

5707

* lookup to mark the inode dirty later.

5705

* lookup to mark the inode dirty later.

5708

*/

5706

*/

5709

static int ext4_pin_inode(handle_t *handle, struct inode *inode)

5707

static int ext4_pin_inode(handle_t *handle, struct inode *inode)

5710

{

5708

{

5711

struct ext4_iloc iloc;

5709

struct ext4_iloc iloc;

5712

5710

5713

int err = 0;

5711

int err = 0;

5714

if (handle) {

5712

if (handle) {

5715

err = ext4_get_inode_loc(inode, &iloc);

5713

err = ext4_get_inode_loc(inode, &iloc);

5716

if (!err) {

5714

if (!err) {

5717

BUFFER_TRACE(iloc.bh, "get_write_access");

5715

BUFFER_TRACE(iloc.bh, "get_write_access");

5718

err = jbd2_journal_get_write_access(handle, iloc.bh);

5716

err = jbd2_journal_get_write_access(handle, iloc.bh);

5719

if (!err)

5717

if (!err)

5720

err = ext4_handle_dirty_metadata(handle,

5718

err = ext4_handle_dirty_metadata(handle,

5721

NULL,

5719

NULL,

5722

iloc.bh);

5720

iloc.bh);

5723

brelse(iloc.bh);

5721

brelse(iloc.bh);

5724

}

5722

}

5725

}

5723

}

5726

ext4_std_error(inode->i_sb, err);

5724

ext4_std_error(inode->i_sb, err);

5727

return err;

5725

return err;

5728

}

5726

}

5729

#endif

5727

#endif

5730

5728

5731

int ext4_change_inode_journal_flag(struct inode *inode, int val)

5729

int ext4_change_inode_journal_flag(struct inode *inode, int val)

5732

{

5730

{

5733

journal_t *journal;

5731

journal_t *journal;

5734

handle_t *handle;

5732

handle_t *handle;

5735

int err;

5733

int err;

5736

5734

5737

/*

5735

/*

5738

* We have to be very careful here: changing a data block's

5736

* We have to be very careful here: changing a data block's

5739

* journaling status dynamically is dangerous. If we write a

5737

* journaling status dynamically is dangerous. If we write a

5740

* data block to the journal, change the status and then delete

5738

* data block to the journal, change the status and then delete

5741

* that block, we risk forgetting to revoke the old log record

5739

* that block, we risk forgetting to revoke the old log record

5742

* from the journal and so a subsequent replay can corrupt data.

5740

* from the journal and so a subsequent replay can corrupt data.

5743

* So, first we make sure that the journal is empty and that

5741

* So, first we make sure that the journal is empty and that

5744

* nobody is changing anything.

5742

* nobody is changing anything.

5745

*/

5743

*/

5746

5744

5747

journal = EXT4_JOURNAL(inode);

5745

journal = EXT4_JOURNAL(inode);

5748

if (!journal)

5746

if (!journal)

5749

return 0;

5747

return 0;

5750

if (is_journal_aborted(journal))

5748

if (is_journal_aborted(journal))

5751

return -EROFS;

5749

return -EROFS;

5752

5750

5753

jbd2_journal_lock_updates(journal);

5751

jbd2_journal_lock_updates(journal);

5754

jbd2_journal_flush(journal);

5752

jbd2_journal_flush(journal);

5755

5753

5756

/*

5754

/*

5757

* OK, there are no updates running now, and all cached data is

5755

* OK, there are no updates running now, and all cached data is

5758

* synced to disk. We are now in a completely consistent state

5756

* synced to disk. We are now in a completely consistent state

5759

* which doesn't have anything in the journal, and we know that

5757

* which doesn't have anything in the journal, and we know that

5760

* no filesystem updates are running, so it is safe to modify

5758

* no filesystem updates are running, so it is safe to modify

5761

* the inode's in-core data-journaling state flag now.

5759

* the inode's in-core data-journaling state flag now.

5762

*/

5760

*/

5763

5761

5764

if (val)

5762

if (val)

5765

ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);

5763

ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);

5766

else

5764

else

5767

ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);

5765

ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);

5768

ext4_set_aops(inode);

5766

ext4_set_aops(inode);

5769

5767

5770

jbd2_journal_unlock_updates(journal);

5768

jbd2_journal_unlock_updates(journal);

5771

5769

5772

/* Finally we can mark the inode as dirty. */

5770

/* Finally we can mark the inode as dirty. */

5773

5771

5774

handle = ext4_journal_start(inode, 1);

5772

handle = ext4_journal_start(inode, 1);

5775

if (IS_ERR(handle))

5773

if (IS_ERR(handle))

5776

return PTR_ERR(handle);

5774

return PTR_ERR(handle);

5777

5775

5778

err = ext4_mark_inode_dirty(handle, inode);

5776

err = ext4_mark_inode_dirty(handle, inode);

5779

ext4_handle_sync(handle);

5777

ext4_handle_sync(handle);

5780

ext4_journal_stop(handle);

5778

ext4_journal_stop(handle);

5781

ext4_std_error(inode->i_sb, err);

5779

ext4_std_error(inode->i_sb, err);

5782

5780

5783

return err;

5781

return err;

5784

}

5782

}

5785

5783

5786

static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)

5784

static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)

5787

{

5785

{

5788

return !buffer_mapped(bh);

5786

return !buffer_mapped(bh);

5789

}

5787

}

5790

5788

5791

int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)

5789

int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)

5792

{

5790

{

5793

struct page *page = vmf->page;

5791

struct page *page = vmf->page;

5794

loff_t size;

5792

loff_t size;

5795

unsigned long len;

5793

unsigned long len;

5796

int ret = -EINVAL;

5794

int ret = -EINVAL;

5797

void *fsdata;

5795

void *fsdata;

5798

struct file *file = vma->vm_file;

5796

struct file *file = vma->vm_file;

5799

struct inode *inode = file->f_path.dentry->d_inode;

5797

struct inode *inode = file->f_path.dentry->d_inode;

5800

struct address_space *mapping = inode->i_mapping;

5798

struct address_space *mapping = inode->i_mapping;

5801

5799

5802

/*

5800

/*

5803

* Get i_alloc_sem to stop truncates messing with the inode. We cannot

5801

* Get i_alloc_sem to stop truncates messing with the inode. We cannot

5804

* get i_mutex because we are already holding mmap_sem.

5802

* get i_mutex because we are already holding mmap_sem.

5805

*/

5803

*/

5806

down_read(&inode->i_alloc_sem);

5804

down_read(&inode->i_alloc_sem);

5807

size = i_size_read(inode);

5805

size = i_size_read(inode);

5808

if (page->mapping != mapping || size <= page_offset(page)

5806

if (page->mapping != mapping || size <= page_offset(page)

5809

|| !PageUptodate(page)) {

5807

|| !PageUptodate(page)) {

5810

/* page got truncated from under us? */

5808

/* page got truncated from under us? */

5811

goto out_unlock;

5809

goto out_unlock;

5812

}

5810

}

5813

ret = 0;

5811

ret = 0;

5814

if (PageMappedToDisk(page))

5812

if (PageMappedToDisk(page))

5815

goto out_unlock;

5813

goto out_unlock;

5816

5814

5817

if (page->index == size >> PAGE_CACHE_SHIFT)

5815

if (page->index == size >> PAGE_CACHE_SHIFT)

5818

len = size & ~PAGE_CACHE_MASK;

5816

len = size & ~PAGE_CACHE_MASK;

5819

else

5817

else

5820

len = PAGE_CACHE_SIZE;

5818

len = PAGE_CACHE_SIZE;

5821

5819

5822

lock_page(page);

5820

lock_page(page);

5823

/*

5821

/*

5824

* return if we have all the buffers mapped. This avoid

5822

* return if we have all the buffers mapped. This avoid

5825

* the need to call write_begin/write_end which does a

5823

* the need to call write_begin/write_end which does a

5826

* journal_start/journal_stop which can block and take

5824

* journal_start/journal_stop which can block and take

5827

* long time

5825

* long time

5828

*/

5826

*/

5829

if (page_has_buffers(page)) {

5827

if (page_has_buffers(page)) {

5830

if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,

5828

if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,

5831

ext4_bh_unmapped)) {

5829

ext4_bh_unmapped)) {

5832

unlock_page(page);

5830

unlock_page(page);

5833

goto out_unlock;

5831

goto out_unlock;

5834

}

5832

}

5835

}

5833

}

5836

unlock_page(page);

5834

unlock_page(page);

5837

/*

5835

/*

5838

* OK, we need to fill the hole... Do write_begin write_end

5836

* OK, we need to fill the hole... Do write_begin write_end

5839

* to do block allocation/reservation.We are not holding

5837

* to do block allocation/reservation.We are not holding

5840

* inode.i__mutex here. That allow * parallel write_begin,

5838

* inode.i__mutex here. That allow * parallel write_begin,

5841

* write_end call. lock_page prevent this from happening

5839

* write_end call. lock_page prevent this from happening

5842

* on the same page though

5840

* on the same page though

5843

*/

5841

*/

5844

ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),

5842

ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),

5845

len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);

5843

len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);

5846

if (ret < 0)

5844

if (ret < 0)

5847

goto out_unlock;

5845

goto out_unlock;

5848

ret = mapping->a_ops->write_end(file, mapping, page_offset(page),

5846

ret = mapping->a_ops->write_end(file, mapping, page_offset(page),

5849

len, len, page, fsdata);

5847

len, len, page, fsdata);

5850

if (ret < 0)

5848

if (ret < 0)

5851

goto out_unlock;

5849

goto out_unlock;

5852

ret = 0;

5850

ret = 0;

5853

out_unlock:

5851

out_unlock:

5854

if (ret)

5852

if (ret)

5855

ret = VM_FAULT_SIGBUS;

5853

ret = VM_FAULT_SIGBUS;

5856

up_read(&inode->i_alloc_sem);

5854

up_read(&inode->i_alloc_sem);

5857

return ret;

5855

return ret;

5858

}

5856

}

5859

5857

GITLAB

Eric Lee / smarc-fsl-linux-kernel

ext4: clean up some wait_on_page_writeback calls

 /*
  *  linux/fs/ext4/inode.c
  *
  * Copyright (C) 1992, 1993, 1994, 1995
  * Remy Card (card@masi.ibp.fr)
  * Laboratoire MASI - Institut Blaise Pascal
  * Universite Pierre et Marie Curie (Paris VI)
  *
  *  from
  *
  *  linux/fs/minix/inode.c
  *
  *  Copyright (C) 1991, 1992  Linus Torvalds
  *
  *  Goal-directed block allocation by Stephen Tweedie
  *	(sct@redhat.com), 1993, 1998
  *  Big-endian to little-endian byte-swapping/bitmaps by
  *        David S. Miller (davem@caip.rutgers.edu), 1995
  *  64-bit file support on 64-bit platforms by Jakub Jelinek
  *	(jj@sunsite.ms.mff.cuni.cz)
  *
  *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
  */
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/time.h>
 #include <linux/jbd2.h>
 #include <linux/highuid.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
 #include <linux/string.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
 #include <linux/pagevec.h>
 #include <linux/mpage.h>
 #include <linux/namei.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
 #include <linux/workqueue.h>
 #include <linux/kernel.h>
 #include <linux/printk.h>
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
 #include "ext4_extents.h"
 #include <trace/events/ext4.h>
 #define MPAGE_DA_EXTENT_TAIL 0x01
 /*
  * Thin wrapper around jbd2_journal_begin_ordered_truncate() for @inode
  * being truncated to @new_size.  Emits the matching trace event first.
  *
  * Returns 0 without touching the journal when the inode has no jinode,
  * otherwise the result of jbd2_journal_begin_ordered_truncate().
  */
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
 					      loff_t new_size)
 {
 	trace_ext4_begin_ordered_truncate(inode, new_size);
 	/*
 	 * A jinode is only set up once the file has been opened for
 	 * writing.  When it is absent there are no outstanding writes to
 	 * flush, so only the case where it exists needs the jbd2 call.
 	 */
 	if (EXT4_I(inode)->jinode)
 		return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
 							   EXT4_I(inode)->jinode,
 							   new_size);
 	return 0;
 }
 /*
  * Prototypes for file-local (static) helpers.  Only the declarations
  * live here; the bodies are defined elsewhere in this file.
  */
 static void ext4_invalidatepage(struct page *page, unsigned long offset);
 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result, int create);
 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
 static int __ext4_journalled_writepage(struct page *page, unsigned int len);
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
 /*
  * Decide whether @inode is a fast symlink.
  *
  * Returns 1 when the inode is a symlink whose i_blocks count, after
  * subtracting the sectors of one filesystem block if i_file_acl is set,
  * is zero; returns 0 otherwise.
  */
 static int ext4_inode_is_fast_symlink(struct inode *inode)
 {
 	int xattr_sectors = 0;
 	/* i_blocks is compared in 512-byte units, hence the shift by 9. */
 	if (EXT4_I(inode)->i_file_acl)
 		xattr_sectors = inode->i_sb->s_blocksize >> 9;
 	if (!S_ISLNK(inode->i_mode))
 		return 0;
 	return inode->i_blocks - xattr_sectors == 0;
 }
 /*
  * Estimate the journal credits needed for the next chunk of a truncate
  * transaction on @inode.
  *
  * The estimate is the inode's block count (i_blocks converted from
  * 512-byte sectors to filesystem blocks), clamped to the range
  * [2, EXT4_MAX_TRANS_DATA], plus EXT4_DATA_TRANS_BLOCKS() for the
  * superblock.
  */
 static unsigned long blocks_for_truncate(struct inode *inode)
 {
 	ext4_lblk_t nr_blocks =
 		inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
 	/*
 	 * Lower bound: leave a little room even when i_blocks is corrupt.
 	 * Disk corruption has produced inodes full of random data that
 	 * still looked enough like a regular file for ext4 to try to
 	 * delete them; that will misbehave, but it should not panic the
 	 * kernel.
 	 */
 	if (nr_blocks < 2)
 		nr_blocks = 2;
 	/* Upper bound: keep the transaction from overflowing the journal. */
 	if (nr_blocks > EXT4_MAX_TRANS_DATA)
 		nr_blocks = EXT4_MAX_TRANS_DATA;
 	return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + nr_blocks;
 }
 /*
  * Truncate transactions can be complex and absolutely huge, so the
  * truncate code must be able to restart the transaction at a convenient
  * checkpoint to make sure the journal is not overflowed.
  *
  * start_transaction() opens a new handle for a truncate transaction,
  * sized by blocks_for_truncate().  try_to_extend_transaction() tries to
  * grow an existing handle a bit; when that fails the failure has to be
  * propagated up so the top-level truncate loop can restart the
  * transaction. --sct
  *
  * Returns the value of ext4_journal_start() unchanged.  If it is an error
  * pointer, the error is reported through ext4_std_error() first, so
  * callers still have to check the result with IS_ERR().
  */
 static handle_t *start_transaction(struct inode *inode)
 {
 	handle_t *handle;
 	handle = ext4_journal_start(inode, blocks_for_truncate(inode));
 	if (IS_ERR(handle))
 		ext4_std_error(inode->i_sb, PTR_ERR(handle));
 	return handle;
 }
 /*
  * Try to make room in @handle for more truncate work on @inode.
  *
  * Returns 0 when the caller can carry on with the same handle: either
  * ext4_handle_valid() rejects the handle, the handle already has at least
  * EXT4_RESERVE_TRANS_BLOCKS+1 credits, or ext4_journal_extend() returned
  * zero.  Returns 1 when ext4_journal_extend() returned non-zero, meaning
  * no room could be created and the transaction must be restarted.
  */
 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
 {
 	if (!ext4_handle_valid(handle) ||
 	    ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
 		return 0;
 	/* Any non-zero result from the extend attempt maps to "restart". */
 	return ext4_journal_extend(handle, blocks_for_truncate(inode)) ? 1 : 0;
 }
 /*
  * Restart the transaction associated with *handle.  This does a commit,
  * so before we call here everything must be consistently dirtied against
  * this transaction.
  *
  * @handle:  journal handle to restart
  * @inode:   inode being truncated; its i_data_sem is released for the
  *           duration of the restart and taken again for writing afterwards
  *           (so the caller presumably holds it for writing on entry)
  * @nblocks: credit count handed to ext4_journal_restart()
  *
  * Returns the result of ext4_journal_restart().  The inode's
  * preallocations are discarded before returning, whatever that result is.
  */
 int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
 				 int nblocks)
 {
 	int ret;
 	/*
 	 * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
 	 * moment, get_block can be called only for blocks inside i_size since
 	 * page cache has been already dropped and writes are blocked by
 	 * i_mutex. So we can safely drop the i_data_sem here.
 	 */
 	/* Restarting a handle makes no sense without a journal. */
 	BUG_ON(EXT4_JOURNAL(inode) == NULL);
 	jbd_debug(2, "restarting handle %p\n", handle);
 	up_write(&EXT4_I(inode)->i_data_sem);
 	ret = ext4_journal_restart(handle, nblocks);
 	/* Re-take the semaphore before touching per-inode allocation state. */
 	down_write(&EXT4_I(inode)->i_data_sem);
 	ext4_discard_preallocations(inode);
 	return ret;
 }
 /*
  * Evict @inode from memory.
  *
  * If the inode still has links (i_nlink != 0) only its page cache is
  * truncated and the in-core inode is cleared.  Otherwise the inode is
  * deleted: its data is truncated inside a journal transaction, it is
  * removed from the orphan list, i_dtime is set, and the on-disk inode is
  * freed.  Every failure path falls back to clearing the in-core inode
  * without freeing the on-disk one.
  */
 void ext4_evict_inode(struct inode *inode)
 {
 	handle_t *handle;
 	int err;
 	trace_ext4_evict_inode(inode);
 	/* Still linked: nothing to delete, just drop cached pages. */
 	if (inode->i_nlink) {
 		truncate_inode_pages(&inode->i_data, 0);
 		goto no_delete;
 	}
 	if (!is_bad_inode(inode))
 		dquot_initialize(inode);
 	if (ext4_should_order_data(inode))
 		ext4_begin_ordered_truncate(inode, 0);
 	truncate_inode_pages(&inode->i_data, 0);
 	/* A bad inode gets its pages dropped but no on-disk cleanup. */
 	if (is_bad_inode(inode))
 		goto no_delete;
 	/*
 	 * Truncate credits plus 3 extra; presumably the same 3 credits that
 	 * are re-checked below for the orphan-list removal and dtime update.
 	 */
 	handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
 	if (IS_ERR(handle)) {
 		ext4_std_error(inode->i_sb, PTR_ERR(handle));
 		/*
 		 * If we're going to skip the normal cleanup, we still need to
 		 * make sure that the in-core orphan linked list is properly
 		 * cleaned up.
 		 */
 		ext4_orphan_del(NULL, inode);
 		goto no_delete;
 	}
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
 	/* Record the zero size in the journal before truncating the blocks. */
 	inode->i_size = 0;
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err) {
 		ext4_warning(inode->i_sb,
 			     "couldn't mark inode dirty (err %d)", err);
 		goto stop_handle;
 	}
 	if (inode->i_blocks)
 		ext4_truncate(inode);
 	/*
 	 * ext4_ext_truncate() doesn't reserve any slop when it
 	 * restarts journal transactions; therefore there may not be
 	 * enough credits left in the handle to remove the inode from
 	 * the orphan list and set the dtime field.
 	 */
 	if (!ext4_handle_has_enough_credits(handle, 3)) {
 		/* A positive result from extend means "restart instead". */
 		err = ext4_journal_extend(handle, 3);
 		if (err > 0)
 			err = ext4_journal_restart(handle, 3);
 		if (err != 0) {
 			ext4_warning(inode->i_sb,
 				     "couldn't extend journal (err %d)", err);
 			/*
 			 * Shared bail-out, also reached by goto from the
 			 * mark_inode_dirty failure above: close the handle,
 			 * drop the in-core orphan entry, skip the delete.
 			 */
 		stop_handle:
 			ext4_journal_stop(handle);
 			ext4_orphan_del(NULL, inode);
 			goto no_delete;
 		}
 	}
 	/*
 	 * Kill off the orphan record which ext4_truncate created.
 	 * AKPM: I think this can be inside the above `if'.
 	 * Note that ext4_orphan_del() has to be able to cope with the
 	 * deletion of a non-existent orphan - this is because we don't
 	 * know if ext4_truncate() actually created an orphan record.
 	 * (Well, we could do this if we need to, but heck - it works)
 	 */
 	ext4_orphan_del(handle, inode);
 	EXT4_I(inode)->i_dtime	= get_seconds();
 	/*
 	 * One subtle ordering requirement: if anything has gone wrong
 	 * (transaction abort, IO errors, whatever), then we can still
 	 * do these next steps (the fs will already have been marked as
 	 * having errors), but we can't free the inode if the mark_dirty
 	 * fails.
 	 */
 	if (ext4_mark_inode_dirty(handle, inode))
 		/* If that failed, just do the required in-core inode clear. */
 		ext4_clear_inode(inode);
 	else
 		ext4_free_inode(handle, inode);
 	ext4_journal_stop(handle);
 	return;
 no_delete:
 	ext4_clear_inode(inode);	/* We must guarantee clearing of inode... */
 }
 /*
  * One link of a pointer chain, filled in by add_chain().
  */
 typedef struct {
 	/* Address of the little-endian 32-bit slot this link refers to. */
 	__le32	*p;
 	/* Copy of *p taken at the moment add_chain() recorded the link. */
 	__le32	key;
 	/*
 	 * Buffer head stored with the link; presumably the buffer that
 	 * contains *p -- TODO confirm against the callers of add_chain().
 	 */
 	struct buffer_head *bh;
 } Indirect;
 /*
  * Fill in chain link @p: remember the slot address @v, snapshot the value
  * currently stored in that slot as the link's key, and record @bh.
  */
 static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
 {
 	p->p = v;
 	p->key = *v;
 	p->bh = bh;
 }
 /**
  *	ext4_block_to_path - parse the block number into array of offsets
  *	@inode: inode in question (we are only interested in its superblock)
  *	@i_block: block number to be parsed
  *	@offsets: array to store the offsets in
  *	@boundary: set this non-zero if the referred-to block is likely to be
  *	       followed (on disk) by an indirect block.
  *
  *	To store the locations of file's data ext4 uses a data structure common
  *	for UNIX filesystems - tree of pointers anchored in the inode, with
  *	data blocks at leaves and indirect blocks in intermediate nodes.
  *	This function translates the block number into path in that tree -
  *	return value is the path length and @offsets[n] is the offset of
  *	pointer to (n+1)th node in the nth one. If @i_block is out of range
  *	(negative or too large) a warning is printed and zero is returned.
  *
  *	Note: function doesn't find node addresses, so no IO is needed. All
  *	we need to know is the capacity of indirect blocks (taken from the
  *	inode->i_sb).
  */
 /*
  * Portability note: the last comparison (check that we fit into triple
  * indirect block) is spelled differently, because otherwise on an
  * architecture with 32-bit longs and 8Kb pages we might get into trouble
  * if our filesystem had 8Kb blocks. We might use long long, but that would
  * kill us on x86. Oh, well, at least the sign propagation does not matter -
  * i_block would have to be negative in the very beginning, so we would not
  * get there at all.
  */
 static int ext4_block_to_path(struct inode *inode,
 			      ext4_lblk_t i_block,
 			      ext4_lblk_t offsets[4], int *boundary)
 {
 	int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
 	int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
 	const long direct_blocks = EXT4_NDIR_BLOCKS;
 	const long indirect_blocks = ptrs;
 	const long double_blocks = (1 << (ptrs_bits * 2));
 	int depth = 0;	/* number of offsets[] entries filled in */
 	int span = 0;	/* size of the pointer array the last offset indexes */
 	/*
 	 * Peel off one addressing level at a time.  i_block is rebased
 	 * after every level that does not match, so each test works on an
 	 * index relative to the start of the level being tried.
 	 */
 	/* Level 0: pointer stored directly in the inode. */
 	if (i_block < direct_blocks) {
 		offsets[depth++] = i_block;
 		span = direct_blocks;
 		goto done;
 	}
 	/* Level 1: single indirect block. */
 	i_block -= direct_blocks;
 	if (i_block < indirect_blocks) {
 		offsets[depth++] = EXT4_IND_BLOCK;
 		offsets[depth++] = i_block;
 		span = ptrs;
 		goto done;
 	}
 	/* Level 2: double indirect block. */
 	i_block -= indirect_blocks;
 	if (i_block < double_blocks) {
 		offsets[depth++] = EXT4_DIND_BLOCK;
 		offsets[depth++] = i_block >> ptrs_bits;
 		offsets[depth++] = i_block & (ptrs - 1);
 		span = ptrs;
 		goto done;
 	}
 	/*
 	 * Level 3: triple indirect block.  The range test is done on the
 	 * shifted value; see the portability note above the function.
 	 */
 	i_block -= double_blocks;
 	if ((i_block >> (ptrs_bits * 2)) < ptrs) {
 		offsets[depth++] = EXT4_TIND_BLOCK;
 		offsets[depth++] = i_block >> (ptrs_bits * 2);
 		offsets[depth++] = (i_block >> ptrs_bits) & (ptrs - 1);
 		offsets[depth++] = i_block & (ptrs - 1);
 		span = ptrs;
 		goto done;
 	}
 	/* Beyond the triple indirect range: report it, depth stays 0. */
 	ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
 		     i_block + direct_blocks +
 		     indirect_blocks + double_blocks, inode->i_ino);
 done:
 	if (boundary)
 		*boundary = span - 1 - (i_block & (ptrs - 1));
 	return depth;
 }
 /*
  * Check @max little-endian block numbers starting at @p.  Zero entries
  * are skipped; the first non-zero entry rejected by
  * ext4_data_block_valid() is recorded in the superblock, reported via
  * ext4_error_inode() on behalf of @function/@line, and makes the
  * function return -EIO.  Returns 0 when every entry passes.
  */
 static int __ext4_check_blockref(const char *function, unsigned int line,
 				 struct inode *inode,
 				 __le32 *p, unsigned int max)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	unsigned int i;
 	for (i = 0; i < max; i++) {
 		unsigned int blk = le32_to_cpu(p[i]);
 		/* An empty slot has nothing to validate. */
 		if (!blk)
 			continue;
 		if (likely(ext4_data_block_valid(sbi, blk, 1)))
 			continue;
 		sbi->s_es->s_last_error_block = cpu_to_le64(blk);
 		ext4_error_inode(inode, function, line, blk,
 				 "invalid block");
 		return -EIO;
 	}
 	return 0;
 }
 /*
  * Validate every block number held in the indirect block buffer @bh
  * (EXT4_ADDR_PER_BLOCK entries), reporting errors against the calling
  * function and line.  Evaluates to 0 on success, -EIO otherwise.
  */
 #define ext4_check_indirect_blockref(inode, bh)                         \
 	__ext4_check_blockref(__func__, __LINE__, inode,		\
 			      (__le32 *)(bh)->b_data,			\
 			      EXT4_ADDR_PER_BLOCK((inode)->i_sb))
 /*
  * Validate the first EXT4_NDIR_BLOCKS block numbers stored in the
  * in-core inode's i_data array, reporting errors against the calling
  * function and line.  Evaluates to 0 on success, -EIO otherwise.
  */
 #define ext4_check_inode_blockref(inode)                                \
 	__ext4_check_blockref(__func__, __LINE__, inode,		\
 			      EXT4_I(inode)->i_data,			\
 			      EXT4_NDIR_BLOCKS)
 /**
  *	ext4_get_branch - read the chain of indirect blocks leading to data
  *	@inode: inode in question
  *	@depth: depth of the chain (1 - direct pointer, etc.)
  *	@offsets: offsets of pointers in inode/indirect blocks
  *	@chain: place to store the result
  *	@err: here we store the error value
  *
  *	Function fills the array of triples <key, p, bh> and returns %NULL
  *	if everything went OK or the pointer to the last filled triple
  *	(incomplete one) otherwise. Upon the return chain[i].key contains
  *	the number of (i+1)-th block in the chain (as it is stored in memory,
  *	i.e. little-endian 32-bit), chain[i].p contains the address of that
  *	number (it points into struct inode for i==0 and into the bh->b_data
  *	for i>0) and chain[i].bh points to the buffer_head of i-th indirect
  *	block for i>0 and NULL for i==0. In other words, it holds the block
  *	numbers of the chain, addresses they were taken from (and where we can
  *	verify that chain did not change) and buffer_heads hosting these
  *	numbers.
  *
  *	Function stops when it stumbles upon zero pointer (absent block)
  *		(pointer to last triple returned, *@err == 0)
  *	or when it gets an IO error reading an indirect block
  *		(ditto, *@err == -EIO)
  *	or when it reads all @depth-1 indirect blocks successfully and finds
  *	the whole chain, all way to the data (returns %NULL, *err == 0).
  *
  *      Need to be called with
  *      down_read(&EXT4_I(inode)->i_data_sem)
  */
 static Indirect *ext4_get_branch(struct inode *inode, int depth,
 				 ext4_lblk_t *offsets,
 				 Indirect chain[4], int *err)
 {
 	struct super_block *sb = inode->i_sb;
 	Indirect *link = chain;
 	struct buffer_head *bh;
 	*err = 0;
 	/* i_data is not going away, no lock needed */
 	add_chain(link, NULL, EXT4_I(inode)->i_data + *offsets);
 	for (;;) {
 		/* Zero pointer: the chain stops here, *err stays 0. */
 		if (!link->key)
 			return link;
 		/* Every level has been walked: the chain is complete. */
 		if (--depth == 0)
 			return NULL;
 		bh = sb_getblk(sb, le32_to_cpu(link->key));
 		if (unlikely(!bh))
 			break;
 		/*
 		 * Only a buffer that was not already up to date is read
 		 * from disk and has its block references validated.
 		 */
 		if (!bh_uptodate_or_lock(bh) &&
 		    (bh_submit_read(bh) < 0 ||
 		     ext4_check_indirect_blockref(inode, bh))) {
 			put_bh(bh);
 			break;
 		}
 		offsets++;
 		link++;
 		add_chain(link, bh, (__le32 *)bh->b_data + *offsets);
 		/* Reader: end */
 	}
 	/* Could not get or trust the next indirect block. */
 	*err = -EIO;
 	return link;
 }
 /**
  *	ext4_find_near - find a place for allocation with sufficient locality
  *	@inode: owner
  *	@ind: descriptor of indirect block.
  *
  *	This function returns the preferred place for block allocation.
  *	It is used when heuristic for sequential allocation fails.
  *	Rules are:
  *	  + if there is a block to the left of our position - allocate near it.
  *	  + if pointer will live in indirect block - allocate near that block.
  *	  + if pointer will live in inode - allocate in the same
  *	    cylinder group.
  *
  * In the latter case we colour the starting block by the callers PID to
  * prevent it from clashing with concurrent allocations for a different inode
  * in the same block group.   The PID is used here so that functionally related
  * files will be close-by on-disk.
  *
  *	Caller must make sure that @ind is valid and will stay that way.
  */
 static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
 {
 	struct super_block *sb = inode->i_sb;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	__le32 *first = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
 	__le32 *cur = ind->p;
 	ext4_fsblk_t bg_start;
 	ext4_fsblk_t last_block;
 	ext4_grpblk_t colour;
 	ext4_group_t block_group;
 	int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
 	int slot;
 	/* Rule 1: nearest allocated block to the left of our position. */
 	while (cur > first) {
 		cur--;
 		if (*cur)
 			return le32_to_cpu(*cur);
 	}
 	/* Rule 2: nothing to the left, stay close to the indirect block. */
 	if (ind->bh)
 		return ind->bh->b_blocknr;
 	/*
 	 * Rule 3: the pointer lives in the inode itself, so start from the
 	 * inode's block group (adjusted for the flex_bg layout).
 	 */
 	block_group = ei->i_block_group;
 	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
 		block_group &= ~(flex_size-1);
 		if (S_ISREG(inode->i_mode))
 			block_group++;
 	}
 	bg_start = ext4_group_first_block_no(sb, block_group);
 	/* With delayed allocation the colour is not taken into account. */
 	if (test_opt(sb, DELALLOC))
 		return bg_start;
 	last_block = ext4_blocks_count(EXT4_SB(sb)->s_es) - 1;
 	/*
 	 * Spread callers over 16 slices of the group, selected by PID.
 	 * When the group would run past the last block of the filesystem,
 	 * slice up the remaining space instead.
 	 */
 	slot = current->pid % 16;
 	if (bg_start + EXT4_BLOCKS_PER_GROUP(sb) <= last_block)
 		colour = slot * (EXT4_BLOCKS_PER_GROUP(sb) / 16);
 	else
 		colour = slot * ((last_block - bg_start) / 16);
 	return bg_start + colour;
 }
 /**
  *	ext4_find_goal - find a preferred place for allocation.
  *	@inode: owner
  *	@block:  block we want
  *	@partial: pointer to the last triple within a chain
  *
  *	Normally this function find the preferred place for block allocation,
  *	returns it.
  *	Because this is only used for non-extent files, we limit the block nr
  *	to 32 bits.
  */
 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
 				   Indirect *partial)
 {
 	/*
 	 * XXX need to get goal block from mballoc's data structures
 	 *
 	 * For now take the locality hint from ext4_find_near() and mask
 	 * it down to the range allowed by EXT4_MAX_BLOCK_FILE_PHYS.
 	 */
 	return ext4_find_near(inode, partial) & EXT4_MAX_BLOCK_FILE_PHYS;
 }
 /**
  *	ext4_blks_to_allocate - Look up the block map and count the number
  *	of direct blocks need to be allocated for the given branch.
  *
  *	@branch: chain of indirect blocks
  *	@k: number of blocks need for indirect blocks
  *	@blks: number of data blocks to be mapped.
  *	@blocks_to_boundary:  the offset in the indirect block
  *
  *	Returns the number of direct blocks to allocate: at most @blks and
  *	never past the boundary; the @k indirect blocks are not counted.
  */
 static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
 				 int blocks_to_boundary)
 {
 	unsigned int count;
 	/*
 	 * Indirect blocks are still missing, so nothing below them can be
 	 * allocated yet: take everything requested, capped at the boundary
 	 * (cross-boundary allocation is not handled).
 	 */
 	if (k > 0) {
 		count = blocks_to_boundary + 1;
 		if (blks < count)
 			count = blks;
 		return count;
 	}
 	/*
 	 * The parent exists: the first slot is always counted, then extend
 	 * over the following slots for as long as they are empty.
 	 */
 	for (count = 1; count < blks && count <= blocks_to_boundary; count++) {
 		if (le32_to_cpu(branch[0].p[count]) != 0)
 			break;
 	}
 	return count;
 }
 /**
  *	ext4_alloc_blocks: multiple allocate blocks needed for a branch
  *	@handle: handle for this transaction
  *	@inode: inode which needs allocated blocks
  *	@iblock: the logical block to start allocated at
  *	@goal: preferred physical block of allocation
  *	@indirect_blks: the number of blocks need to allocate for indirect
  *			blocks
  *	@blks: number of desired blocks
  *	@new_blocks: on return it will store the new block numbers for
  *	the indirect blocks(if needed) and the first direct block,
  *	@err: on return it will store the error code
  *
  *	This function will return the number of blocks allocated as
  *	requested by the passed-in parameters.
  */
 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 			     ext4_lblk_t iblock, ext4_fsblk_t goal,
 			     int indirect_blks, int blks,
 			     ext4_fsblk_t new_blocks[4], int *err)
 {
 	struct ext4_allocation_request ar;
 	int target, i;
 	unsigned long count = 0, blk_allocated = 0;
 	int index = 0;	/* next free slot in new_blocks[] */
 	ext4_fsblk_t current_block = 0;
 	int ret = 0;
 	/*
 	 * Here we try to allocate the requested multiple blocks at once,
 	 * on a best-effort basis.
 	 * To build a branch, we should allocate blocks for
 	 * the indirect blocks(if not allocated yet), and at least
 	 * the first direct block of this branch.  That's the
 	 * minimum number of blocks need to allocate(required)
 	 */
 	/* first we try to allocate the indirect blocks */
 	target = indirect_blks;
 	while (target > 0) {
 		count = target;
 		/* allocating blocks for indirect blocks and direct blocks */
 		current_block = ext4_new_meta_blocks(handle, inode,
 							goal, &count, err);
 		if (*err)
 			goto failed_out;
 		/* Refuse a range that ends beyond EXT4_MAX_BLOCK_FILE_PHYS. */
 		if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
 			EXT4_ERROR_INODE(inode,
 					 "current_block %llu + count %lu > %d!",
 					 current_block, count,
 					 EXT4_MAX_BLOCK_FILE_PHYS);
 			*err = -EIO;
 			goto failed_out;
 		}
 		/* count now holds what the allocator actually handed back. */
 		target -= count;
 		/* allocate blocks for indirect blocks */
 		while (index < indirect_blks && count) {
 			new_blocks[index++] = current_block++;
 			count--;
 		}
 		/*
 		 * Anything left in count is surplus beyond the indirect
 		 * blocks; it is treated as the start of the data blocks.
 		 */
 		if (count > 0) {
 			/*
 			 * save the new block number
 			 * for the first direct block
 			 */
 			new_blocks[index] = current_block;
 			printk(KERN_INFO "%s returned more blocks than "
 						"requested\n", __func__);
 			WARN_ON(1);
 			break;
 		}
 	}
 	/* Data blocks still wanted after any surplus picked up above. */
 	target = blks - count ;
 	blk_allocated = count;
 	if (!target)
 		goto allocated;
 	/* Now allocate data blocks */
 	memset(&ar, 0, sizeof(ar));
 	ar.inode = inode;
 	ar.goal = goal;
 	ar.len = target;
 	ar.logical = iblock;
 	if (S_ISREG(inode->i_mode))
 		/* enable in-core preallocation only for regular files */
 		ar.flags = EXT4_MB_HINT_DATA;
 	current_block = ext4_mb_new_blocks(handle, &ar, err);
 	/* Same upper-limit check as for the indirect blocks above. */
 	if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
 		EXT4_ERROR_INODE(inode,
 				 "current_block %llu + ar.len %d > %d!",
 				 current_block, ar.len,
 				 EXT4_MAX_BLOCK_FILE_PHYS);
 		*err = -EIO;
 		goto failed_out;
 	}
 	if (*err && (target == blks)) {
 		/*
 		 * if the allocation failed and we didn't allocate
 		 * any blocks before
 		 */
 		goto failed_out;
 	}
 	if (!*err) {
 		/*
 		 * target == blks means no surplus data blocks were taken
 		 * in the first stage, so new_blocks[index] is not set yet.
 		 */
 		if (target == blks) {
 			/*
 			 * save the new block number
 			 * for the first direct block
 			 */
 			new_blocks[index] = current_block;
 		}
 		blk_allocated += ar.len;
 	}
 allocated:
 	/* total number of blocks allocated for direct blocks */
 	ret = blk_allocated;
 	/*
 	 * A data allocation error is dropped here when the first stage
 	 * already produced some data blocks (target != blks).
 	 */
 	*err = 0;
 	return ret;
 failed_out:
 	/*
 	 * Only the indirect blocks recorded so far, new_blocks[0..index-1],
 	 * are released; ret is still 0 on this path and *err is left set.
 	 */
 	for (i = 0; i < index; i++)
 		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
 	return ret;
 }
 /**
  *	ext4_alloc_branch - allocate and set up a chain of blocks.
  *	@handle: handle for this transaction
  *	@inode: owner
  *	@indirect_blks: number of allocated indirect blocks
  *	@blks: number of allocated direct blocks
  *	@goal: preferred place for allocation
  *	@offsets: offsets (in the blocks) to store the pointers to next.
  *	@branch: place to store the chain in.
  *
  *	This function allocates blocks, zeroes out all but the last one,
  *	links them into chain and (if we are synchronous) writes them to disk.
  *	In other words, it prepares a branch that can be spliced onto the
  *	inode. It stores the information about that chain in the branch[], in
  *	the same format as ext4_get_branch() would do. We are calling it after
  *	we had read the existing part of chain and partial points to the last
  *	triple of that (one with zero ->key). Upon the exit we have the same
  *	picture as after the successful ext4_get_block(), except that in one
  *	place chain is disconnected - *branch->p is still zero (we did not
  *	set the last link), but branch->key contains the number that should
  *	be placed into *branch->p to fill that gap.
  *
  *	If allocation fails we free all blocks we've allocated (and forget
  *	their buffer_heads) and return the error value from the failed
  *	ext4_alloc_blocks() (normally -ENOSPC). Otherwise we set the chain
  *	as described above and return 0.
  */
 static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 			     ext4_lblk_t iblock, int indirect_blks,
 			     int *blks, ext4_fsblk_t goal,
 			     ext4_lblk_t *offsets, Indirect *branch)
 {
 	int blocksize = inode->i_sb->s_blocksize;
 	int i, n = 0;
 	int err = 0;
 	struct buffer_head *bh;
 	int num;
 	ext4_fsblk_t new_blocks[4];
 	ext4_fsblk_t current_block;
 	/*
 	 * Allocate everything up front.  On success
 	 * new_blocks[0..indirect_blks-1] hold the new indirect blocks and
 	 * new_blocks[indirect_blks] the first of the @num data blocks.
 	 */
 	num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
 				*blks, new_blocks, &err);
 	if (err)
 		return err;
 	/* The missing link: the caller stores this into *branch[0].p. */
 	branch[0].key = cpu_to_le32(new_blocks[0]);
 	/*
 	 * metadata blocks and data blocks are allocated.
 	 */
 	for (n = 1; n <= indirect_blks;  n++) {
 		/*
 		 * Get buffer_head for parent block, zero it out
 		 * and set the pointer to new one, then send
 		 * parent to disk.
 		 */
 		bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
 		if (unlikely(!bh)) {
 			err = -EIO;
 			goto failed;
 		}
 		branch[n].bh = bh;
 		lock_buffer(bh);
 		BUFFER_TRACE(bh, "call get_create_access");
 		err = ext4_journal_get_create_access(handle, bh);
 		if (err) {
 			/* Don't brelse(bh) here; it's done in
 			 * ext4_journal_forget() below */
 			unlock_buffer(bh);
 			goto failed;
 		}
 		memset(bh->b_data, 0, blocksize);
 		branch[n].p = (__le32 *) bh->b_data + offsets[n];
 		branch[n].key = cpu_to_le32(new_blocks[n]);
 		*branch[n].p = branch[n].key;
 		if (n == indirect_blks) {
 			current_block = new_blocks[n];
 			/*
 			 * End of chain, update the last new metablock of
 			 * the chain to point to the new allocated
 			 * data blocks numbers
 			 */
 			for (i = 1; i < num; i++)
 				*(branch[n].p + i) = cpu_to_le32(++current_block);
 		}
 		BUFFER_TRACE(bh, "marking uptodate");
 		set_buffer_uptodate(bh);
 		unlock_buffer(bh);
 		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
 		err = ext4_handle_dirty_metadata(handle, inode, bh);
 		if (err)
 			goto failed;
 	}
 	/* Tell the caller how many data blocks were actually obtained. */
 	*blks = num;
 	return err;
 failed:
 	/*
 	 * Allocation failed, free what we already allocated.
 	 *
 	 * n is in [1, indirect_blks] here.  The single-block loops must
 	 * stop before new_blocks[indirect_blks]: that slot is the first
 	 * data block and is released, with the rest of the data run, by
 	 * the final call.  Letting the loops run up to n == indirect_blks
 	 * would free that block on its own and then index one slot past
 	 * it, which is uninitialised (and outside new_blocks[] when
 	 * indirect_blks == 3).
 	 */
 	ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
 	for (i = 1; i <= n && i < indirect_blks; i++) {
 		/*
 		 * branch[i].bh is newly allocated, so there is no
 		 * need to revoke the block, which is why we don't
 		 * need to set EXT4_FREE_BLOCKS_METADATA.
 		 */
 		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
 				 EXT4_FREE_BLOCKS_FORGET);
 	}
 	/* Indirect blocks the setup loop never reached. */
 	for (; i < indirect_blks; i++)
 		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
 	/* Finally the whole run of data blocks. */
 	ext4_free_blocks(handle, inode, NULL, new_blocks[indirect_blks],
 			 num, 0);
 	return err;
 }
 /**
  * ext4_splice_branch - splice the allocated branch onto inode.
  * @handle: handle for this transaction
  * @inode: owner
  * @block: (logical) number of block we are adding
  * @chain: chain of indirect blocks (with a missing link - see
  *	ext4_alloc_branch)
  * @where: location of missing link
  * @num:   number of indirect blocks we are adding
  * @blks:  number of direct blocks we are adding
  *
  * This function fills the missing link and does all housekeeping needed in
  * inode (->i_blocks, etc.). In case of success we end up with the full
  * chain to new block and return 0.
  */
 static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 			      ext4_lblk_t block, Indirect *where, int num,
 			      int blks)
 {
 	int i;
 	int err = 0;
 	ext4_fsblk_t current_block;
 	/*
 	 * If we're splicing into a [td]indirect block (as opposed to the
 	 * inode) then we need to get write access to the [td]indirect block
 	 * before the splice.
 	 */
 	if (where->bh) {
 		BUFFER_TRACE(where->bh, "get_write_access");
 		err = ext4_journal_get_write_access(handle, where->bh);
 		if (err)
 			goto err_out;
 	}
 	/* That's it */
 	/* Store the saved block number into the slot: the branch is now linked. */
 	*where->p = where->key;
 	/*
 	 * Update the host buffer_head or inode to point to the other just
 	 * allocated direct blocks.  Only needed when no indirect block was
 	 * added (num == 0): the extra data blocks are consecutive and go
 	 * into the slots right after where->p.
 	 */
 	if (num == 0 && blks > 1) {
 		current_block = le32_to_cpu(where->key) + 1;
 		for (i = 1; i < blks; i++)
 			*(where->p + i) = cpu_to_le32(current_block++);
 	}
 	/* We are done with atomic stuff, now do the rest of housekeeping */
 	/* had we spliced it onto indirect block? */
 	if (where->bh) {
 		/*
 		 * If we spliced it onto an indirect block, we haven't
 		 * altered the inode.  Note however that if it is being spliced
 		 * onto an indirect block at the very end of the file (the
 		 * file is growing) then we *will* alter the inode to reflect
 		 * the new i_size.  But that is not done here - it is done in
 		 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
 		 */
 		jbd_debug(5, "splicing indirect only\n");
 		BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
 		err = ext4_handle_dirty_metadata(handle, inode, where->bh);
 		if (err)
 			goto err_out;
 	} else {
 		/*
 		 * OK, we spliced it into the inode itself on a direct block.
 		 * The return value of ext4_mark_inode_dirty() is not checked.
 		 */
 		ext4_mark_inode_dirty(handle, inode);
 		jbd_debug(5, "splicing direct\n");
 	}
 	return err;
 err_out:
 	/* Undo the allocation: first the @num new indirect blocks... */
 	for (i = 1; i <= num; i++) {
 		/*
 		 * branch[i].bh is newly allocated, so there is no
 		 * need to revoke the block, which is why we don't
 		 * need to set EXT4_FREE_BLOCKS_METADATA.
 		 */
 		ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
 				 EXT4_FREE_BLOCKS_FORGET);
 	}
 	/* ...then the @blks data blocks, which start at where[num].key. */
 	ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
 			 blks, 0);
 	return err;
 }
 /*
  * The ext4_ind_map_blocks() function handles non-extents inodes
  * (i.e., using the traditional indirect/double-indirect i_blocks
  * scheme) for ext4_map_blocks().
  *
  * Allocation strategy is simple: if we have to allocate something, we will
  * have to go the whole way to leaf. So let's do it before attaching anything
  * to tree, set linkage between the newborn blocks, write them if sync is
  * required, recheck the path, free and repeat if check fails, otherwise
  * set the last missing link (that will protect us from any truncate-generated
  * removals - all blocks on the path are immune now) and possibly force the
  * write on the parent block.
  * That has a nice additional property: no special recovery from the failed
  * allocations is needed - we simply release blocks and do not touch anything
  * reachable from inode.
  *
  * `handle' can be NULL if create == 0.
  *
  * return > 0, # of blocks mapped or allocated.
  * return = 0, if plain lookup failed.
  * return < 0, error case.
  *
  * The ext4_ind_map_blocks() function should be called with
  * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
  * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
  * blocks.
  */
 static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
 			       struct ext4_map_blocks *map,
 			       int flags)
 {
 	int err = -EIO;	/* returned as-is when the block number is out of range */
 	ext4_lblk_t offsets[4];
 	Indirect chain[4];
 	Indirect *partial;
 	ext4_fsblk_t goal;
 	int indirect_blks;
 	int blocks_to_boundary = 0;
 	int depth;
 	int count = 0;
 	ext4_fsblk_t first_block = 0;
 	trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
 	J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
 	J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
 	/* Translate the logical block into a path of per-level offsets. */
 	depth = ext4_block_to_path(inode, map->m_lblk, offsets,
 				   &blocks_to_boundary);
 	if (depth == 0)
 		goto out;
 	/* Walk the existing part of the tree; NULL means it is complete. */
 	partial = ext4_get_branch(inode, depth, offsets, chain, &err);
 	/* Simplest case - block found, no allocation needed */
 	if (!partial) {
 		first_block = le32_to_cpu(chain[depth - 1].key);
 		count++;
 		/*
 		 * Map more blocks: extend the mapping while the following
 		 * slots hold physically consecutive block numbers, up to
 		 * the requested length and the boundary.
 		 */
 		while (count < map->m_len && count <= blocks_to_boundary) {
 			ext4_fsblk_t blk;
 			blk = le32_to_cpu(*(chain[depth-1].p + count));
 			if (blk == first_block + count)
 				count++;
 			else
 				break;
 		}
 		goto got_it;
 	}
 	/*
 	 * Next simple case - plain lookup or failed read of indirect block.
 	 * For a plain lookup that stopped at a missing block err is 0 here
 	 * (cleared by ext4_get_branch()), so 0 is returned.
 	 */
 	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
 		goto cleanup;
 	/*
 	 * Okay, we need to do block allocation.
 	*/
 	goal = ext4_find_goal(inode, map->m_lblk, partial);
 	/* the number of blocks need to allocate for [d,t]indirect blocks */
 	indirect_blks = (chain + depth) - partial - 1;
 	/*
 	 * Next look up the indirect map to count the total number of
 	 * direct blocks to allocate for this branch.
 	 */
 	count = ext4_blks_to_allocate(partial, indirect_blks,
 				      map->m_len, blocks_to_boundary);
 	/*
 	 * Block out ext4_truncate while we alter the tree
 	 */
 	err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
 				&count, goal,
 				offsets + (partial - chain), partial);
 	/*
 	 * The ext4_splice_branch call will free and forget any buffers
 	 * on the new chain if there is a failure, but that risks using
 	 * up transaction credits, especially for bitmaps where the
 	 * credits cannot be returned.  Can we handle this somehow?  We
 	 * may need to return -EAGAIN upwards in the worst case.  --sct
 	 */
 	if (!err)
 		err = ext4_splice_branch(handle, inode, map->m_lblk,
 					 partial, indirect_blks, count);
 	if (err)
 		goto cleanup;
 	map->m_flags |= EXT4_MAP_NEW;
 	ext4_update_inode_fsync_trans(handle, inode, 1);
 got_it:
 	/* Report the mapping: first physical block and number of blocks. */
 	map->m_flags |= EXT4_MAP_MAPPED;
 	map->m_pblk = le32_to_cpu(chain[depth-1].key);
 	map->m_len = count;
 	if (count > blocks_to_boundary)
 		map->m_flags |= EXT4_MAP_BOUNDARY;
 	err = count;
 	/* Clean up and exit */
 	partial = chain + depth - 1;	/* the whole chain */
 cleanup:
 	/* Drop the buffer_heads held by the chain; chain[0] has none. */
 	while (partial > chain) {
 		BUFFER_TRACE(partial->bh, "call brelse");
 		brelse(partial->bh);
 		partial--;
 	}
 out:
 	trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
 				map->m_pblk, map->m_len, err);
 	return err;
 }
 #ifdef CONFIG_QUOTA
 /*
  * Hand out the address of the inode's i_reserved_quota counter.
  */
 qsize_t *ext4_get_reserved_space(struct inode *inode)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	return &ei->i_reserved_quota;
 }
 #endif
 /*
  * Calculate the number of metadata blocks that need to be reserved
  * to allocate a new block at @lblock for a non-extent (block-mapped) file
  */
 static int ext4_indirect_calc_metadata_amount(struct inode *inode,
 					      sector_t lblock)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
 	sector_t group;
 	int blk_bits;
 	/* Direct blocks need no metadata at all. */
 	if (lblock < EXT4_NDIR_BLOCKS)
 		return 0;
 	lblock -= EXT4_NDIR_BLOCKS;
 	group = lblock & dind_mask;
 	/*
 	 * Same masked position as the previous call: nothing new is
 	 * reserved, just note one more block in the current run.
 	 */
 	if (ei->i_da_metadata_calc_len &&
 	    group == ei->i_da_metadata_calc_last_lblock) {
 		ei->i_da_metadata_calc_len++;
 		return 0;
 	}
 	/* Start a new run and derive the reservation from the block number. */
 	ei->i_da_metadata_calc_last_lblock = group;
 	ei->i_da_metadata_calc_len = 1;
 	blk_bits = order_base_2(lblock);
 	return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
 }
 /*
  * Calculate the number of metadata blocks need to reserve
  * to allocate a block located at @lblock
  */
 static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
 {
 	int extents = ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS);
 	/* Dispatch on the inode's mapping scheme. */
 	return extents ? ext4_ext_calc_metadata_amount(inode, lblock)
 		       : ext4_indirect_calc_metadata_amount(inode, lblock);
 }
 /*
  * Called with i_data_sem down, which is important since we can call
  * ext4_discard_preallocations() from here.
  */
 void ext4_da_update_reserve_space(struct inode *inode,
 					int used, int quota_claim)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	/* The per-inode reservation counters are updated under this lock. */
 	spin_lock(&ei->i_block_reservation_lock);
 	trace_ext4_da_update_reserve_space(inode, used);
 	/*
 	 * Never consume more than was reserved: complain and clamp @used
 	 * so the subtraction below cannot go negative.
 	 */
 	if (unlikely(used > ei->i_reserved_data_blocks)) {
 		ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
 			 "with only %d reserved data blocks\n",
 			 __func__, inode->i_ino, used,
 			 ei->i_reserved_data_blocks);
 		WARN_ON(1);
 		used = ei->i_reserved_data_blocks;
 	}
 	/* Update per-inode reservations */
 	ei->i_reserved_data_blocks -= used;
 	ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
 	/* The same amounts come off the filesystem-wide dirty block count. */
 	percpu_counter_sub(&sbi->s_dirtyblocks_counter,
 			   used + ei->i_allocated_meta_blocks);
 	ei->i_allocated_meta_blocks = 0;
 	if (ei->i_reserved_data_blocks == 0) {
 		/*
 		 * We can release all of the reserved metadata blocks
 		 * only when we have written all of the delayed
 		 * allocation blocks.
 		 */
 		percpu_counter_sub(&sbi->s_dirtyblocks_counter,
 				   ei->i_reserved_meta_blocks);
 		ei->i_reserved_meta_blocks = 0;
 		ei->i_da_metadata_calc_len = 0;
 	}
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 	/* Update quota subsystem for data blocks */
 	if (quota_claim)
 		dquot_claim_block(inode, used);
 	else {
 		/*
 		 * We did fallocate with an offset that is already delayed
 		 * allocated. So on delayed allocated writeback we should
 		 * not re-claim the quota for fallocated blocks.
 		 */
 		dquot_release_reservation_block(inode, used);
 	}
 	/*
 	 * If we have done all the pending block allocations and if
 	 * there aren't any writers on the inode, we can discard the
 	 * inode's preallocations.
 	 * Note that i_reserved_data_blocks is read here after the
 	 * reservation lock has been dropped.
 	 */
 	if ((ei->i_reserved_data_blocks == 0) &&
 	    (atomic_read(&inode->i_writecount) == 0))
 		ext4_discard_preallocations(inode);
 }
 /*
  * Make sure the physical range described by @map (m_pblk, m_len) is
  * acceptable to ext4_data_block_valid().  If it is not, log an inode
  * error attributed to @func/@line and return -EIO; otherwise return 0.
  */
 static int __check_block_validity(struct inode *inode, const char *func,
 				unsigned int line,
 				struct ext4_map_blocks *map)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	if (ext4_data_block_valid(sbi, map->m_pblk, map->m_len))
 		return 0;
 	ext4_error_inode(inode, func, line, map->m_pblk,
 			 "lblock %lu mapped to illegal pblock "
 			 "(length %d)", (unsigned long) map->m_lblk,
 			 map->m_len);
 	return -EIO;
 }
 /*
  * Wrapper around __check_block_validity() that supplies the calling
  * function name and line number for the error report.
  */
 #define check_block_validity(inode, map)	\
 	__check_block_validity((inode), __func__, __LINE__, (map))
 /*
  * Return the number of contiguous dirty pages in a given inode
  * starting at page frame idx.
  */
 static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
 				    unsigned int max_pages)
 {
 	struct address_space *mapping = inode->i_mapping;
 	pgoff_t	index;
 	struct pagevec pvec;
 	pgoff_t num = 0;	/* pages counted so far */
 	int i, nr_pages, done = 0;
 	if (max_pages == 0)
 		return 0;
 	pagevec_init(&pvec, 0);
 	while (!done) {
 		/* Look up the next batch of dirty-tagged pages from idx on. */
 		index = idx;
 		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
 					      PAGECACHE_TAG_DIRTY,
 					      (pgoff_t)PAGEVEC_SIZE);
 		if (nr_pages == 0)
 			break;
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 			struct buffer_head *bh, *head;
 			lock_page(page);
 			/*
 			 * The run ends at the first page that no longer
 			 * belongs to this mapping, is not dirty, is under
 			 * writeback, or is not the next consecutive index.
 			 */
 			if (unlikely(page->mapping != mapping) ||
 			    !PageDirty(page) ||
 			    PageWriteback(page) ||
 			    page->index != idx) {
 				done = 1;
 				unlock_page(page);
 				break;
 			}
 			/*
 			 * A page with a buffer that is neither delayed nor
 			 * unwritten also ends the run and is not counted.
 			 */
 			if (page_has_buffers(page)) {
 				bh = head = page_buffers(page);
 				do {
 					if (!buffer_delay(bh) &&
 					    !buffer_unwritten(bh))
 						done = 1;
 					bh = bh->b_this_page;
 				} while (!done && (bh != head));
 			}
 			unlock_page(page);
 			if (done)
 				break;
 			idx++;
 			num++;
 			/* Stop once the caller's limit has been reached. */
 			if (num >= max_pages) {
 				done = 1;
 				break;
 			}
 		}
 		/* Every exit from the inner loop releases the batch. */
 		pagevec_release(&pvec);
 	}
 	return num;
 }
 /*
  * The ext4_map_blocks() function tries to look up the requested blocks,
  * and returns if the blocks are already mapped.
  *
  * Otherwise it takes the write lock of the i_data_sem and allocate blocks
  * and store the allocated blocks in the result buffer head and mark it
  * mapped.
  *
  * If file type is extents based, it will call ext4_ext_map_blocks(),
  * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
  * based files
  *
  * On success, it returns the number of blocks being mapped or allocate.
  * if create==0 and the blocks are pre-allocated and uninitialized block,
  * the result buffer head is unmapped. If the create ==1, it will make sure
  * the buffer head is mapped.
  *
  * It returns 0 if plain look up failed (blocks have not been allocated), in
  * that case, the buffer head is unmapped
  *
  * It returns the error in case of allocation failure.
  */
 int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		    struct ext4_map_blocks *map, int flags)
 {
 	int retval;
 	map->m_flags = 0;
 	ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
 		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
 		  (unsigned long) map->m_lblk);
 	/*
 	 * Try to see if we can get the block without requesting a new
 	 * file system block.  This first pass is a pure lookup (flags
 	 * passed as 0) done under the read side of i_data_sem.
 	 */
 	down_read((&EXT4_I(inode)->i_data_sem));
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		retval = ext4_ext_map_blocks(handle, inode, map, 0);
 	} else {
 		retval = ext4_ind_map_blocks(handle, inode, map, 0);
 	}
 	up_read((&EXT4_I(inode)->i_data_sem));
 	/* A successful mapping must point at a valid physical range. */
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
 		int ret = check_block_validity(inode, map);
 		if (ret != 0)
 			return ret;
 	}
 	/* If it is only a block(s) look up */
 	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
 		return retval;
 	/*
 	 * Returns if the blocks have already allocated
 	 *
 	 * Note that if blocks have been preallocated, the lookup
 	 * above (create = 0) returns with the buffer head unmapped,
 	 * so that case does not return here.
 	 */
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
 		return retval;
 	/*
 	 * When we call get_blocks without the create flag, the
 	 * BH_Unwritten flag could have gotten set if the blocks
 	 * requested were part of a uninitialized extent.  We need to
 	 * clear this flag now that we are committed to convert all or
 	 * part of the uninitialized extent to be an initialized
 	 * extent.  This is because we need to avoid the combination
 	 * of BH_Unwritten and BH_Mapped flags being simultaneously
 	 * set on the buffer_head.
 	 */
 	map->m_flags &= ~EXT4_MAP_UNWRITTEN;
 	/*
 	 * New blocks allocate and/or writing to uninitialized extent
 	 * will possibly result in updating i_data, so we take
 	 * the write lock of i_data_sem, and call get_blocks()
 	 * with create == 1 flag.
 	 */
 	down_write((&EXT4_I(inode)->i_data_sem));
 	/*
 	 * if the caller is from delayed allocation writeout path
 	 * we have already reserved fs blocks for allocation
 	 * let the underlying get_block() function know to
 	 * avoid double accounting
 	 */
 	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
 		ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
 	/*
 	 * We need to check for EXT4 here because migrate
 	 * could have changed the inode type in between
 	 */
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		retval = ext4_ext_map_blocks(handle, inode, map, flags);
 	} else {
 		retval = ext4_ind_map_blocks(handle, inode, map, flags);
 		if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
 			/*
 			 * We allocated new blocks which will result in
 			 * i_data's format changing.  Force the migrate
 			 * to fail by clearing migrate flags
 			 */
 			ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
 		}
 		/*
 		 * Update reserved blocks/metadata blocks after successful
 		 * block allocation which had been deferred till now. We don't
 		 * support fallocate for non extent files. So we can update
 		 * reserve space here.
 		 */
 		if ((retval > 0) &&
 			(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
 			ext4_da_update_reserve_space(inode, retval, 1);
 	}
 	/* Undo the state flag set above before dropping the semaphore. */
 	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
 		ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
 	up_write((&EXT4_I(inode)->i_data_sem));
 	/* Same sanity check as after the lookup pass. */
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
 		int ret = check_block_validity(inode, map);
 		if (ret != 0)
 			return ret;
 	}
 	return retval;
 }
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
 /*
  * Common get_block worker: map the range described by @bh (starting at
  * logical block @iblock) through ext4_map_blocks() using @flags.
  *
  * If @flags is non-zero and no journal handle is currently running, a
  * handle is started here (with the request capped at DIO_MAX_BLOCKS) and
  * stopped again before returning.
  *
  * Returns 0 when blocks were mapped (and @bh was filled in) or when
  * ext4_map_blocks() returned 0; otherwise a negative error.
  */
 static int _ext4_get_block(struct inode *inode, sector_t iblock,
 			   struct buffer_head *bh, int flags)
 {
 	struct ext4_map_blocks map;
 	handle_t *handle = ext4_journal_current_handle();
 	int own_handle = 0;
 	int ret;

 	map.m_lblk = iblock;
 	map.m_len = bh->b_size >> inode->i_blkbits;

 	if (flags && !handle) {
 		int dio_credits;

 		/* Direct IO write... */
 		if (map.m_len > DIO_MAX_BLOCKS)
 			map.m_len = DIO_MAX_BLOCKS;
 		dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
 		handle = ext4_journal_start(inode, dio_credits);
 		if (IS_ERR(handle))
 			return PTR_ERR(handle);
 		own_handle = 1;
 	}

 	ret = ext4_map_blocks(handle, inode, &map, flags);
 	if (ret > 0) {
 		/* Publish the mapping and its state bits in the buffer_head. */
 		map_bh(bh, inode->i_sb, map.m_pblk);
 		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
 		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
 		ret = 0;
 	}

 	/* Only stop the handle if this function started it. */
 	if (own_handle)
 		ext4_journal_stop(handle);
 	return ret;
 }
 /*
  * get_block callback: translate the boolean @create into the matching
  * ext4_map_blocks() flag and defer to _ext4_get_block().
  */
 int ext4_get_block(struct inode *inode, sector_t iblock,
 		   struct buffer_head *bh, int create)
 {
 	int flags = 0;

 	if (create)
 		flags = EXT4_GET_BLOCKS_CREATE;
 	return _ext4_get_block(inode, iblock, bh, flags);
 }
 /*
  * Map one logical block of @inode and return a buffer_head for it.
  *
  * `handle' can be NULL if create is zero
  *
  * Returns the buffer_head on success, with *errp set to 0.  Returns NULL
  * with a negative error in *errp on failure, or NULL with *errp == 0 when
  * ext4_map_blocks() reports that nothing is mapped at @block.
  */
 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 				ext4_lblk_t block, int create, int *errp)
 {
 	struct ext4_map_blocks map;
 	struct buffer_head *bh;
 	int fatal = 0, err;
 	J_ASSERT(handle != NULL || create == 0);
 	map.m_lblk = block;
 	map.m_len = 1;
 	err = ext4_map_blocks(handle, inode, &map,
 			      create ? EXT4_GET_BLOCKS_CREATE : 0);
 	/*
 	 * Always send a defined value back through *errp, including the
 	 * err == 0 case, so callers never read an uninitialised error.
 	 */
 	*errp = 0;
 	if (err < 0)
 		*errp = err;
 	if (err <= 0)
 		return NULL;
 	bh = sb_getblk(inode->i_sb, map.m_pblk);
 	if (!bh) {
 		*errp = -EIO;
 		return NULL;
 	}
 	if (map.m_flags & EXT4_MAP_NEW) {
 		J_ASSERT(create != 0);
 		J_ASSERT(handle != NULL);
 		/*
 		 * Now that we do not always journal data, we should
 		 * keep in mind whether this should always journal the
 		 * new buffer as metadata.  For now, regular file
 		 * writes use ext4_get_block instead, so it's not a
 		 * problem.
 		 */
 		lock_buffer(bh);
 		BUFFER_TRACE(bh, "call get_create_access");
 		fatal = ext4_journal_get_create_access(handle, bh);
 		/* A freshly allocated block starts out zero-filled. */
 		if (!fatal && !buffer_uptodate(bh)) {
 			memset(bh->b_data, 0, inode->i_sb->s_blocksize);
 			set_buffer_uptodate(bh);
 		}
 		unlock_buffer(bh);
 		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
 		err = ext4_handle_dirty_metadata(handle, inode, bh);
 		/* Report the first failure of the two journal calls. */
 		if (!fatal)
 			fatal = err;
 	} else {
 		BUFFER_TRACE(bh, "not a new buffer");
 	}
 	if (fatal) {
 		*errp = fatal;
 		brelse(bh);
 		bh = NULL;
 	}
 	return bh;
 }
 /*
  * Like ext4_getblk(), but additionally make sure the returned buffer is
  * up to date, reading it in if necessary.
  *
  * Returns NULL if ext4_getblk() returned NULL (with *err as set by it),
  * or NULL with *err = -EIO if the buffer is still not up to date after
  * the read completed.
  */
 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
 			       ext4_lblk_t block, int create, int *err)
 {
 	struct buffer_head *bh = ext4_getblk(handle, inode, block, create, err);

 	/* Nothing to read: lookup failed, or the contents are already valid. */
 	if (!bh || buffer_uptodate(bh))
 		return bh;

 	ll_rw_block(READ_META, 1, &bh);
 	wait_on_buffer(bh);
 	if (!buffer_uptodate(bh)) {
 		put_bh(bh);
 		*err = -EIO;
 		return NULL;
 	}
 	return bh;
 }
 /*
  * Walk the circular list of buffers starting at @head and call @fn on
  * every buffer that overlaps the byte range [@from, @to) of the page.
  *
  * Buffers outside the range are not passed to @fn; if @partial is
  * non-NULL it is set to 1 when such a buffer is not up to date.
  *
  * The walk stops at the first non-zero return from @fn, which is then
  * returned; otherwise 0 is returned.
  */
 static int walk_page_buffers(handle_t *handle,
 			     struct buffer_head *head,
 			     unsigned from,
 			     unsigned to,
 			     int *partial,
 			     int (*fn)(handle_t *handle,
 				       struct buffer_head *bh))
 {
 	struct buffer_head *bh = head;
 	unsigned blocksize = head->b_size;
 	unsigned start = 0;
 	int ret = 0;

 	do {
 		/* Fetch the successor first, before @fn gets to see bh. */
 		struct buffer_head *next = bh->b_this_page;
 		unsigned end = start + blocksize;

 		if (end > from && start < to)
 			ret = (*fn)(handle, bh);
 		else if (partial && !buffer_uptodate(bh))
 			*partial = 1;

 		start = end;
 		bh = next;
 	} while (ret == 0 && (bh != head || !start));

 	return ret;
 }
 /*
  * To preserve ordering, it is essential that the hole instantiation and
  * the data write be encapsulated in a single transaction.  We cannot
  * close off a transaction and start a new one between the ext4_get_block()
  * and the commit_write().  So doing the jbd2_journal_start at the start of
  * prepare_write() is the right place.
  *
  * Also, this function can nest inside ext4_writepage() ->
  * block_write_full_page(). In that case, we *know* that ext4_writepage()
  * has generated enough buffer credits to do the whole page.  So we won't
  * block on the journal in that case, which is good, because the caller may
  * be PF_MEMALLOC.
  *
  * By accident, ext4 can be reentered when a transaction is open via
  * quota file writes.  If we were to commit the transaction while thus
  * reentered, there can be a deadlock - we would be holding a quota
  * lock, and the commit would never complete if another thread had a
  * transaction open and was blocking on the quota lock - a ranking
  * violation.
  *
  * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
  * will _not_ run commit under these circumstances because handle->h_ref
  * is elevated.  We'll still have enough credits for the tiny quotafile
  * write.
  */

 /*
  * walk_page_buffers() callback: get journal write access to @bh.
  * Unmapped or freed buffers are skipped (returns 0).  A buffer that was
  * dirty on entry is cleaned first and re-dirtied through the journal
  * once write access has been granted.
  */
 static int do_journal_get_write_access(handle_t *handle,
 				       struct buffer_head *bh)
 {
 	int was_dirty = buffer_dirty(bh);
 	int err;

 	if (!buffer_mapped(bh) || buffer_freed(bh))
 		return 0;

 	/*
 	 * __block_write_begin() could have dirtied some buffers. Clean
 	 * the dirty bit as jbd2_journal_get_write_access() could complain
 	 * otherwise about fs integrity issues. Setting of the dirty bit
 	 * by __block_write_begin() isn't a real problem here as we clear
 	 * the bit before releasing a page lock and thus writeback cannot
 	 * ever write the buffer.
 	 */
 	if (was_dirty)
 		clear_buffer_dirty(bh);

 	err = ext4_journal_get_write_access(handle, bh);
 	if (err || !was_dirty)
 		return err;

 	return ext4_handle_dirty_metadata(handle, NULL, bh);
 }
 /*
  * Clean up after a write that did not use all the blocks it instantiated:
  * drop the pagecache from i_size onwards, so that the corresponding
  * buffers get properly unmapped, then truncate the blocks themselves.
  */
 static void ext4_truncate_failed_write(struct inode *inode)
 {
 	struct address_space *mapping = inode->i_mapping;

 	truncate_inode_pages(mapping, inode->i_size);
 	ext4_truncate(inode);
 }
 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
 		   struct buffer_head *bh_result, int create);
 /*
  * write_begin address_space operation.
  *
  * Starts a journal handle, grabs the page cache page covering @pos and
  * prepares its buffers for a write of @len bytes via __block_write_begin().
  * When the inode journals its data, journal write access is additionally
  * obtained for every buffer in the written range.
  *
  * On success returns 0 with the page stored in *pagep; the handle started
  * here is not stopped on that path.  On failure the page is unlocked and
  * released, the handle is stopped, any blocks instantiated beyond i_size
  * are truncated again, and a negative error is returned.  -ENOSPC is
  * retried while ext4_should_retry_alloc() allows it.
  */
 static int ext4_write_begin(struct file *file, struct address_space *mapping,
 			    loff_t pos, unsigned len, unsigned flags,
 			    struct page **pagep, void **fsdata)
 {
 	struct inode *inode = mapping->host;
 	int ret, needed_blocks;
 	handle_t *handle;
 	int retries = 0;
 	struct page *page;
 	pgoff_t index;
 	unsigned from, to;
 	trace_ext4_write_begin(inode, pos, len, flags);
 	/*
 	 * Reserve one block more for addition to orphan list in case
 	 * we allocate blocks but write fails for some reason
 	 */
 	needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
 	/* Page index and the byte range [from, to) inside that page. */
 	index = pos >> PAGE_CACHE_SHIFT;
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
 retry:
 	handle = ext4_journal_start(inode, needed_blocks);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		goto out;
 	}
 	/* We cannot recurse into the filesystem as the transaction is already
 	 * started */
 	flags |= AOP_FLAG_NOFS;
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
 		ext4_journal_stop(handle);
 		ret = -ENOMEM;
 		goto out;
 	}
 	*pagep = page;
 	/* Pick the get_block callback according to the dioread_nolock mode. */
 	if (ext4_should_dioread_nolock(inode))
 		ret = __block_write_begin(page, pos, len, ext4_get_block_write);
 	else
 		ret = __block_write_begin(page, pos, len, ext4_get_block);
 	if (!ret && ext4_should_journal_data(inode)) {
 		ret = walk_page_buffers(handle, page_buffers(page),
 				from, to, NULL, do_journal_get_write_access);
 	}
 	if (ret) {
 		unlock_page(page);
 		page_cache_release(page);
 		/*
 		 * __block_write_begin may have instantiated a few blocks
 		 * outside i_size.  Trim these off again. Don't need
 		 * i_size_read because we hold i_mutex.
 		 *
 		 * Add inode to orphan list in case we crash before
 		 * truncate finishes
 		 */
 		if (pos + len > inode->i_size && ext4_can_truncate(inode))
 			ext4_orphan_add(handle, inode);
 		/* The truncate below runs after this handle has been stopped. */
 		ext4_journal_stop(handle);
 		if (pos + len > inode->i_size) {
 			ext4_truncate_failed_write(inode);
 			/*
 			 * If truncate failed early the inode might
 			 * still be on the orphan list; we need to
 			 * make sure the inode is removed from the
 			 * orphan list in that case.
 			 */
 			if (inode->i_nlink)
 				ext4_orphan_del(NULL, inode);
 		}
 	}
 	/* Start over with a new handle if the allocator says a retry may help. */
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
 out:
 	return ret;
 }
 /*
  * For write_end() in data=journal mode: walk_page_buffers() callback that
  * marks a mapped, non-freed buffer up to date and dirties it through the
  * journal.  Other buffers are skipped with a return value of 0.
  */
 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 {
 	if (buffer_mapped(bh) && !buffer_freed(bh)) {
 		set_buffer_uptodate(bh);
 		return ext4_handle_dirty_metadata(handle, NULL, bh);
 	}
 	return 0;
 }
 /*
  * Shared tail of the ordered and writeback write_end paths: finish the
  * buffer-level write, grow i_size and i_disksize if the write extended
  * past them, unlock and release the page, and finally mark the inode
  * dirty if either size changed.
  *
  * Returns the number of bytes reported by block_write_end().
  */
 static int ext4_generic_write_end(struct file *file,
 				  struct address_space *mapping,
 				  loff_t pos, unsigned len, unsigned copied,
 				  struct page *page, void *fsdata)
 {
 	struct inode *inode = mapping->host;
 	handle_t *handle = ext4_journal_current_handle();
 	int size_updated = 0;
 	loff_t new_end;

 	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
 	new_end = pos + copied;

 	/*
 	 * No need to use i_size_read() here, the i_size
 	 * cannot change under us because we hold i_mutex.
 	 *
 	 * But it's important to update i_size while still holding page lock:
 	 * page writeout could otherwise come in and zero beyond i_size.
 	 */
 	if (new_end > inode->i_size) {
 		i_size_write(inode, new_end);
 		size_updated = 1;
 	}

 	if (new_end > EXT4_I(inode)->i_disksize) {
 		/*
 		 * The inode must be marked dirty even if the new end is
 		 * less than inode->i_size but greater than i_disksize
 		 * (hint: delalloc).
 		 */
 		ext4_update_i_disksize(inode, new_end);
 		size_updated = 1;
 	}

 	unlock_page(page);
 	page_cache_release(page);

 	/*
 	 * Don't mark the inode dirty under page lock. First, it unnecessarily
 	 * makes the holding time of page lock longer. Second, it forces lock
 	 * ordering of page lock and transaction start for journaling
 	 * filesystems.
 	 */
 	if (size_updated)
 		ext4_mark_inode_dirty(handle, inode);

 	return copied;
 }
 /*
  * write_end for data=ordered mode.
  *
  * We need to pick up the new inode size which generic_commit_write gave us
  * `file' can be NULL - eg, when called from page_symlink().
  *
  * ext4 never places buffers on inode->i_mapping->private_list.  metadata
  * buffers are managed internally.
  *
  * Returns the number of bytes copied, or a negative error.  The page is
  * unlocked and released on every path, and the current journal handle is
  * stopped.
  */
 static int ext4_ordered_write_end(struct file *file,
 				  struct address_space *mapping,
 				  loff_t pos, unsigned len, unsigned copied,
 				  struct page *page, void *fsdata)
 {
 	handle_t *handle = ext4_journal_current_handle();
 	struct inode *inode = mapping->host;
 	int ret = 0, ret2;
 	trace_ext4_ordered_write_end(inode, pos, len, copied);
 	ret = ext4_jbd2_file_inode(handle, inode);
 	if (ret == 0) {
 		ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
 		copied = ret2;
 		if (pos + len > inode->i_size && ext4_can_truncate(inode))
 			/* if we have allocated more blocks and copied
 			 * less. We will have blocks allocated outside
 			 * inode->i_size. So truncate them
 			 */
 			ext4_orphan_add(handle, inode);
 		if (ret2 < 0)
 			ret = ret2;
 	} else {
 		/*
 		 * ext4_generic_write_end() was skipped, so nothing has
 		 * unlocked or released the page yet.  Do it here, otherwise
 		 * the page stays locked forever and later users deadlock.
 		 */
 		unlock_page(page);
 		page_cache_release(page);
 	}
 	ret2 = ext4_journal_stop(handle);
 	/* Keep the first error seen. */
 	if (!ret)
 		ret = ret2;
 	if (pos + len > inode->i_size) {
 		ext4_truncate_failed_write(inode);
 		/*
 		 * If truncate failed early the inode might still be
 		 * on the orphan list; we need to make sure the inode
 		 * is removed from the orphan list in that case.
 		 */
 		if (inode->i_nlink)
 			ext4_orphan_del(NULL, inode);
 	}
 	return ret ? ret : copied;
 }
 /*
  * write_end for data=writeback mode: complete the write through
  * ext4_generic_write_end(), stop the current journal handle and trim any
  * blocks that ended up allocated beyond i_size.
  *
  * Returns the number of bytes copied, or a negative error.
  */
 static int ext4_writeback_write_end(struct file *file,
 				    struct address_space *mapping,
 				    loff_t pos, unsigned len, unsigned copied,
 				    struct page *page, void *fsdata)
 {
 	struct inode *inode = mapping->host;
 	handle_t *handle = ext4_journal_current_handle();
 	int err = 0;
 	int rc;

 	trace_ext4_writeback_write_end(inode, pos, len, copied);

 	rc = ext4_generic_write_end(file, mapping, pos, len, copied,
 				    page, fsdata);
 	copied = rc;

 	/*
 	 * If more blocks were allocated than data copied, blocks now sit
 	 * outside inode->i_size.  Put the inode on the orphan list so the
 	 * truncate below is crash safe.
 	 */
 	if (pos + len > inode->i_size && ext4_can_truncate(inode))
 		ext4_orphan_add(handle, inode);

 	if (rc < 0)
 		err = rc;

 	rc = ext4_journal_stop(handle);
 	if (!err)
 		err = rc;

 	if (pos + len > inode->i_size) {
 		ext4_truncate_failed_write(inode);
 		/*
 		 * If truncate failed early the inode might still be
 		 * on the orphan list; we need to make sure the inode
 		 * is removed from the orphan list in that case.
 		 */
 		if (inode->i_nlink)
 			ext4_orphan_del(NULL, inode);
 	}

 	if (err)
 		return err;
 	return copied;
 }
 /*
  * write_end for data=journal mode: dirty the written buffers through the
  * journal, update i_size/i_disksize, release the page, stop the current
  * handle and trim blocks left allocated beyond i_size.
  *
  * Returns the number of bytes copied, or a negative error.
  */
 static int ext4_journalled_write_end(struct file *file,
 				     struct address_space *mapping,
 				     loff_t pos, unsigned len, unsigned copied,
 				     struct page *page, void *fsdata)
 {
 	struct inode *inode = mapping->host;
 	handle_t *handle = ext4_journal_current_handle();
 	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
 	unsigned to = from + len;
 	loff_t new_i_size;
 	int partial = 0;
 	int err, rc;

 	trace_ext4_journalled_write_end(inode, pos, len, copied);

 	if (copied < len) {
 		/* Short copy into a page that is not up to date: count nothing. */
 		if (!PageUptodate(page))
 			copied = 0;
 		page_zero_new_buffers(page, from + copied, to);
 	}

 	err = walk_page_buffers(handle, page_buffers(page), from,
 				to, &partial, write_end_fn);
 	/* No buffer outside the range was found stale by the walk. */
 	if (!partial)
 		SetPageUptodate(page);

 	new_i_size = pos + copied;
 	if (new_i_size > inode->i_size)
 		i_size_write(inode, new_i_size);
 	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 	if (new_i_size > EXT4_I(inode)->i_disksize) {
 		ext4_update_i_disksize(inode, new_i_size);
 		rc = ext4_mark_inode_dirty(handle, inode);
 		if (!err)
 			err = rc;
 	}

 	unlock_page(page);
 	page_cache_release(page);

 	/*
 	 * If more blocks were allocated than data copied, blocks now sit
 	 * outside inode->i_size.  Put the inode on the orphan list so the
 	 * truncate below is crash safe.
 	 */
 	if (pos + len > inode->i_size && ext4_can_truncate(inode))
 		ext4_orphan_add(handle, inode);

 	rc = ext4_journal_stop(handle);
 	if (!err)
 		err = rc;

 	if (pos + len > inode->i_size) {
 		ext4_truncate_failed_write(inode);
 		/*
 		 * If truncate failed early the inode might still be
 		 * on the orphan list; we need to make sure the inode
 		 * is removed from the orphan list in that case.
 		 */
 		if (inode->i_nlink)
 			ext4_orphan_del(NULL, inode);
 	}

 	if (err)
 		return err;
 	return copied;
 }
 /*
  * Reserve a single block located at lblock
  *
  * Reserves quota for one data block and claims that block plus the
  * estimated metadata from the superblock's free-block accounting.
  * Returns 0 on success, the dquot_reserve_block() error, or -ENOSPC once
  * ext4_should_retry_alloc() stops allowing retries.
  */
 static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	unsigned long md_needed;
 	int retries = 0;
 	int ret;

 	for (;;) {
 		/*
 		 * recalculate the amount of metadata blocks to reserve
 		 * in order to allocate nrblocks
 		 * worse case is one extent per block
 		 */
 		spin_lock(&ei->i_block_reservation_lock);
 		md_needed = ext4_calc_metadata_amount(inode, lblock);
 		trace_ext4_da_reserve_space(inode, md_needed);
 		spin_unlock(&ei->i_block_reservation_lock);

 		/*
 		 * We will charge metadata quota at writeout time; this saves
 		 * us from metadata over-estimation, though we may go over by
 		 * a small amount in the end.  Here we just reserve for data.
 		 */
 		ret = dquot_reserve_block(inode, 1);
 		if (ret)
 			return ret;

 		/*
 		 * We do still charge estimated metadata to the sb though;
 		 * we cannot afford to run out of free blocks.
 		 */
 		if (!ext4_claim_free_blocks(sbi, md_needed + 1))
 			break;

 		/* Claim failed: undo the quota reservation before retrying. */
 		dquot_release_reservation_block(inode, 1);
 		if (!ext4_should_retry_alloc(inode->i_sb, &retries))
 			return -ENOSPC;
 		yield();
 	}

 	spin_lock(&ei->i_block_reservation_lock);
 	ei->i_reserved_data_blocks++;
 	ei->i_reserved_meta_blocks += md_needed;
 	spin_unlock(&ei->i_block_reservation_lock);

 	return 0;       /* success */
 }
 /*
  * Give back @to_free reserved data blocks of @inode: adjust the per-inode
  * reservation counters and the superblock dirty-blocks counter, then drop
  * the matching quota reservation.  When the last reserved data block goes
  * away, the reserved metadata blocks are released as well.
  */
 static void ext4_da_release_space(struct inode *inode, int to_free)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

 	if (!to_free)
 		return;		/* Nothing to release, exit */

 	spin_lock(&ei->i_block_reservation_lock);
 	trace_ext4_da_release_space(inode, to_free);

 	if (unlikely(to_free > ei->i_reserved_data_blocks)) {
 		/*
 		 * if there aren't enough reserved blocks, then the
 		 * counter is messed up somewhere.  Warn and release
 		 * only what is actually recorded as reserved.
 		 */
 		ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
 			 "ino %lu, to_free %d with only %d reserved "
 			 "data blocks\n", inode->i_ino, to_free,
 			 ei->i_reserved_data_blocks);
 		WARN_ON(1);
 		to_free = ei->i_reserved_data_blocks;
 	}

 	ei->i_reserved_data_blocks -= to_free;
 	if (!ei->i_reserved_data_blocks) {
 		/*
 		 * We can release all of the reserved metadata blocks
 		 * only when we have written all of the delayed
 		 * allocation blocks.
 		 */
 		percpu_counter_sub(&sbi->s_dirtyblocks_counter,
 				   ei->i_reserved_meta_blocks);
 		ei->i_reserved_meta_blocks = 0;
 		ei->i_da_metadata_calc_len = 0;
 	}

 	/* update fs dirty data blocks counter */
 	percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
 	spin_unlock(&ei->i_block_reservation_lock);

 	dquot_release_reservation_block(inode, to_free);
 }
 /*
  * Clear the delay flag on every delayed buffer of @page that starts at or
  * after byte @offset, and release one reserved block for each of them.
  */
 static void ext4_da_page_release_reservation(struct page *page,
 					     unsigned long offset)
 {
 	struct buffer_head *head = page_buffers(page);
 	struct buffer_head *bh = head;
 	unsigned int off = 0;
 	int nr_released = 0;

 	do {
 		if (buffer_delay(bh) && offset <= off) {
 			clear_buffer_delay(bh);
 			nr_released++;
 		}
 		off += bh->b_size;
 		bh = bh->b_this_page;
 	} while (bh != head);

 	ext4_da_release_space(page->mapping->host, nr_released);
 }
 /*
  * Delayed allocation stuff
  */
 /*
  * mpage_da_submit_io - walks through extent of pages and try to write
  * them with writepage() call back
  *
  * @mpd->inode: inode
  * @mpd->first_page: first page of the extent
  * @mpd->next_page: page after the last page of the extent
  * @map: mapping just obtained for (part of) the extent, or NULL when no
  *       new mapping is to be applied to the buffers
  *
  * By the time mpage_da_submit_io() is called we expect all blocks
  * to be allocated. this may be wrong if allocation failed.
  *
  * As pages are already locked by write_cache_pages(), we can't use it
  *
  * Returns 0 or the first error reported while writing a page.
  */
 static int mpage_da_submit_io(struct mpage_da_data *mpd,
 			      struct ext4_map_blocks *map)
 {
 	struct pagevec pvec;
 	unsigned long index, end;
 	int ret = 0, err, nr_pages, i;
 	struct inode *inode = mpd->inode;
 	struct address_space *mapping = inode->i_mapping;
 	loff_t size = i_size_read(inode);
 	unsigned int len, block_start;
 	struct buffer_head *bh, *page_bufs = NULL;
 	int journal_data = ext4_should_journal_data(inode);
 	sector_t pblock = 0, cur_logical = 0;
 	struct ext4_io_submit io_submit;
 	BUG_ON(mpd->next_page <= mpd->first_page);
 	memset(&io_submit, 0, sizeof(io_submit));
 	/*
 	 * We need to start from the first_page to the next_page - 1
 	 * to make sure we also write the mapped dirty buffer_heads.
 	 * If we look at mpd->b_blocknr we would only be looking
 	 * at the currently mapped buffer_heads.
 	 */
 	index = mpd->first_page;
 	end = mpd->next_page - 1;
 	pagevec_init(&pvec, 0);
 	while (index <= end) {
 		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
 		if (nr_pages == 0)
 			break;
 		for (i = 0; i < nr_pages; i++) {
 			/*
 			 * Note: skip_page is both a local flag and a label
 			 * below; labels live in a separate namespace.
 			 */
 			int commit_write = 0, skip_page = 0;
 			struct page *page = pvec.pages[i];
 			index = page->index;
 			if (index > end)
 				break;
 			/* The page containing EOF is only written up to i_size. */
 			if (index == size >> PAGE_CACHE_SHIFT)
 				len = size & ~PAGE_CACHE_MASK;
 			else
 				len = PAGE_CACHE_SIZE;
 			if (map) {
 				/*
 				 * First logical block of this page and the
 				 * physical block it has under @map.
 				 */
 				cur_logical = index << (PAGE_CACHE_SHIFT -
 							inode->i_blkbits);
 				pblock = map->m_pblk + (cur_logical -
 							map->m_lblk);
 			}
 			index++;
 			BUG_ON(!PageLocked(page));
 			BUG_ON(PageWriteback(page));
 			/*
 			 * If the page does not have buffers (for
 			 * whatever reason), try to create them using
 			 * __block_write_begin.  If this fails,
 			 * skip the page and move on.
 			 */
 			if (!page_has_buffers(page)) {
 				if (__block_write_begin(page, 0, len,
 						noalloc_get_block_write)) {
 				skip_page:
 					/* Also the target of the gotos below. */
 					unlock_page(page);
 					continue;
 				}
 				commit_write = 1;
 			}
 			bh = page_bufs = page_buffers(page);
 			block_start = 0;
 			do {
 				if (!bh)
 					goto skip_page;
 				/* Is this buffer inside the range @map covers? */
 				if (map && (cur_logical >= map->m_lblk) &&
 				    (cur_logical <= (map->m_lblk +
 						     (map->m_len - 1)))) {
 					if (buffer_delay(bh)) {
 						clear_buffer_delay(bh);
 						bh->b_blocknr = pblock;
 					}
 					if (buffer_unwritten(bh) ||
 					    buffer_mapped(bh))
 						BUG_ON(bh->b_blocknr != pblock);
 					if (map->m_flags & EXT4_MAP_UNINIT)
 						set_buffer_uninit(bh);
 					clear_buffer_unwritten(bh);
 				}
 				/* skip page if block allocation undone */
 				if (buffer_delay(bh) || buffer_unwritten(bh))
 					skip_page = 1;
 				bh = bh->b_this_page;
 				/*
 				 * NOTE(review): block_start is accumulated
 				 * (using the size of the *next* buffer) but is
 				 * not read anywhere else in this function.
 				 */
 				block_start += bh->b_size;
 				cur_logical++;
 				pblock++;
 			} while (bh != page_bufs);
 			if (skip_page)
 				goto skip_page;
 			if (commit_write)
 				/* mark the buffer_heads as dirty & uptodate */
 				block_commit_write(page, 0, len);
 			clear_page_dirty_for_io(page);
 			/*
 			 * Delalloc doesn't support data journalling,
 			 * but eventually maybe we'll lift this
 			 * restriction.
 			 */
 			if (unlikely(journal_data && PageChecked(page)))
 				err = __ext4_journalled_writepage(page, len);
 			else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
 				err = ext4_bio_write_page(&io_submit, page,
 							  len, mpd->wbc);
 			else
 				err = block_write_full_page(page,
 					noalloc_get_block_write, mpd->wbc);
 			if (!err)
 				mpd->pages_written++;
 			/*
 			 * In error case, we have to continue because
 			 * remaining pages are still locked
 			 */
 			if (ret == 0)
 				ret = err;
 		}
 		pagevec_release(&pvec);
 	}
 	/* Hand whatever was accumulated in io_submit over for submission. */
 	ext4_io_submit(&io_submit);
 	return ret;
 }
 /*
  * Invalidate, mark not-uptodate and unlock every page in the range
  * [mpd->first_page, mpd->next_page - 1].  The pages are expected to be
  * locked and not under writeback on entry (both are asserted).
  */
 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
 {
 	int nr_pages, i;
 	pgoff_t index, end;
 	struct pagevec pvec;
 	struct inode *inode = mpd->inode;
 	struct address_space *mapping = inode->i_mapping;
 	index = mpd->first_page;
 	end   = mpd->next_page - 1;
 	/*
 	 * pvec lives on the stack: initialise it before it is handed to
 	 * pagevec_lookup()/pagevec_release(), otherwise its fields hold
 	 * whatever garbage was on the stack.
 	 */
 	pagevec_init(&pvec, 0);
 	while (index <= end) {
 		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
 		if (nr_pages == 0)
 			break;
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 			if (page->index > end)
 				break;
 			BUG_ON(!PageLocked(page));
 			BUG_ON(PageWriteback(page));
 			block_invalidatepage(page, 0);
 			ClearPageUptodate(page);
 			unlock_page(page);
 		}
 		/* Continue the lookup after the last page of this batch. */
 		index = pvec.pages[nr_pages - 1]->index + 1;
 		pagevec_release(&pvec);
 	}
 }
 /*
  * Dump the filesystem-wide free/dirty block counters and this inode's
  * delayed-allocation reservation counters to the kernel log at KERN_CRIT.
  */
 static void ext4_print_free_blocks(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);

 	printk(KERN_CRIT "Total free blocks count %lld\n",
 	       ext4_count_free_blocks(sb));

 	printk(KERN_CRIT "Free/Dirty block details\n");
 	printk(KERN_CRIT "free_blocks=%lld\n",
 	       (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
 	printk(KERN_CRIT "dirty_blocks=%lld\n",
 	       (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));

 	printk(KERN_CRIT "Block reservation details\n");
 	printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
 	       ei->i_reserved_data_blocks);
 	printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
 	       ei->i_reserved_meta_blocks);
 }
 /*
  * mpage_da_map_and_submit - go through given space, map them
  *       if necessary, and then submit them for I/O
  *
  * @mpd - bh describing space
  *
  * The function skips space we know is already mapped to disk blocks.
  *
  * On the paths that reach the end of the function (and on the
  * invalidate path) mpd->io_done is set to 1.
  */
 static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 {
 	int err, blks, get_blocks_flags;
 	struct ext4_map_blocks map, *mapp = NULL;
 	sector_t next = mpd->b_blocknr;
 	unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
 	/* This initial value is overwritten before disksize is first read. */
 	loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
 	handle_t *handle = NULL;
 	/*
 	 * If the blocks are mapped already, or we couldn't accumulate
 	 * any blocks, then proceed immediately to the submission stage.
 	 */
 	if ((mpd->b_size == 0) ||
 	    ((mpd->b_state  & (1 << BH_Mapped)) &&
 	     !(mpd->b_state & (1 << BH_Delay)) &&
 	     !(mpd->b_state & (1 << BH_Unwritten))))
 		goto submit_io;
 	/* Allocation needs a running transaction; one must already exist. */
 	handle = ext4_journal_current_handle();
 	BUG_ON(!handle);
 	/*
 	 * Call ext4_map_blocks() to allocate any delayed allocation
 	 * blocks, or to convert an uninitialized extent to be
 	 * initialized (in the case where we have written into
 	 * one or more preallocated blocks).
 	 *
 	 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
 	 * indicate that we are on the delayed allocation path.  This
 	 * affects functions in many different parts of the allocation
 	 * call path.  This flag exists primarily because we don't
 	 * want to change *many* call functions, so ext4_map_blocks()
 	 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
 	 * inode's allocation semaphore is taken.
 	 *
 	 * If the blocks in questions were delalloc blocks, set
 	 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
 	 * variables are updated after the blocks have been allocated.
 	 */
 	map.m_lblk = next;
 	map.m_len = max_blocks;
 	get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
 	if (ext4_should_dioread_nolock(mpd->inode))
 		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
 	if (mpd->b_state & (1 << BH_Delay))
 		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
 	blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
 	if (blks < 0) {
 		struct super_block *sb = mpd->inode->i_sb;
 		err = blks;
 		/*
 		 * If get block returns EAGAIN or ENOSPC and there
 		 * appears to be free blocks we will just let
 		 * mpage_da_submit_io() unlock all of the pages.
 		 */
 		if (err == -EAGAIN)
 			goto submit_io;
 		if (err == -ENOSPC &&
 		    ext4_count_free_blocks(sb)) {
 			/* Record the error in mpd->retval before submitting. */
 			mpd->retval = err;
 			goto submit_io;
 		}
 		/*
 		 * get block failure will cause us to loop in
 		 * writepages, because a_ops->writepage won't be able
 		 * to make progress. The page will be redirtied by
 		 * writepage and writepages will again try to write
 		 * the same.
 		 */
 		if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
 			ext4_msg(sb, KERN_CRIT,
 				 "delayed block allocation failed for inode %lu "
 				 "at logical offset %llu with max blocks %zd "
 				 "with error %d", mpd->inode->i_ino,
 				 (unsigned long long) next,
 				 mpd->b_size >> mpd->inode->i_blkbits, err);
 			ext4_msg(sb, KERN_CRIT,
 				"This should not happen!! Data will be lost\n");
 			if (err == -ENOSPC)
 				ext4_print_free_blocks(mpd->inode);
 		}
 		/* invalidate all the pages */
 		ext4_da_block_invalidatepages(mpd);
 		/* Mark this page range as having been completed */
 		mpd->io_done = 1;
 		return;
 	}
 	BUG_ON(blks == 0);
 	/* From here on the submit stage applies the new mapping. */
 	mapp = &map;
 	if (map.m_flags & EXT4_MAP_NEW) {
 		struct block_device *bdev = mpd->inode->i_sb->s_bdev;
 		int i;
 		/* Call unmap_underlying_metadata() on each newly mapped block. */
 		for (i = 0; i < map.m_len; i++)
 			unmap_underlying_metadata(bdev, map.m_pblk + i);
 	}
 	if (ext4_should_order_data(mpd->inode)) {
 		err = ext4_jbd2_file_inode(handle, mpd->inode);
 		if (err)
 			/*
 			 * This only happens if the journal is aborted
 			 *
 			 * NOTE(review): this path returns without calling
 			 * mpage_da_submit_io() or setting mpd->io_done;
 			 * confirm that callers cope with that.
 			 */
 			return;
 	}
 	/*
 	 * Update on-disk size along with block allocation.  The new size
 	 * is the end of the mapped range, capped at the in-core i_size.
 	 */
 	disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
 	if (disksize > i_size_read(mpd->inode))
 		disksize = i_size_read(mpd->inode);
 	if (disksize > EXT4_I(mpd->inode)->i_disksize) {
 		ext4_update_i_disksize(mpd->inode, disksize);
 		err = ext4_mark_inode_dirty(handle, mpd->inode);
 		if (err)
 			ext4_error(mpd->inode->i_sb,
 				   "Failed to mark inode %lu dirty",
 				   mpd->inode->i_ino);
 	}
 submit_io:
 	/* mapp is NULL when we got here without a fresh mapping. */
 	mpage_da_submit_io(mpd, mapp);
 	mpd->io_done = 1;
 }
 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
 		(1 << BH_Delay) | (1 << BH_Unwritten))
 /*
  * mpage_add_bh_to_extent - try to add one more block to extent of blocks
  *
  * @mpd->lbh - extent of blocks
  * @logical - logical number of the block in the file
  * @b_size - size in bytes of the buffer being added
  * @b_state - state bits of the buffer (only BH_FLAGS are considered)
  *
  * The function is used to collect contiguous blocks in the same state.
  * If the block cannot be added to the extent being built, that extent is
  * flushed through mpage_da_map_and_submit() instead.
  */
 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 				   sector_t logical, size_t b_size,
 				   unsigned long b_state)
 {
 	struct inode *inode = mpd->inode;
 	int nrblocks = mpd->b_size >> inode->i_blkbits;
 	unsigned long state = b_state & BH_FLAGS;
 	sector_t expected;

 	/*
 	 * XXX Don't go larger than mballoc is willing to allocate
 	 * This is a stopgap solution.  We eventually need to fold
 	 * mpage_da_submit_io() into this function and then call
 	 * ext4_map_blocks() multiple times in a loop
 	 */
 	if (nrblocks >= 8*1024*1024/inode->i_sb->s_blocksize)
 		goto flush_it;

 	/* check if the reserved journal credits might overflow */
 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
 		/*
 		 * With non-extent format we are limited by the journal
 		 * credit available.  Total credit needed to insert
 		 * nrblocks contiguous blocks is dependent on the
 		 * nrblocks.  So limit nrblocks.
 		 */
 		if (nrblocks >= EXT4_MAX_TRANS_DATA)
 			goto flush_it;

 		/*
 		 * Adding the new buffer_head would make it cross the
 		 * allowed limit for which we have journal credit
 		 * reserved. So limit the new bh->b_size; the flush
 		 * then happens on the next call.
 		 */
 		if ((nrblocks + (b_size >> inode->i_blkbits)) >
 				EXT4_MAX_TRANS_DATA)
 			b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
 						inode->i_blkbits;
 	}

 	/* First block in the extent: just record it. */
 	if (mpd->b_size == 0) {
 		mpd->b_blocknr = logical;
 		mpd->b_size = b_size;
 		mpd->b_state = state;
 		return;
 	}

 	/* Can we merge the block into our big extent? */
 	expected = mpd->b_blocknr + nrblocks;
 	if (logical == expected && state == mpd->b_state) {
 		mpd->b_size += b_size;
 		return;
 	}

 flush_it:
 	/*
 	 * We couldn't merge the block to our extent, so we
 	 * need to flush current  extent and start new one
 	 */
 	mpage_da_map_and_submit(mpd);
 }
 /*
  * walk_page_buffers() callback: returns 1 when @bh is dirty and is either
  * delayed or unwritten, 0 otherwise.  @handle is not used.
  */
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
 {
 	if (!(buffer_delay(bh) || buffer_unwritten(bh)))
 		return 0;
 	return buffer_dirty(bh) ? 1 : 0;
 }
 /*
  * get_block callback used by ext4_da_write_begin().  It either maps @bh
  * to an already allocated block or reserves space for a single block.
  *
  * For a delayed buffer_head we set BH_Mapped, BH_New and BH_Delay, with
  * b_blocknr pointing at an invalid block number and b_bdev initialized.
  *
  * For an unwritten buffer_head we set BH_Mapped, BH_New and BH_Unwritten,
  * with b_blocknr = physical block of the unwritten extent and b_bdev
  * initialized.
  *
  * Returns 0 on success or the negative error from ext4_map_blocks() /
  * ext4_da_reserve_space().
  */
 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 				  struct buffer_head *bh, int create)
 {
 	struct super_block *sb = inode->i_sb;
 	struct ext4_map_blocks map;
 	sector_t invalid_block = ~((sector_t) 0xffff);
 	int ret;
 	/* Pick a block number that lies beyond the end of the filesystem */
 	if (invalid_block < ext4_blocks_count(EXT4_SB(sb)->s_es))
 		invalid_block = ~(sector_t) 0;
 	BUG_ON(create == 0);
 	BUG_ON(bh->b_size != sb->s_blocksize);
 	/*
 	 * First find out whether the block is allocated already.
 	 * Preallocated blocks are unmapped but should be treated the
 	 * same as allocated blocks.
 	 */
 	map.m_lblk = iblock;
 	map.m_len = 1;
 	ret = ext4_map_blocks(NULL, inode, &map, 0);
 	if (ret < 0)
 		return ret;
 	if (ret > 0) {
 		/* Block exists: map it and take over the lookup flags */
 		map_bh(bh, sb, map.m_pblk);
 		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
 		if (buffer_unwritten(bh)) {
 			/*
 			 * A delayed write to an unwritten bh is marked new
 			 * and mapped.  Mapped avoids repeated get_block
 			 * calls for the same offset; new makes partial
 			 * writes zero out the rest properly.
 			 */
 			set_buffer_new(bh);
 			set_buffer_mapped(bh);
 		}
 		return 0;
 	}
 	/* Hole: nothing to do if the buffer is already delayed */
 	if (buffer_delay(bh))
 		return 0; /* Not sure this could or should happen */
 	/*
 	 * XXX: __block_write_begin() unmaps passed block, is it OK?
 	 */
 	ret = ext4_da_reserve_space(inode, iblock);
 	if (ret)
 		return ret; /* not enough space to reserve */
 	map_bh(bh, sb, invalid_block);
 	set_buffer_new(bh);
 	set_buffer_delay(bh);
 	return 0;
 }
 /*
  * Standard get_block_t callback for callers that must not allocate
  * blocks.  It is used with block_write_begin() and
  * block_write_full_page(), which should only map one block at a time.
  *
  * @create is ignored: the lookup is always done without allocation.
  * Callers therefore have to make sure every buffer head is already
  * mapped or marked for delayed allocation before calling
  * block_write_full_page().  Otherwise b_blocknr could be left
  * uninitialized and the page write functions will be taken by surprise.
  */
 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result, int create)
 {
 	const int no_alloc = 0;
 	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
 	return _ext4_get_block(inode, iblock, bh_result, no_alloc);
 }
 /* walk_page_buffers() callback: take a reference on @bh.  Always returns 0. */
 static int bget_one(handle_t *handle, struct buffer_head *bh)
 {
 	(void)handle;
 	get_bh(bh);
 	return 0;
 }
 /* walk_page_buffers() callback: drop a reference on @bh.  Always returns 0. */
 static int bput_one(handle_t *handle, struct buffer_head *bh)
 {
 	(void)handle;
 	put_bh(bh);
 	return 0;
 }
 /*
  * Write the first @len bytes of @page through the journal.
  *
  * The page is unlocked before the transaction is started, so a reference
  * is taken on every buffer first (bget_one) and dropped again before
  * returning (bput_one), including when the transaction cannot be started.
  *
  * Returns 0 on success or the first error seen while starting the
  * handle, getting write access, running write_end_fn or stopping the
  * handle.
  */
 static int __ext4_journalled_writepage(struct page *page,
 				       unsigned int len)
 {
 	struct address_space *mapping = page->mapping;
 	struct inode *inode = mapping->host;
 	struct buffer_head *page_bufs;
 	handle_t *handle = NULL;
 	int ret = 0;
 	int err;
 	ClearPageChecked(page);
 	page_bufs = page_buffers(page);
 	BUG_ON(!page_bufs);
 	walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
 	/* As soon as we unlock the page, it can go away, but we have
 	 * references to buffers so we are safe */
 	unlock_page(page);
 	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		/* Don't leak the buffer references taken above */
 		walk_page_buffers(NULL, page_bufs, 0, len, NULL, bput_one);
 		goto out;
 	}
 	ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
 				do_journal_get_write_access);
 	err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
 				write_end_fn);
 	if (ret == 0)
 		ret = err;
 	err = ext4_journal_stop(handle);
 	if (!ret)
 		ret = err;
 	/* The handle is stopped by now; bput_one does not use it anyway */
 	walk_page_buffers(NULL, page_bufs, 0, len, NULL, bput_one);
 	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 out:
 	return ret;
 }
 /*
  * Forward declarations: both are referenced by ext4_writepage() below
  * for buffers that have the uninit flag set.
  */
 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
 /*
  * Note that we don't need to start a transaction unless we're journaling
  * data because we should have holes filled from ext4_page_mkwrite().  We
  * don't even need to file the inode to the transaction's list in ordered
  * mode: if we are writing back data added by write(), the inode is
  * already there, and if we are writing back data modified via mmap(), no
  * one guarantees in which transaction the data will hit the disk.  In
  * case we are journaling data, we cannot start a transaction directly
  * because transaction start ranks above page lock, so we have to do some
  * magic.
  *
  * This function can get called via...
  *   - ext4_da_writepages after taking page lock (have journal handle)
  *   - journal_submit_inode_data_buffers (no journal handle)
  *   - shrink_page_list via pdflush (no journal handle)
  *   - grab_page_cache when doing write_begin (have journal handle)
  *
  * We don't do any block allocation in this function.  If we have a page
  * with multiple blocks we need to write those buffer_heads that are
  * mapped.  This is important for mmap based writes.  So if we do, with
  * blocksize 1K,
  * truncate(f, 1024);
  * a = mmap(f, 0, 4096);
  * a[0] = 'a';
  * truncate(f, 4096);
  * the first buffer_head of the page is mapped via the page_mkwrite
  * callback, but the other buffer_heads are unmapped yet dirty (dirtied
  * via do_wp_page).  So writepage should write the first block.  If we
  * modify the mmap area beyond 1024 we will get a page fault again and the
  * page_mkwrite callback will do the block allocation and mark the
  * buffer_heads mapped.
  *
  * We redirty the page if any buffer_head in it is either delayed or
  * unwritten.
  *
  * We can get called recursively as shown below.
  *
  *	ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
  *		ext4_writepage()
  *
  * But since we don't do any block allocation we should not deadlock.
  * The page also has its dirty flag cleared so we don't get a recursive
  * page_lock.
  */
 static int ext4_writepage(struct page *page,
 			  struct writeback_control *wbc)
 {
 	struct inode *inode = page->mapping->host;
 	struct buffer_head *page_bufs;
 	int commit_write = 0;
 	unsigned int len;
 	loff_t size;
 	trace_ext4_writepage(inode, page);
 	/* Only the part of the last page that lies inside i_size is written */
 	size = i_size_read(inode);
 	if (page->index != size >> PAGE_CACHE_SHIFT)
 		len = PAGE_CACHE_SIZE;
 	else
 		len = size & ~PAGE_CACHE_MASK;
 	/*
 	 * If the page does not have buffers (for whatever reason),
 	 * try to create them using __block_write_begin.  If this
 	 * fails, redirty the page and move on.
 	 */
 	if (!page_has_buffers(page)) {
 		if (__block_write_begin(page, 0, len,
 					noalloc_get_block_write))
 			goto redirty_page;
 		commit_write = 1;
 	}
 	page_bufs = page_buffers(page);
 	/*
 	 * We don't want to do block allocation, so redirty the page and
 	 * return if any buffer still needs it.  We may reach here when we
 	 * do a journal commit via journal_submit_inode_data_buffers.
 	 * We can also reach here via shrink_page_list.
 	 */
 	if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
 			      ext4_bh_delay_or_unwritten))
 		goto redirty_page;
 	/* now mark the buffer_heads as dirty and uptodate */
 	if (commit_write)
 		block_commit_write(page, 0, len);
 	/*
 	 * It's mmapped pagecache.  Add buffers and journal it.  There
 	 * doesn't seem much point in redirtying the page here.
 	 */
 	if (PageChecked(page) && ext4_should_journal_data(inode))
 		return __ext4_journalled_writepage(page, len);
 	if (buffer_uninit(page_bufs)) {
 		ext4_set_bh_endio(page_bufs, inode);
 		return block_write_full_page_endio(page,
 						   noalloc_get_block_write,
 						   wbc,
 						   ext4_end_io_buffer_write);
 	}
 	return block_write_full_page(page, noalloc_get_block_write, wbc);
 redirty_page:
 	redirty_page_for_writepage(wbc, page);
 	unlock_page(page);
 	return 0;
 }
 /*
  * Called from ext4_da_writepages() to calculate the number of journal
  * credits to reserve so that a single extent allocation fits into a
  * single transaction.  ext4_da_writepages() calls this in a loop before
  * each block allocation.
  */
 static int ext4_da_writepages_trans_blocks(struct inode *inode)
 {
 	int nr_blocks = EXT4_I(inode)->i_reserved_data_blocks;
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 		return ext4_chunk_trans_blocks(inode, nr_blocks);
 	/*
 	 * With non-extent format the journal credit needed to insert
 	 * contiguous blocks depends on how many there are, so cap the
 	 * count at a sane value.
 	 */
 	if (nr_blocks > EXT4_MAX_TRANS_DATA)
 		nr_blocks = EXT4_MAX_TRANS_DATA;
 	return ext4_chunk_trans_blocks(inode, nr_blocks);
 }
 /*
  * write_cache_pages_da - walk the list of dirty pages of the given
  * address space and accumulate pages that need writing, and call
  * mpage_da_map_and_submit to map a single contiguous memory region
  * and then write them.
  *
  * @mapping - address space to write back
  * @wbc - writeback control; range_start/range_end, sync_mode and
  *	nr_to_write are read here
  * @mpd - extent collection state; zeroed and re-initialized on entry
  * @done_index - set to the index following the last page looked at
  *
  * Returns MPAGE_DA_EXTENT_TAIL when an extent was submitted from within
  * this function, 0 otherwise.
  */
 static int write_cache_pages_da(struct address_space *mapping,
 				struct writeback_control *wbc,
 				struct mpage_da_data *mpd,
 				pgoff_t *done_index)
 {
 	struct buffer_head	*bh, *head;
 	struct inode		*inode = mapping->host;
 	struct pagevec		pvec;
 	unsigned int		nr_pages;
 	sector_t		logical;
 	pgoff_t			index, end;
 	long			nr_to_write = wbc->nr_to_write;
 	int			i, tag, ret = 0;
 	memset(mpd, 0, sizeof(struct mpage_da_data));
 	mpd->wbc = wbc;
 	mpd->inode = inode;
 	pagevec_init(&pvec, 0);
 	index = wbc->range_start >> PAGE_CACHE_SHIFT;
 	end = wbc->range_end >> PAGE_CACHE_SHIFT;
 	/* Integrity sync walks the pages tagged by the caller beforehand */
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		tag = PAGECACHE_TAG_TOWRITE;
 	else
 		tag = PAGECACHE_TAG_DIRTY;
 	*done_index = index;
 	while (index <= end) {
 		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
 			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
 		if (nr_pages == 0)
 			return 0;
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 			/*
 			 * At this point, the page may be truncated or
 			 * invalidated (changing page->mapping to NULL), or
 			 * even swizzled back from swapper_space to tmpfs file
 			 * mapping. However, page->index will not change
 			 * because we have a reference on the page.
 			 */
 			if (page->index > end)
 				goto out;
 			*done_index = page->index + 1;
 			/*
 			 * If we can't merge this page, and we have
 			 * accumulated a contiguous region, write it
 			 */
 			if ((mpd->next_page != page->index) &&
 			    (mpd->next_page != mpd->first_page)) {
 				mpage_da_map_and_submit(mpd);
 				goto ret_extent_tail;
 			}
 			lock_page(page);
 			/*
 			 * If the page is no longer dirty, or its
 			 * mapping no longer corresponds to inode we
 			 * are writing (which means it has been
 			 * truncated or invalidated), or the page is
 			 * already under writeback and we are not
 			 * doing a data integrity writeback, skip the page
 			 */
 			if (!PageDirty(page) ||
 			    (PageWriteback(page) &&
 			     (wbc->sync_mode == WB_SYNC_NONE)) ||
 			    unlikely(page->mapping != mapping)) {
 				unlock_page(page);
 				continue;
 			}
 			/*
 			 * Any writeback still in flight on this page has to
 			 * finish before we collect it.
 			 */
 			wait_on_page_writeback(page);
 			BUG_ON(PageWriteback(page));
 			/* A non-adjacent page starts a new run of pages */
 			if (mpd->next_page != page->index)
 				mpd->first_page = page->index;
 			mpd->next_page = page->index + 1;
 			logical = (sector_t) page->index <<
 				(PAGE_CACHE_SHIFT - inode->i_blkbits);
 			if (!page_has_buffers(page)) {
 				/* No buffers: add the whole page as one run */
 				mpage_add_bh_to_extent(mpd, logical,
 						       PAGE_CACHE_SIZE,
 						       (1 << BH_Dirty) | (1 << BH_Uptodate));
 				if (mpd->io_done)
 					goto ret_extent_tail;
 			} else {
 				/*
 				 * Page with regular buffer heads,
 				 * just add all dirty ones
 				 */
 				head = page_buffers(page);
 				bh = head;
 				do {
 					BUG_ON(buffer_locked(bh));
 					/*
 					 * We need to try to allocate
 					 * unmapped blocks in the same page.
 					 * Otherwise we won't make progress
 					 * with the page in ext4_writepage
 					 */
 					if (ext4_bh_delay_or_unwritten(NULL, bh)) {
 						mpage_add_bh_to_extent(mpd, logical,
 								       bh->b_size,
 								       bh->b_state);
 						if (mpd->io_done)
 							goto ret_extent_tail;
 					} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
 						/*
 						 * mapped dirty buffer. We need
 						 * to update the b_state
 						 * because we look at b_state
 						 * in mpage_da_map_blocks.  We
 						 * don't update b_size because
 						 * if we find an unmapped
 						 * buffer_head later we need to
 						 * use the b_state flag of that
 						 * buffer_head.
 						 */
 						if (mpd->b_size == 0)
 							mpd->b_state = bh->b_state & BH_FLAGS;
 					}
 					logical++;
 				} while ((bh = bh->b_this_page) != head);
 			}
 			if (nr_to_write > 0) {
 				nr_to_write--;
 				if (nr_to_write == 0 &&
 				    wbc->sync_mode == WB_SYNC_NONE)
 					/*
 					 * We stop writing back only if we are
 					 * not doing integrity sync. In case of
 					 * integrity sync we have to keep going
 					 * because someone may be concurrently
 					 * dirtying pages, and we might have
 					 * synced a lot of newly appeared dirty
 					 * pages, but have not synced all of the
 					 * old dirty pages.
 					 */
 					goto out;
 			}
 		}
 		pagevec_release(&pvec);
 		cond_resched();
 	}
 	return 0;
 ret_extent_tail:
 	ret = MPAGE_DA_EXTENT_TAIL;
 out:
 	pagevec_release(&pvec);
 	cond_resched();
 	return ret;
 }
 /*
  * Writepages implementation for delayed allocation.
  *
  * Repeatedly starts a journal handle, lets write_cache_pages_da() collect
  * and submit one extent of pages, and stops the handle again, until
  * wbc->nr_to_write is used up, an error occurs, or a pass finds nothing
  * more to write.  wbc->nr_to_write may be raised temporarily (see
  * nr_to_writebump) and is lowered again by the same amount on exit.
  *
  * Returns 0 on success, -EROFS if the filesystem has been aborted, or
  * the error from ext4_journal_start().
  */
 static int ext4_da_writepages(struct address_space *mapping,
 			      struct writeback_control *wbc)
 {
 	pgoff_t	index;
 	int range_whole = 0;
 	handle_t *handle = NULL;
 	struct mpage_da_data mpd;
 	struct inode *inode = mapping->host;
 	int pages_written = 0;
 	unsigned int max_pages;
 	int range_cyclic, cycled = 1, io_done = 0;
 	int needed_blocks, ret = 0;
 	long desired_nr_to_write, nr_to_writebump = 0;
 	loff_t range_start = wbc->range_start;
 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
 	pgoff_t done_index = 0;
 	pgoff_t end;
 	trace_ext4_da_writepages(inode, wbc);
 	/*
 	 * No pages to write? This is mainly a kludge to avoid starting
 	 * a transaction for special inodes like journal inode on last iput()
 	 * because that could violate lock ordering on umount
 	 */
 	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 		return 0;
 	/*
 	 * If the filesystem has aborted, it is read-only, so return
 	 * right away instead of dumping stack traces later on that
 	 * will obscure the real source of the problem.  We test
 	 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
 	 * the latter could be true if the filesystem is mounted
 	 * read-only, and in that case, ext4_da_writepages should
 	 * *never* be called, so if that ever happens, we would want
 	 * the stack trace.
 	 */
 	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
 		return -EROFS;
 	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 		range_whole = 1;
 	/*
 	 * Cyclic writeback is turned into an explicit range starting at
 	 * mapping->writeback_index.  The caller's range_cyclic value is
 	 * remembered and put back after the write loop.
 	 */
 	range_cyclic = wbc->range_cyclic;
 	if (wbc->range_cyclic) {
 		index = mapping->writeback_index;
 		if (index)
 			cycled = 0;
 		wbc->range_start = index << PAGE_CACHE_SHIFT;
 		wbc->range_end  = LLONG_MAX;
 		wbc->range_cyclic = 0;
 		end = -1;
 	} else {
 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
 	}
 	/*
 	 * This works around two forms of stupidity.  The first is in
 	 * the writeback code, which caps the maximum number of pages
 	 * written to be 1024 pages.  This is wrong on multiple
 	 * levels; different architectures have a different page size,
 	 * which changes the maximum amount of data which gets
 	 * written.  Secondly, 4 megabytes is way too small.  XFS
 	 * forces this value to be 16 megabytes by multiplying
 	 * nr_to_write parameter by four, and then relies on its
 	 * allocator to allocate larger extents to make them
 	 * contiguous.  Unfortunately this brings us to the second
 	 * stupidity, which is that ext4's mballoc code only allocates
 	 * at most 2048 blocks.  So we force contiguous writes up to
 	 * the number of dirty blocks in the inode, or
 	 * sbi->max_writeback_mb_bump whichever is smaller.
 	 */
 	max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
 	if (!range_cyclic && range_whole) {
 		if (wbc->nr_to_write == LONG_MAX)
 			desired_nr_to_write = wbc->nr_to_write;
 		else
 			desired_nr_to_write = wbc->nr_to_write * 8;
 	} else
 		desired_nr_to_write = ext4_num_dirty_pages(inode, index,
 							   max_pages);
 	if (desired_nr_to_write > max_pages)
 		desired_nr_to_write = max_pages;
 	/* Remember how much we raised nr_to_write so it can be undone */
 	if (wbc->nr_to_write < desired_nr_to_write) {
 		nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
 		wbc->nr_to_write = desired_nr_to_write;
 	}
 retry:
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		tag_pages_for_writeback(mapping, index, end);
 	while (!ret && wbc->nr_to_write > 0) {
 		/*
 		 * we  insert one extent at a time. So we need
 		 * credit needed for single extent allocation.
 		 * journalled mode is currently not supported
 		 * by delalloc
 		 */
 		BUG_ON(ext4_should_journal_data(inode));
 		needed_blocks = ext4_da_writepages_trans_blocks(inode);
 		/* start a new transaction*/
 		handle = ext4_journal_start(inode, needed_blocks);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
 			       "%ld pages, ino %lu; err %d", __func__,
 				wbc->nr_to_write, inode->i_ino, ret);
 			/*
 			 * NOTE(review): this jumps past the statement that
 			 * puts wbc->range_cyclic back, so a caller that
 			 * passed range_cyclic=1 sees it cleared on this
 			 * path -- confirm whether that is intended.
 			 */
 			goto out_writepages;
 		}
 		/*
 		 * Now call write_cache_pages_da() to find the next
 		 * contiguous region of logical blocks that need
 		 * blocks to be allocated by ext4 and submit them.
 		 */
 		ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
 		/*
 		 * If we have a contiguous extent of pages and we
 		 * haven't done the I/O yet, map the blocks and submit
 		 * them for I/O.
 		 */
 		if (!mpd.io_done && mpd.next_page != mpd.first_page) {
 			mpage_da_map_and_submit(&mpd);
 			ret = MPAGE_DA_EXTENT_TAIL;
 		}
 		trace_ext4_da_write_pages(inode, &mpd);
 		wbc->nr_to_write -= mpd.pages_written;
 		ext4_journal_stop(handle);
 		if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
 			/* commit the transaction which would
 			 * free blocks released in the transaction
 			 * and try again
 			 */
 			jbd2_journal_force_commit_nested(sbi->s_journal);
 			ret = 0;
 		} else if (ret == MPAGE_DA_EXTENT_TAIL) {
 			/*
 			 * got one extent now try with
 			 * rest of the pages
 			 */
 			pages_written += mpd.pages_written;
 			ret = 0;
 			io_done = 1;
 		} else if (wbc->nr_to_write)
 			/*
 			 * There is no more writeout needed
 			 * or we requested for a nonblocking writeout
 			 * and we found the device congested
 			 */
 			break;
 	}
 	/*
 	 * A cyclic pass that started in the middle of the file and wrote
 	 * nothing wraps around once to cover the pages before the old
 	 * writeback_index.
 	 */
 	if (!io_done && !cycled) {
 		cycled = 1;
 		index = 0;
 		wbc->range_start = index << PAGE_CACHE_SHIFT;
 		wbc->range_end  = mapping->writeback_index - 1;
 		goto retry;
 	}
 	/* Update index */
 	wbc->range_cyclic = range_cyclic;
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		/*
 		 * set the writeback_index so that range_cyclic
 		 * mode will write it back later
 		 */
 		mapping->writeback_index = done_index;
 out_writepages:
 	/*
 	 * Undo the temporary nr_to_write bump and put range_start back.
 	 * NOTE(review): wbc->range_end is not put back here although the
 	 * cyclic paths above overwrite it -- confirm callers do not rely
 	 * on it.
 	 */
 	wbc->nr_to_write -= nr_to_writebump;
 	wbc->range_start = range_start;
 	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
 	return ret;
 }
 /*
  * Value stored in *fsdata by ext4_da_write_begin() when it falls back to
  * ext4_write_begin(); ext4_da_write_end() checks for it to pick the
  * matching non-delalloc write_end path.
  */
 #define FALL_BACK_TO_NONDELALLOC 1
 /*
  * Decide whether writes should bypass delayed allocation because the
  * filesystem is running low on free blocks.
  *
  * The free block accounting via percpu counters can get slightly wrong
  * with percpu_counter_batch getting accumulated on each CPU without
  * updating the global counters.  Delalloc needs accurate free block
  * accounting, so switch to non-delalloc when we are near the error range.
  *
  * Returns 1 to switch to non-delalloc mode, 0 to keep using delalloc.
  */
 static int ext4_nonda_switch(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	s64 nr_free, nr_dirty;
 	nr_free = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
 	nr_dirty = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
 	/* free block count is less than 150% of dirty blocks */
 	if (2 * nr_free < 3 * nr_dirty)
 		return 1;
 	/* free blocks is less than watermark */
 	if (nr_free < (nr_dirty + EXT4_FREEBLOCKS_WATERMARK))
 		return 1;
 	/*
 	 * Even if we don't switch but are nearing capacity,
 	 * start pushing delalloc when 1/2 of free blocks are dirty.
 	 */
 	if (nr_free < 2 * nr_dirty)
 		writeback_inodes_sb_if_idle(sb);
 	return 0;
 }
 /*
  * write_begin for delayed allocation.
  *
  * Falls back to ext4_write_begin() (recording that in *fsdata) when
  * ext4_nonda_switch() says free space is low.  Otherwise starts a
  * one-credit handle, grabs the page and prepares its buffers with
  * ext4_da_get_block_prep().  The whole sequence is repeated on -ENOSPC
  * for as long as ext4_should_retry_alloc() allows.
  *
  * Returns 0 on success or a negative error.
  */
 static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 			       loff_t pos, unsigned len, unsigned flags,
 			       struct page **pagep, void **fsdata)
 {
 	struct inode *inode = mapping->host;
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	int ret, retries = 0;
 	struct page *page;
 	handle_t *handle;
 	if (ext4_nonda_switch(inode->i_sb)) {
 		*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
 		return ext4_write_begin(file, mapping, pos,
 					len, flags, pagep, fsdata);
 	}
 	*fsdata = (void *)0;
 	trace_ext4_da_write_begin(inode, pos, len, flags);
 	do {
 		/*
 		 * With delayed allocation, we don't log the i_disksize
 		 * update if there is delayed block allocation.  But we
 		 * still need to journal the i_disksize update if the write
 		 * goes to the end of a file which has an already mapped
 		 * buffer.
 		 */
 		handle = ext4_journal_start(inode, 1);
 		if (IS_ERR(handle))
 			return PTR_ERR(handle);
 		/* We cannot recurse into the filesystem as the transaction
 		 * is already started */
 		flags |= AOP_FLAG_NOFS;
 		page = grab_cache_page_write_begin(mapping, index, flags);
 		if (!page) {
 			ext4_journal_stop(handle);
 			return -ENOMEM;
 		}
 		*pagep = page;
 		ret = __block_write_begin(page, pos, len,
 					  ext4_da_get_block_prep);
 		if (ret < 0) {
 			unlock_page(page);
 			ext4_journal_stop(handle);
 			page_cache_release(page);
 			/*
 			 * block_write_begin may have instantiated a few
 			 * blocks outside i_size.  Trim these off again.
 			 * Don't need i_size_read because we hold i_mutex.
 			 */
 			if (pos + len > inode->i_size)
 				ext4_truncate_failed_write(inode);
 		}
 	} while (ret == -ENOSPC &&
 		 ext4_should_retry_alloc(inode->i_sb, &retries));
 	return ret;
 }
 /*
  * Check whether i_disksize should be updated for a write that reaches
  * the end of the file without requiring block allocation.
  *
  * Looks at the buffer covering byte @offset of @page and returns 1 when
  * it is mapped and neither delayed nor unwritten, 0 otherwise.
  */
 static int ext4_da_should_update_i_disksize(struct page *page,
 					    unsigned long offset)
 {
 	struct inode *inode = page->mapping->host;
 	struct buffer_head *bh = page_buffers(page);
 	unsigned int steps = offset >> inode->i_blkbits;
 	/* Walk the page's buffer ring to the buffer holding @offset */
 	for (; steps > 0; steps--)
 		bh = bh->b_this_page;
 	return buffer_mapped(bh) && !buffer_delay(bh) &&
 	       !buffer_unwritten(bh);
 }
 /*
  * write_end for delayed allocation.
  *
  * If write_begin fell back to the non-delalloc path (signalled through
  * @fsdata), hand over to the ordered or writeback write_end.  Otherwise
  * update i_disksize when the write extended the file without needing
  * block allocation, finish the write with generic_write_end() and stop
  * the current journal handle.
  *
  * Returns the number of bytes copied, or a negative error.
  */
 static int ext4_da_write_end(struct file *file,
 			     struct address_space *mapping,
 			     loff_t pos, unsigned len, unsigned copied,
 			     struct page *page, void *fsdata)
 {
 	struct inode *inode = mapping->host;
 	handle_t *handle = ext4_journal_current_handle();
 	int write_mode = (int)(unsigned long)fsdata;
 	unsigned long start, end;
 	loff_t new_i_size;
 	int ret = 0, ret2;
 	if (write_mode == FALL_BACK_TO_NONDELALLOC) {
 		if (ext4_should_order_data(inode))
 			return ext4_ordered_write_end(file, mapping, pos,
 					len, copied, page, fsdata);
 		if (ext4_should_writeback_data(inode))
 			return ext4_writeback_write_end(file, mapping, pos,
 					len, copied, page, fsdata);
 		BUG();
 	}
 	trace_ext4_da_write_end(inode, pos, len, copied);
 	/* Offset within the page of the last byte copied */
 	start = pos & (PAGE_CACHE_SIZE - 1);
 	end = start + copied - 1;
 	/*
 	 * generic_write_end() will run mark_inode_dirty() if i_size
 	 * changes.  So let's piggyback the i_disksize mark_inode_dirty
 	 * into that.
 	 */
 	new_i_size = pos + copied;
 	if (new_i_size > EXT4_I(inode)->i_disksize &&
 	    ext4_da_should_update_i_disksize(page, end)) {
 		down_write(&EXT4_I(inode)->i_data_sem);
 		/* Check again now that i_data_sem is held */
 		if (new_i_size > EXT4_I(inode)->i_disksize) {
 			/*
 			 * Updating i_disksize when extending file
 			 * without needing block allocation
 			 */
 			if (ext4_should_order_data(inode))
 				ret = ext4_jbd2_file_inode(handle, inode);
 			EXT4_I(inode)->i_disksize = new_i_size;
 		}
 		up_write(&EXT4_I(inode)->i_data_sem);
 		/* We need to mark the inode dirty even if
 		 * new_i_size is less than inode->i_size
 		 * but greater than i_disksize (hint: delalloc).
 		 */
 		ext4_mark_inode_dirty(handle, inode);
 	}
 	ret2 = generic_write_end(file, mapping, pos, len, copied,
 				 page, fsdata);
 	copied = ret2;
 	if (ret2 < 0)
 		ret = ret2;
 	ret2 = ext4_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
 	return ret ? ret : copied;
 }
 /*
  * invalidatepage for delayed allocation: drop the block reservations
  * held by the page's buffers (if it has any), then do the regular
  * ext4_invalidatepage().  The page must be locked.
  */
 static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
 {
 	BUG_ON(!PageLocked(page));
 	/* Drop reserved blocks */
 	if (page_has_buffers(page))
 		ext4_da_page_release_reservation(page, offset);
 	ext4_invalidatepage(page, offset);
 }
 /*
  * Force all delayed allocation blocks to be allocated for a given inode.
  *
  * Returns 0 when the inode has no reserved data or metadata blocks,
  * otherwise the result of filemap_flush().
  */
 int ext4_alloc_da_blocks(struct inode *inode)
 {
 	trace_ext4_alloc_da_blocks(inode);
 	/*
 	 * We do something simple for now.  The filemap_flush() will
 	 * also start triggering a write of the data blocks, which is
 	 * not strictly speaking necessary (and for users of
 	 * laptop_mode, not even desirable).  However, to do otherwise
 	 * would require replicating code paths in:
 	 *
 	 * ext4_da_writepages() ->
 	 *    write_cache_pages() ---> (via passed in callback function)
 	 *        __mpage_da_writepage() -->
 	 *           mpage_add_bh_to_extent()
 	 *           mpage_da_map_blocks()
 	 *
 	 * The problem is that write_cache_pages(), located in
 	 * mm/page-writeback.c, marks pages clean in preparation for
 	 * doing I/O, which is not desirable if we're not planning on
 	 * doing I/O at all.
 	 *
 	 * We could call write_cache_pages(), and then redirty all of
 	 * the pages by calling redirty_page_for_writepage() but that
 	 * would be ugly in the extreme.  So instead we would need to
 	 * replicate parts of the code in the above functions,
 	 * simplifying them because we wouldn't actually intend to
 	 * write out the pages, but rather only collect contiguous
 	 * logical block extents, call the multi-block allocator, and
 	 * then update the buffer heads with the block allocations.
 	 *
 	 * For now, though, we'll cheat by calling filemap_flush(),
 	 * which will map the blocks, and start the I/O, but not
 	 * actually wait for the I/O to complete.
 	 */
 	if (EXT4_I(inode)->i_reserved_data_blocks ||
 	    EXT4_I(inode)->i_reserved_meta_blocks)
 		return filemap_flush(inode->i_mapping);
 	/* Nothing is reserved, so there is nothing to allocate */
 	return 0;
 }
 /*
  * bmap() is special.  It gets used by applications such as lilo and by
  * the swapper to find the on-disk block of a specific piece of data.
  *
  * Naturally, this is dangerous if the block concerned is still in the
  * journal.  If somebody makes a swapfile on an ext4 data-journaling
  * filesystem and enables swap, then they may get a nasty shock when the
  * data getting swapped to that swapfile suddenly gets overwritten by
  * the original zeros written out previously to the journal and
  * awaiting writeback in the kernel's buffer cache.
  *
  * So, if we see any bmap calls here on a modified, data-journaled file,
  * take extra steps to flush any blocks which might be in the cache.
  *
  * Returns 0 when the journal flush fails, otherwise the result of
  * generic_block_bmap().
  */
 static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 {
 	struct inode *inode = mapping->host;
 	journal_t *journal;
 	int err;
 	/*
 	 * With delalloc we want to sync the file so that we can make
 	 * sure we allocate blocks for the file.
 	 */
 	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
 			test_opt(inode->i_sb, DELALLOC))
 		filemap_write_and_wait(mapping);
 	/* No journal or no journalled data: a plain lookup is enough */
 	if (!EXT4_JOURNAL(inode) ||
 	    !ext4_test_inode_state(inode, EXT4_STATE_JDATA))
 		return generic_block_bmap(mapping, block, ext4_get_block);
 	/*
 	 * This is a REALLY heavyweight approach, but the use of
 	 * bmap on dirty files is expected to be extremely rare:
 	 * only if we run lilo or swapon on a freshly made file
 	 * do we expect this to happen.
 	 *
 	 * (bmap requires CAP_SYS_RAWIO so this does not
 	 * represent an unprivileged user DOS attack --- we'd be
 	 * in trouble if mortal users could trigger this path at
 	 * will.)
 	 *
 	 * NB. EXT4_STATE_JDATA is not set on files other than
 	 * regular files.  If somebody wants to bmap a directory
 	 * or symlink and gets confused because the buffer
 	 * hasn't yet been flushed to disk, they deserve
 	 * everything they get.
 	 */
 	ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
 	journal = EXT4_JOURNAL(inode);
 	jbd2_journal_lock_updates(journal);
 	err = jbd2_journal_flush(journal);
 	jbd2_journal_unlock_updates(journal);
 	if (err)
 		return 0;
 	return generic_block_bmap(mapping, block, ext4_get_block);
 }
 /* readpage: trace the call and read @page via mpage_readpage(). */
 static int ext4_readpage(struct file *file, struct page *page)
 {
 	int ret;
 	trace_ext4_readpage(page);
 	ret = mpage_readpage(page, ext4_get_block);
 	return ret;
 }
 /* readpages: read the list of @pages via mpage_readpages(). */
 static int
 ext4_readpages(struct file *file, struct address_space *mapping,
 		struct list_head *pages, unsigned nr_pages)
 {
 	int ret = mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 	return ret;
 }
 /*
  * For every buffer of @page that starts at or after @offset, clear its
  * uninit flag and, if that flag was set and the buffer carries an io_end
  * in b_private, free the io_end and reset b_private and b_end_io.
  */
 static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
 {
 	struct buffer_head *first, *cur;
 	unsigned int pos = 0;	/* byte offset of 'cur' within the page */
 	if (!page_has_buffers(page))
 		return;
 	first = page_buffers(page);
 	cur = first;
 	do {
 		if (offset <= pos && test_clear_buffer_uninit(cur)
 					&& cur->b_private) {
 			ext4_free_io_end(cur->b_private);
 			cur->b_private = NULL;
 			cur->b_end_io = NULL;
 		}
 		pos += cur->b_size;
 	} while ((cur = cur->b_this_page) != first);
 }
 /*
  * invalidatepage: discard the buffers of @page from @offset onwards,
  * going through the journal when the inode has one.
  */
 static void ext4_invalidatepage(struct page *page, unsigned long offset)
 {
 	struct inode *inode = page->mapping->host;
 	journal_t *journal = EXT4_JOURNAL(inode);
 	trace_ext4_invalidatepage(page, offset);
 	/*
 	 * free any io_end structure allocated for buffers to be discarded
 	 */
 	if (ext4_should_dioread_nolock(inode))
 		ext4_invalidatepage_free_endio(page, offset);
 	/*
 	 * If it's a full truncate we just forget about the pending dirtying
 	 */
 	if (offset == 0)
 		ClearPageChecked(page);
 	if (!journal) {
 		block_invalidatepage(page, offset);
 		return;
 	}
 	jbd2_journal_invalidatepage(journal, page, offset);
 }
 /*
  * releasepage: try to free the buffers of @page, through the journal
  * when the inode has one.  Returns 0 if the page has no buffers,
  * otherwise whatever the buffer-freeing helper returns.
  */
 static int ext4_releasepage(struct page *page, gfp_t wait)
 {
 	journal_t *journal = EXT4_JOURNAL(page->mapping->host);
 	trace_ext4_releasepage(page);
 	WARN_ON(PageChecked(page));
 	if (!page_has_buffers(page))
 		return 0;
 	if (!journal)
 		return try_to_free_buffers(page);
 	return jbd2_journal_try_to_free_buffers(journal, page, wait);
 }
 /*
  * O_DIRECT for ext3 (or indirect map) based files
  *
  * If the O_DIRECT write will extend the file then add this inode to the
  * orphan list.  So recovery will truncate it back to the original size
  * if the machine crashes during the write.
  *
  * If the O_DIRECT write is instantiating holes inside i_size and the machine
  * crashes then stale disk data _may_ be exposed inside the file. But current
  * VFS code falls back into buffered path in that case so we are safe.
  */
 static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 			      const struct iovec *iov, loff_t offset,
 			      unsigned long nr_segs)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	handle_t *handle;
 	ssize_t ret;
 	int orphan = 0;
 	size_t count = iov_length(iov, nr_segs);
 	int retries = 0;
 	if (rw == WRITE) {
 		loff_t final_size = offset + count;
 		if (final_size > inode->i_size) {
 			/* Credits for sb + inode write */
 			handle = ext4_journal_start(inode, 2);
 			if (IS_ERR(handle)) {
 				ret = PTR_ERR(handle);
 				goto out;
 			}
 			ret = ext4_orphan_add(handle, inode);
 			if (ret) {
 				ext4_journal_stop(handle);
 				goto out;
 			}
 			orphan = 1;
 			ei->i_disksize = inode->i_size;
 			ext4_journal_stop(handle);
 		}
 	}
 retry:
 	if (rw == READ && ext4_should_dioread_nolock(inode))
 		ret = __blockdev_direct_IO(rw, iocb, inode,
 				 inode->i_sb->s_bdev, iov,
 				 offset, nr_segs,
 				 ext4_get_block, NULL, NULL, 0);
 	else {
 		ret = blockdev_direct_IO(rw, iocb, inode,
 				 inode->i_sb->s_bdev, iov,
 				 offset, nr_segs,
 				 ext4_get_block, NULL);
 		if (unlikely((rw & WRITE) && ret < 0)) {
 			loff_t isize = i_size_read(inode);
 			loff_t end = offset + iov_length(iov, nr_segs);
 			if (end > isize)
 				vmtruncate(inode, isize);
 		}
 	}
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
 	if (orphan) {
 		int err;
 		/* Credits for sb + inode write */
 		handle = ext4_journal_start(inode, 2);
 		if (IS_ERR(handle)) {
 			/* This is really bad luck. We've written the data
 			 * but cannot extend i_size. Bail out and pretend
 			 * the write failed... */
 			ret = PTR_ERR(handle);
 			if (inode->i_nlink)
 				ext4_orphan_del(NULL, inode);
 			goto out;
 		}
 		if (inode->i_nlink)
 			ext4_orphan_del(handle, inode);
 		if (ret > 0) {
 			loff_t end = offset + ret;
 			if (end > inode->i_size) {
 				ei->i_disksize = end;
 				i_size_write(inode, end);
 				/*
 				 * We're going to return a positive `ret'
 				 * here due to non-zero-length I/O, so there's
 				 * no way of reporting error returns from
 				 * ext4_mark_inode_dirty() to userspace.  So
 				 * ignore it.
 				 */
 				ext4_mark_inode_dirty(handle, inode);
 			}
 		}
 		err = ext4_journal_stop(handle);
 		if (ret == 0)
 			ret = err;
 	}
 out:
 	return ret;
 }
 /*
  * get_block callback used when preparing for a DIO write or buffer write.
  * We allocate an uninitialized extent if blocks haven't been allocated.
  * The extent will be converted to initialized after the IO is complete.
  *
  * @create only appears in the debug message; the mapping is always requested
  * with EXT4_GET_BLOCKS_IO_CREATE_EXT.
  */
 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
 		   struct buffer_head *bh_result, int create)
 {
 	int ret;
 	ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
 		   inode->i_ino, create);
 	ret = _ext4_get_block(inode, iblock, bh_result,
 			      EXT4_GET_BLOCKS_IO_CREATE_EXT);
 	return ret;
 }
 /*
  * end_io callback passed to blockdev_direct_IO() by ext4_ext_direct_IO().
  *
  * @iocb:     the kiocb; iocb->private holds the io_end set up by the
  *            submitter, or NULL
  * @offset:   file offset of the completed I/O
  * @size:     number of bytes completed
  * @private:  unused here
  * @ret:      result handed to aio_complete() / stored in the io_end
  * @is_async: whether the request must be completed through aio_complete()
  *
  * Without an io_end, or for a zero-byte I/O, only the aio completion is
  * signalled.  When the io_end is not flagged EXT4_IO_END_UNWRITTEN it is
  * freed first.  Otherwise the io_end is filled in, put on the inode's
  * completed-io list and its work item is queued on the dio_unwritten
  * workqueue; on that path aio_complete() is not called here, the iocb and
  * result are stored in the io_end instead (async requests only).
  */
 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 			    ssize_t size, void *private, int ret,
 			    bool is_async)
 {
 	ext4_io_end_t *io_end = iocb->private;
 	struct workqueue_struct *wq;
 	unsigned long flags;
 	struct ext4_inode_info *ei;
 	/* if not async direct IO or dio with 0 bytes write, just return */
 	if (!io_end || !size)
 		goto out;
 	/* offset is a loff_t (%lld) and size a ssize_t (%zd) */
 	ext_debug("ext4_end_io_dio(): io_end 0x%p "
 		  "for inode %lu, iocb 0x%p, offset %lld, size %zd\n",
 		  io_end, io_end->inode->i_ino, iocb, offset,
 		  size);
 	/* if not aio dio with unwritten extents, just free io and return */
 	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
 		ext4_free_io_end(io_end);
 		iocb->private = NULL;
 		goto out;
 	}
 	io_end->offset = offset;
 	io_end->size = size;
 	if (is_async) {
 		io_end->iocb = iocb;
 		io_end->result = ret;
 	}
 	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
 	/* Add the io_end to per-inode completed aio dio list */
 	ei = EXT4_I(io_end->inode);
 	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
 	list_add_tail(&io_end->list, &ei->i_completed_io_list);
 	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 	/* queue the work to convert unwritten extents to written */
 	queue_work(wq, &io_end->work);
 	iocb->private = NULL;
 	return;
 out:
 	if (is_async)
 		aio_complete(iocb, ret, 0);
 }
 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
 {
 	ext4_io_end_t *io_end = bh->b_private;
 	struct workqueue_struct *wq;
 	struct inode *inode;
 	unsigned long flags;
 	if (!test_clear_buffer_uninit(bh) || !io_end)
 		goto out;
 	if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
 		printk("sb umounted, discard end_io request for inode %lu\n",
 			io_end->inode->i_ino);
 		ext4_free_io_end(io_end);
 		goto out;
 	}
 	io_end->flag = EXT4_IO_END_UNWRITTEN;
 	inode = io_end->inode;
 	/* Add the io_end to per-inode completed io list*/
 	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
 	list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
 	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
 	wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
 	/* queue the work to convert unwritten extents to written */
 	queue_work(wq, &io_end->work);
 out:
 	bh->b_private = NULL;
 	bh->b_end_io = NULL;
 	clear_buffer_uninit(bh);
 	end_buffer_async_write(bh, uptodate);
 }
 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
 {
 	ext4_io_end_t *io_end;
 	struct page *page = bh->b_page;
 	loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
 	size_t size = bh->b_size;
 retry:
 	io_end = ext4_init_io_end(inode, GFP_ATOMIC);
 	if (!io_end) {
 		pr_warn_ratelimited("%s: allocation fail\n", __func__);
 		schedule();
 		goto retry;
 	}
 	io_end->offset = offset;
 	io_end->size = size;
 	/*
 	 * We need to hold a reference to the page to make sure it
 	 * doesn't get evicted before ext4_end_io_work() has a chance
 	 * to convert the extent from written to unwritten.
 	 */
 	io_end->page = page;
 	get_page(io_end->page);
 	bh->b_private = io_end;
 	bh->b_end_io = ext4_end_io_buffer_write;
 	return 0;
 }
 /*
  * For ext4 extent files, ext4 will do direct-io write to holes,
  * preallocated extents, and those writes that extend the file; no need to
  * fall back to buffered IO.
  *
  * For holes, we fallocate those blocks and mark them as uninitialized.
  * If those blocks were preallocated, we make sure they are split, but
  * still keep the range to write as uninitialized.
  *
  * The unwritten extents will be converted to written when DIO is completed.
  * For async direct IO, since the IO may still be pending on return, we
  * set up an end_io callback function, which will do the conversion
  * when the async direct IO completes.
  *
  * Reads, and writes whose end lies beyond i_size, are handed to
  * ext4_ind_direct_IO(), which puts the inode on the orphan list for
  * size-extending writes.
  */
 static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 			      const struct iovec *iov, loff_t offset,
 			      unsigned long nr_segs)
 {
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	size_t count = iov_length(iov, nr_segs);
 	loff_t final_size = offset + count;
 	ssize_t ret;
 	/* Anything but a write inside i_size goes the old way. */
 	if (rw != WRITE || final_size > inode->i_size)
 		return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
 	/*
 	 * We could direct write to holes and fallocate.
 	 *
 	 * Blocks allocated to fill a hole are marked as uninitialized to
 	 * prevent a parallel buffered read from exposing stale data before
 	 * DIO completes the data IO.
 	 *
 	 * For previously fallocated extents, ext4 get_block simply marks
 	 * the buffer mapped but keeps the extents uninitialized.
 	 *
 	 * In the non-AIO case the unwritten extents are converted to
 	 * written right after blockdev_direct_IO() returns.
 	 *
 	 * For async DIO the conversion has to be deferred until the IO
 	 * completes; the ext4 end_io callback takes care of it.  For that
 	 * case an io_end structure is allocated here and hooked to the iocb.
 	 */
 	iocb->private = NULL;
 	ei->cur_aio_dio = NULL;
 	if (!is_sync_kiocb(iocb)) {
 		iocb->private = ext4_init_io_end(inode, GFP_NOFS);
 		if (!iocb->private)
 			return -ENOMEM;
 		/*
 		 * Remember the io structure of the current async direct
 		 * IO so that ext4_map_blocks() can later flag in it
 		 * whether an unwritten extent needs converting when the
 		 * IO completes.
 		 */
 		ei->cur_aio_dio = iocb->private;
 	}
 	ret = blockdev_direct_IO(rw, iocb, inode,
 				 inode->i_sb->s_bdev, iov,
 				 offset, nr_segs,
 				 ext4_get_block_write,
 				 ext4_end_io_dio);
 	if (iocb->private)
 		ei->cur_aio_dio = NULL;
 	/*
 	 * The io_end structure takes a reference to the inode; it has to
 	 * be destroyed and the inode reference dropped once the IO is
 	 * complete, even for a 0 byte write or a failure.
 	 *
 	 * In the successful AIO DIO case the io_end structure is destroyed
 	 * and the inode reference dropped after the end_io callback has
 	 * been called.
 	 *
 	 * For a 0 byte write or an error, VFS direct IO won't invoke the
 	 * end_io callback, so the io_end structure is freed here.
 	 */
 	if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
 		ext4_free_io_end(iocb->private);
 		iocb->private = NULL;
 	} else if (ret > 0 &&
 		   ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN)) {
 		/*
 		 * Non-AIO case: the IO is already complete, so the
 		 * conversion can be done right here.
 		 */
 		int err = ext4_convert_unwritten_extents(inode, offset, ret);
 		if (err < 0)
 			ret = err;
 		ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
 	}
 	return ret;
 }
 /*
  * ->direct_IO entry point: dispatches to the extent-based or the
  * indirect-map implementation depending on the inode's EXTENTS flag, with
  * trace events on entry and exit.  Returns what the chosen implementation
  * returns.
  */
 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
 			      const struct iovec *iov, loff_t offset,
 			      unsigned long nr_segs)
 {
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	size_t len = iov_length(iov, nr_segs);
 	ssize_t ret;
 	trace_ext4_direct_IO_enter(inode, offset, len, rw);
 	ret = ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ?
 		ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs) :
 		ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
 	trace_ext4_direct_IO_exit(inode, offset, len, rw, ret);
 	return ret;
 }
 /*
  * Pages can be marked dirty completely asynchronously from ext4's journalling
  * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
  * much here because ->set_page_dirty is called under VFS locks.  The page is
  * not necessarily locked.
  *
  * Dirtying the page while leaving the attached buffers clean is not an
  * option, because the buffers' dirty state is "definitive".  Setting the
  * buffers dirty or jbddirty is not an option either, because all the
  * journalling code would explode.
  *
  * So the page is only marked "pending dirty" (PageChecked) here; the next
  * time writepage is called, that state is propagated into the buffers
  * appropriately.
  */
 static int ext4_journalled_set_page_dirty(struct page *page)
 {
 	int ret;
 	SetPageChecked(page);
 	ret = __set_page_dirty_nobuffers(page);
 	return ret;
 }
 /*
  * Address space operations that ext4_set_aops() installs when
  * ext4_should_order_data() holds and the DELALLOC mount option is off.
  */
 static const struct address_space_operations ext4_ordered_aops = {
 	/* page reads */
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
 	.is_partially_uptodate  = block_is_partially_uptodate,
 	/* buffered writes */
 	.write_begin		= ext4_write_begin,
 	.write_end		= ext4_ordered_write_end,
 	.writepage		= ext4_writepage,
 	/* direct I/O and block mapping */
 	.direct_IO		= ext4_direct_IO,
 	.bmap			= ext4_bmap,
 	/* page invalidation, release, migration and error removal */
 	.invalidatepage		= ext4_invalidatepage,
 	.releasepage		= ext4_releasepage,
 	.migratepage		= buffer_migrate_page,
 	.error_remove_page	= generic_error_remove_page,
 };
 /*
  * Address space operations that ext4_set_aops() installs when
  * ext4_should_writeback_data() holds and the DELALLOC mount option is off.
  */
 static const struct address_space_operations ext4_writeback_aops = {
 	/* page reads */
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
 	.is_partially_uptodate  = block_is_partially_uptodate,
 	/* buffered writes */
 	.write_begin		= ext4_write_begin,
 	.write_end		= ext4_writeback_write_end,
 	.writepage		= ext4_writepage,
 	/* direct I/O and block mapping */
 	.direct_IO		= ext4_direct_IO,
 	.bmap			= ext4_bmap,
 	/* page invalidation, release, migration and error removal */
 	.invalidatepage		= ext4_invalidatepage,
 	.releasepage		= ext4_releasepage,
 	.migratepage		= buffer_migrate_page,
 	.error_remove_page	= generic_error_remove_page,
 };
 /*
  * Address space operations that ext4_set_aops() installs as its fallback,
  * when neither the ordered nor the writeback test matches.  This table
  * installs no ->direct_IO or ->migratepage hook, and it is the only one
  * that overrides ->set_page_dirty.
  */
 static const struct address_space_operations ext4_journalled_aops = {
 	/* page reads */
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
 	.is_partially_uptodate  = block_is_partially_uptodate,
 	/* buffered writes and dirtying */
 	.write_begin		= ext4_write_begin,
 	.write_end		= ext4_journalled_write_end,
 	.writepage		= ext4_writepage,
 	.set_page_dirty		= ext4_journalled_set_page_dirty,
 	/* block mapping */
 	.bmap			= ext4_bmap,
 	/* page invalidation, release and error removal */
 	.invalidatepage		= ext4_invalidatepage,
 	.releasepage		= ext4_releasepage,
 	.error_remove_page	= generic_error_remove_page,
 };
 /*
  * Address space operations that ext4_set_aops() installs for ordered or
  * writeback inodes when the DELALLOC mount option is on.  Compared with the
  * other tables in this file it adds ->writepages and uses the ext4_da_*
  * variants of write_begin, write_end and invalidatepage.
  */
 static const struct address_space_operations ext4_da_aops = {
 	/* page reads */
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
 	.is_partially_uptodate  = block_is_partially_uptodate,
 	/* buffered writes */
 	.write_begin		= ext4_da_write_begin,
 	.write_end		= ext4_da_write_end,
 	.writepage		= ext4_writepage,
 	.writepages		= ext4_da_writepages,
 	/* direct I/O and block mapping */
 	.direct_IO		= ext4_direct_IO,
 	.bmap			= ext4_bmap,
 	/* page invalidation, release, migration and error removal */
 	.invalidatepage		= ext4_da_invalidatepage,
 	.releasepage		= ext4_releasepage,
 	.migratepage		= buffer_migrate_page,
 	.error_remove_page	= generic_error_remove_page,
 };
 /*
  * Pick the address space operations table for @inode:
  *   ordered or writeback data with DELALLOC  -> ext4_da_aops
  *   ordered data without DELALLOC            -> ext4_ordered_aops
  *   writeback data without DELALLOC          -> ext4_writeback_aops
  *   anything else                            -> ext4_journalled_aops
  */
 void ext4_set_aops(struct inode *inode)
 {
 	const struct address_space_operations *aops;
 	if (ext4_should_order_data(inode))
 		aops = test_opt(inode->i_sb, DELALLOC) ?
 			&ext4_da_aops : &ext4_ordered_aops;
 	else if (ext4_should_writeback_data(inode))
 		aops = test_opt(inode->i_sb, DELALLOC) ?
 			&ext4_da_aops : &ext4_writeback_aops;
 	else
 		aops = &ext4_journalled_aops;
 	inode->i_mapping->a_ops = aops;
 }
 /*
  * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
  * up to the end of the block which corresponds to `from'.
  * This is required during truncate. We need to physically zero the tail end
  * of that block so it doesn't yield old data if the file is later grown.
  *
  * Returns 0 on success, and also when there is nothing to zero (buffer
  * already freed, or the block is a hole).  Returns -ENOMEM if the page cannot
  * be obtained, -EIO if the block cannot be read, or the error reported by
  * the journalling helpers.
  */
 int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from)
 {
 	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
 	unsigned blocksize, length, pos;
 	ext4_lblk_t iblock;
 	struct inode *inode = mapping->host;
 	struct buffer_head *bh;
 	struct page *page;
 	int err = 0;
 	page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
 				   mapping_gfp_mask(mapping) & ~__GFP_FS);
 	/* A NULL page means the allocation failed, not a bad argument. */
 	if (!page)
 		return -ENOMEM;
 	blocksize = inode->i_sb->s_blocksize;
 	/* Bytes from `from' to the end of its block. */
 	length = blocksize - (offset & (blocksize - 1));
 	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, blocksize, 0);
 	/* Find the buffer that contains "offset" */
 	bh = page_buffers(page);
 	pos = blocksize;
 	while (offset >= pos) {
 		bh = bh->b_this_page;
 		iblock++;
 		pos += blocksize;
 	}
 	if (buffer_freed(bh)) {
 		BUFFER_TRACE(bh, "freed: skip");
 		goto unlock;
 	}
 	if (!buffer_mapped(bh)) {
 		BUFFER_TRACE(bh, "unmapped");
 		ext4_get_block(inode, iblock, bh, 0);
 		/* unmapped? It's a hole - nothing to do */
 		if (!buffer_mapped(bh)) {
 			BUFFER_TRACE(bh, "still unmapped");
 			goto unlock;
 		}
 	}
 	/* Ok, it's mapped. Make sure it's up-to-date */
 	if (PageUptodate(page))
 		set_buffer_uptodate(bh);
 	if (!buffer_uptodate(bh)) {
 		/* Assume failure until the read is seen to have succeeded. */
 		err = -EIO;
 		ll_rw_block(READ, 1, &bh);
 		wait_on_buffer(bh);
 		/* Uhhuh. Read error. Complain and punt. */
 		if (!buffer_uptodate(bh))
 			goto unlock;
 	}
 	if (ext4_should_journal_data(inode)) {
 		BUFFER_TRACE(bh, "get write access");
 		err = ext4_journal_get_write_access(handle, bh);
 		if (err)
 			goto unlock;
 	}
 	zero_user(page, offset, length);
 	BUFFER_TRACE(bh, "zeroed end of block");
 	/* Clears the provisional -EIO left over from a successful read. */
 	err = 0;
 	if (ext4_should_journal_data(inode)) {
 		err = ext4_handle_dirty_metadata(handle, inode, bh);
 	} else {
 		if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
 			err = ext4_jbd2_file_inode(handle, inode);
 		mark_buffer_dirty(bh);
 	}
 unlock:
 	unlock_page(page);
 	page_cache_release(page);
 	return err;
 }
 /*
  * Return 1 if every 32-bit word in [p, q) is zero (vacuously so for an
  * empty range), 0 as soon as a non-zero word is found.
  *
  * Probably it should be a library function... search for first non-zero word
  * or memcmp with zero_page, whatever is better for particular architecture.
  * Linus?
  */
 static inline int all_zeroes(__le32 *p, __le32 *q)
 {
 	__le32 *cur;
 	for (cur = p; cur < q; cur++) {
 		if (*cur)
 			return 0;
 	}
 	return 1;
 }
 /**
  *	ext4_find_shared - find the indirect blocks for partial truncation.
  *	@inode:	  inode in question
  *	@depth:	  depth of the affected branch
  *	@offsets: offsets of pointers in that branch (see ext4_block_to_path)
  *	@chain:	  place to store the pointers to partial indirect blocks
  *	@top:	  place to store the (detached) top of branch
  *
  *	This is a helper function used by ext4_truncate().
  *
  *	When we do truncate() we may have to clean the ends of several
  *	indirect blocks but leave the blocks themselves alive. Block is
  *	partially truncated if some data below the new i_size is referred
  *	from it (and it is on the path to the first completely truncated
  *	data block, indeed).  We have to free the top of that path along
  *	with everything to the right of the path. Since no allocation
  *	past the truncation point is possible until ext4_truncate()
  *	finishes, we may safely do the latter, but top of branch may
  *	require special attention - pageout below the truncation point
  *	might try to populate it.
  *
  *	We store the block number of the root of the top of branch in
  *	*@top (it is set to 0 when there is no such top), pointers to
  *	buffer_heads of partially truncated blocks - in @chain[].bh and
  *	pointers to their last elements that should not be removed - in
  *	@chain[].p. Return value is the pointer to last filled element
  *	of @chain.  Note that, unlike what the wording "detach" suggests,
  *	the on-disk pointer is NOT zeroed here: the code that did so is
  *	compiled out below.
  *
  *	The work left to caller to do the actual freeing of subtrees:
  *		a) free the subtree starting from *@top
  *		b) free the subtrees whose roots are stored in
  *			(@chain[i].p+1 .. end of @chain[i].bh->b_data)
  *		c) free the subtrees growing from the inode past the @chain[0].
  *			(no partially truncated stuff there).  */
 static Indirect *ext4_find_shared(struct inode *inode, int depth,
 				  ext4_lblk_t offsets[4], Indirect chain[4],
 				  __le32 *top)
 {
 	Indirect *partial, *p;
 	/* err is filled in by ext4_get_branch() but never examined here */
 	int k, err;
 	*top = 0;
 	/* Make k index the deepest non-null offset + 1 */
 	for (k = depth; k > 1 && !offsets[k-1]; k--)
 		;
 	partial = ext4_get_branch(inode, k, offsets, chain, &err);
 	/* Writer: pointers */
 	/* A NULL result is treated as "use the last of the k chain entries" */
 	if (!partial)
 		partial = chain + k-1;
 	/*
 	 * If the branch acquired continuation since we've looked at it -
 	 * fine, it should all survive and (new) top doesn't belong to us.
 	 */
 	if (!partial->key && *partial->p)
 		/* Writer: end */
 		goto no_top;
 	/*
 	 * Walk back towards the inode for as long as everything that
 	 * precedes the path pointer in the indirect block is zero.
 	 */
 	for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
 		;
 	/*
 	 * OK, we've found the last block that must survive. The rest of our
 	 * branch should be detached before unlocking. However, if that rest
 	 * of branch is all ours and does not grow immediately from the inode
 	 * it's easier to cheat and just decrement partial->p.
 	 */
 	if (p == chain + k - 1 && p > chain) {
 		p->p--;
 	} else {
 		*top = *p->p;
 		/* Nope, don't do this in ext4.  Must leave the tree intact */
 #if 0
 		*p->p = 0;
 #endif
 	}
 	/* Writer: end */

 	/* Drop the buffers of the chain entries beyond the surviving block */
 	while (partial > p) {
 		brelse(partial->bh);
 		partial--;
 	}
 no_top:
 	return partial;
 }
 /*
  * Zero a number of block pointers in either an inode or an indirect block,
  * and free the run of `count' on-disk blocks starting at `block_to_free'.
  * If the transaction has to be restarted, write access to the indirect
  * block is taken again so that it can be modified further.
  *
  * (last - first) may be greater than `count' because there can be holes in
  * the pointer range.
  *
  * Return 0 on success, 1 on invalid block range
  * and < 0 on fatal error.
  */
 static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
 			     struct buffer_head *bh,
 			     ext4_fsblk_t block_to_free,
 			     unsigned long count, __le32 *first,
 			     __le32 *last)
 {
 	int	free_flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
 	__le32	*slot;
 	int	err;
 	/* Blocks of directories and symlinks are freed as metadata. */
 	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
 		free_flags |= EXT4_FREE_BLOCKS_METADATA;
 	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
 				   count)) {
 		EXT4_ERROR_INODE(inode, "attempt to clear invalid "
 				 "blocks %llu len %lu",
 				 (unsigned long long) block_to_free, count);
 		return 1;
 	}
 	if (!try_to_extend_transaction(handle, inode))
 		goto clear;
 	/*
 	 * The transaction is going to be restarted: flush what has been
 	 * modified so far, restart, then regain write access to @bh.
 	 */
 	if (bh) {
 		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
 		err = ext4_handle_dirty_metadata(handle, inode, bh);
 		if (unlikely(err))
 			goto out_err;
 	}
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (unlikely(err))
 		goto out_err;
 	err = ext4_truncate_restart_trans(handle, inode,
 					  blocks_for_truncate(inode));
 	if (unlikely(err))
 		goto out_err;
 	if (bh) {
 		BUFFER_TRACE(bh, "retaking write access");
 		err = ext4_journal_get_write_access(handle, bh);
 		if (unlikely(err))
 			goto out_err;
 	}
 clear:
 	for (slot = first; slot < last; slot++)
 		*slot = 0;
 	ext4_free_blocks(handle, inode, NULL, block_to_free, count, free_flags);
 	return 0;
 out_err:
 	ext4_std_error(inode->i_sb, err);
 	return err;
 }
 /**
  * ext4_free_data - free a list of data blocks
  * @handle:	handle for this transaction
  * @inode:	inode we are dealing with
  * @this_bh:	indirect buffer_head which contains *@first and *@last
  * @first:	array of block numbers
  * @last:	points immediately past the end of array
  *
  * We are freeing all blocks referred from that array (numbers are stored as
  * little-endian 32-bit) and updating @inode->i_blocks appropriately.
  *
  * Contiguous runs of blocks are accumulated and released with a single
  * ext4_clear_blocks() call each.  Conveniently, if these blocks are
  * contiguous then releasing them at one time will only affect one or two
  * bitmap blocks (+ group descriptor(s) and superblock) and we won't
  * actually use a lot of journal space.
  *
  * @this_bh will be %NULL if @first and @last point into the inode's direct
  * block pointers.
  */
 static void ext4_free_data(handle_t *handle, struct inode *inode,
 			   struct buffer_head *this_bh,
 			   __le32 *first, __le32 *last)
 {
 	ext4_fsblk_t run_start = 0;	/* first block # of the pending run */
 	unsigned long run_len = 0;	/* number of blocks in the pending run */
 	__le32 *run_slot = NULL;	/* slot in inode/ind holding run_start */
 	ext4_fsblk_t blk;		/* current block # */
 	__le32 *slot;			/* current slot in inode/ind */
 	int err = 0;
 	if (this_bh) {				/* For indirect block */
 		BUFFER_TRACE(this_bh, "get_write_access");
 		err = ext4_journal_get_write_access(handle, this_bh);
 		/*
 		 * Important: without the ability to update the indirect
 		 * pointers, the blocks must not be freed.
 		 */
 		if (err)
 			return;
 	}
 	for (slot = first; slot < last; slot++) {
 		blk = le32_to_cpu(*slot);
 		if (!blk)
 			continue;
 		/* Extends the pending run? */
 		if (run_len && blk == run_start + run_len) {
 			run_len++;
 			continue;
 		}
 		/* Not contiguous: flush the pending run, if there is one. */
 		if (run_len) {
 			err = ext4_clear_blocks(handle, inode, this_bh,
 						run_start, run_len,
 						run_slot, slot);
 			if (err)
 				break;
 		}
 		run_start = blk;
 		run_slot = slot;
 		run_len = 1;
 	}
 	if (!err && run_len > 0)
 		err = ext4_clear_blocks(handle, inode, this_bh, run_start,
 					run_len, run_slot, slot);
 	if (err < 0)
 		/* fatal error */
 		return;
 	if (!this_bh)
 		return;
 	BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
 	/*
 	 * The buffer head should have an attached journal head at this
 	 * point. However, if the data is corrupted and an indirect
 	 * block pointed to itself, it would have been detached when
 	 * the block was cleared. Check for this instead of OOPSing.
 	 */
 	if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
 		ext4_handle_dirty_metadata(handle, inode, this_bh);
 	else
 		EXT4_ERROR_INODE(inode,
 				 "circular indirect block detected at "
 				 "block %llu",
 			(unsigned long long) this_bh->b_blocknr);
 }
 /**
  *	ext4_free_branches - free an array of branches
  *	@handle: JBD handle for this transaction
  *	@inode:	inode we are dealing with
  *	@parent_bh: the buffer_head which contains *@first and *@last
  *		(may be NULL, in which case no parent buffer is journalled)
  *	@first:	array of block numbers
  *	@last:	pointer immediately past the end of array
  *	@depth:	depth of the branches to free
  *
  *	We are freeing all blocks referred from these branches (numbers are
  *	stored as little-endian 32-bit) and updating @inode->i_blocks
  *	appropriately.
  *
  *	With a non-zero @depth the array is walked from @last - 1 down to
  *	@first; each referenced block is read, its contents are freed
  *	recursively with @depth - 1, and then the block itself is freed.
  *	With @depth == 0 the array is handed to ext4_free_data().
  *	Nothing is done if the handle is aborted.
  */
 static void ext4_free_branches(handle_t *handle, struct inode *inode,
 			       struct buffer_head *parent_bh,
 			       __le32 *first, __le32 *last, int depth)
 {
 	ext4_fsblk_t nr;
 	__le32 *p;
 	if (ext4_handle_is_aborted(handle))
 		return;
 	/* Note: depth is already decremented inside the branch below. */
 	if (depth--) {
 		struct buffer_head *bh;
 		int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
 		p = last;
 		while (--p >= first) {
 			nr = le32_to_cpu(*p);
 			if (!nr)
 				continue;		/* A hole */
 			/* An invalid block number stops the whole walk. */
 			if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
 						   nr, 1)) {
 				EXT4_ERROR_INODE(inode,
 						 "invalid indirect mapped "
 						 "block %lu (level %d)",
 						 (unsigned long) nr, depth);
 				break;
 			}
 			/* Go read the buffer for the next level down */
 			bh = sb_bread(inode->i_sb, nr);
 			/*
 			 * A read failure? Report the error and move on to
 			 * the next slot (should be rare).  The slot itself
 			 * is left untouched and the block is not freed.
 			 */
 			if (!bh) {
 				EXT4_ERROR_INODE_BLOCK(inode, nr,
 						       "Read failure");
 				continue;
 			}
 			/* This zaps the entire block.  Bottom up. */
 			BUFFER_TRACE(bh, "free child branches");
 			ext4_free_branches(handle, inode, bh,
 					(__le32 *) bh->b_data,
 					(__le32 *) bh->b_data + addr_per_block,
 					depth);
 			brelse(bh);
 			/*
 			 * Everything below this pointer has been
 			 * released.  Now let this top-of-subtree go.
 			 *
 			 * We want the freeing of this indirect block to be
 			 * atomic in the journal with the updating of the
 			 * bitmap block which owns it.  So make some room in
 			 * the journal.
 			 *
 			 * We zero the parent pointer *after* freeing its
 			 * pointee in the bitmaps, so if extend_transaction()
 			 * for some reason fails to put the bitmap changes and
 			 * the release into the same transaction, recovery
 			 * will merely complain about releasing a free block,
 			 * rather than leaking blocks.
 			 */
 			if (ext4_handle_is_aborted(handle))
 				return;
 			/*
 			 * NOTE(review): the return values of
 			 * ext4_mark_inode_dirty() and
 			 * ext4_truncate_restart_trans() are not checked here.
 			 */
 			if (try_to_extend_transaction(handle, inode)) {
 				ext4_mark_inode_dirty(handle, inode);
 				ext4_truncate_restart_trans(handle, inode,
 					    blocks_for_truncate(inode));
 			}
 			/*
 			 * The forget flag here is critical because if
 			 * we are journaling (and not doing data
 			 * journaling), we have to make sure a revoke
 			 * record is written to prevent the journal
 			 * replay from overwriting the (former)
 			 * indirect block if it gets reallocated as a
 			 * data block.  This must happen in the same
 			 * transaction where the data blocks are
 			 * actually freed.
 			 */
 			ext4_free_blocks(handle, inode, NULL, nr, 1,
 					 EXT4_FREE_BLOCKS_METADATA|
 					 EXT4_FREE_BLOCKS_FORGET);
 			if (parent_bh) {
 				/*
 				 * The block which we have just freed is
 				 * pointed to by an indirect block: journal it.
 				 * The slot is only zeroed when write access
 				 * to the parent buffer was obtained.
 				 */
 				BUFFER_TRACE(parent_bh, "get_write_access");
 				if (!ext4_journal_get_write_access(handle,
 								   parent_bh)){
 					*p = 0;
 					BUFFER_TRACE(parent_bh,
 					"call ext4_handle_dirty_metadata");
 					ext4_handle_dirty_metadata(handle,
 								   inode,
 								   parent_bh);
 				}
 			}
 		}
 	} else {
 		/* We have reached the bottom of the tree. */
 		BUFFER_TRACE(parent_bh, "free data blocks");
 		ext4_free_data(handle, inode, parent_bh, first, last);
 	}
 }
 int ext4_can_truncate(struct inode *inode)
 {
 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
 		return 0;
 	if (S_ISREG(inode->i_mode))
 		return 1;
 	if (S_ISDIR(inode->i_mode))
 		return 1;
 	if (S_ISLNK(inode->i_mode))
 		return !ext4_inode_is_fast_symlink(inode);
 	return 0;
 }
 /*
  * ext4_truncate()
  *
  * We block out ext4_get_block() block instantiations across the entire
  * transaction, and VFS/VM ensures that ext4_truncate() cannot run
  * simultaneously on behalf of the same inode.
  *
  * As we work through the truncate and commit bits of it to the journal there
  * is one core, guiding principle: the file's tree must always be consistent on
  * disk.  We must be able to restart the truncate after a crash.
  *
  * The file's tree may be transiently inconsistent in memory (although it
  * probably isn't), but whenever we close off and commit a journal transaction,
  * the contents of (the filesystem + the journal) must be consistent and
  * restartable.  It's pretty simple, really: bottom up, right to left (although
  * left-to-right works OK too).
  *
  * Note that at recovery time, journal replay occurs *before* the restart of
  * truncate against the orphan inode list.
  *
  * The committed inode has the new, desired i_size (which is the same as
  * i_disksize in this case).  After a crash, ext4_orphan_cleanup() will see
  * that this inode's truncate did not complete and it will again call
  * ext4_truncate() to have another go.  So there will be instantiated blocks
  * to the right of the truncation point in a crashed ext4 filesystem.  But
  * that's fine - as long as they are linked from the inode, the post-crash
  * ext4_truncate() run will find them and release them.
  */
 void ext4_truncate(struct inode *inode)
 {
 	handle_t *handle;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	__le32 *i_data = ei->i_data;
 	int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
 	struct address_space *mapping = inode->i_mapping;
 	ext4_lblk_t offsets[4];
 	Indirect chain[4];
 	Indirect *partial;
 	__le32 nr = 0;
 	int n = 0;
 	ext4_lblk_t last_block, max_block;
 	unsigned blocksize = inode->i_sb->s_blocksize;

 	trace_ext4_truncate_enter(inode);
 	/*
 	 * Every return path from here on leaves through out_trace so that
 	 * each "enter" trace event is paired with an "exit" event.
 	 */
 	if (!ext4_can_truncate(inode))
 		goto out_trace;
 	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
 	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
 		ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
 	/* Extent-mapped files are handled entirely by the extent code. */
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		ext4_ext_truncate(inode);
 		goto out_trace;
 	}
 	handle = start_transaction(inode);
 	if (IS_ERR(handle))
 		goto out_trace;		/* AKPM: return what? */
 	/* First block past the new EOF, and the indirect-mapping limit. */
 	last_block = (inode->i_size + blocksize-1)
 					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
 	max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
 					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
 	/* The new size is not block aligned: handle the partial tail block. */
 	if (inode->i_size & (blocksize - 1))
 		if (ext4_block_truncate_page(handle, mapping, inode->i_size))
 			goto out_stop;
 	if (last_block != max_block) {
 		n = ext4_block_to_path(inode, last_block, offsets, NULL);
 		if (n == 0)
 			goto out_stop;	/* error */
 	}
 	/*
 	 * OK.  This truncate is going to happen.  We add the inode to the
 	 * orphan list, so that if this truncate spans multiple transactions,
 	 * and we crash, we will resume the truncate when the filesystem
 	 * recovers.  It also marks the inode dirty, to catch the new size.
 	 *
 	 * Implication: the file must always be in a sane, consistent
 	 * truncatable state while each transaction commits.
 	 */
 	if (ext4_orphan_add(handle, inode))
 		goto out_stop;
 	/*
 	 * From here we block out all ext4_get_block() callers who want to
 	 * modify the block allocation tree.
 	 */
 	down_write(&ei->i_data_sem);
 	ext4_discard_preallocations(inode);
 	/*
 	 * The orphan list entry will now protect us from any crash which
 	 * occurs before the truncate completes, so it is now safe to propagate
 	 * the new, shorter inode size (held for now in i_size) into the
 	 * on-disk inode. We do this via i_disksize, which is the value which
 	 * ext4 *really* writes onto the disk inode.
 	 */
 	ei->i_disksize = inode->i_size;
 	if (last_block == max_block) {
 		/*
 		 * It is unnecessary to free any data blocks if last_block is
 		 * equal to the indirect block limit.
 		 */
 		goto out_unlock;
 	} else if (n == 1) {		/* direct blocks */
 		ext4_free_data(handle, inode, NULL, i_data+offsets[0],
 			       i_data + EXT4_NDIR_BLOCKS);
 		goto do_indirects;
 	}
 	partial = ext4_find_shared(inode, n, offsets, chain, &nr);
 	/* Kill the top of shared branch (not detached) */
 	if (nr) {
 		if (partial == chain) {
 			/* Shared branch grows from the inode */
 			ext4_free_branches(handle, inode, NULL,
 					   &nr, &nr+1, (chain+n-1) - partial);
 			*partial->p = 0;
 			/*
 			 * We mark the inode dirty prior to restart,
 			 * and prior to stop.  No need for it here.
 			 */
 		} else {
 			/* Shared branch grows from an indirect block */
 			BUFFER_TRACE(partial->bh, "get_write_access");
 			ext4_free_branches(handle, inode, partial->bh,
 					partial->p,
 					partial->p+1, (chain+n-1) - partial);
 		}
 	}
 	/* Clear the ends of indirect blocks on the shared branch */
 	while (partial > chain) {
 		ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
 				   (__le32*)partial->bh->b_data+addr_per_block,
 				   (chain+n-1) - partial);
 		BUFFER_TRACE(partial->bh, "call brelse");
 		brelse(partial->bh);
 		partial--;
 	}
 do_indirects:
 	/*
 	 * Kill the remaining (whole) subtrees.  The cases deliberately fall
 	 * through: starting from the level the truncation point lives in,
 	 * every deeper indirection tree is released as well.
 	 */
 	switch (offsets[0]) {
 	default:
 		nr = i_data[EXT4_IND_BLOCK];
 		if (nr) {
 			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
 			i_data[EXT4_IND_BLOCK] = 0;
 		}
 		/* fall through */
 	case EXT4_IND_BLOCK:
 		nr = i_data[EXT4_DIND_BLOCK];
 		if (nr) {
 			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
 			i_data[EXT4_DIND_BLOCK] = 0;
 		}
 		/* fall through */
 	case EXT4_DIND_BLOCK:
 		nr = i_data[EXT4_TIND_BLOCK];
 		if (nr) {
 			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
 			i_data[EXT4_TIND_BLOCK] = 0;
 		}
 		/* fall through */
 	case EXT4_TIND_BLOCK:
 		;
 	}
 out_unlock:
 	up_write(&ei->i_data_sem);
 	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 	/*
 	 * In a multi-transaction truncate, we only make the final transaction
 	 * synchronous
 	 */
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
 out_stop:
 	/*
 	 * If this was a simple ftruncate(), and the file will remain alive
 	 * then we need to clear up the orphan record which we created above.
 	 * However, if this was a real unlink then we were called by
 	 * ext4_delete_inode(), and we allow that function to clean up the
 	 * orphan info for us.
 	 */
 	if (inode->i_nlink)
 		ext4_orphan_del(handle, inode);
 	ext4_journal_stop(handle);
 out_trace:
 	trace_ext4_truncate_exit(inode);
 }
 /*
  * ext4_get_inode_loc returns with an extra refcount against the inode's
  * underlying buffer_head on success. If 'in_mem' is true, we have all
  * data in memory that is needed to recreate the on-disk version of this
  * inode.
  *
  * On success 0 is returned and iloc->block_group, iloc->offset and
  * iloc->bh describe where the raw inode lives.  On failure -EIO is
  * returned and iloc->bh is left NULL.
  */
 static int __ext4_get_inode_loc(struct inode *inode,
 				struct ext4_iloc *iloc, int in_mem)
 {
 	struct ext4_group_desc	*gdp;
 	struct buffer_head	*bh;
 	struct super_block	*sb = inode->i_sb;
 	ext4_fsblk_t		block;
 	int			inodes_per_block, inode_offset;
 	/* Make sure callers never see a stale bh on an error return. */
 	iloc->bh = NULL;
 	if (!ext4_valid_inum(sb, inode->i_ino))
 		return -EIO;
 	/* Inode numbers start at 1, hence the "- 1". */
 	iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
 	gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
 	if (!gdp)
 		return -EIO;
 	/*
 	 * Figure out the offset within the block group inode table
 	 */
 	inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
 	inode_offset = ((inode->i_ino - 1) %
 			EXT4_INODES_PER_GROUP(sb));
 	/* Inode table block holding this inode, and byte offset inside it. */
 	block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
 	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
 	/*
 	 * NOTE(review): a NULL return from sb_getblk() is reported as a
 	 * filesystem error with -EIO; presumably it can also mean a plain
 	 * allocation failure -- confirm whether -ENOMEM would be more apt.
 	 */
 	bh = sb_getblk(sb, block);
 	if (!bh) {
 		EXT4_ERROR_INODE_BLOCK(inode, block,
 				       "unable to read itable block");
 		return -EIO;
 	}
 	if (!buffer_uptodate(bh)) {
 		lock_buffer(bh);
 		/*
 		 * If the buffer has the write error flag, we have failed
 		 * to write out another inode in the same block.  In this
 		 * case, we don't have to read the block because we may
 		 * read the old inode data successfully.
 		 */
 		if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
 			set_buffer_uptodate(bh);
 		if (buffer_uptodate(bh)) {
 			/* someone brought it uptodate while we waited */
 			unlock_buffer(bh);
 			goto has_buffer;
 		}
 		/*
 		 * If we have all information of the inode in memory and this
 		 * is the only valid inode in the block, we need not read the
 		 * block.
 		 */
 		if (in_mem) {
 			struct buffer_head *bitmap_bh;
 			int i, start;
 			/*
 			 * Group-relative index of the first inode stored in
 			 * this itable block.  The mask assumes
 			 * inodes_per_block is a power of two -- TODO confirm.
 			 */
 			start = inode_offset & ~(inodes_per_block - 1);
 			/* Is the inode bitmap in cache? */
 			bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
 			if (!bitmap_bh)
 				goto make_io;
 			/*
 			 * If the inode bitmap isn't in cache then the
 			 * optimisation may end up performing two reads instead
 			 * of one, so skip it.
 			 */
 			if (!buffer_uptodate(bitmap_bh)) {
 				brelse(bitmap_bh);
 				goto make_io;
 			}
 			/* Look for any other in-use inode in the same block. */
 			for (i = start; i < start + inodes_per_block; i++) {
 				if (i == inode_offset)
 					continue;
 				if (ext4_test_bit(i, bitmap_bh->b_data))
 					break;
 			}
 			brelse(bitmap_bh);
 			/* The loop ran to completion: no other inode in use. */
 			if (i == start + inodes_per_block) {
 				/* all other inodes are free, so skip I/O */
 				memset(bh->b_data, 0, bh->b_size);
 				set_buffer_uptodate(bh);
 				unlock_buffer(bh);
 				goto has_buffer;
 			}
 		}
 make_io:
 		/*
 		 * If we need to do any I/O, try to pre-readahead extra
 		 * blocks from the inode table.
 		 */
 		if (EXT4_SB(sb)->s_inode_readahead_blks) {
 			ext4_fsblk_t b, end, table;
 			unsigned num;
 			table = ext4_inode_table(sb, gdp);
 			/* s_inode_readahead_blks is always a power of 2 */
 			b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
 			/* Never start before the beginning of the table. */
 			if (table > b)
 				b = table;
 			end = b + EXT4_SB(sb)->s_inode_readahead_blks;
 			num = EXT4_INODES_PER_GROUP(sb);
 			/* With GDT_CSUM, leave out the unused tail of the table. */
 			if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 				       EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
 				num -= ext4_itable_unused_count(sb, gdp);
 			/* 'table' now marks the end of the used inode table. */
 			table += num / inodes_per_block;
 			if (end > table)
 				end = table;
 			while (b <= end)
 				sb_breadahead(sb, b++);
 		}
 		/*
 		 * There are other valid inodes in the buffer, this inode
 		 * has in-inode xattrs, or we don't have this inode in memory.
 		 * Read the block from disk.
 		 */
 		trace_ext4_load_inode(inode);
 		/* Take an extra reference on bh for the read submitted below. */
 		get_bh(bh);
 		bh->b_end_io = end_buffer_read_sync;
 		submit_bh(READ_META, bh);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
 			EXT4_ERROR_INODE_BLOCK(inode, block,
 					       "unable to read itable block");
 			brelse(bh);
 			return -EIO;
 		}
 	}
 has_buffer:
 	/* Hand our reference on bh over to the caller through iloc. */
 	iloc->bh = bh;
 	return 0;
 }
 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
 {
 	/*
 	 * All inode data except xattrs is held in memory here, so the
 	 * in-memory shortcut is only usable when the inode state does not
 	 * have EXT4_STATE_XATTR set.
 	 */
 	int in_mem = !ext4_test_inode_state(inode, EXT4_STATE_XATTR);

 	return __ext4_get_inode_loc(inode, iloc, in_mem);
 }
 /* Propagate flags from EXT4_I(inode)->i_flags to the VFS inode->i_flags. */
 void ext4_set_inode_flags(struct inode *inode)
 {
 	unsigned int ext4_fl = EXT4_I(inode)->i_flags;
 	unsigned int vfs_fl = 0;

 	/* Translate each ext4 flag into its VFS counterpart. */
 	if (ext4_fl & EXT4_SYNC_FL)
 		vfs_fl |= S_SYNC;
 	if (ext4_fl & EXT4_APPEND_FL)
 		vfs_fl |= S_APPEND;
 	if (ext4_fl & EXT4_IMMUTABLE_FL)
 		vfs_fl |= S_IMMUTABLE;
 	if (ext4_fl & EXT4_NOATIME_FL)
 		vfs_fl |= S_NOATIME;
 	if (ext4_fl & EXT4_DIRSYNC_FL)
 		vfs_fl |= S_DIRSYNC;

 	/* Drop the old values of the mirrored bits, then install the new. */
 	inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
 	inode->i_flags |= vfs_fl;
 }
 /*
  * Propagate flags from the VFS i_flags to EXT4_I(inode)->i_flags.
  *
  * The update is done with a cmpxchg() retry loop: if ei->i_flags changed
  * between the read and the exchange, the new value is recomputed.
  */
 void ext4_get_inode_flags(struct ext4_inode_info *ei)
 {
 	const unsigned long mirrored = EXT4_SYNC_FL | EXT4_APPEND_FL |
 				       EXT4_IMMUTABLE_FL | EXT4_NOATIME_FL |
 				       EXT4_DIRSYNC_FL;
 	unsigned long seen, wanted;
 	unsigned int vfs;

 	for (;;) {
 		vfs = ei->vfs_inode.i_flags;
 		seen = ei->i_flags;

 		/* Start from the current flags minus the mirrored bits. */
 		wanted = seen & ~mirrored;
 		if (vfs & S_SYNC)
 			wanted |= EXT4_SYNC_FL;
 		if (vfs & S_APPEND)
 			wanted |= EXT4_APPEND_FL;
 		if (vfs & S_IMMUTABLE)
 			wanted |= EXT4_IMMUTABLE_FL;
 		if (vfs & S_NOATIME)
 			wanted |= EXT4_NOATIME_FL;
 		if (vfs & S_DIRSYNC)
 			wanted |= EXT4_DIRSYNC_FL;

 		if (cmpxchg(&ei->i_flags, seen, wanted) == seen)
 			break;
 	}
 }
 /*
  * Decode the block count stored in an on-disk inode.
  *
  * Without the HUGE_FILE feature only the low 32 bits are used.  With it,
  * the 48-bit combined field is read and, if the inode carries
  * EXT4_INODE_HUGE_FILE, shifted left by (i_blkbits - 9) because the stored
  * value is then in filesystem blocks.
  */
 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
 				  struct ext4_inode_info *ei)
 {
 	struct inode *inode = &(ei->vfs_inode);
 	blkcnt_t count;

 	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
 				EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
 		return le32_to_cpu(raw_inode->i_blocks_lo);

 	/* we are using combined 48 bit field */
 	count = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
 				le32_to_cpu(raw_inode->i_blocks_lo);

 	/* i_blocks represent file system block size */
 	if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE))
 		count <<= inode->i_blkbits - 9;

 	return count;
 }
 /*
  * Look up inode number 'ino' on 'sb', reading it from disk if it is not
  * already cached.
  *
  * Returns the inode on success.  A freshly read inode is unlocked with
  * unlock_new_inode() before it is returned.  On failure an ERR_PTR() is
  * returned: -ENOMEM if iget_locked() fails, -ESTALE for a deleted inode,
  * -EIO for inconsistent on-disk contents, or whatever error the location
  * lookup or the block/extent validation reported.
  */
 struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 {
 	struct ext4_iloc iloc;
 	struct ext4_inode *raw_inode;
 	struct ext4_inode_info *ei;
 	struct inode *inode;
 	journal_t *journal = EXT4_SB(sb)->s_journal;
 	long ret;
 	int block;
 	inode = iget_locked(sb, ino);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 	/* Already set up by an earlier lookup: nothing left to do. */
 	if (!(inode->i_state & I_NEW))
 		return inode;
 	ei = EXT4_I(inode);
 	/* So that brelse(iloc.bh) at bad_inode is safe on early failure. */
 	iloc.bh = NULL;
 	/* in_mem == 0: nothing is in memory yet, the block must be read. */
 	ret = __ext4_get_inode_loc(inode, &iloc, 0);
 	if (ret < 0)
 		goto bad_inode;
 	raw_inode = ext4_raw_inode(&iloc);
 	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
 	inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
 	inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
 	/* Unless mounted with NO_UID32, merge in the upper 16 id bits. */
 	if (!(test_opt(inode->i_sb, NO_UID32))) {
 		inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
 		inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
 	}
 	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
 	ext4_clear_state_flags(ei);	/* Only relevant on 32-bit archs */
 	ei->i_dir_start_lookup = 0;
 	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
 	/* We now have enough fields to check if the inode was active or not.
 	 * This is needed because nfsd might try to access dead inodes
 	 * the test is that same one that e2fsck uses
 	 * NeilBrown 1999oct15
 	 */
 	if (inode->i_nlink == 0) {
 		if (inode->i_mode == 0 ||
 		    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
 			/* this inode is deleted */
 			ret = -ESTALE;
 			goto bad_inode;
 		}
 		/* The only unlinked inodes we let through here have
 		 * valid i_mode and are being read by the orphan
 		 * recovery code: that's fine, we're about to complete
 		 * the process of deleting those. */
 	}
 	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
 	inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
 	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
 	/* The high 16 bits of the ACL block are only used with 64BIT. */
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
 		ei->i_file_acl |=
 			((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
 	inode->i_size = ext4_isize(raw_inode);
 	ei->i_disksize = inode->i_size;
 #ifdef CONFIG_QUOTA
 	ei->i_reserved_quota = 0;
 #endif
 	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
 	ei->i_block_group = iloc.block_group;
 	ei->i_last_alloc_group = ~0;
 	/*
 	 * NOTE! The in-memory inode i_data array is in little-endian order
 	 * even on big-endian machines: we do NOT byteswap the block numbers!
 	 */
 	for (block = 0; block < EXT4_N_BLOCKS; block++)
 		ei->i_data[block] = raw_inode->i_block[block];
 	INIT_LIST_HEAD(&ei->i_orphan);
 	/*
 	 * Set transaction id's of transactions that have to be committed
 	 * to finish f[data]sync. We set them to currently running transaction
 	 * as we cannot be sure that the inode or some of its metadata isn't
 	 * part of the transaction - the inode could have been reclaimed and
 	 * now it is reread from disk.
 	 */
 	if (journal) {
 		transaction_t *transaction;
 		tid_t tid;
 		read_lock(&journal->j_state_lock);
 		/* Prefer the running transaction, else the committing one. */
 		if (journal->j_running_transaction)
 			transaction = journal->j_running_transaction;
 		else
 			transaction = journal->j_committing_transaction;
 		/* With neither, use the journal's commit sequence instead. */
 		if (transaction)
 			tid = transaction->t_tid;
 		else
 			tid = journal->j_commit_sequence;
 		read_unlock(&journal->j_state_lock);
 		ei->i_sync_tid = tid;
 		ei->i_datasync_tid = tid;
 	}
 	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
 		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
 		/* The extra area must fit inside the on-disk inode. */
 		if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
 		    EXT4_INODE_SIZE(inode->i_sb)) {
 			ret = -EIO;
 			goto bad_inode;
 		}
 		if (ei->i_extra_isize == 0) {
 			/* The extra space is currently unused. Use it. */
 			ei->i_extra_isize = sizeof(struct ext4_inode) -
 					    EXT4_GOOD_OLD_INODE_SIZE;
 		} else {
 			/* An xattr magic right after the extra fields marks
 			 * the presence of in-inode extended attributes. */
 			__le32 *magic = (void *)raw_inode +
 					EXT4_GOOD_OLD_INODE_SIZE +
 					ei->i_extra_isize;
 			if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
 				ext4_set_inode_state(inode, EXT4_STATE_XATTR);
 		}
 	} else
 		ei->i_extra_isize = 0;
 	EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
 	EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
 	EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
 	EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
 	inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
 	/* Large inodes may also carry the upper 32 bits of the version. */
 	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
 		if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
 			inode->i_version |=
 			(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
 	}
 	/*
 	 * Sanity-check what the inode points at: the xattr block first, then
 	 * either the in-inode extent tree or the block references.
 	 */
 	ret = 0;
 	if (ei->i_file_acl &&
 	    !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
 		EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
 				 ei->i_file_acl);
 		ret = -EIO;
 		goto bad_inode;
 	} else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 		    (S_ISLNK(inode->i_mode) &&
 		     !ext4_inode_is_fast_symlink(inode)))
 			/* Validate extent which is part of inode */
 			ret = ext4_ext_check_inode(inode);
 	} else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 		   (S_ISLNK(inode->i_mode) &&
 		    !ext4_inode_is_fast_symlink(inode))) {
 		/* Validate block references which are part of inode */
 		ret = ext4_check_inode_blockref(inode);
 	}
 	if (ret)
 		goto bad_inode;
 	/* Install the operation tables matching the file type. */
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &ext4_file_inode_operations;
 		inode->i_fop = &ext4_file_operations;
 		ext4_set_aops(inode);
 	} else if (S_ISDIR(inode->i_mode)) {
 		inode->i_op = &ext4_dir_inode_operations;
 		inode->i_fop = &ext4_dir_operations;
 	} else if (S_ISLNK(inode->i_mode)) {
 		if (ext4_inode_is_fast_symlink(inode)) {
 			/* The link target is stored in i_data itself. */
 			inode->i_op = &ext4_fast_symlink_inode_operations;
 			nd_terminate_link(ei->i_data, inode->i_size,
 				sizeof(ei->i_data) - 1);
 		} else {
 			inode->i_op = &ext4_symlink_inode_operations;
 			ext4_set_aops(inode);
 		}
 	} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
 	      S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
 		inode->i_op = &ext4_special_inode_operations;
 		/* A non-zero i_block[0] holds the device number in the old
 		 * encoding; otherwise i_block[1] holds the new encoding. */
 		if (raw_inode->i_block[0])
 			init_special_inode(inode, inode->i_mode,
 			   old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
 		else
 			init_special_inode(inode, inode->i_mode,
 			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
 	} else {
 		ret = -EIO;
 		EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
 		goto bad_inode;
 	}
 	/* Done with the raw inode: drop the itable buffer reference. */
 	brelse(iloc.bh);
 	ext4_set_inode_flags(inode);
 	unlock_new_inode(inode);
 	return inode;
 bad_inode:
 	brelse(iloc.bh);
 	iget_failed(inode);
 	return ERR_PTR(ret);
 }
 /*
  * Encode inode->i_blocks into the on-disk inode.
  *
  * Counts that fit in 32 bits are always storable.  Anything larger needs
  * the HUGE_FILE feature (else -EFBIG is returned): up to 48 bits the count
  * is stored as-is, beyond that it is shifted right by (i_blkbits - 9) and
  * the inode is flagged EXT4_INODE_HUGE_FILE.  Returns 0 on success.
  */
 static int ext4_inode_blocks_set(handle_t *handle,
 				struct ext4_inode *raw_inode,
 				struct ext4_inode_info *ei)
 {
 	struct inode *inode = &(ei->vfs_inode);
 	struct super_block *sb = inode->i_sb;
 	u64 count = inode->i_blocks;

 	/* More than 32 bits can only be stored with the huge_file feature. */
 	if (count > ~0U &&
 	    !EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
 		return -EFBIG;

 	if (count > 0xffffffffffffULL) {
 		/* Too big for 48 bits: store in file system block size. */
 		ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
 		count = count >> (inode->i_blkbits - 9);
 	} else {
 		/* Representable directly as a multiple of 512 bytes. */
 		ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
 	}

 	/* For a value that fits in 32 bits the high part is simply zero. */
 	raw_inode->i_blocks_lo   = cpu_to_le32(count);
 	raw_inode->i_blocks_high = cpu_to_le16(count >> 32);
 	return 0;
 }
 /*
  * Post the struct inode info into an on-disk inode location in the
  * buffer-cache.  This gobbles the caller's reference to the
  * buffer_head in the inode location struct.
  *
  * The caller must have write access to iloc->bh.
  *
  * Returns 0 on success or a negative error.  A failure to encode the
  * block count (ext4_inode_blocks_set()) is reported to the caller rather
  * than being silently dropped.  Any error is also passed to
  * ext4_std_error().
  */
 static int ext4_do_update_inode(handle_t *handle,
 				struct inode *inode,
 				struct ext4_iloc *iloc)
 {
 	struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct buffer_head *bh = iloc->bh;
 	int err = 0, rc, block;

 	/* For fields not tracked in the in-memory inode,
 	 * initialise them to zero for new inodes. */
 	if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
 		memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
 	/* Pick up flag changes made on the VFS inode before writing them. */
 	ext4_get_inode_flags(ei);
 	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
 	if (!(test_opt(inode->i_sb, NO_UID32))) {
 		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
 		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
 		/*
 		 * Fix up interoperability with old kernels. Otherwise, old
 		 * inodes get re-used with the upper 16 bits of the uid/gid
 		 * intact
 		 */
 		if (!ei->i_dtime) {
 			raw_inode->i_uid_high =
 				cpu_to_le16(high_16_bits(inode->i_uid));
 			raw_inode->i_gid_high =
 				cpu_to_le16(high_16_bits(inode->i_gid));
 		} else {
 			raw_inode->i_uid_high = 0;
 			raw_inode->i_gid_high = 0;
 		}
 	} else {
 		raw_inode->i_uid_low =
 			cpu_to_le16(fs_high2lowuid(inode->i_uid));
 		raw_inode->i_gid_low =
 			cpu_to_le16(fs_high2lowgid(inode->i_gid));
 		raw_inode->i_uid_high = 0;
 		raw_inode->i_gid_high = 0;
 	}
 	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
 	EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
 	EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
 	EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
 	EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
 	/*
 	 * Keep the error code: bailing out with err still 0 would tell the
 	 * caller the inode was written when it was not.
 	 */
 	err = ext4_inode_blocks_set(handle, raw_inode, ei);
 	if (err)
 		goto out_brelse;
 	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
 	raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
 	/* i_file_acl_high is not written on filesystems created by Hurd. */
 	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
 	    cpu_to_le32(EXT4_OS_HURD))
 		raw_inode->i_file_acl_high =
 			cpu_to_le16(ei->i_file_acl >> 32);
 	raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
 	ext4_isize_set(raw_inode, ei->i_disksize);
 	if (ei->i_disksize > 0x7fffffffULL) {
 		struct super_block *sb = inode->i_sb;

 		if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
 				EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
 				EXT4_SB(sb)->s_es->s_rev_level ==
 				cpu_to_le32(EXT4_GOOD_OLD_REV)) {
 			/* If this is the first large file
 			 * created, add a flag to the superblock.
 			 */
 			err = ext4_journal_get_write_access(handle,
 					EXT4_SB(sb)->s_sbh);
 			if (err)
 				goto out_brelse;
 			ext4_update_dynamic_rev(sb);
 			EXT4_SET_RO_COMPAT_FEATURE(sb,
 					EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
 			sb->s_dirt = 1;
 			ext4_handle_sync(handle);
 			err = ext4_handle_dirty_metadata(handle, NULL,
 					EXT4_SB(sb)->s_sbh);
 		}
 	}
 	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
 		/* Device nodes keep their device number in i_block[0..1]. */
 		if (old_valid_dev(inode->i_rdev)) {
 			raw_inode->i_block[0] =
 				cpu_to_le32(old_encode_dev(inode->i_rdev));
 			raw_inode->i_block[1] = 0;
 		} else {
 			raw_inode->i_block[0] = 0;
 			raw_inode->i_block[1] =
 				cpu_to_le32(new_encode_dev(inode->i_rdev));
 			raw_inode->i_block[2] = 0;
 		}
 	} else
 		for (block = 0; block < EXT4_N_BLOCKS; block++)
 			raw_inode->i_block[block] = ei->i_data[block];
 	raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
 	if (ei->i_extra_isize) {
 		if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
 			raw_inode->i_version_hi =
 			cpu_to_le32(inode->i_version >> 32);
 		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
 	}
 	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
 	rc = ext4_handle_dirty_metadata(handle, NULL, bh);
 	/* Report the first error seen; do not let a later success hide it. */
 	if (!err)
 		err = rc;
 	ext4_clear_inode_state(inode, EXT4_STATE_NEW);
 	ext4_update_inode_fsync_trans(handle, inode, 0);
 out_brelse:
 	brelse(bh);
 	ext4_std_error(inode->i_sb, err);
 	return err;
 }
 /*
  * ext4_write_inode()
  *
  * We are called from a few places:
  *
  * - Within generic_file_write() for O_SYNC files.
  *   Here, there will be no transaction running. We wait for any running
  *   transaction to commit.
  *
  * - Within sys_sync(), kupdate and such.
  *   We wait on commit, if told to.
  *
  * - Within prune_icache() (PF_MEMALLOC == true)
  *   Here we simply return.  We can't afford to block kswapd on the
  *   journal commit.
  *
  * In all cases it is actually safe for us to return without doing anything,
  * because the inode has been copied into a raw inode buffer in
  * ext4_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
  * knfsd.
  *
  * Note that we are absolutely dependent upon all inode dirtiers doing the
  * right thing: they *must* call mark_inode_dirty() after dirtying info in
  * which we are interested.
  *
  * It would be a bug for them to not do this.  The code:
  *
  *	mark_inode_dirty(inode)
  *	stuff();
  *	inode->i_size = expr;
  *
  * is in error because a kswapd-driven write_inode() could occur while
  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
  * will no longer be on the superblock's dirty inode list.
  */
 int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct ext4_iloc iloc;
 	int err;

 	/* Called with PF_MEMALLOC set: do nothing. */
 	if (current->flags & PF_MEMALLOC)
 		return 0;

 	if (!EXT4_SB(inode->i_sb)->s_journal) {
 		/* No journal: sync the inode table buffer ourselves. */
 		err = __ext4_get_inode_loc(inode, &iloc, 0);
 		if (err)
 			return err;
 		if (wbc->sync_mode == WB_SYNC_ALL)
 			sync_dirty_buffer(iloc.bh);
 		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
 			EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
 					 "IO error syncing inode");
 			err = -EIO;
 		}
 		brelse(iloc.bh);
 		return err;
 	}

 	/* Journalled: a handle must not already be active here. */
 	if (ext4_journal_current_handle()) {
 		jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
 		dump_stack();
 		return -EIO;
 	}

 	/* Only a WB_SYNC_ALL request forces a commit. */
 	if (wbc->sync_mode != WB_SYNC_ALL)
 		return 0;

 	return ext4_force_commit(inode->i_sb);
 }
 /*
  * ext4_setattr()
  *
  * Called from notify_change.
  *
  * We want to trap VFS attempts to truncate the file as soon as
  * possible.  In particular, we want to make sure that when the VFS
  * shrinks i_size, we put the inode on the orphan list and modify
  * i_disksize immediately, so that during the subsequent flushing of
  * dirty pages and freeing of disk blocks, we can guarantee that any
  * commit will leave the blocks being flushed in an unused state on
  * disk.  (On recovery, the inode will get truncated and the blocks will
  * be freed, so we have a strong guarantee that no future commit will
  * leave these blocks visible to the user.)
  *
  * Another thing we have to assure is that if we are in ordered mode
  * and inode is still attached to the committing transaction, we must
  * start writeout of all the dirty pages which are being truncated.
  * This way we are sure that all the data written in the previous
  * transaction are already on disk (truncate waits for pages under
  * writeback).
  *
  * Called with inode->i_mutex down.
  */
 int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
 	/* 'error' is the primary result; 'rc' holds secondary results that
 	 * are only reported when 'error' is still 0 at the end. */
 	int error, rc = 0;
 	/* Set once the inode has been put on the orphan list below. */
 	int orphan = 0;
 	const unsigned int ia_valid = attr->ia_valid;
 	error = inode_change_ok(inode, attr);
 	if (error)
 		return error;
 	if (is_quota_modification(inode, attr))
 		dquot_initialize(inode);
 	/* Ownership change: transfer quota and update the inode together. */
 	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
 		(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
 		handle_t *handle;
 		/* (user+group)*(old+new) structure, inode write (sb,
 		 * inode block, ? - but truncate inode update has it) */
 		handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
 					EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
 		if (IS_ERR(handle)) {
 			error = PTR_ERR(handle);
 			goto err_out;
 		}
 		error = dquot_transfer(inode, attr);
 		if (error) {
 			ext4_journal_stop(handle);
 			return error;
 		}
 		/* Update corresponding info in inode so that everything is in
 		 * one transaction */
 		if (attr->ia_valid & ATTR_UID)
 			inode->i_uid = attr->ia_uid;
 		if (attr->ia_valid & ATTR_GID)
 			inode->i_gid = attr->ia_gid;
 		error = ext4_mark_inode_dirty(handle, inode);
 		ext4_journal_stop(handle);
 	}
 	/* Files without extents cannot grow beyond s_bitmap_maxbytes. */
 	if (attr->ia_valid & ATTR_SIZE) {
 		if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
 			struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 			if (attr->ia_size > sbi->s_bitmap_maxbytes)
 				return -EFBIG;
 		}
 	}
 	/*
 	 * Regular file being shrunk, or carrying the EOFBLOCKS flag: put it
 	 * on the orphan list and record the new i_disksize first.
 	 */
 	if (S_ISREG(inode->i_mode) &&
 	    attr->ia_valid & ATTR_SIZE &&
 	    (attr->ia_size < inode->i_size ||
 	     (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
 		handle_t *handle;
 		handle = ext4_journal_start(inode, 3);
 		if (IS_ERR(handle)) {
 			error = PTR_ERR(handle);
 			goto err_out;
 		}
 		/* The orphan list is only used when the handle is valid. */
 		if (ext4_handle_valid(handle)) {
 			error = ext4_orphan_add(handle, inode);
 			orphan = 1;
 		}
 		EXT4_I(inode)->i_disksize = attr->ia_size;
 		rc = ext4_mark_inode_dirty(handle, inode);
 		if (!error)
 			error = rc;
 		ext4_journal_stop(handle);
 		if (ext4_should_order_data(inode)) {
 			error = ext4_begin_ordered_truncate(inode,
 							    attr->ia_size);
 			if (error) {
 				/* Do as much error cleanup as possible */
 				handle = ext4_journal_start(inode, 3);
 				if (IS_ERR(handle)) {
 					/* No handle: only the NULL-handle
 					 * orphan removal is possible. */
 					ext4_orphan_del(NULL, inode);
 					goto err_out;
 				}
 				ext4_orphan_del(handle, inode);
 				orphan = 0;
 				ext4_journal_stop(handle);
 				goto err_out;
 			}
 		}
 		/* ext4_truncate will clear the flag */
 		if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
 			ext4_truncate(inode);
 	}
 	/* Perform the actual size change when the size really differs. */
 	if ((attr->ia_valid & ATTR_SIZE) &&
 	    attr->ia_size != i_size_read(inode))
 		rc = vmtruncate(inode, attr->ia_size);
 	/* Copy the remaining attributes only if the steps above worked. */
 	if (!rc) {
 		setattr_copy(inode, attr);
 		mark_inode_dirty(inode);
 	}
 	/*
 	 * If the call to ext4_truncate failed to get a transaction handle at
 	 * all, we need to clean up the in-core orphan list manually.
 	 */
 	if (orphan && inode->i_nlink)
 		ext4_orphan_del(NULL, inode);
 	if (!rc && (ia_valid & ATTR_MODE))
 		rc = ext4_acl_chmod(inode);
 err_out:
 	ext4_std_error(inode->i_sb, error);
 	/* Fall back to the secondary result if no primary error occurred. */
 	if (!error)
 		error = rc;
 	return error;
 }
 /*
  * Fill in 'stat' for the dentry's inode.  Blocks reserved by delayed
  * allocation are added to the reported block count.  Always returns 0.
  */
 int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		 struct kstat *stat)
 {
 	struct inode *inode = dentry->d_inode;
 	unsigned long reserved;

 	generic_fillattr(inode, stat);

 	/*
 	 * i_blocks itself is only updated at real allocation time, so that
 	 * a crash before the delayed blocks are allocated cannot leave it
 	 * inconsistent with the on-disk file blocks.  To avoid confusing
 	 * the user, stat additionally reports the delayed-allocation
 	 * blocks, converted here from filesystem blocks to 512-byte units.
 	 */
 	reserved = EXT4_I(inode)->i_reserved_data_blocks;
 	stat->blocks += (reserved << inode->i_sb->s_blocksize_bits) >> 9;

 	return 0;
 }
 /*
  * Upper bound on the indirect-mapping blocks that may be touched when
  * mapping 'nrblocks' data blocks.  'chunk' is non-zero when the data
  * blocks are contiguous.
  */
 static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
 				      int chunk)
 {
 	/*
 	 * If nrblocks are not contiguous then, in the worst case, each
 	 * block touches an indirect block and each indirect block touches
 	 * a double indirect block, plus one triple indirect block.
 	 */
 	if (!chunk)
 		return nrblocks * 2 + 1;

 	/*
 	 * With N contiguous data blocks, we need at most
 	 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
 	 * 2 dindirect blocks, and 1 tindirect block
 	 */
 	return DIV_ROUND_UP(nrblocks,
 			    EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
 }
 /* Index-block estimate: extent-mapped inodes use the extent helper. */
 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 		return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);

 	return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
 }
 /*
  * Account for index blocks, block group bitmaps and block group
  * descriptor blocks when modifying data blocks and index blocks.
  * In the worst case, the index blocks are spread over different block
  * groups.
  *
  * If the data blocks are discontiguous, they may be spread over
  * different block groups too. If they are contiguous, with flexbg,
  * they could still cross a block group boundary.
  *
  * Also account for superblock, inode, quota and xattr blocks
  */
 static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
 	ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
 	int gdpblocks;
 	int idxblocks;
 	int ret = 0;

 	/*
 	 * How many index blocks need to touch to modify nrblocks?
 	 * The "Chunk" flag indicating whether the nrblocks is
 	 * physically contiguous on disk
 	 *
 	 * For Direct IO and fallocate, they calls get_block to allocate
 	 * one single extent at a time, so they could set the "Chunk" flag
 	 */
 	idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
 	ret = idxblocks;

 	/*
 	 * Now let's see how many group bitmaps and group descriptors need
 	 * to account
 	 */
 	groups = idxblocks;
 	if (chunk)
 		groups += 1;
 	else
 		groups += nrblocks;
 	gdpblocks = groups;

 	/* There are no more bitmaps to touch than there are groups ... */
 	if (groups > ngroups)
 		groups = ngroups;
 	/*
 	 * ... and no more descriptor blocks than s_gdb_count.  Test
 	 * gdpblocks itself: 'groups' has already been clamped to ngroups
 	 * above, so testing it would miss the clamp when ngroups equals
 	 * s_gdb_count and leave gdpblocks over-estimated.
 	 */
 	if (gdpblocks > EXT4_SB(inode->i_sb)->s_gdb_count)
 		gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;

 	/* bitmaps and block group descriptor blocks */
 	ret += groups + gdpblocks;

 	/* Blocks for super block, inode, quota and xattr blocks */
 	ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);

 	return ret;
 }
 /*
  * Calculate the total number of credits to reserve to fit
  * the modification of a single page into a single transaction,
  * which may include multiple chunks of block allocations.
  *
  * This could be called via ext4_write_begin()
  *
  * We need to consider the worst case, when there is
  * one new block per extent.
  */
 int ext4_writepage_trans_blocks(struct inode *inode)
 {
 	int blocks_per_page = ext4_journal_blocks_per_page(inode);
 	/* chunk == 0: the blocks of a page are not assumed contiguous. */
 	int credits = ext4_meta_trans_blocks(inode, blocks_per_page, 0);

 	/* Account for data blocks for journalled mode */
 	if (ext4_should_journal_data(inode))
 		credits += blocks_per_page;

 	return credits;
 }
 /*
  * Calculate the journal credits for a chunk of data modification.
  *
  * This is called from DIO, fallocate or whoever calls
  * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
  *
  * journal buffers for data blocks are not included here, as DIO
  * and fallocate do not need to journal data buffers.
  */
 int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
 {
 	/* The blocks form one contiguous chunk, so set the chunk flag. */
 	const int contiguous = 1;

 	return ext4_meta_trans_blocks(inode, nrblocks, contiguous);
 }
 /*
  * The caller must have previously called ext4_reserve_inode_write().
  * Given this, we know that the caller already has write access to iloc->bh.
  *
  * Returns the result of ext4_do_update_inode().
  */
 int ext4_mark_iloc_dirty(handle_t *handle,
 			 struct inode *inode, struct ext4_iloc *iloc)
 {
 	int rc;

 	/* Bump the inode version when I_VERSION is set on the superblock */
 	if (test_opt(inode->i_sb, I_VERSION))
 		inode_inc_iversion(inode);

 	/*
 	 * ext4_do_update_inode() consumes one bh->b_count, so hold an extra
 	 * reference across the call.  It also does the
 	 * jbd2_journal_dirty_metadata.
 	 */
 	get_bh(iloc->bh);
 	rc = ext4_do_update_inode(handle, inode, iloc);
 	put_bh(iloc->bh);

 	return rc;
 }
 /*
  * Look up the on-disk location of @inode and get journal write access to
  * the buffer that holds it.
  *
  * On success, we end up with an outstanding reference count against
  * iloc->bh.  This _must_ be cleaned up later.  If write access cannot be
  * obtained, the buffer is released here and iloc->bh is set to NULL.
  *
  * The result is passed through ext4_std_error() before being returned.
  */
 int
 ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
 			 struct ext4_iloc *iloc)
 {
 	int rc = ext4_get_inode_loc(inode, iloc);

 	if (rc)
 		goto report;

 	BUFFER_TRACE(iloc->bh, "get_write_access");
 	rc = ext4_journal_get_write_access(handle, iloc->bh);
 	if (rc) {
 		/* No write access: do not leave a reference behind */
 		brelse(iloc->bh);
 		iloc->bh = NULL;
 	}
 report:
 	ext4_std_error(inode->i_sb, rc);
 	return rc;
 }
 /*
  * Grow the inode's extra size to new_extra_isize bytes.
  * Nothing is done when i_extra_isize is already at least that large.
  * Returns 0 on success or negative error number on failure.
  */
 static int ext4_expand_extra_isize(struct inode *inode,
 				   unsigned int new_extra_isize,
 				   struct ext4_iloc iloc,
 				   handle_t *handle)
 {
 	struct ext4_inode *raw;
 	struct ext4_xattr_ibody_header *ihdr;

 	if (new_extra_isize <= EXT4_I(inode)->i_extra_isize)
 		return 0;

 	raw = ext4_raw_inode(&iloc);
 	ihdr = IHDR(inode, raw);

 	/* Extended attributes present: try to expand around them */
 	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) &&
 	    ihdr->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC))
 		return ext4_expand_extra_isize_ea(inode, new_extra_isize,
 						  raw, handle);

 	/* No extended attributes present: zero the area and record the size */
 	memset((void *)raw + EXT4_GOOD_OLD_INODE_SIZE, 0,
 		new_extra_isize);
 	EXT4_I(inode)->i_extra_isize = new_extra_isize;
 	return 0;
 }
 /*
  * What we do here is to mark the in-core inode as clean with respect to inode
  * dirtiness (it may still be data-dirty).
  * This means that the in-core inode may be reaped by prune_icache
  * without having to perform any I/O.  This is a very good thing,
  * because *any* task may call prune_icache - even ones which
  * have a transaction open against a different journal.
  *
  * Is this cheating?  Not really.  Sure, we haven't written the
  * inode out, but prune_icache isn't a user-visible syncing function.
  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
  * we start and wait on commits.
  *
  * Is this efficient/effective?  Well, we're being nice to the system
  * by cleaning up our inodes proactively so they can be reaped
  * without I/O.  But we are potentially leaving up to five seconds'
  * worth of inodes floating about which prune_icache wants us to
  * write out.  One way to fix that would be to get prune_icache()
  * to do a write_super() to free up some memory.  It has the desired
  * effect.
  */
 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 {
 	struct ext4_iloc iloc;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	/* s_mnt_count at the time of the last warning; limits log noise */
 	static unsigned int mnt_count;
 	int err, ret;

 	might_sleep();
 	trace_ext4_mark_inode_dirty(inode, _RET_IP_);
 	err = ext4_reserve_inode_write(handle, inode, &iloc);
 	/*
 	 * Without write access to the inode buffer there is nothing to
 	 * expand or dirty.  On failure ext4_reserve_inode_write() can leave
 	 * iloc.bh NULL, so bail out before iloc is handed to
 	 * ext4_expand_extra_isize().
 	 */
 	if (err)
 		return err;

 	if (ext4_handle_valid(handle) &&
 	    EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
 	    !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
 		/*
 		 * We need extra buffer credits since we may write into EA block
 		 * with this same handle. If journal_extend fails, then it will
 		 * only result in a minor loss of functionality for that inode.
 		 * If this is felt to be critical, then e2fsck should be run to
 		 * force a large enough s_min_extra_isize.
 		 */
 		if ((jbd2_journal_extend(handle,
 			     EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
 			ret = ext4_expand_extra_isize(inode,
 						      sbi->s_want_extra_isize,
 						      iloc, handle);
 			if (ret) {
 				/* Do not retry the expansion on this inode */
 				ext4_set_inode_state(inode,
 						     EXT4_STATE_NO_EXPAND);
 				if (mnt_count !=
 					le16_to_cpu(sbi->s_es->s_mnt_count)) {
 					ext4_warning(inode->i_sb,
 					"Unable to expand inode %lu. Delete"
 					" some EAs or run e2fsck.",
 					inode->i_ino);
 					mnt_count =
 					  le16_to_cpu(sbi->s_es->s_mnt_count);
 				}
 			}
 		}
 	}

 	/* A failed expansion is not fatal; the inode is still marked dirty */
 	return ext4_mark_iloc_dirty(handle, inode, &iloc);
 }
 /*
  * ext4_dirty_inode() is called from __mark_inode_dirty()
  *
  * We're really interested in the case where a file is being extended.
  * i_size has been changed by generic_commit_write() and we thus need
  * to include the updated inode in the current transaction.
  *
  * Also, dquot_alloc_block() will always dirty the inode when blocks
  * are allocated to the file.
  *
  * If the inode is marked synchronous, we don't honour that here - doing
  * so would cause a commit on atime updates, which we don't bother doing.
  * We handle synchronous inodes at the highest possible level.
  */
 void ext4_dirty_inode(struct inode *inode)
 {
 	/* Start a transaction with two credits for the inode update */
 	handle_t *handle = ext4_journal_start(inode, 2);

 	/* No handle available: nothing can be done, and nothing is reported */
 	if (IS_ERR(handle))
 		return;

 	/* The result is not checked; this function returns void */
 	ext4_mark_inode_dirty(handle, inode);
 	ext4_journal_stop(handle);
 }
 #if 0
 /*
  * NOTE: this helper is compiled out by the surrounding #if 0 / #endif.
  *
  * Bind an inode's backing buffer_head into this transaction, to prevent
  * it from being flushed to disk early.  Unlike
  * ext4_reserve_inode_write, this leaves behind no bh reference and
  * returns no iloc structure, so the caller needs to repeat the iloc
  * lookup to mark the inode dirty later.
  *
  * With a NULL handle nothing is done and 0 is returned.  The result is
  * passed through ext4_std_error() before being returned.
  */
 static int ext4_pin_inode(handle_t *handle, struct inode *inode)
 {
 	struct ext4_iloc iloc;
 	int err = 0;
 	if (handle) {
 		err = ext4_get_inode_loc(inode, &iloc);
 		if (!err) {
 			BUFFER_TRACE(iloc.bh, "get_write_access");
 			err = jbd2_journal_get_write_access(handle, iloc.bh);
 			if (!err)
 				err = ext4_handle_dirty_metadata(handle,
 								 NULL,
 								 iloc.bh);
 			/* Drop the lookup reference whether or not that worked */
 			brelse(iloc.bh);
 		}
 	}
 	ext4_std_error(inode->i_sb, err);
 	return err;
 }
 #endif
 /*
  * Set (val != 0) or clear (val == 0) the EXT4_INODE_JOURNAL_DATA flag of
  * @inode and switch its address space operations accordingly.
  *
  * Returns 0 when the inode has no journal, -EROFS when the journal is
  * aborted, the PTR_ERR() of the handle when no transaction can be started,
  * and otherwise the result of ext4_mark_inode_dirty().
  */
 int ext4_change_inode_journal_flag(struct inode *inode, int val)
 {
 	journal_t *journal;
 	handle_t *handle;
 	int err;
 	/*
 	 * We have to be very careful here: changing a data block's
 	 * journaling status dynamically is dangerous.  If we write a
 	 * data block to the journal, change the status and then delete
 	 * that block, we risk forgetting to revoke the old log record
 	 * from the journal and so a subsequent replay can corrupt data.
 	 * So, first we make sure that the journal is empty and that
 	 * nobody is changing anything.
 	 */
 	journal = EXT4_JOURNAL(inode);
 	if (!journal)
 		return 0;
 	if (is_journal_aborted(journal))
 		return -EROFS;
 	/* NOTE(review): the result of jbd2_journal_flush() is not checked */
 	jbd2_journal_lock_updates(journal);
 	jbd2_journal_flush(journal);
 	/*
 	 * OK, there are no updates running now, and all cached data is
 	 * synced to disk.  We are now in a completely consistent state
 	 * which doesn't have anything in the journal, and we know that
 	 * no filesystem updates are running, so it is safe to modify
 	 * the inode's in-core data-journaling state flag now.
 	 */
 	if (val)
 		ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
 	else
 		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
 	/* Re-select the address space operations now that the flag changed */
 	ext4_set_aops(inode);
 	jbd2_journal_unlock_updates(journal);
 	/* Finally we can mark the inode as dirty. */
 	handle = ext4_journal_start(inode, 1);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 	err = ext4_mark_inode_dirty(handle, inode);
 	/* Mark the handle synchronous before it is stopped */
 	ext4_handle_sync(handle);
 	ext4_journal_stop(handle);
 	ext4_std_error(inode->i_sb, err);
 	return err;
 }
 /*
  * Buffer walk callback: returns 1 when @bh is not mapped, 0 when it is.
  * @handle is not used here.
  */
 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
 {
 	if (buffer_mapped(bh))
 		return 0;
 	return 1;
 }
 /*
  * Handle a write fault on the page in @vmf: make sure the part of the page
  * inside i_size is backed by blocks, going through the mapping's
  * write_begin/write_end when it is not already.
  *
  * Returns 0 on success and VM_FAULT_SIGBUS on any failure, including the
  * case where the page no longer belongs to this mapping, lies at or beyond
  * i_size, or is not up to date.
  */
 int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
 	loff_t size;
 	unsigned long len;
 	int ret = -EINVAL;
 	void *fsdata;
 	struct file *file = vma->vm_file;
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct address_space *mapping = inode->i_mapping;
 	/*
 	 * Get i_alloc_sem to stop truncates messing with the inode. We cannot
 	 * get i_mutex because we are already holding mmap_sem.
 	 */
 	down_read(&inode->i_alloc_sem);
 	size = i_size_read(inode);
 	if (page->mapping != mapping || size <= page_offset(page)
 	    || !PageUptodate(page)) {
 		/* page got truncated from under us? */
 		goto out_unlock;
 	}
 	ret = 0;
 	/* Page already fully mapped to disk: nothing to allocate */
 	if (PageMappedToDisk(page))
 		goto out_unlock;
 	/* For the page containing i_size, only the bytes below i_size count */
 	if (page->index == size >> PAGE_CACHE_SHIFT)
 		len = size & ~PAGE_CACHE_MASK;
 	else
 		len = PAGE_CACHE_SIZE;
 	lock_page(page);
 	/*
 	 * Return if we have all the buffers mapped. This avoids
 	 * the need to call write_begin/write_end, which do a
 	 * journal_start/journal_stop that can block and take a
 	 * long time.
 	 */
 	if (page_has_buffers(page)) {
 		if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
 					ext4_bh_unmapped)) {
 			unlock_page(page);
 			goto out_unlock;
 		}
 	}
 	unlock_page(page);
 	/*
 	 * OK, we need to fill the hole... Do write_begin/write_end
 	 * to do block allocation/reservation. We are not holding
 	 * inode->i_mutex here. That allows parallel write_begin/
 	 * write_end calls. lock_page prevents this from happening
 	 * on the same page though.
 	 */
 	ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
 			len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
 	if (ret < 0)
 		goto out_unlock;
 	ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
 			len, len, page, fsdata);
 	if (ret < 0)
 		goto out_unlock;
 	ret = 0;
 out_unlock:
 	/* Any non-zero result is reported to the fault handler as SIGBUS */
 	if (ret)
 		ret = VM_FAULT_SIGBUS;
 	up_read(&inode->i_alloc_sem);
 	return ret;
 }

 /*
  * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd.
  * Written by Takashi Sato <t-sato@yk.jp.nec.com>
  *            Akira Fujita <a-fujita@rs.jp.nec.com>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of version 2.1 of the GNU Lesser General Public License
  * as published by the Free Software Foundation.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  */
 #include <linux/fs.h>
 #include <linux/quotaops.h>
 #include <linux/slab.h>
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 #include "ext4.h"
 /**
  * get_ext_path - Find an extent path for designated logical block number.
  *
  * @inode:	an inode which is searched
  * @lblock:	logical block number to find an extent path
  * @path:	pointer to an extent path pointer (for output)
  *
  * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value
  * on failure.
  */
 static inline int
 get_ext_path(struct inode *inode, ext4_lblk_t lblock,
 		struct ext4_ext_path **path)
 {
 	struct ext4_ext_path *found;

 	found = ext4_ext_find_extent(inode, lblock, *path);
 	if (IS_ERR(found)) {
 		/* Lookup failed: hand back NULL instead of an error pointer */
 		*path = NULL;
 		return PTR_ERR(found);
 	}
 	*path = found;

 	/* A path whose leaf entry has no extent is reported as -ENODATA */
 	if (found[ext_depth(inode)].p_ext == NULL)
 		return -ENODATA;
 	return 0;
 }
 /**
  * copy_extent_status - Copy the extent's initialization status
  *
  * @src:	an extent for getting initialize status
  * @dest:	an extent to be set the status
  */
 static void
 copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
 {
 	if (!ext4_ext_is_uninitialized(src)) {
 		/* Initialized source: store the actual length back in @dest */
 		dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest));
 		return;
 	}
 	/* Uninitialized source: mark @dest uninitialized as well */
 	ext4_ext_mark_uninitialized(dest);
 }
 /**
  * mext_next_extent - Search for the next extent and set it to "extent"
  *
  * @inode:	inode which is searched
  * @path:	this will obtain data for the next extent
  * @extent:	pointer to the next extent we have just gotten
  *
  * Search the next extent in the array of ext4_ext_path structure (@path)
  * and set it to ext4_extent structure (@extent). In addition, the member of
  * @path (->p_ext) also points the next extent. Return 0 on success, 1 if
  * ext4_ext_path structure refers to the last extent, or a negative error
  * value on failure.
  */
 static int
 mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 		      struct ext4_extent **extent)
 {
 	struct ext4_extent_header *eh;
 	int ppos, leaf_ppos = path->p_depth;
 	ppos = leaf_ppos;
 	/* Fast path: the current leaf still has an extent after p_ext */
 	if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
 		/* leaf block */
 		*extent = ++path[ppos].p_ext;
 		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
 		return 0;
 	}
 	/*
 	 * The leaf is exhausted: walk up towards the root until a level is
 	 * found that still has a following index entry.
 	 */
 	while (--ppos >= 0) {
 		if (EXT_LAST_INDEX(path[ppos].p_hdr) >
 		    path[ppos].p_idx) {
 			int cur_ppos = ppos;
 			/* index block */
 			path[ppos].p_idx++;
 			path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
 			/* Replace the child buffer with the block just selected */
 			if (path[ppos+1].p_bh)
 				brelse(path[ppos+1].p_bh);
 			path[ppos+1].p_bh =
 				sb_bread(inode->i_sb, path[ppos].p_block);
 			if (!path[ppos+1].p_bh)
 				return -EIO;
 			path[ppos+1].p_hdr =
 				ext_block_hdr(path[ppos+1].p_bh);
 			/*
 			 * Halfway index block: descend through the first
 			 * index entry of each level below, down to the leaf.
 			 */
 			while (++cur_ppos < leaf_ppos) {
 				path[cur_ppos].p_idx =
 					EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
 				path[cur_ppos].p_block =
 					ext4_idx_pblock(path[cur_ppos].p_idx);
 				if (path[cur_ppos+1].p_bh)
 					brelse(path[cur_ppos+1].p_bh);
 				path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
 					path[cur_ppos].p_block);
 				if (!path[cur_ppos+1].p_bh)
 					return -EIO;
 				path[cur_ppos+1].p_hdr =
 					ext_block_hdr(path[cur_ppos+1].p_bh);
 			}
 			/* Clear the result first so an empty leaf leaves NULL */
 			path[leaf_ppos].p_ext = *extent = NULL;
 			eh = path[leaf_ppos].p_hdr;
 			if (le16_to_cpu(eh->eh_entries) == 0)
 				/* empty leaf is found */
 				return -ENODATA;
 			/* leaf block */
 			path[leaf_ppos].p_ext = *extent =
 				EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
 			path[leaf_ppos].p_block =
 					ext4_ext_pblock(path[leaf_ppos].p_ext);
 			return 0;
 		}
 	}
 	/* We found the last extent */
 	return 1;
 }
 /**
  * mext_check_null_inode - NULL check for two inodes
  *
  * @inode1:	first inode to check
  * @inode2:	second inode to check
  * @function:	caller's function name, passed on to __ext4_error()
  * @line:	caller's line number, passed on to __ext4_error()
  *
  * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
  *
  * When exactly one inode is NULL, the error is reported against the
  * superblock of the other one. When both are NULL there is no superblock
  * to report against, so -EIO is returned without calling __ext4_error().
  */
 static int
 mext_check_null_inode(struct inode *inode1, struct inode *inode2,
 		      const char *function, unsigned int line)
 {
 	int ret = 0;

 	/* Neither inode can supply ->i_sb, so nothing can be dereferenced */
 	if (inode1 == NULL && inode2 == NULL)
 		return -EIO;

 	if (inode1 == NULL) {
 		__ext4_error(inode2->i_sb, function, line,
 			"Both inodes should not be NULL: "
 			"inode1 NULL inode2 %lu", inode2->i_ino);
 		ret = -EIO;
 	} else if (inode2 == NULL) {
 		__ext4_error(inode1->i_sb, function, line,
 			"Both inodes should not be NULL: "
 			"inode1 %lu inode2 NULL", inode1->i_ino);
 		ret = -EIO;
 	}
 	return ret;
 }
 /**
  * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
  *
  * @orig_inode:		original inode structure
  * @donor_inode:	donor inode structure
  * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
  * i_ino order.
  */
 static void
 double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
 {
 	struct inode *outer, *inner;

 	/*
 	 * The lock order is decided by inode number rather than by
 	 * address, because the C language doesn't guarantee you can
 	 * compare pointers that don't come from the same array.
 	 */
 	if (donor_inode->i_ino < orig_inode->i_ino) {
 		outer = donor_inode;
 		inner = orig_inode;
 	} else {
 		outer = orig_inode;
 		inner = donor_inode;
 	}

 	/* The second lock is taken as a nested one */
 	down_write(&EXT4_I(outer)->i_data_sem);
 	down_write_nested(&EXT4_I(inner)->i_data_sem, SINGLE_DEPTH_NESTING);
 }
 /**
  * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
  *
  * @orig_inode:		original inode structure to be released its lock first
  * @donor_inode:	donor inode structure to be released its lock second
  * Release write lock of i_data_sem of two inodes (orig and donor).
  */
 static void
 double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
 {
 	/* Release in argument order: orig first, then donor */
 	struct inode *release_order[2] = { orig_inode, donor_inode };
 	int i;

 	for (i = 0; i < 2; i++)
 		up_write(&EXT4_I(release_order[i])->i_data_sem);
 }
 /**
  * mext_insert_across_blocks - Insert extents across leaf block
  *
  * @handle:		journal handle
  * @orig_inode:		original inode
  * @o_start:		first original extent to be changed
  * @o_end:		last original extent to be changed
  * @start_ext:		first new extent to be inserted
  * @new_ext:		middle of new extent to be inserted
  * @end_ext:		last new extent to be inserted
  *
  * Allocate a new leaf block and insert extents into it. Return 0 on success,
  * or a negative error value on failure.
  */
 static int
 mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
 		struct ext4_extent *o_start, struct ext4_extent *o_end,
 		struct ext4_extent *start_ext, struct ext4_extent *new_ext,
 		struct ext4_extent *end_ext)
 {
 	struct ext4_ext_path *orig_path = NULL;
 	ext4_lblk_t eblock = 0;	/* logical block used to look up the path */
 	int new_flag = 0;	/* new_ext must be inserted */
 	int end_flag = 0;	/* end_ext must be inserted as well */
 	int err = 0;

 	if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) {
 		if (o_start == o_end) {
 			/*       start_ext   new_ext    end_ext
 			 * donor |---------|-----------|--------|
 			 * orig  |------------------------------|
 			 */
 			end_flag = 1;
 		} else {
 			/*       start_ext   new_ext   end_ext
 			 * donor |---------|----------|---------|
 			 * orig  |---------------|--------------|
 			 */
 			o_end->ee_block = end_ext->ee_block;
 			o_end->ee_len = end_ext->ee_len;
 			ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
 		}
 		o_start->ee_len = start_ext->ee_len;
 		eblock = le32_to_cpu(start_ext->ee_block);
 		new_flag = 1;
 	} else if (start_ext->ee_len && new_ext->ee_len &&
 		   !end_ext->ee_len && o_start == o_end) {
 		/*	 start_ext	new_ext
 		 * donor |--------------|---------------|
 		 * orig  |------------------------------|
 		 */
 		o_start->ee_len = start_ext->ee_len;
 		eblock = le32_to_cpu(start_ext->ee_block);
 		new_flag = 1;
 	} else if (!start_ext->ee_len && new_ext->ee_len &&
 		   end_ext->ee_len && o_start == o_end) {
 		/*	  new_ext	end_ext
 		 * donor |--------------|---------------|
 		 * orig  |------------------------------|
 		 */
 		o_end->ee_block = end_ext->ee_block;
 		o_end->ee_len = end_ext->ee_len;
 		ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
 		/*
 		 * Set 0 to the extent block if new_ext was
 		 * the first block.
 		 */
 		if (new_ext->ee_block)
 			eblock = le32_to_cpu(new_ext->ee_block);
 		new_flag = 1;
 	} else {
 		ext4_debug("ext4 move extent: Unexpected insert case\n");
 		return -EIO;
 	}

 	if (new_flag) {
 		err = get_ext_path(orig_inode, eblock, &orig_path);
 		if (err)
 			goto out;
 		/*
 		 * Keep the insert result in err: a failed insert must be
 		 * reported to the caller instead of being returned as 0.
 		 */
 		err = ext4_ext_insert_extent(handle, orig_inode,
 					orig_path, new_ext, 0);
 		if (err)
 			goto out;
 	}
 	if (end_flag) {
 		/* Look up the block just before end_ext to find its slot */
 		err = get_ext_path(orig_inode,
 				le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
 		if (err)
 			goto out;
 		err = ext4_ext_insert_extent(handle, orig_inode,
 					   orig_path, end_ext, 0);
 		if (err)
 			goto out;
 	}
 out:
 	if (orig_path) {
 		ext4_ext_drop_refs(orig_path);
 		kfree(orig_path);
 	}
 	return err;
 }
 /**
  * mext_insert_inside_block - Insert new extent to the extent block
  *
  * @o_start:		first original extent to be moved
  * @o_end:		last original extent to be moved
  * @start_ext:		first new extent to be inserted
  * @new_ext:		middle of new extent to be inserted
  * @end_ext:		last new extent to be inserted
  * @eh:			extent header of target leaf block
  * @range_to_move:	used to decide how to insert extent
  *
  * Insert extents into the leaf block. The extent (@o_start) is overwritten
  * by inserted extents.
  */
 static void
 mext_insert_inside_block(struct ext4_extent *o_start,
 			      struct ext4_extent *o_end,
 			      struct ext4_extent *start_ext,
 			      struct ext4_extent *new_ext,
 			      struct ext4_extent *end_ext,
 			      struct ext4_extent_header *eh,
 			      int range_to_move)
 {
 	int i = 0;
 	unsigned long len;
 	/* Move the existing extents */
 	if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) {
 		len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) -
 			(unsigned long)(o_end + 1);
 		memmove(o_end + 1 + range_to_move, o_end + 1, len);
 	}
 	/* Insert start entry */
 	if (start_ext->ee_len)
 		o_start[i++].ee_len = start_ext->ee_len;
 	/* Insert new entry */
 	if (new_ext->ee_len) {
 		o_start[i] = *new_ext;
 		ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext));
 	}
 	/* Insert end entry */
 	if (end_ext->ee_len)
 		o_start[i] = *end_ext;
 	/* Increment the total entries counter on the extent block */
 	le16_add_cpu(&eh->eh_entries, range_to_move);
 }
 /**
  * mext_insert_extents - Insert new extent
  *
  * @handle:	journal handle
  * @orig_inode:	original inode
  * @orig_path:	path indicates first extent to be changed
  * @o_start:	first original extent to be changed
  * @o_end:	last original extent to be changed
  * @start_ext:	first new extent to be inserted
  * @new_ext:	middle of new extent to be inserted
  * @end_ext:	last new extent to be inserted
  *
  * Call the function to insert extents. If we cannot add more extents into
  * the leaf block, we call mext_insert_across_blocks() to create a
  * new leaf block. Otherwise call mext_insert_inside_block(). Return 0
  * on success, or a negative error value on failure.
  */
 static int
 mext_insert_extents(handle_t *handle, struct inode *orig_inode,
 			 struct ext4_ext_path *orig_path,
 			 struct ext4_extent *o_start,
 			 struct ext4_extent *o_end,
 			 struct ext4_extent *start_ext,
 			 struct ext4_extent *new_ext,
 			 struct ext4_extent *end_ext)
 {
 	struct  ext4_extent_header *eh;
 	unsigned long need_slots, slots_range;
 	int	range_to_move, depth, ret;
 	/*
 	 * The extents need to be inserted
 	 * start_extent + new_extent + end_extent.
 	 * Only those with a non-zero ee_len take up a slot.
 	 */
 	need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) +
 		(new_ext->ee_len ? 1 : 0);
 	/*
 	 * The number of slots between start and end, both included.  The
 	 * extra "+ 1" byte is discarded by the integer division.
 	 */
 	slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1)
 		/ sizeof(struct ext4_extent);
 	/* Range to move the end of extent; negative when slots are freed */
 	range_to_move = need_slots - slots_range;
 	depth = orig_path->p_depth;
 	/* From here on orig_path refers to the leaf level entry */
 	orig_path += depth;
 	eh = orig_path->p_hdr;
 	if (depth) {
 		/* Register to journal */
 		ret = ext4_journal_get_write_access(handle, orig_path->p_bh);
 		if (ret)
 			return ret;
 	}
 	/* Expansion: more slots are needed than the leaf has free */
 	if (range_to_move > 0 &&
 		(range_to_move > le16_to_cpu(eh->eh_max)
 			- le16_to_cpu(eh->eh_entries))) {
 		ret = mext_insert_across_blocks(handle, orig_inode, o_start,
 					o_end, start_ext, new_ext, end_ext);
 		if (ret < 0)
 			return ret;
 	} else
 		mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
 						end_ext, eh, range_to_move);
 	/*
 	 * With depth > 0 the leaf buffer is dirtied; with depth 0 the inode
 	 * itself is marked dirty instead.
 	 */
 	if (depth) {
 		ret = ext4_handle_dirty_metadata(handle, orig_inode,
 						 orig_path->p_bh);
 		if (ret)
 			return ret;
 	} else {
 		ret = ext4_mark_inode_dirty(handle, orig_inode);
 		if (ret < 0)
 			return ret;
 	}
 	return 0;
 }
 /**
  * mext_leaf_block - Move one leaf extent block into the inode.
  *
  * @handle:		journal handle
  * @orig_inode:		original inode
  * @orig_path:		path indicates first extent to be changed
  * @dext:		donor extent
  * @from:		start offset on the target file
  *
  * In order to insert extents into the leaf block, we must divide the extent
  * in the leaf block into three extents: one covering the range where the
  * new extent is inserted, and the other two lying on either side of it.
  *
  * Therefore, this function creates structures to save extents of the leaf
  * block, and inserts extents by calling mext_insert_extents() with
  * created extents. Return 0 on success, or a negative error value on failure.
  */
 static int
 mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 		     struct ext4_ext_path *orig_path, struct ext4_extent *dext,
 		     ext4_lblk_t *from)
 {
 	struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
 	struct ext4_extent new_ext, start_ext, end_ext;
 	ext4_lblk_t new_ext_end;
 	int oext_alen, new_ext_alen, end_ext_alen;
 	int depth = ext_depth(orig_inode);
 	int ret;
 	/*
 	 * start_ext and end_ext begin with zero length, meaning "not
 	 * present"; they are filled in below only when needed.
 	 */
 	start_ext.ee_block = end_ext.ee_block = 0;
 	o_start = o_end = oext = orig_path[depth].p_ext;
 	oext_alen = ext4_ext_get_actual_len(oext);
 	start_ext.ee_len = end_ext.ee_len = 0;
 	/* new_ext: the donor's physical blocks placed at logical block *from */
 	new_ext.ee_block = cpu_to_le32(*from);
 	ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext));
 	new_ext.ee_len = dext->ee_len;
 	new_ext_alen = ext4_ext_get_actual_len(&new_ext);
 	/* Last logical block covered by new_ext (inclusive) */
 	new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
 	/*
 	 * Case: original extent is first
 	 * oext      |--------|
 	 * new_ext      |--|
 	 * start_ext |--|
 	 */
 	if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) &&
 		le32_to_cpu(new_ext.ee_block) <
 		le32_to_cpu(oext->ee_block) + oext_alen) {
 		start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
 					       le32_to_cpu(oext->ee_block));
 		start_ext.ee_block = oext->ee_block;
 		copy_extent_status(oext, &start_ext);
 	} else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
 		prev_ext = oext - 1;
 		/*
 		 * We can merge new_ext into previous extent,
 		 * if these are contiguous and same extent type.
 		 */
 		if (ext4_can_extents_be_merged(orig_inode, prev_ext,
 					       &new_ext)) {
 			/* new_ext is folded into start_ext and dropped */
 			o_start = prev_ext;
 			start_ext.ee_len = cpu_to_le16(
 				ext4_ext_get_actual_len(prev_ext) +
 				new_ext_alen);
 			start_ext.ee_block = oext->ee_block;
 			copy_extent_status(prev_ext, &start_ext);
 			new_ext.ee_len = 0;
 		}
 	}
 	/*
 	 * Case: new_ext_end must be less than oext
 	 * oext      |-----------|
 	 * new_ext       |-------|
 	 */
 	if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
 		EXT4_ERROR_INODE(orig_inode,
 			"new_ext_end(%u) should be less than or equal to "
 			"oext->ee_block(%u) + oext_alen(%d) - 1",
 			new_ext_end, le32_to_cpu(oext->ee_block),
 			oext_alen);
 		ret = -EIO;
 		goto out;
 	}
 	/*
 	 * Case: new_ext is smaller than original extent
 	 * oext    |---------------|
 	 * new_ext |-----------|
 	 * end_ext             |---|
 	 */
 	if (le32_to_cpu(oext->ee_block) <= new_ext_end &&
 		new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) {
 		end_ext.ee_len =
 			cpu_to_le16(le32_to_cpu(oext->ee_block) +
 			oext_alen - 1 - new_ext_end);
 		copy_extent_status(oext, &end_ext);
 		end_ext_alen = ext4_ext_get_actual_len(&end_ext);
 		/* end_ext keeps the last end_ext_alen blocks of the original */
 		ext4_ext_store_pblock(&end_ext,
 			(ext4_ext_pblock(o_end) + oext_alen - end_ext_alen));
 		end_ext.ee_block =
 			cpu_to_le32(le32_to_cpu(o_end->ee_block) +
 			oext_alen - end_ext_alen);
 	}
 	ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
 				o_end, &start_ext, &new_ext, &end_ext);
 out:
 	return ret;
 }
 /**
  * mext_calc_swap_extents - Calculate extents for extent swapping.
  *
  * @tmp_dext:		the extent that will belong to the original inode
  * @tmp_oext:		the extent that will belong to the donor inode
  * @orig_off:		block offset of original inode
  * @donor_off:		block offset of donor inode
  * @max_count:		the maximum length of extents
  *
  * Return 0 on success, or a negative error value on failure.
  */
 static int
 mext_calc_swap_extents(struct ext4_extent *tmp_dext,
 			      struct ext4_extent *tmp_oext,
 			      ext4_lblk_t orig_off, ext4_lblk_t donor_off,
 			      ext4_lblk_t max_count)
 {
 	ext4_lblk_t diff, orig_diff;
 	struct ext4_extent dext_old, oext_old;
 	/* Both files are swapped at the same logical offset */
 	BUG_ON(orig_off != donor_off);
 	/* original and donor extents have to cover the same block offset */
 	if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
 	    le32_to_cpu(tmp_oext->ee_block) +
 			ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
 		return -ENODATA;
 	if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
 	    le32_to_cpu(tmp_dext->ee_block) +
 			ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
 		return -ENODATA;
 	/* Keep copies of the unmodified extents for the status copy below */
 	dext_old = *tmp_dext;
 	oext_old = *tmp_oext;
 	/*
 	 * When tmp_dext is too large, pick up the target range: advance its
 	 * start (logical and physical) to donor_off and shrink its length.
 	 */
 	diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
 	ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
 	tmp_dext->ee_block =
 			cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
 	tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
 	/* Never cover more than max_count blocks */
 	if (max_count < ext4_ext_get_actual_len(tmp_dext))
 		tmp_dext->ee_len = cpu_to_le16(max_count);
 	/* Advance the physical start of tmp_oext to orig_off */
 	orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
 	ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff);
 	/* Adjust extent length if donor extent is larger than orig */
 	if (ext4_ext_get_actual_len(tmp_dext) >
 	    ext4_ext_get_actual_len(tmp_oext) - orig_diff)
 		tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) -
 						orig_diff);
 	/* Both extents end up with the same actual length */
 	tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext));
 	/* Each result takes the status of the other file's old extent */
 	copy_extent_status(&oext_old, tmp_dext);
 	copy_extent_status(&dext_old, tmp_oext);
 	return 0;
 }
 /**
  * mext_replace_branches - Replace original extents with new extents
  *
  * @handle:		journal handle
  * @orig_inode:		original inode
  * @donor_inode:	donor inode
  * @from:		block offset of orig_inode
  * @count:		block count to be replaced
  * @err:		pointer to save return value
  *
  * Replace original inode extents and donor inode extents page by page.
  * We implement this replacement in the following three steps:
  * 1. Save the block information of original and donor inodes into
  *    dummy extents.
  * 2. Change the block information of original inode to point at the
  *    donor inode blocks.
  * 3. Change the block information of donor inode to point at the saved
  *    original inode blocks in the dummy extents.
  *
  * Return replaced block count.
  */
 static int
 mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 			   struct inode *donor_inode, ext4_lblk_t from,
 			   ext4_lblk_t count, int *err)
 {
 	struct ext4_ext_path *orig_path = NULL;
 	struct ext4_ext_path *donor_path = NULL;
 	struct ext4_extent *oext, *dext;
 	struct ext4_extent tmp_dext, tmp_oext;
 	ext4_lblk_t orig_off = from, donor_off = from;
 	int depth;
 	int replaced_count = 0;
 	int dext_alen;
 	/* Protect extent trees against block allocations via delalloc */
 	double_down_write_data_sem(orig_inode, donor_inode);
 	/* Get the original extent for the block "orig_off" */
 	*err = get_ext_path(orig_inode, orig_off, &orig_path);
 	if (*err)
 		goto out;
 	/* Get the donor extent for the head */
 	*err = get_ext_path(donor_inode, donor_off, &donor_path);
 	if (*err)
 		goto out;
 	/* Work on local copies; the tree entries are changed by the helpers */
 	depth = ext_depth(orig_inode);
 	oext = orig_path[depth].p_ext;
 	tmp_oext = *oext;
 	depth = ext_depth(donor_inode);
 	dext = donor_path[depth].p_ext;
 	tmp_dext = *dext;
 	*err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
 				      donor_off, count);
 	if (*err)
 		goto out;
 	/* Loop for the donor extents */
 	while (1) {
 		/*
 		 * The extent for donor must be found.  get_ext_path() already
 		 * returns -ENODATA when the leaf p_ext is NULL.
 		 */
 		if (!dext) {
 			EXT4_ERROR_INODE(donor_inode,
 				   "The extent for donor must be found");
 			*err = -EIO;
 			goto out;
 		} else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
 			EXT4_ERROR_INODE(donor_inode,
 				"Donor offset(%u) and the first block of donor "
 				"extent(%u) should be equal",
 				donor_off,
 				le32_to_cpu(tmp_dext.ee_block));
 			*err = -EIO;
 			goto out;
 		}
 		/* Set donor extent to orig extent */
 		*err = mext_leaf_block(handle, orig_inode,
 					   orig_path, &tmp_dext, &orig_off);
 		if (*err)
 			goto out;
 		/* Set orig extent to donor extent */
 		*err = mext_leaf_block(handle, donor_inode,
 					   donor_path, &tmp_oext, &donor_off);
 		if (*err)
 			goto out;
 		/* Advance both offsets by the number of blocks just swapped */
 		dext_alen = ext4_ext_get_actual_len(&tmp_dext);
 		replaced_count += dext_alen;
 		donor_off += dext_alen;
 		orig_off += dext_alen;
 		/* Already moved the expected blocks */
 		if (replaced_count >= count)
 			break;
 		/* Drop the old references before the path is looked up again */
 		if (orig_path)
 			ext4_ext_drop_refs(orig_path);
 		*err = get_ext_path(orig_inode, orig_off, &orig_path);
 		if (*err)
 			goto out;
 		depth = ext_depth(orig_inode);
 		oext = orig_path[depth].p_ext;
 		tmp_oext = *oext;
 		if (donor_path)
 			ext4_ext_drop_refs(donor_path);
 		*err = get_ext_path(donor_inode, donor_off, &donor_path);
 		if (*err)
 			goto out;
 		depth = ext_depth(donor_inode);
 		dext = donor_path[depth].p_ext;
 		tmp_dext = *dext;
 		*err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
 					   donor_off, count - replaced_count);
 		if (*err)
 			goto out;
 	}
 out:
 	/* Common exit: release both paths, invalidate caches, unlock */
 	if (orig_path) {
 		ext4_ext_drop_refs(orig_path);
 		kfree(orig_path);
 	}
 	if (donor_path) {
 		ext4_ext_drop_refs(donor_path);
 		kfree(donor_path);
 	}
 	ext4_ext_invalidate_cache(orig_inode);
 	ext4_ext_invalidate_cache(donor_inode);
 	double_up_write_data_sem(orig_inode, donor_inode);
 	/* Blocks replaced so far; any failure is reported through *err */
 	return replaced_count;
 }
 /**
  * move_extent_per_page - Move extent data per page
  *
  * @o_filp:			file structure of original file
  * @donor_inode:		donor inode
  * @orig_page_offset:		page index on original file
  * @data_offset_in_page:	block index where data swapping starts
  * @block_len_in_page:		the number of blocks to be swapped
  * @uninit:			orig extent is uninitialized or not
  * @err:			pointer to save return value
  *
  * Save the data in original inode blocks and replace original inode extents
  * with donor inode extents by calling mext_replace_branches().
  * Finally, write out the saved data in new original inode blocks.
  *
  * When @uninit is set the page cache is not touched at all: only the
  * extents are swapped.
  *
  * Return replaced block count.  The count can be non-zero even though
  * *@err reports a failure, namely when mext_replace_branches() replaced
  * some blocks before it failed.  On a fully successful pass *@err holds
  * whatever ->write_end() returned.
  */
 static int
 move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 		  pgoff_t orig_page_offset, int data_offset_in_page,
 		  int block_len_in_page, int uninit, int *err)
 {
 	struct inode *orig_inode = o_filp->f_dentry->d_inode;
 	struct address_space *mapping = orig_inode->i_mapping;
 	struct buffer_head *bh;
 	struct page *page = NULL;
 	const struct address_space_operations *a_ops = mapping->a_ops;
 	handle_t *handle;
 	ext4_lblk_t orig_blk_offset;
 	long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
 	unsigned long blocksize = orig_inode->i_sb->s_blocksize;
 	unsigned int w_flags = 0;
 	unsigned int tmp_data_size, data_size, replaced_size;
 	void *fsdata;
 	int i, jblocks;
 	int err2 = 0;
 	int replaced_count = 0;
 	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
 	/*
 	 * It needs twice the amount of ordinary journal buffers because
 	 * inode and donor_inode may change each different metadata blocks.
 	 */
 	jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
 	handle = ext4_journal_start(orig_inode, jblocks);
 	if (IS_ERR(handle)) {
 		*err = PTR_ERR(handle);
 		return 0;
 	}
 	if (segment_eq(get_fs(), KERNEL_DS))
 		w_flags |= AOP_FLAG_UNINTERRUPTIBLE;
 	/* First logical block (in filesystem blocks) that will be swapped */
 	orig_blk_offset = orig_page_offset * blocks_per_page +
 		data_offset_in_page;
 	/*
 	 * If orig extent is uninitialized one,
 	 * it's not necessary force the page into memory
 	 * and then force it to be written out again.
 	 * Just swap data blocks between orig and donor.
 	 */
 	if (uninit) {
 		replaced_count = mext_replace_branches(handle, orig_inode,
 						donor_inode, orig_blk_offset,
 						block_len_in_page, err);
 		goto out2;
 	}
 	/* Byte offset of the first swapped block; overrides the initializer */
 	offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
 	/* Calculate data_size */
 	if ((orig_blk_offset + block_len_in_page - 1) ==
 	    ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
 		/* Replace the last block */
 		tmp_data_size = orig_inode->i_size & (blocksize - 1);
 		/*
 		 * If data_size equal zero, it shows data_size is multiples of
 		 * blocksize. So we set appropriate value.
 		 */
 		if (tmp_data_size == 0)
 			tmp_data_size = blocksize;
 		data_size = tmp_data_size +
 			((block_len_in_page - 1) << orig_inode->i_blkbits);
 	} else
 		data_size = block_len_in_page << orig_inode->i_blkbits;
 	replaced_size = data_size;
 	*err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags,
 				 &page, &fsdata);
 	if (unlikely(*err < 0))
 		goto out;
 	if (!PageUptodate(page)) {
 		/*
 		 * NOTE(review): the return value of ->readpage() is ignored
 		 * and PageUptodate() is not re-checked after lock_page(), so
 		 * a failed read is not detected here - confirm this is
 		 * intended.
 		 */
 		mapping->a_ops->readpage(o_filp, page);
 		lock_page(page);
 	}
 	/*
 	 * try_to_release_page() doesn't call releasepage in writeback mode.
 	 * We should care about the order of writing to the same file
 	 * by multiple move extent processes.
 	 * It needs to call wait_on_page_writeback() to wait for the
 	 * writeback of the page.
 	 */
 	wait_on_page_writeback(page);
 	/* Release old bh and drop refs */
 	try_to_release_page(page, 0);
 	replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
 					orig_blk_offset, block_len_in_page,
 					&err2);
 	if (err2) {
 		if (replaced_count) {
 			/*
 			 * Partial success: only write back the blocks that
 			 * were really replaced.  err2 is reported to the
 			 * caller at the end of the function.
 			 */
 			block_len_in_page = replaced_count;
 			replaced_size =
 				block_len_in_page << orig_inode->i_blkbits;
 		} else
 			goto out;
 	}
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
 	/* Skip the buffers in front of the first swapped block */
 	bh = page_buffers(page);
 	for (i = 0; i < data_offset_in_page; i++)
 		bh = bh->b_this_page;
 	/*
 	 * Look up every swapped block again so that the buffers refer to the
 	 * blocks orig_inode owns now (last argument 0: presumably "create",
 	 * i.e. lookup only - TODO confirm).
 	 */
 	for (i = 0; i < block_len_in_page; i++) {
 		*err = ext4_get_block(orig_inode,
 				(sector_t)(orig_blk_offset + i), bh, 0);
 		if (*err < 0)
 			goto out;
 		if (bh->b_this_page != NULL)
 			bh = bh->b_this_page;
 	}
 	*err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
 			       page, fsdata);
 	/* The page was handed to ->write_end(); skip the cleanup below */
 	page = NULL;
 out:
 	/*
 	 * page is still set only when ->write_begin() succeeded but
 	 * ->write_end() was never reached.  The extra ext4_journal_stop()
 	 * here presumably drops the nested handle reference that
 	 * ->write_begin() took and ->write_end() would have released -
 	 * do not remove it without confirming.
 	 */
 	if (unlikely(page)) {
 		if (PageLocked(page))
 			unlock_page(page);
 		page_cache_release(page);
 		ext4_journal_stop(handle);
 	}
 out2:
 	ext4_journal_stop(handle);
 	/* An error from mext_replace_branches() takes precedence */
 	if (err2)
 		*err = err2;
 	return replaced_count;
 }
 /**
  * mext_check_arguments - Validate a move-extent request
  *
  * @orig_inode:		original inode
  * @donor_inode:	donor inode
  * @orig_start:		logical start offset in block for orig
  * @donor_start:	logical start offset in block for donor
  * @len:		in/out: the number of blocks to be moved
  *
  * Reject requests ext4_move_extents() cannot serve: a donor with
  * suid/sgid bits or immutable/append-only flags, swapfiles, inodes on
  * different superblocks, non-extent files, empty files, differing start
  * offsets and ranges beyond EXT_MAX_BLOCK.
  *
  * *@len is shortened so that the range ends within the smaller of the
  * two files (sizes rounded up to whole blocks).
  *
  * Return 0 on success, or a negative error value on failure.
  */
 static int
 mext_check_arguments(struct inode *orig_inode,
 		     struct inode *donor_inode, __u64 orig_start,
 		     __u64 donor_start, __u64 *len)
 {
 	const unsigned int bits = orig_inode->i_blkbits;
 	const unsigned int blk_bytes = 1 << bits;
 	ext4_lblk_t nr_orig_blks, nr_donor_blks;
 	__u64 clamped;
 	/* Donor must not carry privilege bits */
 	if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
 		ext4_debug("ext4 move extent: suid or sgid is set"
 			   " to donor file [ino:orig %lu, donor %lu]\n",
 			   orig_inode->i_ino, donor_inode->i_ino);
 		return -EINVAL;
 	}
 	if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
 		return -EPERM;
 	/* Swapfiles are not supported on either side */
 	if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
 		ext4_debug("ext4 move extent: The argument files should "
 			"not be swapfile [ino:orig %lu, donor %lu]\n",
 			orig_inode->i_ino, donor_inode->i_ino);
 		return -EINVAL;
 	}
 	/* Both inodes have to share one superblock */
 	if (orig_inode->i_sb != donor_inode->i_sb) {
 		ext4_debug("ext4 move extent: The argument files "
 			"should be in same FS [ino:orig %lu, donor %lu]\n",
 			orig_inode->i_ino, donor_inode->i_ino);
 		return -EINVAL;
 	}
 	/* Only extent-mapped files can be handled: orig first ... */
 	if (!ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS)) {
 		ext4_debug("ext4 move extent: orig file is not extents "
 			"based file [ino:orig %lu]\n", orig_inode->i_ino);
 		return -EOPNOTSUPP;
 	}
 	/* ... then donor */
 	if (!ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS)) {
 		ext4_debug("ext4 move extent: donor file is not extents "
 			"based file [ino:donor %lu]\n", donor_inode->i_ino);
 		return -EOPNOTSUPP;
 	}
 	if (orig_inode->i_size == 0 || donor_inode->i_size == 0) {
 		ext4_debug("ext4 move extent: File size is 0 byte\n");
 		return -EINVAL;
 	}
 	/* The two ranges must begin at the same logical block */
 	if (orig_start != donor_start) {
 		ext4_debug("ext4 move extent: orig and donor's start "
 			"offset are not same [ino:orig %lu, donor %lu]\n",
 			orig_inode->i_ino, donor_inode->i_ino);
 		return -EINVAL;
 	}
 	/* Nothing may reach beyond the largest logical block number */
 	if (orig_start > EXT_MAX_BLOCK ||
 	    donor_start > EXT_MAX_BLOCK ||
 	    *len > EXT_MAX_BLOCK ||
 	    orig_start + *len > EXT_MAX_BLOCK) {
 		ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
 			"[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCK,
 			orig_inode->i_ino, donor_inode->i_ino);
 		return -EINVAL;
 	}
 	if (orig_inode->i_size <= donor_inode->i_size) {
 		/* orig is not the larger file: its block count is the limit */
 		nr_orig_blks = (orig_inode->i_size + blk_bytes - 1) >> bits;
 		if (orig_start >= nr_orig_blks) {
 			ext4_debug("ext4 move extent: start offset [%llu] "
 				"should be less than original file blocks "
 				"[%u] [ino:orig %lu, donor %lu]\n",
 				 orig_start, nr_orig_blks,
 				orig_inode->i_ino, donor_inode->i_ino);
 			return -EINVAL;
 		}
 		if (orig_start + *len > nr_orig_blks) {
 			clamped = nr_orig_blks - orig_start;
 			ext4_debug("ext4 move extent: Adjust length "
 				"from %llu to %llu. Because it should be "
 				"less than original file blocks "
 				"[ino:orig %lu, donor %lu]\n",
 				*len, clamped,
 				orig_inode->i_ino, donor_inode->i_ino);
 			*len = clamped;
 		}
 	} else {
 		/* donor is the smaller file: its block count is the limit */
 		nr_donor_blks = (donor_inode->i_size + blk_bytes - 1) >> bits;
 		/* TODO: eliminate this artificial restriction */
 		if (orig_start >= nr_donor_blks) {
 			ext4_debug("ext4 move extent: orig start offset "
 			"[%llu] should be less than donor file blocks "
 			"[%u] [ino:orig %lu, donor %lu]\n",
 			orig_start, nr_donor_blks,
 			orig_inode->i_ino, donor_inode->i_ino);
 			return -EINVAL;
 		}
 		/* TODO: eliminate this artificial restriction */
 		if (orig_start + *len > nr_donor_blks) {
 			clamped = nr_donor_blks - orig_start;
 			ext4_debug("ext4 move extent: End offset [%llu] should "
 				"be less than donor file blocks [%u]."
 				"So adjust length from %llu to %llu "
 				"[ino:orig %lu, donor %lu]\n",
 				orig_start + *len, nr_donor_blks,
 				*len, clamped,
 				orig_inode->i_ino, donor_inode->i_ino);
 			*len = clamped;
 		}
 	}
 	if (*len == 0) {
 		ext4_debug("ext4 move extent: len should not be 0 "
 			"[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
 			donor_inode->i_ino);
 		return -EINVAL;
 	}
 	return 0;
 }
 /**
  * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
  *
  * @inode1:	the inode structure
  * @inode2:	the inode structure
  *
  * The inode with the smaller i_ino is locked first (I_MUTEX_PARENT), the
  * other one second (I_MUTEX_CHILD).  If both arguments are the same inode
  * its i_mutex is taken exactly once.
  *
  * Returns the value of mext_check_null_inode(); when that is negative no
  * lock has been taken.
  */
 static int
 mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
 {
 	struct inode *first, *second;
 	int status;
 	BUG_ON(!inode1 && !inode2);
 	status = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
 	if (status < 0)
 		return status;
 	/* Same inode twice: a single lock is enough */
 	if (inode1 == inode2) {
 		mutex_lock(&inode1->i_mutex);
 		return status;
 	}
 	/* Fix the locking order by inode number */
 	if (inode1->i_ino < inode2->i_ino) {
 		first = inode1;
 		second = inode2;
 	} else {
 		first = inode2;
 		second = inode1;
 	}
 	mutex_lock_nested(&first->i_mutex, I_MUTEX_PARENT);
 	mutex_lock_nested(&second->i_mutex, I_MUTEX_CHILD);
 	return status;
 }
 /**
  * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
  *
  * @inode1:     the inode that is released first
  * @inode2:     the inode that is released second
  *
  * @inode2 is skipped when it is the same inode as @inode1, so a shared
  * mutex is released only once.
  *
  * Returns the value of mext_check_null_inode(); when that is negative
  * nothing has been unlocked.
  */
 static int
 mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
 {
 	int status;
 	BUG_ON(!inode1 && !inode2);
 	status = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
 	if (status >= 0) {
 		if (inode1 != NULL)
 			mutex_unlock(&inode1->i_mutex);
 		if (inode2 != NULL && inode1 != inode2)
 			mutex_unlock(&inode2->i_mutex);
 	}
 	return status;
 }
 /**
  * ext4_move_extents - Exchange the specified range of a file
  *
  * @o_filp:		file structure of the original file
  * @d_filp:		file structure of the donor file
  * @orig_start:		start offset in block for orig
  * @donor_start:	start offset in block for donor
  * @len:		the number of blocks to be moved
  * @moved_len:		moved block length
  *
  * This function returns 0 and the moved block length is set in moved_len
  * if it succeeds, otherwise it returns an error value.
  * *moved_len is only ever added to here, so blocks exchanged before a
  * failure stay counted (presumably the caller zeroes it beforehand -
  * TODO confirm).
  *
  * Note: ext4_move_extents() proceeds in the following order.
  * 1:ext4_move_extents() calculates the last block number of the range to
  *   move from the start block number (orig_start) and the number of blocks
  *   to be moved (len) specified as arguments.
  *   If the {orig, donor}_start points to a hole, the extent's start offset
  *   pointed to by ext_cur (current extent), holecheck_path and orig_path
  *   is set to the first extent behind the hole.
  * 2:Continue step 3 to step 5, until the holecheck_path points to last_extent
  *   or the ext_cur exceeds the block_end which is last logical block number.
  * 3:To get the length of the contiguous area, call mext_next_extent()
  *   with the ext_cur (initial value is holecheck_path) repeatedly,
  *   until a non-contiguous extent is found, the start logical block number
  *   exceeds the block_end or the extent points to the last extent.
  * 4:Exchange the original inode data with donor inode data
  *   from orig_page_offset to seq_end_page.
  *   The start indexes of data are specified as arguments.
  *   That of the original inode is orig_page_offset,
  *   and the donor inode is also orig_page_offset
  *   (To easily handle blocksize != pagesize case, the offset for the
  *   donor inode is block unit).
  * 5:Update holecheck_path and orig_path to point to the next extent to be
  *   processed, then return to step 2.
  * 6:Release holecheck_path, orig_path and set the len to moved_len
  *   which shows the number of moved blocks.
  *   The moved_len is useful for the command to calculate the file offset
  *   for starting next move extent ioctl.
  * 7:Return 0 on success, or a negative error value on failure.
  */
 int
 ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		 __u64 orig_start, __u64 donor_start, __u64 len,
 		 __u64 *moved_len)
 {
 	struct inode *orig_inode = o_filp->f_dentry->d_inode;
 	struct inode *donor_inode = d_filp->f_dentry->d_inode;
 	struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL;
 	struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
 	ext4_lblk_t block_start = orig_start;
 	ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
 	ext4_lblk_t rest_blocks;
 	pgoff_t orig_page_offset = 0, seq_end_page;
 	int ret1, ret2, depth, last_extent = 0;
 	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
 	int data_offset_in_page;
 	int block_len_in_page;
 	int uninit;
 	/* orig and donor should be different file */
 	if (orig_inode->i_ino == donor_inode->i_ino) {
 		ext4_debug("ext4 move extent: The argument files should not "
 			"be same file [ino:orig %lu, donor %lu]\n",
 			orig_inode->i_ino, donor_inode->i_ino);
 		return -EINVAL;
 	}
 	/* Regular file check */
 	if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
 		ext4_debug("ext4 move extent: The argument files should be "
 			"regular file [ino:orig %lu, donor %lu]\n",
 			orig_inode->i_ino, donor_inode->i_ino);
 		return -EINVAL;
 	}
 	/* Protect orig and donor inodes against a truncate */
 	ret1 = mext_inode_double_lock(orig_inode, donor_inode);
 	if (ret1 < 0)
 		return ret1;
 	/* Protect extent tree against block allocations via delalloc */
 	double_down_write_data_sem(orig_inode, donor_inode);
 	/* Check the filesystem environment whether move_extent can be done */
 	ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
 				    donor_start, &len);
 	if (ret1)
 		goto out;
 	/* Trim len so that the range does not run past the last block of orig */
 	file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
 	block_end = block_start + len - 1;
 	if (file_end < block_end)
 		len -= block_end - file_end;
 	ret1 = get_ext_path(orig_inode, block_start, &orig_path);
 	if (ret1)
 		goto out;
 	/* Get path structure to check the hole */
 	ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
 	if (ret1)
 		goto out;
 	depth = ext_depth(orig_inode);
 	ext_cur = holecheck_path[depth].p_ext;
 	/*
 	 * Get proper starting location of block replacement if block_start was
 	 * within the hole.
 	 */
 	if (le32_to_cpu(ext_cur->ee_block) +
 		ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
 		/*
 		 * The hole exists between extents or the tail of
 		 * original file.
 		 */
 		last_extent = mext_next_extent(orig_inode,
 					holecheck_path, &ext_cur);
 		if (last_extent < 0) {
 			ret1 = last_extent;
 			goto out;
 		}
 		/* Advance orig_path the same way; the extent itself is unused */
 		last_extent = mext_next_extent(orig_inode, orig_path,
 							&ext_dummy);
 		if (last_extent < 0) {
 			ret1 = last_extent;
 			goto out;
 		}
 		seq_start = le32_to_cpu(ext_cur->ee_block);
 	} else if (le32_to_cpu(ext_cur->ee_block) > block_start)
 		/* The hole exists at the beginning of original file. */
 		seq_start = le32_to_cpu(ext_cur->ee_block);
 	else
 		seq_start = block_start;
 	/* No blocks within the specified range. */
 	if (le32_to_cpu(ext_cur->ee_block) > block_end) {
 		ext4_debug("ext4 move extent: The specified range of file "
 							"may be the hole\n");
 		ret1 = -EINVAL;
 		goto out;
 	}
 	/* Adjust start blocks */
 	add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
 			 ext4_ext_get_actual_len(ext_cur), block_end + 1) -
 		     max(le32_to_cpu(ext_cur->ee_block), block_start);
 	/*
 	 * Each iteration either extends the current run of mergeable extents
 	 * (the "continue" below) or exchanges the run collected so far.
 	 */
 	while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
 		seq_blocks += add_blocks;
 		/* Adjust tail blocks */
 		if (seq_start + seq_blocks - 1 > block_end)
 			seq_blocks = block_end - seq_start + 1;
 		ext_prev = ext_cur;
 		last_extent = mext_next_extent(orig_inode, holecheck_path,
 						&ext_cur);
 		if (last_extent < 0) {
 			ret1 = last_extent;
 			break;
 		}
 		add_blocks = ext4_ext_get_actual_len(ext_cur);
 		/*
 		 * Extend the length of contiguous block (seq_blocks)
 		 * if extents are contiguous.
 		 */
 		if (ext4_can_extents_be_merged(orig_inode,
 					       ext_prev, ext_cur) &&
 		    block_end >= le32_to_cpu(ext_cur->ee_block) &&
 		    !last_extent)
 			continue;
 		/* Is original extent is uninitialized */
 		uninit = ext4_ext_is_uninitialized(ext_prev);
 		data_offset_in_page = seq_start % blocks_per_page;
 		/*
 		 * Calculate data blocks count that should be swapped
 		 * at the first page.
 		 */
 		if (data_offset_in_page + seq_blocks > blocks_per_page) {
 			/* Swapped blocks are across pages */
 			block_len_in_page =
 					blocks_per_page - data_offset_in_page;
 		} else {
 			/* Swapped blocks are in a page */
 			block_len_in_page = seq_blocks;
 		}
 		/* First and last page index covered by the current run */
 		orig_page_offset = seq_start >>
 				(PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
 		seq_end_page = (seq_start + seq_blocks - 1) >>
 				(PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
 		/*
 		 * The page range above is final, so seq_start can already be
 		 * moved to the extent at which the next run will begin.
 		 */
 		seq_start = le32_to_cpu(ext_cur->ee_block);
 		rest_blocks = seq_blocks;
 		/*
 		 * Up semaphore to avoid following problems:
 		 * a. transaction deadlock among ext4_journal_start,
 		 *    ->write_begin via pagefault, and jbd2_journal_commit
 		 * b. racing with ->readpage, ->write_begin, and ext4_get_block
 		 *    in move_extent_per_page
 		 */
 		double_up_write_data_sem(orig_inode, donor_inode);
 		while (orig_page_offset <= seq_end_page) {
 			/* Swap original branches with new branches */
 			block_len_in_page = move_extent_per_page(
 						o_filp, donor_inode,
 						orig_page_offset,
 						data_offset_in_page,
 						block_len_in_page, uninit,
 						&ret1);
 			/* Count how many blocks we have exchanged */
 			*moved_len += block_len_in_page;
 			if (ret1 < 0)
 				break;
 			if (*moved_len > len) {
 				EXT4_ERROR_INODE(orig_inode,
 					"We replaced blocks too much! "
 					"sum of replaced: %llu requested: %llu",
 					*moved_len, len);
 				ret1 = -EIO;
 				break;
 			}
 			/* Every page after the first one starts at block 0 */
 			orig_page_offset++;
 			data_offset_in_page = 0;
 			rest_blocks -= block_len_in_page;
 			if (rest_blocks > blocks_per_page)
 				block_len_in_page = blocks_per_page;
 			else
 				block_len_in_page = rest_blocks;
 		}
 		/* Re-take the semaphores; they are released again at "out" */
 		double_down_write_data_sem(orig_inode, donor_inode);
 		if (ret1 < 0)
 			break;
 		/* Decrease buffer counter */
 		if (holecheck_path)
 			ext4_ext_drop_refs(holecheck_path);
 		ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
 		if (ret1)
 			break;
 		depth = holecheck_path->p_depth;
 		/* Decrease buffer counter */
 		if (orig_path)
 			ext4_ext_drop_refs(orig_path);
 		ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
 		if (ret1)
 			break;
 		/* Start collecting the next run from the re-read extent */
 		ext_cur = holecheck_path[depth].p_ext;
 		add_blocks = ext4_ext_get_actual_len(ext_cur);
 		seq_blocks = 0;
 	}
 out:
 	/* Preallocations of both inodes are dropped once any block has moved */
 	if (*moved_len) {
 		ext4_discard_preallocations(orig_inode);
 		ext4_discard_preallocations(donor_inode);
 	}
 	if (orig_path) {
 		ext4_ext_drop_refs(orig_path);
 		kfree(orig_path);
 	}
 	if (holecheck_path) {
 		ext4_ext_drop_refs(holecheck_path);
 		kfree(holecheck_path);
 	}
 	double_up_write_data_sem(orig_inode, donor_inode);
 	ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
 	/* An error from the move itself wins over an unlock error */
 	if (ret1)
 		return ret1;
 	else if (ret2)
 		return ret2;
 	return 0;
 }