Doug / smarc-fsl-linux-kernel

1

/*

1

/*

2

* linux/fs/ext4/inode.c

2

* linux/fs/ext4/inode.c

3

*

3

*

4

5

* Remy Card (card@masi.ibp.fr)

5

* Remy Card (card@masi.ibp.fr)

6

* Laboratoire MASI - Institut Blaise Pascal

6

* Laboratoire MASI - Institut Blaise Pascal

7

* Universite Pierre et Marie Curie (Paris VI)

7

* Universite Pierre et Marie Curie (Paris VI)

8

*

8

*

9

* from

9

* from

10

*

10

*

11

* linux/fs/minix/inode.c

11

* linux/fs/minix/inode.c

12

*

12

*

13

14

*

14

*

15

* Goal-directed block allocation by Stephen Tweedie

15

* Goal-directed block allocation by Stephen Tweedie

16

* (sct@redhat.com), 1993, 1998

16

* (sct@redhat.com), 1993, 1998

17

* Big-endian to little-endian byte-swapping/bitmaps by

17

* Big-endian to little-endian byte-swapping/bitmaps by

18

* David S. Miller (davem@caip.rutgers.edu), 1995

18

* David S. Miller (davem@caip.rutgers.edu), 1995

19

* 64-bit file support on 64-bit platforms by Jakub Jelinek

19

* 64-bit file support on 64-bit platforms by Jakub Jelinek

20

* (jj@sunsite.ms.mff.cuni.cz)

20

* (jj@sunsite.ms.mff.cuni.cz)

21

*

21

*

22

* Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000

22

* Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000

23

*/

23

*/

24

25

#include <linux/module.h>

25

#include <linux/module.h>

26

#include <linux/fs.h>

26

#include <linux/fs.h>

27

#include <linux/time.h>

27

#include <linux/time.h>

28

#include <linux/jbd2.h>

28

#include <linux/jbd2.h>

29

#include <linux/highuid.h>

29

#include <linux/highuid.h>

30

#include <linux/pagemap.h>

30

#include <linux/pagemap.h>

31

#include <linux/quotaops.h>

31

#include <linux/quotaops.h>

32

#include <linux/string.h>

32

#include <linux/string.h>

33

#include <linux/buffer_head.h>

33

#include <linux/buffer_head.h>

34

#include <linux/writeback.h>

34

#include <linux/writeback.h>

35

#include <linux/mpage.h>

35

#include <linux/mpage.h>

36

#include <linux/uio.h>

36

#include <linux/uio.h>

37

#include <linux/bio.h>

37

#include <linux/bio.h>

38

#include "ext4_jbd2.h"

38

#include "ext4_jbd2.h"

39

#include "xattr.h"

39

#include "xattr.h"

40

#include "acl.h"

40

#include "acl.h"

41

42

/*

42

/*

43

* Test whether an inode is a fast symlink.

43

* Test whether an inode is a fast symlink.

44

*/

44

*/

45

static int ext4_inode_is_fast_symlink(struct inode *inode)

45

static int ext4_inode_is_fast_symlink(struct inode *inode)

46

{

46

{

47

int ea_blocks = EXT4_I(inode)->i_file_acl ?

47

int ea_blocks = EXT4_I(inode)->i_file_acl ?

48

(inode->i_sb->s_blocksize >> 9) : 0;

48

(inode->i_sb->s_blocksize >> 9) : 0;

49

50

return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);

50

return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);

51

}

51

}

52

53

/*

53

/*

54

* The ext4 forget function must perform a revoke if we are freeing data

54

* The ext4 forget function must perform a revoke if we are freeing data

55

* which has been journaled. Metadata (eg. indirect blocks) must be

55

* which has been journaled. Metadata (eg. indirect blocks) must be

56

* revoked in all cases.

56

* revoked in all cases.

57

*

57

*

58

* "bh" may be NULL: a metadata block may have been freed from memory

58

* "bh" may be NULL: a metadata block may have been freed from memory

59

* but there may still be a record of it in the journal, and that record

59

* but there may still be a record of it in the journal, and that record

60

* still needs to be revoked.

60

* still needs to be revoked.

61

*/

61

*/

62

int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,

62

int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,

63

struct buffer_head *bh, ext4_fsblk_t blocknr)

63

struct buffer_head *bh, ext4_fsblk_t blocknr)

64

{

64

{

65

int err;

65

int err;

66

67

might_sleep();

67

might_sleep();

68

69

BUFFER_TRACE(bh, "enter");

69

BUFFER_TRACE(bh, "enter");

70

71

jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "

71

jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "

72

"data mode %lx\n",

72

"data mode %lx\n",

73

bh, is_metadata, inode->i_mode,

73

bh, is_metadata, inode->i_mode,

74

test_opt(inode->i_sb, DATA_FLAGS));

74

test_opt(inode->i_sb, DATA_FLAGS));

75

76

/* Never use the revoke function if we are doing full data

76

/* Never use the revoke function if we are doing full data

77

* journaling: there is no need to, and a V1 superblock won't

77

* journaling: there is no need to, and a V1 superblock won't

78

* support it. Otherwise, only skip the revoke on un-journaled

78

* support it. Otherwise, only skip the revoke on un-journaled

79

* data blocks. */

79

* data blocks. */

80

81

if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||

81

if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||

82

(!is_metadata && !ext4_should_journal_data(inode))) {

82

(!is_metadata && !ext4_should_journal_data(inode))) {

83

if (bh) {

83

if (bh) {

84

BUFFER_TRACE(bh, "call jbd2_journal_forget");

84

BUFFER_TRACE(bh, "call jbd2_journal_forget");

85

return ext4_journal_forget(handle, bh);

85

return ext4_journal_forget(handle, bh);

86

}

86

}

87

return 0;

87

return 0;

88

}

88

}

89

90

/*

90

/*

91

* data!=journal && (is_metadata || should_journal_data(inode))

91

* data!=journal && (is_metadata || should_journal_data(inode))

92

*/

92

*/

93

BUFFER_TRACE(bh, "call ext4_journal_revoke");

93

BUFFER_TRACE(bh, "call ext4_journal_revoke");

94

err = ext4_journal_revoke(handle, blocknr, bh);

94

err = ext4_journal_revoke(handle, blocknr, bh);

95

if (err)

95

if (err)

96

ext4_abort(inode->i_sb, __func__,

96

ext4_abort(inode->i_sb, __func__,

97

"error %d when attempting revoke", err);

97

"error %d when attempting revoke", err);

98

BUFFER_TRACE(bh, "exit");

98

BUFFER_TRACE(bh, "exit");

99

return err;

99

return err;

100

}

100

}

101

102

/*

102

/*

103

* Work out how many blocks we need to proceed with the next chunk of a

103

* Work out how many blocks we need to proceed with the next chunk of a

104

* truncate transaction.

104

* truncate transaction.

105

*/

105

*/

106

static unsigned long blocks_for_truncate(struct inode *inode)

106

static unsigned long blocks_for_truncate(struct inode *inode)

107

{

107

{

108

ext4_lblk_t needed;

108

ext4_lblk_t needed;

109

110

needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);

110

needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);

111

112

/* Give ourselves just enough room to cope with inodes in which

112

/* Give ourselves just enough room to cope with inodes in which

113

* i_blocks is corrupt: we've seen disk corruptions in the past

113

* i_blocks is corrupt: we've seen disk corruptions in the past

114

* which resulted in random data in an inode which looked enough

114

* which resulted in random data in an inode which looked enough

115

* like a regular file for ext4 to try to delete it. Things

115

* like a regular file for ext4 to try to delete it. Things

116

* will go a bit crazy if that happens, but at least we should

116

* will go a bit crazy if that happens, but at least we should

117

* try not to panic the whole kernel. */

117

* try not to panic the whole kernel. */

118

if (needed < 2)

118

if (needed < 2)

119

needed = 2;

119

needed = 2;

120

121

/* But we need to bound the transaction so we don't overflow the

121

/* But we need to bound the transaction so we don't overflow the

122

* journal. */

122

* journal. */

123

if (needed > EXT4_MAX_TRANS_DATA)

123

if (needed > EXT4_MAX_TRANS_DATA)

124

needed = EXT4_MAX_TRANS_DATA;

124

needed = EXT4_MAX_TRANS_DATA;

125

126

return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;

126

return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;

127

}

127

}

128

129

/*

129

/*

130

* Truncate transactions can be complex and absolutely huge. So we need to

130

* Truncate transactions can be complex and absolutely huge. So we need to

131

* be able to restart the transaction at a conventient checkpoint to make

131

* be able to restart the transaction at a conventient checkpoint to make

132

* sure we don't overflow the journal.

132

* sure we don't overflow the journal.

133

*

133

*

134

* start_transaction gets us a new handle for a truncate transaction,

134

* start_transaction gets us a new handle for a truncate transaction,

135

* and extend_transaction tries to extend the existing one a bit. If

135

* and extend_transaction tries to extend the existing one a bit. If

136

* extend fails, we need to propagate the failure up and restart the

136

* extend fails, we need to propagate the failure up and restart the

137

* transaction in the top-level truncate loop. --sct

137

* transaction in the top-level truncate loop. --sct

138

*/

138

*/

139

static handle_t *start_transaction(struct inode *inode)

139

static handle_t *start_transaction(struct inode *inode)

140

{

140

{

141

handle_t *result;

141

handle_t *result;

142

143

result = ext4_journal_start(inode, blocks_for_truncate(inode));

143

result = ext4_journal_start(inode, blocks_for_truncate(inode));

144

if (!IS_ERR(result))

144

if (!IS_ERR(result))

145

return result;

145

return result;

146

147

ext4_std_error(inode->i_sb, PTR_ERR(result));

147

ext4_std_error(inode->i_sb, PTR_ERR(result));

148

return result;

148

return result;

149

}

149

}

150

151

/*

151

/*

152

* Try to extend this transaction for the purposes of truncation.

152

* Try to extend this transaction for the purposes of truncation.

153

*

153

*

154

* Returns 0 if we managed to create more room. If we can't create more

154

* Returns 0 if we managed to create more room. If we can't create more

155

* room, and the transaction must be restarted we return 1.

155

* room, and the transaction must be restarted we return 1.

156

*/

156

*/

157

static int try_to_extend_transaction(handle_t *handle, struct inode *inode)

157

static int try_to_extend_transaction(handle_t *handle, struct inode *inode)

158

{

158

{

159

if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)

159

if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)

160

return 0;

160

return 0;

161

if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))

161

if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))

162

return 0;

162

return 0;

163

return 1;

163

return 1;

164

}

164

}

165

166

/*

166

/*

167

* Restart the transaction associated with *handle. This does a commit,

167

* Restart the transaction associated with *handle. This does a commit,

168

* so before we call here everything must be consistently dirtied against

168

* so before we call here everything must be consistently dirtied against

169

* this transaction.

169

* this transaction.

170

*/

170

*/

171

static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)

171

static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)

172

{

172

{

173

jbd_debug(2, "restarting handle %p\n", handle);

173

jbd_debug(2, "restarting handle %p\n", handle);

174

return ext4_journal_restart(handle, blocks_for_truncate(inode));

174

return ext4_journal_restart(handle, blocks_for_truncate(inode));

175

}

175

}

176

177

/*

177

/*

178

* Called at the last iput() if i_nlink is zero.

178

* Called at the last iput() if i_nlink is zero.

179

*/

179

*/

180

void ext4_delete_inode (struct inode * inode)

180

void ext4_delete_inode (struct inode * inode)

181

{

181

{

182

handle_t *handle;

182

handle_t *handle;

183

184

truncate_inode_pages(&inode->i_data, 0);

184

truncate_inode_pages(&inode->i_data, 0);

185

186

if (is_bad_inode(inode))

186

if (is_bad_inode(inode))

187

goto no_delete;

187

goto no_delete;

188

189

handle = start_transaction(inode);

189

handle = start_transaction(inode);

190

if (IS_ERR(handle)) {

190

if (IS_ERR(handle)) {

191

/*

191

/*

192

* If we're going to skip the normal cleanup, we still need to

192

* If we're going to skip the normal cleanup, we still need to

193

* make sure that the in-core orphan linked list is properly

193

* make sure that the in-core orphan linked list is properly

194

* cleaned up.

194

* cleaned up.

195

*/

195

*/

196

ext4_orphan_del(NULL, inode);

196

ext4_orphan_del(NULL, inode);

197

goto no_delete;

197

goto no_delete;

198

}

198

}

199

200

if (IS_SYNC(inode))

200

if (IS_SYNC(inode))

201

handle->h_sync = 1;

201

handle->h_sync = 1;

202

inode->i_size = 0;

202

inode->i_size = 0;

203

if (inode->i_blocks)

203

if (inode->i_blocks)

204

ext4_truncate(inode);

204

ext4_truncate(inode);

205

/*

205

/*

206

* Kill off the orphan record which ext4_truncate created.

206

* Kill off the orphan record which ext4_truncate created.

207

* AKPM: I think this can be inside the above `if'.

207

* AKPM: I think this can be inside the above `if'.

208

* Note that ext4_orphan_del() has to be able to cope with the

208

* Note that ext4_orphan_del() has to be able to cope with the

209

* deletion of a non-existent orphan - this is because we don't

209

* deletion of a non-existent orphan - this is because we don't

210

* know if ext4_truncate() actually created an orphan record.

210

* know if ext4_truncate() actually created an orphan record.

211

* (Well, we could do this if we need to, but heck - it works)

211

* (Well, we could do this if we need to, but heck - it works)

212

*/

212

*/

213

ext4_orphan_del(handle, inode);

213

ext4_orphan_del(handle, inode);

214

EXT4_I(inode)->i_dtime = get_seconds();

214

EXT4_I(inode)->i_dtime = get_seconds();

215

216

/*

216

/*

217

* One subtle ordering requirement: if anything has gone wrong

217

* One subtle ordering requirement: if anything has gone wrong

218

* (transaction abort, IO errors, whatever), then we can still

218

* (transaction abort, IO errors, whatever), then we can still

219

* do these next steps (the fs will already have been marked as

219

* do these next steps (the fs will already have been marked as

220

* having errors), but we can't free the inode if the mark_dirty

220

* having errors), but we can't free the inode if the mark_dirty

221

* fails.

221

* fails.

222

*/

222

*/

223

if (ext4_mark_inode_dirty(handle, inode))

223

if (ext4_mark_inode_dirty(handle, inode))

224

/* If that failed, just do the required in-core inode clear. */

224

/* If that failed, just do the required in-core inode clear. */

225

clear_inode(inode);

225

clear_inode(inode);

226

else

226

else

227

ext4_free_inode(handle, inode);

227

ext4_free_inode(handle, inode);

228

ext4_journal_stop(handle);

228

ext4_journal_stop(handle);

229

return;

229

return;

230

no_delete:

230

no_delete:

231

clear_inode(inode); /* We must guarantee clearing of inode... */

231

clear_inode(inode); /* We must guarantee clearing of inode... */

232

}

232

}

233

234

typedef struct {

234

typedef struct {

235

__le32 *p;

235

__le32 *p;

236

__le32 key;

236

__le32 key;

237

struct buffer_head *bh;

237

struct buffer_head *bh;

238

} Indirect;

238

} Indirect;

239

240

/*
 * Fill in one chain link: remember the slot address @v, snapshot the
 * block number currently stored there, and record the buffer @bh that
 * hosts the slot.
 */
static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
{
	p->p = v;
	p->key = *v;
	p->bh = bh;
}

245

246

/**

246

/**

247

* ext4_block_to_path - parse the block number into array of offsets

247

* ext4_block_to_path - parse the block number into array of offsets

248

* @inode: inode in question (we are only interested in its superblock)

248

* @inode: inode in question (we are only interested in its superblock)

249

* @i_block: block number to be parsed

249

* @i_block: block number to be parsed

250

* @offsets: array to store the offsets in

250

* @offsets: array to store the offsets in

251

* @boundary: set this non-zero if the referred-to block is likely to be

251

* @boundary: set this non-zero if the referred-to block is likely to be

252

* followed (on disk) by an indirect block.

252

* followed (on disk) by an indirect block.

253

*

253

*

254

* To store the locations of file's data ext4 uses a data structure common

254

* To store the locations of file's data ext4 uses a data structure common

255

* for UNIX filesystems - tree of pointers anchored in the inode, with

255

* for UNIX filesystems - tree of pointers anchored in the inode, with

256

* data blocks at leaves and indirect blocks in intermediate nodes.

256

* data blocks at leaves and indirect blocks in intermediate nodes.

257

* This function translates the block number into path in that tree -

257

* This function translates the block number into path in that tree -

258

* return value is the path length and @offsets[n] is the offset of

258

* return value is the path length and @offsets[n] is the offset of

259

* pointer to (n+1)th node in the nth one. If @block is out of range

259

* pointer to (n+1)th node in the nth one. If @block is out of range

260

* (negative or too large) warning is printed and zero returned.

260

* (negative or too large) warning is printed and zero returned.

261

*

261

*

262

* Note: function doesn't find node addresses, so no IO is needed. All

262

* Note: function doesn't find node addresses, so no IO is needed. All

263

* we need to know is the capacity of indirect blocks (taken from the

263

* we need to know is the capacity of indirect blocks (taken from the

264

* inode->i_sb).

264

* inode->i_sb).

265

*/

265

*/

266

267

/*

267

/*

268

* Portability note: the last comparison (check that we fit into triple

268

* Portability note: the last comparison (check that we fit into triple

269

* indirect block) is spelled differently, because otherwise on an

269

* indirect block) is spelled differently, because otherwise on an

270

* architecture with 32-bit longs and 8Kb pages we might get into trouble

270

* architecture with 32-bit longs and 8Kb pages we might get into trouble

271

* if our filesystem had 8Kb blocks. We might use long long, but that would

271

* if our filesystem had 8Kb blocks. We might use long long, but that would

272

* kill us on x86. Oh, well, at least the sign propagation does not matter -

272

* kill us on x86. Oh, well, at least the sign propagation does not matter -

273

* i_block would have to be negative in the very beginning, so we would not

273

* i_block would have to be negative in the very beginning, so we would not

274

* get there at all.

274

* get there at all.

275

*/

275

*/

276

277

static int ext4_block_to_path(struct inode *inode,

277

static int ext4_block_to_path(struct inode *inode,

278

ext4_lblk_t i_block,

278

ext4_lblk_t i_block,

279

ext4_lblk_t offsets[4], int *boundary)

279

ext4_lblk_t offsets[4], int *boundary)

280

{

280

{

281

int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);

281

int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);

282

int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);

282

int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);

283

const long direct_blocks = EXT4_NDIR_BLOCKS,

283

const long direct_blocks = EXT4_NDIR_BLOCKS,

284

indirect_blocks = ptrs,

284

indirect_blocks = ptrs,

285

double_blocks = (1 << (ptrs_bits * 2));

285

double_blocks = (1 << (ptrs_bits * 2));

286

int n = 0;

286

int n = 0;

287

int final = 0;

287

int final = 0;

288

289

if (i_block < 0) {

289

if (i_block < 0) {

290

ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0");

290

ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0");

291

} else if (i_block < direct_blocks) {

291

} else if (i_block < direct_blocks) {

292

offsets[n++] = i_block;

292

offsets[n++] = i_block;

293

final = direct_blocks;

293

final = direct_blocks;

294

} else if ( (i_block -= direct_blocks) < indirect_blocks) {

294

} else if ( (i_block -= direct_blocks) < indirect_blocks) {

295

offsets[n++] = EXT4_IND_BLOCK;

295

offsets[n++] = EXT4_IND_BLOCK;

296

offsets[n++] = i_block;

296

offsets[n++] = i_block;

297

final = ptrs;

297

final = ptrs;

298

} else if ((i_block -= indirect_blocks) < double_blocks) {

298

} else if ((i_block -= indirect_blocks) < double_blocks) {

299

offsets[n++] = EXT4_DIND_BLOCK;

299

offsets[n++] = EXT4_DIND_BLOCK;

300

offsets[n++] = i_block >> ptrs_bits;

300

offsets[n++] = i_block >> ptrs_bits;

301

offsets[n++] = i_block & (ptrs - 1);

301

offsets[n++] = i_block & (ptrs - 1);

302

final = ptrs;

302

final = ptrs;

303

} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {

303

} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {

304

offsets[n++] = EXT4_TIND_BLOCK;

304

offsets[n++] = EXT4_TIND_BLOCK;

305

offsets[n++] = i_block >> (ptrs_bits * 2);

305

offsets[n++] = i_block >> (ptrs_bits * 2);

306

offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);

306

offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);

307

offsets[n++] = i_block & (ptrs - 1);

307

offsets[n++] = i_block & (ptrs - 1);

308

final = ptrs;

308

final = ptrs;

309

} else {

309

} else {

310

ext4_warning(inode->i_sb, "ext4_block_to_path",

310

ext4_warning(inode->i_sb, "ext4_block_to_path",

311

"block %lu > max",

311

"block %lu > max",

312

i_block + direct_blocks +

312

i_block + direct_blocks +

313

indirect_blocks + double_blocks);

313

indirect_blocks + double_blocks);

314

}

314

}

315

if (boundary)

315

if (boundary)

316

*boundary = final - 1 - (i_block & (ptrs - 1));

316

*boundary = final - 1 - (i_block & (ptrs - 1));

317

return n;

317

return n;

318

}

318

}

319

320

/**

320

/**

321

* ext4_get_branch - read the chain of indirect blocks leading to data

321

* ext4_get_branch - read the chain of indirect blocks leading to data

322

* @inode: inode in question

322

* @inode: inode in question

323

* @depth: depth of the chain (1 - direct pointer, etc.)

323

* @depth: depth of the chain (1 - direct pointer, etc.)

324

* @offsets: offsets of pointers in inode/indirect blocks

324

* @offsets: offsets of pointers in inode/indirect blocks

325

* @chain: place to store the result

325

* @chain: place to store the result

326

* @err: here we store the error value

326

* @err: here we store the error value

327

*

327

*

328

* Function fills the array of triples <key, p, bh> and returns %NULL

328

* Function fills the array of triples <key, p, bh> and returns %NULL

329

* if everything went OK or the pointer to the last filled triple

329

* if everything went OK or the pointer to the last filled triple

330

* (incomplete one) otherwise. Upon the return chain[i].key contains

330

* (incomplete one) otherwise. Upon the return chain[i].key contains

331

* the number of (i+1)-th block in the chain (as it is stored in memory,

331

* the number of (i+1)-th block in the chain (as it is stored in memory,

332

* i.e. little-endian 32-bit), chain[i].p contains the address of that

332

* i.e. little-endian 32-bit), chain[i].p contains the address of that

333

* number (it points into struct inode for i==0 and into the bh->b_data

333

* number (it points into struct inode for i==0 and into the bh->b_data

334

* for i>0) and chain[i].bh points to the buffer_head of i-th indirect

334

* for i>0) and chain[i].bh points to the buffer_head of i-th indirect

335

* block for i>0 and NULL for i==0. In other words, it holds the block

335

* block for i>0 and NULL for i==0. In other words, it holds the block

336

* numbers of the chain, addresses they were taken from (and where we can

336

* numbers of the chain, addresses they were taken from (and where we can

337

* verify that chain did not change) and buffer_heads hosting these

337

* verify that chain did not change) and buffer_heads hosting these

338

* numbers.

338

* numbers.

339

*

339

*

340

* Function stops when it stumbles upon zero pointer (absent block)

340

* Function stops when it stumbles upon zero pointer (absent block)

341

* (pointer to last triple returned, *@err == 0)

341

* (pointer to last triple returned, *@err == 0)

342

* or when it gets an IO error reading an indirect block

342

* or when it gets an IO error reading an indirect block

343

* (ditto, *@err == -EIO)

343

* (ditto, *@err == -EIO)

344

* or when it reads all @depth-1 indirect blocks successfully and finds

344

* or when it reads all @depth-1 indirect blocks successfully and finds

345

* the whole chain, all way to the data (returns %NULL, *err == 0).

345

* the whole chain, all way to the data (returns %NULL, *err == 0).

346

*

346

*

347

* Need to be called with

347

* Need to be called with

348

* down_read(&EXT4_I(inode)->i_data_sem)

348

* down_read(&EXT4_I(inode)->i_data_sem)

349

*/

349

*/

350

static Indirect *ext4_get_branch(struct inode *inode, int depth,

350

static Indirect *ext4_get_branch(struct inode *inode, int depth,

351

ext4_lblk_t *offsets,

351

ext4_lblk_t *offsets,

352

Indirect chain[4], int *err)

352

Indirect chain[4], int *err)

353

{

353

{

354

struct super_block *sb = inode->i_sb;

354

struct super_block *sb = inode->i_sb;

355

Indirect *p = chain;

355

Indirect *p = chain;

356

struct buffer_head *bh;

356

struct buffer_head *bh;

357

358

*err = 0;

358

*err = 0;

359

/* i_data is not going away, no lock needed */

359

/* i_data is not going away, no lock needed */

360

add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets);

360

add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets);

361

if (!p->key)

361

if (!p->key)

362

goto no_block;

362

goto no_block;

363

while (--depth) {

363

while (--depth) {

364

bh = sb_bread(sb, le32_to_cpu(p->key));

364

bh = sb_bread(sb, le32_to_cpu(p->key));

365

if (!bh)

365

if (!bh)

366

goto failure;

366

goto failure;

367

add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);

367

add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);

368

/* Reader: end */

368

/* Reader: end */

369

if (!p->key)

369

if (!p->key)

370

goto no_block;

370

goto no_block;

371

}

371

}

372

return NULL;

372

return NULL;

373

374

failure:

374

failure:

375

*err = -EIO;

375

*err = -EIO;

376

no_block:

376

no_block:

377

return p;

377

return p;

378

}

378

}

379

380

/**

380

/**

381

* ext4_find_near - find a place for allocation with sufficient locality

381

* ext4_find_near - find a place for allocation with sufficient locality

382

* @inode: owner

382

* @inode: owner

383

* @ind: descriptor of indirect block.

383

* @ind: descriptor of indirect block.

384

*

384

*

385

* This function returns the preferred place for block allocation.

385

* This function returns the preferred place for block allocation.

386

* It is used when heuristic for sequential allocation fails.

386

* It is used when heuristic for sequential allocation fails.

387

* Rules are:

387

* Rules are:

388

* + if there is a block to the left of our position - allocate near it.

388

* + if there is a block to the left of our position - allocate near it.

389

* + if pointer will live in indirect block - allocate near that block.

389

* + if pointer will live in indirect block - allocate near that block.

390

* + if pointer will live in inode - allocate in the same

390

* + if pointer will live in inode - allocate in the same

391

* cylinder group.

391

* cylinder group.

392

*

392

*

393

* In the latter case we colour the starting block by the callers PID to

393

* In the latter case we colour the starting block by the callers PID to

394

* prevent it from clashing with concurrent allocations for a different inode

394

* prevent it from clashing with concurrent allocations for a different inode

395

* in the same block group. The PID is used here so that functionally related

395

* in the same block group. The PID is used here so that functionally related

396

* files will be close-by on-disk.

396

* files will be close-by on-disk.

397

*

397

*

398

* Caller must make sure that @ind is valid and will stay that way.

398

* Caller must make sure that @ind is valid and will stay that way.

399

*/

399

*/

400

static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)

400

static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)

401

{

401

{

402

struct ext4_inode_info *ei = EXT4_I(inode);

402

struct ext4_inode_info *ei = EXT4_I(inode);

403

__le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;

403

__le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;

404

__le32 *p;

404

__le32 *p;

405

ext4_fsblk_t bg_start;

405

ext4_fsblk_t bg_start;

406

ext4_fsblk_t last_block;

406

ext4_fsblk_t last_block;

407

ext4_grpblk_t colour;

407

ext4_grpblk_t colour;

408

409

/* Try to find previous block */

409

/* Try to find previous block */

410

for (p = ind->p - 1; p >= start; p--) {

410

for (p = ind->p - 1; p >= start; p--) {

411

if (*p)

411

if (*p)

412

return le32_to_cpu(*p);

412

return le32_to_cpu(*p);

413

}

413

}

414

415

/* No such thing, so let's try location of indirect block */

415

/* No such thing, so let's try location of indirect block */

416

if (ind->bh)

416

if (ind->bh)

417

return ind->bh->b_blocknr;

417

return ind->bh->b_blocknr;

418

419

/*

419

/*

420

* It is going to be referred to from the inode itself? OK, just put it

420

* It is going to be referred to from the inode itself? OK, just put it

421

* into the same cylinder group then.

421

* into the same cylinder group then.

422

*/

422

*/

423

bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);

423

bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);

424

last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

424

last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

425

426

if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)

426

if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)

427

colour = (current->pid % 16) *

427

colour = (current->pid % 16) *

428

(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);

428

(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);

429

else

429

else

430

colour = (current->pid % 16) * ((last_block - bg_start) / 16);

430

colour = (current->pid % 16) * ((last_block - bg_start) / 16);

431

return bg_start + colour;

431

return bg_start + colour;

432

}

432

}

433

434

/**

434

/**

435

* ext4_find_goal - find a preferred place for allocation.

435

* ext4_find_goal - find a preferred place for allocation.

436

* @inode: owner

436

* @inode: owner

437

* @block: block we want

437

* @block: block we want

438

* @partial: pointer to the last triple within a chain

438

* @partial: pointer to the last triple within a chain

439

*

439

*

440

* Normally this function find the preferred place for block allocation,

440

* Normally this function find the preferred place for block allocation,

441

* returns it.

441

* returns it.

442

*/

442

*/

443

static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,

443

static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,

444

Indirect *partial)

444

Indirect *partial)

445

{

445

{

446

struct ext4_block_alloc_info *block_i;

446

struct ext4_block_alloc_info *block_i;

447

448

block_i = EXT4_I(inode)->i_block_alloc_info;

448

block_i = EXT4_I(inode)->i_block_alloc_info;

449

450

/*

450

/*

451

* try the heuristic for sequential allocation,

451

* try the heuristic for sequential allocation,

452

* failing that at least try to get decent locality.

452

* failing that at least try to get decent locality.

453

*/

453

*/

454

if (block_i && (block == block_i->last_alloc_logical_block + 1)

454

if (block_i && (block == block_i->last_alloc_logical_block + 1)

455

&& (block_i->last_alloc_physical_block != 0)) {

455

&& (block_i->last_alloc_physical_block != 0)) {

456

return block_i->last_alloc_physical_block + 1;

456

return block_i->last_alloc_physical_block + 1;

457

}

457

}

458

459

return ext4_find_near(inode, partial);

459

return ext4_find_near(inode, partial);

460

}

460

}

461

462

/**

462

/**

463

* ext4_blks_to_allocate: Look up the block map and count the number

463

* ext4_blks_to_allocate: Look up the block map and count the number

464

* of direct blocks need to be allocated for the given branch.

464

* of direct blocks need to be allocated for the given branch.

465

*

465

*

466

* @branch: chain of indirect blocks

466

* @branch: chain of indirect blocks

467

* @k: number of blocks need for indirect blocks

467

* @k: number of blocks need for indirect blocks

468

* @blks: number of data blocks to be mapped.

468

* @blks: number of data blocks to be mapped.

469

* @blocks_to_boundary: the offset in the indirect block

469

* @blocks_to_boundary: the offset in the indirect block

470

*

470

*

471

* return the total number of blocks to be allocate, including the

471

* return the total number of blocks to be allocate, including the

472

* direct and indirect blocks.

472

* direct and indirect blocks.

473

*/

473

*/

474

static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,

474

static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,

475

int blocks_to_boundary)

475

int blocks_to_boundary)

476

{

476

{

477

unsigned long count = 0;

477

unsigned long count = 0;

478

479

/*

479

/*

480

* Simple case, [t,d]Indirect block(s) has not allocated yet

480

* Simple case, [t,d]Indirect block(s) has not allocated yet

481

* then it's clear blocks on that path have not allocated

481

* then it's clear blocks on that path have not allocated

482

*/

482

*/

483

if (k > 0) {

483

if (k > 0) {

484

/* right now we don't handle cross boundary allocation */

484

/* right now we don't handle cross boundary allocation */

485

if (blks < blocks_to_boundary + 1)

485

if (blks < blocks_to_boundary + 1)

486

count += blks;

486

count += blks;

487

else

487

else

488

count += blocks_to_boundary + 1;

488

count += blocks_to_boundary + 1;

489

return count;

489

return count;

490

}

490

}

491

492

count++;

492

count++;

493

while (count < blks && count <= blocks_to_boundary &&

493

while (count < blks && count <= blocks_to_boundary &&

494

le32_to_cpu(*(branch[0].p + count)) == 0) {

494

le32_to_cpu(*(branch[0].p + count)) == 0) {

495

count++;

495

count++;

496

}

496

}

497

return count;

497

return count;

498

}

498

}

499

500

/**

500

/**

501

* ext4_alloc_blocks: multiple allocate blocks needed for a branch

501

* ext4_alloc_blocks: multiple allocate blocks needed for a branch

502

* @indirect_blks: the number of blocks need to allocate for indirect

502

* @indirect_blks: the number of blocks need to allocate for indirect

503

* blocks

503

* blocks

504

*

504

*

505

* @new_blocks: on return it will store the new block numbers for

505

* @new_blocks: on return it will store the new block numbers for

506

* the indirect blocks(if needed) and the first direct block,

506

* the indirect blocks(if needed) and the first direct block,

507

* @blks: on return it will store the total number of allocated

507

* @blks: on return it will store the total number of allocated

508

* direct blocks

508

* direct blocks

509

*/

509

*/

510

static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,

510

static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,

511

ext4_fsblk_t goal, int indirect_blks, int blks,

511

ext4_fsblk_t goal, int indirect_blks, int blks,

512

ext4_fsblk_t new_blocks[4], int *err)

512

ext4_fsblk_t new_blocks[4], int *err)

513

{

513

{

514

int target, i;

514

int target, i;

515

unsigned long count = 0;

515

unsigned long count = 0;

516

int index = 0;

516

int index = 0;

517

ext4_fsblk_t current_block = 0;

517

ext4_fsblk_t current_block = 0;

518

int ret = 0;

518

int ret = 0;

519

520

/*

520

/*

521

* Here we try to allocate the requested multiple blocks at once,

521

* Here we try to allocate the requested multiple blocks at once,

522

* on a best-effort basis.

522

* on a best-effort basis.

523

* To build a branch, we should allocate blocks for

523

* To build a branch, we should allocate blocks for

524

* the indirect blocks(if not allocated yet), and at least

524

* the indirect blocks(if not allocated yet), and at least

525

* the first direct block of this branch. That's the

525

* the first direct block of this branch. That's the

526

* minimum number of blocks need to allocate(required)

526

* minimum number of blocks need to allocate(required)

527

*/

527

*/

528

target = blks + indirect_blks;

528

target = blks + indirect_blks;

529

530

while (1) {

530

while (1) {

531

count = target;

531

count = target;

532

/* allocating blocks for indirect blocks and direct blocks */

532

/* allocating blocks for indirect blocks and direct blocks */

533

current_block = ext4_new_blocks(handle,inode,goal,&count,err);

533

current_block = ext4_new_blocks(handle,inode,goal,&count,err);

534

if (*err)

534

if (*err)

535

goto failed_out;

535

goto failed_out;

536

537

target -= count;

537

target -= count;

538

/* allocate blocks for indirect blocks */

538

/* allocate blocks for indirect blocks */

539

while (index < indirect_blks && count) {

539

while (index < indirect_blks && count) {

540

new_blocks[index++] = current_block++;

540

new_blocks[index++] = current_block++;

541

count--;

541

count--;

542

}

542

}

543

544

if (count > 0)

544

if (count > 0)

545

break;

545

break;

546

}

546

}

547

548

/* save the new block number for the first direct block */

548

/* save the new block number for the first direct block */

549

new_blocks[index] = current_block;

549

new_blocks[index] = current_block;

550

551

/* total number of blocks allocated for direct blocks */

551

/* total number of blocks allocated for direct blocks */

552

ret = count;

552

ret = count;

553

*err = 0;

553

*err = 0;

554

return ret;

554

return ret;

555

failed_out:

555

failed_out:

556

for (i = 0; i <index; i++)

556

for (i = 0; i <index; i++)

557

ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);

557

ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);

558

return ret;

558

return ret;

559

}

559

}

560

561

/**

561

/**

562

* ext4_alloc_branch - allocate and set up a chain of blocks.

562

* ext4_alloc_branch - allocate and set up a chain of blocks.

563

* @inode: owner

563

* @inode: owner

564

* @indirect_blks: number of allocated indirect blocks

564

* @indirect_blks: number of allocated indirect blocks

565

* @blks: number of allocated direct blocks

565

* @blks: number of allocated direct blocks

566

* @offsets: offsets (in the blocks) to store the pointers to next.

566

* @offsets: offsets (in the blocks) to store the pointers to next.

567

* @branch: place to store the chain in.

567

* @branch: place to store the chain in.

568

*

568

*

569

* This function allocates blocks, zeroes out all but the last one,

569

* This function allocates blocks, zeroes out all but the last one,

570

* links them into chain and (if we are synchronous) writes them to disk.

570

* links them into chain and (if we are synchronous) writes them to disk.

571

* In other words, it prepares a branch that can be spliced onto the

571

* In other words, it prepares a branch that can be spliced onto the

572

* inode. It stores the information about that chain in the branch[], in

572

* inode. It stores the information about that chain in the branch[], in

573

* the same format as ext4_get_branch() would do. We are calling it after

573

* the same format as ext4_get_branch() would do. We are calling it after

574

* we had read the existing part of chain and partial points to the last

574

* we had read the existing part of chain and partial points to the last

575

* triple of that (one with zero ->key). Upon the exit we have the same

575

* triple of that (one with zero ->key). Upon the exit we have the same

576

* picture as after the successful ext4_get_block(), except that in one

576

* picture as after the successful ext4_get_block(), except that in one

577

* place chain is disconnected - *branch->p is still zero (we did not

577

* place chain is disconnected - *branch->p is still zero (we did not

578

* set the last link), but branch->key contains the number that should

578

* set the last link), but branch->key contains the number that should

579

* be placed into *branch->p to fill that gap.

579

* be placed into *branch->p to fill that gap.

580

*

580

*

581

* If allocation fails we free all blocks we've allocated (and forget

581

* If allocation fails we free all blocks we've allocated (and forget

582

* their buffer_heads) and return the error value the from failed

582

* their buffer_heads) and return the error value the from failed

583

* ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain

583

* ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain

584

* as described above and return 0.

584

* as described above and return 0.

585

*/

585

*/

586

static int ext4_alloc_branch(handle_t *handle, struct inode *inode,

586

static int ext4_alloc_branch(handle_t *handle, struct inode *inode,

587

int indirect_blks, int *blks, ext4_fsblk_t goal,

587

int indirect_blks, int *blks, ext4_fsblk_t goal,

588

ext4_lblk_t *offsets, Indirect *branch)

588

ext4_lblk_t *offsets, Indirect *branch)

589

{

589

{

590

int blocksize = inode->i_sb->s_blocksize;

590

int blocksize = inode->i_sb->s_blocksize;

591

int i, n = 0;

591

int i, n = 0;

592

int err = 0;

592

int err = 0;

593

struct buffer_head *bh;

593

struct buffer_head *bh;

594

int num;

594

int num;

595

ext4_fsblk_t new_blocks[4];

595

ext4_fsblk_t new_blocks[4];

596

ext4_fsblk_t current_block;

596

ext4_fsblk_t current_block;

597

598

num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,

598

num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,

599

*blks, new_blocks, &err);

599

*blks, new_blocks, &err);

600

if (err)

600

if (err)

601

return err;

601

return err;

602

603

branch[0].key = cpu_to_le32(new_blocks[0]);

603

branch[0].key = cpu_to_le32(new_blocks[0]);

604

/*

604

/*

605

* metadata blocks and data blocks are allocated.

605

* metadata blocks and data blocks are allocated.

606

*/

606

*/

607

for (n = 1; n <= indirect_blks; n++) {

607

for (n = 1; n <= indirect_blks; n++) {

608

/*

608

/*

609

* Get buffer_head for parent block, zero it out

609

* Get buffer_head for parent block, zero it out

610

* and set the pointer to new one, then send

610

* and set the pointer to new one, then send

611

* parent to disk.

611

* parent to disk.

612

*/

612

*/

613

bh = sb_getblk(inode->i_sb, new_blocks[n-1]);

613

bh = sb_getblk(inode->i_sb, new_blocks[n-1]);

614

branch[n].bh = bh;

614

branch[n].bh = bh;

615

lock_buffer(bh);

615

lock_buffer(bh);

616

BUFFER_TRACE(bh, "call get_create_access");

616

BUFFER_TRACE(bh, "call get_create_access");

617

err = ext4_journal_get_create_access(handle, bh);

617

err = ext4_journal_get_create_access(handle, bh);

618

if (err) {

618

if (err) {

619

unlock_buffer(bh);

619

unlock_buffer(bh);

620

brelse(bh);

620

brelse(bh);

621

goto failed;

621

goto failed;

622

}

622

}

623

624

memset(bh->b_data, 0, blocksize);

624

memset(bh->b_data, 0, blocksize);

625

branch[n].p = (__le32 *) bh->b_data + offsets[n];

625

branch[n].p = (__le32 *) bh->b_data + offsets[n];

626

branch[n].key = cpu_to_le32(new_blocks[n]);

626

branch[n].key = cpu_to_le32(new_blocks[n]);

627

*branch[n].p = branch[n].key;

627

*branch[n].p = branch[n].key;

628

if ( n == indirect_blks) {

628

if ( n == indirect_blks) {

629

current_block = new_blocks[n];

629

current_block = new_blocks[n];

630

/*

630

/*

631

* End of chain, update the last new metablock of

631

* End of chain, update the last new metablock of

632

* the chain to point to the new allocated

632

* the chain to point to the new allocated

633

* data blocks numbers

633

* data blocks numbers

634

*/

634

*/

635

for (i=1; i < num; i++)

635

for (i=1; i < num; i++)

636

*(branch[n].p + i) = cpu_to_le32(++current_block);

636

*(branch[n].p + i) = cpu_to_le32(++current_block);

637

}

637

}

638

BUFFER_TRACE(bh, "marking uptodate");

638

BUFFER_TRACE(bh, "marking uptodate");

639

set_buffer_uptodate(bh);

639

set_buffer_uptodate(bh);

640

unlock_buffer(bh);

640

unlock_buffer(bh);

641

642

BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");

642

BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");

643

err = ext4_journal_dirty_metadata(handle, bh);

643

err = ext4_journal_dirty_metadata(handle, bh);

644

if (err)

644

if (err)

645

goto failed;

645

goto failed;

646

}

646

}

647

*blks = num;

647

*blks = num;

648

return err;

648

return err;

649

failed:

649

failed:

650

/* Allocation failed, free what we already allocated */

650

/* Allocation failed, free what we already allocated */

651

for (i = 1; i <= n ; i++) {

651

for (i = 1; i <= n ; i++) {

652

BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");

652

BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");

653

ext4_journal_forget(handle, branch[i].bh);

653

ext4_journal_forget(handle, branch[i].bh);

654

}

654

}

655

for (i = 0; i <indirect_blks; i++)

655

for (i = 0; i <indirect_blks; i++)

656

ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);

656

ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);

657

658

ext4_free_blocks(handle, inode, new_blocks[i], num, 0);

658

ext4_free_blocks(handle, inode, new_blocks[i], num, 0);

659

660

return err;

660

return err;

661

}

661

}

662

663

/**

663

/**

664

* ext4_splice_branch - splice the allocated branch onto inode.

664

* ext4_splice_branch - splice the allocated branch onto inode.

665

* @inode: owner

665

* @inode: owner

666

* @block: (logical) number of block we are adding

666

* @block: (logical) number of block we are adding

667

* @chain: chain of indirect blocks (with a missing link - see

667

* @chain: chain of indirect blocks (with a missing link - see

668

* ext4_alloc_branch)

668

* ext4_alloc_branch)

669

* @where: location of missing link

669

* @where: location of missing link

670

* @num: number of indirect blocks we are adding

670

* @num: number of indirect blocks we are adding

671

* @blks: number of direct blocks we are adding

671

* @blks: number of direct blocks we are adding

672

*

672

*

673

* This function fills the missing link and does all housekeeping needed in

673

* This function fills the missing link and does all housekeeping needed in

674

* inode (->i_blocks, etc.). In case of success we end up with the full

674

* inode (->i_blocks, etc.). In case of success we end up with the full

675

* chain to new block and return 0.

675

* chain to new block and return 0.

676

*/

676

*/

677

static int ext4_splice_branch(handle_t *handle, struct inode *inode,

677

static int ext4_splice_branch(handle_t *handle, struct inode *inode,

678

ext4_lblk_t block, Indirect *where, int num, int blks)

678

ext4_lblk_t block, Indirect *where, int num, int blks)

679

{

679

{

680

int i;

680

int i;

681

int err = 0;

681

int err = 0;

682

struct ext4_block_alloc_info *block_i;

682

struct ext4_block_alloc_info *block_i;

683

ext4_fsblk_t current_block;

683

ext4_fsblk_t current_block;

684

685

block_i = EXT4_I(inode)->i_block_alloc_info;

685

block_i = EXT4_I(inode)->i_block_alloc_info;

686

/*

686

/*

687

* If we're splicing into a [td]indirect block (as opposed to the

687

* If we're splicing into a [td]indirect block (as opposed to the

688

* inode) then we need to get write access to the [td]indirect block

688

* inode) then we need to get write access to the [td]indirect block

689

* before the splice.

689

* before the splice.

690

*/

690

*/

691

if (where->bh) {

691

if (where->bh) {

692

BUFFER_TRACE(where->bh, "get_write_access");

692

BUFFER_TRACE(where->bh, "get_write_access");

693

err = ext4_journal_get_write_access(handle, where->bh);

693

err = ext4_journal_get_write_access(handle, where->bh);

694

if (err)

694

if (err)

695

goto err_out;

695

goto err_out;

696

}

696

}

697

/* That's it */

697

/* That's it */

698

699

*where->p = where->key;

699

*where->p = where->key;

700

701

/*

701

/*

702

* Update the host buffer_head or inode to point to more just allocated

702

* Update the host buffer_head or inode to point to more just allocated

703

* direct blocks blocks

703

* direct blocks blocks

704

*/

704

*/

705

if (num == 0 && blks > 1) {

705

if (num == 0 && blks > 1) {

706

current_block = le32_to_cpu(where->key) + 1;

706

current_block = le32_to_cpu(where->key) + 1;

707

for (i = 1; i < blks; i++)

707

for (i = 1; i < blks; i++)

708

*(where->p + i ) = cpu_to_le32(current_block++);

708

*(where->p + i ) = cpu_to_le32(current_block++);

709

}

709

}

710

711

/*

711

/*

712

* update the most recently allocated logical & physical block

712

* update the most recently allocated logical & physical block

713

* in i_block_alloc_info, to assist find the proper goal block for next

713

* in i_block_alloc_info, to assist find the proper goal block for next

714

* allocation

714

* allocation

715

*/

715

*/

716

if (block_i) {

716

if (block_i) {

717

block_i->last_alloc_logical_block = block + blks - 1;

717

block_i->last_alloc_logical_block = block + blks - 1;

718

block_i->last_alloc_physical_block =

718

block_i->last_alloc_physical_block =

719

le32_to_cpu(where[num].key) + blks - 1;

719

le32_to_cpu(where[num].key) + blks - 1;

720

}

720

}

721

722

/* We are done with atomic stuff, now do the rest of housekeeping */

722

/* We are done with atomic stuff, now do the rest of housekeeping */

723

724

inode->i_ctime = ext4_current_time(inode);

724

inode->i_ctime = ext4_current_time(inode);

725

ext4_mark_inode_dirty(handle, inode);

725

ext4_mark_inode_dirty(handle, inode);

726

727

/* had we spliced it onto indirect block? */

727

/* had we spliced it onto indirect block? */

728

if (where->bh) {

728

if (where->bh) {

729

/*

729

/*

730

* If we spliced it onto an indirect block, we haven't

730

* If we spliced it onto an indirect block, we haven't

731

* altered the inode. Note however that if it is being spliced

731

* altered the inode. Note however that if it is being spliced

732

* onto an indirect block at the very end of the file (the

732

* onto an indirect block at the very end of the file (the

733

* file is growing) then we *will* alter the inode to reflect

733

* file is growing) then we *will* alter the inode to reflect

734

* the new i_size. But that is not done here - it is done in

734

* the new i_size. But that is not done here - it is done in

735

* generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.

735

* generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.

736

*/

736

*/

737

jbd_debug(5, "splicing indirect only\n");

737

jbd_debug(5, "splicing indirect only\n");

738

BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata");

738

BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata");

739

err = ext4_journal_dirty_metadata(handle, where->bh);

739

err = ext4_journal_dirty_metadata(handle, where->bh);

740

if (err)

740

if (err)

741

goto err_out;

741

goto err_out;

742

} else {

742

} else {

743

/*

743

/*

744

* OK, we spliced it into the inode itself on a direct block.

744

* OK, we spliced it into the inode itself on a direct block.

745

* Inode was dirtied above.

745

* Inode was dirtied above.

746

*/

746

*/

747

jbd_debug(5, "splicing direct\n");

747

jbd_debug(5, "splicing direct\n");

748

}

748

}

749

return err;

749

return err;

750

751

err_out:

751

err_out:

752

for (i = 1; i <= num; i++) {

752

for (i = 1; i <= num; i++) {

753

BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");

753

BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");

754

ext4_journal_forget(handle, where[i].bh);

754

ext4_journal_forget(handle, where[i].bh);

755

ext4_free_blocks(handle, inode,

755

ext4_free_blocks(handle, inode,

756

le32_to_cpu(where[i-1].key), 1, 0);

756

le32_to_cpu(where[i-1].key), 1, 0);

757

}

757

}

758

ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);

758

ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);

759

760

return err;

760

return err;

761

}

761

}

762

763

/*

763

/*

764

* Allocation strategy is simple: if we have to allocate something, we will

764

* Allocation strategy is simple: if we have to allocate something, we will

765

* have to go the whole way to leaf. So let's do it before attaching anything

765

* have to go the whole way to leaf. So let's do it before attaching anything

766

* to tree, set linkage between the newborn blocks, write them if sync is

766

* to tree, set linkage between the newborn blocks, write them if sync is

767

* required, recheck the path, free and repeat if check fails, otherwise

767

* required, recheck the path, free and repeat if check fails, otherwise

768

* set the last missing link (that will protect us from any truncate-generated

768

* set the last missing link (that will protect us from any truncate-generated

769

* removals - all blocks on the path are immune now) and possibly force the

769

* removals - all blocks on the path are immune now) and possibly force the

770

* write on the parent block.

770

* write on the parent block.

771

* That has a nice additional property: no special recovery from the failed

771

* That has a nice additional property: no special recovery from the failed

772

* allocations is needed - we simply release blocks and do not touch anything

772

* allocations is needed - we simply release blocks and do not touch anything

773

* reachable from inode.

773

* reachable from inode.

774

*

774

*

775

* `handle' can be NULL if create == 0.

775

* `handle' can be NULL if create == 0.

776

*

776

*

777

* return > 0, # of blocks mapped or allocated.

777

* return > 0, # of blocks mapped or allocated.

778

* return = 0, if plain lookup failed.

778

* return = 0, if plain lookup failed.

779

* return < 0, error case.

779

* return < 0, error case.

780

*

780

*

781

*

781

*

782

* Need to be called with

782

* Need to be called with

783

* down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block

783

* down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block

784

* (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)

784

* (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)

785

*/

785

*/

786

int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,

786

int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,

787

ext4_lblk_t iblock, unsigned long maxblocks,

787

ext4_lblk_t iblock, unsigned long maxblocks,

788

struct buffer_head *bh_result,

788

struct buffer_head *bh_result,

789

int create, int extend_disksize)

789

int create, int extend_disksize)

790

{

790

{

791

int err = -EIO;

791

int err = -EIO;

792

ext4_lblk_t offsets[4];

792

ext4_lblk_t offsets[4];

793

Indirect chain[4];

793

Indirect chain[4];

794

Indirect *partial;

794

Indirect *partial;

795

ext4_fsblk_t goal;

795

ext4_fsblk_t goal;

796

int indirect_blks;

796

int indirect_blks;

797

int blocks_to_boundary = 0;

797

int blocks_to_boundary = 0;

798

int depth;

798

int depth;

799

struct ext4_inode_info *ei = EXT4_I(inode);

799

struct ext4_inode_info *ei = EXT4_I(inode);

800

int count = 0;

800

int count = 0;

801

ext4_fsblk_t first_block = 0;

801

ext4_fsblk_t first_block = 0;

802

803

804

J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));

804

J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));

805

J_ASSERT(handle != NULL || create == 0);

805

J_ASSERT(handle != NULL || create == 0);

806

depth = ext4_block_to_path(inode, iblock, offsets,

806

depth = ext4_block_to_path(inode, iblock, offsets,

807

&blocks_to_boundary);

807

&blocks_to_boundary);

808

809

if (depth == 0)

809

if (depth == 0)

810

goto out;

810

goto out;

811

812

partial = ext4_get_branch(inode, depth, offsets, chain, &err);

812

partial = ext4_get_branch(inode, depth, offsets, chain, &err);

813

814

/* Simplest case - block found, no allocation needed */

814

/* Simplest case - block found, no allocation needed */

815

if (!partial) {

815

if (!partial) {

816

first_block = le32_to_cpu(chain[depth - 1].key);

816

first_block = le32_to_cpu(chain[depth - 1].key);

817

clear_buffer_new(bh_result);

817

clear_buffer_new(bh_result);

818

count++;

818

count++;

819

/*map more blocks*/

819

/*map more blocks*/

820

while (count < maxblocks && count <= blocks_to_boundary) {

820

while (count < maxblocks && count <= blocks_to_boundary) {

821

ext4_fsblk_t blk;

821

ext4_fsblk_t blk;

822

823

blk = le32_to_cpu(*(chain[depth-1].p + count));

823

blk = le32_to_cpu(*(chain[depth-1].p + count));

824

825

if (blk == first_block + count)

825

if (blk == first_block + count)

826

count++;

826

count++;

827

else

827

else

828

break;

828

break;

829

}

829

}

830

goto got_it;

830

goto got_it;

831

}

831

}

832

833

/* Next simple case - plain lookup or failed read of indirect block */

833

/* Next simple case - plain lookup or failed read of indirect block */

834

if (!create || err == -EIO)

834

if (!create || err == -EIO)

835

goto cleanup;

835

goto cleanup;

836

837

/*

837

/*

838

* Okay, we need to do block allocation. Lazily initialize the block

838

* Okay, we need to do block allocation. Lazily initialize the block

839

* allocation info here if necessary

839

* allocation info here if necessary

840

*/

840

*/

841

if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))

841

if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))

842

ext4_init_block_alloc_info(inode);

842

ext4_init_block_alloc_info(inode);

843

844

goal = ext4_find_goal(inode, iblock, partial);

844

goal = ext4_find_goal(inode, iblock, partial);

845

846

/* the number of blocks need to allocate for [d,t]indirect blocks */

846

/* the number of blocks need to allocate for [d,t]indirect blocks */

847

indirect_blks = (chain + depth) - partial - 1;

847

indirect_blks = (chain + depth) - partial - 1;

848

849

/*

849

/*

850

* Next look up the indirect map to count the totoal number of

850

* Next look up the indirect map to count the totoal number of

851

* direct blocks to allocate for this branch.

851

* direct blocks to allocate for this branch.

852

*/

852

*/

853

count = ext4_blks_to_allocate(partial, indirect_blks,

853

count = ext4_blks_to_allocate(partial, indirect_blks,

854

maxblocks, blocks_to_boundary);

854

maxblocks, blocks_to_boundary);

855

/*

855

/*

856

* Block out ext4_truncate while we alter the tree

856

* Block out ext4_truncate while we alter the tree

857

*/

857

*/

858

err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,

858

err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,

859

offsets + (partial - chain), partial);

859

offsets + (partial - chain), partial);

860

861

/*

861

/*

862

* The ext4_splice_branch call will free and forget any buffers

862

* The ext4_splice_branch call will free and forget any buffers

863

* on the new chain if there is a failure, but that risks using

863

* on the new chain if there is a failure, but that risks using

864

* up transaction credits, especially for bitmaps where the

864

* up transaction credits, especially for bitmaps where the

865

* credits cannot be returned. Can we handle this somehow? We

865

* credits cannot be returned. Can we handle this somehow? We

866

* may need to return -EAGAIN upwards in the worst case. --sct

866

* may need to return -EAGAIN upwards in the worst case. --sct

867

*/

867

*/

868

if (!err)

868

if (!err)

869

err = ext4_splice_branch(handle, inode, iblock,

869

err = ext4_splice_branch(handle, inode, iblock,

870

partial, indirect_blks, count);

870

partial, indirect_blks, count);

871

/*

871

/*

872

* i_disksize growing is protected by i_data_sem. Don't forget to

872

* i_disksize growing is protected by i_data_sem. Don't forget to

873

* protect it if you're about to implement concurrent

873

* protect it if you're about to implement concurrent

874

* ext4_get_block() -bzzz

874

* ext4_get_block() -bzzz

875

*/

875

*/

876

if (!err && extend_disksize && inode->i_size > ei->i_disksize)

876

if (!err && extend_disksize && inode->i_size > ei->i_disksize)

877

ei->i_disksize = inode->i_size;

877

ei->i_disksize = inode->i_size;

878

if (err)

878

if (err)

879

goto cleanup;

879

goto cleanup;

880

881

set_buffer_new(bh_result);

881

set_buffer_new(bh_result);

882

got_it:

882

got_it:

883

map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));

883

map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));

884

if (count > blocks_to_boundary)

884

if (count > blocks_to_boundary)

885

set_buffer_boundary(bh_result);

885

set_buffer_boundary(bh_result);

886

err = count;

886

err = count;

887

/* Clean up and exit */

887

/* Clean up and exit */

888

partial = chain + depth - 1; /* the whole chain */

888

partial = chain + depth - 1; /* the whole chain */

889

cleanup:

889

cleanup:

890

while (partial > chain) {

890

while (partial > chain) {

891

BUFFER_TRACE(partial->bh, "call brelse");

891

BUFFER_TRACE(partial->bh, "call brelse");

892

brelse(partial->bh);

892

brelse(partial->bh);

893

partial--;

893

partial--;

894

}

894

}

895

BUFFER_TRACE(bh_result, "returned");

895

BUFFER_TRACE(bh_result, "returned");

896

out:

896

out:

897

return err;

897

return err;

898

}

898

}

899

900

/* Upper bound on the number of blocks mapped for direct IO in one go. */
#define DIO_MAX_BLOCKS	4096

/*
 * Journal credits needed for writing DIO_MAX_BLOCKS blocks:
 *   - sb + group descriptor + bitmap + inode -> 4
 *   - for B blocks with A block pointers per block:
 *     1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
 * With B = 4096 and A = 256 (1KB block size) the total is 25.
 */
#define DIO_CREDITS	(4 + 1 + (DIO_MAX_BLOCKS / 256 / 256 + 2) + \
			 (DIO_MAX_BLOCKS / 256 + 2))

910

911

912

/*

912

/*

913

*

913

*

914

*

914

*

915

* ext4_ext4 get_block() wrapper function

915

* ext4_ext4 get_block() wrapper function

916

* It will do a look up first, and returns if the blocks already mapped.

916

* It will do a look up first, and returns if the blocks already mapped.

917

* Otherwise it takes the write lock of the i_data_sem and allocate blocks

917

* Otherwise it takes the write lock of the i_data_sem and allocate blocks

918

* and store the allocated blocks in the result buffer head and mark it

918

* and store the allocated blocks in the result buffer head and mark it

919

* mapped.

919

* mapped.

920

*

920

*

921

* If file type is extents based, it will call ext4_ext_get_blocks(),

921

* If file type is extents based, it will call ext4_ext_get_blocks(),

922

* Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping

922

* Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping

923

* based files

923

* based files

924

*

924

*

925

* On success, it returns the number of blocks being mapped or allocate.

925

* On success, it returns the number of blocks being mapped or allocate.

926

* if create==0 and the blocks are pre-allocated and uninitialized block,

926

* if create==0 and the blocks are pre-allocated and uninitialized block,

927

* the result buffer head is unmapped. If the create ==1, it will make sure

927

* the result buffer head is unmapped. If the create ==1, it will make sure

928

* the buffer head is mapped.

928

* the buffer head is mapped.

929

*

929

*

930

* It returns 0 if plain look up failed (blocks have not been allocated), in

930

* It returns 0 if plain look up failed (blocks have not been allocated), in

931

* that casem, buffer head is unmapped

931

* that casem, buffer head is unmapped

932

*

932

*

933

* It returns the error in case of allocation failure.

933

* It returns the error in case of allocation failure.

934

*/

934

*/

935

int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,

935

int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,

936

unsigned long max_blocks, struct buffer_head *bh,

936

unsigned long max_blocks, struct buffer_head *bh,

937

int create, int extend_disksize)

937

int create, int extend_disksize)

938

{

938

{

939

int retval;

939

int retval;

940

941

clear_buffer_mapped(bh);

941

clear_buffer_mapped(bh);

942

943

/*

943

/*

944

* Try to see if we can get the block without requesting

944

* Try to see if we can get the block without requesting

945

* for new file system block.

945

* for new file system block.

946

*/

946

*/

947

down_read((&EXT4_I(inode)->i_data_sem));

947

down_read((&EXT4_I(inode)->i_data_sem));

948

if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {

948

if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {

949

retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,

949

retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,

950

bh, 0, 0);

950

bh, 0, 0);

951

} else {

951

} else {

952

retval = ext4_get_blocks_handle(handle,

952

retval = ext4_get_blocks_handle(handle,

953

inode, block, max_blocks, bh, 0, 0);

953

inode, block, max_blocks, bh, 0, 0);

954

}

954

}

955

up_read((&EXT4_I(inode)->i_data_sem));

955

up_read((&EXT4_I(inode)->i_data_sem));

956

957

/* If it is only a block(s) look up */

957

/* If it is only a block(s) look up */

958

if (!create)

958

if (!create)

959

return retval;

959

return retval;

960

961

/*

961

/*

962

* Returns if the blocks have already allocated

962

* Returns if the blocks have already allocated

963

*

963

*

964

* Note that if blocks have been preallocated

964

* Note that if blocks have been preallocated

965

* ext4_ext_get_block() returns th create = 0

965

* ext4_ext_get_block() returns th create = 0

966

* with buffer head unmapped.

966

* with buffer head unmapped.

967

*/

967

*/

968

if (retval > 0 && buffer_mapped(bh))

968

if (retval > 0 && buffer_mapped(bh))

969

return retval;

969

return retval;

970

971

/*

971

/*

972

* New blocks allocate and/or writing to uninitialized extent

972

* New blocks allocate and/or writing to uninitialized extent

973

* will possibly result in updating i_data, so we take

973

* will possibly result in updating i_data, so we take

974

* the write lock of i_data_sem, and call get_blocks()

974

* the write lock of i_data_sem, and call get_blocks()

975

* with create == 1 flag.

975

* with create == 1 flag.

976

*/

976

*/

977

down_write((&EXT4_I(inode)->i_data_sem));

977

down_write((&EXT4_I(inode)->i_data_sem));

978

/*

978

/*

979

* We need to check for EXT4 here because migrate

979

* We need to check for EXT4 here because migrate

980

* could have changed the inode type in between

980

* could have changed the inode type in between

981

*/

981

*/

982

if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {

982

if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {

983

retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,

983

retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,

984

bh, create, extend_disksize);

984

bh, create, extend_disksize);

985

} else {

985

} else {

986

retval = ext4_get_blocks_handle(handle, inode, block,

986

retval = ext4_get_blocks_handle(handle, inode, block,

987

max_blocks, bh, create, extend_disksize);

987

max_blocks, bh, create, extend_disksize);

988

989

if (retval > 0 && buffer_new(bh)) {

989

if (retval > 0 && buffer_new(bh)) {

990

/*

990

/*

991

* We allocated new blocks which will result in

991

* We allocated new blocks which will result in

992

* i_data's format changing. Force the migrate

992

* i_data's format changing. Force the migrate

993

* to fail by clearing migrate flags

993

* to fail by clearing migrate flags

994

*/

994

*/

995

EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &

995

EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &

996

~EXT4_EXT_MIGRATE;

996

~EXT4_EXT_MIGRATE;

997

}

997

}

998

}

998

}

999

up_write((&EXT4_I(inode)->i_data_sem));

999

up_write((&EXT4_I(inode)->i_data_sem));

1000

return retval;

1000

return retval;

1001

}

1001

}

1002

1003

static int ext4_get_block(struct inode *inode, sector_t iblock,

1003

static int ext4_get_block(struct inode *inode, sector_t iblock,

1004

struct buffer_head *bh_result, int create)

1004

struct buffer_head *bh_result, int create)

1005

{

1005

{

1006

handle_t *handle = ext4_journal_current_handle();

1006

handle_t *handle = ext4_journal_current_handle();

1007

int ret = 0, started = 0;

1007

int ret = 0, started = 0;

1008

unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;

1008

unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;

1009

1010

if (create && !handle) {

1010

if (create && !handle) {

1011

/* Direct IO write... */

1011

/* Direct IO write... */

1012

if (max_blocks > DIO_MAX_BLOCKS)

1012

if (max_blocks > DIO_MAX_BLOCKS)

1013

max_blocks = DIO_MAX_BLOCKS;

1013

max_blocks = DIO_MAX_BLOCKS;

1014

handle = ext4_journal_start(inode, DIO_CREDITS +

1014

handle = ext4_journal_start(inode, DIO_CREDITS +

1015

2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));

1015

2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));

1016

if (IS_ERR(handle)) {

1016

if (IS_ERR(handle)) {

1017

ret = PTR_ERR(handle);

1017

ret = PTR_ERR(handle);

1018

goto out;

1018

goto out;

1019

}

1019

}

1020

started = 1;

1020

started = 1;

1021

}

1021

}

1022

1023

ret = ext4_get_blocks_wrap(handle, inode, iblock,

1023

ret = ext4_get_blocks_wrap(handle, inode, iblock,

1024

max_blocks, bh_result, create, 0);

1024

max_blocks, bh_result, create, 0);

1025

if (ret > 0) {

1025

if (ret > 0) {

1026

bh_result->b_size = (ret << inode->i_blkbits);

1026

bh_result->b_size = (ret << inode->i_blkbits);

1027

ret = 0;

1027

ret = 0;

1028

}

1028

}

1029

if (started)

1029

if (started)

1030

ext4_journal_stop(handle);

1030

ext4_journal_stop(handle);

1031

out:

1031

out:

1032

return ret;

1032

return ret;

1033

}

1033

}

1034

1035

/*

1035

/*

1036

* `handle' can be NULL if create is zero

1036

* `handle' can be NULL if create is zero

1037

*/

1037

*/

1038

struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,

1038

struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,

1039

ext4_lblk_t block, int create, int *errp)

1039

ext4_lblk_t block, int create, int *errp)

1040

{

1040

{

1041

struct buffer_head dummy;

1041

struct buffer_head dummy;

1042

int fatal = 0, err;

1042

int fatal = 0, err;

1043

1044

J_ASSERT(handle != NULL || create == 0);

1044

J_ASSERT(handle != NULL || create == 0);

1045

1046

dummy.b_state = 0;

1046

dummy.b_state = 0;

1047

dummy.b_blocknr = -1000;

1047

dummy.b_blocknr = -1000;

1048

buffer_trace_init(&dummy.b_history);

1048

buffer_trace_init(&dummy.b_history);

1049

err = ext4_get_blocks_wrap(handle, inode, block, 1,

1049

err = ext4_get_blocks_wrap(handle, inode, block, 1,

1050

&dummy, create, 1);

1050

&dummy, create, 1);

1051

/*

1051

/*

1052

* ext4_get_blocks_handle() returns number of blocks

1052

* ext4_get_blocks_handle() returns number of blocks

1053

* mapped. 0 in case of a HOLE.

1053

* mapped. 0 in case of a HOLE.

1054

*/

1054

*/

1055

if (err > 0) {

1055

if (err > 0) {

1056

if (err > 1)

1056

if (err > 1)

1057

WARN_ON(1);

1057

WARN_ON(1);

1058

err = 0;

1058

err = 0;

1059

}

1059

}

1060

*errp = err;

1060

*errp = err;

1061

if (!err && buffer_mapped(&dummy)) {

1061

if (!err && buffer_mapped(&dummy)) {

1062

struct buffer_head *bh;

1062

struct buffer_head *bh;

1063

bh = sb_getblk(inode->i_sb, dummy.b_blocknr);

1063

bh = sb_getblk(inode->i_sb, dummy.b_blocknr);

1064

if (!bh) {

1064

if (!bh) {

1065

*errp = -EIO;

1065

*errp = -EIO;

1066

goto err;

1066

goto err;

1067

}

1067

}

1068

if (buffer_new(&dummy)) {

1068

if (buffer_new(&dummy)) {

1069

J_ASSERT(create != 0);

1069

J_ASSERT(create != 0);

1070

J_ASSERT(handle != NULL);

1070

J_ASSERT(handle != NULL);

1071

1072

/*

1072

/*

1073

* Now that we do not always journal data, we should

1073

* Now that we do not always journal data, we should

1074

* keep in mind whether this should always journal the

1074

* keep in mind whether this should always journal the

1075

* new buffer as metadata. For now, regular file

1075

* new buffer as metadata. For now, regular file

1076

* writes use ext4_get_block instead, so it's not a

1076

* writes use ext4_get_block instead, so it's not a

1077

* problem.

1077

* problem.

1078

*/

1078

*/

1079

lock_buffer(bh);

1079

lock_buffer(bh);

1080

BUFFER_TRACE(bh, "call get_create_access");

1080

BUFFER_TRACE(bh, "call get_create_access");

1081

fatal = ext4_journal_get_create_access(handle, bh);

1081

fatal = ext4_journal_get_create_access(handle, bh);

1082

if (!fatal && !buffer_uptodate(bh)) {

1082

if (!fatal && !buffer_uptodate(bh)) {

1083

memset(bh->b_data,0,inode->i_sb->s_blocksize);

1083

memset(bh->b_data,0,inode->i_sb->s_blocksize);

1084

set_buffer_uptodate(bh);

1084

set_buffer_uptodate(bh);

1085

}

1085

}

1086

unlock_buffer(bh);

1086

unlock_buffer(bh);

1087

BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");

1087

BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");

1088

err = ext4_journal_dirty_metadata(handle, bh);

1088

err = ext4_journal_dirty_metadata(handle, bh);

1089

if (!fatal)

1089

if (!fatal)

1090

fatal = err;

1090

fatal = err;

1091

} else {

1091

} else {

1092

BUFFER_TRACE(bh, "not a new buffer");

1092

BUFFER_TRACE(bh, "not a new buffer");

1093

}

1093

}

1094

if (fatal) {

1094

if (fatal) {

1095

*errp = fatal;

1095

*errp = fatal;

1096

brelse(bh);

1096

brelse(bh);

1097

bh = NULL;

1097

bh = NULL;

1098

}

1098

}

1099

return bh;

1099

return bh;

1100

}

1100

}

1101

err:

1101

err:

1102

return NULL;

1102

return NULL;

1103

}

1103

}

1104

1105

struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,

1105

struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,

1106

ext4_lblk_t block, int create, int *err)

1106

ext4_lblk_t block, int create, int *err)

1107

{

1107

{

1108

struct buffer_head * bh;

1108

struct buffer_head * bh;

1109

1110

bh = ext4_getblk(handle, inode, block, create, err);

1110

bh = ext4_getblk(handle, inode, block, create, err);

1111

if (!bh)

1111

if (!bh)

1112

return bh;

1112

return bh;

1113

if (buffer_uptodate(bh))

1113

if (buffer_uptodate(bh))

1114

return bh;

1114

return bh;

1115

ll_rw_block(READ_META, 1, &bh);

1115

ll_rw_block(READ_META, 1, &bh);

1116

wait_on_buffer(bh);

1116

wait_on_buffer(bh);

1117

if (buffer_uptodate(bh))

1117

if (buffer_uptodate(bh))

1118

return bh;

1118

return bh;

1119

put_bh(bh);

1119

put_bh(bh);

1120

*err = -EIO;

1120

*err = -EIO;

1121

return NULL;

1121

return NULL;

1122

}

1122

}

1123

1124

static int walk_page_buffers( handle_t *handle,

1124

static int walk_page_buffers( handle_t *handle,

1125

struct buffer_head *head,

1125

struct buffer_head *head,

1126

unsigned from,

1126

unsigned from,

1127

unsigned to,

1127

unsigned to,

1128

int *partial,

1128

int *partial,

1129

int (*fn)( handle_t *handle,

1129

int (*fn)( handle_t *handle,

1130

struct buffer_head *bh))

1130

struct buffer_head *bh))

1131

{

1131

{

1132

struct buffer_head *bh;

1132

struct buffer_head *bh;

1133

unsigned block_start, block_end;

1133

unsigned block_start, block_end;

1134

unsigned blocksize = head->b_size;

1134

unsigned blocksize = head->b_size;

1135

int err, ret = 0;

1135

int err, ret = 0;

1136

struct buffer_head *next;

1136

struct buffer_head *next;

1137

1138

for ( bh = head, block_start = 0;

1138

for ( bh = head, block_start = 0;

1139

ret == 0 && (bh != head || !block_start);

1139

ret == 0 && (bh != head || !block_start);

1140

block_start = block_end, bh = next)

1140

block_start = block_end, bh = next)

1141

{

1141

{

1142

next = bh->b_this_page;

1142

next = bh->b_this_page;

1143

block_end = block_start + blocksize;

1143

block_end = block_start + blocksize;

1144

if (block_end <= from || block_start >= to) {

1144

if (block_end <= from || block_start >= to) {

1145

if (partial && !buffer_uptodate(bh))

1145

if (partial && !buffer_uptodate(bh))

1146

*partial = 1;

1146

*partial = 1;

1147

continue;

1147

continue;

1148

}

1148

}

1149

err = (*fn)(handle, bh);

1149

err = (*fn)(handle, bh);

1150

if (!ret)

1150

if (!ret)

1151

ret = err;

1151

ret = err;

1152

}

1152

}

1153

return ret;

1153

return ret;

1154

}

1154

}

1155

1156

/*

1156

/*

1157

* To preserve ordering, it is essential that the hole instantiation and

1157

* To preserve ordering, it is essential that the hole instantiation and

1158

* the data write be encapsulated in a single transaction. We cannot

1158

* the data write be encapsulated in a single transaction. We cannot

1159

* close off a transaction and start a new one between the ext4_get_block()

1159

* close off a transaction and start a new one between the ext4_get_block()

1160

* and the commit_write(). So doing the jbd2_journal_start at the start of

1160

* and the commit_write(). So doing the jbd2_journal_start at the start of

1161

* prepare_write() is the right place.

1161

* prepare_write() is the right place.

1162

*

1162

*

1163

* Also, this function can nest inside ext4_writepage() ->

1163

* Also, this function can nest inside ext4_writepage() ->

1164

* block_write_full_page(). In that case, we *know* that ext4_writepage()

1164

* block_write_full_page(). In that case, we *know* that ext4_writepage()

1165

* has generated enough buffer credits to do the whole page. So we won't

1165

* has generated enough buffer credits to do the whole page. So we won't

1166

* block on the journal in that case, which is good, because the caller may

1166

* block on the journal in that case, which is good, because the caller may

1167

* be PF_MEMALLOC.

1167

* be PF_MEMALLOC.

1168

*

1168

*

1169

* By accident, ext4 can be reentered when a transaction is open via

1169

* By accident, ext4 can be reentered when a transaction is open via

1170

* quota file writes. If we were to commit the transaction while thus

1170

* quota file writes. If we were to commit the transaction while thus

1171

* reentered, there can be a deadlock - we would be holding a quota

1171

* reentered, there can be a deadlock - we would be holding a quota

1172

* lock, and the commit would never complete if another thread had a

1172

* lock, and the commit would never complete if another thread had a

1173

* transaction open and was blocking on the quota lock - a ranking

1173

* transaction open and was blocking on the quota lock - a ranking

1174

* violation.

1174

* violation.

1175

*

1175

*

1176

* So what we do is to rely on the fact that jbd2_journal_stop/journal_start

1176

* So what we do is to rely on the fact that jbd2_journal_stop/journal_start

1177

* will _not_ run commit under these circumstances because handle->h_ref

1177

* will _not_ run commit under these circumstances because handle->h_ref

1178

* is elevated. We'll still have enough credits for the tiny quotafile

1178

* is elevated. We'll still have enough credits for the tiny quotafile

1179

* write.

1179

* write.

1180

*/

1180

*/

1181

static int do_journal_get_write_access(handle_t *handle,

1181

static int do_journal_get_write_access(handle_t *handle,

1182

struct buffer_head *bh)

1182

struct buffer_head *bh)

1183

{

1183

{

1184

if (!buffer_mapped(bh) || buffer_freed(bh))

1184

if (!buffer_mapped(bh) || buffer_freed(bh))

1185

return 0;

1185

return 0;

1186

return ext4_journal_get_write_access(handle, bh);

1186

return ext4_journal_get_write_access(handle, bh);

1187

}

1187

}

1188

1189

static int ext4_write_begin(struct file *file, struct address_space *mapping,

1189

static int ext4_write_begin(struct file *file, struct address_space *mapping,

1190

loff_t pos, unsigned len, unsigned flags,

1190

loff_t pos, unsigned len, unsigned flags,

1191

struct page **pagep, void **fsdata)

1191

struct page **pagep, void **fsdata)

1192

{

1192

{

1193

struct inode *inode = mapping->host;

1193

struct inode *inode = mapping->host;

1194

int ret, needed_blocks = ext4_writepage_trans_blocks(inode);

1194

int ret, needed_blocks = ext4_writepage_trans_blocks(inode);

1195

handle_t *handle;

1195

handle_t *handle;

1196

int retries = 0;

1196

int retries = 0;

1197

struct page *page;

1197

struct page *page;

1198

pgoff_t index;

1198

pgoff_t index;

1199

unsigned from, to;

1199

unsigned from, to;

1200

1201

index = pos >> PAGE_CACHE_SHIFT;

1201

index = pos >> PAGE_CACHE_SHIFT;

1202

from = pos & (PAGE_CACHE_SIZE - 1);

1202

from = pos & (PAGE_CACHE_SIZE - 1);

1203

to = from + len;

1203

to = from + len;

1204

1205

retry:

1205

retry:

1206

page = __grab_cache_page(mapping, index);

1206

page = __grab_cache_page(mapping, index);

1207

if (!page)

1207

if (!page)

1208

return -ENOMEM;

1208

return -ENOMEM;

1209

*pagep = page;

1209

*pagep = page;

1210

1211

handle = ext4_journal_start(inode, needed_blocks);

1211

handle = ext4_journal_start(inode, needed_blocks);

1212

if (IS_ERR(handle)) {

1212

if (IS_ERR(handle)) {

1213

unlock_page(page);

1213

unlock_page(page);

1214

page_cache_release(page);

1214

page_cache_release(page);

1215

ret = PTR_ERR(handle);

1215

ret = PTR_ERR(handle);

1216

goto out;

1216

goto out;

1217

}

1217

}

1218

1219

ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,

1219

ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,

1220

ext4_get_block);

1220

ext4_get_block);

1221

1222

if (!ret && ext4_should_journal_data(inode)) {

1222

if (!ret && ext4_should_journal_data(inode)) {

1223

ret = walk_page_buffers(handle, page_buffers(page),

1223

ret = walk_page_buffers(handle, page_buffers(page),

1224

from, to, NULL, do_journal_get_write_access);

1224

from, to, NULL, do_journal_get_write_access);

1225

}

1225

}

1226

1227

if (ret) {

1227

if (ret) {

1228

ext4_journal_stop(handle);

1228

ext4_journal_stop(handle);

1229

unlock_page(page);

1229

unlock_page(page);

1230

page_cache_release(page);

1230

page_cache_release(page);

1231

}

1231

}

1232

1233

if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))

1233

if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))

1234

goto retry;

1234

goto retry;

1235

out:

1235

out:

1236

return ret;

1236

return ret;

1237

}

1237

}

1238

1239

int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)

1239

int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)

1240

{

1240

{

1241

int err = jbd2_journal_dirty_data(handle, bh);

1241

int err = jbd2_journal_dirty_data(handle, bh);

1242

if (err)

1242

if (err)

1243

ext4_journal_abort_handle(__func__, __func__,

1243

ext4_journal_abort_handle(__func__, __func__,

1244

bh, handle, err);

1244

bh, handle, err);

1245

return err;

1245

return err;

1246

}

1246

}

1247

1248

/* For write_end() in data=journal mode */

1248

/* For write_end() in data=journal mode */

1249

static int write_end_fn(handle_t *handle, struct buffer_head *bh)

1249

static int write_end_fn(handle_t *handle, struct buffer_head *bh)

1250

{

1250

{

1251

if (!buffer_mapped(bh) || buffer_freed(bh))

1251

if (!buffer_mapped(bh) || buffer_freed(bh))

1252

return 0;

1252

return 0;

1253

set_buffer_uptodate(bh);

1253

set_buffer_uptodate(bh);

1254

return ext4_journal_dirty_metadata(handle, bh);

1254

return ext4_journal_dirty_metadata(handle, bh);

1255

}

1255

}

1256

1257

/*

1257

/*

1258

* Generic write_end handler for ordered and writeback ext4 journal modes.

1258

* Generic write_end handler for ordered and writeback ext4 journal modes.

1259

* We can't use generic_write_end, because that unlocks the page and we need to

1259

* We can't use generic_write_end, because that unlocks the page and we need to

1260

* unlock the page after ext4_journal_stop, but ext4_journal_stop must run

1260

* unlock the page after ext4_journal_stop, but ext4_journal_stop must run

1261

* after block_write_end.

1261

* after block_write_end.

1262

*/

1262

*/

1263

static int ext4_generic_write_end(struct file *file,

1263

static int ext4_generic_write_end(struct file *file,

1264

struct address_space *mapping,

1264

struct address_space *mapping,

1265

loff_t pos, unsigned len, unsigned copied,

1265

loff_t pos, unsigned len, unsigned copied,

1266

struct page *page, void *fsdata)

1266

struct page *page, void *fsdata)

1267

{

1267

{

1268

struct inode *inode = file->f_mapping->host;

1268

struct inode *inode = file->f_mapping->host;

1269

1270

copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

1270

copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

1271

1272

if (pos+copied > inode->i_size) {

1272

if (pos+copied > inode->i_size) {

1273

i_size_write(inode, pos+copied);

1273

i_size_write(inode, pos+copied);

1274

mark_inode_dirty(inode);

1274

mark_inode_dirty(inode);

1275

}

1275

}

1276

1277

return copied;

1277

return copied;

1278

}

1278

}

1279

1280

/*

1280

/*

1281

* We need to pick up the new inode size which generic_commit_write gave us

1281

* We need to pick up the new inode size which generic_commit_write gave us

1282

* `file' can be NULL - eg, when called from page_symlink().

1282

* `file' can be NULL - eg, when called from page_symlink().

1283

*

1283

*

1284

* ext4 never places buffers on inode->i_mapping->private_list. metadata

1284

* ext4 never places buffers on inode->i_mapping->private_list. metadata

1285

* buffers are managed internally.

1285

* buffers are managed internally.

1286

*/

1286

*/

1287

static int ext4_ordered_write_end(struct file *file,

1287

static int ext4_ordered_write_end(struct file *file,

1288

struct address_space *mapping,

1288

struct address_space *mapping,

1289

loff_t pos, unsigned len, unsigned copied,

1289

loff_t pos, unsigned len, unsigned copied,

1290

struct page *page, void *fsdata)

1290

struct page *page, void *fsdata)

1291

{

1291

{

1292

handle_t *handle = ext4_journal_current_handle();

1292

handle_t *handle = ext4_journal_current_handle();

1293

struct inode *inode = file->f_mapping->host;

1293

struct inode *inode = file->f_mapping->host;

1294

unsigned from, to;

1294

unsigned from, to;

1295

int ret = 0, ret2;

1295

int ret = 0, ret2;

1296

1297

from = pos & (PAGE_CACHE_SIZE - 1);

1297

from = pos & (PAGE_CACHE_SIZE - 1);

1298

to = from + len;

1298

to = from + len;

1299

1300

ret = walk_page_buffers(handle, page_buffers(page),

1300

ret = walk_page_buffers(handle, page_buffers(page),

1301

from, to, NULL, ext4_journal_dirty_data);

1301

from, to, NULL, ext4_journal_dirty_data);

1302

1303

if (ret == 0) {

1303

if (ret == 0) {

1304

/*

1304

/*

1305

* generic_write_end() will run mark_inode_dirty() if i_size

1305

* generic_write_end() will run mark_inode_dirty() if i_size

1306

* changes. So let's piggyback the i_disksize mark_inode_dirty

1306

* changes. So let's piggyback the i_disksize mark_inode_dirty

1307

* into that.

1307

* into that.

1308

*/

1308

*/

1309

loff_t new_i_size;

1309

loff_t new_i_size;

1310

1311

new_i_size = pos + copied;

1311

new_i_size = pos + copied;

1312

if (new_i_size > EXT4_I(inode)->i_disksize)

1312

if (new_i_size > EXT4_I(inode)->i_disksize)

1313

EXT4_I(inode)->i_disksize = new_i_size;

1313

EXT4_I(inode)->i_disksize = new_i_size;

1314

copied = ext4_generic_write_end(file, mapping, pos, len, copied,

1314

ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,

1315

page, fsdata);

1315

page, fsdata);

1316

if (copied < 0)

1316

copied = ret2;

1317

ret = copied;

1317

if (ret2 < 0)

1318

ret = ret2;

1318

}

1319

}

1319

ret2 = ext4_journal_stop(handle);

1320

ret2 = ext4_journal_stop(handle);

1320

if (!ret)

1321

if (!ret)

1321

ret = ret2;

1322

ret = ret2;

1322

unlock_page(page);

1323

unlock_page(page);

1323

page_cache_release(page);

1324

page_cache_release(page);

1324

1325

return ret ? ret : copied;

1326

return ret ? ret : copied;

1326

}

1327

}

1327

1328

static int ext4_writeback_write_end(struct file *file,

1329

static int ext4_writeback_write_end(struct file *file,

1329

struct address_space *mapping,

1330

struct address_space *mapping,

1330

loff_t pos, unsigned len, unsigned copied,

1331

loff_t pos, unsigned len, unsigned copied,

1331

struct page *page, void *fsdata)

1332

struct page *page, void *fsdata)

1332

{

1333

{

1333

handle_t *handle = ext4_journal_current_handle();

1334

handle_t *handle = ext4_journal_current_handle();

1334

struct inode *inode = file->f_mapping->host;

1335

struct inode *inode = file->f_mapping->host;

1335

int ret = 0, ret2;

1336

int ret = 0, ret2;

1336

loff_t new_i_size;

1337

loff_t new_i_size;

1337

1338

new_i_size = pos + copied;

1339

new_i_size = pos + copied;

1339

if (new_i_size > EXT4_I(inode)->i_disksize)

1340

if (new_i_size > EXT4_I(inode)->i_disksize)

1340

EXT4_I(inode)->i_disksize = new_i_size;

1341

EXT4_I(inode)->i_disksize = new_i_size;

1341

1342

copied = ext4_generic_write_end(file, mapping, pos, len, copied,

1343

ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,

1343

page, fsdata);

1344

page, fsdata);

1344

if (copied < 0)

1345

copied = ret2;

1345

ret = copied;

1346

if (ret2 < 0)

1347

ret = ret2;

1346

1348

1347

ret2 = ext4_journal_stop(handle);

1349

ret2 = ext4_journal_stop(handle);

1348

if (!ret)

1350

if (!ret)

1349

ret = ret2;

1351

ret = ret2;

1350

unlock_page(page);

1352

unlock_page(page);

1351

page_cache_release(page);

1353

page_cache_release(page);

1352

1354

1353

return ret ? ret : copied;

1355

return ret ? ret : copied;

1354

}

1356

}

1355

1357

1356

static int ext4_journalled_write_end(struct file *file,

1358

static int ext4_journalled_write_end(struct file *file,

1357

struct address_space *mapping,

1359

struct address_space *mapping,

1358

loff_t pos, unsigned len, unsigned copied,

1360

loff_t pos, unsigned len, unsigned copied,

1359

struct page *page, void *fsdata)

1361

struct page *page, void *fsdata)

1360

{

1362

{

1361

handle_t *handle = ext4_journal_current_handle();

1363

handle_t *handle = ext4_journal_current_handle();

1362

struct inode *inode = mapping->host;

1364

struct inode *inode = mapping->host;

1363

int ret = 0, ret2;

1365

int ret = 0, ret2;

1364

int partial = 0;

1366

int partial = 0;

1365

unsigned from, to;

1367

unsigned from, to;

1366

1368

1367

from = pos & (PAGE_CACHE_SIZE - 1);

1369

from = pos & (PAGE_CACHE_SIZE - 1);

1368

to = from + len;

1370

to = from + len;

1369

1371

1370

if (copied < len) {

1372

if (copied < len) {

1371

if (!PageUptodate(page))

1373

if (!PageUptodate(page))

1372

copied = 0;

1374

copied = 0;

1373

page_zero_new_buffers(page, from+copied, to);

1375

page_zero_new_buffers(page, from+copied, to);

1374

}

1376

}

1375

1377

1376

ret = walk_page_buffers(handle, page_buffers(page), from,

1378

ret = walk_page_buffers(handle, page_buffers(page), from,

1377

to, &partial, write_end_fn);

1379

to, &partial, write_end_fn);

1378

if (!partial)

1380

if (!partial)

1379

SetPageUptodate(page);

1381

SetPageUptodate(page);

1380

if (pos+copied > inode->i_size)

1382

if (pos+copied > inode->i_size)

1381

i_size_write(inode, pos+copied);

1383

i_size_write(inode, pos+copied);

1382

EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;

1384

EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;

1383

if (inode->i_size > EXT4_I(inode)->i_disksize) {

1385

if (inode->i_size > EXT4_I(inode)->i_disksize) {

1384

EXT4_I(inode)->i_disksize = inode->i_size;

1386

EXT4_I(inode)->i_disksize = inode->i_size;

1385

ret2 = ext4_mark_inode_dirty(handle, inode);

1387

ret2 = ext4_mark_inode_dirty(handle, inode);

1386

if (!ret)

1388

if (!ret)

1387

ret = ret2;

1389

ret = ret2;

1388

}

1390

}

1389

1391

1390

ret2 = ext4_journal_stop(handle);

1392

ret2 = ext4_journal_stop(handle);

1391

if (!ret)

1393

if (!ret)

1392

ret = ret2;

1394

ret = ret2;

1393

unlock_page(page);

1395

unlock_page(page);

1394

page_cache_release(page);

1396

page_cache_release(page);

1395

1397

1396

return ret ? ret : copied;

1398

return ret ? ret : copied;

1397

}

1399

}

1398

1400

1399

/*

1401

/*

1400

* bmap() is special. It gets used by applications such as lilo and by

1402

* bmap() is special. It gets used by applications such as lilo and by

1401

* the swapper to find the on-disk block of a specific piece of data.

1403

* the swapper to find the on-disk block of a specific piece of data.

1402

*

1404

*

1403

* Naturally, this is dangerous if the block concerned is still in the

1405

* Naturally, this is dangerous if the block concerned is still in the

1404

* journal. If somebody makes a swapfile on an ext4 data-journaling

1406

* journal. If somebody makes a swapfile on an ext4 data-journaling

1405

* filesystem and enables swap, then they may get a nasty shock when the

1407

* filesystem and enables swap, then they may get a nasty shock when the

1406

* data getting swapped to that swapfile suddenly gets overwritten by

1408

* data getting swapped to that swapfile suddenly gets overwritten by

1407

* the original zero's written out previously to the journal and

1409

* the original zero's written out previously to the journal and

1408

* awaiting writeback in the kernel's buffer cache.

1410

* awaiting writeback in the kernel's buffer cache.

1409

*

1411

*

1410

* So, if we see any bmap calls here on a modified, data-journaled file,

1412

* So, if we see any bmap calls here on a modified, data-journaled file,

1411

* take extra steps to flush any blocks which might be in the cache.

1413

* take extra steps to flush any blocks which might be in the cache.

1412

*/

1414

*/

1413

static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
{
	struct inode *inode = mapping->host;
	struct ext4_inode_info *ei = EXT4_I(inode);
	journal_t *journal;
	int flush_err;

	/* Common case: no journalled data pending, plain lookup. */
	if (!(ei->i_state & EXT4_STATE_JDATA))
		return generic_block_bmap(mapping, block, ext4_get_block);

	/*
	 * The inode has journalled data outstanding, so flush the whole
	 * journal before reporting a block number.  This is very
	 * heavyweight, but bmap on a dirty file is expected to be
	 * extremely rare (lilo or swapon run on a freshly made file).
	 *
	 * bmap requires CAP_SYS_RAWIO, so this is not a denial-of-service
	 * vector for unprivileged users.
	 *
	 * NB. EXT4_STATE_JDATA is only ever set on regular files.  Anyone
	 * who bmaps a directory or symlink whose buffers have not reached
	 * disk yet gets no such protection.
	 */
	ei->i_state &= ~EXT4_STATE_JDATA;
	journal = EXT4_JOURNAL(inode);

	/* Keep new updates out while the flush runs. */
	jbd2_journal_lock_updates(journal);
	flush_err = jbd2_journal_flush(journal);
	jbd2_journal_unlock_updates(journal);

	/* A failed flush is reported to the caller as block 0. */
	if (flush_err)
		return 0;

	return generic_block_bmap(mapping, block, ext4_get_block);
}

1450

1452

1451

/*
 * walk_page_buffers() callback: take an extra reference on @bh.
 * @handle is not used.  Always returns 0.
 */
static int bget_one(handle_t *handle, struct buffer_head *bh)
{
	get_bh(bh);

	return 0;
}

1456

1458

1457

/*
 * walk_page_buffers() callback: drop one reference on @bh.
 * @handle is not used.  Always returns 0.
 */
static int bput_one(handle_t *handle, struct buffer_head *bh)
{
	put_bh(bh);

	return 0;
}

1462

1464

1463

/*
 * walk_page_buffers() callback: pass a mapped buffer to
 * ext4_journal_dirty_data() and return its result.  Unmapped buffers
 * are skipped and reported as success (0).
 */
static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
{
	if (!buffer_mapped(bh))
		return 0;

	return ext4_journal_dirty_data(handle, bh);
}

1469

1471

1470

/*

1472

/*

1471

* Note that we always start a transaction even if we're not journalling

1473

* Note that we always start a transaction even if we're not journalling

1472

* data. This is to preserve ordering: any hole instantiation within

1474

* data. This is to preserve ordering: any hole instantiation within

1473

* __block_write_full_page -> ext4_get_block() should be journalled

1475

* __block_write_full_page -> ext4_get_block() should be journalled

1474

* along with the data so we don't crash and then get metadata which

1476

* along with the data so we don't crash and then get metadata which

1475

* refers to old data.

1477

* refers to old data.

1476

*

1478

*

1477

* In all journalling modes block_write_full_page() will start the I/O.

1479

* In all journalling modes block_write_full_page() will start the I/O.

1478

*

1480

*

1479

* Problem:

1481

* Problem:

1480

*

1482

*

1481

* ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->

1483

* ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->

1482

* ext4_writepage()

1484

* ext4_writepage()

1483

*

1485

*

1484

* Similar for:

1486

* Similar for:

1485

*

1487

*

1486

* ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...

1488

* ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...

1487

*

1489

*

1488

* Same applies to ext4_get_block(). We will deadlock on various things like

1490

* Same applies to ext4_get_block(). We will deadlock on various things like

1489

* lock_journal and i_data_sem

1491

* lock_journal and i_data_sem

1490

*

1492

*

1491

* Setting PF_MEMALLOC here doesn't work - too many internal memory

1493

* Setting PF_MEMALLOC here doesn't work - too many internal memory

1492

* allocations fail.

1494

* allocations fail.

1493

*

1495

*

1494

* 16May01: If we're reentered then journal_current_handle() will be

1496

* 16May01: If we're reentered then journal_current_handle() will be

1495

* non-zero. We simply *return*.

1497

* non-zero. We simply *return*.

1496

*

1498

*

1497

* 1 July 2001: @@@ FIXME:

1499

* 1 July 2001: @@@ FIXME:

1498

* In journalled data mode, a data buffer may be metadata against the

1500

* In journalled data mode, a data buffer may be metadata against the

1499

* current transaction. But the same file is part of a shared mapping

1501

* current transaction. But the same file is part of a shared mapping

1500

* and someone does a writepage() on it.

1502

* and someone does a writepage() on it.

1501

*

1503

*

1502

* We will move the buffer onto the async_data list, but *after* it has

1504

* We will move the buffer onto the async_data list, but *after* it has

1503

* been dirtied. So there's a small window where we have dirty data on

1505

* been dirtied. So there's a small window where we have dirty data on

1504

* BJ_Metadata.

1506

* BJ_Metadata.

1505

*

1507

*

1506

* Note that this only applies to the last partial page in the file. The

1508

* Note that this only applies to the last partial page in the file. The

1507

* bit which block_write_full_page() uses prepare/commit for. (That's

1509

* bit which block_write_full_page() uses prepare/commit for. (That's

1508

* broken code anyway: it's wrong for msync()).

1510

* broken code anyway: it's wrong for msync()).

1509

*

1511

*

1510

* It's a rare case: affects the final partial page, for journalled data

1512

* It's a rare case: affects the final partial page, for journalled data

1511

* where the file is subject to both write() and writepage() in the same

1513

* where the file is subject to both write() and writepage() in the same

1512

* transaction. To fix it we'll need a custom block_write_full_page().

1514

* transaction. To fix it we'll need a custom block_write_full_page().

1513

* We'll probably need that anyway for journalling writepage() output.

1515

* We'll probably need that anyway for journalling writepage() output.

1514

*

1516

*

1515

* We don't honour synchronous mounts for writepage(). That would be

1517

* We don't honour synchronous mounts for writepage(). That would be

1516

* disastrous. Any write() or metadata operation will sync the fs for

1518

* disastrous. Any write() or metadata operation will sync the fs for

1517

* us.

1519

* us.

1518

*

1520

*

1519

* AKPM2: if all the page's buffers are mapped to disk and !data=journal,

1521

* AKPM2: if all the page's buffers are mapped to disk and !data=journal,

1520

* we don't need to open a transaction here.

1522

* we don't need to open a transaction here.

1521

*/

1523

*/

1522

static int ext4_ordered_writepage(struct page *page,

1524

static int ext4_ordered_writepage(struct page *page,

1523

struct writeback_control *wbc)

1525

struct writeback_control *wbc)

1524

{

1526

{

1525

struct inode *inode = page->mapping->host;

1527

struct inode *inode = page->mapping->host;

1526

struct buffer_head *page_bufs;

1528

struct buffer_head *page_bufs;

1527

handle_t *handle = NULL;

1529

handle_t *handle = NULL;

1528

int ret = 0;

1530

int ret = 0;

1529

int err;

1531

int err;

1530

1532

1531

J_ASSERT(PageLocked(page));

1533

J_ASSERT(PageLocked(page));

1532

1534

1533

/*

1535

/*

1534

* We give up here if we're reentered, because it might be for a

1536

* We give up here if we're reentered, because it might be for a

1535

* different filesystem.

1537

* different filesystem.

1536

*/

1538

*/

1537

if (ext4_journal_current_handle())

1539

if (ext4_journal_current_handle())

1538

goto out_fail;

1540

goto out_fail;

1539

1541

1540

handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));

1542

handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));

1541

1543

1542

if (IS_ERR(handle)) {

1544

if (IS_ERR(handle)) {

1543

ret = PTR_ERR(handle);

1545

ret = PTR_ERR(handle);

1544

goto out_fail;

1546

goto out_fail;

1545

}

1547

}

1546

1548

1547

if (!page_has_buffers(page)) {

1549

if (!page_has_buffers(page)) {

1548

create_empty_buffers(page, inode->i_sb->s_blocksize,

1550

create_empty_buffers(page, inode->i_sb->s_blocksize,

1549

(1 << BH_Dirty)|(1 << BH_Uptodate));

1551

(1 << BH_Dirty)|(1 << BH_Uptodate));

1550

}

1552

}

1551

page_bufs = page_buffers(page);

1553

page_bufs = page_buffers(page);

1552

walk_page_buffers(handle, page_bufs, 0,

1554

walk_page_buffers(handle, page_bufs, 0,

1553

PAGE_CACHE_SIZE, NULL, bget_one);

1555

PAGE_CACHE_SIZE, NULL, bget_one);

1554

1556

1555

ret = block_write_full_page(page, ext4_get_block, wbc);

1557

ret = block_write_full_page(page, ext4_get_block, wbc);

1556

1558

1557

/*

1559

/*

1558

* The page can become unlocked at any point now, and

1560

* The page can become unlocked at any point now, and

1559

* truncate can then come in and change things. So we

1561

* truncate can then come in and change things. So we

1560

* can't touch *page from now on. But *page_bufs is

1562

* can't touch *page from now on. But *page_bufs is

1561

* safe due to elevated refcount.

1563

* safe due to elevated refcount.

1562

*/

1564

*/

1563

1565

1564

/*

1566

/*

1565

* And attach them to the current transaction. But only if

1567

* And attach them to the current transaction. But only if

1566

* block_write_full_page() succeeded. Otherwise they are unmapped,

1568

* block_write_full_page() succeeded. Otherwise they are unmapped,

1567

* and generally junk.

1569

* and generally junk.

1568

*/

1570

*/

1569

if (ret == 0) {

1571

if (ret == 0) {

1570

err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,

1572

err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,

1571

NULL, jbd2_journal_dirty_data_fn);

1573

NULL, jbd2_journal_dirty_data_fn);

1572

if (!ret)

1574

if (!ret)

1573

ret = err;

1575

ret = err;

1574

}

1576

}

1575

walk_page_buffers(handle, page_bufs, 0,

1577

walk_page_buffers(handle, page_bufs, 0,

1576

PAGE_CACHE_SIZE, NULL, bput_one);

1578

PAGE_CACHE_SIZE, NULL, bput_one);

1577

err = ext4_journal_stop(handle);

1579

err = ext4_journal_stop(handle);

1578

if (!ret)

1580

if (!ret)

1579

ret = err;

1581

ret = err;

1580

return ret;

1582

return ret;

1581

1583

1582

out_fail:

1584

out_fail:

1583

redirty_page_for_writepage(wbc, page);

1585

redirty_page_for_writepage(wbc, page);

1584

unlock_page(page);

1586

unlock_page(page);

1585

return ret;

1587

return ret;

1586

}

1588

}

1587

1589

1588

static int ext4_writeback_writepage(struct page *page,

1590

static int ext4_writeback_writepage(struct page *page,

1589

struct writeback_control *wbc)

1591

struct writeback_control *wbc)

1590

{

1592

{

1591

struct inode *inode = page->mapping->host;

1593

struct inode *inode = page->mapping->host;

1592

handle_t *handle = NULL;

1594

handle_t *handle = NULL;

1593

int ret = 0;

1595

int ret = 0;

1594

int err;

1596

int err;

1595

1597

1596

if (ext4_journal_current_handle())

1598

if (ext4_journal_current_handle())

1597

goto out_fail;

1599

goto out_fail;

1598

1600

1599

handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));

1601

handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));

1600

if (IS_ERR(handle)) {

1602

if (IS_ERR(handle)) {

1601

ret = PTR_ERR(handle);

1603

ret = PTR_ERR(handle);

1602

goto out_fail;

1604

goto out_fail;

1603

}

1605

}

1604

1606

1605

if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))

1607

if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))

1606

ret = nobh_writepage(page, ext4_get_block, wbc);

1608

ret = nobh_writepage(page, ext4_get_block, wbc);

1607

else

1609

else

1608

ret = block_write_full_page(page, ext4_get_block, wbc);

1610

ret = block_write_full_page(page, ext4_get_block, wbc);

1609

1611

1610

err = ext4_journal_stop(handle);

1612

err = ext4_journal_stop(handle);

1611

if (!ret)

1613

if (!ret)

1612

ret = err;

1614

ret = err;

1613

return ret;

1615

return ret;

1614

1616

1615

out_fail:

1617

out_fail:

1616

redirty_page_for_writepage(wbc, page);

1618

redirty_page_for_writepage(wbc, page);

1617

unlock_page(page);

1619

unlock_page(page);

1618

return ret;

1620

return ret;

1619

}

1621

}

1620

1622

1621

static int ext4_journalled_writepage(struct page *page,

1623

static int ext4_journalled_writepage(struct page *page,

1622

struct writeback_control *wbc)

1624

struct writeback_control *wbc)

1623

{

1625

{

1624

struct inode *inode = page->mapping->host;

1626

struct inode *inode = page->mapping->host;

1625

handle_t *handle = NULL;

1627

handle_t *handle = NULL;

1626

int ret = 0;

1628

int ret = 0;

1627

int err;

1629

int err;

1628

1630

1629

if (ext4_journal_current_handle())

1631

if (ext4_journal_current_handle())

1630

goto no_write;

1632

goto no_write;

1631

1633

1632

handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));

1634

handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));

1633

if (IS_ERR(handle)) {

1635

if (IS_ERR(handle)) {

1634

ret = PTR_ERR(handle);

1636

ret = PTR_ERR(handle);

1635

goto no_write;

1637

goto no_write;

1636

}

1638

}

1637

1639

1638

if (!page_has_buffers(page) || PageChecked(page)) {

1640

if (!page_has_buffers(page) || PageChecked(page)) {

1639

/*

1641

/*

1640

* It's mmapped pagecache. Add buffers and journal it. There

1642

* It's mmapped pagecache. Add buffers and journal it. There

1641

* doesn't seem much point in redirtying the page here.

1643

* doesn't seem much point in redirtying the page here.

1642

*/

1644

*/

1643

ClearPageChecked(page);

1645

ClearPageChecked(page);

1644

ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,

1646

ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,

1645

ext4_get_block);

1647

ext4_get_block);

1646

if (ret != 0) {

1648

if (ret != 0) {

1647

ext4_journal_stop(handle);

1649

ext4_journal_stop(handle);

1648

goto out_unlock;

1650

goto out_unlock;

1649

}

1651

}

1650

ret = walk_page_buffers(handle, page_buffers(page), 0,

1652

ret = walk_page_buffers(handle, page_buffers(page), 0,

1651

PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);

1653

PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);

1652

1654

1653

err = walk_page_buffers(handle, page_buffers(page), 0,

1655

err = walk_page_buffers(handle, page_buffers(page), 0,

1654

PAGE_CACHE_SIZE, NULL, write_end_fn);

1656

PAGE_CACHE_SIZE, NULL, write_end_fn);

1655

if (ret == 0)

1657

if (ret == 0)

1656

ret = err;

1658

ret = err;

1657

EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;

1659

EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;

1658

unlock_page(page);

1660

unlock_page(page);

1659

} else {

1661

} else {

1660

/*

1662

/*

1661

* It may be a page full of checkpoint-mode buffers. We don't

1663

* It may be a page full of checkpoint-mode buffers. We don't

1662

* really know unless we go poke around in the buffer_heads.

1664

* really know unless we go poke around in the buffer_heads.

1663

* But block_write_full_page will do the right thing.

1665

* But block_write_full_page will do the right thing.

1664

*/

1666

*/

1665

ret = block_write_full_page(page, ext4_get_block, wbc);

1667

ret = block_write_full_page(page, ext4_get_block, wbc);

1666

}

1668

}

1667

err = ext4_journal_stop(handle);

1669

err = ext4_journal_stop(handle);

1668

if (!ret)

1670

if (!ret)

1669

ret = err;

1671

ret = err;

1670

out:

1672

out:

1671

return ret;

1673

return ret;

1672

1674

1673

no_write:

1675

no_write:

1674

redirty_page_for_writepage(wbc, page);

1676

redirty_page_for_writepage(wbc, page);

1675

out_unlock:

1677

out_unlock:

1676

unlock_page(page);

1678

unlock_page(page);

1677

goto out;

1679

goto out;

1678

}

1680

}

1679

1681

1680

static int ext4_readpage(struct file *file, struct page *page)

1682

static int ext4_readpage(struct file *file, struct page *page)

1681

{

1683

{

1682

return mpage_readpage(page, ext4_get_block);

1684

return mpage_readpage(page, ext4_get_block);

1683

}

1685

}

1684

1686

1685

static int

1687

static int

1686

ext4_readpages(struct file *file, struct address_space *mapping,

1688

ext4_readpages(struct file *file, struct address_space *mapping,

1687

struct list_head *pages, unsigned nr_pages)

1689

struct list_head *pages, unsigned nr_pages)

1688

{

1690

{

1689

return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);

1691

return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);

1690

}

1692

}

1691

1693

1692

static void ext4_invalidatepage(struct page *page, unsigned long offset)

1694

static void ext4_invalidatepage(struct page *page, unsigned long offset)

1693

{

1695

{

1694

journal_t *journal = EXT4_JOURNAL(page->mapping->host);

1696

journal_t *journal = EXT4_JOURNAL(page->mapping->host);

1695

1697

1696

/*

1698

/*

1697

* If it's a full truncate we just forget about the pending dirtying

1699

* If it's a full truncate we just forget about the pending dirtying

1698

*/

1700

*/

1699

if (offset == 0)

1701

if (offset == 0)

1700

ClearPageChecked(page);

1702

ClearPageChecked(page);

1701

1703

1702

jbd2_journal_invalidatepage(journal, page, offset);

1704

jbd2_journal_invalidatepage(journal, page, offset);

1703

}

1705

}

1704

1706

1705

/*
 * ->releasepage: try to free the buffers attached to @page through
 * jbd2_journal_try_to_free_buffers().  Returns 0 without calling into
 * jbd2 when the page has no buffers.  Warns if the page is still
 * flagged PageChecked.
 */
static int ext4_releasepage(struct page *page, gfp_t wait)
{
	struct inode *host = page->mapping->host;
	journal_t *journal = EXT4_JOURNAL(host);

	WARN_ON(PageChecked(page));

	if (page_has_buffers(page))
		return jbd2_journal_try_to_free_buffers(journal, page, wait);

	return 0;
}

1714

1716

1715

/*

1717

/*

1716

* If the O_DIRECT write will extend the file then add this inode to the

1718

* If the O_DIRECT write will extend the file then add this inode to the

1717

* orphan list. So recovery will truncate it back to the original size

1719

* orphan list. So recovery will truncate it back to the original size

1718

* if the machine crashes during the write.

1720

* if the machine crashes during the write.

1719

*

1721

*

1720

* If the O_DIRECT write is instantiating holes inside i_size and the machine

1722

* If the O_DIRECT write is instantiating holes inside i_size and the machine

1721

* crashes then stale disk data _may_ be exposed inside the file. But current

1723

* crashes then stale disk data _may_ be exposed inside the file. But current

1722

* VFS code falls back into buffered path in that case so we are safe.

1724

* VFS code falls back into buffered path in that case so we are safe.

1723

*/

1725

*/

1724

static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,

1726

static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,

1725

const struct iovec *iov, loff_t offset,

1727

const struct iovec *iov, loff_t offset,

1726

unsigned long nr_segs)

1728

unsigned long nr_segs)

1727

{

1729

{

1728

struct file *file = iocb->ki_filp;

1730

struct file *file = iocb->ki_filp;

1729

struct inode *inode = file->f_mapping->host;

1731

struct inode *inode = file->f_mapping->host;

1730

struct ext4_inode_info *ei = EXT4_I(inode);

1732

struct ext4_inode_info *ei = EXT4_I(inode);

1731

handle_t *handle;

1733

handle_t *handle;

1732

ssize_t ret;

1734

ssize_t ret;

1733

int orphan = 0;

1735

int orphan = 0;

1734

size_t count = iov_length(iov, nr_segs);

1736

size_t count = iov_length(iov, nr_segs);

1735

1737

1736

if (rw == WRITE) {

1738

if (rw == WRITE) {

1737

loff_t final_size = offset + count;

1739

loff_t final_size = offset + count;

1738

1740

1739

if (final_size > inode->i_size) {

1741

if (final_size > inode->i_size) {

1740

/* Credits for sb + inode write */

1742

/* Credits for sb + inode write */

1741

handle = ext4_journal_start(inode, 2);

1743

handle = ext4_journal_start(inode, 2);

1742

if (IS_ERR(handle)) {

1744

if (IS_ERR(handle)) {

1743

ret = PTR_ERR(handle);

1745

ret = PTR_ERR(handle);

1744

goto out;

1746

goto out;

1745

}

1747

}

1746

ret = ext4_orphan_add(handle, inode);

1748

ret = ext4_orphan_add(handle, inode);

1747

if (ret) {

1749

if (ret) {

1748

ext4_journal_stop(handle);

1750

ext4_journal_stop(handle);

1749

goto out;

1751

goto out;

1750

}

1752

}

1751

orphan = 1;

1753

orphan = 1;

1752

ei->i_disksize = inode->i_size;

1754

ei->i_disksize = inode->i_size;

1753

ext4_journal_stop(handle);

1755

ext4_journal_stop(handle);

1754

}

1756

}

1755

}

1757

}

1756

1758

1757

ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,

1759

ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,

1758

offset, nr_segs,

1760

offset, nr_segs,

1759

ext4_get_block, NULL);

1761

ext4_get_block, NULL);

1760

1762

1761

if (orphan) {

1763

if (orphan) {

1762

int err;

1764

int err;

1763

1765

1764

/* Credits for sb + inode write */

1766

/* Credits for sb + inode write */

1765

handle = ext4_journal_start(inode, 2);

1767

handle = ext4_journal_start(inode, 2);

1766

if (IS_ERR(handle)) {

1768

if (IS_ERR(handle)) {

1767

/* This is really bad luck. We've written the data

1769

/* This is really bad luck. We've written the data

1768

* but cannot extend i_size. Bail out and pretend

1770

* but cannot extend i_size. Bail out and pretend

1769

* the write failed... */

1771

* the write failed... */

1770

ret = PTR_ERR(handle);

1772

ret = PTR_ERR(handle);

1771

goto out;

1773

goto out;

1772

}

1774

}

1773

if (inode->i_nlink)

1775

if (inode->i_nlink)

1774

ext4_orphan_del(handle, inode);

1776

ext4_orphan_del(handle, inode);

1775

if (ret > 0) {

1777

if (ret > 0) {

1776

loff_t end = offset + ret;

1778

loff_t end = offset + ret;

1777

if (end > inode->i_size) {

1779

if (end > inode->i_size) {

1778

ei->i_disksize = end;

1780

ei->i_disksize = end;

1779

i_size_write(inode, end);

1781

i_size_write(inode, end);

1780

/*

1782

/*

1781

* We're going to return a positive `ret'

1783

* We're going to return a positive `ret'

1782

* here due to non-zero-length I/O, so there's

1784

* here due to non-zero-length I/O, so there's

1783

* no way of reporting error returns from

1785

* no way of reporting error returns from

1784

* ext4_mark_inode_dirty() to userspace. So

1786

* ext4_mark_inode_dirty() to userspace. So

1785

* ignore it.

1787

* ignore it.

1786

*/

1788

*/

1787

ext4_mark_inode_dirty(handle, inode);

1789

ext4_mark_inode_dirty(handle, inode);

1788

}

1790

}

1789

}

1791

}

1790

err = ext4_journal_stop(handle);

1792

err = ext4_journal_stop(handle);

1791

if (ret == 0)

1793

if (ret == 0)

1792

ret = err;

1794

ret = err;

1793

}

1795

}

1794

out:

1796

out:

1795

return ret;

1797

return ret;

1796

}

1798

}

1797

1799

1798

/*

1800

/*

1799

* Pages can be marked dirty completely asynchronously from ext4's journalling

1801

* Pages can be marked dirty completely asynchronously from ext4's journalling

1800

* activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do

1802

* activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do

1801

* much here because ->set_page_dirty is called under VFS locks. The page is

1803

* much here because ->set_page_dirty is called under VFS locks. The page is

1802

* not necessarily locked.

1804

* not necessarily locked.

1803

*

1805

*

1804

* We cannot just dirty the page and leave attached buffers clean, because the

1806

* We cannot just dirty the page and leave attached buffers clean, because the

1805

* buffers' dirty state is "definitive". We cannot just set the buffers dirty

1807

* buffers' dirty state is "definitive". We cannot just set the buffers dirty

1806

* or jbddirty because all the journalling code will explode.

1808

* or jbddirty because all the journalling code will explode.

1807

*

1809

*

1808

* So what we do is to mark the page "pending dirty" and next time writepage

1810

* So what we do is to mark the page "pending dirty" and next time writepage

1809

* is called, propagate that into the buffers appropriately.

1811

* is called, propagate that into the buffers appropriately.

1810

*/

1812

*/

1811

static int ext4_journalled_set_page_dirty(struct page *page)
{
	/*
	 * Flag the page as "pending dirty"; ext4_journalled_writepage()
	 * tests and clears PageChecked when it writes the page.
	 */
	SetPageChecked(page);

	/* Dirty the page itself without touching its buffers. */
	return __set_page_dirty_nobuffers(page);
}

1816

1818

1817

static const struct address_space_operations ext4_ordered_aops = {

1819

static const struct address_space_operations ext4_ordered_aops = {

1818

.readpage = ext4_readpage,

1820

.readpage = ext4_readpage,

1819

.readpages = ext4_readpages,

1821

.readpages = ext4_readpages,

1820

.writepage = ext4_ordered_writepage,

1822

.writepage = ext4_ordered_writepage,

1821

.sync_page = block_sync_page,

1823

.sync_page = block_sync_page,

1822

.write_begin = ext4_write_begin,

1824

.write_begin = ext4_write_begin,

1823

.write_end = ext4_ordered_write_end,

1825

.write_end = ext4_ordered_write_end,

1824

.bmap = ext4_bmap,

1826

.bmap = ext4_bmap,

1825

.invalidatepage = ext4_invalidatepage,

1827

.invalidatepage = ext4_invalidatepage,

1826

.releasepage = ext4_releasepage,

1828

.releasepage = ext4_releasepage,

1827

.direct_IO = ext4_direct_IO,

1829

.direct_IO = ext4_direct_IO,

1828

.migratepage = buffer_migrate_page,

1830

.migratepage = buffer_migrate_page,

1829

};

1831

};

1830

1832

1831

static const struct address_space_operations ext4_writeback_aops = {

1833

static const struct address_space_operations ext4_writeback_aops = {

1832

.readpage = ext4_readpage,

1834

.readpage = ext4_readpage,

1833

.readpages = ext4_readpages,

1835

.readpages = ext4_readpages,

1834

.writepage = ext4_writeback_writepage,

1836

.writepage = ext4_writeback_writepage,

1835

.sync_page = block_sync_page,

1837

.sync_page = block_sync_page,

1836

.write_begin = ext4_write_begin,

1838

.write_begin = ext4_write_begin,

1837

.write_end = ext4_writeback_write_end,

1839

.write_end = ext4_writeback_write_end,

1838

.bmap = ext4_bmap,

1840

.bmap = ext4_bmap,

1839

.invalidatepage = ext4_invalidatepage,

1841

.invalidatepage = ext4_invalidatepage,

1840

.releasepage = ext4_releasepage,

1842

.releasepage = ext4_releasepage,

1841

.direct_IO = ext4_direct_IO,

1843

.direct_IO = ext4_direct_IO,

1842

.migratepage = buffer_migrate_page,

1844

.migratepage = buffer_migrate_page,

1843

};

1845

};

1844

1846

1845

static const struct address_space_operations ext4_journalled_aops = {

1847

static const struct address_space_operations ext4_journalled_aops = {

1846

.readpage = ext4_readpage,

1848

.readpage = ext4_readpage,

1847

.readpages = ext4_readpages,

1849

.readpages = ext4_readpages,

1848

.writepage = ext4_journalled_writepage,

1850

.writepage = ext4_journalled_writepage,

1849

.sync_page = block_sync_page,

1851

.sync_page = block_sync_page,

1850

.write_begin = ext4_write_begin,

1852

.write_begin = ext4_write_begin,

1851

.write_end = ext4_journalled_write_end,

1853

.write_end = ext4_journalled_write_end,

1852

.set_page_dirty = ext4_journalled_set_page_dirty,

1854

.set_page_dirty = ext4_journalled_set_page_dirty,

1853

.bmap = ext4_bmap,

1855

.bmap = ext4_bmap,

1854

.invalidatepage = ext4_invalidatepage,

1856

.invalidatepage = ext4_invalidatepage,

1855

.releasepage = ext4_releasepage,

1857

.releasepage = ext4_releasepage,

1856

};

1858

};

1857

1859

1858

void ext4_set_aops(struct inode *inode)

1860

void ext4_set_aops(struct inode *inode)

1859

{

1861

{

1860

if (ext4_should_order_data(inode))

1862

if (ext4_should_order_data(inode))

1861

inode->i_mapping->a_ops = &ext4_ordered_aops;

1863

inode->i_mapping->a_ops = &ext4_ordered_aops;

1862

else if (ext4_should_writeback_data(inode))

1864

else if (ext4_should_writeback_data(inode))

1863

inode->i_mapping->a_ops = &ext4_writeback_aops;

1865

inode->i_mapping->a_ops = &ext4_writeback_aops;

1864

else

1866

else

1865

inode->i_mapping->a_ops = &ext4_journalled_aops;

1867

inode->i_mapping->a_ops = &ext4_journalled_aops;

1866

}

1868

}

1867

1869

1868

/*

1870

/*

1869

* ext4_block_truncate_page() zeroes out a mapping from file offset `from'

1871

* ext4_block_truncate_page() zeroes out a mapping from file offset `from'

1870

* up to the end of the block which corresponds to `from'.

1872

* up to the end of the block which corresponds to `from'.

1871

* This is required during truncate. We need to physically zero the tail end

1873

* This is required during truncate. We need to physically zero the tail end

1872

* of that block so it doesn't yield old data if the file is later grown.

1874

* of that block so it doesn't yield old data if the file is later grown.

1873

*/

1875

*/

1874

int ext4_block_truncate_page(handle_t *handle, struct page *page,

1876

int ext4_block_truncate_page(handle_t *handle, struct page *page,

1875

struct address_space *mapping, loff_t from)

1877

struct address_space *mapping, loff_t from)

1876

{

1878

{

1877

ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;

1879

ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;

1878

unsigned offset = from & (PAGE_CACHE_SIZE-1);

1880

unsigned offset = from & (PAGE_CACHE_SIZE-1);

1879

unsigned blocksize, length, pos;

1881

unsigned blocksize, length, pos;

1880

ext4_lblk_t iblock;

1882

ext4_lblk_t iblock;

1881

struct inode *inode = mapping->host;

1883

struct inode *inode = mapping->host;

1882

struct buffer_head *bh;

1884

struct buffer_head *bh;

1883

int err = 0;

1885

int err = 0;

1884

1886

1885

blocksize = inode->i_sb->s_blocksize;

1887

blocksize = inode->i_sb->s_blocksize;

1886

length = blocksize - (offset & (blocksize - 1));

1888

length = blocksize - (offset & (blocksize - 1));

1887

iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);

1889

iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);

1888

1890

1889

/*

1891

/*

1890

* For "nobh" option, we can only work if we don't need to

1892

* For "nobh" option, we can only work if we don't need to

1891

* read-in the page - otherwise we create buffers to do the IO.

1893

* read-in the page - otherwise we create buffers to do the IO.

1892

*/

1894

*/

1893

if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&

1895

if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&

1894

ext4_should_writeback_data(inode) && PageUptodate(page)) {

1896

ext4_should_writeback_data(inode) && PageUptodate(page)) {

1895

zero_user(page, offset, length);

1897

zero_user(page, offset, length);

1896

set_page_dirty(page);

1898

set_page_dirty(page);

1897

goto unlock;

1899

goto unlock;

1898

}

1900

}

1899

1901

1900

if (!page_has_buffers(page))

1902

if (!page_has_buffers(page))

1901

create_empty_buffers(page, blocksize, 0);

1903

create_empty_buffers(page, blocksize, 0);

1902

1904

1903

/* Find the buffer that contains "offset" */

1905

/* Find the buffer that contains "offset" */

1904

bh = page_buffers(page);

1906

bh = page_buffers(page);

1905

pos = blocksize;

1907

pos = blocksize;

1906

while (offset >= pos) {

1908

while (offset >= pos) {

1907

bh = bh->b_this_page;

1909

bh = bh->b_this_page;

1908

iblock++;

1910

iblock++;

1909

pos += blocksize;

1911

pos += blocksize;

1910

}

1912

}

1911

1913

1912

err = 0;

1914

err = 0;

1913

if (buffer_freed(bh)) {

1915

if (buffer_freed(bh)) {

1914

BUFFER_TRACE(bh, "freed: skip");

1916

BUFFER_TRACE(bh, "freed: skip");

1915

goto unlock;

1917

goto unlock;

1916

}

1918

}

1917

1919

1918

if (!buffer_mapped(bh)) {

1920

if (!buffer_mapped(bh)) {

1919

BUFFER_TRACE(bh, "unmapped");

1921

BUFFER_TRACE(bh, "unmapped");

1920

ext4_get_block(inode, iblock, bh, 0);

1922

ext4_get_block(inode, iblock, bh, 0);

1921

/* unmapped? It's a hole - nothing to do */

1923

/* unmapped? It's a hole - nothing to do */

1922

if (!buffer_mapped(bh)) {

1924

if (!buffer_mapped(bh)) {

1923

BUFFER_TRACE(bh, "still unmapped");

1925

BUFFER_TRACE(bh, "still unmapped");

1924

goto unlock;

1926

goto unlock;

1925

}

1927

}

1926

}

1928

}

1927

1929

1928

/* Ok, it's mapped. Make sure it's up-to-date */

1930

/* Ok, it's mapped. Make sure it's up-to-date */

1929

if (PageUptodate(page))

1931

if (PageUptodate(page))

1930

set_buffer_uptodate(bh);

1932

set_buffer_uptodate(bh);

1931

1933

1932

if (!buffer_uptodate(bh)) {

1934

if (!buffer_uptodate(bh)) {

1933

err = -EIO;

1935

err = -EIO;

1934

ll_rw_block(READ, 1, &bh);

1936

ll_rw_block(READ, 1, &bh);

1935

wait_on_buffer(bh);

1937

wait_on_buffer(bh);

1936

/* Uhhuh. Read error. Complain and punt. */

1938

/* Uhhuh. Read error. Complain and punt. */

1937

if (!buffer_uptodate(bh))

1939

if (!buffer_uptodate(bh))

1938

goto unlock;

1940

goto unlock;

1939

}

1941

}

1940

1942

1941

if (ext4_should_journal_data(inode)) {

1943

if (ext4_should_journal_data(inode)) {

1942

BUFFER_TRACE(bh, "get write access");

1944

BUFFER_TRACE(bh, "get write access");

1943

err = ext4_journal_get_write_access(handle, bh);

1945

err = ext4_journal_get_write_access(handle, bh);

1944

if (err)

1946

if (err)

1945

goto unlock;

1947

goto unlock;

1946

}

1948

}

1947

1949

1948

zero_user(page, offset, length);

1950

zero_user(page, offset, length);

1949

1951

1950

BUFFER_TRACE(bh, "zeroed end of block");

1952

BUFFER_TRACE(bh, "zeroed end of block");

1951

1953

1952

err = 0;

1954

err = 0;

1953

if (ext4_should_journal_data(inode)) {

1955

if (ext4_should_journal_data(inode)) {

1954

err = ext4_journal_dirty_metadata(handle, bh);

1956

err = ext4_journal_dirty_metadata(handle, bh);

1955

} else {

1957

} else {

1956

if (ext4_should_order_data(inode))

1958

if (ext4_should_order_data(inode))

1957

err = ext4_journal_dirty_data(handle, bh);

1959

err = ext4_journal_dirty_data(handle, bh);

1958

mark_buffer_dirty(bh);

1960

mark_buffer_dirty(bh);

1959

}

1961

}

1960

1962

1961

unlock:

1963

unlock:

1962

unlock_page(page);

1964

unlock_page(page);

1963

page_cache_release(page);

1965

page_cache_release(page);

1964

return err;

1966

return err;

1965

}

1967

}

1966

1968

1967

/*

1969

/*

1968

* Probably it should be a library function... search for first non-zero word

1970

* Probably it should be a library function... search for first non-zero word

1969

* or memcmp with zero_page, whatever is better for particular architecture.

1971

* or memcmp with zero_page, whatever is better for particular architecture.

1970

* Linus?

1972

* Linus?

1971

*/

1973

*/

1972

static inline int all_zeroes(__le32 *p, __le32 *q)

1974

static inline int all_zeroes(__le32 *p, __le32 *q)

1973

{

1975

{

1974

while (p < q)

1976

while (p < q)

1975

if (*p++)

1977

if (*p++)

1976

return 0;

1978

return 0;

1977

return 1;

1979

return 1;

1978

}

1980

}

1979

1981

1980

/**

1982

/**

1981

* ext4_find_shared - find the indirect blocks for partial truncation.

1983

* ext4_find_shared - find the indirect blocks for partial truncation.

1982

* @inode: inode in question

1984

* @inode: inode in question

1983

* @depth: depth of the affected branch

1985

* @depth: depth of the affected branch

1984

* @offsets: offsets of pointers in that branch (see ext4_block_to_path)

1986

* @offsets: offsets of pointers in that branch (see ext4_block_to_path)

1985

* @chain: place to store the pointers to partial indirect blocks

1987

* @chain: place to store the pointers to partial indirect blocks

1986

* @top: place to the (detached) top of branch

1988

* @top: place to the (detached) top of branch

1987

*

1989

*

1988

* This is a helper function used by ext4_truncate().

1990

* This is a helper function used by ext4_truncate().

1989

*

1991

*

1990

* When we do truncate() we may have to clean the ends of several

1992

* When we do truncate() we may have to clean the ends of several

1991

* indirect blocks but leave the blocks themselves alive. Block is

1993

* indirect blocks but leave the blocks themselves alive. Block is

1992

* partially truncated if some data below the new i_size is refered

1994

* partially truncated if some data below the new i_size is refered

1993

* from it (and it is on the path to the first completely truncated

1995

* from it (and it is on the path to the first completely truncated

1994

* data block, indeed). We have to free the top of that path along

1996

* data block, indeed). We have to free the top of that path along

1995

* with everything to the right of the path. Since no allocation

1997

* with everything to the right of the path. Since no allocation

1996

* past the truncation point is possible until ext4_truncate()

1998

* past the truncation point is possible until ext4_truncate()

1997

* finishes, we may safely do the latter, but top of branch may

1999

* finishes, we may safely do the latter, but top of branch may

1998

* require special attention - pageout below the truncation point

2000

* require special attention - pageout below the truncation point

1999

* might try to populate it.

2001

* might try to populate it.

2000

*

2002

*

2001

* We atomically detach the top of branch from the tree, store the

2003

* We atomically detach the top of branch from the tree, store the

2002

* block number of its root in *@top, pointers to buffer_heads of

2004

* block number of its root in *@top, pointers to buffer_heads of

2003

* partially truncated blocks - in @chain[].bh and pointers to

2005

* partially truncated blocks - in @chain[].bh and pointers to

2004

* their last elements that should not be removed - in

2006

* their last elements that should not be removed - in

2005

* @chain[].p. Return value is the pointer to last filled element

2007

* @chain[].p. Return value is the pointer to last filled element

2006

* of @chain.

2008

* of @chain.

2007

*

2009

*

2008

* The work left to caller to do the actual freeing of subtrees:

2010

* The work left to caller to do the actual freeing of subtrees:

2009

* a) free the subtree starting from *@top

2011

* a) free the subtree starting from *@top

2010

* b) free the subtrees whose roots are stored in

2012

* b) free the subtrees whose roots are stored in

2011

* (@chain[i].p+1 .. end of @chain[i].bh->b_data)

2013

* (@chain[i].p+1 .. end of @chain[i].bh->b_data)

2012

* c) free the subtrees growing from the inode past the @chain[0].

2014

* c) free the subtrees growing from the inode past the @chain[0].

2013

* (no partially truncated stuff there). */

2015

* (no partially truncated stuff there). */

2014

2016

2015

static Indirect *ext4_find_shared(struct inode *inode, int depth,

2017

static Indirect *ext4_find_shared(struct inode *inode, int depth,

2016

ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top)

2018

ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top)

2017

{

2019

{

2018

Indirect *partial, *p;

2020

Indirect *partial, *p;

2019

int k, err;

2021

int k, err;

2020

2022

2021

*top = 0;

2023

*top = 0;

2022

/* Make k index the deepest non-null offest + 1 */

2024

/* Make k index the deepest non-null offest + 1 */

2023

for (k = depth; k > 1 && !offsets[k-1]; k--)

2025

for (k = depth; k > 1 && !offsets[k-1]; k--)

2024

;

2026

;

2025

partial = ext4_get_branch(inode, k, offsets, chain, &err);

2027

partial = ext4_get_branch(inode, k, offsets, chain, &err);

2026

/* Writer: pointers */

2028

/* Writer: pointers */

2027

if (!partial)

2029

if (!partial)

2028

partial = chain + k-1;

2030

partial = chain + k-1;

2029

/*

2031

/*

2030

* If the branch acquired continuation since we've looked at it -

2032

* If the branch acquired continuation since we've looked at it -

2031

* fine, it should all survive and (new) top doesn't belong to us.

2033

* fine, it should all survive and (new) top doesn't belong to us.

2032

*/

2034

*/

2033

if (!partial->key && *partial->p)

2035

if (!partial->key && *partial->p)

2034

/* Writer: end */

2036

/* Writer: end */

2035

goto no_top;

2037

goto no_top;

2036

for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)

2038

for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)

2037

;

2039

;

2038

/*

2040

/*

2039

* OK, we've found the last block that must survive. The rest of our

2041

* OK, we've found the last block that must survive. The rest of our

2040

* branch should be detached before unlocking. However, if that rest

2042

* branch should be detached before unlocking. However, if that rest

2041

* of branch is all ours and does not grow immediately from the inode

2043

* of branch is all ours and does not grow immediately from the inode

2042

* it's easier to cheat and just decrement partial->p.

2044

* it's easier to cheat and just decrement partial->p.

2043

*/

2045

*/

2044

if (p == chain + k - 1 && p > chain) {

2046

if (p == chain + k - 1 && p > chain) {

2045

p->p--;

2047

p->p--;

2046

} else {

2048

} else {

2047

*top = *p->p;

2049

*top = *p->p;

2048

/* Nope, don't do this in ext4. Must leave the tree intact */

2050

/* Nope, don't do this in ext4. Must leave the tree intact */

2049

#if 0

2051

#if 0

2050

*p->p = 0;

2052

*p->p = 0;

2051

#endif

2053

#endif

2052

}

2054

}

2053

/* Writer: end */

2055

/* Writer: end */

2054

2056

2055

while(partial > p) {

2057

while(partial > p) {

2056

brelse(partial->bh);

2058

brelse(partial->bh);

2057

partial--;

2059

partial--;

2058

}

2060

}

2059

no_top:

2061

no_top:

2060

return partial;

2062

return partial;

2061

}

2063

}

2062

2064

2063

/*

2065

/*

2064

* Zero a number of block pointers in either an inode or an indirect block.

2066

* Zero a number of block pointers in either an inode or an indirect block.

2065

* If we restart the transaction we must again get write access to the

2067

* If we restart the transaction we must again get write access to the

2066

* indirect block for further modification.

2068

* indirect block for further modification.

2067

*

2069

*

2068

* We release `count' blocks on disk, but (last - first) may be greater

2070

* We release `count' blocks on disk, but (last - first) may be greater

2069

* than `count' because there can be holes in there.

2071

* than `count' because there can be holes in there.

2070

*/

2072

*/

2071

static void ext4_clear_blocks(handle_t *handle, struct inode *inode,

2073

static void ext4_clear_blocks(handle_t *handle, struct inode *inode,

2072

struct buffer_head *bh, ext4_fsblk_t block_to_free,

2074

struct buffer_head *bh, ext4_fsblk_t block_to_free,

2073

unsigned long count, __le32 *first, __le32 *last)

2075

unsigned long count, __le32 *first, __le32 *last)

2074

{

2076

{

2075

__le32 *p;

2077

__le32 *p;

2076

if (try_to_extend_transaction(handle, inode)) {

2078

if (try_to_extend_transaction(handle, inode)) {

2077

if (bh) {

2079

if (bh) {

2078

BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");

2080

BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");

2079

ext4_journal_dirty_metadata(handle, bh);

2081

ext4_journal_dirty_metadata(handle, bh);

2080

}

2082

}

2081

ext4_mark_inode_dirty(handle, inode);

2083

ext4_mark_inode_dirty(handle, inode);

2082

ext4_journal_test_restart(handle, inode);

2084

ext4_journal_test_restart(handle, inode);

2083

if (bh) {

2085

if (bh) {

2084

BUFFER_TRACE(bh, "retaking write access");

2086

BUFFER_TRACE(bh, "retaking write access");

2085

ext4_journal_get_write_access(handle, bh);

2087

ext4_journal_get_write_access(handle, bh);

2086

}

2088

}

2087

}

2089

}

2088

2090

2089

/*

2091

/*

2090

* Any buffers which are on the journal will be in memory. We find

2092

* Any buffers which are on the journal will be in memory. We find

2091

* them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget()

2093

* them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget()

2092

* on them. We've already detached each block from the file, so

2094

* on them. We've already detached each block from the file, so

2093

* bforget() in jbd2_journal_forget() should be safe.

2095

* bforget() in jbd2_journal_forget() should be safe.

2094

*

2096

*

2095

* AKPM: turn on bforget in jbd2_journal_forget()!!!

2097

* AKPM: turn on bforget in jbd2_journal_forget()!!!

2096

*/

2098

*/

2097

for (p = first; p < last; p++) {

2099

for (p = first; p < last; p++) {

2098

u32 nr = le32_to_cpu(*p);

2100

u32 nr = le32_to_cpu(*p);

2099

if (nr) {

2101

if (nr) {

2100

struct buffer_head *tbh;

2102

struct buffer_head *tbh;

2101

2103

2102

*p = 0;

2104

*p = 0;

2103

tbh = sb_find_get_block(inode->i_sb, nr);

2105

tbh = sb_find_get_block(inode->i_sb, nr);

2104

ext4_forget(handle, 0, inode, tbh, nr);

2106

ext4_forget(handle, 0, inode, tbh, nr);

2105

}

2107

}

2106

}

2108

}

2107

2109

2108

ext4_free_blocks(handle, inode, block_to_free, count, 0);

2110

ext4_free_blocks(handle, inode, block_to_free, count, 0);

2109

}

2111

}

2110

2112

2111

/**

2113

/**

2112

* ext4_free_data - free a list of data blocks

2114

* ext4_free_data - free a list of data blocks

2113

* @handle: handle for this transaction

2115

* @handle: handle for this transaction

2114

* @inode: inode we are dealing with

2116

* @inode: inode we are dealing with

2115

* @this_bh: indirect buffer_head which contains *@first and *@last

2117

* @this_bh: indirect buffer_head which contains *@first and *@last

2116

* @first: array of block numbers

2118

* @first: array of block numbers

2117

* @last: points immediately past the end of array

2119

* @last: points immediately past the end of array

2118

*

2120

*

2119

* We are freeing all blocks refered from that array (numbers are stored as

2121

* We are freeing all blocks refered from that array (numbers are stored as

2120

* little-endian 32-bit) and updating @inode->i_blocks appropriately.

2122

* little-endian 32-bit) and updating @inode->i_blocks appropriately.

2121

*

2123

*

2122

* We accumulate contiguous runs of blocks to free. Conveniently, if these

2124

* We accumulate contiguous runs of blocks to free. Conveniently, if these

2123

* blocks are contiguous then releasing them at one time will only affect one

2125

* blocks are contiguous then releasing them at one time will only affect one

2124

* or two bitmap blocks (+ group descriptor(s) and superblock) and we won't

2126

* or two bitmap blocks (+ group descriptor(s) and superblock) and we won't

2125

* actually use a lot of journal space.

2127

* actually use a lot of journal space.

2126

*

2128

*

2127

* @this_bh will be %NULL if @first and @last point into the inode's direct

2129

* @this_bh will be %NULL if @first and @last point into the inode's direct

2128

* block pointers.

2130

* block pointers.

2129

*/

2131

*/

2130

static void ext4_free_data(handle_t *handle, struct inode *inode,

2132

static void ext4_free_data(handle_t *handle, struct inode *inode,

2131

struct buffer_head *this_bh,

2133

struct buffer_head *this_bh,

2132

__le32 *first, __le32 *last)

2134

__le32 *first, __le32 *last)

2133

{

2135

{

2134

ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */

2136

ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */

2135

unsigned long count = 0; /* Number of blocks in the run */

2137

unsigned long count = 0; /* Number of blocks in the run */

2136

__le32 *block_to_free_p = NULL; /* Pointer into inode/ind

2138

__le32 *block_to_free_p = NULL; /* Pointer into inode/ind

2137

corresponding to

2139

corresponding to

2138

block_to_free */

2140

block_to_free */

2139

ext4_fsblk_t nr; /* Current block # */

2141

ext4_fsblk_t nr; /* Current block # */

2140

__le32 *p; /* Pointer into inode/ind

2142

__le32 *p; /* Pointer into inode/ind

2141

for current block */

2143

for current block */

2142

int err;

2144

int err;

2143

2145

2144

if (this_bh) { /* For indirect block */

2146

if (this_bh) { /* For indirect block */

2145

BUFFER_TRACE(this_bh, "get_write_access");

2147

BUFFER_TRACE(this_bh, "get_write_access");

2146

err = ext4_journal_get_write_access(handle, this_bh);

2148

err = ext4_journal_get_write_access(handle, this_bh);

2147

/* Important: if we can't update the indirect pointers

2149

/* Important: if we can't update the indirect pointers

2148

* to the blocks, we can't free them. */

2150

* to the blocks, we can't free them. */

2149

if (err)

2151

if (err)

2150

return;

2152

return;

2151

}

2153

}

2152

2154

2153

for (p = first; p < last; p++) {

2155

for (p = first; p < last; p++) {

2154

nr = le32_to_cpu(*p);

2156

nr = le32_to_cpu(*p);

2155

if (nr) {

2157

if (nr) {

2156

/* accumulate blocks to free if they're contiguous */

2158

/* accumulate blocks to free if they're contiguous */

2157

if (count == 0) {

2159

if (count == 0) {

2158

block_to_free = nr;

2160

block_to_free = nr;

2159

block_to_free_p = p;

2161

block_to_free_p = p;

2160

count = 1;

2162

count = 1;

2161

} else if (nr == block_to_free + count) {

2163

} else if (nr == block_to_free + count) {

2162

count++;

2164

count++;

2163

} else {

2165

} else {

2164

ext4_clear_blocks(handle, inode, this_bh,

2166

ext4_clear_blocks(handle, inode, this_bh,

2165

block_to_free,

2167

block_to_free,

2166

count, block_to_free_p, p);

2168

count, block_to_free_p, p);

2167

block_to_free = nr;

2169

block_to_free = nr;

2168

block_to_free_p = p;

2170

block_to_free_p = p;

2169

count = 1;

2171

count = 1;

2170

}

2172

}

2171

}

2173

}

2172

}

2174

}

2173

2175

2174

if (count > 0)

2176

if (count > 0)

2175

ext4_clear_blocks(handle, inode, this_bh, block_to_free,

2177

ext4_clear_blocks(handle, inode, this_bh, block_to_free,

2176

count, block_to_free_p, p);

2178

count, block_to_free_p, p);

2177

2179

2178

if (this_bh) {

2180

if (this_bh) {

2179

BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");

2181

BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");

2180

ext4_journal_dirty_metadata(handle, this_bh);

2182

ext4_journal_dirty_metadata(handle, this_bh);

2181

}

2183

}

2182

}

2184

}

2183

2185

2184

/**

2186

/**

2185

* ext4_free_branches - free an array of branches

2187

* ext4_free_branches - free an array of branches

2186

* @handle: JBD handle for this transaction

2188

* @handle: JBD handle for this transaction

2187

* @inode: inode we are dealing with

2189

* @inode: inode we are dealing with

2188

* @parent_bh: the buffer_head which contains *@first and *@last

2190

* @parent_bh: the buffer_head which contains *@first and *@last

2189

* @first: array of block numbers

2191

* @first: array of block numbers

2190

* @last: pointer immediately past the end of array

2192

* @last: pointer immediately past the end of array

2191

* @depth: depth of the branches to free

2193

* @depth: depth of the branches to free

2192

*

2194

*

2193

* We are freeing all blocks refered from these branches (numbers are

2195

* We are freeing all blocks refered from these branches (numbers are

2194

* stored as little-endian 32-bit) and updating @inode->i_blocks

2196

* stored as little-endian 32-bit) and updating @inode->i_blocks

2195

* appropriately.

2197

* appropriately.

2196

*/

2198

*/

2197

static void ext4_free_branches(handle_t *handle, struct inode *inode,

2199

static void ext4_free_branches(handle_t *handle, struct inode *inode,

2198

struct buffer_head *parent_bh,

2200

struct buffer_head *parent_bh,

2199

__le32 *first, __le32 *last, int depth)

2201

__le32 *first, __le32 *last, int depth)

2200

{

2202

{

2201

ext4_fsblk_t nr;

2203

ext4_fsblk_t nr;

2202

__le32 *p;

2204

__le32 *p;

2203

2205

2204

if (is_handle_aborted(handle))

2206

if (is_handle_aborted(handle))

2205

return;

2207

return;

2206

2208

2207

if (depth--) {

2209

if (depth--) {

2208

struct buffer_head *bh;

2210

struct buffer_head *bh;

2209

int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);

2211

int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);

2210

p = last;

2212

p = last;

2211

while (--p >= first) {

2213

while (--p >= first) {

2212

nr = le32_to_cpu(*p);

2214

nr = le32_to_cpu(*p);

2213

if (!nr)

2215

if (!nr)

2214

continue; /* A hole */

2216

continue; /* A hole */

2215

2217

2216

/* Go read the buffer for the next level down */

2218

/* Go read the buffer for the next level down */

2217

bh = sb_bread(inode->i_sb, nr);

2219

bh = sb_bread(inode->i_sb, nr);

2218

2220

2219

/*

2221

/*

2220

* A read failure? Report error and clear slot

2222

* A read failure? Report error and clear slot

2221

* (should be rare).

2223

* (should be rare).

2222

*/

2224

*/

2223

if (!bh) {

2225

if (!bh) {

2224

ext4_error(inode->i_sb, "ext4_free_branches",

2226

ext4_error(inode->i_sb, "ext4_free_branches",

2225

"Read failure, inode=%lu, block=%llu",

2227

"Read failure, inode=%lu, block=%llu",

2226

inode->i_ino, nr);

2228

inode->i_ino, nr);

2227

continue;

2229

continue;

2228

}

2230

}

2229

2231

2230

/* This zaps the entire block. Bottom up. */

2232

/* This zaps the entire block. Bottom up. */

2231

BUFFER_TRACE(bh, "free child branches");

2233

BUFFER_TRACE(bh, "free child branches");

2232

ext4_free_branches(handle, inode, bh,

2234

ext4_free_branches(handle, inode, bh,

2233

(__le32*)bh->b_data,

2235

(__le32*)bh->b_data,

2234

(__le32*)bh->b_data + addr_per_block,

2236

(__le32*)bh->b_data + addr_per_block,

2235

depth);

2237

depth);

2236

2238

2237

/*

2239

/*

2238

* We've probably journalled the indirect block several

2240

* We've probably journalled the indirect block several

2239

* times during the truncate. But it's no longer

2241

* times during the truncate. But it's no longer

2240

* needed and we now drop it from the transaction via

2242

* needed and we now drop it from the transaction via

2241

* jbd2_journal_revoke().

2243

* jbd2_journal_revoke().

2242

*

2244

*

2243

* That's easy if it's exclusively part of this

2245

* That's easy if it's exclusively part of this

2244

* transaction. But if it's part of the committing

2246

* transaction. But if it's part of the committing

2245

* transaction then jbd2_journal_forget() will simply

2247

* transaction then jbd2_journal_forget() will simply

2246

* brelse() it. That means that if the underlying

2248

* brelse() it. That means that if the underlying

2247

* block is reallocated in ext4_get_block(),

2249

* block is reallocated in ext4_get_block(),

2248

* unmap_underlying_metadata() will find this block

2250

* unmap_underlying_metadata() will find this block

2249

* and will try to get rid of it. damn, damn.

2251

* and will try to get rid of it. damn, damn.

2250

*

2252

*

2251

* If this block has already been committed to the

2253

* If this block has already been committed to the

2252

* journal, a revoke record will be written. And

2254

* journal, a revoke record will be written. And

2253

* revoke records must be emitted *before* clearing

2255

* revoke records must be emitted *before* clearing

2254

* this block's bit in the bitmaps.

2256

* this block's bit in the bitmaps.

2255

*/

2257

*/

2256

ext4_forget(handle, 1, inode, bh, bh->b_blocknr);

2258

ext4_forget(handle, 1, inode, bh, bh->b_blocknr);

2257

2259

2258

/*

2260

/*

2259

* Everything below this this pointer has been

2261

* Everything below this this pointer has been

2260

* released. Now let this top-of-subtree go.

2262

* released. Now let this top-of-subtree go.

2261

*

2263

*

2262

* We want the freeing of this indirect block to be

2264

* We want the freeing of this indirect block to be

2263

* atomic in the journal with the updating of the

2265

* atomic in the journal with the updating of the

2264

* bitmap block which owns it. So make some room in

2266

* bitmap block which owns it. So make some room in

2265

* the journal.

2267

* the journal.

2266

*

2268

*

2267

* We zero the parent pointer *after* freeing its

2269

* We zero the parent pointer *after* freeing its

2268

* pointee in the bitmaps, so if extend_transaction()

2270

* pointee in the bitmaps, so if extend_transaction()

2269

* for some reason fails to put the bitmap changes and

2271

* for some reason fails to put the bitmap changes and

2270

* the release into the same transaction, recovery

2272

* the release into the same transaction, recovery

2271

* will merely complain about releasing a free block,

2273

* will merely complain about releasing a free block,

2272

* rather than leaking blocks.

2274

* rather than leaking blocks.

2273

*/

2275

*/

2274

if (is_handle_aborted(handle))

2276

if (is_handle_aborted(handle))

2275

return;

2277

return;

2276

if (try_to_extend_transaction(handle, inode)) {

2278

if (try_to_extend_transaction(handle, inode)) {

2277

ext4_mark_inode_dirty(handle, inode);

2279

ext4_mark_inode_dirty(handle, inode);

2278

ext4_journal_test_restart(handle, inode);

2280

ext4_journal_test_restart(handle, inode);

2279

}

2281

}

2280

2282

2281

ext4_free_blocks(handle, inode, nr, 1, 1);

2283

ext4_free_blocks(handle, inode, nr, 1, 1);

2282

2284

2283

if (parent_bh) {

2285

if (parent_bh) {

2284

/*

2286

/*

2285

* The block which we have just freed is

2287

* The block which we have just freed is

2286

* pointed to by an indirect block: journal it

2288

* pointed to by an indirect block: journal it

2287

*/

2289

*/

2288

BUFFER_TRACE(parent_bh, "get_write_access");

2290

BUFFER_TRACE(parent_bh, "get_write_access");

2289

if (!ext4_journal_get_write_access(handle,

2291

if (!ext4_journal_get_write_access(handle,

2290

parent_bh)){

2292

parent_bh)){

2291

*p = 0;

2293

*p = 0;

2292

BUFFER_TRACE(parent_bh,

2294

BUFFER_TRACE(parent_bh,

2293

"call ext4_journal_dirty_metadata");

2295

"call ext4_journal_dirty_metadata");

2294

ext4_journal_dirty_metadata(handle,

2296

ext4_journal_dirty_metadata(handle,

2295

parent_bh);

2297

parent_bh);

2296

}

2298

}

2297

}

2299

}

2298

}

2300

}

2299

} else {

2301

} else {

2300

/* We have reached the bottom of the tree. */

2302

/* We have reached the bottom of the tree. */

2301

BUFFER_TRACE(parent_bh, "free data blocks");

2303

BUFFER_TRACE(parent_bh, "free data blocks");

2302

ext4_free_data(handle, inode, parent_bh, first, last);

2304

ext4_free_data(handle, inode, parent_bh, first, last);

2303

}

2305

}

2304

}

2306

}

2305

2307

2306

/*

2308

/*

2307

* ext4_truncate()

2309

* ext4_truncate()

2308

*

2310

*

2309

* We block out ext4_get_block() block instantiations across the entire

2311

* We block out ext4_get_block() block instantiations across the entire

2310

* transaction, and VFS/VM ensures that ext4_truncate() cannot run

2312

* transaction, and VFS/VM ensures that ext4_truncate() cannot run

2311

* simultaneously on behalf of the same inode.

2313

* simultaneously on behalf of the same inode.

2312

*

2314

*

2313

* As we work through the truncate and commit bits of it to the journal there

2315

* As we work through the truncate and commit bits of it to the journal there

2314

* is one core, guiding principle: the file's tree must always be consistent on

2316

* is one core, guiding principle: the file's tree must always be consistent on

2315

* disk. We must be able to restart the truncate after a crash.

2317

* disk. We must be able to restart the truncate after a crash.

2316

*

2318

*

2317

* The file's tree may be transiently inconsistent in memory (although it

2319

* The file's tree may be transiently inconsistent in memory (although it

2318

* probably isn't), but whenever we close off and commit a journal transaction,

2320

* probably isn't), but whenever we close off and commit a journal transaction,

2319

* the contents of (the filesystem + the journal) must be consistent and

2321

* the contents of (the filesystem + the journal) must be consistent and

2320

* restartable. It's pretty simple, really: bottom up, right to left (although

2322

* restartable. It's pretty simple, really: bottom up, right to left (although

2321

* left-to-right works OK too).

2323

* left-to-right works OK too).

2322

*

2324

*

2323

* Note that at recovery time, journal replay occurs *before* the restart of

2325

* Note that at recovery time, journal replay occurs *before* the restart of

2324

* truncate against the orphan inode list.

2326

* truncate against the orphan inode list.

2325

*

2327

*

2326

* The committed inode has the new, desired i_size (which is the same as

2328

* The committed inode has the new, desired i_size (which is the same as

2327

* i_disksize in this case). After a crash, ext4_orphan_cleanup() will see

2329

* i_disksize in this case). After a crash, ext4_orphan_cleanup() will see

2328

* that this inode's truncate did not complete and it will again call

2330

* that this inode's truncate did not complete and it will again call

2329

* ext4_truncate() to have another go. So there will be instantiated blocks

2331

* ext4_truncate() to have another go. So there will be instantiated blocks

2330

* to the right of the truncation point in a crashed ext4 filesystem. But

2332

* to the right of the truncation point in a crashed ext4 filesystem. But

2331

* that's fine - as long as they are linked from the inode, the post-crash

2333

* that's fine - as long as they are linked from the inode, the post-crash

2332

* ext4_truncate() run will find them and release them.

2334

* ext4_truncate() run will find them and release them.

2333

*/

2335

*/

2334

void ext4_truncate(struct inode *inode)

2336

void ext4_truncate(struct inode *inode)

2335

{

2337

{

2336

handle_t *handle;

2338

handle_t *handle;

2337

struct ext4_inode_info *ei = EXT4_I(inode);

2339

struct ext4_inode_info *ei = EXT4_I(inode);

2338

__le32 *i_data = ei->i_data;

2340

__le32 *i_data = ei->i_data;

2339

int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);

2341

int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);

2340

struct address_space *mapping = inode->i_mapping;

2342

struct address_space *mapping = inode->i_mapping;

2341

ext4_lblk_t offsets[4];

2343

ext4_lblk_t offsets[4];

2342

Indirect chain[4];

2344

Indirect chain[4];

2343

Indirect *partial;

2345

Indirect *partial;

2344

__le32 nr = 0;

2346

__le32 nr = 0;

2345

int n;

2347

int n;

2346

ext4_lblk_t last_block;

2348

ext4_lblk_t last_block;

2347

unsigned blocksize = inode->i_sb->s_blocksize;

2349

unsigned blocksize = inode->i_sb->s_blocksize;

2348

struct page *page;

2350

struct page *page;

2349

2351

2350

if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||

2352

if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||

2351

S_ISLNK(inode->i_mode)))

2353

S_ISLNK(inode->i_mode)))

2352

return;

2354

return;

2353

if (ext4_inode_is_fast_symlink(inode))

2355

if (ext4_inode_is_fast_symlink(inode))

2354

return;

2356

return;

2355

if (IS_APPEND(inode) || IS_IMMUTABLE(inode))

2357

if (IS_APPEND(inode) || IS_IMMUTABLE(inode))

2356

return;

2358

return;

2357

2359

2358

/*

2360

/*

2359

* We have to lock the EOF page here, because lock_page() nests

2361

* We have to lock the EOF page here, because lock_page() nests

2360

* outside jbd2_journal_start().

2362

* outside jbd2_journal_start().

2361

*/

2363

*/

2362

if ((inode->i_size & (blocksize - 1)) == 0) {

2364

if ((inode->i_size & (blocksize - 1)) == 0) {

2363

/* Block boundary? Nothing to do */

2365

/* Block boundary? Nothing to do */

2364

page = NULL;

2366

page = NULL;

2365

} else {

2367

} else {

2366

page = grab_cache_page(mapping,

2368

page = grab_cache_page(mapping,

2367

inode->i_size >> PAGE_CACHE_SHIFT);

2369

inode->i_size >> PAGE_CACHE_SHIFT);

2368

if (!page)

2370

if (!page)

2369

return;

2371

return;

2370

}

2372

}

2371

2373

2372

if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {

2374

if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {

2373

ext4_ext_truncate(inode, page);

2375

ext4_ext_truncate(inode, page);

2374

return;

2376

return;

2375

}

2377

}

2376

2378

2377

handle = start_transaction(inode);

2379

handle = start_transaction(inode);

2378

if (IS_ERR(handle)) {

2380

if (IS_ERR(handle)) {

2379

if (page) {

2381

if (page) {

2380

clear_highpage(page);

2382

clear_highpage(page);

2381

flush_dcache_page(page);

2383

flush_dcache_page(page);

2382

unlock_page(page);

2384

unlock_page(page);

2383

page_cache_release(page);

2385

page_cache_release(page);

2384

}

2386

}

2385

return; /* AKPM: return what? */

2387

return; /* AKPM: return what? */

2386

}

2388

}

2387

2389

2388

last_block = (inode->i_size + blocksize-1)

2390

last_block = (inode->i_size + blocksize-1)

2389

>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);

2391

>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);

2390

2392

2391

if (page)

2393

if (page)

2392

ext4_block_truncate_page(handle, page, mapping, inode->i_size);

2394

ext4_block_truncate_page(handle, page, mapping, inode->i_size);

2393

2395

2394

n = ext4_block_to_path(inode, last_block, offsets, NULL);

2396

n = ext4_block_to_path(inode, last_block, offsets, NULL);

2395

if (n == 0)

2397

if (n == 0)

2396

goto out_stop; /* error */

2398

goto out_stop; /* error */

2397

2399

2398

/*

2400

/*

2399

* OK. This truncate is going to happen. We add the inode to the

2401

* OK. This truncate is going to happen. We add the inode to the

2400

* orphan list, so that if this truncate spans multiple transactions,

2402

* orphan list, so that if this truncate spans multiple transactions,

2401

* and we crash, we will resume the truncate when the filesystem

2403

* and we crash, we will resume the truncate when the filesystem

2402

* recovers. It also marks the inode dirty, to catch the new size.

2404

* recovers. It also marks the inode dirty, to catch the new size.

2403

*

2405

*

2404

* Implication: the file must always be in a sane, consistent

2406

* Implication: the file must always be in a sane, consistent

2405

* truncatable state while each transaction commits.

2407

* truncatable state while each transaction commits.

2406

*/

2408

*/

2407

if (ext4_orphan_add(handle, inode))

2409

if (ext4_orphan_add(handle, inode))

2408

goto out_stop;

2410

goto out_stop;

2409

2411

2410

/*

2412

/*

2411

* The orphan list entry will now protect us from any crash which

2413

* The orphan list entry will now protect us from any crash which

2412

* occurs before the truncate completes, so it is now safe to propagate

2414

* occurs before the truncate completes, so it is now safe to propagate

2413

* the new, shorter inode size (held for now in i_size) into the

2415

* the new, shorter inode size (held for now in i_size) into the

2414

* on-disk inode. We do this via i_disksize, which is the value which

2416

* on-disk inode. We do this via i_disksize, which is the value which

2415

* ext4 *really* writes onto the disk inode.

2417

* ext4 *really* writes onto the disk inode.

2416

*/

2418

*/

2417

ei->i_disksize = inode->i_size;

2419

ei->i_disksize = inode->i_size;

2418

2420

2419

/*

2421

/*

2420

* From here we block out all ext4_get_block() callers who want to

2422

* From here we block out all ext4_get_block() callers who want to

2421

* modify the block allocation tree.

2423

* modify the block allocation tree.

2422

*/

2424

*/

2423

down_write(&ei->i_data_sem);

2425

down_write(&ei->i_data_sem);

2424

2426

2425

if (n == 1) { /* direct blocks */

2427

if (n == 1) { /* direct blocks */

2426

ext4_free_data(handle, inode, NULL, i_data+offsets[0],

2428

ext4_free_data(handle, inode, NULL, i_data+offsets[0],

2427

i_data + EXT4_NDIR_BLOCKS);

2429

i_data + EXT4_NDIR_BLOCKS);

2428

goto do_indirects;

2430

goto do_indirects;

2429

}

2431

}

2430

2432

2431

partial = ext4_find_shared(inode, n, offsets, chain, &nr);

2433

partial = ext4_find_shared(inode, n, offsets, chain, &nr);

2432

/* Kill the top of shared branch (not detached) */

2434

/* Kill the top of shared branch (not detached) */

2433

if (nr) {

2435

if (nr) {

2434

if (partial == chain) {

2436

if (partial == chain) {

2435

/* Shared branch grows from the inode */

2437

/* Shared branch grows from the inode */

2436

ext4_free_branches(handle, inode, NULL,

2438

ext4_free_branches(handle, inode, NULL,

2437

&nr, &nr+1, (chain+n-1) - partial);

2439

&nr, &nr+1, (chain+n-1) - partial);

2438

*partial->p = 0;

2440

*partial->p = 0;

2439

/*

2441

/*

2440

* We mark the inode dirty prior to restart,

2442

* We mark the inode dirty prior to restart,

2441

* and prior to stop. No need for it here.

2443

* and prior to stop. No need for it here.

2442

*/

2444

*/

2443

} else {

2445

} else {

2444

/* Shared branch grows from an indirect block */

2446

/* Shared branch grows from an indirect block */

2445

BUFFER_TRACE(partial->bh, "get_write_access");

2447

BUFFER_TRACE(partial->bh, "get_write_access");

2446

ext4_free_branches(handle, inode, partial->bh,

2448

ext4_free_branches(handle, inode, partial->bh,

2447

partial->p,

2449

partial->p,

2448

partial->p+1, (chain+n-1) - partial);

2450

partial->p+1, (chain+n-1) - partial);

2449

}

2451

}

2450

}

2452

}

2451

/* Clear the ends of indirect blocks on the shared branch */

2453

/* Clear the ends of indirect blocks on the shared branch */

2452

while (partial > chain) {

2454

while (partial > chain) {

2453

ext4_free_branches(handle, inode, partial->bh, partial->p + 1,

2455

ext4_free_branches(handle, inode, partial->bh, partial->p + 1,

2454

(__le32*)partial->bh->b_data+addr_per_block,

2456

(__le32*)partial->bh->b_data+addr_per_block,

2455

(chain+n-1) - partial);

2457

(chain+n-1) - partial);

2456

BUFFER_TRACE(partial->bh, "call brelse");

2458

BUFFER_TRACE(partial->bh, "call brelse");

2457

brelse (partial->bh);

2459

brelse (partial->bh);

2458

partial--;

2460

partial--;

2459

}

2461

}

2460

do_indirects:

2462

do_indirects:

2461

/* Kill the remaining (whole) subtrees */

2463

/* Kill the remaining (whole) subtrees */

2462

switch (offsets[0]) {

2464

switch (offsets[0]) {

2463

default:

2465

default:

2464

nr = i_data[EXT4_IND_BLOCK];

2466

nr = i_data[EXT4_IND_BLOCK];

2465

if (nr) {

2467

if (nr) {

2466

ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);

2468

ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);

2467

i_data[EXT4_IND_BLOCK] = 0;

2469

i_data[EXT4_IND_BLOCK] = 0;

2468

}

2470

}

2469

case EXT4_IND_BLOCK:

2471

case EXT4_IND_BLOCK:

2470

nr = i_data[EXT4_DIND_BLOCK];

2472

nr = i_data[EXT4_DIND_BLOCK];

2471

if (nr) {

2473

if (nr) {

2472

ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);

2474

ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);

2473

i_data[EXT4_DIND_BLOCK] = 0;

2475

i_data[EXT4_DIND_BLOCK] = 0;

2474

}

2476

}

2475

case EXT4_DIND_BLOCK:

2477

case EXT4_DIND_BLOCK:

2476

nr = i_data[EXT4_TIND_BLOCK];

2478

nr = i_data[EXT4_TIND_BLOCK];

2477

if (nr) {

2479

if (nr) {

2478

ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);

2480

ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);

2479

i_data[EXT4_TIND_BLOCK] = 0;

2481

i_data[EXT4_TIND_BLOCK] = 0;

2480

}

2482

}

2481

case EXT4_TIND_BLOCK:

2483

case EXT4_TIND_BLOCK:

2482

;

2484

;

2483

}

2485

}

2484

2486

2485

ext4_discard_reservation(inode);

2487

ext4_discard_reservation(inode);

2486

2488

2487

up_write(&ei->i_data_sem);

2489

up_write(&ei->i_data_sem);

2488

inode->i_mtime = inode->i_ctime = ext4_current_time(inode);

2490

inode->i_mtime = inode->i_ctime = ext4_current_time(inode);

2489

ext4_mark_inode_dirty(handle, inode);

2491

ext4_mark_inode_dirty(handle, inode);

2490

2492

2491

/*

2493

/*

2492

* In a multi-transaction truncate, we only make the final transaction

2494

* In a multi-transaction truncate, we only make the final transaction

2493

* synchronous

2495

* synchronous

2494

*/

2496

*/

2495

if (IS_SYNC(inode))

2497

if (IS_SYNC(inode))

2496

handle->h_sync = 1;

2498

handle->h_sync = 1;

2497

out_stop:

2499

out_stop:

2498

/*

2500

/*

2499

* If this was a simple ftruncate(), and the file will remain alive

2501

* If this was a simple ftruncate(), and the file will remain alive

2500

* then we need to clear up the orphan record which we created above.

2502

* then we need to clear up the orphan record which we created above.

2501

* However, if this was a real unlink then we were called by

2503

* However, if this was a real unlink then we were called by

2502

* ext4_delete_inode(), and we allow that function to clean up the

2504

* ext4_delete_inode(), and we allow that function to clean up the

2503

* orphan info for us.

2505

* orphan info for us.

2504

*/

2506

*/

2505

if (inode->i_nlink)

2507

if (inode->i_nlink)

2506

ext4_orphan_del(handle, inode);

2508

ext4_orphan_del(handle, inode);

2507

2509

2508

ext4_journal_stop(handle);

2510

ext4_journal_stop(handle);

2509

}

2511

}

2510

2512

2511

static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,

2513

static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,

2512

unsigned long ino, struct ext4_iloc *iloc)

2514

unsigned long ino, struct ext4_iloc *iloc)

2513

{

2515

{

2514

ext4_group_t block_group;

2516

ext4_group_t block_group;

2515

unsigned long offset;

2517

unsigned long offset;

2516

ext4_fsblk_t block;

2518

ext4_fsblk_t block;

2517

struct ext4_group_desc *gdp;

2519

struct ext4_group_desc *gdp;

2518

2520

2519

if (!ext4_valid_inum(sb, ino)) {

2521

if (!ext4_valid_inum(sb, ino)) {

2520

/*

2522

/*

2521

* This error is already checked for in namei.c unless we are

2523

* This error is already checked for in namei.c unless we are

2522

* looking at an NFS filehandle, in which case no error

2524

* looking at an NFS filehandle, in which case no error

2523

* report is needed

2525

* report is needed

2524

*/

2526

*/

2525

return 0;

2527

return 0;

2526

}

2528

}

2527

2529

2528

block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);

2530

block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);

2529

gdp = ext4_get_group_desc(sb, block_group, NULL);

2531

gdp = ext4_get_group_desc(sb, block_group, NULL);

2530

if (!gdp)

2532

if (!gdp)

2531

return 0;

2533

return 0;

2532

2534

2533

/*

2535

/*

2534

* Figure out the offset within the block group inode table

2536

* Figure out the offset within the block group inode table

2535

*/

2537

*/

2536

offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *

2538

offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *

2537

EXT4_INODE_SIZE(sb);

2539

EXT4_INODE_SIZE(sb);

2538

block = ext4_inode_table(sb, gdp) +

2540

block = ext4_inode_table(sb, gdp) +

2539

(offset >> EXT4_BLOCK_SIZE_BITS(sb));

2541

(offset >> EXT4_BLOCK_SIZE_BITS(sb));

2540

2542

2541

iloc->block_group = block_group;

2543

iloc->block_group = block_group;

2542

iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);

2544

iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);

2543

return block;

2545

return block;

2544

}

2546

}

2545

2547

2546

/*

2548

/*

2547

* ext4_get_inode_loc returns with an extra refcount against the inode's

2549

* ext4_get_inode_loc returns with an extra refcount against the inode's

2548

* underlying buffer_head on success. If 'in_mem' is true, we have all

2550

* underlying buffer_head on success. If 'in_mem' is true, we have all

2549

* data in memory that is needed to recreate the on-disk version of this

2551

* data in memory that is needed to recreate the on-disk version of this

2550

* inode.

2552

* inode.

2551

*/

2553

*/

2552

static int __ext4_get_inode_loc(struct inode *inode,

2554

static int __ext4_get_inode_loc(struct inode *inode,

2553

struct ext4_iloc *iloc, int in_mem)

2555

struct ext4_iloc *iloc, int in_mem)

2554

{

2556

{

2555

ext4_fsblk_t block;

2557

ext4_fsblk_t block;

2556

struct buffer_head *bh;

2558

struct buffer_head *bh;

2557

2559

2558

block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);

2560

block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);

2559

if (!block)

2561

if (!block)

2560

return -EIO;

2562

return -EIO;

2561

2563

2562

bh = sb_getblk(inode->i_sb, block);

2564

bh = sb_getblk(inode->i_sb, block);

2563

if (!bh) {

2565

if (!bh) {

2564

ext4_error (inode->i_sb, "ext4_get_inode_loc",

2566

ext4_error (inode->i_sb, "ext4_get_inode_loc",

2565

"unable to read inode block - "

2567

"unable to read inode block - "

2566

"inode=%lu, block=%llu",

2568

"inode=%lu, block=%llu",

2567

inode->i_ino, block);

2569

inode->i_ino, block);

2568

return -EIO;

2570

return -EIO;

2569

}

2571

}

2570

if (!buffer_uptodate(bh)) {

2572

if (!buffer_uptodate(bh)) {

2571

lock_buffer(bh);

2573

lock_buffer(bh);

2572

if (buffer_uptodate(bh)) {

2574

if (buffer_uptodate(bh)) {

2573

/* someone brought it uptodate while we waited */

2575

/* someone brought it uptodate while we waited */

2574

unlock_buffer(bh);

2576

unlock_buffer(bh);

2575

goto has_buffer;

2577

goto has_buffer;

2576

}

2578

}

2577

2579

2578

/*

2580

/*

2579

* If we have all information of the inode in memory and this

2581

* If we have all information of the inode in memory and this

2580

* is the only valid inode in the block, we need not read the

2582

* is the only valid inode in the block, we need not read the

2581

* block.

2583

* block.

2582

*/

2584

*/

2583

if (in_mem) {

2585

if (in_mem) {

2584

struct buffer_head *bitmap_bh;

2586

struct buffer_head *bitmap_bh;

2585

struct ext4_group_desc *desc;

2587

struct ext4_group_desc *desc;

2586

int inodes_per_buffer;

2588

int inodes_per_buffer;

2587

int inode_offset, i;

2589

int inode_offset, i;

2588

ext4_group_t block_group;

2590

ext4_group_t block_group;

2589

int start;

2591

int start;

2590

2592

2591

block_group = (inode->i_ino - 1) /

2593

block_group = (inode->i_ino - 1) /

2592

EXT4_INODES_PER_GROUP(inode->i_sb);

2594

EXT4_INODES_PER_GROUP(inode->i_sb);

2593

inodes_per_buffer = bh->b_size /

2595

inodes_per_buffer = bh->b_size /

2594

EXT4_INODE_SIZE(inode->i_sb);

2596

EXT4_INODE_SIZE(inode->i_sb);

2595

inode_offset = ((inode->i_ino - 1) %

2597

inode_offset = ((inode->i_ino - 1) %

2596

EXT4_INODES_PER_GROUP(inode->i_sb));

2598

EXT4_INODES_PER_GROUP(inode->i_sb));

2597

start = inode_offset & ~(inodes_per_buffer - 1);

2599

start = inode_offset & ~(inodes_per_buffer - 1);

2598

2600

2599

/* Is the inode bitmap in cache? */

2601

/* Is the inode bitmap in cache? */

2600

desc = ext4_get_group_desc(inode->i_sb,

2602

desc = ext4_get_group_desc(inode->i_sb,

2601

block_group, NULL);

2603

block_group, NULL);

2602

if (!desc)

2604

if (!desc)

2603

goto make_io;

2605

goto make_io;

2604

2606

2605

bitmap_bh = sb_getblk(inode->i_sb,

2607

bitmap_bh = sb_getblk(inode->i_sb,

2606

ext4_inode_bitmap(inode->i_sb, desc));

2608

ext4_inode_bitmap(inode->i_sb, desc));

2607

if (!bitmap_bh)

2609

if (!bitmap_bh)

2608

goto make_io;

2610

goto make_io;

2609

2611

2610

/*

2612

/*

2611

* If the inode bitmap isn't in cache then the

2613

* If the inode bitmap isn't in cache then the

2612

* optimisation may end up performing two reads instead

2614

* optimisation may end up performing two reads instead

2613

* of one, so skip it.

2615

* of one, so skip it.

2614

*/

2616

*/

2615

if (!buffer_uptodate(bitmap_bh)) {

2617

if (!buffer_uptodate(bitmap_bh)) {

2616

brelse(bitmap_bh);

2618

brelse(bitmap_bh);

2617

goto make_io;

2619

goto make_io;

2618

}

2620

}

2619

for (i = start; i < start + inodes_per_buffer; i++) {

2621

for (i = start; i < start + inodes_per_buffer; i++) {

2620

if (i == inode_offset)

2622

if (i == inode_offset)

2621

continue;

2623

continue;

2622

if (ext4_test_bit(i, bitmap_bh->b_data))

2624

if (ext4_test_bit(i, bitmap_bh->b_data))

2623

break;

2625

break;

2624

}

2626

}

2625

brelse(bitmap_bh);

2627

brelse(bitmap_bh);

2626

if (i == start + inodes_per_buffer) {

2628

if (i == start + inodes_per_buffer) {

2627

/* all other inodes are free, so skip I/O */

2629

/* all other inodes are free, so skip I/O */

2628

memset(bh->b_data, 0, bh->b_size);

2630

memset(bh->b_data, 0, bh->b_size);

2629

set_buffer_uptodate(bh);

2631

set_buffer_uptodate(bh);

2630

unlock_buffer(bh);

2632

unlock_buffer(bh);

2631

goto has_buffer;

2633

goto has_buffer;

2632

}

2634

}

2633

}

2635

}

2634

2636

2635

make_io:

2637

make_io:

2636

/*

2638

/*

2637

* There are other valid inodes in the buffer, this inode

2639

* There are other valid inodes in the buffer, this inode

2638

* has in-inode xattrs, or we don't have this inode in memory.

2640

* has in-inode xattrs, or we don't have this inode in memory.

2639

* Read the block from disk.

2641

* Read the block from disk.

2640

*/

2642

*/

2641

get_bh(bh);

2643

get_bh(bh);

2642

bh->b_end_io = end_buffer_read_sync;

2644

bh->b_end_io = end_buffer_read_sync;

2643

submit_bh(READ_META, bh);

2645

submit_bh(READ_META, bh);

2644

wait_on_buffer(bh);

2646

wait_on_buffer(bh);

2645

if (!buffer_uptodate(bh)) {

2647

if (!buffer_uptodate(bh)) {

2646

ext4_error(inode->i_sb, "ext4_get_inode_loc",

2648

ext4_error(inode->i_sb, "ext4_get_inode_loc",

2647

"unable to read inode block - "

2649

"unable to read inode block - "

2648

"inode=%lu, block=%llu",

2650

"inode=%lu, block=%llu",

2649

inode->i_ino, block);

2651

inode->i_ino, block);

2650

brelse(bh);

2652

brelse(bh);

2651

return -EIO;

2653

return -EIO;

2652

}

2654

}

2653

}

2655

}

2654

has_buffer:

2656

has_buffer:

2655

iloc->bh = bh;

2657

iloc->bh = bh;

2656

return 0;

2658

return 0;

2657

}

2659

}

2658

2660

2659

int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)

2661

int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)

2660

{

2662

{

2661

/* We have all inode data except xattrs in memory here. */

2663

/* We have all inode data except xattrs in memory here. */

2662

return __ext4_get_inode_loc(inode, iloc,

2664

return __ext4_get_inode_loc(inode, iloc,

2663

!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));

2665

!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));

2664

}

2666

}

2665

2667

2666

void ext4_set_inode_flags(struct inode *inode)

2668

void ext4_set_inode_flags(struct inode *inode)

2667

{

2669

{

2668

unsigned int flags = EXT4_I(inode)->i_flags;

2670

unsigned int flags = EXT4_I(inode)->i_flags;

2669

2671

2670

inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);

2672

inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);

2671

if (flags & EXT4_SYNC_FL)

2673

if (flags & EXT4_SYNC_FL)

2672

inode->i_flags |= S_SYNC;

2674

inode->i_flags |= S_SYNC;

2673

if (flags & EXT4_APPEND_FL)

2675

if (flags & EXT4_APPEND_FL)

2674

inode->i_flags |= S_APPEND;

2676

inode->i_flags |= S_APPEND;

2675

if (flags & EXT4_IMMUTABLE_FL)

2677

if (flags & EXT4_IMMUTABLE_FL)

2676

inode->i_flags |= S_IMMUTABLE;

2678

inode->i_flags |= S_IMMUTABLE;

2677

if (flags & EXT4_NOATIME_FL)

2679

if (flags & EXT4_NOATIME_FL)

2678

inode->i_flags |= S_NOATIME;

2680

inode->i_flags |= S_NOATIME;

2679

if (flags & EXT4_DIRSYNC_FL)

2681

if (flags & EXT4_DIRSYNC_FL)

2680

inode->i_flags |= S_DIRSYNC;

2682

inode->i_flags |= S_DIRSYNC;

2681

}

2683

}

2682

2684

2683

/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */

2685

/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */

2684

void ext4_get_inode_flags(struct ext4_inode_info *ei)

2686

void ext4_get_inode_flags(struct ext4_inode_info *ei)

2685

{

2687

{

2686

unsigned int flags = ei->vfs_inode.i_flags;

2688

unsigned int flags = ei->vfs_inode.i_flags;

2687

2689

2688

ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL|

2690

ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL|

2689

EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL);

2691

EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL);

2690

if (flags & S_SYNC)

2692

if (flags & S_SYNC)

2691

ei->i_flags |= EXT4_SYNC_FL;

2693

ei->i_flags |= EXT4_SYNC_FL;

2692

if (flags & S_APPEND)

2694

if (flags & S_APPEND)

2693

ei->i_flags |= EXT4_APPEND_FL;

2695

ei->i_flags |= EXT4_APPEND_FL;

2694

if (flags & S_IMMUTABLE)

2696

if (flags & S_IMMUTABLE)

2695

ei->i_flags |= EXT4_IMMUTABLE_FL;

2697

ei->i_flags |= EXT4_IMMUTABLE_FL;

2696

if (flags & S_NOATIME)

2698

if (flags & S_NOATIME)

2697

ei->i_flags |= EXT4_NOATIME_FL;

2699

ei->i_flags |= EXT4_NOATIME_FL;

2698

if (flags & S_DIRSYNC)

2700

if (flags & S_DIRSYNC)

2699

ei->i_flags |= EXT4_DIRSYNC_FL;

2701

ei->i_flags |= EXT4_DIRSYNC_FL;

2700

}

2702

}

2701

static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,

2703

static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,

2702

struct ext4_inode_info *ei)

2704

struct ext4_inode_info *ei)

2703

{

2705

{

2704

blkcnt_t i_blocks ;

2706

blkcnt_t i_blocks ;

2705

struct inode *inode = &(ei->vfs_inode);

2707

struct inode *inode = &(ei->vfs_inode);

2706

struct super_block *sb = inode->i_sb;

2708

struct super_block *sb = inode->i_sb;

2707

2709

2708

if (EXT4_HAS_RO_COMPAT_FEATURE(sb,

2710

if (EXT4_HAS_RO_COMPAT_FEATURE(sb,

2709

EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {

2711

EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {

2710

/* we are using combined 48 bit field */

2712

/* we are using combined 48 bit field */

2711

i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |

2713

i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |

2712

le32_to_cpu(raw_inode->i_blocks_lo);

2714

le32_to_cpu(raw_inode->i_blocks_lo);

2713

if (ei->i_flags & EXT4_HUGE_FILE_FL) {

2715

if (ei->i_flags & EXT4_HUGE_FILE_FL) {

2714

/* i_blocks represent file system block size */

2716

/* i_blocks represent file system block size */

2715

return i_blocks << (inode->i_blkbits - 9);

2717

return i_blocks << (inode->i_blkbits - 9);

2716

} else {

2718

} else {

2717

return i_blocks;

2719

return i_blocks;

2718

}

2720

}

2719

} else {

2721

} else {

2720

return le32_to_cpu(raw_inode->i_blocks_lo);

2722

return le32_to_cpu(raw_inode->i_blocks_lo);

2721

}

2723

}

2722

}

2724

}

2723

2725

2724

struct inode *ext4_iget(struct super_block *sb, unsigned long ino)

2726

struct inode *ext4_iget(struct super_block *sb, unsigned long ino)

2725

{

2727

{

2726

struct ext4_iloc iloc;

2728

struct ext4_iloc iloc;

2727

struct ext4_inode *raw_inode;

2729

struct ext4_inode *raw_inode;

2728

struct ext4_inode_info *ei;

2730

struct ext4_inode_info *ei;

2729

struct buffer_head *bh;

2731

struct buffer_head *bh;

2730

struct inode *inode;

2732

struct inode *inode;

2731

long ret;

2733

long ret;

2732

int block;

2734

int block;

2733

2735

2734

inode = iget_locked(sb, ino);

2736

inode = iget_locked(sb, ino);

2735

if (!inode)

2737

if (!inode)

2736

return ERR_PTR(-ENOMEM);

2738

return ERR_PTR(-ENOMEM);

2737

if (!(inode->i_state & I_NEW))

2739

if (!(inode->i_state & I_NEW))

2738

return inode;

2740

return inode;

2739

2741

2740

ei = EXT4_I(inode);

2742

ei = EXT4_I(inode);

2741

#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL

2743

#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL

2742

ei->i_acl = EXT4_ACL_NOT_CACHED;

2744

ei->i_acl = EXT4_ACL_NOT_CACHED;

2743

ei->i_default_acl = EXT4_ACL_NOT_CACHED;

2745

ei->i_default_acl = EXT4_ACL_NOT_CACHED;

2744

#endif

2746

#endif

2745

ei->i_block_alloc_info = NULL;

2747

ei->i_block_alloc_info = NULL;

2746

2748

2747

ret = __ext4_get_inode_loc(inode, &iloc, 0);

2749

ret = __ext4_get_inode_loc(inode, &iloc, 0);

2748

if (ret < 0)

2750

if (ret < 0)

2749

goto bad_inode;

2751

goto bad_inode;

2750

bh = iloc.bh;

2752

bh = iloc.bh;

2751

raw_inode = ext4_raw_inode(&iloc);

2753

raw_inode = ext4_raw_inode(&iloc);

2752

inode->i_mode = le16_to_cpu(raw_inode->i_mode);

2754

inode->i_mode = le16_to_cpu(raw_inode->i_mode);

2753

inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);

2755

inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);

2754

inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);

2756

inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);

2755

if(!(test_opt (inode->i_sb, NO_UID32))) {

2757

if(!(test_opt (inode->i_sb, NO_UID32))) {

2756

inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;

2758

inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;

2757

inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;

2759

inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;

2758

}

2760

}

2759

inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);

2761

inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);

2760

2762

2761

ei->i_state = 0;

2763

ei->i_state = 0;

2762

ei->i_dir_start_lookup = 0;

2764

ei->i_dir_start_lookup = 0;

2763

ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);

2765

ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);

2764

/* We now have enough fields to check if the inode was active or not.

2766

/* We now have enough fields to check if the inode was active or not.

2765

* This is needed because nfsd might try to access dead inodes

2767

* This is needed because nfsd might try to access dead inodes

2766

* the test is that same one that e2fsck uses

2768

* the test is that same one that e2fsck uses

2767

* NeilBrown 1999oct15

2769

* NeilBrown 1999oct15

2768

*/

2770

*/

2769

if (inode->i_nlink == 0) {

2771

if (inode->i_nlink == 0) {

2770

if (inode->i_mode == 0 ||

2772

if (inode->i_mode == 0 ||

2771

!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {

2773

!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {

2772

/* this inode is deleted */

2774

/* this inode is deleted */

2773

brelse (bh);

2775

brelse (bh);

2774

ret = -ESTALE;

2776

ret = -ESTALE;

2775

goto bad_inode;

2777

goto bad_inode;

2776

}

2778

}

2777

/* The only unlinked inodes we let through here have

2779

/* The only unlinked inodes we let through here have

2778

* valid i_mode and are being read by the orphan

2780

* valid i_mode and are being read by the orphan

2779

* recovery code: that's fine, we're about to complete

2781

* recovery code: that's fine, we're about to complete

2780

* the process of deleting those. */

2782

* the process of deleting those. */

2781

}

2783

}

2782

ei->i_flags = le32_to_cpu(raw_inode->i_flags);

2784

ei->i_flags = le32_to_cpu(raw_inode->i_flags);

2783

inode->i_blocks = ext4_inode_blocks(raw_inode, ei);

2785

inode->i_blocks = ext4_inode_blocks(raw_inode, ei);

2784

ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);

2786

ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);

2785

if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=

2787

if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=

2786

cpu_to_le32(EXT4_OS_HURD)) {

2788

cpu_to_le32(EXT4_OS_HURD)) {

2787

ei->i_file_acl |=

2789

ei->i_file_acl |=

2788

((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;

2790

((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;

2789

}

2791

}

2790

inode->i_size = ext4_isize(raw_inode);

2792

inode->i_size = ext4_isize(raw_inode);

2791

ei->i_disksize = inode->i_size;

2793

ei->i_disksize = inode->i_size;

2792

inode->i_generation = le32_to_cpu(raw_inode->i_generation);

2794

inode->i_generation = le32_to_cpu(raw_inode->i_generation);

2793

ei->i_block_group = iloc.block_group;

2795

ei->i_block_group = iloc.block_group;

2794

/*

2796

/*

2795

* NOTE! The in-memory inode i_data array is in little-endian order

2797

* NOTE! The in-memory inode i_data array is in little-endian order

2796

* even on big-endian machines: we do NOT byteswap the block numbers!

2798

* even on big-endian machines: we do NOT byteswap the block numbers!

2797

*/

2799

*/

2798

for (block = 0; block < EXT4_N_BLOCKS; block++)

2800

for (block = 0; block < EXT4_N_BLOCKS; block++)

2799

ei->i_data[block] = raw_inode->i_block[block];

2801

ei->i_data[block] = raw_inode->i_block[block];

2800

INIT_LIST_HEAD(&ei->i_orphan);

2802

INIT_LIST_HEAD(&ei->i_orphan);

2801

2803

2802

if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {

2804

if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {

2803

ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);

2805

ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);

2804

if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >

2806

if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >

2805

EXT4_INODE_SIZE(inode->i_sb)) {

2807

EXT4_INODE_SIZE(inode->i_sb)) {

2806

brelse (bh);

2808

brelse (bh);

2807

ret = -EIO;

2809

ret = -EIO;

2808

goto bad_inode;

2810

goto bad_inode;

2809

}

2811

}

2810

if (ei->i_extra_isize == 0) {

2812

if (ei->i_extra_isize == 0) {

2811

/* The extra space is currently unused. Use it. */

2813

/* The extra space is currently unused. Use it. */

2812

ei->i_extra_isize = sizeof(struct ext4_inode) -

2814

ei->i_extra_isize = sizeof(struct ext4_inode) -

2813

EXT4_GOOD_OLD_INODE_SIZE;

2815

EXT4_GOOD_OLD_INODE_SIZE;

2814

} else {

2816

} else {

2815

__le32 *magic = (void *)raw_inode +

2817

__le32 *magic = (void *)raw_inode +

2816

EXT4_GOOD_OLD_INODE_SIZE +

2818

EXT4_GOOD_OLD_INODE_SIZE +

2817

ei->i_extra_isize;

2819

ei->i_extra_isize;

2818

if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))

2820

if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))

2819

ei->i_state |= EXT4_STATE_XATTR;

2821

ei->i_state |= EXT4_STATE_XATTR;

2820

}

2822

}

2821

} else

2823

} else

2822

ei->i_extra_isize = 0;

2824

ei->i_extra_isize = 0;

2823

2825

2824

EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);

2826

EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);

2825

EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);

2827

EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);

2826

EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);

2828

EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);

2827

EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);

2829

EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);

2828

2830

2829

inode->i_version = le32_to_cpu(raw_inode->i_disk_version);

2831

inode->i_version = le32_to_cpu(raw_inode->i_disk_version);

2830

if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {

2832

if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {

2831

if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))

2833

if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))

2832

inode->i_version |=

2834

inode->i_version |=

2833

(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;

2835

(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;

2834

}

2836

}

2835

2837

2836

if (S_ISREG(inode->i_mode)) {

2838

if (S_ISREG(inode->i_mode)) {

2837

inode->i_op = &ext4_file_inode_operations;

2839

inode->i_op = &ext4_file_inode_operations;

2838

inode->i_fop = &ext4_file_operations;

2840

inode->i_fop = &ext4_file_operations;

2839

ext4_set_aops(inode);

2841

ext4_set_aops(inode);

2840

} else if (S_ISDIR(inode->i_mode)) {

2842

} else if (S_ISDIR(inode->i_mode)) {

2841

inode->i_op = &ext4_dir_inode_operations;

2843

inode->i_op = &ext4_dir_inode_operations;

2842

inode->i_fop = &ext4_dir_operations;

2844

inode->i_fop = &ext4_dir_operations;

2843

} else if (S_ISLNK(inode->i_mode)) {

2845

} else if (S_ISLNK(inode->i_mode)) {

2844

if (ext4_inode_is_fast_symlink(inode))

2846

if (ext4_inode_is_fast_symlink(inode))

2845

inode->i_op = &ext4_fast_symlink_inode_operations;

2847

inode->i_op = &ext4_fast_symlink_inode_operations;

2846

else {

2848

else {

2847

inode->i_op = &ext4_symlink_inode_operations;

2849

inode->i_op = &ext4_symlink_inode_operations;

2848

ext4_set_aops(inode);

2850

ext4_set_aops(inode);

2849

}

2851

}

2850

} else {

2852

} else {

2851

inode->i_op = &ext4_special_inode_operations;

2853

inode->i_op = &ext4_special_inode_operations;

2852

if (raw_inode->i_block[0])

2854

if (raw_inode->i_block[0])

2853

init_special_inode(inode, inode->i_mode,

2855

init_special_inode(inode, inode->i_mode,

2854

old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));

2856

old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));

2855

else

2857

else

2856

init_special_inode(inode, inode->i_mode,

2858

init_special_inode(inode, inode->i_mode,

2857

new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));

2859

new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));

2858

}

2860

}

2859

brelse (iloc.bh);

2861

brelse (iloc.bh);

2860

ext4_set_inode_flags(inode);

2862

ext4_set_inode_flags(inode);

2861

unlock_new_inode(inode);

2863

unlock_new_inode(inode);

2862

return inode;

2864

return inode;

2863

2865

2864

bad_inode:

2866

bad_inode:

2865

iget_failed(inode);

2867

iget_failed(inode);

2866

return ERR_PTR(ret);

2868

return ERR_PTR(ret);

2867

}

2869

}

2868

2870

2869

/*
 * Store the in-core i_blocks count into the on-disk inode.
 *
 * inode->i_blocks is kept in 512-byte units.  Three encodings are used,
 * chosen by magnitude:
 *   - up to 32 bits: i_blocks_lo only, i_blocks_high cleared;
 *   - up to 48 bits: split across i_blocks_lo / i_blocks_high;
 *   - larger: the count is first shifted right by (i_blkbits - 9) so it is
 *     expressed in filesystem blocks, and EXT4_HUGE_FILE_FL is set on the
 *     inode to record that unit.
 * The two wider encodings first call ext4_update_rocompat_feature() for
 * EXT4_FEATURE_RO_COMPAT_HUGE_FILE.
 *
 * Returns 0 on success, or whatever ext4_update_rocompat_feature()
 * returned on failure; in that case raw_inode and ei->i_flags are left
 * untouched.
 */
static int ext4_inode_blocks_set(handle_t *handle,
				struct ext4_inode *raw_inode,
				struct ext4_inode_info *ei)
{
	struct inode *vfs_inode = &ei->vfs_inode;
	struct super_block *sb = vfs_inode->i_sb;
	u64 nr_blocks = vfs_inode->i_blocks;
	int rc;

	if (nr_blocks <= ~0U) {
		/* Fits in 32 bits as a multiple of 512 bytes. */
		raw_inode->i_blocks_lo = cpu_to_le32(nr_blocks);
		raw_inode->i_blocks_high = 0;
		ei->i_flags &= ~EXT4_HUGE_FILE_FL;
		return 0;
	}

	/* Both remaining encodings use the high 16 bits. */
	rc = ext4_update_rocompat_feature(handle, sb,
					  EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
	if (rc)
		return rc;

	if (nr_blocks <= 0xffffffffffffULL) {
		/* Fits in 48 bits as a multiple of 512 bytes. */
		ei->i_flags &= ~EXT4_HUGE_FILE_FL;
	} else {
		/* Too big for 48 bits: count filesystem blocks instead. */
		ei->i_flags |= EXT4_HUGE_FILE_FL;
		nr_blocks >>= (vfs_inode->i_blkbits - 9);
	}

	/* The count is stored in the split 48-bit on-disk fields. */
	raw_inode->i_blocks_lo = cpu_to_le32(nr_blocks);
	raw_inode->i_blocks_high = cpu_to_le16(nr_blocks >> 32);
	return 0;
}

2917

2919

2918

/*

2920

/*

2919

* Post the struct inode info into an on-disk inode location in the

2921

* Post the struct inode info into an on-disk inode location in the

2920

* buffer-cache. This gobbles the caller's reference to the

2922

* buffer-cache. This gobbles the caller's reference to the

2921

* buffer_head in the inode location struct.

2923

* buffer_head in the inode location struct.

2922

*

2924

*

2923

* The caller must have write access to iloc->bh.

2925

* The caller must have write access to iloc->bh.

2924

*/

2926

*/

2925

static int ext4_do_update_inode(handle_t *handle,

2927

static int ext4_do_update_inode(handle_t *handle,

2926

struct inode *inode,

2928

struct inode *inode,

2927

struct ext4_iloc *iloc)

2929

struct ext4_iloc *iloc)

2928

{

2930

{

2929

struct ext4_inode *raw_inode = ext4_raw_inode(iloc);

2931

struct ext4_inode *raw_inode = ext4_raw_inode(iloc);

2930

struct ext4_inode_info *ei = EXT4_I(inode);

2932

struct ext4_inode_info *ei = EXT4_I(inode);

2931

struct buffer_head *bh = iloc->bh;

2933

struct buffer_head *bh = iloc->bh;

2932

int err = 0, rc, block;

2934

int err = 0, rc, block;

2933

2935

2934

/* For fields not not tracking in the in-memory inode,

2936

/* For fields not not tracking in the in-memory inode,

2935

* initialise them to zero for new inodes. */

2937

* initialise them to zero for new inodes. */

2936

if (ei->i_state & EXT4_STATE_NEW)

2938

if (ei->i_state & EXT4_STATE_NEW)

2937

memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);

2939

memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);

2938

2940

2939

ext4_get_inode_flags(ei);

2941

ext4_get_inode_flags(ei);

2940

raw_inode->i_mode = cpu_to_le16(inode->i_mode);

2942

raw_inode->i_mode = cpu_to_le16(inode->i_mode);

2941

if(!(test_opt(inode->i_sb, NO_UID32))) {

2943

if(!(test_opt(inode->i_sb, NO_UID32))) {

2942

raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));

2944

raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));

2943

raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));

2945

raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));

2944

/*

2946

/*

2945

* Fix up interoperability with old kernels. Otherwise, old inodes get

2947

* Fix up interoperability with old kernels. Otherwise, old inodes get

2946

* re-used with the upper 16 bits of the uid/gid intact

2948

* re-used with the upper 16 bits of the uid/gid intact

2947

*/

2949

*/

2948

if(!ei->i_dtime) {

2950

if(!ei->i_dtime) {

2949

raw_inode->i_uid_high =

2951

raw_inode->i_uid_high =

2950

cpu_to_le16(high_16_bits(inode->i_uid));

2952

cpu_to_le16(high_16_bits(inode->i_uid));

2951

raw_inode->i_gid_high =

2953

raw_inode->i_gid_high =

2952

cpu_to_le16(high_16_bits(inode->i_gid));

2954

cpu_to_le16(high_16_bits(inode->i_gid));

2953

} else {

2955

} else {

2954

raw_inode->i_uid_high = 0;

2956

raw_inode->i_uid_high = 0;

2955

raw_inode->i_gid_high = 0;

2957

raw_inode->i_gid_high = 0;

2956

}

2958

}

2957

} else {

2959

} else {

2958

raw_inode->i_uid_low =

2960

raw_inode->i_uid_low =

2959

cpu_to_le16(fs_high2lowuid(inode->i_uid));

2961

cpu_to_le16(fs_high2lowuid(inode->i_uid));

2960

raw_inode->i_gid_low =

2962

raw_inode->i_gid_low =

2961

cpu_to_le16(fs_high2lowgid(inode->i_gid));

2963

cpu_to_le16(fs_high2lowgid(inode->i_gid));

2962

raw_inode->i_uid_high = 0;

2964

raw_inode->i_uid_high = 0;

2963

raw_inode->i_gid_high = 0;

2965

raw_inode->i_gid_high = 0;

2964

}

2966

}

2965

raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);

2967

raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);

2966

2968

2967

EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);

2969

EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);

2968

EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);

2970

EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);

2969

EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);

2971

EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);

2970

EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);

2972

EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);

2971

2973

2972

if (ext4_inode_blocks_set(handle, raw_inode, ei))

2974

if (ext4_inode_blocks_set(handle, raw_inode, ei))

2973

goto out_brelse;

2975

goto out_brelse;

2974

raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);

2976

raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);

2975

/* clear the migrate flag in the raw_inode */

2977

/* clear the migrate flag in the raw_inode */

2976

raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);

2978

raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);

2977

if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=

2979

if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=

2978

cpu_to_le32(EXT4_OS_HURD))

2980

cpu_to_le32(EXT4_OS_HURD))

2979

raw_inode->i_file_acl_high =

2981

raw_inode->i_file_acl_high =

2980

cpu_to_le16(ei->i_file_acl >> 32);

2982

cpu_to_le16(ei->i_file_acl >> 32);

2981

raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);

2983

raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);

2982

ext4_isize_set(raw_inode, ei->i_disksize);

2984

ext4_isize_set(raw_inode, ei->i_disksize);

2983

if (ei->i_disksize > 0x7fffffffULL) {

2985

if (ei->i_disksize > 0x7fffffffULL) {

2984

struct super_block *sb = inode->i_sb;

2986

struct super_block *sb = inode->i_sb;

2985

if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,

2987

if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,

2986

EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||

2988

EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||

2987

EXT4_SB(sb)->s_es->s_rev_level ==

2989

EXT4_SB(sb)->s_es->s_rev_level ==

2988

cpu_to_le32(EXT4_GOOD_OLD_REV)) {

2990

cpu_to_le32(EXT4_GOOD_OLD_REV)) {

2989

/* If this is the first large file

2991

/* If this is the first large file

2990

* created, add a flag to the superblock.

2992

* created, add a flag to the superblock.

2991

*/

2993

*/

2992

err = ext4_journal_get_write_access(handle,

2994

err = ext4_journal_get_write_access(handle,

2993

EXT4_SB(sb)->s_sbh);

2995

EXT4_SB(sb)->s_sbh);

2994

if (err)

2996

if (err)

2995

goto out_brelse;

2997

goto out_brelse;

2996

ext4_update_dynamic_rev(sb);

2998

ext4_update_dynamic_rev(sb);

2997

EXT4_SET_RO_COMPAT_FEATURE(sb,

2999

EXT4_SET_RO_COMPAT_FEATURE(sb,

2998

EXT4_FEATURE_RO_COMPAT_LARGE_FILE);

3000

EXT4_FEATURE_RO_COMPAT_LARGE_FILE);

2999

sb->s_dirt = 1;

3001

sb->s_dirt = 1;

3000

handle->h_sync = 1;

3002

handle->h_sync = 1;

3001

err = ext4_journal_dirty_metadata(handle,

3003

err = ext4_journal_dirty_metadata(handle,

3002

EXT4_SB(sb)->s_sbh);

3004

EXT4_SB(sb)->s_sbh);

3003

}

3005

}

3004

}

3006

}

3005

raw_inode->i_generation = cpu_to_le32(inode->i_generation);

3007

raw_inode->i_generation = cpu_to_le32(inode->i_generation);

3006

if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {

3008

if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {

3007

if (old_valid_dev(inode->i_rdev)) {

3009

if (old_valid_dev(inode->i_rdev)) {

3008

raw_inode->i_block[0] =

3010

raw_inode->i_block[0] =

3009

cpu_to_le32(old_encode_dev(inode->i_rdev));

3011

cpu_to_le32(old_encode_dev(inode->i_rdev));

3010

raw_inode->i_block[1] = 0;

3012

raw_inode->i_block[1] = 0;

3011

} else {

3013

} else {

3012

raw_inode->i_block[0] = 0;

3014

raw_inode->i_block[0] = 0;

3013

raw_inode->i_block[1] =

3015

raw_inode->i_block[1] =

3014

cpu_to_le32(new_encode_dev(inode->i_rdev));

3016

cpu_to_le32(new_encode_dev(inode->i_rdev));

3015

raw_inode->i_block[2] = 0;

3017

raw_inode->i_block[2] = 0;

3016

}

3018

}

3017

} else for (block = 0; block < EXT4_N_BLOCKS; block++)

3019

} else for (block = 0; block < EXT4_N_BLOCKS; block++)

3018

raw_inode->i_block[block] = ei->i_data[block];

3020

raw_inode->i_block[block] = ei->i_data[block];

3019

3021

3020

raw_inode->i_disk_version = cpu_to_le32(inode->i_version);

3022

raw_inode->i_disk_version = cpu_to_le32(inode->i_version);

3021

if (ei->i_extra_isize) {

3023

if (ei->i_extra_isize) {

3022

if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))

3024

if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))

3023

raw_inode->i_version_hi =

3025

raw_inode->i_version_hi =

3024

cpu_to_le32(inode->i_version >> 32);

3026

cpu_to_le32(inode->i_version >> 32);

3025

raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);

3027

raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);

3026

}

3028

}

3027

3029

3028

3030

3029

BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");

3031

BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");

3030

rc = ext4_journal_dirty_metadata(handle, bh);

3032

rc = ext4_journal_dirty_metadata(handle, bh);

3031

if (!err)

3033

if (!err)

3032

err = rc;

3034

err = rc;

3033

ei->i_state &= ~EXT4_STATE_NEW;

3035

ei->i_state &= ~EXT4_STATE_NEW;

3034

3036

3035

out_brelse:

3037

out_brelse:

3036

brelse (bh);

3038

brelse (bh);

3037

ext4_std_error(inode->i_sb, err);

3039

ext4_std_error(inode->i_sb, err);

3038

return err;

3040

return err;

3039

}

3041

}

3040

3042

3041

/*

3043

/*

3042

* ext4_write_inode()

3044

* ext4_write_inode()

3043

*

3045

*

3044

* We are called from a few places:

3046

* We are called from a few places:

3045

*

3047

*

3046

* - Within generic_file_write() for O_SYNC files.

3048

* - Within generic_file_write() for O_SYNC files.

3047

* Here, there will be no transaction running. We wait for any running

3049

* Here, there will be no transaction running. We wait for any running

3048

* trasnaction to commit.

3050

* trasnaction to commit.

3049

*

3051

*

3050

* - Within sys_sync(), kupdate and such.

3052

* - Within sys_sync(), kupdate and such.

3051

* We wait on commit, if tol to.

3053

* We wait on commit, if tol to.

3052

*

3054

*

3053

* - Within prune_icache() (PF_MEMALLOC == true)

3055

* - Within prune_icache() (PF_MEMALLOC == true)

3054

* Here we simply return. We can't afford to block kswapd on the

3056

* Here we simply return. We can't afford to block kswapd on the

3055

* journal commit.

3057

* journal commit.

3056

*

3058

*

3057

* In all cases it is actually safe for us to return without doing anything,

3059

* In all cases it is actually safe for us to return without doing anything,

3058

* because the inode has been copied into a raw inode buffer in

3060

* because the inode has been copied into a raw inode buffer in

3059

* ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for

3061

* ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for

3060

* knfsd.

3062

* knfsd.

3061

*

3063

*

3062

* Note that we are absolutely dependent upon all inode dirtiers doing the

3064

* Note that we are absolutely dependent upon all inode dirtiers doing the

3063

* right thing: they *must* call mark_inode_dirty() after dirtying info in

3065

* right thing: they *must* call mark_inode_dirty() after dirtying info in

3064

* which we are interested.

3066

* which we are interested.

3065

*

3067

*

3066

* It would be a bug for them to not do this. The code:

3068

* It would be a bug for them to not do this. The code:

3067

*

3069

*

3068

* mark_inode_dirty(inode)

3070

* mark_inode_dirty(inode)

3069

* stuff();

3071

* stuff();

3070

* inode->i_size = expr;

3072

* inode->i_size = expr;

3071

*

3073

*

3072

* is in error because a kswapd-driven write_inode() could occur while

3074

* is in error because a kswapd-driven write_inode() could occur while

3073

* `stuff()' is running, and the new i_size will be lost. Plus the inode

3075

* `stuff()' is running, and the new i_size will be lost. Plus the inode

3074

* will no longer be on the superblock's dirty inode list.

3076

* will no longer be on the superblock's dirty inode list.

3075

*/

3077

*/

3076

int ext4_write_inode(struct inode *inode, int wait)

3078

int ext4_write_inode(struct inode *inode, int wait)

3077

{

3079

{

3078

if (current->flags & PF_MEMALLOC)

3080

if (current->flags & PF_MEMALLOC)

3079

return 0;

3081

return 0;

3080

3082

3081

if (ext4_journal_current_handle()) {

3083

if (ext4_journal_current_handle()) {

3082

jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");

3084

jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");

3083

dump_stack();

3085

dump_stack();

3084

return -EIO;

3086

return -EIO;

3085

}

3087

}

3086

3088

3087

if (!wait)

3089

if (!wait)

3088

return 0;

3090

return 0;

3089

3091

3090

return ext4_force_commit(inode->i_sb);

3092

return ext4_force_commit(inode->i_sb);

3091

}

3093

}

3092

3094

3093

/*

3095

/*

3094

* ext4_setattr()

3096

* ext4_setattr()

3095

*

3097

*

3096

* Called from notify_change.

3098

* Called from notify_change.

3097

*

3099

*

3098

* We want to trap VFS attempts to truncate the file as soon as

3100

* We want to trap VFS attempts to truncate the file as soon as

3099

* possible. In particular, we want to make sure that when the VFS

3101

* possible. In particular, we want to make sure that when the VFS

3100

* shrinks i_size, we put the inode on the orphan list and modify

3102

* shrinks i_size, we put the inode on the orphan list and modify

3101

* i_disksize immediately, so that during the subsequent flushing of

3103

* i_disksize immediately, so that during the subsequent flushing of

3102

* dirty pages and freeing of disk blocks, we can guarantee that any

3104

* dirty pages and freeing of disk blocks, we can guarantee that any

3103

* commit will leave the blocks being flushed in an unused state on

3105

* commit will leave the blocks being flushed in an unused state on

3104

* disk. (On recovery, the inode will get truncated and the blocks will

3106

* disk. (On recovery, the inode will get truncated and the blocks will

3105

* be freed, so we have a strong guarantee that no future commit will

3107

* be freed, so we have a strong guarantee that no future commit will

3106

* leave these blocks visible to the user.)

3108

* leave these blocks visible to the user.)

3107

*

3109

*

3108

* Called with inode->sem down.

3110

* Called with inode->sem down.

3109

*/

3111

*/

3110

int ext4_setattr(struct dentry *dentry, struct iattr *attr)

3112

int ext4_setattr(struct dentry *dentry, struct iattr *attr)

3111

{

3113

{

3112

struct inode *inode = dentry->d_inode;

3114

struct inode *inode = dentry->d_inode;

3113

int error, rc = 0;

3115

int error, rc = 0;

3114

const unsigned int ia_valid = attr->ia_valid;

3116

const unsigned int ia_valid = attr->ia_valid;

3115

3117

3116

error = inode_change_ok(inode, attr);

3118

error = inode_change_ok(inode, attr);

3117

if (error)

3119

if (error)

3118

return error;

3120

return error;

3119

3121

3120

if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||

3122

if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||

3121

(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {

3123

(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {

3122

handle_t *handle;

3124

handle_t *handle;

3123

3125

3124

/* (user+group)*(old+new) structure, inode write (sb,

3126

/* (user+group)*(old+new) structure, inode write (sb,

3125

* inode block, ? - but truncate inode update has it) */

3127

* inode block, ? - but truncate inode update has it) */

3126

handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+

3128

handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+

3127

EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);

3129

EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);

3128

if (IS_ERR(handle)) {

3130

if (IS_ERR(handle)) {

3129

error = PTR_ERR(handle);

3131

error = PTR_ERR(handle);

3130

goto err_out;

3132

goto err_out;

3131

}

3133

}

3132

error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;

3134

error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;

3133

if (error) {

3135

if (error) {

3134

ext4_journal_stop(handle);

3136

ext4_journal_stop(handle);

3135

return error;

3137

return error;

3136

}

3138

}

3137

/* Update corresponding info in inode so that everything is in

3139

/* Update corresponding info in inode so that everything is in

3138

* one transaction */

3140

* one transaction */

3139

if (attr->ia_valid & ATTR_UID)

3141

if (attr->ia_valid & ATTR_UID)

3140

inode->i_uid = attr->ia_uid;

3142

inode->i_uid = attr->ia_uid;

3141

if (attr->ia_valid & ATTR_GID)

3143

if (attr->ia_valid & ATTR_GID)

3142

inode->i_gid = attr->ia_gid;

3144

inode->i_gid = attr->ia_gid;

3143

error = ext4_mark_inode_dirty(handle, inode);

3145

error = ext4_mark_inode_dirty(handle, inode);

3144

ext4_journal_stop(handle);

3146

ext4_journal_stop(handle);

3145

}

3147

}

3146

3148

3147

if (attr->ia_valid & ATTR_SIZE) {

3149

if (attr->ia_valid & ATTR_SIZE) {

3148

if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {

3150

if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {

3149

struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

3151

struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

3150

3152

3151

if (attr->ia_size > sbi->s_bitmap_maxbytes) {

3153

if (attr->ia_size > sbi->s_bitmap_maxbytes) {

3152

error = -EFBIG;

3154

error = -EFBIG;

3153

goto err_out;

3155

goto err_out;

3154

}

3156

}

3155

}

3157

}

3156

}

3158

}

3157

3159

3158

if (S_ISREG(inode->i_mode) &&

3160

if (S_ISREG(inode->i_mode) &&

3159

attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {

3161

attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {

3160

handle_t *handle;

3162

handle_t *handle;

3161

3163

3162

handle = ext4_journal_start(inode, 3);

3164

handle = ext4_journal_start(inode, 3);

3163

if (IS_ERR(handle)) {

3165

if (IS_ERR(handle)) {

3164

error = PTR_ERR(handle);

3166

error = PTR_ERR(handle);

3165

goto err_out;

3167

goto err_out;

3166

}

3168

}

3167

3169

3168

error = ext4_orphan_add(handle, inode);

3170

error = ext4_orphan_add(handle, inode);

3169

EXT4_I(inode)->i_disksize = attr->ia_size;

3171

EXT4_I(inode)->i_disksize = attr->ia_size;

3170

rc = ext4_mark_inode_dirty(handle, inode);

3172

rc = ext4_mark_inode_dirty(handle, inode);

3171

if (!error)

3173

if (!error)

3172

error = rc;

3174

error = rc;

3173

ext4_journal_stop(handle);

3175

ext4_journal_stop(handle);

3174

}

3176

}

3175

3177

3176

rc = inode_setattr(inode, attr);

3178

rc = inode_setattr(inode, attr);

3177

3179

3178

/* If inode_setattr's call to ext4_truncate failed to get a

3180

/* If inode_setattr's call to ext4_truncate failed to get a

3179

* transaction handle at all, we need to clean up the in-core

3181

* transaction handle at all, we need to clean up the in-core

3180

* orphan list manually. */

3182

* orphan list manually. */

3181

if (inode->i_nlink)

3183

if (inode->i_nlink)

3182

ext4_orphan_del(NULL, inode);

3184

ext4_orphan_del(NULL, inode);

3183

3185

3184

if (!rc && (ia_valid & ATTR_MODE))

3186

if (!rc && (ia_valid & ATTR_MODE))

3185

rc = ext4_acl_chmod(inode);

3187

rc = ext4_acl_chmod(inode);

3186

3188

3187

err_out:

3189

err_out:

3188

ext4_std_error(inode->i_sb, error);

3190

ext4_std_error(inode->i_sb, error);

3189

if (!error)

3191

if (!error)

3190

error = rc;

3192

error = rc;

3191

return error;

3193

return error;

3192

}

3194

}

3193

3195

3194

3196

3195

/*

3197

/*

3196

* How many blocks doth make a writepage()?

3198

* How many blocks doth make a writepage()?

3197

*

3199

*

3198

* With N blocks per page, it may be:

3200

* With N blocks per page, it may be:

3199

* N data blocks

3201

* N data blocks

3200

* 2 indirect block

3202

* 2 indirect block

3201

* 2 dindirect

3203

* 2 dindirect

3202

* 1 tindirect

3204

* 1 tindirect

3203

* N+5 bitmap blocks (from the above)

3205

* N+5 bitmap blocks (from the above)

3204

* N+5 group descriptor summary blocks

3206

* N+5 group descriptor summary blocks

3205

* 1 inode block

3207

* 1 inode block

3206

* 1 superblock.

3208

* 1 superblock.

3207

* 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files

3209

* 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files

3208

*

3210

*

3209

* 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS

3211

* 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS

3210

*

3212

*

3211

* With ordered or writeback data it's the same, less the N data blocks.

3213

* With ordered or writeback data it's the same, less the N data blocks.

3212

*

3214

*

3213

* If the inode's direct blocks can hold an integral number of pages then a

3215

* If the inode's direct blocks can hold an integral number of pages then a

3214

* page cannot straddle two indirect blocks, and we can only touch one indirect

3216

* page cannot straddle two indirect blocks, and we can only touch one indirect

3215

* and dindirect block, and the "5" above becomes "3".

3217

* and dindirect block, and the "5" above becomes "3".

3216

*

3218

*

3217

* This still overestimates under most circumstances. If we were to pass the

3219

* This still overestimates under most circumstances. If we were to pass the

3218

* start and end offsets in here as well we could do block_to_path() on each

3220

* start and end offsets in here as well we could do block_to_path() on each

3219

* block and work out the exact number of indirects which are touched. Pah.

3221

* block and work out the exact number of indirects which are touched. Pah.

3220

*/

3222

*/

3221

3223

3222

int ext4_writepage_trans_blocks(struct inode *inode)

3224

int ext4_writepage_trans_blocks(struct inode *inode)

3223

{

3225

{

3224

int bpp = ext4_journal_blocks_per_page(inode);

3226

int bpp = ext4_journal_blocks_per_page(inode);

3225

int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;

3227

int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;

3226

int ret;

3228

int ret;

3227

3229

3228

if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)

3230

if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)

3229

return ext4_ext_writepage_trans_blocks(inode, bpp);

3231

return ext4_ext_writepage_trans_blocks(inode, bpp);

3230

3232

3231

if (ext4_should_journal_data(inode))

3233

if (ext4_should_journal_data(inode))

3232

ret = 3 * (bpp + indirects) + 2;

3234

ret = 3 * (bpp + indirects) + 2;

3233

else

3235

else

3234

ret = 2 * (bpp + indirects) + 2;

3236

ret = 2 * (bpp + indirects) + 2;

3235

3237

3236

#ifdef CONFIG_QUOTA

3238

#ifdef CONFIG_QUOTA

3237

/* We know that structure was already allocated during DQUOT_INIT so

3239

/* We know that structure was already allocated during DQUOT_INIT so

3238

* we will be updating only the data blocks + inodes */

3240

* we will be updating only the data blocks + inodes */

3239

ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);

3241

ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);

3240

#endif

3242

#endif

3241

3243

3242

return ret;

3244

return ret;

3243

}

3245

}

3244

3246

3245

/*

3247

/*

3246

* The caller must have previously called ext4_reserve_inode_write().

3248

* The caller must have previously called ext4_reserve_inode_write().

3247

* Give this, we know that the caller already has write access to iloc->bh.

3249

* Give this, we know that the caller already has write access to iloc->bh.

3248

*/

3250

*/

3249

int ext4_mark_iloc_dirty(handle_t *handle,

3251

int ext4_mark_iloc_dirty(handle_t *handle,

3250

struct inode *inode, struct ext4_iloc *iloc)

3252

struct inode *inode, struct ext4_iloc *iloc)

3251

{

3253

{

3252

int err = 0;

3254

int err = 0;

3253

3255

3254

if (test_opt(inode->i_sb, I_VERSION))

3256

if (test_opt(inode->i_sb, I_VERSION))

3255

inode_inc_iversion(inode);

3257

inode_inc_iversion(inode);

3256

3258

3257

/* the do_update_inode consumes one bh->b_count */

3259

/* the do_update_inode consumes one bh->b_count */

3258

get_bh(iloc->bh);

3260

get_bh(iloc->bh);

3259

3261

3260

/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */

3262

/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */

3261

err = ext4_do_update_inode(handle, inode, iloc);

3263

err = ext4_do_update_inode(handle, inode, iloc);

3262

put_bh(iloc->bh);

3264

put_bh(iloc->bh);

3263

return err;

3265

return err;

3264

}

3266

}

3265

3267

3266

/*

3268

/*

3267

* On success, We end up with an outstanding reference count against

3269

* On success, We end up with an outstanding reference count against

3268

* iloc->bh. This _must_ be cleaned up later.

3270

* iloc->bh. This _must_ be cleaned up later.

3269

*/

3271

*/

3270

3272

3271

int

3273

int

3272

ext4_reserve_inode_write(handle_t *handle, struct inode *inode,

3274

ext4_reserve_inode_write(handle_t *handle, struct inode *inode,

3273

struct ext4_iloc *iloc)

3275

struct ext4_iloc *iloc)

3274

{

3276

{

3275

int err = 0;

3277

int err = 0;

3276

if (handle) {

3278

if (handle) {

3277

err = ext4_get_inode_loc(inode, iloc);

3279

err = ext4_get_inode_loc(inode, iloc);

3278

if (!err) {

3280

if (!err) {

3279

BUFFER_TRACE(iloc->bh, "get_write_access");

3281

BUFFER_TRACE(iloc->bh, "get_write_access");

3280

err = ext4_journal_get_write_access(handle, iloc->bh);

3282

err = ext4_journal_get_write_access(handle, iloc->bh);

3281

if (err) {

3283

if (err) {

3282

brelse(iloc->bh);

3284

brelse(iloc->bh);

3283

iloc->bh = NULL;

3285

iloc->bh = NULL;

3284

}

3286

}

3285

}

3287

}

3286

}

3288

}

3287

ext4_std_error(inode->i_sb, err);

3289

ext4_std_error(inode->i_sb, err);

3288

return err;

3290

return err;

3289

}

3291

}

3290

3292

3291

/*

3293

/*

3292

* Expand an inode by new_extra_isize bytes.

3294

* Expand an inode by new_extra_isize bytes.

3293

* Returns 0 on success or negative error number on failure.

3295

* Returns 0 on success or negative error number on failure.

3294

*/

3296

*/

3295

static int ext4_expand_extra_isize(struct inode *inode,

3297

static int ext4_expand_extra_isize(struct inode *inode,

3296

unsigned int new_extra_isize,

3298

unsigned int new_extra_isize,

3297

struct ext4_iloc iloc,

3299

struct ext4_iloc iloc,

3298

handle_t *handle)

3300

handle_t *handle)

3299

{

3301

{

3300

struct ext4_inode *raw_inode;

3302

struct ext4_inode *raw_inode;

3301

struct ext4_xattr_ibody_header *header;

3303

struct ext4_xattr_ibody_header *header;

3302

struct ext4_xattr_entry *entry;

3304

struct ext4_xattr_entry *entry;

3303

3305

3304

if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)

3306

if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)

3305

return 0;

3307

return 0;

3306

3308

3307

raw_inode = ext4_raw_inode(&iloc);

3309

raw_inode = ext4_raw_inode(&iloc);

3308

3310

3309

header = IHDR(inode, raw_inode);

3311

header = IHDR(inode, raw_inode);

3310

entry = IFIRST(header);

3312

entry = IFIRST(header);

3311

3313

3312

/* No extended attributes present */

3314

/* No extended attributes present */

3313

if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||

3315

if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||

3314

header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {

3316

header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {

3315

memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,

3317

memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,

3316

new_extra_isize);

3318

new_extra_isize);

3317

EXT4_I(inode)->i_extra_isize = new_extra_isize;

3319

EXT4_I(inode)->i_extra_isize = new_extra_isize;

3318

return 0;

3320

return 0;

3319

}

3321

}

3320

3322

3321

/* try to expand with EAs present */

3323

/* try to expand with EAs present */

3322

return ext4_expand_extra_isize_ea(inode, new_extra_isize,

3324

return ext4_expand_extra_isize_ea(inode, new_extra_isize,

3323

raw_inode, handle);

3325

raw_inode, handle);

3324

}

3326

}

3325

3327

3326

/*

3328

/*

3327

* What we do here is to mark the in-core inode as clean with respect to inode

3329

* What we do here is to mark the in-core inode as clean with respect to inode

3328

* dirtiness (it may still be data-dirty).

3330

* dirtiness (it may still be data-dirty).

3329

* This means that the in-core inode may be reaped by prune_icache

3331

* This means that the in-core inode may be reaped by prune_icache

3330

* without having to perform any I/O. This is a very good thing,

3332

* without having to perform any I/O. This is a very good thing,

3331

* because *any* task may call prune_icache - even ones which

3333

* because *any* task may call prune_icache - even ones which

3332

* have a transaction open against a different journal.

3334

* have a transaction open against a different journal.

3333

*

3335

*

3334

* Is this cheating? Not really. Sure, we haven't written the

3336

* Is this cheating? Not really. Sure, we haven't written the

3335

* inode out, but prune_icache isn't a user-visible syncing function.

3337

* inode out, but prune_icache isn't a user-visible syncing function.

3336

* Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)

3338

* Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)

3337

* we start and wait on commits.

3339

* we start and wait on commits.

3338

*

3340

*

3339

* Is this efficient/effective? Well, we're being nice to the system

3341

* Is this efficient/effective? Well, we're being nice to the system

3340

* by cleaning up our inodes proactively so they can be reaped

3342

* by cleaning up our inodes proactively so they can be reaped

3341

* without I/O. But we are potentially leaving up to five seconds'

3343

* without I/O. But we are potentially leaving up to five seconds'

3342

* worth of inodes floating about which prune_icache wants us to

3344

* worth of inodes floating about which prune_icache wants us to

3343

* write out. One way to fix that would be to get prune_icache()

3345

* write out. One way to fix that would be to get prune_icache()

3344

* to do a write_super() to free up some memory. It has the desired

3346

* to do a write_super() to free up some memory. It has the desired

3345

* effect.

3347

* effect.

3346

*/

3348

*/

3347

int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
{
	struct ext4_iloc iloc;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	/* Mount count at which the "unable to expand" warning was last issued. */
	static unsigned int mnt_count;
	int err, ret;

	might_sleep();
	err = ext4_reserve_inode_write(handle, inode, &iloc);
	/*
	 * If the reservation failed, iloc cannot be trusted to describe a
	 * usable inode location, so it must not be handed to
	 * ext4_expand_extra_isize() below.  Report the failure unchanged.
	 */
	if (err)
		return err;

	if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
	    !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
		/*
		 * We need extra buffer credits since we may write into EA block
		 * with this same handle. If journal_extend fails, then it will
		 * only result in a minor loss of functionality for that inode.
		 * If this is felt to be critical, then e2fsck should be run to
		 * force a large enough s_min_extra_isize.
		 */
		if ((jbd2_journal_extend(handle,
			     EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
			ret = ext4_expand_extra_isize(inode,
						      sbi->s_want_extra_isize,
						      iloc, handle);
			if (ret) {
				/* Do not retry the expansion for this inode. */
				EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
				/* Warn only when the mount count has changed. */
				if (mnt_count !=
					le16_to_cpu(sbi->s_es->s_mnt_count)) {
					ext4_warning(inode->i_sb, __func__,
					"Unable to expand inode %lu. Delete"
					" some EAs or run e2fsck.",
					inode->i_ino);
					mnt_count =
					  le16_to_cpu(sbi->s_es->s_mnt_count);
				}
			}
		}
	}
	/* An expansion failure (ret) is deliberately not propagated. */
	return ext4_mark_iloc_dirty(handle, inode, &iloc);
}

3388

3390

3389

/*

3391

/*

3390

* ext4_dirty_inode() is called from __mark_inode_dirty()

3392

* ext4_dirty_inode() is called from __mark_inode_dirty()

3391

*

3393

*

3392

* We're really interested in the case where a file is being extended.

3394

* We're really interested in the case where a file is being extended.

3393

* i_size has been changed by generic_commit_write() and we thus need

3395

* i_size has been changed by generic_commit_write() and we thus need

3394

* to include the updated inode in the current transaction.

3396

* to include the updated inode in the current transaction.

3395

*

3397

*

3396

* Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks

3398

* Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks

3397

* are allocated to the file.

3399

* are allocated to the file.

3398

*

3400

*

3399

* If the inode is marked synchronous, we don't honour that here - doing

3401

* If the inode is marked synchronous, we don't honour that here - doing

3400

* so would cause a commit on atime updates, which we don't bother doing.

3402

* so would cause a commit on atime updates, which we don't bother doing.

3401

* We handle synchronous inodes at the highest possible level.

3403

* We handle synchronous inodes at the highest possible level.

3402

*/

3404

*/

3403

void ext4_dirty_inode(struct inode *inode)

3405

void ext4_dirty_inode(struct inode *inode)

3404

{

3406

{

3405

handle_t *current_handle = ext4_journal_current_handle();

3407

handle_t *current_handle = ext4_journal_current_handle();

3406

handle_t *handle;

3408

handle_t *handle;

3407

3409

3408

handle = ext4_journal_start(inode, 2);

3410

handle = ext4_journal_start(inode, 2);

3409

if (IS_ERR(handle))

3411

if (IS_ERR(handle))

3410

goto out;

3412

goto out;

3411

if (current_handle &&

3413

if (current_handle &&

3412

current_handle->h_transaction != handle->h_transaction) {

3414

current_handle->h_transaction != handle->h_transaction) {

3413

/* This task has a transaction open against a different fs */

3415

/* This task has a transaction open against a different fs */

3414

printk(KERN_EMERG "%s: transactions do not match!\n",

3416

printk(KERN_EMERG "%s: transactions do not match!\n",

3415

__func__);

3417

__func__);

3416

} else {

3418

} else {

3417

jbd_debug(5, "marking dirty. outer handle=%p\n",

3419

jbd_debug(5, "marking dirty. outer handle=%p\n",

3418

current_handle);

3420

current_handle);

3419

ext4_mark_inode_dirty(handle, inode);

3421

ext4_mark_inode_dirty(handle, inode);

3420

}

3422

}

3421

ext4_journal_stop(handle);

3423

ext4_journal_stop(handle);

3422

out:

3424

out:

3423

return;

3425

return;

3424

}

3426

}

3425

3427

3426

#if 0

3428

#if 0

3427

/*

3429

/*

3428

* Bind an inode's backing buffer_head into this transaction, to prevent

3430

* Bind an inode's backing buffer_head into this transaction, to prevent

3429

* it from being flushed to disk early. Unlike

3431

* it from being flushed to disk early. Unlike

3430

* ext4_reserve_inode_write, this leaves behind no bh reference and

3432

* ext4_reserve_inode_write, this leaves behind no bh reference and

3431

* returns no iloc structure, so the caller needs to repeat the iloc

3433

* returns no iloc structure, so the caller needs to repeat the iloc

3432

* lookup to mark the inode dirty later.

3434

* lookup to mark the inode dirty later.

3433

*/

3435

*/

3434

static int ext4_pin_inode(handle_t *handle, struct inode *inode)
{
	struct ext4_iloc iloc;
	int err = 0;

	/* Without a handle there is nothing to bind the buffer to. */
	if (!handle)
		goto report;

	err = ext4_get_inode_loc(inode, &iloc);
	if (err)
		goto report;

	BUFFER_TRACE(iloc.bh, "get_write_access");
	err = jbd2_journal_get_write_access(handle, iloc.bh);
	if (!err)
		err = ext4_journal_dirty_metadata(handle,
						  iloc.bh);
	/* The buffer reference is always dropped; no iloc is handed back. */
	brelse(iloc.bh);
report:
	ext4_std_error(inode->i_sb, err);
	return err;
}

3453

#endif

3455

#endif

3454

3456

3455

int ext4_change_inode_journal_flag(struct inode *inode, int val)

3457

int ext4_change_inode_journal_flag(struct inode *inode, int val)

3456

{

3458

{

3457

journal_t *journal;

3459

journal_t *journal;

3458

handle_t *handle;

3460

handle_t *handle;

3459

int err;

3461

int err;

3460

3462

3461

/*

3463

/*

3462

* We have to be very careful here: changing a data block's

3464

* We have to be very careful here: changing a data block's

3463

* journaling status dynamically is dangerous. If we write a

3465

* journaling status dynamically is dangerous. If we write a

3464

* data block to the journal, change the status and then delete

3466

* data block to the journal, change the status and then delete

3465

* that block, we risk forgetting to revoke the old log record

3467

* that block, we risk forgetting to revoke the old log record

3466

* from the journal and so a subsequent replay can corrupt data.

3468

* from the journal and so a subsequent replay can corrupt data.

3467

* So, first we make sure that the journal is empty and that

3469

* So, first we make sure that the journal is empty and that

3468

* nobody is changing anything.

3470

* nobody is changing anything.

3469

*/

3471

*/

3470

3472

3471

journal = EXT4_JOURNAL(inode);

3473

journal = EXT4_JOURNAL(inode);

3472

if (is_journal_aborted(journal))

3474

if (is_journal_aborted(journal))

3473

return -EROFS;

3475

return -EROFS;

3474

3476

3475

jbd2_journal_lock_updates(journal);

3477

jbd2_journal_lock_updates(journal);

3476

jbd2_journal_flush(journal);

3478

jbd2_journal_flush(journal);

3477

3479

3478

/*

3480

/*

3479

* OK, there are no updates running now, and all cached data is

3481

* OK, there are no updates running now, and all cached data is

3480

* synced to disk. We are now in a completely consistent state

3482

* synced to disk. We are now in a completely consistent state

3481

* which doesn't have anything in the journal, and we know that

3483

* which doesn't have anything in the journal, and we know that

3482

* no filesystem updates are running, so it is safe to modify

3484

* no filesystem updates are running, so it is safe to modify

3483

* the inode's in-core data-journaling state flag now.

3485

* the inode's in-core data-journaling state flag now.

3484

*/

3486

*/

3485

3487

3486

if (val)

3488

if (val)

3487

EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;

3489

EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;

3488

else

3490

else

3489

EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;

3491

EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;

3490

ext4_set_aops(inode);

3492

ext4_set_aops(inode);

3491

3493

3492

jbd2_journal_unlock_updates(journal);

3494

jbd2_journal_unlock_updates(journal);

3493

3495

3494

/* Finally we can mark the inode as dirty. */

3496

/* Finally we can mark the inode as dirty. */

3495

3497

3496

handle = ext4_journal_start(inode, 1);

3498

handle = ext4_journal_start(inode, 1);

3497

if (IS_ERR(handle))

3499

if (IS_ERR(handle))

3498

return PTR_ERR(handle);

3500

return PTR_ERR(handle);

3499

3501

3500

err = ext4_mark_inode_dirty(handle, inode);

3502

err = ext4_mark_inode_dirty(handle, inode);

3501

handle->h_sync = 1;

3503

handle->h_sync = 1;

3502

ext4_journal_stop(handle);

3504

ext4_journal_stop(handle);

3503

ext4_std_error(inode->i_sb, err);

3505

ext4_std_error(inode->i_sb, err);

3504

3506

3505

return err;

3507

return err;

3506

}

3508

}

3507

3509

GITLAB

ext4: fix test ext_generic_write_end() copied return value

 /*
  *  linux/fs/ext4/inode.c
  *
  * Copyright (C) 1992, 1993, 1994, 1995
  * Remy Card (card@masi.ibp.fr)
  * Laboratoire MASI - Institut Blaise Pascal
  * Universite Pierre et Marie Curie (Paris VI)
  *
  *  from
  *
  *  linux/fs/minix/inode.c
  *
  *  Copyright (C) 1991, 1992  Linus Torvalds
  *
  *  Goal-directed block allocation by Stephen Tweedie
  *	(sct@redhat.com), 1993, 1998
  *  Big-endian to little-endian byte-swapping/bitmaps by
  *        David S. Miller (davem@caip.rutgers.edu), 1995
  *  64-bit file support on 64-bit platforms by Jakub Jelinek
  *	(jj@sunsite.ms.mff.cuni.cz)
  *
  *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
  */
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/time.h>
 #include <linux/jbd2.h>
 #include <linux/highuid.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
 #include <linux/string.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
 #include <linux/mpage.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
 /*
  * Return non-zero when @inode is a symlink whose i_blocks count is fully
  * explained by its extended-attribute block (if it has one), i.e. a
  * fast symlink.
  */
 static int ext4_inode_is_fast_symlink(struct inode *inode)
 {
 	int ea_blocks = 0;
 	/* With an EA block present, discount s_blocksize / 512 from i_blocks. */
 	if (EXT4_I(inode)->i_file_acl)
 		ea_blocks = inode->i_sb->s_blocksize >> 9;
 	if (!S_ISLNK(inode->i_mode))
 		return 0;
 	return inode->i_blocks - ea_blocks == 0;
 }
 /*
  * ext4_forget() tells the journal that a block is being freed.
  *
  * Journaled data must be revoked when it is freed, and metadata (e.g.
  * indirect blocks) must always be revoked.
  *
  * @bh may be NULL: the metadata block may already be gone from memory
  * while a record of it still sits in the journal and needs revoking.
  *
  * Returns 0 or the error from the forget/revoke call.
  */
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
 			struct buffer_head *bh, ext4_fsblk_t blocknr)
 {
 	int err;
 	might_sleep();
 	BUFFER_TRACE(bh, "enter");
 	jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
 		  "data mode %lx\n",
 		  bh, is_metadata, inode->i_mode,
 		  test_opt(inode->i_sb, DATA_FLAGS));
 	/*
 	 * Revoke only when data!=journal and the block is either metadata
 	 * or journaled data.  Under full data journaling revoke is never
 	 * used: there is no need to, and a V1 superblock won't support it.
 	 */
 	if (test_opt(inode->i_sb, DATA_FLAGS) != EXT4_MOUNT_JOURNAL_DATA &&
 	    (is_metadata || ext4_should_journal_data(inode))) {
 		BUFFER_TRACE(bh, "call ext4_journal_revoke");
 		err = ext4_journal_revoke(handle, blocknr, bh);
 		if (err)
 			ext4_abort(inode->i_sb, __func__,
 				   "error %d when attempting revoke", err);
 		BUFFER_TRACE(bh, "exit");
 		return err;
 	}
 	/* Plain forget; nothing to do when the buffer is not in memory. */
 	if (!bh)
 		return 0;
 	BUFFER_TRACE(bh, "call jbd2_journal_forget");
 	return ext4_journal_forget(handle, bh);
 }
 /*
  * Compute how many journal blocks to ask for when proceeding with the
  * next chunk of a truncate transaction.
  */
 static unsigned long blocks_for_truncate(struct inode *inode)
 {
 	/* i_blocks converted from 512-byte units to filesystem blocks. */
 	ext4_lblk_t nblocks =
 		inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
 	/*
 	 * Keep a small floor so that an inode with a corrupt i_blocks
 	 * (random data that merely looks like a regular file) still gets
 	 * some room: things may go a bit crazy, but we should at least try
 	 * not to panic the whole kernel.
 	 */
 	if (nblocks < 2)
 		nblocks = 2;
 	/* Cap the request so that the transaction cannot overflow the journal. */
 	if (nblocks > EXT4_MAX_TRANS_DATA)
 		nblocks = EXT4_MAX_TRANS_DATA;
 	return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + nblocks;
 }
 /*
  * Truncate transactions can be complex and absolutely huge, so we must be
  * able to restart the transaction at a convenient checkpoint to make sure
  * we don't overflow the journal.
  *
  * start_transaction() obtains a new handle for a truncate transaction;
  * try_to_extend_transaction() tries to grow the existing one a bit.  If
  * the extension fails, the failure has to be propagated up so that the
  * transaction is restarted in the top-level truncate loop. --sct
  *
  * Returns the handle, or the ERR_PTR from ext4_journal_start() after
  * reporting it through ext4_std_error().
  */
 static handle_t *start_transaction(struct inode *inode)
 {
 	handle_t *handle = ext4_journal_start(inode, blocks_for_truncate(inode));
 	if (IS_ERR(handle)) {
 		ext4_std_error(inode->i_sb, PTR_ERR(handle));
 	}
 	return handle;
 }
 /*
  * Try to make more room in this transaction for truncation.
  *
  * Returns 0 when the handle already has more than
  * EXT4_RESERVE_TRANS_BLOCKS credits or was extended successfully, and 1
  * when no room could be created and the transaction must be restarted.
  */
 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
 {
 	if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
 		return 0;
 	return ext4_journal_extend(handle, blocks_for_truncate(inode)) ? 1 : 0;
 }
 /*
  * Restart the transaction behind @handle.  A restart implies a commit, so
  * everything must already be consistently dirtied against this
  * transaction before calling here.
  *
  * Returns the result of ext4_journal_restart().
  */
 static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
 {
 	unsigned long credits;
 	jbd_debug(2, "restarting handle %p\n", handle);
 	credits = blocks_for_truncate(inode);
 	return ext4_journal_restart(handle, credits);
 }
 /*
  * Called at the last iput() if i_nlink is zero.
  *
  * Truncates the inode's pages and blocks, removes its orphan record and
  * frees the on-disk inode.  On every path the in-core inode ends up
  * either freed through ext4_free_inode() or cleared with clear_inode().
  */
 void ext4_delete_inode(struct inode *inode)
 {
 	handle_t *handle;
 	truncate_inode_pages(&inode->i_data, 0);
 	if (is_bad_inode(inode))
 		goto no_delete;
 	handle = start_transaction(inode);
 	if (IS_ERR(handle)) {
 		/*
 		 * The normal cleanup is being skipped, but the in-core
 		 * orphan linked list still has to be cleaned up properly.
 		 */
 		ext4_orphan_del(NULL, inode);
 		goto no_delete;
 	}
 	if (IS_SYNC(inode))
 		handle->h_sync = 1;
 	inode->i_size = 0;
 	if (inode->i_blocks)
 		ext4_truncate(inode);
 	/*
 	 * Kill off the orphan record which ext4_truncate created.
 	 * AKPM: I think this can be inside the above `if'.
 	 * ext4_orphan_del() must cope with deleting a non-existent orphan,
 	 * because we don't know whether ext4_truncate() actually created
 	 * an orphan record.
 	 */
 	ext4_orphan_del(handle, inode);
 	EXT4_I(inode)->i_dtime = get_seconds();
 	/*
 	 * Subtle ordering requirement: if anything has gone wrong
 	 * (transaction abort, IO errors, whatever) the steps above are
 	 * still fine (the fs will already have been marked as having
 	 * errors), but the inode must not be freed when marking it dirty
 	 * fails; then only the required in-core clear is done.
 	 */
 	if (!ext4_mark_inode_dirty(handle, inode))
 		ext4_free_inode(handle, inode);
 	else
 		clear_inode(inode);
 	ext4_journal_stop(handle);
 	return;
 no_delete:
 	/* We must guarantee clearing of inode... */
 	clear_inode(inode);
 }
 /*
  * One link of a block-mapping chain: the slot a block number was read
  * from, the value found in that slot, and the buffer that holds the slot.
  */
 typedef struct {
 	__le32	*p;	/* address of the slot containing the block number */
 	__le32	key;	/* value read from *p, kept as stored (__le32) */
 	struct buffer_head *bh;	/* buffer hosting the slot; NULL when none */
 } Indirect;
 /* Fill chain link @p: remember slot @v, the key stored in it, and @bh. */
 static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
 {
 	p->p = v;
 	p->key = *p->p;
 	p->bh = bh;
 }
 /**
  *	ext4_block_to_path - parse the block number into array of offsets
  *	@inode: inode in question (we are only interested in its superblock)
  *	@i_block: block number to be parsed
  *	@offsets: array to store the offsets in
  *	@boundary: set this non-zero if the referred-to block is likely to be
  *	       followed (on disk) by an indirect block.
  *
  *	To store the locations of file's data ext4 uses a data structure common
  *	for UNIX filesystems - tree of pointers anchored in the inode, with
  *	data blocks at leaves and indirect blocks in intermediate nodes.
  *	This function translates the block number into path in that tree -
  *	return value is the path length and @offsets[n] is the offset of
  *	pointer to (n+1)th node in the nth one. If @i_block is out of range
  *	(negative or too large) a warning is printed and zero is returned.
  *
  *	Note: function doesn't find node addresses, so no IO is needed. All
  *	we need to know is the capacity of indirect blocks (taken from the
  *	inode->i_sb).
  */
 /*
  * Portability note: the last comparison (check that we fit into triple
  * indirect block) is spelled differently, because otherwise on an
  * architecture with 32-bit longs and 8Kb pages we might get into trouble
  * if our filesystem had 8Kb blocks. We might use long long, but that would
  * kill us on x86. Oh, well, at least the sign propagation does not matter -
  * i_block would have to be negative in the very beginning, so we would not
  * get there at all.
  */
 static int ext4_block_to_path(struct inode *inode,
 			ext4_lblk_t i_block,
 			ext4_lblk_t offsets[4], int *boundary)
 {
 	/* Block pointers per indirect block, and log2 of that count. */
 	int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
 	int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
 	/* Number of blocks covered by each level of the tree. */
 	const long direct_blocks = EXT4_NDIR_BLOCKS,
 		indirect_blocks = ptrs,
 		double_blocks = (1 << (ptrs_bits * 2));
 	int n = 0;	/* path length built so far; 0 means "out of range" */
 	int final = 0;	/* size of the pointer array holding the last slot */
 	/*
 	 * Each range test below that fails has already subtracted that
 	 * range's size from i_block, so in every later branch i_block is
 	 * relative to the start of the range being tested.
 	 *
 	 * NOTE(review): if ext4_lblk_t is an unsigned type (its definition
 	 * is not visible here) the first test can never be true - confirm.
 	 */
 	if (i_block < 0) {
 		ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0");
 	} else if (i_block < direct_blocks) {
 		/* Direct pointer held in the inode. */
 		offsets[n++] = i_block;
 		final = direct_blocks;
 	} else if ( (i_block -= direct_blocks) < indirect_blocks) {
 		/* Single indirect: inode slot, then offset in that block. */
 		offsets[n++] = EXT4_IND_BLOCK;
 		offsets[n++] = i_block;
 		final = ptrs;
 	} else if ((i_block -= indirect_blocks) < double_blocks) {
 		/* Double indirect: high bits pick the block, low bits the slot. */
 		offsets[n++] = EXT4_DIND_BLOCK;
 		offsets[n++] = i_block >> ptrs_bits;
 		offsets[n++] = i_block & (ptrs - 1);
 		final = ptrs;
 	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
 		/* Triple indirect: three groups of ptrs_bits bits. */
 		offsets[n++] = EXT4_TIND_BLOCK;
 		offsets[n++] = i_block >> (ptrs_bits * 2);
 		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
 		offsets[n++] = i_block & (ptrs - 1);
 		final = ptrs;
 	} else {
 		/* Undo the subtractions so the warning shows the original block. */
 		ext4_warning(inode->i_sb, "ext4_block_to_path",
 				"block %lu > max",
 				i_block + direct_blocks +
 				indirect_blocks + double_blocks);
 	}
 	/*
 	 * Slots left after this one in the pointer array that holds it.
 	 * Written even when n == 0 (final is 0 then), so the value is only
 	 * meaningful for a non-zero return - presumably callers check n
 	 * first; verify against callers.
 	 */
 	if (boundary)
 		*boundary = final - 1 - (i_block & (ptrs - 1));
 	return n;
 }
 /**
  *	ext4_get_branch - read the chain of indirect blocks leading to data
  *	@inode: inode in question
  *	@depth: depth of the chain (1 - direct pointer, etc.)
  *	@offsets: offsets of pointers in inode/indirect blocks
  *	@chain: place to store the result
  *	@err: here we store the error value
  *
  *	Function fills the array of triples <key, p, bh> and returns %NULL
  *	if everything went OK or the pointer to the last filled triple
  *	(incomplete one) otherwise. Upon the return chain[i].key contains
  *	the number of (i+1)-th block in the chain (as it is stored in memory,
  *	i.e. little-endian 32-bit), chain[i].p contains the address of that
  *	number (it points into struct inode for i==0 and into the bh->b_data
  *	for i>0) and chain[i].bh points to the buffer_head of i-th indirect
  *	block for i>0 and NULL for i==0. In other words, it holds the block
  *	numbers of the chain, addresses they were taken from (and where we can
  *	verify that chain did not change) and buffer_heads hosting these
  *	numbers.
  *
  *	Function stops when it stumbles upon zero pointer (absent block)
  *		(pointer to last triple returned, *@err == 0)
  *	or when it gets an IO error reading an indirect block
  *		(ditto, *@err == -EIO)
  *	or when it reads all @depth-1 indirect blocks successfully and finds
  *	the whole chain, all way to the data (returns %NULL, *err == 0).
  *
  *      Need to be called with
  *      down_read(&EXT4_I(inode)->i_data_sem)
  */
 static Indirect *ext4_get_branch(struct inode *inode, int depth,
 				 ext4_lblk_t  *offsets,
 				 Indirect chain[4], int *err)
 {
 	struct super_block *sb = inode->i_sb;
 	Indirect *link = chain;
 	struct buffer_head *bh;
 	*err = 0;
 	/* i_data is not going away, no lock needed */
 	add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
 	/* A zero key means the block is absent: hand back the partial link. */
 	if (!link->key)
 		return link;
 	/* Walk the remaining depth - 1 levels through the indirect blocks. */
 	for (depth--; depth; depth--) {
 		bh = sb_bread(sb, le32_to_cpu(link->key));
 		if (!bh) {
 			/* Could not read the indirect block. */
 			*err = -EIO;
 			return link;
 		}
 		link++;
 		offsets++;
 		add_chain(link, bh, (__le32 *)bh->b_data + *offsets);
 		/* Reader: end */
 		if (!link->key)
 			return link;
 	}
 	/* Whole chain found, all the way to the data. */
 	return NULL;
 }
 /**
  *	ext4_find_near - find a place for allocation with sufficient locality
  *	@inode: owner
  *	@ind: descriptor of indirect block.
  *
  *	This function returns the preferred place for block allocation.
  *	It is used when heuristic for sequential allocation fails.
  *	Rules are:
  *	  + if there is a block to the left of our position - allocate near it.
  *	  + if pointer will live in indirect block - allocate near that block.
  *	  + if pointer will live in inode - allocate in the same
  *	    cylinder group.
  *
  * In the latter case we colour the starting block by the callers PID to
  * prevent it from clashing with concurrent allocations for a different inode
  * in the same block group.   The PID is used here so that functionally related
  * files will be close-by on-disk.
  *
  *	Caller must make sure that @ind is valid and will stay that way.
  */
 static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	__le32 *start;
 	__le32 *p;
 	ext4_fsblk_t group_first;
 	ext4_fsblk_t fs_last;
 	ext4_grpblk_t colour;
 	/* The pointer array is the indirect block's data or the inode's i_data. */
 	if (ind->bh)
 		start = (__le32 *) ind->bh->b_data;
 	else
 		start = ei->i_data;
 	/* Scan leftwards from our slot for an already-mapped block. */
 	p = ind->p;
 	while (p > start) {
 		p--;
 		if (*p)
 			return le32_to_cpu(*p);
 	}
 	/* Nothing to the left: fall back to the indirect block's location. */
 	if (ind->bh)
 		return ind->bh->b_blocknr;
 	/*
 	 * The pointer lives in the inode itself, so stay within the
 	 * inode's block group, offset by a PID-derived colour.
 	 */
 	group_first = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
 	fs_last = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
 	if (group_first + EXT4_BLOCKS_PER_GROUP(inode->i_sb) > fs_last)
 		/* Group extends past the last block: colour over what remains. */
 		colour = (current->pid % 16) * ((fs_last - group_first) / 16);
 	else
 		colour = (current->pid % 16) *
 			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
 	return group_first + colour;
 }
 /**
  *	ext4_find_goal - find a preferred place for allocation.
  *	@inode: owner
  *	@block:  block we want
  *	@partial: pointer to the last triple within a chain
  *
  *	Returns the preferred physical block for the allocation: the block
  *	right after the previous allocation when @block continues it
  *	sequentially, otherwise whatever ext4_find_near() suggests.
  */
 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
 		Indirect *partial)
 {
 	struct ext4_block_alloc_info *alloc_info =
 		EXT4_I(inode)->i_block_alloc_info;
 	/*
 	 * Without allocation info, or when this request does not directly
 	 * follow the last recorded allocation, settle for decent locality.
 	 */
 	if (!alloc_info ||
 	    block != alloc_info->last_alloc_logical_block + 1 ||
 	    alloc_info->last_alloc_physical_block == 0)
 		return ext4_find_near(inode, partial);
 	/* Sequential allocation heuristic. */
 	return alloc_info->last_alloc_physical_block + 1;
 }
 /**
  *	ext4_blks_to_allocate: Look up the block map and count the number
  *	of direct blocks that need to be allocated for the given branch.
  *
  *	@branch: chain of indirect blocks
  *	@k: number of blocks needed for indirect blocks
  *	@blks: number of data blocks to be mapped.
  *	@blocks_to_boundary:  the offset in the indirect block
  *
  *	Returns the number of direct blocks to allocate, never reaching
  *	past the boundary of the current pointer array.
  */
 static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
 		int blocks_to_boundary)
 {
 	unsigned long count;
 	/*
 	 * Simple case: some [t,d]indirect block(s) are not allocated yet,
 	 * so no block on that path can be allocated either.  Cross-boundary
 	 * allocation is not handled right now, hence the clamp.
 	 */
 	if (k > 0)
 		return (blks < blocks_to_boundary + 1) ?
 			blks : blocks_to_boundary + 1;
 	/* Otherwise count the unmapped slots that follow the first one. */
 	for (count = 1;
 	     count < blks && count <= blocks_to_boundary &&
 	     le32_to_cpu(branch[0].p[count]) == 0;
 	     count++)
 		;
 	return count;
 }
 /**
  *	ext4_alloc_blocks: allocate the blocks needed for a branch
  *	@handle: handle passed on to the block allocator
  *	@inode: owner
  *	@goal: goal block passed on to the block allocator
  *	@indirect_blks: the number of blocks that must be allocated for
  *			indirect blocks
  *	@blks: the number of direct blocks wanted
  *	@new_blocks: on return holds the block numbers of the indirect
  *	blocks (if any) followed by the first direct block
  *	@err: receives the allocator's error, or 0 on success
  *
  *	Returns the number of direct blocks allocated.  On failure the
  *	indirect blocks obtained so far are freed, 0 is returned and *@err
  *	keeps the allocator's error.
  */
 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, int indirect_blks, int blks,
 			ext4_fsblk_t new_blocks[4], int *err)
 {
 	int target, i;
 	unsigned long count = 0;
 	int index = 0;
 	ext4_fsblk_t current_block = 0;
 	/*
 	 * Try to get all requested blocks at once, on a best-effort basis.
 	 * The required minimum for building a branch is every missing
 	 * indirect block plus at least the first direct block.
 	 */
 	target = blks + indirect_blks;
 	do {
 		count = target;
 		/* allocating blocks for indirect blocks and direct blocks */
 		current_block = ext4_new_blocks(handle,inode,goal,&count,err);
 		if (*err)
 			goto failed_out;
 		target -= count;
 		/* Hand out the leading blocks of this extent as indirect blocks. */
 		for (; index < indirect_blks && count; count--)
 			new_blocks[index++] = current_block++;
 		/* Go again while nothing is left over for a direct block. */
 	} while (!count);
 	/* save the new block number for the first direct block */
 	new_blocks[index] = current_block;
 	*err = 0;
 	/* What remains of the last extent is the run of direct blocks. */
 	return count;
 failed_out:
 	i = 0;
 	while (i < index) {
 		ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
 		i++;
 	}
 	return 0;
 }
 /**
  *	ext4_alloc_branch - allocate and set up a chain of blocks.
  *	@inode: owner
  *	@indirect_blks: number of allocated indirect blocks
  *	@blks: number of allocated direct blocks
  *	@offsets: offsets (in the blocks) to store the pointers to next.
  *	@branch: place to store the chain in.
  *
  *	This function allocates blocks, zeroes out all but the last one,
  *	links them into chain and (if we are synchronous) writes them to disk.
  *	In other words, it prepares a branch that can be spliced onto the
  *	inode. It stores the information about that chain in the branch[], in
  *	the same format as ext4_get_branch() would do. We are calling it after
  *	we had read the existing part of chain and partial points to the last
  *	triple of that (one with zero ->key). Upon the exit we have the same
  *	picture as after the successful ext4_get_block(), except that in one
  *	place chain is disconnected - *branch->p is still zero (we did not
  *	set the last link), but branch->key contains the number that should
  *	be placed into *branch->p to fill that gap.
  *
  *	If allocation fails we free all blocks we've allocated (and forget
  *	their buffer_heads) and return the error value from the failed
  *	ext4_alloc_blocks() call (normally -ENOSPC). Otherwise we set the chain
  *	as described above and return 0.
  */
 static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 			int indirect_blks, int *blks, ext4_fsblk_t goal,
 			ext4_lblk_t *offsets, Indirect *branch)
 {
 	int blocksize = inode->i_sb->s_blocksize;
 	int i, n = 0;
 	int err = 0;
 	struct buffer_head *bh;
 	int num;
 	ext4_fsblk_t new_blocks[4];
 	ext4_fsblk_t current_block;
 	/* num is the count of direct blocks actually obtained. */
 	num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,
 				*blks, new_blocks, &err);
 	if (err)
 		return err;
 	/* Value that will later be spliced into the missing link. */
 	branch[0].key = cpu_to_le32(new_blocks[0]);
 	/*
 	 * metadata blocks and data blocks are allocated.
 	 */
 	for (n = 1; n <= indirect_blks;  n++) {
 		/*
 		 * Get buffer_head for parent block, zero it out
 		 * and set the pointer to new one, then send
 		 * parent to disk.
 		 */
 		bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
 		branch[n].bh = bh;
 		/* No buffer: bail out before it gets locked or written to. */
 		if (!bh) {
 			err = -ENOMEM;
 			goto failed;
 		}
 		lock_buffer(bh);
 		BUFFER_TRACE(bh, "call get_create_access");
 		err = ext4_journal_get_create_access(handle, bh);
 		if (err) {
 			unlock_buffer(bh);
 			brelse(bh);
 			/*
 			 * Our reference is gone; clear the slot so that the
 			 * cleanup loop does not release this buffer again.
 			 */
 			branch[n].bh = NULL;
 			goto failed;
 		}
 		memset(bh->b_data, 0, blocksize);
 		branch[n].p = (__le32 *) bh->b_data + offsets[n];
 		branch[n].key = cpu_to_le32(new_blocks[n]);
 		*branch[n].p = branch[n].key;
 		if ( n == indirect_blks) {
 			current_block = new_blocks[n];
 			/*
 			 * End of chain, update the last new metablock of
 			 * the chain to point to the new allocated
 			 * data blocks numbers
 			 */
 			for (i=1; i < num; i++)
 				*(branch[n].p + i) = cpu_to_le32(++current_block);
 		}
 		BUFFER_TRACE(bh, "marking uptodate");
 		set_buffer_uptodate(bh);
 		unlock_buffer(bh);
 		BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
 		err = ext4_journal_dirty_metadata(handle, bh);
 		if (err)
 			goto failed;
 	}
 	*blks = num;
 	return err;
 failed:
 	/* Allocation failed, free what we already allocated */
 	for (i = 1; i <= n ; i++) {
 		/* Slot n may hold no buffer when the failure hit it early. */
 		if (!branch[i].bh)
 			continue;
 		BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
 		ext4_journal_forget(handle, branch[i].bh);
 	}
 	for (i = 0; i <indirect_blks; i++)
 		ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
 	/* i == indirect_blks here: new_blocks[i] starts the direct-block run. */
 	ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
 	return err;
 }
 /**
  * ext4_splice_branch - splice the allocated branch onto inode.
  * @inode: owner
  * @block: (logical) number of block we are adding
  * @chain: chain of indirect blocks (with a missing link - see
  *	ext4_alloc_branch)
  * @where: location of missing link
  * @num:   number of indirect blocks we are adding
  * @blks:  number of direct blocks we are adding
  *
  * This function fills the missing link and does all housekeeping needed in
  * inode (->i_blocks, etc.). In case of success we end up with the full
  * chain to new block and return 0.
  */
 static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 			ext4_lblk_t block, Indirect *where, int num, int blks)
 {
 	int i;
 	int err = 0;
 	struct ext4_block_alloc_info *block_i;
 	ext4_fsblk_t current_block;
 	/* May be NULL; it is only updated further down when present. */
 	block_i = EXT4_I(inode)->i_block_alloc_info;
 	/*
 	 * If we're splicing into a [td]indirect block (as opposed to the
 	 * inode) then we need to get write access to the [td]indirect block
 	 * before the splice.
 	 */
 	if (where->bh) {
 		BUFFER_TRACE(where->bh, "get_write_access");
 		err = ext4_journal_get_write_access(handle, where->bh);
 		if (err)
 			goto err_out;
 	}
 	/* That's it */
 	/* Store the missing link: the new branch becomes reachable here. */
 	*where->p = where->key;
 	/*
 	 * Update the host buffer_head or inode to point to more just allocated
 	 * direct blocks
 	 */
 	/*
 	 * Only when no indirect block was added (num == 0): the extra data
 	 * blocks are consecutive, so fill the slots following where->p.
 	 */
 	if (num == 0 && blks > 1) {
 		current_block = le32_to_cpu(where->key) + 1;
 		for (i = 1; i < blks; i++)
 			*(where->p + i ) = cpu_to_le32(current_block++);
 	}
 	/*
 	 * update the most recently allocated logical & physical block
 	 * in i_block_alloc_info, to assist find the proper goal block for next
 	 * allocation
 	 */
 	if (block_i) {
 		block_i->last_alloc_logical_block = block + blks - 1;
 		block_i->last_alloc_physical_block =
 				le32_to_cpu(where[num].key) + blks - 1;
 	}
 	/* We are done with atomic stuff, now do the rest of housekeeping */
 	inode->i_ctime = ext4_current_time(inode);
 	/* The return value of ext4_mark_inode_dirty() is not checked here. */
 	ext4_mark_inode_dirty(handle, inode);
 	/* had we spliced it onto indirect block? */
 	if (where->bh) {
 		/*
 		 * If we spliced it onto an indirect block, we haven't
 		 * altered the inode.  Note however that if it is being spliced
 		 * onto an indirect block at the very end of the file (the
 		 * file is growing) then we *will* alter the inode to reflect
 		 * the new i_size.  But that is not done here - it is done in
 		 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
 		 */
 		jbd_debug(5, "splicing indirect only\n");
 		BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata");
 		err = ext4_journal_dirty_metadata(handle, where->bh);
 		if (err)
 			goto err_out;
 	} else {
 		/*
 		 * OK, we spliced it into the inode itself on a direct block.
 		 * Inode was dirtied above.
 		 */
 		jbd_debug(5, "splicing direct\n");
 	}
 	return err;
 err_out:
 	/*
 	 * Undo the allocation: forget each new indirect buffer and free the
 	 * block it lives in (where[i-1].key), then free the data blocks.
 	 */
 	for (i = 1; i <= num; i++) {
 		BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
 		ext4_journal_forget(handle, where[i].bh);
 		ext4_free_blocks(handle, inode,
 					le32_to_cpu(where[i-1].key), 1, 0);
 	}
 	ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
 	return err;
 }
 /*
  * Allocation strategy is simple: if we have to allocate something, we will
  * have to go the whole way to leaf. So let's do it before attaching anything
  * to tree, set linkage between the newborn blocks, write them if sync is
  * required, recheck the path, free and repeat if check fails, otherwise
  * set the last missing link (that will protect us from any truncate-generated
  * removals - all blocks on the path are immune now) and possibly force the
  * write on the parent block.
  * That has a nice additional property: no special recovery from the failed
  * allocations is needed - we simply release blocks and do not touch anything
  * reachable from inode.
  *
  * `handle' can be NULL if create == 0.
  *
  * return > 0, # of blocks mapped or allocated.
  * return = 0, if plain lookup failed.
  * return < 0, error case.
  *
  *
  * Need to be called with
  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
  * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
  */
 int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
 		ext4_lblk_t iblock, unsigned long maxblocks,
 		struct buffer_head *bh_result,
 		int create, int extend_disksize)
 {
 	int err = -EIO;
 	ext4_lblk_t offsets[4];
 	Indirect chain[4];
 	Indirect *partial;
 	ext4_fsblk_t goal;
 	int indirect_blks;
 	int blocks_to_boundary = 0;
 	int depth;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	int count = 0;
 	ext4_fsblk_t first_block = 0;
 	/* This path is for indirect-mapped inodes only. */
 	J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
 	J_ASSERT(handle != NULL || create == 0);
 	depth = ext4_block_to_path(inode, iblock, offsets,
 					&blocks_to_boundary);
 	/* A zero depth leaves err at its initial -EIO. */
 	if (depth == 0)
 		goto out;
 	partial = ext4_get_branch(inode, depth, offsets, chain, &err);
 	/* Simplest case - block found, no allocation needed */
 	if (!partial) {
 		first_block = le32_to_cpu(chain[depth - 1].key);
 		clear_buffer_new(bh_result);
 		count++;
 		/*map more blocks*/
 		/*
 		 * Extend the mapping while the following slots hold physically
 		 * consecutive block numbers, up to maxblocks and the boundary.
 		 */
 		while (count < maxblocks && count <= blocks_to_boundary) {
 			ext4_fsblk_t blk;
 			blk = le32_to_cpu(*(chain[depth-1].p + count));
 			if (blk == first_block + count)
 				count++;
 			else
 				break;
 		}
 		goto got_it;
 	}
 	/* Next simple case - plain lookup or failed read of indirect block */
 	if (!create || err == -EIO)
 		goto cleanup;
 	/*
 	 * Okay, we need to do block allocation.  Lazily initialize the block
 	 * allocation info here if necessary
 	*/
 	if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
 		ext4_init_block_alloc_info(inode);
 	goal = ext4_find_goal(inode, iblock, partial);
 	/* the number of blocks need to allocate for [d,t]indirect blocks */
 	indirect_blks = (chain + depth) - partial - 1;
 	/*
 	 * Next look up the indirect map to count the total number of
 	 * direct blocks to allocate for this branch.
 	 */
 	count = ext4_blks_to_allocate(partial, indirect_blks,
 					maxblocks, blocks_to_boundary);
 	/*
 	 * Block out ext4_truncate while we alter the tree
 	 */
 	/* count is in/out: updated to the number of data blocks obtained. */
 	err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,
 				offsets + (partial - chain), partial);
 	/*
 	 * The ext4_splice_branch call will free and forget any buffers
 	 * on the new chain if there is a failure, but that risks using
 	 * up transaction credits, especially for bitmaps where the
 	 * credits cannot be returned.  Can we handle this somehow?  We
 	 * may need to return -EAGAIN upwards in the worst case.  --sct
 	 */
 	if (!err)
 		err = ext4_splice_branch(handle, inode, iblock,
 					partial, indirect_blks, count);
 	/*
 	 * i_disksize growing is protected by i_data_sem.  Don't forget to
 	 * protect it if you're about to implement concurrent
 	 * ext4_get_block() -bzzz
 	*/
 	if (!err && extend_disksize && inode->i_size > ei->i_disksize)
 		ei->i_disksize = inode->i_size;
 	if (err)
 		goto cleanup;
 	/* Tell the caller these blocks were freshly allocated. */
 	set_buffer_new(bh_result);
 got_it:
 	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
 	if (count > blocks_to_boundary)
 		set_buffer_boundary(bh_result);
 	/* On success the return value is the number of blocks mapped. */
 	err = count;
 	/* Clean up and exit */
 	partial = chain + depth - 1;	/* the whole chain */
 cleanup:
 	/* Release buffers from partial back down to, but excluding, chain[0]. */
 	while (partial > chain) {
 		BUFFER_TRACE(partial->bh, "call brelse");
 		brelse(partial->bh);
 		partial--;
 	}
 	BUFFER_TRACE(bh_result, "returned");
 out:
 	return err;
 }
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
 /*
  * Number of credits we need for writing DIO_MAX_BLOCKS:
  * We need sb + group descriptor + bitmap + inode -> 4
  * For B blocks with A block pointers per block we need:
  * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
  * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25:
  * 4 + 1 + (4096/256/256 + 2) + (4096/256 + 2) = 4 + 1 + 2 + 18.
  * Keep this value in step with DIO_MAX_BLOCKS if that limit changes.
  */
 #define DIO_CREDITS 25
 /*
  * ext4_get_blocks_wrap() - look up, and optionally allocate, the blocks
  * backing a logical range of @inode.
  *
  * A lookup is attempted first under the read side of i_data_sem.  If the
  * caller only wanted a lookup (create == 0), or the lookup already produced
  * a mapped buffer head, that result is returned as is.  Otherwise the write
  * side of i_data_sem is taken and the request is repeated with the caller's
  * create/extend_disksize arguments.
  *
  * Extent-based inodes go through ext4_ext_get_blocks(); indirect-mapped
  * inodes go through ext4_get_blocks_handle().
  *
  * On success, it returns the number of blocks being mapped or allocated.
  * If create == 0 and the blocks are preallocated but uninitialized, the
  * result buffer head is left unmapped.  If create == 1, it will make sure
  * the buffer head is mapped.
  *
  * It returns 0 if a plain lookup failed (blocks have not been allocated);
  * in that case the buffer head is unmapped.
  *
  * It returns the error in case of allocation failure.
  */
 int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
 			unsigned long max_blocks, struct buffer_head *bh,
 			int create, int extend_disksize)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	int mapped;

 	clear_buffer_mapped(bh);

 	/* First pass: lookup only, no new file system blocks requested. */
 	down_read(&ei->i_data_sem);
 	if (ei->i_flags & EXT4_EXTENTS_FL)
 		mapped = ext4_ext_get_blocks(handle, inode, block, max_blocks,
 				bh, 0, 0);
 	else
 		mapped = ext4_get_blocks_handle(handle,
 				inode, block, max_blocks, bh, 0, 0);
 	up_read(&ei->i_data_sem);

 	/*
 	 * Done if this was a pure lookup, or if the blocks already exist.
 	 * Preallocated blocks come back from the create == 0 lookup with the
 	 * buffer head unmapped, so they fall through to the second pass.
 	 */
 	if (!create || (mapped > 0 && buffer_mapped(bh)))
 		return mapped;

 	/*
 	 * Second pass: allocating blocks, or writing to an uninitialized
 	 * extent, may update i_data, so hold i_data_sem for writing.
 	 */
 	down_write(&ei->i_data_sem);

 	/*
 	 * The extents flag is tested again because a migrate could have
 	 * changed the inode type while the semaphore was dropped.
 	 */
 	if (ei->i_flags & EXT4_EXTENTS_FL) {
 		mapped = ext4_ext_get_blocks(handle, inode, block, max_blocks,
 				bh, create, extend_disksize);
 	} else {
 		mapped = ext4_get_blocks_handle(handle, inode, block,
 				max_blocks, bh, create, extend_disksize);
 		/*
 		 * Newly allocated blocks change the format of i_data, so
 		 * force a migrate to fail by clearing the migrate flag.
 		 */
 		if (mapped > 0 && buffer_new(bh))
 			ei->i_flags &= ~EXT4_EXT_MIGRATE;
 	}
 	up_write(&ei->i_data_sem);

 	return mapped;
 }
 /*
  * get_block callback: map (and, if @create, allocate) the blocks that back
  * @bh_result starting at logical block @iblock.  When allocation is
  * requested and no journal handle is running, a handle is started here and
  * stopped again before returning.  Returns 0 on success, with
  * bh_result->b_size set to the mapped length in bytes, or a negative error.
  */
 static int ext4_get_block(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create)
 {
 	handle_t *handle = ext4_journal_current_handle();
 	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
 	int own_handle = 0;
 	int ret;

 	if (create && !handle) {
 		/* Direct IO write: cap the request to what DIO_CREDITS covers */
 		if (max_blocks > DIO_MAX_BLOCKS)
 			max_blocks = DIO_MAX_BLOCKS;
 		handle = ext4_journal_start(inode, DIO_CREDITS +
 			      2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
 		if (IS_ERR(handle))
 			return PTR_ERR(handle);
 		own_handle = 1;
 	}

 	ret = ext4_get_blocks_wrap(handle, inode, iblock,
 					max_blocks, bh_result, create, 0);
 	if (ret > 0) {
 		/* Turn the block count into a byte length and report success */
 		bh_result->b_size = (ret << inode->i_blkbits);
 		ret = 0;
 	}

 	/* Only stop a handle that was started in this function. */
 	if (own_handle)
 		ext4_journal_stop(handle);
 	return ret;
 }
 /*
  * `handle' can be NULL if create is zero
  */
 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 				ext4_lblk_t block, int create, int *errp)
 {
 	/* On-stack buffer head, used only to receive the mapping result. */
 	struct buffer_head dummy;
 	int fatal = 0, err;
 	J_ASSERT(handle != NULL || create == 0);
 	dummy.b_state = 0;
 	dummy.b_blocknr = -1000;
 	buffer_trace_init(&dummy.b_history);
 	/* Map exactly one block; extend_disksize is passed as 1. */
 	err = ext4_get_blocks_wrap(handle, inode, block, 1,
 					&dummy, create, 1);
 	/*
 	 * ext4_get_blocks_handle() returns number of blocks
 	 * mapped. 0 in case of a HOLE.
 	 */
 	if (err > 0) {
 		/* Only one block was requested, so more than one is a bug. */
 		if (err > 1)
 			WARN_ON(1);
 		err = 0;
 	}
 	*errp = err;
 	/*
 	 * If the block is not mapped and there was no error (a hole), the
 	 * function falls through and returns NULL with *errp == 0.
 	 */
 	if (!err && buffer_mapped(&dummy)) {
 		struct buffer_head *bh;
 		bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
 		if (!bh) {
 			*errp = -EIO;
 			goto err;
 		}
 		if (buffer_new(&dummy)) {
 			J_ASSERT(create != 0);
 			J_ASSERT(handle != NULL);
 			/*
 			 * Now that we do not always journal data, we should
 			 * keep in mind whether this should always journal the
 			 * new buffer as metadata.  For now, regular file
 			 * writes use ext4_get_block instead, so it's not a
 			 * problem.
 			 */
 			lock_buffer(bh);
 			BUFFER_TRACE(bh, "call get_create_access");
 			fatal = ext4_journal_get_create_access(handle, bh);
 			/* A newly allocated block is handed back zero-filled. */
 			if (!fatal && !buffer_uptodate(bh)) {
 				memset(bh->b_data,0,inode->i_sb->s_blocksize);
 				set_buffer_uptodate(bh);
 			}
 			unlock_buffer(bh);
 			BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
 			/*
 			 * Called even when get_create_access failed; the
 			 * first error seen is the one kept in fatal.
 			 */
 			err = ext4_journal_dirty_metadata(handle, bh);
 			if (!fatal)
 				fatal = err;
 		} else {
 			BUFFER_TRACE(bh, "not a new buffer");
 		}
 		if (fatal) {
 			*errp = fatal;
 			brelse(bh);
 			bh = NULL;
 		}
 		return bh;
 	}
 err:
 	return NULL;
 }
 /*
  * Like ext4_getblk(), but additionally makes sure the returned buffer is
  * up to date, reading it in if needed.  Returns NULL when ext4_getblk()
  * returned NULL (with *err as set by it), or when the buffer is still not
  * up to date after the read (with *err set to -EIO).
  */
 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
 			       ext4_lblk_t block, int create, int *err)
 {
 	struct buffer_head *bh = ext4_getblk(handle, inode, block, create, err);

 	/* No buffer at all, or its contents are already valid. */
 	if (!bh || buffer_uptodate(bh))
 		return bh;

 	/* Submit a metadata read and wait for it to finish. */
 	ll_rw_block(READ_META, 1, &bh);
 	wait_on_buffer(bh);
 	if (!buffer_uptodate(bh)) {
 		put_bh(bh);
 		*err = -EIO;
 		return NULL;
 	}
 	return bh;
 }
 /*
  * Walk the ring of buffers starting at @head and call @fn on every buffer
  * that overlaps the byte range [@from, @to).  The walk stops at the first
  * non-zero return from @fn, and that value is returned.  Buffers outside
  * the range are skipped; if @partial is non-NULL, *partial is set to 1
  * when a skipped buffer is not up to date.
  */
 static int walk_page_buffers(	handle_t *handle,
 				struct buffer_head *head,
 				unsigned from,
 				unsigned to,
 				int *partial,
 				int (*fn)(	handle_t *handle,
 						struct buffer_head *bh))
 {
 	unsigned blocksize = head->b_size;
 	struct buffer_head *bh = head;
 	struct buffer_head *next;
 	unsigned block_start = 0;
 	unsigned block_end;
 	int ret = 0;

 	do {
 		/* Fetch the successor first: fn may alter the buffer. */
 		next = bh->b_this_page;
 		block_end = block_start + blocksize;

 		if (block_end <= from || block_start >= to) {
 			if (partial && !buffer_uptodate(bh))
 				*partial = 1;
 		} else {
 			ret = (*fn)(handle, bh);
 		}

 		block_start = block_end;
 		bh = next;
 	} while (ret == 0 && (bh != head || !block_start));

 	return ret;
 }
 /*
  * To preserve ordering, it is essential that the hole instantiation and
  * the data write be encapsulated in a single transaction.  We cannot
  * close off a transaction and start a new one between the ext4_get_block()
  * and the commit_write().  So doing the jbd2_journal_start at the start of
  * prepare_write() is the right place.
  *
  * Also, this function can nest inside ext4_writepage() ->
  * block_write_full_page(). In that case, we *know* that ext4_writepage()
  * has generated enough buffer credits to do the whole page.  So we won't
  * block on the journal in that case, which is good, because the caller may
  * be PF_MEMALLOC.
  *
  * By accident, ext4 can be reentered when a transaction is open via
  * quota file writes.  If we were to commit the transaction while thus
  * reentered, there can be a deadlock - we would be holding a quota
  * lock, and the commit would never complete if another thread had a
  * transaction open and was blocking on the quota lock - a ranking
  * violation.
  *
  * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
  * will _not_ run commit under these circumstances because handle->h_ref
  * is elevated.  We'll still have enough credits for the tiny quotafile
  * write.
  */
 /*
  * walk_page_buffers() callback: get journal write access to @bh.
  * Buffers that are unmapped or freed are skipped and reported as success.
  */
 static int do_journal_get_write_access(handle_t *handle,
 					struct buffer_head *bh)
 {
 	if (buffer_mapped(bh) && !buffer_freed(bh))
 		return ext4_journal_get_write_access(handle, bh);

 	return 0;
 }
 static int ext4_write_begin(struct file *file, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
 				struct page **pagep, void **fsdata)
 {
  	struct inode *inode = mapping->host;
 	int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
 	handle_t *handle;
 	int retries = 0;
  	struct page *page;
  	pgoff_t index;
  	unsigned from, to;
 	/* Page index and the byte range [from, to) inside that page. */
  	index = pos >> PAGE_CACHE_SHIFT;
  	from = pos & (PAGE_CACHE_SIZE - 1);
  	to = from + len;
 retry:
  	page = __grab_cache_page(mapping, index);
  	if (!page)
  		return -ENOMEM;
  	*pagep = page;
 	/* The handle is started after the page has been grabbed. */
   	handle = ext4_journal_start(inode, needed_blocks);
   	if (IS_ERR(handle)) {
  		unlock_page(page);
  		page_cache_release(page);
   		ret = PTR_ERR(handle);
   		goto out;
 	}
 	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 							ext4_get_block);
 	/* In data=journal mode the data buffers need write access too. */
 	if (!ret && ext4_should_journal_data(inode)) {
 		ret = walk_page_buffers(handle, page_buffers(page),
 				from, to, NULL, do_journal_get_write_access);
 	}
 	/*
 	 * On failure, undo everything taken above: stop the handle and drop
 	 * the page.  On success the handle stays running and the page stays
 	 * locked when this function returns.
 	 */
 	if (ret) {
 		ext4_journal_stop(handle);
  		unlock_page(page);
  		page_cache_release(page);
 	}
 	/* Start over from the page grab if ENOSPC looks retryable. */
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
 out:
 	return ret;
 }
 /*
  * Wrapper around jbd2_journal_dirty_data() that aborts the handle when the
  * call fails.  Returns the jbd2 result unchanged.
  */
 int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 {
 	int rc;

 	rc = jbd2_journal_dirty_data(handle, bh);
 	if (!rc)
 		return 0;

 	ext4_journal_abort_handle(__func__, __func__,
 					bh, handle, rc);
 	return rc;
 }
 /*
  * Per-buffer callback for write_end() in data=journal mode: mark a mapped,
  * non-freed buffer up to date and dirty it as journalled metadata.
  * Unmapped or freed buffers are skipped and reported as success.
  */
 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 {
 	if (buffer_mapped(bh) && !buffer_freed(bh)) {
 		set_buffer_uptodate(bh);
 		return ext4_journal_dirty_metadata(handle, bh);
 	}

 	return 0;
 }
 /*
  * Generic write_end handler for ordered and writeback ext4 journal modes.
  * We can't use generic_write_end, because that unlocks the page and we need to
  * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
  * after block_write_end.
  *
  * Completes the write with block_write_end() and grows i_size (marking the
  * inode dirty) when the copied data ends beyond the current i_size.
  * Returns the value produced by block_write_end().
  *
  * `file' can be NULL (eg, when called from page_symlink()), so the inode is
  * taken from @mapping, never from @file.
  */
 static int ext4_generic_write_end(struct file *file,
 				struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned copied,
 				struct page *page, void *fsdata)
 {
 	struct inode *inode = mapping->host;

 	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

 	/* Only ever extend i_size here; shrinking is not this path's job. */
 	if (pos+copied > inode->i_size) {
 		i_size_write(inode, pos+copied);
 		mark_inode_dirty(inode);
 	}

 	return copied;
 }
 /*
  * write_end for data=ordered mode.
  *
  * We need to pick up the new inode size which generic_commit_write gave us
  * `file' can be NULL - eg, when called from page_symlink() - so the inode
  * is taken from @mapping, never from @file.
  *
  * ext4 never places buffers on inode->i_mapping->private_list.  metadata
  * buffers are managed internally.
  *
  * Returns the number of bytes accepted, or a negative error.  The journal
  * handle is stopped and the page unlocked and released on every path.
  */
 static int ext4_ordered_write_end(struct file *file,
 				struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned copied,
 				struct page *page, void *fsdata)
 {
 	handle_t *handle = ext4_journal_current_handle();
 	struct inode *inode = mapping->host;
 	unsigned from, to;
 	int ret = 0, ret2;

 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;

 	/* File the written data buffers with the running transaction. */
 	ret = walk_page_buffers(handle, page_buffers(page),
 		from, to, NULL, ext4_journal_dirty_data);

 	if (ret == 0) {
 		/*
 		 * generic_write_end() will run mark_inode_dirty() if i_size
 		 * changes.  So let's piggyback the i_disksize mark_inode_dirty
 		 * into that.
 		 */
 		loff_t new_i_size;

 		new_i_size = pos + copied;
 		if (new_i_size > EXT4_I(inode)->i_disksize)
 			EXT4_I(inode)->i_disksize = new_i_size;
 		/*
 		 * `copied' is unsigned, so a negative result has to be tested
 		 * on the signed ret2 rather than on `copied' itself.
 		 */
 		ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
 		copied = ret2;
 		if (ret2 < 0)
 			ret = ret2;
 	}
 	/* The first error wins; a journal_stop error is kept otherwise. */
 	ret2 = ext4_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
 	/* The page is unlocked only after the handle has been stopped. */
 	unlock_page(page);
 	page_cache_release(page);

 	return ret ? ret : copied;
 }
 /*
  * write_end for data=writeback mode.
  *
  * Raises i_disksize to cover the written range, completes the write via
  * ext4_generic_write_end(), then stops the journal handle and unlocks and
  * releases the page.  Returns the number of bytes accepted, or a negative
  * error.  The inode is taken from @mapping rather than from @file.
  */
 static int ext4_writeback_write_end(struct file *file,
 				struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned copied,
 				struct page *page, void *fsdata)
 {
 	handle_t *handle = ext4_journal_current_handle();
 	struct inode *inode = mapping->host;
 	int ret = 0, ret2;
 	loff_t new_i_size;

 	new_i_size = pos + copied;
 	if (new_i_size > EXT4_I(inode)->i_disksize)
 		EXT4_I(inode)->i_disksize = new_i_size;

 	/*
 	 * `copied' is unsigned, so a negative result has to be tested on the
 	 * signed ret2 rather than on `copied' itself.
 	 */
 	ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
 	copied = ret2;
 	if (ret2 < 0)
 		ret = ret2;

 	/* The first error wins; a journal_stop error is kept otherwise. */
 	ret2 = ext4_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
 	/* The page is unlocked only after the handle has been stopped. */
 	unlock_page(page);
 	page_cache_release(page);

 	return ret ? ret : copied;
 }
 /*
  * write_end for data=journal mode: the written buffers are dirtied as
  * journalled metadata via write_end_fn(), i_size/i_disksize are updated,
  * and the handle is stopped before the page is unlocked and released.
  * Returns the number of bytes accepted, or a negative error.
  */
 static int ext4_journalled_write_end(struct file *file,
 				struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned copied,
 				struct page *page, void *fsdata)
 {
 	handle_t *handle = ext4_journal_current_handle();
 	struct inode *inode = mapping->host;
 	int ret = 0, ret2;
 	int partial = 0;
 	unsigned from, to;
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
 	/*
 	 * Short copy: if the page is not up to date, nothing is counted as
 	 * copied; new buffers in the uncopied part are zeroed.
 	 */
 	if (copied < len) {
 		if (!PageUptodate(page))
 			copied = 0;
 		page_zero_new_buffers(page, from+copied, to);
 	}
 	ret = walk_page_buffers(handle, page_buffers(page), from,
 				to, &partial, write_end_fn);
 	/* partial stays 0 when no buffer outside [from, to) was stale. */
 	if (!partial)
 		SetPageUptodate(page);
 	if (pos+copied > inode->i_size)
 		i_size_write(inode, pos+copied);
 	/* Remember that this inode has journalled data (see ext4_bmap()). */
 	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
 	if (inode->i_size > EXT4_I(inode)->i_disksize) {
 		EXT4_I(inode)->i_disksize = inode->i_size;
 		ret2 = ext4_mark_inode_dirty(handle, inode);
 		if (!ret)
 			ret = ret2;
 	}
 	/* The first error wins; later errors do not overwrite it. */
 	ret2 = ext4_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
 	unlock_page(page);
 	page_cache_release(page);
 	return ret ? ret : copied;
 }
 /*
  * bmap() is special.  It gets used by applications such as lilo and by
  * the swapper to find the on-disk block of a specific piece of data.
  *
  * Naturally, this is dangerous if the block concerned is still in the
  * journal.  If somebody makes a swapfile on an ext4 data-journaling
  * filesystem and enables swap, then they may get a nasty shock when the
  * data getting swapped to that swapfile suddenly gets overwritten by
  * the original zero's written out previously to the journal and
  * awaiting writeback in the kernel's buffer cache.
  *
  * So, if we see any bmap calls here on a modified, data-journaled file,
  * take extra steps to flush any blocks which might be in the cache.
  */
 static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 {
 	struct inode *inode = mapping->host;
 	struct ext4_inode_info *ei = EXT4_I(inode);

 	if (ei->i_state & EXT4_STATE_JDATA) {
 		journal_t *journal;
 		int flush_err;

 		/*
 		 * Flushing the whole journal is a REALLY heavyweight answer,
 		 * but bmap on a dirty data-journalled file is expected to be
 		 * extremely rare: it should only happen when lilo or swapon
 		 * is run on a freshly made file.
 		 *
 		 * (bmap requires CAP_SYS_RAWIO, so this is not an
 		 * unprivileged user DOS attack --- we'd be in trouble if
 		 * mortal users could trigger this path at will.)
 		 *
 		 * NB. EXT4_STATE_JDATA is not set on files other than regular
 		 * files.  Anybody who bmaps a directory or symlink and gets
 		 * confused because the buffer hasn't yet been flushed to disk
 		 * deserves everything they get.
 		 */
 		ei->i_state &= ~EXT4_STATE_JDATA;
 		journal = EXT4_JOURNAL(inode);

 		jbd2_journal_lock_updates(journal);
 		flush_err = jbd2_journal_flush(journal);
 		jbd2_journal_unlock_updates(journal);

 		/* A failed flush is reported to the caller as block 0. */
 		if (flush_err)
 			return 0;
 	}

 	return generic_block_bmap(mapping,block,ext4_get_block);
 }
 /*
  * walk_page_buffers() callback: take an extra reference on @bh.
  * @handle is unused.  Always returns 0.
  */
 static int bget_one(handle_t *handle, struct buffer_head *bh)
 {
 	get_bh(bh);
 	return 0;
 }
 /*
  * walk_page_buffers() callback: drop one reference on @bh (the
  * counterpart of bget_one()).  @handle is unused.  Always returns 0.
  */
 static int bput_one(handle_t *handle, struct buffer_head *bh)
 {
 	put_bh(bh);
 	return 0;
 }
 /*
  * walk_page_buffers() callback: pass a mapped buffer to
  * ext4_journal_dirty_data(); unmapped buffers are skipped with 0.
  */
 static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
 {
 	if (!buffer_mapped(bh))
 		return 0;

 	return ext4_journal_dirty_data(handle, bh);
 }
 /*
  * Note that we always start a transaction even if we're not journalling
  * data.  This is to preserve ordering: any hole instantiation within
  * __block_write_full_page -> ext4_get_block() should be journalled
  * along with the data so we don't crash and then get metadata which
  * refers to old data.
  *
  * In all journalling modes block_write_full_page() will start the I/O.
  *
  * Problem:
  *
  *	ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
  *		ext4_writepage()
  *
  * Similar for:
  *
  *	ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
  *
  * Same applies to ext4_get_block().  We will deadlock on various things like
  * lock_journal and i_data_sem
  *
  * Setting PF_MEMALLOC here doesn't work - too many internal memory
  * allocations fail.
  *
  * 16May01: If we're reentered then journal_current_handle() will be
  *	    non-zero. We simply *return*.
  *
  * 1 July 2001: @@@ FIXME:
  *   In journalled data mode, a data buffer may be metadata against the
  *   current transaction.  But the same file is part of a shared mapping
  *   and someone does a writepage() on it.
  *
  *   We will move the buffer onto the async_data list, but *after* it has
  *   been dirtied. So there's a small window where we have dirty data on
  *   BJ_Metadata.
  *
  *   Note that this only applies to the last partial page in the file.  The
  *   bit which block_write_full_page() uses prepare/commit for.  (That's
  *   broken code anyway: it's wrong for msync()).
  *
  *   It's a rare case: affects the final partial page, for journalled data
  *   where the file is subject to both write() and writepage() in the same
  *   transaction.  To fix it we'll need a custom block_write_full_page().
  *   We'll probably need that anyway for journalling writepage() output.
  *
  * We don't honour synchronous mounts for writepage().  That would be
  * disastrous.  Any write() or metadata operation will sync the fs for
  * us.
  *
  * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
  * we don't need to open a transaction here.
  */
 static int ext4_ordered_writepage(struct page *page,
 				struct writeback_control *wbc)
 {
 	struct inode *inode = page->mapping->host;
 	struct buffer_head *page_bufs;
 	handle_t *handle = NULL;
 	int ret = 0;
 	int err;
 	J_ASSERT(PageLocked(page));
 	/*
 	 * We give up here if we're reentered, because it might be for a
 	 * different filesystem.
 	 */
 	/* In this case ret is still 0 when out_fail returns. */
 	if (ext4_journal_current_handle())
 		goto out_fail;
 	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		goto out_fail;
 	}
 	/* Make sure the page has buffers, created dirty and up to date. */
 	if (!page_has_buffers(page)) {
 		create_empty_buffers(page, inode->i_sb->s_blocksize,
 				(1 << BH_Dirty)|(1 << BH_Uptodate));
 	}
 	page_bufs = page_buffers(page);
 	/* Pin every buffer so page_bufs stays usable after the page unlocks. */
 	walk_page_buffers(handle, page_bufs, 0,
 			PAGE_CACHE_SIZE, NULL, bget_one);
 	ret = block_write_full_page(page, ext4_get_block, wbc);
 	/*
 	 * The page can become unlocked at any point now, and
 	 * truncate can then come in and change things.  So we
 	 * can't touch *page from now on.  But *page_bufs is
 	 * safe due to elevated refcount.
 	 */
 	/*
 	 * And attach them to the current transaction.  But only if
 	 * block_write_full_page() succeeded.  Otherwise they are unmapped,
 	 * and generally junk.
 	 */
 	if (ret == 0) {
 		err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
 					NULL, jbd2_journal_dirty_data_fn);
 		/* ret is known to be 0 here, so this always stores err. */
 		if (!ret)
 			ret = err;
 	}
 	/* Drop the references taken by bget_one above. */
 	walk_page_buffers(handle, page_bufs, 0,
 			PAGE_CACHE_SIZE, NULL, bput_one);
 	err = ext4_journal_stop(handle);
 	if (!ret)
 		ret = err;
 	return ret;
 out_fail:
 	/* Nothing was written: keep the page dirty and unlock it. */
 	redirty_page_for_writepage(wbc, page);
 	unlock_page(page);
 	return ret;
 }
 /*
  * writepage for data=writeback mode.  A transaction is started around the
  * write, unless one is already running, in which case the page is only
  * redirtied and unlocked.
  */
 static int ext4_writeback_writepage(struct page *page,
 				struct writeback_control *wbc)
 {
 	struct inode *inode = page->mapping->host;
 	handle_t *handle = NULL;
 	int ret = 0;
 	int err;
 	/* Reentered with a handle already running: ret stays 0. */
 	if (ext4_journal_current_handle())
 		goto out_fail;
 	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		goto out_fail;
 	}
 	/* The nobh path is used only with the NOBH mount option set. */
 	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
 		ret = nobh_writepage(page, ext4_get_block, wbc);
 	else
 		ret = block_write_full_page(page, ext4_get_block, wbc);
 	/* The first error wins; a journal_stop error is kept otherwise. */
 	err = ext4_journal_stop(handle);
 	if (!ret)
 		ret = err;
 	return ret;
 out_fail:
 	/* Nothing was written: keep the page dirty and unlock it. */
 	redirty_page_for_writepage(wbc, page);
 	unlock_page(page);
 	return ret;
 }
 /*
  * writepage for data=journal mode.  Pages without buffers, or pages with
  * PageChecked set, are journalled here and unlocked by this function;
  * other pages are handed to block_write_full_page().
  */
 static int ext4_journalled_writepage(struct page *page,
 				struct writeback_control *wbc)
 {
 	struct inode *inode = page->mapping->host;
 	handle_t *handle = NULL;
 	int ret = 0;
 	int err;
 	/* Reentered with a handle already running: ret stays 0. */
 	if (ext4_journal_current_handle())
 		goto no_write;
 	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		goto no_write;
 	}
 	if (!page_has_buffers(page) || PageChecked(page)) {
 		/*
 		 * It's mmapped pagecache.  Add buffers and journal it.  There
 		 * doesn't seem much point in redirtying the page here.
 		 */
 		ClearPageChecked(page);
 		ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
 					ext4_get_block);
 		if (ret != 0) {
 			ext4_journal_stop(handle);
 			goto out_unlock;
 		}
 		ret = walk_page_buffers(handle, page_buffers(page), 0,
 			PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
 		/*
 		 * The second walk runs even if the first one failed; the
 		 * first error is the one that is kept.
 		 */
 		err = walk_page_buffers(handle, page_buffers(page), 0,
 				PAGE_CACHE_SIZE, NULL, write_end_fn);
 		if (ret == 0)
 			ret = err;
 		EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
 		unlock_page(page);
 	} else {
 		/*
 		 * It may be a page full of checkpoint-mode buffers.  We don't
 		 * really know unless we go poke around in the buffer_heads.
 		 * But block_write_full_page will do the right thing.
 		 */
 		ret = block_write_full_page(page, ext4_get_block, wbc);
 	}
 	err = ext4_journal_stop(handle);
 	if (!ret)
 		ret = err;
 out:
 	return ret;
 no_write:
 	/* Nothing was written: keep the page dirty, then unlock below. */
 	redirty_page_for_writepage(wbc, page);
 out_unlock:
 	unlock_page(page);
 	goto out;
 }
 /*
  * readpage: delegate to mpage_readpage() with ext4_get_block as the block
  * mapper.  @file is unused.
  */
 static int ext4_readpage(struct file *file, struct page *page)
 {
 	return mpage_readpage(page, ext4_get_block);
 }
 /*
  * readpages: delegate to mpage_readpages() with ext4_get_block as the
  * block mapper.  @file is unused.
  */
 static int
 ext4_readpages(struct file *file, struct address_space *mapping,
 		struct list_head *pages, unsigned nr_pages)
 {
 	return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 }
 /*
  * invalidatepage: hand the page to the journal for invalidation from
  * @offset onwards.
  */
 static void ext4_invalidatepage(struct page *page, unsigned long offset)
 {
 	journal_t *journal = EXT4_JOURNAL(page->mapping->host);

 	/* A full truncate (offset 0) simply drops the pending dirtying. */
 	if (!offset)
 		ClearPageChecked(page);

 	jbd2_journal_invalidatepage(journal, page, offset);
 }
 /*
  * releasepage: ask the journal to free the page's buffers.  Returns 0
  * when the page has no buffers, otherwise whatever
  * jbd2_journal_try_to_free_buffers() returns.
  */
 static int ext4_releasepage(struct page *page, gfp_t wait)
 {
 	journal_t *journal = EXT4_JOURNAL(page->mapping->host);

 	/* A page with dirtying still pending is not expected here. */
 	WARN_ON(PageChecked(page));

 	return page_has_buffers(page) ?
 		jbd2_journal_try_to_free_buffers(journal, page, wait) : 0;
 }
 /*
  * If the O_DIRECT write will extend the file then add this inode to the
  * orphan list.  So recovery will truncate it back to the original size
  * if the machine crashes during the write.
  *
  * If the O_DIRECT write is instantiating holes inside i_size and the machine
  * crashes then stale disk data _may_ be exposed inside the file. But current
  * VFS code falls back into buffered path in that case so we are safe.
  */
 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
 			const struct iovec *iov, loff_t offset,
 			unsigned long nr_segs)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	handle_t *handle;
 	ssize_t ret;
 	/* Set once the inode has been put on the orphan list below. */
 	int orphan = 0;
 	size_t count = iov_length(iov, nr_segs);
 	if (rw == WRITE) {
 		loff_t final_size = offset + count;
 		/* Only a write that extends the file needs orphan protection. */
 		if (final_size > inode->i_size) {
 			/* Credits for sb + inode write */
 			handle = ext4_journal_start(inode, 2);
 			if (IS_ERR(handle)) {
 				ret = PTR_ERR(handle);
 				goto out;
 			}
 			ret = ext4_orphan_add(handle, inode);
 			if (ret) {
 				ext4_journal_stop(handle);
 				goto out;
 			}
 			orphan = 1;
 			ei->i_disksize = inode->i_size;
 			/* This short handle is closed before the I/O starts. */
 			ext4_journal_stop(handle);
 		}
 	}
 	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				 offset, nr_segs,
 				 ext4_get_block, NULL);
 	if (orphan) {
 		int err;
 		/* Credits for sb + inode write */
 		handle = ext4_journal_start(inode, 2);
 		if (IS_ERR(handle)) {
 			/* This is really bad luck. We've written the data
 			 * but cannot extend i_size. Bail out and pretend
 			 * the write failed... */
 			/*
 			 * NOTE(review): this path returns without calling
 			 * ext4_orphan_del() - presumably the inode stays on
 			 * the orphan list; confirm that is intended.
 			 */
 			ret = PTR_ERR(handle);
 			goto out;
 		}
 		/* The orphan entry is removed only while i_nlink is non-zero. */
 		if (inode->i_nlink)
 			ext4_orphan_del(handle, inode);
 		if (ret > 0) {
 			loff_t end = offset + ret;
 			if (end > inode->i_size) {
 				ei->i_disksize = end;
 				i_size_write(inode, end);
 				/*
 				 * We're going to return a positive `ret'
 				 * here due to non-zero-length I/O, so there's
 				 * no way of reporting error returns from
 				 * ext4_mark_inode_dirty() to userspace.  So
 				 * ignore it.
 				 */
 				ext4_mark_inode_dirty(handle, inode);
 			}
 		}
 		/* A journal_stop error is reported only when ret is 0. */
 		err = ext4_journal_stop(handle);
 		if (ret == 0)
 			ret = err;
 	}
 out:
 	return ret;
 }
 /*
  * .set_page_dirty handler of ext4_journalled_aops.
  *
  * A page may be dirtied at any moment, independently of ext4's journalling
  * activity (by filemap_sync_pte(), try_to_unmap_one(), etc.).  Not much can
  * be done at that point: ->set_page_dirty is called under VFS locks and the
  * page is not necessarily locked.
  *
  * Dirtying the page while leaving the attached buffers clean is not an
  * option, since the buffers' dirty state is "definitive"; setting the
  * buffers dirty or jbddirty is not an option either, because all the
  * journalling code would explode.
  *
  * Instead the page is flagged "pending dirty" (PageChecked), and the next
  * writepage call propagates that into the buffers appropriately.
  */
 static int ext4_journalled_set_page_dirty(struct page *page)
 {
 	int dirtied;

 	SetPageChecked(page);
 	dirtied = __set_page_dirty_nobuffers(page);

 	return dirtied;
 }
 /*
  * Address-space operations installed by ext4_set_aops() for inodes for
  * which ext4_should_order_data() is true.
  */
 static const struct address_space_operations ext4_ordered_aops = {
 	/* read path */
 	.readpage	= ext4_readpage,
 	.readpages	= ext4_readpages,

 	/* write path and direct I/O */
 	.write_begin	= ext4_write_begin,
 	.write_end	= ext4_ordered_write_end,
 	.writepage	= ext4_ordered_writepage,
 	.direct_IO	= ext4_direct_IO,

 	/* mapping queries and page housekeeping */
 	.bmap		= ext4_bmap,
 	.sync_page	= block_sync_page,
 	.invalidatepage	= ext4_invalidatepage,
 	.releasepage	= ext4_releasepage,
 	.migratepage	= buffer_migrate_page,
 };
 /*
  * Address-space operations installed by ext4_set_aops() for inodes for
  * which ext4_should_writeback_data() is true (and
  * ext4_should_order_data() is not).
  */
 static const struct address_space_operations ext4_writeback_aops = {
 	/* read path */
 	.readpage	= ext4_readpage,
 	.readpages	= ext4_readpages,

 	/* write path and direct I/O */
 	.write_begin	= ext4_write_begin,
 	.write_end	= ext4_writeback_write_end,
 	.writepage	= ext4_writeback_writepage,
 	.direct_IO	= ext4_direct_IO,

 	/* mapping queries and page housekeeping */
 	.bmap		= ext4_bmap,
 	.sync_page	= block_sync_page,
 	.invalidatepage	= ext4_invalidatepage,
 	.releasepage	= ext4_releasepage,
 	.migratepage	= buffer_migrate_page,
 };
 /*
  * Address-space operations installed by ext4_set_aops() when neither the
  * ordered nor the writeback test matches.  Unlike the other two tables,
  * this one supplies .set_page_dirty and leaves .direct_IO and .migratepage
  * unset.
  */
 static const struct address_space_operations ext4_journalled_aops = {
 	/* read path */
 	.readpage	= ext4_readpage,
 	.readpages	= ext4_readpages,

 	/* write path */
 	.write_begin	= ext4_write_begin,
 	.write_end	= ext4_journalled_write_end,
 	.writepage	= ext4_journalled_writepage,
 	.set_page_dirty	= ext4_journalled_set_page_dirty,

 	/* mapping queries and page housekeeping */
 	.bmap		= ext4_bmap,
 	.sync_page	= block_sync_page,
 	.invalidatepage	= ext4_invalidatepage,
 	.releasepage	= ext4_releasepage,
 };
 /*
  * Pick the address-space operation table for @inode: ordered if
  * ext4_should_order_data() says so, else writeback if
  * ext4_should_writeback_data() says so, else journalled.
  */
 void ext4_set_aops(struct inode *inode)
 {
 	const struct address_space_operations *ops;

 	if (ext4_should_order_data(inode))
 		ops = &ext4_ordered_aops;
 	else if (ext4_should_writeback_data(inode))
 		ops = &ext4_writeback_aops;
 	else
 		ops = &ext4_journalled_aops;

 	inode->i_mapping->a_ops = ops;
 }
 /*
  * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
  * up to the end of the block which corresponds to `from'.
  * This is required during truncate. We need to physically zero the tail end
  * of that block so it doesn't yield old data if the file is later grown.
  *
  * @handle:  journal handle used for the journalling calls below
  * @page:    page containing `from'; it is unlocked and one page-cache
  *           reference is dropped before returning, on every path
  * @mapping: address space the page belongs to
  * @from:    file offset at which zeroing starts
  *
  * Returns 0 on success (including the "nothing to do" cases: freed buffer
  * or hole), -EIO if the block could not be read, or the error returned by
  * the journalling call that failed.
  */
 int ext4_block_truncate_page(handle_t *handle, struct page *page,
 		struct address_space *mapping, loff_t from)
 {
 	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;	/* page index of `from' */
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);	/* offset within the page */
 	unsigned blocksize, length, pos;
 	ext4_lblk_t iblock;
 	struct inode *inode = mapping->host;
 	struct buffer_head *bh;
 	int err = 0;
 	blocksize = inode->i_sb->s_blocksize;
 	/* Bytes from `offset' to the end of its block */
 	length = blocksize - (offset & (blocksize - 1));
 	/* Logical block number of the first block in this page */
 	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
 	/*
 	 * For "nobh" option,  we can only work if we don't need to
 	 * read-in the page - otherwise we create buffers to do the IO.
 	 */
 	if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
 	     ext4_should_writeback_data(inode) && PageUptodate(page)) {
 		zero_user(page, offset, length);
 		set_page_dirty(page);
 		goto unlock;
 	}
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, blocksize, 0);
 	/* Find the buffer that contains "offset" */
 	bh = page_buffers(page);
 	pos = blocksize;
 	while (offset >= pos) {
 		bh = bh->b_this_page;
 		iblock++;
 		pos += blocksize;
 	}
 	err = 0;
 	if (buffer_freed(bh)) {
 		BUFFER_TRACE(bh, "freed: skip");
 		goto unlock;
 	}
 	if (!buffer_mapped(bh)) {
 		BUFFER_TRACE(bh, "unmapped");
 		/* Look the block up without creating it; the result is
 		 * judged by buffer_mapped() below, not by the return value */
 		ext4_get_block(inode, iblock, bh, 0);
 		/* unmapped? It's a hole - nothing to do */
 		if (!buffer_mapped(bh)) {
 			BUFFER_TRACE(bh, "still unmapped");
 			goto unlock;
 		}
 	}
 	/* Ok, it's mapped. Make sure it's up-to-date */
 	if (PageUptodate(page))
 		set_buffer_uptodate(bh);
 	if (!buffer_uptodate(bh)) {
 		/* Preset the error that is returned if the read below fails */
 		err = -EIO;
 		ll_rw_block(READ, 1, &bh);
 		wait_on_buffer(bh);
 		/* Uhhuh. Read error. Complain and punt. */
 		if (!buffer_uptodate(bh))
 			goto unlock;
 	}
 	if (ext4_should_journal_data(inode)) {
 		BUFFER_TRACE(bh, "get write access");
 		err = ext4_journal_get_write_access(handle, bh);
 		if (err)
 			goto unlock;
 	}
 	zero_user(page, offset, length);
 	BUFFER_TRACE(bh, "zeroed end of block");
 	err = 0;
 	/* Hand the zeroed buffer to the journal in the way the data mode needs */
 	if (ext4_should_journal_data(inode)) {
 		err = ext4_journal_dirty_metadata(handle, bh);
 	} else {
 		if (ext4_should_order_data(inode))
 			err = ext4_journal_dirty_data(handle, bh);
 		mark_buffer_dirty(bh);
 	}
 unlock:
 	unlock_page(page);
 	page_cache_release(page);
 	return err;
 }
 /*
  * Return 1 if every 32-bit word in [p, q) is zero (an empty range counts
  * as all-zero), 0 as soon as a non-zero word is found.
  *
  * Probably it should be a library function... search for first non-zero word
  * or memcmp with zero_page, whatever is better for particular architecture.
  * Linus?
  */
 static inline int all_zeroes(__le32 *p, __le32 *q)
 {
 	for (; p < q; p++) {
 		if (*p != 0)
 			return 0;
 	}
 	return 1;
 }
 /**
  *	ext4_find_shared - find the indirect blocks for partial truncation.
  *	@inode:	  inode in question
  *	@depth:	  depth of the affected branch
  *	@offsets: offsets of pointers in that branch (see ext4_block_to_path)
  *	@chain:	  place to store the pointers to partial indirect blocks
  *	@top:	  place to store the (detached) top of branch
  *
  *	This is a helper function used by ext4_truncate().
  *
  *	When we do truncate() we may have to clean the ends of several
  *	indirect blocks but leave the blocks themselves alive. Block is
  *	partially truncated if some data below the new i_size is referred
  *	from it (and it is on the path to the first completely truncated
  *	data block, indeed).  We have to free the top of that path along
  *	with everything to the right of the path. Since no allocation
  *	past the truncation point is possible until ext4_truncate()
  *	finishes, we may safely do the latter, but top of branch may
  *	require special attention - pageout below the truncation point
  *	might try to populate it.
  *
  *	We atomically detach the top of branch from the tree, store the
  *	block number of its root in *@top, pointers to buffer_heads of
  *	partially truncated blocks - in @chain[].bh and pointers to
  *	their last elements that should not be removed - in
  *	@chain[].p. Return value is the pointer to last filled element
  *	of @chain.
  *
  *	Note that in this implementation the pointer at the top of the
  *	branch is only copied into *@top; it is not cleared in the tree
  *	(see the "#if 0" below).
  *
  *	The work left to caller to do the actual freeing of subtrees:
  *		a) free the subtree starting from *@top
  *		b) free the subtrees whose roots are stored in
  *			(@chain[i].p+1 .. end of @chain[i].bh->b_data)
  *		c) free the subtrees growing from the inode past the @chain[0].
  *			(no partially truncated stuff there).  */
 static Indirect *ext4_find_shared(struct inode *inode, int depth,
 			ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top)
 {
 	Indirect *partial, *p;
 	int k, err;
 	*top = 0;
 	/* Make k index the deepest non-null offset + 1 */
 	for (k = depth; k > 1 && !offsets[k-1]; k--)
 		;
 	partial = ext4_get_branch(inode, k, offsets, chain, &err);
 	/* Writer: pointers */
 	/* NULL result: use the last of the k chain entries as `partial' */
 	if (!partial)
 		partial = chain + k-1;
 	/*
 	 * If the branch acquired continuation since we've looked at it -
 	 * fine, it should all survive and (new) top doesn't belong to us.
 	 */
 	if (!partial->key && *partial->p)
 		/* Writer: end */
 		goto no_top;
 	/* Walk back towards the inode while everything to the left of p->p
 	 * in the indirect block is zero */
 	for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
 		;
 	/*
 	 * OK, we've found the last block that must survive. The rest of our
 	 * branch should be detached before unlocking. However, if that rest
 	 * of branch is all ours and does not grow immediately from the inode
 	 * it's easier to cheat and just decrement partial->p.
 	 */
 	if (p == chain + k - 1 && p > chain) {
 		p->p--;
 	} else {
 		*top = *p->p;
 		/* Nope, don't do this in ext4.  Must leave the tree intact */
 #if 0
 		*p->p = 0;
 #endif
 	}
 	/* Writer: end */
 	/* Release the buffers of the chain entries below the surviving block */
 	while(partial > p) {
 		brelse(partial->bh);
 		partial--;
 	}
 no_top:
 	return partial;
 }
 /*
  * Clear the block pointers in [first, last) - which live either in the
  * inode or in the indirect block @bh - and release the on-disk run of
  * `count' blocks starting at `block_to_free'.
  *
  * (last - first) may exceed `count', since the pointer range can contain
  * holes.
  *
  * When try_to_extend_transaction() returns non-zero the transaction is
  * restarted: the indirect block and the inode are dirtied first, and write
  * access to the indirect block is taken again afterwards so that it can be
  * modified further.
  */
 static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
 		struct buffer_head *bh, ext4_fsblk_t block_to_free,
 		unsigned long count, __le32 *first, __le32 *last)
 {
 	__le32 *slot;

 	if (try_to_extend_transaction(handle, inode)) {
 		if (bh) {
 			BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
 			ext4_journal_dirty_metadata(handle, bh);
 		}
 		ext4_mark_inode_dirty(handle, inode);
 		ext4_journal_test_restart(handle, inode);
 		if (bh) {
 			BUFFER_TRACE(bh, "retaking write access");
 			ext4_journal_get_write_access(handle, bh);
 		}
 	}

 	/*
 	 * Buffers which are on the journal are in memory and will be found
 	 * on the hash table, so jbd2_journal_revoke() will run
 	 * jbd2_journal_forget() on them.  Each block has already been
 	 * detached from the file by then, so the bforget() in
 	 * jbd2_journal_forget() should be safe.
 	 *
 	 * AKPM: turn on bforget in jbd2_journal_forget()!!!
 	 */
 	for (slot = first; slot < last; slot++) {
 		struct buffer_head *victim;
 		u32 blk = le32_to_cpu(*slot);

 		if (blk == 0)
 			continue;	/* hole */

 		*slot = 0;
 		victim = sb_find_get_block(inode->i_sb, blk);
 		ext4_forget(handle, 0, inode, victim, blk);
 	}

 	ext4_free_blocks(handle, inode, block_to_free, count, 0);
 }
 /**
  * ext4_free_data - free a list of data blocks
  * @handle:	handle for this transaction
  * @inode:	inode we are dealing with
  * @this_bh:	indirect buffer_head which contains *@first and *@last
  * @first:	array of block numbers
  * @last:	points immediately past the end of array
  *
  * Frees every block referred to from the array (entries are little-endian
  * 32-bit block numbers) and updates @inode->i_blocks appropriately.
  *
  * Physically contiguous blocks are collected into runs and each run is
  * released with a single ext4_clear_blocks() call.  Releasing a contiguous
  * run touches only one or two bitmap blocks (+ group descriptor(s) and
  * superblock), so it does not use a lot of journal space.
  *
  * @this_bh is %NULL when @first and @last point into the inode's direct
  * block pointers.  If write access to @this_bh cannot be obtained nothing
  * is freed.
  */
 static void ext4_free_data(handle_t *handle, struct inode *inode,
 			   struct buffer_head *this_bh,
 			   __le32 *first, __le32 *last)
 {
 	ext4_fsblk_t run_start = 0;	/* first block # of the current run */
 	unsigned long run_len = 0;	/* number of blocks in the run */
 	__le32 *run_start_p = NULL;	/* slot holding run_start */
 	__le32 *cur;			/* slot being examined */

 	if (this_bh) {				/* For indirect block */
 		int err;

 		BUFFER_TRACE(this_bh, "get_write_access");
 		err = ext4_journal_get_write_access(handle, this_bh);
 		/*
 		 * Important: without the ability to update the indirect
 		 * pointers the blocks they name must not be freed.
 		 */
 		if (err)
 			return;
 	}

 	for (cur = first; cur < last; cur++) {
 		ext4_fsblk_t blk = le32_to_cpu(*cur);

 		if (!blk)
 			continue;		/* hole */

 		/* Extends the run we are already collecting? */
 		if (run_len != 0 && blk == run_start + run_len) {
 			run_len++;
 			continue;
 		}

 		/* Not contiguous: flush the previous run, if there is one */
 		if (run_len != 0)
 			ext4_clear_blocks(handle, inode, this_bh,
 					  run_start, run_len,
 					  run_start_p, cur);

 		/* ... and open a new run at this block */
 		run_start = blk;
 		run_start_p = cur;
 		run_len = 1;
 	}

 	/* Flush whatever run is still open */
 	if (run_len > 0)
 		ext4_clear_blocks(handle, inode, this_bh, run_start,
 				  run_len, run_start_p, cur);

 	if (this_bh) {
 		BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
 		ext4_journal_dirty_metadata(handle, this_bh);
 	}
 }
 /**
  *	ext4_free_branches - free an array of branches
  *	@handle: JBD handle for this transaction
  *	@inode:	inode we are dealing with
  *	@parent_bh: the buffer_head which contains *@first and *@last
  *		(%NULL when the pointers do not live in an indirect block)
  *	@first:	array of block numbers
  *	@last:	pointer immediately past the end of array
  *	@depth:	depth of the branches to free
  *
  *	We are freeing all blocks referred from these branches (numbers are
  *	stored as little-endian 32-bit) and updating @inode->i_blocks
  *	appropriately.
  *
  *	For @depth > 0 the array is walked from the last slot to the first;
  *	each non-zero slot is read, its subtree is freed recursively with
  *	@depth - 1, and then the indirect block itself is freed.  For
  *	@depth == 0 the array holds data block numbers and is handed to
  *	ext4_free_data().  Nothing is done once the handle is aborted.
  */
 static void ext4_free_branches(handle_t *handle, struct inode *inode,
 			       struct buffer_head *parent_bh,
 			       __le32 *first, __le32 *last, int depth)
 {
 	ext4_fsblk_t nr;
 	__le32 *p;
 	if (is_handle_aborted(handle))
 		return;
 	/* Note: depth is decremented here and passed on to the recursion */
 	if (depth--) {
 		struct buffer_head *bh;
 		int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
 		p = last;
 		while (--p >= first) {
 			nr = le32_to_cpu(*p);
 			if (!nr)
 				continue;		/* A hole */
 			/* Go read the buffer for the next level down */
 			bh = sb_bread(inode->i_sb, nr);
 			/*
 			 * A read failure? Report error and clear slot
 			 * (should be rare).
 			 */
 			/* NOTE(review): the slot is not actually cleared on
 			 * this path, despite the comment above - confirm */
 			if (!bh) {
 				ext4_error(inode->i_sb, "ext4_free_branches",
 					   "Read failure, inode=%lu, block=%llu",
 					   inode->i_ino, nr);
 				continue;
 			}
 			/* This zaps the entire block.  Bottom up. */
 			BUFFER_TRACE(bh, "free child branches");
 			ext4_free_branches(handle, inode, bh,
 					   (__le32*)bh->b_data,
 					   (__le32*)bh->b_data + addr_per_block,
 					   depth);
 			/*
 			 * We've probably journalled the indirect block several
 			 * times during the truncate.  But it's no longer
 			 * needed and we now drop it from the transaction via
 			 * jbd2_journal_revoke().
 			 *
 			 * That's easy if it's exclusively part of this
 			 * transaction.  But if it's part of the committing
 			 * transaction then jbd2_journal_forget() will simply
 			 * brelse() it.  That means that if the underlying
 			 * block is reallocated in ext4_get_block(),
 			 * unmap_underlying_metadata() will find this block
 			 * and will try to get rid of it.  damn, damn.
 			 *
 			 * If this block has already been committed to the
 			 * journal, a revoke record will be written.  And
 			 * revoke records must be emitted *before* clearing
 			 * this block's bit in the bitmaps.
 			 */
 			ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
 			/*
 			 * Everything below this pointer has been
 			 * released.  Now let this top-of-subtree go.
 			 *
 			 * We want the freeing of this indirect block to be
 			 * atomic in the journal with the updating of the
 			 * bitmap block which owns it.  So make some room in
 			 * the journal.
 			 *
 			 * We zero the parent pointer *after* freeing its
 			 * pointee in the bitmaps, so if extend_transaction()
 			 * for some reason fails to put the bitmap changes and
 			 * the release into the same transaction, recovery
 			 * will merely complain about releasing a free block,
 			 * rather than leaking blocks.
 			 */
 			if (is_handle_aborted(handle))
 				return;
 			if (try_to_extend_transaction(handle, inode)) {
 				ext4_mark_inode_dirty(handle, inode);
 				ext4_journal_test_restart(handle, inode);
 			}
 			/* Free the indirect block itself */
 			ext4_free_blocks(handle, inode, nr, 1, 1);
 			if (parent_bh) {
 				/*
 				 * The block which we have just freed is
 				 * pointed to by an indirect block: journal it
 				 */
 				BUFFER_TRACE(parent_bh, "get_write_access");
 				/* The slot is only zeroed if write access
 				 * to the parent was obtained */
 				if (!ext4_journal_get_write_access(handle,
 								   parent_bh)){
 					*p = 0;
 					BUFFER_TRACE(parent_bh,
 					"call ext4_journal_dirty_metadata");
 					ext4_journal_dirty_metadata(handle,
 								    parent_bh);
 				}
 			}
 		}
 	} else {
 		/* We have reached the bottom of the tree. */
 		BUFFER_TRACE(parent_bh, "free data blocks");
 		ext4_free_data(handle, inode, parent_bh, first, last);
 	}
 }
 /*
  * ext4_truncate()
  *
  * Release all blocks of @inode that lie beyond inode->i_size (the new size
  * has already been stored there by the caller).
  *
  * We block out ext4_get_block() block instantiations across the entire
  * transaction, and VFS/VM ensures that ext4_truncate() cannot run
  * simultaneously on behalf of the same inode.
  *
  * As we work through the truncate and commit bits of it to the journal there
  * is one core, guiding principle: the file's tree must always be consistent on
  * disk.  We must be able to restart the truncate after a crash.
  *
  * The file's tree may be transiently inconsistent in memory (although it
  * probably isn't), but whenever we close off and commit a journal transaction,
  * the contents of (the filesystem + the journal) must be consistent and
  * restartable.  It's pretty simple, really: bottom up, right to left (although
  * left-to-right works OK too).
  *
  * Note that at recovery time, journal replay occurs *before* the restart of
  * truncate against the orphan inode list.
  *
  * The committed inode has the new, desired i_size (which is the same as
  * i_disksize in this case).  After a crash, ext4_orphan_cleanup() will see
  * that this inode's truncate did not complete and it will again call
  * ext4_truncate() to have another go.  So there will be instantiated blocks
  * to the right of the truncation point in a crashed ext4 filesystem.  But
  * that's fine - as long as they are linked from the inode, the post-crash
  * ext4_truncate() run will find them and release them.
  *
  * The function returns nothing; failures (page allocation, starting the
  * transaction, orphan add, bad block path) make it return early without
  * reporting an error to the caller.
  */
 void ext4_truncate(struct inode *inode)
 {
 	handle_t *handle;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	__le32 *i_data = ei->i_data;
 	int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
 	struct address_space *mapping = inode->i_mapping;
 	ext4_lblk_t offsets[4];
 	Indirect chain[4];
 	Indirect *partial;
 	__le32 nr = 0;
 	int n;
 	ext4_lblk_t last_block;
 	unsigned blocksize = inode->i_sb->s_blocksize;
 	struct page *page;
 	/* Only regular files, directories and symlinks are truncated */
 	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 	    S_ISLNK(inode->i_mode)))
 		return;
 	if (ext4_inode_is_fast_symlink(inode))
 		return;
 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
 		return;
 	/*
 	 * We have to lock the EOF page here, because lock_page() nests
 	 * outside jbd2_journal_start().
 	 */
 	if ((inode->i_size & (blocksize - 1)) == 0) {
 		/* Block boundary? Nothing to do */
 		page = NULL;
 	} else {
 		page = grab_cache_page(mapping,
 				inode->i_size >> PAGE_CACHE_SHIFT);
 		if (!page)
 			return;
 	}
 	/* Extent-mapped inodes are handled by ext4_ext_truncate(), which is
 	 * also given the EOF page */
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
 		ext4_ext_truncate(inode, page);
 		return;
 	}
 	handle = start_transaction(inode);
 	if (IS_ERR(handle)) {
 		/* No transaction: zero the whole EOF page and give it back */
 		if (page) {
 			clear_highpage(page);
 			flush_dcache_page(page);
 			unlock_page(page);
 			page_cache_release(page);
 		}
 		return;		/* AKPM: return what? */
 	}
 	/* First block that lies entirely beyond the new i_size */
 	last_block = (inode->i_size + blocksize-1)
 					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
 	/* Zero the tail of the last block; this also unlocks and releases
 	 * the page */
 	if (page)
 		ext4_block_truncate_page(handle, page, mapping, inode->i_size);
 	n = ext4_block_to_path(inode, last_block, offsets, NULL);
 	if (n == 0)
 		goto out_stop;	/* error */
 	/*
 	 * OK.  This truncate is going to happen.  We add the inode to the
 	 * orphan list, so that if this truncate spans multiple transactions,
 	 * and we crash, we will resume the truncate when the filesystem
 	 * recovers.  It also marks the inode dirty, to catch the new size.
 	 *
 	 * Implication: the file must always be in a sane, consistent
 	 * truncatable state while each transaction commits.
 	 */
 	if (ext4_orphan_add(handle, inode))
 		goto out_stop;
 	/*
 	 * The orphan list entry will now protect us from any crash which
 	 * occurs before the truncate completes, so it is now safe to propagate
 	 * the new, shorter inode size (held for now in i_size) into the
 	 * on-disk inode. We do this via i_disksize, which is the value which
 	 * ext4 *really* writes onto the disk inode.
 	 */
 	ei->i_disksize = inode->i_size;
 	/*
 	 * From here we block out all ext4_get_block() callers who want to
 	 * modify the block allocation tree.
 	 */
 	down_write(&ei->i_data_sem);
 	if (n == 1) {		/* direct blocks */
 		ext4_free_data(handle, inode, NULL, i_data+offsets[0],
 			       i_data + EXT4_NDIR_BLOCKS);
 		goto do_indirects;
 	}
 	partial = ext4_find_shared(inode, n, offsets, chain, &nr);
 	/* Kill the top of shared branch (not detached) */
 	if (nr) {
 		if (partial == chain) {
 			/* Shared branch grows from the inode */
 			ext4_free_branches(handle, inode, NULL,
 					   &nr, &nr+1, (chain+n-1) - partial);
 			*partial->p = 0;
 			/*
 			 * We mark the inode dirty prior to restart,
 			 * and prior to stop.  No need for it here.
 			 */
 		} else {
 			/* Shared branch grows from an indirect block */
 			BUFFER_TRACE(partial->bh, "get_write_access");
 			ext4_free_branches(handle, inode, partial->bh,
 					partial->p,
 					partial->p+1, (chain+n-1) - partial);
 		}
 	}
 	/* Clear the ends of indirect blocks on the shared branch */
 	while (partial > chain) {
 		ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
 				   (__le32*)partial->bh->b_data+addr_per_block,
 				   (chain+n-1) - partial);
 		BUFFER_TRACE(partial->bh, "call brelse");
 		brelse (partial->bh);
 		partial--;
 	}
 do_indirects:
 	/* Kill the remaining (whole) subtrees */
 	/*
 	 * offsets[0] tells which top-level slot the truncation point is in;
 	 * every indirect tree to the right of it is freed completely.  The
 	 * cases deliberately fall through.
 	 */
 	switch (offsets[0]) {
 	default:
 		nr = i_data[EXT4_IND_BLOCK];
 		if (nr) {
 			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
 			i_data[EXT4_IND_BLOCK] = 0;
 		}
 		/* fall through */
 	case EXT4_IND_BLOCK:
 		nr = i_data[EXT4_DIND_BLOCK];
 		if (nr) {
 			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
 			i_data[EXT4_DIND_BLOCK] = 0;
 		}
 		/* fall through */
 	case EXT4_DIND_BLOCK:
 		nr = i_data[EXT4_TIND_BLOCK];
 		if (nr) {
 			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
 			i_data[EXT4_TIND_BLOCK] = 0;
 		}
 		/* fall through */
 	case EXT4_TIND_BLOCK:
 		;
 	}
 	ext4_discard_reservation(inode);
 	up_write(&ei->i_data_sem);
 	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 	/*
 	 * In a multi-transaction truncate, we only make the final transaction
 	 * synchronous
 	 */
 	if (IS_SYNC(inode))
 		handle->h_sync = 1;
 out_stop:
 	/*
 	 * If this was a simple ftruncate(), and the file will remain alive
 	 * then we need to clear up the orphan record which we created above.
 	 * However, if this was a real unlink then we were called by
 	 * ext4_delete_inode(), and we allow that function to clean up the
 	 * orphan info for us.
 	 */
 	if (inode->i_nlink)
 		ext4_orphan_del(handle, inode);
 	ext4_journal_stop(handle);
 }
 /*
  * Work out which filesystem block holds the on-disk inode number @ino.
  *
  * On success the block number is returned and @iloc->block_group and
  * @iloc->offset (byte offset of the inode inside that block) are filled
  * in.  Returns 0, leaving @iloc untouched, if @ino is not a valid inode
  * number or the group descriptor cannot be obtained.
  */
 static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
 		unsigned long ino, struct ext4_iloc *iloc)
 {
 	struct ext4_group_desc *desc;
 	ext4_group_t group;
 	unsigned long table_off;
 	ext4_fsblk_t blk;

 	/*
 	 * An invalid inode number is already checked for in namei.c unless
 	 * we are looking at an NFS filehandle, in which case no error report
 	 * is needed.
 	 */
 	if (!ext4_valid_inum(sb, ino))
 		return 0;

 	group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
 	desc = ext4_get_group_desc(sb, group, NULL);
 	if (!desc)
 		return 0;

 	/* Byte offset of this inode within the group's inode table */
 	table_off = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
 		EXT4_INODE_SIZE(sb);

 	blk = ext4_inode_table(sb, desc) +
 		(table_off >> EXT4_BLOCK_SIZE_BITS(sb));

 	iloc->block_group = group;
 	iloc->offset = table_off & (EXT4_BLOCK_SIZE(sb) - 1);

 	return blk;
 }
 /*
  * ext4_get_inode_loc returns with an extra refcount against the inode's
  * underlying buffer_head on success. If 'in_mem' is true, we have all
  * data in memory that is needed to recreate the on-disk version of this
  * inode.
  *
  * On success 0 is returned and iloc->bh points at the up-to-date buffer
  * (iloc->block_group and iloc->offset are filled in by
  * ext4_get_inode_block()).  -EIO is returned if the inode's block cannot
  * be determined, the buffer cannot be obtained, or the read fails.
  */
 static int __ext4_get_inode_loc(struct inode *inode,
 				struct ext4_iloc *iloc, int in_mem)
 {
 	ext4_fsblk_t block;
 	struct buffer_head *bh;
 	block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);
 	if (!block)
 		return -EIO;
 	bh = sb_getblk(inode->i_sb, block);
 	if (!bh) {
 		ext4_error (inode->i_sb, "ext4_get_inode_loc",
 				"unable to read inode block - "
 				"inode=%lu, block=%llu",
 				 inode->i_ino, block);
 		return -EIO;
 	}
 	if (!buffer_uptodate(bh)) {
 		lock_buffer(bh);
 		/* Re-check under the buffer lock */
 		if (buffer_uptodate(bh)) {
 			/* someone brought it uptodate while we waited */
 			unlock_buffer(bh);
 			goto has_buffer;
 		}
 		/*
 		 * If we have all information of the inode in memory and this
 		 * is the only valid inode in the block, we need not read the
 		 * block.
 		 */
 		if (in_mem) {
 			struct buffer_head *bitmap_bh;
 			struct ext4_group_desc *desc;
 			int inodes_per_buffer;
 			int inode_offset, i;
 			ext4_group_t block_group;
 			int start;
 			block_group = (inode->i_ino - 1) /
 					EXT4_INODES_PER_GROUP(inode->i_sb);
 			inodes_per_buffer = bh->b_size /
 				EXT4_INODE_SIZE(inode->i_sb);
 			/* Index of this inode within its group */
 			inode_offset = ((inode->i_ino - 1) %
 					EXT4_INODES_PER_GROUP(inode->i_sb));
 			/* Index of the first inode sharing this buffer; the
 			 * mask assumes inodes_per_buffer is a power of two */
 			start = inode_offset & ~(inodes_per_buffer - 1);
 			/* Is the inode bitmap in cache? */
 			desc = ext4_get_group_desc(inode->i_sb,
 						block_group, NULL);
 			if (!desc)
 				goto make_io;
 			bitmap_bh = sb_getblk(inode->i_sb,
 				ext4_inode_bitmap(inode->i_sb, desc));
 			if (!bitmap_bh)
 				goto make_io;
 			/*
 			 * If the inode bitmap isn't in cache then the
 			 * optimisation may end up performing two reads instead
 			 * of one, so skip it.
 			 */
 			if (!buffer_uptodate(bitmap_bh)) {
 				brelse(bitmap_bh);
 				goto make_io;
 			}
 			/* Look for any other in-use inode in this buffer */
 			for (i = start; i < start + inodes_per_buffer; i++) {
 				if (i == inode_offset)
 					continue;
 				if (ext4_test_bit(i, bitmap_bh->b_data))
 					break;
 			}
 			brelse(bitmap_bh);
 			/* Loop ran to completion: no other inode is in use */
 			if (i == start + inodes_per_buffer) {
 				/* all other inodes are free, so skip I/O */
 				memset(bh->b_data, 0, bh->b_size);
 				set_buffer_uptodate(bh);
 				unlock_buffer(bh);
 				goto has_buffer;
 			}
 		}
 make_io:
 		/*
 		 * There are other valid inodes in the buffer, this inode
 		 * has in-inode xattrs, or we don't have this inode in memory.
 		 * Read the block from disk.
 		 */
 		/* The buffer is still locked here.  Take an extra reference
 		 * for the I/O - presumably dropped by end_buffer_read_sync;
 		 * TODO confirm */
 		get_bh(bh);
 		bh->b_end_io = end_buffer_read_sync;
 		submit_bh(READ_META, bh);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
 			ext4_error(inode->i_sb, "ext4_get_inode_loc",
 					"unable to read inode block - "
 					"inode=%lu, block=%llu",
 					inode->i_ino, block);
 			brelse(bh);
 			return -EIO;
 		}
 	}
 has_buffer:
 	/* The reference obtained from sb_getblk() is handed to the caller */
 	iloc->bh = bh;
 	return 0;
 }
 /*
  * Locate @inode's on-disk block for a caller that already has the
  * in-memory inode.  Everything except xattrs is held in memory here, so
  * the "in_mem" shortcut of __ext4_get_inode_loc() is allowed unless the
  * inode has EXT4_STATE_XATTR set.
  */
 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
 {
 	int in_mem = !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR);

 	return __ext4_get_inode_loc(inode, iloc, in_mem);
 }
 /*
  * Propagate flags from EXT4_I(inode)->i_flags to inode->i_flags: the five
  * VFS flags below are cleared and then set again for each corresponding
  * ext4 flag that is present.
  */
 void ext4_set_inode_flags(struct inode *inode)
 {
 	/* ext4 on-inode flag -> VFS inode flag */
 	static const struct {
 		unsigned int ext4_bit;
 		unsigned int vfs_bit;
 	} flag_map[] = {
 		{ EXT4_SYNC_FL,		S_SYNC },
 		{ EXT4_APPEND_FL,	S_APPEND },
 		{ EXT4_IMMUTABLE_FL,	S_IMMUTABLE },
 		{ EXT4_NOATIME_FL,	S_NOATIME },
 		{ EXT4_DIRSYNC_FL,	S_DIRSYNC },
 	};
 	unsigned int ext4_flags = EXT4_I(inode)->i_flags;
 	unsigned int i;

 	inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);

 	for (i = 0; i < sizeof(flag_map) / sizeof(flag_map[0]); i++) {
 		if (ext4_flags & flag_map[i].ext4_bit)
 			inode->i_flags |= flag_map[i].vfs_bit;
 	}
 }
 /*
  * Propagate flags from the VFS inode's i_flags to EXT4_I(inode)->i_flags:
  * the five ext4 flags below are cleared and then set again for each
  * corresponding VFS flag that is present.
  */
 void ext4_get_inode_flags(struct ext4_inode_info *ei)
 {
 	const unsigned int vfs_flags = ei->vfs_inode.i_flags;

 	ei->i_flags &= ~(EXT4_SYNC_FL | EXT4_APPEND_FL | EXT4_IMMUTABLE_FL |
 			 EXT4_NOATIME_FL | EXT4_DIRSYNC_FL);

 	ei->i_flags |= (vfs_flags & S_SYNC) ? EXT4_SYNC_FL : 0;
 	ei->i_flags |= (vfs_flags & S_APPEND) ? EXT4_APPEND_FL : 0;
 	ei->i_flags |= (vfs_flags & S_IMMUTABLE) ? EXT4_IMMUTABLE_FL : 0;
 	ei->i_flags |= (vfs_flags & S_NOATIME) ? EXT4_NOATIME_FL : 0;
 	ei->i_flags |= (vfs_flags & S_DIRSYNC) ? EXT4_DIRSYNC_FL : 0;
 }
 /*
  * Decode the block count stored in the on-disk inode @raw_inode.
  *
  * Without the HUGE_FILE read-only-compatible feature only the low 32 bits
  * are used.  With it, the 16-bit high part and 32-bit low part form one
  * 48-bit value; if the inode additionally has EXT4_HUGE_FILE_FL set, that
  * value is shifted left by (i_blkbits - 9) before being returned.
  */
 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
 					struct ext4_inode_info *ei)
 {
 	struct inode *inode = &ei->vfs_inode;
 	struct super_block *sb = inode->i_sb;
 	blkcnt_t total;

 	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
 				EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
 		return le32_to_cpu(raw_inode->i_blocks_lo);

 	/* we are using combined 48 bit field */
 	total = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
 				le32_to_cpu(raw_inode->i_blocks_lo);

 	/* i_blocks represent file system block size */
 	if (ei->i_flags & EXT4_HUGE_FILE_FL)
 		return total << (inode->i_blkbits - 9);

 	return total;
 }
 struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 {
 	struct ext4_iloc iloc;
 	struct ext4_inode *raw_inode;
 	struct ext4_inode_info *ei;
 	struct buffer_head *bh;
 	struct inode *inode;
 	long ret;
 	int block;
 	inode = iget_locked(sb, ino);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 	if (!(inode->i_state & I_NEW))
 		return inode;
 	ei = EXT4_I(inode);
 #ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
 	ei->i_acl = EXT4_ACL_NOT_CACHED;
 	ei->i_default_acl = EXT4_ACL_NOT_CACHED;
 #endif
 	ei->i_block_alloc_info = NULL;
 	ret = __ext4_get_inode_loc(inode, &iloc, 0);
 	if (ret < 0)
 		goto bad_inode;
 	bh = iloc.bh;
 	raw_inode = ext4_raw_inode(&iloc);
 	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
 	inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
 	inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
 	if(!(test_opt (inode->i_sb, NO_UID32))) {
 		inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
 		inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
 	}
 	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
 	ei->i_state = 0;
 	ei->i_dir_start_lookup = 0;
 	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
 	/* We now have enough fields to check if the inode was active or not.
 	 * This is needed because nfsd might try to access dead inodes
 	 * the test is that same one that e2fsck uses
 	 * NeilBrown 1999oct15
 	 */
 	if (inode->i_nlink == 0) {
 		if (inode->i_mode == 0 ||
 		    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
 			/* this inode is deleted */
 			brelse (bh);
 			ret = -ESTALE;
 			goto bad_inode;
 		}
 		/* The only unlinked inodes we let through here have
 		 * valid i_mode and are being read by the orphan
 		 * recovery code: that's fine, we're about to complete
 		 * the process of deleting those. */
 	}
 	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
 	inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
 	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
 	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
 	    cpu_to_le32(EXT4_OS_HURD)) {
 		ei->i_file_acl |=
 			((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
 	}
 	inode->i_size = ext4_isize(raw_inode);
 	ei->i_disksize = inode->i_size;
 	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
 	ei->i_block_group = iloc.block_group;
 	/*
 	 * NOTE! The in-memory inode i_data array is in little-endian order
 	 * even on big-endian machines: we do NOT byteswap the block numbers!
 	 */
 	for (block = 0; block < EXT4_N_BLOCKS; block++)
 		ei->i_data[block] = raw_inode->i_block[block];
 	INIT_LIST_HEAD(&ei->i_orphan);
 	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
 		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
 		if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
 		    EXT4_INODE_SIZE(inode->i_sb)) {
 			brelse (bh);
 			ret = -EIO;
 			goto bad_inode;
 		}
 		if (ei->i_extra_isize == 0) {
 			/* The extra space is currently unused. Use it. */
 			ei->i_extra_isize = sizeof(struct ext4_inode) -
 					    EXT4_GOOD_OLD_INODE_SIZE;
 		} else {
 			__le32 *magic = (void *)raw_inode +
 					EXT4_GOOD_OLD_INODE_SIZE +
 					ei->i_extra_isize;
 			if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
 				 ei->i_state |= EXT4_STATE_XATTR;
 		}
 	} else
 		ei->i_extra_isize = 0;
 	EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
 	EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
 	EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
 	EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
 	inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
 	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
 		if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
 			inode->i_version |=
 			(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
 	}
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &ext4_file_inode_operations;
 		inode->i_fop = &ext4_file_operations;
 		ext4_set_aops(inode);
 	} else if (S_ISDIR(inode->i_mode)) {
 		inode->i_op = &ext4_dir_inode_operations;
 		inode->i_fop = &ext4_dir_operations;
 	} else if (S_ISLNK(inode->i_mode)) {
 		if (ext4_inode_is_fast_symlink(inode))
 			inode->i_op = &ext4_fast_symlink_inode_operations;
 		else {
 			inode->i_op = &ext4_symlink_inode_operations;
 			ext4_set_aops(inode);
 		}
 	} else {
 		inode->i_op = &ext4_special_inode_operations;
 		if (raw_inode->i_block[0])
 			init_special_inode(inode, inode->i_mode,
 			   old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
 		else
 			init_special_inode(inode, inode->i_mode,
 			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
 	}
 	brelse (iloc.bh);
 	ext4_set_inode_flags(inode);
 	unlock_new_inode(inode);
 	return inode;
 bad_inode:
 	iget_failed(inode);
 	return ERR_PTR(ret);
 }
 /*
  * Store the in-core block count of an inode into the on-disk inode.
  *
  * inode->i_blocks is written to raw_inode->i_blocks_lo/i_blocks_high
  * in one of three encodings:
  *
  *  - fits in 32 bits: stored in i_blocks_lo as a multiple of 512 bytes,
  *    i_blocks_high is cleared;
  *  - fits in 48 bits: stored split over i_blocks_lo/i_blocks_high, still
  *    as a multiple of 512 bytes;
  *  - anything larger: EXT4_HUGE_FILE_FL is set in ei->i_flags and the
  *    count is stored in units of the file system block size.
  *
  * The last two encodings first call ext4_update_rocompat_feature() for
  * EXT4_FEATURE_RO_COMPAT_HUGE_FILE.  If that fails its error is returned
  * and neither raw_inode nor ei->i_flags is modified.
  *
  * Returns 0 on success.
  */
 static int ext4_inode_blocks_set(handle_t *handle,
 				struct ext4_inode *raw_inode,
 				struct ext4_inode_info *ei)
 {
 	struct inode *vfs = &ei->vfs_inode;
 	u64 count = vfs->i_blocks;
 	struct super_block *sb = vfs->i_sb;
 	int rc;
 	if (count <= ~0U) {
 		/* Plain 32 bit count of 512 byte sectors. */
 		raw_inode->i_blocks_lo   = cpu_to_le32(count);
 		raw_inode->i_blocks_high = 0;
 		ei->i_flags &= ~EXT4_HUGE_FILE_FL;
 		return 0;
 	}
 	/* Both remaining encodings use the high 16 bits. */
 	rc = ext4_update_rocompat_feature(handle, sb,
 				    EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
 	if (rc)
 		return rc;
 	if (count <= 0xffffffffffffULL) {
 		/* 48 bit count of 512 byte sectors. */
 		ei->i_flags &= ~EXT4_HUGE_FILE_FL;
 	} else {
 		/* Too big even for 48 bits: count file system blocks. */
 		ei->i_flags |= EXT4_HUGE_FILE_FL;
 		count >>= vfs->i_blkbits - 9;
 	}
 	raw_inode->i_blocks_lo   = cpu_to_le32(count);
 	raw_inode->i_blocks_high = cpu_to_le16(count >> 32);
 	return 0;
 }
 /*
  * Post the struct inode info into an on-disk inode location in the
  * buffer-cache.  This gobbles the caller's reference to the
  * buffer_head in the inode location struct.
  *
  * The caller must have write access to iloc->bh.
  *
  * @handle: journal handle the metadata update is made under
  * @inode:  in-core inode to copy from
  * @iloc:   location of the raw inode; iloc->bh is released on every path
  *
  * Returns 0 on success, otherwise the error reported by the helper that
  * failed.  Any non-zero result is also passed to ext4_std_error().
  */
 static int ext4_do_update_inode(handle_t *handle,
 				struct inode *inode,
 				struct ext4_iloc *iloc)
 {
 	struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct buffer_head *bh = iloc->bh;
 	int err = 0, rc, block;
 	/* For fields not tracked in the in-memory inode,
 	 * initialise them to zero for new inodes. */
 	if (ei->i_state & EXT4_STATE_NEW)
 		memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
 	ext4_get_inode_flags(ei);
 	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
 	if (!(test_opt(inode->i_sb, NO_UID32))) {
 		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
 		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
 		/*
 		 * Fix up interoperability with old kernels. Otherwise, old
 		 * inodes get re-used with the upper 16 bits of the uid/gid
 		 * intact
 		 */
 		if (!ei->i_dtime) {
 			raw_inode->i_uid_high =
 				cpu_to_le16(high_16_bits(inode->i_uid));
 			raw_inode->i_gid_high =
 				cpu_to_le16(high_16_bits(inode->i_gid));
 		} else {
 			raw_inode->i_uid_high = 0;
 			raw_inode->i_gid_high = 0;
 		}
 	} else {
 		raw_inode->i_uid_low =
 			cpu_to_le16(fs_high2lowuid(inode->i_uid));
 		raw_inode->i_gid_low =
 			cpu_to_le16(fs_high2lowgid(inode->i_gid));
 		raw_inode->i_uid_high = 0;
 		raw_inode->i_gid_high = 0;
 	}
 	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
 	EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
 	EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
 	EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
 	EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
 	/*
 	 * Keep the error code: bailing out with err still 0 would tell the
 	 * caller the inode was written when the buffer was never dirtied.
 	 */
 	err = ext4_inode_blocks_set(handle, raw_inode, ei);
 	if (err)
 		goto out_brelse;
 	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
 	/* clear the migrate flag in the raw_inode */
 	raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
 	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
 	    cpu_to_le32(EXT4_OS_HURD))
 		raw_inode->i_file_acl_high =
 			cpu_to_le16(ei->i_file_acl >> 32);
 	raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
 	ext4_isize_set(raw_inode, ei->i_disksize);
 	if (ei->i_disksize > 0x7fffffffULL) {
 		struct super_block *sb = inode->i_sb;
 		if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
 				EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
 				EXT4_SB(sb)->s_es->s_rev_level ==
 				cpu_to_le32(EXT4_GOOD_OLD_REV)) {
 			/* If this is the first large file
 			 * created, add a flag to the superblock.
 			 */
 			err = ext4_journal_get_write_access(handle,
 					EXT4_SB(sb)->s_sbh);
 			if (err)
 				goto out_brelse;
 			ext4_update_dynamic_rev(sb);
 			EXT4_SET_RO_COMPAT_FEATURE(sb,
 					EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
 			sb->s_dirt = 1;
 			handle->h_sync = 1;
 			err = ext4_journal_dirty_metadata(handle,
 					EXT4_SB(sb)->s_sbh);
 		}
 	}
 	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
 		/* Device nodes keep the device number in i_block[0..1]. */
 		if (old_valid_dev(inode->i_rdev)) {
 			raw_inode->i_block[0] =
 				cpu_to_le32(old_encode_dev(inode->i_rdev));
 			raw_inode->i_block[1] = 0;
 		} else {
 			raw_inode->i_block[0] = 0;
 			raw_inode->i_block[1] =
 				cpu_to_le32(new_encode_dev(inode->i_rdev));
 			raw_inode->i_block[2] = 0;
 		}
 	} else {
 		/* Copied as-is, without any byte swapping. */
 		for (block = 0; block < EXT4_N_BLOCKS; block++)
 			raw_inode->i_block[block] = ei->i_data[block];
 	}
 	raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
 	if (ei->i_extra_isize) {
 		if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
 			raw_inode->i_version_hi =
 			cpu_to_le32(inode->i_version >> 32);
 		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
 	}
 	BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
 	rc = ext4_journal_dirty_metadata(handle, bh);
 	/* An earlier superblock error takes precedence over rc. */
 	if (!err)
 		err = rc;
 	ei->i_state &= ~EXT4_STATE_NEW;
 out_brelse:
 	brelse(bh);
 	ext4_std_error(inode->i_sb, err);
 	return err;
 }
 /*
  * ext4_write_inode()
  *
  * We are called from a few places:
  *
  * - Within generic_file_write() for O_SYNC files.
  *   Here, there will be no transaction running. We wait for any running
  *   transaction to commit.
  *
  * - Within sys_sync(), kupdate and such.
  *   We wait on commit, if told to.
  *
  * - Within prune_icache() (PF_MEMALLOC == true)
  *   Here we simply return.  We can't afford to block kswapd on the
  *   journal commit.
  *
  * In all cases it is actually safe for us to return without doing anything,
  * because the inode has been copied into a raw inode buffer in
  * ext4_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
  * knfsd.
  *
  * Note that we are absolutely dependent upon all inode dirtiers doing the
  * right thing: they *must* call mark_inode_dirty() after dirtying info in
  * which we are interested.
  *
  * It would be a bug for them to not do this.  The code:
  *
  *	mark_inode_dirty(inode)
  *	stuff();
  *	inode->i_size = expr;
  *
  * is in error because a kswapd-driven write_inode() could occur while
  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
  * will no longer be on the superblock's dirty inode list.
  */
 int ext4_write_inode(struct inode *inode, int wait)
 {
 	int status = 0;
 	if (current->flags & PF_MEMALLOC) {
 		/* Memory reclaim: return at once, never block on a commit. */
 	} else if (ext4_journal_current_handle()) {
 		/* A handle is already open in this task: refuse. */
 		jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
 		dump_stack();
 		status = -EIO;
 	} else if (wait) {
 		/* Only a synchronous request forces a commit. */
 		status = ext4_force_commit(inode->i_sb);
 	}
 	return status;
 }
 /*
  * ext4_setattr()
  *
  * Called from notify_change.
  *
  * We want to trap VFS attempts to truncate the file as soon as
  * possible.  In particular, we want to make sure that when the VFS
  * shrinks i_size, we put the inode on the orphan list and modify
  * i_disksize immediately, so that during the subsequent flushing of
  * dirty pages and freeing of disk blocks, we can guarantee that any
  * commit will leave the blocks being flushed in an unused state on
  * disk.  (On recovery, the inode will get truncated and the blocks will
  * be freed, so we have a strong guarantee that no future commit will
  * leave these blocks visible to the user.)
  *
  * Called with inode->sem down.
  */
 int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
 	/* error: first failure seen; rc: result of the later setattr/ACL
 	 * steps, only reported when error is still 0 (see err_out). */
 	int error, rc = 0;
 	const unsigned int ia_valid = attr->ia_valid;
 	/* Bail out early if inode_change_ok() rejects the request. */
 	error = inode_change_ok(inode, attr);
 	if (error)
 		return error;
 	/* Ownership actually changes: transfer quota and update the inode
 	 * inside a single transaction. */
 	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
 		(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
 		handle_t *handle;
 		/* (user+group)*(old+new) structure, inode write (sb,
 		 * inode block, ? - but truncate inode update has it) */
 		handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
 					EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
 		if (IS_ERR(handle)) {
 			error = PTR_ERR(handle);
 			goto err_out;
 		}
 		error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
 		if (error) {
 			/* NOTE(review): this path returns directly, so unlike
 			 * the other failures it does not go through the
 			 * ext4_std_error() call at err_out. */
 			ext4_journal_stop(handle);
 			return error;
 		}
 		/* Update corresponding info in inode so that everything is in
 		 * one transaction */
 		if (attr->ia_valid & ATTR_UID)
 			inode->i_uid = attr->ia_uid;
 		if (attr->ia_valid & ATTR_GID)
 			inode->i_gid = attr->ia_gid;
 		/* NOTE(review): this result is not checked here; execution
 		 * continues and the truncate branch below overwrites error. */
 		error = ext4_mark_inode_dirty(handle, inode);
 		ext4_journal_stop(handle);
 	}
 	/* Inodes without EXT4_EXTENTS_FL may not grow past
 	 * sbi->s_bitmap_maxbytes. */
 	if (attr->ia_valid & ATTR_SIZE) {
 		if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
 			struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 			if (attr->ia_size > sbi->s_bitmap_maxbytes) {
 				error = -EFBIG;
 				goto err_out;
 			}
 		}
 	}
 	/* Shrinking a regular file: put the inode on the orphan list and
 	 * record the new i_disksize in its own transaction before
 	 * inode_setattr() is called. */
 	if (S_ISREG(inode->i_mode) &&
 	    attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
 		handle_t *handle;
 		handle = ext4_journal_start(inode, 3);
 		if (IS_ERR(handle)) {
 			error = PTR_ERR(handle);
 			goto err_out;
 		}
 		error = ext4_orphan_add(handle, inode);
 		EXT4_I(inode)->i_disksize = attr->ia_size;
 		rc = ext4_mark_inode_dirty(handle, inode);
 		/* Keep the orphan_add error if there was one. */
 		if (!error)
 			error = rc;
 		ext4_journal_stop(handle);
 	}
 	/* Runs even if error is already set by the steps above. */
 	rc = inode_setattr(inode, attr);
 	/* If inode_setattr's call to ext4_truncate failed to get a
 	 * transaction handle at all, we need to clean up the in-core
 	 * orphan list manually. */
 	if (inode->i_nlink)
 		ext4_orphan_del(NULL, inode);
 	/* A mode change that succeeded is followed by ext4_acl_chmod(). */
 	if (!rc && (ia_valid & ATTR_MODE))
 		rc = ext4_acl_chmod(inode);
 err_out:
 	ext4_std_error(inode->i_sb, error);
 	/* error wins; rc is reported only when nothing failed earlier. */
 	if (!error)
 		error = rc;
 	return error;
 }
 /*
  * How many blocks doth make a writepage()?
  *
  * With N blocks per page, it may be:
  * N data blocks
  * 2 indirect block
  * 2 dindirect
  * 1 tindirect
  * N+5 bitmap blocks (from the above)
  * N+5 group descriptor summary blocks
  * 1 inode block
  * 1 superblock.
  * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quota files
  *
  * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS
  *
  * With ordered or writeback data it's the same, less the N data blocks.
  *
  * If the inode's direct blocks can hold an integral number of pages then a
  * page cannot straddle two indirect blocks, and we can only touch one indirect
  * and dindirect block, and the "5" above becomes "3".
  *
  * This still overestimates under most circumstances.  If we were to pass the
  * start and end offsets in here as well we could do block_to_path() on each
  * block and work out the exact number of indirects which are touched.  Pah.
  */
 int ext4_writepage_trans_blocks(struct inode *inode)
 {
 	int per_page = ext4_journal_blocks_per_page(inode);
 	/* 3 when the direct blocks hold a whole number of pages, else 5. */
 	int meta = (EXT4_NDIR_BLOCKS % per_page) ? 5 : 3;
 	int factor;
 	int credits;
 	/* Extent-mapped inodes have their own estimate. */
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
 		return ext4_ext_writepage_trans_blocks(inode, per_page);
 	/* Journalled data multiplies by 3, every other mode by 2. */
 	factor = ext4_should_journal_data(inode) ? 3 : 2;
 	credits = factor * (per_page + meta) + 2;
 #ifdef CONFIG_QUOTA
 	/* We know that structure was already allocated during DQUOT_INIT so
 	 * we will be updating only the data blocks + inodes */
 	credits += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
 #endif
 	return credits;
 }
 /*
  * Copy the in-core inode into the buffer described by iloc and hand that
  * buffer to the journal.
  *
  * The caller must have previously called ext4_reserve_inode_write().
  * Given this, we know that the caller already has write access to
  * iloc->bh.
  *
  * An extra reference is taken on iloc->bh for ext4_do_update_inode() to
  * consume, and one more is dropped afterwards, so the reference the
  * caller came in with is gone when this returns.
  *
  * Returns whatever ext4_do_update_inode() returns.
  */
 int ext4_mark_iloc_dirty(handle_t *handle,
 		struct inode *inode, struct ext4_iloc *iloc)
 {
 	int status;
 	if (test_opt(inode->i_sb, I_VERSION)) {
 		inode_inc_iversion(inode);
 	}
 	/* ext4_do_update_inode() eats one bh->b_count ... */
 	get_bh(iloc->bh);
 	/* ... and is also the one that does jbd2_journal_dirty_metadata */
 	status = ext4_do_update_inode(handle, inode, iloc);
 	put_bh(iloc->bh);
 	return status;
 }
 /*
  * Look up the buffer holding the inode and get journal write access to it.
  *
  * On success, we end up with an outstanding reference count against
  * iloc->bh.  This _must_ be cleaned up later.
  *
  * If write access cannot be obtained the buffer is released and iloc->bh
  * is set to NULL.  With a NULL handle nothing is looked up, iloc is left
  * untouched and 0 is returned.
  *
  * Any non-zero result is passed to ext4_std_error() before it is returned.
  */
 int
 ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
 			 struct ext4_iloc *iloc)
 {
 	int status = 0;
 	if (!handle)
 		goto report;
 	status = ext4_get_inode_loc(inode, iloc);
 	if (status)
 		goto report;
 	BUFFER_TRACE(iloc->bh, "get_write_access");
 	status = ext4_journal_get_write_access(handle, iloc->bh);
 	if (status) {
 		/* Do not leave a reference behind on failure. */
 		brelse(iloc->bh);
 		iloc->bh = NULL;
 	}
 report:
 	ext4_std_error(inode->i_sb, status);
 	return status;
 }
 /*
  * Grow the extra-fields area of an inode to new_extra_isize bytes.
  *
  * @inode:           in-core inode being expanded
  * @new_extra_isize: wanted i_extra_isize; nothing is done if the inode
  *                   already has at least this much
  * @iloc:            location of the raw inode (passed by value)
  * @handle:          journal handle, only used when in-inode EAs have to
  *                   be moved
  *
  * Without in-inode extended attributes only the bytes between the old and
  * the new i_extra_isize are zeroed, so extra fields that already live in
  * the raw inode keep their values.  With extended attributes present the
  * work is handed to ext4_expand_extra_isize_ea().
  *
  * Returns 0 on success or negative error number on failure.
  */
 static int ext4_expand_extra_isize(struct inode *inode,
 				   unsigned int new_extra_isize,
 				   struct ext4_iloc iloc,
 				   handle_t *handle)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_inode *raw_inode;
 	struct ext4_xattr_ibody_header *header;
 	if (ei->i_extra_isize >= new_extra_isize)
 		return 0;
 	raw_inode = ext4_raw_inode(&iloc);
 	header = IHDR(inode, raw_inode);
 	/* No extended attributes present */
 	if (!(ei->i_state & EXT4_STATE_XATTR) ||
 		header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
 		/* Clear just the newly gained tail, not the fields in use. */
 		memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE +
 			ei->i_extra_isize, 0,
 			new_extra_isize - ei->i_extra_isize);
 		ei->i_extra_isize = new_extra_isize;
 		return 0;
 	}
 	/* try to expand with EAs present */
 	return ext4_expand_extra_isize_ea(inode, new_extra_isize,
 					  raw_inode, handle);
 }
 /*
  * What we do here is to mark the in-core inode as clean with respect to inode
  * dirtiness (it may still be data-dirty).
  * This means that the in-core inode may be reaped by prune_icache
  * without having to perform any I/O.  This is a very good thing,
  * because *any* task may call prune_icache - even ones which
  * have a transaction open against a different journal.
  *
  * Is this cheating?  Not really.  Sure, we haven't written the
  * inode out, but prune_icache isn't a user-visible syncing function.
  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
  * we start and wait on commits.
  *
  * Is this efficient/effective?  Well, we're being nice to the system
  * by cleaning up our inodes proactively so they can be reaped
  * without I/O.  But we are potentially leaving up to five seconds'
  * worth of inodes floating about which prune_icache wants us to
  * write out.  One way to fix that would be to get prune_icache()
  * to do a write_super() to free up some memory.  It has the desired
  * effect.
  *
  * Along the way the inode's extra area is grown to s_want_extra_isize
  * when it is smaller and expansion has not already failed for this inode.
  * A failed expansion is not an error of this function: the inode gets
  * EXT4_STATE_NO_EXPAND and a warning is logged at most once per value of
  * the superblock mount count.
  *
  * Returns 0 on success, or the error from ext4_reserve_inode_write() or
  * ext4_mark_iloc_dirty().
  */
 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 {
 	struct ext4_iloc iloc;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	static unsigned int mnt_count;
 	int err, ret;
 	might_sleep();
 	err = ext4_reserve_inode_write(handle, inode, &iloc);
 	/*
 	 * When the reservation fails iloc holds no usable buffer (it is
 	 * either unfilled or has bh == NULL), so it must not be handed to
 	 * ext4_expand_extra_isize() below.
 	 */
 	if (err)
 		return err;
 	if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
 	    !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
 		/*
 		 * We need extra buffer credits since we may write into EA block
 		 * with this same handle. If journal_extend fails, then it will
 		 * only result in a minor loss of functionality for that inode.
 		 * If this is felt to be critical, then e2fsck should be run to
 		 * force a large enough s_min_extra_isize.
 		 */
 		if ((jbd2_journal_extend(handle,
 			     EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
 			ret = ext4_expand_extra_isize(inode,
 						      sbi->s_want_extra_isize,
 						      iloc, handle);
 			if (ret) {
 				/* Do not retry expansion on this inode. */
 				EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
 				if (mnt_count !=
 					le16_to_cpu(sbi->s_es->s_mnt_count)) {
 					ext4_warning(inode->i_sb, __func__,
 					"Unable to expand inode %lu. Delete"
 					" some EAs or run e2fsck.",
 					inode->i_ino);
 					mnt_count =
 					  le16_to_cpu(sbi->s_es->s_mnt_count);
 				}
 			}
 		}
 	}
 	return ext4_mark_iloc_dirty(handle, inode, &iloc);
 }
 /*
  * ext4_dirty_inode() is called from __mark_inode_dirty()
  *
  * We're really interested in the case where a file is being extended.
  * i_size has been changed by generic_commit_write() and we thus need
  * to include the updated inode in the current transaction.
  *
  * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
  * are allocated to the file.
  *
  * If the inode is marked synchronous, we don't honour that here - doing
  * so would cause a commit on atime updates, which we don't bother doing.
  * We handle synchronous inodes at the highest possible level.
  */
 void ext4_dirty_inode(struct inode *inode)
 {
 	/* Handle this task had open before we start our own, if any. */
 	handle_t *outer = ext4_journal_current_handle();
 	handle_t *handle = ext4_journal_start(inode, 2);
 	if (IS_ERR(handle))
 		return;
 	if (!outer || outer->h_transaction == handle->h_transaction) {
 		jbd_debug(5, "marking dirty.  outer handle=%p\n",
 				outer);
 		ext4_mark_inode_dirty(handle, inode);
 	} else {
 		/* This task has a transaction open against a different fs */
 		printk(KERN_EMERG "%s: transactions do not match!\n",
 		       __func__);
 	}
 	ext4_journal_stop(handle);
 }
 #if 0
 /*
  * NOTE: this function is compiled out by the surrounding "#if 0".
  *
  * Bind an inode's backing buffer_head into this transaction, to prevent
  * it from being flushed to disk early.  Unlike
  * ext4_reserve_inode_write, this leaves behind no bh reference and
  * returns no iloc structure, so the caller needs to repeat the iloc
  * lookup to mark the inode dirty later.
  *
  * With a NULL handle nothing is done and 0 is returned.  Any non-zero
  * result is passed to ext4_std_error() before it is returned.
  */
 static int ext4_pin_inode(handle_t *handle, struct inode *inode)
 {
 	struct ext4_iloc iloc;
 	int err = 0;
 	if (handle) {
 		err = ext4_get_inode_loc(inode, &iloc);
 		if (!err) {
 			BUFFER_TRACE(iloc.bh, "get_write_access");
 			/* Calls the jbd2 function directly, where
 			 * ext4_reserve_inode_write() goes through
 			 * ext4_journal_get_write_access(). */
 			err = jbd2_journal_get_write_access(handle, iloc.bh);
 			if (!err)
 				err = ext4_journal_dirty_metadata(handle,
 								  iloc.bh);
 			/* Drop the lookup reference on every path. */
 			brelse(iloc.bh);
 		}
 	}
 	ext4_std_error(inode->i_sb, err);
 	return err;
 }
 #endif
 /*
  * Turn per-inode data journaling on or off.
  *
  * @inode: inode whose EXT4_JOURNAL_DATA_FL is changed
  * @val:   non-zero sets the flag, zero clears it
  *
  * Returns 0 on success, -EROFS if the journal is already aborted, the
  * error from jbd2_journal_flush() if the journal could not be emptied (the
  * flag is then left unchanged), or the error from starting the handle or
  * from ext4_mark_inode_dirty().
  */
 int ext4_change_inode_journal_flag(struct inode *inode, int val)
 {
 	journal_t *journal;
 	handle_t *handle;
 	int err;
 	/*
 	 * We have to be very careful here: changing a data block's
 	 * journaling status dynamically is dangerous.  If we write a
 	 * data block to the journal, change the status and then delete
 	 * that block, we risk forgetting to revoke the old log record
 	 * from the journal and so a subsequent replay can corrupt data.
 	 * So, first we make sure that the journal is empty and that
 	 * nobody is changing anything.
 	 */
 	journal = EXT4_JOURNAL(inode);
 	if (is_journal_aborted(journal))
 		return -EROFS;
 	jbd2_journal_lock_updates(journal);
 	err = jbd2_journal_flush(journal);
 	if (err < 0) {
 		/*
 		 * The journal may still hold records, which is exactly the
 		 * situation described above: leave the flag alone.
 		 */
 		jbd2_journal_unlock_updates(journal);
 		return err;
 	}
 	/*
 	 * OK, there are no updates running now, and all cached data is
 	 * synced to disk.  We are now in a completely consistent state
 	 * which doesn't have anything in the journal, and we know that
 	 * no filesystem updates are running, so it is safe to modify
 	 * the inode's in-core data-journaling state flag now.
 	 */
 	if (val)
 		EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;
 	else
 		EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
 	/* The address space operations depend on the journaling mode. */
 	ext4_set_aops(inode);
 	jbd2_journal_unlock_updates(journal);
 	/* Finally we can mark the inode as dirty. */
 	handle = ext4_journal_start(inode, 1);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 	err = ext4_mark_inode_dirty(handle, inode);
 	handle->h_sync = 1;
 	ext4_journal_stop(handle);
 	ext4_std_error(inode->i_sb, err);
 	return err;
 }