Commit 281627df3eb55e1b729b9bb06fff5ff112929646
Committed by Ben Myers
1 parent 84803fb782
Exists in smarc-l5.0.0_1.0.0-ga and in 5 other branches
xfs: log file size updates at I/O completion time
Do not use unlogged metadata updates and the VFS dirty bit for updating the file size after writeback. In addition to causing various problems with updates getting delayed for far too long, this also drags in the unscalable VFS dirty tracking, and is one of the few remaining unlogged metadata updates.

Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
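In short, the commit moves the on-disk size update into a logged transaction: the writeback submission path pre-allocates a reserved transaction and attaches it to the ioend, and the I/O completion worker later joins the inode, logs the new size, and commits. A condensed sketch of the two new helpers, with all names and calls taken from the diff below (tracing and some error handling trimmed):

    /* Submission side: reserve log space and park the transaction in the ioend. */
    STATIC int
    xfs_setfilesize_trans_alloc(struct xfs_ioend *ioend)
    {
            struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
            struct xfs_trans *tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
            int error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);

            if (error) {
                    xfs_trans_cancel(tp, 0);
                    return error;
            }
            ioend->io_append_trans = tp;    /* handed off to the completion thread */
            current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
            return 0;
    }

    /* Completion side: queued via m_data_workqueue when io_append_trans is set. */
    STATIC int
    xfs_setfilesize(struct xfs_ioend *ioend)
    {
            struct xfs_inode *ip = XFS_I(ioend->io_inode);
            struct xfs_trans *tp = ioend->io_append_trans;
            xfs_fsize_t      isize;

            current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
            xfs_ilock(ip, XFS_ILOCK_EXCL);
            isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
            if (!isize) {                   /* on-disk size already covers this I/O */
                    xfs_iunlock(ip, XFS_ILOCK_EXCL);
                    xfs_trans_cancel(tp, 0);
                    return 0;
            }
            ip->i_d.di_size = isize;
            xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
            xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
            return xfs_trans_commit(tp, 0); /* size update now goes through the log */
    }

Note the PF_FSTRANS handoff: the transaction is allocated in the submission thread but committed from a workqueue, so the flag is cleared after allocation and set again in the completion path, as the in-code comments in the diff explain.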
Showing 2 changed files with 111 additions and 24 deletions
fs/xfs/xfs_aops.c
1 | /* | 1 | /* |
2 | * Copyright (c) 2000-2005 Silicon Graphics, Inc. | 2 | * Copyright (c) 2000-2005 Silicon Graphics, Inc. |
3 | * All Rights Reserved. | 3 | * All Rights Reserved. |
4 | * | 4 | * |
5 | * This program is free software; you can redistribute it and/or | 5 | * This program is free software; you can redistribute it and/or |
6 | * modify it under the terms of the GNU General Public License as | 6 | * modify it under the terms of the GNU General Public License as |
7 | * published by the Free Software Foundation. | 7 | * published by the Free Software Foundation. |
8 | * | 8 | * |
9 | * This program is distributed in the hope that it would be useful, | 9 | * This program is distributed in the hope that it would be useful, |
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 | * GNU General Public License for more details. | 12 | * GNU General Public License for more details. |
13 | * | 13 | * |
14 | * You should have received a copy of the GNU General Public License | 14 | * You should have received a copy of the GNU General Public License |
15 | * along with this program; if not, write the Free Software Foundation, | 15 | * along with this program; if not, write the Free Software Foundation, |
16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | 16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
17 | */ | 17 | */ |
18 | #include "xfs.h" | 18 | #include "xfs.h" |
19 | #include "xfs_bit.h" | 19 | #include "xfs_bit.h" |
20 | #include "xfs_log.h" | 20 | #include "xfs_log.h" |
21 | #include "xfs_inum.h" | 21 | #include "xfs_inum.h" |
22 | #include "xfs_sb.h" | 22 | #include "xfs_sb.h" |
23 | #include "xfs_ag.h" | 23 | #include "xfs_ag.h" |
24 | #include "xfs_trans.h" | 24 | #include "xfs_trans.h" |
25 | #include "xfs_mount.h" | 25 | #include "xfs_mount.h" |
26 | #include "xfs_bmap_btree.h" | 26 | #include "xfs_bmap_btree.h" |
27 | #include "xfs_dinode.h" | 27 | #include "xfs_dinode.h" |
28 | #include "xfs_inode.h" | 28 | #include "xfs_inode.h" |
29 | #include "xfs_inode_item.h" | ||
29 | #include "xfs_alloc.h" | 30 | #include "xfs_alloc.h" |
30 | #include "xfs_error.h" | 31 | #include "xfs_error.h" |
31 | #include "xfs_rw.h" | 32 | #include "xfs_rw.h" |
32 | #include "xfs_iomap.h" | 33 | #include "xfs_iomap.h" |
33 | #include "xfs_vnodeops.h" | 34 | #include "xfs_vnodeops.h" |
34 | #include "xfs_trace.h" | 35 | #include "xfs_trace.h" |
35 | #include "xfs_bmap.h" | 36 | #include "xfs_bmap.h" |
36 | #include <linux/gfp.h> | 37 | #include <linux/gfp.h> |
37 | #include <linux/mpage.h> | 38 | #include <linux/mpage.h> |
38 | #include <linux/pagevec.h> | 39 | #include <linux/pagevec.h> |
39 | #include <linux/writeback.h> | 40 | #include <linux/writeback.h> |
40 | 41 | ||
41 | void | 42 | void |
42 | xfs_count_page_state( | 43 | xfs_count_page_state( |
43 | struct page *page, | 44 | struct page *page, |
44 | int *delalloc, | 45 | int *delalloc, |
45 | int *unwritten) | 46 | int *unwritten) |
46 | { | 47 | { |
47 | struct buffer_head *bh, *head; | 48 | struct buffer_head *bh, *head; |
48 | 49 | ||
49 | *delalloc = *unwritten = 0; | 50 | *delalloc = *unwritten = 0; |
50 | 51 | ||
51 | bh = head = page_buffers(page); | 52 | bh = head = page_buffers(page); |
52 | do { | 53 | do { |
53 | if (buffer_unwritten(bh)) | 54 | if (buffer_unwritten(bh)) |
54 | (*unwritten) = 1; | 55 | (*unwritten) = 1; |
55 | else if (buffer_delay(bh)) | 56 | else if (buffer_delay(bh)) |
56 | (*delalloc) = 1; | 57 | (*delalloc) = 1; |
57 | } while ((bh = bh->b_this_page) != head); | 58 | } while ((bh = bh->b_this_page) != head); |
58 | } | 59 | } |
59 | 60 | ||
60 | STATIC struct block_device * | 61 | STATIC struct block_device * |
61 | xfs_find_bdev_for_inode( | 62 | xfs_find_bdev_for_inode( |
62 | struct inode *inode) | 63 | struct inode *inode) |
63 | { | 64 | { |
64 | struct xfs_inode *ip = XFS_I(inode); | 65 | struct xfs_inode *ip = XFS_I(inode); |
65 | struct xfs_mount *mp = ip->i_mount; | 66 | struct xfs_mount *mp = ip->i_mount; |
66 | 67 | ||
67 | if (XFS_IS_REALTIME_INODE(ip)) | 68 | if (XFS_IS_REALTIME_INODE(ip)) |
68 | return mp->m_rtdev_targp->bt_bdev; | 69 | return mp->m_rtdev_targp->bt_bdev; |
69 | else | 70 | else |
70 | return mp->m_ddev_targp->bt_bdev; | 71 | return mp->m_ddev_targp->bt_bdev; |
71 | } | 72 | } |
72 | 73 | ||
73 | /* | 74 | /* |
74 | * We're now finished for good with this ioend structure. | 75 | * We're now finished for good with this ioend structure. |
75 | * Update the page state via the associated buffer_heads, | 76 | * Update the page state via the associated buffer_heads, |
76 | * release holds on the inode and bio, and finally free | 77 | * release holds on the inode and bio, and finally free |
77 | * up memory. Do not use the ioend after this. | 78 | * up memory. Do not use the ioend after this. |
78 | */ | 79 | */ |
79 | STATIC void | 80 | STATIC void |
80 | xfs_destroy_ioend( | 81 | xfs_destroy_ioend( |
81 | xfs_ioend_t *ioend) | 82 | xfs_ioend_t *ioend) |
82 | { | 83 | { |
83 | struct buffer_head *bh, *next; | 84 | struct buffer_head *bh, *next; |
84 | 85 | ||
85 | for (bh = ioend->io_buffer_head; bh; bh = next) { | 86 | for (bh = ioend->io_buffer_head; bh; bh = next) { |
86 | next = bh->b_private; | 87 | next = bh->b_private; |
87 | bh->b_end_io(bh, !ioend->io_error); | 88 | bh->b_end_io(bh, !ioend->io_error); |
88 | } | 89 | } |
89 | 90 | ||
90 | if (ioend->io_iocb) { | 91 | if (ioend->io_iocb) { |
91 | if (ioend->io_isasync) { | 92 | if (ioend->io_isasync) { |
92 | aio_complete(ioend->io_iocb, ioend->io_error ? | 93 | aio_complete(ioend->io_iocb, ioend->io_error ? |
93 | ioend->io_error : ioend->io_result, 0); | 94 | ioend->io_error : ioend->io_result, 0); |
94 | } | 95 | } |
95 | inode_dio_done(ioend->io_inode); | 96 | inode_dio_done(ioend->io_inode); |
96 | } | 97 | } |
97 | 98 | ||
98 | mempool_free(ioend, xfs_ioend_pool); | 99 | mempool_free(ioend, xfs_ioend_pool); |
99 | } | 100 | } |
100 | 101 | ||
101 | /* | 102 | /* |
102 | * Fast and loose check if this write could update the on-disk inode size. | 103 | * Fast and loose check if this write could update the on-disk inode size. |
103 | */ | 104 | */ |
104 | static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) | 105 | static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) |
105 | { | 106 | { |
106 | return ioend->io_offset + ioend->io_size > | 107 | return ioend->io_offset + ioend->io_size > |
107 | XFS_I(ioend->io_inode)->i_d.di_size; | 108 | XFS_I(ioend->io_inode)->i_d.di_size; |
108 | } | 109 | } |
109 | 110 | ||
111 | STATIC int | ||
112 | xfs_setfilesize_trans_alloc( | ||
113 | struct xfs_ioend *ioend) | ||
114 | { | ||
115 | struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; | ||
116 | struct xfs_trans *tp; | ||
117 | int error; | ||
118 | |||
119 | tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); | ||
120 | |||
121 | error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); | ||
122 | if (error) { | ||
123 | xfs_trans_cancel(tp, 0); | ||
124 | return error; | ||
125 | } | ||
126 | |||
127 | ioend->io_append_trans = tp; | ||
128 | |||
129 | /* | ||
130 | * We hand off the transaction to the completion thread now, so | ||
131 | * clear the flag here. | ||
132 | */ | ||
133 | current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); | ||
134 | return 0; | ||
135 | } | ||
136 | |||
110 | /* | 137 | /* |
111 | * Update on-disk file size now that data has been written to disk. | 138 | * Update on-disk file size now that data has been written to disk. |
112 | */ | 139 | */ |
113 | STATIC void | 140 | STATIC int |
114 | xfs_setfilesize( | 141 | xfs_setfilesize( |
115 | struct xfs_ioend *ioend) | 142 | struct xfs_ioend *ioend) |
116 | { | 143 | { |
117 | struct xfs_inode *ip = XFS_I(ioend->io_inode); | 144 | struct xfs_inode *ip = XFS_I(ioend->io_inode); |
145 | struct xfs_trans *tp = ioend->io_append_trans; | ||
118 | xfs_fsize_t isize; | 146 | xfs_fsize_t isize; |
119 | 147 | ||
148 | /* | ||
149 | * The transaction was allocated in the I/O submission thread, | ||
150 | * thus we need to mark ourselves as being in a transaction | ||
151 | * manually. | ||
152 | */ | ||
153 | current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); | ||
154 | |||
120 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 155 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
121 | isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size); | 156 | isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size); |
122 | if (isize) { | 157 | if (!isize) { |
123 | trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size); | 158 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
124 | ip->i_d.di_size = isize; | 159 | xfs_trans_cancel(tp, 0); |
125 | xfs_mark_inode_dirty(ip); | 160 | return 0; |
126 | } | 161 | } |
127 | 162 | ||
128 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 163 | trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size); |
164 | |||
165 | ip->i_d.di_size = isize; | ||
166 | xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); | ||
167 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | ||
168 | |||
169 | return xfs_trans_commit(tp, 0); | ||
129 | } | 170 | } |
130 | 171 | ||
131 | /* | 172 | /* |
132 | * Schedule IO completion handling on the final put of an ioend. | 173 | * Schedule IO completion handling on the final put of an ioend. |
133 | * | 174 | * |
134 | * If there is no work to do we might as well call it a day and free the | 175 | * If there is no work to do we might as well call it a day and free the |
135 | * ioend right now. | 176 | * ioend right now. |
136 | */ | 177 | */ |
137 | STATIC void | 178 | STATIC void |
138 | xfs_finish_ioend( | 179 | xfs_finish_ioend( |
139 | struct xfs_ioend *ioend) | 180 | struct xfs_ioend *ioend) |
140 | { | 181 | { |
141 | if (atomic_dec_and_test(&ioend->io_remaining)) { | 182 | if (atomic_dec_and_test(&ioend->io_remaining)) { |
142 | struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; | 183 | struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; |
143 | 184 | ||
144 | if (ioend->io_type == IO_UNWRITTEN) | 185 | if (ioend->io_type == IO_UNWRITTEN) |
145 | queue_work(mp->m_unwritten_workqueue, &ioend->io_work); | 186 | queue_work(mp->m_unwritten_workqueue, &ioend->io_work); |
146 | else if (xfs_ioend_is_append(ioend)) | 187 | else if (ioend->io_append_trans) |
147 | queue_work(mp->m_data_workqueue, &ioend->io_work); | 188 | queue_work(mp->m_data_workqueue, &ioend->io_work); |
148 | else | 189 | else |
149 | xfs_destroy_ioend(ioend); | 190 | xfs_destroy_ioend(ioend); |
150 | } | 191 | } |
151 | } | 192 | } |
152 | 193 | ||
153 | /* | 194 | /* |
154 | * IO write completion. | 195 | * IO write completion. |
155 | */ | 196 | */ |
156 | STATIC void | 197 | STATIC void |
157 | xfs_end_io( | 198 | xfs_end_io( |
158 | struct work_struct *work) | 199 | struct work_struct *work) |
159 | { | 200 | { |
160 | xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); | 201 | xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); |
161 | struct xfs_inode *ip = XFS_I(ioend->io_inode); | 202 | struct xfs_inode *ip = XFS_I(ioend->io_inode); |
162 | int error = 0; | 203 | int error = 0; |
163 | 204 | ||
164 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { | 205 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { |
165 | ioend->io_error = -EIO; | 206 | ioend->io_error = -EIO; |
166 | goto done; | 207 | goto done; |
167 | } | 208 | } |
168 | if (ioend->io_error) | 209 | if (ioend->io_error) |
169 | goto done; | 210 | goto done; |
170 | 211 | ||
171 | /* | 212 | /* |
172 | * For unwritten extents we need to issue transactions to convert a | 213 | * For unwritten extents we need to issue transactions to convert a |
173 | * range to normal written extents after the data I/O has finished. | 214 | * range to normal written extents after the data I/O has finished. |
174 | */ | 215 | */ |
175 | if (ioend->io_type == IO_UNWRITTEN) { | 216 | if (ioend->io_type == IO_UNWRITTEN) { |
217 | /* | ||
218 | * For buffered I/O we never preallocate a transaction when | ||
219 | * doing the unwritten extent conversion, but for direct I/O | ||
220 | * we do not know if we are converting an unwritten extent | ||
221 | * or not at the point where we preallocate the transaction. | ||
222 | */ | ||
223 | if (ioend->io_append_trans) { | ||
224 | ASSERT(ioend->io_isdirect); | ||
225 | |||
226 | current_set_flags_nested( | ||
227 | &ioend->io_append_trans->t_pflags, PF_FSTRANS); | ||
228 | xfs_trans_cancel(ioend->io_append_trans, 0); | ||
229 | } | ||
230 | |||
176 | error = xfs_iomap_write_unwritten(ip, ioend->io_offset, | 231 | error = xfs_iomap_write_unwritten(ip, ioend->io_offset, |
177 | ioend->io_size); | 232 | ioend->io_size); |
178 | if (error) { | 233 | if (error) { |
179 | ioend->io_error = -error; | 234 | ioend->io_error = -error; |
180 | goto done; | 235 | goto done; |
181 | } | 236 | } |
237 | } else if (ioend->io_append_trans) { | ||
238 | error = xfs_setfilesize(ioend); | ||
239 | if (error) | ||
240 | ioend->io_error = -error; | ||
182 | } else { | 241 | } else { |
183 | /* | 242 | ASSERT(!xfs_ioend_is_append(ioend)); |
184 | * We might have to update the on-disk file size after | ||
185 | * extending writes. | ||
186 | */ | ||
187 | xfs_setfilesize(ioend); | ||
188 | } | 243 | } |
189 | 244 | ||
190 | done: | 245 | done: |
191 | xfs_destroy_ioend(ioend); | 246 | xfs_destroy_ioend(ioend); |
192 | } | 247 | } |
193 | 248 | ||
194 | /* | 249 | /* |
195 | * Call IO completion handling in caller context on the final put of an ioend. | 250 | * Call IO completion handling in caller context on the final put of an ioend. |
196 | */ | 251 | */ |
197 | STATIC void | 252 | STATIC void |
198 | xfs_finish_ioend_sync( | 253 | xfs_finish_ioend_sync( |
199 | struct xfs_ioend *ioend) | 254 | struct xfs_ioend *ioend) |
200 | { | 255 | { |
201 | if (atomic_dec_and_test(&ioend->io_remaining)) | 256 | if (atomic_dec_and_test(&ioend->io_remaining)) |
202 | xfs_end_io(&ioend->io_work); | 257 | xfs_end_io(&ioend->io_work); |
203 | } | 258 | } |
204 | 259 | ||
205 | /* | 260 | /* |
206 | * Allocate and initialise an IO completion structure. | 261 | * Allocate and initialise an IO completion structure. |
207 | * We need to track unwritten extent write completion here initially. | 262 | * We need to track unwritten extent write completion here initially. |
208 | * We'll need to extend this for updating the ondisk inode size later | 263 | * We'll need to extend this for updating the ondisk inode size later |
209 | * (vs. incore size). | 264 | * (vs. incore size). |
210 | */ | 265 | */ |
211 | STATIC xfs_ioend_t * | 266 | STATIC xfs_ioend_t * |
212 | xfs_alloc_ioend( | 267 | xfs_alloc_ioend( |
213 | struct inode *inode, | 268 | struct inode *inode, |
214 | unsigned int type) | 269 | unsigned int type) |
215 | { | 270 | { |
216 | xfs_ioend_t *ioend; | 271 | xfs_ioend_t *ioend; |
217 | 272 | ||
218 | ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS); | 273 | ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS); |
219 | 274 | ||
220 | /* | 275 | /* |
221 | * Set the count to 1 initially, which will prevent an I/O | 276 | * Set the count to 1 initially, which will prevent an I/O |
222 | * completion callback from happening before we have started | 277 | * completion callback from happening before we have started |
223 | * all the I/O from calling the completion routine too early. | 278 | * all the I/O from calling the completion routine too early. |
224 | */ | 279 | */ |
225 | atomic_set(&ioend->io_remaining, 1); | 280 | atomic_set(&ioend->io_remaining, 1); |
226 | ioend->io_isasync = 0; | 281 | ioend->io_isasync = 0; |
282 | ioend->io_isdirect = 0; | ||
227 | ioend->io_error = 0; | 283 | ioend->io_error = 0; |
228 | ioend->io_list = NULL; | 284 | ioend->io_list = NULL; |
229 | ioend->io_type = type; | 285 | ioend->io_type = type; |
230 | ioend->io_inode = inode; | 286 | ioend->io_inode = inode; |
231 | ioend->io_buffer_head = NULL; | 287 | ioend->io_buffer_head = NULL; |
232 | ioend->io_buffer_tail = NULL; | 288 | ioend->io_buffer_tail = NULL; |
233 | ioend->io_offset = 0; | 289 | ioend->io_offset = 0; |
234 | ioend->io_size = 0; | 290 | ioend->io_size = 0; |
235 | ioend->io_iocb = NULL; | 291 | ioend->io_iocb = NULL; |
236 | ioend->io_result = 0; | 292 | ioend->io_result = 0; |
293 | ioend->io_append_trans = NULL; | ||
237 | 294 | ||
238 | INIT_WORK(&ioend->io_work, xfs_end_io); | 295 | INIT_WORK(&ioend->io_work, xfs_end_io); |
239 | return ioend; | 296 | return ioend; |
240 | } | 297 | } |
241 | 298 | ||
242 | STATIC int | 299 | STATIC int |
243 | xfs_map_blocks( | 300 | xfs_map_blocks( |
244 | struct inode *inode, | 301 | struct inode *inode, |
245 | loff_t offset, | 302 | loff_t offset, |
246 | struct xfs_bmbt_irec *imap, | 303 | struct xfs_bmbt_irec *imap, |
247 | int type, | 304 | int type, |
248 | int nonblocking) | 305 | int nonblocking) |
249 | { | 306 | { |
250 | struct xfs_inode *ip = XFS_I(inode); | 307 | struct xfs_inode *ip = XFS_I(inode); |
251 | struct xfs_mount *mp = ip->i_mount; | 308 | struct xfs_mount *mp = ip->i_mount; |
252 | ssize_t count = 1 << inode->i_blkbits; | 309 | ssize_t count = 1 << inode->i_blkbits; |
253 | xfs_fileoff_t offset_fsb, end_fsb; | 310 | xfs_fileoff_t offset_fsb, end_fsb; |
254 | int error = 0; | 311 | int error = 0; |
255 | int bmapi_flags = XFS_BMAPI_ENTIRE; | 312 | int bmapi_flags = XFS_BMAPI_ENTIRE; |
256 | int nimaps = 1; | 313 | int nimaps = 1; |
257 | 314 | ||
258 | if (XFS_FORCED_SHUTDOWN(mp)) | 315 | if (XFS_FORCED_SHUTDOWN(mp)) |
259 | return -XFS_ERROR(EIO); | 316 | return -XFS_ERROR(EIO); |
260 | 317 | ||
261 | if (type == IO_UNWRITTEN) | 318 | if (type == IO_UNWRITTEN) |
262 | bmapi_flags |= XFS_BMAPI_IGSTATE; | 319 | bmapi_flags |= XFS_BMAPI_IGSTATE; |
263 | 320 | ||
264 | if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { | 321 | if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { |
265 | if (nonblocking) | 322 | if (nonblocking) |
266 | return -XFS_ERROR(EAGAIN); | 323 | return -XFS_ERROR(EAGAIN); |
267 | xfs_ilock(ip, XFS_ILOCK_SHARED); | 324 | xfs_ilock(ip, XFS_ILOCK_SHARED); |
268 | } | 325 | } |
269 | 326 | ||
270 | ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || | 327 | ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || |
271 | (ip->i_df.if_flags & XFS_IFEXTENTS)); | 328 | (ip->i_df.if_flags & XFS_IFEXTENTS)); |
272 | ASSERT(offset <= mp->m_maxioffset); | 329 | ASSERT(offset <= mp->m_maxioffset); |
273 | 330 | ||
274 | if (offset + count > mp->m_maxioffset) | 331 | if (offset + count > mp->m_maxioffset) |
275 | count = mp->m_maxioffset - offset; | 332 | count = mp->m_maxioffset - offset; |
276 | end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); | 333 | end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); |
277 | offset_fsb = XFS_B_TO_FSBT(mp, offset); | 334 | offset_fsb = XFS_B_TO_FSBT(mp, offset); |
278 | error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, | 335 | error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, |
279 | imap, &nimaps, bmapi_flags); | 336 | imap, &nimaps, bmapi_flags); |
280 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | 337 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
281 | 338 | ||
282 | if (error) | 339 | if (error) |
283 | return -XFS_ERROR(error); | 340 | return -XFS_ERROR(error); |
284 | 341 | ||
285 | if (type == IO_DELALLOC && | 342 | if (type == IO_DELALLOC && |
286 | (!nimaps || isnullstartblock(imap->br_startblock))) { | 343 | (!nimaps || isnullstartblock(imap->br_startblock))) { |
287 | error = xfs_iomap_write_allocate(ip, offset, count, imap); | 344 | error = xfs_iomap_write_allocate(ip, offset, count, imap); |
288 | if (!error) | 345 | if (!error) |
289 | trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); | 346 | trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); |
290 | return -XFS_ERROR(error); | 347 | return -XFS_ERROR(error); |
291 | } | 348 | } |
292 | 349 | ||
293 | #ifdef DEBUG | 350 | #ifdef DEBUG |
294 | if (type == IO_UNWRITTEN) { | 351 | if (type == IO_UNWRITTEN) { |
295 | ASSERT(nimaps); | 352 | ASSERT(nimaps); |
296 | ASSERT(imap->br_startblock != HOLESTARTBLOCK); | 353 | ASSERT(imap->br_startblock != HOLESTARTBLOCK); |
297 | ASSERT(imap->br_startblock != DELAYSTARTBLOCK); | 354 | ASSERT(imap->br_startblock != DELAYSTARTBLOCK); |
298 | } | 355 | } |
299 | #endif | 356 | #endif |
300 | if (nimaps) | 357 | if (nimaps) |
301 | trace_xfs_map_blocks_found(ip, offset, count, type, imap); | 358 | trace_xfs_map_blocks_found(ip, offset, count, type, imap); |
302 | return 0; | 359 | return 0; |
303 | } | 360 | } |
304 | 361 | ||
305 | STATIC int | 362 | STATIC int |
306 | xfs_imap_valid( | 363 | xfs_imap_valid( |
307 | struct inode *inode, | 364 | struct inode *inode, |
308 | struct xfs_bmbt_irec *imap, | 365 | struct xfs_bmbt_irec *imap, |
309 | xfs_off_t offset) | 366 | xfs_off_t offset) |
310 | { | 367 | { |
311 | offset >>= inode->i_blkbits; | 368 | offset >>= inode->i_blkbits; |
312 | 369 | ||
313 | return offset >= imap->br_startoff && | 370 | return offset >= imap->br_startoff && |
314 | offset < imap->br_startoff + imap->br_blockcount; | 371 | offset < imap->br_startoff + imap->br_blockcount; |
315 | } | 372 | } |
316 | 373 | ||
317 | /* | 374 | /* |
318 | * BIO completion handler for buffered IO. | 375 | * BIO completion handler for buffered IO. |
319 | */ | 376 | */ |
320 | STATIC void | 377 | STATIC void |
321 | xfs_end_bio( | 378 | xfs_end_bio( |
322 | struct bio *bio, | 379 | struct bio *bio, |
323 | int error) | 380 | int error) |
324 | { | 381 | { |
325 | xfs_ioend_t *ioend = bio->bi_private; | 382 | xfs_ioend_t *ioend = bio->bi_private; |
326 | 383 | ||
327 | ASSERT(atomic_read(&bio->bi_cnt) >= 1); | 384 | ASSERT(atomic_read(&bio->bi_cnt) >= 1); |
328 | ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error; | 385 | ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error; |
329 | 386 | ||
330 | /* Toss bio and pass work off to an xfsdatad thread */ | 387 | /* Toss bio and pass work off to an xfsdatad thread */ |
331 | bio->bi_private = NULL; | 388 | bio->bi_private = NULL; |
332 | bio->bi_end_io = NULL; | 389 | bio->bi_end_io = NULL; |
333 | bio_put(bio); | 390 | bio_put(bio); |
334 | 391 | ||
335 | xfs_finish_ioend(ioend); | 392 | xfs_finish_ioend(ioend); |
336 | } | 393 | } |
337 | 394 | ||
338 | STATIC void | 395 | STATIC void |
339 | xfs_submit_ioend_bio( | 396 | xfs_submit_ioend_bio( |
340 | struct writeback_control *wbc, | 397 | struct writeback_control *wbc, |
341 | xfs_ioend_t *ioend, | 398 | xfs_ioend_t *ioend, |
342 | struct bio *bio) | 399 | struct bio *bio) |
343 | { | 400 | { |
344 | struct xfs_inode *ip = XFS_I(ioend->io_inode); | ||
345 | atomic_inc(&ioend->io_remaining); | 401 | atomic_inc(&ioend->io_remaining); |
346 | bio->bi_private = ioend; | 402 | bio->bi_private = ioend; |
347 | bio->bi_end_io = xfs_end_bio; | 403 | bio->bi_end_io = xfs_end_bio; |
348 | |||
349 | /* | ||
350 | * If the I/O is beyond EOF we mark the inode dirty immediately | ||
351 | * but don't update the inode size until I/O completion. | ||
352 | */ | ||
353 | if (xfs_new_eof(ip, ioend->io_offset + ioend->io_size)) | ||
354 | xfs_mark_inode_dirty(ip); | ||
355 | |||
356 | submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio); | 404 | submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio); |
357 | } | 405 | } |
358 | 406 | ||
359 | STATIC struct bio * | 407 | STATIC struct bio * |
360 | xfs_alloc_ioend_bio( | 408 | xfs_alloc_ioend_bio( |
361 | struct buffer_head *bh) | 409 | struct buffer_head *bh) |
362 | { | 410 | { |
363 | int nvecs = bio_get_nr_vecs(bh->b_bdev); | 411 | int nvecs = bio_get_nr_vecs(bh->b_bdev); |
364 | struct bio *bio = bio_alloc(GFP_NOIO, nvecs); | 412 | struct bio *bio = bio_alloc(GFP_NOIO, nvecs); |
365 | 413 | ||
366 | ASSERT(bio->bi_private == NULL); | 414 | ASSERT(bio->bi_private == NULL); |
367 | bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); | 415 | bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); |
368 | bio->bi_bdev = bh->b_bdev; | 416 | bio->bi_bdev = bh->b_bdev; |
369 | return bio; | 417 | return bio; |
370 | } | 418 | } |
371 | 419 | ||
372 | STATIC void | 420 | STATIC void |
373 | xfs_start_buffer_writeback( | 421 | xfs_start_buffer_writeback( |
374 | struct buffer_head *bh) | 422 | struct buffer_head *bh) |
375 | { | 423 | { |
376 | ASSERT(buffer_mapped(bh)); | 424 | ASSERT(buffer_mapped(bh)); |
377 | ASSERT(buffer_locked(bh)); | 425 | ASSERT(buffer_locked(bh)); |
378 | ASSERT(!buffer_delay(bh)); | 426 | ASSERT(!buffer_delay(bh)); |
379 | ASSERT(!buffer_unwritten(bh)); | 427 | ASSERT(!buffer_unwritten(bh)); |
380 | 428 | ||
381 | mark_buffer_async_write(bh); | 429 | mark_buffer_async_write(bh); |
382 | set_buffer_uptodate(bh); | 430 | set_buffer_uptodate(bh); |
383 | clear_buffer_dirty(bh); | 431 | clear_buffer_dirty(bh); |
384 | } | 432 | } |
385 | 433 | ||
386 | STATIC void | 434 | STATIC void |
387 | xfs_start_page_writeback( | 435 | xfs_start_page_writeback( |
388 | struct page *page, | 436 | struct page *page, |
389 | int clear_dirty, | 437 | int clear_dirty, |
390 | int buffers) | 438 | int buffers) |
391 | { | 439 | { |
392 | ASSERT(PageLocked(page)); | 440 | ASSERT(PageLocked(page)); |
393 | ASSERT(!PageWriteback(page)); | 441 | ASSERT(!PageWriteback(page)); |
394 | if (clear_dirty) | 442 | if (clear_dirty) |
395 | clear_page_dirty_for_io(page); | 443 | clear_page_dirty_for_io(page); |
396 | set_page_writeback(page); | 444 | set_page_writeback(page); |
397 | unlock_page(page); | 445 | unlock_page(page); |
398 | /* If no buffers on the page are to be written, finish it here */ | 446 | /* If no buffers on the page are to be written, finish it here */ |
399 | if (!buffers) | 447 | if (!buffers) |
400 | end_page_writeback(page); | 448 | end_page_writeback(page); |
401 | } | 449 | } |
402 | 450 | ||
403 | static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) | 451 | static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) |
404 | { | 452 | { |
405 | return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); | 453 | return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); |
406 | } | 454 | } |
407 | 455 | ||
408 | /* | 456 | /* |
409 | * Submit all of the bios for all of the ioends we have saved up, covering the | 457 | * Submit all of the bios for all of the ioends we have saved up, covering the |
410 | * initial writepage page and also any probed pages. | 458 | * initial writepage page and also any probed pages. |
411 | * | 459 | * |
412 | * Because we may have multiple ioends spanning a page, we need to start | 460 | * Because we may have multiple ioends spanning a page, we need to start |
413 | * writeback on all the buffers before we submit them for I/O. If we mark the | 461 | * writeback on all the buffers before we submit them for I/O. If we mark the |
414 | * buffers as we got, then we can end up with a page that only has buffers | 462 | * buffers as we got, then we can end up with a page that only has buffers |
415 | * marked async write and I/O complete on can occur before we mark the other | 463 | * marked async write and I/O complete on can occur before we mark the other |
416 | * buffers async write. | 464 | * buffers async write. |
417 | * | 465 | * |
418 | * The end result of this is that we trip a bug in end_page_writeback() because | 466 | * The end result of this is that we trip a bug in end_page_writeback() because |
419 | * we call it twice for the one page as the code in end_buffer_async_write() | 467 | * we call it twice for the one page as the code in end_buffer_async_write() |
420 | * assumes that all buffers on the page are started at the same time. | 468 | * assumes that all buffers on the page are started at the same time. |
421 | * | 469 | * |
422 | * The fix is two passes across the ioend list - one to start writeback on the | 470 | * The fix is two passes across the ioend list - one to start writeback on the |
423 | * buffer_heads, and then submit them for I/O on the second pass. | 471 | * buffer_heads, and then submit them for I/O on the second pass. |
424 | */ | 472 | */ |
425 | STATIC void | 473 | STATIC void |
426 | xfs_submit_ioend( | 474 | xfs_submit_ioend( |
427 | struct writeback_control *wbc, | 475 | struct writeback_control *wbc, |
428 | xfs_ioend_t *ioend) | 476 | xfs_ioend_t *ioend) |
429 | { | 477 | { |
430 | xfs_ioend_t *head = ioend; | 478 | xfs_ioend_t *head = ioend; |
431 | xfs_ioend_t *next; | 479 | xfs_ioend_t *next; |
432 | struct buffer_head *bh; | 480 | struct buffer_head *bh; |
433 | struct bio *bio; | 481 | struct bio *bio; |
434 | sector_t lastblock = 0; | 482 | sector_t lastblock = 0; |
435 | 483 | ||
436 | /* Pass 1 - start writeback */ | 484 | /* Pass 1 - start writeback */ |
437 | do { | 485 | do { |
438 | next = ioend->io_list; | 486 | next = ioend->io_list; |
439 | for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) | 487 | for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) |
440 | xfs_start_buffer_writeback(bh); | 488 | xfs_start_buffer_writeback(bh); |
441 | } while ((ioend = next) != NULL); | 489 | } while ((ioend = next) != NULL); |
442 | 490 | ||
443 | /* Pass 2 - submit I/O */ | 491 | /* Pass 2 - submit I/O */ |
444 | ioend = head; | 492 | ioend = head; |
445 | do { | 493 | do { |
446 | next = ioend->io_list; | 494 | next = ioend->io_list; |
447 | bio = NULL; | 495 | bio = NULL; |
448 | 496 | ||
449 | for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { | 497 | for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { |
450 | 498 | ||
451 | if (!bio) { | 499 | if (!bio) { |
452 | retry: | 500 | retry: |
453 | bio = xfs_alloc_ioend_bio(bh); | 501 | bio = xfs_alloc_ioend_bio(bh); |
454 | } else if (bh->b_blocknr != lastblock + 1) { | 502 | } else if (bh->b_blocknr != lastblock + 1) { |
455 | xfs_submit_ioend_bio(wbc, ioend, bio); | 503 | xfs_submit_ioend_bio(wbc, ioend, bio); |
456 | goto retry; | 504 | goto retry; |
457 | } | 505 | } |
458 | 506 | ||
459 | if (bio_add_buffer(bio, bh) != bh->b_size) { | 507 | if (bio_add_buffer(bio, bh) != bh->b_size) { |
460 | xfs_submit_ioend_bio(wbc, ioend, bio); | 508 | xfs_submit_ioend_bio(wbc, ioend, bio); |
461 | goto retry; | 509 | goto retry; |
462 | } | 510 | } |
463 | 511 | ||
464 | lastblock = bh->b_blocknr; | 512 | lastblock = bh->b_blocknr; |
465 | } | 513 | } |
466 | if (bio) | 514 | if (bio) |
467 | xfs_submit_ioend_bio(wbc, ioend, bio); | 515 | xfs_submit_ioend_bio(wbc, ioend, bio); |
468 | xfs_finish_ioend(ioend); | 516 | xfs_finish_ioend(ioend); |
469 | } while ((ioend = next) != NULL); | 517 | } while ((ioend = next) != NULL); |
470 | } | 518 | } |
471 | 519 | ||
472 | /* | 520 | /* |
473 | * Cancel submission of all buffer_heads so far in this endio. | 521 | * Cancel submission of all buffer_heads so far in this endio. |
474 | * Toss the endio too. Only ever called for the initial page | 522 | * Toss the endio too. Only ever called for the initial page |
475 | * in a writepage request, so only ever one page. | 523 | * in a writepage request, so only ever one page. |
476 | */ | 524 | */ |
477 | STATIC void | 525 | STATIC void |
478 | xfs_cancel_ioend( | 526 | xfs_cancel_ioend( |
479 | xfs_ioend_t *ioend) | 527 | xfs_ioend_t *ioend) |
480 | { | 528 | { |
481 | xfs_ioend_t *next; | 529 | xfs_ioend_t *next; |
482 | struct buffer_head *bh, *next_bh; | 530 | struct buffer_head *bh, *next_bh; |
483 | 531 | ||
484 | do { | 532 | do { |
485 | next = ioend->io_list; | 533 | next = ioend->io_list; |
486 | bh = ioend->io_buffer_head; | 534 | bh = ioend->io_buffer_head; |
487 | do { | 535 | do { |
488 | next_bh = bh->b_private; | 536 | next_bh = bh->b_private; |
489 | clear_buffer_async_write(bh); | 537 | clear_buffer_async_write(bh); |
490 | unlock_buffer(bh); | 538 | unlock_buffer(bh); |
491 | } while ((bh = next_bh) != NULL); | 539 | } while ((bh = next_bh) != NULL); |
492 | 540 | ||
493 | mempool_free(ioend, xfs_ioend_pool); | 541 | mempool_free(ioend, xfs_ioend_pool); |
494 | } while ((ioend = next) != NULL); | 542 | } while ((ioend = next) != NULL); |
495 | } | 543 | } |
496 | 544 | ||
497 | /* | 545 | /* |
498 | * Test to see if we've been building up a completion structure for | 546 | * Test to see if we've been building up a completion structure for |
499 | * earlier buffers -- if so, we try to append to this ioend if we | 547 | * earlier buffers -- if so, we try to append to this ioend if we |
500 | * can, otherwise we finish off any current ioend and start another. | 548 | * can, otherwise we finish off any current ioend and start another. |
501 | * Return true if we've finished the given ioend. | 549 | * Return true if we've finished the given ioend. |
502 | */ | 550 | */ |
503 | STATIC void | 551 | STATIC void |
504 | xfs_add_to_ioend( | 552 | xfs_add_to_ioend( |
505 | struct inode *inode, | 553 | struct inode *inode, |
506 | struct buffer_head *bh, | 554 | struct buffer_head *bh, |
507 | xfs_off_t offset, | 555 | xfs_off_t offset, |
508 | unsigned int type, | 556 | unsigned int type, |
509 | xfs_ioend_t **result, | 557 | xfs_ioend_t **result, |
510 | int need_ioend) | 558 | int need_ioend) |
511 | { | 559 | { |
512 | xfs_ioend_t *ioend = *result; | 560 | xfs_ioend_t *ioend = *result; |
513 | 561 | ||
514 | if (!ioend || need_ioend || type != ioend->io_type) { | 562 | if (!ioend || need_ioend || type != ioend->io_type) { |
515 | xfs_ioend_t *previous = *result; | 563 | xfs_ioend_t *previous = *result; |
516 | 564 | ||
517 | ioend = xfs_alloc_ioend(inode, type); | 565 | ioend = xfs_alloc_ioend(inode, type); |
518 | ioend->io_offset = offset; | 566 | ioend->io_offset = offset; |
519 | ioend->io_buffer_head = bh; | 567 | ioend->io_buffer_head = bh; |
520 | ioend->io_buffer_tail = bh; | 568 | ioend->io_buffer_tail = bh; |
521 | if (previous) | 569 | if (previous) |
522 | previous->io_list = ioend; | 570 | previous->io_list = ioend; |
523 | *result = ioend; | 571 | *result = ioend; |
524 | } else { | 572 | } else { |
525 | ioend->io_buffer_tail->b_private = bh; | 573 | ioend->io_buffer_tail->b_private = bh; |
526 | ioend->io_buffer_tail = bh; | 574 | ioend->io_buffer_tail = bh; |
527 | } | 575 | } |
528 | 576 | ||
529 | bh->b_private = NULL; | 577 | bh->b_private = NULL; |
530 | ioend->io_size += bh->b_size; | 578 | ioend->io_size += bh->b_size; |
531 | } | 579 | } |
532 | 580 | ||
533 | STATIC void | 581 | STATIC void |
534 | xfs_map_buffer( | 582 | xfs_map_buffer( |
535 | struct inode *inode, | 583 | struct inode *inode, |
536 | struct buffer_head *bh, | 584 | struct buffer_head *bh, |
537 | struct xfs_bmbt_irec *imap, | 585 | struct xfs_bmbt_irec *imap, |
538 | xfs_off_t offset) | 586 | xfs_off_t offset) |
539 | { | 587 | { |
540 | sector_t bn; | 588 | sector_t bn; |
541 | struct xfs_mount *m = XFS_I(inode)->i_mount; | 589 | struct xfs_mount *m = XFS_I(inode)->i_mount; |
542 | xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff); | 590 | xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff); |
543 | xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock); | 591 | xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock); |
544 | 592 | ||
545 | ASSERT(imap->br_startblock != HOLESTARTBLOCK); | 593 | ASSERT(imap->br_startblock != HOLESTARTBLOCK); |
546 | ASSERT(imap->br_startblock != DELAYSTARTBLOCK); | 594 | ASSERT(imap->br_startblock != DELAYSTARTBLOCK); |
547 | 595 | ||
548 | bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) + | 596 | bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) + |
549 | ((offset - iomap_offset) >> inode->i_blkbits); | 597 | ((offset - iomap_offset) >> inode->i_blkbits); |
550 | 598 | ||
551 | ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode))); | 599 | ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode))); |
552 | 600 | ||
553 | bh->b_blocknr = bn; | 601 | bh->b_blocknr = bn; |
554 | set_buffer_mapped(bh); | 602 | set_buffer_mapped(bh); |
555 | } | 603 | } |
556 | 604 | ||
557 | STATIC void | 605 | STATIC void |
558 | xfs_map_at_offset( | 606 | xfs_map_at_offset( |
559 | struct inode *inode, | 607 | struct inode *inode, |
560 | struct buffer_head *bh, | 608 | struct buffer_head *bh, |
561 | struct xfs_bmbt_irec *imap, | 609 | struct xfs_bmbt_irec *imap, |
562 | xfs_off_t offset) | 610 | xfs_off_t offset) |
563 | { | 611 | { |
564 | ASSERT(imap->br_startblock != HOLESTARTBLOCK); | 612 | ASSERT(imap->br_startblock != HOLESTARTBLOCK); |
565 | ASSERT(imap->br_startblock != DELAYSTARTBLOCK); | 613 | ASSERT(imap->br_startblock != DELAYSTARTBLOCK); |
566 | 614 | ||
567 | xfs_map_buffer(inode, bh, imap, offset); | 615 | xfs_map_buffer(inode, bh, imap, offset); |
568 | set_buffer_mapped(bh); | 616 | set_buffer_mapped(bh); |
569 | clear_buffer_delay(bh); | 617 | clear_buffer_delay(bh); |
570 | clear_buffer_unwritten(bh); | 618 | clear_buffer_unwritten(bh); |
571 | } | 619 | } |
572 | 620 | ||
573 | /* | 621 | /* |
574 | * Test if a given page is suitable for writing as part of an unwritten | 622 | * Test if a given page is suitable for writing as part of an unwritten |
575 | * or delayed allocate extent. | 623 | * or delayed allocate extent. |
576 | */ | 624 | */ |
577 | STATIC int | 625 | STATIC int |
578 | xfs_is_delayed_page( | 626 | xfs_is_delayed_page( |
579 | struct page *page, | 627 | struct page *page, |
580 | unsigned int type) | 628 | unsigned int type) |
581 | { | 629 | { |
582 | if (PageWriteback(page)) | 630 | if (PageWriteback(page)) |
583 | return 0; | 631 | return 0; |
584 | 632 | ||
585 | if (page->mapping && page_has_buffers(page)) { | 633 | if (page->mapping && page_has_buffers(page)) { |
586 | struct buffer_head *bh, *head; | 634 | struct buffer_head *bh, *head; |
587 | int acceptable = 0; | 635 | int acceptable = 0; |
588 | 636 | ||
589 | bh = head = page_buffers(page); | 637 | bh = head = page_buffers(page); |
590 | do { | 638 | do { |
591 | if (buffer_unwritten(bh)) | 639 | if (buffer_unwritten(bh)) |
592 | acceptable = (type == IO_UNWRITTEN); | 640 | acceptable = (type == IO_UNWRITTEN); |
593 | else if (buffer_delay(bh)) | 641 | else if (buffer_delay(bh)) |
594 | acceptable = (type == IO_DELALLOC); | 642 | acceptable = (type == IO_DELALLOC); |
595 | else if (buffer_dirty(bh) && buffer_mapped(bh)) | 643 | else if (buffer_dirty(bh) && buffer_mapped(bh)) |
596 | acceptable = (type == IO_OVERWRITE); | 644 | acceptable = (type == IO_OVERWRITE); |
597 | else | 645 | else |
598 | break; | 646 | break; |
599 | } while ((bh = bh->b_this_page) != head); | 647 | } while ((bh = bh->b_this_page) != head); |
600 | 648 | ||
601 | if (acceptable) | 649 | if (acceptable) |
602 | return 1; | 650 | return 1; |
603 | } | 651 | } |
604 | 652 | ||
605 | return 0; | 653 | return 0; |
606 | } | 654 | } |
607 | 655 | ||
608 | /* | 656 | /* |
609 | * Allocate & map buffers for page given the extent map. Write it out. | 657 | * Allocate & map buffers for page given the extent map. Write it out. |
610 | * except for the original page of a writepage, this is called on | 658 | * except for the original page of a writepage, this is called on |
611 | * delalloc/unwritten pages only, for the original page it is possible | 659 | * delalloc/unwritten pages only, for the original page it is possible |
612 | * that the page has no mapping at all. | 660 | * that the page has no mapping at all. |
613 | */ | 661 | */ |
614 | STATIC int | 662 | STATIC int |
615 | xfs_convert_page( | 663 | xfs_convert_page( |
616 | struct inode *inode, | 664 | struct inode *inode, |
617 | struct page *page, | 665 | struct page *page, |
618 | loff_t tindex, | 666 | loff_t tindex, |
619 | struct xfs_bmbt_irec *imap, | 667 | struct xfs_bmbt_irec *imap, |
620 | xfs_ioend_t **ioendp, | 668 | xfs_ioend_t **ioendp, |
621 | struct writeback_control *wbc) | 669 | struct writeback_control *wbc) |
622 | { | 670 | { |
623 | struct buffer_head *bh, *head; | 671 | struct buffer_head *bh, *head; |
624 | xfs_off_t end_offset; | 672 | xfs_off_t end_offset; |
625 | unsigned long p_offset; | 673 | unsigned long p_offset; |
626 | unsigned int type; | 674 | unsigned int type; |
627 | int len, page_dirty; | 675 | int len, page_dirty; |
628 | int count = 0, done = 0, uptodate = 1; | 676 | int count = 0, done = 0, uptodate = 1; |
629 | xfs_off_t offset = page_offset(page); | 677 | xfs_off_t offset = page_offset(page); |
630 | 678 | ||
631 | if (page->index != tindex) | 679 | if (page->index != tindex) |
632 | goto fail; | 680 | goto fail; |
633 | if (!trylock_page(page)) | 681 | if (!trylock_page(page)) |
634 | goto fail; | 682 | goto fail; |
635 | if (PageWriteback(page)) | 683 | if (PageWriteback(page)) |
636 | goto fail_unlock_page; | 684 | goto fail_unlock_page; |
637 | if (page->mapping != inode->i_mapping) | 685 | if (page->mapping != inode->i_mapping) |
638 | goto fail_unlock_page; | 686 | goto fail_unlock_page; |
639 | if (!xfs_is_delayed_page(page, (*ioendp)->io_type)) | 687 | if (!xfs_is_delayed_page(page, (*ioendp)->io_type)) |
640 | goto fail_unlock_page; | 688 | goto fail_unlock_page; |
641 | 689 | ||
642 | /* | 690 | /* |
643 | * page_dirty is initially a count of buffers on the page before | 691 | * page_dirty is initially a count of buffers on the page before |
644 | * EOF and is decremented as we move each into a cleanable state. | 692 | * EOF and is decremented as we move each into a cleanable state. |
645 | * | 693 | * |
646 | * Derivation: | 694 | * Derivation: |
647 | * | 695 | * |
648 | * End offset is the highest offset that this page should represent. | 696 | * End offset is the highest offset that this page should represent. |
649 | * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) | 697 | * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) |
650 | * will evaluate non-zero and be less than PAGE_CACHE_SIZE and | 698 | * will evaluate non-zero and be less than PAGE_CACHE_SIZE and |
651 | * hence give us the correct page_dirty count. On any other page, | 699 | * hence give us the correct page_dirty count. On any other page, |
652 | * it will be zero and in that case we need page_dirty to be the | 700 | * it will be zero and in that case we need page_dirty to be the |
653 | * count of buffers on the page. | 701 | * count of buffers on the page. |
654 | */ | 702 | */ |
655 | end_offset = min_t(unsigned long long, | 703 | end_offset = min_t(unsigned long long, |
656 | (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, | 704 | (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, |
657 | i_size_read(inode)); | 705 | i_size_read(inode)); |
658 | 706 | ||
659 | len = 1 << inode->i_blkbits; | 707 | len = 1 << inode->i_blkbits; |
660 | p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), | 708 | p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), |
661 | PAGE_CACHE_SIZE); | 709 | PAGE_CACHE_SIZE); |
662 | p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; | 710 | p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; |
663 | page_dirty = p_offset / len; | 711 | page_dirty = p_offset / len; |
664 | 712 | ||
665 | bh = head = page_buffers(page); | 713 | bh = head = page_buffers(page); |
666 | do { | 714 | do { |
667 | if (offset >= end_offset) | 715 | if (offset >= end_offset) |
668 | break; | 716 | break; |
669 | if (!buffer_uptodate(bh)) | 717 | if (!buffer_uptodate(bh)) |
670 | uptodate = 0; | 718 | uptodate = 0; |
671 | if (!(PageUptodate(page) || buffer_uptodate(bh))) { | 719 | if (!(PageUptodate(page) || buffer_uptodate(bh))) { |
672 | done = 1; | 720 | done = 1; |
673 | continue; | 721 | continue; |
674 | } | 722 | } |
675 | 723 | ||
676 | if (buffer_unwritten(bh) || buffer_delay(bh) || | 724 | if (buffer_unwritten(bh) || buffer_delay(bh) || |
677 | buffer_mapped(bh)) { | 725 | buffer_mapped(bh)) { |
678 | if (buffer_unwritten(bh)) | 726 | if (buffer_unwritten(bh)) |
679 | type = IO_UNWRITTEN; | 727 | type = IO_UNWRITTEN; |
680 | else if (buffer_delay(bh)) | 728 | else if (buffer_delay(bh)) |
681 | type = IO_DELALLOC; | 729 | type = IO_DELALLOC; |
682 | else | 730 | else |
683 | type = IO_OVERWRITE; | 731 | type = IO_OVERWRITE; |
684 | 732 | ||
685 | if (!xfs_imap_valid(inode, imap, offset)) { | 733 | if (!xfs_imap_valid(inode, imap, offset)) { |
686 | done = 1; | 734 | done = 1; |
687 | continue; | 735 | continue; |
688 | } | 736 | } |
689 | 737 | ||
690 | lock_buffer(bh); | 738 | lock_buffer(bh); |
691 | if (type != IO_OVERWRITE) | 739 | if (type != IO_OVERWRITE) |
692 | xfs_map_at_offset(inode, bh, imap, offset); | 740 | xfs_map_at_offset(inode, bh, imap, offset); |
693 | xfs_add_to_ioend(inode, bh, offset, type, | 741 | xfs_add_to_ioend(inode, bh, offset, type, |
694 | ioendp, done); | 742 | ioendp, done); |
695 | 743 | ||
696 | page_dirty--; | 744 | page_dirty--; |
697 | count++; | 745 | count++; |
698 | } else { | 746 | } else { |
699 | done = 1; | 747 | done = 1; |
700 | } | 748 | } |
701 | } while (offset += len, (bh = bh->b_this_page) != head); | 749 | } while (offset += len, (bh = bh->b_this_page) != head); |
702 | 750 | ||
703 | if (uptodate && bh == head) | 751 | if (uptodate && bh == head) |
704 | SetPageUptodate(page); | 752 | SetPageUptodate(page); |
705 | 753 | ||
706 | if (count) { | 754 | if (count) { |
707 | if (--wbc->nr_to_write <= 0 && | 755 | if (--wbc->nr_to_write <= 0 && |
708 | wbc->sync_mode == WB_SYNC_NONE) | 756 | wbc->sync_mode == WB_SYNC_NONE) |
709 | done = 1; | 757 | done = 1; |
710 | } | 758 | } |
711 | xfs_start_page_writeback(page, !page_dirty, count); | 759 | xfs_start_page_writeback(page, !page_dirty, count); |
712 | 760 | ||
713 | return done; | 761 | return done; |
714 | fail_unlock_page: | 762 | fail_unlock_page: |
715 | unlock_page(page); | 763 | unlock_page(page); |
716 | fail: | 764 | fail: |
717 | return 1; | 765 | return 1; |
718 | } | 766 | } |
719 | 767 | ||
720 | /* | 768 | /* |
721 | * Convert & write out a cluster of pages in the same extent as defined | 769 | * Convert & write out a cluster of pages in the same extent as defined |
722 | * by mp and following the start page. | 770 | * by mp and following the start page. |
723 | */ | 771 | */ |
724 | STATIC void | 772 | STATIC void |
725 | xfs_cluster_write( | 773 | xfs_cluster_write( |
726 | struct inode *inode, | 774 | struct inode *inode, |
727 | pgoff_t tindex, | 775 | pgoff_t tindex, |
728 | struct xfs_bmbt_irec *imap, | 776 | struct xfs_bmbt_irec *imap, |
729 | xfs_ioend_t **ioendp, | 777 | xfs_ioend_t **ioendp, |
730 | struct writeback_control *wbc, | 778 | struct writeback_control *wbc, |
731 | pgoff_t tlast) | 779 | pgoff_t tlast) |
732 | { | 780 | { |
733 | struct pagevec pvec; | 781 | struct pagevec pvec; |
734 | int done = 0, i; | 782 | int done = 0, i; |
735 | 783 | ||
736 | pagevec_init(&pvec, 0); | 784 | pagevec_init(&pvec, 0); |
737 | while (!done && tindex <= tlast) { | 785 | while (!done && tindex <= tlast) { |
738 | unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); | 786 | unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); |
739 | 787 | ||
740 | if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) | 788 | if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) |
741 | break; | 789 | break; |
742 | 790 | ||
743 | for (i = 0; i < pagevec_count(&pvec); i++) { | 791 | for (i = 0; i < pagevec_count(&pvec); i++) { |
744 | done = xfs_convert_page(inode, pvec.pages[i], tindex++, | 792 | done = xfs_convert_page(inode, pvec.pages[i], tindex++, |
745 | imap, ioendp, wbc); | 793 | imap, ioendp, wbc); |
746 | if (done) | 794 | if (done) |
747 | break; | 795 | break; |
748 | } | 796 | } |
749 | 797 | ||
750 | pagevec_release(&pvec); | 798 | pagevec_release(&pvec); |
751 | cond_resched(); | 799 | cond_resched(); |
752 | } | 800 | } |
753 | } | 801 | } |
754 | 802 | ||
755 | STATIC void | 803 | STATIC void |
756 | xfs_vm_invalidatepage( | 804 | xfs_vm_invalidatepage( |
757 | struct page *page, | 805 | struct page *page, |
758 | unsigned long offset) | 806 | unsigned long offset) |
759 | { | 807 | { |
760 | trace_xfs_invalidatepage(page->mapping->host, page, offset); | 808 | trace_xfs_invalidatepage(page->mapping->host, page, offset); |
761 | block_invalidatepage(page, offset); | 809 | block_invalidatepage(page, offset); |
762 | } | 810 | } |
763 | 811 | ||
764 | /* | 812 | /* |
765 | * If the page has delalloc buffers on it, we need to punch them out before we | 813 | * If the page has delalloc buffers on it, we need to punch them out before we |
766 | * invalidate the page. If we don't, we leave a stale delalloc mapping on the | 814 | * invalidate the page. If we don't, we leave a stale delalloc mapping on the |
767 | * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read | 815 | * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read |
768 | * is done on that same region - the delalloc extent is returned when none is | 816 | * is done on that same region - the delalloc extent is returned when none is |
769 | * supposed to be there. | 817 | * supposed to be there. |
770 | * | 818 | * |
771 | * We prevent this by truncating away the delalloc regions on the page before | 819 | * We prevent this by truncating away the delalloc regions on the page before |
772 | * invalidating it. Because they are delalloc, we can do this without needing a | 820 | * invalidating it. Because they are delalloc, we can do this without needing a |
773 | * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this | 821 | * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this |
774 | * truncation without a transaction as there is no space left for block | 822 | * truncation without a transaction as there is no space left for block |
775 | * reservation (typically why we see a ENOSPC in writeback). | 823 | * reservation (typically why we see a ENOSPC in writeback). |
776 | * | 824 | * |
777 | * This is not a performance critical path, so for now just do the punching a | 825 | * This is not a performance critical path, so for now just do the punching a |
778 | * buffer head at a time. | 826 | * buffer head at a time. |
779 | */ | 827 | */ |
780 | STATIC void | 828 | STATIC void |
781 | xfs_aops_discard_page( | 829 | xfs_aops_discard_page( |
782 | struct page *page) | 830 | struct page *page) |
783 | { | 831 | { |
784 | struct inode *inode = page->mapping->host; | 832 | struct inode *inode = page->mapping->host; |
785 | struct xfs_inode *ip = XFS_I(inode); | 833 | struct xfs_inode *ip = XFS_I(inode); |
786 | struct buffer_head *bh, *head; | 834 | struct buffer_head *bh, *head; |
787 | loff_t offset = page_offset(page); | 835 | loff_t offset = page_offset(page); |
788 | 836 | ||
789 | if (!xfs_is_delayed_page(page, IO_DELALLOC)) | 837 | if (!xfs_is_delayed_page(page, IO_DELALLOC)) |
790 | goto out_invalidate; | 838 | goto out_invalidate; |
791 | 839 | ||
792 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) | 840 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) |
793 | goto out_invalidate; | 841 | goto out_invalidate; |
794 | 842 | ||
795 | xfs_alert(ip->i_mount, | 843 | xfs_alert(ip->i_mount, |
796 | "page discard on page %p, inode 0x%llx, offset %llu.", | 844 | "page discard on page %p, inode 0x%llx, offset %llu.", |
797 | page, ip->i_ino, offset); | 845 | page, ip->i_ino, offset); |
798 | 846 | ||
799 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 847 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
800 | bh = head = page_buffers(page); | 848 | bh = head = page_buffers(page); |
801 | do { | 849 | do { |
802 | int error; | 850 | int error; |
803 | xfs_fileoff_t start_fsb; | 851 | xfs_fileoff_t start_fsb; |
804 | 852 | ||
805 | if (!buffer_delay(bh)) | 853 | if (!buffer_delay(bh)) |
806 | goto next_buffer; | 854 | goto next_buffer; |
807 | 855 | ||
808 | start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); | 856 | start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); |
809 | error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1); | 857 | error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1); |
810 | if (error) { | 858 | if (error) { |
811 | /* something screwed, just bail */ | 859 | /* something screwed, just bail */ |
812 | if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { | 860 | if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { |
813 | xfs_alert(ip->i_mount, | 861 | xfs_alert(ip->i_mount, |
814 | "page discard unable to remove delalloc mapping."); | 862 | "page discard unable to remove delalloc mapping."); |
815 | } | 863 | } |
816 | break; | 864 | break; |
817 | } | 865 | } |
818 | next_buffer: | 866 | next_buffer: |
819 | offset += 1 << inode->i_blkbits; | 867 | offset += 1 << inode->i_blkbits; |
820 | 868 | ||
821 | } while ((bh = bh->b_this_page) != head); | 869 | } while ((bh = bh->b_this_page) != head); |
822 | 870 | ||
823 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 871 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
824 | out_invalidate: | 872 | out_invalidate: |
825 | xfs_vm_invalidatepage(page, 0); | 873 | xfs_vm_invalidatepage(page, 0); |
826 | return; | 874 | return; |
827 | } | 875 | } |
828 | 876 | ||
829 | /* | 877 | /* |
830 | * Write out a dirty page. | 878 | * Write out a dirty page. |
831 | * | 879 | * |
832 | * For delalloc space on the page we need to allocate space and flush it. | 880 | * For delalloc space on the page we need to allocate space and flush it. |
833 | * For unwritten space on the page we need to start the conversion to | 881 | * For unwritten space on the page we need to start the conversion to |
834 | * regular allocated space. | 882 | * regular allocated space. |
835 | * For any other dirty buffer heads on the page we should flush them. | 883 | * For any other dirty buffer heads on the page we should flush them. |
836 | */ | 884 | */ |
837 | STATIC int | 885 | STATIC int |
838 | xfs_vm_writepage( | 886 | xfs_vm_writepage( |
839 | struct page *page, | 887 | struct page *page, |
840 | struct writeback_control *wbc) | 888 | struct writeback_control *wbc) |
841 | { | 889 | { |
842 | struct inode *inode = page->mapping->host; | 890 | struct inode *inode = page->mapping->host; |
843 | struct buffer_head *bh, *head; | 891 | struct buffer_head *bh, *head; |
844 | struct xfs_bmbt_irec imap; | 892 | struct xfs_bmbt_irec imap; |
845 | xfs_ioend_t *ioend = NULL, *iohead = NULL; | 893 | xfs_ioend_t *ioend = NULL, *iohead = NULL; |
846 | loff_t offset; | 894 | loff_t offset; |
847 | unsigned int type; | 895 | unsigned int type; |
848 | __uint64_t end_offset; | 896 | __uint64_t end_offset; |
849 | pgoff_t end_index, last_index; | 897 | pgoff_t end_index, last_index; |
850 | ssize_t len; | 898 | ssize_t len; |
851 | int err, imap_valid = 0, uptodate = 1; | 899 | int err, imap_valid = 0, uptodate = 1; |
852 | int count = 0; | 900 | int count = 0; |
853 | int nonblocking = 0; | 901 | int nonblocking = 0; |
854 | 902 | ||
855 | trace_xfs_writepage(inode, page, 0); | 903 | trace_xfs_writepage(inode, page, 0); |
856 | 904 | ||
857 | ASSERT(page_has_buffers(page)); | 905 | ASSERT(page_has_buffers(page)); |
858 | 906 | ||
859 | /* | 907 | /* |
860 | * Refuse to write the page out if we are called from reclaim context. | 908 | * Refuse to write the page out if we are called from reclaim context. |
861 | * | 909 | * |
862 | * This avoids stack overflows when called from deeply used stacks in | 910 | * This avoids stack overflows when called from deeply used stacks in |
863 | * random callers for direct reclaim or memcg reclaim. We explicitly | 911 | * random callers for direct reclaim or memcg reclaim. We explicitly |
864 | * allow reclaim from kswapd as the stack usage there is relatively low. | 912 | * allow reclaim from kswapd as the stack usage there is relatively low. |
865 | * | 913 | * |
866 | * This should never happen except in the case of a VM regression so | 914 | * This should never happen except in the case of a VM regression so |
867 | * warn about it. | 915 | * warn about it. |
868 | */ | 916 | */ |
869 | if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == | 917 | if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == |
870 | PF_MEMALLOC)) | 918 | PF_MEMALLOC)) |
871 | goto redirty; | 919 | goto redirty; |
872 | 920 | ||
873 | /* | 921 | /* |
874 | * Given that we do not allow direct reclaim to call us, we should | 922 | * Given that we do not allow direct reclaim to call us, we should |
875 | * never be called while in a filesystem transaction. | 923 | * never be called while in a filesystem transaction. |
876 | */ | 924 | */ |
877 | if (WARN_ON(current->flags & PF_FSTRANS)) | 925 | if (WARN_ON(current->flags & PF_FSTRANS)) |
878 | goto redirty; | 926 | goto redirty; |
879 | 927 | ||
880 | /* Is this page beyond the end of the file? */ | 928 | /* Is this page beyond the end of the file? */ |
881 | offset = i_size_read(inode); | 929 | offset = i_size_read(inode); |
882 | end_index = offset >> PAGE_CACHE_SHIFT; | 930 | end_index = offset >> PAGE_CACHE_SHIFT; |
883 | last_index = (offset - 1) >> PAGE_CACHE_SHIFT; | 931 | last_index = (offset - 1) >> PAGE_CACHE_SHIFT; |
884 | if (page->index >= end_index) { | 932 | if (page->index >= end_index) { |
885 | if ((page->index >= end_index + 1) || | 933 | if ((page->index >= end_index + 1) || |
886 | !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) { | 934 | !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) { |
887 | unlock_page(page); | 935 | unlock_page(page); |
888 | return 0; | 936 | return 0; |
889 | } | 937 | } |
890 | } | 938 | } |
891 | 939 | ||
892 | end_offset = min_t(unsigned long long, | 940 | end_offset = min_t(unsigned long long, |
893 | (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, | 941 | (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, |
894 | offset); | 942 | offset); |
895 | len = 1 << inode->i_blkbits; | 943 | len = 1 << inode->i_blkbits; |
896 | 944 | ||
897 | bh = head = page_buffers(page); | 945 | bh = head = page_buffers(page); |
898 | offset = page_offset(page); | 946 | offset = page_offset(page); |
899 | type = IO_OVERWRITE; | 947 | type = IO_OVERWRITE; |
900 | 948 | ||
901 | if (wbc->sync_mode == WB_SYNC_NONE) | 949 | if (wbc->sync_mode == WB_SYNC_NONE) |
902 | nonblocking = 1; | 950 | nonblocking = 1; |
903 | 951 | ||
904 | do { | 952 | do { |
905 | int new_ioend = 0; | 953 | int new_ioend = 0; |
906 | 954 | ||
907 | if (offset >= end_offset) | 955 | if (offset >= end_offset) |
908 | break; | 956 | break; |
909 | if (!buffer_uptodate(bh)) | 957 | if (!buffer_uptodate(bh)) |
910 | uptodate = 0; | 958 | uptodate = 0; |
911 | 959 | ||
912 | /* | 960 | /* |
913 | * set_page_dirty dirties all buffers in a page, independent | 961 | * set_page_dirty dirties all buffers in a page, independent |
914 | * of their state. The dirty state however is entirely | 962 | * of their state. The dirty state however is entirely |
915 | * meaningless for holes (!mapped && uptodate), so skip | 963 | * meaningless for holes (!mapped && uptodate), so skip |
916 | * buffers covering holes here. | 964 | * buffers covering holes here. |
917 | */ | 965 | */ |
918 | if (!buffer_mapped(bh) && buffer_uptodate(bh)) { | 966 | if (!buffer_mapped(bh) && buffer_uptodate(bh)) { |
919 | imap_valid = 0; | 967 | imap_valid = 0; |
920 | continue; | 968 | continue; |
921 | } | 969 | } |
922 | 970 | ||
923 | if (buffer_unwritten(bh)) { | 971 | if (buffer_unwritten(bh)) { |
924 | if (type != IO_UNWRITTEN) { | 972 | if (type != IO_UNWRITTEN) { |
925 | type = IO_UNWRITTEN; | 973 | type = IO_UNWRITTEN; |
926 | imap_valid = 0; | 974 | imap_valid = 0; |
927 | } | 975 | } |
928 | } else if (buffer_delay(bh)) { | 976 | } else if (buffer_delay(bh)) { |
929 | if (type != IO_DELALLOC) { | 977 | if (type != IO_DELALLOC) { |
930 | type = IO_DELALLOC; | 978 | type = IO_DELALLOC; |
931 | imap_valid = 0; | 979 | imap_valid = 0; |
932 | } | 980 | } |
933 | } else if (buffer_uptodate(bh)) { | 981 | } else if (buffer_uptodate(bh)) { |
934 | if (type != IO_OVERWRITE) { | 982 | if (type != IO_OVERWRITE) { |
935 | type = IO_OVERWRITE; | 983 | type = IO_OVERWRITE; |
936 | imap_valid = 0; | 984 | imap_valid = 0; |
937 | } | 985 | } |
938 | } else { | 986 | } else { |
939 | if (PageUptodate(page)) { | 987 | if (PageUptodate(page)) { |
940 | ASSERT(buffer_mapped(bh)); | 988 | ASSERT(buffer_mapped(bh)); |
941 | imap_valid = 0; | 989 | imap_valid = 0; |
942 | } | 990 | } |
943 | continue; | 991 | continue; |
944 | } | 992 | } |
945 | 993 | ||
946 | if (imap_valid) | 994 | if (imap_valid) |
947 | imap_valid = xfs_imap_valid(inode, &imap, offset); | 995 | imap_valid = xfs_imap_valid(inode, &imap, offset); |
948 | if (!imap_valid) { | 996 | if (!imap_valid) { |
949 | /* | 997 | /* |
950 | * If we didn't have a valid mapping then we need to | 998 | * If we didn't have a valid mapping then we need to |
951 | * put the new mapping into a separate ioend structure. | 999 | * put the new mapping into a separate ioend structure. |
952 | * This ensures non-contiguous extents always have | 1000 | * This ensures non-contiguous extents always have |
953 | * separate ioends, which is particularly important | 1001 | * separate ioends, which is particularly important |
954 | * for unwritten extent conversion at I/O completion | 1002 | * for unwritten extent conversion at I/O completion |
955 | * time. | 1003 | * time. |
956 | */ | 1004 | */ |
957 | new_ioend = 1; | 1005 | new_ioend = 1; |
958 | err = xfs_map_blocks(inode, offset, &imap, type, | 1006 | err = xfs_map_blocks(inode, offset, &imap, type, |
959 | nonblocking); | 1007 | nonblocking); |
960 | if (err) | 1008 | if (err) |
961 | goto error; | 1009 | goto error; |
962 | imap_valid = xfs_imap_valid(inode, &imap, offset); | 1010 | imap_valid = xfs_imap_valid(inode, &imap, offset); |
963 | } | 1011 | } |
964 | if (imap_valid) { | 1012 | if (imap_valid) { |
965 | lock_buffer(bh); | 1013 | lock_buffer(bh); |
966 | if (type != IO_OVERWRITE) | 1014 | if (type != IO_OVERWRITE) |
967 | xfs_map_at_offset(inode, bh, &imap, offset); | 1015 | xfs_map_at_offset(inode, bh, &imap, offset); |
968 | xfs_add_to_ioend(inode, bh, offset, type, &ioend, | 1016 | xfs_add_to_ioend(inode, bh, offset, type, &ioend, |
969 | new_ioend); | 1017 | new_ioend); |
970 | count++; | 1018 | count++; |
971 | } | 1019 | } |
972 | 1020 | ||
973 | if (!iohead) | 1021 | if (!iohead) |
974 | iohead = ioend; | 1022 | iohead = ioend; |
975 | 1023 | ||
976 | } while (offset += len, ((bh = bh->b_this_page) != head)); | 1024 | } while (offset += len, ((bh = bh->b_this_page) != head)); |
977 | 1025 | ||
978 | if (uptodate && bh == head) | 1026 | if (uptodate && bh == head) |
979 | SetPageUptodate(page); | 1027 | SetPageUptodate(page); |
980 | 1028 | ||
981 | xfs_start_page_writeback(page, 1, count); | 1029 | xfs_start_page_writeback(page, 1, count); |
982 | 1030 | ||
983 | if (ioend && imap_valid) { | 1031 | if (ioend && imap_valid) { |
984 | xfs_off_t end_index; | 1032 | xfs_off_t end_index; |
985 | 1033 | ||
986 | end_index = imap.br_startoff + imap.br_blockcount; | 1034 | end_index = imap.br_startoff + imap.br_blockcount; |
987 | 1035 | ||
988 | /* to bytes */ | 1036 | /* to bytes */ |
989 | end_index <<= inode->i_blkbits; | 1037 | end_index <<= inode->i_blkbits; |
990 | 1038 | ||
991 | /* to pages */ | 1039 | /* to pages */ |
992 | end_index = (end_index - 1) >> PAGE_CACHE_SHIFT; | 1040 | end_index = (end_index - 1) >> PAGE_CACHE_SHIFT; |
993 | 1041 | ||
994 | /* check against file size */ | 1042 | /* check against file size */ |
995 | if (end_index > last_index) | 1043 | if (end_index > last_index) |
996 | end_index = last_index; | 1044 | end_index = last_index; |
997 | 1045 | ||
998 | xfs_cluster_write(inode, page->index + 1, &imap, &ioend, | 1046 | xfs_cluster_write(inode, page->index + 1, &imap, &ioend, |
999 | wbc, end_index); | 1047 | wbc, end_index); |
1000 | } | 1048 | } |
1001 | 1049 | ||
1002 | if (iohead) | 1050 | if (iohead) { |
1051 | /* | ||
1052 | * Reserve log space if we might write beyond the on-disk | ||
1053 | * inode size. | ||
1054 | */ | ||
1055 | if (ioend->io_type != IO_UNWRITTEN && | ||
1056 | xfs_ioend_is_append(ioend)) { | ||
1057 | err = xfs_setfilesize_trans_alloc(ioend); | ||
1058 | if (err) | ||
1059 | goto error; | ||
1060 | } | ||
1061 | |||
1003 | xfs_submit_ioend(wbc, iohead); | 1062 | xfs_submit_ioend(wbc, iohead); |
1063 | } | ||
1004 | 1064 | ||
1005 | return 0; | 1065 | return 0; |
1006 | 1066 | ||
1007 | error: | 1067 | error: |
1008 | if (iohead) | 1068 | if (iohead) |
1009 | xfs_cancel_ioend(iohead); | 1069 | xfs_cancel_ioend(iohead); |
1010 | 1070 | ||
1011 | if (err == -EAGAIN) | 1071 | if (err == -EAGAIN) |
1012 | goto redirty; | 1072 | goto redirty; |
1013 | 1073 | ||
1014 | xfs_aops_discard_page(page); | 1074 | xfs_aops_discard_page(page); |
1015 | ClearPageUptodate(page); | 1075 | ClearPageUptodate(page); |
1016 | unlock_page(page); | 1076 | unlock_page(page); |
1017 | return err; | 1077 | return err; |
1018 | 1078 | ||
1019 | redirty: | 1079 | redirty: |
1020 | redirty_page_for_writepage(wbc, page); | 1080 | redirty_page_for_writepage(wbc, page); |
1021 | unlock_page(page); | 1081 | unlock_page(page); |
1022 | return 0; | 1082 | return 0; |
1023 | } | 1083 | } |
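
The hunk added above reserves log space only when the queued ioend describes an append write, that is, when the I/O will end beyond the current on-disk inode size and the ioend is not an unwritten-extent conversion (which runs its own transaction at completion). The reservation is stashed in the ioend so that completion time only has to log the new size rather than allocate log space itself. A minimal user-space sketch of that predicate, using hypothetical demo_* names and made-up sizes in place of the real ioend fields, might look like this:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the ioend fields and the on-disk inode size. */
struct demo_ioend {
        long long io_offset;    /* byte offset where the I/O starts */
        long long io_size;      /* length of the I/O in bytes */
        int       io_type;      /* 1 here means "unwritten conversion" */
};

#define DEMO_IO_UNWRITTEN 1

/* True when completion will have to log a new file size. */
static bool demo_needs_setfilesize(const struct demo_ioend *io,
                                   long long ondisk_size)
{
        if (io->io_type == DEMO_IO_UNWRITTEN)
                return false;   /* the conversion transaction covers it */
        return io->io_offset + io->io_size > ondisk_size;
}

int main(void)
{
        struct demo_ioend io = { .io_offset = 4096, .io_size = 8192 };

        printf("append write:      %d\n", demo_needs_setfilesize(&io, 8192));
        printf("write inside file: %d\n", demo_needs_setfilesize(&io, 1 << 20));
        return 0;
}
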
1024 | 1084 | ||
1025 | STATIC int | 1085 | STATIC int |
1026 | xfs_vm_writepages( | 1086 | xfs_vm_writepages( |
1027 | struct address_space *mapping, | 1087 | struct address_space *mapping, |
1028 | struct writeback_control *wbc) | 1088 | struct writeback_control *wbc) |
1029 | { | 1089 | { |
1030 | xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); | 1090 | xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); |
1031 | return generic_writepages(mapping, wbc); | 1091 | return generic_writepages(mapping, wbc); |
1032 | } | 1092 | } |
1033 | 1093 | ||
1034 | /* | 1094 | /* |
1035 | * Called to move a page into cleanable state - and from there | 1095 | * Called to move a page into cleanable state - and from there |
1036 | * to be released. The page should already be clean. We always | 1096 | * to be released. The page should already be clean. We always |
1037 | * have buffer heads in this call. | 1097 | * have buffer heads in this call. |
1038 | * | 1098 | * |
1039 | * Returns 1 if the page is ok to release, 0 otherwise. | 1099 | * Returns 1 if the page is ok to release, 0 otherwise. |
1040 | */ | 1100 | */ |
1041 | STATIC int | 1101 | STATIC int |
1042 | xfs_vm_releasepage( | 1102 | xfs_vm_releasepage( |
1043 | struct page *page, | 1103 | struct page *page, |
1044 | gfp_t gfp_mask) | 1104 | gfp_t gfp_mask) |
1045 | { | 1105 | { |
1046 | int delalloc, unwritten; | 1106 | int delalloc, unwritten; |
1047 | 1107 | ||
1048 | trace_xfs_releasepage(page->mapping->host, page, 0); | 1108 | trace_xfs_releasepage(page->mapping->host, page, 0); |
1049 | 1109 | ||
1050 | xfs_count_page_state(page, &delalloc, &unwritten); | 1110 | xfs_count_page_state(page, &delalloc, &unwritten); |
1051 | 1111 | ||
1052 | if (WARN_ON(delalloc)) | 1112 | if (WARN_ON(delalloc)) |
1053 | return 0; | 1113 | return 0; |
1054 | if (WARN_ON(unwritten)) | 1114 | if (WARN_ON(unwritten)) |
1055 | return 0; | 1115 | return 0; |
1056 | 1116 | ||
1057 | return try_to_free_buffers(page); | 1117 | return try_to_free_buffers(page); |
1058 | } | 1118 | } |
1059 | 1119 | ||
1060 | STATIC int | 1120 | STATIC int |
1061 | __xfs_get_blocks( | 1121 | __xfs_get_blocks( |
1062 | struct inode *inode, | 1122 | struct inode *inode, |
1063 | sector_t iblock, | 1123 | sector_t iblock, |
1064 | struct buffer_head *bh_result, | 1124 | struct buffer_head *bh_result, |
1065 | int create, | 1125 | int create, |
1066 | int direct) | 1126 | int direct) |
1067 | { | 1127 | { |
1068 | struct xfs_inode *ip = XFS_I(inode); | 1128 | struct xfs_inode *ip = XFS_I(inode); |
1069 | struct xfs_mount *mp = ip->i_mount; | 1129 | struct xfs_mount *mp = ip->i_mount; |
1070 | xfs_fileoff_t offset_fsb, end_fsb; | 1130 | xfs_fileoff_t offset_fsb, end_fsb; |
1071 | int error = 0; | 1131 | int error = 0; |
1072 | int lockmode = 0; | 1132 | int lockmode = 0; |
1073 | struct xfs_bmbt_irec imap; | 1133 | struct xfs_bmbt_irec imap; |
1074 | int nimaps = 1; | 1134 | int nimaps = 1; |
1075 | xfs_off_t offset; | 1135 | xfs_off_t offset; |
1076 | ssize_t size; | 1136 | ssize_t size; |
1077 | int new = 0; | 1137 | int new = 0; |
1078 | 1138 | ||
1079 | if (XFS_FORCED_SHUTDOWN(mp)) | 1139 | if (XFS_FORCED_SHUTDOWN(mp)) |
1080 | return -XFS_ERROR(EIO); | 1140 | return -XFS_ERROR(EIO); |
1081 | 1141 | ||
1082 | offset = (xfs_off_t)iblock << inode->i_blkbits; | 1142 | offset = (xfs_off_t)iblock << inode->i_blkbits; |
1083 | ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); | 1143 | ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); |
1084 | size = bh_result->b_size; | 1144 | size = bh_result->b_size; |
1085 | 1145 | ||
1086 | if (!create && direct && offset >= i_size_read(inode)) | 1146 | if (!create && direct && offset >= i_size_read(inode)) |
1087 | return 0; | 1147 | return 0; |
1088 | 1148 | ||
1089 | if (create) { | 1149 | if (create) { |
1090 | lockmode = XFS_ILOCK_EXCL; | 1150 | lockmode = XFS_ILOCK_EXCL; |
1091 | xfs_ilock(ip, lockmode); | 1151 | xfs_ilock(ip, lockmode); |
1092 | } else { | 1152 | } else { |
1093 | lockmode = xfs_ilock_map_shared(ip); | 1153 | lockmode = xfs_ilock_map_shared(ip); |
1094 | } | 1154 | } |
1095 | 1155 | ||
1096 | ASSERT(offset <= mp->m_maxioffset); | 1156 | ASSERT(offset <= mp->m_maxioffset); |
1097 | if (offset + size > mp->m_maxioffset) | 1157 | if (offset + size > mp->m_maxioffset) |
1098 | size = mp->m_maxioffset - offset; | 1158 | size = mp->m_maxioffset - offset; |
1099 | end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); | 1159 | end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); |
1100 | offset_fsb = XFS_B_TO_FSBT(mp, offset); | 1160 | offset_fsb = XFS_B_TO_FSBT(mp, offset); |
1101 | 1161 | ||
1102 | error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, | 1162 | error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, |
1103 | &imap, &nimaps, XFS_BMAPI_ENTIRE); | 1163 | &imap, &nimaps, XFS_BMAPI_ENTIRE); |
1104 | if (error) | 1164 | if (error) |
1105 | goto out_unlock; | 1165 | goto out_unlock; |
1106 | 1166 | ||
1107 | if (create && | 1167 | if (create && |
1108 | (!nimaps || | 1168 | (!nimaps || |
1109 | (imap.br_startblock == HOLESTARTBLOCK || | 1169 | (imap.br_startblock == HOLESTARTBLOCK || |
1110 | imap.br_startblock == DELAYSTARTBLOCK))) { | 1170 | imap.br_startblock == DELAYSTARTBLOCK))) { |
1111 | if (direct) { | 1171 | if (direct) { |
1112 | error = xfs_iomap_write_direct(ip, offset, size, | 1172 | error = xfs_iomap_write_direct(ip, offset, size, |
1113 | &imap, nimaps); | 1173 | &imap, nimaps); |
1114 | } else { | 1174 | } else { |
1115 | error = xfs_iomap_write_delay(ip, offset, size, &imap); | 1175 | error = xfs_iomap_write_delay(ip, offset, size, &imap); |
1116 | } | 1176 | } |
1117 | if (error) | 1177 | if (error) |
1118 | goto out_unlock; | 1178 | goto out_unlock; |
1119 | 1179 | ||
1120 | trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap); | 1180 | trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap); |
1121 | } else if (nimaps) { | 1181 | } else if (nimaps) { |
1122 | trace_xfs_get_blocks_found(ip, offset, size, 0, &imap); | 1182 | trace_xfs_get_blocks_found(ip, offset, size, 0, &imap); |
1123 | } else { | 1183 | } else { |
1124 | trace_xfs_get_blocks_notfound(ip, offset, size); | 1184 | trace_xfs_get_blocks_notfound(ip, offset, size); |
1125 | goto out_unlock; | 1185 | goto out_unlock; |
1126 | } | 1186 | } |
1127 | xfs_iunlock(ip, lockmode); | 1187 | xfs_iunlock(ip, lockmode); |
1128 | 1188 | ||
1129 | if (imap.br_startblock != HOLESTARTBLOCK && | 1189 | if (imap.br_startblock != HOLESTARTBLOCK && |
1130 | imap.br_startblock != DELAYSTARTBLOCK) { | 1190 | imap.br_startblock != DELAYSTARTBLOCK) { |
1131 | /* | 1191 | /* |
1132 | * For unwritten extents do not report a disk address on | 1192 | * For unwritten extents do not report a disk address on |
1133 | * the read case (treat as if we're reading into a hole). | 1193 | * the read case (treat as if we're reading into a hole). |
1134 | */ | 1194 | */ |
1135 | if (create || !ISUNWRITTEN(&imap)) | 1195 | if (create || !ISUNWRITTEN(&imap)) |
1136 | xfs_map_buffer(inode, bh_result, &imap, offset); | 1196 | xfs_map_buffer(inode, bh_result, &imap, offset); |
1137 | if (create && ISUNWRITTEN(&imap)) { | 1197 | if (create && ISUNWRITTEN(&imap)) { |
1138 | if (direct) | 1198 | if (direct) |
1139 | bh_result->b_private = inode; | 1199 | bh_result->b_private = inode; |
1140 | set_buffer_unwritten(bh_result); | 1200 | set_buffer_unwritten(bh_result); |
1141 | } | 1201 | } |
1142 | } | 1202 | } |
1143 | 1203 | ||
1144 | /* | 1204 | /* |
1145 | * If this is a realtime file, data may be on a different device | 1205 | * If this is a realtime file, data may be on a different device |
1146 | * to that pointed to from the buffer_head b_bdev currently. | 1206 | * to that pointed to from the buffer_head b_bdev currently. |
1147 | */ | 1207 | */ |
1148 | bh_result->b_bdev = xfs_find_bdev_for_inode(inode); | 1208 | bh_result->b_bdev = xfs_find_bdev_for_inode(inode); |
1149 | 1209 | ||
1150 | /* | 1210 | /* |
1151 | * If we previously allocated a block out beyond eof and we are now | 1211 | * If we previously allocated a block out beyond eof and we are now |
1152 | * coming back to use it then we will need to flag it as new even if it | 1212 | * coming back to use it then we will need to flag it as new even if it |
1153 | * has a disk address. | 1213 | * has a disk address. |
1154 | * | 1214 | * |
1155 | * With sub-block writes into unwritten extents we also need to mark | 1215 | * With sub-block writes into unwritten extents we also need to mark |
1156 | * the buffer as new so that the unwritten parts of the buffer get | 1216 | * the buffer as new so that the unwritten parts of the buffer get |
1157 | * correctly zeroed. | 1217 | * correctly zeroed. |
1158 | */ | 1218 | */ |
1159 | if (create && | 1219 | if (create && |
1160 | ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || | 1220 | ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || |
1161 | (offset >= i_size_read(inode)) || | 1221 | (offset >= i_size_read(inode)) || |
1162 | (new || ISUNWRITTEN(&imap)))) | 1222 | (new || ISUNWRITTEN(&imap)))) |
1163 | set_buffer_new(bh_result); | 1223 | set_buffer_new(bh_result); |
1164 | 1224 | ||
1165 | if (imap.br_startblock == DELAYSTARTBLOCK) { | 1225 | if (imap.br_startblock == DELAYSTARTBLOCK) { |
1166 | BUG_ON(direct); | 1226 | BUG_ON(direct); |
1167 | if (create) { | 1227 | if (create) { |
1168 | set_buffer_uptodate(bh_result); | 1228 | set_buffer_uptodate(bh_result); |
1169 | set_buffer_mapped(bh_result); | 1229 | set_buffer_mapped(bh_result); |
1170 | set_buffer_delay(bh_result); | 1230 | set_buffer_delay(bh_result); |
1171 | } | 1231 | } |
1172 | } | 1232 | } |
1173 | 1233 | ||
1174 | /* | 1234 | /* |
1175 | * If this is O_DIRECT or the mpage code calling tell them how large | 1235 | * If this is O_DIRECT or the mpage code calling tell them how large |
1176 | * the mapping is, so that we can avoid repeated get_blocks calls. | 1236 | * the mapping is, so that we can avoid repeated get_blocks calls. |
1177 | */ | 1237 | */ |
1178 | if (direct || size > (1 << inode->i_blkbits)) { | 1238 | if (direct || size > (1 << inode->i_blkbits)) { |
1179 | xfs_off_t mapping_size; | 1239 | xfs_off_t mapping_size; |
1180 | 1240 | ||
1181 | mapping_size = imap.br_startoff + imap.br_blockcount - iblock; | 1241 | mapping_size = imap.br_startoff + imap.br_blockcount - iblock; |
1182 | mapping_size <<= inode->i_blkbits; | 1242 | mapping_size <<= inode->i_blkbits; |
1183 | 1243 | ||
1184 | ASSERT(mapping_size > 0); | 1244 | ASSERT(mapping_size > 0); |
1185 | if (mapping_size > size) | 1245 | if (mapping_size > size) |
1186 | mapping_size = size; | 1246 | mapping_size = size; |
1187 | if (mapping_size > LONG_MAX) | 1247 | if (mapping_size > LONG_MAX) |
1188 | mapping_size = LONG_MAX; | 1248 | mapping_size = LONG_MAX; |
1189 | 1249 | ||
1190 | bh_result->b_size = mapping_size; | 1250 | bh_result->b_size = mapping_size; |
1191 | } | 1251 | } |
1192 | 1252 | ||
1193 | return 0; | 1253 | return 0; |
1194 | 1254 | ||
1195 | out_unlock: | 1255 | out_unlock: |
1196 | xfs_iunlock(ip, lockmode); | 1256 | xfs_iunlock(ip, lockmode); |
1197 | return -error; | 1257 | return -error; |
1198 | } | 1258 | } |
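
When the caller is direct I/O or the mpage code, __xfs_get_blocks reports in bh_result->b_size how far the mapping it found extends, so the caller does not have to come back once per block. The arithmetic is: blocks remaining in the mapping from iblock onward, shifted up to bytes, then clamped to the size that was asked for and to LONG_MAX. A stand-alone sketch of that calculation, with made-up values in place of a real xfs_bmbt_irec:

#include <limits.h>
#include <stdio.h>

int main(void)
{
        unsigned int blkbits    = 12;           /* assume 4k blocks */
        long long    iblock     = 10;           /* block the caller asked for */
        long long    startoff   = 8;            /* mapping start, in blocks */
        long long    blockcount = 100;          /* mapping length, in blocks */
        long long    size       = 64 * 1024;    /* bytes the caller asked for */

        long long mapping_size = (startoff + blockcount - iblock) << blkbits;

        if (mapping_size > size)
                mapping_size = size;            /* never report more than requested */
        if (mapping_size > LONG_MAX)
                mapping_size = LONG_MAX;        /* matches the kernel's defensive clamp */

        printf("b_size would be %lld bytes\n", mapping_size);
        return 0;
}
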
1199 | 1259 | ||
1200 | int | 1260 | int |
1201 | xfs_get_blocks( | 1261 | xfs_get_blocks( |
1202 | struct inode *inode, | 1262 | struct inode *inode, |
1203 | sector_t iblock, | 1263 | sector_t iblock, |
1204 | struct buffer_head *bh_result, | 1264 | struct buffer_head *bh_result, |
1205 | int create) | 1265 | int create) |
1206 | { | 1266 | { |
1207 | return __xfs_get_blocks(inode, iblock, bh_result, create, 0); | 1267 | return __xfs_get_blocks(inode, iblock, bh_result, create, 0); |
1208 | } | 1268 | } |
1209 | 1269 | ||
1210 | STATIC int | 1270 | STATIC int |
1211 | xfs_get_blocks_direct( | 1271 | xfs_get_blocks_direct( |
1212 | struct inode *inode, | 1272 | struct inode *inode, |
1213 | sector_t iblock, | 1273 | sector_t iblock, |
1214 | struct buffer_head *bh_result, | 1274 | struct buffer_head *bh_result, |
1215 | int create) | 1275 | int create) |
1216 | { | 1276 | { |
1217 | return __xfs_get_blocks(inode, iblock, bh_result, create, 1); | 1277 | return __xfs_get_blocks(inode, iblock, bh_result, create, 1); |
1218 | } | 1278 | } |
1219 | 1279 | ||
1220 | /* | 1280 | /* |
1221 | * Complete a direct I/O write request. | 1281 | * Complete a direct I/O write request. |
1222 | * | 1282 | * |
1223 | * If the private argument is non-NULL __xfs_get_blocks signals us that we | 1283 | * If the private argument is non-NULL __xfs_get_blocks signals us that we |
1224 | * need to issue a transaction to convert the range from unwritten to written | 1284 | * need to issue a transaction to convert the range from unwritten to written |
1225 | * extents. In case this is regular synchronous I/O we just call xfs_end_io | 1285 | * extents. In case this is regular synchronous I/O we just call xfs_end_io |
1226 | * to do this and we are done. But in case this was a successful AIO | 1286 | * to do this and we are done. But in case this was a successful AIO |
1227 | * request this handler is called from interrupt context, from which we | 1287 | * request this handler is called from interrupt context, from which we |
1228 | * can't start transactions. In that case offload the I/O completion to | 1288 | * can't start transactions. In that case offload the I/O completion to |
1229 | * the workqueues we also use for buffered I/O completion. | 1289 | * the workqueues we also use for buffered I/O completion. |
1230 | */ | 1290 | */ |
1231 | STATIC void | 1291 | STATIC void |
1232 | xfs_end_io_direct_write( | 1292 | xfs_end_io_direct_write( |
1233 | struct kiocb *iocb, | 1293 | struct kiocb *iocb, |
1234 | loff_t offset, | 1294 | loff_t offset, |
1235 | ssize_t size, | 1295 | ssize_t size, |
1236 | void *private, | 1296 | void *private, |
1237 | int ret, | 1297 | int ret, |
1238 | bool is_async) | 1298 | bool is_async) |
1239 | { | 1299 | { |
1240 | struct xfs_ioend *ioend = iocb->private; | 1300 | struct xfs_ioend *ioend = iocb->private; |
1241 | 1301 | ||
1242 | /* | 1302 | /* |
1243 | * While the generic direct I/O code updates the inode size, it does | 1303 | * While the generic direct I/O code updates the inode size, it does |
1244 | * so only after the end_io handler is called, which means our | 1304 | * so only after the end_io handler is called, which means our |
1245 | * end_io handler thinks the on-disk size is outside the in-core | 1305 | * end_io handler thinks the on-disk size is outside the in-core |
1246 | * size. To prevent this just update it a little bit earlier here. | 1306 | * size. To prevent this just update it a little bit earlier here. |
1247 | */ | 1307 | */ |
1248 | if (offset + size > i_size_read(ioend->io_inode)) | 1308 | if (offset + size > i_size_read(ioend->io_inode)) |
1249 | i_size_write(ioend->io_inode, offset + size); | 1309 | i_size_write(ioend->io_inode, offset + size); |
1250 | 1310 | ||
1251 | /* | 1311 | /* |
1252 | * blockdev_direct_IO can return an error even after the I/O | 1312 | * blockdev_direct_IO can return an error even after the I/O |
1253 | * completion handler was called. Thus we need to protect | 1313 | * completion handler was called. Thus we need to protect |
1254 | * against double-freeing. | 1314 | * against double-freeing. |
1255 | */ | 1315 | */ |
1256 | iocb->private = NULL; | 1316 | iocb->private = NULL; |
1257 | 1317 | ||
1258 | ioend->io_offset = offset; | 1318 | ioend->io_offset = offset; |
1259 | ioend->io_size = size; | 1319 | ioend->io_size = size; |
1260 | ioend->io_iocb = iocb; | 1320 | ioend->io_iocb = iocb; |
1261 | ioend->io_result = ret; | 1321 | ioend->io_result = ret; |
1262 | if (private && size > 0) | 1322 | if (private && size > 0) |
1263 | ioend->io_type = IO_UNWRITTEN; | 1323 | ioend->io_type = IO_UNWRITTEN; |
1264 | 1324 | ||
1265 | if (is_async) { | 1325 | if (is_async) { |
1266 | ioend->io_isasync = 1; | 1326 | ioend->io_isasync = 1; |
1267 | xfs_finish_ioend(ioend); | 1327 | xfs_finish_ioend(ioend); |
1268 | } else { | 1328 | } else { |
1269 | xfs_finish_ioend_sync(ioend); | 1329 | xfs_finish_ioend_sync(ioend); |
1270 | } | 1330 | } |
1271 | } | 1331 | } |
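
The comment above xfs_end_io_direct_write spells out the split: synchronous direct I/O can run the unwritten-extent conversion right away, but a completed AIO request arrives in interrupt context, where a transaction cannot be started, so the work is handed to the same workqueues used for buffered I/O completion. A loose user-space analogue of that run-inline-or-hand-off shape, using a pthread where the kernel uses a workqueue (build with -pthread), could be:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy completion record; the real code passes an xfs_ioend around. */
struct demo_completion {
        long long offset;
        long long size;
};

static void demo_complete(struct demo_completion *c)
{
        /* stands in for the transactional work done at I/O completion */
        printf("completing %lld bytes at offset %lld\n", c->size, c->offset);
}

static void *demo_worker(void *arg)
{
        demo_complete(arg);
        return NULL;
}

/* Run completion inline when the context allows it, otherwise defer it. */
static void demo_finish(struct demo_completion *c, bool is_async)
{
        if (is_async) {
                pthread_t t;

                pthread_create(&t, NULL, demo_worker, c);
                pthread_join(t, NULL);  /* only so the toy program waits */
        } else {
                demo_complete(c);
        }
}

int main(void)
{
        struct demo_completion c = { .offset = 0, .size = 4096 };

        demo_finish(&c, false); /* synchronous direct I/O path */
        demo_finish(&c, true);  /* AIO path: work deferred to another context */
        return 0;
}
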
1272 | 1332 | ||
1273 | STATIC ssize_t | 1333 | STATIC ssize_t |
1274 | xfs_vm_direct_IO( | 1334 | xfs_vm_direct_IO( |
1275 | int rw, | 1335 | int rw, |
1276 | struct kiocb *iocb, | 1336 | struct kiocb *iocb, |
1277 | const struct iovec *iov, | 1337 | const struct iovec *iov, |
1278 | loff_t offset, | 1338 | loff_t offset, |
1279 | unsigned long nr_segs) | 1339 | unsigned long nr_segs) |
1280 | { | 1340 | { |
1281 | struct inode *inode = iocb->ki_filp->f_mapping->host; | 1341 | struct inode *inode = iocb->ki_filp->f_mapping->host; |
1282 | struct block_device *bdev = xfs_find_bdev_for_inode(inode); | 1342 | struct block_device *bdev = xfs_find_bdev_for_inode(inode); |
1343 | struct xfs_ioend *ioend = NULL; | ||
1283 | ssize_t ret; | 1344 | ssize_t ret; |
1284 | 1345 | ||
1285 | if (rw & WRITE) { | 1346 | if (rw & WRITE) { |
1286 | iocb->private = xfs_alloc_ioend(inode, IO_DIRECT); | 1347 | size_t size = iov_length(iov, nr_segs); |
1287 | 1348 | ||
1349 | /* | ||
1350 | * We need to preallocate a transaction for a size update | ||
1351 | * here. In the case that this write both updates the size | ||
1352 | * and converts at least one unwritten extent we will cancel | ||
1353 | * the still clean transaction after the I/O has finished. | ||
1354 | */ | ||
1355 | iocb->private = ioend = xfs_alloc_ioend(inode, IO_DIRECT); | ||
1356 | if (offset + size > XFS_I(inode)->i_d.di_size) { | ||
1357 | ret = xfs_setfilesize_trans_alloc(ioend); | ||
1358 | if (ret) | ||
1359 | goto out_destroy_ioend; | ||
1360 | ioend->io_isdirect = 1; | ||
1361 | } | ||
1362 | |||
1288 | ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, | 1363 | ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, |
1289 | offset, nr_segs, | 1364 | offset, nr_segs, |
1290 | xfs_get_blocks_direct, | 1365 | xfs_get_blocks_direct, |
1291 | xfs_end_io_direct_write, NULL, 0); | 1366 | xfs_end_io_direct_write, NULL, 0); |
1292 | if (ret != -EIOCBQUEUED && iocb->private) | 1367 | if (ret != -EIOCBQUEUED && iocb->private) |
1293 | xfs_destroy_ioend(iocb->private); | 1368 | goto out_trans_cancel; |
1294 | } else { | 1369 | } else { |
1295 | ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, | 1370 | ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, |
1296 | offset, nr_segs, | 1371 | offset, nr_segs, |
1297 | xfs_get_blocks_direct, | 1372 | xfs_get_blocks_direct, |
1298 | NULL, NULL, 0); | 1373 | NULL, NULL, 0); |
1299 | } | 1374 | } |
1300 | 1375 | ||
1376 | return ret; | ||
1377 | |||
1378 | out_trans_cancel: | ||
1379 | if (ioend->io_append_trans) { | ||
1380 | current_set_flags_nested(&ioend->io_append_trans->t_pflags, | ||
1381 | PF_FSTRANS); | ||
1382 | xfs_trans_cancel(ioend->io_append_trans, 0); | ||
1383 | } | ||
1384 | out_destroy_ioend: | ||
1385 | xfs_destroy_ioend(ioend); | ||
1301 | return ret; | 1386 | return ret; |
1302 | } | 1387 | } |
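
The write branch now sums the whole request up front with iov_length() and only preallocates the size-update transaction when offset plus that length can land beyond the on-disk inode size; otherwise the plain ioend is enough. A user-space sketch of that decision, summing <sys/uio.h> iovecs by hand and using an assumed on-disk size:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <sys/uio.h>

/* User-space twin of the kernel's iov_length(): total bytes in all segments. */
static size_t demo_iov_length(const struct iovec *iov, unsigned long nr_segs)
{
        size_t len = 0;
        unsigned long i;

        for (i = 0; i < nr_segs; i++)
                len += iov[i].iov_len;
        return len;
}

int main(void)
{
        static char a[4096], b[4096];
        struct iovec iov[2] = {
                { .iov_base = a, .iov_len = sizeof(a) },
                { .iov_base = b, .iov_len = sizeof(b) },
        };
        long long offset      = 12288; /* where the write starts (made up) */
        long long ondisk_size = 8192;  /* assumed di_size before the write */

        size_t size = demo_iov_length(iov, 2);
        bool need_size_update = offset + (long long)size > ondisk_size;

        printf("write of %zu bytes %s a size-update transaction\n", size,
               need_size_update ? "needs" : "does not need");
        return 0;
}
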
1303 | 1388 | ||
1304 | STATIC void | 1389 | STATIC void |
1305 | xfs_vm_write_failed( | 1390 | xfs_vm_write_failed( |
1306 | struct address_space *mapping, | 1391 | struct address_space *mapping, |
1307 | loff_t to) | 1392 | loff_t to) |
1308 | { | 1393 | { |
1309 | struct inode *inode = mapping->host; | 1394 | struct inode *inode = mapping->host; |
1310 | 1395 | ||
1311 | if (to > inode->i_size) { | 1396 | if (to > inode->i_size) { |
1312 | /* | 1397 | /* |
1313 | * Punch out the delalloc blocks we have already allocated. | 1398 | * Punch out the delalloc blocks we have already allocated. |
1314 | * | 1399 | * |
1315 | * Don't bother with xfs_setattr given that nothing can have | 1400 | * Don't bother with xfs_setattr given that nothing can have |
1316 | * made it to disk yet as the page is still locked at this | 1401 | * made it to disk yet as the page is still locked at this |
1317 | * point. | 1402 | * point. |
1318 | */ | 1403 | */ |
1319 | struct xfs_inode *ip = XFS_I(inode); | 1404 | struct xfs_inode *ip = XFS_I(inode); |
1320 | xfs_fileoff_t start_fsb; | 1405 | xfs_fileoff_t start_fsb; |
1321 | xfs_fileoff_t end_fsb; | 1406 | xfs_fileoff_t end_fsb; |
1322 | int error; | 1407 | int error; |
1323 | 1408 | ||
1324 | truncate_pagecache(inode, to, inode->i_size); | 1409 | truncate_pagecache(inode, to, inode->i_size); |
1325 | 1410 | ||
1326 | /* | 1411 | /* |
1327 | * Check if there are any blocks that are outside of i_size | 1412 | * Check if there are any blocks that are outside of i_size |
1328 | * that need to be trimmed back. | 1413 | * that need to be trimmed back. |
1329 | */ | 1414 | */ |
1330 | start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1; | 1415 | start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1; |
1331 | end_fsb = XFS_B_TO_FSB(ip->i_mount, to); | 1416 | end_fsb = XFS_B_TO_FSB(ip->i_mount, to); |
1332 | if (end_fsb <= start_fsb) | 1417 | if (end_fsb <= start_fsb) |
1333 | return; | 1418 | return; |
1334 | 1419 | ||
1335 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 1420 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
1336 | error = xfs_bmap_punch_delalloc_range(ip, start_fsb, | 1421 | error = xfs_bmap_punch_delalloc_range(ip, start_fsb, |
1337 | end_fsb - start_fsb); | 1422 | end_fsb - start_fsb); |
1338 | if (error) { | 1423 | if (error) { |
1339 | /* something screwed, just bail */ | 1424 | /* something screwed, just bail */ |
1340 | if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { | 1425 | if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { |
1341 | xfs_alert(ip->i_mount, | 1426 | xfs_alert(ip->i_mount, |
1342 | "xfs_vm_write_failed: unable to clean up ino %lld", | 1427 | "xfs_vm_write_failed: unable to clean up ino %lld", |
1343 | ip->i_ino); | 1428 | ip->i_ino); |
1344 | } | 1429 | } |
1345 | } | 1430 | } |
1346 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 1431 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
1347 | } | 1432 | } |
1348 | } | 1433 | } |
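
On a failed or short write that was going to extend the file, the cleanup path truncates the page cache back and then punches out any delalloc blocks sitting past i_size, converting the byte range between i_size and the failed write's end into filesystem blocks first. Assuming XFS_B_TO_FSB rounds a byte count up to whole blocks, the rounding and the early-return check can be tried out in isolation with made-up 4k-block numbers:

#include <stdio.h>

/* Assumption: round a byte count up to whole 4k filesystem blocks. */
#define DEMO_BLKLOG     12
#define DEMO_BLKSIZE    (1ULL << DEMO_BLKLOG)
#define DEMO_B_TO_FSB(b) (((b) + DEMO_BLKSIZE - 1) >> DEMO_BLKLOG)

int main(void)
{
        unsigned long long isize = 10000;       /* current file size in bytes */
        unsigned long long to    = 30000;       /* end of the failed write */

        unsigned long long start_fsb = DEMO_B_TO_FSB(isize) + 1;
        unsigned long long end_fsb   = DEMO_B_TO_FSB(to);

        if (end_fsb <= start_fsb)
                printf("nothing beyond i_size to punch\n");
        else
                printf("punch blocks %llu..%llu (%llu blocks)\n",
                       start_fsb, end_fsb - 1, end_fsb - start_fsb);
        return 0;
}
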
1349 | 1434 | ||
1350 | STATIC int | 1435 | STATIC int |
1351 | xfs_vm_write_begin( | 1436 | xfs_vm_write_begin( |
1352 | struct file *file, | 1437 | struct file *file, |
1353 | struct address_space *mapping, | 1438 | struct address_space *mapping, |
1354 | loff_t pos, | 1439 | loff_t pos, |
1355 | unsigned len, | 1440 | unsigned len, |
1356 | unsigned flags, | 1441 | unsigned flags, |
1357 | struct page **pagep, | 1442 | struct page **pagep, |
1358 | void **fsdata) | 1443 | void **fsdata) |
1359 | { | 1444 | { |
1360 | int ret; | 1445 | int ret; |
1361 | 1446 | ||
1362 | ret = block_write_begin(mapping, pos, len, flags | AOP_FLAG_NOFS, | 1447 | ret = block_write_begin(mapping, pos, len, flags | AOP_FLAG_NOFS, |
1363 | pagep, xfs_get_blocks); | 1448 | pagep, xfs_get_blocks); |
1364 | if (unlikely(ret)) | 1449 | if (unlikely(ret)) |
1365 | xfs_vm_write_failed(mapping, pos + len); | 1450 | xfs_vm_write_failed(mapping, pos + len); |
1366 | return ret; | 1451 | return ret; |
1367 | } | 1452 | } |
1368 | 1453 | ||
1369 | STATIC int | 1454 | STATIC int |
1370 | xfs_vm_write_end( | 1455 | xfs_vm_write_end( |
1371 | struct file *file, | 1456 | struct file *file, |
1372 | struct address_space *mapping, | 1457 | struct address_space *mapping, |
1373 | loff_t pos, | 1458 | loff_t pos, |
1374 | unsigned len, | 1459 | unsigned len, |
1375 | unsigned copied, | 1460 | unsigned copied, |
1376 | struct page *page, | 1461 | struct page *page, |
1377 | void *fsdata) | 1462 | void *fsdata) |
1378 | { | 1463 | { |
1379 | int ret; | 1464 | int ret; |
1380 | 1465 | ||
1381 | ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); | 1466 | ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); |
1382 | if (unlikely(ret < len)) | 1467 | if (unlikely(ret < len)) |
1383 | xfs_vm_write_failed(mapping, pos + len); | 1468 | xfs_vm_write_failed(mapping, pos + len); |
1384 | return ret; | 1469 | return ret; |
1385 | } | 1470 | } |
1386 | 1471 | ||
1387 | STATIC sector_t | 1472 | STATIC sector_t |
1388 | xfs_vm_bmap( | 1473 | xfs_vm_bmap( |
1389 | struct address_space *mapping, | 1474 | struct address_space *mapping, |
1390 | sector_t block) | 1475 | sector_t block) |
1391 | { | 1476 | { |
1392 | struct inode *inode = (struct inode *)mapping->host; | 1477 | struct inode *inode = (struct inode *)mapping->host; |
1393 | struct xfs_inode *ip = XFS_I(inode); | 1478 | struct xfs_inode *ip = XFS_I(inode); |
1394 | 1479 | ||
1395 | trace_xfs_vm_bmap(XFS_I(inode)); | 1480 | trace_xfs_vm_bmap(XFS_I(inode)); |
1396 | xfs_ilock(ip, XFS_IOLOCK_SHARED); | 1481 | xfs_ilock(ip, XFS_IOLOCK_SHARED); |
1397 | xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); | 1482 | xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); |
1398 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); | 1483 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); |
1399 | return generic_block_bmap(mapping, block, xfs_get_blocks); | 1484 | return generic_block_bmap(mapping, block, xfs_get_blocks); |
1400 | } | 1485 | } |
1401 | 1486 | ||
1402 | STATIC int | 1487 | STATIC int |
1403 | xfs_vm_readpage( | 1488 | xfs_vm_readpage( |
1404 | struct file *unused, | 1489 | struct file *unused, |
1405 | struct page *page) | 1490 | struct page *page) |
1406 | { | 1491 | { |
1407 | return mpage_readpage(page, xfs_get_blocks); | 1492 | return mpage_readpage(page, xfs_get_blocks); |
1408 | } | 1493 | } |
1409 | 1494 | ||
1410 | STATIC int | 1495 | STATIC int |
1411 | xfs_vm_readpages( | 1496 | xfs_vm_readpages( |
1412 | struct file *unused, | 1497 | struct file *unused, |
1413 | struct address_space *mapping, | 1498 | struct address_space *mapping, |
1414 | struct list_head *pages, | 1499 | struct list_head *pages, |
1415 | unsigned nr_pages) | 1500 | unsigned nr_pages) |
1416 | { | 1501 | { |
1417 | return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); | 1502 | return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); |
1418 | } | 1503 | } |
1419 | 1504 | ||
1420 | const struct address_space_operations xfs_address_space_operations = { | 1505 | const struct address_space_operations xfs_address_space_operations = { |
1421 | .readpage = xfs_vm_readpage, | 1506 | .readpage = xfs_vm_readpage, |
1422 | .readpages = xfs_vm_readpages, | 1507 | .readpages = xfs_vm_readpages, |
fs/xfs/xfs_aops.h
1 | /* | 1 | /* |
2 | * Copyright (c) 2005-2006 Silicon Graphics, Inc. | 2 | * Copyright (c) 2005-2006 Silicon Graphics, Inc. |
3 | * All Rights Reserved. | 3 | * All Rights Reserved. |
4 | * | 4 | * |
5 | * This program is free software; you can redistribute it and/or | 5 | * This program is free software; you can redistribute it and/or |
6 | * modify it under the terms of the GNU General Public License as | 6 | * modify it under the terms of the GNU General Public License as |
7 | * published by the Free Software Foundation. | 7 | * published by the Free Software Foundation. |
8 | * | 8 | * |
9 | * This program is distributed in the hope that it would be useful, | 9 | * This program is distributed in the hope that it would be useful, |
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 | * GNU General Public License for more details. | 12 | * GNU General Public License for more details. |
13 | * | 13 | * |
14 | * You should have received a copy of the GNU General Public License | 14 | * You should have received a copy of the GNU General Public License |
15 | * along with this program; if not, write the Free Software Foundation, | 15 | * along with this program; if not, write the Free Software Foundation, |
16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | 16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
17 | */ | 17 | */ |
18 | #ifndef __XFS_AOPS_H__ | 18 | #ifndef __XFS_AOPS_H__ |
19 | #define __XFS_AOPS_H__ | 19 | #define __XFS_AOPS_H__ |
20 | 20 | ||
21 | extern mempool_t *xfs_ioend_pool; | 21 | extern mempool_t *xfs_ioend_pool; |
22 | 22 | ||
23 | /* | 23 | /* |
24 | * Types of I/O for bmap clustering and I/O completion tracking. | 24 | * Types of I/O for bmap clustering and I/O completion tracking. |
25 | */ | 25 | */ |
26 | enum { | 26 | enum { |
27 | IO_DIRECT = 0, /* special case for direct I/O ioends */ | 27 | IO_DIRECT = 0, /* special case for direct I/O ioends */ |
28 | IO_DELALLOC, /* mapping covers delalloc region */ | 28 | IO_DELALLOC, /* mapping covers delalloc region */ |
29 | IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */ | 29 | IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */ |
30 | IO_OVERWRITE, /* mapping covers already allocated extent */ | 30 | IO_OVERWRITE, /* mapping covers already allocated extent */ |
31 | }; | 31 | }; |
32 | 32 | ||
33 | #define XFS_IO_TYPES \ | 33 | #define XFS_IO_TYPES \ |
34 | { 0, "" }, \ | 34 | { 0, "" }, \ |
35 | { IO_DELALLOC, "delalloc" }, \ | 35 | { IO_DELALLOC, "delalloc" }, \ |
36 | { IO_UNWRITTEN, "unwritten" }, \ | 36 | { IO_UNWRITTEN, "unwritten" }, \ |
37 | { IO_OVERWRITE, "overwrite" } | 37 | { IO_OVERWRITE, "overwrite" } |
38 | 38 | ||
39 | /* | 39 | /* |
40 | * xfs_ioend struct manages large extent writes for XFS. | 40 | * xfs_ioend struct manages large extent writes for XFS. |
41 | * It can manage several multi-page bio's at once. | 41 | * It can manage several multi-page bio's at once. |
42 | */ | 42 | */ |
43 | typedef struct xfs_ioend { | 43 | typedef struct xfs_ioend { |
44 | struct xfs_ioend *io_list; /* next ioend in chain */ | 44 | struct xfs_ioend *io_list; /* next ioend in chain */ |
45 | unsigned int io_type; /* delalloc / unwritten */ | 45 | unsigned int io_type; /* delalloc / unwritten */ |
46 | int io_error; /* I/O error code */ | 46 | int io_error; /* I/O error code */ |
47 | atomic_t io_remaining; /* hold count */ | 47 | atomic_t io_remaining; /* hold count */ |
48 | unsigned int io_isasync : 1; /* needs aio_complete */ | 48 | unsigned int io_isasync : 1; /* needs aio_complete */ |
49 | unsigned int io_isdirect : 1;/* direct I/O */ | ||
49 | struct inode *io_inode; /* file being written to */ | 50 | struct inode *io_inode; /* file being written to */ |
50 | struct buffer_head *io_buffer_head;/* buffer linked list head */ | 51 | struct buffer_head *io_buffer_head;/* buffer linked list head */ |
51 | struct buffer_head *io_buffer_tail;/* buffer linked list tail */ | 52 | struct buffer_head *io_buffer_tail;/* buffer linked list tail */ |
52 | size_t io_size; /* size of the extent */ | 53 | size_t io_size; /* size of the extent */ |
53 | xfs_off_t io_offset; /* offset in the file */ | 54 | xfs_off_t io_offset; /* offset in the file */ |
54 | struct work_struct io_work; /* xfsdatad work queue */ | 55 | struct work_struct io_work; /* xfsdatad work queue */ |
56 | struct xfs_trans *io_append_trans;/* xact. for size update */ | ||
55 | struct kiocb *io_iocb; | 57 | struct kiocb *io_iocb; |
56 | int io_result; | 58 | int io_result; |
57 | } xfs_ioend_t; | 59 | } xfs_ioend_t; |
58 | 60 | ||
59 | extern const struct address_space_operations xfs_address_space_operations; | 61 | extern const struct address_space_operations xfs_address_space_operations; |
60 | extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); | 62 | extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); |
61 | 63 | ||
62 | extern void xfs_count_page_state(struct page *, int *, int *); | 64 | extern void xfs_count_page_state(struct page *, int *, int *); |
63 | 65 | ||
64 | #endif /* __XFS_AOPS_H__ */ | 66 | #endif /* __XFS_AOPS_H__ */ |
65 | 67 |