Doug / smarc-fsl-linux-kernel

1

/*

1

/*

2

3

4

*

4

*

5

* This program is free software; you can redistribute it and/or

5

* This program is free software; you can redistribute it and/or

6

* modify it under the terms of the GNU General Public License as

6

* modify it under the terms of the GNU General Public License as

7

* published by the Free Software Foundation.

7

* published by the Free Software Foundation.

8

*

8

*

9

* This program is distributed in the hope that it would be useful,

9

* This program is distributed in the hope that it would be useful,

10

* but WITHOUT ANY WARRANTY; without even the implied warranty of

10

* but WITHOUT ANY WARRANTY; without even the implied warranty of

11

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

11

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

12

* GNU General Public License for more details.

12

* GNU General Public License for more details.

13

*

13

*

14

* You should have received a copy of the GNU General Public License

14

* You should have received a copy of the GNU General Public License

15

* along with this program; if not, write the Free Software Foundation,

15

* along with this program; if not, write the Free Software Foundation,

16

* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

16

* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

17

*/

17

*/

18

#include "xfs.h"

18

#include "xfs.h"

19

#include "xfs_bit.h"

19

#include "xfs_bit.h"

20

#include "xfs_log.h"

20

#include "xfs_log.h"

21

#include "xfs_inum.h"

21

#include "xfs_inum.h"

22

#include "xfs_sb.h"

22

#include "xfs_sb.h"

23

#include "xfs_ag.h"

23

#include "xfs_ag.h"

24

#include "xfs_trans.h"

24

#include "xfs_trans.h"

25

#include "xfs_mount.h"

25

#include "xfs_mount.h"

26

#include "xfs_bmap_btree.h"

26

#include "xfs_bmap_btree.h"

27

#include "xfs_dinode.h"

27

#include "xfs_dinode.h"

28

#include "xfs_inode.h"

28

#include "xfs_inode.h"

29

#include "xfs_alloc.h"

29

#include "xfs_alloc.h"

30

#include "xfs_error.h"

30

#include "xfs_error.h"

31

#include "xfs_rw.h"

31

#include "xfs_rw.h"

32

#include "xfs_iomap.h"

32

#include "xfs_iomap.h"

33

#include "xfs_vnodeops.h"

33

#include "xfs_vnodeops.h"

34

#include "xfs_trace.h"

34

#include "xfs_trace.h"

35

#include "xfs_bmap.h"

35

#include "xfs_bmap.h"

36

#include <linux/gfp.h>

36

#include <linux/gfp.h>

37

#include <linux/mpage.h>

37

#include <linux/mpage.h>

38

#include <linux/pagevec.h>

38

#include <linux/pagevec.h>

39

#include <linux/writeback.h>

39

#include <linux/writeback.h>

40

41

/*
 * Types of I/O for bmap clustering and I/O completion tracking.
 *
 * The enumerators are stored in an ioend's io_type field and compared
 * against it by the completion code in this file.
 */
enum {
	IO_READ,	/* mapping for a read */
	IO_DELAY,	/* mapping covers delalloc region */
	IO_UNWRITTEN,	/* mapping covers allocated but uninitialized data */
	IO_NEW		/* just allocated */
};

50

51

/*

51

/*

52

* Prime number of hash buckets since address is used as the key.

52

* Prime number of hash buckets since address is used as the key.

53

*/

53

*/

54

#define NVSYNC 37

54

#define NVSYNC 37

55

#define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC])

55

#define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC])

56

static wait_queue_head_t xfs_ioend_wq[NVSYNC];

56

static wait_queue_head_t xfs_ioend_wq[NVSYNC];

57

58

void __init

58

void __init

59

xfs_ioend_init(void)

59

xfs_ioend_init(void)

60

{

60

{

61

int i;

61

int i;

62

63

for (i = 0; i < NVSYNC; i++)

63

for (i = 0; i < NVSYNC; i++)

64

init_waitqueue_head(&xfs_ioend_wq[i]);

64

init_waitqueue_head(&xfs_ioend_wq[i]);

65

}

65

}

66

67

void

67

void

68

xfs_ioend_wait(

68

xfs_ioend_wait(

69

xfs_inode_t *ip)

69

xfs_inode_t *ip)

70

{

70

{

71

wait_queue_head_t *wq = to_ioend_wq(ip);

71

wait_queue_head_t *wq = to_ioend_wq(ip);

72

73

wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));

73

wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));

74

}

74

}

75

76

STATIC void

76

STATIC void

77

xfs_ioend_wake(

77

xfs_ioend_wake(

78

xfs_inode_t *ip)

78

xfs_inode_t *ip)

79

{

79

{

80

if (atomic_dec_and_test(&ip->i_iocount))

80

if (atomic_dec_and_test(&ip->i_iocount))

81

wake_up(to_ioend_wq(ip));

81

wake_up(to_ioend_wq(ip));

82

}

82

}

83

84

void

84

void

85

xfs_count_page_state(

85

xfs_count_page_state(

86

struct page *page,

86

struct page *page,

87

int *delalloc,

87

int *delalloc,

88

int *unwritten)

88

int *unwritten)

89

{

89

{

90

struct buffer_head *bh, *head;

90

struct buffer_head *bh, *head;

91

92

*delalloc = *unwritten = 0;

92

*delalloc = *unwritten = 0;

93

94

bh = head = page_buffers(page);

94

bh = head = page_buffers(page);

95

do {

95

do {

96

if (buffer_unwritten(bh))

96

if (buffer_unwritten(bh))

97

(*unwritten) = 1;

97

(*unwritten) = 1;

98

else if (buffer_delay(bh))

98

else if (buffer_delay(bh))

99

(*delalloc) = 1;

99

(*delalloc) = 1;

100

} while ((bh = bh->b_this_page) != head);

100

} while ((bh = bh->b_this_page) != head);

101

}

101

}

102

103

STATIC struct block_device *

103

STATIC struct block_device *

104

xfs_find_bdev_for_inode(

104

xfs_find_bdev_for_inode(

105

struct inode *inode)

105

struct inode *inode)

106

{

106

{

107

struct xfs_inode *ip = XFS_I(inode);

107

struct xfs_inode *ip = XFS_I(inode);

108

struct xfs_mount *mp = ip->i_mount;

108

struct xfs_mount *mp = ip->i_mount;

109

110

if (XFS_IS_REALTIME_INODE(ip))

110

if (XFS_IS_REALTIME_INODE(ip))

111

return mp->m_rtdev_targp->bt_bdev;

111

return mp->m_rtdev_targp->bt_bdev;

112

else

112

else

113

return mp->m_ddev_targp->bt_bdev;

113

return mp->m_ddev_targp->bt_bdev;

114

}

114

}

115

116

/*

116

/*

117

* We're now finished for good with this ioend structure.

117

* We're now finished for good with this ioend structure.

118

* Update the page state via the associated buffer_heads,

118

* Update the page state via the associated buffer_heads,

119

* release holds on the inode and bio, and finally free

119

* release holds on the inode and bio, and finally free

120

* up memory. Do not use the ioend after this.

120

* up memory. Do not use the ioend after this.

121

*/

121

*/

122

STATIC void

122

STATIC void

123

xfs_destroy_ioend(

123

xfs_destroy_ioend(

124

xfs_ioend_t *ioend)

124

xfs_ioend_t *ioend)

125

{

125

{

126

struct buffer_head *bh, *next;

126

struct buffer_head *bh, *next;

127

struct xfs_inode *ip = XFS_I(ioend->io_inode);

127

struct xfs_inode *ip = XFS_I(ioend->io_inode);

128

129

for (bh = ioend->io_buffer_head; bh; bh = next) {

129

for (bh = ioend->io_buffer_head; bh; bh = next) {

130

next = bh->b_private;

130

next = bh->b_private;

131

bh->b_end_io(bh, !ioend->io_error);

131

bh->b_end_io(bh, !ioend->io_error);

132

}

132

}

133

134

/*

134

/*

135

* Volume managers supporting multiple paths can send back ENODEV

135

* Volume managers supporting multiple paths can send back ENODEV

136

* when the final path disappears. In this case continuing to fill

136

* when the final path disappears. In this case continuing to fill

137

* the page cache with dirty data which cannot be written out is

137

* the page cache with dirty data which cannot be written out is

138

* evil, so prevent that.

138

* evil, so prevent that.

139

*/

139

*/

140

if (unlikely(ioend->io_error == -ENODEV)) {

140

if (unlikely(ioend->io_error == -ENODEV)) {

141

xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ,

141

xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ,

142

__FILE__, __LINE__);

142

__FILE__, __LINE__);

143

}

143

}

144

145

xfs_ioend_wake(ip);

145

xfs_ioend_wake(ip);

146

mempool_free(ioend, xfs_ioend_pool);

146

mempool_free(ioend, xfs_ioend_pool);

147

}

147

}

148

149

/*

149

/*

150

* If the end of the current ioend is beyond the current EOF,

150

* If the end of the current ioend is beyond the current EOF,

151

* return the new EOF value, otherwise zero.

151

* return the new EOF value, otherwise zero.

152

*/

152

*/

153

STATIC xfs_fsize_t

153

STATIC xfs_fsize_t

154

xfs_ioend_new_eof(

154

xfs_ioend_new_eof(

155

xfs_ioend_t *ioend)

155

xfs_ioend_t *ioend)

156

{

156

{

157

xfs_inode_t *ip = XFS_I(ioend->io_inode);

157

xfs_inode_t *ip = XFS_I(ioend->io_inode);

158

xfs_fsize_t isize;

158

xfs_fsize_t isize;

159

xfs_fsize_t bsize;

159

xfs_fsize_t bsize;

160

161

bsize = ioend->io_offset + ioend->io_size;

161

bsize = ioend->io_offset + ioend->io_size;

162

isize = MAX(ip->i_size, ip->i_new_size);

162

isize = MAX(ip->i_size, ip->i_new_size);

163

isize = MIN(isize, bsize);

163

isize = MIN(isize, bsize);

164

return isize > ip->i_d.di_size ? isize : 0;

164

return isize > ip->i_d.di_size ? isize : 0;

165

}

165

}

166

167

/*

167

/*

168

* Update on-disk file size now that data has been written to disk. The

168

* Update on-disk file size now that data has been written to disk. The

169

* current in-memory file size is i_size. If a write is beyond eof i_new_size

169

* current in-memory file size is i_size. If a write is beyond eof i_new_size

170

* will be the intended file size until i_size is updated. If this write does

170

* will be the intended file size until i_size is updated. If this write does

171

* not extend all the way to the valid file size then restrict this update to

171

* not extend all the way to the valid file size then restrict this update to

172

* the end of the write.

172

* the end of the write.

173

*

173

*

174

* This function does not block as blocking on the inode lock in IO completion

174

* This function does not block as blocking on the inode lock in IO completion

175

* can lead to IO completion order dependency deadlocks.. If it can't get the

175

* can lead to IO completion order dependency deadlocks.. If it can't get the

176

* inode ilock it will return EAGAIN. Callers must handle this.

176

* inode ilock it will return EAGAIN. Callers must handle this.

177

*/

177

*/

178

STATIC int

178

STATIC int

179

xfs_setfilesize(

179

xfs_setfilesize(

180

xfs_ioend_t *ioend)

180

xfs_ioend_t *ioend)

181

{

181

{

182

xfs_inode_t *ip = XFS_I(ioend->io_inode);

182

xfs_inode_t *ip = XFS_I(ioend->io_inode);

183

xfs_fsize_t isize;

183

xfs_fsize_t isize;

184

185

ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);

185

ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);

186

ASSERT(ioend->io_type != IO_READ);

186

ASSERT(ioend->io_type != IO_READ);

187

188

if (unlikely(ioend->io_error))

188

if (unlikely(ioend->io_error))

189

return 0;

189

return 0;

190

191

if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))

191

if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))

192

return EAGAIN;

192

return EAGAIN;

193

194

isize = xfs_ioend_new_eof(ioend);

194

isize = xfs_ioend_new_eof(ioend);

195

if (isize) {

195

if (isize) {

196

ip->i_d.di_size = isize;

196

ip->i_d.di_size = isize;

197

xfs_mark_inode_dirty(ip);

197

xfs_mark_inode_dirty(ip);

198

}

198

}

199

200

xfs_iunlock(ip, XFS_ILOCK_EXCL);

200

xfs_iunlock(ip, XFS_ILOCK_EXCL);

201

return 0;

201

return 0;

202

}

202

}

203

204

/*

204

/*

205

* Schedule IO completion handling on a xfsdatad if this was

205

* Schedule IO completion handling on a xfsdatad if this was

206

* the final hold on this ioend. If we are asked to wait,

206

* the final hold on this ioend. If we are asked to wait,

207

* flush the workqueue.

207

* flush the workqueue.

208

*/

208

*/

209

STATIC void

209

STATIC void

210

xfs_finish_ioend(

210

xfs_finish_ioend(

211

xfs_ioend_t *ioend,

211

xfs_ioend_t *ioend,

212

int wait)

212

int wait)

213

{

213

{

214

if (atomic_dec_and_test(&ioend->io_remaining)) {

214

if (atomic_dec_and_test(&ioend->io_remaining)) {

215

struct workqueue_struct *wq;

215

struct workqueue_struct *wq;

216

217

wq = (ioend->io_type == IO_UNWRITTEN) ?

217

wq = (ioend->io_type == IO_UNWRITTEN) ?

218

xfsconvertd_workqueue : xfsdatad_workqueue;

218

xfsconvertd_workqueue : xfsdatad_workqueue;

219

queue_work(wq, &ioend->io_work);

219

queue_work(wq, &ioend->io_work);

220

if (wait)

220

if (wait)

221

flush_workqueue(wq);

221

flush_workqueue(wq);

222

}

222

}

223

}

223

}

224

225

/*

225

/*

226

* IO write completion.

226

* IO write completion.

227

*/

227

*/

228

STATIC void

228

STATIC void

229

xfs_end_io(

229

xfs_end_io(

230

struct work_struct *work)

230

struct work_struct *work)

231

{

231

{

232

xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work);

232

xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work);

233

struct xfs_inode *ip = XFS_I(ioend->io_inode);

233

struct xfs_inode *ip = XFS_I(ioend->io_inode);

234

int error = 0;

234

int error = 0;

235

236

/*

236

/*

237

* For unwritten extents we need to issue transactions to convert a

237

* For unwritten extents we need to issue transactions to convert a

238

* range to normal written extens after the data I/O has finished.

238

* range to normal written extens after the data I/O has finished.

239

*/

239

*/

240

if (ioend->io_type == IO_UNWRITTEN &&

240

if (ioend->io_type == IO_UNWRITTEN &&

241

likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {

241

likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {

242

243

error = xfs_iomap_write_unwritten(ip, ioend->io_offset,

243

error = xfs_iomap_write_unwritten(ip, ioend->io_offset,

244

ioend->io_size);

244

ioend->io_size);

245

if (error)

245

if (error)

246

ioend->io_error = error;

246

ioend->io_error = error;

247

}

247

}

248

249

/*

249

/*

250

* We might have to update the on-disk file size after extending

250

* We might have to update the on-disk file size after extending

251

* writes.

251

* writes.

252

*/

252

*/

253

if (ioend->io_type != IO_READ) {

253

if (ioend->io_type != IO_READ) {

254

error = xfs_setfilesize(ioend);

254

error = xfs_setfilesize(ioend);

255

ASSERT(!error || error == EAGAIN);

255

ASSERT(!error || error == EAGAIN);

256

}

256

}

257

258

/*

258

/*

259

* If we didn't complete processing of the ioend, requeue it to the

259

* If we didn't complete processing of the ioend, requeue it to the

260

* tail of the workqueue for another attempt later. Otherwise destroy

260

* tail of the workqueue for another attempt later. Otherwise destroy

261

* it.

261

* it.

262

*/

262

*/

263

if (error == EAGAIN) {

263

if (error == EAGAIN) {

264

atomic_inc(&ioend->io_remaining);

264

atomic_inc(&ioend->io_remaining);

265

xfs_finish_ioend(ioend, 0);

265

xfs_finish_ioend(ioend, 0);

266

/* ensure we don't spin on blocked ioends */

266

/* ensure we don't spin on blocked ioends */

267

delay(1);

267

delay(1);

268

} else

268

} else

269

xfs_destroy_ioend(ioend);

269

xfs_destroy_ioend(ioend);

270

}

270

}

271

272

/*

272

/*

273

* Allocate and initialise an IO completion structure.

273

* Allocate and initialise an IO completion structure.

274

* We need to track unwritten extent write completion here initially.

274

* We need to track unwritten extent write completion here initially.

275

* We'll need to extend this for updating the ondisk inode size later

275

* We'll need to extend this for updating the ondisk inode size later

276

* (vs. incore size).

276

* (vs. incore size).

277

*/

277

*/

278

STATIC xfs_ioend_t *

278

STATIC xfs_ioend_t *

279

xfs_alloc_ioend(

279

xfs_alloc_ioend(

280

struct inode *inode,

280

struct inode *inode,

281

unsigned int type)

281

unsigned int type)

282

{

282

{

283

xfs_ioend_t *ioend;

283

xfs_ioend_t *ioend;

284

285

ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);

285

ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);

286

287

/*

287

/*

288

* Set the count to 1 initially, which will prevent an I/O

288

* Set the count to 1 initially, which will prevent an I/O

289

* completion callback from happening before we have started

289

* completion callback from happening before we have started

290

* all the I/O from calling the completion routine too early.

290

* all the I/O from calling the completion routine too early.

291

*/

291

*/

292

atomic_set(&ioend->io_remaining, 1);

292

atomic_set(&ioend->io_remaining, 1);

293

ioend->io_error = 0;

293

ioend->io_error = 0;

294

ioend->io_list = NULL;

294

ioend->io_list = NULL;

295

ioend->io_type = type;

295

ioend->io_type = type;

296

ioend->io_inode = inode;

296

ioend->io_inode = inode;

297

ioend->io_buffer_head = NULL;

297

ioend->io_buffer_head = NULL;

298

ioend->io_buffer_tail = NULL;

298

ioend->io_buffer_tail = NULL;

299

atomic_inc(&XFS_I(ioend->io_inode)->i_iocount);

299

atomic_inc(&XFS_I(ioend->io_inode)->i_iocount);

300

ioend->io_offset = 0;

300

ioend->io_offset = 0;

301

ioend->io_size = 0;

301

ioend->io_size = 0;

302

303

INIT_WORK(&ioend->io_work, xfs_end_io);

303

INIT_WORK(&ioend->io_work, xfs_end_io);

304

return ioend;

304

return ioend;

305

}

305

}

306

307

STATIC int

307

STATIC int

308

xfs_map_blocks(

308

xfs_map_blocks(

309

struct inode *inode,

309

struct inode *inode,

310

loff_t offset,

310

loff_t offset,

311

ssize_t count,

311

ssize_t count,

312

struct xfs_bmbt_irec *imap,

312

struct xfs_bmbt_irec *imap,

313

int flags)

313

int flags)

314

{

314

{

315

int nmaps = 1;

315

int nmaps = 1;

316

int new = 0;

316

int new = 0;

317

318

return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new);

318

return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new);

319

}

319

}

320

321

STATIC int

321

STATIC int

322

xfs_imap_valid(

322

xfs_imap_valid(

323

struct inode *inode,

323

struct inode *inode,

324

struct xfs_bmbt_irec *imap,

324

struct xfs_bmbt_irec *imap,

325

xfs_off_t offset)

325

xfs_off_t offset)

326

{

326

{

327

offset >>= inode->i_blkbits;

327

offset >>= inode->i_blkbits;

328

329

return offset >= imap->br_startoff &&

329

return offset >= imap->br_startoff &&

330

offset < imap->br_startoff + imap->br_blockcount;

330

offset < imap->br_startoff + imap->br_blockcount;

331

}

331

}

332

333

/*

333

/*

334

* BIO completion handler for buffered IO.

334

* BIO completion handler for buffered IO.

335

*/

335

*/

336

STATIC void

336

STATIC void

337

xfs_end_bio(

337

xfs_end_bio(

338

struct bio *bio,

338

struct bio *bio,

339

int error)

339

int error)

340

{

340

{

341

xfs_ioend_t *ioend = bio->bi_private;

341

xfs_ioend_t *ioend = bio->bi_private;

342

343

ASSERT(atomic_read(&bio->bi_cnt) >= 1);

343

ASSERT(atomic_read(&bio->bi_cnt) >= 1);

344

ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;

344

ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;

345

346

/* Toss bio and pass work off to an xfsdatad thread */

346

/* Toss bio and pass work off to an xfsdatad thread */

347

bio->bi_private = NULL;

347

bio->bi_private = NULL;

348

bio->bi_end_io = NULL;

348

bio->bi_end_io = NULL;

349

bio_put(bio);

349

bio_put(bio);

350

351

xfs_finish_ioend(ioend, 0);

351

xfs_finish_ioend(ioend, 0);

352

}

352

}

353

354

STATIC void

354

STATIC void

355

xfs_submit_ioend_bio(

355

xfs_submit_ioend_bio(

356

struct writeback_control *wbc,

356

struct writeback_control *wbc,

357

xfs_ioend_t *ioend,

357

xfs_ioend_t *ioend,

358

struct bio *bio)

358

struct bio *bio)

359

{

359

{

360

atomic_inc(&ioend->io_remaining);

360

atomic_inc(&ioend->io_remaining);

361

bio->bi_private = ioend;

361

bio->bi_private = ioend;

362

bio->bi_end_io = xfs_end_bio;

362

bio->bi_end_io = xfs_end_bio;

363

364

/*

364

/*

365

* If the I/O is beyond EOF we mark the inode dirty immediately

365

* If the I/O is beyond EOF we mark the inode dirty immediately

366

* but don't update the inode size until I/O completion.

366

* but don't update the inode size until I/O completion.

367

*/

367

*/

368

if (xfs_ioend_new_eof(ioend))

368

if (xfs_ioend_new_eof(ioend))

369

xfs_mark_inode_dirty(XFS_I(ioend->io_inode));

369

xfs_mark_inode_dirty(XFS_I(ioend->io_inode));

370

371

submit_bio(wbc->sync_mode == WB_SYNC_ALL ?

371

submit_bio(wbc->sync_mode == WB_SYNC_ALL ?

372

WRITE_SYNC_PLUG : WRITE, bio);

372

WRITE_SYNC_PLUG : WRITE, bio);

373

ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));

373

ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));

374

bio_put(bio);

374

bio_put(bio);

375

}

375

}

376

377

STATIC struct bio *

377

STATIC struct bio *

378

xfs_alloc_ioend_bio(

378

xfs_alloc_ioend_bio(

379

struct buffer_head *bh)

379

struct buffer_head *bh)

380

{

380

{

381

struct bio *bio;

381

struct bio *bio;

382

int nvecs = bio_get_nr_vecs(bh->b_bdev);

382

int nvecs = bio_get_nr_vecs(bh->b_bdev);

383

384

do {

384

do {

385

bio = bio_alloc(GFP_NOIO, nvecs);

385

bio = bio_alloc(GFP_NOIO, nvecs);

386

nvecs >>= 1;

386

nvecs >>= 1;

387

} while (!bio);

387

} while (!bio);

388

389

ASSERT(bio->bi_private == NULL);

389

ASSERT(bio->bi_private == NULL);

390

bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);

390

bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);

391

bio->bi_bdev = bh->b_bdev;

391

bio->bi_bdev = bh->b_bdev;

392

bio_get(bio);

392

bio_get(bio);

393

return bio;

393

return bio;

394

}

394

}

395

396

/*
 * Mark a buffer as under asynchronous write, up to date and clean.
 *
 * The buffer must already be mapped and locked, and must no longer be
 * flagged delay or unwritten.
 */
STATIC void
xfs_start_buffer_writeback(
	struct buffer_head	*bh)
{
	ASSERT(buffer_mapped(bh));
	ASSERT(buffer_locked(bh));
	ASSERT(!buffer_delay(bh));
	ASSERT(!buffer_unwritten(bh));

	mark_buffer_async_write(bh);
	set_buffer_uptodate(bh);
	clear_buffer_dirty(bh);
}

409

410

/*
 * Move a locked page into writeback and unlock it.
 *
 * @clear_dirty: when non-zero, clear the page's dirty state for I/O first.
 * @buffers:     when zero, writeback is ended again straight away.
 */
STATIC void
xfs_start_page_writeback(
	struct page		*page,
	int			clear_dirty,
	int			buffers)
{
	ASSERT(PageLocked(page));
	ASSERT(!PageWriteback(page));
	if (clear_dirty)
		clear_page_dirty_for_io(page);
	set_page_writeback(page);
	unlock_page(page);
	/* If no buffers on the page are to be written, finish it here */
	if (!buffers)
		end_page_writeback(page);
}

426

427

static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)

427

static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)

428

{

428

{

429

return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));

429

return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));

430

}

430

}

431

432

/*

432

/*

433

* Submit all of the bios for all of the ioends we have saved up, covering the

433

* Submit all of the bios for all of the ioends we have saved up, covering the

434

* initial writepage page and also any probed pages.

434

* initial writepage page and also any probed pages.

435

*

435

*

436

* Because we may have multiple ioends spanning a page, we need to start

436

* Because we may have multiple ioends spanning a page, we need to start

437

* writeback on all the buffers before we submit them for I/O. If we mark the

437

* writeback on all the buffers before we submit them for I/O. If we mark the

438

* buffers as we got, then we can end up with a page that only has buffers

438

* buffers as we got, then we can end up with a page that only has buffers

439

* marked async write and I/O complete on can occur before we mark the other

439

* marked async write and I/O complete on can occur before we mark the other

440

* buffers async write.

440

* buffers async write.

441

*

441

*

442

* The end result of this is that we trip a bug in end_page_writeback() because

442

* The end result of this is that we trip a bug in end_page_writeback() because

443

* we call it twice for the one page as the code in end_buffer_async_write()

443

* we call it twice for the one page as the code in end_buffer_async_write()

444

* assumes that all buffers on the page are started at the same time.

444

* assumes that all buffers on the page are started at the same time.

445

*

445

*

446

* The fix is two passes across the ioend list - one to start writeback on the

446

* The fix is two passes across the ioend list - one to start writeback on the

447

* buffer_heads, and then submit them for I/O on the second pass.

447

* buffer_heads, and then submit them for I/O on the second pass.

448

*/

448

*/

449

STATIC void

449

STATIC void

450

xfs_submit_ioend(

450

xfs_submit_ioend(

451

struct writeback_control *wbc,

451

struct writeback_control *wbc,

452

xfs_ioend_t *ioend)

452

xfs_ioend_t *ioend)

453

{

453

{

454

xfs_ioend_t *head = ioend;

454

xfs_ioend_t *head = ioend;

455

xfs_ioend_t *next;

455

xfs_ioend_t *next;

456

struct buffer_head *bh;

456

struct buffer_head *bh;

457

struct bio *bio;

457

struct bio *bio;

458

sector_t lastblock = 0;

458

sector_t lastblock = 0;

459

460

/* Pass 1 - start writeback */

460

/* Pass 1 - start writeback */

461

do {

461

do {

462

next = ioend->io_list;

462

next = ioend->io_list;

463

for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {

463

for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {

464

xfs_start_buffer_writeback(bh);

464

xfs_start_buffer_writeback(bh);

465

}

465

}

466

} while ((ioend = next) != NULL);

466

} while ((ioend = next) != NULL);

467

468

/* Pass 2 - submit I/O */

468

/* Pass 2 - submit I/O */

469

ioend = head;

469

ioend = head;

470

do {

470

do {

471

next = ioend->io_list;

471

next = ioend->io_list;

472

bio = NULL;

472

bio = NULL;

473

474

for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {

474

for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {

475

476

if (!bio) {

476

if (!bio) {

477

retry:

477

retry:

478

bio = xfs_alloc_ioend_bio(bh);

478

bio = xfs_alloc_ioend_bio(bh);

479

} else if (bh->b_blocknr != lastblock + 1) {

479

} else if (bh->b_blocknr != lastblock + 1) {

480

xfs_submit_ioend_bio(wbc, ioend, bio);

480

xfs_submit_ioend_bio(wbc, ioend, bio);

481

goto retry;

481

goto retry;

482

}

482

}

483

484

if (bio_add_buffer(bio, bh) != bh->b_size) {

484

if (bio_add_buffer(bio, bh) != bh->b_size) {

485

xfs_submit_ioend_bio(wbc, ioend, bio);

485

xfs_submit_ioend_bio(wbc, ioend, bio);

486

goto retry;

486

goto retry;

487

}

487

}

488

489

lastblock = bh->b_blocknr;

489

lastblock = bh->b_blocknr;

490

}

490

}

491

if (bio)

491

if (bio)

492

xfs_submit_ioend_bio(wbc, ioend, bio);

492

xfs_submit_ioend_bio(wbc, ioend, bio);

493

xfs_finish_ioend(ioend, 0);

493

xfs_finish_ioend(ioend, 0);

494

} while ((ioend = next) != NULL);

494

} while ((ioend = next) != NULL);

495

}

495

}

496

497

/*

497

/*

498

* Cancel submission of all buffer_heads so far in this endio.

498

* Cancel submission of all buffer_heads so far in this endio.

499

* Toss the endio too. Only ever called for the initial page

499

* Toss the endio too. Only ever called for the initial page

500

* in a writepage request, so only ever one page.

500

* in a writepage request, so only ever one page.

501

*/

501

*/

502

STATIC void

502

STATIC void

503

xfs_cancel_ioend(

503

xfs_cancel_ioend(

504

xfs_ioend_t *ioend)

504

xfs_ioend_t *ioend)

505

{

505

{

506

xfs_ioend_t *next;

506

xfs_ioend_t *next;

507

struct buffer_head *bh, *next_bh;

507

struct buffer_head *bh, *next_bh;

508

509

do {

509

do {

510

next = ioend->io_list;

510

next = ioend->io_list;

511

bh = ioend->io_buffer_head;

511

bh = ioend->io_buffer_head;

512

do {

512

do {

513

next_bh = bh->b_private;

513

next_bh = bh->b_private;

514

clear_buffer_async_write(bh);

514

clear_buffer_async_write(bh);

515

unlock_buffer(bh);

515

unlock_buffer(bh);

516

} while ((bh = next_bh) != NULL);

516

} while ((bh = next_bh) != NULL);

517

518

xfs_ioend_wake(XFS_I(ioend->io_inode));

518

xfs_ioend_wake(XFS_I(ioend->io_inode));

519

mempool_free(ioend, xfs_ioend_pool);

519

mempool_free(ioend, xfs_ioend_pool);

520

} while ((ioend = next) != NULL);

520

} while ((ioend = next) != NULL);

521

}

521

}

522

523

/*

523

/*

524

* Test to see if we've been building up a completion structure for

524

* Test to see if we've been building up a completion structure for

525

* earlier buffers -- if so, we try to append to this ioend if we

525

* earlier buffers -- if so, we try to append to this ioend if we

526

* can, otherwise we finish off any current ioend and start another.

526

* can, otherwise we finish off any current ioend and start another.

527

* Return true if we've finished the given ioend.

527

* Return true if we've finished the given ioend.

528

*/

528

*/

529

STATIC void

529

STATIC void

530

xfs_add_to_ioend(

530

xfs_add_to_ioend(

531

struct inode *inode,

531

struct inode *inode,

532

struct buffer_head *bh,

532

struct buffer_head *bh,

533

xfs_off_t offset,

533

xfs_off_t offset,

534

unsigned int type,

534

unsigned int type,

535

xfs_ioend_t **result,

535

xfs_ioend_t **result,

536

int need_ioend)

536

int need_ioend)

537

{

537

{

538

xfs_ioend_t *ioend = *result;

538

xfs_ioend_t *ioend = *result;

539

540

if (!ioend || need_ioend || type != ioend->io_type) {

540

if (!ioend || need_ioend || type != ioend->io_type) {

541

xfs_ioend_t *previous = *result;

541

xfs_ioend_t *previous = *result;

542

543

ioend = xfs_alloc_ioend(inode, type);

543

ioend = xfs_alloc_ioend(inode, type);

544

ioend->io_offset = offset;

544

ioend->io_offset = offset;

545

ioend->io_buffer_head = bh;

545

ioend->io_buffer_head = bh;

546

ioend->io_buffer_tail = bh;

546

ioend->io_buffer_tail = bh;

547

if (previous)

547

if (previous)

548

previous->io_list = ioend;

548

previous->io_list = ioend;

549

*result = ioend;

549

*result = ioend;

550

} else {

550

} else {

551

ioend->io_buffer_tail->b_private = bh;

551

ioend->io_buffer_tail->b_private = bh;

552

ioend->io_buffer_tail = bh;

552

ioend->io_buffer_tail = bh;

553

}

553

}

554

555

bh->b_private = NULL;

555

bh->b_private = NULL;

556

ioend->io_size += bh->b_size;

556

ioend->io_size += bh->b_size;

557

}

557

}

558

559

STATIC void

559

STATIC void

560

xfs_map_buffer(

560

xfs_map_buffer(

561

struct inode *inode,

561

struct inode *inode,

562

struct buffer_head *bh,

562

struct buffer_head *bh,

563

struct xfs_bmbt_irec *imap,

563

struct xfs_bmbt_irec *imap,

564

xfs_off_t offset)

564

xfs_off_t offset)

565

{

565

{

566

sector_t bn;

566

sector_t bn;

567

struct xfs_mount *m = XFS_I(inode)->i_mount;

567

struct xfs_mount *m = XFS_I(inode)->i_mount;

568

xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);

568

xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);

569

xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);

569

xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);

570

571

ASSERT(imap->br_startblock != HOLESTARTBLOCK);

571

ASSERT(imap->br_startblock != HOLESTARTBLOCK);

572

ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

572

ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

573

574

bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +

574

bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +

575

((offset - iomap_offset) >> inode->i_blkbits);

575

((offset - iomap_offset) >> inode->i_blkbits);

576

577

ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));

577

ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));

578

579

bh->b_blocknr = bn;

579

bh->b_blocknr = bn;

580

set_buffer_mapped(bh);

580

set_buffer_mapped(bh);

581

}

581

}

582

583

STATIC void

583

STATIC void

584

xfs_map_at_offset(

584

xfs_map_at_offset(

585

struct inode *inode,

585

struct inode *inode,

586

struct buffer_head *bh,

586

struct buffer_head *bh,

587

struct xfs_bmbt_irec *imap,

587

struct xfs_bmbt_irec *imap,

588

xfs_off_t offset)

588

xfs_off_t offset)

589

{

589

{

590

ASSERT(imap->br_startblock != HOLESTARTBLOCK);

590

ASSERT(imap->br_startblock != HOLESTARTBLOCK);

591

ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

591

ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

592

593

lock_buffer(bh);

593

lock_buffer(bh);

594

xfs_map_buffer(inode, bh, imap, offset);

594

xfs_map_buffer(inode, bh, imap, offset);

595

bh->b_bdev = xfs_find_bdev_for_inode(inode);

595

bh->b_bdev = xfs_find_bdev_for_inode(inode);

596

set_buffer_mapped(bh);

596

set_buffer_mapped(bh);

597

clear_buffer_delay(bh);

597

clear_buffer_delay(bh);

598

clear_buffer_unwritten(bh);

598

clear_buffer_unwritten(bh);

599

}

599

}

600

601

/*

601

/*

602

* Look for a page at index that is suitable for clustering.

602

* Look for a page at index that is suitable for clustering.

603

*/

603

*/

604

STATIC unsigned int

604

STATIC unsigned int

605

xfs_probe_page(

605

xfs_probe_page(

606

struct page *page,

606

struct page *page,

607

unsigned int pg_offset)

607

unsigned int pg_offset)

608

{

608

{

609

struct buffer_head *bh, *head;

609

struct buffer_head *bh, *head;

610

int ret = 0;

610

int ret = 0;

611

612

if (PageWriteback(page))

612

if (PageWriteback(page))

613

return 0;

613

return 0;

614

if (!PageDirty(page))

614

if (!PageDirty(page))

615

return 0;

615

return 0;

616

if (!page->mapping)

616

if (!page->mapping)

617

return 0;

617

return 0;

618

if (!page_has_buffers(page))

618

if (!page_has_buffers(page))

619

return 0;

619

return 0;

620

621

bh = head = page_buffers(page);

621

bh = head = page_buffers(page);

622

do {

622

do {

623

if (!buffer_uptodate(bh))

623

if (!buffer_uptodate(bh))

624

break;

624

break;

625

if (!buffer_mapped(bh))

625

if (!buffer_mapped(bh))

626

break;

626

break;

627

ret += bh->b_size;

627

ret += bh->b_size;

628

if (ret >= pg_offset)

628

if (ret >= pg_offset)

629

break;

629

break;

630

} while ((bh = bh->b_this_page) != head);

630

} while ((bh = bh->b_this_page) != head);

631

632

return ret;

632

return ret;

633

}

633

}

634

635

STATIC size_t

635

STATIC size_t

636

xfs_probe_cluster(

636

xfs_probe_cluster(

637

struct inode *inode,

637

struct inode *inode,

638

struct page *startpage,

638

struct page *startpage,

639

struct buffer_head *bh,

639

struct buffer_head *bh,

640

struct buffer_head *head)

640

struct buffer_head *head)

641

{

641

{

642

struct pagevec pvec;

642

struct pagevec pvec;

643

pgoff_t tindex, tlast, tloff;

643

pgoff_t tindex, tlast, tloff;

644

size_t total = 0;

644

size_t total = 0;

645

int done = 0, i;

645

int done = 0, i;

646

647

/* First sum forwards in this page */

647

/* First sum forwards in this page */

648

do {

648

do {

649

if (!buffer_uptodate(bh) || !buffer_mapped(bh))

649

if (!buffer_uptodate(bh) || !buffer_mapped(bh))

650

return total;

650

return total;

651

total += bh->b_size;

651

total += bh->b_size;

652

} while ((bh = bh->b_this_page) != head);

652

} while ((bh = bh->b_this_page) != head);

653

654

/* if we reached the end of the page, sum forwards in following pages */

654

/* if we reached the end of the page, sum forwards in following pages */

655

tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;

655

tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;

656

tindex = startpage->index + 1;

656

tindex = startpage->index + 1;

657

658

/* Prune this back to avoid pathological behavior */

658

/* Prune this back to avoid pathological behavior */

659

tloff = min(tlast, startpage->index + 64);

659

tloff = min(tlast, startpage->index + 64);

660

661

pagevec_init(&pvec, 0);

661

pagevec_init(&pvec, 0);

662

while (!done && tindex <= tloff) {

662

while (!done && tindex <= tloff) {

663

unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);

663

unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);

664

665

if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))

665

if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))

666

break;

666

break;

667

668

for (i = 0; i < pagevec_count(&pvec); i++) {

668

for (i = 0; i < pagevec_count(&pvec); i++) {

669

struct page *page = pvec.pages[i];

669

struct page *page = pvec.pages[i];

670

size_t pg_offset, pg_len = 0;

670

size_t pg_offset, pg_len = 0;

671

672

if (tindex == tlast) {

672

if (tindex == tlast) {

673

pg_offset =

673

pg_offset =

674

i_size_read(inode) & (PAGE_CACHE_SIZE - 1);

674

i_size_read(inode) & (PAGE_CACHE_SIZE - 1);

675

if (!pg_offset) {

675

if (!pg_offset) {

676

done = 1;

676

done = 1;

677

break;

677

break;

678

}

678

}

679

} else

679

} else

680

pg_offset = PAGE_CACHE_SIZE;

680

pg_offset = PAGE_CACHE_SIZE;

681

682

if (page->index == tindex && trylock_page(page)) {

682

if (page->index == tindex && trylock_page(page)) {

683

pg_len = xfs_probe_page(page, pg_offset);

683

pg_len = xfs_probe_page(page, pg_offset);

684

unlock_page(page);

684

unlock_page(page);

685

}

685

}

686

687

if (!pg_len) {

687

if (!pg_len) {

688

done = 1;

688

done = 1;

689

break;

689

break;

690

}

690

}

691

692

total += pg_len;

692

total += pg_len;

693

tindex++;

693

tindex++;

694

}

694

}

695

696

pagevec_release(&pvec);

696

pagevec_release(&pvec);

697

cond_resched();

697

cond_resched();

698

}

698

}

699

700

return total;

700

return total;

701

}

701

}

702

703

/*

703

/*

704

* Test if a given page is suitable for writing as part of an unwritten

704

* Test if a given page is suitable for writing as part of an unwritten

705

* or delayed allocate extent.

705

* or delayed allocate extent.

706

*/

706

*/

707

STATIC int

707

STATIC int

708

xfs_is_delayed_page(

708

xfs_is_delayed_page(

709

struct page *page,

709

struct page *page,

710

unsigned int type)

710

unsigned int type)

711

{

711

{

712

if (PageWriteback(page))

712

if (PageWriteback(page))

713

return 0;

713

return 0;

714

715

if (page->mapping && page_has_buffers(page)) {

715

if (page->mapping && page_has_buffers(page)) {

716

struct buffer_head *bh, *head;

716

struct buffer_head *bh, *head;

717

int acceptable = 0;

717

int acceptable = 0;

718

719

bh = head = page_buffers(page);

719

bh = head = page_buffers(page);

720

do {

720

do {

721

if (buffer_unwritten(bh))

721

if (buffer_unwritten(bh))

722

acceptable = (type == IO_UNWRITTEN);

722

acceptable = (type == IO_UNWRITTEN);

723

else if (buffer_delay(bh))

723

else if (buffer_delay(bh))

724

acceptable = (type == IO_DELAY);

724

acceptable = (type == IO_DELAY);

725

else if (buffer_dirty(bh) && buffer_mapped(bh))

725

else if (buffer_dirty(bh) && buffer_mapped(bh))

726

acceptable = (type == IO_NEW);

726

acceptable = (type == IO_NEW);

727

else

727

else

728

break;

728

break;

729

} while ((bh = bh->b_this_page) != head);

729

} while ((bh = bh->b_this_page) != head);

730

731

if (acceptable)

731

if (acceptable)

732

return 1;

732

return 1;

733

}

733

}

734

735

return 0;

735

return 0;

736

}

736

}

737

738

/*

738

/*

739

* Allocate & map buffers for page given the extent map. Write it out.

739

* Allocate & map buffers for page given the extent map. Write it out.

740

* except for the original page of a writepage, this is called on

740

* except for the original page of a writepage, this is called on

741

* delalloc/unwritten pages only, for the original page it is possible

741

* delalloc/unwritten pages only, for the original page it is possible

742

* that the page has no mapping at all.

742

* that the page has no mapping at all.

743

*/

743

*/

744

STATIC int

744

STATIC int

745

xfs_convert_page(

745

xfs_convert_page(

746

struct inode *inode,

746

struct inode *inode,

747

struct page *page,

747

struct page *page,

748

loff_t tindex,

748

loff_t tindex,

749

struct xfs_bmbt_irec *imap,

749

struct xfs_bmbt_irec *imap,

750

xfs_ioend_t **ioendp,

750

xfs_ioend_t **ioendp,

751

struct writeback_control *wbc,

751

struct writeback_control *wbc,

752

int all_bh)

752

int all_bh)

753

{

753

{

754

struct buffer_head *bh, *head;

754

struct buffer_head *bh, *head;

755

xfs_off_t end_offset;

755

xfs_off_t end_offset;

756

unsigned long p_offset;

756

unsigned long p_offset;

757

unsigned int type;

757

unsigned int type;

758

int len, page_dirty;

758

int len, page_dirty;

759

int count = 0, done = 0, uptodate = 1;

759

int count = 0, done = 0, uptodate = 1;

760

xfs_off_t offset = page_offset(page);

760

xfs_off_t offset = page_offset(page);

761

762

if (page->index != tindex)

762

if (page->index != tindex)

763

goto fail;

763

goto fail;

764

if (!trylock_page(page))

764

if (!trylock_page(page))

765

goto fail;

765

goto fail;

766

if (PageWriteback(page))

766

if (PageWriteback(page))

767

goto fail_unlock_page;

767

goto fail_unlock_page;

768

if (page->mapping != inode->i_mapping)

768

if (page->mapping != inode->i_mapping)

769

goto fail_unlock_page;

769

goto fail_unlock_page;

770

if (!xfs_is_delayed_page(page, (*ioendp)->io_type))

770

if (!xfs_is_delayed_page(page, (*ioendp)->io_type))

771

goto fail_unlock_page;

771

goto fail_unlock_page;

772

773

/*

773

/*

774

* page_dirty is initially a count of buffers on the page before

774

* page_dirty is initially a count of buffers on the page before

775

* EOF and is decremented as we move each into a cleanable state.

775

* EOF and is decremented as we move each into a cleanable state.

776

*

776

*

777

* Derivation:

777

* Derivation:

778

*

778

*

779

* End offset is the highest offset that this page should represent.

779

* End offset is the highest offset that this page should represent.

780

* If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))

780

* If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))

781

* will evaluate non-zero and be less than PAGE_CACHE_SIZE and

781

* will evaluate non-zero and be less than PAGE_CACHE_SIZE and

782

* hence give us the correct page_dirty count. On any other page,

782

* hence give us the correct page_dirty count. On any other page,

783

* it will be zero and in that case we need page_dirty to be the

783

* it will be zero and in that case we need page_dirty to be the

784

* count of buffers on the page.

784

* count of buffers on the page.

785

*/

785

*/

786

end_offset = min_t(unsigned long long,

786

end_offset = min_t(unsigned long long,

787

(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,

787

(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,

788

i_size_read(inode));

788

i_size_read(inode));

789

790

len = 1 << inode->i_blkbits;

790

len = 1 << inode->i_blkbits;

791

p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),

791

p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),

792

PAGE_CACHE_SIZE);

792

PAGE_CACHE_SIZE);

793

p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;

793

p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;

794

page_dirty = p_offset / len;

794

page_dirty = p_offset / len;

795

796

bh = head = page_buffers(page);

796

bh = head = page_buffers(page);

797

do {

797

do {

798

if (offset >= end_offset)

798

if (offset >= end_offset)

799

break;

799

break;

800

if (!buffer_uptodate(bh))

800

if (!buffer_uptodate(bh))

801

uptodate = 0;

801

uptodate = 0;

802

if (!(PageUptodate(page) || buffer_uptodate(bh))) {

802

if (!(PageUptodate(page) || buffer_uptodate(bh))) {

803

done = 1;

803

done = 1;

804

continue;

804

continue;

805

}

805

}

806

807

if (buffer_unwritten(bh) || buffer_delay(bh)) {

807

if (buffer_unwritten(bh) || buffer_delay(bh)) {

808

if (buffer_unwritten(bh))

808

if (buffer_unwritten(bh))

809

type = IO_UNWRITTEN;

809

type = IO_UNWRITTEN;

810

else

810

else

811

type = IO_DELAY;

811

type = IO_DELAY;

812

813

if (!xfs_imap_valid(inode, imap, offset)) {

813

if (!xfs_imap_valid(inode, imap, offset)) {

814

done = 1;

814

done = 1;

815

continue;

815

continue;

816

}

816

}

817

818

ASSERT(imap->br_startblock != HOLESTARTBLOCK);

818

ASSERT(imap->br_startblock != HOLESTARTBLOCK);

819

ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

819

ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

820

821

xfs_map_at_offset(inode, bh, imap, offset);

821

xfs_map_at_offset(inode, bh, imap, offset);

822

xfs_add_to_ioend(inode, bh, offset, type,

822

xfs_add_to_ioend(inode, bh, offset, type,

823

ioendp, done);

823

ioendp, done);

824

825

page_dirty--;

825

page_dirty--;

826

count++;

826

count++;

827

} else {

827

} else {

828

type = IO_NEW;

828

type = IO_NEW;

829

if (buffer_mapped(bh) && all_bh) {

829

if (buffer_mapped(bh) && all_bh) {

830

lock_buffer(bh);

830

lock_buffer(bh);

831

xfs_add_to_ioend(inode, bh, offset,

831

xfs_add_to_ioend(inode, bh, offset,

832

type, ioendp, done);

832

type, ioendp, done);

833

count++;

833

count++;

834

page_dirty--;

834

page_dirty--;

835

} else {

835

} else {

836

done = 1;

836

done = 1;

837

}

837

}

838

}

838

}

839

} while (offset += len, (bh = bh->b_this_page) != head);

839

} while (offset += len, (bh = bh->b_this_page) != head);

840

841

if (uptodate && bh == head)

841

if (uptodate && bh == head)

842

SetPageUptodate(page);

842

SetPageUptodate(page);

843

844

if (count) {

844

if (count) {

845

wbc->nr_to_write--;

845

wbc->nr_to_write--;

846

if (wbc->nr_to_write <= 0)

846

if (wbc->nr_to_write <= 0)

847

done = 1;

847

done = 1;

848

}

848

}

849

xfs_start_page_writeback(page, !page_dirty, count);

849

xfs_start_page_writeback(page, !page_dirty, count);

850

851

return done;

851

return done;

852

fail_unlock_page:

852

fail_unlock_page:

853

unlock_page(page);

853

unlock_page(page);

854

fail:

854

fail:

855

return 1;

855

return 1;

856

}

856

}

857

858

/*

858

/*

859

* Convert & write out a cluster of pages in the same extent as defined

859

* Convert & write out a cluster of pages in the same extent as defined

860

* by mp and following the start page.

860

* by mp and following the start page.

861

*/

861

*/

862

STATIC void

862

STATIC void

863

xfs_cluster_write(

863

xfs_cluster_write(

864

struct inode *inode,

864

struct inode *inode,

865

pgoff_t tindex,

865

pgoff_t tindex,

866

struct xfs_bmbt_irec *imap,

866

struct xfs_bmbt_irec *imap,

867

xfs_ioend_t **ioendp,

867

xfs_ioend_t **ioendp,

868

struct writeback_control *wbc,

868

struct writeback_control *wbc,

869

int all_bh,

869

int all_bh,

870

pgoff_t tlast)

870

pgoff_t tlast)

871

{

871

{

872

struct pagevec pvec;

872

struct pagevec pvec;

873

int done = 0, i;

873

int done = 0, i;

874

875

pagevec_init(&pvec, 0);

875

pagevec_init(&pvec, 0);

876

while (!done && tindex <= tlast) {

876

while (!done && tindex <= tlast) {

877

unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);

877

unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);

878

879

if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))

879

if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))

880

break;

880

break;

881

882

for (i = 0; i < pagevec_count(&pvec); i++) {

882

for (i = 0; i < pagevec_count(&pvec); i++) {

883

done = xfs_convert_page(inode, pvec.pages[i], tindex++,

883

done = xfs_convert_page(inode, pvec.pages[i], tindex++,

884

imap, ioendp, wbc, all_bh);

884

imap, ioendp, wbc, all_bh);

885

if (done)

885

if (done)

886

break;

886

break;

887

}

887

}

888

889

pagevec_release(&pvec);

889

pagevec_release(&pvec);

890

cond_resched();

890

cond_resched();

891

}

891

}

892

}

892

}

893

894

STATIC void

894

STATIC void

895

xfs_vm_invalidatepage(

895

xfs_vm_invalidatepage(

896

struct page *page,

896

struct page *page,

897

unsigned long offset)

897

unsigned long offset)

898

{

898

{

899

trace_xfs_invalidatepage(page->mapping->host, page, offset);

899

trace_xfs_invalidatepage(page->mapping->host, page, offset);

900

block_invalidatepage(page, offset);

900

block_invalidatepage(page, offset);

901

}

901

}

902

903

/*

903

/*

904

* If the page has delalloc buffers on it, we need to punch them out before we

904

* If the page has delalloc buffers on it, we need to punch them out before we

905

* invalidate the page. If we don't, we leave a stale delalloc mapping on the

905

* invalidate the page. If we don't, we leave a stale delalloc mapping on the

906

* inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read

906

* inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read

907

* is done on that same region - the delalloc extent is returned when none is

907

* is done on that same region - the delalloc extent is returned when none is

908

* supposed to be there.

908

* supposed to be there.

909

*

909

*

910

* We prevent this by truncating away the delalloc regions on the page before

910

* We prevent this by truncating away the delalloc regions on the page before

911

* invalidating it. Because they are delalloc, we can do this without needing a

911

* invalidating it. Because they are delalloc, we can do this without needing a

912

* transaction. Indeed - if we get ENOSPC errors, we have to be able to do this

912

* transaction. Indeed - if we get ENOSPC errors, we have to be able to do this

913

* truncation without a transaction as there is no space left for block

913

* truncation without a transaction as there is no space left for block

914

* reservation (typically why we see a ENOSPC in writeback).

914

* reservation (typically why we see a ENOSPC in writeback).

915

*

915

*

916

* This is not a performance critical path, so for now just do the punching a

916

* This is not a performance critical path, so for now just do the punching a

917

* buffer head at a time.

917

* buffer head at a time.

918

*/

918

*/

919

STATIC void

919

STATIC void

920

xfs_aops_discard_page(

920

xfs_aops_discard_page(

921

struct page *page)

921

struct page *page)

922

{

922

{

923

struct inode *inode = page->mapping->host;

923

struct inode *inode = page->mapping->host;

924

struct xfs_inode *ip = XFS_I(inode);

924

struct xfs_inode *ip = XFS_I(inode);

925

struct buffer_head *bh, *head;

925

struct buffer_head *bh, *head;

926

loff_t offset = page_offset(page);

926

loff_t offset = page_offset(page);

927

ssize_t len = 1 << inode->i_blkbits;

927

ssize_t len = 1 << inode->i_blkbits;

928

929

if (!xfs_is_delayed_page(page, IO_DELAY))

929

if (!xfs_is_delayed_page(page, IO_DELAY))

930

goto out_invalidate;

930

goto out_invalidate;

931

932

if (XFS_FORCED_SHUTDOWN(ip->i_mount))

932

if (XFS_FORCED_SHUTDOWN(ip->i_mount))

933

goto out_invalidate;

933

goto out_invalidate;

934

935

xfs_fs_cmn_err(CE_ALERT, ip->i_mount,

935

xfs_fs_cmn_err(CE_ALERT, ip->i_mount,

936

"page discard on page %p, inode 0x%llx, offset %llu.",

936

"page discard on page %p, inode 0x%llx, offset %llu.",

937

page, ip->i_ino, offset);

937

page, ip->i_ino, offset);

938

939

xfs_ilock(ip, XFS_ILOCK_EXCL);

939

xfs_ilock(ip, XFS_ILOCK_EXCL);

940

bh = head = page_buffers(page);

940

bh = head = page_buffers(page);

941

do {

941

do {

942

int done;

942

int done;

943

xfs_fileoff_t offset_fsb;

943

xfs_fileoff_t offset_fsb;

944

xfs_bmbt_irec_t imap;

944

xfs_bmbt_irec_t imap;

945

int nimaps = 1;

945

int nimaps = 1;

946

int error;

946

int error;

947

xfs_fsblock_t firstblock;

947

xfs_fsblock_t firstblock;

948

xfs_bmap_free_t flist;

948

xfs_bmap_free_t flist;

949

950

if (!buffer_delay(bh))

950

if (!buffer_delay(bh))

951

goto next_buffer;

951

goto next_buffer;

952

953

offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);

953

offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);

954

955

/*

955

/*

956

* Map the range first and check that it is a delalloc extent

956

* Map the range first and check that it is a delalloc extent

957

* before trying to unmap the range. Otherwise we will be

957

* before trying to unmap the range. Otherwise we will be

958

* trying to remove a real extent (which requires a

958

* trying to remove a real extent (which requires a

959

* transaction) or a hole, which is probably a bad idea...

959

* transaction) or a hole, which is probably a bad idea...

960

*/

960

*/

961

error = xfs_bmapi(NULL, ip, offset_fsb, 1,

961

error = xfs_bmapi(NULL, ip, offset_fsb, 1,

962

XFS_BMAPI_ENTIRE, NULL, 0, &imap,

962

XFS_BMAPI_ENTIRE, NULL, 0, &imap,

963

&nimaps, NULL);

963

&nimaps, NULL);

964

965

if (error) {

965

if (error) {

966

/* something screwed, just bail */

966

/* something screwed, just bail */

967

if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {

967

if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {

968

xfs_fs_cmn_err(CE_ALERT, ip->i_mount,

968

xfs_fs_cmn_err(CE_ALERT, ip->i_mount,

969

"page discard failed delalloc mapping lookup.");

969

"page discard failed delalloc mapping lookup.");

970

}

970

}

971

break;

971

break;

972

}

972

}

973

if (!nimaps) {

973

if (!nimaps) {

974

/* nothing there */

974

/* nothing there */

975

goto next_buffer;

975

goto next_buffer;

976

}

976

}

977

if (imap.br_startblock != DELAYSTARTBLOCK) {

977

if (imap.br_startblock != DELAYSTARTBLOCK) {

978

/* been converted, ignore */

978

/* been converted, ignore */

979

goto next_buffer;

979

goto next_buffer;

980

}

980

}

981

WARN_ON(imap.br_blockcount == 0);

981

WARN_ON(imap.br_blockcount == 0);

982

983

/*

983

/*

984

* Note: while we initialise the firstblock/flist pair, they

984

* Note: while we initialise the firstblock/flist pair, they

985

* should never be used because blocks should never be

985

* should never be used because blocks should never be

986

* allocated or freed for a delalloc extent and hence we need

986

* allocated or freed for a delalloc extent and hence we need

987

* don't cancel or finish them after the xfs_bunmapi() call.

987

* don't cancel or finish them after the xfs_bunmapi() call.

988

*/

988

*/

989

xfs_bmap_init(&flist, &firstblock);

989

xfs_bmap_init(&flist, &firstblock);

990

error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,

990

error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,

991

&flist, &done);

991

&flist, &done);

992

993

ASSERT(!flist.xbf_count && !flist.xbf_first);

993

ASSERT(!flist.xbf_count && !flist.xbf_first);

994

if (error) {

994

if (error) {

995

/* something screwed, just bail */

995

/* something screwed, just bail */

996

if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {

996

if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {

997

xfs_fs_cmn_err(CE_ALERT, ip->i_mount,

997

xfs_fs_cmn_err(CE_ALERT, ip->i_mount,

998

"page discard unable to remove delalloc mapping.");

998

"page discard unable to remove delalloc mapping.");

999

}

999

}

1000

break;

1000

break;

1001

}

1001

}

1002

next_buffer:

1002

next_buffer:

1003

offset += len;

1003

offset += len;

1004

1005

} while ((bh = bh->b_this_page) != head);

1005

} while ((bh = bh->b_this_page) != head);

1006

1007

xfs_iunlock(ip, XFS_ILOCK_EXCL);

1007

xfs_iunlock(ip, XFS_ILOCK_EXCL);

1008

out_invalidate:

1008

out_invalidate:

1009

xfs_vm_invalidatepage(page, 0);

1009

xfs_vm_invalidatepage(page, 0);

1010

return;

1010

return;

1011

}

1011

}

1012

1013

/*

1013

/*

1014

* Write out a dirty page.

1014

* Write out a dirty page.

1015

*

1015

*

1016

* For delalloc space on the page we need to allocate space and flush it.

1016

* For delalloc space on the page we need to allocate space and flush it.

1017

* For unwritten space on the page we need to start the conversion to

1017

* For unwritten space on the page we need to start the conversion to

1018

* regular allocated space.

1018

* regular allocated space.

1019

* For any other dirty buffer heads on the page we should flush them.

1019

* For any other dirty buffer heads on the page we should flush them.

1020

*

1020

*

1021

* If we detect that a transaction would be required to flush the page, we

1021

* If we detect that a transaction would be required to flush the page, we

1022

* have to check the process flags first, if we are already in a transaction

1022

* have to check the process flags first, if we are already in a transaction

1023

* or disk I/O during allocations is off, we need to fail the writepage and

1023

* or disk I/O during allocations is off, we need to fail the writepage and

1024

* redirty the page.

1024

* redirty the page.

1025

*/

1025

*/

1026

STATIC int

1026

STATIC int

1027

xfs_vm_writepage(

1027

xfs_vm_writepage(

1028

struct page *page,

1028

struct page *page,

1029

struct writeback_control *wbc)

1029

struct writeback_control *wbc)

1030

{

1030

{

1031

struct inode *inode = page->mapping->host;

1031

struct inode *inode = page->mapping->host;

1032

int delalloc, unwritten;

1032

int delalloc, unwritten;

1033

struct buffer_head *bh, *head;

1033

struct buffer_head *bh, *head;

1034

struct xfs_bmbt_irec imap;

1034

struct xfs_bmbt_irec imap;

1035

xfs_ioend_t *ioend = NULL, *iohead = NULL;

1035

xfs_ioend_t *ioend = NULL, *iohead = NULL;

1036

loff_t offset;

1036

loff_t offset;

1037

unsigned int type;

1037

unsigned int type;

1038

__uint64_t end_offset;

1038

__uint64_t end_offset;

1039

pgoff_t end_index, last_index;

1039

pgoff_t end_index, last_index;

1040

ssize_t size, len;

1040

ssize_t size, len;

1041

int flags, err, imap_valid = 0, uptodate = 1;

1041

int flags, err, imap_valid = 0, uptodate = 1;

1042

int count = 0;

1042

int count = 0;

1043

int all_bh = 0;

1043

int all_bh = 0;

1044

1045

trace_xfs_writepage(inode, page, 0);

1045

trace_xfs_writepage(inode, page, 0);

1046

1047

ASSERT(page_has_buffers(page));

1047

ASSERT(page_has_buffers(page));

1048

1049

/*

1049

/*

1050

* Refuse to write the page out if we are called from reclaim context.

1050

* Refuse to write the page out if we are called from reclaim context.

1051

*

1051

*

1052

* This avoids stack overflows when called from deeply used stacks in

1052

* This avoids stack overflows when called from deeply used stacks in

1053

* random callers for direct reclaim or memcg reclaim. We explicitly

1053

* random callers for direct reclaim or memcg reclaim. We explicitly

1054

* allow reclaim from kswapd as the stack usage there is relatively low.

1054

* allow reclaim from kswapd as the stack usage there is relatively low.

1055

*

1055

*

1056

* This should really be done by the core VM, but until that happens

1056

* This should really be done by the core VM, but until that happens

1057

* filesystems like XFS, btrfs and ext4 have to take care of this

1057

* filesystems like XFS, btrfs and ext4 have to take care of this

1058

* by themselves.

1058

* by themselves.

1059

*/

1059

*/

1060

if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)

1060

if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)

1061

goto out_fail;

1061

goto out_fail;

1062

1063

/*

1063

/*

1064

* We need a transaction if there are delalloc or unwritten buffers

1064

* We need a transaction if there are delalloc or unwritten buffers

1065

* on the page.

1065

* on the page.

1066

*

1066

*

1067

* If we need a transaction and the process flags say we are already

1067

* If we need a transaction and the process flags say we are already

1068

* in a transaction, or no IO is allowed then mark the page dirty

1068

* in a transaction, or no IO is allowed then mark the page dirty

1069

* again and leave the page as is.

1069

* again and leave the page as is.

1070

*/

1070

*/

1071

xfs_count_page_state(page, &delalloc, &unwritten);

1071

xfs_count_page_state(page, &delalloc, &unwritten);

1072

if ((current->flags & PF_FSTRANS) && (delalloc || unwritten))

1072

if ((current->flags & PF_FSTRANS) && (delalloc || unwritten))

1073

goto out_fail;

1073

goto out_fail;

1074

1075

/*

1076

* Delay hooking up buffer heads until we have

1077

* made our go/no-go decision.

1078

*/

1079

if (!page_has_buffers(page))

1080

create_empty_buffers(page, 1 << inode->i_blkbits, 0);

1081

1082

/* Is this page beyond the end of the file? */

1075

/* Is this page beyond the end of the file? */

1083

offset = i_size_read(inode);

1076

offset = i_size_read(inode);

1084

end_index = offset >> PAGE_CACHE_SHIFT;

1077

end_index = offset >> PAGE_CACHE_SHIFT;

1085

last_index = (offset - 1) >> PAGE_CACHE_SHIFT;

1078

last_index = (offset - 1) >> PAGE_CACHE_SHIFT;

1086

if (page->index >= end_index) {

1079

if (page->index >= end_index) {

1087

if ((page->index >= end_index + 1) ||

1080

if ((page->index >= end_index + 1) ||

1088

!(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {

1081

!(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {

1089

unlock_page(page);

1082

unlock_page(page);

1090

return 0;

1083

return 0;

1091

}

1084

}

1092

}

1085

}

1093

1086

1094

end_offset = min_t(unsigned long long,

1087

end_offset = min_t(unsigned long long,

1095

(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,

1088

(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,

1096

offset);

1089

offset);

1097

len = 1 << inode->i_blkbits;

1090

len = 1 << inode->i_blkbits;

1098

1091

1099

bh = head = page_buffers(page);

1092

bh = head = page_buffers(page);

1100

offset = page_offset(page);

1093

offset = page_offset(page);

1101

flags = BMAPI_READ;

1094

flags = BMAPI_READ;

1102

type = IO_NEW;

1095

type = IO_NEW;

1103

1096

1104

do {

1097

do {

1105

if (offset >= end_offset)

1098

if (offset >= end_offset)

1106

break;

1099

break;

1107

if (!buffer_uptodate(bh))

1100

if (!buffer_uptodate(bh))

1108

uptodate = 0;

1101

uptodate = 0;

1109

1102

1110

/*

1103

/*

1111

* A hole may still be marked uptodate because discard_buffer

1104

* A hole may still be marked uptodate because discard_buffer

1112

* leaves the flag set.

1105

* leaves the flag set.

1113

*/

1106

*/

1114

if (!buffer_mapped(bh) && buffer_uptodate(bh)) {

1107

if (!buffer_mapped(bh) && buffer_uptodate(bh)) {

1115

ASSERT(!buffer_dirty(bh));

1108

ASSERT(!buffer_dirty(bh));

1116

imap_valid = 0;

1109

imap_valid = 0;

1117

continue;

1110

continue;

1118

}

1111

}

1119

1112

1120

if (imap_valid)

1113

if (imap_valid)

1121

imap_valid = xfs_imap_valid(inode, &imap, offset);

1114

imap_valid = xfs_imap_valid(inode, &imap, offset);

1122

1115

1123

if (buffer_unwritten(bh) || buffer_delay(bh)) {

1116

if (buffer_unwritten(bh) || buffer_delay(bh)) {

1124

int new_ioend = 0;

1117

int new_ioend = 0;

1125

1118

1126

/*

1119

/*

1127

* Make sure we don't use a read-only iomap

1120

* Make sure we don't use a read-only iomap

1128

*/

1121

*/

1129

if (flags == BMAPI_READ)

1122

if (flags == BMAPI_READ)

1130

imap_valid = 0;

1123

imap_valid = 0;

1131

1124

1132

if (buffer_unwritten(bh)) {

1125

if (buffer_unwritten(bh)) {

1133

type = IO_UNWRITTEN;

1126

type = IO_UNWRITTEN;

1134

flags = BMAPI_WRITE | BMAPI_IGNSTATE;

1127

flags = BMAPI_WRITE | BMAPI_IGNSTATE;

1135

} else if (buffer_delay(bh)) {

1128

} else if (buffer_delay(bh)) {

1136

type = IO_DELAY;

1129

type = IO_DELAY;

1137

flags = BMAPI_ALLOCATE;

1130

flags = BMAPI_ALLOCATE;

1138

1131

1139

if (wbc->sync_mode == WB_SYNC_NONE &&

1132

if (wbc->sync_mode == WB_SYNC_NONE &&

1140

wbc->nonblocking)

1133

wbc->nonblocking)

1141

flags |= BMAPI_TRYLOCK;

1134

flags |= BMAPI_TRYLOCK;

1142

}

1135

}

1143

1136

1144

if (!imap_valid) {

1137

if (!imap_valid) {

1145

/*

1138

/*

1146

* If we didn't have a valid mapping then we

1139

* If we didn't have a valid mapping then we

1147

* need to ensure that we put the new mapping

1140

* need to ensure that we put the new mapping

1148

* in a new ioend structure. This needs to be

1141

* in a new ioend structure. This needs to be

1149

* done to ensure that the ioends correctly

1142

* done to ensure that the ioends correctly

1150

* reflect the block mappings at io completion

1143

* reflect the block mappings at io completion

1151

* for unwritten extent conversion.

1144

* for unwritten extent conversion.

1152

*/

1145

*/

1153

new_ioend = 1;

1146

new_ioend = 1;

1154

err = xfs_map_blocks(inode, offset, len,

1147

err = xfs_map_blocks(inode, offset, len,

1155

&imap, flags);

1148

&imap, flags);

1156

if (err)

1149

if (err)

1157

goto error;

1150

goto error;

1158

imap_valid = xfs_imap_valid(inode, &imap,

1151

imap_valid = xfs_imap_valid(inode, &imap,

1159

offset);

1152

offset);

1160

}

1153

}

1161

if (imap_valid) {

1154

if (imap_valid) {

1162

xfs_map_at_offset(inode, bh, &imap, offset);

1155

xfs_map_at_offset(inode, bh, &imap, offset);

1163

xfs_add_to_ioend(inode, bh, offset, type,

1156

xfs_add_to_ioend(inode, bh, offset, type,

1164

&ioend, new_ioend);

1157

&ioend, new_ioend);

1165

count++;

1158

count++;

1166

}

1159

}

1167

} else if (buffer_uptodate(bh)) {

1160

} else if (buffer_uptodate(bh)) {

1168

/*

1161

/*

1169

* we got here because the buffer is already mapped.

1162

* we got here because the buffer is already mapped.

1170

* That means it must already have extents allocated

1163

* That means it must already have extents allocated

1171

* underneath it. Map the extent by reading it.

1164

* underneath it. Map the extent by reading it.

1172

*/

1165

*/

1173

if (!imap_valid || flags != BMAPI_READ) {

1166

if (!imap_valid || flags != BMAPI_READ) {

1174

flags = BMAPI_READ;

1167

flags = BMAPI_READ;

1175

size = xfs_probe_cluster(inode, page, bh, head);

1168

size = xfs_probe_cluster(inode, page, bh, head);

1176

err = xfs_map_blocks(inode, offset, size,

1169

err = xfs_map_blocks(inode, offset, size,

1177

&imap, flags);

1170

&imap, flags);

1178

if (err)

1171

if (err)

1179

goto error;

1172

goto error;

1180

imap_valid = xfs_imap_valid(inode, &imap,

1173

imap_valid = xfs_imap_valid(inode, &imap,

1181

offset);

1174

offset);

1182

}

1175

}

1183

1176

1184

/*

1177

/*

1185

* We set the type to IO_NEW in case we are doing a

1178

* We set the type to IO_NEW in case we are doing a

1186

* small write at EOF that is extending the file but

1179

* small write at EOF that is extending the file but

1187

* without needing an allocation. We need to update the

1180

* without needing an allocation. We need to update the

1188

* file size on I/O completion in this case so it is

1181

* file size on I/O completion in this case so it is

1189

* the same case as having just allocated a new extent

1182

* the same case as having just allocated a new extent

1190

* that we are writing into for the first time.

1183

* that we are writing into for the first time.

1191

*/

1184

*/

1192

type = IO_NEW;

1185

type = IO_NEW;

1193

if (trylock_buffer(bh)) {

1186

if (trylock_buffer(bh)) {

1194

if (imap_valid)

1187

if (imap_valid)

1195

all_bh = 1;

1188

all_bh = 1;

1196

xfs_add_to_ioend(inode, bh, offset, type,

1189

xfs_add_to_ioend(inode, bh, offset, type,

1197

&ioend, !imap_valid);

1190

&ioend, !imap_valid);

1198

count++;

1191

count++;

1199

} else {

1192

} else {

1200

imap_valid = 0;

1193

imap_valid = 0;

1201

}

1194

}

1202

} else if (PageUptodate(page)) {

1195

} else if (PageUptodate(page)) {

1203

ASSERT(buffer_mapped(bh));

1196

ASSERT(buffer_mapped(bh));

1204

imap_valid = 0;

1197

imap_valid = 0;

1205

}

1198

}

1206

1199

1207

if (!iohead)

1200

if (!iohead)

1208

iohead = ioend;

1201

iohead = ioend;

1209

1202

1210

} while (offset += len, ((bh = bh->b_this_page) != head));

1203

} while (offset += len, ((bh = bh->b_this_page) != head));

1211

1204

1212

if (uptodate && bh == head)

1205

if (uptodate && bh == head)

1213

SetPageUptodate(page);

1206

SetPageUptodate(page);

1214

1207

1215

xfs_start_page_writeback(page, 1, count);

1208

xfs_start_page_writeback(page, 1, count);

1216

1209

1217

if (ioend && imap_valid) {

1210

if (ioend && imap_valid) {

1218

xfs_off_t end_index;

1211

xfs_off_t end_index;

1219

1212

1220

end_index = imap.br_startoff + imap.br_blockcount;

1213

end_index = imap.br_startoff + imap.br_blockcount;

1221

1214

1222

/* to bytes */

1215

/* to bytes */

1223

end_index <<= inode->i_blkbits;

1216

end_index <<= inode->i_blkbits;

1224

1217

1225

/* to pages */

1218

/* to pages */

1226

end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;

1219

end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;

1227

1220

1228

/* check against file size */

1221

/* check against file size */

1229

if (end_index > last_index)

1222

if (end_index > last_index)

1230

end_index = last_index;

1223

end_index = last_index;

1231

1224

1232

xfs_cluster_write(inode, page->index + 1, &imap, &ioend,

1225

xfs_cluster_write(inode, page->index + 1, &imap, &ioend,

1233

wbc, all_bh, end_index);

1226

wbc, all_bh, end_index);

1234

}

1227

}

1235

1228

1236

if (iohead)

1229

if (iohead)

1237

xfs_submit_ioend(wbc, iohead);

1230

xfs_submit_ioend(wbc, iohead);

1238

1231

1239

return 0;

1232

return 0;

1240

1233

1241

error:

1234

error:

1242

if (iohead)

1235

if (iohead)

1243

xfs_cancel_ioend(iohead);

1236

xfs_cancel_ioend(iohead);

1244

1237

1245

xfs_aops_discard_page(page);

1238

xfs_aops_discard_page(page);

1246

ClearPageUptodate(page);

1239

ClearPageUptodate(page);

1247

unlock_page(page);

1240

unlock_page(page);

1248

return err;

1241

return err;

1249

1242

1250

out_fail:

1243

out_fail:

1251

redirty_page_for_writepage(wbc, page);

1244

redirty_page_for_writepage(wbc, page);

1252

unlock_page(page);

1245

unlock_page(page);

1253

return 0;

1246

return 0;

1254

}

1247

}

1255

1248

1256

STATIC int

1249

STATIC int

1257

xfs_vm_writepages(

1250

xfs_vm_writepages(

1258

struct address_space *mapping,

1251

struct address_space *mapping,

1259

struct writeback_control *wbc)

1252

struct writeback_control *wbc)

1260

{

1253

{

1261

xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);

1254

xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);

1262

return generic_writepages(mapping, wbc);

1255

return generic_writepages(mapping, wbc);

1263

}

1256

}

1264

1257

1265

/*

1258

/*

1266

* Called to move a page into cleanable state - and from there

1259

* Called to move a page into cleanable state - and from there

1267

* to be released. The page should already be clean. We always

1260

* to be released. The page should already be clean. We always

1268

* have buffer heads in this call.

1261

* have buffer heads in this call.

1269

*

1262

*

1270

* Returns 1 if the page is ok to release, 0 otherwise.

1263

* Returns 1 if the page is ok to release, 0 otherwise.

1271

*/

1264

*/

1272

STATIC int

1265

STATIC int

1273

xfs_vm_releasepage(

1266

xfs_vm_releasepage(

1274

struct page *page,

1267

struct page *page,

1275

gfp_t gfp_mask)

1268

gfp_t gfp_mask)

1276

{

1269

{

1277

int delalloc, unwritten;

1270

int delalloc, unwritten;

1278

1271

1279

trace_xfs_releasepage(page->mapping->host, page, 0);

1272

trace_xfs_releasepage(page->mapping->host, page, 0);

1280

1273

1281

xfs_count_page_state(page, &delalloc, &unwritten);

1274

xfs_count_page_state(page, &delalloc, &unwritten);

1282

1275

1283

if (WARN_ON(delalloc))

1276

if (WARN_ON(delalloc))

1284

return 0;

1277

return 0;

1285

if (WARN_ON(unwritten))

1278

if (WARN_ON(unwritten))

1286

return 0;

1279

return 0;

1287

1280

1288

return try_to_free_buffers(page);

1281

return try_to_free_buffers(page);

1289

}

1282

}

1290

1283

1291

STATIC int

1284

STATIC int

1292

__xfs_get_blocks(

1285

__xfs_get_blocks(

1293

struct inode *inode,

1286

struct inode *inode,

1294

sector_t iblock,

1287

sector_t iblock,

1295

struct buffer_head *bh_result,

1288

struct buffer_head *bh_result,

1296

int create,

1289

int create,

1297

int direct)

1290

int direct)

1298

{

1291

{

1299

int flags = create ? BMAPI_WRITE : BMAPI_READ;

1292

int flags = create ? BMAPI_WRITE : BMAPI_READ;

1300

struct xfs_bmbt_irec imap;

1293

struct xfs_bmbt_irec imap;

1301

xfs_off_t offset;

1294

xfs_off_t offset;

1302

ssize_t size;

1295

ssize_t size;

1303

int nimap = 1;

1296

int nimap = 1;

1304

int new = 0;

1297

int new = 0;

1305

int error;

1298

int error;

1306

1299

1307

offset = (xfs_off_t)iblock << inode->i_blkbits;

1300

offset = (xfs_off_t)iblock << inode->i_blkbits;

1308

ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));

1301

ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));

1309

size = bh_result->b_size;

1302

size = bh_result->b_size;

1310

1303

1311

if (!create && direct && offset >= i_size_read(inode))

1304

if (!create && direct && offset >= i_size_read(inode))

1312

return 0;

1305

return 0;

1313

1306

1314

if (direct && create)

1307

if (direct && create)

1315

flags |= BMAPI_DIRECT;

1308

flags |= BMAPI_DIRECT;

1316

1309

1317

error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap,

1310

error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap,

1318

&new);

1311

&new);

1319

if (error)

1312

if (error)

1320

return -error;

1313

return -error;

1321

if (nimap == 0)

1314

if (nimap == 0)

1322

return 0;

1315

return 0;

1323

1316

1324

if (imap.br_startblock != HOLESTARTBLOCK &&

1317

if (imap.br_startblock != HOLESTARTBLOCK &&

1325

imap.br_startblock != DELAYSTARTBLOCK) {

1318

imap.br_startblock != DELAYSTARTBLOCK) {

1326

/*

1319

/*

1327

* For unwritten extents do not report a disk address on

1320

* For unwritten extents do not report a disk address on

1328

* the read case (treat as if we're reading into a hole).

1321

* the read case (treat as if we're reading into a hole).

1329

*/

1322

*/

1330

if (create || !ISUNWRITTEN(&imap))

1323

if (create || !ISUNWRITTEN(&imap))

1331

xfs_map_buffer(inode, bh_result, &imap, offset);

1324

xfs_map_buffer(inode, bh_result, &imap, offset);

1332

if (create && ISUNWRITTEN(&imap)) {

1325

if (create && ISUNWRITTEN(&imap)) {

1333

if (direct)

1326

if (direct)

1334

bh_result->b_private = inode;

1327

bh_result->b_private = inode;

1335

set_buffer_unwritten(bh_result);

1328

set_buffer_unwritten(bh_result);

1336

}

1329

}

1337

}

1330

}

1338

1331

1339

/*

1332

/*

1340

* If this is a realtime file, data may be on a different device.

1333

* If this is a realtime file, data may be on a different device.

1341

* to that pointed to from the buffer_head b_bdev currently.

1334

* to that pointed to from the buffer_head b_bdev currently.

1342

*/

1335

*/

1343

bh_result->b_bdev = xfs_find_bdev_for_inode(inode);

1336

bh_result->b_bdev = xfs_find_bdev_for_inode(inode);

1344

1337

1345

/*

1338

/*

1346

* If we previously allocated a block out beyond eof and we are now

1339

* If we previously allocated a block out beyond eof and we are now

1347

* coming back to use it then we will need to flag it as new even if it

1340

* coming back to use it then we will need to flag it as new even if it

1348

* has a disk address.

1341

* has a disk address.

1349

*

1342

*

1350

* With sub-block writes into unwritten extents we also need to mark

1343

* With sub-block writes into unwritten extents we also need to mark

1351

* the buffer as new so that the unwritten parts of the buffer gets

1344

* the buffer as new so that the unwritten parts of the buffer gets

1352

* correctly zeroed.

1345

* correctly zeroed.

1353

*/

1346

*/

1354

if (create &&

1347

if (create &&

1355

((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||

1348

((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||

1356

(offset >= i_size_read(inode)) ||

1349

(offset >= i_size_read(inode)) ||

1357

(new || ISUNWRITTEN(&imap))))

1350

(new || ISUNWRITTEN(&imap))))

1358

set_buffer_new(bh_result);

1351

set_buffer_new(bh_result);

1359

1352

1360

if (imap.br_startblock == DELAYSTARTBLOCK) {

1353

if (imap.br_startblock == DELAYSTARTBLOCK) {

1361

BUG_ON(direct);

1354

BUG_ON(direct);

1362

if (create) {

1355

if (create) {

1363

set_buffer_uptodate(bh_result);

1356

set_buffer_uptodate(bh_result);

1364

set_buffer_mapped(bh_result);

1357

set_buffer_mapped(bh_result);

1365

set_buffer_delay(bh_result);

1358

set_buffer_delay(bh_result);

1366

}

1359

}

1367

}

1360

}

1368

1361

1369

/*

1362

/*

1370

* If this is O_DIRECT or the mpage code calling tell them how large

1363

* If this is O_DIRECT or the mpage code calling tell them how large

1371

* the mapping is, so that we can avoid repeated get_blocks calls.

1364

* the mapping is, so that we can avoid repeated get_blocks calls.

1372

*/

1365

*/

1373

if (direct || size > (1 << inode->i_blkbits)) {

1366

if (direct || size > (1 << inode->i_blkbits)) {

1374

xfs_off_t mapping_size;

1367

xfs_off_t mapping_size;

1375

1368

1376

mapping_size = imap.br_startoff + imap.br_blockcount - iblock;

1369

mapping_size = imap.br_startoff + imap.br_blockcount - iblock;

1377

mapping_size <<= inode->i_blkbits;

1370

mapping_size <<= inode->i_blkbits;

1378

1371

1379

ASSERT(mapping_size > 0);

1372

ASSERT(mapping_size > 0);

1380

if (mapping_size > size)

1373

if (mapping_size > size)

1381

mapping_size = size;

1374

mapping_size = size;

1382

if (mapping_size > LONG_MAX)

1375

if (mapping_size > LONG_MAX)

1383

mapping_size = LONG_MAX;

1376

mapping_size = LONG_MAX;

1384

1377

1385

bh_result->b_size = mapping_size;

1378

bh_result->b_size = mapping_size;

1386

}

1379

}

1387

1380

1388

return 0;

1381

return 0;

1389

}

1382

}

1390

1383

1391

int

1384

int

1392

xfs_get_blocks(

1385

xfs_get_blocks(

1393

struct inode *inode,

1386

struct inode *inode,

1394

sector_t iblock,

1387

sector_t iblock,

1395

struct buffer_head *bh_result,

1388

struct buffer_head *bh_result,

1396

int create)

1389

int create)

1397

{

1390

{

1398

return __xfs_get_blocks(inode, iblock, bh_result, create, 0);

1391

return __xfs_get_blocks(inode, iblock, bh_result, create, 0);

1399

}

1392

}

1400

1393

1401

STATIC int

1394

STATIC int

1402

xfs_get_blocks_direct(

1395

xfs_get_blocks_direct(

1403

struct inode *inode,

1396

struct inode *inode,

1404

sector_t iblock,

1397

sector_t iblock,

1405

struct buffer_head *bh_result,

1398

struct buffer_head *bh_result,

1406

int create)

1399

int create)

1407

{

1400

{

1408

return __xfs_get_blocks(inode, iblock, bh_result, create, 1);

1401

return __xfs_get_blocks(inode, iblock, bh_result, create, 1);

1409

}

1402

}

1410

1403

1411

STATIC void

1404

STATIC void

1412

xfs_end_io_direct(

1405

xfs_end_io_direct(

1413

struct kiocb *iocb,

1406

struct kiocb *iocb,

1414

loff_t offset,

1407

loff_t offset,

1415

ssize_t size,

1408

ssize_t size,

1416

void *private)

1409

void *private)

1417

{

1410

{

1418

xfs_ioend_t *ioend = iocb->private;

1411

xfs_ioend_t *ioend = iocb->private;

1419

1412

1420

/*

1413

/*

1421

* Non-NULL private data means we need to issue a transaction to

1414

* Non-NULL private data means we need to issue a transaction to

1422

* convert a range from unwritten to written extents. This needs

1415

* convert a range from unwritten to written extents. This needs

1423

* to happen from process context but aio+dio I/O completion

1416

* to happen from process context but aio+dio I/O completion

1424

* happens from irq context so we need to defer it to a workqueue.

1417

* happens from irq context so we need to defer it to a workqueue.

1425

* This is not necessary for synchronous direct I/O, but we do

1418

* This is not necessary for synchronous direct I/O, but we do

1426

* it anyway to keep the code uniform and simpler.

1419

* it anyway to keep the code uniform and simpler.

1427

*

1420

*

1428

* Well, if only it were that simple. Because synchronous direct I/O

1421

* Well, if only it were that simple. Because synchronous direct I/O

1429

* requires extent conversion to occur *before* we return to userspace,

1422

* requires extent conversion to occur *before* we return to userspace,

1430

* we have to wait for extent conversion to complete. Look at the

1423

* we have to wait for extent conversion to complete. Look at the

1431

* iocb that has been passed to us to determine if this is AIO or

1424

* iocb that has been passed to us to determine if this is AIO or

1432

* not. If it is synchronous, tell xfs_finish_ioend() to kick the

1425

* not. If it is synchronous, tell xfs_finish_ioend() to kick the

1433

* workqueue and wait for it to complete.

1426

* workqueue and wait for it to complete.

1434

*

1427

*

1435

* The core direct I/O code might be changed to always call the

1428

* The core direct I/O code might be changed to always call the

1436

* completion handler in the future, in which case all this can

1429

* completion handler in the future, in which case all this can

1437

* go away.

1430

* go away.

1438

*/

1431

*/

1439

ioend->io_offset = offset;

1432

ioend->io_offset = offset;

1440

ioend->io_size = size;

1433

ioend->io_size = size;

1441

if (ioend->io_type == IO_READ) {

1434

if (ioend->io_type == IO_READ) {

1442

xfs_finish_ioend(ioend, 0);

1435

xfs_finish_ioend(ioend, 0);

1443

} else if (private && size > 0) {

1436

} else if (private && size > 0) {

1444

xfs_finish_ioend(ioend, is_sync_kiocb(iocb));

1437

xfs_finish_ioend(ioend, is_sync_kiocb(iocb));

1445

} else {

1438

} else {

1446

/*

1439

/*

1447

* A direct I/O write ioend starts it's life in unwritten

1440

* A direct I/O write ioend starts it's life in unwritten

1448

* state in case they map an unwritten extent. This write

1441

* state in case they map an unwritten extent. This write

1449

* didn't map an unwritten extent so switch it's completion

1442

* didn't map an unwritten extent so switch it's completion

1450

* handler.

1443

* handler.

1451

*/

1444

*/

1452

ioend->io_type = IO_NEW;

1445

ioend->io_type = IO_NEW;

1453

xfs_finish_ioend(ioend, 0);

1446

xfs_finish_ioend(ioend, 0);

1454

}

1447

}

1455

1448

1456

/*

1449

/*

1457

* blockdev_direct_IO can return an error even after the I/O

1450

* blockdev_direct_IO can return an error even after the I/O

1458

* completion handler was called. Thus we need to protect

1451

* completion handler was called. Thus we need to protect

1459

* against double-freeing.

1452

* against double-freeing.

1460

*/

1453

*/

1461

iocb->private = NULL;

1454

iocb->private = NULL;

1462

}

1455

}

1463

1456

1464

STATIC ssize_t

1457

STATIC ssize_t

1465

xfs_vm_direct_IO(

1458

xfs_vm_direct_IO(

1466

int rw,

1459

int rw,

1467

struct kiocb *iocb,

1460

struct kiocb *iocb,

1468

const struct iovec *iov,

1461

const struct iovec *iov,

1469

loff_t offset,

1462

loff_t offset,

1470

unsigned long nr_segs)

1463

unsigned long nr_segs)

1471

{

1464

{

1472

struct file *file = iocb->ki_filp;

1465

struct file *file = iocb->ki_filp;

1473

struct inode *inode = file->f_mapping->host;

1466

struct inode *inode = file->f_mapping->host;

1474

struct block_device *bdev;

1467

struct block_device *bdev;

1475

ssize_t ret;

1468

ssize_t ret;

1476

1469

1477

bdev = xfs_find_bdev_for_inode(inode);

1470

bdev = xfs_find_bdev_for_inode(inode);

1478

1471

1479

iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?

1472

iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?

1480

IO_UNWRITTEN : IO_READ);

1473

IO_UNWRITTEN : IO_READ);

1481

1474

1482

ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,

1475

ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,

1483

offset, nr_segs,

1476

offset, nr_segs,

1484

xfs_get_blocks_direct,

1477

xfs_get_blocks_direct,

1485

xfs_end_io_direct);

1478

xfs_end_io_direct);

1486

1479

1487

if (unlikely(ret != -EIOCBQUEUED && iocb->private))

1480

if (unlikely(ret != -EIOCBQUEUED && iocb->private))

1488

xfs_destroy_ioend(iocb->private);

1481

xfs_destroy_ioend(iocb->private);

1489

return ret;

1482

return ret;

1490

}

1483

}

1491

1484

1492

STATIC int

1485

STATIC int

1493

xfs_vm_write_begin(

1486

xfs_vm_write_begin(

1494

struct file *file,

1487

struct file *file,

1495

struct address_space *mapping,

1488

struct address_space *mapping,

1496

loff_t pos,

1489

loff_t pos,

1497

unsigned len,

1490

unsigned len,

1498

unsigned flags,

1491

unsigned flags,

1499

struct page **pagep,

1492

struct page **pagep,

1500

void **fsdata)

1493

void **fsdata)

1501

{

1494

{

1502

*pagep = NULL;

1495

*pagep = NULL;

1503

return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,

1496

return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,

1504

xfs_get_blocks);

1497

xfs_get_blocks);

1505

}

1498

}

1506

1499

1507

STATIC sector_t

1500

STATIC sector_t

1508

xfs_vm_bmap(

1501

xfs_vm_bmap(

1509

struct address_space *mapping,

1502

struct address_space *mapping,

1510

sector_t block)

1503

sector_t block)

1511

{

1504

{

1512

struct inode *inode = (struct inode *)mapping->host;

1505

struct inode *inode = (struct inode *)mapping->host;

1513

struct xfs_inode *ip = XFS_I(inode);

1506

struct xfs_inode *ip = XFS_I(inode);

1514

1507

1515

trace_xfs_vm_bmap(XFS_I(inode));

1508

trace_xfs_vm_bmap(XFS_I(inode));

1516

xfs_ilock(ip, XFS_IOLOCK_SHARED);

1509

xfs_ilock(ip, XFS_IOLOCK_SHARED);

1517

xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);

1510

xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);

1518

xfs_iunlock(ip, XFS_IOLOCK_SHARED);

1511

xfs_iunlock(ip, XFS_IOLOCK_SHARED);

1519

return generic_block_bmap(mapping, block, xfs_get_blocks);

1512

return generic_block_bmap(mapping, block, xfs_get_blocks);

1520

}

1513

}

1521

1514

1522

STATIC int

1515

STATIC int

1523

xfs_vm_readpage(

1516

xfs_vm_readpage(

1524

struct file *unused,

1517

struct file *unused,

1525

struct page *page)

1518

struct page *page)

1526

{

1519

{

1527

return mpage_readpage(page, xfs_get_blocks);

1520

return mpage_readpage(page, xfs_get_blocks);

1528

}

1521

}

1529

1522

1530

STATIC int

1523

STATIC int

1531

xfs_vm_readpages(

1524

xfs_vm_readpages(

1532

struct file *unused,

1525

struct file *unused,

1533

struct address_space *mapping,

1526

struct address_space *mapping,

1534

struct list_head *pages,

1527

struct list_head *pages,

1535

unsigned nr_pages)

1528

unsigned nr_pages)

1536

{

1529

{

1537

return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);

1530

return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);

1538

}

1531

}

1539

1532

1540

const struct address_space_operations xfs_address_space_operations = {

1533

const struct address_space_operations xfs_address_space_operations = {

1541

.readpage = xfs_vm_readpage,

1534

.readpage = xfs_vm_readpage,

1542

.readpages = xfs_vm_readpages,

1535

.readpages = xfs_vm_readpages,

1543

.writepage = xfs_vm_writepage,

1536

.writepage = xfs_vm_writepage,

1544

.writepages = xfs_vm_writepages,

1537

.writepages = xfs_vm_writepages,

1545

.sync_page = block_sync_page,

1538

.sync_page = block_sync_page,

1546

.releasepage = xfs_vm_releasepage,

1539

.releasepage = xfs_vm_releasepage,

1547

.invalidatepage = xfs_vm_invalidatepage,

1540

.invalidatepage = xfs_vm_invalidatepage,

1548

.write_begin = xfs_vm_write_begin,

1541

.write_begin = xfs_vm_write_begin,

1549

.write_end = generic_write_end,

1542

.write_end = generic_write_end,

1550

.bmap = xfs_vm_bmap,

1543

.bmap = xfs_vm_bmap,

1551

.direct_IO = xfs_vm_direct_IO,

1544

.direct_IO = xfs_vm_direct_IO,

1552

.migratepage = buffer_migrate_page,

1545

.migratepage = buffer_migrate_page,

1553

.is_partially_uptodate = block_is_partially_uptodate,

1546

.is_partially_uptodate = block_is_partially_uptodate,

1554

.error_remove_page = generic_error_remove_page,

1547

.error_remove_page = generic_error_remove_page,

1555

};

1548

};

1556

1549

GITLAB

xfs: writepage always has buffers

 /*
  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
  * published by the Free Software Foundation.
  *
  * This program is distributed in the hope that it would be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write the Free Software Foundation,
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include "xfs.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_trans.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
 #include "xfs_rw.h"
 #include "xfs_iomap.h"
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
 #include "xfs_bmap.h"
 #include <linux/gfp.h>
 #include <linux/mpage.h>
 #include <linux/pagevec.h>
 #include <linux/writeback.h>
 /*
  * Types of I/O for bmap clustering and I/O completion tracking.
  * The value is stored in an ioend's io_type: buffers of differing types are
  * never placed in the same ioend, and the completion handler checks the
  * type to decide which work to perform.
  */
 enum {
 	IO_READ,	/* mapping for a read */
 	IO_DELAY,	/* mapping covers delalloc region */
 	IO_UNWRITTEN,	/* mapping covers allocated but uninitialized data */
 	IO_NEW		/* just allocated */
 };
 /*
  * Prime number of hash buckets since address is used as the key.
  */
 #define NVSYNC		37
 /*
  * Map an address (used as the hash key) to its wait queue bucket.  The
  * argument is parenthesized so that any expression may be passed without
  * the cast binding to only part of it.
  */
 #define to_ioend_wq(v)	(&xfs_ioend_wq[((unsigned long)(v)) % NVSYNC])
 /* Wait queues shared by all inodes that hash to the same bucket. */
 static wait_queue_head_t xfs_ioend_wq[NVSYNC];
 /*
  * Initialise every wait queue head in the ioend wait queue table.
  */
 void __init
 xfs_ioend_init(void)
 {
 	wait_queue_head_t	*wq;
 	for (wq = xfs_ioend_wq; wq < xfs_ioend_wq + NVSYNC; wq++)
 		init_waitqueue_head(wq);
 }
 /*
  * Sleep until the inode's count of outstanding ioends (i_iocount) is zero.
  */
 void
 xfs_ioend_wait(
 	xfs_inode_t	*ip)
 {
 	wait_queue_head_t *queue = to_ioend_wq(ip);
 	wait_event(*queue, !atomic_read(&ip->i_iocount));
 }
 /*
  * Drop one reference from the inode's i_iocount and, if that was the last
  * one, wake up anybody sleeping on the inode's wait queue bucket.
  */
 STATIC void
 xfs_ioend_wake(
 	xfs_inode_t	*ip)
 {
 	if (!atomic_dec_and_test(&ip->i_iocount))
 		return;
 	wake_up(to_ioend_wq(ip));
 }
 /*
  * Report whether a page holds delalloc and/or unwritten buffers.
  *
  * Both outputs are flags (0 or 1), not counts.  A buffer that is marked
  * unwritten is never also reported as delalloc.  The page must have
  * buffers attached.
  */
 void
 xfs_count_page_state(
 	struct page		*page,
 	int			*delalloc,
 	int			*unwritten)
 {
 	struct buffer_head	*first, *cur;
 	*unwritten = 0;
 	*delalloc = 0;
 	first = cur = page_buffers(page);
 	do {
 		if (buffer_unwritten(cur))
 			*unwritten = 1;
 		else if (buffer_delay(cur))
 			*delalloc = 1;
 		cur = cur->b_this_page;
 	} while (cur != first);
 }
 /*
  * Return the block device backing an inode's data: the realtime device
  * target for realtime inodes, the data device target for everything else.
  */
 STATIC struct block_device *
 xfs_find_bdev_for_inode(
 	struct inode		*inode)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
 	return XFS_IS_REALTIME_INODE(ip) ?
 		mp->m_rtdev_targp->bt_bdev : mp->m_ddev_targp->bt_bdev;
 }
 /*
  * Final teardown of an ioend.  Completes every buffer_head chained on the
  * ioend, drops the inode's I/O count and returns the ioend to its mempool.
  * The ioend must not be touched by the caller afterwards.
  */
 STATIC void
 xfs_destroy_ioend(
 	xfs_ioend_t		*ioend)
 {
 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
 	struct buffer_head	*bh = ioend->io_buffer_head;
 	while (bh) {
 		/* read the chain link before the completion callback runs */
 		struct buffer_head	*next = bh->b_private;
 		bh->b_end_io(bh, !ioend->io_error);
 		bh = next;
 	}
 	/*
 	 * A multipath volume manager can return ENODEV once its last path
 	 * has gone away.  Shut the filesystem down in that case rather than
 	 * keep filling the page cache with dirty data that can never be
 	 * written back.
 	 */
 	if (unlikely(ioend->io_error == -ENODEV)) {
 		xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ,
 				      __FILE__, __LINE__);
 	}
 	xfs_ioend_wake(ip);
 	mempool_free(ioend, xfs_ioend_pool);
 }
 /*
  * Work out whether this ioend moves the on-disk EOF.
  *
  * The candidate size is the larger of i_size and i_new_size, clamped to the
  * end of the ioend.  It is returned if it lies beyond the on-disk size
  * (di_size); otherwise zero is returned.
  */
 STATIC xfs_fsize_t
 xfs_ioend_new_eof(
 	xfs_ioend_t		*ioend)
 {
 	xfs_inode_t		*ip = XFS_I(ioend->io_inode);
 	xfs_fsize_t		io_end;
 	xfs_fsize_t		new_eof;
 	io_end = ioend->io_offset + ioend->io_size;
 	new_eof = MAX(ip->i_size, ip->i_new_size);
 	if (io_end <= new_eof)
 		new_eof = io_end;
 	if (new_eof > ip->i_d.di_size)
 		return new_eof;
 	return 0;
 }
 /*
  * Update on-disk file size now that data has been written to disk.  The
  * current in-memory file size is i_size.  If a write is beyond eof i_new_size
  * will be the intended file size until i_size is updated.  If this write does
  * not extend all the way to the valid file size then restrict this update to
  * the end of the write.
  *
  * This function does not block as blocking on the inode lock in IO completion
  * can lead to IO completion order dependency deadlocks. If it can't get the
  * inode ilock it will return EAGAIN. Callers must handle this.
  *
  * Returns 0 if the size was updated, no update was needed, or the ioend
  * carries an I/O error; returns (positive) EAGAIN if the ilock was contended.
  */
 STATIC int
 xfs_setfilesize(
 	xfs_ioend_t		*ioend)
 {
 	xfs_inode_t		*ip = XFS_I(ioend->io_inode);
 	xfs_fsize_t		isize;
 	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
 	ASSERT(ioend->io_type != IO_READ);
 	/* never move the on-disk size for I/O that failed */
 	if (unlikely(ioend->io_error))
 		return 0;
 	/* trylock only - see the deadlock note above */
 	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
 		return EAGAIN;
 	/* non-zero only when this ioend extends past the on-disk size */
 	isize = xfs_ioend_new_eof(ioend);
 	if (isize) {
 		ip->i_d.di_size = isize;
 		xfs_mark_inode_dirty(ip);
 	}
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return 0;
 }
 /*
  * Drop one hold on the ioend.  When the last hold goes away, queue the
  * ioend's completion work: unwritten extent conversions go to the
  * xfsconvertd workqueue, everything else to xfsdatad.  If @wait is set the
  * chosen workqueue is flushed before returning.
  */
 STATIC void
 xfs_finish_ioend(
 	xfs_ioend_t	*ioend,
 	int		wait)
 {
 	struct workqueue_struct	*target;
 	if (!atomic_dec_and_test(&ioend->io_remaining))
 		return;
 	if (ioend->io_type == IO_UNWRITTEN)
 		target = xfsconvertd_workqueue;
 	else
 		target = xfsdatad_workqueue;
 	queue_work(target, &ioend->io_work);
 	if (wait)
 		flush_workqueue(target);
 }
 /*
  * IO write completion.
  *
  * Work item run for an ioend once its last hold has been dropped (see
  * xfs_finish_ioend).  Converts unwritten extents when required, attempts to
  * update the on-disk file size, and then either destroys the ioend or
  * requeues it if the size update could not take the inode lock.
  */
 STATIC void
 xfs_end_io(
 	struct work_struct *work)
 {
 	xfs_ioend_t	*ioend = container_of(work, xfs_ioend_t, io_work);
 	struct xfs_inode *ip = XFS_I(ioend->io_inode);
 	int		error = 0;
 	/*
 	 * For unwritten extents we need to issue transactions to convert a
 	 * range to normal written extents after the data I/O has finished.
 	 */
 	if (ioend->io_type == IO_UNWRITTEN &&
 	    likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
 		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
 						 ioend->io_size);
 		/* record the failure so buffers are completed as not uptodate */
 		if (error)
 			ioend->io_error = error;
 	}
 	/*
 	 * We might have to update the on-disk file size after extending
 	 * writes.
 	 */
 	if (ioend->io_type != IO_READ) {
 		error = xfs_setfilesize(ioend);
 		ASSERT(!error || error == EAGAIN);
 	}
 	/*
 	 * If we didn't complete processing of the ioend, requeue it to the
 	 * tail of the workqueue for another attempt later. Otherwise destroy
 	 * it.
 	 */
 	if (error == EAGAIN) {
 		/* retake the hold that xfs_finish_ioend() drops when requeueing */
 		atomic_inc(&ioend->io_remaining);
 		xfs_finish_ioend(ioend, 0);
 		/* ensure we don't spin on blocked ioends */
 		delay(1);
 	} else
 		xfs_destroy_ioend(ioend);
 }
 /*
  * Allocate and initialise an IO completion structure.
  * We need to track unwritten extent write completion here initially.
  * We'll need to extend this for updating the ondisk inode size later
  * (vs. incore size).
  *
  * The new ioend is of the given @type, has an empty buffer chain, zero
  * offset and size, and takes a reference on the inode's i_iocount.
  */
 STATIC xfs_ioend_t *
 xfs_alloc_ioend(
 	struct inode		*inode,
 	unsigned int		type)
 {
 	xfs_ioend_t		*ioend;
 	ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
 	/*
 	 * Set the count to 1 initially so that an I/O completion callback
 	 * which fires before we have started all the I/O cannot run the
 	 * completion routine too early.
 	 */
 	atomic_set(&ioend->io_remaining, 1);
 	ioend->io_error = 0;
 	ioend->io_list = NULL;
 	ioend->io_type = type;
 	ioend->io_inode = inode;
 	ioend->io_buffer_head = NULL;
 	ioend->io_buffer_tail = NULL;
 	/* balanced by xfs_ioend_wake() when the ioend is destroyed/cancelled */
 	atomic_inc(&XFS_I(ioend->io_inode)->i_iocount);
 	ioend->io_offset = 0;
 	ioend->io_size = 0;
 	INIT_WORK(&ioend->io_work, xfs_end_io);
 	return ioend;
 }
 /*
  * Look up (or allocate, depending on @flags) a single extent mapping for
  * the given byte range through xfs_iomap() and store it in @imap.
  *
  * Returns the negated xfs_iomap() result.
  */
 STATIC int
 xfs_map_blocks(
 	struct inode		*inode,
 	loff_t			offset,
 	ssize_t			count,
 	struct xfs_bmbt_irec	*imap,
 	int			flags)
 {
 	int			is_new = 0;
 	int			nimaps = 1;
 	int			error;
 	error = xfs_iomap(XFS_I(inode), offset, count, flags, imap,
 			  &nimaps, &is_new);
 	return -error;
 }
 /*
  * Check whether the byte @offset falls inside the extent described by
  * @imap.  The offset is converted to a block index using the inode's block
  * size before being compared against the extent's start and length.
  *
  * Returns non-zero if the mapping covers the offset, zero otherwise.
  */
 STATIC int
 xfs_imap_valid(
 	struct inode		*inode,
 	struct xfs_bmbt_irec	*imap,
 	xfs_off_t		offset)
 {
 	xfs_off_t		fsb = offset >> inode->i_blkbits;
 	if (fsb < imap->br_startoff)
 		return 0;
 	return fsb < imap->br_startoff + imap->br_blockcount;
 }
 /*
  * BIO completion handler for buffered IO.
  */
 STATIC void
 xfs_end_bio(
 	struct bio		*bio,
 	int			error)
 {
 	xfs_ioend_t		*ioend = bio->bi_private;
 	ASSERT(atomic_read(&bio->bi_cnt) >= 1);
 	ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
 	/* Toss bio and pass work off to an xfsdatad thread */
 	bio->bi_private = NULL;
 	bio->bi_end_io = NULL;
 	bio_put(bio);
 	xfs_finish_ioend(ioend, 0);
 }
 /*
  * Attach @bio to @ioend and send it to the block layer.
  *
  * Takes a hold on the ioend for the bio (dropped again in xfs_end_bio) and
  * submits the bio as WRITE_SYNC_PLUG for WB_SYNC_ALL writeback, or as a
  * plain WRITE otherwise.  The caller's bio reference is released here.
  */
 STATIC void
 xfs_submit_ioend_bio(
 	struct writeback_control *wbc,
 	xfs_ioend_t		*ioend,
 	struct bio		*bio)
 {
 	int			rw = WRITE;
 	atomic_inc(&ioend->io_remaining);
 	bio->bi_end_io = xfs_end_bio;
 	bio->bi_private = ioend;
 	/*
 	 * For I/O that extends the on-disk size, dirty the inode right away;
 	 * the size itself is only updated at I/O completion.
 	 */
 	if (xfs_ioend_new_eof(ioend))
 		xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		rw = WRITE_SYNC_PLUG;
 	submit_bio(rw, bio);
 	ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
 	bio_put(bio);
 }
 /*
  * Allocate a bio for writing out buffers starting at @bh.
  *
  * The initial vector count comes from bio_get_nr_vecs() for the buffer's
  * device; whenever an allocation fails it is retried with half as many
  * vectors until one succeeds.  The start sector and device are taken from
  * @bh, and an extra reference is taken with bio_get() before returning.
  */
 STATIC struct bio *
 xfs_alloc_ioend_bio(
 	struct buffer_head	*bh)
 {
 	struct bio		*bio;
 	int			nvecs = bio_get_nr_vecs(bh->b_bdev);
 	do {
 		bio = bio_alloc(GFP_NOIO, nvecs);
 		nvecs >>= 1;
 	} while (!bio);
 	ASSERT(bio->bi_private == NULL);
 	/* b_blocknr counts b_size units; bi_sector counts 512 byte sectors */
 	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio->bi_bdev = bh->b_bdev;
 	bio_get(bio);
 	return bio;
 }
 /*
  * Move a buffer into the writeback state: mark it for async write, set it
  * uptodate and clear its dirty flag.  The buffer must already be mapped and
  * locked, and must no longer be delalloc or unwritten.
  */
 STATIC void
 xfs_start_buffer_writeback(
 	struct buffer_head	*bh)
 {
 	ASSERT(buffer_mapped(bh));
 	ASSERT(buffer_locked(bh));
 	ASSERT(!buffer_delay(bh));
 	ASSERT(!buffer_unwritten(bh));
 	mark_buffer_async_write(bh);
 	set_buffer_uptodate(bh);
 	clear_buffer_dirty(bh);
 }
 /*
  * Put a locked page into writeback and unlock it.
  *
  * @clear_dirty: if non-zero, clear the page's dirty state for I/O first.
  * @buffers: number of buffers on the page that are going to be written; if
  *           zero, writeback on the page is ended again immediately.
  *
  * The page must be locked and not already under writeback.
  */
 STATIC void
 xfs_start_page_writeback(
 	struct page		*page,
 	int			clear_dirty,
 	int			buffers)
 {
 	ASSERT(PageLocked(page));
 	ASSERT(!PageWriteback(page));
 	if (clear_dirty)
 		clear_page_dirty_for_io(page);
 	set_page_writeback(page);
 	unlock_page(page);
 	/* If no buffers on the page are to be written, finish it here */
 	if (!buffers)
 		end_page_writeback(page);
 }
 /*
  * Add the memory backing @bh (its page, size and offset within the page) to
  * @bio.  Returns whatever bio_add_page() returns; callers in this file
  * compare the result against bh->b_size to detect that the bio is full.
  */
 static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
 {
 	return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
 }
 /*
  * Submit all of the bios for all of the ioends we have saved up, covering the
  * initial writepage page and also any probed pages.
  *
  * Because we may have multiple ioends spanning a page, we need to start
  * writeback on all the buffers before we submit them for I/O. If we mark the
  * buffers as we go, then we can end up with a page that only has some of its
  * buffers marked async write, and I/O completion on those can occur before
  * we mark the other buffers async write.
  *
  * The end result of this is that we trip a bug in end_page_writeback() because
  * we call it twice for the one page as the code in end_buffer_async_write()
  * assumes that all buffers on the page are started at the same time.
  *
  * The fix is two passes across the ioend list - one to start writeback on the
  * buffer_heads, and then submit them for I/O on the second pass.
  */
 STATIC void
 xfs_submit_ioend(
 	struct writeback_control *wbc,
 	xfs_ioend_t		*ioend)
 {
 	xfs_ioend_t		*head = ioend;
 	xfs_ioend_t		*next;
 	struct buffer_head	*bh;
 	struct bio		*bio;
 	sector_t		lastblock = 0;
 	/* Pass 1 - start writeback */
 	do {
 		next = ioend->io_list;
 		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
 			xfs_start_buffer_writeback(bh);
 		}
 	} while ((ioend = next) != NULL);
 	/* Pass 2 - submit I/O */
 	ioend = head;
 	do {
 		next = ioend->io_list;
 		bio = NULL;
 		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
 			if (!bio) {
  retry:
 				bio = xfs_alloc_ioend_bio(bh);
 			} else if (bh->b_blocknr != lastblock + 1) {
 				/* not contiguous on disk: flush and start a new bio */
 				xfs_submit_ioend_bio(wbc, ioend, bio);
 				goto retry;
 			}
 			/* bio could not take the whole buffer: flush and retry */
 			if (bio_add_buffer(bio, bh) != bh->b_size) {
 				xfs_submit_ioend_bio(wbc, ioend, bio);
 				goto retry;
 			}
 			lastblock = bh->b_blocknr;
 		}
 		if (bio)
 			xfs_submit_ioend_bio(wbc, ioend, bio);
 		/* drop the initial hold taken in xfs_alloc_ioend() */
 		xfs_finish_ioend(ioend, 0);
 	} while ((ioend = next) != NULL);
 }
 /*
  * Cancel submission of all buffer_heads so far in this ioend chain.
  * Toss the ioends too.  Only ever called for the initial page
  * in a writepage request, so only ever one page.
  *
  * Each buffer has its async-write flag cleared and is unlocked; each ioend
  * drops its inode I/O count and is returned to the mempool.
  */
 STATIC void
 xfs_cancel_ioend(
 	xfs_ioend_t		*ioend)
 {
 	xfs_ioend_t		*next;
 	struct buffer_head	*bh, *next_bh;
 	do {
 		next = ioend->io_list;
 		bh = ioend->io_buffer_head;
 		do {
 			/* fetch the chain link before the buffer is unlocked */
 			next_bh = bh->b_private;
 			clear_buffer_async_write(bh);
 			unlock_buffer(bh);
 		} while ((bh = next_bh) != NULL);
 		xfs_ioend_wake(XFS_I(ioend->io_inode));
 		mempool_free(ioend, xfs_ioend_pool);
 	} while ((ioend = next) != NULL);
 }
 /*
  * Test to see if we've been building up a completion structure for
  * earlier buffers -- if so, we try to append to this ioend if we
  * can, otherwise we start another ioend and chain it after the current one.
  *
  * A new ioend is started when there is no current ioend, when the caller
  * asks for one via @need_ioend, or when @type differs from the current
  * ioend's type.  On return *result points at the ioend the buffer was
  * added to.
  */
 STATIC void
 xfs_add_to_ioend(
 	struct inode		*inode,
 	struct buffer_head	*bh,
 	xfs_off_t		offset,
 	unsigned int		type,
 	xfs_ioend_t		**result,
 	int			need_ioend)
 {
 	xfs_ioend_t		*ioend = *result;
 	if (!ioend || need_ioend || type != ioend->io_type) {
 		xfs_ioend_t	*previous = *result;
 		ioend = xfs_alloc_ioend(inode, type);
 		ioend->io_offset = offset;
 		ioend->io_buffer_head = bh;
 		ioend->io_buffer_tail = bh;
 		if (previous)
 			previous->io_list = ioend;
 		*result = ioend;
 	} else {
 		/* buffers are chained through b_private */
 		ioend->io_buffer_tail->b_private = bh;
 		ioend->io_buffer_tail = bh;
 	}
 	/* the new buffer is always the tail of its chain */
 	bh->b_private = NULL;
 	ioend->io_size += bh->b_size;
 }
 /*
  * Compute the block number for the buffer at byte @offset from the extent
  * mapping @imap, store it in bh->b_blocknr and mark the buffer mapped.
  * The mapping must describe real blocks, not a hole or a delalloc extent.
  */
 STATIC void
 xfs_map_buffer(
 	struct inode		*inode,
 	struct buffer_head	*bh,
 	struct xfs_bmbt_irec	*imap,
 	xfs_off_t		offset)
 {
 	sector_t		bn;
 	struct xfs_mount	*m = XFS_I(inode)->i_mount;
 	xfs_off_t		iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
 	xfs_daddr_t		iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
 	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
 	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 	/*
 	 * Start of the extent rescaled to inode block size units, plus the
 	 * number of blocks that @offset lies into the extent.
 	 */
 	bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
 	      ((offset - iomap_offset) >> inode->i_blkbits);
 	ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
 	bh->b_blocknr = bn;
 	set_buffer_mapped(bh);
 }
 /*
  * Lock the buffer and map it to disk according to @imap: set its block
  * number and block device, and clear its delalloc and unwritten state.
  * The buffer is left locked on return.
  */
 STATIC void
 xfs_map_at_offset(
 	struct inode		*inode,
 	struct buffer_head	*bh,
 	struct xfs_bmbt_irec	*imap,
 	xfs_off_t		offset)
 {
 	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
 	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 	lock_buffer(bh);
 	xfs_map_buffer(inode, bh, imap, offset);
 	bh->b_bdev = xfs_find_bdev_for_inode(inode);
 	/* NOTE(review): xfs_map_buffer() above already sets the mapped flag */
 	set_buffer_mapped(bh);
 	clear_buffer_delay(bh);
 	clear_buffer_unwritten(bh);
 }
 /*
  * Decide how much of a page is suitable for clustering.
  *
  * Returns the number of bytes covered by the leading run of uptodate,
  * mapped buffers on the page, stopping once @pg_offset bytes have been
  * reached.  Returns 0 for pages that are under writeback, clean, without a
  * mapping or without buffers.
  */
 STATIC unsigned int
 xfs_probe_page(
 	struct page		*page,
 	unsigned int		pg_offset)
 {
 	struct buffer_head	*first, *cur;
 	int			bytes = 0;
 	if (PageWriteback(page) || !PageDirty(page))
 		return 0;
 	if (!page->mapping || !page_has_buffers(page))
 		return 0;
 	first = cur = page_buffers(page);
 	do {
 		if (!buffer_uptodate(cur) || !buffer_mapped(cur))
 			break;
 		bytes += cur->b_size;
 		if (bytes >= pg_offset)
 			break;
 		cur = cur->b_this_page;
 	} while (cur != first);
 	return bytes;
 }
 /*
  * Count the bytes of uptodate, mapped buffers that follow @bh.
  *
  * The scan first covers the remaining buffers of @startpage (from @bh up to
  * @head).  If all of them qualify it continues into the following pages,
  * limited to 64 pages past the start page and to the page containing EOF,
  * and stops at the first page that contributes nothing.
  *
  * Returns the total number of bytes found.
  */
 STATIC size_t
 xfs_probe_cluster(
 	struct inode		*inode,
 	struct page		*startpage,
 	struct buffer_head	*bh,
 	struct buffer_head	*head)
 {
 	struct pagevec		pvec;
 	pgoff_t			tindex, tlast, tloff;
 	size_t			total = 0;
 	int			done = 0, i;
 	/* First sum forwards in this page */
 	do {
 		if (!buffer_uptodate(bh) || !buffer_mapped(bh))
 			return total;
 		total += bh->b_size;
 	} while ((bh = bh->b_this_page) != head);
 	/* if we reached the end of the page, sum forwards in following pages */
 	tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
 	tindex = startpage->index + 1;
 	/* Prune this back to avoid pathological behavior */
 	tloff = min(tlast, startpage->index + 64);
 	pagevec_init(&pvec, 0);
 	while (!done && tindex <= tloff) {
 		unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
 		if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
 			break;
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 			size_t pg_offset, pg_len = 0;
 			if (tindex == tlast) {
 				/* EOF page: only the bytes below i_size count */
 				pg_offset =
 				    i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
 				if (!pg_offset) {
 					done = 1;
 					break;
 				}
 			} else
 				pg_offset = PAGE_CACHE_SIZE;
 			/* skip pages that are not the expected index or are locked */
 			if (page->index == tindex && trylock_page(page)) {
 				pg_len = xfs_probe_page(page, pg_offset);
 				unlock_page(page);
 			}
 			if (!pg_len) {
 				done = 1;
 				break;
 			}
 			total += pg_len;
 			tindex++;
 		}
 		pagevec_release(&pvec);
 		cond_resched();
 	}
 	return total;
 }
 /*
  * Decide whether a page may be written as part of an extent of the given
  * I/O @type.
  *
  * The page's buffers are walked until one is found that is neither
  * unwritten, delalloc, nor dirty and mapped.  The answer is determined by
  * the last buffer examined before the walk stopped: 1 if its state matches
  * @type, 0 otherwise.  Pages under writeback, without a mapping or without
  * buffers always yield 0.
  */
 STATIC int
 xfs_is_delayed_page(
 	struct page		*page,
 	unsigned int		type)
 {
 	struct buffer_head	*first, *cur;
 	int			matches = 0;
 	if (PageWriteback(page))
 		return 0;
 	if (!page->mapping || !page_has_buffers(page))
 		return 0;
 	first = cur = page_buffers(page);
 	do {
 		unsigned int	state;
 		if (buffer_unwritten(cur))
 			state = IO_UNWRITTEN;
 		else if (buffer_delay(cur))
 			state = IO_DELAY;
 		else if (buffer_dirty(cur) && buffer_mapped(cur))
 			state = IO_NEW;
 		else
 			break;
 		matches = (type == state);
 		cur = cur->b_this_page;
 	} while (cur != first);
 	return matches ? 1 : 0;
 }
 /*
  * Allocate & map buffers for page given the extent map. Write it out.
  * except for the original page of a writepage, this is called on
  * delalloc/unwritten pages only, for the original page it is possible
  * that the page has no mapping at all.
  *
  * The page is only processed if it has the expected index, can be locked
  * without blocking, is not under writeback, still belongs to the inode's
  * mapping and matches the I/O type of the current ioend.
  *
  * Returns non-zero when the caller should stop clustering further pages,
  * zero when it may continue with the next page.
  */
 STATIC int
 xfs_convert_page(
 	struct inode		*inode,
 	struct page		*page,
 	loff_t			tindex,
 	struct xfs_bmbt_irec	*imap,
 	xfs_ioend_t		**ioendp,
 	struct writeback_control *wbc,
 	int			all_bh)
 {
 	struct buffer_head	*bh, *head;
 	xfs_off_t		end_offset;
 	unsigned long		p_offset;
 	unsigned int		type;
 	int			len, page_dirty;
 	int			count = 0, done = 0, uptodate = 1;
  	xfs_off_t		offset = page_offset(page);
 	if (page->index != tindex)
 		goto fail;
 	if (!trylock_page(page))
 		goto fail;
 	if (PageWriteback(page))
 		goto fail_unlock_page;
 	if (page->mapping != inode->i_mapping)
 		goto fail_unlock_page;
 	if (!xfs_is_delayed_page(page, (*ioendp)->io_type))
 		goto fail_unlock_page;
 	/*
 	 * page_dirty is initially a count of buffers on the page before
 	 * EOF and is decremented as we move each into a cleanable state.
 	 *
 	 * Derivation:
 	 *
 	 * End offset is the highest offset that this page should represent.
 	 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
 	 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
 	 * hence give us the correct page_dirty count. On any other page,
 	 * it will be zero and in that case we need page_dirty to be the
 	 * count of buffers on the page.
 	 */
 	end_offset = min_t(unsigned long long,
 			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
 			i_size_read(inode));
 	len = 1 << inode->i_blkbits;
 	p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
 					PAGE_CACHE_SIZE);
 	p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
 	page_dirty = p_offset / len;
 	bh = head = page_buffers(page);
 	do {
 		/* buffers at or beyond EOF are not written */
 		if (offset >= end_offset)
 			break;
 		if (!buffer_uptodate(bh))
 			uptodate = 0;
 		if (!(PageUptodate(page) || buffer_uptodate(bh))) {
 			done = 1;
 			continue;
 		}
 		if (buffer_unwritten(bh) || buffer_delay(bh)) {
 			if (buffer_unwritten(bh))
 				type = IO_UNWRITTEN;
 			else
 				type = IO_DELAY;
 			/* the mapping does not reach this buffer: stop clustering */
 			if (!xfs_imap_valid(inode, imap, offset)) {
 				done = 1;
 				continue;
 			}
 			ASSERT(imap->br_startblock != HOLESTARTBLOCK);
 			ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 			xfs_map_at_offset(inode, bh, imap, offset);
 			xfs_add_to_ioend(inode, bh, offset, type,
 					 ioendp, done);
 			page_dirty--;
 			count++;
 		} else {
 			type = IO_NEW;
 			/* plain mapped buffers are only taken if the caller allows */
 			if (buffer_mapped(bh) && all_bh) {
 				lock_buffer(bh);
 				xfs_add_to_ioend(inode, bh, offset,
 						type, ioendp, done);
 				count++;
 				page_dirty--;
 			} else {
 				done = 1;
 			}
 		}
 	} while (offset += len, (bh = bh->b_this_page) != head);
 	/* only if the walk covered every buffer and all were uptodate */
 	if (uptodate && bh == head)
 		SetPageUptodate(page);
 	if (count) {
 		wbc->nr_to_write--;
 		if (wbc->nr_to_write <= 0)
 			done = 1;
 	}
 	/* the dirty flag is only cleared once no dirty buffers remain */
 	xfs_start_page_writeback(page, !page_dirty, count);
 	return done;
  fail_unlock_page:
 	unlock_page(page);
  fail:
 	return 1;
 }
 /*
  * Convert & write out a cluster of pages in the same extent as defined
  * by imap and following the start page.
  *
  * Pages from index @tindex up to and including @tlast are looked up in
  * batches and handed to xfs_convert_page() one by one until it reports that
  * clustering should stop or no more pages are found.
  */
 STATIC void
 xfs_cluster_write(
 	struct inode		*inode,
 	pgoff_t			tindex,
 	struct xfs_bmbt_irec	*imap,
 	xfs_ioend_t		**ioendp,
 	struct writeback_control *wbc,
 	int			all_bh,
 	pgoff_t			tlast)
 {
 	struct pagevec		pvec;
 	int			done = 0, i;
 	pagevec_init(&pvec, 0);
 	while (!done && tindex <= tlast) {
 		unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
 		if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
 			break;
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			/* tindex is the index the next page is expected to have */
 			done = xfs_convert_page(inode, pvec.pages[i], tindex++,
 					imap, ioendp, wbc, all_bh);
 			if (done)
 				break;
 		}
 		pagevec_release(&pvec);
 		cond_resched();
 	}
 }
 /*
  * Invalidate the part of a page starting at byte @offset: emit the
  * tracepoint and hand the work to the generic block_invalidatepage().
  */
 STATIC void
 xfs_vm_invalidatepage(
 	struct page		*page,
 	unsigned long		offset)
 {
 	trace_xfs_invalidatepage(page->mapping->host, page, offset);
 	block_invalidatepage(page, offset);
 }
 /*
  * If the page has delalloc buffers on it, we need to punch them out before we
  * invalidate the page. If we don't, we leave a stale delalloc mapping on the
  * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
  * is done on that same region - the delalloc extent is returned when none is
  * supposed to be there.
  *
  * We prevent this by truncating away the delalloc regions on the page before
  * invalidating it. Because they are delalloc, we can do this without needing a
  * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
  * truncation without a transaction as there is no space left for block
  * reservation (typically why we see a ENOSPC in writeback).
  *
  * This is not a performance critical path, so for now just do the punching a
  * buffer head at a time.
  */
 STATIC void
 xfs_aops_discard_page(
 	struct page		*page)
 {
 	struct inode		*inode = page->mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct buffer_head	*bh, *head;
 	loff_t			offset = page_offset(page);
 	ssize_t			len = 1 << inode->i_blkbits;
 	/* nothing to punch unless the page qualifies as a delalloc page */
 	if (!xfs_is_delayed_page(page, IO_DELAY))
 		goto out_invalidate;
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		goto out_invalidate;
 	xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
 		"page discard on page %p, inode 0x%llx, offset %llu.",
 			page, ip->i_ino, offset);
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	bh = head = page_buffers(page);
 	do {
 		int		done;
 		xfs_fileoff_t	offset_fsb;
 		xfs_bmbt_irec_t	imap;
 		int		nimaps = 1;
 		int		error;
 		xfs_fsblock_t	firstblock;
 		xfs_bmap_free_t flist;
 		if (!buffer_delay(bh))
 			goto next_buffer;
 		offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
 		/*
 		 * Map the range first and check that it is a delalloc extent
 		 * before trying to unmap the range. Otherwise we will be
 		 * trying to remove a real extent (which requires a
 		 * transaction) or a hole, which is probably a bad idea...
 		 */
 		error = xfs_bmapi(NULL, ip, offset_fsb, 1,
 				XFS_BMAPI_ENTIRE,  NULL, 0, &imap,
 				&nimaps, NULL);
 		if (error) {
 			/* something screwed, just bail */
 			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 				xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
 				"page discard failed delalloc mapping lookup.");
 			}
 			break;
 		}
 		if (!nimaps) {
 			/* nothing there */
 			goto next_buffer;
 		}
 		if (imap.br_startblock != DELAYSTARTBLOCK) {
 			/* been converted, ignore */
 			goto next_buffer;
 		}
 		WARN_ON(imap.br_blockcount == 0);
 		/*
 		 * Note: while we initialise the firstblock/flist pair, they
 		 * should never be used because blocks should never be
 		 * allocated or freed for a delalloc extent and hence we
 		 * don't need to cancel or finish them after the
 		 * xfs_bunmapi() call.
 		 */
 		xfs_bmap_init(&flist, &firstblock);
 		error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
 					&flist, &done);
 		ASSERT(!flist.xbf_count && !flist.xbf_first);
 		if (error) {
 			/* something screwed, just bail */
 			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 				xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
 			"page discard unable to remove delalloc mapping.");
 			}
 			break;
 		}
 next_buffer:
 		offset += len;
 	} while ((bh = bh->b_this_page) != head);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 out_invalidate:
 	xfs_vm_invalidatepage(page, 0);
 	return;
 }
 /*
  * Write out a dirty page.
  *
  * For delalloc space on the page we need to allocate space and flush it.
  * For unwritten space on the page we need to start the conversion to
  * regular allocated space.
  * For any other dirty buffer heads on the page we should flush them.
  *
  * If we detect that a transaction would be required to flush the page, we
  * have to check the process flags first, if we are already in a transaction
  * or disk I/O during allocations is off, we need to fail the writepage and
  * redirty the page.
  */
 STATIC int
 xfs_vm_writepage(
 	struct page		*page,
 	struct writeback_control *wbc)
 {
 	struct inode		*inode = page->mapping->host;
 	int			delalloc, unwritten;
 	struct buffer_head	*bh, *head;
 	struct xfs_bmbt_irec	imap;
 	xfs_ioend_t		*ioend = NULL, *iohead = NULL;
 	loff_t			offset;
 	unsigned int		type;
 	__uint64_t              end_offset;
 	pgoff_t                 end_index, last_index;
 	ssize_t			size, len;
 	int			flags, err, imap_valid = 0, uptodate = 1;
 	int			count = 0;
 	int			all_bh = 0;
 	trace_xfs_writepage(inode, page, 0);
 	ASSERT(page_has_buffers(page));
 	/*
 	 * Refuse to write the page out if we are called from reclaim context.
 	 *
 	 * This avoids stack overflows when called from deeply used stacks in
 	 * random callers for direct reclaim or memcg reclaim.  We explicitly
 	 * allow reclaim from kswapd as the stack usage there is relatively low.
 	 *
 	 * This should really be done by the core VM, but until that happens
 	 * filesystems like XFS, btrfs and ext4 have to take care of this
 	 * by themselves.
 	 */
 	if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)
 		goto out_fail;
 	/*
 	 * We need a transaction if there are delalloc or unwritten buffers
 	 * on the page.
 	 *
 	 * If we need a transaction and the process flags say we are already
 	 * in a transaction, or no IO is allowed then mark the page dirty
 	 * again and leave the page as is.
 	 */
 	xfs_count_page_state(page, &delalloc, &unwritten);
 	if ((current->flags & PF_FSTRANS) && (delalloc || unwritten))
 		goto out_fail;
-	/*
-	 * Delay hooking up buffer heads until we have
-	 * made our go/no-go decision.
-	 */
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
 	/* Is this page beyond the end of the file? */
 	offset = i_size_read(inode);
 	end_index = offset >> PAGE_CACHE_SHIFT;
 	last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
 	if (page->index >= end_index) {
 		if ((page->index >= end_index + 1) ||
 		    !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
 			unlock_page(page);
 			return 0;
 		}
 	}
 	end_offset = min_t(unsigned long long,
 			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
 			offset);
 	len = 1 << inode->i_blkbits;
 	bh = head = page_buffers(page);
 	offset = page_offset(page);
 	flags = BMAPI_READ;
 	type = IO_NEW;
 	do {
 		if (offset >= end_offset)
 			break;
 		if (!buffer_uptodate(bh))
 			uptodate = 0;
 		/*
 		 * A hole may still be marked uptodate because discard_buffer
 		 * leaves the flag set.
 		 */
 		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
 			ASSERT(!buffer_dirty(bh));
 			imap_valid = 0;
 			continue;
 		}
 		if (imap_valid)
 			imap_valid = xfs_imap_valid(inode, &imap, offset);
 		if (buffer_unwritten(bh) || buffer_delay(bh)) {
 			int new_ioend = 0;
 			/*
 			 * Make sure we don't use a read-only iomap
 			 */
 			if (flags == BMAPI_READ)
 				imap_valid = 0;
 			if (buffer_unwritten(bh)) {
 				type = IO_UNWRITTEN;
 				flags = BMAPI_WRITE | BMAPI_IGNSTATE;
 			} else if (buffer_delay(bh)) {
 				type = IO_DELAY;
 				flags = BMAPI_ALLOCATE;
 				if (wbc->sync_mode == WB_SYNC_NONE &&
 				    wbc->nonblocking)
 					flags |= BMAPI_TRYLOCK;
 			}
 			if (!imap_valid) {
 				/*
 				 * If we didn't have a valid mapping then we
 				 * need to ensure that we put the new mapping
 				 * in a new ioend structure. This needs to be
 				 * done to ensure that the ioends correctly
 				 * reflect the block mappings at io completion
 				 * for unwritten extent conversion.
 				 */
 				new_ioend = 1;
 				err = xfs_map_blocks(inode, offset, len,
 						&imap, flags);
 				if (err)
 					goto error;
 				imap_valid = xfs_imap_valid(inode, &imap,
 							    offset);
 			}
 			if (imap_valid) {
 				xfs_map_at_offset(inode, bh, &imap, offset);
 				xfs_add_to_ioend(inode, bh, offset, type,
 						 &ioend, new_ioend);
 				count++;
 			}
 		} else if (buffer_uptodate(bh)) {
 			/*
 			 * we got here because the buffer is already mapped.
 			 * That means it must already have extents allocated
 			 * underneath it. Map the extent by reading it.
 			 */
 			if (!imap_valid || flags != BMAPI_READ) {
 				flags = BMAPI_READ;
 				size = xfs_probe_cluster(inode, page, bh, head);
 				err = xfs_map_blocks(inode, offset, size,
 						&imap, flags);
 				if (err)
 					goto error;
 				imap_valid = xfs_imap_valid(inode, &imap,
 							    offset);
 			}
 			/*
 			 * We set the type to IO_NEW in case we are doing a
 			 * small write at EOF that is extending the file but
 			 * without needing an allocation. We need to update the
 			 * file size on I/O completion in this case so it is
 			 * the same case as having just allocated a new extent
 			 * that we are writing into for the first time.
 			 */
 			type = IO_NEW;
 			if (trylock_buffer(bh)) {
 				if (imap_valid)
 					all_bh = 1;
 				xfs_add_to_ioend(inode, bh, offset, type,
 						&ioend, !imap_valid);
 				count++;
 			} else {
 				imap_valid = 0;
 			}
 		} else if (PageUptodate(page)) {
 			ASSERT(buffer_mapped(bh));
 			imap_valid = 0;
 		}
 		if (!iohead)
 			iohead = ioend;
 	} while (offset += len, ((bh = bh->b_this_page) != head));
 	if (uptodate && bh == head)
 		SetPageUptodate(page);
 	xfs_start_page_writeback(page, 1, count);
 	if (ioend && imap_valid) {
 		xfs_off_t		end_index;
 		end_index = imap.br_startoff + imap.br_blockcount;
 		/* to bytes */
 		end_index <<= inode->i_blkbits;
 		/* to pages */
 		end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
 		/* check against file size */
 		if (end_index > last_index)
 			end_index = last_index;
 		xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
 					wbc, all_bh, end_index);
 	}
 	if (iohead)
 		xfs_submit_ioend(wbc, iohead);
 	return 0;
 error:
 	if (iohead)
 		xfs_cancel_ioend(iohead);
 	xfs_aops_discard_page(page);
 	ClearPageUptodate(page);
 	unlock_page(page);
 	return err;
 out_fail:
 	redirty_page_for_writepage(wbc, page);
 	unlock_page(page);
 	return 0;
 }
 /*
  * ->writepages handler.
  *
  * Clears the XFS_ITRUNCATED flag on the XFS inode that owns @mapping and
  * then delegates to generic_writepages(), returning its result unchanged.
  */
 STATIC int
 xfs_vm_writepages(
 	struct address_space	*mapping,
 	struct writeback_control *wbc)
 {
 	struct xfs_inode	*ip = XFS_I(mapping->host);

 	xfs_iflags_clear(ip, XFS_ITRUNCATED);

 	return generic_writepages(mapping, wbc);
 }
 /*
  * Called to move a page into a cleanable state so that it can then be
  * released. The page should already be clean, and it always has buffer
  * heads attached by the time we get here.
  *
  * If xfs_count_page_state() still reports delayed-allocation or unwritten
  * state for the page, a warning is raised and the page is not released.
  *
  * Returns 0 in that case, otherwise whatever try_to_free_buffers()
  * returns (1 if the page is ok to release, 0 otherwise).
  */
 STATIC int
 xfs_vm_releasepage(
 	struct page		*page,
 	gfp_t			gfp_mask)
 {
 	int			delay;
 	int			unwrit;

 	trace_xfs_releasepage(page->mapping->host, page, 0);

 	xfs_count_page_state(page, &delay, &unwrit);

 	/* short-circuit keeps it to one warning: delalloc is checked first */
 	if (WARN_ON(delay) || WARN_ON(unwrit))
 		return 0;

 	return try_to_free_buffers(page);
 }
 /*
  * Common block-mapping worker behind xfs_get_blocks() and
  * xfs_get_blocks_direct().
  *
  * @inode:	inode whose blocks are being mapped
  * @iblock:	file block index; shifted left by i_blkbits to form the byte
  *		offset handed to xfs_iomap()
  * @bh_result:	buffer head describing the request. b_size on entry is the
  *		number of bytes to map; on return its state flags, b_bdev and
  *		(for direct or multi-block requests) b_size may be updated.
  * @create:	non-zero selects BMAPI_WRITE instead of BMAPI_READ
  * @direct:	non-zero when called via xfs_get_blocks_direct()
  *
  * Returns 0 on success -- including the cases where nothing is mapped
  * because a direct read starts at or beyond i_size or xfs_iomap() hands
  * back no mapping -- or the negated error value from xfs_iomap().
  */
 STATIC int
 __xfs_get_blocks(
 	struct inode		*inode,
 	sector_t		iblock,
 	struct buffer_head	*bh_result,
 	int			create,
 	int			direct)
 {
 	int			flags = create ? BMAPI_WRITE : BMAPI_READ;
 	struct xfs_bmbt_irec	imap;
 	xfs_off_t		offset;
 	ssize_t			size;
 	int			nimap = 1;
 	int			new = 0;
 	int			error;
 	/* byte offset of the first block being mapped */
 	offset = (xfs_off_t)iblock << inode->i_blkbits;
 	/* the request must cover at least one filesystem block */
 	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
 	size = bh_result->b_size;
 	/*
 	 * A direct read that starts at or beyond the current inode size
 	 * returns without touching bh_result.
 	 */
 	if (!create && direct && offset >= i_size_read(inode))
 		return 0;
 	if (direct && create)
 		flags |= BMAPI_DIRECT;
 	error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap,
 			  &new);
 	/* xfs_iomap()'s error is negated before being returned */
 	if (error)
 		return -error;
 	/* no mapping came back: leave bh_result as it was */
 	if (nimap == 0)
 		return 0;
 	/* only extents that are neither holes nor delalloc are mapped here */
 	if (imap.br_startblock != HOLESTARTBLOCK &&
 	    imap.br_startblock != DELAYSTARTBLOCK) {
 		/*
 		 * For unwritten extents do not report a disk address on
 		 * the read case (treat as if we're reading into a hole).
 		 */
 		if (create || !ISUNWRITTEN(&imap))
 			xfs_map_buffer(inode, bh_result, &imap, offset);
 		if (create && ISUNWRITTEN(&imap)) {
 			/* direct writes additionally stash the inode in b_private */
 			if (direct)
 				bh_result->b_private = inode;
 			set_buffer_unwritten(bh_result);
 		}
 	}
 	/*
 	 * If this is a realtime file, data may be on a different device
 	 * to that pointed to from the buffer_head b_bdev currently.
 	 */
 	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
 	/*
 	 * If we previously allocated a block out beyond eof and we are now
 	 * coming back to use it then we will need to flag it as new even if it
 	 * has a disk address.
 	 *
 	 * With sub-block writes into unwritten extents we also need to mark
 	 * the buffer as new so that the unwritten parts of the buffer gets
 	 * correctly zeroed.
 	 */
 	if (create &&
 	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
 	     (offset >= i_size_read(inode)) ||
 	     (new || ISUNWRITTEN(&imap))))
 		set_buffer_new(bh_result);
 	/*
 	 * A delalloc mapping must never be seen on the direct path. On the
 	 * create path the buffer is marked uptodate, mapped and delay.
 	 */
 	if (imap.br_startblock == DELAYSTARTBLOCK) {
 		BUG_ON(direct);
 		if (create) {
 			set_buffer_uptodate(bh_result);
 			set_buffer_mapped(bh_result);
 			set_buffer_delay(bh_result);
 		}
 	}
 	/*
 	 * If this is O_DIRECT or the mpage code calling tell them how large
 	 * the mapping is, so that we can avoid repeated get_blocks calls.
 	 */
 	if (direct || size > (1 << inode->i_blkbits)) {
 		xfs_off_t		mapping_size;
 		/* blocks left in the extent from iblock onwards, then to bytes */
 		mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
 		mapping_size <<= inode->i_blkbits;
 		ASSERT(mapping_size > 0);
 		/* never report more than was asked for, nor more than LONG_MAX */
 		if (mapping_size > size)
 			mapping_size = size;
 		if (mapping_size > LONG_MAX)
 			mapping_size = LONG_MAX;
 		bh_result->b_size = mapping_size;
 	}
 	return 0;
 }
 /*
  * Block-mapping callback for the non-direct paths: forwards every
  * argument to __xfs_get_blocks() with its "direct" argument set to 0 and
  * returns that function's result.
  */
 int
 xfs_get_blocks(
 	struct inode		*inode,
 	sector_t		iblock,
 	struct buffer_head	*bh_result,
 	int			create)
 {
 	return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
 }
 /*
  * Block-mapping callback passed to the direct I/O code by
  * xfs_vm_direct_IO(): forwards every argument to __xfs_get_blocks() with
  * its "direct" argument set to 1 and returns that function's result.
  */
 STATIC int
 xfs_get_blocks_direct(
 	struct inode		*inode,
 	sector_t		iblock,
 	struct buffer_head	*bh_result,
 	int			create)
 {
 	return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
 }
 /*
  * Completion callback handed to the direct I/O code by xfs_vm_direct_IO().
  *
  * Records the final @offset and @size in the ioend hanging off
  * iocb->private, passes the ioend to xfs_finish_ioend(), and finally
  * clears iocb->private.
  */
 STATIC void
 xfs_end_io_direct(
 	struct kiocb	*iocb,
 	loff_t		offset,
 	ssize_t		size,
 	void		*private)
 {
 	xfs_ioend_t	*ioend = iocb->private;
 	int		wait = 0;

 	ioend->io_offset = offset;
 	ioend->io_size = size;

 	/*
 	 * Non-NULL private data means we need to issue a transaction to
 	 * convert a range from unwritten to written extents.  That has to
 	 * happen from process context, but aio+dio I/O completion runs in
 	 * irq context, so the work is deferred to a workqueue.  Synchronous
 	 * direct I/O would not strictly need this, but it takes the same
 	 * route to keep the code uniform and simpler.
 	 *
 	 * It is not quite that simple, though: synchronous direct I/O
 	 * requires the extent conversion to happen *before* we return to
 	 * userspace, so we have to wait for the conversion to complete.
 	 * The iocb tells us whether this is AIO or not; if it is
 	 * synchronous, xfs_finish_ioend() is asked to kick the workqueue
 	 * and wait for it.
 	 *
 	 * The core direct I/O code might be changed to always call the
 	 * completion handler in the future, in which case all this can
 	 * go away.
 	 */
 	if (ioend->io_type != IO_READ) {
 		if (private && size > 0) {
 			wait = is_sync_kiocb(iocb);
 		} else {
 			/*
 			 * A direct I/O write ioend starts its life in
 			 * unwritten state in case the write maps an
 			 * unwritten extent.  This write didn't map one, so
 			 * switch its completion handler.
 			 */
 			ioend->io_type = IO_NEW;
 		}
 	}
 	xfs_finish_ioend(ioend, wait);

 	/*
 	 * blockdev_direct_IO can return an error even after the I/O
 	 * completion handler was called.  Thus we need to protect
 	 * against double-freeing.
 	 */
 	iocb->private = NULL;
 }
 /*
  * ->direct_IO handler.
  *
  * Allocates an ioend (IO_UNWRITTEN for writes, IO_READ otherwise), parks
  * it in iocb->private and runs the request through
  * blockdev_direct_IO_no_locking() with xfs_get_blocks_direct() as the
  * mapping callback and xfs_end_io_direct() as the completion callback.
  *
  * Returns whatever blockdev_direct_IO_no_locking() returned.
  */
 STATIC ssize_t
 xfs_vm_direct_IO(
 	int			rw,
 	struct kiocb		*iocb,
 	const struct iovec	*iov,
 	loff_t			offset,
 	unsigned long		nr_segs)
 {
 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
 	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
 	ssize_t			rval;

 	if (rw == WRITE)
 		iocb->private = xfs_alloc_ioend(inode, IO_UNWRITTEN);
 	else
 		iocb->private = xfs_alloc_ioend(inode, IO_READ);

 	rval = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
 					     offset, nr_segs,
 					     xfs_get_blocks_direct,
 					     xfs_end_io_direct);

 	/* the request was queued: leave iocb->private alone */
 	if (rval == -EIOCBQUEUED)
 		return rval;

 	/*
 	 * xfs_end_io_direct() resets iocb->private to NULL when it runs, so
 	 * a pointer that is still set here is an ioend nobody consumed.
 	 */
 	if (unlikely(iocb->private))
 		xfs_destroy_ioend(iocb->private);

 	return rval;
 }
 /*
  * ->write_begin handler.
  *
  * Resets *pagep to NULL and then delegates to block_write_begin(), using
  * xfs_get_blocks() as the block-mapping callback. The result of
  * block_write_begin() is returned unchanged.
  */
 STATIC int
 xfs_vm_write_begin(
 	struct file		*file,
 	struct address_space	*mapping,
 	loff_t			pos,
 	unsigned		len,
 	unsigned		flags,
 	struct page		**pagep,
 	void			**fsdata)
 {
 	*pagep = NULL;
 	return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 								xfs_get_blocks);
 }
 /*
  * ->bmap handler.
  *
  * Flushes the inode's pages while holding the iolock shared, then lets
  * generic_block_bmap() do the lookup with xfs_get_blocks() as the
  * block-mapping callback and returns its result.
  */
 STATIC sector_t
 xfs_vm_bmap(
 	struct address_space	*mapping,
 	sector_t		block)
 {
 	struct inode		*inode = mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);

 	trace_xfs_vm_bmap(ip);

 	/*
 	 * Flush from offset 0 with an end argument of -1.
 	 * NOTE(review): presumably "the whole file" -- confirm against
 	 * xfs_flush_pages().
 	 */
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

 	return generic_block_bmap(mapping, block, xfs_get_blocks);
 }
 /*
  * ->readpage handler: hands @page to mpage_readpage() with
  * xfs_get_blocks() as the block-mapping callback and returns its result.
  * The file argument is not used.
  */
 STATIC int
 xfs_vm_readpage(
 	struct file		*unused,
 	struct page		*page)
 {
 	return mpage_readpage(page, xfs_get_blocks);
 }
 /*
  * ->readpages handler: hands the @nr_pages pages on @pages to
  * mpage_readpages() with xfs_get_blocks() as the block-mapping callback
  * and returns its result. The file argument is not used.
  */
 STATIC int
 xfs_vm_readpages(
 	struct file		*unused,
 	struct address_space	*mapping,
 	struct list_head	*pages,
 	unsigned		nr_pages)
 {
 	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
 }
 /*
  * Address space operations table for XFS.
  *
  * Entries are grouped by which implementation they point at; because
  * every member uses a designated initializer the order has no effect on
  * the resulting structure.
  */
 const struct address_space_operations xfs_address_space_operations = {
 	/* handlers carrying the xfs_vm_ prefix */
 	.readpage		= xfs_vm_readpage,
 	.readpages		= xfs_vm_readpages,
 	.writepage		= xfs_vm_writepage,
 	.writepages		= xfs_vm_writepages,
 	.write_begin		= xfs_vm_write_begin,
 	.direct_IO		= xfs_vm_direct_IO,
 	.bmap			= xfs_vm_bmap,
 	.releasepage		= xfs_vm_releasepage,
 	.invalidatepage		= xfs_vm_invalidatepage,

 	/* handlers without the xfs_ prefix (generic/block helpers by name) */
 	.write_end		= generic_write_end,
 	.sync_page		= block_sync_page,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
 	.error_remove_page	= generic_error_remove_page,
 };