Commit 54c807e71d5ac59dee56c685f2b66e27cd54c475

Authored by Jan Kara
Committed by Al Viro
1 parent da2d8455ed

fs: Fix possible use-after-free with AIO

Running AIO pins the inode in memory via the file reference. Once the AIO
is completed using aio_complete(), the file reference is put and the inode
can be freed from memory. So we have to be sure that calling aio_complete()
is the last thing we do with the inode.

CC: Christoph Hellwig <hch@infradead.org>
CC: Jens Axboe <axboe@kernel.dk>
CC: Jeff Moyer <jmoyer@redhat.com>
CC: stable@vger.kernel.org
Acked-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

Showing 1 changed file with 1 addition and 1 deletion
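
The change itself is a one-line move inside dio_complete(): inode_dio_done()
touches the inode, so it must run before aio_complete(), which puts the file
reference that pins the inode in memory. A condensed before/after view of the
hunk (taken from the inline diff below; surrounding code omitted):

    /* before: aio_complete() can let the inode be freed, then inode_dio_done() touches it */
    } else {
            if (is_async)
                    aio_complete(dio->iocb, ret, 0);
            inode_dio_done(dio->inode);
    }

    /* after: finish all work on the inode first, call aio_complete() last */
    } else {
            inode_dio_done(dio->inode);
            if (is_async)
                    aio_complete(dio->iocb, ret, 0);
    }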

1 /* 1 /*
2 * fs/direct-io.c 2 * fs/direct-io.c
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds. 4 * Copyright (C) 2002, Linus Torvalds.
5 * 5 *
6 * O_DIRECT 6 * O_DIRECT
7 * 7 *
8 * 04Jul2002 Andrew Morton 8 * 04Jul2002 Andrew Morton
9 * Initial version 9 * Initial version
10 * 11Sep2002 janetinc@us.ibm.com 10 * 11Sep2002 janetinc@us.ibm.com
11 * added readv/writev support. 11 * added readv/writev support.
12 * 29Oct2002 Andrew Morton 12 * 29Oct2002 Andrew Morton
13 * rewrote bio_add_page() support. 13 * rewrote bio_add_page() support.
14 * 30Oct2002 pbadari@us.ibm.com 14 * 30Oct2002 pbadari@us.ibm.com
15 * added support for non-aligned IO. 15 * added support for non-aligned IO.
16 * 06Nov2002 pbadari@us.ibm.com 16 * 06Nov2002 pbadari@us.ibm.com
17 * added asynchronous IO support. 17 * added asynchronous IO support.
18 * 21Jul2003 nathans@sgi.com 18 * 21Jul2003 nathans@sgi.com
19 * added IO completion notifier. 19 * added IO completion notifier.
20 */ 20 */
21 21
22 #include <linux/kernel.h> 22 #include <linux/kernel.h>
23 #include <linux/module.h> 23 #include <linux/module.h>
24 #include <linux/types.h> 24 #include <linux/types.h>
25 #include <linux/fs.h> 25 #include <linux/fs.h>
26 #include <linux/mm.h> 26 #include <linux/mm.h>
27 #include <linux/slab.h> 27 #include <linux/slab.h>
28 #include <linux/highmem.h> 28 #include <linux/highmem.h>
29 #include <linux/pagemap.h> 29 #include <linux/pagemap.h>
30 #include <linux/task_io_accounting_ops.h> 30 #include <linux/task_io_accounting_ops.h>
31 #include <linux/bio.h> 31 #include <linux/bio.h>
32 #include <linux/wait.h> 32 #include <linux/wait.h>
33 #include <linux/err.h> 33 #include <linux/err.h>
34 #include <linux/blkdev.h> 34 #include <linux/blkdev.h>
35 #include <linux/buffer_head.h> 35 #include <linux/buffer_head.h>
36 #include <linux/rwsem.h> 36 #include <linux/rwsem.h>
37 #include <linux/uio.h> 37 #include <linux/uio.h>
38 #include <linux/atomic.h> 38 #include <linux/atomic.h>
39 #include <linux/prefetch.h> 39 #include <linux/prefetch.h>
40 40
41 /* 41 /*
42 * How many user pages to map in one call to get_user_pages(). This determines 42 * How many user pages to map in one call to get_user_pages(). This determines
43 * the size of a structure in the slab cache 43 * the size of a structure in the slab cache
44 */ 44 */
45 #define DIO_PAGES 64 45 #define DIO_PAGES 64
46 46
47 /* 47 /*
48 * This code generally works in units of "dio_blocks". A dio_block is 48 * This code generally works in units of "dio_blocks". A dio_block is
49 * somewhere between the hard sector size and the filesystem block size. it 49 * somewhere between the hard sector size and the filesystem block size. it
50 * is determined on a per-invocation basis. When talking to the filesystem 50 * is determined on a per-invocation basis. When talking to the filesystem
51 * we need to convert dio_blocks to fs_blocks by scaling the dio_block quantity 51 * we need to convert dio_blocks to fs_blocks by scaling the dio_block quantity
52 * down by dio->blkfactor. Similarly, fs-blocksize quantities are converted 52 * down by dio->blkfactor. Similarly, fs-blocksize quantities are converted
53 * to bio_block quantities by shifting left by blkfactor. 53 * to bio_block quantities by shifting left by blkfactor.
54 * 54 *
55 * If blkfactor is zero then the user's request was aligned to the filesystem's 55 * If blkfactor is zero then the user's request was aligned to the filesystem's
56 * blocksize. 56 * blocksize.
57 */ 57 */
58 58
59 /* dio_state only used in the submission path */ 59 /* dio_state only used in the submission path */
60 60
61 struct dio_submit { 61 struct dio_submit {
62 struct bio *bio; /* bio under assembly */ 62 struct bio *bio; /* bio under assembly */
63 unsigned blkbits; /* doesn't change */ 63 unsigned blkbits; /* doesn't change */
64 unsigned blkfactor; /* When we're using an alignment which 64 unsigned blkfactor; /* When we're using an alignment which
65 is finer than the filesystem's soft 65 is finer than the filesystem's soft
66 blocksize, this specifies how much 66 blocksize, this specifies how much
67 finer. blkfactor=2 means 1/4-block 67 finer. blkfactor=2 means 1/4-block
68 alignment. Does not change */ 68 alignment. Does not change */
69 unsigned start_zero_done; /* flag: sub-blocksize zeroing has 69 unsigned start_zero_done; /* flag: sub-blocksize zeroing has
70 been performed at the start of a 70 been performed at the start of a
71 write */ 71 write */
72 int pages_in_io; /* approximate total IO pages */ 72 int pages_in_io; /* approximate total IO pages */
73 size_t size; /* total request size (doesn't change)*/ 73 size_t size; /* total request size (doesn't change)*/
74 sector_t block_in_file; /* Current offset into the underlying 74 sector_t block_in_file; /* Current offset into the underlying
75 file in dio_block units. */ 75 file in dio_block units. */
76 unsigned blocks_available; /* At block_in_file. changes */ 76 unsigned blocks_available; /* At block_in_file. changes */
77 int reap_counter; /* rate limit reaping */ 77 int reap_counter; /* rate limit reaping */
78 sector_t final_block_in_request;/* doesn't change */ 78 sector_t final_block_in_request;/* doesn't change */
79 unsigned first_block_in_page; /* doesn't change, Used only once */ 79 unsigned first_block_in_page; /* doesn't change, Used only once */
80 int boundary; /* prev block is at a boundary */ 80 int boundary; /* prev block is at a boundary */
81 get_block_t *get_block; /* block mapping function */ 81 get_block_t *get_block; /* block mapping function */
82 dio_submit_t *submit_io; /* IO submition function */ 82 dio_submit_t *submit_io; /* IO submition function */
83 83
84 loff_t logical_offset_in_bio; /* current first logical block in bio */ 84 loff_t logical_offset_in_bio; /* current first logical block in bio */
85 sector_t final_block_in_bio; /* current final block in bio + 1 */ 85 sector_t final_block_in_bio; /* current final block in bio + 1 */
86 sector_t next_block_for_io; /* next block to be put under IO, 86 sector_t next_block_for_io; /* next block to be put under IO,
87 in dio_blocks units */ 87 in dio_blocks units */
88 88
89 /* 89 /*
90 * Deferred addition of a page to the dio. These variables are 90 * Deferred addition of a page to the dio. These variables are
91 * private to dio_send_cur_page(), submit_page_section() and 91 * private to dio_send_cur_page(), submit_page_section() and
92 * dio_bio_add_page(). 92 * dio_bio_add_page().
93 */ 93 */
94 struct page *cur_page; /* The page */ 94 struct page *cur_page; /* The page */
95 unsigned cur_page_offset; /* Offset into it, in bytes */ 95 unsigned cur_page_offset; /* Offset into it, in bytes */
96 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ 96 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */
97 sector_t cur_page_block; /* Where it starts */ 97 sector_t cur_page_block; /* Where it starts */
98 loff_t cur_page_fs_offset; /* Offset in file */ 98 loff_t cur_page_fs_offset; /* Offset in file */
99 99
100 /* 100 /*
101 * Page fetching state. These variables belong to dio_refill_pages(). 101 * Page fetching state. These variables belong to dio_refill_pages().
102 */ 102 */
103 int curr_page; /* changes */ 103 int curr_page; /* changes */
104 int total_pages; /* doesn't change */ 104 int total_pages; /* doesn't change */
105 unsigned long curr_user_address;/* changes */ 105 unsigned long curr_user_address;/* changes */
106 106
107 /* 107 /*
108 * Page queue. These variables belong to dio_refill_pages() and 108 * Page queue. These variables belong to dio_refill_pages() and
109 * dio_get_page(). 109 * dio_get_page().
110 */ 110 */
111 unsigned head; /* next page to process */ 111 unsigned head; /* next page to process */
112 unsigned tail; /* last valid page + 1 */ 112 unsigned tail; /* last valid page + 1 */
113 }; 113 };
114 114
115 /* dio_state communicated between submission path and end_io */ 115 /* dio_state communicated between submission path and end_io */
116 struct dio { 116 struct dio {
117 int flags; /* doesn't change */ 117 int flags; /* doesn't change */
118 int rw; 118 int rw;
119 struct inode *inode; 119 struct inode *inode;
120 loff_t i_size; /* i_size when submitted */ 120 loff_t i_size; /* i_size when submitted */
121 dio_iodone_t *end_io; /* IO completion function */ 121 dio_iodone_t *end_io; /* IO completion function */
122 122
123 void *private; /* copy from map_bh.b_private */ 123 void *private; /* copy from map_bh.b_private */
124 124
125 /* BIO completion state */ 125 /* BIO completion state */
126 spinlock_t bio_lock; /* protects BIO fields below */ 126 spinlock_t bio_lock; /* protects BIO fields below */
127 int page_errors; /* errno from get_user_pages() */ 127 int page_errors; /* errno from get_user_pages() */
128 int is_async; /* is IO async ? */ 128 int is_async; /* is IO async ? */
129 int io_error; /* IO error in completion path */ 129 int io_error; /* IO error in completion path */
130 unsigned long refcount; /* direct_io_worker() and bios */ 130 unsigned long refcount; /* direct_io_worker() and bios */
131 struct bio *bio_list; /* singly linked via bi_private */ 131 struct bio *bio_list; /* singly linked via bi_private */
132 struct task_struct *waiter; /* waiting task (NULL if none) */ 132 struct task_struct *waiter; /* waiting task (NULL if none) */
133 133
134 /* AIO related stuff */ 134 /* AIO related stuff */
135 struct kiocb *iocb; /* kiocb */ 135 struct kiocb *iocb; /* kiocb */
136 ssize_t result; /* IO result */ 136 ssize_t result; /* IO result */
137 137
138 /* 138 /*
139 * pages[] (and any fields placed after it) are not zeroed out at 139 * pages[] (and any fields placed after it) are not zeroed out at
140 * allocation time. Don't add new fields after pages[] unless you 140 * allocation time. Don't add new fields after pages[] unless you
141 * wish that they not be zeroed. 141 * wish that they not be zeroed.
142 */ 142 */
143 struct page *pages[DIO_PAGES]; /* page buffer */ 143 struct page *pages[DIO_PAGES]; /* page buffer */
144 } ____cacheline_aligned_in_smp; 144 } ____cacheline_aligned_in_smp;
145 145
146 static struct kmem_cache *dio_cache __read_mostly; 146 static struct kmem_cache *dio_cache __read_mostly;
147 147
148 /* 148 /*
149 * How many pages are in the queue? 149 * How many pages are in the queue?
150 */ 150 */
151 static inline unsigned dio_pages_present(struct dio_submit *sdio) 151 static inline unsigned dio_pages_present(struct dio_submit *sdio)
152 { 152 {
153 return sdio->tail - sdio->head; 153 return sdio->tail - sdio->head;
154 } 154 }
155 155
156 /* 156 /*
157 * Go grab and pin some userspace pages. Typically we'll get 64 at a time. 157 * Go grab and pin some userspace pages. Typically we'll get 64 at a time.
158 */ 158 */
159 static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) 159 static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
160 { 160 {
161 int ret; 161 int ret;
162 int nr_pages; 162 int nr_pages;
163 163
164 nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES); 164 nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES);
165 ret = get_user_pages_fast( 165 ret = get_user_pages_fast(
166 sdio->curr_user_address, /* Where from? */ 166 sdio->curr_user_address, /* Where from? */
167 nr_pages, /* How many pages? */ 167 nr_pages, /* How many pages? */
168 dio->rw == READ, /* Write to memory? */ 168 dio->rw == READ, /* Write to memory? */
169 &dio->pages[0]); /* Put results here */ 169 &dio->pages[0]); /* Put results here */
170 170
171 if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { 171 if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
172 struct page *page = ZERO_PAGE(0); 172 struct page *page = ZERO_PAGE(0);
173 /* 173 /*
174 * A memory fault, but the filesystem has some outstanding 174 * A memory fault, but the filesystem has some outstanding
175 * mapped blocks. We need to use those blocks up to avoid 175 * mapped blocks. We need to use those blocks up to avoid
176 * leaking stale data in the file. 176 * leaking stale data in the file.
177 */ 177 */
178 if (dio->page_errors == 0) 178 if (dio->page_errors == 0)
179 dio->page_errors = ret; 179 dio->page_errors = ret;
180 page_cache_get(page); 180 page_cache_get(page);
181 dio->pages[0] = page; 181 dio->pages[0] = page;
182 sdio->head = 0; 182 sdio->head = 0;
183 sdio->tail = 1; 183 sdio->tail = 1;
184 ret = 0; 184 ret = 0;
185 goto out; 185 goto out;
186 } 186 }
187 187
188 if (ret >= 0) { 188 if (ret >= 0) {
189 sdio->curr_user_address += ret * PAGE_SIZE; 189 sdio->curr_user_address += ret * PAGE_SIZE;
190 sdio->curr_page += ret; 190 sdio->curr_page += ret;
191 sdio->head = 0; 191 sdio->head = 0;
192 sdio->tail = ret; 192 sdio->tail = ret;
193 ret = 0; 193 ret = 0;
194 } 194 }
195 out: 195 out:
196 return ret; 196 return ret;
197 } 197 }
198 198
199 /* 199 /*
200 * Get another userspace page. Returns an ERR_PTR on error. Pages are 200 * Get another userspace page. Returns an ERR_PTR on error. Pages are
201 * buffered inside the dio so that we can call get_user_pages() against a 201 * buffered inside the dio so that we can call get_user_pages() against a
202 * decent number of pages, less frequently. To provide nicer use of the 202 * decent number of pages, less frequently. To provide nicer use of the
203 * L1 cache. 203 * L1 cache.
204 */ 204 */
205 static inline struct page *dio_get_page(struct dio *dio, 205 static inline struct page *dio_get_page(struct dio *dio,
206 struct dio_submit *sdio) 206 struct dio_submit *sdio)
207 { 207 {
208 if (dio_pages_present(sdio) == 0) { 208 if (dio_pages_present(sdio) == 0) {
209 int ret; 209 int ret;
210 210
211 ret = dio_refill_pages(dio, sdio); 211 ret = dio_refill_pages(dio, sdio);
212 if (ret) 212 if (ret)
213 return ERR_PTR(ret); 213 return ERR_PTR(ret);
214 BUG_ON(dio_pages_present(sdio) == 0); 214 BUG_ON(dio_pages_present(sdio) == 0);
215 } 215 }
216 return dio->pages[sdio->head++]; 216 return dio->pages[sdio->head++];
217 } 217 }
218 218
219 /** 219 /**
220 * dio_complete() - called when all DIO BIO I/O has been completed 220 * dio_complete() - called when all DIO BIO I/O has been completed
221 * @offset: the byte offset in the file of the completed operation 221 * @offset: the byte offset in the file of the completed operation
222 * 222 *
223 * This releases locks as dictated by the locking type, lets interested parties 223 * This releases locks as dictated by the locking type, lets interested parties
224 * know that a DIO operation has completed, and calculates the resulting return 224 * know that a DIO operation has completed, and calculates the resulting return
225 * code for the operation. 225 * code for the operation.
226 * 226 *
227 * It lets the filesystem know if it registered an interest earlier via 227 * It lets the filesystem know if it registered an interest earlier via
228 * get_block. Pass the private field of the map buffer_head so that 228 * get_block. Pass the private field of the map buffer_head so that
229 * filesystems can use it to hold additional state between get_block calls and 229 * filesystems can use it to hold additional state between get_block calls and
230 * dio_complete. 230 * dio_complete.
231 */ 231 */
232 static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async) 232 static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async)
233 { 233 {
234 ssize_t transferred = 0; 234 ssize_t transferred = 0;
235 235
236 /* 236 /*
237 * AIO submission can race with bio completion to get here while 237 * AIO submission can race with bio completion to get here while
238 * expecting to have the last io completed by bio completion. 238 * expecting to have the last io completed by bio completion.
239 * In that case -EIOCBQUEUED is in fact not an error we want 239 * In that case -EIOCBQUEUED is in fact not an error we want
240 * to preserve through this call. 240 * to preserve through this call.
241 */ 241 */
242 if (ret == -EIOCBQUEUED) 242 if (ret == -EIOCBQUEUED)
243 ret = 0; 243 ret = 0;
244 244
245 if (dio->result) { 245 if (dio->result) {
246 transferred = dio->result; 246 transferred = dio->result;
247 247
248 /* Check for short read case */ 248 /* Check for short read case */
249 if ((dio->rw == READ) && ((offset + transferred) > dio->i_size)) 249 if ((dio->rw == READ) && ((offset + transferred) > dio->i_size))
250 transferred = dio->i_size - offset; 250 transferred = dio->i_size - offset;
251 } 251 }
252 252
253 if (ret == 0) 253 if (ret == 0)
254 ret = dio->page_errors; 254 ret = dio->page_errors;
255 if (ret == 0) 255 if (ret == 0)
256 ret = dio->io_error; 256 ret = dio->io_error;
257 if (ret == 0) 257 if (ret == 0)
258 ret = transferred; 258 ret = transferred;
259 259
260 if (dio->end_io && dio->result) { 260 if (dio->end_io && dio->result) {
261 dio->end_io(dio->iocb, offset, transferred, 261 dio->end_io(dio->iocb, offset, transferred,
262 dio->private, ret, is_async); 262 dio->private, ret, is_async);
263 } else { 263 } else {
264 inode_dio_done(dio->inode);
264 if (is_async) 265 if (is_async)
265 aio_complete(dio->iocb, ret, 0); 266 aio_complete(dio->iocb, ret, 0);
266 inode_dio_done(dio->inode);
267 } 267 }
268 268
269 return ret; 269 return ret;
270 } 270 }
271 271
272 static int dio_bio_complete(struct dio *dio, struct bio *bio); 272 static int dio_bio_complete(struct dio *dio, struct bio *bio);
273 /* 273 /*
274 * Asynchronous IO callback. 274 * Asynchronous IO callback.
275 */ 275 */
276 static void dio_bio_end_aio(struct bio *bio, int error) 276 static void dio_bio_end_aio(struct bio *bio, int error)
277 { 277 {
278 struct dio *dio = bio->bi_private; 278 struct dio *dio = bio->bi_private;
279 unsigned long remaining; 279 unsigned long remaining;
280 unsigned long flags; 280 unsigned long flags;
281 281
282 /* cleanup the bio */ 282 /* cleanup the bio */
283 dio_bio_complete(dio, bio); 283 dio_bio_complete(dio, bio);
284 284
285 spin_lock_irqsave(&dio->bio_lock, flags); 285 spin_lock_irqsave(&dio->bio_lock, flags);
286 remaining = --dio->refcount; 286 remaining = --dio->refcount;
287 if (remaining == 1 && dio->waiter) 287 if (remaining == 1 && dio->waiter)
288 wake_up_process(dio->waiter); 288 wake_up_process(dio->waiter);
289 spin_unlock_irqrestore(&dio->bio_lock, flags); 289 spin_unlock_irqrestore(&dio->bio_lock, flags);
290 290
291 if (remaining == 0) { 291 if (remaining == 0) {
292 dio_complete(dio, dio->iocb->ki_pos, 0, true); 292 dio_complete(dio, dio->iocb->ki_pos, 0, true);
293 kmem_cache_free(dio_cache, dio); 293 kmem_cache_free(dio_cache, dio);
294 } 294 }
295 } 295 }
296 296
297 /* 297 /*
298 * The BIO completion handler simply queues the BIO up for the process-context 298 * The BIO completion handler simply queues the BIO up for the process-context
299 * handler. 299 * handler.
300 * 300 *
301 * During I/O bi_private points at the dio. After I/O, bi_private is used to 301 * During I/O bi_private points at the dio. After I/O, bi_private is used to
302 * implement a singly-linked list of completed BIOs, at dio->bio_list. 302 * implement a singly-linked list of completed BIOs, at dio->bio_list.
303 */ 303 */
304 static void dio_bio_end_io(struct bio *bio, int error) 304 static void dio_bio_end_io(struct bio *bio, int error)
305 { 305 {
306 struct dio *dio = bio->bi_private; 306 struct dio *dio = bio->bi_private;
307 unsigned long flags; 307 unsigned long flags;
308 308
309 spin_lock_irqsave(&dio->bio_lock, flags); 309 spin_lock_irqsave(&dio->bio_lock, flags);
310 bio->bi_private = dio->bio_list; 310 bio->bi_private = dio->bio_list;
311 dio->bio_list = bio; 311 dio->bio_list = bio;
312 if (--dio->refcount == 1 && dio->waiter) 312 if (--dio->refcount == 1 && dio->waiter)
313 wake_up_process(dio->waiter); 313 wake_up_process(dio->waiter);
314 spin_unlock_irqrestore(&dio->bio_lock, flags); 314 spin_unlock_irqrestore(&dio->bio_lock, flags);
315 } 315 }
316 316
317 /** 317 /**
318 * dio_end_io - handle the end io action for the given bio 318 * dio_end_io - handle the end io action for the given bio
319 * @bio: The direct io bio thats being completed 319 * @bio: The direct io bio thats being completed
320 * @error: Error if there was one 320 * @error: Error if there was one
321 * 321 *
322 * This is meant to be called by any filesystem that uses their own dio_submit_t 322 * This is meant to be called by any filesystem that uses their own dio_submit_t
323 * so that the DIO specific endio actions are dealt with after the filesystem 323 * so that the DIO specific endio actions are dealt with after the filesystem
324 * has done it's completion work. 324 * has done it's completion work.
325 */ 325 */
326 void dio_end_io(struct bio *bio, int error) 326 void dio_end_io(struct bio *bio, int error)
327 { 327 {
328 struct dio *dio = bio->bi_private; 328 struct dio *dio = bio->bi_private;
329 329
330 if (dio->is_async) 330 if (dio->is_async)
331 dio_bio_end_aio(bio, error); 331 dio_bio_end_aio(bio, error);
332 else 332 else
333 dio_bio_end_io(bio, error); 333 dio_bio_end_io(bio, error);
334 } 334 }
335 EXPORT_SYMBOL_GPL(dio_end_io); 335 EXPORT_SYMBOL_GPL(dio_end_io);
336 336
337 static inline void 337 static inline void
338 dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, 338 dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
339 struct block_device *bdev, 339 struct block_device *bdev,
340 sector_t first_sector, int nr_vecs) 340 sector_t first_sector, int nr_vecs)
341 { 341 {
342 struct bio *bio; 342 struct bio *bio;
343 343
344 /* 344 /*
345 * bio_alloc() is guaranteed to return a bio when called with 345 * bio_alloc() is guaranteed to return a bio when called with
346 * __GFP_WAIT and we request a valid number of vectors. 346 * __GFP_WAIT and we request a valid number of vectors.
347 */ 347 */
348 bio = bio_alloc(GFP_KERNEL, nr_vecs); 348 bio = bio_alloc(GFP_KERNEL, nr_vecs);
349 349
350 bio->bi_bdev = bdev; 350 bio->bi_bdev = bdev;
351 bio->bi_sector = first_sector; 351 bio->bi_sector = first_sector;
352 if (dio->is_async) 352 if (dio->is_async)
353 bio->bi_end_io = dio_bio_end_aio; 353 bio->bi_end_io = dio_bio_end_aio;
354 else 354 else
355 bio->bi_end_io = dio_bio_end_io; 355 bio->bi_end_io = dio_bio_end_io;
356 356
357 sdio->bio = bio; 357 sdio->bio = bio;
358 sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; 358 sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
359 } 359 }
360 360
361 /* 361 /*
362 * In the AIO read case we speculatively dirty the pages before starting IO. 362 * In the AIO read case we speculatively dirty the pages before starting IO.
363 * During IO completion, any of these pages which happen to have been written 363 * During IO completion, any of these pages which happen to have been written
364 * back will be redirtied by bio_check_pages_dirty(). 364 * back will be redirtied by bio_check_pages_dirty().
365 * 365 *
366 * bios hold a dio reference between submit_bio and ->end_io. 366 * bios hold a dio reference between submit_bio and ->end_io.
367 */ 367 */
368 static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) 368 static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
369 { 369 {
370 struct bio *bio = sdio->bio; 370 struct bio *bio = sdio->bio;
371 unsigned long flags; 371 unsigned long flags;
372 372
373 bio->bi_private = dio; 373 bio->bi_private = dio;
374 374
375 spin_lock_irqsave(&dio->bio_lock, flags); 375 spin_lock_irqsave(&dio->bio_lock, flags);
376 dio->refcount++; 376 dio->refcount++;
377 spin_unlock_irqrestore(&dio->bio_lock, flags); 377 spin_unlock_irqrestore(&dio->bio_lock, flags);
378 378
379 if (dio->is_async && dio->rw == READ) 379 if (dio->is_async && dio->rw == READ)
380 bio_set_pages_dirty(bio); 380 bio_set_pages_dirty(bio);
381 381
382 if (sdio->submit_io) 382 if (sdio->submit_io)
383 sdio->submit_io(dio->rw, bio, dio->inode, 383 sdio->submit_io(dio->rw, bio, dio->inode,
384 sdio->logical_offset_in_bio); 384 sdio->logical_offset_in_bio);
385 else 385 else
386 submit_bio(dio->rw, bio); 386 submit_bio(dio->rw, bio);
387 387
388 sdio->bio = NULL; 388 sdio->bio = NULL;
389 sdio->boundary = 0; 389 sdio->boundary = 0;
390 sdio->logical_offset_in_bio = 0; 390 sdio->logical_offset_in_bio = 0;
391 } 391 }
392 392
393 /* 393 /*
394 * Release any resources in case of a failure 394 * Release any resources in case of a failure
395 */ 395 */
396 static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) 396 static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
397 { 397 {
398 while (dio_pages_present(sdio)) 398 while (dio_pages_present(sdio))
399 page_cache_release(dio_get_page(dio, sdio)); 399 page_cache_release(dio_get_page(dio, sdio));
400 } 400 }
401 401
402 /* 402 /*
403 * Wait for the next BIO to complete. Remove it and return it. NULL is 403 * Wait for the next BIO to complete. Remove it and return it. NULL is
404 * returned once all BIOs have been completed. This must only be called once 404 * returned once all BIOs have been completed. This must only be called once
405 * all bios have been issued so that dio->refcount can only decrease. This 405 * all bios have been issued so that dio->refcount can only decrease. This
406 * requires that that the caller hold a reference on the dio. 406 * requires that that the caller hold a reference on the dio.
407 */ 407 */
408 static struct bio *dio_await_one(struct dio *dio) 408 static struct bio *dio_await_one(struct dio *dio)
409 { 409 {
410 unsigned long flags; 410 unsigned long flags;
411 struct bio *bio = NULL; 411 struct bio *bio = NULL;
412 412
413 spin_lock_irqsave(&dio->bio_lock, flags); 413 spin_lock_irqsave(&dio->bio_lock, flags);
414 414
415 /* 415 /*
416 * Wait as long as the list is empty and there are bios in flight. bio 416 * Wait as long as the list is empty and there are bios in flight. bio
417 * completion drops the count, maybe adds to the list, and wakes while 417 * completion drops the count, maybe adds to the list, and wakes while
418 * holding the bio_lock so we don't need set_current_state()'s barrier 418 * holding the bio_lock so we don't need set_current_state()'s barrier
419 * and can call it after testing our condition. 419 * and can call it after testing our condition.
420 */ 420 */
421 while (dio->refcount > 1 && dio->bio_list == NULL) { 421 while (dio->refcount > 1 && dio->bio_list == NULL) {
422 __set_current_state(TASK_UNINTERRUPTIBLE); 422 __set_current_state(TASK_UNINTERRUPTIBLE);
423 dio->waiter = current; 423 dio->waiter = current;
424 spin_unlock_irqrestore(&dio->bio_lock, flags); 424 spin_unlock_irqrestore(&dio->bio_lock, flags);
425 io_schedule(); 425 io_schedule();
426 /* wake up sets us TASK_RUNNING */ 426 /* wake up sets us TASK_RUNNING */
427 spin_lock_irqsave(&dio->bio_lock, flags); 427 spin_lock_irqsave(&dio->bio_lock, flags);
428 dio->waiter = NULL; 428 dio->waiter = NULL;
429 } 429 }
430 if (dio->bio_list) { 430 if (dio->bio_list) {
431 bio = dio->bio_list; 431 bio = dio->bio_list;
432 dio->bio_list = bio->bi_private; 432 dio->bio_list = bio->bi_private;
433 } 433 }
434 spin_unlock_irqrestore(&dio->bio_lock, flags); 434 spin_unlock_irqrestore(&dio->bio_lock, flags);
435 return bio; 435 return bio;
436 } 436 }
437 437
438 /* 438 /*
439 * Process one completed BIO. No locks are held. 439 * Process one completed BIO. No locks are held.
440 */ 440 */
441 static int dio_bio_complete(struct dio *dio, struct bio *bio) 441 static int dio_bio_complete(struct dio *dio, struct bio *bio)
442 { 442 {
443 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 443 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
444 struct bio_vec *bvec = bio->bi_io_vec; 444 struct bio_vec *bvec = bio->bi_io_vec;
445 int page_no; 445 int page_no;
446 446
447 if (!uptodate) 447 if (!uptodate)
448 dio->io_error = -EIO; 448 dio->io_error = -EIO;
449 449
450 if (dio->is_async && dio->rw == READ) { 450 if (dio->is_async && dio->rw == READ) {
451 bio_check_pages_dirty(bio); /* transfers ownership */ 451 bio_check_pages_dirty(bio); /* transfers ownership */
452 } else { 452 } else {
453 for (page_no = 0; page_no < bio->bi_vcnt; page_no++) { 453 for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
454 struct page *page = bvec[page_no].bv_page; 454 struct page *page = bvec[page_no].bv_page;
455 455
456 if (dio->rw == READ && !PageCompound(page)) 456 if (dio->rw == READ && !PageCompound(page))
457 set_page_dirty_lock(page); 457 set_page_dirty_lock(page);
458 page_cache_release(page); 458 page_cache_release(page);
459 } 459 }
460 bio_put(bio); 460 bio_put(bio);
461 } 461 }
462 return uptodate ? 0 : -EIO; 462 return uptodate ? 0 : -EIO;
463 } 463 }
464 464
465 /* 465 /*
466 * Wait on and process all in-flight BIOs. This must only be called once 466 * Wait on and process all in-flight BIOs. This must only be called once
467 * all bios have been issued so that the refcount can only decrease. 467 * all bios have been issued so that the refcount can only decrease.
468 * This just waits for all bios to make it through dio_bio_complete. IO 468 * This just waits for all bios to make it through dio_bio_complete. IO
469 * errors are propagated through dio->io_error and should be propagated via 469 * errors are propagated through dio->io_error and should be propagated via
470 * dio_complete(). 470 * dio_complete().
471 */ 471 */
472 static void dio_await_completion(struct dio *dio) 472 static void dio_await_completion(struct dio *dio)
473 { 473 {
474 struct bio *bio; 474 struct bio *bio;
475 do { 475 do {
476 bio = dio_await_one(dio); 476 bio = dio_await_one(dio);
477 if (bio) 477 if (bio)
478 dio_bio_complete(dio, bio); 478 dio_bio_complete(dio, bio);
479 } while (bio); 479 } while (bio);
480 } 480 }
481 481
482 /* 482 /*
483 * A really large O_DIRECT read or write can generate a lot of BIOs. So 483 * A really large O_DIRECT read or write can generate a lot of BIOs. So
484 * to keep the memory consumption sane we periodically reap any completed BIOs 484 * to keep the memory consumption sane we periodically reap any completed BIOs
485 * during the BIO generation phase. 485 * during the BIO generation phase.
486 * 486 *
487 * This also helps to limit the peak amount of pinned userspace memory. 487 * This also helps to limit the peak amount of pinned userspace memory.
488 */ 488 */
489 static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) 489 static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
490 { 490 {
491 int ret = 0; 491 int ret = 0;
492 492
493 if (sdio->reap_counter++ >= 64) { 493 if (sdio->reap_counter++ >= 64) {
494 while (dio->bio_list) { 494 while (dio->bio_list) {
495 unsigned long flags; 495 unsigned long flags;
496 struct bio *bio; 496 struct bio *bio;
497 int ret2; 497 int ret2;
498 498
499 spin_lock_irqsave(&dio->bio_lock, flags); 499 spin_lock_irqsave(&dio->bio_lock, flags);
500 bio = dio->bio_list; 500 bio = dio->bio_list;
501 dio->bio_list = bio->bi_private; 501 dio->bio_list = bio->bi_private;
502 spin_unlock_irqrestore(&dio->bio_lock, flags); 502 spin_unlock_irqrestore(&dio->bio_lock, flags);
503 ret2 = dio_bio_complete(dio, bio); 503 ret2 = dio_bio_complete(dio, bio);
504 if (ret == 0) 504 if (ret == 0)
505 ret = ret2; 505 ret = ret2;
506 } 506 }
507 sdio->reap_counter = 0; 507 sdio->reap_counter = 0;
508 } 508 }
509 return ret; 509 return ret;
510 } 510 }
511 511
512 /* 512 /*
513 * Call into the fs to map some more disk blocks. We record the current number 513 * Call into the fs to map some more disk blocks. We record the current number
514 * of available blocks at sdio->blocks_available. These are in units of the 514 * of available blocks at sdio->blocks_available. These are in units of the
515 * fs blocksize, (1 << inode->i_blkbits). 515 * fs blocksize, (1 << inode->i_blkbits).
516 * 516 *
517 * The fs is allowed to map lots of blocks at once. If it wants to do that, 517 * The fs is allowed to map lots of blocks at once. If it wants to do that,
518 * it uses the passed inode-relative block number as the file offset, as usual. 518 * it uses the passed inode-relative block number as the file offset, as usual.
519 * 519 *
520 * get_block() is passed the number of i_blkbits-sized blocks which direct_io 520 * get_block() is passed the number of i_blkbits-sized blocks which direct_io
521 * has remaining to do. The fs should not map more than this number of blocks. 521 * has remaining to do. The fs should not map more than this number of blocks.
522 * 522 *
523 * If the fs has mapped a lot of blocks, it should populate bh->b_size to 523 * If the fs has mapped a lot of blocks, it should populate bh->b_size to
524 * indicate how much contiguous disk space has been made available at 524 * indicate how much contiguous disk space has been made available at
525 * bh->b_blocknr. 525 * bh->b_blocknr.
526 * 526 *
527 * If *any* of the mapped blocks are new, then the fs must set buffer_new(). 527 * If *any* of the mapped blocks are new, then the fs must set buffer_new().
528 * This isn't very efficient... 528 * This isn't very efficient...
529 * 529 *
530 * In the case of filesystem holes: the fs may return an arbitrarily-large 530 * In the case of filesystem holes: the fs may return an arbitrarily-large
531 * hole by returning an appropriate value in b_size and by clearing 531 * hole by returning an appropriate value in b_size and by clearing
532 * buffer_mapped(). However the direct-io code will only process holes one 532 * buffer_mapped(). However the direct-io code will only process holes one
533 * block at a time - it will repeatedly call get_block() as it walks the hole. 533 * block at a time - it will repeatedly call get_block() as it walks the hole.
534 */ 534 */
535 static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, 535 static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
536 struct buffer_head *map_bh) 536 struct buffer_head *map_bh)
537 { 537 {
538 int ret; 538 int ret;
539 sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ 539 sector_t fs_startblk; /* Into file, in filesystem-sized blocks */
540 sector_t fs_endblk; /* Into file, in filesystem-sized blocks */ 540 sector_t fs_endblk; /* Into file, in filesystem-sized blocks */
541 unsigned long fs_count; /* Number of filesystem-sized blocks */ 541 unsigned long fs_count; /* Number of filesystem-sized blocks */
542 int create; 542 int create;
543 unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor; 543 unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor;
544 544
545 /* 545 /*
546 * If there was a memory error and we've overwritten all the 546 * If there was a memory error and we've overwritten all the
547 * mapped blocks then we can now return that memory error 547 * mapped blocks then we can now return that memory error
548 */ 548 */
549 ret = dio->page_errors; 549 ret = dio->page_errors;
550 if (ret == 0) { 550 if (ret == 0) {
551 BUG_ON(sdio->block_in_file >= sdio->final_block_in_request); 551 BUG_ON(sdio->block_in_file >= sdio->final_block_in_request);
552 fs_startblk = sdio->block_in_file >> sdio->blkfactor; 552 fs_startblk = sdio->block_in_file >> sdio->blkfactor;
553 fs_endblk = (sdio->final_block_in_request - 1) >> 553 fs_endblk = (sdio->final_block_in_request - 1) >>
554 sdio->blkfactor; 554 sdio->blkfactor;
555 fs_count = fs_endblk - fs_startblk + 1; 555 fs_count = fs_endblk - fs_startblk + 1;
556 556
557 map_bh->b_state = 0; 557 map_bh->b_state = 0;
558 map_bh->b_size = fs_count << i_blkbits; 558 map_bh->b_size = fs_count << i_blkbits;
559 559
560 /* 560 /*
561 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we 561 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
562 * forbid block creations: only overwrites are permitted. 562 * forbid block creations: only overwrites are permitted.
563 * We will return early to the caller once we see an 563 * We will return early to the caller once we see an
564 * unmapped buffer head returned, and the caller will fall 564 * unmapped buffer head returned, and the caller will fall
565 * back to buffered I/O. 565 * back to buffered I/O.
566 * 566 *
567 * Otherwise the decision is left to the get_blocks method, 567 * Otherwise the decision is left to the get_blocks method,
568 * which may decide to handle it or also return an unmapped 568 * which may decide to handle it or also return an unmapped
569 * buffer head. 569 * buffer head.
570 */ 570 */
571 create = dio->rw & WRITE; 571 create = dio->rw & WRITE;
572 if (dio->flags & DIO_SKIP_HOLES) { 572 if (dio->flags & DIO_SKIP_HOLES) {
573 if (sdio->block_in_file < (i_size_read(dio->inode) >> 573 if (sdio->block_in_file < (i_size_read(dio->inode) >>
574 sdio->blkbits)) 574 sdio->blkbits))
575 create = 0; 575 create = 0;
576 } 576 }
577 577
578 ret = (*sdio->get_block)(dio->inode, fs_startblk, 578 ret = (*sdio->get_block)(dio->inode, fs_startblk,
579 map_bh, create); 579 map_bh, create);
580 580
581 /* Store for completion */ 581 /* Store for completion */
582 dio->private = map_bh->b_private; 582 dio->private = map_bh->b_private;
583 } 583 }
584 return ret; 584 return ret;
585 } 585 }
586 586
587 /* 587 /*
588 * There is no bio. Make one now. 588 * There is no bio. Make one now.
589 */ 589 */
590 static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio, 590 static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
591 sector_t start_sector, struct buffer_head *map_bh) 591 sector_t start_sector, struct buffer_head *map_bh)
592 { 592 {
593 sector_t sector; 593 sector_t sector;
594 int ret, nr_pages; 594 int ret, nr_pages;
595 595
596 ret = dio_bio_reap(dio, sdio); 596 ret = dio_bio_reap(dio, sdio);
597 if (ret) 597 if (ret)
598 goto out; 598 goto out;
599 sector = start_sector << (sdio->blkbits - 9); 599 sector = start_sector << (sdio->blkbits - 9);
600 nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev)); 600 nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev));
601 nr_pages = min(nr_pages, BIO_MAX_PAGES); 601 nr_pages = min(nr_pages, BIO_MAX_PAGES);
602 BUG_ON(nr_pages <= 0); 602 BUG_ON(nr_pages <= 0);
603 dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages); 603 dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
604 sdio->boundary = 0; 604 sdio->boundary = 0;
605 out: 605 out:
606 return ret; 606 return ret;
607 } 607 }
608 608
609 /* 609 /*
610 * Attempt to put the current chunk of 'cur_page' into the current BIO. If 610 * Attempt to put the current chunk of 'cur_page' into the current BIO. If
611 * that was successful then update final_block_in_bio and take a ref against 611 * that was successful then update final_block_in_bio and take a ref against
612 * the just-added page. 612 * the just-added page.
613 * 613 *
614 * Return zero on success. Non-zero means the caller needs to start a new BIO. 614 * Return zero on success. Non-zero means the caller needs to start a new BIO.
615 */ 615 */
616 static inline int dio_bio_add_page(struct dio_submit *sdio) 616 static inline int dio_bio_add_page(struct dio_submit *sdio)
617 { 617 {
618 int ret; 618 int ret;
619 619
620 ret = bio_add_page(sdio->bio, sdio->cur_page, 620 ret = bio_add_page(sdio->bio, sdio->cur_page,
621 sdio->cur_page_len, sdio->cur_page_offset); 621 sdio->cur_page_len, sdio->cur_page_offset);
622 if (ret == sdio->cur_page_len) { 622 if (ret == sdio->cur_page_len) {
623 /* 623 /*
624 * Decrement count only, if we are done with this page 624 * Decrement count only, if we are done with this page
625 */ 625 */
626 if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE) 626 if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE)
627 sdio->pages_in_io--; 627 sdio->pages_in_io--;
628 page_cache_get(sdio->cur_page); 628 page_cache_get(sdio->cur_page);
629 sdio->final_block_in_bio = sdio->cur_page_block + 629 sdio->final_block_in_bio = sdio->cur_page_block +
630 (sdio->cur_page_len >> sdio->blkbits); 630 (sdio->cur_page_len >> sdio->blkbits);
631 ret = 0; 631 ret = 0;
632 } else { 632 } else {
633 ret = 1; 633 ret = 1;
634 } 634 }
635 return ret; 635 return ret;
636 } 636 }
637 637
638 /* 638 /*
639 * Put cur_page under IO. The section of cur_page which is described by 639 * Put cur_page under IO. The section of cur_page which is described by
640 * cur_page_offset,cur_page_len is put into a BIO. The section of cur_page 640 * cur_page_offset,cur_page_len is put into a BIO. The section of cur_page
641 * starts on-disk at cur_page_block. 641 * starts on-disk at cur_page_block.
642 * 642 *
643 * We take a ref against the page here (on behalf of its presence in the bio). 643 * We take a ref against the page here (on behalf of its presence in the bio).
644 * 644 *
645 * The caller of this function is responsible for removing cur_page from the 645 * The caller of this function is responsible for removing cur_page from the
646 * dio, and for dropping the refcount which came from that presence. 646 * dio, and for dropping the refcount which came from that presence.
647 */ 647 */
648 static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, 648 static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
649 struct buffer_head *map_bh) 649 struct buffer_head *map_bh)
650 { 650 {
651 int ret = 0; 651 int ret = 0;
652 652
653 if (sdio->bio) { 653 if (sdio->bio) {
654 loff_t cur_offset = sdio->cur_page_fs_offset; 654 loff_t cur_offset = sdio->cur_page_fs_offset;
655 loff_t bio_next_offset = sdio->logical_offset_in_bio + 655 loff_t bio_next_offset = sdio->logical_offset_in_bio +
656 sdio->bio->bi_size; 656 sdio->bio->bi_size;
657 657
658 /* 658 /*
659 * See whether this new request is contiguous with the old. 659 * See whether this new request is contiguous with the old.
660 * 660 *
661 * Btrfs cannot handle having logically non-contiguous requests 661 * Btrfs cannot handle having logically non-contiguous requests
662 * submitted. For example if you have 662 * submitted. For example if you have
663 * 663 *
664 * Logical: [0-4095][HOLE][8192-12287] 664 * Logical: [0-4095][HOLE][8192-12287]
665 * Physical: [0-4095] [4096-8191] 665 * Physical: [0-4095] [4096-8191]
666 * 666 *
667 * We cannot submit those pages together as one BIO. So if our 667 * We cannot submit those pages together as one BIO. So if our
668 * current logical offset in the file does not equal what would 668 * current logical offset in the file does not equal what would
669 * be the next logical offset in the bio, submit the bio we 669 * be the next logical offset in the bio, submit the bio we
670 * have. 670 * have.
671 */ 671 */
672 if (sdio->final_block_in_bio != sdio->cur_page_block || 672 if (sdio->final_block_in_bio != sdio->cur_page_block ||
673 cur_offset != bio_next_offset) 673 cur_offset != bio_next_offset)
674 dio_bio_submit(dio, sdio); 674 dio_bio_submit(dio, sdio);
675 /* 675 /*
676 * Submit now if the underlying fs is about to perform a 676 * Submit now if the underlying fs is about to perform a
677 * metadata read 677 * metadata read
678 */ 678 */
679 else if (sdio->boundary) 679 else if (sdio->boundary)
680 dio_bio_submit(dio, sdio); 680 dio_bio_submit(dio, sdio);
681 } 681 }
682 682
683 if (sdio->bio == NULL) { 683 if (sdio->bio == NULL) {
684 ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); 684 ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
685 if (ret) 685 if (ret)
686 goto out; 686 goto out;
687 } 687 }
688 688
689 if (dio_bio_add_page(sdio) != 0) { 689 if (dio_bio_add_page(sdio) != 0) {
690 dio_bio_submit(dio, sdio); 690 dio_bio_submit(dio, sdio);
691 ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); 691 ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
692 if (ret == 0) { 692 if (ret == 0) {
693 ret = dio_bio_add_page(sdio); 693 ret = dio_bio_add_page(sdio);
694 BUG_ON(ret != 0); 694 BUG_ON(ret != 0);
695 } 695 }
696 } 696 }
697 out: 697 out:
698 return ret; 698 return ret;
699 } 699 }
700 700
701 /* 701 /*
702 * An autonomous function to put a chunk of a page under deferred IO. 702 * An autonomous function to put a chunk of a page under deferred IO.
703 * 703 *
704 * The caller doesn't actually know (or care) whether this piece of page is in 704 * The caller doesn't actually know (or care) whether this piece of page is in
705 * a BIO, or is under IO or whatever. We just take care of all possible 705 * a BIO, or is under IO or whatever. We just take care of all possible
706 * situations here. The separation between the logic of do_direct_IO() and 706 * situations here. The separation between the logic of do_direct_IO() and
707 * that of submit_page_section() is important for clarity. Please don't break. 707 * that of submit_page_section() is important for clarity. Please don't break.
708 * 708 *
709 * The chunk of page starts on-disk at blocknr. 709 * The chunk of page starts on-disk at blocknr.
710 * 710 *
711 * We perform deferred IO, by recording the last-submitted page inside our 711 * We perform deferred IO, by recording the last-submitted page inside our
712 * private part of the dio structure. If possible, we just expand the IO 712 * private part of the dio structure. If possible, we just expand the IO
713 * across that page here. 713 * across that page here.
714 * 714 *
715 * If that doesn't work out then we put the old page into the bio and add this 715 * If that doesn't work out then we put the old page into the bio and add this
716 * page to the dio instead. 716 * page to the dio instead.
717 */ 717 */
718 static inline int 718 static inline int
719 submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, 719 submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
720 unsigned offset, unsigned len, sector_t blocknr, 720 unsigned offset, unsigned len, sector_t blocknr,
721 struct buffer_head *map_bh) 721 struct buffer_head *map_bh)
722 { 722 {
723 int ret = 0; 723 int ret = 0;
724 724
725 if (dio->rw & WRITE) { 725 if (dio->rw & WRITE) {
726 /* 726 /*
727 * Read accounting is performed in submit_bio() 727 * Read accounting is performed in submit_bio()
728 */ 728 */
729 task_io_account_write(len); 729 task_io_account_write(len);
730 } 730 }
731 731
732 /* 732 /*
733 * Can we just grow the current page's presence in the dio? 733 * Can we just grow the current page's presence in the dio?
734 */ 734 */
735 if (sdio->cur_page == page && 735 if (sdio->cur_page == page &&
736 sdio->cur_page_offset + sdio->cur_page_len == offset && 736 sdio->cur_page_offset + sdio->cur_page_len == offset &&
737 sdio->cur_page_block + 737 sdio->cur_page_block +
738 (sdio->cur_page_len >> sdio->blkbits) == blocknr) { 738 (sdio->cur_page_len >> sdio->blkbits) == blocknr) {
739 sdio->cur_page_len += len; 739 sdio->cur_page_len += len;
740 740
741 /* 741 /*
742 * If sdio->boundary then we want to schedule the IO now to 742 * If sdio->boundary then we want to schedule the IO now to
743 * avoid metadata seeks. 743 * avoid metadata seeks.
744 */ 744 */
745 if (sdio->boundary) { 745 if (sdio->boundary) {
746 ret = dio_send_cur_page(dio, sdio, map_bh); 746 ret = dio_send_cur_page(dio, sdio, map_bh);
747 page_cache_release(sdio->cur_page); 747 page_cache_release(sdio->cur_page);
748 sdio->cur_page = NULL; 748 sdio->cur_page = NULL;
749 } 749 }
750 goto out; 750 goto out;
751 } 751 }
752 752
753 /* 753 /*
754 * If there's a deferred page already there then send it. 754 * If there's a deferred page already there then send it.
755 */ 755 */
756 if (sdio->cur_page) { 756 if (sdio->cur_page) {
757 ret = dio_send_cur_page(dio, sdio, map_bh); 757 ret = dio_send_cur_page(dio, sdio, map_bh);
758 page_cache_release(sdio->cur_page); 758 page_cache_release(sdio->cur_page);
759 sdio->cur_page = NULL; 759 sdio->cur_page = NULL;
760 if (ret) 760 if (ret)
761 goto out; 761 goto out;
762 } 762 }
763 763
764 page_cache_get(page); /* It is in dio */ 764 page_cache_get(page); /* It is in dio */
765 sdio->cur_page = page; 765 sdio->cur_page = page;
766 sdio->cur_page_offset = offset; 766 sdio->cur_page_offset = offset;
767 sdio->cur_page_len = len; 767 sdio->cur_page_len = len;
768 sdio->cur_page_block = blocknr; 768 sdio->cur_page_block = blocknr;
769 sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits; 769 sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits;
770 out: 770 out:
771 return ret; 771 return ret;
772 } 772 }
773 773
774 /* 774 /*
775 * Clean any dirty buffers in the blockdev mapping which alias newly-created 775 * Clean any dirty buffers in the blockdev mapping which alias newly-created
776 * file blocks. Only called for S_ISREG files - blockdevs do not set 776 * file blocks. Only called for S_ISREG files - blockdevs do not set
777 * buffer_new 777 * buffer_new
778 */ 778 */
779 static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh) 779 static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh)
780 { 780 {
781 unsigned i; 781 unsigned i;
782 unsigned nblocks; 782 unsigned nblocks;
783 783
784 nblocks = map_bh->b_size >> dio->inode->i_blkbits; 784 nblocks = map_bh->b_size >> dio->inode->i_blkbits;
785 785
786 for (i = 0; i < nblocks; i++) { 786 for (i = 0; i < nblocks; i++) {
787 unmap_underlying_metadata(map_bh->b_bdev, 787 unmap_underlying_metadata(map_bh->b_bdev,
788 map_bh->b_blocknr + i); 788 map_bh->b_blocknr + i);
789 } 789 }
790 } 790 }
791 791
792 /* 792 /*
793 * If we are not writing the entire block and get_block() allocated 793 * If we are not writing the entire block and get_block() allocated
794 * the block for us, we need to fill-in the unused portion of the 794 * the block for us, we need to fill-in the unused portion of the
795 * block with zeros. This happens only if user-buffer, fileoffset or 795 * block with zeros. This happens only if user-buffer, fileoffset or
796 * io length is not filesystem block-size multiple. 796 * io length is not filesystem block-size multiple.
797 * 797 *
798 * `end' is zero if we're doing the start of the IO, 1 at the end of the 798 * `end' is zero if we're doing the start of the IO, 1 at the end of the
799 * IO. 799 * IO.
800 */ 800 */
801 static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio, 801 static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio,
802 int end, struct buffer_head *map_bh) 802 int end, struct buffer_head *map_bh)
803 { 803 {
804 unsigned dio_blocks_per_fs_block; 804 unsigned dio_blocks_per_fs_block;
805 unsigned this_chunk_blocks; /* In dio_blocks */ 805 unsigned this_chunk_blocks; /* In dio_blocks */
806 unsigned this_chunk_bytes; 806 unsigned this_chunk_bytes;
807 struct page *page; 807 struct page *page;
808 808
809 sdio->start_zero_done = 1; 809 sdio->start_zero_done = 1;
810 if (!sdio->blkfactor || !buffer_new(map_bh)) 810 if (!sdio->blkfactor || !buffer_new(map_bh))
811 return; 811 return;
812 812
813 dio_blocks_per_fs_block = 1 << sdio->blkfactor; 813 dio_blocks_per_fs_block = 1 << sdio->blkfactor;
814 this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1); 814 this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1);
815 815
816 if (!this_chunk_blocks) 816 if (!this_chunk_blocks)
817 return; 817 return;
818 818
819 /* 819 /*
820 * We need to zero out part of an fs block. It is either at the 820 * We need to zero out part of an fs block. It is either at the
821 * beginning or the end of the fs block. 821 * beginning or the end of the fs block.
822 */ 822 */
823 if (end) 823 if (end)
824 this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks; 824 this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
825 825
826 this_chunk_bytes = this_chunk_blocks << sdio->blkbits; 826 this_chunk_bytes = this_chunk_blocks << sdio->blkbits;
827 827
828 page = ZERO_PAGE(0); 828 page = ZERO_PAGE(0);
829 if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes, 829 if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes,
830 sdio->next_block_for_io, map_bh)) 830 sdio->next_block_for_io, map_bh))
831 return; 831 return;
832 832
833 sdio->next_block_for_io += this_chunk_blocks; 833 sdio->next_block_for_io += this_chunk_blocks;
834 } 834 }
835 835
836 /* 836 /*
837 * Walk the user pages, and the file, mapping blocks to disk and generating 837 * Walk the user pages, and the file, mapping blocks to disk and generating
838 * a sequence of (page,offset,len,block) mappings. These mappings are injected 838 * a sequence of (page,offset,len,block) mappings. These mappings are injected
839 * into submit_page_section(), which takes care of the next stage of submission 839 * into submit_page_section(), which takes care of the next stage of submission
840 * 840 *
841 * Direct IO against a blockdev is different from a file. Because we can 841 * Direct IO against a blockdev is different from a file. Because we can
842 * happily perform page-sized but 512-byte aligned IOs. It is important that 842 * happily perform page-sized but 512-byte aligned IOs. It is important that
843 * blockdev IO be able to have fine alignment and large sizes. 843 * blockdev IO be able to have fine alignment and large sizes.
844 * 844 *
845 * So what we do is to permit the ->get_block function to populate bh.b_size 845 * So what we do is to permit the ->get_block function to populate bh.b_size
846 * with the size of IO which is permitted at this offset and this i_blkbits. 846 * with the size of IO which is permitted at this offset and this i_blkbits.
847 * 847 *
848 * For best results, the blockdev should be set up with 512-byte i_blkbits and 848 * For best results, the blockdev should be set up with 512-byte i_blkbits and
849 * it should set b_size to PAGE_SIZE or more inside get_block(). This gives 849 * it should set b_size to PAGE_SIZE or more inside get_block(). This gives
850 * fine alignment but still allows this function to work in PAGE_SIZE units. 850 * fine alignment but still allows this function to work in PAGE_SIZE units.
851 */ 851 */
852 static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, 852 static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
853 struct buffer_head *map_bh) 853 struct buffer_head *map_bh)
854 { 854 {
855 const unsigned blkbits = sdio->blkbits; 855 const unsigned blkbits = sdio->blkbits;
856 const unsigned blocks_per_page = PAGE_SIZE >> blkbits; 856 const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
857 struct page *page; 857 struct page *page;
858 unsigned block_in_page; 858 unsigned block_in_page;
859 int ret = 0; 859 int ret = 0;
860 860
861 /* The I/O can start at any block offset within the first page */ 861 /* The I/O can start at any block offset within the first page */
862 block_in_page = sdio->first_block_in_page; 862 block_in_page = sdio->first_block_in_page;
863 863
864 while (sdio->block_in_file < sdio->final_block_in_request) { 864 while (sdio->block_in_file < sdio->final_block_in_request) {
865 page = dio_get_page(dio, sdio); 865 page = dio_get_page(dio, sdio);
866 if (IS_ERR(page)) { 866 if (IS_ERR(page)) {
867 ret = PTR_ERR(page); 867 ret = PTR_ERR(page);
868 goto out; 868 goto out;
869 } 869 }
870 870
871 while (block_in_page < blocks_per_page) { 871 while (block_in_page < blocks_per_page) {
872 unsigned offset_in_page = block_in_page << blkbits; 872 unsigned offset_in_page = block_in_page << blkbits;
873 unsigned this_chunk_bytes; /* # of bytes mapped */ 873 unsigned this_chunk_bytes; /* # of bytes mapped */
874 unsigned this_chunk_blocks; /* # of blocks */ 874 unsigned this_chunk_blocks; /* # of blocks */
875 unsigned u; 875 unsigned u;
876 876
877 if (sdio->blocks_available == 0) { 877 if (sdio->blocks_available == 0) {
878 /* 878 /*
879 * Need to go and map some more disk 879 * Need to go and map some more disk
880 */ 880 */
881 unsigned long blkmask; 881 unsigned long blkmask;
882 unsigned long dio_remainder; 882 unsigned long dio_remainder;
883 883
884 ret = get_more_blocks(dio, sdio, map_bh); 884 ret = get_more_blocks(dio, sdio, map_bh);
885 if (ret) { 885 if (ret) {
886 page_cache_release(page); 886 page_cache_release(page);
887 goto out; 887 goto out;
888 } 888 }
889 if (!buffer_mapped(map_bh)) 889 if (!buffer_mapped(map_bh))
890 goto do_holes; 890 goto do_holes;
891 891
892 sdio->blocks_available = 892 sdio->blocks_available =
893 map_bh->b_size >> sdio->blkbits; 893 map_bh->b_size >> sdio->blkbits;
894 sdio->next_block_for_io = 894 sdio->next_block_for_io =
895 map_bh->b_blocknr << sdio->blkfactor; 895 map_bh->b_blocknr << sdio->blkfactor;
896 if (buffer_new(map_bh)) 896 if (buffer_new(map_bh))
897 clean_blockdev_aliases(dio, map_bh); 897 clean_blockdev_aliases(dio, map_bh);
898 898
899 if (!sdio->blkfactor) 899 if (!sdio->blkfactor)
900 goto do_holes; 900 goto do_holes;
901 901
902 blkmask = (1 << sdio->blkfactor) - 1; 902 blkmask = (1 << sdio->blkfactor) - 1;
903 dio_remainder = (sdio->block_in_file & blkmask); 903 dio_remainder = (sdio->block_in_file & blkmask);
904 904
905 /* 905 /*
906 * If we are at the start of IO and that IO 906 * If we are at the start of IO and that IO
907 * starts partway into a fs-block, 907 * starts partway into a fs-block,
908 * dio_remainder will be non-zero. If the IO 908 * dio_remainder will be non-zero. If the IO
909 * is a read then we can simply advance the IO 909 * is a read then we can simply advance the IO
910 * cursor to the first block which is to be 910 * cursor to the first block which is to be
911 * read. But if the IO is a write and the 911 * read. But if the IO is a write and the
912 * block was newly allocated we cannot do that; 912 * block was newly allocated we cannot do that;
913 * the start of the fs block must be zeroed out 913 * the start of the fs block must be zeroed out
914 * on-disk 914 * on-disk
915 */ 915 */
916 if (!buffer_new(map_bh)) 916 if (!buffer_new(map_bh))
917 sdio->next_block_for_io += dio_remainder; 917 sdio->next_block_for_io += dio_remainder;
918 sdio->blocks_available -= dio_remainder; 918 sdio->blocks_available -= dio_remainder;
919 } 919 }
920 do_holes: 920 do_holes:
921 /* Handle holes */ 921 /* Handle holes */
922 if (!buffer_mapped(map_bh)) { 922 if (!buffer_mapped(map_bh)) {
923 loff_t i_size_aligned; 923 loff_t i_size_aligned;
924 924
925 /* AKPM: eargh, -ENOTBLK is a hack */ 925 /* AKPM: eargh, -ENOTBLK is a hack */
926 if (dio->rw & WRITE) { 926 if (dio->rw & WRITE) {
927 page_cache_release(page); 927 page_cache_release(page);
928 return -ENOTBLK; 928 return -ENOTBLK;
929 } 929 }
930 930
931 /* 931 /*
932 * Be sure to account for a partial block as the 932 * Be sure to account for a partial block as the
933 * last block in the file 933 * last block in the file
934 */ 934 */
935 i_size_aligned = ALIGN(i_size_read(dio->inode), 935 i_size_aligned = ALIGN(i_size_read(dio->inode),
936 1 << blkbits); 936 1 << blkbits);
937 if (sdio->block_in_file >= 937 if (sdio->block_in_file >=
938 i_size_aligned >> blkbits) { 938 i_size_aligned >> blkbits) {
939 /* We hit eof */ 939 /* We hit eof */
940 page_cache_release(page); 940 page_cache_release(page);
941 goto out; 941 goto out;
942 } 942 }
943 zero_user(page, block_in_page << blkbits, 943 zero_user(page, block_in_page << blkbits,
944 1 << blkbits); 944 1 << blkbits);
945 sdio->block_in_file++; 945 sdio->block_in_file++;
946 block_in_page++; 946 block_in_page++;
947 goto next_block; 947 goto next_block;
948 } 948 }
949 949
950 /* 950 /*
951 * If we're performing IO which has an alignment which 951 * If we're performing IO which has an alignment which
952 * is finer than the underlying fs, go check to see if 952 * is finer than the underlying fs, go check to see if
953 * we must zero out the start of this block. 953 * we must zero out the start of this block.
954 */ 954 */
955 if (unlikely(sdio->blkfactor && !sdio->start_zero_done)) 955 if (unlikely(sdio->blkfactor && !sdio->start_zero_done))
956 dio_zero_block(dio, sdio, 0, map_bh); 956 dio_zero_block(dio, sdio, 0, map_bh);
957 957
958 /* 958 /*
959 * Work out, in this_chunk_blocks, how much disk we 959 * Work out, in this_chunk_blocks, how much disk we
960 * can add to this page 960 * can add to this page
961 */ 961 */
962 this_chunk_blocks = sdio->blocks_available; 962 this_chunk_blocks = sdio->blocks_available;
963 u = (PAGE_SIZE - offset_in_page) >> blkbits; 963 u = (PAGE_SIZE - offset_in_page) >> blkbits;
964 if (this_chunk_blocks > u) 964 if (this_chunk_blocks > u)
965 this_chunk_blocks = u; 965 this_chunk_blocks = u;
966 u = sdio->final_block_in_request - sdio->block_in_file; 966 u = sdio->final_block_in_request - sdio->block_in_file;
967 if (this_chunk_blocks > u) 967 if (this_chunk_blocks > u)
968 this_chunk_blocks = u; 968 this_chunk_blocks = u;
969 this_chunk_bytes = this_chunk_blocks << blkbits; 969 this_chunk_bytes = this_chunk_blocks << blkbits;
970 BUG_ON(this_chunk_bytes == 0); 970 BUG_ON(this_chunk_bytes == 0);
971 971
972 sdio->boundary = buffer_boundary(map_bh); 972 sdio->boundary = buffer_boundary(map_bh);
973 ret = submit_page_section(dio, sdio, page, 973 ret = submit_page_section(dio, sdio, page,
974 offset_in_page, 974 offset_in_page,
975 this_chunk_bytes, 975 this_chunk_bytes,
976 sdio->next_block_for_io, 976 sdio->next_block_for_io,
977 map_bh); 977 map_bh);
978 if (ret) { 978 if (ret) {
979 page_cache_release(page); 979 page_cache_release(page);
980 goto out; 980 goto out;
981 } 981 }
982 sdio->next_block_for_io += this_chunk_blocks; 982 sdio->next_block_for_io += this_chunk_blocks;
983 983
984 sdio->block_in_file += this_chunk_blocks; 984 sdio->block_in_file += this_chunk_blocks;
985 block_in_page += this_chunk_blocks; 985 block_in_page += this_chunk_blocks;
986 sdio->blocks_available -= this_chunk_blocks; 986 sdio->blocks_available -= this_chunk_blocks;
987 next_block: 987 next_block:
988 BUG_ON(sdio->block_in_file > sdio->final_block_in_request); 988 BUG_ON(sdio->block_in_file > sdio->final_block_in_request);
989 if (sdio->block_in_file == sdio->final_block_in_request) 989 if (sdio->block_in_file == sdio->final_block_in_request)
990 break; 990 break;
991 } 991 }
992 992
993 /* Drop the ref which was taken in get_user_pages() */ 993 /* Drop the ref which was taken in get_user_pages() */
994 page_cache_release(page); 994 page_cache_release(page);
995 block_in_page = 0; 995 block_in_page = 0;
996 } 996 }
997 out: 997 out:
998 return ret; 998 return ret;
999 } 999 }
1000 1000
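The sub-block bookkeeping near the top of the loop above is easier to follow with concrete numbers, chosen purely for illustration: with 4 KiB filesystem blocks (i_blkbits = 12) and 512-byte dio blocks (blkbits = 9), blkfactor = 3 and blkmask = 0x7. A read starting at file offset 1536 has block_in_file = 3, so dio_remainder = 3; get_more_blocks() maps the whole 4 KiB block (blocks_available = 8), next_block_for_io is advanced by 3 sub-blocks to the sector actually being read, and blocks_available drops to 5. For a write into a freshly allocated (buffer_new) block the cursor is deliberately not advanced, because the leading sub-blocks of the fs block still have to be zeroed on disk; dio_zero_block() takes care of that.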
1001 static inline int drop_refcount(struct dio *dio) 1001 static inline int drop_refcount(struct dio *dio)
1002 { 1002 {
1003 int ret2; 1003 int ret2;
1004 unsigned long flags; 1004 unsigned long flags;
1005 1005
1006 /* 1006 /*
1007 * Sync will always be dropping the final ref and completing the 1007 * Sync will always be dropping the final ref and completing the
1008 * operation. AIO can do so as well, either for a broken operation as 1008 * operation. AIO can do so as well, either for a broken operation as
1009 * described above or if all the bios race to complete before we get here. 1009 * described above or if all the bios race to complete before we get here.
1010 * In that case dio_complete() translates the EIOCBQUEUED into the proper 1010 * In that case dio_complete() translates the EIOCBQUEUED into the proper
1011 * return code that the caller will hand to aio_complete(). 1011 * return code that the caller will hand to aio_complete().
1012 * 1012 *
1013 * This is managed by the bio_lock instead of being an atomic_t so that 1013 * This is managed by the bio_lock instead of being an atomic_t so that
1014 * completion paths can drop their ref and use the remaining count to 1014 * completion paths can drop their ref and use the remaining count to
1015 * decide to wake the submission path atomically. 1015 * decide to wake the submission path atomically.
1016 */ 1016 */
1017 spin_lock_irqsave(&dio->bio_lock, flags); 1017 spin_lock_irqsave(&dio->bio_lock, flags);
1018 ret2 = --dio->refcount; 1018 ret2 = --dio->refcount;
1019 spin_unlock_irqrestore(&dio->bio_lock, flags); 1019 spin_unlock_irqrestore(&dio->bio_lock, flags);
1020 return ret2; 1020 return ret2;
1021 } 1021 }
1022 1022
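The design described in the comment above is what lets a bio completion path drop its reference and test for a sleeping submitter in one atomic step. Below is a minimal sketch of that completion-side counterpart, assuming the dio->waiter field used elsewhere in this file; the function name example_dio_bio_done is hypothetical, not part of this diff.

    static void example_dio_bio_done(struct dio *dio)
    {
            unsigned long flags;

            spin_lock_irqsave(&dio->bio_lock, flags);
            /*
             * Dropping our ref and checking "is the submitter the only
             * holder left, and is it sleeping?" happen under the same lock,
             * so the wakeup cannot be missed.
             */
            if (--dio->refcount == 1 && dio->waiter)
                    wake_up_process(dio->waiter);
            spin_unlock_irqrestore(&dio->bio_lock, flags);
    }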
1023 /* 1023 /*
1024 * This is a library function for use by filesystem drivers. 1024 * This is a library function for use by filesystem drivers.
1025 * 1025 *
1026 * The locking rules are governed by the flags parameter: 1026 * The locking rules are governed by the flags parameter:
1027 * - if the flags value contains DIO_LOCKING we use a fancy locking 1027 * - if the flags value contains DIO_LOCKING we use a fancy locking
1028 * scheme for dumb filesystems. 1028 * scheme for dumb filesystems.
1029 * For writes this function is called under i_mutex and returns with 1029 * For writes this function is called under i_mutex and returns with
1030 * i_mutex held; for reads, i_mutex is not held on entry, but it is 1030 * i_mutex held; for reads, i_mutex is not held on entry, but it is
1031 * taken and dropped again before returning. 1031 * taken and dropped again before returning.
1032 * - if the flags value does NOT contain DIO_LOCKING we don't use any 1032 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1033 * internal locking but rather rely on the filesystem to synchronize 1033 * internal locking but rather rely on the filesystem to synchronize
1034 * direct I/O reads/writes versus each other and truncate. 1034 * direct I/O reads/writes versus each other and truncate.
1035 * 1035 *
1036 * To help with locking against truncate we increment the i_dio_count 1036 * To help with locking against truncate we increment the i_dio_count
1037 * counter before starting direct I/O, and decrement it once we are done. 1037 * counter before starting direct I/O, and decrement it once we are done.
1038 * Truncate can wait for it to reach zero to provide exclusion. It is 1038 * Truncate can wait for it to reach zero to provide exclusion. It is
1039 * expected that filesystems provide exclusion between new direct I/O 1039 * expected that filesystems provide exclusion between new direct I/O
1040 * and truncates. For DIO_LOCKING filesystems this is done by i_mutex, 1040 * and truncates. For DIO_LOCKING filesystems this is done by i_mutex,
1041 * but other filesystems need to take care of this on their own. 1041 * but other filesystems need to take care of this on their own.
1042 * 1042 *
1043 * NOTE: if you pass "sdio" to anything by pointer make sure that function 1043 * NOTE: if you pass "sdio" to anything by pointer make sure that function
1044 * is always inlined. Otherwise gcc is unable to split the structure into 1044 * is always inlined. Otherwise gcc is unable to split the structure into
1045 * individual fields and will generate much worse code. This is important 1045 * individual fields and will generate much worse code. This is important
1046 * for the whole file. 1046 * for the whole file.
1047 */ 1047 */
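On the truncate side, the i_dio_count scheme described above is typically consumed through inode_dio_wait(), which sleeps until every direct I/O started before that point has dropped its count. A minimal sketch of how a filesystem's truncate path could use it; myfs_truncate and the surrounding details are assumptions for illustration.

    static void myfs_truncate(struct inode *inode, loff_t newsize)
    {
            /* A DIO_LOCKING filesystem already holds i_mutex at this point. */
            inode_dio_wait(inode);          /* wait out in-flight direct I/O */
            truncate_setsize(inode, newsize);
            /* ... free the now-unused blocks ... */
    }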
1048 static inline ssize_t 1048 static inline ssize_t
1049 do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1049 do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1050 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1050 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1051 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1051 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1052 dio_submit_t submit_io, int flags) 1052 dio_submit_t submit_io, int flags)
1053 { 1053 {
1054 int seg; 1054 int seg;
1055 size_t size; 1055 size_t size;
1056 unsigned long addr; 1056 unsigned long addr;
1057 unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); 1057 unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
1058 unsigned blkbits = i_blkbits; 1058 unsigned blkbits = i_blkbits;
1059 unsigned blocksize_mask = (1 << blkbits) - 1; 1059 unsigned blocksize_mask = (1 << blkbits) - 1;
1060 ssize_t retval = -EINVAL; 1060 ssize_t retval = -EINVAL;
1061 loff_t end = offset; 1061 loff_t end = offset;
1062 struct dio *dio; 1062 struct dio *dio;
1063 struct dio_submit sdio = { 0, }; 1063 struct dio_submit sdio = { 0, };
1064 unsigned long user_addr; 1064 unsigned long user_addr;
1065 size_t bytes; 1065 size_t bytes;
1066 struct buffer_head map_bh = { 0, }; 1066 struct buffer_head map_bh = { 0, };
1067 struct blk_plug plug; 1067 struct blk_plug plug;
1068 1068
1069 if (rw & WRITE) 1069 if (rw & WRITE)
1070 rw = WRITE_ODIRECT; 1070 rw = WRITE_ODIRECT;
1071 1071
1072 /* 1072 /*
1073 * Avoid references to bdev if not absolutely needed to give 1073 * Avoid references to bdev if not absolutely needed to give
1074 * the early prefetch in the caller enough time. 1074 * the early prefetch in the caller enough time.
1075 */ 1075 */
1076 1076
1077 if (offset & blocksize_mask) { 1077 if (offset & blocksize_mask) {
1078 if (bdev) 1078 if (bdev)
1079 blkbits = blksize_bits(bdev_logical_block_size(bdev)); 1079 blkbits = blksize_bits(bdev_logical_block_size(bdev));
1080 blocksize_mask = (1 << blkbits) - 1; 1080 blocksize_mask = (1 << blkbits) - 1;
1081 if (offset & blocksize_mask) 1081 if (offset & blocksize_mask)
1082 goto out; 1082 goto out;
1083 } 1083 }
1084 1084
1085 /* Check the memory alignment. Blocks cannot straddle pages */ 1085 /* Check the memory alignment. Blocks cannot straddle pages */
1086 for (seg = 0; seg < nr_segs; seg++) { 1086 for (seg = 0; seg < nr_segs; seg++) {
1087 addr = (unsigned long)iov[seg].iov_base; 1087 addr = (unsigned long)iov[seg].iov_base;
1088 size = iov[seg].iov_len; 1088 size = iov[seg].iov_len;
1089 end += size; 1089 end += size;
1090 if (unlikely((addr & blocksize_mask) || 1090 if (unlikely((addr & blocksize_mask) ||
1091 (size & blocksize_mask))) { 1091 (size & blocksize_mask))) {
1092 if (bdev) 1092 if (bdev)
1093 blkbits = blksize_bits( 1093 blkbits = blksize_bits(
1094 bdev_logical_block_size(bdev)); 1094 bdev_logical_block_size(bdev));
1095 blocksize_mask = (1 << blkbits) - 1; 1095 blocksize_mask = (1 << blkbits) - 1;
1096 if ((addr & blocksize_mask) || (size & blocksize_mask)) 1096 if ((addr & blocksize_mask) || (size & blocksize_mask))
1097 goto out; 1097 goto out;
1098 } 1098 }
1099 } 1099 }
1100 1100
1101 /* watch out for a 0 len io from a tricksy fs */ 1101 /* watch out for a 0 len io from a tricksy fs */
1102 if (rw == READ && end == offset) 1102 if (rw == READ && end == offset)
1103 return 0; 1103 return 0;
1104 1104
1105 dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); 1105 dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
1106 retval = -ENOMEM; 1106 retval = -ENOMEM;
1107 if (!dio) 1107 if (!dio)
1108 goto out; 1108 goto out;
1109 /* 1109 /*
1110 * Believe it or not, zeroing out the page array caused a .5% 1110 * Believe it or not, zeroing out the page array caused a .5%
1111 * performance regression in a database benchmark. So, we take 1111 * performance regression in a database benchmark. So, we take
1112 * care to only zero out what's needed. 1112 * care to only zero out what's needed.
1113 */ 1113 */
1114 memset(dio, 0, offsetof(struct dio, pages)); 1114 memset(dio, 0, offsetof(struct dio, pages));
1115 1115
1116 dio->flags = flags; 1116 dio->flags = flags;
1117 if (dio->flags & DIO_LOCKING) { 1117 if (dio->flags & DIO_LOCKING) {
1118 if (rw == READ) { 1118 if (rw == READ) {
1119 struct address_space *mapping = 1119 struct address_space *mapping =
1120 iocb->ki_filp->f_mapping; 1120 iocb->ki_filp->f_mapping;
1121 1121
1122 /* will be released by direct_io_worker */ 1122 /* will be released by direct_io_worker */
1123 mutex_lock(&inode->i_mutex); 1123 mutex_lock(&inode->i_mutex);
1124 1124
1125 retval = filemap_write_and_wait_range(mapping, offset, 1125 retval = filemap_write_and_wait_range(mapping, offset,
1126 end - 1); 1126 end - 1);
1127 if (retval) { 1127 if (retval) {
1128 mutex_unlock(&inode->i_mutex); 1128 mutex_unlock(&inode->i_mutex);
1129 kmem_cache_free(dio_cache, dio); 1129 kmem_cache_free(dio_cache, dio);
1130 goto out; 1130 goto out;
1131 } 1131 }
1132 } 1132 }
1133 } 1133 }
1134 1134
1135 /* 1135 /*
1136 * Will be decremented at I/O completion time. 1136 * Will be decremented at I/O completion time.
1137 */ 1137 */
1138 atomic_inc(&inode->i_dio_count); 1138 atomic_inc(&inode->i_dio_count);
1139 1139
1140 /* 1140 /*
1141 * For file-extending writes, updating i_size before data 1141 * For file-extending writes, updating i_size before data
1142 * writeouts complete can expose uninitialized blocks. So 1142 * writeouts complete can expose uninitialized blocks. So
1143 * even for AIO, we need to wait for i/o to complete before 1143 * even for AIO, we need to wait for i/o to complete before
1144 * returning in this case. 1144 * returning in this case.
1145 */ 1145 */
1146 dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && 1146 dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
1147 (end > i_size_read(inode))); 1147 (end > i_size_read(inode)));
1148 1148
1149 retval = 0; 1149 retval = 0;
1150 1150
1151 dio->inode = inode; 1151 dio->inode = inode;
1152 dio->rw = rw; 1152 dio->rw = rw;
1153 sdio.blkbits = blkbits; 1153 sdio.blkbits = blkbits;
1154 sdio.blkfactor = i_blkbits - blkbits; 1154 sdio.blkfactor = i_blkbits - blkbits;
1155 sdio.block_in_file = offset >> blkbits; 1155 sdio.block_in_file = offset >> blkbits;
1156 1156
1157 sdio.get_block = get_block; 1157 sdio.get_block = get_block;
1158 dio->end_io = end_io; 1158 dio->end_io = end_io;
1159 sdio.submit_io = submit_io; 1159 sdio.submit_io = submit_io;
1160 sdio.final_block_in_bio = -1; 1160 sdio.final_block_in_bio = -1;
1161 sdio.next_block_for_io = -1; 1161 sdio.next_block_for_io = -1;
1162 1162
1163 dio->iocb = iocb; 1163 dio->iocb = iocb;
1164 dio->i_size = i_size_read(inode); 1164 dio->i_size = i_size_read(inode);
1165 1165
1166 spin_lock_init(&dio->bio_lock); 1166 spin_lock_init(&dio->bio_lock);
1167 dio->refcount = 1; 1167 dio->refcount = 1;
1168 1168
1169 /* 1169 /*
1170 * In case of non-aligned buffers, we may need 2 more 1170 * In case of non-aligned buffers, we may need 2 more
1171 * pages since we need to zero out first and last block. 1171 * pages since we need to zero out first and last block.
1172 */ 1172 */
1173 if (unlikely(sdio.blkfactor)) 1173 if (unlikely(sdio.blkfactor))
1174 sdio.pages_in_io = 2; 1174 sdio.pages_in_io = 2;
1175 1175
1176 for (seg = 0; seg < nr_segs; seg++) { 1176 for (seg = 0; seg < nr_segs; seg++) {
1177 user_addr = (unsigned long)iov[seg].iov_base; 1177 user_addr = (unsigned long)iov[seg].iov_base;
1178 sdio.pages_in_io += 1178 sdio.pages_in_io +=
1179 ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) / 1179 ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) /
1180 PAGE_SIZE - user_addr / PAGE_SIZE); 1180 PAGE_SIZE - user_addr / PAGE_SIZE);
1181 } 1181 }
1182 1182
1183 blk_start_plug(&plug); 1183 blk_start_plug(&plug);
1184 1184
1185 for (seg = 0; seg < nr_segs; seg++) { 1185 for (seg = 0; seg < nr_segs; seg++) {
1186 user_addr = (unsigned long)iov[seg].iov_base; 1186 user_addr = (unsigned long)iov[seg].iov_base;
1187 sdio.size += bytes = iov[seg].iov_len; 1187 sdio.size += bytes = iov[seg].iov_len;
1188 1188
1189 /* Index into the first page of the first block */ 1189 /* Index into the first page of the first block */
1190 sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; 1190 sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
1191 sdio.final_block_in_request = sdio.block_in_file + 1191 sdio.final_block_in_request = sdio.block_in_file +
1192 (bytes >> blkbits); 1192 (bytes >> blkbits);
1193 /* Page fetching state */ 1193 /* Page fetching state */
1194 sdio.head = 0; 1194 sdio.head = 0;
1195 sdio.tail = 0; 1195 sdio.tail = 0;
1196 sdio.curr_page = 0; 1196 sdio.curr_page = 0;
1197 1197
1198 sdio.total_pages = 0; 1198 sdio.total_pages = 0;
1199 if (user_addr & (PAGE_SIZE-1)) { 1199 if (user_addr & (PAGE_SIZE-1)) {
1200 sdio.total_pages++; 1200 sdio.total_pages++;
1201 bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); 1201 bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
1202 } 1202 }
1203 sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; 1203 sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
1204 sdio.curr_user_address = user_addr; 1204 sdio.curr_user_address = user_addr;
1205 1205
1206 retval = do_direct_IO(dio, &sdio, &map_bh); 1206 retval = do_direct_IO(dio, &sdio, &map_bh);
1207 1207
1208 dio->result += iov[seg].iov_len - 1208 dio->result += iov[seg].iov_len -
1209 ((sdio.final_block_in_request - sdio.block_in_file) << 1209 ((sdio.final_block_in_request - sdio.block_in_file) <<
1210 blkbits); 1210 blkbits);
1211 1211
1212 if (retval) { 1212 if (retval) {
1213 dio_cleanup(dio, &sdio); 1213 dio_cleanup(dio, &sdio);
1214 break; 1214 break;
1215 } 1215 }
1216 } /* end iovec loop */ 1216 } /* end iovec loop */
1217 1217
1218 if (retval == -ENOTBLK) { 1218 if (retval == -ENOTBLK) {
1219 /* 1219 /*
1220 * The remaining part of the request will be 1220 * The remaining part of the request will be
1221 * handled by buffered I/O when we return 1221 * handled by buffered I/O when we return
1222 */ 1222 */
1223 retval = 0; 1223 retval = 0;
1224 } 1224 }
1225 /* 1225 /*
1226 * There may be some unwritten disk at the end of a part-written 1226 * There may be some unwritten disk at the end of a part-written
1227 * fs-block-sized block. Go zero that now. 1227 * fs-block-sized block. Go zero that now.
1228 */ 1228 */
1229 dio_zero_block(dio, &sdio, 1, &map_bh); 1229 dio_zero_block(dio, &sdio, 1, &map_bh);
1230 1230
1231 if (sdio.cur_page) { 1231 if (sdio.cur_page) {
1232 ssize_t ret2; 1232 ssize_t ret2;
1233 1233
1234 ret2 = dio_send_cur_page(dio, &sdio, &map_bh); 1234 ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
1235 if (retval == 0) 1235 if (retval == 0)
1236 retval = ret2; 1236 retval = ret2;
1237 page_cache_release(sdio.cur_page); 1237 page_cache_release(sdio.cur_page);
1238 sdio.cur_page = NULL; 1238 sdio.cur_page = NULL;
1239 } 1239 }
1240 if (sdio.bio) 1240 if (sdio.bio)
1241 dio_bio_submit(dio, &sdio); 1241 dio_bio_submit(dio, &sdio);
1242 1242
1243 blk_finish_plug(&plug); 1243 blk_finish_plug(&plug);
1244 1244
1245 /* 1245 /*
1246 * It is possible that we return a short IO due to end of file. 1246 * It is possible that we return a short IO due to end of file.
1247 * In that case we need to release all the pages we got hold of. 1247 * In that case we need to release all the pages we got hold of.
1248 */ 1248 */
1249 dio_cleanup(dio, &sdio); 1249 dio_cleanup(dio, &sdio);
1250 1250
1251 /* 1251 /*
1252 * All block lookups have been performed. For READ requests 1252 * All block lookups have been performed. For READ requests
1253 * we can let i_mutex go now that it's achieved its purpose 1253 * we can let i_mutex go now that it's achieved its purpose
1254 * of protecting us from looking up uninitialized blocks. 1254 * of protecting us from looking up uninitialized blocks.
1255 */ 1255 */
1256 if (rw == READ && (dio->flags & DIO_LOCKING)) 1256 if (rw == READ && (dio->flags & DIO_LOCKING))
1257 mutex_unlock(&dio->inode->i_mutex); 1257 mutex_unlock(&dio->inode->i_mutex);
1258 1258
1259 /* 1259 /*
1260 * The only time we want to leave bios in flight is when a successful 1260 * The only time we want to leave bios in flight is when a successful
1261 * partial aio read or full aio write has been set up. In that case 1261 * partial aio read or full aio write has been set up. In that case
1262 * bio completion will call aio_complete. The only time it's safe to 1262 * bio completion will call aio_complete. The only time it's safe to
1263 * call aio_complete is when we return -EIOCBQUEUED, so we key on that. 1263 * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
1264 * This had *better* be the only place that raises -EIOCBQUEUED. 1264 * This had *better* be the only place that raises -EIOCBQUEUED.
1265 */ 1265 */
1266 BUG_ON(retval == -EIOCBQUEUED); 1266 BUG_ON(retval == -EIOCBQUEUED);
1267 if (dio->is_async && retval == 0 && dio->result && 1267 if (dio->is_async && retval == 0 && dio->result &&
1268 ((rw == READ) || (dio->result == sdio.size))) 1268 ((rw == READ) || (dio->result == sdio.size)))
1269 retval = -EIOCBQUEUED; 1269 retval = -EIOCBQUEUED;
1270 1270
1271 if (retval != -EIOCBQUEUED) 1271 if (retval != -EIOCBQUEUED)
1272 dio_await_completion(dio); 1272 dio_await_completion(dio);
1273 1273
1274 if (drop_refcount(dio) == 0) { 1274 if (drop_refcount(dio) == 0) {
1275 retval = dio_complete(dio, offset, retval, false); 1275 retval = dio_complete(dio, offset, retval, false);
1276 kmem_cache_free(dio_cache, dio); 1276 kmem_cache_free(dio_cache, dio);
1277 } else 1277 } else
1278 BUG_ON(retval != -EIOCBQUEUED); 1278 BUG_ON(retval != -EIOCBQUEUED);
1279 1279
1280 out: 1280 out:
1281 return retval; 1281 return retval;
1282 } 1282 }
1283 1283
1284 ssize_t 1284 ssize_t
1285 __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1285 __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1286 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1286 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1287 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1287 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1288 dio_submit_t submit_io, int flags) 1288 dio_submit_t submit_io, int flags)
1289 { 1289 {
1290 /* 1290 /*
1291 * The block device state is needed in the end to finally 1291 * The block device state is needed in the end to finally
1292 * submit everything. Since it's likely to be cache cold, 1292 * submit everything. Since it's likely to be cache cold,
1293 * prefetch it here as the first thing to hide some of the 1293 * prefetch it here as the first thing to hide some of the
1294 * latency. 1294 * latency.
1295 * 1295 *
1296 * Attempt to prefetch the pieces we likely need later. 1296 * Attempt to prefetch the pieces we likely need later.
1297 */ 1297 */
1298 prefetch(&bdev->bd_disk->part_tbl); 1298 prefetch(&bdev->bd_disk->part_tbl);
1299 prefetch(bdev->bd_queue); 1299 prefetch(bdev->bd_queue);
1300 prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES); 1300 prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
1301 1301
1302 return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, 1302 return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
1303 nr_segs, get_block, end_io, 1303 nr_segs, get_block, end_io,
1304 submit_io, flags); 1304 submit_io, flags);
1305 } 1305 }
1306 1306
1307 EXPORT_SYMBOL(__blockdev_direct_IO); 1307 EXPORT_SYMBOL(__blockdev_direct_IO);
1308 1308
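For reference, a filesystem normally reaches this helper from its ->direct_IO address_space operation. The sketch below uses myfs_direct_IO and myfs_get_block as assumed names; passing NULL for end_io and submit_io selects the default completion and submission behaviour, and DIO_LOCKING requests the i_mutex scheme described earlier.

    static ssize_t myfs_direct_IO(int rw, struct kiocb *iocb,
                                  const struct iovec *iov, loff_t offset,
                                  unsigned long nr_segs)
    {
            struct inode *inode = iocb->ki_filp->f_mapping->host;

            return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
                                        iov, offset, nr_segs, myfs_get_block,
                                        NULL, NULL, DIO_LOCKING);
    }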
1309 static __init int dio_init(void) 1309 static __init int dio_init(void)
1310 { 1310 {
1311 dio_cache = KMEM_CACHE(dio, SLAB_PANIC); 1311 dio_cache = KMEM_CACHE(dio, SLAB_PANIC);
1312 return 0; 1312 return 0;
1313 } 1313 }
1314 module_init(dio_init) 1314 module_init(dio_init)